From: imiyazaki Date: Thu, 23 Mar 2006 06:45:46 +0000 (+0000) Subject: fix UTF-16 bug. X-Git-Url: http://git.chise.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c96a16203a844c5cc7abdb9a4b52a649b6a8b6b0;p=chise%2Fomega.git fix UTF-16 bug. --- diff --git a/add_adobecid.pl b/add_adobecid.pl index 29c1135..8c726fa 100644 --- a/add_adobecid.pl +++ b/add_adobecid.pl @@ -1,8 +1,8 @@ #!/usr/bin/perl -w use strict; -use vars qw($cmapfile $db_home $encoding - %cs_var +use vars qw($cmapfile $db_home $encoding $utf16 + %cs_var $ucs $cid $last $ciddb_filename $ciddb %ciddb %cid ); @@ -16,7 +16,7 @@ my $usage=< UniJIS-UTF16-H etc. available in Adobe Reader Directory. is directory to store BDB data, - typically /usr/local/share/chise/0.3/db. + typically /usr/local/share/chise/1.0/db. EOF #my $db_home="/usr/local/lib/chise/char-db"; @@ -47,6 +47,10 @@ unless(defined($cmapfile) and -f $cmapfile exit 1; } +if($cmapfile=~/utf16/io){ + $utf16=1; +} + # if working on Mac OS. if($^O=~/darwin/){ print STDERR "Using ^M as delimiter.\n"; @@ -61,7 +65,7 @@ $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2', $cs_var{'=ucs@gb'}=['=gb12345','=gb2312']; $cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978', - '=jis-x0208-1983','=jis-x0208-1990', + '=jis-x0208-1983','=jis-x0208-1990','=jis-x0208-1997', '=jis-x0212', '=jis-x0213-1-2000','=jis-x0213-2-2000']; @@ -84,22 +88,22 @@ print STDERR "Reading $cmapfile..."; open(CMAP,"<$cmapfile") or die $!; # taken from expandcmap.pl by taiji. while(){ - if(/begincidrange/){ + if(/begincidrange/o){ $in_cidrange=1; - }elsif(/endcidrange/){ + }elsif(/endcidrange/o){ $in_cidrange=0; - }elsif(/begincidchar/){ + }elsif(/begincidchar/o){ $in_cidchar=1; - }elsif(/endcidchar/){ + }elsif(/endcidchar/o){ $in_cidchar=0; }elsif($in_cidchar){ - if(/<([\da-fA-F]+)>\s*(\d+)/){ - ($ucs,$cid)=(hex($1),$2); + if(/<([\da-fA-F]+)>\s*(\d+)/o){ + ($ucs,$cid)=($utf16?&decode_utf16($1):hex($1),$2); &store_cid($ucs,$cid,$encoding); } }elsif($in_cidrange){ - if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/){ - ($ucs, $last, $cid) = (hex($1), hex($2), $3); + if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/o){ + ($ucs, $last, $cid) = ($utf16?&decode_utf16($1):hex($1), $utf16?&decode_utf16($2):hex($2), $3); while ($ucs <= $last) { &store_cid($ucs,$cid,$encoding); $cid++,$ucs++; @@ -153,12 +157,25 @@ sub store_cid{ } } } +# $char=&replace_denotational($char); if($debug){ print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid); } $cid{$char}=$cid; } +sub replace_denotational($){ + my($in_char)=@_; + my($out_char); + my $ucs=unpack("U",$in_char); + + if(($out_char)=&get_chars_matching('=ucs@unicode',$ucs)){ + return $out_char; + }else{ + return $in_char; + } +} + sub replace_char_id{ my($ucs,$encoding)=@_; my($char); @@ -192,3 +209,14 @@ sub get_char_id_unified{ return (); } } + +sub decode_utf16($){ + my($in)=@_; + my($out); + if($in=~m/([\da-fA-F]{4})([\da-fA-F]{4})/o){ + $out=0x10000 + (hex($1) - 0xD800) * 0x400 + (hex($2) - 0xDC00); + }else{ + $out=hex($in); + } + return $out; +}