X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=add_adobecid.pl;h=8c726fa1c77c9ac29662d8b66aae0f9461b73695;hb=c054d399007b8e474503284447af04c6f81f275d;hp=38f9546d564e46b02e2cc2f8771670dcf1938921;hpb=abc71447b51817e45069dacb6b1c1aee0fc606e4;p=chise%2Fomega.git diff --git a/add_adobecid.pl b/add_adobecid.pl index 38f9546..8c726fa 100644 --- a/add_adobecid.pl +++ b/add_adobecid.pl @@ -1,41 +1,22 @@ #!/usr/bin/perl -w use strict; -use vars qw($perl56 $perl58 - $cmapfile $db_home $encoding - %cs_var +use vars qw($cmapfile $db_home $encoding $utf16 + %cs_var $ucs $cid $last - $ciddb_filename $ciddb + $ciddb_filename $ciddb %ciddb %cid ); use BerkeleyDB; use Chise_utils ':all'; +require 5.008; my $debug=0; -if($^V and $^V ge v5.8){ - $perl58=1; -}elsif($^V and $^V ge v5.6){ - $perl56=1; -}else{ - print STDERR "This version is not supported."; -} -if($perl58){ - eval "use Encode"; - binmode(STDIN, ':encoding(utf8)'); - binmode(STDOUT, ':encoding(utf8)'); -} - -# if working on Mac OS. -if($^O=~/darwin/){ - print STDERR "Using ^M as delimiter.\n"; - $/=" "; -} - my $usage=< UniJIS-UTF16-H etc. available in Adobe Reader Directory. is directory to store BDB data, - typically /usr/local/lib/chise/chise-db. + typically /usr/local/share/chise/1.0/db. EOF #my $db_home="/usr/local/lib/chise/char-db"; @@ -66,6 +47,16 @@ unless(defined($cmapfile) and -f $cmapfile exit 1; } +if($cmapfile=~/utf16/io){ + $utf16=1; +} + +# if working on Mac OS. +if($^O=~/darwin/){ + print STDERR "Using ^M as delimiter.\n"; + $/=" "; +} + $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2', '=cns11643-3','=cns11643-4', '=cns11643-5','=cns11643-6', @@ -74,7 +65,7 @@ $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2', $cs_var{'=ucs@gb'}=['=gb12345','=gb2312']; $cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978', - '=jis-x0208-1983','=jis-x0208-1990', + '=jis-x0208-1983','=jis-x0208-1990','=jis-x0208-1997', '=jis-x0212', '=jis-x0213-1-2000','=jis-x0213-2-2000']; @@ -84,8 +75,11 @@ if(-f "$db_home/$ciddb_filename"){ print STDERR "Removing old DB $db_home/$ciddb_filename.\n"; unlink "$db_home/$ciddb_filename"; } -$ciddb=new BerkeleyDB::Hash - -Filename => "$db_home/$ciddb_filename", -Flags => DB_CREATE + +$ciddb=tie %ciddb, 'BerkeleyDB::Hash', + -Filename => "$db_home/$ciddb_filename", + -Flags => DB_CREATE|DB_TRUNCATE, + -Pagesize => 512, or die $!; my $in_cidrange=0; @@ -94,22 +88,22 @@ print STDERR "Reading $cmapfile..."; open(CMAP,"<$cmapfile") or die $!; # taken from expandcmap.pl by taiji. while(){ - if(/begincidrange/){ + if(/begincidrange/o){ $in_cidrange=1; - }elsif(/endcidrange/){ + }elsif(/endcidrange/o){ $in_cidrange=0; - }elsif(/begincidchar/){ + }elsif(/begincidchar/o){ $in_cidchar=1; - }elsif(/endcidchar/){ + }elsif(/endcidchar/o){ $in_cidchar=0; }elsif($in_cidchar){ - if(/<([\da-fA-F]+)>\s*(\d+)/){ - ($ucs,$cid)=(hex($1),$2); + if(/<([\da-fA-F]+)>\s*(\d+)/o){ + ($ucs,$cid)=($utf16?&decode_utf16($1):hex($1),$2); &store_cid($ucs,$cid,$encoding); } }elsif($in_cidrange){ - if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/){ - ($ucs, $last, $cid) = (hex($1), hex($2), $3); + if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/o){ + ($ucs, $last, $cid) = ($utf16?&decode_utf16($1):hex($1), $utf16?&decode_utf16($2):hex($2), $3); while ($ucs <= $last) { &store_cid($ucs,$cid,$encoding); $cid++,$ucs++; @@ -120,6 +114,17 @@ while(){ close(CMAP); print STDERR "done!\n"; +print STDERR "Storing data to CHISE DB..."; +foreach my $char (sort keys %cid){ + unless($ciddb->db_put("?".$char,$cid{$char})==0){ + die $!; + } +} +print STDERR "done!\n"; + +undef $ciddb; +untie %ciddb; + exit 0; sub store_cid{ @@ -152,11 +157,22 @@ sub store_cid{ } } } +# $char=&replace_denotational($char); if($debug){ print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid); } - unless($ciddb->db_put("?".$char,$cid)==0){ - die $!; + $cid{$char}=$cid; +} + +sub replace_denotational($){ + my($in_char)=@_; + my($out_char); + my $ucs=unpack("U",$in_char); + + if(($out_char)=&get_chars_matching('=ucs@unicode',$ucs)){ + return $out_char; + }else{ + return $in_char; } } @@ -165,8 +181,6 @@ sub replace_char_id{ my($char); if(($char)=&get_chars_matching($encoding,$ucs)){ - $char=decode('utf8', $char) if($perl58); - $char=~s/^\?//; return unpack("U",$char); }else{ return undef; @@ -190,9 +204,19 @@ sub get_char_id_unified{ my($chars); if($chars=&get_char_attribute(pack("U",$char_id),'->ucs-unified')){ $chars=~s/^\((.*)\)$/$1/; - $chars=~s/\?//g; - return map {unpack("U",$_)} (split(/\s+/,$chars)); + return map {unpack("U",$_)} (split(/\s*\?/,$chars)); }else{ return (); } } + +sub decode_utf16($){ + my($in)=@_; + my($out); + if($in=~m/([\da-fA-F]{4})([\da-fA-F]{4})/o){ + $out=0x10000 + (hex($1) - 0xD800) * 0x400 + (hex($2) - 0xDC00); + }else{ + $out=hex($in); + } + return $out; +}