X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=add_adobecid.pl;h=8c726fa1c77c9ac29662d8b66aae0f9461b73695;hb=c054d399007b8e474503284447af04c6f81f275d;hp=38f9546d564e46b02e2cc2f8771670dcf1938921;hpb=abc71447b51817e45069dacb6b1c1aee0fc606e4;p=chise%2Fomega.git

diff --git a/add_adobecid.pl b/add_adobecid.pl
index 38f9546..8c726fa 100644
--- a/add_adobecid.pl
+++ b/add_adobecid.pl
@@ -1,41 +1,22 @@
 #!/usr/bin/perl -w
 
+use 5.008;	# Perl 5.8+ only; checked before anything else is compiled
 use strict;
-use vars qw($perl56 $perl58
-	    $cmapfile $db_home $encoding
+use vars qw($cmapfile $db_home $encoding $utf16
 	    %cs_var
 	    $ucs $cid $last
-	    $ciddb_filename $ciddb
+	    $ciddb_filename $ciddb %ciddb %cid
 	    );
 use BerkeleyDB;
 use Chise_utils ':all';
 
 my $debug=0;
 
-if($^V and $^V ge v5.8){
-    $perl58=1;
-}elsif($^V and $^V ge v5.6){
-    $perl56=1;
-}else{
-    print STDERR "This version is not supported.";
-}
-if($perl58){
-    eval "use Encode";
-    binmode(STDIN, ':encoding(utf8)');
-    binmode(STDOUT, ':encoding(utf8)');
-}
-
-# if working on Mac OS.
-if($^O=~/darwin/){
-    print STDERR "Using ^M as delimiter.\n";
-    $/="";
-}
-
 my $usage=<<EOF;
 Usage: perl $0 <CMAP file> <CHISE DB dir>
     <CMAP file> UniJIS-UTF16-H etc. available in Adobe Reader Directory.
     <CHISE DB dir> is directory to store BDB data,
-      typically /usr/local/lib/chise/chise-db.
+      typically /usr/local/share/chise/1.0/db.
 EOF
 
 #my $db_home="/usr/local/lib/chise/char-db";
@@ -66,6 +47,16 @@ unless(defined($cmapfile) and -f $cmapfile
     exit 1;
 }
 
+# UTF-16 CMaps (e.g. UniJIS-UTF16-H) encode non-BMP characters as surrogate
+# pairs, so their code values have to go through decode_utf16().
+$utf16=1 if($cmapfile=~/utf16/i);
+
+# On Mac OS X, read the CMap with CR (^M) as the line delimiter.
+if($^O=~/darwin/){
+    print STDERR "Using ^M as delimiter.\n";
+    $/="\r";
+}
+
 $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2',
 		     '=cns11643-3','=cns11643-4',
 		     '=cns11643-5','=cns11643-6',
@@ -74,7 +65,7 @@ $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2',
 $cs_var{'=ucs@gb'}=['=gb12345','=gb2312'];
 
 $cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978',
-		     '=jis-x0208-1983','=jis-x0208-1990',
+		     '=jis-x0208-1983','=jis-x0208-1990','=jis-x0208-1997', # NOTE(review): confirm the CHISE DB defines '=jis-x0208-1997'
 		     '=jis-x0212',
 		     '=jis-x0213-1-2000','=jis-x0213-2-2000'];
 
@@ -84,8 +75,11 @@ if(-f "$db_home/$ciddb_filename"){
     print STDERR "Removing old DB $db_home/$ciddb_filename.\n";
     unlink "$db_home/$ciddb_filename";
 }
-$ciddb=new BerkeleyDB::Hash
-    -Filename => "$db_home/$ciddb_filename", -Flags => DB_CREATE
-    or die $!;
+
+# Tie %ciddb to a freshly truncated BDB hash; $ciddb keeps the object for db_put.
+$ciddb=tie %ciddb, 'BerkeleyDB::Hash',
+    -Filename => "$db_home/$ciddb_filename",
+    -Flags => DB_CREATE|DB_TRUNCATE, -Pagesize => 512
+    or die "Cannot open $db_home/$ciddb_filename: $! $BerkeleyDB::Error";
 
 my $in_cidrange=0;
@@ -94,22 +88,22 @@ print STDERR "Reading $cmapfile...";
 open(CMAP,"<$cmapfile") or die $!;
 # taken from expandcmap.pl by taiji.
 while(<CMAP>){
     if(/begincidrange/){
 	$in_cidrange=1;
     }elsif(/endcidrange/){
 	$in_cidrange=0;
     }elsif(/begincidchar/){
 	$in_cidchar=1;
     }elsif(/endcidchar/){
 	$in_cidchar=0;
     }elsif($in_cidchar){
 	if(/<([\da-fA-F]+)>\s*(\d+)/){
-	    ($ucs,$cid)=(hex($1),$2);
+	    ($ucs,$cid)=(($utf16 ? &decode_utf16($1) : hex($1)), $2); # UTF-16 CMaps may use surrogate pairs
 	    &store_cid($ucs,$cid,$encoding);
 	}
     }elsif($in_cidrange){
 	if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/){
-	    ($ucs, $last, $cid) = (hex($1), hex($2), $3);
+	    ($ucs, $last, $cid) = (($utf16 ? &decode_utf16($1) : hex($1)), ($utf16 ? &decode_utf16($2) : hex($2)), $3);
 	    while ($ucs <= $last) {
 		&store_cid($ucs,$cid,$encoding);
 		$cid++,$ucs++;
@@ -120,6 +114,17 @@ while(<CMAP>){
 close(CMAP);
 print STDERR "done!\n";
 
+# Write everything collected in %cid to the BDB file, one record per character.
+print STDERR "Storing data to CHISE DB...";
+foreach my $char (sort keys %cid){
+    $ciddb->db_put("?".$char,$cid{$char})==0
+	or die "db_put failed for $db_home/$ciddb_filename: $BerkeleyDB::Error";
+}
+print STDERR "done!\n";
+
+undef $ciddb;	# drop the object reference before untie
+untie %ciddb;
+
 exit 0;
 
 sub store_cid{
@@ -152,11 +157,22 @@ sub store_cid{
 	    }
 	}
     }
+#    $char=&replace_denotational($char);
     if($debug){
 	print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid);
     }
-    unless($ciddb->db_put("?".$char,$cid)==0){
-	die $!;
-    }
+    $cid{$char}=$cid;
 }
+
+# Map $in_char to the character that get_chars_matching returns for its
+# code point under '=ucs@unicode'; return $in_char unchanged when nothing
+# matches.  Not used at present: the call in store_cid is commented out.
+sub replace_denotational($){
+    my($in_char)=@_;
+    my $code=unpack("U",$in_char);
+    my @found=&get_chars_matching('=ucs@unicode',$code);
+
+    # Only the first match is used.
+    return @found ? $found[0] : $in_char;
+}
 
@@ -165,8 +181,6 @@ sub replace_char_id{
     my($char);
 
     if(($char)=&get_chars_matching($encoding,$ucs)){
-	$char=decode('utf8', $char) if($perl58);
-	$char=~s/^\?//;
 	return unpack("U",$char);
     }else{
 	return undef;
@@ -190,9 +204,26 @@ sub get_char_id_unified{
     my($chars);
     if($chars=&get_char_attribute(pack("U",$char_id),'->ucs-unified')){
 	$chars=~s/^\((.*)\)$/$1/;
-	$chars=~s/\?//g;
-	return map {unpack("U",$_)} (split(/\s+/,$chars));
+	return map {unpack("U",$_)} (split(/\s*\?/,$chars));
     }else{
 	return ();
     }
 }
+
+# Convert a hexadecimal code value taken from a UTF-16 CMap into a Unicode
+# code point.
+#   $in     - hex digits without the surrounding <>, e.g. "4E00" or "D840DC0B"
+#   returns - the code point as a number
+# Exactly eight digits forming a high surrogate (D800-DBFF) followed by a low
+# surrogate (DC00-DFFF) are combined into one supplementary-plane code point;
+# every other input is simply read as one hexadecimal number.
+sub decode_utf16($){
+    my($in)=@_;
+    if($in=~m/\A([\da-fA-F]{4})([\da-fA-F]{4})\z/){
+	my($hi,$lo)=(hex($1),hex($2));
+	if($hi>=0xD800 and $hi<=0xDBFF and $lo>=0xDC00 and $lo<=0xDFFF){
+	    return 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
+	}
+    }
+    return hex($in);
+}