fix UTF-16 bug.

author imiyazaki <imiyazaki>

Thu, 23 Mar 2006 06:45:46 +0000 (06:45 +0000)

committer imiyazaki <imiyazaki>

Thu, 23 Mar 2006 06:45:46 +0000 (06:45 +0000)
author imiyazaki <imiyazaki>
Thu, 23 Mar 2006 06:45:46 +0000 (06:45 +0000)
committer imiyazaki <imiyazaki>
Thu, 23 Mar 2006 06:45:46 +0000 (06:45 +0000)
diff --git a/add_adobecid.pl b/add_adobecid.pl

index 29c1135..8c726fa 100644 (file)
--- a/add_adobecid.pl
+++ b/add_adobecid.pl
@@ -1,8 +1,8 @@
  #!/usr/bin/perl -w
  
  use strict;
-use vars qw($cmapfile $db_home $encoding
-           %cs_var
+use vars qw($cmapfile $db_home $encoding $utf16
+           %cs_var 
             $ucs $cid $last
             $ciddb_filename $ciddb %ciddb %cid
             );
@@ -16,7 +16,7 @@ my $usage=<<EOF;
  Usage: perl $0 <CMAP file> <CHISE DB dir>
      <CMAP file> UniJIS-UTF16-H etc. available in Adobe Reader Directory.
      <CHISE DB dir> is directory to store BDB data,
-      typically /usr/local/share/chise/0.3/db.
+      typically /usr/local/share/chise/1.0/db.
  EOF
  
  #my $db_home="/usr/local/lib/chise/char-db";
@@ -47,6 +47,10 @@ unless(defined($cmapfile) and -f $cmapfile
      exit 1;
  }
  
+if($cmapfile=~/utf16/io){
+    $utf16=1;
+}
+
  # if working on Mac OS.
  if($^O=~/darwin/){
      print STDERR "Using ^M as delimiter.\n";
@@ -61,7 +65,7 @@ $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2',
  $cs_var{'=ucs@gb'}=['=gb12345','=gb2312'];
  
  $cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978',
-                    '=jis-x0208-1983','=jis-x0208-1990',
+                    '=jis-x0208-1983','=jis-x0208-1990','=jis-x0208-1997',
                      '=jis-x0212',
                      '=jis-x0213-1-2000','=jis-x0213-2-2000'];
  
@@ -84,22 +88,22 @@ print STDERR "Reading $cmapfile...";
  open(CMAP,"<$cmapfile") or die $!;
  # taken from expandcmap.pl by taiji.
  while(<CMAP>){
-    if(/begincidrange/){
+    if(/begincidrange/o){
         $in_cidrange=1;
-    }elsif(/endcidrange/){
+    }elsif(/endcidrange/o){
         $in_cidrange=0;
-    }elsif(/begincidchar/){
+    }elsif(/begincidchar/o){
         $in_cidchar=1;
-    }elsif(/endcidchar/){
+    }elsif(/endcidchar/o){
         $in_cidchar=0;
      }elsif($in_cidchar){
-       if(/<([\da-fA-F]+)>\s*(\d+)/){
-           ($ucs,$cid)=(hex($1),$2);
+       if(/<([\da-fA-F]+)>\s*(\d+)/o){
+           ($ucs,$cid)=($utf16?&decode_utf16($1):hex($1),$2);
             &store_cid($ucs,$cid,$encoding);
         }
      }elsif($in_cidrange){
-       if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/){
-           ($ucs, $last, $cid) = (hex($1), hex($2), $3);
+       if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/o){
+           ($ucs, $last, $cid) = ($utf16?&decode_utf16($1):hex($1), $utf16?&decode_utf16($2):hex($2), $3);
             while ($ucs <= $last) {
                 &store_cid($ucs,$cid,$encoding);
                 $cid++,$ucs++;
@@ -153,12 +157,25 @@ sub store_cid{
             }
         }
      }
+#    $char=&replace_denotational($char);
      if($debug){
         print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid);
      }
      $cid{$char}=$cid;
  }
  
+# replace_denotational(CHAR)
+#   Unpacks CHAR to its code point and asks get_chars_matching (defined
+#   elsewhere in this script) for entries under the '=ucs@unicode' feature.
+#   Returns the first entry of that result when the result is non-empty,
+#   otherwise returns CHAR unchanged.
+sub replace_denotational($){
+    my($in_char)=@_;
+    my $code_point=unpack("U",$in_char);
+
+    # Called in list context; only the number of returned entries decides
+    # which branch is taken, not the truth of the first entry.
+    my @matches=&get_chars_matching('=ucs@unicode',$code_point);
+    return $in_char unless @matches;
+    return $matches[0];
+}
+
  sub replace_char_id{
      my($ucs,$encoding)=@_;
      my($char);
@@ -192,3 +209,14 @@ sub get_char_id_unified{
         return ();
      }
  }
+
+# decode_utf16(HEX)
+#   Converts the hexadecimal text of a UTF-16 code, as written between
+#   <...> in a CMap entry, into a numeric code point.
+#     HEX     string of hex digits, without the angle brackets.
+#     returns the combined code point when HEX is exactly a high surrogate
+#             (D800-DBFF) followed by a low surrogate (DC00-DFFF);
+#             otherwise hex(HEX).
+sub decode_utf16($){
+    my($in)=@_;
+
+    # Anchor the match so only an exact 8-digit value is considered, and
+    # verify both halves really are surrogates; otherwise an arbitrary
+    # 8-digit value (e.g. 00010000) would be mangled into a bogus number.
+    if($in=~m/\A([\da-fA-F]{4})([\da-fA-F]{4})\z/){
+        my($high,$low)=(hex($1),hex($2));
+        if($high>=0xD800 and $high<=0xDBFF and $low>=0xDC00 and $low<=0xDFFF){
+            return 0x10000 + ($high - 0xD800) * 0x400 + ($low - 0xDC00);
+        }
+    }
+
+    # Single code unit, or anything that is not a valid surrogate pair.
+    return hex($in);
+}
author	imiyazaki <imiyazaki>
	Thu, 23 Mar 2006 06:45:46 +0000 (06:45 +0000)
committer	imiyazaki <imiyazaki>
	Thu, 23 Mar 2006 06:45:46 +0000 (06:45 +0000)