use CHISE DB directly.

author imiyazaki <imiyazaki>

Sat, 18 Oct 2003 02:25:39 +0000 (02:25 +0000)

committer imiyazaki <imiyazaki>

Sat, 18 Oct 2003 02:25:39 +0000 (02:25 +0000)
author imiyazaki <imiyazaki>
Sat, 18 Oct 2003 02:25:39 +0000 (02:25 +0000)
committer imiyazaki <imiyazaki>
Sat, 18 Oct 2003 02:25:39 +0000 (02:25 +0000)
diff --git a/outCMAP b/outCMAP

index 16880b9..78d3666 100755 (executable)
--- a/outCMAP
+++ b/outCMAP
@@ -1,22 +1,29 @@
  #!/usr/bin/perl
  
+# ver.0.2
+
  use strict;
-use vars qw($opt_in_cs $opt_out_cs $opt_help $usage
-           $in_cs $out_cs
-           $i @chars
-           $char $char_id $out_char $omegadb_home
+use vars qw($opt_in_cs $opt_out_cs $opt_protrude
+           $opt_help $usage
+           $in_cs $out_cs $i @chars
+           @order $opt_order %order
+           $char $char_id $out_char
             $ids $ids_argc %ids $idsdb
             $idsdata_file $ids_start $font_start
             %utf8mcs_map_from
-           %cmap_to
             $inotp $perl56 $perl58
-           $useCDP $useHZK $useGT
             @CDP @HZK @GT
             );
  use Getopt::Long;
  use utf8;
  use Chise_utils ':all';
  
+### Options ###
+
+$opt_order='jtcgk';
+$opt_protrude=0;# 1=true, 0=false.
+
+###
  my $strictly_forbidden_after = '「【『［（〈“‘‘（〔｛《{\[\(\x{3016}｛｢';
  #       \x{3016} | # white 【
  
@@ -30,7 +37,7 @@ my $slightly_forbidden_after = '￥¥＄$〒♯＃#￠¢￡£＠@§';
  # in the right margin
  my $strictly_forbidden_before=
      '!,.:;?、。！，．：；？。\)#}’”〉》」』】〕\x{3017}）］｝｝」\]';
-#       \x{3017} | # white 】
+###       \x{3017} | # white 】
  
  my $forbidden_before
      = 'ー々ぁぃぅぇぉゃゅょっゎァィゥェォャュョッヮヵヶ';
@@ -42,16 +49,13 @@ my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F
  
  my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}';
  
-$useGT=1;
-$useHZK=0;
-$useCDP=0;
-
  if($^V and $^V ge v5.8){
      $perl58=1;
  }elsif($^V and $^V ge v5.6){
      $perl56=1;
  }else{
      print STDERR "This versin is not supported.";
+    exit 1;
  }
  if($perl58){
      eval "use Encode";
@@ -59,21 +63,16 @@ if($perl58){
      binmode(STDOUT, ':encoding(utf8)');
  }
  
-$omegadb_home="/usr/local/lib/chise/omega/db";
-
  &GetOptions("in=s"=>\$opt_in_cs,
-           "i=s"=>\$opt_in_cs,
             "out=s"=>\$opt_out_cs,
-           "o=s"=>\$opt_out_cs,
-           "help",\$opt_help,
-           "h",\$opt_help);
+           "help",\$opt_help);
  
  $usage=<<EOF;
  Usage: $0 -i <input coding system> -o <cmap encoding>
      input coding system:
        Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
      cmap encoding:
-      UniCNS, UniGB, UniJIS, UniKS
+      UniCNS, UniGB, UniJIS, UniKS, UniMulti
  EOF
  
  if($opt_in_cs or $opt_out_cs){
@@ -85,9 +84,11 @@ if($opt_in_cs or $opt_out_cs){
  }
  
  # $in_cs:
-#   utf-8-mcs,utf-8-cns,utf-8-gb,utf-8-jis,utf-8-ks,
+#   Utf8mcs,Utf8cns,Utf8gb,Utf8jis,Utf8ks,
  # $out_cs:
-#   UniCNS,UniGB,UniJIS,UniKS
+#   UniCNS,UniGB,UniJIS,UniKS,UniMulti
+
+$in_cs=~s/Utf8/ucs\@/;
  
  if($opt_help
     or not defined($in_cs)
@@ -114,72 +115,81 @@ $ids="";
  @HZK=("=hanziku-1","=hanziku-10","=hanziku-11","=hanziku-12","=hanziku-2","=hanziku-3","=hanziku-4","=hanziku-5","=hanziku-6","=hanziku-7","=hanziku-8","=hanziku-9");
  @CDP=("=big5-cdp");
  
+%order=('c'=>'UniCNS',
+       'g'=>'UniGB',
+       'j'=>'UniJIS',
+       'k'=>'UniKS',
+       't'=>'GT',
+#      not implemented yet.
+#      'h'=>'HZK',
+#      'd'=>'CDP',
+       );
+
+if(defined($opt_order)){
+    if($opt_order=~/^[cgjkt]*$/){
+       @order=split(//,$opt_order);
+       @order=map {$order{$_}} @order;
+    }else{
+       print STDERR "Invalid order!\n";
+       exit 1;
+    }
+}
+
  while(<>){
-    # temporary fix for using in OTP for perl 5.6.
-    s/(.)/pack("c",unpack("U",$1))/ge if($inotp
-                                        and $in_cs=~/utf8/i
-                                        and $perl56);
-    # for perl 5.8.
-    $_=decode('utf8', $_)  if ($inotp and $in_cs=~/utf8/i
-                              and $perl58);
+    if($perl56){
+       # for perl 5.6
+       if($inotp){
+           s/(.)/pack("c",&get_char_id(unpack("U",$1),$in_cs))/ge;
+       }else{
+           s/(.)/pack("U",&get_char_id(unpack("U",$1),$in_cs))/ge;
+       }
+    }elsif($perl58){
+       # for perl 5.8.
+       $_=decode('utf8', $_);
+       s/(.)/pack("U",&get_char_id(unpack("U",$1),$in_cs))/ge;
+    }
      s/(amp.+?;)/&tex_de_er($1)/ge;
  #    s/(&.+?;)/&tex_de_er($1)/ge;
-#    s/^(.*)$/&add_break($1)/e;
      @chars=split(//);
      for($i=0;$i<=$#chars;$i++){
-#    while(m/(.)/g){
-       $char=&get_char_in_utf8mcs($chars[$i],$in_cs);
+       $char=$chars[$i];
         $char_id=unpack("U",$char);
-       if($ids_argc>0){
-           # It's in IDS.
-           ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$char);
-           if($ids_argc==0){
-               if(($char_id=&get_char_id_for_ids($ids))
-                  and(($out_char=&get_output_char($char_id,$out_cs)))){
-                   print $out_char,&add_break($i);
-               }else{
-                   print &replace_ids($ids),&add_break($i) if($perl56);
-                   print encode('utf8', &replace_ids($ids)),&add_break($i) if($perl58);
+
+       if($char_id<=0x20){
+           print $chars[$i];
+           next;
+       }elsif($char_id>0x20 and $char_id<=0xff){
+           # Basic Latin
+           # Latin-1 Supplement
+           print &latin_parse();
+           next;
+       }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
+           # Ideographic Description Characters
+           print &ids_parse();
+           next;
+       }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
+           # CJK Unified Ideographs Extension B
+           if(not defined($ids{$char}) and $ids{$char}[1]>=0){
+               $ids{$char}[0]=$font_start;
+               $ids{$char}[1]=$ids_start;
+               $ids_start++;
+               if($ids_start>255){
+                   $ids_start=0;
+                   $font_start++;
                 }
-               $ids="";
             }
-       }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
-           ($ids,$ids_argc)=&ids_rest("",0,$char);
+           print "{\\fontencoding{OT1}\\fontfamily{" .
+               sprintf("chise%03d",$ids{$char}[0]) .
+               "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
             next;
         }else{
-           if($char_id<=0xff){
-               print $char;
-               next;
-           }
             if(($out_char=&get_output_char($char_id,$out_cs))){
                 print $out_char,&add_break($i);
-           }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
-               unless(defined($ids{$char}) and $ids{$char}[1]>=0){
-                   $ids{$char}[0]=$font_start;
-                   $ids{$char}[1]=$ids_start;
-                   $ids_start++;
-                   if($ids_start>255){
-                       $ids_start=0;
-                       $font_start++;
-                   }
-               }
-               print "{\\fontencoding{OT1}\\fontfamily{" .
-                   sprintf("chise%03d",$ids{$char}[0]) .
-                   "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
-               next;
             }else{
-               print &replace_ids(&get_ids($char)),&add_break($i);
+               print &get_macro_for_ids(&get_ids($char)),&add_break($i);
             }
         }
      }
-    if($ids_argc>0){
-       print STDERR "IDS parse error: $ids\n";
-#      print pack("U",0xfffd);
-       print pack("U",0x3013) if($perl56);
-       print encode('utf8',pack("U",0x3013)) if($perl58);
-       $ids="";
-       $ids_argc=0;
-    }
  }
  
  open(IDSDATA,">$idsdata_file") or die;
@@ -200,7 +210,7 @@ sub tex_de_er{
      $er=~s/^amp(.*);$/$1/;
  #    $er=~s/^&(.*);$/$1/;
      $out=&de_er($er);
-    if($out){
+    if($out and $out ne $er){
         return $out;
      }else{
         return "amp$er;";
@@ -214,9 +224,11 @@ sub add_break{
         if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
            and($chars[$i+2]=~m/[$strictly_forbidden_before]/o)){
             return "\\CJKunbreakablekernone ";
-       }elsif(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
+       }elsif($opt_protrude){
+           if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
                and($chars[$i+2]=~m/[^$strictly_forbidden_before]/o)){
-           return "\\CJKunbreakablekernone \\CJKprotrude ";
+               return "\\CJKunbreakablekernone \\CJKprotrude ";
+           }
         }
      }
      if(($i<$#chars)
@@ -249,7 +261,57 @@ sub add_break{
      }
  }
  
+sub latin_parse{
+    # arguments: none
+    # return: string for output with TeX macro.
+    my($char_id);
+    my $out_str=$chars[$i];
+    $i++;
+    while($i<=$#chars){
+       $char_id=unpack("U",$chars[$i]);
+       if($char_id<=0xff){
+           $out_str.=pack("U",$char_id);
+       }else{
+           $i--;
+           last;
+       }
+       $i++;
+    }
+    return '{\fontfamily{\rmdefault}\selectfont {'.$out_str.'}}';
+}
+
+sub ids_parse{
+    # arguments: none
+    # return: character for output,
+    #          TeX macro for ids,
+    #          or GETA character if ids is invalid.
+    my($ids,$ids_argc)=&ids_rest("",0,$chars[$i]);
+
+    while($ids_argc>0){
+       # We are in IDS.
+       $i++;
+       if($i>$#chars){
+           print STDERR "IDS parse error: $ids\n";
+#          return pack("U",0xfffd);
+           return pack("U",0x3013);
+       }
+
+       ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$chars[$i]);
+       if($ids_argc==0){
+           if(($char_id=&get_char_id_for_ids($ids))
+              and($out_char=&get_output_char($char_id,$out_cs))){
+               return $out_char;
+           }else{
+               return &get_macro_for_ids($ids) if($perl56);
+               return encode('utf8', &get_macro_for_ids($ids)) if($perl58);
+           }
+       }
+    }
+}
+
  sub ids_rest{
+    # arguments: <ids>, <rest number of arguments for ids>, <character>
+    # return: ids and rest number of arguments for ids.
      my($ids,$ids_argc,$char)=@_;
      my($argc);
      $argc=&ids_argc($char);
@@ -263,14 +325,17 @@ sub ids_rest{
      return ($ids,$ids_argc);
  }
  
-sub replace_ids{
+sub get_macro_for_ids{
+    # argument: <ids>
+    # return: TeX macro for ids
+    #          or GETA character if ids is invalid for KAGE.
      my($ids)=@_;
      $ids=&normalize_ids($ids,"UniJIS");
-#    return pack("U",0xfffd) if($ids!~/[$idc]/);
+#    return pack("U",0xfffd) if(($ids!~/[$idc]/)
      return pack("U",0x3013) if(($ids!~/[$idc]/)
                                or($ids=~/[\x{10000}-]/));
                      #irregular for KAGE.
-    unless(defined($ids{$ids}) and $ids{$ids}[1]>=0){
+    if(not defined($ids{$ids}) and $ids{$ids}[1]>=0){
         $ids{$ids}[0]=$font_start;
         $ids{$ids}[1]=$ids_start;
         $ids_start++;
@@ -279,13 +344,18 @@ sub replace_ids{
         $ids_start=0;
         $font_start++;
      }
-    return "{\\fontencoding{OT1}\\fontfamily{".sprintf("chise%03d",$ids{$ids}[0])."}\\selectfont\\char$ids{$ids}[1]}";
+    return "{\\fontencoding{OT1}\\fontfamily{"
+       .sprintf("chise%03d",$ids{$ids}[0])
+       ."}\\selectfont\\char$ids{$ids}[1]}";
  }
  
  sub normalize_ids{
+    # argument: <ids>, <output coding system>
+    # return: ids or GETA character if ids is invalid for KAGE.
      my($ids,$out_cs)=@_;
      $ids = decode('utf8', $ids) if $perl58;
-    $out_cs=~s/Uni(.+)/"ucs-".lc($1)/e;
+    $out_cs=~s/Uni(.+)/'ucs@'.lc($1)/e;
+
      my $output_ids="";
      my($char,$char_id,$output_char_id);
      while($ids=~m/(.)/g){
@@ -293,43 +363,117 @@ sub normalize_ids{
         $char_id=unpack("U",$char);
         if($char=~/[$idc]/){
             $output_ids.=$char;
-       }elsif($output_char_id=&get_char_attribute($char,$out_cs)){
+       }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")){
             $output_ids.=pack("U",$output_char_id);
         }elsif($output_char_id=&get_char_attribute($char,"=ucs")){
             $output_ids.=pack("U",$output_char_id);
-       }elsif($output_char_id=&get_char_attribute($char,"ucs")){
-           $output_ids.=pack("U",$output_char_id);
         }else{
-           return pack("U",0xfffe);
+#          return pack("U",0xfffd);
+           return pack("U",0x3013);
         }
      }
      return $output_ids;
  }
  
  sub get_output_char{
+    # argument: <char-id>
+    # return: character in output coding system or TeX macro.
      my($char_id,$out_cs)=@_;
-    my($out_char_id,$suffix);
+    my($char,$out_char_id,$suffix);
      my($gt,$hzk,$cdp);
-    if(not defined($cmap_to{$out_cs})){
-       &get_cmap($out_cs);
-    }
-    if($out_char_id=$cmap_to{$out_cs}->{$char_id}){
-       return pack("U",$out_char_id);
-    }else{
-       return $gt if($useGT and $gt=&get_macro_for_GT($char_id));
-       return $hzk if($useHZK and $hzk=&get_macro_for_HZK($char_id));
-       return $cdp if($useCDP and $cdp=&get_macro_for_CDP($char_id));
-       return undef;
-    }
-}
  
-sub get_cmap{
-    my($out_cs)=@_;
-    tie %{$cmap_to{$out_cs}}, "BerkeleyDB::Hash",
-    -Filename => "$omegadb_home/$out_cs" or die $!;
+    $char=pack('U',$char_id);
+
+    if($out_cs eq 'UniJIS'
+       and &get_char_attribute($char,"adobe-unijis-utf16-h")){
+       if($out_char_id=&get_char_attribute($char,'=ucs@jis')
+          or $out_char_id=&get_char_attribute($char,'=ucs')
+          or $out_char_id=&get_char_attribute($char,'=>ucs@jis')
+          or $out_char_id=&get_char_attribute($char,'=>ucs')
+          ){
+           return pack("U",$out_char_id);
+       }
+    }elsif($out_cs eq 'UniGB'
+          and &get_char_attribute($char,"adobe-unigb-ucs2-h")){
+       if($out_char_id=&get_char_attribute($char,'=ucs@gb')
+          or $out_char_id=&get_char_attribute($char,'=ucs')
+          or $out_char_id=&get_char_attribute($char,'=>ucs@gb')
+          or $out_char_id=&get_char_attribute($char,'=>ucs')
+          ){
+           return pack("U",$out_char_id);
+       }
+    }elsif($out_cs eq 'UniCNS'
+          and &get_char_attribute($char,"adobe-unicns-ucs2-h")){
+       if($out_char_id=&get_char_attribute($char,'=ucs@cns')
+          or $out_char_id=&get_char_attribute($char,'=ucs')
+          or $out_char_id=&get_char_attribute($char,'=>ucs@cns')
+          or $out_char_id=&get_char_attribute($char,'=>ucs')
+          ){
+           return pack("U",$out_char_id);
+       }
+    }elsif($out_cs eq 'UniKS'
+          and &get_char_attribute($char,"adobe-uniks-ucs2-h")){
+       if($out_char_id=&get_char_attribute($char,'=ucs@ks')
+          or $out_char_id=&get_char_attribute($char,'=ucs')
+          or $out_char_id=&get_char_attribute($char,'=>ucs@ks')
+          or $out_char_id=&get_char_attribute($char,'=>ucs')
+          ){
+           return pack("U",$out_char_id);
+       }
+    }elsif($out_cs eq 'UniMulti'){
+       foreach $out_cs (@order){
+    
+           if($out_cs eq 'UniJIS'
+              and &get_char_attribute($char,"adobe-unijis-utf16-h")){
+               if($out_char_id=&get_char_attribute($char,'=ucs@jis')
+                  or $out_char_id=&get_char_attribute($char,'=ucs')
+                  or $out_char_id=&get_char_attribute($char,'=>ucs@jis')
+                  or $out_char_id=&get_char_attribute($char,'=>ucs')
+                  ){
+                   return '{\selectjisfont\char'.$out_char_id.'}';
+               }
+           }elsif($out_cs eq 'UniGB'
+                  and &get_char_attribute($char,"adobe-unigb-ucs2-h")){
+               if($out_char_id=&get_char_attribute($char,'=ucs@gb')
+                  or $out_char_id=&get_char_attribute($char,'=ucs')
+                  or $out_char_id=&get_char_attribute($char,'=>ucs@gb')
+                  or $out_char_id=&get_char_attribute($char,'=>ucs')
+                  ){
+                   return '{\selectgbsfont\char'.$out_char_id.'}';
+               }
+           }elsif($out_cs eq 'UniCNS'
+                  and &get_char_attribute($char,"adobe-unicns-ucs2-h")){
+               if($out_char_id=&get_char_attribute($char,'=ucs@cns')
+                  or $out_char_id=&get_char_attribute($char,'=ucs')
+                  or $out_char_id=&get_char_attribute($char,'=>ucs@cns')
+                  or $out_char_id=&get_char_attribute($char,'=>ucs')
+                  ){
+                   return '{\selectcnsfont\char'.$out_char_id.'}';
+               }
+           }elsif($out_cs eq 'UniKS'
+                  and &get_char_attribute($char,"adobe-uniks-ucs2-h")){
+               if($out_char_id=&get_char_attribute($char,'=ucs@ks')
+                  or $out_char_id=&get_char_attribute($char,'=ucs')
+                  or $out_char_id=&get_char_attribute($char,'=>ucs@ks')
+                  or $out_char_id=&get_char_attribute($char,'=>ucs')
+                  ){
+                   return '{\selectksxfont\char'.$out_char_id.'}';
+               }
+           }elsif($out_cs eq 'GT'){
+               return $gt if($gt=&get_macro_for_GT($char_id));
+           }elsif($out_cs eq 'HZK'){
+               return $hzk if($hzk=&get_macro_for_HZK($char_id));
+           }elsif($out_cs eq 'CDP'){
+               return $cdp if($cdp=&get_macro_for_CDP($char_id));
+           }
+       }
+    }
+    return undef;
  }
  
  sub get_ids{
+    # argument: <character>
+    # return: ids
      my($char)=@_;
      my $ids="";
      $ids=&get_char_attribute($char,"ids-aggregated")
@@ -341,63 +485,37 @@ sub get_ids{
  }
  
  sub get_char_id_for_ids{
+    # argument: <ideographic description sequence>
+    # return: char-id
      my($ids)=@_;
-    my($char_id,$char);
+    my($output_char);
      $ids=decode('utf8', $ids) if($perl58);
-#    $ids="(?".(join " ?",(split(//,$ids))).")";
-    &get_idsdb if(not defined($idsdb));
-    $char=$idsdb->{$ids};
-    $char=decode('utf8',$char) if($perl58);
-    if($char){
-       return unpack("U",$char);
+
+    if(($output_char)=&get_chars_matching("ids",$ids)){
+       return unpack("U",$output_char);
      }else{
         return undef;
      }
  }
  
-sub get_idsdb{
-    tie %{$idsdb}, "BerkeleyDB::Hash",
-    -Filename => "$omegadb_home/idsdb" or die $!;
-}
+sub get_char_id{
+    # argument: <char-id>, <input coding system>
+    # return:   char-id.
+    my($char_id,$in_cs)=@_;
+    my($output_char);
+
+    return $char_id if($in_cs eq 'ucs@mcs');
  
-sub get_char_in_utf8mcs_bak{
-    my($char,$in_cs)=@_;
-    return $char if($in_cs eq "Utf8mcs");
-    my($char_id,$output_char);
-    $in_cs=~s/Utf8/ucs-/;
-    $char_id=unpack("U",$char);
-    if(($output_char)=&get_chars_matching("$in_cs",$char_id)){
-       $output_char=decode('utf8', $output_char) if($perl58);
-       return $output_char;
+    if(($output_char)=&get_chars_matching("=$in_cs",$char_id)){
+       return unpack("U",$output_char);
      }else{
-       return $char;
+       return $char_id;
      }
  }
  
-sub get_char_in_utf8mcs{
-    # argument: <character>, <input coding system>
-    # return:   character in UTF-8mcs.
-    my($char,$in_cs)=@_;
-    my($char_id,$output_char_id);
-    return $char if($in_cs eq "Utf8mcs");
-    $char_id=unpack("U",$char);
-    &get_utf8mcs_map($in_cs) if(not defined($utf8mcs_map_from{$in_cs}));
-    if($output_char_id=$utf8mcs_map_from{$in_cs}->{$char_id}){
-       return pack("U",$output_char_id);
-   }else{
-       return $char;
-   }
-}
-
-sub get_utf8mcs_map{
-    my($in_cs)=@_;
-    my($suffix);
-    ($suffix=$in_cs)=~s/^Utf8//;
-    tie %{$utf8mcs_map_from{$in_cs}}, "BerkeleyDB::Hash",
-    -Filename => "$omegadb_home/ucs-$suffix" or die $!;
-}
-
  sub get_macro_for_GT{
+    # argument: <char-id>
+    # return: TeX macro for GT fonts.
      my($char_id)=@_;
      my($char,$gt,$GT);
      $char=pack("U",$char_id);
@@ -408,8 +526,9 @@ sub get_macro_for_GT{
         }
      }
      if($gt){
-       return "{\\fontencoding{OT1}\\fontfamily{".sprintf("gt%02d",$GT)."}\\selectfont\\char".($gt|0x8080)."}";
-#      return "\\GT{".sprintf("gt%02d",$GT)."}{\\char".($gt|0x8080)."}";
+       return "{\\fontencoding{OT1}\\fontfamily{"
+           .sprintf("gt%02d",$GT)
+           ."}\\selectfont\\char".($gt|0x8080)."}";
      }else{
         return undef;
      }
author	imiyazaki <imiyazaki>
	Sat, 18 Oct 2003 02:25:39 +0000 (02:25 +0000)
committer	imiyazaki <imiyazaki>
	Sat, 18 Oct 2003 02:25:39 +0000 (02:25 +0000)