add $opt_allow_unification.
authorimiyazaki <imiyazaki>
Wed, 19 Nov 2003 15:10:59 +0000 (15:10 +0000)
committerimiyazaki <imiyazaki>
Wed, 19 Nov 2003 15:10:59 +0000 (15:10 +0000)
simplify codes.

chise2otf/chise2otf

index 65fef19..d31c13d 100755 (executable)
@@ -2,11 +2,13 @@
 
 use strict;
 use vars qw($opt_in_cs $opt_order $opt_kage $opt_replace
+           $opt_use_kage_for_Ext_B $opt_allow_unification
            $opt_help $usage
            $in_cs $out_cs $i @chars
            @order $order %order
            @texmacro
            $char $char_id $out_char
+           $char_unified @chars_unified
            $ids $ids_argc %ids $idsdb
            $idsdata_file $ids_start $font_start
            $perl56 $perl58
@@ -21,6 +23,9 @@ require 5.008;
 my $omegadb_path="/usr/local/lib/chise/omega";
 $omegadb_path=~s!/$!!;
 
+# currently does not work, so...
+$opt_use_kage_for_Ext_B=0;
+
 my $makefonts="$omegadb_path/makefonts.pl";
 my $exec_makefonts=0;
 my $geta=pack("S",8750|0x8080);
@@ -29,13 +34,14 @@ my $geta=pack("S",8750|0x8080);
            "order=s"=>\$opt_order,
            "replace",\$opt_replace,
            "kage",\$opt_kage,
+           "unify",\$opt_allow_unification,
            "help",\$opt_help);
 
 $usage=<<EOF;
-Usage: $0 [-i <input coding system>] [-o <order of kanji>] [-k] <filename>
-    input coding system: (default: ucs\@mcs)
-      ucs\@mcs, ucs\@cns, ucs\@gb, ucs\@jis, ucs\@ks
-    order of kanji: (default: j)
+Usage: $0 [-i <input coding system>] [-o <order of kanji>] [-kru] <filename>
+    -i: input coding system: (default: ucs\@mcs)
+       ucs\@mcs, ucs\@cns, ucs\@gb, ucs\@jis, ucs\@ks
+    -o: order of kanji: (default: j)
        c: CNS
        g: GB
        j: JIS
@@ -43,8 +49,9 @@ Usage: $0 [-i <input coding system>] [-o <order of kanji>] [-k] <filename>
        G: GT
        m: Multi, use \\UTFM of otf.sty
       You can also combine them, ex. jtcgkm
-    k: use Kage server.
-    r: replace r and l with dot below to those with circle below.
+    -k: use Kage server.
+    -r: replace r and l with dot below to those with circle below.
+    -u: allow unification.
 EOF
 
 %order=('c'=>'UniCNS',
@@ -144,11 +151,12 @@ $ids="";
 
 while(<>){
     if($in_cs ne 'ucs@mcs'){
-       s/(.)/pack("U",&get_char_id(unpack("U",$1),$in_cs))/ge;
+       s/(.)/&get_char_in_mcs($1,$in_cs)/ge;
     }
     s/((?:^|[^\\])(?:\\\\)*)(&.*?;)/&de_tex_er($1,$2)/ge;
     s/((?:^|[^\\])(?:\\\\)*)\\([$idc])/$1.'\UTFM{'.sprintf("%X",unpack("U",$2)).'}'/ge;
     @chars=split(//);
+  CHAR:
     for($i=0;$i<=$#chars;$i++){
        $char=$chars[$i];
        $char_id=unpack("U",$chars[$i]);
@@ -156,38 +164,57 @@ while(<>){
        if($char_id<=0x7f){
            # Basic Latin
            print $char;
-           next;
+           next CHAR;
        }elsif(defined($texmacro[$char_id]) and $texmacro[$char_id]){
            # already defined for 
            # 0080..00FF; Latin-1 Supplement
            # 0100..017F; Latin Extended-A
            # 1E00..1EFF; Latin Extended Additional
            print $texmacro[$char_id];
-           next;
+           next CHAR;
        }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
            # Ideographic Description Characters
            print &ids_parse();
-           next;
+           next CHAR;
        }else{
-           if(($out_char=&get_output_char($char_id))){
+           if(($out_char=&get_output_char($char))){
                print $out_char;
-           }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
-               # CJK Unified Ideographs Extension B
-               if(not defined($ids{$char}) and $ids{$char}[1]>=0){
-                   $ids{$char}[0]=$font_start;
-                   $ids{$char}[1]=$ids_start;
-                   $ids_start++;
-                   if($ids_start>255){
-                       $ids_start=0;
-                       $font_start++;
+               next CHAR;
+           }else{
+               if($opt_allow_unification){
+                   @chars_unified=&get_chars_unified($char);
+                   if(@chars_unified>0){
+                       foreach $char_unified (@chars_unified){
+                           if(($out_char=&get_output_char($char_unified))){
+                               print $out_char;
+                               next CHAR;
+                           }
+                       }
                    }
                }
-               print "{\\fontencoding{OT1}\\fontfamily{" .
-                   sprintf("chise%03d",$ids{$char}[0]) .
-                   "}\\selectfont\\char$ids{$char}[1]}";
-               next;
-           }else{
-               print &get_macro_for_ids(&get_ids($char));
+               if($opt_use_kage_for_Ext_B
+                  and $char_id >= 0x20000 and $char_id <=0x2a6df){
+                   # CJK Unified Ideographs Extension B
+                   if(not defined($ids{$char}) and $ids{$char}[1]>=0){
+                       $ids{$char}[0]=$font_start;
+                       $ids{$char}[1]=$ids_start;
+                       $ids_start++;
+                       if($ids_start>255){
+                           $ids_start=0;
+                           $font_start++;
+                       }
+                   }
+                   print "{\\fontencoding{OT1}\\fontfamily{" .
+                       sprintf("chise%03d",$ids{$char}[0]) .
+                           "}\\selectfont\\char$ids{$char}[1]}";
+                   next CHAR;
+               }
+               if($ids=&get_ids($char)){
+                   print &get_macro_for_ids($ids);
+                   next CHAR;
+               }else{
+                   print $geta;
+               }
            }
        }
     }
@@ -229,7 +256,6 @@ sub de_tex_er{
            $value=hex($value);
        }
        ($output_char)=&get_chars_matching($er_alias{$atr},$value);
-#      utf8::decode($output_char);
     }
     if($output_char){
        return $before_er.$output_char;
@@ -255,8 +281,8 @@ sub ids_parse{
 
        ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$chars[$i]);
        if($ids_argc==0){
-           if(($char_id=&get_char_id_for_ids($ids))
-              and($out_char=&get_output_char($char_id))){
+           if(($char=&get_char_for_ids($ids))
+              and($out_char=&get_output_char($char))){
                return $out_char;
            }else{
                return &get_macro_for_ids($ids);
@@ -277,8 +303,6 @@ sub ids_rest{
        $ids_argc--;
     }
     $ids.=$char;
-#    $ids.=$char if($perl56);
-#    $ids.=encode('utf8',$char) if($perl58);
     return ($ids,$ids_argc);
 }
 
@@ -313,15 +337,17 @@ sub normalize_ids{
     $out_cs=~s/Uni(.+)/'ucs@'.lc($1)/e;
 
     my $output_ids="";
-    my($char,$char_id,$output_char_id);
+    my($char,$output_char_id);
     while($ids=~m/(.)/g){
        $char=$1;
-       $char_id=unpack("U",$char);
        if($char=~/[$idc]/){
            $output_ids.=$char;
-       }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")){
-           $output_ids.=pack("U",$output_char_id);
-       }elsif($output_char_id=&get_char_attribute($char,"=ucs")){
+       }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")
+              or $output_char_id=&get_char_attribute($char,"=ucs")
+              or $output_char_id=&get_char_attribute($char,"=>$out_cs")
+              or $output_char_id=&get_char_attribute($char,"=>ucs")
+              or $output_char_id=&get_char_attribute($char,"=>ucs*")
+             ){
            $output_ids.=pack("U",$output_char_id);
        }else{
            return $geta;
@@ -331,13 +357,11 @@ sub normalize_ids{
 }
 
 sub get_output_char{
-    # argument: <char-id>
+    # argument: <char>
     # return: character in EUC-JP or TeX macro for pTeX.
-    my($char_id)=@_;
-    my($char,$out_char,$out_char_id,$gt);
+    my($char)=@_;
+    my($out_char,$out_char_id,$gt);
     
-    $char=pack('U',$char_id);
-
     if($out_char=&get_char_attribute($char,'=jis-x0208')
        or $out_char=&get_char_attribute($char,'=jis-x0208-1983')
        or $out_char=&get_char_attribute($char,'=jis-x0208-1990')
@@ -351,6 +375,7 @@ sub get_output_char{
                   or $out_char_id=&get_char_attribute($char,'=ucs')
                   or $out_char_id=&get_char_attribute($char,'=>ucs@jis')
                   or $out_char_id=&get_char_attribute($char,'=>ucs')
+                  or $out_char_id=&get_char_attribute($char,'=>ucs*')
                   ){
                    return "\\UTF{".sprintf("%X",$out_char_id)."}";
                }
@@ -360,6 +385,7 @@ sub get_output_char{
                   or $out_char_id=&get_char_attribute($char,'=ucs')
                   or $out_char_id=&get_char_attribute($char,'=>ucs@gb')
                   or $out_char_id=&get_char_attribute($char,'=>ucs')
+                  or $out_char_id=&get_char_attribute($char,'=>ucs*')
                   ){
                    return "\\UTFC{".sprintf("%X",$out_char_id)."}";
                }
@@ -369,6 +395,7 @@ sub get_output_char{
                   or $out_char_id=&get_char_attribute($char,'=ucs')
                   or $out_char_id=&get_char_attribute($char,'=>ucs@cns')
                   or $out_char_id=&get_char_attribute($char,'=>ucs')
+                  or $out_char_id=&get_char_attribute($char,'=>ucs*')
                   ){
                    return "\\UTFT{".sprintf("%X",$out_char_id)."}";
                }
@@ -378,11 +405,12 @@ sub get_output_char{
                   or $out_char_id=&get_char_attribute($char,'=ucs')
                   or $out_char_id=&get_char_attribute($char,'=>ucs@ks')
                   or $out_char_id=&get_char_attribute($char,'=>ucs')
+                  or $out_char_id=&get_char_attribute($char,'=>ucs*')
                   ){
                    return "\\UTFK{".sprintf("%X",$out_char_id)."}";
                }
            }elsif($out_cs eq 'GT'){
-               return $gt if($gt=&get_macro_for_GT($char_id));
+               return $gt if($gt=&get_macro_for_GT($char));
            }elsif($out_cs eq 'Multi'){
                if($out_char_id=&get_char_attribute($char,'=ucs')){
                    return "\\UTFM{".sprintf("%X",$out_char_id)."}";
@@ -403,38 +431,58 @@ sub get_ids{
     return $ids;
 }
 
-sub get_char_id_for_ids{
+sub get_char_for_ids{
     # argument: <ideographic description sequence>
-    # return: char-id
+    # return: char or undef
     my($ids)=@_;
     my($output_char);
 
     if(($output_char)=&get_chars_matching("ids",$ids)){
-       return unpack("U",$output_char);
+       return $output_char;
     }else{
        return undef;
     }
 }
 
-sub get_char_id{
-    # argument: <char-id>, <input coding system>
-    # return:   char-id.
-    my($char_id,$in_cs)=@_;
+sub get_char_in_mcs{
+    # argument: <char>, <input coding system>
+    # return:   char in ucs@mcs.
+    my($char,$in_cs)=@_;
     my($output_char);
 
-    if(($output_char)=&get_chars_matching("=$in_cs",$char_id)){
-       return unpack("U",$output_char);
+    if(($output_char)=&get_chars_matching("=$in_cs",unpack("U",$char))){
+       return $output_char;
     }else{
-       return $char_id;
+       return $char;
+    }
+}
+
+sub get_chars_unified{
+    my($char)=@_;
+    my($chars,$ucs,$char_ucs);
+    my(@chars);
+
+    if($chars=&get_char_attribute($char,'->ucs-unified')){
+       $chars=~s/^\((.*)\)$/$1/;
+       return (split(/\s*\?/,$chars));
+    }elsif($ucs=&get_char_attribute($char,'=>ucs*')
+         or $ucs=&get_char_attribute($char,'=>ucs')){
+       $char_ucs=pack("U",$ucs);
+       if($chars=&get_char_attribute($char_ucs,'->ucs-unified')){
+           $chars=~s/^\((.*)\)$/$1/;
+           @chars=grep {not /^$char$/}
+               (split(/\s*\?/,$chars));
+           push(@chars,$char_ucs);
+           return @chars;
+       }
     }
 }
 
 sub get_macro_for_GT{
-    # argument: <char-id>
-    # return: TeX macro for GT fonts.
-    my($char_id)=@_;
-    my($char,$gt,$GT);
-    $char=pack("U",$char_id);
+    # argument: <char>
+    # return: TeX macro for GT fonts or undef.
+    my($char)=@_;
+    my($gt,$GT);
     foreach (@GT){
        if($gt=&get_char_attribute($char,$_)){
            m/gt\-pj\-(\d+)/ and $GT=$1;