add support for Utf8big5.
[chise/omega.git] / inCHISE
diff --git a/inCHISE b/inCHISE
index 59ea158..09cb0f8 100755 (executable)
--- a/inCHISE
+++ b/inCHISE
@@ -5,6 +5,7 @@
 use strict;
 use vars qw($omegadb_path
            $opt_protrude $opt_allow_unify
+           $opt_use_kage_for_Ext_B
            %opt_order %order %order_map
            $opt_in_cs $opt_out_cs
            $opt_help $usage
@@ -21,8 +22,6 @@ use Fcntl ':flock';
 use Chise_utils ':all';
 require 5.008;
 
-my $omegadb_path="/usr/local/lib/chise/omega";
-
 ### Options ###
 
 #$opt_order{'UniMulti'}='jcgk';
@@ -35,6 +34,9 @@ $opt_order{'UniKS'}='k';
 $opt_allow_unify=1; # 1=true, 0=false.
 $opt_protrude=0;# 1=true, 0=false.
 
+# Render CJK Extension B characters via KAGE. Currently does not work.
+$opt_use_kage_for_Ext_B=0;# 1=true, 0=false.
+
 ### End ###
 
 my $strictly_forbidden_after = '「【『[(〈“‘‘(〔{《{\[\(\x{3016}{「';
@@ -62,6 +64,18 @@ my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F
 
 my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}';
 
+my %tex_meta=('#'=>'\#', # TeX special character => replacement text
+             '$'=>'\\textdollar{}',
+             '%'=>'\%',
+             '&'=>'\&',
+             '{'=>'\\textbraceleft{}',
+             '}'=>'\\textbraceright{}',
+             '\\'=>'\\textbackslash{}',
+             '_'=>'\\textunderscore{}', # {} ends the control word so following whitespace is kept
+            );
+
+my $tex_meta_re=join('|',map {quotemeta($_)} keys %tex_meta); # alternation of the quoted keys
+
 &GetOptions("in=s"=>\$opt_in_cs,
            "out=s"=>\$opt_out_cs,
            "help",\$opt_help);
@@ -69,7 +83,7 @@ my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}';
 $usage=<<EOF;
 Usage: $0 -i <input coding system> -o <cmap encoding>
     input coding system:
-      Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
+      Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks, Utf8big5
     cmap encoding:
       UniCNS, UniGB, UniJIS, UniKS, UniMulti
 EOF
@@ -82,7 +96,7 @@ if($opt_in_cs or $opt_out_cs){
 }
 
 # $in_cs:
-#   Utf8mcs,Utf8cns,Utf8gb,Utf8jis,Utf8ks,
+#   Utf8mcs,Utf8cns,Utf8gb,Utf8jis,Utf8ks,Utf8big5
 # $out_cs:
 #   UniCNS,UniGB,UniJIS,UniKS,UniMulti
 
@@ -119,8 +133,8 @@ if(-e $idsdata_file){
 $ids_argc=0;
 $ids="";
 
-$geta=pack("U",0x3013);
-#$geta=pack("U",0xfffd);
+#$geta=pack("U",0x3013);
+$geta=pack("U",0xfffd);
 
 @GT=(#"=gt","=gt-k",
      "=gt-pj-1","=gt-pj-2","=gt-pj-3","=gt-pj-4","=gt-pj-5",
@@ -170,6 +184,9 @@ while(<>){
        if($char_id<=0x20){
            print $chars[$i];
            next CHAR;
+       }elsif($char=~m/($tex_meta_re)/o){
+           print $tex_meta{$1};
+           next CHAR;
        }elsif($char_id>0x20 and $char_id<=0x02af){
            # Basic Latin
            # Latin-1 Supplement
@@ -185,21 +202,6 @@ while(<>){
        }else{
            if(($out_char=&get_output_char($char,$out_cs))){
                print $out_char,&add_break($i);
-           }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
-               # CJK Unified Ideographs Extension B
-               if(not defined($ids{$char}) and $ids{$char}[1]>=0){
-                   $ids{$char}[0]=$font_start;
-                   $ids{$char}[1]=$ids_start;
-                   $ids_start++;
-                   if($ids_start>255){
-                       $ids_start=0;
-                       $font_start++;
-                   }
-               }
-               print "{\\fontencoding{OT1}\\fontfamily{" .
-                   sprintf("chise%03d",$ids{$char}[0]) .
-                   "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
-               next CHAR;
            }else{
                if($opt_allow_unify){
                    @chars_unified=&get_chars_unified($char);
@@ -213,6 +215,24 @@ while(<>){
                        }
                    }
                }
+               if($opt_use_kage_for_Ext_B){
+                   if($char_id >= 0x20000 && $char_id <=0x2a6df){
+                       # CJK Unified Ideographs Extension B: emit via a chiseNNN font slot.
+                       if(not defined($ids{$char})){ # first occurrence: assign (font, slot)
+                           $ids{$char}[0]=$font_start;
+                           $ids{$char}[1]=$ids_start;
+                           $ids_start++;
+                           if($ids_start>255){ # slots 0..255 used up: continue in next font
+                               $ids_start=0;
+                               $font_start++;
+                           }
+                       }
+                       print "{\\fontencoding{OT1}\\fontfamily{" .
+                           sprintf("chise%03d",$ids{$char}[0]) .
+                               "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
+                       next CHAR;
+                   }
+               }
                if($ids=&get_ids($char)){
                    print &get_macro_for_ids($ids),&add_break($i);
                }else{
@@ -318,7 +338,7 @@ sub latin_parse{
        }
        $i++;
     }
-    return '{\normalfont {'.$out_str.'}}';
+    return '{\fontencoding{UT1}\fontfamily{omlgc}\selectfont '.$out_str.'}';
 }
 
 sub ids_parse{
@@ -368,11 +388,11 @@ sub get_macro_for_ids{
     # return: TeX macro for ids
     #          or GETA character if ids is invalid for KAGE.
     my($ids)=@_;
-    $ids=&normalize_ids($ids,"UniJIS");
+    # $ids=&normalize_ids($ids,"UniJIS");
     return $geta if(($ids!~/[$idc]/)
                    or($ids=~/[\x{10000}-]/));
                     #irregular for KAGE.
-    if(not defined($ids{$ids}) and $ids{$ids}[1]>=0){
+    if(not defined($ids{$ids})){
        $ids{$ids}[0]=$font_start;
        $ids{$ids}[1]=$ids_start;
        $ids_start++;
@@ -398,13 +418,12 @@ sub normalize_ids{
        $char=$1;
        if($char=~/[$idc]/){
            $output_ids.=$char;
-       }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")){
-           $output_ids.=pack("U",$output_char_id);
-       }elsif($output_char_id=&get_char_attribute($char,"=ucs")){
-           $output_ids.=pack("U",$output_char_id);
-       }elsif($output_char_id=&get_char_attribute($char,"=>$out_cs")){
-           $output_ids.=pack("U",$output_char_id);
-       }elsif($output_char_id=&get_char_attribute($char,"=>ucs")){
+       }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")
+          or $output_char_id=&get_char_attribute($char,"=ucs")
+          or $output_char_id=&get_char_attribute($char,"=>$out_cs")
+          or $output_char_id=&get_char_attribute($char,"=>ucs")
+          or $output_char_id=&get_char_attribute($char,"=>ucs*")
+             ){
            $output_ids.=pack("U",$output_char_id);
        }else{
            return $geta;
@@ -427,6 +446,7 @@ sub get_output_char{
               or $out_char_id=&get_char_attribute($char,'=ucs')
               or $out_char_id=&get_char_attribute($char,'=>ucs@jis')
               or $out_char_id=&get_char_attribute($char,'=>ucs')
+              or $out_char_id=&get_char_attribute($char,'=>ucs*')
               ){
                return '{\selectjisfont\char'.$out_char_id.'}';
            }
@@ -436,6 +456,7 @@ sub get_output_char{
               or $out_char_id=&get_char_attribute($char,'=ucs')
               or $out_char_id=&get_char_attribute($char,'=>ucs@gb')
               or $out_char_id=&get_char_attribute($char,'=>ucs')
+              or $out_char_id=&get_char_attribute($char,'=>ucs*')
               ){
                return '{\selectgbsfont\char'.$out_char_id.'}';
            }
@@ -445,6 +466,7 @@ sub get_output_char{
               or $out_char_id=&get_char_attribute($char,'=ucs')
               or $out_char_id=&get_char_attribute($char,'=>ucs@cns')
               or $out_char_id=&get_char_attribute($char,'=>ucs')
+              or $out_char_id=&get_char_attribute($char,'=>ucs*')
               ){
                return '{\selectcnsfont\char'.$out_char_id.'}';
            }
@@ -453,8 +475,8 @@ sub get_output_char{
            if($out_char_id=&get_char_attribute($char,'=ucs@ks')
               or $out_char_id=&get_char_attribute($char,'=ucs')
               or $out_char_id=&get_char_attribute($char,'=>ucs@ks')
-              or $out_char_id=&get_char_attribute($char,'=>ucs*')
               or $out_char_id=&get_char_attribute($char,'=>ucs')
+              or $out_char_id=&get_char_attribute($char,'=>ucs*')
               ){
                return '{\selectksxfont\char'.$out_char_id.'}';
            }
@@ -476,6 +498,8 @@ sub get_ids{
     my $ids="";
     $ids=&get_char_attribute($char,"ids-aggregated")
        or $ids=&get_char_attribute($char,"ids");
+#    $ids=&get_char_attribute($char,"ids-decomposed")
+#      or $ids=&get_char_attribute($char,"ids");
 #        or $ids=&get_char_attribute($char,"ideographic-structure");
 #    $ids=~s/[? ()]//g;
     return $ids;