Add support for the Utf8big5 input coding system.

[chise/omega.git] / inCHISE
diff --git a/inCHISE b/inCHISE

index 59ea158..09cb0f8 100755 (executable)
--- a/inCHISE
+++ b/inCHISE
@@ -5,6 +5,7 @@
  use strict;
  use vars qw($omegadb_path
             $opt_protrude $opt_allow_unify
+           $opt_use_kage_for_Ext_B
             %opt_order %order %order_map
             $opt_in_cs $opt_out_cs
             $opt_help $usage
@@ -21,8 +22,6 @@ use Fcntl ':flock';
  use Chise_utils ':all';
  require 5.008;
  
-my $omegadb_path="/usr/local/lib/chise/omega";
-
  ### Options ###
  
  #$opt_order{'UniMulti'}='jcgk';
@@ -35,6 +34,9 @@ $opt_order{'UniKS'}='k';
  $opt_allow_unify=1; # 1=true, 0=false.
  $opt_protrude=0;# 1=true, 0=false.
  
+# KAGE output for CJK Extension B: currently does not work, so it is off by default.
+$opt_use_kage_for_Ext_B=0;# 1=true, 0=false.
+
  ### End ###
  
  my $strictly_forbidden_after = '「【『［（〈“‘‘（〔｛《{\[\(\x{3016}｛｢';
@@ -62,6 +64,18 @@ my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F
  
  my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}';
  
+my %tex_meta=('#'=>'\#',
+             '$'=>'\\textdollar{}',
+             '%'=>'\%',
+             '&'=>'\&',
+             '{'=>'\\textbraceleft{}',
+             '}'=>'\\textbraceright{}',
+             '\\'=>'\\textbackslash{}',
+             '_'=>'\\textunderscore',
+            );
+
+my $tex_meta_re=join('|',map {quotemeta($_)} keys %tex_meta);
+
  &GetOptions("in=s"=>\$opt_in_cs,
             "out=s"=>\$opt_out_cs,
             "help",\$opt_help);
@@ -69,7 +83,7 @@ my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}';
  $usage=<<EOF;
  Usage: $0 -i <input coding system> -o <cmap encoding>
      input coding system:
-      Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
+      Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks, Utf8big5
      cmap encoding:
        UniCNS, UniGB, UniJIS, UniKS, UniMulti
  EOF
@@ -82,7 +96,7 @@ if($opt_in_cs or $opt_out_cs){
  }
  
  # $in_cs:
-#   Utf8mcs,Utf8cns,Utf8gb,Utf8jis,Utf8ks,
+#   Utf8mcs,Utf8cns,Utf8gb,Utf8jis,Utf8ks,Utf8big5
  # $out_cs:
  #   UniCNS,UniGB,UniJIS,UniKS,UniMulti
  
@@ -119,8 +133,8 @@ if(-e $idsdata_file){
  $ids_argc=0;
  $ids="";
  
-$geta=pack("U",0x3013);
-#$geta=pack("U",0xfffd);
+#$geta=pack("U",0x3013);
+$geta=pack("U",0xfffd);
  
  @GT=(#"=gt","=gt-k",
       "=gt-pj-1","=gt-pj-2","=gt-pj-3","=gt-pj-4","=gt-pj-5",
@@ -170,6 +184,9 @@ while(<>){
         if($char_id<=0x20){
             print $chars[$i];
             next CHAR;
+       }elsif($char=~m/($tex_meta_re)/o){
+           print $tex_meta{$1};
+           next CHAR;
         }elsif($char_id>0x20 and $char_id<=0x02af){
             # Basic Latin
             # Latin-1 Supplement
@@ -185,21 +202,6 @@ while(<>){
         }else{
             if(($out_char=&get_output_char($char,$out_cs))){
                 print $out_char,&add_break($i);
-           }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
-               # CJK Unified Ideographs Extension B
-               if(not defined($ids{$char}) and $ids{$char}[1]>=0){
-                   $ids{$char}[0]=$font_start;
-                   $ids{$char}[1]=$ids_start;
-                   $ids_start++;
-                   if($ids_start>255){
-                       $ids_start=0;
-                       $font_start++;
-                   }
-               }
-               print "{\\fontencoding{OT1}\\fontfamily{" .
-                   sprintf("chise%03d",$ids{$char}[0]) .
-                   "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
-               next CHAR;
             }else{
                 if($opt_allow_unify){
                     @chars_unified=&get_chars_unified($char);
@@ -213,6 +215,24 @@ while(<>){
                         }
                     }
                 }
+               if($opt_use_kage_for_Ext_B){
+                   if($char_id >= 0x20000 && $char_id <=0x2a6df){
+                       # CJK Unified Ideographs Extension B
+                       if(not defined($ids{$char}) and $ids{$char}[1]>=0){
+                           $ids{$char}[0]=$font_start;
+                           $ids{$char}[1]=$ids_start;
+                           $ids_start++;
+                           if($ids_start>255){
+                               $ids_start=0;
+                               $font_start++;
+                           }
+                       }
+                       print "{\\fontencoding{OT1}\\fontfamily{" .
+                           sprintf("chise%03d",$ids{$char}[0]) .
+                               "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
+                       next CHAR;
+                   }
+               }
                 if($ids=&get_ids($char)){
                     print &get_macro_for_ids($ids),&add_break($i);
                 }else{
@@ -318,7 +338,7 @@ sub latin_parse{
         }
         $i++;
      }
-    return '{\normalfont {'.$out_str.'}}';
+    return '{\fontencoding{UT1}\fontfamily{omlgc}\selectfont '.$out_str.'}';
  }
  
  sub ids_parse{
@@ -368,11 +388,11 @@ sub get_macro_for_ids{
      # return: TeX macro for ids
      #          or GETA character if ids is invalid for KAGE.
      my($ids)=@_;
-    $ids=&normalize_ids($ids,"UniJIS");
+    # $ids=&normalize_ids($ids,"UniJIS");
      return $geta if(($ids!~/[$idc]/)
                     or($ids=~/[\x{10000}-]/));
                      #irregular for KAGE.
-    if(not defined($ids{$ids}) and $ids{$ids}[1]>=0){
+    if(not defined($ids{$ids})){
         $ids{$ids}[0]=$font_start;
         $ids{$ids}[1]=$ids_start;
         $ids_start++;
@@ -398,13 +418,12 @@ sub normalize_ids{
         $char=$1;
         if($char=~/[$idc]/){
             $output_ids.=$char;
-       }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")){
-           $output_ids.=pack("U",$output_char_id);
-       }elsif($output_char_id=&get_char_attribute($char,"=ucs")){
-           $output_ids.=pack("U",$output_char_id);
-       }elsif($output_char_id=&get_char_attribute($char,"=>$out_cs")){
-           $output_ids.=pack("U",$output_char_id);
-       }elsif($output_char_id=&get_char_attribute($char,"=>ucs")){
+       }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")
+          or $output_char_id=&get_char_attribute($char,"=ucs")
+          or $output_char_id=&get_char_attribute($char,"=>$out_cs")
+          or $output_char_id=&get_char_attribute($char,"=>ucs")
+          or $output_char_id=&get_char_attribute($char,"=>ucs*")
+             ){
             $output_ids.=pack("U",$output_char_id);
         }else{
             return $geta;
@@ -427,6 +446,7 @@ sub get_output_char{
                or $out_char_id=&get_char_attribute($char,'=ucs')
                or $out_char_id=&get_char_attribute($char,'=>ucs@jis')
                or $out_char_id=&get_char_attribute($char,'=>ucs')
+              or $out_char_id=&get_char_attribute($char,'=>ucs*')
                ){
                 return '{\selectjisfont\char'.$out_char_id.'}';
             }
@@ -436,6 +456,7 @@ sub get_output_char{
                or $out_char_id=&get_char_attribute($char,'=ucs')
                or $out_char_id=&get_char_attribute($char,'=>ucs@gb')
                or $out_char_id=&get_char_attribute($char,'=>ucs')
+              or $out_char_id=&get_char_attribute($char,'=>ucs*')
                ){
                 return '{\selectgbsfont\char'.$out_char_id.'}';
             }
@@ -445,6 +466,7 @@ sub get_output_char{
                or $out_char_id=&get_char_attribute($char,'=ucs')
                or $out_char_id=&get_char_attribute($char,'=>ucs@cns')
                or $out_char_id=&get_char_attribute($char,'=>ucs')
+              or $out_char_id=&get_char_attribute($char,'=>ucs*')
                ){
                 return '{\selectcnsfont\char'.$out_char_id.'}';
             }
@@ -453,8 +475,8 @@ sub get_output_char{
             if($out_char_id=&get_char_attribute($char,'=ucs@ks')
                or $out_char_id=&get_char_attribute($char,'=ucs')
                or $out_char_id=&get_char_attribute($char,'=>ucs@ks')
-              or $out_char_id=&get_char_attribute($char,'=>ucs*')
                or $out_char_id=&get_char_attribute($char,'=>ucs')
+              or $out_char_id=&get_char_attribute($char,'=>ucs*')
                ){
                 return '{\selectksxfont\char'.$out_char_id.'}';
             }
@@ -476,6 +498,8 @@ sub get_ids{
      my $ids="";
      $ids=&get_char_attribute($char,"ids-aggregated")
         or $ids=&get_char_attribute($char,"ids");
+#    $ids=&get_char_attribute($char,"ids-decomposed")
+#      or $ids=&get_char_attribute($char,"ids");
  #        or $ids=&get_char_attribute($char,"ideographic-structure");
  #    $ids=~s/[? ()]//g;
      return $ids;