add $opt_allow_unify option.
author imiyazaki <imiyazaki>
Thu, 6 Nov 2003 15:23:52 +0000 (15:23 +0000)
committer imiyazaki <imiyazaki>
Thu, 6 Nov 2003 15:23:52 +0000 (15:23 +0000)
inCHISE

diff --git a/inCHISE b/inCHISE
index 73ad012..2aeb780 100755 (executable)
--- a/inCHISE
+++ b/inCHISE
@@ -4,12 +4,13 @@
 
 use strict;
 use vars qw($omegadb_path
-           $opt_protrude %opt_order
+           $opt_protrude $opt_allow_unify
+           %opt_order %order %order_map
            $opt_in_cs $opt_out_cs
            $opt_help $usage
            $in_cs $out_cs $i @chars
-           %order %order_map
            $char $char_id $out_char
+           $char_id_unified @char_id_unified
            $ids $ids_argc %ids $idsdb
            $idsdata_file $ids_start $font_start
            @CDP @HZK @GT
@@ -31,6 +32,7 @@ $opt_order{'UniCNS'}='c';
 $opt_order{'UniJIS'}='j';
 $opt_order{'UniKS'}='k';
 
+$opt_allow_unify=0; # 1=true, 0=false.
 $opt_protrude=0;# 1=true, 0=false.
 
 ### End ###
@@ -100,7 +102,7 @@ $ids_start=0x00;
 $font_start=0;
 
 if(-e $idsdata_file){
-    open(IDSDATA,"+<$idsdata_file") or die;
+    open(IDSDATA,"+<:utf8",$idsdata_file) or die;
     flock(IDSDATA,LOCK_EX);
     seek(IDSDATA,0,0);
     while(<IDSDATA>){
@@ -109,7 +111,7 @@ if(-e $idsdata_file){
     seek(IDSDATA,0,0);
 #        require $idsdata_file;
 }else{
-    open(IDSDATA,">$idsdata_file") or die;
+    open(IDSDATA,">:utf8",$idsdata_file) or die;
     flock(IDSDATA,LOCK_EX);
     seek(IDSDATA,0,0);
 }
@@ -139,7 +141,7 @@ $ids="";
 
 foreach $out_cs ('UniGB','UniCNS','UniJIS','UniKS','UniMulti'){
     if(defined($opt_order{$out_cs})){
-       if($opt_order{$out_cs}=~/^[cgjkGHC]*$/){
+       if($opt_order{$out_cs}=~/^[cgjkGHC]+$/){
            @{$order{$out_cs}}=map {$order_map{$_}}
            (split(//,$opt_order{$out_cs}));
        }else{
@@ -157,13 +159,14 @@ while(<>){
     s/(amp.+?;)/&de_tex_er($1)/ge;
 #    s/(&.+?;)/&de_tex_er($1)/ge;
     @chars=split(//);
+  CHAR:
     for($i=0;$i<=$#chars;$i++){
        $char=$chars[$i];
        $char_id=unpack("U",$char);
 
        if($char_id<=0x20){
            print $chars[$i];
-           next;
+           next CHAR;
        }elsif($char_id>0x20 and $char_id<=0x02af){
            # Basic Latin
            # Latin-1 Supplement
@@ -171,11 +174,11 @@ while(<>){
            # Latin Extended-B
            # IPA Extensions
            print &latin_parse();
-           next;
+           next CHAR;
        }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
            # Ideographic Description Characters
            print &ids_parse();
-           next;
+           next CHAR;
        }else{
            if(($out_char=&get_output_char($char_id,$out_cs))){
                print $out_char,&add_break($i);
@@ -193,8 +196,20 @@ while(<>){
                print "{\\fontencoding{OT1}\\fontfamily{" .
                    sprintf("chise%03d",$ids{$char}[0]) .
                    "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
-               next;
+               next CHAR;
            }else{
+               if($opt_allow_unify){
+                   @char_id_unified=&get_char_id_unified($char_id);
+                   if(@char_id_unified>0){
+                       foreach $char_id_unified (@char_id_unified){
+                           if(($out_char
+                               =&get_output_char($char_id_unified,$out_cs))){
+                               print $out_char,&add_break($i);
+                               next CHAR;
+                           }
+                       }
+                   }
+               }
                if($ids=&get_ids($char)){
                    print &get_macro_for_ids($ids),&add_break($i);
                }else{
@@ -294,7 +309,7 @@ sub latin_parse{
     while($i<=$#chars){
        $char_id=unpack("U",$chars[$i]);
        if($char_id>0x20 and $char_id<=0x02af){
-           $out_str.=pack("U",$char_id);
+           $out_str.=$chars[$i];
        }else{
            $i--;
            last;
@@ -498,6 +513,29 @@ sub get_char_id{
     }
 }
 
+sub get_char_id_unified{
+    my($char_id)=@_;
+    my($char,$chars,$ucs);
+    my(@char_id);
+    $char=pack("U",$char_id);
+
+    if($chars=&get_char_attribute($char,'->ucs-unified')){
+       utf8::decode($chars);
+       $chars=~s/^\((.*)\)$/$1/;
+       return map {unpack("U",$_)} (split(/\s*\?/,$chars));
+    }elsif($ucs=&get_char_attribute(pack("U",$char_id),'=>ucs')){
+       if($chars=&get_char_attribute(pack("U",$ucs),'->ucs-unified')){
+           utf8::decode($chars);
+           $chars=~s/^\((.*)\)$/$1/;
+           @char_id=grep {$char_id!=$_}
+               map {unpack("U",$_)}
+                   (split(/\s*\?/,$chars));
+           push(@char_id,$ucs);
+           return @char_id;
+       }
+    }
+}
+
 sub get_macro_for_GT{
     # argument: <char-id>
     # return: TeX macro for GT fonts or undef.