From 2bc524534de15391ee48eb0ecd135ade042dffd0 Mon Sep 17 00:00:00 2001
From: imiyazaki <imiyazaki>
Date: Wed, 19 Nov 2003 15:10:59 +0000
Subject: [PATCH] add $opt_allow_unification. simplify codes.

---
 chise2otf/chise2otf |  162 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 105 insertions(+), 57 deletions(-)
diff --git a/chise2otf/chise2otf b/chise2otf/chise2otf
index 65fef19..d31c13d 100755
--- a/chise2otf/chise2otf
+++ b/chise2otf/chise2otf
@@ -2,11 +2,13 @@
 
 use strict;
 use vars qw($opt_in_cs $opt_order $opt_kage $opt_replace
+	    $opt_use_kage_for_Ext_B $opt_allow_unification
 	    $opt_help $usage
 	    $in_cs $out_cs $i @chars
 	    @order $order %order
 	    @texmacro
 	    $char $char_id $out_char
+	    $char_unified @chars_unified
 	    $ids $ids_argc %ids $idsdb
 	    $idsdata_file $ids_start $font_start
 	    $perl56 $perl58
@@ -21,6 +23,9 @@ require 5.008;
 my $omegadb_path="/usr/local/lib/chise/omega";
 $omegadb_path=~s!/$!!;
 
+# currently does not work, so...
+$opt_use_kage_for_Ext_B=0;
+
 my $makefonts="$omegadb_path/makefonts.pl";
 my $exec_makefonts=0;
 my $geta=pack("S",8750|0x8080);
@@ -29,13 +34,14 @@ my $geta=pack("S",8750|0x8080);
 	    "order=s"=>\$opt_order,
 	    "replace",\$opt_replace,
 	    "kage",\$opt_kage,
+	    "unify",\$opt_allow_unification,
 	    "help",\$opt_help);
 
 $usage=<<EOF;
-Usage: $0 [-i <input coding system>] [-o <order of kanji>] [-k] <filename>
-    input coding system: (default: ucs\@mcs)
-      ucs\@mcs, ucs\@cns, ucs\@gb, ucs\@jis, ucs\@ks
-    order of kanji: (default: j)
+Usage: $0 [-i <input coding system>] [-o <order of kanji>] [-kru] <filename>
+    -i: input coding system: (default: ucs\@mcs)
+       ucs\@mcs, ucs\@cns, ucs\@gb, ucs\@jis, ucs\@ks
+    -o: order of kanji: (default: j)
        c: CNS
        g: GB
        j: JIS
@@ -43,8 +49,9 @@ Usage: $0 [-i <input coding system>] [-o <order of kanji>] [-k] <filename>
        G: GT
        m: Multi, use \\UTFM of otf.sty
       You can also combine them, ex. jtcgkm
-    k: use Kage server.
-    r: replace r and l with dot below to those with circle below.
+    -k: use Kage server.
+    -r: replace r and l with dot below to those with circle below.
+    -u: allow unification.
 EOF
 
 %order=('c'=>'UniCNS',
@@ -144,11 +151,12 @@ $ids="";
 
 while(<>){
     if($in_cs ne 'ucs@mcs'){
-	s/(.)/pack("U",&get_char_id(unpack("U",$1),$in_cs))/ge;
+	s/(.)/&get_char_in_mcs($1,$in_cs)/ge;
     }
     s/((?:^|[^\\])(?:\\\\)*)(&.*?;)/&de_tex_er($1,$2)/ge;
     s/((?:^|[^\\])(?:\\\\)*)\\([$idc])/$1.'\UTFM{'.sprintf("%X",unpack("U",$2)).'}'/ge;
     @chars=split(//);
+  CHAR:
     for($i=0;$i<=$#chars;$i++){
 	$char=$chars[$i];
 	$char_id=unpack("U",$chars[$i]);
@@ -156,38 +164,57 @@ while(<>){
 	if($char_id<=0x7f){
 	    # Basic Latin
 	    print $char;
-	    next;
+	    next CHAR;
 	}elsif(defined($texmacro[$char_id]) and $texmacro[$char_id]){
 	    # already defined for 
 	    # 0080..00FF; Latin-1 Supplement
 	    # 0100..017F; Latin Extended-A
 	    # 1E00..1EFF; Latin Extended Additional
 	    print $texmacro[$char_id];
-	    next;
+	    next CHAR;
 	}elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
 	    # Ideographic Description Characters
 	    print &ids_parse();
-	    next;
+	    next CHAR;
 	}else{
-	    if(($out_char=&get_output_char($char_id))){
+	    if(($out_char=&get_output_char($char))){
 		print $out_char;
-	    }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
-		# CJK Unified Ideographs Extension B
-		if(not defined($ids{$char}) and $ids{$char}[1]>=0){
-		    $ids{$char}[0]=$font_start;
-		    $ids{$char}[1]=$ids_start;
-		    $ids_start++;
-		    if($ids_start>255){
-			$ids_start=0;
-			$font_start++;
+		next CHAR;
+	    }else{
+		if($opt_allow_unification){
+		    @chars_unified=&get_chars_unified($char);
+		    if(@chars_unified>0){
+			foreach $char_unified (@chars_unified){
+			    if(($out_char=&get_output_char($char_unified))){
+				print $out_char;
+				next CHAR;
+			    }
+			}
 		    }
 		}
-		print "{\\fontencoding{OT1}\\fontfamily{" .
-		    sprintf("chise%03d",$ids{$char}[0]) .
-		    "}\\selectfont\\char$ids{$char}[1]}";
-		next;
-	    }else{
-		print &get_macro_for_ids(&get_ids($char));
+		if($opt_use_kage_for_Ext_B
+		   and $char_id >= 0x20000 and $char_id <=0x2a6df){
+		    # CJK Unified Ideographs Extension B
+		    if(not defined($ids{$char}) and $ids{$char}[1]>=0){
+			$ids{$char}[0]=$font_start;
+			$ids{$char}[1]=$ids_start;
+			$ids_start++;
+			if($ids_start>255){
+			    $ids_start=0;
+			    $font_start++;
+			}
+		    }
+		    print "{\\fontencoding{OT1}\\fontfamily{" .
+			sprintf("chise%03d",$ids{$char}[0]) .
+			    "}\\selectfont\\char$ids{$char}[1]}";
+		    next CHAR;
+		}
+		if($ids=&get_ids($char)){
+		    print &get_macro_for_ids($ids);
+		    next CHAR;
+		}else{
+		    print $geta;
+		}
 	    }
 	}
     }
@@ -229,7 +256,6 @@ sub de_tex_er{
 	    $value=hex($value);
 	}
 	($output_char)=&get_chars_matching($er_alias{$atr},$value);
-#	utf8::decode($output_char);
     }
     if($output_char){
 	return $before_er.$output_char;
@@ -255,8 +281,8 @@ sub ids_parse{
 
 	($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$chars[$i]);
 	if($ids_argc==0){
-	    if(($char_id=&get_char_id_for_ids($ids))
-	       and($out_char=&get_output_char($char_id))){
+	    if(($char=&get_char_for_ids($ids))
+	       and($out_char=&get_output_char($char))){
 		return $out_char;
 	    }else{
 		return &get_macro_for_ids($ids);
@@ -277,8 +303,6 @@ sub ids_rest{
 	$ids_argc--;
     }
     $ids.=$char;
-#    $ids.=$char if($perl56);
-#    $ids.=encode('utf8',$char) if($perl58);
     return ($ids,$ids_argc);
 }
 
@@ -313,15 +337,17 @@ sub normalize_ids{
     $out_cs=~s/Uni(.+)/'ucs@'.lc($1)/e;
 
     my $output_ids="";
-    my($char,$char_id,$output_char_id);
+    my($char,$output_char_id);
     while($ids=~m/(.)/g){
 	$char=$1;
-	$char_id=unpack("U",$char);
 	if($char=~/[$idc]/){
 	    $output_ids.=$char;
-	}elsif($output_char_id=&get_char_attribute($char,"=$out_cs")){
-	    $output_ids.=pack("U",$output_char_id);
-	}elsif($output_char_id=&get_char_attribute($char,"=ucs")){
+	}elsif($output_char_id=&get_char_attribute($char,"=$out_cs")
+	       or $output_char_id=&get_char_attribute($char,"=ucs")
+	       or $output_char_id=&get_char_attribute($char,"=>$out_cs")
+	       or $output_char_id=&get_char_attribute($char,"=>ucs")
+	       or $output_char_id=&get_char_attribute($char,"=>ucs*")
+	      ){
 	    $output_ids.=pack("U",$output_char_id);
 	}else{
 	    return $geta;
@@ -331,13 +357,11 @@ sub normalize_ids{
 }
 
 sub get_output_char{
-    # argument: <char-id>
+    # argument: <char>
     # return: character in EUC-JP or TeX macro for pTeX.
-    my($char_id)=@_;
-    my($char,$out_char,$out_char_id,$gt);
+    my($char)=@_;
+    my($out_char,$out_char_id,$gt);
     
-    $char=pack('U',$char_id);
-
     if($out_char=&get_char_attribute($char,'=jis-x0208')
        or $out_char=&get_char_attribute($char,'=jis-x0208-1983')
        or $out_char=&get_char_attribute($char,'=jis-x0208-1990')
@@ -351,6 +375,7 @@ sub get_output_char{
 		   or $out_char_id=&get_char_attribute($char,'=ucs')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs@jis')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs')
+		   or $out_char_id=&get_char_attribute($char,'=>ucs*')
 		   ){
 		    return "\\UTF{".sprintf("%X",$out_char_id)."}";
 		}
@@ -360,6 +385,7 @@ sub get_output_char{
 		   or $out_char_id=&get_char_attribute($char,'=ucs')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs@gb')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs')
+		   or $out_char_id=&get_char_attribute($char,'=>ucs*')
 		   ){
 		    return "\\UTFC{".sprintf("%X",$out_char_id)."}";
 		}
@@ -369,6 +395,7 @@ sub get_output_char{
 		   or $out_char_id=&get_char_attribute($char,'=ucs')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs@cns')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs')
+		   or $out_char_id=&get_char_attribute($char,'=>ucs*')
 		   ){
 		    return "\\UTFT{".sprintf("%X",$out_char_id)."}";
 		}
@@ -378,11 +405,12 @@ sub get_output_char{
 		   or $out_char_id=&get_char_attribute($char,'=ucs')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs@ks')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs')
+		   or $out_char_id=&get_char_attribute($char,'=>ucs*')
 		   ){
 		    return "\\UTFK{".sprintf("%X",$out_char_id)."}";
 		}
 	    }elsif($out_cs eq 'GT'){
-		return $gt if($gt=&get_macro_for_GT($char_id));
+		return $gt if($gt=&get_macro_for_GT($char));
 	    }elsif($out_cs eq 'Multi'){
 		if($out_char_id=&get_char_attribute($char,'=ucs')){
 		    return "\\UTFM{".sprintf("%X",$out_char_id)."}";
@@ -403,38 +431,58 @@ sub get_ids{
     return $ids;
 }
 
-sub get_char_id_for_ids{
+sub get_char_for_ids{
     # argument: <ideographic description sequence>
-    # return: char-id
+    # return: char or undef
     my($ids)=@_;
     my($output_char);
 
     if(($output_char)=&get_chars_matching("ids",$ids)){
-	return unpack("U",$output_char);
+	return $output_char;
     }else{
 	return undef;
     }
 }
 
-sub get_char_id{
-    # argument: <char-id>, <input coding system>
-    # return:   char-id.
-    my($char_id,$in_cs)=@_;
+sub get_char_in_mcs{
+    # argument: <char>, <input coding system>
+    # return:   char in ucs@mcs.
+    my($char,$in_cs)=@_;
     my($output_char);
 
-    if(($output_char)=&get_chars_matching("=$in_cs",$char_id)){
-	return unpack("U",$output_char);
+    if(($output_char)=&get_chars_matching("=$in_cs",unpack("U",$char))){
+	return $output_char;
     }else{
-	return $char_id;
+	return $char;
+    }
+}
+
+sub get_chars_unified{
+    my($char)=@_;
+    my($chars,$ucs,$char_ucs);
+    my(@chars);
+
+    if($chars=&get_char_attribute($char,'->ucs-unified')){
+	$chars=~s/^\((.*)\)$/$1/;
+	return (split(/\s*\?/,$chars));
+    }elsif($ucs=&get_char_attribute($char,'=>ucs*')
+	  or $ucs=&get_char_attribute($char,'=>ucs')){
+	$char_ucs=pack("U",$ucs);
+	if($chars=&get_char_attribute($char_ucs,'->ucs-unified')){
+	    $chars=~s/^\((.*)\)$/$1/;
+	    @chars=grep {not /^$char$/}
+		(split(/\s*\?/,$chars));
+	    push(@chars,$char_ucs);
+	    return @chars;
+	}
     }
 }
 
 sub get_macro_for_GT{
-    # argument: <char-id>
-    # return: TeX macro for GT fonts.
-    my($char_id)=@_;
-    my($char,$gt,$GT);
-    $char=pack("U",$char_id);
+    # argument: <char>
+    # return: TeX macro for GT fonts or undef.
+    my($char)=@_;
+    my($gt,$GT);
     foreach (@GT){
 	if($gt=&get_char_attribute($char,$_)){
 	    m/gt\-pj\-(\d+)/ and $GT=$1;
-- 
1.7.10.4