From: imiyazaki <imiyazaki>
Date: Thu, 6 Nov 2003 15:23:52 +0000 (+0000)
Subject: add $allow_unify option.
X-Git-Url: http://git.chise.org/gitweb/?a=commitdiff_plain;ds=sidebyside;h=74c76de92eb4a85cf65bff8e8047a40e2d6522e0;p=chise%2Fomega.git

add $allow_unify option.
---

diff --git a/inCHISE b/inCHISE
index 73ad012..2aeb780 100755
--- a/inCHISE
+++ b/inCHISE
@@ -4,12 +4,13 @@
 
 use strict;
 use vars qw($omegadb_path
-	    $opt_protrude %opt_order
+	    $opt_protrude $opt_allow_unify
+	    %opt_order %order %order_map
 	    $opt_in_cs $opt_out_cs
 	    $opt_help $usage
 	    $in_cs $out_cs $i @chars
-	    %order %order_map
 	    $char $char_id $out_char
+	    $char_id_unified @char_id_unified
 	    $ids $ids_argc %ids $idsdb
 	    $idsdata_file $ids_start $font_start
 	    @CDP @HZK @GT
@@ -31,6 +32,7 @@ $opt_order{'UniCNS'}='c';
 $opt_order{'UniJIS'}='j';
 $opt_order{'UniKS'}='k';
 
+$opt_allow_unify=0; # 1=true, 0=false.
 $opt_protrude=0;# 1=true, 0=false.
 
 ### End ###
@@ -100,7 +102,7 @@ $ids_start=0x00;
 $font_start=0;
 
 if(-e $idsdata_file){
-    open(IDSDATA,"+<$idsdata_file") or die;
+    open(IDSDATA,"+<:utf8",$idsdata_file) or die;
     flock(IDSDATA,LOCK_EX);
     seek(IDSDATA,0,0);
     while(<IDSDATA>){
@@ -109,7 +111,7 @@ if(-e $idsdata_file){
     seek(IDSDATA,0,0);
 #	  require $idsdata_file;
 }else{
-    open(IDSDATA,">$idsdata_file") or die;
+    open(IDSDATA,">:utf8",$idsdata_file) or die;
     flock(IDSDATA,LOCK_EX);
     seek(IDSDATA,0,0);
 }
@@ -139,7 +141,7 @@ $ids="";
 
 foreach $out_cs ('UniGB','UniCNS','UniJIS','UniKS','UniMulti'){
     if(defined($opt_order{$out_cs})){
-	if($opt_order{$out_cs}=~/^[cgjkGHC]*$/){
+	if($opt_order{$out_cs}=~/^[cgjkGHC]+$/){
 	    @{$order{$out_cs}}=map {$order_map{$_}}
 	    (split(//,$opt_order{$out_cs}));
 	}else{
@@ -157,13 +159,14 @@ while(<>){
     s/(amp.+?;)/&de_tex_er($1)/ge;
 #    s/(&.+?;)/&de_tex_er($1)/ge;
     @chars=split(//);
+  CHAR:
     for($i=0;$i<=$#chars;$i++){
 	$char=$chars[$i];
 	$char_id=unpack("U",$char);
 
 	if($char_id<=0x20){
 	    print $chars[$i];
-	    next;
+	    next CHAR;
 	}elsif($char_id>0x20 and $char_id<=0x02af){
 	    # Basic Latin
 	    # Latin-1 Supplement
@@ -171,11 +174,11 @@ while(<>){
 	    # Latin Extended-B
 	    # IPA Extensions
 	    print &latin_parse();
-	    next;
+	    next CHAR;
 	}elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
 	    # Ideographic Description Characters
 	    print &ids_parse();
-	    next;
+	    next CHAR;
 	}else{
 	    if(($out_char=&get_output_char($char_id,$out_cs))){
 		print $out_char,&add_break($i);
@@ -193,8 +196,20 @@ while(<>){
 		print "{\\fontencoding{OT1}\\fontfamily{" .
 		    sprintf("chise%03d",$ids{$char}[0]) .
 		    "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
-		next;
+		next CHAR;
 	    }else{
+		if($opt_allow_unify){
+		    @char_id_unified=&get_char_id_unified($char_id);
+		    if(@char_id_unified>0){
+			foreach $char_id_unified (@char_id_unified){
+			    if(($out_char
+				=&get_output_char($char_id_unified,$out_cs))){
+				print $out_char,&add_break($i);
+				next CHAR;
+			    }
+			}
+		    }
+		}
 		if($ids=&get_ids($char)){
 		    print &get_macro_for_ids($ids),&add_break($i);
 		}else{
@@ -294,7 +309,7 @@ sub latin_parse{
     while($i<=$#chars){
 	$char_id=unpack("U",$chars[$i]);
 	if($char_id>0x20 and $char_id<=0x02af){
-	    $out_str.=pack("U",$char_id);
+	    $out_str.=$chars[$i];
 	}else{
 	    $i--;
 	    last;
@@ -498,6 +513,29 @@ sub get_char_id{
     }
 }
 
+sub get_char_id_unified{
+    # argument: <char-id>
+    # return: list of char-ids unified with <char-id>, or empty list.
+    my($char_id)=@_;
+    my($char,$chars,$ucs,@char_id);
+    $char=pack("U",$char_id);
+    if($chars=&get_char_attribute($char,'->ucs-unified')){
+	utf8::decode($chars);
+	$chars=~s/^\((.*)\)$/$1/;
+	return map {unpack("U",$_)} (split(/\s*\?/,$chars));
+    }elsif($ucs=&get_char_attribute($char,'=>ucs')){
+	if($chars=&get_char_attribute(pack("U",$ucs),'->ucs-unified')){
+	    utf8::decode($chars);
+	    $chars=~s/^\((.*)\)$/$1/;
+	    @char_id=grep {$char_id!=$_} map {unpack("U",$_)}
+		(split(/\s*\?/,$chars));
+	    push(@char_id,$ucs);
+	    return @char_id;
+	}
+    }
+    return; # nothing found: empty list, not the false test value.
+}
+
 sub get_macro_for_GT{
     # argument: <char-id>
     # return: TeX macro for GT fonts or undef.