From: imiyazaki Date: Thu, 6 Nov 2003 15:23:52 +0000 (+0000) Subject: add $allow_unify option. X-Git-Url: http://git.chise.org/gitweb/?a=commitdiff_plain;ds=sidebyside;h=74c76de92eb4a85cf65bff8e8047a40e2d6522e0;p=chise%2Fomega.git add $allow_unify option. --- diff --git a/inCHISE b/inCHISE index 73ad012..2aeb780 100755 --- a/inCHISE +++ b/inCHISE @@ -4,12 +4,13 @@ use strict; use vars qw($omegadb_path - $opt_protrude %opt_order + $opt_protrude $opt_allow_unify + %opt_order %order %order_map $opt_in_cs $opt_out_cs $opt_help $usage $in_cs $out_cs $i @chars - %order %order_map $char $char_id $out_char + $char_id_unified @char_id_unified $ids $ids_argc %ids $idsdb $idsdata_file $ids_start $font_start @CDP @HZK @GT @@ -31,6 +32,7 @@ $opt_order{'UniCNS'}='c'; $opt_order{'UniJIS'}='j'; $opt_order{'UniKS'}='k'; +$opt_allow_unify=0; # 1=true, 0=false. $opt_protrude=0;# 1=true, 0=false. ### End ### @@ -100,7 +102,7 @@ $ids_start=0x00; $font_start=0; if(-e $idsdata_file){ - open(IDSDATA,"+<$idsdata_file") or die; + open(IDSDATA,"+<:utf8",$idsdata_file) or die; flock(IDSDATA,LOCK_EX); seek(IDSDATA,0,0); while(){ @@ -109,7 +111,7 @@ if(-e $idsdata_file){ seek(IDSDATA,0,0); # require $idsdata_file; }else{ - open(IDSDATA,">$idsdata_file") or die; + open(IDSDATA,">:utf8",$idsdata_file) or die; flock(IDSDATA,LOCK_EX); seek(IDSDATA,0,0); } @@ -139,7 +141,7 @@ $ids=""; foreach $out_cs ('UniGB','UniCNS','UniJIS','UniKS','UniMulti'){ if(defined($opt_order{$out_cs})){ - if($opt_order{$out_cs}=~/^[cgjkGHC]*$/){ + if($opt_order{$out_cs}=~/^[cgjkGHC]+$/){ @{$order{$out_cs}}=map {$order_map{$_}} (split(//,$opt_order{$out_cs})); }else{ @@ -157,13 +159,14 @@ while(<>){ s/(amp.+?;)/&de_tex_er($1)/ge; # s/(&.+?;)/&de_tex_er($1)/ge; @chars=split(//); + CHAR: for($i=0;$i<=$#chars;$i++){ $char=$chars[$i]; $char_id=unpack("U",$char); if($char_id<=0x20){ print $chars[$i]; - next; + next CHAR; }elsif($char_id>0x20 and $char_id<=0x02af){ # Basic Latin # Latin-1 Supplement @@ -171,11 +174,11 @@ while(<>){ # Latin Extended-B # IPA Extensions print &latin_parse(); - next; + next CHAR; }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){ # Ideographic Description Characters print &ids_parse(); - next; + next CHAR; }else{ if(($out_char=&get_output_char($char_id,$out_cs))){ print $out_char,&add_break($i); @@ -193,8 +196,20 @@ while(<>){ print "{\\fontencoding{OT1}\\fontfamily{" . sprintf("chise%03d",$ids{$char}[0]) . "}\\selectfont\\char$ids{$char}[1]}",&add_break($i); - next; + next CHAR; }else{ + if($opt_allow_unify){ + @char_id_unified=&get_char_id_unified($char_id); + if(@char_id_unified>0){ + foreach $char_id_unified (@char_id_unified){ + if(($out_char + =&get_output_char($char_id_unified,$out_cs))){ + print $out_char,&add_break($i); + next CHAR; + } + } + } + } if($ids=&get_ids($char)){ print &get_macro_for_ids($ids),&add_break($i); }else{ @@ -294,7 +309,7 @@ sub latin_parse{ while($i<=$#chars){ $char_id=unpack("U",$chars[$i]); if($char_id>0x20 and $char_id<=0x02af){ - $out_str.=pack("U",$char_id); + $out_str.=$chars[$i]; }else{ $i--; last; @@ -498,6 +513,29 @@ sub get_char_id{ } } +sub get_char_id_unified{ + my($char_id)=@_; + my($char,$chars,$ucs); + my(@char_id); + $char=pack("U",$char_id); + + if($chars=&get_char_attribute($char,'->ucs-unified')){ + utf8::decode($chars); + $chars=~s/^\((.*)\)$/$1/; + return map {unpack("U",$_)} (split(/\s*\?/,$chars)); + }elsif($ucs=&get_char_attribute(pack("U",$char_id),'=>ucs')){ + if($chars=&get_char_attribute(pack("U",$ucs),'->ucs-unified')){ + utf8::decode($chars); + $chars=~s/^\((.*)\)$/$1/; + @char_id=grep {$char_id!=$_} + map {unpack("U",$_)} + (split(/\s*\?/,$chars)); + push(@char_id,$ucs); + return @char_id; + } + } +} + sub get_macro_for_GT{ # argument: # return: TeX macro for GT fonts or undef.