From 2bc524534de15391ee48eb0ecd135ade042dffd0 Mon Sep 17 00:00:00 2001 From: imiyazaki Date: Wed, 19 Nov 2003 15:10:59 +0000 Subject: [PATCH] add $opt_allow_unification. simplify codes. --- chise2otf/chise2otf | 162 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 105 insertions(+), 57 deletions(-) diff --git a/chise2otf/chise2otf b/chise2otf/chise2otf index 65fef19..d31c13d 100755 --- a/chise2otf/chise2otf +++ b/chise2otf/chise2otf @@ -2,11 +2,13 @@ use strict; use vars qw($opt_in_cs $opt_order $opt_kage $opt_replace + $opt_use_kage_for_Ext_B $opt_allow_unification $opt_help $usage $in_cs $out_cs $i @chars @order $order %order @texmacro $char $char_id $out_char + $char_unified @chars_unified $ids $ids_argc %ids $idsdb $idsdata_file $ids_start $font_start $perl56 $perl58 @@ -21,6 +23,9 @@ require 5.008; my $omegadb_path="/usr/local/lib/chise/omega"; $omegadb_path=~s!/$!!; +# currently does not work, so... +$opt_use_kage_for_Ext_B=0; + my $makefonts="$omegadb_path/makefonts.pl"; my $exec_makefonts=0; my $geta=pack("S",8750|0x8080); @@ -29,13 +34,14 @@ my $geta=pack("S",8750|0x8080); "order=s"=>\$opt_order, "replace",\$opt_replace, "kage",\$opt_kage, + "unify",\$opt_allow_unification, "help",\$opt_help); $usage=<] [-o ] [-k] - input coding system: (default: ucs\@mcs) - ucs\@mcs, ucs\@cns, ucs\@gb, ucs\@jis, ucs\@ks - order of kanji: (default: j) +Usage: $0 [-i ] [-o ] [-kru] + -i: input coding system: (default: ucs\@mcs) + ucs\@mcs, ucs\@cns, ucs\@gb, ucs\@jis, ucs\@ks + -o: order of kanji: (default: j) c: CNS g: GB j: JIS @@ -43,8 +49,9 @@ Usage: $0 [-i ] [-o ] [-k] G: GT m: Multi, use \\UTFM of otf.sty You can also combine them, ex. jtcgkm - k: use Kage server. - r: replace r and l with dot below to those with circle below. + -k: use Kage server. + -r: replace r and l with dot below to those with circle below. + -u: allow unification. EOF %order=('c'=>'UniCNS', @@ -144,11 +151,12 @@ $ids=""; while(<>){ if($in_cs ne 'ucs@mcs'){ - s/(.)/pack("U",&get_char_id(unpack("U",$1),$in_cs))/ge; + s/(.)/&get_char_in_mcs($1,$in_cs)/ge; } s/((?:^|[^\\])(?:\\\\)*)(&.*?;)/&de_tex_er($1,$2)/ge; s/((?:^|[^\\])(?:\\\\)*)\\([$idc])/$1.'\UTFM{'.sprintf("%X",unpack("U",$2)).'}'/ge; @chars=split(//); + CHAR: for($i=0;$i<=$#chars;$i++){ $char=$chars[$i]; $char_id=unpack("U",$chars[$i]); @@ -156,38 +164,57 @@ while(<>){ if($char_id<=0x7f){ # Basic Latin print $char; - next; + next CHAR; }elsif(defined($texmacro[$char_id]) and $texmacro[$char_id]){ # already defined for # 0080..00FF; Latin-1 Supplement # 0100..017F; Latin Extended-A # 1E00..1EFF; Latin Extended Additional print $texmacro[$char_id]; - next; + next CHAR; }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){ # Ideographic Description Characters print &ids_parse(); - next; + next CHAR; }else{ - if(($out_char=&get_output_char($char_id))){ + if(($out_char=&get_output_char($char))){ print $out_char; - }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){ - # CJK Unified Ideographs Extension B - if(not defined($ids{$char}) and $ids{$char}[1]>=0){ - $ids{$char}[0]=$font_start; - $ids{$char}[1]=$ids_start; - $ids_start++; - if($ids_start>255){ - $ids_start=0; - $font_start++; + next CHAR; + }else{ + if($opt_allow_unification){ + @chars_unified=&get_chars_unified($char); + if(@chars_unified>0){ + foreach $char_unified (@chars_unified){ + if(($out_char=&get_output_char($char_unified))){ + print $out_char; + next CHAR; + } + } } } - print "{\\fontencoding{OT1}\\fontfamily{" . - sprintf("chise%03d",$ids{$char}[0]) . - "}\\selectfont\\char$ids{$char}[1]}"; - next; - }else{ - print &get_macro_for_ids(&get_ids($char)); + if($opt_use_kage_for_Ext_B + and $char_id >= 0x20000 and $char_id <=0x2a6df){ + # CJK Unified Ideographs Extension B + if(not defined($ids{$char}) and $ids{$char}[1]>=0){ + $ids{$char}[0]=$font_start; + $ids{$char}[1]=$ids_start; + $ids_start++; + if($ids_start>255){ + $ids_start=0; + $font_start++; + } + } + print "{\\fontencoding{OT1}\\fontfamily{" . + sprintf("chise%03d",$ids{$char}[0]) . + "}\\selectfont\\char$ids{$char}[1]}"; + next CHAR; + } + if($ids=&get_ids($char)){ + print &get_macro_for_ids($ids); + next CHAR; + }else{ + print $geta; + } } } } @@ -229,7 +256,6 @@ sub de_tex_er{ $value=hex($value); } ($output_char)=&get_chars_matching($er_alias{$atr},$value); -# utf8::decode($output_char); } if($output_char){ return $before_er.$output_char; @@ -255,8 +281,8 @@ sub ids_parse{ ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$chars[$i]); if($ids_argc==0){ - if(($char_id=&get_char_id_for_ids($ids)) - and($out_char=&get_output_char($char_id))){ + if(($char=&get_char_for_ids($ids)) + and($out_char=&get_output_char($char))){ return $out_char; }else{ return &get_macro_for_ids($ids); @@ -277,8 +303,6 @@ sub ids_rest{ $ids_argc--; } $ids.=$char; -# $ids.=$char if($perl56); -# $ids.=encode('utf8',$char) if($perl58); return ($ids,$ids_argc); } @@ -313,15 +337,17 @@ sub normalize_ids{ $out_cs=~s/Uni(.+)/'ucs@'.lc($1)/e; my $output_ids=""; - my($char,$char_id,$output_char_id); + my($char,$output_char_id); while($ids=~m/(.)/g){ $char=$1; - $char_id=unpack("U",$char); if($char=~/[$idc]/){ $output_ids.=$char; - }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")){ - $output_ids.=pack("U",$output_char_id); - }elsif($output_char_id=&get_char_attribute($char,"=ucs")){ + }elsif($output_char_id=&get_char_attribute($char,"=$out_cs") + or $output_char_id=&get_char_attribute($char,"=ucs") + or $output_char_id=&get_char_attribute($char,"=>$out_cs") + or $output_char_id=&get_char_attribute($char,"=>ucs") + or $output_char_id=&get_char_attribute($char,"=>ucs*") + ){ $output_ids.=pack("U",$output_char_id); }else{ return $geta; @@ -331,13 +357,11 @@ sub normalize_ids{ } sub get_output_char{ - # argument: + # argument: # return: character in EUC-JP or TeX macro for pTeX. - my($char_id)=@_; - my($char,$out_char,$out_char_id,$gt); + my($char)=@_; + my($out_char,$out_char_id,$gt); - $char=pack('U',$char_id); - if($out_char=&get_char_attribute($char,'=jis-x0208') or $out_char=&get_char_attribute($char,'=jis-x0208-1983') or $out_char=&get_char_attribute($char,'=jis-x0208-1990') @@ -351,6 +375,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=ucs') or $out_char_id=&get_char_attribute($char,'=>ucs@jis') or $out_char_id=&get_char_attribute($char,'=>ucs') + or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ return "\\UTF{".sprintf("%X",$out_char_id)."}"; } @@ -360,6 +385,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=ucs') or $out_char_id=&get_char_attribute($char,'=>ucs@gb') or $out_char_id=&get_char_attribute($char,'=>ucs') + or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ return "\\UTFC{".sprintf("%X",$out_char_id)."}"; } @@ -369,6 +395,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=ucs') or $out_char_id=&get_char_attribute($char,'=>ucs@cns') or $out_char_id=&get_char_attribute($char,'=>ucs') + or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ return "\\UTFT{".sprintf("%X",$out_char_id)."}"; } @@ -378,11 +405,12 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=ucs') or $out_char_id=&get_char_attribute($char,'=>ucs@ks') or $out_char_id=&get_char_attribute($char,'=>ucs') + or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ return "\\UTFK{".sprintf("%X",$out_char_id)."}"; } }elsif($out_cs eq 'GT'){ - return $gt if($gt=&get_macro_for_GT($char_id)); + return $gt if($gt=&get_macro_for_GT($char)); }elsif($out_cs eq 'Multi'){ if($out_char_id=&get_char_attribute($char,'=ucs')){ return "\\UTFM{".sprintf("%X",$out_char_id)."}"; @@ -403,38 +431,58 @@ sub get_ids{ return $ids; } -sub get_char_id_for_ids{ +sub get_char_for_ids{ # argument: - # return: char-id + # return: char or undef my($ids)=@_; my($output_char); if(($output_char)=&get_chars_matching("ids",$ids)){ - return unpack("U",$output_char); + return $output_char; }else{ return undef; } } -sub get_char_id{ - # argument: , - # return: char-id. - my($char_id,$in_cs)=@_; +sub get_char_in_mcs{ + # argument: , + # return: char in ucs@mcs. + my($char,$in_cs)=@_; my($output_char); - if(($output_char)=&get_chars_matching("=$in_cs",$char_id)){ - return unpack("U",$output_char); + if(($output_char)=&get_chars_matching("=$in_cs",unpack("U",$char))){ + return $output_char; }else{ - return $char_id; + return $char; + } +} + +sub get_chars_unified{ + my($char)=@_; + my($chars,$ucs,$char_ucs); + my(@chars); + + if($chars=&get_char_attribute($char,'->ucs-unified')){ + $chars=~s/^\((.*)\)$/$1/; + return (split(/\s*\?/,$chars)); + }elsif($ucs=&get_char_attribute($char,'=>ucs*') + or $ucs=&get_char_attribute($char,'=>ucs')){ + $char_ucs=pack("U",$ucs); + if($chars=&get_char_attribute($char_ucs,'->ucs-unified')){ + $chars=~s/^\((.*)\)$/$1/; + @chars=grep {not /^$char$/} + (split(/\s*\?/,$chars)); + push(@chars,$char_ucs); + return @chars; + } } } sub get_macro_for_GT{ - # argument: - # return: TeX macro for GT fonts. - my($char_id)=@_; - my($char,$gt,$GT); - $char=pack("U",$char_id); + # argument: + # return: TeX macro for GT fonts or undef. + my($char)=@_; + my($gt,$GT); foreach (@GT){ if($gt=&get_char_attribute($char,$_)){ m/gt\-pj\-(\d+)/ and $GT=$1; -- 1.7.10.4