X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=chise2otf%2Fchise2otf;h=4659a4e46bf2ea30324b4704b61eaa8dcac7af06;hb=455d2c1110e3091cbcb34e60771a66a111654643;hp=bf9418f5d4f5f9304d65ad2c9c7d5fd009a61f3e;hpb=a82c439ebc29aad941631860fdd786270458a8d5;p=chise%2Fomega.git diff --git a/chise2otf/chise2otf b/chise2otf/chise2otf index bf9418f..4659a4e 100755 --- a/chise2otf/chise2otf +++ b/chise2otf/chise2otf @@ -2,11 +2,14 @@ use strict; use vars qw($opt_in_cs $opt_order $opt_kage $opt_replace + $opt_latin + $opt_use_kage_for_Ext_B $opt_allow_unification $opt_help $usage $in_cs $out_cs $i @chars @order $order %order @texmacro $char $char_id $out_char + $char_unified @chars_unified $ids $ids_argc %ids $idsdb $idsdata_file $ids_start $font_start $perl56 $perl58 @@ -18,10 +21,10 @@ use utf8; use Chise_utils ':all'; require 5.008; -my $omegadb_path="/usr/local/lib/chise/omega"; -$omegadb_path=~s!/$!!; +# currently does not work, so... +$opt_use_kage_for_Ext_B=0; -my $makefonts="$omegadb_path/makefonts.pl"; +my $makefonts="/usr/local/share/texmf/omega/ocp/local/chise/makefonts.pl"; my $exec_makefonts=0; my $geta=pack("S",8750|0x8080); @@ -29,22 +32,26 @@ my $geta=pack("S",8750|0x8080); "order=s"=>\$opt_order, "replace",\$opt_replace, "kage",\$opt_kage, + "latin",\$opt_latin, + "unify",\$opt_allow_unification, "help",\$opt_help); $usage=<] [-o ] [-k] - input coding system: (default: ucs\@mcs) - ucs\@mcs, ucs\@cns, ucs\@gb, ucs\@jis, ucs\@ks - order of kanji: (default: j) +Usage: $0 [-i ] [-o ] [-klru] + -i: input coding system: (default: ucs\@mcs) + ucs\@mcs, ucs\@cns, ucs\@gb, ucs\@jis, ucs\@ks + -o: order of kanji: (default: j) c: CNS g: GB j: JIS k: KS G: GT m: Multi, use \\UTFM of otf.sty - You can also combine them, ex. jtcgkm - k: use Kage server. - r: replace r and l with dot below to those with circle below. + You can also combine them, ex. jGcgkm + -k: use Kage server. + -l: preserve latin characters also in ucs\@jis environment. + -r: replace r and l with dot below to those with circle below. + -u: allow unification. EOF %order=('c'=>'UniCNS', @@ -109,21 +116,26 @@ if($opt_replace){ $texmacro[0x1E5D]='{\ifmmode \ucirc{\bar{r}}\else \ucirc{\={r}}\fi}'; } -$idsdata_file="$omegadb_path/idsdata.pl"; +$idsdata_file="$omegadb_path/idsdata.txt"; $ids_start=0x00; $font_start=0; if(-e $idsdata_file){ - open(IDSDATA,"+<$idsdata_file") or die; + open(IDSDATA,"+<:utf8",$idsdata_file) or die; flock(IDSDATA,LOCK_EX); seek(IDSDATA,0,0); while(){ - eval $_; + utf8::decode($_); + if(m/^START\t(\d+)\t(\d+)/){ + $font_start=$1,$ids_start=$2; + }elsif(m/^(.*?)\t(\d+)\t(\d+)/){ + $ids{$1}=[$2,$3,]; + } } seek(IDSDATA,0,0); -# require $idsdata_file; + truncate(IDSDATA,0); }else{ - open(IDSDATA,">$idsdata_file") or die; + open(IDSDATA,">:utf8",$idsdata_file) or die; flock(IDSDATA,LOCK_EX); seek(IDSDATA,0,0); } @@ -144,11 +156,12 @@ $ids=""; while(<>){ if($in_cs ne 'ucs@mcs'){ - s/(.)/pack("U",&get_char_id(unpack("U",$1),$in_cs))/ge; + s/(.)/&get_char_in_mcs($1,$in_cs)/ge; } s/((?:^|[^\\])(?:\\\\)*)(&.*?;)/&de_tex_er($1,$2)/ge; s/((?:^|[^\\])(?:\\\\)*)\\([$idc])/$1.'\UTFM{'.sprintf("%X",unpack("U",$2)).'}'/ge; @chars=split(//); + CHAR: for($i=0;$i<=$#chars;$i++){ $char=$chars[$i]; $char_id=unpack("U",$chars[$i]); @@ -156,51 +169,66 @@ while(<>){ if($char_id<=0x7f){ # Basic Latin print $char; - next; + next CHAR; }elsif(defined($texmacro[$char_id]) and $texmacro[$char_id]){ # already defined for # 0080..00FF; Latin-1 Supplement # 0100..017F; Latin Extended-A # 1E00..1EFF; Latin Extended Additional print $texmacro[$char_id]; - next; + next CHAR; }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){ # Ideographic Description Characters print &ids_parse(); - next; + next CHAR; }else{ - if(($out_char=&get_output_char($char_id))){ + if(($out_char=&get_output_char($char))){ print $out_char; - }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){ - # CJK Unified Ideographs Extension B - if(not defined($ids{$char}) and $ids{$char}[1]>=0){ - $ids{$char}[0]=$font_start; - $ids{$char}[1]=$ids_start; - $ids_start++; - if($ids_start>255){ - $ids_start=0; - $font_start++; + next CHAR; + }else{ + if($opt_allow_unification){ + @chars_unified=&get_chars_unified($char); + if(@chars_unified>0){ + foreach $char_unified (@chars_unified){ + if(($out_char=&get_output_char($char_unified))){ + print $out_char; + next CHAR; + } + } } } - print "{\\fontencoding{OT1}\\fontfamily{" . - sprintf("chise%03d",$ids{$char}[0]) . - "}\\selectfont\\char$ids{$char}[1]}"; - next; - }else{ - print &get_macro_for_ids(&get_ids($char)); + if($opt_use_kage_for_Ext_B + and $char_id >= 0x20000 and $char_id <=0x2a6df){ + # CJK Unified Ideographs Extension B + if(not defined($ids{$char}) and $ids{$char}[1]>=0){ + $ids{$char}[0]=$font_start; + $ids{$char}[1]=$ids_start; + $ids_start++; + if($ids_start>255){ + $ids_start=0; + $font_start++; + } + } + print "{\\fontencoding{OT1}\\fontfamily{" . + sprintf("chise%03d",$ids{$char}[0]) . + "}\\selectfont\\char$ids{$char}[1]}"; + next CHAR; + } + if($ids=&get_ids($char)){ + print &get_macro_for_ids($ids); + next CHAR; + }else{ + print $geta; + } } } } } -print IDSDATA 'use utf8;',"\n"; +print IDSDATA 'START',"\t",$font_start,"\t",$ids_start,"\n"; foreach $ids (keys %ids){ - print IDSDATA '$ids{\'',$ids,'\'}=' - ,'[',join ",",@{$ids{$ids}},"];\n"; + print IDSDATA $ids,"\t",join("\t",@{$ids{$ids}}),"\n"; } -print IDSDATA '$font_start=',$font_start,";\n"; -print IDSDATA '$ids_start=',$ids_start,";\n"; -print IDSDATA "1;"; flock(IDSDATA,LOCK_UN); if($exec_makefonts){ @@ -229,7 +257,6 @@ sub de_tex_er{ $value=hex($value); } ($output_char)=&get_chars_matching($er_alias{$atr},$value); - utf8::decode($output_char); } if($output_char){ return $before_er.$output_char; @@ -255,8 +282,8 @@ sub ids_parse{ ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$chars[$i]); if($ids_argc==0){ - if(($char_id=&get_char_id_for_ids($ids)) - and($out_char=&get_output_char($char_id))){ + if(($char=&get_char_for_ids($ids)) + and($out_char=&get_output_char($char))){ return $out_char; }else{ return &get_macro_for_ids($ids); @@ -277,8 +304,6 @@ sub ids_rest{ $ids_argc--; } $ids.=$char; -# $ids.=$char if($perl56); -# $ids.=encode('utf8',$char) if($perl58); return ($ids,$ids_argc); } @@ -288,11 +313,11 @@ sub get_macro_for_ids{ # or GETA character if ids is invalid for KAGE. my($ids)=@_; # return $geta if(not $exec_makefonts); - $ids=&normalize_ids($ids,"UniJIS"); +# $ids=&normalize_ids($ids,"UniJIS"); return $geta if(($ids!~/[$idc]/) or($ids=~/[\x{10000}-]/)); #irregular for KAGE. - if(not defined($ids{$ids}) and $ids{$ids}[1]>=0){ + if(not defined($ids{$ids})){ $ids{$ids}[0]=$font_start; $ids{$ids}[1]=$ids_start; $ids_start++; @@ -313,15 +338,17 @@ sub normalize_ids{ $out_cs=~s/Uni(.+)/'ucs@'.lc($1)/e; my $output_ids=""; - my($char,$char_id,$output_char_id); + my($char,$output_char_id); while($ids=~m/(.)/g){ $char=$1; - $char_id=unpack("U",$char); if($char=~/[$idc]/){ $output_ids.=$char; - }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")){ - $output_ids.=pack("U",$output_char_id); - }elsif($output_char_id=&get_char_attribute($char,"=ucs")){ + }elsif($output_char_id=&get_char_attribute($char,"=$out_cs") + or $output_char_id=&get_char_attribute($char,"=ucs") + or $output_char_id=&get_char_attribute($char,"=>$out_cs") + or $output_char_id=&get_char_attribute($char,"=>ucs") + or $output_char_id=&get_char_attribute($char,"=>ucs*") + ){ $output_ids.=pack("U",$output_char_id); }else{ return $geta; @@ -331,14 +358,15 @@ sub normalize_ids{ } sub get_output_char{ - # argument: + # argument: # return: character in EUC-JP or TeX macro for pTeX. - my($char_id)=@_; - my($char,$out_char,$out_char_id,$gt); + my($char)=@_; + my($out_char,$out_char_id,$gt); - $char=pack('U',$char_id); - - if($out_char=&get_char_attribute($char,'=jis-x0208')){ + if($out_char=&get_char_attribute($char,'=jis-x0208') + or $out_char=&get_char_attribute($char,'=jis-x0208-1983') + or $out_char=&get_char_attribute($char,'=jis-x0208-1990') + ){ return pack("S",$out_char|0x8080); }else{ foreach $out_cs (@order){ @@ -348,6 +376,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=ucs') or $out_char_id=&get_char_attribute($char,'=>ucs@jis') or $out_char_id=&get_char_attribute($char,'=>ucs') + or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ return "\\UTF{".sprintf("%X",$out_char_id)."}"; } @@ -357,6 +386,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=ucs') or $out_char_id=&get_char_attribute($char,'=>ucs@gb') or $out_char_id=&get_char_attribute($char,'=>ucs') + or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ return "\\UTFC{".sprintf("%X",$out_char_id)."}"; } @@ -366,6 +396,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=ucs') or $out_char_id=&get_char_attribute($char,'=>ucs@cns') or $out_char_id=&get_char_attribute($char,'=>ucs') + or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ return "\\UTFT{".sprintf("%X",$out_char_id)."}"; } @@ -375,11 +406,12 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=ucs') or $out_char_id=&get_char_attribute($char,'=>ucs@ks') or $out_char_id=&get_char_attribute($char,'=>ucs') + or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ return "\\UTFK{".sprintf("%X",$out_char_id)."}"; } }elsif($out_cs eq 'GT'){ - return $gt if($gt=&get_macro_for_GT($char_id)); + return $gt if($gt=&get_macro_for_GT($char)); }elsif($out_cs eq 'Multi'){ if($out_char_id=&get_char_attribute($char,'=ucs')){ return "\\UTFM{".sprintf("%X",$out_char_id)."}"; @@ -400,38 +432,62 @@ sub get_ids{ return $ids; } -sub get_char_id_for_ids{ +sub get_char_for_ids{ # argument: - # return: char-id + # return: char or undef my($ids)=@_; my($output_char); if(($output_char)=&get_chars_matching("ids",$ids)){ - return unpack("U",$output_char); + return $output_char; }else{ return undef; } } -sub get_char_id{ - # argument: , - # return: char-id. - my($char_id,$in_cs)=@_; +sub get_char_in_mcs{ + # argument: , + # return: char in ucs@mcs. + my($char,$in_cs)=@_; my($output_char); + my $char_id=unpack("U",$char); + if($opt_latin and $texmacro[$char_id]){ + return $char; + } if(($output_char)=&get_chars_matching("=$in_cs",$char_id)){ - return unpack("U",$output_char); + return $output_char; }else{ - return $char_id; + return $char; + } +} + +sub get_chars_unified{ + my($char)=@_; + my($chars,$ucs,$char_ucs); + my(@chars); + + if($chars=&get_char_attribute($char,'->ucs-unified')){ + $chars=~s/^\((.*)\)$/$1/; + return (split(/\s*\?/,$chars)); + }elsif($ucs=&get_char_attribute($char,'=>ucs*') + or $ucs=&get_char_attribute($char,'=>ucs')){ + $char_ucs=pack("U",$ucs); + if($chars=&get_char_attribute($char_ucs,'->ucs-unified')){ + $chars=~s/^\((.*)\)$/$1/; + @chars=grep {not /^$char$/} + (split(/\s*\?/,$chars)); + push(@chars,$char_ucs); + return @chars; + } } } sub get_macro_for_GT{ - # argument: - # return: TeX macro for GT fonts. - my($char_id)=@_; - my($char,$gt,$GT); - $char=pack("U",$char_id); + # argument: + # return: TeX macro for GT fonts or undef. + my($char)=@_; + my($gt,$GT); foreach (@GT){ if($gt=&get_char_attribute($char,$_)){ m/gt\-pj\-(\d+)/ and $GT=$1;