X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=inCHISE;h=08cc1bd45e1a05b6ec74e230d4c152d302707c56;hb=9cee15632b37dc119b0de0e8e90f69e482f6c7f9;hp=b202d60db2bc44b20f6f74ac133ae14516214f61;hpb=3650bf9ce485979d8071cabbfcb6f0ec88ced6e4;p=chise%2Fomega.git diff --git a/inCHISE b/inCHISE index b202d60..08cc1bd 100755 --- a/inCHISE +++ b/inCHISE @@ -8,6 +8,7 @@ use vars qw($omegadb_path $opt_use_kage_for_Ext_B %opt_order %order %order_map $opt_in_cs $opt_out_cs + $opt_preserve_spaces $opt_help $usage $in_cs $out_cs $i @chars $char $char_id $out_char @@ -60,10 +61,13 @@ my $forbidden_before my $slightly_forbidden_before = '\x{000a}\#\-‐−‰′″℃゛゜ゝゞヽヾ"%-゙゚'; -my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-\x{FFFFFF}'; # need to fix. +#my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-\x{FFFFFF}'; # need to fix. +my $asian = '\x{2E80}-\x{312f}\x{3190}-\x{ABFF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-\x{FFFFFF}'; # need to be fixed. my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}'; +my $hangul='\x{1100}-\x{11ff}\x{3130}-\x{318f}\x{AC00}-\x{D7AF}'; + my %tex_meta=('#'=>'\#', '$'=>'\\textdollar{}', '%'=>'\%', @@ -92,7 +96,12 @@ if($opt_in_cs or $opt_out_cs){ $in_cs=$opt_in_cs; $out_cs=$opt_out_cs; }elsif(@ARGV==0){ - ($in_cs,$out_cs)=($0=~/(Utf8.+)To(\w+)/); + ($in_cs + ,$out_cs + ,$opt_preserve_spaces) + =($0=~/(Utf8mcs|Utf8cns|Utf8gb|Utf8jis|Utf8ks|Utf8big5) + To (UniCNS|UniGB|UniJIS|UniKS|UniMulti) + (Sp)?/ox); } # $in_cs: @@ -174,17 +183,20 @@ while(<>){ if($in_cs ne 'ucs@mcs'){ s/(.)/&get_char_in_mcs($1,$in_cs)/ge; } - s/([$asian])\s+([$asian])/$1$2/g unless($out_cs eq 'UniKS'); - s/([$asian])\s*([^$asian])/$1 $2/g; - s/([^$asian])\s*([$asian])/$1 $2/g; + s/(amp.+?;)/&de_tex_er($1)/ge; +# s/(&.+?;)/&de_tex_er($1)/ge; + s/([$asian])\s+/$1/go unless($opt_preserve_spaces); + s/\s+([$asian])/$1/go unless($opt_preserve_spaces); + s/([$asian])\s*([^$asian$space])/$1 $2/go; + s/([^$asian$idc])\s*([$asian])/$1 $2/go; s/\-\-\-/pack("U",0x2014)/geo;# EM DASH s/\-\-/pack("U",0x2013)/geo;# EN DASH - s/\`\`/pack("U",0x201f)/geo;# DOUBLE HIGH-REVERSED-9 QUOTATION MARK - s/\`/pack("U",0x201b)/geo;# SINGLE HIGH-REVERSED-9 QUOTATION MARK +# s/\`\`/pack("U",0x201f)/geo;# DOUBLE HIGH-REVERSED-9 QUOTATION MARK + s/\`\`/pack("U",0x201c)/geo;# _TeX compatible_ +# s/\`/pack("U",0x201b)/geo;# SINGLE HIGH-REVERSED-9 QUOTATION MARK + s/\`/pack("U",0x2018)/geo;# _TeX compatible_ s/\'\'/pack("U",0x201d)/geo;# RIGHT DOUBLE QUOTATION MARK s/\'/pack("U",0x2019)/geo;# RIGHT DOUBLE QUOTATION MARK - s/(amp.+?;)/&de_tex_er($1)/ge; -# s/(&.+?;)/&de_tex_er($1)/ge; @chars=split(//); CHAR: for($i=0;$i<=$#chars;$i++){ @@ -196,25 +208,25 @@ while(<>){ if($i>0 and $i<$#chars){ if(($chars[$i-1]=~/[$asian]/ and $chars[$i+1]=~/[^$asian]/ - and $chars[$i+1]=~/[^$strictly_forbidden_after$slightly_forbidden_after]/) + and $chars[$i+1]=~/[$strictly_forbidden_before$slightly_forbidden_before]/) or($chars[$i-1]=~/[^$asian]/ and $chars[$i+1]=~/[$asian]/ - and $chars[$i-1]=~/[^$strictly_forbidden_before$slightly_forbidden_before]/) + and $chars[$i-1]=~/[$strictly_forbidden_after$slightly_forbidden_after]/) ){ print ''; }elsif(($chars[$i-1]=~/[$asian]/ - and $chars[$i+1]=~/[^$asian]/) - or($chars[$i-1]=~/[^$asian]/ - and $chars[$i+1]=~/[$asian]/)){ - print '{\selectjisfont\hspace{.25ex}}'; + and $chars[$i+1]=~/[^$asian]/)){ + print '\unskip\kern.25ex'; + }elsif($chars[$i-1]=~/[^$asian]/ + and $chars[$i+1]=~/[$asian]/){ + print '\kern.25ex'; }else{ print ' '; } + }else{ + print ' '; } next CHAR; - }elsif($char=~m/($tex_meta_re)/o){ - print $tex_meta{$1}; - next CHAR; }elsif(($char_id>0x20 and $char_id<=0x021f) # Basic Latin # Latin-1 Supplement @@ -270,8 +282,8 @@ while(<>){ $font_start++; } } - print "{\\fontencoding{OT1}\\fontfamily{" . - sprintf("chise%03d",$ids{$char}[0]) . + print "{\\fontencoding{OT1}\\fontfamily{", + sprintf("chise%03d",$ids{$char}[0]) , "}\\selectfont\\char$ids{$char}[1]}",&add_break($i); next CHAR; } @@ -327,11 +339,11 @@ sub add_break{ if($i<($#chars-1)){ if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o) and($chars[$i+2]=~m/[$strictly_forbidden_before]/o)){ - return "\\CJKunbreakablekernone "; + return "\\CJKunbreakablekernone{}"; }elsif($opt_protrude){ if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o) and($chars[$i+2]=~m/[^$strictly_forbidden_before]/o)){ - return "\\CJKunbreakablekernone \\CJKprotrude "; + return "\\CJKunbreakablekernone \\CJKprotrude{}"; } } } @@ -342,31 +354,31 @@ sub add_break{ } if(($i<$#chars) and($chars[$i+1]=~m/[$strictly_forbidden_before]/o)){ - return "\\CJKunbreakablekernone "; + return "\\CJKunbreakablekernone{}"; } if($chars[$i]=~m/[$strictly_forbidden_after]/o){ - return "\\CJKunbreakablekernone "; + return "\\CJKunbreakablekernone{}"; } if(($i<$#chars) and($chars[$i+1]=~m/[$forbidden_before]/o)){ - return "\\CJKunbreakablekerntwo "; + return "\\CJKunbreakablekerntwo{}"; } if($chars[$i]=~m/[$forbidden_after]/o){ - return "\\CJKunbreakablekerntwo "; + return "\\CJKunbreakablekerntwo{}"; } if(($i<$#chars) and($chars[$i+1]=~m/[$slightly_forbidden_before]/o)){ - return "\\CJKunbreakablekernthree "; + return "\\CJKunbreakablekernthree{}"; } if($chars[$i]=~m/[$slightly_forbidden_after]/o){ - return "\\CJKunbreakablekernthree "; + return "\\CJKunbreakablekernthree{}"; } if($chars[$i]=~m/[$asian]/o){ - return "\\CJKbreakablekern "; + return "\\CJKbreakablekern{}"; } if(($i<$#chars)and($chars[$i+1]=~m/[$asian]/o)){ - return "\\CJKbreakablekern "; + return "\\CJKbreakablekern{}"; } } @@ -377,7 +389,9 @@ sub latin_parse{ my $out_str=""; while($i<=$#chars){ $char_id=unpack("U",$chars[$i]); - if(($char_id>0x20 and $char_id<=0x021f) + if($chars[$i]=~m/($tex_meta_re)/o){ + $out_str.=$tex_meta{$chars[$i]}; + }elsif(($char_id>0x20 and $char_id<=0x021f) or($char_id>=0x0250 and $char_id<=0x02af)# IPA Extensions or($char_id>=0x0300 and $char_id<=0x033f)# Combining Diacritical Marks or($char_id>=0x0360 and $char_id<=0x036f) @@ -509,7 +523,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=>ucs') or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ - return '{\selectjisfont\char'.$out_char_id.'}'; + return "{\\selectjisfont\\char$out_char_id}"; } }elsif($out_cs eq 'UniGB' and &get_char_attribute($char,"vnd-adobe-cid-unigb-ucs2-h")){ @@ -519,7 +533,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=>ucs') or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ - return '{\selectgbsfont\char'.$out_char_id.'}'; + return "{\\selectgbsfont\\char$out_char_id}"; } }elsif($out_cs eq 'UniCNS' and &get_char_attribute($char,"vnd-adobe-cid-unicns-ucs2-h")){ @@ -529,7 +543,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=>ucs') or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ - return '{\selectcnsfont\char'.$out_char_id.'}'; + return "{\\selectcnsfont\\char$out_char_id}"; } }elsif($out_cs eq 'UniKS' and &get_char_attribute($char,"vnd-adobe-cid-uniks-ucs2-h")){ @@ -539,7 +553,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=>ucs') or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ - return '{\selectksxfont\char'.$out_char_id.'}'; + return "{\\selectksxfont\\char$out_char_id}"; } }elsif($out_cs eq 'GT'){ return $gt if($gt=&get_macro_for_GT($char)); @@ -647,7 +661,9 @@ sub get_macro_for_HZK{ } } if($hzk){ - return "{\\fontencoding{OT1}\\fontfamily{".sprintf("hzk%02d",$HZK)."}\\selectfont\\char".$hzk."}"; + return "{\\fontencoding{OT1}\\fontfamily{" + .sprintf("hzk%02d",$HZK) + ."}\\selectfont\\char".$hzk."}"; }else{ return undef; } @@ -666,9 +682,7 @@ sub get_macro_for_CDP{ if($cdp){ $ucs=&get_char_attribute(&get_chars_matching("=big5-pua",$cdp),"=ucs"); if($ucs){ - return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char" - .$ucs. - "}"; + return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char$ucs}"; }else{ print STDERR "This should not happen.\n"; print STDERR "ucs code point of CDP: $cdp not found.\n";