X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=inCHISE;h=3fb24fa64bf1b4a8a154bf783ef1c00977a062a5;hb=850d741ce7ed32d75cec3753d0cfff8b47acf96b;hp=9f7a731f7ca075696055b492bb52bfbd7bdfc49f;hpb=f9120418868c224a29b458672f041e6d209d8762;p=chise%2Fomega.git diff --git a/inCHISE b/inCHISE index 9f7a731..3fb24fa 100755 --- a/inCHISE +++ b/inCHISE @@ -60,7 +60,7 @@ my $forbidden_before my $slightly_forbidden_before = '\x{000a}\#\-‐−‰′″℃゛゜ゝゞヽヾ"%-゙゚'; -my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-\x{FFFFFF}'; +my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-\x{FFFFFF}'; # need to fix. my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}'; @@ -170,11 +170,21 @@ foreach $out_cs ('UniCNS','UniGB','UniJIS','UniKS','UniMulti'){ while(<>){ utf8::decode($_); + if($in_cs ne 'ucs@mcs'){ s/(.)/&get_char_in_mcs($1,$in_cs)/ge; } s/(amp.+?;)/&de_tex_er($1)/ge; # s/(&.+?;)/&de_tex_er($1)/ge; + s/([$asian])\s+([$asian])/$1$2/g unless($out_cs eq 'UniKS'); + s/([$asian])\s*([^$asian])/$1 $2/g; + s/([^$asian$idc])\s*([$asian])/$1 $2/g; + s/\-\-\-/pack("U",0x2014)/geo;# EM DASH + s/\-\-/pack("U",0x2013)/geo;# EN DASH + s/\`\`/pack("U",0x201f)/geo;# DOUBLE HIGH-REVERSED-9 QUOTATION MARK + s/\`/pack("U",0x201b)/geo;# SINGLE HIGH-REVERSED-9 QUOTATION MARK + s/\'\'/pack("U",0x201d)/geo;# RIGHT DOUBLE QUOTATION MARK + s/\'/pack("U",0x2019)/geo;# RIGHT DOUBLE QUOTATION MARK @chars=split(//); CHAR: for($i=0;$i<=$#chars;$i++){ @@ -182,7 +192,25 @@ while(<>){ $char_id=unpack("U",$char); if($char_id<=0x20){ - print $chars[$i]; + # add quarter space between asian and non-asian. + if($i>0 and $i<$#chars){ + if(($chars[$i-1]=~/[$asian]/ + and $chars[$i+1]=~/[^$asian]/ + and $chars[$i+1]=~/[$strictly_forbidden_before$slightly_forbidden_before]/) + or($chars[$i-1]=~/[^$asian]/ + and $chars[$i+1]=~/[$asian]/ + and $chars[$i-1]=~/[$strictly_forbidden_after$slightly_forbidden_after]/) + ){ + print ''; + }elsif(($chars[$i-1]=~/[$asian]/ + and $chars[$i+1]=~/[^$asian]/) + or($chars[$i-1]=~/[^$asian]/ + and $chars[$i+1]=~/[$asian]/)){ + print '{\selectjisfont\hspace{.25ex}}'; + }else{ + print ' '; + } + } next CHAR; }elsif($char=~m/($tex_meta_re)/o){ print $tex_meta{$1}; @@ -205,6 +233,8 @@ while(<>){ # Cyrillic or($char_id>=0x0530 and $char_id<=0x058f) # Armenian + or($char_id>=0x2010 and $char_id<=0x2046) + # General Punctuation (partial) ){ print &latin_parse(); next CHAR; @@ -240,8 +270,8 @@ while(<>){ $font_start++; } } - print "{\\fontencoding{OT1}\\fontfamily{" . - sprintf("chise%03d",$ids{$char}[0]) . + print "{\\fontencoding{OT1}\\fontfamily{", + sprintf("chise%03d",$ids{$char}[0]) , "}\\selectfont\\char$ids{$char}[1]}",&add_break($i); next CHAR; } @@ -297,41 +327,46 @@ sub add_break{ if($i<($#chars-1)){ if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o) and($chars[$i+2]=~m/[$strictly_forbidden_before]/o)){ - return "\\CJKunbreakablekernone "; + return "\\CJKunbreakablekernone{}"; }elsif($opt_protrude){ if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o) and($chars[$i+2]=~m/[^$strictly_forbidden_before]/o)){ - return "\\CJKunbreakablekernone \\CJKprotrude "; + return "\\CJKunbreakablekernone \\CJKprotrude{}"; } } } if(($i<$#chars) + and($chars[$i+1]eq" ")){ + # preserve space. + return ""; + } + if(($i<$#chars) and($chars[$i+1]=~m/[$strictly_forbidden_before]/o)){ - return "\\CJKunbreakablekernone "; + return "\\CJKunbreakablekernone{}"; } if($chars[$i]=~m/[$strictly_forbidden_after]/o){ - return "\\CJKunbreakablekernone "; + return "\\CJKunbreakablekernone{}"; } if(($i<$#chars) and($chars[$i+1]=~m/[$forbidden_before]/o)){ - return "\\CJKunbreakablekerntwo "; + return "\\CJKunbreakablekerntwo{}"; } if($chars[$i]=~m/[$forbidden_after]/o){ - return "\\CJKunbreakablekerntwo "; + return "\\CJKunbreakablekerntwo{}"; } if(($i<$#chars) and($chars[$i+1]=~m/[$slightly_forbidden_before]/o)){ - return "\\CJKunbreakablekernthree "; + return "\\CJKunbreakablekernthree{}"; } if($chars[$i]=~m/[$slightly_forbidden_after]/o){ - return "\\CJKunbreakablekernthree "; + return "\\CJKunbreakablekernthree{}"; } if($chars[$i]=~m/[$asian]/o){ - return "\\CJKbreakablekern "; + return "\\CJKbreakablekern{}"; } if(($i<$#chars)and($chars[$i+1]=~m/[$asian]/o)){ - return "\\CJKbreakablekern "; + return "\\CJKbreakablekern{}"; } } @@ -339,8 +374,7 @@ sub latin_parse{ # arguments: none # return: string for output with TeX macro. my($char_id); - my $out_str=$chars[$i]; - $i++; + my $out_str=""; while($i<=$#chars){ $char_id=unpack("U",$chars[$i]); if(($char_id>0x20 and $char_id<=0x021f) @@ -356,13 +390,16 @@ sub latin_parse{ # Latin Extended Additional # 0x1e00 -> 0x0600, etc. $out_str.=pack("U",$char_id-0x1800); + }elsif($char_id>=0x2010 and $char_id<=0x2046){ + # General Punctuation (partial) + $out_str.=pack("U",$char_id-0x1000); }else{ $i--; last; } $i++; } - return '{\fontencoding{UT1}\fontfamily{omlgc}\selectfont '.$out_str.'}'; + return '{\fontencoding{OT1}\fontfamily{omlgc}\selectfont '.$out_str.'}';###UT1? } sub ids_parse{ @@ -472,7 +509,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=>ucs') or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ - return '{\selectjisfont\char'.$out_char_id.'}'; + return "{\\selectjisfont\\char$out_char_id}"; } }elsif($out_cs eq 'UniGB' and &get_char_attribute($char,"vnd-adobe-cid-unigb-ucs2-h")){ @@ -482,7 +519,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=>ucs') or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ - return '{\selectgbsfont\char'.$out_char_id.'}'; + return "{\\selectgbsfont\\char$out_char_id}"; } }elsif($out_cs eq 'UniCNS' and &get_char_attribute($char,"vnd-adobe-cid-unicns-ucs2-h")){ @@ -492,7 +529,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=>ucs') or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ - return '{\selectcnsfont\char'.$out_char_id.'}'; + return "{\\selectcnsfont\\char$out_char_id}"; } }elsif($out_cs eq 'UniKS' and &get_char_attribute($char,"vnd-adobe-cid-uniks-ucs2-h")){ @@ -502,7 +539,7 @@ sub get_output_char{ or $out_char_id=&get_char_attribute($char,'=>ucs') or $out_char_id=&get_char_attribute($char,'=>ucs*') ){ - return '{\selectksxfont\char'.$out_char_id.'}'; + return "{\\selectksxfont\\char$out_char_id}"; } }elsif($out_cs eq 'GT'){ return $gt if($gt=&get_macro_for_GT($char)); @@ -610,7 +647,9 @@ sub get_macro_for_HZK{ } } if($hzk){ - return "{\\fontencoding{OT1}\\fontfamily{".sprintf("hzk%02d",$HZK)."}\\selectfont\\char".$hzk."}"; + return "{\\fontencoding{OT1}\\fontfamily{" + .sprintf("hzk%02d",$HZK) + ."}\\selectfont\\char".$hzk."}"; }else{ return undef; } @@ -629,9 +668,7 @@ sub get_macro_for_CDP{ if($cdp){ $ucs=&get_char_attribute(&get_chars_matching("=big5-pua",$cdp),"=ucs"); if($ucs){ - return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char" - .$ucs. - "}"; + return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char$ucs}"; }else{ print STDERR "This should not happen.\n"; print STDERR "ucs code point of CDP: $cdp not found.\n";