X-Git-Url: http://git.chise.org/gitweb/?p=chise%2Fomega.git;a=blobdiff_plain;f=inCHISE;h=b06ef5e0f7fe367c065092a42611d998e60a3c97;hp=6bd683b08b7da233abb8f2f988ad504f361ea0b0;hb=27b33499c7ebe0a578cfab3ca7e5f82bab1e0242;hpb=ebfefbb60f308280e3c6b8e8aa0af7861f2c3746 diff --git a/inCHISE b/inCHISE index 6bd683b..b06ef5e 100755 --- a/inCHISE +++ b/inCHISE @@ -8,6 +8,7 @@ use vars qw($omegadb_path $opt_use_kage_for_Ext_B %opt_order %order %order_map $opt_in_cs $opt_out_cs + $opt_preserve_spaces $opt_help $usage $in_cs $out_cs $i @chars $char $char_id $out_char @@ -24,8 +25,8 @@ require 5.008; ### Options ### -#$opt_order{'UniMulti'}='jcgk'; -$opt_order{'UniMulti'}='jGcgkHC'; +$opt_order{'UniMulti'}='jGcgk'; +#$opt_order{'UniMulti'}='jGcgkHC'; $opt_order{'UniCNS'}='c'; $opt_order{'UniGB'}='g'; $opt_order{'UniJIS'}='j'; @@ -39,8 +40,7 @@ $opt_use_kage_for_Ext_B=0;# 1=true, 0=false. ### End ### -my $strictly_forbidden_after = '「【『[(〈“‘‘(〔{《{\[\(\x{3016}{「'; -# \x{3016} | # white 【 +my $strictly_forbidden_after = '「【『[(〈‘‛“‟(〔{《{\[\(〖{「'; my $forbidden_after = "\x{0000}"; @@ -51,8 +51,7 @@ my $slightly_forbidden_after = '¥¥$$〒♯##¢¢££@@§'; # All these characters are allowed to protrude # in the right margin my $strictly_forbidden_before= - '!,.:;?、。!,.:;?。\)#}’”〉》」』】〕\x{3017})]}}」\]'; -### \x{3017} | # white 】 + '!,.:;?、。!,.:;?。\)\]}’”〉》」』】〕〗)]}」'; my $forbidden_before = 'ー々ぁぃぅぇぉゃゅょっゎァィゥェォャュョッヮヵヶ'; @@ -60,8 +59,8 @@ my $forbidden_before my $slightly_forbidden_before = '\x{000a}\#\-‐−‰′″℃゛゜ゝゞヽヾ"%-゙゚'; -#my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-\x{FFFFFF}'; # need to fix. -my $asian = '\x{2E80}-\x{312f}\x{3190}-\x{ABFF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-\x{FFFFFF}'; # need to fix. +#my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-\x{FFFFFF}'; # need to be fixed. +my $asian = '\x{2E80}-\x{312f}\x{3190}-\x{ABFF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-\x{FFFFFF}'; # need to be fixed. my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}'; @@ -95,7 +94,12 @@ if($opt_in_cs or $opt_out_cs){ $in_cs=$opt_in_cs; $out_cs=$opt_out_cs; }elsif(@ARGV==0){ - ($in_cs,$out_cs)=($0=~/(Utf8.+)To(\w+)/); + ($in_cs + ,$out_cs + ,$opt_preserve_spaces) + =($0=~/(Utf8mcs|Utf8cns|Utf8gb|Utf8jis|Utf8ks|Utf8big5) + To (UniCNS|UniGB|UniJIS|UniKS|UniMulti) + (Sp)?/ox); } # $in_cs: @@ -114,7 +118,7 @@ if($opt_help $omegadb_path=~s!/$!!; -$idsdata_file="$omegadb_path/idsdata.pl"; +$idsdata_file="$omegadb_path/idsdata.txt"; $ids_start=0x00; $font_start=0; @@ -123,10 +127,17 @@ if(-e $idsdata_file){ flock(IDSDATA,LOCK_EX); seek(IDSDATA,0,0); while(){ - eval $_; + utf8::decode($_); + if(m/^START\t(\d+)\t(\d+)/){ + $font_start=$1,$ids_start=$2; + }elsif(m/^(.*?)\t(\d+)\t(\d+)/){ + $ids{$1}=[$2,$3,]; + }else{ + die "Irregular IDS file: $idsdata_file.\n"; + } } seek(IDSDATA,0,0); -# require $idsdata_file; + truncate(IDSDATA,0); }else{ open(IDSDATA,">:utf8",$idsdata_file) or die; flock(IDSDATA,LOCK_EX); @@ -171,16 +182,20 @@ foreach $out_cs ('UniCNS','UniGB','UniJIS','UniKS','UniMulti'){ } } +$/=""; + while(<>){ utf8::decode($_); + print '{\relax{}'; + if($in_cs ne 'ucs@mcs'){ s/(.)/&get_char_in_mcs($1,$in_cs)/ge; } s/(amp.+?;)/&de_tex_er($1)/ge; # s/(&.+?;)/&de_tex_er($1)/ge; - s/([$asian])\s+/$1/go unless($out_cs eq 'UniKS'); - s/\s+([$asian])/$1/go unless($out_cs eq 'UniKS'); + s/([$asian])[$space]+/$1/go unless($opt_preserve_spaces); + s/[$space]+([$asian])/$1/go unless($opt_preserve_spaces); s/([$asian])\s*([^$asian$space])/$1 $2/go; s/([^$asian$idc])\s*([$asian])/$1 $2/go; s/\-\-\-/pack("U",0x2014)/geo;# EM DASH @@ -192,10 +207,15 @@ while(<>){ s/\'\'/pack("U",0x201d)/geo;# RIGHT DOUBLE QUOTATION MARK s/\'/pack("U",0x2019)/geo;# RIGHT DOUBLE QUOTATION MARK @chars=split(//); + CHAR: for($i=0;$i<=$#chars;$i++){ $char=$chars[$i]; - $char_id=unpack("U",$char); + if($char_id=&get_char_attribute($char,'=ucs@unicode')){ + $char=pack("U",$char_id); + }else{ + $char_id=unpack("U",$char); + } if($char_id<=0x20){ # add quarter space between asian and non-asian. @@ -217,6 +237,8 @@ while(<>){ }else{ print ' '; } + }else{ + print ' '; } next CHAR; }elsif(($char_id>0x20 and $char_id<=0x021f) @@ -288,16 +310,13 @@ while(<>){ } } } + print '}'; } -print IDSDATA 'use utf8;',"\n"; +print IDSDATA 'START',"\t",$font_start,"\t",$ids_start,"\n"; foreach $ids (keys %ids){ - print IDSDATA '$ids{\'',$ids,'\'}=' - ,'[',join ",",@{$ids{$ids}},"];\n"; + print IDSDATA $ids,"\t",join("\t",@{$ids{$ids}}),"\n"; } -print IDSDATA '$font_start=',$font_start,";\n"; -print IDSDATA '$ids_start=',$ids_start,";\n"; -print IDSDATA "1;"; flock(IDSDATA,LOCK_UN); exit 0; @@ -470,7 +489,7 @@ sub get_macro_for_ids{ } return "{\\fontencoding{OT1}\\fontfamily{" .sprintf("chise%03d",$ids{$ids}[0]) - ."}\\selectfont\\char$ids{$ids}[1]}"; + ."}\\selectfont\\char$ids{$ids}[1]}\\relax{}"; } sub normalize_ids{ @@ -563,12 +582,12 @@ sub get_ids{ # return: ids my($char)=@_; my $ids=""; - $ids=&get_char_attribute($char,"ids-aggregated") - or $ids=&get_char_attribute($char,"ids"); -# $ids=&get_char_attribute($char,"ids-decomposed") +# $ids=&get_char_attribute($char,"ids-aggregated") # or $ids=&get_char_attribute($char,"ids"); -# or $ids=&get_char_attribute($char,"ideographic-structure"); -# $ids=~s/[? ()]//g; + $ids=&get_char_attribute($char,"ids-decomposed") + or $ids=&get_char_attribute($char,"ids") + or $ids=&get_char_attribute($char,"ideographic-structure"); + $ids=~s/[? ()]//g; return $ids; } @@ -591,7 +610,8 @@ sub get_char_in_mcs{ my($char,$in_cs)=@_; my($output_char); - return $char if($in_cs eq 'ucs@mcs'); + return $char if($in_cs eq 'ucs@mcs' or + $char le "~"); if(($output_char)=&get_chars_matching("=$in_cs",unpack("U",$char))){ return $output_char; @@ -602,14 +622,14 @@ sub get_char_in_mcs{ sub get_chars_unified{ my($char)=@_; - my($chars,$ucs,$char_ucs); + my($chars,$ucs,$char_ucs,$char_sub); my(@chars); if($chars=&get_char_attribute($char,'->ucs-unified')){ $chars=~s/^\((.*)\)$/$1/; return (split(/\s*\?/,$chars)); }elsif($ucs=&get_char_attribute($char,'=>ucs*') - or $ucs=&get_char_attribute($char,'=>ucs')){ + or $ucs=&get_char_attribute($char,'=>ucs')){ $char_ucs=pack("U",$ucs); if($chars=&get_char_attribute($char_ucs,'->ucs-unified')){ $chars=~s/^\((.*)\)$/$1/; @@ -618,6 +638,14 @@ sub get_chars_unified{ push(@chars,$char_ucs); return @chars; } + }elsif($char_sub=&get_char_attribute($char,'<-subsumptive')){ + $char_sub=~s/[? ()]//g; + $chars=&get_char_attribute($char_sub,'->subsumptive'); + @chars=grep {not /^$char$/} (split(/\s*\?/,$chars)); + push(@chars,$char_sub); + return @chars; + }else{ + return (); } }