inCHISE

   1 #!/usr/bin/perl -w -CSD
   2
   3 # ver.0.2
   4
   5 use strict;
   6 use vars qw($omegadb_path
   7             $opt_protrude $opt_allow_unify
   8             %opt_order %order %order_map
   9             $opt_in_cs $opt_out_cs
  10             $opt_help $usage
  11             $in_cs $out_cs $i @chars
  12             $char $char_id $out_char
  13             $char_unified @chars_unified
  14             $ids $ids_argc %ids $idsdb $geta
  15             $idsdata_file $ids_start $font_start
  16             @CDP @HZK @GT
  17             );
  18 use Getopt::Long;
  19 use utf8;
  20 use Fcntl ':flock';
  21 use Chise_utils ':all';
  22 require 5.008;
  23
  24 my $omegadb_path="/usr/local/lib/chise/omega";
  25
  26 ### Options ###
  27
  28 #$opt_order{'UniMulti'}='jcgk';
  29 $opt_order{'UniMulti'}='jGcgkHC';
  30 $opt_order{'UniCNS'}='c';
  31 $opt_order{'UniGB'}='g';
  32 $opt_order{'UniJIS'}='j';
  33 $opt_order{'UniKS'}='k';
  34
  35 $opt_allow_unify=1; # 1=true, 0=false.
  36 $opt_protrude=0;# 1=true, 0=false.
  37
  38 ### End ###
  39
  40 my $strictly_forbidden_after = '「【『［（〈“‘‘（〔｛《{\[\(\x{3016}｛｢';
  41 #       \x{3016} | # white 【
  42
  43 my $forbidden_after = "\x{0000}";
  44
  45 # ￥¥＄$〒♯＃#￠¢￡£＠@§
  46 my $slightly_forbidden_after = '￥¥＄$〒♯＃#￠¢￡£＠@§';
  47
  48 # $strictly_forbidden_before
  49 # All these characters are allowed to protrude
  50 # in the right margin
  51 my $strictly_forbidden_before=
  52     '!,.:;?、。！，．：；？。\)#}’”〉》」』】〕\x{3017}）］｝｝」\]';
  53 ###       \x{3017} | # white 】
  54
  55 my $forbidden_before
  56     = 'ー々ぁぃぅぇぉゃゅょっゎァィゥェォャュョッヮヵヶ';
  57
  58 my $slightly_forbidden_before
  59     = '\x{000a}\#\-‐−‰′″℃゛゜ゝゞヽヾ＂％－ﾞﾟ';
  60
  61 my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-\x{FFFFFF}';
  62
  63 my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}';
  64
  65 &GetOptions("in=s"=>\$opt_in_cs,
  66             "out=s"=>\$opt_out_cs,
  67             "help",\$opt_help);
  68
  69 $usage=<<EOF;
  70 Usage: $0 -i <input coding system> -o <cmap encoding>
  71     input coding system:
  72       Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
  73     cmap encoding:
  74       UniCNS, UniGB, UniJIS, UniKS, UniMulti
  75 EOF
  76
  77 if($opt_in_cs or $opt_out_cs){
  78     $in_cs=$opt_in_cs;
  79     $out_cs=$opt_out_cs;
  80 }elsif(@ARGV==0){
  81     ($in_cs,$out_cs)=($0=~/(Utf8.+)To(\w+)/);
  82 }
  83
  84 # $in_cs:
  85 #   Utf8mcs,Utf8cns,Utf8gb,Utf8jis,Utf8ks,
  86 # $out_cs:
  87 #   UniCNS,UniGB,UniJIS,UniKS,UniMulti
  88
  89 $in_cs=~s/Utf8/ucs\@/;
  90
  91 if($opt_help
  92    or not defined($in_cs)
  93    or not defined($out_cs)){
  94     print $usage;
  95     exit 1;
  96 }
  97
  98 $omegadb_path=~s!/$!!;
  99
 100 $idsdata_file="$omegadb_path/idsdata.pl";
 101 $ids_start=0x00;
 102 $font_start=0;
 103
 104 if(-e $idsdata_file){
 105     open(IDSDATA,"+<:utf8",$idsdata_file) or die;
 106     flock(IDSDATA,LOCK_EX);
 107     seek(IDSDATA,0,0);
 108     while(<IDSDATA>){
 109         eval $_;
 110     }
 111     seek(IDSDATA,0,0);
 112 #         require $idsdata_file;
 113 }else{
 114     open(IDSDATA,">:utf8",$idsdata_file) or die;
 115     flock(IDSDATA,LOCK_EX);
 116     seek(IDSDATA,0,0);
 117 }
 118
 119 $ids_argc=0;
 120 $ids="";
 121
 122 $geta=pack("U",0x3013);
 123 #$geta=pack("U",0xfffd);
 124
 125 @GT=(#"=gt","=gt-k",
 126      "=gt-pj-1","=gt-pj-2","=gt-pj-3","=gt-pj-4","=gt-pj-5",
 127      "=gt-pj-6","=gt-pj-7","=gt-pj-8","=gt-pj-9","=gt-pj-10",
 128      "=gt-pj-11"
 129      #,"=gt-pj-k1","=gt-pj-k2"
 130      );
 131 @HZK=("=hanziku-1","=hanziku-2","=hanziku-3","=hanziku-4",
 132       "=hanziku-5","=hanziku-6","=hanziku-7","=hanziku-8",
 133       "=hanziku-9","=hanziku-10","=hanziku-11","=hanziku-12");
 134 @CDP=("=big5-cdp");
 135
 136 %order_map=('c'=>'UniCNS',
 137             'g'=>'UniGB',
 138             'j'=>'UniJIS',
 139             'k'=>'UniKS',
 140             'G'=>'GT',
 141             'H'=>'HZK',
 142             'C'=>'CDP',
 143             );
 144
 145 foreach $out_cs ('UniCNS','UniGB','UniJIS','UniKS','UniMulti'){
 146     if(defined($opt_order{$out_cs})){
 147         if($opt_order{$out_cs}=~/^[cgjkGHC]+$/){
 148             @{$order{$out_cs}}=map {$order_map{$_}}
 149             (split(//,$opt_order{$out_cs}));
 150         }else{
 151             print STDERR "Invalid order for $out_cs!\n";
 152             exit 1;
 153         }
 154     }
 155 }
 156
 157 while(<>){
 158     utf8::decode($_);
 159     if($in_cs ne 'ucs@mcs'){
 160         s/(.)/&get_char_in_mcs($1,$in_cs)/ge;
 161     }
 162     s/(amp.+?;)/&de_tex_er($1)/ge;
 163 #    s/(&.+?;)/&de_tex_er($1)/ge;
 164     @chars=split(//);
 165   CHAR:
 166     for($i=0;$i<=$#chars;$i++){
 167         $char=$chars[$i];
 168         $char_id=unpack("U",$char);
 169
 170         if($char_id<=0x20){
 171             print $chars[$i];
 172             next CHAR;
 173         }elsif($char_id>0x20 and $char_id<=0x02af){
 174             # Basic Latin
 175             # Latin-1 Supplement
 176             # Latin Extended-A
 177             # Latin Extended-B
 178             # IPA Extensions
 179             print &latin_parse();
 180             next CHAR;
 181         }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
 182             # Ideographic Description Characters
 183             print &ids_parse();
 184             next CHAR;
 185         }else{
 186             if(($out_char=&get_output_char($char,$out_cs))){
 187                 print $out_char,&add_break($i);
 188             }else{
 189                 if($opt_allow_unify){
 190                     @chars_unified=&get_chars_unified($char);
 191                     if(@chars_unified>0){
 192                         foreach $char_unified (@chars_unified){
 193                             if(($out_char
 194                                 =&get_output_char($char_unified,$out_cs))){
 195                                 print $out_char,&add_break($i);
 196                                 next CHAR;
 197                             }
 198                         }
 199                     }
 200                 }
 201                 if($char_id >= 0x20000 && $char_id <=0x2a6df){
 202                     # CJK Unified Ideographs Extension B
 203                     if(not defined($ids{$char}) and $ids{$char}[1]>=0){
 204                         $ids{$char}[0]=$font_start;
 205                         $ids{$char}[1]=$ids_start;
 206                         $ids_start++;
 207                         if($ids_start>255){
 208                             $ids_start=0;
 209                             $font_start++;
 210                         }
 211                     }
 212                     print "{\\fontencoding{OT1}\\fontfamily{" .
 213                         sprintf("chise%03d",$ids{$char}[0]) .
 214                             "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
 215                     next CHAR;
 216                 }
 217                 if($ids=&get_ids($char)){
 218                     print &get_macro_for_ids($ids),&add_break($i);
 219                 }else{
 220                     print '\rule{1ex}{1ex}',&add_break($i);
 221                 }
 222             }
 223         }
 224     }
 225 }
 226
 227 print IDSDATA 'use utf8;',"\n";
 228 foreach $ids (keys %ids){
 229     print IDSDATA '$ids{\'',$ids,'\'}='
 230     ,'[',join ",",@{$ids{$ids}},"];\n";
 231 }
 232 print IDSDATA '$font_start=',$font_start,";\n";
 233 print IDSDATA '$ids_start=',$ids_start,";\n";
 234 print IDSDATA "1;";
 235 flock(IDSDATA,LOCK_UN);
 236
 237 exit 0;
 238
 239 sub de_tex_er{
 240     my($er)=@_;
 241     my($prefix,$suffix);
 242     my($output_char,$atr,$value);
 243     $er=~/^(amp)(.*)(;)$/
 244         and $prefix=$1,$er=$2,$suffix=$3;
 245     $prefix or $prefix="",$suffix or $suffix="";
 246     if($er=~/^U[\+|\-]([a-fA-F\d]+)/){
 247         $output_char=pack("U",hex($1));
 248     }elsif($er=~/^(?:I\-)?($er_prefix_re)\-?([0-9a-fA-F]+)$/){
 249         ($atr,$value)=($1,$2);
 250         unless($er_alias{$atr}=~/daikanwa|gt/){
 251             $value=hex($value);
 252         }
 253         ($output_char)=&get_chars_matching($er_alias{$atr},$value);
 254     }
 255     if($output_char){
 256         return $output_char;
 257     }else{
 258         return $prefix.$er.$suffix;
 259     }
 260 }
 261
 262 sub add_break{
 263     my($i)=@_;
 264
 265     if($i<($#chars-1)){
 266         if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
 267            and($chars[$i+2]=~m/[$strictly_forbidden_before]/o)){
 268             return "\\CJKunbreakablekernone ";
 269         }elsif($opt_protrude){
 270             if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
 271                and($chars[$i+2]=~m/[^$strictly_forbidden_before]/o)){
 272                 return "\\CJKunbreakablekernone \\CJKprotrude ";
 273             }
 274         }
 275     }
 276     if(($i<$#chars)
 277        and($chars[$i+1]=~m/[$strictly_forbidden_before]/o)){
 278         return "\\CJKunbreakablekernone ";
 279     }
 280     if($chars[$i]=~m/[$strictly_forbidden_after]/o){
 281         return "\\CJKunbreakablekernone ";
 282     }
 283     if(($i<$#chars)
 284        and($chars[$i+1]=~m/[$forbidden_before]/o)){
 285         return "\\CJKunbreakablekerntwo ";
 286
 287     }
 288     if($chars[$i]=~m/[$forbidden_after]/o){
 289         return "\\CJKunbreakablekerntwo ";
 290     }
 291     if(($i<$#chars)
 292        and($chars[$i+1]=~m/[$slightly_forbidden_before]/o)){
 293         return "\\CJKunbreakablekernthree ";
 294     }
 295     if($chars[$i]=~m/[$slightly_forbidden_after]/o){
 296         return "\\CJKunbreakablekernthree ";
 297     }
 298     if($chars[$i]=~m/[$asian]/o){
 299         return "\\CJKbreakablekern ";
 300     }
 301     if(($i<$#chars)and($chars[$i+1]=~m/[$asian]/o)){
 302         return "\\CJKbreakablekern ";
 303     }
 304 }
 305
 306 sub latin_parse{
 307     # arguments: none
 308     # return: string for output with TeX macro.
 309     my($char_id);
 310     my $out_str=$chars[$i];
 311     $i++;
 312     while($i<=$#chars){
 313         $char_id=unpack("U",$chars[$i]);
 314         if($char_id>0x20 and $char_id<=0x02af){
 315             $out_str.=$chars[$i];
 316         }else{
 317             $i--;
 318             last;
 319         }
 320         $i++;
 321     }
 322     return '{\normalfont {'.$out_str.'}}';
 323 }
 324
 325 sub ids_parse{
 326     # arguments: none
 327     # return: character for output,
 328     #          TeX macro for ids,
 329     #          or GETA character if ids is invalid.
 330     my($ids,$ids_argc)=&ids_rest("",0,$chars[$i]);
 331
 332     while($ids_argc>0){
 333         # We are in IDS.
 334         $i++;
 335         if($i>$#chars){
 336             print STDERR "IDS parse error: $ids\n";
 337             return $geta;
 338         }
 339
 340         ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$chars[$i]);
 341         if($ids_argc==0){
 342             if(($char=&get_char_for_ids($ids))
 343                and($out_char=&get_output_char($char,$out_cs))){
 344                 return $out_char;
 345             }else{
 346                 return &get_macro_for_ids($ids);
 347             }
 348         }
 349     }
 350 }
 351
 352 sub ids_rest{
 353     # arguments: <ids>, <rest number of arguments for ids>, <character>
 354     # return: ids and rest number of arguments for ids.
 355     my($ids,$ids_argc,$char)=@_;
 356     my($argc);
 357     $argc=&ids_argc($char);
 358     if($argc){
 359         $ids_argc+=$ids_argc==0?$argc:$argc-1;
 360     }else{
 361         $ids_argc--;
 362     }
 363     $ids.=$char;
 364     return ($ids,$ids_argc);
 365 }
 366
 367 sub get_macro_for_ids{
 368     # argument: <ids>
 369     # return: TeX macro for ids
 370     #          or GETA character if ids is invalid for KAGE.
 371     my($ids)=@_;
 372     $ids=&normalize_ids($ids,"UniJIS");
 373     return $geta if(($ids!~/[$idc]/)
 374                     or($ids=~/[\x{10000}-]/));
 375                     #irregular for KAGE.
 376     if(not defined($ids{$ids}) and $ids{$ids}[1]>=0){
 377         $ids{$ids}[0]=$font_start;
 378         $ids{$ids}[1]=$ids_start;
 379         $ids_start++;
 380     }
 381     if($ids_start>255){
 382         $ids_start=0;
 383         $font_start++;
 384     }
 385     return "{\\fontencoding{OT1}\\fontfamily{"
 386         .sprintf("chise%03d",$ids{$ids}[0])
 387         ."}\\selectfont\\char$ids{$ids}[1]}";
 388 }
 389
 390 sub normalize_ids{
 391     # argument: <ids>, <output coding system>
 392     # return: ids or GETA character if ids is invalid for KAGE.
 393     my($ids,$out_cs)=@_;
 394     $out_cs=~s/Uni(.+)/'ucs@'.lc($1)/e;
 395
 396     my $output_ids="";
 397     my($char,$output_char_id);
 398     while($ids=~m/(.)/g){
 399         $char=$1;
 400         if($char=~/[$idc]/){
 401             $output_ids.=$char;
 402         }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")
 403            or $output_char_id=&get_char_attribute($char,"=ucs")
 404            or $output_char_id=&get_char_attribute($char,"=>$out_cs")
 405            or $output_char_id=&get_char_attribute($char,"=>ucs")
 406            or $output_char_id=&get_char_attribute($char,"=>ucs*")
 407               ){
 408             $output_ids.=pack("U",$output_char_id);
 409         }else{
 410             return $geta;
 411         }
 412     }
 413     return $output_ids;
 414 }
 415
 416 sub get_output_char{
 417     # argument: <char>
 418     # return: character in output coding system or TeX macro or undef.
 419     my($char,$out_cs)=@_;
 420     my($out_char_id,$suffix);
 421     my($gt,$hzk,$cdp);
 422
 423     foreach $out_cs (@{$order{$out_cs}}){
 424         if($out_cs eq 'UniJIS'
 425            and &get_char_attribute($char,"vnd-adobe-cid-unijis-utf16-h")){
 426             if($out_char_id=&get_char_attribute($char,'=ucs@jis')
 427                or $out_char_id=&get_char_attribute($char,'=ucs')
 428                or $out_char_id=&get_char_attribute($char,'=>ucs@jis')
 429                or $out_char_id=&get_char_attribute($char,'=>ucs')
 430                or $out_char_id=&get_char_attribute($char,'=>ucs*')
 431                ){
 432                 return '{\selectjisfont\char'.$out_char_id.'}';
 433             }
 434         }elsif($out_cs eq 'UniGB'
 435                and &get_char_attribute($char,"vnd-adobe-cid-unigb-ucs2-h")){
 436             if($out_char_id=&get_char_attribute($char,'=ucs@gb')
 437                or $out_char_id=&get_char_attribute($char,'=ucs')
 438                or $out_char_id=&get_char_attribute($char,'=>ucs@gb')
 439                or $out_char_id=&get_char_attribute($char,'=>ucs')
 440                or $out_char_id=&get_char_attribute($char,'=>ucs*')
 441                ){
 442                 return '{\selectgbsfont\char'.$out_char_id.'}';
 443             }
 444         }elsif($out_cs eq 'UniCNS'
 445                and &get_char_attribute($char,"vnd-adobe-cid-unicns-ucs2-h")){
 446             if($out_char_id=&get_char_attribute($char,'=ucs@cns')
 447                or $out_char_id=&get_char_attribute($char,'=ucs')
 448                or $out_char_id=&get_char_attribute($char,'=>ucs@cns')
 449                or $out_char_id=&get_char_attribute($char,'=>ucs')
 450                or $out_char_id=&get_char_attribute($char,'=>ucs*')
 451                ){
 452                 return '{\selectcnsfont\char'.$out_char_id.'}';
 453             }
 454         }elsif($out_cs eq 'UniKS'
 455                and &get_char_attribute($char,"vnd-adobe-cid-uniks-ucs2-h")){
 456             if($out_char_id=&get_char_attribute($char,'=ucs@ks')
 457                or $out_char_id=&get_char_attribute($char,'=ucs')
 458                or $out_char_id=&get_char_attribute($char,'=>ucs@ks')
 459                or $out_char_id=&get_char_attribute($char,'=>ucs')
 460                or $out_char_id=&get_char_attribute($char,'=>ucs*')
 461                ){
 462                 return '{\selectksxfont\char'.$out_char_id.'}';
 463             }
 464         }elsif($out_cs eq 'GT'){
 465             return $gt if($gt=&get_macro_for_GT($char));
 466         }elsif($out_cs eq 'HZK'){
 467             return $hzk if($hzk=&get_macro_for_HZK($char));
 468         }elsif($out_cs eq 'CDP'){
 469             return $cdp if($cdp=&get_macro_for_CDP($char));
 470         }
 471     }
 472     return undef;
 473 }
 474
 475 sub get_ids{
 476     # argument: <character>
 477     # return: ids
 478     my($char)=@_;
 479     my $ids="";
 480     $ids=&get_char_attribute($char,"ids-aggregated")
 481         or $ids=&get_char_attribute($char,"ids");
 482 #         or $ids=&get_char_attribute($char,"ideographic-structure");
 483 #    $ids=~s/[? ()]//g;
 484     return $ids;
 485 }
 486
 487 sub get_char_for_ids{
 488     # argument: <ideographic description sequence>
 489     # return: char or undef.
 490     my($ids)=@_;
 491     my($output_char);
 492
 493     if(($output_char)=&get_chars_matching("ids",$ids)){
 494         return $output_char;
 495     }else{
 496         return undef;
 497     }
 498 }
 499
 500 sub get_char_in_mcs{
 501     # argument: <char>, <input coding system>
 502     # return:   char in ucs@mcs.
 503     my($char,$in_cs)=@_;
 504     my($output_char);
 505
 506     return $char if($in_cs eq 'ucs@mcs');
 507
 508     if(($output_char)=&get_chars_matching("=$in_cs",unpack("U",$char))){
 509         return $output_char;
 510     }else{
 511         return $char;
 512     }
 513 }
 514
 515 sub get_chars_unified{
 516     my($char)=@_;
 517     my($chars,$ucs,$char_ucs);
 518     my(@chars);
 519
 520     if($chars=&get_char_attribute($char,'->ucs-unified')){
 521         $chars=~s/^\((.*)\)$/$1/;
 522         return (split(/\s*\?/,$chars));
 523     }elsif($ucs=&get_char_attribute($char,'=>ucs*')
 524           or $ucs=&get_char_attribute($char,'=>ucs')){
 525         $char_ucs=pack("U",$ucs);
 526         if($chars=&get_char_attribute($char_ucs,'->ucs-unified')){
 527             $chars=~s/^\((.*)\)$/$1/;
 528             @chars=grep {not /^$char$/}
 529                 (split(/\s*\?/,$chars));
 530             push(@chars,$char_ucs);
 531             return @chars;
 532         }
 533     }
 534 }
 535
 536 sub get_macro_for_GT{
 537     # argument: <char>
 538     # return: TeX macro for GT fonts or undef.
 539     my($char)=@_;
 540     my($gt,$GT);
 541     foreach (@GT){
 542         if($gt=&get_char_attribute($char,$_)){
 543             m/gt\-pj\-(\d+)/ and $GT=$1;
 544             last;
 545         }
 546     }
 547     if($gt){
 548         return "{\\fontencoding{OT1}\\fontfamily{"
 549             .sprintf("gt%02d",$GT)
 550             ."}\\selectfont\\char".($gt|0x8080)."}";
 551     }else{
 552         return undef;
 553     }
 554 }
 555
 556 sub get_macro_for_HZK{
 557     # argument: <char>
 558     # return: TeX macro for Hanziku fonts or undef.
 559     my($char)=@_;
 560     my($hzk,$HZK);
 561     foreach (@HZK){
 562         if($hzk=&get_char_attribute($char,$_)){
 563             m/hanziku\-(\d+)/ and $HZK=$1;
 564             last;
 565         }
 566     }
 567     if($hzk){
 568         return "{\\fontencoding{OT1}\\fontfamily{".sprintf("hzk%02d",$HZK)."}\\selectfont\\char".$hzk."}";
 569     }else{
 570         return undef;
 571     }
 572 }
 573
 574 sub get_macro_for_CDP{
 575     # argument: <char>
 576     # return: TeX macro for CDP fonts or undef.
 577     my($char)=@_;
 578     my($cdp,$ucs);
 579     foreach (@CDP){
 580         if($cdp=&get_char_attribute($char,$_)){
 581             last;
 582         }
 583     }
 584     if($cdp){
 585         $ucs=&get_char_attribute(&get_chars_matching("=big5-pua",$cdp),"=ucs");
 586         if($ucs){
 587             return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char"
 588                 .$ucs.
 589                     "}";
 590         }else{
 591             print STDERR "This should not happen.\n";
 592             print STDERR "ucs code point of CDP: $cdp not found.\n";
 593         }
 594     }else{
 595         return undef;
 596     }
 597 }