inCHISE

   1 #!/usr/bin/perl -w -CSD
   2
   3 # ver.0.2
   4
   5 use strict;
   6 use vars qw($omegadb_path
   7             $opt_protrude $opt_allow_unify
   8             %opt_order %order %order_map
   9             $opt_in_cs $opt_out_cs
  10             $opt_help $usage
  11             $in_cs $out_cs $i @chars
  12             $char $char_id $out_char
  13             $char_unified @chars_unified
  14             $ids $ids_argc %ids $idsdb $geta
  15             $idsdata_file $ids_start $font_start
  16             @CDP @HZK @GT
  17             );
  18 use Getopt::Long;
  19 use utf8;
  20 use Fcntl ':flock';
  21 use Chise_utils ':all';
  22 require 5.008;
  23
  24 my $omegadb_path="/usr/local/lib/chise/omega";
  25
  26 ### Options ###
  27
  28 #$opt_order{'UniMulti'}='jcgk';
  29 $opt_order{'UniMulti'}='jGcgkHC';
  30 $opt_order{'UniCNS'}='c';
  31 $opt_order{'UniGB'}='g';
  32 $opt_order{'UniJIS'}='j';
  33 $opt_order{'UniKS'}='k';
  34
  35 $opt_allow_unify=1; # 1=true, 0=false.
  36 $opt_protrude=0;# 1=true, 0=false.
  37
  38 ### End ###
  39
  40 my $strictly_forbidden_after = '「【『［（〈“‘‘（〔｛《{\[\(\x{3016}｛｢';
  41 #       \x{3016} | # white 【
  42
  43 my $forbidden_after = "\x{0000}";
  44
  45 # ￥¥＄$〒♯＃#￠¢￡£＠@§
  46 my $slightly_forbidden_after = '￥¥＄$〒♯＃#￠¢￡£＠@§';
  47
  48 # $strictly_forbidden_before
  49 # All these characters are allowed to protrude
  50 # in the right margin
  51 my $strictly_forbidden_before=
  52     '!,.:;?、。！，．：；？。\)#}’”〉》」』】〕\x{3017}）］｝｝」\]';
  53 ###       \x{3017} | # white 】
  54
  55 my $forbidden_before
  56     = 'ー々ぁぃぅぇぉゃゅょっゎァィゥェォャュョッヮヵヶ';
  57
  58 my $slightly_forbidden_before
  59     = '\x{000a}\#\-‐−‰′″℃゛゜ゝゞヽヾ＂％－ﾞﾟ';
  60
  61 my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-\x{FFFFFF}';
  62
  63 my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}';
  64
  65 &GetOptions("in=s"=>\$opt_in_cs,
  66             "out=s"=>\$opt_out_cs,
  67             "help",\$opt_help);
  68
  69 $usage=<<EOF;
  70 Usage: $0 -i <input coding system> -o <cmap encoding>
  71     input coding system:
  72       Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
  73     cmap encoding:
  74       UniCNS, UniGB, UniJIS, UniKS, UniMulti
  75 EOF
  76
  77 if($opt_in_cs or $opt_out_cs){
  78     $in_cs=$opt_in_cs;
  79     $out_cs=$opt_out_cs;
  80 }elsif(@ARGV==0){
  81     ($in_cs,$out_cs)=($0=~/(Utf8.+)To(\w+)/);
  82 }
  83
  84 # $in_cs:
  85 #   Utf8mcs,Utf8cns,Utf8gb,Utf8jis,Utf8ks,
  86 # $out_cs:
  87 #   UniCNS,UniGB,UniJIS,UniKS,UniMulti
  88
  89 $in_cs=~s/Utf8/ucs\@/;
  90
  91 if($opt_help
  92    or not defined($in_cs)
  93    or not defined($out_cs)){
  94     print $usage;
  95     exit 1;
  96 }
  97
  98 $omegadb_path=~s!/$!!;
  99
 100 $idsdata_file="$omegadb_path/idsdata.pl";
 101 $ids_start=0x00;
 102 $font_start=0;
 103
 104 if(-e $idsdata_file){
 105     open(IDSDATA,"+<:utf8",$idsdata_file) or die;
 106     flock(IDSDATA,LOCK_EX);
 107     seek(IDSDATA,0,0);
 108     while(<IDSDATA>){
 109         eval $_;
 110     }
 111     seek(IDSDATA,0,0);
 112 #         require $idsdata_file;
 113 }else{
 114     open(IDSDATA,">:utf8",$idsdata_file) or die;
 115     flock(IDSDATA,LOCK_EX);
 116     seek(IDSDATA,0,0);
 117 }
 118
 119 $ids_argc=0;
 120 $ids="";
 121
 122 $geta=pack("U",0x3013);
 123 #$geta=pack("U",0xfffd);
 124
 125 @GT=(#"=gt","=gt-k",
 126      "=gt-pj-1","=gt-pj-2","=gt-pj-3","=gt-pj-4","=gt-pj-5",
 127      "=gt-pj-6","=gt-pj-7","=gt-pj-8","=gt-pj-9","=gt-pj-10",
 128      "=gt-pj-11"
 129      #,"=gt-pj-k1","=gt-pj-k2"
 130      );
 131 @HZK=("=hanziku-1","=hanziku-2","=hanziku-3","=hanziku-4",
 132       "=hanziku-5","=hanziku-6","=hanziku-7","=hanziku-8",
 133       "=hanziku-9","=hanziku-10","=hanziku-11","=hanziku-12");
 134 @CDP=("=big5-cdp");
 135
 136 %order_map=('c'=>'UniCNS',
 137             'g'=>'UniGB',
 138             'j'=>'UniJIS',
 139             'k'=>'UniKS',
 140             'G'=>'GT',
 141             'H'=>'HZK',
 142             'C'=>'CDP',
 143             );
 144
 145 foreach $out_cs ('UniCNS','UniGB','UniJIS','UniKS','UniMulti'){
 146     if(defined($opt_order{$out_cs})){
 147         if($opt_order{$out_cs}=~/^[cgjkGHC]+$/){
 148             @{$order{$out_cs}}=map {$order_map{$_}}
 149             (split(//,$opt_order{$out_cs}));
 150         }else{
 151             print STDERR "Invalid order for $out_cs!\n";
 152             exit 1;
 153         }
 154     }
 155 }
 156
 157 while(<>){
 158     utf8::decode($_);
 159     if($in_cs ne 'ucs@mcs'){
 160         s/(.)/&get_char_in_mcs($1,$in_cs)/ge;
 161     }
 162     s/(amp.+?;)/&de_tex_er($1)/ge;
 163 #    s/(&.+?;)/&de_tex_er($1)/ge;
 164     @chars=split(//);
 165   CHAR:
 166     for($i=0;$i<=$#chars;$i++){
 167         $char=$chars[$i];
 168         $char_id=unpack("U",$char);
 169
 170         if($char_id<=0x20){
 171             print $chars[$i];
 172             next CHAR;
 173         }elsif($char_id>0x20 and $char_id<=0x02af){
 174             # Basic Latin
 175             # Latin-1 Supplement
 176             # Latin Extended-A
 177             # Latin Extended-B
 178             # IPA Extensions
 179             print &latin_parse();
 180             next CHAR;
 181         }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
 182             # Ideographic Description Characters
 183             print &ids_parse();
 184             next CHAR;
 185         }else{
 186             if(($out_char=&get_output_char($char,$out_cs))){
 187                 print $out_char,&add_break($i);
 188             }elsif($opt_allow_unify){
 189                 @chars_unified=&get_chars_unified($char);
 190                 if(@chars_unified>0){
 191                     foreach $char_unified (@chars_unified){
 192                         if(($out_char
 193                             =&get_output_char($char_unified,$out_cs))){
 194                             print $out_char,&add_break($i);
 195                             next CHAR;
 196                         }
 197                     }
 198                 }
 199             }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
 200                 # CJK Unified Ideographs Extension B
 201                 if(not defined($ids{$char}) and $ids{$char}[1]>=0){
 202                     $ids{$char}[0]=$font_start;
 203                     $ids{$char}[1]=$ids_start;
 204                     $ids_start++;
 205                     if($ids_start>255){
 206                         $ids_start=0;
 207                         $font_start++;
 208                     }
 209                 }
 210                 print "{\\fontencoding{OT1}\\fontfamily{" .
 211                     sprintf("chise%03d",$ids{$char}[0]) .
 212                     "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
 213                 next CHAR;
 214             }else{
 215                 if($ids=&get_ids($char)){
 216                     print &get_macro_for_ids($ids),&add_break($i);
 217                 }else{
 218                     print '\rule{1ex}{1ex}',&add_break($i);
 219                 }
 220             }
 221         }
 222     }
 223 }
 224
 225 print IDSDATA 'use utf8;',"\n";
 226 foreach $ids (keys %ids){
 227     print IDSDATA '$ids{\'',$ids,'\'}='
 228     ,'[',join ",",@{$ids{$ids}},"];\n";
 229 }
 230 print IDSDATA '$font_start=',$font_start,";\n";
 231 print IDSDATA '$ids_start=',$ids_start,";\n";
 232 print IDSDATA "1;";
 233 flock(IDSDATA,LOCK_UN);
 234
 235 exit 0;
 236
 237 sub de_tex_er{
 238     my($er)=@_;
 239     my($prefix,$suffix);
 240     my($output_char,$atr,$value);
 241     $er=~/^(amp)(.*)(;)$/
 242         and $prefix=$1,$er=$2,$suffix=$3;
 243     $prefix or $prefix="",$suffix or $suffix="";
 244     if($er=~/^U[\+|\-]([a-fA-F\d]+)/){
 245         $output_char=pack("U",hex($1));
 246     }elsif($er=~/^(?:I\-)?($er_prefix_re)\-?([0-9a-fA-F]+)$/){
 247         ($atr,$value)=($1,$2);
 248         unless($er_alias{$atr}=~/daikanwa|gt/){
 249             $value=hex($value);
 250         }
 251         ($output_char)=&get_chars_matching($er_alias{$atr},$value);
 252     }
 253     if($output_char){
 254         return $output_char;
 255     }else{
 256         return $prefix.$er.$suffix;
 257     }
 258 }
 259
 260 sub add_break{
 261     my($i)=@_;
 262
 263     if($i<($#chars-1)){
 264         if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
 265            and($chars[$i+2]=~m/[$strictly_forbidden_before]/o)){
 266             return "\\CJKunbreakablekernone ";
 267         }elsif($opt_protrude){
 268             if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
 269                and($chars[$i+2]=~m/[^$strictly_forbidden_before]/o)){
 270                 return "\\CJKunbreakablekernone \\CJKprotrude ";
 271             }
 272         }
 273     }
 274     if(($i<$#chars)
 275        and($chars[$i+1]=~m/[$strictly_forbidden_before]/o)){
 276         return "\\CJKunbreakablekernone ";
 277     }
 278     if($chars[$i]=~m/[$strictly_forbidden_after]/o){
 279         return "\\CJKunbreakablekernone ";
 280     }
 281     if(($i<$#chars)
 282        and($chars[$i+1]=~m/[$forbidden_before]/o)){
 283         return "\\CJKunbreakablekerntwo ";
 284
 285     }
 286     if($chars[$i]=~m/[$forbidden_after]/o){
 287         return "\\CJKunbreakablekerntwo ";
 288     }
 289     if(($i<$#chars)
 290        and($chars[$i+1]=~m/[$slightly_forbidden_before]/o)){
 291         return "\\CJKunbreakablekernthree ";
 292     }
 293     if($chars[$i]=~m/[$slightly_forbidden_after]/o){
 294         return "\\CJKunbreakablekernthree ";
 295     }
 296     if($chars[$i]=~m/[$asian]/o){
 297         return "\\CJKbreakablekern ";
 298     }
 299     if(($i<$#chars)and($chars[$i+1]=~m/[$asian]/o)){
 300         return "\\CJKbreakablekern ";
 301     }
 302 }
 303
 304 sub latin_parse{
 305     # arguments: none
 306     # return: string for output with TeX macro.
 307     my($char_id);
 308     my $out_str=$chars[$i];
 309     $i++;
 310     while($i<=$#chars){
 311         $char_id=unpack("U",$chars[$i]);
 312         if($char_id>0x20 and $char_id<=0x02af){
 313             $out_str.=$chars[$i];
 314         }else{
 315             $i--;
 316             last;
 317         }
 318         $i++;
 319     }
 320     return '{\normalfont {'.$out_str.'}}';
 321 }
 322
 323 sub ids_parse{
 324     # arguments: none
 325     # return: character for output,
 326     #          TeX macro for ids,
 327     #          or GETA character if ids is invalid.
 328     my($ids,$ids_argc)=&ids_rest("",0,$chars[$i]);
 329
 330     while($ids_argc>0){
 331         # We are in IDS.
 332         $i++;
 333         if($i>$#chars){
 334             print STDERR "IDS parse error: $ids\n";
 335             return $geta;
 336         }
 337
 338         ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$chars[$i]);
 339         if($ids_argc==0){
 340             if(($char=&get_char_for_ids($ids))
 341                and($out_char=&get_output_char($char,$out_cs))){
 342                 return $out_char;
 343             }else{
 344                 return &get_macro_for_ids($ids);
 345             }
 346         }
 347     }
 348 }
 349
 350 sub ids_rest{
 351     # arguments: <ids>, <rest number of arguments for ids>, <character>
 352     # return: ids and rest number of arguments for ids.
 353     my($ids,$ids_argc,$char)=@_;
 354     my($argc);
 355     $argc=&ids_argc($char);
 356     if($argc){
 357         $ids_argc+=$ids_argc==0?$argc:$argc-1;
 358     }else{
 359         $ids_argc--;
 360     }
 361     $ids.=$char;
 362     return ($ids,$ids_argc);
 363 }
 364
 365 sub get_macro_for_ids{
 366     # argument: <ids>
 367     # return: TeX macro for ids
 368     #          or GETA character if ids is invalid for KAGE.
 369     my($ids)=@_;
 370     $ids=&normalize_ids($ids,"UniJIS");
 371     return $geta if(($ids!~/[$idc]/)
 372                     or($ids=~/[\x{10000}-]/));
 373                     #irregular for KAGE.
 374     if(not defined($ids{$ids}) and $ids{$ids}[1]>=0){
 375         $ids{$ids}[0]=$font_start;
 376         $ids{$ids}[1]=$ids_start;
 377         $ids_start++;
 378     }
 379     if($ids_start>255){
 380         $ids_start=0;
 381         $font_start++;
 382     }
 383     return "{\\fontencoding{OT1}\\fontfamily{"
 384         .sprintf("chise%03d",$ids{$ids}[0])
 385         ."}\\selectfont\\char$ids{$ids}[1]}";
 386 }
 387
 388 sub normalize_ids{
 389     # argument: <ids>, <output coding system>
 390     # return: ids or GETA character if ids is invalid for KAGE.
 391     my($ids,$out_cs)=@_;
 392     $out_cs=~s/Uni(.+)/'ucs@'.lc($1)/e;
 393
 394     my $output_ids="";
 395     my($char,$output_char_id);
 396     while($ids=~m/(.)/g){
 397         $char=$1;
 398         if($char=~/[$idc]/){
 399             $output_ids.=$char;
 400         }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")
 401            or $output_char_id=&get_char_attribute($char,"=ucs")
 402            or $output_char_id=&get_char_attribute($char,"=>$out_cs")
 403            or $output_char_id=&get_char_attribute($char,"=>ucs")
 404            or $output_char_id=&get_char_attribute($char,"=>ucs*")
 405               ){
 406             $output_ids.=pack("U",$output_char_id);
 407         }else{
 408             return $geta;
 409         }
 410     }
 411     return $output_ids;
 412 }
 413
 414 sub get_output_char{
 415     # argument: <char>
 416     # return: character in output coding system or TeX macro or undef.
 417     my($char,$out_cs)=@_;
 418     my($out_char_id,$suffix);
 419     my($gt,$hzk,$cdp);
 420
 421     foreach $out_cs (@{$order{$out_cs}}){
 422         if($out_cs eq 'UniJIS'
 423            and &get_char_attribute($char,"vnd-adobe-cid-unijis-utf16-h")){
 424             if($out_char_id=&get_char_attribute($char,'=ucs@jis')
 425                or $out_char_id=&get_char_attribute($char,'=ucs')
 426                or $out_char_id=&get_char_attribute($char,'=>ucs@jis')
 427                or $out_char_id=&get_char_attribute($char,'=>ucs')
 428                or $out_char_id=&get_char_attribute($char,'=>ucs*')
 429                ){
 430                 return '{\selectjisfont\char'.$out_char_id.'}';
 431             }
 432         }elsif($out_cs eq 'UniGB'
 433                and &get_char_attribute($char,"vnd-adobe-cid-unigb-ucs2-h")){
 434             if($out_char_id=&get_char_attribute($char,'=ucs@gb')
 435                or $out_char_id=&get_char_attribute($char,'=ucs')
 436                or $out_char_id=&get_char_attribute($char,'=>ucs@gb')
 437                or $out_char_id=&get_char_attribute($char,'=>ucs')
 438                or $out_char_id=&get_char_attribute($char,'=>ucs*')
 439                ){
 440                 return '{\selectgbsfont\char'.$out_char_id.'}';
 441             }
 442         }elsif($out_cs eq 'UniCNS'
 443                and &get_char_attribute($char,"vnd-adobe-cid-unicns-ucs2-h")){
 444             if($out_char_id=&get_char_attribute($char,'=ucs@cns')
 445                or $out_char_id=&get_char_attribute($char,'=ucs')
 446                or $out_char_id=&get_char_attribute($char,'=>ucs@cns')
 447                or $out_char_id=&get_char_attribute($char,'=>ucs')
 448                or $out_char_id=&get_char_attribute($char,'=>ucs*')
 449                ){
 450                 return '{\selectcnsfont\char'.$out_char_id.'}';
 451             }
 452         }elsif($out_cs eq 'UniKS'
 453                and &get_char_attribute($char,"vnd-adobe-cid-uniks-ucs2-h")){
 454             if($out_char_id=&get_char_attribute($char,'=ucs@ks')
 455                or $out_char_id=&get_char_attribute($char,'=ucs')
 456                or $out_char_id=&get_char_attribute($char,'=>ucs@ks')
 457                or $out_char_id=&get_char_attribute($char,'=>ucs')
 458                or $out_char_id=&get_char_attribute($char,'=>ucs*')
 459                ){
 460                 return '{\selectksxfont\char'.$out_char_id.'}';
 461             }
 462         }elsif($out_cs eq 'GT'){
 463             return $gt if($gt=&get_macro_for_GT($char));
 464         }elsif($out_cs eq 'HZK'){
 465             return $hzk if($hzk=&get_macro_for_HZK($char));
 466         }elsif($out_cs eq 'CDP'){
 467             return $cdp if($cdp=&get_macro_for_CDP($char));
 468         }
 469     }
 470     return undef;
 471 }
 472
 473 sub get_ids{
 474     # argument: <character>
 475     # return: ids
 476     my($char)=@_;
 477     my $ids="";
 478     $ids=&get_char_attribute($char,"ids-aggregated")
 479         or $ids=&get_char_attribute($char,"ids");
 480 #         or $ids=&get_char_attribute($char,"ideographic-structure");
 481 #    $ids=~s/[? ()]//g;
 482     return $ids;
 483 }
 484
 485 sub get_char_for_ids{
 486     # argument: <ideographic description sequence>
 487     # return: char or undef.
 488     my($ids)=@_;
 489     my($output_char);
 490
 491     if(($output_char)=&get_chars_matching("ids",$ids)){
 492         return $output_char;
 493     }else{
 494         return undef;
 495     }
 496 }
 497
 498 sub get_char_in_mcs{
 499     # argument: <char>, <input coding system>
 500     # return:   char in ucs@mcs.
 501     my($char,$in_cs)=@_;
 502     my($output_char);
 503
 504     return $char if($in_cs eq 'ucs@mcs');
 505
 506     if(($output_char)=&get_chars_matching("=$in_cs",unpack("U",$char))){
 507         return $output_char;
 508     }else{
 509         return $char;
 510     }
 511 }
 512
 513 sub get_chars_unified{
 514     my($char)=@_;
 515     my($chars,$ucs,$char_ucs);
 516     my(@chars);
 517
 518     if($chars=&get_char_attribute($char,'->ucs-unified')){
 519         $chars=~s/^\((.*)\)$/$1/;
 520         return (split(/\s*\?/,$chars));
 521     }elsif($ucs=&get_char_attribute($char,'=>ucs*')
 522           or $ucs=&get_char_attribute($char,'=>ucs')){
 523         $char_ucs=pack("U",$ucs);
 524         if($chars=&get_char_attribute($char_ucs,'->ucs-unified')){
 525             $chars=~s/^\((.*)\)$/$1/;
 526             @chars=grep {not /^$char$/}
 527                 (split(/\s*\?/,$chars));
 528             push(@chars,$char_ucs);
 529             return @chars;
 530         }
 531     }
 532 }
 533
 534 sub get_macro_for_GT{
 535     # argument: <char>
 536     # return: TeX macro for GT fonts or undef.
 537     my($char)=@_;
 538     my($gt,$GT);
 539     foreach (@GT){
 540         if($gt=&get_char_attribute($char,$_)){
 541             m/gt\-pj\-(\d+)/ and $GT=$1;
 542             last;
 543         }
 544     }
 545     if($gt){
 546         return "{\\fontencoding{OT1}\\fontfamily{"
 547             .sprintf("gt%02d",$GT)
 548             ."}\\selectfont\\char".($gt|0x8080)."}";
 549     }else{
 550         return undef;
 551     }
 552 }
 553
 554 sub get_macro_for_HZK{
 555     # argument: <char>
 556     # return: TeX macro for Hanziku fonts or undef.
 557     my($char)=@_;
 558     my($hzk,$HZK);
 559     foreach (@HZK){
 560         if($hzk=&get_char_attribute($char,$_)){
 561             m/hanziku\-(\d+)/ and $HZK=$1;
 562             last;
 563         }
 564     }
 565     if($hzk){
 566         return "{\\fontencoding{OT1}\\fontfamily{".sprintf("hzk%02d",$HZK)."}\\selectfont\\char".$hzk."}";
 567     }else{
 568         return undef;
 569     }
 570 }
 571
 572 sub get_macro_for_CDP{
 573     # argument: <char>
 574     # return: TeX macro for CDP fonts or undef.
 575     my($char)=@_;
 576     my($cdp,$ucs);
 577     foreach (@CDP){
 578         if($cdp=&get_char_attribute($char,$_)){
 579             last;
 580         }
 581     }
 582     if($cdp){
 583         $ucs=&get_char_attribute(&get_chars_matching("=big5-pua",$cdp),"=ucs");
 584         if($ucs){
 585             return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char"
 586                 .$ucs.
 587                     "}";
 588         }else{
 589             print STDERR "This should not happen.\n";
 590             print STDERR "ucs code point of CDP: $cdp not found.\n";
 591         }
 592     }else{
 593         return undef;
 594     }
 595 }