inCHISE

   1 #!/usr/bin/perl -CSD
   2
   3 # ver.0.2
   4
   5 use strict;
   6 use vars qw($omegadb_path
   7             $opt_protrude $opt_allow_unify
   8             %opt_order %order %order_map
   9             $opt_in_cs $opt_out_cs
  10             $opt_help $usage
  11             $in_cs $out_cs $i @chars
  12             $char $char_id $out_char
  13             $char_id_unified @char_id_unified
  14             $ids $ids_argc %ids $idsdb
  15             $idsdata_file $ids_start $font_start
  16             @CDP @HZK @GT
  17             );
  18 use Getopt::Long;
  19 use utf8;
  20 use Fcntl ':flock';
  21 use Chise_utils ':all';
  22 require 5.008;
  23
  24 my $omegadb_path="/usr/local/lib/chise/omega";
  25
  26 ### Options ###
  27
  28 #$opt_order{'UniMulti'}='jcgk';
  29 $opt_order{'UniMulti'}='jGcgkHC';
  30 $opt_order{'UniGB'}='g';
  31 $opt_order{'UniCNS'}='c';
  32 $opt_order{'UniJIS'}='j';
  33 $opt_order{'UniKS'}='k';
  34
  35 $opt_allow_unify=0; # 1=true, 0=false.
  36 $opt_protrude=0;# 1=true, 0=false.
  37
  38 ### End ###
  39
  40 my $strictly_forbidden_after = '「【『［（〈“‘‘（〔｛《{\[\(\x{3016}｛｢';
  41 #       \x{3016} | # white 【
  42
  43 my $forbidden_after = "\x{0000}";
  44
  45 # ￥¥＄$〒♯＃#￠¢￡£＠@§
  46 my $slightly_forbidden_after = '￥¥＄$〒♯＃#￠¢￡£＠@§';
  47
  48 # $strictly_forbidden_before
  49 # All these characters are allowed to protrude
  50 # in the right margin
  51 my $strictly_forbidden_before=
  52     '!,.:;?、。！，．：；？。\)#}’”〉》」』】〕\x{3017}）］｝｝」\]';
  53 ###       \x{3017} | # white 】
  54
  55 my $forbidden_before
  56     = 'ー々ぁぃぅぇぉゃゅょっゎァィゥェォャュョッヮヵヶ';
  57
  58 my $slightly_forbidden_before
  59     = '\x{000a}\#\-‐−‰′″℃゛゜ゝゞヽヾ＂％－ﾞﾟ';
  60
  61 my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-';
  62
  63 my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}';
  64
  65 &GetOptions("in=s"=>\$opt_in_cs,
  66             "out=s"=>\$opt_out_cs,
  67             "help",\$opt_help);
  68
  69 $usage=<<EOF;
  70 Usage: $0 -i <input coding system> -o <cmap encoding>
  71     input coding system:
  72       Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
  73     cmap encoding:
  74       UniCNS, UniGB, UniJIS, UniKS, UniMulti
  75 EOF
  76
  77 if($opt_in_cs or $opt_out_cs){
  78     $in_cs=$opt_in_cs;
  79     $out_cs=$opt_out_cs;
  80 }elsif(@ARGV==0){
  81     ($in_cs,$out_cs)=($0=~/(Utf8.+)To(\w+)/);
  82 }
  83
  84 # $in_cs:
  85 #   Utf8mcs,Utf8cns,Utf8gb,Utf8jis,Utf8ks,
  86 # $out_cs:
  87 #   UniCNS,UniGB,UniJIS,UniKS,UniMulti
  88
  89 $in_cs=~s/Utf8/ucs\@/;
  90
  91 if($opt_help
  92    or not defined($in_cs)
  93    or not defined($out_cs)){
  94     print $usage;
  95     exit 1;
  96 }
  97
  98 $omegadb_path=~s!/$!!;
  99
 100 $idsdata_file="$omegadb_path/idsdata.pl";
 101 $ids_start=0x00;
 102 $font_start=0;
 103
 104 if(-e $idsdata_file){
 105     open(IDSDATA,"+<:utf8",$idsdata_file) or die;
 106     flock(IDSDATA,LOCK_EX);
 107     seek(IDSDATA,0,0);
 108     while(<IDSDATA>){
 109         eval $_;
 110     }
 111     seek(IDSDATA,0,0);
 112 #         require $idsdata_file;
 113 }else{
 114     open(IDSDATA,">:utf8",$idsdata_file) or die;
 115     flock(IDSDATA,LOCK_EX);
 116     seek(IDSDATA,0,0);
 117 }
 118
 119 $ids_argc=0;
 120 $ids="";
 121
 122 @GT=(#"=gt","=gt-k",
 123      "=gt-pj-1","=gt-pj-2","=gt-pj-3","=gt-pj-4","=gt-pj-5",
 124      "=gt-pj-6","=gt-pj-7","=gt-pj-8","=gt-pj-9","=gt-pj-10",
 125      "=gt-pj-11"
 126      #,"=gt-pj-k1","=gt-pj-k2"
 127      );
 128 @HZK=("=hanziku-1","=hanziku-2","=hanziku-3","=hanziku-4",
 129       "=hanziku-5","=hanziku-6","=hanziku-7","=hanziku-8",
 130       "=hanziku-9","=hanziku-10","=hanziku-11","=hanziku-12");
 131 @CDP=("=big5-cdp");
 132
 133 %order_map=('c'=>'UniCNS',
 134             'g'=>'UniGB',
 135             'j'=>'UniJIS',
 136             'k'=>'UniKS',
 137             'G'=>'GT',
 138             'H'=>'HZK',
 139             'C'=>'CDP',
 140             );
 141
 142 foreach $out_cs ('UniGB','UniCNS','UniJIS','UniKS','UniMulti'){
 143     if(defined($opt_order{$out_cs})){
 144         if($opt_order{$out_cs}=~/^[cgjkGHC]+$/){
 145             @{$order{$out_cs}}=map {$order_map{$_}}
 146             (split(//,$opt_order{$out_cs}));
 147         }else{
 148             print STDERR "Invalid order for $out_cs!\n";
 149             exit 1;
 150         }
 151     }
 152 }
 153
 154 while(<>){
 155     utf8::decode($_);
 156     if($in_cs ne 'ucs@mcs'){
 157         s/(.)/pack("U",&get_char_id(unpack("U",$1),$in_cs))/ge;
 158     }
 159     s/(amp.+?;)/&de_tex_er($1)/ge;
 160 #    s/(&.+?;)/&de_tex_er($1)/ge;
 161     @chars=split(//);
 162   CHAR:
 163     for($i=0;$i<=$#chars;$i++){
 164         $char=$chars[$i];
 165         $char_id=unpack("U",$char);
 166
 167         if($char_id<=0x20){
 168             print $chars[$i];
 169             next CHAR;
 170         }elsif($char_id>0x20 and $char_id<=0x02af){
 171             # Basic Latin
 172             # Latin-1 Supplement
 173             # Latin Extended-A
 174             # Latin Extended-B
 175             # IPA Extensions
 176             print &latin_parse();
 177             next CHAR;
 178         }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
 179             # Ideographic Description Characters
 180             print &ids_parse();
 181             next CHAR;
 182         }else{
 183             if(($out_char=&get_output_char($char_id,$out_cs))){
 184                 print $out_char,&add_break($i);
 185             }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
 186                 # CJK Unified Ideographs Extension B
 187                 if(not defined($ids{$char}) and $ids{$char}[1]>=0){
 188                     $ids{$char}[0]=$font_start;
 189                     $ids{$char}[1]=$ids_start;
 190                     $ids_start++;
 191                     if($ids_start>255){
 192                         $ids_start=0;
 193                         $font_start++;
 194                     }
 195                 }
 196                 print "{\\fontencoding{OT1}\\fontfamily{" .
 197                     sprintf("chise%03d",$ids{$char}[0]) .
 198                     "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
 199                 next CHAR;
 200             }else{
 201                 if($opt_allow_unify){
 202                     @char_id_unified=&get_char_id_unified($char_id);
 203                     if(@char_id_unified>0){
 204                         foreach $char_id_unified (@char_id_unified){
 205                             if(($out_char
 206                                 =&get_output_char($char_id_unified,$out_cs))){
 207                                 print $out_char,&add_break($i);
 208                                 next CHAR;
 209                             }
 210                         }
 211                     }
 212                 }
 213                 if($ids=&get_ids($char)){
 214                     print &get_macro_for_ids($ids),&add_break($i);
 215                 }else{
 216                     print '\rule{1ex}{1ex}',&add_break($i);
 217                 }
 218             }
 219         }
 220     }
 221 }
 222
 223 print IDSDATA 'use utf8;',"\n";
 224 foreach $ids (keys %ids){
 225     print IDSDATA '$ids{\'',$ids,'\'}='
 226     ,'[',join ",",@{$ids{$ids}},"];\n";
 227 }
 228 print IDSDATA '$font_start=',$font_start,";\n";
 229 print IDSDATA '$ids_start=',$ids_start,";\n";
 230 print IDSDATA "1;";
 231 flock(IDSDATA,LOCK_UN);
 232
 233 exit 0;
 234
 235 sub de_tex_er{
 236     my($er)=@_;
 237     my($prefix,$suffix);
 238     my($output_char,$atr,$value);
 239     $er=~/^(amp)(.*)(;)$/
 240         and $prefix=$1,$er=$2,$suffix=$3;
 241     $prefix or $prefix="",$suffix or $suffix="";
 242     if($er=~/^U[\+|\-]([a-fA-F\d]+)/){
 243         $output_char=pack("U",hex($1));
 244     }elsif($er=~/^(?:I\-)?($er_prefix_re)\-?([0-9a-fA-F]+)$/){
 245         ($atr,$value)=($1,$2);
 246         unless($er_alias{$atr}=~/daikanwa|gt/){
 247             $value=hex($value);
 248         }
 249         ($output_char)=&get_chars_matching($er_alias{$atr},$value);
 250         utf8::decode($output_char);
 251     }
 252     if($output_char){
 253         return $output_char;
 254     }else{
 255         return $prefix.$er.$suffix;
 256     }
 257 }
 258
 259 sub add_break{
 260     my($i)=@_;
 261
 262     if($i<($#chars-1)){
 263         if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
 264            and($chars[$i+2]=~m/[$strictly_forbidden_before]/o)){
 265             return "\\CJKunbreakablekernone ";
 266         }elsif($opt_protrude){
 267             if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
 268                and($chars[$i+2]=~m/[^$strictly_forbidden_before]/o)){
 269                 return "\\CJKunbreakablekernone \\CJKprotrude ";
 270             }
 271         }
 272     }
 273     if(($i<$#chars)
 274        and($chars[$i+1]=~m/[$strictly_forbidden_before]/o)){
 275         return "\\CJKunbreakablekernone ";
 276     }
 277     if($chars[$i]=~m/[$strictly_forbidden_after]/o){
 278         return "\\CJKunbreakablekernone ";
 279     }
 280     if(($i<$#chars)
 281        and($chars[$i+1]=~m/[$forbidden_before]/o)){
 282         return "\\CJKunbreakablekerntwo ";
 283
 284     }
 285     if($chars[$i]=~m/[$forbidden_after]/o){
 286         return "\\CJKunbreakablekerntwo ";
 287     }
 288     if(($i<$#chars)
 289        and($chars[$i+1]=~m/[$slightly_forbidden_before]/o)){
 290         return "\\CJKunbreakablekernthree ";
 291     }
 292     if($chars[$i]=~m/[$slightly_forbidden_after]/o){
 293         return "\\CJKunbreakablekernthree ";
 294     }
 295     if($chars[$i]=~m/[$asian]/o){
 296         return "\\CJKbreakablekern ";
 297     }
 298     if(($i<$#chars)and($chars[$i+1]=~m/[$asian]/o)){
 299         return "\\CJKbreakablekern ";
 300     }
 301 }
 302
 303 sub latin_parse{
 304     # arguments: none
 305     # return: string for output with TeX macro.
 306     my($char_id);
 307     my $out_str=$chars[$i];
 308     $i++;
 309     while($i<=$#chars){
 310         $char_id=unpack("U",$chars[$i]);
 311         if($char_id>0x20 and $char_id<=0x02af){
 312             $out_str.=$chars[$i];
 313         }else{
 314             $i--;
 315             last;
 316         }
 317         $i++;
 318     }
 319     return '{\normalfont {'.$out_str.'}}';
 320 }
 321
 322 sub ids_parse{
 323     # arguments: none
 324     # return: character for output,
 325     #          TeX macro for ids,
 326     #          or GETA character if ids is invalid.
 327     my($ids,$ids_argc)=&ids_rest("",0,$chars[$i]);
 328
 329     while($ids_argc>0){
 330         # We are in IDS.
 331         $i++;
 332         if($i>$#chars){
 333             print STDERR "IDS parse error: $ids\n";
 334 #           return pack("U",0xfffd);
 335             return pack("U",0x3013);
 336         }
 337
 338         ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$chars[$i]);
 339         if($ids_argc==0){
 340             if(($char_id=&get_char_id_for_ids($ids))
 341                and($out_char=&get_output_char($char_id,$out_cs))){
 342                 return $out_char;
 343             }else{
 344                 return &get_macro_for_ids($ids);
 345 #               return encode('utf8', &get_macro_for_ids($ids)) if($perl58);
 346             }
 347         }
 348     }
 349 }
 350
 351 sub ids_rest{
 352     # arguments: <ids>, <rest number of arguments for ids>, <character>
 353     # return: ids and rest number of arguments for ids.
 354     my($ids,$ids_argc,$char)=@_;
 355     my($argc);
 356     $argc=&ids_argc($char);
 357     if($argc){
 358         $ids_argc+= $ids_argc==0 ? $argc : $argc-1;
 359     }else{
 360         $ids_argc--;
 361     }
 362     $ids.=$char;
 363 #    $ids.=encode('utf8',$char) if($perl58);
 364     return ($ids,$ids_argc);
 365 }
 366
 367 sub get_macro_for_ids{
 368     # argument: <ids>
 369     # return: TeX macro for ids
 370     #          or GETA character if ids is invalid for KAGE.
 371     my($ids)=@_;
 372     $ids=&normalize_ids($ids,"UniJIS");
 373 #    return pack("U",0xfffd) if(($ids!~/[$idc]/)
 374     return pack("U",0x3013) if(($ids!~/[$idc]/)
 375                                or($ids=~/[\x{10000}-]/));
 376                     #irregular for KAGE.
 377     if(not defined($ids{$ids}) and $ids{$ids}[1]>=0){
 378         $ids{$ids}[0]=$font_start;
 379         $ids{$ids}[1]=$ids_start;
 380         $ids_start++;
 381     }
 382     if($ids_start>255){
 383         $ids_start=0;
 384         $font_start++;
 385     }
 386     return "{\\fontencoding{OT1}\\fontfamily{"
 387         .sprintf("chise%03d",$ids{$ids}[0])
 388         ."}\\selectfont\\char$ids{$ids}[1]}";
 389 }
 390
 391 sub normalize_ids{
 392     # argument: <ids>, <output coding system>
 393     # return: ids or GETA character if ids is invalid for KAGE.
 394     my($ids,$out_cs)=@_;
 395     $out_cs=~s/Uni(.+)/'ucs@'.lc($1)/e;
 396
 397     my $output_ids="";
 398     my($char,$char_id,$output_char_id);
 399     while($ids=~m/(.)/g){
 400         $char=$1;
 401         $char_id=unpack("U",$char);
 402         if($char=~/[$idc]/){
 403             $output_ids.=$char;
 404         }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")){
 405             $output_ids.=pack("U",$output_char_id);
 406         }elsif($output_char_id=&get_char_attribute($char,"=ucs")){
 407             $output_ids.=pack("U",$output_char_id);
 408         }else{
 409 #           return pack("U",0xfffd);
 410             return pack("U",0x3013);
 411         }
 412     }
 413     return $output_ids;
 414 }
 415
 416 sub get_output_char{
 417     # argument: <char-id>
 418     # return: character in output coding system or TeX macro or undef.
 419     my($char_id,$out_cs)=@_;
 420     my($char,$out_char_id,$suffix);
 421     my($gt,$hzk,$cdp);
 422
 423     $char=pack('U',$char_id);
 424
 425     foreach $out_cs (@{$order{$out_cs}}){
 426         if($out_cs eq 'UniJIS'
 427            and &get_char_attribute($char,"vnd-adobe-cid-unijis-utf16-h")){
 428             if($out_char_id=&get_char_attribute($char,'=ucs@jis')
 429                or $out_char_id=&get_char_attribute($char,'=ucs')
 430                or $out_char_id=&get_char_attribute($char,'=>ucs@jis')
 431                or $out_char_id=&get_char_attribute($char,'=>ucs')
 432                ){
 433                 return '{\selectjisfont\char'.$out_char_id.'}';
 434             }
 435         }elsif($out_cs eq 'UniGB'
 436                and &get_char_attribute($char,"vnd-adobe-cid-unigb-ucs2-h")){
 437             if($out_char_id=&get_char_attribute($char,'=ucs@gb')
 438                or $out_char_id=&get_char_attribute($char,'=ucs')
 439                or $out_char_id=&get_char_attribute($char,'=>ucs@gb')
 440                or $out_char_id=&get_char_attribute($char,'=>ucs')
 441                ){
 442                 return '{\selectgbsfont\char'.$out_char_id.'}';
 443             }
 444         }elsif($out_cs eq 'UniCNS'
 445                and &get_char_attribute($char,"vnd-adobe-cid-unicns-ucs2-h")){
 446             if($out_char_id=&get_char_attribute($char,'=ucs@cns')
 447                or $out_char_id=&get_char_attribute($char,'=ucs')
 448                or $out_char_id=&get_char_attribute($char,'=>ucs@cns')
 449                or $out_char_id=&get_char_attribute($char,'=>ucs')
 450                ){
 451                 return '{\selectcnsfont\char'.$out_char_id.'}';
 452             }
 453         }elsif($out_cs eq 'UniKS'
 454                and &get_char_attribute($char,"vnd-adobe-cid-uniks-ucs2-h")){
 455             if($out_char_id=&get_char_attribute($char,'=ucs@ks')
 456                or $out_char_id=&get_char_attribute($char,'=ucs')
 457                or $out_char_id=&get_char_attribute($char,'=>ucs@ks')
 458                or $out_char_id=&get_char_attribute($char,'=>ucs')
 459                ){
 460                 return '{\selectksxfont\char'.$out_char_id.'}';
 461             }
 462         }elsif($out_cs eq 'GT'){
 463             return $gt if($gt=&get_macro_for_GT($char_id));
 464         }elsif($out_cs eq 'HZK'){
 465             return $hzk if($hzk=&get_macro_for_HZK($char_id));
 466         }elsif($out_cs eq 'CDP'){
 467             return $cdp if($cdp=&get_macro_for_CDP($char_id));
 468         }
 469     }
 470     return undef;
 471 }
 472
 473 sub get_ids{
 474     # argument: <character>
 475     # return: ids
 476     my($char)=@_;
 477     my $ids="";
 478     $ids=&get_char_attribute($char,"ids-aggregated")
 479         or $ids=&get_char_attribute($char,"ids");
 480 #         or $ids=&get_char_attribute($char,"ideographic-structure");
 481     utf8::decode($ids);
 482 #    $ids=~s/[? ()]//g;
 483     return $ids;
 484 }
 485
 486 sub get_char_id_for_ids{
 487     # argument: <ideographic description sequence>
 488     # return: char-id or undef.
 489     my($ids)=@_;
 490     my($output_char);
 491
 492     if(($output_char)=&get_chars_matching("ids",$ids)){
 493         utf8::decode($output_char);
 494         return unpack("U",$output_char);
 495     }else{
 496         return undef;
 497     }
 498 }
 499
 500 sub get_char_id{
 501     # argument: <char-id>, <input coding system>
 502     # return:   char-id.
 503     my($char_id,$in_cs)=@_;
 504     my($output_char);
 505
 506     return $char_id if($in_cs eq 'ucs@mcs');
 507
 508     if(($output_char)=&get_chars_matching("=$in_cs",$char_id)){
 509         utf8::decode($output_char);
 510         return unpack("U",$output_char);
 511     }else{
 512         return $char_id;
 513     }
 514 }
 515
 516 sub get_char_id_unified{
 517     my($char_id)=@_;
 518     my($char,$chars,$ucs);
 519     my(@char_id);
 520     $char=pack("U",$char_id);
 521
 522     if($chars=&get_char_attribute($char,'->ucs-unified')){
 523         utf8::decode($chars);
 524         $chars=~s/^\((.*)\)$/$1/;
 525         return map {unpack("U",$_)} (split(/\s*\?/,$chars));
 526     }elsif($ucs=&get_char_attribute(pack("U",$char_id),'=>ucs')){
 527         if($chars=&get_char_attribute(pack("U",$ucs),'->ucs-unified')){
 528             utf8::decode($chars);
 529             $chars=~s/^\((.*)\)$/$1/;
 530             @char_id=grep {$char_id!=$_}
 531                 map {unpack("U",$_)}
 532                     (split(/\s*\?/,$chars));
 533             push(@char_id,$ucs);
 534             return @char_id;
 535         }
 536     }
 537 }
 538
 539 sub get_macro_for_GT{
 540     # argument: <char-id>
 541     # return: TeX macro for GT fonts or undef.
 542     my($char_id)=@_;
 543     my($char,$gt,$GT);
 544     $char=pack("U",$char_id);
 545     foreach (@GT){
 546         if($gt=&get_char_attribute($char,$_)){
 547             m/gt\-pj\-(\d+)/ and $GT=$1;
 548             last;
 549         }
 550     }
 551     if($gt){
 552         return "{\\fontencoding{OT1}\\fontfamily{"
 553             .sprintf("gt%02d",$GT)
 554             ."}\\selectfont\\char".($gt|0x8080)."}";
 555     }else{
 556         return undef;
 557     }
 558 }
 559
 560 sub get_macro_for_HZK{
 561     # argument: <char-id>
 562     # return: TeX macro for Hanziku fonts or undef.
 563     my($char_id)=@_;
 564     my($char,$hzk,$HZK);
 565     $char=pack("U",$char_id);
 566     foreach (@HZK){
 567         if($hzk=&get_char_attribute($char,$_)){
 568             m/hanziku\-(\d+)/ and $HZK=$1;
 569             last;
 570         }
 571     }
 572     if($hzk){
 573         return "{\\fontencoding{OT1}\\fontfamily{".sprintf("hzk%02d",$HZK)."}\\selectfont\\char".$hzk."}";
 574     }else{
 575         return undef;
 576     }
 577 }
 578
 579 sub get_macro_for_CDP{
 580     # argument: <char-id>
 581     # return: TeX macro for CDP fonts or undef.
 582     my($char_id)=@_;
 583     my($char,$cdp,$ucs);
 584     $char=pack("U",$char_id);
 585     foreach (@CDP){
 586         if($cdp=&get_char_attribute($char,$_)){
 587             last;
 588         }
 589     }
 590     if($cdp){
 591         $ucs=&get_char_attribute(utf8::decode(&get_chars_matching("=big5-pua",$cdp)),"=ucs");
 592         if($ucs){
 593             return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char"
 594                 .$ucs.
 595                     "}";
 596         }else{
 597             print STDERR "This hould not happen.\n";
 598             print STDERR "ucs code point of CDP: $cdp not found.\n";
 599         }
 600     }else{
 601         return undef;
 602     }
 603 }