inCHISE

   1 #!/usr/bin/perl -CSD
   2
   3 # ver.0.2
   4
   5 use strict;
   6 use vars qw($omegadb_path
   7             $opt_protrude %opt_order
   8             $opt_in_cs $opt_out_cs
   9             $opt_help $usage
  10             $in_cs $out_cs $i @chars
  11             %order %order_map
  12             $char $char_id $out_char
  13             $ids $ids_argc %ids $idsdb
  14             $idsdata_file $ids_start $font_start
  15             @CDP @HZK @GT
  16             );
  17 use Getopt::Long;
  18 use utf8;
  19 use Fcntl ':flock';
  20 use Chise_utils ':all';
  21 require 5.008;
  22
  23 my $omegadb_path="/usr/local/lib/chise/omega";
  24
  25 ### Options ###
  26
  27 #$opt_order{'UniMulti'}='jcgk';
  28 $opt_order{'UniMulti'}='jGcgkHC';
  29 $opt_order{'UniGB'}='g';
  30 $opt_order{'UniCNS'}='c';
  31 $opt_order{'UniJIS'}='j';
  32 $opt_order{'UniKS'}='k';
  33
  34 $opt_protrude=0;# 1=true, 0=false.
  35
  36 ### End ###
  37
  38 my $strictly_forbidden_after = '「【『［（〈“‘‘（〔｛《{\[\(\x{3016}｛｢';
  39 #       \x{3016} | # white 【
  40
  41 my $forbidden_after = "\x{0000}";
  42
  43 # ￥¥＄$〒♯＃#￠¢￡£＠@§
  44 my $slightly_forbidden_after = '￥¥＄$〒♯＃#￠¢￡£＠@§';
  45
  46 # $strictly_forbidden_before
  47 # All these characters are allowed to protrude
  48 # in the right margin
  49 my $strictly_forbidden_before=
  50     '!,.:;?、。！，．：；？。\)#}’”〉》」』】〕\x{3017}）］｝｝」\]';
  51 ###       \x{3017} | # white 】
  52
  53 my $forbidden_before
  54     = 'ー々ぁぃぅぇぉゃゅょっゎァィゥェォャュョッヮヵヶ';
  55
  56 my $slightly_forbidden_before
  57     = '\x{000a}\#\-‐−‰′″℃゛゜ゝゞヽヾ＂％－ﾞﾟ';
  58
  59 my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-';
  60
  61 my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}';
  62
  63 &GetOptions("in=s"=>\$opt_in_cs,
  64             "out=s"=>\$opt_out_cs,
  65             "help",\$opt_help);
  66
  67 $usage=<<EOF;
  68 Usage: $0 -i <input coding system> -o <cmap encoding>
  69     input coding system:
  70       Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
  71     cmap encoding:
  72       UniCNS, UniGB, UniJIS, UniKS, UniMulti
  73 EOF
  74
  75 if($opt_in_cs or $opt_out_cs){
  76     $in_cs=$opt_in_cs;
  77     $out_cs=$opt_out_cs;
  78 }elsif(@ARGV==0){
  79     ($in_cs,$out_cs)=($0=~/(Utf8.+)To(\w+)/);
  80 }
  81
  82 # $in_cs:
  83 #   Utf8mcs,Utf8cns,Utf8gb,Utf8jis,Utf8ks,
  84 # $out_cs:
  85 #   UniCNS,UniGB,UniJIS,UniKS,UniMulti
  86
  87 $in_cs=~s/Utf8/ucs\@/;
  88
  89 if($opt_help
  90    or not defined($in_cs)
  91    or not defined($out_cs)){
  92     print $usage;
  93     exit 1;
  94 }
  95
  96 $omegadb_path=~s!/$!!;
  97
  98 $idsdata_file="$omegadb_path/idsdata.pl";
  99 $ids_start=0x00;
 100 $font_start=0;
 101
 102 if(-e $idsdata_file){
 103     open(IDSDATA,"+<$idsdata_file") or die;
 104     flock(IDSDATA,LOCK_EX);
 105     seek(IDSDATA,0,0);
 106     while(<IDSDATA>){
 107         eval $_;
 108     }
 109     seek(IDSDATA,0,0);
 110 #         require $idsdata_file;
 111 }else{
 112     open(IDSDATA,">$idsdata_file") or die;
 113     flock(IDSDATA,LOCK_EX);
 114     seek(IDSDATA,0,0);
 115 }
 116
 117 $ids_argc=0;
 118 $ids="";
 119
 120 @GT=(#"=gt","=gt-k",
 121      "=gt-pj-1","=gt-pj-2","=gt-pj-3","=gt-pj-4","=gt-pj-5",
 122      "=gt-pj-6","=gt-pj-7","=gt-pj-8","=gt-pj-9","=gt-pj-10",
 123      "=gt-pj-11"
 124      #,"=gt-pj-k1","=gt-pj-k2"
 125      );
 126 @HZK=("=hanziku-1","=hanziku-2","=hanziku-3","=hanziku-4",
 127       "=hanziku-5","=hanziku-6","=hanziku-7","=hanziku-8",
 128       "=hanziku-9","=hanziku-10","=hanziku-11","=hanziku-12");
 129 @CDP=("=big5-cdp");
 130
 131 %order_map=('c'=>'UniCNS',
 132             'g'=>'UniGB',
 133             'j'=>'UniJIS',
 134             'k'=>'UniKS',
 135             'G'=>'GT',
 136             'H'=>'HZK',
 137             'C'=>'CDP',
 138             );
 139
 140 foreach $out_cs ('UniGB','UniCNS','UniJIS','UniKS','UniMulti'){
 141     if(defined($opt_order{$out_cs})){
 142         if($opt_order{$out_cs}=~/^[cgjkGHC]*$/){
 143             @{$order{$out_cs}}=map {$order_map{$_}}
 144             (split(//,$opt_order{$out_cs}));
 145         }else{
 146             print STDERR "Invalid order for $out_cs!\n";
 147             exit 1;
 148         }
 149     }
 150 }
 151
 152 while(<>){
 153     utf8::decode($_);
 154     if($in_cs ne 'ucs@mcs'){
 155         s/(.)/pack("U",&get_char_id(unpack("U",$1),$in_cs))/ge;
 156     }
 157     s/(amp.+?;)/&de_tex_er($1)/ge;
 158 #    s/(&.+?;)/&de_tex_er($1)/ge;
 159     @chars=split(//);
 160     for($i=0;$i<=$#chars;$i++){
 161         $char=$chars[$i];
 162         $char_id=unpack("U",$char);
 163
 164         if($char_id<=0x20){
 165             print $chars[$i];
 166             next;
 167         }elsif($char_id>0x20 and $char_id<=0x02af){
 168             # Basic Latin
 169             # Latin-1 Supplement
 170             # Latin Extended-A
 171             # Latin Extended-B
 172             # IPA Extensions
 173             print &latin_parse();
 174             next;
 175         }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
 176             # Ideographic Description Characters
 177             print &ids_parse();
 178             next;
 179         }else{
 180             if(($out_char=&get_output_char($char_id,$out_cs))){
 181                 print $out_char,&add_break($i);
 182             }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
 183                 # CJK Unified Ideographs Extension B
 184                 if(not defined($ids{$char}) and $ids{$char}[1]>=0){
 185                     $ids{$char}[0]=$font_start;
 186                     $ids{$char}[1]=$ids_start;
 187                     $ids_start++;
 188                     if($ids_start>255){
 189                         $ids_start=0;
 190                         $font_start++;
 191                     }
 192                 }
 193                 print "{\\fontencoding{OT1}\\fontfamily{" .
 194                     sprintf("chise%03d",$ids{$char}[0]) .
 195                     "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
 196                 next;
 197             }else{
 198                 if($ids=&get_ids($char)){
 199                     print &get_macro_for_ids($ids),&add_break($i);
 200                 }else{
 201                     print '\rule{1ex}{1ex}',&add_break($i);
 202                 }
 203             }
 204         }
 205     }
 206 }
 207
 208 print IDSDATA 'use utf8;',"\n";
 209 foreach $ids (keys %ids){
 210     print IDSDATA '$ids{\'',$ids,'\'}='
 211     ,'[',join ",",@{$ids{$ids}},"];\n";
 212 }
 213 print IDSDATA '$font_start=',$font_start,";\n";
 214 print IDSDATA '$ids_start=',$ids_start,";\n";
 215 print IDSDATA "1;";
 216 flock(IDSDATA,LOCK_UN);
 217
 218 exit 0;
 219
 220 sub de_tex_er{
 221     my($er)=@_;
 222     my($prefix,$suffix);
 223     my($output_char,$atr,$value);
 224     $er=~/^(amp)(.*)(;)$/
 225         and $prefix=$1,$er=$2,$suffix=$3;
 226     $prefix or $prefix="",$suffix or $suffix="";
 227     if($er=~/^U[\+|\-]([a-fA-F\d]+)/){
 228         $output_char=pack("U",hex($1));
 229     }elsif($er=~/^(?:I\-)?($er_prefix_re)\-?([0-9a-fA-F]+)$/){
 230         ($atr,$value)=($1,$2);
 231         unless($er_alias{$atr}=~/daikanwa|gt/){
 232             $value=hex($value);
 233         }
 234         ($output_char)=&get_chars_matching($er_alias{$atr},$value);
 235         utf8::decode($output_char);
 236     }
 237     if($output_char){
 238         return $output_char;
 239     }else{
 240         return $prefix.$er.$suffix;
 241     }
 242 }
 243
 244 sub add_break{
 245     my($i)=@_;
 246
 247     if($i<($#chars-1)){
 248         if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
 249            and($chars[$i+2]=~m/[$strictly_forbidden_before]/o)){
 250             return "\\CJKunbreakablekernone ";
 251         }elsif($opt_protrude){
 252             if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
 253                and($chars[$i+2]=~m/[^$strictly_forbidden_before]/o)){
 254                 return "\\CJKunbreakablekernone \\CJKprotrude ";
 255             }
 256         }
 257     }
 258     if(($i<$#chars)
 259        and($chars[$i+1]=~m/[$strictly_forbidden_before]/o)){
 260         return "\\CJKunbreakablekernone ";
 261     }
 262     if($chars[$i]=~m/[$strictly_forbidden_after]/o){
 263         return "\\CJKunbreakablekernone ";
 264     }
 265     if(($i<$#chars)
 266        and($chars[$i+1]=~m/[$forbidden_before]/o)){
 267         return "\\CJKunbreakablekerntwo ";
 268
 269     }
 270     if($chars[$i]=~m/[$forbidden_after]/o){
 271         return "\\CJKunbreakablekerntwo ";
 272     }
 273     if(($i<$#chars)
 274        and($chars[$i+1]=~m/[$slightly_forbidden_before]/o)){
 275         return "\\CJKunbreakablekernthree ";
 276     }
 277     if($chars[$i]=~m/[$slightly_forbidden_after]/o){
 278         return "\\CJKunbreakablekernthree ";
 279     }
 280     if($chars[$i]=~m/[$asian]/o){
 281         return "\\CJKbreakablekern ";
 282     }
 283     if(($i<$#chars)and($chars[$i+1]=~m/[$asian]/o)){
 284         return "\\CJKbreakablekern ";
 285     }
 286 }
 287
 288 sub latin_parse{
 289     # arguments: none
 290     # return: string for output with TeX macro.
 291     my($char_id);
 292     my $out_str=$chars[$i];
 293     $i++;
 294     while($i<=$#chars){
 295         $char_id=unpack("U",$chars[$i]);
 296         if($char_id>0x20 and $char_id<=0x02af){
 297             $out_str.=pack("U",$char_id);
 298         }else{
 299             $i--;
 300             last;
 301         }
 302         $i++;
 303     }
 304     return '{\normalfont {'.$out_str.'}}';
 305 }
 306
 307 sub ids_parse{
 308     # arguments: none
 309     # return: character for output,
 310     #          TeX macro for ids,
 311     #          or GETA character if ids is invalid.
 312     my($ids,$ids_argc)=&ids_rest("",0,$chars[$i]);
 313
 314     while($ids_argc>0){
 315         # We are in IDS.
 316         $i++;
 317         if($i>$#chars){
 318             print STDERR "IDS parse error: $ids\n";
 319 #           return pack("U",0xfffd);
 320             return pack("U",0x3013);
 321         }
 322
 323         ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$chars[$i]);
 324         if($ids_argc==0){
 325             if(($char_id=&get_char_id_for_ids($ids))
 326                and($out_char=&get_output_char($char_id,$out_cs))){
 327                 return $out_char;
 328             }else{
 329                 return &get_macro_for_ids($ids);
 330 #               return encode('utf8', &get_macro_for_ids($ids)) if($perl58);
 331             }
 332         }
 333     }
 334 }
 335
 336 sub ids_rest{
 337     # arguments: <ids>, <rest number of arguments for ids>, <character>
 338     # return: ids and rest number of arguments for ids.
 339     my($ids,$ids_argc,$char)=@_;
 340     my($argc);
 341     $argc=&ids_argc($char);
 342     if($argc){
 343         $ids_argc+= $ids_argc==0 ? $argc : $argc-1;
 344     }else{
 345         $ids_argc--;
 346     }
 347     $ids.=$char;
 348 #    $ids.=encode('utf8',$char) if($perl58);
 349     return ($ids,$ids_argc);
 350 }
 351
 352 sub get_macro_for_ids{
 353     # argument: <ids>
 354     # return: TeX macro for ids
 355     #          or GETA character if ids is invalid for KAGE.
 356     my($ids)=@_;
 357     $ids=&normalize_ids($ids,"UniJIS");
 358 #    return pack("U",0xfffd) if(($ids!~/[$idc]/)
 359     return pack("U",0x3013) if(($ids!~/[$idc]/)
 360                                or($ids=~/[\x{10000}-]/));
 361                     #irregular for KAGE.
 362     if(not defined($ids{$ids}) and $ids{$ids}[1]>=0){
 363         $ids{$ids}[0]=$font_start;
 364         $ids{$ids}[1]=$ids_start;
 365         $ids_start++;
 366     }
 367     if($ids_start>255){
 368         $ids_start=0;
 369         $font_start++;
 370     }
 371     return "{\\fontencoding{OT1}\\fontfamily{"
 372         .sprintf("chise%03d",$ids{$ids}[0])
 373         ."}\\selectfont\\char$ids{$ids}[1]}";
 374 }
 375
 376 sub normalize_ids{
 377     # argument: <ids>, <output coding system>
 378     # return: ids or GETA character if ids is invalid for KAGE.
 379     my($ids,$out_cs)=@_;
 380     $out_cs=~s/Uni(.+)/'ucs@'.lc($1)/e;
 381
 382     my $output_ids="";
 383     my($char,$char_id,$output_char_id);
 384     while($ids=~m/(.)/g){
 385         $char=$1;
 386         $char_id=unpack("U",$char);
 387         if($char=~/[$idc]/){
 388             $output_ids.=$char;
 389         }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")){
 390             $output_ids.=pack("U",$output_char_id);
 391         }elsif($output_char_id=&get_char_attribute($char,"=ucs")){
 392             $output_ids.=pack("U",$output_char_id);
 393         }else{
 394 #           return pack("U",0xfffd);
 395             return pack("U",0x3013);
 396         }
 397     }
 398     return $output_ids;
 399 }
 400
 401 sub get_output_char{
 402     # argument: <char-id>
 403     # return: character in output coding system or TeX macro or undef.
 404     my($char_id,$out_cs)=@_;
 405     my($char,$out_char_id,$suffix);
 406     my($gt,$hzk,$cdp);
 407
 408     $char=pack('U',$char_id);
 409
 410     foreach $out_cs (@{$order{$out_cs}}){
 411         if($out_cs eq 'UniJIS'
 412            and &get_char_attribute($char,"vnd-adobe-cid-unijis-utf16-h")){
 413             if($out_char_id=&get_char_attribute($char,'=ucs@jis')
 414                or $out_char_id=&get_char_attribute($char,'=ucs')
 415                or $out_char_id=&get_char_attribute($char,'=>ucs@jis')
 416                or $out_char_id=&get_char_attribute($char,'=>ucs')
 417                ){
 418                 return '{\selectjisfont\char'.$out_char_id.'}';
 419             }
 420         }elsif($out_cs eq 'UniGB'
 421                and &get_char_attribute($char,"vnd-adobe-cid-unigb-ucs2-h")){
 422             if($out_char_id=&get_char_attribute($char,'=ucs@gb')
 423                or $out_char_id=&get_char_attribute($char,'=ucs')
 424                or $out_char_id=&get_char_attribute($char,'=>ucs@gb')
 425                or $out_char_id=&get_char_attribute($char,'=>ucs')
 426                ){
 427                 return '{\selectgbsfont\char'.$out_char_id.'}';
 428             }
 429         }elsif($out_cs eq 'UniCNS'
 430                and &get_char_attribute($char,"vnd-adobe-cid-unicns-ucs2-h")){
 431             if($out_char_id=&get_char_attribute($char,'=ucs@cns')
 432                or $out_char_id=&get_char_attribute($char,'=ucs')
 433                or $out_char_id=&get_char_attribute($char,'=>ucs@cns')
 434                or $out_char_id=&get_char_attribute($char,'=>ucs')
 435                ){
 436                 return '{\selectcnsfont\char'.$out_char_id.'}';
 437             }
 438         }elsif($out_cs eq 'UniKS'
 439                and &get_char_attribute($char,"vnd-adobe-cid-uniks-ucs2-h")){
 440             if($out_char_id=&get_char_attribute($char,'=ucs@ks')
 441                or $out_char_id=&get_char_attribute($char,'=ucs')
 442                or $out_char_id=&get_char_attribute($char,'=>ucs@ks')
 443                or $out_char_id=&get_char_attribute($char,'=>ucs')
 444                ){
 445                 return '{\selectksxfont\char'.$out_char_id.'}';
 446             }
 447         }elsif($out_cs eq 'GT'){
 448             return $gt if($gt=&get_macro_for_GT($char_id));
 449         }elsif($out_cs eq 'HZK'){
 450             return $hzk if($hzk=&get_macro_for_HZK($char_id));
 451         }elsif($out_cs eq 'CDP'){
 452             return $cdp if($cdp=&get_macro_for_CDP($char_id));
 453         }
 454     }
 455     return undef;
 456 }
 457
 458 sub get_ids{
 459     # argument: <character>
 460     # return: ids
 461     my($char)=@_;
 462     my $ids="";
 463     $ids=&get_char_attribute($char,"ids-aggregated")
 464         or $ids=&get_char_attribute($char,"ids");
 465 #         or $ids=&get_char_attribute($char,"ideographic-structure");
 466     utf8::decode($ids);
 467 #    $ids=~s/[? ()]//g;
 468     return $ids;
 469 }
 470
 471 sub get_char_id_for_ids{
 472     # argument: <ideographic description sequence>
 473     # return: char-id or undef.
 474     my($ids)=@_;
 475     my($output_char);
 476
 477     if(($output_char)=&get_chars_matching("ids",$ids)){
 478         utf8::decode($output_char);
 479         return unpack("U",$output_char);
 480     }else{
 481         return undef;
 482     }
 483 }
 484
 485 sub get_char_id{
 486     # argument: <char-id>, <input coding system>
 487     # return:   char-id.
 488     my($char_id,$in_cs)=@_;
 489     my($output_char);
 490
 491     return $char_id if($in_cs eq 'ucs@mcs');
 492
 493     if(($output_char)=&get_chars_matching("=$in_cs",$char_id)){
 494         utf8::decode($output_char);
 495         return unpack("U",$output_char);
 496     }else{
 497         return $char_id;
 498     }
 499 }
 500
 501 sub get_macro_for_GT{
 502     # argument: <char-id>
 503     # return: TeX macro for GT fonts or undef.
 504     my($char_id)=@_;
 505     my($char,$gt,$GT);
 506     $char=pack("U",$char_id);
 507     foreach (@GT){
 508         if($gt=&get_char_attribute($char,$_)){
 509             m/gt\-pj\-(\d+)/ and $GT=$1;
 510             last;
 511         }
 512     }
 513     if($gt){
 514         return "{\\fontencoding{OT1}\\fontfamily{"
 515             .sprintf("gt%02d",$GT)
 516             ."}\\selectfont\\char".($gt|0x8080)."}";
 517     }else{
 518         return undef;
 519     }
 520 }
 521
 522 sub get_macro_for_HZK{
 523     # argument: <char-id>
 524     # return: TeX macro for Hanziku fonts or undef.
 525     my($char_id)=@_;
 526     my($char,$hzk,$HZK);
 527     $char=pack("U",$char_id);
 528     foreach (@HZK){
 529         if($hzk=&get_char_attribute($char,$_)){
 530             m/hanziku\-(\d+)/ and $HZK=$1;
 531             last;
 532         }
 533     }
 534     if($hzk){
 535         return "{\\fontencoding{OT1}\\fontfamily{".sprintf("hzk%02d",$HZK)."}\\selectfont\\char".$hzk."}";
 536     }else{
 537         return undef;
 538     }
 539 }
 540
 541 sub get_macro_for_CDP{
 542     # argument: <char-id>
 543     # return: TeX macro for CDP fonts or undef.
 544     my($char_id)=@_;
 545     my($char,$cdp,$ucs);
 546     $char=pack("U",$char_id);
 547     foreach (@CDP){
 548         if($cdp=&get_char_attribute($char,$_)){
 549             last;
 550         }
 551     }
 552     if($cdp){
 553         $ucs=&get_char_attribute(utf8::decode(&get_chars_matching("=big5-pua",$cdp)),"=ucs");
 554         if($ucs){
 555             return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char"
 556                 .$ucs.
 557                     "}";
 558         }else{
 559             print STDERR "This hould not happen.\n";
 560             print STDERR "ucs code point of CDP: $cdp not found.\n";
 561         }
 562     }else{
 563         return undef;
 564     }
 565 }