outCMAP

   1 #!/usr/bin/perl
   2
   3 use strict;
   4 use vars qw($opt_in_cs $opt_out_cs $opt_help $usage
   5             $in_cs $out_cs
   6             $i @chars
   7             $char $char_id $out_char $omegadb_home
   8             $ids $ids_argc %ids $idsdb
   9             $idsdata_file $ids_start $font_start
  10             %utf8mcs_map_from
  11             %cmap_to
  12             $inotp $perl56 $perl58
  13             $useCDP $useHZK $useGT
  14             @CDP @HZK @GT
  15             );
  16 use Getopt::Long;
  17 use utf8;
  18 use Chise_utils ':all';
  19
  20 my $strictly_forbidden_after = "
  21       \x{0028} |
  22       \x{005B} |
  23       \x{007B} |
  24       \x{2018} |
  25       \x{201C} |
  26       \x{3008} |
  27       \x{300A} |
  28       \x{300C} |
  29       \x{300E} |
  30       \x{3010} |
  31       \x{3014} |
  32       \x{3016} |
  33       \x{FF08} |
  34       \x{FF08} |
  35       \x{FF3B} |
  36       \x{FF5B} |
  37       \x{FF5B} |
  38       \x{FF62}
  39 ";
  40
  41 my $forbidden_after = "\x{0000}";
  42
  43 # ￥¥＄$〒♯＃#￠¢￡£＠@§
  44 my $slightly_forbidden_after = "
  45       \x{FFE5} |
  46       \x{00A5} |
  47       \x{FF04} |
  48       \x{0024} |
  49       \x{3012} |
  50       \x{266F} |
  51       \x{FF03} |
  52       \x{0023} |
  53       \x{FFE0} |
  54       \x{00A2} |
  55       \x{FFE1} |
  56       \x{00A3} |
  57       \x{FF20} |
  58       \x{0040} |
  59       \x{00A7}
  60 ";
  61
  62 # All these characters are allowed to protrude
  63 # in the right margin
  64 my $strictly_forbidden_before = "
  65       \x{0021} | # !
  66       \x{002c} | # ,
  67       \x{002e} | # .
  68       \x{003a} | # :
  69       \x{003b} | # ;
  70       \x{003f} | # ?
  71       \x{3001} | # 、
  72       \x{3002} | # 。
  73       \x{ff01} | # ！
  74       \x{ff0c} | # ，
  75       \x{ff0e} | # ．
  76       \x{ff1a} | # ：
  77       \x{ff1b} | # ；
  78       \x{ff1f} | # ？
  79       \x{ff61} | # 。
  80       \x{0029} | # )
  81 #      \x{005d} | # ]
  82       \x{007d} | # }
  83       \x{2019} | # ’
  84       \x{201d} | # ”
  85       \x{3009} | # 〉
  86       \x{300b} | # 》
  87       \x{300d} | # 」
  88       \x{300f} | # 』
  89       \x{3011} | # 】
  90       \x{3015} | # 〕
  91       \x{3017} | # white 】
  92       \x{ff09} | # ）
  93       \x{ff3d} | # ］
  94       \x{ff5d} | # ｝
  95       \x{ff5d} | # ｝
  96       \x{ff63}   # 」
  97 ";
  98
  99 my $forbidden_before = "
 100       \x{30fc} | # ー
 101       \x{3005} | # 々
 102       \x{3041} | # ぁ
 103       \x{3043} | # ぃ
 104       \x{3045} | # ぅ
 105       \x{3047} | # ぇ
 106       \x{3049} | # ぉ
 107       \x{3083} | # ゃ
 108       \x{3085} | # ゅ
 109       \x{3087} | # ょ
 110       \x{3063} | # っ
 111       \x{308e} | # ゎ
 112       \x{30a1} | # ァ
 113       \x{30a3} | # ィ
 114       \x{30a5} | # ゥ
 115       \x{30a7} | # ェ
 116       \x{30a9} | # ォ
 117       \x{30e3} | # ャ
 118       \x{30e5} | # ュ
 119       \x{30e7} | # ョ
 120       \x{30c3} | # ッ
 121       \x{30ee} | # ヮ
 122       \x{30f5} | # ヵ
 123       \x{30f6}   # ヶ
 124 ";
 125
 126 #      \-       | # -
 127
 128 my $slightly_forbidden_before = "
 129       \x{000a} | # ???
 130       \#       | # #
 131       \x{2010} | # ‐
 132       \x{2012} | # −
 133       \x{2030} | # ‰
 134       \x{2032} | # ′
 135       \x{2033} | # ″
 136       \x{2103} | # ℃
 137       \x{309b} | # ゛
 138       \x{309c} | # ゜
 139       \x{309d} | # ゝ
 140       \x{309e} | # ゞ
 141       \x{30fd} | # ヽ
 142       \x{30fe} | # ヾ
 143       \x{ff02} | # ＂
 144       \x{ff05} | # ％
 145       \x{ff0d} | # －
 146       \x{ff9e} | # ﾞ
 147       \x{ff9f}   # ﾟ
 148 ";
 149
 150 my $asian = "\x{1100}-\x{11FF} | \x{2E80}-\x{D7AF} |
 151              \x{F900}-\x{FAFF} | \x{FE30}-\x{FE4F} |
 152              \x{FF00}-";
 153
 154 my $space = "\x{0020} | \x{0009} | \x{000A} | \x{000C} | \x{000D} ";
 155
 156
 157 $useGT=1;
 158 $useHZK=0;
 159 $useCDP=0;
 160
 161 if($^V and $^V ge v5.8){
 162     $perl58=1;
 163 }elsif($^V and $^V ge v5.6){
 164     $perl56=1;
 165 }else{
 166     print STDERR "This versin is not supported.";
 167 }
 168 if($perl58){
 169     eval "use Encode";
 170     binmode(STDIN, ':encoding(utf8)');
 171     binmode(STDOUT, ':encoding(utf8)');
 172 }
 173
 174 $omegadb_home="/Users/izumi/.chise";
 175
 176 &GetOptions("in=s"=>\$opt_in_cs,
 177             "i=s"=>\$opt_in_cs,
 178             "out=s"=>\$opt_out_cs,
 179             "o=s"=>\$opt_out_cs,
 180             "help",\$opt_help,
 181             "h",\$opt_help);
 182
 183 $usage=<<EOF;
 184 Usage: $0 -i <input coding system> -o <cmap encoding>
 185     input coding system:
 186       Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
 187     cmap encoding:
 188       UniCNS, UniGB, UniJIS, UniKS
 189 EOF
 190
 191 if($opt_in_cs or $opt_out_cs){
 192     $in_cs=$opt_in_cs;
 193     $out_cs=$opt_out_cs;
 194 }elsif(@ARGV==0){
 195     ($in_cs,$out_cs)=($0=~/(Utf8.+)To(\w+)/);
 196     $inotp=1;
 197 }
 198
 199 # $in_cs:
 200 #   utf-8-mcs,utf-8-cns,utf-8-gb,utf-8-jis,utf-8-ks,
 201 # $out_cs:
 202 #   UniCNS,UniGB,UniJIS,UniKS
 203
 204 if($opt_help
 205    or not defined($in_cs)
 206    or not defined($out_cs)){
 207     print $usage;
 208     exit 1;
 209 }
 210
 211 $idsdata_file="idsdata.pl";
 212 $ids_start=0x00;
 213 $font_start=0;
 214
 215 if(-e $idsdata_file){
 216     require $idsdata_file;
 217 }
 218
 219 $ids_argc=0;
 220 $ids="";
 221
 222 @GT=(#"=gt","=gt-k",
 223      "=gt-pj-1","=gt-pj-2","=gt-pj-3","=gt-pj-4","=gt-pj-5","=gt-pj-6","=gt-pj-7","=gt-pj-8","=gt-pj-9","=gt-pj-10","=gt-pj-11"
 224      #,"=gt-pj-k1","=gt-pj-k2"
 225      );
 226 @HZK=("=hanziku-1","=hanziku-10","=hanziku-11","=hanziku-12","=hanziku-2","=hanziku-3","=hanziku-4","=hanziku-5","=hanziku-6","=hanziku-7","=hanziku-8","=hanziku-9");
 227 @CDP=("=big5-cdp");
 228
 229 while(<>){
 230     # temporary fix for using in OTP for perl 5.6.
 231     s/(.)/pack("c",unpack("U",$1))/ge if($inotp
 232                                          and $in_cs=~/utf8/i
 233                                          and $perl56);
 234     # for perl 5.8.
 235     $_=decode('utf8', $_)  if ($inotp and $in_cs=~/utf8/i
 236                                and $perl58);
 237     s/(amp.+?;)/&tex_de_er($1)/ge;
 238 #    s/(&.+?;)/&tex_de_er($1)/ge;
 239 #    s/^(.*)$/&add_break($1)/e;
 240     @chars=split(//);
 241     for($i=0;$i<=$#chars;$i++){
 242 #    while(m/(.)/g){
 243         $char=&get_char_in_utf8mcs($chars[$i],$in_cs);
 244         $char_id=unpack("U",$char);
 245         if($ids_argc>0){
 246             # It's in IDS.
 247             ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$char);
 248             if($ids_argc==0){
 249                 if(($char_id=&get_char_id_for_ids($ids))
 250                    and(($out_char=&get_output_char($char_id,$out_cs)))){
 251                     print $out_char,&add_break($i);
 252                 }else{
 253                     print &replace_ids($ids),&add_break($i) if($perl56);
 254                     print encode('utf8', &replace_ids($ids)),&add_break($i) if($perl58);
 255                 }
 256                 $ids="";
 257             }
 258         }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
 259             ($ids,$ids_argc)=&ids_rest("",0,$char);
 260             next;
 261         }else{
 262             if($char_id<=0xff){
 263                 print $char;
 264                 next;
 265             }
 266             if(($out_char=&get_output_char($char_id,$out_cs))){
 267                 print $out_char,&add_break($i);
 268             }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
 269                 unless(defined($ids{$char}) and $ids{$char}[1]>=0){
 270                     $ids{$char}[0]=$font_start;
 271                     $ids{$char}[1]=$ids_start;
 272                     $ids_start++;
 273                     if($ids_start>255){
 274                         $ids_start=0;
 275                         $font_start++;
 276                     }
 277                 }
 278                 print "{\\fontencoding{OT1}\\fontfamily{" .
 279                     sprintf("chise%03d",$ids{$char}[0]) .
 280                     "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
 281                 next;
 282             }else{
 283                 print &replace_ids(&get_ids($char)),&add_break($i);
 284             }
 285         }
 286     }
 287     if($ids_argc>0){
 288         print STDERR "IDS parse error: $ids\n";
 289 #       print pack("U",0xfffd);
 290         print pack("U",0x3013) if($perl56);
 291         print encode('utf8',pack("U",0x3013)) if($perl58);
 292         $ids="";
 293         $ids_argc=0;
 294     }
 295 }
 296
 297 open(IDSDATA,">$idsdata_file") or die;
 298 print IDSDATA 'use utf8;',"\n";
 299 foreach $ids (keys %ids){
 300     print IDSDATA '$ids{\'',$ids,'\'}=[',join ",",@{$ids{$ids}},"];\n" if($perl56);
 301     print IDSDATA '$ids{\'',encode('utf8',$ids),'\'}=[',join ",",@{$ids{$ids}},"];\n" if($perl58);
 302 }
 303 print IDSDATA '$font_start=',$font_start,";\n";
 304 print IDSDATA '$ids_start=',$ids_start,";\n";
 305 print IDSDATA "1;";
 306
 307 exit 0;
 308
 309 sub tex_de_er{
 310     my($er)=@_;
 311     my($out);
 312     $er=~s/^amp(.*);$/$1/;
 313 #    $er=~s/^&(.*);$/$1/;
 314     $out=&de_er($er);
 315     if($out){
 316         return $out;
 317     }else{
 318         return "amp$er;";
 319     }
 320 }
 321
 322 sub add_break{
 323     my($i)=@_;
 324
 325 #     $line=~s/(.)($strictly_forbidden_before)($not_strictly_forbidden_before)/
 326 #       $1."\\CJKunbreakablekernone \\CJKprotrude ".$2.$3/egx;
 327 #     $line=~s/(.)($strictly_forbidden_before)($strictly_forbidden_before)/
 328 #       $1."\\CJKunbreakablekernone ".$2.$3/egx;
 329
 330 #     $line=~s/(.)($strictly_forbidden_before)/
 331 #       $1."\\CJKunbreakablekernone ".$2/egx;
 332 #     $line=~s/(.)($forbidden_before)/
 333 #       $1."\\CJKunbreakablekerntwo ".$2/egx;
 334 #     $line=~s/(.)($slightly_forbidden_before)/
 335 #       $1."\\CJKunbreakablekernthree ".$2/egx;
 336
 337 #     $line=~s/($forbidden_after)(.)/
 338 #       $1."\\CJKunbreakablekerntwo ".$2/egx;
 339 #     $line=~s/($strictly_forbidden_after)(. )/
 340 #       $1."\\CJKunbreakablekernone ".$2/egx;
 341 #     $line=~s/($slightly_forbidden_after)(.)/
 342 #       $1."\\CJKunbreakablekernthree ".$2/egx;
 343
 344 #     $line=~s/($asian)(.)/$1\\CJKbreakablekern $2/g;
 345 #     $line=~s/(.)($asian)/$1\\CJKbreakablekern $2/g;
 346
 347     if($i<$#chars){
 348         if($i<($#chars-1)){
 349             if(($chars[$i+1]=~m/[$strictly_forbidden_before]/x)
 350                and($chars[$i+2]=~m/[^$strictly_forbidden_before]/x)){
 351                 return "\\CJKunbreakablekernone \\CJKprotrude ";
 352             }elsif(($chars[$i+1]=~m/[$strictly_forbidden_before]/x)
 353                and($chars[$i+2]=~m/[$strictly_forbidden_before]/x)){
 354                 return "\\CJKunbreakablekernone ";
 355             }
 356         }else{
 357             if($chars[$i+1]=~m/[$strictly_forbidden_before]/x){
 358                 return "\\CJKunbreakablekernone ";
 359             }elsif($chars[$i+1]=~m/[$forbidden_before]/x){
 360                 return "\\CJKunbreakablekerntwo ";
 361             }elsif($chars[$i+1]=~m/[$slightly_forbidden_before]/x){
 362                 return "\\CJKunbreakablekernthree ";
 363             }
 364         }
 365     }elsif($i>0){
 366         if($chars[$i]=~m/[$forbidden_after]/x){
 367             return "\\CJKunbreakablekerntwo ";
 368         }elsif($chars[$i]=~m/[$strictly_forbidden_after]/x){
 369             return "\\CJKunbreakablekernone ";
 370         }elsif($chars[$i]=~m/[$slightly_forbidden_after]/x){
 371             return "\\CJKunbreakablekernthree ";
 372         }
 373     }
 374     return "\\CJKbreakablekern ";
 375 }
 376
 377 sub ids_rest{
 378     my($ids,$ids_argc,$char)=@_;
 379     my($argc);
 380     $argc=&ids_argc($char);
 381     if($argc){
 382         $ids_argc+= $ids_argc==0 ? $argc : $argc-1;
 383     }else{
 384         $ids_argc--;
 385     }
 386     $ids.=$char if($perl56);
 387     $ids.=encode('utf8',$char) if($perl58);
 388     return ($ids,$ids_argc);
 389 }
 390
 391 sub replace_ids{
 392     my($ids)=@_;
 393     $ids=&normalize_ids($ids,"UniJIS");
 394 #    return pack("U",0xfffd) if($ids!~/[$idc]/);
 395     return pack("U",0x3013) if(($ids!~/[$idc]/)
 396                                or($ids=~/[\x{10000}-]/));
 397                     #irregular for KAGE.
 398     unless(defined($ids{$ids}) and $ids{$ids}[1]>=0){
 399         $ids{$ids}[0]=$font_start;
 400         $ids{$ids}[1]=$ids_start;
 401         $ids_start++;
 402     }
 403     if($ids_start>255){
 404         $ids_start=0;
 405         $font_start++;
 406     }
 407     return "{\\fontencoding{OT1}\\fontfamily{".sprintf("chise%03d",$ids{$ids}[0])."}\\selectfont\\char$ids{$ids}[1]}";
 408 }
 409
 410 sub normalize_ids{
 411     my($ids,$out_cs)=@_;
 412     $ids = decode('utf8', $ids) if $perl58;
 413     $out_cs=~s/Uni(.+)/"ucs-".lc($1)/e;
 414     my $output_ids="";
 415     my($char,$char_id,$output_char_id);
 416     while($ids=~m/(.)/g){
 417         $char=$1;
 418         $char_id=unpack("U",$char);
 419         if($char=~/[$idc]/){
 420             $output_ids.=$char;
 421         }elsif($output_char_id=&get_char_attribute($char,$out_cs)){
 422             $output_ids.=pack("U",$output_char_id);
 423         }elsif($output_char_id=&get_char_attribute($char,"=ucs")){
 424             $output_ids.=pack("U",$output_char_id);
 425         }elsif($output_char_id=&get_char_attribute($char,"ucs")){
 426             $output_ids.=pack("U",$output_char_id);
 427         }else{
 428             return pack("U",0xfffe);
 429         }
 430     }
 431     return $output_ids;
 432 }
 433
 434 sub get_output_char{
 435     my($char_id,$out_cs)=@_;
 436     my($out_char_id,$suffix);
 437     my($gt,$hzk,$cdp);
 438     if(not defined($cmap_to{$out_cs})){
 439         &get_cmap($out_cs);
 440     }
 441     if($out_char_id=$cmap_to{$out_cs}->{$char_id}){
 442         return pack("U",$out_char_id);
 443     }else{
 444         return $gt if($useGT and $gt=&get_macro_for_GT($char_id));
 445         return $hzk if($useHZK and $hzk=&get_macro_for_HZK($char_id));
 446         return $cdp if($useCDP and $cdp=&get_macro_for_CDP($char_id));
 447         return undef;
 448     }
 449 }
 450
 451 sub get_cmap{
 452     my($out_cs)=@_;
 453     tie %{$cmap_to{$out_cs}}, "BerkeleyDB::Hash",
 454     -Filename => "$omegadb_home/$out_cs" or die $!;
 455 }
 456
 457 sub get_ids{
 458     my($char)=@_;
 459     my $ids="";
 460     $ids=&get_char_attribute($char,"ids-aggregated")
 461         or &get_char_attribute($char,"ids");
 462 #       or &get_char_attribute($char,"ideographic-structure");
 463     $ids=decode('utf8', $ids) if($perl58);
 464 #    $ids=~s/[? ()]//g;
 465     return $ids;
 466 }
 467
 468 sub get_char_id_for_ids{
 469     my($ids)=@_;
 470     my($char_id,$char);
 471     $ids=decode('utf8', $ids) if($perl58);
 472 #    $ids="(?".(join " ?",(split(//,$ids))).")";
 473     &get_idsdb if(not defined($idsdb));
 474     $char=$idsdb->{$ids};
 475     $char=decode('utf8',$char) if($perl58);
 476     if($char){
 477         return unpack("U",$char);
 478     }else{
 479         return undef;
 480     }
 481 }
 482
 483 sub get_idsdb{
 484     tie %{$idsdb}, "BerkeleyDB::Hash",
 485     -Filename => "$omegadb_home/idsdb" or die $!;
 486 }
 487
 488 sub get_char_in_utf8mcs_bak{
 489     my($char,$in_cs)=@_;
 490     return $char if($in_cs eq "Utf8mcs");
 491     my($char_id,$output_char);
 492     $in_cs=~s/Utf8/ucs-/;
 493     $char_id=unpack("U",$char);
 494     if(($output_char)=&get_chars_matching("$in_cs",$char_id)){
 495         $output_char=decode('utf8', $output_char) if($perl58);
 496         return $output_char;
 497     }else{
 498         return $char;
 499     }
 500 }
 501
 502 sub get_char_in_utf8mcs{
 503     # argument: <character>, <input coding system>
 504     # return:   character in UTF-8mcs.
 505     my($char,$in_cs)=@_;
 506     my($char_id,$output_char_id);
 507     return $char if($in_cs eq "Utf8mcs");
 508     $char_id=unpack("U",$char);
 509     &get_utf8mcs_map($in_cs) if(not defined($utf8mcs_map_from{$in_cs}));
 510     if($output_char_id=$utf8mcs_map_from{$in_cs}->{$char_id}){
 511         return pack("U",$output_char_id);
 512    }else{
 513         return $char;
 514    }
 515 }
 516
 517 sub get_utf8mcs_map{
 518     my($in_cs)=@_;
 519     my($suffix);
 520     ($suffix=$in_cs)=~s/^Utf8//;
 521     tie %{$utf8mcs_map_from{$in_cs}}, "BerkeleyDB::Hash",
 522     -Filename => "$omegadb_home/ucs-$suffix" or die $!;
 523 }
 524
 525 sub get_macro_for_GT{
 526     my($char_id)=@_;
 527     my($char,$gt,$GT);
 528     $char=pack("U",$char_id);
 529     foreach (@GT){
 530         if($gt=&get_char_attribute($char,$_)){
 531             m/gt\-pj\-(\d+)/ and $GT=$1;
 532             last;
 533         }
 534     }
 535     if($gt){
 536         return "{\\fontencoding{OT1}\\fontfamily{".sprintf("gt%02d",$GT)."}\\selectfont\\char".($gt|0x8080)."}";
 537 #       return "\\GT{".sprintf("gt%02d",$GT)."}{\\char".($gt|0x8080)."}";
 538     }else{
 539         return undef;
 540     }
 541 }
 542
 543 sub get_macro_for_HZK{
 544     my($char_id)=@_;
 545     my($char,$hzk,$HZK);
 546     $char=pack("U",$char_id);
 547     foreach (@HZK){
 548         if($hzk=&get_char_attribute($char,$_)){
 549             m/hanziku\-(\d+)/ and $HZK=$1;
 550             last;
 551         }
 552     }
 553     if($hzk){
 554         return "{\\fontencoding{OT1}\\fontfamily{".sprintf("hzk%02d",$HZK)."}\\selectfont\\char".($hzk|0x8080)."}";
 555     }else{
 556         return undef;
 557     }
 558 }
 559
 560 sub get_macro_for_CDP{
 561     my($char_id)=@_;
 562     my($char,$cdp);
 563     $char=pack("U",$char_id);
 564     foreach (@CDP){
 565         if($cdp=&get_char_attribute($char,$_)){
 566             last;
 567         }
 568     }
 569     if($cdp){
 570         return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char".($cdp|0x8080)."}";
 571     }else{
 572         return undef;
 573     }
 574 }