outCMAP

   1 #!/usr/bin/perl
   2
   3 use strict;
   4 use vars qw($opt_in_cs $opt_out_cs $opt_help $usage
   5             $in_cs $out_cs
   6             $char $char_id $out_char $omegadb_home
   7             $ids $ids_argc %ids $idsdb
   8             $idsdata_file $ids_start $font_start
   9             %utf8mcs_map_from
  10             %cmap_to
  11             $inotp $perl56 $perl58
  12             $useCDP $useHZK $useGT
  13             @CDP @HZK @GT
  14             );
  15 use Getopt::Long;
  16 use utf8;
  17 use Chise_utils ':all';
  18
  19 $useGT=1;
  20 $useHZK=0;
  21 $useCDP=0;
  22
  23 if($^V and $^V ge v5.8){
  24     $perl58=1;
  25 }elsif($^V and $^V ge v5.6){
  26     $perl56=1;
  27 }else{
  28     print STDERR "This versin is not supported.";
  29 }
  30 if($perl58){
  31     eval "use Encode";
  32     binmode(STDIN, ':encoding(utf8)');
  33     binmode(STDOUT, ':encoding(utf8)');
  34 }
  35
  36 $omegadb_home="$HOME/.chise";
  37
  38 &GetOptions("in=s"=>\$opt_in_cs,
  39             "i=s"=>\$opt_in_cs,
  40             "out=s"=>\$opt_out_cs,
  41             "o=s"=>\$opt_out_cs,
  42             "help",\$opt_help,
  43             "h",\$opt_help);
  44
  45 $usage=<<EOF;
  46 Usage: $0 -i <input coding system> -o <cmap encoding>
  47     input coding system:
  48       Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
  49     cmap encoding:
  50       UniCNS, UniGB, UniJIS, UniKS
  51 EOF
  52
  53 if($opt_in_cs or $opt_out_cs){
  54     $in_cs=$opt_in_cs;
  55     $out_cs=$opt_out_cs;
  56 }elsif(@ARGV==0){
  57     ($in_cs,$out_cs)=($0=~/(Utf8.+)To(\w+)/);
  58     $inotp=1;
  59 }
  60
  61 # $in_cs:
  62 #   utf-8-mcs,utf-8-cns,utf-8-gb,utf-8-jis,utf-8-ks,
  63 # $out_cs:
  64 #   UniCNS,UniGB,UniJIS,UniKS
  65
  66 if($opt_help
  67    or not defined($in_cs)
  68    or not defined($out_cs)){
  69     print $usage;
  70     exit 1;
  71 }
  72
  73 $idsdata_file="idsdata.pl";
  74 $ids_start=0x00;
  75 $font_start=0;
  76
  77 if(-e $idsdata_file){
  78     require $idsdata_file;
  79 }
  80
  81 $ids_argc=0;
  82 $ids="";
  83
  84 @GT=(#"=gt","=gt-k",
  85      "=gt-pj-1","=gt-pj-2","=gt-pj-3","=gt-pj-4","=gt-pj-5","=gt-pj-6","=gt-pj-7","=gt-pj-8","=gt-pj-9","=gt-pj-10","=gt-pj-11"
  86      #,"=gt-pj-k1","=gt-pj-k2"
  87      );
  88 @HZK=("=hanziku-1","=hanziku-10","=hanziku-11","=hanziku-12","=hanziku-2","=hanziku-3","=hanziku-4","=hanziku-5","=hanziku-6","=hanziku-7","=hanziku-8","=hanziku-9");
  89 @CDP=("=big5-cdp");
  90
  91 while(<>){
  92     # temporary fix for using in OTP for perl 5.6.
  93     s/(.)/pack("c",unpack("U",$1))/ge if($inotp
  94                                          and $in_cs=~/utf8/i
  95                                          and $perl56);
  96     # for perl 5.8.
  97     $_=decode('utf8', $_)  if ($inotp and $in_cs=~/utf8/i
  98                                and $perl58);
  99     s/(amp.+?;)/&tex_de_er($1)/ge;
 100 #    s/(&.+?;)/&tex_de_er($1)/ge;
 101     while(m/(.)/g){
 102         $char=&get_char_in_utf8mcs($1,$in_cs);
 103         $char_id=unpack("U",$char);
 104         if($ids_argc>0){
 105             ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$char);
 106             if($ids_argc==0){
 107                 if(($char_id=&get_char_id_for_ids($ids))
 108                    and(($out_char=&get_output_char($char_id,$out_cs)))){
 109                     print $out_char;
 110                 }else{
 111                     print &replace_ids($ids) if($perl56);
 112                     print encode('utf8', &replace_ids($ids)) if($perl58);
 113                 }
 114                 $ids="";
 115             }
 116         }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
 117             ($ids,$ids_argc)=&ids_rest("",0,$char);
 118             next;
 119         }else{
 120             if($char_id<=0xff){
 121                 print $char;
 122                 next;
 123             }
 124             if(($out_char=&get_output_char($char_id,$out_cs))){
 125                 print $out_char;
 126             }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
 127                 unless(defined($ids{$char}) and $ids{$char}[1]>=0){
 128                     $ids{$char}[0]=$font_start;
 129                     $ids{$char}[1]=$ids_start;
 130                     $ids_start++;
 131                     if($ids_start>255){
 132                         $ids_start=0;
 133                         $font_start++;
 134                     }
 135                 }
 136                 print "{\\fontencoding{OT1}\\fontfamily{" .
 137                     sprintf("chise%03d",$ids{$char}[0]) .
 138                     "}\\selectfont\\char$ids{$char}[1]}";
 139                 next;
 140             }else{
 141                 print &replace_ids(&get_ids($char));
 142             }
 143         }
 144     }
 145     if($ids_argc>0){
 146         print STDERR "IDS parse error: $ids\n";
 147 #       print pack("U",0xfffd);
 148         print pack("U",0x3013) if($perl56);
 149         print encode('utf8',pack("U",0x3013)) if($perl58);
 150         $ids="";
 151         $ids_argc=0;
 152     }
 153 }
 154
 155 open(IDSDATA,">$idsdata_file") or die;
 156 print IDSDATA 'use utf8;',"\n";
 157 foreach $ids (keys %ids){
 158     print IDSDATA '$ids{\'',$ids,'\'}=[',join ",",@{$ids{$ids}},"];\n" if($perl56);
 159     print IDSDATA '$ids{\'',encode('utf8',$ids),'\'}=[',join ",",@{$ids{$ids}},"];\n" if($perl58);
 160 }
 161 print IDSDATA '$font_start=',$font_start,";\n";
 162 print IDSDATA '$ids_start=',$ids_start,";\n";
 163 print IDSDATA "1;";
 164
 165 exit 0;
 166
 167 sub tex_de_er{
 168     my($er)=@_;
 169     my($out);
 170     $er=~s/^amp(.*);$/$1/;
 171 #    $er=~s/^&(.*);$/$1/;
 172     $out=&de_er($er);
 173     if($out){
 174         return $out;
 175     }else{
 176         return "amp$er;";
 177     }
 178 }
 179
 180 sub ids_rest{
 181     my($ids,$ids_argc,$char)=@_;
 182     my($argc);
 183     $argc=&ids_argc($char);
 184     if($argc){
 185         $ids_argc+= $ids_argc==0 ? $argc : $argc-1;
 186     }else{
 187         $ids_argc--;
 188     }
 189     $ids.=$char if($perl56);
 190     $ids.=encode('utf8',$char) if($perl58);
 191     return ($ids,$ids_argc);
 192 }
 193
 194 sub replace_ids{
 195     my($ids)=@_;
 196     $ids=&normalize_ids($ids,"UniJIS");
 197 #    return pack("U",0xfffd) if($ids!~/[$idc]/);
 198     return pack("U",0x3013) if(($ids!~/[$idc]/)
 199                                or($ids=~/[\x{10000}-]/));
 200                     #irregular for KAGE.
 201     unless(defined($ids{$ids}) and $ids{$ids}[1]>=0){
 202         $ids{$ids}[0]=$font_start;
 203         $ids{$ids}[1]=$ids_start;
 204         $ids_start++;
 205     }
 206     if($ids_start>255){
 207         $ids_start=0;
 208         $font_start++;
 209     }
 210     return "{\\fontencoding{OT1}\\fontfamily{".sprintf("chise%03d",$ids{$ids}[0])."}\\selectfont\\char$ids{$ids}[1]}";
 211 }
 212
 213 sub normalize_ids{
 214     my($ids,$out_cs)=@_;
 215     $ids = decode('utf8', $ids) if $perl58;
 216     $out_cs=~s/Uni(.+)/"ucs-".lc($1)/e;
 217     my $output_ids="";
 218     my($char,$char_id,$output_char_id);
 219     while($ids=~m/(.)/g){
 220         $char=$1;
 221         $char_id=unpack("U",$char);
 222         if($char=~/[$idc]/){
 223             $output_ids.=$char;
 224         }elsif($output_char_id=&get_char_attribute($char,$out_cs)){
 225             $output_ids.=pack("U",$output_char_id);
 226         }elsif($output_char_id=&get_char_attribute($char,"=ucs")){
 227             $output_ids.=pack("U",$output_char_id);
 228         }elsif($output_char_id=&get_char_attribute($char,"ucs")){
 229             $output_ids.=pack("U",$output_char_id);
 230         }else{
 231             return pack("U",0xfffe);
 232         }
 233     }
 234     return $output_ids;
 235 }
 236
 237 sub get_output_char{
 238     my($char_id,$out_cs)=@_;
 239     my($out_char_id,$suffix);
 240     my($gt,$hzk,$cdp);
 241     if(not defined($cmap_to{$out_cs})){
 242         &get_cmap($out_cs);
 243     }
 244     if($out_char_id=$cmap_to{$out_cs}->{$char_id}){
 245         return pack("U",$out_char_id);
 246     }else{
 247         return $gt if($useGT and $gt=&get_macro_for_GT($char_id));
 248         return $hzk if($useHZK and $hzk=&get_macro_for_HZK($char_id));
 249         return $cdp if($useCDP and $cdp=&get_macro_for_CDP($char_id));
 250         return undef;
 251     }
 252 }
 253
 254 sub get_cmap{
 255     my($out_cs)=@_;
 256     tie %{$cmap_to{$out_cs}}, "BerkeleyDB::Hash",
 257     -Filename => "$omegadb_home/$out_cs" or die $!;
 258 }
 259
 260 sub get_ids{
 261     my($char)=@_;
 262     my $ids="";
 263     $ids=&get_char_attribute($char,"ids-aggregated")
 264         or &get_char_attribute($char,"ids");
 265 #       or &get_char_attribute($char,"ideographic-structure");
 266     $ids=decode('utf8', $ids) if($perl58);
 267 #    $ids=~s/[? ()]//g;
 268     return $ids;
 269 }
 270
 271 sub get_char_id_for_ids{
 272     my($ids)=@_;
 273     my($char_id,$char);
 274     $ids=decode('utf8', $ids) if($perl58);
 275 #    $ids="(?".(join " ?",(split(//,$ids))).")";
 276     &get_idsdb if(not defined($idsdb));
 277     $char=$idsdb->{$ids};
 278     $char=decode('utf8',$char) if($perl58);
 279     if($char){
 280         return unpack("U",$char);
 281     }else{
 282         return undef;
 283     }
 284 }
 285
 286 sub get_idsdb{
 287     tie %{$idsdb}, "BerkeleyDB::Hash",
 288     -Filename => "$omegadb_home/idsdb" or die $!;
 289 }
 290
 291 sub get_char_in_utf8mcs_bak{
 292     my($char,$in_cs)=@_;
 293     return $char if($in_cs eq "Utf8mcs");
 294     my($char_id,$output_char);
 295     $in_cs=~s/Utf8/ucs-/;
 296     $char_id=unpack("U",$char);
 297     if(($output_char)=&get_chars_matching("$in_cs",$char_id)){
 298         $output_char=decode('utf8', $output_char) if($perl58);
 299         return $output_char;
 300     }else{
 301         return $char;
 302     }
 303 }
 304
 305 sub get_char_in_utf8mcs{
 306     # argument: <character>, <input coding system>
 307     # return:   character in UTF-8mcs.
 308     my($char,$in_cs)=@_;
 309     my($char_id,$output_char_id);
 310     return $char if($in_cs eq "Utf8mcs");
 311     $char_id=unpack("U",$char);
 312     &get_utf8mcs_map($in_cs) if(not defined($utf8mcs_map_from{$in_cs}));
 313     if($output_char_id=$utf8mcs_map_from{$in_cs}->{$char_id}){
 314         return pack("U",$output_char_id);
 315    }else{
 316         return $char;
 317    }
 318 }
 319
 320 sub get_utf8mcs_map{
 321     my($in_cs)=@_;
 322     my($suffix);
 323     ($suffix=$in_cs)=~s/^Utf8//;
 324     tie %{$utf8mcs_map_from{$in_cs}}, "BerkeleyDB::Hash",
 325     -Filename => "$omegadb_home/ucs-$suffix" or die $!;
 326 }
 327
 328 sub get_macro_for_GT{
 329     my($char_id)=@_;
 330     my($char,$gt,$GT);
 331     $char=pack("U",$char_id);
 332     foreach (@GT){
 333         if($gt=&get_char_attribute($char,$_)){
 334             m/gt\-pj\-(\d+)/ and $GT=$1;
 335             last;
 336         }
 337     }
 338     if($gt){
 339         return "{\\fontencoding{OT1}\\fontfamily{".sprintf("gt%02d",$GT)."}\\selectfont\\char".($gt|0x8080)."}";
 340     }else{
 341         return undef;
 342     }
 343 }
 344
 345 sub get_macro_for_HZK{
 346     my($char_id)=@_;
 347     my($char,$hzk,$HZK);
 348     $char=pack("U",$char_id);
 349     foreach (@HZK){
 350         if($hzk=&get_char_attribute($char,$_)){
 351             m/hanziku\-(\d+)/ and $HZK=$1;
 352             last;
 353         }
 354     }
 355     if($hzk){
 356         return "{\\fontencoding{OT1}\\fontfamily{".sprintf("hzk%02d",$HZK)."}\\selectfont\\char".($hzk|0x8080)."}";
 357     }else{
 358         return undef;
 359     }
 360 }
 361
 362 sub get_macro_for_CDP{
 363     my($char_id)=@_;
 364     my($char,$cdp);
 365     $char=pack("U",$char_id);
 366     foreach (@CDP){
 367         if($cdp=&get_char_attribute($char,$_)){
 368             last;
 369         }
 370     }
 371     if($cdp){
 372         return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char".($cdp|0x8080)."}";
 373     }else{
 374         return undef;
 375     }
 376 }