add_adobecid.pl

   1 #!/usr/bin/perl -w
   2
   3 use strict;
   4 use vars qw($cmapfile $db_home $encoding $utf16
   5             %cs_var
   6             $ucs $cid $last
   7             $ciddb_filename $ciddb %ciddb %cid
   8             );
   9 use BerkeleyDB;
  10 use Chise_utils ':all';
  11 require 5.008;
  12
  13 my $debug=0;
  14
  15 my $usage=<<EOF;
  16 Usage: perl $0 <CMAP file> <CHISE DB dir>
  17     <CMAP file> UniJIS-UTF16-H etc. available in Adobe Reader Directory.
  18     <CHISE DB dir> is directory to store BDB data,
  19       typically /usr/local/share/chise/1.0/db.
  20 EOF
  21
  22 #my $db_home="/usr/local/lib/chise/char-db";
  23 #my $db_home="/usr/local/lib/chise/db";
  24
  25 if(@ARGV==2){
  26     $cmapfile=shift;
  27     $db_home=shift;
  28
  29     $db_home=~s!/$!!;
  30     if(-d "$db_home/character"){
  31         $db_home=$db_home."/character/feature";
  32     }elsif(-d "$db_home/system-char-id"){
  33         $db_home=$db_home."/system-char-id";
  34     }else{
  35         print STDERR $usage;
  36         exit 1;
  37     }
  38
  39     ($ciddb_filename=$cmapfile)=~s!^.*/(.*)$!"vnd-adobe-cid-".lc($1)!e;
  40     ($encoding=$cmapfile)=~s!.*/Uni(\w+).*$!"\=ucs\@".lc($1)!e;
  41 }
  42
  43 unless(defined($cmapfile) and -f $cmapfile
  44        and $encoding=~/^=ucs\@(cns|gb|jis|ks)$/
  45        and -d $db_home){
  46     print STDERR $usage;
  47     exit 1;
  48 }
  49
  50 if($cmapfile=~/utf16/io){
  51     $utf16=1;
  52 }
  53
  54 # if working on Mac OS.
  55 if($^O=~/darwin/){
  56     print STDERR "Using ^M as delimiter.\n";
  57     $/="\r";
  58 }
  59
  60 $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2',
  61                      '=cns11643-3','=cns11643-4',
  62                      '=cns11643-5','=cns11643-6',
  63                      '=cns11643-7'];
  64
  65 $cs_var{'=ucs@gb'}=['=gb12345','=gb2312'];
  66
  67 $cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978',
  68                      '=jis-x0208-1983','=jis-x0208-1990','=jis-x0208-1997',
  69                      '=jis-x0212',
  70                      '=jis-x0213-1-2000','=jis-x0213-2-2000'];
  71
  72 $cs_var{'=ucs@ks'}=['=ks-x1001'];
  73
  74 if(-f "$db_home/$ciddb_filename"){
  75     print STDERR "Removing old DB $db_home/$ciddb_filename.\n";
  76     unlink "$db_home/$ciddb_filename";
  77 }
  78
  79 $ciddb=tie %ciddb, 'BerkeleyDB::Hash',
  80     -Filename => "$db_home/$ciddb_filename",
  81     -Flags => DB_CREATE|DB_TRUNCATE,
  82     -Pagesize      => 512,
  83     or die $!;
  84
  85 my $in_cidrange=0;
  86 my $in_cidchar=0;
  87 print STDERR "Reading $cmapfile...";
  88 open(CMAP,"<$cmapfile") or die $!;
  89 # taken from expandcmap.pl by taiji.
  90 while(<CMAP>){
  91     if(/begincidrange/o){
  92         $in_cidrange=1;
  93     }elsif(/endcidrange/o){
  94         $in_cidrange=0;
  95     }elsif(/begincidchar/o){
  96         $in_cidchar=1;
  97     }elsif(/endcidchar/o){
  98         $in_cidchar=0;
  99     }elsif($in_cidchar){
 100         if(/<([\da-fA-F]+)>\s*(\d+)/o){
 101             ($ucs,$cid)=($utf16?&decode_utf16($1):hex($1),$2);
 102             &store_cid($ucs,$cid,$encoding);
 103         }
 104     }elsif($in_cidrange){
 105         if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/o){
 106             ($ucs, $last, $cid) = ($utf16?&decode_utf16($1):hex($1), $utf16?&decode_utf16($2):hex($2), $3);
 107             while ($ucs <= $last) {
 108                 &store_cid($ucs,$cid,$encoding);
 109                 $cid++,$ucs++;
 110             }
 111         }
 112     }
 113 }
 114 close(CMAP);
 115 print STDERR "done!\n";
 116
 117 print STDERR "Storing data to CHISE DB...";
 118 foreach my $char (sort keys %cid){
 119     unless($ciddb->db_put("?".$char,$cid{$char})==0){
 120         die $!;
 121     }
 122 }
 123 print STDERR "done!\n";
 124
 125 undef $ciddb;
 126 untie %ciddb;
 127
 128 exit 0;
 129
 130 sub store_cid{
 131     my($ucs,$cid,$encoding)=@_;
 132     my($char,$char_id,$char_id_unified);
 133
 134     if($char_id=&replace_char_id($ucs,$encoding)){
 135         $char=pack("U",$char_id);
 136     }else{
 137         if(&have_glyph($ucs,$encoding)){
 138             $char=pack("U",$ucs);
 139         }else{
 140             foreach $char_id_unified (&get_char_id_unified($ucs)){
 141                 if(&have_glyph($char_id_unified,$encoding)){
 142                     $char_id=$char_id_unified;
 143                     last;
 144                 }
 145             }
 146             if($char_id){
 147                 $char=pack("U",$char_id);
 148                 if($debug){
 149                     print STDERR sprintf("%x is used for %x(%s).\n",
 150                                          $char_id,$ucs,$encoding);
 151                 }
 152             }else{
 153                 $char=pack("U",$ucs);
 154                 if($debug){
 155                     print STDERR sprintf("%x is uncertain for %s.\n",$ucs,$encoding);
 156                 }
 157             }
 158         }
 159     }
 160 #    $char=&replace_denotational($char);
 161     if($debug){
 162         print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid);
 163     }
 164     $cid{$char}=$cid;
 165 }
 166
 167 sub replace_denotational($){
 168     my($in_char)=@_;
 169     my($out_char);
 170     my $ucs=unpack("U",$in_char);
 171
 172     if(($out_char)=&get_chars_matching('=ucs@unicode',$ucs)){
 173         return $out_char;
 174     }else{
 175         return $in_char;
 176     }
 177 }
 178
 179 sub replace_char_id{
 180     my($ucs,$encoding)=@_;
 181     my($char);
 182
 183     if(($char)=&get_chars_matching($encoding,$ucs)){
 184         return unpack("U",$char);
 185     }else{
 186         return undef;
 187     }
 188 }
 189
 190 sub have_glyph{
 191     my($char_id,$cs)=@_;
 192     my($cs_var,$char);
 193     $char=pack("U",$char_id);
 194     foreach $cs_var (@{$cs_var{$cs}}){
 195         if(&get_char_attribute($char,$cs_var)){
 196             return 1;
 197         }
 198     }
 199     return undef;
 200 }
 201
 202 sub get_char_id_unified{
 203     my($char_id)=@_;
 204     my($chars);
 205     if($chars=&get_char_attribute(pack("U",$char_id),'->ucs-unified')){
 206         $chars=~s/^\((.*)\)$/$1/;
 207         return map {unpack("U",$_)} (split(/\s*\?/,$chars));
 208     }else{
 209         return ();
 210     }
 211 }
 212
 213 sub decode_utf16($){
 214     my($in)=@_;
 215     my($out);
 216     if($in=~m/([\da-fA-F]{4})([\da-fA-F]{4})/o){
 217         $out=0x10000 + (hex($1) - 0xD800) * 0x400 + (hex($2) - 0xDC00);
 218     }else{
 219         $out=hex($in);
 220     }
 221     return $out;
 222 }