add_adobecid.pl

   1 #!/usr/bin/perl -w
   2
   3 use strict;
   4 use vars qw($perl56 $perl58
   5             $cmapfile $db_home $encoding
   6             %cs_var
   7             $ucs $cid $last
   8             $ciddb_filename $ciddb
   9             );
  10 use BerkeleyDB;
  11 use Chise_utils ':all';
  12
  13 my $debug=0;
  14
  15 if($^V and $^V ge v5.8){
  16     $perl58=1;
  17 }elsif($^V and $^V ge v5.6){
  18     $perl56=1;
  19 }else{
  20     print STDERR "This version is not supported.";
  21 }
  22 if($perl58){
  23     eval "use Encode";
  24     binmode(STDIN, ':encoding(utf8)');
  25     binmode(STDOUT, ':encoding(utf8)');
  26 }
  27
  28 # if working on Mac OS.
  29 if($^O=~/darwin/){
  30     print STDERR "Using ^M as delimiter.\n";
  31     $/="\r";
  32 }
  33
  34 my $usage=<<EOF;
  35 Usage: perl $0 <CMAP file> <CHISE DB dir>
  36     <CMAP file> UniJIS-UTF16-H etc. available in Adobe Reader Directory.
  37     <CHISE DB dir> is directory to store BDB data,
  38       typically /usr/local/lib/chise/chise-db.
  39 EOF
  40
  41 #my $db_home="/usr/local/lib/chise/char-db";
  42 #my $db_home="/usr/local/lib/chise/db";
  43
  44 if(@ARGV==2){
  45     $cmapfile=shift;
  46     $db_home=shift;
  47
  48     $db_home=~s!/$!!;
  49     if(-d "$db_home/character"){
  50         $db_home=$db_home."/character/feature";
  51     }elsif(-d "$db_home/system-char-id"){
  52         $db_home=$db_home."/system-char-id";
  53     }else{
  54         print STDERR $usage;
  55         exit 1;
  56     }
  57
  58     ($ciddb_filename=$cmapfile)=~s!^.*/(.*)$!"vnd-adobe-cid-".lc($1)!e;
  59     ($encoding=$cmapfile)=~s!.*/Uni(\w+).*$!"\=ucs\@".lc($1)!e;
  60 }
  61
  62 unless(defined($cmapfile) and -f $cmapfile
  63        and $encoding=~/^=ucs\@(cns|gb|jis|ks)$/
  64        and -d $db_home){
  65     print STDERR $usage;
  66     exit 1;
  67 }
  68
  69 $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2',
  70                      '=cns11643-3','=cns11643-4',
  71                      '=cns11643-5','=cns11643-6',
  72                      '=cns11643-7'];
  73
  74 $cs_var{'=ucs@gb'}=['=gb12345','=gb2312'];
  75
  76 $cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978',
  77                      '=jis-x0208-1983','=jis-x0208-1990',
  78                      '=jis-x0212',
  79                      '=jis-x0213-1-2000','=jis-x0213-2-2000'];
  80
  81 $cs_var{'=ucs@ks'}=['=ks-x1001'];
  82
  83 if(-f "$db_home/$ciddb_filename"){
  84     print STDERR "Removing old DB $db_home/$ciddb_filename.\n";
  85     unlink "$db_home/$ciddb_filename";
  86 }
  87 $ciddb=new BerkeleyDB::Hash
  88     -Filename => "$db_home/$ciddb_filename", -Flags => DB_CREATE
  89     or die $!;
  90
  91 my $in_cidrange=0;
  92 my $in_cidchar=0;
  93 print STDERR "Reading $cmapfile...";
  94 open(CMAP,"<$cmapfile") or die $!;
  95 # taken from expandcmap.pl by taiji.
  96 while(<CMAP>){
  97     if(/begincidrange/){
  98         $in_cidrange=1;
  99     }elsif(/endcidrange/){
 100         $in_cidrange=0;
 101     }elsif(/begincidchar/){
 102         $in_cidchar=1;
 103     }elsif(/endcidchar/){
 104         $in_cidchar=0;
 105     }elsif($in_cidchar){
 106         if(/<([\da-fA-F]+)>\s*(\d+)/){
 107             ($ucs,$cid)=(hex($1),$2);
 108             &store_cid($ucs,$cid,$encoding);
 109         }
 110     }elsif($in_cidrange){
 111         if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/){
 112             ($ucs, $last, $cid) = (hex($1), hex($2), $3);
 113             while ($ucs <= $last) {
 114                 &store_cid($ucs,$cid,$encoding);
 115                 $cid++,$ucs++;
 116             }
 117         }
 118     }
 119 }
 120 close(CMAP);
 121 print STDERR "done!\n";
 122
 123 exit 0;
 124
 125 sub store_cid{
 126     my($ucs,$cid,$encoding)=@_;
 127     my($char,$char_id,$char_id_unified);
 128
 129     if($char_id=&replace_char_id($ucs,$encoding)){
 130         $char=pack("U",$char_id);
 131     }else{
 132         if(&have_glyph($ucs,$encoding)){
 133             $char=pack("U",$ucs);
 134         }else{
 135             foreach $char_id_unified (&get_char_id_unified($ucs)){
 136                 if(&have_glyph($char_id_unified,$encoding)){
 137                     $char_id=$char_id_unified;
 138                     last;
 139                 }
 140             }
 141             if($char_id){
 142                 $char=pack("U",$char_id);
 143                 if($debug){
 144                     print STDERR sprintf("%x is used for %x(%s).\n",
 145                                          $char_id,$ucs,$encoding);
 146                 }
 147             }else{
 148                 $char=pack("U",$ucs);
 149                 if($debug){
 150                     print STDERR sprintf("%x is uncertain for %s.\n",$ucs,$encoding);
 151                 }
 152             }
 153         }
 154     }
 155     if($debug){
 156         print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid);
 157     }
 158     unless($ciddb->db_put("?".$char,$cid)==0){
 159         die $!;
 160     }
 161 }
 162
 163 sub replace_char_id{
 164     my($ucs,$encoding)=@_;
 165     my($char);
 166
 167     if(($char)=&get_chars_matching($encoding,$ucs)){
 168         $char=decode('utf8', $char) if($perl58);
 169         $char=~s/^\?//;
 170         return unpack("U",$char);
 171     }else{
 172         return undef;
 173     }
 174 }
 175
 176 sub have_glyph{
 177     my($char_id,$cs)=@_;
 178     my($cs_var,$char);
 179     $char=pack("U",$char_id);
 180     foreach $cs_var (@{$cs_var{$cs}}){
 181         if(&get_char_attribute($char,$cs_var)){
 182             return 1;
 183         }
 184     }
 185     return undef;
 186 }
 187
 188 sub get_char_id_unified{
 189     my($char_id)=@_;
 190     my($chars);
 191     if($chars=&get_char_attribute(pack("U",$char_id),'->ucs-unified')){
 192         $chars=~s/^\((.*)\)$/$1/;
 193         $chars=~s/\?//g;
 194         return map {unpack("U",$_)} (split(/\s+/,$chars));
 195     }else{
 196         return ();
 197     }
 198 }