chise2otf/add_adobecid.pl

   1 #!/usr/bin/perl -w
   2
   3 use strict;
   4 use vars qw($perl56 $perl58
   5             $cmapfile $db_home $encoding
   6             %cs_var
   7             $ucs $cid $last
   8             $ciddb_filename $ciddb
   9             );
  10 use BerkeleyDB;
  11 use Chise_utils ':all';
  12
  13 my $debug=0;
  14
  15 if($^V and $^V ge v5.8){
  16     $perl58=1;
  17 }elsif($^V and $^V ge v5.6){
  18     $perl56=1;
  19 }else{
  20     print STDERR "This version is not supported.";
  21 }
  22 if($perl58){
  23     eval "use Encode";
  24     binmode(STDIN, ':encoding(utf8)');
  25     binmode(STDOUT, ':encoding(utf8)');
  26 }
  27
  28 # if working on Mac OS.
  29 if($^O=~/darwin/){
  30     print STDERR "Using ^M as delimiter.\n";
  31     $/="\r";
  32 }
  33
  34 my $usage=<<EOF;
  35 Usage: perl $0 <CMAP file> <CHISE DB dir>
  36     <CMAP file> UniJIS-UCS2-H, etc. available in Adobe Reader Directory.
  37     <CHISE DB dir> is the directory to store BDB data,
  38       typically /usr/local/lib/chise/char-db.
  39 EOF
  40
  41 #my $db_home="/usr/local/lib/chise/char-db";
  42
  43 if(@ARGV==2){
  44     $cmapfile=shift;
  45     $db_home=shift;
  46
  47     $db_home=~s!/$!!;
  48     $db_home=$db_home."/system-char-id";
  49
  50     ($ciddb_filename=$cmapfile)=~s!^.*/(.*)$!"adobe-".lc($1)!e;
  51     ($encoding=$cmapfile)=~s!.*/Uni(\w+).*$!"\=ucs\@".lc($1)!e;
  52 }
  53
  54 unless(defined($cmapfile) and -f $cmapfile
  55        and $encoding=~/^=ucs\@(cns|gb|jis|ks)$/
  56        and -d $db_home){
  57     print $usage;
  58     exit 1;
  59 }
  60
  61 $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2',
  62                      '=cns11643-3','=cns11643-4',
  63                      '=cns11643-5','=cns11643-6',
  64                      '=cns11643-7'];
  65
  66 $cs_var{'=ucs@gb'}=['=gb12345','=gb2312'];
  67
  68 $cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978',
  69                      '=jis-x0208-1983','=jis-x0208-1990',
  70                      '=jis-x0212',
  71                      '=jis-x0213-1-2000','=jis-x0213-2-2000'];
  72
  73 $cs_var{'=ucs@ks'}=['=ks-x1001'];
  74
  75 if(-f "$db_home/$ciddb_filename"){
  76     print STDERR "Removing old DB $db_home/$ciddb_filename.\n";
  77     unlink "$db_home/$ciddb_filename";
  78 }
  79 $ciddb=new BerkeleyDB::Hash
  80     -Filename => "$db_home/$ciddb_filename", -Flags => DB_CREATE
  81     or die $!;
  82
  83 my $in_cidrange=0;
  84 my $in_cidchar=0;
  85 print STDERR "Reading $cmapfile...";
  86 open(CMAP,"<$cmapfile") or die $!;
  87 # taken from expandcmap.pl by taiji.
  88 while(<CMAP>){
  89     if(/begincidrange/){
  90         $in_cidrange=1;
  91     }elsif(/endcidrange/){
  92         $in_cidrange=0;
  93     }elsif(/begincidchar/){
  94         $in_cidchar=1;
  95     }elsif(/endcidchar/){
  96         $in_cidchar=0;
  97     }elsif($in_cidchar){
  98         if(/<([\da-fA-F]+)>\s*(\d+)/){
  99             ($ucs,$cid)=(hex($1),$2);
 100             &store_cid($ucs,$cid,$encoding);
 101         }
 102     }elsif($in_cidrange){
 103         if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/){
 104             ($ucs, $last, $cid) = (hex($1), hex($2), $3);
 105             while ($ucs <= $last) {
 106                 &store_cid($ucs,$cid,$encoding);
 107                 $cid++,$ucs++;
 108             }
 109         }
 110     }
 111 }
 112 close(CMAP);
 113 print STDERR "done!\n";
 114
 115 exit 0;
 116
 117 sub store_cid{
 118     my($ucs,$cid,$encoding)=@_;
 119     my($char,$char_id,$char_id_unified);
 120
 121     if($char_id=&replace_char_id($ucs,$encoding)){
 122         $char=pack("U",$char_id);
 123     }else{
 124         if(&have_glyph($ucs,$encoding)){
 125             $char=pack("U",$ucs);
 126         }else{
 127             foreach $char_id_unified (&get_char_id_unified($ucs)){
 128                 if(&have_glyph($char_id_unified,$encoding)){
 129                     $char_id=$char_id_unified;
 130                     last;
 131                 }
 132             }
 133             if($char_id){
 134                 $char=pack("U",$char_id);
 135                 if($debug){
 136                     print STDERR sprintf("%x is used for %x(%s).\n",
 137                                          $char_id,$ucs,$encoding);
 138                 }
 139             }else{
 140                 $char=pack("U",$ucs);
 141                 if($debug){
 142                     print STDERR sprintf("%x is uncertain for %s.\n",$ucs,$encoding);
 143                 }
 144             }
 145         }
 146     }
 147     if($debug){
 148         print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid);
 149     }
 150     unless($ciddb->db_put("?".$char,$cid)==0){
 151         die $!;
 152     }
 153 }
 154
 155 exit 0;
 156
 157
 158 sub replace_char_id{
 159     my($ucs,$encoding)=@_;
 160     my($char);
 161
 162     if(($char)=&get_chars_matching($encoding,$ucs)){
 163         $char=decode('utf8', $char) if($perl58);
 164         $char=~s/^\?//;
 165         return unpack("U",$char);
 166     }else{
 167         return undef;
 168     }
 169 }
 170
 171 sub have_glyph{
 172     my($char_id,$cs)=@_;
 173     my($cs_var,$char);
 174     $char=pack("U",$char_id);
 175     foreach $cs_var (@{$cs_var{$cs}}){
 176         if(&get_char_attribute($char,$cs_var)){
 177             return 1;
 178         }
 179     }
 180     return undef;
 181 }
 182
 183 sub get_char_id_unified{
 184     my($char_id)=@_;
 185     my($chars);
 186     if($chars=&get_char_attribute(pack("U",$char_id),'->ucs-unified')){
 187         $chars=~s/^\((.*)\)$/$1/;
 188         $chars=~s/\?//g;
 189         return map {unpack("U",$_)} (split(/\s+/,$chars));
 190     }else{
 191         return ();
 192     }
 193 }