use $omegadb_path of Chise_utils.pm.
[chise/omega.git] / add_adobecid.pl
1 #!/usr/bin/perl -w
2
3 use strict;
4 use vars qw($cmapfile $db_home $encoding
5             %cs_var
6             $ucs $cid $last
7             $ciddb_filename $ciddb
8             );
9 use BerkeleyDB;
10 use Chise_utils ':all';
11 require 5.008;
12
13 my $debug=0;
14
15 my $usage=<<EOF;
16 Usage: perl $0 <CMAP file> <CHISE DB dir>
17     <CMAP file> UniJIS-UTF16-H etc. available in Adobe Reader Directory.
18     <CHISE DB dir> is directory to store BDB data,
19       typically /usr/local/lib/chise/chise-db.
20 EOF
21
22 #my $db_home="/usr/local/lib/chise/char-db";
23 #my $db_home="/usr/local/lib/chise/db";
24
25 if(@ARGV==2){
26     $cmapfile=shift;
27     $db_home=shift;
28
29     $db_home=~s!/$!!;
30     if(-d "$db_home/character"){
31         $db_home=$db_home."/character/feature";
32     }elsif(-d "$db_home/system-char-id"){
33         $db_home=$db_home."/system-char-id";
34     }else{
35         print STDERR $usage;
36         exit 1;
37     }
38
39     ($ciddb_filename=$cmapfile)=~s!^.*/(.*)$!"vnd-adobe-cid-".lc($1)!e;
40     ($encoding=$cmapfile)=~s!.*/Uni(\w+).*$!"\=ucs\@".lc($1)!e;
41 }
42
43 unless(defined($cmapfile) and -f $cmapfile
44        and $encoding=~/^=ucs\@(cns|gb|jis|ks)$/
45        and -d $db_home){
46     print STDERR $usage;
47     exit 1;
48 }
49
50 # if working on Mac OS.
51 if($^O=~/darwin/){
52     print STDERR "Using ^M as delimiter.\n";
53     $/="\r";
54 }
55
56 $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2',
57                      '=cns11643-3','=cns11643-4',
58                      '=cns11643-5','=cns11643-6',
59                      '=cns11643-7'];
60
61 $cs_var{'=ucs@gb'}=['=gb12345','=gb2312'];
62
63 $cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978',
64                      '=jis-x0208-1983','=jis-x0208-1990',
65                      '=jis-x0212',
66                      '=jis-x0213-1-2000','=jis-x0213-2-2000'];
67
68 $cs_var{'=ucs@ks'}=['=ks-x1001'];
69
70 if(-f "$db_home/$ciddb_filename"){
71     print STDERR "Removing old DB $db_home/$ciddb_filename.\n";
72     unlink "$db_home/$ciddb_filename";
73 }
74 $ciddb=new BerkeleyDB::Hash
75     -Filename => "$db_home/$ciddb_filename", -Flags => DB_CREATE
76     or die $!;
77
78 my $in_cidrange=0;
79 my $in_cidchar=0;
80 print STDERR "Reading $cmapfile...";
81 open(CMAP,"<$cmapfile") or die $!;
82 # taken from expandcmap.pl by taiji.
83 while(<CMAP>){
84     if(/begincidrange/){
85         $in_cidrange=1;
86     }elsif(/endcidrange/){
87         $in_cidrange=0;
88     }elsif(/begincidchar/){
89         $in_cidchar=1;
90     }elsif(/endcidchar/){
91         $in_cidchar=0;
92     }elsif($in_cidchar){
93         if(/<([\da-fA-F]+)>\s*(\d+)/){
94             ($ucs,$cid)=(hex($1),$2);
95             &store_cid($ucs,$cid,$encoding);
96         }
97     }elsif($in_cidrange){
98         if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/){
99             ($ucs, $last, $cid) = (hex($1), hex($2), $3);
100             while ($ucs <= $last) {
101                 &store_cid($ucs,$cid,$encoding);
102                 $cid++,$ucs++;
103             }
104         }
105     }
106 }
107 close(CMAP);
108 print STDERR "done!\n";
109
110 exit 0;
111
112 sub store_cid{
113     my($ucs,$cid,$encoding)=@_;
114     my($char,$char_id,$char_id_unified);
115
116     if($char_id=&replace_char_id($ucs,$encoding)){
117         $char=pack("U",$char_id);
118     }else{
119         if(&have_glyph($ucs,$encoding)){
120             $char=pack("U",$ucs);
121         }else{
122             foreach $char_id_unified (&get_char_id_unified($ucs)){
123                 if(&have_glyph($char_id_unified,$encoding)){
124                     $char_id=$char_id_unified;
125                     last;
126                 }
127             }
128             if($char_id){
129                 $char=pack("U",$char_id);
130                 if($debug){
131                     print STDERR sprintf("%x is used for %x(%s).\n",
132                                          $char_id,$ucs,$encoding);
133                 }
134             }else{
135                 $char=pack("U",$ucs);
136                 if($debug){
137                     print STDERR sprintf("%x is uncertain for %s.\n",$ucs,$encoding);
138                 }
139             }
140         }
141     }
142     if($debug){
143         print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid);
144     }
145     unless($ciddb->db_put("?".$char,$cid)==0){
146         die $!;
147     }
148 }
149
150 sub replace_char_id{
151     my($ucs,$encoding)=@_;
152     my($char);
153
154     if(($char)=&get_chars_matching($encoding,$ucs)){
155         return unpack("U",$char);
156     }else{
157         return undef;
158     }
159 }
160
161 sub have_glyph{
162     my($char_id,$cs)=@_;
163     my($cs_var,$char);
164     $char=pack("U",$char_id);
165     foreach $cs_var (@{$cs_var{$cs}}){
166         if(&get_char_attribute($char,$cs_var)){
167             return 1;
168         }
169     }
170     return undef;
171 }
172
173 sub get_char_id_unified{
174     my($char_id)=@_;
175     my($chars);
176     if($chars=&get_char_attribute(pack("U",$char_id),'->ucs-unified')){
177         $chars=~s/^\((.*)\)$/$1/;
178         return map {unpack("U",$_)} (split(/\s*\?/,$chars));
179     }else{
180         return ();
181     }
182 }