4 use vars qw($cmapfile $db_home $encoding
10 use Chise_utils ':all';
16 Usage: perl $0 <CMAP file> <CHISE DB dir>
17 <CMAP file> UniJIS-UTF16-H etc. available in Adobe Reader Directory.
18 <CHISE DB dir> is directory to store BDB data,
19 typically /usr/local/lib/chise/chise-db.
22 #my $db_home="/usr/local/lib/chise/char-db";
23 #my $db_home="/usr/local/lib/chise/db";
# Choose the working subdirectory under $db_home: use "character/feature"
# when a "character" directory exists; otherwise use "system-char-id" when
# that directory exists.
# NOTE(review): only "$db_home/character" is tested; the existence of its
# "feature" subdirectory is not checked in the visible lines.
30 if(-d "$db_home/character"){
31 $db_home=$db_home."/character/feature";
32 }elsif(-d "$db_home/system-char-id"){
33 $db_home=$db_home."/system-char-id";
# Derive the CID database filename from the CMap path: everything up to the
# last "/" is dropped, the remaining basename is lowercased and prefixed
# with "vnd-adobe-cid-".
# NOTE(review): the pattern requires a "/" in $cmapfile; for a bare filename
# the substitution does not match and $ciddb_filename stays equal to
# $cmapfile.
39 ($ciddb_filename=$cmapfile)=~s!^.*/(.*)$!"vnd-adobe-cid-".lc($1)!e;
# Derive the encoding name: the word characters following "/Uni" in the path
# are lowercased and prefixed with "=ucs@" (e.g. ".../UniJIS-UTF16-H" gives
# "=ucs@jis"). This pattern also requires a "/" before "Uni".
40 ($encoding=$cmapfile)=~s!.*/Uni(\w+).*$!"\=ucs\@".lc($1)!e;
# Argument validation (only part of the condition is visible here): the CMap
# path must be defined and name a plain file, and the derived encoding must
# be exactly one of =ucs@cns, =ucs@gb, =ucs@jis or =ucs@ks.
43 unless(defined($cmapfile) and -f $cmapfile
44 and $encoding=~/^=ucs\@(cns|gb|jis|ks)$/
50 # if working on Mac OS.
# Reports on STDERR that ^M (carriage return) is used as the delimiter.
# NOTE(review): the condition and the statement that actually selects the
# delimiter are not visible in this excerpt -- confirm against the full file.
52 print STDERR "Using ^M as delimiter.\n";
# %cs_var maps each supported "=ucs@<region>" encoding name to an array
# reference of coded-character-set feature names for that region. Further
# down, the list for one key is walked and each name is passed to
# get_char_attribute together with a character.
# Keys here match the encodings accepted by the argument validation:
# =ucs@cns, =ucs@gb, =ucs@jis, =ucs@ks.
56 $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2',
57 '=cns11643-3','=cns11643-4',
58 '=cns11643-5','=cns11643-6',
61 $cs_var{'=ucs@gb'}=['=gb12345','=gb2312'];
63 $cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978',
64 '=jis-x0208-1983','=jis-x0208-1990',
66 '=jis-x0213-1-2000','=jis-x0213-2-2000'];
68 $cs_var{'=ucs@ks'}=['=ks-x1001'];
# Start from a clean database: if a file with the target name already exists
# under $db_home, announce it on STDERR and delete it.
# NOTE(review): the return value of unlink is not checked in the visible
# lines.
70 if(-f "$db_home/$ciddb_filename"){
71 print STDERR "Removing old DB $db_home/$ciddb_filename.\n";
72 unlink "$db_home/$ciddb_filename";
# Create the BerkeleyDB hash database (DB_CREATE) that receives the CID
# mappings; the handle is kept in $ciddb.
# NOTE(review): uses indirect-object syntax ("new BerkeleyDB::Hash");
# BerkeleyDB::Hash->new(...) is the unambiguous form. Any error handling for
# a failed constructor is not visible in this excerpt.
74 $ciddb=new BerkeleyDB::Hash
75 -Filename => "$db_home/$ciddb_filename", -Flags => DB_CREATE
# Read the CMap file and hand every code/CID pair found in it to store_cid.
# Progress is reported on STDERR ("Reading ..." before, "done!" after).
80 print STDERR "Reading $cmapfile...";
# NOTE(review): two-argument open with the filename interpolated into the
# mode string; the three-argument form open(CMAP,'<',$cmapfile) avoids mode
# injection through the filename. The script dies with $! when open fails.
81 open(CMAP,"<$cmapfile") or die $!;
82 # taken from expandcmap.pl by taiji.
# The branches below react to the "endcidrange" and "begincidchar" section
# markers; their bodies and the other branches of this chain are not visible
# in this excerpt.
86 }elsif(/endcidrange/){
88 }elsif(/begincidchar/){
# Single-code entry: "<hex code>" followed by a decimal CID. The code is
# converted from hex and stored with its CID.
93 if(/<([\da-fA-F]+)>\s*(\d+)/){
94 ($ucs,$cid)=(hex($1),$2);
95 &store_cid($ucs,$cid,$encoding);
# Range entry: "<hex first> <hex last>" followed by a decimal CID. store_cid
# is called while $ucs <= $last.
# NOTE(review): the statements that advance $ucs (and presumably $cid) inside
# the loop are not visible here -- confirm against the full file.
98 if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/){
99 ($ucs, $last, $cid) = (hex($1), hex($2), $3);
100 while ($ucs <= $last) {
101 &store_cid($ucs,$cid,$encoding);
108 print STDERR "done!\n";
# Body of the routine that receives ($ucs, $cid, $encoding) -- presumably
# store_cid, which is called with these three arguments by the CMap reader;
# the "sub" line itself is not visible in this excerpt.
#
# It decides which character the CID is recorded under and writes the pair
# to $ciddb. Candidate sources visible below, in textual order:
#   1. replace_char_id($ucs,$encoding): when it returns a true value, that
#      id is packed into the character.
#   2. have_glyph($ucs,$encoding): when true, the code point itself is used.
#   3. get_char_id_unified($ucs): each returned id is tested with have_glyph
#      and a matching id is taken.
# NOTE(review): the else/closing lines between these branches are missing
# here, so the exact nesting and fall-through order must be confirmed
# against the full file.
113 my($ucs,$cid,$encoding)=@_;
114 my($char,$char_id,$char_id_unified);
116 if($char_id=&replace_char_id($ucs,$encoding)){
117 $char=pack("U",$char_id);
119 if(&have_glyph($ucs,$encoding)){
120 $char=pack("U",$ucs);
122 foreach $char_id_unified (&get_char_id_unified($ucs)){
123 if(&have_glyph($char_id_unified,$encoding)){
124 $char_id=$char_id_unified;
# A substitute id was chosen: use it and log which id stands in for which
# code point and encoding.
129 $char=pack("U",$char_id);
131 print STDERR sprintf("%x is used for %x(%s).\n",
132 $char_id,$ucs,$encoding);
# Otherwise the code point itself is used and logged as uncertain for this
# encoding.
135 $char=pack("U",$ucs);
137 print STDERR sprintf("%x is uncertain for %s.\n",$ucs,$encoding);
# Log the final "code point:CID" pair (upper-case hex, decimal).
143 print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid);
# Store the CID under the key "?" followed by the character. A non-zero
# status from db_put enters the failure branch, whose body is not visible
# in this excerpt.
145 unless($ciddb->db_put("?".$char,$cid)==0){
# Fragment of a helper taking ($ucs, $encoding) -- presumably
# replace_char_id, which is called with these arguments elsewhere; its "sub"
# line is not visible in this excerpt.
# It asks get_chars_matching for characters matching the encoding/value
# pair, keeps the first one returned, and returns that character's code
# point.
151 my($ucs,$encoding)=@_;
154 if(($char)=&get_chars_matching($encoding,$ucs)){
156 return unpack("U",$char);
# Fragment of a helper -- presumably have_glyph; its "sub" line and argument
# unpacking are not visible in this excerpt.
# The character for $char_id is built, then every coded-character-set name
# listed in %cs_var for $cs is tried; a true get_char_attribute result for
# any of them enters the branch below (body not visible).
# NOTE(review): the loop variable $cs_var shares its name with the hash
# %cs_var; they are distinct variables in Perl.
165 $char=pack("U",$char_id);
166 foreach $cs_var (@{$cs_var{$cs}}){
167 if(&get_char_attribute($char,$cs_var)){
# get_char_id_unified: returns the code points listed in the "->ucs-unified"
# attribute of the character whose id is $char_id.
# The attribute value is read with get_char_attribute, one pair of enclosing
# parentheses is stripped, the remainder is split on "?" (with optional
# whitespace before it) and each piece is converted to its code point.
# NOTE(review): the line that sets $char_id (presumably from @_) and the
# return path when the attribute is missing are not visible in this excerpt.
174 sub get_char_id_unified{
177 if($chars=&get_char_attribute(pack("U",$char_id),'->ucs-unified')){
178 $chars=~s/^\((.*)\)$/$1/;
179 return map {unpack("U",$_)} (split(/\s*\?/,$chars));