4 use vars qw($cmapfile $db_home $encoding $utf16
7 $ciddb_filename $ciddb %ciddb %cid
10 use Chise_utils ':all';
16 Usage: perl $0 <CMAP file> <CHISE DB dir>
17 <CMAP file> UniJIS-UTF16-H etc. available in Adobe Reader Directory.
18 <CHISE DB dir> is directory to store BDB data,
19 typically /usr/local/share/chise/1.0/db.
22 #my $db_home="/usr/local/lib/chise/char-db";
23 #my $db_home="/usr/local/lib/chise/db";
30 if(-d "$db_home/character"){
31 $db_home=$db_home."/character/feature";
32 }elsif(-d "$db_home/system-char-id"){
33 $db_home=$db_home."/system-char-id";
# Derive the output DB file name from the CMAP path: the whole path is
# replaced by "vnd-adobe-cid-" followed by the lower-cased text after the
# last "/".
# NOTE(review): the pattern requires a "/" in $cmapfile; without one the
# substitution does not fire and $ciddb_filename stays equal to $cmapfile.
39 ($ciddb_filename=$cmapfile)=~s!^.*/(.*)$!"vnd-adobe-cid-".lc($1)!e;
# Derive the encoding key: "=ucs@" plus the lower-cased run of word
# characters that follows "/Uni" in the path (e.g. ".../UniJIS-UTF16-H"
# gives "=ucs@jis").
# NOTE(review): this likewise only matches when the path contains "/Uni".
40 ($encoding=$cmapfile)=~s!.*/Uni(\w+).*$!"\=ucs\@".lc($1)!e;
43 unless(defined($cmapfile) and -f $cmapfile
44 and $encoding=~/^=ucs\@(cns|gb|jis|ks)$/
50 if($cmapfile=~/utf16/io){
54 # if working on Mac OS.
56 print STDERR "Using ^M as delimiter.\n";
60 $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2',
61 '=cns11643-3','=cns11643-4',
62 '=cns11643-5','=cns11643-6',
65 $cs_var{'=ucs@gb'}=['=gb12345','=gb2312'];
67 $cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978',
68 '=jis-x0208-1983','=jis-x0208-1990','=jis-x0208-1997',
70 '=jis-x0213-1-2000','=jis-x0213-2-2000'];
72 $cs_var{'=ucs@ks'}=['=ks-x1001'];
74 if(-f "$db_home/$ciddb_filename"){
75 print STDERR "Removing old DB $db_home/$ciddb_filename.\n";
76 unlink "$db_home/$ciddb_filename";
79 $ciddb=tie %ciddb, 'BerkeleyDB::Hash',
80 -Filename => "$db_home/$ciddb_filename",
81 -Flags => DB_CREATE|DB_TRUNCATE,
# Announce progress on STDERR, then open the CMAP file for reading on the
# bareword handle CMAP; the script dies with the OS error if the open fails.
# NOTE(review): this is a two-argument open with the file name interpolated
# into the mode string; the three-argument form would keep unusual file
# names from being read as part of the mode.
87 print STDERR "Reading $cmapfile...";
88 open(CMAP,"<$cmapfile") or die $!;
89 # taken from expandcmap.pl by taiji.
93 }elsif(/endcidrange/o){
95 }elsif(/begincidchar/o){
97 }elsif(/endcidchar/o){
# Single-entry form: a hex code in angle brackets followed by a decimal CID.
# (Presumably reached inside a begincidchar section; the enclosing condition
# is not visible in this excerpt.)
100 if(/<([\da-fA-F]+)>\s*(\d+)/o){
# The hex code goes through decode_utf16 when $utf16 is set, otherwise it is
# read as a plain hexadecimal number.
101 ($ucs,$cid)=($utf16?&decode_utf16($1):hex($1),$2);
102 &store_cid($ucs,$cid,$encoding);
104 }elsif($in_cidrange){
# Range form: two hex codes in angle brackets (first and last code of the
# range) followed by a decimal CID.
105 if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/o){
# Both range ends are decoded the same way as in the single-entry form.
106 ($ucs, $last, $cid) = ($utf16?&decode_utf16($1):hex($1), $utf16?&decode_utf16($2):hex($2), $3);
# Store an entry for every code from the first through $last inclusive.
# NOTE(review): the statements that advance $ucs (and presumably $cid) are
# not visible in this excerpt.
107 while ($ucs <= $last) {
108 &store_cid($ucs,$cid,$encoding);
115 print STDERR "done!\n";
117 print STDERR "Storing data to CHISE DB...";
# Walk the collected %cid entries in sorted key order and write each one to
# the DB object, using the key prefixed with "?".
118 foreach my $char (sort keys %cid){
# Any db_put return value other than 0 enters the branch below (its handling
# is not visible here). NOTE(review): BerkeleyDB's db_put is documented to
# return 0 on success - confirm against the installed module.
119 unless($ciddb->db_put("?".$char,$cid{$char})==0){
123 print STDERR "done!\n";
131 my($ucs,$cid,$encoding)=@_;
132 my($char,$char_id,$char_id_unified);
134 if($char_id=&replace_char_id($ucs,$encoding)){
135 $char=pack("U",$char_id);
137 if(&have_glyph($ucs,$encoding)){
138 $char=pack("U",$ucs);
140 foreach $char_id_unified (&get_char_id_unified($ucs)){
141 if(&have_glyph($char_id_unified,$encoding)){
142 $char_id=$char_id_unified;
147 $char=pack("U",$char_id);
149 print STDERR sprintf("%x is used for %x(%s).\n",
150 $char_id,$ucs,$encoding);
153 $char=pack("U",$ucs);
155 print STDERR sprintf("%x is uncertain for %s.\n",$ucs,$encoding);
160 # $char=&replace_denotational($char);
162 print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid);
167 sub replace_denotational($){
170 my $ucs=unpack("U",$in_char);
172 if(($out_char)=&get_chars_matching('=ucs@unicode',$ucs)){
180 my($ucs,$encoding)=@_;
183 if(($char)=&get_chars_matching($encoding,$ucs)){
184 return unpack("U",$char);
193 $char=pack("U",$char_id);
194 foreach $cs_var (@{$cs_var{$cs}}){
195 if(&get_char_attribute($char,$cs_var)){
# Return the code points listed in the '->ucs-unified' attribute of the
# character whose code point is $char_id.
# NOTE(review): how $char_id is obtained is not visible in this excerpt.
202 sub get_char_id_unified{
# Fetch the attribute value for the character built from $char_id; the rest
# of the branch only runs when the value is true.
205 if($chars=&get_char_attribute(pack("U",$char_id),'->ucs-unified')){
# Strip one pair of enclosing parentheses when the whole value is wrapped.
206 $chars=~s/^\((.*)\)$/$1/;
# Split on optional whitespace followed by "?", then convert each piece to
# the code point of its first character.
207 return map {unpack("U",$_)} (split(/\s*\?/,$chars));
# Treat the first eight hex digits found in $in as two 16-bit units: $1 as a
# high surrogate and $2 as a low surrogate.
216 if($in=~m/([\da-fA-F]{4})([\da-fA-F]{4})/o){
# Combine the pair into one code point: 0x10000 plus the high half's offset
# from 0xD800 times 0x400, plus the low half's offset from 0xDC00.
# NOTE(review): no check that the two halves actually fall in the surrogate
# ranges is visible here.
217 $out=0x10000 + (hex($1) - 0xD800) * 0x400 + (hex($2) - 0xDC00);