4 use vars qw($perl56 $perl58
5 $cmapfile $db_home $encoding
11 use Chise_utils ':all';
15 if($^V and $^V ge v5.8){
17 }elsif($^V and $^V ge v5.6){
20 print STDERR "This version is not supported.";
24 binmode(STDIN, ':encoding(utf8)');
25 binmode(STDOUT, ':encoding(utf8)');
28 # if working on Mac OS.
30 print STDERR "Using ^M as delimiter.\n";
35 Usage: perl $0 <CMAP file> <CHISE DB dir>
36 <CMAP file> UniJIS-UCS2-H, etc. available in Adobe Reader Directory.
37 <CHISE DB dir> is the directory to store BDB data,
38 typically /usr/local/lib/chise/char-db.
41 #my $db_home="/usr/local/lib/chise/char-db";
48 $db_home=$db_home."/system-char-id";
50 ($ciddb_filename=$cmapfile)=~s!^.*/(.*)$!"adobe-".lc($1)!e;
51 ($encoding=$cmapfile)=~s!.*/Uni(\w+).*$!"\=ucs\@".lc($1)!e;
54 unless(defined($cmapfile) and -f $cmapfile
55 and $encoding=~/^=ucs\@(cns|gb|jis|ks)$/
61 $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2',
62 '=cns11643-3','=cns11643-4',
63 '=cns11643-5','=cns11643-6',
66 $cs_var{'=ucs@gb'}=['=gb12345','=gb2312'];
68 $cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978',
69 '=jis-x0208-1983','=jis-x0208-1990',
71 '=jis-x0213-1-2000','=jis-x0213-2-2000'];
73 $cs_var{'=ucs@ks'}=['=ks-x1001'];
75 if(-f "$db_home/$ciddb_filename"){
76 print STDERR "Removing old DB $db_home/$ciddb_filename.\n";
77 unlink "$db_home/$ciddb_filename";
79 $ciddb=new BerkeleyDB::Hash
80 -Filename => "$db_home/$ciddb_filename", -Flags => DB_CREATE
85 print STDERR "Reading $cmapfile...";
86 open(CMAP,"<$cmapfile") or die $!;
87 # taken from expandcmap.pl by taiji.
91 }elsif(/endcidrange/){
93 }elsif(/begincidchar/){
98 if(/<([\da-fA-F]+)>\s*(\d+)/){
99 ($ucs,$cid)=(hex($1),$2);
100 &store_cid($ucs,$cid,$encoding);
102 }elsif($in_cidrange){
103 if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/){
104 ($ucs, $last, $cid) = (hex($1), hex($2), $3);
105 while ($ucs <= $last) {
106 &store_cid($ucs,$cid,$encoding);
113 print STDERR "done!\n";
118 my($ucs,$cid,$encoding)=@_;
119 my($char,$char_id,$char_id_unified);
121 if($char_id=&replace_char_id($ucs,$encoding)){
122 $char=pack("U",$char_id);
124 if(&have_glyph($ucs,$encoding)){
125 $char=pack("U",$ucs);
127 foreach $char_id_unified (&get_char_id_unified($ucs)){
128 if(&have_glyph($char_id_unified,$encoding)){
129 $char_id=$char_id_unified;
134 $char=pack("U",$char_id);
136 print STDERR sprintf("%x is used for %x(%s).\n",
137 $char_id,$ucs,$encoding);
140 $char=pack("U",$ucs);
142 print STDERR sprintf("%x is uncertain for %s.\n",$ucs,$encoding);
148 print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid);
150 unless($ciddb->db_put("?".$char,$cid)==0){
159 my($ucs,$encoding)=@_;
162 if(($char)=&get_chars_matching($encoding,$ucs)){
163 $char=decode('utf8', $char) if($perl58);
165 return unpack("U",$char);
174 $char=pack("U",$char_id);
175 foreach $cs_var (@{$cs_var{$cs}}){
176 if(&get_char_attribute($char,$cs_var)){
183 sub get_char_id_unified{
186 if($chars=&get_char_attribute(pack("U",$char_id),'->ucs-unified')){
187 $chars=~s/^\((.*)\)$/$1/;
189 return map {unpack("U",$_)} (split(/\s+/,$chars));