4 use vars qw($perl56 $perl58
5 $cmapfile $db_home $encoding
11 use Chise_utils ':all';
15 if($^V and $^V ge v5.8){
17 }elsif($^V and $^V ge v5.6){
20 print STDERR "This version is not supported.";
24 binmode(STDIN, ':encoding(utf8)');
25 binmode(STDOUT, ':encoding(utf8)');
28 # if working on Mac OS.
30 print STDERR "Using ^M as delimiter.\n";
35 Usage: perl $0 <CMAP file> <CHISE DB dir>
36 <CMAP file> UniJIS-UTF16-H etc. available in Adobe Reader Directory.
37 <CHISE DB dir> is directory to store BDB data,
38 typically /usr/local/lib/chise/chise-db.
41 #my $db_home="/usr/local/lib/chise/char-db";
42 #my $db_home="/usr/local/lib/chise/db";
49 if(-d "$db_home/character"){
50 $db_home=$db_home."/character/feature";
51 }elsif(-d "$db_home/system-char-id"){
52 $db_home=$db_home."/system-char-id";
58 ($ciddb_filename=$cmapfile)=~s!^.*/(.*)$!"vnd-adobe-cid-".lc($1)!e;
59 ($encoding=$cmapfile)=~s!.*/Uni(\w+).*$!"\=ucs\@".lc($1)!e;
62 unless(defined($cmapfile) and -f $cmapfile
63 and $encoding=~/^=ucs\@(cns|gb|jis|ks)$/
69 $cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2',
70 '=cns11643-3','=cns11643-4',
71 '=cns11643-5','=cns11643-6',
74 $cs_var{'=ucs@gb'}=['=gb12345','=gb2312'];
76 $cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978',
77 '=jis-x0208-1983','=jis-x0208-1990',
79 '=jis-x0213-1-2000','=jis-x0213-2-2000'];
81 $cs_var{'=ucs@ks'}=['=ks-x1001'];
83 if(-f "$db_home/$ciddb_filename"){
84 print STDERR "Removing old DB $db_home/$ciddb_filename.\n";
85 unlink "$db_home/$ciddb_filename";
87 $ciddb=new BerkeleyDB::Hash
88 -Filename => "$db_home/$ciddb_filename", -Flags => DB_CREATE
93 print STDERR "Reading $cmapfile...";
94 open(CMAP,"<$cmapfile") or die $!;
95 # taken from expandcmap.pl by taiji.
99 }elsif(/endcidrange/){
101 }elsif(/begincidchar/){
103 }elsif(/endcidchar/){
106 if(/<([\da-fA-F]+)>\s*(\d+)/){
107 ($ucs,$cid)=(hex($1),$2);
108 &store_cid($ucs,$cid,$encoding);
110 }elsif($in_cidrange){
111 if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/){
112 ($ucs, $last, $cid) = (hex($1), hex($2), $3);
113 while ($ucs <= $last) {
114 &store_cid($ucs,$cid,$encoding);
121 print STDERR "done!\n";
126 my($ucs,$cid,$encoding)=@_;
127 my($char,$char_id,$char_id_unified);
129 if($char_id=&replace_char_id($ucs,$encoding)){
130 $char=pack("U",$char_id);
132 if(&have_glyph($ucs,$encoding)){
133 $char=pack("U",$ucs);
135 foreach $char_id_unified (&get_char_id_unified($ucs)){
136 if(&have_glyph($char_id_unified,$encoding)){
137 $char_id=$char_id_unified;
142 $char=pack("U",$char_id);
144 print STDERR sprintf("%x is used for %x(%s).\n",
145 $char_id,$ucs,$encoding);
148 $char=pack("U",$ucs);
150 print STDERR sprintf("%x is uncertain for %s.\n",$ucs,$encoding);
156 print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid);
158 unless($ciddb->db_put("?".$char,$cid)==0){
164 my($ucs,$encoding)=@_;
167 if(($char)=&get_chars_matching($encoding,$ucs)){
168 $char=decode('utf8', $char) if($perl58);
170 return unpack("U",$char);
179 $char=pack("U",$char_id);
180 foreach $cs_var (@{$cs_var{$cs}}){
181 if(&get_char_attribute($char,$cs_var)){
188 sub get_char_id_unified{
191 if($chars=&get_char_attribute(pack("U",$char_id),'->ucs-unified')){
192 $chars=~s/^\((.*)\)$/$1/;
194 return map {unpack("U",$_)} (split(/\s+/,$chars));