use strict;
use vars qw($perl56 $perl58
$cmapfile $db_home $encoding
+ %cs_var
$ucs $cid $last
$ciddb_filename $ciddb
);
use BerkeleyDB;
use Chise_utils ':all';
+my $debug=0;
+
if($^V and $^V ge v5.8){
$perl58=1;
}elsif($^V and $^V ge v5.6){
binmode(STDOUT, ':encoding(utf8)');
}
-# if using with Mac.
+# if working on Mac OS.
if($^O=~/darwin/){
print STDERR "Using ^M as delimiter.\n";
$/="\r";
my $usage=<<EOF;
Usage: perl $0 <CMAP file> <CHISE DB dir>
<CMAP file> UniJIS-UCS2-H, etc. available in Adobe Reader Directory.
- <CHISE DB dir> is the directory to store BDB data.
+ <CHISE DB dir> is the directory to store BDB data,
+ typically /usr/local/lib/chise/char-db.
EOF
-#my $db_home="./omegadb";
+#my $db_home="/usr/local/lib/chise/char-db";
if(@ARGV==2){
$cmapfile=shift;
$db_home=shift;
+
$db_home=~s!/$!!;
-}
+ $db_home=$db_home."/system-char-id";
-($ciddb_filename=$cmapfile)=~s!^.*/(.*)$!"adobe-".lc($1)!e;
-($encoding=$cmapfile)=~s!.*/Uni(\w+).*$!"\=ucs\@".lc($1)!e;
+ ($ciddb_filename=$cmapfile)=~s!^.*/(.*)$!"adobe-".lc($1)!e;
+ ($encoding=$cmapfile)=~s!.*/Uni(\w+).*$!"\=ucs\@".lc($1)!e;
+}
-if(not -f $cmapfile
- or not $encoding=~/^=ucs\@(cns|gb|jis|ks)$/
- or not -d $db_home){
+unless(-f $cmapfile
+ and $encoding=~/^=ucs\@(cns|gb|jis|ks)$/
+ and -d $db_home){
print $usage;
exit 1;
}
-if(-f "$db_home/system-char-id/$ciddb_filename"){
- print STDERR "Removing old DB $db_home/system-char-id/$ciddb_filename.\n";
- unlink "$db_home/system-char-id/$ciddb_filename";
+if(-f "$db_home/$ciddb_filename"){
+ print STDERR "Removing old DB $db_home/$ciddb_filename.\n";
+ unlink "$db_home/$ciddb_filename";
}
$ciddb=new BerkeleyDB::Hash
- -Filename => "$db_home/system-char-id/$ciddb_filename", -Flags => DB_CREATE
+ -Filename => "$db_home/$ciddb_filename", -Flags => DB_CREATE
or die $!;
my $in_cidrange=0;
sub store_cid{
my($ucs,$cid,$encoding)=@_;
- my($char);
- unless($char=&replace_char($ucs,$encoding)){
- $char=pack("U",$ucs);
+ my($char,$char_id,$char_id_unified);
+
+ if($char_id=&replace_char_id($ucs,$encoding)){
+ $char=pack("U",$char_id);
+ }else{
+ if(&have_glyph($ucs,$encoding)){
+ $char=pack("U",$ucs);
+ }else{
+ foreach $char_id_unified (&get_char_id_unified($ucs)){
+ if(&have_glyph($char_id_unified,$encoding)){
+ $char_id=$char_id_unified;
+ last;
+ }
+ }
+ if($char_id){
+ $char=pack("U",$char_id);
+ if($debug){
+ print STDERR sprintf("%x is used for %x(%s).\n",
+ $char_id,$ucs,$encoding);
+ }
+ }else{
+ $char=pack("U",$ucs);
+ if($debug){
+ print STDERR sprintf("%x is uncertain for %s.\n",$ucs,$encoding);
+ }
+ }
+ }
+ }
+ if($debug){
+ print STDERR sprintf("%X:%X\n",unpack("U",$char),$cid);
}
-# print sprintf("%X",unpack("U",$char)),":",$cid,"\n";
unless($ciddb->db_put("?".$char,$cid)==0){
die $!;
}
}
-sub replace_char{
+exit 0;
+
+$cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2',
+ '=cns11643-3','=cns11643-4',
+ '=cns11643-5','=cns11643-6',
+ '=cns11643-7'];
+
+$cs_var{'=ucs@gb'}=['=gb12345','=gb2312'];
+
+$cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978',
+ '=jis-x0208-1983','=jis-x0208-1990',
+ '=jis-x0212',
+ '=jis-x0213-1-2000','=jis-x0213-2-2000'];
+
+$cs_var{'=ucs@ks'}=['=ks-x1001'];
+
+sub replace_char_id{
my($ucs,$encoding)=@_;
- my($output_char);
+ my($char);
- if(($output_char)=&get_chars_matching($encoding,$ucs)){
- $output_char=decode('utf8', $output_char) if($perl58);
- $output_char=~s/^\?//;
- return $output_char;
+ if(($char)=&get_chars_matching($encoding,$ucs)){
+ $char=decode('utf8', $char) if($perl58);
+ $char=~s/^\?//;
+ return unpack("U",$char);
}else{
return undef;
}
}
+
+sub have_glyph{
+ my($char_id,$cs)=@_;
+ my($cs_var);
+ foreach $cs_var (@{$cs_var{$cs}}){
+ if(&get_char_attribute(pack("U",$char_id),$cs_var)){
+ return 1;
+ }
+ }
+ return undef;
+}
+
+sub get_char_id_unified{
+ my($char_id)=@_;
+ my($chars);
+ $chars=&get_char_attribute(pack("U",$char_id),'->ucs-unified');
+ $chars=~s/^\((.*)\)$/$1/;
+ $chars=~s/\?//g;
+ return map {unpack("U",$_)} (split(/\s+/,$chars));
+}