#!/usr/bin/perl -w
use strict;
-use vars qw($cmapfile $db_home $encoding
- %cs_var
+use vars qw($cmapfile $db_home $encoding $utf16
+ %cs_var
$ucs $cid $last
$ciddb_filename $ciddb %ciddb %cid
);
Usage: perl $0 <CMAP file> <CHISE DB dir>
<CMAP file> UniJIS-UTF16-H etc. available in Adobe Reader Directory.
<CHISE DB dir> is directory to store BDB data,
- typically /usr/local/share/chise/0.3/db.
+ typically /usr/local/share/chise/1.0/db.
EOF
#my $db_home="/usr/local/lib/chise/char-db";
exit 1;
}
+if($cmapfile=~/utf16/io){
+ $utf16=1;
+}
+
# if working on Mac OS.
if($^O=~/darwin/){
print STDERR "Using ^M as delimiter.\n";
$cs_var{'=ucs@gb'}=['=gb12345','=gb2312'];
$cs_var{'=ucs@jis'}=['=jis-x0208','=jis-x0208-1978',
- '=jis-x0208-1983','=jis-x0208-1990',
+ '=jis-x0208-1983','=jis-x0208-1990','=jis-x0208-1997',
'=jis-x0212',
'=jis-x0213-1-2000','=jis-x0213-2-2000'];
open(CMAP,"<$cmapfile") or die $!;
# taken from expandcmap.pl by taiji.
while(<CMAP>){
- if(/begincidrange/){
+ if(/begincidrange/o){
$in_cidrange=1;
- }elsif(/endcidrange/){
+ }elsif(/endcidrange/o){
$in_cidrange=0;
- }elsif(/begincidchar/){
+ }elsif(/begincidchar/o){
$in_cidchar=1;
- }elsif(/endcidchar/){
+ }elsif(/endcidchar/o){
$in_cidchar=0;
}elsif($in_cidchar){
- if(/<([\da-fA-F]+)>\s*(\d+)/){
- ($ucs,$cid)=(hex($1),$2);
+ if(/<([\da-fA-F]+)>\s*(\d+)/o){
+ ($ucs,$cid)=($utf16?&decode_utf16($1):hex($1),$2);
&store_cid($ucs,$cid,$encoding);
}
}elsif($in_cidrange){
- if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/){
- ($ucs, $last, $cid) = (hex($1), hex($2), $3);
+ if(/<([\da-fA-F]+)>\s*<([\da-fA-F]+)>\s*(\d+)/o){
+ ($ucs, $last, $cid) = ($utf16?&decode_utf16($1):hex($1), $utf16?&decode_utf16($2):hex($2), $3);
while ($ucs <= $last) {
&store_cid($ucs,$cid,$encoding);
$cid++,$ucs++;
}
}
}
+# $char=&replace_denotational($char);
if($debug){
print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid);
}
$cid{$char}=$cid;
}
+# Map a character to the first character returned by
+# get_chars_matching('=ucs@unicode', <code point>), if there is one.
+#
+#   $in_char - a one-character string; its code point is taken with
+#              unpack("U", ...).
+# Returns the first element of the lookup result when the lookup yields a
+# non-empty list, otherwise $in_char itself.
+# NOTE(review): get_chars_matching is defined outside this fragment; it is
+# presumably a CHISE feature lookup -- confirm against its definition.
+sub replace_denotational($){
+    my($in_char)=@_;
+    # Called in list context; the element count decides whether a
+    # replacement exists.
+    my @matches=&get_chars_matching('=ucs@unicode',unpack("U",$in_char));
+    return @matches ? $matches[0] : $in_char;
+}
+
sub replace_char_id{
my($ucs,$encoding)=@_;
my($char);
return ();
}
}
+
+# Convert a hexadecimal code taken from a UTF-16 CMap into a single integer
+# code point.
+#
+#   $in - the hex digits found between <...>: four digits for one UTF-16
+#         code unit, eight digits for a surrogate pair.
+# Returns the decoded value as an integer.  Eight digits are combined as a
+# surrogate pair only when the first unit is a high surrogate (D800-DBFF)
+# and the second a low surrogate (DC00-DFFF).  Every other input is handed
+# to hex() unchanged, so a malformed "pair" can no longer yield a bogus or
+# negative code point.
+sub decode_utf16($){
+    my($in)=@_;
+    # Anchored at both ends: only an exact eight-digit string is a candidate
+    # pair; longer strings are not silently truncated.
+    if($in=~m/\A([\da-fA-F]{4})([\da-fA-F]{4})\z/){
+        my($hi,$lo)=(hex($1),hex($2));
+        if($hi>=0xD800 && $hi<=0xDBFF && $lo>=0xDC00 && $lo<=0xDFFF){
+            return 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
+        }
+    }
+    return hex($in);
+}