#!/usr/bin/perl -w
use strict;
-use vars qw($perl56 $perl58
- $cmapfile $db_home $encoding
+use vars qw($cmapfile $db_home $encoding
%cs_var
$ucs $cid $last
- $ciddb_filename $ciddb
+ $ciddb_filename $ciddb %ciddb %cid
);
use BerkeleyDB;
use Chise_utils ':all';
+require 5.008;
my $debug=0;
-if($^V and $^V ge v5.8){
- $perl58=1;
-}elsif($^V and $^V ge v5.6){
- $perl56=1;
-}else{
- print STDERR "This version is not supported.";
-}
-if($perl58){
- eval "use Encode";
- binmode(STDIN, ':encoding(utf8)');
- binmode(STDOUT, ':encoding(utf8)');
-}
-
-# if working on Mac OS.
-if($^O=~/darwin/){
- print STDERR "Using ^M as delimiter.\n";
- $/="\r";
-}
-
my $usage=<<EOF;
Usage: perl $0 <CMAP file> <CHISE DB dir>
<CMAP file> UniJIS-UTF16-H etc. available in Adobe Reader Directory.
exit 1;
}
+# if working on Mac OS.
+if($^O=~/darwin/){
+ print STDERR "Using ^M as delimiter.\n";
+ $/="\r";
+}
+
$cs_var{'=ucs@cns'}=['=cns11643-1','=cns11643-2',
'=cns11643-3','=cns11643-4',
'=cns11643-5','=cns11643-6',
print STDERR "Removing old DB $db_home/$ciddb_filename.\n";
unlink "$db_home/$ciddb_filename";
}
-$ciddb=new BerkeleyDB::Hash
- -Filename => "$db_home/$ciddb_filename", -Flags => DB_CREATE
+
+$ciddb=tie %ciddb, 'BerkeleyDB::Hash',
+ -Filename => "$db_home/$ciddb_filename",
+ -Flags => DB_CREATE|DB_TRUNCATE,
+ -Pagesize => 512,
or die $!;
my $in_cidrange=0;
close(CMAP);
print STDERR "done!\n";
+print STDERR "Storing data to CHISE DB...";
+foreach my $char (sort keys %cid){
+ unless($ciddb->db_put("?".$char,$cid{$char})==0){
+ die $!;
+ }
+}
+print STDERR "done!\n";
+
+undef $ciddb;
+untie %ciddb;
+
exit 0;
sub store_cid{
if($debug){
print STDERR sprintf("%X:%d\n",unpack("U",$char),$cid);
}
- unless($ciddb->db_put("?".$char,$cid)==0){
- die $!;
- }
+ $cid{$char}=$cid;
}
sub replace_char_id{
my($char);
if(($char)=&get_chars_matching($encoding,$ucs)){
- $char=decode('utf8', $char) if($perl58);
- $char=~s/^\?//;
return unpack("U",$char);
}else{
return undef;
my($chars);
if($chars=&get_char_attribute(pack("U",$char_id),'->ucs-unified')){
$chars=~s/^\((.*)\)$/$1/;
- $chars=~s/\?//g;
- return map {unpack("U",$_)} (split(/\s+/,$chars));
+ return map {unpack("U",$_)} (split(/\s*\?/,$chars));
}else{
return ();
}