1 ###############################################################################
4 # create DB file for KAGE/cgi from CHISE-IDS *.txt
6 ###############################################################################
8 if($#ARGV != 0 || !(-d $ARGV[0])){
9 print "Usage: perl makeidsdb.pl ids_dir\n";
13 $idsdir =~ s/\/?$//; # remove the last slash
16 print "Remove old idsdb first.\n";
23 tie %db, "BerkeleyDB::Hash", -Filename => "idsdb", -Flags => DB_CREATE
24 or die "An error occured at ceating DB file.\n";
27 opendir(DH, "$ARGV[0]");
28 @dirlist = readdir(DH);
35 open(FH, "<$ARGV[0]/$_");
37 if($buffer[0] =~ m/utf-8-mcs-er/){
40 $_ =~ m/(.*?)[\s]?\t(.*)\t(.*)\n/; # remove space after Daikanwa's No.
41 $key = &properucs2kageucs($1);
42 $value = &utf8er2kageer($3);
43 if(($key ne $value) && ($value ne "")){
44 print "\rcreating ... [$counter] key : $key ";
60 ###############################################################################
62 sub properucs2kageucs{
65 $ax =~ s/\'/s/; # replace single quote for Daikanwa No.
66 $ax =~ s/\"/d/; # replace double quote for Daikanwa No.
74 my $result, $first, $semicolon, $ax;
78 for(my $pointer = 0; $pointer < length(@_[0]);){
79 $first = unpack("C", substr(@_[0], $pointer, 1));
80 if(0x00 <= $first && $first <= 0x7f){ # 1 byte
82 # the top letter is must be "&"
84 $semicolon = index(@_[0], ';', $pointer);
85 $ax = substr(@_[0], $pointer + 1, $semicolon - $pointer - 1).".";
86 $ax =~ s/^I-//; # remove "I-"
87 $ax =~ s/\'/s/; # replace single quote for Daikanwa No.
88 $ax =~ s/\"/d/; # replace double quote for Daikanwa No.
91 $pointer = $semicolon + 1;
98 elsif(0xc0 <= $first && $first <= 0xdf){ # 2 bytes
102 elsif(0xe0 <= $first && $first <= 0xef){ # 3 bytes
103 $ax = (unpack("C", substr(@_[0], $pointer, 1)) & 0x0f) << 12;
104 $ax += (unpack("C", substr(@_[0], $pointer + 1, 1)) & 0x3f) << 6;
105 $ax += (unpack("C", substr(@_[0], $pointer + 2, 1)) & 0x3f);
106 $result .= sprintf("u%x.", $ax);
109 elsif(0xf0 <= $first && $first <= 0xf7){ # 4 bytes
110 $ax = (unpack("C", substr(@_[0], $pointer, 1)) & 0x07) << 18;
111 $ax += (unpack("C", substr(@_[0], $pointer + 1, 1)) & 0x3f) << 12;
112 $ax += (unpack("C", substr(@_[0], $pointer + 2, 1)) & 0x3f) << 6;
113 $ax += (unpack("C", substr(@_[0], $pointer + 3, 1)) & 0x3f);
114 $result .= sprintf("u%x.", $ax);
117 elsif(0xf8 <= $first && $first <= 0xfb){ # 5 bytes
118 $ax = (unpack("C", substr(@_[0], $pointer, 1)) & 0x03) << 24;
119 $ax += (unpack("C", substr(@_[0], $pointer + 1, 1)) & 0x3f) << 18;
120 $ax += (unpack("C", substr(@_[0], $pointer + 2, 1)) & 0x3f) << 12;
121 $ax += (unpack("C", substr(@_[0], $pointer + 3, 1)) & 0x3f) << 6;
122 $ax += (unpack("C", substr(@_[0], $pointer + 4, 1)) & 0x3f);
123 $result .= sprintf("u%x.", $ax);
126 elsif(0xfc <= $first && $first <= 0xfd){ # 6 bytes
127 $ax = (unpack("C", substr(@_[0], $pointer, 1)) & 0x01) << 30;
128 $ax += (unpack("C", substr(@_[0], $pointer + 1, 1)) & 0x3f) << 24;
129 $ax += (unpack("C", substr(@_[0], $pointer + 2, 1)) & 0x3f) << 18;
130 $ax += (unpack("C", substr(@_[0], $pointer + 3, 1)) & 0x3f) << 12;
131 $ax += (unpack("C", substr(@_[0], $pointer + 4, 1)) & 0x3f) << 6;
132 $ax += (unpack("C", substr(@_[0], $pointer + 5, 1)) & 0x3f);
133 $result .= sprintf("u%x.", $ax);
139 chop($result); # remove the last period char.