From: Koichi KAMICHI Date: Thu, 25 Nov 2004 14:04:37 +0000 (+0000) Subject: Initial revision X-Git-Url: http://git.chise.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0db94c8a3c790234f2b05fd1626dd1f759a7ee52;p=chise%2Fkage.git Initial revision --- diff --git a/kagecgi/makeidsdb.pl b/kagecgi/makeidsdb.pl new file mode 100644 index 0000000..13ea6b4 --- /dev/null +++ b/kagecgi/makeidsdb.pl @@ -0,0 +1,142 @@ +############################################################################### +# makeidsdb.pl +# +# create DB file for KAGE/cgi from CHISE-IDS *.txt +# by Koichi Kamichi +############################################################################### + +if($#ARGV != 0 || !(-d $ARGV[0])){ + print "Usage: perl makeidsdb.pl ids_dir\n"; + exit; +} +$idsdir = $ARGV[0]; +$idsdir =~ s/\/?$//; # remove the last slash + +if(-e "idsdb"){ + print "Remove old idsdb first.\n"; + exit; +} + +use Fcntl; +use BerkeleyDB; + +tie %db, "BerkeleyDB::Hash", -Filename => "idsdb", -Flags => DB_CREATE +or die "An error occured at ceating DB file.\n"; +%db = (); + +opendir(DH, "$ARGV[0]"); +@dirlist = readdir(DH); + +$counter = 1; + +foreach(@dirlist){ + if(m/IDS-.*\.txt/){ + + open(FH, "<$ARGV[0]/$_"); + @buffer = ; + if($buffer[0] =~ m/utf-8-mcs-er/){ + foreach(@buffer){ + + $_ =~ m/(.*?)[\s]?\t(.*)\t(.*)\n/; # remove space after Daikanwa's No. + $key = &properucs2kageucs($1); + $value = &utf8er2kageer($3); + if(($key ne $value) && ($value ne "")){ + print "\rcreating ... [$counter] key : $key "; + $db{$key} = $value; + $counter++; + } + + } + } + close(FH); + + } +} + +closedir(DH); +untie(%db); +print "\rdone.\n"; + +############################################################################### + +sub properucs2kageucs{ + my $ax = @_[0]; + + $ax =~ s/\'/s/; # replace single quote for Daikanwa No. + $ax =~ s/\"/d/; # replace double quote for Daikanwa No. + $ax =~ s/U\+/u/; + $ax =~ s/U\-0*/u/; + + return lc($ax); +} + +sub utf8er2kageer{ + my $result, $first, $semicolon, $ax; + + $result = ""; + + for(my $pointer = 0; $pointer < length(@_[0]);){ + $first = unpack("C", substr(@_[0], $pointer, 1)); + if(0x00 <= $first && $first <= 0x7f){ # 1 byte + + # the top letter is must be "&" + if($first == 0x26){ + $semicolon = index(@_[0], ';', $pointer); + $ax = substr(@_[0], $pointer + 1, $semicolon - $pointer - 1)."."; + $ax =~ s/^I-//; # remove "I-" + $ax =~ s/\'/s/; # replace single quote for Daikanwa No. + $ax =~ s/\"/d/; # replace double quote for Daikanwa No. + $result .= $ax; + + $pointer = $semicolon + 1; + } + else{ + $pointer += 1; + } + + } + elsif(0xc0 <= $first && $first <= 0xdf){ # 2 bytes + # may not be exist + $pointer += 2; + } + elsif(0xe0 <= $first && $first <= 0xef){ # 3 bytes + $ax = (unpack("C", substr(@_[0], $pointer, 1)) & 0x0f) << 12; + $ax += (unpack("C", substr(@_[0], $pointer + 1, 1)) & 0x3f) << 6; + $ax += (unpack("C", substr(@_[0], $pointer + 2, 1)) & 0x3f); + $result .= sprintf("u%x.", $ax); + $pointer += 3; + } + elsif(0xf0 <= $first && $first <= 0xf7){ # 4 bytes + $ax = (unpack("C", substr(@_[0], $pointer, 1)) & 0x07) << 18; + $ax += (unpack("C", substr(@_[0], $pointer + 1, 1)) & 0x3f) << 12; + $ax += (unpack("C", substr(@_[0], $pointer + 2, 1)) & 0x3f) << 6; + $ax += (unpack("C", substr(@_[0], $pointer + 3, 1)) & 0x3f); + $result .= sprintf("u%x.", $ax); + $pointer += 4; + } + elsif(0xf8 <= $first && $first <= 0xfb){ # 5 bytes + $ax = (unpack("C", substr(@_[0], $pointer, 1)) & 0x03) << 24; + $ax += (unpack("C", substr(@_[0], $pointer + 1, 1)) & 0x3f) << 18; + $ax += (unpack("C", substr(@_[0], $pointer + 2, 1)) & 0x3f) << 12; + $ax += (unpack("C", substr(@_[0], $pointer + 3, 1)) & 0x3f) << 6; + $ax += (unpack("C", substr(@_[0], $pointer + 4, 1)) & 0x3f); + $result .= sprintf("u%x.", $ax); + $pointer += 5; + } + elsif(0xfc <= $first && $first <= 0xfd){ # 6 bytes + $ax = (unpack("C", substr(@_[0], $pointer, 1)) & 0x01) << 30; + $ax += (unpack("C", substr(@_[0], $pointer + 1, 1)) & 0x3f) << 24; + $ax += (unpack("C", substr(@_[0], $pointer + 2, 1)) & 0x3f) << 18; + $ax += (unpack("C", substr(@_[0], $pointer + 3, 1)) & 0x3f) << 12; + $ax += (unpack("C", substr(@_[0], $pointer + 4, 1)) & 0x3f) << 6; + $ax += (unpack("C", substr(@_[0], $pointer + 5, 1)) & 0x3f); + $result .= sprintf("u%x.", $ax); + $pointer += 6; + } + + } + + chop($result); # remove the last period char. + return lc($result); + +}