X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=Chise_utils%2FChise_utils.pm;h=0cdd999a40b6f9903610d4baffd63e08f672d8a3;hb=b0e3aa47a0161d8ba937ec39838c9d1d71bf0f1f;hp=c2a89bf507fbe9826fb70f17eb785e6a3ea695e0;hpb=2f9065adb84d8a6be5bce9170757b86d6fbecd1b;p=chise%2Fperl.git diff --git a/Chise_utils/Chise_utils.pm b/Chise_utils/Chise_utils.pm index c2a89bf..0cdd999 100644 --- a/Chise_utils/Chise_utils.pm +++ b/Chise_utils/Chise_utils.pm @@ -8,8 +8,9 @@ require Exporter; use utf8; use BerkeleyDB; -use vars qw(%db %chardb $atr - $idc +use vars qw(%db %chardb + %reverse_db %reverse_chardb + $atr $idc ); our @ISA = qw(Exporter); @@ -22,7 +23,11 @@ our @ISA = qw(Exporter); # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK # will save memory. our %EXPORT_TAGS = ( 'all' => [ qw( - %db %chardb $idc + %db %chardb + %reverse_db %reverse_chardb + $idc + &get_db + &get_reverse_db &get_char_attribute &get_chars_matching &get_chars_containing @@ -42,14 +47,20 @@ our $VERSION = '0.01'; # Preloaded methods go here. my $DB_HOME=""; -if(-e '/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db/system-char-id'){ - $DB_HOME='/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db/system-char-id'; -}elsif(-e 'd:/work/chise/char-db/system-char-id'){ - $DB_HOME='d:/work/chise/char-db/system-char-id'; -}elsif(-e '/usr/local/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db/system-char-id'){ - $DB_HOME='/usr/local/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db/system-char-id'; -}elsif(-e '/usr/local/xemacs-utf2000/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db/system-char-id'){ - $DB_HOME='/usr/local/xemacs-utf2000/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db/system-char-id'; +if(-e '/usr/local/lib/xemacs-21.4.11/i686-pc-linux/char-db'){ + $DB_HOME='/usr/local/lib/xemacs-21.4.11/i686-pc-linux/char-db'; +}elsif(-e '/usr/local/lib/xemacs-21.4.11/powerpc-apple-darwin6.4/char-db'){ + $DB_HOME='/usr/local/lib/xemacs-21.4.11/powerpc-apple-darwin6.4/char-db'; +}elsif(-e '/usr/local/xemacs-utf2000/lib/xemacs-21.4.11/powerpc-apple-darwin6.4/char-db'){ + $DB_HOME='/usr/local/xemacs-utf2000/lib/xemacs-21.4.11/powerpc-apple-darwin6.4/char-db'; +}elsif(-e '/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db'){ + $DB_HOME='/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db'; +}elsif(-e '/usr/local/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db'){ + $DB_HOME='/usr/local/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db'; +}elsif(-e '/usr/local/xemacs-utf2000/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db'){ + $DB_HOME='/usr/local/xemacs-utf2000/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db'; +}elsif(-e 'd:/work/chise/char-db'){ + $DB_HOME='d:/work/chise/char-db'; }else{ print STDERR "No database found.\n"; print STDERR "Pleas set \$DB_HOME to Chise_utils.pm.\n"; @@ -58,26 +69,80 @@ if(-e '/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db/system-char-id'){ $idc="\x{2ff0}-\x{2fff}"; -for (<$DB_HOME/*>){ +my %er_alias = + ('C1','chinese-cns11643-1', + 'C2','chinese-cns11643-2', + 'C3','chinese-cns11643-3', + 'C4','chinese-cns11643-4', + 'C5','chinese-cns11643-5', + 'C6','chinese-cns11643-6', + 'C7','chinese-cns11643-7', + 'CB','ideograph-cbeta', + 'CDP','chinese-big5-cdp', + 'GT','ideograph-gt', + 'GT-K','ideograph-gt', + 'HZK1','ideograph-hanziku-1', + 'HZK2','ideograph-hanziku-2', + 'HZK3','ideograph-hanziku-3', + 'HZK4','ideograph-hanziku-4', + 'HZK5','ideograph-hanziku-5', + 'HZK6','ideograph-hanziku-6', + 'HZK7','ideograph-hanziku-7', + 'HZK8','ideograph-hanziku-8', + 'HZK9','ideograph-hanziku-9', + 'HZK10','ideograph-hanziku-10', + 'HZK11','ideograph-hanziku-11', + 'HZK12','ideograph-hanziku-12', + 'J78','japanese-jisx0208-1978', + 'J83','japanese-jisx0208', + 'J90','japanese-jisx0208-1990', + 'JSP','japanese-jisx0212', + 'JX1','japanese-jisx0213-1', + 'JX2','japanese-jisx0213-2', + 'K0','korean-ksc5601', + 'M','ideograph-daikanwa', + ); + +for (glob "$DB_HOME/system-char-id/*"){ next if(/\.txt$/); $atr=$_; - $atr=~s!$DB_HOME/!!; + $atr=~s!$DB_HOME/system-char-id/!!; $db{$atr}=$_; } -foreach $atr (keys %db){ +for (glob "$DB_HOME/*"){ + next if(/\.txt$/ or /system-char-id/); + $atr=$_; + $atr=~s!$DB_HOME/!!; + $reverse_db{$atr}=$_."/system-char-id"; +} + +sub get_db{ + my($atr)=@_; + return 1 if(defined(%{$chardb{$atr}})); if(defined($db{$atr}) and -f $db{$atr}){ tie %{$chardb{$atr}}, "BerkeleyDB::Hash", -Filename => $db{$atr}; }else{ - print STDERR "no target\n"; - exit 1; + return undef; + } +} + +sub get_reverse_db{ + my($atr)=@_; + return 1 if(defined(%{$reverse_chardb{$atr}})); + if(defined($reverse_db{$atr}) and -f $reverse_db{$atr}){ + tie %{$reverse_chardb{$atr}}, "BerkeleyDB::Hash", + -Filename => $reverse_db{$atr}; + }else{ + return undef; } } sub get_char_attribute{ my($char,$atr)=@_; + &get_db($atr) or return ""; if($chardb{$atr}->{"?$char"}){ return $chardb{$atr}->{"?$char"}; }else{ @@ -88,7 +153,7 @@ sub get_char_attribute{ sub get_chars_containing{ my($atr,$value)=@_; my($char,@res); - if(defined(%{$chardb{$atr}})){ + if(&get_db($atr)){ foreach $char (keys %{$chardb{$atr}}){ if($chardb{$atr}->{$char}=~/$value/){ $char=~s/^\?//; @@ -102,14 +167,26 @@ sub get_chars_containing{ sub get_chars_matching{ my($atr,$value)=@_; my($char,@res); - if(defined(%{$chardb{$atr}})){ - foreach $char (keys %{$chardb{$atr}}){ - if($chardb{$atr}->{$char} eq $value){ + if(defined($reverse_db{$atr})){ + if(&get_reverse_db($atr)){ + if($char=$reverse_chardb{$atr}->{$value}){ $char=~s/^\?//; push @res,$char; } } } + # else{ + # fall back if DB inconsistency exists. + unless(@res){ + if(&get_db($atr)){ + foreach $char (keys %{$chardb{$atr}}){ + if($chardb{$atr}->{$char} eq $value){ + $char=~s/^\?//; + push @res,$char; + } + } + } + } return @res; } @@ -119,16 +196,16 @@ sub get_chars_for{ my(%res,@res,$atr,$value); my $i=0; foreach $query (@q){ - if($query=~/==/){ - ($atr,$value)=split("==",$query,2); + if($query=~/=~/){ + ($atr,$value)=split("=~",$query,2); $i++; - foreach (&get_chars_matching($atr,$value)){ + foreach (&get_chars_containing($atr,$value)){ $res{$_}++; } - }elsif($query=~/=~/){ - ($atr,$value)=split("=~",$query,2); + }elsif($query=~/==/){ + ($atr,$value)=split(/==/,$query,2); $i++; - foreach (&get_chars_containing($atr,$value)){ + foreach (&get_chars_matching($atr,$value)){ $res{$_}++; } } @@ -142,33 +219,25 @@ sub get_chars_for{ } sub de_er{ - my($char)=@_; - if($char=~/^\d+$/){ - $char=pack("U",$char); - }elsif($char=~/U[\+\-](\d+)/){ - $char=pack("U",$1); - }elsif($char=~m/CDP\-(\d+)/){ - # chinese-big5-cdp CDP- 4 X), - # ideograph-daikanwa M- 5 d), - # ideograph-cbeta CB 5 d), - # ideograph-gt GT- 5 d), - # ideograph-gt-k GT-K 5 d), - # japanese-jisx0208-1990 J90- 4 X), - # japanese-jisx0208 J83- 4 X), - # japanese-jisx0213-1 JX1- 4 X), - # japanese-jisx0213-2 JX2- 4 X), - # japanese-jisx0212 JSP- 4 X), - # japanese-jisx0208-1978 J78- 4 X), - # chinese-cns11643-1 C1- 4 X), - # chinese-cns11643-2 C2- 4 X), - # chinese-cns11643-3 C3- 4 X), - # chinese-cns11643-4 C4- 4 X), - # chinese-cns11643-5 C5- 4 X), - # chinese-cns11643-6 C6- 4 X), - # chinese-cns11643-7 C7- 4 X), - # korean-ksc5601 K0- 4 X), + my($er)=@_; + my($output_char,$atr,$value); + my $keys = join '|', keys %er_alias; + if($er=~/^\d+$/){ + $output_char=pack("U",$er); + }elsif($er=~/^U[\+\-]([a-fA-F\d]+)/){ + $output_char=pack("U",hex($1)); + }elsif($er=~/(?:I\-)?($keys)\-?([0-9a-fA-F]+)/){ + ($atr,$value)=($1,$2); + unless($er_alias{$atr}=~/daikanwa|gt/){ + $value=hex($value); + } + ($output_char)=&get_chars_matching($er_alias{$atr},$value); + } + if($output_char){ + return $output_char; + }else{ + return $er; } - return $char; } sub ids_argc{