X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=Chise_utils%2FChise_utils.pm;h=0cdd999a40b6f9903610d4baffd63e08f672d8a3;hb=b0e3aa47a0161d8ba937ec39838c9d1d71bf0f1f;hp=e5f54b3a364de6f3d44259ac23f2a7833bd4727d;hpb=26086c337b2d87e64af5fa8a9377a7b8af84dd46;p=chise%2Fperl.git diff --git a/Chise_utils/Chise_utils.pm b/Chise_utils/Chise_utils.pm index e5f54b3..0cdd999 100644 --- a/Chise_utils/Chise_utils.pm +++ b/Chise_utils/Chise_utils.pm @@ -8,8 +8,9 @@ require Exporter; use utf8; use BerkeleyDB; -use vars qw(%db %chardb $atr - $idc +use vars qw(%db %chardb + %reverse_db %reverse_chardb + $atr $idc ); our @ISA = qw(Exporter); @@ -22,7 +23,11 @@ our @ISA = qw(Exporter); # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK # will save memory. our %EXPORT_TAGS = ( 'all' => [ qw( - %db %chardb $idc + %db %chardb + %reverse_db %reverse_chardb + $idc + &get_db + &get_reverse_db &get_char_attribute &get_chars_matching &get_chars_containing @@ -42,14 +47,20 @@ our $VERSION = '0.01'; # Preloaded methods go here. my $DB_HOME=""; -if(-e '/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db/system-char-id'){ - $DB_HOME='/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db/system-char-id'; -}elsif(-e 'd:/work/chise/char-db/system-char-id'){ - $DB_HOME='d:/work/chise/char-db/system-char-id'; -}elsif(-e '/usr/local/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db/system-char-id'){ - $DB_HOME='/usr/local/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db/system-char-id'; -}elsif(-e '/usr/local/xemacs-utf2000/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db/system-char-id'){ - $DB_HOME='/usr/local/xemacs-utf2000/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db/system-char-id'; +if(-e '/usr/local/lib/xemacs-21.4.11/i686-pc-linux/char-db'){ + $DB_HOME='/usr/local/lib/xemacs-21.4.11/i686-pc-linux/char-db'; +}elsif(-e '/usr/local/lib/xemacs-21.4.11/powerpc-apple-darwin6.4/char-db'){ + $DB_HOME='/usr/local/lib/xemacs-21.4.11/powerpc-apple-darwin6.4/char-db'; +}elsif(-e '/usr/local/xemacs-utf2000/lib/xemacs-21.4.11/powerpc-apple-darwin6.4/char-db'){ + $DB_HOME='/usr/local/xemacs-utf2000/lib/xemacs-21.4.11/powerpc-apple-darwin6.4/char-db'; +}elsif(-e '/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db'){ + $DB_HOME='/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db'; +}elsif(-e '/usr/local/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db'){ + $DB_HOME='/usr/local/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db'; +}elsif(-e '/usr/local/xemacs-utf2000/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db'){ + $DB_HOME='/usr/local/xemacs-utf2000/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db'; +}elsif(-e 'd:/work/chise/char-db'){ + $DB_HOME='d:/work/chise/char-db'; }else{ print STDERR "No database found.\n"; print STDERR "Pleas set \$DB_HOME to Chise_utils.pm.\n"; @@ -58,26 +69,80 @@ if(-e '/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db/system-char-id'){ $idc="\x{2ff0}-\x{2fff}"; -for (<$DB_HOME/*>){ +my %er_alias = + ('C1','chinese-cns11643-1', + 'C2','chinese-cns11643-2', + 'C3','chinese-cns11643-3', + 'C4','chinese-cns11643-4', + 'C5','chinese-cns11643-5', + 'C6','chinese-cns11643-6', + 'C7','chinese-cns11643-7', + 'CB','ideograph-cbeta', + 'CDP','chinese-big5-cdp', + 'GT','ideograph-gt', + 'GT-K','ideograph-gt', + 'HZK1','ideograph-hanziku-1', + 'HZK2','ideograph-hanziku-2', + 'HZK3','ideograph-hanziku-3', + 'HZK4','ideograph-hanziku-4', + 'HZK5','ideograph-hanziku-5', + 'HZK6','ideograph-hanziku-6', + 'HZK7','ideograph-hanziku-7', + 'HZK8','ideograph-hanziku-8', + 'HZK9','ideograph-hanziku-9', + 'HZK10','ideograph-hanziku-10', + 'HZK11','ideograph-hanziku-11', + 'HZK12','ideograph-hanziku-12', + 'J78','japanese-jisx0208-1978', + 'J83','japanese-jisx0208', + 'J90','japanese-jisx0208-1990', + 'JSP','japanese-jisx0212', + 'JX1','japanese-jisx0213-1', + 'JX2','japanese-jisx0213-2', + 'K0','korean-ksc5601', + 'M','ideograph-daikanwa', + ); + +for (glob "$DB_HOME/system-char-id/*"){ next if(/\.txt$/); $atr=$_; - $atr=~s!$DB_HOME/!!; + $atr=~s!$DB_HOME/system-char-id/!!; $db{$atr}=$_; } -foreach $atr (keys %db){ +for (glob "$DB_HOME/*"){ + next if(/\.txt$/ or /system-char-id/); + $atr=$_; + $atr=~s!$DB_HOME/!!; + $reverse_db{$atr}=$_."/system-char-id"; +} + +sub get_db{ + my($atr)=@_; + return 1 if(defined(%{$chardb{$atr}})); if(defined($db{$atr}) and -f $db{$atr}){ tie %{$chardb{$atr}}, "BerkeleyDB::Hash", -Filename => $db{$atr}; }else{ - print STDERR "no target\n"; - exit 1; + return undef; + } +} + +sub get_reverse_db{ + my($atr)=@_; + return 1 if(defined(%{$reverse_chardb{$atr}})); + if(defined($reverse_db{$atr}) and -f $reverse_db{$atr}){ + tie %{$reverse_chardb{$atr}}, "BerkeleyDB::Hash", + -Filename => $reverse_db{$atr}; + }else{ + return undef; } } sub get_char_attribute{ my($char,$atr)=@_; + &get_db($atr) or return ""; if($chardb{$atr}->{"?$char"}){ return $chardb{$atr}->{"?$char"}; }else{ @@ -88,7 +153,7 @@ sub get_char_attribute{ sub get_chars_containing{ my($atr,$value)=@_; my($char,@res); - if(defined(%{$chardb{$atr}})){ + if(&get_db($atr)){ foreach $char (keys %{$chardb{$atr}}){ if($chardb{$atr}->{$char}=~/$value/){ $char=~s/^\?//; @@ -102,14 +167,26 @@ sub get_chars_containing{ sub get_chars_matching{ my($atr,$value)=@_; my($char,@res); - if(defined(%{$chardb{$atr}})){ - foreach $char (keys %{$chardb{$atr}}){ - if($chardb{$atr}->{$char} eq $value){ + if(defined($reverse_db{$atr})){ + if(&get_reverse_db($atr)){ + if($char=$reverse_chardb{$atr}->{$value}){ $char=~s/^\?//; push @res,$char; } } } + # else{ + # fall back if DB inconsistency exists. + unless(@res){ + if(&get_db($atr)){ + foreach $char (keys %{$chardb{$atr}}){ + if($chardb{$atr}->{$char} eq $value){ + $char=~s/^\?//; + push @res,$char; + } + } + } + } return @res; } @@ -125,8 +202,8 @@ sub get_chars_for{ foreach (&get_chars_containing($atr,$value)){ $res{$_}++; } - }elsif($query=~/=/){ - ($atr,$value)=split(/=+/,$query,2); + }elsif($query=~/==/){ + ($atr,$value)=split(/==/,$query,2); $i++; foreach (&get_chars_matching($atr,$value)){ $res{$_}++; @@ -143,68 +220,18 @@ sub get_chars_for{ sub de_er{ my($er)=@_; - my($output_char); + my($output_char,$atr,$value); + my $keys = join '|', keys %er_alias; if($er=~/^\d+$/){ $output_char=pack("U",$er); }elsif($er=~/^U[\+\-]([a-fA-F\d]+)/){ $output_char=pack("U",hex($1)); - }elsif($er=~m/^CDP\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("chinese-big5-cdp",$1); - # chinese-big5-cdp CDP- 4 X), - }elsif($er=~m/^M\-([\d]+)/){ - ($output_char)=&get_chars_matching("ideograph-daikanwa",$1); - # ideograph-daikanwa M- 5 d), - }elsif($er=~m/^CB\-([\d]+)/){ - ($output_char)=&get_chars_matching("ideograph-cbeta",$1); - # ideograph-cbeta CB 5 d), - }elsif($er=~m/^GT\-([\d]+)/){ - ($output_char)=&get_chars_matching("ideograph-gt",$1); - # ideograph-gt GT- 5 d), - }elsif($er=~m/^GT\-K\-([\d]+)/){ - ($output_char)=&get_chars_matching("ideograph-gt-k",$1); - # ideograph-gt-k GT-K 5 d), - }elsif($er=~m/^J90\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("japanese-jisx0208-1990",$1); - # japanese-jisx0208-1990 J90- 4 X), - }elsif($er=~m/^J83\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("japanese-jisx0208",$1); - # japanese-jisx0208 J83- 4 X), - }elsif($er=~m/^JX1\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("japanese-jisx0213-1",$1); - # japanese-jisx0213-1 JX1- 4 X), - }elsif($er=~m/^JX2\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("japanese-jisx0213-2",$1); - # japanese-jisx0213-2 JX2- 4 X), - }elsif($er=~m/^JSP\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("japanese-jisx0212",$1); - # japanese-jisx0212 JSP- 4 X), - }elsif($er=~m/^J78\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("japanese-jisx0208-1978",$1); - # japanese-jisx0208-1978 J78- 4 X), - }elsif($er=~m/^C1\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("chinese-cns11643-1",$1); - # chinese-cns11643-1 C1- 4 X), - }elsif($er=~m/^C2\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("chinese-cns11643-2",$1); - # chinese-cns11643-2 C2- 4 X), - }elsif($er=~m/^C3\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("chinese-cns11643-3",$1); - # chinese-cns11643-3 C3- 4 X), - }elsif($er=~m/^C4\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("chinese-cns11643-4",$1); - # chinese-cns11643-4 C4- 4 X), - }elsif($er=~m/^C5\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("chinese-cns11643-5",$1); - # chinese-cns11643-5 C5- 4 X), - }elsif($er=~m/^C6\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("chinese-cns11643-6",$1); - # chinese-cns11643-6 C6- 4 X), - }elsif($er=~m/^C7\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("chinese-cns11643-7",$1); - # chinese-cns11643-7 C7- 4 X), - }elsif($er=~m/^K0\-([a-fA-F\d]+)/){ - ($output_char)=&get_chars_matching("korean-ksc5601",$1); - # korean-ksc5601 K0- 4 X), + }elsif($er=~/(?:I\-)?($keys)\-?([0-9a-fA-F]+)/){ + ($atr,$value)=($1,$2); + unless($er_alias{$atr}=~/daikanwa|gt/){ + $value=hex($value); + } + ($output_char)=&get_chars_matching($er_alias{$atr},$value); } if($output_char){ return $output_char;