X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=Chise_utils%2FChise_utils.pm;h=2f40e7548613a63e5bb56ee7a9b1540fae550904;hb=e0d14956e6b4179eaa2f58b5b6398eb51d5c79f8;hp=3338922b759da62d7b9b31b79d7fd9495128f42b;hpb=68f33c220db66e047d2686f4ae87040a50bef2f3;p=chise%2Fperl.git diff --git a/Chise_utils/Chise_utils.pm b/Chise_utils/Chise_utils.pm index 3338922..2f40e75 100644 --- a/Chise_utils/Chise_utils.pm +++ b/Chise_utils/Chise_utils.pm @@ -8,7 +8,9 @@ require Exporter; use utf8; use BerkeleyDB; -use vars qw(%db %chardb $atr +use vars qw(%db %chardb + %reverse_db %reverse_chardb + $atr $idc ); our @ISA = qw(Exporter); @@ -22,12 +24,16 @@ our @ISA = qw(Exporter); # will save memory. our %EXPORT_TAGS = ( 'all' => [ qw( %db %chardb + %reverse_db %reverse_chardb + $idc + &get_db + &get_reverse_db &get_char_attribute &get_chars_matching &get_chars_containing - &get_chars_matching &get_chars_for &de_er + &ids_argc ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); @@ -40,53 +46,134 @@ our $VERSION = '0.01'; # Preloaded methods go here. my $DB_HOME=""; -if(-e '/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db/system-char-id'){ - $DB_HOME='/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db/system-char-id'; -}elsif(-e 'd:/work/chise/char-db/system-char-id'){ - $DB_HOME='d:/work/chise/char-db/system-char-id'; -}elsif(-e '/usr/local/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db/system-char-id'){ - $DB_HOME='/usr/local/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db/system-char-id'; -}elsif(-e '/usr/local/xemacs-utf2000/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db/system-char-id'){ - $DB_HOME='/usr/local/xemacs-utf2000/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db/system-char-id'; +unless($DB_HOME){ + if(-e '/usr/local/lib/chise/db'){ + $DB_HOME='/usr/local/lib/chise/db'; + }elsif(-e '/usr/lib/chise/db'){ + $DB_HOME='/usr/lib/chise/db'; + }elsif(-e '/sw/lib/chise/db'){ + $DB_HOME='/sw/lib/chise/db'; + }elsif(-e '/usr/local/lib/chise/char-db'){ + $DB_HOME='/usr/local/lib/chise/char-db'; + }elsif(-e 'd:/work/chise/char-db'){ + $DB_HOME='d:/work/chise/char-db'; + }else{ + print STDERR "No database found.\n"; + print STDERR "Pleas set \$DB_HOME to Chise_utils.pm.\n"; + exit 1; + } +} + +$idc="\x{2ff0}-\x{2fff}"; + +my %er_alias = + ('C1','=cns11643-1', + 'C2','=cns11643-2', + 'C3','=cns11643-3', + 'C4','=cns11643-4', + 'C5','=cns11643-5', + 'C6','=cns11643-6', + 'C7','=cns11643-7', + 'CB','=cbeta', + 'CDP','=big5-cdp', + 'GT','=gt', + 'GT-K','=gt-k', + 'HZK1','=hanziku-1', + 'HZK2','=hanziku-2', + 'HZK3','=hanziku-3', + 'HZK4','=hanziku-4', + 'HZK5','=hanziku-5', + 'HZK6','=hanziku-6', + 'HZK7','=hanziku-7', + 'HZK8','=hanziku-8', + 'HZK9','=hanziku-9', + 'HZK10','=hanziku-10', + 'HZK11','=hanziku-11', + 'HZK12','=hanziku-12', + 'J78','=jisx0208-1978', + 'J83','=jisx0208', + 'J90','=jisx0208-1990', + 'JSP','=jisx0212', + 'JX1','=jisx0213-1', + 'JX2','=jisx0213-2', + 'K0','=ks-x1001', + 'M','=daikanwa', + ); + +my $er_prefix_re=join '|', keys %er_alias; + +if(-d "$DB_HOME/character"){ + for (glob "$DB_HOME/character/feature/*"){ + next if(/\.txt$/); + $atr=$_; + $atr=~s!$DB_HOME/character/feature/!!; + $db{$atr}=$_; + } + for (glob "$DB_HOME/character/by-feature/*"){ + next if(/\.txt$/); + $atr=$_; + $atr=~s!$DB_HOME/character/feature/!!; + $reverse_db{$atr}=$_; + } +}elsif(-d "$DB_HOME/system-char-id"){ + for (glob "$DB_HOME/system-char-id/*"){ + next if(/\.txt$/); + $atr=$_; + $atr=~s!$DB_HOME/system-char-id/!!; + $db{$atr}=$_; + } + for (glob "$DB_HOME/*"){ + next if(/\.txt$/ or /system-char-id/); + $atr=$_; + $atr=~s!$DB_HOME/!!; + $reverse_db{$atr}=$_."/system-char-id"; + } }else{ print STDERR "No database found.\n"; - print STDERR "Pleas set \$DB_HOME to Chise_utils.pm.\n"; + print STDERR "Pleas set \$DB_HOME to Chise_utils.pm correctly.\n"; exit 1; } -for (<$DB_HOME/*>){ - next if(/\.txt$/); - $atr=$_; - $atr=~s!$DB_HOME/!!; - $db{$atr}=$_; -} - -foreach $atr (keys %db){ +sub get_db{ + my($atr)=@_; + return 1 if(defined(%{$chardb{$atr}})); if(defined($db{$atr}) and -f $db{$atr}){ tie %{$chardb{$atr}}, "BerkeleyDB::Hash", -Filename => $db{$atr}; }else{ - print STDERR "no target\n"; - exit 1; + return undef; + } +} + +sub get_reverse_db{ + my($atr)=@_; + return 1 if(defined(%{$reverse_chardb{$atr}})); + if(defined($reverse_db{$atr}) and -f $reverse_db{$atr}){ + tie %{$reverse_chardb{$atr}}, "BerkeleyDB::Hash", + -Filename => $reverse_db{$atr}; + }else{ + return undef; } } sub get_char_attribute{ my($char,$atr)=@_; + &get_db($atr) or return ""; if($chardb{$atr}->{"?$char"}){ return $chardb{$atr}->{"?$char"}; }else{ - return "no attribute for $char"; + return ""; } } sub get_chars_containing{ my($atr,$value)=@_; my($char,@res); - if(defined(%{$chardb{$atr}})){ + if(&get_db($atr)){ foreach $char (keys %{$chardb{$atr}}){ if($chardb{$atr}->{$char}=~/$value/){ + $char=~s/^\?//; push @res,$char; } } @@ -97,13 +184,27 @@ sub get_chars_containing{ sub get_chars_matching{ my($atr,$value)=@_; my($char,@res); - if(defined(%{$chardb{$atr}})){ - foreach $char (keys %{$chardb{$atr}}){ - if($chardb{$atr}->{$char}=~/^$value$/){ + if(defined($reverse_db{$atr})){ + if(&get_reverse_db($atr)){ + if($char=$reverse_chardb{$atr}->{$value}){ + $char=~s/^\?//; push @res,$char; } } } + else{ +# never fall back. +# unless(@res){ +# # fall back if DB inconsistency exists. + if(&get_db($atr)){ + foreach $char (keys %{$chardb{$atr}}){ + if($chardb{$atr}->{$char} eq $value){ + $char=~s/^\?//; + push @res,$char; + } + } + } + } return @res; } @@ -113,56 +214,58 @@ sub get_chars_for{ my(%res,@res,$atr,$value); my $i=0; foreach $query (@q){ - if($query=~/==/){ - ($atr,$value)=split("==",$query,2); + if($query=~/=~/){ + ($atr,$value)=split("=~",$query,2); $i++; - foreach (&get_chars_matching($atr,$value)){ + foreach (&get_chars_containing($atr,$value)){ $res{$_}++; } - }elsif($query=~/=~/){ - ($atr,$value)=split("=~",$query,2); + }elsif($query=~/==/){ + ($atr,$value)=split(/==/,$query,2); $i++; - foreach (&get_chars_containing($atr,$value)){ + foreach (&get_chars_matching($atr,$value)){ $res{$_}++; } } } - foreach (keys %res){ - if($res{$_}==$i){ - push @res,$_; + return grep {defined($res{$_}) and $res{$_}==$i} (keys %res); +} + +sub de_er{ + my($er)=@_; + my($output_char,$atr,$value); + my($prefix,$suffix); + $er=~/^(amp|&)?(.+?)(;)?$/ + and $prefix=$1,$er=$2,$suffix=$3; + $prefix or $prefix="",$suffix or $suffix=""; + if($er=~/^\d+$/){ + $output_char=pack("U",$er); + }elsif($er=~/^U[\+\-]([a-fA-F\d]+)/){ + $output_char=pack("U",hex($1)); + }elsif($er=~/(?:I\-)?($er_prefix_re)\-?([0-9a-fA-F]+)/){ + ($atr,$value)=($1,$2); + unless($er_alias{$atr}=~/daikanwa|gt/){ + $value=hex($value); } + ($output_char)=&get_chars_matching($er_alias{$atr},$value); + } + if($output_char){ + return $output_char; + }else{ + return $prefix.$er.$suffix; } - return @res; } -sub de_er{ +sub ids_argc{ my($char)=@_; - if($char=~/^\d+$/){ - $char=pack("U",$char); - }elsif($char=~/U[\+\-](\d+)/){ - $char=pack("U",$1); - }elsif($char=~m/CDP\-(\d+)/){ - # chinese-big5-cdp CDP- 4 X), - # ideograph-daikanwa M- 5 d), - # ideograph-cbeta CB 5 d), - # ideograph-gt GT- 5 d), - # ideograph-gt-k GT-K 5 d), - # japanese-jisx0208-1990 J90- 4 X), - # japanese-jisx0208 J83- 4 X), - # japanese-jisx0213-1 JX1- 4 X), - # japanese-jisx0213-2 JX2- 4 X), - # japanese-jisx0212 JSP- 4 X), - # japanese-jisx0208-1978 J78- 4 X), - # chinese-cns11643-1 C1- 4 X), - # chinese-cns11643-2 C2- 4 X), - # chinese-cns11643-3 C3- 4 X), - # chinese-cns11643-4 C4- 4 X), - # chinese-cns11643-5 C5- 4 X), - # chinese-cns11643-6 C6- 4 X), - # chinese-cns11643-7 C7- 4 X), - # korean-ksc5601 K0- 4 X), + my $char_id=unpack("U",$char); + if($char_id==0x2ff2 or $char_id==0x2ff3){ + return 3; + }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){ + return 2; + }else{ + return 0; } - return $char; } # Autoload methods go after =cut, and are processed by the autosplit program.