change regular expression in &de_er
[chise/perl.git] / Chise_utils / Chise_utils.pm
index 193c8d8..5967978 100644 (file)
@@ -31,7 +31,6 @@ our %EXPORT_TAGS = ( 'all' => [ qw(
                                   &get_char_attribute
                                   &get_chars_matching
                                   &get_chars_containing
-                                  &get_chars_matching
                                   &get_chars_for
                                   &de_er
                                   &ids_argc
@@ -47,76 +46,92 @@ our $VERSION = '0.01';
 # Preloaded methods go here.
 
 my $DB_HOME="";
-if(-e '/usr/local/lib/chise/char-db'){
-    $DB_HOME='/usr/local/lib/chise/char-db';
-}elsif(-e '/usr/local/lib/xemacs-21.4.11/i686-pc-linux/char-db'){
-    $DB_HOME='/usr/local/lib/xemacs-21.4.11/i686-pc-linux/char-db';
-}elsif(-e '/usr/local/lib/xemacs-21.4.11/powerpc-apple-darwin6.4/char-db'){
-    $DB_HOME='/usr/local/lib/xemacs-21.4.11/powerpc-apple-darwin6.4/char-db';
-}elsif(-e '/usr/local/xemacs-utf2000/lib/xemacs-21.4.11/powerpc-apple-darwin6.4/char-db'){
-    $DB_HOME='/usr/local/xemacs-utf2000/lib/xemacs-21.4.11/powerpc-apple-darwin6.4/char-db';
-}elsif(-e '/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db'){
-    $DB_HOME='/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db';
-}elsif(-e '/usr/local/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db'){
-    $DB_HOME='/usr/local/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db';
-}elsif(-e '/usr/local/xemacs-utf2000/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db'){
-    $DB_HOME='/usr/local/xemacs-utf2000/lib/xemacs-21.4.10/powerpc-apple-darwin6.4/char-db';
-}elsif(-e 'd:/work/chise/char-db'){
-    $DB_HOME='d:/work/chise/char-db';
-}else{
-    print STDERR "No database found.\n";
-    print STDERR "Pleas set \$DB_HOME to Chise_utils.pm.\n";
-    exit 1;
+unless($DB_HOME){
+    if(-e '/usr/local/lib/chise/db'){
+       $DB_HOME='/usr/local/lib/chise/db';
+    }elsif(-e '/usr/lib/chise/db'){
+       $DB_HOME='/usr/lib/chise/db';
+    }elsif(-e '/sw/lib/chise/db'){
+       $DB_HOME='/sw/lib/chise/db';
+    }elsif(-e '/usr/local/lib/chise/char-db'){
+       $DB_HOME='/usr/local/lib/chise/char-db';
+    }elsif(-e 'd:/work/chise/char-db'){
+       $DB_HOME='d:/work/chise/char-db';
+    }else{
+       print STDERR "No database found.\n";
+       print STDERR "Pleas set \$DB_HOME to Chise_utils.pm.\n";
+       exit 1;
+    }
 }
 
 $idc="\x{2ff0}-\x{2fff}";
 
 my %er_alias =
-    ('C1','chinese-cns11643-1',
-     'C2','chinese-cns11643-2',
-     'C3','chinese-cns11643-3',
-     'C4','chinese-cns11643-4',
-     'C5','chinese-cns11643-5',
-     'C6','chinese-cns11643-6',
-     'C7','chinese-cns11643-7',
-     'CB','ideograph-cbeta',
-     'CDP','chinese-big5-cdp',
-     'GT','ideograph-gt',
-     'GT-K','ideograph-gt',
-     'HZK1','ideograph-hanziku-1',
-     'HZK2','ideograph-hanziku-2',
-     'HZK3','ideograph-hanziku-3',
-     'HZK4','ideograph-hanziku-4',
-     'HZK5','ideograph-hanziku-5',
-     'HZK6','ideograph-hanziku-6',
-     'HZK7','ideograph-hanziku-7',
-     'HZK8','ideograph-hanziku-8',
-     'HZK9','ideograph-hanziku-9',
-     'HZK10','ideograph-hanziku-10',
-     'HZK11','ideograph-hanziku-11',
-     'HZK12','ideograph-hanziku-12',
-     'J78','japanese-jisx0208-1978',
-     'J83','japanese-jisx0208',
-     'J90','japanese-jisx0208-1990',
-     'JSP','japanese-jisx0212',
-     'JX1','japanese-jisx0213-1',
-     'JX2','japanese-jisx0213-2',
-     'K0','korean-ksc5601',
-     'M','ideograph-daikanwa',
+    ('C1','=cns11643-1',
+     'C2','=cns11643-2',
+     'C3','=cns11643-3',
+     'C4','=cns11643-4',
+     'C5','=cns11643-5',
+     'C6','=cns11643-6',
+     'C7','=cns11643-7',
+     'CB','=cbeta',
+     'CDP','=big5-cdp',
+     'GT','=gt',
+     'GT-K','=gt-k',
+     'HZK1','=hanziku-1',
+     'HZK2','=hanziku-2',
+     'HZK3','=hanziku-3',
+     'HZK4','=hanziku-4',
+     'HZK5','=hanziku-5',
+     'HZK6','=hanziku-6',
+     'HZK7','=hanziku-7',
+     'HZK8','=hanziku-8',
+     'HZK9','=hanziku-9',
+     'HZK10','=hanziku-10',
+     'HZK11','=hanziku-11',
+     'HZK12','=hanziku-12',
+     'J78','=jisx0208-1978',
+     'J83','=jisx0208',
+     'J90','=jisx0208-1990',
+     'JSP','=jisx0212',
+     'JX1','=jisx0213-1',
+     'JX2','=jisx0213-2',
+     'K0','=ks-x1001',
+     'M','=daikanwa',
      );
 
-for (glob "$DB_HOME/system-char-id/*"){
-    next if(/\.txt$/);
-    $atr=$_;
-    $atr=~s!$DB_HOME/system-char-id/!!;
-    $db{$atr}=$_;
-}
+my $er_prefix_re=join '|', keys %er_alias;
 
-for (glob "$DB_HOME/*"){
-    next if(/\.txt$/ or /system-char-id/);
-    $atr=$_;
-    $atr=~s!$DB_HOME/!!;
-    $reverse_db{$atr}=$_."/system-char-id";
+if(-d "$DB_HOME/character"){
+    for (glob "$DB_HOME/character/feature/*"){
+       next if(/\.txt$/);
+       $atr=$_;
+       $atr=~s!$DB_HOME/character/feature/!!;
+       $db{$atr}=$_;
+    }
+    for (glob "$DB_HOME/character/by-feature/*"){
+       next if(/\.txt$/);
+       $atr=$_;
+       $atr=~s!$DB_HOME/character/feature/!!;
+       $reverse_db{$atr}=$_;
+    }
+}elsif(-d "$DB_HOME/system-char-id"){
+    for (glob "$DB_HOME/system-char-id/*"){
+       next if(/\.txt$/);
+       $atr=$_;
+       $atr=~s!$DB_HOME/system-char-id/!!;
+       $db{$atr}=$_;
+    }
+    for (glob "$DB_HOME/*"){
+       next if(/\.txt$/ or /system-char-id/);
+       $atr=$_;
+       $atr=~s!$DB_HOME/!!;
+       $reverse_db{$atr}=$_."/system-char-id";
+    }
+}else{
+    print STDERR "No database found.\n";
+    print STDERR "Pleas set \$DB_HOME to Chise_utils.pm correctly.\n";
+    exit 1;
 }
 
 sub get_db{
@@ -177,9 +192,10 @@ sub get_chars_matching{
            }
        }
     }
-    # else{
-    # fall back if DB inconsistency exists.
-    unless(@res){
+    else{
+#   never fall back.
+#    unless(@res){
+#    # fall back if DB inconsistency exists.
        if(&get_db($atr)){
            foreach $char (keys %{$chardb{$atr}}){
                if($chardb{$atr}->{$char} eq $value){
@@ -212,23 +228,21 @@ sub get_chars_for{
            }
        }
     }
-    foreach (keys %res){
-       if($res{$_}==$i){
-           push @res,$_;
-       }
-    }
-    return @res;
+    return grep {defined($res{$_}) and $res{$_}==$i} (keys %res);
 }
 
 sub de_er{
     my($er)=@_;
     my($output_char,$atr,$value);
-    my $keys = join '|', keys %er_alias;
+    my($prefix,$suffix);
+    $er=~/^(amp|&)?(.+?)(;)?$/
+       and $prefix=$1,$er=$2,$suffix=$3;
+    $prefix or $prefix="",$suffix or $suffix="";
     if($er=~/^\d+$/){
        $output_char=pack("U",$er);
     }elsif($er=~/^U[\+\-]([a-fA-F\d]+)/){
        $output_char=pack("U",hex($1));
-    }elsif($er=~/(?:I\-)?($keys)\-?([0-9a-fA-F]+)/){
+    }elsif($er=~/^(?:I\-)?($er_prefix_re)\-?([0-9a-fA-F]+)$/){
        ($atr,$value)=($1,$2);
        unless($er_alias{$atr}=~/daikanwa|gt/){
            $value=hex($value);
@@ -236,9 +250,9 @@ sub de_er{
        ($output_char)=&get_chars_matching($er_alias{$atr},$value);
     }
     if($output_char){
-      return $output_char;
+       return $output_char;
     }else{
-      return $er;
+       return $prefix.$er.$suffix;
     }
 }