X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=inCHISE;h=7a498302b1c741ce9c70efc26677e324b2e762f2;hb=5a5e431819878e0b4e9c46ebe2ff43678d7f3ca3;hp=115809f5cabdeeb7e4205c461bd4403845b81553;hpb=da4a98967af72d2e69b06f5451cf75a7a8f6daaa;p=chise%2Fomega.git

diff --git a/inCHISE b/inCHISE
index 115809f..7a49830 100755
--- a/inCHISE
+++ b/inCHISE
@@ -5,6 +5,7 @@
 use strict;
 use vars qw($omegadb_path
 	    $opt_protrude $opt_allow_unify
+	    $opt_use_kage_for_Ext_B
 	    %opt_order %order %order_map
 	    $opt_in_cs $opt_out_cs
 	    $opt_help $usage
@@ -21,8 +22,6 @@ use Fcntl ':flock';
 use Chise_utils ':all';
 require 5.008;
 
-my $omegadb_path="/usr/local/lib/chise/omega";
-
 ### Options ###
 
 #$opt_order{'UniMulti'}='jcgk';
@@ -35,6 +34,9 @@ $opt_order{'UniKS'}='k';
 $opt_allow_unify=1; # 1=true, 0=false.
 $opt_protrude=0;# 1=true, 0=false.
 
+# NOTE: the following option currently does not work.
+$opt_use_kage_for_Ext_B=0;# 1=true, 0=false.
+
 ### End ###
 
 my $strictly_forbidden_after = 'ãããï¼»ï¼ãâââï¼ãï½ã{\[\(\x{3016}ï½ï½¢';
@@ -58,10 +60,22 @@ my $forbidden_before
 my $slightly_forbidden_before
     = '\x{000a}\#\-âââ°â²â³âããããã½ã¾ï¼ï¼ï¼ï¾ï¾';
 
-my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-\x{FFFFFF}';
+my $asian = '\x{1100}-\x{11FF}\x{2E80}-\x{D7AF}\x{F900}-\x{FAFF}\x{FE30}-\x{FE4F}\x{FF00}-\x{FFFFFF}'; # FIXME: these ranges need fixing (e.g. \x{FFFFFF} lies beyond U+10FFFF, the last Unicode code point).
 
 my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}';
 
+my %tex_meta=('#'=>'\#',
+	      '$'=>'\\textdollar{}',
+	      '%'=>'\%',
+	      '&'=>'\&',
+	      '{'=>'\\textbraceleft{}',
+	      '}'=>'\\textbraceright{}',
+	      '\\'=>'\\textbackslash{}',
+	      '_'=>'\\textunderscore',
+	     );
+
+my $tex_meta_re=join('|',map {quotemeta($_)} keys %tex_meta);
+
 &GetOptions("in=s"=>\$opt_in_cs,
 	    "out=s"=>\$opt_out_cs,
 	    "help",\$opt_help);
@@ -69,7 +83,7 @@ my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}';
 $usage=<<EOF;
 Usage: $0 -i <input coding system> -o <cmap encoding>
     input coding system:
-      Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
+      Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks, Utf8big5
     cmap encoding:
       UniCNS, UniGB, UniJIS, UniKS, UniMulti
 EOF
@@ -82,7 +96,7 @@ if($opt_in_cs or $opt_out_cs){
 }
 
 # $in_cs:
-#   Utf8mcs,Utf8cns,Utf8gb,Utf8jis,Utf8ks,
+#   Utf8mcs,Utf8cns,Utf8gb,Utf8jis,Utf8ks,Utf8big5
 # $out_cs:
 #   UniCNS,UniGB,UniJIS,UniKS,UniMulti
 
@@ -119,8 +133,8 @@ if(-e $idsdata_file){
 $ids_argc=0;
 $ids="";
 
-$geta=pack("U",0x3013);
-#$geta=pack("U",0xfffd);
+#$geta=pack("U",0x3013);
+$geta=pack("U",0xfffd);
 
 @GT=(#"=gt","=gt-k",
      "=gt-pj-1","=gt-pj-2","=gt-pj-3","=gt-pj-4","=gt-pj-5",
@@ -156,9 +170,19 @@ foreach $out_cs ('UniCNS','UniGB','UniJIS','UniKS','UniMulti'){
 
 while(<>){
     utf8::decode($_);
+
     if($in_cs ne 'ucs@mcs'){
 	s/(.)/&get_char_in_mcs($1,$in_cs)/ge;
     }
+    s/([$asian])\s+([$asian])/$1$2/g unless($out_cs eq 'UniKS');
+    s/([$asian])\s*([^$asian])/$1 $2/g;
+    s/([^$asian])\s*([$asian])/$1 $2/g;
+    s/\-\-\-/pack("U",0x2014)/geo;# EM DASH
+    s/\-\-/pack("U",0x2013)/geo;# EN DASH
+    s/\`\`/pack("U",0x201f)/geo;# DOUBLE HIGH-REVERSED-9 QUOTATION MARK
+    s/\`/pack("U",0x201b)/geo;# SINGLE HIGH-REVERSED-9 QUOTATION MARK
+    s/\'\'/pack("U",0x201d)/geo;# RIGHT DOUBLE QUOTATION MARK
+    s/\'/pack("U",0x2019)/geo;# RIGHT SINGLE QUOTATION MARK
     s/(amp.+?;)/&de_tex_er($1)/ge;
 #    s/(&.+?;)/&de_tex_er($1)/ge;
     @chars=split(//);
@@ -168,14 +192,50 @@ while(<>){
 	$char_id=unpack("U",$char);
 
 	if($char_id<=0x20){
-	    print $chars[$i];
+	    # add quarter space between asian and non-asian.
+	    if($i>0 and $i<$#chars){
+		if(($chars[$i-1]=~/[$asian]/
+		    and $chars[$i+1]=~/[^$asian]/
+		      and $chars[$i+1]=~/[$strictly_forbidden_before$slightly_forbidden_before]/)
+		   or($chars[$i-1]=~/[^$asian]/
+		      and $chars[$i+1]=~/[$asian]/
+		      and $chars[$i-1]=~/[$strictly_forbidden_after$slightly_forbidden_after]/)
+		   ){
+		    print '';
+		}elsif(($chars[$i-1]=~/[$asian]/
+			and $chars[$i+1]=~/[^$asian]/)
+		       or($chars[$i-1]=~/[^$asian]/
+			  and $chars[$i+1]=~/[$asian]/)){
+		    print '{\selectjisfont\hspace{.25ex}}';
+		}else{
+		    print ' ';
+		}
+	    }
+	    next CHAR;
+	}elsif($char=~m/($tex_meta_re)/o){
+	    print $tex_meta{$1};
 	    next CHAR;
-	}elsif($char_id>0x20 and $char_id<=0x02af){
-	    # Basic Latin
-	    # Latin-1 Supplement
-	    # Latin Extended-A
-	    # Latin Extended-B
-	    # IPA Extensions
+	}elsif(($char_id>0x20 and $char_id<=0x021f)
+	       # Basic Latin
+	       # Latin-1 Supplement
+	       # Latin Extended-A
+	       # Latin Extended-B (not all)
+	       or($char_id>=0x0250 and $char_id<=0x02af)
+	       # IPA Extensions
+	       or($char_id>=0x0300 and $char_id<=0x033f)
+	       or($char_id>=0x0360 and $char_id<=0x036f)
+	       # Combining Diacritical Marks
+	       or($char_id>=0x1e00 and $char_id<=0x1eff)
+	       # Latin Extended Additional
+	       or($char_id>=0x0370 and $char_id<=0x03ff)
+               # Greek and Coptic
+	       or($char_id>=0x0400 and $char_id<=0x04ff)
+               # Cyrillic
+	       or($char_id>=0x0530 and $char_id<=0x058f)
+               # Armenian
+	       or($char_id>=0x2010 and $char_id<=0x2046)
+	       # General Punctuation (partial)
+	       ){
 	    print &latin_parse();
 	    next CHAR;
 	}elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
@@ -185,21 +245,6 @@ while(<>){
 	}else{
 	    if(($out_char=&get_output_char($char,$out_cs))){
 		print $out_char,&add_break($i);
-	    }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
-		# CJK Unified Ideographs Extension B
-		if(not defined($ids{$char}) and $ids{$char}[1]>=0){
-		    $ids{$char}[0]=$font_start;
-		    $ids{$char}[1]=$ids_start;
-		    $ids_start++;
-		    if($ids_start>255){
-			$ids_start=0;
-			$font_start++;
-		    }
-		}
-		print "{\\fontencoding{OT1}\\fontfamily{" .
-		    sprintf("chise%03d",$ids{$char}[0]) .
-		    "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
-		next CHAR;
 	    }else{
 		if($opt_allow_unify){
 		    @chars_unified=&get_chars_unified($char);
@@ -213,6 +258,24 @@ while(<>){
 			}
 		    }
 		}
+		if($opt_use_kage_for_Ext_B){
+		    if($char_id >= 0x20000 && $char_id <=0x2a6df){
+			# CJK Unified Ideographs Extension B
+			if(not defined($ids{$char}) and $ids{$char}[1]>=0){
+			    $ids{$char}[0]=$font_start;
+			    $ids{$char}[1]=$ids_start;
+			    $ids_start++;
+			    if($ids_start>255){
+				$ids_start=0;
+				$font_start++;
+			    }
+			}
+			print "{\\fontencoding{OT1}\\fontfamily{" .
+			    sprintf("chise%03d",$ids{$char}[0]) .
+				"}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
+			next CHAR;
+		    }
+		}
 		if($ids=&get_ids($char)){
 		    print &get_macro_for_ids($ids),&add_break($i);
 		}else{
@@ -264,41 +327,46 @@ sub add_break{
     if($i<($#chars-1)){
 	if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
 	   and($chars[$i+2]=~m/[$strictly_forbidden_before]/o)){
-	    return "\\CJKunbreakablekernone ";
+	    return "\\CJKunbreakablekernone{}";
 	}elsif($opt_protrude){
 	    if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
 	       and($chars[$i+2]=~m/[^$strictly_forbidden_before]/o)){
-		return "\\CJKunbreakablekernone \\CJKprotrude ";
+		return "\\CJKunbreakablekernone \\CJKprotrude{}";
 	    }
 	}
     }
     if(($i<$#chars)
+       and($chars[$i+1]eq" ")){
+       # preserve space.
+ 	return "";
+    }
+    if(($i<$#chars)
        and($chars[$i+1]=~m/[$strictly_forbidden_before]/o)){
-	return "\\CJKunbreakablekernone ";
+	return "\\CJKunbreakablekernone{}";
     }
     if($chars[$i]=~m/[$strictly_forbidden_after]/o){
-	return "\\CJKunbreakablekernone ";
+	return "\\CJKunbreakablekernone{}";
     }
     if(($i<$#chars)
        and($chars[$i+1]=~m/[$forbidden_before]/o)){
-	return "\\CJKunbreakablekerntwo ";
+	return "\\CJKunbreakablekerntwo{}";
 
     }
     if($chars[$i]=~m/[$forbidden_after]/o){
-	return "\\CJKunbreakablekerntwo ";
+	return "\\CJKunbreakablekerntwo{}";
     }
     if(($i<$#chars)
        and($chars[$i+1]=~m/[$slightly_forbidden_before]/o)){
-	return "\\CJKunbreakablekernthree ";
+	return "\\CJKunbreakablekernthree{}";
     }
     if($chars[$i]=~m/[$slightly_forbidden_after]/o){
-	return "\\CJKunbreakablekernthree ";
+	return "\\CJKunbreakablekernthree{}";
     }
     if($chars[$i]=~m/[$asian]/o){
-	return "\\CJKbreakablekern ";
+	return "\\CJKbreakablekern{}";
     }
     if(($i<$#chars)and($chars[$i+1]=~m/[$asian]/o)){
-	return "\\CJKbreakablekern ";
+	return "\\CJKbreakablekern{}";
     }
 }
 
@@ -306,19 +374,32 @@ sub latin_parse{
     # arguments: none
     # return: string for output with TeX macro.
     my($char_id);
-    my $out_str=$chars[$i];
-    $i++;
+    my $out_str="";
     while($i<=$#chars){
 	$char_id=unpack("U",$chars[$i]);
-	if($char_id>0x20 and $char_id<=0x02af){
+	if(($char_id>0x20 and $char_id<=0x021f)
+	   or($char_id>=0x0250 and $char_id<=0x02af)# IPA Extensions
+	   or($char_id>=0x0300 and $char_id<=0x033f)# Combining Diacritical Marks
+	   or($char_id>=0x0360 and $char_id<=0x036f)
+	   or($char_id>=0x0370 and $char_id<=0x03ff)# Greek and Coptic
+	   or($char_id>=0x0400 and $char_id<=0x04ff)# Cyrillic
+	   or($char_id>=0x0530 and $char_id<=0x058f)# Armenian
+	   ){
 	    $out_str.=$chars[$i];
+	}elsif($char_id>=0x1e00 and $char_id<=0x1eff){
+	    # Latin Extended Additional
+            # 0x1e00 -> 0x0600, etc.
+	    $out_str.=pack("U",$char_id-0x1800);
+	}elsif($char_id>=0x2010 and $char_id<=0x2046){
+	    # General Punctuation (partial)
+	    $out_str.=pack("U",$char_id-0x1000);
 	}else{
 	    $i--;
 	    last;
 	}
 	$i++;
     }
-    return '{\normalfont {'.$out_str.'}}';
+    return '{\fontencoding{OT1}\fontfamily{omlgc}\selectfont '.$out_str.'}';###UT1?
 }
 
 sub ids_parse{
@@ -368,11 +449,11 @@ sub get_macro_for_ids{
     # return: TeX macro for ids
     #          or GETA character if ids is invalid for KAGE.
     my($ids)=@_;
-    $ids=&normalize_ids($ids,"UniJIS");
+    # $ids=&normalize_ids($ids,"UniJIS");
     return $geta if(($ids!~/[$idc]/)
 		    or($ids=~/[\x{10000}-]/));
                     #irregular for KAGE.
-    if(not defined($ids{$ids}) and $ids{$ids}[1]>=0){
+    if(not defined($ids{$ids})){
 	$ids{$ids}[0]=$font_start;
 	$ids{$ids}[1]=$ids_start;
 	$ids_start++;
@@ -478,6 +559,8 @@ sub get_ids{
     my $ids="";
     $ids=&get_char_attribute($char,"ids-aggregated")
 	or $ids=&get_char_attribute($char,"ids");
+#    $ids=&get_char_attribute($char,"ids-decomposed")
+#	or $ids=&get_char_attribute($char,"ids");
 #	  or $ids=&get_char_attribute($char,"ideographic-structure");
 #    $ids=~s/[? ()]//g;
     return $ids;