X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=chise2otf%2Fchise2otf;h=4659a4e46bf2ea30324b4704b61eaa8dcac7af06;hb=455d2c1110e3091cbcb34e60771a66a111654643;hp=bf9418f5d4f5f9304d65ad2c9c7d5fd009a61f3e;hpb=a82c439ebc29aad941631860fdd786270458a8d5;p=chise%2Fomega.git

diff --git a/chise2otf/chise2otf b/chise2otf/chise2otf
index bf9418f..4659a4e 100755
--- a/chise2otf/chise2otf
+++ b/chise2otf/chise2otf
@@ -2,11 +2,14 @@
 
 use strict;
 use vars qw($opt_in_cs $opt_order $opt_kage $opt_replace
+            $opt_latin
+	    $opt_use_kage_for_Ext_B $opt_allow_unification
 	    $opt_help $usage
 	    $in_cs $out_cs $i @chars
 	    @order $order %order
 	    @texmacro
 	    $char $char_id $out_char
+	    $char_unified @chars_unified
 	    $ids $ids_argc %ids $idsdb
 	    $idsdata_file $ids_start $font_start
 	    $perl56 $perl58
@@ -18,10 +21,10 @@ use utf8;
 use Chise_utils ':all';
 require 5.008;
 
-my $omegadb_path="/usr/local/lib/chise/omega";
-$omegadb_path=~s!/$!!;
+# Using Kage for CJK Extension B currently does not work, so force the option off.
+$opt_use_kage_for_Ext_B=0;
 
-my $makefonts="$omegadb_path/makefonts.pl";
+my $makefonts="/usr/local/share/texmf/omega/ocp/local/chise/makefonts.pl";
 my $exec_makefonts=0;
 my $geta=pack("S",8750|0x8080);
 
@@ -29,22 +32,26 @@ my $geta=pack("S",8750|0x8080);
 	    "order=s"=>\$opt_order,
 	    "replace",\$opt_replace,
 	    "kage",\$opt_kage,
+	    "latin",\$opt_latin,
+	    "unify",\$opt_allow_unification,
 	    "help",\$opt_help);
 
 $usage=<<EOF;
-Usage: $0 [-i <input coding system>] [-o <order of kanji>] [-k] <filename>
-    input coding system: (default: ucs\@mcs)
-      ucs\@mcs, ucs\@cns, ucs\@gb, ucs\@jis, ucs\@ks
-    order of kanji: (default: j)
+Usage: $0 [-i <input coding system>] [-o <order of kanji>] [-klru] <filename>
+    -i: input coding system: (default: ucs\@mcs)
+       ucs\@mcs, ucs\@cns, ucs\@gb, ucs\@jis, ucs\@ks
+    -o: order of kanji: (default: j)
        c: CNS
        g: GB
        j: JIS
        k: KS
        G: GT
        m: Multi, use \\UTFM of otf.sty
-      You can also combine them, ex. jtcgkm
-    k: use Kage server.
-    r: replace r and l with dot below to those with circle below.
+      You can also combine them, ex. jGcgkm
+    -k: use Kage server.
+    -l: preserve latin characters also in ucs\@jis environment.
+    -r: replace r and l with dot below to those with circle below.
+    -u: allow unification.
 EOF
 
 %order=('c'=>'UniCNS',
@@ -109,21 +116,26 @@ if($opt_replace){
     $texmacro[0x1E5D]='{\ifmmode \ucirc{\bar{r}}\else \ucirc{\={r}}\fi}';
 }
 
-$idsdata_file="$omegadb_path/idsdata.pl";
+$idsdata_file="$omegadb_path/idsdata.txt";
 $ids_start=0x00; 
 $font_start=0;
 
 if(-e $idsdata_file){
-    open(IDSDATA,"+<$idsdata_file") or die;
+    open(IDSDATA,"+<:utf8",$idsdata_file) or die;
     flock(IDSDATA,LOCK_EX);
     seek(IDSDATA,0,0);
     while(<IDSDATA>){
-	eval $_;
+	utf8::decode($_);
+	if(m/^START\t(\d+)\t(\d+)/){
+	    $font_start=$1,$ids_start=$2;
+	}elsif(m/^(.*?)\t(\d+)\t(\d+)/){
+	    $ids{$1}=[$2,$3,];
+	}
     }
     seek(IDSDATA,0,0);
-#	  require $idsdata_file;
+    truncate(IDSDATA,0);
 }else{
-    open(IDSDATA,">$idsdata_file") or die;
+    open(IDSDATA,">:utf8",$idsdata_file) or die;
     flock(IDSDATA,LOCK_EX);
     seek(IDSDATA,0,0);
 }
@@ -144,11 +156,12 @@ $ids="";
 
 while(<>){
     if($in_cs ne 'ucs@mcs'){
-	s/(.)/pack("U",&get_char_id(unpack("U",$1),$in_cs))/ge;
+	s/(.)/&get_char_in_mcs($1,$in_cs)/ge;
     }
     s/((?:^|[^\\])(?:\\\\)*)(&.*?;)/&de_tex_er($1,$2)/ge;
     s/((?:^|[^\\])(?:\\\\)*)\\([$idc])/$1.'\UTFM{'.sprintf("%X",unpack("U",$2)).'}'/ge;
     @chars=split(//);
+  CHAR:
     for($i=0;$i<=$#chars;$i++){
 	$char=$chars[$i];
 	$char_id=unpack("U",$chars[$i]);
@@ -156,51 +169,66 @@ while(<>){
 	if($char_id<=0x7f){
 	    # Basic Latin
 	    print $char;
-	    next;
+	    next CHAR;
 	}elsif(defined($texmacro[$char_id]) and $texmacro[$char_id]){
 	    # already defined for 
 	    # 0080..00FF; Latin-1 Supplement
 	    # 0100..017F; Latin Extended-A
 	    # 1E00..1EFF; Latin Extended Additional
 	    print $texmacro[$char_id];
-	    next;
+	    next CHAR;
 	}elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
 	    # Ideographic Description Characters
 	    print &ids_parse();
-	    next;
+	    next CHAR;
 	}else{
-	    if(($out_char=&get_output_char($char_id))){
+	    if(($out_char=&get_output_char($char))){
 		print $out_char;
-	    }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
-		# CJK Unified Ideographs Extension B
-		if(not defined($ids{$char}) and $ids{$char}[1]>=0){
-		    $ids{$char}[0]=$font_start;
-		    $ids{$char}[1]=$ids_start;
-		    $ids_start++;
-		    if($ids_start>255){
-			$ids_start=0;
-			$font_start++;
+		next CHAR;
+	    }else{
+		if($opt_allow_unification){
+		    @chars_unified=&get_chars_unified($char);
+		    if(@chars_unified>0){
+			foreach $char_unified (@chars_unified){
+			    if(($out_char=&get_output_char($char_unified))){
+				print $out_char;
+				next CHAR;
+			    }
+			}
 		    }
 		}
-		print "{\\fontencoding{OT1}\\fontfamily{" .
-		    sprintf("chise%03d",$ids{$char}[0]) .
-		    "}\\selectfont\\char$ids{$char}[1]}";
-		next;
-	    }else{
-		print &get_macro_for_ids(&get_ids($char));
+		if($opt_use_kage_for_Ext_B
+		   and $char_id >= 0x20000 and $char_id <=0x2a6df){
+		    # CJK Unified Ideographs Extension B
+		    if(not defined($ids{$char}) and $ids{$char}[1]>=0){
+			$ids{$char}[0]=$font_start;
+			$ids{$char}[1]=$ids_start;
+			$ids_start++;
+			if($ids_start>255){
+			    $ids_start=0;
+			    $font_start++;
+			}
+		    }
+		    print "{\\fontencoding{OT1}\\fontfamily{" .
+			sprintf("chise%03d",$ids{$char}[0]) .
+			    "}\\selectfont\\char$ids{$char}[1]}";
+		    next CHAR;
+		}
+		if($ids=&get_ids($char)){
+		    print &get_macro_for_ids($ids);
+		    next CHAR;
+		}else{
+		    print $geta;
+		}
 	    }
 	}
     }
 }
 
-print IDSDATA 'use utf8;',"\n";
+print IDSDATA 'START',"\t",$font_start,"\t",$ids_start,"\n";
 foreach $ids (keys %ids){
-    print IDSDATA '$ids{\'',$ids,'\'}='
-    ,'[',join ",",@{$ids{$ids}},"];\n";
+    print IDSDATA $ids,"\t",join("\t",@{$ids{$ids}}),"\n";
 }
-print IDSDATA '$font_start=',$font_start,";\n";
-print IDSDATA '$ids_start=',$ids_start,";\n";
-print IDSDATA "1;";
 flock(IDSDATA,LOCK_UN);
 
 if($exec_makefonts){
@@ -229,7 +257,6 @@ sub de_tex_er{
 	    $value=hex($value);
 	}
 	($output_char)=&get_chars_matching($er_alias{$atr},$value);
-	utf8::decode($output_char);
     }
     if($output_char){
 	return $before_er.$output_char;
@@ -255,8 +282,8 @@ sub ids_parse{
 
 	($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$chars[$i]);
 	if($ids_argc==0){
-	    if(($char_id=&get_char_id_for_ids($ids))
-	       and($out_char=&get_output_char($char_id))){
+	    if(($char=&get_char_for_ids($ids))
+	       and($out_char=&get_output_char($char))){
 		return $out_char;
 	    }else{
 		return &get_macro_for_ids($ids);
@@ -277,8 +304,6 @@ sub ids_rest{
 	$ids_argc--;
     }
     $ids.=$char;
-#    $ids.=$char if($perl56);
-#    $ids.=encode('utf8',$char) if($perl58);
     return ($ids,$ids_argc);
 }
 
@@ -288,11 +313,11 @@ sub get_macro_for_ids{
     #          or GETA character if ids is invalid for KAGE.
     my($ids)=@_;
 #    return $geta if(not $exec_makefonts);
-    $ids=&normalize_ids($ids,"UniJIS");
+#    $ids=&normalize_ids($ids,"UniJIS");
     return $geta if(($ids!~/[$idc]/)
 		    or($ids=~/[\x{10000}-]/));
                     #irregular for KAGE.
-    if(not defined($ids{$ids}) and $ids{$ids}[1]>=0){
+    if(not defined($ids{$ids})){
 	$ids{$ids}[0]=$font_start;
 	$ids{$ids}[1]=$ids_start;
 	$ids_start++;
@@ -313,15 +338,17 @@ sub normalize_ids{
     $out_cs=~s/Uni(.+)/'ucs@'.lc($1)/e;
 
     my $output_ids="";
-    my($char,$char_id,$output_char_id);
+    my($char,$output_char_id);
     while($ids=~m/(.)/g){
 	$char=$1;
-	$char_id=unpack("U",$char);
 	if($char=~/[$idc]/){
 	    $output_ids.=$char;
-	}elsif($output_char_id=&get_char_attribute($char,"=$out_cs")){
-	    $output_ids.=pack("U",$output_char_id);
-	}elsif($output_char_id=&get_char_attribute($char,"=ucs")){
+	}elsif($output_char_id=&get_char_attribute($char,"=$out_cs")
+	       or $output_char_id=&get_char_attribute($char,"=ucs")
+	       or $output_char_id=&get_char_attribute($char,"=>$out_cs")
+	       or $output_char_id=&get_char_attribute($char,"=>ucs")
+	       or $output_char_id=&get_char_attribute($char,"=>ucs*")
+	      ){
 	    $output_ids.=pack("U",$output_char_id);
 	}else{
 	    return $geta;
@@ -331,14 +358,15 @@ sub normalize_ids{
 }
 
 sub get_output_char{
-    # argument: <char-id>
+    # argument: <char>
     # return: character in EUC-JP or TeX macro for pTeX.
-    my($char_id)=@_;
-    my($char,$out_char,$out_char_id,$gt);
+    my($char)=@_;
+    my($out_char,$out_char_id,$gt);
     
-    $char=pack('U',$char_id);
-
-    if($out_char=&get_char_attribute($char,'=jis-x0208')){
+    if($out_char=&get_char_attribute($char,'=jis-x0208')
+       or $out_char=&get_char_attribute($char,'=jis-x0208-1983')
+       or $out_char=&get_char_attribute($char,'=jis-x0208-1990')
+      ){
 	return pack("S",$out_char|0x8080);
     }else{
 	foreach $out_cs (@order){
@@ -348,6 +376,7 @@ sub get_output_char{
 		   or $out_char_id=&get_char_attribute($char,'=ucs')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs@jis')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs')
+		   or $out_char_id=&get_char_attribute($char,'=>ucs*')
 		   ){
 		    return "\\UTF{".sprintf("%X",$out_char_id)."}";
 		}
@@ -357,6 +386,7 @@ sub get_output_char{
 		   or $out_char_id=&get_char_attribute($char,'=ucs')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs@gb')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs')
+		   or $out_char_id=&get_char_attribute($char,'=>ucs*')
 		   ){
 		    return "\\UTFC{".sprintf("%X",$out_char_id)."}";
 		}
@@ -366,6 +396,7 @@ sub get_output_char{
 		   or $out_char_id=&get_char_attribute($char,'=ucs')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs@cns')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs')
+		   or $out_char_id=&get_char_attribute($char,'=>ucs*')
 		   ){
 		    return "\\UTFT{".sprintf("%X",$out_char_id)."}";
 		}
@@ -375,11 +406,12 @@ sub get_output_char{
 		   or $out_char_id=&get_char_attribute($char,'=ucs')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs@ks')
 		   or $out_char_id=&get_char_attribute($char,'=>ucs')
+		   or $out_char_id=&get_char_attribute($char,'=>ucs*')
 		   ){
 		    return "\\UTFK{".sprintf("%X",$out_char_id)."}";
 		}
 	    }elsif($out_cs eq 'GT'){
-		return $gt if($gt=&get_macro_for_GT($char_id));
+		return $gt if($gt=&get_macro_for_GT($char));
 	    }elsif($out_cs eq 'Multi'){
 		if($out_char_id=&get_char_attribute($char,'=ucs')){
 		    return "\\UTFM{".sprintf("%X",$out_char_id)."}";
@@ -400,38 +432,62 @@ sub get_ids{
     return $ids;
 }
 
-sub get_char_id_for_ids{
+sub get_char_for_ids{
     # argument: <ideographic description sequence>
-    # return: char-id
+    # return: char or undef
     my($ids)=@_;
     my($output_char);
 
     if(($output_char)=&get_chars_matching("ids",$ids)){
-	return unpack("U",$output_char);
+	return $output_char;
     }else{
 	return undef;
     }
 }
 
-sub get_char_id{
-    # argument: <char-id>, <input coding system>
-    # return:   char-id.
-    my($char_id,$in_cs)=@_;
+sub get_char_in_mcs{
+    # argument: <char>, <input coding system>
+    # return:   char in ucs@mcs.
+    my($char,$in_cs)=@_;
     my($output_char);
+    my $char_id=unpack("U",$char);
 
+    if($opt_latin and $texmacro[$char_id]){
+	return $char;
+    }
     if(($output_char)=&get_chars_matching("=$in_cs",$char_id)){
-	return unpack("U",$output_char);
+	return $output_char;
     }else{
-	return $char_id;
+	return $char;
+    }
+}
+
+sub get_chars_unified{
+    # argument: <char>
+    # return: list of candidate chars unified with <char>; empty list if none.
+    my($char)=@_;
+    my($chars,$ucs,$char_ucs);
+    if($chars=&get_char_attribute($char,'->ucs-unified')){
+	$chars=~s/^\((.*)\)$/$1/;
+	# split gives an empty leading field when the value starts with "?".
+	return grep {length($_)} split(/\s*\?/,$chars);
+    }elsif($ucs=&get_char_attribute($char,'=>ucs*')
+	  or $ucs=&get_char_attribute($char,'=>ucs')){
+	$char_ucs=pack("U",$ucs);
+	if($chars=&get_char_attribute($char_ucs,'->ucs-unified')){
+	    $chars=~s/^\((.*)\)$/$1/;
+	    # exclude <char> itself by string comparison, not by using it as a regex.
+	    return ((grep {length($_) and $_ ne $char} split(/\s*\?/,$chars)),$char_ucs);
+	}
     }
+    return; # nothing found: empty list, not the false value of the last test.
 }
 
 sub get_macro_for_GT{
-    # argument: <char-id>
-    # return: TeX macro for GT fonts.
-    my($char_id)=@_;
-    my($char,$gt,$GT);
-    $char=pack("U",$char_id);
+    # argument: <char>
+    # return: TeX macro for GT fonts or undef.
+    my($char)=@_;
+    my($gt,$GT);
     foreach (@GT){
 	if($gt=&get_char_attribute($char,$_)){
 	    m/gt\-pj\-(\d+)/ and $GT=$1;