+#!/usr/bin/perl
+
+use strict;
+use vars qw($opt_in_cs $opt_out_cs $opt_help $usage
+ $in_cs $out_cs
+ $char $char_id $out_char $omegadb_home
+ $ids $ids_argc %ids $idsdb
+ $idsdata_file $ids_start $font_start
+ %utf8mcs_map_from
+ %cmap_to
+ $inotp $perl56 $perl58
+ $useCDP $useHZK $useGT
+ @CDP @HZK @GT
+ );
+use Getopt::Long;
+use utf8;
+use Chise_utils ':all';
+
+# Fallback glyph sources tried by get_output_char when the cmap has no entry
+# for a character.  Only the GT fonts are switched on.
+$useGT=1;
+$useHZK=0;
+$useCDP=0;
+
+# Detect the interpreter generation.  $] is a plain number on every perl,
+# whereas string-comparing $^V against a v-string only gives the right answer
+# by accident once $^V stringifies as "v5.10.0" (perl 5.10 and later).
+if($] >= 5.008){
+    $perl58=1;
+}elsif($] >= 5.006){
+    $perl56=1;
+}else{
+    # Neither flag is set in this case; the script carries on, but every
+    # statement guarded by $perl56/$perl58 is skipped.
+    print STDERR "This version is not supported.\n";
+}
+if($perl58){
+    # Loaded at run time so that perl 5.6, which has no Encode, still compiles
+    # this file.
+    eval "use Encode";
+    binmode(STDIN, ':encoding(utf8)');
+    binmode(STDOUT, ':encoding(utf8)');
+}
+
+# Directory holding the BerkeleyDB files opened by get_cmap, get_idsdb and
+# get_utf8mcs_map.
+#$omegadb_home="/home/ttomabec/.chise";
+$omegadb_home="/Users/izumi/.chise";
+# The path above is specific to one account.  When it does not exist, fall
+# back to the invoking user's own ~/.chise so the script is usable elsewhere;
+# an existing /Users/izumi/.chise still takes precedence.
+if(!-d $omegadb_home and defined($ENV{HOME})){
+    $omegadb_home="$ENV{HOME}/.chise";
+}
+
+# Help text shown when the arguments are missing or -h/--help is given.
+$usage=<<EOF;
+Usage: $0 -i <input coding system> -o <cmap encoding>
+ input coding system:
+ Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
+ cmap encoding:
+ UniCNS, UniGB, UniJIS, UniKS
+EOF
+
+# The long and the short spelling of each switch write to the same variable.
+GetOptions(
+    "in=s"  => \$opt_in_cs,
+    "i=s"   => \$opt_in_cs,
+    "out=s" => \$opt_out_cs,
+    "o=s"   => \$opt_out_cs,
+    "help"  => \$opt_help,
+    "h"     => \$opt_help,
+);
+
+# Coding systems come from the switches when at least one was given.
+# Otherwise, with no file arguments left, they are taken from the program
+# name (<Utf8...>To<cmap>) and the script is flagged as running in OTP mode.
+if($opt_in_cs or $opt_out_cs){
+    ($in_cs,$out_cs)=($opt_in_cs,$opt_out_cs);
+}elsif(!@ARGV){
+    $inotp=1;
+    ($in_cs,$out_cs)=($0=~/(Utf8.+)To(\w+)/);
+}
+
+# Both coding systems are mandatory; print the usage and leave with status 1
+# when either is missing or help was requested.
+my $have_both=(defined($in_cs) and defined($out_cs));
+if($opt_help or !$have_both){
+    print $usage;
+    exit 1;
+}
+
+# Glyph-slot bookkeeping.  %ids maps a character or IDS to [font number, slot];
+# $font_start/$ids_start are the next free font number and slot.  The values
+# below are the defaults for a first run.
+$idsdata_file="idsdata.pl";
+$ids_start=0x00;
+$font_start=0;
+
+# Reload the table written at the end of a previous run.  The "./" prefix is
+# required: a bare relative name makes require search @INC, which no longer
+# contains "." since perl 5.26, so the file found by -e would fail to load.
+if(-e $idsdata_file){
+    require "./$idsdata_file";
+}
+
+# State of the IDS currently being collected by the main loop:
+# number of operands still expected, and the text gathered so far.
+$ids_argc=0;
+$ids="";
+
+# Character attribute names probed, in this order, by the get_macro_for_*
+# subs.  Not in use for GT: "=gt", "=gt-k", "=gt-pj-k1", "=gt-pj-k2".
+@GT=map { "=gt-pj-$_" } (1..11);
+@HZK=map { "=hanziku-$_" } (1,10,11,12,2..9);
+@CDP=("=big5-cdp");
+
+# Main filter loop: read the input line by line, convert every character to
+# the output cmap encoding and print the result.  Characters without a cmap
+# entry are replaced by a TeX font-selection macro; IDS sequences (introduced
+# by U+2FF0..U+2FFF) are collected first and then resolved as one unit.
+while(<>){
+    # temporary fix for using in OTP for perl 5.6.
+    s/(.)/pack("c",unpack("U",$1))/ge if($inotp
+                                          and $in_cs=~/utf8/i
+                                          and $perl56);
+    # for perl 5.8.
+    $_=decode('utf8', $_) if ($inotp and $in_cs=~/utf8/i
+                              and $perl58);
+    # Expand "amp...;" entity references through tex_de_er.
+    s/(amp.+?;)/&tex_de_er($1)/ge;
+#    s/(&.+?;)/&tex_de_er($1)/ge;
+    # NOTE(review): "." does not match "\n", so the line terminator is never
+    # echoed to the output -- confirm that this is intended.
+    while(m/(.)/g){
+        $char=&get_char_in_utf8mcs($1,$in_cs);
+        $char_id=unpack("U",$char);
+        if($ids_argc>0){
+            # In the middle of an IDS: keep collecting until no operand is
+            # outstanding, then resolve the whole sequence.
+            ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$char);
+            if($ids_argc==0){
+                if(($char_id=&get_char_id_for_ids($ids))
+                   and(($out_char=&get_output_char($char_id,$out_cs)))){
+                    print $out_char;
+                }else{
+                    # replace_ids returns a character string.  Under perl 5.8
+                    # STDOUT already has an :encoding(utf8) layer, so encoding
+                    # by hand here would double-encode U+3013.
+                    print &replace_ids($ids) if($perl56 or $perl58);
+                }
+                $ids="";
+            }
+        }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
+            # A character in U+2FF0..U+2FFF opens a new IDS.
+            ($ids,$ids_argc)=&ids_rest("",0,$char);
+            next;
+        }else{
+            # Code points up to 0xff pass through untouched.
+            if($char_id<=0xff){
+                print $char;
+                next;
+            }
+            if(($out_char=&get_output_char($char_id,$out_cs))){
+                print $out_char;
+            }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
+                # No mapping for a code point in 0x20000..0x2a6df: give the
+                # character its own slot (256 slots per font) unless it
+                # already has one, and emit the macro selecting that slot.
+                unless(defined($ids{$char}) and $ids{$char}[1]>=0){
+                    $ids{$char}[0]=$font_start;
+                    $ids{$char}[1]=$ids_start;
+                    $ids_start++;
+                    if($ids_start>255){
+                        $ids_start=0;
+                        $font_start++;
+                    }
+                }
+                print "{\\fontencoding{OT1}\\fontfamily{" .
+                    sprintf("chise%03d",$ids{$char}[0]) .
+                    "}\\selectfont\\char$ids{$char}[1]}";
+                next;
+            }else{
+                # Last resort: look up the character's IDS and emit a slot
+                # for that.
+                print &replace_ids(&get_ids($char));
+            }
+        }
+    }
+    # An IDS left unfinished at the end of the line is reported and replaced
+    # by U+3013.
+    if($ids_argc>0){
+        print STDERR "IDS parse error: $ids\n";
+#        print pack("U",0xfffd);
+        # Printed as a character string for the same reason as above: the
+        # perl 5.8 output layer does the UTF-8 encoding.
+        print pack("U",0x3013) if($perl56 or $perl58);
+        $ids="";
+        $ids_argc=0;
+    }
+}
+
+# Write the slot table back as Perl source so that the next run, which loads
+# this file through require, keeps the same font/slot assignment.
+open(my $idsdata_fh, '>', $idsdata_file)
+    or die "cannot write $idsdata_file: $!\n";
+print $idsdata_fh 'use utf8;',"\n";
+foreach my $key (keys %ids){
+    my $literal;
+    if($perl58){
+        # This handle has no encoding layer, so keys are written as UTF-8
+        # bytes explicitly.
+        $literal=encode('utf8',$key);
+    }elsif($perl56){
+        $literal=$key;
+    }else{
+        # Unsupported interpreter: no entries are written.
+        next;
+    }
+    # join() is parenthesised so that the closing "];\n" is not swallowed as
+    # one more list element (which used to leave a stray comma before "]").
+    print $idsdata_fh '$ids{\'',$literal,'\'}=[',join(",",@{$ids{$key}}),"];\n";
+}
+print $idsdata_fh '$font_start=',$font_start,";\n";
+print $idsdata_fh '$ids_start=',$ids_start,";\n";
+print $idsdata_fh "1;";
+# Buffered write errors only show up at close time.
+close($idsdata_fh) or die "cannot close $idsdata_file: $!\n";
+
+exit 0;
+
+# Resolve one "amp<name>;" entity reference.
+#   argument: the matched text, including the leading "amp" and trailing ";".
+#   return:   whatever de_er yields for <name> when that is true, otherwise
+#             the reference rebuilt as "amp<name>;".
+# de_er is not defined in this file (presumably exported by Chise_utils).
+sub tex_de_er{
+    my($er)=@_;
+    # Strip the "amp" ... ";" wrapper.
+    $er=~s/^amp(.*);$/$1/;
+    my $decoded=&de_er($er);
+    return $decoded if($decoded);
+    return "amp$er;";
+}
+
+# Feed one more character into the IDS being collected.
+#   arguments: the IDS gathered so far, the number of operands still expected,
+#              and the new character.
+#   return:    (extended IDS, updated operand count).
+# ids_argc() comes from outside this file; presumably it returns the number
+# of operands an IDS operator takes and a false value for anything else.
+sub ids_rest{
+    my($ids,$ids_argc,$char)=@_;
+    my $arity=&ids_argc($char);
+    if(!$arity){
+        # An ordinary character fills one pending operand.
+        $ids_argc--;
+    }elsif($ids_argc==0){
+        # The operator that opens the sequence.
+        $ids_argc+=$arity;
+    }else{
+        # A nested operator occupies one operand and adds its own.
+        $ids_argc+=$arity-1;
+    }
+    if($perl56){
+        $ids.=$char;
+    }
+    if($perl58){
+        $ids.=encode('utf8',$char);
+    }
+    return ($ids,$ids_argc);
+}
+
+# Turn an IDS into the TeX macro selecting its glyph slot.
+#   argument: an IDS string.
+#   return:   U+3013 when the normalised IDS contains no character from $idc
+#             or contains a character above the BMP; otherwise
+#             "{\fontencoding{OT1}\fontfamily{chiseNNN}\selectfont\charM}".
+# Side effect: an IDS seen for the first time is entered into %ids and the
+# $ids_start/$font_start counters advance (256 slots per font).
+# $idc is not defined in this file (presumably exported by Chise_utils).
+sub replace_ids{
+    my($ids)=@_;
+    $ids=&normalize_ids($ids,"UniJIS");
+#    return pack("U",0xfffd) if($ids!~/[$idc]/);
+    # The upper bound is spelled out: with a trailing "-" the class matched
+    # only U+10000 itself and a literal hyphen, not the range above the BMP.
+    return pack("U",0x3013) if(($ids!~/[$idc]/)
+                               or($ids=~/[\x{10000}-\x{10ffff}]/));
+    #irregular for KAGE.
+    unless(defined($ids{$ids}) and $ids{$ids}[1]>=0){
+        $ids{$ids}[0]=$font_start;
+        $ids{$ids}[1]=$ids_start;
+        $ids_start++;
+    }
+    # Move on to the next font once all 256 slots are taken.
+    if($ids_start>255){
+        $ids_start=0;
+        $font_start++;
+    }
+    return "{\\fontencoding{OT1}\\fontfamily{".sprintf("chise%03d",$ids{$ids}[0])."}\\selectfont\\char$ids{$ids}[1]}";
+}
+
+# Rewrite every non-operator character of an IDS to the code point recorded
+# for it under the given coding system.
+#   arguments: IDS string; cmap name such as "UniJIS" (turned into "ucs-jis").
+#   return:    the rewritten IDS, or U+FFFE as soon as one character has none
+#              of the attributes <ucs-xxx>, "=ucs" or "ucs".
+# get_char_attribute and $idc come from outside this file.
+sub normalize_ids{
+    my($ids,$out_cs)=@_;
+    $ids=decode('utf8',$ids) if($perl58);
+    $out_cs=~s/Uni(.+)/"ucs-".lc($1)/e;
+    my $normalized="";
+    while($ids=~m/(.)/g){
+        my $char=$1;
+        # Characters in $idc are copied verbatim.
+        if($char=~/[$idc]/){
+            $normalized.=$char;
+            next;
+        }
+        # First attribute with a true value wins.
+        my $code=&get_char_attribute($char,$out_cs)
+            || &get_char_attribute($char,"=ucs")
+            || &get_char_attribute($char,"ucs");
+        return pack("U",0xfffe) unless($code);
+        $normalized.=pack("U",$code);
+    }
+    return $normalized;
+}
+
+# Map a character id to what should be printed for it.
+#   arguments: character id; name of the output cmap.
+#   return:    the cmap's character when it has a true entry; otherwise the
+#              first macro supplied by an enabled fallback (GT, then HZK,
+#              then CDP); undef when nothing applies.
+sub get_output_char{
+    my($char_id,$out_cs)=@_;
+    # The cmap database is tied on first use.
+    &get_cmap($out_cs) unless(defined($cmap_to{$out_cs}));
+    my $mapped=$cmap_to{$out_cs}->{$char_id};
+    return pack("U",$mapped) if($mapped);
+    my $macro;
+    if($useGT){
+        $macro=&get_macro_for_GT($char_id);
+        return $macro if($macro);
+    }
+    if($useHZK){
+        $macro=&get_macro_for_HZK($char_id);
+        return $macro if($macro);
+    }
+    if($useCDP){
+        $macro=&get_macro_for_CDP($char_id);
+        return $macro if($macro);
+    }
+    return undef;
+}
+
+# Tie $cmap_to{<cmap name>} to the BerkeleyDB hash file of that name under
+# $omegadb_home; dies with $! when the tie fails.
+sub get_cmap{
+    my($out_cs)=@_;
+    my $db_path="$omegadb_home/$out_cs";
+    tie(%{$cmap_to{$out_cs}}, "BerkeleyDB::Hash", -Filename => $db_path)
+        or die $!;
+}
+
+# Look up the IDS recorded for a character.
+#   argument: a character.
+#   return:   its "ids-aggregated" attribute, or its "ids" attribute when the
+#             former is false; "" when neither exists.  Under perl 5.8 the
+#             value is passed through decode('utf8', ...).
+# get_char_attribute is not defined in this file.
+sub get_ids{
+    my($char)=@_;
+    # "||" rather than "or": with "or" the assignment bound tighter, so the
+    # "ids" fallback was evaluated and then thrown away.
+    my $ids=&get_char_attribute($char,"ids-aggregated")
+        || &get_char_attribute($char,"ids");
+#        || &get_char_attribute($char,"ideographic-structure");
+    # Keep undef out of decode() and out of the caller's string handling.
+    $ids="" unless(defined($ids));
+    $ids=decode('utf8', $ids) if($perl58);
+#    $ids=~s/[? ()]//g;
+    return $ids;
+}
+
+# Find the character registered for an IDS in the idsdb database.
+#   argument: an IDS string (decoded from UTF-8 first under perl 5.8).
+#   return:   the code point of the stored character, or undef when the
+#             database has no true entry for the IDS.
+sub get_char_id_for_ids{
+    my($ids)=@_;
+    $ids=decode('utf8', $ids) if($perl58);
+    # The database is tied on first use.
+    &get_idsdb unless(defined($idsdb));
+    my $char=$idsdb->{$ids};
+    $char=decode('utf8',$char) if($perl58);
+    return $char ? unpack("U",$char) : undef;
+}
+
+# Tie $idsdb to the BerkeleyDB hash file "idsdb" under $omegadb_home; dies
+# with $! when the tie fails.
+sub get_idsdb{
+    my $db_path="$omegadb_home/idsdb";
+    tie(%{$idsdb}, "BerkeleyDB::Hash", -Filename => $db_path)
+        or die $!;
+}
+
+# Earlier variant of get_char_in_utf8mcs; nothing in this file calls it.
+#   arguments: a character; the input coding system.
+#   return:    the character unchanged for "Utf8mcs" or when
+#              get_chars_matching returns nothing, otherwise the first
+#              character it returns (decoded from UTF-8 under perl 5.8).
+# get_chars_matching is not defined in this file.
+sub get_char_in_utf8mcs_bak{
+    my($char,$in_cs)=@_;
+    return $char if($in_cs eq "Utf8mcs");
+    # "Utf8xxx" becomes the attribute name "ucs-xxx".
+    $in_cs=~s/Utf8/ucs-/;
+    my $code=unpack("U",$char);
+    my @matches=&get_chars_matching("$in_cs",$code);
+    return $char unless(@matches);
+    my $found=$matches[0];
+    $found=decode('utf8', $found) if($perl58);
+    return $found;
+}
+
+# Convert an input character to UTF-8mcs.
+#   arguments: <character>, <input coding system>
+#   return:    the character unchanged for "Utf8mcs" or when the mapping
+#              table has no true entry for it, otherwise the mapped character.
+sub get_char_in_utf8mcs{
+    my($char,$in_cs)=@_;
+    return $char if($in_cs eq "Utf8mcs");
+    my $code=unpack("U",$char);
+    # The mapping table for this coding system is tied on first use.
+    unless(defined($utf8mcs_map_from{$in_cs})){
+        &get_utf8mcs_map($in_cs);
+    }
+    my $mapped=$utf8mcs_map_from{$in_cs}->{$code};
+    return $mapped ? pack("U",$mapped) : $char;
+}
+
+# Tie $utf8mcs_map_from{<input coding system>} to the BerkeleyDB hash file
+# "ucs-<suffix>" under $omegadb_home, where <suffix> is the coding-system
+# name without its leading "Utf8"; dies with $! when the tie fails.
+sub get_utf8mcs_map{
+    my($in_cs)=@_;
+    (my $suffix=$in_cs)=~s/^Utf8//;
+    my $db_path="$omegadb_home/ucs-$suffix";
+    tie(%{$utf8mcs_map_from{$in_cs}}, "BerkeleyDB::Hash", -Filename => $db_path)
+        or die $!;
+}
+
+# Build the TeX macro that selects a character from the GT fonts.
+#   argument: character id.
+#   return:   "{\fontencoding{OT1}\fontfamily{gtNN}\selectfont\char<code>}"
+#             for the first attribute in @GT with a true value, where NN is
+#             the number in the attribute name and <code> is the attribute
+#             value OR-ed with 0x8080; undef when no attribute is set.
+# get_char_attribute is not defined in this file.
+sub get_macro_for_GT{
+    my($char_id)=@_;
+    my $char=pack("U",$char_id);
+    my($code,$font_no);
+    foreach (@GT){
+        $code=&get_char_attribute($char,$_);
+        next unless($code);
+        $font_no=$1 if(m/gt\-pj\-(\d+)/);
+        last;
+    }
+    return undef unless($code);
+    return "{\\fontencoding{OT1}\\fontfamily{".sprintf("gt%02d",$font_no)."}\\selectfont\\char".($code|0x8080)."}";
+}
+
+# Build the TeX macro that selects a character from the Hanziku fonts.
+#   argument: character id.
+#   return:   "{\fontencoding{OT1}\fontfamily{hzkNN}\selectfont\char<code>}"
+#             for the first attribute in @HZK with a true value, where NN is
+#             the number in the attribute name and <code> is the attribute
+#             value OR-ed with 0x8080; undef when no attribute is set.
+# get_char_attribute is not defined in this file.
+sub get_macro_for_HZK{
+    my($char_id)=@_;
+    my $char=pack("U",$char_id);
+    my($code,$font_no);
+    foreach (@HZK){
+        $code=&get_char_attribute($char,$_);
+        next unless($code);
+        $font_no=$1 if(m/hanziku\-(\d+)/);
+        last;
+    }
+    return undef unless($code);
+    return "{\\fontencoding{OT1}\\fontfamily{".sprintf("hzk%02d",$font_no)."}\\selectfont\\char".($code|0x8080)."}";
+}
+
+# Build the TeX macro that selects a character from the CDP font.
+#   argument: character id.
+#   return:   "{\fontencoding{OT1}\fontfamily{cdp}\selectfont\char<code>}"
+#             for the first attribute in @CDP with a true value, <code> being
+#             that value OR-ed with 0x8080; undef when no attribute is set.
+# get_char_attribute is not defined in this file.
+sub get_macro_for_CDP{
+    my($char_id)=@_;
+    my $char=pack("U",$char_id);
+    my $code;
+    foreach (@CDP){
+        $code=&get_char_attribute($char,$_);
+        last if($code);
+    }
+    return undef unless($code);
+    return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char".($code|0x8080)."}";
+}