4 use vars qw($opt_in_cs $opt_out_cs $opt_help $usage
6 $char $char_id $out_char $omegadb_home
7 $ids $ids_argc %ids $idsdb
8 $idsdata_file $ids_start $font_start
11 $inotp $perl56 $perl58
12 $useCDP $useHZK $useGT
17 use Chise_utils ':all';
# Branch on the running interpreter version ($^V): 5.8 or newer is tested
# first, then 5.6 or newer. The branch bodies are not visible in this
# excerpt; presumably they set $perl58 / $perl56 -- confirm.
# NOTE(review): `ge` is a string comparison against a v-string literal;
# a numeric test on $] would state the intent more directly.
23 if($^V and $^V ge v5.8){
25 }elsif($^V and $^V ge v5.6){
# NOTE(review): the message below contains the typo "versin" and has no
# trailing newline.
28 print STDERR "This versin is not supported.";
# Put a UTF-8 encoding layer on both standard input and standard output.
32 binmode(STDIN, ':encoding(utf8)');
33 binmode(STDOUT, ':encoding(utf8)');
# Directory holding the BerkeleyDB files that this script ties as
# "$omegadb_home/<name>".
# NOTE(review): hard-coded to one user's home directory; the commented-out
# line is an alternative location. Consider making this configurable.
36 #$omegadb_home="/home/ttomabec/.chise";
37 $omegadb_home="/Users/izumi/.chise";
# Parse the "in" and "out" command-line options (each takes a string value)
# into $opt_in_cs and $opt_out_cs. The remainder of the option list is not
# visible in this excerpt.
39 &GetOptions("in=s"=>\$opt_in_cs,
41 "out=s"=>\$opt_out_cs,
47 Usage: $0 -i <input coding system> -o <cmap encoding>
49 Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
51 UniCNS, UniGB, UniJIS, UniKS
# Taken when at least one of the two coding-system options was supplied
# (branch body not visible in this excerpt).
54 if($opt_in_cs or $opt_out_cs){
# Derive both coding systems from the program name $0, which must contain
# "<Utf8...>To<word>"; the two captures become $in_cs and $out_cs.
# Presumably this is the fallback when no option was given -- confirm.
58 ($in_cs,$out_cs)=($0=~/(Utf8.+)To(\w+)/);
63 # utf-8-mcs,utf-8-cns,utf-8-gb,utf-8-jis,utf-8-ks,
65 # UniCNS,UniGB,UniJIS,UniKS
# Tail of a condition whose beginning is not visible: it also holds when
# either coding system is still undefined.
68 or not defined($in_cs)
69 or not defined($out_cs)){
# Name of the Perl data file that is loaded here with `require` and that
# this script later rewrites with the contents of %ids, $font_start and
# $ids_start.
74 $idsdata_file="idsdata.pl";
79 require $idsdata_file;
# Attribute names "=gt-pj-1" .. "=gt-pj-11". The start of this list is not
# visible; presumably it initialises the list scanned by get_macro_for_GT.
86 "=gt-pj-1","=gt-pj-2","=gt-pj-3","=gt-pj-4","=gt-pj-5","=gt-pj-6","=gt-pj-7","=gt-pj-8","=gt-pj-9","=gt-pj-10","=gt-pj-11"
87 #,"=gt-pj-k1","=gt-pj-k2"
# Attribute names "=hanziku-1" .. "=hanziku-12", listed in string-sorted
# order (1, 10, 11, 12, 2, ...).
89 @HZK=("=hanziku-1","=hanziku-10","=hanziku-11","=hanziku-12","=hanziku-2","=hanziku-3","=hanziku-4","=hanziku-5","=hanziku-6","=hanziku-7","=hanziku-8","=hanziku-9");
# Main per-input processing. Loop headers and several branch bodies are not
# visible in this excerpt, so the comments describe only the lines shown.
93 # temporary fix for using in OTP for perl 5.6.
# Re-pack every character of $_: take its code point (unpack "U") and emit
# it as a single signed char (pack "c"). The rest of the condition is on a
# line not shown.
94 s/(.)/pack("c",unpack("U",$1))/ge if($inotp
# Decode $_ from UTF-8 when $inotp is set and the input coding system name
# contains "utf8" (case-insensitive); the rest of the condition is not shown.
98 $_=decode('utf8', $_) if ($inotp and $in_cs=~/utf8/i
# Replace each "amp...;" token with the result of tex_de_er.
100 s/(amp.+?;)/&tex_de_er($1)/ge;
101 # s/(&.+?;)/&tex_de_er($1)/ge;
# Convert the captured character from the input coding system to UTF-8mcs
# and take its code point.
103 $char=&get_char_in_utf8mcs($1,$in_cs);
104 $char_id=unpack("U",$char);
# Append $char to the IDS being collected and update the operand counter.
106 ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$char);
# Taken when the collected IDS resolves to a character id AND that id has an
# output form in $out_cs (branch body not visible).
108 if(($char_id=&get_char_id_for_ids($ids))
109 and(($out_char=&get_output_char($char_id,$out_cs)))){
# Print the replace_ids rendering of the IDS: as-is under perl 5.6, UTF-8
# encoded under perl 5.8.
112 print &replace_ids($ids) if($perl56);
113 print encode('utf8', &replace_ids($ids)) if($perl58);
# U+2FF0..U+2FFF is the Unicode "Ideographic Description Characters" block:
# start a fresh IDS with this character.
117 }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
118 ($ids,$ids_argc)=&ids_rest("",0,$char);
# The character has a direct output form in $out_cs (body not visible).
125 if(($out_char=&get_output_char($char_id,$out_cs))){
# U+20000..U+2A6DF is the CJK Unified Ideographs Extension B block.
127 }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
# First sighting of this character (no entry, or a negative slot): record
# the current font number and slot for it. Where $font_start / $ids_start
# are advanced is not visible here.
128 unless(defined($ids{$char}) and $ids{$char}[1]>=0){
129 $ids{$char}[0]=$font_start;
130 $ids{$char}[1]=$ids_start;
# Emit a TeX font switch to family "chiseNNN" followed by \char<slot>.
137 print "{\\fontencoding{OT1}\\fontfamily{" .
138 sprintf("chise%03d",$ids{$char}[0]) .
139 "}\\selectfont\\char$ids{$char}[1]}";
# Print the replace_ids rendering of the character's own IDS from get_ids.
142 print &replace_ids(&get_ids($char));
# Report the offending IDS on STDERR and print U+3013 in its place
# (as-is under perl 5.6, UTF-8 encoded under perl 5.8).
147 print STDERR "IDS parse error: $ids\n";
148 # print pack("U",0xfffd);
149 print pack("U",0x3013) if($perl56);
150 print encode('utf8',pack("U",0x3013)) if($perl58);
# Rewrite $idsdata_file as Perl source: a "use utf8;" line, one
# "$ids{'<key>'}=[...];" assignment per entry of %ids, then the current
# $font_start and $ids_start.
# NOTE(review): two-argument open with a bareword handle, and `die` carries
# no message or $!. No close of IDSDATA is visible in this excerpt.
156 open(IDSDATA,">$idsdata_file") or die;
157 print IDSDATA 'use utf8;',"\n";
158 foreach $ids (keys %ids){
# Keys are written between single quotes without escaping, so a key that
# contains "'" or "\" would corrupt the generated file.
# Under perl 5.8 the key is UTF-8 encoded before it is written.
159 print IDSDATA '$ids{\'',$ids,'\'}=[',join ",",@{$ids{$ids}},"];\n" if($perl56);
160 print IDSDATA '$ids{\'',encode('utf8',$ids),'\'}=[',join ",",@{$ids{$ids}},"];\n" if($perl58);
162 print IDSDATA '$font_start=',$font_start,";\n";
163 print IDSDATA '$ids_start=',$ids_start,";\n";
# Strip the leading "amp" and the trailing ";" from $er, keeping only the
# text between them. (The enclosing sub is not visible; the main loop passes
# "amp...;" tokens to tex_de_er, so this is presumably its body.)
171 $er=~s/^amp(.*);$/$1/;
172 # $er=~s/^&(.*);$/$1/;
# Arguments: the IDS collected so far, the operand counter, and the next
# character. Returns the updated (IDS, counter) pair.
182 my($ids,$ids_argc,$char)=@_;
# Count reported by ids_argc() for this character (helper defined outside
# this excerpt).
184 $argc=&ids_argc($char);
# When the counter is 0 it becomes $argc; otherwise $argc-1 is added --
# presumably because this character itself fills one pending operand slot;
# confirm.
186 $ids_argc+= $ids_argc==0 ? $argc : $argc-1;
# Append the character: as-is under perl 5.6, UTF-8 encoded under perl 5.8.
190 $ids.=$char if($perl56);
191 $ids.=encode('utf8',$char) if($perl58);
192 return ($ids,$ids_argc);
# Normalise the IDS against the "UniJIS" coding system before looking it up.
197 $ids=&normalize_ids($ids,"UniJIS");
198 # return pack("U",0xfffd) if($ids!~/[$idc]/);
# Give up and return U+3013 when the IDS contains no character from the
# $idc class, or when the second pattern matches.
# NOTE(review): in `[\x{10000}-]` the hyphen is the last item in the class,
# so the class matches only U+10000 or a literal "-". If the intent was
# "any character at or above U+10000", an explicit upper bound is needed --
# confirm.
199 return pack("U",0x3013) if(($ids!~/[$idc]/)
200 or($ids=~/[\x{10000}-]/));
# First sighting of this IDS (no entry, or a negative slot): record the
# current font number and slot. Where the counters advance is not visible.
202 unless(defined($ids{$ids}) and $ids{$ids}[1]>=0){
203 $ids{$ids}[0]=$font_start;
204 $ids{$ids}[1]=$ids_start;
# TeX macro selecting font family "chiseNNN" and the recorded slot.
211 return "{\\fontencoding{OT1}\\fontfamily{".sprintf("chise%03d",$ids{$ids}[0])."}\\selectfont\\char$ids{$ids}[1]}";
# Per-character conversion of an IDS to the target coding system. The sub
# header is not visible; presumably this is normalize_ids, which is called
# with an IDS and "UniJIS" elsewhere in this script -- confirm.
# Under perl 5.8 the IDS arrives UTF-8 encoded and is decoded first.
216 $ids = decode('utf8', $ids) if $perl58;
# Turn a name such as "UniJIS" into the attribute name "ucs-jis".
217 $out_cs=~s/Uni(.+)/"ucs-".lc($1)/e;
219 my($char,$char_id,$output_char_id);
# Walk the IDS one character at a time ("." does not match a newline).
220 while($ids=~m/(.)/g){
222 $char_id=unpack("U",$char);
# Lookup order for the replacement code point: the target coding system's
# attribute, then "=ucs", then "ucs". The first true value wins; earlier
# branches of this chain are not visible.
225 }elsif($output_char_id=&get_char_attribute($char,$out_cs)){
226 $output_ids.=pack("U",$output_char_id);
227 }elsif($output_char_id=&get_char_attribute($char,"=ucs")){
228 $output_ids.=pack("U",$output_char_id);
229 }elsif($output_char_id=&get_char_attribute($char,"ucs")){
230 $output_ids.=pack("U",$output_char_id);
# Returns U+FFFE; presumably reached when no lookup succeeded -- the
# enclosing branch is not visible.
232 return pack("U",0xfffe);
# Arguments: a character id and the output coding system name.
# (Sub header not visible; get_output_char is called with exactly these
# two arguments elsewhere in this script.)
239 my($char_id,$out_cs)=@_;
240 my($out_char_id,$suffix);
# The map for this coding system has not been set up yet (body not visible;
# presumably it ties the BerkeleyDB file for $out_cs).
242 if(not defined($cmap_to{$out_cs})){
# Direct mapping found: return it as a character. A mapped value of 0 or ""
# is treated the same as "no mapping" because of the truth test.
245 if($out_char_id=$cmap_to{$out_cs}->{$char_id}){
246 return pack("U",$out_char_id);
# Otherwise fall back to a font macro, trying GT, then HZK, then CDP; each
# source is consulted only when its $use... flag is set.
248 return $gt if($useGT and $gt=&get_macro_for_GT($char_id));
249 return $hzk if($useHZK and $hzk=&get_macro_for_HZK($char_id));
250 return $cdp if($useCDP and $cdp=&get_macro_for_CDP($char_id));
# Tie the map for $out_cs to the BerkeleyDB hash file
# "$omegadb_home/$out_cs". `or` binds more loosely than the argument list,
# so `die $!` fires when tie itself fails.
257 tie %{$cmap_to{$out_cs}}, "BerkeleyDB::Hash",
258 -Filename => "$omegadb_home/$out_cs" or die $!;
# Look up the IDS of $char, preferring the "ids-aggregated" attribute.
# NOTE(review): `=` binds tighter than `or`, so this parses as
# ($ids = lookup("ids-aggregated")) or lookup("ids"); the result of the
# second lookup is discarded and never reaches $ids. If "ids" is meant to
# be a fallback, `||` (or parentheses around the alternatives) is needed.
264 $ids=&get_char_attribute($char,"ids-aggregated")
265 or &get_char_attribute($char,"ids");
266 # or &get_char_attribute($char,"ideographic-structure");
# Under perl 5.8 the attribute value is decoded from UTF-8.
267 $ids=decode('utf8', $ids) if($perl58);
# Resolve an IDS string to the code point of the character stored for it in
# the IDS database. Argument unpacking is on lines not visible here.
272 sub get_char_id_for_ids{
# Under perl 5.8 the IDS arrives UTF-8 encoded and is decoded first.
275 $ids=decode('utf8', $ids) if($perl58);
276 # $ids="(?".(join " ?",(split(//,$ids))).")";
# Set up the IDS database on first use.
277 &get_idsdb if(not defined($idsdb));
278 $char=$idsdb->{$ids};
279 $char=decode('utf8',$char) if($perl58);
# Code point of the first character of the stored value.
281 return unpack("U",$char);
# Tie $idsdb to the BerkeleyDB hash file "$omegadb_home/idsdb"; dies with $!
# when tie fails.
288 tie %{$idsdb}, "BerkeleyDB::Hash",
289 -Filename => "$omegadb_home/idsdb" or die $!;
# Variant of get_char_in_utf8mcs that resolves the character through
# get_chars_matching. Presumably an older implementation kept as a backup
# (the "_bak" suffix) -- confirm whether it is still called.
292 sub get_char_in_utf8mcs_bak{
# Input already in UTF-8mcs: nothing to convert.
294 return $char if($in_cs eq "Utf8mcs");
295 my($char_id,$output_char);
# "Utf8xxx" becomes the attribute name "ucs-xxx".
296 $in_cs=~s/Utf8/ucs-/;
297 $char_id=unpack("U",$char);
# Take the first character whose attribute $in_cs equals this code point;
# under perl 5.8 the result is decoded from UTF-8.
298 if(($output_char)=&get_chars_matching("$in_cs",$char_id)){
299 $output_char=decode('utf8', $output_char) if($perl58);
# Convert a character from the input coding system to UTF-8mcs using a
# per-coding-system map that is set up on first use.
306 sub get_char_in_utf8mcs{
307 # argument: <character>, <input coding system>
308 # return: character in UTF-8mcs.
310 my($char_id,$output_char_id);
# Input already in UTF-8mcs: return it unchanged.
311 return $char if($in_cs eq "Utf8mcs");
312 $char_id=unpack("U",$char);
# Set up the map for this coding system the first time it is needed.
313 &get_utf8mcs_map($in_cs) if(not defined($utf8mcs_map_from{$in_cs}));
# Mapping found: return the mapped code point as a character. What happens
# when no mapping exists is on lines not visible here.
314 if($output_char_id=$utf8mcs_map_from{$in_cs}->{$char_id}){
315 return pack("U",$output_char_id);
# Copy $in_cs into $suffix and drop its leading "Utf8", then tie the map for
# $in_cs to the BerkeleyDB hash file "$omegadb_home/ucs-<suffix>"; dies with
# $! when tie fails.
324 ($suffix=$in_cs)=~s/^Utf8//;
325 tie %{$utf8mcs_map_from{$in_cs}}, "BerkeleyDB::Hash",
326 -Filename => "$omegadb_home/ucs-$suffix" or die $!;
# Build the TeX macro that prints a character from a GT font: font family
# "gtNN" plus \char with the attribute value OR-ed with 0x8080.
# Argument unpacking and the loop that sets $_ are not visible here.
329 sub get_macro_for_GT{
332 $char=pack("U",$char_id);
# $_ is an attribute name; presumably one of the "=gt-pj-N" names -- confirm.
334 if($gt=&get_char_attribute($char,$_)){
# The number in the attribute name selects the font.
335 m/gt\-pj\-(\d+)/ and $GT=$1;
340 return "{\\fontencoding{OT1}\\fontfamily{".sprintf("gt%02d",$GT)."}\\selectfont\\char".($gt|0x8080)."}";
# Build the TeX macro that prints a character from a Hanziku font: font
# family "hzkNN" plus \char with the attribute value OR-ed with 0x8080.
# Argument unpacking and the loop that sets $_ are not visible here.
346 sub get_macro_for_HZK{
349 $char=pack("U",$char_id);
# $_ is an attribute name; presumably one of the "=hanziku-N" names in @HZK
# -- confirm.
351 if($hzk=&get_char_attribute($char,$_)){
# The number in the attribute name selects the font.
352 m/hanziku\-(\d+)/ and $HZK=$1;
357 return "{\\fontencoding{OT1}\\fontfamily{".sprintf("hzk%02d",$HZK)."}\\selectfont\\char".($hzk|0x8080)."}";
# Build the TeX macro that prints a character from the "cdp" font family:
# \char with the attribute value OR-ed with 0x8080.
# Argument unpacking and the source of $_ are not visible here.
363 sub get_macro_for_CDP{
366 $char=pack("U",$char_id);
# $_ is the attribute name used for the lookup (set outside the visible
# lines).
368 if($cdp=&get_char_attribute($char,$_)){
373 return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char".($cdp|0x8080)."}";