4 use vars qw($opt_in_cs $opt_out_cs $opt_help $usage
6 $char $char_id $out_char $omegadb_home
7 $ids $ids_argc %ids $idsdb
8 $idsdata_file $ids_start $font_start
11 $inotp $perl56 $perl58
12 $useCDP $useHZK $useGT
17 use Chise_utils ':all';
23 if($^V and $^V ge v5.8){
25 }elsif($^V and $^V ge v5.6){
# Tell the user on STDERR that this Perl version cannot run the script.
# The message ends in a newline so it does not run into whatever is printed next.
28 print STDERR "This version is not supported.\n";
32 binmode(STDIN, ':encoding(utf8)');
33 binmode(STDOUT, ':encoding(utf8)');
# Directory that holds the database files this script ties to (see the tie calls on "$omegadb_home/...").
# NOTE(review): this interpolates the Perl variable $HOME, not $ENV{HOME} — confirm $HOME is set before this line.
36 $omegadb_home="$HOME/.chise";
# Read the string-valued options "in" and "out" into $opt_in_cs and $opt_out_cs.
38 &GetOptions("in=s"=>\$opt_in_cs,
40 "out=s"=>\$opt_out_cs,
46 Usage: $0 -i <input coding system> -o <cmap encoding>
48 Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
50 UniCNS, UniGB, UniJIS, UniKS
# At least one of the coding-system options was supplied on the command line.
53 if($opt_in_cs or $opt_out_cs){
# Split the program name ($0) around "To": $in_cs is the part starting at "Utf8",
# $out_cs is the run of word characters after "To" (e.g. a name of the form Utf8xxxToYyy).
# NOTE(review): presumably this is the fallback branch used when no options were given — confirm.
57 ($in_cs,$out_cs)=($0=~/(Utf8.+)To(\w+)/);
62 # utf-8-mcs,utf-8-cns,utf-8-gb,utf-8-jis,utf-8-ks,
64 # UniCNS,UniGB,UniJIS,UniKS
67 or not defined($in_cs)
68 or not defined($out_cs)){
# Name of the IDS data file. It is Perl source and is loaded with require.
73 $idsdata_file="idsdata.pl";
78 require $idsdata_file;
85 "=gt-pj-1","=gt-pj-2","=gt-pj-3","=gt-pj-4","=gt-pj-5","=gt-pj-6","=gt-pj-7","=gt-pj-8","=gt-pj-9","=gt-pj-10","=gt-pj-11"
86 #,"=gt-pj-k1","=gt-pj-k2"
88 @HZK=("=hanziku-1","=hanziku-10","=hanziku-11","=hanziku-12","=hanziku-2","=hanziku-3","=hanziku-4","=hanziku-5","=hanziku-6","=hanziku-7","=hanziku-8","=hanziku-9");
92 # temporary fix for using in OTP for perl 5.6.
93 s/(.)/pack("c",unpack("U",$1))/ge if($inotp
# In OTP mode with a utf8 input coding system, decode the UTF-8 bytes in $_ into characters
# (the rest of the condition is on a line not visible in this excerpt).
97 $_=decode('utf8', $_) if ($inotp and $in_cs=~/utf8/i
# Pass every shortest "amp...;" span through tex_de_er() and substitute the result.
99 s/(amp.+?;)/&tex_de_er($1)/ge;
100 # s/(&.+?;)/&tex_de_er($1)/ge;
# Convert the captured character to UTF-8mcs and take its code point.
102 $char=&get_char_in_utf8mcs($1,$in_cs);
103 $char_id=unpack("U",$char);
# Add this character to the IDS being collected and update the count of components still expected.
105 ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$char);
# The collected IDS names a known character, and that character has an output form in $out_cs.
107 if(($char_id=&get_char_id_for_ids($ids))
108 and(($out_char=&get_output_char($char_id,$out_cs)))){
# Emit the replacement built by replace_ids(); under Perl 5.8 it is UTF-8 encoded first.
111 print &replace_ids($ids) if($perl56);
112 print encode('utf8', &replace_ids($ids)) if($perl58);
# U+2FF0..U+2FFF (the Ideographic Description Characters block): start a new IDS with this character.
116 }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
117 ($ids,$ids_argc)=&ids_rest("",0,$char);
# The character has an output form in $out_cs.
124 if(($out_char=&get_output_char($char_id,$out_cs))){
# U+20000..U+2A6DF (the CJK Unified Ideographs Extension B block).
126 }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
# No usable entry for this character yet: record the current $font_start / $ids_start pair for it.
127 unless(defined($ids{$char}) and $ids{$char}[1]>=0){
128 $ids{$char}[0]=$font_start;
129 $ids{$char}[1]=$ids_start;
# Emit a TeX group that switches to font family chiseNNN (NNN = recorded font number, zero padded)
# and prints \char with the recorded slot number.
136 print "{\\fontencoding{OT1}\\fontfamily{" .
137 sprintf("chise%03d",$ids{$char}[0]) .
138 "}\\selectfont\\char$ids{$char}[1]}";
# Emit the replacement built from the IDS that get_ids() returns for the character.
141 print &replace_ids(&get_ids($char));
# Report the unparsable IDS on STDERR and output U+3013 (GETA MARK) as a placeholder.
146 print STDERR "IDS parse error: $ids\n";
147 # print pack("U",0xfffd);
148 print pack("U",0x3013) if($perl56);
149 print encode('utf8',pack("U",0x3013)) if($perl58);
# Open the IDS data file for rewriting. The three-argument form keeps the file name
# from being interpreted as part of the mode, and the die message names the file and the OS error.
155 open(IDSDATA,'>',$idsdata_file) or die "Cannot open $idsdata_file for writing: $!\n";
# The file is written as Perl source: a "use utf8;" line, one $ids{'KEY'}=[...] assignment
# per key of %ids, then assignments of the current $font_start and $ids_start.
156 print IDSDATA 'use utf8;',"\n";
157 foreach $ids (keys %ids){
# Perl 5.6 writes the key as is; Perl 5.8 UTF-8 encodes it first.
# join is a list operator, so the trailing "];\n" becomes its last element and each line
# ends in ",];" — Perl accepts the trailing comma inside [...].
# NOTE(review): the key is placed between single quotes without escaping; a key containing
# ' or \ would produce broken source — confirm keys can never contain them.
158 print IDSDATA '$ids{\'',$ids,'\'}=[',join ",",@{$ids{$ids}},"];\n" if($perl56);
159 print IDSDATA '$ids{\'',encode('utf8',$ids),'\'}=[',join ",",@{$ids{$ids}},"];\n" if($perl58);
161 print IDSDATA '$font_start=',$font_start,";\n";
162 print IDSDATA '$ids_start=',$ids_start,";\n";
# Reduce a string of the form "amp<text>;" to just <text>.
170 $er=~s/^amp(.*);$/$1/;
171 # $er=~s/^&(.*);$/$1/;
# Arguments: the IDS collected so far, the number of components still expected, and the next character.
# Returns the updated ($ids,$ids_argc) pair.
181 my($ids,$ids_argc,$char)=@_;
# Value reported by ids_argc() for this character
# (presumably the number of operands it takes as an IDS operator — confirm).
183 $argc=&ids_argc($char);
# Add $argc when nothing was pending, otherwise $argc-1
# (presumably because the character itself fills one pending slot — confirm).
185 $ids_argc+= $ids_argc==0 ? $argc : $argc-1;
# Append the character; under Perl 5.8 it is UTF-8 encoded first.
189 $ids.=$char if($perl56);
190 $ids.=encode('utf8',$char) if($perl58);
191 return ($ids,$ids_argc);
# Normalize the IDS with "UniJIS" as the target coding system.
196 $ids=&normalize_ids($ids,"UniJIS");
197 # return pack("U",0xfffd) if($ids!~/[$idc]/);
# Return U+3013 when the IDS contains no character from $idc, or when it matches the class on the next line.
# NOTE(review): in [\x{10000}-] the trailing "-" is a literal hyphen, so the class matches only
# U+10000 or "-". If "any character at or above U+10000" was intended, the range needs an
# upper bound — confirm the intent before changing it.
198 return pack("U",0x3013) if(($ids!~/[$idc]/)
199 or($ids=~/[\x{10000}-]/));
# No usable entry for this IDS yet: record the current $font_start / $ids_start pair for it.
201 unless(defined($ids{$ids}) and $ids{$ids}[1]>=0){
202 $ids{$ids}[0]=$font_start;
203 $ids{$ids}[1]=$ids_start;
# TeX group that switches to font family chiseNNN and prints \char with the recorded slot number.
210 return "{\\fontencoding{OT1}\\fontfamily{".sprintf("chise%03d",$ids{$ids}[0])."}\\selectfont\\char$ids{$ids}[1]}";
# Under Perl 5.8, decode the IDS from UTF-8 bytes into characters.
215 $ids = decode('utf8', $ids) if $perl58;
# Rewrite a name of the form "Uni<X>" to "ucs-<x>" (lower-cased), e.g. "UniJIS" -> "ucs-jis".
216 $out_cs=~s/Uni(.+)/"ucs-".lc($1)/e;
218 my($char,$char_id,$output_char_id);
# Walk the IDS one character at a time.
219 while($ids=~m/(.)/g){
221 $char_id=unpack("U",$char);
# Look the character up under $out_cs first, then under "=ucs", then under "ucs";
# the first value found is appended to $output_ids as a character.
224 }elsif($output_char_id=&get_char_attribute($char,$out_cs)){
225 $output_ids.=pack("U",$output_char_id);
226 }elsif($output_char_id=&get_char_attribute($char,"=ucs")){
227 $output_ids.=pack("U",$output_char_id);
228 }elsif($output_char_id=&get_char_attribute($char,"ucs")){
229 $output_ids.=pack("U",$output_char_id);
# Return U+FFFE (presumably when none of the lookups above succeeded — the branch header is not visible here).
231 return pack("U",0xfffe);
# Arguments: a character code point and the output coding-system name.
238 my($char_id,$out_cs)=@_;
239 my($out_char_id,$suffix);
# No map for this coding system yet (the branch body is not visible here; presumably it loads the map).
241 if(not defined($cmap_to{$out_cs})){
# The map has a true entry for this code point: return the mapped code point as a character.
244 if($out_char_id=$cmap_to{$out_cs}->{$char_id}){
245 return pack("U",$out_char_id);
# Font fallbacks, tried in the order GT, HZK, CDP. Each is consulted only when its
# $use... flag is set, and its macro is returned only when the lookup yields a true value.
247 return $gt if($useGT and $gt=&get_macro_for_GT($char_id));
248 return $hzk if($useHZK and $hzk=&get_macro_for_HZK($char_id));
249 return $cdp if($useCDP and $cdp=&get_macro_for_CDP($char_id));
# Tie the map for $out_cs to the BerkeleyDB hash file named $out_cs under $omegadb_home.
# NOTE(review): BerkeleyDB reports failures in $BerkeleyDB::Error; $! may not describe the problem — confirm.
256 tie %{$cmap_to{$out_cs}}, "BerkeleyDB::Hash",
257 -Filename => "$omegadb_home/$out_cs" or die $!;
# Fetch the IDS of $char: prefer the "ids-aggregated" attribute and fall back to "ids".
# "||" is used on purpose: with the low-precedence "or" the statement parses as
# ($ids = first lookup) or (second lookup), so the fallback value was computed but never stored.
263 $ids=&get_char_attribute($char,"ids-aggregated")
264 || &get_char_attribute($char,"ids");
265 # or &get_char_attribute($char,"ideographic-structure");
# Under Perl 5.8, decode the attribute value from UTF-8 bytes into characters.
266 $ids=decode('utf8', $ids) if($perl58);
# Look the IDS up in the IDS database and return the code point of the character stored for it.
271 sub get_char_id_for_ids{
# Under Perl 5.8, decode the IDS from UTF-8 bytes into characters.
274 $ids=decode('utf8', $ids) if($perl58);
275 # $ids="(?".(join " ?",(split(//,$ids))).")";
# Open the database on first use. Called as "&name" without parentheses, so the current @_ is
# passed along (harmless as long as get_idsdb ignores its arguments).
276 &get_idsdb if(not defined($idsdb));
277 $char=$idsdb->{$ids};
278 $char=decode('utf8',$char) if($perl58);
# Code point of the first character of the stored value.
280 return unpack("U",$char);
# Tie the hash behind $idsdb to the BerkeleyDB hash file "idsdb" under $omegadb_home.
# NOTE(review): BerkeleyDB reports failures in $BerkeleyDB::Error; $! may not describe the problem — confirm.
287 tie %{$idsdb}, "BerkeleyDB::Hash",
288 -Filename => "$omegadb_home/idsdb" or die $!;
# Variant of get_char_in_utf8mcs that looks the character up through get_chars_matching()
# (presumably a retained older implementation, going by the _bak suffix — confirm before removing).
291 sub get_char_in_utf8mcs_bak{
# Input already in Utf8mcs: return the character unchanged.
293 return $char if($in_cs eq "Utf8mcs");
294 my($char_id,$output_char);
# Replace the first "Utf8" in the coding-system name with "ucs-".
295 $in_cs=~s/Utf8/ucs-/;
296 $char_id=unpack("U",$char);
# Take the first element returned by get_chars_matching(); under Perl 5.8, decode it from UTF-8.
297 if(($output_char)=&get_chars_matching("$in_cs",$char_id)){
298 $output_char=decode('utf8', $output_char) if($perl58);
# Convert one character from the input coding system to UTF-8mcs using a per-coding-system map.
305 sub get_char_in_utf8mcs{
306 # argument: <character>, <input coding system>
307 # return: character in UTF-8mcs.
309 my($char_id,$output_char_id);
# Input already in Utf8mcs: return the character unchanged.
310 return $char if($in_cs eq "Utf8mcs");
311 $char_id=unpack("U",$char);
# Load the map for this coding system on first use.
312 &get_utf8mcs_map($in_cs) if(not defined($utf8mcs_map_from{$in_cs}));
# The map has a true entry for this code point: return the mapped code point as a character.
313 if($output_char_id=$utf8mcs_map_from{$in_cs}->{$char_id}){
314 return pack("U",$output_char_id);
# Strip the leading "Utf8" from the coding-system name; the map file is "ucs-<suffix>" under $omegadb_home.
323 ($suffix=$in_cs)=~s/^Utf8//;
# NOTE(review): BerkeleyDB reports failures in $BerkeleyDB::Error; $! may not describe the problem — confirm.
324 tie %{$utf8mcs_map_from{$in_cs}}, "BerkeleyDB::Hash",
325 -Filename => "$omegadb_home/ucs-$suffix" or die $!;
# Build the TeX group that prints the character from a gtNN font, based on a "gt-pj-N" attribute of the character.
328 sub get_macro_for_GT{
331 $char=pack("U",$char_id);
# $_ is the attribute name being tried (the loop header is not visible here; presumably it iterates over @GT).
333 if($gt=&get_char_attribute($char,$_)){
# The number in the attribute name becomes the font number.
334 m/gt\-pj\-(\d+)/ and $GT=$1;
# Font family is "gt" plus the two-digit font number; the \char number is the attribute value with bits 0x8080 set.
339 return "{\\fontencoding{OT1}\\fontfamily{".sprintf("gt%02d",$GT)."}\\selectfont\\char".($gt|0x8080)."}";
# Build the TeX group that prints the character from an hzkNN font, based on a "hanziku-N" attribute of the character.
345 sub get_macro_for_HZK{
348 $char=pack("U",$char_id);
# $_ is the attribute name being tried (the loop header is not visible here; presumably it iterates over @HZK).
350 if($hzk=&get_char_attribute($char,$_)){
# The number in the attribute name becomes the font number.
351 m/hanziku\-(\d+)/ and $HZK=$1;
# Font family is "hzk" plus the two-digit font number; the \char number is the attribute value with bits 0x8080 set.
356 return "{\\fontencoding{OT1}\\fontfamily{".sprintf("hzk%02d",$HZK)."}\\selectfont\\char".($hzk|0x8080)."}";
# Build the TeX group that prints the character from the "cdp" font family.
362 sub get_macro_for_CDP{
365 $char=pack("U",$char_id);
# $_ is the attribute name being tried (the loop header is not visible in this excerpt).
367 if($cdp=&get_char_attribute($char,$_)){
# The \char number is the attribute value with bits 0x8080 set.
372 return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char".($cdp|0x8080)."}";