4 use vars qw($opt_in_cs $opt_out_cs $opt_help $usage
7 $char $char_id $out_char $omegadb_home
8 $ids $ids_argc %ids $idsdb
9 $idsdata_file $ids_start $font_start
12 $inotp $perl56 $perl58
13 $useCDP $useHZK $useGT
18 use Chise_utils ':all';
20 my $strictly_forbidden_after = "
41 my $forbidden_after = "\x{0000}";
44 my $slightly_forbidden_after = "
62 # All these characters are allowed to protrude
64 my $strictly_forbidden_before = "
99 my $forbidden_before = "
128 my $slightly_forbidden_before = "
150 my $asian = "\x{1100}-\x{11FF} | \x{2E80}-\x{D7AF} |
151 \x{F900}-\x{FAFF} | \x{FE30}-\x{FE4F} |
154 my $space = "\x{0020} | \x{0009} | \x{000A} | \x{000C} | \x{000D} ";
161 if($^V and $^V ge v5.8){
163 }elsif($^V and $^V ge v5.6){
166 print STDERR "This version is not supported."; # emitted on STDERR so it never mixes with the converted text on STDOUT
170 binmode(STDIN, ':encoding(utf8)');
171 binmode(STDOUT, ':encoding(utf8)');
174 $omegadb_home="/Users/izumi/.chise";
176 &GetOptions("in=s"=>\$opt_in_cs,
178 "out=s"=>\$opt_out_cs,
184 Usage: $0 -i <input coding system> -o <cmap encoding>
186 Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
188 UniCNS, UniGB, UniJIS, UniKS
191 if($opt_in_cs or $opt_out_cs){
195 ($in_cs,$out_cs)=($0=~/(Utf8.+)To(\w+)/);
200 # utf-8-mcs,utf-8-cns,utf-8-gb,utf-8-jis,utf-8-ks,
202 # UniCNS,UniGB,UniJIS,UniKS
205 or not defined($in_cs)
206 or not defined($out_cs)){
211 $idsdata_file="idsdata.pl";
215 if(-e $idsdata_file){
216 require $idsdata_file;
223 "=gt-pj-1","=gt-pj-2","=gt-pj-3","=gt-pj-4","=gt-pj-5","=gt-pj-6","=gt-pj-7","=gt-pj-8","=gt-pj-9","=gt-pj-10","=gt-pj-11"
224 #,"=gt-pj-k1","=gt-pj-k2"
226 @HZK=("=hanziku-1","=hanziku-10","=hanziku-11","=hanziku-12","=hanziku-2","=hanziku-3","=hanziku-4","=hanziku-5","=hanziku-6","=hanziku-7","=hanziku-8","=hanziku-9");
230 # Temporary fix for use in OTP under perl 5.6.
231 s/(.)/pack("c",unpack("U",$1))/ge if($inotp
235 $_=decode('utf8', $_) if ($inotp and $in_cs=~/utf8/i
237 s/(amp.+?;)/&tex_de_er($1)/ge;
238 # s/(&.+?;)/&tex_de_er($1)/ge;
239 # s/^(.*)$/&add_break($1)/e;
241 for($i=0;$i<=$#chars;$i++){
243 $char=&get_char_in_utf8mcs($chars[$i],$in_cs);
244 $char_id=unpack("U",$char);
247 ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$char);
249 if(($char_id=&get_char_id_for_ids($ids))
250 and(($out_char=&get_output_char($char_id,$out_cs)))){
251 print $out_char,&add_break($i);
253 print &replace_ids($ids),&add_break($i) if($perl56);
254 print encode('utf8', &replace_ids($ids)),&add_break($i) if($perl58);
258 }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
259 ($ids,$ids_argc)=&ids_rest("",0,$char);
266 if(($out_char=&get_output_char($char_id,$out_cs))){
267 print $out_char,&add_break($i);
268 }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
269 unless(defined($ids{$char}) and $ids{$char}[1]>=0){
270 $ids{$char}[0]=$font_start;
271 $ids{$char}[1]=$ids_start;
278 print "{\\fontencoding{OT1}\\fontfamily{" .
279 sprintf("chise%03d",$ids{$char}[0]) .
280 "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
283 print &replace_ids(&get_ids($char)),&add_break($i);
288 print STDERR "IDS parse error: $ids\n";
289 # print pack("U",0xfffd);
290 print pack("U",0x3013) if($perl56);
291 print encode('utf8',pack("U",0x3013)) if($perl58);
297 open(IDSDATA,'>',$idsdata_file) or die "Cannot open $idsdata_file for writing: $!"; # 3-arg open keeps the mode separate from the file name; report which file failed and why
298 print IDSDATA 'use utf8;',"\n";
299 foreach $ids (keys %ids){
300 print IDSDATA '$ids{\'',$ids,'\'}=[',join ",",@{$ids{$ids}},"];\n" if($perl56);
301 print IDSDATA '$ids{\'',encode('utf8',$ids),'\'}=[',join ",",@{$ids{$ids}},"];\n" if($perl58);
303 print IDSDATA '$font_start=',$font_start,";\n";
304 print IDSDATA '$ids_start=',$ids_start,";\n";
312 $er=~s/^amp(.*);$/$1/;
313 # $er=~s/^&(.*);$/$1/;
325 # $line=~s/(.)($strictly_forbidden_before)($not_strictly_forbidden_before)/
326 # $1."\\CJKunbreakablekernone \\CJKprotrude ".$2.$3/egx;
327 # $line=~s/(.)($strictly_forbidden_before)($strictly_forbidden_before)/
328 # $1."\\CJKunbreakablekernone ".$2.$3/egx;
330 # $line=~s/(.)($strictly_forbidden_before)/
331 # $1."\\CJKunbreakablekernone ".$2/egx;
332 # $line=~s/(.)($forbidden_before)/
333 # $1."\\CJKunbreakablekerntwo ".$2/egx;
334 # $line=~s/(.)($slightly_forbidden_before)/
335 # $1."\\CJKunbreakablekernthree ".$2/egx;
337 # $line=~s/($forbidden_after)(.)/
338 # $1."\\CJKunbreakablekerntwo ".$2/egx;
339 # $line=~s/($strictly_forbidden_after)(. )/
340 # $1."\\CJKunbreakablekernone ".$2/egx;
341 # $line=~s/($slightly_forbidden_after)(.)/
342 # $1."\\CJKunbreakablekernthree ".$2/egx;
344 # $line=~s/($asian)(.)/$1\\CJKbreakablekern $2/g;
345 # $line=~s/(.)($asian)/$1\\CJKbreakablekern $2/g;
349 if(($chars[$i+1]=~m/[$strictly_forbidden_before]/x)
350 and($chars[$i+2]=~m/[^$strictly_forbidden_before]/x)){
351 return "\\CJKunbreakablekernone \\CJKprotrude ";
352 }elsif(($chars[$i+1]=~m/[$strictly_forbidden_before]/x)
353 and($chars[$i+2]=~m/[$strictly_forbidden_before]/x)){
354 return "\\CJKunbreakablekernone ";
357 if($chars[$i+1]=~m/[$strictly_forbidden_before]/x){
358 return "\\CJKunbreakablekernone ";
359 }elsif($chars[$i+1]=~m/[$forbidden_before]/x){
360 return "\\CJKunbreakablekerntwo ";
361 }elsif($chars[$i+1]=~m/[$slightly_forbidden_before]/x){
362 return "\\CJKunbreakablekernthree ";
366 if($chars[$i]=~m/[$forbidden_after]/x){
367 return "\\CJKunbreakablekerntwo ";
368 }elsif($chars[$i]=~m/[$strictly_forbidden_after]/x){
369 return "\\CJKunbreakablekernone ";
370 }elsif($chars[$i]=~m/[$slightly_forbidden_after]/x){
371 return "\\CJKunbreakablekernthree ";
374 return "\\CJKbreakablekern ";
378 my($ids,$ids_argc,$char)=@_;
380 $argc=&ids_argc($char);
382 $ids_argc+= $ids_argc==0 ? $argc : $argc-1;
386 $ids.=$char if($perl56);
387 $ids.=encode('utf8',$char) if($perl58);
388 return ($ids,$ids_argc);
393 $ids=&normalize_ids($ids,"UniJIS");
394 # return pack("U",0xfffd) if($ids!~/[$idc]/);
395 return pack("U",0x3013) if(($ids!~/[$idc]/) # give up with U+3013 when $ids contains no character from $idc ...
396 or($ids=~/[\x{10000}-]/)); # ... or matches this class. NOTE(review): as written the class matches only U+10000 or a literal '-' (a trailing hyphen is not a range); if "any character at or above U+10000" was intended, an upper bound is missing -- confirm
398 unless(defined($ids{$ids}) and $ids{$ids}[1]>=0){
399 $ids{$ids}[0]=$font_start;
400 $ids{$ids}[1]=$ids_start;
407 return "{\\fontencoding{OT1}\\fontfamily{".sprintf("chise%03d",$ids{$ids}[0])."}\\selectfont\\char$ids{$ids}[1]}";
412 $ids = decode('utf8', $ids) if $perl58;
413 $out_cs=~s/Uni(.+)/"ucs-".lc($1)/e;
415 my($char,$char_id,$output_char_id);
416 while($ids=~m/(.)/g){
418 $char_id=unpack("U",$char);
421 }elsif($output_char_id=&get_char_attribute($char,$out_cs)){
422 $output_ids.=pack("U",$output_char_id);
423 }elsif($output_char_id=&get_char_attribute($char,"=ucs")){
424 $output_ids.=pack("U",$output_char_id);
425 }elsif($output_char_id=&get_char_attribute($char,"ucs")){
426 $output_ids.=pack("U",$output_char_id);
428 return pack("U",0xfffe);
435 my($char_id,$out_cs)=@_;
436 my($out_char_id,$suffix);
438 if(not defined($cmap_to{$out_cs})){
441 if($out_char_id=$cmap_to{$out_cs}->{$char_id}){
442 return pack("U",$out_char_id);
444 return $gt if($useGT and $gt=&get_macro_for_GT($char_id));
445 return $hzk if($useHZK and $hzk=&get_macro_for_HZK($char_id));
446 return $cdp if($useCDP and $cdp=&get_macro_for_CDP($char_id));
453 tie %{$cmap_to{$out_cs}}, "BerkeleyDB::Hash",
454 -Filename => "$omegadb_home/$out_cs" or die $!;
460 $ids=&get_char_attribute($char,"ids-aggregated") # preferred attribute
461 || &get_char_attribute($char,"ids"); # fallback; '||' binds tighter than '=', so the fallback value is what gets assigned (with 'or' it was evaluated and discarded)
462 # or &get_char_attribute($char,"ideographic-structure");
463 $ids=decode('utf8', $ids) if($perl58);
468 sub get_char_id_for_ids{
471 $ids=decode('utf8', $ids) if($perl58);
472 # $ids="(?".(join " ?",(split(//,$ids))).")";
473 &get_idsdb if(not defined($idsdb));
474 $char=$idsdb->{$ids};
475 $char=decode('utf8',$char) if($perl58);
477 return unpack("U",$char);
484 tie %{$idsdb}, "BerkeleyDB::Hash",
485 -Filename => "$omegadb_home/idsdb" or die $!;
488 sub get_char_in_utf8mcs_bak{
490 return $char if($in_cs eq "Utf8mcs");
491 my($char_id,$output_char);
492 $in_cs=~s/Utf8/ucs-/;
493 $char_id=unpack("U",$char);
494 if(($output_char)=&get_chars_matching("$in_cs",$char_id)){
495 $output_char=decode('utf8', $output_char) if($perl58);
502 sub get_char_in_utf8mcs{
503 # argument: <character>, <input coding system>
504 # return: character in UTF-8mcs.
506 my($char_id,$output_char_id);
507 return $char if($in_cs eq "Utf8mcs");
508 $char_id=unpack("U",$char);
509 &get_utf8mcs_map($in_cs) if(not defined($utf8mcs_map_from{$in_cs}));
510 if($output_char_id=$utf8mcs_map_from{$in_cs}->{$char_id}){
511 return pack("U",$output_char_id);
520 ($suffix=$in_cs)=~s/^Utf8//;
521 tie %{$utf8mcs_map_from{$in_cs}}, "BerkeleyDB::Hash",
522 -Filename => "$omegadb_home/ucs-$suffix" or die $!;
525 sub get_macro_for_GT{
528 $char=pack("U",$char_id);
530 if($gt=&get_char_attribute($char,$_)){
531 m/gt\-pj\-(\d+)/ and $GT=$1;
536 return "{\\fontencoding{OT1}\\fontfamily{".sprintf("gt%02d",$GT)."}\\selectfont\\char".($gt|0x8080)."}";
537 # return "\\GT{".sprintf("gt%02d",$GT)."}{\\char".($gt|0x8080)."}";
543 sub get_macro_for_HZK{
546 $char=pack("U",$char_id);
548 if($hzk=&get_char_attribute($char,$_)){
549 m/hanziku\-(\d+)/ and $HZK=$1;
554 return "{\\fontencoding{OT1}\\fontfamily{".sprintf("hzk%02d",$HZK)."}\\selectfont\\char".($hzk|0x8080)."}";
560 sub get_macro_for_CDP{
563 $char=pack("U",$char_id);
565 if($cdp=&get_char_attribute($char,$_)){
570 return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char".($cdp|0x8080)."}";