add temporary line break routine. need fix.
[chise/omega.git] / outCMAP
1 #!/usr/bin/perl
2
3 use strict;
4 use vars qw($opt_in_cs $opt_out_cs $opt_help $usage
5             $in_cs $out_cs
6             $i @chars
7             $char $char_id $out_char $omegadb_home
8             $ids $ids_argc %ids $idsdb
9             $idsdata_file $ids_start $font_start
10             %utf8mcs_map_from
11             %cmap_to
12             $inotp $perl56 $perl58
13             $useCDP $useHZK $useGT
14             @CDP @HZK @GT
15             );
16 use Getopt::Long;
17 use utf8;
18 use Chise_utils ':all';
19
20 my $strictly_forbidden_after = "
21       \x{0028} |
22       \x{005B} |
23       \x{007B} |
24       \x{2018} |
25       \x{201C} |
26       \x{3008} |
27       \x{300A} |
28       \x{300C} |
29       \x{300E} |
30       \x{3010} |
31       \x{3014} |
32       \x{3016} |
33       \x{FF08} |
34       \x{FF08} |
35       \x{FF3B} |
36       \x{FF5B} |
37       \x{FF5B} |
38       \x{FF62} 
39 ";
40
41 my $forbidden_after = "\x{0000}";
42
43 # ¥¥$$〒♯##¢¢££@@§
44 my $slightly_forbidden_after = "
45       \x{FFE5} |
46       \x{00A5} |
47       \x{FF04} |
48       \x{0024} |
49       \x{3012} |
50       \x{266F} |
51       \x{FF03} |
52       \x{0023} |
53       \x{FFE0} |
54       \x{00A2} |
55       \x{FFE1} |
56       \x{00A3} |
57       \x{FF20} |
58       \x{0040} |
59       \x{00A7}
60 ";
61
62 # All these characters are allowed to protrude
63 # in the right margin
64 my $strictly_forbidden_before = "
65       \x{0021} | # !
66       \x{002c} | # ,
67       \x{002e} | # .
68       \x{003a} | # :
69       \x{003b} | # ;
70       \x{003f} | # ?
71       \x{3001} | # 、
72       \x{3002} | # 。
73       \x{ff01} | # !
74       \x{ff0c} | # ,
75       \x{ff0e} | # .
76       \x{ff1a} | # :
77       \x{ff1b} | # ;
78       \x{ff1f} | # ?
79       \x{ff61} | # 。
80       \x{0029} | # )
81 #      \x{005d} | # ]
82       \x{007d} | # }
83       \x{2019} | # ’
84       \x{201d} | # ”
85       \x{3009} | # 〉
86       \x{300b} | # 》
87       \x{300d} | # 」
88       \x{300f} | # 』
89       \x{3011} | # 】
90       \x{3015} | # 〕
91       \x{3017} | # white 】
92       \x{ff09} | # )
93       \x{ff3d} | # ]
94       \x{ff5d} | # }
95       \x{ff5d} | # }
96       \x{ff63}   # 」
97 ";
98
99 my $forbidden_before = "
100       \x{30fc} | # ー
101       \x{3005} | # 々
102       \x{3041} | # ぁ
103       \x{3043} | # ぃ
104       \x{3045} | # ぅ
105       \x{3047} | # ぇ
106       \x{3049} | # ぉ
107       \x{3083} | # ゃ
108       \x{3085} | # ゅ
109       \x{3087} | # ょ
110       \x{3063} | # っ
111       \x{308e} | # ゎ
112       \x{30a1} | # ァ
113       \x{30a3} | # ィ
114       \x{30a5} | # ゥ
115       \x{30a7} | # ェ
116       \x{30a9} | # ォ
117       \x{30e3} | # ャ
118       \x{30e5} | # ュ
119       \x{30e7} | # ョ
120       \x{30c3} | # ッ
121       \x{30ee} | # ヮ
122       \x{30f5} | # ヵ
123       \x{30f6}   # ヶ
124 ";
125
126 #      \-       | # -
127
128 my $slightly_forbidden_before = "
129       \x{000a} | # ???
130       \#       | # #
131       \x{2010} | # ‐
132       \x{2012} | # −
133       \x{2030} | # ‰
134       \x{2032} | # ′
135       \x{2033} | # ″
136       \x{2103} | # ℃
137       \x{309b} | # ゛
138       \x{309c} | # ゜
139       \x{309d} | # ゝ
140       \x{309e} | # ゞ
141       \x{30fd} | # ヽ
142       \x{30fe} | # ヾ
143       \x{ff02} | # "
144       \x{ff05} | # %
145       \x{ff0d} | # -
146       \x{ff9e} | # ゙
147       \x{ff9f}   # ゚
148 ";
149
150 my $asian = "\x{1100}-\x{11FF} | \x{2E80}-\x{D7AF} | 
151              \x{F900}-\x{FAFF} | \x{FE30}-\x{FE4F} | 
152              \x{FF00}-";
153
154 my $space = "\x{0020} | \x{0009} | \x{000A} | \x{000C} | \x{000D} ";
155
156
157 $useGT=1;
158 $useHZK=0;
159 $useCDP=0;
160
161 if($^V and $^V ge v5.8){
162     $perl58=1;
163 }elsif($^V and $^V ge v5.6){
164     $perl56=1;
165 }else{
166     print STDERR "This versin is not supported.";
167 }
168 if($perl58){
169     eval "use Encode";
170     binmode(STDIN, ':encoding(utf8)');
171     binmode(STDOUT, ':encoding(utf8)');
172 }
173
174 $omegadb_home="/Users/izumi/.chise";
175
176 &GetOptions("in=s"=>\$opt_in_cs,
177             "i=s"=>\$opt_in_cs,
178             "out=s"=>\$opt_out_cs,
179             "o=s"=>\$opt_out_cs,
180             "help",\$opt_help,
181             "h",\$opt_help);
182
183 $usage=<<EOF;
184 Usage: $0 -i <input coding system> -o <cmap encoding>
185     input coding system:
186       Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
187     cmap encoding:
188       UniCNS, UniGB, UniJIS, UniKS
189 EOF
190
191 if($opt_in_cs or $opt_out_cs){
192     $in_cs=$opt_in_cs;
193     $out_cs=$opt_out_cs;
194 }elsif(@ARGV==0){
195     ($in_cs,$out_cs)=($0=~/(Utf8.+)To(\w+)/);
196     $inotp=1;
197 }
198
199 # $in_cs:
200 #   utf-8-mcs,utf-8-cns,utf-8-gb,utf-8-jis,utf-8-ks,
201 # $out_cs:
202 #   UniCNS,UniGB,UniJIS,UniKS
203
204 if($opt_help
205    or not defined($in_cs)
206    or not defined($out_cs)){
207     print $usage;
208     exit 1;
209 }
210
211 $idsdata_file="idsdata.pl";
212 $ids_start=0x00; 
213 $font_start=0;
214
215 if(-e $idsdata_file){
216     require $idsdata_file;
217 }
218
219 $ids_argc=0;
220 $ids="";
221
222 @GT=(#"=gt","=gt-k",
223      "=gt-pj-1","=gt-pj-2","=gt-pj-3","=gt-pj-4","=gt-pj-5","=gt-pj-6","=gt-pj-7","=gt-pj-8","=gt-pj-9","=gt-pj-10","=gt-pj-11"
224      #,"=gt-pj-k1","=gt-pj-k2"
225      );
226 @HZK=("=hanziku-1","=hanziku-10","=hanziku-11","=hanziku-12","=hanziku-2","=hanziku-3","=hanziku-4","=hanziku-5","=hanziku-6","=hanziku-7","=hanziku-8","=hanziku-9");
227 @CDP=("=big5-cdp");
228
229 while(<>){
230     # temporary fix for using in OTP for perl 5.6.
231     s/(.)/pack("c",unpack("U",$1))/ge if($inotp
232                                          and $in_cs=~/utf8/i
233                                          and $perl56);
234     # for perl 5.8.
235     $_=decode('utf8', $_)  if ($inotp and $in_cs=~/utf8/i
236                                and $perl58);
237     s/(amp.+?;)/&tex_de_er($1)/ge;
238 #    s/(&.+?;)/&tex_de_er($1)/ge;
239 #    s/^(.*)$/&add_break($1)/e;
240     @chars=split(//);
241     for($i=0;$i<=$#chars;$i++){
242 #    while(m/(.)/g){
243         $char=&get_char_in_utf8mcs($chars[$i],$in_cs);
244         $char_id=unpack("U",$char);
245         if($ids_argc>0){
246             # It's in IDS.
247             ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$char);
248             if($ids_argc==0){
249                 if(($char_id=&get_char_id_for_ids($ids))
250                    and(($out_char=&get_output_char($char_id,$out_cs)))){
251                     print $out_char,&add_break($i);
252                 }else{
253                     print &replace_ids($ids),&add_break($i) if($perl56);
254                     print encode('utf8', &replace_ids($ids)),&add_break($i) if($perl58);
255                 }
256                 $ids="";
257             }
258         }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
259             ($ids,$ids_argc)=&ids_rest("",0,$char);
260             next;
261         }else{
262             if($char_id<=0xff){
263                 print $char;
264                 next;
265             }
266             if(($out_char=&get_output_char($char_id,$out_cs))){
267                 print $out_char,&add_break($i);
268             }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
269                 unless(defined($ids{$char}) and $ids{$char}[1]>=0){
270                     $ids{$char}[0]=$font_start;
271                     $ids{$char}[1]=$ids_start;
272                     $ids_start++;
273                     if($ids_start>255){
274                         $ids_start=0;
275                         $font_start++;
276                     }
277                 }
278                 print "{\\fontencoding{OT1}\\fontfamily{" .
279                     sprintf("chise%03d",$ids{$char}[0]) .
280                     "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
281                 next;
282             }else{
283                 print &replace_ids(&get_ids($char)),&add_break($i);
284             }
285         }
286     }
287     if($ids_argc>0){
288         print STDERR "IDS parse error: $ids\n";
289 #       print pack("U",0xfffd);
290         print pack("U",0x3013) if($perl56);
291         print encode('utf8',pack("U",0x3013)) if($perl58);
292         $ids="";
293         $ids_argc=0;
294     }
295 }
296
297 open(IDSDATA,">$idsdata_file") or die;
298 print IDSDATA 'use utf8;',"\n";
299 foreach $ids (keys %ids){
300     print IDSDATA '$ids{\'',$ids,'\'}=[',join ",",@{$ids{$ids}},"];\n" if($perl56);
301     print IDSDATA '$ids{\'',encode('utf8',$ids),'\'}=[',join ",",@{$ids{$ids}},"];\n" if($perl58);
302 }
303 print IDSDATA '$font_start=',$font_start,";\n";
304 print IDSDATA '$ids_start=',$ids_start,";\n";
305 print IDSDATA "1;";
306
307 exit 0;
308
309 sub tex_de_er{
310     my($er)=@_;
311     my($out);
312     $er=~s/^amp(.*);$/$1/;
313 #    $er=~s/^&(.*);$/$1/;
314     $out=&de_er($er);
315     if($out){
316         return $out;
317     }else{
318         return "amp$er;";
319     }
320 }
321
322 sub add_break{
323     my($i)=@_;
324
325 #     $line=~s/(.)($strictly_forbidden_before)($not_strictly_forbidden_before)/
326 #       $1."\\CJKunbreakablekernone \\CJKprotrude ".$2.$3/egx;
327 #     $line=~s/(.)($strictly_forbidden_before)($strictly_forbidden_before)/
328 #       $1."\\CJKunbreakablekernone ".$2.$3/egx;
329
330 #     $line=~s/(.)($strictly_forbidden_before)/
331 #       $1."\\CJKunbreakablekernone ".$2/egx;
332 #     $line=~s/(.)($forbidden_before)/
333 #       $1."\\CJKunbreakablekerntwo ".$2/egx;
334 #     $line=~s/(.)($slightly_forbidden_before)/
335 #       $1."\\CJKunbreakablekernthree ".$2/egx;
336
337 #     $line=~s/($forbidden_after)(.)/
338 #       $1."\\CJKunbreakablekerntwo ".$2/egx;
339 #     $line=~s/($strictly_forbidden_after)(. )/
340 #       $1."\\CJKunbreakablekernone ".$2/egx;
341 #     $line=~s/($slightly_forbidden_after)(.)/
342 #       $1."\\CJKunbreakablekernthree ".$2/egx;
343
344 #     $line=~s/($asian)(.)/$1\\CJKbreakablekern $2/g;
345 #     $line=~s/(.)($asian)/$1\\CJKbreakablekern $2/g;
346
347     if($i<$#chars){
348         if($i<($#chars-1)){
349             if(($chars[$i+1]=~m/[$strictly_forbidden_before]/x)
350                and($chars[$i+2]=~m/[^$strictly_forbidden_before]/x)){
351                 return "\\CJKunbreakablekernone \\CJKprotrude ";
352             }elsif(($chars[$i+1]=~m/[$strictly_forbidden_before]/x)
353                and($chars[$i+2]=~m/[$strictly_forbidden_before]/x)){
354                 return "\\CJKunbreakablekernone ";
355             }
356         }else{
357             if($chars[$i+1]=~m/[$strictly_forbidden_before]/x){
358                 return "\\CJKunbreakablekernone ";
359             }elsif($chars[$i+1]=~m/[$forbidden_before]/x){
360                 return "\\CJKunbreakablekerntwo ";
361             }elsif($chars[$i+1]=~m/[$slightly_forbidden_before]/x){
362                 return "\\CJKunbreakablekernthree ";
363             }
364         }
365     }elsif($i>0){
366         if($chars[$i]=~m/[$forbidden_after]/x){
367             return "\\CJKunbreakablekerntwo ";
368         }elsif($chars[$i]=~m/[$strictly_forbidden_after]/x){
369             return "\\CJKunbreakablekernone ";
370         }elsif($chars[$i]=~m/[$slightly_forbidden_after]/x){
371             return "\\CJKunbreakablekernthree ";
372         }
373     }
374     return "\\CJKbreakablekern ";
375 }
376
377 sub ids_rest{
378     my($ids,$ids_argc,$char)=@_;
379     my($argc);
380     $argc=&ids_argc($char);
381     if($argc){
382         $ids_argc+= $ids_argc==0 ? $argc : $argc-1;
383     }else{
384         $ids_argc--;
385     }
386     $ids.=$char if($perl56);
387     $ids.=encode('utf8',$char) if($perl58);
388     return ($ids,$ids_argc);
389 }
390
391 sub replace_ids{
392     my($ids)=@_;
393     $ids=&normalize_ids($ids,"UniJIS");
394 #    return pack("U",0xfffd) if($ids!~/[$idc]/);
395     return pack("U",0x3013) if(($ids!~/[$idc]/)
396                                or($ids=~/[\x{10000}-]/));
397                     #irregular for KAGE.
398     unless(defined($ids{$ids}) and $ids{$ids}[1]>=0){
399         $ids{$ids}[0]=$font_start;
400         $ids{$ids}[1]=$ids_start;
401         $ids_start++;
402     }
403     if($ids_start>255){
404         $ids_start=0;
405         $font_start++;
406     }
407     return "{\\fontencoding{OT1}\\fontfamily{".sprintf("chise%03d",$ids{$ids}[0])."}\\selectfont\\char$ids{$ids}[1]}";
408 }
409
410 sub normalize_ids{
411     my($ids,$out_cs)=@_;
412     $ids = decode('utf8', $ids) if $perl58;
413     $out_cs=~s/Uni(.+)/"ucs-".lc($1)/e;
414     my $output_ids="";
415     my($char,$char_id,$output_char_id);
416     while($ids=~m/(.)/g){
417         $char=$1;
418         $char_id=unpack("U",$char);
419         if($char=~/[$idc]/){
420             $output_ids.=$char;
421         }elsif($output_char_id=&get_char_attribute($char,$out_cs)){
422             $output_ids.=pack("U",$output_char_id);
423         }elsif($output_char_id=&get_char_attribute($char,"=ucs")){
424             $output_ids.=pack("U",$output_char_id);
425         }elsif($output_char_id=&get_char_attribute($char,"ucs")){
426             $output_ids.=pack("U",$output_char_id);
427         }else{
428             return pack("U",0xfffe);
429         }
430     }
431     return $output_ids;
432 }
433
434 sub get_output_char{
435     my($char_id,$out_cs)=@_;
436     my($out_char_id,$suffix);
437     my($gt,$hzk,$cdp);
438     if(not defined($cmap_to{$out_cs})){
439         &get_cmap($out_cs);
440     }
441     if($out_char_id=$cmap_to{$out_cs}->{$char_id}){
442         return pack("U",$out_char_id);
443     }else{
444         return $gt if($useGT and $gt=&get_macro_for_GT($char_id));
445         return $hzk if($useHZK and $hzk=&get_macro_for_HZK($char_id));
446         return $cdp if($useCDP and $cdp=&get_macro_for_CDP($char_id));
447         return undef;
448     }
449 }
450
451 sub get_cmap{
452     my($out_cs)=@_;
453     tie %{$cmap_to{$out_cs}}, "BerkeleyDB::Hash",
454     -Filename => "$omegadb_home/$out_cs" or die $!;
455 }
456
457 sub get_ids{
458     my($char)=@_;
459     my $ids="";
460     $ids=&get_char_attribute($char,"ids-aggregated")
461         or &get_char_attribute($char,"ids");
462 #       or &get_char_attribute($char,"ideographic-structure");
463     $ids=decode('utf8', $ids) if($perl58);
464 #    $ids=~s/[? ()]//g;
465     return $ids;
466 }
467
468 sub get_char_id_for_ids{
469     my($ids)=@_;
470     my($char_id,$char);
471     $ids=decode('utf8', $ids) if($perl58);
472 #    $ids="(?".(join " ?",(split(//,$ids))).")";
473     &get_idsdb if(not defined($idsdb));
474     $char=$idsdb->{$ids};
475     $char=decode('utf8',$char) if($perl58);
476     if($char){
477         return unpack("U",$char);
478     }else{
479         return undef;
480     }
481 }
482
483 sub get_idsdb{
484     tie %{$idsdb}, "BerkeleyDB::Hash",
485     -Filename => "$omegadb_home/idsdb" or die $!;
486 }
487
488 sub get_char_in_utf8mcs_bak{
489     my($char,$in_cs)=@_;
490     return $char if($in_cs eq "Utf8mcs");
491     my($char_id,$output_char);
492     $in_cs=~s/Utf8/ucs-/;
493     $char_id=unpack("U",$char);
494     if(($output_char)=&get_chars_matching("$in_cs",$char_id)){
495         $output_char=decode('utf8', $output_char) if($perl58);
496         return $output_char;
497     }else{
498         return $char;
499     }
500 }
501
502 sub get_char_in_utf8mcs{
503     # argument: <character>, <input coding system>
504     # return:   character in UTF-8mcs.
505     my($char,$in_cs)=@_;
506     my($char_id,$output_char_id);
507     return $char if($in_cs eq "Utf8mcs");
508     $char_id=unpack("U",$char);
509     &get_utf8mcs_map($in_cs) if(not defined($utf8mcs_map_from{$in_cs}));
510     if($output_char_id=$utf8mcs_map_from{$in_cs}->{$char_id}){
511         return pack("U",$output_char_id);
512    }else{
513         return $char;
514    }
515 }
516
517 sub get_utf8mcs_map{
518     my($in_cs)=@_;
519     my($suffix);
520     ($suffix=$in_cs)=~s/^Utf8//;
521     tie %{$utf8mcs_map_from{$in_cs}}, "BerkeleyDB::Hash",
522     -Filename => "$omegadb_home/ucs-$suffix" or die $!;
523 }
524
525 sub get_macro_for_GT{
526     my($char_id)=@_;
527     my($char,$gt,$GT);
528     $char=pack("U",$char_id);
529     foreach (@GT){
530         if($gt=&get_char_attribute($char,$_)){
531             m/gt\-pj\-(\d+)/ and $GT=$1;
532             last;
533         }
534     }
535     if($gt){
536         return "{\\fontencoding{OT1}\\fontfamily{".sprintf("gt%02d",$GT)."}\\selectfont\\char".($gt|0x8080)."}";
537 #       return "\\GT{".sprintf("gt%02d",$GT)."}{\\char".($gt|0x8080)."}";
538     }else{
539         return undef;
540     }
541 }
542
543 sub get_macro_for_HZK{
544     my($char_id)=@_;
545     my($char,$hzk,$HZK);
546     $char=pack("U",$char_id);
547     foreach (@HZK){
548         if($hzk=&get_char_attribute($char,$_)){
549             m/hanziku\-(\d+)/ and $HZK=$1;
550             last;
551         }
552     }
553     if($hzk){
554         return "{\\fontencoding{OT1}\\fontfamily{".sprintf("hzk%02d",$HZK)."}\\selectfont\\char".($hzk|0x8080)."}";
555     }else{
556         return undef;
557     }
558 }
559
560 sub get_macro_for_CDP{
561     my($char_id)=@_;
562     my($char,$cdp);
563     $char=pack("U",$char_id);
564     foreach (@CDP){
565         if($cdp=&get_char_attribute($char,$_)){
566             last;
567         }
568     }
569     if($cdp){
570         return "{\\fontencoding{OT1}\\fontfamily{cdp}\\selectfont\\char".($cdp|0x8080)."}";
571     }else{
572         return undef;
573     }
574 }