#!/usr/bin/perl
+# ver.0.2
+
use strict;
-use vars qw($opt_in_cs $opt_out_cs $opt_help $usage
- $in_cs $out_cs
- $i @chars
- $char $char_id $out_char $omegadb_home
+use vars qw($opt_in_cs $opt_out_cs $opt_protrude
+ $opt_help $usage
+ $in_cs $out_cs $i @chars
+ @order $opt_order %order
+ $char $char_id $out_char
$ids $ids_argc %ids $idsdb
$idsdata_file $ids_start $font_start
%utf8mcs_map_from
- %cmap_to
$inotp $perl56 $perl58
- $useCDP $useHZK $useGT
@CDP @HZK @GT
);
use Getopt::Long;
use utf8;
use Chise_utils ':all';
+### Options ###
+
+$opt_order='jtcgk';
+$opt_protrude=0;# 1=true, 0=false.
+
+###
my $strictly_forbidden_after = '「【『[(〈“‘‘(〔{《{\[\(\x{3016}{「';
# \x{3016} | # white 【
# in the right margin
my $strictly_forbidden_before=
'!,.:;?、。!,.:;?。\)#}’”〉》」』】〕\x{3017})]}}」\]';
-# \x{3017} | # white 】
+### \x{3017} | # white 】
my $forbidden_before
= 'ー々ぁぃぅぇぉゃゅょっゎァィゥェォャュョッヮヵヶ';
my $space = '\x{0020}\x{0009}\x{000A}\x{000C}\x{000D}';
-$useGT=1;
-$useHZK=0;
-$useCDP=0;
-
if($^V and $^V ge v5.8){
$perl58=1;
}elsif($^V and $^V ge v5.6){
$perl56=1;
}else{
print STDERR "This versin is not supported.";
+ exit 1;
}
if($perl58){
eval "use Encode";
binmode(STDOUT, ':encoding(utf8)');
}
-$omegadb_home="/usr/local/lib/chise/omega/db";
-
&GetOptions("in=s"=>\$opt_in_cs,
- "i=s"=>\$opt_in_cs,
"out=s"=>\$opt_out_cs,
- "o=s"=>\$opt_out_cs,
- "help",\$opt_help,
- "h",\$opt_help);
+ "help",\$opt_help);
$usage=<<EOF;
Usage: $0 -i <input coding system> -o <cmap encoding>
input coding system:
Utf8mcs, Utf8cns, Utf8gb, Utf8jis, Utf8ks
cmap encoding:
- UniCNS, UniGB, UniJIS, UniKS
+ UniCNS, UniGB, UniJIS, UniKS, UniMulti
EOF
if($opt_in_cs or $opt_out_cs){
}
# $in_cs:
-# utf-8-mcs,utf-8-cns,utf-8-gb,utf-8-jis,utf-8-ks,
+# Utf8mcs,Utf8cns,Utf8gb,Utf8jis,Utf8ks,
# $out_cs:
-# UniCNS,UniGB,UniJIS,UniKS
+# UniCNS,UniGB,UniJIS,UniKS,UniMulti
+
+$in_cs=~s/Utf8/ucs\@/;
if($opt_help
or not defined($in_cs)
@HZK=("=hanziku-1","=hanziku-10","=hanziku-11","=hanziku-12","=hanziku-2","=hanziku-3","=hanziku-4","=hanziku-5","=hanziku-6","=hanziku-7","=hanziku-8","=hanziku-9");
@CDP=("=big5-cdp");
+%order=('c'=>'UniCNS',
+ 'g'=>'UniGB',
+ 'j'=>'UniJIS',
+ 'k'=>'UniKS',
+ 't'=>'GT',
+# not implemented yet.
+# 'h'=>'HZK',
+# 'd'=>'CDP',
+ );
+
+if(defined($opt_order)){
+ if($opt_order=~/^[cgjkt]*$/){
+ @order=split(//,$opt_order);
+ @order=map {$order{$_}} @order;
+ }else{
+ print STDERR "Invalid order!\n";
+ exit 1;
+ }
+}
+
while(<>){
- # temporary fix for using in OTP for perl 5.6.
- s/(.)/pack("c",unpack("U",$1))/ge if($inotp
- and $in_cs=~/utf8/i
- and $perl56);
- # for perl 5.8.
- $_=decode('utf8', $_) if ($inotp and $in_cs=~/utf8/i
- and $perl58);
+ if($perl56){
+ # for perl 5.6
+ if($inotp){
+ s/(.)/pack("c",&get_char_id(unpack("U",$1),$in_cs))/ge;
+ }else{
+ s/(.)/pack("U",&get_char_id(unpack("U",$1),$in_cs))/ge;
+ }
+ }elsif($perl58){
+ # for perl 5.8.
+ $_=decode('utf8', $_);
+ s/(.)/pack("U",&get_char_id(unpack("U",$1),$in_cs))/ge;
+ }
s/(amp.+?;)/&tex_de_er($1)/ge;
# s/(&.+?;)/&tex_de_er($1)/ge;
-# s/^(.*)$/&add_break($1)/e;
@chars=split(//);
for($i=0;$i<=$#chars;$i++){
-# while(m/(.)/g){
- $char=&get_char_in_utf8mcs($chars[$i],$in_cs);
+ $char=$chars[$i];
$char_id=unpack("U",$char);
- if($ids_argc>0){
- # It's in IDS.
- ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$char);
- if($ids_argc==0){
- if(($char_id=&get_char_id_for_ids($ids))
- and(($out_char=&get_output_char($char_id,$out_cs)))){
- print $out_char,&add_break($i);
- }else{
- print &replace_ids($ids),&add_break($i) if($perl56);
- print encode('utf8', &replace_ids($ids)),&add_break($i) if($perl58);
+
+ if($char_id<=0x20){
+ print $chars[$i];
+ next;
+ }elsif($char_id>0x20 and $char_id<=0xff){
+ # Basic Latin
+ # Latin-1 Supplement
+ print &latin_parse();
+ next;
+ }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
+ # Ideographic Description Characters
+ print &ids_parse();
+ next;
+ }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
+ # CJK Unified Ideographs Extension B
+ if(not defined($ids{$char}) and $ids{$char}[1]>=0){
+ $ids{$char}[0]=$font_start;
+ $ids{$char}[1]=$ids_start;
+ $ids_start++;
+ if($ids_start>255){
+ $ids_start=0;
+ $font_start++;
}
- $ids="";
}
- }elsif($char_id>=0x2ff0 and $char_id<=0x2fff){
- ($ids,$ids_argc)=&ids_rest("",0,$char);
+ print "{\\fontencoding{OT1}\\fontfamily{" .
+ sprintf("chise%03d",$ids{$char}[0]) .
+ "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
next;
}else{
- if($char_id<=0xff){
- print $char;
- next;
- }
if(($out_char=&get_output_char($char_id,$out_cs))){
print $out_char,&add_break($i);
- }elsif($char_id >= 0x20000 && $char_id <=0x2a6df){
- unless(defined($ids{$char}) and $ids{$char}[1]>=0){
- $ids{$char}[0]=$font_start;
- $ids{$char}[1]=$ids_start;
- $ids_start++;
- if($ids_start>255){
- $ids_start=0;
- $font_start++;
- }
- }
- print "{\\fontencoding{OT1}\\fontfamily{" .
- sprintf("chise%03d",$ids{$char}[0]) .
- "}\\selectfont\\char$ids{$char}[1]}",&add_break($i);
- next;
}else{
- print &replace_ids(&get_ids($char)),&add_break($i);
+ print &get_macro_for_ids(&get_ids($char)),&add_break($i);
}
}
}
- if($ids_argc>0){
- print STDERR "IDS parse error: $ids\n";
-# print pack("U",0xfffd);
- print pack("U",0x3013) if($perl56);
- print encode('utf8',pack("U",0x3013)) if($perl58);
- $ids="";
- $ids_argc=0;
- }
}
open(IDSDATA,">$idsdata_file") or die;
$er=~s/^amp(.*);$/$1/;
# $er=~s/^&(.*);$/$1/;
$out=&de_er($er);
- if($out){
+ if($out and $out ne $er){
return $out;
}else{
return "amp$er;";
if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
and($chars[$i+2]=~m/[$strictly_forbidden_before]/o)){
return "\\CJKunbreakablekernone ";
- }elsif(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
+ }elsif($opt_protrude){
+ if(($chars[$i+1]=~m/[$strictly_forbidden_before]/o)
and($chars[$i+2]=~m/[^$strictly_forbidden_before]/o)){
- return "\\CJKunbreakablekernone \\CJKprotrude ";
+ return "\\CJKunbreakablekernone \\CJKprotrude ";
+ }
}
}
if(($i<$#chars)
}
}
+sub latin_parse{
+ # arguments: none
+ # return: string for output with TeX macro.
+ my($char_id);
+ my $out_str=$chars[$i];
+ $i++;
+ while($i<=$#chars){
+ $char_id=unpack("U",$chars[$i]);
+ if($char_id<=0xff){
+ $out_str.=pack("U",$char_id);
+ }else{
+ $i--;
+ last;
+ }
+ $i++;
+ }
+ return '{\fontfamily{\rmdefault}\selectfont {'.$out_str.'}}';
+}
+
+sub ids_parse{
+ # arguments: none
+ # return: character for output,
+ # TeX macro for ids,
+ # or GETA character if ids is invalid.
+ my($ids,$ids_argc)=&ids_rest("",0,$chars[$i]);
+
+ while($ids_argc>0){
+ # We are in IDS.
+ $i++;
+ if($i>$#chars){
+ print STDERR "IDS parse error: $ids\n";
+# return pack("U",0xfffd);
+ return pack("U",0x3013);
+ }
+
+ ($ids,$ids_argc)=&ids_rest($ids,$ids_argc,$chars[$i]);
+ if($ids_argc==0){
+ if(($char_id=&get_char_id_for_ids($ids))
+ and($out_char=&get_output_char($char_id,$out_cs))){
+ return $out_char;
+ }else{
+ return &get_macro_for_ids($ids) if($perl56);
+ return encode('utf8', &get_macro_for_ids($ids)) if($perl58);
+ }
+ }
+ }
+}
+
sub ids_rest{
+ # arguments: <ids>, <rest number of arguments for ids>, <character>
+ # return: ids and rest number of arguments for ids.
my($ids,$ids_argc,$char)=@_;
my($argc);
$argc=&ids_argc($char);
return ($ids,$ids_argc);
}
-sub replace_ids{
+sub get_macro_for_ids{
+ # argument: <ids>
+ # return: TeX macro for ids
+ # or GETA character if ids is invalid for KAGE.
my($ids)=@_;
$ids=&normalize_ids($ids,"UniJIS");
-# return pack("U",0xfffd) if($ids!~/[$idc]/);
+# return pack("U",0xfffd) if(($ids!~/[$idc]/)
return pack("U",0x3013) if(($ids!~/[$idc]/)
or($ids=~/[\x{10000}-]/));
#irregular for KAGE.
- unless(defined($ids{$ids}) and $ids{$ids}[1]>=0){
+ if(not defined($ids{$ids}) and $ids{$ids}[1]>=0){
$ids{$ids}[0]=$font_start;
$ids{$ids}[1]=$ids_start;
$ids_start++;
$ids_start=0;
$font_start++;
}
- return "{\\fontencoding{OT1}\\fontfamily{".sprintf("chise%03d",$ids{$ids}[0])."}\\selectfont\\char$ids{$ids}[1]}";
+ return "{\\fontencoding{OT1}\\fontfamily{"
+ .sprintf("chise%03d",$ids{$ids}[0])
+ ."}\\selectfont\\char$ids{$ids}[1]}";
}
sub normalize_ids{
+ # argument: <ids>, <output coding system>
+ # return: ids or GETA character if ids is invalid for KAGE.
my($ids,$out_cs)=@_;
$ids = decode('utf8', $ids) if $perl58;
- $out_cs=~s/Uni(.+)/"ucs-".lc($1)/e;
+ $out_cs=~s/Uni(.+)/'ucs@'.lc($1)/e;
+
my $output_ids="";
my($char,$char_id,$output_char_id);
while($ids=~m/(.)/g){
$char_id=unpack("U",$char);
if($char=~/[$idc]/){
$output_ids.=$char;
- }elsif($output_char_id=&get_char_attribute($char,$out_cs)){
+ }elsif($output_char_id=&get_char_attribute($char,"=$out_cs")){
$output_ids.=pack("U",$output_char_id);
}elsif($output_char_id=&get_char_attribute($char,"=ucs")){
$output_ids.=pack("U",$output_char_id);
- }elsif($output_char_id=&get_char_attribute($char,"ucs")){
- $output_ids.=pack("U",$output_char_id);
}else{
- return pack("U",0xfffe);
+# return pack("U",0xfffd);
+ return pack("U",0x3013);
}
}
return $output_ids;
}
sub get_output_char{
+ # argument: <char-id>
+ # return: character in output coding system or TeX macro.
my($char_id,$out_cs)=@_;
- my($out_char_id,$suffix);
+ my($char,$out_char_id,$suffix);
my($gt,$hzk,$cdp);
- if(not defined($cmap_to{$out_cs})){
- &get_cmap($out_cs);
- }
- if($out_char_id=$cmap_to{$out_cs}->{$char_id}){
- return pack("U",$out_char_id);
- }else{
- return $gt if($useGT and $gt=&get_macro_for_GT($char_id));
- return $hzk if($useHZK and $hzk=&get_macro_for_HZK($char_id));
- return $cdp if($useCDP and $cdp=&get_macro_for_CDP($char_id));
- return undef;
- }
-}
-sub get_cmap{
- my($out_cs)=@_;
- tie %{$cmap_to{$out_cs}}, "BerkeleyDB::Hash",
- -Filename => "$omegadb_home/$out_cs" or die $!;
+ $char=pack('U',$char_id);
+
+ if($out_cs eq 'UniJIS'
+ and &get_char_attribute($char,"adobe-unijis-utf16-h")){
+ if($out_char_id=&get_char_attribute($char,'=ucs@jis')
+ or $out_char_id=&get_char_attribute($char,'=ucs')
+ or $out_char_id=&get_char_attribute($char,'=>ucs@jis')
+ or $out_char_id=&get_char_attribute($char,'=>ucs')
+ ){
+ return pack("U",$out_char_id);
+ }
+ }elsif($out_cs eq 'UniGB'
+ and &get_char_attribute($char,"adobe-unigb-ucs2-h")){
+ if($out_char_id=&get_char_attribute($char,'=ucs@gb')
+ or $out_char_id=&get_char_attribute($char,'=ucs')
+ or $out_char_id=&get_char_attribute($char,'=>ucs@gb')
+ or $out_char_id=&get_char_attribute($char,'=>ucs')
+ ){
+ return pack("U",$out_char_id);
+ }
+ }elsif($out_cs eq 'UniCNS'
+ and &get_char_attribute($char,"adobe-unicns-ucs2-h")){
+ if($out_char_id=&get_char_attribute($char,'=ucs@cns')
+ or $out_char_id=&get_char_attribute($char,'=ucs')
+ or $out_char_id=&get_char_attribute($char,'=>ucs@cns')
+ or $out_char_id=&get_char_attribute($char,'=>ucs')
+ ){
+ return pack("U",$out_char_id);
+ }
+ }elsif($out_cs eq 'UniKS'
+ and &get_char_attribute($char,"adobe-uniks-ucs2-h")){
+ if($out_char_id=&get_char_attribute($char,'=ucs@ks')
+ or $out_char_id=&get_char_attribute($char,'=ucs')
+ or $out_char_id=&get_char_attribute($char,'=>ucs@ks')
+ or $out_char_id=&get_char_attribute($char,'=>ucs')
+ ){
+ return pack("U",$out_char_id);
+ }
+ }elsif($out_cs eq 'UniMulti'){
+ foreach $out_cs (@order){
+
+ if($out_cs eq 'UniJIS'
+ and &get_char_attribute($char,"adobe-unijis-utf16-h")){
+ if($out_char_id=&get_char_attribute($char,'=ucs@jis')
+ or $out_char_id=&get_char_attribute($char,'=ucs')
+ or $out_char_id=&get_char_attribute($char,'=>ucs@jis')
+ or $out_char_id=&get_char_attribute($char,'=>ucs')
+ ){
+ return '{\selectjisfont\char'.$out_char_id.'}';
+ }
+ }elsif($out_cs eq 'UniGB'
+ and &get_char_attribute($char,"adobe-unigb-ucs2-h")){
+ if($out_char_id=&get_char_attribute($char,'=ucs@gb')
+ or $out_char_id=&get_char_attribute($char,'=ucs')
+ or $out_char_id=&get_char_attribute($char,'=>ucs@gb')
+ or $out_char_id=&get_char_attribute($char,'=>ucs')
+ ){
+ return '{\selectgbsfont\char'.$out_char_id.'}';
+ }
+ }elsif($out_cs eq 'UniCNS'
+ and &get_char_attribute($char,"adobe-unicns-ucs2-h")){
+ if($out_char_id=&get_char_attribute($char,'=ucs@cns')
+ or $out_char_id=&get_char_attribute($char,'=ucs')
+ or $out_char_id=&get_char_attribute($char,'=>ucs@cns')
+ or $out_char_id=&get_char_attribute($char,'=>ucs')
+ ){
+ return '{\selectcnsfont\char'.$out_char_id.'}';
+ }
+ }elsif($out_cs eq 'UniKS'
+ and &get_char_attribute($char,"adobe-uniks-ucs2-h")){
+ if($out_char_id=&get_char_attribute($char,'=ucs@ks')
+ or $out_char_id=&get_char_attribute($char,'=ucs')
+ or $out_char_id=&get_char_attribute($char,'=>ucs@ks')
+ or $out_char_id=&get_char_attribute($char,'=>ucs')
+ ){
+ return '{\selectksxfont\char'.$out_char_id.'}';
+ }
+ }elsif($out_cs eq 'GT'){
+ return $gt if($gt=&get_macro_for_GT($char_id));
+ }elsif($out_cs eq 'HZK'){
+ return $hzk if($hzk=&get_macro_for_HZK($char_id));
+ }elsif($out_cs eq 'CDP'){
+ return $cdp if($cdp=&get_macro_for_CDP($char_id));
+ }
+ }
+ }
+ return undef;
}
sub get_ids{
+ # argument: <character>
+ # return: ids
my($char)=@_;
my $ids="";
$ids=&get_char_attribute($char,"ids-aggregated")
}
sub get_char_id_for_ids{
+ # argument: <ideographic description sequence>
+ # return: char-id
my($ids)=@_;
- my($char_id,$char);
+ my($output_char);
$ids=decode('utf8', $ids) if($perl58);
-# $ids="(?".(join " ?",(split(//,$ids))).")";
- &get_idsdb if(not defined($idsdb));
- $char=$idsdb->{$ids};
- $char=decode('utf8',$char) if($perl58);
- if($char){
- return unpack("U",$char);
+
+ if(($output_char)=&get_chars_matching("ids",$ids)){
+ return unpack("U",$output_char);
}else{
return undef;
}
}
-sub get_idsdb{
- tie %{$idsdb}, "BerkeleyDB::Hash",
- -Filename => "$omegadb_home/idsdb" or die $!;
-}
+sub get_char_id{
+ # argument: <char-id>, <input coding system>
+ # return: char-id.
+ my($char_id,$in_cs)=@_;
+ my($output_char);
+
+ return $char_id if($in_cs eq 'ucs@mcs');
-sub get_char_in_utf8mcs_bak{
- my($char,$in_cs)=@_;
- return $char if($in_cs eq "Utf8mcs");
- my($char_id,$output_char);
- $in_cs=~s/Utf8/ucs-/;
- $char_id=unpack("U",$char);
- if(($output_char)=&get_chars_matching("$in_cs",$char_id)){
- $output_char=decode('utf8', $output_char) if($perl58);
- return $output_char;
+ if(($output_char)=&get_chars_matching("=$in_cs",$char_id)){
+ return unpack("U",$output_char);
}else{
- return $char;
+ return $char_id;
}
}
-sub get_char_in_utf8mcs{
- # argument: <character>, <input coding system>
- # return: character in UTF-8mcs.
- my($char,$in_cs)=@_;
- my($char_id,$output_char_id);
- return $char if($in_cs eq "Utf8mcs");
- $char_id=unpack("U",$char);
- &get_utf8mcs_map($in_cs) if(not defined($utf8mcs_map_from{$in_cs}));
- if($output_char_id=$utf8mcs_map_from{$in_cs}->{$char_id}){
- return pack("U",$output_char_id);
- }else{
- return $char;
- }
-}
-
-sub get_utf8mcs_map{
- my($in_cs)=@_;
- my($suffix);
- ($suffix=$in_cs)=~s/^Utf8//;
- tie %{$utf8mcs_map_from{$in_cs}}, "BerkeleyDB::Hash",
- -Filename => "$omegadb_home/ucs-$suffix" or die $!;
-}
-
sub get_macro_for_GT{
+ # argument: <char-id>
+ # return: TeX macro for GT fonts.
my($char_id)=@_;
my($char,$gt,$GT);
$char=pack("U",$char_id);
}
}
if($gt){
- return "{\\fontencoding{OT1}\\fontfamily{".sprintf("gt%02d",$GT)."}\\selectfont\\char".($gt|0x8080)."}";
-# return "\\GT{".sprintf("gt%02d",$GT)."}{\\char".($gt|0x8080)."}";
+ return "{\\fontencoding{OT1}\\fontfamily{"
+ .sprintf("gt%02d",$GT)
+ ."}\\selectfont\\char".($gt|0x8080)."}";
}else{
return undef;
}