1 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
3 require "chise/chisedb"
7 PART = "&([-+0-9A-Za-z#]+);" # pattern source for one entity reference: "&", a captured run of [-+0-9A-Za-z#], then ";"
9 PART_RE = Regexp.new(PART) # compiled PART; it has no anchors, so it can match inside a longer string
10 ALL_RE = Regexp.new(ALL) # compiled ALL (ALL is defined on a line not visible here)
12 def contain_er?(s) (PART_RE =~ s) != nil; end # true when PART_RE matches somewhere in s
13 def is_er?(s) (ALL_RE =~ s) != nil; end # true when ALL_RE matches s (the ALL pattern itself is not visible here)
15 # the order is important. The primary charset should be selectable.
17 %w( =jis-x0208-1990 J90- 4 X),
18 %w( =jis-x0208-1983 J83- 4 X),
19 %w( =jis-x0208-1978 J78- 4 X),
20 %w( =jis-x0208 J90- 4 X), # ad hoc implementation of inheritance
21 %w( =jis-x0208 J83- 4 X), # ad hoc implementation of inheritance
22 %w( =jis-x0208 J78- 4 X), # ad hoc implementation of inheritance
23 %w( =jis-x0213-1-2000 JX1- 4 X),
24 %w( =jis-x0213-2-2000 JX2- 4 X),
25 %w( =jis-x0212 JSP- 4 X),
26 %w( =big5-cdp CDP- 4 X),
27 %w( =cns11643-1 C1- 4 X),
28 %w( =cns11643-2 C2- 4 X),
29 %w( =cns11643-3 C3- 4 X),
30 %w( =cns11643-4 C4- 4 X),
31 %w( =cns11643-5 C5- 4 X),
32 %w( =cns11643-6 C6- 4 X),
33 %w( =cns11643-7 C7- 4 X),
34 %w( =ks-x1001 K0- 4 X),
35 %w( =daikanwa M- 5 d),
40 PRIVATE_USE_AREA = 0xe000 # base added to "my-NNN" code points in parse_er (U+E000, in the Unicode private use area)
44 include EntityReference
46 def parse(c) # parse a value and return a number (MCS); raises when c is nil
47 raise "c is nil" if c.nil?
51 c = c.sub(/\A\?/, "") # remove "?" in the head
52 u4 = c.u8tou32 # translate from UTF-8 to UTF-32
53 return u4.u32to_i # translate UTF-32 to UCS number
56 return parse_er(c) if is_er?(c) # ER? then delegate to parse_er
58 return c.to_i if /^\d+$/ =~ c # only numbers? NOTE(review): ^ and $ match per line in Ruby, so a multi-line string with one digits-only line also matches; parse_er uses \A...\Z instead -- confirm which is intended
60 raise "unknown format" # the conditions guarding this raise are on lines not visible here
63 if c.kind_of?(Numeric)
64 c = 0x80000000 + c if c < 0 # negative value: shift it up by 0x80000000
68 raise "unknown object" # the conditions guarding this raise are on lines not visible here
71 def parse_er(s) # parse an Entity Reference and return a number (MCS)
72 raise "wrong ER." unless ALL_RE =~ s # don't use is_er? here: the match has to run in this method so that $1 is set.
74 s = $1 # extract the part of ER captured by ALL_RE
76 return $1.hex if s =~ /\AMCS-([0-9A-Fa-f]+)\Z/ # MCS. It's a mystery.
78 return $1.hex if s =~ /\AU[-+]?([0-9A-Fa-f]+)\Z/ ||
79 s =~ /\A#x([0-9A-Fa-f]+)\Z/ # Unicode code point in Hex ("U+XXXX", "U-XXXX", "UXXXX" or "#xXXXX").
81 return $1.to_i if s =~ /\A#([0-9]+)\Z/ # Unicode code point in Decimal.
83 if s =~ /\Amy-([0-9]+)\Z/ # my own code point. It's a secret.
84 return PRIVATE_USE_AREA + $1.to_i # private use area of Unicode.
87 if s =~ /\AI-/ # I- stands for Isolated character. It's a wonder.
91 CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype| # try each table row in table order
100 re = "\\A#{er_prefix}(#{nre}{#{keta},#{keta}})\\Z" # er_prefix followed by exactly keta characters of class nre (nre is set on a line not visible here)
101 next unless Regexp.new(re) =~ s # skip rows whose pattern does not match
110 u8 = get_ccs(codesys, code) # look the code up in this coded character set (code is set on a line not visible here)
119 raise "unknown Entity Reference"
123 def get_ccs(ccs, code_point) # return ChiseDB.instance.decode_char(ccs, code_point)
124 cd = ChiseDB.instance
125 cd.decode_char(ccs, code_point)
129 class EntityReferenceParser
130 include EntityReference
132 def de_er(s) # replace EntityReference with corresponding character.
133 return s unless PART_RE =~ s # nothing to replace; don't use contain_er? here, the match must run in this method to get $1
136 char = Character.get(er) # er is assigned on a line not visible here
137 ss = s.sub(Regexp.new(Regexp.escape(er)), char.utf8_mcs) # replace the first literal occurrence of er with char.utf8_mcs
139 return de_er(ss) if contain_er?(ss) # recursive: repeat while PART_RE still matches the result
144 class EntityReferenceEncoder
145 include EntityReference
149 return "&#x%04x;" % cid if cid <= 0xffff # hex reference zero-padded to 4 lowercase digits
150 return "&#x%05x;" % cid if cid <= 0xfffff # hex reference zero-padded to 5 lowercase digits; NOTE(review): 0x100000..0x10ffff is not handled by this branch -- confirm that is intended
152 CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype| # otherwise walk the table rows in table order
155 return "&#{er_prefix}%0#{keta}#{numtype};" % code # er_prefix, then code formatted with %0<keta><numtype> (code is set on a line not visible here)
158 "&MCS-%08X;" % cid # the last answer: 8-digit uppercase hex MCS- reference
161 def to_er_by_ccs(cid, codesys) # not yet