1 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
3 require "chise/chisedb"
# Regular expressions (and, below, predicates) for recognising entity
# references; mixed into the classes that include this module.
7 module EntityReferenceModule
# Pattern source for one entity reference: "&", then a name made of ASCII
# letters, digits, "-", "+" or "#", then ";". The name is capture group 1.
8 PART = "&([-+0-9A-Za-z#]+);"
# Compiled form of PART.
10 PART_RE = Regexp.new(PART)
# Compiled form of ALL; parse_er reads the entity reference name from $1
# after matching against it.
11 ALL_RE = Regexp.new(ALL)
# Does +s+ hold at least one entity reference (i.e. does PART_RE match)?
13 def contain_er?(s) !(PART_RE =~ s).nil?; end
# Does +s+ match ALL_RE?
14 def is_er?(s) !(ALL_RE =~ s).nil?; end
16 # the order is important. The primary charset should be selectable.
18 %w( =jis-x0208-1990 J90- 4 X),
19 %w( =jis-x0208-1983 J83- 4 X),
20 %w( =jis-x0208-1978 J78- 4 X),
21 %w( =jis-x0208 J90- 4 X), # ad hoc implementation of inheritance
22 %w( =jis-x0208 J83- 4 X), # ad hoc implementation of inheritance
23 %w( =jis-x0208 J78- 4 X), # ad hoc implementation of inheritance
24 %w( =jis-x0213-1-2000 JX1- 4 X),
25 %w( =jis-x0213-2-2000 JX2- 4 X),
26 %w( =jis-x0212 JSP- 4 X),
27 %w( =big5-cdp CDP- 4 X),
29 %w( =cns11643-1 C1- 4 X),
30 %w( =cns11643-2 C2- 4 X),
31 %w( =cns11643-3 C3- 4 X),
32 %w( =cns11643-4 C4- 4 X),
33 %w( =cns11643-5 C5- 4 X),
34 %w( =cns11643-6 C6- 4 X),
35 %w( =cns11643-7 C7- 4 X),
36 %w( =ks-x1001 K0- 4 X),
37 %w( =daikanwa M- 5 d),
41 %w( =hanziku-1 HZK01- 4 X),
42 %w( =hanziku-2 HZK02- 4 X),
43 %w( =hanziku-3 HZK03- 4 X),
44 %w( =hanziku-4 HZK04- 4 X),
45 %w( =hanziku-5 HZK05- 4 X),
46 %w( =hanziku-6 HZK06- 4 X),
47 %w( =hanziku-7 HZK07- 4 X),
48 %w( =hanziku-8 HZK08- 4 X),
49 %w( =hanziku-9 HZK09- 4 X),
50 %w( =hanziku-10 HZK10- 4 X),
51 %w( =hanziku-11 HZK11- 4 X),
52 %w( =hanziku-12 HZK12- 4 X),
53 %w( =ruimoku-v6 RUI6- 4 X),
54 %w( =jef-china3 JC3- 4 X),
59 include EntityReferenceModule
62 PRIVATE_USE_AREA = 0xe000
# Convert the character specification +c+ into its MCS code number.
# Raises a RuntimeError ("c is nil", "unknown format" or "unknown object")
# when +c+ cannot be interpreted.
64 def parse(c) # parse a value and return a number (MCS)
65 raise "c is nil" if c.nil?
69 c = c.sub(/\A\?/, "") # remove "?" in the head
70 #u4 = c.u8tou32 # translate from UTF-8 to UTF-32
71 #return u4.u32to_i # translate UTF-32 to UCS number
# A string that is a whole entity reference is handed to parse_er.
75 return parse_er(c) if is_er?(c) # ER?
# NOTE(review): ^ and $ are line anchors in Ruby, so a multi-line string with
# any all-digit line passes this test; \A and \z would anchor to the whole
# string. Confirm whether multi-line input can reach this point.
77 return c.to_i if /^\d+$/ =~ c # only numbers?
79 raise "unknown format"
82 if c.kind_of?(Numeric)
# A negative number has 0x80000000 added to it.
83 c = 0x80000000 + c if c < 0 # negative value
87 raise "unknown object"
# Convert the entity reference +s+ into its MCS code number.
# Raises "wrong ER." when +s+ does not match ALL_RE, and
# "unknown Entity Reference" when no recognised form applies.
90 def parse_er(s) # parse an Entity Reference and return a number (MCS)
91 raise "wrong ER." unless ALL_RE =~ s # don't use is_er? for getting $1.
# From here on +s+ is only the name captured by ALL_RE.
93 s = $1 # extract the part of ER
95 return $1.hex if s =~ /\AMCS-([0-9A-Fa-f]+)\Z/ # MCS. It's a mystery.
# "U+hhhh", "U-hhhh", "Uhhhh" and "#xhhhh" are all read as hexadecimal;
# $1 comes from whichever of the two patterns matched.
97 return $1.hex if s =~ /\AU[-+]?([0-9A-Fa-f]+)\Z/ ||
98 s =~ /\A#x([0-9A-Fa-f]+)\Z/ # Unicode code point in Hex.
100 return $1.to_i if s =~ /\A#([0-9]+)\Z/ # Unicode code point in Decimal.
102 if s =~ /\Amy-([0-9]+)\Z/ # my own code point. It's a secret.
# The decimal number after "my-" is an offset from PRIVATE_USE_AREA.
103 return PRIVATE_USE_AREA + $1.to_i # private use area of Unicode.
106 if s =~ /\AI-/ # I- stands for Isolated character. It's a wonder.
# Drop the leading "I-" before trying the CCS prefixes.
107 s = s.sub(/\AI-/, "")
# Try each CCS_TABLE row in order.
110 CCS_TABLE.each {|ccs, er_prefix, keta, numtype|
# Anchored pattern: the row's prefix followed by exactly +keta+ repetitions
# of the character class held in +nre+.
119 re = "\\A#{er_prefix}(#{nre}{#{keta},#{keta}})\\Z"
120 next unless Regexp.new(re) =~ s
# Look the code point up in the row's coded character set.
130 u8 = get_ccs(ccs, code)
140 raise "unknown Entity Reference"
# Look up +code_point+ of the coded character set +ccs+ by calling
# decode_char on the object returned by ChiseDB.instance.
# NOTE(review): parse_er stores the result in a variable named u8, so this
# presumably yields a UTF-8 string -- confirm against ChiseDB#decode_char.
144 def get_ccs(ccs, code_point)
145 cd = ChiseDB.instance
146 cd.decode_char(ccs, code_point)
# Replaces entity references embedded in a string with characters.
150 class EntityReferenceParser
151 include EntityReferenceModule
# Returns +s+ unchanged when it contains no entity reference; otherwise
# substitutes one reference per call and recurses while any remain.
153 def de_er(s) # replace EntityReference with corresponding character.
154 return s unless PART_RE =~ s # don't use contain_er? to get $1
# Resolve the reference to a Character object.
157 char = Character.get(er)
# String#sub replaces only the first occurrence; +er+ is escaped so that it is
# matched literally.
158 ss = s.sub(Regexp.new(Regexp.escape(er)), char.utf8_mcs)
160 return de_er(ss) if contain_er?(ss) # recursive
# Produces an entity reference string for a character id (+cid+).
165 class EntityReferenceEncoder
166 include EntityReferenceModule
# Ids up to 0xffff become "&#x" plus at least 4 lowercase hex digits; ids up
# to 0xfffff get at least 5.
170 return "&#x%04x;" % cid if cid <= 0xffff
171 return "&#x%05x;" % cid if cid <= 0xfffff
# Otherwise try the CCS_TABLE rows in order.
173 CCS_TABLE.each {|ccs, er_prefix, keta, numtype|
# Row prefix followed by +code+, zero-padded to +keta+ digits in the row's
# number format (+numtype+ is used as the format conversion character).
176 return "&#{er_prefix}%0#{keta}#{numtype};" % code
# Fallback: "MCS-" plus the id as at least 8 uppercase hex digits.
179 "&MCS-%08X;" % cid # the last answer
182 def to_er_by_ccs(cid, ccs) # not yet