1 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
# Convert a character designator +c+ into its integer code number.
#
# NOTE(review): this excerpt omits several lines of the method (the branch
# conditions that select the String and Numeric paths, the value returned for
# a Numeric, and the closing `end`). The comments below describe only the
# statements that are visible.
#
# Raises RuntimeError with "c is nil", "unknown format" or "unknown object".
5 def parse(c) # parse a value and return a number (MCS)
# Reject nil up front so the String calls below never run on it.
6 raise "c is nil" if c.nil?
# Drop a single leading "?" (only the first match at the very start is removed).
10 c = c.sub(/\A\?/, "") # remove "?" in the head
# u8tou32 and u32to_i are not defined in the visible source; per the original
# comments they convert UTF-8 to UTF-32 and then to the code number.
# Presumably these two lines are guarded by a condition on a hidden line,
# otherwise the statements after the `return` could never run -- TODO confirm.
11 u4 = c.u8tou32 # translate from UTF-8 to UTF-32
12 return u4.u32to_i # translate UTF-32 to UCS number
# A string that is_er? accepts is delegated to parse_er.
15 return parse_er(c) if is_er?(c) # ER?
# A string of decimal digits is returned as an Integer.
# NOTE(review): ^ and $ anchor at line boundaries, so a multi-line string with
# one all-digit line also passes this test; \A and \z would anchor the whole
# string.
17 return c.to_i if /^\d+$/ =~ c # only numbers?
# Presumably reached when none of the String formats above applied.
19 raise "unknown format"
# Numeric input: a negative number has 0x80000000 added to it.
22 if c.kind_of?(Numeric)
23 c = 0x80000000 + c if c < 0 # negative value
# Presumably reached when c is neither of the handled kinds -- TODO confirm.
27 raise "unknown object"
# Regexp source for one entity reference: "&", a captured name made of
# letters, digits, "-", "+" and "#", then ";".
30 PART = "&([-+0-9A-Za-z#]+);"
# Unanchored form of PART.
32 PART_RE = Regexp.new(PART)
# ALL is defined on a line that is not visible in this excerpt. parse_er reads
# $1 after matching ALL_RE, so ALL has at least one capture group; presumably
# it is PART anchored to the whole string -- TODO confirm.
33 ALL_RE = Regexp.new(ALL)
# Tell whether +s+ contains an entity reference anywhere inside it.
#
# s:: the String to scan (nil is accepted and yields false).
#
# Returns true when PART_RE matches somewhere in +s+, false otherwise.
# A match at offset 0 counts as a match: the test is on nil, not on
# truthiness of the offset.
def contain_er?(s)
  !(PART_RE =~ s).nil?
end
# Tell whether ALL_RE matches +s+.
#
# s:: the String to test (nil is accepted and yields false).
#
# Returns true when ALL_RE matches, false otherwise. ALL_RE is built from a
# pattern defined elsewhere in this file; judging by how parse_er uses it,
# it accepts a string that is one entity reference -- confirm against the
# definition of ALL.
#
# The match data produced here is local to this method, so callers that need
# the captured name must match ALL_RE themselves.
def is_er?(s)
  !(ALL_RE =~ s).nil?
end
38 # the order is important. The primary charset should be selectable.
# Each row is %w(codesys er_prefix keta numtype), matching the block
# parameters used where parse_er iterates CODESYS_TABLE:
#   codesys   - coded character set name passed to CodesysDB.instance.get
#   er_prefix - prefix of the entity reference name (e.g. "J90-")
#   keta      - exact number of characters that must follow the prefix
#               ("keta" is Japanese for "digits")
#   numtype   - digit-class selector; presumably "X" means hexadecimal and
#               "d" means decimal -- TODO confirm against parse_er
# NOTE(review): the line that opens this array literal is not visible in
# this excerpt.
40 %w( =jis-x0208-1990 J90- 4 X),
41 %w( =jis-x0208-1983 J83- 4 X),
42 %w( =jis-x0208-1978 J78- 4 X),
43 %w( =jis-x0208 J90- 4 X), # ad hoc implementation of inheritance
44 %w( =jis-x0208 J83- 4 X), # ad hoc implementation of inheritance
45 %w( =jis-x0208 J78- 4 X), # ad hoc implementation of inheritance
# Further rows in the same four-column layout as the rows above.
46 %w( =jis-x0213-1-2000 JX1- 4 X),
47 %w( =jis-x0213-2-2000 JX2- 4 X),
48 %w( =jis-x0212 JSP- 4 X),
49 %w( =big5-cdp CDP- 4 X),
50 %w( =cns11643-1 C1- 4 X),
51 %w( =cns11643-2 C2- 4 X),
52 %w( =cns11643-3 C3- 4 X),
53 %w( =cns11643-4 C4- 4 X),
54 %w( =cns11643-5 C5- 4 X),
55 %w( =cns11643-6 C6- 4 X),
56 %w( =cns11643-7 C7- 4 X),
57 %w( =ks-x1001 K0- 4 X),
# Among the visible rows this is the only one with 5 characters after the
# prefix and with "d" in the last column instead of "X".
58 %w( =daikanwa M- 5 d),
# Base value that parse_er adds to the decimal number of a "my-NNN" entity
# reference. 0xE000 is the first code point of the Unicode Private Use Area
# in the Basic Multilingual Plane.
63 PRIVATE_USE_AREA = 0xe000
# Convert an entity reference string +s+ into its integer code number.
#
# The name captured from the reference is tried against the formats below in
# order: "MCS-" hex, "U"/"U+"/"U-"/"#x" hex, "#" decimal, "my-" decimal,
# "I-", and finally the prefixes listed in CODESYS_TABLE.
#
# Raises RuntimeError "wrong ER." when ALL_RE does not match +s+, and
# "unknown Entity Reference" (presumably when no format applied).
#
# NOTE(review): this excerpt omits many lines of the method (the "I-"
# handling, the assignments of `nre` and `code`, the end of the table loop
# and the closing `end`). The comments describe only the visible statements.
65 def parse_er(s) # parse a Entity Reference and return a number (MCS)
# ALL_RE is matched here rather than through is_er? because $1 is local to
# the method that ran the match; a match made inside is_er? would not be
# readable from this method.
66 raise "wrong ER." unless ALL_RE =~ s # don't use is_er? for getting $1.
# From here on s is the text captured by the first group of ALL_RE.
68 s = $1 # extract the part of ER
# "MCS-" followed by hex digits: the digits are the result.
70 return $1.hex if s =~ /\AMCS-([0-9A-Fa-f]+)\Z/ # MCS. It's a mystery.
# "U", "U+" or "U-" followed by hex digits, or "#x" followed by hex digits.
# $1 always comes from whichever alternative matched: || stops at the first
# successful match, and a failed first match is replaced by the second.
72 return $1.hex if s =~ /\AU[-+]?([0-9A-Fa-f]+)\Z/ ||
73 s =~ /\A#x([0-9A-Fa-f]+)\Z/ # Unicode code point in Hex.
# "#" followed by decimal digits.
75 return $1.to_i if s =~ /\A#([0-9]+)\Z/ # Unicode code point in Decimal.
# "my-" followed by decimal digits: the number is offset by PRIVATE_USE_AREA.
77 if s =~ /\Amy-([0-9]+)\Z/ # my own code point. It's a secret.
78 return PRIVATE_USE_AREA + $1.to_i # private use area of Unicode.
# Names starting with "I-"; the handling itself is on lines not shown here.
81 if s =~ /\AI-/ # I- stands for Isolated character. It's a wonder.
# Try each character-set row in table order.
85 CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype|
# Whole-name pattern: the row's prefix followed by exactly `keta` characters
# of class `nre`. `nre` is assigned on a hidden line, presumably from
# `numtype` -- TODO confirm.
94 re = "\\A#{er_prefix}(#{nre}{#{keta},#{keta}})\\Z"
# Rows whose pattern does not match are skipped.
95 next unless Regexp.new(re) =~ s
# Look the code up in the CodesysDB singleton. `code` is assigned on a hidden
# line; the name `u8` suggests a UTF-8 string comes back -- TODO confirm.
104 u8 = CodesysDB.instance.get(codesys, code)
# Presumably reached only when none of the formats above produced a result.
113 raise "unknown Entity Reference"