1 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
5 def parse(c) # parse a value and return a number
6 return 0 if c.nil? # what? should raise exception?
9 c = 0x80000000 + c if c < 0 # negative value
13 raise "unknown object" unless c.kind_of?(String)
15 return c.to_i if /^\d+$/ =~ c # only numbers?
17 return parse_er(c) if is_er?(c) # ER?
19 c = c.sub(/\A\?/, "") if /\A\?/ =~ c # remove a "?" in the head
21 u4 = c.u8tou32 # translate from UTF-8 to UTF-32
22 u4.u32to_i # translate UTF-32 to UCS number
25 PART = "&([-+0-9A-Za-z#]+);" # regexp source for one reference: an ampersand, a captured run of [-+0-9A-Za-z#], then a semicolon
27 PART_RE = Regexp.new(PART) # compiled PART; it has no anchors, so it can match anywhere inside a string
28 ALL_RE = Regexp.new(ALL) # compiled ALL; ALL is defined outside this excerpt - presumably the whole-string form of PART (TODO confirm)
# Reports whether PART_RE matches somewhere in +s+.
# Always returns true or false, never the match offset.
def contain_er?(s)
  !(PART_RE =~ s).nil?
end
# Reports whether ALL_RE matches +s+.
# Always returns true or false, never the match offset.
def is_er?(s)
  !(ALL_RE =~ s).nil?
end
33 # the order is important. The primary charset should be selectable.
35 %w( =jis-x0208-1990 J90- 4 X),
36 %w( =jis-x0208-1983 J83- 4 X),
37 %w( =jis-x0208-1978 J78- 4 X),
38 %w( =jis-x0208 J90- 4 X), # ad hoc implementation of inheritance
39 %w( =jis-x0208 J83- 4 X), # ad hoc implementation of inheritance
40 %w( =jis-x0208 J78- 4 X), # ad hoc implementation of inheritance
41 %w( =jis-x0213-1-2000 JX1- 4 X),
42 %w( =jis-x0213-2-2000 JX2- 4 X),
43 %w( =jis-x0212 JSP- 4 X),
44 %w( =big5-cdp CDP- 4 X),
45 %w( =cns11643-1 C1- 4 X),
46 %w( =cns11643-2 C2- 4 X),
47 %w( =cns11643-3 C3- 4 X),
48 %w( =cns11643-4 C4- 4 X),
49 %w( =cns11643-5 C5- 4 X),
50 %w( =cns11643-6 C6- 4 X),
51 %w( =cns11643-7 C7- 4 X),
52 %w( =ks-x1001 K0- 4 X),
53 %w( =daikanwa M- 5 d),
58 PRIVATE_USE_AREA = 0xe000 # base value that parse_er adds to N when resolving a "my-N" reference
60 def parse_er(s) # parse a ER and return a number (FIXNUM)
61 unless ALL_RE =~ s # I do not use is_er? to get $1.
65 s = $1 # extract the part of ER
67 if s =~ /\AMCS-([0-9A-Fa-f]+)\Z/ # MCS. It's a mystery.
71 if s =~ /\AU[-+]?([0-9A-Fa-f]+)\Z/ ||
72 s =~ /\A#x([0-9A-Fa-f]+)\Z/ # Unicode code point in Hex.
76 if s =~ /\A#([0-9]+)\Z/ # Unicode code point in Decimal.
80 if s =~ /\Amy-([0-9]+)\Z/ # my own code point. It's a secret.
81 return PRIVATE_USE_AREA + $1.to_i # private use area of Unicode.
84 if s =~ /\AI-/ # I- stands for Isolated character. It's a wonder.
88 CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype|
97 re = "\\A#{er_prefix}(#{nre}{#{keta},#{keta}})\\Z"
98 next unless Regexp.new(re) =~ s
107 u8 = CodesysDB.instance.get(codesys, code)
116 raise "unknown Entity Reference"