# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-module CHISE
- class CharacterParser
- def parse(c) # parse a value and return a number (MCS)
- raise "c is nil" if c.nil?
-
- if c.kind_of?(String)
- if /\A\?/ =~ c
- c = c.sub(/\A\?/, "") # remove "?" in the head
- u4 = c.u8tou32 # translate from UTF-8 to UTF-32
- return u4.u32to_i # translate UTF-32 to UCS number
- end
-
- return parse_er(c) if is_er?(c) # ER?
-
- return c.to_i if /^\d+$/ =~ c # only numbers?
-
- raise "unknown format"
- end
-
- if c.kind_of?(Numeric)
- c = 0x80000000 + c if c < 0 # negative value
- return c.to_i
- end
-
- raise "unknown object"
- end
+require "chise/chisedb"
+module CHISE
+ module EntityReference # mixin: ER patterns, predicates and tables shared by the classes that include it
PART = "&([-+0-9A-Za-z#]+);"
- ALL = "\\A#{PART}\\Z"
+ ALL = '\A'+PART+'\Z' # PART anchored to the whole string (\Z also accepts one trailing newline)
PART_RE = Regexp.new(PART)
ALL_RE = Regexp.new(ALL)
def contain_er?(s) (PART_RE =~ s) != nil; end
- def is_er?(s) (ALL_RE =~ s) != nil; end
+ def is_er?(s) (ALL_RE =~ s) != nil; end # true when s as a whole matches ALL_RE
# the order is important. The primary charset should be selectable.
CODESYS_TABLE = [
%w( =gt-k GT-K 5 d),
]
PRIVATE_USE_AREA = 0xe000
+ end # module EntityReference
+
+ class CharacterParser
+ include EntityReference
+
+ def parse(c) # parse a value and return a number (MCS)
+ raise "c is nil" if c.nil?
+
+ if c.kind_of?(String)
+ if /\A\?/ =~ c
+ c = c.sub(/\A\?/, "") # remove "?" in the head
+ u4 = c.u8tou32 # translate from UTF-8 to UTF-32
+ return u4.u32to_i # translate UTF-32 to UCS number
+ end
+
+ return parse_er(c) if is_er?(c) # ER?
+
+ return c.to_i if /^\d+$/ =~ c # only numbers?
+
+ raise "unknown format"
+ end
+
+ if c.kind_of?(Numeric)
+ c = 0x80000000 + c if c < 0 # negative value
+ return c.to_i
+ end
+
+ raise "unknown object"
+ end
def parse_er(s) # parse a Entity Reference and return a number (MCS)
raise "wrong ER." unless ALL_RE =~ s # don't use is_er? for getting $1.
code = codestr.hex
end
- u8 = CodesysDB.instance.get(codesys, code)
+ u8 = get_ccs(codesys, code)
next if u8.nil?
- num = parse("?"+u8)
+ num = parse(u8)
next if num.nil?
return num
raise "unknown Entity Reference"
end
+ private
+ def get_ccs(ccs, code_point)
+ cd = ChiseDB.instance
+ cd.decode_char(ccs, code_point)
+ end
+ end
+
+ class EntityReferenceParser
+ include EntityReference
+
+ def de_er(s) # replace EntityReference with corresponding character.
+ return s unless PART_RE =~ s # don't use contain_er? to get $1
+
+ er = "&"+$1+";"
+ char = Character.get(er)
+ ss = s.sub(Regexp.new(Regexp.escape(er)), char.utf8_mcs)
+
+ return de_er(ss) if contain_er?(ss) # recursive
+ ss
+ end
+ end
+
+ class EntityReferenceEncoder
+ include EntityReference
+
+ def to_er(char)
+ cid = char.char_id
+ return "&#x%04x;" % cid if cid <= 0xffff
+ return "&#x%05x;" % cid if cid <= 0xfffff
+
+ CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype|
+ code = char[codesys]
+ next if code.nil?
+ return "&#{er_prefix}%0#{keta}#{numtype};" % code
+ }
+
+ "&MCS-%08X;" % cid # the last answer
+ end
+
+ def to_er_by_ccs(cid, codesys) # not yet
+ end
+
end
end