# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
+require "chise/character"
+require "chise/parser"
+require "chise/ids"
+
class String
- def to_a()
- ar = self.split(//u) # split self to chars as UTF-8
- ar
+ include CHISE::StringIDS
+
+ # Matches a string that consists of exactly one UTF-8 encoded character
+ # (1- to 6-byte lead/continuation byte forms).
+ # copied from htree/encoder.rb
+ # Anchored with \A ... \z on purpose: \Z also matches just before a
+ # trailing newline, which let "a\n" (two characters) pass as one character.
+ # NOTE(review): the /n flag and raw \x escapes assume byte-oriented matching
+ # as in Ruby 1.8 -- confirm behavior before running on a newer Ruby.
+ UTF8_RE = /\A(?:
+ [\x00-\x7f]
+ |[\xc0-\xdf][\x80-\xbf]
+ |[\xe0-\xef][\x80-\xbf][\x80-\xbf]
+ |[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]
+ |[\xf8-\xfb][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf]
+ |[\xfc-\xfd][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf])\z/nx
+
+ # Returns true when the whole string matches UTF8_RE, false otherwise
+ # (the empty string does not match).
+ def is_a_utf8? # Is this string one character in UTF-8?
+   (UTF8_RE =~ self) != nil
 end
- def each_char() to_a.each {|ch| yield ch } end
- def each_character() to_a.each {|ch| yield ch.char } end
- def char_length() to_a.length end
- def char_at(n) to_a()[n] end
- def first_char() to_a[0] end
- def char() CHISE::Character.get(to_a[0]) end
- def to_utf8()
- return to_a.map {|ch|
- ch.char.to_utf8
- }.join("")
+ def char
+ raise unless is_a_utf8?
+ CHISE::Character.get("?"+self)
end
- def map_char(block = Proc.new)
- return unless block_given?
- return self.to_a.map {|ch| (block.call(ch)).to_s }.join("")
+ def method_missing(mid, *args, &block)
+ #char.method_missing(mid, *args)
+ char.send(mid, *args, &block)
end
- def map_char!(block = Proc.new)
- return unless block_given?
- self.replace(self.map_char {|ch| block.call(ch)})
+ # Splits the string into an array of its characters, treating it as UTF-8.
+ def to_a
+   split(//u)
 end
- def map_character(block = Proc.new)
- return unless block_given?
- return self.to_a.map {|ch| (block.call(ch.char)).to_s }.join("")
+ # Number of characters in the string, counted on the array from #to_a.
+ def char_length
+   to_a.size
 end
- def map_character!(block = Proc.new)
- return unless block_given?
- self.replace(self.map_char {|ch| block.call(ch.char)})
+ def each_char
+ to_a.each {|c|
+ yield(c)
+ }
end
- def method_missing(mid, *args)
- if char_length == 1 #\8fÈ\97ª\8c`\82ª\97L\8cø\82È\82Ì\82Í\81A\88ê\95¶\8e\9a\82Ì\8e\9e\82¾\82¯
- char.method_missing(mid, *args)
- else
- raise NameError, "undefined method `#{mid.id2name}'", caller(1)
- end
+ def map_char
+ to_a.map {|c|
+ yield(c).to_s
+ }.join
end
- def map_utf8() map_char {|ch| ch.char.map_utf8 } end
- alias map_ucs map_utf8
-
- def map_ucs_er() map_char {|ch| ch.char.map_ucs_er } end
- def to_er() map_char {|ch| ch.char.to_er } end
-
- # Decided not to provide the put-related and []-related methods.
- def de_er!() #EntityReference\82ð\8eæ\82è\8f\9c\82
- return self unless self =~ Regexp.new(EntityReference::REGEXP_PART) #\82»\82ê\82ç\82µ\82¢\82Ì\82ª\96³\82¯\82ê\82Î\89½\82à\82µ\82È\82¢
- er = "&"+$1+";"
- self.sub!(Regexp.new(Regexp.escape(er)), Character.new(er).mcs_utf8) #\95Ï\8a·\8e©\91Ì\82ÍCharacter\82É\82Ü\82©\82¹\82é
- return self.de_er! if self =~ Regexp.new(EntityReference::REGEXP_PART) #\82Ü\82¾\82 \82Á\82½\82ç\8dÄ\8bA
- return self
- end
-
- def de_er() return self.dup.de_er!; end
-
- def inspect_all() map_char {|ch| ch.char.inspect_all } end
- def inspect_x() map_char {|ch| ch.char.inspect_x } end
-
-# def to_euc() map_char {|ch| ch.char.to_euc } end
- def map_euc() map_char {|ch| ch.char.map_euc } end
-# def to_sjis() map_char {|ch| ch.char.to_sjis } end
- def map_sjis() map_char {|ch| ch.char.map_sjis } end
-
- def glyph_decompose() map_char {|ch| ch.char.glyph_decompose } end
- def decompose() map_char {|ch| ch.char.decompose } end
- def decompose!() self.replace(self.decompose); self; end
-
- def nu_decompose_all(level=nil)
- level = 0 if level.nil?
- if 10 < level
- p ["too many recursive", self]
- exit
- end
- de = self.decompose
- return de.decompose_all(level+1) if de != self #\82È\82É\82©\95Ï\89»\82ª\82 \82Á\82½\82©\82ç\8dÄ\8bA
- de #\82à\82¤\82±\82ê\88È\8fã\95Ï\89»\82Í\96³\82³\82»\82¤\82¾\82¼\82Æ\81B
- end
-
- def decompose_all() map_char {|ch| ch.char.decompose_all } end
- def decompose_all!() self.replace(self.decompose_all); self; end
-
- def find() #"\93ú\89_"\81¨"\93Ü"\82Æ\82©\82¢\82¤\8a´\82¶\82Ì\91\80\8dì
- ar = []
- length = char_length()
- each_char {|ch|
- char = ch.char
- ar << char.ids_contained #\82»\82Ì\95¶\8e\9a\82ð\8aÜ\82ñ\82Å\82¢\82é\8a¿\8e\9a\82Ì\83\8a\83X\83g
+ def each_character # yields String#char of each character (the value from CHISE::Character.get)
+ to_a.each {|ch|
+ yield ch.char
+ }
- h = Hash.new(0)
- ar.each {|list|
- next if list.nil?
- list.each_char {|ch|
- h[ch] += 1
- }
- }
- str = ""
- h.each {|k, v|
- # p [k, v]
- if length == v # if it appeared in every one of the lists
- str += k
- end
- }
- # p str
- str
 end
- def compose()
- db = CHISE::CodesysDB.instance
- composed = db.get("ids", self)
- return "" if composed.nil? #\82È\82©\82Á\82½\82æ\82Æ\81B
- return "" if composed.char_length == 0 #\82È\82É\82²\82Æ?
- return composed if composed.char_length == 1
- composed.each_char {|ch|
- char = ch.char
- return ch if char.has_attribute? #\82Æ\82è\82 \82¦\82¸\8dÅ\8f\89\82É\82Ý\82Â\82©\82Á\82½\82à\82Ì\82ð\95Ô\82·\82Æ\82¢\82¤\83k\83\8b\82¢\8ed\97l
- }
- return "" #attribute\82ð\8e\9d\82Â\82à\82Ì\82ª\88ê\82Â\82à\96³\82©\82Á\82½\82ç\81A""\82É\82·\82é
+ # Returns whatever CHISE::EntityReferenceParser#de_er produces for this
+ # string (presumably the text with entity references resolved -- confirm
+ # against chise/parser).
+ def de_er()
+   CHISE::EntityReferenceParser.new.de_er(self)
 end
- def aggregate()
- # Treat the string self as an IDS; rather than composing it completely,
- # take out only its sub-parts and compose as many of them as can be composed.
- tree = CHISE::IDS_Tree.new(self)
- return self if tree.depth <= 1 #sub_nodes\82ª\96³\82¢\8fê\8d\87\82Í\82±\82±\82Å\82³\82æ\82È\82ç
- tree.sub_nodes.each {|node|
- c = node.compose
- next if c.nil? || c == ""
- # print "#{self} #{node} #{c}\n"
- # p [self, node, c]
- n = self.gsub(node, c)
- return n.aggregate
- }
- return self #\82¨\82«\82©\82¦\82ç\82ê\82é\82à\82Ì\82ª\82Ü\82Á\82½\82\82È\82©\82Á\82½\82ç\81A\8e©\95ª\82ð\82©\82¦\82·\81B
+ def to_ids # wraps this string in a new CHISE::IDS object
+ CHISE::IDS.new(self)
 end
end