From: eto Date: Fri, 11 Jun 2004 15:24:06 +0000 (+0000) Subject: i X-Git-Url: http://git.chise.org/gitweb/?a=commitdiff_plain;h=ffce8c4eb4a667debd47bfe237de4588414b0bb4;p=chise%2Fruby.git i --- diff --git a/chise/char.rb b/chise/char.rb index 2d685be..8441ae6 100755 --- a/chise/char.rb +++ b/chise/char.rb @@ -1,8 +1,4 @@ # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. -require "chise/db" -require "chise/config" -require "chise/iconv" -require "chise/uconv" -require "chise/string" require "chise/character" +require "chise/string" diff --git a/chise/character.rb b/chise/character.rb index 753089a..190792f 100755 --- a/chise/character.rb +++ b/chise/character.rb @@ -2,6 +2,8 @@ require "singleton" require "chise/parser" +require "chise/chisedb" +require "chise/iconv" module CHISE class CharacterFactory # generate Character object and cache them @@ -18,333 +20,107 @@ module CHISE @chars = {} end - def get(char_id) + def get(s) check_max - mcs = @parser.parse(char_id) + mcs = @parser.parse(s) @chars[mcs] = Character.new(mcs) if @chars[mcs].nil? @chars[mcs] end def check_max - clear if MAX_CACHE_CHARACTER < @chars.length # clear all caches. + clear if MAX_CACHE_CHARACTER < @chars.length # clear all cache end end class Character - def initialize(char_id=nil) + def initialize(char_id) + raise if char_id.nil? + raise unless char_id.is_a?(Fixnum) # char_id sure is a Fixnum. + raise if char_id < 0 # char_id sure is a positive value. @char_id = char_id - @attributes = {} - @check_all_database = false + @char_id.freeze + @utf8_mcs = CHISE.i_tou8(@char_id) + @utf8_mcs.freeze + @feature = {} + @check_all_done = nil end attr_reader :char_id + attr_reader :utf8_mcs - - - - - - def to_i() @char_id end - def mcs_utf8() Character.u4itou8(@char_id) end - def mcs_hex() sprintf("%x", @char_id) end - - def self.get(char_id) # flyweight pattern - CharacterFactory.instance.get(char_id) - end - - def normalize_attribute_name(b) - a = b.dup - a.gsub!(/_/, "-") #underline‚Í-‚É’uŠ· - a.sub!(/-at-/, "@") - a.sub!(/^map-/, "=>") - a.sub!(/^to-/, "->") - a.sub!(/^from-/, "<-") - a - end - - def get_char_attribute(b) # XEmacs CHISE compatible API - a = normalize_attribute_name(b) - #p [a, b] - atr = @attributes[a] - return atr if atr - atr = check_database(a) - if atr - @attributes[a] = atr - return atr - end - return get_char_attribute("="+a) unless a =~ /^=/ - #“ª‚É=‚ª‚‚¢‚Ä‚È‚¢ê‡‚Í‚»‚ꂪÈ—ª‚³‚ê‚Ä‚¢‚邱‚Æ‚ð‰¼’è‚µ‚āAÄ‹A‚·‚é - nil - end - - def put_char_attribute(b,v) - a = normalize_attribute_name(b) - @attributes[a] = v; - CharDB.instance.put(a, mcs_utf8(), v) + def self.get(s) + CharacterFactory.instance.get(s) end - def char_attribute_alist() check_all_database(); @attributes; end - def char_attribute_list() check_all_database(); @attributes.keys; end - alias [] get_char_attribute #‚»‚Ì—ªÌ - alias []= put_char_attribute - alias alist char_attribute_alist - alias list char_attribute_list - - def method_missing(mid, *args) # ref. ostruct.rb - mname = mid.id2name - return get_char_attribute(mname) if args.length == 0 - put_char_attribute(mname.chop, args[0]) if mname =~ /=$/ #‘ã“ü - end - - def has_attribute?() # ˆÓ–¡‚Ì‚ ‚éattribute‚ðŽ‚Á‚Ä‚Ü‚·‚©? - keys = list - keys.delete_if {|k| - k =~ /ids/ - } - return (keys.length != 0) - end - - def ==(ch) - return false if ch.nil? - return false unless ch.is_a? Character - self.char_id == ch.char_id + def inspect + sprintf("Char:%x", @char_id) end - def self.u4itou4(num) - return "" unless num.is_a?(Integer) - return sprintf("%c%c%c%c", num&0xff, (num >> 8)&0xff, (num >> 16)&0xff, (num >> 24)&0xff) #UCS-4”’l‚𕶎š—ñ‚É‚µ‚Äreturn - end - - def self.u4itou8(char_id) #ucs‚̐”’l‚ðŽó‚¯‚Æ‚èAUTF-8‚Ì•¶Žšˆê•¶Žš‚ð•Ô‚· - begin - u4 = Character.u4itou4(char_id) - u8 = Uconv.u4tou8(u4) - return u8 - rescue - #raise ArgumentError, "invalid char_id (#{char_id})", caller(1) - #print "error\n" - return "" - end - end - - def check_database(a) - db = CharDB.instance - u8 = mcs_utf8() - v = db.get(a, u8) # u8‚Å•\‚³‚ê‚镶Žš‚ÌaƒAƒgƒŠƒrƒ…[ƒg‚𒲂ׂéB - v - end - - def check_all_database() # Œ»Ý‚Ì@char_id‚©‚çA•¶Žšƒf[ƒ^ƒx[ƒX‚ðŽQÆ‚·‚é - return if @check_all_database - return if @char_id.nil? - db = CharDB.instance - u8 = mcs_utf8() - atrs = db.get_all(u8) #u8‚Å•\‚³‚ê‚镶Žš‚̃AƒgƒŠƒrƒ…[ƒg‚ð‘S•”Ž‚Á‚Ä‚±‚¢ - atrs.each {|a,v| - @attributes[a] = v #‚Æ‚©‚¢‚¤Š´‚¶‚Å‘ã“ü‚·‚é‚Ì‚Å‚¦‚¦‚©‚È? - } - @check_all_database = true #d‚¢ˆ—‚Ȃ̂ňꉞcheck‚·‚é - end + def to_s() @utf8_mcs; end - def ucs() - #p "ucs" - #ar = %w{ucs ucs-big5 ucs-cdp ucs-cns ucs-jis ucs-ks =>ucs =>ucs* =>ucs-jis} - #ar = %w{ucs ucs-jis ucs-big5 ucs-cdp ucs-cns ucs-ks =>ucs =>ucs* =>ucs-jis} - ar = %w{ucs-jis ucs =>ucs-jis} - #•À‚я‡‚Íœ“ˆÓ“I‚ŁAucs-jis‚ðæ‚ɏo‚µ‚Ä‚¢‚éB–{—ˆ‚Í‚±‚ê‚àŽw’è‚Å‚«‚é‚悤‚É‚·‚é‚ׂ«B - ar.each {|a| #p [a] - u = get_char_attribute(a) - return u if u - } - nil - end + def [](f) + f = normalize_feature_name(f) - #-------------------------------------------------------------------CCSŠÖŒW - def to_utf8() Uconv.u4tou8(Character.u4itou4(ucs())) end #UTF8•¶Žš—ñ‚ð•Ô‚· - #alias to_s to_utf8 - alias to_s mcs_utf8 + v = @feature[f] + return v if v + v = @feature["="+f] + return v if v - def map_utf8() - u = ucs() - if u.nil? || 0xffff < u - return to_er() - else - return to_utf8() + v = get_feature(f) + if v + @feature[f] = v + return v end - end - alias map_ucs map_utf8 - def map_ucs_er() - u = ucs() - if u.nil? || 0xffff < u - return to_er() - else - return Character.get(u).to_er() + v = get_feature("="+f) + if v + @feature["="+f] = v + return v end - end - - def to_euc() - u = ucs() - return "" if u.nil? || 0xffff < u - Uconv.u16toeuc(Uconv.u4tou16(Character.u4itou4(ucs()))) - end - - def map_euc() - e = to_euc() - return e if e != "" - return to_er() - end - - def to_sjis() - u = ucs() - return "" if u.nil? || 0xffff < u - Uconv.u16tosjis(Uconv.u4tou16(Character.u4itou4(ucs()))) - end - - def map_sjis() - e = to_sjis() - return e if e != "" - return to_er() - end - - def to_er(codesys=nil) #ŽÀ‘ÌŽQÆ‚ð•Ô‚·AŠó–]‚·‚écodesys‚ªˆø”(–¢ŽÀ‘•) - return "" if @char_id.nil? - return sprintf("&#x%04x;", @char_id) if @char_id <= 0xffff - return sprintf("&#x%05x;", @char_id) if @char_id <= 0xfffff - EntityReference.each_codesys {|codesys, er_prefix, keta, numtype| - code = self[codesys] - next if code.nil? - return sprintf("&#{er_prefix}%0#{keta}#{numtype};", code) - } - return sprintf("&MCS-%08X;", @char_id) #–{“–‚Í‚±‚ê‚Í–³‚µ‚É‚µ‚½‚¢ - end - - def to_er_list() - ar = [] - EntityReference.each_codesys {|codesys, er_prefix, keta, numtype| - er = to_er(codesys) - ar << er if er - } - ar - end - - def inspect_x() - return "<>" if @char_id.nil? - ar = [to_utf8(), to_er().sub(/^&/,"").chop] - "<"+ar.join(",")+">" - end - alias inspect inspect_x - def inspect_all_codesys() #–¢Š®¬ - #to_er‚ð‘S‚Ä‚Ìcodesys‚É‚¨‚¢‚ÄŽÀs‚·‚éB‚»‚ÌŒ‹‰Ê‚ðƒRƒ“ƒpƒNƒg‚É‚Ü‚Æ‚ß‚é + nil end - def inspect_all() - ar = [inspect.chop] - alist.to_a.sort.each {|a, v| ar << "#{a}:#{v}" } - return ar.join(",")+">" + def []=(k,v) + f = normalize_feature_name(k) + cd = ChiseDB.instance + ft = cd.get_feature(f) + ft.set_value(@char_id, v) + @feature[f] = v; end - def dump_all() - ar = [inspect] - alist.to_a.sort.each {|a, v| ar << "#{a}:#{v}" } - return ar.join('\n')+'\n' - end + def method_missing(mid, *args) # ref. ostruct.rb + mname = mid.id2name - def get_attributes() - str = "" - alist.to_a.sort.each {|a, v| - str += "#{a}: #{v}\n" - } - str - end + return self[mname] if args.empty? # get - def inspect_ids(hex_flag=false) - ids = decompose - ar = [] - ar << (hex_flag ? "x"+mcs_hex : to_utf8) - if to_s != ids #ids‚ª•”•i‚»‚Ì‚à‚Ì‚¾‚Á‚½‚ç•”•i’ljÁ‚Í‚µ‚È‚¢ - ids.each_char {|ch| - char = ch.char - next if char.is_ids? - if hex_flag then - ar << "x"+char.mcs_hex - else - u = char.to_utf8 - if u != "" - ar << u - else - ar << char.to_er - end - end - } + if args.length == 1 && /=\Z/ =~ mname # put + self[mname.chop] = args.shift + return end - return "("+ar.join("\t")+")" - end - #--------------------------------------------------------------------IDSŠÖŒW - def glyph_decompose() do_decompose(false) end - def decompose() do_decompose(true) end - def do_decompose(check_meaning = true) - k = self.to_s - # idss = self["ids"] - # return idss if idss - # return k if self.is_basic_kanji? #Šî–{Š¿Žš‚Ístop kanji‚Æ‚·‚邼‚ƁB - if check_meaning - return self["ids-represent"] if self["ids-represent"] #ids_represent‚ðŽ‚Á‚Ä‚¢‚éê‡‚Í‚»‚Ì’l‚Æ‚·‚éB - return self["ids-element"] if self["ids-element"] #ids_element‚ðŽ‚Á‚Ä‚¢‚éê‡‚Í‚»‚Ì’l‚Æ‚·‚éB - idss = self["ids-meaning"] - return idss if idss && 0 < idss.length && k != idss - end - idss = self["ids-aggregated"] - return idss if idss && 0 < idss.length && k != idss - idss = self["ids"] - return idss if idss && 0 < idss.length && k != idss - return k - # return k if idss.nil? || idss.length == 0 || k == idss - # if idss.char_length == 2 - # p ["What???", k, idss, k.inspect_all] - # #return idssx[1] #“ñŒÂ–Ú‚¾‚¯•Ô‚·‚Æ‚©? - # return k #IDS‚É“WŠJ‚·‚é•û–@‚ª–³‚¢‚ƁB - # end - # return k if k == idss - # if idss.include?(k) #‚±‚Ì“ñ•¶Žš‚ÌBUG‘΍ô - # #return idss.sub(k, "") - # return k #IDS‚É“WŠJ‚·‚é•û–@‚ª–³‚¢‚ƁB - # end - # return idss + raise "error" end - def decompose_all - pde = "" - de = self.decompose #o”­“_ - level = 0 - while true - pde = de - de = pde.decompose #‚à‚¤ˆê“x•ª‰ð‚ð‚µ‚Ä‚Ý‚éB - break if pde == de #ƒ‹[ƒv‚𔲂¯‚¾‚· - exit if 10 < level #p ["too many recursive", self] - level += 1 - end - return de + def to_er + en = EntityReferenceEncoder.new + en.to_er(self) end - def decompose_all_nu(level=nil) - level = 0 if level.nil? - if 10 < level - p ["too many recursive", self] - exit - end - de = self.decompose - return de.decompose_all(level+1) if de != self #‚È‚É‚©•Ï‰»‚ª‚ ‚Á‚½‚©‚çÄ‹A - return de #‚à‚¤‚±‚êˆÈã•Ï‰»‚Í–³‚³‚»‚¤‚¾‚¼‚ƁB - end + private - def is_ids?() 0x2ff0 <= @char_id && @char_id <= 0x2fff end + def get_feature(f) + cd = ChiseDB.instance + cd.load_feature(f, @char_id) + end - def ids_operator_argc() - return 0 unless is_ids? - return 3 if @char_id == 0x2ff2 || @char_id == 0x2ff3 - return 2 + def normalize_feature_name(a) + a = a.gsub(/_/, "-") #underlineは-に置換 + a = a.sub(/-at-/, "@") + a = a.sub(/-plus-/, "+") + a = a.sub(/\Amap-/, "=>") + a = a.sub(/\Ato-/, "->") + a = a.sub(/\Afrom-/, "<-") + a end end end diff --git a/chise/chisedb.rb b/chise/chisedb.rb new file mode 100755 index 0000000..18deeb6 --- /dev/null +++ b/chise/chisedb.rb @@ -0,0 +1,52 @@ +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. + +require "singleton" +require "chise/rbchise" + +module CHISE + class ChiseDB + include Singleton + + def initialize + @ds = DataSource.new + end + + def location() @ds.location; end + + def get_feature(f) + @ds.get_feature(f) + end + + def get_ccs(c) + @ds.get_ccs(c) + end + + def decode_char(name, cid) + v = @ds.decode_char(name, cid) + v + end + + def load_feature(name, cid) + v = @ds.load_feature(name, cid) + v = normalize_value(v) + v + end + + def each_feature + @ds.each_feature {|f| + yield f + } + end + + private + + def normalize_value(v) + return v if v.nil? + return v.to_i if /\A\d+\Z/ =~ v # number? + return $1 if /\A"(.+)"\Z/ =~ v # remove surrounding " + #return v.sub(/\A\?/, "") if v =~ /\A\?/ # remove ? in the head + #return parse_sexp(v) if v =~ /\A\(.+\)\Z/ # parse sexp # not yet + v + end + end +end diff --git a/chise/db.rb b/chise/db.rb index e5707d4..2371a61 100755 --- a/chise/db.rb +++ b/chise/db.rb @@ -40,16 +40,6 @@ module CHISE return key end - def myvalue(v) - return v if v == nil - return v.to_i if v =~ /^\d+$/ #数字だったらここで変換しておく - return v.sub(/^\?/, "") if v =~ /^\?/ #冒頭の?は取り除く - return $1 if v =~ /^"(.+)"$/ #最初と最後に"がついていたら、取り除く - #p ["get", v, t, key, db] - #return parse_sexp(v) if v =~ /^\(.+\)$/ #最初と最後が()の時は、S式にparseする - return v #それ以外って何? - end - def myget(key) #keyキーを引いて返す key = mykey(key) v = get(key) #存在しなかったらnilを返すことになる @@ -64,10 +54,8 @@ module CHISE end class DB # abstract class for DataBase - # translate file name for deal with Windows file system. - def get_filename(t) - return @pre + DB.unix_to_win(t) + @post if CHISE.windows? + return @pre + CHISE.unix_to_win(t) + @post if CHISE.windows? return @pre + t + @post end @@ -90,7 +78,7 @@ module CHISE } keys = [] files.each {|f| - t = DB.win_to_unix(f) + t = CHISE.win_to_unix(f) t.sub!(%r|^#{@pre}|, "") t.sub!(%r|#{@post}$|, "") if @post != "" keys << t @@ -347,6 +335,6 @@ module CHISE return char unless char.nil? return nil end - end + end end diff --git a/chise/iconv.rb b/chise/iconv.rb index 5833f48..a55b6fa 100755 --- a/chise/iconv.rb +++ b/chise/iconv.rb @@ -82,9 +82,27 @@ class String s = self return (s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3]) end + + def u8to_i + u32 = self.u8tou32 + u32.u32to_i + end +end + +module CHISE + def i_tou32(n) # convert a integer to UTF-32 String + raise unless n.is_a?(Integer) + sprintf("%c%c%c%c", (n >> 24)&0xff, (n >> 16)&0xff, (n >> 8)&0xff, n&0xff) + end + + def i_tou8(n) # convert a integer to UTF-8 String + u32 = CHISE.i_tou32(n) + u32.u32tou8 + end + module_function :i_tou32, :i_tou8 end -class Uconv +class NuUconv def self.u8tou4(s) s.u8tou32; end def self.u4tou8(s) s.u32tou8; end def self.u4tou16(s) s.u32tou16; end diff --git a/chise/ids.rb b/chise/ids.rb index d1c62c0..e46bc51 100755 --- a/chise/ids.rb +++ b/chise/ids.rb @@ -1,6 +1,5 @@ # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. -$LOAD_PATH << "../../lib" if $0 == __FILE__ require "chise/db" module CHISE diff --git a/chise/org-character.rb b/chise/org-character.rb new file mode 100755 index 0000000..e238bb2 --- /dev/null +++ b/chise/org-character.rb @@ -0,0 +1,216 @@ + def mcs_hex() sprintf("%x", @char_id) end + + def char_feature_alist() check_all_database(); @features; end + def char_feature_list() check_all_database(); @features.keys; end + alias alist char_feature_alist + alias list char_feature_list + + def has_feature?() # ˆÓ–¡‚Ì‚ ‚éfeature‚ðŽ‚Á‚Ä‚Ü‚·‚©? + keys = list + keys.delete_if {|k| + k =~ /ids/ + } + return (keys.length != 0) + end + + def check_database(a) + db = CharDB.instance + u8 = mcs_utf8() + v = db.get(a, u8) # u8‚Å•\‚³‚ê‚镶Žš‚ÌaƒAƒgƒŠƒrƒ…[ƒg‚𒲂ׂéB + v + end + + def check_all_database() # Œ»Ý‚Ì@char_id‚©‚çA•¶Žšƒf[ƒ^ƒx[ƒX‚ðŽQÆ‚·‚é + return if @check_all_database + return if @char_id.nil? + db = CharDB.instance + u8 = mcs_utf8() + atrs = db.get_all(u8) #u8‚Å•\‚³‚ê‚镶Žš‚̃AƒgƒŠƒrƒ…[ƒg‚ð‘S•”Ž‚Á‚Ä‚±‚¢ + atrs.each {|a,v| + @features[a] = v #‚Æ‚©‚¢‚¤Š´‚¶‚Å‘ã“ü‚·‚é‚Ì‚Å‚¦‚¦‚©‚È? + } + @check_all_database = true #d‚¢ˆ—‚Ȃ̂ňꉞcheck‚·‚é + end + + def ucs() + #p "ucs" + #ar=%w{ucs ucs-big5 ucs-cdp ucs-cns ucs-jis ucs-ks =>ucs =>ucs* =>ucs-jis} + #ar=%w{ucs ucs-jis ucs-big5 ucs-cdp ucs-cns ucs-ks =>ucs =>ucs* =>ucs-jis} + ar = %w{ucs-jis ucs =>ucs-jis} + #•À‚я‡‚Íœ“ˆÓ“I‚ŁAucs-jis‚ðæ‚ɏo‚µ‚Ä‚¢‚éB–{—ˆ‚Í‚±‚ê‚àŽw’è‚Å‚«‚é‚悤‚É‚·‚é‚ׂ«B + ar.each {|a| #p [a] + u = get_char_feature(a) + return u if u + } + nil + end + + #-------------------------------------------------------------------CCSŠÖŒW + def to_utf8() Uconv.u4tou8(Character.u4itou4(ucs())) end #UTF8•¶Žš—ñ‚ð•Ô‚· + #alias to_s to_utf8 + alias to_s mcs_utf8 + + def map_utf8() + u = ucs() + if u.nil? || 0xffff < u + return to_er() + else + return to_utf8() + end + end + alias map_ucs map_utf8 + + def map_ucs_er() + u = ucs() + if u.nil? || 0xffff < u + return to_er() + else + return Character.get(u).to_er() + end + end + + def to_euc() + u = ucs() + return "" if u.nil? || 0xffff < u + Uconv.u16toeuc(Uconv.u4tou16(Character.u4itou4(ucs()))) + end + + def map_euc() + e = to_euc() + return e if e != "" + return to_er() + end + + def to_sjis() + u = ucs() + return "" if u.nil? || 0xffff < u + Uconv.u16tosjis(Uconv.u4tou16(Character.u4itou4(ucs()))) + end + + def map_sjis() + e = to_sjis() + return e if e != "" + return to_er() + end + + def to_er_list() + ar = [] + EntityReference.each_codesys {|codesys, er_prefix, keta, numtype| + er = to_er(codesys) + ar << er if er + } + ar + end + + def inspect_all_codesys() #–¢Š®¬ + #to_er‚ð‘S‚Ä‚Ìcodesys‚É‚¨‚¢‚ÄŽÀs‚·‚éB‚»‚ÌŒ‹‰Ê‚ðƒRƒ“ƒpƒNƒg‚É‚Ü‚Æ‚ß‚é + end + + def inspect_all() + ar = [inspect.chop] + alist.to_a.sort.each {|a, v| ar << "#{a}:#{v}" } + return ar.join(",")+">" + end + + def dump_all() + ar = [inspect] + alist.to_a.sort.each {|a, v| ar << "#{a}:#{v}" } + return ar.join('\n')+'\n' + end + + def get_features() + str = "" + alist.to_a.sort.each {|a, v| + str += "#{a}: #{v}\n" + } + str + end + + def inspect_ids(hex_flag=false) + ids = decompose + ar = [] + ar << (hex_flag ? "x"+mcs_hex : to_utf8) + if to_s != ids #ids‚ª•”•i‚»‚Ì‚à‚Ì‚¾‚Á‚½‚ç•”•i’ljÁ‚Í‚µ‚È‚¢ + ids.each_char {|ch| + char = ch.char + next if char.is_ids? + if hex_flag then + ar << "x"+char.mcs_hex + else + u = char.to_utf8 + if u != "" + ar << u + else + ar << char.to_er + end + end + } + end + return "("+ar.join("\t")+")" + end + + #--------------------------------------------------------------------IDSŠÖŒW + def glyph_decompose() do_decompose(false) end + def decompose() do_decompose(true) end + def do_decompose(check_meaning = true) + k = self.to_s + # idss = self["ids"] + # return idss if idss + # return k if self.is_basic_kanji? #Šî–{Š¿Žš‚Ístop kanji‚Æ‚·‚邼‚ƁB + if check_meaning + return self["ids-represent"] if self["ids-represent"] #ids_represent‚ðŽ‚Á‚Ä‚¢‚éê‡‚Í‚»‚Ì’l‚Æ‚·‚éB + return self["ids-element"] if self["ids-element"] #ids_element‚ðŽ‚Á‚Ä‚¢‚éê‡‚Í‚»‚Ì’l‚Æ‚·‚éB + idss = self["ids-meaning"] + return idss if idss && 0 < idss.length && k != idss + end + idss = self["ids-aggregated"] + return idss if idss && 0 < idss.length && k != idss + idss = self["ids"] + return idss if idss && 0 < idss.length && k != idss + return k + # return k if idss.nil? || idss.length == 0 || k == idss + # if idss.char_length == 2 + # p ["What???", k, idss, k.inspect_all] + # #return idssx[1] #“ñŒÂ–Ú‚¾‚¯•Ô‚·‚Æ‚©? + # return k #IDS‚É“WŠJ‚·‚é•û–@‚ª–³‚¢‚ƁB + # end + # return k if k == idss + # if idss.include?(k) #‚±‚Ì“ñ•¶Žš‚ÌBUG‘΍ô + # #return idss.sub(k, "") + # return k #IDS‚É“WŠJ‚·‚é•û–@‚ª–³‚¢‚ƁB + # end + # return idss + end + + def decompose_all + pde = "" + de = self.decompose #o”­“_ + level = 0 + while true + pde = de + de = pde.decompose #‚à‚¤ˆê“x•ª‰ð‚ð‚µ‚Ä‚Ý‚éB + break if pde == de #ƒ‹[ƒv‚𔲂¯‚¾‚· + exit if 10 < level #p ["too many recursive", self] + level += 1 + end + return de + end + + def decompose_all_nu(level=nil) + level = 0 if level.nil? + if 10 < level + p ["too many recursive", self] + exit + end + de = self.decompose + return de.decompose_all(level+1) if de != self #‚È‚É‚©•Ï‰»‚ª‚ ‚Á‚½‚©‚çÄ‹A + return de #‚à‚¤‚±‚êˆÈã•Ï‰»‚Í–³‚³‚»‚¤‚¾‚¼‚ƁB + end + + def is_ids?() 0x2ff0 <= @char_id && @char_id <= 0x2fff end + + def ids_operator_argc() + return 0 unless is_ids? + return 3 if @char_id == 0x2ff2 || @char_id == 0x2ff3 + return 2 + end diff --git a/chise/org-string.rb b/chise/org-string.rb new file mode 100755 index 0000000..323d135 --- /dev/null +++ b/chise/org-string.rb @@ -0,0 +1,121 @@ + def each_character() to_a.each {|ch| yield ch.char } end + def char_length() to_a.length end + def to_utf8() + return to_a.map {|ch| + ch.char.to_utf8 + }.join("") + end + + def map_char(block = Proc.new) + return unless block_given? + return self.to_a.map {|ch| (block.call(ch)).to_s }.join("") + end + + def map_char!(block = Proc.new) + return unless block_given? + self.replace(self.map_char {|ch| block.call(ch)}) + end + + def map_character(block = Proc.new) + return unless block_given? + return self.to_a.map {|ch| (block.call(ch.char)).to_s }.join("") + end + + def map_character!(block = Proc.new) + return unless block_given? + self.replace(self.map_char {|ch| block.call(ch.char)}) + end + + def map_utf8() map_char {|ch| ch.char.map_utf8 } end + alias map_ucs map_utf8 + + def map_ucs_er() map_char {|ch| ch.char.map_ucs_er } end + def to_er() map_char {|ch| ch.char.to_er } end + + #putŠÖŒWA[]ŠÖŒW‚Í—pˆÓ‚µ‚È‚¢‚±‚Æ‚É‚µ‚½B + + def inspect_all() map_char {|ch| ch.char.inspect_all } end + def inspect_x() map_char {|ch| ch.char.inspect_x } end + +# def to_euc() map_char {|ch| ch.char.to_euc } end + def map_euc() map_char {|ch| ch.char.map_euc } end +# def to_sjis() map_char {|ch| ch.char.to_sjis } end + def map_sjis() map_char {|ch| ch.char.map_sjis } end + + def glyph_decompose() map_char {|ch| ch.char.glyph_decompose } end + def decompose() map_char {|ch| ch.char.decompose } end + def decompose!() self.replace(self.decompose); self; end + + def nu_decompose_all(level=nil) + level = 0 if level.nil? + if 10 < level + p ["too many recursive", self] + exit + end + de = self.decompose + return de.decompose_all(level+1) if de != self #‚È‚É‚©•Ï‰»‚ª‚ ‚Á‚½‚©‚çÄ‹A + de #‚à‚¤‚±‚êˆÈã•Ï‰»‚Í–³‚³‚»‚¤‚¾‚¼‚ƁB + end + + def decompose_all() map_char {|ch| ch.char.decompose_all } end + def decompose_all!() self.replace(self.decompose_all); self; end + + def find() #"“ú‰_"¨"“Ü"‚Æ‚©‚¢‚¤Š´‚¶‚Ì‘€ì + ar = [] + length = char_length() + each_char {|ch| + char = ch.char + ar << char.ids_contained #‚»‚Ì•¶Žš‚ðŠÜ‚ñ‚Å‚¢‚銿Žš‚̃ŠƒXƒg + } + h = Hash.new(0) + ar.each {|list| + next if list.nil? + list.each_char {|ch| + h[ch] += 1 + } + } + str = "" + h.each {|k, v| + # p [k, v] + if length == v #‘S•”‚ÉŠç‚ðo‚µ‚Ä‚¢‚½‚ç + str += k + end + } + # p str + str + end + + def compose() + db = CHISE::CodesysDB.instance + composed = db.get("ids", self) + return "" if composed.nil? #‚È‚©‚Á‚½‚æ‚ƁB + return "" if composed.char_length == 0 #‚È‚É‚²‚Æ? + return composed if composed.char_length == 1 + composed.each_char {|ch| + char = ch.char + return ch if char.has_attribute? #‚Æ‚è‚ ‚¦‚¸Å‰‚ɂ݂‚©‚Á‚½‚à‚Ì‚ð•Ô‚·‚Æ‚¢‚¤ƒkƒ‹‚¢Žd—l + } + return "" #attribute‚ðŽ‚Â‚à‚Ì‚ªˆê‚Â‚à–³‚©‚Á‚½‚çA""‚É‚·‚é + end + + def aggregate() + #self‚Å‚ ‚镶Žš—ñ‚ðIDS‚¾‚Ɖ¼’肵A‚»‚ê‚ðŠ®‘S‚Écompose‚µ‚«‚ç‚È‚¢‚ŁA + #‚»‚Ì•”•ªW‡‚¾‚¯‚ð‚Ƃ肾‚µ‚āAcompose‰Â”\‚Å‚ ‚ê‚΂ł«‚邾‚¯compose‚·‚éB + tree = CHISE::IDS_Tree.new(self) + return self if tree.depth <= 1 #sub_nodes‚ª–³‚¢ê‡‚Í‚±‚±‚Å‚³‚æ‚È‚ç + tree.sub_nodes.each {|node| + c = node.compose + next if c.nil? || c == "" + # print "#{self} #{node} #{c}\n" + # p [self, node, c] + n = self.gsub(node, c) + return n.aggregate + } + return self #‚¨‚«‚©‚¦‚ç‚ê‚é‚à‚Ì‚ª‚Ü‚Á‚½‚­‚È‚©‚Á‚½‚çAŽ©•ª‚ð‚©‚¦‚·B + end + +---------------------------------------------------------------------- +¡ƒXƒe + def char_at(n) to_a()[n] end + def first_char() to_a[0] end +---------------------------------------------------------------------- diff --git a/chise/parser.rb b/chise/parser.rb index 812b77c..f790468 100755 --- a/chise/parser.rb +++ b/chise/parser.rb @@ -1,39 +1,16 @@ # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. -module CHISE - class CharacterParser - def parse(c) # parse a value and return a number (MCS) - raise "c is nil" if c.nil? - - if c.kind_of?(String) - if /\A\?/ =~ c - c = c.sub(/\A\?/, "") # remove "?" in the head - u4 = c.u8tou32 # translate from UTF-8 to UTF-32 - return u4.u32to_i # translate UTF-32 to UCS number - end - - return parse_er(c) if is_er?(c) # ER? - - return c.to_i if /^\d+$/ =~ c # only numbers? - - raise "unknown format" - end - - if c.kind_of?(Numeric) - c = 0x80000000 + c if c < 0 # negative value - return c.to_i - end - - raise "unknown object" - end +require "chise/chisedb" +module CHISE + module EntityReference PART = "&([-+0-9A-Za-z#]+);" - ALL = "\\A#{PART}\\Z" + ALL = '\A'+PART+'\Z' PART_RE = Regexp.new(PART) ALL_RE = Regexp.new(ALL) def contain_er?(s) (PART_RE =~ s) != nil; end - def is_er?(s) (ALL_RE =~ s) != nil; end + def is_er?(s) (ALL_RE =~ s) != nil; end # the order is important. The primary charset should be selectable. CODESYS_TABLE = [ @@ -61,6 +38,35 @@ module CHISE %w( =gt-k GT-K 5 d), ] PRIVATE_USE_AREA = 0xe000 + end + + class CharacterParser + include EntityReference + + def parse(c) # parse a value and return a number (MCS) + raise "c is nil" if c.nil? + + if c.kind_of?(String) + if /\A\?/ =~ c + c = c.sub(/\A\?/, "") # remove "?" in the head + u4 = c.u8tou32 # translate from UTF-8 to UTF-32 + return u4.u32to_i # translate UTF-32 to UCS number + end + + return parse_er(c) if is_er?(c) # ER? + + return c.to_i if /^\d+$/ =~ c # only numbers? + + raise "unknown format" + end + + if c.kind_of?(Numeric) + c = 0x80000000 + c if c < 0 # negative value + return c.to_i + end + + raise "unknown object" + end def parse_er(s) # parse a Entity Reference and return a number (MCS) raise "wrong ER." unless ALL_RE =~ s # don't use is_er? for getting $1. @@ -101,10 +107,10 @@ module CHISE code = codestr.hex end - u8 = CodesysDB.instance.get(codesys, code) + u8 = get_ccs(codesys, code) next if u8.nil? - num = parse("?"+u8) + num = parse(u8) next if num.nil? return num @@ -113,5 +119,47 @@ module CHISE raise "unknown Entity Reference" end + private + def get_ccs(ccs, code_point) + cd = ChiseDB.instance + cd.decode_char(ccs, code_point) + end + end + + class EntityReferenceParser + include EntityReference + + def de_er(s) # replace EntityReference with corresponding character. + return s unless PART_RE =~ s # don't use contain_er? to get $1 + + er = "&"+$1+";" + char = Character.get(er) + ss = s.sub(Regexp.new(Regexp.escape(er)), char.utf8_mcs) + + return de_er(ss) if contain_er?(ss) # recursive + ss + end + end + + class EntityReferenceEncoder + include EntityReference + + def to_er(char) + cid = char.char_id + return "&#x%04x;" % cid if cid <= 0xffff + return "&#x%05x;" % cid if cid <= 0xfffff + + CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype| + code = char[codesys] + next if code.nil? + return "&#{er_prefix}%0#{keta}#{numtype};" % code + } + + "&MCS-%08X;" % cid # the last answer + end + + def to_er_by_ccs(cid, codesys) # not yet + end + end end diff --git a/chise/rbchise.rb b/chise/rbchise.rb index 789dbe9..f12c238 100755 --- a/chise/rbchise.rb +++ b/chise/rbchise.rb @@ -2,79 +2,316 @@ # "rbchise.so" ext compatible library by eto 2003-0317 require "bdb" -require "chise/config" +require "pathname" +require "fileutils" +require "chise/util" module CHISE class DataSource NONE = 0 Berkeley_DB = 1 - def initialize(type = Berkeley_DB, location = nil) - @type, @location = type, location - @location = Config.instance.db_dir if @location.nil? - @fnames = {} - @cnames_names = {} - at_exit { - @fnames.each {|k, db| db.close } - @cnames.each {|k, db| db.close } - } + def initialize(type=Berkeley_DB, loc=nil, subtype=0, modemask=0755) + @type = type + loc = Config.instance.db_dir if loc.nil? + @location = loc.path + @subtype = subtype + @modemask = modemask + @fdb = {} + @cdb = {} end + attr_reader :type, :location, :subtype, :modemask - def get_feature(feature) - @fnames[feature] = open_feature(feature) if @fnames[feature].nil? - @fnames[feature] + def get_feature(f) + @fdb[f] = FeatureTable.new(self, f) if @fdb[f].nil? + @fdb[f] end - def decode_char(name, code_point) - ccs = get_ccs(name) - ccs.decode(code_point) + def get_ccs(ccs) + @cdb[ccs] = CCSTable.new(self, ccs) if @cdb[ccs].nil? + @cdb[ccs] end - def get_ccs(name) - db = open(name, "system-char-id") - CCSTable.new(name, db) + def each_feature + dir = @location + "character/feature" + dir.each_entry {|f| + next if f.to_s == "." || f.to_s == ".." + f = f.unescape_win_filename + f = f.unescape + yield(f.to_s) + } end - def open_feature_table(feature) - db = open("system-char-id", feature) - FeatureTable.new(feature, db) + def load_feature(name, cid) + ft = get_feature(name) + return nil if ft.nil? + ft.get_value(cid) end - def open(from, to) # real_subtpe, accessmask, modemask - name = from+"/"+to - return @dbs[name] if @dbs[name] - file = @location+"/"+name - @dbs[name] = BDB::Hash.open(file, nil, 0) + def decode_char(ccs, code_point) + ct = get_ccs(ccs) + return nil if ct.nil? + ct.decode(code_point) end end - class AttributeTable # abstract class + module ChiseValue; end + + class AttributeTable + def initialize(dir, cat, keytype, name, amask, mmask) + dbdir = dir + cat + keytype + #FileUtils.mkdir_p(dbdir.to_s) unless dbdir.directory? + name = name.path + name = name.escape + name = name.escape_win_filename + path = dbdir + name +# qp path, amask, mmask + raise unless path.exist? +# @db = BDB::Hash.open(path.to_s, amask, mmask) + @db = BDB::Hash.open(path.to_s) + at_exit { + close + } + end + + def close + return if @db.nil? + begin + @db.sync + @db.close + rescue + end + end + + def get(k) + @db.get(k) + end + + def put(k, v) + @db.put(k, v) + end + + def each + @db.each {|k, v| yield(k, v) } + end end - - class CCSTable < AttributeTable - def initialize(ccs, db) - @ccs, @db = ccs, db + + module TableAccessModule + def initialize(ds, name) + @ds, @name = ds, name + @db = nil + @access = 0 end - def get_char(code_point) - @db.get(code_point) + def sync + @db.close if @db + @db = nil + @access = 0 end + alias close sync - def put_char(code_point, cid) - @db.put(code_point, cid) + def setup_db_exec(writable, cat, key) + if writable + sync if @access & BDB::CREATE == 0 + @access = BDB::CREATE + else + @access = BDB::RDONLY + end + + return if @db + + begin + @db = AttributeTable.new(@ds.location, cat, key, + @name, @access, @ds.modemask) + rescue + @db = nil + end + #raise if @db.nil? end end - class FeatureTable < AttributeTable - def initialize(feature, db) - @feature, @db = feature, db + class FeatureTable + include ChiseValue + include TableAccessModule + + def set_value(cid, value) + setup_db(true) + return nil if @db.nil? + key = format_char_id(cid) + @db.put(key, value) end - def get_value(char_id) - @db.get(char_id) + def get_value(cid) + setup_db + return nil if @db.nil? + key = format_char_id(cid) + @db.get(key) end def each + setup_db + return nil if @db.nil? + @db.each {|k, v| + cid = parse_c_string(k) + yield(cid, v) + } + end + + private + def setup_db(writable=nil) + setup_db_exec(writable, "character", "feature") + end + end + + class CCSTable + include ChiseValue + include TableAccessModule + + def decode(code_point) + setup_db + k = code_point.to_s + v = @db.get(k) + return nil if v.nil? + cid = parse_c_string(v) + cid + end + + def set_decoded_char(code_point, cid) + setup_db(true) + k = code_point.to_s + v = format_char_id(cid) + @db.put(k, v) + end + + private + def setup_db(writable=nil) + setup_db_exec(writable, "character", "by_feature") + end + end + + module ChiseValue + def parse_c_string(str) + i = 0 + c = str[i] + i += 1 + len = str.length + + raise unless 2 <= len && c == ?\? + + c = str[i] + i += 1 + + if (c == ?\\) + raise if (len < 3) + c = str[i] + i += 1 + if (c == ?^) + raise if (len < 4) + c = str[i] + i += 1 + if c == ?\? + return 0x7F + else + return c & (0x80 | 0x1F) + end + end + # raise # ? + end + + if ( c < 0xC0 ) + cid = c + counter = 0 + elsif ( c < 0xE0 ) + cid = c & 0x1f + counter = 1 + elsif ( c < 0xF0 ) + cid = c & 0x0f + counter = 2 + elsif ( c < 0xF8 ) + cid = c & 0x07 + counter = 3 + elsif ( c < 0xFC ) + cid = c & 0x03 + counter = 4 + else + cid = c & 0x01 + counter = 5 + end + + if (counter + 2 <= len) + (0...counter).each {|j| + cid = (cid << 6) | (str[j + i] & 0x3F) + } + return cid + end + + raise + end + + def format_char_id(cid) + case cid + when ?\t then return "?\t" + when ?\n then return "?\n" + when ?\r then return "?\r" + when 0x1C then return "?\^\\" + end + + if cid <= 0x1F + return "?\\^"+(?@+cid).chr + elsif (cid == ?\s) || (cid == ?\") || + (cid == ?\#) || (cid == ?\') || + (cid == ?\() || (cid == ?\)) || + (cid == ?\,) || (cid == ?\.) || + (cid == ?\;) || (cid == ?\?) || + (cid == ?\[) || (cid == ?\\) || + (cid == ?\]) || (cid == ?\`) + return "?\\"+cid.chr + elsif (cid <= 0x7E) + return("?"+cid.chr) + elsif (cid == 0x7F) + return "?\\^?"+0.chr + elsif (cid <= 0x9F) + dest = "?\\^" + dest += (((cid + ?@) >> 6) | 0xC0).chr + dest += (((cid + ?@) & 0x3F) | 0x80).chr + return dest + elsif (cid <= 0x7FF) + dest = "? " + dest[1] = (cid >> 6) | 0xC0 + dest[2] = (cid & 0x3F) | 0x80 + return dest + elsif (cid <= 0xFFFF) + dest = "? " + dest[1] = (cid >> 12) | 0xE0 + dest[2] = ((cid >> 6) & 0x3F) | 0x80 + dest[3] = (cid & 0x3F) | 0x80 + return dest + elsif (cid <= 0x1FFFFF) + dest = "? " + dest[1] = (cid >> 18) | 0xF0 + dest[2] = ((cid >> 12) & 0x3F) | 0x80 + dest[3] = ((cid >> 6) & 0x3F) | 0x80 + dest[4] = (cid & 0x3F) | 0x80 + return dest + elsif (cid <= 0x3FFFFFF) + dest = "? " + dest[1] = (cid >> 24) | 0xF8 + dest[2] = ((cid >> 18) & 0x3F) | 0x80 + dest[3] = ((cid >> 12) & 0x3F) | 0x80 + dest[4] = ((cid >> 6) & 0x3F) | 0x80 + dest[5] = (cid & 0x3F) | 0x80 + return dest + else + dest = "? " + dest[1] = (cid >> 30) | 0xFC + dest[2] = ((cid >> 24) & 0x3F) | 0x80 + dest[3] = ((cid >> 18) & 0x3F) | 0x80 + dest[4] = ((cid >> 12) & 0x3F) | 0x80 + dest[5] = ((cid >> 6) & 0x3F) | 0x80 + dest[6] = (cid & 0x3F) | 0x80 + return dest + end + raise end end end diff --git a/chise/string.rb b/chise/string.rb index aba2ce6..ee96134 100755 --- a/chise/string.rb +++ b/chise/string.rb @@ -1,146 +1,44 @@ # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. -class String - def to_a() - ar = self.split(//u) # split self to chars as UTF-8 - ar - end - - def each_char() to_a.each {|ch| yield ch } end - def each_character() to_a.each {|ch| yield ch.char } end - def char_length() to_a.length end - def char_at(n) to_a()[n] end - def first_char() to_a[0] end - def char() CHISE::Character.get(to_a[0]) end - def to_utf8() - return to_a.map {|ch| - ch.char.to_utf8 - }.join("") - end - - def map_char(block = Proc.new) - return unless block_given? - return self.to_a.map {|ch| (block.call(ch)).to_s }.join("") - end +require "chise/character" +require "chise/parser" - def map_char!(block = Proc.new) - return unless block_given? - self.replace(self.map_char {|ch| block.call(ch)}) - end +class String + # copied from htree/encoder.rb + UTF8_RE = /\A(?: + [\x00-\x7f] + |[\xc0-\xdf][\x80-\xbf] + |[\xe0-\xef][\x80-\xbf][\x80-\xbf] + |[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf] + |[\xf8-\xfb][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf] + |[\xfc-\xfd][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf])\Z/nx - def map_character(block = Proc.new) - return unless block_given? - return self.to_a.map {|ch| (block.call(ch.char)).to_s }.join("") + def is_a_utf8? # Is this string one character in UTF-8? + (UTF8_RE =~ self) != nil end - def map_character!(block = Proc.new) - return unless block_given? - self.replace(self.map_char {|ch| block.call(ch.char)}) + def char + raise unless is_a_utf8? + CHISE::Character.get("?"+self) end def method_missing(mid, *args) - if char_length == 1 #È—ªŒ`‚ª—LŒø‚Ȃ̂́Aˆê•¶Žš‚ÌŽž‚¾‚¯ - char.method_missing(mid, *args) - else - raise NameError, "undefined method `#{mid.id2name}'", caller(1) - end - end - - def map_utf8() map_char {|ch| ch.char.map_utf8 } end - alias map_ucs map_utf8 - - def map_ucs_er() map_char {|ch| ch.char.map_ucs_er } end - def to_er() map_char {|ch| ch.char.to_er } end - - #putŠÖŒWA[]ŠÖŒW‚Í—pˆÓ‚µ‚È‚¢‚±‚Æ‚É‚µ‚½B - def de_er!() #EntityReference‚ðŽæ‚菜‚­ - return self unless self =~ Regexp.new(EntityReference::REGEXP_PART) #‚»‚ê‚炵‚¢‚Ì‚ª–³‚¯‚ê‚Ή½‚à‚µ‚È‚¢ - er = "&"+$1+";" - self.sub!(Regexp.new(Regexp.escape(er)), Character.new(er).mcs_utf8) #•ÏŠ·Ž©‘Ì‚ÍCharacter‚É‚Ü‚©‚¹‚é - return self.de_er! if self =~ Regexp.new(EntityReference::REGEXP_PART) #‚Ü‚¾‚ ‚Á‚½‚çÄ‹A - return self + char.method_missing(mid, *args) end - def de_er() return self.dup.de_er!; end - - def inspect_all() map_char {|ch| ch.char.inspect_all } end - def inspect_x() map_char {|ch| ch.char.inspect_x } end - -# def to_euc() map_char {|ch| ch.char.to_euc } end - def map_euc() map_char {|ch| ch.char.map_euc } end -# def to_sjis() map_char {|ch| ch.char.to_sjis } end - def map_sjis() map_char {|ch| ch.char.map_sjis } end - - def glyph_decompose() map_char {|ch| ch.char.glyph_decompose } end - def decompose() map_char {|ch| ch.char.decompose } end - def decompose!() self.replace(self.decompose); self; end - - def nu_decompose_all(level=nil) - level = 0 if level.nil? - if 10 < level - p ["too many recursive", self] - exit - end - de = self.decompose - return de.decompose_all(level+1) if de != self #‚È‚É‚©•Ï‰»‚ª‚ ‚Á‚½‚©‚çÄ‹A - de #‚à‚¤‚±‚êˆÈã•Ï‰»‚Í–³‚³‚»‚¤‚¾‚¼‚ƁB - end - - def decompose_all() map_char {|ch| ch.char.decompose_all } end - def decompose_all!() self.replace(self.decompose_all); self; end - - def find() #"“ú‰_"¨"“Ü"‚Æ‚©‚¢‚¤Š´‚¶‚Ì‘€ì - ar = [] - length = char_length() - each_char {|ch| - char = ch.char - ar << char.ids_contained #‚»‚Ì•¶Žš‚ðŠÜ‚ñ‚Å‚¢‚銿Žš‚̃ŠƒXƒg - } - h = Hash.new(0) - ar.each {|list| - next if list.nil? - list.each_char {|ch| - h[ch] += 1 - } - } - str = "" - h.each {|k, v| - # p [k, v] - if length == v #‘S•”‚ÉŠç‚ðo‚µ‚Ä‚¢‚½‚ç - str += k - end + def each_char + to_a.each {|c| + yield(c) } - # p str - str end - def compose() - db = CHISE::CodesysDB.instance - composed = db.get("ids", self) - return "" if composed.nil? #‚È‚©‚Á‚½‚æ‚ƁB - return "" if composed.char_length == 0 #‚È‚É‚²‚Æ? - return composed if composed.char_length == 1 - composed.each_char {|ch| - char = ch.char - return ch if char.has_attribute? #‚Æ‚è‚ ‚¦‚¸Å‰‚ɂ݂‚©‚Á‚½‚à‚Ì‚ð•Ô‚·‚Æ‚¢‚¤ƒkƒ‹‚¢Žd—l - } - return "" #attribute‚ðŽ‚Â‚à‚Ì‚ªˆê‚Â‚à–³‚©‚Á‚½‚çA""‚É‚·‚é + def to_a + self.split(//u) end - def aggregate() - #self‚Å‚ ‚镶Žš—ñ‚ðIDS‚¾‚Ɖ¼’肵A‚»‚ê‚ðŠ®‘S‚Écompose‚µ‚«‚ç‚È‚¢‚ŁA - #‚»‚Ì•”•ªW‡‚¾‚¯‚ð‚Ƃ肾‚µ‚āAcompose‰Â”\‚Å‚ ‚ê‚΂ł«‚邾‚¯compose‚·‚éB - tree = CHISE::IDS_Tree.new(self) - return self if tree.depth <= 1 #sub_nodes‚ª–³‚¢ê‡‚Í‚±‚±‚Å‚³‚æ‚È‚ç - tree.sub_nodes.each {|node| - c = node.compose - next if c.nil? || c == "" - # print "#{self} #{node} #{c}\n" - # p [self, node, c] - n = self.gsub(node, c) - return n.aggregate - } - return self #‚¨‚«‚©‚¦‚ç‚ê‚é‚à‚Ì‚ª‚Ü‚Á‚½‚­‚È‚©‚Á‚½‚çAŽ©•ª‚ð‚©‚¦‚·B + def de_er() + pa = CHISE::EntityReferenceParser.new + pa.de_er(self) end end diff --git a/chise/util.rb b/chise/util.rb index b8e2918..8a34ea7 100644 --- a/chise/util.rb +++ b/chise/util.rb @@ -1,20 +1,53 @@ # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. -module CHISE - def unix_to_win(unix_path) - win = unix_path.gsub(//, ")") win = win.gsub(/\*/, "+") win = win.gsub(/\?/, "!") - win + Pathname.new(win) end - def win_to_unix(win_path) - unix = win_path.gsub(/\)/, ">") + def win_to_unix + unix = @path.gsub(/\)/, ">") unix = unix.gsub(/\(/, "<") unix = unix.gsub(/\!/, "?") unix = unix.gsub(/\+/, "*") - unix + Pathname.new(unix) + end + + def escape_win_filename + return self.unix_to_win if CHISE.windows? + self + end + + def unescape_win_filename + return self.win_to_unix if CHISE.windows? + self end - module_function :unix_to_win, :win_to_unix end diff --git a/test/org-test-char.rb b/test/org-test-char.rb new file mode 100755 index 0000000..48df31b --- /dev/null +++ b/test/org-test-char.rb @@ -0,0 +1,52 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. + +require "common" + +class TestCharacter < Test::Unit::TestCase + + def test_method + @char = CHISE::Character.get("字") #UTF8で与えること + assert_instance_of(Hash, @char.char_attribute_alist) + assert_instance_of(Hash, @char.alist) + assert_instance_of(Array, @char.char_attribute_list) + assert_instance_of(Array, @char.list) + assert_instance_of(String, @char.inspect) + end + + def test_jis + char = CHISE::Character.get("逢") + assert_instance_of(String, char.get_attributes) + char = CHISE::Character.get("å­¦") + assert_instance_of(String, char.get_attributes) + end + + def p_er(er) + p er.de_er.char.inspect_all + end + + def nu_test_has_attribute + assert("&J90-4833;".de_er.char.has_attribute?) #罪 + assert(! "&MCS-00E06E9B;;".de_er.char.has_attribute?) #罪のisolated character, attributeを持ってない + assert("&C1-602E;".de_er.char.has_attribute?) #渡 + assert("&J90-454F;".de_er.char.has_attribute?) #渡 + p_er("&C1-602E;") #渡 + p_er("&J90-454F;") + p_er("&J83-4D63;") #翼 + p_er("&J90-4D63;") + p_er("&J83-3958;") #è³¼ + p_er("&J90-3958;") + end +end + +#===== PRINT_ALL [字] MCS-00005B57 &J90-3B7A; ===== +#chinese-gb2312: 0x5756 +#chinese-isoir165: 0x5756 +#korean-ksc5601: 0x6D2E +#ucs: 0x5B57 +#chinese-cns11643-1: 0x4773 +#chinese-big5: 0xA672 + +# test_print(CHISE::Character.get("&CDP-8B42;")) +# test_print(CHISE::Character.get("&I-CDP-8AF6;")) +#===== PRINT_ALL [舛] MCS-00ECA524 &K0-743F; ===== diff --git a/test/org-test-db.rb b/test/org-test-db.rb new file mode 100755 index 0000000..b415b5e --- /dev/null +++ b/test/org-test-db.rb @@ -0,0 +1,89 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. +# This file is in SJIS charset. Japanese Character -> Š¿Žš. + +require "common" + +class TestDataBase < Test::Unit::TestCase + def setup + @cdb = CHISE::CharDB.instance + @sdb = CHISE::CodesysDB.instance + end + + def test_each_db(db) + assert_instance_of(Array, db.keys) + end + + def test_make_db(db) + h = {"a" => 1, "b" => 2, "c" => 3} + db.remove_db("test-db") #‚Ü‚¸Å‰‚ɏÁ‚µ‚Ä‚¨‚­ + assert_not_nil(db.make_db("test-db", h)) + assert_not_nil(db.open_db("test-db")) + assert_equal(1, db.get("test-db", "a")) + assert_equal(2, db.get("test-db", "b")) + assert_equal(3, db.get("test-db", "c")) + db.remove_db("test-db") #ÅŒã‚É‚Ü‚½Á‚µ‚Ä‚¨‚­ + end + + def test_dbs + test_each_db(@cdb) + test_each_db(@sdb) +# test_make_db(@cdb) +# test_make_db(@sdb) + end +end + +class TestCodesys < Test::Unit::TestCase + def setup + @db = CHISE::CodesysDB.instance + end + + def test_db_length + assert_equal(6287, @db.get("=jis-x0208").keys.length) + assert_equal(590, @db.get("japanese-jisx0208").keys.length) + assert_equal(499, @db.get("japanese-jisx0208-1978").keys.length) + assert_equal(593, @db.get("japanese-jisx0208-1990").keys.length) + assert_equal(6067, @db.get("japanese-jisx0212").keys.length) + assert_equal(1697, @db.get("japanese-jisx0213-1").keys.length) + assert_equal(2345, @db.get("japanese-jisx0213-2").keys.length) + assert_equal(4270, @db.get("ucs-jis").keys.length) + end + + def test_db + keys = @db.keys + assert_instance_of(Array, @db.keys, "db.keys") + db = @db.get("ascii") + assert_equal(128, db.keys.length, "can get keys") + assert_equal(63, @db.get("katakana-jisx0201").keys.length) + assert_equal(94, @db.get("latin-jisx0201").keys.length) + + counter = 0 + @db.each("=jis-x0208"){|k, v| #ˆø”‚ÌCodesysƒf[ƒ^ƒx[ƒX‚Ì‚»‚ꂼ‚ê‚ɑ΂µ‚ÄŽÀs‚·‚é + er0 = sprintf("&J90-%04X;", k) + er1 = CHISE::Character.new(v).to_er + counter += 1; break if 10 < counter + } + end + + def test_jis_codesys + db = CHISE::CodesysDB.instance + codesys = db.get_codesys("=jis-x0208") + ks = codesys.keys.sort #‚Æ‚·‚邱‚Æ‚É‚æ‚Á‚āAJISX0208 1990‚̏W‡‘S•”‚Ìkeys‚ª“¾‚ç‚ê‚é +# assert_equal(6880, ks.length) + assert_equal(8481, ks.first) + assert_equal(29566, ks.last) + char = codesys.get(15226) #"Žš" + assert_equal("Žš".su, char.to_s) + + assert_equal("ˆŸ".su, codesys.get(12321)) + jis = "ˆŸ".su.char.japanese_jisx0208_1990 +# assert_equal("ˆŸ", codesys.get(jis)) +# assert_equal("ˆŸ", sprintf("&J90-%04X;", jis).de_er) + +# codesys = db.get_codesys("japanese-jisx0208-1990") #‹Œ–¼ + codesys = db.get_codesys("=jis-x0208-1990") + assert_equal(8481, ks.first) + assert_equal(29566, ks.last) + end + +end diff --git a/test/org-test-ids.rb b/test/org-test-ids.rb new file mode 100755 index 0000000..74653ce --- /dev/null +++ b/test/org-test-ids.rb @@ -0,0 +1,295 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. +# by eto 2003-0112 + +require "common" + +class TestIDS < Test::Unit::TestCase + def test_ids + char = "榊".char + assert_equal("⿰木神", char.ids) + assert_equal("⿰木神", char.decompose) + str = "榊" + assert_equal("⿰木神", str.char.ids) + assert_equal("⿰木神", str.decompose) + assert_equal("⿰木⿰⺭申", str.decompose.decompose) + assert_equal("⿰木神", str.decompose!) + assert_equal("⿰木⿰⺭申", str.decompose!) + str = "榊" + assert_equal("⿰木⿰⺭申", str.decompose_all) + assert_equal("⿰木⿰⺭申", str.decompose_all!) + assert_equal("⿰木⿰⺭申", str) + #今はまだcomposeはできない。 + + de = "ç´°".decompose + assert_match(/田$/, de) + assert_equal(3, de.char_length) + de = "&JX2-7577;".de_er.decompose + de = "&CDP-8B60;".de_er.decompose + assert_equal(1, de.char_length) + de = "&JX2-217E;".de_er.decompose + assert_match(/^â¿°/, de) + assert_equal(3, de.char_length) + assert_equal(6, de.decompose!.char_length) +# assert_equal(6, de.decompose!.char_length) + + assert("⿸".char.is_ids?) + assert(! "木".char.is_ids?) + assert_equal(2, "â¿°".char.ids_operator_argc) + assert_equal(2, "&U+2FF0;".de_er.char.ids_operator_argc) + assert_equal(2, "&U+2FF1;".de_er.char.ids_operator_argc) + assert_equal(3, "&U+2FF2;".de_er.char.ids_operator_argc) + assert_equal(3, "&U+2FF3;".de_er.char.ids_operator_argc) + + assert_equal("â¿°", "&U+2FF0;".de_er.to_s) + assert("&U+2FF0;".de_er.char.is_ids?) + assert("&U+2FFF;".de_er.char.is_ids?) + #assert_match(/U\+2FF0/, "&U+2FF0;".de_er.char.inspect_x) + assert_match(/IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT/, "&U+2FF0;".de_er.char.inspect_all) + (0x2FF0..0x2FFB).each {|i| + assert_match(/IDEOGRAPHIC DESCRIPTION CHARACTER/, CHISE::Character.new(i).name) + } + + assert_match(/LEFT TO RIGHT/, "&U+2FF0;".de_er.char.name) #∫ + assert_match(/ABOVE TO BELOW/, "&U+2FF1;".de_er.char.name) #∨ + assert_match(/LEFT TO MIDDLE AND RIGHT/, "&U+2FF2;".de_er.char.name) #∬ + assert_match(/ABOVE TO MIDDLE AND BELOW/, "&U+2FF3;".de_er.char.name) #∀ + assert_match(/FULL SURROUND/, "&U+2FF4;".de_er.char.name) #∃ + assert_match(/SURROUND FROM ABOVE/, "&U+2FF5;".de_er.char.name) #∩ + assert_match(/SURROUND FROM BELOW/, "&U+2FF6;".de_er.char.name) #∪ + assert_match(/SURROUND FROM LEFT/, "&U+2FF7;".de_er.char.name) #⊂ + assert_match(/SURROUND FROM UPPER LEFT/, "&U+2FF8;".de_er.char.name) #√ + assert_match(/SURROUND FROM UPPER RIGHT/, "&U+2FF9;".de_er.char.name) #∂ + assert_match(/SURROUND FROM LOWER LEFT/, "&U+2FFA;".de_er.char.name) #∠ + assert_match(/OVERLAID/, "&U+2FFB;".de_er.char.name) #∵ + end + + def test_tree + assert_equal("[]", CHISE::Tree.new().inspect) + assert_equal("[1]", CHISE::Tree.new().add_leaf(1).inspect) + assert_equal("[1, 2]", CHISE::Tree.new().add_leaf(1).add_leaf(2).inspect) + assert_equal("[[]]", CHISE::Tree.new().add_node.inspect) + assert_equal("[[1]]", CHISE::Tree.new().add_node.add_leaf(1).inspect) + assert_equal("[[1, 2]]", CHISE::Tree.new().add_node.add_leaf(1).add_leaf(2).inspect) + assert_equal("[[1]]", CHISE::Tree.new().add_node.add_leaf(1).end_node.inspect) + assert_equal("[[1], [1]]", CHISE::Tree.new().add_node.add_leaf(1).end_node.add_node.add_leaf(1).end_node.inspect) + + tree = CHISE::Tree.new + assert_equal("[]", tree.inspect) + assert_equal("[1]", tree.add_leaf(1).inspect) + assert_equal(0, tree.depth) + assert_equal("[1, 2]", tree.add_leaf(2).inspect) + assert_equal("[1, 2, []]", tree.add_node.inspect) + assert_equal("[1, 2, [3]]", tree.add_leaf(3).inspect) + assert_equal(1, tree.depth) + assert_equal("[1, 2, [3, 4]]", tree.add_leaf(4).inspect) + assert_equal("[1, 2, [3, 4]]", tree.end_node.inspect) + assert_equal("[1, 2, [3, 4], [5]]", tree.add_node.add_leaf(5).inspect) + assert_equal("[1, 2, [3, 4], [5, [6]]]", tree.add_node.add_leaf(6).inspect) + assert_equal(2, tree.depth) + + tree = CHISE::Tree.new + assert_equal("[[\"+\"]]", tree.add_node("+", 2).inspect) + assert_equal("[[\"+\", 1]]", tree.add_leaf(1).inspect) + assert_equal("unmatch leaves", tree.check_integrity) + assert_equal("[[\"+\", 1, 2]]", tree.add_leaf(2).inspect) + assert_nil(tree.check_integrity) + assert_equal("[[\"+\", 1, 2], 3]", tree.add_leaf(3).inspect) + assert_equal("extra nodes", tree.check_integrity) + + tree = CHISE::Tree.new + assert_equal("[[\"+\"]]", tree.add_node("+", 2).inspect) + assert_equal("unmatch leaves", tree.check_integrity) + assert_equal("[[\"+\", 1]]", tree.add_leaf(1).inspect) + assert_equal("unmatch leaves", tree.check_integrity) + assert_equal("[[\"+\", 1, [\"+\"]]]", tree.add_node("+", 2).inspect) + assert_equal("unmatch leaves", tree.check_integrity) + assert_equal("[[\"+\", 1, [\"+\", 2]]]", tree.add_leaf(2).inspect) + assert_equal("unmatch leaves", tree.check_integrity) + assert_equal("[[\"+\", 1, [\"+\", 2, 3]]]", tree.add_leaf(3).inspect) + assert_nil(tree.check_integrity) + + tree = CHISE::Tree.new + assert_equal("[1]", tree.add_leaf(1).inspect) + assert_nil(tree.check_integrity) + assert_equal("[1, 2]", tree.add_leaf(2).inspect) + assert_equal("extra leaves", tree.check_integrity) + end + + def test_ids_tree +# assert_equal("[[<+,U+002B>, , ]]", CHISE::IDS_Tree.new("+AB").inspect) +# assert_equal("[[<+,U+002B>, , ], ]", CHISE::IDS_Tree.new("+ABC").inspect) +# assert_equal("[[<+,U+002B>, , [<+,U+002B>, , ]]]", CHISE::IDS_Tree.new("+A+BC").inspect) +# assert_equal("[[<+,U+002B>, , [<+,U+002B>, , ]], ]", CHISE::IDS_Tree.new("+A+BCD").inspect) + + #assert_equal("[<榊,U+698A>]", CHISE::IDS_Tree.new("榊").inspect) +# assert_equal("[[<â¿°,U+2FF0>, <木,J90-4C5A>, <神,J90-3F40>]]", CHISE::IDS_Tree.new("⿰木神").inspect) + assert_equal(1, CHISE::IDS_Tree.new("⿰木神").depth) +# assert_equal("[[<â¿°,U+2FF0>, <木,J90-4C5A>, [<â¿°,U+2FF0>, <⺭,CDP-8B70>, <申,J90-3F3D>]]]", CHISE::IDS_Tree.new("⿰木⿰⺭申").inspect) + assert_equal(2, CHISE::IDS_Tree.new("⿰木⿰⺭申").depth) + assert_equal("unmatch leaves", CHISE::IDS_Tree.new("⿰木").check_integrity) + assert_nil(CHISE::IDS_Tree.new("⿰木神").check_integrity) + assert_equal("unmatch leaves", CHISE::IDS_Tree.new("⿰木⿰申").check_integrity) + assert_nil(CHISE::IDS_Tree.new("⿰木⿰⺭申").check_integrity) + assert_equal("extra nodes", CHISE::IDS_Tree.new("⿰木⿰⺭申申").check_integrity) + assert_nil(CHISE::IDS_Tree.new("榊").check_integrity) + assert_equal("extra leaves", CHISE::IDS_Tree.new("榊榊").check_integrity) + + assert_equal(3, "⿳".char.ids_operator_argc) + assert_equal("⿳士冖匕", "壱".char.ids) + assert_equal(3, "壱".char.ids.char.ids_operator_argc) + assert_nil(CHISE::IDS_Tree.new("⿳士冖匕").check_integrity) + assert_equal("unmatch leaves", CHISE::IDS_Tree.new("⿳士冖").check_integrity) + assert_equal("extra nodes", CHISE::IDS_Tree.new("⿳士冖匕匕").check_integrity) + + assert_equal("contains ques", CHISE::IDS_Tree.new("⿳士冖?").check_integrity) + end + + def test_tree_depth + assert_equal(1, CHISE::IDS_Tree.new("林".decompose).depth) +# assert_equal("["⿰木木"]", CHISE::IDS_Tree.new("林".decompose).nodes.inspect) +# assert_equal("[]", CHISE::IDS_Tree.new("林".decompose).sub_nodes.inspect) + assert_equal(2, CHISE::IDS_Tree.new("榊".decompose_all).depth) +# assert_equal("["⿰木⿰⺭申", "⿰⺭申"]", CHISE::IDS_Tree.new("榊".decompose_all).nodes.inspect) +# assert_equal("["⿰⺭申"]", CHISE::IDS_Tree.new("榊".decompose_all).sub_nodes.inspect) + +# assert_equal(3, CHISE::IDS_Tree.new("焔".decompose_all).depth) +# assert_equal(3, CHISE::IDS_Tree.new("焔".decompose_all).nodes.length) +# assert_equal(2, CHISE::IDS_Tree.new("焔".decompose_all).sub_nodes.length) + + assert_equal(2, CHISE::IDS_Tree.new("屡".decompose_all).depth) + assert_equal("⿸尸娄", "⿸尸⿱米女".aggregate) + assert_equal(3, CHISE::IDS_Tree.new("醤".decompose_all).depth) + end + + def test_compose_exact #正確に一致するIDSを検知する + assert_equal("榊", "榊".decompose.compose) + assert_equal("壱", "壱".decompose.compose) + assert_equal("⿰木木", "林".decompose) + assert_equal("林", "⿰木木".compose) + assert_equal("林", "林".decompose.compose) + assert_equal("⿰木木", "⿰木木".compose.decompose) + assert_equal("林".ucs, "⿰木木".compose.ucs) + end + + def test_idc_shortcut + assert_equal(IDC_LR, "林".decompose.first_char) + assert_equal(IDC_LR+"木木", "林".decompose) + + assert_equal(IDC_AB, "森".decompose.first_char) + assert_equal(IDC_AB+"木林", "森".decompose) + assert_equal(IDC_AB+"火火", "炎".decompose) + + assert_equal(IDC_LMR, "班".decompose.first_char) + assert_equal(IDC_LMR+"å½³"+IDC_AB+"山王"+"攵", "å¾´".decompose) #meaning? + + assert_equal(IDC_AMB, "é¼»".decompose.first_char) + assert_equal(IDC_AMB+"自田廾", "é¼»".decompose) + assert_equal(IDC_AMB+"士冖匕", "壱".decompose) + assert_equal(IDC_AMB+"穴厶心", "窓".decompose) + assert_equal(IDC_AMB+"丗冖巾", "帯".decompose) + + assert_equal(IDC_FS, "囲".decompose.first_char) + assert_equal(IDC_FS+"囗井", "囲".decompose) + assert_equal(IDC_FS+"行韋", "衛".decompose) + assert_equal(IDC_FS+"行圭", "街".decompose) + assert_equal(IDC_FS+"行重", "衝".decompose) + assert_equal(IDC_FS+IDC_AB+"一凵田", "画".decompose) + + assert_equal(IDC_FA, "問".decompose.first_char) + assert_equal(IDC_FA+"門口", "問".decompose) + assert_equal(IDC_FA+"門"+IDC_LR+"豆寸", "闘".decompose) + assert_equal(IDC_FA+"戌女", "威".decompose) + assert_equal(IDC_FA+"茂臣", "蔵".decompose) + assert_equal(IDC_FA+"尺旦", "昼".decompose) + assert_equal(IDC_FA+"冂入", "内".decompose) + assert_equal(IDC_FA+"几丶", "凡".decompose) + assert_equal(IDC_FA+"几"+IDC_AB+"丿虫", "風".decompose) + + assert_equal(IDC_FB, "凶".decompose.first_char) + assert_equal(IDC_AB+"æ­¢"+IDC_FB+"凵米", "æ­¯".decompose) + + assert_equal(IDC_FL, "匠".decompose.first_char) + assert_equal(IDC_FL+"匚斤", "匠".decompose) + assert_equal(IDC_FL+"匚矢", "医".decompose) + assert_equal(IDC_FL+"匚若", "匿".decompose) + assert_equal(IDC_FL+"匚儿", "匹".decompose) + + assert_equal(IDC_FUL, "庁".decompose.first_char) + assert_equal(IDC_FUL+"广丁", "庁".decompose) + assert_equal(IDC_FUL+"歹匕", "æ­»".decompose) + assert_equal(IDC_FUL+"尹口", "君".decompose) + assert_equal(IDC_FUL+"麻鬼", "魔".decompose) + assert_equal(IDC_FUL+"府肉", "腐".decompose) + assert_equal(IDC_FUL+"麻手", "摩".decompose) + assert_equal(IDC_FUL+"虍思", "慮".decompose) + assert_equal(IDC_FUL+"食口", "倉".decompose) + assert_equal(IDC_AB+"日"+IDC_FUL+"耳又", "最".decompose) + assert_equal(IDC_FUL+"手目", "看".decompose) #meaning + assert_equal(IDC_FUL+"辰口", "唇".decompose) #? + + assert_equal(IDC_FUR, "句".decompose.first_char) + assert_equal(IDC_FUR+"勹口", "句".decompose) + assert_equal(IDC_FUR+"勹丶", "勺".decompose) + assert_equal(IDC_FUR+"勹日", "旬".decompose) + assert_equal(IDC_FUR+"戈廾", "戒".decompose) + assert_equal(IDC_FUR+"弋工", "式".decompose) + assert_equal(IDC_FUR+"刀丿", "刃".decompose) + assert_equal(IDC_FUR+"鳥山", "島".decompose) #meaning + + assert_equal(IDC_FLL, "通".decompose.first_char) + assert_equal(IDC_FLL+"廴聿", "建".decompose) + assert_equal(IDC_FLL+"走戉", "越".decompose) + assert_equal(IDC_FLL+"èµ°å·³", "èµ·".decompose) + assert_equal(IDC_FLL+"走取", "趣".decompose) + assert_equal(IDC_FLL+"走召", "超".decompose) + assert_equal(IDC_FLL+"是頁", "題".decompose) + assert_equal(IDC_FLL+"免力", "勉".decompose) + assert_equal(IDC_FLL+"鬼未", "魅".decompose) + assert_equal(IDC_FLL+"黒犬", "黙".decompose) + + assert_equal(IDC_O, "太".decompose.first_char) + assert_equal(IDC_O+"大丶", "太".decompose) + assert_equal(IDC_O+"衣中", "è¡·".decompose) + assert_equal(IDC_O+"衣里", "裏".decompose) + assert_equal(IDC_O+"勹巳", "包".decompose) + assert_equal(IDC_O+"勹乂", "匁".decompose) + assert_equal(IDC_O+"木日", "東".decompose) + assert_equal(IDC_O+"弍一", "弐".decompose) + assert_equal(IDC_O+"衣保", "褒".decompose) + end + + def test_glyph_decompose + assert_equal("音", "音".decompose) +# assert_equal(IDC_AB+"立日", "音".glyph_decompose) + assert_equal(IDC_FLL, "世".decompose.first_char) + assert_equal("世", "世".glyph_decompose) + end + + def test_find() +# p "日雲".find #"曇" + assert(4 <= "日雲".find .char_length) #"曇" + end + + def test_compose_part() +# p de.compose_ar +# p "神".compose_ar +# p "木".compose_ar.join + end + + def test_aggregate() +# db = IDS_DB.instance +# db.list_aggregate + end + + def test_ids_error() +# p "実".char.inspect_all +# p "実".char.ids +# assert_equal("contains ques", "実".char.ids_error) +# assert_equal("unmatch leaves", "実".char.ids_error) +# p CharDB.instance.get("ascii").keys +# p CharDB.instance.get("no-such-attribute").keys +# p CharDB.instance.get("ids-error").keys + end + +end diff --git a/test/org-test-kage.rb b/test/org-test-kage.rb new file mode 100755 index 0000000..d476a7b --- /dev/null +++ b/test/org-test-kage.rb @@ -0,0 +1,39 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. +# kage testcase by eto 2003-0318 + +require "common" +#require "chise/stroke" +#include StrokeFont + +#class TestKage < Test::Unit::TestCase +class TestKage + def setup + @kage = KageFont.new + end + + def test_stroke + end + + def test_kage + char = Character.get(0x4e03) #七 + font = @kage.get(0x4e03) + assert_instance_of(KageGlyph, font) + font.parse + + svg = <<"EOT" + M 50,540 950,255 + M 330,50 330,900 M 330,900 Q 330,950 380,950 M 380,950 840,950 M 840,950 Q 890,950 915,850 +EOT + strokes = KageParser.parse(svg) + #p strokes + end + + def test_path + pr = PathResolver.new + assert_equal([[0, 0, 1000, 1000]], pr.parse("M 0,0 1000,1000")) + assert_equal([[0, 0, 0, 1000], [0, 1000, 1000, 1000]], pr.parse("M 0,0 0,1000 1000,1000")) +# assert_equal([[0, 0, 0.0, 0.0], [0.0, 0.0, 62.5, 437.5], [62.5, 437.5, 250.0, 750.0], [250.0, 750.0, 562.5, 937.5]], pr.parse("M 0,0 Q 0,1000 1000,1000")) + end + +end diff --git a/test/org-test-kanjilist.rb b/test/org-test-kanjilist.rb new file mode 100755 index 0000000..48c1e6c --- /dev/null +++ b/test/org-test-kanjilist.rb @@ -0,0 +1,34 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. +# testcase for KanjiNetwork by eto 2003-0227 + +require "common" +require "chise/kanjilist" + +class TestKanjiList < Test::Unit::TestCase + def setup + @kl = CHISE::KanjiList.instance + end + + def test_kyoiku_kanji_list # 範囲指定が複雑なのでtestするべし。 + assert_equal("愛悪圧安暗案以位囲委意易異移胃衣遺医域育一印員因引飲院右宇羽雨運雲営映栄永泳英衛液益駅円園延沿演遠塩央往応横王黄億屋恩温音下化仮何価加可夏家科果歌河火花荷課貨過我画芽賀会解回快改械海灰界絵開階貝外害街各拡格確覚角閣革学楽額割活株寒刊巻完官干幹感慣漢看管簡観間関館丸岸眼岩顔願危喜器基寄希揮机旗期機帰気汽季紀規記貴起技疑義議客逆久休吸宮弓急救求泣球究級給旧牛去居挙許漁魚京供競共協境強教橋胸興郷鏡業局曲極玉勤均禁筋近金銀九句区苦具空君訓群軍郡係兄型形径敬景系経計警軽芸劇激欠決潔穴結血月件健券建憲検権犬研絹県見険験元原厳減源現言限個古呼固己庫戸故湖五午後語誤護交候光公功効厚口向后好孝工幸広康校構港皇紅耕考航行講鉱鋼降高号合刻告国穀黒骨今困根混左差査砂座再最妻才採済災祭細菜裁際在材罪財坂作昨策桜冊刷察札殺雑皿三参山散産算蚕賛酸残仕使司史四士始姉姿子市師志思指支枝止死氏私糸紙至視詞詩試誌資飼歯事似児字寺持時次治磁示耳自辞式識七失室質実舎写射捨社者謝車借尺若弱主取守手種酒首受授樹収周宗就州修拾秋終習衆週集住十従縦重宿祝縮熟出術述春準純順処初所暑署書諸助女序除傷勝商唱将小少承招昭松消焼照省章笑証象賞障 上乗城場常情条状蒸植織職色食信心新森深申真神臣親身進針人仁図垂推水数寸世制勢性成政整星晴正清生盛精聖声製西誠青静税席昔石積績責赤切接折設節説雪絶舌先千宣専川戦泉浅洗染線船選銭前善然全祖素組創倉奏層想操早巣争相窓総草装走送像増臓蔵造側則息束測足速属族続卒存孫尊損村他多太打体対帯待態貸退隊代台大第題宅達谷単担探炭短誕団断暖段男談値知地池置築竹茶着中仲宙忠昼柱注虫著貯丁兆帳庁張朝潮町腸調長頂鳥直賃追痛通低停定底庭弟提程敵的笛適鉄典天展店転点伝田電徒登都努度土党冬刀島投東湯灯当等答糖統討豆頭働動同堂導童道銅得徳特毒独読届内南難二肉日乳入任認熱年念燃納能脳農波派破馬俳拝敗背肺配倍梅買売博白麦箱畑八発判半反板版犯班飯晩番否悲批比皮秘肥費非飛備美鼻必筆百俵標氷票表評病秒品貧不付夫婦富布府父負武部風副復服福腹複仏物分奮粉文聞兵平並閉陛米別変片編辺返便勉弁保歩補墓暮母包報宝放方法訪豊亡忘暴望棒貿防北牧本妹枚毎幕末万満味未密脈民務夢無名命明盟迷鳴綿面模毛木目問門夜野矢役約薬訳油輸優勇友有由遊郵夕予余預幼容曜様洋用羊葉要陽養欲浴翌来落乱卵覧利理裏里陸律率立略流留旅両料良量領力緑林臨輪類令例冷礼歴列練連路労朗老六録論和話", @kl.kyoiku()) + assert_equal("一右雨円王音下火花学気休金九空月犬見五口校左三山四子糸字耳七車手十出女小上森人水正生青石赤先千川早足村大男中虫町天田土二日入年白八百文本名木目夕立力林六", @kl.kyoiku(1)) + assert_equal("引雲遠黄何夏家科歌画会回海絵貝外楽間顔帰汽記牛魚京強教玉近形計元原古戸午後語交光工広考行高合国黒今才作算市思止紙寺時自室社弱首秋春書少場色食心新親図数星晴声西切雪船前組草走多太体台谷知地池竹茶昼朝長鳥通弟店点電冬刀東当答頭同道読南馬買売麦半番父風分聞米歩母方北妹毎明鳴毛門夜野友曜用来理里話", @kl.kyoiku(2)) + assert_equal("一右雨円王音下火花学気休金九空月犬見五口校左三山四子糸字耳七車手十出女小上森人水正生青石赤先千川早足村大男中虫町天田土二日入年白八百文本名木目夕立力林六引雲遠黄何夏家科歌画会回海絵貝外楽間顔帰汽記牛魚京強教玉近形計元原古戸午後語交光工広考行高合国黒今才作算市思止紙寺時自室社弱首秋春書少場色食心新親図数星晴声西切雪船前組草走多太体台谷知地池竹茶昼朝長鳥通弟店点電冬刀東当答頭同道読南馬買売麦半番父風分聞米歩母方北妹毎明鳴毛門夜野友曜用来理里話", @kl.kyoiku(1..2)) + + assert_equal("右雨王音火貝九玉金月犬見口左山子糸耳車手十女人水夕石川早足大竹虫天田土日年白文木目立力六", @kl.kyoiku(1, CHISE::KanjiList::SHOUKEI)) + assert_equal("一二三四五下七小上生中入八本", @kl.kyoiku(1, CHISE::KanjiList::SHIJI)) + assert_equal("円休出森正赤千男町名林", @kl.kyoiku(1, CHISE::KanjiList::KAII)) + assert_equal("花学気空校字青先草村百", @kl.kyoiku(1, CHISE::KanjiList::KEISEI)) + + assert_equal("羽雲夏画回会外角弓牛魚京兄原戸古午工交行高黄才止矢自首心西長鳥弟刀東肉馬米歩母方北万毛門用来", @kl.kyoiku(2, CHISE::KanjiList::SHOUKEI)) + assert_equal("", @kl.kyoiku(2, CHISE::KanjiList::SHIJI)) + assert_equal("科楽岩顔汽教計公谷黒今思春少声雪走多太台直電内売半番父明鳴友里", @kl.kyoiku(2, CHISE::KanjiList::KAII)) + assert_equal("引園遠何家歌海絵活間丸記帰強近形元言後語広光考合国細作算市姉紙寺時室社弱配秋週書場色食新親図数星晴切船線前組体地池茶昼朝通店点冬当答頭同道読南買麦風分聞毎妹夜野曜理話", @kl.kyoiku(2, CHISE::KanjiList::KEISEI)) + + assert_equal("右雨王音火貝九玉金月犬見口左山子糸耳車手十女人水夕石川早足大竹虫天田土日年白文木目立力六羽雲夏画回会外角弓牛魚京兄原戸古午工交行高黄才止矢自首心西長鳥弟刀東肉馬米歩母方北万毛門用来", @kl.kyoiku(1..2, CHISE::KanjiList::SHOUKEI)) + assert_equal("一二三四五下七小上生中入八本", @kl.kyoiku(1..2, CHISE::KanjiList::SHIJI)) + assert_equal("円休出森正赤千男町名林科楽岩顔汽教計公谷黒今思春少声雪走多太台直電内売半番父明鳴友里", @kl.kyoiku(1..2, CHISE::KanjiList::KAII)) + assert_equal("花学気空校字青先草村百引園遠何家歌海絵活間丸記帰強近形元言後語広光考合国細作算市姉紙寺時室社弱配秋週書場色食新親図数星晴切船線前組体地池茶昼朝通店点冬当答頭同道読南買麦風分聞毎妹夜野曜理話", @kl.kyoiku(1..2, CHISE::KanjiList::KEISEI)) + end +end diff --git a/test/org-test-network.rb b/test/org-test-network.rb new file mode 100755 index 0000000..7578ee8 --- /dev/null +++ b/test/org-test-network.rb @@ -0,0 +1,22 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. +# testcase for KanjiNetwork by eto 2003-0227 + +require "common" +require "chise/network" + +class Test_KanjiNetwork < Test::Unit::TestCase + def test_network + @kl = CHISE::KanjiList.instance + @kn = CHISE::KanjiNetwork.new + @kn.make_network(@kl.awase) + @kn.out("min.dot") #途中状態を保存 + + @gv = CHISE::GraphvizOLE.new() #OLE version + @gv.type = CHISE::Graphviz::TWOPI + @gv.target = "svg" + @gv.in = "min.dot" + @gv.out = "min.svg" + @gv.generate() + end +end diff --git a/test/org-test-str.rb b/test/org-test-str.rb new file mode 100755 index 0000000..c0259f4 --- /dev/null +++ b/test/org-test-str.rb @@ -0,0 +1,38 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. + +require "common" + +class TestString < Test::Unit::TestCase + def test_method + @str = "文字列" + str = @str.map_char {|ch| + ch+ch + } + assert_equal("文文字字列列", str) + assert_equal("文字列", @str) + +# assert_equal("<文,C1-4546>", "文".inspect_x) +# assert_equal("<字,J90-3B7A>", "字".inspect_x) +# assert_equal("<列,J90-4E73>", "列".inspect_x) +# assert_equal("<文,C1-4546><字,J90-3B7A><列,J90-4E73>", "文字列".inspect_x) + + ins = "字".inspect_all +# assert_match(/^<字,J90-3B7A,/, ins) + assert_match(/^<字,#x5b57,/, ins) + assert_match(/=big5:42610/, ins) + assert_match(/=cns11643-1:18291/, ins) + assert_match(/=gb2312:22358/, ins) + assert_match(/=daikanwa:6942/, ins) + assert_match(/ideographic-radical:39/, ins) + assert_match(/ideographic-strokes:3/, ins) + assert_match(/=ks-x1001:27950/, ins) + assert_match(/shinjigen-2:1777/, ins) + assert_match(/total-strokes:6/, ins) + assert_match(/=ucs:23383/, ins) + assert_match(/=gt:8734/, ins) + assert_match(/=gt-k:1624/, ins) + assert_match(/=gt-pj-1:15226/, ins) + assert_match(/=jis-x0208:15226/, ins) + end +end diff --git a/test/test-ccs.rb b/test/test-ccs.rb new file mode 100755 index 0000000..24d26db --- /dev/null +++ b/test/test-ccs.rb @@ -0,0 +1,10 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. + +require "common" + +class TestCCS < Test::Unit::TestCase + def test_ccs + @cs = ChiseDB.instance + end +end diff --git a/test/test-char.rb b/test/test-char.rb new file mode 100755 index 0000000..5e36c4c --- /dev/null +++ b/test/test-char.rb @@ -0,0 +1,67 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. + +require "common" + +class TestCharacter < Test::Unit::TestCase + def test_equality + c1 = "字".char # flyweight pattern + c2 = CHISE::Character.get("?字") + assert_equal(c1, c2) # equal + assert_same(c1, c2) # same instance + c3 = CHISE::Character.new(0x5b57) + assert_not_equal(c1, c3) # not equal + assert_not_same(c1, c3) # not same instance + assert_equal(c1.char_id, c3.char_id) + end + + def test_char + char = "字".char + assert_equal(23383, char["=ucs"]) + assert_equal(23383, char["ucs"]) + assert_equal(23383, char.ucs) + assert_equal(22358, char.gb2312) + assert_equal(6, char.total_strokes) + assert_equal(3, char.ideographic_strokes) + assert_equal(39, char.ideographic_radical) + assert_equal(nil, char.nosuchfeature) + assert_raise(RuntimeError){ char.nosuchmethod(0) } + end + + def test_latin + char = "A".char + assert_equal(65, char.ascii) + assert_equal(65, char.ucs) + assert_equal(65, char.latin_jisx0201) + assert_equal(65, char.latin_viscii) + assert_equal("L", char.bidi_category) + assert_equal("LATIN CAPITAL LETTER A", char.name) + end + + def test_to_er + #assert_equal("&J90-3B7A;", "字".char.to_er) + assert_equal("字", "字".char.to_er) + assert_equal("字", "&M-06942;".de_er.char.to_er) + assert_equal("𡙫", "&M-06000;".de_er.char.to_er) + end + + def test_alias + assert_equal("DIGIT ONE", "1".name) + assert_equal("DIGIT ONE", "1".char.name) + assert_equal("DIGIT ONE", "1".char["name"]) + assert_equal("(((name . \"FULLWIDTH DIGIT ONE\") (=ucs . 65297)))", "1".char["->fullwidth"]) + assert_equal("(((name . \"FULLWIDTH DIGIT ONE\") (=ucs . 65297)))", "1".to_fullwidth) + assert_equal("(((name . \"DIGIT ONE\") (=ucs . 49)))", "1".char["<-fullwidth"]) + assert_equal("(((name . \"DIGIT ONE\") (=ucs . 49)))", "1".from_fullwidth) +# assert_equal(0xfa55, "突".map_ucs_at_jis) +# assert_equal(0xfa55, "突".char["=>ucs@jis"]) + end + + def test_put + char = "字".char + char["test_attribute"] = "test" + assert_equal("test", char.test_attribute) + char["test_attribute"] = "test2" + assert_equal("test2", char.test_attribute) + end +end diff --git a/test/test-config.rb b/test/test-config.rb index 1d04d7e..019ec24 100755 --- a/test/test-config.rb +++ b/test/test-config.rb @@ -7,7 +7,7 @@ class Test_Config < Test::Unit::TestCase def test_config @config = CHISE::Config.instance assert_match(%r|/chise\Z|, @config.base_dir.to_s) - assert_match(%r|/chise/char-db\Z|, @config.db_dir.to_s) + assert_match(%r|/chise/chise-db\Z|, @config.db_dir.to_s) assert_match(%r|/chise/ids\Z|, @config.ids_dir.to_s) end end diff --git a/test/test-db.rb b/test/test-db.rb index b4fc1a6..e69de29 100755 --- a/test/test-db.rb +++ b/test/test-db.rb @@ -1,23 +0,0 @@ -#!/usr/bin/env ruby -# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. - -require "common" - -class TestBDB < Test::Unit::TestCase - def test_bdb - @config = CHISE::Config.instance - @dir = @config.db_dir - assert_match(%r|/chise/char-db\Z|, @dir) - file = @dir+"/=jis-x0208/system-char-id" - db = BDB::Hash.open(file, nil, 0) - end - - def test_db -# @cdb = CHISE::CharDB.instance -# @sdb = CHISE::CodesysDB.instance - end - - def test_codesys - @db = CHISE::CodesysDB.instance - end -end diff --git a/test/test-iconv.rb b/test/test-iconv.rb index d407f80..585ab92 100755 --- a/test/test-iconv.rb +++ b/test/test-iconv.rb @@ -30,6 +30,11 @@ class TestIconv < Test::Unit::TestCase assert_equal("[W", u32.u32tou16) assert_equal("\273\372", u16.u16toeuc) assert_equal("\216\232", u16.u16tosjis) + assert_equal(23383, u32.u32to_i) + assert_equal(23383, u8.u8to_i) + + assert_equal(u32, CHISE.i_tou32(23383)) + assert_equal(u8, CHISE.i_tou8(23383)) u8 = "Š¿Žš".sjistou8 assert_equal("\346\274\242\345\255\227", u8) diff --git a/test/test-ids.rb b/test/test-ids.rb new file mode 100755 index 0000000..3c0e98c --- /dev/null +++ b/test/test-ids.rb @@ -0,0 +1,13 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. + +require "common" + +class TestIDS < Test::Unit::TestCase + def test_idc + char = CHISE::Character.get(0x2FF0) + assert_equal("IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT", char.name) + assert_equal(char.to_er, "⿰") + assert_equal(char.bidi_category, "ON") + end +end diff --git a/test/test-parser.rb b/test/test-parser.rb index 54c1211..ced9600 100755 --- a/test/test-parser.rb +++ b/test/test-parser.rb @@ -14,6 +14,7 @@ class TestParser < Test::Unit::TestCase assert_equal(65, @pa.parse("65")) assert_equal(20175, @pa.parse("?\344\273\217")) assert_raise(RuntimeError){ @pa.parse("nosuchcharacter") } + assert_raise(RuntimeError){ @pa.parse("\344\273\217") } # test_parse_er assert_equal(true, @pa.contain_er?("A")) @@ -21,15 +22,31 @@ class TestParser < Test::Unit::TestCase assert_equal(true, @pa.is_er?("A")) assert_equal(false, @pa.is_er?("This is A er.")) assert_raise(RuntimeError){ @pa.parse_er("nosucher") } - assert_equal(65, @pa.parse("&MCS-00000041;")) - assert_equal(65, @pa.parse("&U-0041;")) - assert_equal(65, @pa.parse("&U+0041;")) - assert_equal(65, @pa.parse("A")) - assert_equal(65, @pa.parse("A")) - assert_equal(65, @pa.parse("A")) assert_equal(0xe001, @pa.parse("&my-1;")) -# assert_equal(23383, @pa.parse("&J90-3B7A;")) -# assert_equal(23383, @pa.parse("&I-J90-3B7A;")) -# assert_raise(RuntimeError){ @pa.parse_er("&nosucher;") } + + assert_equal(23383, @pa.parse("&MCS-00005B57;")) + assert_equal(23383, @pa.parse("&U5B57;")) + assert_equal(23383, @pa.parse("&U-5B57;")) + assert_equal(23383, @pa.parse("&U+5B57;")) + assert_equal(23383, @pa.parse("字")) + assert_equal(23383, @pa.parse("字")) + + # test_get_ccs + assert_equal(23383, @pa.parse("&J90-3B7A;")) + assert_equal(23383, @pa.parse("&I-J90-3B7A;")) + assert_equal(23383, @pa.parse("&MCS-00005B57;")) + assert_equal(23383, @pa.parse("&M-06942;")) + assert_raise(RuntimeError){ @pa.parse_er("&nosucher;") } + + assert_equal(28193, @pa.parse("&C1-602E;")) # 渡 + assert_equal(15542221, @pa.parse("&C1-6030;")) # unknown end + + def test_de_er + @pa = CHISE::EntityReferenceParser.new + assert_equal("This is A.", @pa.de_er("This is A.")) + assert_equal("A\345\255\227B", @pa.de_er("A&U5B57;B")) + assert_equal("A\345\255\227B", @pa.de_er("A&J90-3B7A;B")) + end + end diff --git a/test/test-rbchise.rb b/test/test-rbchise.rb index 457a3bb..84ec877 100755 --- a/test/test-rbchise.rb +++ b/test/test-rbchise.rb @@ -4,23 +4,134 @@ require "common" class TestRbChise < Test::Unit::TestCase - def test_rbchise - - end + include CHISE::ChiseValue - def test_rbchise0 + def test_rbchise @ds = CHISE::DataSource.new assert_instance_of(CHISE::DataSource, @ds) - @dt = @ds.open_decoding_table("=daikanwa") - assert_instance_of(CHISE::DecodingTable, @dt) - char_id = @dt.get_char(364) # get a character by Daikanwa number 364. - assert_instance_of(String, char_id) - assert_equal("?\344\273\217", char_id) + assert_match(/chise-db\Z/, @ds.location.to_s) - @ft = @ds.open_feature_table("ideographic-structure") + @ct = @ds.get_ccs("=daikanwa") + assert_instance_of(CHISE::CCSTable, @ct) + char_id = @ct.decode(364) # get a character by Daikanwa number 364. + assert_equal(20175, char_id) + str = format_char_id(20175) + assert_equal("?\344\273\217", str) + + char_id = @ds.decode_char("=daikanwa", 364) + assert_equal(20175, char_id) + + @ft = @ds.get_feature("ideographic-structure") assert_instance_of(CHISE::FeatureTable, @ft) value = @ft.get_value(char_id) assert_instance_of(String, value) assert_equal("(?\342\277\260 ?\344\272\273 ?\345\216\266)", value) + + value = @ds.load_feature("ideographic-structure", char_id) + assert_equal("(?\342\277\260 ?\344\272\273 ?\345\216\266)", value) + + @ds.each_feature {|f| + #qp f + assert_instance_of(String, f) + } + + @ft.each {|k, v| + #qp k, v + assert_kind_of(Integer, k) + assert_instance_of(String, v) + } + + ft = @ds.get_feature("numeric-value") + ft.each {|k, v| + #qp k, v + assert_kind_of(Integer, k) + assert_instance_of(String, v) + } + end + + def test_error + @ds = CHISE::DataSource.new + @ft = @ds.get_feature("nosuchfeature") + v = @ft.get_value(20175) + assert_equal(nil, v) + end + + def test_chisedb + @cd = CHISE::ChiseDB.instance + + char_id = @cd.decode_char("=daikanwa", 364) + assert_equal(20175, char_id) + + value = @cd.load_feature("ideographic-structure", char_id) + assert_equal("(?\342\277\260 ?\344\272\273 ?\345\216\266)", value) + + value = @cd.load_feature("=ucs", char_id) + assert_equal(20175, value) + + @cd.each_feature {|f| + assert_instance_of(String, f) + } + + ft = @cd.get_feature("numeric-value") + ft.each {|k, v| + assert_kind_of(Integer, k) + assert_instance_of(String, v) + } + end + + def test_ascii + @cd = CHISE::ChiseDB.instance + ct = @cd.get_ccs("ascii") + char_id = ct.decode(65) + assert_equal(65, char_id) + assert_equal("A", CHISE::Character.get(char_id).to_s) +# assert_equal("A", char.to_s) + end + + + def test_parse_c_string + u8 = "字" + assert_equal(23383, u8.u8to_i) + assert_equal(23383, parse_c_string("?"+u8)) + assert_equal(0, parse_c_string("?\\^@")) + assert_equal(9, parse_c_string("?\t")) + assert_equal(10, parse_c_string("?\n")) + assert_equal(13, parse_c_string("?\r")) + assert_equal(94, parse_c_string("?^\\")) + assert_equal(31, parse_c_string("?\\^_")) + assert_equal(32, parse_c_string("?\\ ")) + assert_equal(34, parse_c_string("?\\\"")) + assert_equal(126, parse_c_string("?~")) + assert_equal(127, parse_c_string("?\\^?\000")) + assert_equal(131, parse_c_string("?\\^\303\237")) + assert_equal(0x7FF, parse_c_string("?\337\277")) + assert_equal(0xFFFF, parse_c_string("?\357\277\277")) + assert_equal(0x1FFFFF, parse_c_string("?\367\277\277\277")) + assert_equal(0x3FFFFFF, parse_c_string("?\373\277\277\277\277")) + assert_equal(0xFFFFFFF, parse_c_string("?\374\217\277\277\277\277")) + assert_raise(RuntimeError) { parse_c_string("nosuch") } + end + + def test_format_char_id + u8 = "字" + assert_equal(u8, CHISE.i_tou8(23383)) + assert_equal("?\345\255\227", format_char_id(23383)) + assert_equal("?"+u8, format_char_id(23383)) + assert_equal("?\\^@", format_char_id(0)) + assert_equal("?\t", format_char_id(?\t)) + assert_equal("?\n", format_char_id(?\n)) + assert_equal("?\r", format_char_id(?\r)) + assert_equal("?^\\", format_char_id(0x1C)) + assert_equal("?\\^_", format_char_id(0x1F)) + assert_equal("?\\ ", format_char_id(?\s)) + assert_equal("?\\\"", format_char_id(?\")) + assert_equal("?~", format_char_id(0x7E)) + assert_equal("?\\^?\000", format_char_id(0x7F)) + assert_equal("?\\^\303\237", format_char_id(0x9F)) + assert_equal("?\337\277", format_char_id(0x7FF)) + assert_equal("?\357\277\277", format_char_id(0xFFFF)) + assert_equal("?\367\277\277\277", format_char_id(0x1FFFFF)) + assert_equal("?\373\277\277\277\277", format_char_id(0x3FFFFFF)) + assert_equal("?\374\217\277\277\277\277", format_char_id(0xFFFFFFF)) end end diff --git a/test/test-string.rb b/test/test-string.rb new file mode 100755 index 0000000..87e3c76 --- /dev/null +++ b/test/test-string.rb @@ -0,0 +1,38 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. + +require "common" + +class TestString < Test::Unit::TestCase + def test_string + # test_utf8 + assert_equal(true, "字".is_a_utf8?) + assert_equal(false, "字字".is_a_utf8?) + assert_raises(RuntimeError){ "字字".char } + assert_raises(RuntimeError){ "字字".ucs } + + # test_feature + assert_equal(25991, "文".ucs) + assert_equal(23383, "字".ucs) + end + + def test_er + assert_equal("字", CHISE::Character.get("&J90-3B7A;").to_s) + assert_equal("字", "字".de_er) # no effect + assert_equal("字", "&J90-3B7A;".de_er) + assert_equal("文字列", "文&J90-3B7A;列".de_er) + assert_equal("文字列", "文&M-06942;列".de_er) + assert_equal("文字列", "文&MCS-00005B57;列".de_er) + assert_equal("文字列", "文&U-5B57;列".de_er) + assert_equal("文字列", "文&U+5B57;列".de_er) + assert_raises(RuntimeError){ "文&nosucher;列".de_er } + end + + def test_characters + @str = "文字列" + assert_equal(["文","字","列"], @str.to_a) + ar = [] + @str.each_char {|char| ar << char } + assert_equal(["文","字","列"], ar) + end +end diff --git a/test/test-util.rb b/test/test-util.rb new file mode 100755 index 0000000..932f53d --- /dev/null +++ b/test/test-util.rb @@ -0,0 +1,11 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. + +require "common" + +class TestUtil < Test::Unit::TestCase + def test_db + assert_equal("()+!", "<>*?".path.unix_to_win.to_s) + assert_equal("<>*?", "()+!".path.win_to_unix.to_s) + end +end diff --git a/tools/.cvsignore b/tools/.cvsignore index 8a70bb0..480d153 100644 --- a/tools/.cvsignore +++ b/tools/.cvsignore @@ -1 +1,2 @@ chise-db.tar.gz +ruby.exe.stackdump diff --git a/tools/dbdumball.rb b/tools/dbdumball.rb index 3c41181..e69de29 100755 --- a/tools/dbdumball.rb +++ b/tools/dbdumball.rb @@ -1,8 +0,0 @@ -#!/usr/bin/env ruby -# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. - -$LOAD_PATH.unshift("..") -require "chise/char" - -CharDB.instance.dump_all() -CodesysDB.instance.dump_all() diff --git a/tools/dump-database.rb b/tools/dump-database.rb new file mode 100755 index 0000000..d116f47 --- /dev/null +++ b/tools/dump-database.rb @@ -0,0 +1,61 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. + +$LOAD_PATH.unshift("..") +require "chise/char" +require "chise/qp" + +module CHISE + class DataBaseManagement + def dump_all + #CharDB.instance.dump_all() + #CodesysDB.instance.dump_all() + + cd = ChiseDB.instance + path = cd.location+"character/feature" + + cd.each_feature {|f| + ft = cd.get_feature(f) + h = {} + ft.each {|k, v| + h[k] = v + } + + f = f.path + f = f.escape + f = f.escape_win_filename + txt = f.to_s+".txt" + qp f, txt + t = path+txt + + t.open("wb"){|out| + h.sort.each {|k, v| + out.printf("%s\t%s\n", k, v) + } + } + + ft.close + } + end + + def dump_db(t) + db = get(t) + return nil unless db + file = get_filename(t) + open("#{file}.txt", "w"){|out| + # out.binmode.sync = true + ar = db.to_a + ar.map! {|k, v| [to_num(k), to_num(v)] } + ar.sort.each {|k, v| + out.printf("%s\t%s\n", k, v) + } + } + true + end + + + end +end + +man = CHISE::DataBaseManagement.new +man.dump_all diff --git a/tools/management.rb b/tools/management.rb new file mode 100755 index 0000000..820a960 --- /dev/null +++ b/tools/management.rb @@ -0,0 +1,107 @@ +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. + +$LOAD_PATH.unshift("..") +require "pathname" +require "fileutils" +require "chise/config" +require "chise/util" +require "chise/qp" + +module CHISE + class DataBaseManagement + def dump_all +#CharDB.instance.dump_all() +#CodesysDB.instance.dump_all() + end + end + + class DataBaseFileManagement + + # from specs/char-atr.ja.txt + OBSOLETE_FEATURES = " +cns-radical +cns-radical? +kangxi-radical +daikanwa-radical + +cns-strokes +kangxi-strokes +daikanwa-strokes +shinjigen-1-radical +gb-original-radical +japanese-strokes +jis-strokes-a +jisx0208-strokes +unicode-strokes + +cns-total-strokes + +non-morohashi + +=>ucs* +#=>mojikyo +#=mojikyo +->identical + +ancient-ideograph-of +ancient-char-of-shinjigen-1 +original-ideograph-of +original-char-of-shinjigen-1 +vulgar-ideograph-of +vulgar-char-of-shinjigen-1 +ideographic-variants +variant-of-shinjigen-1 + +iso-10646-comment +".split + + def initialize() + # @opt = {:noop=>true, :verbose=>true} + @opt = {:verbose=>true} + end + + def move_obsolete_files + fpath = Config.instance.db_dir.path+"system-char-id" + fpath.chdir { + opath = "obsolete".path + opath.mkdir unless opath.directory? + + OBSOLETE_FEATURES.each {|attr| + next if attr =~ /^#/ + f = attr.path + f = f.normalize_filename + FileUtils.mv(f.to_s, opath.to_s, @opt) if f.exist? + f = f+".txt" + FileUtils.mv(f.to_s, opath.to_s, @opt) if f.exist? + } + } + end + + def rename_files + path = Config.instance.db_dir.path + + nfpath = path+"character/feature" + FileUtils.mkdir_p(nfpath.to_s, @opt) unless nfpath.directory? + + fpath = path+"system-char-id" + fpath.each_entry {|f| + next if /\A\./ =~ f + FileUtils.mv((fpath+f).to_s, nfpath.to_s, @opt) + } + + ncpath = path+"character/by_feature" + FileUtils.mkdir_p(ncpath.to_s, @opt) unless ncpath.directory? + + path.each_entry {|f| + next if /\A\./ =~ f + next if f.to_s == "character" + d = path + f + next unless d.directory? + ff = d + "system-char-id" + if ff.exist? + FileUtils.mv(ff.to_s, (ncpath+f).to_s, @opt) + end + } + end + end +end diff --git a/tools/move-obsolete-files.rb b/tools/move-obsolete-files.rb index 9692632..0eb5546 100755 --- a/tools/move-obsolete-files.rb +++ b/tools/move-obsolete-files.rb @@ -1,82 +1,7 @@ #!/usr/bin/env ruby # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. -# move obsolete BDB files. -$LOAD_PATH.unshift("..") -require "chise/config" -require "chise/util" +require "management" -module CHISE - class DBS_Management # DataBase file management - # from specs/char-atr.ja.txt - OBSOLETE_ATTRIBUTES = " -cns-radical -cns-radical? -kangxi-radical -daikanwa-radical - -cns-strokes -kangxi-strokes -daikanwa-strokes -shinjigen-1-radical -gb-original-radical -japanese-strokes -jis-strokes-a -jisx0208-strokes -unicode-strokes - -cns-total-strokes - -non-morohashi - -=>ucs* -#=>mojikyo -#=mojikyo -->identical - -ancient-ideograph-of -ancient-char-of-shinjigen-1 -original-ideograph-of -original-char-of-shinjigen-1 -vulgar-ideograph-of -vulgar-char-of-shinjigen-1 -ideographic-variants -variant-of-shinjigen-1 - -iso-10646-comment -".split - - def initialize - end - - def move_obsolete_files - fdir = Config.instance.db_dir+"/system-char-id" - Dir.chdir(fdir){ - odir = "obsolete" - Dir.mkdir(odir) unless FileTest.directory? odir - - OBSOLETE_ATTRIBUTES.each {|attr| - next if attr =~ /^#/ - f = get_filename(attr) - move(f, odir) if FileTest.exist?(f) - f = f+".txt" - move(f, odir) if FileTest.exist?(f) - } - } - end - - def get_filename(t) - t = CHISE.unix_to_win(t) if CHISE.windows? - t - end - - def move(file, dir) - cmd = "mv './#{file}' #{dir}" - p cmd - system cmd - end - end -end - -man = CHISE::DBS_Management.new +man = CHISE::DataBaseFileManagement.new man.move_obsolete_files diff --git a/tools/rename-files.rb b/tools/rename-files.rb new file mode 100755 index 0000000..ae47d42 --- /dev/null +++ b/tools/rename-files.rb @@ -0,0 +1,7 @@ +#!/usr/bin/env ruby +# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. + +require "management" + +man = CHISE::DataBaseFileManagement.new +man.rename_files