From 532444ed9ee69c56b57e0958769b76dc149ebcd2 Mon Sep 17 00:00:00 2001 From: eto Date: Mon, 14 Jun 2004 12:43:08 +0000 Subject: [PATCH] n.c. --- chise/character.rb | 22 +- chise/chisedb.rb | 37 +++ chise/iconv.rb | 40 ++-- chise/ids.rb | 560 +++------------------------------------------- chise/idsdb.rb | 97 +++++++- chise/idsdbmanagement.rb | 23 -- chise/management.rb | 4 +- chise/org-character.rb | 15 +- chise/org-string.rb | 2 - chise/parser.rb | 48 ++-- chise/rbchise.rb | 331 +-------------------------- chise/string.rb | 14 +- chise/util.rb | 52 +---- test/common.rb | 9 +- test/org-test-ids.rb | 101 --------- test/test-char.rb | 15 +- test/test-iconv.rb | 8 +- test/test-ids.rb | 24 +- test/test-idsdb.rb | 51 ++++- test/test-parser.rb | 28 ++- test/test-rbchise.rb | 155 ------------- test/test-string.rb | 9 + test/test-util.rb | 11 - 23 files changed, 376 insertions(+), 1280 deletions(-) diff --git a/chise/character.rb b/chise/character.rb index 190792f..c133c53 100755 --- a/chise/character.rb +++ b/chise/character.rb @@ -4,6 +4,8 @@ require "singleton" require "chise/parser" require "chise/chisedb" require "chise/iconv" +require "chise/utf8" +require "chise/ids" module CHISE class CharacterFactory # generate Character object and cache them @@ -33,13 +35,17 @@ module CHISE end class Character + include UTF8Value + include IDS_Module + def initialize(char_id) raise if char_id.nil? - raise unless char_id.is_a?(Fixnum) # char_id sure is a Fixnum. - raise if char_id < 0 # char_id sure is a positive value. + raise unless char_id.kind_of?(Integer) # make sure char_id is Integer. + raise if char_id < 0 # make sure char_id is positive. @char_id = char_id @char_id.freeze - @utf8_mcs = CHISE.i_tou8(@char_id) + # @utf8_mcs = CHISE.i_tou8(@char_id) + @utf8_mcs = itou8(@char_id) @utf8_mcs.freeze @feature = {} @check_all_done = nil @@ -106,6 +112,16 @@ module CHISE en.to_er(self) end + def is_idc? + 0x2ff0 <= @char_id && @char_id <= 0x2fff + end + + def idc_argument_number + return 0 unless is_idc? + return 3 if @char_id == 0x2ff2 || @char_id == 0x2ff3 + return 2 + end + private def get_feature(f) diff --git a/chise/chisedb.rb b/chise/chisedb.rb index b2f898b..3134fb1 100755 --- a/chise/chisedb.rb +++ b/chise/chisedb.rb @@ -9,6 +9,7 @@ module CHISE def initialize @ds = DataSource.new + @byids_db = {} end def location() @ds.location; end @@ -18,5 +19,41 @@ module CHISE def load_feature(n, cid) @ds.load_feature(n, cid) end def each_feature() @ds.each_feature {|f| yield f } end def each_ccs() @ds.each_ccs {|c| yield c } end + + def get_by_ids_db(n) + @byids_db[n] = ByIDS_DB.new(@ds, n) if @byids_db[n].nil? + @byids_db[n] + end + end + + class ByIDS_DB + include ChiseValue + include TableAccessModule + + def initialize(ds, name) + @ds, @name = ds, name + @category, @keyvalue = "character", "by_ids" + reset + end + + def decode(ids) + setup_db + return nil if @db.nil? + parse_c_string(@db.get(ids)) + end + + def set_decoded_char(ids, cid) + setup_db(true) + raise "@db is nil." if @db.nil? + @db.put(ids, format_char_id(cid)) + end + + def each + setup_db + raise "@db is nil." if @db.nil? + @db.each {|k, v| + yield(parse_value(k), parse_c_string(v)) + } + end end end diff --git a/chise/iconv.rb b/chise/iconv.rb index a55b6fa..d361da6 100755 --- a/chise/iconv.rb +++ b/chise/iconv.rb @@ -77,29 +77,29 @@ class String def u16toeuc() Iconv.iconv_to_from("EUC-JP", "UTF-16", self) end def u16tosjis() Iconv.iconv_to_from("Shift_JIS", "UTF-16", self) end - def u32to_i - return 0 if length == 0 - s = self - return (s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3]) - end - - def u8to_i - u32 = self.u8tou32 - u32.u32to_i - end +# def u32to_i +# return 0 if length == 0 +# s = self +# return (s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3]) +# end + +# def u8to_i +# u32 = self.u8tou32 +# u32.u32to_i +# end end module CHISE - def i_tou32(n) # convert a integer to UTF-32 String - raise unless n.is_a?(Integer) - sprintf("%c%c%c%c", (n >> 24)&0xff, (n >> 16)&0xff, (n >> 8)&0xff, n&0xff) - end - - def i_tou8(n) # convert a integer to UTF-8 String - u32 = CHISE.i_tou32(n) - u32.u32tou8 - end - module_function :i_tou32, :i_tou8 +# def i_tou32(n) # convert a integer to UTF-32 String +# raise unless n.is_a?(Integer) +# sprintf("%c%c%c%c", (n >> 24)&0xff, (n >> 16)&0xff, (n >> 8)&0xff, n&0xff) +# end + +# def i_tou8(n) # convert a integer to UTF-8 String +# u32 = CHISE.i_tou32(n) +# u32.u32tou8 +# end +# module_function :i_tou32, :i_tou8 end class NuUconv diff --git a/chise/ids.rb b/chise/ids.rb index e46bc51..792c863 100755 --- a/chise/ids.rb +++ b/chise/ids.rb @@ -1,543 +1,53 @@ # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. -require "chise/db" +require "chise/idstree" module CHISE -# IDC_LEFT_TO_RIGHT = "â¿°" -# IDC_ABOVE_TO_BELOW = "⿱" -# IDC_LEFT_TO_MIDDLE_AND_RIGHT = "⿲" -# IDC_ABOVE_TO_MIDDLE_AND_BELOW = "⿳" -# IDC_FULL_SURROUND = "â¿´" -# IDC_SURROUND_FROM_ABOVE = "⿵" -# IDC_SURROUND_FROM_BELOW = "⿶" -# IDC_SURROUND_FROM_LEFT = "â¿·" -# IDC_SURROUND_FROM_UPPER_LEFT = "⿸" -# IDC_SURROUND_FROM_UPPER_RIGHT = "⿹" -# IDC_SURROUND_FROM_LOWER_LEFT = "⿺" -# IDC_OVERLAID = "â¿»" - - IDC_LEFT_TO_RIGHT = "\342\277\260" #2FF0 - IDC_ABOVE_TO_BELOW = "\342\277\261" - IDC_LEFT_TO_MIDDLE_AND_RIGHT = "\342\277\262" - IDC_ABOVE_TO_MIDDLE_AND_BELOW = "\342\277\263" - IDC_FULL_SURROUND = "\342\277\264" #2FF4 - IDC_SURROUND_FROM_ABOVE = "\342\277\265" - IDC_SURROUND_FROM_BELOW = "\342\277\266" - IDC_SURROUND_FROM_LEFT = "\342\277\267" - IDC_SURROUND_FROM_UPPER_LEFT = "\342\277\270" - IDC_SURROUND_FROM_UPPER_RIGHT = "\342\277\271" - IDC_SURROUND_FROM_LOWER_LEFT = "\342\277\272" - IDC_OVERLAID = "\342\277\273" - - IDC_LR = IDC_LEFT_TO_RIGHT - IDC_AB = IDC_ABOVE_TO_BELOW - IDC_LM = IDC_LEFT_TO_MIDDLE_AND_RIGHT - IDC_AM = IDC_ABOVE_TO_MIDDLE_AND_BELOW - IDC_FS = IDC_FULL_SURROUND - IDC_FA = IDC_SURROUND_FROM_ABOVE - IDC_FB = IDC_SURROUND_FROM_BELOW - IDC_FL = IDC_SURROUND_FROM_LEFT - IDC_UL = IDC_SURROUND_FROM_UPPER_LEFT - IDC_UR = IDC_SURROUND_FROM_UPPER_RIGHT - IDC_LL = IDC_SURROUND_FROM_LOWER_LEFT - IDC_OV = IDC_OVERLAID - - IDC_LMR = IDC_LM - IDC_AMB = IDC_AM - IDC_FUL = IDC_UL - IDC_FUR = IDC_UR - IDC_FLL = IDC_LL - IDC_O = IDC_OV - - class IDS_TEXT_DB < DB - include Singleton - - IDS_LIST = " -IDS-UCS-Basic.txt -#IDS-UCS-Compat-Supplement.txt -#IDS-UCS-Compat.txt -IDS-UCS-Ext-A.txt -IDS-UCS-Ext-B-1.txt -IDS-UCS-Ext-B-2.txt -IDS-UCS-Ext-B-3.txt -IDS-UCS-Ext-B-4.txt -IDS-UCS-Ext-B-5.txt -IDS-UCS-Ext-B-6.txt -IDS-JIS-X0208-1990.txt -IDS-Daikanwa-01.txt -IDS-Daikanwa-02.txt -IDS-Daikanwa-03.txt -IDS-Daikanwa-04.txt -IDS-Daikanwa-05.txt -IDS-Daikanwa-06.txt -IDS-Daikanwa-07.txt -IDS-Daikanwa-08.txt -IDS-Daikanwa-09.txt -IDS-Daikanwa-10.txt -IDS-Daikanwa-11.txt -IDS-Daikanwa-12.txt -IDS-Daikanwa-dx.txt -IDS-Daikanwa-ho.txt -IDS-CBETA.txt -".split - - def initialize() - super - @ids_list = IDS_LIST - @chars = [] - - @dir = Config.instance.ids_dir - - @glob, @pre, @post = "#{@dir}/db/*", "#{@dir}/db/", "" - dir = File.dirname(@pre) - Dir.mkdir(dir) unless FileTest.exist?(dir) - open_dbs() - end - - def each_file() - return unless block_given? - @ids_list.each {|file| - next if file =~ /^#/ - yield(@dir+file) - } - end - - def each_line(file) - open(file){|f| - while line = f.gets - next if line =~ /^;/ #コメントはとばす - line.chomp! - code, char, ids = line.split - yield(code, char, ids) - end - } - end - - def dump_text_all - each_file {|file| - dir = File.dirname(file) + "/../ids-new/" - Dir.mkdir(dir) if ! FileTest.directory?(dir) - newfile = dir + File.basename(file) - p [file, newfile] - open(newfile, "w"){|out| - out.binmode.sync = true - each_line(file){|code, ch, ids| - char = Character.get(ch) - ids = char.decompose - out.print "#{code} #{ch} #{ids}\n" - } - } - } - end - - def make_ids_error - each_file {|file| - dir = File.dirname(file) + "/../ids-error" - Dir.mkdir(dir) unless FileTest.exist?(dir) - errfile = dir + "/" + File.basename(file) - # p [file, errfile] - open(errfile, "w"){|out| - out.binmode.sync = true - each_line(file){|code, ch, ids| - char = Character.get(ch) - ids_error = char["ids-error"] - next if ids_error.nil? - out.print "#{code} #{ch} #{ids} #{ids_error}\n" - } - } - } - end - end - - class IDS_DB < DB # BDB化したIDS DBを扱う - include Singleton - - def initialize - @dbs = CharDB.instance - end - - def make_ids_db - db = IDS_TEXT_DB.instance - db.each_file {|file| - @char_counter = 0 - @same_ids_counter = 0 - @good_ids_counter = 0 - @conflict_ids_counter = 0 - db.each_line(file){|code, ch, ids| - @char_counter += 1 - - ids = "" if ids == nil - next if ids == "" #IDSが定義されていない場合は、さっくりと無視するべしよ。 - - charimg = Character.get(ch) #実体参照である可能性がある - - next if code =~ /'$/ || code =~ /"$/ #大漢和番号のダッシュ付きは無視する - char = Character.get("&"+code+";") #code表記を元に実体参照を作って解釈する - if char.nil? || char.to_s == "" #うまく文字にならなかった - print "char == null #{char.inspect} #{code} #{ch} #{ids}\n" unless code =~ /^M-/ || code =~ /^CB/ - #大漢和、CBETA以外の場合は、エラーメッセージ。 - next - end - if char != charimg #code表記と文字が一致していない? - unless code =~ /^M-/ || code =~ /^MH-/ || code =~ /^CB/ #食い違っていて当然であるので何もしない - print "unknown char #{char.inspect} #{code} #{ch} #{ids}\n" - next #それ以外の場合はエラーメッセージをだして、次へ。 - end - end - #next if !char.has_attribute? #isolated characterはまぎれこませない。 - - ids.de_er! #実体参照を解除する - next if ids == char.to_s #もし文字とまったく一緒なら、意味が無いので情報を持たない - next if ids.char_length == 1 - - idstree = IDS_Tree.new(ids) - c = idstree.check_integrity - c = "contains self" if ids.include?(char.to_s) - if c #ちょっとでもエラーがある場合は、 - char["ids-error"] = c #エラーを記録して、データとしては保持しない - next - end - - if char["ids"].nil? || char["ids"] == "" #元々IDSが無かった場合は、 - char["ids"] = ids #普通に代入すればそれでいいです。 - @good_ids_counter += 1 - else #しかしいままでにすでにIDSが定義されていた場合は? - if char["ids"] == ids #新しいIDSと古いIDSが完全に一致するなら無視しましょう。 - @same_ids_counter += 1 - else #しかしいままでのIDSと新しいIDSが食い違った場合は? - @conflict_ids_counter += 1 - # print "conflict #{char.inspect} #{code} #{ids} #{char["ids"]}\n" - end - end - } - print "#{file} #{@char_counter} #{@same_ids_counter} #{@conflict_ids_counter} #{@good_ids_counter}\n" - CharacterFactory.instance.reset() - } - @dbs.dump_db("ids-error") #テキスト化する - @dbs.dump_db("ids") #テキスト化する - end - - def make_ids_reverse - h = Hash.new - @dbs.each("ids") {|k, v| - char = k.char - ids = char.decompose - h[ids] = "" if h[ids].nil? - h[ids] += k #追加する - } - h.each {|k, v| - h[k] = char_sort(v) #文字の順番を、よく使うっぽいものからの順番にする - } - h.delete_if {|k, v| #h[k]が""になる可能性もあるが、それはkeyとして入れないことにする。 - v == "" - } - print "length #{h.length}\n" - cdb = CodesysDB.instance - cdb.make_db_no_question_mark("ids", h) - cdb.open_db("ids") #これが無いと、dump_dbされません。 - cdb.dump_db("ids") - end - - def char_sort(composed) - return composed if composed.char_length == 1 - ar = composed.to_a - arorg = ar.dup - ar2 = [] - ar.dup.each {|ch| - char = ch.char - if char.char_id < 0xfffff #Unicodeっぽい? - ar2 << ch - ar.delete(ch) - end - } - if 0 < ar.length - EntityReference.each_codesys{|codesys, er_prefix, keta, numtype| - ar.each {|ch| - char = ch.char - v = char[codesys] - # p [codesys, v] if v - if v #EntityReferenceの順番に準拠する。 - ar2 << ch - ar.delete(ch) - end - } - } - end - if 0 < ar.length - # p ["yokuwakaran character", ar, ar[0].inspect_all, arorg] - EntityReference.each_codesys{|codesys, er_prefix, keta, numtype| - ar.dup.each {|ch| - char = ch.char - v = char[codesys] - # p [codesys, v] if v - } - } - end - return ar2.join("") - end - - def dump_ids_duplicated - open("ids-duplicated.txt", "w"){|out| - #out.binmode - CodesysDB.instance.each("ids") {|k, v| - if v.nil? - out.print "nil #{k} #{v}\n" - next - end - n = v.char_length - next if n == 1 - out.print "#{n} #{k} #{v}" - v.each_char {|ch| - char = ch.char - out.print " #{char.inspect}" - } - out.print "\n" - } - } - end - - def make_ids_aggregated - @dbs.each("ids") {|k, v| - char = k.char - ids = char.decompose - ag = ids.aggregate - char["ids-aggregated"] = ag - } - @dbs.dump_db("ids-aggregated") - end - - def dump_ids_aggregated - open("ids-aggregated.txt", "w"){|out| - #out.binmode - @dbs.each("ids") {|k, v| - char = k.char - ids = char["ids"] - ag = char["ids-aggregated"] - out.print "#{char.to_s} #{ag} #{ids}\n" if ids != ag - } - } - end - - def make_ids_parts - @dbs.each("ids") {|k, v| - char = k.char - pids = char.to_s - ar = [] - counter = 0 - loop { - ids = pids.decompose - break if ids == pids #これ以上分割できないようだったら終了〜。 - ar += ids.to_a - counter += 1 - p [char.to_s, pids, ids, ar] if 10 < counter #これは何かおかしいぞと - pids = ids - } - ar.sort! - ar.uniq! - #やっぱりIDS文字も加えることにする. by eto 2003-02-05 - # ar.delete_if {|ch| - # ch.char.is_ids? #IDS文字はまぎれこませない。 - # } - str = ar.join("") - char["ids-parts"] = str - } - @dbs.dump_db("ids-parts") - end - - def make_ids_contained - h = Hash.new - @dbs.each("ids-parts") {|k, v| - char = k.char - parts = char.ids_parts - parts.each_char {|ch| - # part = ch.char - h[ch] = [] if h[ch].nil? - h[ch] << k - # h[ch] += k - # part["ids-contained"] = "" if part["ids-contained"].nil? - # part["ids-contained"] += k - } - } - h.each {|k, v| - char = k.char - v.sort! - char["ids-contained"] = v.join("") - - } - @dbs.dump_db("ids-contained") - end - - def make_ids_decomposed - @dbs.each("ids") {|k, v| - char = k.char - de= char.decompose_all - char["ids-decomposed"] = de - } - @dbs.dump_db("ids-decomposed") - end - - end - - class Node < Array # 木構造の中の一つの枝 - def initialize(nodeleaf=nil, nodenum=nil) - super() - @nodeleaf = nodeleaf - @nodenum = nodenum - if @nodeleaf - original_add(@nodeleaf) - end - end - attr_reader :nodenum - - alias original_add << - private :original_add - - def <<(obj) - original_add(obj) - @nodenum -= 1 if @nodenum - end - - def nodes - ar = [] - ar << self.to_s - self.each {|n| - ar += n.nodes if n.is_a? Node - } - return ar - end - - end - - class Tree # 木構造を扱う - def initialize() - @root = Node.new() - @stack = [@root] - @leafnum = 0 - @depth = 1 #stackの深さが最大になったところの値、木構造が無いときは1となる - end - - def depth() @depth - 1 end - - def add_node(nodeleaf=nil, nodenum=nil) #枝を追加 - new_node = Node.new(nodeleaf, nodenum) - @stack.last << new_node - @stack << new_node - if @depth < @stack.length - @depth = @stack.length - end - self - end - - def end_node() #この枝は終り - @stack.pop - self - end - - def add_leaf(a) #葉を追加 - @stack.last << a - end_check() - self - end - - def end_check() - n = @stack.last.nodenum - if n && n == 0 - end_node() - end_check() #再帰 - end - end - - def check_integrity - n = @stack.last.nodenum - return nil if @root.length == 0 #no tree is good tree - return "unmatch leaves" if n && n != 0 - return "extra nodes" if @root.first.is_a?(Node) && @root.length != 1 - return "extra leaves" if @root.length != 1 - return nil - end - - def nodes - r = @root.nodes - r.shift - r - end - - def sub_nodes - r = nodes - r.shift - r - end - - def to_s() @root.to_s end - - def inspect() @root.inspect end - end - - class IDS_Tree < Tree + IDC_0 = "\342\277\260" + IDC_1 = "\342\277\261" + IDC_2 = "\342\277\262" + IDC_3 = "\342\277\263" + IDC_4 = "\342\277\264" + IDC_5 = "\342\277\265" + IDC_6 = "\342\277\266" + IDC_7 = "\342\277\267" + IDC_8 = "\342\277\270" + IDC_9 = "\342\277\271" + IDC_A = "\342\277\272" + IDC_B = "\342\277\273" + + IDC_LEFT_TO_RIGHT = IDC_0 + IDC_ABOVE_TO_BELOW = IDC_1 + IDC_LEFT_TO_MIDDLE_AND_RIGHT = IDC_2 + IDC_ABOVE_TO_MIDDLE_AND_BELOW = IDC_3 + IDC_FULL_SURROUND = IDC_4 + IDC_SURROUND_FROM_ABOVE = IDC_5 + IDC_SURROUND_FROM_BELOW = IDC_6 + IDC_SURROUND_FROM_LEFT = IDC_7 + IDC_SURROUND_FROM_UPPER_LEFT = IDC_8 + IDC_SURROUND_FROM_UPPER_RIGHT = IDC_9 + IDC_SURROUND_FROM_LOWER_LEFT = IDC_A + IDC_OVERLAID = IDC_B + + class IDS_Decomposer def initialize(str) @str = str - super() - parse() - end - - def parse() - @str.each_char {|ch| - char = Character.new(ch) - if is_ids?(char) - add_node(char, ids_operator_argc(char)) - else - add_leaf(char) - end - } - end - - def is_ids?(obj) - return true if "+*".include?(obj.to_s) #テスト用ですかね - return true if obj.is_ids? - return false end - def ids_operator_argc(obj) - return obj.ids_operator_argc if 0 < obj.ids_operator_argc - return 2 #テスト用ってことで - end - - def check_integrity - r = super - return r if r #不完全がすでにわかっているならreturn - return "contains ques" if @str =~ /\?/ #?が含まれている? - return nil + def decompose + end end - class IDS # IDSそのものを扱うclass - def initialize(str) #IDS文字列をうけとる。 - @str = str + module IDS_Module + def decompose + self.ids end - def parse + def decompose_all + end - def parse_x #柔軟型のParse. IDSキャラクターが前にきてなくてもよい。などなど。 - end end - class Counter - #使い方 - #counter = Counter.new(50) { exit } - #counter.count - def initialize(max) - @max = max - @count = 0 - @proc = proc - end - - def count - @count += 1 - if @max <= @count - @proc.call - end - end - - end end diff --git a/chise/idsdb.rb b/chise/idsdb.rb index 07bc5ec..e9773ae 100755 --- a/chise/idsdb.rb +++ b/chise/idsdb.rb @@ -1,8 +1,98 @@ # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. require "chise/char" +require "chise/ids" +require "chise/qp" +require "chise/management" module CHISE + class IDS_DB_Management + def initialize + @cd = ChiseDB.instance + @idsdb = IDS_DB.instance + end + + def check_conflict_of_ids_text + @idsdb.each_ccs {|ccs| + qp ccs + c = Hash.new(0) + h = {} + @idsdb.get_ccs(ccs).each_character {|char, ids| + c["char"] += 1 + next if ids == char.to_s + next if ids.char_length == 1 + char_id = char.char_id + cids = h[char_id] + if cids.nil? # There is no ids yet. + h[char_id] = ids # just set it. + c["good"] += 1 + else # but, if there is already a ids? + if cids == ids # the two are same. + c["same"] += 1 # and just ignore + else # but, if the two are not same? + c["conflict"] += 1 + puts "conflict\t#{char.to_s}\t#{ids}\t#{cids}" + end + end + } + puts "#{ccs}\t#{c['char']}\t#{c['same']}\t#{c['conflict']}\t#{c['good']}" + } + end + + def store_ids_as_text + @idsdb.each_ccs {|ccs| + #qp ccs + @idsdb.get_ccs(ccs).each_character {|char, ids| + next if ids == char.to_s + next if ids.char_length == 1 + char.ids_text = ids # just set it. + } + } + @cd.get_feature("ids-text").dump + end + + def store_ids_de_er + @cd.get_feature("ids-text").each {|cid, idser| + char = Character.get(cid) + begin + ids = idser.de_er # parse Entity Reference + rescue => e + qp cid, idser + next + end + char.ids_de_er = ids # set it. + } + @cd.get_feature("ids-de-er").dump + end + + def check_integrity_of_ids_tree + @cd.get_feature("ids-de-er").each {|cid, ids| + char = Character.get(cid) + idstree = IDS_Tree.new(ids) + begin + raise "contains self" if ids.include?(char.to_s) + idstree.check_integrity + rescue => e + #puts "#{cid}\t#{e.message}\t#{ids}" + char.ids_error = e.message + next + end + char.ids = ids # set it. + } + @cd.get_feature("ids").dump + @cd.get_feature("ids-error").dump + end + + def make_by_ids_db + ct = @cd.get_by_ids_db("ids") + @cd.get_feature("ids").each {|cid, ids| + char = Character.get(cid) + ct.set_decoded_char(ids, cid) + } + ct.dump + end + end + class IDS_DB include Singleton @@ -36,6 +126,7 @@ module CHISE @path.open {|f| f.each {|line| next if /\A;/ =~ line # skip comment + line.chomp! code, picture, ids = line.split raise if code.nil? ids = "" if ids.nil? @@ -44,13 +135,17 @@ module CHISE } end - def each_entry + def each_character each_line {|code, ids| + next if ids.nil? + next if ids == "" # If there is no IDS, ignore it. + er = "&"+code+";" begin char = Character.get(er) rescue #qp er + next end next if char.nil? yield(char, ids) diff --git a/chise/idsdbmanagement.rb b/chise/idsdbmanagement.rb index 93a15e5..e69de29 100755 --- a/chise/idsdbmanagement.rb +++ b/chise/idsdbmanagement.rb @@ -1,23 +0,0 @@ -# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. - -require "chise/idsdb" -require "chise/qp" - -module CHISE - class IDS_DB_Management - def initialize - @idb = CHISE::IDS_DB.instance - end - - def store_ids_to_bdb - @idb.each_ccs {|ccs| - #qp ccs - cd = @idb.get_ccs(ccs) - cd.each_entry {|char, ids| - char.ids = ids if char.ids.nil? - } - } - end - - end -end diff --git a/chise/management.rb b/chise/management.rb index 239f05a..2e17209 100755 --- a/chise/management.rb +++ b/chise/management.rb @@ -6,7 +6,7 @@ require "chise/char" require "chise/qp" module CHISE - class TableAccess + module TableAccessModule def to_hash h = {} each {|k, v| h[k] = v } @@ -15,9 +15,7 @@ module CHISE def dump txt = @name.path.escape.escape_win_filename.to_s+".txt" - #"character/feature" t = @ds.location+@category+@keyvalue+txt - qp t.to_s t.open("wb"){|out| to_hash.sort.each {|k, v| out.printf("%s\t%s\n", k, v) diff --git a/chise/org-character.rb b/chise/org-character.rb index e238bb2..c6e2778 100755 --- a/chise/org-character.rb +++ b/chise/org-character.rb @@ -1,3 +1,5 @@ +module CHISE + class Character def mcs_hex() sprintf("%x", @char_id) end def char_feature_alist() check_all_database(); @features; end @@ -16,8 +18,7 @@ def check_database(a) db = CharDB.instance u8 = mcs_utf8() - v = db.get(a, u8) # u8‚Å•\‚³‚ê‚镶Žš‚ÌaƒAƒgƒŠƒrƒ…[ƒg‚𒲂ׂéB - v + db.get(a, u8) # u8‚Å•\‚³‚ê‚镶Žš‚ÌaƒAƒgƒŠƒrƒ…[ƒg‚𒲂ׂéB end def check_all_database() # Œ»Ý‚Ì@char_id‚©‚çA•¶Žšƒf[ƒ^ƒx[ƒX‚ðŽQÆ‚·‚é @@ -206,11 +207,5 @@ return de.decompose_all(level+1) if de != self #‚È‚É‚©•Ï‰»‚ª‚ ‚Á‚½‚©‚çÄ‹A return de #‚à‚¤‚±‚êˆÈã•Ï‰»‚Í–³‚³‚»‚¤‚¾‚¼‚ƁB end - - def is_ids?() 0x2ff0 <= @char_id && @char_id <= 0x2fff end - - def ids_operator_argc() - return 0 unless is_ids? - return 3 if @char_id == 0x2ff2 || @char_id == 0x2ff3 - return 2 - end + end +end diff --git a/chise/org-string.rb b/chise/org-string.rb index 375a2d7..a495fe6 100755 --- a/chise/org-string.rb +++ b/chise/org-string.rb @@ -1,6 +1,4 @@ class String - def each_character() to_a.each {|ch| yield ch.char } end - def char_length() to_a.length end def to_utf8() return to_a.map {|ch| ch.char.to_utf8 diff --git a/chise/parser.rb b/chise/parser.rb index 5a6cf7e..1570f20 100755 --- a/chise/parser.rb +++ b/chise/parser.rb @@ -1,9 +1,10 @@ # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. require "chise/chisedb" +require "chise/utf8" module CHISE - module EntityReference + module EntityReferenceModule PART = "&([-+0-9A-Za-z#]+);" ALL = '\A'+PART+'\Z' PART_RE = Regexp.new(PART) @@ -13,7 +14,7 @@ module CHISE def is_er?(s) (ALL_RE =~ s) != nil; end # the order is important. The primary charset should be selectable. - CODESYS_TABLE = [ + CCS_TABLE = [ %w( =jis-x0208-1990 J90- 4 X), %w( =jis-x0208-1983 J83- 4 X), %w( =jis-x0208-1978 J78- 4 X), @@ -24,6 +25,7 @@ module CHISE %w( =jis-x0213-2-2000 JX2- 4 X), %w( =jis-x0212 JSP- 4 X), %w( =big5-cdp CDP- 4 X), + %w( =big5 B- 4 X), %w( =cns11643-1 C1- 4 X), %w( =cns11643-2 C2- 4 X), %w( =cns11643-3 C3- 4 X), @@ -36,12 +38,28 @@ module CHISE %w( =cbeta CB 5 d), %w( =gt GT- 5 d), %w( =gt-k GT-K 5 d), + %w( =hanziku-1 HZK01- 4 X), + %w( =hanziku-2 HZK02- 4 X), + %w( =hanziku-3 HZK03- 4 X), + %w( =hanziku-4 HZK04- 4 X), + %w( =hanziku-5 HZK05- 4 X), + %w( =hanziku-6 HZK06- 4 X), + %w( =hanziku-7 HZK07- 4 X), + %w( =hanziku-8 HZK08- 4 X), + %w( =hanziku-9 HZK09- 4 X), + %w( =hanziku-10 HZK10- 4 X), + %w( =hanziku-11 HZK11- 4 X), + %w( =hanziku-12 HZK12- 4 X), + %w( =ruimoku-v6 RUI6- 4 X), + %w( =jef-china3 JC3- 4 X), ] - PRIVATE_USE_AREA = 0xe000 end class CharacterParser - include EntityReference + include EntityReferenceModule + include UTF8Value + + PRIVATE_USE_AREA = 0xe000 def parse(c) # parse a value and return a number (MCS) raise "c is nil" if c.nil? @@ -49,8 +67,9 @@ module CHISE if c.kind_of?(String) if /\A\?/ =~ c c = c.sub(/\A\?/, "") # remove "?" in the head - u4 = c.u8tou32 # translate from UTF-8 to UTF-32 - return u4.u32to_i # translate UTF-32 to UCS number + #u4 = c.u8tou32 # translate from UTF-8 to UTF-32 + #return u4.u32to_i # translate UTF-32 to UCS number + return u8toi(c) end return parse_er(c) if is_er?(c) # ER? @@ -88,7 +107,7 @@ module CHISE s = s.sub(/\AI-/, "") end - CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype| + CCS_TABLE.each {|ccs, er_prefix, keta, numtype| if numtype == "d" nre = '\d' elsif numtype == "X" @@ -99,6 +118,7 @@ module CHISE re = "\\A#{er_prefix}(#{nre}{#{keta},#{keta}})\\Z" next unless Regexp.new(re) =~ s + #qp s codestr = $1 if numtype == "d" @@ -107,8 +127,8 @@ module CHISE code = codestr.hex end - u8 = get_ccs(codesys, code) -# qp s, u8 + u8 = get_ccs(ccs, code) + #qp ccs, s, u8 next if u8.nil? num = parse(u8) @@ -128,7 +148,7 @@ module CHISE end class EntityReferenceParser - include EntityReference + include EntityReferenceModule def de_er(s) # replace EntityReference with corresponding character. return s unless PART_RE =~ s # don't use contain_er? to get $1 @@ -143,15 +163,15 @@ module CHISE end class EntityReferenceEncoder - include EntityReference + include EntityReferenceModule def to_er(char) cid = char.char_id return "&#x%04x;" % cid if cid <= 0xffff return "&#x%05x;" % cid if cid <= 0xfffff - CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype| - code = char[codesys] + CCS_TABLE.each {|ccs, er_prefix, keta, numtype| + code = char[ccs] next if code.nil? return "&#{er_prefix}%0#{keta}#{numtype};" % code } @@ -159,7 +179,7 @@ module CHISE "&MCS-%08X;" % cid # the last answer end - def to_er_by_ccs(cid, codesys) # not yet + def to_er_by_ccs(cid, ccs) # not yet end end diff --git a/chise/rbchise.rb b/chise/rbchise.rb index c90749e..6391dc3 100755 --- a/chise/rbchise.rb +++ b/chise/rbchise.rb @@ -1,332 +1,3 @@ # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. -# "rbchise.so" ext compatible library by eto 2003-0317 -require "bdb" -require "pathname" -require "fileutils" -require "chise/util" - -module CHISE - module ChiseValue; end - module TableAccessModule; end - - class DataSource - NONE = 0 - Berkeley_DB = 1 - - def initialize(type=Berkeley_DB, loc=nil, subtype=0, modemask=0755) - @type = type - loc = Config.instance.db_dir if loc.nil? - @location = loc.path - @subtype = subtype - @modemask = modemask - @fdb = {} - @cdb = {} - end - attr_reader :type, :location, :subtype, :modemask - - def get_feature(f) - @fdb[f] = FeatureTable.new(self, f) if @fdb[f].nil? - @fdb[f] - end - - def get_ccs(ccs) - @cdb[ccs] = CCSTable.new(self, ccs) if @cdb[ccs].nil? - @cdb[ccs] - end - - def each_feature - each_entry("character/feature") {|f| yield(f) } - end - - def each_ccs - each_entry("character/by_feature") {|f| yield(f) } - end - - def load_feature(name, cid) - ft = get_feature(name) - return nil if ft.nil? - ft.get_value(cid) - end - - def decode_char(ccs, code_point) - ct = get_ccs(ccs) - return nil if ct.nil? - ct.decode(code_point) - end - - private - def each_entry(subdir) - dir = @location + subdir - dir.each_entry {|f| - next if f.to_s == "." || f.to_s == ".." - next if f.to_s =~ /\.txt\Z/ - yield(f.unescape_win_filename.unescape.to_s) - } - end - end - - class TableAccess - def initialize(ds, name) - @ds, @name = ds, name - @db = nil - @access = 0 - end - - def sync - @db.close if @db - @db = nil - @access = 0 - end - alias close sync - - - private - def setup_db(writable=nil) - setup_db_exec(writable, @category, @keyvalue) - end - - def setup_db_exec(writable, cat, key) - if writable - sync if @access & BDB::CREATE == 0 - @access = BDB::CREATE - else - @access = BDB::RDONLY - end - - return if @db - - begin - @db = AttributeTable.new(@ds.location, cat, key, - @name, @access, @ds.modemask) - rescue - @db = nil - end - #raise if @db.nil? - end - end - - class FeatureTable < TableAccess - include ChiseValue - - def initialize(ds, name) - super - @category, @keyvalue = "character", "feature" - end - - def get_value(cid) - setup_db - return nil if @db.nil? - parse_value(@db.get(format_char_id(cid))) - end - - def set_value(cid, value) - setup_db(true) - return nil if @db.nil? - @db.put(format_char_id(cid), value) - end - - def each - setup_db - return nil if @db.nil? - @db.each {|k, v| - yield(parse_c_string(k), v) - } - end - end - - class CCSTable < TableAccess - include ChiseValue - - def initialize(ds, name) - super - @category, @keyvalue = "character", "by_feature" - end - - def decode(code_point) - setup_db - return nil if @db.nil? - parse_c_string(@db.get(code_point.to_s)) - end - - def set_decoded_char(code_point, cid) - setup_db(true) - return nil if @db.nil? - @db.put(code_point.to_s, format_char_id(cid)) - end - - def each - setup_db - return nil if @db.nil? - @db.each {|k, v| - yield(parse_value(k), parse_c_string(v)) - } - end - end - - class AttributeTable - def initialize(dir, cat, keytype, name, amask, mmask) - dbdir = dir + cat + keytype - #FileUtils.mkdir_p(dbdir.to_s) unless dbdir.directory? - path = dbdir + name.path.escape.escape_win_filename -# qp path, amask, mmask - raise unless path.exist? -# @db = BDB::Hash.open(path.to_s, amask, mmask) - @db = BDB::Hash.open(path.to_s) - at_exit { - close - } - end - - def close - return if @db.nil? - begin - @db.sync - @db.close - rescue - end - end - - def get(k) @db.get(k); end - def put(k, v) @db.put(k, v); end - def each() @db.each {|k, v| yield(k, v) } end - end - - module ChiseValue - def parse_value(v) - return v if v.nil? - #return v if v.kind_of?(Integer) - return v.to_i if /\A\d+\Z/ =~ v # number? - return $1 if /\A"(.+)"\Z/ =~ v # remove surrounding " - #return v.sub(/\A\?/, "") if v =~ /\A\?/ # remove ? in the head - #return parse_sexp(v) if v =~ /\A\(.+\)\Z/ # parse sexp # not yet - v - end - - def parse_c_string(str) - return nil if str.nil? - - i = 0 - c = str[i] - i += 1 - len = str.length - - raise unless 2 <= len && c == ?\? - - c = str[i] - i += 1 - - if (c == ?\\) - raise if (len < 3) - c = str[i] - i += 1 - if (c == ?^) - raise if (len < 4) - c = str[i] - i += 1 - if c == ?\? - return 0x7F - else - return c & (0x80 | 0x1F) - end - end - # raise # ? - end - - if ( c < 0xC0 ) - cid = c - counter = 0 - elsif ( c < 0xE0 ) - cid = c & 0x1f - counter = 1 - elsif ( c < 0xF0 ) - cid = c & 0x0f - counter = 2 - elsif ( c < 0xF8 ) - cid = c & 0x07 - counter = 3 - elsif ( c < 0xFC ) - cid = c & 0x03 - counter = 4 - else - cid = c & 0x01 - counter = 5 - end - - if (counter + 2 <= len) - (0...counter).each {|j| - cid = (cid << 6) | (str[j + i] & 0x3F) - } - return cid - end - - raise - end - - def format_char_id(cid) - case cid - when ?\t then return "?\t" - when ?\n then return "?\n" - when ?\r then return "?\r" - when 0x1C then return "?\^\\" - end - - if cid <= 0x1F - return "?\\^"+(?@+cid).chr - elsif (cid == ?\s) || (cid == ?\") || - (cid == ?\#) || (cid == ?\') || - (cid == ?\() || (cid == ?\)) || - (cid == ?\,) || (cid == ?\.) || - (cid == ?\;) || (cid == ?\?) || - (cid == ?\[) || (cid == ?\\) || - (cid == ?\]) || (cid == ?\`) - return "?\\"+cid.chr - elsif (cid <= 0x7E) - return("?"+cid.chr) - elsif (cid == 0x7F) - return "?\\^?"+0.chr - elsif (cid <= 0x9F) - dest = "?\\^" - dest += (((cid + ?@) >> 6) | 0xC0).chr - dest += (((cid + ?@) & 0x3F) | 0x80).chr - return dest - elsif (cid <= 0x7FF) - dest = "? " - dest[1] = (cid >> 6) | 0xC0 - dest[2] = (cid & 0x3F) | 0x80 - return dest - elsif (cid <= 0xFFFF) - dest = "? " - dest[1] = (cid >> 12) | 0xE0 - dest[2] = ((cid >> 6) & 0x3F) | 0x80 - dest[3] = (cid & 0x3F) | 0x80 - return dest - elsif (cid <= 0x1FFFFF) - dest = "? " - dest[1] = (cid >> 18) | 0xF0 - dest[2] = ((cid >> 12) & 0x3F) | 0x80 - dest[3] = ((cid >> 6) & 0x3F) | 0x80 - dest[4] = (cid & 0x3F) | 0x80 - return dest - elsif (cid <= 0x3FFFFFF) - dest = "? " - dest[1] = (cid >> 24) | 0xF8 - dest[2] = ((cid >> 18) & 0x3F) | 0x80 - dest[3] = ((cid >> 12) & 0x3F) | 0x80 - dest[4] = ((cid >> 6) & 0x3F) | 0x80 - dest[5] = (cid & 0x3F) | 0x80 - return dest - else - dest = "? " - dest[1] = (cid >> 30) | 0xFC - dest[2] = ((cid >> 24) & 0x3F) | 0x80 - dest[3] = ((cid >> 18) & 0x3F) | 0x80 - dest[4] = ((cid >> 12) & 0x3F) | 0x80 - dest[5] = ((cid >> 6) & 0x3F) | 0x80 - dest[6] = (cid & 0x3F) | 0x80 - return dest - end - raise - end - end -end +require "chise/libchise" diff --git a/chise/string.rb b/chise/string.rb index ee96134..929c7ca 100755 --- a/chise/string.rb +++ b/chise/string.rb @@ -26,14 +26,24 @@ class String char.method_missing(mid, *args) end + def to_a + self.split(//u) + end + + def char_length + to_a.length + end + def each_char to_a.each {|c| yield(c) } end - def to_a - self.split(//u) + def each_character + to_a.each {|ch| + yield ch.char + } end def de_er() diff --git a/chise/util.rb b/chise/util.rb index 8a34ea7..4cdba13 100644 --- a/chise/util.rb +++ b/chise/util.rb @@ -1,53 +1,3 @@ # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. -require "pathname" -require "chise/config" - -class String - def path - Pathname.new(self) - end -end - -class Pathname - def escape # copied from cgi.rb - s = @path.gsub(/([\/%]+)/n){ - "%" + $1.unpack("H2" * $1.size).join("%").upcase - } - Pathname.new(s) - end - - def unescape # copied from cgi.rb - s = @path.tr("+", " ").gsub(/((?:%[0-9a-fA-F]{2})+)/n) { - [$1.delete("%")].pack("H*") - } - Pathname.new(s) - end - - # translate file name for deal with the restriction of Windows file system. - def unix_to_win - win = @path.gsub(//, ")") - win = win.gsub(/\*/, "+") - win = win.gsub(/\?/, "!") - Pathname.new(win) - end - - def win_to_unix - unix = @path.gsub(/\)/, ">") - unix = unix.gsub(/\(/, "<") - unix = unix.gsub(/\!/, "?") - unix = unix.gsub(/\+/, "*") - Pathname.new(unix) - end - - def escape_win_filename - return self.unix_to_win if CHISE.windows? - self - end - - def unescape_win_filename - return self.win_to_unix if CHISE.windows? - self - end -end +require "chise/path" diff --git a/test/common.rb b/test/common.rb index b06096b..1be12c3 100755 --- a/test/common.rb +++ b/test/common.rb @@ -1,11 +1,12 @@ # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. $VERBOSE = true +#$KCODE = "u" -$debug = false # for test -$debug = true # for test -$stdout.binmode if $debug -$stdout.sync = true if $debug +# $debug = false # for test +# $debug = true # for test +# $stdout.binmode if $debug +# $stdout.sync = true if $debug $LOAD_PATH.unshift("..") require "test/unit" diff --git a/test/org-test-ids.rb b/test/org-test-ids.rb index 74653ce..9034b59 100755 --- a/test/org-test-ids.rb +++ b/test/org-test-ids.rb @@ -6,9 +6,6 @@ require "common" class TestIDS < Test::Unit::TestCase def test_ids - char = "榊".char - assert_equal("⿰木神", char.ids) - assert_equal("⿰木神", char.decompose) str = "榊" assert_equal("⿰木神", str.char.ids) assert_equal("⿰木神", str.decompose) @@ -64,104 +61,6 @@ class TestIDS < Test::Unit::TestCase assert_match(/OVERLAID/, "&U+2FFB;".de_er.char.name) #∵ end - def test_tree - assert_equal("[]", CHISE::Tree.new().inspect) - assert_equal("[1]", CHISE::Tree.new().add_leaf(1).inspect) - assert_equal("[1, 2]", CHISE::Tree.new().add_leaf(1).add_leaf(2).inspect) - assert_equal("[[]]", CHISE::Tree.new().add_node.inspect) - assert_equal("[[1]]", CHISE::Tree.new().add_node.add_leaf(1).inspect) - assert_equal("[[1, 2]]", CHISE::Tree.new().add_node.add_leaf(1).add_leaf(2).inspect) - assert_equal("[[1]]", CHISE::Tree.new().add_node.add_leaf(1).end_node.inspect) - assert_equal("[[1], [1]]", CHISE::Tree.new().add_node.add_leaf(1).end_node.add_node.add_leaf(1).end_node.inspect) - - tree = CHISE::Tree.new - assert_equal("[]", tree.inspect) - assert_equal("[1]", tree.add_leaf(1).inspect) - assert_equal(0, tree.depth) - assert_equal("[1, 2]", tree.add_leaf(2).inspect) - assert_equal("[1, 2, []]", tree.add_node.inspect) - assert_equal("[1, 2, [3]]", tree.add_leaf(3).inspect) - assert_equal(1, tree.depth) - assert_equal("[1, 2, [3, 4]]", tree.add_leaf(4).inspect) - assert_equal("[1, 2, [3, 4]]", tree.end_node.inspect) - assert_equal("[1, 2, [3, 4], [5]]", tree.add_node.add_leaf(5).inspect) - assert_equal("[1, 2, [3, 4], [5, [6]]]", tree.add_node.add_leaf(6).inspect) - assert_equal(2, tree.depth) - - tree = CHISE::Tree.new - assert_equal("[[\"+\"]]", tree.add_node("+", 2).inspect) - assert_equal("[[\"+\", 1]]", tree.add_leaf(1).inspect) - assert_equal("unmatch leaves", tree.check_integrity) - assert_equal("[[\"+\", 1, 2]]", tree.add_leaf(2).inspect) - assert_nil(tree.check_integrity) - assert_equal("[[\"+\", 1, 2], 3]", tree.add_leaf(3).inspect) - assert_equal("extra nodes", tree.check_integrity) - - tree = CHISE::Tree.new - assert_equal("[[\"+\"]]", tree.add_node("+", 2).inspect) - assert_equal("unmatch leaves", tree.check_integrity) - assert_equal("[[\"+\", 1]]", tree.add_leaf(1).inspect) - assert_equal("unmatch leaves", tree.check_integrity) - assert_equal("[[\"+\", 1, [\"+\"]]]", tree.add_node("+", 2).inspect) - assert_equal("unmatch leaves", tree.check_integrity) - assert_equal("[[\"+\", 1, [\"+\", 2]]]", tree.add_leaf(2).inspect) - assert_equal("unmatch leaves", tree.check_integrity) - assert_equal("[[\"+\", 1, [\"+\", 2, 3]]]", tree.add_leaf(3).inspect) - assert_nil(tree.check_integrity) - - tree = CHISE::Tree.new - assert_equal("[1]", tree.add_leaf(1).inspect) - assert_nil(tree.check_integrity) - assert_equal("[1, 2]", tree.add_leaf(2).inspect) - assert_equal("extra leaves", tree.check_integrity) - end - - def test_ids_tree -# assert_equal("[[<+,U+002B>, , ]]", CHISE::IDS_Tree.new("+AB").inspect) -# assert_equal("[[<+,U+002B>, , ], ]", CHISE::IDS_Tree.new("+ABC").inspect) -# assert_equal("[[<+,U+002B>, , [<+,U+002B>, , ]]]", CHISE::IDS_Tree.new("+A+BC").inspect) -# assert_equal("[[<+,U+002B>, , [<+,U+002B>, , ]], ]", CHISE::IDS_Tree.new("+A+BCD").inspect) - - #assert_equal("[<榊,U+698A>]", CHISE::IDS_Tree.new("榊").inspect) -# assert_equal("[[<â¿°,U+2FF0>, <木,J90-4C5A>, <神,J90-3F40>]]", CHISE::IDS_Tree.new("⿰木神").inspect) - assert_equal(1, CHISE::IDS_Tree.new("⿰木神").depth) -# assert_equal("[[<â¿°,U+2FF0>, <木,J90-4C5A>, [<â¿°,U+2FF0>, <⺭,CDP-8B70>, <申,J90-3F3D>]]]", CHISE::IDS_Tree.new("⿰木⿰⺭申").inspect) - assert_equal(2, CHISE::IDS_Tree.new("⿰木⿰⺭申").depth) - assert_equal("unmatch leaves", CHISE::IDS_Tree.new("⿰木").check_integrity) - assert_nil(CHISE::IDS_Tree.new("⿰木神").check_integrity) - assert_equal("unmatch leaves", CHISE::IDS_Tree.new("⿰木⿰申").check_integrity) - assert_nil(CHISE::IDS_Tree.new("⿰木⿰⺭申").check_integrity) - assert_equal("extra nodes", CHISE::IDS_Tree.new("⿰木⿰⺭申申").check_integrity) - assert_nil(CHISE::IDS_Tree.new("榊").check_integrity) - assert_equal("extra leaves", CHISE::IDS_Tree.new("榊榊").check_integrity) - - assert_equal(3, "⿳".char.ids_operator_argc) - assert_equal("⿳士冖匕", "壱".char.ids) - assert_equal(3, "壱".char.ids.char.ids_operator_argc) - assert_nil(CHISE::IDS_Tree.new("⿳士冖匕").check_integrity) - assert_equal("unmatch leaves", CHISE::IDS_Tree.new("⿳士冖").check_integrity) - assert_equal("extra nodes", CHISE::IDS_Tree.new("⿳士冖匕匕").check_integrity) - - assert_equal("contains ques", CHISE::IDS_Tree.new("⿳士冖?").check_integrity) - end - - def test_tree_depth - assert_equal(1, CHISE::IDS_Tree.new("林".decompose).depth) -# assert_equal("["⿰木木"]", CHISE::IDS_Tree.new("林".decompose).nodes.inspect) -# assert_equal("[]", CHISE::IDS_Tree.new("林".decompose).sub_nodes.inspect) - assert_equal(2, CHISE::IDS_Tree.new("榊".decompose_all).depth) -# assert_equal("["⿰木⿰⺭申", "⿰⺭申"]", CHISE::IDS_Tree.new("榊".decompose_all).nodes.inspect) -# assert_equal("["⿰⺭申"]", CHISE::IDS_Tree.new("榊".decompose_all).sub_nodes.inspect) - -# assert_equal(3, CHISE::IDS_Tree.new("焔".decompose_all).depth) -# assert_equal(3, CHISE::IDS_Tree.new("焔".decompose_all).nodes.length) -# assert_equal(2, CHISE::IDS_Tree.new("焔".decompose_all).sub_nodes.length) - - assert_equal(2, CHISE::IDS_Tree.new("屡".decompose_all).depth) - assert_equal("⿸尸娄", "⿸尸⿱米女".aggregate) - assert_equal(3, CHISE::IDS_Tree.new("醤".decompose_all).depth) - end - def test_compose_exact #正確に一致するIDSを検知する assert_equal("榊", "榊".decompose.compose) assert_equal("壱", "壱".decompose.compose) diff --git a/test/test-char.rb b/test/test-char.rb index 5e36c4c..70dbdc0 100755 --- a/test/test-char.rb +++ b/test/test-char.rb @@ -28,6 +28,11 @@ class TestCharacter < Test::Unit::TestCase assert_raise(RuntimeError){ char.nosuchmethod(0) } end + def test_bignum + char = CHISE::Character.get(1644203214) + assert_equal("\375\242\200\210\263\216", char.to_s) + end + def test_latin char = "A".char assert_equal(65, char.ascii) @@ -59,9 +64,11 @@ class TestCharacter < Test::Unit::TestCase def test_put char = "字".char - char["test_attribute"] = "test" - assert_equal("test", char.test_attribute) - char["test_attribute"] = "test2" - assert_equal("test2", char.test_attribute) + #qp char.test_feature + char.test_feature = "test1" + assert_equal("test1", char.test_feature) + #qp char.test_feature + char.test_feature = "test2" + assert_equal("test2", char.test_feature) end end diff --git a/test/test-iconv.rb b/test/test-iconv.rb index 585ab92..43c2e64 100755 --- a/test/test-iconv.rb +++ b/test/test-iconv.rb @@ -30,11 +30,11 @@ class TestIconv < Test::Unit::TestCase assert_equal("[W", u32.u32tou16) assert_equal("\273\372", u16.u16toeuc) assert_equal("\216\232", u16.u16tosjis) - assert_equal(23383, u32.u32to_i) - assert_equal(23383, u8.u8to_i) +# assert_equal(23383, u32.u32to_i) +# assert_equal(23383, u8.u8to_i) - assert_equal(u32, CHISE.i_tou32(23383)) - assert_equal(u8, CHISE.i_tou8(23383)) +# assert_equal(u32, CHISE.i_tou32(23383)) +# assert_equal(u8, CHISE.i_tou8(23383)) u8 = "Š¿Žš".sjistou8 assert_equal("\346\274\242\345\255\227", u8) diff --git a/test/test-ids.rb b/test/test-ids.rb index 3c0e98c..e850450 100755 --- a/test/test-ids.rb +++ b/test/test-ids.rb @@ -2,8 +2,9 @@ # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. require "common" +require "chise/ids" -class TestIDS < Test::Unit::TestCase +class TestIDC < Test::Unit::TestCase def test_idc char = CHISE::Character.get(0x2FF0) assert_equal("IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT", char.name) @@ -11,3 +12,24 @@ class TestIDS < Test::Unit::TestCase assert_equal(char.bidi_category, "ON") end end + +class TestIDS < Test::Unit::TestCase + def test_ids_1 + assert_equal("\342\277\261\345\256\200\345\255\220", "字".ids) + assert_equal("⿱宀子", "字".ids) + assert_equal(CHISE::IDC_1+"宀子", "字".ids) + assert_equal("\342\277\260\346\227\245\345\257\272", "時".ids) + assert_equal(CHISE::IDC_0+"日寺", "時".ids) + end + + def test_decompose + char = "榊".char + assert_equal("⿰木神", char.ids) + assert_equal("⿰木神", char.decompose) +# assert_equal("⿰木神", char.decompose_all) + + + + + end +end diff --git a/test/test-idsdb.rb b/test/test-idsdb.rb index e960397..eda3493 100755 --- a/test/test-idsdb.rb +++ b/test/test-idsdb.rb @@ -3,24 +3,55 @@ require "common" require "chise/idsdb" +require "chise/management" + +#class TestIDS_DB < Test::Unit::TestCase +class TestIDS_DB + def check_ccs_db(cd) + cd.each_line {|code, ids| + assert_instance_of(String, code) + assert_instance_of(String, ids) + } + cd.each_character {|char, ids| + assert_instance_of(CHISE::Character, char) + assert_instance_of(String, ids) + } + end -class TestIDS_DB < Test::Unit::TestCase def test_ids_db @idb = CHISE::IDS_DB.instance assert_instance_of(CHISE::IDS_DB, @idb) @idb.each_ccs {|ccs| cd = @idb.get_ccs(ccs) assert_instance_of(CHISE::IDS_CCS_DB, cd) + #check_ccs_db(cd) } + cd = @idb.get_ccs("JIS-X0208-1990") + check_ccs_db(cd) + end +end - @cd = @idb.get_ccs("JIS-X0208-1990") - @cd.each_line {|code, ids| - assert_instance_of(String, code) - assert_instance_of(String, ids) - } - @cd.each_entry {|char, ids| - assert_instance_of(CHISE::Character, char) - assert_instance_of(String, ids) - } +class TestIDS_DB_Management < Test::Unit::TestCase + def test_management + man = CHISE::IDS_DB_Management.new + # make sure there is no conflict + #man.check_conflict_of_ids_text # 167.499 seconds. + #man.store_ids_as_text # 172.024 seconds. + #man.store_ids_de_er # 47.99 seconds. + #man.check_integrity_of_ids_tree # 58.185 seconds. + #man.make_by_ids_db # 29.572 seconds. + +=begin + db = IDS_DB.instance +# db.make_ids_db #1時間12分 +# IDS_TEXT_DB.instance.make_ids_error #4分 +# db.make_ids_reverse #2分 + db.dump_ids_duplicated #1分 + db.make_ids_aggregated #5分 + db.dump_ids_aggregated #1分 + db.make_ids_parts #30分 + db.make_ids_contained #2分 + #db.make_ids_decomposed #2分→おわらなかった…。 +=end end end diff --git a/test/test-parser.rb b/test/test-parser.rb index 3a29516..b8916f3 100755 --- a/test/test-parser.rb +++ b/test/test-parser.rb @@ -4,10 +4,11 @@ require "common" class TestParser < Test::Unit::TestCase - def test_parser + def setup @pa = CHISE::CharacterParser.new + end - # test_parse + def test_parse assert_raise(RuntimeError){ @pa.parse(nil) } assert_equal(65, @pa.parse(0x41)) assert_raise(RuntimeError){ @pa.parse(Object.new) } @@ -15,14 +16,16 @@ class TestParser < Test::Unit::TestCase assert_equal(20175, @pa.parse("?\344\273\217")) assert_raise(RuntimeError){ @pa.parse("nosuchcharacter") } assert_raise(RuntimeError){ @pa.parse("\344\273\217") } + end - # test_parse_er + def test_parse_er assert_equal(true, @pa.contain_er?("A")) assert_equal(true, @pa.contain_er?("This is A er.")) assert_equal(true, @pa.is_er?("A")) assert_equal(false, @pa.is_er?("This is A er.")) - assert_raise(RuntimeError){ @pa.parse_er("nosucher") } assert_equal(0xe001, @pa.parse("&my-1;")) + assert_raise(RuntimeError){ @pa.parse_er("&nosucher;") } + assert_raise(RuntimeError){ @pa.parse_er("nosucher") } assert_equal(23383, @pa.parse("&MCS-00005B57;")) assert_equal(23383, @pa.parse("&U5B57;")) @@ -30,16 +33,29 @@ class TestParser < Test::Unit::TestCase assert_equal(23383, @pa.parse("&U+5B57;")) assert_equal(23383, @pa.parse("字")) assert_equal(23383, @pa.parse("字")) + end - # test_get_ccs + def test_parse_ccs assert_equal(23383, @pa.parse("&J90-3B7A;")) assert_equal(23383, @pa.parse("&I-J90-3B7A;")) assert_equal(23383, @pa.parse("&MCS-00005B57;")) assert_equal(23383, @pa.parse("&M-06942;")) - assert_raise(RuntimeError){ @pa.parse_er("&nosucher;") } + end + def test_comples_ccs assert_equal(28193, @pa.parse("&C1-602E;")) # 渡 assert_equal(15542221, @pa.parse("&C1-6030;")) # unknown + + # test_ccs_etc + assert_equal(131636, @pa.parse("&HZK01-C947;")) # =hanziku-1 + assert_equal(1644203214, @pa.parse("&CDP-8CCE;")) # CDP + assert_equal(1644202927, @pa.parse("&CDP-8BAF;")) + assert_equal(1644210346, @pa.parse("&B-A8AA;")) # =big5 + assert_equal(1644202869, @pa.parse("&RUI6-E00E;")) # =ruimoku-v6 + assert_equal(15225021, @pa.parse("&JC3-50BD;")) # =jef-china3 + assert_equal(1644202692, @pa.parse("&CB00008;")) + assert_equal(14820071, @pa.parse("&CB08935;")) + #assert_equal(0, @pa.parse("&CB08661;")) # what? end def test_de_er diff --git a/test/test-rbchise.rb b/test/test-rbchise.rb index 2d69d2f..e69de29 100755 --- a/test/test-rbchise.rb +++ b/test/test-rbchise.rb @@ -1,155 +0,0 @@ -#!/usr/bin/env ruby -# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. - -require "common" - -class TestRbChise < Test::Unit::TestCase - include CHISE::ChiseValue - - def test_rbchise - @ds = CHISE::DataSource.new - assert_instance_of(CHISE::DataSource, @ds) - assert_match(/chise-db\Z/, @ds.location.to_s) - - @ct = @ds.get_ccs("=daikanwa") - assert_instance_of(CHISE::CCSTable, @ct) - char_id = @ct.decode(364) # get a character by Daikanwa number 364. - assert_equal(20175, char_id) - str = format_char_id(20175) - assert_equal("?\344\273\217", str) - - char_id = @ds.decode_char("=daikanwa", 364) - assert_equal(20175, char_id) - - @ft = @ds.get_feature("ideographic-structure") - assert_instance_of(CHISE::FeatureTable, @ft) - value = @ft.get_value(char_id) - assert_instance_of(String, value) - assert_equal("(?\342\277\260 ?\344\272\273 ?\345\216\266)", value) - - value = @ds.load_feature("ideographic-structure", char_id) - assert_equal("(?\342\277\260 ?\344\272\273 ?\345\216\266)", value) - - @ds.each_feature {|f| - #qp f - assert_instance_of(String, f) - } - - @ft.each {|k, v| - #qp k, v - assert_kind_of(Integer, k) - assert_instance_of(String, v) - } - - ft = @ds.get_feature("numeric-value") - ft.each {|k, v| - #qp k, v - assert_kind_of(Integer, k) - assert_instance_of(String, v) - } - end - - def test_each_ccs - @ds = CHISE::DataSource.new - @ds.each_ccs {|ccs| - #qp ccs - assert_instance_of(String, ccs) - ct = @ds.get_ccs(ccs) - assert_instance_of(CHISE::CCSTable, ct) - } - - ct = @ds.get_ccs("=ascii") - ct.each {|k, v| - #qp k, v - assert_kind_of(Integer, k) - assert_kind_of(Integer, v) - } - ct.close - end - - def test_error - @ds = CHISE::DataSource.new - @ft = @ds.get_feature("nosuchfeature") - v = @ft.get_value(20175) - assert_equal(nil, v) - end - - def test_chisedb - @cd = CHISE::ChiseDB.instance - - char_id = @cd.decode_char("=daikanwa", 364) - assert_equal(20175, char_id) - - value = @cd.load_feature("ideographic-structure", char_id) - assert_equal("(?\342\277\260 ?\344\272\273 ?\345\216\266)", value) - - value = @cd.load_feature("=ucs", char_id) - assert_equal(20175, value) - - @cd.each_feature {|f| - assert_instance_of(String, f) - } - - ft = @cd.get_feature("numeric-value") - ft.each {|k, v| - assert_kind_of(Integer, k) - assert_instance_of(String, v) - } - end - - def test_ascii - @cd = CHISE::ChiseDB.instance - ct = @cd.get_ccs("ascii") - char_id = ct.decode(65) - assert_equal(65, char_id) - assert_equal("A", CHISE::Character.get(char_id).to_s) -# assert_equal("A", char.to_s) - end - - - def test_parse_c_string - u8 = "字" - assert_equal(23383, u8.u8to_i) - assert_equal(23383, parse_c_string("?"+u8)) - assert_equal(0, parse_c_string("?\\^@")) - assert_equal(9, parse_c_string("?\t")) - assert_equal(10, parse_c_string("?\n")) - assert_equal(13, parse_c_string("?\r")) - assert_equal(94, parse_c_string("?^\\")) - assert_equal(31, parse_c_string("?\\^_")) - assert_equal(32, parse_c_string("?\\ ")) - assert_equal(34, parse_c_string("?\\\"")) - assert_equal(126, parse_c_string("?~")) - assert_equal(127, parse_c_string("?\\^?\000")) - assert_equal(131, parse_c_string("?\\^\303\237")) - assert_equal(0x7FF, parse_c_string("?\337\277")) - assert_equal(0xFFFF, parse_c_string("?\357\277\277")) - assert_equal(0x1FFFFF, parse_c_string("?\367\277\277\277")) - assert_equal(0x3FFFFFF, parse_c_string("?\373\277\277\277\277")) - assert_equal(0xFFFFFFF, parse_c_string("?\374\217\277\277\277\277")) - assert_raise(RuntimeError) { parse_c_string("nosuch") } - end - - def test_format_char_id - u8 = "字" - assert_equal(u8, CHISE.i_tou8(23383)) - assert_equal("?\345\255\227", format_char_id(23383)) - assert_equal("?"+u8, format_char_id(23383)) - assert_equal("?\\^@", format_char_id(0)) - assert_equal("?\t", format_char_id(?\t)) - assert_equal("?\n", format_char_id(?\n)) - assert_equal("?\r", format_char_id(?\r)) - assert_equal("?^\\", format_char_id(0x1C)) - assert_equal("?\\^_", format_char_id(0x1F)) - assert_equal("?\\ ", format_char_id(?\s)) - assert_equal("?\\\"", format_char_id(?\")) - assert_equal("?~", format_char_id(0x7E)) - assert_equal("?\\^?\000", format_char_id(0x7F)) - assert_equal("?\\^\303\237", format_char_id(0x9F)) - assert_equal("?\337\277", format_char_id(0x7FF)) - assert_equal("?\357\277\277", format_char_id(0xFFFF)) - assert_equal("?\367\277\277\277", format_char_id(0x1FFFFF)) - assert_equal("?\373\277\277\277\277", format_char_id(0x3FFFFFF)) - assert_equal("?\374\217\277\277\277\277", format_char_id(0xFFFFFFF)) - end -end diff --git a/test/test-string.rb b/test/test-string.rb index 87e3c76..2f7cc56 100755 --- a/test/test-string.rb +++ b/test/test-string.rb @@ -28,6 +28,15 @@ class TestString < Test::Unit::TestCase assert_raises(RuntimeError){ "文&nosucher;列".de_er } end + def test_de_er_ccs + assert_equal("\346\270\241", "&C1-602E;".de_er) # 渡 + assert_equal("\370\273\222\237\215", "&C1-6030;".de_er) # unknown + # test_hanziku + assert_equal("\360\240\210\264", "&HZK01-C947;".de_er) +# assert_equal(1644203214, "&CDP-8CCE;".de_er) +# assert_equal(1644202927, "&CDP-8BAF;".de_er) + end + def test_characters @str = "文字列" assert_equal(["文","字","列"], @str.to_a) diff --git a/test/test-util.rb b/test/test-util.rb index 932f53d..e69de29 100755 --- a/test/test-util.rb +++ b/test/test-util.rb @@ -1,11 +0,0 @@ -#!/usr/bin/env ruby -# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. - -require "common" - -class TestUtil < Test::Unit::TestCase - def test_db - assert_equal("()+!", "<>*?".path.unix_to_win.to_s) - assert_equal("<>*?", "()+!".path.win_to_unix.to_s) - end -end -- 1.7.10.4