X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=chise%2Fidsdb.rb;h=83a55c0b3ec59419042054297d15e89081c4e8f7;hb=fc94a49ca7fffa51475bcccf26b328a3b92f3758;hp=07bc5ec82eb5d9b1c90d6f576a52593cb93965c6;hpb=d9c959430ad67c86b09a049da28e173f7450ece9;p=chise%2Fruby.git diff --git a/chise/idsdb.rb b/chise/idsdb.rb index 07bc5ec..83a55c0 100755 --- a/chise/idsdb.rb +++ b/chise/idsdb.rb @@ -1,8 +1,188 @@ # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved. require "chise/char" +require "chise/ids" +require "chise/qp" +require "chise/management" module CHISE + class IDS_DB_Management + def initialize + @cd = ChiseDB.instance + @idsdb = IDS_DB.instance + end + + def check_conflict_of_ids_text + @idsdb.each_ccs {|ccs| + #qp ccs + c = Hash.new(0) + h = {} + @idsdb.get_ccs(ccs).each_character {|char, ids| + c["char"] += 1 + next if ids == char.to_s + next if ids.char_length == 1 + char_id = char.char_id + cids = h[char_id] + if cids.nil? # There is no ids yet. + h[char_id] = ids # just set it. + c["good"] += 1 + else # but, if there is already a ids? + if cids == ids # the two are same. + c["same"] += 1 # and just ignore + else # but, if the two are not same? + c["conflict"] += 1 + puts "conflict\t#{char.to_s}\t#{ids}\t#{cids}" + end + end + } + puts "#{ccs}\t#{c['char']}\t#{c['same']}\t#{c['conflict']}\t#{c['good']}" + } + end + + def store_ids_as_text + max = 20000 + h = {} + @idsdb.each_ccs {|ccs| + qp ccs + @idsdb.get_ccs(ccs).each_character {|char, ids| + next if ids == char.to_s + next if ids.char_length == 1 + char.ids_text = ids # just set it. + h[char.char_id] = ids +# break if max <= h.length + } +# break if max <= h.length + } + qp "%08X" % h.keys.max + qp "sync", @cd.get_feature("ids-text").sync + @cd.get_feature("ids-text").dump + qp h.length + qp @cd.get_feature("ids-text").to_hash.length + end + + def store_ids_de_er + h = {} + @cd.get_feature("ids-text").each_char {|cid, ids_text| + char = Character.get(cid) + begin + ids = ids_text.de_er # parse Entity Reference + rescue => e + qp cid, ids_text + next + end + next if ids == char.to_s + next if ids.char_length == 1 + char.ids_de_er = ids # set it. + h[char.char_id] = ids + } + qp "%08X" % h.keys.max + @cd.get_feature("ids-de-er").dump + qp h.length + qp @cd.get_feature("ids-de-er").to_hash.length + end + + def check_integrity_of_ids_tree + h = {} + @cd.get_feature("ids-de-er").each_char {|cid, ids| + char = Character.get(cid) + idstree = IDS_Tree.new(ids) + begin + raise "contains self" if ids.include?(char.to_s) + idstree.check_integrity + rescue => e + #puts "#{cid}\t#{e.message}\t#{ids}" + char.ids_error = e.message + next + end + char.ids_org = ids # set it. + h[char.char_id] = ids + } + @cd.get_feature("ids-org").dump + qp h.length + qp @cd.get_feature("ids-org").to_hash.length + @cd.get_feature("ids-error").dump + end + + def make_by_ids_db_org + h = {} + byids = @cd.get_by_ids_db("ids-org") + @cd.get_feature("ids-org").each_char {|cid, ids| + char = Character.get(cid) + byids.set_decoded_char(ids, cid) + h[ids] = cid + } + qp h.length + byids.dump + qp byids.to_hash.length + end + + def store_ids_aggregated + h = {} + @cd.get_feature("ids-org").each_char {|cid, ids| + char = Character.get(cid) + #ids = char.decompose + #ids = char.ids + ag = ids.to_ids.aggregate("ids-org") + #puts "#{char.to_s}\t#{ids}\t#{ag}" + char.ids = ag # ids-aggregated + h[char.char_id] = ids + } + @cd.get_feature("ids").dump + qp h.length + qp @cd.get_feature("ids").to_hash.length + end + + def store_ids_subparts + h = {} + @cd.get_feature("ids").each_char {|cid, v| + char = Character.get(cid) + pids = char.to_s # previous_ids + ar = [] + i = 0 # only for infinite loop check + loop { + ids = pids.decompose + break if ids == pids # break if there is no possibilities. + ar += ids.to_a + i += 1 + qp [char.to_s, pids, ids, ar] if 10 < i # something wrong. + pids = ids + } + str = ar.sort.uniq.join("") # can contain IDC. + char.ids_subparts = str + h[char.char_id] = str + } + @cd.get_feature("ids-subparts").dump + qp h.length + qp @cd.get_feature("ids-subparts").to_hash.length + end + + def store_ids_contained + h = Hash.new + @cd.get_feature("ids-subparts").each_char {|cid, v| + char = Character.get(cid) + parts = char.ids_subparts + parts.each_char {|ch| + h[ch] = [] if h[ch].nil? + h[ch] << cid + } + } + h.each {|char, ar| + str = ar.sort.map {|cid| Character.get(cid).to_s }.join + char.ids_contained = str + } + @cd.get_feature("ids-contained").dump + end + + def make_by_ids_db + byids = @cd.get_by_ids_db("ids") + @cd.get_feature("ids").each_char {|cid, ids| + char = Character.get(cid) + byids.set_decoded_char(ids, cid) + } + byids.dump + end + end + class IDS_DB include Singleton @@ -36,6 +216,7 @@ module CHISE @path.open {|f| f.each {|line| next if /\A;/ =~ line # skip comment + line.chomp! code, picture, ids = line.split raise if code.nil? ids = "" if ids.nil? @@ -44,13 +225,17 @@ module CHISE } end - def each_entry + def each_character each_line {|code, ids| + next if ids.nil? + next if ids == "" # If there is no IDS, ignore it. + er = "&"+code+";" begin char = Character.get(er) rescue #qp er + next end next if char.nil? yield(char, ids)