--- /dev/null
+# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
+
+$LOAD_PATH << "../../lib" if $0 == __FILE__
+require "chise/db"
+
+module CHISE
+# IDC_LEFT_TO_RIGHT = "⿰"
+# IDC_ABOVE_TO_BELOW = "⿱"
+# IDC_LEFT_TO_MIDDLE_AND_RIGHT = "⿲"
+# IDC_ABOVE_TO_MIDDLE_AND_BELOW = "⿳"
+# IDC_FULL_SURROUND = "⿴"
+# IDC_SURROUND_FROM_ABOVE = "⿵"
+# IDC_SURROUND_FROM_BELOW = "⿶"
+# IDC_SURROUND_FROM_LEFT = "⿷"
+# IDC_SURROUND_FROM_UPPER_LEFT = "⿸"
+# IDC_SURROUND_FROM_UPPER_RIGHT = "⿹"
+# IDC_SURROUND_FROM_LOWER_LEFT = "⿺"
+# IDC_OVERLAID = "⿻"
+
+ IDC_LEFT_TO_RIGHT = "\342\277\260" #2FF0
+ IDC_ABOVE_TO_BELOW = "\342\277\261"
+ IDC_LEFT_TO_MIDDLE_AND_RIGHT = "\342\277\262"
+ IDC_ABOVE_TO_MIDDLE_AND_BELOW = "\342\277\263"
+ IDC_FULL_SURROUND = "\342\277\264" #2FF4
+ IDC_SURROUND_FROM_ABOVE = "\342\277\265"
+ IDC_SURROUND_FROM_BELOW = "\342\277\266"
+ IDC_SURROUND_FROM_LEFT = "\342\277\267"
+ IDC_SURROUND_FROM_UPPER_LEFT = "\342\277\270"
+ IDC_SURROUND_FROM_UPPER_RIGHT = "\342\277\271"
+ IDC_SURROUND_FROM_LOWER_LEFT = "\342\277\272"
+ IDC_OVERLAID = "\342\277\273"
+
+ IDC_LR = IDC_LEFT_TO_RIGHT
+ IDC_AB = IDC_ABOVE_TO_BELOW
+ IDC_LM = IDC_LEFT_TO_MIDDLE_AND_RIGHT
+ IDC_AM = IDC_ABOVE_TO_MIDDLE_AND_BELOW
+ IDC_FS = IDC_FULL_SURROUND
+ IDC_FA = IDC_SURROUND_FROM_ABOVE
+ IDC_FB = IDC_SURROUND_FROM_BELOW
+ IDC_FL = IDC_SURROUND_FROM_LEFT
+ IDC_UL = IDC_SURROUND_FROM_UPPER_LEFT
+ IDC_UR = IDC_SURROUND_FROM_UPPER_RIGHT
+ IDC_LL = IDC_SURROUND_FROM_LOWER_LEFT
+ IDC_OV = IDC_OVERLAID
+
+ IDC_LMR = IDC_LM
+ IDC_AMB = IDC_AM
+ IDC_FUL = IDC_UL
+ IDC_FUR = IDC_UR
+ IDC_FLL = IDC_LL
+ IDC_O = IDC_OV
+
+ # Reads the plain-text IDS source files named in IDS_LIST, relative to the
+ # directory returned by Config.instance.ids_dir.  Singleton subclass of DB
+ # (DB is defined outside this file).
+ class IDS_TEXT_DB < DB #======================================================================
+ include Singleton
+ # Names of the IDS text files, obtained by splitting the literal on
+ # whitespace.  Entries beginning with "#" remain in the array and are
+ # skipped later by each_file.
+ IDS_LIST = "
+IDS-UCS-Basic.txt
+#IDS-UCS-Compat-Supplement.txt
+#IDS-UCS-Compat.txt
+IDS-UCS-Ext-A.txt
+IDS-UCS-Ext-B-1.txt
+IDS-UCS-Ext-B-2.txt
+IDS-UCS-Ext-B-3.txt
+IDS-UCS-Ext-B-4.txt
+IDS-UCS-Ext-B-5.txt
+IDS-UCS-Ext-B-6.txt
+IDS-JIS-X0208-1990.txt
+IDS-Daikanwa-01.txt
+IDS-Daikanwa-02.txt
+IDS-Daikanwa-03.txt
+IDS-Daikanwa-04.txt
+IDS-Daikanwa-05.txt
+IDS-Daikanwa-06.txt
+IDS-Daikanwa-07.txt
+IDS-Daikanwa-08.txt
+IDS-Daikanwa-09.txt
+IDS-Daikanwa-10.txt
+IDS-Daikanwa-11.txt
+IDS-Daikanwa-12.txt
+IDS-Daikanwa-dx.txt
+IDS-Daikanwa-ho.txt
+IDS-CBETA.txt
+".split
+ # Sets @ids_list, @chars, @dir and the @glob/@pre/@post paths, makes sure a
+ # directory exists, then calls open_dbs.  open_dbs is not defined in this
+ # class; presumably it is inherited from DB and reads @glob/@pre/@post --
+ # TODO confirm.
+ def initialize()
+ super
+ @ids_list = IDS_LIST
+ @chars = []
+
+ @dir = Config.instance.ids_dir
+
+ @glob, @pre, @post = "#{@dir}/db/*", "#{@dir}/db/", ""
+ # NOTE(review): File.dirname drops the trailing "db/" component of @pre, so
+ # the directory checked and created here is the parent of the db directory,
+ # not the db directory itself -- confirm that this is intended.
+ dir = File.dirname(@pre)
+ Dir.mkdir(dir) unless FileTest.exist?(dir)
+ open_dbs()
+ end
+
+ # Yields @dir+file for every entry of @ids_list that does not start with
+ # "#".  Does nothing when called without a block.
+ # NOTE(review): no path separator is inserted between @dir and file; this
+ # assumes @dir either ends with "/" or is an object whose + joins path
+ # components -- confirm against Config#ids_dir.
+ def each_file()
+ return unless block_given?
+ @ids_list.each {|file|
+ next if file =~ /^#/
+ yield(@dir+file)
+ }
+ end
+
+ # Reads +file+ line by line and yields the first three whitespace-separated
+ # fields of each line as (code, char, ids).  Fields missing from a line are
+ # yielded as nil.  A block is required.
+ def each_line(file)
+ open(file){|f|
+ while line = f.gets
+ next if line =~ /^;/ # skip comment lines
+ line.chomp!
+ code, char, ids = line.split
+ yield(code, char, ids)
+ end
+ }
+ end
+ # For every source file, writes a same-named file into "../ids-new/"
+ # (relative to the source file's directory) in which the ids column is
+ # replaced by Character.get(ch).decompose.  Prints [file, newfile] via p.
+ def dump_text_all
+ each_file {|file|
+ dir = File.dirname(file) + "/../ids-new/"
+ Dir.mkdir(dir) if ! FileTest.directory?(dir)
+ newfile = dir + File.basename(file)
+ p [file, newfile]
+ open(newfile, "w"){|out|
+ out.binmode.sync = true
+ each_line(file){|code, ch, ids|
+ char = Character.get(ch)
+ # the ids read from the file is discarded in favour of the decomposition
+ ids = char.decompose
+ out.print "#{code} #{ch} #{ids}\n"
+ }
+ }
+ }
+ end
+ # For every source file, writes a same-named file into "../ids-error"
+ # (relative to the source file's directory) listing each line whose
+ # character has a non-nil "ids-error" attribute, with that value appended.
+ def make_ids_error
+ each_file {|file|
+ dir = File.dirname(file) + "/../ids-error"
+ Dir.mkdir(dir) unless FileTest.exist?(dir)
+ errfile = dir + "/" + File.basename(file)
+ # p [file, errfile]
+ open(errfile, "w"){|out|
+ out.binmode.sync = true
+ each_line(file){|code, ch, ids|
+ char = Character.get(ch)
+ ids_error = char["ids-error"]
+ next if ids_error.nil?
+ out.print "#{code} #{ch} #{ids} #{ids_error}\n"
+ }
+ }
+ }
+ end
+ end
+
+ class IDS_DB < DB # BDB化したIDS DBを扱う
+ include Singleton
+ def initialize
+ @dbs = CharDB.instance
+ end
+ def make_ids_db
+ db = IDS_TEXT_DB.instance
+ db.each_file {|file|
+ @char_counter = 0
+ @same_ids_counter = 0
+ @good_ids_counter = 0
+ @conflict_ids_counter = 0
+ db.each_line(file){|code, ch, ids|
+ @char_counter += 1
+
+ ids = "" if ids == nil
+ next if ids == "" #IDSが定義されていない場合は、さっくりと無視するべしよ。
+
+ charimg = Character.get(ch) #実体参照である可能性がある
+
+ next if code =~ /'$/ || code =~ /"$/ #大漢和番号のダッシュ付きは無視する
+ char = Character.get("&"+code+";") #code表記を元に実体参照を作って解釈する
+ if char.nil? || char.to_s == "" #うまく文字にならなかった
+ print "char == null #{char.inspect} #{code} #{ch} #{ids}\n" unless code =~ /^M-/ || code =~ /^CB/
+ #大漢和、CBETA以外の場合は、エラーメッセージ。
+ next
+ end
+ if char != charimg #code表記と文字が一致していない?
+ unless code =~ /^M-/ || code =~ /^MH-/ || code =~ /^CB/ #食い違っていて当然であるので何もしない
+ print "unknown char #{char.inspect} #{code} #{ch} #{ids}\n"
+ next #それ以外の場合はエラーメッセージをだして、次へ。
+ end
+ end
+ #next if !char.has_attribute? #isolated characterはまぎれこませない。
+
+ ids.de_er! #実体参照を解除する
+ next if ids == char.to_s #もし文字とまったく一緒なら、意味が無いので情報を持たない
+ next if ids.char_length == 1
+
+ idstree = IDS_Tree.new(ids)
+ c = idstree.check_integrity
+ c = "contains self" if ids.include?(char.to_s)
+ if c #ちょっとでもエラーがある場合は、
+ char["ids-error"] = c #エラーを記録して、データとしては保持しない
+ next
+ end
+
+ if char["ids"].nil? || char["ids"] == "" #元々IDSが無かった場合は、
+ char["ids"] = ids #普通に代入すればそれでいいです。
+ @good_ids_counter += 1
+ else #しかしいままでにすでにIDSが定義されていた場合は?
+ if char["ids"] == ids #新しいIDSと古いIDSが完全に一致するなら無視しましょう。
+ @same_ids_counter += 1
+ else #しかしいままでのIDSと新しいIDSが食い違った場合は?
+ @conflict_ids_counter += 1
+ # print "conflict #{char.inspect} #{code} #{ids} #{char["ids"]}\n"
+ end
+ end
+ }
+ print "#{file} #{@char_counter} #{@same_ids_counter} #{@conflict_ids_counter} #{@good_ids_counter}\n"
+ CharacterFactory.instance.reset()
+ }
+ @dbs.dump_db("ids-error") #テキスト化する
+ @dbs.dump_db("ids") #テキスト化する
+ end
+ def make_ids_reverse
+ h = Hash.new
+ @dbs.each("ids") {|k, v|
+ char = k.char
+ ids = char.decompose
+ h[ids] = "" if h[ids].nil?
+ h[ids] += k #追加する
+ }
+ h.each {|k, v|
+ h[k] = char_sort(v) #文字の順番を、よく使うっぽいものからの順番にする
+ }
+ h.delete_if {|k, v| #h[k]が""になる可能性もあるが、それはkeyとして入れないことにする。
+ v == ""
+ }
+ print "length #{h.length}\n"
+ cdb = CodesysDB.instance
+ cdb.make_db_no_question_mark("ids", h)
+ cdb.open_db("ids") #これが無いと、dump_dbされません。
+ cdb.dump_db("ids")
+ end
+ def char_sort(composed)
+ return composed if composed.char_length == 1
+ ar = composed.to_a
+ arorg = ar.dup
+ ar2 = []
+ ar.dup.each {|ch|
+ char = ch.char
+ if char.char_id < 0xfffff #Unicodeっぽい?
+ ar2 << ch
+ ar.delete(ch)
+ end
+ }
+ if 0 < ar.length
+ EntityReference.each_codesys{|codesys, er_prefix, keta, numtype|
+ ar.each {|ch|
+ char = ch.char
+ v = char[codesys]
+ # p [codesys, v] if v
+ if v #EntityReferenceの順番に準拠する。
+ ar2 << ch
+ ar.delete(ch)
+ end
+ }
+ }
+ end
+ if 0 < ar.length
+ # p ["yokuwakaran character", ar, ar[0].inspect_all, arorg]
+ EntityReference.each_codesys{|codesys, er_prefix, keta, numtype|
+ ar.dup.each {|ch|
+ char = ch.char
+ v = char[codesys]
+ # p [codesys, v] if v
+ }
+ }
+ end
+ return ar2.join("")
+ end
+ def dump_ids_duplicated
+ open("ids-duplicated.txt", "w"){|out|
+ #out.binmode
+ CodesysDB.instance.each("ids") {|k, v|
+ if v.nil?
+ out.print "nil #{k} #{v}\n"
+ next
+ end
+ n = v.char_length
+ next if n == 1
+ out.print "#{n} #{k} #{v}"
+ v.each_char {|ch|
+ char = ch.char
+ out.print " #{char.inspect}"
+ }
+ out.print "\n"
+ }
+ }
+ end
+ def make_ids_aggregated
+ @dbs.each("ids") {|k, v|
+ char = k.char
+ ids = char.decompose
+ ag = ids.aggregate
+ char["ids-aggregated"] = ag
+ }
+ @dbs.dump_db("ids-aggregated")
+ end
+ def dump_ids_aggregated
+ open("ids-aggregated.txt", "w"){|out|
+ #out.binmode
+ @dbs.each("ids") {|k, v|
+ char = k.char
+ ids = char["ids"]
+ ag = char["ids-aggregated"]
+ out.print "#{char.to_s} #{ag} #{ids}\n" if ids != ag
+ }
+ }
+ end
+ def make_ids_parts
+ @dbs.each("ids") {|k, v|
+ char = k.char
+ pids = char.to_s
+ ar = []
+ counter = 0
+ loop {
+ ids = pids.decompose
+ break if ids == pids #これ以上分割できないようだったら終了〜。
+ ar += ids.to_a
+ counter += 1
+ p [char.to_s, pids, ids, ar] if 10 < counter #これは何かおかしいぞと
+ pids = ids
+ }
+ ar.sort!
+ ar.uniq!
+ #やっぱりIDS文字も加えることにする. by eto 2003-02-05
+ # ar.delete_if {|ch|
+ # ch.char.is_ids? #IDS文字はまぎれこませない。
+ # }
+ str = ar.join("")
+ char["ids-parts"] = str
+ }
+ @dbs.dump_db("ids-parts")
+ end
+ def make_ids_contained
+ h = Hash.new
+ @dbs.each("ids-parts") {|k, v|
+ char = k.char
+ parts = char.ids_parts
+ parts.each_char {|ch|
+ # part = ch.char
+ h[ch] = [] if h[ch].nil?
+ h[ch] << k
+ # h[ch] += k
+ # part["ids-contained"] = "" if part["ids-contained"].nil?
+ # part["ids-contained"] += k
+ }
+ }
+ h.each {|k, v|
+ char = k.char
+ v.sort!
+ char["ids-contained"] = v.join("")
+
+ }
+ @dbs.dump_db("ids-contained")
+ end
+ def make_ids_decomposed
+ @dbs.each("ids") {|k, v|
+ char = k.char
+ de= char.decompose_all
+ char["ids-decomposed"] = de
+ }
+ @dbs.dump_db("ids-decomposed")
+ end
+ end
+
+ class Node < Array #==================================木構造の中の一つの枝
+ def initialize(nodeleaf=nil, nodenum=nil)
+ super()
+ @nodeleaf = nodeleaf
+ @nodenum = nodenum
+ if @nodeleaf
+ original_add(@nodeleaf)
+ end
+ end
+ attr_reader :nodenum
+ alias original_add <<
+ private :original_add
+ def <<(obj)
+ original_add(obj)
+ @nodenum -= 1 if @nodenum
+ end
+ def nodes
+ ar = []
+ ar << self.to_s
+ self.each {|n|
+ ar += n.nodes if n.is_a? Node
+ }
+ return ar
+ end
+ end
+
+ class Tree #==================================================木構造を扱う
+ def initialize()
+ @root = Node.new()
+ @stack = [@root]
+ @leafnum = 0
+ @depth = 1 #stackの深さが最大になったところの値、木構造が無いときは1となる
+ end
+ def depth() @depth - 1 end
+ def add_node(nodeleaf=nil, nodenum=nil) #枝を追加
+ new_node = Node.new(nodeleaf, nodenum)
+ @stack.last << new_node
+ @stack << new_node
+ if @depth < @stack.length
+ @depth = @stack.length
+ end
+ self
+ end
+ def end_node() #この枝は終り
+ @stack.pop
+ self
+ end
+ def add_leaf(a) #葉を追加
+ @stack.last << a
+ end_check()
+ self
+ end
+ def end_check()
+ n = @stack.last.nodenum
+ if n && n == 0
+ end_node()
+ end_check() #再帰
+ end
+ end
+ def check_integrity
+ n = @stack.last.nodenum
+ return nil if @root.length == 0 #no tree is good tree
+ return "unmatch leaves" if n && n != 0
+ return "extra nodes" if @root.first.is_a?(Node) && @root.length != 1
+ return "extra leaves" if @root.length != 1
+ return nil
+ end
+ def nodes
+ r = @root.nodes
+ r.shift
+ r
+ end
+ def sub_nodes
+ r = nodes
+ r.shift
+ r
+ end
+ def to_s() @root.to_s end
+ def inspect() @root.inspect end
+ end
+
+ class IDS_Tree < Tree
+ def initialize(str)
+ @str = str
+ super()
+ parse()
+ end
+ def parse()
+ @str.each_char {|ch|
+ char = Character.new(ch)
+ if is_ids?(char)
+ add_node(char, ids_operator_argc(char))
+ else
+ add_leaf(char)
+ end
+ }
+ end
+ def is_ids?(obj)
+ return true if "+*".include?(obj.to_s) #テスト用ですかね
+ return true if obj.is_ids?
+ return false
+ end
+ def ids_operator_argc(obj)
+ return obj.ids_operator_argc if 0 < obj.ids_operator_argc
+ return 2 #テスト用ってことで
+ end
+ def check_integrity
+ r = super
+ return r if r #不完全がすでにわかっているならreturn
+ return "contains ques" if @str =~ /\?/ #?が含まれている?
+ return nil
+ end
+ end
+
+ class IDS #=========================================IDSそのものを扱うclass
+ def initialize(str) #IDS文字列をうけとる。
+ @str = str
+ end
+ def parse
+ end
+ def parse_x #柔軟型のParse. IDSキャラクターが前にきてなくてもよい。などなど。
+ end
+ end
+
+ class Counter
+ #使い方
+ #counter = Counter.new(50) { exit }
+ #counter.count
+ def initialize(max)
+ @max = max
+ @count = 0
+ @proc = proc
+ end
+ def count
+ @count += 1
+ if @max <= @count
+ @proc.call
+ end
+ end
+ end
+
+end