# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-$LOAD_PATH << "../../lib" if $0 == __FILE__
-require "chise/db"
+require "chise/idstree"
module CHISE
-# IDC_LEFT_TO_RIGHT = "⿰"
-# IDC_ABOVE_TO_BELOW = "⿱"
-# IDC_LEFT_TO_MIDDLE_AND_RIGHT = "⿲"
-# IDC_ABOVE_TO_MIDDLE_AND_BELOW = "⿳"
-# IDC_FULL_SURROUND = "⿴"
-# IDC_SURROUND_FROM_ABOVE = "⿵"
-# IDC_SURROUND_FROM_BELOW = "⿶"
-# IDC_SURROUND_FROM_LEFT = "⿷"
-# IDC_SURROUND_FROM_UPPER_LEFT = "⿸"
-# IDC_SURROUND_FROM_UPPER_RIGHT = "⿹"
-# IDC_SURROUND_FROM_LOWER_LEFT = "⿺"
-# IDC_OVERLAID = "⿻"
-
- IDC_LEFT_TO_RIGHT = "\342\277\260" #2FF0
- IDC_ABOVE_TO_BELOW = "\342\277\261"
- IDC_LEFT_TO_MIDDLE_AND_RIGHT = "\342\277\262"
- IDC_ABOVE_TO_MIDDLE_AND_BELOW = "\342\277\263"
- IDC_FULL_SURROUND = "\342\277\264" #2FF4
- IDC_SURROUND_FROM_ABOVE = "\342\277\265"
- IDC_SURROUND_FROM_BELOW = "\342\277\266"
- IDC_SURROUND_FROM_LEFT = "\342\277\267"
- IDC_SURROUND_FROM_UPPER_LEFT = "\342\277\270"
- IDC_SURROUND_FROM_UPPER_RIGHT = "\342\277\271"
- IDC_SURROUND_FROM_LOWER_LEFT = "\342\277\272"
- IDC_OVERLAID = "\342\277\273"
-
- IDC_LR = IDC_LEFT_TO_RIGHT
- IDC_AB = IDC_ABOVE_TO_BELOW
- IDC_LM = IDC_LEFT_TO_MIDDLE_AND_RIGHT
- IDC_AM = IDC_ABOVE_TO_MIDDLE_AND_BELOW
- IDC_FS = IDC_FULL_SURROUND
- IDC_FA = IDC_SURROUND_FROM_ABOVE
- IDC_FB = IDC_SURROUND_FROM_BELOW
- IDC_FL = IDC_SURROUND_FROM_LEFT
- IDC_UL = IDC_SURROUND_FROM_UPPER_LEFT
- IDC_UR = IDC_SURROUND_FROM_UPPER_RIGHT
- IDC_LL = IDC_SURROUND_FROM_LOWER_LEFT
- IDC_OV = IDC_OVERLAID
-
- IDC_LMR = IDC_LM
- IDC_AMB = IDC_AM
- IDC_FUL = IDC_UL
- IDC_FUR = IDC_UR
- IDC_FLL = IDC_LL
- IDC_O = IDC_OV
-
- class IDS_TEXT_DB < DB #======================================================================
- include Singleton
- IDS_LIST = "
-IDS-UCS-Basic.txt
-#IDS-UCS-Compat-Supplement.txt
-#IDS-UCS-Compat.txt
-IDS-UCS-Ext-A.txt
-IDS-UCS-Ext-B-1.txt
-IDS-UCS-Ext-B-2.txt
-IDS-UCS-Ext-B-3.txt
-IDS-UCS-Ext-B-4.txt
-IDS-UCS-Ext-B-5.txt
-IDS-UCS-Ext-B-6.txt
-IDS-JIS-X0208-1990.txt
-IDS-Daikanwa-01.txt
-IDS-Daikanwa-02.txt
-IDS-Daikanwa-03.txt
-IDS-Daikanwa-04.txt
-IDS-Daikanwa-05.txt
-IDS-Daikanwa-06.txt
-IDS-Daikanwa-07.txt
-IDS-Daikanwa-08.txt
-IDS-Daikanwa-09.txt
-IDS-Daikanwa-10.txt
-IDS-Daikanwa-11.txt
-IDS-Daikanwa-12.txt
-IDS-Daikanwa-dx.txt
-IDS-Daikanwa-ho.txt
-IDS-CBETA.txt
-".split
- def initialize()
- super
- @ids_list = IDS_LIST
- @chars = []
-
- @dir = Config.instance.ids_dir
-
- @glob, @pre, @post = "#{@dir}/db/*", "#{@dir}/db/", ""
- dir = File.dirname(@pre)
- Dir.mkdir(dir) unless FileTest.exist?(dir)
- open_dbs()
- end
-
- def each_file()
- return unless block_given?
- @ids_list.each {|file|
- next if file =~ /^#/
- yield(@dir+file)
+ IDC_0 = "\342\277\260" # bytes E2 BF B0 (octal escapes) = UTF-8 encoding of U+2FF0
+ IDC_1 = "\342\277\261" # U+2FF1
+ IDC_2 = "\342\277\262" # U+2FF2
+ IDC_3 = "\342\277\263" # U+2FF3
+ IDC_4 = "\342\277\264" # U+2FF4
+ IDC_5 = "\342\277\265" # U+2FF5
+ IDC_6 = "\342\277\266" # U+2FF6
+ IDC_7 = "\342\277\267" # U+2FF7
+ IDC_8 = "\342\277\270" # U+2FF8
+ IDC_9 = "\342\277\271" # U+2FF9
+ IDC_A = "\342\277\272" # U+2FFA
+ IDC_B = "\342\277\273" # U+2FFB
+
+ IDC_LEFT_TO_RIGHT = IDC_0 # descriptive aliases: each refers to the same String object as the numbered constant
+ IDC_ABOVE_TO_BELOW = IDC_1
+ IDC_LEFT_TO_MIDDLE_AND_RIGHT = IDC_2 # takes three operands (see idc_argument_number)
+ IDC_ABOVE_TO_MIDDLE_AND_BELOW = IDC_3 # takes three operands (see idc_argument_number)
+ IDC_FULL_SURROUND = IDC_4
+ IDC_SURROUND_FROM_ABOVE = IDC_5
+ IDC_SURROUND_FROM_BELOW = IDC_6
+ IDC_SURROUND_FROM_LEFT = IDC_7
+ IDC_SURROUND_FROM_UPPER_LEFT = IDC_8
+ IDC_SURROUND_FROM_UPPER_RIGHT = IDC_9
+ IDC_SURROUND_FROM_LOWER_LEFT = IDC_A
+ IDC_OVERLAID = IDC_B
+
+ class IDS
+ def initialize(ids) # ids: the IDS text to wrap; expected to be String-like (aggregate calls gsub on it)
+ @ids = ids
+ @ids.freeze # @ids is the very object passed in, so the caller's object becomes frozen as well
+ end
+
+ def tree() IDS_Tree.new(@ids); end # builds a fresh IDS_Tree from @ids on every call (nothing is cached)
+
+ def compose # looks @ids up through the "ids" table; returns the matching character as a string, or "" when nothing usable is found
+ ids = @ids
+ cd = ChiseDB.instance
+ ct = cd.get_by_ids_db("ids")
+ cid = ct.decode(ids) # presumably the character id registered for this IDS; nil is handled as "not found" -- TODO confirm decode's contract
+ return "" if cid.nil?
+ composed = Character.get(cid).to_s
+ return "" if composed.nil? # NOTE(review): looks unreachable since composed comes from to_s -- confirm Character#to_s never returns nil
+ return "" if composed.char_length == 0
+ return composed if composed.char_length == 1
+ composed.each_char {|ch| # more than one character: the first element yielded is returned
+ char = ch.char # the value is only referenced by the disabled has_attribute? check below
+ #return ch if char.has_attribute?
+ return ch
}
+ return ""
end
- def each_line(file)
- open(file){|f|
- while line = f.gets
- next if line =~ /^;/ #コメントはとばす
- line.chomp!
- code, char, ids = line.split
- yield(code, char, ids)
- end
- }
- end
- def dump_text_all
- each_file {|file|
- dir = File.dirname(file) + "/../ids-new/"
- Dir.mkdir(dir) if ! FileTest.directory?(dir)
- newfile = dir + File.basename(file)
- p [file, newfile]
- open(newfile, "w"){|out|
- out.binmode.sync = true
- each_line(file){|code, ch, ids|
- char = Character.get(ch)
- ids = char.decompose
- out.print "#{code} #{ch} #{ids}\n"
- }
- }
- }
- end
- def make_ids_error
- each_file {|file|
- dir = File.dirname(file) + "/../ids-error"
- Dir.mkdir(dir) unless FileTest.exist?(dir)
- errfile = dir + "/" + File.basename(file)
- # p [file, errfile]
- open(errfile, "w"){|out|
- out.binmode.sync = true
- each_line(file){|code, ch, ids|
- char = Character.get(ch)
- ids_error = char["ids-error"]
- next if ids_error.nil?
- out.print "#{code} #{ch} #{ids} #{ids_error}\n"
- }
- }
+ def aggregate # returns @ids with composable sub-nodes replaced by the character compose finds for them
+ # Take each sub part of String.
+ # If you can aggregate the sub part, aggregate it.
+ #tree = IDS_Tree.new(@ids)
+ tree = self.tree
+ return @ids if tree.depth <= 1 # no sub_node
+ tree.sub_nodes.each {|node|
+ c = node.to_ids.compose # to_ids is defined elsewhere; presumably wraps the node text in an IDS -- TODO confirm
+ next if c.nil? || c == "" # nothing composed for this sub-node, try the next one
+ # print "#{@ids} #{node} #{c}\n"
+ # p [@ids, node, c]
+ n = @ids.gsub(node, c) # assuming node is a String, gsub replaces every occurrence of it, not only the first
+ return n.to_ids.aggregate # recurse on the substituted text; this call stops at the first composable sub-node
}
+ @ids
end
end
- class IDS_DB < DB # BDB化したIDS DBを扱う
- include Singleton
- def initialize
- @dbs = CharDB.instance
+ module StringIDS
+ def decompose # string-level decompose: runs decompose on the object ch.char gives for each character
+ map_char {|ch| ch.char.decompose } # map_char and char are defined elsewhere; presumably the per-character results are joined into one string -- TODO confirm
end
- def make_ids_db
- db = IDS_TEXT_DB.instance
- db.each_file {|file|
- @char_counter = 0
- @same_ids_counter = 0
- @good_ids_counter = 0
- @conflict_ids_counter = 0
- db.each_line(file){|code, ch, ids|
- @char_counter += 1
-
- ids = "" if ids == nil
- next if ids == "" #IDSが定義されていない場合は、さっくりと無視するべしよ。
-
- charimg = Character.get(ch) #実体参照である可能性がある
-
- next if code =~ /'$/ || code =~ /"$/ #大漢和番号のダッシュ付きは無視する
- char = Character.get("&"+code+";") #code表記を元に実体参照を作って解釈する
- if char.nil? || char.to_s == "" #うまく文字にならなかった
- print "char == null #{char.inspect} #{code} #{ch} #{ids}\n" unless code =~ /^M-/ || code =~ /^CB/
- #大漢和、CBETA以外の場合は、エラーメッセージ。
- next
- end
- if char != charimg #code表記と文字が一致していない?
- unless code =~ /^M-/ || code =~ /^MH-/ || code =~ /^CB/ #食い違っていて当然であるので何もしない
- print "unknown char #{char.inspect} #{code} #{ch} #{ids}\n"
- next #それ以外の場合はエラーメッセージをだして、次へ。
- end
- end
- #next if !char.has_attribute? #isolated characterはまぎれこませない。
-
- ids.de_er! #実体参照を解除する
- next if ids == char.to_s #もし文字とまったく一緒なら、意味が無いので情報を持たない
- next if ids.char_length == 1
-
- idstree = IDS_Tree.new(ids)
- c = idstree.check_integrity
- c = "contains self" if ids.include?(char.to_s)
- if c #ちょっとでもエラーがある場合は、
- char["ids-error"] = c #エラーを記録して、データとしては保持しない
- next
- end
- if char["ids"].nil? || char["ids"] == "" #元々IDSが無かった場合は、
- char["ids"] = ids #普通に代入すればそれでいいです。
- @good_ids_counter += 1
- else #しかしいままでにすでにIDSが定義されていた場合は?
- if char["ids"] == ids #新しいIDSと古いIDSが完全に一致するなら無視しましょう。
- @same_ids_counter += 1
- else #しかしいままでのIDSと新しいIDSが食い違った場合は?
- @conflict_ids_counter += 1
- # print "conflict #{char.inspect} #{code} #{ids} #{char["ids"]}\n"
- end
- end
- }
- print "#{file} #{@char_counter} #{@same_ids_counter} #{@conflict_ids_counter} #{@good_ids_counter}\n"
- CharacterFactory.instance.reset()
- }
- @dbs.dump_db("ids-error") #テキスト化する
- @dbs.dump_db("ids") #テキスト化する
- end
- def make_ids_reverse
- h = Hash.new
- @dbs.each("ids") {|k, v|
- char = k.char
- ids = char.decompose
- h[ids] = "" if h[ids].nil?
- h[ids] += k #追加する
- }
- h.each {|k, v|
- h[k] = char_sort(v) #文字の順番を、よく使うっぽいものからの順番にする
- }
- h.delete_if {|k, v| #h[k]が""になる可能性もあるが、それはkeyとして入れないことにする。
- v == ""
- }
- print "length #{h.length}\n"
- cdb = CodesysDB.instance
- cdb.make_db_no_question_mark("ids", h)
- cdb.open_db("ids") #これが無いと、dump_dbされません。
- cdb.dump_db("ids")
- end
- def char_sort(composed)
- return composed if composed.char_length == 1
- ar = composed.to_a
- arorg = ar.dup
- ar2 = []
- ar.dup.each {|ch|
- char = ch.char
- if char.char_id < 0xfffff #Unicodeっぽい?
- ar2 << ch
- ar.delete(ch)
- end
- }
- if 0 < ar.length
- EntityReference.each_codesys{|codesys, er_prefix, keta, numtype|
- ar.each {|ch|
- char = ch.char
- v = char[codesys]
- # p [codesys, v] if v
- if v #EntityReferenceの順番に準拠する。
- ar2 << ch
- ar.delete(ch)
- end
- }
- }
- end
- if 0 < ar.length
- # p ["yokuwakaran character", ar, ar[0].inspect_all, arorg]
- EntityReference.each_codesys{|codesys, er_prefix, keta, numtype|
- ar.dup.each {|ch|
- char = ch.char
- v = char[codesys]
- # p [codesys, v] if v
- }
- }
- end
- return ar2.join("")
- end
- def dump_ids_duplicated
- open("ids-duplicated.txt", "w"){|out|
- #out.binmode
- CodesysDB.instance.each("ids") {|k, v|
- if v.nil?
- out.print "nil #{k} #{v}\n"
- next
- end
- n = v.char_length
- next if n == 1
- out.print "#{n} #{k} #{v}"
- v.each_char {|ch|
- char = ch.char
- out.print " #{char.inspect}"
- }
- out.print "\n"
- }
- }
- end
- def make_ids_aggregated
- @dbs.each("ids") {|k, v|
- char = k.char
- ids = char.decompose
- ag = ids.aggregate
- char["ids-aggregated"] = ag
- }
- @dbs.dump_db("ids-aggregated")
- end
- def dump_ids_aggregated
- open("ids-aggregated.txt", "w"){|out|
- #out.binmode
- @dbs.each("ids") {|k, v|
- char = k.char
- ids = char["ids"]
- ag = char["ids-aggregated"]
- out.print "#{char.to_s} #{ag} #{ids}\n" if ids != ag
- }
- }
- end
- def make_ids_parts
- @dbs.each("ids") {|k, v|
- char = k.char
- pids = char.to_s
- ar = []
- counter = 0
- loop {
- ids = pids.decompose
- break if ids == pids #これ以上分割できないようだったら終了〜。
- ar += ids.to_a
- counter += 1
- p [char.to_s, pids, ids, ar] if 10 < counter #これは何かおかしいぞと
- pids = ids
- }
- ar.sort!
- ar.uniq!
- #やっぱりIDS文字も加えることにする. by eto 2003-02-05
- # ar.delete_if {|ch|
- # ch.char.is_ids? #IDS文字はまぎれこませない。
- # }
- str = ar.join("")
- char["ids-parts"] = str
- }
- @dbs.dump_db("ids-parts")
- end
- def make_ids_contained
- h = Hash.new
- @dbs.each("ids-parts") {|k, v|
- char = k.char
- parts = char.ids_parts
- parts.each_char {|ch|
- # part = ch.char
- h[ch] = [] if h[ch].nil?
- h[ch] << k
- # h[ch] += k
- # part["ids-contained"] = "" if part["ids-contained"].nil?
- # part["ids-contained"] += k
- }
- }
- h.each {|k, v|
- char = k.char
- v.sort!
- char["ids-contained"] = v.join("")
-
- }
- @dbs.dump_db("ids-contained")
- end
- def make_ids_decomposed
- @dbs.each("ids") {|k, v|
- char = k.char
- de= char.decompose_all
- char["ids-decomposed"] = de
- }
- @dbs.dump_db("ids-decomposed")
+ def decompose_all # string-level decompose_all: runs decompose_all on the object ch.char gives for each character
+ map_char {|ch| ch.char.decompose_all } # map_char and char are defined elsewhere; presumably the per-character results are joined into one string -- TODO confirm
end
end
- class Node < Array #==================================木構造の中の一つの枝
- def initialize(nodeleaf=nil, nodenum=nil)
- super()
- @nodeleaf = nodeleaf
- @nodenum = nodenum
- if @nodeleaf
- original_add(@nodeleaf)
- end
- end
- attr_reader :nodenum
- alias original_add <<
- private :original_add
- def <<(obj)
- original_add(obj)
- @nodenum -= 1 if @nodenum
+ module CharacterIDC
+ def is_idc? # true when @char_id lies in 0x2ff0..0x2fff inclusive, a wider range than the twelve IDC_* constants (U+2FF0..U+2FFB)
+ 0x2ff0 <= @char_id && @char_id <= 0x2fff
end
- def nodes
- ar = []
- ar << self.to_s
- self.each {|n|
- ar += n.nodes if n.is_a? Node
- }
- return ar
+
+ def idc_argument_number # number of operands this character takes as an IDC; 0 when it is not an IDC
+ return 0 unless is_idc?
+ return 3 if @char_id == 0x2ff2 || @char_id == 0x2ff3 # the two three-part operators: left-middle-right and above-middle-below
+ return 2 # every other code point accepted by is_idc? is treated as a two-operand operator
end
end
- class Tree #==================================================木構造を扱う
- def initialize()
- @root = Node.new()
- @stack = [@root]
- @leafnum = 0
- @depth = 1 #stackの深さが最大になったところの値、木構造が無いときは1となる
- end
- def depth() @depth - 1 end
- def add_node(nodeleaf=nil, nodenum=nil) #枝を追加
- new_node = Node.new(nodeleaf, nodenum)
- @stack.last << new_node
- @stack << new_node
- if @depth < @stack.length
- @depth = @stack.length
- end
- self
+ module CharacterIDS
+ def decompose # by glyph
+ decompose_internal # by_meaning stays at its nil default, so only ids_aggregated and ids are consulted
end
- def end_node() #この枝は終り
- @stack.pop
- self
- end
- def add_leaf(a) #葉を追加
- @stack.last << a
- end_check()
- self
+
+ def decompose_by_meaning # like decompose, but ids_represent, ids_element and ids_meaning are tried first
+ decompose_internal(true)
end
- def end_check()
- n = @stack.last.nodenum
- if n && n == 0
- end_node()
- end_check() #再帰
+
+ def decompose_all
+ pde = ""
+ de = self.decompose # the start point.
+ level = 0
+ while true
+ pde = de
+ de = pde.decompose # decompose it again.
+ break if pde == de # previous is same.
+ exit if 10 < level # p ["too many recursive", self]
+ level += 1
end
+ de
end
- def check_integrity
- n = @stack.last.nodenum
- return nil if @root.length == 0 #no tree is good tree
- return "unmatch leaves" if n && n != 0
- return "extra nodes" if @root.first.is_a?(Node) && @root.length != 1
- return "extra leaves" if @root.length != 1
- return nil
- end
- def nodes
- r = @root.nodes
- r.shift
- r
- end
- def sub_nodes
- r = nodes
- r.shift
- r
- end
- def to_s() @root.to_s end
- def inspect() @root.inspect end
- end
- class IDS_Tree < Tree
- def initialize(str)
- @str = str
- super()
- parse()
- end
- def parse()
- @str.each_char {|ch|
- char = Character.new(ch)
- if is_ids?(char)
- add_node(char, ids_operator_argc(char))
- else
- add_leaf(char)
- end
- }
- end
- def is_ids?(obj)
- return true if "+*".include?(obj.to_s) #テスト用ですかね
- return true if obj.is_ids?
- return false
- end
- def ids_operator_argc(obj)
- return obj.ids_operator_argc if 0 < obj.ids_operator_argc
- return 2 #テスト用ってことで
- end
- def check_integrity
- r = super
- return r if r #不完全がすでにわかっているならreturn
- return "contains ques" if @str =~ /\?/ #?が含まれている?
- return nil
- end
- end
+ private
- class IDS #=========================================IDSそのものを扱うclass
- def initialize(str) #IDS文字列をうけとる。
- @str = str
- end
- def parse
- end
- def parse_x #柔軟型のParse. IDSキャラクターが前にきてなくてもよい。などなど。
- end
- end
+ def decompose_internal(by_meaning=nil) # returns the first usable IDS attribute of this character, or the character's own string when none applies
+ #idss = self.ids
+ #return idss if idss
+ #return k if self.is_basic_kanji?
+ #return ids if idss && 0 < ids.length && k != ids
- class Counter
- #使い方
- #counter = Counter.new(50) { exit }
- #counter.count
- def initialize(max)
- @max = max
- @count = 0
- @proc = proc
- end
- def count
- @count += 1
- if @max <= @count
- @proc.call
+ k = self.to_s # fallback result; an attribute equal to k counts as "no decomposition"
+ if by_meaning # meaning-oriented attributes are consulted first, in this order
+ ids = self.ids_represent
+ return ids if ids && 0 < ids.length && k != ids
+ ids = self.ids_element
+ return ids if ids && 0 < ids.length && k != ids
+ ids = self.ids_meaning
+ return ids if ids && 0 < ids.length && k != ids
end
+ ids = self.ids_aggregated # the aggregated form takes precedence over the plain ids attribute
+ return ids if ids && 0 < ids.length && k != ids
+ ids = self.ids
+ return ids if ids && 0 < ids.length && k != ids
+ k
+
+ #return k if ids.nil? || ids.length == 0 || k == ids
+ #if ids.char_length == 2
+ #p ["What???", k, ids, k.inspect_all]
+ ##return idsx[1] # return only the second one, perhaps?
+ #return k # i.e. there is no way to expand it into an IDS.
+ #end
+ #return k if k == ids
+ #if ids.include?(k) # workaround for a BUG with these two characters: <C5-4C4D><C6-4A37>
+ ##return ids.sub(k, "")
+ #return k # i.e. there is no way to expand it into an IDS.
+ #end
+ #return ids
end
- end
+ end
end