n.c.
author eto <eto>
Mon, 14 Jun 2004 12:43:08 +0000 (12:43 +0000)
committer eto <eto>
Mon, 14 Jun 2004 12:43:08 +0000 (12:43 +0000)
23 files changed:
chise/character.rb
chise/chisedb.rb
chise/iconv.rb
chise/ids.rb
chise/idsdb.rb
chise/idsdbmanagement.rb
chise/management.rb
chise/org-character.rb
chise/org-string.rb
chise/parser.rb
chise/rbchise.rb
chise/string.rb
chise/util.rb
test/common.rb
test/org-test-ids.rb
test/test-char.rb
test/test-iconv.rb
test/test-ids.rb
test/test-idsdb.rb
test/test-parser.rb
test/test-rbchise.rb
test/test-string.rb
test/test-util.rb

index 190792f..c133c53 100755 (executable)
@@ -4,6 +4,8 @@ require "singleton"
 require "chise/parser"
 require "chise/chisedb"
 require "chise/iconv"
+require "chise/utf8"
+require "chise/ids"
 
 module CHISE
   class CharacterFactory # generate Character object and cache them
@@ -33,13 +35,17 @@ module CHISE
   end
 
   class Character
+    include UTF8Value
+    include IDS_Module
+
     def initialize(char_id)
       raise if char_id.nil?
-      raise unless char_id.is_a?(Fixnum) # char_id sure is a Fixnum.
-      raise if char_id < 0 # char_id sure is a positive value.
+      raise unless char_id.kind_of?(Integer) # make sure char_id is Integer.
+      raise if char_id < 0 # make sure char_id is positive.
       @char_id = char_id
       @char_id.freeze
-      @utf8_mcs = CHISE.i_tou8(@char_id)
+      # @utf8_mcs = CHISE.i_tou8(@char_id)
+      @utf8_mcs = itou8(@char_id)
       @utf8_mcs.freeze
       @feature = {}
       @check_all_done = nil
@@ -106,6 +112,16 @@ module CHISE
       en.to_er(self)
     end
 
+    def is_idc?
+      0x2ff0 <= @char_id && @char_id <= 0x2fff
+    end
+
+    def idc_argument_number
+      return 0 unless is_idc?
+      return 3 if @char_id == 0x2ff2 || @char_id == 0x2ff3
+      return 2
+    end
+
     private
 
     def get_feature(f)
index b2f898b..3134fb1 100755 (executable)
@@ -9,6 +9,7 @@ module CHISE
 
     def initialize
       @ds = DataSource.new
+      @byids_db = {}
     end
 
     def location() @ds.location; end
@@ -18,5 +19,41 @@ module CHISE
     def load_feature(n, cid) @ds.load_feature(n, cid) end
     def each_feature() @ds.each_feature {|f| yield f } end
     def each_ccs() @ds.each_ccs {|c| yield c } end
+
+    def get_by_ids_db(n)
+      @byids_db[n] = ByIDS_DB.new(@ds, n) if @byids_db[n].nil?
+      @byids_db[n]
+    end
+  end
+
+  class ByIDS_DB
+    include ChiseValue
+    include TableAccessModule
+
+    def initialize(ds, name)
+      @ds, @name = ds, name
+      @category, @keyvalue = "character", "by_ids"
+      reset
+    end
+
+    def decode(ids)
+      setup_db
+      return nil if @db.nil?
+      parse_c_string(@db.get(ids))
+    end
+
+    def set_decoded_char(ids, cid)
+      setup_db(true)
+      raise "@db is nil." if @db.nil?
+      @db.put(ids, format_char_id(cid))
+    end
+
+    def each
+      setup_db
+      raise "@db is nil." if @db.nil?
+      @db.each {|k, v|
+       yield(parse_value(k), parse_c_string(v))
+      }
+    end
   end
 end
index a55b6fa..d361da6 100755 (executable)
@@ -77,29 +77,29 @@ class String
   def u16toeuc()       Iconv.iconv_to_from("EUC-JP", "UTF-16", self)   end
   def u16tosjis()      Iconv.iconv_to_from("Shift_JIS", "UTF-16", self) end
 
-  def u32to_i
-    return 0 if length == 0
-    s = self
-    return (s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3])
-  end
-
-  def u8to_i
-    u32 = self.u8tou32
-    u32.u32to_i
-  end
+#  def u32to_i
+#    return 0 if length == 0
+#    s = self
+#    return (s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3])
+#  end
+
+#  def u8to_i
+#    u32 = self.u8tou32
+#    u32.u32to_i
+#  end
 end
 
 module CHISE
-  def i_tou32(n) # convert a integer to UTF-32 String
-    raise unless n.is_a?(Integer)
-    sprintf("%c%c%c%c", (n >> 24)&0xff, (n >> 16)&0xff, (n >> 8)&0xff, n&0xff)
-  end
-
-  def i_tou8(n) # convert a integer to UTF-8 String
-    u32 = CHISE.i_tou32(n)
-    u32.u32tou8
-  end
-  module_function :i_tou32, :i_tou8
+#  def i_tou32(n) # convert a integer to UTF-32 String
+#    raise unless n.is_a?(Integer)
+#    sprintf("%c%c%c%c", (n >> 24)&0xff, (n >> 16)&0xff, (n >> 8)&0xff, n&0xff)
+#  end
+
+#  def i_tou8(n) # convert a integer to UTF-8 String
+#    u32 = CHISE.i_tou32(n)
+#    u32.u32tou8
+#  end
+#  module_function :i_tou32, :i_tou8
 end
 
 class NuUconv
index e46bc51..792c863 100755 (executable)
 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
 
-require "chise/db"
+require "chise/idstree"
 
 module CHISE
-#  IDC_LEFT_TO_RIGHT = "⿰"
-#  IDC_ABOVE_TO_BELOW = "⿱"
-#  IDC_LEFT_TO_MIDDLE_AND_RIGHT = "⿲"
-#  IDC_ABOVE_TO_MIDDLE_AND_BELOW = "⿳"
-#  IDC_FULL_SURROUND = "⿴"
-#  IDC_SURROUND_FROM_ABOVE = "⿵"
-#  IDC_SURROUND_FROM_BELOW = "⿶"
-#  IDC_SURROUND_FROM_LEFT = "⿷"
-#  IDC_SURROUND_FROM_UPPER_LEFT = "⿸"
-#  IDC_SURROUND_FROM_UPPER_RIGHT = "⿹"
-#  IDC_SURROUND_FROM_LOWER_LEFT = "⿺"
-#  IDC_OVERLAID = "⿻"
-
-  IDC_LEFT_TO_RIGHT = "\342\277\260" #2FF0
-  IDC_ABOVE_TO_BELOW = "\342\277\261"
-  IDC_LEFT_TO_MIDDLE_AND_RIGHT = "\342\277\262"
-  IDC_ABOVE_TO_MIDDLE_AND_BELOW = "\342\277\263"
-  IDC_FULL_SURROUND = "\342\277\264" #2FF4
-  IDC_SURROUND_FROM_ABOVE = "\342\277\265"
-  IDC_SURROUND_FROM_BELOW = "\342\277\266"
-  IDC_SURROUND_FROM_LEFT = "\342\277\267"
-  IDC_SURROUND_FROM_UPPER_LEFT = "\342\277\270"
-  IDC_SURROUND_FROM_UPPER_RIGHT = "\342\277\271"
-  IDC_SURROUND_FROM_LOWER_LEFT = "\342\277\272"
-  IDC_OVERLAID = "\342\277\273"
-
-  IDC_LR  = IDC_LEFT_TO_RIGHT
-  IDC_AB  = IDC_ABOVE_TO_BELOW
-  IDC_LM  = IDC_LEFT_TO_MIDDLE_AND_RIGHT
-  IDC_AM  = IDC_ABOVE_TO_MIDDLE_AND_BELOW
-  IDC_FS  = IDC_FULL_SURROUND
-  IDC_FA  = IDC_SURROUND_FROM_ABOVE
-  IDC_FB  = IDC_SURROUND_FROM_BELOW
-  IDC_FL  = IDC_SURROUND_FROM_LEFT
-  IDC_UL  = IDC_SURROUND_FROM_UPPER_LEFT
-  IDC_UR  = IDC_SURROUND_FROM_UPPER_RIGHT
-  IDC_LL  = IDC_SURROUND_FROM_LOWER_LEFT
-  IDC_OV  = IDC_OVERLAID
-
-  IDC_LMR = IDC_LM
-  IDC_AMB = IDC_AM
-  IDC_FUL = IDC_UL
-  IDC_FUR = IDC_UR
-  IDC_FLL = IDC_LL
-  IDC_O   = IDC_OV
-
-  class IDS_TEXT_DB < DB
-    include Singleton
-
-    IDS_LIST = "
-IDS-UCS-Basic.txt
-#IDS-UCS-Compat-Supplement.txt
-#IDS-UCS-Compat.txt
-IDS-UCS-Ext-A.txt
-IDS-UCS-Ext-B-1.txt
-IDS-UCS-Ext-B-2.txt
-IDS-UCS-Ext-B-3.txt
-IDS-UCS-Ext-B-4.txt
-IDS-UCS-Ext-B-5.txt
-IDS-UCS-Ext-B-6.txt
-IDS-JIS-X0208-1990.txt
-IDS-Daikanwa-01.txt
-IDS-Daikanwa-02.txt
-IDS-Daikanwa-03.txt
-IDS-Daikanwa-04.txt
-IDS-Daikanwa-05.txt
-IDS-Daikanwa-06.txt
-IDS-Daikanwa-07.txt
-IDS-Daikanwa-08.txt
-IDS-Daikanwa-09.txt
-IDS-Daikanwa-10.txt
-IDS-Daikanwa-11.txt
-IDS-Daikanwa-12.txt
-IDS-Daikanwa-dx.txt
-IDS-Daikanwa-ho.txt
-IDS-CBETA.txt
-".split
-
-    def initialize()
-      super
-      @ids_list = IDS_LIST
-      @chars = []
-
-      @dir = Config.instance.ids_dir
-      
-      @glob, @pre, @post = "#{@dir}/db/*", "#{@dir}/db/", ""
-      dir = File.dirname(@pre)
-      Dir.mkdir(dir) unless FileTest.exist?(dir)
-      open_dbs()
-    end
-
-    def each_file()
-      return unless block_given?
-      @ids_list.each {|file|
-       next if file =~ /^#/
-       yield(@dir+file)
-      }
-    end
-
-    def each_line(file)
-      open(file){|f|
-       while line = f.gets
-         next if line =~ /^;/ #コメントはとばす
-         line.chomp!
-         code, char, ids = line.split
-         yield(code, char, ids)
-       end
-      }
-    end
-
-    def dump_text_all
-      each_file {|file|
-       dir = File.dirname(file) + "/../ids-new/"
-       Dir.mkdir(dir) if ! FileTest.directory?(dir)
-       newfile = dir + File.basename(file)
-       p [file, newfile]
-       open(newfile, "w"){|out|
-         out.binmode.sync = true
-         each_line(file){|code, ch, ids|
-           char = Character.get(ch)
-           ids = char.decompose
-           out.print "#{code}  #{ch}   #{ids}\n"
-         }
-       }
-      }
-    end
-
-    def make_ids_error
-      each_file {|file|
-       dir = File.dirname(file) + "/../ids-error"
-       Dir.mkdir(dir) unless FileTest.exist?(dir)
-       errfile = dir + "/" + File.basename(file)
-       #       p [file, errfile]
-       open(errfile, "w"){|out|
-         out.binmode.sync = true
-         each_line(file){|code, ch, ids|
-           char = Character.get(ch)
-           ids_error = char["ids-error"]
-           next if ids_error.nil?
-           out.print "#{code}  #{ch}   #{ids}  #{ids_error}\n"
-         }
-       }
-      }
-    end
-  end
-
-  class IDS_DB < DB # BDB化したIDS DBを扱う
-    include Singleton
-
-    def initialize
-      @dbs = CharDB.instance
-    end
-
-    def make_ids_db
-      db = IDS_TEXT_DB.instance
-      db.each_file {|file|
-       @char_counter = 0
-       @same_ids_counter = 0
-       @good_ids_counter = 0
-       @conflict_ids_counter = 0
-       db.each_line(file){|code, ch, ids|
-         @char_counter += 1
-
-         ids = "" if ids == nil
-         next if ids == "" #IDSが定義されていない場合は、さっくりと無視するべしよ。
-
-         charimg = Character.get(ch) #実体参照である可能性がある
-
-         next if code =~ /'$/ || code =~ /"$/ #大漢和番号のダッシュ付きは無視する
-         char = Character.get("&"+code+";") #code表記を元に実体参照を作って解釈する
-         if char.nil? || char.to_s == "" #うまく文字にならなかった
-           print "char == null #{char.inspect} #{code} #{ch}   #{ids}\n" unless code =~ /^M-/ || code =~ /^CB/
-           #大漢和、CBETA以外の場合は、エラーメッセージ。
-           next
-         end
-         if char != charimg #code表記と文字が一致していない?
-           unless code =~ /^M-/ || code =~ /^MH-/ || code =~ /^CB/ #食い違っていて当然であるので何もしない
-             print "unknown char       #{char.inspect} #{code} #{ch}   #{ids}\n"
-             next #それ以外の場合はエラーメッセージをだして、次へ。
-           end
-         end
-         #next if !char.has_attribute? #isolated characterはまぎれこませない。
-
-         ids.de_er! #実体参照を解除する
-         next if ids == char.to_s #もし文字とまったく一緒なら、意味が無いので情報を持たない
-         next if ids.char_length == 1
-
-         idstree = IDS_Tree.new(ids)
-         c = idstree.check_integrity
-         c = "contains self" if ids.include?(char.to_s)
-         if c #ちょっとでもエラーがある場合は、
-           char["ids-error"] = c #エラーを記録して、データとしては保持しない
-           next
-         end
-
-         if char["ids"].nil? || char["ids"] == "" #元々IDSが無かった場合は、
-           char["ids"] = ids #普通に代入すればそれでいいです。
-           @good_ids_counter += 1
-         else #しかしいままでにすでにIDSが定義されていた場合は?
-           if char["ids"] == ids #新しいIDSと古いIDSが完全に一致するなら無視しましょう。
-             @same_ids_counter += 1
-           else #しかしいままでのIDSと新しいIDSが食い違った場合は?
-             @conflict_ids_counter += 1
-             #       print "conflict   #{char.inspect} #{code} #{ids}  #{char["ids"]}\n"
-           end
-         end
-       }
-       print "#{file}  #{@char_counter}        #{@same_ids_counter}    #{@conflict_ids_counter}        #{@good_ids_counter}\n"
-       CharacterFactory.instance.reset()
-      }
-      @dbs.dump_db("ids-error") #テキスト化する
-      @dbs.dump_db("ids") #テキスト化する
-    end
-
-    def make_ids_reverse
-      h = Hash.new
-      @dbs.each("ids") {|k, v|
-       char = k.char
-       ids = char.decompose
-       h[ids] = "" if h[ids].nil?
-       h[ids] += k #追加する
-      }
-      h.each {|k, v|
-       h[k] = char_sort(v) #文字の順番を、よく使うっぽいものからの順番にする
-      }
-      h.delete_if {|k, v| #h[k]が""になる可能性もあるが、それはkeyとして入れないことにする。
-       v == ""
-      }
-      print "length    #{h.length}\n"
-      cdb = CodesysDB.instance
-      cdb.make_db_no_question_mark("ids", h)
-      cdb.open_db("ids") #これが無いと、dump_dbされません。
-      cdb.dump_db("ids")
-    end
-
-    def char_sort(composed)
-      return composed if composed.char_length == 1
-      ar = composed.to_a
-      arorg = ar.dup
-      ar2 = []
-      ar.dup.each {|ch|
-       char = ch.char
-       if char.char_id < 0xfffff #Unicodeっぽい?
-         ar2 << ch
-         ar.delete(ch)
-       end
-      }
-      if 0 < ar.length
-       EntityReference.each_codesys{|codesys, er_prefix, keta, numtype|
-         ar.each {|ch|
-           char = ch.char
-           v = char[codesys]
-           #       p [codesys, v] if v
-           if v #EntityReferenceの順番に準拠する。
-             ar2 << ch
-             ar.delete(ch)
-           end
-         }
-       }
-      end
-      if 0 < ar.length
-       #       p ["yokuwakaran character", ar, ar[0].inspect_all, arorg]
-       EntityReference.each_codesys{|codesys, er_prefix, keta, numtype|
-         ar.dup.each {|ch|
-           char = ch.char
-           v = char[codesys]
-           #       p [codesys, v] if v
-         }
-       }
-      end
-      return ar2.join("")
-    end
-
-    def dump_ids_duplicated
-      open("ids-duplicated.txt", "w"){|out|
-       #out.binmode
-       CodesysDB.instance.each("ids") {|k, v|
-         if v.nil?
-           out.print "nil      #{k}    #{v}\n"
-           next
-         end
-         n = v.char_length
-         next if n == 1
-         out.print "#{n}       #{k}    #{v}"
-         v.each_char {|ch|
-           char = ch.char
-           out.print " #{char.inspect}"
-         }
-         out.print "\n"
-       }
-      }
-    end
-
-    def make_ids_aggregated
-      @dbs.each("ids") {|k, v|
-       char = k.char
-       ids = char.decompose
-       ag = ids.aggregate
-       char["ids-aggregated"] = ag
-      }
-      @dbs.dump_db("ids-aggregated")
-    end
-
-    def dump_ids_aggregated
-      open("ids-aggregated.txt", "w"){|out|
-       #out.binmode
-       @dbs.each("ids") {|k, v|
-         char = k.char
-         ids = char["ids"]
-         ag  = char["ids-aggregated"]
-         out.print "#{char.to_s}       #{ag}   #{ids}\n" if ids != ag
-       }
-      }
-    end
-
-    def make_ids_parts
-      @dbs.each("ids") {|k, v|
-       char = k.char
-       pids = char.to_s
-       ar = []
-       counter = 0
-       loop {
-         ids = pids.decompose
-         break if ids == pids #これ以上分割できないようだったら終了〜。
-         ar += ids.to_a
-         counter += 1
-         p [char.to_s, pids, ids, ar] if 10 < counter #これは何かおかしいぞと
-         pids = ids
-       }
-       ar.sort!
-       ar.uniq!
-       #やっぱりIDS文字も加えることにする. by eto 2003-02-05
-       #       ar.delete_if {|ch|
-       #         ch.char.is_ids? #IDS文字はまぎれこませない。
-       #       }
-       str = ar.join("")
-       char["ids-parts"] = str
-      }
-      @dbs.dump_db("ids-parts")
-    end
-
-    def make_ids_contained
-      h = Hash.new
-      @dbs.each("ids-parts") {|k, v|
-       char = k.char
-       parts = char.ids_parts
-       parts.each_char {|ch|
-         #       part = ch.char
-         h[ch] = [] if h[ch].nil?
-         h[ch] << k
-         #       h[ch] += k
-         #       part["ids-contained"] = "" if part["ids-contained"].nil?
-         #       part["ids-contained"] += k
-       }
-      }
-      h.each {|k, v|
-       char = k.char
-       v.sort!
-       char["ids-contained"] = v.join("")
-       
-      }
-      @dbs.dump_db("ids-contained")
-    end
-
-    def make_ids_decomposed
-      @dbs.each("ids") {|k, v|
-       char = k.char
-       de= char.decompose_all
-       char["ids-decomposed"] = de
-      }
-      @dbs.dump_db("ids-decomposed")
-    end
-
-  end
-
-  class Node < Array # 木構造の中の一つの枝
-    def initialize(nodeleaf=nil, nodenum=nil)
-      super()
-      @nodeleaf = nodeleaf
-      @nodenum = nodenum
-      if @nodeleaf
-       original_add(@nodeleaf)
-      end
-    end
-    attr_reader :nodenum
-
-    alias original_add <<
-      private :original_add
-
-    def <<(obj)
-      original_add(obj)
-      @nodenum -= 1 if @nodenum
-    end
-
-    def nodes
-      ar = []
-      ar << self.to_s
-      self.each {|n|
-       ar += n.nodes if n.is_a? Node
-      }
-      return ar
-    end
-
-  end
-
-  class Tree # 木構造を扱う
-    def initialize()
-      @root = Node.new()
-      @stack = [@root]
-      @leafnum = 0
-      @depth = 1 #stackの深さが最大になったところの値、木構造が無いときは1となる
-    end
-
-    def depth() @depth - 1 end
-
-    def add_node(nodeleaf=nil, nodenum=nil) #枝を追加
-      new_node = Node.new(nodeleaf, nodenum)
-      @stack.last << new_node
-      @stack << new_node
-      if @depth < @stack.length
-       @depth = @stack.length
-      end
-      self
-    end
-
-    def end_node() #この枝は終り
-      @stack.pop
-      self
-    end
-
-    def add_leaf(a) #葉を追加
-      @stack.last << a
-      end_check()
-      self
-    end
-
-    def end_check()
-      n = @stack.last.nodenum
-      if n && n == 0
-       end_node()
-       end_check() #再帰
-      end
-    end
-
-    def check_integrity
-      n = @stack.last.nodenum
-      return nil if @root.length == 0 #no tree is good tree
-      return "unmatch leaves" if n && n != 0
-      return "extra nodes" if @root.first.is_a?(Node) && @root.length != 1
-      return "extra leaves" if @root.length != 1
-      return nil
-    end
-
-    def nodes
-      r = @root.nodes
-      r.shift
-      r
-    end
-
-    def sub_nodes
-      r = nodes
-      r.shift
-      r
-    end
-
-    def to_s()    @root.to_s    end
-
-    def inspect() @root.inspect end
-  end
-
-  class IDS_Tree < Tree
+  IDC_0 = "\342\277\260"
+  IDC_1 = "\342\277\261"
+  IDC_2 = "\342\277\262"
+  IDC_3 = "\342\277\263"
+  IDC_4 = "\342\277\264"
+  IDC_5 = "\342\277\265"
+  IDC_6 = "\342\277\266"
+  IDC_7 = "\342\277\267"
+  IDC_8 = "\342\277\270"
+  IDC_9 = "\342\277\271"
+  IDC_A = "\342\277\272"
+  IDC_B = "\342\277\273"
+
+  IDC_LEFT_TO_RIGHT = IDC_0
+  IDC_ABOVE_TO_BELOW = IDC_1
+  IDC_LEFT_TO_MIDDLE_AND_RIGHT = IDC_2
+  IDC_ABOVE_TO_MIDDLE_AND_BELOW = IDC_3
+  IDC_FULL_SURROUND = IDC_4
+  IDC_SURROUND_FROM_ABOVE = IDC_5
+  IDC_SURROUND_FROM_BELOW = IDC_6
+  IDC_SURROUND_FROM_LEFT = IDC_7
+  IDC_SURROUND_FROM_UPPER_LEFT = IDC_8
+  IDC_SURROUND_FROM_UPPER_RIGHT = IDC_9
+  IDC_SURROUND_FROM_LOWER_LEFT = IDC_A
+  IDC_OVERLAID = IDC_B
+
+  class IDS_Decomposer
     def initialize(str)
       @str = str
-      super()
-      parse()
-    end
-
-    def parse()
-      @str.each_char {|ch|
-       char = Character.new(ch)
-       if is_ids?(char)
-         add_node(char, ids_operator_argc(char))
-       else
-         add_leaf(char)
-       end
-      }
-    end
-
-    def is_ids?(obj)
-      return true if "+*".include?(obj.to_s) #テスト用ですかね
-      return true if obj.is_ids?
-      return false
     end
 
-    def ids_operator_argc(obj)
-      return obj.ids_operator_argc if 0 < obj.ids_operator_argc
-      return 2 #テスト用ってことで
-    end
-
-    def check_integrity
-      r = super
-      return r if r #不完全がすでにわかっているならreturn
-      return "contains ques" if @str =~ /\?/ #?が含まれている?
-      return nil
+    def decompose
+      
     end
   end
 
-  class IDS # IDSそのものを扱うclass
-    def initialize(str) #IDS文字列をうけとる。
-      @str = str
+  module IDS_Module
+    def decompose
+      self.ids
     end
 
-    def parse
+    def decompose_all
+      
     end
 
-    def parse_x #柔軟型のParse. IDSキャラクターが前にきてなくてもよい。などなど。
-    end
   end
 
-  class Counter
-    #使い方
-    #counter = Counter.new(50) { exit }
-    #counter.count
-    def initialize(max)
-      @max = max
-      @count = 0
-      @proc = proc
-    end
-
-    def count
-      @count += 1
-      if @max <= @count
-       @proc.call
-      end
-    end
-
-  end
 end
index 07bc5ec..e9773ae 100755 (executable)
@@ -1,8 +1,98 @@
 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
 
 require "chise/char"
+require "chise/ids"
+require "chise/qp"
+require "chise/management"
 
 module CHISE
+  class IDS_DB_Management
+    def initialize
+      @cd = ChiseDB.instance
+      @idsdb = IDS_DB.instance
+    end
+
+    def check_conflict_of_ids_text
+      @idsdb.each_ccs {|ccs|
+       qp ccs
+       c = Hash.new(0)
+       h = {}
+       @idsdb.get_ccs(ccs).each_character {|char, ids|
+         c["char"] += 1
+         next if ids == char.to_s
+         next if ids.char_length == 1
+         char_id = char.char_id
+         cids = h[char_id]
+         if cids.nil? # There is no ids yet.
+           h[char_id] = ids # just set it.
+           c["good"] += 1
+         else # but, if there is already a ids?
+           if cids == ids # the two are same.
+             c["same"] += 1 # and just ignore
+           else # but, if the two are not same?
+             c["conflict"] += 1
+             puts "conflict\t#{char.to_s}\t#{ids}\t#{cids}"
+           end
+         end
+       }
+       puts "#{ccs}\t#{c['char']}\t#{c['same']}\t#{c['conflict']}\t#{c['good']}"
+      }
+    end
+
+    def store_ids_as_text
+      @idsdb.each_ccs {|ccs|
+       #qp ccs
+       @idsdb.get_ccs(ccs).each_character {|char, ids|
+         next if ids == char.to_s
+         next if ids.char_length == 1
+         char.ids_text = ids # just set it.
+       }
+      }
+      @cd.get_feature("ids-text").dump
+    end
+
+    def store_ids_de_er
+      @cd.get_feature("ids-text").each {|cid, idser|
+       char = Character.get(cid)
+       begin
+         ids = idser.de_er # parse Entity Reference
+       rescue => e
+         qp cid, idser
+         next
+       end
+       char.ids_de_er = ids # set it.
+      }
+      @cd.get_feature("ids-de-er").dump
+    end
+
+    def check_integrity_of_ids_tree
+      @cd.get_feature("ids-de-er").each {|cid, ids|
+       char = Character.get(cid)
+       idstree = IDS_Tree.new(ids)
+       begin
+         raise "contains self" if ids.include?(char.to_s)
+         idstree.check_integrity
+       rescue => e
+         #puts "#{cid}\t#{e.message}\t#{ids}"
+         char.ids_error = e.message
+         next
+       end
+       char.ids = ids # set it.
+      }
+      @cd.get_feature("ids").dump
+      @cd.get_feature("ids-error").dump
+    end
+
+    def make_by_ids_db
+      ct = @cd.get_by_ids_db("ids")
+      @cd.get_feature("ids").each {|cid, ids|
+       char = Character.get(cid)
+       ct.set_decoded_char(ids, cid)
+      }
+      ct.dump
+    end
+  end
+
   class IDS_DB
     include Singleton
 
@@ -36,6 +126,7 @@ module CHISE
       @path.open {|f|
        f.each {|line|
          next if /\A;/ =~ line # skip comment
+         line.chomp!
          code, picture, ids = line.split
          raise if code.nil?
          ids = "" if ids.nil?
@@ -44,13 +135,17 @@ module CHISE
       }
     end
 
-    def each_entry
+    def each_character
       each_line {|code, ids|
+       next if ids.nil?
+       next if ids == "" # If there is no IDS, ignore it.
+
        er = "&"+code+";"
        begin
          char = Character.get(er)
        rescue
          #qp er
+         next
        end
        next if char.nil?
        yield(char, ids)
index 93a15e5..e69de29 100755 (executable)
@@ -1,23 +0,0 @@
-# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-
-require "chise/idsdb"
-require "chise/qp"
-
-module CHISE
-  class IDS_DB_Management
-    def initialize
-      @idb = CHISE::IDS_DB.instance
-    end
-
-    def store_ids_to_bdb
-      @idb.each_ccs {|ccs|
-       #qp ccs
-       cd = @idb.get_ccs(ccs)
-       cd.each_entry {|char, ids|
-         char.ids = ids if char.ids.nil?
-       }
-      }
-    end
-
-  end
-end
index 239f05a..2e17209 100755 (executable)
@@ -6,7 +6,7 @@ require "chise/char"
 require "chise/qp"
 
 module CHISE
-  class TableAccess
+  module TableAccessModule
     def to_hash
       h = {}
       each {|k, v| h[k] = v }
@@ -15,9 +15,7 @@ module CHISE
 
     def dump
       txt = @name.path.escape.escape_win_filename.to_s+".txt"
-      #"character/feature"
       t = @ds.location+@category+@keyvalue+txt
-      qp t.to_s
       t.open("wb"){|out|
        to_hash.sort.each {|k, v|
          out.printf("%s\t%s\n", k, v)
index e238bb2..c6e2778 100755 (executable)
@@ -1,3 +1,5 @@
+module CHISE
+  class Character
     def mcs_hex() sprintf("%x", @char_id) end
 
     def char_feature_alist() check_all_database(); @features; end
@@ -16,8 +18,7 @@
     def check_database(a)
       db = CharDB.instance
       u8 = mcs_utf8()
-      v = db.get(a, u8) # u8\82Å\95\\82³\82ê\82é\95\8e\9a\82Ìa\83A\83g\83\8a\83r\83\85\81[\83g\82ð\92²\82×\82é\81B
-      v
+      db.get(a, u8) # u8\82Å\95\\82³\82ê\82é\95\8e\9a\82Ìa\83A\83g\83\8a\83r\83\85\81[\83g\82ð\92²\82×\82é\81B
     end
 
     def check_all_database() # \8c»\8dÝ\82Ì@char_id\82©\82ç\81A\95\8e\9a\83f\81[\83^\83x\81[\83X\82ð\8eQ\8fÆ\82·\82é
       return de.decompose_all(level+1) if de != self #\82È\82É\82©\95Ï\89»\82ª\82 \82Á\82½\82©\82ç\8dÄ\8bA
       return de #\82à\82¤\82±\82ê\88È\8fã\95Ï\89»\82Í\96³\82³\82»\82¤\82¾\82¼\82Æ\81B
     end
-
-    def is_ids?() 0x2ff0 <= @char_id && @char_id <= 0x2fff end
-
-    def ids_operator_argc()
-      return 0 unless is_ids?
-      return 3 if @char_id == 0x2ff2 || @char_id == 0x2ff3
-      return 2
-    end
+  end
+end
index 375a2d7..a495fe6 100755 (executable)
@@ -1,6 +1,4 @@
 class String
-  def each_character() to_a.each {|ch| yield ch.char } end
-  def char_length() to_a.length end
   def to_utf8()
     return to_a.map {|ch|
       ch.char.to_utf8
index 5a6cf7e..1570f20 100755 (executable)
@@ -1,9 +1,10 @@
 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
 
 require "chise/chisedb"
+require "chise/utf8"
 
 module CHISE
-  module EntityReference
+  module EntityReferenceModule
     PART    = "&([-+0-9A-Za-z#]+);"
     ALL     = '\A'+PART+'\Z'
     PART_RE = Regexp.new(PART)
@@ -13,7 +14,7 @@ module CHISE
     def is_er?(s)      (ALL_RE =~ s)  != nil;  end
 
     # the order is important.  The primary charset should be selectable.
-    CODESYS_TABLE = [
+    CCS_TABLE = [
       %w( =jis-x0208-1990      J90- 4 X),
       %w( =jis-x0208-1983      J83- 4 X),
       %w( =jis-x0208-1978      J78- 4 X),
@@ -24,6 +25,7 @@ module CHISE
       %w( =jis-x0213-2-2000    JX2- 4 X),
       %w( =jis-x0212           JSP- 4 X),
       %w( =big5-cdp            CDP- 4 X),
+      %w( =big5                        B-   4 X),
       %w( =cns11643-1          C1-  4 X),
       %w( =cns11643-2          C2-  4 X),
       %w( =cns11643-3          C3-  4 X),
@@ -36,12 +38,28 @@ module CHISE
       %w( =cbeta               CB   5 d),
       %w( =gt                  GT-  5 d),
       %w( =gt-k                        GT-K 5 d),
+      %w( =hanziku-1           HZK01- 4 X),
+      %w( =hanziku-2           HZK02- 4 X),
+      %w( =hanziku-3           HZK03- 4 X),
+      %w( =hanziku-4           HZK04- 4 X),
+      %w( =hanziku-5           HZK05- 4 X),
+      %w( =hanziku-6           HZK06- 4 X),
+      %w( =hanziku-7           HZK07- 4 X),
+      %w( =hanziku-8           HZK08- 4 X),
+      %w( =hanziku-9           HZK09- 4 X),
+      %w( =hanziku-10          HZK10- 4 X),
+      %w( =hanziku-11          HZK11- 4 X),
+      %w( =hanziku-12          HZK12- 4 X),
+      %w( =ruimoku-v6          RUI6-  4 X),
+      %w( =jef-china3          JC3-   4 X),
     ]
-    PRIVATE_USE_AREA = 0xe000
   end
 
   class CharacterParser
-    include EntityReference
+    include EntityReferenceModule
+    include UTF8Value
+
+    PRIVATE_USE_AREA = 0xe000
 
     def parse(c) # parse a value and return a number (MCS)
       raise "c is nil" if c.nil?
@@ -49,8 +67,9 @@ module CHISE
       if c.kind_of?(String)
        if /\A\?/ =~ c
          c = c.sub(/\A\?/, "") # remove "?" in the head
-         u4 = c.u8tou32 # translate from UTF-8 to UTF-32
-         return u4.u32to_i # translate UTF-32 to UCS number
+         #u4 = c.u8tou32 # translate from UTF-8 to UTF-32
+         #return u4.u32to_i # translate UTF-32 to UCS number
+         return u8toi(c)
        end
 
        return parse_er(c) if is_er?(c) # ER?
@@ -88,7 +107,7 @@ module CHISE
        s = s.sub(/\AI-/, "")
       end
 
-      CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype|
+      CCS_TABLE.each {|ccs, er_prefix, keta, numtype|
        if numtype == "d"
          nre = '\d'
        elsif numtype == "X"
@@ -99,6 +118,7 @@ module CHISE
 
        re = "\\A#{er_prefix}(#{nre}{#{keta},#{keta}})\\Z"
        next unless Regexp.new(re) =~ s
+       #qp s
 
        codestr = $1
        if numtype == "d"
@@ -107,8 +127,8 @@ module CHISE
          code = codestr.hex
        end
 
-       u8 = get_ccs(codesys, code)
-#      qp s, u8
+       u8 = get_ccs(ccs, code)
+       #qp ccs, s, u8
        next if u8.nil?
 
        num = parse(u8)
@@ -128,7 +148,7 @@ module CHISE
   end
 
   class EntityReferenceParser
-    include EntityReference
+    include EntityReferenceModule
 
     def de_er(s) # replace EntityReference with corresponding character.
       return s unless PART_RE =~ s # don't use contain_er? to get $1
@@ -143,15 +163,15 @@ module CHISE
   end
 
   class EntityReferenceEncoder
-    include EntityReference
+    include EntityReferenceModule
 
     def to_er(char)
       cid = char.char_id
       return "&#x%04x;" % cid if cid <=  0xffff
       return "&#x%05x;" % cid if cid <= 0xfffff
 
-      CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype|
-       code = char[codesys]
+      CCS_TABLE.each {|ccs, er_prefix, keta, numtype|
+       code = char[ccs]
        next if code.nil?
        return "&#{er_prefix}%0#{keta}#{numtype};" % code
       }
@@ -159,7 +179,7 @@ module CHISE
       "&MCS-%08X;" % cid # the last answer
     end
 
-    def to_er_by_ccs(cid, codesys) # not yet
+    def to_er_by_ccs(cid, ccs) # not yet
     end
 
   end
index c90749e..6391dc3 100755 (executable)
@@ -1,332 +1,3 @@
 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-# "rbchise.so" ext compatible library by eto 2003-0317
 
-require "bdb"
-require "pathname"
-require "fileutils"
-require "chise/util"
-
-module CHISE
-  module ChiseValue; end
-  module TableAccessModule; end
-
-  class DataSource
-    NONE = 0
-    Berkeley_DB = 1
-
-    def initialize(type=Berkeley_DB, loc=nil, subtype=0, modemask=0755)
-      @type = type
-      loc = Config.instance.db_dir if loc.nil?
-      @location = loc.path
-      @subtype = subtype
-      @modemask = modemask
-      @fdb = {}
-      @cdb = {}
-    end
-    attr_reader :type, :location, :subtype, :modemask
-
-    def get_feature(f)
-      @fdb[f] = FeatureTable.new(self, f) if @fdb[f].nil?
-      @fdb[f] 
-    end
-
-    def get_ccs(ccs)
-      @cdb[ccs] = CCSTable.new(self, ccs) if @cdb[ccs].nil?
-      @cdb[ccs] 
-    end
-
-    def each_feature
-      each_entry("character/feature") {|f| yield(f) }
-    end
-
-    def each_ccs
-      each_entry("character/by_feature") {|f| yield(f) }
-    end
-
-    def load_feature(name, cid)
-      ft = get_feature(name)
-      return nil if ft.nil?
-      ft.get_value(cid)
-    end
-
-    def decode_char(ccs, code_point)
-      ct = get_ccs(ccs)
-      return nil if ct.nil?
-      ct.decode(code_point)
-    end
-
-    private
-    def each_entry(subdir)
-      dir = @location + subdir
-      dir.each_entry {|f|
-       next if f.to_s == "." || f.to_s == ".."
-       next if f.to_s =~ /\.txt\Z/
-       yield(f.unescape_win_filename.unescape.to_s)
-      }
-    end
-  end
-
-  class TableAccess
-    def initialize(ds, name)
-      @ds, @name = ds, name
-      @db = nil
-      @access = 0
-    end
-
-    def sync
-      @db.close if @db
-      @db = nil
-      @access = 0
-    end
-    alias close sync
-
-
-    private
-    def setup_db(writable=nil)
-      setup_db_exec(writable, @category, @keyvalue)
-    end
-
-    def setup_db_exec(writable, cat, key)
-      if writable
-       sync if @access & BDB::CREATE == 0
-       @access = BDB::CREATE
-      else
-       @access = BDB::RDONLY
-      end
-
-      return if @db
-
-      begin
-       @db = AttributeTable.new(@ds.location, cat, key,
-                                @name, @access, @ds.modemask)
-      rescue
-       @db = nil
-      end
-      #raise if @db.nil?
-    end
-  end
-
-  class FeatureTable < TableAccess
-    include ChiseValue
-
-    def initialize(ds, name)
-      super
-      @category, @keyvalue = "character", "feature"
-    end
-
-    def get_value(cid)
-      setup_db
-      return nil if @db.nil?
-      parse_value(@db.get(format_char_id(cid)))
-    end
-
-    def set_value(cid, value)
-      setup_db(true)
-      return nil if @db.nil?
-      @db.put(format_char_id(cid), value)
-    end
-
-    def each
-      setup_db
-      return nil if @db.nil?
-      @db.each {|k, v|
-       yield(parse_c_string(k), v)
-      }
-    end
-  end
-
-  class CCSTable < TableAccess
-    include ChiseValue
-
-    def initialize(ds, name)
-      super
-      @category, @keyvalue = "character", "by_feature"
-    end
-
-    def decode(code_point)
-      setup_db
-      return nil if @db.nil?
-      parse_c_string(@db.get(code_point.to_s))
-    end
-
-    def set_decoded_char(code_point, cid)
-      setup_db(true)
-      return nil if @db.nil?
-      @db.put(code_point.to_s, format_char_id(cid))
-    end
-
-    def each
-      setup_db
-      return nil if @db.nil?
-      @db.each {|k, v|
-       yield(parse_value(k), parse_c_string(v))
-      }
-    end
-  end
-
-  class AttributeTable
-    def initialize(dir, cat, keytype, name, amask, mmask)
-      dbdir  = dir + cat + keytype
-      #FileUtils.mkdir_p(dbdir.to_s) unless dbdir.directory?
-      path = dbdir + name.path.escape.escape_win_filename
-#     qp path, amask, mmask
-      raise unless path.exist?
-#     @db = BDB::Hash.open(path.to_s, amask, mmask)
-      @db = BDB::Hash.open(path.to_s)
-      at_exit {
-       close
-      }
-    end
-
-    def close
-      return if @db.nil?
-      begin
-       @db.sync
-       @db.close
-      rescue
-      end
-    end
-
-    def get(k)    @db.get(k);    end
-    def put(k, v) @db.put(k, v); end
-    def each() @db.each {|k, v| yield(k, v) } end
-  end
-
-  module ChiseValue
-    def parse_value(v)
-      return v if v.nil?
-      #return v if v.kind_of?(Integer)
-      return v.to_i if /\A\d+\Z/ =~ v # number?
-      return $1 if /\A"(.+)"\Z/ =~ v # remove surrounding "
-      #return v.sub(/\A\?/, "") if v =~ /\A\?/ # remove ? in the head
-      #return parse_sexp(v) if v =~ /\A\(.+\)\Z/ # parse sexp # not yet
-      v
-    end
-
-    def parse_c_string(str)
-      return nil if str.nil?
-
-      i = 0
-      c = str[i]
-      i += 1
-      len = str.length
-
-      raise unless 2 <= len && c == ?\?
-
-      c = str[i]
-      i += 1
-
-      if (c == ?\\)
-       raise if (len < 3)
-       c = str[i]
-       i += 1
-       if (c == ?^)
-         raise if (len < 4)
-         c = str[i]
-         i += 1
-         if c == ?\?
-           return 0x7F
-         else
-           return c & (0x80 | 0x1F)
-         end
-       end
-       # raise # ?
-      end
-
-      if ( c < 0xC0 )
-       cid = c
-       counter = 0
-      elsif ( c < 0xE0 )
-       cid = c & 0x1f
-       counter = 1
-      elsif ( c < 0xF0 )
-       cid = c & 0x0f
-       counter = 2
-      elsif ( c < 0xF8 )
-       cid = c & 0x07
-       counter = 3
-      elsif ( c < 0xFC )
-       cid = c & 0x03
-       counter = 4
-      else
-       cid = c & 0x01
-       counter = 5
-      end
-
-      if (counter + 2 <= len)
-       (0...counter).each {|j|
-         cid = (cid << 6) | (str[j + i] & 0x3F)
-       }
-       return cid
-      end
-
-      raise
-    end
-
-    def format_char_id(cid)
-      case cid
-      when ?\t  then return "?\t"
-      when ?\n  then return "?\n"
-      when ?\r  then return "?\r"
-      when 0x1C then return "?\^\\"
-      end
-
-      if cid <= 0x1F
-       return "?\\^"+(?@+cid).chr
-      elsif (cid == ?\s) || (cid == ?\") ||
-         (cid == ?\#) || (cid == ?\') ||
-         (cid == ?\() || (cid == ?\)) ||
-         (cid == ?\,) || (cid == ?\.) ||
-         (cid == ?\;) || (cid == ?\?) ||
-         (cid == ?\[) || (cid == ?\\) ||
-         (cid == ?\]) || (cid == ?\`)
-       return "?\\"+cid.chr
-      elsif (cid <= 0x7E)
-       return("?"+cid.chr)
-      elsif (cid == 0x7F)
-       return "?\\^?"+0.chr
-      elsif (cid <= 0x9F)
-       dest = "?\\^"
-       dest += (((cid + ?@) >> 6) | 0xC0).chr
-       dest += (((cid + ?@) & 0x3F) | 0x80).chr
-       return dest
-      elsif (cid <= 0x7FF)
-       dest = "?  "
-       dest[1] = (cid >> 6) | 0xC0
-       dest[2] = (cid & 0x3F) | 0x80
-       return dest
-      elsif (cid <= 0xFFFF)
-       dest = "?   "
-       dest[1] =  (cid >> 12) | 0xE0
-       dest[2] = ((cid >>  6) & 0x3F) | 0x80
-       dest[3] =  (cid        & 0x3F) | 0x80
-       return dest
-      elsif (cid <= 0x1FFFFF)
-       dest = "?    "
-       dest[1] =  (cid >> 18) | 0xF0
-       dest[2] = ((cid >> 12) & 0x3F) | 0x80
-       dest[3] = ((cid >>  6) & 0x3F) | 0x80
-       dest[4] =  (cid        & 0x3F) | 0x80
-       return dest
-      elsif (cid <= 0x3FFFFFF)
-       dest = "?     "
-       dest[1] =  (cid >> 24) | 0xF8
-       dest[2] = ((cid >> 18) & 0x3F) | 0x80
-       dest[3] = ((cid >> 12) & 0x3F) | 0x80
-       dest[4] = ((cid >>  6) & 0x3F) | 0x80
-       dest[5] =  (cid        & 0x3F) | 0x80
-       return dest
-      else
-       dest = "?      "
-       dest[1] =  (cid >> 30) | 0xFC
-       dest[2] = ((cid >> 24) & 0x3F) | 0x80
-       dest[3] = ((cid >> 18) & 0x3F) | 0x80
-       dest[4] = ((cid >> 12) & 0x3F) | 0x80
-       dest[5] = ((cid >>  6) & 0x3F) | 0x80
-       dest[6] =  (cid        & 0x3F) | 0x80
-       return dest
-      end
-      raise
-    end
-  end
-end
+require "chise/libchise"
index ee96134..929c7ca 100755 (executable)
@@ -26,14 +26,24 @@ class String
     char.method_missing(mid, *args)
   end
 
+  def to_a
+    self.split(//u)
+  end
+
+  def char_length
+    to_a.length
+  end
+
   def each_char
     to_a.each {|c|
       yield(c)
     }
   end
 
-  def to_a
-    self.split(//u)
+  def each_character
+    to_a.each {|ch|
+      yield ch.char
+    }
   end
 
   def de_er()
index 8a34ea7..4cdba13 100644 (file)
@@ -1,53 +1,3 @@
 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
 
-require "pathname"
-require "chise/config"
-
-class String
-  def path
-    Pathname.new(self)
-  end
-end
-
-class Pathname
-  def escape # copied from cgi.rb
-    s = @path.gsub(/([\/%]+)/n){
-      "%" + $1.unpack("H2" * $1.size).join("%").upcase
-    }
-    Pathname.new(s)
-  end
-
-  def unescape # copied from cgi.rb
-    s = @path.tr("+", " ").gsub(/((?:%[0-9a-fA-F]{2})+)/n) {
-      [$1.delete("%")].pack("H*")
-    }
-    Pathname.new(s)
-  end
-
-  # translate file name for deal with the restriction of Windows file system.
-  def unix_to_win
-    win = @path.gsub(/</, "(")
-    win = win.gsub(/>/, ")")
-    win = win.gsub(/\*/, "+")
-    win = win.gsub(/\?/, "!")
-    Pathname.new(win)
-  end
-
-  def win_to_unix
-    unix = @path.gsub(/\)/, ">")
-    unix = unix.gsub(/\(/, "<")
-    unix = unix.gsub(/\!/, "?")
-    unix = unix.gsub(/\+/, "*")
-    Pathname.new(unix)
-  end
-
-  def escape_win_filename
-    return self.unix_to_win if CHISE.windows?
-    self
-  end
-
-  def unescape_win_filename
-    return self.win_to_unix if CHISE.windows?
-    self
-  end
-end
+require "chise/path"
index b06096b..1be12c3 100755 (executable)
@@ -1,11 +1,12 @@
 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
 
 $VERBOSE = true
+#$KCODE = "u"
 
-$debug = false # for test
-$debug = true  # for test
-$stdout.binmode if $debug
-$stdout.sync = true if $debug
+# $debug = false # for test
+# $debug = true  # for test
+# $stdout.binmode if $debug
+# $stdout.sync = true if $debug
 
 $LOAD_PATH.unshift("..")
 require "test/unit"
index 74653ce..9034b59 100755 (executable)
@@ -6,9 +6,6 @@ require "common"
 
 class TestIDS < Test::Unit::TestCase
   def test_ids
-    char = "榊".char
-    assert_equal("⿰木神", char.ids)
-    assert_equal("⿰木神", char.decompose)
     str = "榊"
     assert_equal("⿰木神", str.char.ids)
     assert_equal("⿰木神", str.decompose)
@@ -64,104 +61,6 @@ class TestIDS < Test::Unit::TestCase
     assert_match(/OVERLAID/, "&U+2FFB;".de_er.char.name) #∵
   end
 
-  def test_tree
-    assert_equal("[]", CHISE::Tree.new().inspect)
-    assert_equal("[1]", CHISE::Tree.new().add_leaf(1).inspect)
-    assert_equal("[1, 2]", CHISE::Tree.new().add_leaf(1).add_leaf(2).inspect)
-    assert_equal("[[]]", CHISE::Tree.new().add_node.inspect)
-    assert_equal("[[1]]", CHISE::Tree.new().add_node.add_leaf(1).inspect)
-    assert_equal("[[1, 2]]", CHISE::Tree.new().add_node.add_leaf(1).add_leaf(2).inspect)
-    assert_equal("[[1]]", CHISE::Tree.new().add_node.add_leaf(1).end_node.inspect)
-    assert_equal("[[1], [1]]", CHISE::Tree.new().add_node.add_leaf(1).end_node.add_node.add_leaf(1).end_node.inspect)
-
-    tree = CHISE::Tree.new
-    assert_equal("[]", tree.inspect)
-    assert_equal("[1]", tree.add_leaf(1).inspect)
-    assert_equal(0, tree.depth)
-    assert_equal("[1, 2]", tree.add_leaf(2).inspect)
-    assert_equal("[1, 2, []]", tree.add_node.inspect)
-    assert_equal("[1, 2, [3]]", tree.add_leaf(3).inspect)
-    assert_equal(1, tree.depth)
-    assert_equal("[1, 2, [3, 4]]", tree.add_leaf(4).inspect)
-    assert_equal("[1, 2, [3, 4]]", tree.end_node.inspect)
-    assert_equal("[1, 2, [3, 4], [5]]", tree.add_node.add_leaf(5).inspect)
-    assert_equal("[1, 2, [3, 4], [5, [6]]]", tree.add_node.add_leaf(6).inspect)
-    assert_equal(2, tree.depth)
-
-    tree = CHISE::Tree.new
-    assert_equal("[[\"+\"]]", tree.add_node("+", 2).inspect)
-    assert_equal("[[\"+\", 1]]", tree.add_leaf(1).inspect)
-    assert_equal("unmatch leaves", tree.check_integrity)
-    assert_equal("[[\"+\", 1, 2]]", tree.add_leaf(2).inspect)
-    assert_nil(tree.check_integrity)
-    assert_equal("[[\"+\", 1, 2], 3]", tree.add_leaf(3).inspect)
-    assert_equal("extra nodes", tree.check_integrity)
-    
-    tree = CHISE::Tree.new
-    assert_equal("[[\"+\"]]", tree.add_node("+", 2).inspect)
-    assert_equal("unmatch leaves", tree.check_integrity)
-    assert_equal("[[\"+\", 1]]", tree.add_leaf(1).inspect)
-    assert_equal("unmatch leaves", tree.check_integrity)
-    assert_equal("[[\"+\", 1, [\"+\"]]]", tree.add_node("+", 2).inspect)
-    assert_equal("unmatch leaves", tree.check_integrity)
-    assert_equal("[[\"+\", 1, [\"+\", 2]]]", tree.add_leaf(2).inspect)
-    assert_equal("unmatch leaves", tree.check_integrity)
-    assert_equal("[[\"+\", 1, [\"+\", 2, 3]]]", tree.add_leaf(3).inspect)
-    assert_nil(tree.check_integrity)
-
-    tree = CHISE::Tree.new
-    assert_equal("[1]", tree.add_leaf(1).inspect)
-    assert_nil(tree.check_integrity)
-    assert_equal("[1, 2]", tree.add_leaf(2).inspect)
-    assert_equal("extra leaves", tree.check_integrity)
-  end
-
-  def test_ids_tree
-#    assert_equal("[[<+,U+002B>, <A,U+0041>, <B,U+0042>]]", CHISE::IDS_Tree.new("+AB").inspect)
-#    assert_equal("[[<+,U+002B>, <A,U+0041>, <B,U+0042>], <C,U+0043>]", CHISE::IDS_Tree.new("+ABC").inspect)
-#    assert_equal("[[<+,U+002B>, <A,U+0041>, [<+,U+002B>, <B,U+0042>, <C,U+0043>]]]", CHISE::IDS_Tree.new("+A+BC").inspect)
-#    assert_equal("[[<+,U+002B>, <A,U+0041>, [<+,U+002B>, <B,U+0042>, <C,U+0043>]], <D,U+0044>]", CHISE::IDS_Tree.new("+A+BCD").inspect)
-
-    #assert_equal("[<榊,U+698A>]", CHISE::IDS_Tree.new("榊").inspect)
-#    assert_equal("[[<⿰,U+2FF0>, <木,J90-4C5A>, <神,J90-3F40>]]", CHISE::IDS_Tree.new("⿰木神").inspect)
-    assert_equal(1, CHISE::IDS_Tree.new("⿰木神").depth)
-#    assert_equal("[[<⿰,U+2FF0>, <木,J90-4C5A>, [<⿰,U+2FF0>, <⺭,CDP-8B70>, <申,J90-3F3D>]]]", CHISE::IDS_Tree.new("⿰木⿰⺭申").inspect)
-    assert_equal(2, CHISE::IDS_Tree.new("⿰木⿰⺭申").depth)
-    assert_equal("unmatch leaves", CHISE::IDS_Tree.new("⿰木").check_integrity)
-    assert_nil(CHISE::IDS_Tree.new("⿰木神").check_integrity)
-    assert_equal("unmatch leaves", CHISE::IDS_Tree.new("⿰木⿰申").check_integrity)
-    assert_nil(CHISE::IDS_Tree.new("⿰木⿰⺭申").check_integrity)
-    assert_equal("extra nodes", CHISE::IDS_Tree.new("⿰木⿰⺭申申").check_integrity)
-    assert_nil(CHISE::IDS_Tree.new("榊").check_integrity)
-    assert_equal("extra leaves", CHISE::IDS_Tree.new("榊榊").check_integrity)
-
-    assert_equal(3, "⿳".char.ids_operator_argc)
-    assert_equal("⿳士冖匕", "壱".char.ids)
-    assert_equal(3, "壱".char.ids.char.ids_operator_argc)
-    assert_nil(CHISE::IDS_Tree.new("⿳士冖匕").check_integrity)
-    assert_equal("unmatch leaves", CHISE::IDS_Tree.new("⿳士冖").check_integrity)
-    assert_equal("extra nodes", CHISE::IDS_Tree.new("⿳士冖匕匕").check_integrity)
-
-    assert_equal("contains ques", CHISE::IDS_Tree.new("⿳士冖?").check_integrity)
-  end
-
-  def test_tree_depth
-    assert_equal(1, CHISE::IDS_Tree.new("林".decompose).depth)
-#    assert_equal("["⿰木木"]", CHISE::IDS_Tree.new("林".decompose).nodes.inspect)
-#    assert_equal("[]", CHISE::IDS_Tree.new("林".decompose).sub_nodes.inspect)
-    assert_equal(2, CHISE::IDS_Tree.new("榊".decompose_all).depth)
-#    assert_equal("["⿰木⿰⺭申", "⿰⺭申"]", CHISE::IDS_Tree.new("榊".decompose_all).nodes.inspect)
-#    assert_equal("["⿰⺭申"]", CHISE::IDS_Tree.new("榊".decompose_all).sub_nodes.inspect)
-
-#    assert_equal(3, CHISE::IDS_Tree.new("焔".decompose_all).depth)
-#    assert_equal(3, CHISE::IDS_Tree.new("焔".decompose_all).nodes.length)
-#    assert_equal(2, CHISE::IDS_Tree.new("焔".decompose_all).sub_nodes.length)
-
-    assert_equal(2, CHISE::IDS_Tree.new("屡".decompose_all).depth)
-    assert_equal("⿸尸娄", "⿸尸⿱米女".aggregate)
-    assert_equal(3, CHISE::IDS_Tree.new("醤".decompose_all).depth)
-  end
-
   def test_compose_exact #正確に一致するIDSを検知する
     assert_equal("榊", "榊".decompose.compose)
     assert_equal("壱", "壱".decompose.compose)
index 5e36c4c..70dbdc0 100755 (executable)
@@ -28,6 +28,11 @@ class TestCharacter < Test::Unit::TestCase
     assert_raise(RuntimeError){ char.nosuchmethod(0) }
   end
 
+  def test_bignum
+    char = CHISE::Character.get(1644203214)
+    assert_equal("\375\242\200\210\263\216",   char.to_s)
+  end
+
   def test_latin
     char = "A".char
     assert_equal(65, char.ascii)
@@ -59,9 +64,11 @@ class TestCharacter < Test::Unit::TestCase
 
   def test_put
     char = "字".char
-    char["test_attribute"] = "test"
-    assert_equal("test", char.test_attribute)
-    char["test_attribute"] = "test2"
-    assert_equal("test2", char.test_attribute)
+    #qp char.test_feature
+    char.test_feature = "test1"
+    assert_equal("test1", char.test_feature)
+    #qp char.test_feature
+    char.test_feature = "test2"
+    assert_equal("test2", char.test_feature)
   end
 end
index 585ab92..43c2e64 100755 (executable)
@@ -30,11 +30,11 @@ class TestIconv < Test::Unit::TestCase
     assert_equal("[W", u32.u32tou16)
     assert_equal("\273\372", u16.u16toeuc)
     assert_equal("\216\232", u16.u16tosjis)
-    assert_equal(23383, u32.u32to_i)
-    assert_equal(23383, u8.u8to_i)
+#    assert_equal(23383, u32.u32to_i)
+#    assert_equal(23383, u8.u8to_i)
 
-    assert_equal(u32, CHISE.i_tou32(23383))
-    assert_equal(u8, CHISE.i_tou8(23383))
+#    assert_equal(u32, CHISE.i_tou32(23383))
+#    assert_equal(u8, CHISE.i_tou8(23383))
 
     u8 = "\8a¿\8e\9a".sjistou8
     assert_equal("\346\274\242\345\255\227", u8)
index 3c0e98c..e850450 100755 (executable)
@@ -2,8 +2,9 @@
 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
 
 require "common"
+require "chise/ids"
 
-class TestIDS < Test::Unit::TestCase
+class TestIDC < Test::Unit::TestCase
   def test_idc
     char = CHISE::Character.get(0x2FF0)
     assert_equal("IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT", char.name)
@@ -11,3 +12,24 @@ class TestIDS < Test::Unit::TestCase
     assert_equal(char.bidi_category, "ON")
   end
 end
+
+class TestIDS < Test::Unit::TestCase # checks the IDS string returned for individual characters
+  def test_ids_1 # the same IDS is asserted in several spellings: octal escapes, literal text, IDC constant
+    assert_equal("\342\277\261\345\256\200\345\255\220", "字".ids)
+    assert_equal("⿱宀子", "字".ids)
+    assert_equal(CHISE::IDC_1+"宀子", "字".ids)
+    assert_equal("\342\277\260\346\227\245\345\257\272", "時".ids)
+    assert_equal(CHISE::IDC_0+"日寺", "時".ids)
+  end
+
+  def test_decompose # ids and decompose are expected to return the same string for this character
+    char = "榊".char
+    assert_equal("⿰木神", char.ids)
+    assert_equal("⿰木神", char.decompose)
+#    assert_equal("⿰木神", char.decompose_all)
+
+
+
+
+  end
+end
index e960397..eda3493 100755 (executable)
@@ -3,24 +3,55 @@
 
 require "common"
 require "chise/idsdb"
+require "chise/management"
+
+#class TestIDS_DB < Test::Unit::TestCase
+class TestIDS_DB
+  def check_ccs_db(cd)
+    cd.each_line {|code, ids|
+      assert_instance_of(String, code)
+      assert_instance_of(String, ids)
+    }
+    cd.each_character {|char, ids|
+      assert_instance_of(CHISE::Character, char)
+      assert_instance_of(String, ids)
+    }
+  end
 
-class TestIDS_DB < Test::Unit::TestCase
   def test_ids_db
     @idb = CHISE::IDS_DB.instance
     assert_instance_of(CHISE::IDS_DB, @idb)
     @idb.each_ccs {|ccs|
       cd = @idb.get_ccs(ccs)
       assert_instance_of(CHISE::IDS_CCS_DB, cd)
+      #check_ccs_db(cd)
     }
+    cd = @idb.get_ccs("JIS-X0208-1990")
+    check_ccs_db(cd)
+  end
+end
 
-    @cd = @idb.get_ccs("JIS-X0208-1990")
-    @cd.each_line {|code, ids|
-      assert_instance_of(String, code)
-      assert_instance_of(String, ids)
-    }
-    @cd.each_entry {|char, ids|
-      assert_instance_of(CHISE::Character, char)
-      assert_instance_of(String, ids)
-    }
+class TestIDS_DB_Management < Test::Unit::TestCase
+  def test_management # only constructs the manager; every maintenance step below is commented out
+    man = CHISE::IDS_DB_Management.new
+    # make sure there is no conflict
+    #man.check_conflict_of_ids_text # 167.499 seconds.
+    #man.store_ids_as_text # 172.024 seconds.
+    #man.store_ids_de_er # 47.99 seconds.
+    #man.check_integrity_of_ids_tree # 58.185 seconds.
+    #man.make_by_ids_db # 29.572 seconds.
+
+=begin
+    db = IDS_DB.instance
+#    db.make_ids_db # 1 hour 12 minutes
+#    IDS_TEXT_DB.instance.make_ids_error # 4 minutes
+#    db.make_ids_reverse # 2 minutes
+    db.dump_ids_duplicated # 1 minute
+    db.make_ids_aggregated # 5 minutes
+    db.dump_ids_aggregated # 1 minute
+    db.make_ids_parts # 30 minutes
+    db.make_ids_contained # 2 minutes
+    #db.make_ids_decomposed # 2 minutes -> did not finish...
+=end
   end
 end
index 3a29516..b8916f3 100755 (executable)
@@ -4,10 +4,11 @@
 require "common"
 
 class TestParser < Test::Unit::TestCase
-  def test_parser
+  def setup
     @pa = CHISE::CharacterParser.new
+  end
 
-    # test_parse
+  def test_parse
     assert_raise(RuntimeError){ @pa.parse(nil) }
     assert_equal(65, @pa.parse(0x41))
     assert_raise(RuntimeError){ @pa.parse(Object.new) }
@@ -15,14 +16,16 @@ class TestParser < Test::Unit::TestCase
     assert_equal(20175, @pa.parse("?\344\273\217"))
     assert_raise(RuntimeError){ @pa.parse("nosuchcharacter") }
     assert_raise(RuntimeError){ @pa.parse("\344\273\217") }
+  end
 
-    # test_parse_er
+  def test_parse_er
     assert_equal(true, @pa.contain_er?("&#x41;"))
     assert_equal(true, @pa.contain_er?("This is &#x41; er."))
     assert_equal(true, @pa.is_er?("&#x41;"))
     assert_equal(false, @pa.is_er?("This is &#x41; er."))
-    assert_raise(RuntimeError){ @pa.parse_er("nosucher") }
     assert_equal(0xe001, @pa.parse("&my-1;"))
+    assert_raise(RuntimeError){ @pa.parse_er("&nosucher;") }
+    assert_raise(RuntimeError){ @pa.parse_er("nosucher") }
 
     assert_equal(23383, @pa.parse("&MCS-00005B57;"))
     assert_equal(23383, @pa.parse("&U5B57;"))
@@ -30,16 +33,29 @@ class TestParser < Test::Unit::TestCase
     assert_equal(23383, @pa.parse("&U+5B57;"))
     assert_equal(23383, @pa.parse("&#x5B57;"))
     assert_equal(23383, @pa.parse("&#23383;"))
+  end
 
-    # test_get_ccs
+  def test_parse_ccs
     assert_equal(23383, @pa.parse("&J90-3B7A;"))
     assert_equal(23383, @pa.parse("&I-J90-3B7A;"))
     assert_equal(23383, @pa.parse("&MCS-00005B57;"))
     assert_equal(23383, @pa.parse("&M-06942;"))
-    assert_raise(RuntimeError){ @pa.parse_er("&nosucher;") }
+  end
 
+  def test_comples_ccs # NOTE(review): "comples" is presumably a typo for "complex"; rename only after confirming nothing refers to this test by name
     assert_equal(28193, @pa.parse("&C1-602E;")) # 渡
     assert_equal(15542221, @pa.parse("&C1-6030;")) # unknown
+
+    # test_ccs_etc
+    assert_equal(131636, @pa.parse("&HZK01-C947;")) # =hanziku-1
+    assert_equal(1644203214, @pa.parse("&CDP-8CCE;")) # CDP
+    assert_equal(1644202927, @pa.parse("&CDP-8BAF;"))
+    assert_equal(1644210346, @pa.parse("&B-A8AA;")) # =big5
+    assert_equal(1644202869, @pa.parse("&RUI6-E00E;")) # =ruimoku-v6
+    assert_equal(15225021, @pa.parse("&JC3-50BD;")) # =jef-china3
+    assert_equal(1644202692, @pa.parse("&CB00008;"))
+    assert_equal(14820071, @pa.parse("&CB08935;"))
+    #assert_equal(0, @pa.parse("&CB08661;")) # what?
   end
 
   def test_de_er
index 2d69d2f..e69de29 100755 (executable)
@@ -1,155 +0,0 @@
-#!/usr/bin/env ruby
-# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-
-require "common"
-
-class TestRbChise < Test::Unit::TestCase
-  include CHISE::ChiseValue
-
-  def test_rbchise
-    @ds = CHISE::DataSource.new
-    assert_instance_of(CHISE::DataSource, @ds)
-    assert_match(/chise-db\Z/, @ds.location.to_s)
-
-    @ct = @ds.get_ccs("=daikanwa")
-    assert_instance_of(CHISE::CCSTable, @ct)
-    char_id = @ct.decode(364) # get a character by Daikanwa number 364.
-    assert_equal(20175, char_id)
-    str = format_char_id(20175)
-    assert_equal("?\344\273\217", str)
-
-    char_id = @ds.decode_char("=daikanwa", 364)
-    assert_equal(20175, char_id)
-
-    @ft = @ds.get_feature("ideographic-structure")
-    assert_instance_of(CHISE::FeatureTable, @ft)
-    value = @ft.get_value(char_id)
-    assert_instance_of(String, value)
-    assert_equal("(?\342\277\260 ?\344\272\273 ?\345\216\266)", value)
-
-    value = @ds.load_feature("ideographic-structure", char_id)
-    assert_equal("(?\342\277\260 ?\344\272\273 ?\345\216\266)", value)
-
-    @ds.each_feature {|f|
-      #qp f
-      assert_instance_of(String, f)
-    }
-
-    @ft.each {|k, v|
-      #qp k, v
-      assert_kind_of(Integer, k)
-      assert_instance_of(String, v)
-    }
-
-    ft = @ds.get_feature("numeric-value")
-    ft.each {|k, v|
-      #qp k, v
-      assert_kind_of(Integer, k)
-      assert_instance_of(String, v)
-    }
-  end
-
-  def test_each_ccs
-    @ds = CHISE::DataSource.new
-    @ds.each_ccs {|ccs|
-      #qp ccs
-      assert_instance_of(String, ccs)
-      ct = @ds.get_ccs(ccs)
-      assert_instance_of(CHISE::CCSTable, ct)
-    }
-
-    ct = @ds.get_ccs("=ascii")
-    ct.each {|k, v|
-      #qp k, v
-      assert_kind_of(Integer, k)
-      assert_kind_of(Integer, v)
-    }
-    ct.close
-  end
-
-  def test_error
-    @ds = CHISE::DataSource.new
-    @ft = @ds.get_feature("nosuchfeature")
-    v = @ft.get_value(20175)
-    assert_equal(nil, v)
-  end
-
-  def test_chisedb
-    @cd = CHISE::ChiseDB.instance
-
-    char_id = @cd.decode_char("=daikanwa", 364)
-    assert_equal(20175, char_id)
-
-    value = @cd.load_feature("ideographic-structure", char_id)
-    assert_equal("(?\342\277\260 ?\344\272\273 ?\345\216\266)", value)
-
-    value = @cd.load_feature("=ucs", char_id)
-    assert_equal(20175, value)
-
-    @cd.each_feature {|f|
-      assert_instance_of(String, f)
-    }
-
-    ft = @cd.get_feature("numeric-value")
-    ft.each {|k, v|
-      assert_kind_of(Integer, k)
-      assert_instance_of(String, v)
-    }
-  end
-
-  def test_ascii
-    @cd = CHISE::ChiseDB.instance
-    ct = @cd.get_ccs("ascii")
-    char_id = ct.decode(65)
-    assert_equal(65, char_id)
-    assert_equal("A", CHISE::Character.get(char_id).to_s)
-#    assert_equal("A", char.to_s)
-  end
-
-
-  def test_parse_c_string
-    u8 = "字"
-    assert_equal(23383, u8.u8to_i)
-    assert_equal(23383,        parse_c_string("?"+u8))
-    assert_equal(0,    parse_c_string("?\\^@"))
-    assert_equal(9,    parse_c_string("?\t"))
-    assert_equal(10,   parse_c_string("?\n"))
-    assert_equal(13,   parse_c_string("?\r"))
-    assert_equal(94,   parse_c_string("?^\\"))
-    assert_equal(31,   parse_c_string("?\\^_"))
-    assert_equal(32,   parse_c_string("?\\ "))
-    assert_equal(34,   parse_c_string("?\\\""))
-    assert_equal(126,  parse_c_string("?~"))
-    assert_equal(127,  parse_c_string("?\\^?\000"))
-    assert_equal(131,  parse_c_string("?\\^\303\237"))
-    assert_equal(0x7FF,        parse_c_string("?\337\277"))
-    assert_equal(0xFFFF,       parse_c_string("?\357\277\277"))
-    assert_equal(0x1FFFFF,     parse_c_string("?\367\277\277\277"))
-    assert_equal(0x3FFFFFF,    parse_c_string("?\373\277\277\277\277"))
-    assert_equal(0xFFFFFFF,    parse_c_string("?\374\217\277\277\277\277"))
-    assert_raise(RuntimeError) { parse_c_string("nosuch") }
-  end
-
-  def test_format_char_id
-    u8 = "字"
-    assert_equal(u8, CHISE.i_tou8(23383))
-    assert_equal("?\345\255\227",      format_char_id(23383))
-    assert_equal("?"+u8,               format_char_id(23383))
-    assert_equal("?\\^@",      format_char_id(0))
-    assert_equal("?\t",                format_char_id(?\t))
-    assert_equal("?\n",                format_char_id(?\n))
-    assert_equal("?\r",                format_char_id(?\r))
-    assert_equal("?^\\",       format_char_id(0x1C))
-    assert_equal("?\\^_",      format_char_id(0x1F))
-    assert_equal("?\\ ",       format_char_id(?\s))
-    assert_equal("?\\\"",      format_char_id(?\"))
-    assert_equal("?~",         format_char_id(0x7E))
-    assert_equal("?\\^?\000",  format_char_id(0x7F))
-    assert_equal("?\\^\303\237",       format_char_id(0x9F))
-    assert_equal("?\337\277",  format_char_id(0x7FF))
-    assert_equal("?\357\277\277",      format_char_id(0xFFFF))
-    assert_equal("?\367\277\277\277",  format_char_id(0x1FFFFF))
-    assert_equal("?\373\277\277\277\277",      format_char_id(0x3FFFFFF))
-    assert_equal("?\374\217\277\277\277\277",  format_char_id(0xFFFFFFF))
-  end
-end
index 87e3c76..2f7cc56 100755 (executable)
@@ -28,6 +28,15 @@ class TestString < Test::Unit::TestCase
     assert_raises(RuntimeError){ "文&nosucher;列".de_er }
   end
 
+  def test_de_er_ccs # de_er on a CCS-based entity reference must return the byte string of the referenced character
+    assert_equal("\346\270\241", "&C1-602E;".de_er) # 渡
+    assert_equal("\370\273\222\237\215", "&C1-6030;".de_er) # unknown
+    # test_hanziku
+    assert_equal("\360\240\210\264", "&HZK01-C947;".de_er)
+#    assert_equal(1644203214, "&CDP-8CCE;".de_er)
+#    assert_equal(1644202927, "&CDP-8BAF;".de_er)
+  end
+
   def test_characters
     @str = "文字列"
     assert_equal(["文","字","列"], @str.to_a)
index 932f53d..e69de29 100755 (executable)
@@ -1,11 +0,0 @@
-#!/usr/bin/env ruby
-# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-
-require "common"
-
-class TestUtil < Test::Unit::TestCase
-  def test_db
-    assert_equal("()+!", "<>*?".path.unix_to_win.to_s)
-    assert_equal("<>*?", "()+!".path.win_to_unix.to_s)
-  end
-end