require "chise/parser"
require "chise/chisedb"
require "chise/iconv"
+require "chise/utf8"
+require "chise/ids"
module CHISE
class CharacterFactory # generate Character object and cache them
end
class Character
+ include UTF8Value
+ include IDS_Module
+
def initialize(char_id)
raise if char_id.nil?
- raise unless char_id.is_a?(Fixnum) # char_id sure is a Fixnum.
- raise if char_id < 0 # char_id sure is a positive value.
+ raise unless char_id.kind_of?(Integer) # make sure char_id is Integer.
+ raise if char_id < 0 # make sure char_id is positive.
@char_id = char_id
@char_id.freeze
- @utf8_mcs = CHISE.i_tou8(@char_id)
+ # @utf8_mcs = CHISE.i_tou8(@char_id)
+ @utf8_mcs = itou8(@char_id)
@utf8_mcs.freeze
@feature = {}
@check_all_done = nil
en.to_er(self)
end
+ def is_idc? # True when @char_id lies in 0x2ff0..0x2fff, the range this class treats as IDCs.
+ 0x2ff0 <= @char_id && @char_id <= 0x2fff # both bounds are inclusive
+ end
+
+ def idc_argument_number # Operand count for this character: 0 if it is not an IDC, 3 for 0x2ff2/0x2ff3, otherwise 2.
+ return 0 unless is_idc? # non-IDC characters take no operands
+ return 3 if @char_id == 0x2ff2 || @char_id == 0x2ff3 # the only two codes given three operands here
+ return 2 # every other code in the IDC range
+ end
+
private
def get_feature(f)
def initialize # Creates the backing DataSource and an empty cache of by-IDS tables.
@ds = DataSource.new
+ @byids_db = {} # name => ByIDS_DB, filled on demand by get_by_ids_db
end
def location() @ds.location; end # Returns whatever @ds.location returns.
def load_feature(n, cid) @ds.load_feature(n, cid) end # Forwards (n, cid) to @ds.load_feature and returns its result.
def each_feature() @ds.each_feature {|f| yield f } end # Yields every item produced by @ds.each_feature to the caller's block.
def each_ccs() @ds.each_ccs {|c| yield c } end # Yields every item produced by @ds.each_ccs to the caller's block.
+
+ def get_by_ids_db(n) # Returns the ByIDS_DB registered under n, creating and caching it on first request.
+ @byids_db[n] = ByIDS_DB.new(@ds, n) if @byids_db[n].nil? # built at most once per name
+ @byids_db[n]
+ end
+ end
+
+ class ByIDS_DB # Table accessor keyed by IDS string, using category "character" and key type "by_ids".
+ include ChiseValue
+ include TableAccessModule
+
+ def initialize(ds, name) # ds: data source object; name: table name. Both are stored as given.
+ @ds, @name = ds, name
+ @category, @keyvalue = "character", "by_ids"
+ reset # defined outside this view (presumably by TableAccessModule) -- TODO confirm
+ end
+
+ def decode(ids) # Returns parse_c_string of the value stored under ids, or nil when @db is nil after setup_db.
+ setup_db # defined outside this view; expected to populate @db
+ return nil if @db.nil?
+ parse_c_string(@db.get(ids))
+ end
+
+ def set_decoded_char(ids, cid) # Stores format_char_id(cid) under key ids; raises RuntimeError if @db is nil.
+ setup_db(true) # NOTE(review): true presumably requests write access -- confirm against setup_db
+ raise "@db is nil." if @db.nil?
+ @db.put(ids, format_char_id(cid))
+ end
+
+ def each # Yields (parse_value(key), parse_c_string(value)) for every pair in @db; raises RuntimeError if @db is nil.
+ setup_db
+ raise "@db is nil." if @db.nil?
+ @db.each {|k, v|
+ yield(parse_value(k), parse_c_string(v))
+ }
+ end
end
end
def u16toeuc() Iconv.iconv_to_from("EUC-JP", "UTF-16", self) end
def u16tosjis() Iconv.iconv_to_from("Shift_JIS", "UTF-16", self) end
- def u32to_i
- return 0 if length == 0
- s = self
- return (s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3])
- end
-
- def u8to_i
- u32 = self.u8tou32
- u32.u32to_i
- end
+# def u32to_i
+# return 0 if length == 0
+# s = self
+# return (s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3])
+# end
+
+# def u8to_i
+# u32 = self.u8tou32
+# u32.u32to_i
+# end
end
module CHISE
- def i_tou32(n) # convert a integer to UTF-32 String
- raise unless n.is_a?(Integer)
- sprintf("%c%c%c%c", (n >> 24)&0xff, (n >> 16)&0xff, (n >> 8)&0xff, n&0xff)
- end
-
- def i_tou8(n) # convert a integer to UTF-8 String
- u32 = CHISE.i_tou32(n)
- u32.u32tou8
- end
- module_function :i_tou32, :i_tou8
+# def i_tou32(n) # convert a integer to UTF-32 String
+# raise unless n.is_a?(Integer)
+# sprintf("%c%c%c%c", (n >> 24)&0xff, (n >> 16)&0xff, (n >> 8)&0xff, n&0xff)
+# end
+
+# def i_tou8(n) # convert a integer to UTF-8 String
+# u32 = CHISE.i_tou32(n)
+# u32.u32tou8
+# end
+# module_function :i_tou32, :i_tou8
end
class NuUconv
# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-require "chise/db"
+require "chise/idstree"
module CHISE
-# IDC_LEFT_TO_RIGHT = "⿰"
-# IDC_ABOVE_TO_BELOW = "⿱"
-# IDC_LEFT_TO_MIDDLE_AND_RIGHT = "⿲"
-# IDC_ABOVE_TO_MIDDLE_AND_BELOW = "⿳"
-# IDC_FULL_SURROUND = "⿴"
-# IDC_SURROUND_FROM_ABOVE = "⿵"
-# IDC_SURROUND_FROM_BELOW = "⿶"
-# IDC_SURROUND_FROM_LEFT = "⿷"
-# IDC_SURROUND_FROM_UPPER_LEFT = "⿸"
-# IDC_SURROUND_FROM_UPPER_RIGHT = "⿹"
-# IDC_SURROUND_FROM_LOWER_LEFT = "⿺"
-# IDC_OVERLAID = "⿻"
-
- IDC_LEFT_TO_RIGHT = "\342\277\260" #2FF0
- IDC_ABOVE_TO_BELOW = "\342\277\261"
- IDC_LEFT_TO_MIDDLE_AND_RIGHT = "\342\277\262"
- IDC_ABOVE_TO_MIDDLE_AND_BELOW = "\342\277\263"
- IDC_FULL_SURROUND = "\342\277\264" #2FF4
- IDC_SURROUND_FROM_ABOVE = "\342\277\265"
- IDC_SURROUND_FROM_BELOW = "\342\277\266"
- IDC_SURROUND_FROM_LEFT = "\342\277\267"
- IDC_SURROUND_FROM_UPPER_LEFT = "\342\277\270"
- IDC_SURROUND_FROM_UPPER_RIGHT = "\342\277\271"
- IDC_SURROUND_FROM_LOWER_LEFT = "\342\277\272"
- IDC_OVERLAID = "\342\277\273"
-
- IDC_LR = IDC_LEFT_TO_RIGHT
- IDC_AB = IDC_ABOVE_TO_BELOW
- IDC_LM = IDC_LEFT_TO_MIDDLE_AND_RIGHT
- IDC_AM = IDC_ABOVE_TO_MIDDLE_AND_BELOW
- IDC_FS = IDC_FULL_SURROUND
- IDC_FA = IDC_SURROUND_FROM_ABOVE
- IDC_FB = IDC_SURROUND_FROM_BELOW
- IDC_FL = IDC_SURROUND_FROM_LEFT
- IDC_UL = IDC_SURROUND_FROM_UPPER_LEFT
- IDC_UR = IDC_SURROUND_FROM_UPPER_RIGHT
- IDC_LL = IDC_SURROUND_FROM_LOWER_LEFT
- IDC_OV = IDC_OVERLAID
-
- IDC_LMR = IDC_LM
- IDC_AMB = IDC_AM
- IDC_FUL = IDC_UL
- IDC_FUR = IDC_UR
- IDC_FLL = IDC_LL
- IDC_O = IDC_OV
-
- class IDS_TEXT_DB < DB
- include Singleton
-
- IDS_LIST = "
-IDS-UCS-Basic.txt
-#IDS-UCS-Compat-Supplement.txt
-#IDS-UCS-Compat.txt
-IDS-UCS-Ext-A.txt
-IDS-UCS-Ext-B-1.txt
-IDS-UCS-Ext-B-2.txt
-IDS-UCS-Ext-B-3.txt
-IDS-UCS-Ext-B-4.txt
-IDS-UCS-Ext-B-5.txt
-IDS-UCS-Ext-B-6.txt
-IDS-JIS-X0208-1990.txt
-IDS-Daikanwa-01.txt
-IDS-Daikanwa-02.txt
-IDS-Daikanwa-03.txt
-IDS-Daikanwa-04.txt
-IDS-Daikanwa-05.txt
-IDS-Daikanwa-06.txt
-IDS-Daikanwa-07.txt
-IDS-Daikanwa-08.txt
-IDS-Daikanwa-09.txt
-IDS-Daikanwa-10.txt
-IDS-Daikanwa-11.txt
-IDS-Daikanwa-12.txt
-IDS-Daikanwa-dx.txt
-IDS-Daikanwa-ho.txt
-IDS-CBETA.txt
-".split
-
- def initialize()
- super
- @ids_list = IDS_LIST
- @chars = []
-
- @dir = Config.instance.ids_dir
-
- @glob, @pre, @post = "#{@dir}/db/*", "#{@dir}/db/", ""
- dir = File.dirname(@pre)
- Dir.mkdir(dir) unless FileTest.exist?(dir)
- open_dbs()
- end
-
- def each_file()
- return unless block_given?
- @ids_list.each {|file|
- next if file =~ /^#/
- yield(@dir+file)
- }
- end
-
- def each_line(file)
- open(file){|f|
- while line = f.gets
- next if line =~ /^;/ #コメントはとばす
- line.chomp!
- code, char, ids = line.split
- yield(code, char, ids)
- end
- }
- end
-
- def dump_text_all
- each_file {|file|
- dir = File.dirname(file) + "/../ids-new/"
- Dir.mkdir(dir) if ! FileTest.directory?(dir)
- newfile = dir + File.basename(file)
- p [file, newfile]
- open(newfile, "w"){|out|
- out.binmode.sync = true
- each_line(file){|code, ch, ids|
- char = Character.get(ch)
- ids = char.decompose
- out.print "#{code} #{ch} #{ids}\n"
- }
- }
- }
- end
-
- def make_ids_error
- each_file {|file|
- dir = File.dirname(file) + "/../ids-error"
- Dir.mkdir(dir) unless FileTest.exist?(dir)
- errfile = dir + "/" + File.basename(file)
- # p [file, errfile]
- open(errfile, "w"){|out|
- out.binmode.sync = true
- each_line(file){|code, ch, ids|
- char = Character.get(ch)
- ids_error = char["ids-error"]
- next if ids_error.nil?
- out.print "#{code} #{ch} #{ids} #{ids_error}\n"
- }
- }
- }
- end
- end
-
- class IDS_DB < DB # BDB化したIDS DBを扱う
- include Singleton
-
- def initialize
- @dbs = CharDB.instance
- end
-
- def make_ids_db
- db = IDS_TEXT_DB.instance
- db.each_file {|file|
- @char_counter = 0
- @same_ids_counter = 0
- @good_ids_counter = 0
- @conflict_ids_counter = 0
- db.each_line(file){|code, ch, ids|
- @char_counter += 1
-
- ids = "" if ids == nil
- next if ids == "" #IDSが定義されていない場合は、さっくりと無視するべしよ。
-
- charimg = Character.get(ch) #実体参照である可能性がある
-
- next if code =~ /'$/ || code =~ /"$/ #大漢和番号のダッシュ付きは無視する
- char = Character.get("&"+code+";") #code表記を元に実体参照を作って解釈する
- if char.nil? || char.to_s == "" #うまく文字にならなかった
- print "char == null #{char.inspect} #{code} #{ch} #{ids}\n" unless code =~ /^M-/ || code =~ /^CB/
- #大漢和、CBETA以外の場合は、エラーメッセージ。
- next
- end
- if char != charimg #code表記と文字が一致していない?
- unless code =~ /^M-/ || code =~ /^MH-/ || code =~ /^CB/ #食い違っていて当然であるので何もしない
- print "unknown char #{char.inspect} #{code} #{ch} #{ids}\n"
- next #それ以外の場合はエラーメッセージをだして、次へ。
- end
- end
- #next if !char.has_attribute? #isolated characterはまぎれこませない。
-
- ids.de_er! #実体参照を解除する
- next if ids == char.to_s #もし文字とまったく一緒なら、意味が無いので情報を持たない
- next if ids.char_length == 1
-
- idstree = IDS_Tree.new(ids)
- c = idstree.check_integrity
- c = "contains self" if ids.include?(char.to_s)
- if c #ちょっとでもエラーがある場合は、
- char["ids-error"] = c #エラーを記録して、データとしては保持しない
- next
- end
-
- if char["ids"].nil? || char["ids"] == "" #元々IDSが無かった場合は、
- char["ids"] = ids #普通に代入すればそれでいいです。
- @good_ids_counter += 1
- else #しかしいままでにすでにIDSが定義されていた場合は?
- if char["ids"] == ids #新しいIDSと古いIDSが完全に一致するなら無視しましょう。
- @same_ids_counter += 1
- else #しかしいままでのIDSと新しいIDSが食い違った場合は?
- @conflict_ids_counter += 1
- # print "conflict #{char.inspect} #{code} #{ids} #{char["ids"]}\n"
- end
- end
- }
- print "#{file} #{@char_counter} #{@same_ids_counter} #{@conflict_ids_counter} #{@good_ids_counter}\n"
- CharacterFactory.instance.reset()
- }
- @dbs.dump_db("ids-error") #テキスト化する
- @dbs.dump_db("ids") #テキスト化する
- end
-
- def make_ids_reverse
- h = Hash.new
- @dbs.each("ids") {|k, v|
- char = k.char
- ids = char.decompose
- h[ids] = "" if h[ids].nil?
- h[ids] += k #追加する
- }
- h.each {|k, v|
- h[k] = char_sort(v) #文字の順番を、よく使うっぽいものからの順番にする
- }
- h.delete_if {|k, v| #h[k]が""になる可能性もあるが、それはkeyとして入れないことにする。
- v == ""
- }
- print "length #{h.length}\n"
- cdb = CodesysDB.instance
- cdb.make_db_no_question_mark("ids", h)
- cdb.open_db("ids") #これが無いと、dump_dbされません。
- cdb.dump_db("ids")
- end
-
- def char_sort(composed)
- return composed if composed.char_length == 1
- ar = composed.to_a
- arorg = ar.dup
- ar2 = []
- ar.dup.each {|ch|
- char = ch.char
- if char.char_id < 0xfffff #Unicodeっぽい?
- ar2 << ch
- ar.delete(ch)
- end
- }
- if 0 < ar.length
- EntityReference.each_codesys{|codesys, er_prefix, keta, numtype|
- ar.each {|ch|
- char = ch.char
- v = char[codesys]
- # p [codesys, v] if v
- if v #EntityReferenceの順番に準拠する。
- ar2 << ch
- ar.delete(ch)
- end
- }
- }
- end
- if 0 < ar.length
- # p ["yokuwakaran character", ar, ar[0].inspect_all, arorg]
- EntityReference.each_codesys{|codesys, er_prefix, keta, numtype|
- ar.dup.each {|ch|
- char = ch.char
- v = char[codesys]
- # p [codesys, v] if v
- }
- }
- end
- return ar2.join("")
- end
-
- def dump_ids_duplicated
- open("ids-duplicated.txt", "w"){|out|
- #out.binmode
- CodesysDB.instance.each("ids") {|k, v|
- if v.nil?
- out.print "nil #{k} #{v}\n"
- next
- end
- n = v.char_length
- next if n == 1
- out.print "#{n} #{k} #{v}"
- v.each_char {|ch|
- char = ch.char
- out.print " #{char.inspect}"
- }
- out.print "\n"
- }
- }
- end
-
- def make_ids_aggregated
- @dbs.each("ids") {|k, v|
- char = k.char
- ids = char.decompose
- ag = ids.aggregate
- char["ids-aggregated"] = ag
- }
- @dbs.dump_db("ids-aggregated")
- end
-
- def dump_ids_aggregated
- open("ids-aggregated.txt", "w"){|out|
- #out.binmode
- @dbs.each("ids") {|k, v|
- char = k.char
- ids = char["ids"]
- ag = char["ids-aggregated"]
- out.print "#{char.to_s} #{ag} #{ids}\n" if ids != ag
- }
- }
- end
-
- def make_ids_parts
- @dbs.each("ids") {|k, v|
- char = k.char
- pids = char.to_s
- ar = []
- counter = 0
- loop {
- ids = pids.decompose
- break if ids == pids #これ以上分割できないようだったら終了〜。
- ar += ids.to_a
- counter += 1
- p [char.to_s, pids, ids, ar] if 10 < counter #これは何かおかしいぞと
- pids = ids
- }
- ar.sort!
- ar.uniq!
- #やっぱりIDS文字も加えることにする. by eto 2003-02-05
- # ar.delete_if {|ch|
- # ch.char.is_ids? #IDS文字はまぎれこませない。
- # }
- str = ar.join("")
- char["ids-parts"] = str
- }
- @dbs.dump_db("ids-parts")
- end
-
- def make_ids_contained
- h = Hash.new
- @dbs.each("ids-parts") {|k, v|
- char = k.char
- parts = char.ids_parts
- parts.each_char {|ch|
- # part = ch.char
- h[ch] = [] if h[ch].nil?
- h[ch] << k
- # h[ch] += k
- # part["ids-contained"] = "" if part["ids-contained"].nil?
- # part["ids-contained"] += k
- }
- }
- h.each {|k, v|
- char = k.char
- v.sort!
- char["ids-contained"] = v.join("")
-
- }
- @dbs.dump_db("ids-contained")
- end
-
- def make_ids_decomposed
- @dbs.each("ids") {|k, v|
- char = k.char
- de= char.decompose_all
- char["ids-decomposed"] = de
- }
- @dbs.dump_db("ids-decomposed")
- end
-
- end
-
- class Node < Array # 木構造の中の一つの枝
- def initialize(nodeleaf=nil, nodenum=nil)
- super()
- @nodeleaf = nodeleaf
- @nodenum = nodenum
- if @nodeleaf
- original_add(@nodeleaf)
- end
- end
- attr_reader :nodenum
-
- alias original_add <<
- private :original_add
-
- def <<(obj)
- original_add(obj)
- @nodenum -= 1 if @nodenum
- end
-
- def nodes
- ar = []
- ar << self.to_s
- self.each {|n|
- ar += n.nodes if n.is_a? Node
- }
- return ar
- end
-
- end
-
- class Tree # 木構造を扱う
- def initialize()
- @root = Node.new()
- @stack = [@root]
- @leafnum = 0
- @depth = 1 #stackの深さが最大になったところの値、木構造が無いときは1となる
- end
-
- def depth() @depth - 1 end
-
- def add_node(nodeleaf=nil, nodenum=nil) #枝を追加
- new_node = Node.new(nodeleaf, nodenum)
- @stack.last << new_node
- @stack << new_node
- if @depth < @stack.length
- @depth = @stack.length
- end
- self
- end
-
- def end_node() #この枝は終り
- @stack.pop
- self
- end
-
- def add_leaf(a) #葉を追加
- @stack.last << a
- end_check()
- self
- end
-
- def end_check()
- n = @stack.last.nodenum
- if n && n == 0
- end_node()
- end_check() #再帰
- end
- end
-
- def check_integrity
- n = @stack.last.nodenum
- return nil if @root.length == 0 #no tree is good tree
- return "unmatch leaves" if n && n != 0
- return "extra nodes" if @root.first.is_a?(Node) && @root.length != 1
- return "extra leaves" if @root.length != 1
- return nil
- end
-
- def nodes
- r = @root.nodes
- r.shift
- r
- end
-
- def sub_nodes
- r = nodes
- r.shift
- r
- end
-
- def to_s() @root.to_s end
-
- def inspect() @root.inspect end
- end
-
- class IDS_Tree < Tree
+ IDC_0 = "\342\277\260" # UTF-8 bytes E2 BF B0 = U+2FF0
+ IDC_1 = "\342\277\261" # U+2FF1
+ IDC_2 = "\342\277\262" # U+2FF2
+ IDC_3 = "\342\277\263" # U+2FF3
+ IDC_4 = "\342\277\264" # U+2FF4
+ IDC_5 = "\342\277\265" # U+2FF5
+ IDC_6 = "\342\277\266" # U+2FF6
+ IDC_7 = "\342\277\267" # U+2FF7
+ IDC_8 = "\342\277\270" # U+2FF8
+ IDC_9 = "\342\277\271" # U+2FF9
+ IDC_A = "\342\277\272" # U+2FFA
+ IDC_B = "\342\277\273" # U+2FFB
+
+ IDC_LEFT_TO_RIGHT = IDC_0 # descriptive aliases: each name below refers to the same String object as IDC_n
+ IDC_ABOVE_TO_BELOW = IDC_1
+ IDC_LEFT_TO_MIDDLE_AND_RIGHT = IDC_2
+ IDC_ABOVE_TO_MIDDLE_AND_BELOW = IDC_3
+ IDC_FULL_SURROUND = IDC_4
+ IDC_SURROUND_FROM_ABOVE = IDC_5
+ IDC_SURROUND_FROM_BELOW = IDC_6
+ IDC_SURROUND_FROM_LEFT = IDC_7
+ IDC_SURROUND_FROM_UPPER_LEFT = IDC_8
+ IDC_SURROUND_FROM_UPPER_RIGHT = IDC_9
+ IDC_SURROUND_FROM_LOWER_LEFT = IDC_A
+ IDC_OVERLAID = IDC_B
+
+ class IDS_Decomposer
def initialize(str)
@str = str
- super()
- parse()
- end
-
- def parse()
- @str.each_char {|ch|
- char = Character.new(ch)
- if is_ids?(char)
- add_node(char, ids_operator_argc(char))
- else
- add_leaf(char)
- end
- }
- end
-
- def is_ids?(obj)
- return true if "+*".include?(obj.to_s) #テスト用ですかね
- return true if obj.is_ids?
- return false
end
- def ids_operator_argc(obj)
- return obj.ids_operator_argc if 0 < obj.ids_operator_argc
- return 2 #テスト用ってことで
- end
-
- def check_integrity
- r = super
- return r if r #不完全がすでにわかっているならreturn
- return "contains ques" if @str =~ /\?/ #?が含まれている?
- return nil
+ def decompose # Stub: the body is empty, so this currently returns nil.
+
end
end
- class IDS # IDSそのものを扱うclass
- def initialize(str) #IDS文字列をうけとる。
- @str = str
+ module IDS_Module
+ def decompose # Returns self.ids; the including object must respond to ids.
+ self.ids
end
- def parse
+ def decompose_all # Stub: the body is empty, so this currently returns nil.
+
end
- def parse_x #柔軟型のParse. IDSキャラクターが前にきてなくてもよい。などなど。
- end
end
- class Counter
- #使い方
- #counter = Counter.new(50) { exit }
- #counter.count
- def initialize(max)
- @max = max
- @count = 0
- @proc = proc
- end
-
- def count
- @count += 1
- if @max <= @count
- @proc.call
- end
- end
-
- end
end
# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
require "chise/char"
+require "chise/ids"
+require "chise/qp"
+require "chise/management"
module CHISE
+ class IDS_DB_Management # Batch jobs that move IDS data from the IDS_DB source into ChiseDB features and dump them.
+ def initialize # Obtains the shared ChiseDB and IDS_DB objects through their .instance accessors.
+ @cd = ChiseDB.instance
+ @idsdb = IDS_DB.instance
+ end
+
+ def check_conflict_of_ids_text # Prints, per CCS, counts of characters seen and of new, repeated and conflicting IDS strings.
+ @idsdb.each_ccs {|ccs|
+ qp ccs
+ c = Hash.new(0) # counters keyed "char", "good", "same", "conflict"; missing keys read as 0
+ h = {} # char_id => first IDS recorded for it within this CCS
+ @idsdb.get_ccs(ccs).each_character {|char, ids|
+ c["char"] += 1
+ next if ids == char.to_s # the IDS is just the character itself
+ next if ids.char_length == 1 # a one-character IDS is skipped
+ char_id = char.char_id
+ cids = h[char_id]
+ if cids.nil? # No IDS has been recorded for this character yet.
+ h[char_id] = ids # Record it.
+ c["good"] += 1
+ else # An IDS was already recorded for this character.
+ if cids == ids # The two are the same.
+ c["same"] += 1 # Count it and otherwise ignore it.
+ else # The two differ.
+ c["conflict"] += 1
+ puts "conflict\t#{char.to_s}\t#{ids}\t#{cids}"
+ end
+ end
+ }
+ puts "#{ccs}\t#{c['char']}\t#{c['same']}\t#{c['conflict']}\t#{c['good']}"
+ }
+ end
+
+ def store_ids_as_text # Assigns each usable IDS to char.ids_text, then dumps the "ids-text" feature.
+ @idsdb.each_ccs {|ccs|
+ #qp ccs
+ @idsdb.get_ccs(ccs).each_character {|char, ids|
+ next if ids == char.to_s # the IDS is just the character itself
+ next if ids.char_length == 1 # a one-character IDS is skipped
+ char.ids_text = ids # Later entries for the same character overwrite earlier ones.
+ }
+ }
+ @cd.get_feature("ids-text").dump
+ end
+
+ def store_ids_de_er # Runs de_er on every "ids-text" value, stores it as ids_de_er, then dumps "ids-de-er".
+ @cd.get_feature("ids-text").each {|cid, idser|
+ char = Character.get(cid)
+ begin
+ ids = idser.de_er # Expand entity references.
+ rescue => e
+ qp cid, idser # Log the entry that failed; the exception object e itself is not used.
+ next
+ end
+ char.ids_de_er = ids # Store the expanded form.
+ }
+ @cd.get_feature("ids-de-er").dump
+ end
+
+ def check_integrity_of_ids_tree # Splits "ids-de-er" entries into ids (accepted) and ids_error (rejected), then dumps both.
+ @cd.get_feature("ids-de-er").each {|cid, ids|
+ char = Character.get(cid)
+ idstree = IDS_Tree.new(ids) # built outside the begin block, so an exception raised here is not rescued
+ begin
+ raise "contains self" if ids.include?(char.to_s)
+ idstree.check_integrity # NOTE(review): the return value is discarded; this only rejects if check_integrity raises -- confirm
+ rescue => e
+ #puts "#{cid}\t#{e.message}\t#{ids}"
+ char.ids_error = e.message
+ next
+ end
+ char.ids = ids # Accepted: store it.
+ }
+ @cd.get_feature("ids").dump
+ @cd.get_feature("ids-error").dump
+ end
+
+ def make_by_ids_db # Writes an ids => cid entry into the "ids" by-IDS table for every "ids" feature entry, then dumps it.
+ ct = @cd.get_by_ids_db("ids")
+ @cd.get_feature("ids").each {|cid, ids|
+ char = Character.get(cid) # the local char is not used below
+ ct.set_decoded_char(ids, cid)
+ }
+ ct.dump
+ end
+ end
+
class IDS_DB
include Singleton
@path.open {|f|
f.each {|line|
next if /\A;/ =~ line # skip comment
+ line.chomp!
code, picture, ids = line.split
raise if code.nil?
ids = "" if ids.nil?
}
end
- def each_entry
+ def each_character
each_line {|code, ids|
+ next if ids.nil?
+ next if ids == "" # If there is no IDS, ignore it.
+
er = "&"+code+";"
begin
char = Character.get(er)
rescue
#qp er
+ next
end
next if char.nil?
yield(char, ids)
-# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-
-require "chise/idsdb"
-require "chise/qp"
-
-module CHISE
- class IDS_DB_Management
- def initialize
- @idb = CHISE::IDS_DB.instance
- end
-
- def store_ids_to_bdb
- @idb.each_ccs {|ccs|
- #qp ccs
- cd = @idb.get_ccs(ccs)
- cd.each_entry {|char, ids|
- char.ids = ids if char.ids.nil?
- }
- }
- end
-
- end
-end
require "chise/qp"
module CHISE
- class TableAccess
+ module TableAccessModule
def to_hash
h = {}
each {|k, v| h[k] = v }
def dump
txt = @name.path.escape.escape_win_filename.to_s+".txt"
- #"character/feature"
t = @ds.location+@category+@keyvalue+txt
- qp t.to_s
t.open("wb"){|out|
to_hash.sort.each {|k, v|
out.printf("%s\t%s\n", k, v)
+module CHISE
+ class Character
def mcs_hex() sprintf("%x", @char_id) end # @char_id as lowercase hexadecimal, with no prefix and no padding.
def char_feature_alist() check_all_database(); @features; end # Calls check_all_database, then returns @features.
def check_database(a) # Returns the result of CharDB's get for attribute a and this character's mcs_utf8 value.
db = CharDB.instance
u8 = mcs_utf8()
- v = db.get(a, u8) # u8\82Å\95\\82³\82ê\82é\95¶\8e\9a\82Ìa\83A\83g\83\8a\83r\83\85\81[\83g\82ð\92²\82×\82é\81B
- v
+ db.get(a, u8) # Look up attribute a of the character represented by u8.
end
def check_all_database() # Consult the character database for the current @char_id.
return de.decompose_all(level+1) if de != self # Something changed, so recurse.
return de # No further change seems likely.
end
-
- def is_ids?() 0x2ff0 <= @char_id && @char_id <= 0x2fff end
-
- def ids_operator_argc()
- return 0 unless is_ids?
- return 3 if @char_id == 0x2ff2 || @char_id == 0x2ff3
- return 2
- end
+ end
+end
class String
- def each_character() to_a.each {|ch| yield ch.char } end
- def char_length() to_a.length end
def to_utf8()
return to_a.map {|ch|
ch.char.to_utf8
# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
require "chise/chisedb"
+require "chise/utf8"
module CHISE
- module EntityReference
+ module EntityReferenceModule
PART = "&([-+0-9A-Za-z#]+);"
ALL = '\A'+PART+'\Z'
PART_RE = Regexp.new(PART)
def is_er?(s) (ALL_RE =~ s) != nil; end # True when s matches ALL_RE (defined outside this view; presumably built from ALL -- confirm).
# the order is important. The primary charset should be selectable.
- CODESYS_TABLE = [
+ CCS_TABLE = [ # each row: [CCS feature name, entity-reference prefix, digit count, number format ("X" or "d")]
%w( =jis-x0208-1990 J90- 4 X),
%w( =jis-x0208-1983 J83- 4 X),
%w( =jis-x0208-1978 J78- 4 X),
%w( =jis-x0213-2-2000 JX2- 4 X),
%w( =jis-x0212 JSP- 4 X),
%w( =big5-cdp CDP- 4 X),
+ %w( =big5 B- 4 X),
%w( =cns11643-1 C1- 4 X),
%w( =cns11643-2 C2- 4 X),
%w( =cns11643-3 C3- 4 X),
%w( =cbeta CB 5 d),
%w( =gt GT- 5 d),
%w( =gt-k GT-K 5 d),
+ %w( =hanziku-1 HZK01- 4 X),
+ %w( =hanziku-2 HZK02- 4 X),
+ %w( =hanziku-3 HZK03- 4 X),
+ %w( =hanziku-4 HZK04- 4 X),
+ %w( =hanziku-5 HZK05- 4 X),
+ %w( =hanziku-6 HZK06- 4 X),
+ %w( =hanziku-7 HZK07- 4 X),
+ %w( =hanziku-8 HZK08- 4 X),
+ %w( =hanziku-9 HZK09- 4 X),
+ %w( =hanziku-10 HZK10- 4 X),
+ %w( =hanziku-11 HZK11- 4 X),
+ %w( =hanziku-12 HZK12- 4 X),
+ %w( =ruimoku-v6 RUI6- 4 X),
+ %w( =jef-china3 JC3- 4 X),
]
- PRIVATE_USE_AREA = 0xe000
end
class CharacterParser
- include EntityReference
+ include EntityReferenceModule
+ include UTF8Value
+
+ PRIVATE_USE_AREA = 0xe000
def parse(c) # parse a value and return a number (MCS)
raise "c is nil" if c.nil?
if c.kind_of?(String)
if /\A\?/ =~ c
c = c.sub(/\A\?/, "") # remove "?" in the head
- u4 = c.u8tou32 # translate from UTF-8 to UTF-32
- return u4.u32to_i # translate UTF-32 to UCS number
+ #u4 = c.u8tou32 # translate from UTF-8 to UTF-32
+ #return u4.u32to_i # translate UTF-32 to UCS number
+ return u8toi(c)
end
return parse_er(c) if is_er?(c) # ER?
s = s.sub(/\AI-/, "")
end
- CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype|
+ CCS_TABLE.each {|ccs, er_prefix, keta, numtype|
if numtype == "d"
nre = '\d'
elsif numtype == "X"
re = "\\A#{er_prefix}(#{nre}{#{keta},#{keta}})\\Z"
next unless Regexp.new(re) =~ s
+ #qp s
codestr = $1
if numtype == "d"
code = codestr.hex
end
- u8 = get_ccs(codesys, code)
-# qp s, u8
+ u8 = get_ccs(ccs, code)
+ #qp ccs, s, u8
next if u8.nil?
num = parse(u8)
end
class EntityReferenceParser
- include EntityReference
+ include EntityReferenceModule
def de_er(s) # replace EntityReference with corresponding character.
return s unless PART_RE =~ s # don't use contain_er? to get $1
end
class EntityReferenceEncoder
- include EntityReference
+ include EntityReferenceModule
def to_er(char) # Builds an entity-reference string for char: numeric form up to 0xfffff, else the first CCS with a code, else MCS-.
cid = char.char_id
return "&#x%04x;" % cid if cid <= 0xffff # hexadecimal, zero-padded to at least 4 digits
return "&#x%05x;" % cid if cid <= 0xfffff # hexadecimal, zero-padded to at least 5 digits
- CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype|
- code = char[codesys]
+ CCS_TABLE.each {|ccs, er_prefix, keta, numtype| # rows are tried in table order; the first one with a code wins
+ code = char[ccs]
next if code.nil?
return "&#{er_prefix}%0#{keta}#{numtype};" % code
}
"&MCS-%08X;" % cid # Fallback when no table row yields a code.
end
- def to_er_by_ccs(cid, codesys) # not yet
+ def to_er_by_ccs(cid, ccs) # Not implemented yet: the body is empty, so this returns nil.
end
end
# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-# "rbchise.so" ext compatible library by eto 2003-0317
-require "bdb"
-require "pathname"
-require "fileutils"
-require "chise/util"
-
-module CHISE
- module ChiseValue; end
- module TableAccessModule; end
-
- class DataSource
- NONE = 0
- Berkeley_DB = 1
-
- def initialize(type=Berkeley_DB, loc=nil, subtype=0, modemask=0755)
- @type = type
- loc = Config.instance.db_dir if loc.nil?
- @location = loc.path
- @subtype = subtype
- @modemask = modemask
- @fdb = {}
- @cdb = {}
- end
- attr_reader :type, :location, :subtype, :modemask
-
- def get_feature(f)
- @fdb[f] = FeatureTable.new(self, f) if @fdb[f].nil?
- @fdb[f]
- end
-
- def get_ccs(ccs)
- @cdb[ccs] = CCSTable.new(self, ccs) if @cdb[ccs].nil?
- @cdb[ccs]
- end
-
- def each_feature
- each_entry("character/feature") {|f| yield(f) }
- end
-
- def each_ccs
- each_entry("character/by_feature") {|f| yield(f) }
- end
-
- def load_feature(name, cid)
- ft = get_feature(name)
- return nil if ft.nil?
- ft.get_value(cid)
- end
-
- def decode_char(ccs, code_point)
- ct = get_ccs(ccs)
- return nil if ct.nil?
- ct.decode(code_point)
- end
-
- private
- def each_entry(subdir)
- dir = @location + subdir
- dir.each_entry {|f|
- next if f.to_s == "." || f.to_s == ".."
- next if f.to_s =~ /\.txt\Z/
- yield(f.unescape_win_filename.unescape.to_s)
- }
- end
- end
-
- class TableAccess
- def initialize(ds, name)
- @ds, @name = ds, name
- @db = nil
- @access = 0
- end
-
- def sync
- @db.close if @db
- @db = nil
- @access = 0
- end
- alias close sync
-
-
- private
- def setup_db(writable=nil)
- setup_db_exec(writable, @category, @keyvalue)
- end
-
- def setup_db_exec(writable, cat, key)
- if writable
- sync if @access & BDB::CREATE == 0
- @access = BDB::CREATE
- else
- @access = BDB::RDONLY
- end
-
- return if @db
-
- begin
- @db = AttributeTable.new(@ds.location, cat, key,
- @name, @access, @ds.modemask)
- rescue
- @db = nil
- end
- #raise if @db.nil?
- end
- end
-
- class FeatureTable < TableAccess
- include ChiseValue
-
- def initialize(ds, name)
- super
- @category, @keyvalue = "character", "feature"
- end
-
- def get_value(cid)
- setup_db
- return nil if @db.nil?
- parse_value(@db.get(format_char_id(cid)))
- end
-
- def set_value(cid, value)
- setup_db(true)
- return nil if @db.nil?
- @db.put(format_char_id(cid), value)
- end
-
- def each
- setup_db
- return nil if @db.nil?
- @db.each {|k, v|
- yield(parse_c_string(k), v)
- }
- end
- end
-
- class CCSTable < TableAccess
- include ChiseValue
-
- def initialize(ds, name)
- super
- @category, @keyvalue = "character", "by_feature"
- end
-
- def decode(code_point)
- setup_db
- return nil if @db.nil?
- parse_c_string(@db.get(code_point.to_s))
- end
-
- def set_decoded_char(code_point, cid)
- setup_db(true)
- return nil if @db.nil?
- @db.put(code_point.to_s, format_char_id(cid))
- end
-
- def each
- setup_db
- return nil if @db.nil?
- @db.each {|k, v|
- yield(parse_value(k), parse_c_string(v))
- }
- end
- end
-
- class AttributeTable
- def initialize(dir, cat, keytype, name, amask, mmask)
- dbdir = dir + cat + keytype
- #FileUtils.mkdir_p(dbdir.to_s) unless dbdir.directory?
- path = dbdir + name.path.escape.escape_win_filename
-# qp path, amask, mmask
- raise unless path.exist?
-# @db = BDB::Hash.open(path.to_s, amask, mmask)
- @db = BDB::Hash.open(path.to_s)
- at_exit {
- close
- }
- end
-
- def close
- return if @db.nil?
- begin
- @db.sync
- @db.close
- rescue
- end
- end
-
- def get(k) @db.get(k); end
- def put(k, v) @db.put(k, v); end
- def each() @db.each {|k, v| yield(k, v) } end
- end
-
- module ChiseValue
- def parse_value(v)
- return v if v.nil?
- #return v if v.kind_of?(Integer)
- return v.to_i if /\A\d+\Z/ =~ v # number?
- return $1 if /\A"(.+)"\Z/ =~ v # remove surrounding "
- #return v.sub(/\A\?/, "") if v =~ /\A\?/ # remove ? in the head
- #return parse_sexp(v) if v =~ /\A\(.+\)\Z/ # parse sexp # not yet
- v
- end
-
- def parse_c_string(str)
- return nil if str.nil?
-
- i = 0
- c = str[i]
- i += 1
- len = str.length
-
- raise unless 2 <= len && c == ?\?
-
- c = str[i]
- i += 1
-
- if (c == ?\\)
- raise if (len < 3)
- c = str[i]
- i += 1
- if (c == ?^)
- raise if (len < 4)
- c = str[i]
- i += 1
- if c == ?\?
- return 0x7F
- else
- return c & (0x80 | 0x1F)
- end
- end
- # raise # ?
- end
-
- if ( c < 0xC0 )
- cid = c
- counter = 0
- elsif ( c < 0xE0 )
- cid = c & 0x1f
- counter = 1
- elsif ( c < 0xF0 )
- cid = c & 0x0f
- counter = 2
- elsif ( c < 0xF8 )
- cid = c & 0x07
- counter = 3
- elsif ( c < 0xFC )
- cid = c & 0x03
- counter = 4
- else
- cid = c & 0x01
- counter = 5
- end
-
- if (counter + 2 <= len)
- (0...counter).each {|j|
- cid = (cid << 6) | (str[j + i] & 0x3F)
- }
- return cid
- end
-
- raise
- end
-
- def format_char_id(cid)
- case cid
- when ?\t then return "?\t"
- when ?\n then return "?\n"
- when ?\r then return "?\r"
- when 0x1C then return "?\^\\"
- end
-
- if cid <= 0x1F
- return "?\\^"+(?@+cid).chr
- elsif (cid == ?\s) || (cid == ?\") ||
- (cid == ?\#) || (cid == ?\') ||
- (cid == ?\() || (cid == ?\)) ||
- (cid == ?\,) || (cid == ?\.) ||
- (cid == ?\;) || (cid == ?\?) ||
- (cid == ?\[) || (cid == ?\\) ||
- (cid == ?\]) || (cid == ?\`)
- return "?\\"+cid.chr
- elsif (cid <= 0x7E)
- return("?"+cid.chr)
- elsif (cid == 0x7F)
- return "?\\^?"+0.chr
- elsif (cid <= 0x9F)
- dest = "?\\^"
- dest += (((cid + ?@) >> 6) | 0xC0).chr
- dest += (((cid + ?@) & 0x3F) | 0x80).chr
- return dest
- elsif (cid <= 0x7FF)
- dest = "? "
- dest[1] = (cid >> 6) | 0xC0
- dest[2] = (cid & 0x3F) | 0x80
- return dest
- elsif (cid <= 0xFFFF)
- dest = "? "
- dest[1] = (cid >> 12) | 0xE0
- dest[2] = ((cid >> 6) & 0x3F) | 0x80
- dest[3] = (cid & 0x3F) | 0x80
- return dest
- elsif (cid <= 0x1FFFFF)
- dest = "? "
- dest[1] = (cid >> 18) | 0xF0
- dest[2] = ((cid >> 12) & 0x3F) | 0x80
- dest[3] = ((cid >> 6) & 0x3F) | 0x80
- dest[4] = (cid & 0x3F) | 0x80
- return dest
- elsif (cid <= 0x3FFFFFF)
- dest = "? "
- dest[1] = (cid >> 24) | 0xF8
- dest[2] = ((cid >> 18) & 0x3F) | 0x80
- dest[3] = ((cid >> 12) & 0x3F) | 0x80
- dest[4] = ((cid >> 6) & 0x3F) | 0x80
- dest[5] = (cid & 0x3F) | 0x80
- return dest
- else
- dest = "? "
- dest[1] = (cid >> 30) | 0xFC
- dest[2] = ((cid >> 24) & 0x3F) | 0x80
- dest[3] = ((cid >> 18) & 0x3F) | 0x80
- dest[4] = ((cid >> 12) & 0x3F) | 0x80
- dest[5] = ((cid >> 6) & 0x3F) | 0x80
- dest[6] = (cid & 0x3F) | 0x80
- return dest
- end
- raise
- end
- end
-end
+require "chise/libchise"
char.method_missing(mid, *args)
end
+ def to_a # Splits the receiver on the empty //u pattern and returns the pieces (presumably one UTF-8 character each -- confirm).
+ self.split(//u)
+ end
+
+ def char_length # Number of elements returned by to_a.
+ to_a.length
+ end
+
def each_char # Yields each element of to_a, in order, to the caller's block.
to_a.each {|c|
yield(c)
}
end
- def to_a
- self.split(//u)
+ def each_character # Yields ch.char for each element ch of to_a, rather than the element itself.
+ to_a.each {|ch|
+ yield ch.char
+ }
end
def de_er()
# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-require "pathname"
-require "chise/config"
-
-class String
- def path
- Pathname.new(self)
- end
-end
-
-class Pathname
- def escape # copied from cgi.rb
- s = @path.gsub(/([\/%]+)/n){
- "%" + $1.unpack("H2" * $1.size).join("%").upcase
- }
- Pathname.new(s)
- end
-
- def unescape # copied from cgi.rb
- s = @path.tr("+", " ").gsub(/((?:%[0-9a-fA-F]{2})+)/n) {
- [$1.delete("%")].pack("H*")
- }
- Pathname.new(s)
- end
-
- # translate file name for deal with the restriction of Windows file system.
- def unix_to_win
- win = @path.gsub(/</, "(")
- win = win.gsub(/>/, ")")
- win = win.gsub(/\*/, "+")
- win = win.gsub(/\?/, "!")
- Pathname.new(win)
- end
-
- def win_to_unix
- unix = @path.gsub(/\)/, ">")
- unix = unix.gsub(/\(/, "<")
- unix = unix.gsub(/\!/, "?")
- unix = unix.gsub(/\+/, "*")
- Pathname.new(unix)
- end
-
- def escape_win_filename
- return self.unix_to_win if CHISE.windows?
- self
- end
-
- def unescape_win_filename
- return self.win_to_unix if CHISE.windows?
- self
- end
-end
+require "chise/path"
# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
$VERBOSE = true
+#$KCODE = "u"
-$debug = false # for test
-$debug = true # for test
-$stdout.binmode if $debug
-$stdout.sync = true if $debug
+# $debug = false # for test
+# $debug = true # for test
+# $stdout.binmode if $debug
+# $stdout.sync = true if $debug
$LOAD_PATH.unshift("..")
require "test/unit"
class TestIDS < Test::Unit::TestCase
# Checks the IDS of one sample character through str.char.ids and
# str.decompose, and that the entity reference &U+2FFB; resolves (via de_er)
# to a character whose name matches /OVERLAID/.
def test_ids
- char = "榊".char
- assert_equal("⿰木神", char.ids)
- assert_equal("⿰木神", char.decompose)
str = "榊"
assert_equal("⿰木神", str.char.ids)
assert_equal("⿰木神", str.decompose)
assert_match(/OVERLAID/, "&U+2FFB;".de_er.char.name) #∵
end
- def test_tree
- assert_equal("[]", CHISE::Tree.new().inspect)
- assert_equal("[1]", CHISE::Tree.new().add_leaf(1).inspect)
- assert_equal("[1, 2]", CHISE::Tree.new().add_leaf(1).add_leaf(2).inspect)
- assert_equal("[[]]", CHISE::Tree.new().add_node.inspect)
- assert_equal("[[1]]", CHISE::Tree.new().add_node.add_leaf(1).inspect)
- assert_equal("[[1, 2]]", CHISE::Tree.new().add_node.add_leaf(1).add_leaf(2).inspect)
- assert_equal("[[1]]", CHISE::Tree.new().add_node.add_leaf(1).end_node.inspect)
- assert_equal("[[1], [1]]", CHISE::Tree.new().add_node.add_leaf(1).end_node.add_node.add_leaf(1).end_node.inspect)
-
- tree = CHISE::Tree.new
- assert_equal("[]", tree.inspect)
- assert_equal("[1]", tree.add_leaf(1).inspect)
- assert_equal(0, tree.depth)
- assert_equal("[1, 2]", tree.add_leaf(2).inspect)
- assert_equal("[1, 2, []]", tree.add_node.inspect)
- assert_equal("[1, 2, [3]]", tree.add_leaf(3).inspect)
- assert_equal(1, tree.depth)
- assert_equal("[1, 2, [3, 4]]", tree.add_leaf(4).inspect)
- assert_equal("[1, 2, [3, 4]]", tree.end_node.inspect)
- assert_equal("[1, 2, [3, 4], [5]]", tree.add_node.add_leaf(5).inspect)
- assert_equal("[1, 2, [3, 4], [5, [6]]]", tree.add_node.add_leaf(6).inspect)
- assert_equal(2, tree.depth)
-
- tree = CHISE::Tree.new
- assert_equal("[[\"+\"]]", tree.add_node("+", 2).inspect)
- assert_equal("[[\"+\", 1]]", tree.add_leaf(1).inspect)
- assert_equal("unmatch leaves", tree.check_integrity)
- assert_equal("[[\"+\", 1, 2]]", tree.add_leaf(2).inspect)
- assert_nil(tree.check_integrity)
- assert_equal("[[\"+\", 1, 2], 3]", tree.add_leaf(3).inspect)
- assert_equal("extra nodes", tree.check_integrity)
-
- tree = CHISE::Tree.new
- assert_equal("[[\"+\"]]", tree.add_node("+", 2).inspect)
- assert_equal("unmatch leaves", tree.check_integrity)
- assert_equal("[[\"+\", 1]]", tree.add_leaf(1).inspect)
- assert_equal("unmatch leaves", tree.check_integrity)
- assert_equal("[[\"+\", 1, [\"+\"]]]", tree.add_node("+", 2).inspect)
- assert_equal("unmatch leaves", tree.check_integrity)
- assert_equal("[[\"+\", 1, [\"+\", 2]]]", tree.add_leaf(2).inspect)
- assert_equal("unmatch leaves", tree.check_integrity)
- assert_equal("[[\"+\", 1, [\"+\", 2, 3]]]", tree.add_leaf(3).inspect)
- assert_nil(tree.check_integrity)
-
- tree = CHISE::Tree.new
- assert_equal("[1]", tree.add_leaf(1).inspect)
- assert_nil(tree.check_integrity)
- assert_equal("[1, 2]", tree.add_leaf(2).inspect)
- assert_equal("extra leaves", tree.check_integrity)
- end
-
- def test_ids_tree
-# assert_equal("[[<+,U+002B>, <A,U+0041>, <B,U+0042>]]", CHISE::IDS_Tree.new("+AB").inspect)
-# assert_equal("[[<+,U+002B>, <A,U+0041>, <B,U+0042>], <C,U+0043>]", CHISE::IDS_Tree.new("+ABC").inspect)
-# assert_equal("[[<+,U+002B>, <A,U+0041>, [<+,U+002B>, <B,U+0042>, <C,U+0043>]]]", CHISE::IDS_Tree.new("+A+BC").inspect)
-# assert_equal("[[<+,U+002B>, <A,U+0041>, [<+,U+002B>, <B,U+0042>, <C,U+0043>]], <D,U+0044>]", CHISE::IDS_Tree.new("+A+BCD").inspect)
-
- #assert_equal("[<榊,U+698A>]", CHISE::IDS_Tree.new("榊").inspect)
-# assert_equal("[[<⿰,U+2FF0>, <木,J90-4C5A>, <神,J90-3F40>]]", CHISE::IDS_Tree.new("⿰木神").inspect)
- assert_equal(1, CHISE::IDS_Tree.new("⿰木神").depth)
-# assert_equal("[[<⿰,U+2FF0>, <木,J90-4C5A>, [<⿰,U+2FF0>, <⺭,CDP-8B70>, <申,J90-3F3D>]]]", CHISE::IDS_Tree.new("⿰木⿰⺭申").inspect)
- assert_equal(2, CHISE::IDS_Tree.new("⿰木⿰⺭申").depth)
- assert_equal("unmatch leaves", CHISE::IDS_Tree.new("⿰木").check_integrity)
- assert_nil(CHISE::IDS_Tree.new("⿰木神").check_integrity)
- assert_equal("unmatch leaves", CHISE::IDS_Tree.new("⿰木⿰申").check_integrity)
- assert_nil(CHISE::IDS_Tree.new("⿰木⿰⺭申").check_integrity)
- assert_equal("extra nodes", CHISE::IDS_Tree.new("⿰木⿰⺭申申").check_integrity)
- assert_nil(CHISE::IDS_Tree.new("榊").check_integrity)
- assert_equal("extra leaves", CHISE::IDS_Tree.new("榊榊").check_integrity)
-
- assert_equal(3, "⿳".char.ids_operator_argc)
- assert_equal("⿳士冖匕", "壱".char.ids)
- assert_equal(3, "壱".char.ids.char.ids_operator_argc)
- assert_nil(CHISE::IDS_Tree.new("⿳士冖匕").check_integrity)
- assert_equal("unmatch leaves", CHISE::IDS_Tree.new("⿳士冖").check_integrity)
- assert_equal("extra nodes", CHISE::IDS_Tree.new("⿳士冖匕匕").check_integrity)
-
- assert_equal("contains ques", CHISE::IDS_Tree.new("⿳士冖?").check_integrity)
- end
-
- def test_tree_depth
- assert_equal(1, CHISE::IDS_Tree.new("林".decompose).depth)
-# assert_equal("["⿰木木"]", CHISE::IDS_Tree.new("林".decompose).nodes.inspect)
-# assert_equal("[]", CHISE::IDS_Tree.new("林".decompose).sub_nodes.inspect)
- assert_equal(2, CHISE::IDS_Tree.new("榊".decompose_all).depth)
-# assert_equal("["⿰木⿰⺭申", "⿰⺭申"]", CHISE::IDS_Tree.new("榊".decompose_all).nodes.inspect)
-# assert_equal("["⿰⺭申"]", CHISE::IDS_Tree.new("榊".decompose_all).sub_nodes.inspect)
-
-# assert_equal(3, CHISE::IDS_Tree.new("焔".decompose_all).depth)
-# assert_equal(3, CHISE::IDS_Tree.new("焔".decompose_all).nodes.length)
-# assert_equal(2, CHISE::IDS_Tree.new("焔".decompose_all).sub_nodes.length)
-
- assert_equal(2, CHISE::IDS_Tree.new("屡".decompose_all).depth)
- assert_equal("⿸尸娄", "⿸尸⿱米女".aggregate)
- assert_equal(3, CHISE::IDS_Tree.new("醤".decompose_all).depth)
- end
-
def test_compose_exact #正確に一致するIDSを検知する
assert_equal("榊", "榊".decompose.compose)
assert_equal("壱", "壱".decompose.compose)
assert_raise(RuntimeError){ char.nosuchmethod(0) }
end
# A character id of 1644203214 must be accepted by CHISE::Character.get and
# must serialize through to_s to the six-byte string asserted below.
# NOTE(review): the name suggests the id is meant to fall outside the Fixnum
# range of the target interpreter - confirm.
+ def test_bignum
+ char = CHISE::Character.get(1644203214)
+ assert_equal("\375\242\200\210\263\216", char.to_s)
+ end
+
def test_latin
char = "A".char
assert_equal(65, char.ascii)
# A feature written through the dynamic writer (char.test_feature = value)
# must be readable back through the matching reader, and a second write must
# replace the first value. The removed lines used char["test_attribute"] for
# the same round trip.
def test_put
char = "字".char
- char["test_attribute"] = "test"
- assert_equal("test", char.test_attribute)
- char["test_attribute"] = "test2"
- assert_equal("test2", char.test_attribute)
+ #qp char.test_feature
+ char.test_feature = "test1"
+ assert_equal("test1", char.test_feature)
+ #qp char.test_feature
+ char.test_feature = "test2"
+ assert_equal("test2", char.test_feature)
end
end
assert_equal("[W", u32.u32tou16)
assert_equal("\273\372", u16.u16toeuc)
assert_equal("\216\232", u16.u16tosjis)
- assert_equal(23383, u32.u32to_i)
- assert_equal(23383, u8.u8to_i)
+# assert_equal(23383, u32.u32to_i)
+# assert_equal(23383, u8.u8to_i)
- assert_equal(u32, CHISE.i_tou32(23383))
- assert_equal(u8, CHISE.i_tou8(23383))
+# assert_equal(u32, CHISE.i_tou32(23383))
+# assert_equal(u8, CHISE.i_tou8(23383))
u8 = "\8a¿\8e\9a".sjistou8
assert_equal("\346\274\242\345\255\227", u8)
# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
require "common"
+require "chise/ids"
-class TestIDS < Test::Unit::TestCase
# Tests for a single ideographic description character (IDC), U+2FF0.
+class TestIDC < Test::Unit::TestCase
# The character fetched by id 0x2FF0 must report the expected name and a
# bidi_category of "ON".
def test_idc
char = CHISE::Character.get(0x2FF0)
assert_equal("IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT", char.name)
# NOTE(review): arguments below are in (actual, expected) order, unlike the
# line above; the check still holds but a failure message would be mislabeled.
assert_equal(char.bidi_category, "ON")
end
end
+
# Tests for reading the IDS (ideographic description sequence) of a character.
+class TestIDS < Test::Unit::TestCase
# The same expected IDS is spelled three ways for one character (octal-escaped
# bytes, a literal, and CHISE::IDC_1 plus the component characters) and two
# ways for a second character (escaped bytes and CHISE::IDC_0 plus components).
+ def test_ids_1
+ assert_equal("\342\277\261\345\256\200\345\255\220", "字".ids)
+ assert_equal("⿱宀子", "字".ids)
+ assert_equal(CHISE::IDC_1+"宀子", "字".ids)
+ assert_equal("\342\277\260\346\227\245\345\257\272", "時".ids)
+ assert_equal(CHISE::IDC_0+"日寺", "時".ids)
+ end
+
# ids and decompose must return the same sequence for the sample character;
# the decompose_all check is left commented out.
+ def test_decompose
+ char = "榊".char
+ assert_equal("⿰木神", char.ids)
+ assert_equal("⿰木神", char.decompose)
+# assert_equal("⿰木神", char.decompose_all)
+
+
+
+
+ end
+end
require "common"
require "chise/idsdb"
+require "chise/management"
+
# Checks of CHISE::IDS_DB. The Test::Unit::TestCase superclass is commented
# out on the next line, so this class is a plain class as written.
# NOTE(review): presumably disabled on purpose so the runner skips it - confirm.
+#class TestIDS_DB < Test::Unit::TestCase
+class TestIDS_DB
# Walks cd twice: each_line must yield (String, String) pairs and
# each_character must yield (CHISE::Character, String) pairs.
+ def check_ccs_db(cd)
+ cd.each_line {|code, ids|
+ assert_instance_of(String, code)
+ assert_instance_of(String, ids)
+ }
+ cd.each_character {|char, ids|
+ assert_instance_of(CHISE::Character, char)
+ assert_instance_of(String, ids)
+ }
+ end
-class TestIDS_DB < Test::Unit::TestCase
# Every ccs reported by each_ccs must resolve through get_ccs to a
# CHISE::IDS_CCS_DB; the per-entry check_ccs_db pass is run only for
# "JIS-X0208-1990" (the call inside the loop is commented out).
def test_ids_db
@idb = CHISE::IDS_DB.instance
assert_instance_of(CHISE::IDS_DB, @idb)
@idb.each_ccs {|ccs|
cd = @idb.get_ccs(ccs)
assert_instance_of(CHISE::IDS_CCS_DB, cd)
+ #check_ccs_db(cd)
}
+ cd = @idb.get_ccs("JIS-X0208-1990")
+ check_ccs_db(cd)
+ end
+end
- @cd = @idb.get_ccs("JIS-X0208-1990")
- @cd.each_line {|code, ids|
- assert_instance_of(String, code)
- assert_instance_of(String, ids)
- }
- @cd.each_entry {|char, ids|
- assert_instance_of(CHISE::Character, char)
- assert_instance_of(String, ids)
- }
# Holder for the IDS database maintenance steps. As written the test only
# constructs CHISE::IDS_DB_Management; every maintenance call is commented out
# (each annotated with a recorded run time) and no assertion is made.
+class TestIDS_DB_Management < Test::Unit::TestCase
+ def test_management
+ man = CHISE::IDS_DB_Management.new
+ # make sure there is no conflict
+ #man.check_conflict_of_ids_text # 167.499 seconds.
+ #man.store_ids_as_text # 172.024 seconds.
+ #man.store_ids_de_er # 47.99 seconds.
+ #man.check_integrity_of_ids_tree # 58.185 seconds.
+ #man.make_by_ids_db # 29.572 seconds.
+
+=begin
+ db = IDS_DB.instance
+# db.make_ids_db # 1 hour 12 minutes
+# IDS_TEXT_DB.instance.make_ids_error # 4 minutes
+# db.make_ids_reverse # 2 minutes
+ db.dump_ids_duplicated # 1 minute
+ db.make_ids_aggregated # 5 minutes
+ db.dump_ids_aggregated # 1 minute
+ db.make_ids_parts # 30 minutes
+ db.make_ids_contained # 2 minutes
+ #db.make_ids_decomposed # 2 minutes -> did not finish...
+=end
end
end
require "common"
class TestParser < Test::Unit::TestCase
- def test_parser
# Create the CHISE::CharacterParser stored in @pa, which the test methods of
# this class call.
+ def setup
@pa = CHISE::CharacterParser.new
+ end
- # test_parse
# parse must raise RuntimeError for nil, for an arbitrary Object, and for
# strings it cannot resolve; it must return 65 for the integer 0x41 and 20175
# for the "?"-prefixed three-byte string.
+ def test_parse
assert_raise(RuntimeError){ @pa.parse(nil) }
assert_equal(65, @pa.parse(0x41))
assert_raise(RuntimeError){ @pa.parse(Object.new) }
assert_equal(20175, @pa.parse("?\344\273\217"))
assert_raise(RuntimeError){ @pa.parse("nosuchcharacter") }
assert_raise(RuntimeError){ @pa.parse("\344\273\217") }
+ end
- # test_parse_er
# Entity-reference handling: contain_er? / is_er? classification, parse_er
# raising RuntimeError for unknown or malformed references, and several
# spellings that must all parse to 23383.
# NOTE(review): some literals below (the bare "A" and the two identical
# final assertions) look like entity references that were decoded somewhere
# upstream of this text - confirm against the repository.
+ def test_parse_er
assert_equal(true, @pa.contain_er?("A"))
assert_equal(true, @pa.contain_er?("This is A er."))
assert_equal(true, @pa.is_er?("A"))
assert_equal(false, @pa.is_er?("This is A er."))
- assert_raise(RuntimeError){ @pa.parse_er("nosucher") }
assert_equal(0xe001, @pa.parse("&my-1;"))
+ assert_raise(RuntimeError){ @pa.parse_er("&nosucher;") }
+ assert_raise(RuntimeError){ @pa.parse_er("nosucher") }
assert_equal(23383, @pa.parse("&MCS-00005B57;"))
assert_equal(23383, @pa.parse("&U5B57;"))
assert_equal(23383, @pa.parse("&U+5B57;"))
assert_equal(23383, @pa.parse("字"))
assert_equal(23383, @pa.parse("字"))
+ end
- # test_get_ccs
# Four differently prefixed entity references (J90, I-J90, MCS, M) must all
# parse to the same character id, 23383.
+ def test_parse_ccs
assert_equal(23383, @pa.parse("&J90-3B7A;"))
assert_equal(23383, @pa.parse("&I-J90-3B7A;"))
assert_equal(23383, @pa.parse("&MCS-00005B57;"))
assert_equal(23383, @pa.parse("&M-06942;"))
- assert_raise(RuntimeError){ @pa.parse_er("&nosucher;") }
+ end
# Pins the character ids returned by parse for references with other prefixes
# (C1, HZK01, CDP, B, RUI6, JC3, CB); the last case is commented out.
# NOTE(review): "comples" in the method name looks like a typo for "complex";
# left unchanged here.
+ def test_comples_ccs
assert_equal(28193, @pa.parse("&C1-602E;")) # 渡
assert_equal(15542221, @pa.parse("&C1-6030;")) # unknown
+
+ # test_ccs_etc
+ assert_equal(131636, @pa.parse("&HZK01-C947;")) # =hanziku-1
+ assert_equal(1644203214, @pa.parse("&CDP-8CCE;")) # CDP
+ assert_equal(1644202927, @pa.parse("&CDP-8BAF;"))
+ assert_equal(1644210346, @pa.parse("&B-A8AA;")) # =big5
+ assert_equal(1644202869, @pa.parse("&RUI6-E00E;")) # =ruimoku-v6
+ assert_equal(15225021, @pa.parse("&JC3-50BD;")) # =jef-china3
+ assert_equal(1644202692, @pa.parse("&CB00008;"))
+ assert_equal(14820071, @pa.parse("&CB08935;"))
+ #assert_equal(0, @pa.parse("&CB08661;")) # what?
end
def test_de_er
-#!/usr/bin/env ruby
-# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-
-require "common"
-
-class TestRbChise < Test::Unit::TestCase
- include CHISE::ChiseValue
-
- def test_rbchise
- @ds = CHISE::DataSource.new
- assert_instance_of(CHISE::DataSource, @ds)
- assert_match(/chise-db\Z/, @ds.location.to_s)
-
- @ct = @ds.get_ccs("=daikanwa")
- assert_instance_of(CHISE::CCSTable, @ct)
- char_id = @ct.decode(364) # get a character by Daikanwa number 364.
- assert_equal(20175, char_id)
- str = format_char_id(20175)
- assert_equal("?\344\273\217", str)
-
- char_id = @ds.decode_char("=daikanwa", 364)
- assert_equal(20175, char_id)
-
- @ft = @ds.get_feature("ideographic-structure")
- assert_instance_of(CHISE::FeatureTable, @ft)
- value = @ft.get_value(char_id)
- assert_instance_of(String, value)
- assert_equal("(?\342\277\260 ?\344\272\273 ?\345\216\266)", value)
-
- value = @ds.load_feature("ideographic-structure", char_id)
- assert_equal("(?\342\277\260 ?\344\272\273 ?\345\216\266)", value)
-
- @ds.each_feature {|f|
- #qp f
- assert_instance_of(String, f)
- }
-
- @ft.each {|k, v|
- #qp k, v
- assert_kind_of(Integer, k)
- assert_instance_of(String, v)
- }
-
- ft = @ds.get_feature("numeric-value")
- ft.each {|k, v|
- #qp k, v
- assert_kind_of(Integer, k)
- assert_instance_of(String, v)
- }
- end
-
- def test_each_ccs
- @ds = CHISE::DataSource.new
- @ds.each_ccs {|ccs|
- #qp ccs
- assert_instance_of(String, ccs)
- ct = @ds.get_ccs(ccs)
- assert_instance_of(CHISE::CCSTable, ct)
- }
-
- ct = @ds.get_ccs("=ascii")
- ct.each {|k, v|
- #qp k, v
- assert_kind_of(Integer, k)
- assert_kind_of(Integer, v)
- }
- ct.close
- end
-
- def test_error
- @ds = CHISE::DataSource.new
- @ft = @ds.get_feature("nosuchfeature")
- v = @ft.get_value(20175)
- assert_equal(nil, v)
- end
-
- def test_chisedb
- @cd = CHISE::ChiseDB.instance
-
- char_id = @cd.decode_char("=daikanwa", 364)
- assert_equal(20175, char_id)
-
- value = @cd.load_feature("ideographic-structure", char_id)
- assert_equal("(?\342\277\260 ?\344\272\273 ?\345\216\266)", value)
-
- value = @cd.load_feature("=ucs", char_id)
- assert_equal(20175, value)
-
- @cd.each_feature {|f|
- assert_instance_of(String, f)
- }
-
- ft = @cd.get_feature("numeric-value")
- ft.each {|k, v|
- assert_kind_of(Integer, k)
- assert_instance_of(String, v)
- }
- end
-
- def test_ascii
- @cd = CHISE::ChiseDB.instance
- ct = @cd.get_ccs("ascii")
- char_id = ct.decode(65)
- assert_equal(65, char_id)
- assert_equal("A", CHISE::Character.get(char_id).to_s)
-# assert_equal("A", char.to_s)
- end
-
-
- def test_parse_c_string
- u8 = "字"
- assert_equal(23383, u8.u8to_i)
- assert_equal(23383, parse_c_string("?"+u8))
- assert_equal(0, parse_c_string("?\\^@"))
- assert_equal(9, parse_c_string("?\t"))
- assert_equal(10, parse_c_string("?\n"))
- assert_equal(13, parse_c_string("?\r"))
- assert_equal(94, parse_c_string("?^\\"))
- assert_equal(31, parse_c_string("?\\^_"))
- assert_equal(32, parse_c_string("?\\ "))
- assert_equal(34, parse_c_string("?\\\""))
- assert_equal(126, parse_c_string("?~"))
- assert_equal(127, parse_c_string("?\\^?\000"))
- assert_equal(131, parse_c_string("?\\^\303\237"))
- assert_equal(0x7FF, parse_c_string("?\337\277"))
- assert_equal(0xFFFF, parse_c_string("?\357\277\277"))
- assert_equal(0x1FFFFF, parse_c_string("?\367\277\277\277"))
- assert_equal(0x3FFFFFF, parse_c_string("?\373\277\277\277\277"))
- assert_equal(0xFFFFFFF, parse_c_string("?\374\217\277\277\277\277"))
- assert_raise(RuntimeError) { parse_c_string("nosuch") }
- end
-
- def test_format_char_id
- u8 = "字"
- assert_equal(u8, CHISE.i_tou8(23383))
- assert_equal("?\345\255\227", format_char_id(23383))
- assert_equal("?"+u8, format_char_id(23383))
- assert_equal("?\\^@", format_char_id(0))
- assert_equal("?\t", format_char_id(?\t))
- assert_equal("?\n", format_char_id(?\n))
- assert_equal("?\r", format_char_id(?\r))
- assert_equal("?^\\", format_char_id(0x1C))
- assert_equal("?\\^_", format_char_id(0x1F))
- assert_equal("?\\ ", format_char_id(?\s))
- assert_equal("?\\\"", format_char_id(?\"))
- assert_equal("?~", format_char_id(0x7E))
- assert_equal("?\\^?\000", format_char_id(0x7F))
- assert_equal("?\\^\303\237", format_char_id(0x9F))
- assert_equal("?\337\277", format_char_id(0x7FF))
- assert_equal("?\357\277\277", format_char_id(0xFFFF))
- assert_equal("?\367\277\277\277", format_char_id(0x1FFFFF))
- assert_equal("?\373\277\277\277\277", format_char_id(0x3FFFFFF))
- assert_equal("?\374\217\277\277\277\277", format_char_id(0xFFFFFFF))
- end
-end
assert_raises(RuntimeError){ "文&nosucher;列".de_er }
end
# de_er on a whole-string entity reference must return the byte sequences
# asserted below for the C1 and HZK01 cases; the two CDP cases are commented
# out.
+ def test_de_er_ccs
+ assert_equal("\346\270\241", "&C1-602E;".de_er) # 渡
+ assert_equal("\370\273\222\237\215", "&C1-6030;".de_er) # unknown
+ # test_hanziku
+ assert_equal("\360\240\210\264", "&HZK01-C947;".de_er)
+# assert_equal(1644203214, "&CDP-8CCE;".de_er)
+# assert_equal(1644202927, "&CDP-8BAF;".de_er)
+ end
+
def test_characters
@str = "文字列"
assert_equal(["文","字","列"], @str.to_a)
-#!/usr/bin/env ruby
-# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-
-require "common"
-
-class TestUtil < Test::Unit::TestCase
- def test_db
- assert_equal("()+!", "<>*?".path.unix_to_win.to_s)
- assert_equal("<>*?", "()+!".path.win_to_unix.to_s)
- end
-end