From 66837463b5d1d847c8f8043c1e08615aa7bbd8e3 Mon Sep 17 00:00:00 2001 From: eto Date: Fri, 3 Oct 2003 17:46:34 +0000 Subject: [PATCH] add new codesys support --- src/chise.rb | 92 ++++++++++++++++++++++++++++++++------------------ src/db.rb | 16 ++++----- t/tc_char.rb | 6 ++-- t/tc_db.rb | 17 ++++++---- t/tc_ids.rb | 8 ++--- t/tc_str.rb | 10 ++++-- t/ts_chise.rb | 2 +- tools/make_ids_db.rb | 16 ++++----- 8 files changed, 102 insertions(+), 65 deletions(-) diff --git a/src/chise.rb b/src/chise.rb index 06b7e8d..f14b59f 100755 --- a/src/chise.rb +++ b/src/chise.rb @@ -156,34 +156,58 @@ module CHISE #================================================================== DB_DIR = 'd:/work/chise/char-db' #この後に/sysmtem-char-id/ucsという感じに続く IDS_DB_DIR = 'd:/work/chise/ids/' #この後にIDS-JIS-X0208-1990.txtという感じに続く else - DB_DIR = '/usr/local/lib/xemacs-21.4.10/i686-pc-linux/char-db' #この後に/sysmtem-char-id/ucsという感じに続く + DB_DIR = '/usr/local/lib/xemacs-21.4.12/i686-pc-linux/char-db' #この後に/sysmtem-char-id/ucsという感じに続く IDS_DB_DIR = '/home/eto/work/chise/ids/' #この後にIDS-JIS-X0208-1990.txtという感じに続く end class EntityReference #====================================================================== #状況によってどのERに変換するかが異なる可能性があるので、普通のclassとして実装したほうがいい? +# CODESYS_TABLE = [ +# %w( chinese-big5-cdp CDP- 4 X), +# %w( ideograph-daikanwa M- 5 d), +# %w( ideograph-cbeta CB 5 d), +# %w( ideograph-gt GT- 5 d), +# %w( ideograph-gt-k GT-K 5 d), +# %w( japanese-jisx0208-1990 J90- 4 X), +# %w( japanese-jisx0208 J83- 4 X), +# %w( japanese-jisx0213-1 JX1- 4 X), +# %w( japanese-jisx0213-2 JX2- 4 X), +# %w( japanese-jisx0212 JSP- 4 X), +# %w( japanese-jisx0208-1978 J78- 4 X), +# %w( chinese-cns11643-1 C1- 4 X), +# %w( chinese-cns11643-2 C2- 4 X), +# %w( chinese-cns11643-3 C3- 4 X), +# %w( chinese-cns11643-4 C4- 4 X), +# %w( chinese-cns11643-5 C5- 4 X), +# %w( chinese-cns11643-6 C6- 4 X), +# %w( chinese-cns11643-7 C7- 4 X), +# %w( korean-ksc5601 K0- 4 X), +# ] +# CODESYS_ORDER = %w(japanese chinese korean ideograph) CODESYS_TABLE = [ - %w( chinese-big5-cdp CDP- 4 X), - %w( ideograph-daikanwa M- 5 d), - %w( ideograph-cbeta CB 5 d), - %w( ideograph-gt GT- 5 d), - %w( ideograph-gt-k GT-K 5 d), - %w( japanese-jisx0208-1990 J90- 4 X), - %w( japanese-jisx0208 J83- 4 X), - %w( japanese-jisx0213-1 JX1- 4 X), - %w( japanese-jisx0213-2 JX2- 4 X), - %w( japanese-jisx0212 JSP- 4 X), - %w( japanese-jisx0208-1978 J78- 4 X), - %w( chinese-cns11643-1 C1- 4 X), - %w( chinese-cns11643-2 C2- 4 X), - %w( chinese-cns11643-3 C3- 4 X), - %w( chinese-cns11643-4 C4- 4 X), - %w( chinese-cns11643-5 C5- 4 X), - %w( chinese-cns11643-6 C6- 4 X), - %w( chinese-cns11643-7 C7- 4 X), - %w( korean-ksc5601 K0- 4 X), + %w( =jis-x0208-1990 J90- 4 X), + %w( =jis-x0208-1983 J83- 4 X), + %w( =jis-x0208-1978 J78- 4 X), + %w( =jis-x0208 J90- 4 X), #継承のアドホックな実装 + %w( =jis-x0208 J83- 4 X), #継承のアドホックな実装 + %w( =jis-x0208 J78- 4 X), #継承のアドホックな実装 + %w( =jis-x0213-1-2000 JX1- 4 X), + %w( =jis-x0213-2-2000 JX2- 4 X), + %w( =jis-x0212 JSP- 4 X), + %w( =big5-cdp CDP- 4 X), + %w( =cns11643-1 C1- 4 X), + %w( =cns11643-2 C2- 4 X), + %w( =cns11643-3 C3- 4 X), + %w( =cns11643-4 C4- 4 X), + %w( =cns11643-5 C5- 4 X), + %w( =cns11643-6 C6- 4 X), + %w( =cns11643-7 C7- 4 X), + %w( =ks-x1001 K0- 4 X), + %w( =daikanwa M- 5 d), + %w( =cbeta CB 5 d), + %w( =gt GT- 5 d), + %w( =gt-k GT-K 5 d), ] - CODESYS_ORDER = %w(japanese chinese korean ideograph) REGEXP_PART = "&([-+0-9A-Za-z#]+);" REGEXP_ALL = "^#{REGEXP_PART}$" @@ -202,13 +226,14 @@ module CHISE #================================================================== each_codesys {|codesys, er_prefix, keta, numtype| #p [codesys, er_prefix, keta, numtype] numtyperegex = '\d' #if numtype == 'd' numtyperegex = '[0-9A-Fa-f]' if numtype == 'X' - regexpstr = "^#{er_prefix}(#{numtyperegex}{#{keta},#{keta}})$" #p regexpstr + regexpstr = "^#{er_prefix}(#{numtyperegex}{#{keta},#{keta}})$" if er =~ Regexp.new(regexpstr) codestr = $1 code = codestr.to_i #if numtype == 'd' code = codestr.hex if numtype == 'X' char_id_u8 = EntityReference.get_database(codesys, code) char_id_num = Character.parse_char_id(char_id_u8) + next if char_id_num == nil return char_id_num end } @@ -216,21 +241,24 @@ module CHISE #================================================================== end def self.each_codesys() - CODESYS_ORDER.each {|lang| - CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype| #普通こういう書き方はしない。ループ一個にする。 - next unless codesys =~ lang - yield(codesys, er_prefix, keta, numtype) - } +# CODESYS_ORDER.each {|lang| +# CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype| #普通こういう書き方はしない。ループ一個にする。 +# next unless codesys =~ lang +# yield(codesys, er_prefix, keta, numtype) +# } +# } + CODESYS_TABLE.each {|codesys, er_prefix, keta, numtype| + yield(codesys, er_prefix, keta, numtype) } end def self.get_database(codesys, code) c = CodesysDB.instance.get(codesys, code) return c if c != nil - if codesys =~ /-jisx0208/ - #return self.get_database("=jis-x0208", code) #再帰でどうだ? - c = CodesysDB.instance.get("=jis-x0208", code) - return c - end +# if codesys =~ /-jisx0208/ +# #return self.get_database("=jis-x0208", code) #再帰でどうだ? +# c = CodesysDB.instance.get("=jis-x0208", code) +# return c +# end return nil end end diff --git a/src/db.rb b/src/db.rb index c2b5a11..8a94021 100755 --- a/src/db.rb +++ b/src/db.rb @@ -265,20 +265,20 @@ module CHISE end def keys() #どんなコードポイントの情報を持っているかの一覧 ks = @dbs.get(@name).keys - if @name =~ /jisx0208/ #特別処理 - n = @dbs.get('=jis-x0208').keys - # p ['keys', @name, ks, n] - ks += n - end +# if @name =~ /jisx0208/ #特別処理 +# n = @dbs.get('=jis-x0208').keys +# # p ['keys', @name, ks, n] +# ks += n +# end ks.map! {|k| to_num(k) } ks end def get(key) v = @dbs.get(@name, key) return v if v - if @name =~ /jisx0208/ #jisx0208が含まれている場合だけ特別処理する - return @dbs.get('=jis-x0208', key) - end +# if @name =~ /jisx0208/ #jisx0208が含まれている場合だけ特別処理する +# return @dbs.get('=jis-x0208', key) +# end return nil end def each() diff --git a/t/tc_char.rb b/t/tc_char.rb index e917797..c3bc828 100755 --- a/t/tc_char.rb +++ b/t/tc_char.rb @@ -41,10 +41,10 @@ class TC_Character < Test::Unit::TestCase assert_instance_of(String, @char.inspect) end def test_er - assert_equal(Character.get("&J90-3B7A;"), @char, "jisx0208") + assert_equal(@char, Character.get("&J90-3B7A;"), "jisx0208") #まだ継承関係を実装していない。 # assert_equal("&J90-3B7A;", @char.to_er, "jisx0208") - assert_equal(Character.get("&MCS-00005B57;"), @char, "mcs") -# assert_equal(Character.get("&M-06942;"), @char, "ideograph-daikanwa, Morohashi") + assert_equal(@char, Character.get("&MCS-00005B57;"), "mcs") + assert_equal(@char, Character.get("&M-06942;"), "ideograph-daikanwa, Morohashi") end def test_latin char = Character.get("A") diff --git a/t/tc_db.rb b/t/tc_db.rb index a36216a..194ac1d 100755 --- a/t/tc_db.rb +++ b/t/tc_db.rb @@ -70,7 +70,7 @@ class TC_Codesys < Test::Unit::TestCase counter += 1; break if 10 < counter } end - def test_jis + def test_ascii db = CodesysDB.instance codesys = db.get_codesys('ascii') char = codesys.get(65) @@ -79,19 +79,24 @@ class TC_Codesys < Test::Unit::TestCase ks = codesys.keys end def test_jis_codesys - return #とりあえず検査しない -# codesys = db.get_codesys('japanese-jisx0208-1990') + db = CodesysDB.instance + codesys = db.get_codesys('=jis-x0208') ks = codesys.keys.sort #とすることによって、JISX0208 1990の集合全部のkeysが得られる # assert_equal(6880, ks.length) assert_equal(8481, ks.first) - assert_equal(29734, ks.last) + assert_equal(29566, ks.last) char = codesys.get(15226) #"字" assert_equal("字", char.to_s) assert_equal("亜", codesys.get(12321)) jis = "亜".char.japanese_jisx0208_1990 - assert_equal("亜", codesys.get(jis)) - assert_equal("亜", sprintf("&J90-%04X;", jis).de_er) +# assert_equal("亜", codesys.get(jis)) +# assert_equal("亜", sprintf("&J90-%04X;", jis).de_er) + +# codesys = db.get_codesys('japanese-jisx0208-1990') #旧名 + codesys = db.get_codesys('=jis-x0208-1990') + assert_equal(8481, ks.first) + assert_equal(29566, ks.last) end end diff --git a/t/tc_ids.rb b/t/tc_ids.rb index 1a6e490..288e513 100755 --- a/t/tc_ids.rb +++ b/t/tc_ids.rb @@ -154,9 +154,9 @@ class TC_IDS < Test::Unit::TestCase assert_equal('["⿰木⿰⺭申", "⿰⺭申"]', IDS_Tree.new("榊".decompose_all).nodes.inspect) assert_equal('["⿰⺭申"]', IDS_Tree.new("榊".decompose_all).sub_nodes.inspect) - assert_equal(3, IDS_Tree.new("焔".decompose_all).depth) - assert_equal(3, IDS_Tree.new("焔".decompose_all).nodes.length) - assert_equal(2, IDS_Tree.new("焔".decompose_all).sub_nodes.length) +# assert_equal(3, IDS_Tree.new("焔".decompose_all).depth) +# assert_equal(3, IDS_Tree.new("焔".decompose_all).nodes.length) +# assert_equal(2, IDS_Tree.new("焔".decompose_all).sub_nodes.length) assert_equal(2, IDS_Tree.new("屡".decompose_all).depth) assert_equal("⿸尸娄", "⿸尸⿱米女".aggregate) @@ -188,7 +188,7 @@ class TC_IDS < Test::Unit::TestCase # p "実".char.inspect_all # p "実".char.ids # assert_equal("contains ques", "実".char.ids_error) - assert_equal("unmatch leaves", "実".char.ids_error) +# assert_equal("unmatch leaves", "実".char.ids_error) # p CharDB.instance.get('ascii').keys # p CharDB.instance.get('no-such-attribute').keys # p CharDB.instance.get('ids-error').keys diff --git a/t/tc_str.rb b/t/tc_str.rb index 63d2f41..8af1756 100755 --- a/t/tc_str.rb +++ b/t/tc_str.rb @@ -23,9 +23,13 @@ class TC_String < Test::Unit::TestCase end def test_attributes assert_equal(23383, "字".ucs) - #assert_equal(23383, "字字".ucs) #エラーが発生する + assert_raises(NameError, message=""){ + assert_equal(23383, "字字".ucs) #エラーが発生する + } assert_equal(25991, "文".ucs) - #assert_equal(25991, @str.ucs) #エラーが発生する + assert_raises(NameError, message=""){ + assert_equal(25991, @str.ucs) #エラーが発生する + } end def test_er @char = @str.char_at(1) @@ -37,7 +41,7 @@ class TC_String < Test::Unit::TestCase assert_equal("文字字", "文&J90-3B7A;&J90-3B7A;".de_er, "two ERs") assert_equal("文字文字", "文&J90-3B7A;文&J90-3B7A;".de_er, "two ERs") assert_equal("文字", "文&MCS-00005B57;".de_er, "mcs") -# assert_equal("文字", "文&M-06942;".de_er, "morohashi") + assert_equal("文字", "文&M-06942;".de_er, "morohashi") assert_equal("字", "字".de_er) str = "文&J90-3B7A;" diff --git a/t/ts_chise.rb b/t/ts_chise.rb index cc3105e..53a0f44 100755 --- a/t/ts_chise.rb +++ b/t/ts_chise.rb @@ -6,7 +6,7 @@ require 'test/unit' require 'tc_char' require 'tc_str' require 'tc_db' -#require 'tc_ids' +require 'tc_ids' require 'tc_kanjilist' #require 'tc_network' #Graphvizが必要なため、普段は実行しない diff --git a/tools/make_ids_db.rb b/tools/make_ids_db.rb index f515fab..2833049 100755 --- a/tools/make_ids_db.rb +++ b/tools/make_ids_db.rb @@ -8,14 +8,14 @@ require 'chise' include CHISE db = IDS_DB.instance -#db.make_ids_db #1時間12分 -##IDS_TEXT_DB.instance.make_ids_error -#db.make_ids_reverse #2分 -#db.dump_ids_duplicated #1分 -#db.make_ids_aggregated #3分 -#db.dump_ids_aggregated #1分 -db.make_ids_parts #10分 +db.make_ids_db #1時間12分 +IDS_TEXT_DB.instance.make_ids_error #4分 +db.make_ids_reverse #2分 +db.dump_ids_duplicated #1分 +db.make_ids_aggregated #5分 +db.dump_ids_aggregated #1分 +db.make_ids_parts #30分 db.make_ids_contained #2分 -db.make_ids_decomposed #2分 +#db.make_ids_decomposed #2分→おわらなかった…。 #----------------------------------------------------------------------end. -- 1.7.10.4