require "bdb"
require "chise/config"
require "chise/rbchise"
+require "chise/management"
module CHISE
return @pre + t + @post
end
- def get_dirname(t) File.dirname(get_filename(t)) end
+ def get_dirname(t)
+ File.dirname(get_filename(t))
+ end
def open_dbs()
@dbs = Hash.new
t.sub!(%r|#{@post}$|, "") if @post != ""
keys << t
}
- return keys
+ keys
end
def close_db(t)
@common = db.get_codesys("=jis-x0208")
@newest = db.get_codesys("japanese-jisx0208-1990")
end
+
def get_char(code)
char = @common.get(code)
return char unless char.nil?
end
end
- class DBS_Management # DataBase file management
- OBSOLETE_ATTRIBUTES = "
-cns-radical
-cns-radical?
-kangxi-radical
-daikanwa-radical
-unicode-radical
-
-cns-strokes
-kangxi-strokes
-daikanwa-strokes
-shinjigen-1-radical
-gb-original-radical
-japanese-strokes
-jis-strokes-a
-jis-strokes-b
-jisx0208-strokes
-jis-x0213-strokes
-jisx0213-strokes
-unicode-strokes
-
-totalstrokes
-cns-total-strokes
-jis-total-strokes-b
-
-non-morohashi
-
-=>ucs*
-#=>mojikyo
-#=mojikyo
-->identical
-
-ancient-ideograph-of
-ancient-char-of-shinjigen-1
-original-ideograph-of
-original-char-of-shinjigen-1
-simplified-ideograph-of
-vulgar-ideograph-of
-vulgar-char-of-shinjigen-1
-ideograph=
-ideographic-variants
-variant-of-shinjigen-1
-
-iso-10646-comment
-".split
-
- def initialize
- dir = Config.instance.db_dir
- @odir = dir+"/system-char-id/obsolete" #直打ちしている。
- end
-
- def move_obsolete_files # move obsolete BDB files to obsolete directory
- db = CharDB.instance
- db.close_all
- Dir.mkdir(@odir) unless FileTest.directory? @odir
- OBSOLETE_ATTRIBUTES.each {|attr|
- next if attr =~ /^#/
- filename = db.get_filename(attr)
- move_to_obsolete(filename)
- move_to_obsolete(filename+".txt")
- }
- end
-
- def move_to_obsolete(file)
- cmd = "mv \"#{file}\" #{@odir}"
- # p cmd
- system cmd
- end
-
- end
end
def u32to_i
return 0 if length == 0
s = self
-# return (s[3] << 24 | s[2] << 16 | s[1] << 8 | s[0])
return (s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3])
end
end
+
+class Uconv
+ def self.u8tou4(s) s.u8tou32; end
+ def self.u4tou8(s) s.u32tou8; end
+ def self.u4tou16(s) s.u32tou16; end
+ def self.u16toeuc(s) s.u16toeuc; end
+ def self.u16tosjis(s) s.u16tosjis; end
+end
--- /dev/null
+# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
+
+module CHISE
+ class DBS_Management # DataBase file management
+ OBSOLETE_ATTRIBUTES = "
+cns-radical
+cns-radical?
+kangxi-radical
+daikanwa-radical
+unicode-radical
+
+cns-strokes
+kangxi-strokes
+daikanwa-strokes
+shinjigen-1-radical
+gb-original-radical
+japanese-strokes
+jis-strokes-a
+jis-strokes-b
+jisx0208-strokes
+jis-x0213-strokes
+jisx0213-strokes
+unicode-strokes
+
+totalstrokes
+cns-total-strokes
+jis-total-strokes-b
+
+non-morohashi
+
+=>ucs*
+#=>mojikyo
+#=mojikyo
+->identical
+
+ancient-ideograph-of
+ancient-char-of-shinjigen-1
+original-ideograph-of
+original-char-of-shinjigen-1
+simplified-ideograph-of
+vulgar-ideograph-of
+vulgar-char-of-shinjigen-1
+ideograph=
+ideographic-variants
+variant-of-shinjigen-1
+
+iso-10646-comment
+".split
+
+ def initialize
+ dir = Config.instance.db_dir
+ @odir = dir+"/system-char-id/obsolete" #\92¼\91Å\82¿\82µ\82Ä\82¢\82é\81B
+ end
+
+ def move_obsolete_files # move obsolete BDB files
+ db = CharDB.instance
+ db.close_all
+ Dir.mkdir(@odir) unless FileTest.directory? @odir
+ OBSOLETE_ATTRIBUTES.each {|attr|
+ next if attr =~ /^#/
+ filename = db.get_filename(attr)
+ move_to_obsolete(filename)
+ move_to_obsolete(filename+".txt")
+ }
+ end
+
+ def move_to_obsolete(file)
+ cmd = "mv \"#{file}\" #{@odir}"
+ #p cmd
+ system cmd
+ end
+ end
+end
def parse(c) # parse a value and return a number (MCS)
raise "c is nil" if c.nil?
- if c.kind_of?(Numeric)
- c = 0x80000000 + c if c < 0 # negative value
- return c.to_i
- end
-
- raise "unknown object" unless c.kind_of?(String)
+ if c.kind_of?(String)
+ if /\A\?/ =~ c
+ c = c.sub(/\A\?/, "") # remove "?" in the head
+ u4 = c.u8tou32 # translate from UTF-8 to UTF-32
+ return u4.u32to_i # translate UTF-32 to UCS number
+ end
- return c.to_i if /^\d+$/ =~ c # only numbers?
+ return parse_er(c) if is_er?(c) # ER?
- return parse_er(c) if is_er?(c) # ER?
+ return c.to_i if /^\d+$/ =~ c # only numbers?
- c = c.sub(/\A\?/, "") if /\A\?/ =~ c # remove a "?" in the head
+ raise "unknown format"
+ end
- u4 = c.u8tou32 # translate from UTF-8 to UTF-32
- u4.u32to_i # translate UTF-32 to UCS number
+ if c.kind_of?(Numeric)
+ c = 0x80000000 + c if c < 0 # negative value
+ return c.to_i
+ end
+
+ raise "unknown object"
end
PART = "&([-+0-9A-Za-z#]+);"
def contain_er?(s) (PART_RE =~ s) != nil; end
def is_er?(s) (ALL_RE =~ s) != nil; end
- # the order is important. The primary sharset should be selectable.
+ # the order is important. The primary charset should be selectable.
CODESYS_TABLE = [
%w( =jis-x0208-1990 J90- 4 X),
%w( =jis-x0208-1983 J83- 4 X),
]
PRIVATE_USE_AREA = 0xe000
- def parse_er(s) # parse a ER and return a number (FIXNUM)
- unless ALL_RE =~ s # I do not use is_er? to get $1.
- raise "wrong ER."
- end
+ def parse_er(s) # parse a Entity Reference and return a number (MCS)
+ raise "wrong ER." unless ALL_RE =~ s # I don't use is_er? for getting $1.
s = $1 # extract the part of ER
- if s =~ /\AMCS-([0-9A-Fa-f]+)\Z/ # MCS. It's a mystery.
- return $1.hex
- end
+ return $1.hex if s =~ /\AMCS-([0-9A-Fa-f]+)\Z/ # MCS. It's a mystery.
- if s =~ /\AU[-+]?([0-9A-Fa-f]+)\Z/ ||
+ return $1.hex if s =~ /\AU[-+]?([0-9A-Fa-f]+)\Z/ ||
s =~ /\A#x([0-9A-Fa-f]+)\Z/ # Unicode code point in Hex.
- return $1.hex
- end
- if s =~ /\A#([0-9]+)\Z/ # Unicode code point in Decimal.
- return $1.to_i
- end
+ return $1.to_i if s =~ /\A#([0-9]+)\Z/ # Unicode code point in Decimal.
- if s =~ /\Amy-([0-9]+)\Z/ # my own code point. It's a secret.
+ if s =~ /\Amy-([0-9]+)\Z/ # my own code point. It's a secret.
return PRIVATE_USE_AREA + $1.to_i # private use area of Unicode.
end
- if s =~ /\AI-/ # I- stands for Isolated character. It's a wonder.
+ if s =~ /\AI-/ # I- stands for Isolated character. It's a wonder.
s = s.sub(/\AI-/, "")
end
u8 = CodesysDB.instance.get(codesys, code)
next if u8.nil?
- num = parse(u8)
+ num = parse("?"+u8)
next if num.nil?
return num
def initialize(ccs, db)
@ccs, @db = ccs, db
end
+
def get_char(code_point)
@db.get(code_point)
end
+
def put_char(code_point, cid)
@db.put(code_point, cid)
end
def initialize(feature, db)
@feature, @db = feature, db
end
+
def get_value(char_id)
@db.get(char_id)
end
+
def each
end
end
# StrokeFont library by eto 2003-0311
require "sgl"
-$LOAD_PATH << "../../lib" if $0 == __FILE__
require "chise/kage"
require "chise/csf"
end
end
- class Stroke #====================================================== 一本の線
+ class Stroke # a connected line
def initialize
@points = []
@length = nil
end
end
- class Strokes #===================================================== 複数の線
+ class Strokes # lines
def initialize
@strokes = []
@px1, @py1, @px2, @py2 = 0, 0, 0, 0
-# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-
-require "chise/iconv"
-
-class Uconv
- def self.u8tou4(s) s.u8tou32; end
- def self.u4tou8(s) s.u32tou8; end
- def self.u4tou16(s) s.u32tou16; end
- def self.u16toeuc(s) s.u16toeuc; end
- def self.u16tosjis(s) s.u16tosjis; end
-end
$LOAD_PATH.unshift("..")
require "chise/char"
-str = "字" #Stringを拡張している。UTF8で与えること。
-p str.ucs #とすると、その文字のucsの値が表示される
-p str.total_strokes #画数が表示される
-p str.gb2312 #などなど
-str.char.alist.each {|a, v| #こんな感じで全属性を表示できる
+str = "字" # Stringを拡張している。UTF8で与えること。
+p str.ucs # とすると、その文字のucsの値が表示される
+p str.total_strokes # 画数が表示される
+p str.gb2312 # などなど
+str.char.alist.each {|a, v| # こんな感じで全属性を表示できる
print a, ': ', v, "\n"
}
-p str.inspect_x #Characterについての情報が表示される。
-p str.inspect_all #持っている属性情報を全て表示する。
+p str.inspect_x # Characterについての情報が表示される。
+p str.inspect_all # 持っている属性情報を全て表示する。
-str = "文字列" #もちろん一文字でなく文字列も扱える。UTF-8で与える。
-p str.inspect_x #各文字の情報が表示される。
-p str.inspect_all #各文字の属性情報を全て表示する。
+str = "文字列" # もちろん一文字でなく文字列も扱える。UTF-8で与える。
+p str.inspect_x # 各文字の情報が表示される。
+p str.inspect_all # 各文字の属性情報を全て表示する。
#!/usr/bin/env ruby
-$KCODE = 'u'
+$KCODE = "u"
$LOAD_PATH.unshift("..")
require "chise/char"
p "字" # "字"
#!/usr/bin/env ruby
-$KCODE = 'u'
-$LOAD_PATH << '../lib'
-require 'chise'
-include CHISE
+$KCODE = "u"
+$LOAD_PATH.unshift("..")
+require "chise/char"
p "衝".inspect_all
p "行".inspect_all
#!/usr/bin/env ruby
-$KCODE = 'u'
-$LOAD_PATH << '../lib'
-require 'chise'
-include CHISE
+$KCODE = "u"
+$LOAD_PATH.unshift("..")
+require "chise/char"
p "木".inspect_all
exit
#!/usr/bin/env ruby
-$KCODE = 'u'
-$LOAD_PATH << '../lib'
-require 'chise'
-include CHISE
+$KCODE = "u"
+$LOAD_PATH.unshift("..")
+require "chise/char"
puts "鬼".find.split(//u).sort.join
exit
#!/usr/bin/env ruby
-$KCODE = 'u'
+$KCODE = "u"
$LOAD_PATH.unshift("..")
require "chise/char"
p "字".decompose # "+宀子"
#!/usr/bin/env ruby
-$KCODE = 'u'
+$KCODE = "u"
$LOAD_PATH.unshift("..")
require "chise/char"
p "⿰木木".compose
#!/usr/bin/env ruby
-$KCODE = 'u'
+$KCODE = "u"
$LOAD_PATH.unshift("..")
require "chise/char"
+
p "日雲".find
"日雲".find.each_character{|c|
puts c.ids
#!/usr/bin/env ruby
-$KCODE = 'u'
+$KCODE = "u"
$LOAD_PATH.unshift("..")
require "chise/char"
#!/usr/bin/env ruby
-$KCODE = 'u'
+$KCODE = "u"
$LOAD_PATH.unshift("..")
require "chise/char"
#!/usr/bin/env ruby
-$KCODE = 'u'
-$LOAD_PATH << '../lib'
-require 'chise'
-require 'chise/kanjilist'
-include CHISE
+$KCODE = "u"
+$LOAD_PATH.unshift("..")
+require "chise/char"
+require "chise/kanjilist"
[IDC_LR, IDC_AB, IDC_LMR, IDC_AMB, IDC_FS, IDC_FA, IDC_FB, IDC_FL, IDC_FUL, IDC_FUR, IDC_FLL, IDC_O].each {|idc|
p idc
#!/usr/bin/env ruby
-$KCODE = 'u'
-$LOAD_PATH << '../lib'
-require 'chise'
-require 'chise/kanjilist'
-include CHISE
+$KCODE = "u"
+$LOAD_PATH.unshift("..")
+require "chise/char"
+require "chise/kanjilist"
def atom_list(list)
list.map_character {|char|
#!/usr/bin/env ruby
# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-
-$LOAD_PATH.unshift("..")
-require "chise"
-include CHISE
$KCODE = "u"
+$LOAD_PATH.unshift("..")
+require "chise/char"
str = "字" #Stringを拡張している。UTF8で与えること。
p str.ucs #とすると、その文字のucsの値が表示される
--- /dev/null
+ruby.exe.stackdump
#!/usr/bin/env ruby
# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-# This file is in SJIS charset. Japanese Character -> \8a¿\8e\9a.
require "common"
#!/usr/bin/env ruby
# Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
-# This file is in SJIS charset. Japanese Character -> \8a¿\8e\9a.
require "common"
class TestBDB < Test::Unit::TestCase
def test_bdb
@config = CHISE::Config.instance
- dir = @config.db_dir
- assert_match(%r|/chise/char-db\Z|, dir)
-
- file = dir+"/=jis-x0208/system-char-id"
+ @dir = @config.db_dir
+ assert_match(%r|/chise/char-db\Z|, @dir)
+ file = @dir+"/=jis-x0208/system-char-id"
db = BDB::Hash.open(file, nil, 0)
end
@db = CHISE::CodesysDB.instance
end
end
-
-class TestCodesys < Test::Unit::TestCase
- def nusetup
- @db = CHISE::CodesysDB.instance
- end
-
- def test_dummy
- end
-
- def nu_test_db_length
- assert_equal(6287, @db.get("=jis-x0208").keys.length, "keys")
- assert_equal(590, @db.get("japanese-jisx0208").keys.length, "keys")
- assert_equal(499, @db.get("japanese-jisx0208-1978").keys.length, "keys")
- assert_equal(593, @db.get("japanese-jisx0208-1990").keys.length, "keys")
- assert_equal(6067, @db.get("japanese-jisx0212").keys.length, "keys")
- assert_equal(1697, @db.get("japanese-jisx0213-1").keys.length, "keys")
- assert_equal(2345, @db.get("japanese-jisx0213-2").keys.length, "keys")
- assert_equal(4270, @db.get("ucs-jis").keys.length, "keys")
- end
-
- def nutest_db
- keys = @db.keys
- assert_instance_of(Array, @db.keys, "db.keys")
- db = @db.get("ascii")
- assert_equal(128, db.keys.length, "can get keys")
- assert_equal(63, @db.get("katakana-jisx0201").keys.length, "keys")
- assert_equal(94, @db.get("latin-jisx0201").keys.length, "keys")
-
- counter = 0
- @db.each("=jis-x0208"){|k, v| #\88ø\90\94\82ÌCodesys\83f\81[\83^\83x\81[\83X\82Ì\82»\82ê\82¼\82ê\82É\91Î\82µ\82Ä\8eÀ\8ds\82·\82é
- er0 = sprintf("&J90-%04X;", k)
- er1 = CHISE::Character.new(v).to_er
- counter += 1; break if 10 < counter
- }
- end
-
- def nutest_ascii
- db = CHISE::CodesysDB.instance
- codesys = db.get_codesys("ascii")
- char = codesys.get(65)
- assert_equal("A", char.to_s)
- assert_equal(128, codesys.keys.length)
- ks = codesys.keys
- end
-
- def nutest_jis_codesys
- db = CHISE::CodesysDB.instance
- codesys = db.get_codesys("=jis-x0208")
- ks = codesys.keys.sort #\82Æ\82·\82é\82±\82Æ\82É\82æ\82Á\82Ä\81AJISX0208 1990\82Ì\8fW\8d\87\91S\95\94\82Ìkeys\82ª\93¾\82ç\82ê\82é
-# assert_equal(6880, ks.length)
- assert_equal(8481, ks.first)
- assert_equal(29566, ks.last)
- char = codesys.get(15226) #"\8e\9a"
- assert_equal("\8e\9a".su, char.to_s)
-
- assert_equal("\88\9f".su, codesys.get(12321))
- jis = "\88\9f".su.char.japanese_jisx0208_1990
-# assert_equal("\88\9f", codesys.get(jis))
-# assert_equal("\88\9f", sprintf("&J90-%04X;", jis).de_er)
-
-# codesys = db.get_codesys("japanese-jisx0208-1990") #\8b\8c\96¼
- codesys = db.get_codesys("=jis-x0208-1990")
- assert_equal(8481, ks.first)
- assert_equal(29566, ks.last)
- end
-end
require "common"
class TestIconv < Test::Unit::TestCase
+ def test_original_iconv
+ u8 = "\8e\9a".sjistou8
+ s = Iconv.iconv_to_from("UTF-16", "UTF-8", u8)
+ assert_equal("\376\377\x5b\x57", s) # \376\377 -> Byte Order Mark?
+ s = Iconv.iconv_to_from("UTF-32", "UTF-8", u8)
+ assert_equal("\0\0\376\377\0\0[W", s)
+ end
+
def test_iconv
u8 = "\8e\9a".sjistou8 # U+5B57 (0x8E9A): CJK Unified Ideograph
assert_equal("\345\255\227", u8)
assert_equal("\264\301\273\372", u16.u16toeuc)
assert_equal("\212\277\216\232", u16.u16tosjis)
end
-
- def test_original_iconv
- u8 = "\8e\9a".sjistou8
- s = Iconv.iconv_to_from("UTF-16", "UTF-8", u8)
- assert_equal("\376\377\x5b\x57", s) # \376\377 -> Byte Order Corder?
- s = Iconv.iconv_to_from("UTF-32", "UTF-8", u8)
- assert_equal("\0\0\376\377\0\0[W", s)
- end
end
@pa = CHISE::CharacterParser.new
# test_parse
- assert_equal(0, @pa.parse(nil))
+ assert_raise(RuntimeError){ @pa.parse(nil) }
assert_equal(65, @pa.parse(0x41))
assert_raise(RuntimeError){ @pa.parse(Object.new) }
assert_equal(65, @pa.parse("65"))
assert_equal(20175, @pa.parse("?\344\273\217"))
- assert_equal(110, @pa.parse("nosuchcharacter")) # hatena?
+ assert_raise(RuntimeError){ @pa.parse("nosuchcharacter") }
# test_parse_er
assert_equal(true, @pa.contain_er?("A"))
assert_equal(65, @pa.parse("A"))
assert_equal(65, @pa.parse("A"))
assert_equal(0xe001, @pa.parse("&my-1;"))
- assert_equal(23383, @pa.parse("&J90-3B7A;"))
- assert_equal(23383, @pa.parse("&I-J90-3B7A;"))
-# assert_raise(RuntimeError){ @pa.parse_er("&nosucher;") }
+# assert_equal(23383, @pa.parse("&J90-3B7A;"))
+# assert_equal(23383, @pa.parse("&I-J90-3B7A;"))
+# assert_raise(RuntimeError){ @pa.parse_er("&nosucher;") }
end
end
class TestRubyChise < Test::Unit::TestCase
def test_rbchise
- ds = CHISE::DataSource.new
- assert_instance_of(CHISE::DataSource, ds)
- dt = ds.open_decoding_table("=daikanwa")
- assert_instance_of(CHISE::DecodingTable, dt)
- char_id = dt.get_char(364) # get a character by Daikanwa number 364.
+ @ds = CHISE::DataSource.new
+ assert_instance_of(CHISE::DataSource, @ds)
+ @dt = @ds.open_decoding_table("=daikanwa")
+ assert_instance_of(CHISE::DecodingTable, @dt)
+ char_id = @dt.get_char(364) # get a character by Daikanwa number 364.
assert_instance_of(String, char_id)
assert_equal("?\344\273\217", char_id)
- ft = ds.open_feature_table("ideographic-structure")
- assert_instance_of(CHISE::FeatureTable, ft)
- value = ft.get_value(char_id)
+ @ft = @ds.open_feature_table("ideographic-structure")
+ assert_instance_of(CHISE::FeatureTable, @ft)
+ value = @ft.get_value(char_id)
assert_instance_of(String, value)
assert_equal("(?\342\277\260 ?\344\272\273 ?\345\216\266)", value)
end