9e97fe552bb7264da2a86fef0a7ebf19d2cca2d1
[chise/ruby.git] / chise / idsdb.rb
1 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
2
3 require "chise/char"
4 require "chise/ids"
5 require "chise/qp"
6 require "chise/management"
7
8 module CHISE
9   class IDS_DB_Management
10     def initialize
11       @cd = ChiseDB.instance
12       @idsdb = IDS_DB.instance
13     end
14
15     def check_conflict_of_ids_text
16       @idsdb.each_ccs {|ccs|
17         #qp ccs
18         c = Hash.new(0)
19         h = {}
20         @idsdb.get_ccs(ccs).each_character {|char, ids|
21           c["char"] += 1
22           next if ids == char.to_s
23           next if ids.char_length == 1
24           char_id = char.char_id
25           cids = h[char_id]
26           if cids.nil? # There is no ids yet.
27             h[char_id] = ids # just set it.
28             c["good"] += 1
29           else # but, if there is already a ids?
30             if cids == ids # the two are same.
31               c["same"] += 1 # and just ignore
32             else # but, if the two are not same?
33               c["conflict"] += 1
34               puts "conflict\t#{char.to_s}\t#{ids}\t#{cids}"
35             end
36           end
37         }
38         puts "#{ccs}\t#{c['char']}\t#{c['same']}\t#{c['conflict']}\t#{c['good']}"
39       }
40     end
41
42     def store_ids_as_text
43       @idsdb.each_ccs {|ccs|
44         qp ccs
45         i = 0
46         @idsdb.get_ccs(ccs).each_character {|char, ids|
47           next if ids == char.to_s
48           next if ids.char_length == 1
49           char.ids_text = ids # just set it.
50           i += 1
51           break if 10000 < i
52         }
53       }
54       @cd.get_feature("ids-text").dump
55     end
56
57     def store_ids_de_er
58       @cd.get_feature("ids-text").each_char {|cid, idser|
59         char = Character.get(cid)
60         begin
61           ids = idser.de_er # parse Entity Reference
62         rescue => e
63           qp cid, idser
64           next
65         end
66         char.ids_de_er = ids # set it.
67       }
68       @cd.get_feature("ids-de-er").dump
69     end
70
71     def check_integrity_of_ids_tree
72       @cd.get_feature("ids-de-er").each_char {|cid, ids|
73         char = Character.get(cid)
74         idstree = IDS_Tree.new(ids)
75         begin
76           raise "contains self" if ids.include?(char.to_s)
77           idstree.check_integrity
78         rescue => e
79           #puts "#{cid}\t#{e.message}\t#{ids}"
80           char.ids_error = e.message
81           next
82         end
83         char.ids_org = ids # set it.
84       }
85       @cd.get_feature("ids-org").dump
86       @cd.get_feature("ids-error").dump
87     end
88
89     def make_by_ids_db
90       byidsdb = @cd.get_by_ids_db("ids-org")
91       @cd.get_feature("ids-org").each_char {|cid, ids|
92         char = Character.get(cid)
93         byidsdb.set_decoded_char(ids, cid)
94       }
95       byidsdb.dump
96     end
97
98     def store_ids_aggregated
99       @cd.get_feature("ids-org").each_char {|cid, ids|
100         char = Character.get(cid)
101         #ids = char.decompose
102         #ids = char.ids
103         ag = ids.to_ids.aggregate("ids-org")
104         #puts "#{char.to_s}\t#{ids}\t#{ag}"
105         char.ids = ag # ids-aggregated
106       }
107       @cd.get_feature("ids").dump
108     end
109
110     def store_ids_subparts
111       @cd.get_feature("ids").each_char {|cid, v|
112         char = Character.get(cid)
113         pids = char.to_s # previous_ids
114         ar = []
115         i = 0 # only for infinite loop check
116         loop {
117           ids = pids.decompose
118           break if ids == pids #これ以上分割できないようだったら終了〜。
119           ar += ids.to_a
120           i += 1
121           qp [char.to_s, pids, ids, ar] if 10 < i #これは何かおかしいぞと
122           pids = ids
123         }
124         str = ar.sort.uniq.join("") # can contain IDC.
125         char.ids_subparts = str
126       }
127       @cd.get_feature("ids-subparts").dump
128     end
129
130     def store_ids_contained
131       h = Hash.new
132       @cd.get_feature("ids-subparts").each_char {|cid, v|
133         char = Character.get(cid)
134         parts = char.ids_subparts
135         parts.each_char {|ch|
136           h[ch] = [] if h[ch].nil?
137           h[ch] << cid
138         }
139       }
140       h.each {|ch, v|
141         #char = Character.get(cid)
142         char = ch.char
143         v = v.sort
144         char.ids_contained = v.join
145       }
146       @cd.get_feature("ids-contained").dump
147     end
148   end
149
150   class IDS_DB
151     include Singleton
152
153     def initialize
154       @config = Config.instance
155       @path = @config.ids_dir.path
156       @dbs = {}
157     end
158     attr_reader :path
159
160     def get_ccs(ccs)
161       @dbs[ccs] = IDS_CCS_DB.new(self, ccs) if @dbs[ccs].nil?
162       @dbs[ccs]
163     end
164
165     def each_ccs
166       @path.each_entry {|f|
167         next unless /\AIDS-(.+)\.txt\Z/ =~ f
168         yield($1)
169       }
170     end
171   end
172
173   class IDS_CCS_DB
174     def initialize(idsdb, ccs)
175       @idsdb, @ccs = idsdb, ccs
176       @path = @idsdb.path+("IDS-"+ccs+".txt")
177     end
178
179     def each_line
180       @path.open {|f|
181         f.each {|line|
182           next if /\A;/ =~ line # skip comment
183           line.chomp!
184           code, picture, ids = line.split
185           raise if code.nil?
186           ids = "" if ids.nil?
187           yield(code, ids)
188         }
189       }
190     end
191
192     def each_character
193       each_line {|code, ids|
194         next if ids.nil?
195         next if ids == "" # If there is no IDS, ignore it.
196
197         er = "&"+code+";"
198         begin
199           char = Character.get(er)
200         rescue
201           #qp er
202           next
203         end
204         next if char.nil?
205         yield(char, ids)
206       }
207     end
208
209   end
210 end