update.
[chise/ruby.git] / chise / idsdb.rb
1 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
2
3 require "chise/char"
4 require "chise/ids"
5 require "chise/qp"
6 require "chise/management"
7
8 module CHISE
9   class IDS_DB_Management
10     def initialize
11       @cd = ChiseDB.instance
12       @idsdb = IDS_DB.instance
13     end
14
15     def check_conflict_of_ids_text
16       @idsdb.each_ccs {|ccs|
17         #qp ccs
18         c = Hash.new(0)
19         h = {}
20         @idsdb.get_ccs(ccs).each_character {|char, ids|
21           c["char"] += 1
22           next if ids == char.to_s
23           next if ids.char_length == 1
24           char_id = char.char_id
25           cids = h[char_id]
26           if cids.nil? # There is no ids yet.
27             h[char_id] = ids # just set it.
28             c["good"] += 1
29           else # but, if there is already a ids?
30             if cids == ids # the two are same.
31               c["same"] += 1 # and just ignore
32             else # but, if the two are not same?
33               c["conflict"] += 1
34               puts "conflict\t#{char.to_s}\t#{ids}\t#{cids}"
35             end
36           end
37         }
38         puts "#{ccs}\t#{c['char']}\t#{c['same']}\t#{c['conflict']}\t#{c['good']}"
39       }
40     end
41
42     def store_ids_as_text
43       max = 20000
44       h = {}
45       @idsdb.each_ccs {|ccs|
46         qp ccs
47         @idsdb.get_ccs(ccs).each_character {|char, ids|
48           next if ids == char.to_s
49           next if ids.char_length == 1
50           char.ids_text = ids # just set it.
51           h[char.char_id] = ids
52 #         break if max <= h.length
53         }
54 #       break if max <= h.length
55       }
56       qp "%08X" % h.keys.max
57       qp "sync", @cd.get_feature("ids-text").sync
58       @cd.get_feature("ids-text").dump
59       qp h.length
60       qp @cd.get_feature("ids-text").to_hash.length
61     end
62
63     def store_ids_de_er
64       h = {}
65       @cd.get_feature("ids-text").each_char {|cid, ids_text|
66         char = Character.get(cid)
67         begin
68           ids = ids_text.de_er # parse Entity Reference
69         rescue => e
70           qp cid, ids_text
71           next
72         end
73         next if ids == char.to_s
74         next if ids.char_length == 1
75         char.ids_de_er = ids # set it.
76         h[char.char_id] = ids
77       }
78       qp "%08X" % h.keys.max
79       @cd.get_feature("ids-de-er").dump
80       qp h.length
81       qp @cd.get_feature("ids-de-er").to_hash.length
82     end
83
84     def check_integrity_of_ids_tree
85       h = {}
86       @cd.get_feature("ids-de-er").each_char {|cid, ids|
87         char = Character.get(cid)
88         idstree = IDS_Tree.new(ids)
89         begin
90           raise "contains self" if ids.include?(char.to_s)
91           idstree.check_integrity
92         rescue => e
93           #puts "#{cid}\t#{e.message}\t#{ids}"
94           char.ids_error = e.message
95           next
96         end
97         char.ids_org = ids # set it.
98         h[char.char_id] = ids
99       }
100       @cd.get_feature("ids-org").dump
101       qp h.length
102       qp @cd.get_feature("ids-org").to_hash.length
103       @cd.get_feature("ids-error").dump
104     end
105
106     def make_by_ids_db_org
107       h = {}
108       byids = @cd.get_by_ids_db("ids-org")
109       @cd.get_feature("ids-org").each_char {|cid, ids|
110         char = Character.get(cid)
111         byids.set_decoded_char(ids, cid)
112         h[ids] = cid
113       }
114       qp h.length
115       byids.dump
116       qp byids.to_hash.length
117     end
118
119     def store_ids_aggregated
120       h = {}
121       @cd.get_feature("ids-org").each_char {|cid, ids|
122         char = Character.get(cid)
123         #ids = char.decompose
124         #ids = char.ids
125         ag = ids.to_ids.aggregate("ids-org")
126         #puts "#{char.to_s}\t#{ids}\t#{ag}"
127         char.ids = ag # ids-aggregated
128         h[char.char_id] = ids
129       }
130       @cd.get_feature("ids").dump
131       qp h.length
132       qp @cd.get_feature("ids").to_hash.length
133     end
134
135     def store_ids_subparts
136       h = {}
137       @cd.get_feature("ids").each_char {|cid, v|
138         char = Character.get(cid)
139         pids = char.to_s # previous_ids
140         ar = []
141         i = 0 # only for infinite loop check
142         loop {
143           ids = pids.decompose
144           break if ids == pids # break if there is no possibilities.
145           ar += ids.to_a
146           i += 1
147           qp [char.to_s, pids, ids, ar] if 10 < i # something wrong.
148           pids = ids
149         }
150         str = ar.sort.uniq.join("") # can contain IDC.
151         char.ids_subparts = str
152         h[char.char_id] = str
153       }
154       @cd.get_feature("ids-subparts").dump
155       qp h.length
156       qp @cd.get_feature("ids-subparts").to_hash.length
157     end
158
159     def store_ids_contained
160       h = Hash.new
161       @cd.get_feature("ids-subparts").each_char {|cid, v|
162         char = Character.get(cid)
163         parts = char.ids_subparts
164         parts.each_char {|ch|
165           h[ch] = [] if h[ch].nil?
166           h[ch] << cid
167         }
168       }
169       h.each {|char, ar|
170         str = ar.sort.map {|cid| Character.get(cid).to_s }.join
171         char.ids_contained = str
172       }
173       @cd.get_feature("ids-contained").dump
174     end
175
176     def make_by_ids_db
177       byids = @cd.get_by_ids_db("ids")
178       @cd.get_feature("ids").each_char {|cid, ids|
179         char = Character.get(cid)
180         byids.set_decoded_char(ids, cid)
181       }
182       byids.dump
183     end
184   end
185
186   class IDS_DB
187     include Singleton
188
189     def initialize
190       @config = Config.instance
191       @path = @config.ids_dir.path
192       @dbs = {}
193     end
194     attr_reader :path
195
196     def get_ccs(ccs)
197       @dbs[ccs] = IDS_CCS_DB.new(self, ccs) if @dbs[ccs].nil?
198       @dbs[ccs]
199     end
200
201     def each_ccs
202       @path.each_entry {|f|
203         next unless /\AIDS-(.+)\.txt\Z/ =~ f
204         yield($1)
205       }
206     end
207   end
208
209   class IDS_CCS_DB
210     def initialize(idsdb, ccs)
211       @idsdb, @ccs = idsdb, ccs
212       @path = @idsdb.path+("IDS-"+ccs+".txt")
213     end
214
215     def each_line
216       @path.open {|f|
217         f.each {|line|
218           next if /\A;/ =~ line # skip comment
219           line.chomp!
220           code, picture, ids = line.split
221           raise if code.nil?
222           ids = "" if ids.nil?
223           yield(code, ids)
224         }
225       }
226     end
227
228     def each_character
229       each_line {|code, ids|
230         next if ids.nil?
231         next if ids == "" # If there is no IDS, ignore it.
232
233         er = "&"+code+";"
234         begin
235           char = Character.get(er)
236         rescue
237           #qp er
238           next
239         end
240         next if char.nil?
241         yield(char, ids)
242       }
243     end
244
245   end
246 end