update.
[chise/ruby.git] / chise / idsdb.rb
1 # Copyright (C) 2002-2004 Kouichirou Eto, All rights reserved.
2
3 require "chise/char"
4 require "chise/ids"
5 require "chise/qp"
6 require "chise/management"
7
8 module CHISE
9   class IDS_DB_Management
10     def initialize
11       @cd = ChiseDB.instance
12       @idsdb = IDS_DB.instance
13     end
14
15     def check_conflict_of_ids_text
16       @idsdb.each_ccs {|ccs|
17         qp ccs
18         c = Hash.new(0)
19         h = {}
20         @idsdb.get_ccs(ccs).each_character {|char, ids|
21           c["char"] += 1
22           next if ids == char.to_s
23           next if ids.char_length == 1
24           char_id = char.char_id
25           cids = h[char_id]
26           if cids.nil? # There is no ids yet.
27             h[char_id] = ids # just set it.
28             c["good"] += 1
29           else # but, if there is already a ids?
30             if cids == ids # the two are same.
31               c["same"] += 1 # and just ignore
32             else # but, if the two are not same?
33               c["conflict"] += 1
34               puts "conflict\t#{char.to_s}\t#{ids}\t#{cids}"
35             end
36           end
37         }
38         puts "#{ccs}\t#{c['char']}\t#{c['same']}\t#{c['conflict']}\t#{c['good']}"
39       }
40     end
41
42     def store_ids_as_text
43       @idsdb.each_ccs {|ccs|
44         #qp ccs
45         @idsdb.get_ccs(ccs).each_character {|char, ids|
46           next if ids == char.to_s
47           next if ids.char_length == 1
48           char.ids_text = ids # just set it.
49         }
50       }
51       @cd.get_feature("ids-text").dump
52     end
53
54     def store_ids_de_er
55       @cd.get_feature("ids-text").each {|cid, idser|
56         char = Character.get(cid)
57         begin
58           ids = idser.de_er # parse Entity Reference
59         rescue => e
60           qp cid, idser
61           next
62         end
63         char.ids_de_er = ids # set it.
64       }
65       @cd.get_feature("ids-de-er").dump
66     end
67
68     def check_integrity_of_ids_tree
69       @cd.get_feature("ids-de-er").each {|cid, ids|
70         char = Character.get(cid)
71         idstree = IDS_Tree.new(ids)
72         begin
73           raise "contains self" if ids.include?(char.to_s)
74           idstree.check_integrity
75         rescue => e
76           #puts "#{cid}\t#{e.message}\t#{ids}"
77           char.ids_error = e.message
78           next
79         end
80         char.ids = ids # set it.
81       }
82       @cd.get_feature("ids").dump
83       @cd.get_feature("ids-error").dump
84     end
85
86     def make_by_ids_db
87       ct = @cd.get_by_ids_db("ids")
88       @cd.get_feature("ids").each {|cid, ids|
89         char = Character.get(cid)
90         ct.set_decoded_char(ids, cid)
91       }
92       ct.dump
93     end
94
95     def store_ids_aggregated
96       @cd.get_feature("ids").each {|cid, ids|
97         char = Character.get(cid)
98         #ids = char.decompose
99         #ids = char.ids
100         ag = ids.to_ids.aggregate
101         #puts "#{char.to_s}\t#{ids}\t#{ag}"
102         char.ids_aggregated = ag
103       }
104       @cd.get_feature("ids-aggregated").dump
105     end
106
107     def store_ids_subparts
108       @cd.get_feature("ids").each {|cid, v|
109         char = Character.get(cid)
110         pids = char.to_s # previous_ids
111         ar = []
112         i = 0
113         loop {
114           ids = pids.decompose
115           break if ids == pids #これ以上分割できないようだったら終了〜。
116           ar += ids.to_a
117           i += 1
118           qp [char.to_s, pids, ids, ar] if 10 < i #これは何かおかしいぞと
119           pids = ids
120         }
121         str = ar.sort.uniq.join("") # can contain IDC.
122         char.ids_subparts = str
123       }
124       @cd.get_feature("ids-subparts").dump
125     end
126
127     def store_ids_contained
128       h = Hash.new
129       @cd.get_feature("ids-subparts").each {|cid, v|
130         char = Character.get(cid)
131         parts = char.ids_subparts
132         parts.each_char {|ch|
133           h[ch] = [] if h[ch].nil?
134           h[ch] << cid
135         }
136       }
137       h.each {|ch, v|
138         #char = Character.get(cid)
139         char = ch.char
140         v = v.sort
141         char.ids_contained = v.join
142       }
143       @cd.get_feature("ids-contained").dump
144     end
145
146   end
147
148   class IDS_DB
149     include Singleton
150
151     def initialize
152       @config = Config.instance
153       @path = @config.ids_dir.path
154       @dbs = {}
155     end
156     attr_reader :path
157
158     def get_ccs(ccs)
159       @dbs[ccs] = IDS_CCS_DB.new(self, ccs) if @dbs[ccs].nil?
160       @dbs[ccs]
161     end
162
163     def each_ccs
164       @path.each_entry {|f|
165         next unless /\AIDS-(.+)\.txt\Z/ =~ f
166         yield($1)
167       }
168     end
169   end
170
171   class IDS_CCS_DB
172     def initialize(idsdb, ccs)
173       @idsdb, @ccs = idsdb, ccs
174       @path = @idsdb.path+("IDS-"+ccs+".txt")
175     end
176
177     def each_line
178       @path.open {|f|
179         f.each {|line|
180           next if /\A;/ =~ line # skip comment
181           line.chomp!
182           code, picture, ids = line.split
183           raise if code.nil?
184           ids = "" if ids.nil?
185           yield(code, ids)
186         }
187       }
188     end
189
190     def each_character
191       each_line {|code, ids|
192         next if ids.nil?
193         next if ids == "" # If there is no IDS, ignore it.
194
195         er = "&"+code+";"
196         begin
197           char = Character.get(er)
198         rescue
199           #qp er
200           next
201         end
202         next if char.nil?
203         yield(char, ids)
204       }
205     end
206
207   end
208 end