update.
[chise/concord-kanbun.git] / concord-kanbun-dic.el
1 (concord-assign-genre 'entry@zh-classical "/usr/local/var/kanbun/db")
2 (concord-assign-genre 'word-class@zh-classical "/usr/local/var/kanbun/db")
3 (concord-assign-genre 'morpheme@zh-classical "/usr/local/var/kanbun/db")
4 (concord-assign-genre 'sentence@zh-classical "/usr/local/var/kanbun/db")
5 (mount-char-attribute-table '*instance@morpheme-entry/zh-classical)
6
7 (defvar concord-kanbun-allocated-morpheme-id-max 0)
8
9 (defun concord-kanbun-word-class-canonical-p (word-class)
10   (memq 'mkwcs (concord-object-get word-class 'sources)))
11
12 (defun concord-kanbun-encode-name-as-id (name)
13   (let (ucs ret)
14     (mapconcat (lambda (c)
15                  (cond
16                   ((eq c ?\ )
17                    "_"
18                    )
19                   ((or (and (<= ?A c)(<= c ?Z))
20                        (and (<= ?a c)(< c ?u))
21                        (and (< ?u c)(< c ?z))
22                        (and (<= ?0 c)(<= c ?9)))
23                    (char-to-string c)
24                    )
25                   ((setq ucs (or (encode-char c '=ucs@JP)
26                                  (char-ucs c)))
27                    (format "u%04X" ucs)
28                    )
29                   ((setq ret (encode-char c '=ruimoku-v6))
30                    (format "r%04X" ret)
31                    )
32                   ((setq ret (encode-char c '=jef-china3))
33                    (format "j%04X" ret)
34                    )
35                   (t
36                    (error "Unknown character %c" c)
37                    ;; (format "m%08X" (char-id c))
38                    )))
39                name
40                "")))
41
42 (defun concord-kanbun-add-sentence-entry (entry)
43   (let (entry-id me-cobj)
44     (unless (setq me-cobj (concord-decode-object '=name entry
45                                                  'entry@zh-classical))
46       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
47       (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
48       (concord-object-put me-cobj '=name entry))
49     me-cobj))
50
51 (defun concord-kanbun-add-morpheme-entry (entry)
52   (let ((len (length entry))
53         (i 0)
54         entry-id me-cobj chr ret ucs)
55     (unless (setq me-cobj (concord-decode-object
56                            '=name entry
57                            'entry@zh-classical))
58       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
59       (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
60       (concord-object-put me-cobj '=name entry)
61       (if (= len 1)
62           (concord-object-put me-cobj 'character (list (aref entry 0))))
63       (while (< i len)
64         (setq chr (aref entry i))
65         (if (setq ucs (or (encode-char chr '=ucs@JP)
66                           (char-ucs chr)))
67             (setq chr (decode-char '=ucs ucs)))
68         (setq ret (get-char-attribute
69                    chr '*instance@morpheme-entry/zh-classical))
70         (unless (member me-cobj ret)
71           (put-char-attribute chr '*instance@morpheme-entry/zh-classical
72                               (cons me-cobj ret)))
73         (setq i (1+ i)))
74       (save-char-attribute-table '*instance@morpheme-entry/zh-classical))
75     me-cobj))
76
77 ;; (save-char-attribute-table '*instance@morpheme-entry/zh-classical)
78
79 (defun concord-kanbun-add-word-class (&rest word-class)
80   (let (wc-name wc-cobj wc-name-id parent-wc-cobj
81                 r-wcl)
82     (dolist (wcf word-class)
83       (when (and wcf
84                  (not (string= wcf "*")))
85         (setq r-wcl (cons wcf r-wcl))
86         (setq wc-name
87               (if wc-name
88                   (format "%s,%s" wc-name wcf)
89                 wcf))))
90     (unless wc-name
91       (setq wc-name "*"))
92     (unless (setq wc-cobj (concord-decode-object
93                            '=name wc-name
94                            'word-class@zh-classical))
95       (setq wc-name-id (intern (concord-kanbun-encode-name-as-id wc-name)))
96       (setq wc-cobj (concord-make-object
97                      'word-class@zh-classical wc-name-id))
98       (concord-object-put wc-cobj '=name wc-name)
99       (when (and r-wcl
100                  (setq parent-wc-cobj
101                        (apply #'concord-kanbun-add-word-class
102                               (nreverse (cdr r-wcl)))))
103         (concord-object-put
104          wc-cobj '<-subcategory@misc (list parent-wc-cobj)))
105       )
106     wc-cobj))
107
108 (defun concord-kanbun-add-morpheme (entry word-superclass word-class
109                                           word-subclass1 word-subclass2
110                                           word-subclass3
111                                           canonical-form
112                                           ja-form ja-kana ja-conj-type
113                                           comment)
114   (let* (entry-cobj
115          canonical-entry-cobj
116          wc-cobj wc-name wc-canonical-flag
117          mm-name
118          mj-name
119          mjc-name mjc-id mjc-cobj)
120     (when entry
121       (setq wc-name
122             (if (or (null word-subclass3)
123                     (string-equal word-subclass3 "*"))
124                 (format "%s,%s,%s,%s"
125                         word-superclass word-class
126                         word-subclass1 word-subclass2)
127               (format "%s,%s,%s,%s,%s"
128                       word-superclass word-class
129                       word-subclass1 word-subclass2
130                       word-subclass3)))
131       (setq mm-name (format "%s (%s) [%s]"
132                             entry canonical-form wc-name))
133       (setq mj-name (format "%s (%s (%s),%s)"
134                             mm-name ja-form ja-kana ja-conj-type))
135       (setq mjc-name
136             (if comment
137                 (format "%s\t; %s" mj-name comment)
138               mj-name))
139       (unless (setq mjc-cobj (concord-decode-object
140                              '=name mjc-name 'morpheme@zh-classical))
141         (setq mjc-id (intern (concord-kanbun-encode-name-as-id mjc-name)))
142         (setq concord-kanbun-allocated-morpheme-id-max
143               (1+ concord-kanbun-allocated-morpheme-id-max))
144         ;; (setq mjc-id concord-kanbun-allocated-morpheme-id-max)
145         (setq mjc-cobj (concord-make-object 'morpheme@zh-classical mjc-id))
146         (concord-object-put mjc-cobj '=name mjc-name)
147         (concord-object-put mjc-cobj
148                             '=number concord-kanbun-allocated-morpheme-id-max)
149         (setq entry-cobj (concord-kanbun-add-morpheme-entry entry))
150         (setq wc-cobj (concord-kanbun-add-word-class
151                        word-superclass word-class
152                        word-subclass1 word-subclass2
153                        word-subclass3))
154         (concord-object-put mjc-cobj '->word-class (list wc-cobj))
155         (setq wc-canonical-flag
156               (concord-kanbun-word-class-canonical-p wc-cobj))
157         (concord-object-put mjc-cobj
158                             (if wc-canonical-flag
159                                 '->entry@morpheme
160                               '->entry@morpheme/misc)
161                             (list entry-cobj))
162         (unless (string= entry canonical-form)
163           (when (setq canonical-entry-cobj
164                       (concord-kanbun-add-morpheme-entry canonical-form))
165             (concord-object-put mjc-cobj
166                                 (if wc-canonical-flag
167                                     '->entry@morpheme/canonical
168                                   '->entry@morpheme/canonical/misc)
169                                 (list canonical-entry-cobj))))
170         (concord-object-put mjc-cobj 'ja-form ja-form)
171         (concord-object-put mjc-cobj 'ja-kana ja-kana)
172         (concord-object-put mjc-cobj 'ja-conjugation-type ja-conj-type))
173       mjc-cobj)))
174
175 (defun concord-kanbun-parse-corpus-line (string)
176   (let* ((ret (split-string string "\t*[;\e$B!(\e(B]\\s *"))
177          entry features comment)
178     (if (and (setq comment (nth 1 ret))
179              (string-match "[ \t]+$" comment))
180         (setq comment (substring comment 0 (match-beginning 0))))
181     (setq ret (split-string (car ret) "\t"))
182     (setq entry (car ret)
183           features (split-string (nth 1 ret) ","))
184     (list entry
185           (car features)(nth 1 features)
186           (nth 2 features)(nth 3 features)(nth 4 features)
187           (nth 6 features)
188           (nth 7 features)(nth 8 features)(nth 9 features)
189           comment)))
190
191 (defun concord-kanbun-add-corpus-line (string)
192   (apply #'concord-kanbun-add-morpheme
193          (concord-kanbun-parse-corpus-line string)))
194
195 (defun concord-kanbun-read-sentence (sentence-number &optional source-name)
196   (unless source-name
197     (setq source-name (file-name-nondirectory buffer-file-name)))
198   (let ((beg (point))
199         end send
200         ret
201         sentence dest
202         ;; sentence-name
203         sentence-id-name sentence-id sentence-cobj
204         sentence-entry-cobj)
205     (prog1
206         (save-excursion
207           (when (search-forward "\nEOS\n" nil t)
208             (setq end (match-beginning 0)
209                   send (match-end 0))
210             (goto-char beg)
211             (while (search-forward "\t" end t)
212               (setq ret (concord-kanbun-parse-corpus-line
213                          (buffer-substring (point-at-bol)(point-at-eol))))
214               (setq sentence (concat sentence (car ret)))
215               ;; (setq sentence-name
216               ;;       (concat sentence-name
217               ;;               (if sentence-name
218               ;;                   " ")
219               ;;               (format "%s[%s,%s,%s]"
220               ;;                       (car ret)
221               ;;                       (nth 2 ret)
222               ;;                       (nth 3 ret)(nth 4 ret))))
223               (setq dest
224                     (cons (apply #'concord-kanbun-add-morpheme ret)
225                           dest))
226               (goto-char (point-at-eol)))
227             ;; (setq sentence-id
228             ;;       (intern
229             ;;        (concord-kanbun-encode-name-as-id
230             ;;         (format "%s/%d" source-name sentence-number))))
231             (setq sentence-id-name
232                   (format "%s/%d" source-name sentence-number))
233             (setq sentence-id (intern sentence-id-name))
234             (unless (setq sentence-cobj
235                           (concord-decode-object
236                            '=id sentence-id 'sentence@zh-classical))
237               (setq sentence-cobj
238                     (concord-make-object
239                      'sentence@zh-classical sentence-id))
240               ;; (concord-object-put
241               ;;  sentence-cobj '=name (format "%s(%s)"
242               ;;                               sentence-name sentence-id-name))
243               (concord-object-put
244                sentence-cobj '=name (format "%s(%s)"
245                                             sentence sentence-id-name))
246               )
247             (concord-object-put
248              sentence-cobj 'source/file-name source-name)
249             (concord-object-put
250              sentence-cobj 'source/sentence-number sentence-number)
251             (concord-object-put
252              sentence-cobj '->morphemes (nreverse dest))
253             (when (setq sentence-entry-cobj
254                         (concord-kanbun-add-sentence-entry sentence))
255               (concord-object-put
256                sentence-cobj '->entry@sentence (list sentence-entry-cobj)))
257             sentence-cobj))
258       (if send
259           (goto-char send)))))
260
261 (defun concord-kanbun-read-buffer (&optional source-name)
262   (interactive)
263   (save-excursion
264     (goto-char (point-min))
265     (unless source-name
266       (setq source-name (file-name-nondirectory buffer-file-name)))
267     (let ((i 1))
268       (while (concord-kanbun-read-sentence i source-name)
269         (message (format "%s: sentence #%d is stored." source-name i))
270         (setq i (1+ i))))))
271
272 (defun concord-kanbun-batch-read-file ()
273   (set-terminal-coding-system 'utf-8-jp-er)
274   (let ((file (pop command-line-args-left))
275         (coding-system-for-read 'utf-8-jp-er)
276         (file-name-coding-system 'utf-8-jp-er))
277     (with-temp-buffer
278       (insert-file-contents file)
279       (concord-kanbun-read-buffer (file-name-nondirectory file)))))