91e21721faabee0d4143bc2d8f6326ef06db4648
[chise/concord-kanbun.git] / concord-kanbun-dic.el
1 (concord-assign-genre 'entry@zh-classical "/usr/local/var/kanbun/db")
2 (concord-assign-genre 'word-class@zh-classical "/usr/local/var/kanbun/db")
3 (concord-assign-genre 'morpheme@zh-classical "/usr/local/var/kanbun/db")
4 (concord-assign-genre 'sentence@zh-classical "/usr/local/var/kanbun/db")
5 (mount-char-attribute-table '*instance@morpheme-entry/zh-classical)
6
7 (defun concord-kanbun-word-class-canonical-p (word-class)
8   (memq 'mkwcs (concord-object-get word-class 'sources)))
9
10 (defun concord-kanbun-encode-name-as-id (name)
11   (let (ucs ret)
12     (mapconcat (lambda (c)
13                  (cond
14                   ((eq c ?\ )
15                    "_"
16                    )
17                   ((or (and (<= ?A c)(<= c ?Z))
18                        (and (<= ?a c)(< c ?u))
19                        (and (< ?u c)(< c ?z))
20                        (and (<= ?0 c)(<= c ?9)))
21                    (char-to-string c)
22                    )
23                   ((setq ucs (or (encode-char c '=ucs@JP)
24                                  (char-ucs c)))
25                    (format "u%04X" ucs)
26                    )
27                   ((setq ret (encode-char c '=ruimoku-v6))
28                    (format "r%04X" ret)
29                    )
30                   ((setq ret (encode-char c '=jef-china3))
31                    (format "j%04X" ret)
32                    )
33                   (t
34                    (error "Unknown character %c" c)
35                    ;; (format "m%08X" (char-id c))
36                    )))
37                name
38                "")))
39
40 (defun concord-kanbun-add-sentence-entry (entry)
41   (let (entry-id me-cobj)
42     (unless (setq me-cobj (concord-decode-object '=name entry
43                                                  'entry@zh-classical))
44       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
45       (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
46       (concord-object-put me-cobj '=name entry))
47     me-cobj))
48
49 (defun concord-kanbun-add-morpheme-entry (entry)
50   (let ((len (length entry))
51         (i 0)
52         entry-id me-cobj chr ret ucs)
53     (unless (setq me-cobj (concord-decode-object
54                            '=name entry
55                            'entry@zh-classical))
56       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
57       (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
58       (concord-object-put me-cobj '=name entry)
59       (if (= len 1)
60           (concord-object-put me-cobj 'character (list (aref entry 0))))
61       (while (< i len)
62         (setq chr (aref entry i))
63         (if (setq ucs (or (encode-char chr '=ucs@JP)
64                           (char-ucs chr)))
65             (setq chr (decode-char '=ucs ucs)))
66         (setq ret (get-char-attribute
67                    chr '*instance@morpheme-entry/zh-classical))
68         (unless (member me-cobj ret)
69           (put-char-attribute chr '*instance@morpheme-entry/zh-classical
70                               (cons me-cobj ret)))
71         (setq i (1+ i)))
72       (save-char-attribute-table '*instance@morpheme-entry/zh-classical))
73     me-cobj))
74
75 ;; (save-char-attribute-table '*instance@morpheme-entry/zh-classical)
76
77 (defun concord-kanbun-add-word-class (&rest word-class)
78   (let (wc-name wc-cobj wc-name-id parent-wc-cobj
79                 r-wcl)
80     (dolist (wcf word-class)
81       (when (and wcf
82                  (not (string= wcf "*")))
83         (setq r-wcl (cons wcf r-wcl))
84         (setq wc-name
85               (if wc-name
86                   (format "%s,%s" wc-name wcf)
87                 wcf))))
88     (unless wc-name
89       (setq wc-name "*"))
90     (unless (setq wc-cobj (concord-decode-object
91                            '=name wc-name
92                            'word-class@zh-classical))
93       (setq wc-name-id (intern (concord-kanbun-encode-name-as-id wc-name)))
94       (setq wc-cobj (concord-make-object
95                      'word-class@zh-classical wc-name-id))
96       (concord-object-put wc-cobj '=name wc-name)
97       (when (and r-wcl
98                  (setq parent-wc-cobj
99                        (apply #'concord-kanbun-add-word-class
100                               (nreverse (cdr r-wcl)))))
101         (concord-object-put
102          wc-cobj '<-subcategory@misc (list parent-wc-cobj)))
103       )
104     wc-cobj))
105
106 (defun concord-kanbun-add-morpheme (entry word-superclass word-class
107                                           word-subclass1 word-subclass2
108                                           word-subclass3
109                                           canonical-form
110                                           ja-form ja-kana ja-conj-type
111                                           comment)
112   (let* (entry-cobj
113          canonical-entry-cobj
114          wc-cobj wc-name wc-canonical-flag
115          mm-name
116          mj-name
117          mjc-name mjc-id mjc-cobj)
118     (when entry
119       (setq wc-name
120             (if (or (null word-subclass3)
121                     (string-equal word-subclass3 "*"))
122                 (format "%s,%s,%s,%s"
123                         word-superclass word-class
124                         word-subclass1 word-subclass2)
125               (format "%s,%s,%s,%s,%s"
126                       word-superclass word-class
127                       word-subclass1 word-subclass2
128                       word-subclass3)))
129       (setq mm-name (format "%s (%s) [%s]"
130                             entry canonical-form wc-name))
131       (setq mj-name (format "%s (%s (%s),%s)"
132                             mm-name ja-form ja-kana ja-conj-type))
133       (setq mjc-name
134             (if comment
135                 (format "%s\t; %s" mj-name comment)
136               mj-name))
137       (unless (setq mjc-cobj (concord-decode-object
138                              '=name mjc-name 'morpheme@zh-classical))
139         (setq mjc-id (intern (concord-kanbun-encode-name-as-id mjc-name)))
140         (setq mjc-cobj (concord-make-object 'morpheme@zh-classical mjc-id))
141         (concord-object-put mjc-cobj '=name mjc-name)
142         (setq entry-cobj (concord-kanbun-add-morpheme-entry entry))
143         (setq wc-cobj (concord-kanbun-add-word-class
144                        word-superclass word-class
145                        word-subclass1 word-subclass2
146                        word-subclass3))
147         (concord-object-put mjc-cobj '->word-class (list wc-cobj))
148         (setq wc-canonical-flag
149               (concord-kanbun-word-class-canonical-p wc-cobj))
150         (concord-object-put mjc-cobj
151                             (if wc-canonical-flag
152                                 '->entry@morpheme
153                               '->entry@morpheme/misc)
154                             (list entry-cobj))
155         (unless (string= entry canonical-form)
156           (when (setq canonical-entry-cobj
157                       (concord-kanbun-add-morpheme-entry canonical-form))
158             (concord-object-put mjc-cobj
159                                 (if wc-canonical-flag
160                                     '->entry@morpheme/canonical
161                                   '->entry@morpheme/canonical/misc)
162                                 (list canonical-entry-cobj))))
163         (concord-object-put mjc-cobj 'ja-form ja-form)
164         (concord-object-put mjc-cobj 'ja-kana ja-kana)
165         (concord-object-put mjc-cobj 'ja-conjugation-type ja-conj-type))
166       mjc-cobj)))
167
168 (defun concord-kanbun-parse-corpus-line (string)
169   (let* ((ret (split-string string "\t*[;\e$B!(\e(B]\\s *"))
170          entry features comment)
171     (if (and (setq comment (nth 1 ret))
172              (string-match "[ \t]+$" comment))
173         (setq comment (substring comment 0 (match-beginning 0))))
174     (setq ret (split-string (car ret) "\t"))
175     (setq entry (car ret)
176           features (split-string (nth 1 ret) ","))
177     (list entry
178           (car features)(nth 1 features)
179           (nth 2 features)(nth 3 features)(nth 4 features)
180           (nth 6 features)
181           (nth 7 features)(nth 8 features)(nth 9 features)
182           comment)))
183
184 (defun concord-kanbun-add-corpus-line (string)
185   (apply #'concord-kanbun-add-morpheme
186          (concord-kanbun-parse-corpus-line string)))
187
188 (defun concord-kanbun-read-sentence (sentence-number &optional source-name)
189   (unless source-name
190     (setq source-name (file-name-nondirectory buffer-file-name)))
191   (let ((beg (point))
192         end send
193         ret
194         sentence dest
195         ;; sentence-name
196         sentence-id-name sentence-id sentence-cobj
197         sentence-entry-cobj)
198     (prog1
199         (save-excursion
200           (when (search-forward "\nEOS\n" nil t)
201             (setq end (match-beginning 0)
202                   send (match-end 0))
203             (goto-char beg)
204             (while (search-forward "\t" end t)
205               (setq ret (concord-kanbun-parse-corpus-line
206                          (buffer-substring (point-at-bol)(point-at-eol))))
207               (setq sentence (concat sentence (car ret)))
208               ;; (setq sentence-name
209               ;;       (concat sentence-name
210               ;;               (if sentence-name
211               ;;                   " ")
212               ;;               (format "%s[%s,%s,%s]"
213               ;;                       (car ret)
214               ;;                       (nth 2 ret)
215               ;;                       (nth 3 ret)(nth 4 ret))))
216               (setq dest
217                     (cons (apply #'concord-kanbun-add-morpheme ret)
218                           dest))
219               (goto-char (point-at-eol)))
220             ;; (setq sentence-id
221             ;;       (intern
222             ;;        (concord-kanbun-encode-name-as-id
223             ;;         (format "%s/%d" source-name sentence-number))))
224             (setq sentence-id-name
225                   (format "%s/%d" source-name sentence-number))
226             (setq sentence-id (intern sentence-id-name))
227             (unless (setq sentence-cobj
228                           (concord-decode-object
229                            '=id sentence-id 'sentence@zh-classical))
230               (setq sentence-cobj
231                     (concord-make-object
232                      'sentence@zh-classical sentence-id))
233               ;; (concord-object-put
234               ;;  sentence-cobj '=name (format "%s(%s)"
235               ;;                               sentence-name sentence-id-name))
236               (concord-object-put
237                sentence-cobj '=name (format "%s(%s)"
238                                             sentence sentence-id-name))
239               )
240             (concord-object-put
241              sentence-cobj 'source/file-name source-name)
242             (concord-object-put
243              sentence-cobj 'source/sentence-number sentence-number)
244             (concord-object-put
245              sentence-cobj '->morphemes (nreverse dest))
246             (when (setq sentence-entry-cobj
247                         (concord-kanbun-add-sentence-entry sentence))
248               (concord-object-put
249                sentence-cobj '->entry@sentence (list sentence-entry-cobj)))
250             sentence-cobj))
251       (if send
252           (goto-char send)))))
253
254 (defun concord-kanbun-read-buffer (&optional source-name)
255   (interactive)
256   (save-excursion
257     (goto-char (point-min))
258     (unless source-name
259       (setq source-name (file-name-nondirectory buffer-file-name)))
260     (let ((i 1))
261       (while (concord-kanbun-read-sentence i source-name)
262         (message (format "%s: sentence #%d is stored." source-name i))
263         (setq i (1+ i))))))
264
265 (defun concord-kanbun-batch-read-file ()
266   (set-terminal-coding-system 'utf-8-jp-er)
267   (let ((file (pop command-line-args-left))
268         (coding-system-for-read 'utf-8-jp-er)
269         (file-name-coding-system 'utf-8-jp-er))
270     (with-temp-buffer
271       (insert-file-contents file)
272       (concord-kanbun-read-buffer (file-name-nondirectory file)))))