(concord-kanbun-add-morpheme): Add new argument comment.
[chise/concord-kanbun.git] / concord-kanbun-dic.el
1 (concord-assign-genre 'entry@zh-classical "/usr/local/var/kanbun/db")
2 (concord-assign-genre 'word-class@zh-classical "/usr/local/var/kanbun/db")
3 (concord-assign-genre 'morpheme@zh-classical "/usr/local/var/kanbun/db")
4 (concord-assign-genre 'sentence@zh-classical "/usr/local/var/kanbun/db")
5 (mount-char-attribute-table '*instance@morpheme-entry/zh-classical)
6
7 (defun concord-kanbun-encode-name-as-id (name)
8   (let (ucs ret)
9     (mapconcat (lambda (c)
10                  (cond
11                   ((eq c ?\ )
12                    "_"
13                    )
14                   ((or (and (<= ?A c)(<= c ?Z))
15                        (and (<= ?a c)(< c ?u))
16                        (and (< ?u c)(< c ?z))
17                        (and (<= ?0 c)(<= c ?9)))
18                    (char-to-string c)
19                    )
20                   ((setq ucs (or (encode-char c '=ucs@JP)
21                                  (char-ucs c)))
22                    (format "u%04X" ucs)
23                    )
24                   ((setq ret (encode-char c '=ruimoku-v6))
25                    (format "r%04X" ret)
26                    )
27                   ((setq ret (encode-char c '=jef-china3))
28                    (format "j%04X" ret)
29                    )
30                   (t
31                    (error "Unknown character %c" c)
32                    ;; (format "m%08X" (char-id c))
33                    )))
34                name
35                "")))
36
37 (defun concord-kanbun-add-sentence-entry (entry)
38   (let (entry-id me-cobj)
39     (unless (setq me-cobj (concord-decode-object '=name entry
40                                                  'entry@zh-classical))
41       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
42       (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
43       (concord-object-put me-cobj '=name entry))
44     me-cobj))
45
46 (defun concord-kanbun-add-morpheme-entry (entry)
47   (let ((len (length entry))
48         (i 0)
49         entry-id me-cobj chr ret ucs)
50     (unless (setq me-cobj (concord-decode-object
51                            '=name entry
52                            'entry@zh-classical))
53       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
54       (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
55       (concord-object-put me-cobj '=name entry)
56       (if (= len 1)
57           (concord-object-put me-cobj 'character (list (aref entry 0))))
58       (while (< i len)
59         (setq chr (aref entry i))
60         (if (setq ucs (or (encode-char chr '=ucs@JP)
61                           (char-ucs chr)))
62             (setq chr (decode-char '=ucs ucs)))
63         (setq ret (get-char-attribute
64                    chr '*instance@morpheme-entry/zh-classical))
65         (unless (member me-cobj ret)
66           (put-char-attribute chr '*instance@morpheme-entry/zh-classical
67                               (cons me-cobj ret)))
68         (setq i (1+ i)))
69       (save-char-attribute-table '*instance@morpheme-entry/zh-classical))
70     me-cobj))
71
72 ;; (save-char-attribute-table '*instance@morpheme-entry/zh-classical)
73
74 (defun concord-kanbun-add-word-class (word-superclass
75                                       word-class
76                                       word-subclass1 word-subclass2
77                                       word-subclass3)
78   (let (wc-name wc-cobj wc-name-id)
79     (when (and word-superclass
80                (not (string= word-superclass "*")))
81       (setq wc-name word-superclass)
82       (when (and word-class
83                  (not (string= word-class "*")))
84         (setq wc-name
85               (format "%s,%s" wc-name word-class))
86         (when (and word-subclass1
87                    (not (string= word-subclass1 "*")))
88           (setq wc-name
89                 (format "%s,%s" wc-name word-subclass1))
90           (when (and word-subclass2
91                      (not (string= word-subclass2 "*")))
92             (setq wc-name
93                   (format "%s,%s" wc-name word-subclass2))
94             (when (and word-subclass3
95                        (not (string= word-subclass3 "*")))
96               (setq wc-name
97                     (format "%s,%s" wc-name word-subclass3)))))))
98     (unless (setq wc-cobj (concord-decode-object
99                            '=name wc-name
100                            'word-class@zh-classical))
101       (setq wc-name-id (intern (concord-kanbun-encode-name-as-id wc-name)))
102       (setq wc-cobj (concord-make-object
103                      'word-class@zh-classical wc-name-id))
104       (concord-object-put wc-cobj '=name wc-name))
105     wc-cobj))
106
107
108 (defun concord-kanbun-add-morpheme (entry word-superclass word-class
109                                           word-subclass1 word-subclass2
110                                           word-subclass3
111                                           canonical-form
112                                           ja-form ja-kana ja-conj-type
113                                           comment)
114   (let* (entry-cobj
115          canonical-entry-cobj
116          wc-cobj wc-name
117          mm-name
118          mj-name
119          mjc-name mjc-id mjc-cobj)
120     (when entry
121       (setq wc-name
122             (if (or (null word-subclass3)
123                     (string-equal word-subclass3 "*"))
124                 (format "%s,%s,%s,%s"
125                         word-superclass word-class
126                         word-subclass1 word-subclass2)
127               (format "%s,%s,%s,%s,%s"
128                       word-superclass word-class
129                       word-subclass1 word-subclass2
130                       word-subclass3)))
131       (setq mm-name (format "%s (%s) [%s]"
132                             entry canonical-form wc-name))
133       (setq mj-name (format "%s (%s (%s),%s)"
134                             mm-name ja-form ja-kana ja-conj-type))
135       (setq mjc-name
136             (if comment
137                 (format "%s\t; %s" mj-name comment)
138               mj-name))
139       (unless (setq mjc-cobj (concord-decode-object
140                              '=name mjc-name 'morpheme@zh-classical))
141         (setq mjc-id (intern (concord-kanbun-encode-name-as-id mjc-name)))
142         (setq mjc-cobj (concord-make-object 'morpheme@zh-classical mjc-id))
143         (concord-object-put mjc-cobj '=name mjc-name)
144         (when (setq entry-cobj (concord-kanbun-add-morpheme-entry entry))
145           (concord-object-put mjc-cobj '->entry@morpheme (list entry-cobj)))
146         (unless (string= entry canonical-form)
147           (when (setq canonical-entry-cobj
148                       (concord-kanbun-add-morpheme-entry canonical-form))
149             (concord-object-put mjc-cobj '->entry@morpheme/canonical
150                                 (list canonical-entry-cobj))))
151         (when (setq wc-cobj (concord-kanbun-add-word-class
152                              word-superclass word-class
153                              word-subclass1 word-subclass2
154                              word-subclass3))
155           (concord-object-put mjc-cobj '->word-class (list wc-cobj)))
156         (concord-object-put mjc-cobj 'ja-form ja-form)
157         (concord-object-put mjc-cobj 'ja-kana ja-kana)
158         (concord-object-put mjc-cobj 'ja-conjugation-type ja-conj-type))
159       mjc-cobj)))
160
161 (defun concord-kanbun-parse-corpus-line (string)
162   (let* ((ret (split-string string "\t*[;\e$B!(\e(B]\\s *"))
163          entry features comment)
164     (if (and (setq comment (nth 1 ret))
165              (string-match "[ \t]+$" comment))
166         (setq comment (substring comment 0 (match-beginning 0))))
167     (setq ret (split-string (car ret) "\t"))
168     (setq entry (car ret)
169           features (split-string (nth 1 ret) ","))
170     (list entry
171           (car features)(nth 1 features)
172           (nth 2 features)(nth 3 features)(nth 4 features)
173           (nth 6 features)
174           (nth 7 features)(nth 8 features)(nth 9 features)
175           comment)))
176
177 (defun concord-kanbun-add-corpus-line (string)
178   (apply #'concord-kanbun-add-morpheme
179          (concord-kanbun-parse-corpus-line string)))
180
181 (defun concord-kanbun-read-sentence (sentence-number &optional source-name)
182   (unless source-name
183     (setq source-name (file-name-nondirectory buffer-file-name)))
184   (let ((beg (point))
185         end send
186         ret
187         sentence dest
188         sentence-name
189         sentence-id-name sentence-id sentence-cobj
190         sentence-entry-cobj)
191     (prog1
192         (save-excursion
193           (when (search-forward "\nEOS\n" nil t)
194             (setq end (match-beginning 0)
195                   send (match-end 0))
196             (goto-char beg)
197             (while (search-forward "\t" end t)
198               (setq ret (concord-kanbun-parse-corpus-line
199                          (buffer-substring (point-at-bol)(point-at-eol))))
200               (setq sentence (concat sentence (car ret)))
201               (setq sentence-name
202                     (concat sentence-name
203                             (if sentence-name
204                                 " ")
205                             (format "%s[%s,%s,%s]"
206                                     (car ret)
207                                     (nth 2 ret)
208                                     (nth 3 ret)(nth 4 ret))))
209               (setq dest
210                     (cons (apply #'concord-kanbun-add-morpheme ret)
211                           dest))
212               (goto-char (point-at-eol)))
213             ;; (setq sentence-id
214             ;;       (intern
215             ;;        (concord-kanbun-encode-name-as-id
216             ;;         (format "%s/%d" source-name sentence-number))))
217             (setq sentence-id-name
218                   (format "%s/%d" source-name sentence-number))
219             (setq sentence-id (intern sentence-id-name))
220             (unless (setq sentence-cobj
221                           (concord-decode-object
222                            '=id sentence-id 'sentence@zh-classical))
223               (setq sentence-cobj
224                     (concord-make-object
225                      'sentence@zh-classical sentence-id))
226               (concord-object-put
227                sentence-cobj '=name (format "%s(%s)"
228                                             sentence-name sentence-id-name))
229               ;; (concord-object-put
230               ;;  sentence-cobj '=name (format "%s(%s)"
231               ;;                               sentence sentence-id-name))
232               )
233             (concord-object-put
234              sentence-cobj 'source/file-name source-name)
235             (concord-object-put
236              sentence-cobj 'source/sentence-number sentence-number)
237             (concord-object-put
238              sentence-cobj '->morphemes (nreverse dest))
239             (when (setq sentence-entry-cobj
240                         (concord-kanbun-add-sentence-entry sentence))
241               (concord-object-put
242                sentence-cobj '->entry@sentence (list sentence-entry-cobj)))
243             sentence-cobj))
244       (if send
245           (goto-char send)))))
246
247 (defun concord-kanbun-read-buffer (&optional source-name)
248   (interactive)
249   (save-excursion
250     (goto-char (point-min))
251     (unless source-name
252       (setq source-name (file-name-nondirectory buffer-file-name)))
253     (let ((i 1))
254       (while (concord-kanbun-read-sentence i source-name)
255         (message (format "%s: sentence #%d is stored." source-name i))
256         (setq i (1+ i))))))
257
258 (defun concord-kanbun-batch-read-file ()
259   (set-terminal-coding-system 'utf-8-jp-er)
260   (let ((file (pop command-line-args-left))
261         (coding-system-for-read 'utf-8-jp-er)
262         (file-name-coding-system 'utf-8-jp-er))
263     (with-temp-buffer
264       (insert-file-contents file)
265       (concord-kanbun-read-buffer (file-name-nondirectory file)))))