(concord-kanbun-add-word-class): Add new argument `word-subclass3'.
[chise/concord-kanbun.git] / concord-kanbun-dic.el
1 (concord-assign-genre 'entry@zh-classical "/usr/local/var/kanbun/db")
2 (concord-assign-genre 'word-class@zh-classical "/usr/local/var/kanbun/db")
3 (concord-assign-genre 'morpheme@zh-classical "/usr/local/var/kanbun/db")
4 (concord-assign-genre 'sentence@zh-classical "/usr/local/var/kanbun/db")
5 (mount-char-attribute-table '*instance@morpheme-entry/zh-classical)
6
7 (defun concord-kanbun-encode-name-as-id (name)
8   (let (ucs ret)
9     (mapconcat (lambda (c)
10                  (cond
11                   ((eq c ?\ )
12                    "_"
13                    )
14                   ((or (and (<= ?A c)(<= c ?Z))
15                        (and (<= ?a c)(< c ?u))
16                        (and (< ?u c)(< c ?z))
17                        (and (<= ?0 c)(<= c ?9)))
18                    (char-to-string c)
19                    )
20                   ((setq ucs (or (encode-char c '=ucs@JP)
21                                  (char-ucs c)))
22                    (format "u%04X" ucs)
23                    )
24                   ((setq ret (encode-char c '=ruimoku-v6))
25                    (format "r%04X" ret)
26                    )
27                   ((setq ret (encode-char c '=jef-china3))
28                    (format "j%04X" ret)
29                    )
30                   (t
31                    (error "Unknown character %c" c)
32                    ;; (format "m%08X" (char-id c))
33                    )))
34                name
35                "")))
36
37 (defun concord-kanbun-add-sentence-entry (entry)
38   (let (entry-id me-cobj)
39     (unless (setq me-cobj (concord-decode-object '=name entry
40                                                  'entry@zh-classical))
41       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
42       (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
43       (concord-object-put me-cobj '=name entry))
44     me-cobj))
45
46 (defun concord-kanbun-add-morpheme-entry (entry)
47   (let ((len (length entry))
48         (i 0)
49         entry-id me-cobj chr ret ucs)
50     (unless (setq me-cobj (concord-decode-object
51                            '=name entry
52                            'entry@zh-classical))
53       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
54       (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
55       (concord-object-put me-cobj '=name entry)
56       (if (= len 1)
57           (concord-object-put me-cobj 'character (list (aref entry 0))))
58       (while (< i len)
59         (setq chr (aref entry i))
60         (if (setq ucs (or (encode-char chr '=ucs@JP)
61                           (char-ucs chr)))
62             (setq chr (decode-char '=ucs ucs)))
63         (setq ret (get-char-attribute
64                    chr '*instance@morpheme-entry/zh-classical))
65         (unless (member me-cobj ret)
66           (put-char-attribute chr '*instance@morpheme-entry/zh-classical
67                               (cons me-cobj ret)))
68         (setq i (1+ i)))
69       (save-char-attribute-table '*instance@morpheme-entry/zh-classical))
70     me-cobj))
71
72 ;; (save-char-attribute-table '*instance@morpheme-entry/zh-classical)
73
74 (defun concord-kanbun-add-word-class (word-superclass
75                                       word-class
76                                       word-subclass1 word-subclass2
77                                       word-subclass3)
78   (let (wc-name wc-cobj wc-name-id)
79     (when (and word-superclass
80                (not (string= word-superclass "*")))
81       (setq wc-name word-superclass)
82       (when (and word-class
83                  (not (string= word-class "*")))
84         (setq wc-name
85               (format "%s,%s" wc-name word-class))
86         (when (and word-subclass1
87                    (not (string= word-subclass1 "*")))
88           (setq wc-name
89                 (format "%s,%s" wc-name word-subclass1))
90           (when (and word-subclass2
91                      (not (string= word-subclass2 "*")))
92             (setq wc-name
93                   (format "%s,%s" wc-name word-subclass2))
94             (when (and word-subclass3
95                        (not (string= word-subclass3 "*")))
96               (setq wc-name
97                     (format "%s,%s" wc-name word-subclass3)))))))
98     (unless (setq wc-cobj (concord-decode-object
99                            '=name wc-name
100                            'word-class@zh-classical))
101       (setq wc-name-id (intern (concord-kanbun-encode-name-as-id wc-name)))
102       (setq wc-cobj (concord-make-object
103                      'word-class@zh-classical wc-name-id))
104       (concord-object-put wc-cobj '=name wc-name))
105     wc-cobj))
106
107
108 (defun concord-kanbun-add-morpheme (entry word-superclass word-class
109                                           word-subclass1 word-subclass2
110                                           word-subclass3
111                                           canonical-form
112                                           ja-form ja-kana ja-conj-type)
113   (let* (entry-cobj
114          canonical-entry-cobj
115          wc-cobj wc-name
116          mm-name
117          mj-name mj-id mj-cobj)
118     (when entry
119       (setq wc-name
120             (if (or (null word-subclass3)
121                     (string-equal word-subclass3 "*"))
122                 (format "%s,%s,%s,%s"
123                         word-superclass word-class
124                         word-subclass1 word-subclass2)
125               (format "%s,%s,%s,%s,%s"
126                       word-superclass word-class
127                       word-subclass1 word-subclass2
128                       word-subclass3)))
129       (setq mm-name (format "%s (%s) [%s]"
130                             entry canonical-form wc-name))
131       (setq mj-name (format "%s (%s (%s),%s)"
132                             mm-name ja-form ja-kana ja-conj-type))
133       (unless (setq mj-cobj (concord-decode-object
134                              '=name mj-name 'morpheme@zh-classical))
135         (setq mj-id (intern (concord-kanbun-encode-name-as-id mj-name)))
136         (setq mj-cobj (concord-make-object 'morpheme@zh-classical mj-id))
137         (concord-object-put mj-cobj '=name mj-name)
138         (when (setq entry-cobj (concord-kanbun-add-morpheme-entry entry))
139           (concord-object-put mj-cobj '->entry@morpheme (list entry-cobj)))
140         (unless (string= entry canonical-form)
141           (when (setq canonical-entry-cobj
142                       (concord-kanbun-add-morpheme-entry canonical-form))
143             (concord-object-put mj-cobj '->entry@morpheme/canonical
144                                 (list canonical-entry-cobj))))
145         (when (setq wc-cobj (concord-kanbun-add-word-class
146                              word-superclass word-class
147                              word-subclass1 word-subclass2
148                              word-subclass3))
149           (concord-object-put mj-cobj '->word-class (list wc-cobj)))
150         (concord-object-put mj-cobj 'ja-form ja-form)
151         (concord-object-put mj-cobj 'ja-kana ja-kana)
152         (concord-object-put mj-cobj 'ja-conjugation-type ja-conj-type))
153       mj-cobj)))
154
155 (defun concord-kanbun-parse-corpus-line (string)
156   (let* ((ret (split-string string "\t"))
157          (ret2 (split-string (nth 1 ret) ",")))
158     (list (car ret)
159           (car ret2)(nth 1 ret2)(nth 2 ret2)(nth 3 ret2)(nth 4 ret2)
160           (nth 6 ret2)
161           (nth 7 ret2)(nth 8 ret2)(nth 9 ret2))))
162
163 (defun concord-kanbun-add-corpus-line (string)
164   (apply #'concord-kanbun-add-morpheme
165          (concord-kanbun-parse-corpus-line string)))
166
167 (defun concord-kanbun-read-sentence (sentence-number &optional source-name)
168   (unless source-name
169     (setq source-name (file-name-nondirectory buffer-file-name)))
170   (let ((beg (point))
171         end send
172         ret
173         sentence dest
174         sentence-name
175         sentence-id-name sentence-id sentence-cobj
176         sentence-entry-cobj)
177     (prog1
178         (save-excursion
179           (when (search-forward "\nEOS\n" nil t)
180             (setq end (match-beginning 0)
181                   send (match-end 0))
182             (goto-char beg)
183             (while (search-forward "\t" end t)
184               (setq ret (concord-kanbun-parse-corpus-line
185                          (buffer-substring (point-at-bol)(point-at-eol))))
186               (setq sentence (concat sentence (car ret)))
187               (setq sentence-name
188                     (concat sentence-name
189                             (if sentence-name
190                                 " ")
191                             (format "%s[%s,%s,%s]"
192                                     (car ret)
193                                     (nth 2 ret)
194                                     (nth 3 ret)(nth 4 ret))))
195               (setq dest
196                     (cons (apply #'concord-kanbun-add-morpheme ret)
197                           dest)))
198             ;; (setq sentence-id
199             ;;       (intern
200             ;;        (concord-kanbun-encode-name-as-id
201             ;;         (format "%s/%d" source-name sentence-number))))
202             (setq sentence-id-name
203                   (format "%s/%d" source-name sentence-number))
204             (setq sentence-id (intern sentence-id-name))
205             (unless (setq sentence-cobj
206                           (concord-decode-object
207                            '=id sentence-id 'sentence@zh-classical))
208               (setq sentence-cobj
209                     (concord-make-object
210                      'sentence@zh-classical sentence-id))
211               (concord-object-put
212                sentence-cobj '=name (format "%s(%s)"
213                                             sentence-name sentence-id-name))
214               ;; (concord-object-put
215               ;;  sentence-cobj '=name (format "%s(%s)"
216               ;;                               sentence sentence-id-name))
217               )
218             (concord-object-put
219              sentence-cobj 'source/file-name source-name)
220             (concord-object-put
221              sentence-cobj 'source/sentence-number sentence-number)
222             (concord-object-put
223              sentence-cobj '->morphemes (nreverse dest))
224             (when (setq sentence-entry-cobj
225                         (concord-kanbun-add-sentence-entry sentence))
226               (concord-object-put
227                sentence-cobj '->entry@sentence (list sentence-entry-cobj)))
228             sentence-cobj))
229       (if send
230           (goto-char send)))))
231
232 (defun concord-kanbun-read-buffer (&optional source-name)
233   (interactive)
234   (save-excursion
235     (goto-char (point-min))
236     (unless source-name
237       (setq source-name (file-name-nondirectory buffer-file-name)))
238     (let ((i 1))
239       (while (concord-kanbun-read-sentence i source-name)
240         (message (format "%s: sentence #%d is stored." source-name i))
241         (setq i (1+ i))))))
242
243 (defun concord-kanbun-batch-read-file ()
244   (set-terminal-coding-system 'utf-8-jp-er)
245   (let ((file (pop command-line-args-left))
246         (coding-system-for-read 'utf-8-jp-er)
247         (file-name-coding-system 'utf-8-jp-er))
248     (with-temp-buffer
249       (insert-file-contents file)
250       (concord-kanbun-read-buffer (file-name-nondirectory file)))))