ea26063678bc4f2a6dde18d3c89e9fa50a3dc994
[chise/concord-kanbun.git] / concord-kanbun-dic.el
1 (concord-assign-genre 'entry@zh-classical "/usr/local/var/kanbun/db")
2 (concord-assign-genre 'word-class@zh-classical "/usr/local/var/kanbun/db")
3 (concord-assign-genre 'morpheme@zh-classical "/usr/local/var/kanbun/db")
4 (concord-assign-genre 'sentence@zh-classical "/usr/local/var/kanbun/db")
5 (mount-char-attribute-table '*instance@morpheme-entry/zh-classical)
6
7 (defun concord-kanbun-encode-name-as-id (name)
8   (let (ucs ret)
9     (mapconcat (lambda (c)
10                  (cond
11                   ((eq c ?\ )
12                    "_"
13                    )
14                   ((or (and (<= ?A c)(<= c ?Z))
15                        (and (<= ?a c)(< c ?u))
16                        (and (< ?u c)(< c ?z))
17                        (and (<= ?0 c)(<= c ?9)))
18                    (char-to-string c)
19                    )
20                   ((setq ucs (or (encode-char c '=ucs@JP)
21                                  (char-ucs c)))
22                    (format "u%04X" ucs)
23                    )
24                   ((setq ret (encode-char c '=ruimoku-v6))
25                    (format "r%04X" ret)
26                    )
27                   ((setq ret (encode-char c '=jef-china3))
28                    (format "j%04X" ret)
29                    )
30                   (t
31                    (error "Unknown character %c" c)
32                    ;; (format "m%08X" (char-id c))
33                    )))
34                name
35                "")))
36
37 (defun concord-kanbun-add-sentence-entry (entry)
38   (let (entry-id me-cobj)
39     (unless (setq me-cobj (concord-decode-object '=name entry
40                                                  'entry@zh-classical))
41       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
42       (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
43       (concord-object-put me-cobj '=name entry))
44     me-cobj))
45
46 (defun concord-kanbun-add-morpheme-entry (entry)
47   (let ((len (length entry))
48         (i 0)
49         entry-id me-cobj chr ret ucs)
50     (unless (setq me-cobj (concord-decode-object
51                            '=name entry
52                            'entry@zh-classical))
53       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
54       (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
55       (concord-object-put me-cobj '=name entry)
56       (if (= len 1)
57           (concord-object-put me-cobj 'character (list (aref entry 0))))
58       (while (< i len)
59         (setq chr (aref entry i))
60         (if (setq ucs (or (encode-char chr '=ucs@JP)
61                           (char-ucs chr)))
62             (setq chr (decode-char '=ucs ucs)))
63         (setq ret (get-char-attribute
64                    chr '*instance@morpheme-entry/zh-classical))
65         (unless (member me-cobj ret)
66           (put-char-attribute chr '*instance@morpheme-entry/zh-classical
67                               (cons me-cobj ret)))
68         (setq i (1+ i)))
69       (save-char-attribute-table '*instance@morpheme-entry/zh-classical))
70     me-cobj))
71
72 ;; (save-char-attribute-table '*instance@morpheme-entry/zh-classical)
73
74 (defun concord-kanbun-add-word-class (&rest word-class)
75   (let (wc-name wc-cobj wc-name-id parent-wc-cobj
76                 r-wcl)
77     (dolist (wcf word-class)
78       (when (and wcf
79                  (not (string= wcf "*")))
80         (setq r-wcl (cons wcf r-wcl))
81         (setq wc-name
82               (if wc-name
83                   (format "%s,%s" wc-name wcf)
84                 wcf))))
85     (unless wc-name
86       (setq wc-name "*"))
87     (unless (setq wc-cobj (concord-decode-object
88                            '=name wc-name
89                            'word-class@zh-classical))
90       (setq wc-name-id (intern (concord-kanbun-encode-name-as-id wc-name)))
91       (setq wc-cobj (concord-make-object
92                      'word-class@zh-classical wc-name-id))
93       (concord-object-put wc-cobj '=name wc-name)
94       (when (and r-wcl
95                  (setq parent-wc-cobj
96                        (apply #'concord-kanbun-add-word-class
97                               (nreverse (cdr r-wcl)))))
98         (concord-object-put
99          wc-cobj '<-subcategory (list parent-wc-cobj)))
100       )
101     wc-cobj))
102
103 (defun concord-kanbun-add-morpheme (entry word-superclass word-class
104                                           word-subclass1 word-subclass2
105                                           word-subclass3
106                                           canonical-form
107                                           ja-form ja-kana ja-conj-type
108                                           comment)
109   (let* (entry-cobj
110          canonical-entry-cobj
111          wc-cobj wc-name
112          mm-name
113          mj-name
114          mjc-name mjc-id mjc-cobj)
115     (when entry
116       (setq wc-name
117             (if (or (null word-subclass3)
118                     (string-equal word-subclass3 "*"))
119                 (format "%s,%s,%s,%s"
120                         word-superclass word-class
121                         word-subclass1 word-subclass2)
122               (format "%s,%s,%s,%s,%s"
123                       word-superclass word-class
124                       word-subclass1 word-subclass2
125                       word-subclass3)))
126       (setq mm-name (format "%s (%s) [%s]"
127                             entry canonical-form wc-name))
128       (setq mj-name (format "%s (%s (%s),%s)"
129                             mm-name ja-form ja-kana ja-conj-type))
130       (setq mjc-name
131             (if comment
132                 (format "%s\t; %s" mj-name comment)
133               mj-name))
134       (unless (setq mjc-cobj (concord-decode-object
135                              '=name mjc-name 'morpheme@zh-classical))
136         (setq mjc-id (intern (concord-kanbun-encode-name-as-id mjc-name)))
137         (setq mjc-cobj (concord-make-object 'morpheme@zh-classical mjc-id))
138         (concord-object-put mjc-cobj '=name mjc-name)
139         (when (setq entry-cobj (concord-kanbun-add-morpheme-entry entry))
140           (concord-object-put mjc-cobj '->entry@morpheme (list entry-cobj)))
141         (unless (string= entry canonical-form)
142           (when (setq canonical-entry-cobj
143                       (concord-kanbun-add-morpheme-entry canonical-form))
144             (concord-object-put mjc-cobj '->entry@morpheme/canonical
145                                 (list canonical-entry-cobj))))
146         (when (setq wc-cobj (concord-kanbun-add-word-class
147                              word-superclass word-class
148                              word-subclass1 word-subclass2
149                              word-subclass3))
150           (concord-object-put mjc-cobj '->word-class (list wc-cobj)))
151         (concord-object-put mjc-cobj 'ja-form ja-form)
152         (concord-object-put mjc-cobj 'ja-kana ja-kana)
153         (concord-object-put mjc-cobj 'ja-conjugation-type ja-conj-type))
154       mjc-cobj)))
155
156 (defun concord-kanbun-parse-corpus-line (string)
157   (let* ((ret (split-string string "\t*[;\e$B!(\e(B]\\s *"))
158          entry features comment)
159     (if (and (setq comment (nth 1 ret))
160              (string-match "[ \t]+$" comment))
161         (setq comment (substring comment 0 (match-beginning 0))))
162     (setq ret (split-string (car ret) "\t"))
163     (setq entry (car ret)
164           features (split-string (nth 1 ret) ","))
165     (list entry
166           (car features)(nth 1 features)
167           (nth 2 features)(nth 3 features)(nth 4 features)
168           (nth 6 features)
169           (nth 7 features)(nth 8 features)(nth 9 features)
170           comment)))
171
172 (defun concord-kanbun-add-corpus-line (string)
173   (apply #'concord-kanbun-add-morpheme
174          (concord-kanbun-parse-corpus-line string)))
175
176 (defun concord-kanbun-read-sentence (sentence-number &optional source-name)
177   (unless source-name
178     (setq source-name (file-name-nondirectory buffer-file-name)))
179   (let ((beg (point))
180         end send
181         ret
182         sentence dest
183         ;; sentence-name
184         sentence-id-name sentence-id sentence-cobj
185         sentence-entry-cobj)
186     (prog1
187         (save-excursion
188           (when (search-forward "\nEOS\n" nil t)
189             (setq end (match-beginning 0)
190                   send (match-end 0))
191             (goto-char beg)
192             (while (search-forward "\t" end t)
193               (setq ret (concord-kanbun-parse-corpus-line
194                          (buffer-substring (point-at-bol)(point-at-eol))))
195               (setq sentence (concat sentence (car ret)))
196               ;; (setq sentence-name
197               ;;       (concat sentence-name
198               ;;               (if sentence-name
199               ;;                   " ")
200               ;;               (format "%s[%s,%s,%s]"
201               ;;                       (car ret)
202               ;;                       (nth 2 ret)
203               ;;                       (nth 3 ret)(nth 4 ret))))
204               (setq dest
205                     (cons (apply #'concord-kanbun-add-morpheme ret)
206                           dest))
207               (goto-char (point-at-eol)))
208             ;; (setq sentence-id
209             ;;       (intern
210             ;;        (concord-kanbun-encode-name-as-id
211             ;;         (format "%s/%d" source-name sentence-number))))
212             (setq sentence-id-name
213                   (format "%s/%d" source-name sentence-number))
214             (setq sentence-id (intern sentence-id-name))
215             (unless (setq sentence-cobj
216                           (concord-decode-object
217                            '=id sentence-id 'sentence@zh-classical))
218               (setq sentence-cobj
219                     (concord-make-object
220                      'sentence@zh-classical sentence-id))
221               ;; (concord-object-put
222               ;;  sentence-cobj '=name (format "%s(%s)"
223               ;;                               sentence-name sentence-id-name))
224               (concord-object-put
225                sentence-cobj '=name (format "%s(%s)"
226                                             sentence sentence-id-name))
227               )
228             (concord-object-put
229              sentence-cobj 'source/file-name source-name)
230             (concord-object-put
231              sentence-cobj 'source/sentence-number sentence-number)
232             (concord-object-put
233              sentence-cobj '->morphemes (nreverse dest))
234             (when (setq sentence-entry-cobj
235                         (concord-kanbun-add-sentence-entry sentence))
236               (concord-object-put
237                sentence-cobj '->entry@sentence (list sentence-entry-cobj)))
238             sentence-cobj))
239       (if send
240           (goto-char send)))))
241
242 (defun concord-kanbun-read-buffer (&optional source-name)
243   (interactive)
244   (save-excursion
245     (goto-char (point-min))
246     (unless source-name
247       (setq source-name (file-name-nondirectory buffer-file-name)))
248     (let ((i 1))
249       (while (concord-kanbun-read-sentence i source-name)
250         (message (format "%s: sentence #%d is stored." source-name i))
251         (setq i (1+ i))))))
252
253 (defun concord-kanbun-batch-read-file ()
254   (set-terminal-coding-system 'utf-8-jp-er)
255   (let ((file (pop command-line-args-left))
256         (coding-system-for-read 'utf-8-jp-er)
257         (file-name-coding-system 'utf-8-jp-er))
258     (with-temp-buffer
259       (insert-file-contents file)
260       (concord-kanbun-read-buffer (file-name-nondirectory file)))))