Assign genre `entry@zh-classical' instead of
[chise/concord-kanbun.git] / concord-kanbun-dic.el
1 (concord-assign-genre 'entry@zh-classical "/usr/local/var/kanbun/db")
2 (concord-assign-genre 'word-class@zh-classical "/usr/local/var/kanbun/db")
3 (concord-assign-genre 'morpheme@zh-classical "/usr/local/var/kanbun/db")
4 (concord-assign-genre 'sentence@zh-classical "/usr/local/var/kanbun/db")
5 (mount-char-attribute-table '*instance@morpheme-entry/zh-classical)
6
7 (defun concord-kanbun-encode-name-as-id (name)
8   (let (ucs ret)
9     (mapconcat (lambda (c)
10                  (cond
11                   ((eq c ?\ )
12                    "_"
13                    )
14                   ((or (and (<= ?A c)(<= c ?Z))
15                        (and (<= ?a c)(< c ?u))
16                        (and (< ?u c)(< c ?z))
17                        (and (<= ?0 c)(<= c ?9)))
18                    (char-to-string c)
19                    )
20                   ((setq ucs (or (encode-char c '=ucs@JP)
21                                  (char-ucs c)))
22                    (format "u%04X" ucs)
23                    )
24                   ((setq ret (encode-char c '=ruimoku-v6))
25                    (format "r%04X" ret)
26                    )
27                   ((setq ret (encode-char c '=jef-china3))
28                    (format "j%04X" ret)
29                    )
30                   (t
31                    (error "Unknown character %c" c)
32                    ;; (format "m%08X" (char-id c))
33                    )))
34                name
35                "")))
36
37 (defun concord-kanbun-add-sentence-entry (entry)
38   (let (entry-id me-cobj)
39     (unless (setq me-cobj (concord-decode-object '=name entry
40                                                  'entry@zh-classical))
41       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
42       (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
43       (concord-object-put me-cobj '=name entry))
44     me-cobj))
45
46 (defun concord-kanbun-add-morpheme-entry (entry)
47   (let ((len (length entry))
48         (i 0)
49         entry-id me-cobj chr ret ucs)
50     (unless (setq me-cobj (concord-decode-object
51                            '=name entry
52                            'entry@zh-classical))
53       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
54       (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
55       (concord-object-put me-cobj '=name entry)
56       (if (= len 1)
57           (concord-object-put me-cobj 'character (list (aref entry 0))))
58       (while (< i len)
59         (setq chr (aref entry i))
60         (if (setq ucs (or (encode-char chr '=ucs@JP)
61                           (char-ucs chr)))
62             (setq chr (decode-char '=ucs ucs)))
63         (setq ret (get-char-attribute
64                    chr '*instance@morpheme-entry/zh-classical))
65         (unless (member me-cobj ret)
66           (put-char-attribute chr '*instance@morpheme-entry/zh-classical
67                               (cons me-cobj ret)))
68         (setq i (1+ i)))
69       (save-char-attribute-table '*instance@morpheme-entry/zh-classical))
70     me-cobj))
71
72 ;; (save-char-attribute-table '*instance@morpheme-entry/zh-classical)
73
74 (defun concord-kanbun-add-word-class (word-superclass
75                                       word-class
76                                       word-subclass1 word-subclass2)
77   (let (wc-name wc-cobj wc-name-id)
78     (when (and word-superclass
79                (not (string= word-superclass "*")))
80       (setq wc-name word-superclass)
81       (when (and word-class
82                  (not (string= word-class "*")))
83         (setq wc-name
84               (format "%s,%s" wc-name word-class))
85         (when (and word-subclass1
86                    (not (string= word-subclass1 "*")))
87           (setq wc-name
88                 (format "%s,%s" wc-name word-subclass1))
89           (when (and word-subclass2
90                      (not (string= word-subclass2 "*")))
91             (setq wc-name
92                   (format "%s,%s" wc-name word-subclass2))))))
93     (unless (setq wc-cobj (concord-decode-object
94                            '=name wc-name
95                            'word-class@zh-classical))
96       (setq wc-name-id (intern (concord-kanbun-encode-name-as-id wc-name)))
97       (setq wc-cobj (concord-make-object
98                      'word-class@zh-classical wc-name-id))
99       (concord-object-put wc-cobj '=name wc-name))
100     wc-cobj))
101
102
103 (defun concord-kanbun-add-morpheme (entry word-superclass word-class
104                                           word-subclass1 word-subclass2
105                                           canonical-form
106                                           ja-form ja-kana ja-conj-type)
107   (let* (entry-cobj
108          canonical-entry-cobj
109          wc-cobj wc-name
110          mm-name
111          mj-name mj-id mj-cobj)
112     (when entry
113       (setq wc-name (format "%s,%s,%s,%s"
114                             word-superclass word-class
115                             word-subclass1 word-subclass2))
116       (setq mm-name (format "%s (%s) [%s]"
117                             entry canonical-form wc-name))
118       (setq mj-name (format "%s (%s (%s),%s)"
119                             mm-name ja-form ja-kana ja-conj-type))
120       (unless (setq mj-cobj (concord-decode-object
121                              '=name mj-name 'morpheme@zh-classical))
122         (setq mj-id (intern (concord-kanbun-encode-name-as-id mj-name)))
123         (setq mj-cobj (concord-make-object 'morpheme@zh-classical mj-id))
124         (concord-object-put mj-cobj '=name mj-name)
125         (when (setq entry-cobj (concord-kanbun-add-morpheme-entry entry))
126           (concord-object-put mj-cobj '->entry@morpheme (list entry-cobj)))
127         (unless (string= entry canonical-form)
128           (when (setq canonical-entry-cobj
129                       (concord-kanbun-add-morpheme-entry canonical-form))
130             (concord-object-put mj-cobj '->entry@morpheme/canonical
131                                 (list canonical-entry-cobj))))
132         (when (setq wc-cobj (concord-kanbun-add-word-class
133                              word-superclass word-class
134                              word-subclass1 word-subclass2))
135           (concord-object-put mj-cobj '->word-class (list wc-cobj)))
136         (concord-object-put mj-cobj 'ja-form ja-form)
137         (concord-object-put mj-cobj 'ja-kana ja-kana)
138         (concord-object-put mj-cobj 'ja-conjugation-type ja-conj-type))
139       mj-cobj)))
140
141 (defun concord-kanbun-parse-corpus-line (string)
142   (let* ((ret (split-string string "\t"))
143          (ret2 (split-string (nth 1 ret) ",")))
144     (list (car ret)
145           (car ret2)(nth 1 ret2)(nth 2 ret2)(nth 3 ret2)
146           (nth 6 ret2)
147           (nth 7 ret2)(nth 8 ret2)(nth 9 ret2))))
148
149 (defun concord-kanbun-add-corpus-line (string)
150   (apply #'concord-kanbun-add-morpheme
151          (concord-kanbun-parse-corpus-line string)))
152
153 (defun concord-kanbun-read-sentence (sentence-number &optional source-name)
154   (unless source-name
155     (setq source-name (file-name-nondirectory buffer-file-name)))
156   (let ((beg (point))
157         end send
158         ret
159         sentence dest
160         sentence-name
161         sentence-id-name sentence-id sentence-cobj
162         sentence-entry-cobj)
163     (prog1
164         (save-excursion
165           (when (search-forward "\nEOS\n" nil t)
166             (setq end (match-beginning 0)
167                   send (match-end 0))
168             (goto-char beg)
169             (while (search-forward "\t" end t)
170               (setq ret (concord-kanbun-parse-corpus-line
171                          (buffer-substring (point-at-bol)(point-at-eol))))
172               (setq sentence (concat sentence (car ret)))
173               (setq sentence-name
174                     (concat sentence-name
175                             (if sentence-name
176                                 " ")
177                             (format "%s[%s,%s,%s]"
178                                     (car ret)
179                                     (nth 2 ret)
180                                     (nth 3 ret)(nth 4 ret))))
181               (setq dest
182                     (cons (apply #'concord-kanbun-add-morpheme ret)
183                           dest)))
184             ;; (setq sentence-id
185             ;;       (intern
186             ;;        (concord-kanbun-encode-name-as-id
187             ;;         (format "%s/%d" source-name sentence-number))))
188             (setq sentence-id-name
189                   (format "%s/%d" source-name sentence-number))
190             (setq sentence-id (intern sentence-id-name))
191             (unless (setq sentence-cobj
192                           (concord-decode-object
193                            '=id sentence-id 'sentence@zh-classical))
194               (setq sentence-cobj
195                     (concord-make-object
196                      'sentence@zh-classical sentence-id))
197               (concord-object-put
198                sentence-cobj '=name (format "%s(%s)"
199                                             sentence-name sentence-id-name))
200               ;; (concord-object-put
201               ;;  sentence-cobj '=name (format "%s(%s)"
202               ;;                               sentence sentence-id-name))
203               )
204             (concord-object-put
205              sentence-cobj 'source/file-name source-name)
206             (concord-object-put
207              sentence-cobj 'source/sentence-number sentence-number)
208             (concord-object-put
209              sentence-cobj '->morphemes (nreverse dest))
210             (when (setq sentence-entry-cobj
211                         (concord-kanbun-add-sentence-entry sentence))
212               (concord-object-put
213                sentence-cobj '->entry@sentence (list sentence-entry-cobj)))
214             sentence-cobj))
215       (if send
216           (goto-char send)))))
217
218 (defun concord-kanbun-read-buffer (&optional source-name)
219   (interactive)
220   (save-excursion
221     (goto-char (point-min))
222     (unless source-name
223       (setq source-name (file-name-nondirectory buffer-file-name)))
224     (let ((i 1))
225       (while (concord-kanbun-read-sentence i source-name)
226         (message (format "%s: sentence #%d is stored." source-name i))
227         (setq i (1+ i))))))
228
229 (defun concord-kanbun-batch-read-file ()
230   (set-terminal-coding-system 'utf-8-jp-er)
231   (let ((file (pop command-line-args-left))
232         (coding-system-for-read 'utf-8-jp-er)
233         (file-name-coding-system 'utf-8-jp-er))
234     (with-temp-buffer
235       (insert-file-contents file)
236       (concord-kanbun-read-buffer (file-name-nondirectory file)))))