Renamed.
[chise/concord-kanbun.git] / concord-kanbun-dic.el
1 (concord-assign-genre 'morpheme-entry@zh-classical "/usr/local/var/kanbun/db")
2 (concord-assign-genre 'word-class@zh-classical "/usr/local/var/kanbun/db")
3 (concord-assign-genre 'morpheme@zh-classical "/usr/local/var/kanbun/db")
4 (concord-assign-genre 'sentence@zh-classical "/usr/local/var/kanbun/db")
5 (concord-assign-genre 'sentence-entry@zh-classical "/usr/local/var/kanbun/db")
6 (mount-char-attribute-table '*instance@morpheme-entry/zh-classical)
7
8 (defun concord-kanbun-encode-name-as-id (name)
9   (let (ucs ret)
10     (mapconcat (lambda (c)
11                  (cond
12                   ((eq c ?\ )
13                    "_"
14                    )
15                   ((or (and (<= ?A c)(<= c ?Z))
16                        (and (<= ?a c)(< c ?u))
17                        (and (< ?u c)(< c ?z))
18                        (and (<= ?0 c)(<= c ?9)))
19                    (char-to-string c)
20                    )
21                   ((setq ucs (or (encode-char c '=ucs@JP)
22                                  (char-ucs c)))
23                    (format "u%04X" ucs)
24                    )
25                   ((setq ret (encode-char c '=ruimoku-v6))
26                    (format "r%04X" ret)
27                    )
28                   ((setq ret (encode-char c '=jef-china3))
29                    (format "j%04X" ret)
30                    )
31                   (t
32                    (error "Unknown character %c" c)
33                    ;; (format "m%08X" (char-id c))
34                    )))
35                name
36                "")))
37
38 (defun concord-kanbun-add-sentence-entry (entry)
39   (let (entry-id me-cobj)
40     (unless (setq me-cobj (concord-decode-object
41                            '=name entry
42                            'sentence-entry@zh-classical))
43       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
44       (setq me-cobj (concord-make-object
45                      'sentence-entry@zh-classical entry-id))
46       (concord-object-put me-cobj '=name entry))
47     me-cobj))
48
49 (defun concord-kanbun-add-morpheme-entry (entry)
50   (let ((len (length entry))
51         (i 0)
52         entry-id me-cobj chr ret ucs)
53     (unless (setq me-cobj (concord-decode-object
54                            '=name entry
55                            'morpheme-entry@zh-classical))
56       (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
57       (setq me-cobj (concord-make-object
58                      'morpheme-entry@zh-classical entry-id))
59       (concord-object-put me-cobj '=name entry)
60       (if (= len 1)
61           (concord-object-put me-cobj 'character (list (aref entry 0))))
62       (while (< i len)
63         (setq chr (aref entry i))
64         (if (setq ucs (or (encode-char chr '=ucs@JP)
65                           (char-ucs chr)))
66             (setq chr (decode-char '=ucs ucs)))
67         (setq ret (get-char-attribute
68                    chr '*instance@morpheme-entry/zh-classical))
69         (unless (member me-cobj ret)
70           (put-char-attribute chr '*instance@morpheme-entry/zh-classical
71                               (cons me-cobj ret)))
72         (setq i (1+ i)))
73       (save-char-attribute-table '*instance@morpheme-entry/zh-classical))
74     me-cobj))
75
76 ;; (save-char-attribute-table '*instance@morpheme-entry/zh-classical)
77
78 (defun concord-kanbun-add-word-class (word-superclass
79                                       word-class
80                                       word-subclass1 word-subclass2)
81   (let (wc-name wc-cobj wc-name-id)
82     (when (and word-superclass
83                (not (string= word-superclass "*")))
84       (setq wc-name word-superclass)
85       (when (and word-class
86                  (not (string= word-class "*")))
87         (setq wc-name
88               (format "%s,%s" wc-name word-class))
89         (when (and word-subclass1
90                    (not (string= word-subclass1 "*")))
91           (setq wc-name
92                 (format "%s,%s" wc-name word-subclass1))
93           (when (and word-subclass2
94                      (not (string= word-subclass2 "*")))
95             (setq wc-name
96                   (format "%s,%s" wc-name word-subclass2))))))
97     (unless (setq wc-cobj (concord-decode-object
98                            '=name wc-name
99                            'word-class@zh-classical))
100       (setq wc-name-id (intern (concord-kanbun-encode-name-as-id wc-name)))
101       (setq wc-cobj (concord-make-object
102                      'word-class@zh-classical wc-name-id))
103       (concord-object-put wc-cobj '=name wc-name))
104     wc-cobj))
105
106
107 (defun concord-kanbun-add-morpheme (entry word-superclass word-class
108                                           word-subclass1 word-subclass2
109                                           canonical-form
110                                           ja-form ja-kana ja-conj-type)
111   (let* (entry-cobj
112          wc-cobj wc-name
113          mm-name
114          mj-name mj-id mj-cobj)
115     (when entry
116       (setq wc-name (format "%s,%s,%s,%s"
117                             word-superclass word-class
118                             word-subclass1 word-subclass2))
119       (setq mm-name (format "%s (%s) [%s]"
120                             entry canonical-form wc-name))
121       (setq mj-name (format "%s (%s (%s),%s)"
122                             mm-name ja-form ja-kana ja-conj-type))
123       (unless (setq mj-cobj (concord-decode-object
124                              '=name mj-name 'morpheme@zh-classical))
125         (setq mj-id (intern (concord-kanbun-encode-name-as-id mj-name)))
126         (setq mj-cobj (concord-make-object 'morpheme@zh-classical mj-id))
127         (concord-object-put mj-cobj '=name mj-name)
128         (when (setq entry-cobj (concord-kanbun-add-morpheme-entry entry))
129           (concord-object-put mj-cobj '->entry (list entry-cobj)))
130         (when (setq wc-cobj (concord-kanbun-add-word-class
131                              word-superclass word-class
132                              word-subclass1 word-subclass2))
133           (concord-object-put mj-cobj '->word-class (list wc-cobj)))
134         (concord-object-put mj-cobj 'ja-form ja-form)
135         (concord-object-put mj-cobj 'ja-kana ja-kana)
136         (concord-object-put mj-cobj 'ja-conjugation-type ja-conj-type))
137       mj-cobj)))
138
139 (defun concord-kanbun-parse-corpus-line (string)
140   (let* ((ret (split-string string "\t"))
141          (ret2 (split-string (nth 1 ret) ",")))
142     (list (car ret)
143           (car ret2)(nth 1 ret2)(nth 2 ret2)(nth 3 ret2)
144           (nth 6 ret2)
145           (nth 7 ret2)(nth 8 ret2)(nth 9 ret2))))
146
147 (defun concord-kanbun-add-corpus-line (string)
148   (apply #'concord-kanbun-add-morpheme
149          (concord-kanbun-parse-corpus-line string)))
150
151 (defun concord-kanbun-read-sentence (sentence-number &optional source-name)
152   (unless source-name
153     (setq source-name (file-name-nondirectory buffer-file-name)))
154   (let ((beg (point))
155         end send
156         ret
157         sentence dest
158         sentence-id-name sentence-id sentence-cobj
159         sentence-entry-cobj)
160     (prog1
161         (save-excursion
162           (when (search-forward "\nEOS\n" nil t)
163             (setq end (match-beginning 0)
164                   send (match-end 0))
165             (goto-char beg)
166             (while (search-forward "\t" end t)
167               (setq ret (concord-kanbun-parse-corpus-line
168                          (buffer-substring (point-at-bol)(point-at-eol))))
169               (setq sentence (concat sentence (car ret)))
170               (setq dest
171                     (cons (apply #'concord-kanbun-add-morpheme ret)
172                           dest)))
173             ;; (setq sentence-id
174             ;;       (intern
175             ;;        (concord-kanbun-encode-name-as-id
176             ;;         (format "%s/%d" source-name sentence-number))))
177             (setq sentence-id-name
178                   (format "%s/%d" source-name sentence-number))
179             (setq sentence-id (intern sentence-id-name))
180             (unless (setq sentence-cobj
181                           (concord-decode-object
182                            '=id sentence-id 'sentence@zh-classical))
183               (setq sentence-cobj
184                     (concord-make-object
185                      'sentence@zh-classical sentence-id))
186               (concord-object-put
187                sentence-cobj '=name (format "%s(%s)"
188                                             sentence sentence-id-name)))
189             (concord-object-put
190              sentence-cobj 'source/file-name source-name)
191             (concord-object-put
192              sentence-cobj 'source/sentence-number sentence-number)
193             (concord-object-put
194              sentence-cobj '->morphemes (nreverse dest))
195             (when (setq sentence-entry-cobj
196                         (concord-kanbun-add-sentence-entry sentence))
197               (concord-object-put
198                sentence-cobj '->entry (list sentence-entry-cobj)))
199             sentence-cobj))
200       (if send
201           (goto-char send)))))
202
203 (defun concord-kanbun-read-buffer (&optional source-name)
204   (interactive)
205   (save-excursion
206     (goto-char (point-min))
207     (unless source-name
208       (setq source-name (file-name-nondirectory buffer-file-name)))
209     (let ((i 1))
210       (while (concord-kanbun-read-sentence i source-name)
211         (setq i (1+ i))))))