1 (concord-assign-genre 'entry@zh-classical "/usr/local/var/kanbun/db")
2 (concord-assign-genre 'word-class@zh-classical "/usr/local/var/kanbun/db")
3 (concord-assign-genre 'morpheme@zh-classical "/usr/local/var/kanbun/db")
4 (concord-assign-genre 'sentence@zh-classical "/usr/local/var/kanbun/db")
5 (mount-char-attribute-table '*instance@morpheme-entry/zh-classical)
7 (defun concord-kanbun-word-class-canonical-p (word-class)
8 (memq 'mkwcs (concord-object-get word-class 'sources)))
10 (defun concord-kanbun-encode-name-as-id (name)
12 (mapconcat (lambda (c)
17 ((or (and (<= ?A c)(<= c ?Z))
18 (and (<= ?a c)(< c ?u))
19 (and (< ?u c)(< c ?z))
20 (and (<= ?0 c)(<= c ?9)))
23 ((setq ucs (or (encode-char c '=ucs@JP)
27 ((setq ret (encode-char c '=ruimoku-v6))
30 ((setq ret (encode-char c '=jef-china3))
34 (error "Unknown character %c" c)
35 ;; (format "m%08X" (char-id c))
40 (defun concord-kanbun-add-sentence-entry (entry)
41 (let (entry-id me-cobj)
42 (unless (setq me-cobj (concord-decode-object '=name entry
44 (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
45 (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
46 (concord-object-put me-cobj '=name entry))
49 (defun concord-kanbun-add-morpheme-entry (entry)
50 (let ((len (length entry))
52 entry-id me-cobj chr ret ucs)
53 (unless (setq me-cobj (concord-decode-object
56 (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
57 (setq me-cobj (concord-make-object 'entry@zh-classical entry-id))
58 (concord-object-put me-cobj '=name entry)
60 (concord-object-put me-cobj 'character (list (aref entry 0))))
62 (setq chr (aref entry i))
63 (if (setq ucs (or (encode-char chr '=ucs@JP)
65 (setq chr (decode-char '=ucs ucs)))
66 (setq ret (get-char-attribute
67 chr '*instance@morpheme-entry/zh-classical))
68 (unless (member me-cobj ret)
69 (put-char-attribute chr '*instance@morpheme-entry/zh-classical
72 (save-char-attribute-table '*instance@morpheme-entry/zh-classical))
75 ;; (save-char-attribute-table '*instance@morpheme-entry/zh-classical)
77 (defun concord-kanbun-add-word-class (&rest word-class)
78 (let (wc-name wc-cobj wc-name-id parent-wc-cobj
80 (dolist (wcf word-class)
82 (not (string= wcf "*")))
83 (setq r-wcl (cons wcf r-wcl))
86 (format "%s,%s" wc-name wcf)
90 (unless (setq wc-cobj (concord-decode-object
92 'word-class@zh-classical))
93 (setq wc-name-id (intern (concord-kanbun-encode-name-as-id wc-name)))
94 (setq wc-cobj (concord-make-object
95 'word-class@zh-classical wc-name-id))
96 (concord-object-put wc-cobj '=name wc-name)
99 (apply #'concord-kanbun-add-word-class
100 (nreverse (cdr r-wcl)))))
102 wc-cobj '<-subcategory@misc (list parent-wc-cobj)))
106 (defun concord-kanbun-add-morpheme (entry word-superclass word-class
107 word-subclass1 word-subclass2
110 ja-form ja-kana ja-conj-type
114 wc-cobj wc-name wc-canonical-flag
117 mjc-name mjc-id mjc-cobj)
120 (if (or (null word-subclass3)
121 (string-equal word-subclass3 "*"))
122 (format "%s,%s,%s,%s"
123 word-superclass word-class
124 word-subclass1 word-subclass2)
125 (format "%s,%s,%s,%s,%s"
126 word-superclass word-class
127 word-subclass1 word-subclass2
129 (setq mm-name (format "%s (%s) [%s]"
130 entry canonical-form wc-name))
131 (setq mj-name (format "%s (%s (%s),%s)"
132 mm-name ja-form ja-kana ja-conj-type))
135 (format "%s\t; %s" mj-name comment)
137 (unless (setq mjc-cobj (concord-decode-object
138 '=name mjc-name 'morpheme@zh-classical))
139 (setq mjc-id (intern (concord-kanbun-encode-name-as-id mjc-name)))
140 (setq mjc-cobj (concord-make-object 'morpheme@zh-classical mjc-id))
141 (concord-object-put mjc-cobj '=name mjc-name)
142 (setq entry-cobj (concord-kanbun-add-morpheme-entry entry))
143 (setq wc-cobj (concord-kanbun-add-word-class
144 word-superclass word-class
145 word-subclass1 word-subclass2
147 (concord-object-put mjc-cobj '->word-class (list wc-cobj))
148 (setq wc-canonical-flag
149 (concord-kanbun-word-class-canonical-p wc-cobj))
150 (concord-object-put mjc-cobj
151 (if wc-canonical-flag
153 '->entry@morpheme/misc)
155 (unless (string= entry canonical-form)
156 (when (setq canonical-entry-cobj
157 (concord-kanbun-add-morpheme-entry canonical-form))
158 (concord-object-put mjc-cobj
159 (if wc-canonical-flag
160 '->entry@morpheme/canonical
161 '->entry@morpheme/canonical/misc)
162 (list canonical-entry-cobj))))
163 (concord-object-put mjc-cobj 'ja-form ja-form)
164 (concord-object-put mjc-cobj 'ja-kana ja-kana)
165 (concord-object-put mjc-cobj 'ja-conjugation-type ja-conj-type))
168 (defun concord-kanbun-parse-corpus-line (string)
169 (let* ((ret (split-string string "\t*[;
\e$B!(
\e(B]\\s *"))
170 entry features comment)
171 (if (and (setq comment (nth 1 ret))
172 (string-match "[ \t]+$" comment))
173 (setq comment (substring comment 0 (match-beginning 0))))
174 (setq ret (split-string (car ret) "\t"))
175 (setq entry (car ret)
176 features (split-string (nth 1 ret) ","))
178 (car features)(nth 1 features)
179 (nth 2 features)(nth 3 features)(nth 4 features)
181 (nth 7 features)(nth 8 features)(nth 9 features)
184 (defun concord-kanbun-add-corpus-line (string)
185 (apply #'concord-kanbun-add-morpheme
186 (concord-kanbun-parse-corpus-line string)))
188 (defun concord-kanbun-read-sentence (sentence-number &optional source-name)
190 (setq source-name (file-name-nondirectory buffer-file-name)))
196 sentence-id-name sentence-id sentence-cobj
200 (when (search-forward "\nEOS\n" nil t)
201 (setq end (match-beginning 0)
204 (while (search-forward "\t" end t)
205 (setq ret (concord-kanbun-parse-corpus-line
206 (buffer-substring (point-at-bol)(point-at-eol))))
207 (setq sentence (concat sentence (car ret)))
208 ;; (setq sentence-name
209 ;; (concat sentence-name
212 ;; (format "%s[%s,%s,%s]"
215 ;; (nth 3 ret)(nth 4 ret))))
217 (cons (apply #'concord-kanbun-add-morpheme ret)
219 (goto-char (point-at-eol)))
222 ;; (concord-kanbun-encode-name-as-id
223 ;; (format "%s/%d" source-name sentence-number))))
224 (setq sentence-id-name
225 (format "%s/%d" source-name sentence-number))
226 (setq sentence-id (intern sentence-id-name))
227 (unless (setq sentence-cobj
228 (concord-decode-object
229 '=id sentence-id 'sentence@zh-classical))
232 'sentence@zh-classical sentence-id))
233 ;; (concord-object-put
234 ;; sentence-cobj '=name (format "%s(%s)"
235 ;; sentence-name sentence-id-name))
237 sentence-cobj '=name (format "%s(%s)"
238 sentence sentence-id-name))
241 sentence-cobj 'source/file-name source-name)
243 sentence-cobj 'source/sentence-number sentence-number)
245 sentence-cobj '->morphemes (nreverse dest))
246 (when (setq sentence-entry-cobj
247 (concord-kanbun-add-sentence-entry sentence))
249 sentence-cobj '->entry@sentence (list sentence-entry-cobj)))
254 (defun concord-kanbun-read-buffer (&optional source-name)
257 (goto-char (point-min))
259 (setq source-name (file-name-nondirectory buffer-file-name)))
261 (while (concord-kanbun-read-sentence i source-name)
262 (message (format "%s: sentence #%d is stored." source-name i))
265 (defun concord-kanbun-batch-read-file ()
266 (set-terminal-coding-system 'utf-8-jp-er)
267 (let ((file (pop command-line-args-left))
268 (coding-system-for-read 'utf-8-jp-er)
269 (file-name-coding-system 'utf-8-jp-er))
271 (insert-file-contents file)
272 (concord-kanbun-read-buffer (file-name-nondirectory file)))))