From: MORIOKA Tomohiko Date: Wed, 4 Sep 2013 13:09:35 +0000 (+0900) Subject: New file. X-Git-Url: http://git.chise.org/gitweb/?a=commitdiff_plain;h=b6cd44656c6aace5c504228ea82e7724c3d48932;p=chise%2Fconcord-kanbun.git New file. --- b6cd44656c6aace5c504228ea82e7724c3d48932 diff --git a/est-kanbun-dic.el b/est-kanbun-dic.el new file mode 100644 index 0000000..1cfcd70 --- /dev/null +++ b/est-kanbun-dic.el @@ -0,0 +1,211 @@ +(concord-assign-genre 'morpheme-entry@zh-classical "/usr/local/var/kanbun/db") +(concord-assign-genre 'word-class@zh-classical "/usr/local/var/kanbun/db") +(concord-assign-genre 'morpheme@zh-classical "/usr/local/var/kanbun/db") +(concord-assign-genre 'sentence@zh-classical "/usr/local/var/kanbun/db") +(concord-assign-genre 'sentence-entry@zh-classical "/usr/local/var/kanbun/db") +(mount-char-attribute-table '*instance@morpheme-entry/zh-classical) + +(defun concord-kanbun-encode-name-as-id (name) + (let (ucs ret) + (mapconcat (lambda (c) + (cond + ((eq c ?\ ) + "_" + ) + ((or (and (<= ?A c)(<= c ?Z)) + (and (<= ?a c)(< c ?u)) + (and (< ?u c)(< c ?z)) + (and (<= ?0 c)(<= c ?9))) + (char-to-string c) + ) + ((setq ucs (or (encode-char c '=ucs@JP) + (char-ucs c))) + (format "u%04X" ucs) + ) + ((setq ret (encode-char c '=ruimoku-v6)) + (format "r%04X" ret) + ) + ((setq ret (encode-char c '=jef-china3)) + (format "j%04X" ret) + ) + (t + (error "Unknown character %c" c) + ;; (format "m%08X" (char-id c)) + ))) + name + ""))) + +(defun concord-kanbun-add-sentence-entry (entry) + (let (entry-id me-cobj) + (unless (setq me-cobj (concord-decode-object + '=name entry + 'sentence-entry@zh-classical)) + (setq entry-id (intern (concord-kanbun-encode-name-as-id entry))) + (setq me-cobj (concord-make-object + 'sentence-entry@zh-classical entry-id)) + (concord-object-put me-cobj '=name entry)) + me-cobj)) + +(defun concord-kanbun-add-morpheme-entry (entry) + (let ((len (length entry)) + (i 0) + entry-id me-cobj chr ret ucs) + (unless (setq me-cobj (concord-decode-object + '=name entry + 'morpheme-entry@zh-classical)) + (setq entry-id (intern (concord-kanbun-encode-name-as-id entry))) + (setq me-cobj (concord-make-object + 'morpheme-entry@zh-classical entry-id)) + (concord-object-put me-cobj '=name entry) + (if (= len 1) + (concord-object-put me-cobj 'character (list (aref entry 0)))) + (while (< i len) + (setq chr (aref entry i)) + (if (setq ucs (or (encode-char chr '=ucs@JP) + (char-ucs chr))) + (setq chr (decode-char '=ucs ucs))) + (setq ret (get-char-attribute + chr '*instance@morpheme-entry/zh-classical)) + (unless (member me-cobj ret) + (put-char-attribute chr '*instance@morpheme-entry/zh-classical + (cons me-cobj ret))) + (setq i (1+ i))) + (save-char-attribute-table '*instance@morpheme-entry/zh-classical)) + me-cobj)) + +;; (save-char-attribute-table '*instance@morpheme-entry/zh-classical) + +(defun concord-kanbun-add-word-class (word-superclass + word-class + word-subclass1 word-subclass2) + (let (wc-name wc-cobj wc-name-id) + (when (and word-superclass + (not (string= word-superclass "*"))) + (setq wc-name word-superclass) + (when (and word-class + (not (string= word-class "*"))) + (setq wc-name + (format "%s,%s" wc-name word-class)) + (when (and word-subclass1 + (not (string= word-subclass1 "*"))) + (setq wc-name + (format "%s,%s" wc-name word-subclass1)) + (when (and word-subclass2 + (not (string= word-subclass2 "*"))) + (setq wc-name + (format "%s,%s" wc-name word-subclass2)))))) + (unless (setq wc-cobj (concord-decode-object + '=name wc-name + 'word-class@zh-classical)) + (setq wc-name-id (intern (concord-kanbun-encode-name-as-id wc-name))) + (setq wc-cobj (concord-make-object + 'word-class@zh-classical wc-name-id)) + (concord-object-put wc-cobj '=name wc-name)) + wc-cobj)) + + +(defun concord-kanbun-add-morpheme (entry word-superclass word-class + word-subclass1 word-subclass2 + canonical-form + ja-form ja-kana ja-conj-type) + (let* (entry-cobj + wc-cobj wc-name + mm-name + mj-name mj-id mj-cobj) + (when entry + (setq wc-name (format "%s,%s,%s,%s" + word-superclass word-class + word-subclass1 word-subclass2)) + (setq mm-name (format "%s (%s) [%s]" + entry canonical-form wc-name)) + (setq mj-name (format "%s (%s (%s),%s)" + mm-name ja-form ja-kana ja-conj-type)) + (unless (setq mj-cobj (concord-decode-object + '=name mj-name 'morpheme@zh-classical)) + (setq mj-id (intern (concord-kanbun-encode-name-as-id mj-name))) + (setq mj-cobj (concord-make-object 'morpheme@zh-classical mj-id)) + (concord-object-put mj-cobj '=name mj-name) + (when (setq entry-cobj (concord-kanbun-add-morpheme-entry entry)) + (concord-object-put mj-cobj '->entry (list entry-cobj))) + (when (setq wc-cobj (concord-kanbun-add-word-class + word-superclass word-class + word-subclass1 word-subclass2)) + (concord-object-put mj-cobj '->word-class (list wc-cobj))) + (concord-object-put mj-cobj 'ja-form ja-form) + (concord-object-put mj-cobj 'ja-kana ja-kana) + (concord-object-put mj-cobj 'ja-conjugation-type ja-conj-type)) + mj-cobj))) + +(defun concord-kanbun-parse-corpus-line (string) + (let* ((ret (split-string string "\t")) + (ret2 (split-string (nth 1 ret) ","))) + (list (car ret) + (car ret2)(nth 1 ret2)(nth 2 ret2)(nth 3 ret2) + (nth 6 ret2) + (nth 7 ret2)(nth 8 ret2)(nth 9 ret2)))) + +(defun concord-kanbun-add-corpus-line (string) + (apply #'concord-kanbun-add-morpheme + (concord-kanbun-parse-corpus-line string))) + +(defun concord-kanbun-read-sentence (sentence-number &optional source-name) + (unless source-name + (setq source-name (file-name-nondirectory buffer-file-name))) + (let ((beg (point)) + end send + ret + sentence dest + sentence-id-name sentence-id sentence-cobj + sentence-entry-cobj) + (prog1 + (save-excursion + (when (search-forward "\nEOS\n" nil t) + (setq end (match-beginning 0) + send (match-end 0)) + (goto-char beg) + (while (search-forward "\t" end t) + (setq ret (concord-kanbun-parse-corpus-line + (buffer-substring (point-at-bol)(point-at-eol)))) + (setq sentence (concat sentence (car ret))) + (setq dest + (cons (apply #'concord-kanbun-add-morpheme ret) + dest))) + ;; (setq sentence-id + ;; (intern + ;; (concord-kanbun-encode-name-as-id + ;; (format "%s/%d" source-name sentence-number)))) + (setq sentence-id-name + (format "%s/%d" source-name sentence-number)) + (setq sentence-id (intern sentence-id-name)) + (unless (setq sentence-cobj + (concord-decode-object + '=id sentence-id 'sentence@zh-classical)) + (setq sentence-cobj + (concord-make-object + 'sentence@zh-classical sentence-id)) + (concord-object-put + sentence-cobj '=name (format "%s(%s)" + sentence sentence-id-name))) + (concord-object-put + sentence-cobj 'source/file-name source-name) + (concord-object-put + sentence-cobj 'source/sentence-number sentence-number) + (concord-object-put + sentence-cobj '->morphemes (nreverse dest)) + (when (setq sentence-entry-cobj + (concord-kanbun-add-sentence-entry sentence)) + (concord-object-put + sentence-cobj '->entry (list sentence-entry-cobj))) + sentence-cobj)) + (if send + (goto-char send))))) + +(defun concord-kanbun-read-buffer (&optional source-name) + (interactive) + (save-excursion + (goto-char (point-min)) + (unless source-name + (setq source-name (file-name-nondirectory buffer-file-name))) + (let ((i 1)) + (while (concord-kanbun-read-sentence i source-name) + (setq i (1+ i))))))