(concord-assign-genre 'entry@zh-classical "/usr/local/var/kanbun/db") (concord-assign-genre 'word-class@zh-classical "/usr/local/var/kanbun/db") (concord-assign-genre 'morpheme@zh-classical "/usr/local/var/kanbun/db") (concord-assign-genre 'sentence@zh-classical "/usr/local/var/kanbun/db") (mount-char-attribute-table '*instance@morpheme-entry/zh-classical) (defun concord-kanbun-encode-name-as-id (name) (let (ucs ret) (mapconcat (lambda (c) (cond ((eq c ?\ ) "_" ) ((or (and (<= ?A c)(<= c ?Z)) (and (<= ?a c)(< c ?u)) (and (< ?u c)(< c ?z)) (and (<= ?0 c)(<= c ?9))) (char-to-string c) ) ((setq ucs (or (encode-char c '=ucs@JP) (char-ucs c))) (format "u%04X" ucs) ) ((setq ret (encode-char c '=ruimoku-v6)) (format "r%04X" ret) ) ((setq ret (encode-char c '=jef-china3)) (format "j%04X" ret) ) (t (error "Unknown character %c" c) ;; (format "m%08X" (char-id c)) ))) name ""))) (defun concord-kanbun-add-sentence-entry (entry) (let (entry-id me-cobj) (unless (setq me-cobj (concord-decode-object '=name entry 'entry@zh-classical)) (setq entry-id (intern (concord-kanbun-encode-name-as-id entry))) (setq me-cobj (concord-make-object 'entry@zh-classical entry-id)) (concord-object-put me-cobj '=name entry)) me-cobj)) (defun concord-kanbun-add-morpheme-entry (entry) (let ((len (length entry)) (i 0) entry-id me-cobj chr ret ucs) (unless (setq me-cobj (concord-decode-object '=name entry 'entry@zh-classical)) (setq entry-id (intern (concord-kanbun-encode-name-as-id entry))) (setq me-cobj (concord-make-object 'entry@zh-classical entry-id)) (concord-object-put me-cobj '=name entry) (if (= len 1) (concord-object-put me-cobj 'character (list (aref entry 0)))) (while (< i len) (setq chr (aref entry i)) (if (setq ucs (or (encode-char chr '=ucs@JP) (char-ucs chr))) (setq chr (decode-char '=ucs ucs))) (setq ret (get-char-attribute chr '*instance@morpheme-entry/zh-classical)) (unless (member me-cobj ret) (put-char-attribute chr '*instance@morpheme-entry/zh-classical (cons me-cobj ret))) (setq i (1+ i))) (save-char-attribute-table '*instance@morpheme-entry/zh-classical)) me-cobj)) ;; (save-char-attribute-table '*instance@morpheme-entry/zh-classical) (defun concord-kanbun-add-word-class (word-superclass word-class word-subclass1 word-subclass2 word-subclass3) (let (wc-name wc-cobj wc-name-id) (when (and word-superclass (not (string= word-superclass "*"))) (setq wc-name word-superclass) (when (and word-class (not (string= word-class "*"))) (setq wc-name (format "%s,%s" wc-name word-class)) (when (and word-subclass1 (not (string= word-subclass1 "*"))) (setq wc-name (format "%s,%s" wc-name word-subclass1)) (when (and word-subclass2 (not (string= word-subclass2 "*"))) (setq wc-name (format "%s,%s" wc-name word-subclass2)) (when (and word-subclass3 (not (string= word-subclass3 "*"))) (setq wc-name (format "%s,%s" wc-name word-subclass3))))))) (unless (setq wc-cobj (concord-decode-object '=name wc-name 'word-class@zh-classical)) (setq wc-name-id (intern (concord-kanbun-encode-name-as-id wc-name))) (setq wc-cobj (concord-make-object 'word-class@zh-classical wc-name-id)) (concord-object-put wc-cobj '=name wc-name)) wc-cobj)) (defun concord-kanbun-add-morpheme (entry word-superclass word-class word-subclass1 word-subclass2 word-subclass3 canonical-form ja-form ja-kana ja-conj-type comment) (let* (entry-cobj canonical-entry-cobj wc-cobj wc-name mm-name mj-name mjc-name mjc-id mjc-cobj) (when entry (setq wc-name (if (or (null word-subclass3) (string-equal word-subclass3 "*")) (format "%s,%s,%s,%s" word-superclass word-class word-subclass1 word-subclass2) (format "%s,%s,%s,%s,%s" word-superclass word-class word-subclass1 word-subclass2 word-subclass3))) (setq mm-name (format "%s (%s) [%s]" entry canonical-form wc-name)) (setq mj-name (format "%s (%s (%s),%s)" mm-name ja-form ja-kana ja-conj-type)) (setq mjc-name (if comment (format "%s\t; %s" mj-name comment) mj-name)) (unless (setq mjc-cobj (concord-decode-object '=name mjc-name 'morpheme@zh-classical)) (setq mjc-id (intern (concord-kanbun-encode-name-as-id mjc-name))) (setq mjc-cobj (concord-make-object 'morpheme@zh-classical mjc-id)) (concord-object-put mjc-cobj '=name mjc-name) (when (setq entry-cobj (concord-kanbun-add-morpheme-entry entry)) (concord-object-put mjc-cobj '->entry@morpheme (list entry-cobj))) (unless (string= entry canonical-form) (when (setq canonical-entry-cobj (concord-kanbun-add-morpheme-entry canonical-form)) (concord-object-put mjc-cobj '->entry@morpheme/canonical (list canonical-entry-cobj)))) (when (setq wc-cobj (concord-kanbun-add-word-class word-superclass word-class word-subclass1 word-subclass2 word-subclass3)) (concord-object-put mjc-cobj '->word-class (list wc-cobj))) (concord-object-put mjc-cobj 'ja-form ja-form) (concord-object-put mjc-cobj 'ja-kana ja-kana) (concord-object-put mjc-cobj 'ja-conjugation-type ja-conj-type)) mjc-cobj))) (defun concord-kanbun-parse-corpus-line (string) (let* ((ret (split-string string "\t*[;;]\\s *")) entry features comment) (if (and (setq comment (nth 1 ret)) (string-match "[ \t]+$" comment)) (setq comment (substring comment 0 (match-beginning 0)))) (setq ret (split-string (car ret) "\t")) (setq entry (car ret) features (split-string (nth 1 ret) ",")) (list entry (car features)(nth 1 features) (nth 2 features)(nth 3 features)(nth 4 features) (nth 6 features) (nth 7 features)(nth 8 features)(nth 9 features) comment))) (defun concord-kanbun-add-corpus-line (string) (apply #'concord-kanbun-add-morpheme (concord-kanbun-parse-corpus-line string))) (defun concord-kanbun-read-sentence (sentence-number &optional source-name) (unless source-name (setq source-name (file-name-nondirectory buffer-file-name))) (let ((beg (point)) end send ret sentence dest sentence-name sentence-id-name sentence-id sentence-cobj sentence-entry-cobj) (prog1 (save-excursion (when (search-forward "\nEOS\n" nil t) (setq end (match-beginning 0) send (match-end 0)) (goto-char beg) (while (search-forward "\t" end t) (setq ret (concord-kanbun-parse-corpus-line (buffer-substring (point-at-bol)(point-at-eol)))) (setq sentence (concat sentence (car ret))) (setq sentence-name (concat sentence-name (if sentence-name " ") (format "%s[%s,%s,%s]" (car ret) (nth 2 ret) (nth 3 ret)(nth 4 ret)))) (setq dest (cons (apply #'concord-kanbun-add-morpheme ret) dest)) (goto-char (point-at-eol))) ;; (setq sentence-id ;; (intern ;; (concord-kanbun-encode-name-as-id ;; (format "%s/%d" source-name sentence-number)))) (setq sentence-id-name (format "%s/%d" source-name sentence-number)) (setq sentence-id (intern sentence-id-name)) (unless (setq sentence-cobj (concord-decode-object '=id sentence-id 'sentence@zh-classical)) (setq sentence-cobj (concord-make-object 'sentence@zh-classical sentence-id)) (concord-object-put sentence-cobj '=name (format "%s(%s)" sentence-name sentence-id-name)) ;; (concord-object-put ;; sentence-cobj '=name (format "%s(%s)" ;; sentence sentence-id-name)) ) (concord-object-put sentence-cobj 'source/file-name source-name) (concord-object-put sentence-cobj 'source/sentence-number sentence-number) (concord-object-put sentence-cobj '->morphemes (nreverse dest)) (when (setq sentence-entry-cobj (concord-kanbun-add-sentence-entry sentence)) (concord-object-put sentence-cobj '->entry@sentence (list sentence-entry-cobj))) sentence-cobj)) (if send (goto-char send))))) (defun concord-kanbun-read-buffer (&optional source-name) (interactive) (save-excursion (goto-char (point-min)) (unless source-name (setq source-name (file-name-nondirectory buffer-file-name))) (let ((i 1)) (while (concord-kanbun-read-sentence i source-name) (message (format "%s: sentence #%d is stored." source-name i)) (setq i (1+ i)))))) (defun concord-kanbun-batch-read-file () (set-terminal-coding-system 'utf-8-jp-er) (let ((file (pop command-line-args-left)) (coding-system-for-read 'utf-8-jp-er) (file-name-coding-system 'utf-8-jp-er)) (with-temp-buffer (insert-file-contents file) (concord-kanbun-read-buffer (file-name-nondirectory file)))))