;;; concord-kanbun-dump.el
;;
;; Contents of the file added by commit
;; ce4f80049bfe2a675d83f3dadbdb48e7a609d8e1 ("New file."),
;; MORIOKA Tomohiko, Thu, 12 Sep 2013 00:42:04 +0900.

(defun concord-kanbun-corpus-insert-morpheme (morpheme)
  "Insert one line describing MORPHEME at point.

The inserted line has the form

  ENTRY<TAB>C1,C2,C3,C4,*,*,CANONICAL,JA-FORM,JA-KANA,JA-CONJ-TYPE

followed by a newline, where:

- ENTRY is the `=name' of the first object in MORPHEME's
  `->entry@morpheme' feature.
- C1..C4 are the first four comma-separated parts of the `=name'
  of the first object in MORPHEME's `->word-class' feature.
- CANONICAL is the `=name' of the first object in MORPHEME's
  `->entry@morpheme/canonical' feature; when that feature is
  absent, ENTRY is used instead.
- JA-FORM, JA-KANA and JA-CONJ-TYPE are MORPHEME's `ja-form',
  `ja-kana' and `ja-conjugation-type' features.

Every field that has no value is written as \"*\"."
  (let* ((entry-objs (concord-object-get morpheme '->entry@morpheme))
         (class-objs (concord-object-get morpheme '->word-class))
         (canonical-objs (concord-object-get
                          morpheme '->entry@morpheme/canonical))
         (ja-form (concord-object-get morpheme 'ja-form))
         (ja-kana (concord-object-get morpheme 'ja-kana))
         (ja-conj-type (concord-object-get morpheme 'ja-conjugation-type))
         (entry (and entry-objs
                     (concord-object-get (car entry-objs) '=name)))
         (class-name (and class-objs
                          (concord-object-get (car class-objs) '=name)))
         ;; A word class without a `=name' contributes no parts rather
         ;; than making `split-string' signal an error on nil.
         (word-class (and class-name
                          (split-string class-name ",")))
         ;; Fall back to ENTRY only when there is no canonical entry
         ;; object at all.
         (canonical-form (if canonical-objs
                             (concord-object-get (car canonical-objs) '=name)
                           entry)))
    (insert
     (format "%s\t%s,%s,%s,%s,*,*,%s,%s,%s,%s\n"
             (or entry "*")
             (or (car word-class) "*")
             (or (nth 1 word-class) "*")
             (or (nth 2 word-class) "*")
             (or (nth 3 word-class) "*")
             (or canonical-form "*")
             ;; Use the same "*" placeholder as the other fields so a
             ;; missing feature is not written out as the text "nil".
             (or ja-form "*")
             (or ja-kana "*")
             (or ja-conj-type "*")))))

(defun concord-kanbun-dump-file (source dest-dir)
  "Dump the morphemes of SOURCE's sentences into a file under DEST-DIR.

Sentences are looked up with `concord-decode-object' as
`sentence@zh-classical' objects whose `=id' is the symbol named
\"SOURCE/N\", for N = 1, 2, ...; the scan stops at the first N for
which the lookup returns nil.

For every sentence whose `->morphemes' feature is non-nil, one line
per morpheme is produced by `concord-kanbun-corpus-insert-morpheme',
followed by a line containing \"EOS\".  Sentences without morphemes
produce no output.

The result is written to the file named SOURCE inside DEST-DIR,
with `coding-system-for-write' bound to `utf-8-jp-er'."
  (with-temp-buffer
    (let ((coding-system-for-write 'utf-8-jp-er)
          (i 1)
          s-obj morphemes)
      (while (setq s-obj (concord-decode-object
                          '=id (intern (format "%s/%s" source i))
                          'sentence@zh-classical))
        (when (setq morphemes (concord-object-get s-obj '->morphemes))
          (dolist (morpheme morphemes)
            (concord-kanbun-corpus-insert-morpheme morpheme))
          (insert "EOS\n"))
        (setq i (1+ i)))
      (write-region (point-min) (point-max)
                    (expand-file-name source dest-dir)))))