New file.
authorMORIOKA Tomohiko <tomo.git@chise.org>
Wed, 4 Sep 2013 13:09:35 +0000 (22:09 +0900)
committerMORIOKA Tomohiko <tomo.git@chise.org>
Wed, 4 Sep 2013 13:09:35 +0000 (22:09 +0900)
est-kanbun-dic.el [new file with mode: 0644]

diff --git a/est-kanbun-dic.el b/est-kanbun-dic.el
new file mode 100644 (file)
index 0000000..1cfcd70
--- /dev/null
@@ -0,0 +1,211 @@
+(concord-assign-genre 'morpheme-entry@zh-classical "/usr/local/var/kanbun/db")
+(concord-assign-genre 'word-class@zh-classical "/usr/local/var/kanbun/db")
+(concord-assign-genre 'morpheme@zh-classical "/usr/local/var/kanbun/db")
+(concord-assign-genre 'sentence@zh-classical "/usr/local/var/kanbun/db")
+(concord-assign-genre 'sentence-entry@zh-classical "/usr/local/var/kanbun/db")
+(mount-char-attribute-table '*instance@morpheme-entry/zh-classical)
+
+(defun concord-kanbun-encode-name-as-id (name)
+  (let (ucs ret)
+    (mapconcat (lambda (c)
+                (cond
+                 ((eq c ?\ )
+                  "_"
+                  )
+                 ((or (and (<= ?A c)(<= c ?Z))
+                      (and (<= ?a c)(< c ?u))
+                      (and (< ?u c)(< c ?z))
+                      (and (<= ?0 c)(<= c ?9)))
+                  (char-to-string c)
+                  )
+                 ((setq ucs (or (encode-char c '=ucs@JP)
+                                (char-ucs c)))
+                  (format "u%04X" ucs)
+                  )
+                 ((setq ret (encode-char c '=ruimoku-v6))
+                  (format "r%04X" ret)
+                  )
+                 ((setq ret (encode-char c '=jef-china3))
+                  (format "j%04X" ret)
+                  )
+                 (t
+                  (error "Unknown character %c" c)
+                   ;; (format "m%08X" (char-id c))
+                  )))
+              name
+              "")))
+
+(defun concord-kanbun-add-sentence-entry (entry)
+  (let (entry-id me-cobj)
+    (unless (setq me-cobj (concord-decode-object
+                          '=name entry
+                          'sentence-entry@zh-classical))
+      (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
+      (setq me-cobj (concord-make-object
+                    'sentence-entry@zh-classical entry-id))
+      (concord-object-put me-cobj '=name entry))
+    me-cobj))
+
+(defun concord-kanbun-add-morpheme-entry (entry)
+  (let ((len (length entry))
+       (i 0)
+       entry-id me-cobj chr ret ucs)
+    (unless (setq me-cobj (concord-decode-object
+                          '=name entry
+                          'morpheme-entry@zh-classical))
+      (setq entry-id (intern (concord-kanbun-encode-name-as-id entry)))
+      (setq me-cobj (concord-make-object
+                    'morpheme-entry@zh-classical entry-id))
+      (concord-object-put me-cobj '=name entry)
+      (if (= len 1)
+         (concord-object-put me-cobj 'character (list (aref entry 0))))
+      (while (< i len)
+       (setq chr (aref entry i))
+       (if (setq ucs (or (encode-char chr '=ucs@JP)
+                         (char-ucs chr)))
+           (setq chr (decode-char '=ucs ucs)))
+       (setq ret (get-char-attribute
+                  chr '*instance@morpheme-entry/zh-classical))
+       (unless (member me-cobj ret)
+         (put-char-attribute chr '*instance@morpheme-entry/zh-classical
+                             (cons me-cobj ret)))
+       (setq i (1+ i)))
+      (save-char-attribute-table '*instance@morpheme-entry/zh-classical))
+    me-cobj))
+
+;; (save-char-attribute-table '*instance@morpheme-entry/zh-classical)
+
+(defun concord-kanbun-add-word-class (word-superclass
+                                     word-class
+                                     word-subclass1 word-subclass2)
+  (let (wc-name wc-cobj wc-name-id)
+    (when (and word-superclass
+              (not (string= word-superclass "*")))
+      (setq wc-name word-superclass)
+      (when (and word-class
+                (not (string= word-class "*")))
+       (setq wc-name
+             (format "%s,%s" wc-name word-class))
+       (when (and word-subclass1
+                  (not (string= word-subclass1 "*")))
+         (setq wc-name
+               (format "%s,%s" wc-name word-subclass1))
+         (when (and word-subclass2
+                    (not (string= word-subclass2 "*")))
+           (setq wc-name
+                 (format "%s,%s" wc-name word-subclass2))))))
+    (unless (setq wc-cobj (concord-decode-object
+                          '=name wc-name
+                          'word-class@zh-classical))
+      (setq wc-name-id (intern (concord-kanbun-encode-name-as-id wc-name)))
+      (setq wc-cobj (concord-make-object
+                    'word-class@zh-classical wc-name-id))
+      (concord-object-put wc-cobj '=name wc-name))
+    wc-cobj))
+
+
+(defun concord-kanbun-add-morpheme (entry word-superclass word-class
+                                         word-subclass1 word-subclass2
+                                         canonical-form
+                                         ja-form ja-kana ja-conj-type)
+  (let* (entry-cobj
+        wc-cobj wc-name
+        mm-name
+        mj-name mj-id mj-cobj)
+    (when entry
+      (setq wc-name (format "%s,%s,%s,%s"
+                           word-superclass word-class
+                           word-subclass1 word-subclass2))
+      (setq mm-name (format "%s (%s) [%s]"
+                           entry canonical-form wc-name))
+      (setq mj-name (format "%s (%s (%s),%s)"
+                           mm-name ja-form ja-kana ja-conj-type))
+      (unless (setq mj-cobj (concord-decode-object
+                            '=name mj-name 'morpheme@zh-classical))
+       (setq mj-id (intern (concord-kanbun-encode-name-as-id mj-name)))
+       (setq mj-cobj (concord-make-object 'morpheme@zh-classical mj-id))
+       (concord-object-put mj-cobj '=name mj-name)
+       (when (setq entry-cobj (concord-kanbun-add-morpheme-entry entry))
+         (concord-object-put mj-cobj '->entry (list entry-cobj)))
+       (when (setq wc-cobj (concord-kanbun-add-word-class
+                            word-superclass word-class
+                            word-subclass1 word-subclass2))
+         (concord-object-put mj-cobj '->word-class (list wc-cobj)))
+       (concord-object-put mj-cobj 'ja-form ja-form)
+       (concord-object-put mj-cobj 'ja-kana ja-kana)
+       (concord-object-put mj-cobj 'ja-conjugation-type ja-conj-type))
+      mj-cobj)))
+
+(defun concord-kanbun-parse-corpus-line (string)
+  (let* ((ret (split-string string "\t"))
+        (ret2 (split-string (nth 1 ret) ",")))
+    (list (car ret)
+         (car ret2)(nth 1 ret2)(nth 2 ret2)(nth 3 ret2)
+         (nth 6 ret2)
+         (nth 7 ret2)(nth 8 ret2)(nth 9 ret2))))
+
+(defun concord-kanbun-add-corpus-line (string)
+  (apply #'concord-kanbun-add-morpheme
+        (concord-kanbun-parse-corpus-line string)))
+
+(defun concord-kanbun-read-sentence (sentence-number &optional source-name)
+  (unless source-name
+    (setq source-name (file-name-nondirectory buffer-file-name)))
+  (let ((beg (point))
+       end send
+       ret
+       sentence dest
+       sentence-id-name sentence-id sentence-cobj
+       sentence-entry-cobj)
+    (prog1
+       (save-excursion
+         (when (search-forward "\nEOS\n" nil t)
+           (setq end (match-beginning 0)
+                 send (match-end 0))
+           (goto-char beg)
+           (while (search-forward "\t" end t)
+             (setq ret (concord-kanbun-parse-corpus-line
+                        (buffer-substring (point-at-bol)(point-at-eol))))
+             (setq sentence (concat sentence (car ret)))
+             (setq dest
+                   (cons (apply #'concord-kanbun-add-morpheme ret)
+                         dest)))
+            ;; (setq sentence-id
+            ;;       (intern
+            ;;        (concord-kanbun-encode-name-as-id
+            ;;         (format "%s/%d" source-name sentence-number))))
+           (setq sentence-id-name
+                 (format "%s/%d" source-name sentence-number))
+           (setq sentence-id (intern sentence-id-name))
+           (unless (setq sentence-cobj
+                         (concord-decode-object
+                          '=id sentence-id 'sentence@zh-classical))
+             (setq sentence-cobj
+                   (concord-make-object
+                    'sentence@zh-classical sentence-id))
+             (concord-object-put
+              sentence-cobj '=name (format "%s(%s)"
+                                           sentence sentence-id-name)))
+           (concord-object-put
+            sentence-cobj 'source/file-name source-name)
+           (concord-object-put
+            sentence-cobj 'source/sentence-number sentence-number)
+           (concord-object-put
+            sentence-cobj '->morphemes (nreverse dest))
+           (when (setq sentence-entry-cobj
+                       (concord-kanbun-add-sentence-entry sentence))
+             (concord-object-put
+              sentence-cobj '->entry (list sentence-entry-cobj)))
+           sentence-cobj))
+      (if send
+         (goto-char send)))))
+
+(defun concord-kanbun-read-buffer (&optional source-name)
+  (interactive)
+  (save-excursion
+    (goto-char (point-min))
+    (unless source-name
+      (setq source-name (file-name-nondirectory buffer-file-name)))
+    (let ((i 1))
+      (while (concord-kanbun-read-sentence i source-name)
+       (setq i (1+ i))))))