Sync up with r21-4-15-chise-0_22-1.
[chise/xemacs-chise.git.1] / lisp / utf-2000 / char-db-util.el
index 84b6a98..7e9a87d 100644 (file)
@@ -1,9 +1,9 @@
-;;; char-db-util.el --- Character Database utility
+;;; char-db-util.el --- Character Database utility -*- coding: utf-8-er; -*-
 
-;; Copyright (C) 1998,1999,2000,2001,2002,2003,2004 MORIOKA Tomohiko.
+;; Copyright (C) 1998,1999,2000,2001,2002,2003,2004,2005 MORIOKA Tomohiko.
 
 ;; Author: MORIOKA Tomohiko <tomo@kanji.zinbun.kyoto-u.ac.jp>
-;; Keywords: CHISE, Character Database, ISO/IEC 10646, Unicode, UCS-4, MULE.
+;; Keywords: CHISE, Character Database, ISO/IEC 10646, UCS, Unicode, MULE.
 
 ;; This file is part of XEmacs CHISE.
 
       (setq i (1+ i)))
     v))
 
+(defun ideographic-radical (number)   ; accessor: return element NUMBER of `ideographic-radicals'
+  (aref ideographic-radicals number))  ; index is used as-is (no `1-' adjustment, unlike `shuowen-radical')
+
+(defconst shuowen-radicals            ; vector of character literals, read 1-based by `shuowen-radical'
+  [?一 ?上 ?示 ?三 ?王 ?玉 ?玨 ?气 ?士 ?丨 ?屮 ?艸 ?茻])  ; 13 entries are defined here
+
+(defun shuowen-radical (number)       ; return the NUMBERth entry of `shuowen-radicals', counting from 1
+  (aref shuowen-radicals (1- number))) ; `1-' maps 1-based NUMBER to a 0-based index; NUMBER outside 1..13 makes `aref' signal a range error
+
 (defvar char-db-file-coding-system 'utf-8-mcs-er)  ; NOTE(review): not referenced in the visible hunks; presumably the coding system for char-db files -- confirm against callers
 
 (defvar char-db-feature-domains       ; domain symbols; each one is combined into an `ideographic-radical@DOMAIN' feature key
   '(ucs daikanwa cns gt jis jis/alt jis/a jis/b
-       jis-x0213 cdp shinjigen misc unknown))
+       jis-x0212 jis-x0213 cdp shinjigen misc unknown))  ; this revision inserts jis-x0212 before jis-x0213
 
-(defvar char-db-ignored-attributes nil)
+(defvar char-db-ignored-attributes '(ideographic-products))  ; default changes from nil to a one-symbol list; NOTE(review): consumers are not visible in this excerpt
 
 (defun char-attribute-name< (ka kb)
   (cond
     greek-iso8859-7
     thai-tis620
     =jis-x0208
-    japanese-jisx0208
+    =jis-x0208@1978
+    =jis-x0208@1983
     japanese-jisx0212
-    japanese-jisx0208-1978
     chinese-gb2312
     chinese-cns11643-1
     chinese-cns11643-2
     chinese-cns11643-5
     chinese-cns11643-6
     chinese-cns11643-7
-    =jis-x0208-1990
+    =jis-x0208@1990
     =jis-x0213-1-2000
     =jis-x0213-2-2000
     korean-ksc5601
     ideograph-hanziku-10
     ideograph-hanziku-11
     ideograph-hanziku-12
+    =gt-k
+    =ucs@iso
+    =ucs@unicode
     =big5
     =big5-eten
-    =gt-k
+    =jis-x0208@1997
+    =zinbun-oracle
     =jef-china3))
 
 (defun char-db-make-char-spec (char)
                    (if (and (or (charset-iso-final-char ccs)
                                 (memq ccs
                                       '(=daikanwa
-                                        =daikanwa-rev2
+                                        =daikanwa@rev2
                                         ;; =gt-k
                                         )))
+                            (setq ccs (charset-name ccs))
+                            (null (assq ccs char-spec))
                             (setq ret (encode-char char ccs 'defined-only)))
                        (setq char-spec (cons (cons ccs ret) char-spec))))
                  (if (null char-spec)
             (insert (format "%s%s\t%d ; %c%s"
                             separator
                             name value
-                            (aref ideographic-radicals value)
+                            (ideographic-radical value)
                             line-breaking))
             (setq separator ""))
             (t
   (insert
    (format
     (cond ((memq name '(=daikanwa
-                       =daikanwa-rev1 =daikanwa-rev2
-                       =gt =gt-k =cbeta))
+                       =daikanwa@rev1 =daikanwa@rev2
+                       =gt =gt-k =cbeta =zinbun-oracle))
           "(%-18s . %05d)\t; %c")
          ((eq name 'mojikyo)
           "(%-18s . %06d)\t; %c")
                        line-breaking))
        (setq attributes (delq name attributes))
        ))
-    ;; (dolist (name '(=>ucs-gb =>ucs-cns =>ucs-jis =>ucs-ks =>ucs-big5))
-    ;;   (when (and (memq name attributes)
-    ;;              (setq value (get-char-attribute char name)))
-    ;;     (insert (format "(%-18s . #x%04X)\t; %c%s"
-    ;;                     (intern
-    ;;                      (concat "=>ucs@"
-    ;;                              (substring (symbol-name name) 6)))
-    ;;                     value
-    ;;                     (decode-char (intern
-    ;;                                   (concat "=ucs@"
-    ;;                                           (substring
-    ;;                                            (symbol-name name) 6)))
-    ;;                                  value)
-    ;;                     line-breaking))
-    ;;     (setq attributes (delq name attributes))))
-    ;; (when (and (memq '->ucs attributes)
-    ;;            (setq value (get-char-attribute char '->ucs)))
-    ;;   (insert (format (if char-db-convert-obsolete-format
-    ;;                       "(=>ucs\t\t. #x%04X)\t; %c%s"
-    ;;                     "(->ucs\t\t. #x%04X)\t; %c%s")
-    ;;                   value (decode-char '=ucs value)
-    ;;                   line-breaking))
-    ;;   (setq attributes (delq '->ucs attributes))
-    ;;   )
     (dolist (name '(=>daikanwa))
       (when (and (memq name attributes)
                 (setq value (get-char-attribute char name)))
       (setq radical value)
       (insert (format "(ideographic-radical . %S)\t; %c%s"
                      radical
-                     (aref ideographic-radicals radical)
+                     (ideographic-radical radical)
                      line-breaking))
       (setq attributes (delq 'ideographic-radical attributes))
       )
+    (when (and (memq 'shuowen-radical attributes)
+              (setq value (get-char-attribute char 'shuowen-radical)))
+      (insert (format "(shuowen-radical\t. %S)\t; %c%s"
+                     value
+                     (shuowen-radical value)
+                     line-breaking))
+      (setq attributes (delq 'shuowen-radical attributes))
+      )
     (let (key)
-      (dolist (domain char-db-feature-domains)
+      (dolist (domain
+              (append
+               char-db-feature-domains
+               (let (dest domain)
+                 (dolist (feature (char-attribute-list))
+                   (setq feature (symbol-name feature))
+                   (when (string-match
+                          "\\(radical\\|strokes\\)@\\([^@*]+\\)\\(\\*\\|$\\)"
+                          feature)
+                     (setq domain (intern (match-string 2 feature)))
+                    (unless (memq domain dest)
+                      (setq dest (cons domain dest)))))
+                 (sort dest #'string<))))
        (setq key (intern (format "%s@%s" 'ideographic-radical domain)))
        (when (and (memq key attributes)
                   (setq value (get-char-attribute char key)))
          (insert (format "(%s . %S)\t; %c%s"
                          key
                          radical
-                         (aref ideographic-radicals radical)
+                         (ideographic-radical radical)
                          line-breaking))
          (setq attributes (delq key attributes))
          )
       (unless (eq value radical)
        (insert (format "(kangxi-radical\t . %S)\t; %c%s"
                        value
-                       (aref ideographic-radicals value)
+                       (ideographic-radical value)
                        line-breaking))
        (or radical
            (setq radical value)))
       (unless (eq value radical)
        (insert (format "(japanese-radical\t . %S)\t; %c%s"
                        value
-                       (aref ideographic-radicals value)
+                       (ideographic-radical value)
                        line-breaking))
        (or radical
            (setq radical value)))
               (setq value (get-char-attribute char 'cns-radical)))
       (insert (format "(cns-radical\t . %S)\t; %c%s"
                      value
-                     (aref ideographic-radicals value)
+                     (ideographic-radical value)
                      line-breaking))
       (setq attributes (delq 'cns-radical attributes))
       )
       (unless (eq value radical)
        (insert (format "(shinjigen-1-radical . %S)\t; %c%s"
                        value
-                       (aref ideographic-radicals value)
+                       (ideographic-radical value)
                        line-breaking))
        (or radical
            (setq radical value)))
                         ->denotational <-subsumptive ->ucs-unified
                         ->ideographic-component-forms))
        (setq attributes (delq ignored attributes))))
-    ;; (setq rest ccs-attributes)
-    ;; (while (and rest
-    ;;             (progn
-    ;;               (setq value (get-char-attribute char (car rest)))
-    ;;               (if value
-    ;;                   (if (>= (length (symbol-name (car rest))) 19)
-    ;;                       (progn
-    ;;                         (setq has-long-ccs-name t)
-    ;;                         nil)
-    ;;                     t)
-    ;;                 t)))
-    ;;   (setq rest (cdr rest)))
     (while attributes
       (setq name (car attributes))
       (if (setq value (get-char-attribute char name))
                          (prog1
                              (setq value (get-char-attribute char name))
                            (setq dest-ccss (cons name dest-ccss))))
-                    (char-db-insert-ccs-feature name value line-breaking)
-                     ;; (insert
-                     ;;  (format
-                     ;;   (cond ((memq name '(=daikanwa
-                     ;;                       =daikanwa-rev1 =daikanwa-rev2
-                     ;;                       =gt =gt-k =cbeta))
-                     ;;          (if has-long-ccs-name
-                     ;;              "(%-26s . %05d)\t; %c%s"
-                     ;;            "(%-18s . %05d)\t; %c%s"))
-                     ;;         ((eq name 'mojikyo)
-                     ;;          (if has-long-ccs-name
-                     ;;              "(%-26s . %06d)\t; %c%s"
-                     ;;            "(%-18s . %06d)\t; %c%s"))
-                     ;;         ((>= (charset-dimension name) 2)
-                     ;;          (if has-long-ccs-name
-                     ;;              "(%-26s . #x%04X)\t; %c%s"
-                     ;;            "(%-18s . #x%04X)\t; %c%s"))
-                     ;;         (t
-                     ;;          (if has-long-ccs-name
-                     ;;              "(%-26s . #x%02X)\t; %c%s"
-                     ;;            "(%-18s . #x%02X)\t; %c%s")))
-                     ;;   name
-                     ;;   (if (= (charset-iso-graphic-plane name) 1)
-                     ;;       (logior value
-                     ;;               (cond ((= (charset-dimension name) 1)
-                     ;;                      #x80)
-                     ;;                     ((= (charset-dimension name) 2)
-                     ;;                      #x8080)
-                     ;;                     ((= (charset-dimension name) 3)
-                     ;;                      #x808080)
-                     ;;                     (t 0)))
-                     ;;     value)
-                     ;;   (char-db-decode-isolated-char name value)
-                     ;;   line-breaking))
-                    )
+                    (char-db-insert-ccs-feature name value line-breaking))
                 )
                ((string-match "^=>ucs@" (symbol-name name))
                 (insert (format "(%-18s . #x%04X)\t; %c%s"
                             (intern (format "%s*sources" name))))
                      (not (string-match "\\*sources$" (symbol-name name)))
                      (or (eq name '<-identical)
+                         (string-match "^->halfwidth" (symbol-name name))
+                         (and
+                          (string-match "^->fullwidth" (symbol-name name))
+                          (not
+                           (and (consp value)
+                                (characterp (car value))
+                                (encode-char
+                                 (car value) '=ucs 'defined-only))))
                          (string-match "^->simplified" (symbol-name name))
-                          ;; (string-match "^<-same" (symbol-name name))
-                         (string-match "^->same" (symbol-name name))
-                          ;; (string-match "^->ideographic-same" (symbol-name name))
                          (string-match "^->vulgar" (symbol-name name))
                          (string-match "^->wrong" (symbol-name name))
+                         (string-match "^->same" (symbol-name name))
+                         (string-match "^->formed" (symbol-name name))
                          (string-match "^->original" (symbol-name name))
                          (string-match "^->ancient" (symbol-name name))
+                         (string-match "^->Oracle-Bones" (symbol-name name))
                          ))
                 )
                ((or (eq name 'ideographic-structure)
+                    (eq name 'ideographic-combination)
                     (eq name 'ideographic-)
                     (string-match "^\\(->\\|<-\\)" (symbol-name name)))
                 (insert (format "(%-18s%s " name line-breaking))
                              (setq required-features
                                    (union required-features
                                           '(=jis-x0208
-                                            =jis-x0208-1990
+                                            =jis-x0208@1990
                                             =jis-x0213-1-2000
                                             =jis-x0213-2-2000
                                             =jis-x0212
-                                            =jis-x0208-1983
-                                            =jis-x0208-1978))))
+                                            =jis-x0208@1983
+                                            =jis-x0208@1978))))
                             ((eq source 'CN)
                              (setq required-features
                                    (union required-features
                                 (setq required-features
                                       (union required-features
                                              '(=jis-x0208
-                                               =jis-x0208-1990
+                                               =jis-x0208@1990
                                                =jis-x0213-1-2000
                                                =jis-x0213-2-2000
                                                =jis-x0212
-                                               =jis-x0208-1983
-                                               =jis-x0208-1978))))
+                                               =jis-x0208@1983
+                                               =jis-x0208@1978))))
                                ((string-match "@CN" (symbol-name name))
                                 (setq required-features
                                       (union required-features
                                 line-breaking)))
                ))
       (setq attributes (cdr attributes)))
-    ;; (while ccs-attributes
-    ;;   (setq name (charset-name (car ccs-attributes)))
-    ;;   (if (and (not (memq name dest-ccss))
-    ;;            (prog1
-    ;;                (setq value (get-char-attribute char name))
-    ;;              (setq dest-ccss (cons name dest-ccss))))
-    ;;       (insert
-    ;;        (format
-    ;;         (cond ((memq name '(=daikanwa
-    ;;                             =daikanwa-rev1 =daikanwa-rev2
-    ;;                             =gt =gt-k =cbeta))
-    ;;                (if has-long-ccs-name
-    ;;                    "(%-26s . %05d)\t; %c%s"
-    ;;                  "(%-18s . %05d)\t; %c%s"))
-    ;;               ((eq name 'mojikyo)
-    ;;                (if has-long-ccs-name
-    ;;                    "(%-26s . %06d)\t; %c%s"
-    ;;                  "(%-18s . %06d)\t; %c%s"))
-    ;;               ((>= (charset-dimension name) 2)
-    ;;                (if has-long-ccs-name
-    ;;                    "(%-26s . #x%04X)\t; %c%s"
-    ;;                  "(%-18s . #x%04X)\t; %c%s"))
-    ;;               (t
-    ;;                (if has-long-ccs-name
-    ;;                    "(%-26s . #x%02X)\t; %c%s"
-    ;;                  "(%-18s . #x%02X)\t; %c%s")))
-    ;;         name
-    ;;         (if (= (charset-iso-graphic-plane name) 1)
-    ;;             (logior value
-    ;;                     (cond ((= (charset-dimension name) 1)
-    ;;                            #x80)
-    ;;                           ((= (charset-dimension name) 2)
-    ;;                            #x8080)
-    ;;                           ((= (charset-dimension name) 3)
-    ;;                            #x808080)
-    ;;                           (t 0)))
-    ;;           value)
-    ;;         (char-db-decode-isolated-char name value)
-    ;;         line-breaking)))
-    ;;   (setq ccs-attributes (cdr ccs-attributes)))
     (insert ")")))
 
 (defun insert-char-data (char &optional readable
                                           no-ucs-unified
                                           script excluded-script)
   (insert-char-data char printable)
-  (let ((variants (or (char-variants char)
-                     (let ((ucs (get-char-attribute char '->ucs)))
-                       (if ucs
-                           (delete char (char-variants (int-char ucs)))))))
-       variant vs)
+  (let ((variants (char-variants char))
+       rest
+       variant vs ret)
     (setq variants (sort variants #'<))
-    (while variants
-      (setq variant (car variants))
-      (if (and (or (null script)
-                  (null (setq vs (get-char-attribute variant 'script)))
-                  (memq script vs))
-              (or (null excluded-script)
-                  (null (setq vs (get-char-attribute variant 'script)))
-                  (not (memq excluded-script vs))))
-         (or (and no-ucs-unified (get-char-attribute variant '=ucs))
-             (insert-char-data variant printable)))
-      (setq variants (cdr variants))
-      )))
+    (setq rest variants)
+    (setq variants (cons char variants))
+    (while rest
+      (setq variant (car rest))
+      (unless (get-char-attribute variant '<-subsumptive)
+       (if (and (or (null script)
+                    (null (setq vs (get-char-attribute variant 'script)))
+                    (memq script vs))
+                (or (null excluded-script)
+                    (null (setq vs (get-char-attribute variant 'script)))
+                    (not (memq excluded-script vs))))
+           (unless (and no-ucs-unified (get-char-attribute variant '=ucs))
+             (insert-char-data variant printable)
+             (if (setq ret (char-variants variant))
+                 (while ret
+                   (or (memq (car ret) variants)
+                        ;; (get-char-attribute (car ret) '<-subsumptive)
+                       (setq rest (nconc rest (list (car ret)))))
+                   (setq ret (cdr ret)))))))
+      (setq rest (cdr rest)))))
 
 (defun insert-char-range-data (min max &optional script excluded-script)
   (let ((code min)