;;; char-db-util.el --- Character Database utility
-;; Copyright (C) 1998,1999,2000,2001,2002,2003,2004 MORIOKA Tomohiko.
+;; Copyright (C) 1998,1999,2000,2001,2002,2003,2004,2005 MORIOKA Tomohiko.
;; Author: MORIOKA Tomohiko <tomo@kanji.zinbun.kyoto-u.ac.jp>
;; Keywords: CHISE, Character Database, ISO/IEC 10646, Unicode, UCS-4, MULE.
(setq i (1+ i)))
v))
+(defvar char-db-file-coding-system 'utf-8-mcs-er)
+
(defvar char-db-feature-domains
'(ucs daikanwa cns gt jis jis/alt jis/a jis/b
- jis-x0213 misc unknown))
+ jis-x0212 jis-x0213 cdp shinjigen misc unknown))
-(defvar char-db-ignored-attributes nil)
+(defvar char-db-ignored-attributes '(ideographic-products))
(defun char-attribute-name< (ka kb)
(cond
+ ((eq '->denotational kb)
+ t)
+ ((eq '->subsumptive kb)
+ (not (eq '->denotational ka)))
+ ((eq '->denotational ka)
+ nil)
+ ((eq '->subsumptive ka)
+ nil)
((find-charset ka)
- (cond
- ((find-charset kb)
- (if (<= (charset-id ka) 0)
- (if (<= (charset-id kb) 0)
- (cond
- ((= (charset-dimension ka)
- (charset-dimension kb))
- (> (charset-id ka)(charset-id kb)))
- (t
- (> (charset-dimension ka)
- (charset-dimension kb))
- ))
- t)
- (if (<= (charset-id kb) 0)
- nil
- (< (charset-id ka)(charset-id kb)))))
- ((symbolp kb)
- nil)
- (t
- t)))
+ (if (find-charset kb)
+ (if (<= (charset-id ka) 1)
+ (if (<= (charset-id kb) 1)
+ (cond
+ ((= (charset-dimension ka)
+ (charset-dimension kb))
+ (> (charset-id ka)(charset-id kb)))
+ (t
+ (> (charset-dimension ka)
+ (charset-dimension kb))
+ ))
+ t)
+ (if (<= (charset-id kb) 1)
+ nil
+ (< (charset-id ka)(charset-id kb))))
+ nil))
((find-charset kb)
t)
((symbolp ka)
greek-iso8859-7
thai-tis620
=jis-x0208
- japanese-jisx0208
+ =jis-x0208@1978
+ =jis-x0208@1983
japanese-jisx0212
- japanese-jisx0208-1978
chinese-gb2312
chinese-cns11643-1
chinese-cns11643-2
chinese-cns11643-5
chinese-cns11643-6
chinese-cns11643-7
- =jis-x0208-1990
+ =jis-x0208@1990
=jis-x0213-1-2000
=jis-x0213-2-2000
korean-ksc5601
ideograph-hanziku-10
ideograph-hanziku-11
ideograph-hanziku-12
+ =gt-k
+ =ucs@iso
+ =ucs@unicode
=big5
=big5-eten
- =gt-k
+ =jis-x0208@1997
=jef-china3))
(defun char-db-make-char-spec (char)
(dolist (ccs (delq (car ret) (charset-list)))
(if (and (or (charset-iso-final-char ccs)
(memq ccs
- '(ideograph-daikanwa
- =daikanwa-rev2
+ '(=daikanwa
+ =daikanwa@rev2
;; =gt-k
)))
+ (setq ccs (charset-name ccs))
+ (null (assq ccs char-spec))
(setq ret (encode-char char ccs 'defined-only)))
(setq char-spec (cons (cons ccs ret) char-spec))))
(if (null char-spec)
required-features)
(unless column
(setq column (current-column)))
- (let (char-spec al cal key temp-char)
+ (let (char-spec temp-char)
(setq char-spec (char-db-make-char-spec char))
(unless (or (characterp char) ; char
(condition-case nil
char-spec)))
(remove-char-attribute temp-char 'ideograph-daikanwa)
(setq char temp-char))
- (setq al nil
- cal nil)
- (while char-spec
- (setq key (car (car char-spec)))
- (unless (memq key char-db-ignored-attributes)
- (if (find-charset key)
- (if (encode-char char key 'defined-only)
- (setq cal (cons key cal)))
- (setq al (cons key al))))
- (setq char-spec (cdr char-spec)))
- (unless cal
- (setq char-spec (char-db-make-char-spec char))
- (while char-spec
- (setq key (car (car char-spec)))
- (unless (memq key char-db-ignored-attributes)
- (if (find-charset key)
- (setq cal (cons key cal))
- (setq al (cons key al))))
- (setq char-spec (cdr char-spec)))
- )
- (unless (or cal
- (memq 'ideographic-structure al))
- (push 'ideographic-structure al))
- (dolist (feature required-features)
- (if (find-charset feature)
- (if (encode-char char feature 'defined-only)
- (setq cal (adjoin feature cal)))
- (setq al (adjoin feature al))))
(insert-char-attributes char
readable
- (or al 'none) cal)
+ (union (mapcar #'car char-spec)
+ required-features))
(when temp-char
;; undefine temporary character
;; Current implementation is dirty.
(let ((line-breaking
(concat "\n" (make-string (1+ column) ?\ )))
name value
- ret al cal key
+ ret al ; cal
+ key
lbs cell rest separator)
(insert "(")
(while alist
(error nil)))
(progn
(setq al nil
- cal nil)
+ ;; cal nil
+ )
(while value
(setq key (car (car value)))
- (if (find-charset key)
- (setq cal (cons key cal))
- (setq al (cons key al)))
+ ;; (if (find-charset key)
+ ;; (setq cal (cons key cal))
+ (setq al (cons key al))
+ ;; )
(setq value (cdr value)))
(insert-char-attributes ret
readable
- (or al 'none) cal))
+ (or al 'none) ; cal
+ ))
(insert (prin1-to-string value)))
(insert ")")
(insert line-breaking))
(progn
(setq rest cell
al nil
- cal nil)
+ ;; cal nil
+ )
(while rest
(setq key (car (car rest)))
- (if (find-charset key)
- (setq cal (cons key cal))
- (setq al (cons key al)))
+ ;; (if (find-charset key)
+ ;; (setq cal (cons key cal))
+ (setq al (cons key al))
+ ;; )
(setq rest (cdr rest)))
(if separator
(insert lbs))
(insert-char-attributes ret
readable
- al cal)
+ al ; cal
+ )
(setq separator lbs))
(if separator
(insert separator))
(defvar char-db-convert-obsolete-format t)
-(defun insert-char-attributes (char &optional readable
- attributes ccs-attributes
- column)
- (let (atr-d ccs-d)
- (setq attributes
- (sort (if attributes
- (if (consp attributes)
- (progn
- (dolist (name attributes)
- (unless (memq name char-db-ignored-attributes)
- (push name atr-d)))
- atr-d))
- (dolist (name (char-attribute-list))
- (unless (memq name char-db-ignored-attributes)
- (if (find-charset name)
- (push name ccs-d)
- (push name atr-d))))
- atr-d)
- #'char-attribute-name<))
- (setq ccs-attributes
- (sort (if ccs-attributes
- (progn
- (setq ccs-d nil)
- (dolist (name ccs-attributes)
- (unless (memq name char-db-ignored-attributes)
- (push name ccs-d)))
- ccs-d)
- (or ccs-d
- (progn
- (dolist (name (charset-list))
- (unless (memq name char-db-ignored-attributes)
- (push name ccs-d)))
- ccs-d)))
- #'char-attribute-name<)))
+(defun char-db-insert-ccs-feature (name value line-breaking)
+ (insert
+ (format
+ (cond ((memq name '(=daikanwa
+ =daikanwa@rev1 =daikanwa@rev2
+ =gt =gt-k =cbeta))
+ "(%-18s . %05d)\t; %c")
+ ((eq name 'mojikyo)
+ "(%-18s . %06d)\t; %c")
+ ((>= (charset-dimension name) 2)
+ "(%-18s . #x%04X)\t; %c")
+ (t
+ "(%-18s . #x%02X)\t; %c"))
+ name
+ (if (= (charset-iso-graphic-plane name) 1)
+ (logior value
+ (cond ((= (charset-dimension name) 1)
+ #x80)
+ ((= (charset-dimension name) 2)
+ #x8080)
+ ((= (charset-dimension name) 3)
+ #x808080)
+ (t 0)))
+ value)
+ (char-db-decode-isolated-char name value)))
+ (if (and (= (charset-chars name) 94)
+ (= (charset-dimension name) 2))
+ (insert (format " [%02d-%02d]"
+ (- (lsh value -8) 32)
+ (- (logand value 255) 32))))
+ (insert line-breaking))
+
+(defun insert-char-attributes (char &optional readable attributes column)
(unless column
(setq column (current-column)))
- (let (name value has-long-ccs-name rest
+ (let (name value ; has-long-ccs-name
+ rest
radical strokes
(line-breaking
(concat "\n" (make-string (1+ column) ?\ )))
lbs cell separator ret
key al cal
dest-ccss
- sources required-features)
+ sources required-features
+ ccss)
+ (let (atr-d)
+ (setq attributes
+ (sort (if attributes
+ (if (consp attributes)
+ (progn
+ (dolist (name attributes)
+ (unless (memq name char-db-ignored-attributes)
+ (if (find-charset name)
+ (push name ccss))
+ (push name atr-d)))
+ atr-d))
+ (dolist (name (char-attribute-list))
+ (unless (memq name char-db-ignored-attributes)
+ (if (find-charset name)
+ (push name ccss))
+ (push name atr-d)))
+ atr-d)
+ #'char-attribute-name<)))
(insert "(")
(when (and (memq 'name attributes)
(setq value (get-char-attribute char 'name)))
line-breaking))
(setq attributes (delq 'script attributes))
)
+ ;; (when (and (memq '<-denotational attributes)
+ ;; (setq value (get-char-attribute char '<-denotational))
+ ;; (null (cdr value))
+ ;; (setq value (encode-char (car value) 'ucs 'defined-only)))
+ ;; (insert (format "(%-18s . #x%04X)\t; %c%s"
+ ;; '=>ucs value (decode-char 'ucs value)
+ ;; line-breaking))
+ ;; (setq attributes (delq '<-denotational attributes)))
(dolist (name '(=>ucs =>ucs*))
(when (and (memq name attributes)
(setq value (get-char-attribute char name)))
line-breaking))
(setq attributes (delq name attributes))
))
- ;; (dolist (name '(=>ucs-gb =>ucs-cns =>ucs-jis =>ucs-ks =>ucs-big5))
- ;; (when (and (memq name attributes)
- ;; (setq value (get-char-attribute char name)))
- ;; (insert (format "(%-18s . #x%04X)\t; %c%s"
- ;; (intern
- ;; (concat "=>ucs@"
- ;; (substring (symbol-name name) 6)))
- ;; value
- ;; (decode-char (intern
- ;; (concat "=ucs@"
- ;; (substring
- ;; (symbol-name name) 6)))
- ;; value)
- ;; line-breaking))
- ;; (setq attributes (delq name attributes))))
- ;; (when (and (memq '->ucs attributes)
- ;; (setq value (get-char-attribute char '->ucs)))
- ;; (insert (format (if char-db-convert-obsolete-format
- ;; "(=>ucs\t\t. #x%04X)\t; %c%s"
- ;; "(->ucs\t\t. #x%04X)\t; %c%s")
- ;; value (decode-char '=ucs value)
- ;; line-breaking))
- ;; (setq attributes (delq '->ucs attributes))
- ;; )
(dolist (name '(=>daikanwa))
(when (and (memq name attributes)
(setq value (get-char-attribute char name)))
line-breaking)
(format "(%-18s %s)\t; %c%s"
name
- (mapconcat #'prin1-to-string
+ (mapconcat (function prin1-to-string)
value " ")
(char-representative-of-daikanwa char)
line-breaking)))
(setq attributes (delq 'ideographic-radical attributes))
)
(let (key)
- (dolist (domain char-db-feature-domains)
+ (dolist (domain
+ (append
+ char-db-feature-domains
+ (let (dest domain)
+ (dolist (feature (char-attribute-list))
+ (setq feature (symbol-name feature))
+ (when (string-match
+ "\\(radical\\|strokes\\)@\\([^@*]+\\)\\(\\*\\|$\\)"
+ feature)
+ (setq domain (intern (match-string 2 feature)))
+ (unless (memq domain dest)
+ (setq dest (cons domain dest)))))
+ (sort dest #'string<))))
(setq key (intern (format "%s@%s" 'ideographic-radical domain)))
(when (and (memq key attributes)
(setq value (get-char-attribute char key)))
)
(unless readable
(dolist (ignored '(composition
- ->denotational <-subsumptive ->ucs-unified))
+ ->denotational <-subsumptive ->ucs-unified
+ ->ideographic-component-forms))
(setq attributes (delq ignored attributes))))
- (setq rest ccs-attributes)
- (while (and rest
- (progn
- (setq value (get-char-attribute char (car rest)))
- (if value
- (if (>= (length (symbol-name (car rest))) 19)
- (progn
- (setq has-long-ccs-name t)
- nil)
- t)
- t)))
- (setq rest (cdr rest)))
(while attributes
(setq name (car attributes))
(if (setq value (get-char-attribute char name))
- (cond ((string-match "^=>ucs@" (symbol-name name))
+ (cond ((setq ret (find-charset name))
+ (setq name (charset-name ret))
+ (if (and (not (memq name dest-ccss))
+ (prog1
+ (setq value (get-char-attribute char name))
+ (setq dest-ccss (cons name dest-ccss))))
+ (char-db-insert-ccs-feature name value line-breaking))
+ )
+ ((string-match "^=>ucs@" (symbol-name name))
(insert (format "(%-18s . #x%04X)\t; %c%s"
name value (decode-char '=ucs value)
line-breaking))
((eq name 'jisx0208-1978/4X)
(insert (format "(%-18s . #x%04X)%s"
name value
- line-breaking)))
+ line-breaking))
+ )
+ ((and (not readable)
+ (null (get-char-attribute
+ char
+ (intern (format "%s*sources" name))))
+ (not (string-match "\\*sources$" (symbol-name name)))
+ (or (eq name '<-identical)
+ (string-match "^->halfwidth" (symbol-name name))
+ (and
+ (string-match "^->fullwidth" (symbol-name name))
+ (not
+ (and (consp value)
+ (characterp (car value))
+ (encode-char
+ (car value) '=ucs 'defined-only))))
+ (string-match "^->simplified" (symbol-name name))
+ (string-match "^->vulgar" (symbol-name name))
+ (string-match "^->wrong" (symbol-name name))
+ (string-match "^->same" (symbol-name name))
+ (string-match "^->original" (symbol-name name))
+ (string-match "^->ancient" (symbol-name name))
+ ))
+ )
((or (eq name 'ideographic-structure)
(eq name 'ideographic-)
(string-match "^\\(->\\|<-\\)" (symbol-name name)))
(if (integerp cell)
(setq cell (decode-char '=ucs cell)))
(cond ((eq name '->subsumptive)
- (if separator
- (insert lbs))
- (let ((char-db-ignored-attributes
- (cons '<-subsumptive
- char-db-ignored-attributes)))
- (insert-char-attributes cell readable))
- (setq separator lbs))
+ (when (or (not
+ (some (lambda (atr)
+ (get-char-attribute cell atr))
+ char-db-ignored-attributes))
+ (some (lambda (ccs)
+ (encode-char cell ccs
+ 'defined-only))
+ ccss))
+ (if separator
+ (insert lbs))
+ (let ((char-db-ignored-attributes
+ (cons '<-subsumptive
+ char-db-ignored-attributes)))
+ (insert-char-attributes cell readable))
+ (setq separator lbs))
+ )
((characterp cell)
(setq sources
(get-char-attribute
(intern (format "%s*sources" name))))
(setq required-features nil)
(dolist (source sources)
- (setq required-features
- (cons
- (if (find-charset
- (setq ret (intern
- (format "=%s" source))))
- ret
- source)
- required-features)))
- (when (string-match "@JP" (symbol-name name))
- (setq required-features
- (union required-features
- '(=jis-x0208
- =jis-x0208-1990
- =jis-x0213-1-2000
- =jis-x0213-2-2000
- =jis-x0212
- =jis-x0208-1983
- =jis-x0208-1978))))
+ (cond
+ ((memq source '(JP JP/Jouyou
+ shinjigen-1))
+ (setq required-features
+ (union required-features
+ '(=jis-x0208
+ =jis-x0208@1990
+ =jis-x0213-1-2000
+ =jis-x0213-2-2000
+ =jis-x0212
+ =jis-x0208@1983
+ =jis-x0208@1978))))
+ ((eq source 'CN)
+ (setq required-features
+ (union required-features
+ '(=gb2312
+ =gb12345
+ =iso-ir165)))))
+ (cond
+ ((find-charset
+ (setq ret (intern (format "=%s" source))))
+ (setq required-features
+ (cons ret required-features)))
+ (t (setq required-features
+ (cons source required-features)))))
+ (cond ((string-match "@JP" (symbol-name name))
+ (setq required-features
+ (union required-features
+ '(=jis-x0208
+ =jis-x0208@1990
+ =jis-x0213-1-2000
+ =jis-x0213-2-2000
+ =jis-x0212
+ =jis-x0208@1983
+ =jis-x0208@1978))))
+ ((string-match "@CN" (symbol-name name))
+ (setq required-features
+ (union required-features
+ '(=gb2312
+ =gb12345
+ =iso-ir165)))))
(if separator
(insert lbs))
(if readable
line-breaking)))
))
(setq attributes (cdr attributes)))
- (while ccs-attributes
- (setq name (charset-name (car ccs-attributes)))
- (if (and (not (memq name dest-ccss))
- (prog1
- (setq value (get-char-attribute char name))
- (setq dest-ccss (cons name dest-ccss))))
- (insert
- (format
- (cond ((memq name '(=daikanwa
- =daikanwa-rev1 =daikanwa-rev2
- =gt =gt-k =cbeta))
- (if has-long-ccs-name
- "(%-26s . %05d)\t; %c%s"
- "(%-18s . %05d)\t; %c%s"))
- ((eq name 'mojikyo)
- (if has-long-ccs-name
- "(%-26s . %06d)\t; %c%s"
- "(%-18s . %06d)\t; %c%s"))
- ((>= (charset-dimension name) 2)
- (if has-long-ccs-name
- "(%-26s . #x%04X)\t; %c%s"
- "(%-18s . #x%04X)\t; %c%s"))
- (t
- (if has-long-ccs-name
- "(%-26s . #x%02X)\t; %c%s"
- "(%-18s . #x%02X)\t; %c%s")))
- name
- (if (= (charset-iso-graphic-plane name) 1)
- (logior value
- (cond ((= (charset-dimension name) 1)
- #x80)
- ((= (charset-dimension name) 2)
- #x8080)
- ((= (charset-dimension name) 3)
- #x808080)
- (t 0)))
- value)
- (char-db-decode-isolated-char name value)
- line-breaking)))
- (setq ccs-attributes (cdr ccs-attributes)))
(insert ")")))
(defun insert-char-data (char &optional readable
- attributes ccs-attributes)
+ attributes)
(save-restriction
(narrow-to-region (point)(point))
(insert "(define-char
'")
- (insert-char-attributes char readable
- attributes ccs-attributes)
+ (insert-char-attributes char readable attributes)
(insert ")\n")
(goto-char (point-min))
(while (re-search-forward "[ \t]+$" nil t)
no-ucs-unified
script excluded-script)
(insert-char-data char printable)
- (let ((variants (or (char-variants char)
- (let ((ucs (get-char-attribute char '->ucs)))
- (if ucs
- (delete char (char-variants (int-char ucs)))))))
- variant vs)
+ (let ((variants (char-variants char))
+ rest
+ variant vs ret)
(setq variants (sort variants #'<))
- (while variants
- (setq variant (car variants))
- (if (and (or (null script)
- (null (setq vs (get-char-attribute variant 'script)))
- (memq script vs))
- (or (null excluded-script)
- (null (setq vs (get-char-attribute variant 'script)))
- (not (memq excluded-script vs))))
- (or (and no-ucs-unified (get-char-attribute variant '=ucs))
- (insert-char-data variant printable)))
- (setq variants (cdr variants))
- )))
+ (setq rest variants)
+ (setq variants (cons char variants))
+ (while rest
+ (setq variant (car rest))
+ (unless (get-char-attribute variant '<-subsumptive)
+ (if (and (or (null script)
+ (null (setq vs (get-char-attribute variant 'script)))
+ (memq script vs))
+ (or (null excluded-script)
+ (null (setq vs (get-char-attribute variant 'script)))
+ (not (memq excluded-script vs))))
+ (unless (and no-ucs-unified (get-char-attribute variant '=ucs))
+ (insert-char-data variant printable)
+ (if (setq ret (char-variants variant))
+ (while ret
+ (or (memq (car ret) variants)
+ ;; (get-char-attribute (car ret) '<-subsumptive)
+ (setq rest (nconc rest (list (car ret)))))
+ (setq ret (cdr ret)))))))
+ (setq rest (cdr rest)))))
(defun insert-char-range-data (min max &optional script excluded-script)
(let ((code min)
(defun write-char-range-data-to-file (min max file
&optional script excluded-script)
- (let ((coding-system-for-write 'utf-8-mcs))
+ (let ((coding-system-for-write char-db-file-coding-system))
(with-temp-buffer
- (insert ";; -*- coding: utf-8-mcs -*-\n")
+ (insert (format ";; -*- coding: %s -*-\n"
+ char-db-file-coding-system))
(insert-char-range-data min max script excluded-script)
(write-region (point-min)(point-max) file))))