(est-coded-charset-priority-list): Add `=>ucs-itaiji-007'.
[chise/isd.git] / isd-turtle.el
1 ;;; isd-turtle.el --- Utility to dump ideographic-structure as Turtle files
2
3 ;; Copyright (C) 2017 MORIOKA Tomohiko
4
5 ;; Author: MORIOKA Tomohiko <tomo@kanji.zinbun.kyoto-u.ac.jp>
6 ;; Keywords: Ideographic Structures (漢字構造、解字), IDS, CHISE, RDF, Turtle
7
8 ;; This file is a part of CHISE-ISD (Ideographic Structure Database).
9
10 ;; This program is free software; you can redistribute it and/or
11 ;; modify it under the terms of the GNU General Public License as
12 ;; published by the Free Software Foundation; either version 2, or (at
13 ;; your option) any later version.
14
15 ;; This program is distributed in the hope that it will be useful, but
16 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 ;; General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with this program; see the file COPYING.  If not, write to
22 ;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
24
25 ;;; Code:
26
27 (require 'cwiki-common)
28
29 (defvar isd-url-prefix "http://rdf.chise.org/data/")
30
31 (setq est-coded-charset-priority-list
32   '(; =ucs
33     =mj
34     =adobe-japan1-0
35     =adobe-japan1-1
36     =adobe-japan1-2
37     =adobe-japan1-3
38     =adobe-japan1-4
39     =adobe-japan1-5
40     =adobe-japan1-6
41     =ucs@iso
42     =jis-x0208 =jis-x0208@1990
43     =jis-x0213-1
44     =jis-x0213-1@2000 =jis-x0213-1@2004
45     =jis-x0213-2
46     =jis-x0212
47     =gt
48     =hanyo-denshi/ks
49     =hanyo-denshi/tk
50     =ucs-itaiji-001
51     =ucs-itaiji-002
52     =ucs-itaiji-003
53     =ucs-itaiji-004
54     =ucs-itaiji-005
55     =ucs-itaiji-006
56     =ucs-itaiji-007
57     =ucs-itaiji-009
58     =ucs-itaiji-084
59     =ucs-var-001
60     =ucs-var-002
61     =ucs-var-003
62     =ucs-var-004
63     =cns11643-1 =cns11643-2 =cns11643-3
64     =cns11643-4 =cns11643-5 =cns11643-6 =cns11643-7
65     =gb2312
66     =big5-cdp
67     =ks-x1001
68     =gt-k
69     =ucs@unicode
70     =ucs@JP/hanazono
71     =gb12345
72     =ucs@cns
73     =ucs@gb
74     =zinbun-oracle =>zinbun-oracle
75     =daikanwa
76     =ruimoku-v6
77     =cbeta =jef-china3
78     =daikanwa/+2p
79     =+>ucs@iso =+>ucs@unicode
80     =+>ucs@jis
81     =+>ucs@cns
82     =+>ucs@ks
83     =+>ucs@jis/1990
84     =>mj
85     =>jis-x0208 =>jis-x0213-1
86     =>jis-x0208@1997
87     =>ucs@iwds-1
88     =>ucs@component
89     =>iwds-1
90     =>ucs@iso
91     =>ucs@unicode
92     =>ucs@jis =>ucs@cns =>ucs@ks
93     =>gt
94     =>gt-k
95     =>>ucs@iso =>>ucs@unicode
96     =>>ucs@jis =>>ucs@cns =>>ucs@ks
97     =>>gt-k
98     =>>hanyo-denshi/ks
99     ==mj
100     ==ucs@iso
101     ==ucs@unicode
102     ==adobe-japan1-0
103     ==adobe-japan1-1
104     ==adobe-japan1-2
105     ==adobe-japan1-3
106     ==adobe-japan1-4
107     ==adobe-japan1-5
108     ==adobe-japan1-6
109     ==ks-x1001
110     ==hanyo-denshi/ks
111     ==hanyo-denshi/tk
112     ==ucs@jis
113     ==gt
114     ==cns11643-1 ==cns11643-2 ==cns11643-3
115     ==cns11643-4 ==cns11643-5 ==cns11643-6 ==cns11643-7
116     ==jis-x0212
117     ==ucs@cns
118     ==koseki
119     ==daikanwa
120     ==gt-k
121     ==ucs@gb
122     ==ucs-itaiji-003
123     ==ucs@JP/hanazono
124     ==daikanwa/+2p
125     =>>jis-x0208 =>>jis-x0213-1 =>>jis-x0213-2
126     =+>jis-x0208 =+>jis-x0213-1 =+>jis-x0213-2
127     =+>hanyo-denshi/jt
128     =+>jis-x0208@1978
129     =>>gt
130     =+>adobe-japan1
131     =>>adobe-japan1
132     =jis-x0208@1983 =jis-x0208@1978
133     =>ucs-itaiji-001
134     =>ucs-itaiji-002
135     =>ucs-itaiji-005
136     =>ucs-itaiji-007
137     ==>ucs@bucs
138     =big5
139     =>cbeta
140     ===mj
141     ===ucs@iso
142     ===ucs@unicode
143     ===hanyo-denshi/ks
144     ===ks-x1001
145     ===gt
146     ===gt-k
147     ===ucs@ks
148     ===ucs@gb
149     =shinjigen
150     =shinjigen@rev
151     =shinjigen@1ed
152     =shinjigen/+p@rev
153     ==shinjigen
154     ==shinjigen@rev
155     ==daikanwa/+p
156     ==shinjigen@1ed
157     ===daikanwa/+p
158     =>daikanwa/ho
159     ===daikanwa/ho
160     ))
161
162 ;; (defvar isd-turtle-ccs-list nil)
163 (defvar chise-turtle-ccs-prefix-alist nil)
164
165 (defun charset-code-point-format-spec (ccs)
166   (cond ((memq ccs '(=ucs))
167          "0x%04X")
168         (t
169          (let ((ccs-name (symbol-name ccs)))
170            (cond
171             ((string-match
172               "\\(shinjigen\\|daikanwa/ho\\|=>iwds-1\\)"
173               ccs-name)
174              "%04d")
175             ((string-match
176               "\\(gt\\|daikanwa\\|adobe-japan1\\|cbeta\\|zinbun-oracle\\|hng\\)"
177               ccs-name)
178              "%05d")
179             ((string-match "\\(hanyo-denshi/ks\\|koseki\\|mj\\)" ccs-name)
180              "%06d")
181             ((string-match "hanyo-denshi/tk" ccs-name)
182              "%08d")
183             (t
184              "0x%X"))))))
185
186 ;; (defun isd-turtle-uri-encode-feature-name (feature-name)
187 ;;   (cond
188 ;;    ((eq '=ucs feature-name)
189 ;;     "a.ucs")
190 ;;    ((eq '==>ucs@bucs feature-name)
191 ;;     "bucs")
192 ;;    (t
193 ;;     (mapconcat (lambda (c)
194 ;;                  (if (eq c ?@)
195 ;;                      "_"
196 ;;                    (char-to-string c)))
197 ;;                (www-uri-encode-feature-name feature-name)
198 ;;                ""))))
199 (defun chise-turtle-uri-encode-ccs-name (feature-name)
200   (cond
201    ((eq '=ucs feature-name)
202     "a.ucs")
203    ((eq '=big5 feature-name)
204     "a.big5")
205    ((eq '==>ucs@bucs feature-name)
206     "bucs")
207    (t
208     (mapconcat (lambda (c)
209                  (cond
210                   ((eq c ?@)
211                    "_")
212                   ((eq c ?+)
213                    "._.")
214                   ((eq c ?=)
215                    ".:.")
216                   (t
217                    (char-to-string c))))
218                (www-uri-encode-feature-name feature-name)
219                ""))))
220
221 ;; (defun isd-turtle-format-ccs-code-point (ccs code-point)
222 ;;   (unless (memq ccs isd-turtle-ccs-list)
223 ;;     (setq isd-turtle-ccs-list (cons ccs isd-turtle-ccs-list)))
224 ;;   (format "%s:%s"
225 ;;           (isd-turtle-uri-encode-feature-name ccs)
226 ;;           (format (charset-code-point-format-spec ccs)
227 ;;                   code-point)))
228 (defun chise-turtle-format-ccs-code-point (ccs code-point)
229   (let ((ccs-uri (chise-turtle-uri-encode-ccs-name ccs)))
230     (unless (assoc ccs-uri chise-turtle-ccs-prefix-alist)
231       (setq chise-turtle-ccs-prefix-alist
232             (cons (cons ccs-uri ccs)
233                   chise-turtle-ccs-prefix-alist)))
234     (format "%s:%s"
235             ccs-uri
236             (format (charset-code-point-format-spec ccs)
237                     code-point))))
238
239 (defun isd-turtle-encode-char (object)
240   (let ((ccs-list est-coded-charset-priority-list)
241         ccs ret)
242     (if (setq ret (encode-char object '=ucs))
243         (chise-turtle-format-ccs-code-point '=ucs ret)
244       (while (and ccs-list
245                   (setq ccs (pop ccs-list))
246                   (not (setq ret (encode-char object ccs 'defined-only)))))
247       (cond (ret
248              (chise-turtle-format-ccs-code-point ccs ret)
249              )
250             ((and (setq ccs (car (split-char object)))
251                   (setq ret (encode-char object ccs)))
252              (chise-turtle-format-ccs-code-point ccs ret)
253              )
254             (t
255              (format (if est-hide-cgi-mode
256                          "system-char-id=0x%X"
257                        "system-char-id:0x%X")
258                      (encode-char object 'system-char-id))
259              )))))
260
261 (defun isd-turtle-format-component (component separator level prefix)
262   (cond ((characterp component)
263          (format "%s %c # %c"
264                  (isd-turtle-encode-char component)
265                  separator
266                  component)
267          )
268         ((consp component)
269          (let ((ret (find-char component)))
270            (cond (ret
271                   (format "%s %c # %c"
272                           (isd-turtle-encode-char ret) separator ret))
273                  ((setq ret (assq 'ideographic-structure component))
274                   (if (eq separator ?\;)
275                       (format "%s ;"
276                               (isd-turtle-format-char nil nil (cdr ret) (1+ level)
277                                                       prefix))
278                     (isd-turtle-format-char nil nil (cdr ret) (1+ level)
279                                             prefix))))))))
280
281 (defun isd-turtle-format-char (ccs code-point &optional ids-list level
282                                    prefix without-head-char)
283   (unless level
284     (setq level 0))
285   (unless prefix
286     (setq prefix ""))
287   (let ((indent (make-string (* level 4) ?\ ))
288         char
289         idc
290         p1 p2 p3
291         c1 c2 c3
292         ret)
293     (unless ids-list
294       (if (and ccs code-point
295                (setq char (decode-char ccs code-point)))
296           (setq ids-list (get-char-attribute char 'ideographic-structure))))
297     (setq idc (car ids-list))
298     (setq c1 (nth 1 ids-list)
299           c2 (nth 2 ids-list)
300           c3 (nth 3 ids-list))
301     (if (char-ref-p idc)
302         (setq idc (plist-get idc :char)))
303     (if (and (consp idc)
304              (setq ret (find-char idc)))
305         (setq idc ret))
306     (if (and (consp c1)
307              (setq ret (find-char c1)))
308         (setq c1 ret))
309     (if (and (consp c2)
310              (setq ret (find-char c2)))
311         (setq c2 ret))
312     (if (and (consp c3)
313              (setq ret (find-char c3)))
314         (setq c3 ret))
315     (cond
316      ((eq idc ?\u2FF0) ; ⿰
317       (setq p1 'left
318             p2 'right)
319       )
320      ((eq idc ?⿱)
321       (setq p1 'above
322             p2 'below)
323       )
324      ((eq idc ?⿲)
325       (setq p1 'left
326             p2 'middle
327             p3 'right)
328       )
329      ((eq idc ?⿳)
330       (setq p1 'above
331             p2 'middle
332             p3 'below)
333       )
334      ((memq idc '(?⿴ ?⿵ ?⿶ ?⿷ ?⿸ ?⿹ ?⿺))
335       (setq p1 'surround
336             p2 'filling)
337       )
338      ((eq idc ?⿻)
339       (setq p1 'underlying
340             p2 'overlaying)
341       ))
342     (cond
343      (p3
344       (format "%s
345 %s    %s:structure [ a idc:%c ;
346 %s        %s:%-8s %s
347 %s        %s:%-8s %s
348 %s        %s:%-8s %s
349 %s    ]%s"
350               (if without-head-char
351                   ""
352                 (if (and ccs code-point)
353                     (format "%s   # %c"
354                             (chise-turtle-format-ccs-code-point ccs code-point)
355                             char)
356                   "["))
357               indent prefix idc
358               indent prefix p1 (isd-turtle-format-component c1 ?\; (1+ level) prefix)
359               indent prefix p2 (isd-turtle-format-component c2 ?\; (1+ level) prefix)
360               indent prefix p3 (isd-turtle-format-component c3 ?\  (1+ level) prefix)
361               indent
362               (if without-head-char
363                   ""
364                 (if (null char)
365                     (format "\n%s]"
366                             indent)
367                   "")))
368       )
369      (idc
370       (format "%s
371 %s    %s:structure [ a idc:%c ;
372 %s        %s:%-8s %s
373 %s        %s:%-8s %s
374 %s    ]%s"
375               (if without-head-char
376                   ""
377                 (if (and ccs code-point)
378                     (format "%s   # %c"
379                             (chise-turtle-format-ccs-code-point ccs code-point)
380                             char)
381                   "["))
382               indent prefix idc
383               indent prefix p1 (isd-turtle-format-component c1 ?\; (1+ level) prefix)
384               indent prefix p2 (isd-turtle-format-component c2 ?\  (1+ level) prefix)
385               indent
386               (if without-head-char
387                   ""
388                 (if (null char)
389                     (format "\n%s]"
390                             indent)
391                   "")))))
392     ))
393
394 (defun isd-turtle-insert-char (ccs code-point)
395   (let ((ret (isd-turtle-format-char ccs code-point)))
396     (when ret
397       (insert ret)
398       (insert " .\n"))))
399
400 (defun isd-turtle-insert-ccs-ranges (ccs &rest ranges)
401   (let (range code max-code)
402     (while ranges
403       (setq range (car ranges))
404       (cond ((consp range)
405              (setq code (car range)
406                    max-code (cdr range))
407              (while (<= code max-code)
408                (isd-turtle-insert-char ccs code)
409                (setq code (1+ code)))
410              )
411             ((integerp range)
412              (isd-turtle-insert-char ccs range)
413              )
414             (t (error 'wrong-type-argument range)))
415       (setq ranges (cdr ranges)))))
416
417 (defun isd-turtle-dump-range (file path func &rest args)
418   (with-temp-buffer
419     (let ((coding-system-for-write 'utf-8-mcs-er)
420           ;; isd-turtle-ccs-list
421           chise-turtle-ccs-prefix-alist)
422       (if (file-directory-p path)
423           (setq path (expand-file-name file path)))
424       (apply func args)
425       (goto-char (point-min))
426       ;; (dolist (ccs (sort isd-turtle-ccs-list
427       ;;                    #'char-attribute-name<))
428       ;;   (insert (format "@prefix %s: <%s%s=> .\n"
429       ;;                   (isd-turtle-uri-encode-feature-name ccs)
430       ;;                   "http://www.chise.org/est/view/character/"
431       ;;                   (www-uri-encode-feature-name ccs))))
432       (dolist (cell (sort chise-turtle-ccs-prefix-alist
433                           (lambda (a b)
434                             (char-attribute-name< (cdr a)(cdr b)))))
435         (insert (format "@prefix %s: <%s/%s=> .\n"
436                         (car cell)
437                         "http://www.chise.org/est/view/character"
438                         (www-uri-encode-feature-name (cdr cell)))))
439       (insert "\n")
440       (goto-char (point-min))
441       (insert "# -*- coding: utf-8-mcs-er -*-\n")
442       (insert "@prefix : <http://rdf.chise.org/rdf/property/character/isd/> .
443 @prefix idc: <http://rdf.chise.org/rdf/type/character/idc/> .\n")
444       (write-region (point-min)(point-max) path))))
445
446 ;;;###autoload
447 (defun isd-turtle-dump-ucs-basic (filename)
448   (interactive "Fdump ISD-UCS-Basic : ")
449   (isd-turtle-dump-range "ISD-UCS-Basic.ttl" filename
450                          #'isd-turtle-insert-ccs-ranges
451                          '=ucs '(#x4E00 . #x9FA5)))
452
453 ;;;###autoload
454 (defun isd-turtle-dump-ucs-ext-a (filename)
455   (interactive "Fdump ISD-UCS-Ext-A : ")
456   (isd-turtle-dump-range "ISD-UCS-Ext-A.ttl" filename
457                          #'isd-turtle-insert-ccs-ranges
458                          '=ucs '(#x3400 . #x4DB5) #xFA1F #xFA23))
459
460 ;;;###autoload
461 (defun isd-turtle-dump-ucs-ext-b-1 (filename)
462   (interactive "Fdump IDS-UCS-Ext-B-1 : ")
463   (isd-turtle-dump-range "ISD-UCS-Ext-B-1.ttl" filename
464                          #'isd-turtle-insert-ccs-ranges
465                          'ucs '(#x20000 . #x21FFF)))
466
467 ;;;###autoload
468 (defun isd-turtle-dump-ucs-ext-b-2 (filename)
469   (interactive "Fdump IDS-UCS-Ext-B-2 : ")
470   (isd-turtle-dump-range "ISD-UCS-Ext-B-2.ttl" filename
471                          #'isd-turtle-insert-ccs-ranges
472                          'ucs '(#x22000 . #x23FFF)))
473
474 ;;;###autoload
475 (defun isd-turtle-dump-ucs-ext-b-3 (filename)
476   (interactive "Fdump IDS-UCS-Ext-B-3 : ")
477   (isd-turtle-dump-range "ISD-UCS-Ext-B-3.ttl" filename
478                          #'isd-turtle-insert-ccs-ranges
479                          'ucs '(#x24000 . #x25FFF)))
480
481 ;;;###autoload
482 (defun isd-turtle-dump-ucs-ext-b-4 (filename)
483   (interactive "Fdump IDS-UCS-Ext-B-4 : ")
484   (isd-turtle-dump-range "ISD-UCS-Ext-B-4.ttl" filename
485                          #'isd-turtle-insert-ccs-ranges
486                          'ucs '(#x26000 . #x27FFF)))
487
488 ;;;###autoload
489 (defun isd-turtle-dump-ucs-ext-b-5 (filename)
490   (interactive "Fdump IDS-UCS-Ext-B-5 : ")
491   (isd-turtle-dump-range "ISD-UCS-Ext-B-5.ttl" filename
492                          #'isd-turtle-insert-ccs-ranges
493                          'ucs '(#x28000 . #x29FFF)))
494
495 ;;;###autoload
496 (defun isd-turtle-dump-ucs-ext-b-6 (filename)
497   (interactive "Fdump IDS-UCS-Ext-B-6 : ")
498   (isd-turtle-dump-range "ISD-UCS-Ext-B-6.ttl" filename
499                          #'isd-turtle-insert-ccs-ranges
500                          'ucs '(#x2A000 . #x2A6D6)))
501
502 ;;;###autoload
503 (defun isd-turtle-dump-ucs-ext-c (filename)
504   (interactive "Fdump IDS-UCS-Ext-C : ")
505   (isd-turtle-dump-range "ISD-UCS-Ext-C.ttl" filename
506                          #'isd-turtle-insert-ccs-ranges
507                          'ucs '(#x2A700 . #x2B734)))
508
509 ;;;###autoload
510 (defun isd-turtle-dump-ucs-ext-d (filename)
511   (interactive "Fdump IDS-UCS-Ext-D : ")
512   (isd-turtle-dump-range "ISD-UCS-Ext-D.ttl" filename
513                          #'isd-turtle-insert-ccs-ranges
514                          'ucs '(#x2B740 . #x2B81D)))
515
516 ;;;###autoload
517 (defun isd-turtle-dump-ucs-ext-e (filename)
518   (interactive "Fdump IDS-UCS-Ext-E : ")
519   (isd-turtle-dump-range "ISD-UCS-Ext-E.ttl" filename
520                          #'isd-turtle-insert-ccs-ranges
521                          'ucs '(#x2B820 . #x2CEA1)))
522
523 ;;;###autoload
524 (defun isd-turtle-dump-mj-0 (filename)
525   (interactive "Fdump ISD-MJ-0 : ")
526   (isd-turtle-dump-range "ISD-MJ-0.ttl" filename
527                          #'isd-turtle-insert-ccs-ranges
528                          '=mj '(1 . 9999)))
529
530 ;;;###autoload
531 (defun isd-turtle-dump-mj-1 (filename)
532   (interactive "Fdump ISD-MJ-1 : ")
533   (isd-turtle-dump-range "ISD-MJ-1.ttl" filename
534                          #'isd-turtle-insert-ccs-ranges
535                          '=mj '(10000 . 19999)))
536
537 ;;;###autoload
538 (defun isd-turtle-dump-mj-2 (filename)
539   (interactive "Fdump ISD-MJ-2 : ")
540   (isd-turtle-dump-range "ISD-MJ-2.ttl" filename
541                          #'isd-turtle-insert-ccs-ranges
542                          '=mj '(20000 . 29999)))
543
544 ;;;###autoload
545 (defun isd-turtle-dump-mj-3 (filename)
546   (interactive "Fdump ISD-MJ-3 : ")
547   (isd-turtle-dump-range "ISD-MJ-3.ttl" filename
548                          #'isd-turtle-insert-ccs-ranges
549                          '=mj '(30000 . 39999)))
550
551 ;;;###autoload
552 (defun isd-turtle-dump-mj-4 (filename)
553   (interactive "Fdump ISD-MJ-4 : ")
554   (isd-turtle-dump-range "ISD-MJ-4.ttl" filename
555                          #'isd-turtle-insert-ccs-ranges
556                          '=mj '(40000 . 49999)))
557
558 ;;;###autoload
559 (defun isd-turtle-dump-mj-5 (filename)
560   (interactive "Fdump ISD-MJ-5 : ")
561   (isd-turtle-dump-range "ISD-MJ-5.ttl" filename
562                          #'isd-turtle-insert-ccs-ranges
563                          '=mj '(50000 . 59999)))
564
565 ;;;###autoload
566 (defun isd-turtle-dump-mj-6 (filename)
567   (interactive "Fdump ISD-MJ-6 : ")
568   (isd-turtle-dump-range "ISD-MJ-6.ttl" filename
569                          #'isd-turtle-insert-ccs-ranges
570                          '=mj '(60000 . 69999)))
571
572 ;;;###autoload
573 (defun isd-turtle-dump-all (directory)
574   (interactive "DISD directory : ")
575   (isd-turtle-dump-ucs-basic directory)
576   (isd-turtle-dump-ucs-ext-a directory)
577   (isd-turtle-dump-ucs-ext-b-1 directory)
578   (isd-turtle-dump-ucs-ext-b-2 directory)
579   (isd-turtle-dump-ucs-ext-b-3 directory)
580   (isd-turtle-dump-ucs-ext-b-4 directory)
581   (isd-turtle-dump-ucs-ext-b-5 directory)
582   (isd-turtle-dump-ucs-ext-b-6 directory)
583   (isd-turtle-dump-ucs-ext-c directory)
584   (isd-turtle-dump-ucs-ext-d directory)
585   (isd-turtle-dump-ucs-ext-e directory)
586   (isd-turtle-dump-mj-0 directory)
587   (isd-turtle-dump-mj-1 directory)
588   (isd-turtle-dump-mj-2 directory)
589   (isd-turtle-dump-mj-3 directory)
590   (isd-turtle-dump-mj-4 directory)
591   (isd-turtle-dump-mj-5 directory)
592   (isd-turtle-dump-mj-6 directory)
593   )
594
595
596 ;;; @ End.
597 ;;;
598
599 (provide 'isd-turtle)
600
601 ;;; isd-turtle.el ends here