Update Copyright header.
[chise/isd.git] / isd-turtle.el
1 ;;; isd-turtle.el --- Utility to dump ideographic-structure as Turtle files
2
3 ;; Copyright (C) 2017, 2018 MORIOKA Tomohiko
4
5 ;; Author: MORIOKA Tomohiko <tomo@kanji.zinbun.kyoto-u.ac.jp>
6 ;; Keywords: Ideographic Structures (漢字構造、解字), IDS, CHISE, RDF, Turtle
7
8 ;; This file is a part of CHISE-ISD (Ideographic Structure Database).
9
10 ;; This program is free software; you can redistribute it and/or
11 ;; modify it under the terms of the GNU General Public License as
12 ;; published by the Free Software Foundation; either version 2, or (at
13 ;; your option) any later version.
14
15 ;; This program is distributed in the hope that it will be useful, but
16 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 ;; General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with this program; see the file COPYING.  If not, write to
22 ;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
24
25 ;;; Code:
26
27 (require 'cwiki-common)
28
(defvar isd-url-prefix "http://rdf.chise.org/data/"
  "URL prefix string for CHISE RDF data.
No other definition visible in this file refers to it.")

;; Coded-charset (CCS) feature names, in the order in which
;; `isd-turtle-encode-char' tries them for a character that has no
;; `=ucs' code point.  This assignment replaces any value the variable
;; held before this file was loaded.
(setq est-coded-charset-priority-list
      '(;; =ucs  (not listed; `isd-turtle-encode-char' tries it first)
        =mj
        =adobe-japan1-0 =adobe-japan1-1 =adobe-japan1-2 =adobe-japan1-3
        =adobe-japan1-4 =adobe-japan1-5 =adobe-japan1-6
        =ucs@iso
        =jis-x0208 =jis-x0208@1990
        =jis-x0213-1 =jis-x0213-1@2000 =jis-x0213-1@2004
        =jis-x0213-2
        =jis-x0212
        =gt
        =hanyo-denshi/ks =hanyo-denshi/tk
        =ucs-itaiji-001 =ucs-itaiji-002 =ucs-itaiji-003 =ucs-itaiji-004
        =ucs-itaiji-005 =ucs-itaiji-006 =ucs-itaiji-007 =ucs-itaiji-008
        =ucs-itaiji-009 =ucs-itaiji-010
        =ucs-itaiji-084
        =ucs-var-001 =ucs-var-002 =ucs-var-003 =ucs-var-004 =ucs-var-005
        =cns11643-1 =cns11643-2 =cns11643-3 =cns11643-4
        =cns11643-5 =cns11643-6 =cns11643-7
        =gb2312
        =big5-cdp
        =ks-x1001
        =gt-k
        =ucs@unicode
        =ucs@JP/hanazono
        =gb12345
        =ucs@cns
        =ucs@gb
        =zinbun-oracle =>zinbun-oracle
        =daikanwa
        =ruimoku-v6
        =cbeta =jef-china3
        =daikanwa/+2p
        =+>ucs@iso =+>ucs@unicode =+>ucs@jis =+>ucs@cns =+>ucs@ks
        =+>ucs@jis/1990
        =>mj
        =>jis-x0208 =>jis-x0213-1
        =>jis-x0208@1997
        =>ucs@iwds-1 =>ucs@cognate =>ucs@component
        =>iwds-1
        =>ucs@iso =>ucs@unicode
        =>ucs@jis =>ucs@cns =>ucs@ks
        =>gt =>gt-k
        =>>ucs@iso =>>ucs@unicode
        =>>ucs@jis =>>ucs@cns =>>ucs@ks
        =>>gt-k
        =>>hanyo-denshi/ks
        ==mj
        ==ucs@iso ==ucs@unicode
        ==adobe-japan1-0 ==adobe-japan1-1 ==adobe-japan1-2 ==adobe-japan1-3
        ==adobe-japan1-4 ==adobe-japan1-5 ==adobe-japan1-6
        ==ks-x1001
        ==hanyo-denshi/ks ==hanyo-denshi/tk
        ==ucs@jis
        ==gt
        ==cns11643-1 ==cns11643-2 ==cns11643-3 ==cns11643-4
        ==cns11643-5 ==cns11643-6 ==cns11643-7
        ==jis-x0212
        ==ucs@cns
        ==koseki
        ==daikanwa
        ==gt-k
        ==ucs@gb
        ==ucs-itaiji-003
        ==ucs@JP/hanazono
        ==daikanwa/+2p
        =>>jis-x0208 =>>jis-x0213-1 =>>jis-x0213-2
        =+>jis-x0208 =+>jis-x0213-1 =+>jis-x0213-2
        =+>hanyo-denshi/jt
        =+>jis-x0208@1978
        =>>gt
        =+>adobe-japan1
        =>>adobe-japan1
        =jis-x0208@1983 =jis-x0208@1978
        =>ucs-itaiji-001 =>ucs-itaiji-002 =>ucs-itaiji-003 =>ucs-itaiji-004
        =>ucs-itaiji-005 =>ucs-itaiji-006 =>ucs-itaiji-007
        ==>ucs@bucs
        =big5
        =>cbeta
        ===mj
        ===ucs@iso ===ucs@unicode
        ===hanyo-denshi/ks
        ===ks-x1001
        ===gt ===gt-k
        ===ucs@ks ===ucs@gb
        =shinjigen =shinjigen@rev =shinjigen@1ed
        =shinjigen/+p@rev
        ==shinjigen ==shinjigen@rev
        ==daikanwa/+p
        ==shinjigen@1ed
        ===daikanwa/+p
        =>daikanwa/ho
        ===daikanwa/ho))
168
169 ;; (defvar isd-turtle-ccs-list nil)
(defvar chise-turtle-ccs-prefix-alist nil
  "Alist of (PREFIX-NAME . CCS) pairs collected while formatting.
`chise-turtle-format-ccs-code-point' adds an entry the first time a
prefix name is used; `isd-turtle-dump-range' rebinds the variable and
turns the collected entries into Turtle @prefix declarations.")
171
(defun charset-code-point-format-spec (ccs)
  "Return the `format' spec used to print a code point of CCS.
CCS is a coded-charset symbol.  `=ucs' yields \"0x%04X\".  Any other
symbol is matched by name against a fixed list of patterns, tried in
order, selecting a zero-padded decimal spec; names that match none of
the patterns yield \"0x%X\"."
  (if (eq ccs '=ucs)
      "0x%04X"
    (let ((name (symbol-name ccs))
          ;; (REGEXP . SPEC); the first matching entry wins, so the
          ;; more specific names must stay ahead of the general ones.
          (rules '(("\\(shinjigen\\|daikanwa/ho\\|=>iwds-1\\)" . "%04d")
                   ("\\(gt\\|daikanwa\\|adobe-japan1\\|cbeta\\|zinbun-oracle\\|hng\\)"
                    . "%05d")
                   ("\\(hanyo-denshi/ks\\|koseki\\|mj\\)" . "%06d")
                   ("hanyo-denshi/tk" . "%08d")))
          spec)
      (while (and rules (null spec))
        (if (string-match (car (car rules)) name)
            (setq spec (cdr (car rules))))
        (setq rules (cdr rules)))
      (or spec "0x%X"))))
192
193 ;; (defun isd-turtle-uri-encode-feature-name (feature-name)
194 ;;   (cond
195 ;;    ((eq '=ucs feature-name)
196 ;;     "a.ucs")
197 ;;    ((eq '==>ucs@bucs feature-name)
198 ;;     "bucs")
199 ;;    (t
200 ;;     (mapconcat (lambda (c)
201 ;;                  (if (eq c ?@)
202 ;;                      "_"
203 ;;                    (char-to-string c)))
204 ;;                (www-uri-encode-feature-name feature-name)
205 ;;                ""))))
(defun chise-turtle-uri-encode-ccs-name (feature-name)
  "Return the Turtle prefix name used for the CCS symbol FEATURE-NAME.
Three symbols have fixed names: `=ucs' -> \"a.ucs\", `=big5' ->
\"a.big5\" and `==>ucs@bucs' -> \"bucs\".  For any other symbol the
result of `www-uri-encode-feature-name' is rewritten character by
character: \"@\" becomes \"_\", \"+\" becomes \"._.\" and \"=\" becomes
\".:.\"."
  (let ((fixed (assq feature-name
                     '((=ucs . "a.ucs")
                       (=big5 . "a.big5")
                       (==>ucs@bucs . "bucs")))))
    (if fixed
        (cdr fixed)
      (mapconcat (lambda (ch)
                   (or (cdr (assq ch '((?@ . "_")
                                       (?+ . "._.")
                                       (?= . ".:."))))
                       (char-to-string ch)))
                 (www-uri-encode-feature-name feature-name)
                 ""))))
227
228 ;; (defun isd-turtle-format-ccs-code-point (ccs code-point)
229 ;;   (unless (memq ccs isd-turtle-ccs-list)
230 ;;     (setq isd-turtle-ccs-list (cons ccs isd-turtle-ccs-list)))
231 ;;   (format "%s:%s"
232 ;;           (isd-turtle-uri-encode-feature-name ccs)
233 ;;           (format (charset-code-point-format-spec ccs)
234 ;;                   code-point)))
(defun chise-turtle-format-ccs-code-point (ccs code-point)
  "Return \"PREFIX:CODE\" naming CODE-POINT of the coded charset CCS.
PREFIX comes from `chise-turtle-uri-encode-ccs-name' and CODE is
CODE-POINT printed with `charset-code-point-format-spec'.  As a side
effect, (PREFIX . CCS) is added to `chise-turtle-ccs-prefix-alist'
unless an entry for PREFIX is already there."
  (let ((prefix-name (chise-turtle-uri-encode-ccs-name ccs)))
    ;; Remember each prefix once so that the dump can declare it.
    (or (assoc prefix-name chise-turtle-ccs-prefix-alist)
        (setq chise-turtle-ccs-prefix-alist
              (cons (cons prefix-name ccs)
                    chise-turtle-ccs-prefix-alist)))
    (concat prefix-name
            ":"
            (format (charset-code-point-format-spec ccs) code-point))))
245
(defun isd-turtle-encode-char (object)
  "Return a Turtle name for the character OBJECT.
The coded charsets are tried in this order: `=ucs'; each element of
`est-coded-charset-priority-list' (with the `defined-only' flag); the
charset reported by `split-char'.  The first one that yields a code
point is formatted with `chise-turtle-format-ccs-code-point'.  If none
does, a \"system-char-id\" name is returned, written with \"=\" when
`est-hide-cgi-mode' is non-nil and with \":\" otherwise."
  (let ((code (encode-char object '=ucs))
        (candidates est-coded-charset-priority-list)
        charset)
    (if code
        (chise-turtle-format-ccs-code-point '=ucs code)
      ;; Walk the priority list until one charset defines OBJECT.
      (while (and candidates
                  (setq charset (car candidates))
                  (progn
                    (setq candidates (cdr candidates))
                    (null (setq code (encode-char object charset
                                                  'defined-only))))))
      ;; Nothing in the list matched: fall back to the charset that
      ;; `split-char' reports for OBJECT.
      (unless code
        (setq charset (car (split-char object)))
        (setq code (and charset
                        (encode-char object charset))))
      (if code
          (chise-turtle-format-ccs-code-point charset code)
        (format (if est-hide-cgi-mode
                    "system-char-id=0x%X"
                  "system-char-id:0x%X")
                (encode-char object 'system-char-id))))))
267
(defun isd-turtle-format-component (component separator level prefix)
  "Return the Turtle text for one COMPONENT of a structure, or nil.
COMPONENT is either a character or a cons (a character specification).
A character, or a specification that `find-char' resolves to one, is
written as \"NAME SEPARATOR # CHAR\".  An unresolved specification
holding an `ideographic-structure' entry is formatted recursively by
`isd-turtle-format-char' at LEVEL + 1 with PREFIX, followed by \" ;\"
when SEPARATOR is ?\\;.  Anything else yields nil."
  (let ((char (cond ((characterp component)
                     component)
                    ((consp component)
                     (find-char component))))
        structure)
    (cond
     (char
      (format "%s %c # %c"
              (isd-turtle-encode-char char)
              separator
              char))
     ((and (consp component)
           (setq structure (assq 'ideographic-structure component)))
      (let ((nested (isd-turtle-format-char nil nil (cdr structure)
                                            (1+ level) prefix)))
        (if (eq separator ?\;)
            (format "%s ;" nested)
          nested))))))
287
(defun isd-turtle--format-member (indent prefix position component
                                         separator level)
  "Format one \"PREFIX:POSITION COMPONENT\" line of a structure node.
INDENT is the leading whitespace string, POSITION the position symbol,
SEPARATOR the character handed to `isd-turtle-format-component' and
LEVEL the nesting level of the enclosing node."
  (format "%s        %s:%-8s %s"
          indent prefix position
          (isd-turtle-format-component component separator
                                       (1+ level) prefix)))

(defun isd-turtle-format-char (ccs code-point &optional ids-list level
                                   prefix without-head-char)
  "Return a Turtle description of an ideographic structure, or nil.

CCS and CODE-POINT name the head character (decoded with
`decode-char'); both may be nil for an anonymous node.  IDS-LIST is
the structure itself, a list of an operator followed by up to three
components; when it is nil the `ideographic-structure' attribute of
the head character is used.  LEVEL is the nesting level (four spaces
of indentation each, default 0).  PREFIX is the string written before
\":\" in property names (default \"\").  If WITHOUT-HEAD-CHAR is
non-nil, neither the head line nor a closing bracket is written.

The result starts with the head character's name followed by
\"   # CHAR\", or with \"[\" when there is no head character; in the
latter case a matching \"]\" line ends it.  Nil is returned when no
structure operator is available.

An operator that is none of the recognized ones is still written, but
its position names then print as \"nil\"."
  (unless level
    (setq level 0))
  (unless prefix
    (setq prefix ""))
  (let ((indent (make-string (* level 4) ?\ ))
        ;; Decode the head character even when IDS-LIST is supplied:
        ;; the head line prints it with %c and the bracket handling
        ;; depends on it.
        (char (and ccs code-point
                   (decode-char ccs code-point)))
        idc idc-str
        p1 p2 p3
        c1 c2 c3
        ret)
    (if (and (null ids-list) char)
        (setq ids-list (get-char-attribute char 'ideographic-structure)))
    (setq idc (car ids-list)
          c1 (nth 1 ids-list)
          c2 (nth 2 ids-list)
          c3 (nth 3 ids-list))
    (if (char-ref-p idc)
        (setq idc (plist-get idc :char)))
    ;; Replace character specifications by the characters they denote
    ;; whenever `find-char' can resolve them.
    (if (and (consp idc)
             (setq ret (find-char idc)))
        (setq idc ret))
    (if (and (consp c1)
             (setq ret (find-char c1)))
        (setq c1 ret))
    (if (and (consp c2)
             (setq ret (find-char c2)))
        (setq c2 ret))
    (if (and (consp c3)
             (setq ret (find-char c3)))
        (setq c3 ret))
    ;; Map the operator to the position names of its components.
    (cond
     ((eq idc ?\u2FF0) ; ⿰
      (setq p1 'left
            p2 'right))
     ((eq idc ?⿱)
      (setq p1 'above
            p2 'below))
     ((eq idc ?⿲)
      (setq p1 'left
            p2 'middle
            p3 'right))
     ((eq idc ?⿳)
      (setq p1 'above
            p2 'middle
            p3 'below))
     ((memq idc '(?⿴ ?⿵ ?⿶ ?⿷ ?⿸ ?⿹ ?⿺))
      (setq p1 'surround
            p2 'filling))
     ((eq idc ?⿻)
      (setq p1 'underlying
            p2 'overlaying))
     ;; The operator with code point 305 in `=>iwds-1' is written with
     ;; a fixed name and treated as an above/below composition.
     ((and idc (eq (encode-char idc '=>iwds-1) 305))
      (setq idc-str "⿱・⿸")
      (setq p1 'above
            p2 'below)))
    (when idc
      (concat
       ;; Head line.
       (cond (without-head-char
              "")
             (char
              (format "%s   # %c"
                      (chise-turtle-format-ccs-code-point ccs code-point)
                      char))
             (t
              "["))
       "\n"
       (format "%s    %s:structure [ a idc:%s ;"
               indent prefix (or idc-str (char-to-string idc)))
       "\n"
       (isd-turtle--format-member indent prefix p1 c1 ?\; level)
       "\n"
       ;; The last member of the node is the only one without ";".
       (isd-turtle--format-member indent prefix p2 c2
                                  (if p3 ?\; ?\ ) level)
       "\n"
       (if p3
           (concat (isd-turtle--format-member indent prefix p3 c3
                                              ?\  level)
                   "\n")
         "")
       indent "    ]"
       ;; Close the anonymous node opened by the \"[\" head.
       (cond (without-head-char
              "")
             (char
              "")
             (t
              (format "\n%s]" indent)))))))
405
(defun isd-turtle-insert-char (ccs code-point)
  "Insert the Turtle statement for CODE-POINT of CCS at point.
The text comes from `isd-turtle-format-char' and is terminated with
\" .\" and a newline.  Nothing is inserted when that function returns
nil."
  (let ((description (isd-turtle-format-char ccs code-point)))
    (if description
        (insert description " .\n"))))
411
(defun isd-turtle-insert-ccs-ranges (ccs &rest ranges)
  "Insert Turtle statements for the code points of CCS given by RANGES.
Each element of RANGES is either an integer code point or a cons
\(FIRST . LAST) denoting the inclusive range FIRST..LAST; a range whose
FIRST exceeds LAST inserts nothing.  Every code point is handed to
`isd-turtle-insert-char'.  An element of any other type signals
`wrong-type-argument'; elements before it have already been processed
at that point.  Returns nil."
  (dolist (range ranges)
    (cond ((consp range)
           (let ((code (car range))
                 (max-code (cdr range)))
             (while (<= code max-code)
               (isd-turtle-insert-char ccs code)
               (setq code (1+ code)))))
          ((integerp range)
           (isd-turtle-insert-char ccs range))
          (t
           ;; `error' expects a format string in GNU Emacs, so raise
           ;; the condition with `signal' and the conventional
           ;; (PREDICATE VALUE) data.
           (signal 'wrong-type-argument (list 'integerp range))))))
428
(defun isd-turtle-dump-range (file path func &rest args)
  "Generate a Turtle file by calling FUNC with ARGS in a temporary buffer.
FUNC is expected to insert the statements at point.  The buffer is then
prefixed with a coding cookie, the fixed \":\" and \"idc:\" prefix
declarations, and one @prefix line for every entry that FUNC added to
`chise-turtle-ccs-prefix-alist' (sorted on the CCS symbols with
`char-attribute-name<'), followed by an empty line.  The result is
written to PATH with the `utf-8-mcs-er' coding system; if PATH is a
directory, the file FILE inside it is written instead."
  (with-temp-buffer
    (let ((coding-system-for-write 'utf-8-mcs-er)
          (target (if (file-directory-p path)
                      (expand-file-name file path)
                    path))
          ;; Fresh binding: collect only the prefixes used by this dump.
          chise-turtle-ccs-prefix-alist)
      (apply func args)
      ;; The prefixes are only known once FUNC has run, so the header
      ;; is put in front of the body afterwards.
      (goto-char (point-min))
      (insert
       "# -*- coding: utf-8-mcs-er -*-\n"
       "@prefix : <http://rdf.chise.org/rdf/property/character/isd/> .\n"
       "@prefix idc: <http://rdf.chise.org/rdf/type/character/idc/> .\n"
       (mapconcat
        (lambda (cell)
          (format "@prefix %s: <%s/%s=> .\n"
                  (car cell)
                  "http://www.chise.org/est/view/character"
                  (www-uri-encode-feature-name (cdr cell))))
        (sort chise-turtle-ccs-prefix-alist
              (lambda (a b)
                (char-attribute-name< (cdr a) (cdr b))))
        "")
       "\n")
      (write-region (point-min) (point-max) target))))
457
458 ;;;###autoload
(defun isd-turtle-dump-ucs-basic (filename)
  "Dump the structures of `=ucs' code points #x4E00-#x9FA5 as Turtle.
The output goes to FILENAME, or to ISD-UCS-Basic.ttl inside FILENAME
when that is a directory."
  (interactive "Fdump ISD-UCS-Basic : ")
  (let ((range '(#x4E00 . #x9FA5)))
    (isd-turtle-dump-range "ISD-UCS-Basic.ttl" filename
                           #'isd-turtle-insert-ccs-ranges
                           '=ucs range)))
464
465 ;;;###autoload
(defun isd-turtle-dump-ucs-ext-a (filename)
  "Dump the structures of `=ucs' #x3400-#x4DB5, #xFA1F and #xFA23.
The output goes to FILENAME, or to ISD-UCS-Ext-A.ttl inside FILENAME
when that is a directory."
  (interactive "Fdump ISD-UCS-Ext-A : ")
  (apply #'isd-turtle-dump-range
         "ISD-UCS-Ext-A.ttl" filename
         #'isd-turtle-insert-ccs-ranges
         '=ucs
         '((#x3400 . #x4DB5) #xFA1F #xFA23)))
471
472 ;;;###autoload
(defun isd-turtle-dump-ucs-ext-b-1 (filename)
  "Dump the structures of `=ucs' code points #x20000-#x21FFF as Turtle.
The output goes to FILENAME, or to ISD-UCS-Ext-B-1.ttl inside FILENAME
when that is a directory."
  (interactive "Fdump ISD-UCS-Ext-B-1 : ")
  ;; Use `=ucs', the symbol `isd-turtle-encode-char' uses, so that the
  ;; subjects and the component references share one Turtle prefix.
  (isd-turtle-dump-range "ISD-UCS-Ext-B-1.ttl" filename
                         #'isd-turtle-insert-ccs-ranges
                         '=ucs '(#x20000 . #x21FFF)))
478
479 ;;;###autoload
(defun isd-turtle-dump-ucs-ext-b-2 (filename)
  "Dump the structures of `=ucs' code points #x22000-#x23FFF as Turtle.
The output goes to FILENAME, or to ISD-UCS-Ext-B-2.ttl inside FILENAME
when that is a directory."
  (interactive "Fdump ISD-UCS-Ext-B-2 : ")
  ;; Use `=ucs', the symbol `isd-turtle-encode-char' uses, so that the
  ;; subjects and the component references share one Turtle prefix.
  (isd-turtle-dump-range "ISD-UCS-Ext-B-2.ttl" filename
                         #'isd-turtle-insert-ccs-ranges
                         '=ucs '(#x22000 . #x23FFF)))
485
486 ;;;###autoload
(defun isd-turtle-dump-ucs-ext-b-3 (filename)
  "Dump the structures of `=ucs' code points #x24000-#x25FFF as Turtle.
The output goes to FILENAME, or to ISD-UCS-Ext-B-3.ttl inside FILENAME
when that is a directory."
  (interactive "Fdump ISD-UCS-Ext-B-3 : ")
  ;; Use `=ucs', the symbol `isd-turtle-encode-char' uses, so that the
  ;; subjects and the component references share one Turtle prefix.
  (isd-turtle-dump-range "ISD-UCS-Ext-B-3.ttl" filename
                         #'isd-turtle-insert-ccs-ranges
                         '=ucs '(#x24000 . #x25FFF)))
492
493 ;;;###autoload
(defun isd-turtle-dump-ucs-ext-b-4 (filename)
  "Dump the structures of `=ucs' code points #x26000-#x27FFF as Turtle.
The output goes to FILENAME, or to ISD-UCS-Ext-B-4.ttl inside FILENAME
when that is a directory."
  (interactive "Fdump ISD-UCS-Ext-B-4 : ")
  ;; Use `=ucs', the symbol `isd-turtle-encode-char' uses, so that the
  ;; subjects and the component references share one Turtle prefix.
  (isd-turtle-dump-range "ISD-UCS-Ext-B-4.ttl" filename
                         #'isd-turtle-insert-ccs-ranges
                         '=ucs '(#x26000 . #x27FFF)))
499
500 ;;;###autoload
(defun isd-turtle-dump-ucs-ext-b-5 (filename)
  "Dump the structures of `=ucs' code points #x28000-#x29FFF as Turtle.
The output goes to FILENAME, or to ISD-UCS-Ext-B-5.ttl inside FILENAME
when that is a directory."
  (interactive "Fdump ISD-UCS-Ext-B-5 : ")
  ;; Use `=ucs', the symbol `isd-turtle-encode-char' uses, so that the
  ;; subjects and the component references share one Turtle prefix.
  (isd-turtle-dump-range "ISD-UCS-Ext-B-5.ttl" filename
                         #'isd-turtle-insert-ccs-ranges
                         '=ucs '(#x28000 . #x29FFF)))
506
507 ;;;###autoload
(defun isd-turtle-dump-ucs-ext-b-6 (filename)
  "Dump the structures of `=ucs' code points #x2A000-#x2A6D6 as Turtle.
The output goes to FILENAME, or to ISD-UCS-Ext-B-6.ttl inside FILENAME
when that is a directory."
  (interactive "Fdump ISD-UCS-Ext-B-6 : ")
  ;; Use `=ucs', the symbol `isd-turtle-encode-char' uses, so that the
  ;; subjects and the component references share one Turtle prefix.
  (isd-turtle-dump-range "ISD-UCS-Ext-B-6.ttl" filename
                         #'isd-turtle-insert-ccs-ranges
                         '=ucs '(#x2A000 . #x2A6D6)))
513
514 ;;;###autoload
(defun isd-turtle-dump-ucs-ext-c (filename)
  "Dump the structures of `=ucs' code points #x2A700-#x2B734 as Turtle.
The output goes to FILENAME, or to ISD-UCS-Ext-C.ttl inside FILENAME
when that is a directory."
  (interactive "Fdump ISD-UCS-Ext-C : ")
  ;; Use `=ucs', the symbol `isd-turtle-encode-char' uses, so that the
  ;; subjects and the component references share one Turtle prefix.
  (isd-turtle-dump-range "ISD-UCS-Ext-C.ttl" filename
                         #'isd-turtle-insert-ccs-ranges
                         '=ucs '(#x2A700 . #x2B734)))
520
521 ;;;###autoload
(defun isd-turtle-dump-ucs-ext-d (filename)
  "Dump the structures of `=ucs' code points #x2B740-#x2B81D as Turtle.
The output goes to FILENAME, or to ISD-UCS-Ext-D.ttl inside FILENAME
when that is a directory."
  (interactive "Fdump ISD-UCS-Ext-D : ")
  ;; Use `=ucs', the symbol `isd-turtle-encode-char' uses, so that the
  ;; subjects and the component references share one Turtle prefix.
  (isd-turtle-dump-range "ISD-UCS-Ext-D.ttl" filename
                         #'isd-turtle-insert-ccs-ranges
                         '=ucs '(#x2B740 . #x2B81D)))
527
528 ;;;###autoload
(defun isd-turtle-dump-ucs-ext-e (filename)
  "Dump the structures of `=ucs' code points #x2B820-#x2CEA1 as Turtle.
The output goes to FILENAME, or to ISD-UCS-Ext-E.ttl inside FILENAME
when that is a directory."
  (interactive "Fdump ISD-UCS-Ext-E : ")
  ;; Use `=ucs', the symbol `isd-turtle-encode-char' uses, so that the
  ;; subjects and the component references share one Turtle prefix.
  (isd-turtle-dump-range "ISD-UCS-Ext-E.ttl" filename
                         #'isd-turtle-insert-ccs-ranges
                         '=ucs '(#x2B820 . #x2CEA1)))
534
535 ;;;###autoload
(defun isd-turtle-dump-mj-0 (filename)
  "Dump the structures of `=mj' code points 1-9999 as Turtle.
The output goes to FILENAME, or to ISD-MJ-0.ttl inside FILENAME when
that is a directory."
  (interactive "Fdump ISD-MJ-0 : ")
  (let ((range '(1 . 9999)))
    (isd-turtle-dump-range "ISD-MJ-0.ttl" filename
                           #'isd-turtle-insert-ccs-ranges
                           '=mj range)))
541
542 ;;;###autoload
(defun isd-turtle-dump-mj-1 (filename)
  "Dump the structures of `=mj' code points 10000-19999 as Turtle.
The output goes to FILENAME, or to ISD-MJ-1.ttl inside FILENAME when
that is a directory."
  (interactive "Fdump ISD-MJ-1 : ")
  (let ((range '(10000 . 19999)))
    (isd-turtle-dump-range "ISD-MJ-1.ttl" filename
                           #'isd-turtle-insert-ccs-ranges
                           '=mj range)))
548
549 ;;;###autoload
(defun isd-turtle-dump-mj-2 (filename)
  "Dump the structures of `=mj' code points 20000-29999 as Turtle.
The output goes to FILENAME, or to ISD-MJ-2.ttl inside FILENAME when
that is a directory."
  (interactive "Fdump ISD-MJ-2 : ")
  (let ((range '(20000 . 29999)))
    (isd-turtle-dump-range "ISD-MJ-2.ttl" filename
                           #'isd-turtle-insert-ccs-ranges
                           '=mj range)))
555
556 ;;;###autoload
(defun isd-turtle-dump-mj-3 (filename)
  "Dump the structures of `=mj' code points 30000-39999 as Turtle.
The output goes to FILENAME, or to ISD-MJ-3.ttl inside FILENAME when
that is a directory."
  (interactive "Fdump ISD-MJ-3 : ")
  (let ((range '(30000 . 39999)))
    (isd-turtle-dump-range "ISD-MJ-3.ttl" filename
                           #'isd-turtle-insert-ccs-ranges
                           '=mj range)))
562
563 ;;;###autoload
(defun isd-turtle-dump-mj-4 (filename)
  "Dump the structures of `=mj' code points 40000-49999 as Turtle.
The output goes to FILENAME, or to ISD-MJ-4.ttl inside FILENAME when
that is a directory."
  (interactive "Fdump ISD-MJ-4 : ")
  (let ((range '(40000 . 49999)))
    (isd-turtle-dump-range "ISD-MJ-4.ttl" filename
                           #'isd-turtle-insert-ccs-ranges
                           '=mj range)))
569
570 ;;;###autoload
(defun isd-turtle-dump-mj-5 (filename)
  "Dump the structures of `=mj' code points 50000-59999 as Turtle.
The output goes to FILENAME, or to ISD-MJ-5.ttl inside FILENAME when
that is a directory."
  (interactive "Fdump ISD-MJ-5 : ")
  (let ((range '(50000 . 59999)))
    (isd-turtle-dump-range "ISD-MJ-5.ttl" filename
                           #'isd-turtle-insert-ccs-ranges
                           '=mj range)))
576
577 ;;;###autoload
(defun isd-turtle-dump-mj-6 (filename)
  "Dump the structures of `=mj' code points 60000-69999 as Turtle.
The output goes to FILENAME, or to ISD-MJ-6.ttl inside FILENAME when
that is a directory."
  (interactive "Fdump ISD-MJ-6 : ")
  (let ((range '(60000 . 69999)))
    (isd-turtle-dump-range "ISD-MJ-6.ttl" filename
                           #'isd-turtle-insert-ccs-ranges
                           '=mj range)))
583
584 ;;;###autoload
(defun isd-turtle-dump-all (directory)
  "Run every ISD Turtle dump command of this file on DIRECTORY.
The UCS dumps (Basic, Ext-A, Ext-B-1 to Ext-B-6, Ext-C, Ext-D, Ext-E)
run first, followed by the MJ dumps 0 to 6.  Each command receives
DIRECTORY as its file argument.  The value of the last command is
returned."
  (interactive "DISD directory : ")
  (let (result)
    (dolist (dumper '(isd-turtle-dump-ucs-basic
                      isd-turtle-dump-ucs-ext-a
                      isd-turtle-dump-ucs-ext-b-1
                      isd-turtle-dump-ucs-ext-b-2
                      isd-turtle-dump-ucs-ext-b-3
                      isd-turtle-dump-ucs-ext-b-4
                      isd-turtle-dump-ucs-ext-b-5
                      isd-turtle-dump-ucs-ext-b-6
                      isd-turtle-dump-ucs-ext-c
                      isd-turtle-dump-ucs-ext-d
                      isd-turtle-dump-ucs-ext-e
                      isd-turtle-dump-mj-0
                      isd-turtle-dump-mj-1
                      isd-turtle-dump-mj-2
                      isd-turtle-dump-mj-3
                      isd-turtle-dump-mj-4
                      isd-turtle-dump-mj-5
                      isd-turtle-dump-mj-6))
      (setq result (funcall dumper directory)))
    result))
606
607
608 ;;; @ End.
609 ;;;
610
611 (provide 'isd-turtle)
612
613 ;;; isd-turtle.el ends here