Sync up with the current environment.
[chise/isd.git] / isd-turtle.el
1 ;;; isd-turtle.el --- Utility to dump ideographic-structure as Turtle files
2
3 ;; Copyright (C) 2017 MORIOKA Tomohiko
4
5 ;; Author: MORIOKA Tomohiko <tomo@kanji.zinbun.kyoto-u.ac.jp>
6 ;; Keywords: Ideographic Structures (漢字構造、解字), IDS, CHISE, RDF, Turtle
7
8 ;; This file is a part of CHISE-ISD (Ideographic Structure Database).
9
10 ;; This program is free software; you can redistribute it and/or
11 ;; modify it under the terms of the GNU General Public License as
12 ;; published by the Free Software Foundation; either version 2, or (at
13 ;; your option) any later version.
14
15 ;; This program is distributed in the hope that it will be useful, but
16 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 ;; General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with this program; see the file COPYING.  If not, write to
22 ;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
24
25 ;;; Code:
26
27 (require 'cwiki-common)
28
29 (defvar isd-url-prefix "http://rdf.chise.org/data/")
30
31 (setq est-coded-charset-priority-list
32   '(; =ucs
33     =mj
34     =adobe-japan1-0
35     =adobe-japan1-1
36     =adobe-japan1-2
37     =adobe-japan1-3
38     =adobe-japan1-4
39     =adobe-japan1-5
40     =adobe-japan1-6
41     =ucs@iso
42     =jis-x0208 =jis-x0208@1990
43     =jis-x0213-1
44     =jis-x0213-1@2000 =jis-x0213-1@2004
45     =jis-x0213-2
46     =jis-x0212
47     =gt
48     =hanyo-denshi/ks
49     =hanyo-denshi/tk
50     =ucs-itaiji-001
51     =ucs-itaiji-002
52     =ucs-itaiji-003
53     =ucs-itaiji-004
54     =ucs-itaiji-005
55     =ucs-itaiji-006
56     =ucs-itaiji-007
57     =ucs-itaiji-008
58     =ucs-itaiji-009
59     =ucs-itaiji-010
60     =ucs-itaiji-084
61     =ucs-var-001
62     =ucs-var-002
63     =ucs-var-003
64     =ucs-var-004
65     =cns11643-1 =cns11643-2 =cns11643-3
66     =cns11643-4 =cns11643-5 =cns11643-6 =cns11643-7
67     =gb2312
68     =big5-cdp
69     =ks-x1001
70     =gt-k
71     =ucs@unicode
72     =ucs@JP/hanazono
73     =gb12345
74     =ucs@cns
75     =ucs@gb
76     =zinbun-oracle =>zinbun-oracle
77     =daikanwa
78     =ruimoku-v6
79     =cbeta =jef-china3
80     =daikanwa/+2p
81     =+>ucs@iso =+>ucs@unicode
82     =+>ucs@jis
83     =+>ucs@cns
84     =+>ucs@ks
85     =+>ucs@jis/1990
86     =>mj
87     =>jis-x0208 =>jis-x0213-1
88     =>jis-x0208@1997
89     =>ucs@iwds-1
90     =>ucs@cognate
91     =>ucs@component
92     =>iwds-1
93     =>ucs@iso
94     =>ucs@unicode
95     =>ucs@jis =>ucs@cns =>ucs@ks
96     =>gt
97     =>gt-k
98     =>>ucs@iso =>>ucs@unicode
99     =>>ucs@jis =>>ucs@cns =>>ucs@ks
100     =>>gt-k
101     =>>hanyo-denshi/ks
102     ==mj
103     ==ucs@iso
104     ==ucs@unicode
105     ==adobe-japan1-0
106     ==adobe-japan1-1
107     ==adobe-japan1-2
108     ==adobe-japan1-3
109     ==adobe-japan1-4
110     ==adobe-japan1-5
111     ==adobe-japan1-6
112     ==ks-x1001
113     ==hanyo-denshi/ks
114     ==hanyo-denshi/tk
115     ==ucs@jis
116     ==gt
117     ==cns11643-1 ==cns11643-2 ==cns11643-3
118     ==cns11643-4 ==cns11643-5 ==cns11643-6 ==cns11643-7
119     ==jis-x0212
120     ==ucs@cns
121     ==koseki
122     ==daikanwa
123     ==gt-k
124     ==ucs@gb
125     ==ucs-itaiji-003
126     ==ucs@JP/hanazono
127     ==daikanwa/+2p
128     =>>jis-x0208 =>>jis-x0213-1 =>>jis-x0213-2
129     =+>jis-x0208 =+>jis-x0213-1 =+>jis-x0213-2
130     =+>hanyo-denshi/jt
131     =+>jis-x0208@1978
132     =>>gt
133     =+>adobe-japan1
134     =>>adobe-japan1
135     =jis-x0208@1983 =jis-x0208@1978
136     =>ucs-itaiji-001
137     =>ucs-itaiji-002
138     =>ucs-itaiji-003
139     =>ucs-itaiji-004
140     =>ucs-itaiji-005
141     =>ucs-itaiji-006
142     =>ucs-itaiji-007
143     ==>ucs@bucs
144     =big5
145     =>cbeta
146     ===mj
147     ===ucs@iso
148     ===ucs@unicode
149     ===hanyo-denshi/ks
150     ===ks-x1001
151     ===gt
152     ===gt-k
153     ===ucs@ks
154     ===ucs@gb
155     =shinjigen
156     =shinjigen@rev
157     =shinjigen@1ed
158     =shinjigen/+p@rev
159     ==shinjigen
160     ==shinjigen@rev
161     ==daikanwa/+p
162     ==shinjigen@1ed
163     ===daikanwa/+p
164     =>daikanwa/ho
165     ===daikanwa/ho
166     ))
167
168 ;; (defvar isd-turtle-ccs-list nil)
169 (defvar chise-turtle-ccs-prefix-alist nil)
170
171 (defun charset-code-point-format-spec (ccs)
172   (cond ((memq ccs '(=ucs))
173          "0x%04X")
174         (t
175          (let ((ccs-name (symbol-name ccs)))
176            (cond
177             ((string-match
178               "\\(shinjigen\\|daikanwa/ho\\|=>iwds-1\\)"
179               ccs-name)
180              "%04d")
181             ((string-match
182               "\\(gt\\|daikanwa\\|adobe-japan1\\|cbeta\\|zinbun-oracle\\|hng\\)"
183               ccs-name)
184              "%05d")
185             ((string-match "\\(hanyo-denshi/ks\\|koseki\\|mj\\)" ccs-name)
186              "%06d")
187             ((string-match "hanyo-denshi/tk" ccs-name)
188              "%08d")
189             (t
190              "0x%X"))))))
191
192 ;; (defun isd-turtle-uri-encode-feature-name (feature-name)
193 ;;   (cond
194 ;;    ((eq '=ucs feature-name)
195 ;;     "a.ucs")
196 ;;    ((eq '==>ucs@bucs feature-name)
197 ;;     "bucs")
198 ;;    (t
199 ;;     (mapconcat (lambda (c)
200 ;;                  (if (eq c ?@)
201 ;;                      "_"
202 ;;                    (char-to-string c)))
203 ;;                (www-uri-encode-feature-name feature-name)
204 ;;                ""))))
205 (defun chise-turtle-uri-encode-ccs-name (feature-name)
206   (cond
207    ((eq '=ucs feature-name)
208     "a.ucs")
209    ((eq '=big5 feature-name)
210     "a.big5")
211    ((eq '==>ucs@bucs feature-name)
212     "bucs")
213    (t
214     (mapconcat (lambda (c)
215                  (cond
216                   ((eq c ?@)
217                    "_")
218                   ((eq c ?+)
219                    "._.")
220                   ((eq c ?=)
221                    ".:.")
222                   (t
223                    (char-to-string c))))
224                (www-uri-encode-feature-name feature-name)
225                ""))))
226
227 ;; (defun isd-turtle-format-ccs-code-point (ccs code-point)
228 ;;   (unless (memq ccs isd-turtle-ccs-list)
229 ;;     (setq isd-turtle-ccs-list (cons ccs isd-turtle-ccs-list)))
230 ;;   (format "%s:%s"
231 ;;           (isd-turtle-uri-encode-feature-name ccs)
232 ;;           (format (charset-code-point-format-spec ccs)
233 ;;                   code-point)))
234 (defun chise-turtle-format-ccs-code-point (ccs code-point)
235   (let ((ccs-uri (chise-turtle-uri-encode-ccs-name ccs)))
236     (unless (assoc ccs-uri chise-turtle-ccs-prefix-alist)
237       (setq chise-turtle-ccs-prefix-alist
238             (cons (cons ccs-uri ccs)
239                   chise-turtle-ccs-prefix-alist)))
240     (format "%s:%s"
241             ccs-uri
242             (format (charset-code-point-format-spec ccs)
243                     code-point))))
244
245 (defun isd-turtle-encode-char (object)
246   (let ((ccs-list est-coded-charset-priority-list)
247         ccs ret)
248     (if (setq ret (encode-char object '=ucs))
249         (chise-turtle-format-ccs-code-point '=ucs ret)
250       (while (and ccs-list
251                   (setq ccs (pop ccs-list))
252                   (not (setq ret (encode-char object ccs 'defined-only)))))
253       (cond (ret
254              (chise-turtle-format-ccs-code-point ccs ret)
255              )
256             ((and (setq ccs (car (split-char object)))
257                   (setq ret (encode-char object ccs)))
258              (chise-turtle-format-ccs-code-point ccs ret)
259              )
260             (t
261              (format (if est-hide-cgi-mode
262                          "system-char-id=0x%X"
263                        "system-char-id:0x%X")
264                      (encode-char object 'system-char-id))
265              )))))
266
267 (defun isd-turtle-format-component (component separator level prefix)
268   (cond ((characterp component)
269          (format "%s %c # %c"
270                  (isd-turtle-encode-char component)
271                  separator
272                  component)
273          )
274         ((consp component)
275          (let ((ret (find-char component)))
276            (cond (ret
277                   (format "%s %c # %c"
278                           (isd-turtle-encode-char ret) separator ret))
279                  ((setq ret (assq 'ideographic-structure component))
280                   (if (eq separator ?\;)
281                       (format "%s ;"
282                               (isd-turtle-format-char nil nil (cdr ret) (1+ level)
283                                                       prefix))
284                     (isd-turtle-format-char nil nil (cdr ret) (1+ level)
285                                             prefix))))))))
286
287 (defun isd-turtle-format-char (ccs code-point &optional ids-list level
288                                    prefix without-head-char)
289   (unless level
290     (setq level 0))
291   (unless prefix
292     (setq prefix ""))
293   (let ((indent (make-string (* level 4) ?\ ))
294         char
295         idc
296         p1 p2 p3
297         c1 c2 c3
298         ret)
299     (unless ids-list
300       (if (and ccs code-point
301                (setq char (decode-char ccs code-point)))
302           (setq ids-list (get-char-attribute char 'ideographic-structure))))
303     (setq idc (car ids-list))
304     (setq c1 (nth 1 ids-list)
305           c2 (nth 2 ids-list)
306           c3 (nth 3 ids-list))
307     (if (char-ref-p idc)
308         (setq idc (plist-get idc :char)))
309     (if (and (consp idc)
310              (setq ret (find-char idc)))
311         (setq idc ret))
312     (if (and (consp c1)
313              (setq ret (find-char c1)))
314         (setq c1 ret))
315     (if (and (consp c2)
316              (setq ret (find-char c2)))
317         (setq c2 ret))
318     (if (and (consp c3)
319              (setq ret (find-char c3)))
320         (setq c3 ret))
321     (cond
322      ((eq idc ?\u2FF0) ; ⿰
323       (setq p1 'left
324             p2 'right)
325       )
326      ((eq idc ?⿱)
327       (setq p1 'above
328             p2 'below)
329       )
330      ((eq idc ?⿲)
331       (setq p1 'left
332             p2 'middle
333             p3 'right)
334       )
335      ((eq idc ?⿳)
336       (setq p1 'above
337             p2 'middle
338             p3 'below)
339       )
340      ((memq idc '(?⿴ ?⿵ ?⿶ ?⿷ ?⿸ ?⿹ ?⿺))
341       (setq p1 'surround
342             p2 'filling)
343       )
344      ((eq idc ?⿻)
345       (setq p1 'underlying
346             p2 'overlaying)
347       ))
348     (cond
349      (p3
350       (format "%s
351 %s    %s:structure [ a idc:%c ;
352 %s        %s:%-8s %s
353 %s        %s:%-8s %s
354 %s        %s:%-8s %s
355 %s    ]%s"
356               (if without-head-char
357                   ""
358                 (if (and ccs code-point)
359                     (format "%s   # %c"
360                             (chise-turtle-format-ccs-code-point ccs code-point)
361                             char)
362                   "["))
363               indent prefix idc
364               indent prefix p1 (isd-turtle-format-component c1 ?\; (1+ level) prefix)
365               indent prefix p2 (isd-turtle-format-component c2 ?\; (1+ level) prefix)
366               indent prefix p3 (isd-turtle-format-component c3 ?\  (1+ level) prefix)
367               indent
368               (if without-head-char
369                   ""
370                 (if (null char)
371                     (format "\n%s]"
372                             indent)
373                   "")))
374       )
375      (idc
376       (format "%s
377 %s    %s:structure [ a idc:%c ;
378 %s        %s:%-8s %s
379 %s        %s:%-8s %s
380 %s    ]%s"
381               (if without-head-char
382                   ""
383                 (if (and ccs code-point)
384                     (format "%s   # %c"
385                             (chise-turtle-format-ccs-code-point ccs code-point)
386                             char)
387                   "["))
388               indent prefix idc
389               indent prefix p1 (isd-turtle-format-component c1 ?\; (1+ level) prefix)
390               indent prefix p2 (isd-turtle-format-component c2 ?\  (1+ level) prefix)
391               indent
392               (if without-head-char
393                   ""
394                 (if (null char)
395                     (format "\n%s]"
396                             indent)
397                   "")))))
398     ))
399
400 (defun isd-turtle-insert-char (ccs code-point)
401   (let ((ret (isd-turtle-format-char ccs code-point)))
402     (when ret
403       (insert ret)
404       (insert " .\n"))))
405
406 (defun isd-turtle-insert-ccs-ranges (ccs &rest ranges)
407   (let (range code max-code)
408     (while ranges
409       (setq range (car ranges))
410       (cond ((consp range)
411              (setq code (car range)
412                    max-code (cdr range))
413              (while (<= code max-code)
414                (isd-turtle-insert-char ccs code)
415                (setq code (1+ code)))
416              )
417             ((integerp range)
418              (isd-turtle-insert-char ccs range)
419              )
420             (t (error 'wrong-type-argument range)))
421       (setq ranges (cdr ranges)))))
422
423 (defun isd-turtle-dump-range (file path func &rest args)
424   (with-temp-buffer
425     (let ((coding-system-for-write 'utf-8-mcs-er)
426           ;; isd-turtle-ccs-list
427           chise-turtle-ccs-prefix-alist)
428       (if (file-directory-p path)
429           (setq path (expand-file-name file path)))
430       (apply func args)
431       (goto-char (point-min))
432       ;; (dolist (ccs (sort isd-turtle-ccs-list
433       ;;                    #'char-attribute-name<))
434       ;;   (insert (format "@prefix %s: <%s%s=> .\n"
435       ;;                   (isd-turtle-uri-encode-feature-name ccs)
436       ;;                   "http://www.chise.org/est/view/character/"
437       ;;                   (www-uri-encode-feature-name ccs))))
438       (dolist (cell (sort chise-turtle-ccs-prefix-alist
439                           (lambda (a b)
440                             (char-attribute-name< (cdr a)(cdr b)))))
441         (insert (format "@prefix %s: <%s/%s=> .\n"
442                         (car cell)
443                         "http://www.chise.org/est/view/character"
444                         (www-uri-encode-feature-name (cdr cell)))))
445       (insert "\n")
446       (goto-char (point-min))
447       (insert "# -*- coding: utf-8-mcs-er -*-\n")
448       (insert "@prefix : <http://rdf.chise.org/rdf/property/character/isd/> .
449 @prefix idc: <http://rdf.chise.org/rdf/type/character/idc/> .\n")
450       (write-region (point-min)(point-max) path))))
451
452 ;;;###autoload
453 (defun isd-turtle-dump-ucs-basic (filename)
454   (interactive "Fdump ISD-UCS-Basic : ")
455   (isd-turtle-dump-range "ISD-UCS-Basic.ttl" filename
456                          #'isd-turtle-insert-ccs-ranges
457                          '=ucs '(#x4E00 . #x9FA5)))
458
459 ;;;###autoload
460 (defun isd-turtle-dump-ucs-ext-a (filename)
461   (interactive "Fdump ISD-UCS-Ext-A : ")
462   (isd-turtle-dump-range "ISD-UCS-Ext-A.ttl" filename
463                          #'isd-turtle-insert-ccs-ranges
464                          '=ucs '(#x3400 . #x4DB5) #xFA1F #xFA23))
465
466 ;;;###autoload
467 (defun isd-turtle-dump-ucs-ext-b-1 (filename)
468   (interactive "Fdump IDS-UCS-Ext-B-1 : ")
469   (isd-turtle-dump-range "ISD-UCS-Ext-B-1.ttl" filename
470                          #'isd-turtle-insert-ccs-ranges
471                          'ucs '(#x20000 . #x21FFF)))
472
473 ;;;###autoload
474 (defun isd-turtle-dump-ucs-ext-b-2 (filename)
475   (interactive "Fdump IDS-UCS-Ext-B-2 : ")
476   (isd-turtle-dump-range "ISD-UCS-Ext-B-2.ttl" filename
477                          #'isd-turtle-insert-ccs-ranges
478                          'ucs '(#x22000 . #x23FFF)))
479
480 ;;;###autoload
481 (defun isd-turtle-dump-ucs-ext-b-3 (filename)
482   (interactive "Fdump IDS-UCS-Ext-B-3 : ")
483   (isd-turtle-dump-range "ISD-UCS-Ext-B-3.ttl" filename
484                          #'isd-turtle-insert-ccs-ranges
485                          'ucs '(#x24000 . #x25FFF)))
486
487 ;;;###autoload
488 (defun isd-turtle-dump-ucs-ext-b-4 (filename)
489   (interactive "Fdump IDS-UCS-Ext-B-4 : ")
490   (isd-turtle-dump-range "ISD-UCS-Ext-B-4.ttl" filename
491                          #'isd-turtle-insert-ccs-ranges
492                          'ucs '(#x26000 . #x27FFF)))
493
494 ;;;###autoload
495 (defun isd-turtle-dump-ucs-ext-b-5 (filename)
496   (interactive "Fdump IDS-UCS-Ext-B-5 : ")
497   (isd-turtle-dump-range "ISD-UCS-Ext-B-5.ttl" filename
498                          #'isd-turtle-insert-ccs-ranges
499                          'ucs '(#x28000 . #x29FFF)))
500
501 ;;;###autoload
502 (defun isd-turtle-dump-ucs-ext-b-6 (filename)
503   (interactive "Fdump IDS-UCS-Ext-B-6 : ")
504   (isd-turtle-dump-range "ISD-UCS-Ext-B-6.ttl" filename
505                          #'isd-turtle-insert-ccs-ranges
506                          'ucs '(#x2A000 . #x2A6D6)))
507
508 ;;;###autoload
509 (defun isd-turtle-dump-ucs-ext-c (filename)
510   (interactive "Fdump IDS-UCS-Ext-C : ")
511   (isd-turtle-dump-range "ISD-UCS-Ext-C.ttl" filename
512                          #'isd-turtle-insert-ccs-ranges
513                          'ucs '(#x2A700 . #x2B734)))
514
515 ;;;###autoload
516 (defun isd-turtle-dump-ucs-ext-d (filename)
517   (interactive "Fdump IDS-UCS-Ext-D : ")
518   (isd-turtle-dump-range "ISD-UCS-Ext-D.ttl" filename
519                          #'isd-turtle-insert-ccs-ranges
520                          'ucs '(#x2B740 . #x2B81D)))
521
522 ;;;###autoload
523 (defun isd-turtle-dump-ucs-ext-e (filename)
524   (interactive "Fdump IDS-UCS-Ext-E : ")
525   (isd-turtle-dump-range "ISD-UCS-Ext-E.ttl" filename
526                          #'isd-turtle-insert-ccs-ranges
527                          'ucs '(#x2B820 . #x2CEA1)))
528
529 ;;;###autoload
530 (defun isd-turtle-dump-mj-0 (filename)
531   (interactive "Fdump ISD-MJ-0 : ")
532   (isd-turtle-dump-range "ISD-MJ-0.ttl" filename
533                          #'isd-turtle-insert-ccs-ranges
534                          '=mj '(1 . 9999)))
535
536 ;;;###autoload
537 (defun isd-turtle-dump-mj-1 (filename)
538   (interactive "Fdump ISD-MJ-1 : ")
539   (isd-turtle-dump-range "ISD-MJ-1.ttl" filename
540                          #'isd-turtle-insert-ccs-ranges
541                          '=mj '(10000 . 19999)))
542
543 ;;;###autoload
544 (defun isd-turtle-dump-mj-2 (filename)
545   (interactive "Fdump ISD-MJ-2 : ")
546   (isd-turtle-dump-range "ISD-MJ-2.ttl" filename
547                          #'isd-turtle-insert-ccs-ranges
548                          '=mj '(20000 . 29999)))
549
550 ;;;###autoload
551 (defun isd-turtle-dump-mj-3 (filename)
552   (interactive "Fdump ISD-MJ-3 : ")
553   (isd-turtle-dump-range "ISD-MJ-3.ttl" filename
554                          #'isd-turtle-insert-ccs-ranges
555                          '=mj '(30000 . 39999)))
556
557 ;;;###autoload
558 (defun isd-turtle-dump-mj-4 (filename)
559   (interactive "Fdump ISD-MJ-4 : ")
560   (isd-turtle-dump-range "ISD-MJ-4.ttl" filename
561                          #'isd-turtle-insert-ccs-ranges
562                          '=mj '(40000 . 49999)))
563
564 ;;;###autoload
565 (defun isd-turtle-dump-mj-5 (filename)
566   (interactive "Fdump ISD-MJ-5 : ")
567   (isd-turtle-dump-range "ISD-MJ-5.ttl" filename
568                          #'isd-turtle-insert-ccs-ranges
569                          '=mj '(50000 . 59999)))
570
571 ;;;###autoload
572 (defun isd-turtle-dump-mj-6 (filename)
573   (interactive "Fdump ISD-MJ-6 : ")
574   (isd-turtle-dump-range "ISD-MJ-6.ttl" filename
575                          #'isd-turtle-insert-ccs-ranges
576                          '=mj '(60000 . 69999)))
577
578 ;;;###autoload
579 (defun isd-turtle-dump-all (directory)
580   (interactive "DISD directory : ")
581   (isd-turtle-dump-ucs-basic directory)
582   (isd-turtle-dump-ucs-ext-a directory)
583   (isd-turtle-dump-ucs-ext-b-1 directory)
584   (isd-turtle-dump-ucs-ext-b-2 directory)
585   (isd-turtle-dump-ucs-ext-b-3 directory)
586   (isd-turtle-dump-ucs-ext-b-4 directory)
587   (isd-turtle-dump-ucs-ext-b-5 directory)
588   (isd-turtle-dump-ucs-ext-b-6 directory)
589   (isd-turtle-dump-ucs-ext-c directory)
590   (isd-turtle-dump-ucs-ext-d directory)
591   (isd-turtle-dump-ucs-ext-e directory)
592   (isd-turtle-dump-mj-0 directory)
593   (isd-turtle-dump-mj-1 directory)
594   (isd-turtle-dump-mj-2 directory)
595   (isd-turtle-dump-mj-3 directory)
596   (isd-turtle-dump-mj-4 directory)
597   (isd-turtle-dump-mj-5 directory)
598   (isd-turtle-dump-mj-6 directory)
599   )
600
601
602 ;;; @ End.
603 ;;;
604
605 (provide 'isd-turtle)
606
607 ;;; isd-turtle.el ends here