96201c1dc079b0f5e99087ad7fba0c2679e78557
[chise/isd.git] / isd-turtle.el
1 ;;; isd-turtle.el --- Utility to dump ideographic-structure as Turtle files
2
3 ;; Copyright (C) 2017 MORIOKA Tomohiko
4
5 ;; Author: MORIOKA Tomohiko <tomo@kanji.zinbun.kyoto-u.ac.jp>
6 ;; Keywords: Ideographic Structures (漢字構造、解字), IDS, CHISE, RDF, Turtle
7
8 ;; This file is a part of CHISE-ISD (Ideographic Structure Database).
9
10 ;; This program is free software; you can redistribute it and/or
11 ;; modify it under the terms of the GNU General Public License as
12 ;; published by the Free Software Foundation; either version 2, or (at
13 ;; your option) any later version.
14
15 ;; This program is distributed in the hope that it will be useful, but
16 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 ;; General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with this program; see the file COPYING.  If not, write to
22 ;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
24
25 ;;; Code:
26
27 (require 'cwiki-common)
28
29 (defvar isd-url-prefix "http://rdf.chise.org/data/")
30
31 (setq est-coded-charset-priority-list
32   '(; =ucs
33     =mj
34     =adobe-japan1-0
35     =adobe-japan1-1
36     =adobe-japan1-2
37     =adobe-japan1-3
38     =adobe-japan1-4
39     =adobe-japan1-5
40     =adobe-japan1-6
41     =ucs@iso
42     =jis-x0208 =jis-x0208@1990
43     =jis-x0213-1
44     =jis-x0213-1@2000 =jis-x0213-1@2004
45     =jis-x0213-2
46     =jis-x0212
47     =gt
48     =hanyo-denshi/ks
49     =hanyo-denshi/tk
50     =ucs-itaiji-001
51     =ucs-itaiji-002
52     =ucs-itaiji-003
53     =ucs-itaiji-004
54     =ucs-itaiji-005
55     =ucs-itaiji-006
56     =ucs-itaiji-009
57     =ucs-itaiji-084
58     =ucs-var-001
59     =ucs-var-002
60     =ucs-var-003
61     =ucs-var-004
62     =cns11643-1 =cns11643-2 =cns11643-3
63     =cns11643-4 =cns11643-5 =cns11643-6 =cns11643-7
64     =gb2312
65     =big5-cdp
66     =ks-x1001
67     =gt-k
68     =ucs@unicode
69     =ucs@JP/hanazono
70     =gb12345
71     =ucs@cns
72     =ucs@gb
73     =zinbun-oracle =>zinbun-oracle
74     =daikanwa
75     =ruimoku-v6
76     =cbeta =jef-china3
77     =daikanwa/+2p
78     =+>ucs@iso =+>ucs@unicode
79     =+>ucs@jis
80     =+>ucs@cns
81     =+>ucs@ks
82     =+>ucs@jis/1990
83     =>mj
84     =>jis-x0208 =>jis-x0213-1
85     =>jis-x0208@1997
86     =>ucs@iwds-1
87     =>ucs@component
88     =>iwds-1
89     =>ucs@iso
90     =>ucs@unicode
91     =>ucs@jis =>ucs@cns =>ucs@ks
92     =>gt
93     =>gt-k
94     =>>ucs@iso =>>ucs@unicode
95     =>>ucs@jis =>>ucs@cns =>>ucs@ks
96     =>>gt-k
97     =>>hanyo-denshi/ks
98     ==mj
99     ==ucs@iso
100     ==ucs@unicode
101     ==adobe-japan1-0
102     ==adobe-japan1-1
103     ==adobe-japan1-2
104     ==adobe-japan1-3
105     ==adobe-japan1-4
106     ==adobe-japan1-5
107     ==adobe-japan1-6
108     ==ks-x1001
109     ==hanyo-denshi/ks
110     ==hanyo-denshi/tk
111     ==ucs@jis
112     ==gt
113     ==cns11643-1 ==cns11643-2 ==cns11643-3
114     ==cns11643-4 ==cns11643-5 ==cns11643-6 ==cns11643-7
115     ==jis-x0212
116     ==ucs@cns
117     ==koseki
118     ==daikanwa
119     ==gt-k
120     ==ucs@gb
121     ==ucs-itaiji-003
122     ==ucs@JP/hanazono
123     ==daikanwa/+2p
124     =>>jis-x0208 =>>jis-x0213-1 =>>jis-x0213-2
125     =+>jis-x0208 =+>jis-x0213-1 =+>jis-x0213-2
126     =+>hanyo-denshi/jt
127     =+>jis-x0208@1978
128     =>>gt
129     =+>adobe-japan1
130     =>>adobe-japan1
131     =jis-x0208@1983 =jis-x0208@1978
132     =>ucs-itaiji-001
133     =>ucs-itaiji-002
134     =>ucs-itaiji-005
135     ==>ucs@bucs
136     =big5
137     =>cbeta
138     ===mj
139     ===ucs@iso
140     ===ucs@unicode
141     ===hanyo-denshi/ks
142     ===ks-x1001
143     ===gt
144     ===gt-k
145     ===ucs@ks
146     ===ucs@gb
147     =shinjigen
148     =shinjigen@rev
149     =shinjigen@1ed
150     =shinjigen/+p@rev
151     ==shinjigen
152     ==shinjigen@rev
153     ==daikanwa/+p
154     ==shinjigen@1ed
155     ===daikanwa/+p
156     =>daikanwa/ho
157     ===daikanwa/ho
158     ))
159
160 ;; (defvar isd-turtle-ccs-list nil)
161 (defvar chise-turtle-ccs-prefix-alist nil)
162
163 (defun charset-code-point-format-spec (ccs)
164   (cond ((memq ccs '(=ucs))
165          "0x%04X")
166         ((memq ccs '(=gt
167                      =gt-k =daikanwa =adobe-japan1
168                      =cbeta =zinbun-oracle))
169          "%05d")
170         ((memq ccs '(=hanyo-denshi/ks
171                      =koseki =mj))
172          "%06d")
173         ((memq ccs '(=hanyo-denshi/tk ==hanyo-denshi/tk))
174          "%08d")
175         (t
176          "0x%X")))
177
178 ;; (defun isd-turtle-uri-encode-feature-name (feature-name)
179 ;;   (cond
180 ;;    ((eq '=ucs feature-name)
181 ;;     "a.ucs")
182 ;;    ((eq '==>ucs@bucs feature-name)
183 ;;     "bucs")
184 ;;    (t
185 ;;     (mapconcat (lambda (c)
186 ;;                  (if (eq c ?@)
187 ;;                      "_"
188 ;;                    (char-to-string c)))
189 ;;                (www-uri-encode-feature-name feature-name)
190 ;;                ""))))
191 (defun chise-turtle-uri-encode-ccs-name (feature-name)
192   (cond
193    ((eq '=ucs feature-name)
194     "a.ucs")
195    ((eq '=big5 feature-name)
196     "a.big5")
197    ((eq '==>ucs@bucs feature-name)
198     "bucs")
199    (t
200     (mapconcat (lambda (c)
201                  (cond
202                   ((eq c ?@)
203                    "_")
204                   ((eq c ?+)
205                    "._.")
206                   ((eq c ?=)
207                    ".:.")
208                   (t
209                    (char-to-string c))))
210                (www-uri-encode-feature-name feature-name)
211                ""))))
212
213 ;; (defun isd-turtle-format-ccs-code-point (ccs code-point)
214 ;;   (unless (memq ccs isd-turtle-ccs-list)
215 ;;     (setq isd-turtle-ccs-list (cons ccs isd-turtle-ccs-list)))
216 ;;   (format "%s:%s"
217 ;;           (isd-turtle-uri-encode-feature-name ccs)
218 ;;           (format (charset-code-point-format-spec ccs)
219 ;;                   code-point)))
220 (defun chise-turtle-format-ccs-code-point (ccs code-point)
221   (let ((ccs-uri (chise-turtle-uri-encode-ccs-name ccs)))
222     (unless (assoc ccs-uri chise-turtle-ccs-prefix-alist)
223       (setq chise-turtle-ccs-prefix-alist
224             (cons (cons ccs-uri ccs)
225                   chise-turtle-ccs-prefix-alist)))
226     (format "%s:%s"
227             ccs-uri
228             (format (charset-code-point-format-spec ccs)
229                     code-point))))
230
231 (defun isd-turtle-encode-char (object)
232   (let ((ccs-list est-coded-charset-priority-list)
233         ccs ret)
234     (if (setq ret (encode-char object '=ucs))
235         (chise-turtle-format-ccs-code-point '=ucs ret)
236       (while (and ccs-list
237                   (setq ccs (pop ccs-list))
238                   (not (setq ret (encode-char object ccs 'defined-only)))))
239       (cond (ret
240              (chise-turtle-format-ccs-code-point ccs ret)
241              )
242             ((and (setq ccs (car (split-char object)))
243                   (setq ret (encode-char object ccs)))
244              (chise-turtle-format-ccs-code-point ccs ret)
245              )
246             (t
247              (format (if est-hide-cgi-mode
248                          "system-char-id=0x%X"
249                        "system-char-id:0x%X")
250                      (encode-char object 'system-char-id))
251              )))))
252
253 (defun isd-turtle-format-component (component separator level)
254   (cond ((characterp component)
255          (format "%s %c # %c"
256                  (isd-turtle-encode-char component)
257                  separator
258                  component)
259          )
260         ((consp component)
261          (let ((ret (find-char component)))
262            (cond (ret
263                   (format "%s %c # %c"
264                           (isd-turtle-encode-char ret) separator ret))
265                  ((setq ret (assq 'ideographic-structure component))
266                   (if (eq separator ?\;)
267                       (format "%s ;"
268                               (isd-turtle-format-char nil nil (cdr ret) (1+ level)))
269                     (isd-turtle-format-char nil nil (cdr ret) (1+ level)))))))))
270
271 (defun isd-turtle-format-char (ccs code-point &optional ids-list level
272                                    prefix without-head-char)
273   (unless level
274     (setq level 0))
275   (unless prefix
276     (setq prefix ""))
277   (let ((indent (make-string (* level 4) ?\ ))
278         char
279         idc
280         p1 p2 p3
281         c1 c2 c3
282         ret)
283     (unless ids-list
284       (if (and ccs code-point
285                (setq char (decode-char ccs code-point)))
286           (setq ids-list (get-char-attribute char 'ideographic-structure))))
287     (setq idc (car ids-list))
288     (setq c1 (nth 1 ids-list)
289           c2 (nth 2 ids-list)
290           c3 (nth 3 ids-list))
291     (if (char-ref-p idc)
292         (setq idc (plist-get idc :char)))
293     (if (and (consp idc)
294              (setq ret (find-char idc)))
295         (setq idc ret))
296     (if (and (consp c1)
297              (setq ret (find-char c1)))
298         (setq c1 ret))
299     (if (and (consp c2)
300              (setq ret (find-char c2)))
301         (setq c2 ret))
302     (if (and (consp c3)
303              (setq ret (find-char c3)))
304         (setq c3 ret))
305     (cond
306      ((eq idc ?\u2FF0) ; ⿰
307       (setq p1 'left
308             p2 'right)
309       )
310      ((eq idc ?⿱)
311       (setq p1 'above
312             p2 'below)
313       )
314      ((eq idc ?⿲)
315       (setq p1 'left
316             p2 'middle
317             p3 'right)
318       )
319      ((eq idc ?⿳)
320       (setq p1 'above
321             p2 'middle
322             p3 'below)
323       )
324      ((memq idc '(?⿴ ?⿵ ?⿶ ?⿷ ?⿸ ?⿹ ?⿺))
325       (setq p1 'surround
326             p2 'filling)
327       )
328      ((eq idc ?⿻)
329       (setq p1 'underlying
330             p2 'overlaying)
331       ))
332     (cond
333      (p3
334       (format "%s
335 %s    %s:structure [ a idc:%c ;
336 %s        %s:%-8s %s
337 %s        %s:%-8s %s
338 %s        %s:%-8s %s
339 %s    ]%s"
340               (if without-head-char
341                   ""
342                 (if (and ccs code-point)
343                     (format "%s   # %c"
344                             (chise-turtle-format-ccs-code-point ccs code-point)
345                             char)
346                   "["))
347               indent prefix idc
348               indent prefix p1 (isd-turtle-format-component c1 ?\; (1+ level))
349               indent prefix p2 (isd-turtle-format-component c2 ?\; (1+ level))
350               indent prefix p3 (isd-turtle-format-component c3 ?\  (1+ level))
351               indent
352               (if without-head-char
353                   ""
354                 (if (null char)
355                     (format "\n%s]"
356                             indent)
357                   "")))
358       )
359      (idc
360       (format "%s
361 %s    %s:structure [ a idc:%c ;
362 %s        %s:%-8s %s
363 %s        %s:%-8s %s
364 %s    ]%s"
365               (if without-head-char
366                   ""
367                 (if (and ccs code-point)
368                     (format "%s   # %c"
369                             (chise-turtle-format-ccs-code-point ccs code-point)
370                             char)
371                   "["))
372               indent prefix idc
373               indent prefix p1 (isd-turtle-format-component c1 ?\; (1+ level))
374               indent prefix p2 (isd-turtle-format-component c2 ?\  (1+ level))
375               indent
376               (if without-head-char
377                   ""
378                 (if (null char)
379                     (format "\n%s]"
380                             indent)
381                   "")))))
382     ))
383
384 (defun isd-turtle-insert-char (ccs code-point)
385   (let ((ret (isd-turtle-format-char ccs code-point)))
386     (when ret
387       (insert ret)
388       (insert " .\n"))))
389
390 (defun isd-turtle-insert-ccs-ranges (ccs &rest ranges)
391   (let (range code max-code)
392     (while ranges
393       (setq range (car ranges))
394       (cond ((consp range)
395              (setq code (car range)
396                    max-code (cdr range))
397              (while (<= code max-code)
398                (isd-turtle-insert-char ccs code)
399                (setq code (1+ code)))
400              )
401             ((integerp range)
402              (isd-turtle-insert-char ccs range)
403              )
404             (t (error 'wrong-type-argument range)))
405       (setq ranges (cdr ranges)))))
406
407 (defun isd-turtle-dump-range (file path func &rest args)
408   (with-temp-buffer
409     (let ((coding-system-for-write 'utf-8-mcs-er)
410           ;; isd-turtle-ccs-list
411           chise-turtle-ccs-prefix-alist)
412       (if (file-directory-p path)
413           (setq path (expand-file-name file path)))
414       (apply func args)
415       (goto-char (point-min))
416       ;; (dolist (ccs (sort isd-turtle-ccs-list
417       ;;                    #'char-attribute-name<))
418       ;;   (insert (format "@prefix %s: <%s%s=> .\n"
419       ;;                   (isd-turtle-uri-encode-feature-name ccs)
420       ;;                   "http://www.chise.org/est/view/character/"
421       ;;                   (www-uri-encode-feature-name ccs))))
422       (dolist (cell (sort chise-turtle-ccs-prefix-alist
423                           (lambda (a b)
424                             (char-attribute-name< (cdr a)(cdr b)))))
425         (insert (format "@prefix %s: <%s/%s=> .\n"
426                         (car cell)
427                         "http://www.chise.org/est/view/character"
428                         (www-uri-encode-feature-name (cdr cell)))))
429       (insert "\n")
430       (goto-char (point-min))
431       (insert "# -*- coding: utf-8-mcs-er -*-\n")
432       (insert "@prefix : <http://rdf.chise.org/rdf/property/character/isd/> .
433 @prefix idc: <http://rdf.chise.org/rdf/type/character/idc/> .\n")
434       (write-region (point-min)(point-max) path))))
435
436 ;;;###autoload
437 (defun isd-turtle-dump-ucs-basic (filename)
438   (interactive "Fdump ISD-UCS-Basic : ")
439   (isd-turtle-dump-range "ISD-UCS-Basic.ttl" filename
440                          #'isd-turtle-insert-ccs-ranges
441                          '=ucs '(#x4E00 . #x9FA5)))
442
443 ;;;###autoload
444 (defun isd-turtle-dump-ucs-ext-a (filename)
445   (interactive "Fdump ISD-UCS-Ext-A : ")
446   (isd-turtle-dump-range "ISD-UCS-Ext-A.ttl" filename
447                          #'isd-turtle-insert-ccs-ranges
448                          '=ucs '(#x3400 . #x4DB5) #xFA1F #xFA23))
449
450 ;;;###autoload
451 (defun isd-turtle-dump-ucs-ext-b-1 (filename)
452   (interactive "Fdump IDS-UCS-Ext-B-1 : ")
453   (isd-turtle-dump-range "ISD-UCS-Ext-B-1.ttl" filename
454                          #'isd-turtle-insert-ccs-ranges
455                          'ucs '(#x20000 . #x21FFF)))
456
457 ;;;###autoload
458 (defun isd-turtle-dump-ucs-ext-b-2 (filename)
459   (interactive "Fdump IDS-UCS-Ext-B-2 : ")
460   (isd-turtle-dump-range "ISD-UCS-Ext-B-2.ttl" filename
461                          #'isd-turtle-insert-ccs-ranges
462                          'ucs '(#x22000 . #x23FFF)))
463
464 ;;;###autoload
465 (defun isd-turtle-dump-ucs-ext-b-3 (filename)
466   (interactive "Fdump IDS-UCS-Ext-B-3 : ")
467   (isd-turtle-dump-range "ISD-UCS-Ext-B-3.ttl" filename
468                          #'isd-turtle-insert-ccs-ranges
469                          'ucs '(#x24000 . #x25FFF)))
470
471 ;;;###autoload
472 (defun isd-turtle-dump-ucs-ext-b-4 (filename)
473   (interactive "Fdump IDS-UCS-Ext-B-4 : ")
474   (isd-turtle-dump-range "ISD-UCS-Ext-B-4.ttl" filename
475                          #'isd-turtle-insert-ccs-ranges
476                          'ucs '(#x26000 . #x27FFF)))
477
478 ;;;###autoload
479 (defun isd-turtle-dump-ucs-ext-b-5 (filename)
480   (interactive "Fdump IDS-UCS-Ext-B-5 : ")
481   (isd-turtle-dump-range "ISD-UCS-Ext-B-5.ttl" filename
482                          #'isd-turtle-insert-ccs-ranges
483                          'ucs '(#x28000 . #x29FFF)))
484
485 ;;;###autoload
486 (defun isd-turtle-dump-ucs-ext-b-6 (filename)
487   (interactive "Fdump IDS-UCS-Ext-B-6 : ")
488   (isd-turtle-dump-range "ISD-UCS-Ext-B-6.ttl" filename
489                          #'isd-turtle-insert-ccs-ranges
490                          'ucs '(#x2A000 . #x2A6D6)))
491
492 ;;;###autoload
493 (defun isd-turtle-dump-ucs-ext-c (filename)
494   (interactive "Fdump IDS-UCS-Ext-C : ")
495   (isd-turtle-dump-range "ISD-UCS-Ext-C.ttl" filename
496                          #'isd-turtle-insert-ccs-ranges
497                          'ucs '(#x2A700 . #x2B734)))
498
499 ;;;###autoload
500 (defun isd-turtle-dump-ucs-ext-d (filename)
501   (interactive "Fdump IDS-UCS-Ext-D : ")
502   (isd-turtle-dump-range "ISD-UCS-Ext-D.ttl" filename
503                          #'isd-turtle-insert-ccs-ranges
504                          'ucs '(#x2B740 . #x2B81D)))
505
506 ;;;###autoload
507 (defun isd-turtle-dump-ucs-ext-e (filename)
508   (interactive "Fdump IDS-UCS-Ext-E : ")
509   (isd-turtle-dump-range "ISD-UCS-Ext-E.ttl" filename
510                          #'isd-turtle-insert-ccs-ranges
511                          'ucs '(#x2B820 . #x2CEA1)))
512
513 ;;;###autoload
514 (defun isd-turtle-dump-mj-0 (filename)
515   (interactive "Fdump ISD-MJ-0 : ")
516   (isd-turtle-dump-range "ISD-MJ-0.ttl" filename
517                          #'isd-turtle-insert-ccs-ranges
518                          '=mj '(1 . 9999)))
519
520 ;;;###autoload
521 (defun isd-turtle-dump-mj-1 (filename)
522   (interactive "Fdump ISD-MJ-1 : ")
523   (isd-turtle-dump-range "ISD-MJ-1.ttl" filename
524                          #'isd-turtle-insert-ccs-ranges
525                          '=mj '(10000 . 19999)))
526
527 ;;;###autoload
528 (defun isd-turtle-dump-mj-2 (filename)
529   (interactive "Fdump ISD-MJ-2 : ")
530   (isd-turtle-dump-range "ISD-MJ-2.ttl" filename
531                          #'isd-turtle-insert-ccs-ranges
532                          '=mj '(20000 . 29999)))
533
534 ;;;###autoload
535 (defun isd-turtle-dump-mj-3 (filename)
536   (interactive "Fdump ISD-MJ-3 : ")
537   (isd-turtle-dump-range "ISD-MJ-3.ttl" filename
538                          #'isd-turtle-insert-ccs-ranges
539                          '=mj '(30000 . 39999)))
540
541 ;;;###autoload
542 (defun isd-turtle-dump-mj-4 (filename)
543   (interactive "Fdump ISD-MJ-4 : ")
544   (isd-turtle-dump-range "ISD-MJ-4.ttl" filename
545                          #'isd-turtle-insert-ccs-ranges
546                          '=mj '(40000 . 49999)))
547
548 ;;;###autoload
549 (defun isd-turtle-dump-mj-5 (filename)
550   (interactive "Fdump ISD-MJ-5 : ")
551   (isd-turtle-dump-range "ISD-MJ-5.ttl" filename
552                          #'isd-turtle-insert-ccs-ranges
553                          '=mj '(50000 . 59999)))
554
555 ;;;###autoload
556 (defun isd-turtle-dump-mj-6 (filename)
557   (interactive "Fdump ISD-MJ-6 : ")
558   (isd-turtle-dump-range "ISD-MJ-6.ttl" filename
559                          #'isd-turtle-insert-ccs-ranges
560                          '=mj '(60000 . 69999)))
561
562 ;;;###autoload
563 (defun isd-turtle-dump-all (directory)
564   (interactive "DISD directory : ")
565   (isd-turtle-dump-ucs-basic directory)
566   (isd-turtle-dump-ucs-ext-a directory)
567   (isd-turtle-dump-ucs-ext-b-1 directory)
568   (isd-turtle-dump-ucs-ext-b-2 directory)
569   (isd-turtle-dump-ucs-ext-b-3 directory)
570   (isd-turtle-dump-ucs-ext-b-4 directory)
571   (isd-turtle-dump-ucs-ext-b-5 directory)
572   (isd-turtle-dump-ucs-ext-b-6 directory)
573   (isd-turtle-dump-ucs-ext-c directory)
574   (isd-turtle-dump-ucs-ext-d directory)
575   (isd-turtle-dump-ucs-ext-e directory)
576   (isd-turtle-dump-mj-0 directory)
577   (isd-turtle-dump-mj-1 directory)
578   (isd-turtle-dump-mj-2 directory)
579   (isd-turtle-dump-mj-3 directory)
580   (isd-turtle-dump-mj-4 directory)
581   (isd-turtle-dump-mj-5 directory)
582   (isd-turtle-dump-mj-6 directory)
583   )
584
585
586 ;;; @ End.
587 ;;;
588
589 (provide 'isd-turtle)
590
591 ;;; isd-turtle.el ends here