(isd-turtle-ccs-list): Abolished.
[chise/isd.git] / isd-turtle.el
1 ;;; isd-turtle.el --- Utility to dump ideographic-structure as Turtle files
2
3 ;; Copyright (C) 2017 MORIOKA Tomohiko
4
5 ;; Author: MORIOKA Tomohiko <tomo@kanji.zinbun.kyoto-u.ac.jp>
6 ;; Keywords: Ideographic Structures (漢字構造、解字), IDS, CHISE, RDF, Turtle
7
8 ;; This file is a part of CHISE-ISD (Ideographic Structure Database).
9
10 ;; This program is free software; you can redistribute it and/or
11 ;; modify it under the terms of the GNU General Public License as
12 ;; published by the Free Software Foundation; either version 2, or (at
13 ;; your option) any later version.
14
15 ;; This program is distributed in the hope that it will be useful, but
16 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 ;; General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with this program; see the file COPYING.  If not, write to
22 ;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
24
25 ;;; Code:
26
27 (require 'cwiki-common)
28
29 (defvar isd-url-prefix "http://rdf.chise.org/data/")
30
31 (setq est-coded-charset-priority-list
32   '(; =ucs
33     =mj
34     =adobe-japan1
35     =ucs@iso
36     =jis-x0208 =jis-x0208@1990
37     =jis-x0213-1
38     =jis-x0213-1@2000 =jis-x0213-1@2004
39     =jis-x0213-2
40     =jis-x0212
41     =gt
42     =hanyo-denshi/tk
43     =ucs-itaiji-001
44     =ucs-itaiji-002
45     =ucs-itaiji-003
46     =ucs-itaiji-005
47     =ucs-itaiji-006
48     =ucs-itaiji-084
49     =ucs-var-001
50     =ucs-var-002
51     =ucs-var-004
52     =cns11643-1 =cns11643-2 =cns11643-3
53     =cns11643-4 =cns11643-5 =cns11643-6 =cns11643-7
54     =gb2312
55     =big5-cdp
56     =gt-k
57     =ucs@unicode
58     =ucs@JP/hanazono
59     =gb12345
60     =zinbun-oracle =>zinbun-oracle
61     =daikanwa
62     =ruimoku-v6
63     =cbeta =jef-china3
64     =>jis-x0208 =>jis-x0213-1
65     =>jis-x0208@1997
66     =>ucs@iwds-1
67     =>ucs@component
68     =>iwds-1
69     =>ucs@iso
70     =>ucs@unicode
71     =+>ucs@iso =+>ucs@unicode
72     =>ucs@jis =>ucs@cns =>ucs@ks
73     =>>ucs@iso =>>ucs@unicode
74     =>>ucs@jis =>>ucs@cns =>>ucs@ks
75     ==mj
76     =>>jis-x0208 =>>jis-x0213-1 =>>jis-x0213-2
77     =+>jis-x0208 =+>jis-x0213-1 =+>jis-x0213-2
78     =+>jis-x0208@1978
79     =>>gt
80     =+>adobe-japan1
81     =>>adobe-japan1
82     =jis-x0208@1983 =jis-x0208@1978
83     =>ucs-itaiji-001
84     =>ucs-itaiji-002
85     =>ucs-itaiji-005
86     ==ucs@unicode
87     ==>ucs@bucs
88     =big5
89     =>cbeta
90     ))
91
92 ;; (defvar isd-turtle-ccs-list nil)
93 (defvar chise-turtle-ccs-prefix-alist nil)
94
95 (defun charset-code-point-format-spec (ccs)
96   (cond ((memq ccs '(=ucs))
97          "0x%04X")
98         ((memq ccs '(=gt
99                      =gt-k =daikanwa =adobe-japan1
100                      =cbeta =zinbun-oracle))
101          "%05d")
102         ((memq ccs '(=hanyo-denshi/ks
103                      =koseki =mj))
104          "%06d")
105         ((memq ccs '(=hanyo-denshi/tk ==hanyo-denshi/tk))
106          "%08d")
107         (t
108          "0x%X")))
109
110 ;; (defun isd-turtle-uri-encode-feature-name (feature-name)
111 ;;   (cond
112 ;;    ((eq '=ucs feature-name)
113 ;;     "a.ucs")
114 ;;    ((eq '==>ucs@bucs feature-name)
115 ;;     "bucs")
116 ;;    (t
117 ;;     (mapconcat (lambda (c)
118 ;;                  (if (eq c ?@)
119 ;;                      "_"
120 ;;                    (char-to-string c)))
121 ;;                (www-uri-encode-feature-name feature-name)
122 ;;                ""))))
123 (defun chise-turtle-uri-encode-ccs-name (feature-name)
124   (cond
125    ((eq '=ucs feature-name)
126     "a.ucs")
127    ((eq '=big5 feature-name)
128     "a.big5")
129    ((eq '==>ucs@bucs feature-name)
130     "bucs")
131    (t
132     (mapconcat (lambda (c)
133                  (cond
134                   ((eq c ?@)
135                    "_")
136                   ((eq c ?+)
137                    "._.")
138                   ((eq c ?=)
139                    ".:.")
140                   (t
141                    (char-to-string c))))
142                (www-uri-encode-feature-name feature-name)
143                ""))))
144
145 ;; (defun isd-turtle-format-ccs-code-point (ccs code-point)
146 ;;   (unless (memq ccs isd-turtle-ccs-list)
147 ;;     (setq isd-turtle-ccs-list (cons ccs isd-turtle-ccs-list)))
148 ;;   (format "%s:%s"
149 ;;           (isd-turtle-uri-encode-feature-name ccs)
150 ;;           (format (charset-code-point-format-spec ccs)
151 ;;                   code-point)))
152 (defun chise-turtle-format-ccs-code-point (ccs code-point)
153   (let ((ccs-uri (chise-turtle-uri-encode-ccs-name ccs)))
154     (unless (assoc ccs-uri chise-turtle-ccs-prefix-alist)
155       (setq chise-turtle-ccs-prefix-alist
156             (cons (cons ccs-uri ccs)
157                   chise-turtle-ccs-prefix-alist)))
158     (format "%s:%s"
159             ccs-uri
160             (format (charset-code-point-format-spec ccs)
161                     code-point))))
162
163 (defun isd-turtle-encode-char (object)
164   (let ((ccs-list est-coded-charset-priority-list)
165         ccs ret)
166     (if (setq ret (encode-char object '=ucs))
167         (chise-turtle-format-ccs-code-point '=ucs ret)
168       (while (and ccs-list
169                   (setq ccs (pop ccs-list))
170                   (not (setq ret (encode-char object ccs 'defined-only)))))
171       (cond (ret
172              (chise-turtle-format-ccs-code-point ccs ret)
173              )
174             ((and (setq ccs (car (split-char object)))
175                   (setq ret (encode-char object ccs)))
176              (chise-turtle-format-ccs-code-point ccs ret)
177              )
178             (t
179              (format (if est-hide-cgi-mode
180                          "system-char-id=0x%X"
181                        "system-char-id:0x%X")
182                      (encode-char object 'system-char-id))
183              )))))
184
185 (defun isd-turtle-format-component (component separator level)
186   (cond ((characterp component)
187          (format "%s %c # %c"
188                  (isd-turtle-encode-char component)
189                  separator
190                  component)
191          )
192         ((consp component)
193          (let ((ret (find-char component)))
194            (cond (ret
195                   (format "%s %c # %c"
196                           (isd-turtle-encode-char ret) separator ret))
197                  ((setq ret (assq 'ideographic-structure component))
198                   (if (eq separator ?\;)
199                       (format "%s ;"
200                               (isd-turtle-format-char nil nil (cdr ret) (1+ level)))
201                     (isd-turtle-format-char nil nil (cdr ret) (1+ level)))))))))
202
203 (defun isd-turtle-format-char (ccs code-point &optional ids-list level
204                                    prefix without-head-char)
205   (unless level
206     (setq level 0))
207   (unless prefix
208     (setq prefix ""))
209   (let ((indent (make-string (* level 4) ?\ ))
210         char
211         idc
212         p1 p2 p3
213         c1 c2 c3
214         ret)
215     (unless ids-list
216       (if (and ccs code-point
217                (setq char (decode-char ccs code-point)))
218           (setq ids-list (get-char-attribute char 'ideographic-structure))))
219     (setq idc (car ids-list))
220     (setq c1 (nth 1 ids-list)
221           c2 (nth 2 ids-list)
222           c3 (nth 3 ids-list))
223     (if (char-ref-p idc)
224         (setq idc (plist-get idc :char)))
225     (if (and (consp idc)
226              (setq ret (find-char idc)))
227         (setq idc ret))
228     (if (and (consp c1)
229              (setq ret (find-char c1)))
230         (setq c1 ret))
231     (if (and (consp c2)
232              (setq ret (find-char c2)))
233         (setq c2 ret))
234     (if (and (consp c3)
235              (setq ret (find-char c3)))
236         (setq c3 ret))
237     (cond
238      ((eq idc ?\u2FF0) ; ⿰
239       (setq p1 'left
240             p2 'right)
241       )
242      ((eq idc ?⿱)
243       (setq p1 'above
244             p2 'below)
245       )
246      ((eq idc ?⿲)
247       (setq p1 'left
248             p2 'middle
249             p3 'right)
250       )
251      ((eq idc ?⿳)
252       (setq p1 'above
253             p2 'middle
254             p3 'below)
255       )
256      ((memq idc '(?⿴ ?⿵ ?⿶ ?⿷ ?⿸ ?⿹ ?⿺))
257       (setq p1 'surround
258             p2 'filling)
259       )
260      ((eq idc ?⿻)
261       (setq p1 'underlying
262             p2 'overlaying)
263       ))
264     (cond
265      (p3
266       (format "%s
267 %s    %s:structure [ a idc:%c ;
268 %s        %s:%-8s %s
269 %s        %s:%-8s %s
270 %s        %s:%-8s %s
271 %s    ]%s"
272               (if without-head-char
273                   ""
274                 (if (and ccs code-point)
275                     (format "%s   # %c"
276                             (chise-turtle-format-ccs-code-point ccs code-point)
277                             char)
278                   "["))
279               indent prefix idc
280               indent prefix p1 (isd-turtle-format-component c1 ?\; (1+ level))
281               indent prefix p2 (isd-turtle-format-component c2 ?\; (1+ level))
282               indent prefix p3 (isd-turtle-format-component c3 ?\  (1+ level))
283               indent
284               (if without-head-char
285                   ""
286                 (if (null char)
287                     (format "\n%s]"
288                             indent)
289                   "")))
290       )
291      (idc
292       (format "%s
293 %s    %s:structure [ a idc:%c ;
294 %s        %s:%-8s %s
295 %s        %s:%-8s %s
296 %s    ]%s"
297               (if without-head-char
298                   ""
299                 (if (and ccs code-point)
300                     (format "%s   # %c"
301                             (chise-turtle-format-ccs-code-point ccs code-point)
302                             char)
303                   "["))
304               indent prefix idc
305               indent prefix p1 (isd-turtle-format-component c1 ?\; (1+ level))
306               indent prefix p2 (isd-turtle-format-component c2 ?\  (1+ level))
307               indent
308               (if without-head-char
309                   ""
310                 (if (null char)
311                     (format "\n%s]"
312                             indent)
313                   "")))))
314     ))
315
316 (defun isd-turtle-insert-char (ccs code-point)
317   (let ((ret (isd-turtle-format-char ccs code-point)))
318     (when ret
319       (insert ret)
320       (insert " .\n"))))
321
322 (defun isd-turtle-insert-ccs-ranges (ccs &rest ranges)
323   (let (range code max-code)
324     (while ranges
325       (setq range (car ranges))
326       (cond ((consp range)
327              (setq code (car range)
328                    max-code (cdr range))
329              (while (<= code max-code)
330                (isd-turtle-insert-char ccs code)
331                (setq code (1+ code)))
332              )
333             ((integerp range)
334              (isd-turtle-insert-char ccs range)
335              )
336             (t (error 'wrong-type-argument range)))
337       (setq ranges (cdr ranges)))))
338
339 (defun isd-turtle-dump-range (file path func &rest args)
340   (with-temp-buffer
341     (let ((coding-system-for-write 'utf-8-mcs-er)
342           ;; isd-turtle-ccs-list
343           chise-turtle-ccs-prefix-alist)
344       (if (file-directory-p path)
345           (setq path (expand-file-name file path)))
346       (apply func args)
347       (goto-char (point-min))
348       ;; (dolist (ccs (sort isd-turtle-ccs-list
349       ;;                    #'char-attribute-name<))
350       ;;   (insert (format "@prefix %s: <%s%s=> .\n"
351       ;;                   (isd-turtle-uri-encode-feature-name ccs)
352       ;;                   "http://www.chise.org/est/view/character/"
353       ;;                   (www-uri-encode-feature-name ccs))))
354       (dolist (cell (sort chise-turtle-ccs-prefix-alist
355                           (lambda (a b)
356                             (char-attribute-name< (cdr a)(cdr b)))))
357         (insert (format "@prefix %s: <%s/%s=> .\n"
358                         (car cell)
359                         "http://www.chise.org/est/view/character"
360                         (www-uri-encode-feature-name (cdr cell)))))
361       (insert "\n")
362       (goto-char (point-min))
363       (insert "# -*- coding: utf-8-mcs-er -*-\n")
364       (insert "@prefix : <http://rdf.chise.org/rdf/property/character/isd/> .
365 @prefix idc: <http://rdf.chise.org/rdf/type/character/idc/> .\n")
366       (write-region (point-min)(point-max) path))))
367
368 ;;;###autoload
369 (defun isd-turtle-dump-ucs-basic (filename)
370   (interactive "Fdump ISD-UCS-Basic : ")
371   (isd-turtle-dump-range "ISD-UCS-Basic.ttl" filename
372                          #'isd-turtle-insert-ccs-ranges
373                          '=ucs '(#x4E00 . #x9FA5)))
374
375 ;;;###autoload
376 (defun isd-turtle-dump-ucs-ext-a (filename)
377   (interactive "Fdump ISD-UCS-Ext-A : ")
378   (isd-turtle-dump-range "ISD-UCS-Ext-A.ttl" filename
379                          #'isd-turtle-insert-ccs-ranges
380                          '=ucs '(#x3400 . #x4DB5) #xFA1F #xFA23))
381
382 ;;;###autoload
383 (defun isd-turtle-dump-ucs-ext-b-1 (filename)
384   (interactive "Fdump IDS-UCS-Ext-B-1 : ")
385   (isd-turtle-dump-range "ISD-UCS-Ext-B-1.ttl" filename
386                          #'isd-turtle-insert-ccs-ranges
387                          'ucs '(#x20000 . #x21FFF)))
388
389 ;;;###autoload
390 (defun isd-turtle-dump-ucs-ext-b-2 (filename)
391   (interactive "Fdump IDS-UCS-Ext-B-2 : ")
392   (isd-turtle-dump-range "ISD-UCS-Ext-B-2.ttl" filename
393                          #'isd-turtle-insert-ccs-ranges
394                          'ucs '(#x22000 . #x23FFF)))
395
396 ;;;###autoload
397 (defun isd-turtle-dump-ucs-ext-b-3 (filename)
398   (interactive "Fdump IDS-UCS-Ext-B-3 : ")
399   (isd-turtle-dump-range "ISD-UCS-Ext-B-3.ttl" filename
400                          #'isd-turtle-insert-ccs-ranges
401                          'ucs '(#x24000 . #x25FFF)))
402
403 ;;;###autoload
404 (defun isd-turtle-dump-ucs-ext-b-4 (filename)
405   (interactive "Fdump IDS-UCS-Ext-B-4 : ")
406   (isd-turtle-dump-range "ISD-UCS-Ext-B-4.ttl" filename
407                          #'isd-turtle-insert-ccs-ranges
408                          'ucs '(#x26000 . #x27FFF)))
409
410 ;;;###autoload
411 (defun isd-turtle-dump-ucs-ext-b-5 (filename)
412   (interactive "Fdump IDS-UCS-Ext-B-5 : ")
413   (isd-turtle-dump-range "ISD-UCS-Ext-B-5.ttl" filename
414                          #'isd-turtle-insert-ccs-ranges
415                          'ucs '(#x28000 . #x29FFF)))
416
417 ;;;###autoload
418 (defun isd-turtle-dump-ucs-ext-b-6 (filename)
419   (interactive "Fdump IDS-UCS-Ext-B-6 : ")
420   (isd-turtle-dump-range "ISD-UCS-Ext-B-6.ttl" filename
421                          #'isd-turtle-insert-ccs-ranges
422                          'ucs '(#x2A000 . #x2A6D6)))
423
424 ;;;###autoload
425 (defun isd-turtle-dump-ucs-ext-c (filename)
426   (interactive "Fdump IDS-UCS-Ext-C : ")
427   (isd-turtle-dump-range "ISD-UCS-Ext-C.ttl" filename
428                          #'isd-turtle-insert-ccs-ranges
429                          'ucs '(#x2A700 . #x2B734)))
430
431 ;;;###autoload
432 (defun isd-turtle-dump-ucs-ext-d (filename)
433   (interactive "Fdump IDS-UCS-Ext-D : ")
434   (isd-turtle-dump-range "ISD-UCS-Ext-D.ttl" filename
435                          #'isd-turtle-insert-ccs-ranges
436                          'ucs '(#x2B740 . #x2B81D)))
437
438 ;;;###autoload
439 (defun isd-turtle-dump-ucs-ext-e (filename)
440   (interactive "Fdump IDS-UCS-Ext-E : ")
441   (isd-turtle-dump-range "ISD-UCS-Ext-E.ttl" filename
442                          #'isd-turtle-insert-ccs-ranges
443                          'ucs '(#x2B820 . #x2CEA1)))
444
445 ;;;###autoload
446 (defun isd-turtle-dump-mj-0 (filename)
447   (interactive "Fdump ISD-MJ-0 : ")
448   (isd-turtle-dump-range "ISD-MJ-0.ttl" filename
449                          #'isd-turtle-insert-ccs-ranges
450                          '=mj '(1 . 9999)))
451
452 ;;;###autoload
453 (defun isd-turtle-dump-mj-1 (filename)
454   (interactive "Fdump ISD-MJ-1 : ")
455   (isd-turtle-dump-range "ISD-MJ-1.ttl" filename
456                          #'isd-turtle-insert-ccs-ranges
457                          '=mj '(10000 . 19999)))
458
459 ;;;###autoload
460 (defun isd-turtle-dump-mj-2 (filename)
461   (interactive "Fdump ISD-MJ-2 : ")
462   (isd-turtle-dump-range "ISD-MJ-2.ttl" filename
463                          #'isd-turtle-insert-ccs-ranges
464                          '=mj '(20000 . 29999)))
465
466 ;;;###autoload
467 (defun isd-turtle-dump-mj-3 (filename)
468   (interactive "Fdump ISD-MJ-3 : ")
469   (isd-turtle-dump-range "ISD-MJ-3.ttl" filename
470                          #'isd-turtle-insert-ccs-ranges
471                          '=mj '(30000 . 39999)))
472
473 ;;;###autoload
474 (defun isd-turtle-dump-mj-4 (filename)
475   (interactive "Fdump ISD-MJ-4 : ")
476   (isd-turtle-dump-range "ISD-MJ-4.ttl" filename
477                          #'isd-turtle-insert-ccs-ranges
478                          '=mj '(40000 . 49999)))
479
480 ;;;###autoload
481 (defun isd-turtle-dump-mj-5 (filename)
482   (interactive "Fdump ISD-MJ-5 : ")
483   (isd-turtle-dump-range "ISD-MJ-5.ttl" filename
484                          #'isd-turtle-insert-ccs-ranges
485                          '=mj '(50000 . 59999)))
486
487 ;;;###autoload
488 (defun isd-turtle-dump-mj-6 (filename)
489   (interactive "Fdump ISD-MJ-6 : ")
490   (isd-turtle-dump-range "ISD-MJ-6.ttl" filename
491                          #'isd-turtle-insert-ccs-ranges
492                          '=mj '(60000 . 69999)))
493
494 ;;;###autoload
495 (defun isd-turtle-dump-all (directory)
496   (interactive "DISD directory : ")
497   (isd-turtle-dump-ucs-basic directory)
498   (isd-turtle-dump-ucs-ext-a directory)
499   (isd-turtle-dump-ucs-ext-b-1 directory)
500   (isd-turtle-dump-ucs-ext-b-2 directory)
501   (isd-turtle-dump-ucs-ext-b-3 directory)
502   (isd-turtle-dump-ucs-ext-b-4 directory)
503   (isd-turtle-dump-ucs-ext-b-5 directory)
504   (isd-turtle-dump-ucs-ext-b-6 directory)
505   (isd-turtle-dump-ucs-ext-c directory)
506   (isd-turtle-dump-ucs-ext-d directory)
507   (isd-turtle-dump-ucs-ext-e directory)
508   (isd-turtle-dump-mj-0 directory)
509   (isd-turtle-dump-mj-1 directory)
510   (isd-turtle-dump-mj-2 directory)
511   (isd-turtle-dump-mj-3 directory)
512   (isd-turtle-dump-mj-4 directory)
513   (isd-turtle-dump-mj-5 directory)
514   (isd-turtle-dump-mj-6 directory)
515   )
516
517
518 ;;; @ End.
519 ;;;
520
521 (provide 'isd-turtle)
522
523 ;;; isd-turtle.el ends here