(ideographic-structure-merge-components-alist): New function.
[chise/ids.git] / ids-find.el
1 ;;; ids-find.el --- search utility based on Ideographic-structures
2
3 ;; Copyright (C) 2002, 2003, 2005, 2006, 2007, 2017, 2020 MORIOKA Tomohiko
4
5 ;; Author: MORIOKA Tomohiko <tomo@kanji.zinbun.kyoto-u.ac.jp>
6 ;; Keywords: Kanji, Ideographs, search, IDS, CHISE, UCS, Unicode
7
8 ;; This file is a part of CHISE-IDS.
9
10 ;; This program is free software; you can redistribute it and/or
11 ;; modify it under the terms of the GNU General Public License as
12 ;; published by the Free Software Foundation; either version 2, or (at
13 ;; your option) any later version.
14
15 ;; This program is distributed in the hope that it will be useful, but
16 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 ;; General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with this program; see the file COPYING.  If not, write to
22 ;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
24
25 ;;; Code:
26
27 (defun ids-index-store-char (product component)
28   (let ((ret (get-char-attribute component 'ideographic-products)))
29     (unless (memq product ret)
30       (put-char-attribute component 'ideographic-products
31                           (cons product ret))
32       (when (setq ret (char-feature component 'ideographic-structure))
33         (ids-index-store-structure product ret)))
34     ))
35
36 (defun ids-index-store-structure (product structure)
37   (let (ret)
38     (dolist (cell (cdr structure))
39       (if (char-ref-p cell)
40           (setq cell (plist-get cell :char)))
41       (cond ((characterp cell)
42              (ids-index-store-char product cell))
43             ((setq ret (assq 'ideographic-structure cell))
44              (ids-index-store-structure product (cdr ret)))
45             ((setq ret (find-char cell))
46              (ids-index-store-char product ret))
47             ))))
48
49 ;;;###autoload
50 (defun ids-update-index ()
51   (interactive)
52   (map-char-attribute
53    (lambda (c v)
54      (ids-index-store-structure c v)
55      nil)
56    'ideographic-structure)
57   (map-char-attribute
58    (lambda (c v)
59      (ids-index-store-structure c v)
60      nil)
61    'ideographic-structure@apparent)
62   (save-char-attribute-table 'ideographic-products))
63
64
65 (mount-char-attribute-table 'ideographic-products)
66
67 ;;;###autoload
68 (defun ids-find-all-products (char)
69   (let (dest)
70     (dolist (cell (char-feature char 'ideographic-products))
71       (unless (memq cell dest)
72         (setq dest (cons cell dest)))
73       (setq dest (union dest (ids-find-all-products cell))))
74     dest))
75
76 (defun of-component-features ()
77   (let (dest)
78     (dolist (feature (char-attribute-list))
79       (when (string-match "^<-.*[@/]component\\(/[^*/]+\\)*$"
80                           (symbol-name feature))
81         (push feature dest)))
82     (list* '<-mistakable '->mistakable
83            '<-formed '->formed
84            '<-same '->same
85            '<-original '->original
86            '<-ancient '->ancient
87            dest)))
88
89 (defun to-component-features ()
90   (let (dest)
91     (dolist (feature (char-attribute-list))
92       (when (string-match "^->.*[@/]component\\(/[^*/]+\\)*$"
93                           (symbol-name feature))
94         (push feature dest)))
95     dest))
96
97 ;;;###autoload
98 (defun char-component-variants (char)
99   (let ((dest (list char))
100         ret uchr)
101     (dolist (feature (to-component-features))
102       (if (setq ret (get-char-attribute char feature))
103           (dolist (c ret)
104             (setq dest (union dest (char-component-variants c))))))
105     (cond
106      ;; ((setq ret (some (lambda (feature)
107      ;;                    (get-char-attribute char feature))
108      ;;                  (to-component-features)))
109      ;;  (dolist (c ret)
110      ;;    (setq dest (union dest (char-component-variants c))))
111      ;;  )
112      ((setq ret (get-char-attribute char '->ucs-unified))
113       (setq dest (cons char ret))
114       (dolist (c dest)
115         (setq dest (union dest
116                           (some (lambda (feature)
117                                   (get-char-attribute c feature))
118                                 (of-component-features))
119                           )))
120       )
121      ((and (setq ret (get-char-attribute char '=>ucs))
122            (setq uchr (decode-char '=ucs ret)))
123       (setq dest (cons uchr (char-variants uchr)))
124       (dolist (c dest)
125         (setq dest (union dest
126                           (some (lambda (feature)
127                                   (get-char-attribute c feature))
128                                 (of-component-features))
129                           )))
130       )
131      (t
132       (map-char-family
133        (lambda (c)
134          (unless (memq c dest)
135            (setq dest (cons c dest)))
136          (setq dest
137                (union dest
138                       (some (lambda (feature)
139                               (char-feature c feature))
140                             (of-component-features))
141                       ))
142          nil)
143        char)
144       ))
145     dest))
146
147 ;;;###autoload
148 (defun ideographic-products-find (&rest components)
149   (if (stringp (car components))
150       (setq components (string-to-char-list (car components))))
151   (let (dest products)
152     (dolist (variant (char-component-variants (car components)))
153       (setq products
154             (union products
155                    (get-char-attribute variant 'ideographic-products))))
156     (setq dest products)
157     (while (and dest
158                 (setq components (cdr components)))
159       (setq products nil)
160       (dolist (variant (char-component-variants (car components)))
161         (setq products
162               (union products
163                      (get-char-attribute variant 'ideographic-products))))
164       (setq dest (intersection dest products)))
165     dest))
166
167 (defun ideograph-find-products-with-variants (components &optional ignored-chars)
168   (if (stringp components)
169       (setq components (string-to-char-list components)))
170   (let (dest products)
171     (dolist (variant (char-component-variants (car components)))
172       (setq products
173             (union products
174                    (set-difference
175                     (get-char-attribute variant 'ideographic-products)
176                     ignored-chars))))
177     (setq dest products)
178     (while (and dest
179                 (setq components (cdr components)))
180       (setq products nil)
181       (dolist (variant (char-component-variants (car components)))
182         (setq products
183               (union products
184                      (set-difference
185                       (get-char-attribute variant 'ideographic-products)
186                       ignored-chars))))
187       (setq dest (intersection dest products)))
188     dest))
189
190 (defun ideograph-find-products (components &optional ignored-chars)
191   (if (stringp components)
192       (setq components (string-to-char-list components)))
193   (let (dest products)
194     ;; (dolist (variant (char-component-variants (car components)))
195     ;;   (setq products
196     ;;         (union products
197     ;;                (get-char-attribute variant 'ideographic-products))))
198     ;; (setq dest products)
199     (setq dest (get-char-attribute (car components) 'ideographic-products))
200     (while (and dest
201                 (setq components (cdr components)))
202       ;; (setq products nil)
203       ;; (dolist (variant (char-component-variants (car components)))
204       ;;   (setq products
205       ;;         (union products
206       ;;                (get-char-attribute variant 'ideographic-products))))
207       (setq products (get-char-attribute (car components) 'ideographic-products))
208       (setq dest (intersection dest products)))
209     dest))
210
211
212 (defun ideographic-structure-char= (c1 c2)
213   (or (eq c1 c2)
214       (and c1 c2
215            (let ((m1 (char-ucs c1))
216                  (m2 (char-ucs c2)))
217              (or (and m1 m2
218                       (eq m1 m2))
219                  (memq c1 (char-component-variants c2)))))))
220
221 (defun ideographic-structure-member-compare-components (component s-component)
222   (let (ret)
223     (cond ((char-ref= component s-component #'ideographic-structure-char=))
224           ((listp s-component)
225            (if (setq ret (assq 'ideographic-structure s-component))
226                (ideographic-structure-member component (cdr ret))))
227           ((setq ret (get-char-attribute s-component 'ideographic-structure))
228            (ideographic-structure-member component ret)))))
229
230 ;;;###autoload
231 (defun ideographic-structure-member (component structure)
232   "Return non-nil if COMPONENT is included in STRUCTURE."
233   (or (memq component structure)
234       (progn
235         (setq structure (cdr structure))
236         (ideographic-structure-member-compare-components
237          component (car structure)))
238       (progn
239         (setq structure (cdr structure))
240         (ideographic-structure-member-compare-components
241          component (car structure)))
242       (progn
243         (setq structure (cdr structure))
244         (and (car structure)
245              (ideographic-structure-member-compare-components
246               component (car structure))))))
247
248
249 ;;;###autoload
250 (defun ideographic-structure-repertoire-p (structure components)
251   "Return non-nil if STRUCTURE can be constructed by a subset of COMPONENTS."
252   (and structure
253        (let (ret s-component)
254          (catch 'tag
255            (while (setq structure (cdr structure))
256              (setq s-component (car structure))
257              (unless (characterp s-component)
258                (if (setq ret (find-char s-component))
259                    (setq s-component ret)))
260              (unless (cond
261                       ((listp s-component)
262                        (if (setq ret (assq 'ideographic-structure s-component))
263                            (ideographic-structure-repertoire-p
264                             (cdr ret) components)))
265                       ((member* s-component components
266                                 :test #'ideographic-structure-char=))
267                       ((setq ret
268                              (get-char-attribute s-component
269                                                  'ideographic-structure))
270                        (ideographic-structure-repertoire-p ret components)))
271                (throw 'tag nil)))
272            t))))
273
274
275 (defvar ids-find-result-buffer "*ids-chars*")
276
277 (defun ids-find-format-line (c v)
278   (format "%c\t%s\t%s\n"
279           c
280           (or (let ((ucs (or (char-ucs c)
281                              (encode-char c 'ucs))))
282                 (if ucs
283                     (cond ((<= ucs #xFFFF)
284                            (format "    U+%04X" ucs))
285                           ((<= ucs #x10FFFF)
286                            (format "U-%08X" ucs)))))
287               "          ")
288           (or (ideographic-structure-to-ids v)
289               v)))
290
291 (defun ids-insert-chars-including-components* (components
292                                                &optional level ignored-chars)
293   (unless level
294     (setq level 0))
295   (let (is i as bs)
296     (dolist (c (sort (copy-tree (ideograph-find-products components
297                                                          ignored-chars))
298                      (lambda (a b)
299                        (if (setq as (char-total-strokes a))
300                            (if (setq bs (char-total-strokes b))
301                                (if (= as bs)
302                                    (ideograph-char< a b)
303                                  (< as bs))
304                              t)
305                          (ideograph-char< a b)))))
306       (unless (memq c ignored-chars)
307         (setq is (char-feature c 'ideographic-structure))
308         (setq i 0)
309         (while (< i level)
310           (insert "\t")
311           (setq i (1+ i)))
312         (insert (ids-find-format-line c is))
313         (setq ignored-chars
314               (ids-insert-chars-including-components*
315                (char-to-string c) (1+ level)
316                (cons c ignored-chars))))
317       )
318     )
319   ignored-chars)
320
321 (defun ids-insert-chars-including-components (components
322                                               &optional level ignored-chars)
323   (unless level
324     (setq level 0))
325   (setq ignored-chars
326         (nreverse
327          (ids-insert-chars-including-components* components
328                                                  level ignored-chars)))
329   (let (is i as bs)
330     (dolist (c ignored-chars)
331       (dolist (vc (char-component-variants c))
332         (unless (memq vc ignored-chars)
333           (when (setq is (get-char-attribute vc 'ideographic-structure))
334             (setq i 0)
335             (while (< i level)
336               (insert "\t")
337               (setq i (1+ i)))
338             (insert (ids-find-format-line vc is))
339             (setq ignored-chars
340                   (ids-insert-chars-including-components*
341                    (char-to-string vc) (1+ level)
342                    (cons vc ignored-chars)))))))
343     (dolist (c (sort (copy-tree (ideograph-find-products-with-variants
344                                  components ignored-chars))
345                      (lambda (a b)
346                        (if (setq as (char-total-strokes a))
347                            (if (setq bs (char-total-strokes b))
348                                (if (= as bs)
349                                    (ideograph-char< a b)
350                                  (< as bs))
351                              t)
352                          (ideograph-char< a b)))))
353       (unless (memq c ignored-chars)
354         (setq is (get-char-attribute c 'ideographic-structure))
355         (setq i 0)
356         (while (< i level)
357           (insert "\t")
358           (setq i (1+ i)))
359         (insert (ids-find-format-line c is))
360         (setq ignored-chars
361               (ids-insert-chars-including-components*
362                (char-to-string c) (1+ level)
363                (cons c ignored-chars))))
364       )
365     )
366   ignored-chars)
367
368 ;;;###autoload
369 (defun ids-find-chars-including-components (components)
370   "Search Ideographs whose structures have COMPONENTS."
371   (interactive "sComponents : ")
372   (with-current-buffer (get-buffer-create ids-find-result-buffer)
373     (setq buffer-read-only nil)
374     (erase-buffer)
375     (ids-insert-chars-including-components components 0 nil)
376     ;; (let ((ignored-chars
377     ;;        (nreverse
378     ;;         (ids-insert-chars-including-components components 0 nil
379     ;;                                                #'ideograph-find-products)))
380     ;;       rest)
381     ;;   (setq rest ignored-chars)
382     ;;   ;; (dolist (c rest)
383     ;;   ;;   (setq ignored-chars
384     ;;   ;;         (union ignored-chars
385     ;;   ;;                (ids-insert-chars-including-components
386     ;;   ;;                 (list c) 0 ignored-chars
387     ;;   ;;                 #'ideograph-find-products-with-variants))))
388     ;;   (ids-insert-chars-including-components components 0 ignored-chars
389     ;;                                          #'ideograph-find-products-with-variants))
390     (goto-char (point-min)))
391   (view-buffer ids-find-result-buffer))
392
393 ;;;###autoload
394 (define-obsolete-function-alias 'ideographic-structure-search-chars
395   'ids-find-chars-including-components)
396
397 ;;;###autoload
398 (defun ids-find-chars-covered-by-components (components)
399   "Search Ideographs which structures are consisted by subsets of COMPONENTS."
400   (interactive "sComponents: ")
401   (if (stringp components)
402       (setq components (string-to-char-list components)))
403   (with-current-buffer (get-buffer-create ids-find-result-buffer)
404     (setq buffer-read-only nil)
405     (erase-buffer)
406     (map-char-attribute
407      (lambda (c v)
408        (when (ideographic-structure-repertoire-p v components)
409          (insert (ids-find-format-line c v))))
410      'ideographic-structure)
411     (goto-char (point-min)))
412   (view-buffer ids-find-result-buffer))
413
414
415 (defun ideographic-structure-merge-components-alist (ca1 ca2)
416   (let ((dest-alist ca1)
417         ret)
418     (dolist (cell ca2)
419       (if (setq ret (assq (car cell) dest-alist))
420           (setcdr ret (+ (cdr ret)(cdr cell)))
421         (setq dest-alist (cons cell dest-alist))))
422     dest-alist))
423
424 (defun ideographic-structure-to-components-alist (structure)
425   (apply #'ideographic-structure-to-components-alist* structure))
426
427 (defun ideographic-structure-to-components-alist* (operator component1 component2
428                                                             &optional component3
429                                                             &rest opts)
430   (let (dest-alist ret)
431     (setq dest-alist
432           (cond ((characterp component1)
433                  (unless (encode-char component1 'ascii)
434                    (list (cons component1 1)))
435                  )
436                 ((setq ret (assq 'ideographic-structure component1))
437                  (ideographic-structure-to-components-alist (cdr ret))
438                  )
439                 ((setq ret (find-char component1))
440                  (list (cons ret 1))
441                  )))
442     (setq dest-alist
443           (ideographic-structure-merge-components-alist
444            dest-alist
445            (cond ((characterp component2)
446                   (unless (encode-char component2 'ascii)
447                     (list (cons component2 1)))
448                   )
449                  ((setq ret (assq 'ideographic-structure component2))
450                   (ideographic-structure-to-components-alist (cdr ret))
451                   )
452                  ((setq ret (find-char component2))
453                   (list (cons ret 1))
454                   ))))
455     (if (memq operator '(?\u2FF2 ?\u2FF3))
456         (ideographic-structure-merge-components-alist
457          dest-alist
458          (cond ((characterp component3)
459                 (unless (encode-char component3 'ascii)
460                   (list (cons component3 1)))
461                 )
462                ((setq ret (assq 'ideographic-structure component3))
463                 (ideographic-structure-to-components-alist (cdr ret))
464                 )
465                ((setq ret (find-char component3))
466                 (list (cons ret 1))
467                 )))
468       dest-alist)))
469
470 ;;;###autoload
471 (defun ideographic-structure-equal (structure1 structure2)
472   (and (eq (car structure1)(car structure2))
473        (ideographic-structure-character= (nth 1 structure1)(nth 1 structure2))
474        (ideographic-structure-character= (nth 2 structure1)(nth 2 structure2))
475        (if (memq (car structure1) '(?\u2FF2 ?\u2FF3))
476            (ideographic-structure-character= (nth 3 structure1)(nth 3 structure2))
477          t)))
478
479 ;;;###autoload
480 (defun ideographic-structure-character= (c1 c2)
481   (let (ret ret2)
482     (cond ((characterp c1)
483            (cond ((encode-char c1 'ascii)
484                   )
485                  ((characterp c2)
486                   (or (eq c1 c2)
487                       (encode-char c2 'ascii))
488                   )
489                  ((setq ret2 (find-char c2))
490                   (eq c1 ret2)
491                   )
492                  ((setq ret2 (assq 'ideographic-structure c2))
493                   (and (setq ret (get-char-attribute c1 'ideographic-structure))
494                        (ideographic-structure-equal ret (cdr ret2)))
495                   ))
496            )
497           ((setq ret (assq 'ideographic-structure c1))
498            (cond ((characterp c2)
499                   (or (encode-char c2 'ascii)
500                       (and (setq ret2 (get-char-attribute c2 'ideographic-structure))
501                            (ideographic-structure-equal (cdr ret) ret2)))
502                   )
503                  ((setq ret2 (find-char c2))
504                   (and (setq ret2 (get-char-attribute c2 'ideographic-structure))
505                        (ideographic-structure-equal (cdr ret) ret2))
506                   )
507                  ((setq ret2 (assq 'ideographic-structure c2))
508                   (ideographic-structure-equal (cdr ret)(cdr ret2))
509                   ))
510            )
511           ((setq ret (find-char c1))
512            (cond ((characterp c2)
513                   (or (eq ret c2)
514                       (encode-char c2 'ascii))
515                   )
516                  ((setq ret2 (find-char c2))
517                   (eq ret ret2)
518                   )
519                  ((setq ret2 (assq 'ideographic-structure c2))
520                   (and (setq ret (get-char-attribute c1 'ideographic-structure))
521                        (ideographic-structure-equal ret (cdr ret2))
522                        )))))))
523
524 ;;;###autoload
525 (defun ideographic-structure-find-chars (structure)
526   (apply #'ideographic-structure-find-chars* structure))
527
528 (defun ideographic-structure-find-chars* (operator component1 component2
529                                                    &optional component3)
530   (let ((comp-alist (ideographic-structure-to-components-alist*
531                      operator component1 component2 component3))
532         c1 c2 c3
533         ret pl str)
534     (dolist (pc (caar
535                  (sort (mapcar (lambda (cell)
536                                  (if (setq ret (get-char-attribute
537                                                 (car cell) 'ideographic-products))
538                                      (cons ret (length ret))
539                                    (cons nil 0)))
540                                comp-alist)
541                        (lambda (a b)
542                          (< (cdr a)(cdr b))))))
543       (when (and (setq str (get-char-attribute pc 'ideographic-structure))
544                  (eq (car str) operator)
545                  (setq c1 (nth 1 str))
546                  (ideographic-structure-character= c1 component1)
547                  (setq c2 (nth 2 str))
548                  (ideographic-structure-character= c2 component2)
549                  (cond ((memq (car str) '(?\u2FF2 ?\u2FF3))
550                         (setq c3 (nth 3 str))
551                         (ideographic-structure-character= c3 component3)
552                         )
553                        (t)))
554         (setq pl (cons pc pl))
555         ))
556     pl))
557
558 ;;;###autoload
559 (defun ideographic-char-count-components (char component)
560   (let ((dest 0)
561         structure)
562     (cond ((eq char component)
563            1)
564           ((setq structure (get-char-attribute char 'ideographic-structure))
565            (dolist (cell (ideographic-structure-to-components-alist structure))
566              (setq dest
567                    (+ dest
568                       (if (eq (car cell) char)
569                           (cdr cell)
570                         (* (ideographic-char-count-components (car cell) component)
571                            (cdr cell))))))
572            dest)
573           (t
574            0))))
575
576
577 ;;; @ End.
578 ;;;
579
580 (provide 'ids-find)
581
582 ;;; ids-find.el ends here