(ideographic-structure-character=): Fixed.
[chise/ids.git] / ids-find.el
1 ;;; ids-find.el --- search utility based on Ideographic-structures
2
3 ;; Copyright (C) 2002, 2003, 2005, 2006, 2007, 2017, 2020 MORIOKA Tomohiko
4
5 ;; Author: MORIOKA Tomohiko <tomo@kanji.zinbun.kyoto-u.ac.jp>
6 ;; Keywords: Kanji, Ideographs, search, IDS, CHISE, UCS, Unicode
7
8 ;; This file is a part of CHISE-IDS.
9
10 ;; This program is free software; you can redistribute it and/or
11 ;; modify it under the terms of the GNU General Public License as
12 ;; published by the Free Software Foundation; either version 2, or (at
13 ;; your option) any later version.
14
15 ;; This program is distributed in the hope that it will be useful, but
16 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 ;; General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with this program; see the file COPYING.  If not, write to
22 ;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
24
25 ;;; Code:
26
27 (defun ids-index-store-char (product component)
28   (let ((ret (get-char-attribute component 'ideographic-products)))
29     (unless (memq product ret)
30       (put-char-attribute component 'ideographic-products
31                           (cons product ret))
32       (when (setq ret (char-feature component 'ideographic-structure))
33         (ids-index-store-structure product ret)))
34     ))
35
36 (defun ids-index-store-structure (product structure)
37   (let (ret)
38     (dolist (cell (cdr structure))
39       (if (char-ref-p cell)
40           (setq cell (plist-get cell :char)))
41       (cond ((characterp cell)
42              (ids-index-store-char product cell))
43             ((setq ret (assq 'ideographic-structure cell))
44              (ids-index-store-structure product (cdr ret)))
45             ((setq ret (find-char cell))
46              (ids-index-store-char product ret))
47             ))))
48
49 ;;;###autoload
50 (defun ids-update-index ()
51   (interactive)
52   (map-char-attribute
53    (lambda (c v)
54      (ids-index-store-structure c v)
55      nil)
56    'ideographic-structure)
57   (map-char-attribute
58    (lambda (c v)
59      (ids-index-store-structure c v)
60      nil)
61    'ideographic-structure@apparent)
62   (save-char-attribute-table 'ideographic-products))
63
64
65 (mount-char-attribute-table 'ideographic-products)
66
67 ;;;###autoload
68 (defun ids-find-all-products (char)
69   (let (dest)
70     (dolist (cell (char-feature char 'ideographic-products))
71       (unless (memq cell dest)
72         (setq dest (cons cell dest)))
73       (setq dest (union dest (ids-find-all-products cell))))
74     dest))
75
76 (defun of-component-features ()
77   (let (dest)
78     (dolist (feature (char-attribute-list))
79       (when (string-match "^<-.*[@/]component\\(/[^*/]+\\)*$"
80                           (symbol-name feature))
81         (push feature dest)))
82     (list* '<-mistakable '->mistakable
83            '<-formed '->formed
84            '<-same '->same
85            '<-original '->original
86            '<-ancient '->ancient
87            dest)))
88
89 (defun to-component-features ()
90   (let (dest)
91     (dolist (feature (char-attribute-list))
92       (when (string-match "^->.*[@/]component\\(/[^*/]+\\)*$"
93                           (symbol-name feature))
94         (push feature dest)))
95     dest))
96
97 ;;;###autoload
98 (defun char-component-variants (char)
99   (let ((dest (list char))
100         ret uchr)
101     (dolist (feature (to-component-features))
102       (if (setq ret (get-char-attribute char feature))
103           (dolist (c ret)
104             (setq dest (union dest (char-component-variants c))))))
105     (cond
106      ;; ((setq ret (some (lambda (feature)
107      ;;                    (get-char-attribute char feature))
108      ;;                  (to-component-features)))
109      ;;  (dolist (c ret)
110      ;;    (setq dest (union dest (char-component-variants c))))
111      ;;  )
112      ((setq ret (get-char-attribute char '->ucs-unified))
113       (setq dest (cons char ret))
114       (dolist (c dest)
115         (setq dest (union dest
116                           (some (lambda (feature)
117                                   (get-char-attribute c feature))
118                                 (of-component-features))
119                           )))
120       )
121      ((and (setq ret (get-char-attribute char '=>ucs))
122            (setq uchr (decode-char '=ucs ret)))
123       (setq dest (cons uchr (char-variants uchr)))
124       (dolist (c dest)
125         (setq dest (union dest
126                           (some (lambda (feature)
127                                   (get-char-attribute c feature))
128                                 (of-component-features))
129                           )))
130       )
131      (t
132       (map-char-family
133        (lambda (c)
134          (unless (memq c dest)
135            (setq dest (cons c dest)))
136          (setq dest
137                (union dest
138                       (some (lambda (feature)
139                               (char-feature c feature))
140                             (of-component-features))
141                       ))
142          nil)
143        char)
144       ))
145     dest))
146
147 ;;;###autoload
148 (defun ideographic-products-find (&rest components)
149   (if (stringp (car components))
150       (setq components (string-to-char-list (car components))))
151   (let (dest products)
152     (dolist (variant (char-component-variants (car components)))
153       (setq products
154             (union products
155                    (get-char-attribute variant 'ideographic-products))))
156     (setq dest products)
157     (while (and dest
158                 (setq components (cdr components)))
159       (setq products nil)
160       (dolist (variant (char-component-variants (car components)))
161         (setq products
162               (union products
163                      (get-char-attribute variant 'ideographic-products))))
164       (setq dest (intersection dest products)))
165     dest))
166
167 (defun ideograph-find-products-with-variants (components &optional ignored-chars)
168   (if (stringp components)
169       (setq components (string-to-char-list components)))
170   (let (dest products)
171     (dolist (variant (char-component-variants (car components)))
172       (setq products
173             (union products
174                    (set-difference
175                     (get-char-attribute variant 'ideographic-products)
176                     ignored-chars))))
177     (setq dest products)
178     (while (and dest
179                 (setq components (cdr components)))
180       (setq products nil)
181       (dolist (variant (char-component-variants (car components)))
182         (setq products
183               (union products
184                      (set-difference
185                       (get-char-attribute variant 'ideographic-products)
186                       ignored-chars))))
187       (setq dest (intersection dest products)))
188     dest))
189
190 (defun ideograph-find-products (components &optional ignored-chars)
191   (if (stringp components)
192       (setq components (string-to-char-list components)))
193   (let (dest products)
194     ;; (dolist (variant (char-component-variants (car components)))
195     ;;   (setq products
196     ;;         (union products
197     ;;                (get-char-attribute variant 'ideographic-products))))
198     ;; (setq dest products)
199     (setq dest (get-char-attribute (car components) 'ideographic-products))
200     (while (and dest
201                 (setq components (cdr components)))
202       ;; (setq products nil)
203       ;; (dolist (variant (char-component-variants (car components)))
204       ;;   (setq products
205       ;;         (union products
206       ;;                (get-char-attribute variant 'ideographic-products))))
207       (setq products (get-char-attribute (car components) 'ideographic-products))
208       (setq dest (intersection dest products)))
209     dest))
210
211
212 (defun ideographic-structure-char= (c1 c2)
213   (or (eq c1 c2)
214       (and c1 c2
215            (let ((m1 (char-ucs c1))
216                  (m2 (char-ucs c2)))
217              (or (and m1 m2
218                       (eq m1 m2))
219                  (memq c1 (char-component-variants c2)))))))
220
221 (defun ideographic-structure-member-compare-components (component s-component)
222   (let (ret)
223     (cond ((char-ref= component s-component #'ideographic-structure-char=))
224           ((listp s-component)
225            (if (setq ret (assq 'ideographic-structure s-component))
226                (ideographic-structure-member component (cdr ret))))
227           ((setq ret (get-char-attribute s-component 'ideographic-structure))
228            (ideographic-structure-member component ret)))))
229
230 ;;;###autoload
231 (defun ideographic-structure-member (component structure)
232   "Return non-nil if COMPONENT is included in STRUCTURE."
233   (or (memq component structure)
234       (progn
235         (setq structure (cdr structure))
236         (ideographic-structure-member-compare-components
237          component (car structure)))
238       (progn
239         (setq structure (cdr structure))
240         (ideographic-structure-member-compare-components
241          component (car structure)))
242       (progn
243         (setq structure (cdr structure))
244         (and (car structure)
245              (ideographic-structure-member-compare-components
246               component (car structure))))))
247
248
249 ;;;###autoload
250 (defun ideographic-structure-repertoire-p (structure components)
251   "Return non-nil if STRUCTURE can be constructed by a subset of COMPONENTS."
252   (and structure
253        (let (ret s-component)
254          (catch 'tag
255            (while (setq structure (cdr structure))
256              (setq s-component (car structure))
257              (unless (characterp s-component)
258                (if (setq ret (find-char s-component))
259                    (setq s-component ret)))
260              (unless (cond
261                       ((listp s-component)
262                        (if (setq ret (assq 'ideographic-structure s-component))
263                            (ideographic-structure-repertoire-p
264                             (cdr ret) components)))
265                       ((member* s-component components
266                                 :test #'ideographic-structure-char=))
267                       ((setq ret
268                              (get-char-attribute s-component
269                                                  'ideographic-structure))
270                        (ideographic-structure-repertoire-p ret components)))
271                (throw 'tag nil)))
272            t))))
273
274
275 (defvar ids-find-result-buffer "*ids-chars*")
276
277 (defun ids-find-format-line (c v)
278   (format "%c\t%s\t%s\n"
279           c
280           (or (let ((ucs (or (char-ucs c)
281                              (encode-char c 'ucs))))
282                 (if ucs
283                     (cond ((<= ucs #xFFFF)
284                            (format "    U+%04X" ucs))
285                           ((<= ucs #x10FFFF)
286                            (format "U-%08X" ucs)))))
287               "          ")
288           (or (ideographic-structure-to-ids v)
289               v)))
290
291 (defun ids-insert-chars-including-components* (components
292                                                &optional level ignored-chars)
293   (unless level
294     (setq level 0))
295   (let (is i as bs)
296     (dolist (c (sort (copy-tree (ideograph-find-products components
297                                                          ignored-chars))
298                      (lambda (a b)
299                        (if (setq as (char-total-strokes a))
300                            (if (setq bs (char-total-strokes b))
301                                (if (= as bs)
302                                    (ideograph-char< a b)
303                                  (< as bs))
304                              t)
305                          (ideograph-char< a b)))))
306       (unless (memq c ignored-chars)
307         (setq is (char-feature c 'ideographic-structure))
308         (setq i 0)
309         (while (< i level)
310           (insert "\t")
311           (setq i (1+ i)))
312         (insert (ids-find-format-line c is))
313         (setq ignored-chars
314               (ids-insert-chars-including-components*
315                (char-to-string c) (1+ level)
316                (cons c ignored-chars))))
317       )
318     )
319   ignored-chars)
320
321 (defun ids-insert-chars-including-components (components
322                                               &optional level ignored-chars)
323   (unless level
324     (setq level 0))
325   (setq ignored-chars
326         (nreverse
327          (ids-insert-chars-including-components* components
328                                                  level ignored-chars)))
329   (let (is i as bs)
330     (dolist (c ignored-chars)
331       (dolist (vc (char-component-variants c))
332         (unless (memq vc ignored-chars)
333           (when (setq is (get-char-attribute vc 'ideographic-structure))
334             (setq i 0)
335             (while (< i level)
336               (insert "\t")
337               (setq i (1+ i)))
338             (insert (ids-find-format-line vc is))
339             (setq ignored-chars
340                   (ids-insert-chars-including-components*
341                    (char-to-string vc) (1+ level)
342                    (cons vc ignored-chars)))))))
343     (dolist (c (sort (copy-tree (ideograph-find-products-with-variants
344                                  components ignored-chars))
345                      (lambda (a b)
346                        (if (setq as (char-total-strokes a))
347                            (if (setq bs (char-total-strokes b))
348                                (if (= as bs)
349                                    (ideograph-char< a b)
350                                  (< as bs))
351                              t)
352                          (ideograph-char< a b)))))
353       (unless (memq c ignored-chars)
354         (setq is (get-char-attribute c 'ideographic-structure))
355         (setq i 0)
356         (while (< i level)
357           (insert "\t")
358           (setq i (1+ i)))
359         (insert (ids-find-format-line c is))
360         (setq ignored-chars
361               (ids-insert-chars-including-components*
362                (char-to-string c) (1+ level)
363                (cons c ignored-chars))))
364       )
365     )
366   ignored-chars)
367
368 ;;;###autoload
369 (defun ids-find-chars-including-components (components)
370   "Search Ideographs whose structures have COMPONENTS."
371   (interactive "sComponents : ")
372   (with-current-buffer (get-buffer-create ids-find-result-buffer)
373     (setq buffer-read-only nil)
374     (erase-buffer)
375     (ids-insert-chars-including-components components 0 nil)
376     ;; (let ((ignored-chars
377     ;;        (nreverse
378     ;;         (ids-insert-chars-including-components components 0 nil
379     ;;                                                #'ideograph-find-products)))
380     ;;       rest)
381     ;;   (setq rest ignored-chars)
382     ;;   ;; (dolist (c rest)
383     ;;   ;;   (setq ignored-chars
384     ;;   ;;         (union ignored-chars
385     ;;   ;;                (ids-insert-chars-including-components
386     ;;   ;;                 (list c) 0 ignored-chars
387     ;;   ;;                 #'ideograph-find-products-with-variants))))
388     ;;   (ids-insert-chars-including-components components 0 ignored-chars
389     ;;                                          #'ideograph-find-products-with-variants))
390     (goto-char (point-min)))
391   (view-buffer ids-find-result-buffer))
392
393 ;;;###autoload
394 (define-obsolete-function-alias 'ideographic-structure-search-chars
395   'ids-find-chars-including-components)
396
397 ;;;###autoload
398 (defun ids-find-chars-covered-by-components (components)
399   "Search Ideographs which structures are consisted by subsets of COMPONENTS."
400   (interactive "sComponents: ")
401   (if (stringp components)
402       (setq components (string-to-char-list components)))
403   (with-current-buffer (get-buffer-create ids-find-result-buffer)
404     (setq buffer-read-only nil)
405     (erase-buffer)
406     (map-char-attribute
407      (lambda (c v)
408        (when (ideographic-structure-repertoire-p v components)
409          (insert (ids-find-format-line c v))))
410      'ideographic-structure)
411     (goto-char (point-min)))
412   (view-buffer ids-find-result-buffer))
413
414
415 (defun ideographic-structure-merge-components-alist (ca1 ca2)
416   (let ((dest-alist ca1)
417         ret)
418     (dolist (cell ca2)
419       (if (setq ret (assq (car cell) dest-alist))
420           (setcdr ret (+ (cdr ret)(cdr cell)))
421         (setq dest-alist (cons cell dest-alist))))
422     dest-alist))
423
424 (defun ideographic-structure-to-components-alist (structure)
425   (apply #'ideographic-structure-to-components-alist* structure))
426
427 (defun ideographic-structure-to-components-alist* (operator component1 component2
428                                                             &optional component3
429                                                             &rest opts)
430   (let (dest-alist ret)
431     (setq dest-alist
432           (cond ((characterp component1)
433                  (unless (encode-char component1 'ascii)
434                    (list (cons component1 1)))
435                  )
436                 ((setq ret (assq 'ideographic-structure component1))
437                  (ideographic-structure-to-components-alist (cdr ret))
438                  )
439                 ((setq ret (find-char component1))
440                  (list (cons ret 1))
441                  )))
442     (setq dest-alist
443           (ideographic-structure-merge-components-alist
444            dest-alist
445            (cond ((characterp component2)
446                   (unless (encode-char component2 'ascii)
447                     (list (cons component2 1)))
448                   )
449                  ((setq ret (assq 'ideographic-structure component2))
450                   (ideographic-structure-to-components-alist (cdr ret))
451                   )
452                  ((setq ret (find-char component2))
453                   (list (cons ret 1))
454                   ))))
455     (if (memq operator '(?\u2FF2 ?\u2FF3))
456         (ideographic-structure-merge-components-alist
457          dest-alist
458          (cond ((characterp component3)
459                 (unless (encode-char component3 'ascii)
460                   (list (cons component3 1)))
461                 )
462                ((setq ret (assq 'ideographic-structure component3))
463                 (ideographic-structure-to-components-alist (cdr ret))
464                 )
465                ((setq ret (find-char component3))
466                 (list (cons ret 1))
467                 )))
468       dest-alist)))
469
470 (defun ids-find-merge-variables (ve1 ve2)
471   (cond ((eq ve1 t)
472          ve2)
473         ((eq ve2 t)
474          ve1)
475         (t
476          (let ((dest-alist ve1)
477                (rest ve2)
478                cell ret)
479            (while (and rest
480                        (setq cell (car rest))
481                        (if (setq ret (assq (car cell) ve1))
482                            (eq (cdr ret)(cdr cell))
483                          (setq dest-alist (cons cell dest-alist))))
484              (setq rest (cdr rest)))
485            (if rest
486                nil
487              dest-alist)))))
488
489 ;;;###autoload
490 (defun ideographic-structure-equal (structure1 structure2)
491   (let (dest-alist ret)
492     (and (setq dest-alist (ideographic-structure-character=
493                            (car structure1)(car structure2)))
494          (setq ret (ideographic-structure-character=
495                     (nth 1 structure1)(nth 1 structure2)))
496          (setq dest-alist (ids-find-merge-variables dest-alist ret))
497          (setq ret (ideographic-structure-character=
498                     (nth 2 structure1)(nth 2 structure2)))
499          (setq dest-alist (ids-find-merge-variables dest-alist ret))
500          (if (memq (car structure1) '(?\u2FF2 ?\u2FF3))
501              (and (setq ret (ideographic-structure-character=
502                              (nth 3 structure1)(nth 3 structure2)))
503                   (setq dest-alist (ids-find-merge-variables dest-alist ret)))
504            dest-alist))))
505
506 ;;;###autoload
507 (defun ideographic-structure-character= (c1 c2)
508   (let (ret ret2)
509     (cond ((characterp c1)
510            (cond ((encode-char c1 'ascii)
511                   (list (cons c1 c2))
512                   )
513                  ((characterp c2)
514                   (if (encode-char c2 'ascii)
515                       (list (cons c2 c1))
516                     (eq c1 c2))
517                   )
518                  ((setq ret2 (find-char c2))
519                   (eq c1 ret2)
520                   )
521                  ((setq ret2 (assq 'ideographic-structure c2))
522                   (and (setq ret (get-char-attribute c1 'ideographic-structure))
523                        (ideographic-structure-equal ret (cdr ret2)))
524                   ))
525            )
526           ((setq ret (assq 'ideographic-structure c1))
527            (cond ((characterp c2)
528                   (if (encode-char c2 'ascii)
529                       (list (cons c2 c1))
530                     (and (setq ret2 (get-char-attribute c2 'ideographic-structure))
531                          (ideographic-structure-equal (cdr ret) ret2)))
532                   )
533                  ((setq ret2 (find-char c2))
534                   (and (setq ret2 (get-char-attribute ret2 'ideographic-structure))
535                        (ideographic-structure-equal (cdr ret) ret2))
536                   )
537                  ((setq ret2 (assq 'ideographic-structure c2))
538                   (ideographic-structure-equal (cdr ret)(cdr ret2))
539                   ))
540            )
541           ((setq ret (find-char c1))
542            (cond ((characterp c2)
543                   (if (encode-char c2 'ascii)
544                       (list (cons c2 c1))
545                     (eq ret c2))
546                   )
547                  ((setq ret2 (find-char c2))
548                   (eq ret ret2)
549                   )
550                  ((setq ret2 (assq 'ideographic-structure c2))
551                   (and (setq ret (get-char-attribute ret 'ideographic-structure))
552                        (ideographic-structure-equal ret (cdr ret2))
553                        )))))))
554
555 ;;;###autoload
556 (defun ideographic-structure-find-chars (structure)
557   (apply #'ideographic-structure-find-chars* structure))
558
559 (defun ideographic-structure-find-chars* (operator component1 component2
560                                                    &optional component3)
561   (let ((comp-alist (ideographic-structure-to-components-alist*
562                      operator component1 component2 component3))
563         c1 c2 c3
564         ret pl str
565         var-alist)
566     (dolist (pc (caar
567                  (sort (mapcar (lambda (cell)
568                                  (if (setq ret (get-char-attribute
569                                                 (car cell) 'ideographic-products))
570                                      (cons ret (length ret))
571                                    (cons nil 0)))
572                                comp-alist)
573                        (lambda (a b)
574                          (< (cdr a)(cdr b))))))
575       (when (and (setq str (get-char-attribute pc 'ideographic-structure))
576                  (setq var-alist
577                        (ideographic-structure-character= (car str) operator))
578                  (setq c1 (nth 1 str))
579                  (setq ret (ideographic-structure-character= c1 component1))
580                  (setq var-alist (ids-find-merge-variables var-alist ret))
581                  (setq c2 (nth 2 str))
582                  (setq ret (ideographic-structure-character= c2 component2))
583                  (setq var-alist (ids-find-merge-variables var-alist ret))
584                  (cond ((memq (car str) '(?\u2FF2 ?\u2FF3))
585                         (setq c3 (nth 3 str))
586                         (and (setq ret (ideographic-structure-character=
587                                         c3 component3))
588                              (ids-find-merge-variables var-alist ret))
589                         )
590                        (t var-alist)))
591         (setq pl (cons pc pl))
592         ))
593     pl))
594
595 ;;;###autoload
596 (defun ideographic-char-count-components (char component)
597   (let ((dest 0)
598         structure)
599     (cond ((eq char component)
600            1)
601           ((setq structure (get-char-attribute char 'ideographic-structure))
602            (dolist (cell (ideographic-structure-to-components-alist structure))
603              (setq dest
604                    (+ dest
605                       (if (eq (car cell) char)
606                           (cdr cell)
607                         (* (ideographic-char-count-components (car cell) component)
608                            (cdr cell))))))
609            dest)
610           (t
611            0))))
612
613
614 ;;; @ End.
615 ;;;
616
617 (provide 'ids-find)
618
619 ;;; ids-find.el ends here