(www-ids-find-format-line): Encode UTF-8 characters in URL; don't use
[chise/ids.git] / www / www-ids-find.el
1 (require 'ids-find)
2
3 (defun decode-url-string (string &optional coding-system)
4   (if (> (length string) 0)
5       (let ((i 0)
6             dest)
7         (while (string-match "%\\([0-9A-F][0-9A-F]\\)" string i)
8           (setq dest (concat dest
9                              (substring string i (match-beginning 0))
10                              (char-to-string
11                               (int-char
12                                (string-to-int (match-string 1 string) 16))))
13                 i (match-end 0)))
14         (decode-coding-string
15          (concat dest (substring string i))
16          coding-system))))
17
18 (defvar www-ids-find-tang-chars-file-name
19   "~tomo/projects/chise/ids/www/tang-chars.udd")
20
21 (defun www-ids-find-format-line (c is)
22   (let ((str (encode-coding-string (format "%c" c) 'utf-8-er))
23         plane code ucs)
24     (princ
25      (with-temp-buffer
26        (cond
27         ((string-match "&CB\\([0-9]+\\);" str)
28          (setq code (string-to-int (match-string 1 str)))
29          (insert "<a href=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/char-desc?char=")
30          (insert str)
31          (insert (format "\"><img alt=\"CB%05d\" src=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/glyphs/cb-gaiji/%02d/CB%05d.gif\">\n"
32                          code (/ code 1000) code))
33          (insert (format "CB%05d</a>" code))
34          )
35         ((string-match "&JC3-\\([0-9A-F]+\\);" str)
36          (setq code (string-to-int (match-string 1 str) 16))
37          (insert "<a href=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/char-desc?char=")
38          (insert str)
39          (insert (format "\"><img alt=\"JC3-%04X\" src=\"http://kanji.zinbun.kyoto-u.ac.jp/db/CHINA3/Gaiji/%04x.gif\">\n"
40                          code code))
41          (insert (format "JC3-%04X</a>" code))
42          )
43         ((string-match "&J\\(78\\|83\\|90\\|SP\\)-\\([0-9A-F]+\\);" str)
44          (setq plane (match-string 1 str)
45                code (string-to-int (match-string 2 str) 16))
46          (insert "<a href=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/char-desc?char=")
47          (insert str)
48          (insert (format "\"><img alt=\"J%s-%04X\" src=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/glyphs/JIS-%s/%02d-%02d.gif\">\n"
49                          plane code plane
50                          (- (lsh code -8) 32)
51                          (- (logand code 255) 32)))
52          (insert (format "J%s-%04X</a>" plane code))
53          )
54         ((string-match "&G\\([01]\\)-\\([0-9A-F]+\\);" str)
55          (setq plane (string-to-int (match-string 1 str))
56                code (string-to-int (match-string 2 str) 16))
57          (insert "<a href=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/char-desc?char=")
58          (insert str)
59          (insert (format "\"><img alt=\"G%d-%04X\" src=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/glyphs/GB%d/%02d-%02d.gif\">\n"
60                          plane code plane
61                          (- (lsh code -8) 32)
62                          (- (logand code 255) 32)))
63          (insert (format "G%d-%04X</a>" plane code))
64          )
65         ((string-match "&C\\([1-7]\\)-\\([0-9A-F]+\\);" str)
66          (setq plane (string-to-int (match-string 1 str))
67                code (string-to-int (match-string 2 str) 16))
68          (insert "<a href=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/char-desc?char=")
69          (insert str)
70          (insert (format "\"><img alt=\"C%d-%04X\" src=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/glyphs/CNS%d/%04X.gif\">\n"
71                          plane code plane code))
72          (insert (format "C%d-%04X</a>" plane code))
73          )
74         ((string-match "&ZOB-\\([0-9]+\\);" str)
75          (setq code (string-to-int (match-string 1 str)))
76          (insert "<a href=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/char-desc?char=")
77          (insert str)
78          (insert (format "\"><img alt=\"ZOB-%04d\" src=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/glyphs/ZOB-1968/%04d.png\">\n"
79                          code code))
80          (insert (format "ZOB-%04d</a>" code))
81          )
82         (t
83          (insert "<a href=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/char-desc?char=")
84          ;; (insert str)
85          (insert
86           (mapconcat (lambda (c)
87                        (if (<= (char-int c) #x7F)
88                            (char-to-string c)
89                          (format "%%%02X" c)))
90                      str ""))
91          (insert "\">")
92          (insert str)
93          (insert "</a>")
94          ))
95        (goto-char (point-min))
96        (while (search-forward "&" nil t)
97          (replace-match "&amp;" t 'literal))
98        (buffer-string)
99        ))
100     (princ
101      (or (if (setq ucs (or (char-ucs c)
102                            (encode-char c 'ucs)))
103              (format " <a href=\"http://www.unicode.org/cgi-bin/GetUnihanData.pl?codepoint=%X\">%s</a>"
104                      ucs
105                      (cond ((<= ucs #xFFFF)
106                             (format "U+%04X" ucs))
107                            ((<= ucs #x10FFFF)
108                             (format "U-%08X" ucs))))
109            "          ")))
110     (princ " ")
111     (when is
112       (princ
113        (with-temp-buffer
114          (insert
115           (encode-coding-string
116            (ideographic-structure-to-ids is)
117            'utf-8-jp-er))
118          (goto-char (point-min))
119          (while (re-search-forward "&CB\\([0-9]+\\);" nil t)
120            (setq code (string-to-int (match-string 1)))
121            (replace-match
122             (format "<img alt=\"CB%05d\" src=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/glyphs/cb-gaiji/%02d/CB%05d.gif\">"
123                     code (/ code 1000) code)
124             t 'literal))
125          (buffer-string))))
126     (when (and ucs
127                (with-current-buffer
128                    (find-file-noselect
129                     www-ids-find-tang-chars-file-name)
130                  (goto-char (point-min))
131                  (re-search-forward (format "^%d$" ucs) nil t)))
132       (princ
133        (format " <a href=\"http://coe21.zinbun.kyoto-u.ac.jp/djvuchar?query=%s\">"
134                (mapconcat
135                 (lambda (c)
136                   (format "%%%02X" (char-int c)))
137                 (encode-coding-string (char-to-string c)
138                                       'utf-8-jp)
139                 "")))
140       (princ (encode-coding-string "\e$B"M\e(B[\e$BEbBeBsK\\e(B]</a>" 'utf-8-jp-er)))
141     (princ "<br>\n")))
142
143 (defun www-ids-insert-chars-including-components (components)
144   (let (is)
145     (dolist (c (ideographic-products-find components))
146       (setq is (char-feature c 'ideographic-structure))
147       ;; to avoid problems caused by wrong indexes
148       (when (every (lambda (cc)
149                      (ideographic-structure-member cc is))
150                    components)
151         (princ "<li>")
152         (www-ids-find-format-line c is)
153         (princ "<ul>\n")
154         (www-ids-insert-chars-including-components (char-to-string c))
155         (princ "</ul>\n")
156         )
157       )))
158
159 (defun www-batch-ids-find ()
160   (let ((components (car command-line-args-left))
161         (coded-charset-entity-reference-alist
162          (list*
163           '(=cns11643-1         "C1-" 4 X)
164           '(=cns11643-2         "C2-" 4 X)
165           '(=cns11643-3         "C3-" 4 X)
166           '(=cns11643-4         "C4-" 4 X)
167           '(=cns11643-5         "C5-" 4 X)
168           '(=cns11643-6         "C6-" 4 X)
169           '(=cns11643-7         "C7-" 4 X)
170           '(=gb2312             "G0-" 4 X)
171           '(=gb12345            "G1-" 4 X)
172           '(=jis-x0208@1990     "J90-" 4 X)
173           '(=jis-x0212          "JSP-" 4 X)
174           '(=cbeta              "CB" 5 d)
175           '(=jef-china3         "JC3-" 4 X)
176           '(=jis-x0208@1978     "J78-" 4 X)
177           '(=jis-x0208@1983     "J83-" 4 X)
178           '(=daikanwa           "M-" 5 d)
179           coded-charset-entity-reference-alist))
180         )
181     (setq command-line-args-left (cdr command-line-args-left))
182     (cond
183      ((stringp components)
184       (if (string-match "^components=" components)
185           (setq components (substring components (match-end 0))))
186       (setq components
187             (if (> (length components) 0)
188                 (decode-url-string components 'utf-8-jp-er)
189               nil))
190       )
191      (t
192       (setq components nil)
193       ))
194     (princ "Content-Type: text/html; charset=UTF-8
195
196 <!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"
197             \"http://www.w3.org/TR/html4/loose.dtd\">
198 <html lang=\"ja\">
199 <head>
200 <title>CHISE IDS Find</title>
201 </head>
202
203 <body>
204
205 <h1>")
206     (princ (encode-coding-string "CHISE IDS \e$B4A;z8!:w\e(B" 'utf-8-jp-er))
207     (princ "</h1>
208 <p>
209 <form action=\"http://mousai.kanji.zinbun.kyoto-u.ac.jp/ids-find\" method=\"GET\">
210 ")
211     (princ (encode-coding-string "\e$BItIJJ8;zNs\e(B" 'utf-8-jp-er))
212     (princ " <input type=\"text\" name=\"components\" size=\"30\" maxlength=\"30\" value=\"")
213     (if (> (length components) 0)
214         (princ (encode-coding-string components 'utf-8-jp-er)))
215     (princ "\">
216 <input type=\"submit\" value=\"")
217     (princ (encode-coding-string "\e$B8!:w3+;O\e(B" 'utf-8-jp-er))
218     (princ "\">
219 </form>
220
221 ")
222     (cond
223      (components
224       ;; (map-char-attribute
225       ;;  (lambda (c v)
226       ;;    (when (every (lambda (p)
227       ;;                   (ideographic-structure-member p v))
228       ;;                 components)
229       ;;      (princ (encode-coding-string
230       ;;              (ids-find-format-line c v)
231       ;;              'utf-8-jp-er))
232       ;;      (princ "<br>\n")
233       ;;      )
234       ;;    nil)
235       ;;  'ideographic-structure)
236       (when (= (length components) 1)
237         (www-ids-find-format-line (aref components 0)
238                                   (char-feature (aref components 0)
239                                                 'ideographic-structure)))
240       ;; (dolist (c (ideographic-products-find components))
241       ;;   (setq is (char-feature c 'ideographic-structure))
242       ;;   ;; to avoid problems caused by wrong indexes
243       ;;   (when (every (lambda (c)
244       ;;                  (ideographic-structure-member c is))
245       ;;                components)
246       ;;     (www-ids-find-format-line c is)))
247       (princ "<ul>\n")
248       (www-ids-insert-chars-including-components components)
249       (princ "</ul>\n")
250       )
251      (t
252       (princ (encode-coding-string "<hr>
253 <p>
254 \e$B;XDj$7$?ItIJ$rA4$F4^$`4A;z$N0lMw$rI=<($7$^$9!#\e(B
255 <p>
256 CHISE \e$B$GMQ$$$i$l$k<BBV;2>H7A<0!JNc!'\e(B&amp;M-00256;\e$B!K$GItIJ$r;XDj$9$k;v$b$G$-$^$9!#\e(B" 'utf-8-jp-er))
257       ))
258     (princ "<hr>")
259     (princ
260      (format
261       "Powered by <a
262 href=\"http://kanji.zinbun.kyoto-u.ac.jp/projects/chise/xemacs/\"
263 >XEmacs CHISE</a> %s."
264       xemacs-chise-version))
265     (princ "
266 </body>
267 </html>
268 ")))