1 ;;; eword-decode.el --- RFC 2047 based encoded-word decoder for GNU Emacs
3 ;; Copyright (C) 1995,1996,1997,1998 Free Software Foundation, Inc.
5 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
6 ;; MORIOKA Tomohiko <morioka@jaist.ac.jp>
7 ;; Maintainer: MORIOKA Tomohiko <morioka@jaist.ac.jp>
9 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
10 ;; Renamed: 1993/06/03 to tiny-mime.el
11 ;; Renamed: 1995/10/03 from tiny-mime.el (split off encoder)
12 ;; Renamed: 1997/02/22 from tm-ew-d.el
13 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
15 ;; This file is part of SEMI (Spadework for Emacs MIME Interfaces).
17 ;; This program is free software; you can redistribute it and/or
18 ;; modify it under the terms of the GNU General Public License as
19 ;; published by the Free Software Foundation; either version 2, or (at
20 ;; your option) any later version.
22 ;; This program is distributed in the hope that it will be useful, but
23 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
24 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25 ;; General Public License for more details.
27 ;; You should have received a copy of the GNU General Public License
28 ;; along with GNU Emacs; see the file COPYING. If not, write to the
29 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
30 ;; Boston, MA 02111-1307, USA.
34 (require 'std11-parse)
38 (defgroup eword-decode nil
39 "Encoded-word decoding"
43 ;;; @ MIME encoded-word definition
46 (defconst eword-encoded-text-regexp "[!->@-~]+")
47 (defconst eword-encoded-word-regexp
48 (concat (regexp-quote "=?")
56 eword-encoded-text-regexp
64 (defconst base64-token-regexp "[A-Za-z0-9+/]")
65 (defconst base64-token-padding-regexp "[A-Za-z0-9+/=]")
67 (defconst eword-B-encoded-text-regexp
76 base64-token-padding-regexp
77 base64-token-padding-regexp
80 ;; (defconst eword-B-encoding-and-encoded-text-regexp
81 ;; (concat "\\(B\\)\\?" eword-B-encoded-text-regexp))
84 ;;; @@ Quoted-Printable
87 (defconst quoted-printable-hex-chars "0123456789ABCDEF")
88 (defconst quoted-printable-octet-regexp
89 (concat "=[" quoted-printable-hex-chars
90 "][" quoted-printable-hex-chars "]"))
92 (defconst eword-Q-encoded-text-regexp
93 (concat "\\([^=?]\\|" quoted-printable-octet-regexp "\\)+"))
94 ;; (defconst eword-Q-encoding-and-encoded-text-regexp
95 ;; (concat "\\(Q\\)\\?" eword-Q-encoded-text-regexp))
101 (defun eword-decode-string (string &optional must-unfold)
102 "Decode MIME encoded-words in STRING.
104 STRING is unfolded before decoding.
106 If an encoded-word is broken or your emacs implementation can not
107 decode the charset included in it, it is not decoded.
109 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
110 if there are in decoded encoded-words (generated by bad manner MUA
111 such as a version of Net$cape)."
112 (setq string (std11-unfold-string string))
113 (let ((dest "")(ew nil)
115 (while (and (string-match eword-encoded-word-regexp string)
116 (setq beg (match-beginning 0)
122 (string-match "^[ \t]+$" (substring string 0 beg))
124 (setq dest (concat dest (substring string 0 beg)))
129 (eword-decode-encoded-word
130 (substring string beg end) must-unfold)
132 (setq string (substring string end))
142 (defun eword-decode-region (start end &optional unfolding must-unfold)
143 "Decode MIME encoded-words in region between START and END.
145 If UNFOLDING is not nil, it unfolds before decoding.
147 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
148 if there are in decoded encoded-words (generated by bad manner MUA
149 such as a version of Net$cape)."
153 (narrow-to-region start end)
155 (eword-decode-unfold)
157 (goto-char (point-min))
158 (while (re-search-forward (concat "\\(" eword-encoded-word-regexp "\\)"
160 "\\(" eword-encoded-word-regexp "\\)")
162 (replace-match "\\1\\6")
163 (goto-char (point-min))
165 (while (re-search-forward eword-encoded-word-regexp nil t)
166 (insert (eword-decode-encoded-word
168 (buffer-substring (match-beginning 0) (match-end 0))
169 (delete-region (match-beginning 0) (match-end 0))
175 ;;; @ for message header
178 (defcustom eword-decode-ignored-field-list
179 '(newsgroups path lines nntp-posting-host message-id date)
180 "*List of field-names to be ignored when decoding.
181 Each field name must be symbol."
183 :type '(repeat symbol))
185 (defcustom eword-decode-structured-field-list
186 '(reply-to resent-reply-to from resent-from sender resent-sender
187 to resent-to cc resent-cc bcc resent-bcc dcc
188 mime-version content-type content-transfer-encoding
190 "*List of field-names to decode as structured field.
191 Each field name must be symbol."
193 :type '(repeat symbol))
195 (defun eword-decode-header (&optional code-conversion separator)
196 "Decode MIME encoded-words in header fields.
197 If CODE-CONVERSION is nil, it decodes only encoded-words. If it is
198 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
199 Otherwise it decodes non-ASCII bit patterns as the
200 default-mime-charset.
201 If SEPARATOR is not nil, it is used as header separator."
205 (std11-narrow-to-header separator)
206 (let ((default-charset
208 (if (mime-charset-to-coding-system code-conversion)
210 default-mime-charset))))
212 (let (beg p end field-name len)
213 (goto-char (point-min))
214 (while (re-search-forward std11-field-head-regexp nil t)
215 (setq beg (match-beginning 0)
217 field-name (buffer-substring beg (1- p))
218 len (string-width field-name)
219 field-name (intern (downcase field-name))
220 end (std11-field-end))
221 (cond ((memq field-name eword-decode-ignored-field-list)
224 ((memq field-name eword-decode-structured-field-list)
225 ;; Decode as structured field
226 (let ((body (buffer-substring p end))
227 (default-mime-charset default-charset))
228 (delete-region p end)
229 (insert (eword-decode-and-fold-structured-field
233 ;; Decode as unstructured field
235 (narrow-to-region beg (1+ end))
236 (decode-mime-charset-region p end default-charset)
238 (if (re-search-forward eword-encoded-word-regexp
240 (eword-decode-region beg (point-max) 'unfold))
242 (eword-decode-region (point-min) (point-max) t)
245 (defun eword-decode-unfold ()
246 (goto-char (point-min))
248 (while (re-search-forward std11-field-head-regexp nil t)
249 (setq beg (match-beginning 0)
250 end (std11-field-end))
251 (setq field (buffer-substring beg end))
252 (if (string-match eword-encoded-word-regexp field)
254 (narrow-to-region (goto-char beg) end)
255 (while (re-search-forward "\n\\([ \t]\\)" nil t)
256 (replace-match (match-string 1))
258 (goto-char (point-max))
263 ;;; @ encoded-word decoder
266 (defvar eword-warning-face nil "Face used for invalid encoded-word.")
268 (defun eword-decode-encoded-word (word &optional must-unfold)
269 "Decode WORD if it is an encoded-word.
271 If your emacs implementation can not decode the charset of WORD, it
272 returns WORD. Similarly the encoded-word is broken, it returns WORD.
274 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
275 if there are in decoded encoded-word (generated by bad manner MUA such
276 as a version of Net$cape)."
277 (or (if (string-match eword-encoded-word-regexp word)
279 (substring word (match-beginning 1) (match-end 1))
283 (substring word (match-beginning 2) (match-end 2))
286 (substring word (match-beginning 3) (match-end 3))
289 (eword-decode-encoded-text charset encoding text must-unfold)
292 (add-text-properties 0 (length word)
293 (and eword-warning-face
294 (list 'face eword-warning-face))
301 ;;; @ encoded-text decoder
304 (defun eword-decode-encoded-text (charset encoding string
305 &optional must-unfold)
306 "Decode STRING as an encoded-text.
308 If your emacs implementation can not decode CHARSET, it returns nil.
310 If ENCODING is not \"B\" or \"Q\", it occurs error.
311 So you should write error-handling code if you don't want break by errors.
313 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
314 if there are in decoded encoded-text (generated by bad manner MUA such
315 as a version of Net$cape)."
316 (let ((cs (mime-charset-to-coding-system charset)))
320 ((string-equal "B" encoding)
321 (if (and (string-match eword-B-encoded-text-regexp string)
322 (string-equal string (match-string 0 string)))
323 (base64-decode-string string)
324 (error "Invalid encoded-text %s" string)))
325 ((string-equal "Q" encoding)
326 (if (and (string-match eword-Q-encoded-text-regexp string)
327 (string-equal string (match-string 0 string)))
328 (q-encoding-decode-string string)
329 (error "Invalid encoded-text %s" string)))
331 (error "Invalid encoding %s" encoding)
336 (setq dest (decode-coding-string dest cs))
343 (t (char-to-string chr)))
345 (std11-unfold-string dest)
351 ;;; @ lexical analyze
354 (defvar eword-lexical-analyze-cache nil)
355 (defvar eword-lexical-analyze-cache-max 299
356 "*Max position of eword-lexical-analyze-cache.
357 It is max size of eword-lexical-analyze-cache - 1.")
359 (defcustom eword-lexical-analyzers
360 '(eword-analyze-quoted-string
361 eword-analyze-domain-literal
362 eword-analyze-comment
364 eword-analyze-special
365 eword-analyze-encoded-word
367 "*List of functions to return result of lexical analyze.
368 Each function must have two arguments: STRING and MUST-UNFOLD.
369 STRING is the target string to be analyzed.
370 If MUST-UNFOLD is not nil, each function must unfold and eliminate
371 bare-CR and bare-LF from the result even if they are included in
372 content of the encoded-word.
373 Each function must return nil if it can not analyze STRING as its
376 Previous function is preferred to next function. If a function
377 returns nil, next function is used. Otherwise the return value will
380 :type '(repeat function))
382 (defun eword-analyze-quoted-string (string &optional must-unfold)
383 (let ((p (std11-check-enclosure string ?\" ?\")))
385 (cons (cons 'quoted-string
386 (decode-mime-charset-string
387 (std11-strip-quoted-pair (substring string 1 (1- p)))
388 default-mime-charset))
389 (substring string p))
392 (defun eword-analyze-domain-literal (string &optional must-unfold)
393 (std11-analyze-domain-literal string))
395 (defun eword-analyze-comment (string &optional must-unfold)
396 (let ((p (std11-check-enclosure string ?\( ?\) t)))
400 (decode-mime-charset-string
401 (std11-strip-quoted-pair (substring string 1 (1- p)))
402 default-mime-charset)
404 (substring string p))
407 (defun eword-analyze-spaces (string &optional must-unfold)
408 (std11-analyze-spaces string))
410 (defun eword-analyze-special (string &optional must-unfold)
411 (std11-analyze-special string))
413 (defun eword-analyze-encoded-word (string &optional must-unfold)
414 (if (eq (string-match eword-encoded-word-regexp string) 0)
415 (let ((end (match-end 0))
416 (dest (eword-decode-encoded-word (match-string 0 string)
419 (setq string (substring string end))
420 (while (eq (string-match `,(concat "[ \t\n]*\\("
421 eword-encoded-word-regexp
425 (setq end (match-end 0))
428 (eword-decode-encoded-word (match-string 1 string)
430 string (substring string end))
432 (cons (cons 'atom dest) string)
435 (defun eword-analyze-atom (string &optional must-unfold)
436 (if (string-match std11-atom-regexp string)
437 (let ((end (match-end 0)))
438 (cons (cons 'atom (decode-mime-charset-string
439 (substring string 0 end)
440 default-mime-charset))
441 (substring string end)
444 (defun eword-lexical-analyze-internal (string must-unfold)
446 (while (not (string-equal string ""))
448 (let ((rest eword-lexical-analyzers)
450 (while (and (setq func (car rest))
451 (null (setq r (funcall func string must-unfold)))
453 (setq rest (cdr rest)))
454 (or r `((error . ,string) . ""))
456 (setq dest (cons (car ret) dest))
457 (setq string (cdr ret))
462 (defun eword-lexical-analyze (string &optional must-unfold)
463 "Return lexical analyzed list corresponding STRING.
464 It is like std11-lexical-analyze, but it decodes non us-ascii
465 characters encoded as encoded-words or invalid \"raw\" format.
466 \"Raw\" non us-ascii characters are regarded as variable
467 `default-mime-charset'."
468 (let ((key (copy-sequence string))
470 (set-text-properties 0 (length key) nil key)
471 (if (setq ret (assoc key eword-lexical-analyze-cache))
473 (setq ret (eword-lexical-analyze-internal key must-unfold))
474 (setq eword-lexical-analyze-cache
476 (last eword-lexical-analyze-cache
477 eword-lexical-analyze-cache-max)))
480 (defun eword-decode-token (token)
481 (let ((type (car token))
483 (cond ((eq type 'quoted-string)
484 (std11-wrap-as-quoted-string value))
486 (concat "(" (std11-wrap-as-quoted-pairs value '(?( ?))) ")"))
489 (defun eword-decode-and-fold-structured-field
490 (string start-column &optional max-column must-unfold)
491 "Decode and fold (fill) STRING as structured field body.
492 It decodes non us-ascii characters in FULL-NAME encoded as
493 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
494 characters are regarded as variable `default-mime-charset'.
496 If an encoded-word is broken or your emacs implementation can not
497 decode the charset included in it, it is not decoded.
499 If MAX-COLUMN is omitted, `fill-column' is used.
501 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
502 if there are in decoded encoded-words (generated by bad manner MUA
503 such as a version of Net$cape)."
505 (setq max-column fill-column))
506 (let ((c start-column)
507 (tokens (eword-lexical-analyze string must-unfold))
510 (while (and (setq token (car tokens))
511 (setq tokens (cdr tokens)))
512 (let* ((type (car token)))
513 (if (eq type 'spaces)
514 (let* ((next-token (car tokens))
515 (next-str (eword-decode-token next-token))
516 (next-len (string-width next-str))
517 (next-c (+ c next-len 1)))
518 (if (< next-c max-column)
519 (setq result (concat result " " next-str)
521 (setq result (concat result "\n " next-str)
523 (setq tokens (cdr tokens))
525 (let* ((str (eword-decode-token token)))
526 (setq result (concat result str)
527 c (+ c (string-width str)))
530 (concat result (eword-decode-token token))
533 (defun eword-decode-and-unfold-structured-field (string)
534 "Decode and unfold STRING as structured field body.
535 It decodes non us-ascii characters in FULL-NAME encoded as
536 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
537 characters are regarded as variable `default-mime-charset'.
539 If an encoded-word is broken or your emacs implementation can not
540 decode the charset included in it, it is not decoded."
541 (let ((tokens (eword-lexical-analyze string 'must-unfold))
544 (let* ((token (car tokens))
546 (setq tokens (cdr tokens))
548 (if (eq type 'spaces)
550 (concat result (eword-decode-token token))
554 (defun eword-decode-structured-field-body (string &optional must-unfold
555 start-column max-column)
556 "Decode non us-ascii characters in STRING as structured field body.
557 STRING is unfolded before decoding.
559 It decodes non us-ascii characters in FULL-NAME encoded as
560 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
561 characters are regarded as variable `default-mime-charset'.
563 If an encoded-word is broken or your emacs implementation can not
564 decode the charset included in it, it is not decoded.
566 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
567 if there are in decoded encoded-words (generated by bad manner MUA
568 such as a version of Net$cape)."
570 ;; fold with max-column
571 (eword-decode-and-fold-structured-field
572 string start-column max-column must-unfold)
574 (mapconcat (function eword-decode-token)
575 (eword-lexical-analyze string must-unfold)
579 (defun eword-decode-unstructured-field-body (string &optional must-unfold)
580 "Decode non us-ascii characters in STRING as unstructured field body.
581 STRING is unfolded before decoding.
583 It decodes non us-ascii characters in FULL-NAME encoded as
584 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
585 characters are regarded as variable `default-mime-charset'.
587 If an encoded-word is broken or your emacs implementation can not
588 decode the charset included in it, it is not decoded.
590 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
591 if there are in decoded encoded-words (generated by bad manner MUA
592 such as a version of Net$cape)."
594 (decode-mime-charset-string string default-mime-charset)
597 (defun eword-extract-address-components (string)
598 "Extract full name and canonical address from STRING.
599 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
600 If no name can be extracted, FULL-NAME will be nil.
601 It decodes non us-ascii characters in FULL-NAME encoded as
602 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
603 characters are regarded as variable `default-mime-charset'."
604 (let* ((structure (car (std11-parse-address
605 (eword-lexical-analyze
606 (std11-unfold-string string) 'must-unfold))))
607 (phrase (std11-full-name-string structure))
608 (address (std11-address-string structure))
610 (list phrase address)
617 (provide 'eword-decode)
619 ;;; eword-decode.el ends here