1 ;;; eword-decode.el --- RFC 2047 based encoded-word decoder for GNU Emacs
3 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
4 ;; 2005 Free Software Foundation, Inc.
6 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
7 ;; MORIOKA Tomohiko <tomo@m17n.org>
8 ;; TANAKA Akira <akr@m17n.org>
10 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
11 ;; Renamed: 1993/06/03 to tiny-mime.el by MORIOKA Tomohiko
12 ;; Renamed: 1995/10/03 to tm-ew-d.el (split off encoder)
13 ;; by MORIOKA Tomohiko
14 ;; Renamed: 1997/02/22 from tm-ew-d.el by MORIOKA Tomohiko
15 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
17 ;; This file is part of FLIM (Faithful Library about Internet Message).
19 ;; This program is free software; you can redistribute it and/or
20 ;; modify it under the terms of the GNU General Public License as
21 ;; published by the Free Software Foundation; either version 2, or (at
22 ;; your option) any later version.
24 ;; This program is distributed in the hope that it will be useful, but
25 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
26 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 ;; General Public License for more details.
29 ;; You should have received a copy of the GNU General Public License
30 ;; along with GNU Emacs; see the file COPYING. If not, write to the
31 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
32 ;; Boston, MA 02110-1301, USA.
40 (eval-when-compile (require 'cl)) ; list*, pop
46 ;; User options are defined in mime-def.el.
49 ;;; @ MIME encoded-word definition
;; Regexp matching the encoded-text part of an encoded-word: one or
;; more characters in the ranges `!'..`>' and `@'..`~'.  That is
;; printable US-ASCII with SPC and `?' left out; `?' is excluded
;; because it delimits the parts of an encoded-word ("=?" ... "?=").
53 (defconst eword-encoded-text-regexp "[!->@-~]+")
55 (defconst eword-encoded-word-regexp
57 (concat (regexp-quote "=?")
59 mime-charset-regexp ; 1
63 mime-language-regexp ; 2
67 mime-encoding-regexp ; 3
71 eword-encoded-text-regexp ; 4
73 (regexp-quote "?="))))
80 (defun eword-decode-string (string &optional must-unfold)
81 "Decode MIME encoded-words in STRING.
83 STRING is unfolded before decoding.
85 If an encoded-word is broken or your emacs implementation can not
86 decode the charset included in it, it is not decoded.
88 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
89 if there are in decoded encoded-words (generated by bad manner MUA
90 such as a version of Net$cape).
92 The language informations specified in the encoded words, if any, are
93 put to the decoded text as the `mime-language' text property."
94 (setq string (std11-unfold-string string))
95 (let ((regexp (concat "[\n\t ]*\\(" eword-encoded-word-regexp "\\)"))
98 (while (setq match (string-match regexp string next))
99 (setq start (match-beginning 1)
102 (setq next (match-end 0))
103 (push (list (match-string 2 string) ;; charset
104 (when (match-beginning 3) ;; language
108 (1+ (match-beginning 3)) (match-end 3)))))
109 (match-string 4 string) ;; encoding
110 (match-string 5 string) ;; encoded-text
111 (match-string 1 string)) ;; encoded-word
113 (setq match (and (string-match regexp string next)
114 (= next (match-beginning 0)))))
115 (setq words (eword-decode-encoded-words (nreverse words) must-unfold)
116 string (concat (substring string 0 start)
118 (substring string next))
119 next (+ start (length words)))))
122 (defun eword-decode-structured-field-body (string
123 &optional start-column max-column
125 (let ((tokens (eword-lexical-analyze string start 'must-unfold))
129 (setq token (car tokens))
130 (setq result (concat result (eword-decode-token token)))
131 (setq tokens (cdr tokens)))
134 (defun eword-decode-and-unfold-structured-field-body (string
139 "Decode and unfold STRING as structured field body.
140 It decodes non us-ascii characters in FULL-NAME encoded as
141 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
142 characters are regarded as variable `default-mime-charset'.
144 If an encoded-word is broken or your emacs implementation can not
145 decode the charset included in it, it is not decoded."
146 (let ((tokens (eword-lexical-analyze string start 'must-unfold))
149 (let* ((token (car tokens))
151 (setq tokens (cdr tokens))
153 (if (eq type 'spaces)
155 (concat result (eword-decode-token token))
159 (defun eword-decode-and-fold-structured-field-body (string
163 (if (and mime-field-decoding-max-size
164 (> (length string) mime-field-decoding-max-size))
167 (setq max-column fill-column))
168 (let ((c start-column)
169 (tokens (eword-lexical-analyze string start 'must-unfold))
172 (while (and (setq token (car tokens))
173 (setq tokens (cdr tokens)))
174 (let* ((type (car token)))
175 (if (eq type 'spaces)
176 (let* ((next-token (car tokens))
177 (next-str (eword-decode-token next-token))
178 (next-len (string-width next-str))
179 (next-c (+ c next-len 1)))
180 (if (< next-c max-column)
181 (setq result (concat result " " next-str)
183 (setq result (concat result "\n " next-str)
185 (setq tokens (cdr tokens))
187 (let* ((str (eword-decode-token token)))
188 (setq result (concat result str)
189 c (+ c (string-width str)))
192 (concat result (eword-decode-token token))
195 (defun eword-decode-unstructured-field-body (string &optional start-column
198 (decode-mime-charset-string string default-mime-charset)))
200 (defun eword-decode-and-unfold-unstructured-field-body (string
201 &optional start-column
204 (decode-mime-charset-string (std11-unfold-string string)
205 default-mime-charset)
208 (defun eword-decode-unfolded-unstructured-field-body (string
209 &optional start-column
212 (decode-mime-charset-string string default-mime-charset)
219 (defun eword-decode-region (start end &optional unfolding must-unfold)
220 "Decode MIME encoded-words in region between START and END.
222 If UNFOLDING is not nil, it unfolds before decoding.
224 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
225 if there are in decoded encoded-words (generated by bad manner MUA
226 such as a version of Net$cape).
228 The language informations specified in the encoded words, if any, are
229 put to the decoded text as the `mime-language' text property."
233 (narrow-to-region start end)
235 (eword-decode-unfold))
236 (goto-char (point-min))
237 (let ((regexp (concat "[\n\t ]*\\(" eword-encoded-word-regexp "\\)"))
239 (while (setq match (re-search-forward regexp nil t))
240 (setq start (match-beginning 1)
243 (goto-char (setq end (match-end 0)))
244 (push (list (match-string 2) ;; charset
245 (when (match-beginning 3) ;; language
248 (buffer-substring (1+ (match-beginning 3))
250 (match-string 4) ;; encoding
251 (match-string 5) ;; encoded-text
252 (match-string 1)) ;; encoded-word
254 (setq match (looking-at regexp)))
255 (delete-region start end)
257 (eword-decode-encoded-words (nreverse words) must-unfold)))))))
259 (defun eword-decode-unfold ()
260 (goto-char (point-min))
262 (while (re-search-forward std11-field-head-regexp nil t)
263 (setq beg (match-beginning 0)
264 end (std11-field-end))
265 (setq field (buffer-substring beg end))
266 (if (string-match eword-encoded-word-regexp field)
268 (narrow-to-region (goto-char beg) end)
269 (while (re-search-forward "\n\\([ \t]\\)" nil t)
270 (replace-match (match-string 1))
272 (goto-char (point-max))
277 ;;; @ for message header
;; Alist of registered field decoders, keyed by presentation mode.
;; Each element has the form (MODE . ((FIELD . FUNCTION) ...)).
;; Entries are added or replaced by `mime-set-field-decoder'.
280 (defvar mime-field-decoder-alist nil)
;; Cache of field decoders, keyed by presentation mode.  Each element
;; has the form (MODE . ((FIELD . FUNCTION) ...)).  Cells are created
;; or overwritten by the function `mime-update-field-decoder-cache'.
282 (defvar mime-field-decoder-cache nil)
;; Holds the function that is `funcall'ed to refresh the decoder
;; cache.  Callers in this file pass it FIELD and MODE and take the
;; `cdr' of the value it returns.  The default is the function of the
;; same name.
284 (defvar mime-update-field-decoder-cache 'mime-update-field-decoder-cache
285 "*Field decoder cache update function.")
288 (defun mime-set-field-decoder (field &rest specs)
289 "Set decoder of FIELD.
290 SPECS must be like `MODE1 DECODER1 MODE2 DECODER2 ...'.
291 Each mode must be `nil', `plain', `wide', `summary' or `nov'.
292 If mode is `nil', corresponding decoder is set up for every modes."
294 (let ((mode (pop specs))
295 (function (pop specs)))
298 (let ((cell (assq mode mime-field-decoder-alist)))
300 (setcdr cell (put-alist field function (cdr cell)))
301 (setq mime-field-decoder-alist
302 (cons (cons mode (list (cons field function)))
303 mime-field-decoder-alist))
305 (apply (function mime-set-field-decoder) field specs)
307 (mime-set-field-decoder field
315 (defmacro mime-find-field-presentation-method (name)
316 "Return field-presentation-method from NAME.
317 NAME must be `plain', `wide', `summary' or `nov'."
319 `(or (assq 'summary mime-field-decoder-cache)
325 (symbolp (car (cdr name)))
326 (null (cdr (cdr name))))
327 `(or (assq ,name mime-field-decoder-cache)
331 `(or (assq (or ,name 'summary) mime-field-decoder-cache)
332 (cons (or ,name 'summary) nil))
335 (defun mime-find-field-decoder-internal (field &optional mode)
336 "Return function to decode field-body of FIELD in MODE.
337 Optional argument MODE must be object of field-presentation-method."
338 (cdr (or (assq field (cdr mode))
340 (funcall mime-update-field-decoder-cache
343 (cdr (assq (car mode) mime-field-decoder-cache)))
347 (defun mime-find-field-decoder (field &optional mode)
348 "Return function to decode field-body of FIELD in MODE.
349 Optional argument MODE must be object or name of
350 field-presentation-method. Name of field-presentation-method must be
351 `plain', `wide', `summary' or `nov'.
352 Default value of MODE is `summary'."
354 (let ((p (cdr (mime-find-field-presentation-method mode))))
355 (if (and p (setq p (assq field p)))
357 (cdr (funcall mime-update-field-decoder-cache
358 field (or mode 'summary)))))
359 (inline (mime-find-field-decoder-internal field mode))
363 (defun mime-update-field-decoder-cache (field mode &optional function)
364 "Update field decoder cache `mime-field-decoder-cache'."
365 (cond ((eq function 'identity)
370 (cdr (assq (or mode 'summary) mime-field-decoder-alist))))
371 (setq function (cdr (or (assq field decoder-alist)
372 (assq t decoder-alist)))))
374 (let ((cell (assq mode mime-field-decoder-cache))
377 (if (setq ret (assq field (cdr cell)))
378 (setcdr ret function)
379 (setcdr cell (cons (setq ret (cons field function)) (cdr cell))))
380 (setq mime-field-decoder-cache
381 (cons (cons mode (list (setq ret (cons field function))))
382 mime-field-decoder-cache)))
;; Register a nil decoder under mode nil for each of the fields below.
;; According to the docstring of `mime-set-field-decoder', mode nil
;; sets the decoder up for every mode.  The fields are registered in
;; the order listed.
(let ((fields '(Archive
                Content-Md5
                Control
                Date
                Distribution
                Followup-Host
                Followup-To
                Lines
                Message-Id
                Newsgroups
                Nntp-Posting-Host
                Path
                Posted-And-Mailed
                Received
                Status
                X-Face
                X-Face-Version
                X-Info
                X-Pgp-Key-Info
                X-Pgp-Sig
                X-Pgp-Sig-Version
                Xref)))
  (while fields
    (mime-set-field-decoder (car fields) nil nil)
    (setq fields (cdr fields))))
411 '(Reply-To Resent-Reply-To From Resent-From Sender Resent-Sender
412 To Resent-To Cc Resent-Cc Bcc Resent-Bcc Dcc
414 Mime-Version Content-Type Content-Transfer-Encoding
415 Content-Disposition User-Agent))
418 (setq field (pop fields))
419 (mime-set-field-decoder
421 'plain #'eword-decode-structured-field-body
422 'wide #'eword-decode-and-fold-structured-field-body
423 'summary #'eword-decode-and-unfold-structured-field-body
424 'nov #'eword-decode-and-unfold-structured-field-body)
427 ;; unstructured fields (default)
428 (mime-set-field-decoder
430 'plain #'eword-decode-unstructured-field-body
431 'wide #'eword-decode-unstructured-field-body
432 'summary #'eword-decode-and-unfold-unstructured-field-body
433 'nov #'eword-decode-unfolded-unstructured-field-body)
436 (defun mime-decode-field-body (field-body field-name
437 &optional mode max-column)
438 "Decode FIELD-BODY as FIELD-NAME in MODE, and return the result.
439 Optional argument MODE must be `plain', `wide', `summary' or `nov'.
440 Default mode is `summary'.
442 If MODE is `wide' and MAX-COLUMN is non-nil, the result is folded with
445 Non MIME encoded-word part in FIELD-BODY is decoded with
446 `default-mime-charset'."
447 (let (field-name-symbol len decoder)
448 (if (symbolp field-name)
449 (setq field-name-symbol field-name
450 len (1+ (string-width (symbol-name field-name))))
451 (setq field-name-symbol (intern (capitalize field-name))
452 len (1+ (string-width field-name))))
453 (setq decoder (mime-find-field-decoder field-name-symbol mode))
455 (funcall decoder field-body len max-column)
457 (if (eq mode 'summary)
458 (std11-unfold-string field-body)
463 (defun mime-decode-header-in-region (start end
464 &optional code-conversion)
465 "Decode MIME encoded-words in region between START and END.
466 If CODE-CONVERSION is nil, it decodes only encoded-words. If it is
467 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
468 Otherwise it decodes non-ASCII bit patterns as the
469 default-mime-charset."
473 (narrow-to-region start end)
474 (let ((default-charset
476 (if (mime-charset-to-coding-system code-conversion)
478 default-mime-charset))))
480 (let ((mode-obj (mime-find-field-presentation-method 'wide))
481 beg p end field-name len field-decoder)
482 (goto-char (point-min))
483 (while (re-search-forward std11-field-head-regexp nil t)
484 (setq beg (match-beginning 0)
486 field-name (buffer-substring beg (1- p))
487 len (string-width field-name)
488 field-name (intern (capitalize field-name))
489 field-decoder (inline
490 (mime-find-field-decoder-internal
491 field-name mode-obj)))
493 (setq end (std11-field-end))
494 (let ((body (buffer-substring p end))
495 (default-mime-charset default-charset))
496 (delete-region p end)
497 (insert (funcall field-decoder body (1+ len)))
500 (eword-decode-region (point-min) (point-max) t)
504 (defun mime-decode-header-in-buffer (&optional code-conversion separator)
505 "Decode MIME encoded-words in header fields.
506 If CODE-CONVERSION is nil, it decodes only encoded-words. If it is
507 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
508 Otherwise it decodes non-ASCII bit patterns as the
509 default-mime-charset.
510 If SEPARATOR is not nil, it is used as header separator."
512 (mime-decode-header-in-region
515 (goto-char (point-min))
516 (if (re-search-forward
517 (concat "^\\(" (regexp-quote (or separator "")) "\\)?$")
;; Keep the old name `eword-decode-header' available as an alias of
;; `mime-decode-header-in-buffer', and mark the old name obsolete in
;; favor of the new one.
;; NOTE(review): Emacs 28 and later require a third WHEN argument to
;; `make-obsolete'; this two-argument call will need one there --
;; confirm against the Emacs versions this file must support.
524 (defalias 'eword-decode-header 'mime-decode-header-in-buffer)
525 (make-obsolete 'eword-decode-header 'mime-decode-header-in-buffer)
528 ;;; @ encoded-words decoder
531 (defvar eword-decode-allow-incomplete-encoded-text t
532 "*Non-nil means allow incomplete encoded-text in successive encoded-words.
533 Dividing of encoded-text in the place other than character boundaries
534 violates RFC2047 section 5, while we have a capability to decode it.
535 If it is non-nil, the decoder will decode B- or Q-encoding in each
536 encoded-word, concatenate them, and decode it by charset. Otherwise,
537 the decoder will fully decode each encoded-word before concatenating
540 (defun eword-decode-encoded-words (words must-unfold)
541 "Decode successive encoded-words in WORDS and return a decoded string.
542 Each element of WORDS looks like (CHARSET LANGUAGE ENCODING ENCODED-TEXT
545 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
546 if there are in decoded encoded-words (generated by bad manner MUA
547 such as a version of Net$cape)."
548 (let (word language charset encoding text rest)
550 (setq word (pop words)
551 language (nth 1 word))
552 (if (and (or (mime-charset-to-coding-system (setq charset (car word)))
554 (message "Unknown charset: %s" charset)
556 (cond ((member (setq encoding (nth 2 word)) '("B" "Q"))
558 ((member encoding '("b" "q"))
559 (setq encoding (upcase encoding)))
561 (message "Invalid encoding: %s" encoding)
565 (encoded-text-decode-string (nth 3 word) encoding))
567 (message "%s" (error-message-string err))
569 (if (and eword-decode-allow-incomplete-encoded-text
572 (string-equal (downcase charset) (downcase (caaar rest)))
573 (equal language (cdaar rest)))
574 ;; Concatenate text of which the charset is the same.
575 (setcdr (car rest) (concat (cdar rest) text))
576 (push (cons (cons charset language) text) rest))
577 ;; Don't decode encoded-word.
578 (push (cons (cons nil language) (nth 4 word)) rest)))
580 (setq word (or (and (setq charset (caaar rest))
582 (decode-mime-charset-string (cdar rest) charset)
584 (message "%s" (error-message-string err))
586 (concat (when (cdr rest) " ")
589 (not (eq (string-to-char words) ? )))
592 (setq word (mapconcat (lambda (chr)
593 (cond ((eq chr ?\n) "")
596 (t (char-to-string chr))))
597 (std11-unfold-string word)
599 (when (setq language (cdaar rest))
600 (put-text-property 0 (length word) 'mime-language language word))
601 (setq words (concat word words)
605 ;;; @ lexical analyze
;; Cache used by `eword-lexical-analyze'.  It is searched with
;; `assoc', using the property-stripped input string as the key.
608 (defvar eword-lexical-analyze-cache nil)
;; `eword-lexical-analyze' passes this value to `nthcdr' on the cache
;; list to find the cell at the maximum position.
609 (defvar eword-lexical-analyze-cache-max 299
610 "*Max position of eword-lexical-analyze-cache.
611 It is max size of eword-lexical-analyze-cache - 1.")
613 (defvar mime-header-lexical-analyzer
614 '(eword-analyze-quoted-string
615 eword-analyze-domain-literal
616 eword-analyze-comment
618 eword-analyze-special
619 eword-analyze-encoded-word
621 "*List of functions to return result of lexical analyze.
622 Each function must have three arguments: STRING, START and MUST-UNFOLD.
623 STRING is the target string to be analyzed.
624 START is start position of STRING to analyze.
625 If MUST-UNFOLD is not nil, each function must unfold and eliminate
626 bare-CR and bare-LF from the result even if they are included in
627 content of the encoded-word.
628 Each function must return nil if it can not analyze STRING as its
631 Previous function is preferred to next function. If a function
632 returns nil, next function is used. Otherwise the return value will
635 (defun eword-analyze-quoted-string (string start &optional must-unfold)
636 (let ((p (std11-check-enclosure string ?\" ?\" nil start))
639 (setq ret (decode-mime-charset-string
640 (std11-strip-quoted-pair
641 (substring string (1+ start) (1- p)))
642 default-mime-charset))
643 (if mime-header-accept-quoted-encoded-words
644 (setq ret (eword-decode-string ret)))
645 (cons (cons 'quoted-string ret)
;; Analyzer for domain literals: delegates to
;; `std11-analyze-domain-literal' with STRING and START and returns
;; its value unchanged.  MUST-UNFOLD is accepted so the function has
;; the three-argument signature required of the members of
;; `mime-header-lexical-analyzer', but it is not used here.
648 (defun eword-analyze-domain-literal (string start &optional must-unfold)
649 (std11-analyze-domain-literal string start))
651 (defun eword-analyze-comment (string from &optional must-unfold)
652 (let ((len (length string))
657 (eq (aref string i) ?\())
662 (setq chr (aref string i))
668 (setq last-str (concat last-str
669 (substring string from (1- i))
670 (char-to-string (aref string i)))
675 (setq ret (concat last-str
676 (substring string from i)))
684 (decode-mime-charset-string
685 ret default-mime-charset)
692 (if (setq ret (eword-analyze-comment string i must-unfold))
695 (substring string from i))
697 (if (string= last-str "")
698 (cons (car ret) dest)
701 (decode-mime-charset-string
702 last-str default-mime-charset)
;; Analyzer for runs of white space: delegates to
;; `std11-analyze-spaces' with STRING and START and returns its value
;; unchanged.  MUST-UNFOLD is accepted for signature compatibility
;; with the other eword-analyze-* functions, but it is not used here.
716 (defun eword-analyze-spaces (string start &optional must-unfold)
717 (std11-analyze-spaces string start))
;; Analyzer for special characters: delegates to
;; `std11-analyze-special' with STRING and START and returns its
;; value unchanged.  MUST-UNFOLD is accepted so the function has the
;; three-argument signature required of the members of
;; `mime-header-lexical-analyzer', but it is not used here.
719 (defun eword-analyze-special (string start &optional must-unfold)
720 (std11-analyze-special string start))
722 (defun eword-analyze-encoded-word (string start &optional must-unfold)
723 (let* ((regexp (concat "[\n\t ]*\\(" eword-encoded-word-regexp "\\)"))
724 (match (and (string-match regexp string start)
725 (= start (match-beginning 0))))
728 (setq next (match-end 0))
729 (push (list (match-string 2 string) ;; charset
730 (when (match-beginning 3) ;; language
734 (1+ (match-beginning 3)) (match-end 3)))))
735 (match-string 4 string) ;; encoding
736 (match-string 5 string) ;; encoded-text
737 (match-string 1 string)) ;; encoded-word
739 (setq match (and (string-match regexp string next)
740 (= next (match-beginning 0)))))
742 (cons (cons 'atom (eword-decode-encoded-words (nreverse words)
746 (defun eword-analyze-atom (string start &optional must-unfold)
747 (if (and (string-match std11-atom-regexp string start)
748 (= (match-beginning 0) start))
749 (let ((end (match-end 0)))
750 (cons (cons 'atom (decode-mime-charset-string
751 (substring string start end)
752 default-mime-charset))
753 ;;(substring string end)
757 (defun eword-lexical-analyze-internal (string start must-unfold)
758 (let ((len (length string))
762 (let ((rest mime-header-lexical-analyzer)
764 (while (and (setq func (car rest))
766 (setq r (funcall func string start must-unfold)))
768 (setq rest (cdr rest)))
770 (cons (cons 'error (substring string start)) (1+ len)))
772 (setq dest (cons (car ret) dest)
778 (defun eword-lexical-analyze (string &optional start must-unfold)
779 "Return lexical analyzed list corresponding STRING.
780 It is like std11-lexical-analyze, but it decodes non us-ascii
781 characters encoded as encoded-words or invalid \"raw\" format.
782 \"Raw\" non us-ascii characters are regarded as variable
783 `default-mime-charset'."
784 (let ((key (substring string (or start 0)))
786 (set-text-properties 0 (length key) nil key)
787 (if (setq ret (assoc key eword-lexical-analyze-cache))
789 (setq ret (eword-lexical-analyze-internal key 0 must-unfold))
790 (setq eword-lexical-analyze-cache
792 eword-lexical-analyze-cache))
793 (if (cdr (setq cell (nthcdr eword-lexical-analyze-cache-max
794 eword-lexical-analyze-cache)))
798 (defun eword-decode-token (token)
799 (let ((type (car token))
801 (cond ((eq type 'quoted-string)
802 (std11-wrap-as-quoted-string value))
806 (setq dest (concat dest
807 (if (stringp (car value))
808 (std11-wrap-as-quoted-pairs
809 (car value) '(?( ?)))
810 (eword-decode-token (car value))
814 (concat "(" dest ")")
818 (defun eword-extract-address-components (string &optional start)
819 "Extract full name and canonical address from STRING.
820 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
821 If no name can be extracted, FULL-NAME will be nil.
822 It decodes non us-ascii characters in FULL-NAME encoded as
823 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
824 characters are regarded as variable `default-mime-charset'."
825 (let* ((structure (car (std11-parse-address
826 (eword-lexical-analyze
827 (std11-unfold-string string) start
829 (phrase (std11-full-name-string structure))
830 (address (std11-address-string structure))
832 (list phrase address)
839 (provide 'eword-decode)
841 ;;; eword-decode.el ends here