1 ;;; eword-decode.el --- RFC 2047 based encoded-word decoder for GNU Emacs
3 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
4 ;; 2005 Free Software Foundation, Inc.
6 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
7 ;; MORIOKA Tomohiko <tomo@m17n.org>
8 ;; TANAKA Akira <akr@m17n.org>
10 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
11 ;; Renamed: 1993/06/03 to tiny-mime.el by MORIOKA Tomohiko
12 ;; Renamed: 1995/10/03 to tm-ew-d.el (split off encoder)
13 ;; by MORIOKA Tomohiko
14 ;; Renamed: 1997/02/22 from tm-ew-d.el by MORIOKA Tomohiko
15 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
17 ;; This file is part of FLIM (Faithful Library about Internet Message).
19 ;; This program is free software; you can redistribute it and/or
20 ;; modify it under the terms of the GNU General Public License as
21 ;; published by the Free Software Foundation; either version 2, or (at
22 ;; your option) any later version.
24 ;; This program is distributed in the hope that it will be useful, but
25 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
26 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 ;; General Public License for more details.
29 ;; You should have received a copy of the GNU General Public License
30 ;; along with GNU Emacs; see the file COPYING. If not, write to the
31 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
32 ;; Boston, MA 02110-1301, USA.
40 (eval-when-compile (require 'cl)) ; list*, pop
46 ;; User options are defined in mime-def.el.
49 ;;; @ MIME encoded-word definition
53 (defconst eword-encoded-text-regexp "[!->@-~]+")
55 (defconst eword-encoded-word-regexp
57 (concat (regexp-quote "=?")
59 mime-charset-regexp ; 1
63 mime-language-regexp ; 2
67 mime-encoding-regexp ; 3
71 eword-encoded-text-regexp ; 4
73 (regexp-quote "?="))))
80 (defun eword-decode-string (string &optional must-unfold)
81 "Decode MIME encoded-words in STRING.
83 STRING is unfolded before decoding.
85 If an encoded-word is broken or your emacs implementation can not
86 decode the charset included in it, it is not decoded.
88 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
89 if there are in decoded encoded-words (generated by bad manner MUA
90 such as a version of Net$cape)."
91 (setq string (std11-unfold-string string))
92 (let ((regexp (concat "[\n\t ]*\\(" eword-encoded-word-regexp "\\)"))
95 (while (setq match (string-match regexp string next))
96 (setq start (match-beginning 1)
99 (setq next (match-end 0))
100 (push (list (match-string 2 string) ;; charset
101 (when (match-beginning 3) ;; language
105 (1+ (match-beginning 3)) (match-end 3)))))
106 (match-string 4 string) ;; encoding
107 (match-string 5 string) ;; encoded-text
108 (match-string 1 string)) ;; encoded-word
110 (setq match (and (string-match regexp string next)
111 (= next (match-beginning 0)))))
112 (setq words (eword-decode-encoded-words (nreverse words) must-unfold)
113 string (concat (substring string 0 start)
115 (substring string next))
116 next (+ start (length words)))))
119 (defun eword-decode-structured-field-body (string
120 &optional start-column max-column
122 (let ((tokens (eword-lexical-analyze string start 'must-unfold))
126 (setq token (car tokens))
127 (setq result (concat result (eword-decode-token token)))
128 (setq tokens (cdr tokens)))
131 (defun eword-decode-and-unfold-structured-field-body (string
136 "Decode and unfold STRING as structured field body.
137 It decodes non us-ascii characters in FULL-NAME encoded as
138 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
139 characters are regarded as variable `default-mime-charset'.
141 If an encoded-word is broken or your emacs implementation can not
142 decode the charset included in it, it is not decoded."
143 (let ((tokens (eword-lexical-analyze string start 'must-unfold))
146 (let* ((token (car tokens))
148 (setq tokens (cdr tokens))
150 (if (eq type 'spaces)
152 (concat result (eword-decode-token token))
156 (defun eword-decode-and-fold-structured-field-body (string
160 (if (and mime-field-decoding-max-size
161 (> (length string) mime-field-decoding-max-size))
164 (setq max-column fill-column))
165 (let ((c start-column)
166 (tokens (eword-lexical-analyze string start 'must-unfold))
169 (while (and (setq token (car tokens))
170 (setq tokens (cdr tokens)))
171 (let* ((type (car token)))
172 (if (eq type 'spaces)
173 (let* ((next-token (car tokens))
174 (next-str (eword-decode-token next-token))
175 (next-len (string-width next-str))
176 (next-c (+ c next-len 1)))
177 (if (< next-c max-column)
178 (setq result (concat result " " next-str)
180 (setq result (concat result "\n " next-str)
182 (setq tokens (cdr tokens))
184 (let* ((str (eword-decode-token token)))
185 (setq result (concat result str)
186 c (+ c (string-width str)))
189 (concat result (eword-decode-token token))
192 (defun eword-decode-unstructured-field-body (string &optional start-column
195 (decode-mime-charset-string string default-mime-charset)))
197 (defun eword-decode-and-unfold-unstructured-field-body (string
198 &optional start-column
201 (decode-mime-charset-string (std11-unfold-string string)
202 default-mime-charset)
205 (defun eword-decode-unfolded-unstructured-field-body (string
206 &optional start-column
209 (decode-mime-charset-string string default-mime-charset)
216 (defun eword-decode-region (start end &optional unfolding must-unfold)
217 "Decode MIME encoded-words in region between START and END.
219 If UNFOLDING is not nil, it unfolds before decoding.
221 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
222 if there are in decoded encoded-words (generated by bad manner MUA
223 such as a version of Net$cape)."
227 (narrow-to-region start end)
229 (eword-decode-unfold))
230 (goto-char (point-min))
231 (let ((regexp (concat "[\n\t ]*\\(" eword-encoded-word-regexp "\\)"))
233 (while (setq match (re-search-forward regexp nil t))
234 (setq start (match-beginning 1)
237 (goto-char (setq end (match-end 0)))
238 (push (list (match-string 2) ;; charset
239 (when (match-beginning 3) ;; language
242 (buffer-substring (1+ (match-beginning 3))
244 (match-string 4) ;; encoding
245 (match-string 5) ;; encoded-text
246 (match-string 1)) ;; encoded-word
248 (setq match (looking-at regexp)))
249 (delete-region start end)
251 (eword-decode-encoded-words (nreverse words) must-unfold)))))))
253 (defun eword-decode-unfold ()
254 (goto-char (point-min))
256 (while (re-search-forward std11-field-head-regexp nil t)
257 (setq beg (match-beginning 0)
258 end (std11-field-end))
259 (setq field (buffer-substring beg end))
260 (if (string-match eword-encoded-word-regexp field)
262 (narrow-to-region (goto-char beg) end)
263 (while (re-search-forward "\n\\([ \t]\\)" nil t)
264 (replace-match (match-string 1))
266 (goto-char (point-max))
271 ;;; @ for message header
274 (defvar mime-field-decoder-alist nil)
276 (defvar mime-field-decoder-cache nil)
278 (defvar mime-update-field-decoder-cache 'mime-update-field-decoder-cache
279 "*Field decoder cache update function.")
282 (defun mime-set-field-decoder (field &rest specs)
283 "Set decoder of FIELD.
284 SPECS must be like `MODE1 DECODER1 MODE2 DECODER2 ...'.
285 Each mode must be `nil', `plain', `wide', `summary' or `nov'.
286 If mode is `nil', corresponding decoder is set up for every modes."
288 (let ((mode (pop specs))
289 (function (pop specs)))
292 (let ((cell (assq mode mime-field-decoder-alist)))
294 (setcdr cell (put-alist field function (cdr cell)))
295 (setq mime-field-decoder-alist
296 (cons (cons mode (list (cons field function)))
297 mime-field-decoder-alist))
299 (apply (function mime-set-field-decoder) field specs)
301 (mime-set-field-decoder field
309 (defmacro mime-find-field-presentation-method (name)
310 "Return field-presentation-method from NAME.
311 NAME must be `plain', `wide', `summary' or `nov'."
313 `(or (assq 'summary mime-field-decoder-cache)
319 (symbolp (car (cdr name)))
320 (null (cdr (cdr name))))
321 `(or (assq ,name mime-field-decoder-cache)
325 `(or (assq (or ,name 'summary) mime-field-decoder-cache)
326 (cons (or ,name 'summary) nil))
329 (defun mime-find-field-decoder-internal (field &optional mode)
330 "Return function to decode field-body of FIELD in MODE.
331 Optional argument MODE must be object of field-presentation-method."
332 (cdr (or (assq field (cdr mode))
334 (funcall mime-update-field-decoder-cache
337 (cdr (assq (car mode) mime-field-decoder-cache)))
341 (defun mime-find-field-decoder (field &optional mode)
342 "Return function to decode field-body of FIELD in MODE.
343 Optional argument MODE must be object or name of
344 field-presentation-method. Name of field-presentation-method must be
345 `plain', `wide', `summary' or `nov'.
346 Default value of MODE is `summary'."
348 (let ((p (cdr (mime-find-field-presentation-method mode))))
349 (if (and p (setq p (assq field p)))
351 (cdr (funcall mime-update-field-decoder-cache
352 field (or mode 'summary)))))
353 (inline (mime-find-field-decoder-internal field mode))
357 (defun mime-update-field-decoder-cache (field mode &optional function)
358 "Update field decoder cache `mime-field-decoder-cache'."
359 (cond ((eq function 'identity)
364 (cdr (assq (or mode 'summary) mime-field-decoder-alist))))
365 (setq function (cdr (or (assq field decoder-alist)
366 (assq t decoder-alist)))))
368 (let ((cell (assq mode mime-field-decoder-cache))
371 (if (setq ret (assq field (cdr cell)))
372 (setcdr ret function)
373 (setcdr cell (cons (setq ret (cons field function)) (cdr cell))))
374 (setq mime-field-decoder-cache
375 (cons (cons mode (list (setq ret (cons field function))))
376 mime-field-decoder-cache)))
380 (mime-set-field-decoder 'Archive nil nil)
381 (mime-set-field-decoder 'Content-Md5 nil nil)
382 (mime-set-field-decoder 'Control nil nil)
383 (mime-set-field-decoder 'Date nil nil)
384 (mime-set-field-decoder 'Distribution nil nil)
385 (mime-set-field-decoder 'Followup-Host nil nil)
386 (mime-set-field-decoder 'Followup-To nil nil)
387 (mime-set-field-decoder 'Lines nil nil)
388 (mime-set-field-decoder 'Message-Id nil nil)
389 (mime-set-field-decoder 'Newsgroups nil nil)
390 (mime-set-field-decoder 'Nntp-Posting-Host nil nil)
391 (mime-set-field-decoder 'Path nil nil)
392 (mime-set-field-decoder 'Posted-And-Mailed nil nil)
393 (mime-set-field-decoder 'Received nil nil)
394 (mime-set-field-decoder 'Status nil nil)
395 (mime-set-field-decoder 'X-Face nil nil)
396 (mime-set-field-decoder 'X-Face-Version nil nil)
397 (mime-set-field-decoder 'X-Info nil nil)
398 (mime-set-field-decoder 'X-Pgp-Key-Info nil nil)
399 (mime-set-field-decoder 'X-Pgp-Sig nil nil)
400 (mime-set-field-decoder 'X-Pgp-Sig-Version nil nil)
401 (mime-set-field-decoder 'Xref nil nil)
405 '(Reply-To Resent-Reply-To From Resent-From Sender Resent-Sender
406 To Resent-To Cc Resent-Cc Bcc Resent-Bcc Dcc
408 Mime-Version Content-Type Content-Transfer-Encoding
409 Content-Disposition User-Agent))
412 (setq field (pop fields))
413 (mime-set-field-decoder
415 'plain #'eword-decode-structured-field-body
416 'wide #'eword-decode-and-fold-structured-field-body
417 'summary #'eword-decode-and-unfold-structured-field-body
418 'nov #'eword-decode-and-unfold-structured-field-body)
421 ;; unstructured fields (default)
422 (mime-set-field-decoder
424 'plain #'eword-decode-unstructured-field-body
425 'wide #'eword-decode-unstructured-field-body
426 'summary #'eword-decode-and-unfold-unstructured-field-body
427 'nov #'eword-decode-unfolded-unstructured-field-body)
430 (defun mime-decode-field-body (field-body field-name
431 &optional mode max-column)
432 "Decode FIELD-BODY as FIELD-NAME in MODE, and return the result.
433 Optional argument MODE must be `plain', `wide', `summary' or `nov'.
434 Default mode is `summary'.
436 If MODE is `wide' and MAX-COLUMN is non-nil, the result is folded with
439 Non MIME encoded-word part in FILED-BODY is decoded with
440 `default-mime-charset'."
441 (let (field-name-symbol len decoder)
442 (if (symbolp field-name)
443 (setq field-name-symbol field-name
444 len (1+ (string-width (symbol-name field-name))))
445 (setq field-name-symbol (intern (capitalize field-name))
446 len (1+ (string-width field-name))))
447 (setq decoder (mime-find-field-decoder field-name-symbol mode))
449 (funcall decoder field-body len max-column)
451 (if (eq mode 'summary)
452 (std11-unfold-string field-body)
457 (defun mime-decode-header-in-region (start end
458 &optional code-conversion)
459 "Decode MIME encoded-words in region between START and END.
460 If CODE-CONVERSION is nil, it decodes only encoded-words. If it is
461 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
462 Otherwise it decodes non-ASCII bit patterns as the
463 default-mime-charset."
467 (narrow-to-region start end)
468 (let ((default-charset
470 (if (mime-charset-to-coding-system code-conversion)
472 default-mime-charset))))
474 (let ((mode-obj (mime-find-field-presentation-method 'wide))
475 beg p end field-name len field-decoder)
476 (goto-char (point-min))
477 (while (re-search-forward std11-field-head-regexp nil t)
478 (setq beg (match-beginning 0)
480 field-name (buffer-substring beg (1- p))
481 len (string-width field-name)
482 field-name (intern (capitalize field-name))
483 field-decoder (inline
484 (mime-find-field-decoder-internal
485 field-name mode-obj)))
487 (setq end (std11-field-end))
488 (let ((body (buffer-substring p end))
489 (default-mime-charset default-charset))
490 (delete-region p end)
491 (insert (funcall field-decoder body (1+ len)))
494 (eword-decode-region (point-min) (point-max) t)
498 (defun mime-decode-header-in-buffer (&optional code-conversion separator)
499 "Decode MIME encoded-words in header fields.
500 If CODE-CONVERSION is nil, it decodes only encoded-words. If it is
501 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
502 Otherwise it decodes non-ASCII bit patterns as the
503 default-mime-charset.
504 If SEPARATOR is not nil, it is used as header separator."
506 (mime-decode-header-in-region
509 (goto-char (point-min))
510 (if (re-search-forward
511 (concat "^\\(" (regexp-quote (or separator "")) "\\)?$")
518 (defalias 'eword-decode-header 'mime-decode-header-in-buffer)
519 (make-obsolete 'eword-decode-header 'mime-decode-header-in-buffer)
522 ;;; @ encoded-words decoder
525 (defvar eword-decode-allow-incomplete-encoded-text t
526 "*Non-nil means allow incomplete encoded-text in successive encoded-words.
527 Dividing of encoded-text in the place other than character boundaries
528 violates RFC2047 section 5, while we have a capability to decode it.
529 If it is non-nil, the decoder will decode B- or Q-encoding in each
530 encoded-word, concatenate them, and decode it by charset. Otherwise,
531 the decoder will fully decode each encoded-word before concatenating
534 (defun eword-decode-encoded-words (words must-unfold)
535 "Decode successive encoded-words in WORDS and return a decoded string.
536 Each element of WORDS looks like (CHARSET LANGUAGE ENCODING ENCODED-TEXT
539 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
540 if there are in decoded encoded-words (generated by bad manner MUA
541 such as a version of Net$cape)."
542 (let (word language charset encoding text rest)
544 (setq word (pop words)
545 language (nth 1 word))
546 (if (and (or (mime-charset-to-coding-system (setq charset (car word)))
548 (message "Unknown charset: %s" charset)
550 (cond ((member (setq encoding (nth 2 word)) '("B" "Q"))
552 ((member encoding '("b" "q"))
553 (setq encoding (upcase encoding)))
555 (message "Invalid encoding: %s" encoding)
559 (encoded-text-decode-string (nth 3 word) encoding))
561 (message "%s" (error-message-string err))
563 (if (and eword-decode-allow-incomplete-encoded-text
566 (string-equal (downcase charset) (downcase (caaar rest)))
567 (equal language (cdaar rest)))
568 ;; Concatenate text of which the charset is the same.
569 (setcdr (car rest) (concat (cdar rest) text))
570 (push (cons (cons charset language) text) rest))
571 ;; Don't decode encoded-word.
572 (push (cons (cons nil language) (nth 4 word)) rest)))
574 (setq word (or (and (setq charset (caaar rest))
576 (decode-mime-charset-string (cdar rest) charset)
578 (message "%s" (error-message-string err))
580 (concat (when (cdr rest) " ")
583 (not (eq (string-to-char words) ? )))
586 (setq word (mapconcat (lambda (chr)
587 (cond ((eq chr ?\n) "")
590 (t (char-to-string chr))))
591 (std11-unfold-string word)
593 (when (setq language (cdaar rest))
594 (put-text-property 0 (length word) 'mime-language language word))
595 (setq words (concat word words)
599 ;;; @ lexical analyze
602 (defvar eword-lexical-analyze-cache nil)
603 (defvar eword-lexical-analyze-cache-max 299
604 "*Max position of eword-lexical-analyze-cache.
605 It is max size of eword-lexical-analyze-cache - 1.")
607 (defvar mime-header-lexical-analyzer
608 '(eword-analyze-quoted-string
609 eword-analyze-domain-literal
610 eword-analyze-comment
612 eword-analyze-special
613 eword-analyze-encoded-word
615 "*List of functions to return result of lexical analyze.
616 Each function must have three arguments: STRING, START and MUST-UNFOLD.
617 STRING is the target string to be analyzed.
618 START is start position of STRING to analyze.
619 If MUST-UNFOLD is not nil, each function must unfold and eliminate
620 bare-CR and bare-LF from the result even if they are included in
621 content of the encoded-word.
622 Each function must return nil if it can not analyze STRING as its
625 Previous function is preferred to next function. If a function
626 returns nil, next function is used. Otherwise the return value will
629 (defun eword-analyze-quoted-string (string start &optional must-unfold)
630 (let ((p (std11-check-enclosure string ?\" ?\" nil start))
633 (setq ret (decode-mime-charset-string
634 (std11-strip-quoted-pair
635 (substring string (1+ start) (1- p)))
636 default-mime-charset))
637 (if mime-header-accept-quoted-encoded-words
638 (setq ret (eword-decode-string ret)))
639 (cons (cons 'quoted-string ret)
642 (defun eword-analyze-domain-literal (string start &optional must-unfold)
643 (std11-analyze-domain-literal string start))
645 (defun eword-analyze-comment (string from &optional must-unfold)
646 (let ((len (length string))
651 (eq (aref string i) ?\())
656 (setq chr (aref string i))
662 (setq last-str (concat last-str
663 (substring string from (1- i))
664 (char-to-string (aref string i)))
669 (setq ret (concat last-str
670 (substring string from i)))
678 (decode-mime-charset-string
679 ret default-mime-charset)
686 (if (setq ret (eword-analyze-comment string i must-unfold))
689 (substring string from i))
691 (if (string= last-str "")
692 (cons (car ret) dest)
695 (decode-mime-charset-string
696 last-str default-mime-charset)
710 (defun eword-analyze-spaces (string start &optional must-unfold)
711 (std11-analyze-spaces string start))
713 (defun eword-analyze-special (string start &optional must-unfold)
714 (std11-analyze-special string start))
716 (defun eword-analyze-encoded-word (string start &optional must-unfold)
717 (let* ((regexp (concat "[\n\t ]*\\(" eword-encoded-word-regexp "\\)"))
718 (match (and (string-match regexp string start)
719 (= start (match-beginning 0))))
722 (setq next (match-end 0))
723 (push (list (match-string 2 string) ;; charset
724 (when (match-beginning 3) ;; language
728 (1+ (match-beginning 3)) (match-end 3)))))
729 (match-string 4 string) ;; encoding
730 (match-string 5 string) ;; encoded-text
731 (match-string 1 string)) ;; encoded-word
733 (setq match (and (string-match regexp string next)
734 (= next (match-beginning 0)))))
736 (cons (cons 'atom (eword-decode-encoded-words (nreverse words)
740 (defun eword-analyze-atom (string start &optional must-unfold)
741 (if (and (string-match std11-atom-regexp string start)
742 (= (match-beginning 0) start))
743 (let ((end (match-end 0)))
744 (cons (cons 'atom (decode-mime-charset-string
745 (substring string start end)
746 default-mime-charset))
747 ;;(substring string end)
751 (defun eword-lexical-analyze-internal (string start must-unfold)
752 (let ((len (length string))
756 (let ((rest mime-header-lexical-analyzer)
758 (while (and (setq func (car rest))
760 (setq r (funcall func string start must-unfold)))
762 (setq rest (cdr rest)))
764 (cons (cons 'error (substring string start)) (1+ len)))
766 (setq dest (cons (car ret) dest)
772 (defun eword-lexical-analyze (string &optional start must-unfold)
773 "Return lexical analyzed list corresponding STRING.
774 It is like std11-lexical-analyze, but it decodes non us-ascii
775 characters encoded as encoded-words or invalid \"raw\" format.
776 \"Raw\" non us-ascii characters are regarded as variable
777 `default-mime-charset'."
778 (let ((key (substring string (or start 0)))
780 (set-text-properties 0 (length key) nil key)
781 (if (setq ret (assoc key eword-lexical-analyze-cache))
783 (setq ret (eword-lexical-analyze-internal key 0 must-unfold))
784 (setq eword-lexical-analyze-cache
786 eword-lexical-analyze-cache))
787 (if (cdr (setq cell (nthcdr eword-lexical-analyze-cache-max
788 eword-lexical-analyze-cache)))
792 (defun eword-decode-token (token)
793 (let ((type (car token))
795 (cond ((eq type 'quoted-string)
796 (std11-wrap-as-quoted-string value))
800 (setq dest (concat dest
801 (if (stringp (car value))
802 (std11-wrap-as-quoted-pairs
803 (car value) '(?( ?)))
804 (eword-decode-token (car value))
808 (concat "(" dest ")")
812 (defun eword-extract-address-components (string &optional start)
813 "Extract full name and canonical address from STRING.
814 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
815 If no name can be extracted, FULL-NAME will be nil.
816 It decodes non us-ascii characters in FULL-NAME encoded as
817 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
818 characters are regarded as variable `default-mime-charset'."
819 (let* ((structure (car (std11-parse-address
820 (eword-lexical-analyze
821 (std11-unfold-string string) start
823 (phrase (std11-full-name-string structure))
824 (address (std11-address-string structure))
826 (list phrase address)
833 (provide 'eword-decode)
835 ;;; eword-decode.el ends here