1 ;;; eword-decode.el --- RFC 2047 based encoded-word decoder for GNU Emacs
3 ;; Copyright (C) 1995,1996,1997,1998 Free Software Foundation, Inc.
5 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
6 ;; MORIOKA Tomohiko <morioka@jaist.ac.jp>
7 ;; Tanaka Akira <akr@jaist.ac.jp>
8 ;; Maintainer: Tanaka Akira <akr@jaist.ac.jp>
10 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
11 ;; Renamed: 1993/06/03 to tiny-mime.el
12 ;; Renamed: 1995/10/03 from tiny-mime.el (split off encoder)
13 ;; Renamed: 1997/02/22 from tm-ew-d.el
14 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
16 ;; This file is part of FLAM (Faithful Library About MIME).
18 ;; This program is free software; you can redistribute it and/or
19 ;; modify it under the terms of the GNU General Public License as
20 ;; published by the Free Software Foundation; either version 2, or (at
21 ;; your option) any later version.
23 ;; This program is distributed in the hope that it will be useful, but
24 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
25 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26 ;; General Public License for more details.
28 ;; You should have received a copy of the GNU General Public License
29 ;; along with GNU Emacs; see the file COPYING. If not, write to the
30 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
31 ;; Boston, MA 02111-1307, USA.
35 (require 'std11-parse)
39 (defgroup eword-decode nil
40 "Encoded-word decoding"
47 (defcustom eword-decode-sticked-encoded-word nil
48 "*If non-nil, decode encoded-words sticked on atoms,
49 other encoded-words, etc.
50 however this behaviour violates RFC2047."
54 (defcustom eword-decode-quoted-encoded-word nil
55 "*If non-nil, decode encoded-words in quoted-string
56 however this behaviour violates RFC2047."
61 ;;; @ MIME encoded-word definition
64 (defconst eword-encoded-word-prefix-regexp
65 (concat (regexp-quote "=?")
66 "\\(" mime-charset-regexp "\\)"
70 (defconst eword-encoded-word-suffix-regexp
;; Encoded-word syntax as accepted inside unstructured fields.
(defconst eword-encoded-text-in-unstructured-regexp
  "[!->@-~]+"
  "Regexp matching the encoded-text of an encoded-word in unstructured text.
It accepts one or more characters from `!' through `~', except `?'.")

(defconst eword-encoded-word-in-unstructured-regexp
  (concat
   eword-encoded-word-prefix-regexp
   "\\("
   eword-encoded-text-in-unstructured-regexp
   "\\)"
   eword-encoded-word-suffix-regexp)
  "Regexp matching a whole encoded-word in unstructured text.
It is `eword-encoded-word-prefix-regexp', then a group wrapping
`eword-encoded-text-in-unstructured-regexp', then
`eword-encoded-word-suffix-regexp'.")

(defconst eword-after-encoded-word-in-unstructured-regexp
  "\\([ \t]\\|$\\)"
  "Regexp for what must follow an encoded-word in unstructured text.
It matches a space, a tab, or the end of a line.")
;; Encoded-word syntax as accepted inside a phrase.
(defconst eword-encoded-text-in-phrase-regexp
  "[-A-Za-z0-9!*+/=_]+"
  "Regexp matching the encoded-text of an encoded-word in a phrase.
It accepts one or more of: ASCII letters, digits, and the characters
`-', `!', `*', `+', `/', `=' and `_'.")

(defconst eword-encoded-word-in-phrase-regexp
  (concat
   eword-encoded-word-prefix-regexp
   "\\("
   eword-encoded-text-in-phrase-regexp
   "\\)"
   eword-encoded-word-suffix-regexp)
  "Regexp matching a whole encoded-word in a phrase.
It is `eword-encoded-word-prefix-regexp', then a group wrapping
`eword-encoded-text-in-phrase-regexp', then
`eword-encoded-word-suffix-regexp'.")

(defconst eword-after-encoded-word-in-phrase-regexp
  "\\([ \t(]\\|$\\)"
  "Regexp for what must follow an encoded-word in a phrase.
It matches a space, a tab, an opening parenthesis, or the end of a line.")
;; Encoded-word syntax as accepted inside a comment.
(defconst eword-encoded-text-in-comment-regexp
  "[]!-'*->@-[^-~]+"
  "Regexp matching the encoded-text of an encoded-word in a comment.
It accepts one or more characters from `!' through `~', except the
parentheses, `?' and the backslash.")

(defconst eword-encoded-word-in-comment-regexp
  (concat
   eword-encoded-word-prefix-regexp
   "\\("
   eword-encoded-text-in-comment-regexp
   "\\)"
   eword-encoded-word-suffix-regexp)
  "Regexp matching a whole encoded-word in a comment.
It is `eword-encoded-word-prefix-regexp', then a group wrapping
`eword-encoded-text-in-comment-regexp', then
`eword-encoded-word-suffix-regexp'.")

(defconst eword-after-encoded-word-in-comment-regexp
  "\\([ \t()\\\\]\\|$\\)"
  "Regexp for what must follow an encoded-word in a comment.
It matches a space, a tab, either parenthesis, a backslash, or the
end of a line.")
;; Encoded-word syntax as accepted inside a quoted-string.
(defconst eword-encoded-text-in-quoted-string-regexp
  "[]!#->@-[^-~]+"
  "Regexp matching the encoded-text of an encoded-word in a quoted-string.
It accepts one or more characters from `!' through `~', except the
double quote, `?' and the backslash.")

(defconst eword-encoded-word-in-quoted-string-regexp
  (concat
   eword-encoded-word-prefix-regexp
   "\\("
   eword-encoded-text-in-quoted-string-regexp
   "\\)"
   eword-encoded-word-suffix-regexp)
  "Regexp matching a whole encoded-word in a quoted-string.
It is `eword-encoded-word-prefix-regexp', then a group wrapping
`eword-encoded-text-in-quoted-string-regexp', then
`eword-encoded-word-suffix-regexp'.")

(defconst eword-after-encoded-word-in-quoted-string-regexp
  "\\([ \t\"\\\\]\\|$\\)"
  "Regexp for what must follow an encoded-word in a quoted-string.
It matches a space, a tab, a double quote, a backslash, or the end of
a line.")
;; Context-independent names; both take their value from the
;; unstructured-field variants.
(defconst eword-encoded-text-regexp
  eword-encoded-text-in-unstructured-regexp
  "Regexp matching encoded-text when no specific context is given.
Same value as `eword-encoded-text-in-unstructured-regexp'.")

(defconst eword-encoded-word-regexp
  eword-encoded-word-in-unstructured-regexp
  "Regexp matching an encoded-word when no specific context is given.
Same value as `eword-encoded-word-in-unstructured-regexp'.")
(defconst base64-token-regexp
  "[A-Za-z0-9+/]"
  "Regexp matching a single character of the base64 alphabet.
That is an ASCII letter, a digit, `+' or `/'.")

(defconst base64-token-padding-regexp
  "[A-Za-z0-9+/=]"
  "Regexp matching a single base64 character or the padding character.
Same set as `base64-token-regexp' with `=' added.")
112 (defconst eword-B-encoded-text-regexp
121 base64-token-padding-regexp
122 base64-token-padding-regexp
125 ;; (defconst eword-B-encoding-and-encoded-text-regexp
126 ;; (concat "\\(B\\)\\?" eword-B-encoded-text-regexp))
129 ;;; @@ Quoted-Printable
(defconst quoted-printable-hex-chars
  "0123456789ABCDEF"
  "Characters accepted as hexadecimal digits in a quoted-printable octet.
Only the upper-case letters are listed.")

(defconst quoted-printable-octet-regexp
  ;; Build the one-digit character class once and use it twice.
  (let ((hex-digit (concat "[" quoted-printable-hex-chars "]")))
    (concat "=" hex-digit hex-digit))
  "Regexp matching one quoted-printable encoded octet.
That is `=' followed by two characters taken from
`quoted-printable-hex-chars'.")

(defconst eword-Q-encoded-text-regexp
  (concat "\\("
          "[^=?]"
          "\\|"
          quoted-printable-octet-regexp
          "\\)+")
  "Regexp matching encoded-text in the \"Q\" encoding.
It matches one or more items, each being either a single character
other than `=' and `?', or a `quoted-printable-octet-regexp' match.")
139 ;; (defconst eword-Q-encoding-and-encoded-text-regexp
140 ;; (concat "\\(Q\\)\\?" eword-Q-encoded-text-regexp))
143 ;;; @ internal utilities
146 (defun eword-decode-first-encoded-words (string
149 &optional must-unfold)
150 "Decode MIME encoded-words in beginning of STRING.
152 EWORD-REGEXP is the regexp that matches a encoded-word.
154 eword-encoded-word-in-unstructured-regexp,
eword-encoded-word-in-phrase-regexp,
156 eword-encoded-word-in-comment-regexp or
157 eword-encoded-word-in-quoted-string-regexp.
159 AFTER-REGEXP is the regexp that matches a after encoded-word.
161 eword-after-encoded-word-in-unstructured-regexp,
eword-after-encoded-word-in-phrase-regexp,
163 eword-after-encoded-word-in-comment-regexp or
164 eword-after-encoded-word-in-quoted-string-regexp.
166 If beginning of STRING matches EWORD-REGEXP and AFTER-REGEXP,
167 returns a cons cell of decoded string(sequence of characters) and
168 the rest(sequence of octets).
170 If beginning of STRING does not matches EWORD-REGEXP and AFTER-REGEXP,
173 If an encoded-word is broken or your emacs implementation can not
174 decode the charset included in it, it is returned in decoded part
175 as encoded-word form.
177 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
178 if there are in decoded encoded-words (generated by bad manner MUA
179 such as a version of Net$cape)."
180 (if eword-decode-sticked-encoded-word (setq after-regexp ""))
181 (let ((between-ewords-regexp
182 (if eword-decode-sticked-encoded-word
185 (src string) ; sequence of octets.
186 (dst "")) ; sequence of characters.
188 (concat "\\`\\(" eword-regexp "\\)" after-regexp) src)
191 (ew (substring src 0 q))
192 (dw (eword-decode-encoded-word ew must-unfold)))
193 (setq dst (concat dst dw)
194 src (substring src q))
195 (if (not (string= ew dw))
200 (concat "\\`\\(" between-ewords-regexp "\\)"
201 "\\(" eword-regexp "\\)"
205 (setq p (match-end 1)
207 ew (substring src p q)
208 dw (eword-decode-encoded-word ew must-unfold))
211 (setq dst (concat dst (substring src 0 q))
212 src (substring src q))
215 (setq dst (concat dst dw)
216 src (substring src q)))))
220 (defun eword-decode-entire-string (string
225 delimiters ; list of chars.
232 (while (< 0 (length src))
233 (let ((ch (aref src 0))
236 (eword-decode-first-encoded-words src
237 eword-regexp after-regexp must-unfold))))
238 (if (and (not (string= buf ""))
239 (or decoded (memq ch delimiters)))
240 (setq dst (concat dst
241 (std11-wrap-as-quoted-pairs
242 (decode-mime-charset-string buf default-charset)
247 (setq dst (concat dst
248 (std11-wrap-as-quoted-pairs
252 ((memq ch delimiters)
253 (setq dst (concat dst (list ch))
254 src (substring src 1)
257 (setq buf (concat buf (list (aref src 1)))
258 src (substring src 2)
260 ((string-match "\\`[ \t\n]+" src)
261 (setq buf (concat buf (substring src 0 (match-end 0)))
262 src (substring src (match-end 0))
264 ((and (string-match (concat "\\`=?" safe-regexp) src)
266 (setq buf (concat buf (substring src 0 (match-end 0)))
267 src (substring src (match-end 0))
268 ew-enable eword-decode-sticked-encoded-word))
269 (t (error "something wrong")))))
270 (if (not (string= buf ""))
271 (setq dst (concat dst
272 (std11-wrap-as-quoted-pairs
273 (decode-mime-charset-string buf default-charset)
281 (defun eword-decode-unstructured (string &optional must-unfold)
282 (eword-decode-entire-string
284 eword-encoded-word-in-unstructured-regexp
285 eword-after-encoded-word-in-unstructured-regexp
292 (defun eword-decode-comment (string &optional must-unfold)
293 (eword-decode-entire-string
295 eword-encoded-word-in-comment-regexp
296 eword-after-encoded-word-in-comment-regexp
303 (defun eword-decode-quoted-string (string &optional must-unfold)
304 (eword-decode-entire-string
306 eword-encoded-word-in-quoted-string-regexp
307 eword-after-encoded-word-in-quoted-string-regexp
314 (defun eword-decode-string (string &optional must-unfold default-mime-charset)
315 "Decode MIME encoded-words in STRING.
317 STRING is unfolded before decoding.
319 If an encoded-word is broken or your emacs implementation can not
320 decode the charset included in it, it is not decoded.
322 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
323 if there are in decoded encoded-words (generated by bad manner MUA
324 such as a version of Net$cape)."
325 (eword-decode-unstructured
326 (std11-unfold-string string)
333 (defun eword-decode-region (start end &optional unfolding must-unfold
334 default-mime-charset)
335 "Decode MIME encoded-words in region between START and END.
337 If UNFOLDING is not nil, it unfolds before decoding.
339 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
340 if there are in decoded encoded-words (generated by bad manner MUA
341 such as a version of Net$cape)."
345 (narrow-to-region start end)
347 (eword-decode-unfold)
349 (let ((str (eword-decode-unstructured
350 (buffer-substring (point-min) (point-max))
352 (delete-region (point-min) (point-max))
356 ;;; @ for message header
(defcustom eword-decode-ignored-field-list
  '(newsgroups path lines nntp-posting-host message-id date)
  "*List of field-names to be ignored when decoding.
Each field name must be a symbol.  `eword-decode-header' interns the
downcased field name and looks it up in this list with `memq', so the
symbols should be written in lower case."
  ;; Name the group explicitly so the option is filed under
  ;; `eword-decode' no matter which `defgroup' was evaluated last.
  :group 'eword-decode
  :type '(repeat symbol))
366 (defcustom eword-decode-structured-field-list
367 '(reply-to resent-reply-to from resent-from sender resent-sender
368 to resent-to cc resent-cc bcc resent-bcc dcc
369 mime-version content-type content-transfer-encoding
371 "*List of field-names to decode as structured field.
372 Each field name must be symbol."
374 :type '(repeat symbol))
376 (defun eword-decode-header (&optional code-conversion separator)
377 "Decode MIME encoded-words in header fields.
378 If CODE-CONVERSION is nil, it decodes only encoded-words. If it is
379 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
380 Otherwise it decodes non-ASCII bit patterns as the
381 default-mime-charset.
382 If SEPARATOR is not nil, it is used as header separator."
384 (if (and code-conversion
385 (not (mime-charset-to-coding-system code-conversion)))
386 (setq code-conversion default-mime-charset))
389 (std11-narrow-to-header separator)
391 (let (beg p end field-name len)
392 (goto-char (point-min))
393 (while (re-search-forward std11-field-head-regexp nil t)
394 (setq beg (match-beginning 0)
396 field-name (buffer-substring beg (1- p))
397 len (string-width field-name)
398 field-name (intern (downcase field-name))
399 end (std11-field-end))
400 (cond ((memq field-name eword-decode-ignored-field-list)
403 ((memq field-name eword-decode-structured-field-list)
404 ;; Decode as structured field
405 (let ((body (buffer-substring p end)))
406 (delete-region p end)
407 (insert (eword-decode-and-fold-structured-field
411 ;; Decode as unstructured field
413 (narrow-to-region beg (1+ end))
415 (eword-decode-region beg (point-max) 'unfold nil
417 (goto-char (point-max))
419 (eword-decode-region (point-min) (point-max) t nil code-conversion)
422 (defun eword-decode-unfold ()
423 (goto-char (point-min))
425 (while (re-search-forward std11-field-head-regexp nil t)
426 (setq beg (match-beginning 0)
427 end (std11-field-end))
428 (setq field (buffer-substring beg end))
429 (if (string-match eword-encoded-word-regexp field)
431 (narrow-to-region (goto-char beg) end)
432 (while (re-search-forward "\n\\([ \t]\\)" nil t)
433 (replace-match (match-string 1))
435 (goto-char (point-max))
440 ;;; @ encoded-word decoder
(defvar eword-warning-face
  nil
  "Face used for invalid encoded-word.
When this is nil, no `face' property is put on the word.")
445 (defun eword-decode-encoded-word (word &optional must-unfold)
446 "Decode WORD if it is an encoded-word.
448 If your emacs implementation can not decode the charset of WORD, it
449 returns WORD. Similarly the encoded-word is broken, it returns WORD.
451 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
452 if there are in decoded encoded-word (generated by bad manner MUA such
453 as a version of Net$cape)."
454 (or (if (string-match eword-encoded-word-regexp word)
456 (substring word (match-beginning 1) (match-end 1))
460 (substring word (match-beginning 2) (match-end 2))
463 (substring word (match-beginning 3) (match-end 3))
466 (eword-decode-encoded-text charset encoding text must-unfold)
469 (add-text-properties 0 (length word)
470 (and eword-warning-face
471 (list 'face eword-warning-face))
478 ;;; @ encoded-text decoder
481 (defun eword-decode-encoded-text (charset encoding string
482 &optional must-unfold)
483 "Decode STRING as an encoded-text.
485 If your emacs implementation can not decode CHARSET, it returns nil.
487 If ENCODING is not \"B\" or \"Q\", it occurs error.
488 So you should write error-handling code if you don't want break by errors.
490 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
491 if there are in decoded encoded-text (generated by bad manner MUA such
492 as a version of Net$cape)."
493 (let ((cs (mime-charset-to-coding-system charset)))
497 ((string-equal "B" encoding)
498 (if (and (string-match eword-B-encoded-text-regexp string)
499 (string-equal string (match-string 0 string)))
500 (base64-decode-string string)
501 (error "Invalid encoded-text %s" string)))
502 ((string-equal "Q" encoding)
503 (if (and (string-match eword-Q-encoded-text-regexp string)
504 (string-equal string (match-string 0 string)))
505 (q-encoding-decode-string string)
506 (error "Invalid encoded-text %s" string)))
508 (error "Invalid encoding %s" encoding)
513 (setq dest (decode-coding-string dest cs))
520 (t (char-to-string chr)))
522 (std11-unfold-string dest)
528 ;;; @ lexical analyze
(defvar eword-lexical-analyze-cache
  nil
  "Cache consulted by `eword-lexical-analyze'.
It is searched with `assoc'; each key has the form
\(STRING . (DEFAULT-MIME-CHARSET . MUST-UNFOLD)).")

(defvar eword-lexical-analyze-cache-max
  299
  "*Max position of eword-lexical-analyze-cache.
It is one less than the maximum number of entries kept in
`eword-lexical-analyze-cache'.")
536 (defcustom eword-lexical-analyzers
537 '(eword-analyze-quoted-string
538 eword-analyze-domain-literal
539 eword-analyze-comment
541 eword-analyze-special
542 eword-analyze-encoded-word
544 "*List of functions to return result of lexical analyze.
545 Each function must have two arguments: STRING and MUST-UNFOLD.
546 STRING is the target string to be analyzed.
547 If MUST-UNFOLD is not nil, each function must unfold and eliminate
548 bare-CR and bare-LF from the result even if they are included in
549 content of the encoded-word.
550 Each function must return nil if it can not analyze STRING as its
553 Previous function is preferred to next function. If a function
554 returns nil, next function is used. Otherwise the return value will
557 :type '(repeat function))
559 (defun eword-analyze-quoted-string (string &optional must-unfold)
560 (let ((p (std11-check-enclosure string ?\" ?\")))
562 (cons (cons 'quoted-string
563 (eword-decode-quoted-string (substring string 0 p)))
564 (substring string p))
(defun eword-analyze-domain-literal (string &optional must-unfold)
  "Lexical analyzer entry for domain-literals.
Return whatever `std11-analyze-domain-literal' returns for STRING.
MUST-UNFOLD is accepted because functions listed in
`eword-lexical-analyzers' are called with two arguments; it is not
used here."
  ;; Nothing to decode in this token type: hand STRING straight over.
  (std11-analyze-domain-literal string))
570 (defun eword-analyze-comment (string &optional must-unfold)
571 (let ((len (length string)))
572 (if (and (< 0 len) (eq (aref string 0) ?\())
574 (while (and p (< p len) (eq (aref string p) ?\())
575 (setq p (std11-check-enclosure string ?\( ?\) t p)))
578 (eword-decode-comment (substring string 0 p)))
579 (substring string p)))
(defun eword-analyze-spaces (string &optional must-unfold)
  "Lexical analyzer entry for spaces.
Return whatever `std11-analyze-spaces' returns for STRING.
MUST-UNFOLD is accepted so that the signature matches the other
eword-analyze-* functions; it is not used here."
  ;; Nothing to decode in this token type: hand STRING straight over.
  (std11-analyze-spaces string))
(defun eword-analyze-special (string &optional must-unfold)
  "Lexical analyzer entry for special characters.
Return whatever `std11-analyze-special' returns for STRING.
MUST-UNFOLD is accepted because functions listed in
`eword-lexical-analyzers' are called with two arguments; it is not
used here."
  ;; Nothing to decode in this token type: hand STRING straight over.
  (std11-analyze-special string))
588 (defun eword-analyze-encoded-word (string &optional must-unfold)
589 (let ((decoded (eword-decode-first-encoded-words
591 eword-encoded-word-in-phrase-regexp
592 eword-after-encoded-word-in-phrase-regexp
595 (cons (cons 'atom (car decoded)) (cdr decoded)))))
597 (defun eword-analyze-atom (string &optional must-unfold)
598 (if (let ((enable-multibyte-characters nil))
599 (string-match std11-atom-regexp string))
600 (let ((end (match-end 0)))
601 (if (and eword-decode-sticked-encoded-word
602 (string-match eword-encoded-word-in-phrase-regexp
603 (substring string 0 end))
604 (< 0 (match-beginning 0)))
605 (setq end (match-beginning 0)))
606 (cons (cons 'atom (decode-mime-charset-string
607 (substring string 0 end)
608 default-mime-charset))
609 (substring string end)
612 (defun eword-lexical-analyze-internal (string must-unfold)
614 (while (not (string-equal string ""))
616 (let ((rest eword-lexical-analyzers)
618 (while (and (setq func (car rest))
619 (null (setq r (funcall func string must-unfold)))
621 (setq rest (cdr rest)))
622 (or r `((error . ,string) . ""))
624 (setq dest (cons (car ret) dest))
625 (setq string (cdr ret))
630 (defun eword-lexical-analyze (string &optional must-unfold)
631 "Return lexical analyzed list corresponding STRING.
632 It is like std11-lexical-analyze, but it decodes non us-ascii
633 characters encoded as encoded-words or invalid \"raw\" format.
634 \"Raw\" non us-ascii characters are regarded as variable
635 `default-mime-charset'."
636 (let* ((str (copy-sequence string))
637 (key (cons str (cons default-mime-charset must-unfold)))
639 (set-text-properties 0 (length str) nil str)
640 (if (setq ret (assoc key eword-lexical-analyze-cache))
642 (setq ret (eword-lexical-analyze-internal str must-unfold))
643 (setq eword-lexical-analyze-cache
645 (last eword-lexical-analyze-cache
646 eword-lexical-analyze-cache-max)))
649 (defun eword-decode-token (token)
652 (defun eword-decode-and-fold-structured-field
653 (string start-column &optional max-column must-unfold)
654 "Decode and fold (fill) STRING as structured field body.
655 It decodes non us-ascii characters in FULL-NAME encoded as
656 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
657 characters are regarded as variable `default-mime-charset'.
659 If an encoded-word is broken or your emacs implementation can not
660 decode the charset included in it, it is not decoded.
662 If MAX-COLUMN is omitted, `fill-column' is used.
664 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
665 if there are in decoded encoded-words (generated by bad manner MUA
666 such as a version of Net$cape)."
668 (setq max-column fill-column))
669 (let ((c start-column)
670 (tokens (eword-lexical-analyze string must-unfold))
673 (while (and (setq token (car tokens))
674 (setq tokens (cdr tokens)))
675 (let* ((type (car token)))
676 (if (eq type 'spaces)
677 (let* ((next-token (car tokens))
678 (next-str (eword-decode-token next-token))
679 (next-len (string-width next-str))
680 (next-c (+ c next-len 1)))
681 (if (< next-c max-column)
682 (setq result (concat result " " next-str)
684 (setq result (concat result "\n " next-str)
686 (setq tokens (cdr tokens))
688 (let* ((str (eword-decode-token token)))
689 (setq result (concat result str)
690 c (+ c (string-width str)))
693 (concat result (eword-decode-token token))
696 (defun eword-decode-and-unfold-structured-field (string)
697 "Decode and unfold STRING as structured field body.
698 It decodes non us-ascii characters in FULL-NAME encoded as
699 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
700 characters are regarded as variable `default-mime-charset'.
702 If an encoded-word is broken or your emacs implementation can not
703 decode the charset included in it, it is not decoded."
704 (let ((tokens (eword-lexical-analyze string 'must-unfold))
707 (let* ((token (car tokens))
709 (setq tokens (cdr tokens))
711 (if (eq type 'spaces)
713 (concat result (eword-decode-token token))
717 (defun eword-decode-structured-field-body (string &optional must-unfold
718 start-column max-column)
719 "Decode non us-ascii characters in STRING as structured field body.
720 STRING is unfolded before decoding.
722 It decodes non us-ascii characters in FULL-NAME encoded as
723 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
724 characters are regarded as variable `default-mime-charset'.
726 If an encoded-word is broken or your emacs implementation can not
727 decode the charset included in it, it is not decoded.
729 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
730 if there are in decoded encoded-words (generated by bad manner MUA
731 such as a version of Net$cape)."
733 ;; fold with max-column
734 (eword-decode-and-fold-structured-field
735 string start-column max-column must-unfold)
737 (mapconcat (function eword-decode-token)
738 (eword-lexical-analyze string must-unfold)
(defun eword-decode-unstructured-field-body (string &optional must-unfold)
  "Decode non us-ascii characters in STRING as unstructured field body.
STRING is unfolded before decoding.

Non us-ascii characters in STRING may be given as encoded-words or
as invalid \"raw\" text.  \"Raw\" non us-ascii characters are
regarded as variable `default-mime-charset'.

If an encoded-word is broken or your emacs implementation can not
decode the charset included in it, it is not decoded.

If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
if there are in decoded encoded-words (generated by bad manner MUA
such as a version of Net$cape)."
  ;; All the work is done by `eword-decode-string'; the current
  ;; `default-mime-charset' is passed along as its third argument.
  (eword-decode-string string
                       must-unfold
                       default-mime-charset))
758 (defun eword-extract-address-components (string)
759 "Extract full name and canonical address from STRING.
760 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
761 If no name can be extracted, FULL-NAME will be nil.
762 It decodes non us-ascii characters in FULL-NAME encoded as
763 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
764 characters are regarded as variable `default-mime-charset'."
765 (let* ((structure (car (std11-parse-address
766 (eword-lexical-analyze
767 (std11-unfold-string string) 'must-unfold))))
768 (phrase (std11-full-name-string structure))
769 (address (std11-address-string structure))
771 (list phrase address)
778 (provide 'eword-decode)
780 ;;; eword-decode.el ends here