1 ;;; eword-decode.el --- RFC 2047 based encoded-word decoder for GNU Emacs
3 ;; Copyright (C) 1995,1996,1997,1998 Free Software Foundation, Inc.
5 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
6 ;; MORIOKA Tomohiko <morioka@jaist.ac.jp>
7 ;; TANAKA Akira <akr@jaist.ac.jp>
9 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
10 ;; Renamed: 1993/06/03 to tiny-mime.el
11 ;; Renamed: 1995/10/03 from tiny-mime.el (split off encoder)
12 ;; Renamed: 1997/02/22 from tm-ew-d.el
13 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
15 ;; This file is part of SEMI (Spadework for Emacs MIME Interfaces).
17 ;; This program is free software; you can redistribute it and/or
18 ;; modify it under the terms of the GNU General Public License as
19 ;; published by the Free Software Foundation; either version 2, or (at
20 ;; your option) any later version.
22 ;; This program is distributed in the hope that it will be useful, but
23 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
24 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25 ;; General Public License for more details.
27 ;; You should have received a copy of the GNU General Public License
28 ;; along with GNU Emacs; see the file COPYING. If not, write to the
29 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
30 ;; Boston, MA 02111-1307, USA.
38 (eval-when-compile (require 'cl))
40 (defgroup eword-decode nil
41 "Encoded-word decoding"
44 (defcustom eword-max-size-to-decode 1000
45 "*Max size to decode header field."
47 :type '(choice (integer :tag "Limit (bytes)")
48 (const :tag "Don't limit" nil)))
51 ;;; @ MIME encoded-word definition
55 (defconst eword-encoded-text-regexp "[!->@-~]+")
57 (defconst eword-encoded-word-regexp
59 (concat (regexp-quote "=?")
67 eword-encoded-text-regexp
69 (regexp-quote "?="))))
75 (defun eword-decode-string (string &optional must-unfold)
76 "Decode MIME encoded-words in STRING.
78 STRING is unfolded before decoding.
80 If an encoded-word is broken or your emacs implementation can not
81 decode the charset included in it, it is not decoded.
83 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
84 if there are in decoded encoded-words (generated by bad manner MUA
85 such as a version of Net$cape)."
86 (setq string (std11-unfold-string string))
87 (let ((dest "")(ew nil)
89 (while (and (string-match eword-encoded-word-regexp string)
90 (setq beg (match-beginning 0)
96 (string-match "^[ \t]+$" (substring string 0 beg))
98 (setq dest (concat dest (substring string 0 beg)))
103 (eword-decode-encoded-word
104 (substring string beg end) must-unfold)
106 (setq string (substring string end))
112 (defun eword-decode-structured-field-body (string
113 &optional start-column max-column)
114 (let ((tokens (eword-lexical-analyze string 'must-unfold))
118 (setq token (car tokens))
119 (setq result (concat result (eword-decode-token token)))
120 (setq tokens (cdr tokens)))
123 (defun eword-decode-and-unfold-structured-field-body (string
127 "Decode and unfold STRING as structured field body.
128 It decodes non us-ascii characters in FULL-NAME encoded as
129 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
130 characters are regarded as variable `default-mime-charset'.
132 If an encoded-word is broken or your emacs implementation can not
133 decode the charset included in it, it is not decoded."
134 (let ((tokens (eword-lexical-analyze string 'must-unfold))
137 (let* ((token (car tokens))
139 (setq tokens (cdr tokens))
141 (if (eq type 'spaces)
143 (concat result (eword-decode-token token))
147 (defun eword-decode-and-fold-structured-field-body (string
149 &optional max-column)
150 (if (and eword-max-size-to-decode
151 (> (length string) eword-max-size-to-decode))
154 (setq max-column fill-column))
155 (let ((c start-column)
156 (tokens (eword-lexical-analyze string 'must-unfold))
159 (while (and (setq token (car tokens))
160 (setq tokens (cdr tokens)))
161 (let* ((type (car token)))
162 (if (eq type 'spaces)
163 (let* ((next-token (car tokens))
164 (next-str (eword-decode-token next-token))
165 (next-len (string-width next-str))
166 (next-c (+ c next-len 1)))
167 (if (< next-c max-column)
168 (setq result (concat result " " next-str)
170 (setq result (concat result "\n " next-str)
172 (setq tokens (cdr tokens))
174 (let* ((str (eword-decode-token token)))
175 (setq result (concat result str)
176 c (+ c (string-width str)))
179 (concat result (eword-decode-token token))
182 (defun eword-decode-unstructured-field-body (string &optional start-column
185 (decode-mime-charset-string string default-mime-charset)))
187 (defun eword-decode-and-unfold-unstructured-field-body (string
188 &optional start-column
191 (decode-mime-charset-string (std11-unfold-string string)
192 default-mime-charset)
195 (defun eword-decode-unfolded-unstructured-field-body (string
196 &optional start-column
199 (decode-mime-charset-string string default-mime-charset)
206 (defun eword-decode-region (start end &optional unfolding must-unfold)
207 "Decode MIME encoded-words in region between START and END.
209 If UNFOLDING is not nil, it unfolds before decoding.
211 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
212 if there are in decoded encoded-words (generated by bad manner MUA
213 such as a version of Net$cape)."
217 (narrow-to-region start end)
219 (eword-decode-unfold)
221 (goto-char (point-min))
222 (while (re-search-forward (concat "\\(" eword-encoded-word-regexp "\\)"
224 "\\(" eword-encoded-word-regexp "\\)")
226 (replace-match "\\1\\6")
227 (goto-char (point-min))
229 (while (re-search-forward eword-encoded-word-regexp nil t)
230 (insert (eword-decode-encoded-word
232 (buffer-substring (match-beginning 0) (match-end 0))
233 (delete-region (match-beginning 0) (match-end 0))
238 (defun eword-decode-unfold ()
239 (goto-char (point-min))
241 (while (re-search-forward std11-field-head-regexp nil t)
242 (setq beg (match-beginning 0)
243 end (std11-field-end))
244 (setq field (buffer-substring beg end))
245 (if (string-match eword-encoded-word-regexp field)
247 (narrow-to-region (goto-char beg) end)
248 (while (re-search-forward "\n\\([ \t]\\)" nil t)
249 (replace-match (match-string 1))
251 (goto-char (point-max))
256 ;;; @ for message header
259 (defvar mime-field-decoder-alist nil)
261 (defvar mime-field-decoder-cache nil)
263 (defvar mime-update-field-decoder-cache 'mime-update-field-decoder-cache
264 "*Field decoder cache update function.")
267 (defun mime-set-field-decoder (field &rest specs)
268 "Set decoder of FILED.
269 SPECS must be like `MODE1 DECODER1 MODE2 DECODER2 ...'.
270 Each mode must be `nil', `plain', `wide', `summary' or `nov'.
271 If mode is `nil', corresponding decoder is set up for every modes."
273 (let ((mode (pop specs))
274 (function (pop specs)))
277 (let ((cell (assq mode mime-field-decoder-alist)))
279 (setcdr cell (put-alist field function (cdr cell)))
280 (setq mime-field-decoder-alist
281 (cons (cons mode (list (cons field function)))
282 mime-field-decoder-alist))
284 (apply (function mime-set-field-decoder) field specs)
286 (mime-set-field-decoder field
294 (defmacro mime-find-field-presentation-method (name)
295 "Return field-presentation-method from NAME.
296 NAME must be `plain', `wide', `summary' or `nov'."
298 `(or (assq 'summary mime-field-decoder-cache)
304 (symbolp (car (cdr name)))
305 (null (cdr (cdr name))))
306 `(or (assq ,name mime-field-decoder-cache)
310 `(or (assq (or ,name 'summary) mime-field-decoder-cache)
311 (cons (or ,name 'summary) nil))
314 (defun mime-find-field-decoder-internal (field &optional mode)
315 "Return function to decode field-body of FIELD in MODE.
316 Optional argument MODE must be object of field-presentation-method."
317 (cdr (or (assq field (cdr mode))
319 (funcall mime-update-field-decoder-cache
322 (cdr (assq (car mode) mime-field-decoder-cache)))
326 (defun mime-find-field-decoder (field &optional mode)
327 "Return function to decode field-body of FIELD in MODE.
328 Optional argument MODE must be object or name of
329 field-presentation-method. Name of field-presentation-method must be
330 `plain', `wide', `summary' or `nov'.
331 Default value of MODE is `summary'."
333 (let ((p (cdr (mime-find-field-presentation-method mode))))
334 (if (and p (setq p (assq field p)))
336 (cdr (funcall mime-update-field-decoder-cache
337 field (or mode 'summary)))))
338 (inline (mime-find-field-decoder-internal field mode))
342 (defun mime-update-field-decoder-cache (field mode &optional function)
343 "Update field decoder cache `mime-field-decoder-cache'."
344 (cond ((eq function 'identity)
349 (cdr (assq (or mode 'summary) mime-field-decoder-alist))))
350 (setq function (cdr (or (assq field decoder-alist)
351 (assq t decoder-alist)))))
353 (let ((cell (assq mode mime-field-decoder-cache))
356 (if (setq ret (assq field (cdr cell)))
357 (setcdr ret function)
358 (setcdr cell (cons (setq ret (cons field function)) (cdr cell))))
359 (setq mime-field-decoder-cache
360 (cons (cons mode (list (setq ret (cons field function))))
361 mime-field-decoder-cache)))
365 (mime-set-field-decoder 'Archive nil nil)
366 (mime-set-field-decoder 'Content-Md5 nil nil)
367 (mime-set-field-decoder 'Control nil nil)
368 (mime-set-field-decoder 'Date nil nil)
369 (mime-set-field-decoder 'Distribution nil nil)
370 (mime-set-field-decoder 'Followup-Host nil nil)
371 (mime-set-field-decoder 'Followup-To nil nil)
372 (mime-set-field-decoder 'Lines nil nil)
373 (mime-set-field-decoder 'Message-Id nil nil)
374 (mime-set-field-decoder 'Newsgroups nil nil)
375 (mime-set-field-decoder 'Nntp-Posting-Host nil nil)
376 (mime-set-field-decoder 'Path nil nil)
377 (mime-set-field-decoder 'Posted-And-Mailed nil nil)
378 (mime-set-field-decoder 'Received nil nil)
379 (mime-set-field-decoder 'Status nil nil)
380 (mime-set-field-decoder 'X-Face nil nil)
381 (mime-set-field-decoder 'X-Face-Version nil nil)
382 (mime-set-field-decoder 'X-Info nil nil)
383 (mime-set-field-decoder 'X-Pgp-Key-Info nil nil)
384 (mime-set-field-decoder 'X-Pgp-Sig nil nil)
385 (mime-set-field-decoder 'X-Pgp-Sig-Version nil nil)
386 (mime-set-field-decoder 'Xref nil nil)
390 '(Reply-To Resent-Reply-To From Resent-From Sender Resent-Sender
391 To Resent-To Cc Resent-Cc Bcc Resent-Bcc Dcc
393 Mime-Version Content-Type Content-Transfer-Encoding
394 Content-Disposition User-Agent))
397 (setq field (pop fields))
398 (mime-set-field-decoder
400 'plain #'eword-decode-structured-field-body
401 'wide #'eword-decode-and-fold-structured-field-body
402 'summary #'eword-decode-and-unfold-structured-field-body
403 'nov #'eword-decode-and-unfold-structured-field-body)
406 ;; unstructured fields (default)
407 (mime-set-field-decoder
409 'plain #'eword-decode-unstructured-field-body
410 'wide #'eword-decode-unstructured-field-body
411 'summary #'eword-decode-and-unfold-unstructured-field-body
412 'nov #'eword-decode-unfolded-unstructured-field-body)
415 (defun mime-decode-field-body (field-body field-name
416 &optional mode max-column)
417 "Decode FIELD-BODY as FIELD-NAME in MODE, and return the result.
418 Optional argument MODE must be `plain', `wide', `summary' or `nov'.
419 Default mode is `summary'.
421 If MODE is `wide' and MAX-COLUMN is non-nil, the result is folded with
424 Non MIME encoded-word part in FILED-BODY is decoded with
425 `default-mime-charset'."
426 (let (field-name-symbol len decoder)
427 (if (symbolp field-name)
428 (setq field-name-symbol field-name
429 len (1+ (string-width (symbol-name field-name))))
430 (setq field-name-symbol (intern (capitalize field-name))
431 len (1+ (string-width field-name))))
432 (setq decoder (mime-find-field-decoder field-name-symbol mode))
434 (funcall decoder field-body len max-column)
436 (if (eq mode 'summary)
437 (std11-unfold-string field-body)
442 (defun mime-decode-header-in-region (start end
443 &optional code-conversion)
444 "Decode MIME encoded-words in region between START and END.
445 If CODE-CONVERSION is nil, it decodes only encoded-words. If it is
446 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
447 Otherwise it decodes non-ASCII bit patterns as the
448 default-mime-charset."
452 (narrow-to-region start end)
453 (let ((default-charset
455 (if (mime-charset-to-coding-system code-conversion)
457 default-mime-charset))))
459 (let ((mode-obj (mime-find-field-presentation-method 'wide))
460 beg p end field-name len field-decoder)
461 (goto-char (point-min))
462 (while (re-search-forward std11-field-head-regexp nil t)
463 (setq beg (match-beginning 0)
465 field-name (buffer-substring beg (1- p))
466 len (string-width field-name)
467 field-name (intern (capitalize field-name))
468 field-decoder (inline
469 (mime-find-field-decoder-internal
470 field-name mode-obj)))
472 (setq end (std11-field-end))
473 (let ((body (buffer-substring p end))
474 (default-mime-charset default-charset))
475 (delete-region p end)
476 (insert (funcall field-decoder body (1+ len)))
479 (eword-decode-region (point-min) (point-max) t)
483 (defun mime-decode-header-in-buffer (&optional code-conversion separator)
484 "Decode MIME encoded-words in header fields.
485 If CODE-CONVERSION is nil, it decodes only encoded-words. If it is
486 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
487 Otherwise it decodes non-ASCII bit patterns as the
488 default-mime-charset.
489 If SEPARATOR is not nil, it is used as header separator."
491 (mime-decode-header-in-region
494 (goto-char (point-min))
495 (if (re-search-forward
496 (concat "^\\(" (regexp-quote (or separator "")) "\\)?$")
503 (define-obsolete-function-alias 'eword-decode-header
504 'mime-decode-header-in-buffer)
507 ;;; @ encoded-word decoder
510 (defvar eword-decode-encoded-word-error-handler
511 'eword-decode-encoded-word-default-error-handler)
513 (defvar eword-warning-face nil
514 "Face used for invalid encoded-word.")
516 (defun eword-decode-encoded-word-default-error-handler (word signal)
517 (and (add-text-properties 0 (length word)
518 (and eword-warning-face
519 (list 'face eword-warning-face))
523 (defun eword-decode-encoded-word (word &optional must-unfold)
524 "Decode WORD if it is an encoded-word.
526 If your emacs implementation can not decode the charset of WORD, it
527 returns WORD. Similarly the encoded-word is broken, it returns WORD.
529 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
530 if there are in decoded encoded-word (generated by bad manner MUA such
531 as a version of Net$cape)."
532 (or (if (string-match eword-encoded-word-regexp word)
534 (substring word (match-beginning 1) (match-end 1))
538 (substring word (match-beginning 2) (match-end 2))
541 (substring word (match-beginning 3) (match-end 3))
544 (eword-decode-encoded-text charset encoding text must-unfold)
546 (funcall eword-decode-encoded-word-error-handler word err)
552 ;;; @ encoded-text decoder
555 (defun eword-decode-encoded-text (charset encoding string
556 &optional must-unfold)
557 "Decode STRING as an encoded-text.
559 If your emacs implementation can not decode CHARSET, it returns nil.
561 If ENCODING is not \"B\" or \"Q\", it occurs error.
562 So you should write error-handling code if you don't want break by errors.
564 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
565 if there are in decoded encoded-text (generated by bad manner MUA such
566 as a version of Net$cape)."
567 (let ((cs (mime-charset-to-coding-system charset)))
569 (let ((dest (encoded-text-decode-string string encoding)))
571 (setq dest (decode-mime-charset-string dest charset))
575 (cond ((eq chr ?\n) "")
577 (t (char-to-string chr)))
579 (std11-unfold-string dest)
584 ;;; @ lexical analyze
587 (defvar eword-lexical-analyze-cache nil)
588 (defvar eword-lexical-analyze-cache-max 299
589 "*Max position of eword-lexical-analyze-cache.
590 It is max size of eword-lexical-analyze-cache - 1.")
592 (defcustom eword-lexical-analyzers
593 '(eword-analyze-quoted-string
594 eword-analyze-domain-literal
595 eword-analyze-comment
597 eword-analyze-special
598 eword-analyze-encoded-word
600 "*List of functions to return result of lexical analyze.
601 Each function must have two arguments: STRING and MUST-UNFOLD.
602 STRING is the target string to be analyzed.
603 If MUST-UNFOLD is not nil, each function must unfold and eliminate
604 bare-CR and bare-LF from the result even if they are included in
605 content of the encoded-word.
606 Each function must return nil if it can not analyze STRING as its
609 Previous function is preferred to next function. If a function
610 returns nil, next function is used. Otherwise the return value will
613 :type '(repeat function))
615 (defun eword-analyze-quoted-string (string &optional must-unfold)
616 (let ((p (std11-check-enclosure string ?\" ?\")))
618 (cons (cons 'quoted-string
619 (decode-mime-charset-string
620 (std11-strip-quoted-pair (substring string 1 (1- p)))
621 default-mime-charset))
622 (substring string p))
625 (defun eword-analyze-domain-literal (string &optional must-unfold)
626 (std11-analyze-domain-literal string))
628 (defun eword-analyze-comment (string &optional must-unfold)
629 (let ((p (std11-check-enclosure string ?\( ?\) t)))
633 (decode-mime-charset-string
634 (std11-strip-quoted-pair (substring string 1 (1- p)))
635 default-mime-charset)
637 (substring string p))
640 (defun eword-analyze-spaces (string &optional must-unfold)
641 (std11-analyze-spaces string))
643 (defun eword-analyze-special (string &optional must-unfold)
644 (std11-analyze-special string))
646 (defun eword-analyze-encoded-word (string &optional must-unfold)
647 (if (eq (string-match eword-encoded-word-regexp string) 0)
648 (let ((end (match-end 0))
649 (dest (eword-decode-encoded-word (match-string 0 string)
652 (setq string (substring string end))
653 (while (eq (string-match `,(concat "[ \t\n]*\\("
654 eword-encoded-word-regexp
658 (setq end (match-end 0))
661 (eword-decode-encoded-word (match-string 1 string)
663 string (substring string end))
665 (cons (cons 'atom dest) string)
668 (defun eword-analyze-atom (string &optional must-unfold)
669 (if (string-match std11-atom-regexp string)
670 (let ((end (match-end 0)))
671 (cons (cons 'atom (decode-mime-charset-string
672 (substring string 0 end)
673 default-mime-charset))
674 (substring string end)
677 (defun eword-lexical-analyze-internal (string must-unfold)
679 (while (not (string-equal string ""))
681 (let ((rest eword-lexical-analyzers)
683 (while (and (setq func (car rest))
684 (null (setq r (funcall func string must-unfold)))
686 (setq rest (cdr rest)))
687 (or r `((error . ,string) . ""))
689 (setq dest (cons (car ret) dest))
690 (setq string (cdr ret))
695 (defun eword-lexical-analyze (string &optional must-unfold)
696 "Return lexical analyzed list corresponding STRING.
697 It is like std11-lexical-analyze, but it decodes non us-ascii
698 characters encoded as encoded-words or invalid \"raw\" format.
699 \"Raw\" non us-ascii characters are regarded as variable
700 `default-mime-charset'."
701 (let ((key (copy-sequence string))
703 (set-text-properties 0 (length key) nil key)
704 (if (setq ret (assoc key eword-lexical-analyze-cache))
706 (setq ret (eword-lexical-analyze-internal key must-unfold))
707 (setq eword-lexical-analyze-cache
709 (last eword-lexical-analyze-cache
710 eword-lexical-analyze-cache-max)))
713 (defun eword-decode-token (token)
714 (let ((type (car token))
716 (cond ((eq type 'quoted-string)
717 (std11-wrap-as-quoted-string value))
719 (concat "(" (std11-wrap-as-quoted-pairs value '(?( ?))) ")"))
722 (defun eword-extract-address-components (string)
723 "Extract full name and canonical address from STRING.
724 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
725 If no name can be extracted, FULL-NAME will be nil.
726 It decodes non us-ascii characters in FULL-NAME encoded as
727 encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii
728 characters are regarded as variable `default-mime-charset'."
729 (let* ((structure (car (std11-parse-address
730 (eword-lexical-analyze
731 (std11-unfold-string string) 'must-unfold))))
732 (phrase (std11-full-name-string structure))
733 (address (std11-address-string structure))
735 (list phrase address)
742 (provide 'eword-decode)
744 ;;; eword-decode.el ends here