X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=eword-decode.el;h=46636106602c76a3bfdd76fa16ba7fd3bb0c3df7;hb=8a3113daa75934774a9cff658c360a7719b2407f;hp=0bf4f54ee027a8ef829d7ee14d4a901672333bcd;hpb=a4c5550837cdfb5bb9cfc746587de486f8df4106;p=elisp%2Fflim.git diff --git a/eword-decode.el b/eword-decode.el index 0bf4f54..4663610 100644 --- a/eword-decode.el +++ b/eword-decode.el @@ -4,7 +4,8 @@ ;; Author: ENAMI Tsugutomo ;; MORIOKA Tomohiko -;; Maintainer: MORIOKA Tomohiko +;; Tanaka Akira +;; Maintainer: Tanaka Akira ;; Created: 1995/10/03 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'. ;; Renamed: 1993/06/03 to tiny-mime.el @@ -12,7 +13,7 @@ ;; Renamed: 1997/02/22 from tm-ew-d.el ;; Keywords: encoded-word, MIME, multilingual, header, mail, news -;; This file is part of SEMI (Spadework for Emacs MIME Interfaces). +;; This file is part of FLAM (Faithful Library About MIME). ;; This program is free software; you can redistribute it and/or ;; modify it under the terms of the GNU General Public License as @@ -40,22 +41,66 @@ :group 'mime) +;;; @ variables +;;; + +(defcustom eword-decode-sticked-encoded-word nil + "*If non-nil, decode encoded-words sticked on atoms, +other encoded-words, etc. +however this behaviour violates RFC2047." + :group 'eword-decode + :type 'boolean) + +(defcustom eword-decode-quoted-encoded-word nil + "*If non-nil, decode encoded-words in quoted-string +however this behaviour violates RFC2047." + :group 'eword-decode + :type 'boolean) + + ;;; @ MIME encoded-word definition ;;; -(defconst eword-encoded-text-regexp "[!->@-~]+") -(defconst eword-encoded-word-regexp +(defconst eword-encoded-word-prefix-regexp (concat (regexp-quote "=?") - "\\(" - mime-charset-regexp - "\\)" + "\\(" mime-charset-regexp "\\)" (regexp-quote "?") "\\(B\\|Q\\)" - (regexp-quote "?") - "\\(" - eword-encoded-text-regexp - "\\)" - (regexp-quote "?="))) + (regexp-quote "?"))) +(defconst eword-encoded-word-suffix-regexp + (regexp-quote "?=")) + +(defconst eword-encoded-text-in-unstructured-regexp "[!->@-~]+") +(defconst eword-encoded-word-in-unstructured-regexp + (concat eword-encoded-word-prefix-regexp + "\\(" eword-encoded-text-in-unstructured-regexp "\\)" + eword-encoded-word-suffix-regexp)) +(defconst eword-after-encoded-word-in-unstructured-regexp "\\([ \t]\\|$\\)") + +(defconst eword-encoded-text-in-phrase-regexp "[-A-Za-z0-9!*+/=_]+") +(defconst eword-encoded-word-in-phrase-regexp + (concat eword-encoded-word-prefix-regexp + "\\(" eword-encoded-text-in-phrase-regexp "\\)" + eword-encoded-word-suffix-regexp)) +(defconst eword-after-encoded-word-in-phrase-regexp "\\([ \t]\\|$\\)") + +(defconst eword-encoded-text-in-comment-regexp "[]!-'*->@-[^-~]+") +(defconst eword-encoded-word-in-comment-regexp + (concat eword-encoded-word-prefix-regexp + "\\(" eword-encoded-text-in-comment-regexp "\\)" + eword-encoded-word-suffix-regexp)) +(defconst eword-after-encoded-word-in-comment-regexp "\\([ \t()\\\\]\\|$\\)") + +(defconst eword-encoded-text-in-quoted-string-regexp "[]!#->@-[^-~]+") +(defconst eword-encoded-word-in-quoted-string-regexp + (concat eword-encoded-word-prefix-regexp + "\\(" eword-encoded-text-in-quoted-string-regexp "\\)" + eword-encoded-word-suffix-regexp)) +(defconst eword-after-encoded-word-in-quoted-string-regexp "\\([ \t\"\\\\]\\|$\\)") + +; obsolete +(defconst eword-encoded-text-regexp eword-encoded-text-in-unstructured-regexp) +(defconst eword-encoded-word-regexp eword-encoded-word-in-unstructured-regexp) ;;; @@ Base64 @@ -90,10 +135,183 @@ ;; (concat "\\(Q\\)\\?" eword-Q-encoded-text-regexp)) +;;; @ internal utilities +;;; + +(defun eword-decode-first-encoded-words (string + eword-regexp + after-regexp + &optional must-unfold) + "Decode MIME encoded-words in beginning of STRING. + +EWORD-REGEXP is the regexp that matches a encoded-word. +Usual value is +eword-encoded-word-in-unstructured-regexp, +eword-encoded-text-in-phrase-regexp, +eword-encoded-word-in-comment-regexp or +eword-encoded-word-in-quoted-string-regexp. + +AFTER-REGEXP is the regexp that matches a after encoded-word. +Usual value is +eword-after-encoded-word-in-unstructured-regexp, +eword-after-encoded-text-in-phrase-regexp, +eword-after-encoded-word-in-comment-regexp or +eword-after-encoded-word-in-quoted-string-regexp. + +If beginning of STRING matches EWORD-REGEXP with AFTER-REGEXP, +returns a cons cell of decoded string(sequence of characters) and +the rest(sequence of octets). + +If beginning of STRING does not matches EWORD-REGEXP and AFTER-REGEXP, +returns nil. + +If an encoded-word is broken or your emacs implementation can not +decode the charset included in it, it is returned in decoded part +as encoded-word form. + +If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even +if there are in decoded encoded-words (generated by bad manner MUA +such as a version of Net$cape)." + (if eword-decode-sticked-encoded-word (setq after-regexp "")) + (let* ((between-ewords-regexp + (if eword-decode-sticked-encoded-word + "\\(\n?[ \t]\\)*" + "\\(\n?[ \t]\\)+")) + (between-ewords-eword-after-regexp + (concat "\\`\\(" between-ewords-regexp "\\)" + "\\(" eword-regexp "\\)" + after-regexp)) + (eword-after-regexp + (concat "\\`\\(" eword-regexp "\\)" after-regexp)) + (src string) ; sequence of octets. + (dst "")) ; sequence of characters. + (if (string-match eword-after-regexp src) + (let* (p + (q (match-end 1)) + (ew (substring src 0 q)) + (dw (eword-decode-encoded-word ew must-unfold))) + (setq dst (concat dst dw) + src (substring src q)) + (if (not (string= ew dw)) + (progn + (while + (and + (string-match between-ewords-eword-after-regexp src) + (progn + (setq p (match-end 1) + q (match-end 3) + ew (substring src p q) + dw (eword-decode-encoded-word ew must-unfold)) + (if (string= ew dw) + (progn + (setq dst (concat dst (substring src 0 q)) + src (substring src q)) + nil) + t))) + (setq dst (concat dst dw) + src (substring src q))))) + (cons dst src)) + nil))) + +(defun eword-decode-entire-string (string + eword-regexp + after-regexp + safe-regexp + escape ; ?\\ or nil. + delimiters ; list of chars. + must-unfold + code-conversion) + (if (and code-conversion + (not (mime-charset-to-coding-system code-conversion))) + (setq code-conversion default-mime-charset)) + (let ((equal-safe-regexp (concat "\\`=?" safe-regexp)) + (dst "") + (buf "") + (src string) + (ew-enable t)) + (while (< 0 (length src)) + (let ((ch (aref src 0)) + (decoded (and + ew-enable + (eword-decode-first-encoded-words src + eword-regexp after-regexp must-unfold)))) + (if (and (not (string= buf "")) + (or decoded (memq ch delimiters))) + (setq dst (concat dst + (std11-wrap-as-quoted-pairs + (decode-mime-charset-string buf code-conversion) + delimiters)) + buf "")) + (cond + (decoded + (setq dst (concat dst + (std11-wrap-as-quoted-pairs + (car decoded) + delimiters)) + src (cdr decoded))) + ((memq ch delimiters) + (setq dst (concat dst (list ch)) + src (substring src 1) + ew-enable t)) + ((eq ch escape) + (setq buf (concat buf (list (aref src 1))) + src (substring src 2) + ew-enable t)) + ((string-match "\\`[ \t\n]+" src) + (setq buf (concat buf (substring src 0 (match-end 0))) + src (substring src (match-end 0)) + ew-enable t)) + ((and (string-match equal-safe-regexp src) + (< 0 (match-end 0))) + (setq buf (concat buf (substring src 0 (match-end 0))) + src (substring src (match-end 0)) + ew-enable eword-decode-sticked-encoded-word)) + (t (error "something wrong"))))) + (if (not (string= buf "")) + (setq dst (concat dst + (std11-wrap-as-quoted-pairs + (decode-mime-charset-string buf code-conversion) + delimiters)))) + dst)) + + ;;; @ for string ;;; -(defun eword-decode-string (string &optional must-unfold) +(defun eword-decode-unstructured (string code-conversion &optional must-unfold) + (eword-decode-entire-string + string + eword-encoded-word-in-unstructured-regexp + eword-after-encoded-word-in-unstructured-regexp + "[^ \t\n=]*" + nil + nil + must-unfold + code-conversion)) + +(defun eword-decode-comment (string code-conversion &optional must-unfold) + (eword-decode-entire-string + string + eword-encoded-word-in-comment-regexp + eword-after-encoded-word-in-comment-regexp + "[^ \t\n()\\\\=]*" + ?\\ + '(?\( ?\)) + must-unfold + code-conversion)) + +(defun eword-decode-quoted-string (string code-conversion &optional must-unfold) + (eword-decode-entire-string + string + eword-encoded-word-in-quoted-string-regexp + eword-after-encoded-word-in-quoted-string-regexp + "[^ \t\n\"\\\\=]*" + ?\\ + '(?\") + must-unfold + code-conversion)) + +(defun eword-decode-string (string &optional must-unfold code-conversion) "Decode MIME encoded-words in STRING. STRING is unfolded before decoding. @@ -103,45 +321,35 @@ decode the charset included in it, it is not decoded. If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even if there are in decoded encoded-words (generated by bad manner MUA -such as a version of Net$cape)." - (setq string (std11-unfold-string string)) - (let ((dest "")(ew nil) - beg end) - (while (and (string-match eword-encoded-word-regexp string) - (setq beg (match-beginning 0) - end (match-end 0)) - ) - (if (> beg 0) - (if (not - (and (eq ew t) - (string-match "^[ \t]+$" (substring string 0 beg)) - )) - (setq dest (concat dest (substring string 0 beg))) - ) - ) - (setq dest - (concat dest - (eword-decode-encoded-word - (substring string beg end) must-unfold) - )) - (setq string (substring string end)) - (setq ew t) - ) - (concat dest string) - )) +such as a version of Net$cape). + +If CODE-CONVERSION is nil, it decodes only encoded-words. If it is +mime-charset, it decodes non-ASCII bit patterns as the mime-charset. +Otherwise it decodes non-ASCII bit patterns as the +default-mime-charset." + (eword-decode-unstructured + (std11-unfold-string string) + code-conversion + must-unfold)) ;;; @ for region ;;; -(defun eword-decode-region (start end &optional unfolding must-unfold) +(defun eword-decode-region (start end &optional unfolding must-unfold + code-conversion) "Decode MIME encoded-words in region between START and END. If UNFOLDING is not nil, it unfolds before decoding. If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even if there are in decoded encoded-words (generated by bad manner MUA -such as a version of Net$cape)." +such as a version of Net$cape). + +If CODE-CONVERSION is nil, it decodes only encoded-words. If it is +mime-charset, it decodes non-ASCII bit patterns as the mime-charset. +Otherwise it decodes non-ASCII bit patterns as the +default-mime-charset." (interactive "*r") (save-excursion (save-restriction @@ -149,39 +357,29 @@ such as a version of Net$cape)." (if unfolding (eword-decode-unfold) ) - (goto-char (point-min)) - (while (re-search-forward (concat "\\(" eword-encoded-word-regexp "\\)" - "\\(\n?[ \t]\\)+" - "\\(" eword-encoded-word-regexp "\\)") - nil t) - (replace-match "\\1\\6") - (goto-char (point-min)) - ) - (while (re-search-forward eword-encoded-word-regexp nil t) - (insert (eword-decode-encoded-word - (prog1 - (buffer-substring (match-beginning 0) (match-end 0)) - (delete-region (match-beginning 0) (match-end 0)) - ) must-unfold)) - ) - ))) + (let ((str (eword-decode-unstructured + (buffer-substring (point-min) (point-max)) + code-conversion + must-unfold))) + (delete-region (point-min) (point-max)) + (insert str))))) ;;; @ for message header ;;; (defcustom eword-decode-ignored-field-list - '(newsgroups path lines nntp-posting-host received message-id date) + '(Newsgroups Path Lines Nntp-Posting-Host Received Message-Id Date) "*List of field-names to be ignored when decoding. Each field name must be symbol." :group 'eword-decode :type '(repeat symbol)) (defcustom eword-decode-structured-field-list - '(reply-to resent-reply-to from resent-from sender resent-sender - to resent-to cc resent-cc bcc resent-bcc dcc - mime-version content-type content-transfer-encoding - content-disposition) + '(Reply-To Resent-Reply-To From Resent-From Sender Resent-Sender + To Resent-To Cc Resent-Cc Bcc Resent-Bcc Dcc + Mime-Version Content-Type Content-Transfer-Encoding + Content-Disposition) "*List of field-names to decode as structured field. Each field name must be symbol." :group 'eword-decode @@ -195,47 +393,43 @@ Otherwise it decodes non-ASCII bit patterns as the default-mime-charset. If SEPARATOR is not nil, it is used as header separator." (interactive "*") + (if (and code-conversion + (not (mime-charset-to-coding-system code-conversion))) + (setq code-conversion default-mime-charset)) (save-excursion (save-restriction (std11-narrow-to-header separator) - (let ((default-charset - (if code-conversion - (if (mime-charset-to-coding-system code-conversion) - code-conversion - default-mime-charset)))) - (if default-charset - (let (beg p end field-name len) - (goto-char (point-min)) - (while (re-search-forward std11-field-head-regexp nil t) - (setq beg (match-beginning 0) - p (match-end 0) - field-name (buffer-substring beg (1- p)) - len (string-width field-name) - field-name (intern (downcase field-name)) - end (std11-field-end)) - (cond ((memq field-name eword-decode-ignored-field-list) - ;; Don't decode - ) - ((memq field-name eword-decode-structured-field-list) - ;; Decode as structured field - (let ((body (buffer-substring p end)) - (default-mime-charset default-charset)) - (delete-region p end) - (insert (eword-decode-and-fold-structured-field - body (1+ len))) - )) - (t - ;; Decode as unstructured field - (save-restriction - (narrow-to-region beg (1+ end)) - (decode-mime-charset-region p end default-charset) - (goto-char p) - (if (re-search-forward eword-encoded-word-regexp - nil t) - (eword-decode-region beg (point-max) 'unfold)) - ))))) - (eword-decode-region (point-min) (point-max) t) - ))))) + (if code-conversion + (let (beg p end field-name len) + (goto-char (point-min)) + (while (re-search-forward std11-field-head-regexp nil t) + (setq beg (match-beginning 0) + p (match-end 0) + field-name (buffer-substring beg (1- p)) + len (string-width field-name) + field-name (intern (capitalize field-name)) + end (std11-field-end)) + (cond ((memq field-name eword-decode-ignored-field-list) + ;; Don't decode + ) + ((memq field-name eword-decode-structured-field-list) + ;; Decode as structured field + (let ((body (buffer-substring p end))) + (delete-region p end) + (insert (eword-decode-and-fold-structured-field + body (1+ len))) + )) + (t + ;; Decode as unstructured field + (save-restriction + (narrow-to-region beg (1+ end)) + (goto-char p) + (eword-decode-region beg (point-max) 'unfold nil + code-conversion) + (goto-char (point-max)) + ))))) + (eword-decode-region (point-min) (point-max) t nil nil) + )))) (defun eword-decode-unfold () (goto-char (point-min)) @@ -333,21 +527,18 @@ as a version of Net$cape)." (error "Invalid encoding %s" encoding) ))) ) - (if dest - (progn - (setq dest (decode-coding-string dest cs)) - (if must-unfold - (mapconcat (function - (lambda (chr) - (cond - ((eq chr ?\n) "") - ((eq chr ?\t) " ") - (t (char-to-string chr))) - )) - (std11-unfold-string dest) - "") - dest) - )))))) + (when dest + (setq dest (decode-mime-charset-string dest charset)) + (if must-unfold + (mapconcat (function + (lambda (chr) + (cond ((eq chr ?\n) "") + ((eq chr ?\t) " ") + (t (char-to-string chr))) + )) + (std11-unfold-string dest) + "") + dest)))))) ;;; @ lexical analyze @@ -384,27 +575,35 @@ be the result." (defun eword-analyze-quoted-string (string &optional must-unfold) (let ((p (std11-check-enclosure string ?\" ?\"))) (if p - (cons (cons 'quoted-string - (decode-mime-charset-string - (std11-strip-quoted-pair (substring string 1 (1- p))) - default-mime-charset)) - (substring string p)) - ))) + (cons (cons 'quoted-string + (if eword-decode-quoted-encoded-word + (std11-wrap-as-quoted-string + (eword-decode-quoted-string + (substring string 1 (1- p)) + default-mime-charset)) + (std11-wrap-as-quoted-string + (decode-mime-charset-string + (std11-strip-quoted-pair (substring string 1 (1- p))) + default-mime-charset)))) + (substring string p))) + )) (defun eword-analyze-domain-literal (string &optional must-unfold) (std11-analyze-domain-literal string)) (defun eword-analyze-comment (string &optional must-unfold) - (let ((p (std11-check-enclosure string ?\( ?\) t))) - (if p - (cons (cons 'comment - (eword-decode-string - (decode-mime-charset-string - (std11-strip-quoted-pair (substring string 1 (1- p))) - default-mime-charset) - must-unfold)) - (substring string p)) - ))) + (let ((len (length string))) + (if (and (< 0 len) (eq (aref string 0) ?\()) + (let ((p 0)) + (while (and p (< p len) (eq (aref string p) ?\()) + (setq p (std11-check-enclosure string ?\( ?\) t p))) + (setq p (or p len)) + (cons (cons 'comment + (eword-decode-comment + (std11-unfold-string (substring string 0 p)) + default-mime-charset)) + (substring string p))) + nil))) (defun eword-analyze-spaces (string &optional must-unfold) (std11-analyze-spaces string)) @@ -413,30 +612,31 @@ be the result." (std11-analyze-special string)) (defun eword-analyze-encoded-word (string &optional must-unfold) - (if (eq (string-match eword-encoded-word-regexp string) 0) - (let ((end (match-end 0)) - (dest (eword-decode-encoded-word (match-string 0 string) - must-unfold)) - ) - (setq string (substring string end)) - (while (eq (string-match `,(concat "[ \t\n]*\\(" - eword-encoded-word-regexp - "\\)") - string) - 0) - (setq end (match-end 0)) - (setq dest - (concat dest - (eword-decode-encoded-word (match-string 1 string) - must-unfold)) - string (substring string end)) - ) - (cons (cons 'atom dest) string) - ))) + (let ((decoded (eword-decode-first-encoded-words + string + eword-encoded-word-in-phrase-regexp + eword-after-encoded-word-in-phrase-regexp + must-unfold))) + (if decoded + (let ((s (car decoded))) + (while (or (string-match std11-atom-regexp s) + (string-match std11-spaces-regexp s)) + (setq s (substring s (match-end 0)))) + (if (= (length s) 0) + (cons (cons 'atom (car decoded)) (cdr decoded)) + (cons (cons 'quoted-string + (std11-wrap-as-quoted-string (car decoded))) + (cdr decoded))))))) (defun eword-analyze-atom (string &optional must-unfold) - (if (string-match std11-atom-regexp string) + (if (let ((enable-multibyte-characters nil)) + (string-match std11-atom-regexp string)) (let ((end (match-end 0))) + (if (and eword-decode-sticked-encoded-word + (string-match eword-encoded-word-in-phrase-regexp + (substring string 0 end)) + (< 0 (match-beginning 0))) + (setq end (match-beginning 0))) (cons (cons 'atom (decode-mime-charset-string (substring string 0 end) default-mime-charset)) @@ -444,17 +644,24 @@ be the result." )))) (defun eword-lexical-analyze-internal (string must-unfold) - (let (dest ret) + (let ((last 'eword-analyze-spaces) + dest ret) (while (not (string-equal string "")) (setq ret - (let ((rest eword-lexical-analyzers) - func r) - (while (and (setq func (car rest)) - (null (setq r (funcall func string must-unfold))) - ) - (setq rest (cdr rest))) - (or r `((error . ,string) . "")) - )) + (let ((rest eword-lexical-analyzers) + func r) + (while (and (setq func (car rest)) + (or + (and + (not eword-decode-sticked-encoded-word) + (not (eq last 'eword-analyze-spaces)) + (eq func 'eword-analyze-encoded-word)) + (null (setq r (funcall func string must-unfold)))) + ) + (setq rest (cdr rest))) + (setq last func) + (or r `((error . ,string) . "")) + )) (setq dest (cons (car ret) dest)) (setq string (cdr ret)) ) @@ -467,12 +674,13 @@ It is like std11-lexical-analyze, but it decodes non us-ascii characters encoded as encoded-words or invalid \"raw\" format. \"Raw\" non us-ascii characters are regarded as variable `default-mime-charset'." - (let ((key (copy-sequence string)) - ret) - (set-text-properties 0 (length key) nil key) + (let* ((str (copy-sequence string)) + (key (cons str (cons default-mime-charset must-unfold))) + ret) + (set-text-properties 0 (length str) nil str) (if (setq ret (assoc key eword-lexical-analyze-cache)) (cdr ret) - (setq ret (eword-lexical-analyze-internal key must-unfold)) + (setq ret (eword-lexical-analyze-internal str must-unfold)) (setq eword-lexical-analyze-cache (cons (cons key ret) (last eword-lexical-analyze-cache @@ -480,13 +688,7 @@ characters encoded as encoded-words or invalid \"raw\" format. ret))) (defun eword-decode-token (token) - (let ((type (car token)) - (value (cdr token))) - (cond ((eq type 'quoted-string) - (std11-wrap-as-quoted-string value)) - ((eq type 'comment) - (concat "(" (std11-wrap-as-quoted-pairs value '(?( ?))) ")")) - (t value)))) + (cdr token)) (defun eword-decode-and-fold-structured-field (string start-column &optional max-column must-unfold) @@ -592,9 +794,7 @@ decode the charset included in it, it is not decoded. If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even if there are in decoded encoded-words (generated by bad manner MUA such as a version of Net$cape)." - (eword-decode-string - (decode-mime-charset-string string default-mime-charset) - must-unfold)) + (eword-decode-string string must-unfold default-mime-charset)) (defun eword-extract-address-components (string) "Extract full name and canonical address from STRING.