X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=eword-decode.el;h=077ae7890aa6fe18fd2886a566cde77b57efdbc5;hb=4d9c85cce61c5f3a0e1746a885b57b667f3843d8;hp=a3d729e0f80afe66a1beae89a77a2694ab80d604;hpb=1db53190289c3b9448dc5c06adc76af0ae963637;p=elisp%2Fflim.git diff --git a/eword-decode.el b/eword-decode.el index a3d729e..077ae78 100644 --- a/eword-decode.el +++ b/eword-decode.el @@ -32,14 +32,26 @@ ;;; Code: -(require 'std11-parse) +(require 'std11) (require 'mel) (require 'mime-def) +(require 'ew-dec) + (defgroup eword-decode nil "Encoded-word decoding" :group 'mime) +;;; TEST + +(defvar rotate-memo nil) +(defmacro rotate-memo (var val) + `(when rotate-memo + (unless (boundp ',var) (setq ,var ())) + (setq ,var (cons ,val ,var)) + (let ((tmp (last ,var (- (length ,var) 100)))) + (when tmp (setcdr tmp nil))) + ,var)) ;;; @ variables ;;; @@ -82,7 +94,7 @@ however this behaviour violates RFC2047." (concat eword-encoded-word-prefix-regexp "\\(" eword-encoded-text-in-phrase-regexp "\\)" eword-encoded-word-suffix-regexp)) -(defconst eword-after-encoded-word-in-phrase-regexp "\\([ \t(]\\|$\\)") +(defconst eword-after-encoded-word-in-phrase-regexp "\\([ \t]\\|$\\)") (defconst eword-encoded-text-in-comment-regexp "[]!-'*->@-[^-~]+") (defconst eword-encoded-word-in-comment-regexp @@ -129,11 +141,6 @@ however this behaviour violates RFC2047." ;;; @@ Quoted-Printable ;;; -(defconst quoted-printable-hex-chars "0123456789ABCDEF") -(defconst quoted-printable-octet-regexp - (concat "=[" quoted-printable-hex-chars - "][" quoted-printable-hex-chars "]")) - (defconst eword-Q-encoded-text-regexp (concat "\\([^=?]\\|" quoted-printable-octet-regexp "\\)+")) ;; (defconst eword-Q-encoding-and-encoded-text-regexp @@ -224,6 +231,7 @@ such as a version of Net$cape)." safe-regexp escape ; ?\\ or nil. delimiters ; list of chars. + chars-must-be-quote must-unfold code-conversion) (if (and code-conversion @@ -245,14 +253,14 @@ such as a version of Net$cape)." (setq dst (concat dst (std11-wrap-as-quoted-pairs (decode-mime-charset-string buf code-conversion) - delimiters)) + chars-must-be-quote)) buf "")) (cond (decoded (setq dst (concat dst (std11-wrap-as-quoted-pairs (car decoded) - delimiters)) + chars-must-be-quote)) src (cdr decoded))) ((memq ch delimiters) (setq dst (concat dst (list ch)) @@ -276,7 +284,7 @@ such as a version of Net$cape)." (setq dst (concat dst (std11-wrap-as-quoted-pairs (decode-mime-charset-string buf code-conversion) - delimiters)))) + chars-must-be-quote)))) dst)) @@ -291,6 +299,7 @@ such as a version of Net$cape)." "[^ \t\n=]*" nil nil + nil must-unfold code-conversion)) @@ -302,6 +311,7 @@ such as a version of Net$cape)." "[^ \t\n()\\\\=]*" ?\\ '(?\( ?\)) + '(?\( ?\) ?\\ ?\r ?\n) must-unfold code-conversion)) @@ -313,6 +323,7 @@ such as a version of Net$cape)." "[^ \t\n\"\\\\=]*" ?\\ '(?\") + '(?\" ?\\ ?\r ?\n) must-unfold code-conversion)) @@ -356,6 +367,8 @@ mime-charset, it decodes non-ASCII bit patterns as the mime-charset. Otherwise it decodes non-ASCII bit patterns as the default-mime-charset." (interactive "*r") + (rotate-memo args-eword-decode-region + (list start end (buffer-substring start end) unfolding must-unfold code-conversion)) (save-excursion (save-restriction (narrow-to-region start end) @@ -374,17 +387,17 @@ default-mime-charset." ;;; (defcustom eword-decode-ignored-field-list - '(newsgroups path lines nntp-posting-host received message-id date) + '(Newsgroups Path Lines Nntp-Posting-Host Received Message-Id Date) "*List of field-names to be ignored when decoding. Each field name must be symbol." :group 'eword-decode :type '(repeat symbol)) (defcustom eword-decode-structured-field-list - '(reply-to resent-reply-to from resent-from sender resent-sender - to resent-to cc resent-cc bcc resent-bcc dcc - mime-version content-type content-transfer-encoding - content-disposition) + '(Reply-To Resent-Reply-To From Resent-From Sender Resent-Sender + To Resent-To Cc Resent-Cc Bcc Resent-Bcc Dcc + Mime-Version Content-Type Content-Transfer-Encoding + Content-Disposition) "*List of field-names to decode as structured field. Each field name must be symbol." :group 'eword-decode @@ -398,41 +411,40 @@ Otherwise it decodes non-ASCII bit patterns as the default-mime-charset. If SEPARATOR is not nil, it is used as header separator." (interactive "*") + (rotate-memo args-eword-decode-header (list code-conversion)) + (unless code-conversion + (message "eword-decode-header is called with no code-conversion") + (sit-for 2)) (if (and code-conversion (not (mime-charset-to-coding-system code-conversion))) (setq code-conversion default-mime-charset)) (save-excursion (save-restriction (std11-narrow-to-header separator) + (rotate-memo args-h-eword-decode-header (buffer-substring (point-min) (point-max))) (if code-conversion - (let (beg p end field-name len) + (let (beg p end field-name field-body decoded) (goto-char (point-min)) (while (re-search-forward std11-field-head-regexp nil t) (setq beg (match-beginning 0) p (match-end 0) field-name (buffer-substring beg (1- p)) - len (string-width field-name) - field-name (intern (downcase field-name)) - end (std11-field-end)) - (cond ((memq field-name eword-decode-ignored-field-list) - ;; Don't decode - ) - ((memq field-name eword-decode-structured-field-list) - ;; Decode as structured field - (let ((body (buffer-substring p end))) - (delete-region p end) - (insert (eword-decode-and-fold-structured-field - body (1+ len))) - )) - (t - ;; Decode as unstructured field - (save-restriction - (narrow-to-region beg (1+ end)) - (goto-char p) - (eword-decode-region beg (point-max) 'unfold nil - code-conversion) - (goto-char (point-max)) - ))))) + end (std11-field-end) + field-body (ew-lf-crlf-to-crlf + (buffer-substring p end)) + decoded (ew-decode-field + field-name field-body)) + (unless (equal field-body decoded) + (setq decoded (ew-crlf-refold + decoded + (1+ (string-width field-name)) + fill-column))) + (delete-region p end) + (insert (ew-crlf-to-lf decoded)) + (add-text-properties beg (min (1+ (point)) (point-max)) + (list 'original-field-name field-name + 'original-field-body field-body)) + )) (eword-decode-region (point-min) (point-max) t nil nil) )))) @@ -457,7 +469,18 @@ If SEPARATOR is not nil, it is used as header separator." ;;; @ encoded-word decoder ;;; -(defvar eword-warning-face nil "Face used for invalid encoded-word.") +(defvar eword-decode-encoded-word-error-handler + 'eword-decode-encoded-word-default-error-handler) + +(defvar eword-warning-face nil + "Face used for invalid encoded-word.") + +(defun eword-decode-encoded-word-default-error-handler (word signal) + (and (add-text-properties 0 (length word) + (and eword-warning-face + (list 'face eword-warning-face)) + word) + word)) (defun eword-decode-encoded-word (word &optional must-unfold) "Decode WORD if it is an encoded-word. @@ -482,12 +505,8 @@ as a version of Net$cape)." (condition-case err (eword-decode-encoded-text charset encoding text must-unfold) (error - (and - (add-text-properties 0 (length word) - (and eword-warning-face - (list 'face eword-warning-face)) - word) - word))) + (funcall eword-decode-encoded-word-error-handler word err) + )) )) word)) @@ -525,21 +544,18 @@ as a version of Net$cape)." (error "Invalid encoding %s" encoding) ))) ) - (if dest - (progn - (setq dest (decode-coding-string dest cs)) - (if must-unfold - (mapconcat (function - (lambda (chr) - (cond - ((eq chr ?\n) "") - ((eq chr ?\t) " ") - (t (char-to-string chr))) - )) - (std11-unfold-string dest) - "") - dest) - )))))) + (when dest + (setq dest (decode-mime-charset-string dest charset)) + (if must-unfold + (mapconcat (function + (lambda (chr) + (cond ((eq chr ?\n) "") + ((eq chr ?\t) " ") + (t (char-to-string chr))) + )) + (std11-unfold-string dest) + "") + dest)))))) ;;; @ lexical analyze @@ -576,12 +592,17 @@ be the result." (defun eword-analyze-quoted-string (string &optional must-unfold) (let ((p (std11-check-enclosure string ?\" ?\"))) (if p - (cons (cons 'quoted-string - (eword-decode-quoted-string - (substring string 0 p) - default-mime-charset)) - (substring string p)) - ))) + (cons (cons 'quoted-string + (if eword-decode-quoted-encoded-word + (eword-decode-quoted-string + (substring string 0 p) + default-mime-charset) + (std11-wrap-as-quoted-string + (decode-mime-charset-string + (std11-strip-quoted-pair (substring string 1 (1- p))) + default-mime-charset)))) + (substring string p))) + )) (defun eword-analyze-domain-literal (string &optional must-unfold) (std11-analyze-domain-literal string)) @@ -595,7 +616,7 @@ be the result." (setq p (or p len)) (cons (cons 'comment (eword-decode-comment - (substring string 0 p) + (std11-unfold-string (substring string 0 p)) default-mime-charset)) (substring string p))) nil))) @@ -608,12 +629,20 @@ be the result." (defun eword-analyze-encoded-word (string &optional must-unfold) (let ((decoded (eword-decode-first-encoded-words - string - eword-encoded-word-in-phrase-regexp - eword-after-encoded-word-in-phrase-regexp - must-unfold))) + string + eword-encoded-word-in-phrase-regexp + eword-after-encoded-word-in-phrase-regexp + must-unfold))) (if decoded - (cons (cons 'atom (car decoded)) (cdr decoded))))) + (let ((s (car decoded))) + (while (or (string-match std11-atom-regexp s) + (string-match std11-spaces-regexp s)) + (setq s (substring s (match-end 0)))) + (if (= (length s) 0) + (cons (cons 'atom (car decoded)) (cdr decoded)) + (cons (cons 'quoted-string + (std11-wrap-as-quoted-string (car decoded))) + (cdr decoded))))))) (defun eword-analyze-atom (string &optional must-unfold) (if (let ((enable-multibyte-characters nil)) @@ -631,17 +660,24 @@ be the result." )))) (defun eword-lexical-analyze-internal (string must-unfold) - (let (dest ret) + (let ((last 'eword-analyze-spaces) + dest ret) (while (not (string-equal string "")) (setq ret - (let ((rest eword-lexical-analyzers) - func r) - (while (and (setq func (car rest)) - (null (setq r (funcall func string must-unfold))) - ) - (setq rest (cdr rest))) - (or r `((error . ,string) . "")) - )) + (let ((rest eword-lexical-analyzers) + func r) + (while (and (setq func (car rest)) + (or + (and + (not eword-decode-sticked-encoded-word) + (not (eq last 'eword-analyze-spaces)) + (eq func 'eword-analyze-encoded-word)) + (null (setq r (funcall func string must-unfold)))) + ) + (setq rest (cdr rest))) + (setq last func) + (or r `((error . ,string) . "")) + )) (setq dest (cons (car ret) dest)) (setq string (cdr ret)) ) @@ -685,34 +721,17 @@ If MAX-COLUMN is omitted, `fill-column' is used. If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even if there are in decoded encoded-words (generated by bad manner MUA such as a version of Net$cape)." + (rotate-memo args-eword-decode-and-fold-structured-field + (list string start-column max-column must-unfold)) (or max-column (setq max-column fill-column)) - (let ((c start-column) - (tokens (eword-lexical-analyze string must-unfold)) - (result "") - token) - (while (and (setq token (car tokens)) - (setq tokens (cdr tokens))) - (let* ((type (car token))) - (if (eq type 'spaces) - (let* ((next-token (car tokens)) - (next-str (eword-decode-token next-token)) - (next-len (string-width next-str)) - (next-c (+ c next-len 1))) - (if (< next-c max-column) - (setq result (concat result " " next-str) - c next-c) - (setq result (concat result "\n " next-str) - c (1+ next-len))) - (setq tokens (cdr tokens)) - ) - (let* ((str (eword-decode-token token))) - (setq result (concat result str) - c (+ c (string-width str))) - )))) - (if token - (concat result (eword-decode-token token)) - result))) + (let* ((field-name (make-string (1- start-column) ?X)) + (field-body (ew-lf-crlf-to-crlf string)) + (ew-decode-field-default-syntax '(ew-scan-unibyte-std11)) + (decoded (ew-decode-field field-name field-body))) + (unless (equal field-body decoded) + (setq decoded (ew-crlf-refold decoded start-column max-column))) + (ew-crlf-to-lf decoded))) (defun eword-decode-and-unfold-structured-field (string) "Decode and unfold STRING as structured field body. @@ -722,18 +741,10 @@ characters are regarded as variable `default-mime-charset'. If an encoded-word is broken or your emacs implementation can not decode the charset included in it, it is not decoded." - (let ((tokens (eword-lexical-analyze string 'must-unfold)) - (result "")) - (while tokens - (let* ((token (car tokens)) - (type (car token))) - (setq tokens (cdr tokens)) - (setq result - (if (eq type 'spaces) - (concat result " ") - (concat result (eword-decode-token token)) - )))) - result)) + (rotate-memo args-eword-decode-and-unfold-structured-field (list string)) + (let* ((ew-decode-field-default-syntax '(ew-scan-unibyte-std11)) + (decoded (ew-decode-field "" (ew-lf-crlf-to-crlf string)))) + (ew-crlf-to-lf (ew-crlf-unfold decoded)))) (defun eword-decode-structured-field-body (string &optional must-unfold start-column max-column) @@ -750,15 +761,16 @@ decode the charset included in it, it is not decoded. If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even if there are in decoded encoded-words (generated by bad manner MUA such as a version of Net$cape)." + (rotate-memo args-eword-decode-structured-field-body + (list string must-unfold start-column max-column)) (if start-column ;; fold with max-column (eword-decode-and-fold-structured-field string start-column max-column must-unfold) ;; Don't fold - (mapconcat (function eword-decode-token) - (eword-lexical-analyze string must-unfold) - "") - )) + (let* ((ew-decode-field-default-syntax '(ew-scan-unibyte-std11)) + (decoded (ew-decode-field "" (ew-lf-crlf-to-crlf string)))) + (ew-crlf-to-lf decoded)))) (defun eword-decode-unstructured-field-body (string &optional must-unfold) "Decode non us-ascii characters in STRING as unstructured field body. @@ -774,7 +786,10 @@ decode the charset included in it, it is not decoded. If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even if there are in decoded encoded-words (generated by bad manner MUA such as a version of Net$cape)." - (eword-decode-string string must-unfold default-mime-charset)) + (rotate-memo args-eword-decode-unstructured-field-body + (list string must-unfold)) + (let ((decoded (ew-decode-field "" (ew-lf-crlf-to-crlf string)))) + (ew-crlf-to-lf (ew-crlf-unfold decoded)))) (defun eword-extract-address-components (string) "Extract full name and canonical address from STRING. @@ -783,6 +798,7 @@ If no name can be extracted, FULL-NAME will be nil. It decodes non us-ascii characters in FULL-NAME encoded as encoded-words or invalid \"raw\" string. \"Raw\" non us-ascii characters are regarded as variable `default-mime-charset'." + (rotate-memo args-eword-extract-address-components (list string)) (let* ((structure (car (std11-parse-address (eword-lexical-analyze (std11-unfold-string string) 'must-unfold))))