Sync up with flim-1_3_0 to flim-1_8_1.
[elisp/flim.git] / eword-decode.el
index f0e74e5..fa07ab1 100644 (file)
@@ -4,7 +4,8 @@
 
 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
 ;;         MORIOKA Tomohiko <morioka@jaist.ac.jp>
-;; Maintainer: MORIOKA Tomohiko <morioka@jaist.ac.jp>
+;;         Tanaka Akira <akr@jaist.ac.jp>
+;; Maintainer: Tanaka Akira <akr@jaist.ac.jp>
 ;; Created: 1995/10/03
 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
 ;;     Renamed: 1993/06/03 to tiny-mime.el
@@ -12,7 +13,7 @@
 ;;     Renamed: 1997/02/22 from tm-ew-d.el
 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
 
-;; This file is part of SEMI (Spadework for Emacs MIME Interfaces).
+;; This file is part of FLAM (Faithful Library About MIME).
 
 ;; This program is free software; you can redistribute it and/or
 ;; modify it under the terms of the GNU General Public License as
   :group 'mime)
 
 
+;;; @ variables
+;;;
+
+(defcustom eword-decode-sticked-encoded-word nil
+  "*If non-nil, decode encoded-words sticked on atoms,
+other encoded-words, etc.
+however this behaviour violates RFC2047."
+  :group 'eword-decode
+  :type 'boolean)
+
+(defcustom eword-decode-quoted-encoded-word nil
+  "*If non-nil, decode encoded-words in quoted-string
+however this behaviour violates RFC2047."
+  :group 'eword-decode
+  :type 'boolean)
+
+
 ;;; @ MIME encoded-word definition
 ;;;
 
-(defconst eword-encoded-text-regexp "[!->@-~]+")
-(defconst eword-encoded-word-regexp
+(defconst eword-encoded-word-prefix-regexp
   (concat (regexp-quote "=?")
-         "\\("
-         mime-charset-regexp
-         "\\)"
+         "\\(" mime-charset-regexp "\\)"
          (regexp-quote "?")
          "\\(B\\|Q\\)"
-         (regexp-quote "?")
-         "\\("
-         eword-encoded-text-regexp
-         "\\)"
-         (regexp-quote "?=")))
+         (regexp-quote "?")))
+(defconst eword-encoded-word-suffix-regexp
+  (regexp-quote "?="))
+
+(defconst eword-encoded-text-in-unstructured-regexp "[!->@-~]+")
+(defconst eword-encoded-word-in-unstructured-regexp
+  (concat eword-encoded-word-prefix-regexp
+         "\\(" eword-encoded-text-in-unstructured-regexp "\\)"
+         eword-encoded-word-suffix-regexp))
+(defconst eword-after-encoded-word-in-unstructured-regexp "\\([ \t]\\|$\\)")
+
+(defconst eword-encoded-text-in-phrase-regexp "[-A-Za-z0-9!*+/=_]+")
+(defconst eword-encoded-word-in-phrase-regexp
+  (concat eword-encoded-word-prefix-regexp
+         "\\(" eword-encoded-text-in-phrase-regexp "\\)"
+         eword-encoded-word-suffix-regexp))
+(defconst eword-after-encoded-word-in-phrase-regexp "\\([ \t]\\|$\\)")
+
+(defconst eword-encoded-text-in-comment-regexp "[]!-'*->@-[^-~]+")
+(defconst eword-encoded-word-in-comment-regexp
+  (concat eword-encoded-word-prefix-regexp
+         "\\(" eword-encoded-text-in-comment-regexp "\\)"
+         eword-encoded-word-suffix-regexp))
+(defconst eword-after-encoded-word-in-comment-regexp "\\([ \t()\\\\]\\|$\\)")
+
+(defconst eword-encoded-text-in-quoted-string-regexp "[]!#->@-[^-~]+")
+(defconst eword-encoded-word-in-quoted-string-regexp
+  (concat eword-encoded-word-prefix-regexp
+         "\\(" eword-encoded-text-in-quoted-string-regexp "\\)"
+         eword-encoded-word-suffix-regexp))
+(defconst eword-after-encoded-word-in-quoted-string-regexp "\\([ \t\"\\\\]\\|$\\)")
+
+; obsolete
+(defconst eword-encoded-text-regexp eword-encoded-text-in-unstructured-regexp)
+(defconst eword-encoded-word-regexp eword-encoded-word-in-unstructured-regexp)
 
 
 ;;; @@ Base64
 ;;   (concat "\\(Q\\)\\?" eword-Q-encoded-text-regexp))
 
 
+;;; @ internal utilities
+;;;
+
+(defun eword-decode-first-encoded-words (string
+                                        eword-regexp
+                                        after-regexp
+                                        &optional must-unfold)
+  "Decode MIME encoded-words in beginning of STRING.
+
+EWORD-REGEXP is the regexp that matches a encoded-word.
+Usual value is
+eword-encoded-word-in-unstructured-regexp, 
+eword-encoded-text-in-phrase-regexp,
+eword-encoded-word-in-comment-regexp or
+eword-encoded-word-in-quoted-string-regexp.
+
+AFTER-REGEXP is the regexp that matches a after encoded-word.
+Usual value is
+eword-after-encoded-word-in-unstructured-regexp, 
+eword-after-encoded-text-in-phrase-regexp,
+eword-after-encoded-word-in-comment-regexp or
+eword-after-encoded-word-in-quoted-string-regexp.
+
+If beginning of STRING matches EWORD-REGEXP with AFTER-REGEXP,
+returns a cons cell of decoded string(sequence of characters) and 
+the rest(sequence of octets).
+
+If beginning of STRING does not matches EWORD-REGEXP and AFTER-REGEXP,
+returns nil.
+
+If an encoded-word is broken or your emacs implementation can not
+decode the charset included in it, it is returned in decoded part
+as encoded-word form.
+
+If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
+if there are in decoded encoded-words (generated by bad manner MUA
+such as a version of Net$cape)."
+  (if eword-decode-sticked-encoded-word (setq after-regexp ""))
+  (let* ((between-ewords-regexp
+          (if eword-decode-sticked-encoded-word
+            "\\(\n?[ \t]\\)*"
+            "\\(\n?[ \t]\\)+"))
+        (between-ewords-eword-after-regexp
+          (concat "\\`\\(" between-ewords-regexp "\\)"
+                     "\\(" eword-regexp "\\)"
+                     after-regexp))
+        (eword-after-regexp
+          (concat "\\`\\(" eword-regexp "\\)" after-regexp))
+        (src string)   ; sequence of octets.
+        (dst ""))      ; sequence of characters.
+    (if (string-match eword-after-regexp src)
+      (let* (p
+            (q (match-end 1))
+            (ew (substring src 0 q))
+            (dw (eword-decode-encoded-word ew must-unfold)))
+        (setq dst (concat dst dw)
+             src (substring src q))
+       (if (not (string= ew dw))
+         (progn
+           (while
+             (and
+               (string-match between-ewords-eword-after-regexp src)
+               (progn
+                 (setq p (match-end 1)
+                       q (match-end 3)
+                       ew (substring src p q)
+                       dw (eword-decode-encoded-word ew must-unfold))
+                 (if (string= ew dw)
+                   (progn
+                     (setq dst (concat dst (substring src 0 q))
+                           src (substring src q))
+                     nil)
+                   t)))
+             (setq dst (concat dst dw)
+                   src (substring src q)))))
+       (cons dst src))
+      nil)))
+
+(defun eword-decode-entire-string (string
+                                  eword-regexp
+                                  after-regexp
+                                  safe-regexp
+                                  escape ; ?\\ or nil.
+                                  delimiters ; list of chars.
+                                  must-unfold
+                                  code-conversion)
+  (if (and code-conversion
+          (not (mime-charset-to-coding-system code-conversion)))
+      (setq code-conversion default-mime-charset))
+  (let ((equal-safe-regexp (concat "\\`=?" safe-regexp))
+       (dst "")
+       (buf "")
+       (src string)
+       (ew-enable t))
+    (while (< 0 (length src))
+      (let ((ch (aref src 0))
+           (decoded (and
+                       ew-enable
+                       (eword-decode-first-encoded-words src
+                         eword-regexp after-regexp must-unfold))))
+       (if (and (not (string= buf ""))
+                (or decoded (memq ch delimiters)))
+         (setq dst (concat dst
+                     (std11-wrap-as-quoted-pairs
+                       (decode-mime-charset-string buf code-conversion)
+                       delimiters))
+               buf ""))
+       (cond
+         (decoded
+           (setq dst (concat dst
+                       (std11-wrap-as-quoted-pairs
+                         (car decoded)
+                         delimiters))
+                 src (cdr decoded)))
+         ((memq ch delimiters)
+           (setq dst (concat dst (list ch))
+                 src (substring src 1)
+                 ew-enable t))
+         ((eq ch escape)
+           (setq buf (concat buf (list (aref src 1)))
+                 src (substring src 2)
+                 ew-enable t))
+         ((string-match "\\`[ \t\n]+" src)
+           (setq buf (concat buf (substring src 0 (match-end 0)))
+                 src (substring src (match-end 0))
+                 ew-enable t))
+         ((and (string-match equal-safe-regexp src)
+               (< 0 (match-end 0)))
+           (setq buf (concat buf (substring src 0 (match-end 0)))
+                 src (substring src (match-end 0))
+                 ew-enable eword-decode-sticked-encoded-word))
+         (t (error "something wrong")))))
+    (if (not (string= buf ""))
+      (setq dst (concat dst
+                 (std11-wrap-as-quoted-pairs
+                   (decode-mime-charset-string buf code-conversion)
+                   delimiters))))
+    dst))
+
+
 ;;; @ for string
 ;;;
 
-(defun eword-decode-string (string &optional must-unfold)
+(defun eword-decode-unstructured (string code-conversion &optional must-unfold)
+  (eword-decode-entire-string
+    string
+    eword-encoded-word-in-unstructured-regexp
+    eword-after-encoded-word-in-unstructured-regexp
+    "[^ \t\n=]*"
+    nil
+    nil
+    must-unfold
+    code-conversion))
+
+(defun eword-decode-comment (string code-conversion &optional must-unfold)
+  (eword-decode-entire-string
+    string
+    eword-encoded-word-in-comment-regexp
+    eword-after-encoded-word-in-comment-regexp
+    "[^ \t\n()\\\\=]*"
+    ?\\
+    '(?\( ?\))
+    must-unfold
+    code-conversion))
+
+(defun eword-decode-quoted-string (string code-conversion &optional must-unfold)
+  (eword-decode-entire-string
+    string
+    eword-encoded-word-in-quoted-string-regexp
+    eword-after-encoded-word-in-quoted-string-regexp
+    "[^ \t\n\"\\\\=]*"
+    ?\\
+    '(?\")
+    must-unfold
+    code-conversion))
+
+(defun eword-decode-string (string &optional must-unfold code-conversion)
   "Decode MIME encoded-words in STRING.
 
 STRING is unfolded before decoding.
@@ -103,45 +321,35 @@ decode the charset included in it, it is not decoded.
 
 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 if there are in decoded encoded-words (generated by bad manner MUA
-such as a version of Net$cape)."
-  (setq string (std11-unfold-string string))
-  (let ((dest "")(ew nil)
-       beg end)
-    (while (and (string-match eword-encoded-word-regexp string)
-               (setq beg (match-beginning 0)
-                     end (match-end 0))
-               )
-      (if (> beg 0)
-         (if (not
-              (and (eq ew t)
-                   (string-match "^[ \t]+$" (substring string 0 beg))
-                   ))
-             (setq dest (concat dest (substring string 0 beg)))
-           )
-       )
-      (setq dest
-           (concat dest
-                   (eword-decode-encoded-word
-                    (substring string beg end) must-unfold)
-                   ))
-      (setq string (substring string end))
-      (setq ew t)
-      )
-    (concat dest string)
-    ))
+such as a version of Net$cape).
+
+If CODE-CONVERSION is nil, it decodes only encoded-words.  If it is
+mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
+Otherwise it decodes non-ASCII bit patterns as the
+default-mime-charset."
+  (eword-decode-unstructured
+    (std11-unfold-string string)
+    code-conversion
+    must-unfold))
 
 
 ;;; @ for region
 ;;;
 
-(defun eword-decode-region (start end &optional unfolding must-unfold)
+(defun eword-decode-region (start end &optional unfolding must-unfold
+                                               code-conversion)
   "Decode MIME encoded-words in region between START and END.
 
 If UNFOLDING is not nil, it unfolds before decoding.
 
 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 if there are in decoded encoded-words (generated by bad manner MUA
-such as a version of Net$cape)."
+such as a version of Net$cape).
+
+If CODE-CONVERSION is nil, it decodes only encoded-words.  If it is
+mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
+Otherwise it decodes non-ASCII bit patterns as the
+default-mime-charset."
   (interactive "*r")
   (save-excursion
     (save-restriction
@@ -149,22 +357,12 @@ such as a version of Net$cape)."
       (if unfolding
          (eword-decode-unfold)
        )
-      (goto-char (point-min))
-      (while (re-search-forward (concat "\\(" eword-encoded-word-regexp "\\)"
-                                        "\\(\n?[ \t]\\)+"
-                                        "\\(" eword-encoded-word-regexp "\\)")
-                                nil t)
-       (replace-match "\\1\\6")
-        (goto-char (point-min))
-       )
-      (while (re-search-forward eword-encoded-word-regexp nil t)
-       (insert (eword-decode-encoded-word
-                (prog1
-                    (buffer-substring (match-beginning 0) (match-end 0))
-                  (delete-region (match-beginning 0) (match-end 0))
-                  ) must-unfold))
-       )
-      )))
+      (let ((str (eword-decode-unstructured
+                  (buffer-substring (point-min) (point-max))
+                  code-conversion
+                  must-unfold)))
+       (delete-region (point-min) (point-max))
+       (insert str)))))
 
 
 ;;; @ for message header
@@ -195,47 +393,43 @@ Otherwise it decodes non-ASCII bit patterns as the
 default-mime-charset.
 If SEPARATOR is not nil, it is used as header separator."
   (interactive "*")
+  (if (and code-conversion
+          (not (mime-charset-to-coding-system code-conversion)))
+      (setq code-conversion default-mime-charset))
   (save-excursion
     (save-restriction
       (std11-narrow-to-header separator)
-      (let ((default-charset
-             (if code-conversion
-                 (if (mime-charset-to-coding-system code-conversion)
-                     code-conversion
-                   default-mime-charset))))
-       (if default-charset
-           (let (beg p end field-name len)
-             (goto-char (point-min))
-             (while (re-search-forward std11-field-head-regexp nil t)
-               (setq beg (match-beginning 0)
-                     p (match-end 0)
-                     field-name (buffer-substring beg (1- p))
-                     len (string-width field-name)
-                     field-name (intern (capitalize field-name))
-                     end (std11-field-end))
-               (cond ((memq field-name eword-decode-ignored-field-list)
-                      ;; Don't decode
-                      )
-                     ((memq field-name eword-decode-structured-field-list)
-                      ;; Decode as structured field
-                      (let ((body (buffer-substring p end))
-                            (default-mime-charset default-charset))
-                        (delete-region p end)
-                        (insert (eword-decode-and-fold-structured-field
-                                 body (1+ len)))
-                        ))
-                     (t
-                      ;; Decode as unstructured field
-                      (save-restriction
-                        (narrow-to-region beg (1+ end))
-                        (decode-mime-charset-region p end default-charset)
-                        (goto-char p)
-                        (if (re-search-forward eword-encoded-word-regexp
-                                               nil t)
-                            (eword-decode-region beg (point-max) 'unfold))
-                        )))))
-         (eword-decode-region (point-min) (point-max) t)
-         )))))
+      (if code-conversion
+         (let (beg p end field-name len)
+           (goto-char (point-min))
+           (while (re-search-forward std11-field-head-regexp nil t)
+             (setq beg (match-beginning 0)
+                   p (match-end 0)
+                   field-name (buffer-substring beg (1- p))
+                   len (string-width field-name)
+                   field-name (intern (capitalize field-name))
+                   end (std11-field-end))
+             (cond ((memq field-name eword-decode-ignored-field-list)
+                    ;; Don't decode
+                    )
+                   ((memq field-name eword-decode-structured-field-list)
+                    ;; Decode as structured field
+                    (let ((body (buffer-substring p end)))
+                      (delete-region p end)
+                      (insert (eword-decode-and-fold-structured-field
+                               body (1+ len)))
+                      ))
+                   (t
+                    ;; Decode as unstructured field
+                    (save-restriction
+                      (narrow-to-region beg (1+ end))
+                      (goto-char p)
+                      (eword-decode-region beg (point-max) 'unfold nil
+                        code-conversion)
+                      (goto-char (point-max))
+                      )))))
+       (eword-decode-region (point-min) (point-max) t nil nil)
+       ))))
 
 (defun eword-decode-unfold ()
   (goto-char (point-min))
@@ -333,21 +527,18 @@ as a version of Net$cape)."
                  (error "Invalid encoding %s" encoding)
                  )))
               )
-         (if dest
-             (progn
-               (setq dest (decode-coding-string dest cs))
-               (if must-unfold
-                   (mapconcat (function
-                               (lambda (chr)
-                                 (cond
-                                   ((eq chr ?\n) "")
-                                   ((eq chr ?\t) " ")
-                                   (t (char-to-string chr)))
-                                 ))
-                              (std11-unfold-string dest)
-                              "")
-                 dest)
-               ))))))
+         (when dest
+           (setq dest (decode-mime-charset-string dest charset))
+           (if must-unfold
+               (mapconcat (function
+                           (lambda (chr)
+                             (cond ((eq chr ?\n) "")
+                                   ((eq chr ?\t) " ")
+                                   (t (char-to-string chr)))
+                             ))
+                          (std11-unfold-string dest)
+                          "")
+             dest))))))
 
 
 ;;; @ lexical analyze
@@ -385,26 +576,32 @@ be the result."
   (let ((p (std11-check-enclosure string ?\" ?\")))
     (if p
        (cons (cons 'quoted-string
-                   (decode-mime-charset-string
-                    (std11-strip-quoted-pair (substring string 1 (1- p)))
-                    default-mime-charset))
-             (substring string p))
-      )))
+                   (if eword-decode-quoted-encoded-word
+                       (eword-decode-quoted-string
+                        (substring string 0 p)
+                        default-mime-charset)
+                     (decode-mime-charset-string
+                      (std11-strip-quoted-pair (substring string 0 p))
+                      default-mime-charset)))
+             (substring string p)))
+      ))
 
 (defun eword-analyze-domain-literal (string &optional must-unfold)
   (std11-analyze-domain-literal string))
 
 (defun eword-analyze-comment (string &optional must-unfold)
-  (let ((p (std11-check-enclosure string ?\( ?\) t)))
-    (if p
-       (cons (cons 'comment
-                   (eword-decode-string
-                    (decode-mime-charset-string
-                     (std11-strip-quoted-pair (substring string 1 (1- p)))
-                     default-mime-charset)
-                    must-unfold))
-             (substring string p))
-      )))
+  (let ((len (length string)))
+    (if (and (< 0 len) (eq (aref string 0) ?\())
+       (let ((p 0))
+         (while (and p (< p len) (eq (aref string p) ?\())
+           (setq p (std11-check-enclosure string ?\( ?\) t p)))
+         (setq p (or p len))
+         (cons (cons 'comment
+                     (eword-decode-comment
+                       (std11-unfold-string (substring string 0 p))
+                       default-mime-charset))
+               (substring string p)))
+      nil)))
 
 (defun eword-analyze-spaces (string &optional must-unfold)
   (std11-analyze-spaces string))
@@ -413,30 +610,23 @@ be the result."
   (std11-analyze-special string))
 
 (defun eword-analyze-encoded-word (string &optional must-unfold)
-  (if (eq (string-match eword-encoded-word-regexp string) 0)
-      (let ((end (match-end 0))
-           (dest (eword-decode-encoded-word (match-string 0 string)
-                                            must-unfold))
-           )
-       (setq string (substring string end))
-       (while (eq (string-match `,(concat "[ \t\n]*\\("
-                                          eword-encoded-word-regexp
-                                          "\\)")
-                                string)
-                  0)
-         (setq end (match-end 0))
-         (setq dest
-               (concat dest
-                       (eword-decode-encoded-word (match-string 1 string)
-                                                  must-unfold))
-               string (substring string end))
-         )
-       (cons (cons 'atom dest) string)
-       )))
+  (let ((decoded (eword-decode-first-encoded-words
+                  string
+                  eword-encoded-word-in-phrase-regexp
+                  eword-after-encoded-word-in-phrase-regexp
+                  must-unfold)))
+    (if decoded
+      (cons (cons 'atom (car decoded)) (cdr decoded)))))
 
 (defun eword-analyze-atom (string &optional must-unfold)
-  (if (string-match std11-atom-regexp string)
+  (if (let ((enable-multibyte-characters nil))
+        (string-match std11-atom-regexp string))
       (let ((end (match-end 0)))
+       (if (and eword-decode-sticked-encoded-word
+                (string-match eword-encoded-word-in-phrase-regexp
+                              (substring string 0 end))
+                (< 0 (match-beginning 0)))
+           (setq end (match-beginning 0)))
        (cons (cons 'atom (decode-mime-charset-string
                           (substring string 0 end)
                           default-mime-charset))
@@ -467,12 +657,13 @@ It is like std11-lexical-analyze, but it decodes non us-ascii
 characters encoded as encoded-words or invalid \"raw\" format.
 \"Raw\" non us-ascii characters are regarded as variable
 `default-mime-charset'."
-  (let ((key (copy-sequence string))
-       ret)
-    (set-text-properties 0 (length key) nil key)
+  (let* ((str (copy-sequence string))
+        (key (cons str (cons default-mime-charset must-unfold)))
+        ret)
+    (set-text-properties 0 (length str) nil str)
     (if (setq ret (assoc key eword-lexical-analyze-cache))
        (cdr ret)
-      (setq ret (eword-lexical-analyze-internal key must-unfold))
+      (setq ret (eword-lexical-analyze-internal str must-unfold))
       (setq eword-lexical-analyze-cache
            (cons (cons key ret)
                  (last eword-lexical-analyze-cache
@@ -480,13 +671,7 @@ characters encoded as encoded-words or invalid \"raw\" format.
       ret)))
 
 (defun eword-decode-token (token)
-  (let ((type (car token))
-       (value (cdr token)))
-    (cond ((eq type 'quoted-string)
-          (std11-wrap-as-quoted-string value))
-         ((eq type 'comment)
-          (concat "(" (std11-wrap-as-quoted-pairs value '(?( ?))) ")"))
-         (t value))))
+  (cdr token))
 
 (defun eword-decode-and-fold-structured-field
   (string start-column &optional max-column must-unfold)
@@ -592,9 +777,7 @@ decode the charset included in it, it is not decoded.
 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 if there are in decoded encoded-words (generated by bad manner MUA
 such as a version of Net$cape)."
-  (eword-decode-string
-   (decode-mime-charset-string string default-mime-charset)
-   must-unfold))
+  (eword-decode-string string must-unfold default-mime-charset))
 
 (defun eword-extract-address-components (string)
   "Extract full name and canonical address from STRING.