eword-decode.el

   1 ;;; eword-decode.el --- RFC 2047 based encoded-word decoder for GNU Emacs
   2
   3 ;; Copyright (C) 1995,1996,1997 Free Software Foundation, Inc.
   4
   5 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
   6 ;;         MORIOKA Tomohiko <morioka@jaist.ac.jp>
   7 ;; Maintainer: MORIOKA Tomohiko <morioka@jaist.ac.jp>
   8 ;; Created: 1995/10/03
   9 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
  10 ;;      Renamed: 1993/06/03 to tiny-mime.el
  11 ;;      Renamed: 1995/10/03 from tiny-mime.el (split off encoder)
  12 ;;      Renamed: 1997/02/22 from tm-ew-d.el
  13 ;; Version: $Revision: 0.20 $
  14 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
  15
  16 ;; This file is part of SEMI (SEMI is Emacs MIME Interfaces).
  17
  18 ;; This program is free software; you can redistribute it and/or
  19 ;; modify it under the terms of the GNU General Public License as
  20 ;; published by the Free Software Foundation; either version 2, or (at
  21 ;; your option) any later version.
  22
  23 ;; This program is distributed in the hope that it will be useful, but
  24 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
  25 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26 ;; General Public License for more details.
  27
  28 ;; You should have received a copy of the GNU General Public License
  29 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  30 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  31 ;; Boston, MA 02111-1307, USA.
  32
  33 ;;; Code:
  34
  35 (require 'std11-parse)
  36 (require 'mel)
  37 (require 'mime-def)
  38
  39
  40 ;;; @ version
  41 ;;;
  42
  43 (defconst eword-decode-RCS-ID
  44   "$Id: eword-decode.el,v 0.20 1998-01-11 03:07:16 morioka Exp $")
  45 (defconst eword-decode-version (get-version-string eword-decode-RCS-ID))
  46
  47
  48 ;;; @ MIME encoded-word definition
  49 ;;;
  50
  51 (defconst eword-encoded-text-regexp "[!->@-~]+")
  52 (defconst eword-encoded-word-regexp
  53   (concat (regexp-quote "=?")
  54           "\\("
  55           mime-charset-regexp
  56           "\\)"
  57           (regexp-quote "?")
  58           "\\(B\\|Q\\)"
  59           (regexp-quote "?")
  60           "\\("
  61           eword-encoded-text-regexp
  62           "\\)"
  63           (regexp-quote "?=")))
  64
  65
  66 ;;; @@ Base64
  67 ;;;
  68
  69 (defconst base64-token-regexp "[A-Za-z0-9+/]")
  70 (defconst base64-token-padding-regexp "[A-Za-z0-9+/=]")
  71
  72 (defconst eword-B-encoded-text-regexp
  73   (concat "\\(\\("
  74           base64-token-regexp
  75           base64-token-regexp
  76           base64-token-regexp
  77           base64-token-regexp
  78           "\\)*"
  79           base64-token-regexp
  80           base64-token-regexp
  81           base64-token-padding-regexp
  82           base64-token-padding-regexp
  83           "\\)"))
  84
  85 ;; (defconst eword-B-encoding-and-encoded-text-regexp
  86 ;;   (concat "\\(B\\)\\?" eword-B-encoded-text-regexp))
  87
  88
  89 ;;; @@ Quoted-Printable
  90 ;;;
  91
  92 (defconst quoted-printable-hex-chars "0123456789ABCDEF")
  93 (defconst quoted-printable-octet-regexp
  94   (concat "=[" quoted-printable-hex-chars
  95           "][" quoted-printable-hex-chars "]"))
  96
  97 (defconst eword-Q-encoded-text-regexp
  98   (concat "\\([^=?]\\|" quoted-printable-octet-regexp "\\)+"))
  99 ;; (defconst eword-Q-encoding-and-encoded-text-regexp
 100 ;;   (concat "\\(Q\\)\\?" eword-Q-encoded-text-regexp))
 101
 102
 103 ;;; @ for string
 104 ;;;
 105
 106 (defun eword-decode-string (string &optional must-unfold)
 107   "Decode MIME encoded-words in STRING.
 108
 109 STRING is unfolded before decoding.
 110
 111 If an encoded-word is broken or your emacs implementation can not
 112 decode the charset included in it, it is not decoded.
 113
 114 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 115 if there are in decoded encoded-words (generated by bad manner MUA
 116 such as a version of Net$cape)."
 117   (setq string (std11-unfold-string string))
 118   (let ((dest "")(ew nil)
 119         beg end)
 120     (while (and (string-match eword-encoded-word-regexp string)
 121                 (setq beg (match-beginning 0)
 122                       end (match-end 0))
 123                 )
 124       (if (> beg 0)
 125           (if (not
 126                (and (eq ew t)
 127                     (string-match "^[ \t]+$" (substring string 0 beg))
 128                     ))
 129               (setq dest (concat dest (substring string 0 beg)))
 130             )
 131         )
 132       (setq dest
 133             (concat dest
 134                     (eword-decode-encoded-word
 135                      (substring string beg end) must-unfold)
 136                     ))
 137       (setq string (substring string end))
 138       (setq ew t)
 139       )
 140     (concat dest string)
 141     ))
 142
 143
 144 ;;; @ for region
 145 ;;;
 146
 147 (defun eword-decode-region (start end &optional unfolding must-unfold)
 148   "Decode MIME encoded-words in region between START and END.
 149
 150 If UNFOLDING is not nil, it unfolds before decoding.
 151
 152 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 153 if there are in decoded encoded-words (generated by bad manner MUA
 154 such as a version of Net$cape)."
 155   (interactive "*r")
 156   (save-excursion
 157     (save-restriction
 158       (narrow-to-region start end)
 159       (if unfolding
 160           (eword-decode-unfold)
 161         )
 162       (goto-char (point-min))
 163       (while (re-search-forward (concat "\\(" eword-encoded-word-regexp "\\)"
 164                                         "\\(\n?[ \t]\\)+"
 165                                         "\\(" eword-encoded-word-regexp "\\)")
 166                                 nil t)
 167         (replace-match "\\1\\6")
 168         (goto-char (point-min))
 169         )
 170       (while (re-search-forward eword-encoded-word-regexp nil t)
 171         (insert (eword-decode-encoded-word
 172                  (prog1
 173                      (buffer-substring (match-beginning 0) (match-end 0))
 174                    (delete-region (match-beginning 0) (match-end 0))
 175                    ) must-unfold))
 176         )
 177       )))
 178
 179
 180 ;;; @ for message header
 181 ;;;
 182
 183 (defun eword-decode-header (&optional separator)
 184   "Decode MIME encoded-words in header fields.
 185 If SEPARATOR is not nil, it is used as header separator."
 186   (interactive "*")
 187   (save-excursion
 188     (save-restriction
 189       (std11-narrow-to-header separator)
 190       (eword-decode-region (point-min) (point-max) t)
 191       )))
 192
 193 (defun eword-decode-unfold ()
 194   (goto-char (point-min))
 195   (let (field beg end)
 196     (while (re-search-forward std11-field-head-regexp nil t)
 197       (setq beg (match-beginning 0)
 198             end (std11-field-end))
 199       (setq field (buffer-substring beg end))
 200       (if (string-match eword-encoded-word-regexp field)
 201           (save-restriction
 202             (narrow-to-region (goto-char beg) end)
 203             (while (re-search-forward "\n\\([ \t]\\)" nil t)
 204               (replace-match (match-string 1))
 205               )
 206             (goto-char (point-max))
 207             ))
 208       )))
 209
 210
 211 ;;; @ encoded-word decoder
 212 ;;;
 213
 214 (defvar eword-warning-face nil "Face used for invalid encoded-word.")
 215
 216 (defun eword-decode-encoded-word (word &optional must-unfold)
 217   "Decode WORD if it is an encoded-word.
 218
 219 If your emacs implementation can not decode the charset of WORD, it
 220 returns WORD.  Similarly the encoded-word is broken, it returns WORD.
 221
 222 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 223 if there are in decoded encoded-word (generated by bad manner MUA such
 224 as a version of Net$cape)."
 225   (or (if (string-match eword-encoded-word-regexp word)
 226           (let ((charset
 227                  (substring word (match-beginning 1) (match-end 1))
 228                  )
 229                 (encoding
 230                  (upcase
 231                   (substring word (match-beginning 2) (match-end 2))
 232                   ))
 233                 (text
 234                  (substring word (match-beginning 3) (match-end 3))
 235                  ))
 236             (condition-case err
 237                 (eword-decode-encoded-text charset encoding text must-unfold)
 238               (error
 239                (and
 240                 (add-text-properties 0 (length word)
 241                                      (and eword-warning-face
 242                                           (list 'face eword-warning-face))
 243                                      word)
 244                 word)))
 245             ))
 246       word))
 247
 248
 249 ;;; @ encoded-text decoder
 250 ;;;
 251
 252 (defun eword-decode-encoded-text (charset encoding string
 253                                           &optional must-unfold)
 254   "Decode STRING as an encoded-text.
 255
 256 If your emacs implementation can not decode CHARSET, it returns nil.
 257
 258 If ENCODING is not \"B\" or \"Q\", it occurs error.
 259 So you should write error-handling code if you don't want break by errors.
 260
 261 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 262 if there are in decoded encoded-text (generated by bad manner MUA such
 263 as a version of Net$cape)."
 264   (let ((cs (mime-charset-to-coding-system charset)))
 265     (if cs
 266         (let ((dest
 267                (cond
 268                 ((string-equal "B" encoding)
 269                  (if (and (string-match eword-B-encoded-text-regexp string)
 270                           (string-equal string (match-string 0 string)))
 271                      (base64-decode-string string)
 272                    (error "Invalid encoded-text %s" string)))
 273                 ((string-equal "Q" encoding)
 274                  (if (and (string-match eword-Q-encoded-text-regexp string)
 275                           (string-equal string (match-string 0 string)))
 276                      (q-encoding-decode-string string)
 277                    (error "Invalid encoded-text %s" string)))
 278                 (t
 279                  (error "Invalid encoding %s" encoding)
 280                  )))
 281               )
 282           (if dest
 283               (progn
 284                 (setq dest (decode-coding-string dest cs))
 285                 (if must-unfold
 286                     (mapconcat (function
 287                                 (lambda (chr)
 288                                   (cond
 289                                    ((eq chr ?\n) "")
 290                                    ((eq chr ?\t) " ")
 291                                    (t (char-to-string chr)))
 292                                   ))
 293                                (std11-unfold-string dest)
 294                                "")
 295                   dest)
 296                 ))))))
 297
 298
 299 ;;; @ lexical analyze
 300 ;;;
 301
 302 (defvar eword-lexical-analyze-cache nil)
 303 (defvar eword-lexical-analyze-cache-max 299
 304   "*Max position of eword-lexical-analyze-cache.
 305 It is max size of eword-lexical-analyze-cache - 1.")
 306
 307 (defun eword-analyze-quoted-string (string)
 308   (let ((p (std11-check-enclosure string ?\" ?\")))
 309     (if p
 310         (cons (cons 'quoted-string
 311                     (decode-mime-charset-string
 312                      (std11-strip-quoted-pair (substring string 1 (1- p)))
 313                      default-mime-charset))
 314               (substring string p))
 315       )))
 316
 317 (defun eword-analyze-comment (string &optional must-unfold)
 318   (let ((p (std11-check-enclosure string ?\( ?\) t)))
 319     (if p
 320         (cons (cons 'comment
 321                     (eword-decode-string
 322                      (decode-mime-charset-string
 323                       (std11-strip-quoted-pair (substring string 1 (1- p)))
 324                       default-mime-charset)
 325                      must-unfold))
 326               (substring string p))
 327       )))
 328
 329 (defun eword-analyze-encoded-word (string &optional must-unfold)
 330   (if (eq (string-match eword-encoded-word-regexp string) 0)
 331       (let ((end (match-end 0))
 332             (dest (eword-decode-encoded-word (match-string 0 string)
 333                                              must-unfold))
 334             )
 335         (setq string (substring string end))
 336         (while (eq (string-match `,(concat "[ \t\n]*\\("
 337                                            eword-encoded-word-regexp
 338                                            "\\)")
 339                                  string)
 340                    0)
 341           (setq end (match-end 0))
 342           (setq dest
 343                 (concat dest
 344                         (eword-decode-encoded-word (match-string 1 string)
 345                                                    must-unfold))
 346                 string (substring string end))
 347           )
 348         (cons (cons 'atom dest) string)
 349         )))
 350
 351 (defun eword-analyze-atom (string)
 352   (if (string-match std11-atom-regexp string)
 353       (let ((end (match-end 0)))
 354         (cons (cons 'atom (decode-mime-charset-string
 355                            (substring string 0 end)
 356                            default-mime-charset))
 357               (substring string end)
 358               ))))
 359
 360 (defun eword-lexical-analyze-internal (string must-unfold)
 361   (let (dest ret)
 362     (while (not (string-equal string ""))
 363       (setq ret
 364             (or (eword-analyze-quoted-string string)
 365                 (std11-analyze-domain-literal string)
 366                 (eword-analyze-comment string must-unfold)
 367                 (std11-analyze-spaces string)
 368                 (std11-analyze-special string)
 369                 (eword-analyze-encoded-word string must-unfold)
 370                 (eword-analyze-atom string)
 371                 '((error) . "")
 372                 ))
 373       (setq dest (cons (car ret) dest))
 374       (setq string (cdr ret))
 375       )
 376     (nreverse dest)
 377     ))
 378
 379 (defun eword-lexical-analyze (string &optional must-unfold)
 380   "Return lexical analyzed list corresponding STRING.
 381 It is like std11-lexical-analyze, but it decodes non us-ascii
 382 characters encoded as encoded-words or invalid \"raw\" format.
 383 \"Raw\" non us-ascii characters are regarded as variable
 384 `default-mime-charset'."
 385   (let ((key (copy-sequence string))
 386         ret)
 387     (set-text-properties 0 (length key) nil key)
 388     (if (setq ret (assoc key eword-lexical-analyze-cache))
 389         (cdr ret)
 390       (setq ret (eword-lexical-analyze-internal key must-unfold))
 391       (setq eword-lexical-analyze-cache
 392             (cons (cons key ret)
 393                   (last eword-lexical-analyze-cache
 394                         eword-lexical-analyze-cache-max)))
 395       ret)))
 396
 397 (defun eword-decode-structured-field-body (string &optional must-unfold)
 398   "Decode non us-ascii characters in STRING as structured field body.
 399 STRING is unfolded before decoding.
 400
 401 It decodes non us-ascii characters in FULL-NAME encoded as
 402 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 403 characters are regarded as variable `default-mime-charset'.
 404
 405 If an encoded-word is broken or your emacs implementation can not
 406 decode the charset included in it, it is not decoded.
 407
 408 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 409 if there are in decoded encoded-words (generated by bad manner MUA
 410 such as a version of Net$cape)."
 411   (mapconcat (function
 412               (lambda (token)
 413                 (let ((type (car token))
 414                       (value (cdr token)))
 415                   (cond ((eq type 'quoted-string)
 416                          (std11-wrap-as-quoted-string value)
 417                          )
 418                         ((eq type 'comment)
 419                          (concat "("
 420                                  (std11-wrap-as-quoted-pairs value '(?( ?)))
 421                                  ")")
 422                          )
 423                         (t
 424                          value)))))
 425              (eword-lexical-analyze string must-unfold)
 426              ""))
 427
 428 (defun eword-decode-unstructured-field-body (string &optional must-unfold)
 429   "Decode non us-ascii characters in STRING as unstructured field body.
 430 STRING is unfolded before decoding.
 431
 432 It decodes non us-ascii characters in FULL-NAME encoded as
 433 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 434 characters are regarded as variable `default-mime-charset'.
 435
 436 If an encoded-word is broken or your emacs implementation can not
 437 decode the charset included in it, it is not decoded.
 438
 439 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 440 if there are in decoded encoded-words (generated by bad manner MUA
 441 such as a version of Net$cape)."
 442   (eword-decode-string
 443    (decode-mime-charset-string string default-mime-charset)
 444    must-unfold))
 445
 446 (defun eword-extract-address-components (string)
 447   "Extract full name and canonical address from STRING.
 448 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
 449 If no name can be extracted, FULL-NAME will be nil.
 450 It decodes non us-ascii characters in FULL-NAME encoded as
 451 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 452 characters are regarded as variable `default-mime-charset'."
 453   (let* ((structure (car (std11-parse-address
 454                           (eword-lexical-analyze
 455                            (std11-unfold-string string) 'must-unfold))))
 456          (phrase  (std11-full-name-string structure))
 457          (address (std11-address-string structure))
 458          )
 459     (list phrase address)
 460     ))
 461
 462
 463 ;;; @ end
 464 ;;;
 465
 466 (provide 'eword-decode)
 467
 468 ;;; eword-decode.el ends here