eword-decode.el

   1 ;;; eword-decode.el --- RFC 2047 based encoded-word decoder for GNU Emacs
   2
   3 ;; Copyright (C) 1995,1996,1997 Free Software Foundation, Inc.
   4
   5 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
   6 ;;         MORIOKA Tomohiko <morioka@jaist.ac.jp>
   7 ;; Maintainer: MORIOKA Tomohiko <morioka@jaist.ac.jp>
   8 ;; Created: 1995/10/03
   9 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
  10 ;;      Renamed: 1993/06/03 to tiny-mime.el
  11 ;;      Renamed: 1995/10/03 from tiny-mime.el (split off encoder)
  12 ;;      Renamed: 1997/02/22 from tm-ew-d.el
  13 ;; Version: $Revision: 1.4 $
  14 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
  15
  16 ;; This file is part of SEMI (SEMI is Emacs MIME Interfaces).
  17
  18 ;; This program is free software; you can redistribute it and/or
  19 ;; modify it under the terms of the GNU General Public License as
  20 ;; published by the Free Software Foundation; either version 2, or (at
  21 ;; your option) any later version.
  22
  23 ;; This program is distributed in the hope that it will be useful, but
  24 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
  25 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26 ;; General Public License for more details.
  27
  28 ;; You should have received a copy of the GNU General Public License
  29 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  30 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  31 ;; Boston, MA 02111-1307, USA.
  32
  33 ;;; Code:
  34
  35 (require 'std11-parse)
  36 (require 'mel)
  37 (require 'mime-def)
  38
  39 (defgroup eword-decode nil
  40   "Encoded-word decoding"
  41   :group 'mime)
  42
  43
  44 ;;; @ version
  45 ;;;
  46
  47 (defconst eword-decode-RCS-ID
  48   "$Id: eword-decode.el,v 1.4 1998-02-16 21:25:30 morioka Exp $")
  49 (defconst eword-decode-version (get-version-string eword-decode-RCS-ID))
  50
  51
  52 ;;; @ MIME encoded-word definition
  53 ;;;
  54
  55 (defconst eword-encoded-text-regexp "[!->@-~]+")
  56 (defconst eword-encoded-word-regexp
  57   (concat (regexp-quote "=?")
  58           "\\("
  59           mime-charset-regexp
  60           "\\)"
  61           (regexp-quote "?")
  62           "\\(B\\|Q\\)"
  63           (regexp-quote "?")
  64           "\\("
  65           eword-encoded-text-regexp
  66           "\\)"
  67           (regexp-quote "?=")))
  68
  69
  70 ;;; @@ Base64
  71 ;;;
  72
  73 (defconst base64-token-regexp "[A-Za-z0-9+/]")
  74 (defconst base64-token-padding-regexp "[A-Za-z0-9+/=]")
  75
  76 (defconst eword-B-encoded-text-regexp
  77   (concat "\\(\\("
  78           base64-token-regexp
  79           base64-token-regexp
  80           base64-token-regexp
  81           base64-token-regexp
  82           "\\)*"
  83           base64-token-regexp
  84           base64-token-regexp
  85           base64-token-padding-regexp
  86           base64-token-padding-regexp
  87           "\\)"))
  88
  89 ;; (defconst eword-B-encoding-and-encoded-text-regexp
  90 ;;   (concat "\\(B\\)\\?" eword-B-encoded-text-regexp))
  91
  92
  93 ;;; @@ Quoted-Printable
  94 ;;;
  95
  96 (defconst quoted-printable-hex-chars "0123456789ABCDEF")
  97 (defconst quoted-printable-octet-regexp
  98   (concat "=[" quoted-printable-hex-chars
  99           "][" quoted-printable-hex-chars "]"))
 100
 101 (defconst eword-Q-encoded-text-regexp
 102   (concat "\\([^=?]\\|" quoted-printable-octet-regexp "\\)+"))
 103 ;; (defconst eword-Q-encoding-and-encoded-text-regexp
 104 ;;   (concat "\\(Q\\)\\?" eword-Q-encoded-text-regexp))
 105
 106
 107 ;;; @ for string
 108 ;;;
 109
 110 (defun eword-decode-string (string &optional must-unfold)
 111   "Decode MIME encoded-words in STRING.
 112
 113 STRING is unfolded before decoding.
 114
 115 If an encoded-word is broken or your emacs implementation can not
 116 decode the charset included in it, it is not decoded.
 117
 118 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 119 if there are in decoded encoded-words (generated by bad manner MUA
 120 such as a version of Net$cape)."
 121   (setq string (std11-unfold-string string))
 122   (let ((dest "")(ew nil)
 123         beg end)
 124     (while (and (string-match eword-encoded-word-regexp string)
 125                 (setq beg (match-beginning 0)
 126                       end (match-end 0))
 127                 )
 128       (if (> beg 0)
 129           (if (not
 130                (and (eq ew t)
 131                     (string-match "^[ \t]+$" (substring string 0 beg))
 132                     ))
 133               (setq dest (concat dest (substring string 0 beg)))
 134             )
 135         )
 136       (setq dest
 137             (concat dest
 138                     (eword-decode-encoded-word
 139                      (substring string beg end) must-unfold)
 140                     ))
 141       (setq string (substring string end))
 142       (setq ew t)
 143       )
 144     (concat dest string)
 145     ))
 146
 147
 148 ;;; @ for region
 149 ;;;
 150
 151 (defun eword-decode-region (start end &optional unfolding must-unfold)
 152   "Decode MIME encoded-words in region between START and END.
 153
 154 If UNFOLDING is not nil, it unfolds before decoding.
 155
 156 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 157 if there are in decoded encoded-words (generated by bad manner MUA
 158 such as a version of Net$cape)."
 159   (interactive "*r")
 160   (save-excursion
 161     (save-restriction
 162       (narrow-to-region start end)
 163       (if unfolding
 164           (eword-decode-unfold)
 165         )
 166       (goto-char (point-min))
 167       (while (re-search-forward (concat "\\(" eword-encoded-word-regexp "\\)"
 168                                         "\\(\n?[ \t]\\)+"
 169                                         "\\(" eword-encoded-word-regexp "\\)")
 170                                 nil t)
 171         (replace-match "\\1\\6")
 172         (goto-char (point-min))
 173         )
 174       (while (re-search-forward eword-encoded-word-regexp nil t)
 175         (insert (eword-decode-encoded-word
 176                  (prog1
 177                      (buffer-substring (match-beginning 0) (match-end 0))
 178                    (delete-region (match-beginning 0) (match-end 0))
 179                    ) must-unfold))
 180         )
 181       )))
 182
 183
 184 ;;; @ for message header
 185 ;;;
 186
 187 (defcustom eword-decode-ignored-field-list
 188   '(newsgroups path lines nntp-posting-host message-id date)
 189   "*List of field-names to be ignored when decoding.
 190 Each field name must be symbol."
 191   :group 'eword-decode
 192   :type '(repeat symbol))
 193
 194 (defcustom eword-decode-structured-field-list
 195   '(reply-to resent-reply-to from resent-from sender resent-sender
 196              to resent-to cc resent-cc bcc resent-bcc dcc
 197              mime-version content-type content-transfer-encoding
 198              content-disposition)
 199   "*List of field-names to decode as structured field.
 200 Each field name must be symbol."
 201   :group 'eword-decode
 202   :type '(repeat symbol))
 203
 204 (defun eword-decode-header (&optional code-conversion separator)
 205   "Decode MIME encoded-words in header fields.
 206 If CODE-CONVERSION is nil, it decodes only encoded-words.  If it is
 207 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
 208 Otherwise it decodes non-ASCII bit patterns as the
 209 default-mime-charset.
 210 If SEPARATOR is not nil, it is used as header separator."
 211   (interactive "*")
 212   (save-excursion
 213     (save-restriction
 214       (std11-narrow-to-header separator)
 215       (let ((default-charset
 216               (if code-conversion
 217                   (if (mime-charset-to-coding-system code-conversion)
 218                       code-conversion
 219                     default-mime-charset))))
 220         (if default-charset
 221             (let (beg end field-name)
 222               (goto-char (point-min))
 223               (while (re-search-forward std11-field-head-regexp nil t)
 224                 (setq beg (match-beginning 0)
 225                       p (match-end 0)
 226                       field-name (intern
 227                                   (downcase (buffer-substring beg (1- p))))
 228                       end (std11-field-end))
 229                 (cond ((memq field-name eword-decode-ignored-field-list)
 230                        ;; Don't decode
 231                        )
 232                       ((memq field-name eword-decode-structured-field-list)
 233                        ;; Decode as structured field
 234                        (let ((body (buffer-substring p end))
 235                              (default-mime-charset default-charset))
 236                          (delete-region p end)
 237                          (insert (eword-decode-structured-field-body body))
 238                          ))
 239                       (t
 240                        ;; Decode as unstructured field
 241                        (save-restriction
 242                          (narrow-to-region beg (1+ end))
 243                          (decode-mime-charset-region p end default-charset)
 244                          (goto-char p)
 245                          (if (re-search-forward eword-encoded-word-regexp
 246                                                 nil t)
 247                              (eword-decode-region beg (point-max) 'unfold))
 248                          )))))
 249           (eword-decode-region (point-min) (point-max) t)
 250           )))))
 251
 252 (defun eword-decode-unfold ()
 253   (goto-char (point-min))
 254   (let (field beg end)
 255     (while (re-search-forward std11-field-head-regexp nil t)
 256       (setq beg (match-beginning 0)
 257             end (std11-field-end))
 258       (setq field (buffer-substring beg end))
 259       (if (string-match eword-encoded-word-regexp field)
 260           (save-restriction
 261             (narrow-to-region (goto-char beg) end)
 262             (while (re-search-forward "\n\\([ \t]\\)" nil t)
 263               (replace-match (match-string 1))
 264               )
 265             (goto-char (point-max))
 266             ))
 267       )))
 268
 269
 270 ;;; @ encoded-word decoder
 271 ;;;
 272
 273 (defvar eword-warning-face nil "Face used for invalid encoded-word.")
 274
 275 (defun eword-decode-encoded-word (word &optional must-unfold)
 276   "Decode WORD if it is an encoded-word.
 277
 278 If your emacs implementation can not decode the charset of WORD, it
 279 returns WORD.  Similarly the encoded-word is broken, it returns WORD.
 280
 281 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 282 if there are in decoded encoded-word (generated by bad manner MUA such
 283 as a version of Net$cape)."
 284   (or (if (string-match eword-encoded-word-regexp word)
 285           (let ((charset
 286                  (substring word (match-beginning 1) (match-end 1))
 287                  )
 288                 (encoding
 289                  (upcase
 290                   (substring word (match-beginning 2) (match-end 2))
 291                   ))
 292                 (text
 293                  (substring word (match-beginning 3) (match-end 3))
 294                  ))
 295             (condition-case err
 296                 (eword-decode-encoded-text charset encoding text must-unfold)
 297               (error
 298                (and
 299                 (add-text-properties 0 (length word)
 300                                      (and eword-warning-face
 301                                           (list 'face eword-warning-face))
 302                                      word)
 303                 word)))
 304             ))
 305       word))
 306
 307
 308 ;;; @ encoded-text decoder
 309 ;;;
 310
 311 (defun eword-decode-encoded-text (charset encoding string
 312                                           &optional must-unfold)
 313   "Decode STRING as an encoded-text.
 314
 315 If your emacs implementation can not decode CHARSET, it returns nil.
 316
 317 If ENCODING is not \"B\" or \"Q\", it occurs error.
 318 So you should write error-handling code if you don't want break by errors.
 319
 320 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 321 if there are in decoded encoded-text (generated by bad manner MUA such
 322 as a version of Net$cape)."
 323   (let ((cs (mime-charset-to-coding-system charset)))
 324     (if cs
 325         (let ((dest
 326                (cond
 327                 ((string-equal "B" encoding)
 328                  (if (and (string-match eword-B-encoded-text-regexp string)
 329                           (string-equal string (match-string 0 string)))
 330                      (base64-decode-string string)
 331                    (error "Invalid encoded-text %s" string)))
 332                 ((string-equal "Q" encoding)
 333                  (if (and (string-match eword-Q-encoded-text-regexp string)
 334                           (string-equal string (match-string 0 string)))
 335                      (q-encoding-decode-string string)
 336                    (error "Invalid encoded-text %s" string)))
 337                 (t
 338                  (error "Invalid encoding %s" encoding)
 339                  )))
 340               )
 341           (if dest
 342               (progn
 343                 (setq dest (decode-coding-string dest cs))
 344                 (if must-unfold
 345                     (mapconcat (function
 346                                 (lambda (chr)
 347                                   (cond
 348                                    ((eq chr ?\n) "")
 349                                    ((eq chr ?\t) " ")
 350                                    (t (char-to-string chr)))
 351                                   ))
 352                                (std11-unfold-string dest)
 353                                "")
 354                   dest)
 355                 ))))))
 356
 357
 358 ;;; @ lexical analyze
 359 ;;;
 360
 361 (defvar eword-lexical-analyze-cache nil)
 362 (defvar eword-lexical-analyze-cache-max 299
 363   "*Max position of eword-lexical-analyze-cache.
 364 It is max size of eword-lexical-analyze-cache - 1.")
 365
 366 (defun eword-analyze-quoted-string (string)
 367   (let ((p (std11-check-enclosure string ?\" ?\")))
 368     (if p
 369         (cons (cons 'quoted-string
 370                     (decode-mime-charset-string
 371                      (std11-strip-quoted-pair (substring string 1 (1- p)))
 372                      default-mime-charset))
 373               (substring string p))
 374       )))
 375
 376 (defun eword-analyze-comment (string &optional must-unfold)
 377   (let ((p (std11-check-enclosure string ?\( ?\) t)))
 378     (if p
 379         (cons (cons 'comment
 380                     (eword-decode-string
 381                      (decode-mime-charset-string
 382                       (std11-strip-quoted-pair (substring string 1 (1- p)))
 383                       default-mime-charset)
 384                      must-unfold))
 385               (substring string p))
 386       )))
 387
 388 (defun eword-analyze-encoded-word (string &optional must-unfold)
 389   (if (eq (string-match eword-encoded-word-regexp string) 0)
 390       (let ((end (match-end 0))
 391             (dest (eword-decode-encoded-word (match-string 0 string)
 392                                              must-unfold))
 393             )
 394         (setq string (substring string end))
 395         (while (eq (string-match `,(concat "[ \t\n]*\\("
 396                                            eword-encoded-word-regexp
 397                                            "\\)")
 398                                  string)
 399                    0)
 400           (setq end (match-end 0))
 401           (setq dest
 402                 (concat dest
 403                         (eword-decode-encoded-word (match-string 1 string)
 404                                                    must-unfold))
 405                 string (substring string end))
 406           )
 407         (cons (cons 'atom dest) string)
 408         )))
 409
 410 (defun eword-analyze-atom (string)
 411   (if (string-match std11-atom-regexp string)
 412       (let ((end (match-end 0)))
 413         (cons (cons 'atom (decode-mime-charset-string
 414                            (substring string 0 end)
 415                            default-mime-charset))
 416               (substring string end)
 417               ))))
 418
 419 (defun eword-lexical-analyze-internal (string must-unfold)
 420   (let (dest ret)
 421     (while (not (string-equal string ""))
 422       (setq ret
 423             (or (eword-analyze-quoted-string string)
 424                 (std11-analyze-domain-literal string)
 425                 (eword-analyze-comment string must-unfold)
 426                 (std11-analyze-spaces string)
 427                 (std11-analyze-special string)
 428                 (eword-analyze-encoded-word string must-unfold)
 429                 (eword-analyze-atom string)
 430                 '((error) . "")
 431                 ))
 432       (setq dest (cons (car ret) dest))
 433       (setq string (cdr ret))
 434       )
 435     (nreverse dest)
 436     ))
 437
 438 (defun eword-lexical-analyze (string &optional must-unfold)
 439   "Return lexical analyzed list corresponding STRING.
 440 It is like std11-lexical-analyze, but it decodes non us-ascii
 441 characters encoded as encoded-words or invalid \"raw\" format.
 442 \"Raw\" non us-ascii characters are regarded as variable
 443 `default-mime-charset'."
 444   (let ((key (copy-sequence string))
 445         ret)
 446     (set-text-properties 0 (length key) nil key)
 447     (if (setq ret (assoc key eword-lexical-analyze-cache))
 448         (cdr ret)
 449       (setq ret (eword-lexical-analyze-internal key must-unfold))
 450       (setq eword-lexical-analyze-cache
 451             (cons (cons key ret)
 452                   (last eword-lexical-analyze-cache
 453                         eword-lexical-analyze-cache-max)))
 454       ret)))
 455
 456 (defun eword-decode-structured-field-body (string &optional must-unfold)
 457   "Decode non us-ascii characters in STRING as structured field body.
 458 STRING is unfolded before decoding.
 459
 460 It decodes non us-ascii characters in FULL-NAME encoded as
 461 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 462 characters are regarded as variable `default-mime-charset'.
 463
 464 If an encoded-word is broken or your emacs implementation can not
 465 decode the charset included in it, it is not decoded.
 466
 467 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 468 if there are in decoded encoded-words (generated by bad manner MUA
 469 such as a version of Net$cape)."
 470   (mapconcat (function
 471               (lambda (token)
 472                 (let ((type (car token))
 473                       (value (cdr token)))
 474                   (cond ((eq type 'quoted-string)
 475                          (std11-wrap-as-quoted-string value)
 476                          )
 477                         ((eq type 'comment)
 478                          (concat "("
 479                                  (std11-wrap-as-quoted-pairs value '(?( ?)))
 480                                  ")")
 481                          )
 482                         (t
 483                          value)))))
 484              (eword-lexical-analyze string must-unfold)
 485              ""))
 486
 487 (defun eword-decode-unstructured-field-body (string &optional must-unfold)
 488   "Decode non us-ascii characters in STRING as unstructured field body.
 489 STRING is unfolded before decoding.
 490
 491 It decodes non us-ascii characters in FULL-NAME encoded as
 492 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 493 characters are regarded as variable `default-mime-charset'.
 494
 495 If an encoded-word is broken or your emacs implementation can not
 496 decode the charset included in it, it is not decoded.
 497
 498 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 499 if there are in decoded encoded-words (generated by bad manner MUA
 500 such as a version of Net$cape)."
 501   (eword-decode-string
 502    (decode-mime-charset-string string default-mime-charset)
 503    must-unfold))
 504
 505 (defun eword-extract-address-components (string)
 506   "Extract full name and canonical address from STRING.
 507 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
 508 If no name can be extracted, FULL-NAME will be nil.
 509 It decodes non us-ascii characters in FULL-NAME encoded as
 510 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 511 characters are regarded as variable `default-mime-charset'."
 512   (let* ((structure (car (std11-parse-address
 513                           (eword-lexical-analyze
 514                            (std11-unfold-string string) 'must-unfold))))
 515          (phrase  (std11-full-name-string structure))
 516          (address (std11-address-string structure))
 517          )
 518     (list phrase address)
 519     ))
 520
 521
 522 ;;; @ end
 523 ;;;
 524
 525 (provide 'eword-decode)
 526
 527 ;;; eword-decode.el ends here