eword-decode.el

   1 ;;; eword-decode.el --- RFC 2047 based encoded-word decoder for GNU Emacs
   2
   3 ;; Copyright (C) 1995,1996,1997 Free Software Foundation, Inc.
   4
   5 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
   6 ;;         MORIOKA Tomohiko <morioka@jaist.ac.jp>
   7 ;; Maintainer: MORIOKA Tomohiko <morioka@jaist.ac.jp>
   8 ;; Created: 1995/10/03
   9 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
  10 ;;      Renamed: 1993/06/03 to tiny-mime.el
  11 ;;      Renamed: 1995/10/03 from tiny-mime.el (split off encoder)
  12 ;;      Renamed: 1997/02/22 from tm-ew-d.el
  13 ;; Version: $Revision: 0.19 $
  14 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
  15
  16 ;; This file is part of SEMI (SEMI is Emacs MIME Interfaces).
  17
  18 ;; This program is free software; you can redistribute it and/or
  19 ;; modify it under the terms of the GNU General Public License as
  20 ;; published by the Free Software Foundation; either version 2, or (at
  21 ;; your option) any later version.
  22
  23 ;; This program is distributed in the hope that it will be useful, but
  24 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
  25 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26 ;; General Public License for more details.
  27
  28 ;; You should have received a copy of the GNU General Public License
  29 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  30 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  31 ;; Boston, MA 02111-1307, USA.
  32
  33 ;;; Code:
  34
  35 (require 'std11-parse)
  36 (require 'mel)
  37 (require 'mime-def)
  38
  39
  40 ;;; @ version
  41 ;;;
  42
  43 (defconst eword-decode-RCS-ID
  44   "$Id: eword-decode.el,v 0.19 1997-08-30 14:26:13 morioka Exp $")
  45 (defconst eword-decode-version (get-version-string eword-decode-RCS-ID))
  46
  47
  48 ;;; @ MIME encoded-word definition
  49 ;;;
  50
  51 (defconst eword-encoded-text-regexp "[!->@-~]+")
  52 (defconst eword-encoded-word-regexp
  53   (concat (regexp-quote "=?")
  54           "\\("
  55           mime-charset-regexp
  56           "\\)"
  57           (regexp-quote "?")
  58           "\\(B\\|Q\\)"
  59           (regexp-quote "?")
  60           "\\("
  61           eword-encoded-text-regexp
  62           "\\)"
  63           (regexp-quote "?=")))
  64
  65
  66 ;;; @@ Base64
  67 ;;;
  68
  69 (defconst base64-token-regexp "[A-Za-z0-9+/]")
  70 (defconst base64-token-padding-regexp "[A-Za-z0-9+/=]")
  71
  72 (defconst eword-B-encoded-text-regexp
  73   (concat "\\(\\("
  74           base64-token-regexp
  75           base64-token-regexp
  76           base64-token-regexp
  77           base64-token-regexp
  78           "\\)*"
  79           base64-token-regexp
  80           base64-token-regexp
  81           base64-token-padding-regexp
  82           base64-token-padding-regexp
  83           "\\)"))
  84
  85 ;; (defconst eword-B-encoding-and-encoded-text-regexp
  86 ;;   (concat "\\(B\\)\\?" eword-B-encoded-text-regexp))
  87
  88
  89 ;;; @@ Quoted-Printable
  90 ;;;
  91
  92 (defconst quoted-printable-hex-chars "0123456789ABCDEF")
  93 (defconst quoted-printable-octet-regexp
  94   (concat "=[" quoted-printable-hex-chars
  95           "][" quoted-printable-hex-chars "]"))
  96
  97 (defconst eword-Q-encoded-text-regexp
  98   (concat "\\([^=?]\\|" quoted-printable-octet-regexp "\\)+"))
  99 ;; (defconst eword-Q-encoding-and-encoded-text-regexp
 100 ;;   (concat "\\(Q\\)\\?" eword-Q-encoded-text-regexp))
 101
 102
 103 ;;; @ for string
 104 ;;;
 105
 106 (defun eword-decode-string (string &optional must-unfold)
 107   "Decode MIME encoded-words in STRING.
 108
 109 STRING is unfolded before decoding.
 110
 111 If an encoded-word is broken or your emacs implementation can not
 112 decode the charset included in it, it is not decoded.
 113
 114 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 115 if there are in decoded encoded-words (generated by bad manner MUA
 116 such as a version of Net$cape)."
 117   (setq string (std11-unfold-string string))
 118   (let ((dest "")(ew nil)
 119         beg end)
 120     (while (and (string-match eword-encoded-word-regexp string)
 121                 (setq beg (match-beginning 0)
 122                       end (match-end 0))
 123                 )
 124       (if (> beg 0)
 125           (if (not
 126                (and (eq ew t)
 127                     (string-match "^[ \t]+$" (substring string 0 beg))
 128                     ))
 129               (setq dest (concat dest (substring string 0 beg)))
 130             )
 131         )
 132       (setq dest
 133             (concat dest
 134                     (eword-decode-encoded-word
 135                      (substring string beg end) must-unfold)
 136                     ))
 137       (setq string (substring string end))
 138       (setq ew t)
 139       )
 140     (concat dest string)
 141     ))
 142
 143
 144 ;;; @ for region
 145 ;;;
 146
 147 (defun eword-decode-region (start end &optional unfolding must-unfold)
 148   "Decode MIME encoded-words in region between START and END.
 149
 150 If UNFOLDING is not nil, it unfolds before decoding.
 151
 152 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 153 if there are in decoded encoded-words (generated by bad manner MUA
 154 such as a version of Net$cape)."
 155   (interactive "*r")
 156   (save-excursion
 157     (save-restriction
 158       (narrow-to-region start end)
 159       (if unfolding
 160           (eword-decode-unfold)
 161         )
 162       (goto-char (point-min))
 163       (while (re-search-forward (concat "\\(" eword-encoded-word-regexp "\\)"
 164                                         "\\(\n?[ \t]\\)+"
 165                                         "\\(" eword-encoded-word-regexp "\\)")
 166                                 nil t)
 167         (replace-match "\\1\\6")
 168         (goto-char (point-min))
 169         )
 170       (while (re-search-forward eword-encoded-word-regexp nil t)
 171         (insert (eword-decode-encoded-word
 172                  (prog1
 173                      (buffer-substring (match-beginning 0) (match-end 0))
 174                    (delete-region (match-beginning 0) (match-end 0))
 175                    ) must-unfold))
 176         )
 177       )))
 178
 179
 180 ;;; @ for message header
 181 ;;;
 182
 183 (defun eword-decode-header (&optional separator)
 184   "Decode MIME encoded-words in header fields.
 185 If SEPARATOR is not nil, it is used as header separator."
 186   (interactive "*")
 187   (save-excursion
 188     (save-restriction
 189       (std11-narrow-to-header separator)
 190       (eword-decode-region (point-min) (point-max) t)
 191       )))
 192
 193 (defun eword-decode-unfold ()
 194   (goto-char (point-min))
 195   (let (field beg end)
 196     (while (re-search-forward std11-field-head-regexp nil t)
 197       (setq beg (match-beginning 0)
 198             end (std11-field-end))
 199       (setq field (buffer-substring beg end))
 200       (if (string-match eword-encoded-word-regexp field)
 201           (save-restriction
 202             (narrow-to-region (goto-char beg) end)
 203             (while (re-search-forward "\n\\([ \t]\\)" nil t)
 204               (replace-match (match-string 1))
 205               )
 206             (goto-char (point-max))
 207             ))
 208       )))
 209
 210
 211 ;;; @ encoded-word decoder
 212 ;;;
 213
 214 (defvar eword-warning-face nil "Face used for invalid encoded-word.")
 215
 216 (defun eword-decode-encoded-word (word &optional must-unfold)
 217   "Decode WORD if it is an encoded-word.
 218
 219 If your emacs implementation can not decode the charset of WORD, it
 220 returns WORD.  Similarly the encoded-word is broken, it returns WORD.
 221
 222 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 223 if there are in decoded encoded-word (generated by bad manner MUA such
 224 as a version of Net$cape)."
 225   (or (if (string-match eword-encoded-word-regexp word)
 226           (let ((charset
 227                  (substring word (match-beginning 1) (match-end 1))
 228                  )
 229                 (encoding
 230                  (upcase
 231                   (substring word (match-beginning 2) (match-end 2))
 232                   ))
 233                 (text
 234                  (substring word (match-beginning 3) (match-end 3))
 235                  ))
 236             (condition-case err
 237                 (eword-decode-encoded-text charset encoding text must-unfold)
 238               (error
 239                (and
 240                 (add-text-properties 0 (length word)
 241                                      (and eword-warning-face
 242                                           (list 'face eword-warning-face))
 243                                      word)
 244                 word)))
 245             ))
 246       word))
 247
 248
 249 ;;; @ encoded-text decoder
 250 ;;;
 251
 252 (defun eword-decode-encoded-text (charset encoding string
 253                                           &optional must-unfold)
 254   "Decode STRING as an encoded-text.
 255
 256 If your emacs implementation can not decode CHARSET, it returns nil.
 257
 258 If ENCODING is not \"B\" or \"Q\", it occurs error.
 259 So you should write error-handling code if you don't want break by errors.
 260
 261 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 262 if there are in decoded encoded-text (generated by bad manner MUA such
 263 as a version of Net$cape)."
 264   (let ((cs (mime-charset-to-coding-system charset)))
 265     (if cs
 266         (let ((dest
 267                (cond
 268                 ((string-equal "B" encoding)
 269                  (if (and (string-match eword-B-encoded-text-regexp string)
 270                           (string-equal string (match-string 0 string)))
 271                      (base64-decode-string string)
 272                    (error "Invalid encoded-text %s" string)))
 273                 ((string-equal "Q" encoding)
 274                  (if (and (string-match eword-Q-encoded-text-regexp string)
 275                           (string-equal string (match-string 0 string)))
 276                      (q-encoding-decode-string string)
 277                    (error "Invalid encoded-text %s" string)))
 278                 (t
 279                  (error "Invalid encoding %s" encoding)
 280                  )))
 281               )
 282           (if dest
 283               (progn
 284                 (setq dest (decode-coding-string dest cs))
 285                 (if must-unfold
 286                     (mapconcat (function
 287                                 (lambda (chr)
 288                                   (cond
 289                                    ((eq chr ?\n) "")
 290                                    ((eq chr ?\t) " ")
 291                                    (t (char-to-string chr)))
 292                                   ))
 293                                (std11-unfold-string dest)
 294                                "")
 295                   dest)
 296                 ))))))
 297
 298
 299 ;;; @ lexical analyze
 300 ;;;
 301
 302 (defvar eword-lexical-analyze-cache nil)
 303 (defvar eword-lexical-analyze-cache-max 299
 304   "*Max position of eword-lexical-analyze-cache.
 305 It is max size of eword-lexical-analyze-cache - 1.")
 306
 307 (defun eword-analyze-quoted-string (string)
 308   (let ((p (std11-check-enclosure string ?\" ?\")))
 309     (if p
 310         (cons (cons 'quoted-string
 311                     (decode-mime-charset-string
 312                      (std11-strip-quoted-pair (substring string 1 (1- p)))
 313                      default-mime-charset))
 314               (substring string p))
 315       )))
 316
 317 (defun eword-analyze-comment (string &optional must-unfold)
 318   (let ((p (std11-check-enclosure string ?\( ?\) t)))
 319     (if p
 320         (cons (cons 'comment
 321                     (eword-decode-string
 322                      (decode-mime-charset-string
 323                       (std11-strip-quoted-pair (substring string 1 (1- p)))
 324                       default-mime-charset)
 325                      must-unfold))
 326               (substring string p))
 327       )))
 328
 329 (defun eword-analyze-encoded-word (string &optional must-unfold)
 330   (if (eq (string-match eword-encoded-word-regexp string) 0)
 331       (let ((end (match-end 0))
 332             (dest (eword-decode-encoded-word (match-string 0 string)
 333                                              must-unfold))
 334             )
 335         (setq string (substring string end))
 336         (while (eq (string-match `,(concat "[ \t\n]*\\("
 337                                            eword-encoded-word-regexp
 338                                            "\\)")
 339                                  string)
 340                    0)
 341           (setq end (match-end 0))
 342           (setq dest
 343                 (concat dest
 344                         (eword-decode-encoded-word (match-string 1 string)
 345                                                    must-unfold))
 346                 string (substring string end))
 347           )
 348         (cons (cons 'atom dest) string)
 349         )))
 350
 351 (defun eword-lexical-analyze-internal (string must-unfold)
 352   (let (dest ret)
 353     (while (not (string-equal string ""))
 354       (setq ret
 355             (or (eword-analyze-quoted-string string)
 356                 (std11-analyze-domain-literal string)
 357                 (eword-analyze-comment string must-unfold)
 358                 (std11-analyze-spaces string)
 359                 (std11-analyze-special string)
 360                 (eword-analyze-encoded-word string must-unfold)
 361                 (std11-analyze-atom string)
 362                 '((error) . "")
 363                 ))
 364       (setq dest (cons (car ret) dest))
 365       (setq string (cdr ret))
 366       )
 367     (nreverse dest)
 368     ))
 369
 370 (defun eword-lexical-analyze (string &optional must-unfold)
 371   "Return lexical analyzed list corresponding STRING.
 372 It is like std11-lexical-analyze, but it decodes non us-ascii
 373 characters encoded as encoded-words or invalid \"raw\" format.
 374 \"Raw\" non us-ascii characters are regarded as variable
 375 `default-mime-charset'."
 376   (let ((key (copy-sequence string))
 377         ret)
 378     (set-text-properties 0 (length key) nil key)
 379     (if (setq ret (assoc key eword-lexical-analyze-cache))
 380         (cdr ret)
 381       (setq ret (eword-lexical-analyze-internal key must-unfold))
 382       (setq eword-lexical-analyze-cache
 383             (cons (cons key ret)
 384                   (last eword-lexical-analyze-cache
 385                         eword-lexical-analyze-cache-max)))
 386       ret)))
 387
 388 (defun eword-decode-structured-field-body (string &optional must-unfold)
 389   "Decode non us-ascii characters in STRING as structured field body.
 390 STRING is unfolded before decoding.
 391
 392 It decodes non us-ascii characters in FULL-NAME encoded as
 393 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 394 characters are regarded as variable `default-mime-charset'.
 395
 396 If an encoded-word is broken or your emacs implementation can not
 397 decode the charset included in it, it is not decoded.
 398
 399 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 400 if there are in decoded encoded-words (generated by bad manner MUA
 401 such as a version of Net$cape)."
 402   (mapconcat (function
 403               (lambda (token)
 404                 (let ((type (car token))
 405                       (value (cdr token)))
 406                   (cond ((eq type 'quoted-string)
 407                          (std11-wrap-as-quoted-string value)
 408                          )
 409                         ((eq type 'comment)
 410                          (concat "("
 411                                  (std11-wrap-as-quoted-pairs value '(?( ?)))
 412                                  ")")
 413                          )
 414                         (t
 415                          value)))))
 416              (eword-lexical-analyze string must-unfold)
 417              ""))
 418
 419 (defun eword-decode-unstructured-field-body (string &optional must-unfold)
 420   "Decode non us-ascii characters in STRING as unstructured field body.
 421 STRING is unfolded before decoding.
 422
 423 It decodes non us-ascii characters in FULL-NAME encoded as
 424 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 425 characters are regarded as variable `default-mime-charset'.
 426
 427 If an encoded-word is broken or your emacs implementation can not
 428 decode the charset included in it, it is not decoded.
 429
 430 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 431 if there are in decoded encoded-words (generated by bad manner MUA
 432 such as a version of Net$cape)."
 433   (eword-decode-string
 434    (decode-mime-charset-string string default-mime-charset)
 435    must-unfold))
 436
 437 (defun eword-extract-address-components (string)
 438   "Extract full name and canonical address from STRING.
 439 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
 440 If no name can be extracted, FULL-NAME will be nil.
 441 It decodes non us-ascii characters in FULL-NAME encoded as
 442 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 443 characters are regarded as variable `default-mime-charset'."
 444   (let* ((structure (car (std11-parse-address
 445                           (eword-lexical-analyze
 446                            (std11-unfold-string string) 'must-unfold))))
 447          (phrase  (std11-full-name-string structure))
 448          (address (std11-address-string structure))
 449          )
 450     (list phrase address)
 451     ))
 452
 453
 454 ;;; @ end
 455 ;;;
 456
 457 (provide 'eword-decode)
 458
 459 ;;; eword-decode.el ends here