eword-decode.el

   1 ;;; eword-decode.el --- RFC 2047 based encoded-word decoder for GNU Emacs
   2
   3 ;; Copyright (C) 1995,1996,1997 Free Software Foundation, Inc.
   4
   5 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
   6 ;;         MORIOKA Tomohiko <morioka@jaist.ac.jp>
   7 ;; Maintainer: MORIOKA Tomohiko <morioka@jaist.ac.jp>
   8 ;; Created: 1995/10/03
   9 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
  10 ;;      Renamed: 1993/06/03 to tiny-mime.el
  11 ;;      Renamed: 1995/10/03 from tiny-mime.el (split off encoder)
  12 ;;      Renamed: 1997/02/22 from tm-ew-d.el
  13 ;; Version: $Revision: 1.1 $
  14 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
  15
  16 ;; This file is part of SEMI (SEMI is Emacs MIME Interfaces).
  17
  18 ;; This program is free software; you can redistribute it and/or
  19 ;; modify it under the terms of the GNU General Public License as
  20 ;; published by the Free Software Foundation; either version 2, or (at
  21 ;; your option) any later version.
  22
  23 ;; This program is distributed in the hope that it will be useful, but
  24 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
  25 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26 ;; General Public License for more details.
  27
  28 ;; You should have received a copy of the GNU General Public License
  29 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  30 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  31 ;; Boston, MA 02111-1307, USA.
  32
  33 ;;; Code:
  34
  35 (require 'std11-parse)
  36 (require 'mel)
  37 (require 'mime-def)
  38
  39
  40 ;;; @ version
  41 ;;;
  42
  43 (defconst eword-decode-RCS-ID
  44   "$Id: eword-decode.el,v 1.1 1998-02-16 18:44:37 morioka Exp $")
  45 (defconst eword-decode-version (get-version-string eword-decode-RCS-ID))
  46
  47
  48 ;;; @ MIME encoded-word definition
  49 ;;;
  50
  51 (defconst eword-encoded-text-regexp "[!->@-~]+")
  52 (defconst eword-encoded-word-regexp
  53   (concat (regexp-quote "=?")
  54           "\\("
  55           mime-charset-regexp
  56           "\\)"
  57           (regexp-quote "?")
  58           "\\(B\\|Q\\)"
  59           (regexp-quote "?")
  60           "\\("
  61           eword-encoded-text-regexp
  62           "\\)"
  63           (regexp-quote "?=")))
  64
  65
  66 ;;; @@ Base64
  67 ;;;
  68
  69 (defconst base64-token-regexp "[A-Za-z0-9+/]")
  70 (defconst base64-token-padding-regexp "[A-Za-z0-9+/=]")
  71
  72 (defconst eword-B-encoded-text-regexp
  73   (concat "\\(\\("
  74           base64-token-regexp
  75           base64-token-regexp
  76           base64-token-regexp
  77           base64-token-regexp
  78           "\\)*"
  79           base64-token-regexp
  80           base64-token-regexp
  81           base64-token-padding-regexp
  82           base64-token-padding-regexp
  83           "\\)"))
  84
  85 ;; (defconst eword-B-encoding-and-encoded-text-regexp
  86 ;;   (concat "\\(B\\)\\?" eword-B-encoded-text-regexp))
  87
  88
  89 ;;; @@ Quoted-Printable
  90 ;;;
  91
  92 (defconst quoted-printable-hex-chars "0123456789ABCDEF")
  93 (defconst quoted-printable-octet-regexp
  94   (concat "=[" quoted-printable-hex-chars
  95           "][" quoted-printable-hex-chars "]"))
  96
  97 (defconst eword-Q-encoded-text-regexp
  98   (concat "\\([^=?]\\|" quoted-printable-octet-regexp "\\)+"))
  99 ;; (defconst eword-Q-encoding-and-encoded-text-regexp
 100 ;;   (concat "\\(Q\\)\\?" eword-Q-encoded-text-regexp))
 101
 102
 103 ;;; @ for string
 104 ;;;
 105
 106 (defun eword-decode-string (string &optional must-unfold)
 107   "Decode MIME encoded-words in STRING.
 108
 109 STRING is unfolded before decoding.
 110
 111 If an encoded-word is broken or your emacs implementation can not
 112 decode the charset included in it, it is not decoded.
 113
 114 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 115 if there are in decoded encoded-words (generated by bad manner MUA
 116 such as a version of Net$cape)."
 117   (setq string (std11-unfold-string string))
 118   (let ((dest "")(ew nil)
 119         beg end)
 120     (while (and (string-match eword-encoded-word-regexp string)
 121                 (setq beg (match-beginning 0)
 122                       end (match-end 0))
 123                 )
 124       (if (> beg 0)
 125           (if (not
 126                (and (eq ew t)
 127                     (string-match "^[ \t]+$" (substring string 0 beg))
 128                     ))
 129               (setq dest (concat dest (substring string 0 beg)))
 130             )
 131         )
 132       (setq dest
 133             (concat dest
 134                     (eword-decode-encoded-word
 135                      (substring string beg end) must-unfold)
 136                     ))
 137       (setq string (substring string end))
 138       (setq ew t)
 139       )
 140     (concat dest string)
 141     ))
 142
 143
 144 ;;; @ for region
 145 ;;;
 146
 147 (defun eword-decode-region (start end &optional unfolding must-unfold)
 148   "Decode MIME encoded-words in region between START and END.
 149
 150 If UNFOLDING is not nil, it unfolds before decoding.
 151
 152 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 153 if there are in decoded encoded-words (generated by bad manner MUA
 154 such as a version of Net$cape)."
 155   (interactive "*r")
 156   (save-excursion
 157     (save-restriction
 158       (narrow-to-region start end)
 159       (if unfolding
 160           (eword-decode-unfold)
 161         )
 162       (goto-char (point-min))
 163       (while (re-search-forward (concat "\\(" eword-encoded-word-regexp "\\)"
 164                                         "\\(\n?[ \t]\\)+"
 165                                         "\\(" eword-encoded-word-regexp "\\)")
 166                                 nil t)
 167         (replace-match "\\1\\6")
 168         (goto-char (point-min))
 169         )
 170       (while (re-search-forward eword-encoded-word-regexp nil t)
 171         (insert (eword-decode-encoded-word
 172                  (prog1
 173                      (buffer-substring (match-beginning 0) (match-end 0))
 174                    (delete-region (match-beginning 0) (match-end 0))
 175                    ) must-unfold))
 176         )
 177       )))
 178
 179
 180 ;;; @ for message header
 181 ;;;
 182
 183 (defun eword-decode-header (&optional code-conversion separator)
 184   "Decode MIME encoded-words in header fields.
 185 If CODE-CONVERSION is nil, it decodes only encoded-words.  If it is
 186 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
 187 Otherwise it decodes non-ASCII bit patterns as the
 188 default-mime-charset.
 189 If SEPARATOR is not nil, it is used as header separator."
 190   (interactive "*")
 191   (save-excursion
 192     (save-restriction
 193       (std11-narrow-to-header separator)
 194       (let ((default-charset
 195               (if code-conversion
 196                   (if (mime-charset-to-coding-system code-conversion)
 197                       code-conversion
 198                     default-mime-charset))))
 199         (if default-charset
 200             (let (beg end field-name)
 201               (goto-char (point-min))
 202               (while (re-search-forward std11-field-head-regexp nil t)
 203                 (setq beg (match-beginning 0)
 204                       p (match-end 0)
 205                       field-name (intern
 206                                   (downcase (buffer-substring beg (1- p))))
 207                       end (std11-field-end))
 208                 (cond ((memq field-name '(newsgroups message-id path))
 209                        )
 210                       ((memq field-name '(reply-to
 211                                           from sender
 212                                           resent-reply-to resent-from
 213                                           resent-sender to resent-to
 214                                           cc resent-cc
 215                                           bcc resent-bcc dcc
 216                                           mime-version))
 217                        (let ((body (buffer-substring p end))
 218                              (default-mime-charset default-charset))
 219                          (delete-region p end)
 220                          (insert (eword-decode-structured-field-body body))
 221                          ))
 222                       (t
 223                        (let ((body (buffer-substring p end))
 224                              (default-mime-charset default-charset))
 225                          (delete-region p end)
 226                          (insert (eword-decode-unstructured-field-body body))
 227                          )))))
 228           (eword-decode-region (point-min) (point-max) t)
 229           )))))
 230
 231 (defun eword-decode-unfold ()
 232   (goto-char (point-min))
 233   (let (field beg end)
 234     (while (re-search-forward std11-field-head-regexp nil t)
 235       (setq beg (match-beginning 0)
 236             end (std11-field-end))
 237       (setq field (buffer-substring beg end))
 238       (if (string-match eword-encoded-word-regexp field)
 239           (save-restriction
 240             (narrow-to-region (goto-char beg) end)
 241             (while (re-search-forward "\n\\([ \t]\\)" nil t)
 242               (replace-match (match-string 1))
 243               )
 244             (goto-char (point-max))
 245             ))
 246       )))
 247
 248
 249 ;;; @ encoded-word decoder
 250 ;;;
 251
 252 (defvar eword-warning-face nil "Face used for invalid encoded-word.")
 253
 254 (defun eword-decode-encoded-word (word &optional must-unfold)
 255   "Decode WORD if it is an encoded-word.
 256
 257 If your emacs implementation can not decode the charset of WORD, it
 258 returns WORD.  Similarly the encoded-word is broken, it returns WORD.
 259
 260 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 261 if there are in decoded encoded-word (generated by bad manner MUA such
 262 as a version of Net$cape)."
 263   (or (if (string-match eword-encoded-word-regexp word)
 264           (let ((charset
 265                  (substring word (match-beginning 1) (match-end 1))
 266                  )
 267                 (encoding
 268                  (upcase
 269                   (substring word (match-beginning 2) (match-end 2))
 270                   ))
 271                 (text
 272                  (substring word (match-beginning 3) (match-end 3))
 273                  ))
 274             (condition-case err
 275                 (eword-decode-encoded-text charset encoding text must-unfold)
 276               (error
 277                (and
 278                 (add-text-properties 0 (length word)
 279                                      (and eword-warning-face
 280                                           (list 'face eword-warning-face))
 281                                      word)
 282                 word)))
 283             ))
 284       word))
 285
 286
 287 ;;; @ encoded-text decoder
 288 ;;;
 289
 290 (defun eword-decode-encoded-text (charset encoding string
 291                                           &optional must-unfold)
 292   "Decode STRING as an encoded-text.
 293
 294 If your emacs implementation can not decode CHARSET, it returns nil.
 295
 296 If ENCODING is not \"B\" or \"Q\", it occurs error.
 297 So you should write error-handling code if you don't want break by errors.
 298
 299 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 300 if there are in decoded encoded-text (generated by bad manner MUA such
 301 as a version of Net$cape)."
 302   (let ((cs (mime-charset-to-coding-system charset)))
 303     (if cs
 304         (let ((dest
 305                (cond
 306                 ((string-equal "B" encoding)
 307                  (if (and (string-match eword-B-encoded-text-regexp string)
 308                           (string-equal string (match-string 0 string)))
 309                      (base64-decode-string string)
 310                    (error "Invalid encoded-text %s" string)))
 311                 ((string-equal "Q" encoding)
 312                  (if (and (string-match eword-Q-encoded-text-regexp string)
 313                           (string-equal string (match-string 0 string)))
 314                      (q-encoding-decode-string string)
 315                    (error "Invalid encoded-text %s" string)))
 316                 (t
 317                  (error "Invalid encoding %s" encoding)
 318                  )))
 319               )
 320           (if dest
 321               (progn
 322                 (setq dest (decode-coding-string dest cs))
 323                 (if must-unfold
 324                     (mapconcat (function
 325                                 (lambda (chr)
 326                                   (cond
 327                                    ((eq chr ?\n) "")
 328                                    ((eq chr ?\t) " ")
 329                                    (t (char-to-string chr)))
 330                                   ))
 331                                (std11-unfold-string dest)
 332                                "")
 333                   dest)
 334                 ))))))
 335
 336
 337 ;;; @ lexical analyze
 338 ;;;
 339
 340 (defvar eword-lexical-analyze-cache nil)
 341 (defvar eword-lexical-analyze-cache-max 299
 342   "*Max position of eword-lexical-analyze-cache.
 343 It is max size of eword-lexical-analyze-cache - 1.")
 344
 345 (defun eword-analyze-quoted-string (string)
 346   (let ((p (std11-check-enclosure string ?\" ?\")))
 347     (if p
 348         (cons (cons 'quoted-string
 349                     (decode-mime-charset-string
 350                      (std11-strip-quoted-pair (substring string 1 (1- p)))
 351                      default-mime-charset))
 352               (substring string p))
 353       )))
 354
 355 (defun eword-analyze-comment (string &optional must-unfold)
 356   (let ((p (std11-check-enclosure string ?\( ?\) t)))
 357     (if p
 358         (cons (cons 'comment
 359                     (eword-decode-string
 360                      (decode-mime-charset-string
 361                       (std11-strip-quoted-pair (substring string 1 (1- p)))
 362                       default-mime-charset)
 363                      must-unfold))
 364               (substring string p))
 365       )))
 366
 367 (defun eword-analyze-encoded-word (string &optional must-unfold)
 368   (if (eq (string-match eword-encoded-word-regexp string) 0)
 369       (let ((end (match-end 0))
 370             (dest (eword-decode-encoded-word (match-string 0 string)
 371                                              must-unfold))
 372             )
 373         (setq string (substring string end))
 374         (while (eq (string-match `,(concat "[ \t\n]*\\("
 375                                            eword-encoded-word-regexp
 376                                            "\\)")
 377                                  string)
 378                    0)
 379           (setq end (match-end 0))
 380           (setq dest
 381                 (concat dest
 382                         (eword-decode-encoded-word (match-string 1 string)
 383                                                    must-unfold))
 384                 string (substring string end))
 385           )
 386         (cons (cons 'atom dest) string)
 387         )))
 388
 389 (defun eword-analyze-atom (string)
 390   (if (string-match std11-atom-regexp string)
 391       (let ((end (match-end 0)))
 392         (cons (cons 'atom (decode-mime-charset-string
 393                            (substring string 0 end)
 394                            default-mime-charset))
 395               (substring string end)
 396               ))))
 397
 398 (defun eword-lexical-analyze-internal (string must-unfold)
 399   (let (dest ret)
 400     (while (not (string-equal string ""))
 401       (setq ret
 402             (or (eword-analyze-quoted-string string)
 403                 (std11-analyze-domain-literal string)
 404                 (eword-analyze-comment string must-unfold)
 405                 (std11-analyze-spaces string)
 406                 (std11-analyze-special string)
 407                 (eword-analyze-encoded-word string must-unfold)
 408                 (eword-analyze-atom string)
 409                 '((error) . "")
 410                 ))
 411       (setq dest (cons (car ret) dest))
 412       (setq string (cdr ret))
 413       )
 414     (nreverse dest)
 415     ))
 416
 417 (defun eword-lexical-analyze (string &optional must-unfold)
 418   "Return lexical analyzed list corresponding STRING.
 419 It is like std11-lexical-analyze, but it decodes non us-ascii
 420 characters encoded as encoded-words or invalid \"raw\" format.
 421 \"Raw\" non us-ascii characters are regarded as variable
 422 `default-mime-charset'."
 423   (let ((key (copy-sequence string))
 424         ret)
 425     (set-text-properties 0 (length key) nil key)
 426     (if (setq ret (assoc key eword-lexical-analyze-cache))
 427         (cdr ret)
 428       (setq ret (eword-lexical-analyze-internal key must-unfold))
 429       (setq eword-lexical-analyze-cache
 430             (cons (cons key ret)
 431                   (last eword-lexical-analyze-cache
 432                         eword-lexical-analyze-cache-max)))
 433       ret)))
 434
 435 (defun eword-decode-structured-field-body (string &optional must-unfold)
 436   "Decode non us-ascii characters in STRING as structured field body.
 437 STRING is unfolded before decoding.
 438
 439 It decodes non us-ascii characters in FULL-NAME encoded as
 440 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 441 characters are regarded as variable `default-mime-charset'.
 442
 443 If an encoded-word is broken or your emacs implementation can not
 444 decode the charset included in it, it is not decoded.
 445
 446 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 447 if there are in decoded encoded-words (generated by bad manner MUA
 448 such as a version of Net$cape)."
 449   (mapconcat (function
 450               (lambda (token)
 451                 (let ((type (car token))
 452                       (value (cdr token)))
 453                   (cond ((eq type 'quoted-string)
 454                          (std11-wrap-as-quoted-string value)
 455                          )
 456                         ((eq type 'comment)
 457                          (concat "("
 458                                  (std11-wrap-as-quoted-pairs value '(?( ?)))
 459                                  ")")
 460                          )
 461                         (t
 462                          value)))))
 463              (eword-lexical-analyze string must-unfold)
 464              ""))
 465
 466 (defun eword-decode-unstructured-field-body (string &optional must-unfold)
 467   "Decode non us-ascii characters in STRING as unstructured field body.
 468 STRING is unfolded before decoding.
 469
 470 It decodes non us-ascii characters in FULL-NAME encoded as
 471 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 472 characters are regarded as variable `default-mime-charset'.
 473
 474 If an encoded-word is broken or your emacs implementation can not
 475 decode the charset included in it, it is not decoded.
 476
 477 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 478 if there are in decoded encoded-words (generated by bad manner MUA
 479 such as a version of Net$cape)."
 480   (eword-decode-string
 481    (decode-mime-charset-string string default-mime-charset)
 482    must-unfold))
 483
 484 (defun eword-extract-address-components (string)
 485   "Extract full name and canonical address from STRING.
 486 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
 487 If no name can be extracted, FULL-NAME will be nil.
 488 It decodes non us-ascii characters in FULL-NAME encoded as
 489 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 490 characters are regarded as variable `default-mime-charset'."
 491   (let* ((structure (car (std11-parse-address
 492                           (eword-lexical-analyze
 493                            (std11-unfold-string string) 'must-unfold))))
 494          (phrase  (std11-full-name-string structure))
 495          (address (std11-address-string structure))
 496          )
 497     (list phrase address)
 498     ))
 499
 500
 501 ;;; @ end
 502 ;;;
 503
 504 (provide 'eword-decode)
 505
 506 ;;; eword-decode.el ends here