eword-decode.el

   1 ;;; eword-decode.el --- RFC 2047 based encoded-word decoder for GNU Emacs
   2
   3 ;; Copyright (C) 1995,1996,1997 Free Software Foundation, Inc.
   4
   5 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
   6 ;;         MORIOKA Tomohiko <morioka@jaist.ac.jp>
   7 ;; Maintainer: MORIOKA Tomohiko <morioka@jaist.ac.jp>
   8 ;; Created: 1995/10/03
   9 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
  10 ;;      Renamed: 1993/06/03 to tiny-mime.el
  11 ;;      Renamed: 1995/10/03 from tiny-mime.el (split off encoder)
  12 ;;      Renamed: 1997/02/22 from tm-ew-d.el
  13 ;; Version: $Revision: 1.10 $
  14 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
  15
  16 ;; This file is part of SEMI (SEMI is Emacs MIME Interfaces).
  17
  18 ;; This program is free software; you can redistribute it and/or
  19 ;; modify it under the terms of the GNU General Public License as
  20 ;; published by the Free Software Foundation; either version 2, or (at
  21 ;; your option) any later version.
  22
  23 ;; This program is distributed in the hope that it will be useful, but
  24 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
  25 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26 ;; General Public License for more details.
  27
  28 ;; You should have received a copy of the GNU General Public License
  29 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  30 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  31 ;; Boston, MA 02111-1307, USA.
  32
  33 ;;; Code:
  34
  35 (require 'std11-parse)
  36 (require 'mel)
  37 (require 'mime-def)
  38
  39 (defgroup eword-decode nil
  40   "Encoded-word decoding"
  41   :group 'mime)
  42
  43
  44 ;;; @ version
  45 ;;;
  46
  47 (defconst eword-decode-RCS-ID
  48   "$Id: eword-decode.el,v 1.10 1998-02-25 13:28:53 morioka Exp $")
  49 (defconst eword-decode-version (get-version-string eword-decode-RCS-ID))
  50
  51
  52 ;;; @ MIME encoded-word definition
  53 ;;;
  54
  55 (defconst eword-encoded-text-regexp "[!->@-~]+")
  56 (defconst eword-encoded-word-regexp
  57   (concat (regexp-quote "=?")
  58           "\\("
  59           mime-charset-regexp
  60           "\\)"
  61           (regexp-quote "?")
  62           "\\(B\\|Q\\)"
  63           (regexp-quote "?")
  64           "\\("
  65           eword-encoded-text-regexp
  66           "\\)"
  67           (regexp-quote "?=")))
  68
  69
  70 ;;; @@ Base64
  71 ;;;
  72
  73 (defconst base64-token-regexp "[A-Za-z0-9+/]")
  74 (defconst base64-token-padding-regexp "[A-Za-z0-9+/=]")
  75
  76 (defconst eword-B-encoded-text-regexp
  77   (concat "\\(\\("
  78           base64-token-regexp
  79           base64-token-regexp
  80           base64-token-regexp
  81           base64-token-regexp
  82           "\\)*"
  83           base64-token-regexp
  84           base64-token-regexp
  85           base64-token-padding-regexp
  86           base64-token-padding-regexp
  87           "\\)"))
  88
  89 ;; (defconst eword-B-encoding-and-encoded-text-regexp
  90 ;;   (concat "\\(B\\)\\?" eword-B-encoded-text-regexp))
  91
  92
  93 ;;; @@ Quoted-Printable
  94 ;;;
  95
  96 (defconst quoted-printable-hex-chars "0123456789ABCDEF")
  97 (defconst quoted-printable-octet-regexp
  98   (concat "=[" quoted-printable-hex-chars
  99           "][" quoted-printable-hex-chars "]"))
 100
 101 (defconst eword-Q-encoded-text-regexp
 102   (concat "\\([^=?]\\|" quoted-printable-octet-regexp "\\)+"))
 103 ;; (defconst eword-Q-encoding-and-encoded-text-regexp
 104 ;;   (concat "\\(Q\\)\\?" eword-Q-encoded-text-regexp))
 105
 106
 107 ;;; @ for string
 108 ;;;
 109
 110 (defun eword-decode-string (string &optional must-unfold)
 111   "Decode MIME encoded-words in STRING.
 112
 113 STRING is unfolded before decoding.
 114
 115 If an encoded-word is broken or your emacs implementation can not
 116 decode the charset included in it, it is not decoded.
 117
 118 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 119 if there are in decoded encoded-words (generated by bad manner MUA
 120 such as a version of Net$cape)."
 121   (setq string (std11-unfold-string string))
 122   (let ((dest "")(ew nil)
 123         beg end)
 124     (while (and (string-match eword-encoded-word-regexp string)
 125                 (setq beg (match-beginning 0)
 126                       end (match-end 0))
 127                 )
 128       (if (> beg 0)
 129           (if (not
 130                (and (eq ew t)
 131                     (string-match "^[ \t]+$" (substring string 0 beg))
 132                     ))
 133               (setq dest (concat dest (substring string 0 beg)))
 134             )
 135         )
 136       (setq dest
 137             (concat dest
 138                     (eword-decode-encoded-word
 139                      (substring string beg end) must-unfold)
 140                     ))
 141       (setq string (substring string end))
 142       (setq ew t)
 143       )
 144     (concat dest string)
 145     ))
 146
 147
 148 ;;; @ for region
 149 ;;;
 150
 151 (defun eword-decode-region (start end &optional unfolding must-unfold)
 152   "Decode MIME encoded-words in region between START and END.
 153
 154 If UNFOLDING is not nil, it unfolds before decoding.
 155
 156 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 157 if there are in decoded encoded-words (generated by bad manner MUA
 158 such as a version of Net$cape)."
 159   (interactive "*r")
 160   (save-excursion
 161     (save-restriction
 162       (narrow-to-region start end)
 163       (if unfolding
 164           (eword-decode-unfold)
 165         )
 166       (goto-char (point-min))
 167       (while (re-search-forward (concat "\\(" eword-encoded-word-regexp "\\)"
 168                                         "\\(\n?[ \t]\\)+"
 169                                         "\\(" eword-encoded-word-regexp "\\)")
 170                                 nil t)
 171         (replace-match "\\1\\6")
 172         (goto-char (point-min))
 173         )
 174       (while (re-search-forward eword-encoded-word-regexp nil t)
 175         (insert (eword-decode-encoded-word
 176                  (prog1
 177                      (buffer-substring (match-beginning 0) (match-end 0))
 178                    (delete-region (match-beginning 0) (match-end 0))
 179                    ) must-unfold))
 180         )
 181       )))
 182
 183
 184 ;;; @ for message header
 185 ;;;
 186
 187 (defcustom eword-decode-ignored-field-list
 188   '(newsgroups path lines nntp-posting-host message-id date)
 189   "*List of field-names to be ignored when decoding.
 190 Each field name must be symbol."
 191   :group 'eword-decode
 192   :type '(repeat symbol))
 193
 194 (defcustom eword-decode-structured-field-list
 195   '(reply-to resent-reply-to from resent-from sender resent-sender
 196              to resent-to cc resent-cc bcc resent-bcc dcc
 197              mime-version content-type content-transfer-encoding
 198              content-disposition)
 199   "*List of field-names to decode as structured field.
 200 Each field name must be symbol."
 201   :group 'eword-decode
 202   :type '(repeat symbol))
 203
 204 (defun eword-decode-header (&optional code-conversion separator)
 205   "Decode MIME encoded-words in header fields.
 206 If CODE-CONVERSION is nil, it decodes only encoded-words.  If it is
 207 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
 208 Otherwise it decodes non-ASCII bit patterns as the
 209 default-mime-charset.
 210 If SEPARATOR is not nil, it is used as header separator."
 211   (interactive "*")
 212   (save-excursion
 213     (save-restriction
 214       (std11-narrow-to-header separator)
 215       (let ((default-charset
 216               (if code-conversion
 217                   (if (mime-charset-to-coding-system code-conversion)
 218                       code-conversion
 219                     default-mime-charset))))
 220         (if default-charset
 221             (let (beg p end field-name len)
 222               (goto-char (point-min))
 223               (while (re-search-forward std11-field-head-regexp nil t)
 224                 (setq beg (match-beginning 0)
 225                       p (match-end 0)
 226                       field-name (buffer-substring beg (1- p))
 227                       len (string-width field-name)
 228                       field-name (intern (downcase field-name))
 229                       end (std11-field-end))
 230                 (cond ((memq field-name eword-decode-ignored-field-list)
 231                        ;; Don't decode
 232                        )
 233                       ((memq field-name eword-decode-structured-field-list)
 234                        ;; Decode as structured field
 235                        (let ((body (buffer-substring p end))
 236                              (default-mime-charset default-charset))
 237                          (delete-region p end)
 238                          (insert (eword-decode-and-fold-structured-field
 239                                   body (1+ len)))
 240                          ))
 241                       (t
 242                        ;; Decode as unstructured field
 243                        (save-restriction
 244                          (narrow-to-region beg (1+ end))
 245                          (decode-mime-charset-region p end default-charset)
 246                          (goto-char p)
 247                          (if (re-search-forward eword-encoded-word-regexp
 248                                                 nil t)
 249                              (eword-decode-region beg (point-max) 'unfold))
 250                          )))))
 251           (eword-decode-region (point-min) (point-max) t)
 252           )))))
 253
 254 (defun eword-decode-unfold ()
 255   (goto-char (point-min))
 256   (let (field beg end)
 257     (while (re-search-forward std11-field-head-regexp nil t)
 258       (setq beg (match-beginning 0)
 259             end (std11-field-end))
 260       (setq field (buffer-substring beg end))
 261       (if (string-match eword-encoded-word-regexp field)
 262           (save-restriction
 263             (narrow-to-region (goto-char beg) end)
 264             (while (re-search-forward "\n\\([ \t]\\)" nil t)
 265               (replace-match (match-string 1))
 266               )
 267             (goto-char (point-max))
 268             ))
 269       )))
 270
 271
 272 ;;; @ encoded-word decoder
 273 ;;;
 274
 275 (defvar eword-warning-face nil "Face used for invalid encoded-word.")
 276
 277 (defun eword-decode-encoded-word (word &optional must-unfold)
 278   "Decode WORD if it is an encoded-word.
 279
 280 If your emacs implementation can not decode the charset of WORD, it
 281 returns WORD.  Similarly the encoded-word is broken, it returns WORD.
 282
 283 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 284 if there are in decoded encoded-word (generated by bad manner MUA such
 285 as a version of Net$cape)."
 286   (or (if (string-match eword-encoded-word-regexp word)
 287           (let ((charset
 288                  (substring word (match-beginning 1) (match-end 1))
 289                  )
 290                 (encoding
 291                  (upcase
 292                   (substring word (match-beginning 2) (match-end 2))
 293                   ))
 294                 (text
 295                  (substring word (match-beginning 3) (match-end 3))
 296                  ))
 297             (condition-case err
 298                 (eword-decode-encoded-text charset encoding text must-unfold)
 299               (error
 300                (and
 301                 (add-text-properties 0 (length word)
 302                                      (and eword-warning-face
 303                                           (list 'face eword-warning-face))
 304                                      word)
 305                 word)))
 306             ))
 307       word))
 308
 309
 310 ;;; @ encoded-text decoder
 311 ;;;
 312
 313 (defun eword-decode-encoded-text (charset encoding string
 314                                           &optional must-unfold)
 315   "Decode STRING as an encoded-text.
 316
 317 If your emacs implementation can not decode CHARSET, it returns nil.
 318
 319 If ENCODING is not \"B\" or \"Q\", it occurs error.
 320 So you should write error-handling code if you don't want break by errors.
 321
 322 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 323 if there are in decoded encoded-text (generated by bad manner MUA such
 324 as a version of Net$cape)."
 325   (let ((cs (mime-charset-to-coding-system charset)))
 326     (if cs
 327         (let ((dest
 328                (cond
 329                 ((string-equal "B" encoding)
 330                  (if (and (string-match eword-B-encoded-text-regexp string)
 331                           (string-equal string (match-string 0 string)))
 332                      (base64-decode-string string)
 333                    (error "Invalid encoded-text %s" string)))
 334                 ((string-equal "Q" encoding)
 335                  (if (and (string-match eword-Q-encoded-text-regexp string)
 336                           (string-equal string (match-string 0 string)))
 337                      (q-encoding-decode-string string)
 338                    (error "Invalid encoded-text %s" string)))
 339                 (t
 340                  (error "Invalid encoding %s" encoding)
 341                  )))
 342               )
 343           (if dest
 344               (progn
 345                 (setq dest (decode-coding-string dest cs))
 346                 (if must-unfold
 347                     (mapconcat (function
 348                                 (lambda (chr)
 349                                   (cond
 350                                    ((eq chr ?\n) "")
 351                                    ((eq chr ?\t) " ")
 352                                    (t (char-to-string chr)))
 353                                   ))
 354                                (std11-unfold-string dest)
 355                                "")
 356                   dest)
 357                 ))))))
 358
 359
 360 ;;; @ lexical analyze
 361 ;;;
 362
 363 (defvar eword-lexical-analyze-cache nil)
 364 (defvar eword-lexical-analyze-cache-max 299
 365   "*Max position of eword-lexical-analyze-cache.
 366 It is max size of eword-lexical-analyze-cache - 1.")
 367
 368 (defcustom eword-lexical-analyzers
 369   '(eword-analyze-quoted-string
 370     eword-analyze-domain-literal
 371     eword-analyze-comment
 372     eword-analyze-spaces
 373     eword-analyze-special
 374     eword-analyze-encoded-word
 375     eword-analyze-atom)
 376   "*List of functions to return result of lexical analyze.
 377 Each function must have two arguments: STRING and MUST-UNFOLD.
 378 STRING is the target string to be analyzed.
 379 If MUST-UNFOLD is not nil, each function must unfold and eliminate
 380 bare-CR and bare-LF from the result even if they are included in
 381 content of the encoded-word.
 382 Each function must return nil if it can not analyze STRING as its
 383 format.
 384
 385 Previous function is preferred to next function.  If a function
 386 returns nil, next function is used.  Otherwise the return value will
 387 be the result."
 388   :group 'eword-decode
 389   :type '(repeat function))
 390
 391 (defun eword-analyze-quoted-string (string &optional must-unfold)
 392   (let ((p (std11-check-enclosure string ?\" ?\")))
 393     (if p
 394         (cons (cons 'quoted-string
 395                     (decode-mime-charset-string
 396                      (std11-strip-quoted-pair (substring string 1 (1- p)))
 397                      default-mime-charset))
 398               (substring string p))
 399       )))
 400
 401 (defun eword-analyze-domain-literal (string &optional must-unfold)
 402   (std11-analyze-domain-literal string))
 403
 404 (defun eword-analyze-comment (string &optional must-unfold)
 405   (let ((p (std11-check-enclosure string ?\( ?\) t)))
 406     (if p
 407         (cons (cons 'comment
 408                     (eword-decode-string
 409                      (decode-mime-charset-string
 410                       (std11-strip-quoted-pair (substring string 1 (1- p)))
 411                       default-mime-charset)
 412                      must-unfold))
 413               (substring string p))
 414       )))
 415
 416 (defun eword-analyze-spaces (string &optional must-unfold)
 417   (std11-analyze-spaces string))
 418
 419 (defun eword-analyze-special (string &optional must-unfold)
 420   (std11-analyze-special string))
 421
 422 (defun eword-analyze-encoded-word (string &optional must-unfold)
 423   (if (eq (string-match eword-encoded-word-regexp string) 0)
 424       (let ((end (match-end 0))
 425             (dest (eword-decode-encoded-word (match-string 0 string)
 426                                              must-unfold))
 427             )
 428         (setq string (substring string end))
 429         (while (eq (string-match `,(concat "[ \t\n]*\\("
 430                                            eword-encoded-word-regexp
 431                                            "\\)")
 432                                  string)
 433                    0)
 434           (setq end (match-end 0))
 435           (setq dest
 436                 (concat dest
 437                         (eword-decode-encoded-word (match-string 1 string)
 438                                                    must-unfold))
 439                 string (substring string end))
 440           )
 441         (cons (cons 'atom dest) string)
 442         )))
 443
 444 (defun eword-analyze-atom (string &optional must-unfold)
 445   (if (string-match std11-atom-regexp string)
 446       (let ((end (match-end 0)))
 447         (cons (cons 'atom (decode-mime-charset-string
 448                            (substring string 0 end)
 449                            default-mime-charset))
 450               (substring string end)
 451               ))))
 452
 453 (defun eword-lexical-analyze-internal (string must-unfold)
 454   (let (dest ret)
 455     (while (not (string-equal string ""))
 456       (setq ret
 457             (let ((rest eword-lexical-analyzers)
 458                   func r)
 459               (while (and (setq func (car rest))
 460                           (null (setq r (funcall func string must-unfold)))
 461                           )
 462                 (setq rest (cdr rest)))
 463               (or r '((error) . ""))
 464               ))
 465       (setq dest (cons (car ret) dest))
 466       (setq string (cdr ret))
 467       )
 468     (nreverse dest)
 469     ))
 470
 471 (defun eword-lexical-analyze (string &optional must-unfold)
 472   "Return lexical analyzed list corresponding STRING.
 473 It is like std11-lexical-analyze, but it decodes non us-ascii
 474 characters encoded as encoded-words or invalid \"raw\" format.
 475 \"Raw\" non us-ascii characters are regarded as variable
 476 `default-mime-charset'."
 477   (let ((key (copy-sequence string))
 478         ret)
 479     (set-text-properties 0 (length key) nil key)
 480     (if (setq ret (assoc key eword-lexical-analyze-cache))
 481         (cdr ret)
 482       (setq ret (eword-lexical-analyze-internal key must-unfold))
 483       (setq eword-lexical-analyze-cache
 484             (cons (cons key ret)
 485                   (last eword-lexical-analyze-cache
 486                         eword-lexical-analyze-cache-max)))
 487       ret)))
 488
 489 (defun eword-decode-token (token)
 490   (let ((type (car token))
 491         (value (cdr token)))
 492     (cond ((eq type 'quoted-string)
 493            (std11-wrap-as-quoted-string value))
 494           ((eq type 'comment)
 495            (concat "(" (std11-wrap-as-quoted-pairs value '(?( ?))) ")"))
 496           (t value))))
 497
 498 (defun eword-decode-and-fold-structured-field
 499   (string start-column &optional max-column must-unfold)
 500   "Decode and fold (fill) STRING as structured field body.
 501 It decodes non us-ascii characters in FULL-NAME encoded as
 502 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 503 characters are regarded as variable `default-mime-charset'.
 504
 505 If an encoded-word is broken or your emacs implementation can not
 506 decode the charset included in it, it is not decoded.
 507
 508 If MAX-COLUMN is omitted, `fill-column' is used.
 509
 510 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 511 if there are in decoded encoded-words (generated by bad manner MUA
 512 such as a version of Net$cape)."
 513   (or max-column
 514       (setq max-column fill-column))
 515   (let ((c start-column)
 516         (tokens (eword-lexical-analyze string must-unfold))
 517         (result ""))
 518     (while tokens
 519       (let* ((token (car tokens))
 520              (type (car token)))
 521         (setq tokens (cdr tokens))
 522         (if (eq type 'spaces)
 523             (let* ((next-token (car tokens))
 524                    (next-str (eword-decode-token next-token))
 525                    (next-len (string-width next-str))
 526                    (next-c (+ c next-len 1)))
 527               (if (< next-c max-column)
 528                   (setq result (concat result " " next-str)
 529                         c next-c)
 530                 (setq result (concat result "\n " next-str)
 531                       c (1+ next-len)))
 532               (setq tokens (cdr tokens))
 533               )
 534           (let* ((str (eword-decode-token token)))
 535             (setq result (concat result str)
 536                   c (+ c (string-width str)))
 537             ))))
 538     result))
 539
 540 (defun eword-decode-and-unfold-structured-field (string)
 541   "Decode and unfold STRING as structured field body.
 542 It decodes non us-ascii characters in FULL-NAME encoded as
 543 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 544 characters are regarded as variable `default-mime-charset'.
 545
 546 If an encoded-word is broken or your emacs implementation can not
 547 decode the charset included in it, it is not decoded."
 548   (let ((tokens (eword-lexical-analyze string 'must-unfold))
 549         (result ""))
 550     (while tokens
 551       (let* ((token (car tokens))
 552              (type (car token)))
 553         (setq tokens (cdr tokens))
 554         (setq result
 555               (if (eq type 'spaces)
 556                   (concat result " ")
 557                 (concat result (eword-decode-token token))
 558                 ))))
 559     result))
 560
 561 (defun eword-decode-structured-field-body (string &optional must-unfold
 562                                                   start-column max-column)
 563   "Decode non us-ascii characters in STRING as structured field body.
 564 STRING is unfolded before decoding.
 565
 566 It decodes non us-ascii characters in FULL-NAME encoded as
 567 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 568 characters are regarded as variable `default-mime-charset'.
 569
 570 If an encoded-word is broken or your emacs implementation can not
 571 decode the charset included in it, it is not decoded.
 572
 573 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 574 if there are in decoded encoded-words (generated by bad manner MUA
 575 such as a version of Net$cape)."
 576   (if start-column
 577       ;; fold with max-column
 578       (eword-decode-and-fold-structured-field
 579        string start-column max-column must-unfold)
 580     ;; Don't fold
 581     (mapconcat (function eword-decode-token)
 582                (eword-lexical-analyze string must-unfold)
 583                "")
 584     ))
 585
 586 (defun eword-decode-unstructured-field-body (string &optional must-unfold)
 587   "Decode non us-ascii characters in STRING as unstructured field body.
 588 STRING is unfolded before decoding.
 589
 590 It decodes non us-ascii characters in FULL-NAME encoded as
 591 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 592 characters are regarded as variable `default-mime-charset'.
 593
 594 If an encoded-word is broken or your emacs implementation can not
 595 decode the charset included in it, it is not decoded.
 596
 597 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 598 if there are in decoded encoded-words (generated by bad manner MUA
 599 such as a version of Net$cape)."
 600   (eword-decode-string
 601    (decode-mime-charset-string string default-mime-charset)
 602    must-unfold))
 603
 604 (defun eword-extract-address-components (string)
 605   "Extract full name and canonical address from STRING.
 606 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
 607 If no name can be extracted, FULL-NAME will be nil.
 608 It decodes non us-ascii characters in FULL-NAME encoded as
 609 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 610 characters are regarded as variable `default-mime-charset'."
 611   (let* ((structure (car (std11-parse-address
 612                           (eword-lexical-analyze
 613                            (std11-unfold-string string) 'must-unfold))))
 614          (phrase  (std11-full-name-string structure))
 615          (address (std11-address-string structure))
 616          )
 617     (list phrase address)
 618     ))
 619
 620
 621 ;;; @ end
 622 ;;;
 623
 624 (provide 'eword-decode)
 625
 626 ;;; eword-decode.el ends here