eword-decode.el

   1 ;;; eword-decode.el --- RFC 2047 based encoded-word decoder for GNU Emacs
   2
   3 ;; Copyright (C) 1995,1996,1997,1998 Free Software Foundation, Inc.
   4
   5 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
   6 ;;         MORIOKA Tomohiko <morioka@jaist.ac.jp>
   7 ;; Maintainer: MORIOKA Tomohiko <morioka@jaist.ac.jp>
   8 ;; Created: 1995/10/03
   9 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
  10 ;;      Renamed: 1993/06/03 to tiny-mime.el
  11 ;;      Renamed: 1995/10/03 from tiny-mime.el (split off encoder)
  12 ;;      Renamed: 1997/02/22 from tm-ew-d.el
  13 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
  14
  15 ;; This file is part of SEMI (Spadework for Emacs MIME Interfaces).
  16
  17 ;; This program is free software; you can redistribute it and/or
  18 ;; modify it under the terms of the GNU General Public License as
  19 ;; published by the Free Software Foundation; either version 2, or (at
  20 ;; your option) any later version.
  21
  22 ;; This program is distributed in the hope that it will be useful, but
  23 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
  24 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  25 ;; General Public License for more details.
  26
  27 ;; You should have received a copy of the GNU General Public License
  28 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  29 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  30 ;; Boston, MA 02111-1307, USA.
  31
  32 ;;; Code:
  33
  34 (require 'std11-parse)
  35 (require 'mel)
  36 (require 'mime-def)
  37
  38 (defgroup eword-decode nil
  39   "Encoded-word decoding"
  40   :group 'mime)
  41
  42
  43 ;;; @ MIME encoded-word definition
  44 ;;;
  45
  46 (defconst eword-encoded-text-regexp "[!->@-~]+")
  47 (defconst eword-encoded-word-regexp
  48   (concat (regexp-quote "=?")
  49           "\\("
  50           mime-charset-regexp
  51           "\\)"
  52           (regexp-quote "?")
  53           "\\(B\\|Q\\)"
  54           (regexp-quote "?")
  55           "\\("
  56           eword-encoded-text-regexp
  57           "\\)"
  58           (regexp-quote "?=")))
  59
  60
  61 ;;; @@ Base64
  62 ;;;
  63
  64 (defconst base64-token-regexp "[A-Za-z0-9+/]")
  65 (defconst base64-token-padding-regexp "[A-Za-z0-9+/=]")
  66
  67 (defconst eword-B-encoded-text-regexp
  68   (concat "\\(\\("
  69           base64-token-regexp
  70           base64-token-regexp
  71           base64-token-regexp
  72           base64-token-regexp
  73           "\\)*"
  74           base64-token-regexp
  75           base64-token-regexp
  76           base64-token-padding-regexp
  77           base64-token-padding-regexp
  78           "\\)"))
  79
  80 ;; (defconst eword-B-encoding-and-encoded-text-regexp
  81 ;;   (concat "\\(B\\)\\?" eword-B-encoded-text-regexp))
  82
  83
  84 ;;; @@ Quoted-Printable
  85 ;;;
  86
  87 (defconst quoted-printable-hex-chars "0123456789ABCDEF")
  88 (defconst quoted-printable-octet-regexp
  89   (concat "=[" quoted-printable-hex-chars
  90           "][" quoted-printable-hex-chars "]"))
  91
  92 (defconst eword-Q-encoded-text-regexp
  93   (concat "\\([^=?]\\|" quoted-printable-octet-regexp "\\)+"))
  94 ;; (defconst eword-Q-encoding-and-encoded-text-regexp
  95 ;;   (concat "\\(Q\\)\\?" eword-Q-encoded-text-regexp))
  96
  97
  98 ;;; @ for string
  99 ;;;
 100
 101 (defun eword-decode-string (string &optional must-unfold)
 102   "Decode MIME encoded-words in STRING.
 103
 104 STRING is unfolded before decoding.
 105
 106 If an encoded-word is broken or your emacs implementation can not
 107 decode the charset included in it, it is not decoded.
 108
 109 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 110 if there are in decoded encoded-words (generated by bad manner MUA
 111 such as a version of Net$cape)."
 112   (setq string (std11-unfold-string string))
 113   (let ((dest "")(ew nil)
 114         beg end)
 115     (while (and (string-match eword-encoded-word-regexp string)
 116                 (setq beg (match-beginning 0)
 117                       end (match-end 0))
 118                 )
 119       (if (> beg 0)
 120           (if (not
 121                (and (eq ew t)
 122                     (string-match "^[ \t]+$" (substring string 0 beg))
 123                     ))
 124               (setq dest (concat dest (substring string 0 beg)))
 125             )
 126         )
 127       (setq dest
 128             (concat dest
 129                     (eword-decode-encoded-word
 130                      (substring string beg end) must-unfold)
 131                     ))
 132       (setq string (substring string end))
 133       (setq ew t)
 134       )
 135     (concat dest string)
 136     ))
 137
 138
 139 ;;; @ for region
 140 ;;;
 141
 142 (defun eword-decode-region (start end &optional unfolding must-unfold)
 143   "Decode MIME encoded-words in region between START and END.
 144
 145 If UNFOLDING is not nil, it unfolds before decoding.
 146
 147 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 148 if there are in decoded encoded-words (generated by bad manner MUA
 149 such as a version of Net$cape)."
 150   (interactive "*r")
 151   (save-excursion
 152     (save-restriction
 153       (narrow-to-region start end)
 154       (if unfolding
 155           (eword-decode-unfold)
 156         )
 157       (goto-char (point-min))
 158       (while (re-search-forward (concat "\\(" eword-encoded-word-regexp "\\)"
 159                                         "\\(\n?[ \t]\\)+"
 160                                         "\\(" eword-encoded-word-regexp "\\)")
 161                                 nil t)
 162         (replace-match "\\1\\6")
 163         (goto-char (point-min))
 164         )
 165       (while (re-search-forward eword-encoded-word-regexp nil t)
 166         (insert (eword-decode-encoded-word
 167                  (prog1
 168                      (buffer-substring (match-beginning 0) (match-end 0))
 169                    (delete-region (match-beginning 0) (match-end 0))
 170                    ) must-unfold))
 171         )
 172       )))
 173
 174
 175 ;;; @ for message header
 176 ;;;
 177
 178 (defcustom eword-decode-ignored-field-list
 179   '(newsgroups path lines nntp-posting-host message-id date)
 180   "*List of field-names to be ignored when decoding.
 181 Each field name must be symbol."
 182   :group 'eword-decode
 183   :type '(repeat symbol))
 184
 185 (defcustom eword-decode-structured-field-list
 186   '(reply-to resent-reply-to from resent-from sender resent-sender
 187              to resent-to cc resent-cc bcc resent-bcc dcc
 188              mime-version content-type content-transfer-encoding
 189              content-disposition)
 190   "*List of field-names to decode as structured field.
 191 Each field name must be symbol."
 192   :group 'eword-decode
 193   :type '(repeat symbol))
 194
 195 (defun eword-decode-header (&optional code-conversion separator)
 196   "Decode MIME encoded-words in header fields.
 197 If CODE-CONVERSION is nil, it decodes only encoded-words.  If it is
 198 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
 199 Otherwise it decodes non-ASCII bit patterns as the
 200 default-mime-charset.
 201 If SEPARATOR is not nil, it is used as header separator."
 202   (interactive "*")
 203   (save-excursion
 204     (save-restriction
 205       (std11-narrow-to-header separator)
 206       (let ((default-charset
 207               (if code-conversion
 208                   (if (mime-charset-to-coding-system code-conversion)
 209                       code-conversion
 210                     default-mime-charset))))
 211         (if default-charset
 212             (let (beg p end field-name len)
 213               (goto-char (point-min))
 214               (while (re-search-forward std11-field-head-regexp nil t)
 215                 (setq beg (match-beginning 0)
 216                       p (match-end 0)
 217                       field-name (buffer-substring beg (1- p))
 218                       len (string-width field-name)
 219                       field-name (intern (downcase field-name))
 220                       end (std11-field-end))
 221                 (cond ((memq field-name eword-decode-ignored-field-list)
 222                        ;; Don't decode
 223                        )
 224                       ((memq field-name eword-decode-structured-field-list)
 225                        ;; Decode as structured field
 226                        (let ((body (buffer-substring p end))
 227                              (default-mime-charset default-charset))
 228                          (delete-region p end)
 229                          (insert (eword-decode-and-fold-structured-field
 230                                   body (1+ len)))
 231                          ))
 232                       (t
 233                        ;; Decode as unstructured field
 234                        (save-restriction
 235                          (narrow-to-region beg (1+ end))
 236                          (decode-mime-charset-region p end default-charset)
 237                          (goto-char p)
 238                          (if (re-search-forward eword-encoded-word-regexp
 239                                                 nil t)
 240                              (eword-decode-region beg (point-max) 'unfold))
 241                          )))))
 242           (eword-decode-region (point-min) (point-max) t)
 243           )))))
 244
 245 (defun eword-decode-unfold ()
 246   (goto-char (point-min))
 247   (let (field beg end)
 248     (while (re-search-forward std11-field-head-regexp nil t)
 249       (setq beg (match-beginning 0)
 250             end (std11-field-end))
 251       (setq field (buffer-substring beg end))
 252       (if (string-match eword-encoded-word-regexp field)
 253           (save-restriction
 254             (narrow-to-region (goto-char beg) end)
 255             (while (re-search-forward "\n\\([ \t]\\)" nil t)
 256               (replace-match (match-string 1))
 257               )
 258             (goto-char (point-max))
 259             ))
 260       )))
 261
 262
 263 ;;; @ encoded-word decoder
 264 ;;;
 265
 266 (defvar eword-warning-face nil "Face used for invalid encoded-word.")
 267
 268 (defun eword-decode-encoded-word (word &optional must-unfold)
 269   "Decode WORD if it is an encoded-word.
 270
 271 If your emacs implementation can not decode the charset of WORD, it
 272 returns WORD.  Similarly the encoded-word is broken, it returns WORD.
 273
 274 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 275 if there are in decoded encoded-word (generated by bad manner MUA such
 276 as a version of Net$cape)."
 277   (or (if (string-match eword-encoded-word-regexp word)
 278           (let ((charset
 279                  (substring word (match-beginning 1) (match-end 1))
 280                  )
 281                 (encoding
 282                  (upcase
 283                   (substring word (match-beginning 2) (match-end 2))
 284                   ))
 285                 (text
 286                  (substring word (match-beginning 3) (match-end 3))
 287                  ))
 288             (condition-case err
 289                 (eword-decode-encoded-text charset encoding text must-unfold)
 290               (error
 291                (and
 292                 (add-text-properties 0 (length word)
 293                                      (and eword-warning-face
 294                                           (list 'face eword-warning-face))
 295                                      word)
 296                 word)))
 297             ))
 298       word))
 299
 300
 301 ;;; @ encoded-text decoder
 302 ;;;
 303
 304 (defun eword-decode-encoded-text (charset encoding string
 305                                           &optional must-unfold)
 306   "Decode STRING as an encoded-text.
 307
 308 If your emacs implementation can not decode CHARSET, it returns nil.
 309
 310 If ENCODING is not \"B\" or \"Q\", it occurs error.
 311 So you should write error-handling code if you don't want break by errors.
 312
 313 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 314 if there are in decoded encoded-text (generated by bad manner MUA such
 315 as a version of Net$cape)."
 316   (let ((cs (mime-charset-to-coding-system charset)))
 317     (if cs
 318         (let ((dest
 319                (cond
 320                 ((string-equal "B" encoding)
 321                  (if (and (string-match eword-B-encoded-text-regexp string)
 322                           (string-equal string (match-string 0 string)))
 323                      (base64-decode-string string)
 324                    (error "Invalid encoded-text %s" string)))
 325                 ((string-equal "Q" encoding)
 326                  (if (and (string-match eword-Q-encoded-text-regexp string)
 327                           (string-equal string (match-string 0 string)))
 328                      (q-encoding-decode-string string)
 329                    (error "Invalid encoded-text %s" string)))
 330                 (t
 331                  (error "Invalid encoding %s" encoding)
 332                  )))
 333               )
 334           (if dest
 335               (progn
 336                 (setq dest (decode-coding-string dest cs))
 337                 (if must-unfold
 338                     (mapconcat (function
 339                                 (lambda (chr)
 340                                   (cond
 341                                    ((eq chr ?\n) "")
 342                                    ((eq chr ?\t) " ")
 343                                    (t (char-to-string chr)))
 344                                   ))
 345                                (std11-unfold-string dest)
 346                                "")
 347                   dest)
 348                 ))))))
 349
 350
 351 ;;; @ lexical analyze
 352 ;;;
 353
 354 (defvar eword-lexical-analyze-cache nil)
 355 (defvar eword-lexical-analyze-cache-max 299
 356   "*Max position of eword-lexical-analyze-cache.
 357 It is max size of eword-lexical-analyze-cache - 1.")
 358
 359 (defcustom eword-lexical-analyzers
 360   '(eword-analyze-quoted-string
 361     eword-analyze-domain-literal
 362     eword-analyze-comment
 363     eword-analyze-spaces
 364     eword-analyze-special
 365     eword-analyze-encoded-word
 366     eword-analyze-atom)
 367   "*List of functions to return result of lexical analyze.
 368 Each function must have two arguments: STRING and MUST-UNFOLD.
 369 STRING is the target string to be analyzed.
 370 If MUST-UNFOLD is not nil, each function must unfold and eliminate
 371 bare-CR and bare-LF from the result even if they are included in
 372 content of the encoded-word.
 373 Each function must return nil if it can not analyze STRING as its
 374 format.
 375
 376 Previous function is preferred to next function.  If a function
 377 returns nil, next function is used.  Otherwise the return value will
 378 be the result."
 379   :group 'eword-decode
 380   :type '(repeat function))
 381
 382 (defun eword-analyze-quoted-string (string &optional must-unfold)
 383   (let ((p (std11-check-enclosure string ?\" ?\")))
 384     (if p
 385         (cons (cons 'quoted-string
 386                     (decode-mime-charset-string
 387                      (std11-strip-quoted-pair (substring string 1 (1- p)))
 388                      default-mime-charset))
 389               (substring string p))
 390       )))
 391
 392 (defun eword-analyze-domain-literal (string &optional must-unfold)
 393   (std11-analyze-domain-literal string))
 394
 395 (defun eword-analyze-comment (string &optional must-unfold)
 396   (let ((p (std11-check-enclosure string ?\( ?\) t)))
 397     (if p
 398         (cons (cons 'comment
 399                     (eword-decode-string
 400                      (decode-mime-charset-string
 401                       (std11-strip-quoted-pair (substring string 1 (1- p)))
 402                       default-mime-charset)
 403                      must-unfold))
 404               (substring string p))
 405       )))
 406
 407 (defun eword-analyze-spaces (string &optional must-unfold)
 408   (std11-analyze-spaces string))
 409
 410 (defun eword-analyze-special (string &optional must-unfold)
 411   (std11-analyze-special string))
 412
 413 (defun eword-analyze-encoded-word (string &optional must-unfold)
 414   (if (eq (string-match eword-encoded-word-regexp string) 0)
 415       (let ((end (match-end 0))
 416             (dest (eword-decode-encoded-word (match-string 0 string)
 417                                              must-unfold))
 418             )
 419         (setq string (substring string end))
 420         (while (eq (string-match `,(concat "[ \t\n]*\\("
 421                                            eword-encoded-word-regexp
 422                                            "\\)")
 423                                  string)
 424                    0)
 425           (setq end (match-end 0))
 426           (setq dest
 427                 (concat dest
 428                         (eword-decode-encoded-word (match-string 1 string)
 429                                                    must-unfold))
 430                 string (substring string end))
 431           )
 432         (cons (cons 'atom dest) string)
 433         )))
 434
 435 (defun eword-analyze-atom (string &optional must-unfold)
 436   (if (string-match std11-atom-regexp string)
 437       (let ((end (match-end 0)))
 438         (cons (cons 'atom (decode-mime-charset-string
 439                            (substring string 0 end)
 440                            default-mime-charset))
 441               (substring string end)
 442               ))))
 443
 444 (defun eword-lexical-analyze-internal (string must-unfold)
 445   (let (dest ret)
 446     (while (not (string-equal string ""))
 447       (setq ret
 448             (let ((rest eword-lexical-analyzers)
 449                   func r)
 450               (while (and (setq func (car rest))
 451                           (null (setq r (funcall func string must-unfold)))
 452                           )
 453                 (setq rest (cdr rest)))
 454               (or r `((error . ,string) . ""))
 455               ))
 456       (setq dest (cons (car ret) dest))
 457       (setq string (cdr ret))
 458       )
 459     (nreverse dest)
 460     ))
 461
 462 (defun eword-lexical-analyze (string &optional must-unfold)
 463   "Return lexical analyzed list corresponding STRING.
 464 It is like std11-lexical-analyze, but it decodes non us-ascii
 465 characters encoded as encoded-words or invalid \"raw\" format.
 466 \"Raw\" non us-ascii characters are regarded as variable
 467 `default-mime-charset'."
 468   (let ((key (copy-sequence string))
 469         ret)
 470     (set-text-properties 0 (length key) nil key)
 471     (if (setq ret (assoc key eword-lexical-analyze-cache))
 472         (cdr ret)
 473       (setq ret (eword-lexical-analyze-internal key must-unfold))
 474       (setq eword-lexical-analyze-cache
 475             (cons (cons key ret)
 476                   (last eword-lexical-analyze-cache
 477                         eword-lexical-analyze-cache-max)))
 478       ret)))
 479
 480 (defun eword-decode-token (token)
 481   (let ((type (car token))
 482         (value (cdr token)))
 483     (cond ((eq type 'quoted-string)
 484            (std11-wrap-as-quoted-string value))
 485           ((eq type 'comment)
 486            (concat "(" (std11-wrap-as-quoted-pairs value '(?( ?))) ")"))
 487           (t value))))
 488
 489 (defun eword-decode-and-fold-structured-field
 490   (string start-column &optional max-column must-unfold)
 491   "Decode and fold (fill) STRING as structured field body.
 492 It decodes non us-ascii characters in FULL-NAME encoded as
 493 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 494 characters are regarded as variable `default-mime-charset'.
 495
 496 If an encoded-word is broken or your emacs implementation can not
 497 decode the charset included in it, it is not decoded.
 498
 499 If MAX-COLUMN is omitted, `fill-column' is used.
 500
 501 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 502 if there are in decoded encoded-words (generated by bad manner MUA
 503 such as a version of Net$cape)."
 504   (or max-column
 505       (setq max-column fill-column))
 506   (let ((c start-column)
 507         (tokens (eword-lexical-analyze string must-unfold))
 508         (result "")
 509         token)
 510     (while (and (setq token (car tokens))
 511                 (setq tokens (cdr tokens)))
 512       (let* ((type (car token)))
 513         (if (eq type 'spaces)
 514             (let* ((next-token (car tokens))
 515                    (next-str (eword-decode-token next-token))
 516                    (next-len (string-width next-str))
 517                    (next-c (+ c next-len 1)))
 518               (if (< next-c max-column)
 519                   (setq result (concat result " " next-str)
 520                         c next-c)
 521                 (setq result (concat result "\n " next-str)
 522                       c (1+ next-len)))
 523               (setq tokens (cdr tokens))
 524               )
 525           (let* ((str (eword-decode-token token)))
 526             (setq result (concat result str)
 527                   c (+ c (string-width str)))
 528             ))))
 529     (if token
 530         (concat result (eword-decode-token token))
 531       result)))
 532
 533 (defun eword-decode-and-unfold-structured-field (string)
 534   "Decode and unfold STRING as structured field body.
 535 It decodes non us-ascii characters in FULL-NAME encoded as
 536 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 537 characters are regarded as variable `default-mime-charset'.
 538
 539 If an encoded-word is broken or your emacs implementation can not
 540 decode the charset included in it, it is not decoded."
 541   (let ((tokens (eword-lexical-analyze string 'must-unfold))
 542         (result ""))
 543     (while tokens
 544       (let* ((token (car tokens))
 545              (type (car token)))
 546         (setq tokens (cdr tokens))
 547         (setq result
 548               (if (eq type 'spaces)
 549                   (concat result " ")
 550                 (concat result (eword-decode-token token))
 551                 ))))
 552     result))
 553
 554 (defun eword-decode-structured-field-body (string &optional must-unfold
 555                                                   start-column max-column)
 556   "Decode non us-ascii characters in STRING as structured field body.
 557 STRING is unfolded before decoding.
 558
 559 It decodes non us-ascii characters in FULL-NAME encoded as
 560 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 561 characters are regarded as variable `default-mime-charset'.
 562
 563 If an encoded-word is broken or your emacs implementation can not
 564 decode the charset included in it, it is not decoded.
 565
 566 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 567 if there are in decoded encoded-words (generated by bad manner MUA
 568 such as a version of Net$cape)."
 569   (if start-column
 570       ;; fold with max-column
 571       (eword-decode-and-fold-structured-field
 572        string start-column max-column must-unfold)
 573     ;; Don't fold
 574     (mapconcat (function eword-decode-token)
 575                (eword-lexical-analyze string must-unfold)
 576                "")
 577     ))
 578
 579 (defun eword-decode-unstructured-field-body (string &optional must-unfold)
 580   "Decode non us-ascii characters in STRING as unstructured field body.
 581 STRING is unfolded before decoding.
 582
 583 It decodes non us-ascii characters in FULL-NAME encoded as
 584 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 585 characters are regarded as variable `default-mime-charset'.
 586
 587 If an encoded-word is broken or your emacs implementation can not
 588 decode the charset included in it, it is not decoded.
 589
 590 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 591 if there are in decoded encoded-words (generated by bad manner MUA
 592 such as a version of Net$cape)."
 593   (eword-decode-string
 594    (decode-mime-charset-string string default-mime-charset)
 595    must-unfold))
 596
 597 (defun eword-extract-address-components (string)
 598   "Extract full name and canonical address from STRING.
 599 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
 600 If no name can be extracted, FULL-NAME will be nil.
 601 It decodes non us-ascii characters in FULL-NAME encoded as
 602 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 603 characters are regarded as variable `default-mime-charset'."
 604   (let* ((structure (car (std11-parse-address
 605                           (eword-lexical-analyze
 606                            (std11-unfold-string string) 'must-unfold))))
 607          (phrase  (std11-full-name-string structure))
 608          (address (std11-address-string structure))
 609          )
 610     (list phrase address)
 611     ))
 612
 613
 614 ;;; @ end
 615 ;;;
 616
 617 (provide 'eword-decode)
 618
 619 ;;; eword-decode.el ends here