eword-decode.el

   1 ;;; eword-decode.el --- RFC 2047 based encoded-word decoder for GNU Emacs
   2
   3 ;; Copyright (C) 1995,1996,1997,1998 Free Software Foundation, Inc.
   4
   5 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
   6 ;;         MORIOKA Tomohiko <morioka@jaist.ac.jp>
   7 ;;         Tanaka Akira <akr@jaist.ac.jp>
   8 ;; Maintainer: Tanaka Akira <akr@jaist.ac.jp>
   9 ;; Created: 1995/10/03
  10 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
  11 ;;      Renamed: 1993/06/03 to tiny-mime.el
  12 ;;      Renamed: 1995/10/03 from tiny-mime.el (split off encoder)
  13 ;;      Renamed: 1997/02/22 from tm-ew-d.el
  14 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
  15
  16 ;; This file is part of FLAM (Faithful Library About MIME).
  17
  18 ;; This program is free software; you can redistribute it and/or
  19 ;; modify it under the terms of the GNU General Public License as
  20 ;; published by the Free Software Foundation; either version 2, or (at
  21 ;; your option) any later version.
  22
  23 ;; This program is distributed in the hope that it will be useful, but
  24 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
  25 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26 ;; General Public License for more details.
  27
  28 ;; You should have received a copy of the GNU General Public License
  29 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  30 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  31 ;; Boston, MA 02111-1307, USA.
  32
  33 ;;; Code:
  34
  35 (require 'std11)
  36 (require 'mel)
  37 (require 'mime-def)
  38
  39 (defgroup eword-decode nil
  40   "Encoded-word decoding"
  41   :group 'mime)
  42
  43
  44 ;;; @ variables
  45 ;;;
  46
  47 (defcustom eword-decode-sticked-encoded-word nil
  48   "*If non-nil, decode encoded-words sticked on atoms,
  49 other encoded-words, etc.
  50 however this behaviour violates RFC2047."
  51   :group 'eword-decode
  52   :type 'boolean)
  53
  54 (defcustom eword-decode-quoted-encoded-word nil
  55   "*If non-nil, decode encoded-words in quoted-string
  56 however this behaviour violates RFC2047."
  57   :group 'eword-decode
  58   :type 'boolean)
  59
  60
  61 ;;; @ MIME encoded-word definition
  62 ;;;
  63
  64 (defconst eword-encoded-word-prefix-regexp
  65   (concat (regexp-quote "=?")
  66           "\\(" mime-charset-regexp "\\)"
  67           (regexp-quote "?")
  68           "\\(B\\|Q\\)"
  69           (regexp-quote "?")))
  70 (defconst eword-encoded-word-suffix-regexp
  71   (regexp-quote "?="))
  72
  73 (defconst eword-encoded-text-in-unstructured-regexp "[!->@-~]+")
  74 (defconst eword-encoded-word-in-unstructured-regexp
  75   (concat eword-encoded-word-prefix-regexp
  76           "\\(" eword-encoded-text-in-unstructured-regexp "\\)"
  77           eword-encoded-word-suffix-regexp))
  78 (defconst eword-after-encoded-word-in-unstructured-regexp "\\([ \t]\\|$\\)")
  79
  80 (defconst eword-encoded-text-in-phrase-regexp "[-A-Za-z0-9!*+/=_]+")
  81 (defconst eword-encoded-word-in-phrase-regexp
  82   (concat eword-encoded-word-prefix-regexp
  83           "\\(" eword-encoded-text-in-phrase-regexp "\\)"
  84           eword-encoded-word-suffix-regexp))
  85 (defconst eword-after-encoded-word-in-phrase-regexp "\\([ \t]\\|$\\)")
  86
  87 (defconst eword-encoded-text-in-comment-regexp "[]!-'*->@-[^-~]+")
  88 (defconst eword-encoded-word-in-comment-regexp
  89   (concat eword-encoded-word-prefix-regexp
  90           "\\(" eword-encoded-text-in-comment-regexp "\\)"
  91           eword-encoded-word-suffix-regexp))
  92 (defconst eword-after-encoded-word-in-comment-regexp "\\([ \t()\\\\]\\|$\\)")
  93
  94 (defconst eword-encoded-text-in-quoted-string-regexp "[]!#->@-[^-~]+")
  95 (defconst eword-encoded-word-in-quoted-string-regexp
  96   (concat eword-encoded-word-prefix-regexp
  97           "\\(" eword-encoded-text-in-quoted-string-regexp "\\)"
  98           eword-encoded-word-suffix-regexp))
  99 (defconst eword-after-encoded-word-in-quoted-string-regexp "\\([ \t\"\\\\]\\|$\\)")
 100
 101 ; obsolete
 102 (defconst eword-encoded-text-regexp eword-encoded-text-in-unstructured-regexp)
 103 (defconst eword-encoded-word-regexp eword-encoded-word-in-unstructured-regexp)
 104
 105
 106 ;;; @@ Base64
 107 ;;;
 108
 109 (defconst base64-token-regexp "[A-Za-z0-9+/]")
 110 (defconst base64-token-padding-regexp "[A-Za-z0-9+/=]")
 111
 112 (defconst eword-B-encoded-text-regexp
 113   (concat "\\(\\("
 114           base64-token-regexp
 115           base64-token-regexp
 116           base64-token-regexp
 117           base64-token-regexp
 118           "\\)*"
 119           base64-token-regexp
 120           base64-token-regexp
 121           base64-token-padding-regexp
 122           base64-token-padding-regexp
 123           "\\)"))
 124
 125 ;; (defconst eword-B-encoding-and-encoded-text-regexp
 126 ;;   (concat "\\(B\\)\\?" eword-B-encoded-text-regexp))
 127
 128
 129 ;;; @@ Quoted-Printable
 130 ;;;
 131
 132 (defconst eword-Q-encoded-text-regexp
 133   (concat "\\([^=?]\\|" quoted-printable-octet-regexp "\\)+"))
 134 ;; (defconst eword-Q-encoding-and-encoded-text-regexp
 135 ;;   (concat "\\(Q\\)\\?" eword-Q-encoded-text-regexp))
 136
 137
 138 ;;; @ internal utilities
 139 ;;;
 140
 141 (defun eword-decode-first-encoded-words (string
 142                                          eword-regexp
 143                                          after-regexp
 144                                          &optional must-unfold)
 145   "Decode MIME encoded-words in beginning of STRING.
 146
 147 EWORD-REGEXP is the regexp that matches a encoded-word.
 148 Usual value is
 149 eword-encoded-word-in-unstructured-regexp,
 150 eword-encoded-text-in-phrase-regexp,
 151 eword-encoded-word-in-comment-regexp or
 152 eword-encoded-word-in-quoted-string-regexp.
 153
 154 AFTER-REGEXP is the regexp that matches a after encoded-word.
 155 Usual value is
 156 eword-after-encoded-word-in-unstructured-regexp,
 157 eword-after-encoded-text-in-phrase-regexp,
 158 eword-after-encoded-word-in-comment-regexp or
 159 eword-after-encoded-word-in-quoted-string-regexp.
 160
 161 If beginning of STRING matches EWORD-REGEXP with AFTER-REGEXP,
 162 returns a cons cell of decoded string(sequence of characters) and
 163 the rest(sequence of octets).
 164
 165 If beginning of STRING does not matches EWORD-REGEXP and AFTER-REGEXP,
 166 returns nil.
 167
 168 If an encoded-word is broken or your emacs implementation can not
 169 decode the charset included in it, it is returned in decoded part
 170 as encoded-word form.
 171
 172 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 173 if there are in decoded encoded-words (generated by bad manner MUA
 174 such as a version of Net$cape)."
 175   (if eword-decode-sticked-encoded-word (setq after-regexp ""))
 176   (let* ((between-ewords-regexp
 177            (if eword-decode-sticked-encoded-word
 178              "\\(\n?[ \t]\\)*"
 179              "\\(\n?[ \t]\\)+"))
 180          (between-ewords-eword-after-regexp
 181            (concat "\\`\\(" between-ewords-regexp "\\)"
 182                       "\\(" eword-regexp "\\)"
 183                       after-regexp))
 184          (eword-after-regexp
 185            (concat "\\`\\(" eword-regexp "\\)" after-regexp))
 186          (src string)   ; sequence of octets.
 187          (dst ""))      ; sequence of characters.
 188     (if (string-match eword-after-regexp src)
 189       (let* (p
 190              (q (match-end 1))
 191              (ew (substring src 0 q))
 192              (dw (eword-decode-encoded-word ew must-unfold)))
 193         (setq dst (concat dst dw)
 194               src (substring src q))
 195         (if (not (string= ew dw))
 196           (progn
 197             (while
 198               (and
 199                 (string-match between-ewords-eword-after-regexp src)
 200                 (progn
 201                   (setq p (match-end 1)
 202                         q (match-end 3)
 203                         ew (substring src p q)
 204                         dw (eword-decode-encoded-word ew must-unfold))
 205                   (if (string= ew dw)
 206                     (progn
 207                       (setq dst (concat dst (substring src 0 q))
 208                             src (substring src q))
 209                       nil)
 210                     t)))
 211               (setq dst (concat dst dw)
 212                     src (substring src q)))))
 213         (cons dst src))
 214       nil)))
 215
 216 (defun eword-decode-entire-string (string
 217                                    eword-regexp
 218                                    after-regexp
 219                                    safe-regexp
 220                                    escape ; ?\\ or nil.
 221                                    delimiters ; list of chars.
 222                                    must-unfold
 223                                    code-conversion)
 224   (if (and code-conversion
 225            (not (mime-charset-to-coding-system code-conversion)))
 226       (setq code-conversion default-mime-charset))
 227   (let ((equal-safe-regexp (concat "\\`=?" safe-regexp))
 228         (dst "")
 229         (buf "")
 230         (src string)
 231         (ew-enable t))
 232     (while (< 0 (length src))
 233       (let ((ch (aref src 0))
 234             (decoded (and
 235                         ew-enable
 236                         (eword-decode-first-encoded-words src
 237                           eword-regexp after-regexp must-unfold))))
 238         (if (and (not (string= buf ""))
 239                  (or decoded (memq ch delimiters)))
 240           (setq dst (concat dst
 241                       (std11-wrap-as-quoted-pairs
 242                         (decode-mime-charset-string buf code-conversion)
 243                         delimiters))
 244                 buf ""))
 245         (cond
 246           (decoded
 247             (setq dst (concat dst
 248                         (std11-wrap-as-quoted-pairs
 249                           (car decoded)
 250                           delimiters))
 251                   src (cdr decoded)))
 252           ((memq ch delimiters)
 253             (setq dst (concat dst (list ch))
 254                   src (substring src 1)
 255                   ew-enable t))
 256           ((eq ch escape)
 257             (setq buf (concat buf (list (aref src 1)))
 258                   src (substring src 2)
 259                   ew-enable t))
 260           ((string-match "\\`[ \t\n]+" src)
 261             (setq buf (concat buf (substring src 0 (match-end 0)))
 262                   src (substring src (match-end 0))
 263                   ew-enable t))
 264           ((and (string-match equal-safe-regexp src)
 265                 (< 0 (match-end 0)))
 266             (setq buf (concat buf (substring src 0 (match-end 0)))
 267                   src (substring src (match-end 0))
 268                   ew-enable eword-decode-sticked-encoded-word))
 269           (t (error "something wrong")))))
 270     (if (not (string= buf ""))
 271       (setq dst (concat dst
 272                   (std11-wrap-as-quoted-pairs
 273                     (decode-mime-charset-string buf code-conversion)
 274                     delimiters))))
 275     dst))
 276
 277
 278 ;;; @ for string
 279 ;;;
 280
 281 (defun eword-decode-unstructured (string code-conversion &optional must-unfold)
 282   (eword-decode-entire-string
 283     string
 284     eword-encoded-word-in-unstructured-regexp
 285     eword-after-encoded-word-in-unstructured-regexp
 286     "[^ \t\n=]*"
 287     nil
 288     nil
 289     must-unfold
 290     code-conversion))
 291
 292 (defun eword-decode-comment (string code-conversion &optional must-unfold)
 293   (eword-decode-entire-string
 294     string
 295     eword-encoded-word-in-comment-regexp
 296     eword-after-encoded-word-in-comment-regexp
 297     "[^ \t\n()\\\\=]*"
 298     ?\\
 299     '(?\( ?\))
 300     must-unfold
 301     code-conversion))
 302
 303 (defun eword-decode-quoted-string (string code-conversion &optional must-unfold)
 304   (eword-decode-entire-string
 305     string
 306     eword-encoded-word-in-quoted-string-regexp
 307     eword-after-encoded-word-in-quoted-string-regexp
 308     "[^ \t\n\"\\\\=]*"
 309     ?\\
 310     '(?\")
 311     must-unfold
 312     code-conversion))
 313
 314 (defun eword-decode-string (string &optional must-unfold code-conversion)
 315   "Decode MIME encoded-words in STRING.
 316
 317 STRING is unfolded before decoding.
 318
 319 If an encoded-word is broken or your emacs implementation can not
 320 decode the charset included in it, it is not decoded.
 321
 322 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 323 if there are in decoded encoded-words (generated by bad manner MUA
 324 such as a version of Net$cape).
 325
 326 If CODE-CONVERSION is nil, it decodes only encoded-words.  If it is
 327 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
 328 Otherwise it decodes non-ASCII bit patterns as the
 329 default-mime-charset."
 330   (eword-decode-unstructured
 331     (std11-unfold-string string)
 332     code-conversion
 333     must-unfold))
 334
 335
 336 ;;; @ for region
 337 ;;;
 338
 339 (defun eword-decode-region (start end &optional unfolding must-unfold
 340                                                 code-conversion)
 341   "Decode MIME encoded-words in region between START and END.
 342
 343 If UNFOLDING is not nil, it unfolds before decoding.
 344
 345 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 346 if there are in decoded encoded-words (generated by bad manner MUA
 347 such as a version of Net$cape).
 348
 349 If CODE-CONVERSION is nil, it decodes only encoded-words.  If it is
 350 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
 351 Otherwise it decodes non-ASCII bit patterns as the
 352 default-mime-charset."
 353   (interactive "*r")
 354   (save-excursion
 355     (save-restriction
 356       (narrow-to-region start end)
 357       (if unfolding
 358           (eword-decode-unfold)
 359         )
 360       (let ((str (eword-decode-unstructured
 361                    (buffer-substring (point-min) (point-max))
 362                    code-conversion
 363                    must-unfold)))
 364         (delete-region (point-min) (point-max))
 365         (insert str)))))
 366
 367
 368 ;;; @ for message header
 369 ;;;
 370
 371 (defcustom eword-decode-ignored-field-list
 372   '(Newsgroups Path Lines Nntp-Posting-Host Received Message-Id Date)
 373   "*List of field-names to be ignored when decoding.
 374 Each field name must be symbol."
 375   :group 'eword-decode
 376   :type '(repeat symbol))
 377
 378 (defcustom eword-decode-structured-field-list
 379   '(Reply-To Resent-Reply-To From Resent-From Sender Resent-Sender
 380              To Resent-To Cc Resent-Cc Bcc Resent-Bcc Dcc
 381              Mime-Version Content-Type Content-Transfer-Encoding
 382              Content-Disposition)
 383   "*List of field-names to decode as structured field.
 384 Each field name must be symbol."
 385   :group 'eword-decode
 386   :type '(repeat symbol))
 387
 388 (defun eword-decode-header (&optional code-conversion separator)
 389   "Decode MIME encoded-words in header fields.
 390 If CODE-CONVERSION is nil, it decodes only encoded-words.  If it is
 391 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
 392 Otherwise it decodes non-ASCII bit patterns as the
 393 default-mime-charset.
 394 If SEPARATOR is not nil, it is used as header separator."
 395   (interactive "*")
 396   (if (and code-conversion
 397            (not (mime-charset-to-coding-system code-conversion)))
 398       (setq code-conversion default-mime-charset))
 399   (save-excursion
 400     (save-restriction
 401       (std11-narrow-to-header separator)
 402       (if code-conversion
 403           (let (beg p end field-name len)
 404             (goto-char (point-min))
 405             (while (re-search-forward std11-field-head-regexp nil t)
 406               (setq beg (match-beginning 0)
 407                     p (match-end 0)
 408                     field-name (buffer-substring beg (1- p))
 409                     len (string-width field-name)
 410                     field-name (intern (capitalize field-name))
 411                     end (std11-field-end))
 412               (cond ((memq field-name eword-decode-ignored-field-list)
 413                      ;; Don't decode
 414                      )
 415                     ((memq field-name eword-decode-structured-field-list)
 416                      ;; Decode as structured field
 417                      (let ((body (buffer-substring p end)))
 418                        (delete-region p end)
 419                        (insert (eword-decode-and-fold-structured-field
 420                                 body (1+ len)))
 421                        ))
 422                     (t
 423                      ;; Decode as unstructured field
 424                      (save-restriction
 425                        (narrow-to-region beg (1+ end))
 426                        (goto-char p)
 427                        (eword-decode-region beg (point-max) 'unfold nil
 428                          code-conversion)
 429                        (goto-char (point-max))
 430                        )))))
 431         (eword-decode-region (point-min) (point-max) t nil nil)
 432         ))))
 433
 434 (defun eword-decode-unfold ()
 435   (goto-char (point-min))
 436   (let (field beg end)
 437     (while (re-search-forward std11-field-head-regexp nil t)
 438       (setq beg (match-beginning 0)
 439             end (std11-field-end))
 440       (setq field (buffer-substring beg end))
 441       (if (string-match eword-encoded-word-regexp field)
 442           (save-restriction
 443             (narrow-to-region (goto-char beg) end)
 444             (while (re-search-forward "\n\\([ \t]\\)" nil t)
 445               (replace-match (match-string 1))
 446               )
 447             (goto-char (point-max))
 448             ))
 449       )))
 450
 451
 452 ;;; @ encoded-word decoder
 453 ;;;
 454
 455 (defvar eword-decode-encoded-word-error-handler
 456   'eword-decode-encoded-word-default-error-handler)
 457
 458 (defvar eword-warning-face nil
 459   "Face used for invalid encoded-word.")
 460
 461 (defun eword-decode-encoded-word-default-error-handler (word signal)
 462   (and (add-text-properties 0 (length word)
 463                             (and eword-warning-face
 464                                  (list 'face eword-warning-face))
 465                             word)
 466        word))
 467
 468 (defun eword-decode-encoded-word (word &optional must-unfold)
 469   "Decode WORD if it is an encoded-word.
 470
 471 If your emacs implementation can not decode the charset of WORD, it
 472 returns WORD.  Similarly the encoded-word is broken, it returns WORD.
 473
 474 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 475 if there are in decoded encoded-word (generated by bad manner MUA such
 476 as a version of Net$cape)."
 477   (or (if (string-match eword-encoded-word-regexp word)
 478           (let ((charset
 479                  (substring word (match-beginning 1) (match-end 1))
 480                  )
 481                 (encoding
 482                  (upcase
 483                   (substring word (match-beginning 2) (match-end 2))
 484                   ))
 485                 (text
 486                  (substring word (match-beginning 3) (match-end 3))
 487                  ))
 488             (condition-case err
 489                 (eword-decode-encoded-text charset encoding text must-unfold)
 490               (error
 491                (funcall eword-decode-encoded-word-error-handler word err)
 492                ))
 493             ))
 494       word))
 495
 496
 497 ;;; @ encoded-text decoder
 498 ;;;
 499
 500 (defun eword-decode-encoded-text (charset encoding string
 501                                           &optional must-unfold)
 502   "Decode STRING as an encoded-text.
 503
 504 If your emacs implementation can not decode CHARSET, it returns nil.
 505
 506 If ENCODING is not \"B\" or \"Q\", it occurs error.
 507 So you should write error-handling code if you don't want break by errors.
 508
 509 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 510 if there are in decoded encoded-text (generated by bad manner MUA such
 511 as a version of Net$cape)."
 512   (let ((cs (mime-charset-to-coding-system charset)))
 513     (if cs
 514         (let ((dest
 515                (cond
 516                 ((string-equal "B" encoding)
 517                  (if (and (string-match eword-B-encoded-text-regexp string)
 518                           (string-equal string (match-string 0 string)))
 519                      (base64-decode-string string)
 520                    (error "Invalid encoded-text %s" string)))
 521                 ((string-equal "Q" encoding)
 522                  (if (and (string-match eword-Q-encoded-text-regexp string)
 523                           (string-equal string (match-string 0 string)))
 524                      (q-encoding-decode-string string)
 525                    (error "Invalid encoded-text %s" string)))
 526                 (t
 527                  (error "Invalid encoding %s" encoding)
 528                  )))
 529               )
 530           (when dest
 531             (setq dest (decode-mime-charset-string dest charset))
 532             (if must-unfold
 533                 (mapconcat (function
 534                             (lambda (chr)
 535                               (cond ((eq chr ?\n) "")
 536                                     ((eq chr ?\t) " ")
 537                                     (t (char-to-string chr)))
 538                               ))
 539                            (std11-unfold-string dest)
 540                            "")
 541               dest))))))
 542
 543
 544 ;;; @ lexical analyze
 545 ;;;
 546
 547 (defvar eword-lexical-analyze-cache nil)
 548 (defvar eword-lexical-analyze-cache-max 299
 549   "*Max position of eword-lexical-analyze-cache.
 550 It is max size of eword-lexical-analyze-cache - 1.")
 551
 552 (defcustom eword-lexical-analyzers
 553   '(eword-analyze-quoted-string
 554     eword-analyze-domain-literal
 555     eword-analyze-comment
 556     eword-analyze-spaces
 557     eword-analyze-special
 558     eword-analyze-encoded-word
 559     eword-analyze-atom)
 560   "*List of functions to return result of lexical analyze.
 561 Each function must have two arguments: STRING and MUST-UNFOLD.
 562 STRING is the target string to be analyzed.
 563 If MUST-UNFOLD is not nil, each function must unfold and eliminate
 564 bare-CR and bare-LF from the result even if they are included in
 565 content of the encoded-word.
 566 Each function must return nil if it can not analyze STRING as its
 567 format.
 568
 569 Previous function is preferred to next function.  If a function
 570 returns nil, next function is used.  Otherwise the return value will
 571 be the result."
 572   :group 'eword-decode
 573   :type '(repeat function))
 574
 575 (defun eword-analyze-quoted-string (string &optional must-unfold)
 576   (let ((p (std11-check-enclosure string ?\" ?\")))
 577     (if p
 578         (cons (cons 'quoted-string
 579                     (if eword-decode-quoted-encoded-word
 580                         (std11-wrap-as-quoted-string
 581                          (eword-decode-quoted-string
 582                           (substring string 1 (1- p))
 583                           default-mime-charset))
 584                       (std11-wrap-as-quoted-string
 585                        (decode-mime-charset-string
 586                         (std11-strip-quoted-pair (substring string 1 (1- p)))
 587                         default-mime-charset))))
 588               (substring string p)))
 589     ))
 590
 591 (defun eword-analyze-domain-literal (string &optional must-unfold)
 592   (std11-analyze-domain-literal string))
 593
 594 (defun eword-analyze-comment (string &optional must-unfold)
 595   (let ((len (length string)))
 596     (if (and (< 0 len) (eq (aref string 0) ?\())
 597         (let ((p 0))
 598           (while (and p (< p len) (eq (aref string p) ?\())
 599             (setq p (std11-check-enclosure string ?\( ?\) t p)))
 600           (setq p (or p len))
 601           (cons (cons 'comment
 602                       (eword-decode-comment
 603                         (std11-unfold-string (substring string 0 p))
 604                         default-mime-charset))
 605                 (substring string p)))
 606       nil)))
 607
 608 (defun eword-analyze-spaces (string &optional must-unfold)
 609   (std11-analyze-spaces string))
 610
 611 (defun eword-analyze-special (string &optional must-unfold)
 612   (std11-analyze-special string))
 613
 614 (defun eword-analyze-encoded-word (string &optional must-unfold)
 615   (let ((decoded (eword-decode-first-encoded-words
 616                   string
 617                   eword-encoded-word-in-phrase-regexp
 618                   eword-after-encoded-word-in-phrase-regexp
 619                   must-unfold)))
 620     (if decoded
 621         (let ((s (car decoded)))
 622           (while (or (string-match std11-atom-regexp s)
 623                      (string-match std11-spaces-regexp s))
 624             (setq s (substring s (match-end 0))))
 625           (if (= (length s) 0)
 626               (cons (cons 'atom (car decoded)) (cdr decoded))
 627             (cons (cons 'quoted-string
 628                         (std11-wrap-as-quoted-string (car decoded)))
 629                   (cdr decoded)))))))
 630
 631 (defun eword-analyze-atom (string &optional must-unfold)
 632   (if (let ((enable-multibyte-characters nil))
 633         (string-match std11-atom-regexp string))
 634       (let ((end (match-end 0)))
 635         (if (and eword-decode-sticked-encoded-word
 636                  (string-match eword-encoded-word-in-phrase-regexp
 637                                (substring string 0 end))
 638                  (< 0 (match-beginning 0)))
 639             (setq end (match-beginning 0)))
 640         (cons (cons 'atom (decode-mime-charset-string
 641                            (substring string 0 end)
 642                            default-mime-charset))
 643               (substring string end)
 644               ))))
 645
 646 (defun eword-lexical-analyze-internal (string must-unfold)
 647   (let ((last 'eword-analyze-spaces)
 648         dest ret)
 649     (while (not (string-equal string ""))
 650       (setq ret
 651             (let ((rest eword-lexical-analyzers)
 652                   func r)
 653               (while (and (setq func (car rest))
 654                           (or
 655                            (and
 656                             (not eword-decode-sticked-encoded-word)
 657                             (not (eq last 'eword-analyze-spaces))
 658                             (eq func 'eword-analyze-encoded-word))
 659                            (null (setq r (funcall func string must-unfold))))
 660                           )
 661                 (setq rest (cdr rest)))
 662               (setq last func)
 663               (or r `((error . ,string) . ""))
 664               ))
 665       (setq dest (cons (car ret) dest))
 666       (setq string (cdr ret))
 667       )
 668     (nreverse dest)
 669     ))
 670
 671 (defun eword-lexical-analyze (string &optional must-unfold)
 672   "Return lexical analyzed list corresponding STRING.
 673 It is like std11-lexical-analyze, but it decodes non us-ascii
 674 characters encoded as encoded-words or invalid \"raw\" format.
 675 \"Raw\" non us-ascii characters are regarded as variable
 676 `default-mime-charset'."
 677   (let* ((str (copy-sequence string))
 678          (key (cons str (cons default-mime-charset must-unfold)))
 679          ret)
 680     (set-text-properties 0 (length str) nil str)
 681     (if (setq ret (assoc key eword-lexical-analyze-cache))
 682         (cdr ret)
 683       (setq ret (eword-lexical-analyze-internal str must-unfold))
 684       (setq eword-lexical-analyze-cache
 685             (cons (cons key ret)
 686                   (last eword-lexical-analyze-cache
 687                         eword-lexical-analyze-cache-max)))
 688       ret)))
 689
 690 (defun eword-decode-token (token)
 691   (cdr token))
 692
 693 (defun eword-decode-and-fold-structured-field
 694   (string start-column &optional max-column must-unfold)
 695   "Decode and fold (fill) STRING as structured field body.
 696 It decodes non us-ascii characters in FULL-NAME encoded as
 697 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 698 characters are regarded as variable `default-mime-charset'.
 699
 700 If an encoded-word is broken or your emacs implementation can not
 701 decode the charset included in it, it is not decoded.
 702
 703 If MAX-COLUMN is omitted, `fill-column' is used.
 704
 705 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 706 if there are in decoded encoded-words (generated by bad manner MUA
 707 such as a version of Net$cape)."
 708   (or max-column
 709       (setq max-column fill-column))
 710   (let ((c start-column)
 711         (tokens (eword-lexical-analyze string must-unfold))
 712         (result "")
 713         token)
 714     (while (and (setq token (car tokens))
 715                 (setq tokens (cdr tokens)))
 716       (let* ((type (car token)))
 717         (if (eq type 'spaces)
 718             (let* ((next-token (car tokens))
 719                    (next-str (eword-decode-token next-token))
 720                    (next-len (string-width next-str))
 721                    (next-c (+ c next-len 1)))
 722               (if (< next-c max-column)
 723                   (setq result (concat result " " next-str)
 724                         c next-c)
 725                 (setq result (concat result "\n " next-str)
 726                       c (1+ next-len)))
 727               (setq tokens (cdr tokens))
 728               )
 729           (let* ((str (eword-decode-token token)))
 730             (setq result (concat result str)
 731                   c (+ c (string-width str)))
 732             ))))
 733     (if token
 734         (concat result (eword-decode-token token))
 735       result)))
 736
 737 (defun eword-decode-and-unfold-structured-field (string)
 738   "Decode and unfold STRING as structured field body.
 739 It decodes non us-ascii characters in FULL-NAME encoded as
 740 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 741 characters are regarded as variable `default-mime-charset'.
 742
 743 If an encoded-word is broken or your emacs implementation can not
 744 decode the charset included in it, it is not decoded."
 745   (let ((tokens (eword-lexical-analyze string 'must-unfold))
 746         (result ""))
 747     (while tokens
 748       (let* ((token (car tokens))
 749              (type (car token)))
 750         (setq tokens (cdr tokens))
 751         (setq result
 752               (if (eq type 'spaces)
 753                   (concat result " ")
 754                 (concat result (eword-decode-token token))
 755                 ))))
 756     result))
 757
 758 (defun eword-decode-structured-field-body (string &optional must-unfold
 759                                                   start-column max-column)
 760   "Decode non us-ascii characters in STRING as structured field body.
 761 STRING is unfolded before decoding.
 762
 763 It decodes non us-ascii characters in FULL-NAME encoded as
 764 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 765 characters are regarded as variable `default-mime-charset'.
 766
 767 If an encoded-word is broken or your emacs implementation can not
 768 decode the charset included in it, it is not decoded.
 769
 770 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 771 if there are in decoded encoded-words (generated by bad manner MUA
 772 such as a version of Net$cape)."
 773   (if start-column
 774       ;; fold with max-column
 775       (eword-decode-and-fold-structured-field
 776        string start-column max-column must-unfold)
 777     ;; Don't fold
 778     (mapconcat (function eword-decode-token)
 779                (eword-lexical-analyze string must-unfold)
 780                "")
 781     ))
 782
 783 (defun eword-decode-unstructured-field-body (string &optional must-unfold)
 784   "Decode non us-ascii characters in STRING as unstructured field body.
 785 STRING is unfolded before decoding.
 786
 787 It decodes non us-ascii characters in FULL-NAME encoded as
 788 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 789 characters are regarded as variable `default-mime-charset'.
 790
 791 If an encoded-word is broken or your emacs implementation can not
 792 decode the charset included in it, it is not decoded.
 793
 794 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 795 if there are in decoded encoded-words (generated by bad manner MUA
 796 such as a version of Net$cape)."
 797   (eword-decode-string string must-unfold default-mime-charset))
 798
 799 (defun eword-extract-address-components (string)
 800   "Extract full name and canonical address from STRING.
 801 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
 802 If no name can be extracted, FULL-NAME will be nil.
 803 It decodes non us-ascii characters in FULL-NAME encoded as
 804 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 805 characters are regarded as variable `default-mime-charset'."
 806   (let* ((structure (car (std11-parse-address
 807                           (eword-lexical-analyze
 808                            (std11-unfold-string string) 'must-unfold))))
 809          (phrase  (std11-full-name-string structure))
 810          (address (std11-address-string structure))
 811          )
 812     (list phrase address)
 813     ))
 814
 815
 816 ;;; @ end
 817 ;;;
 818
 819 (provide 'eword-decode)
 820
 821 ;;; eword-decode.el ends here