eword-decode.el

   1 ;;; eword-decode.el --- RFC 2047 based encoded-word decoder for GNU Emacs
   2
   3 ;; Copyright (C) 1995,1996,1997,1998 Free Software Foundation, Inc.
   4
   5 ;; Author: ENAMI Tsugutomo <enami@sys.ptg.sony.co.jp>
   6 ;;         MORIOKA Tomohiko <morioka@jaist.ac.jp>
   7 ;;         Tanaka Akira <akr@jaist.ac.jp>
   8 ;; Maintainer: Tanaka Akira <akr@jaist.ac.jp>
   9 ;; Created: 1995/10/03
  10 ;; Original: 1992/07/20 ENAMI Tsugutomo's `mime.el'.
  11 ;;      Renamed: 1993/06/03 to tiny-mime.el
  12 ;;      Renamed: 1995/10/03 from tiny-mime.el (split off encoder)
  13 ;;      Renamed: 1997/02/22 from tm-ew-d.el
  14 ;; Keywords: encoded-word, MIME, multilingual, header, mail, news
  15
  16 ;; This file is part of FLAM (Faithful Library About MIME).
  17
  18 ;; This program is free software; you can redistribute it and/or
  19 ;; modify it under the terms of the GNU General Public License as
  20 ;; published by the Free Software Foundation; either version 2, or (at
  21 ;; your option) any later version.
  22
  23 ;; This program is distributed in the hope that it will be useful, but
  24 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
  25 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26 ;; General Public License for more details.
  27
  28 ;; You should have received a copy of the GNU General Public License
  29 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  30 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  31 ;; Boston, MA 02111-1307, USA.
  32
  33 ;;; Code:
  34
  35 (require 'std11-parse)
  36 (require 'mel)
  37 (require 'mime-def)
  38
  39 (defgroup eword-decode nil
  40   "Encoded-word decoding"
  41   :group 'mime)
  42
  43
  44 ;;; @ variables
  45 ;;;
  46
  47 (defcustom eword-decode-sticked-encoded-word nil
  48   "*If non-nil, decode encoded-words sticked on atoms,
  49 other encoded-words, etc.
  50 however this behaviour violates RFC2047."
  51   :group 'eword-decode
  52   :type 'boolean)
  53
  54 (defcustom eword-decode-quoted-encoded-word nil
  55   "*If non-nil, decode encoded-words in quoted-string
  56 however this behaviour violates RFC2047."
  57   :group 'eword-decode
  58   :type 'boolean)
  59
  60
  61 ;;; @ MIME encoded-word definition
  62 ;;;
  63
  64 (defconst eword-encoded-word-prefix-regexp
  65   (concat (regexp-quote "=?")
  66           "\\(" mime-charset-regexp "\\)"
  67           (regexp-quote "?")
  68           "\\(B\\|Q\\)"
  69           (regexp-quote "?")))
  70 (defconst eword-encoded-word-suffix-regexp
  71   (regexp-quote "?="))
  72
  73 (defconst eword-encoded-text-in-unstructured-regexp "[!->@-~]+")
  74 (defconst eword-encoded-word-in-unstructured-regexp
  75   (concat eword-encoded-word-prefix-regexp
  76           "\\(" eword-encoded-text-in-unstructured-regexp "\\)"
  77           eword-encoded-word-suffix-regexp))
  78 (defconst eword-after-encoded-word-in-unstructured-regexp "\\([ \t]\\|$\\)")
  79
  80 (defconst eword-encoded-text-in-phrase-regexp "[-A-Za-z0-9!*+/=_]+")
  81 (defconst eword-encoded-word-in-phrase-regexp
  82   (concat eword-encoded-word-prefix-regexp
  83           "\\(" eword-encoded-text-in-phrase-regexp "\\)"
  84           eword-encoded-word-suffix-regexp))
  85 (defconst eword-after-encoded-word-in-phrase-regexp "\\([ \t(]\\|$\\)")
  86
  87 (defconst eword-encoded-text-in-comment-regexp "[]!-'*->@-[^-~]+")
  88 (defconst eword-encoded-word-in-comment-regexp
  89   (concat eword-encoded-word-prefix-regexp
  90           "\\(" eword-encoded-text-in-comment-regexp "\\)"
  91           eword-encoded-word-suffix-regexp))
  92 (defconst eword-after-encoded-word-in-comment-regexp "\\([ \t()\\\\]\\|$\\)")
  93
  94 (defconst eword-encoded-text-in-quoted-string-regexp "[]!#->@-[^-~]+")
  95 (defconst eword-encoded-word-in-quoted-string-regexp
  96   (concat eword-encoded-word-prefix-regexp
  97           "\\(" eword-encoded-text-in-quoted-string-regexp "\\)"
  98           eword-encoded-word-suffix-regexp))
  99 (defconst eword-after-encoded-word-in-quoted-string-regexp "\\([ \t\"\\\\]\\|$\\)")
 100
 101 ; obsolete
 102 (defconst eword-encoded-text-regexp eword-encoded-text-in-unstructured-regexp)
 103 (defconst eword-encoded-word-regexp eword-encoded-word-in-unstructured-regexp)
 104
 105
 106 ;;; @@ Base64
 107 ;;;
 108
 109 (defconst base64-token-regexp "[A-Za-z0-9+/]")
 110 (defconst base64-token-padding-regexp "[A-Za-z0-9+/=]")
 111
 112 (defconst eword-B-encoded-text-regexp
 113   (concat "\\(\\("
 114           base64-token-regexp
 115           base64-token-regexp
 116           base64-token-regexp
 117           base64-token-regexp
 118           "\\)*"
 119           base64-token-regexp
 120           base64-token-regexp
 121           base64-token-padding-regexp
 122           base64-token-padding-regexp
 123           "\\)"))
 124
 125 ;; (defconst eword-B-encoding-and-encoded-text-regexp
 126 ;;   (concat "\\(B\\)\\?" eword-B-encoded-text-regexp))
 127
 128
 129 ;;; @@ Quoted-Printable
 130 ;;;
 131
 132 (defconst quoted-printable-hex-chars "0123456789ABCDEF")
 133 (defconst quoted-printable-octet-regexp
 134   (concat "=[" quoted-printable-hex-chars
 135           "][" quoted-printable-hex-chars "]"))
 136
 137 (defconst eword-Q-encoded-text-regexp
 138   (concat "\\([^=?]\\|" quoted-printable-octet-regexp "\\)+"))
 139 ;; (defconst eword-Q-encoding-and-encoded-text-regexp
 140 ;;   (concat "\\(Q\\)\\?" eword-Q-encoded-text-regexp))
 141
 142
 143 ;;; @ internal utilities
 144 ;;;
 145
 146 (defun eword-decode-first-encoded-words (string
 147                                          eword-regexp
 148                                          after-regexp
 149                                          &optional must-unfold)
 150   "Decode MIME encoded-words in beginning of STRING.
 151
 152 EWORD-REGEXP is the regexp that matches a encoded-word.
 153 Usual value is
 154 eword-encoded-word-in-unstructured-regexp,
 155 eword-encoded-text-in-phrase-regexp,
 156 eword-encoded-word-in-comment-regexp or
 157 eword-encoded-word-in-quoted-string-regexp.
 158
 159 AFTER-REGEXP is the regexp that matches a after encoded-word.
 160 Usual value is
 161 eword-after-encoded-word-in-unstructured-regexp,
 162 eword-after-encoded-text-in-phrase-regexp,
 163 eword-after-encoded-word-in-comment-regexp or
 164 eword-after-encoded-word-in-quoted-string-regexp.
 165
 166 If beginning of STRING matches EWORD-REGEXP with AFTER-REGEXP,
 167 returns a cons cell of decoded string(sequence of characters) and
 168 the rest(sequence of octets).
 169
 170 If beginning of STRING does not matches EWORD-REGEXP and AFTER-REGEXP,
 171 returns nil.
 172
 173 If an encoded-word is broken or your emacs implementation can not
 174 decode the charset included in it, it is returned in decoded part
 175 as encoded-word form.
 176
 177 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 178 if there are in decoded encoded-words (generated by bad manner MUA
 179 such as a version of Net$cape)."
 180   (if eword-decode-sticked-encoded-word (setq after-regexp ""))
 181   (let* ((between-ewords-regexp
 182            (if eword-decode-sticked-encoded-word
 183              "\\(\n?[ \t]\\)*"
 184              "\\(\n?[ \t]\\)+"))
 185          (between-ewords-eword-after-regexp
 186            (concat "\\`\\(" between-ewords-regexp "\\)"
 187                       "\\(" eword-regexp "\\)"
 188                       after-regexp))
 189          (eword-after-regexp
 190            (concat "\\`\\(" eword-regexp "\\)" after-regexp))
 191          (src string)   ; sequence of octets.
 192          (dst ""))      ; sequence of characters.
 193     (if (string-match eword-after-regexp src)
 194       (let* (p
 195              (q (match-end 1))
 196              (ew (substring src 0 q))
 197              (dw (eword-decode-encoded-word ew must-unfold)))
 198         (setq dst (concat dst dw)
 199               src (substring src q))
 200         (if (not (string= ew dw))
 201           (progn
 202             (while
 203               (and
 204                 (string-match between-ewords-eword-after-regexp src)
 205                 (progn
 206                   (setq p (match-end 1)
 207                         q (match-end 3)
 208                         ew (substring src p q)
 209                         dw (eword-decode-encoded-word ew must-unfold))
 210                   (if (string= ew dw)
 211                     (progn
 212                       (setq dst (concat dst (substring src 0 q))
 213                             src (substring src q))
 214                       nil)
 215                     t)))
 216               (setq dst (concat dst dw)
 217                     src (substring src q)))))
 218         (cons dst src))
 219       nil)))
 220
 221 (defun eword-decode-entire-string (string
 222                                    eword-regexp
 223                                    after-regexp
 224                                    safe-regexp
 225                                    escape ; ?\\ or nil.
 226                                    delimiters ; list of chars.
 227                                    must-unfold
 228                                    code-conversion)
 229   (if (and code-conversion
 230            (not (mime-charset-to-coding-system code-conversion)))
 231       (setq code-conversion default-mime-charset))
 232   (let ((equal-safe-regexp (concat "\\`=?" safe-regexp))
 233         (dst "")
 234         (buf "")
 235         (src string)
 236         (ew-enable t))
 237     (while (< 0 (length src))
 238       (let ((ch (aref src 0))
 239             (decoded (and
 240                         ew-enable
 241                         (eword-decode-first-encoded-words src
 242                           eword-regexp after-regexp must-unfold))))
 243         (if (and (not (string= buf ""))
 244                  (or decoded (memq ch delimiters)))
 245           (setq dst (concat dst
 246                       (std11-wrap-as-quoted-pairs
 247                         (decode-mime-charset-string buf code-conversion)
 248                         delimiters))
 249                 buf ""))
 250         (cond
 251           (decoded
 252             (setq dst (concat dst
 253                         (std11-wrap-as-quoted-pairs
 254                           (car decoded)
 255                           delimiters))
 256                   src (cdr decoded)))
 257           ((memq ch delimiters)
 258             (setq dst (concat dst (list ch))
 259                   src (substring src 1)
 260                   ew-enable t))
 261           ((eq ch escape)
 262             (setq buf (concat buf (list (aref src 1)))
 263                   src (substring src 2)
 264                   ew-enable t))
 265           ((string-match "\\`[ \t\n]+" src)
 266             (setq buf (concat buf (substring src 0 (match-end 0)))
 267                   src (substring src (match-end 0))
 268                   ew-enable t))
 269           ((and (string-match equal-safe-regexp src)
 270                 (< 0 (match-end 0)))
 271             (setq buf (concat buf (substring src 0 (match-end 0)))
 272                   src (substring src (match-end 0))
 273                   ew-enable eword-decode-sticked-encoded-word))
 274           (t (error "something wrong")))))
 275     (if (not (string= buf ""))
 276       (setq dst (concat dst
 277                   (std11-wrap-as-quoted-pairs
 278                     (decode-mime-charset-string buf code-conversion)
 279                     delimiters))))
 280     dst))
 281
 282
 283 ;;; @ for string
 284 ;;;
 285
 286 (defun eword-decode-unstructured (string code-conversion &optional must-unfold)
 287   (eword-decode-entire-string
 288     string
 289     eword-encoded-word-in-unstructured-regexp
 290     eword-after-encoded-word-in-unstructured-regexp
 291     "[^ \t\n=]*"
 292     nil
 293     nil
 294     must-unfold
 295     code-conversion))
 296
 297 (defun eword-decode-comment (string code-conversion &optional must-unfold)
 298   (eword-decode-entire-string
 299     string
 300     eword-encoded-word-in-comment-regexp
 301     eword-after-encoded-word-in-comment-regexp
 302     "[^ \t\n()\\\\=]*"
 303     ?\\
 304     '(?\( ?\))
 305     must-unfold
 306     code-conversion))
 307
 308 (defun eword-decode-quoted-string (string code-conversion &optional must-unfold)
 309   (eword-decode-entire-string
 310     string
 311     eword-encoded-word-in-quoted-string-regexp
 312     eword-after-encoded-word-in-quoted-string-regexp
 313     "[^ \t\n\"\\\\=]*"
 314     ?\\
 315     '(?\")
 316     must-unfold
 317     code-conversion))
 318
 319 (defun eword-decode-string (string &optional must-unfold code-conversion)
 320   "Decode MIME encoded-words in STRING.
 321
 322 STRING is unfolded before decoding.
 323
 324 If an encoded-word is broken or your emacs implementation can not
 325 decode the charset included in it, it is not decoded.
 326
 327 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 328 if there are in decoded encoded-words (generated by bad manner MUA
 329 such as a version of Net$cape).
 330
 331 If CODE-CONVERSION is nil, it decodes only encoded-words.  If it is
 332 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
 333 Otherwise it decodes non-ASCII bit patterns as the
 334 default-mime-charset."
 335   (eword-decode-unstructured
 336     (std11-unfold-string string)
 337     code-conversion
 338     must-unfold))
 339
 340
 341 ;;; @ for region
 342 ;;;
 343
 344 (defun eword-decode-region (start end &optional unfolding must-unfold
 345                                                 code-conversion)
 346   "Decode MIME encoded-words in region between START and END.
 347
 348 If UNFOLDING is not nil, it unfolds before decoding.
 349
 350 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 351 if there are in decoded encoded-words (generated by bad manner MUA
 352 such as a version of Net$cape).
 353
 354 If CODE-CONVERSION is nil, it decodes only encoded-words.  If it is
 355 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
 356 Otherwise it decodes non-ASCII bit patterns as the
 357 default-mime-charset."
 358   (interactive "*r")
 359   (save-excursion
 360     (save-restriction
 361       (narrow-to-region start end)
 362       (if unfolding
 363           (eword-decode-unfold)
 364         )
 365       (let ((str (eword-decode-unstructured
 366                    (buffer-substring (point-min) (point-max))
 367                    code-conversion
 368                    must-unfold)))
 369         (delete-region (point-min) (point-max))
 370         (insert str)))))
 371
 372
 373 ;;; @ for message header
 374 ;;;
 375
 376 (defcustom eword-decode-ignored-field-list
 377   '(newsgroups path lines nntp-posting-host received message-id date)
 378   "*List of field-names to be ignored when decoding.
 379 Each field name must be symbol."
 380   :group 'eword-decode
 381   :type '(repeat symbol))
 382
 383 (defcustom eword-decode-structured-field-list
 384   '(reply-to resent-reply-to from resent-from sender resent-sender
 385              to resent-to cc resent-cc bcc resent-bcc dcc
 386              mime-version content-type content-transfer-encoding
 387              content-disposition)
 388   "*List of field-names to decode as structured field.
 389 Each field name must be symbol."
 390   :group 'eword-decode
 391   :type '(repeat symbol))
 392
 393 (defun eword-decode-header (&optional code-conversion separator)
 394   "Decode MIME encoded-words in header fields.
 395 If CODE-CONVERSION is nil, it decodes only encoded-words.  If it is
 396 mime-charset, it decodes non-ASCII bit patterns as the mime-charset.
 397 Otherwise it decodes non-ASCII bit patterns as the
 398 default-mime-charset.
 399 If SEPARATOR is not nil, it is used as header separator."
 400   (interactive "*")
 401   (if (and code-conversion
 402            (not (mime-charset-to-coding-system code-conversion)))
 403       (setq code-conversion default-mime-charset))
 404   (save-excursion
 405     (save-restriction
 406       (std11-narrow-to-header separator)
 407       (if code-conversion
 408           (let (beg p end field-name len)
 409             (goto-char (point-min))
 410             (while (re-search-forward std11-field-head-regexp nil t)
 411               (setq beg (match-beginning 0)
 412                     p (match-end 0)
 413                     field-name (buffer-substring beg (1- p))
 414                     len (string-width field-name)
 415                     field-name (intern (downcase field-name))
 416                     end (std11-field-end))
 417               (cond ((memq field-name eword-decode-ignored-field-list)
 418                      ;; Don't decode
 419                      )
 420                     ((memq field-name eword-decode-structured-field-list)
 421                      ;; Decode as structured field
 422                      (let ((body (buffer-substring p end)))
 423                        (delete-region p end)
 424                        (insert (eword-decode-and-fold-structured-field
 425                                 body (1+ len)))
 426                        ))
 427                     (t
 428                      ;; Decode as unstructured field
 429                      (save-restriction
 430                        (narrow-to-region beg (1+ end))
 431                        (goto-char p)
 432                        (eword-decode-region beg (point-max) 'unfold nil
 433                          code-conversion)
 434                        (goto-char (point-max))
 435                        )))))
 436         (eword-decode-region (point-min) (point-max) t nil nil)
 437         ))))
 438
 439 (defun eword-decode-unfold ()
 440   (goto-char (point-min))
 441   (let (field beg end)
 442     (while (re-search-forward std11-field-head-regexp nil t)
 443       (setq beg (match-beginning 0)
 444             end (std11-field-end))
 445       (setq field (buffer-substring beg end))
 446       (if (string-match eword-encoded-word-regexp field)
 447           (save-restriction
 448             (narrow-to-region (goto-char beg) end)
 449             (while (re-search-forward "\n\\([ \t]\\)" nil t)
 450               (replace-match (match-string 1))
 451               )
 452             (goto-char (point-max))
 453             ))
 454       )))
 455
 456
 457 ;;; @ encoded-word decoder
 458 ;;;
 459
 460 (defvar eword-warning-face nil "Face used for invalid encoded-word.")
 461
 462 (defun eword-decode-encoded-word (word &optional must-unfold)
 463   "Decode WORD if it is an encoded-word.
 464
 465 If your emacs implementation can not decode the charset of WORD, it
 466 returns WORD.  Similarly the encoded-word is broken, it returns WORD.
 467
 468 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 469 if there are in decoded encoded-word (generated by bad manner MUA such
 470 as a version of Net$cape)."
 471   (or (if (string-match eword-encoded-word-regexp word)
 472           (let ((charset
 473                  (substring word (match-beginning 1) (match-end 1))
 474                  )
 475                 (encoding
 476                  (upcase
 477                   (substring word (match-beginning 2) (match-end 2))
 478                   ))
 479                 (text
 480                  (substring word (match-beginning 3) (match-end 3))
 481                  ))
 482             (condition-case err
 483                 (eword-decode-encoded-text charset encoding text must-unfold)
 484               (error
 485                (and
 486                 (add-text-properties 0 (length word)
 487                                      (and eword-warning-face
 488                                           (list 'face eword-warning-face))
 489                                      word)
 490                 word)))
 491             ))
 492       word))
 493
 494
 495 ;;; @ encoded-text decoder
 496 ;;;
 497
 498 (defun eword-decode-encoded-text (charset encoding string
 499                                           &optional must-unfold)
 500   "Decode STRING as an encoded-text.
 501
 502 If your emacs implementation can not decode CHARSET, it returns nil.
 503
 504 If ENCODING is not \"B\" or \"Q\", it occurs error.
 505 So you should write error-handling code if you don't want break by errors.
 506
 507 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 508 if there are in decoded encoded-text (generated by bad manner MUA such
 509 as a version of Net$cape)."
 510   (let ((cs (mime-charset-to-coding-system charset)))
 511     (if cs
 512         (let ((dest
 513                (cond
 514                 ((string-equal "B" encoding)
 515                  (if (and (string-match eword-B-encoded-text-regexp string)
 516                           (string-equal string (match-string 0 string)))
 517                      (base64-decode-string string)
 518                    (error "Invalid encoded-text %s" string)))
 519                 ((string-equal "Q" encoding)
 520                  (if (and (string-match eword-Q-encoded-text-regexp string)
 521                           (string-equal string (match-string 0 string)))
 522                      (q-encoding-decode-string string)
 523                    (error "Invalid encoded-text %s" string)))
 524                 (t
 525                  (error "Invalid encoding %s" encoding)
 526                  )))
 527               )
 528           (if dest
 529               (progn
 530                 (setq dest (decode-coding-string dest cs))
 531                 (if must-unfold
 532                     (mapconcat (function
 533                                 (lambda (chr)
 534                                   (cond
 535                                    ((eq chr ?\n) "")
 536                                    ((eq chr ?\t) " ")
 537                                    (t (char-to-string chr)))
 538                                   ))
 539                                (std11-unfold-string dest)
 540                                "")
 541                   dest)
 542                 ))))))
 543
 544
 545 ;;; @ lexical analyze
 546 ;;;
 547
 548 (defvar eword-lexical-analyze-cache nil)
 549 (defvar eword-lexical-analyze-cache-max 299
 550   "*Max position of eword-lexical-analyze-cache.
 551 It is max size of eword-lexical-analyze-cache - 1.")
 552
 553 (defcustom eword-lexical-analyzers
 554   '(eword-analyze-quoted-string
 555     eword-analyze-domain-literal
 556     eword-analyze-comment
 557     eword-analyze-spaces
 558     eword-analyze-special
 559     eword-analyze-encoded-word
 560     eword-analyze-atom)
 561   "*List of functions to return result of lexical analyze.
 562 Each function must have two arguments: STRING and MUST-UNFOLD.
 563 STRING is the target string to be analyzed.
 564 If MUST-UNFOLD is not nil, each function must unfold and eliminate
 565 bare-CR and bare-LF from the result even if they are included in
 566 content of the encoded-word.
 567 Each function must return nil if it can not analyze STRING as its
 568 format.
 569
 570 Previous function is preferred to next function.  If a function
 571 returns nil, next function is used.  Otherwise the return value will
 572 be the result."
 573   :group 'eword-decode
 574   :type '(repeat function))
 575
 576 (defun eword-analyze-quoted-string (string &optional must-unfold)
 577   (let ((p (std11-check-enclosure string ?\" ?\")))
 578     (if p
 579         (cons (cons 'quoted-string
 580                     (eword-decode-quoted-string
 581                       (substring string 0 p)
 582                       default-mime-charset))
 583               (substring string p))
 584       )))
 585
 586 (defun eword-analyze-domain-literal (string &optional must-unfold)
 587   (std11-analyze-domain-literal string))
 588
 589 (defun eword-analyze-comment (string &optional must-unfold)
 590   (let ((len (length string)))
 591     (if (and (< 0 len) (eq (aref string 0) ?\())
 592         (let ((p 0))
 593           (while (and p (< p len) (eq (aref string p) ?\())
 594             (setq p (std11-check-enclosure string ?\( ?\) t p)))
 595           (setq p (or p len))
 596           (cons (cons 'comment
 597                       (eword-decode-comment
 598                         (substring string 0 p)
 599                         default-mime-charset))
 600                 (substring string p)))
 601       nil)))
 602
 603 (defun eword-analyze-spaces (string &optional must-unfold)
 604   (std11-analyze-spaces string))
 605
 606 (defun eword-analyze-special (string &optional must-unfold)
 607   (std11-analyze-special string))
 608
 609 (defun eword-analyze-encoded-word (string &optional must-unfold)
 610   (let ((decoded (eword-decode-first-encoded-words
 611                    string
 612                    eword-encoded-word-in-phrase-regexp
 613                    eword-after-encoded-word-in-phrase-regexp
 614                    must-unfold)))
 615     (if decoded
 616       (cons (cons 'atom (car decoded)) (cdr decoded)))))
 617
 618 (defun eword-analyze-atom (string &optional must-unfold)
 619   (if (let ((enable-multibyte-characters nil))
 620         (string-match std11-atom-regexp string))
 621       (let ((end (match-end 0)))
 622         (if (and eword-decode-sticked-encoded-word
 623                  (string-match eword-encoded-word-in-phrase-regexp
 624                                (substring string 0 end))
 625                  (< 0 (match-beginning 0)))
 626             (setq end (match-beginning 0)))
 627         (cons (cons 'atom (decode-mime-charset-string
 628                            (substring string 0 end)
 629                            default-mime-charset))
 630               (substring string end)
 631               ))))
 632
 633 (defun eword-lexical-analyze-internal (string must-unfold)
 634   (let (dest ret)
 635     (while (not (string-equal string ""))
 636       (setq ret
 637             (let ((rest eword-lexical-analyzers)
 638                   func r)
 639               (while (and (setq func (car rest))
 640                           (null (setq r (funcall func string must-unfold)))
 641                           )
 642                 (setq rest (cdr rest)))
 643               (or r `((error . ,string) . ""))
 644               ))
 645       (setq dest (cons (car ret) dest))
 646       (setq string (cdr ret))
 647       )
 648     (nreverse dest)
 649     ))
 650
 651 (defun eword-lexical-analyze (string &optional must-unfold)
 652   "Return lexical analyzed list corresponding STRING.
 653 It is like std11-lexical-analyze, but it decodes non us-ascii
 654 characters encoded as encoded-words or invalid \"raw\" format.
 655 \"Raw\" non us-ascii characters are regarded as variable
 656 `default-mime-charset'."
 657   (let* ((str (copy-sequence string))
 658          (key (cons str (cons default-mime-charset must-unfold)))
 659          ret)
 660     (set-text-properties 0 (length str) nil str)
 661     (if (setq ret (assoc key eword-lexical-analyze-cache))
 662         (cdr ret)
 663       (setq ret (eword-lexical-analyze-internal str must-unfold))
 664       (setq eword-lexical-analyze-cache
 665             (cons (cons key ret)
 666                   (last eword-lexical-analyze-cache
 667                         eword-lexical-analyze-cache-max)))
 668       ret)))
 669
 670 (defun eword-decode-token (token)
 671   (cdr token))
 672
 673 (defun eword-decode-and-fold-structured-field
 674   (string start-column &optional max-column must-unfold)
 675   "Decode and fold (fill) STRING as structured field body.
 676 It decodes non us-ascii characters in FULL-NAME encoded as
 677 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 678 characters are regarded as variable `default-mime-charset'.
 679
 680 If an encoded-word is broken or your emacs implementation can not
 681 decode the charset included in it, it is not decoded.
 682
 683 If MAX-COLUMN is omitted, `fill-column' is used.
 684
 685 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 686 if there are in decoded encoded-words (generated by bad manner MUA
 687 such as a version of Net$cape)."
 688   (or max-column
 689       (setq max-column fill-column))
 690   (let ((c start-column)
 691         (tokens (eword-lexical-analyze string must-unfold))
 692         (result "")
 693         token)
 694     (while (and (setq token (car tokens))
 695                 (setq tokens (cdr tokens)))
 696       (let* ((type (car token)))
 697         (if (eq type 'spaces)
 698             (let* ((next-token (car tokens))
 699                    (next-str (eword-decode-token next-token))
 700                    (next-len (string-width next-str))
 701                    (next-c (+ c next-len 1)))
 702               (if (< next-c max-column)
 703                   (setq result (concat result " " next-str)
 704                         c next-c)
 705                 (setq result (concat result "\n " next-str)
 706                       c (1+ next-len)))
 707               (setq tokens (cdr tokens))
 708               )
 709           (let* ((str (eword-decode-token token)))
 710             (setq result (concat result str)
 711                   c (+ c (string-width str)))
 712             ))))
 713     (if token
 714         (concat result (eword-decode-token token))
 715       result)))
 716
 717 (defun eword-decode-and-unfold-structured-field (string)
 718   "Decode and unfold STRING as structured field body.
 719 It decodes non us-ascii characters in FULL-NAME encoded as
 720 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 721 characters are regarded as variable `default-mime-charset'.
 722
 723 If an encoded-word is broken or your emacs implementation can not
 724 decode the charset included in it, it is not decoded."
 725   (let ((tokens (eword-lexical-analyze string 'must-unfold))
 726         (result ""))
 727     (while tokens
 728       (let* ((token (car tokens))
 729              (type (car token)))
 730         (setq tokens (cdr tokens))
 731         (setq result
 732               (if (eq type 'spaces)
 733                   (concat result " ")
 734                 (concat result (eword-decode-token token))
 735                 ))))
 736     result))
 737
 738 (defun eword-decode-structured-field-body (string &optional must-unfold
 739                                                   start-column max-column)
 740   "Decode non us-ascii characters in STRING as structured field body.
 741 STRING is unfolded before decoding.
 742
 743 It decodes non us-ascii characters in FULL-NAME encoded as
 744 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 745 characters are regarded as variable `default-mime-charset'.
 746
 747 If an encoded-word is broken or your emacs implementation can not
 748 decode the charset included in it, it is not decoded.
 749
 750 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 751 if there are in decoded encoded-words (generated by bad manner MUA
 752 such as a version of Net$cape)."
 753   (if start-column
 754       ;; fold with max-column
 755       (eword-decode-and-fold-structured-field
 756        string start-column max-column must-unfold)
 757     ;; Don't fold
 758     (mapconcat (function eword-decode-token)
 759                (eword-lexical-analyze string must-unfold)
 760                "")
 761     ))
 762
 763 (defun eword-decode-unstructured-field-body (string &optional must-unfold)
 764   "Decode non us-ascii characters in STRING as unstructured field body.
 765 STRING is unfolded before decoding.
 766
 767 It decodes non us-ascii characters in FULL-NAME encoded as
 768 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 769 characters are regarded as variable `default-mime-charset'.
 770
 771 If an encoded-word is broken or your emacs implementation can not
 772 decode the charset included in it, it is not decoded.
 773
 774 If MUST-UNFOLD is non-nil, it unfolds and eliminates line-breaks even
 775 if there are in decoded encoded-words (generated by bad manner MUA
 776 such as a version of Net$cape)."
 777   (eword-decode-string string must-unfold default-mime-charset))
 778
 779 (defun eword-extract-address-components (string)
 780   "Extract full name and canonical address from STRING.
 781 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS).
 782 If no name can be extracted, FULL-NAME will be nil.
 783 It decodes non us-ascii characters in FULL-NAME encoded as
 784 encoded-words or invalid \"raw\" string.  \"Raw\" non us-ascii
 785 characters are regarded as variable `default-mime-charset'."
 786   (let* ((structure (car (std11-parse-address
 787                           (eword-lexical-analyze
 788                            (std11-unfold-string string) 'must-unfold))))
 789          (phrase  (std11-full-name-string structure))
 790          (address (std11-address-string structure))
 791          )
 792     (list phrase address)
 793     ))
 794
 795
 796 ;;; @ end
 797 ;;;
 798
 799 (provide 'eword-decode)
 800
 801 ;;; eword-decode.el ends here