;;; spam-stat.el --- detecting spam based on statistics
-;; Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+;; Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
;; Author: Alex Schroeder <alex@gnu.org>
;; Keywords: network
\f
;;; Code:
+(require 'mail-parse)
(defgroup spam-stat nil
"Statistical spam detection for Emacs.
:type 'number
:group 'spam-stat)
+(defcustom spam-stat-process-directory-age 90
+  "Max. age of files to be processed in directory, in days.
+When using `spam-stat-process-spam-directory' or
+`spam-stat-process-non-spam-directory', only files that have
+been touched in this many days will be considered. Without
+this filter, re-training spam-stat with several thousand messages
+will start to take a very long time."
+  ;; The value is compared with `<' against a (possibly fractional)
+  ;; number of days, so any number is acceptable.
+  :type 'number
+  :group 'spam-stat)
+
(defvar spam-stat-syntax-table
(let ((table (copy-syntax-table text-mode-syntax-table)))
(modify-syntax-entry ?- "w" table)
(defun spam-stat-store-current-buffer ()
"Store a copy of the current buffer in `spam-stat-buffer'."
- (save-excursion
- (let ((str (buffer-string)))
- (set-buffer (get-buffer-create spam-stat-buffer-name))
+ ;; Remember the source buffer: the body below runs with the storage
+ ;; buffer current, so `current-buffer' there no longer refers to it.
+ (let ((buf (current-buffer)))
+ (with-current-buffer (get-buffer-create spam-stat-buffer-name)
(erase-buffer)
- (insert str)
+ ;; Copy the text of BUF directly, without an intermediate string.
+ (insert-buffer-substring buf)
(setq spam-stat-buffer (current-buffer)))))
(defun spam-stat-store-gnus-article-buffer ()
"Store a copy of the current article in `spam-stat-buffer'.
This uses `gnus-article-buffer'."
- (save-excursion
- (set-buffer gnus-original-article-buffer)
+ ;; NOTE(review): the docstring names `gnus-article-buffer', but the
+ ;; buffer actually selected here is `gnus-original-article-buffer'.
+ (with-current-buffer gnus-original-article-buffer
(spam-stat-store-current-buffer)))
;; Data -- not using defstruct in order to save space and time
;; Saving and Loading
(defun spam-stat-save (&optional force)
- "Save the `spam-stat' hash table as lisp file."
- (interactive)
+ "Save the `spam-stat' hash table as lisp file.
+With a prefix argument save unconditionally."
+ (interactive "P")
(when (or force spam-stat-dirty)
(with-temp-buffer
(let ((standard-output (current-buffer))
These are the words whose spam-stat differs the most from 0.5.
The list returned contains elements of the form \(WORD SCORE DIFF),
where DIFF is the difference between SCORE and 0.5."
- (with-spam-stat-max-buffer-size
- (with-syntax-table spam-stat-syntax-table
- (let (result word score)
- (maphash (lambda (word ignore)
- (setq score (spam-stat-score-word word)
- result (cons (list word score (abs (- score 0.5)))
- result)))
- (spam-stat-buffer-words))
- (setq result (sort result (lambda (a b) (< (nth 2 b) (nth 2 a)))))
- (setcdr (nthcdr 14 result) nil)
- result))))
+ (let (result word score)
+ (maphash (lambda (word ignore)
+ (setq score (spam-stat-score-word word)
+ result (cons (list word score (abs (- score 0.5)))
+ result)))
+ (spam-stat-buffer-words))
+ (setq result (sort result (lambda (a b) (< (nth 2 b) (nth 2 a)))))
+ (setcdr (nthcdr 14 result) nil)
+ result))
(defun spam-stat-score-buffer ()
"Return a score describing the spam-probability for this buffer."
;; Testing
+(defun spam-stat-strip-xref ()
+ "Strip the Xref header.
+Narrow to the message head with `mail-narrow-to-head' and delete
+at most one line matching the Xref header regexp. The previous
+restriction is restored afterwards."
+ (save-restriction
+ (mail-narrow-to-head)
+ (when (re-search-forward "^Xref:.*\n" nil t)
+ (delete-region (match-beginning 0) (match-end 0)))))
+
(defun spam-stat-process-directory (dir func)
"Process all the regular files in directory DIR using function FUNC."
(let* ((files (directory-files dir t "^[^.]"))
(dolist (f files)
(when (and (file-readable-p f)
(file-regular-p f)
- (> (nth 7 (file-attributes f)) 0))
+ (> (nth 7 (file-attributes f)) 0)
+ (< (time-to-number-of-days (time-since (nth 5 (file-attributes f))))
+ spam-stat-process-directory-age))
(setq count (1+ count))
(message "Reading %s: %.2f%%" dir (/ count max))
(insert-file-contents f)
+ (spam-stat-strip-xref)
(funcall func)
(erase-buffer))))))
(spam-stat-bad entry))
count)
(remhash key spam-stat)))
- spam-stat))
+ spam-stat)
+ (setq spam-stat-dirty t))
(defun spam-stat-install-hooks-function ()
"Install the spam-stat function hooks"