X-Git-Url: http://git.chise.org/gitweb/?a=blobdiff_plain;f=lisp%2Fspam-stat.el;h=e0265fabce5742ec2802f4d1cf6c287e3bfa65cb;hb=3e2cdad3100069cf7317dda286586a0e8bf388ac;hp=b7b7cd29a2fc4963f98bbdd7eb5056e73a6f65f7;hpb=6d10b2a907addb9acc61dff650adff0e182fb6ad;p=elisp%2Fgnus.git- diff --git a/lisp/spam-stat.el b/lisp/spam-stat.el index b7b7cd2..e0265fa 100644 --- a/lisp/spam-stat.el +++ b/lisp/spam-stat.el @@ -1,6 +1,6 @@ ;;; spam-stat.el --- detecting spam based on statistics -;; Copyright (C) 2002, 2003 Free Software Foundation, Inc. +;; Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc. ;; Author: Alex Schroeder ;; Keywords: network @@ -161,17 +161,30 @@ This variable says how many characters this will be." :group 'spam-stat) (defcustom spam-stat-split-fancy-spam-group "mail.spam" - "Name of the group where spam should be stored, if -`spam-stat-split-fancy' is used in fancy splitting rules. Has no -effect when spam-stat is invoked through spam.el." + "Name of the group where spam should be stored. +If `spam-stat-split-fancy' is used in fancy splitting rules. Has +no effect when spam-stat is invoked through spam.el." :type 'string :group 'spam-stat) (defcustom spam-stat-split-fancy-spam-threshhold 0.9 - "Spam score threshhold in spam-stat-split-fancy." + "Spam score threshold in spam-stat-split-fancy." :type 'number :group 'spam-stat) +(defcustom spam-stat-washing-hook nil + "Hook applied to each message before analysis." + :type 'hook + :group 'spam-stat) + +(defcustom spam-stat-process-directory-age 90 + "Max. age of files to be processed in directory, in days. +When using `spam-stat-process-spam-directory' or +`spam-stat-process-non-spam-directory', only files that have +been touched in this many days will be considered. Without +this filter, re-training spam-stat with several thousand messages +will start to take a very long time.") + (defvar spam-stat-syntax-table (let ((table (copy-syntax-table text-mode-syntax-table))) (modify-syntax-entry ?- "w" table) @@ -194,22 +207,24 @@ This is set by hooking into Gnus.") (defvar spam-stat-buffer-name " *spam stat buffer*" "Name of the `spam-stat-buffer'.") +(defvar spam-stat-coding-system + (if (mm-coding-system-p 'emacs-mule) 'emacs-mule 'raw-text) + "Coding system used for `spam-stat-file'.") + ;; Hooking into Gnus (defun spam-stat-store-current-buffer () "Store a copy of the current buffer in `spam-stat-buffer'." - (save-excursion - (let ((str (buffer-string))) - (set-buffer (get-buffer-create spam-stat-buffer-name)) + (let ((buf (current-buffer))) + (with-current-buffer (get-buffer-create spam-stat-buffer-name) (erase-buffer) - (insert str) + (insert-buffer-substring buf) (setq spam-stat-buffer (current-buffer))))) (defun spam-stat-store-gnus-article-buffer () "Store a copy of the current article in `spam-stat-buffer'. This uses `gnus-article-buffer'." - (save-excursion - (set-buffer gnus-original-article-buffer) + (with-current-buffer gnus-original-article-buffer (spam-stat-store-current-buffer))) ;; Data -- not using defstruct in order to save space and time @@ -281,7 +296,7 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', ;; Parsing (defmacro with-spam-stat-max-buffer-size (&rest body) - "Narrows the buffer down to the first 4k characters, then evaluates BODY." + "Narrow the buffer down to the first 4k characters, then evaluate BODY." `(save-restriction (when (> (- (point-max) (point-min)) @@ -291,7 +306,8 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', ,@body)) (defun spam-stat-buffer-words () - "Return a hash table of words and number of occurences in the buffer." + "Return a hash table of words and number of occurrences in the buffer." + (run-hooks 'spam-stat-washing-hook) (with-spam-stat-max-buffer-size (with-syntax-table spam-stat-syntax-table (goto-char (point-min)) @@ -340,7 +356,7 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (lambda (word count) (let ((entry (gethash word spam-stat))) (if (not entry) - (error "This buffer has unknown words in it.") + (error "This buffer has unknown words in it") (spam-stat-set-good entry (- (spam-stat-good entry) count)) (spam-stat-set-bad entry (+ (spam-stat-bad entry) count)) (spam-stat-set-score entry (spam-stat-compute-score entry)) @@ -356,7 +372,7 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (lambda (word count) (let ((entry (gethash word spam-stat))) (if (not entry) - (error "This buffer has unknown words in it.") + (error "This buffer has unknown words in it") (spam-stat-set-good entry (+ (spam-stat-good entry) count)) (spam-stat-set-bad entry (- (spam-stat-bad entry) count)) (spam-stat-set-score entry (spam-stat-compute-score entry)) @@ -367,30 +383,30 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', ;; Saving and Loading (defun spam-stat-save (&optional force) - "Save the `spam-stat' hash table as lisp file." - (interactive) + "Save the `spam-stat' hash table as Lisp file. +With a prefix argument save unconditionally." + (interactive "P") (when (or force spam-stat-dirty) - (with-temp-buffer - (let ((standard-output (current-buffer)) - (font-lock-maximum-size 0)) - (insert "(setq spam-stat-ngood " - (number-to-string spam-stat-ngood) - " spam-stat-nbad " - (number-to-string spam-stat-nbad) - " spam-stat (spam-stat-to-hash-table '(") - (maphash (lambda (word entry) - (prin1 (list word - (spam-stat-good entry) - (spam-stat-bad entry)))) - spam-stat) - (insert ")))") - (write-file spam-stat-file))) - (setq spam-stat-dirty nil))) + (let ((coding-system-for-write spam-stat-coding-system)) + (with-temp-file spam-stat-file + (let ((standard-output (current-buffer)) + (font-lock-maximum-size 0)) + (insert (format ";-*- coding: %s; -*-\n" spam-stat-coding-system)) + (insert (format "(setq spam-stat-ngood %d spam-stat-nbad %d +spam-stat (spam-stat-to-hash-table '(" spam-stat-ngood spam-stat-nbad)) + (maphash (lambda (word entry) + (prin1 (list word + (spam-stat-good entry) + (spam-stat-bad entry)))) + spam-stat) + (insert ")))")))) + (setq spam-stat-dirty nil))) (defun spam-stat-load () "Read the `spam-stat' hash table from disk." ;; TODO: maybe we should warn the user if spam-stat-dirty is t? - (load-file spam-stat-file) + (let ((coding-system-for-read spam-stat-coding-system)) + (load-file spam-stat-file)) (setq spam-stat-dirty nil)) (defun spam-stat-to-hash-table (entries) @@ -488,10 +504,12 @@ check the variable `spam-stat-score-data'." (dolist (f files) (when (and (file-readable-p f) (file-regular-p f) - (> (nth 7 (file-attributes f)) 0)) + (> (nth 7 (file-attributes f)) 0) + (< (time-to-number-of-days (time-since (nth 5 (file-attributes f)))) + spam-stat-process-directory-age)) (setq count (1+ count)) (message "Reading %s: %.2f%%" dir (/ count max)) - (insert-file-contents f) + (insert-file-contents-literally f) (spam-stat-strip-xref) (funcall func) (erase-buffer)))))) @@ -536,7 +554,7 @@ display non-spam files; otherwise display spam files." (setq count (1+ count)) (message "Reading %.2f%%, score %.2f" (/ count max) (/ score count)) - (insert-file-contents f) + (insert-file-contents-literally f) (setq buffer-score (spam-stat-score-buffer)) (when (> buffer-score 0.9) (setq score (1+ score))) @@ -565,10 +583,11 @@ COUNT defaults to 5" (spam-stat-bad entry)) count) (remhash key spam-stat))) - spam-stat)) + spam-stat) + (setq spam-stat-dirty t)) (defun spam-stat-install-hooks-function () - "Install the spam-stat function hooks" + "Install the spam-stat function hooks." (interactive) (add-hook 'nnmail-prepare-incoming-message-hook 'spam-stat-store-current-buffer) @@ -579,7 +598,7 @@ COUNT defaults to 5" (spam-stat-install-hooks-function)) (defun spam-stat-unload-hook () - "Uninstall the spam-stat function hooks" + "Uninstall the spam-stat function hooks." (interactive) (remove-hook 'nnmail-prepare-incoming-message-hook 'spam-stat-store-current-buffer)