From: yamaoka Date: Mon, 30 Sep 2002 22:08:47 +0000 (+0000) Subject: Synch with Oort Gnus. X-Git-Tag: t-gnus-6_15_8-04-quimby~11 X-Git-Url: http://git.chise.org/gitweb/?a=commitdiff_plain;h=ba5bf96122c2406a48a2f77df20ed51f1a1e9d98;p=elisp%2Fgnus.git- Synch with Oort Gnus. --- diff --git a/lisp/ChangeLog b/lisp/ChangeLog index bb428a7..984afef 100644 --- a/lisp/ChangeLog +++ b/lisp/ChangeLog @@ -1,3 +1,8 @@ +2002-09-30 Ted Zlatanov + + * spam.el: merged changes from pinard@iro.umontreal.ca (François Pinard). + Major revamp of the code, documentation is in comments in the file for now. + 2002-09-30 Simon Josefsson * mml2015.el (mml2015-pgg-clear-verify): Verifying in a unibyte diff --git a/lisp/spam.el b/lisp/spam.el index 9d42176..0e45d38 100644 --- a/lisp/spam.el +++ b/lisp/spam.el @@ -23,19 +23,169 @@ ;;; Commentary: +;;; This module addresses a few aspects of spam control under Gnus. Page +;;; breaks are used for grouping declarations and documentation relating to +;;; each particular aspect. + +;;; The integration with Gnus is not yet complete. See various `FIXME' +;;; comments, below, for supplementary explanations or discussions. + ;;; Code: +(require 'gnus-sum) + +;; FIXME! We should not require `dns' nor `message' until we actually +;; need them. Best would be to declare needed functions as auto-loadable. (require 'dns) (require 'message) -;;; Blackholes +;;; Main parameters. -(defvar spam-blackhole-servers - '("bl.spamcop.net" "relays.ordb.org" "dev.null.dk" - "relays.visi.com" "rbl.maps.vix.com") - "List of blackhole servers.") +(defvar spam-use-blacklist t + "True if the blacklist should be used.") + +(defvar spam-use-whitelist t + "True if the whitelist should be used.") + +(defvar spam-use-blackholes nil + ;; FIXME! Turned off for now. The DNS routines are said to be flaky. + "True if blackholes should be used.") + +(defvar spam-use-bogofilter t + "True if bogofilter should be used.") + +(defvar spam-split-group "spam" + "Usual group name where spam should be split.") + +(defvar spam-junk-mailgroups + ;; FIXME! The mailgroup list evidently depends on other choices made by the + ;; user, so the built-in default below is not likely to be appropriate. + (cons spam-split-group '("mail.junk" "poste.pourriel")) + "Mailgroups which are dedicated by splitting to receive various junk. +All unmarked article in such group receive the spam mark on group entry.") + +;; FIXME! For `spam-ham-marks' and `spam-spam-marks', I wonder if it would +;; not be easier for the user to just accept a string of mark letters, instead +;; of a list of Gnus variable names. In such case, the stunt of deferred +;; evaluation would not be useful anymore. Lars?? :-) + +;; FIXME! It is rather questionable to see `K', `X' and `Y' as indicating +;; positive ham. It much depends on how and why people use kill files, score +;; files, and the kill command. Maybe it would be better, by default, to not +;; process a message neither as ham nor spam, that is, just ignore it for +;; learning purposes, when we are not sure of how the user sees it. +;; But `r' and `R' should undoubtedly be seen as ham. + +;; FIXME! Some might consider overkill to define a list of spam marks. On +;; the other hand, who knows, some users might for example like that +;; explicitly `E'xpired articles be processed as positive spam. + +(defvar spam-ham-marks + (list gnus-del-mark gnus-read-mark gnus-killed-mark + gnus-kill-file-mark gnus-low-score-mark) + "Marks considered as being ham (positively not spam). +Such articles will be transmitted to `bogofilter -n' on group exit.") + +(defvar spam-spam-marks + (list gnus-spam-mark) + "Marks considered as being spam (positively spam). +Such articles will be transmitted to `bogofilter -s' on group exit.") + +;; FIXME! Ideally, the remainder of this page should be fully integrated +;; within `gnus-sum.el'. + +;;; Key bindings for spam control. + +;; FIXME! The justification for `M-d' is that this is what Paul Graham +;; suggests in his original article, and what Eric Raymond's patch for Mutt +;; uses. But more importantly, that binding was still free in Summary mode! -(defvar spam-split-group "spam" "Default group name for spam-split.") +;; FIXME! Lars has not blessed the following key bindings yet. It looks +;; convenient that the score analysis command uses a sequence ending with the +;; letter `t', so it nicely parallels `B t' or `V t'. `M-d' is a kind of +;; "alternate" `d', it is also the sequence suggested in Paul Graham article, +;; and also in Eric Raymond's patch for Mutt. `S x' might be the more +;; official key binding for `M-d'. + +(gnus-define-keys gnus-summary-mode-map + "St" spam-bogofilter-score + "Sx" gnus-summary-mark-as-spam + "\M-d" gnus-summary-mark-as-spam) + +;;; How to highlight a spam summary line. + +;; FIXME! Of course, `gnus-splash-face' has another purpose. Maybe a +;; special face should be created, named and used instead, for spam lines. + +(push '((eq mark gnus-spam-mark) . gnus-splash-face) + gnus-summary-highlight) + +;;; Hooks dispatching. A bit raw for now. + +(defun spam-summary-prepare () + (spam-mark-junk-as-spam-routine)) + +(defun spam-summary-prepare-exit () + (spam-bogofilter-register-routine)) + +(add-hook 'gnus-summary-prepare-hook 'spam-summary-prepare) +(add-hook 'gnus-summary-prepare-exit-hook 'spam-summary-prepare-exit) + +(defun spam-mark-junk-as-spam-routine () + (when (member gnus-newsgroup-name spam-junk-mailgroups) + (let ((articles gnus-newsgroup-articles) + article) + (while articles + (setq article (pop articles)) + (when (eq (gnus-summary-article-mark article) gnus-unread-mark) + (gnus-summary-mark-article article gnus-spam-mark)))))) + +;;;; Spam determination. + +;; The following list contains pairs associating a parameter variable with a +;; spam checking function. If the parameter variable is true, then the +;; checking function is called, and its value decides what happens. Each +;; individual check may return `nil', `t', or a mailgroup name. The value +;; `nil' means that the check does not yield a decision, and so, that further +;; checks are needed. The value `t' means that the message is definitely not +;; spam, and that further spam checks should be inhibited. Otherwise, a +;; mailgroup name is returned where the mail should go, and further checks are +;; also inhibited. The usual mailgroup name is the value of +;; `spam-split-group', meaning that the message is definitely a spam. + +(defvar spam-list-of-checks + '((spam-use-blacklist . spam-check-blacklist) + (spam-use-whitelist . spam-check-whitelist) + (spam-use-bbdb . spam-check-bbdb) + (spam-use-blackholes . spam-check-blackholes) + (spam-use-bogofilter . spam-check-bogofilter))) + +(defun spam-split () + "Split this message into the `spam' group if it is spam. +This function can be used as an entry in `nnmail-split-fancy', for +example like this: (: spam-split) + +See the Info node `(gnus)Fancy Mail Splitting' for more details." + (interactive) + + (let ((list-of-checks spam-list-of-checks) + decision) + (while (and list-of-checks (not decision)) + (let ((pair (pop list-of-checks))) + (when (eval (car pair)) + (setq decision (apply (cdr pair)))))) + (if (eq decision t) + nil + decision))) + +;;;; Blackholes. + +(defvar spam-blackhole-servers '("bl.spamcop.net" + "relays.ordb.org" + "dev.null.dk" + "relays.visi.com" + "rbl.maps.vix.com") + "List of blackhole servers.") (defun spam-check-blackholes () "Check the Receieved headers for blackholed relays." @@ -47,7 +197,7 @@ (goto-char (point-min)) (while (re-search-forward "\\[\\([0-9]+.[0-9]+.[0-9]+.[0-9]+\\)\\]" nil t) - (message "blackhole search found host IP %s" (match-string 1)) + (message "Blackhole search found host IP %s." (match-string 1)) (push (mapconcat 'identity (nreverse (split-string (match-string 1) "\\.")) ".") @@ -56,10 +206,11 @@ (dolist (ip ips) (when (query-dns (concat ip "." server)) (push (list ip server (query-dns (concat ip "." server) 'TXT)) - matches)))) - matches))) - -;;; Black- and white-lists + matches))))) + (when matches + spam-split-group))) + +;;;; Blacklists and whitelists. (defvar spam-directory "~/News/spam/" "When spam files are kept.") @@ -77,37 +228,61 @@ The regular expression is matched against the address.") (defvar spam-whitelist-cache nil) (defvar spam-blacklist-cache nil) -(defun spam-enter-whitelist (address &optional blacklist) - "Enter ADDRESS into the whitelist. -Optional arg BLACKLIST, if non-nil, means to enter in the blacklist instead." +(defun spam-enter-whitelist (address) + "Enter ADDRESS into the whitelist." (interactive "sAddress: ") - (let ((file (if blacklist spam-blacklist spam-whitelist))) - (unless (file-exists-p (file-name-directory file)) - (make-directory (file-name-directory file) t)) - (save-excursion - (set-buffer - (find-file-noselect file)) - (goto-char (point-max)) - (unless (bobp) - (insert "\n")) - (insert address "\n") - (save-buffer) - (spam-refresh-list-cache)))) + (spam-enter-list address spam-whitelist) + (setq spam-whitelist-cache nil)) (defun spam-enter-blacklist (address) "Enter ADDRESS into the blacklist." (interactive "sAddress: ") - (spam-enter-whitelist address t)) + (spam-enter-list address spam-blacklist) + (setq spam-blacklist-cache nil)) + +(defun spam-enter-list (address file) + "Enter ADDRESS into the given FILE, either the whitelist or the blacklist." + (unless (file-exists-p (file-name-directory file)) + (make-directory (file-name-directory file) t)) + (save-excursion + (set-buffer + (find-file-noselect file)) + (goto-char (point-max)) + (unless (bobp) + (insert "\n")) + (insert address "\n") + (save-buffer))) + +(defun spam-check-whitelist () + ;; FIXME! Should it detect when file timestamps change? + (unless spam-whitelist-cache + (setq spam-whitelist-cache (spam-parse-list spam-whitelist))) + (and (spam-from-listed-p spam-whitelist-cache) t)) + +;;; copied from code by Alexander Kotelnikov +;; FIXME: assumes that bbdb.el is loaded +;; FIXME: not sure about the logic... +(defun spam-check-bbdb () + "We want people, who are in bbdb not to be splitted to spam" + (let ((who (cadr + (gnus-extract-address-components (message-fetch-field "from")) + ))) + (bbdb-search (bbdb-records) nil nil (regexp-quote who)))) + +(defun spam-check-blacklist () + ;; FIXME! Should it detect when file timestamps change? + (unless spam-blacklist-cache + (setq spam-blacklist-cache (spam-parse-list spam-blacklist))) + (and (spam-from-listed-p spam-blacklist-cache) spam-split-group)) (eval-and-compile (defalias 'spam-point-at-eol (if (fboundp 'point-at-eol) 'point-at-eol 'line-end-position))) -(defun spam-parse-whitelist (&optional blacklist) - (let ((file (if blacklist spam-blacklist spam-whitelist)) - contents address) - (when (file-exists-p file) +(defun spam-parse-list (file) + (when (file-readable-p file) + (let (contents address) (with-temp-buffer (insert-file-contents file) (while (not (eobp)) @@ -120,46 +295,251 @@ Optional arg BLACKLIST, if non-nil, means to enter in the blacklist instead." (push address contents)))) (nreverse contents)))) -(defun spam-refresh-list-cache () - (setq spam-whitelist-cache (spam-parse-whitelist)) - (setq spam-blacklist-cache (spam-parse-whitelist t))) - -(defun spam-address-whitelisted-p (address &optional blacklist) - (let ((cache (if blacklist spam-blacklist-cache spam-whitelist-cache)) +(defun spam-from-listed-p (cache) + (let ((from (message-fetch-field "from")) found) - (while (and (not found) - cache) - (when (string-match (pop cache) address) - (setq found t))) + (while cache + (when (string-match (pop cache) from) + (setq found t + cache nil))) found)) + +;;;; Training via Bogofilter. Last updated 2002-09-02. -(defun spam-address-blacklisted-p (address &optional blacklist) - (if address - (spam-address-whitelisted-p address t) - nil)) +;;; See Paul Graham article, at `http://www.paulgraham.com/spam.html'. -;; Function for nnmail-split-fancy: returns 'spam' if an article is deemed to be spam -(defun spam-split () - "Split this message into the `spam' group if it is spam. -This function can be used as an entry in `nnmail-split-fancy', for -example like this: (: spam-split) +;;; This page is for those wanting to control spam with the help of Eric +;;; Raymond's speedy Bogofilter, see http://www.tuxedo.org/~esr/bogofilter. +;;; This has been tested with a locally patched copy of version 0.4. -See the Info node `(gnus)Fancy Mail Splitting' for more details." +;;; Make sure Bogofilter is installed. Bogofilter internally uses Judy fast +;;; associative arrays, so you need to install Judy first, and Bogofilter +;;; next. Fetch both distributions by visiting the following links and +;;; downloading the latest version of each: +;;; +;;; http://sourceforge.net/projects/judy/ +;;; http://www.tuxedo.org/~esr/bogofilter/ +;;; +;;; Unpack the Judy distribution and enter its main directory. Then do: +;;; +;;; ./configure +;;; make +;;; make install +;;; +;;; You will likely need to become super-user for the last step. Then, unpack +;;; the Bogofilter distribution and enter its main directory: +;;; +;;; make +;;; make install +;;; +;;; Here as well, you need to become super-user for the last step. Now, +;;; initialises your word lists by doing, under your own identity: +;;; +;;; mkdir ~/.bogofilter +;;; touch ~/.bogofilter/badlist +;;; touch ~/.bogofilter/goodlist +;;; +;;; These two files are text files you may edit, but you normally don't! + +;;; The `M-d' command gets added to Gnus summary mode, marking current article +;;; as spam, showing it with the `H' mark. Whenever you see a spam article, +;;; make sure to mark its summary line with `M-d' before leaving the group. +;;; Some groups, as per variable `spam-junk-mailgroups' below, receive articles +;;; from Gnus splitting on clues added by spam recognisers, so for these +;;; groups, we tack an `H' mark at group entry for all summary lines which +;;; would otherwise have no other mark. Make sure to _remove_ `H' marks for +;;; any article which is _not_ genuine spam, before leaving such groups: you +;;; may use `M-u' to "unread" the article, or `d' for declaring it read the +;;; non-spam way. When you leave a group, all `H' marked articles, saved or +;;; unsaved, are sent to Bogofilter which will study them as spam samples. + +;;; Messages may also be deleted in various other ways, and unless +;;; `spam-ham-marks-form' gets overridden below, marks `R' and `r' for default +;;; read or explicit delete, marks `X' and 'K' for automatic or explicit +;;; kills, as well as mark `Y' for low scores, are all considered to be +;;; associated with articles which are not spam. This assumption might be +;;; false, in particular if you use kill files or score files as means for +;;; detecting genuine spam, you should then adjust `spam-ham-marks-form'. When +;;; you leave a group, all _unsaved_ articles bearing any the above marks are +;;; sent to Bogofilter which will study these as not-spam samples. If you +;;; explicit kill a lot, you might sometimes end up with articles marked `K' +;;; which you never saw, and which might accidentally contain spam. Best is +;;; to make sure that real spam is marked with `H', and nothing else. + +;;; All other marks do not contribute to Bogofilter pre-conditioning. In +;;; particular, ticked, dormant or souped articles are likely to contribute +;;; later, when they will get deleted for real, so there is no need to use +;;; them prematurely. Explicitly expired articles do not contribute, command +;;; `E' is a way to get rid of an article without Bogofilter ever seeing it. + +;;; In a word, with a minimum of care for associating the `H' mark for spam +;;; articles only, Bogofilter training all gets fairly automatic. You should +;;; do this until you get a few hundreds of articles in each category, spam +;;; or not. The shell command `head -1 ~/.bogofilter/*' shows both article +;;; counts. The command `S S' in summary mode, either for debugging or for +;;; curiosity, triggers Bogofilter into displaying in another buffer the +;;; "spamicity" score of the current article (between 0.0 and 1.0), together +;;; with the article words which most significantly contribute to the score. + +;;; The real way for using Bogofilter, however, is to have some use tool like +;;; `procmail' for invoking it on message reception, then adding some +;;; recognisable header in case of detected spam. Gnus splitting rules might +;;; later trip on these added headers and react by sorting such articles into +;;; specific junk folders as per `spam-junk-mailgroups'. Here is a possible +;;; `.procmailrc' contents (still untested -- please tell me how it goes): +;;; +;;; :0HBf: +;;; * ? bogofilter +;;; | formail -bfI "X-Spam-Status: Yes" + +(defvar spam-output-buffer-name "*Bogofilter Output*" + "Name of buffer when displaying `bogofilter -v' output.") + +(defvar spam-spaminfo-header-regexp + ;; FIXME! In the following regexp, we should explain which tool produces + ;; which kind of header. I do not even remember them all by now. X-Junk + ;; (and previously X-NoSpam) are produced by the `NoSpam' tool, which has + ;; never been published, so it might not be reasonable leaving it in the + ;; list. + "^X-\\(jf\\|Junk\\|NoSpam\\|Spam\\|SB\\)[^:]*:" + "Regexp for spam markups in headers. +Markup from spam recognisers, as well as `Xref', are to be removed from +articles before they get registered by Bogofilter.") + +;; FIXME! I do not know if Gnus has a compatibility function for +;; `executable-find'. Here is a possible mantra for portability, +;; until Lars decides how we really should do it. +(unless (fboundp 'executable-find) + (if (fboundp 'locate-file) + (defun executable-find (command) + (locate-file command exec-path)) + (autoload 'executable-find "executable"))) +;; End of portability mantra for `executable-find'. + +(defvar spam-bogofilter-path (executable-find "bogofilter") + "File path of the Bogofilter executable program. +Force this variable to nil if you want to inhibit the functionality.") + +(defun spam-check-bogofilter () + ;; Dynamic spam check. I do not know how to check the exit status, + ;; so instead, read `bogofilter -v' output. + (when (and spam-use-bogofilter spam-bogofilter-path) + (spam-bogofilter-articles nil "-v" (list (gnus-summary-article-number))) + (when (save-excursion + (set-buffer spam-output-buffer-name) + (goto-char (point-min)) + (re-search-forward "Spamicity: \\(0\\.9\\|1\\.0\\)" nil t)) + spam-split-group))) + +(defun spam-bogofilter-score () + "Use `bogofilter -v' on the current article. +This yields the 15 most discriminant words for this article and the +spamicity coefficient of each, and the overall article spamicity." (interactive) + (when (and spam-use-bogofilter spam-bogofilter-path) + (spam-bogofilter-articles nil "-v" (list (gnus-summary-article-number))) + (save-excursion + (set-buffer spam-output-buffer-name) + (unless (= (point-min) (point-max)) + (display-message-or-buffer (current-buffer) + spam-output-buffer-name))))) + +(defun spam-bogofilter-register-routine () + (when (and spam-use-bogofilter spam-bogofilter-path) + (let ((articles gnus-newsgroup-articles) + article mark ham-articles spam-articles) + (while articles + (setq article (pop articles) + mark (gnus-summary-article-mark article)) + (cond ((memq mark spam-spam-marks) (push article spam-articles)) + ((memq article gnus-newsgroup-saved)) + ((memq mark spam-ham-marks) (push article ham-articles)))) + (when ham-articles + (spam-bogofilter-articles "ham" "-n" ham-articles)) + (when spam-articles + (spam-bogofilter-articles "SPAM" "-s" spam-articles))))) + +(defvar spam-bogofilter-initial-timeout 40 + "Timeout in seconds for the initial reply from the `bogofilter' program.") - ;; refresh the cache if it's necessary - (unless spam-whitelist-cache (spam-refresh-list-cache)) - (unless spam-blacklist-cache (spam-refresh-list-cache)) +(defvar spam-bogofilter-subsequent-timeout 15 + "Timeout in seconds for any subsequent reply from the `bogofilter' program.") - (let* ((from (message-fetch-field "from")) - (group nil)) - (when (spam-check-blackholes) - (setq group spam-split-group)) - (unless (spam-address-whitelisted-p from) ; unless the address is whitelisted, - (when (spam-address-blacklisted-p from) ; check if it's blacklisted, - (setq group spam-split-group)) ; and if so, set the group to spam-split-group - group))) +(defun spam-bogofilter-articles (type option articles) + (let ((output-buffer (get-buffer-create spam-output-buffer-name)) + (article-copy (get-buffer-create " *Bogofilter Article Copy*")) + (remove-regexp (concat spam-spaminfo-header-regexp "\\|Xref:")) + (counter 0) + prefix process article) + (when type + (setq prefix (format "Studying %d articles as %s..." (length articles) + type)) + (message "%s" prefix)) + (save-excursion (set-buffer output-buffer) (erase-buffer)) + (setq process (start-process "bogofilter" output-buffer + spam-bogofilter-path "-F" option)) + (process-kill-without-query process t) + (unwind-protect + (save-window-excursion + (while articles + (setq counter (1+ counter)) + (when prefix + (message "%s %d" prefix counter)) + (setq article (pop articles)) + (gnus-summary-goto-subject article) + (gnus-summary-select-article) + (gnus-eval-in-buffer-window article-copy + (insert-buffer-substring gnus-original-article-buffer) + ;; Remove spam classification redundant headers: they may induce + ;; unwanted biases in later analysis. + (goto-char (point-min)) + (while (not (or (eobp) (= (following-char) ?\n))) + (if (looking-at remove-regexp) + (delete-region (point) + (save-excursion (forward-line 1) (point))) + (forward-line 1))) + (goto-char (point-min)) + ;; Bogofilter really wants From envelopes for counting articles. + ;; Fake one at the beginning, make sure there will be no other. + (if (looking-at "From ") + (forward-line 1) + (insert "From nobody " (current-time-string) "\n")) + (let (case-fold-search) + (while (re-search-forward "^From " nil t) + (beginning-of-line) + (insert ">"))) + (process-send-region process (point-min) (point-max)) + (erase-buffer)))) + ;; Sending the EOF is unwind-protected. This is to prevent lost copies + ;; of `bogofilter', hung on reading their standard input, in case the + ;; whole registering process gets interrupted by the user. + (process-send-eof process)) + (kill-buffer article-copy) + ;; Receive process output. It sadly seems that we still have to protect + ;; ourselves against hung `bogofilter' processes. + (let ((status (process-status process)) + (timeout (* 1000 spam-bogofilter-initial-timeout)) + (quanta 200)) ; also counted in milliseconds + (while (and (not (eq status 'exit)) (> timeout 0)) + ;; `accept-process-output' timeout is counted in microseconds. + (setq timeout (if (accept-process-output process 0 (* 1000 quanta)) + (* 1000 spam-bogofilter-subsequent-timeout) + (- timeout quanta)) + status (process-status process))) + (if (eq status 'exit) + (when prefix + (message "%s done!" prefix)) + ;; Sigh! The process did time out... Become brutal! + (interrupt-process process) + (message "%s %d INTERRUPTED! (Article %d, status %s)" + (or prefix "Bogofilter process...") + counter article status) + ;; Give some time for user to read. Sitting redisplays but gives up + ;; if input is pending. Sleeping does not give up, but it does not + ;; redisplay either. Mix both: let's redisplay and not give up. + (sit-for 1) + (sleep-for 3))))) (provide 'spam) -;;; spam.el ends here +;;; spam.el ends here.