#!/usr/bin/perl # Copyright (C) 2010, 2017 Jonathan Kamens . # # The current version of this script is available from # . # # This program is free software; you can redistribute it and/or modify it under # the terms of the GNU General Public License as published by the Free Software # Foundation; either version 2 of the License, or (at your option) any later # version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more # details. # Command-line options: # # --help Print help message and exit # --keep Keep existing /tmp/notspam and /tmp/bogospam files instead of # regenerating them from ~/Mail/notspam* and ~/Mail/bogospam*. # Implies --spamok --hamok. # --spamok Ignore the fact that some of the messages in /tmp/bogospam were # classified by bogofilter as ham (i.e., bogofilter is wrong # about these messages). # --hamok Ignore the fact that some of the messages in /tmp/notspam were # classified by bogofilter as spam. # --wordlist Use the in-memory wordlist. # How to use the script: # # 1. Edit @notspam_files and @bogospam_files below as necessary to tell the # script where your ham and spam archives are. # 2. Run the script. # 3. If it complains about Spam messages in your non-spam archives, load them # into an editor, look for the messages it references (it will print out # the From line and number within the file of each message ), and confirm # that the referenced messages are all in fact ham. If not, then move them # from your non-spam archive to your spam archive and run the script again. # 4. Do the same thing in the opposite direction for messages in your spam # archives that the script complaints look like Ham. # 5. When /tmp/notspam and /tmp/bogospam are correct, run with "--keep" to tell # the script not to regenerate them and to go ahead and do the tuning # without worrying about seemingly misclassified messages. # 6. When it's done, copy the results into your .bogofilter.cf. my $md = "$ENV{HOME}/Mail"; my(@notspam_files) = ("$md/notspam", "$md/notspam.1", glob("$md/notspam.*.bz2")); my(@bogospam_files) = ("$md/bogospam", "$md/bogospam.1", glob("$md/bogospam.*.bz2")); use strict; use warnings; use Date::Parse; use File::Basename; use Getopt::Long; my $whoami = basename $0; my $usage = "Usage: $0 [--help] [--verbose] [--keep] [--spamok] [--hamok] [--wordlist=file] [--since=datetime]\n"; my $from_re = qr/^From\s+(?:\S+|".*"@\S+)\s+(.*2017)$/; my($verbose, $keep, $spamok, $hamok, $wordlist, $since_str, $since); die $usage if (! GetOptions("help" => sub {print $usage; exit}, "verbose+" => \$verbose, "keep" => \$keep, "spamok" => \$spamok, "hamok" => \$hamok, "wordlist" => \$wordlist, "since=s" => \$since_str, )); die "Can't parse $since_str\n$usage" if ($since_str and ! ($since = str2time($since_str))); my $subject_filter; sub subject_filter { return $subject_filter if ($subject_filter); my $filter_file = "$ENV{HOME}/.bogofilter/milter-subject-filters"; if (! -f $filter_file) { warn "$filter_file does not exist, skipping subject filters\n" if ($verbose); return; } open(SUBJECTS, "<", $filter_file) or die; my(@filters) = ; close(SUBJECTS) or die; map { s/^\s+//; s/\s+$//; } @filters; @filters = grep(! /^(?:\#|$)/, @filters); warn("Read ", scalar(@filters), " filters from $filter_file\n") if ($verbose); $subject_filter = "(?:" . join("|", @filters) . ")"; $subject_filter = qr/$subject_filter/o; return $subject_filter; } sub open_mbox { local($_) = @_; my($fh); if (/\.gz$/) { open($fh, "-|", "zcat", $_) or die; } elsif (/\.bz2$/) { open($fh, "-|", "bzcat", $_) or die; } else { open($fh, "<", $_) or die; } return $fh; } sub sort_mboxes { my($mboxes, $fast) = @_; my(%mboxes); if ($fast) { return sort { -M $a <=> -M $b } @{$mboxes}; } foreach my $mbox (@{$mboxes}) { my $fh = &open_mbox($mbox); my $from_line = <$fh>; close($fh); next if (! $from_line); if ($from_line !~ /$from_re/o) { die "First line of $mbox is not a From line: $from_line\n"; } $mboxes{$mbox} = str2time($1); } return(sort { $mboxes{$b} <=> $mboxes{$a} } keys %mboxes); } my $mbox_buffer = ""; sub read_mbox { my($file, $pct) = @_; my $subject_filter = &subject_filter(); warn("Reading $file ($pct%)\n") if ($verbose > 1); my $length = 0; my $want = 1; my $in_header = 1; my(@messages); my($fh) = &open_mbox($file); while (<$fh>) { if (/$from_re/o) { if ($want and $length) { push(@messages, substr($mbox_buffer, 0, $length)); } $want = $in_header = 1; $length = 0; } else { next if (! $want); if ($subject_filter and $in_header) { if (/^Subject:\s*$subject_filter/o) { warn("Skipping message with $_") if ($verbose); $want = 0; next; } if (/^$/) { $in_header = 0; } } } substr($mbox_buffer, $length, length) = $_; $length += length; } if ($want and $length) { push(@messages, substr($mbox_buffer, 0, $length)); } close($fh) or die; return reverse @messages; } sub preprocess { my($output_file, $files, $are_ok, $since, $look_for, $override) = @_; my($check_pid, $bogofilter_pid); my $good = 1; warn("Sorting mboxes for $output_file\n"); my(@files) = &sort_mboxes($files, $are_ok); open(OUTPUT, ">", $output_file) or die "open(>$output_file): $!"; close(OUTPUT) or die; local $SIG{PIPE} = sub { $good = 0; $check_pid = undef; }; if (! $are_ok) { warn "Preparing to check $output_file for $look_for\n" if ($verbose); pipe(FROMPARENT, TOCHECK) or die "pipe: $!"; $check_pid = fork(); die "fork: $!" if (! defined($check_pid)); if ($check_pid) { close(FROMPARENT); } else { open(OUTPUT, "<", $output_file) or die "open($output_file): $!"; my($event); my $partial = ""; my $count = 0; my $good = 1; close(TOCHECK); while (my $msg_description = ) { chomp $msg_description; $count++; # STATES: # 0 - waiting for From line # 1 - waiting for X-Bogosity line or end of header my $state = 0; my $from; while (1) { $_ = ; if (! $_) { sleep(1); next; } if ($_ !~ /\n$/) { $partial .= $_; next; } if ($partial) { $_ = $partial . $_; $partial = ""; } if ($state == 0) { if (/$from_re/o) { chomp($from = $_); if (!$since || (str2time($1) > $since)) { $state = 1; next; } # We don't care about this message, so just skip to # the next From line when we get another event. last; } } if ($state == 1) { if (/^$/) { last; } if (/^X-Bogosity:/) { if (/^X-Bogosity: $look_for/) { warn("Message $msg_description ($from) has ", "$look_for; specify $override (and ", "maybe --keep) to override\n"); $good = 0; } last; } } } } warn("Checked $count messages in $output_file\n") if ($verbose); exit(! $good); } } warn("Launching bogofilter -M -p -e for $output_file\n") if ($verbose); $bogofilter_pid = open(BOGOFILTER, "|-"); die "fork: $!" if (! defined($bogofilter_pid)); if (! $bogofilter_pid) { open(STDOUT, ">>", $output_file) or die "open(>>$output_file): $!"; exec("bogofilter", "-M", "-p", "-e") or die "exec(bogofilter): $!"; } my $count = 0; my $total = scalar @files; foreach my $file (@files) { my(@messages) = &read_mbox($file, int(100 * ++$count / $total)); my $which_message = scalar @messages; foreach my $message (@messages) { print(BOGOFILTER $message) or die; if ($check_pid) { print(TOCHECK "#$which_message in $file\n"); } $which_message--; } } if (! close(BOGOFILTER)) { warn("Bogofilter for $output_file failed\n"); $good = 0; } if ($check_pid) { close(TOCHECK); waitpid($check_pid, 0); $good &&= ! $?; } warn("Status of $output_file is $good\n") if ($verbose); return $good; } sub start_preprocess { my $pid = fork(); die "fork: $!" if (! defined($pid)); if (! $pid) { exit(! &preprocess(@_)); } return $pid; } sub finish_preprocess { my($pid) = @_; waitpid $pid, 0; return ! $?; } my $notspam_file = "/tmp/notspam"; my $spam_file = "/tmp/bogospam"; if (! $keep) { my $notspam_handle = &start_preprocess( $notspam_file, \@notspam_files, $hamok, $since, "Spam", "--hamok"); my $bogospam_handle = &start_preprocess( $spam_file, \@bogospam_files, $spamok, $since, "Ham", "--spamok"); my $notspam_ok = &finish_preprocess($notspam_handle); my $bogospam_ok = &finish_preprocess($bogospam_handle); if (! ($notspam_ok && $bogospam_ok)) { warn "Preprocessing failed\n"; exit(1); } } my(@cmd) = ("bogotune", "-T", "0", "-n", $notspam_file, "-s", $spam_file); push(@cmd, "-D") if ($wordlist); push(@cmd, "-v") if ($verbose > 1); warn("Launching @cmd\n") if ($verbose); my $success = ! system(@cmd); if (! $success) { warn("@cmd failed\n"); if (! $wordlist) { warn("Trying again with internal wordlist\n"); push(@cmd, "-D"); warn("Launching @cmd\n"); $success = ! system(@cmd); if (! $success) { warn("@cmd failed\n"); } } } if ($success) { unlink($notspam_file, $spam_file); } exit(! $success);