#!/usr/bin/perl # Copyright (C) 2010, 2017 Jonathan Kamens # # The current version of this script is available from # . # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. use strict; use warnings; use Email::Simple; use Errno; use Fcntl qw(SEEK_SET); use File::Basename; use Getopt::Long; use Mail::Send; use MIME::Base64; use MIME::QuotedPrint; use Net::IMAP::Simple; use Text::Wrap; # 1. Acquire lock. # 2. Open mailbox. # 3. For each message: # a. Fetch message ID. # b. Find in notspam. # c. Train bogofilter. # d. Add to bogospam file if set. # e. Remove from notspam. # f. Send to SpamCop. # g. Copy to bogospam mailbox if set. # h. Remove from mailbox. # 4. Close mailbox. # 5. Release lock. # Assumes nobody else is modifying the notspam or bogospam files while # this is running, so do not run it at the time of day when log files # are rotated! use vars qw($host_name $user_name $password $domain $spamcop_addr $no_spamcop_regexp $dryrun $spamcop $do_one $despam $retrain $to_mailbox $mailbox $debug $check_already_moved $redeliver $redeliver_cmd); my $whoami = basename $0; # You can put variable settings in the config file or just hard-code them # in the script. my $config_file = "$ENV{'HOME'}/.$whoami.cf"; # Most interesting configurable settings # Host name, username and password for your IMAP account $host_name = ''; $user_name = ''; $password = ''; # Your email domain name, used when canonicalizing messages to try to # match what's in your IMAP folder to what's in your ham / spam # archive. It's not essential. $domain = ''; # Your personal SpamCop submission-by-email address, i.e., # submit.whatever@spam.spamcop.net. If you don't set this, then # SpamCop submission will always be disabled. $spamcop_addr = ''; # Messages matching this regexp will not be submitted to SpamCop even # when SpamCop submission is enabled. $no_spamcop_regexp = ''; # If you specify --redeliver, then after processing the message, the # script will pipe it through the command below to redeliver. The # string in the command line will be replaced with either the # USER or LOGNAME environment variable. If this variable is not set, # then --redeliver does nothing. $redeliver_cmd = "/usr/sbin/sendmail -oi "; # End most interesting configurable settings. You can change things # below this, but you might not need to. $dryrun = 0; $spamcop = undef; $do_one = 0; $despam = 0; $retrain = 0; my $usage = "Usage: $whoami [--dryrun] [--debug [...]] [--[no]spamcop] [--mailbox folder] [--despam] [--[no]retrain] [--check-already-moved] [--redeliver] [--from-hint regexp]\n"; my $lock_file = "$ENV{'HOME'}/.$whoami.lock"; # Folder into which to save copies of all processed messages. # Probably just useful for debugging purposes. $to_mailbox = ''; $mailbox = 'spamtrain'; $debug = 0; $check_already_moved = undef; my $use_imap_text = undef; my $cl_config_file; my $from_hint = undef; die $usage if (! GetOptions("dryrun" => \$dryrun, "debug+" => \$debug, "spamcop!" => \$spamcop, "mailbox|folder=s" => \$mailbox, "one" => \$do_one, "despam" => \$despam, "retrain" => \$retrain, "check-already-moved" => \$check_already_moved, "config-file" => \$cl_config_file, "redeliver" => \$redeliver, "use-imap-text!" => \$use_imap_text, "from-hint=s" => \$from_hint, )); $config_file = $cl_config_file if ($cl_config_file); if (-f $config_file) { do $config_file or die "$whoami: Error running $config_file: $@\n"; } elsif ($cl_config_file) { die "$whoami: config file $config_file does not exist\n"; } $spamcop = 0 if (! $spamcop_addr); die "Must specify --[no]spamcop with --despam\n" if ($despam && ! defined($spamcop)); $spamcop = 1 if (! defined($spamcop)); my($from_base, $to_base); if ($despam) { $from_base = "bogospam"; $to_base = "notspam"; } else { $to_base = "bogospam"; $from_base = "notspam"; } die "$whoami: IMAP server name, username and password must be set\n" if (! ($host_name and $user_name and $password)); my $domain_re = $domain ? qr/\Q$domain\E/oi : undef; my(@from_files) = ("$ENV{'HOME'}/Mail/$from_base", glob("$ENV{'HOME'}/Mail/$from_base.?"), glob("$ENV{'HOME'}/Mail/$from_base.?.bz2"), glob("$ENV{'HOME'}/Mail/$from_base.??.bz2"), glob("$ENV{'HOME'}/Mail/$from_base.???.bz2")); if ($from_hint) { @from_files = grep(/$from_hint/o, @from_files); if (! @from_files) { die "Regexp $from_hint doesn't match any from files\n"; } } my(@to_files) = ("$ENV{'HOME'}/Mail/$to_base", glob("$ENV{'HOME'}/Mail/$to_base.?"), glob("$ENV{'HOME'}/Mail/$to_base.?.bz2"), glob("$ENV{'HOME'}/Mail/$to_base.??.bz2"), glob("$ENV{'HOME'}/Mail/$to_base.???.bz2")); my $to_file = "$ENV{'HOME'}/Mail/$to_base"; &acquire_lock($lock_file); my $imap = &open_mailbox($host_name, $user_name, $password, $mailbox); print $imap->last, " message(s) in $mailbox\n" if ($debug); foreach my $message (1..$imap->last) { print "Processing message $message\n" if ($debug); if ($imap->deleted($message)) { print "Skipping deleted message\n" if ($debug); next; } &process_message($imap, $message, \@from_files, \@to_files, $to_file, $to_mailbox); last if ($do_one); } &close_mailbox($imap); &release_lock($lock_file); sub acquire_lock { print "acquire_lock start\n" if ($debug > 1); my($lock_file) = @_; my $tmp_lock = "$lock_file.$$"; open(LOCK, ">", $tmp_lock) or die; print(LOCK "$$\n") or die; close(LOCK) or die; retry: if (! link($tmp_lock, $lock_file)) { if ($!{EEXIST}) { my $old_pid = do $lock_file; if ($old_pid) { if (! kill(0, $old_pid)) { if ($!{ESRCH}) { unlink_old_lock: if (! unlink($lock_file)) { die; } goto retry; } die; } die; } goto unlink_old_lock; } die; } unlink($tmp_lock); print "acquire_lock end\n" if ($debug > 1); return 1; } sub open_mailbox { print "open_mailbox start\n" if ($debug > 1); my($host_name, $user_name, $password, $folder) = @_; my $imap = Net::IMAP::Simple->new($host_name) or die $Net::IMAP::Simple::errstr; if (! $imap->login($user_name, $password)) { die $imap->errstr; } $imap->select($folder); print "open_mailbox end\n" if ($debug > 1); return $imap; } sub process_message { print "process_message start\n" if ($debug > 1); my($imap, $message, $from_ref, $to_ref, $to_file, $to_mailbox) = @_; my(@from_files) = @{$from_ref}; my(@to_files) = @{$to_ref}; my $entire_message = join('', @{$imap->get($message)}); $entire_message =~ s/\r\n/\n/g; my $es = Email::Simple->new(join '', @{$imap->top($message)}) or die; my $message_id = $es->header('message-id'); my $subject = ($es->header('subject') || ''); my $bogosity = ($es->header('x-bogosity') || ''); my $milter_id; my $no_bogofilter; if ($bogosity and $bogosity =~ /\bmilter_id=([[:xdigit:].]+)/) { $milter_id = $1; } my($no_message_id) = $entire_message; if ($message_id) { $no_message_id =~ s/^message-id.*\n//im; } my($from_file, $text, $begin_offset) = &find_in_mailboxes($message_id, $milter_id, $entire_message, @from_files); my $mi_str = $message_id || "no-message-id"; if (! $text) { if ($check_already_moved or ($despam && (! $bogosity || $bogosity =~ /^unsure/i))) { (undef, $text, undef) = &find_in_mailboxes($message_id, $milter_id, $entire_message, @to_files); if (! $text) { if (! ($no_bogofilter = ! $bogosity)) { if ($use_imap_text) { warn "Using IMAP text for $mi_str ($subject)\n"; $text = $entire_message; $from_file = undef; } else { warn "Can't find $mi_str ($subject) in $from_files[0] ... or $to_files[0] ...\n"; return; } } } elsif ($check_already_moved and not ($despam && (! $bogosity || $bogosity =~ /^unsure/i))) { warn "Message $mi_str ($subject) has already been moved\n"; return; } } else { if (! ($no_bogofilter = ! $bogosity)) { if ($use_imap_text) { warn "Using IMAP text for $mi_str ($subject)\n"; $text = $entire_message; $from_file = undef; } else { warn "Can't find $mi_str ($subject) in $from_files[0] ...\n"; return; } } } } if (! $dryrun) { if (! $no_bogofilter) { &train_bogofilter($text, $bogosity); if ($to_file) { &add_to_mailbox($text, $to_file); } if ($from_file) { &remove_from_mailbox($from_file, $begin_offset, length $text); } print "checking if spamcop is wanted start\n" if ($debug > 1); if ($spamcop_addr && $spamcop && $entire_message !~ /$no_spamcop_regexp/o) { if ($entire_message !~ /\n\n\s*\S/) { # This print is only temporary until I make sure # this new code is working as expected. print "Not sending empty message to SpamCop:\n$entire_message\n"; } else { print "spamcop start\n" if ($debug > 1); my $msg = new Mail::Send; $msg->to($spamcop_addr); my $fh = $msg->open; print($fh "\n", $entire_message); $fh->close or warn "Mail to spamcop failed\n"; print "spamcop end\n" if ($debug > 1); } } print "checking if spamcop is wanted end\n" if ($debug > 1); } if ($to_mailbox) { $imap->copy($message, $to_mailbox); } $imap->delete($message); if ($redeliver and $redeliver_cmd) { my $user = $ENV{'USER'} || $ENV{'LOGNAME'}; if ($user) { my $cmd = $redeliver_cmd; $cmd =~ s/\/$user/g; if (! (open(REDELIVER, "|-", $cmd) and print(REDELIVER $entire_message) and close(REDELIVER))) { warn "Redelivery command $cmd failed\n"; } } } } print "process_message end\n" if ($debug > 1); } sub close_mailbox { print "close_mailbox start\n" if ($debug > 1); my($imap) = @_; $imap->quit; print "close_mailbox end\n" if ($debug > 1); } sub release_lock { print "release_lock start\n" if ($debug > 1); my($lock_file) = @_; unlink($lock_file) or die; print "release_lock end\n" if ($debug > 1); } # Returns: file name, message text, begin offset sub find_in_mailboxes { print "find_in_mailboxes start\n" if ($debug > 1); my($message_id, $milter_id, $entire_message, @files) = @_; my($canonical) = &canonicalize_message($entire_message); foreach my $file (@files) { my($text, $offset) = &find_in_mailbox($message_id, $milter_id, $canonical, $file); if (defined($text)) { print "find_in_mailboxes end (success)\n" if ($debug > 1); return($file, $text, $offset); } } print "find_in_mailboxes end (failure)\n" if ($debug > 1); return (); } # Returns text and beginning offset sub find_in_mailbox { print "find_in_mailbox start\n" if ($debug > 1); my($message_id, $milter_id, $canonical, $file) = @_; my $op; if ($file =~ /\.bz2$/) { $file = "bzcat $file"; $op = "-|"; } elsif ($file =~ /\.gz$/) { $file = "zcat $file|"; $op = "-|"; } else { $op = "<"; } open(MAILBOX, $op, $file) or die; my $offset = 0; my($text) = ""; my($header) = ""; my($in_header) = 1; my($line); while ($line = ) { if ($line =~ /^From /) { if ($header && &check($message_id, $milter_id, $header, $text, $canonical)) { close(MAILBOX); print "find_in_mailbox end (success)\n" if ($debug > 1); return($text, $offset); } $offset += length $text; $text = $header = ""; $in_header = 1; } $text .= $line; if ($in_header) { $header .= $line; if ($line =~ /^$/) { $in_header = undef; } } } close(MAILBOX); if (&check($message_id, $milter_id, $header, $text, $canonical)) { print "find_in_mailbox end (success)\n" if ($debug > 1); return ($text, $offset); } else { print "find_in_mailbox end (failure)\n" if ($debug > 1); return (); } } sub check { print "check start\n" if ($debug > 1); my($id, $milter_id, $header, $text, $canonical) = @_; my $ret = &check_ids($id, $milter_id, $header); if (defined($ret)) { print "check end (check_ids=$ret)\n" if ($debug > 1); return $ret; } $text = &canonicalize_message($text); $ret = $canonical eq $text; print "check end (canonicalize=$ret)\n" if ($debug > 1); return $ret; } sub diff { print "diff start\n" if ($debug > 1); my($a, $b) = @_; open(OUT, ">/tmp/spamtrain.1") or die; print(OUT $a) or die; close(OUT) or die; open(OUT, ">/tmp/spamtrain.2") or die; print(OUT $b) or die; close(OUT) or die; system("diff -u /tmp/spamtrain.1 /tmp/spamtrain.2"); print "diff end\n" if ($debug > 1); } sub canonicalize_message { print "canonicalize_message start\n" if ($debug > 1); local($_) = @_; my($header, $body) = split(/\n\n/, $_, 2); $body ||= ""; $header .= "\n"; $header =~ s/[^ -~\t\n]/X/g; if ($header =~ s/^From\s+(\S.*\S)\s+... ... .. ..:..:.. ....\n//) { my $from_addr = $1; if ($header !~ /^From:/mig) { $header =~ s/(\n+)$/\nFrom: $from_addr$1/; } } $header =~ s/^((?i)(?:From|To|Reply-To):)\s*/$1 /mig; $header =~ s/^((?i)(?:From|To|Reply-To):) ([^\"\s].*[^\"\s])(\s*\<)\s*((?:\S(?:.*\S)?)?)\s*(\>)/$1 \"$2\"$3$4$5/mig; $header =~ s/^((?:From|To|CC|Reply-To):)( .*\n(?:[ \t]+\S.*\n)*)/$1\L$2\E/mig; my $to = ""; while ($header =~ s/^((?:From|To|Cc):.*\n(?:[ \t]+\S.*\n)*)//mi) { my $this = $1; # Canonicalize whitespace $this =~ s/\s*\n[ \t]+/ /g; $this =~ s/,\s*/, /g; $this =~ s/\s\s+$/\n/g; # Sendmail will append the domain to addresses that don't have it $this =~ s/\@$domain_re//g if ($domain_re); # Sendmail will convert spaces to periods in some cases, so # convert all periods back to spaces for canonicalization # purposes. $this =~ s/\./ /g; $to .= $this; } $header = $to . $header; $header =~ s/^return-path:.*\n//mi; $header =~ s/^date:.*\n//mi; $header =~ s/^sender:.*\n//mi; $header =~ s/^received:.*\n(?:[ \t]+.*\n)*//mig; $header =~ s/^x-sieve:.*\n//mi; $header =~ s/^message-id:.*\n//mi; $header =~ s/^x-bogosity:.*\n//mi; $header =~ s/^x-mime-autoconverted: from (?:base64|quoted-printable) to 8bit.*\n//mi; while ($body =~ s/\r([^\n])/$1/g) {} if ($header =~ /^Content-Type:\s*text\/plain\b/im) { my $fix; if ($header =~ /^Content-Transfer-Encoding:\s*base64[ \t]*\n/im and (my $decoded = decode_base64($body))) { $fix = 1; $body = $decoded; } elsif ($header =~ /^Content-Transfer-Encoding:\s*quoted-printable[ \t]*\n/im and ($decoded = decode_qp($body))) { $fix = 1; $body = $decoded; } if ($fix) { $header =~ s/^(Content-Transfer-Encoding:\s*)(?:base64|quoted-printable)([ \t]*\n)//im; $header .= "${1}8bit${2}"; } } if ($body) { $body =~ s/\r\n/\n/g; $body =~ s/\s*$/\n/; $body =~ s/^\s*\n//; $body =~ s/[ \t]+\n/\n/g; $body =~ s/^(content-id:)\s*(\S)/$1 $2/mig; if ($body =~ /[^\n]{1000}/) { $body = fill('', '', $body); } } print "canonicalize_message end\n" if ($debug > 1); $header . "\n" . $body; } # Return value: # undef - header has no IDs # 0 - does not match # 1 - matches sub check_ids { print "check_ids start\n" if ($debug > 1); my($message_id, $milter_id, $header) = @_; return if (! $header); my $es = Email::Simple->new($header); my $bogosity; if ($milter_id and ($bogosity = $es->header('x-bogosity')) and ($bogosity =~ /\bmilter_id=([[:xdigit:].]+)/)) { if ($milter_id eq $1) { print "check_ids end (Milter ID match: $milter_id)\n" if ($debug); return 1; } else { print "check_ids end (Milter ID mismatch)\n" if ($debug > 1); return 0; } } elsif ($message_id and my $msgid = $es->header('message-id')) { my $orig = $msgid; my $nonascii; if ($msgid =~ s/[^ -~]/X/g) { $nonascii++; } if ($msgid eq $message_id) { if ($nonascii) { warn "Converted nonascii message ID: $orig -> $msgid\n"; } print "check_ids end (message ID match)\n" if ($debug > 1); return 1; } else { print "check_ids end (message ID mismatch)\n" if ($debug > 1); return 0; } } else { print "check_ids end (failure)\n" if ($debug > 1); return undef; } } sub train_bogofilter { print "train_bogofilter start\n" if ($debug > 1); my($text, $bogosity) = @_; my $cmd = "bogofilter"; my($classify, $unclassify, $retraining); $unclassify = ""; if ($despam) { $classify = "-n"; if ($bogosity =~ /^ham\b/i) { $retraining = 1; } elsif ($bogosity =~ /^spam\b/i) { $unclassify = "-S"; } } else { $classify = "-s"; if ($bogosity =~ /^spam\b/i) { $retraining = 1; } elsif ($bogosity =~ /^ham\b/i) { $unclassify = "-N"; } } if (! $retrain and $retraining) { print "Skipping training because already trained properly\n" if ($debug); return; } if (! $unclassify) { print "Not unclassifying because previously unclassified\n" if ($debug); } $cmd = "$cmd $unclassify $classify"; print "Calling $cmd\n" if ($debug); open(BOGOFILTER, "|-", $cmd) or die; print(BOGOFILTER $text) or die; close(BOGOFILTER) or die; die if ($?); print "train_bogofilter end\n" if ($debug > 1); } sub add_to_mailbox { print "add_to_mailbox start\n" if ($debug > 1); my($text, $file) = @_; open(MAILBOX, ">>", $file) or die; print(MAILBOX $text) or die; close(MAILBOX) or die; print "add_to_mailbox end\n" if ($debug > 1); } sub remove_from_mailbox { print "remove_from_mailbox start\n" if ($debug > 1); my($file, $begin, $length) = @_; if ($file =~ /\.(?:bz2|gz)$/) { return &remove_from_compressed_mailbox($file, $begin, $length); } open(MAILBOX, "+<", $file) or die; seek(MAILBOX, $begin + $length, SEEK_SET) or die; local($/) = undef; my $after = ; seek(MAILBOX, $begin, SEEK_SET) or die; print(MAILBOX $after) or die; truncate(MAILBOX, $begin + length($after)) or die; close(MAILBOX) or die; print "remove_from_mailbox end\n" if ($debug > 1); } sub remove_from_compressed_mailbox { print "remove_from_compressed_mailbox start\n" if ($debug > 1); my($file, $begin, $length) = @_; my($uncompress_cmd, $compress_cmd); if ($file =~ s/(\.bz2)$//) { $uncompress_cmd = "bunzip2 $file$1"; $compress_cmd = "bzip2 $file"; } elsif ($file =~ s/(\.gz)$//) { $uncompress_cmd = "gunzip $file$1"; $compress_cmd = "gzip $file"; } else { die "Unrecognized suffix on $file"; } system($uncompress_cmd) && die; &remove_from_mailbox($file, $begin, $length); system($compress_cmd) && die; print "remove_from_compressed_mailbox end\n" if ($debug > 1); }