#!/usr/bin/perl

# buildfilter.pl version 4, 20021120
# Copyright 2002 by Will Wagner <wwagner@ymb.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by 
# the Free Software Foundation; either version 2 of the License, or 
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# See also http://www.gnu.org/licenses/gpl.txt

# Changes
# 20021019 WHW - Added an 'allwords' hash, so we don't have any repetition
#                in the computation phase or the word-score.txt file.  Changed
#                the name to 'buildfilter' instead of 'spamfilter'; it's more
#                descriptive of what this thing does.
# 20021024 WHW - Added word pair probabilities.  Probably need to find some
#                way to pare down the data a bit, since the pairs bloat the
#                probability file by 400%.
# 20021105 WHW - Changed over to use DB_File instead of text or storable.
#                It's supported out of the box, and should be nice and fast.
#                Also added count_threshold for the lower limit on occurrences
#                of a word (or pair).  We can use it to both limit the size of
#                the word score file and make our filters a little more
#                focused.
# 20021120 WHW - We now use Mail::Box to hide the implementation details of our
#                mail installation.  Mail::Box::Manager is missing the function
#                of telling us the names of all the mailboxes in the maildir.
#                That seems like a pretty large oversight.  I rolled the spam
#                folder into the good folder loop, with some detection code;
#                it's a little cleaner.  I also moved the message parser into
#                its own routine.  It is now incredibly S-L-O-W, for what
#                reason I'm trying to figure out.  Turns out that it
#                uses WAY too much memory as well (I knew it used a lot, but
#                I didn't realize it was *that* much - ~450MB for my mail).
#                Got rid of the allwords hash to try to pare down the memory
#                requirements a little.
#
# To-do:
#              - Since we can just save the word counts of the bad mails, we
#                can get rid of all the spam; we can just reload the bad word
#                count, if it exists, on the start of our run, and add the new
#                messages to it.  It'll save both disk space and runtime.  I
#                don't think there's a whole lot we can do about the legit mail
#                folders; we'll probably want to hang onto all that.

use strict;
use Mail::Box::Manager;
use DB_File;

# Configuration:  set these to reflect your installation
# The directory where all your mail folders are stored
my $maildir = "$ENV{'HOME'}/mail";
# The name of the folder where you keep all the spam you've gotten
my $spamfolder = 'spam';
# The names of the folders you don't want to include in the "good" mails
my @ignore_folders = qw(sent-mail sf-news);
# Set this if you want to use pairs
our $use_pairs = 1;
# Set how many times a word must be seen before it is "important enough"
my $count_threshold = 5;

# Shouldn't need to change much of anything below here just to use this thing
$| = 1;

my (%bad_word_count, %good_word_count, %word_score);
my ($allgood, $allbad) = (0, 0);
my @folders;
my $msgcount = 0;
my $prevword;
my $mgr = new Mail::Box::Manager(folderdir => "$maildir");
if (-d $maildir)
{
    # This really should be handled for us by the Mail::Box::Manager
    opendir DIR, "$maildir" or die "can't open $maildir: $!\n";
    my @mboxes = readdir DIR;
    closedir DIR;
    @folders = sort grep !/^\./, @mboxes;
    # Get rid of the folders to be ignored
    for my $fold (@ignore_folders)
    {
	my @tmpfolders = grep !/^$fold$/, @folders;
	@folders = @tmpfolders;
    }
}
else
{
    die "your maildir $maildir is not a directory, check the configuration\n";
}

# Grab all the words out of each message
for my $folder (@folders)
{
    print "Doing folder $folder";
    print " (bad)" if ($folder eq $spamfolder);
    print "... ";
    $msgcount = 0;
    my $msgfolder = $mgr->open(folder => "$maildir/$folder");
    for my $msg ($msgfolder->messages)
    {
	$msgcount++;
	if ($folder eq $spamfolder)
	{
	    $allbad += &parse_message($msg, \%bad_word_count);
	}
	else
	{
	    $allgood += &parse_message($msg, \%good_word_count);
	}
    }
    $msgfolder->close;
    print "$msgcount\n";
}

print "Calculating word probabilities...\n";
tie %word_score,
    'DB_File', "$ENV{'HOME'}/.spamfilter/word-score.db",
    O_CREAT|O_RDWR, 0644, $DB_BTREE
    or die "Couldn't tie word score file: $!";
open OUT, ">$ENV{'HOME'}/.spamfilter/word-score.txt"
    or die "Couldn't open word score textfile: $!";
SCORELOOP: for (sort(keys(%good_word_count), keys(%bad_word_count)))
{
    next SCORELOOP if ($_ eq $prevword);
    my $g = 2 * $good_word_count{$_};
    my $b = $bad_word_count{$_};

    unless ($g + $b < $count_threshold)
    {
        $word_score{$_} = max(0.01, min(0.99, (min(1.0, ($b / $allbad)) / (min(1.0, ($g / $allgood)) + (min(1.0, ($b / $allbad)))))));
        print OUT "$_ $word_score{$_}\n";
    }
    $prevword = $_;
}
close OUT;
untie %word_score;
print "Done\n";

sub parse_message
{
    my ($msg, $words) = @_;
    my $prevword = '';
    my $count = 0;

    for ($msg->lines)
    {
	@_ = split /[^-\'\$A-Za-z0-9]+/;
	for (@_)
	{
	    unless (/^[0-9]+$/ || /^$/)
	    {
		$_ = lc $_;
		$words->{$_}++;
		$words->{"$prevword $_"}++ if ($use_pairs && $prevword);
		$count++;
		$prevword = $_;
	    }
        }
    }
    return $count;
}

sub min
{
    my ($a, $b) = @_;
    return $b if ($a > $b);
    return $a;
}

sub max
{
    my ($a, $b) = @_;
    return $a if ($a > $b);
    return $b;
}
