#!/usr/bin/perl

# buildfilter.pl version 3, 20021105
# Copyright 2002 by Trinity Quirk <trinity@ymb.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by 
# the Free Software Foundation; either version 2 of the License, or 
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# See also http://www.gnu.org/licenses/gpl.txt

# Changes
# 20021019 WHW - Added an 'allwords' hash, so we don't have any repetition
#                in the computation phase or the word-score.txt file.  Changed
#                the name to 'buildfilter' instead of 'spamfilter'; it's more
#                descriptive of what this thing does.
# 20021024 WHW - Added word pair probabilities.  Probably need to find some
#                way to pare down the data a bit, since the pairs bloat the
#                probability file by 400%.
# 20021105 WHW - Changed over to use DB_File instead of text or storable.
#                It's supported out of the box, and should be nice and fast.
#                Also added count_threshold for the lower limit on occurrences
#                of a word (or pair).  We can use it to both limit the size of
#                the word score file and make our filters a little more focused.

use strict;
use DB_File;

my $count_threshold = 5;
my (%bad_word_count, %good_word_count, %word_score, %allwords);
my ($allgood, $allbad) = (0, 0);
my $badfile = "$ENV{'HOME'}/mail/spam";
my @goodfiles = glob("$ENV{'HOME'}/mail/*");
my $prevword;

print "Doing bad file $badfile...\n";
open BAD, "<$badfile" or die "can't open $badfile: $!\n";
for (<BAD>)
{
    $prevword = '' if (/^From /);
    @_ = split /[^-'\$A-Za-z0-9]+/;
    for (@_)
    {
        unless (/^[0-9]+$/ || /^$/)
        {
            $_ = lc $_;
            $bad_word_count{$_}++;
            $bad_word_count{"$prevword $_"}++ if ($prevword);
            $allwords{$_}++;
            $allwords{"$prevword $_"}++ if ($prevword);
            $allbad++;
            $prevword = $_;
        }
    }
}
close BAD;

DOGOODFILE: for (@goodfiles)
{
    next DOGOODFILE if ($_ =~ /\/spam$/ || $_ =~ /\/sent-mail$/ || $_ =~ /\/sf-news$/);
    print "Doing good file $_...\n";
    open GOOD, "<$_" or die "can't open $_: $!\n";
    for (<GOOD>)
    {
        @_ = split /[^-'\$A-Za-z0-9]+/;
        for (@_)
        {
            unless (/^[0-9]+$/ || /^$/)
            {
                $_ = lc $_;
                $good_word_count{$_}++;
                $allwords{$_}++;
                $allgood++;
            }
        }
    }
    close GOOD;
}

print "Calculating word probabilities...\n";
tie %word_score, 'DB_File', "$ENV{'HOME'}/.spamfilter/word-score.db", O_CREAT|O_RDWR, 0644, $DB_BTREE
    or die "Couldn't tie word score file: $!";
for (sort keys(%allwords))
{
    my $g = 2 * $good_word_count{$_};
    my $b = $bad_word_count{$_};

    unless ($g + $b < $count_threshold)
    {
        $word_score{$_} = max(0.01, min(0.99, (min(1.0, ($b / $allbad)) / (min(1.0, ($g / $allgood)) + (min(1.0, ($b / $allbad)))))));
    }
}
untie %word_score;
print "Done\n";

sub min
{
    my ($a, $b) = @_;
    return $b if ($a > $b);
    return $a;
}

sub max
{
    my ($a, $b) = @_;
    return $a if ($a > $b);
    return $b;
}
