#!/usr/bin/perl

# buildfilter-text.pl version 2, 20021019
# Copyright 2002 by Will Wagner <wwagner@ymb.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by 
# the Free Software Foundation; either version 2 of the License, or 
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# See also http://www.gnu.org/licenses/gpl.txt

# Changes
# 20021019 WHW - Added an 'allwords' hash, so we don't have any repetition
#                in the computation phase or the word-score.txt file.  Changed
#                the name to 'buildfilter' instead of 'spamfilter'; it's more
#                descriptive of what this thing does.  The first version of
#                this file also had references to Storable, which I've removed.

use strict;
use warnings;

# Build the word-score table for the spam filter.
#
# Every word in ~/mail/spam is tallied as "bad"; every word in the other
# ~/mail/* folders (except sent-mail) is tallied as "good".  Each word that
# was seen often enough then gets a score between 0.01 and 0.99, written as
# one "word score" pair per line to ~/.spamfilter/word-score.txt.

my $home = $ENV{'HOME'};
die "HOME is not set; cannot locate mail folders\n"
    unless defined $home && length $home;

my (%bad_word_count, %good_word_count, %word_score, %allwords);
my ($allgood, $allbad) = (0, 0);
my $badfile   = "$home/mail/spam";
my @goodfiles = glob("$home/mail/*");
my $scorefile = "$home/.spamfilter/word-score.txt";

print "Doing bad file $badfile...\n";
$allbad += count_words($badfile, \%bad_word_count, \%allwords);

GOODFILE: for my $goodfile (@goodfiles)
{
    # The spam folder is the bad corpus, and sent mail is not incoming mail.
    next GOODFILE if $goodfile =~ m{/(?:spam|sent-mail)$};
    # The glob also returns sub-directories and the like; only read real files.
    next GOODFILE unless -f $goodfile;

    print "Doing good file $goodfile...\n";
    $allgood += count_words($goodfile, \%good_word_count, \%allwords);
}

open my $score_fh, '>', $scorefile or die "can't open $scorefile: $!\n";
print "Calculating word probabilities...\n";
for my $word (sort keys %allwords)
{
    # Good occurrences are counted double, which biases scores toward "good".
    my $good = 2 * ($good_word_count{$word} || 0);
    my $bad  = $bad_word_count{$word} || 0;

    # Words seen too rarely are left out of the table.
    next if $good + $bad < 5;

    # The sum cannot be zero here: the word passed the threshold above, so
    # at least one of its counts (and that corpus total) is non-zero.
    my $good_ratio = capped_ratio($good, $allgood);
    my $bad_ratio  = capped_ratio($bad,  $allbad);

    $word_score{$word} = max(0.01, min(0.99, $bad_ratio / ($good_ratio + $bad_ratio)));
    print {$score_fh} "$word $word_score{$word}\n";
}
# Buffered write errors only surface at close time.
close $score_fh or die "can't write $scorefile: $!\n";

# Tally the words of one mail file.
#
#   $file   - path of the file to read
#   $counts - hash ref; per-word counter for this corpus (incremented)
#   $seen   - hash ref; per-word counter across all corpora (incremented)
#
# Returns the number of words tallied.  Dies if the file cannot be opened.
sub count_words
{
    my ($file, $counts, $seen) = @_;
    my $total = 0;

    open my $fh, '<', $file or die "can't open $file: $!\n";
    while (my $line = <$fh>)
    {
        for my $token (split /[^-'\$A-Za-z0-9]+/, $line)
        {
            # split yields a leading empty field when the line starts with
            # a separator; that is not a word.
            next if $token eq '';
            # Purely numeric tokens are ignored.
            next if $token =~ /^[0-9]+$/;

            my $word = lc $token;
            $counts->{$word}++;
            $seen->{$word}++;
            $total++;
        }
    }
    close $fh;

    return $total;
}

# Return $count / $total capped at 1.0, or 0 when the corpus is empty
# (which avoids a division by zero when one side has no words at all).
sub capped_ratio
{
    my ($count, $total) = @_;
    return 0 if $total == 0;
    return min(1.0, $count / $total);
}

# Return the smaller of two numbers (the first one when they compare equal).
sub min
{
    my ($first, $second) = @_;
    return $first > $second ? $second : $first;
}

# Return the larger of two numbers (the second one when they compare equal).
sub max
{
    my ($first, $second) = @_;
    return $first > $second ? $first : $second;
}
