#!/usr/bin/perl

# runfilter.pl version 4, 20030119
# Copyright 2002, 2003 by Trinity Quirk <trinity@ymb.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by 
# the Free Software Foundation; either version 2 of the License, or 
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# See also http://www.gnu.org/licenses/gpl.txt

# 20021024 WHW - Added word pair probabilities
# 20021105 WHW - Changed to use DB_File instead of text or Storable.
# 20030119 WHW - Added some multi-level filtering, based on how spammy we think
#                the mail is.  It could go into maybe-spam, or spam, or just
#                be output (i.e. it's good).

use strict;
use DB_File;

my %word_score;
my %mail_word_scores;
my $default_prob = 0.49;
my $number_to_grab = 15;
my $spamfolder = "$ENV{'HOME'}/mail/spam";
my $maybefolder = "$ENV{'HOME'}/mail/maybe-spam";

# Tie up the score file
tie %word_score, 'DB_File', "$ENV{'HOME'}/.spamfilter/word-score.db", O_RDONLY, 0644, $DB_BTREE
    or die "Couldn't tie word score file: $!";

my $prevword;
my @lines = <>;
for (@lines)
{
    @_ = split /[^-'\$A-Za-z0-9]+/;
    for (@_)
    {
        unless (/^[0-9]+$/ || /^$/)
        {
            $_ = lc $_;
            # We don't care about the actual probability here, we just want to
            # know far from absolute neutral it is; i.e. how "interesting".
            $mail_word_scores{$_} = abs(0.5 - ($word_score{$_}
                                               or $default_prob));
            $mail_word_scores{"$prevword $_"} = abs(0.5 - ($word_score{"$prevword $_"} or $default_prob)) if ($prevword);
            $prevword = $_;
        }
    }
}

# Sort the words from this mail as to their "interestingness", biggest first.
my @words = sort { $mail_word_scores{$b} <=> $mail_word_scores{$a} } keys %mail_word_scores;
my $product = 1.0;
my $funk_product = 1.0;
# Now only process the most "interesting" $number_to_grab words.
for (@words[0 .. ($number_to_grab - 1)])
{
    $product *= ($word_score{$_} or $default_prob);
    $funk_product *= (1.0 - ($word_score{$_} or $default_prob));
}
untie %word_score;
my $real_score = $product / ($product + $funk_product);
if ($real_score >= 0.99)
{
    open OUT, ">>$spamfolder";
}
elsif ($real_score >= 0.9)
{
    open OUT, ">>$maybefolder";
}
else
{
    *OUT = *STDOUT;
}

my $flag = 0;
for (@lines)
{
    # At the first blank line, include the mail's score in the header
    if (!$flag && /^$/)
    {
        print OUT "X-Spamfilter-Score: $real_score\n";
        ++$flag;
    }
    print OUT "$_"
}
exit 0;
