#!/usr/bin/perl

# runfilter-text.pl version 1, 20021019
# Copyright 2002 by Will Wagner <wwagner@ymb.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# See also http://www.gnu.org/licenses/gpl.txt

use strict;

# Per-word score read from the score file, keyed by word.  The rest of the
# script treats each score as a probability between 0 and 1.
my %word_score;
# For every distinct word in the current mail: its distance from the neutral
# value 0.5, i.e. how "interesting" the word is.
my %mail_word_scores;
# Probability assumed for a word that has no usable entry in the score file.
my $default_prob = 0.4;
# How many of the most interesting words take part in the final score.
my $number_to_grab = 15;

# Load up the score file by hand.  Each line holds a word and its score
# separated by a single space.  The file is read one line at a time through a
# lexical handle, and the three-argument open keeps the path from ever being
# interpreted as an open mode.
open(my $score_fh, '<', "$ENV{'HOME'}/.spamfilter/word-score.txt")
    or die "can't load score file: $!";
while (my $entry = <$score_fh>)
{
    my @fields = split / /, $entry;
    # A blank line or a line without a score carries no information; skip it
    # instead of creating a key with an undefined value.
    next unless defined $fields[1];
    # The score is stored exactly as read, so the last field may still carry
    # its newline.  Perl ignores trailing whitespace when converting a string
    # to a number, and stripping it would turn a literal score of 0 into a
    # false value for truth-tested lookups.
    $word_score{$fields[0]} = $fields[1];
}
close $score_fh;

# Read the whole mail (files named on the command line, or standard input)
# and record how "interesting" each distinct word in it is.
my @lines = <>;
for my $line (@lines)
{
    # A word is a run of letters, digits, hyphens, apostrophes and dollar
    # signs; everything else separates words.
    for my $token (split /[^-'\$A-Za-z0-9]+/, $line)
    {
        # split returns an empty leading field when a line begins with a
        # separator.  That is not a word and must not compete for a place
        # among the most interesting words.
        next if $token eq '';
        # Purely numeric tokens are ignored.
        next if $token =~ /^[0-9]+$/;

        my $word = lc $token;
        # We don't care about the actual probability here, we just want to
        # know how far from absolute neutral it is; i.e. how "interesting".
        # Words without a usable score fall back to $default_prob.
        my $prob = $word_score{$word} || $default_prob;
        $mail_word_scores{$word} = abs(0.5 - $prob);
    }
}

# Sort the words from this mail as to their "interestingness", biggest first.
# Ties are broken alphabetically so the same mail always gets the same score
# regardless of hash ordering.
my @words = sort {
    $mail_word_scores{$b} <=> $mail_word_scores{$a} or $a cmp $b
} keys %mail_word_scores;

# Now only process the most "interesting" $number_to_grab words.  A short mail
# may contain fewer than that, so clamp the slice to the words that exist
# rather than padding it with undefined entries.
my $last_index = $number_to_grab - 1;
$last_index = $#words if $last_index > $#words;

my $product = 1.0;
my $funk_product = 1.0;
for my $word (@words[0 .. $last_index])
{
    # Use the same probability that ranked the word: its score from the file,
    # or $default_prob when it has none.  Multiplying by a missing score
    # would silently force the combined result to zero.
    my $prob = $word_score{$word} || $default_prob;
    $product *= $prob;
    $funk_product *= (1.0 - $prob);
}

# Combine the individual probabilities.  When both products are zero (for
# example scores of exactly 0 and exactly 1 in the same mail) the evidence
# cancels out; treat that as neutral instead of dividing by zero.
my $total = $product + $funk_product;
my $real_score = $total ? $product / $total : 0.5;

# A combined score of 0.9 or more echoes the mail to standard output and
# exits 0; anything lower produces no output and exits 1.
if ($real_score >= 0.9)
{
    print @lines;
    exit 0;
}
else
{
    exit 1;
}
