#!/usr/bin/perl

# runfilter.pl version 3, 20021105
# Copyright 2002 by Will Wagner <wwagner@ymb.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by 
# the Free Software Foundation; either version 2 of the License, or 
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# See also http://www.gnu.org/licenses/gpl.txt

# 20021024 WHW - Added word pair probabilities
# 20021105 WHW - Changed to use DB_File instead of text or Storable.

use strict;
use DB_File;

my %word_score;
my %mail_word_scores;
my $default_prob = 0.49;
my $number_to_grab = 15;

# Tie up the score file
tie %word_score, 'DB_File', "$ENV{'HOME'}/.spamfilter/word-score.db", O_RDONLY, 0644, $DB_BTREE
    or die "Couldn't tie word score file: $!";

my $prevword;
my @lines = <>;
for (@lines)
{
    @_ = split /[^-'\$A-Za-z0-9]+/;
    for (@_)
    {
        unless (/^[0-9]+$/ || /^$/)
        {
            $_ = lc $_;
            # We don't care about the actual probability here, we just want to
            # know far from absolute neutral it is; i.e. how "interesting".
            $mail_word_scores{$_} = abs(0.5 - ($word_score{$_}
                                               or $default_prob));
            $mail_word_scores{"$prevword $_"} = abs(0.5 - ($word_score{"$prevword $_"} or $default_prob)) if ($prevword);
            $prevword = $_;
        }
    }
}

# Sort the words from this mail as to their "interestingness", biggest first.
my @words = sort { $mail_word_scores{$b} <=> $mail_word_scores{$a} } keys %mail_word_scores;
my $product = 1.0;
my $funk_product = 1.0;
# Now only process the most "interesting" $number_to_grab words.
for (@words[0 .. ($number_to_grab - 1)])
{
    $product *= ($word_score{$_} or $default_prob);
    $funk_product *= (1.0 - ($word_score{$_} or $default_prob));
}
untie %word_score;
my $real_score = $product / ($product + $funk_product);
if ($real_score >= 0.9)
{
    for (@lines) { print "$_" }
    exit 0;
}
else
{
    exit 1;
}
