#!/usr/bin/perl
#
# filters blosxom comments based on a regular expression blacklist
#
######################### 
# $Id: blosxom-comment-filter.pl 139 2005-09-11 14:38:45Z steve $
#########################
# $Log$
# Revision 1.3  2005/09/11 14:38:45  steve
# better blacklist parsing
#
# Revision 1.2  2005/09/11 14:26:04  steve
# filters the body now
#
# Revision 1.1  2005/01/08 15:05:06  steve
# added
#
#########################

use strict;
use Data::Dumper;
$Data::Dumper::Indent = 1;

use Getopt::Std;

# Script body: parse the command line, load the blacklist from
# "<comment_dir>/.blacklist" and filter every comment file below comment_dir.

# Extract the revision number from the RCS keyword.  The non-greedy capture
# keeps the keyword's trailing space out of the version string.
my ($version) = '$Revision: 139 $' =~ /Revision:\s*(.+?)\s*\$/;

our ( $opt_h, $opt_V, $opt_n );
getopts("hVn");

usage() if $opt_h;
# Answer -V before insisting on a directory argument, so that the version can
# be queried without naming a comment directory.
die "$0 version: $version\n" if $opt_V;
usage() unless @ARGV > 0;

my $dir = $ARGV[0];
my $preview = $opt_n;    # read by filter_file: true means "never write"

my $blacklist = "$dir/.blacklist" ;
open my $bl_fh, '<', $blacklist or die " cannot open blacklist: $blacklist: $!\n";

# One regular expression per line; everything from the first '#' on is a
# comment.  Lines that are empty once the comment is stripped MUST be skipped:
# an empty pattern matches every string, so a single blank or comment-only
# line would otherwise flag every stored comment as spam.
my @blist;
while( my $line = <$bl_fh> ){
    my $line_no = $.;

    $line =~ s/\r?\n\z//;     # tolerate CRLF line endings as well as LF
    $line =~ s/\s*\#.*$//;
    next unless $line =~ /\S/;

    # Compile inside eval so that a broken pattern is reported together with
    # the place it came from.
    my $re = eval { qr/$line/ };
    die "bad regular expression in $blacklist line $line_no: $@"
	unless defined $re;

    push @blist, $re;
}

close $bl_fh;

print "read ", scalar @blist, " regular expressions in from blacklist file\n";

recurse_dir( $dir, \@blist );
# Walks a directory tree and hands every comment file to filter_file.
#
# Arguments:
#   $dir - directory to scan
#   $bl  - array reference of blacklist regular expressions, passed through
#          unchanged to filter_file
#
# Entries whose name starts with a dot are skipped; this covers "." and ".."
# as well as hidden files and directories.  Dies if $dir cannot be opened.
sub recurse_dir{
    my ( $dir, $bl ) = @_;

    print "reading directory $dir\n";

    # Read the whole listing first and close the handle before descending, so
    # that a deep tree does not keep one open directory handle per level.
    opendir my $dh, $dir or die "cannot open dir: $dir: $!\n";
    my @entries = readdir $dh;
    closedir $dh;

    foreach my $file ( @entries ){

	next if $file =~ /^\.{1,2}/;

	my $path = "$dir/$file";

	# Test for a directory first: a directory whose name happens to end in
	# "comments" has to be descended into, not read as a comment file.
	if( -d $path ){
	    recurse_dir( $path, $bl );
	}elsif( $file =~ /comments$/ ){
	    filter_file( $path, $bl );
	}
    }
}


# Removes blacklisted comments from a single comment file.
#
# Arguments:
#   $file - path of the comment file
#   $bl   - array reference of regular expressions; a comment is dropped as
#           soon as one of them matches its "url" or its "comment" field
#
# The file content is evaluated as Perl code and is expected to leave a hash
# reference in $saved_comments (the same form this sub writes back with
# Data::Dumper).  The file is rewritten only when at least one comment was
# removed and the file-level $preview flag is false.
#
# Dies when the file cannot be read or written; a file that cannot be
# evaluated is reported with a warning and left untouched.
sub filter_file{
    my( $file, $bl ) = @_;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or a pipe.
    open my $in, '<', $file or die "cannot read comment file: $file: $!\n";
    my $comments = join "", <$in>;
    close $in;

    # NOTE(review): string eval runs the file content as Perl code, so the
    # comment directory has to be trusted.  The lexical must keep the name
    # $saved_comments, which is the name used when the data is dumped below.
    my $saved_comments;
    eval $comments;
    if( $@ ){
	warn "cannot parse comment file: $file: $@";
	return;
    }

    # Nothing to filter when the file did not yield a hash of comments.
    return unless ref $saved_comments eq 'HASH';

    my $dirty = 0;
    foreach my $comment ( keys %{$saved_comments} ){
	my $entry = $saved_comments->{$comment};
	next unless ref $entry eq 'HASH';

	# Missing fields are matched as empty strings.
	my $url  = defined $entry->{url}     ? $entry->{url}     : '';
	my $body = defined $entry->{comment} ? $entry->{comment} : '';

	REGEX: foreach my $blre ( @$bl ){
	    if( $url =~ $blre || $body =~ $blre ){
		print "Deleting spam with URL of $url\n";
		delete $saved_comments->{$comment};
		$dirty = 1;
		last REGEX;
	    }
	}
    }

    if( $dirty && !$preview ){
	open my $out, '>', $file or die "cannot write to comment file: $file: $!\n";

	print {$out} Data::Dumper->Dump([$saved_comments], ["saved_comments"]);

	# Buffered write errors only show up at close time.
	close $out or die "cannot write to comment file: $file: $!\n";
	print "wrote cleaned comments to $file\n";
    }
}

# Shows the command-line help and ends the program.
#
# The help text is raised through die, so it is written to STDERR and the
# script exits with a non-zero status.  The text ends in a newline, so no
# "at ... line ..." suffix is appended.
sub usage(){
    my $help_text = <<"USAGE";
usage: $0 [options] comment_dir
 
cleans out spam in Blosxom comments based on the blacklist found in:
   state/comments/.blacklist
a good such blacklist is available here:
  http://www.jayallen.org/comment_spam/blacklist.txt

options:
    -n          do not actually touch the files, just show what would be deleted
    -h          this help
    -V          print version information

Copyright(C) 2004 Steve Pomeroy <steve\@staticfree.info>
Licensed under the GNU GPL. See documentation for complete details.
USAGE

    die $help_text;
}
