#!/usr/bin/perl -w
getopts(
"l:L:h"
);
$opt_h
$opt_l
$opt_L
};
sub
usage {
die
"find-extremes [-l LC] [-L LC] [spam
log
] [nonspam
log
]
-l LC also
print
language specific rules
for
lang code LC (or
'all'
)
-L LC only
print
language specific rules
for
lang code LC (or
'all'
)
options -l and -L are mutually exclusive.
if
either the spam or and nonspam logs are unspecified, the defaults
are \"spam.
log
\" and \"nonspam.
log
\" in the cwd.
";
}
usage()
if
(
$opt_h
|| (
$opt_l
&&
$opt_L
));
$lower
= 1;
$higher
= 9;
$min_expected
= 2;
my
%freq_spam
= ();
my
%freq_over_higher_falsepos
= ();
my
%freq_nonspam
= ();
my
%freq_under_lower_falseneg
= ();
my
%over_expected_falsepos
= ();
my
%over_expected_falseneg
= ();
my
%ratio_expected_falsepos
= ();
my
%ratio_expected_falseneg
= ();
my
$num_spam
= 0;
my
$num_nonspam
= 0;
my
$num_over_higher_falsepos
= 0;
my
$num_under_lower_falseneg
= 0;
my
$ok_lang
=
''
;
readscores();
$ok_lang
=
lc
(
$opt_l
||
$opt_L
||
''
);
if
(
$ok_lang
eq
'all'
) {
$ok_lang
=
'.'
; }
foreach
my
$key
(
keys
%rules
) {
if
( (
$opt_L
&& !
$rules
{
$key
}->{lang}) ||
(
$rules
{
$key
}->{lang} &&
(!
$ok_lang
||
$rules
{
$key
}->{lang} !~ /^
$ok_lang
/i)
) ) {
delete
$rules
{
$key
} ;
next
;
}
if
(
$rules
{
$key
}->{tflags} =~ m/net/) {
delete
$rules
{
$key
};
next
;
}
if
(
$rules
{
$key
}->{tflags} !~ m/userconf/) {
if
(
$rules
{
$key
}->{tflags} =~ m/nice/) {
$freq_nonspam
{
$key
} = 0;
$freq_under_lower_falseneg
{
$key
} = 0;
}
else
{
$freq_spam
{
$key
} = 0;
$freq_over_higher_falsepos
{
$key
} = 0;
}
}
}
readlogs();
unless
((
$num_over_higher_falsepos
>=
$min_expected
)
&& (
$num_under_lower_falseneg
>=
$min_expected
)) {
die
"Insufficient extremes in dataset ("
.
$num_over_higher_falsepos
.
" "
.
$num_under_lower_falseneg
.
"); stopped"
;
}
sub
chisquare {
my
(
$a
,
$b
,
$c
,
$d
) =
@_
;
my
$chisquare
=
(((
$a
*$d
-
$b
*$c
)**2)*(
$a
+
$b
+
$c
+
$d
))/
((
$a
+
$d
)*(
$c
+
$d
)*(
$b
+
$d
)*(
$a
+
$c
));
if
(
$chisquare
>= 6.64) {
return
$chisquare
,0.01;
}
elsif
(
$chisquare
>= 3.841) {
return
$chisquare
,0.05;
}
else
{
return
$chisquare
,.5;
}
}
my
$ratio_falsepos
=
$num_over_higher_falsepos
/
$num_spam
;
my
$ratio_falseneg
=
$num_under_lower_falseneg
/
$num_nonspam
;
my
$skipped_non_nice
= 0;
foreach
$rule
(
keys
%freq_spam
) {
my
$expected
=
$freq_spam
{
$rule
}
*$ratio_falsepos
;
if
(
$expected
<=
$min_expected
) {
$skipped_non_nice
++;
next
;
}
$over_expected_falsepos
{
$rule
} =
$freq_over_higher_falsepos
{
$rule
} -
$expected
;
$ratio_expected_falsepos
{
$rule
} =
$freq_over_higher_falsepos
{
$rule
}/
$expected
;
(
$chisquare
{
$rule
},
$prob
{
$rule
}) =
chisquare(
$num_spam
,
$num_over_higher_falsepos
,
$freq_spam
{
$rule
},
$freq_over_higher_falsepos
{
$rule
});
if
(
$freq_over_higher_falsepos
{
$rule
} <
$expected
) {
$chisquare
{
$rule
} *= -1;
}
}
warn
"Skipped non-nice: $skipped_non_nice\n"
;
my
$skipped_nice
= 0;
foreach
$rule
(
keys
%freq_nonspam
) {
my
$expected
=
$freq_nonspam
{
$rule
}
*$ratio_falseneg
;
if
(
$expected
<=
$min_expected
) {
$skipped_nice
++;
next
;
}
$over_expected_falseneg
{
$rule
} =
$freq_under_lower_falseneg
{
$rule
} -
$expected
;
$ratio_expected_falseneg
{
$rule
} =
$freq_under_lower_falseneg
{
$rule
}/
$expected
;
(
$chisquare
{
$rule
},
$prob
{
$rule
}) =
chisquare(
$num_nonspam
,
$num_under_lower_falseneg
,
$freq_nonspam
{
$rule
},
$freq_under_lower_falseneg
{
$rule
});
if
(
$freq_under_lower_falseneg
{
$rule
} <
$expected
) {
$chisquare
{
$rule
} *= -1;
}
}
warn
"Skipped nice: $skipped_nice\n"
;
@rules_falsepos
=
grep
{
$prob
{
$_
} < .5} (
keys
%over_expected_falsepos
);
if
(
scalar
(
@rules_falsepos
)) {
print
"RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n"
;
my
(
@rules_falsepos_bad
) =
grep
{
$chisquare
{
$_
} > 0} (
@rules_falsepos
);
if
(
scalar
(
@rules_falsepos_bad
)) {
@rules_falsepos_bad
=
sort
{
(
$chisquare
{
$b
} <=>
$chisquare
{
$a
}) ||
(
$ratio_expected_falsepos
{
$b
} <=>
$ratio_expected_falsepos
{
$a
}) ||
(
$over_expected_falsepos
{
$b
} <=>
$over_expected_falsepos
{
$a
}) ||
(
$freq_over_higher_falsepos
{
$b
} <=>
$freq_over_higher_falsepos
{
$a
})} (
@rules_falsepos_bad
);
foreach
$rule
(
@rules_falsepos_bad
) {
print
$rule
.
"\t"
.
$prob
{
$rule
} .
"\t"
.
$ratio_expected_falsepos
{
$rule
} .
"\t"
.
$over_expected_falsepos
{
$rule
} .
"\t"
.
$freq_over_higher_falsepos
{
$rule
} .
"\n"
;
}
}
my
(
@rules_falsepos_good
) =
grep
{
$chisquare
{
$_
} < 0} (
@rules_falsepos
);
if
(
scalar
(
@rules_falsepos_good
)) {
print
"###\n"
;
@rules_falsepos_good
=
sort
{
(
$chisquare
{
$a
} <=>
$chisquare
{
$b
}) ||
(
$ratio_expected_falsepos
{
$a
} <=>
$ratio_expected_falsepos
{
$b
}) ||
(
$freq_spam
{
$b
} <=>
$freq_spam
{
$a
})} (
@rules_falsepos_good
);
foreach
$rule
(
@rules_falsepos_good
) {
print
$rule
.
"\t"
.
$prob
{
$rule
} .
"\t"
.
$ratio_expected_falsepos
{
$rule
} .
"\t"
.
$over_expected_falsepos
{
$rule
} .
"\t"
.
$freq_over_higher_falsepos
{
$rule
} .
"\n"
;
}
}
}
else
{
warn
"No over-falsepos to print\n"
;
}
@rules_falseneg
=
grep
{
$prob
{
$_
} < .5} (
keys
%over_expected_falseneg
);
if
(
scalar
(
@rules_falseneg
)) {
print
"RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n"
;
my
(
@rules_falseneg_bad
) =
grep
{
$chisquare
{
$_
} > 0} (
@rules_falseneg
);
if
(
scalar
(
@rules_falseneg_bad
)) {
@rules_falseneg_bad
=
sort
{
(
$chisquare
{
$b
} <=>
$chisquare
{
$a
}) ||
(
$ratio_expected_falseneg
{
$b
} <=>
$ratio_expected_falseneg
{
$a
}) ||
(
$over_expected_falseneg
{
$b
} <=>
$over_expected_falseneg
{
$a
}) ||
(
$freq_under_lower_falseneg
{
$b
} <=>
$freq_under_lower_falseneg
{
$a
})} (
@rules_falseneg_bad
);
foreach
$rule
(
@rules_falseneg_bad
) {
print
$rule
.
"\t"
.
$prob
{
$rule
} .
"\t"
.
$ratio_expected_falseneg
{
$rule
} .
"\t"
.
$over_expected_falseneg
{
$rule
} .
"\t"
.
$freq_under_lower_falseneg
{
$rule
} .
"\n"
;
}
}
my
(
@rules_falseneg_good
) =
grep
{
$chisquare
{
$_
} < 0} (
@rules_falseneg
);
if
(
scalar
(
@rules_falseneg_good
)) {
print
"###\n"
;
@rules_falseneg_good
=
sort
{
(
$chisquare
{
$a
} <=>
$chisquare
{
$b
}) ||
(
$ratio_expected_falseneg
{
$a
} <=>
$ratio_expected_falseneg
{
$b
}) ||
(
$freq_spam
{
$b
} <=>
$freq_spam
{
$a
})} (
@rules_falseneg_good
);
foreach
$rule
(
@rules_falseneg_good
) {
print
$rule
.
"\t"
.
$prob
{
$rule
} .
"\t"
.
$ratio_expected_falseneg
{
$rule
} .
"\t"
.
$over_expected_falseneg
{
$rule
} .
"\t"
.
$freq_under_lower_falseneg
{
$rule
} .
"\n"
;
}
}
}
else
{
warn
"No over-falseneg to print\n"
;
}
exit
;
sub
readlogs {
my
$spam
=
$ARGV
[0] ||
"spam.log"
;
my
$nonspam
=
$ARGV
[1] || (-f
"good.log"
?
"good.log"
:
"nonspam.log"
);
(
open
(NONSPAM,
$nonspam
)) ||
(
die
"Couldn't open file '$nonspam': $!; stopped"
);
while
(
defined
(
$line
= <NONSPAM>)) {
if
(
$line
=~ m/^\s*\
next
;
}
elsif
(
$line
=~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
my
$tests
= $1;
my
$hits
= 0;
my
(
@tests
) = ();
foreach
$test
(
grep
{
length
(
$_
)} (
split
(/,+/,
$tests
))) {
if
(
exists
(
$rules
{
$test
})) {
push
@tests
,
$test
;
$hits
+=
$rules
{
$test
}->{score};
}
}
if
(
scalar
(
@tests
)) {
$num_nonspam
++;
foreach
$test
(
grep
{
exists
(
$freq_nonspam
{
$_
})} (
@tests
)) {
$freq_nonspam
{
$test
}++;
}
if
(
$hits
>=
$higher
) {
$num_over_higher_falsepos
++;
foreach
$test
(
grep
{
exists
(
$freq_over_higher_falsepos
{
$_
})} (
@tests
)) {
$freq_over_higher_falsepos
{
$test
}++;
}
}
}
}
elsif
(
$line
=~ m/\S/) {
chomp
(
$line
);
warn
"Can't interpret line '$line'; skipping"
;
}
}
close
(NONSPAM);
(
open
(SPAM,
$spam
)) || (
die
"Couldn't open file '$spam': $!; stopped"
);
while
(
defined
(
$line
= <SPAM>)) {
if
(
$line
=~ m/^\s*\
next
;
}
elsif
(
$line
=~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
my
$tests
= $1;
my
$hits
= 0;
my
$plus_hits
= 0;
my
(
@tests
) = ();
foreach
$test
(
grep
{
length
(
$_
)} (
split
(/,+/,
$tests
))) {
if
(
exists
(
$rules
{
$test
})) {
push
@tests
,
$test
;
$hits
+=
$rules
{
$test
}->{score};
if
(
$rules
{
$test
}->{score} > 0) {
$plus_hits
+=
$rules
{
$test
}->{score};
}
}
}
if
(
scalar
(
@tests
)) {
$num_spam
++;
foreach
$test
(
grep
{
exists
(
$freq_spam
{
$_
})} (
@tests
)) {
$freq_spam
{
$test
}++;
}
if
((
$hits
<=
$lower
) &&
$plus_hits
&&
(
$plus_hits
>=
$lower
)) {
$num_under_lower_falseneg
++;
foreach
$test
(
grep
{
exists
(
$freq_under_lower_falseneg
{
$_
})} (
@tests
)) {
$freq_under_lower_falseneg
{
$test
}++;
}
}
}
}
elsif
(
$line
=~ m/\S/) {
chomp
(
$line
);
warn
"Can't interpret line '$line'; skipping"
;
}
}
close
(SPAM);
}
sub
readscores {
system
(
"./parse-rules-for-masses"
) and
die
"Couldn't do parse-rules-for-masses: $?; stopped"
;
require
"./tmp/rules.pl"
;
}