our
@ISA
=
qw(Mail::SpamAssassin::Plugin)
;
our
$IGNORED_HDRS
=
qr{(?: (?:X-)?Sender # misc noise
|Delivered-To |Delivery-Date
|(?:X-)?Envelope-To
|X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text
|Subject # not worth a tiny gain vs. to db size increase
# Date: can provide invalid cues if your spam corpus is
# older/newer than ham
|Date
# List headers: ignore. a spamfiltering mailing list will
# become a nonspam sign.
|X-List|(?:X-)?Mailing-List
|(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe
|Unsubscribe|Host|Id|Manager|Admin|Comment
|Name|Url)
|X-Unsub(?:scribe)?
|X-Mailman-Version |X-Been[Tt]here |X-Loop
|Mail-Followup-To
|X-eGroups-(?:Return|From)
|X-MDMailing-List
|X-XEmacs-List
|X-Sympa-To
# gatewayed through mailing list (thanks to Allen Smith)
|(?:X-)?Resent-(?:From|To|Date)
|(?:X-)?Original-(?:From|To|Date)
# Spamfilter/virus-scanner headers: too easy to chain from
# these
|X-MailScanner(?:-SpamCheck)?
|X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))?
|X-Antispam |X-RBL-Warning |X-Mailscanner
|X-MDaemon-Deliver-To |X-Virus-Scanned
|X-Mass-Check-Id
|X-Pyzor |X-DCC-\S{2,25}
-Metrics
|X-Filtered-B[Yy] |X-Scanned-By |X-Scanner
|X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status
|X-SpamCop-[^:]+
|X-SMTPD |(?:X-)?Spam-Apparently-To
|SPAM |X-Perlmx-Spam
|X-Bogosity
|Content-Class |Thread-(?:Index|Topic)
|X-Original[Aa]rrival[Tt]ime
|(?:X-)?Status |X-Flags |X-Keywords |Replied |Forwarded
|Lines |Content-Length
|X-UIDL? |X-IMAPbase
|X-Bugzilla-[^:]+
|X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified
|Summary-Format|VHeader|v\d-Data|Message-Order)
| X-Gnus-Mail-Source
| Xref
)}ix;
our
$MARK_PRESENCE_ONLY_HDRS
=
qr{(?: X-Face
|X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint
|D(?:KIM|omainKey)-Signature
|X-Google-DKIM-Signature
|ARC-(?:Message-Signature|Seal)
|Autocrypt
)}
ix;
use
constant
TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES
=> 0;
use
constant
TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS
=> 1;
use
constant
TOKENIZE_LONG_TOKENS_AS_SKIPS
=> 1;
use
constant
PRE_CHEW_ADDR_HEADERS
=> 1;
use
constant
HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS
=> 1;
use
constant
BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS
=> 1;
use
constant
URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS
=> 0;
use
constant
DECOMPOSE_BODY_TOKENS
=> 1;
use
constant
MAP_HEADERS_USERAGENT
=> 1;
use
constant
ADD_INVIZ_TOKENS_I_PREFIX
=> 1;
use
constant
ADD_INVIZ_TOKENS_NO_PREFIX
=> 0;
our
%HEADER_NAME_COMPRESSION
= (
'Message-Id'
=>
'*m'
,
'Message-ID'
=>
'*M'
,
'Received'
=>
'*r'
,
'User-Agent'
=>
'*u'
,
'References'
=>
'*f'
,
'In-Reply-To'
=>
'*i'
,
'From'
=>
'*F'
,
'Reply-To'
=>
'*R'
,
'Return-Path'
=>
'*p'
,
'Return-path'
=>
'*rp'
,
'X-Mailer'
=>
'*x'
,
'X-Authentication-Warning'
=>
'*a'
,
'Organization'
=>
'*o'
,
'Organisation'
=>
'*o'
,
'Content-Type'
=>
'*ct'
,
'Content-Disposition'
=>
'*cd'
,
'Content-Transfer-Encoding'
=>
'*ce'
,
'x-spam-relays-trusted'
=>
'*RT'
,
'x-spam-relays-untrusted'
=>
'*RU'
,
);
our
$OPPORTUNISTIC_LOCK_VALID
= 300;
use
constant
USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS
=> 1;
use
constant
N_SIGNIFICANT_TOKENS
=> 150;
use
constant
REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE
=> -1;
sub
new {
my
$class
=
shift
;
my
(
$main
) =
@_
;
$class
=
ref
(
$class
) ||
$class
;
my
$self
=
$class
->SUPER::new(
$main
);
bless
(
$self
,
$class
);
$self
->{main} =
$main
;
$self
->{conf} =
$main
->{conf};
$self
->{use_ignores} = 1;
$self
->{bayes_stopword}{en} =
qr/(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))/
;
$self
->set_config(
$self
->{conf});
$self
->register_eval_rule(
"check_bayes"
,
$Mail::SpamAssassin::Conf::TYPE_BODY_EVALS
);
$self
;
}
sub
set_config {
my
(
$self
,
$conf
) =
@_
;
my
@cmds
;
push
(
@cmds
, {
setting
=>
'bayes_max_token_length'
,
default
=> MAX_TOKEN_LENGTH,
type
=>
$Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
,
});
push
(
@cmds
, {
setting
=>
'bayes_stopword_languages'
,
default
=> [
'en'
],
type
=>
$Mail::SpamAssassin::Conf::CONF_TYPE_STRINGLIST
,
code
=>
sub
{
my
(
$self
,
$key
,
$value
,
$line
) =
@_
;
my
@langs
;
if
(
$value
eq
'disable'
) {
@{
$self
->{bayes_stopword_languages}} = ();
}
else
{
foreach
my
$lang
(
split
(/(?:\s*,\s*|\s+)/,
lc
(
$value
))) {
if
(
$lang
!~ /^([a-z]{2})$/) {
return
$Mail::SpamAssassin::Conf::INVALID_VALUE
;
}
push
@langs
,
$lang
;
}
if
(!
@langs
) {
return
$Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE
;
}
@{
$self
->{bayes_stopword_languages}} =
@langs
;
}
}
});
$conf
->{parser}->register_commands(\
@cmds
);
}
sub
parse_config {
my
(
$self
,
$opts
) =
@_
;
if
(
$opts
->{key} =~ /^bayes_stopword_([a-z]{2})$/i) {
$self
->inhibit_further_callbacks();
my
$lang
=
lc
($1);
foreach
my
$re
(
split
(/\s+/,
$opts
->{value})) {
my
(
$rec
,
$err
) = compile_regexp(
'^(?i)'
.
$re
.
'$'
, 0);
if
(!
$rec
) {
warn
"bayes: invalid regexp for $opts->{key}: $err\n"
;
return
0;
}
$self
->{bayes_stopword}{
$lang
} =
$rec
;
}
return
1;
}
return
0;
}
sub
finish_parsing_end {
my
(
$self
,
$opts
) =
@_
;
my
$conf
=
$opts
->{conf};
my
@langs
;
foreach
my
$lang
(@{
$conf
->{bayes_stopword_languages}}) {
if
(
defined
$self
->{bayes_stopword}{
$lang
}) {
push
@langs
,
$lang
;
}
else
{
warn
"bayes: missing stopwords regexp for language '$lang'\n"
;
}
}
if
(
@langs
) {
dbg(
"bayes: stopwords for languages enabled: "
.
join
(
' '
,
@langs
));
@{
$conf
->{bayes_stopword_languages}} =
@langs
;
}
else
{
dbg(
"bayes: no stopword languages enabled"
);
$conf
->{bayes_stopword_languages} = [];
}
return
0;
}
sub
finish {
my
$self
=
shift
;
if
(
$self
->{store}) {
$self
->{store}->untie_db();
}
%{
$self
} = ();
}
sub
learner_get_implementation {
return
shift
; }
sub
prefork_init {
my
(
$self
) =
@_
;
if
(
$self
->{store} &&
$self
->{store}->UNIVERSAL::can(
'prefork_init'
)) {
$self
->{store}->prefork_init;
}
}
sub
spamd_child_init {
my
(
$self
) =
@_
;
if
(
$self
->{store} &&
$self
->{store}->UNIVERSAL::can(
'spamd_child_init'
)) {
$self
->{store}->spamd_child_init;
}
}
sub
check_bayes {
my
(
$self
,
$pms
,
$fulltext
,
$min
,
$max
) =
@_
;
return
0
if
(!
$self
->{conf}->{use_learner});
return
0
if
(!
$self
->{conf}->{use_bayes} || !
$self
->{conf}->{use_bayes_rules});
if
(!
exists
(
$pms
->{bayes_score})) {
my
$timer
=
$self
->{main}->time_method(
"check_bayes"
);
$pms
->{bayes_score} =
$self
->scan(
$pms
,
$pms
->{msg});
}
if
(
defined
$pms
->{bayes_score} &&
(
$min
== 0 ||
$pms
->{bayes_score} >
$min
) &&
(
$max
eq
"undef"
||
$pms
->{bayes_score} <=
$max
))
{
if
(
$self
->{conf}->{detailed_bayes_score}) {
$pms
->test_log(
sprintf
(
"score: %3.4f, hits: %s"
,
$pms
->{bayes_score},
$pms
->{bayes_hits}));
}
else
{
$pms
->test_log(
sprintf
(
"score: %3.4f"
,
$pms
->{bayes_score}));
}
return
1;
}
return
0;
}
sub
learner_close {
my
(
$self
,
$params
) =
@_
;
my
$quiet
=
$params
->{quiet};
if
(
$self
->{store}->db_readable()) {
warn
"bayes: oops! still tied to bayes DBs, untying\n"
unless
$quiet
;
$self
->{store}->untie_db();
}
}
sub
read_db_configs {
my
(
$self
) =
@_
;
$self
->{use_hapaxes} =
$self
->{conf}->{bayes_use_hapaxes};
}
sub
ignore_message {
my
(
$self
,
$PMS
) =
@_
;
return
0
unless
$self
->{use_ignores};
my
$ig_from
=
$self
->{main}->call_plugins (
"check_wb_list"
,
{
permsgstatus
=>
$PMS
,
type
=>
'from'
,
list
=>
'bayes_ignore_from'
});
my
$ig_to
=
$self
->{main}->call_plugins (
"check_wb_list"
,
{
permsgstatus
=>
$PMS
,
type
=>
'to'
,
list
=>
'bayes_ignore_to'
});
my
$ignore
=
$ig_from
||
$ig_to
;
dbg(
"bayes: not using bayes, bayes_ignore_from or _to rule"
)
if
$ignore
;
return
$ignore
;
}
sub
learn_message {
my
(
$self
,
$params
) =
@_
;
my
$isspam
=
$params
->{isspam};
my
$msg
=
$params
->{msg};
my
$id
=
$params
->{id};
if
(!
$self
->{conf}->{use_bayes}) {
return
; }
my
$msgdata
=
$self
->get_body_from_msg (
$msg
);
my
$ret
;
eval
{
local
$SIG
{
'__DIE__'
};
my
$timer
=
$self
->{main}->time_method(
"b_learn"
);
my
$ok
;
if
(
$self
->{main}->{learn_to_journal}) {
$ok
=
$self
->{store}->tie_db_readonly() ||
$self
->{store}->tie_db_writable();
}
else
{
$ok
=
$self
->{store}->tie_db_writable();
}
if
(
$ok
) {
$ret
=
$self
->_learn_trapped (
$isspam
,
$msg
,
$msgdata
,
$id
);
if
(!
$self
->{main}->{learn_caller_will_untie}) {
$self
->{store}->untie_db();
}
}
1;
} or
do
{
my
$eval_stat
= $@ ne
''
? $@ :
"errno=$!"
;
chomp
$eval_stat
;
$self
->{store}->untie_db();
die
"bayes: (in learn) $eval_stat\n"
;
};
return
$ret
;
}
sub
_learn_trapped {
my
(
$self
,
$isspam
,
$msg
,
$msgdata
,
$msgid
) =
@_
;
my
@msgid
= (
$msgid
);
if
(!
defined
$msgid
) {
@msgid
= (
$msg
->generate_msgid(),
$msg
->get_msgid() );
}
foreach
my
$msgid_t
(
@msgid
) {
next
if
!
defined
$msgid_t
;
my
$seen
=
$self
->{store}->seen_get (
$msgid_t
);
if
(
defined
(
$seen
)) {
if
((
$seen
eq
's'
&&
$isspam
) || (
$seen
eq
'h'
&& !
$isspam
)) {
dbg(
"bayes: $msgid_t already learnt correctly, not learning twice"
);
return
0;
}
elsif
(
$seen
!~ /^[hs]$/) {
warn
(
"bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored"
);
}
else
{
if
(
$self
->{main}->{learn_no_relearn}) {
dbg(
"bayes: $msgid_t already learnt as opposite, not re-learning"
);
return
0;
}
dbg(
"bayes: $msgid_t already learnt as opposite, forgetting first"
);
my
$orig
=
$self
->{main}->{learn_caller_will_untie};
$self
->{main}->{learn_caller_will_untie} = 1;
my
$fatal
= !
defined
$self
->{main}->{bayes_scanner}->forget (
$msg
);
$self
->{main}->{learn_caller_will_untie} =
$orig
;
if
(
$fatal
) {
dbg(
"bayes: forget() returned a fatal error, so learn() will too"
);
return
;
}
}
last
;
}
}
$msgid
=
$msgid
[0];
my
$msgatime
=
$msg
->receive_date();
$msgatime
=
time
if
(
$msgatime
-
time
> 86400 );
my
$tokens
=
$self
->tokenize(
$msg
,
$msgdata
);
{
my
$timer
=
$self
->{main}->time_method(
'b_count_change'
);
if
(
$isspam
) {
$self
->{store}->nspam_nham_change(1, 0);
$self
->{store}->multi_tok_count_change(1, 0,
$tokens
,
$msgatime
);
}
else
{
$self
->{store}->nspam_nham_change(0, 1);
$self
->{store}->multi_tok_count_change(0, 1,
$tokens
,
$msgatime
);
}
}
$self
->{store}->seen_put (
$msgid
, (
$isspam
?
's'
:
'h'
));
$self
->{store}->cleanup();
$self
->{main}->call_plugins(
"bayes_learn"
, {
toksref
=>
$tokens
,
isspam
=>
$isspam
,
msgid
=>
$msgid
,
msgatime
=>
$msgatime
,
});
dbg(
"bayes: learned '$msgid', atime: $msgatime"
);
1;
}
sub
forget_message {
my
(
$self
,
$params
) =
@_
;
my
$msg
=
$params
->{msg};
my
$id
=
$params
->{id};
if
(!
$self
->{conf}->{use_bayes}) {
return
; }
my
$msgdata
=
$self
->get_body_from_msg (
$msg
);
my
$ret
;
eval
{
local
$SIG
{
'__DIE__'
};
my
$timer
=
$self
->{main}->time_method(
"b_learn"
);
my
$ok
;
if
(
$self
->{main}->{learn_to_journal}) {
$ok
=
$self
->{store}->tie_db_readonly() ||
$self
->{store}->tie_db_writable();
}
else
{
$ok
=
$self
->{store}->tie_db_writable();
}
if
(
$ok
) {
$ret
=
$self
->_forget_trapped (
$msg
,
$msgdata
,
$id
);
if
(!
$self
->{main}->{learn_caller_will_untie}) {
$self
->{store}->untie_db();
}
}
1;
} or
do
{
my
$eval_stat
= $@ ne
''
? $@ :
"errno=$!"
;
chomp
$eval_stat
;
$self
->{store}->untie_db();
die
"bayes: (in forget) $eval_stat\n"
;
};
return
$ret
;
}
sub
_forget_trapped {
my
(
$self
,
$msg
,
$msgdata
,
$msgid
) =
@_
;
my
@msgid
= (
$msgid
);
my
$isspam
;
if
(!
defined
$msgid
) {
@msgid
= (
$msg
->generate_msgid(),
$msg
->get_msgid() );
}
while
(
$msgid
=
shift
@msgid
) {
my
$seen
=
$self
->{store}->seen_get (
$msgid
);
if
(
defined
(
$seen
)) {
if
(
$seen
eq
's'
) {
$isspam
= 1;
}
elsif
(
$seen
eq
'h'
) {
$isspam
= 0;
}
else
{
dbg(
"bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored"
);
return
0;
}
last
;
}
else
{
dbg(
"bayes: forget: msgid $msgid not learnt, ignored"
);
}
}
if
(!
defined
$isspam
) {
dbg(
"bayes: forget: no msgid from this message has been learnt, skipping message"
);
return
0;
}
elsif
(
$isspam
) {
$self
->{store}->nspam_nham_change (-1, 0);
}
else
{
$self
->{store}->nspam_nham_change (0, -1);
}
my
$tokens
=
$self
->tokenize(
$msg
,
$msgdata
);
if
(
$isspam
) {
$self
->{store}->multi_tok_count_change (-1, 0,
$tokens
);
}
else
{
$self
->{store}->multi_tok_count_change (0, -1,
$tokens
);
}
$self
->{store}->seen_delete (
$msgid
);
$self
->{store}->cleanup();
$self
->{main}->call_plugins(
"bayes_forget"
, {
toksref
=>
$tokens
,
isspam
=>
$isspam
,
msgid
=>
$msgid
,
});
1;
}
sub
learner_sync {
my
(
$self
,
$params
) =
@_
;
if
(!
$self
->{conf}->{use_bayes}) {
return
0; }
dbg(
"bayes: bayes journal sync starting"
);
$self
->{store}->sync(
$params
);
dbg(
"bayes: bayes journal sync completed"
);
}
sub
learner_expire_old_training {
my
(
$self
,
$params
) =
@_
;
if
(!
$self
->{conf}->{use_bayes}) {
return
0; }
dbg(
"bayes: expiry starting"
);
my
$timer
=
$self
->{main}->time_method(
"expire_bayes"
);
$self
->{store}->expire_old_tokens(
$params
);
dbg(
"bayes: expiry completed"
);
}
sub
learner_is_scan_available {
my
(
$self
,
$params
) =
@_
;
return
0
unless
$self
->{conf}->{use_bayes};
return
0
unless
$self
->{store}->tie_db_readonly();
my
$caller_untie
=
$self
->{main}->{learn_caller_will_untie};
$self
->{main}->{learn_caller_will_untie} = 1;
$self
->_opportunistic_calls(1);
$self
->{main}->{learn_caller_will_untie} =
$caller_untie
;
my
(
$ns
,
$nn
) =
$self
->{store}->nspam_nham_get();
if
(
$ns
<
$self
->{conf}->{bayes_min_spam_num}) {
dbg(
"bayes: not available for scanning, only $ns spam(s) in bayes DB < "
.
$self
->{conf}->{bayes_min_spam_num});
if
(!
$self
->{main}->{learn_caller_will_untie}) {
$self
->{store}->untie_db();
}
return
0;
}
if
(
$nn
<
$self
->{conf}->{bayes_min_ham_num}) {
dbg(
"bayes: not available for scanning, only $nn ham(s) in bayes DB < "
.
$self
->{conf}->{bayes_min_ham_num});
if
(!
$self
->{main}->{learn_caller_will_untie}) {
$self
->{store}->untie_db();
}
return
0;
}
return
1;
}
sub
scan {
my
(
$self
,
$permsgstatus
,
$msg
) =
@_
;
return
unless
$self
->{conf}->{use_learner};
my
$caller_untie
=
$self
->{main}->{learn_caller_will_untie};
$self
->{main}->{learn_caller_will_untie} = 1;
goto
skip
if
(
$self
->{main}->{bayes_scanner}->ignore_message(
$permsgstatus
));
goto
skip
unless
$self
->learner_is_scan_available();
my
(
$ns
,
$nn
) =
$self
->{store}->nspam_nham_get();
dbg(
"bayes: corpus size: nspam = $ns, nham = $nn"
);
my
$msgtokens
;
{
my
$timer
=
$self
->{main}->time_method(
'b_tokenize'
);
my
$msgdata
=
$self
->_get_msgdata_from_permsgstatus (
$permsgstatus
);
$msgtokens
=
$self
->tokenize(
$msg
,
$msgdata
);
}
my
$tokensdata
;
{
my
$timer
=
$self
->{main}->time_method(
'b_tok_get_all'
);
$tokensdata
=
$self
->{store}->tok_get_all(
keys
%{
$msgtokens
});
}
my
$timer_compute_prob
=
$self
->{main}->time_method(
'b_comp_prob'
);
my
$probabilities_ref
=
$self
->_compute_prob_for_all_tokens(
$tokensdata
,
$ns
,
$nn
);
my
%pw
;
foreach
my
$tokendata
(@{
$tokensdata
}) {
my
$prob
=
shift
(
@$probabilities_ref
);
next
unless
defined
$prob
;
my
(
$token
,
$tok_spam
,
$tok_ham
,
$atime
) = @{
$tokendata
};
$pw
{
$token
} = {
prob
=>
$prob
,
spam_count
=>
$tok_spam
,
ham_count
=>
$tok_ham
,
atime
=>
$atime
};
}
my
@pw_keys
=
keys
%pw
;
if
(!
@pw_keys
) {
dbg(
"bayes: cannot use bayes on this message; none of the tokens were found in the database"
);
goto
skip;
}
my
$tcount_total
=
keys
%{
$msgtokens
};
my
$tcount_learned
=
scalar
@pw_keys
;
my
$msgatime
=
$msg
->receive_date();
my
$now
=
time
;
$msgatime
=
$now
if
(
$msgatime
>
$now
);
my
@touch_tokens
;
my
$tinfo_spammy
=
$permsgstatus
->{bayes_token_info_spammy} = [];
my
$tinfo_hammy
=
$permsgstatus
->{bayes_token_info_hammy} = [];
my
%tok_strength
=
map
( (
$_
,
abs
(
$pw
{
$_
}->{prob} - 0.5)),
@pw_keys
);
my
$log_each_token
= (would_log(
'dbg'
,
'bayes'
) > 1);
@pw_keys
=
sort
{
$tok_strength
{
$b
} <=>
$tok_strength
{
$a
} }
@pw_keys
;
if
(
@pw_keys
> N_SIGNIFICANT_TOKENS) {
$#pw_keys
= N_SIGNIFICANT_TOKENS - 1 }
my
@sorted
;
my
$score
;
foreach
my
$tok
(
@pw_keys
) {
next
if
$tok_strength
{
$tok
} <
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH
;
my
$pw_tok
=
$pw
{
$tok
};
my
$pw_prob
=
$pw_tok
->{prob};
my
$raw_token
=
$msgtokens
->{
$tok
} ||
"(unknown)"
;
my
$s
=
$pw_tok
->{spam_count};
my
$n
=
$pw_tok
->{ham_count};
my
$a
=
$pw_tok
->{atime};
push
( @{
$pw_prob
< 0.5 ?
$tinfo_hammy
:
$tinfo_spammy
},
[
$raw_token
,
$pw_prob
,
$s
,
$n
,
$a
] );
push
(
@sorted
,
$pw_prob
);
push
(
@touch_tokens
,
$tok
);
if
(
$log_each_token
) {
dbg(
"bayes: token '$raw_token' => $pw_prob"
);
}
}
if
(!
@sorted
|| (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 &&
$#sorted
<= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
{
dbg(
"bayes: cannot use bayes on this message; not enough usable tokens found"
);
goto
skip;
}
$score
= Mail::SpamAssassin::Bayes::Combine::combine(
$ns
,
$nn
, \
@sorted
);
undef
$timer_compute_prob
;
goto
skip
unless
defined
$score
;
dbg(
"bayes: score = $score"
);
{
my
$timer
=
$self
->{main}->time_method(
'b_tok_touch_all'
);
$self
->{store}->tok_touch_all(\
@touch_tokens
,
$msgatime
);
}
my
$timer_finish
=
$self
->{main}->time_method(
'b_finish'
);
$permsgstatus
->{bayes_nspam} =
$ns
;
$permsgstatus
->{bayes_nham} =
$nn
;
$self
->{main}->call_plugins(
"bayes_scan"
, {
toksref
=>
$msgtokens
,
probsref
=> \
%pw
,
score
=>
$score
,
msgatime
=>
$msgatime
,
significant_tokens
=> \
@touch_tokens
,
});
skip:
if
(!
defined
$score
) {
dbg(
"bayes: not scoring message, returning undef"
);
}
undef
$timer_compute_prob
;
if
(!
defined
$timer_finish
) {
$timer_finish
=
$self
->{main}->time_method(
'b_finish'
);
}
if
(
$self
->{main}->{opportunistic_expire_check_only}) {
$self
->_opportunistic_calls(1);
$permsgstatus
->{bayes_expiry_due} =
$self
->{store}->expiry_due();
}
else
{
$self
->_opportunistic_calls();
}
$self
->{store}->cleanup();
$self
->{main}->{learn_caller_will_untie} =
$caller_untie
;
if
(!
$caller_untie
) {
$self
->{store}->untie_db();
}
$permsgstatus
->set_tag (
'BAYESTCHAMMY'
,
(
$tinfo_hammy
?
scalar
@{
$tinfo_hammy
} : 0));
$permsgstatus
->set_tag (
'BAYESTCSPAMMY'
,
(
$tinfo_spammy
?
scalar
@{
$tinfo_spammy
} : 0));
$permsgstatus
->set_tag (
'BAYESTCLEARNED'
,
$tcount_learned
);
$permsgstatus
->set_tag (
'BAYESTC'
,
$tcount_total
);
$permsgstatus
->set_tag (
'HAMMYTOKENS'
,
sub
{
my
$pms
=
shift
;
$self
->bayes_report_make_list
(
$pms
,
$pms
->{bayes_token_info_hammy},
shift
);
});
$permsgstatus
->set_tag (
'SPAMMYTOKENS'
,
sub
{
my
$pms
=
shift
;
$self
->bayes_report_make_list
(
$pms
,
$pms
->{bayes_token_info_spammy},
shift
);
});
$permsgstatus
->set_tag (
'TOKENSUMMARY'
,
sub
{
my
$pms
=
shift
;
if
(
defined
$pms
->{tag_data}{BAYESTC} )
{
my
$tcount_neutral
=
$pms
->{tag_data}{BAYESTCLEARNED}
-
$pms
->{tag_data}{BAYESTCSPAMMY}
-
$pms
->{tag_data}{BAYESTCHAMMY};
my
$tcount_new
=
$pms
->{tag_data}{BAYESTC}
-
$pms
->{tag_data}{BAYESTCLEARNED};
"Tokens: new, $tcount_new; "
.
"hammy, $pms->{tag_data}{BAYESTCHAMMY}; "
.
"neutral, $tcount_neutral; "
.
"spammy, $pms->{tag_data}{BAYESTCSPAMMY}."
}
else
{
"Bayes not run."
;
}
});
return
$score
;
}
sub
learner_dump_database {
my
(
$self
,
$params
) =
@_
;
my
$magic
=
$params
->{magic};
my
$toks
=
$params
->{toks};
my
$regex
=
$params
->{regex};
return
0
unless
$self
->{store}->tie_db_readonly();
my
@vars
=
$self
->{store}->get_storage_variables();
my
(
$sb
,
$ns
,
$nh
,
$nt
,
$le
,
$oa
,
$bv
,
$js
,
$ad
,
$er
,
$na
) =
@vars
;
my
$template
=
'%3.3f %10u %10u %10u %s'
.
"\n"
;
if
(
$magic
) {
printf
(
$template
, 0.0, 0,
$bv
, 0,
'non-token data: bayes db version'
)
or
die
"Error writing: $!"
;
printf
(
$template
, 0.0, 0,
$ns
, 0,
'non-token data: nspam'
)
or
die
"Error writing: $!"
;
printf
(
$template
, 0.0, 0,
$nh
, 0,
'non-token data: nham'
)
or
die
"Error writing: $!"
;
printf
(
$template
, 0.0, 0,
$nt
, 0,
'non-token data: ntokens'
)
or
die
"Error writing: $!"
;
printf
(
$template
, 0.0, 0,
$oa
, 0,
'non-token data: oldest atime'
)
or
die
"Error writing: $!"
;
if
(
$bv
>= 2 ) {
printf
(
$template
, 0.0, 0,
$na
, 0,
'non-token data: newest atime'
)
or
die
"Error writing: $!"
;
}
if
(
$bv
< 2 ) {
printf
(
$template
, 0.0, 0,
$sb
, 0,
'non-token data: current scan-count'
)
or
die
"Error writing: $!"
;
}
if
(
$bv
>= 2 ) {
printf
(
$template
, 0.0, 0,
$js
, 0,
'non-token data: last journal sync atime'
)
or
die
"Error writing: $!"
;
}
printf
(
$template
, 0.0, 0,
$le
, 0,
'non-token data: last expiry atime'
)
or
die
"Error writing: $!"
;
if
(
$bv
>= 2 ) {
printf
(
$template
, 0.0, 0,
$ad
, 0,
'non-token data: last expire atime delta'
)
or
die
"Error writing: $!"
;
printf
(
$template
, 0.0, 0,
$er
, 0,
'non-token data: last expire reduction count'
)
or
die
"Error writing: $!"
;
}
}
if
(
$toks
) {
$self
->{store}->dump_db_toks(
$template
,
$regex
,
@vars
);
}
if
(!
$self
->{main}->{learn_caller_will_untie}) {
$self
->{store}->untie_db();
}
return
1;
}
sub
get_body_from_msg {
my
(
$self
,
$msg
) =
@_
;
if
(!
ref
$msg
) {
warn
"bayes: msg not a ref: '$msg'"
;
return
{ };
}
my
$permsgstatus
=
Mail::SpamAssassin::PerMsgStatus->new(
$self
->{main},
$msg
);
$msg
->extract_message_metadata (
$permsgstatus
);
my
$msgdata
=
$self
->_get_msgdata_from_permsgstatus (
$permsgstatus
);
$permsgstatus
->finish();
if
(!
defined
$msgdata
) {
warn
"bayes: failed to get body for "
.
scalar
(
$self
->{msg}->generate_msgid()).
"\n"
;
return
{ };
}
return
$msgdata
;
}
sub
_get_msgdata_from_permsgstatus {
my
(
$self
,
$pms
) =
@_
;
my
$t_src
=
$self
->{conf}->{bayes_token_sources};
my
$msgdata
= { };
$msgdata
->{bayes_token_body} =
$pms
->{msg}->get_visible_rendered_body_text_array()
if
$t_src
->{visible};
$msgdata
->{bayes_token_inviz} =
$pms
->{msg}->get_invisible_rendered_body_text_array()
if
$t_src
->{invisible};
$msgdata
->{bayes_mimepart_digests} =
$pms
->{msg}->get_mimepart_digests()
if
$t_src
->{mimepart};
@{
$msgdata
->{bayes_token_uris}} =
$pms
->get_uri_list()
if
$t_src
->{uri};
return
$msgdata
;
}
sub
tokenize {
my
(
$self
,
$msg
,
$msgdata
) =
@_
;
my
$conf
=
$self
->{conf};
my
$t_src
=
$conf
->{bayes_token_sources};
$self
->{stopword_cache} = ();
my
@tokens_body
;
if
(
$msgdata
->{bayes_token_body}) {
foreach
(@{
$msgdata
->{bayes_token_body}}) {
push
(
@tokens_body
,
$self
->_tokenize_line (
$_
,
''
, 1));
last
if
scalar
@tokens_body
>= 50000;
}
dbg(
"bayes: tokenized body: %d tokens"
,
scalar
@tokens_body
);
}
my
@tokens_uri
;
if
(
$msgdata
->{bayes_token_uris}) {
foreach
(@{
$msgdata
->{bayes_token_uris}}) {
push
(
@tokens_uri
,
$self
->_tokenize_line (
$_
,
''
, 2));
last
if
scalar
@tokens_uri
>= 10000;
}
dbg(
"bayes: tokenized uri: %d tokens"
,
scalar
@tokens_uri
);
}
my
@tokens_inviz
;
if
(
$msgdata
->{bayes_token_inviz}) {
my
$tokprefix
;
if
(ADD_INVIZ_TOKENS_I_PREFIX) {
$tokprefix
=
'I*:'
}
if
(ADD_INVIZ_TOKENS_NO_PREFIX) {
$tokprefix
=
''
}
if
(
defined
$tokprefix
) {
foreach
(@{
$msgdata
->{bayes_token_inviz}}) {
push
(
@tokens_inviz
,
$self
->_tokenize_line (
$_
,
$tokprefix
, 1));
last
if
scalar
@tokens_inviz
>= 50000;
}
}
dbg(
"bayes: tokenized invisible: %d tokens"
,
scalar
@tokens_inviz
);
}
my
@tokens_mimepart
;
if
(
$msgdata
->{bayes_mimepart_digests}) {
my
%shorthand
= (
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'
=>
'Empty-Plaintext'
,
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html'
=>
'Empty-HTML'
,
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml'
=>
'Empty-XML'
,
'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'
=>
'OneNL-Plaintext'
,
'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html'
=>
'OneNL-HTML'
,
'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'
=>
'TwoNL-Plaintext'
,
'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html'
=>
'TwoNL-HTML'
,
);
@tokens_mimepart
=
map
(
'MIME:'
. (
$shorthand
{
$_
} ||
$_
),
@{
$msgdata
->{bayes_mimepart_digests} });
dbg(
"bayes: tokenized mime parts: %d tokens"
,
scalar
@tokens_mimepart
);
dbg(
"bayes: mime-part token %s"
,
$_
)
for
@tokens_mimepart
;
}
my
@tokens_header
;
if
(
$t_src
->{header}) {
my
%hdrs
=
$self
->_tokenize_headers (
$msg
);
while
(
my
(
$prefix
,
$value
) =
each
%hdrs
) {
push
(
@tokens_header
,
$self
->_tokenize_line (
$value
,
"H$prefix:"
, 0));
last
if
scalar
@tokens_header
>= 10000;
}
dbg(
"bayes: tokenized header: %d tokens"
,
scalar
@tokens_header
);
}
delete
$self
->{stopword_cache};
my
%tokens
;
foreach
my
$token
(
@tokens_body
,
@tokens_uri
,
@tokens_inviz
,
@tokens_mimepart
,
@tokens_header
)
{
$tokens
{
substr
(sha1(
$token
), -5)} =
$token
if
$token
ne
''
;
}
return
\
%tokens
;
}
sub
_tokenize_line {
my
$self
=
$_
[0];
my
$tokprefix
=
$_
[2];
my
$region
=
$_
[3];
local
(
$_
) =
$_
[1];
my
$conf
=
$self
->{conf};
my
@rettokens
;
s{ ( [A-Za-z0-9,@*!_'"\$. -]+ |
[\xC0-\xDF][\x80-\xBF] |
[\xE0-\xEF][\x80-\xBF]{2} |
[\xF0-\xF4][\x80-\xBF]{3} |
[\xA1-\xFF] ) | . }
{
defined
$1 ? $1 :
' '
}xsge;
s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
if
(IGNORE_TITLE_CASE) {
if
(
$region
== 1 ||
$region
== 2) {
s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/
' '
. (
lc
$1) . $2 .
' '
/ge;
}
}
my
$magic_re
=
$self
->{store}->get_magic_re();
TOKEN:
foreach
my
$token
(
split
) {
$token
=~ s/^[-'"\.,]+//;
$token
=~ s/[-
'"\.,]+$//; # so we don'
t get loads of
'"foo'
tokens
next
if
(
defined
$magic_re
&&
$token
=~ /
$magic_re
/o );
my
$len
=
length
(
$token
);
next
if
$len
< 3;
if
(@{
$conf
->{bayes_stopword_languages}}) {
if
(!
exists
$self
->{stopword_cache}{
$token
}) {
foreach
my
$lang
(@{
$conf
->{bayes_stopword_languages}}) {
if
(
$token
=~
$self
->{bayes_stopword}{
$lang
}) {
dbg(
"bayes: skipped token '$token' because it's in stopword list for language '$lang'"
);
$self
->{stopword_cache}{
$token
} = 1;
next
TOKEN;
}
}
$self
->{stopword_cache}{
$token
} = 0;
}
else
{
next
if
$self
->{stopword_cache}{
$token
};
}
}
if
(
$region
== 1 ||
$region
== 2) {
if
(CHEW_BODY_MAILADDRS &&
$token
=~ /\S\@\S/i) {
push
(
@rettokens
,
$self
->_tokenize_mail_addrs (
$token
));
}
elsif
(CHEW_BODY_URIS &&
$token
=~ /\S\.[a-z]/i) {
push
(
@rettokens
,
"UD:"
.
$token
);
my
$bit
=
$token
;
while
(
$bit
=~ s/^[^\.]+\.(.+)$/$1/gs) {
push
(
@rettokens
,
"UD:"
.$1);
}
}
}
if
(
$len
>
$conf
->{bayes_max_token_length} &&
index
(
$token
,
'*'
) == -1) {
if
(TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS &&
$token
=~ /[\x80-\xBF]{2}/) {
my
(
@t
) =
$token
=~ /( (?: [\xE0-\xEF] | [\xF0-\xF4][\x80-\xBF] )
[\x80-\xBF]{2} )/xsg;
if
(
@t
) {
push
(
@rettokens
,
map
(
$tokprefix
.
'u8:'
.
$_
,
@t
));
next
;
}
}
if
(TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES &&
$token
=~ /[\xa0-\xff]{2}/) {
while
(
$token
=~ s/^(..?)//) {
push
(
@rettokens
,
$tokprefix
.
'8:'
.$1);
}
next
;
}
if
((
$region
== 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
|| (
$region
== 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
|| (
$region
== 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
{
$token
=~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF] |
[\xC0-\xDF][\x80-\xBF] |
[\xE0-\xEF][\x80-\xBF]{2} |
[\xF0-\xF4][\x80-\xBF]{3} | . ){7} ))
.{2,} \z }{sk:$1}xs;
}
}
if
(
$region
== 1 ||
$region
== 2) {
if
(DECOMPOSE_BODY_TOKENS) {
if
(
$token
=~ /[^\w:\*]/) {
my
$decompd
=
$token
;
$decompd
=~ s/[^\w:\*]//gs;
push
(
@rettokens
,
$tokprefix
.
$decompd
);
}
if
(
$token
=~ /[A-Z]/) {
my
$decompd
=
$token
;
$decompd
=
lc
$decompd
;
push
(
@rettokens
,
$tokprefix
.
$decompd
);
if
(
$token
=~ /[^\w:\*]/) {
$decompd
=~ s/[^\w:\*]//gs;
push
(
@rettokens
,
$tokprefix
.
$decompd
);
}
}
}
}
push
(
@rettokens
,
$tokprefix
.
$token
);
}
return
@rettokens
;
}
sub
_tokenize_headers {
my
(
$self
,
$msg
) =
@_
;
my
%parsed
;
my
@hdrs
;
my
@rcvdlines
;
for
(
$msg
->get_all_headers()) {
if
(/^Received:/i) {
push
(
@rcvdlines
,
$_
);
next
;
}
next
if
/^${IGNORED_HDRS}:/i;
next
if
IGNORE_MSGID_TOKENS && /^Message-ID:/i;
push
(
@hdrs
,
$_
);
}
push
(
@hdrs
,
$msg
->get_all_metadata());
if
(
$#rcvdlines
>= 0) {
push
(
@hdrs
,
$rcvdlines
[
$#rcvdlines
]); }
if
(
$#rcvdlines
>= 1) {
push
(
@hdrs
,
$rcvdlines
[
$#rcvdlines
-1]); }
for
(
@hdrs
) {
next
unless
/\S/;
my
(
$hdr
,
$val
) =
split
(/:/,
$_
, 2);
next
if
exists
$self
->{conf}->{bayes_ignore_header}->{
lc
$hdr
};
$val
||=
''
;
chomp
(
$val
);
if
(
$hdr
=~ /^(?:|X-|Resent-)Message-Id$/i) {
$val
=
$self
->_pre_chew_message_id (
$val
);
}
elsif
(PRE_CHEW_ADDR_HEADERS &&
$hdr
=~ /^(?:|X-|Resent-)
(?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix)
{
$val
=
$self
->_pre_chew_addr_header (
$val
);
}
elsif
(
$hdr
eq
'Received'
) {
$val
=
$self
->_pre_chew_received (
$val
);
}
elsif
(
$hdr
eq
'Content-Type'
) {
$val
=
$self
->_pre_chew_content_type (
$val
);
}
elsif
(
$hdr
eq
'MIME-Version'
) {
$val
=~ s/1\.0//;
}
elsif
(
$hdr
=~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {
$val
=
"1"
;
}
elsif
(
$hdr
=~ /^x-spam-relays-(?:external|internal|trusted|untrusted)$/) {
$val
=~ s/ [a-z]+=/ /g;
}
if
(MAP_HEADERS_MID) {
if
(
$hdr
=~ /^(?:In-Reply-To|References|Message-ID)$/i) {
if
(
exists
$parsed
{
"*MI"
}) {
$parsed
{
"*MI"
} .=
" "
.
$val
;
}
else
{
$parsed
{
"*MI"
} =
$val
;
}
}
}
if
(MAP_HEADERS_FROMTOCC) {
if
(
$hdr
=~ /^(?:From|To|Cc)$/i) {
if
(
exists
$parsed
{
"*Ad"
}) {
$parsed
{
"*Ad"
} .=
" "
.
$val
;
}
else
{
$parsed
{
"*Ad"
} =
$val
;
}
}
}
if
(MAP_HEADERS_USERAGENT) {
if
(
$hdr
=~ /^(?:X-Mailer|User-Agent)$/i) {
if
(
exists
$parsed
{
"*UA"
}) {
$parsed
{
"*UA"
} .=
" "
.
$val
;
}
else
{
$parsed
{
"*UA"
} =
$val
;
}
}
}
if
(
defined
$HEADER_NAME_COMPRESSION
{
$hdr
}) {
$hdr
=
$HEADER_NAME_COMPRESSION
{
$hdr
};
}
if
(
exists
$parsed
{
$hdr
}) {
$parsed
{
$hdr
} .=
" "
.
$val
;
}
else
{
$parsed
{
$hdr
} =
$val
;
}
}
if
(would_log(
'dbg'
,
'bayes'
) > 1) {
foreach
my
$hdr
(
sort
keys
%parsed
) {
dbg(
"bayes: header tokens for $hdr = \"$parsed{$hdr}\""
);
}
}
return
%parsed
;
}
sub
_pre_chew_content_type {
my
(
$self
,
$val
) =
@_
;
if
(
$val
=~ s/boundary=[\"\'](.*?)[\"\']/ /ig) {
my
$boundary
= $1;
$boundary
=
''
if
!
defined
$boundary
;
$boundary
=~ s/[a-fA-F0-9]/H/gs;
$boundary
=~ s/([-_\.=]+)/ $1 /gs;
$val
.=
$boundary
;
}
$val
=~ s/\b(?:text|charset)\b/ /g;
$val
;
}
sub
_pre_chew_message_id {
my
(
$self
,
$val
) =
@_
;
$val
=~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$
([0-9a-f]{4})[0-9a-f]{4}\$
([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;
$val
=~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;
$val
=~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.
[A-F0-9]{10,12}\@//gx;
$val
=~ s/[^_A-Za-z0-9]/ /g;
$val
;
}
sub
_pre_chew_received {
my
(
$self
,
$val
) =
@_
;
$val
=~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs;
$val
=~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs;
$val
=~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs;
$val
=~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs;
$val
=~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)?
[0-3\s]?[0-9]\s
(?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s
(?:19|20)?[0-9]{2}\s
[0-2][0-9](?:\:[0-5][0-9]){1,2}\s
(?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))*
//gx;
$val
=~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{
if
($2 eq
'10'
|| ($2 eq
'192'
&& $3 eq
'168'
)) {
$1.$2.$3.$4.
" ip*"
.$1.$2.$3.$4.
" "
;
}
else
{
$1.$2.$3.
" ip*"
.$1.$2.$3.$4.
" "
;
}
}gex;
$val
=~ s/\b(?:
with
|from|
for
|SMTP|ESMTP)\b/ /g;
$val
;
}
sub
_pre_chew_addr_header {
my
(
$self
,
$val
) =
@_
;
local
(
$_
);
my
@addrs
= Mail::SpamAssassin::Util::parse_header_addresses(
$val
);
my
@toks
;
foreach
my
$addr
(
@addrs
) {
if
(
defined
$addr
->{phrase}) {
foreach
(
split
(/\s+/,
$addr
->{phrase})) {
push
@toks
,
"N*"
.
$_
;
}
}
if
(
defined
$addr
->{address}) {
push
@toks
,
$self
->_tokenize_mail_addrs(
$addr
->{address});
}
}
return
join
(
' '
,
@toks
);
}
sub
_tokenize_mail_addrs {
my
(
$self
,
$addr
) =
@_
;
(
$addr
=~ /(.+)\@(.+)$/) or
return
();
my
@toks
;
push
(
@toks
,
"U*"
.$1,
"D*"
.$2);
$_
= $2;
while
(s/^[^\.]+\.(.+)$/$1/gs) {
push
(
@toks
,
"D*"
.$1); }
return
@toks
;
}
sub
_compute_prob_for_all_tokens {
my
(
$self
,
$tokensdata
,
$ns
,
$nn
) =
@_
;
my
@probabilities
;
return
if
!
$ns
|| !
$nn
;
my
$threshold
= 1;
if
(!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
$threshold
= 10;
}
if
(!
$self
->{use_hapaxes}) {
$threshold
= 2;
}
foreach
my
$tokendata
(@{
$tokensdata
}) {
my
$s
=
$tokendata
->[1];
my
$n
=
$tokendata
->[2];
my
$prob
;
no
warnings
'uninitialized'
;
if
(
$s
+
$n
>=
$threshold
) {
$prob
= (
$s
*
$nn
) / (
$n
*
$ns
+
$s
*
$nn
);
if
(USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
my
$robn
=
$s
+
$n
;
$prob
=
(
$Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X
+ (
$robn
*
$prob
))
/
(
$Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT
+
$robn
);
}
}
push
(
@probabilities
,
$prob
);
}
return
\
@probabilities
;
}
sub
_compute_prob_for_token {
my
(
$self
,
$token
,
$ns
,
$nn
,
$s
,
$n
) =
@_
;
if
(!
defined
(
$s
) || !
defined
(
$n
)) {
(
$s
,
$n
,
undef
) =
$self
->{store}->tok_get(
$token
);
}
return
if
!
$s
&& !
$n
;
my
$probabilities_ref
=
$self
->_compute_prob_for_all_tokens([ [
$token
,
$s
,
$n
, 0] ],
$ns
,
$nn
);
return
$probabilities_ref
->[0];
}
sub
_compute_declassification_distance {
my
(
$self
,
$Ns
,
$Nn
,
$ns
,
$nn
,
$prob
) =
@_
;
return
0
if
$ns
== 0 &&
$nn
== 0;
if
(!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
return
0
if
(
$ns
+
$nn
< 10);}
if
(!
$self
->{use_hapaxes}) {
return
0
if
(
$ns
+
$nn
< 2);}
return
0
if
$Ns
== 0 ||
$Nn
== 0;
return
0
if
abs
(
$prob
- 0.5 ) <
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH
;
my
(
$Na
,
$na
,
$Nb
,
$nb
) =
$prob
> 0.5 ? (
$Nn
,
$nn
,
$Ns
,
$ns
) : (
$Ns
,
$ns
,
$Nn
,
$nn
);
my
$p
= 0.5 -
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH
;
return
int
( 1.0 - 1e-6 +
$nb
*
$Na
*
$p
/ (
$Nb
* ( 1 -
$p
)) ) -
$na
unless
USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
my
$s
=
$Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT
;
my
$sx
=
$Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X
;
my
$a
=
$Nb
* ( 1 -
$p
);
my
$b
=
$Nb
* (
$sx
+
$nb
* ( 1 -
$p
) -
$p
*
$s
) -
$p
*
$Na
*
$nb
;
my
$c
=
$Na
*
$nb
* (
$sx
-
$p
* (
$s
+
$nb
) );
my
$discrim
=
$b
*
$b
- 4 *
$a
*
$c
;
my
$disc_max_0
=
$discrim
< 0 ? 0 :
$discrim
;
my
$dd_exact
= ( 1.0 - 1e-6 + ( -
$b
+
sqrt
(
$disc_max_0
) ) / ( 2
*$a
) ) -
$na
;
return
$dd_exact
< 1 ? 1 :
int
(
$dd_exact
);
}
sub
_opportunistic_calls {
my
(
$self
,
$journal_only
) =
@_
;
if
(!
$self
->{store}->db_readable()) {
dbg(
"bayes: opportunistic call attempt failed, DB not readable"
);
return
;
}
my
$running_expire
=
$self
->{store}->get_running_expire_tok();
if
(
defined
$running_expire
&&
$running_expire
+
$OPPORTUNISTIC_LOCK_VALID
>
time
() ) {
dbg(
"bayes: opportunistic call attempt skipped, found fresh running expire magic token"
);
return
;
}
if
(!
$journal_only
&&
$self
->{store}->expiry_due()) {
dbg(
"bayes: opportunistic call found expiry due"
);
$self
->{main}->{bayes_scanner}->sync(1,1);
}
elsif
(
$self
->{store}->sync_due() ) {
dbg(
"bayes: opportunistic call found journal sync due"
);
$self
->{main}->{bayes_scanner}->sync(1,0);
if
(
$self
->{store}->db_writable()) {
$self
->{store}->remove_running_expire_tok();
}
}
return
;
}
sub
learner_new {
my
(
$self
) =
@_
;
my
$store
;
my
$module
=
$self
->{conf}->{bayes_store_module};
if
(!
$module
) {
$module
=
'Mail::SpamAssassin::BayesStore::DBM'
;
}
elsif
(
$module
=~ /^([_A-Za-z0-9:]+)$/) {
$module
= untaint_var(
$module
);
}
else
{
die
"bayes: invalid module: $module\n"
;
}
dbg(
"bayes: learner_new self=%s, bayes_store_module=%s"
,
$self
,
$module
);
if
(
$self
->{store}) {
$self
->{store}->untie_db();
undef
$self
->{store};
}
eval
'
require
'.$module.'
;
$store
=
'.$module.'
->new(
$self
);
1;
' or
do
{
my
$eval_stat
= $@ ne
''
? $@ :
"errno=$!"
;
chomp
$eval_stat
;
die
"bayes: learner_new $module new() failed: $eval_stat\n"
;
};
dbg(
"bayes: learner_new: got store=%s"
,
$store
);
$self
->{store} =
$store
;
$self
;
}
sub
bayes_report_make_list {
my
(
$self
,
$pms
,
$info
,
$param
) =
@_
;
return
"Tokens not available."
unless
defined
$info
;
my
(
$limit
,
$fmt_arg
,
$more
) =
split
/,/, (
$param
||
'5'
);
my
%formats
= (
short
=>
'$t'
,
Short
=>
'Token: \"$t\"'
,
compact
=>
'$p-$D--$t'
,
Compact
=>
'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"'
,
medium
=>
'$p-$D-$N--$t'
,
long
=>
'$p-$d--${h}h-${s}s--${a}d--$t'
,
Long
=>
'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"'
);
my
$raw_fmt
= (!
$fmt_arg
?
'$p-$D--$t'
:
$formats
{
$fmt_arg
});
return
"Invalid format, must be one of: "
.
join
(
","
,
keys
%formats
)
unless
defined
$raw_fmt
;
my
$fmt
=
'"'
.
$raw_fmt
.
'"'
;
my
$amt
=
$limit
<
@$info
?
$limit
:
@$info
;
return
""
unless
$amt
;
my
$ns
=
$pms
->{bayes_nspam};
my
$nh
=
$pms
->{bayes_nham};
my
$digit
=
sub
{
$_
[0] > 9 ?
"+"
:
$_
[0] };
my
$now
=
time
;
join
', '
,
map
{
my
(
$t
,
$prob
,
$s
,
$h
,
$u
) =
@$_
;
my
$a
=
int
((
$now
-
$u
)/(3600 * 24));
my
$d
=
$self
->_compute_declassification_distance(
$ns
,
$nh
,
$s
,
$h
,
$prob
);
my
$p
=
sprintf
"%.3f"
,
$prob
;
my
$n
=
$s
+
$h
;
my
(
$c
,
$o
) =
$prob
< 0.5 ? (
$h
,
$s
) : (
$s
,
$h
);
my
(
$D
,
$S
,
$H
,
$C
,
$O
,
$N
) =
map
&$digit
(
$_
), (
$d
,
$s
,
$h
,
$c
,
$o
,
$n
);
eval
$fmt
;
} @{
$info
}[0..
$amt
-1];
}
1;