# Inheritance: this package derives from Mail::SpamAssassin::Plugin, which is
# where the SUPER::new call in new() is resolved.
our
@ISA
=
qw(Mail::SpamAssassin::Plugin)
;
# Constructor.
#
# Builds the object through the parent class constructor, reblesses it into
# the requested class, and registers every eval rule implemented in this file
# as a body eval (TYPE_BODY_EVALS).  Finally registers priority -1 for the
# parsed_metadata method.
#
# Arguments: the class (or an existing instance) and the main SpamAssassin
# object, which is handed unchanged to SUPER::new.
# Returns: the new plugin object.
sub new {
  my ($class, $mailsaobject) = @_;

  # Accept either a class name or an instance as the invocant.
  $class = ref($class) || $class;

  my $self = $class->SUPER::new($mailsaobject);
  bless($self, $class);

  # Registration order is kept identical to the one-call-per-rule form.
  my @eval_rules = (
    "pdf_count",
    "pdf_image_count",
    "pdf_pixel_coverage",
    "pdf_image_size_exact",
    "pdf_image_size_range",
    "pdf_named",
    "pdf_name_regex",
    "pdf_image_to_text_ratio",
    "pdf_match_md5",
    "pdf_match_fuzzy_md5",
    "pdf_match_details",
    "pdf_is_encrypted",
    "pdf_is_empty_body",
  );
  foreach my $rule (@eval_rules) {
    $self->register_eval_rule($rule, $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
  }

  $self->register_method_priority("parsed_metadata", -1);

  return $self;
}
# Plugin hook: inspect the message's MIME parts for PDF attachments.
#
# Resets the per-message counters in $pms->{pdfinfo}, asks the message for
# all parts whose type matches (image|application)/(pdf|octet-stream), and
# runs _get_pdf_details() on each part whose file name ends in .pdf/.fdf
# (case-insensitive) or whose type ends in "/pdf".  Afterwards the PDFCOUNT
# and PDFIMGCOUNT tags are set from the counters.
#
# Arguments: $self, and an options hash whose {permsgstatus} entry is the
# per-message status object.
sub parsed_metadata {
  my ($self, $opts) = @_;
  my $pms = $opts->{permsgstatus};

  # Start every message with zeroed counters.
  @{ $pms->{pdfinfo} }{qw(count_pdf count_pdf_images)} = (0, 0);

  my @parts = $pms->{msg}->find_parts(qr@^(image|application)/(pdf|octet\-stream)$@, 1);
  my $part_count = scalar @parts;
  dbg("pdfinfo: Identified $part_count possible mime parts that need checked for PDF content");

  foreach my $p (@parts) {
    my $type = $p->{type} || '';
    my $name = $p->{name} || '';
    dbg("pdfinfo: found part, type=$type file=$name");

    # Only parts that look like a PDF by file name or by MIME type are parsed.
    my $looks_like_pdf = ($name =~ /\.[fp]df$/i || $type =~ m@/pdf$@);
    next if !$looks_like_pdf;

    _get_pdf_details($pms, $p);
    $pms->{pdfinfo}->{count_pdf}++;
  }

  _set_tag($pms, 'PDFCOUNT', $pms->{pdfinfo}->{count_pdf});
  _set_tag($pms, 'PDFIMGCOUNT', $pms->{pdfinfo}->{count_pdf_images});
}
# Parse one decoded MIME part as a PDF and record what is found.
#
# Arguments:
#   $pms  - per-message status object; results go into $pms->{pdfinfo} and,
#           through _set_tag(), into $pms->{tag_data}
#   $part - MIME part object; ->decode() supplies the raw bytes and {name}
#           the attachment file name
#
# Recorded in $pms->{pdfinfo}:
#   names_pdf, encrypted, dems_pdf ("HxW" keys), count_pdf_images, pc_pdf
#   (total image area), details->{author,creator,created,modified,producer,
#   title}, md5 and fuzzy_md5.
# Tags set: PDFVERSION, PDFNAME, PDFIMGDIM, PDFIMGAREA, PDFAUTHOR,
#   PDFCREATOR, PDFPRODUCER, PDFTITLE, PDFMD5, PDFMD5FUZZY1, PDFMD5FUZZY2.
# Up to 20 distinct URIs are passed to $pms->add_uri_detail_list().
#
# Returns early (after a debug message) when the %PDF-x.y magic is not found.
sub _get_pdf_details {
  my ($pms, $part) = @_;

  my $data = $part->decode();

  # Remove a leading UTF-8 BOM.
  $data =~ s/^\xef\xbb\xbf//;

  # The magic header must start within the first 1024 bytes.
  if ($data !~ /^.{0,1024}\%PDF\-(\d\.\d)/s) {
    dbg("pdfinfo: PDF magic header not found, invalid file?");
    return;
  }
  my $version = $1;
  _set_tag($pms, 'PDFVERSION', $version);

  my ($fuzzy_data, $pdf_tags);
  my ($md5, $fuzzy_md5) = ('', '');
  my ($total_height, $total_width, $total_area, $line_count) = (0, 0, 0, 0);

  my $name = $part->{name} || '';
  _set_tag($pms, 'PDFNAME', $name);
  $pms->{pdfinfo}->{names_pdf}->{$name} = 1 if $name;

  my $no_more_fuzzy = 0;
  my $got_image = 0;
  my $encrypted = 0;
  my %uris;

  # Image dimensions must survive from one line to the next: "/Width N" and
  # "/Height N" may sit on separate lines.  Declaring these inside the loop
  # body would reset them on every line, so such images would never be
  # counted.  They are cleared explicitly once an image is complete.
  my ($width, $height);

  while ($data =~ /([^\n]+)/g) {
    my $line = $1;

    # Fuzzy-hash input: structural lines from the start of the file (at most
    # the first 69 lines), until the first stream or image is seen.
    if (!$no_more_fuzzy && ++$line_count < 70) {
      if ($line !~ m/^\%/ &&
          $line !~ m/^\/(?:Height|Width|(?:(?:Media|Crop)Box))/ &&
          $line !~ m/^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+cm$/) {
        $line =~ s/\s+$//;
        $fuzzy_data .= $line;
      }
      $no_more_fuzzy = 1 if index($line, 'stream') >= 0;
    }

    $got_image = 1 if index($line, '/Image') >= 0;

    # Only a line that begins with /Encrypt sets the flag.
    if (!$encrypted && index($line, '/Encrypt') == 0) {
      $encrypted = $pms->{pdfinfo}->{encrypted} = 1;
    }

    if ($got_image) {
      if ($line =~ /^(\d+)\s+\d+\s+\d+\s+(\d+)\s+\d+\s+\d+\s+cm$/) {
        $width = $1;
        $height = $2;
      }
      elsif ($line =~ /^\/Width\s(\d+)/) {
        $width = $1;
      }
      elsif ($line =~ /^\/Height\s(\d+)/) {
        $height = $1;
      }
      elsif ($line =~ m/\/Width\s(\d+)\/Height\s(\d+)/) {
        $width = $1;
        $height = $2;
      }

      if ($width && $height) {
        $no_more_fuzzy = 1;
        my $area = $width * $height;
        $total_height += $height;
        $total_width += $width;
        $total_area += $area;
        $pms->{pdfinfo}->{dems_pdf}->{"${height}x${width}"} = 1;
        $pms->{pdfinfo}->{count_pdf_images}++;
        dbg("pdfinfo: Found image in PDF $name: $height x $width pixels ($area pixels sq.)");
        _set_tag($pms, 'PDFIMGDIM', "${height}x${width}");
        # Reset and wait for the next /Image marker.
        $got_image = $height = $width = 0;
      }
    }

    # Everything below looks for /Name tokens; skip lines without a slash.
    next unless index($line, '/') >= 0;

    # Leading tag names feed the second fuzzy hash.
    if ($line =~ m/^\/([A-Za-z]+)/) {
      $pdf_tags .= $1;
    }

    # URIs, capped at 20 distinct entries per PDF.
    if (keys %uris < 20 &&
        $line =~ /(?:\/S\s{0,2}\/URI\s{0,2}|^\s*)\/URI\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
      my $location = _parse_string($1);
      # Require a dot somewhere after the first character and no NUL bytes.
      next unless index($location, '.') > 0;
      next if $location =~ /\0/;
      if (!exists $uris{$location}) {
        $uris{$location} = 1;
        dbg("pdfinfo: found URI: $location");
        $pms->add_uri_detail_list($location);
      }
    }

    if ($line =~ /\/Author\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
      my $author = _parse_string($1);
      dbg("pdfinfo: found property Author=$author");
      $pms->{pdfinfo}->{details}->{author}->{$author} = 1;
      _set_tag($pms, 'PDFAUTHOR', $author);
    }

    if ($line =~ /\/Creator\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
      my $creator = _parse_string($1);
      dbg("pdfinfo: found property Creator=$creator");
      $pms->{pdfinfo}->{details}->{creator}->{$creator} = 1;
      _set_tag($pms, 'PDFCREATOR', $creator);
    }

    if ($line =~ /\/CreationDate\s{0,2}\(D\:(\d+)/) {
      my $created = _parse_string($1);
      dbg("pdfinfo: found property Created=$created");
      $pms->{pdfinfo}->{details}->{created}->{$created} = 1;
    }

    if ($line =~ /\/ModDate\s{0,2}\(D\:(\d+)/) {
      my $modified = _parse_string($1);
      dbg("pdfinfo: found property Modified=$modified");
      $pms->{pdfinfo}->{details}->{modified}->{$modified} = 1;
    }

    if ($line =~ /\/Producer\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
      my $producer = _parse_string($1);
      dbg("pdfinfo: found property Producer=$producer");
      $pms->{pdfinfo}->{details}->{producer}->{$producer} = 1;
      _set_tag($pms, 'PDFPRODUCER', $producer);
    }

    if ($line =~ /\/Title\s{0,2}( \( .*? (?<!\\) \) | < [^>]* > )/x) {
      my $title = _parse_string($1);
      dbg("pdfinfo: found property Title=$title");
      $pms->{pdfinfo}->{details}->{title}->{$title} = 1;
      _set_tag($pms, 'PDFTITLE', $title);
    }
  }

  # Also record the summed dimensions of all images in this PDF.
  $pms->{pdfinfo}->{dems_pdf}->{"${total_height}x${total_width}"} = 1
    if ($total_height && $total_width);

  if ($total_area) {
    $pms->{pdfinfo}->{pc_pdf} = $total_area;
    _set_tag($pms, 'PDFIMGAREA', $total_area);
    dbg("pdfinfo: Total HxW: $total_height x $total_width ($total_area area)");
  }

  $md5 = uc(md5_hex($data)) if $data;
  $fuzzy_md5 = uc(md5_hex($fuzzy_data)) if $fuzzy_data;
  my $tags_md5 = '';
  $tags_md5 = uc(md5_hex($pdf_tags)) if $pdf_tags;

  dbg("pdfinfo: MD5 results for $name: md5=$md5 fuzzy1=$fuzzy_md5 fuzzy2=$tags_md5");

  if ($md5) {
    $pms->{pdfinfo}->{md5}->{$md5} = 1;
    # The PDFMD5 tag carries the hash of the whole file, not the fuzzy hash.
    _set_tag($pms, 'PDFMD5', $md5);
  }
  if ($fuzzy_md5) {
    $pms->{pdfinfo}->{fuzzy_md5}->{$fuzzy_md5} = 1;
    _set_tag($pms, 'PDFMD5FUZZY1', $fuzzy_md5);
  }
  if ($tags_md5) {
    $pms->{pdfinfo}->{fuzzy_md5}->{$tags_md5} = 1;
    _set_tag($pms, 'PDFMD5FUZZY2', $tags_md5);
  }
}
# Decode a PDF string token into plain bytes.
#
# A token starting with "<" is treated as hex: every pair of hex digits is
# converted to one byte (anything else in the token is skipped).  Any other
# token has one leading "(" and one trailing ")" stripped, three-digit octal
# escapes (\000-\377) decoded, and backslash escapes of "(", ")" and "\"
# removed.  In both forms, if the decoded value starts with a UTF-16 byte
# order mark, the mark is removed and then all NUL bytes are dropped.
#
# Returns at most the first 256 characters of the decoded value.
sub _parse_string {
  my $str = shift;

  if ($str =~ /^</) {
    # Collect every hex pair and convert each one to a byte.
    my @pairs = $str =~ /([0-9A-Fa-f]{2})/g;
    $str = join('', map { pack("H*", $_) } @pairs);
    if ($str =~ s/^(?:\xfe\xff|\xff\xfe)//) {
      $str =~ s/\x00//g;
    }
  }
  else {
    $str =~ s/^\(//;
    $str =~ s/\)$//;
    # Octal escapes that are not themselves preceded by a backslash.
    $str =~ s/(?<!\\)\\([0-3][0-7][0-7])/pack("C",oct($1))/ge;
    if ($str =~ s/^(?:\xfe\xff|\xff\xfe)//) {
      $str =~ s/\x00//g;
    }
    $str =~ s/\\([()\\])/$1/g;
  }

  return substr($str, 0, 256);
}
# Store a value under a tag name in $pms->{tag_data}.
#
# Undefined or empty values are ignored.  The first value for a tag is stored
# as is; later values are appended after a single space, but only while the
# text already stored is shorter than 2048 characters.
sub _set_tag {
  my ($pms, $tag, $value) = @_;

  return if !(defined $value && $value ne '');

  dbg("pdfinfo: set_tag called for $tag: $value");

  if (!exists $pms->{tag_data}->{$tag}) {
    $pms->{tag_data}->{$tag} = $value;
  }
  elsif (length($pms->{tag_data}->{$tag}) < 2048) {
    $pms->{tag_data}->{$tag} .= ' ' . $value;
  }
}
# Eval rule: true when a PDF attachment with exactly this file name was
# recorded in $pms->{pdfinfo}->{names_pdf}.
#
# Returns 0 when no name is given.
sub pdf_named {
  my ($self, $pms, $body, $name) = @_;

  if (!defined $name) {
    return 0;
  }

  return exists $pms->{pdfinfo}->{names_pdf}->{$name} ? 1 : 0;
}
# Eval rule: true when any recorded PDF file name matches the given regexp.
#
# The pattern is compiled with compile_regexp($regex, 2); if that fails, a
# warning naming the current rule is emitted and the rule does not match.
# Returns 0 when no pattern is given or no PDF names were recorded.
sub pdf_name_regex {
  my ($self, $pms, $body, $regex) = @_;

  return 0 if !defined $regex;
  return 0 if !exists $pms->{pdfinfo}->{names_pdf};

  my ($rec, $err) = compile_regexp($regex, 2);
  unless ($rec) {
    my $rulename = $pms->get_current_eval_rule_name();
    warn "pdfinfo: invalid regexp for $rulename '$regex': $err";
    return 0;
  }

  my @names = keys %{$pms->{pdfinfo}->{names_pdf}};
  foreach my $name (@names) {
    next unless $name =~ $rec;
    dbg("pdfinfo: pdf_name_regex hit on $name");
    return 1;
  }

  return 0;
}
# Eval rule: 1 when the encrypted flag in $pms->{pdfinfo} is set, else 0.
sub pdf_is_encrypted {
  my ($self, $pms, $body) = @_;

  if ($pms->{pdfinfo}->{encrypted}) {
    return 1;
  }
  return 0;
}
# Eval rule: range check on the count_pdf counter in $pms->{pdfinfo}.
# $min and $max are handed to _result_check() together with the counter.
sub pdf_count {
  my ($self, $pms, $body, $min, $max) = @_;

  my $count = $pms->{pdfinfo}->{count_pdf};
  return _result_check($min, $max, $count);
}
# Eval rule: range check on the count_pdf_images counter in $pms->{pdfinfo}.
# $min and $max are handed to _result_check() together with the counter.
sub pdf_image_count {
  my ($self, $pms, $body, $min, $max) = @_;

  my $count = $pms->{pdfinfo}->{count_pdf_images};
  return _result_check($min, $max, $count);
}
# Eval rule: range check on the pc_pdf value (total image area) stored in
# $pms->{pdfinfo}.  $min and $max are handed to _result_check() with it.
sub pdf_pixel_coverage {
  my ($self, $pms, $body, $min, $max) = @_;

  my $coverage = $pms->{pdfinfo}->{pc_pdf};
  return _result_check($min, $max, $coverage);
}
# Eval rule: range check on (body text length) / (total PDF image area).
#
# Returns 0 when $max is not given, when no image area was recorded, or when
# the body text is empty.  Otherwise the ratio is checked by _result_check()
# with its fourth argument set, so a ratio equal to $max does not match.
sub pdf_image_to_text_ratio {
  my ($self, $pms, $body, $min, $max) = @_;

  return 0 if !defined $max;
  return 0 if !$pms->{pdfinfo}->{pc_pdf};

  my $text = join('', @$body);
  my $textlen = length($text);
  return 0 if !$textlen;

  my $ratio = $textlen / $pms->{pdfinfo}->{pc_pdf};
  dbg("pdfinfo: image ratio=$ratio, min=$min max=$max");

  return _result_check($min, $max, $ratio, 1);
}
# Eval rule: true when the message has at least one PDF and (almost) no body
# text.
#
# The first element of @$body is always skipped, as are elements without any
# non-whitespace character.  The lengths of the remaining elements are added
# up; the rule fails as soon as the sum exceeds $min (default 0).
sub pdf_is_empty_body {
  my ($self, $pms, $body, $min) = @_;

  return 0 if !$pms->{pdfinfo}->{count_pdf};
  $min = 0 unless $min;

  my $bytes = 0;
  my $first_skipped = 0;
  foreach my $line (@$body) {
    # The first element is never counted.
    if (!$first_skipped) {
      $first_skipped = 1;
      next;
    }
    next if $line !~ /\S/;
    $bytes += length($line);
    if ($bytes > $min) {
      return 0;
    }
  }

  dbg("pdfinfo: pdf_is_empty_body matched ($bytes <= $min)");
  return 1;
}
# Eval rule: true when an image of exactly $height x $width was recorded in
# $pms->{pdfinfo}->{dems_pdf}.  Returns 0 when $width is not given.
sub pdf_image_size_exact {
  my ($self, $pms, $body, $height, $width) = @_;

  if (!defined $width) {
    return 0;
  }

  return exists $pms->{pdfinfo}->{dems_pdf}->{"${height}x${width}"} ? 1 : 0;
}
# Eval rule: true when any recorded "HxW" dimension lies within the given
# bounds.
#
# $minh and $minw are required lower bounds; $maxh and $maxw are optional
# upper bounds (inclusive).  Returns 0 when $minw is not given or when no
# dimensions were recorded.
sub pdf_image_size_range {
  my ($self, $pms, $body, $minh, $minw, $maxh, $maxw) = @_;

  return 0 if !defined $minw;
  return 0 if !exists $pms->{pdfinfo}->{dems_pdf};

  my @dems = keys %{$pms->{pdfinfo}->{dems_pdf}};
  foreach my $dem (@dems) {
    my ($h, $w) = split(/x/, $dem);

    # Checks run in the same order as separate skip tests would:
    # min height, min width, max height, max width.
    my $fits = $h >= $minh
            && $w >= $minw
            && !(defined $maxh && $h > $maxh)
            && !(defined $maxw && $w > $maxw);
    return 1 if $fits;
  }

  return 0;
}
# Eval rule: true when the given MD5 (compared in upper case) was recorded in
# $pms->{pdfinfo}->{md5}.  Returns 0 when no MD5 is given.
sub pdf_match_md5 {
  my ($self, $pms, $body, $md5) = @_;

  if (!defined $md5) {
    return 0;
  }

  return exists $pms->{pdfinfo}->{md5}->{uc $md5} ? 1 : 0;
}
# Eval rule: true when the given MD5 (compared in upper case) was recorded in
# $pms->{pdfinfo}->{fuzzy_md5}.  Returns 0 when no MD5 is given.
sub pdf_match_fuzzy_md5 {
  my ($self, $pms, $body, $md5) = @_;

  if (!defined $md5) {
    return 0;
  }

  return exists $pms->{pdfinfo}->{fuzzy_md5}->{uc $md5} ? 1 : 0;
}
# Eval rule: true when any value recorded for the named detail (a key under
# $pms->{pdfinfo}->{details}) matches the given regexp.
#
# The pattern is compiled with compile_regexp($regex, 2); if that fails, a
# warning naming the current rule is emitted and the rule does not match.
# Returns 0 when no pattern is given or nothing was recorded for $detail.
sub pdf_match_details {
  my ($self, $pms, $body, $detail, $regex) = @_;

  return 0 if !defined $regex;
  return 0 if !exists $pms->{pdfinfo}->{details}->{$detail};

  my ($rec, $err) = compile_regexp($regex, 2);
  unless ($rec) {
    my $rulename = $pms->get_current_eval_rule_name();
    warn "pdfinfo: invalid regexp for $rulename '$regex': $err";
    return 0;
  }

  my $values = $pms->{pdfinfo}->{details}->{$detail};
  for (keys %{$values}) {
    next unless $_ =~ $rec;
    dbg("pdfinfo: pdf_match_details $detail ($regex) match: $_");
    return 1;
  }

  return 0;
}
# Shared range test used by the counting eval rules.
#
# Arguments: $min, $max, $value, $nomaxequal.
# Returns 1 when $value >= $min and, if $max is defined, $value <= $max.
# With a true $nomaxequal, a value numerically equal to $max is rejected.
# Returns 0 when $min or $value is undefined.
sub _result_check {
  my ($min, $max, $value, $nomaxequal) = @_;

  return 0 if !defined $min || !defined $value;
  return 0 if $value < $min;

  if (defined $max) {
    return 0 if $value > $max;
  }

  # An undefined flag is simply false here.
  if ($nomaxequal) {
    return 0 if $value == $max;
  }

  return 1;
}
# A file loaded with require/use must end by evaluating to a true value.
1;