Hide Show 237 lines of Pod
my
$VERSION
= 0.001;
proc_status_ok exit_status_str);
our
@ISA
=
qw(Mail::SpamAssassin::Plugin)
;
sub
new {
my
(
$class
,
$mailsa
) =
@_
;
$class
=
ref
(
$class
) ||
$class
;
my
$self
=
$class
->SUPER::new(
$mailsa
);
bless
(
$self
,
$class
);
$self
->{match} = [];
$self
->{tools} = {};
$self
->{magic} = 0;
$self
->register_method_priority(
'post_message_parse'
, -1);
$self
->set_config(
$mailsa
->{conf});
return
$self
;
}
sub
set_config {
my
(
$self
,
$conf
) =
@_
;
my
@cmds
;
push
(
@cmds
, {
setting
=>
'extracttext_maxparts'
,
default
=> 10,
type
=>
$Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
,
});
push
(
@cmds
, {
setting
=>
'extracttext_timeout'
,
default
=> 5,
type
=>
$Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
,
code
=>
sub
{
my
(
$self
,
$key
,
$value
,
$line
) =
@_
;
unless
(
defined
$value
&&
$value
!~ /^$/) {
return
$Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE
;
}
local
($1,$2);
unless
(
$value
=~ /^(\d+)(?:\s+(\d+))?$/) {
return
$Mail::SpamAssassin::Conf::INVALID_VALUE
;
}
$self
->{extracttext_timeout} = $1;
$self
->{extracttext_timeout_total} = $2;
}
});
$conf
->{parser}->register_commands(\
@cmds
);
}
sub
parse_config {
my
(
$self
,
$opts
) =
@_
;
return
0
if
$opts
->{user_config};
if
(
$opts
->{key} eq
'extracttext_use'
) {
$self
->inhibit_further_callbacks();
if
(
$opts
->{value} =~ s/\\\\/\\/g) {
warn
"extracttext: DOUBLE BACKSLASHES DEPRECATED, change config to single backslashes, autoconverted for backward compatibility: $opts->{key} $opts->{value}\n"
;
}
if
(
$opts
->{value} =~ /(?:to|2)html\b/) {
warn
"extracttext: HTML tools are not supported, plain text output is required. Please remove: $opts->{key} $opts->{value}\n"
;
return
1;
}
my
@vals
=
split
(/\s+/,
$opts
->{value});
my
$tool
=
lc
(
shift
@vals
);
return
0
unless
@vals
;
foreach
my
$what
(
@vals
) {
my
$where
;
if
(
index
(
$what
,
'/'
) >= 0) {
$where
=
'type'
;
}
else
{
$where
=
'name'
;
if
(
$what
=~ /^\.[a-zA-Z0-9]+$/) {
$what
=
".*\\$what"
;
}
}
my
(
$rec
,
$err
) = compile_regexp(
'^(?i)'
.
$what
.
'$'
, 0);
if
(!
$rec
) {
warn
(
"invalid regexp '$what': $err\n"
);
return
0;
}
push
@{
$self
->{match}}, {
where
=>
$where
,
what
=>
$rec
,
tool
=>
$tool
};
dbg(
'extracttext: use: %s %s %s'
,
$tool
,
$where
,
$what
);
}
return
1;
}
if
(
$opts
->{key} eq
'extracttext_external'
) {
$self
->inhibit_further_callbacks();
if
(
$opts
->{value} =~ s/\\\\/\\/g) {
warn
"extracttext: DOUBLE BACKSLASHES DEPRECATED, change config to single backslashes, autoconverted for backward compatibility: $opts->{key} $opts->{value}\n"
;
}
if
(
$opts
->{value} =~ /(?:to|2)html\b/) {
warn
"extracttext: HTML tools are not supported, plain text output is required. Please remove: $opts->{key} $opts->{value}\n"
;
return
1;
}
my
%env
;
while
(
$opts
->{value} =~ s/\{(.+?)\}/ /g) {
my
(
$k
,
$v
) =
split
(/=/, $1, 2);
$env
{
$k
} =
defined
$v
?
$v
:
''
;
}
my
@vals
=
split
(/\s+/,
$opts
->{value});
my
$name
=
lc
(
shift
@vals
);
return
0
unless
@vals
> 1;
if
(
$self
->{tools}->{
$name
}) {
warn
"extracttext: duplicate tool defined: $name\n"
;
return
0;
}
$self
->{tools}->{
$name
} = {
'name'
=>
$name
,
'type'
=>
'external'
,
'env'
=> \
%env
,
'cmd'
=> \
@vals
,
};
dbg(
'extracttext: external: %s "%s"'
,
$name
,
join
(
'","'
,
@vals
));
return
1;
}
return
0;
}
sub
_extract_external {
my
(
$self
,
$object
,
$tool
) =
@_
;
my
(
$errno
,
$pipe_errno
,
$tmp_file
,
$err_file
,
$pid
);
my
$resp
=
''
;
my
@cmd
= @{
$tool
->{cmd}};
Mail::SpamAssassin::PerMsgStatus::enter_helper_run_mode(
$self
);
foreach
(
keys
%{
$tool
->{env}}) {
$ENV
{
$_
} =
$tool
->{env}{
$_
};
}
my
$timer
= Mail::SpamAssassin::Timeout->new(
{
secs
=>
$self
->{main}->{conf}->{extracttext_timeout},
deadline
=>
$self
->{
'master_deadline'
} });
my
$err
=
$timer
->run_and_catch(
sub
{
local
$SIG
{PIPE} =
sub
{
die
"__brokenpipe__ignore__\n"
};
(
$tmp_file
,
my
$tmp_fh
) = Mail::SpamAssassin::Util::secure_tmpfile();
$tmp_file
or
die
"failed to create a temporary file"
;
print
$tmp_fh
${
$object
->{data}};
close
(
$tmp_fh
);
(
$err_file
,
my
$err_fh
) = Mail::SpamAssassin::Util::secure_tmpfile();
$err_file
or
die
"failed to create a temporary file"
;
close
(
$err_fh
);
$err_file
= untaint_file_path(
$err_file
);
foreach
(
@cmd
) {
s/\{\}/
$tmp_file
/;
$_
= untaint_var(
$_
);
}
$pid
= Mail::SpamAssassin::Util::helper_app_pipe_open(
*EXTRACT
,
undef
,
">$err_file"
,
@cmd
);
$pid
or
die
"$!\n"
;
my
(
$inbuf
,
$nread
);
while
(
$nread
=
read
(EXTRACT,
$inbuf
, 8192)) {
$resp
.=
$inbuf
}
defined
$nread
or
die
"error reading from pipe: $!"
;
$errno
= 0;
close
EXTRACT or
$errno
= $!;
if
(proc_status_ok($?,
$errno
)) {
dbg(
"extracttext: [%s] (%s) finished successfully"
,
$pid
,
$cmd
[0]);
}
elsif
(proc_status_ok($?,
$errno
, 0, 1)) {
dbg(
"extracttext: [%s] (%s) finished: %s"
,
$pid
,
$cmd
[0], exit_status_str($?,
$errno
));
}
else
{
info(
"extracttext: [%s] (%s) error: %s"
,
$pid
,
$cmd
[0], exit_status_str($?,
$errno
));
}
$pipe_errno
= $?;
});
if
(
defined
(
fileno
(
*EXTRACT
))) {
if
(
$pid
) {
if
(
kill
(
'TERM'
,
$pid
)) {
dbg(
"extracttext: killed stale helper [$pid] ($cmd[0])"
);
}
else
{
dbg(
"extracttext: killing helper application [$pid] ($cmd[0]) failed: $!"
);
}
}
$errno
= 0;
close
EXTRACT or
$errno
= $!;
proc_status_ok($?,
$errno
)
or info(
"extracttext: [%s] (%s) error: %s"
,
$pid
,
$cmd
[0], exit_status_str($?,
$errno
));
}
Mail::SpamAssassin::PerMsgStatus::leave_helper_run_mode(
$self
);
unlink
(
$tmp_file
);
my
$err_resp
= -s
$err_file
?
do
{
open
(ERRF,
$err_file
);
$_
= <ERRF>;
close
(ERRF);
chomp
;
$_
; } :
''
;
unlink
(
$err_file
);
if
(
$err_resp
ne
''
) {
dbg(
"extracttext: [$pid] ($cmd[0]) stderr output: $err_resp"
);
}
if
(
$pipe_errno
) {
if
(
$err_resp
=~ /\b(?:Usage:|No such file or directory)/) {
warn
"extracttext: error from $cmd[0], please verify configuration: $err_resp\n"
;
}
elsif
(
$err_resp
=~ /^Syntax (?:Warning|Error): (?:May not be a PDF file|Couldn't find trailer dictionary)/) {
}
elsif
(
$err_resp
=~ /^Error in (?:findFileFormatStream|fopenReadStream): (?:truncated file|file not found)/) {
}
elsif
(
$err_resp
=~ /^libpng error:/) {
}
elsif
(
$err_resp
=~ /^Corrupt JPEG data:/) {
}
elsif
(
$err_resp
=~ /^\S+ is not a Word Document/) {
}
elsif
(!
$resp
) {
warn
"extracttext: error ("
.(
$pipe_errno
/256).
") from $cmd[0]: $err_resp\n"
;
}
return
(0,
$resp
);
}
return
(1,
$resp
);
}
sub
_extract_object {
my
(
$self
,
$object
,
$tool
) =
@_
;
my
(
$ok
,
$text
);
if
(
$tool
->{type} eq
'external'
) {
(
$ok
,
$text
) =
$self
->_extract_external(
$object
,
$tool
);
}
else
{
warn
"extracttext: bad tool type: $tool->{type}\n"
;
return
0;
}
return
0
unless
$ok
;
if
(
$text
=~ /^[\s\r\n]*$/s) {
$text
=
''
;
}
else
{
}
if
(
$text
eq
''
) {
dbg(
'extracttext: No text extracted'
);
}
$text
= untaint_var(
$text
);
utf8::encode(
$text
)
if
utf8::is_utf8(
$text
);
return
(1,
$text
);
}
sub
_get_extension {
my
(
$self
,
$object
) =
@_
;
my
$fext
;
if
(
$object
->{name} &&
$object
->{name} =~ /\.([^.\\\/]+)$/) {
$fext
= $1;
}
elsif
(
$object
->{file} &&
$object
->{file} =~ /\.([^.\\\/]+)$/) {
$fext
= $1;
}
return
$fext
? (
$fext
) : ();
}
sub
_extract {
my
(
$self
,
$coll
,
$part
,
$type
,
$name
,
$data
,
$tool
) =
@_
;
my
$object
= {
'data'
=>
$data
,
'type'
=>
$type
,
'name'
=>
$name
};
my
@fexts
;
my
@types
;
my
@tools
= (
$tool
->{name});
my
(
$ok
,
$text
) =
$self
->_extract_object(
$object
,
$tool
);
return
0
unless
$ok
;
if
(
$text
ne
''
&& would_log(
'dbg'
,
'extracttext'
) > 1) {
dbg(
"extracttext: text extracted:\n$text"
);
}
push
@{
$coll
->{text}},
$text
;
push
@types
,
$type
;
push
@fexts
,
$self
->_get_extension(
$object
);
if
(
$text
eq
''
) {
push
@{
$coll
->{flags}},
'NoText'
;
push
@{
$coll
->{text}},
'NoText'
;
}
else
{
if
(
$text
=~ /<a(?:\s+[^>]+)?\s+href=
"([^"
>]*)"/) {
push
@{
$coll
->{flags}},
'ActionURI'
;
dbg(
"extracttext: ActionURI: $1"
);
push
@{
$coll
->{text}},
$text
;
}
if
(
$text
=~ /NoText/) {
push
@{
$coll
->{flags}},
'NoText'
;
dbg(
"extracttext: NoText"
);
push
@{
$coll
->{text}},
$text
;
}
$coll
->{chars} +=
length
(
$text
);
$coll
->{words} +=
split
(/\W+/s,
$text
) - 1;
dbg(
"extracttext: rendering text for type $type with $tool->{name}"
);
$part
->set_rendered(
$text
);
}
if
(
@types
) {
push
@{
$coll
->{types}},
join
(
', '
,
@types
);
}
if
(
@fexts
) {
push
@{
$coll
->{extensions}},
join
(
'_'
,
@fexts
);
}
push
@{
$coll
->{tools}},
join
(
'_'
,
@tools
);
return
1;
}
sub
_check_extract {
my
(
$self
,
$coll
,
$checked
,
$part
,
$decoded
,
$data
,
$type
,
$name
) =
@_
;
return
0
unless
(
defined
$type
||
defined
$name
);
foreach
my
$match
(@{
$self
->{match}}) {
next
unless
$self
->{tools}->{
$match
->{tool}};
next
if
$checked
->{
$match
->{tool}};
if
(
$match
->{where} eq
'name'
) {
next
unless
(
defined
$name
&&
$name
=~
$match
->{what});
}
elsif
(
$match
->{where} eq
'type'
) {
next
unless
(
defined
$type
&&
$type
=~
$match
->{what});
}
else
{
next
;
}
$checked
->{
$match
->{tool}} = 1;
return
1
if
$self
->_extract(
$coll
,
$part
,
$type
,
$name
,
$data
,
$self
->{tools}->{
$match
->{tool}});
}
return
0;
}
sub
post_message_parse {
my
(
$self
,
$opts
) =
@_
;
my
$timer
=
$self
->{main}->time_method(
"extracttext"
);
my
$msg
=
$opts
->{
'message'
};
$self
->{
'master_deadline'
} =
$msg
->{
'master_deadline'
};
my
$starttime
=
time
;
my
%collect
= (
'tools'
=> [],
'types'
=> [],
'extensions'
=> [],
'flags'
=> [],
'chars'
=> 0,
'words'
=> 0,
'text'
=> [],
);
my
$conf
=
$self
->{main}->{conf};
my
$maxparts
=
$conf
->{extracttext_maxparts};
my
$ttimeout
=
$conf
->{extracttext_timeout_total} ||
$conf
->{extracttext_timeout} > 10 ?
$conf
->{extracttext_timeout} : 10;
my
$nparts
= 0;
foreach
my
$part
(
$msg
->find_parts(
qr/./
, 1)) {
next
unless
$part
->is_leaf;
if
(
$maxparts
> 0 && ++
$nparts
>
$maxparts
) {
dbg(
"extracttext: Skipping MIME parts exceeding the ${maxparts}th"
);
last
;
}
if
(
time
-
$starttime
>=
$ttimeout
) {
dbg(
"extracttext: Skipping MIME parts, total execution timeout exceeded"
);
last
;
}
my
(
undef
,
$rtd
) =
$part
->rendered;
next
if
defined
$rtd
;
my
%checked
= ();
my
$dat
=
$part
->decode();
my
$typ
=
$part
->{type};
my
$nam
=
$part
->{name};
my
$dec
= 1;
next
if
$self
->_check_extract(\
%collect
,\
%checked
,
$part
,\
$dec
,\
$dat
,
$typ
,
$nam
);
}
return
1
unless
@{
$collect
{tools}};
my
@uniq_tools
=
do
{
my
%seen
;
grep
{ !
$seen
{
$_
}++ } @{
$collect
{tools}} };
my
@uniq_types
=
do
{
my
%seen
;
grep
{ !
$seen
{
$_
}++ } @{
$collect
{types}} };
my
@uniq_ext
=
do
{
my
%seen
;
grep
{ !
$seen
{
$_
}++ } @{
$collect
{extensions}} };
my
@uniq_flags
=
do
{
my
%seen
;
grep
{ !
$seen
{
$_
}++ } @{
$collect
{flags}} };
$msg
->put_metadata(
'X-ExtractText-Words'
,
$collect
{words});
$msg
->put_metadata(
'X-ExtractText-Chars'
,
$collect
{chars});
$msg
->put_metadata(
'X-ExtractText-Tools'
,
join
(
' '
,
@uniq_tools
));
$msg
->put_metadata(
'X-ExtractText-Types'
,
join
(
' '
,
@uniq_types
));
$msg
->put_metadata(
'X-ExtractText-Extensions'
,
join
(
' '
,
@uniq_ext
));
$msg
->put_metadata(
'X-ExtractText-Flags'
,
join
(
' '
,
@uniq_flags
));
return
1;
}
sub
parsed_metadata {
my
(
$self
,
$opts
) =
@_
;
my
$pms
=
$opts
->{permsgstatus};
my
$msg
=
$pms
->get_message();
foreach
my
$tag
((
'Words'
,
'Chars'
,
'Tools'
,
'Types'
,
'Extensions'
,
'Flags'
)) {
my
$v
=
$msg
->get_metadata(
"X-ExtractText-$tag"
);
if
(
defined
$v
) {
$pms
->set_tag(
"ExtractText$tag"
,
$v
);
dbg(
"extracttext: tag: $tag $v"
);
}
}
return
1;
}
1;