use
Errno
qw(ENOENT EACCES EEXIST)
;
our
@ISA
=
qw(Mail::SpamAssassin::Plugin)
;
my
$fixup_re_test
;
sub
new {
my
$class
=
shift
;
my
$mailsaobject
=
shift
;
$class
=
ref
(
$class
) ||
$class
;
my
$self
=
$class
->SUPER::new(
$mailsaobject
);
bless
(
$self
,
$class
);
$self
->{show_progress} = !
$mailsaobject
->{base_quiet};
return
$self
;
}
sub
finish_parsing_end {
my
(
$self
,
$params
) =
@_
;
my
$conf
=
$params
->{conf};
$self
->extract_bases(
$conf
);
}
sub
extract_bases {
my
(
$self
,
$conf
) =
@_
;
my
$main
=
$conf
->{main};
if
(!
$main
->{base_extract}) {
return
; }
$self
->{show_progress} and
info(
"base extraction starting. this can take a while..."
);
$self
->extract_set(
$conf
,
$conf
->{body_tests},
'body'
);
}
sub
extract_set {
my
(
$self
,
$conf
,
$test_set
,
$ruletype
) =
@_
;
foreach
my
$pri
(
keys
%{
$test_set
}) {
my
$nicepri
=
$pri
;
$nicepri
=~ s/-/neg/g;
$self
->extract_set_pri(
$conf
,
$test_set
->{
$pri
},
$ruletype
.
'_'
.
$nicepri
);
}
if
(
$self
->{tmpf}) {
unlink
$self
->{tmpf};
delete
$self
->{tmpf};
}
}
sub
extract_set_pri {
my
(
$self
,
$conf
,
$rules
,
$ruletype
) =
@_
;
my
@good_bases
;
my
@failed
;
my
$yes
= 0;
my
$no
= 0;
my
$count
= 0;
my
$start
=
time
;
$self
->{main} =
$conf
->{main};
$self
->{show_progress} and info (
"extracting from rules of type $ruletype"
);
my
$tflags
=
$conf
->{tflags};
my
$min_chars
= 5;
my
$progress
;
$self
->{show_progress} and
$progress
= Mail::SpamAssassin::Util::Progress->new({
total
=> (
scalar
keys
%{
$rules
} || 1),
itemtype
=>
'rules'
,
});
my
$cached
= { };
my
$cachefile
;
if
(
$self
->{main}->{bases_cache_dir}) {
$cachefile
=
$self
->{main}->{bases_cache_dir}.
"/rules.$ruletype"
;
dbg(
"zoom: reading cache file $cachefile"
);
$cached
=
$self
->read_cachefile(
$cachefile
);
}
NEXT_RULE:
foreach
my
$name
(
keys
%{
$rules
}) {
$self
->{show_progress} and
$progress
and
$progress
->update(++
$count
);
my
$rule
= qr_to_string(
$conf
->{test_qrs}->{
$name
});
if
(!
defined
$rule
) {
die
"zoom: error: regexp for $rule not found\n"
;
}
my
$cachekey
=
$name
.
'#'
.
$rule
;
my
$cent
=
$cached
->{rule_bases}->{
$cachekey
};
if
(
defined
$cent
) {
if
(
defined
$cent
->{g}) {
dbg(
"zoom: YES (cached) $rule $name"
);
foreach
my
$ent
(@{
$cent
->{g}}) {
push
@good_bases
, {
base
=>
$ent
->{base},
orig
=>
$ent
->{orig},
name
=>
$ent
->{name}
};
}
$yes
++;
}
else
{
dbg(
"zoom: NO (cached) $rule $name"
);
push
@failed
, {
orig
=>
$rule
};
$no
++;
}
next
NEXT_RULE;
}
my
$is_a_replace_rule
=
$conf
->{replace_rules}->{
$name
} ||
$conf
->{capture_rules}->{
$name
} ||
$conf
->{capture_template_rules}->{
$name
};
my
(
$minlen
,
$lossy
,
@bases
);
if
(!
$is_a_replace_rule
) {
eval
{
my
(
$qr
,
$mods
) =
$self
->simplify_and_qr_regexp(
$rule
);
(
$lossy
,
@bases
) =
$self
->extract_hints(
$rule
,
$qr
,
$mods
);
1;
} or
do
{
my
$eval_stat
= $@ ne
''
? $@ :
"errno=$!"
;
chomp
$eval_stat
;
$eval_stat
=~ s/ at .*//s;
dbg(
"zoom: giving up on regexp: $eval_stat"
);
};
foreach
my
$str
(
@bases
) {
my
$len
=
length
fixup_re(
$str
);
if
(
$len
<
$min_chars
) {
$minlen
=
undef
;
@bases
= ();
last
; }
elsif
(!
defined
(
$minlen
) ||
$len
<
$minlen
) {
$minlen
=
$len
; }
}
}
if
(
$is_a_replace_rule
|| !
$minlen
|| !
@bases
) {
dbg(
"zoom: ignoring rule %s, %s"
,
$name
,
$is_a_replace_rule
?
'is a replace rule'
: !
@bases
?
'no bases'
:
'no minlen'
);
push
@failed
, {
orig
=>
$rule
};
$cached
->{rule_bases}->{
$cachekey
} = { };
$no
++;
}
else
{
my
%subsumed
;
foreach
my
$base1
(
@bases
) {
foreach
my
$base2
(
@bases
) {
if
(
$base1
ne
$base2
&&
$base1
=~ /\Q
$base2
\E/) {
$subsumed
{
$base1
} = 1;
}
}
}
my
@forcache
;
foreach
my
$base
(
@bases
) {
next
if
$subsumed
{
$base
};
push
@good_bases
, {
base
=>
$base
,
orig
=>
$rule
,
name
=>
"$name,[l=$lossy]"
};
push
@forcache
, {
base
=>
$base
,
orig
=>
$rule
,
name
=>
"$name,[l=$lossy]"
};
}
$cached
->{rule_bases}->{
$cachekey
} = {
g
=> \
@forcache
};
$yes
++;
}
}
$self
->{show_progress} and
$progress
and
$progress
->final();
dbg(
"zoom: $ruletype: found "
.(
scalar
@good_bases
).
" usable base strings in $yes rules, skipped $no rules"
);
$conf
->{base_orig}->{
$ruletype
} = { };
$conf
->{base_string}->{
$ruletype
} = { };
$count
= 0;
$self
->{show_progress} and
$progress
= Mail::SpamAssassin::Util::Progress->new({
total
=> (
scalar
@good_bases
|| 1),
itemtype
=>
'bases'
,
});
my
@rewritten
;
foreach
my
$set1
(
@good_bases
) {
my
$base
=
$set1
->{base};
next
if
(!
$base
|| !
$set1
->{name});
push
@rewritten
, [
$base
,
$set1
->{name},
$set1
->{orig},
length
$base
,
$base
,
0
];
}
@good_bases
=
sort
{
$b
->[SLOT_LEN_BASE] <=>
$a
->[SLOT_LEN_BASE] ||
$a
->[SLOT_BASE] cmp
$b
->[SLOT_BASE] ||
$a
->[SLOT_NAME] cmp
$b
->[SLOT_NAME] ||
$a
->[SLOT_ORIG] cmp
$b
->[SLOT_ORIG]
}
@rewritten
;
my
$base_orig
=
$conf
->{base_orig}->{
$ruletype
};
my
$next_base_position
= 0;
for
my
$set1
(
@good_bases
) {
$next_base_position
++;
$self
->{show_progress} and
$progress
and
$progress
->update(++
$count
);
my
$base1
=
$set1
->[SLOT_BASE] or
next
;
my
$name1
=
$set1
->[SLOT_NAME];
my
$orig1
=
$set1
->[SLOT_ORIG];
my
$len1
=
$set1
->[SLOT_LEN_BASE];
$base_orig
->{
$name1
} =
$orig1
;
foreach
my
$set2
(
@good_bases
[
$next_base_position
..
$#good_bases
]) { # order from smallest to largest
if
(!
$set2
->[SLOT_BASE] ||
(
$base1
eq
$set2
->[SLOT_BASE] &&
$name1
eq
$set2
->[SLOT_NAME] &&
$orig1
eq
$set2
->[SLOT_ORIG]
)
)
{
$set2
->[SLOT_BASE] = CLOBBER;
next
;
}
next
if
index
(
$base1
,
$set2
->[SLOT_BASE_INITIAL]) == -1;
next
if
(
$set2
->[SLOT_HAS_MULTIPLE] &&
index
(
$set2
->[SLOT_NAME],
$name1
) > -1 &&
$set2
->[SLOT_NAME] =~ /(?: |^)\Q
$name1
\E(?: |$)/);
next
if
(
$set1
->[SLOT_HAS_MULTIPLE] &&
index
(
$set1
->[SLOT_NAME],
$set2
->[SLOT_NAME]) > -1 &&
$set1
->[SLOT_NAME] =~ /(?: |^)\Q
$set2
->[SLOT_NAME]\E(?: |$)/);
$set1
->[SLOT_NAME] .=
" "
.
$set2
->[SLOT_NAME];
$set1
->[SLOT_HAS_MULTIPLE] = 1;
}
}
my
%bases
;
foreach
my
$set
(
@good_bases
) {
my
$base
=
$set
->[0];
next
unless
$base
;
if
(
defined
$bases
{
$base
}) {
$bases
{
$base
} .=
" "
.
$set
->[1];
}
else
{
$bases
{
$base
} =
$set
->[1];
}
}
undef
@good_bases
;
my
$base_string
=
$conf
->{base_string}->{
$ruletype
};
foreach
my
$base
(
keys
%bases
) {
my
%u
;
for
my
$i
(
split
' '
,
$bases
{
$base
}) {
next
if
exists
$u
{
$i
};
undef
$u
{
$i
};
}
$base_string
->{
$base
} =
join
' '
,
sort
keys
%u
;
}
$self
->{show_progress} and
$progress
and
$progress
->final();
if
(
$cachefile
) {
$self
->write_cachefile (
$cachefile
,
$cached
);
}
my
$elapsed
=
time
-
$start
;
$self
->{show_progress} and info (
"$ruletype: "
.
(
scalar
keys
%{
$conf
->{base_string}->{
$ruletype
}}).
" base strings extracted in $elapsed seconds\n"
);
}
sub
simplify_and_qr_regexp {
my
$self
=
shift
;
my
$rule
=
shift
;
my
$main
=
$self
->{main};
my
$mods
=
''
;
while
(
$rule
=~ s/^\(\?([a-z]*)\)//) {
$mods
.= $1;
}
while
(
$rule
=~ s/^\(\?-([a-z]*)\)//) {
foreach
my
$modchar
(
split
''
,
$mods
) {
$mods
=~ s/
$modchar
//g;
}
}
my
$lossy
= 0;
if
(
$main
->{bases_must_be_casei}) {
$rule
=
lc
$rule
;
$lossy
= 1;
$mods
=~ s/i// and
$lossy
= 0;
$rule
=~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and
$lossy
++;
$rule
=~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs and
$lossy
++;
$rule
=~ s/\(\?i\)//gs;
}
else
{
die
"case-i"
if
$rule
=~ /\(\?i\)/;
die
"case-i"
if
index
(
$mods
,
'i'
) >= 0;
$rule
=~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and
die
"case-i"
;
$rule
=~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs;
}
$mods
=~ s/m// and
$lossy
++;
$mods
=~ s/s// and
$lossy
++;
$rule
=~ s/\(\^\|\\b\)//gs and
$lossy
++;
$rule
=~ s/\(\$\|\\b\)//gs and
$lossy
++;
$rule
=~ s/\(\\b\|\^\)//gs and
$lossy
++;
$rule
=~ s/\(\\b\|\$\)//gs and
$lossy
++;
$rule
=~ s/\(\?\![^\)]+\)//gs and
$lossy
++;
$rule
=~ s/(?<!\\)\\b//gs and
$lossy
++;
$rule
=~ s/\(\?\=\[[^\]]+\]\)//gs;
$mods
.=
"L"
if
$lossy
;
(
$rule
,
$mods
);
}
sub
extract_hints {
my
(
$self
,
$rawrule
,
$rule
,
$mods
) =
@_
;
my
$main
=
$self
->{main};
my
$orig
=
$rule
;
my
$lossy
= 0;
$mods
=~ s/L// and
$lossy
++;
die
"anchors"
if
$rule
=~ /^\(?(?:\^|\\A)/;
$rule
=~ s/(?<!\\)(?:\$|\\Z)\)?$// and
$lossy
++;
$main
->{bases_allow_noncapture_groups} or
$rule
=~ s/\(\?:/\(/g;
$rule
=~ s/\((.*?)\)\?/\($1\|\)/gs;
$rule
=~ s/(?<!\\)(\w)\?/\($1\|\)/gs;
if
(!
$self
->{tmpf}) {
(
$self
->{tmpf},
my
$tmpfh
) = Mail::SpamAssassin::Util::secure_tmpfile();
$tmpfh
or
die
"failed to create a temporary file"
;
close
$tmpfh
;
$self
->{tmpf} = untaint_var(
$self
->{tmpf});
}
open
(
my
$tmpfh
,
'>'
.
$self
->{tmpf})
or
die
"error opening $self->{tmpf}: $!"
;
binmode
$tmpfh
;
print
$tmpfh
"use bytes; m{"
.
$rule
.
"}"
.
$mods
or
die
"error writing to $self->{tmpf}: $!"
;
close
$tmpfh
or
die
"error closing $self->{tmpf}: $!"
;
$self
->{perl} =
$self
->get_perl()
if
!
exists
$self
->{perl};
local
*IN
;
open
(IN,
"$self->{perl} -c -Mre=debug $self->{tmpf} 2>&1 |"
)
or
die
"cannot run $self->{perl}: "
.exit_status_str($?,$!);
my
(
$inbuf
,
$nread
,
$fullstr
);
$fullstr
=
''
;
while
(
$nread
=
read
(IN,
$inbuf
,16384) ) {
$fullstr
.=
$inbuf
}
defined
$nread
or
die
"error reading from pipe: $!"
;
close
IN or
die
"error closing pipe: $!"
;
defined
$fullstr
or
warn
"empty result from a pipe"
;
$fullstr
=~ s/^.*\nFinal program:\n//gs;
$fullstr
=~ s/^(?:.*\n|)size \d[^\n]*\n//gs;
$fullstr
=~ s/^(?:.*\n|)first at \d[^\n]*\n//gs;
$fullstr
=~ s/\nOffsets:.*$//gs;
$fullstr
=~ s/^\S.*$//gm;
if
(
$fullstr
!~ /((?:\s[^\n]+\n)+)/m) {
die
"failed to parse Mre=debug output: $fullstr m{"
.
$rule
.
"}"
.
$mods
.
" $rawrule"
;
}
my
$opsstr
= $1;
DEBUG_RE_PARSING and
warn
"Mre=debug output: $opsstr"
;
my
@ops
;
foreach
my
$op
(
split
(/\n/s,
$opsstr
)) {
next
unless
$op
;
if
(
$op
=~ /^\s+\d+: (\s*)([A-Z]\w+)\b(.*?)\s*(?:\(\d+\))?$/) {
push
@ops
, [ $1, $2, $3 ];
}
elsif
(
$op
=~ /^ (\s*)<(.*)>\.\.\.\s*$/) {
my
$spcs
= $1;
my
$str
=
substr
($2, 0, 55);
push
@ops
, [
$spcs
,
'_moretrie'
,
"<$str...>"
];
}
elsif
(
$op
=~ /^ (\s*)(<.*>)\s*(?:\(\d+\))?$/) {
push
@ops
, [ $1,
'_moretrie'
, $2 ];
}
elsif
(
$op
=~ /^ at .+ line \d+$/) {
next
;
}
else
{
warn
"cannot parse '$op': $opsstr"
;
next
;
}
}
my
@unrolled
;
if
(
$main
->{bases_split_out_alternations}) {
@unrolled
=
$self
->unroll_branches(0, \
@ops
);
}
else
{
@unrolled
= ( \
@ops
);
}
my
@longests
;
foreach
my
$opsarray
(
@unrolled
) {
my
$longestexact
=
''
;
my
$buf
=
''
;
my
$add_candidate
=
sub
{
if
(
length
$buf
>
length
$longestexact
) {
$longestexact
=
$buf
; }
$buf
=
''
;
};
my
$prevop
;
foreach
my
$op
(@{
$opsarray
}) {
my
(
$spcs
,
$item
,
$args
) = @{
$op
};
next
if
(
$item
eq
'NOTHING'
);
if
(!
$spcs
&&
$item
=~ /^EXACT/ &&
$args
=~ /<(.*)>/)
{
my
$str
= $1;
$buf
.=
$str
;
if
(
$buf
=~ s/\\x\{[0-9a-fA-F]{4,}\}.*$//) {
$add_candidate
->();
}
if
(
length
$str
>= 55 &&
$buf
=~ s/\.\.\.$//) {
$add_candidate
->();
}
}
elsif
(!
$spcs
&&
$item
=~ /^_moretrie/ &&
$args
=~ /<(.*)>/)
{
$buf
.= $1;
if
(
length
$1 >= 55 &&
$buf
=~ s/\.\.\.$//) {
$add_candidate
->();
}
}
elsif
(
$item
=~ /^EXACT/ &&
$prevop
&& !
$prevop
->[0] &&
$prevop
->[1] =~ /^CURLYX/ &&
$prevop
->[2] =~ /\{(\d+),/ && $1 >= 1 &&
$args
=~ /<(.*)>/)
{
$buf
.= $1;
if
(
length
$1 >= 55 &&
$buf
=~ s/\.\.\.$//) {
$add_candidate
->();
}
}
elsif
(
$item
=~ /^_moretrie/ &&
$prevop
&& !
$prevop
->[0] &&
$prevop
->[1] =~ /^CURLYX/ &&
$prevop
->[2] =~ /\{(\d+),/ && $1 >= 1 &&
$args
=~ /<(.*)>/)
{
$buf
.= $1;
if
(
length
$1 >= 60 &&
$buf
=~ s/\.\.\.$//) {
$add_candidate
->();
}
}
else
{
$add_candidate
->();
if
(
$item
!~ /^(?:END|CLOSE\d|MINMOD)$/)
{
$lossy
= 1;
DEBUG_RE_PARSING and
warn
"item $item makes regexp lossy"
;
}
}
$prevop
=
$op
;
}
$add_candidate
->();
if
(!
$longestexact
) {
die
"no long-enough string found in $rawrule\n"
;
}
else
{
push
@longests
, (
$main
->{bases_must_be_casei}) ?
lc
$longestexact
:
$longestexact
;
}
}
DEBUG_RE_PARSING and
warn
"longest base strings: /"
.
join
(
"/"
,
@longests
).
"/"
;
return
(
$lossy
,
@longests
);
}
sub
unroll_branches {
my
(
$self
,
$depth
,
$opslist
) =
@_
;
die
"too deep"
if
(
$depth
++ > 5);
my
@ops
= (@{
$opslist
});
my
@pre_branch_ops
;
my
$branch_spcs
;
my
$trie_spcs
;
my
$open_spcs
;
DEBUG_RE_PARSING and
warn
"starting parse"
;
if
(
scalar
@ops
> 1 &&
$ops
[0]->[1] =~ /^BRANCH/) {
my
@newops
= ([
""
,
"OPEN1"
,
""
]);
foreach
my
$op
(
@ops
) {
push
@newops
, [
" "
.
$op
->[0],
$op
->[1],
$op
->[2] ];
}
push
@newops
, [
""
,
"CLOSE1"
,
""
];
@ops
=
@newops
;
}
while
(1) {
my
$op
=
shift
@ops
;
last
unless
defined
$op
;
my
(
$spcs
,
$item
,
$args
) = @{
$op
};
DEBUG_RE_PARSING and
warn
"pre: [$spcs] $item $args"
;
if
(
$item
=~ /^OPEN/) {
$open_spcs
=
$spcs
;
next
;
}
elsif
(
$item
=~ /^TRIE/) {
$trie_spcs
=
$spcs
;
last
;
}
elsif
(
$item
=~ /^BRANCH/) {
$branch_spcs
=
$spcs
;
last
;
}
elsif
(
$item
=~ /^EXACT/ &&
defined
$open_spcs
) {
push
@pre_branch_ops
, [
$open_spcs
,
$item
,
$args
];
next
;
}
elsif
(
defined
$open_spcs
) {
undef
$open_spcs
;
}
else
{
push
@pre_branch_ops
,
$op
;
}
}
if
(
scalar
@ops
== 0) {
return
[
@pre_branch_ops
];
}
my
@alts
;
my
@in_this_branch
;
DEBUG_RE_PARSING and
warn
"entering branch: "
.
"open='"
.(
defined
$open_spcs
?
$open_spcs
: '
undef
')."'
".
"branch='"
.(
defined
$branch_spcs
?
$branch_spcs
: '
undef
')."'
".
"trie='"
.(
defined
$trie_spcs
?
$trie_spcs
: '
undef
')."'
";
my
$open_sub_spcs
= (
$branch_spcs
?
$branch_spcs
:
""
).
" "
;
my
$trie_sub_spcs
=
""
;
while
(1) {
my
$op
=
shift
@ops
;
last
unless
defined
$op
;
my
(
$spcs
,
$item
,
$args
) = @{
$op
};
DEBUG_RE_PARSING and
warn
"in: [$spcs] $item $args"
;
if
(
defined
$branch_spcs
&&
$branch_spcs
eq
$spcs
&&
$item
=~ /^BRANCH/) {
push
@alts
, [
@pre_branch_ops
,
@in_this_branch
];
@in_this_branch
= ();
$open_sub_spcs
=
$branch_spcs
.
" "
;
$trie_sub_spcs
=
""
;
next
;
}
elsif
(
defined
$branch_spcs
&&
$branch_spcs
eq
$spcs
&&
$item
eq
'TAIL'
) {
push
@alts
, [
@pre_branch_ops
,
@in_this_branch
];
undef
$branch_spcs
;
$open_sub_spcs
=
""
;
$trie_sub_spcs
=
""
;
last
;
}
elsif
(
defined
$trie_spcs
&&
$trie_spcs
eq
$spcs
&&
$item
eq
'_moretrie'
) {
if
(
scalar
@in_this_branch
> 0) {
push
@alts
, [
@pre_branch_ops
,
@in_this_branch
];
}
@in_this_branch
= ( [
$open_spcs
,
$item
,
$args
] );
$open_sub_spcs
= (
$branch_spcs
?
$branch_spcs
:
""
).
" "
;
$trie_sub_spcs
=
" "
;
next
;
}
elsif
(
defined
$open_spcs
&&
$open_spcs
eq
$spcs
&&
$item
=~ /^CLOSE/) {
push
@alts
, [
@pre_branch_ops
,
@in_this_branch
];
undef
$branch_spcs
;
undef
$open_spcs
;
undef
$trie_spcs
;
$open_sub_spcs
=
""
;
$trie_sub_spcs
=
""
;
last
;
}
elsif
(
$item
eq
'END'
) {
push
@alts
, [
@pre_branch_ops
,
@in_this_branch
];
undef
$branch_spcs
;
undef
$open_spcs
;
undef
$trie_spcs
;
$open_sub_spcs
=
""
;
$trie_sub_spcs
=
""
;
last
;
}
else
{
if
(
$open_sub_spcs
) {
$spcs
=~ s/^
$open_sub_spcs
//;
$spcs
=~ s/^
$trie_sub_spcs
//;
}
push
@in_this_branch
, [
$spcs
,
$item
,
$args
];
}
}
if
(
defined
$branch_spcs
) {
die
"fell off end of string with a branch open: '$branch_spcs'"
;
}
foreach
my
$alt
(
@alts
) {
push
@{
$alt
},
@ops
;
}
if
(DEBUG_RE_PARSING) {
print
"unrolled: "
;
foreach
my
$alt
(
@alts
) {
foreach
my
$o
(@{
$alt
}) {
print
"{/$o->[0]/$o->[1]/$o->[2]} "
; }
print
"\n"
; }
}
my
@rets
;
foreach
my
$alt
(
@alts
) {
push
@rets
,
$self
->unroll_branches(
$depth
,
$alt
);
}
if
(DEBUG_RE_PARSING) {
print
"unrolled post-recurse: "
;
foreach
my
$alt
(
@rets
) {
foreach
my
$o
(@{
$alt
}) {
print
"{/$o->[0]/$o->[1]/$o->[2]} "
; }
print
"\n"
; }
}
return
@rets
;
}
sub
test {
my
(
$self
) =
@_
;
$self
->test_split_alt(
"foo"
,
"/foo/"
);
$self
->test_split_alt(
"(foo)"
,
"/foo/"
);
$self
->test_split_alt(
"foo(bar)baz"
,
"/foobarbaz/"
);
$self
->test_split_alt(
"x(foo|)"
,
"/xfoo/ /x/"
);
$self
->test_split_alt(
"fo(o|)"
,
"/foo/ /fo/"
);
$self
->test_split_alt(
"(foo|bar)"
,
"/foo/ /bar/"
);
$self
->test_split_alt(
"foo|bar"
,
"/foo/ /bar/"
);
$self
->test_split_alt(
"foo (bar|baz) argh"
,
"/foo bar argh/ /foo baz argh/"
);
$self
->test_split_alt(
"foo (bar|baz|bl(arg|at)) cough"
,
"/foo bar cough/ /foo baz cough/ /foo blarg cough/ /foo blat cough/"
);
$self
->test_split_alt(
"(s(otc|tco)k)"
,
"/sotck/ /stcok/"
);
$self
->test_split_alt(
"(business partner(s|ship|)|silent partner(s|ship|))"
,
"/business partners/ /silent partners/ /business partnership/ /silent partnership/ /business partner/ /silent partner/"
);
}
sub
test_split_alt {
my
(
$self
,
$in
,
$out
) =
@_
;
my
@got
=
$self
->split_alt(
$in
);
$out
=~ s/^\///;
$out
=~ s/\/$//;
my
@want
=
split
(/\/ \//,
$out
);
my
$failed
= 0;
if
(
scalar
@want
!=
scalar
@got
) {
warn
"FAIL: results count don't match"
;
$failed
++;
}
else
{
my
%got
=
map
{
$_
=> 1 }
@got
;
foreach
my
$w
(
@want
) {
if
(!
$got
{
$w
}) {
warn
"FAIL: '$w' not found"
;
$failed
++;
}
}
}
if
(
$failed
) {
print
"want: /"
.
join
(
'/ /'
,
@want
).
"/\n"
or
die
"error writing: $!"
;
print
"got: /"
.
join
(
'/ /'
,
@got
).
"/\n"
or
die
"error writing: $!"
;
return
0;
}
else
{
print
"ok\n"
or
die
"error writing: $!"
;
return
1;
}
}
sub
get_perl {
my
(
$self
) =
@_
;
my
$perl
;
my
$fromconf
=
$self
->{main}->{conf}->{re_parser_perl};
if
(
$fromconf
) {
$perl
=
$fromconf
;
}
elsif
($^X =~ m|^/|) {
$perl
= $^X;
}
else
{
$perl
=
$Config
{perlpath};
$perl
=~ s|/[^/]*$|/$^X|;
}
untaint_var(\
$perl
);
return
$perl
;
}
sub
read_cachefile {
my
(
$self
,
$cachefile
) =
@_
;
local
*IN
;
if
(
open
(IN,
"<"
.
$cachefile
)) {
my
(
$inbuf
,
$nread
,
$str
);
$str
=
''
;
while
(
$nread
=
read
(IN,
$inbuf
,16384) ) {
$str
.=
$inbuf
}
defined
$nread
or
die
"error reading from $cachefile: $!"
;
close
IN or
die
"error closing $cachefile: $!"
;
untaint_var(\
$str
);
my
$VAR1
;
if
(
eval
$str
) {
return
$VAR1
;
}
}
return
{ };
}
sub
write_cachefile {
my
(
$self
,
$cachefile
,
$cached
) =
@_
;
my
$dump
= Data::Dumper->new ([
$cached
]);
$dump
->Deepcopy(1);
$dump
->Purity(1);
$dump
->Indent(1);
my
$cachedir
=
$self
->{main}->{bases_cache_dir};
if
(
mkdir
(
$cachedir
)) {
}
elsif
($! == EEXIST) {
dbg(
"zoom: ok, cache directory already existed"
);
}
else
{
warn
"zoom: could not create cache directory: $cachedir ($!)\n"
;
return
;
}
open
(CACHE,
">$cachefile"
) or
warn
"cannot write to $cachefile"
;
print
CACHE (
$dump
->Dump,
";1;"
) or
die
"error writing: $!"
;
close
CACHE or
die
"error closing $cachefile: $!"
;
}
sub
fixup_re {
my
$re
=
shift
;
if
(
$fixup_re_test
) {
print
"INPUT: /$re/\n"
or
die
"error writing: $!"
}
my
$output
=
""
;
my
$TOK
=
qr([\"\\])
;
my
$STATE
;
local
($1,$2);
while
(
$re
=~ /\G(.*?)(
$TOK
)/gcs) {
my
$pre
= $1;
my
$tok
= $2;
if
(
length
(
$pre
)) {
$output
.=
"\"$pre\""
;
}
if
(
$tok
eq
'"'
) {
$output
.=
'"\\""'
;
}
elsif
(
$tok
eq
'\\'
) {
$re
=~ /\G(x\{[^\}]+\}|[0-7]{1,3}|.)/gcs or
die
"\\ at end of string!"
;
my
$esc
= $1;
if
(
$esc
eq
'"'
) {
$output
.=
'"\\""'
;
}
elsif
(
$esc
eq
'\\'
) {
$output
.=
'"**BACKSLASH**"'
;
}
elsif
(
$esc
=~ /^x\{(\S+)\}\z/) {
$output
.=
'"'
.
chr
(
hex
($1)).
'"'
;
}
elsif
(
$esc
=~ /^[0-7]{1,3}\z/) {
$output
.=
'"'
.
chr
(
oct
(
$esc
)).
'"'
;
}
else
{
$output
.=
"\"$esc\""
;
}
}
elsif
(
$fixup_re_test
) {
print
"PRE: $pre\nTOK: $tok\n"
or
die
"error writing: $!"
;
}
}
if
(!
defined
(
pos
(
$re
))) {
$output
.=
"\"$re\""
;
$output
=~ s{([\000-\037\177\200\377])}{
sprintf
(
"\\%03o"
,
ord
($1))}gse;
}
elsif
(
pos
(
$re
) <=
length
(
$re
)) {
$output
=~ s{([\000-\037\177\200\377])}{
sprintf
(
"\\%03o"
,
ord
($1))}gse;
$output
.= fixup_re(
substr
(
$re
,
pos
(
$re
)));
}
$output
=~ s/^
""
/"/;
$output
=~ s/(?<!\\)
""
\z/"/;
$output
=~ s/(?<!\\)
""
//g;
$output
=~ s/\*\
*BACKSLASH
\*\*/\\\\/gs;
if
(
$fixup_re_test
) {
print
"OUTPUT: $output\n"
or
die
"error writing: $!"
}
utf8::encode(
$output
)
if
utf8::is_utf8(
$output
);
return
$output
;
}
1;