our
$VERSION
= 0.001;
new_with_init
=>
'new'
,
get_set
=>
[
qw /
Verbose
/
];
sub
init {
my
(
$self
,
%args
) =
@_
;
$self
->Verbose(1);
$self
->ExtractMajorPatterns
(
String
=>
$args
{Contents});
}
sub
ExtractMajorPatterns {
my
(
$self
,
%args
) =
@_
;
print
"Extracting major patterns\n"
if
$self
->Verbose;
my
$occurances
=
$self
->GetAllSubstringsLengthLessThanSize
(
Size
=> 10,
String
=>
$args
{String});
my
$res
=
$self
->ExtractMostUsefulTerms
(
Occurances
=>
$occurances
);
my
$order
= {};
foreach
my
$k1
(
keys
%$occurances
) {
foreach
my
$k2
(
keys
%{
$occurances
->{
$k1
}}) {
$order
->{
$k2
}->{
$k1
} = 1;
}
}
my
$token
;
my
$counter
= 10;
do
{
$token
=
$res
->{List}->[
int
rand
$counter
];
++
$counter
;
}
while
(
length
(
$token
) < 5 or
$token
=~ /^\s*$/);
print
"Chose token: "
.Dumper(
$token
).
"\n"
if
$self
->Verbose;
my
$l
= [
sort
{
$a
<=>
$b
}
keys
%{
$occurances
->{
$token
}}];
my
$listsize
=
scalar
@$l
;
my
$i
=
int
rand
$listsize
;
print
"Selected instance $i of $listsize\n"
if
$self
->Verbose;
my
$cycles
= [];
for
(
my
$cycle
= 0;
$cycle
< 10; ++
$cycle
) {
$cycles
->[
$cycle
] = [];
for
(
my
$j
=
$l
->[
$i
+
$cycle
];
$j
<
$l
->[
$i
+1+
$cycle
]; ++
$j
) {
if
(
exists
$order
->{
$j
}) {
my
$neworder
;
foreach
my
$k
(
keys
%{
$order
->{
$j
}}) {
if
(
length
(
$k
) > 1) {
my
$tmp
=
$res
->{Ilist}->{
$k
};
if
(
defined
$tmp
) {
$neworder
->{
$k
} =
$j
;
}
}
}
push
@{
$cycles
->[
$cycle
]},
$neworder
if
$neworder
;
}
}
}
my
$cyclecount
= 0;
my
$seen
= {};
foreach
my
$cycle
(
@$cycles
) {
foreach
my
$hash
(
@$cycle
) {
foreach
my
$k
(
keys
%$hash
) {
$seen
->{
$k
}->{
$cyclecount
} = 1;
}
}
++
$cyclecount
;
}
my
$keep
= {};
foreach
my
$k
(
keys
%$seen
) {
if
(
scalar
keys
%{
$seen
->{
$k
}} ==
$cyclecount
) {
$keep
->{
$k
} = 1;
}
}
my
$newcycles
= [];
foreach
my
$cycle
(
@$cycles
) {
my
$newcycle
= [];
foreach
my
$hash
(
@$cycle
) {
my
$newhash
= {};
foreach
my
$key
(
keys
%$hash
) {
if
(
exists
$keep
->{
$key
}) {
$newhash
->{
$key
} =
$hash
->{
$key
};
}
}
if
(
scalar
keys
%$newhash
) {
push
@$newcycle
,
$newhash
;
}
}
push
@$newcycles
,
$newcycle
;
}
my
$regex
=
$self
->ExtractRegexFromCycles
(
Index
=>
$i
,
Cycles
=>
$newcycles
,
TokenList
=>
$l
,
);
$regex
=~ s/(\(\.\*\))+/(.*)/g;
$regex
=~ s/^(\(\.\*\))//;
$regex
=~ s/(\(\.\*\))$//;
my
@seeks
=
$regex
=~ /(\(\.\*\))/g;
my
$size
=
scalar
@seeks
;
print
Dumper(
$regex
).
"\n"
if
$self
->Verbose;
my
@elements
=
$args
{String} =~ /
$regex
/g;
my
@entries
;
while
(
@elements
) {
push
@entries
, [
splice
(
@elements
,0,
$size
)];
}
print
"Extracted "
.(
scalar
@entries
).
" records\n"
if
$self
->Verbose;
print
Dumper(\
@entries
);
}
sub
min {
my
(
$a
,
$b
) =
@_
;
if
(
$a
<
$b
) {
return
$a
;
}
return
$b
;
}
sub
GetAllSubstringsLengthLessThanSize {
my
(
$self
,
%args
) =
@_
;
my
$occurances
= {};
my
@l
=
split
//,
$args
{String};
print
"Length: "
.(
scalar
@l
).
"\n"
if
$self
->Verbose;
for
(
my
$i
= 0;
$i
<
scalar
@l
; ++
$i
) {
if
(!(
$i
% 1000)) {
print
"."
if
$self
->Verbose;
}
if
(!(
$i
% 50000)) {
print
"\n"
if
$self
->Verbose;
}
for
(
my
$j
=
$i
+ 1;
$j
<= min(
$i
+
$args
{Size},
scalar
@l
); ++
$j
) {
$occurances
->{
substr
(
$args
{String},
$i
,
$j
-
$i
)}->{
$i
} = 1;
}
}
print
"\n"
if
$self
->Verbose;
return
$occurances
;
}
sub
ExtractMostUsefulTerms {
my
(
$self
,
%args
) =
@_
;
print
"Extracting most useful terms\n"
if
$self
->Verbose;
my
$occurances
=
$args
{Occurances};
my
$res
;
my
$usefulness
= {};
my
@list
;
foreach
my
$k1
(
keys
%$occurances
) {
if
(
scalar
keys
%{
$occurances
->{
$k1
}} > 1) {
$res
->{
$k1
} =
$occurances
->{
$k1
};
}
}
my
$count
= 0;
my
$ilist
= {};
foreach
my
$k1
(
sort
{
(
scalar
keys
%{
$res
->{
$b
}}) *
length
(
$b
) <=>
(
scalar
keys
%{
$res
->{
$a
}}) *
length
(
$a
)}
keys
%$res
) {
my
$x
= (
scalar
keys
%{
$res
->{
$k1
}})
*length
(
$k1
);
push
@list
,
$k1
;
$ilist
->{
$k1
} =
$count
++;
$usefulness
->{
$k1
} =
$x
;
}
return
{
List
=> \
@list
,
Ilist
=>
$ilist
,
Usefulness
=>
$usefulness
};
}
sub
GenerateRegexFromCycle {
my
(
$self
,
%args
) =
@_
;
my
@string
;
my
$last
;
foreach
my
$hash
(@{
$args
{Cycle}}) {
foreach
my
$string
(
keys
%$hash
) {
my
$x
= 0;
foreach
my
$char
(
split
//,
$string
) {
$string
[
$hash
->{
$string
} -
$args
{TokenList}->[
$args
{Index}] +
$x
++] =
$char
;
$last
=
$hash
->{
$string
};
}
}
}
splice
(
@string
,
$last
-
$args
{TokenList}->[
$args
{Index}] + 1);
my
@constant
;
my
$r
= [];
my
$undefperiod
= 0;
foreach
my
$element
(
@string
) {
if
(!
defined
$element
) {
if
(
$undefperiod
) {
}
else
{
$undefperiod
= 1;
if
(
@constant
) {
my
@cp
=
@constant
;
push
@$r
,\
@cp
;
@constant
= ();
}
push
@$r
,
undef
;
}
}
else
{
$undefperiod
= 0;
push
@constant
,
$element
;
}
}
if
(
@constant
) {
my
@cp
=
@constant
;
push
@$r
, \
@cp
;
}
my
$regex
;
foreach
my
$list
(
@$r
) {
if
(
defined
$list
) {
my
$constant
=
join
(
""
,
@$list
);
$constant
=~ s/
/\r/g;
$constant
=~ s/([^\w\s\n])/\\$1/g;
$regex
.=
$constant
;
}
else
{
$regex
.=
"(.*)"
;
}
}
return
$regex
;
}
sub
ExtractRegexFromCycles {
my
(
$self
,
%args
) =
@_
;
my
$size
=
scalar
@{
$args
{Cycles}};
my
@l
= 0..(
$size
- 1);
my
@n
;
for
(
my
$i
= 0;
$i
< 8; ++
$i
) {
my
$index
=
int
rand
scalar
@l
;
push
@n
,
$l
[
$index
];
$l
[
$index
] =
$l
[
scalar
@l
- 1];
pop
@l
;
}
my
@regexes
;
foreach
my
$index
(
@n
) {
my
$newhash
= {};
foreach
my
$hash
(@{
$args
{Cycles}->[
$index
]}) {
foreach
my
$key
(
keys
%$hash
) {
$newhash
->{
$key
} =
$hash
->{
$key
};
}
}
my
$length
= (
scalar
keys
%$newhash
);
my
$regex
=
$self
->GenerateRegexFromCycle
(
Cycle
=>
$args
{Cycles}->[
$index
],
TokenList
=>
$args
{TokenList},
Index
=>
$args
{Index},
);
push
@regexes
,
$regex
;
}
print
Dumper(\
@regexes
);
my
$regex
=
shift
@regexes
;
while
(
@regexes
) {
my
$r2
=
shift
@regexes
;
$regex
=
join
(
""
,LCS([
split
//,
$regex
], [
split
//,
$r2
]));
}
$regex
=~ s/\\{1}//g;
return
$regex
;
}
1;