our
@ISA
=
qw(HTML::Parser)
;
my
%elements
=
map
{;
$_
=> 1 }
qw( a abbr acronym address area b base bdo big blockquote body br button caption cite code col colgroup dd del dfn div dl dt em fieldset form h1 h2 h3 h4 h5 h6 head hr html i img input ins kbd label legend li link map meta noscript object ol optgroup option p param pre q samp script select small span strong style sub sup table tbody td textarea tfoot th thead title tr tt ul var )
,
qw( applet basefont center dir font frame frameset iframe isindex menu noframes s strike u )
,
qw( nobr x-sigsep x-tab )
,
;
my
%tricks
=
map
{;
$_
=> 1 }
qw( bgsound embed listing plaintext xmp )
,
;
my
%elements_text_style
=
map
{;
$_
=> 1 }
qw( body font table tr th td big small basefont marquee span p div a )
,
;
my
%elements_whitespace
=
map
{;
$_
=> 1 }
qw( br div li th td dt dd p hr blockquote pre embed listing plaintext xmp title
h1 h2 h3 h4 h5 h6 )
,
;
my
%elements_uri
=
map
{;
$_
=> 1 }
qw( body table tr td a area link img frame iframe embed script form base bgsound meta )
,
;
my
%ok_attributes
;
$ok_attributes
{basefont}{
$_
} = 1
for
qw( color face size )
;
$ok_attributes
{body}{
$_
} = 1
for
qw( text bgcolor link alink vlink background )
;
$ok_attributes
{font}{
$_
} = 1
for
qw( color face size )
;
$ok_attributes
{marquee}{
$_
} = 1
for
qw( bgcolor background )
;
$ok_attributes
{table}{
$_
} = 1
for
qw( bgcolor style )
;
$ok_attributes
{td}{
$_
} = 1
for
qw( bgcolor style )
;
$ok_attributes
{th}{
$_
} = 1
for
qw( bgcolor style )
;
$ok_attributes
{
tr
}{
$_
} = 1
for
qw( bgcolor style )
;
$ok_attributes
{span}{
$_
} = 1
for
qw( style )
;
$ok_attributes
{p}{
$_
} = 1
for
qw( style )
;
$ok_attributes
{div}{
$_
} = 1
for
qw( style )
;
$ok_attributes
{a}{
$_
} = 1
for
qw( style )
;
sub
new {
my
(
$class
,
$character_semantics_input
,
$character_semantics_output
) =
@_
;
my
$self
=
$class
->SUPER::new(
api_version
=> 3,
handlers
=> [
start_document
=> [
"html_start"
,
"self"
],
start
=> [
"html_tag"
,
"self,tagname,attr,'+1'"
],
end_document
=> [
"html_end"
,
"self"
],
end
=> [
"html_tag"
,
"self,tagname,attr,'-1'"
],
text
=> [
"html_text"
,
"self,dtext"
],
comment
=> [
"html_comment"
,
"self,text"
],
declaration
=> [
"html_declaration"
,
"self,text"
],
],
marked_sections
=> 1);
$self
->{SA_character_semantics_input} =
$character_semantics_input
;
$self
->{SA_encode_results} =
$character_semantics_input
&& !
$character_semantics_output
;
$self
;
}
sub
html_start {
my
(
$self
) =
@_
;
$self
->put_results(
html
=> 1);
$self
->{basefont} = 3;
my
%default
= (
tag
=>
"default"
,
fgcolor
=>
"#000000"
,
bgcolor
=>
"#ffffff"
,
size
=>
$self
->{basefont});
push
@{
$self
->{text_style} }, \
%default
;
}
sub
html_end {
my
(
$self
) =
@_
;
delete
$self
->{text_style};
my
@uri
;
if
(
defined
$self
->{uri}) {
@uri
=
keys
%{
$self
->{uri}};
}
$self
->put_results(
uri
=> \
@uri
);
$self
->put_results(
anchor
=>
$self
->{anchor});
$self
->put_results(
uri_detail
=>
$self
->{uri});
$self
->put_results(
uri_truncated
=>
$self
->{uri_truncated});
$self
->put_results(
image_area
=>
$self
->{image_area});
$self
->put_results(
length
=>
$self
->{
length
});
$self
->put_results(
min_size
=>
$self
->{min_size});
$self
->put_results(
max_size
=>
$self
->{max_size});
if
(
exists
$self
->{tags}) {
$self
->put_results(
closed_extra_ratio
=>
(
$self
->{closed_extra} /
$self
->{tags}));
}
$self
->put_results(
comment
=>
$self
->{comment});
$self
->put_results(
script
=>
$self
->{script});
$self
->put_results(
title
=>
$self
->{title});
$self
->put_results(
inside
=>
$self
->{inside});
if
(
exists
$self
->{backhair}) {
$self
->put_results(
backhair_count
=>
scalar
keys
%{
$self
->{backhair} });
}
if
(
exists
$self
->{elements} &&
exists
$self
->{tags}) {
$self
->put_results(
bad_tag_ratio
=>
(
$self
->{tags} -
$self
->{elements}) /
$self
->{tags});
}
if
(
exists
$self
->{elements_seen} &&
exists
$self
->{tags_seen}) {
$self
->put_results(
non_element_ratio
=>
(
$self
->{tags_seen} -
$self
->{elements_seen}) /
$self
->{tags_seen});
}
if
(
exists
$self
->{tags} &&
exists
$self
->{obfuscation}) {
$self
->put_results(
obfuscation_ratio
=>
$self
->{obfuscation} /
$self
->{tags});
}
}
sub
put_results {
my
$self
=
shift
;
my
%results
=
@_
;
while
(
my
(
$k
,
$v
) =
each
%results
) {
$self
->{results}{
$k
} =
$v
;
}
}
sub
get_results {
my
(
$self
) =
@_
;
return
$self
->{results};
}
sub
get_rendered_text {
my
$self
=
shift
;
my
%options
=
@_
;
return
join
(
''
, @{
$self
->{text} })
unless
%options
;
my
$mask
;
while
(
my
(
$k
,
$v
) =
each
%options
) {
next
if
!
defined
$self
->{
"text_$k"
};
if
(!
defined
$mask
) {
$mask
|=
$v
?
$self
->{
"text_$k"
} : ~
$self
->{
"text_$k"
};
}
else
{
$mask
&=
$v
?
$self
->{
"text_$k"
} : ~
$self
->{
"text_$k"
};
}
}
my
$text
=
''
;
my
$i
= 0;
for
(@{
$self
->{text} }) {
$text
.=
$_
if
vec
(
$mask
,
$i
++, 1); }
return
$text
;
}
sub
parse {
my
(
$self
,
$text
) =
@_
;
$self
->{image_area} = 0;
$self
->{title_index} = -1;
$self
->{max_size} = 3;
$self
->{min_size} = 3;
$self
->{closed_html} = 0;
$self
->{closed_body} = 0;
$self
->{closed_extra} = 0;
$self
->{text} = [];
$self
->{
length
} += untaint_var(
length
(
$text
));
$text
=~ s/
 
;/ /g;
$text
=~ s/<(\w+)\s*\/>/<$1>/gi;
if
(utf8::is_utf8(
$text
)) {
$text
=~ s/(?:\x{201C}|\x{201D})/"/g;
}
else
{
$text
=~ s/\x{E2}\x{80}(?:\x{9C}|\x{9D})/"/g;
}
if
(!
$self
->UNIVERSAL::can(
'utf8_mode'
)) {
warn
"message: cannot set utf8_mode, module HTML::Parser is too old\n"
if
!
$self
->{SA_character_semantics_input};
}
else
{
$self
->SUPER::utf8_mode(
$self
->{SA_character_semantics_input} ? 0 : 1);
my
$utf8_mode
=
$self
->SUPER::utf8_mode;
dbg(
"message: HTML::Parser utf8_mode %s"
,
$utf8_mode
?
"on (assumed UTF-8 octets)"
:
"off (default, assumed Unicode characters)"
);
}
eval
{
local
$SIG
{__WARN__} =
sub
{
my
$err
=
$_
[0];
$err
=~ s/\s+/ /gs;
$err
=~ s/(.*) at .*/$1/s;
info(
"message: HTML::Parser warning: $err"
);
};
$self
->SUPER::parse(
$text
);
};
$self
->SUPER::parse(
"</style>"
)
while
exists
$self
->{inside}{style} &&
$self
->{inside}{style} > 0;
$self
->SUPER::parse(
"</script>"
)
while
exists
$self
->{inside}{script} &&
$self
->{inside}{script} > 0;
$self
->SUPER::
eof
;
return
$self
->{text};
}
sub
html_tag {
my
(
$self
,
$tag
,
$attr
,
$num
) =
@_
;
utf8::encode(
$tag
)
if
$self
->{SA_encode_results};
my
$maybe_namespace
= (
$tag
=~ m@^(?:o|st\d):[\w-]+/?$@);
if
(
exists
$elements
{
$tag
} ||
$maybe_namespace
) {
$self
->{elements}++;
$self
->{elements_seen}++
if
!
exists
$self
->{inside}{
$tag
};
}
$self
->{tags}++;
$self
->{tags_seen}++
if
!
exists
$self
->{inside}{
$tag
};
$self
->{inside}{
$tag
} +=
$num
;
if
(
$self
->{inside}{
$tag
} < 0) {
$self
->{inside}{
$tag
} = 0;
$self
->{closed_extra}++;
}
return
if
$maybe_namespace
;
if
(
exists
$elements
{
$tag
} ||
exists
$tricks
{
$tag
}) {
$self
->text_style(
$tag
,
$attr
,
$num
)
if
exists
$elements_text_style
{
$tag
};
$self
->html_whitespace(
$tag
)
if
exists
$elements_whitespace
{
$tag
};
if
(
$num
== 1) {
$self
->html_uri(
$tag
,
$attr
)
if
exists
$elements_uri
{
$tag
};
$self
->html_tests(
$tag
,
$attr
,
$num
);
}
else
{
$self
->{closed_html} = 1
if
$tag
eq
"html"
;
$self
->{closed_body} = 1
if
$tag
eq
"body"
;
}
}
}
sub
html_whitespace {
my
(
$self
,
$tag
) =
@_
;
if
(
$tag
eq
"br"
||
$tag
eq
"div"
) {
$self
->display_text(
"\n"
,
whitespace
=> 1);
}
elsif
(
$tag
=~ /^(?:li|t[hd]|d[td]|embed|h\d)$/) {
$self
->display_text(
" "
,
whitespace
=> 1);
}
elsif
(
$tag
=~ /^(?:p|hr|blockquote|pre|listing|plaintext|xmp|title)$/) {
$self
->display_text(
"\n\n"
,
whitespace
=> 1);
}
}
sub
push_uri {
my
(
$self
,
$type
,
$uri
) =
@_
;
$uri
=
$self
->canon_uri(
$uri
);
return
if
$uri
eq
''
;
utf8::encode(
$uri
)
if
$self
->{SA_encode_results};
if
(
$uri
=~ /^(?:data|mailto|file|cid|tel):/i) {
$self
->{uri}->{
$uri
}->{types}->{
$type
} = 1;
}
else
{
my
$target
= target_uri(
$self
->{base_href} ||
""
,
$uri
);
$self
->{uri}->{
$target
}->{types}->{
$type
} = 1
if
$target
ne
''
;
}
}
sub
canon_uri {
my
(
$self
,
$uri
) =
@_
;
$uri
=~ s/^[\s\xA0]+//;
$uri
=~ s/[\s\xA0]+$//;
if
(
length
$uri
> MAX_URI_LENGTH) {
$self
->{
'uri_truncated'
} = 1;
$uri
=
substr
$uri
, 0, MAX_URI_LENGTH;
}
return
$uri
;
}
sub
html_uri {
my
(
$self
,
$tag
,
$attr
) =
@_
;
if
(
$tag
=~ /^(?:body|table|
tr
|td)$/) {
if
(
defined
$attr
->{background}) {
$self
->push_uri(
$tag
,
$attr
->{background});
}
}
elsif
(
$tag
=~ /^(?:a|area|
link
)$/) {
while
(
my
(
$k
,
$v
) =
each
%$attr
) {
if
(
$k
=~ /\w{1,8}\/href/) {
delete
(
$attr
->{
$k
});
$attr
->{href} =
$v
;
}
}
if
(
defined
$attr
->{href}) {
if
(utf8::is_utf8(
$attr
->{href})) {
$attr
->{href} =~ s/\x{FEFF}//g;
}
else
{
$attr
->{href} =~ s/\x{EF}\x{BB}\x{BF}//g;
}
$self
->push_uri(
$tag
,
$attr
->{href});
}
if
(
defined
$attr
->{
'data-saferedirecturl'
}) {
$self
->push_uri(
$tag
,
$attr
->{
'data-saferedirecturl'
});
}
}
elsif
(
$tag
=~ /^(?:img|frame|iframe|embed|script|bgsound)$/) {
while
(
my
(
$k
,
$v
) =
each
%$attr
) {
if
(
$k
=~ /\w{1,8}\/src/) {
delete
(
$attr
->{
$k
});
$attr
->{src} =
$v
;
}
}
if
(
defined
$attr
->{src}) {
$self
->push_uri(
$tag
,
$attr
->{src});
}
}
elsif
(
$tag
eq
"form"
) {
if
(
defined
$attr
->{action}) {
$self
->push_uri(
$tag
,
$attr
->{action});
}
}
elsif
(
$tag
eq
"base"
) {
if
(
my
$uri
=
$attr
->{href}) {
$uri
=
$self
->canon_uri(
$uri
);
$self
->push_uri(
$tag
,
$uri
);
if
(
$uri
=~ m@^(?:https?|ftp):/{0,2}
@i
) {
$uri
=~ s@^([a-z]+:/{0,2}[^/]+/.*?)[^/\.]+\.[^/\.]{2,4}$@$1
@i
;
$uri
.=
"/"
unless
$uri
=~ m@/$@;
utf8::encode(
$uri
)
if
$self
->{SA_encode_results};
$self
->{base_href} =
$uri
;
}
}
}
elsif
(
$tag
eq
"meta"
&&
exists
$attr
->{
'http-equiv'
} &&
exists
$attr
->{content} &&
$attr
->{
'http-equiv'
} =~ /refresh/i &&
$attr
->{content} =~ /\burl\s*=/i)
{
my
$uri
=
$attr
->{content};
$uri
=~ s/^.*\burl\s*=\s*//i;
$uri
=~ s/\s*;.*//i;
$self
->push_uri(
$tag
,
$uri
);
}
}
sub
close_table_tag {
my
(
$self
,
$tag
) =
@_
;
return
unless
grep
{
$_
->{tag} eq
$tag
} @{
$self
->{text_style} };
my
$top
;
while
(@{
$self
->{text_style} } && (
$top
=
$self
->{text_style}[-1]->{tag})) {
if
((
$tag
eq
"td"
&& (
$top
eq
"font"
||
$top
eq
"td"
)) ||
(
$tag
eq
"tr"
&&
$top
=~ /^(?:font|td|
tr
)$/))
{
pop
@{
$self
->{text_style} };
}
else
{
last
;
}
}
}
sub
close_tag {
my
(
$self
,
$tag
) =
@_
;
return
if
!
grep
{
$_
->{tag} eq
$tag
} @{
$self
->{text_style} };
while
(
my
%current
= %{
pop
@{
$self
->{text_style} } }) {
last
if
$current
{tag} eq
$tag
;
}
}
sub
text_style {
my
(
$self
,
$tag
,
$attr
,
$num
) =
@_
;
$tag
=
"td"
if
$tag
eq
"th"
;
if
(
$num
== 1) {
if
(
$tag
eq
"body"
) {
}
if
(
$tag
eq
"basefont"
&&
exists
$attr
->{size} &&
$attr
->{size} =~ /^\s*(\d+)/)
{
$self
->{basefont} = $1;
return
;
}
$self
->close_table_tag(
$tag
)
if
(
$tag
eq
"td"
||
$tag
eq
"tr"
);
my
%new
= %{
$self
->{text_style}[-1] };
$new
{tag} =
$tag
;
if
(
$tag
eq
"big"
) {
$new
{size} += 1;
push
@{
$self
->{text_style} }, \
%new
;
return
;
}
if
(
$tag
eq
"small"
) {
$new
{size} -= 1;
push
@{
$self
->{text_style} }, \
%new
;
return
;
}
for
my
$name
(
keys
%$attr
) {
next
unless
exists
$ok_attributes
{
$tag
}{
$name
};
if
(
$name
eq
"text"
||
$name
eq
"color"
) {
$new
{fgcolor} = name_to_rgb(
$attr
->{
$name
});
}
elsif
(
$name
eq
"size"
) {
if
(
$attr
->{size} =~ /^\s*([+-]\d+)/) {
$new
{size} =
$self
->{basefont} + $1;
}
elsif
(
$attr
->{size} =~ /^\s*(\d+)/) {
$new
{size} = $1;
}
}
elsif
(
$name
eq
'style'
) {
$new
{style} =
$attr
->{style};
my
@parts
=
split
(/;/,
$new
{style});
foreach
(
@parts
) {
if
(/^\s*(background-)?color:\s*(.+?)\s*$/i) {
my
$whcolor
= $1 ?
'bgcolor'
:
'fgcolor'
;
my
$value
=
lc
$2;
$value
=~ s/\s+!important$//;
if
(
index
(
$value
,
'rgb'
) >= 0) {
$value
=~
tr
/0-9,//cd;
my
@rgb
=
split
(/,/,
$value
);
$new
{
$whcolor
} =
sprintf
(
"#%02x%02x%02x"
,
map
{ !
$_
? 0 :
$_
> 255 ? 255 :
$_
}
@rgb
[0..2]);
}
elsif
(
$value
eq
'inherit'
) {
}
else
{
$new
{
$whcolor
} = name_to_rgb(
$value
);
}
}
elsif
(/^\s*([a-z_-]+)\s*:\s*(\S.*?)\s*$/i) {
$new
{
'style_'
.$1} = $2;
}
}
}
elsif
(
$name
eq
"bgcolor"
) {
$attr
->{bgcolor} = name_to_rgb(
$attr
->{bgcolor});
}
else
{
$new
{
$name
} =
$attr
->{
$name
};
}
if
(
$new
{size} >
$self
->{max_size}) {
$self
->{max_size} =
$new
{size};
}
elsif
(
$new
{size} <
$self
->{min_size}) {
$self
->{min_size} =
$new
{size};
}
}
push
@{
$self
->{text_style} }, \
%new
;
}
else
{
if
(
$tag
ne
"body"
) {
$self
->close_tag(
$tag
);
}
}
}
sub
html_font_invisible {
my
(
$self
,
$text
) =
@_
;
my
$fg
=
$self
->{text_style}[-1]->{fgcolor};
my
$bg
=
$self
->{text_style}[-1]->{bgcolor};
my
$size
=
$self
->{text_style}[-1]->{size};
my
$display
=
$self
->{text_style}[-1]->{style_display};
my
$visibility
=
$self
->{text_style}[-1]->{style_visibility};
if
(
substr
(
$fg
,-6) eq
substr
(
$bg
,-6)) {
$self
->put_results(
font_low_contrast
=> 1);
return
1;
}
elsif
(
$fg
=~ /^\
my
(
$r1
,
$g1
,
$b1
) = (
hex
($1),
hex
($2),
hex
($3));
if
(
$bg
=~ /^\
my
(
$r2
,
$g2
,
$b2
) = (
hex
($1),
hex
($2),
hex
($3));
my
$r
= (
$r1
-
$r2
);
my
$g
= (
$g1
-
$g2
);
my
$b
= (
$b1
-
$b2
);
my
$distance
= ((0.2126
*$r
)**2 + (0.7152
*$g
)**2 + (0.0722
*$b
)**2)**0.5;
if
(
$distance
< 12) {
$self
->put_results(
font_low_contrast
=> 1);
return
1;
}
}
}
if
(
$fg
eq
'invalid'
or
$bg
eq
'invalid'
) {
$self
->put_results(
font_invalid_color
=> 1);
return
1;
}
if
(
$size
<= 1) {
return
1;
}
if
(
$display
&&
lc
$display
eq
'none'
) {
return
1;
}
if
(
$visibility
&&
lc
$visibility
eq
'hidden'
) {
return
1;
}
return
0;
}
sub
html_tests {
my
(
$self
,
$tag
,
$attr
,
$num
) =
@_
;
if
(
$tag
eq
"font"
&&
exists
$attr
->{face}) {
if
(
$attr
->{face} !~ /^\s*[
"'.]?[a-z ][a-z -]*[a-z]\d?["
']?(?:,\s*[
"']?[a-z][a-z -]*[a-z]\d?["
']?)*;?$/i) {
$self
->put_results(
font_face_bad
=> 1);
}
}
if
(
$tag
eq
"img"
&&
exists
$self
->{inside}{a} &&
$self
->{inside}{a} > 0) {
my
$uri
=
$self
->{anchor_last};
utf8::encode(
$uri
)
if
$self
->{SA_encode_results};
$self
->{uri}->{
$uri
}->{anchor_text}->[-1] .=
"<img>\n"
;
$self
->{anchor}->[-1] .=
"<img>\n"
;
}
if
(
$tag
eq
"img"
&&
exists
$attr
->{width} &&
exists
$attr
->{height}) {
my
$width
= 0;
my
$height
= 0;
my
$area
= 0;
if
(
$attr
->{width} =~ /^(\d+)(\%)?$/) {
$width
= $1;
$width
*= 8
if
(
defined
$2 && $2 eq
"%"
);
}
if
(
$attr
->{height} =~ /^(\d+)(\%)?$/) {
$height
= $1;
$height
*= 6
if
(
defined
$2 && $2 eq
"%"
);
}
$width
= 200
if
$width
<= 0;
$height
= 200
if
$height
<= 0;
if
(
$width
> 0 &&
$height
> 0) {
$area
=
$width
*
$height
;
$self
->{image_area} +=
$area
;
}
}
if
(
$tag
eq
"form"
&&
exists
$attr
->{action}) {
$self
->put_results(
form_action_mailto
=> 1)
if
$attr
->{action} =~ /mailto:/i
}
if
(
$tag
eq
"object"
||
$tag
eq
"embed"
) {
$self
->put_results(
embeds
=> 1);
}
if
(
$tag
eq
"a"
) {
my
$uri
=
$self
->{anchor_last} =
(
exists
$attr
->{href} ?
$self
->canon_uri(
$attr
->{href}) :
""
);
utf8::encode(
$uri
)
if
$self
->{SA_encode_results};
push
(@{
$self
->{uri}->{
$uri
}->{anchor_text}},
''
);
push
(@{
$self
->{anchor}},
''
);
}
if
(
$tag
eq
"title"
) {
$self
->{title_index}++;
$self
->{title}->[
$self
->{title_index}] =
""
;
}
if
(
$tag
eq
"meta"
&&
exists
$attr
->{
'http-equiv'
} &&
exists
$attr
->{content} &&
$attr
->{
'http-equiv'
} =~ /Content-Type/i &&
$attr
->{content} =~ /\bcharset\s*=\s*[
"']?([^"
']+)/i)
{
$self
->{charsets} .=
exists
$self
->{charsets} ?
" $1"
: $1;
}
}
sub
display_text {
my
$self
=
shift
;
my
$text
=
shift
;
my
%display
=
@_
;
if
(!
exists
$display
{invisible}) {
$display
{invisible} = 0;
}
if
(
$display
{whitespace}) {
if
(@{
$self
->{text} } &&
(!
defined
$self
->{text_whitespace} ||
!
vec
(
$self
->{text_whitespace}, $
(!
defined
$self
->{text_invisible} ||
!
vec
(
$self
->{text_invisible}, $
{
$self
->{text}->[-1] =~ s/ $//;
}
}
else
{
$text
=~ s/[ \t\n\r\f\x0b]+|\xc2\xa0/ /gs;
if
(@{
$self
->{text} } && !
$display
{invisible} &&
defined
$self
->{text_whitespace} &&
vec
(
$self
->{text_whitespace}, $
{
$text
=~ s/^ //;
}
}
push
@{
$self
->{text} },
$text
;
while
(
my
(
$k
,
$v
) =
each
%display
) {
my
$textvar
=
"text_"
.
$k
;
if
(!
exists
$self
->{
$textvar
}) {
$self
->{
$textvar
} =
''
; }
vec
(
$self
->{
$textvar
}, $
}
}
sub
html_text {
my
(
$self
,
$text
) =
@_
;
utf8::encode(
$text
)
if
$self
->{SA_encode_results};
if
(
exists
$self
->{inside}{script} &&
$self
->{inside}{script} > 0)
{
push
@{
$self
->{script} },
$text
;
return
;
}
if
(
exists
$self
->{inside}{style} &&
$self
->{inside}{style} > 0) {
return
;
}
if
(
exists
$self
->{inside}{a} &&
$self
->{inside}{a} > 0) {
my
$uri
=
$self
->{anchor_last};
utf8::encode(
$uri
)
if
$self
->{SA_encode_results};
$self
->{uri}->{
$uri
}->{anchor_text}->[-1] .=
$text
;
$self
->{anchor}->[-1] .=
$text
;
}
if
(
exists
$self
->{inside}{title} &&
$self
->{inside}{title} > 0) {
$self
->{title}->[
$self
->{title_index}] .=
$text
;
}
my
$invisible_for_bayes
= 0;
if
(
do
{(
my
$tmp
=
$text
) =~ s/(?:[ \t\n\r\f\x0b]|\xc2\xa0)//gs;
length
(
$tmp
)}) {
$invisible_for_bayes
=
$self
->html_font_invisible(
$text
);
}
if
(
exists
$self
->{text}->[-1]) {
if
(
$text
=~ /^[^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]/s &&
$self
->{text}->[-1] =~ /[^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]\z/s)
{
$self
->{obfuscation}++;
}
if
(
$self
->{text}->[-1] =~
/\b([^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]{1,7})\z/s)
{
my
$start
=
length
($1);
if
(
$text
=~ /^([^\s\x21-\x2f\x3a-\x40\x5b-\x60\x7b-\x7e]{1,7})\b/s) {
$self
->{backhair}->{
$start
.
"_"
.
length
($1)}++;
}
}
}
if
(
$invisible_for_bayes
) {
$self
->display_text(
$text
,
invisible
=> 1);
}
else
{
$self
->display_text(
$text
);
}
}
sub
html_comment {
my
(
$self
,
$text
) =
@_
;
utf8::encode(
$text
)
if
$self
->{SA_encode_results};
push
@{
$self
->{comment} },
$text
;
}
sub
html_declaration {
my
(
$self
,
$text
) =
@_
;
utf8::encode(
$text
)
if
$self
->{SA_encode_results};
if
(
$text
=~ /^<!doctype/i) {
my
$tag
=
"!doctype"
;
$self
->{elements}++;
$self
->{tags}++;
$self
->{inside}{
$tag
} = 0;
}
}
my
%html_color
= (
aqua
=> 0x00ffff,
black
=> 0x000000,
blue
=> 0x0000ff,
fuchsia
=> 0xff00ff,
gray
=> 0x808080,
green
=> 0x008000,
lime
=> 0x00ff00,
maroon
=> 0x800000,
navy
=> 0x000080,
olive
=> 0x808000,
purple
=> 0x800080,
red
=> 0xff0000,
silver
=> 0xc0c0c0,
teal
=> 0x008080,
white
=> 0xffffff,
yellow
=> 0xffff00,
aliceblue
=> 0xf0f8ff,
antiquewhite
=> 0xfaebd7,
aqua
=> 0x00ffff,
aquamarine
=> 0x7fffd4,
azure
=> 0xf0ffff,
beige
=> 0xf5f5dc,
bisque
=> 0xffe4c4,
black
=> 0x000000,
blanchedalmond
=> 0xffebcd,
blue
=> 0x0000ff,
blueviolet
=> 0x8a2be2,
brown
=> 0xa52a2a,
burlywood
=> 0xdeb887,
cadetblue
=> 0x5f9ea0,
chartreuse
=> 0x7fff00,
chocolate
=> 0xd2691e,
coral
=> 0xff7f50,
cornflowerblue
=> 0x6495ed,
cornsilk
=> 0xfff8dc,
crimson
=> 0xdc143c,
cyan
=> 0x00ffff,
darkblue
=> 0x00008b,
darkcyan
=> 0x008b8b,
darkgoldenrod
=> 0xb8860b,
darkgray
=> 0xa9a9a9,
darkgreen
=> 0x006400,
darkgrey
=> 0xa9a9a9,
darkkhaki
=> 0xbdb76b,
darkmagenta
=> 0x8b008b,
darkolivegreen
=> 0x556b2f,
darkorange
=> 0xff8c00,
darkorchid
=> 0x9932cc,
darkred
=> 0x8b0000,
darksalmon
=> 0xe9967a,
darkseagreen
=> 0x8fbc8f,
darkslateblue
=> 0x483d8b,
darkslategray
=> 0x2f4f4f,
darkslategrey
=> 0x2f4f4f,
darkturquoise
=> 0x00ced1,
darkviolet
=> 0x9400d3,
deeppink
=> 0xff1493,
deepskyblue
=> 0x00bfff,
dimgray
=> 0x696969,
dimgrey
=> 0x696969,
dodgerblue
=> 0x1e90ff,
firebrick
=> 0xb22222,
floralwhite
=> 0xfffaf0,
forestgreen
=> 0x228b22,
fuchsia
=> 0xff00ff,
gainsboro
=> 0xdcdcdc,
ghostwhite
=> 0xf8f8ff,
gold
=> 0xffd700,
goldenrod
=> 0xdaa520,
gray
=> 0x808080,
green
=> 0x008000,
greenyellow
=> 0xadff2f,
grey
=> 0x808080,
honeydew
=> 0xf0fff0,
hotpink
=> 0xff69b4,
indianred
=> 0xcd5c5c,
indigo
=> 0x4b0082,
ivory
=> 0xfffff0,
khaki
=> 0xf0e68c,
lavender
=> 0xe6e6fa,
lavenderblush
=> 0xfff0f5,
lawngreen
=> 0x7cfc00,
lemonchiffon
=> 0xfffacd,
lightblue
=> 0xadd8e6,
lightcoral
=> 0xf08080,
lightcyan
=> 0xe0ffff,
lightgoldenrodyellow
=> 0xfafad2,
lightgray
=> 0xd3d3d3,
lightgreen
=> 0x90ee90,
lightgrey
=> 0xd3d3d3,
lightpink
=> 0xffb6c1,
lightsalmon
=> 0xffa07a,
lightseagreen
=> 0x20b2aa,
lightskyblue
=> 0x87cefa,
lightslategray
=> 0x778899,
lightslategrey
=> 0x778899,
lightsteelblue
=> 0xb0c4de,
lightyellow
=> 0xffffe0,
lime
=> 0x00ff00,
limegreen
=> 0x32cd32,
linen
=> 0xfaf0e6,
magenta
=> 0xff00ff,
maroon
=> 0x800000,
mediumaquamarine
=> 0x66cdaa,
mediumblue
=> 0x0000cd,
mediumorchid
=> 0xba55d3,
mediumpurple
=> 0x9370db,
mediumseagreen
=> 0x3cb371,
mediumslateblue
=> 0x7b68ee,
mediumspringgreen
=> 0x00fa9a,
mediumturquoise
=> 0x48d1cc,
mediumvioletred
=> 0xc71585,
midnightblue
=> 0x191970,
mintcream
=> 0xf5fffa,
mistyrose
=> 0xffe4e1,
moccasin
=> 0xffe4b5,
navajowhite
=> 0xffdead,
navy
=> 0x000080,
oldlace
=> 0xfdf5e6,
olive
=> 0x808000,
olivedrab
=> 0x6b8e23,
orange
=> 0xffa500,
orangered
=> 0xff4500,
orchid
=> 0xda70d6,
palegoldenrod
=> 0xeee8aa,
palegreen
=> 0x98fb98,
paleturquoise
=> 0xafeeee,
palevioletred
=> 0xdb7093,
papayawhip
=> 0xffefd5,
peachpuff
=> 0xffdab9,
peru
=> 0xcd853f,
pink
=> 0xffc0cb,
plum
=> 0xdda0dd,
powderblue
=> 0xb0e0e6,
purple
=> 0x800080,
red
=> 0xff0000,
rosybrown
=> 0xbc8f8f,
royalblue
=> 0x4169e1,
saddlebrown
=> 0x8b4513,
salmon
=> 0xfa8072,
sandybrown
=> 0xf4a460,
seagreen
=> 0x2e8b57,
seashell
=> 0xfff5ee,
sienna
=> 0xa0522d,
silver
=> 0xc0c0c0,
skyblue
=> 0x87ceeb,
slateblue
=> 0x6a5acd,
slategray
=> 0x708090,
slategrey
=> 0x708090,
snow
=> 0xfffafa,
springgreen
=> 0x00ff7f,
steelblue
=> 0x4682b4,
tan
=> 0xd2b48c,
teal
=> 0x008080,
thistle
=> 0xd8bfd8,
tomato
=> 0xff6347,
turquoise
=> 0x40e0d0,
violet
=> 0xee82ee,
wheat
=> 0xf5deb3,
white
=> 0xffffff,
whitesmoke
=> 0xf5f5f5,
yellow
=> 0xffff00,
yellowgreen
=> 0x9acd32,
);
sub
name_to_rgb_old {
my
$color
=
lc
$_
[0];
my
$hex
=
$html_color
{
$color
};
if
(
defined
$hex
) {
return
sprintf
(
"#%06x"
,
$hex
);
}
$color
=~ s/^
$color
.=
"0"
x (3 - (
length
(
$color
) % 3))
if
(
length
(
$color
) % 3);
my
$length
=
length
(
$color
) / 3;
my
@colors
= (
$color
=~ /(.{
$length
})(.{
$length
})(.{
$length
})/);
foreach
(
@colors
) { s/.*(.{8})$/$1/; s/(..).*/$1/; s/^(.)$/0$1/ };
$color
=
join
(
""
,
@colors
);
$color
=~
tr
/0-9a-f/0/c;
return
"#"
.
$color
;
}
sub
name_to_rgb {
my
$color
=
lc
$_
[0];
my
$before
=
$color
;
$color
=~ s/^\s*//;
$color
=~ s/\s*$//;
my
$hex
=
$html_color
{
$color
};
if
(
defined
$hex
) {
return
sprintf
(
"#%06x"
,
$hex
);
}
if
(
$color
=~ m/^[
if
(
$color
=~ m/rgb\((\d+)%,\s*(\d+)%,\s*(\d+)%\s*\)/i) {
$color
=
"#"
.dec2hex(
int
($1/100*255)).dec2hex(
int
($2/100*255)).dec2hex(
int
($3/100*255));
}
if
(
$color
=~ m/rgb\((\d+),\s*(\d+),\s*(\d+)\s*\)/i) {
$color
=
"#"
.dec2hex($1).dec2hex($2).dec2hex($3);
}
if
(
$color
=~ m/^
$color
=~ s/[^a-f0-9]//ig;
if
(
length
(
$color
) > 6) {
$color
=
substr
(
$color
,0,6);
}
if
(
length
(
$color
) > 3 &&
length
(
$color
) < 6) {
$color
=
substr
(
$color
,0,3);
}
$color
.=
"0"
x (3 - (
length
(
$color
) % 3))
if
(
length
(
$color
) % 3);
if
(
length
(
$color
) == 3) {
$color
=~ m/(.)(.)(.)/;
$color
=
"$1$1$2$2$3$3"
;
}
}
else
{
return
"invalid"
;
}
}
else
{
return
"invalid"
;
}
return
"#"
.
$color
;
}
sub
dec2hex {
my
(
$dec
) =
@_
;
my
(
$pre
) =
''
;
if
(
$dec
< 16) {
$pre
=
'0'
;
}
return
sprintf
(
"$pre%lx"
,
$dec
);
}
sub
_parse_uri {
my
(
$u
) =
@_
;
my
%u
;
(
$u
{scheme},
$u
{authority},
$u
{path},
$u
{query},
$u
{fragment}) =
$u
=~ m|^(?:([^:/?
return
%u
;
}
sub
_remove_dot_segments {
my
(
$input
) =
@_
;
my
$output
=
""
;
$input
=~ s@^(?:\.\.?/)@/@;
while
(
$input
) {
if
(
$input
=~ s@^/\.(?:$|/)@/@) {
}
elsif
(
$input
=~ s@^/\.\.(?:$|/)@/@) {
$output
=~ s@/?[^/]*$@@;
}
elsif
(
$input
=~ s@(/?[^/]*)@@) {
$output
.= $1;
}
}
return
$output
;
}
sub
_merge_uri {
my
(
$base_authority
,
$base_path
,
$r_path
) =
@_
;
if
(
defined
$base_authority
&& !
$base_path
) {
return
"/"
.
$r_path
;
}
else
{
if
(
index
(
$base_path
,
'/'
) >= 0) {
$base_path
=~ s|(?<=/)[^/]*$||;
}
else
{
$base_path
=
""
;
}
return
$base_path
.
$r_path
;
}
}
sub
target_uri {
my
(
$base
,
$r
) =
@_
;
my
%r
= _parse_uri(
$r
);
my
%base
= _parse_uri(
$base
);
my
%t
;
if
((not URI_STRICT) and
(
defined
$r
{scheme} &&
defined
$base
{scheme}) and
(
$r
{scheme} eq
$base
{scheme}))
{
undef
$r
{scheme};
}
if
(
defined
$r
{scheme}) {
$t
{scheme} =
$r
{scheme};
$t
{authority} =
$r
{authority};
$t
{path} = _remove_dot_segments(
$r
{path});
$t
{query} =
$r
{query};
}
else
{
if
(
defined
$r
{authority}) {
$t
{authority} =
$r
{authority};
$t
{path} = _remove_dot_segments(
$r
{path});
$t
{query} =
$r
{query};
}
else
{
if
(
$r
{path} eq
""
) {
$t
{path} =
$base
{path};
if
(
defined
$r
{query}) {
$t
{query} =
$r
{query};
}
else
{
$t
{query} =
$base
{query};
}
}
else
{
if
(
$r
{path} =~ m|^/|) {
$t
{path} = _remove_dot_segments(
$r
{path});
}
else
{
$t
{path} = _merge_uri(
$base
{authority},
$base
{path},
$r
{path});
$t
{path} = _remove_dot_segments(
$t
{path});
}
$t
{query} =
$r
{query};
}
$t
{authority} =
$base
{authority};
}
$t
{scheme} =
$base
{scheme};
}
$t
{fragment} =
$r
{fragment};
my
$result
=
""
;
if
(
$t
{scheme}) {
$result
.=
$t
{scheme} .
":"
;
}
elsif
(
defined
$t
{authority}) {
if
(
$t
{authority} =~ /^www\d*\./i) {
$result
.=
"http:"
;
}
elsif
(
$t
{authority} =~ /^ftp\d*\./i) {
$result
.=
"ftp:"
;
}
}
if
(
defined
$t
{authority}) {
$result
.=
"//"
.
$t
{authority};
}
$result
.=
$t
{path};
if
(
defined
$t
{query}) {
$result
.=
"?"
.
$t
{query};
}
if
(
defined
$t
{fragment}) {
$result
.=
"#"
.
$t
{fragment};
}
return
$result
;
}
1;