use
5.008;
DOCTYPE_NIL
=>
''
,
DOCTYPE_HTML32
=>
'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">'
,
DOCTYPE_HTML5
=>
'<!DOCTYPE html>'
,
DOCTYPE_LEGACY
=>
'<!DOCTYPE html SYSTEM "about:legacy-compat">'
,
DOCTYPE_HTML2
=>
'<!DOCTYPE html PUBLIC "-//IETF//DTD HTML 2.0//EN">'
,
};
our
$VERSION
=
'0.101'
;
our
%EXPORT_TAGS
= (
doctype
=> [
qw(DOCTYPE_NIL DOCTYPE_HTML32 DOCTYPE_HTML4 DOCTYPE_HTML5
DOCTYPE_LEGACY DOCTYPE_XHTML1 DOCTYPE_XHTML11 DOCTYPE_XHTML_BASIC
DOCTYPE_XHTML_RDFA DOCTYPE_HTML2 DOCTYPE_HTML40 DOCTYPE_HTML40_STRICT
DOCTYPE_HTML40_LOOSE DOCTYPE_HTML40_FRAMESET DOCTYPE_HTML401
DOCTYPE_HTML401_STRICT DOCTYPE_HTML401_LOOSE DOCTYPE_HTML401_FRAMESET
DOCTYPE_XHTML1_STRICT DOCTYPE_XHTML1_LOOSE DOCTYPE_XHTML1_FRAMESET
DOCTYPE_XHTML_MATHML_SVG DOCTYPE_XHTML_BASIC_10 DOCTYPE_XHTML_BASIC_11
DOCTYPE_HTML4_RDFA DOCTYPE_HTML401_RDFA11 DOCTYPE_HTML401_RDFA10
DOCTYPE_XHTML_RDFA10 DOCTYPE_XHTML_RDFA11)
]
);
our
@EXPORT_OK
= @{
$EXPORT_TAGS
{doctype} };
our
%Entities
;
our
@VoidElements
=
qw(area base br col command embed hr
img input keygen link meta param source track wbr)
;
our
@BooleanAttributes
=
qw(
hidden
audio@autoplay audio@preload audio@controls audio@loop
button@autofocus button@disabled button@formnovalidate
command@checked command@disabled
details@open
dl@compact
fieldset@disabled
form@novalidate
hr@noshade
iframe@seamless
img@ismap
input@autofocus input@checked input@disabled input@formnovalidate
input@multiple input@readonly input@required
keygen@autofocus keygen@disabled
ol@reversed
optgroup@disabled
option@disabled option@selected
script@async script@defer
select@autofocus select@disabled select@multiple select@readonly
select@required
style@scoped
textarea@autofocus textarea@disabled textarea@required
time@pubdate
track@default
video@autoplay video@preload video@controls video@loop
)
;
our
@OptionalStart
=
qw(html head body tbody)
;
our
@OptionalEnd
=
qw(html head body tbody dt dd li optgroup
option p rp rt td th tfoot thead tr)
;
BEGIN
{
eval
'use HTML::HTML5::Parser::NamedEntityList;'
;
unless
(@!)
{
while
(
my
(
$entity
,
$char
) =
each
(%{
$HTML::HTML5::Parser::TagSoupParser::EntityChar
}))
{
$Entities
{
$char
} =
$entity
if
$entity
=~ /;$/
&&
$Entities
{
$char
} cmp
$entity
;
}
}
$Entities
{
'&'
} =
'amp;'
;
$Entities
{
'"'
} =
'quot;'
;
$Entities
{
'<'
} =
'lt;'
;
$Entities
{
'>'
} =
'gt;'
;
}
sub
new
{
my
(
$class
,
%opts
) =
@_
;
$opts
{
'markup'
} ||=
'html'
;
$opts
{
'doctype'
} ||= DOCTYPE_HTML5;
$opts
{
'charset'
} ||=
'utf8'
;
return
bless
\
%opts
,
$class
;
}
sub
is_xhtml
{
my
(
$self
) =
@_
;
return
(
$self
->{
'markup'
} =~ m
'^(xml|xhtml|application/xml|text/xml|application/xhtml\+xml)$'
i);
}
sub
is_polyglot
{
my
(
$self
) =
@_
;
return
(
$self
->{
'polyglot'
} =~ /(yes|1)/i);
}
sub
should_quote_attributes
{
my
(
$self
) =
@_
;
return
(
$self
->{
'quote_attributes'
} =~ /(yes|1|always|force)/i)
||
$self
->is_xhtml
||
$self
->is_polyglot;
}
sub
should_slash_voids
{
my
(
$self
) =
@_
;
return
(
$self
->{
'voids'
} =~ /(slash)/i)
||
$self
->is_xhtml
||
$self
->is_polyglot;
}
sub
should_force_end_tags
{
my
(
$self
) =
@_
;
return
(
$self
->{
'end_tags'
} =~ /(yes|1|always|force)/i)
||
$self
->is_xhtml
||
$self
->is_polyglot;
}
sub
should_force_start_tags
{
my
(
$self
) =
@_
;
return
(
$self
->{
'start_tags'
} =~ /(yes|1|always|force)/i)
||
$self
->is_xhtml
||
$self
->is_polyglot;
}
sub
document
{
my
(
$self
,
$document
) =
@_
;
return
$self
->doctype() .
$self
->element(
$document
->documentElement);
}
sub
doctype
{
my
(
$self
) =
@_
;
return
$self
->{
'doctype'
};
}
sub
element
{
my
(
$self
,
$element
) =
@_
;
return
$element
->toString
my
$rv
=
''
;
my
$tagname
=
$element
->nodeName;
my
@attrs
=
$element
->attributes;
my
@kids
=
$element
->childNodes;
if
(
$tagname
eq
'html'
&& !
$self
->is_xhtml && !
$self
->is_polyglot)
{
@attrs
=
grep
{
$_
->nodeName ne
'xmlns'
}
@attrs
;
}
my
$omitstart
= 0;
if
(!
@attrs
and !
$self
->should_force_start_tags and
grep
{
$tagname
eq
$_
}
@OptionalStart
)
{
$omitstart
+=
eval
"return \$self->_check_omit_start_${tagname}(\$element);"
;
}
my
$omitend
= 0;
if
(!
$self
->should_force_end_tags and
grep
{
$tagname
eq
$_
}
@OptionalEnd
)
{
$omitend
+=
eval
"return \$self->_check_omit_end_${tagname}(\$element);"
;
}
unless
(
$omitstart
)
{
$rv
.=
'<'
.
$tagname
;
foreach
my
$a
(
@attrs
)
{
$rv
.=
' '
.
$self
->attribute(
$a
,
$element
);
}
}
if
(!
@kids
and
grep
{
$tagname
eq
$_
}
@VoidElements
and !
$omitstart
)
{
$rv
.=
$self
->should_slash_voids ?
' />'
:
'>'
;
return
$rv
;
}
$rv
.=
'>'
unless
$omitstart
;
foreach
my
$kid
(
@kids
)
{
if
(
$kid
->nodeName eq
'#text'
)
{
$rv
.=
$self
->text(
$kid
); }
elsif
(
$kid
->nodeName eq
'#comment'
)
{
$rv
.=
$self
->comment(
$kid
); }
elsif
(
$kid
->nodeName eq
'#cdata-section'
)
{
$rv
.=
$self
->cdata(
$kid
); }
else
{
$rv
.=
$self
->element(
$kid
); }
}
unless
(
$omitend
)
{
$rv
.=
'</'
.
$tagname
.
'>'
;
}
return
$rv
;
}
sub
attribute
{
my
(
$self
,
$attr
,
$element
) =
@_
;
my
$minimize
= 0;
my
$quote
= 1;
my
$quotechar
=
'"'
;
my
$attrname
=
$attr
->nodeName;
my
$elemname
=
$element
?
$element
->nodeName :
'*'
;
unless
(
$self
->should_quote_attributes)
{
if
((
$attr
->value eq
$attrname
or
$attr
->value eq
''
)
and
grep
{
$_
eq
$attrname
or
$_
eq
sprintf
(
'%s@%s'
,
$elemname
,
$attrname
) }
@BooleanAttributes
)
{
return
$attrname
;
}
if
(
$attr
->value =~ /^[A-Za-z0-9\._:-]+$/)
{
return
sprintf
(
'%s=%s'
,
$attrname
,
$attr
->value);
}
}
my
$encoded_value
;
if
(
$attr
->value !~ /\"/)
{
$quotechar
=
'"'
;
$encoded_value
=
$self
->encode_entities(
$attr
->value);
}
elsif
(
$attr
->value !~ /\'/)
{
$quotechar
=
"'"
;
$encoded_value
=
$self
->encode_entities(
$attr
->value);
}
else
{
$quotechar
=
'"'
;
$encoded_value
=
$self
->encode_entities(
$attr
->value,
characters
=>
"\""
);
}
return
sprintf
(
'%s=%s%s%s'
,
$attrname
,
$quotechar
,
$encoded_value
,
$quotechar
);
}
sub
comment
{
my
(
$self
,
$text
) =
@_
;
return
'<!--'
.
$self
->encode_entities(
$text
->nodeValue) .
'-->'
;
}
sub
cdata
{
my
(
$self
,
$text
) =
@_
;
if
(
$self
->is_polyglot &&
$text
->parentNode->nodeName =~ /^(script|style)$/i)
{
return
'/* <![CDATA[ */'
.
$text
->nodeValue .
'/* ]]> */'
;
}
elsif
(!
$self
->is_xhtml &&
$text
->parentNode->nodeName =~ /^(script|style)$/i)
{
return
$text
->nodeValue;
}
elsif
(!
$self
->is_xhtml)
{
return
$self
->text(
$text
);
}
else
{
return
'<![CDATA['
.
$text
->nodeValue .
']]>'
;
}
}
sub
text
{
my
(
$self
,
$text
) =
@_
;
if
(
$self
->is_polyglot &&
$text
->parentNode->nodeName =~ /^(script|style)$/i)
{
return
'/* <![CDATA[ */'
.
$text
->nodeValue .
'/* ]]> */'
;
}
elsif
(!
$self
->is_xhtml &&
$text
->parentNode->nodeName =~ /^(script|style)$/i)
{
return
$text
->nodeValue;
}
elsif
(
$text
->parentNode->nodeName =~ /^(script|style)$/i)
{
return
'<![CDATA['
.
$text
->nodeValue .
']]>'
;
}
return
$self
->encode_entities(
$text
->nodeValue,
characters
=>
"<>"
);
}
sub
encode_entities
{
my
(
$self
,
$string
,
%options
) =
@_
;
my
$characters
=
$options
{
'characters'
};
$characters
.=
'&'
;
$characters
.=
'\x{0}-\x{8}\x{B}\x{C}\x{E}-\x{1F}\x{26}\x{7F}'
;
$characters
.=
'\x{80}-\x{FFFFFF}'
unless
$self
->{
'charset'
} =~ /^utf[_-]?8$/i;
$string
=~ s/ ([
$characters
]) /
$self
->encode_entity($1) /egx;
return
$string
;
}
sub
encode_entity
{
my
(
$self
,
$char
) =
@_
;
return
unless
defined
$char
;
if
(
length
$char
> 1)
{
return
encode_entity(
substr
$char
, 0, 1).encode_entity(
substr
$char
, 1);
}
if
(
$char
=~ /^[&<>"]$/)
{
return
'&'
.
$Entities
{
$char
};
}
elsif
(!
$self
->is_xhtml &&
defined
$Entities
{
$char
})
{
return
'&'
.
$Entities
{
$char
};
}
elsif
(
$self
->{
'refs'
} =~ /dec/i)
{
return
sprintf
(
'&#%d;'
,
ord
$char
);
}
return
sprintf
(
'&#x%x;'
,
ord
$char
);
}
sub
_check_omit_end_body
{
my
(
$self
,
$element
) =
@_
;
my
$next
=
$element
->nextSibling;
unless
(
defined
$next
&&
$next
->nodeName eq
'#comment'
)
{
return
1
if
$element
->childNodes || !
$self
->_check_omit_start_body(
$element
);
}
}
sub
_check_omit_end_head
{
my
(
$self
,
$element
) =
@_
;
my
$next
=
$element
->nextSibling;
return
0
unless
defined
$next
;
return
0
if
$next
->nodeName eq
'#comment'
;
return
0
if
$next
->nodeName eq
'#text'
&&
$next
->nodeValue =~ /^\s/;
return
1;
}
sub
_check_omit_end_html
{
my
(
$self
,
$element
) =
@_
;
my
@bodies
=
$element
->getChildrenByTagName(
'body'
);
if
(
$bodies
[-1]->childNodes ||
$bodies
[-1]->attributes)
{
return
!
defined
$element
->nextSibling;
}
}
sub
_check_omit_end_dd
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( dd | dt )$/x;
}
*_check_omit_end_dt
= \
&_check_omit_end_dd
;
sub
_check_omit_end_li
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( li )$/x;
}
sub
_check_omit_end_optgroup
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( optgroup )$/x;
}
sub
_check_omit_end_option
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( option | optgroup )$/x;
}
sub
_check_omit_end_p
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( address | article | aside | blockquote | dir
| div | dl | fieldset | footer | form | h[1-6]
| header | hr | menu | nav | ol | p | pre | section
| table | ul )$/x;
}
sub
_check_omit_end_rp
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( rp | rt )$/x;
}
*_check_omit_end_rt
= \
&_check_omit_end_rp
;
sub
_check_omit_end_td
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( td | th )$/x;
}
*_check_omit_end_th
= \
&_check_omit_end_td
;
sub
_check_omit_end_tbody
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( tbody | tfoot )$/x;
}
sub
_check_omit_end_tfoot
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( tbody )$/x;
}
sub
_check_omit_end_thead
{
my
(
$self
,
$element
) =
@_
;
return
0
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( tbody | tfoot )$/x;
}
sub
_check_omit_end_tr
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^(
tr
)$/x;
}
sub
_check_omit_start_body
{
my
(
$self
,
$element
) =
@_
;
my
@kids
=
$element
->childNodes;
my
$next
=
$kids
[0];
return
0
unless
defined
$next
;
return
0
if
$next
->nodeName eq
'#comment'
;
return
0
if
$next
->nodeName eq
'#text'
&&
$next
->nodeValue =~ /^\s/;
return
0
if
$next
->nodeName eq
'style'
;
return
0
if
$next
->nodeName eq
'script'
;
return
1;
}
sub
_check_omit_start_head
{
my
(
$self
,
$element
) =
@_
;
my
@kids
=
$element
->childNodes;
return
(
@kids
and
$kids
[0]->nodeType==XML_ELEMENT_NODE);
}
sub
_check_omit_start_html
{
my
(
$self
,
$element
) =
@_
;
my
@kids
=
$element
->childNodes;
return
(
@kids
and
$kids
[0]->nodeName ne
'#comment'
);
}
sub
_check_omit_start_tbody
{
my
(
$self
,
$element
) =
@_
;
my
@kids
=
$element
->childNodes;
return
0
unless
@kids
;
return
0
unless
$kids
[0]->nodeName eq
'tr'
;
return
1
unless
defined
$element
->previousSibling;
return
1
if
$element
->previousSibling->nodeName eq
'tbody'
&&
$self
->_check_omit_end_tbody(
$element
->previousSibling);
return
1
if
$element
->previousSibling->nodeName eq
'thead'
&&
$self
->_check_omit_end_thead(
$element
->previousSibling);
return
1
if
$element
->previousSibling->nodeName eq
'tfoot'
&&
$self
->_check_omit_end_tfoot(
$element
->previousSibling);
}
1;