#!/usr/bin/perl
use
5.010001;
use
Fatal
qw(open close)
;
sub
usage
{
say
{
*STDERR
}
"$PROGRAM_NAME html_fmt [uri|file]"
or
die
"say failed: $ERRNO"
;
exit
1;
}
my
$help_flag
= 0;
my
$allowed_avoid_whitespace_flags
= {
map
{ (
$_
, 1) }
qw(comment yes no)
};
my
$avoid_whitespace_flag
=
'yes'
;
local
$main::ADDED_TAG_COMMENT_FLAG
= 1;
my
$ws_ok_before_end_tag_flag
= 0;
my
$ws_ok_after_start_tag_flag
= 1;
my
$dump_config_flag
= 0;
my
$dump_AHFA_flag
= 0;
my
$trace_terminals_flag
= 0;
my
$trace_cruft_flag
= 0;
my
$trace_values_flag
= 0;
my
$compile_flag
;
usage()
if
not Getopt::Long::GetOptions(
'help'
=> \
$help_flag
,
'avoid-whitespace=s'
=> \
$avoid_whitespace_flag
,
'ws-ok-before-end-tag!'
=> \
$ws_ok_before_end_tag_flag
,
'ws-ok-after-start-tag!'
=> \
$ws_ok_after_start_tag_flag
,
'added-tag-comment!'
=> \
$main::ADDED_TAG_COMMENT_FLAG
,
'dump-config'
=> \
$dump_config_flag
,
'dump-AHFA'
=> \
$dump_AHFA_flag
,
'compile=s'
=> \
$compile_flag
,
'trace-terminals'
=> \
$trace_terminals_flag
,
'trace-cruft'
=> \
$trace_cruft_flag
,
'trace-values'
=> \
$trace_values_flag
,
);
usage()
if
$help_flag
or
scalar
@ARGV
> 1;
if
( not
$allowed_avoid_whitespace_flags
->{
$avoid_whitespace_flag
} ) {
die
"Bad avoid-whitespace flag\n"
,
'avoid-whitespace must be one of the following: '
,
join
q{ }
,
keys
%{
$allowed_avoid_whitespace_flags
};
}
my
$locator
=
shift
;
my
$document
;
GET_DOCUMENT: {
if
( not
defined
$locator
) {
local
$RS
=
undef
;
$document
= <STDIN>;
last
GET_DOCUMENT;
}
if
(
$locator
=~ /\A [[:alnum:]]+ [:] /xms ) {
my
$mech
= WWW::Mechanize->new(
autocheck
=> 1 );
$mech
->get(
$locator
);
$document
=
$mech
->content;
undef
$mech
;
last
GET_DOCUMENT;
}
{
local
$RS
=
undef
;
open
my
$fh
,
q{<}
,
$locator
;
$document
= <
$fh
>;
close
$fh
;
}
}
sub
post_process {
my
(
$value
) =
@_
;
my
@text_pieces
= ();
DATUM:
for
my
$line_data
(
map
{ @{
$_
} } @{
$value
} ) {
my
(
$type
,
$indent
) = @{
$line_data
};
if
(
$type
eq
'text'
) {
my
$text
=
$line_data
->[2];
my
$has_trailing_ws
=
$text
=~ s/ \s+ \z//xms;
if
(
$text
=~ s/ \A \s+ //xms ) {
push
@text_pieces
, [
'whitespace'
,
$indent
];
}
my
@lines
=
grep
{
$_
=~ /\S/xms }
split
/\n/xms,
$text
;
for
my
$line_no
( 0 ..
$#lines
) {
my
$line
=
$lines
[
$line_no
];
$line
=~ s/\A[ \t]+//xms;
$line
=~ s/[ \t]+\z//xms;
$line
=~ s/[ \t]+/ /xms;
push
@text_pieces
, [
'whitespace'
,
$indent
]
if
$line_no
;
push
@text_pieces
, [
'text'
,
$indent
,
$line
];
}
push
@text_pieces
, [
'whitespace'
,
$indent
]
if
$has_trailing_ws
;
next
DATUM;
}
if
(
$type
eq
'msg: missing start tag'
) {
my
$location
=
$line_data
->[2];
my
$tagname
=
$line_data
->[3];
GIVEN_LOCATION: {
if
(
$location
eq
'preceeding'
) {
push
@text_pieces
,
[
'html_fmt comment'
,
$indent
+ 1,
'Preceeding start tag is replacement for a missing one'
];
last
GIVEN_LOCATION;
}
if
(
$location
eq
'following pre'
) {
push
@text_pieces
,
[
'html_fmt comment'
,
$indent
,
"Inside following <pre>, a start tag is missing: <$tagname>"
];
last
GIVEN_LOCATION;
}
Carp::croak(
"Internal error: unprovided-for missing start tag location: $_"
);
}
next
DATUM;
}
if
(
$type
eq
'msg: missing end tag'
) {
my
$location
=
$line_data
->[2];
my
$tagname
=
$line_data
->[3];
GIVEN_LOCATION: {
if
(
$location
eq
'following'
) {
push
@text_pieces
,
[
'html_fmt comment'
,
$indent
+ 1,
'Following end tag is replacement for a missing one'
];
last
GIVEN_LOCATION;
}
if
(
$location
eq
'following pre'
) {
push
@text_pieces
,
[
'html_fmt comment'
,
$indent
,
"Inside following <pre>, an end tag is missing: </$tagname>"
];
last
GIVEN_LOCATION;
}
Carp::croak(
"Internal error: unprovided-for missing end tag location: $_"
);
}
next
DATUM;
}
if
(
$type
eq
'msg: missing pre end tag'
) {
push
@text_pieces
,
[
'html_fmt comment'
,
$indent
,
'An missing end tag was added to close the preceding <pre> element'
];
next
DATUM;
}
if
(
$type
eq
'msg: missing pre start tag'
) {
push
@text_pieces
,
[
'html_fmt comment'
,
$indent
,
'A missing start tag was added in front of the following <pre> element'
];
next
DATUM;
}
if
(
$type
eq
'msg: cruft'
) {
my
$location
=
$line_data
->[2];
my
$cruft
=
$line_data
->[3];
my
$token_type
=
$line_data
->[4];
my
$node_desc
=
$token_type
eq
'S'
?
'start tag'
:
$token_type
eq
'E'
?
'end tag'
:
$token_type
eq
'T'
?
'text'
:
'node'
;
GIVEN_LOCATION: {
if
(
$location
eq
'following'
) {
push
@text_pieces
,
[
'html_fmt comment'
,
$indent
,
"Next $node_desc is cruft"
];
last
GIVEN_LOCATION;
}
if
(
$location
eq
'following pre'
) {
(
my
$safe_cruft
=
$cruft
) =~ s/--/- -/xms;
push
@text_pieces
,
[
'html_fmt comment'
,
$indent
,
qq{Inside the following <pre>, $node_desc is cruft:\n$safe_cruft}
];
last
GIVEN_LOCATION;
}
Carp::croak(
"Internal error: unprovided-for cruft location: $_"
);
}
next
DATUM;
}
push
@text_pieces
, [
$type
,
$indent
,
$line_data
->[2] ];
}
my
@ws_unsafe
;
$ws_unsafe
[
scalar
@text_pieces
] = 0;
$ws_unsafe
[0] = 0;
my
$last_non_comment_ix
=
scalar
@text_pieces
;
my
$safe_after_piece
= 0;
my
$first_body_tag
;
my
$last_body_tag
;
TEXT_PIECE:
for
my
$text_piece_ix
( 0 ..
$#text_pieces
) {
my
(
$type
,
$indent
,
$text
) = @{
$text_pieces
[
$text_piece_ix
] };
my
$safe_before_piece
= 0;
WHITESPACE_BY_TYPE: {
last
WHITESPACE_BY_TYPE
if
$type
eq
'html_fmt comment'
;
last
WHITESPACE_BY_TYPE
if
$type
eq
'comment'
;
if
(
$type
eq
'whitespace'
) {
$safe_before_piece
=
$safe_after_piece
= 1;
last
WHITESPACE_BY_TYPE;
}
if
(
$type
eq
'end tag'
) {
if
(
$text
=~ /\A [^[:alnum:]]+ body [^[:alnum:]]+ \z /xmsi )
{
$last_body_tag
=
$text_piece_ix
;
}
$safe_before_piece
=
$ws_ok_before_end_tag_flag
;
$safe_after_piece
= 0;
last
WHITESPACE_BY_TYPE;
}
if
(
$type
eq
'start tag'
) {
if
( not
defined
$first_body_tag
and
$text
=~ /\A [^[:alnum:]]+ body [^[:alnum:]]+ \z /xmsi )
{
$first_body_tag
=
$text_piece_ix
;
}
$safe_after_piece
=
$ws_ok_after_start_tag_flag
;
last
WHITESPACE_BY_TYPE;
}
if
(
$type
eq
'text'
) {
$safe_after_piece
= 0;
last
WHITESPACE_BY_TYPE;
}
$safe_after_piece
= 0;
}
$ws_unsafe
[
$text_piece_ix
+ 1 ] =
$safe_after_piece
? 0 : 1;
if
(
$safe_before_piece
) {
for
my
$ix
(
$last_non_comment_ix
+ 1 ..
$text_piece_ix
) {
$ws_unsafe
[
$ix
] = 0;
}
}
if
(
$type
ne
'comment'
and
$type
ne
'html_fmt comment'
) {
$last_non_comment_ix
=
$text_piece_ix
;
}
}
$first_body_tag
//= -1;
for
(
my
$text_piece_ix
= 0;
$text_piece_ix
<=
$first_body_tag
;
$text_piece_ix
++
)
{
$ws_unsafe
[
$text_piece_ix
] = 0;
}
$last_body_tag
//=
$#text_pieces
;
for
(
my
$text_piece_ix
=
scalar
@text_pieces
;
$text_piece_ix
>
$last_body_tag
;
$text_piece_ix
--
)
{
$ws_unsafe
[
$text_piece_ix
] = 0;
}
my
@output
= ();
CREATE_OUTPUT: {
if
(
$avoid_whitespace_flag
eq
'comment'
) {
TEXT_PIECE:
for
my
$text_piece_ix
( 0 ..
$#text_pieces
) {
my
(
$type
,
$indent
,
$text
) =
@{
$text_pieces
[
$text_piece_ix
] };
next
TEXT_PIECE
if
$type
eq
'whitespace'
;
my
$suffix
=
$ws_unsafe
[
$text_piece_ix
+ 1 ] ?
'<!--'
:
q{}
;
my
$indentation
=
q{ }
x
$indent
;
if
(
$type
eq
'html_fmt comment'
) {
push
@output
,
$indentation
.
' html_fmt: '
.
$text
.
' -->'
.
$suffix
;
next
TEXT_PIECE;
}
if
(
$ws_unsafe
[
$text_piece_ix
] ) {
$indentation
=
$indentation
.
' html_fmt: this comment is to avoid introducing whitespace'
.
"\n"
.
$indentation
.
'-->'
;
}
if
(
$type
eq
'text'
and
$text
=~ /\S/xms ) {
push
@output
,
$indentation
.
$text
.
$suffix
;
next
TEXT_PIECE;
}
push
@output
,
$indentation
.
$text
.
$suffix
;
}
last
CREATE_OUTPUT;
}
{
my
$avoid_whitespace_is_yes
=
$avoid_whitespace_flag
ne
'no'
;
TEXT_PIECE:
for
my
$text_piece_ix
( 0 ..
$#text_pieces
) {
my
(
$type
,
$indent
,
$text
) =
@{
$text_pieces
[
$text_piece_ix
] };
next
TEXT_PIECE
if
$type
eq
'whitespace'
;
if
(
$avoid_whitespace_is_yes
and
$ws_unsafe
[
$text_piece_ix
] )
{
my
$indentation
=
q{ }
x (
$indent
);
if
(
$type
eq
'html_fmt comment'
) {
$text
=
qq{<!--\n}
.
$indentation
.
' html_fmt: '
.
$text
.
"\n"
.
$indentation
.
' -->'
;
}
$output
[-1] .=
$text
;
next
TEXT_PIECE;
}
my
$indentation
=
q{ }
x
$indent
;
if
(
$type
eq
'text'
and
$text
=~ /\S/xms ) {
push
@output
,
$indentation
.
$text
;
next
TEXT_PIECE;
}
if
(
$type
eq
'html_fmt comment'
) {
push
@output
,
$indentation
.
'<!-- html_fmt: '
.
$text
.
' -->'
;
next
TEXT_PIECE;
}
push
@output
,
$indentation
.
$text
;
}
last
CREATE_OUTPUT;
}
}
return
join
"\n"
,
@output
,
q{}
;
}
sub
do_pre {
my
@new_line_data
= ();
my
$start_tag
= Marpa::R2::HTML::start_tag();
my
$end_tag
= Marpa::R2::HTML::end_tag();
CHILD:
for
my
$value
( @{ Marpa::R2::HTML::
values
() } ) {
for
my
$line_data
( @{
$value
} ) {
if
(
$line_data
->[0] =~ /\A msg [:] /xms ) {
push
@new_line_data
,
[
$line_data
->[0], 0,
'following pre'
, @{
$line_data
}[ 3 .. $
];
}
}
}
if
( not
defined
$start_tag
) {
push
@new_line_data
, [
'msg: missing pre start tag'
, 0 ]
if
$main::ADDED_TAG_COMMENT_FLAG
;
push
@new_line_data
, [
'start tag'
, 0,
'<pre>'
];
}
my
$original
= Marpa::R2::HTML::original();
push
@new_line_data
, [
'pre'
, 0,
$original
];
if
( not
defined
$end_tag
) {
push
@new_line_data
, [
'end tag'
, 0,
'</pre>'
]
if
$main::ADDED_TAG_COMMENT_FLAG
;
push
@new_line_data
, [
'msg: missing pre end tag'
, 0 ];
}
return
\
@new_line_data
;
}
sub
do_cruft {
my
$literal
= Marpa::R2::HTML::literal();
my
@new_line_data
= ( [
'msg: cruft'
, 0,
'following'
,
$literal
, Marpa::R2::HTML::token_type() ] );
push
@new_line_data
, [
'cruft'
, 0,
$literal
];
return
\
@new_line_data
;
}
sub
do_comment {
my
$literal
= Marpa::R2::HTML::literal();
return
[ [
'comment'
, 0,
$literal
] ];
}
sub
do_default {
my
$tagname
= Marpa::R2::HTML::tagname();
my
@new_line_data
= ();
my
$start_tag
= Marpa::R2::HTML::start_tag();
my
$end_tag
= Marpa::R2::HTML::end_tag();
my
$is_empty_element
= Marpa::R2::HTML::is_empty_element();
if
(
$is_empty_element
) {
if
(
defined
$start_tag
) {
push
@new_line_data
, [
'empty element'
, 0,
$start_tag
];
}
else
{
push
@new_line_data
, [
'empty element'
, 0,
'<'
.
$tagname
.
'>'
];
push
@new_line_data
,
[
'msg: empty element missing start tag'
, 0,
'preceeding'
,
$tagname
]
if
$main::ADDED_TAG_COMMENT_FLAG
;
}
return
\
@new_line_data
;
}
{
my
$tag_type
=
'start tag'
;
if
(
defined
$start_tag
) {
push
@new_line_data
, [
$tag_type
, 0,
$start_tag
];
}
else
{
push
@new_line_data
, [
$tag_type
, 0,
'<'
.
$tagname
.
'>'
];
push
@new_line_data
,
[
'msg: missing start tag'
, 0,
'preceeding'
,
$tagname
]
if
$main::ADDED_TAG_COMMENT_FLAG
;
}
}
my
@descendant_data
= @{ Marpa::R2::HTML::descendants(
'value,original'
) };
my
$first_content_child
=
defined
$start_tag
? 1 : 0;
my
$last_content_child
=
$#descendant_data
- (
defined
$end_tag
? 1 : 0 );
CHILD:
for
my
$descendant_data_ix
(
$first_content_child
..
$last_content_child
)
{
my
(
$value
,
$original
) = @{
$descendant_data
[
$descendant_data_ix
] };
if
(
defined
$value
) {
for
my
$line_data
( @{
$value
} ) {
my
(
$type
,
$indent
,
@data
) = @{
$line_data
};
push
@new_line_data
, [
$type
,
$indent
+ 1,
@data
];
}
next
CHILD;
}
push
@new_line_data
, [
'text'
, 1,
$original
];
}
if
( not
$is_empty_element
) {
if
(
defined
$end_tag
) {
push
@new_line_data
, [
'end tag'
, 0,
$end_tag
];
}
else
{
push
@new_line_data
,
[
'msg: missing end tag'
, 0,
'following'
,
$tagname
]
if
$main::ADDED_TAG_COMMENT_FLAG
;
push
@new_line_data
, [
'end tag'
, 0,
'</'
.
$tagname
.
'>'
];
}
}
return
\
@new_line_data
;
}
my
%html_args
= (
'script'
=>
sub
{
return
[ [
'script'
, 0, Marpa::R2::HTML::original() ] ];
},
':CRUFT'
=> \
&do_cruft
,
':COMMENT'
=> \
&do_comment
,
'pre'
=> \
&do_pre
,
q{*}
=> \
&do_default
,
':TOP'
=>
sub
{
return
Marpa::R2::HTML::
values
(); }
);
my
%flags
= (
trace_terminals
=>
$trace_terminals_flag
,
trace_cruft
=>
$trace_cruft_flag
,
trace_values
=>
$trace_values_flag
,
dump_config
=>
$dump_config_flag
,
dump_AHFA
=>
$dump_AHFA_flag
,
);
if
(
defined
$compile_flag
) {
open
my
$fh
,
q{<}
,
$compile_flag
;
my
$source
=
join
q{}
, <
$fh
>;
close
$fh
;
$flags
{compile} = \
$source
;
}
my
$value_ref
= Marpa::R2::HTML::html( \
$document
, \
%html_args
, \\
%flags
);
die
'Internal error: no parse'
if
not
defined
$value_ref
;
if
(
$dump_config_flag
or
$dump_AHFA_flag
) {
print
${
$value_ref
};
exit
0;
}
print
post_process(
$value_ref
) or
die
"print failed: $ERRNO"
;
exit
0;