The Perl and Raku Conference 2025: Greenville, South Carolina - June 27-29 Learn more

#!/usr/bin/perl
# Copyright 2018 Jeffrey Kegler
# This file is part of Marpa::R2. Marpa::R2 is free software: you can
# redistribute it and/or modify it under the terms of the GNU Lesser
# General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later version.
#
# Marpa::R2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser
# General Public License along with Marpa::R2. If not, see
use 5.010001;
use strict;
use English qw( -no_match_vars );
use Fatal qw(open close);
sub usage
{
say {*STDERR} "$PROGRAM_NAME html_fmt [uri|file]" or die "say failed: $ERRNO";
exit 1;
}
my $help_flag = 0;
my $allowed_avoid_whitespace_flags = { map { ( $_, 1) } qw(comment yes no) };
my $avoid_whitespace_flag = 'yes';
local $main::ADDED_TAG_COMMENT_FLAG = 1;
my $ws_ok_before_end_tag_flag = 0;
my $ws_ok_after_start_tag_flag = 1;
my $dump_config_flag = 0;
my $dump_AHFA_flag = 0;
my $trace_terminals_flag = 0;
my $trace_cruft_flag = 0;
my $trace_values_flag = 0;
my $compile_flag;
usage()
if not Getopt::Long::GetOptions(
'help' => \$help_flag,
'avoid-whitespace=s' => \$avoid_whitespace_flag,
'ws-ok-before-end-tag!' => \$ws_ok_before_end_tag_flag,
'ws-ok-after-start-tag!' => \$ws_ok_after_start_tag_flag,
'added-tag-comment!' => \$main::ADDED_TAG_COMMENT_FLAG,
# undocumented
'dump-config' => \$dump_config_flag,
'dump-AHFA' => \$dump_AHFA_flag,
'compile=s' => \$compile_flag,
'trace-terminals' => \$trace_terminals_flag,
'trace-cruft' => \$trace_cruft_flag,
'trace-values' => \$trace_values_flag,
);
usage() if $help_flag or scalar @ARGV > 1;
if ( not $allowed_avoid_whitespace_flags->{$avoid_whitespace_flag} ) {
die "Bad avoid-whitespace flag\n",
'avoid-whitespace must be one of the following: ', join q{ },
keys %{$allowed_avoid_whitespace_flags};
}
my $locator = shift;
my $document;
GET_DOCUMENT: {
if ( not defined $locator ) {
local $RS = undef;
## no critic(InputOutput::ProhibitExplicitStdin)
$document = <STDIN>;
last GET_DOCUMENT;
}
if ( $locator =~ /\A [[:alnum:]]+ [:] /xms ) {
require WWW::Mechanize;
my $mech = WWW::Mechanize->new( autocheck => 1 );
$mech->get($locator);
$document = $mech->content;
undef $mech;
last GET_DOCUMENT;
} ## end if ( $locator =~ /\A [[:alnum:]]+ [:] /xms )
{
local $RS = undef;
open my $fh, q{<}, $locator;
$document = <$fh>;
close $fh;
}
} ## end GET_DOCUMENT:
sub post_process {
my ($value) = @_;
my @text_pieces = ();
DATUM: for my $line_data ( map { @{$_} } @{$value} ) {
my ( $type, $indent ) = @{$line_data};
if ( $type eq 'text' ) {
my $text = $line_data->[2];
my $has_trailing_ws = $text =~ s/ \s+ \z//xms;
if ( $text =~ s/ \A \s+ //xms ) {
push @text_pieces, [ 'whitespace', $indent ];
}
my @lines = grep { $_ =~ /\S/xms } split /\n/xms, $text;
for my $line_no ( 0 .. $#lines ) {
my $line = $lines[$line_no];
$line =~ s/\A[ \t]+//xms;
$line =~ s/[ \t]+\z//xms;
$line =~ s/[ \t]+/ /xms;
push @text_pieces, [ 'whitespace', $indent ] if $line_no;
push @text_pieces, [ 'text', $indent, $line ];
} ## end for my $line_no ( 0 .. $#lines )
push @text_pieces, [ 'whitespace', $indent ] if $has_trailing_ws;
next DATUM;
} ## end if ( $type eq 'text' )
if ( $type eq 'msg: missing start tag' ) {
my $location = $line_data->[2];
my $tagname = $line_data->[3];
GIVEN_LOCATION: {
if ( $location eq 'preceeding' ) {
push @text_pieces,
[
'html_fmt comment',
$indent + 1,
'Preceeding start tag is replacement for a missing one'
];
last GIVEN_LOCATION;
} ## end if ( $location eq 'preceeding' )
if ( $location eq 'following pre' ) {
push @text_pieces,
[
'html_fmt comment',
$indent,
"Inside following <pre>, a start tag is missing: <$tagname>"
];
last GIVEN_LOCATION;
} ## end if ( $location eq 'following pre' )
Carp::croak(
"Internal error: unprovided-for missing start tag location: $_"
);
} ## end GIVEN_LOCATION:
next DATUM;
} ## end if ( $type eq 'msg: missing start tag' )
if ( $type eq 'msg: missing end tag' ) {
my $location = $line_data->[2];
my $tagname = $line_data->[3];
GIVEN_LOCATION: {
if ( $location eq 'following' ) {
push @text_pieces,
[
'html_fmt comment',
$indent + 1,
'Following end tag is replacement for a missing one'
];
last GIVEN_LOCATION;
} ## end if ( $location eq 'following' )
if ( $location eq 'following pre' ) {
push @text_pieces,
[
'html_fmt comment',
$indent,
"Inside following <pre>, an end tag is missing: </$tagname>"
];
last GIVEN_LOCATION;
} ## end if ( $location eq 'following pre' )
Carp::croak(
"Internal error: unprovided-for missing end tag location: $_"
);
} ## end GIVEN_LOCATION:
next DATUM;
} ## end if ( $type eq 'msg: missing end tag' )
if ( $type eq 'msg: missing pre end tag' ) {
push @text_pieces,
[
'html_fmt comment',
$indent,
'An missing end tag was added to close the preceding <pre> element'
];
next DATUM;
} ## end if ( $type eq 'msg: missing pre end tag' )
if ( $type eq 'msg: missing pre start tag' ) {
push @text_pieces,
[
'html_fmt comment',
$indent,
'A missing start tag was added in front of the following <pre> element'
];
next DATUM;
} ## end if ( $type eq 'msg: missing pre start tag' )
if ( $type eq 'msg: cruft' ) {
my $location = $line_data->[2];
my $cruft = $line_data->[3];
my $token_type = $line_data->[4];
my $node_desc =
$token_type eq 'S' ? 'start tag'
: $token_type eq 'E' ? 'end tag'
: $token_type eq 'T' ? 'text'
: 'node';
GIVEN_LOCATION: {
if ( $location eq 'following' ) {
push @text_pieces,
[
'html_fmt comment',
$indent,
"Next $node_desc is cruft"
];
last GIVEN_LOCATION;
} ## end if ( $location eq 'following' )
if ( $location eq 'following pre' ) {
# Make sure the cruft quoted inside
# the HTML comment does not
# disrupt the comment.
( my $safe_cruft = $cruft ) =~ s/--/- -/xms;
push @text_pieces,
[
'html_fmt comment',
$indent,
qq{Inside the following <pre>, $node_desc is cruft:\n$safe_cruft}
];
last GIVEN_LOCATION;
} ## end if ( $location eq 'following pre' )
Carp::croak(
"Internal error: unprovided-for cruft location: $_");
} ## end GIVEN_LOCATION:
next DATUM;
} ## end if ( $type eq 'msg: cruft' )
push @text_pieces, [ $type, $indent, $line_data->[2] ];
} ## end for my $line_data ( map { @{$_} } @{$value} )
my @ws_unsafe;
$ws_unsafe[ scalar @text_pieces ] = 0; # size the array
$ws_unsafe[0] = 0;
my $last_non_comment_ix = scalar @text_pieces;
my $safe_after_piece = 0;
my $first_body_tag;
my $last_body_tag;
TEXT_PIECE: for my $text_piece_ix ( 0 .. $#text_pieces ) {
my ( $type, $indent, $text ) = @{ $text_pieces[$text_piece_ix] };
my $safe_before_piece = 0;
WHITESPACE_BY_TYPE: {
# If a comment, after-piece is whatever current value
# of $safe_after_piece,
# leave before as-is
last WHITESPACE_BY_TYPE if $type eq 'html_fmt comment';
last WHITESPACE_BY_TYPE if $type eq 'comment';
if ( $type eq 'whitespace' ) {
# Safe before and after other whitespace
$safe_before_piece = $safe_after_piece = 1;
last WHITESPACE_BY_TYPE;
} ## end if ( $type eq 'whitespace' )
if ( $type eq 'end tag' ) {
if ( $text =~ /\A [^[:alnum:]]+ body [^[:alnum:]]+ \z /xmsi )
{
$last_body_tag = $text_piece_ix;
}
# Unsafe after an end tag, but safe before it
$safe_before_piece = $ws_ok_before_end_tag_flag;
$safe_after_piece = 0;
last WHITESPACE_BY_TYPE;
} ## end if ( $type eq 'end tag' )
if ( $type eq 'start tag' ) {
if ( not defined $first_body_tag
and $text
=~ /\A [^[:alnum:]]+ body [^[:alnum:]]+ \z /xmsi )
{
$first_body_tag = $text_piece_ix;
} ## end if ( not defined $first_body_tag and $text =~ ...)
# Leave safe status as-is before start tag
# Whitespace safe after a start tag
$safe_after_piece = $ws_ok_after_start_tag_flag;
last WHITESPACE_BY_TYPE;
} ## end if ( $type eq 'start tag' )
if ( $type eq 'text' ) {
# Leave safe status as-is before text
# Whitespace not safe after text
$safe_after_piece = 0;
last WHITESPACE_BY_TYPE;
} ## end if ( $type eq 'text' )
# Blocks can occur inline, etc.
# So with everything else, be conservative.
# Whitespace is unsafe after, and left as-is before.
$safe_after_piece = 0;
} ## end WHITESPACE_BY_TYPE:
$ws_unsafe[ $text_piece_ix + 1 ] = $safe_after_piece ? 0 : 1;
if ($safe_before_piece) {
for my $ix ( $last_non_comment_ix + 1 .. $text_piece_ix ) {
$ws_unsafe[$ix] = 0;
}
}
if ( $type ne 'comment' and $type ne 'html_fmt comment' ) {
$last_non_comment_ix = $text_piece_ix;
}
} ## end TEXT_PIECE: for my $text_piece_ix ( 0 .. $#text_pieces )
# Whitespace safe before the body
$first_body_tag //= -1;
for (
my $text_piece_ix = 0;
$text_piece_ix <= $first_body_tag;
$text_piece_ix++
)
{
$ws_unsafe[$text_piece_ix] = 0;
} ## end for ( my $text_piece_ix = 0; $text_piece_ix <= $first_body_tag...)
# Whitespace safe after the body,
# or at least at the EOF if no end tag
$last_body_tag //= $#text_pieces;
for (
my $text_piece_ix = scalar @text_pieces;
$text_piece_ix > $last_body_tag;
$text_piece_ix--
)
{
$ws_unsafe[$text_piece_ix] = 0;
} ## end for ( my $text_piece_ix = scalar @text_pieces; $text_piece_ix...)
my @output = ();
CREATE_OUTPUT: {
if ( $avoid_whitespace_flag eq 'comment' ) {
TEXT_PIECE: for my $text_piece_ix ( 0 .. $#text_pieces ) {
my ( $type, $indent, $text ) =
@{ $text_pieces[$text_piece_ix] };
next TEXT_PIECE if $type eq 'whitespace';
my $suffix = $ws_unsafe[ $text_piece_ix + 1 ] ? '<!--' : q{};
my $indentation = q{ } x $indent;
if ( $type eq 'html_fmt comment' ) {
push @output,
$indentation
. ' html_fmt: '
. $text . ' -->'
. $suffix;
next TEXT_PIECE;
} ## end if ( $type eq 'html_fmt comment' )
if ( $ws_unsafe[$text_piece_ix] ) {
$indentation =
$indentation
. ' html_fmt: this comment is to avoid introducing whitespace'
. "\n"
. $indentation . '-->';
} ## end if ( $ws_unsafe[$text_piece_ix] )
if ( $type eq 'text' and $text =~ /\S/xms ) {
push @output, $indentation . $text . $suffix;
next TEXT_PIECE;
}
push @output, $indentation . $text . $suffix;
} ## end TEXT_PIECE: for my $text_piece_ix ( 0 .. $#text_pieces )
last CREATE_OUTPUT;
} ## end if ( $avoid_whitespace_flag eq 'comment' )
{
# $avoid_whitespace eq 'yes' or $avoid_whitespace eq 'no'
my $avoid_whitespace_is_yes = $avoid_whitespace_flag ne 'no';
TEXT_PIECE: for my $text_piece_ix ( 0 .. $#text_pieces ) {
my ( $type, $indent, $text ) =
@{ $text_pieces[$text_piece_ix] };
next TEXT_PIECE if $type eq 'whitespace';
if ( $avoid_whitespace_is_yes and $ws_unsafe[$text_piece_ix] )
{
my $indentation = q{ } x ($indent);
if ( $type eq 'html_fmt comment' ) {
$text =
qq{<!--\n}
. $indentation
. ' html_fmt: '
. $text . "\n"
. $indentation . ' -->';
} ## end if ( $type eq 'html_fmt comment' )
$output[-1] .= $text;
next TEXT_PIECE;
} ## end if ( $avoid_whitespace_is_yes and $ws_unsafe[...])
my $indentation = q{ } x $indent;
if ( $type eq 'text' and $text =~ /\S/xms ) {
push @output, $indentation . $text;
next TEXT_PIECE;
}
if ( $type eq 'html_fmt comment' ) {
push @output,
$indentation . '<!-- html_fmt: ' . $text . ' -->';
next TEXT_PIECE;
}
push @output, $indentation . $text;
} ## end TEXT_PIECE: for my $text_piece_ix ( 0 .. $#text_pieces )
last CREATE_OUTPUT;
}
} ## end CREATE_OUTPUT:
return join "\n", @output, q{};
} ## end sub post_process
sub do_pre {
my @new_line_data = ();
my $start_tag = Marpa::R2::HTML::start_tag();
my $end_tag = Marpa::R2::HTML::end_tag();
CHILD:
for my $value ( @{ Marpa::R2::HTML::values() } ) {
for my $line_data ( @{$value} ) {
if ( $line_data->[0] =~ /\A msg [:] /xms ) {
push @new_line_data,
[
$line_data->[0], 0,
'following pre', @{$line_data}[ 3 .. $#{$line_data} ]
];
} ## end if ( $line_data->[0] =~ /\A msg [:] /xms )
} ## end for my $line_data ( @{$value} )
} ## end CHILD: for my $value ( @{ Marpa::R2::HTML::values() } )
if ( not defined $start_tag ) {
push @new_line_data, [ 'msg: missing pre start tag', 0 ]
if $main::ADDED_TAG_COMMENT_FLAG;
push @new_line_data, [ 'start tag', 0, '<pre>' ];
}
my $original = Marpa::R2::HTML::original();
push @new_line_data, [ 'pre', 0, $original ];
if ( not defined $end_tag ) {
push @new_line_data, [ 'end tag', 0, '</pre>' ]
if $main::ADDED_TAG_COMMENT_FLAG;
push @new_line_data, [ 'msg: missing pre end tag', 0 ];
}
return \@new_line_data;
} ## end sub do_pre
sub do_cruft {
my $literal = Marpa::R2::HTML::literal();
my @new_line_data = ( [ 'msg: cruft', 0, 'following', $literal, Marpa::R2::HTML::token_type() ] );
push @new_line_data, [ 'cruft', 0, $literal ];
return \@new_line_data;
} ## end sub do_cruft
sub do_comment {
my $literal = Marpa::R2::HTML::literal();
return [ [ 'comment', 0, $literal ] ];
}
sub do_default {
my $tagname = Marpa::R2::HTML::tagname();
my @new_line_data = ();
my $start_tag = Marpa::R2::HTML::start_tag();
my $end_tag = Marpa::R2::HTML::end_tag();
my $is_empty_element = Marpa::R2::HTML::is_empty_element();
if ($is_empty_element) {
if ( defined $start_tag ) {
push @new_line_data, [ 'empty element', 0, $start_tag ];
}
else {
push @new_line_data, [ 'empty element', 0, '<' . $tagname . '>' ];
push @new_line_data,
[ 'msg: empty element missing start tag', 0, 'preceeding', $tagname ]
if $main::ADDED_TAG_COMMENT_FLAG;
} ## end else [ if ( defined $start_tag ) ]
return \@new_line_data;
}
{
my $tag_type = 'start tag';
if ( defined $start_tag ) {
push @new_line_data, [ $tag_type, 0, $start_tag ];
}
else {
push @new_line_data, [ $tag_type, 0, '<' . $tagname . '>' ];
push @new_line_data,
[ 'msg: missing start tag', 0, 'preceeding', $tagname ]
if $main::ADDED_TAG_COMMENT_FLAG;
} ## end else [ if ( defined $start_tag ) ]
}
my @descendant_data = @{ Marpa::R2::HTML::descendants('value,original') };
my $first_content_child = defined $start_tag ? 1 : 0;
my $last_content_child = $#descendant_data - ( defined $end_tag ? 1 : 0 );
CHILD:
for my $descendant_data_ix ( $first_content_child .. $last_content_child )
{
my ( $value, $original ) = @{ $descendant_data[$descendant_data_ix] };
if ( defined $value ) {
for my $line_data ( @{$value} ) {
my ( $type, $indent, @data ) = @{$line_data};
push @new_line_data, [ $type, $indent + 1, @data ];
}
next CHILD;
} ## end if ( defined $value )
push @new_line_data, [ 'text', 1, $original ];
} ## end CHILD: for my $descendant_data_ix ( $first_content_child .. ...)
if ( not $is_empty_element ) {
if ( defined $end_tag ) {
push @new_line_data, [ 'end tag', 0, $end_tag ];
}
else {
push @new_line_data,
[ 'msg: missing end tag', 0, 'following', $tagname ]
if $main::ADDED_TAG_COMMENT_FLAG;
push @new_line_data, [ 'end tag', 0, '</' . $tagname . '>' ];
} ## end else [ if ( defined $end_tag ) ]
} ## end if ( not $is_empty_element )
return \@new_line_data;
} ## end sub do_default
my %html_args = (
'script' => sub {
return [ [ 'script', 0, Marpa::R2::HTML::original() ] ];
},
':CRUFT' => \&do_cruft,
':COMMENT' => \&do_comment,
'pre' => \&do_pre,
q{*} => \&do_default,
':TOP' => sub { return Marpa::R2::HTML::values(); }
);
my %flags = (
trace_terminals => $trace_terminals_flag,
trace_cruft => $trace_cruft_flag,
trace_values => $trace_values_flag,
dump_config => $dump_config_flag,
dump_AHFA => $dump_AHFA_flag,
);
if (defined $compile_flag) {
open my $fh, q{<}, $compile_flag;
my $source = join q{}, <$fh>;
close $fh;
$flags{compile} = \$source;
}
my $value_ref = Marpa::R2::HTML::html( \$document, \%html_args, \\%flags );
die 'Internal error: no parse' if not defined $value_ref;
if ($dump_config_flag or $dump_AHFA_flag) {
print ${$value_ref};
exit 0;
}
print post_process($value_ref) or die "print failed: $ERRNO";
exit 0;
# vim: set expandtab shiftwidth=4: