#!/usr/bin/perl
use 5.016;

use Getopt::Long;
use XML::LibXML;
# Usage text printed by the -h/--help option; $0 is interpolated once, when
# this statement runs.
my $HELP = <<"HERE";
Usage: $0 [options] [file ...]
Options:
-e|--html-encoding=<enc> Specify input HTML encoding
-w|--width=<width> Max output line width
-h|--help Print this help message
HERE

# Lower-case names of elements rendered inline, i.e. without a paragraph
# break after them. Every other element not handled explicitly is rendered
# as a block.
my %INLINE = map { $_ => 1 } qw(
    a img span em i b strong mark s sub small address cite sup q wbr time code
    audio video
    site
);
# NOTE(review): "site" is not an HTML element and looks like a typo for
# "cite"; it is kept so documents that relied on it render as before.

# Accepted range for the --width option.
my $WIDTH_MAX = 512;
my $WIDTH_MIN = 10;

# Current output width in columns; overwritten by main() from --width.
my $Columns = 80;
# Wraps text so that no output line exceeds the target width.
#
# Parameters:
#   $str   - text to wrap.
#   $width - optional maximum line width; defaults to the file-level
#            $Columns setting when omitted or undef.
#
# Returns the wrapped text. Every whitespace character chosen as a break
# point is replaced by a newline, and a newline is also emitted at the end of
# the string.
sub wrap {
    my ($str, $width) = @_;
    $width //= $Columns;
    my $chunk = $width - 1;

    # Break tokens that cannot fit on one line into pieces of $chunk
    # characters followed by "-". The lookahead leaves the remainder of the
    # token unconsumed, so arbitrarily long tokens are split repeatedly
    # instead of only once. A piece is only cut when at least two more
    # characters follow, so a token of exactly $width characters stays whole.
    my $hyphenated = $str =~ s/(\S{$chunk})(?=\S{2,})/$1- /gr;

    # Greedily take up to $width characters that end at whitespace or at the
    # end of the string, and turn that boundary into a newline.
    return $hyphenated =~ s/(.{0,$width})(\s|$)/$1\n/gr;
}
# Renders a paragraph element: the text of all child nodes concatenated,
# followed by a blank line.
sub para2text {
    my ($element) = @_;
    my @pieces = map { _html2text($_) } $element->childNodes;
    return join('', @pieces) . "\n\n";
}
# Renders a heading element: the text of all child nodes concatenated,
# upper-cased, and followed by a blank line.
sub heading2text {
    my ($element) = @_;
    my $title = '';
    $title .= _html2text($_) for $element->childNodes;
    return uc($title . "\n\n");
}
# Renders a line-break element as a single newline. Any arguments are ignored.
sub break2text {
    return "\n";
}
# Renders an inline element: the text of all child nodes concatenated, with
# nothing added before or after.
sub inline2text {
    my ($element) = @_;
    my @pieces = map { _html2text($_) } $element->childNodes;
    return join('', @pieces);
}
# Renders a list item: a "* " bullet, the text of all child nodes
# concatenated, and a trailing blank line.
sub item2text {
    my ($element) = @_;
    my $content = join '', map { _html2text($_) } $element->childNodes;
    return '* ' . $content . "\n\n";
}
# Recursively renders one DOM node as unwrapped plain text.
#
# Parameters:
#   $node - the node to render.
#
# Returns:
#   - for an XML::LibXML::Element: the output of the first matching element
#     handler, or, when no handler matches, the rendered children followed
#     by a blank line;
#   - for an XML::LibXML::Text: its data with every run of whitespace
#     collapsed to a single space;
#   - for anything else: the empty string.
sub _html2text {
    my $node = shift;

    # Dispatch table, built once on the first call rather than on every
    # recursive call. Each entry pairs a predicate on the lower-cased element
    # name with the sub that renders such an element; entries are tried in
    # order.
    state $handlers = [
        { matches => sub { $_[0] eq 'p' },            render => \&para2text },
        { matches => sub { $_[0] =~ /^h[1-6]$/ },     render => \&heading2text },
        { matches => sub { $_[0] eq 'br' },           render => \&break2text },
        { matches => sub { exists $INLINE{ $_[0] } }, render => \&inline2text },
        { matches => sub { $_[0] eq 'li' },           render => \&item2text },
    ];

    # The class is compared by exact name on purpose.
    # NOTE(review): an isa() test would also accept subclasses; in
    # XML::LibXML comment nodes presumably derive from the text class, which
    # would leak comments into the output - confirm before loosening this.
    my $class = ref $node;

    if ($class eq 'XML::LibXML::Element') {
        my $tag = lc $node->nodeName;
        for my $handler (@{$handlers}) {
            return $handler->{render}->($node) if $handler->{matches}->($tag);
        }

        # No handler matched: render the children and end with a blank line.
        return join('', map { _html2text($_) } $node->childNodes) . "\n\n";
    }

    if ($class eq 'XML::LibXML::Text') {
        return $node->data =~ s/\s+/ /gr;
    }

    return '';
}
# Renders a DOM subtree as wrapped plain text.
#
# The raw text from _html2text() has spaces at the start and end of each line
# removed and runs of spaces squeezed to one; the result is passed through
# wrap(), and finally three or more consecutive whitespace-only line endings
# are reduced to a single blank line.
sub html2text {
    my ($root) = @_;

    my $raw      = _html2text($root);
    my $trimmed  = $raw =~ s/^\ +|\ +$//mgr;
    my $squeezed = $trimmed =~ s/\ +/ /gr;
    my $wrapped  = wrap($squeezed);

    return $wrapped =~ s/(\s*\n){3,}/\n\n/gr;
}
# Parses one HTML document and renders it as plain text.
#
# Parameters:
#   $enc    - input encoding passed to the parser, or undef.
#   %source - how to reach the document, passed straight through to
#             XML::LibXML->load_html (IO => handle, or location => path).
#
# Returns the text of the document's /html/body element, or of the document
# element when no such body is found.
sub _document2text {
    my ($enc, %source) = @_;

    my $dom = XML::LibXML->load_html(
        %source,
        recover  => 2,    # per XML::LibXML::Parser docs: recover silently
        encoding => $enc,
    );

    my ($body) = $dom->findnodes('/html/body');
    $body //= $dom->documentElement;

    return html2text($body);
}

# Script entry point: parses the command line, validates the requested
# width, and prints the plain-text rendering of each input document.
#
# Documents are read from the files named in @ARGV, or from STDIN when no
# file is given; the texts of several files are separated by a blank line.
# Dies on invalid options or on a width outside [$WIDTH_MIN, $WIDTH_MAX].
# Returns 1.
sub main {
    my $param = {
        Html  => undef,
        Width => 80,
        Enc   => undef,
    };

    # Configure() is the documented name; config() is a deprecated alias.
    Getopt::Long::Configure('bundling');
    GetOptions(
        'html-encoding|e=s' => \$param->{Enc},
        'width|w=i'         => \$param->{Width},
        'help|h'            => sub { print $HELP; exit 0 },
    ) or die "Error in command line arguments\n";

    $param->{Html} = @ARGV ? [@ARGV] : undef;

    if ($param->{Width} < $WIDTH_MIN or $param->{Width} > $WIDTH_MAX) {
        die "Width cannot be greater than $WIDTH_MAX or less than $WIDTH_MIN\n";
    }
    $Columns = $param->{Width};

    # Strict UTF-8 encoding layer rather than the unchecked :utf8 flag.
    binmode STDOUT, ':encoding(UTF-8)';

    # One parser-argument pair per input document.
    my @sources = defined $param->{Html}
        ? map { [ location => $_ ] } @{ $param->{Html} }
        : ([ IO => *STDIN ]);

    say join "\n\n", map { _document2text($param->{Enc}, @{$_}) } @sources;

    return 1;
}
main;