From Code to Community: Sponsoring The Perl and Raku Conference 2025 Learn more

use strict;
use Encode qw(is_utf8 decode_utf8);
our @EXPORT = (
'u',
'normalize_whitespace',
'html2text',
'is_empty',
);
sub u($) {
defined($_[0]) or return undef;
my $v = "".$_[0];
return is_utf8($v) ? $v : decode_utf8($v);
}
sub is_empty {
(! defined($_[0])) || $_[0] eq '';
}
sub normalize_whitespace {
local $_ = $_[0];
s/\h+/ /g;
s/\r\n/\n/g;
s/\A\s+//;
s/\s+\z//;
return $_;
}
sub html2text {
my $html = $_[0];
my $content_dom = Mojo::DOM->new('<body>' . $html . '</body>');
$content_dom->find('br')->map(replace => "\n");
$content_dom->find('div,p')->map(append => "\n\n");
my @paragraphs = grep { $_ ne '' } map { normalize_whitespace($_) } split /\n\n+/, $content_dom->all_text;
return join "\n\n", @paragraphs;
}
1;