package WWW::FetchStory::Fetcher;
$WWW::FetchStory::Fetcher::VERSION = '0.2201';
use strict;
use warnings;
=head1 NAME
WWW::FetchStory::Fetcher - fetching module for WWW::FetchStory
=head1 VERSION
version 0.2201
=head1 DESCRIPTION
This is the base class for story-fetching plugins for WWW::FetchStory.
=cut
require File::Temp;
use Date::Format;
use Encode::ZapCP1252;
use HTML::Entities;
use HTML::Strip;
use XML::LibXML;
use HTML::Tidy::libXML;
use EBook::EPUB;
use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
use YAML::Any;
use WWW::Mechanize::Sleepy;
use Encode qw( encode );
use HTTP::Cookies;
use HTTP::Cookies::Wget;
use HTTP::Cookies::Mozilla;
=head1 METHODS
=head2 new
$obj->WWW::FetchStory::Fetcher->new();
=cut
sub new {
my $class = shift;
my %parameters = @_;
my $self = bless ({%parameters}, ref ($class) || $class);
return ($self);
} # new
=head2 init
Initialize the object.
$obj->init(%args)
=cut
sub init {
my $self = shift;
my %parameters = @_;
foreach my $key (keys %parameters)
{
$self->{$key} = $parameters{$key};
}
if ($self->{use_wget})
{
$self->{wget_cmd} = 'wget';
if ($self->{wget_cookies} and -f $self->{wget_cookies})
{
$self->{wget_cmd} .= " --load-cookies " . $self->{wget_cookies};
}
if ($self->{debug})
{
$self->{wget_cmd} .= " --debug";
}
if ($self->{wget_options})
{
$self->{wget_cmd} .= ' ' . $self->{wget_options};
}
}
else
{
$self->{user_agent} = WWW::Mechanize::Sleepy->new(
keep_alive => 1,
env_proxy => 1,
sleep => '1..10',
agent => ref $self,
);
$self->{user_agent}->show_progress($self->{verbose} > 0);
if ($self->{firefox_cookies} and -f $self->{firefox_cookies})
{
my $cookies = HTTP::Cookies::Mozilla->new(
'file' => $self->{firefox_cookies},
hide_cookie2 => 1,
ignore_discard => 1,
);
print "\n--------------\n", $cookies->as_string, "\n------------\n" if ($self->{debug} && $self->{debug} > 2);
$self->{user_agent}->cookie_jar( $cookies );
}
elsif ($self->{wget_cookies} and -f $self->{wget_cookies})
{
my $cookies = HTTP::Cookies::Wget->new(
'file' => $self->{wget_cookies},
hide_cookie2 => 1,
ignore_discard => 1,
);
print "\n--------------\n", $cookies->as_string, "\n------------\n" if ($self->{debug} && $self->{debug} > 2);
$self->{user_agent}->cookie_jar( $cookies );
}
if ($self->{debug} && $self->{debug} > 1)
{
$self->{user_agent}->add_handler("request_send", sub { shift->dump; return });
$self->{user_agent}->add_handler("response_done", sub { shift->dump; return });
}
}
$self->{stripper} = HTML::Strip->new();
$self->{stripper}->add_striptag("head");
return ($self);
} # init
=head2 name
The name of the fetcher; this is basically the last component
of the module name. This works as either a class function or a method.
$name = $self->name();
$name = WWW::FetchStory::Fetcher::name($class);
=cut
sub name {
my $class = shift;
my $fullname = (ref ($class) ? ref ($class) : $class);
my @bits = split('::', $fullname);
return pop @bits;
} # name
=head2 info
Information about the fetcher.
By default this just returns the formatted name.
$info = $self->info();
=cut
sub info {
my $self = shift;
my $name = $self->name();
# split the name into words
my $info = $name;
$info =~ s/([A-Z])/ $1/g;
$info =~ s/^\s+//;
return $info;
} # info
=head2 priority
The priority of this fetcher. Fetchers with higher priority
get tried first. This is useful where there may be a generic
fetcher for a particular site, and then a more specialized fetcher
for particular sections of a site. For example, there may be a
generic LiveJournal fetcher, and then refinements for particular
LiveJournal community, such as the sshg_exchange community.
This works as either a class function or a method.
This must be overridden by the specific fetcher class.
$priority = $self->priority();
$priority = WWW::FetchStory::Fetcher::priority($class);
=cut
sub priority {
my $class = shift;
return 0;
} # priority
=head2 allow
If this fetcher can be used for the given URL, then this returns
true.
This must be overridden by the specific fetcher class.
if ($obj->allow($url))
{
....
}
=cut
sub allow {
my $self = shift;
my $url = shift;
return 0;
} # allow
=head2 fetch
Fetch the story, with the given options.
%story_info = $obj->fetch(
urls=>\@urls,
basename=>$basename,
toc=>0,
yaml=>0);
=over
=item basename
Optional basename used to construct the filenames.
If this is not given, the basename is derived from the title of the story.
=item epub
Create an EPUB file, deleting the HTML files which have been downloaded.
=item toc
Build a table-of-contents file if this is true.
=item yaml
Build a YAML file with meta-data about this story if this is true.
=item urls
The URLs of the story.
The first page is scraped for meta-information about the story,
including the title and author. Site-specific Fetcher plugins can find additional
information, including the URLs of all the chapters in a multi-chapter story.
=back
=cut
sub fetch {
my $self = shift;
my %args = (
urls=>undef,
basename=>'',
@_
);
$self->{verbose} = $args{verbose};
my $first_url = $args{urls}[0];
my $toc_content = $self->get_toc($first_url);
my %story_info = $self->parse_toc(%args, content=>$toc_content,
url=>$first_url);
my $basename = ($args{basename}
? $args{basename}
: $self->get_story_basename($story_info{title}));
$story_info{basename} = $basename;
my @storyfiles = ();
if (!$args{meta_only})
{
if ($args{epub} and exists $story_info{epub_url} and $story_info{epub_url})
{
my %epub_info = $self->get_epub(base=>$basename,
url=>$story_info{epub_url},
meta=>\%story_info);
$story_info{storyfiles} = [$epub_info{filename}];
$self->derive_values(info=>\%story_info);
warn Dump(\%story_info) if ($self->{verbose} > 1);
}
else
{
my @ch_urls = @{$story_info{chapters}};
my $one_chapter = (@ch_urls == 1);
my $first_chapter_is_toc =
$story_info{toc_first} || $self->{first_is_toc};
delete $story_info{toc_first};
my @ch_titles = ();
my @ch_wc = ();
my $count = (($one_chapter or $first_chapter_is_toc) ? 0 : 1);
foreach (my $i = 0; $i < @ch_urls; $i++)
{
my $ch_title = sprintf("%s (%d)", $story_info{title}, $i+1);
my %ch_info = $self->get_chapter(base=>$basename,
count=>$count,
url=>$ch_urls[$i],
title=>$ch_title);
push @storyfiles, $ch_info{filename};
push @ch_titles, $ch_info{title};
push @ch_wc, $ch_info{wordcount};
$story_info{wordcount} += $ch_info{wordcount};
$count++;
sleep 1; # try not to overload the archive
}
$self->derive_values(info=>\%story_info);
warn Dump(\%story_info) if ($self->{verbose} > 1);
$story_info{storyfiles} = \@storyfiles;
$story_info{chapter_titles} = \@ch_titles;
$story_info{chapter_wc} = \@ch_wc;
if ($args{toc} and !$args{epub}) # build a table-of-contents
{
my $toc = $self->build_toc(info=>\%story_info);
unshift @{$story_info{storyfiles}}, $toc;
unshift @{$story_info{chapter_titles}}, "Table of Contents";
}
if ($args{epub})
{
my $epub_file = $self->build_epub(info=>\%story_info);
# if we have built an EPUB file, then the storyfiles
# are now just one EPUB file.
$story_info{storyfiles} = [$epub_file];
}
}
}
if ($args{yaml})
{
my $filename = sprintf("%s.yml", $story_info{basename});
my $ofh;
open($ofh, ">", $filename) || die "Can't write to $filename";
print $ofh Dump(\%story_info);
close($ofh);
}
return %story_info;
} # fetch
=head1 Private Methods
=head2 get_story_basename
Figure out the file basename for a story by using its title.
$basename = $self->get_story_basename($title);
=cut
sub get_story_basename {
my $self = shift;
my $title = shift;
# make a word with only letters and numbers
# and remove HTML entities and UTF-8
# and with everything lowercase
# and the spaces replaced with underscores
my $base = $title;
$base =~ s/^The\s+//; # get rid of leading "The "
$base =~ s/^A\s+//; # get rid of leading "A "
$base =~ s/^An\s+//; # get rid of leading "An "
$base =~ s/-/ /g; # replace dashes with spaces
$base = decode_entities($base); # replace entities with UTF-8
$base =~ s/[^[:ascii:]]//g; # remove UTF-8
$base =~ s/[^\w\s]//g; # remove non-word characters
$base = lc($base);
my @words = split(' ', $base);
my $max_words = 3;
my @first_words = ();
# if there are three words or less, use all of them
if (@words <= $max_words)
{
@first_words = @words;
}
else
{
$max_words++ if (@words > 3); # four
$max_words++ if (@words > 5); # five if a lot
for (my $i = 0; $i < @words and @first_words < $max_words; $i++)
{
# skip little words
if ($words[$i] =~ /^(the|a|an|and)$/)
{
}
elsif (@words > 4 and $words[$i] =~ /^(of|to|in|or|on|by|i|is|isnt|its)$/)
{
# if there are a lot of words, skip these little words too
}
else
{
push @first_words, $words[$i];
}
}
}
return join('_', @first_words);
} # get_story_basename
=head2 extract_story
Extract the story-content from the fetched content.
my ($story, $title) = $self->extract_story(content=>$content,
title=>$title);
=cut
sub extract_story {
my $self = shift;
my %args = (
content=>'',
title=>'',
@_
);
my $story = '';
my $title = '';
if ($args{content} =~ m#<title>([^<]+)</title>#is)
{
$title = $1;
}
else
{
$title = $args{title};
}
# some badly formed pages have multiple BODY tags
if ($args{content} =~ m#<body[^>]*>.*?<body[^>]*>(.*?)</body>#is)
{
$story = $1;
}
elsif ($args{content} =~ m#<body[^>]*>(.*)</body>#is)
{
$story = $1;
}
elsif ($args{content} =~ m#</head>(.*)#is)
{
$story = $1;
}
if ($story)
{
$story = $self->tidy_chars($story);
}
else
{
$story = $args{content};
}
return ($story, $title);
} # extract_story
=head2 make_css
Create site-specific CSS styling.
$css = $self->make_css();
=cut
sub make_css {
my $self = shift;
return '';
} # make_css
=head2 tidy
Make a tidy, compliant XHTML page from the given story-content.
$content = $self->tidy(story=>$story,
title=>$title);
=cut
sub tidy {
my $self = shift;
my %args = (
story=>'',
title=>'',
@_
);
my $story = $args{story};
$story = $self->tidy_chars($story);
my $title = $args{title};
my $css = $self->make_css(%args);
my $html = '';
$html .= "<html>\n";
$html .= "<head>\n";
$html .= "<title>$title</title>\n";
$html .= $css if $css;
$html .= "</head>\n";
$html .= "<body>\n";
$html .= "$story\n";
$html .= "</body>\n";
$html .= "</html>\n";
my $tidy = HTML::Tidy::libXML->new();
$html = encode("UTF-8", $html);
my $xhtml = $tidy->clean($html, 'UTF-8', 1);
# fixing some errors
$xhtml =~ s!xmlns="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml"!xmlns="http://www.w3.org/1999/xhtml"!;
$xhtml =~ s!<i/>!!g;
$xhtml =~ s!<b/>!!g;
return $xhtml;
} # tidy
=head2 get_toc
Get a table-of-contents page.
=cut
sub get_toc {
my $self = shift;
my $url = shift;
return $self->get_page($url);
} # get_toc
=head2 get_page
Get the contents of a URL.
=cut
sub get_page {
my $self = shift;
my $url = shift;
warn "getting $url\n" if $self->{verbose};
my $content = '';
if ($self->{use_wget})
{
my $cmd = sprintf("%s -O %s '%s'", $self->{wget_cmd}, '-', $url);
warn "$cmd\n" if ($self->{verbose} > 1);
my $ifh;
open($ifh, "${cmd}|") or die "FAILED $cmd: $!";
while(<$ifh>)
{
$content .= $_;
}
close($ifh);
}
else
{
my $can_accept = HTTP::Message::decodable;
my $res = $self->{user_agent}->get($url,
'Accept-Encoding' => $can_accept,
'Keep-Alive' => "300",
'Connection' => 'keep-alive',
);
# Check the outcome of the response
if ($res->is_success) {
print $res->status_line, "\n" if $self->{debug};
}
else {
die "FAILED fetching $url ", $res->status_line;
}
$content = $res->decoded_content || $res->content;
}
if (!$content and $self->{verbose})
{
warn "No content from $url";
if ($self->{debug})
{
# there's a problem, we want to debug it
exit;
}
}
return $content;
} # get_page
=head2 parse_toc
Parse the table-of-contents file.
This must be overridden by the specific fetcher class.
%info = $self->parse_toc(content=>$content,
url=>$url,
urls=>\@urls);
This should return a hash containing:
=over
=item chapters
An array of URLs for the chapters of the story. In the case where the
story only takes one page, that will be the chapter.
In the case where multiple URLs have been passed in, it will be those URLs.
=item title
The title of the story.
=back
It may also return additional information, such as Summary.
=cut
sub parse_toc {
my $self = shift;
my %args = (
url=>'',
content=>'',
@_
);
my %info = ();
$info{url} = $args{url};
$info{title} = $self->parse_title(%args);
$info{author} = $self->parse_author(%args);
$info{summary} = $self->parse_summary(%args);
$info{characters} = $self->parse_characters(%args);
$info{universe} = $self->parse_universe(%args);
$info{category} = $self->parse_category(%args);
$info{rating} = $self->parse_rating(%args);
$info{chapters} = $self->parse_chapter_urls(%args);
$info{epub_url} = $self->parse_epub_url(%args);
return %info;
} # parse_toc
=head2 parse_chapter_urls
Figure out the URLs for the chapters of this story.
=cut
sub parse_chapter_urls {
my $self = shift;
my %args = (
url=>'',
content=>'',
@_
);
my @chapters = ();
if (defined $args{urls})
{
@chapters = @{$args{urls}};
}
else
{
@chapters = ($args{url});
}
return \@chapters;
} # parse_chapter_urls
=head2 parse_epub_url
Figure out the URL for the EPUB version of this story, if there is one.
=cut
sub parse_epub_url {
my $self = shift;
my %args = (
url=>'',
content=>'',
@_
);
return undef;
} # parse_epub_url
=head2 parse_title
Get the title from the content
=cut
sub parse_title {
my $self = shift;
my %args = (
content=>'',
@_
);
my $content = $args{content};
my $title = '';
if ($content =~ /<(?:b|strong)>Title:?\s*<\/(?:b|strong)>:?\s*"?(.*?)"?\s*<(?:br|p|\/p|div|\/div)/si)
{
$title = $1;
}
elsif ($content =~ /\bTitle:\s*"?(.*?)"?\s*<br/s)
{
$title = $1;
}
elsif ($content =~ m#<h1>([^<]+)</h1>#is)
{
$title = $1;
}
elsif ($content =~ m#<p class=MsoTitle>([^<]+)</p>#is)
{
$title = $1;
}
elsif ($content =~ m#<h2>([^<]+)</h2>#is)
{
$title = $1;
}
elsif ($content =~ m#<h3>([^<]+)</h3>#is)
{
$title = $1;
}
elsif ($content =~ m#<h4>([^<]+)</h4>#is)
{
$title = $1;
}
elsif ($content =~ m#<title>([^<]+)</title>#is)
{
$title = $1;
}
$title =~ s/<u>//ig;
$title =~ s/<\/u>//ig;
return $title;
} # parse_title
=head2 parse_ch_title
Get the chapter title from the content
=cut
sub parse_ch_title {
my $self = shift;
my %args = (
content=>'',
@_
);
my $content = $args{content};
my $title = '';
if ($content =~ /Chapter \d+[:.]?\s*([^<]+)/si)
{
$title = $1;
}
else
{
$title = $self->parse_title(%args);
}
$title =~ s/<u>//ig;
$title =~ s/<\/u>//ig;
return $title;
} # parse_ch_title
=head2 parse_author
Get the author from the content
=cut
sub parse_author {
my $self = shift;
my %args = (
content=>'',
@_
);
my $content = $args{content};
my $author = '';
if ($content =~ /<(?:b|strong)>Author:?\s*<\/(?:b|strong)>:?\s*"?(.*?)"?\s*<(?:br|p|\/p|div|\/div)/si)
{
$author = $1;
}
elsif ($content =~ /\bAuthor:\s*"?(.*?)"?\s*<br/si)
{
$author = $1;
}
elsif ($content =~ /<meta name="author" content="(.*?)"/si)
{
$author = $1;
}
elsif ($content =~ /<p>by (.*?)<br/si)
{
$author = $1;
}
return $author;
} # parse_author
=head2 parse_summary
Get the summary from the content
=cut
sub parse_summary {
my $self = shift;
my %args = (
content=>'',
@_
);
my $content = $args{content};
my $summary = '';
if ($content =~ /<(?:b|strong)>Summary:?\s*<\/(?:b|strong)>:?\s*"?(.*?)"?\s*<(?:br|p|\/p|div|\/div)/si)
{
$summary = $1;
}
elsif ($content =~ m#<i>Summary:</i>\s*([^<]+)\s*<br>#s)
{
$summary = $1;
}
elsif ($content =~ m#>Summary:\s*</span>\s*([^<]+)\s*<br#s)
{
$summary = $1;
}
elsif ($content =~ /<i>Summary:<\/i>\s*(.*?)\s*$/m)
{
$summary = $1;
}
elsif ($content =~ m#<tr><(?:th|td)>Summary</(?:th|td)><td>(.*?)</td></tr>#s)
{
$summary = $1;
$summary =~ s/<br>/ /g;
}
elsif ($content =~ /\bSummary:\s*"?(.*?)"?\s*<(?:br|p|\/p|div|\/div)/si)
{
$summary = $1;
}
elsif ($content =~ m#(?:Prompt|Summary):</b>([^<]+)#is)
{
$summary = $1;
}
elsif ($content =~ m#(?:Prompt|Summary):</strong>([^<]+)#is)
{
$summary = $1;
}
elsif ($content =~ m#(?:Prompt|Summary):</u>([^<]+)#is)
{
$summary = $1;
}
return $summary;
} # parse_summary
=head2 parse_characters
Get the characters from the content
=cut
sub parse_characters {
my $self = shift;
my %args = (
content=>'',
@_
);
my $content = $args{content};
my $characters = '';
if ($content =~ />Characters:?\s*<\/(?:b|strong)>:?\s*"?(.*?)"?\s*<(?:br|p|\/p|div|\/div)/si)
{
$characters = $1;
}
elsif ($content =~ /\bCharacters:\s*"?(.*?)"?\s*<br/si)
{
$characters = $1;
}
elsif ($content =~ m#<i>Characters:</i>\s*([^<]+)\s*<br>#s)
{
$characters = $1;
}
elsif ($content =~ m#(?:Pairings?|Characters):</(?:b|strong|u)>\s*([^<]+)#is)
{
$characters = $1;
}
elsif ($content =~ m#<tr><(?:th|td)>(?:Pairings?|Characters)</(?:th|td)><td>(.*?)</td></tr>#s)
{
$characters = $1;
$characters =~ s/<br>/, /g;
}
return $characters;
} # parse_characters
=head2 parse_universe
Get the universe/fandom from the content
=cut
sub parse_universe {
my $self = shift;
my %args = (
content=>'',
@_
);
my $content = $args{content};
my $universe = '';
if ($content =~ m#(?:Universe|Fandom):</(?:b|strong|u)>([^<]+)#is)
{
$universe = $1;
}
return $universe;
} # parse_universe
=head2 parse_recipient
Get the recipient from the content
=cut
sub parse_recipient {
my $self = shift;
my %args = (
content=>'',
@_
);
my $content = $args{content};
my $recipient = '';
if ($content =~ m#(?:Recipient|Prompter): (\w+)#is)
{
$recipient = $1;
}
elsif ($content =~ m#Recipient:</(?:b|strong|u)>([^<]+)#is)
{
$recipient = $1;
}
return $recipient;
} # parse_recipient
=head2 parse_category
Get the categories from the content
=cut
sub parse_category {
my $self = shift;
my %args = (
content=>'',
@_
);
my $content = $args{content};
my $category = '';
if ($content =~ m#(?:Category|Tags):</(?:b|strong|u)>([^<]+)#is)
{
$category = $1;
}
elsif ($content =~ m#<tr><(?:th|td)>Categories</(?:th|td)><td>(.*?)</td></tr>#s)
{
$category = $1;
$category =~ s/<br>/, /g;
}
return $category;
} # parse_category
=head2 parse_rating
Get the rating from the content
=cut
sub parse_rating {
my $self = shift;
my %args = (
content=>'',
@_
);
my $content = $args{content};
my $rating = '';
if ($content =~ m!^Rating:\s(.*?)$!m)
{
$rating = $1;
}
elsif ($content =~ m#Rating:</(?:b|strong|u)>\s*([^<]+)#is)
{
$rating = $1;
}
return $rating;
} # parse_rating
=head2 derive_values
Calculate additional Meta values, such as current date.
=cut
sub derive_values {
my $self = shift;
my %args = @_;
my $today = time2str('%Y-%m-%d', time);
$args{info}->{fetch_date} = $today;
my $words = $args{info}->{wordcount};
if ($words)
{
my $len = '';
if ($words == 100)
{
$len = 'Drabble';
} elsif ($words == 200)
{
$len = 'Double Drabble';
} elsif ($words >= 75000)
{
$len = 'Long Novel';
} elsif ($words >= 50000)
{
$len = 'Novel';
} elsif ($words >= 25000)
{
$len = 'Novella';
} elsif ($words >= 7500)
{
$len = 'Novelette';
} elsif ($words >= 2000)
{
$len = 'Short Story';
} elsif ($words > 500)
{
$len = 'Short Short';
} elsif ($words <= 500)
{
$len = 'Flash';
}
$args{info}->{story_length} = $len if $len;
}
for my $field (qw{characters universe category})
{
if (exists $args{info}->{$field}
and defined $args{info}->{$field}
and $args{info}->{$field} =~ /,/s)
{
my @chars = split(/,\s*/s, $args{info}->{$field});
$args{info}->{$field} = \@chars;
}
}
} # derive_values
=head2 get_chapter
Get an individual chapter of the story, tidy it,
and save it to a file.
$filename = $obj->get_chapter(base=>$basename,
count=>$count,
url=>$url,
title=>$title);
=cut
sub get_chapter {
my $self = shift;
my %args = (
base=>'',
count=>0,
url=>'',
title=>'',
@_
);
my $content = $self->get_page($args{url});
my ($story, $title) = $self->extract_story(%args, content=>$content);
my $chapter_title = $self->parse_ch_title(content=>$content);
$chapter_title = $title if !$chapter_title;
my $html = $self->tidy(story=>$story, title=>$chapter_title);
my %wc = $self->wordcount(content=>$html);
#
# Write the file
#
my $filename = ($args{count}
? sprintf("%s%02d.html", $args{base}, $args{count})
: sprintf("%s.html", $args{base}));
my $ofh;
open($ofh, ">", $filename) || die "Can't write to $filename";
print $ofh $html;
close($ofh);
return (
filename=>$filename,
title=>$chapter_title,
wordcount=>$wc{words},
charcount=>$wc{chars},
);
} # get_chapter
=head2 get_epub
Get the EPUB version of the story, tidy it,
and save it to a file.
$filename = $obj->get_epub(base=>$basename,
url=>$url);
=cut
sub get_epub {
my $self = shift;
my %args = (
base=>'',
url=>'',
meta=>undef,
@_
);
my %meta = %{$args{meta}};
my $content = $self->get_page($args{url});
my %epub_info = ();
#
# Write the file
#
my $filename = $args{base} . '.epub';
my $ofh;
open($ofh, ">", $filename) || die "Can't write to $filename";
print $ofh $content;
close($ofh);
$epub_info{filename} = $filename;
#
# Update the file metadata
#
my $zip = Archive::Zip->new();
my $status = $zip->read( $filename );
if ($status != AZ_OK)
{
return %epub_info;
}
my @members = $zip->membersMatching('.*\.opf');
if (@members && $members[0])
{
my %values = ();
my $opf = $zip->contents($members[0]);
my $dom = XML::LibXML->load_xml(string => $opf,
load_ext_dtd => 0,
no_network => 1);
my @metanodes = $dom->getElementsByLocalName('metadata');
foreach my $metanode (@metanodes)
{
if ($metanode->hasChildNodes)
{
my @children = $metanode->childNodes();
foreach my $node (@children)
{
$self->epub_parse_one_node(%args,
node=>$node,
values=>\%values);
}
}
}
print STDERR "get_epub: about to replace description\n" if $self->{debug};
$self->epub_replace_description(description=>$meta{summary}, xml=>$dom);
# remove meta info we don't want to be added to this
delete $meta{description};
delete $meta{title};
delete $meta{chapters};
delete $meta{epub_url};
delete $meta{basename};
delete $meta{toc_first};
$self->epub_add_meta(meta=>\%meta, xml=>$dom);
my $str = $dom->toString;
$zip->contents($members[0], $str);
$zip->overwrite();
}
return %epub_info;
} # get_epub
=head2 epub_replace_description
Replace or add the description to an EPUB file.
=cut
sub epub_replace_description {
my $self = shift;
my %args = @_;
my $dom = $args{xml};
my $desc = $args{description};
print STDERR "epub_replace_description: description=$desc\n" if $self->{debug};
my @metanodes = $dom->getElementsByLocalName('metadata');
return unless @metanodes;
my $metanode = $metanodes[0];
my @dnodes = $metanode->getElementsByLocalName('description');
if ($dnodes[0])
{
$metanode->removeChild($dnodes[0]);
}
$metanode->appendTextChild('dc:description', $desc);
} # epub_replace_description
=head2 epub_add_meta
Add the given meta-data to an EPUB file.
=cut
sub epub_add_meta {
my $self = shift;
my %args = @_;
my $dom = $args{xml};
my @metanodes = $dom->getElementsByLocalName('metadata');
return unless @metanodes;
my $metanode = $metanodes[0];
my %meta = %{$args{meta}};
foreach my $key (sort keys %meta)
{
my $chunk=<<EOT;
<meta name="$key" content="$meta{$key}"/>
EOT
$metanode->appendWellBalancedChunk( $chunk );
}
} # epub_add_meta
=head2 epub_parse_one_node
Parse a node of meta-information from an EPUB file.
=cut
sub epub_parse_one_node {
my $self = shift;
my %params = @_;
my $node = $params{node};
my $oldvals = $params{values};
my %newvals = ();
my $name = $node->localname;
return undef unless $name;
my $value = $node->textContent;
$value =~ s/^\s+//s;
$value =~ s/\s+$//s;
$value =~ s/\s\s+/ /gs;
if ($name eq 'meta' and $node->hasAttributes)
{
my $metaname = '';
my $metacontent = '';
my @atts = $node->attributes();
foreach my $att (@atts)
{
my $n = $att->localname;
my $v = $att->textContent;
$v =~ s/^\s+//s;
$v =~ s/\s+$//s;
if ($n eq 'name')
{
$metaname = $v;
}
else
{
$metacontent = $v;
}
}
$newvals{$metaname} = $metacontent;
}
elsif ($node->hasAttributes)
{
$newvals{$name}->{text} = $value unless !$value;
my @atts = $node->attributes();
foreach my $att (@atts)
{
my $n = $att->localname;
my $v = $att->textContent;
$v =~ s/^\s+//s;
$v =~ s/\s+$//s;
$newvals{$name}->{$n} = $v;
}
}
else
{
$newvals{$name} = $value;
}
# Don't want to overwrite existing values
foreach my $newname (sort keys %newvals)
{
my $newval = $newvals{$newname};
if (!ref $newval)
{
if (!exists $oldvals->{$newname})
{
$oldvals->{$newname} = $newval;
}
elsif (!ref $oldvals->{$newname})
{
my $v = $oldvals->{$newname};
$oldvals->{$newname} = [$v, $newval];
}
elsif (ref $oldvals->{$newname} eq 'ARRAY')
{
push @{$oldvals->{$newname}}, $newval;
}
else
{
$oldvals->{$newname}->{$newval} = $newval;
}
}
else
{
if (!exists $oldvals->{$newname})
{
$oldvals->{$newname} = $newval;
}
elsif (ref $oldvals->{$newname} eq 'ARRAY')
{
push @{$oldvals->{$newname}}, $newval;
}
else
{
my $v = $oldvals->{$newname};
$oldvals->{$newname} = [$v, $newval];
}
}
}
} # epub_parse_one_node
=head2 wordcount
Figure out the word-count.
=cut
sub wordcount {
my $self = shift;
my %args = (
@_
);
#
# Count the words
#
my $stripped = $self->{stripper}->parse($args{content});
$self->{stripper}->eof;
$stripped =~ s/[\n\r]/ /sg; # remove line splits
$stripped =~ s/^\s+//;
$stripped =~ s/\s+$//;
$stripped =~ s/\s+/ /g; # remove excess whitespace
my @words = split(' ', $stripped);
my $wordcount = @words;
my $chars = length($stripped);
if ($self->{debug})
{
my $orig_length = length($args{content});
print "orig_length=$orig_length, words=$wordcount, chars=$chars\n";
if ($wordcount < 200) # too short!
{
print "====== stripped ======\n$stripped\n======\n";
}
}
return (
words=>$wordcount,
chars=>$chars,
);
} # wordcount
=head2 build_toc
Build a local table-of-contents file from the meta-info about the story.
$self->build_toc(info=>\%info);
=cut
sub build_toc {
my $self = shift;
my %args = (
@_
);
my $info = $args{info};
my $filename = sprintf("%s00.html", $info->{basename});
my $html;
my $characters = (ref $info->{characters}
? join( ', ', @{$info->{characters}} )
: $info->{characters});
my $universe = (ref $info->{universe}
? join( ', ', @{$info->{universe}} )
: $info->{universe});
$html = <<EOT;
<html>
<head><title>$info->{title}</title></head>
<body>
<h1>$info->{title}</h1>
<p>by $info->{author}</p>
<p>Fetched from <a href="$info->{url}">$info->{url}</a></p>
<p><b>Summary:</b>
$info->{summary}
</p>
<p><b>Words:</b> $info->{wordcount}<br/>
<b>Universe:</b> $universe</p>
<b>Characters:</b> $characters</p>
<ol>
EOT
my @storyfiles = @{$info->{storyfiles}};
my @ch_titles = @{$info->{chapter_titles}};
my @ch_wc = @{$info->{chapter_wc}};
for (my $i=0; $i < @storyfiles; $i++)
{
$html .= sprintf("<li><a href=\"%s\">%s</a> (%d)</li>",
$storyfiles[$i],
$ch_titles[$i],
$ch_wc[$i]);
}
$html .= "\n</ol>\n</body></html>\n";
my $ofh;
open($ofh, ">", $filename) || die "Can't write to $filename";
print $ofh $html;
close($ofh);
return $filename;
} # build_toc
=head2 build_epub
Create an EPUB file from the story files and meta information.
$self->build_epub()
=cut
sub build_epub {
my $self = shift;
my %args = (
@_
);
my $info = $args{info};
my $epub = EBook::EPUB->new;
$epub->add_title($info->{title});
$epub->add_author($info->{author});
$epub->add_description($info->{summary});
$epub->add_language('en');
$epub->add_source($info->{url}, 'URL');
$epub->add_date($info->{fetch_date}, 'fetched');
# Add Subjects and additional Meta
# Also build up the title-page
my $info_str =<<EOT;
<h1>$info->{title}</h1>
<p>by $info->{author}</p>
<p><b>Fetched from:</b> $info->{url}</p>
<p><b>Summary:</b> $info->{summary}</p>
<p>
EOT
my %know = %{$info};
delete $know{title};
delete $know{author};
delete $know{summary};
delete $know{url};
delete $know{fetch_date};
delete $know{basename};
delete $know{chapter_titles};
delete $know{chapter_wc};
delete $know{chapters};
delete $know{storyfiles};
foreach my $key (sort keys %know)
{
if (!$know{$key})
{
next;
}
if (!ref $know{$key})
{
$info_str .= sprintf("<b>%s:</b> %s<br/>\n", $key, $know{$key});
if ($know{$key} =~ /,\s*/)
{
my @array = split(/,\s*/, $know{$key});
foreach my $v (@array)
{
if ($key =~ /^(?:category|story_length)$/)
{
$epub->add_subject($v);
}
else
{
$epub->add_meta_item($key, $v);
}
}
}
else
{
if ($key =~ /^(?:category|story_length)$/)
{
$epub->add_subject($know{$key});
}
else
{
$epub->add_meta_item($key, $know{$key});
}
}
}
else
{
$info_str .= sprintf("<b>%s:</b> %s<br/>\n", $key, join(', ', @{$know{$key}}));
foreach my $cat (@{$know{$key}})
{
if ($key =~ /^(?:category|story_length)$/)
{
$epub->add_subject($cat);
}
else
{
$epub->add_meta_item($key, $cat);
}
}
}
}
$info_str .= "</p>\n";
my $titlepage = $self->tidy(story=>$info_str, title=>$info->{title});
my $play_order = 1;
my $id;
$id = $epub->add_xhtml("title.html", $titlepage);
# Add top-level nav-point
my $navpoint = $epub->add_navpoint(
label => "ToC",
id => $id,
content => "title.html",
play_order => $play_order # should always start with 1
);
my @storyfiles = @{$info->{storyfiles}};
my @ch_titles = @{$info->{chapter_titles}};
for (my $i=0; $i < @storyfiles; $i++)
{
$play_order++;
$id = $epub->copy_xhtml($storyfiles[$i], $storyfiles[$i]);
my $navpoint = $epub->add_navpoint(
label => $ch_titles[$i],
id => $id,
content => $storyfiles[$i],
play_order => $play_order,
);
}
my $epub_file = $info->{basename} . '.epub';
$epub->pack_zip($epub_file);
# now unlink the storyfiles
for (my $i=0; $i < @storyfiles; $i++)
{
unlink $storyfiles[$i];
}
return $epub_file;
} # build_epub
=head2 tidy_chars
Remove nasty encodings.
$content = $self->tidy_chars($content);
=cut
sub tidy_chars {
my $self = shift;
my $string = shift;
# numeric entities
$string =~ s/ //sg;
$string =~ s/'/'/sg;
$string =~ s/"/"/sg;
$string =~ s/-/-/sg;
$string =~ s/ / /sg;
#-------------------------------------------------------
# from Catalyst::Plugin::Params::Demoronize
zap_cp1252($string);
my %replace_map = (
'\302' => '',
'\240' => ' ',
);
foreach my $replace (keys(%{replace_map})) {
my $rr = $replace_map{$replace};
$string =~ s/$replace/$rr/g;
}
#-------------------------------------------------------
# from demoronizser
# http://www.fourmilab.ch/webtools/demoroniser/
#-------------------------------------------------------
# Supply missing semicolon at end of numeric entity if
# Billy's bozos left it out.
$string =~ s/(&#[0-2]\d\d)\s/$1; /g;
# Fix dimbulb obscure numeric rendering of < > &
$string =~ s/\&\#038;/&/g;
$string =~ s/\&\#39;/‘/g;
$string =~ s/\&\#060;/</g;
$string =~ s/\&\#062;/>/g;
# Translate Unicode numeric punctuation characters
# into ISO equivalents
$string =~ s/‐/-/sg; # 0x2010 Hyphen
$string =~ s/‑/-/sg; # 0x2011 Non-breaking hyphen
$string =~ s/–/-/sg; # 0x2013 En dash
$string =~ s/—/--/sg; # 0x2014 Em dash
$string =~ s/―/--/sg; # 0x2015 Horizontal bar/quotation dash
$string =~ s/‖/||/sg; # 0x2016 Double vertical line
$string =~ s-‗-_-sg; # 0x2017 Double low line
$string =~ s/‘/`/sg; # 0x2018 Left single quotation mark
$string =~ s/’/'/sg; # 0x2019 Right single quotation mark
$string =~ s/‚/,/sg; # 0x201A Single low-9 quotation mark
$string =~ s/‛/`/sg; # 0x201B Single high-reversed-9 quotation mark
$string =~ s/“/"/sg; # 0x201C Left double quotation mark
$string =~ s/”/"/sg; # 0x201D Right double quotation mark
$string =~ s/„/,,/sg; # 0x201E Double low-9 quotation mark
$string =~ s/‟/"/sg; # 0x201F Double high-reversed-9 quotation mark
$string =~ s/•/*/sg; # 0x2022 Bullet
$string =~ s/‣/*/sg; # 0x2023 Triangular bullet
$string =~ s/․/./sg; # 0x2024 One dot leader
$string =~ s/‥/../sg; # 0x2026 Two dot leader
$string =~ s/…/.../sg; # 0x2026 Horizontal ellipsis
$string =~ s/‧/·/sg; # 0x2027 Hyphenation point
#-------------------------------------------------------
# and somehow some of the entities go funny
$string =~ s/\&\#133;/.../g;
$string =~ s/\ / /g;
$string =~ s/\‘/'/g;
$string =~ s/\’/'/g;
$string =~ s/\“/"/g;
$string =~ s/\”/"/g;
$string =~ s/\"/"/g;
$string =~ s/\–/-/g;
$string =~ s/\…/.../g;
# replace double-breaks with <p>
$string =~ s#<br\s*\/?>\s*<br\s*\/?>#\n<p>#sg;
# remove other cruft
$string =~ s#<wbr>##sg;
$string =~ s#</wbr>##sg;
$string =~ s#<wbr/>##sg;
$string =~ s#<nobr>##sg;
# Clean unwanted MS-Word HTML
$string =~ s#<!--\[if gte mso \d*\]>.*?<!\[endif\]-->##sg;
$string =~ s#<!--\[if !mso\]>.*?<!\[endif\]-->##sg;
$string =~ s!<[/]?(font|span|xml|del|ins|[ovwxp]:\w+|st\d:\w+)[^>]*?>!!igs;
$string =~ s!<([^>]*)(?:lang|style|size|face|[ovwxp]:\w+)=(?:'[^']*'|""[^""]*""|[^\s>]+)([^>]*)>!<$1$2>!isg;
$string =~ s/\s*class="Banner[0-9]+"//g;
$string =~ s/\s*class="Textbody"//g;
$string =~ s/\s*class="MsoNormal"//g;
$string =~ s/\s*class="MsoBodyText"//g;
return $string;
} # tidy_chars
1; # End of WWW::FetchStory::Fetcher
__END__