package NewsExtractor;
our $VERSION = v0.35.0;
use Moo;

use Mojo::UserAgent;
use Mojo::UserAgent::Transactor;
use Try::Tiny;

use Types::Standard qw< Str >;
use Types::URI qw< Uri >;

use Importer 'NewsExtractor::TextUtil' => qw(u);

use NewsExtractor::Error;
use NewsExtractor::Download;

# Address of the article to fetch. Required; the value is validated against
# (and, because of coerce => 1, coerced through) the Uri type constraint.
has url => (
    required => 1,
    is       => 'ro',
    isa      => Uri,
    coerce   => 1,
);

# Value handed to the transactor's name(), i.e. what is sent as the
# User-Agent of the request. Defaults to a desktop Firefox identifier.
has user_agent_string => (
    required => 1,
    is       => 'ro',
    isa      => Str,
    default  => sub {
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:75.0) Gecko/20100101 Firefox/75.0'
    },
);

# download()
#
# Performs a blocking GET of $self->url, following at most 3 redirects.
#
# Returns the two-element list ($error, $download):
#   * $error    - a NewsExtractor::Error when the transaction failed or the
#                 response has an error status; undef otherwise.
#   * $download - a NewsExtractor::Download wrapping the transaction when
#                 the response is not an error; undef otherwise.
#
# Call it in list context. In scalar context (for instance when chaining
# another method call onto the result) only $download is produced and the
# error is lost.
sub download {
    my NewsExtractor $self = shift;

    my $ua = Mojo::UserAgent->new()->transactor(
        Mojo::UserAgent::Transactor->new()->name( $self->user_agent_string )
    )->max_redirects(3);

    my ($error, $download);

    # The URI object is stringified explicitly before being given to get().
    my $tx = $ua->get( "" . $self->url );

    # result() dies when the transaction has a connection-level error, so
    # that failure is captured here and reported through $error instead.
    my $res;
    try {
        $res = $tx->result;
    }
    catch {
        # NOTE(review): a caught exception is recorded with
        # is_exception => 0 -- confirm against NewsExtractor::Error that
        # this is the intended flag value.
        $error = NewsExtractor::Error->new(
            is_exception => 0,
            message      => u($_),
        );
    };

    if ($res) {
        if ($res->is_error) {
            # The reason phrase is undef when the server's status line
            # carries none; fall back to the standard text for the status
            # code so the error always has a defined message.
            $error = NewsExtractor::Error->new(
                is_exception => 0,
                message      => u( $res->message // $res->default_message ),
            );
        }
        else {
            $download = NewsExtractor::Download->new( tx => $tx );
        }
    }

    return ($error, $download);
}

1;

__END__

=head1 NAME

NewsExtractor - download and extract news articles from Internet.

=head1 SYNOPSIS

    my ($error, $download) = NewsExtractor->new( url => $url )->download;
    die $error if $error;

    ($error, my $article) = $download->parse;
    die $error if $error;

    # $article is an instance of NewsExtractor::Article
    say "Headline: " . $article->headline;
    say "When: " . ($article->dateline // "(unknown)");
    say "By: " . ($article->journalist // "(unknown)");
    say "\n" . $article->article_body;

The shorter form

    my ($error, $article) = NewsExtractor->new( url => $url )->download->parse;

also works when the download succeeds, but it calls C<download> in scalar
context: the download error is discarded, and a failed download makes the
chained C<parse> call die because it is invoked on an undefined value.

=head1 ATTRIBUTES

=head2 url

Required. The address of the article to download.

=head2 user_agent_string

The User-Agent string used for the request. Defaults to a desktop Firefox
identifier.

=head1 METHODS

=head2 download

    my ($error, $download) = $extractor->download;

Fetches C<url>, following up to 3 redirects, and returns a two-element list.
On failure C<$error> is a L<NewsExtractor::Error> and C<$download> is
undefined; otherwise C<$error> is undefined and C<$download> is a
L<NewsExtractor::Download>.

=head1 SEE ALSO

L<NewsExtractor::Article>

=head1 AUTHOR

Kang-min Liu <gugod@gugod.org>

=head1 LICENSE

To the extent possible under law, Kang-min Liu has waived all copyright
and related or neighboring rights to NewsExtractor. This work is published
from: Taiwan.

https://creativecommons.org/publicdomain/zero/1.0/

=cut