The Perl and Raku Conference 2025: Greenville, South Carolina - June 27-29 Learn more

use Moo;
use Types::Standard qw(InstanceOf);
# The concrete extractor this object delegates to.
#
# The attribute is optional and lazy: when no extractor is supplied to the
# constructor, it is built on first access by _build_extractor.  The four
# article-field accessors listed under `handles` are forwarded to it.
has extractor => (
    is       => 'lazy',
    builder  => 1,
    required => 0,

    # Any one of these classes (or a subclass of one) is accepted.
    isa => InstanceOf[ qw(
        NewsExtractor::CSSExtractor
        NewsExtractor::JSONLDExtractor
        NewsExtractor::SiteSpecificExtractor
        NewsExtractor::GenericExtractor
    ) ],

    handles => [
        'headline',
        'dateline',
        'journalist',
        'content_text',
    ],
);
# Host name => name of the extractor class instantiated for pages served
# from that host.  Keys are exact, lower-case host names.
use constant SiteSpecificExtractorByHost => {

    # Hosts mapped to NewsExtractor::JSONLDExtractor.
    'video.udn.com'  => 'NewsExtractor::JSONLDExtractor',
    'www.ctwant.com' => 'NewsExtractor::JSONLDExtractor',

    # Several hosts sharing a single extractor class.
    'stars.udn.com'      => 'NewsExtractor::SiteSpecificExtractor::UDN',
    'money.udn.com'      => 'NewsExtractor::SiteSpecificExtractor::UDN',
    'house.udn.com'      => 'NewsExtractor::SiteSpecificExtractor::UDN',
    'www.ettoday.net'    => 'NewsExtractor::SiteSpecificExtractor::ETtoday',
    'star.ettoday.net'   => 'NewsExtractor::SiteSpecificExtractor::ETtoday',
    'house.ettoday.net'  => 'NewsExtractor::SiteSpecificExtractor::ETtoday',
    'health.ettoday.net' => 'NewsExtractor::SiteSpecificExtractor::ETtoday',

    # One extractor class per host.
    'www.chinatimes.com'   => 'NewsExtractor::SiteSpecificExtractor::ChinaTimes',
    'www.epochtimes.com'   => 'NewsExtractor::SiteSpecificExtractor::www_epochtimes_com',
    'www.hkcnews.com'      => 'NewsExtractor::SiteSpecificExtractor::www_hkcnews_com',
    'www.thestandnews.com' => 'NewsExtractor::SiteSpecificExtractor::www_thestandnews_com',
    'www.allnews.tw'       => 'NewsExtractor::SiteSpecificExtractor::www_allnews_tw',
    'www.rvn.com.tw'       => 'NewsExtractor::SiteSpecificExtractor::www_rvn_com_tw',
    'www.peopo.org'        => 'NewsExtractor::SiteSpecificExtractor::www_peopo_org',
    'www.ntdtv.com'        => 'NewsExtractor::SiteSpecificExtractor::www_ntdtv_com',
    'www.ksnews.com.tw'    => 'NewsExtractor::SiteSpecificExtractor::www_ksnews_com_tw',
    'news.tvbs.com.tw'     => 'NewsExtractor::SiteSpecificExtractor::news_tvbs_com_tw',
    'www.taipeitimes.com'  => 'NewsExtractor::SiteSpecificExtractor::www_taipeitimes_com',
    'www.rti.org.tw'       => 'NewsExtractor::SiteSpecificExtractor::www_rti_org_tw',
    'www.bcc.com.tw'       => 'NewsExtractor::SiteSpecificExtractor::www_bcc_com_tw',
    'www.setn.com'         => 'NewsExtractor::SiteSpecificExtractor::www_setn_com',
    'news.tnn.tw'          => 'NewsExtractor::SiteSpecificExtractor::news_tnn_tw',
    'turnnewsapp.com'      => 'NewsExtractor::SiteSpecificExtractor::turnnewsapp_com',
    'news.cts.com.tw'      => 'NewsExtractor::SiteSpecificExtractor::news_cts_com_tw',
    'estate.ltn.com.tw'    => 'NewsExtractor::SiteSpecificExtractor::estate_ltn_com_tw',
    'www.upmedia.mg'       => 'NewsExtractor::SiteSpecificExtractor::www_upmedia_mg',
    'ctee.com.tw'          => 'NewsExtractor::SiteSpecificExtractor::ctee_com_tw',
    'news.ebc.net.tw'      => 'NewsExtractor::SiteSpecificExtractor::news_ebc_net_tw',
    'newnet.tw'            => 'NewsExtractor::SiteSpecificExtractor::newnet_tw',
    'www.ttv.com.tw'       => 'NewsExtractor::SiteSpecificExtractor::www_ttv_com_tw',
    'www.idn.com.tw'       => 'NewsExtractor::SiteSpecificExtractor::www_idn_com_tw',
    'www.fountmedia.io'    => 'NewsExtractor::SiteSpecificExtractor::www_fountmedia_io',
};

# Host name => CSS selectors for the four article fields (headline,
# dateline, journalist, content_text).  _build_extractor passes the
# selected rule set to NewsExtractor::CSSRuleSet->new.
use constant CSSRuleSetByHost => {
    'www.taiwannews.com.tw' => {
        headline     => 'h1.article-title',
        dateline     => 'div.article-date',
        journalist   => 'div.article-author',
        content_text => 'article.article',
    },

    'udn.com' => {
        headline     => 'h1.article-content__title',
        dateline     => '.authors time.article-content__time',
        journalist   => '.authors span.article-content__author',
        content_text => 'div.article-content__paragraph section.article-content__editor',
    },

    'www.nownews.com' => {
        headline     => 'h1.entry-title',
        dateline     => 'span.td-post-date:nth-child(2) > time:nth-child(1)',
        journalist   => '.td-post-author-name',
        content_text => '.td-post-content > span[itemprop=articleBody]',
    },

    'www.enewstw.com' => {
        headline     => 'td.blog_title > strong',
        dateline     => 'td.blog_title tr:nth-child(2) > td.blog',
        journalist   => 'td.blog_title tr:nth-child(1) > td.blog',
        content_text => 'td.new_t p',
    },

    'www.storm.mg' => {
        headline     => 'h1#article_title',
        dateline     => 'span#info_time',
        journalist   => '#article_info_wrapper #author_block a.link_author > span.info_author',
        content_text => 'div#article_inner_wrapper > article:nth-child(1)',
    },
};
# Builder for the lazy `extractor` attribute.
#
# Picks an extractor for the current transaction from the host of its
# request URL, in this order of precedence:
#   1. a CSS rule set registered in CSSRuleSetByHost
#      (wrapped in NewsExtractor::CSSExtractor),
#   2. an extractor class registered in SiteSpecificExtractorByHost,
#   3. NewsExtractor::GenericExtractor as the fallback.
#
# Returns the newly constructed extractor object.
sub _build_extractor {
    my ($self) = @_;

    my $tx   = $self->tx;
    my $host = $tx->req->url->host;

    # Host names are case-insensitive, but both lookup tables are keyed in
    # lower case, so normalise before looking up.  A URL without a host
    # yields undef; map that to '' so the lookups miss quietly (and fall
    # through to the generic extractor) instead of emitting
    # "uninitialized value" warnings.
    $host = defined $host ? lc $host : '';

    if (my $css_rules = CSSRuleSetByHost->{$host}) {
        return NewsExtractor::CSSExtractor->new(
            css_selector => NewsExtractor::CSSRuleSet->new(%$css_rules),
            tx           => $tx,
        );
    }

    if (my $extractor_class = SiteSpecificExtractorByHost->{$host}) {
        return $extractor_class->new( tx => $tx );
    }

    return NewsExtractor::GenericExtractor->new( tx => $tx );
}
1;