use
Importer
'NewsExtractor::TextUtil'
=>
qw( normalize_whitespace html2text )
;
use
Importer
'NewsExtractor::Constants'
=>
qw( %RE )
;
has
site_name
=> (
is
=>
"lazy"
,
isa
=> Maybe[Str],
);
no
Moo;
sub
_build_site_name {
my
(
$self
) =
@_
;
my
$el
=
$self
->dom->at(
"meta[property='og:site_name']"
);
if
(
$el
) {
return
$el
->attr(
'content'
);
}
return
undef
;
}
sub
headline {
my
(
$self
) =
@_
;
my
$site_name
=
$self
->site_name;
my
(
$title
,
$el
);
my
$dom
=
$self
->dom;
if
(
$el
=
$dom
->at(
"#story #news_title, #news_are .newsin_title, .data_midlle_news_box01 dl td:first-child"
)) {
$title
=
$el
->text;
}
elsif
(
$el
=
$dom
->at(
"meta[property='og:title']"
)) {
$title
=
$el
->attr(
"content"
);
}
elsif
(
$el
=
$dom
->at(
"meta[name='title']"
)) {
$title
=
$el
->attr(
'content'
);
}
elsif
(
$el
=
$dom
->at(
"title"
)) {
$title
=
$el
->text;
}
else
{
return
;
}
$title
.=
""
;
if
(
$site_name
) {
$title
=~ s/\s* \p{Punct} \s*
$site_name
\s* \z//x;
}
if
(
defined
(
$title
)) {
my
$delim
=
qr<(?: \p{Punct} | \| )>
x;
$title
=~ s/ \s*
$delim
\s*
$RE
{newspaper_names} \s* \z//x;
$title
=~ s/\A
$RE
{newspaper_names} \s*
$delim
\s* //x;
$title
=~ s/\r\n/\n/g;
$title
=~ s/\A\s+//;
$title
=~ s/\s+\z//;
}
return
$title
;
}
sub
dateline {
my
(
$self
) =
@_
;
my
$dateline
;
my
$guess
;
my
$dom
=
$self
->dom;
if
(
$guess
=
$dom
->at(
"meta[property='article:modified_time'], meta[property='article:published_time'], meta[itemprop=dateModified][content], meta[itemprop=datePublished][content]"
)) {
$dateline
=
$guess
->attr(
'content'
);
}
elsif
(
$guess
=
$dom
->at(
"time[itemprop=datePublished][datetime], h1 time[datetime], .func_time time[pubdate]"
)) {
$dateline
=
$guess
->attr(
'datetime'
);
}
elsif
(
$guess
=
$dom
->at(
".reporter time, span.time, span.viewtime, header.article-desc time, .timeBox .updatetime span, .caption div.label-date, .contents_page span.date, .main-content span.date, .newsin_date, .news .date, .author .date, ul.info > li.date > span:nth-child(2), #newsHeadline span.datetime, article p.date, .post-meta > .icon-clock > span, .article_info_content span.info_time, .content time.page-date, .c_time, .newsContent p.time, .story_bady_info_author span:nth-child(1), div.title > div.time, div.article-meta div.article-date, address.authorInfor time, .entry-meta .date a, .author-links .posts-date, .top_title span.post_time, .mid-news > .m-left-side > .maintype-wapper > h2, .node-inner > .submitted > span"
)) {
$dateline
=
$guess
->text;
}
elsif
(
$guess
=
$dom
->at(
"div#articles cite"
)) {
$guess
->at(
"a"
)->remove;
$dateline
=
$guess
->text;
}
elsif
(
$guess
=
$dom
->at(
"article.ndArticle_leftColumn div.ndArticle_creat, ul.info li.date, .cpInfo .cp, .nsa3 .tt27, .fncnews-content > .info > span.small-gray-text"
)) {
(
$dateline
) =
$guess
->text =~ m
}
elsif
(
$guess
=
$dom
->at(
".news-toolbar .news-toolbar__cell"
)) {
(
$dateline
) =
$guess
->text =~ m
}
elsif
(
$guess
=
$dom
->at(
".content .writer span:nth-child(2)"
)) {
(
$dateline
) =
$guess
->text =~ m
}
elsif
(
$guess
=
$dom
->at(
"div.contentBox div.content_date"
)) {
(
$dateline
) =
$guess
->text =~ m
}
elsif
(
$guess
=
$dom
->at(
"div.detitle2 > div.cell > div"
)) {
(
$dateline
) =
$guess
->text =~ m
}
elsif
(
$guess
=
$dom
->at(
"div.content-wrapper-right > div > div > div:nth-child(4), span.f12_15a_g2"
)) {
(
$dateline
) =
$guess
->text =~ m
}
elsif
(
$guess
=
$dom
->at(
"span#ctl00_ContentPlaceHolder1_News_Label, #ctl00_ContentPlaceHolder1_UpdatePanel2 font[color=darkred]"
)) {
(
$dateline
) =
$guess
->text =~ m
}
elsif
(
$guess
=
$dom
->at(
".news-info dd.date:nth-child(6)"
)) {
(
$dateline
) =
$guess
->text =~ m
}
elsif
(
$guess
=
$dom
->at(
"article.entry-content div:nth-child(2)"
)) {
(
$dateline
) =
$guess
->text =~ m
}
elsif
(
$guess
=
$dom
->at(
"span.submitted-by"
)) {
(
$dateline
) =
$guess
->text =~ m
}
elsif
(
$guess
=
$dom
->at(
'#story #news_author'
)) {
(
$dateline
) =
$guess
->all_text =~ m{\A 【記者.+ 】 (.+) \z}x;
}
elsif
(
$guess
=
$dom
->at(
'.data_midlle_news_box01 dl dd ul li:first-child'
)) {
(
$dateline
) =
$guess
->text;
my
(
$year
,
$mmdd
) =
$dateline
=~ /\A ([0-9]{3}) - (.+) \z /x;
$year
+= 1911;
$dateline
=
$year
.
'-'
.
$mmdd
;
}
elsif
(
$guess
=
$dom
->at(
'#details_block .left .date, .article_header > .author > span:last-child'
)) {
$dateline
= normalize_whitespace
$guess
->text;
}
if
(
$dateline
) {
$dateline
= normalize_whitespace(
$dateline
);
if
(
$dateline
=~ /^([0-9]{4})[^0-9]/) {
if
($1 > ((
localtime
)[5] + 1900)) {
$dateline
=
undef
;
}
}
}
return
$dateline
;
}
sub
journalist {
my
(
$self
) =
@_
;
my
$dom
=
$self
->dom;
my
(
$ret
,
$guess
);
if
(
$guess
=
$dom
->at(
'meta[property="og:article:author"]'
) ) {
$ret
=
$guess
->attr(
'content'
);
}
elsif
(
$guess
=
$dom
->at(
'meta[name="author"]'
) ) {
$ret
=
$guess
->attr(
'content'
);
}
elsif
(
$guess
=
$dom
->at(
'div.field-item a[href^=/author/], div.content_reporter a[itemprop=author], span[itemprop=author] a, div.author div.intro a div.name, div.article-author > h5 > a, div.article-meta > div.article-author > a, div.authorInfo li.authorName > a, .article .writer > p, .info_author, .news-info dd[itemprop=author], .content_reporter a, .top_title span.reporter_name, .post-heading time span, header .article-meta .article-author, .article_header > .author > span:first-child, .mid-news > .m-left-side > .maintype-wapper > .subtype-sort, .newsCon > .newsInfo > span:first-child, .newsdetail_content > .title > h4 > a[href^="/news/searchresult/news?search_text="], .m-from-author > .m-from-author__name'
) ) {
$ret
=
$guess
->text;
}
elsif
(
$guess
=
$dom
->at(
'.story_bady_info_author'
)) {
if
(
$guess
->find(
'a'
)->size() == 0) {
$ret
=
$guess
->text;
}
else
{
$ret
=
$guess
->find(
'a'
)->
map
(
sub
{ normalize_whitespace(
$_
->text ) })->
join
(
', '
) .
""
;
}
}
elsif
(
$guess
=
$dom
->at(
'span.f12_15a_g2'
)) {
(
$ret
) =
$guess
->text =~ m{/記者 (.+?)/};
}
elsif
(
$guess
=
$dom
->at(
'div#yt_container_placeholder + p'
)) {
(
$ret
) =
$guess
->text =~ m{\A \s* (.+) \s+ 報導 \s+ / }x;
}
elsif
(
$guess
=
$dom
->at(
'h4.font_color5'
)) {
(
$ret
) =
$guess
->all_text =~ m{\A \s* 編輯 \s* (.+) \s+ 報導 }x;
}
elsif
(
$guess
=
$dom
->at(
'#story #news_author'
)) {
(
$ret
) =
$guess
->all_text =~ m{\A 【 (記者 .+) 】}x;
}
elsif
(
$guess
=
$dom
->at(
'#details_block .left .name, .articleMain .article-author a.author-title, .article__credit a[href^="/author/"], span[itemprop=author] span[itemprop=name], .post-header-additional .post-meta-info a.nickname'
)) {
$ret
=
$guess
->text;
}
elsif
(
$guess
=
$dom
->at(
'div.single-post-meta a[rel="author"]'
)) {
(
$ret
) =
$guess
->text =~ m<^工商時報 (.+)\z>x;
}
$ret
=
undef
if
(
$ret
&& is_NewspaperName(
$ret
));
if
( !
$ret
&& (
my
$content_text
=
$self
->content_text)) {
my
@patterns
= (
qr<\b (?:特[約派])? [记記]者 \s* ([\s\p{Letter}、]+?) \s* [/╱/] \s* (?: 特稿 | 專訪 | \p{Letter}+ (?:報導|报导)) \b>
xs,
qr<\A 【(記者.+?報導)】>
x,
qr<\A 中評社 .+? \d+ 月 \d+ 日電(記者(.+?))>
x,
qr<\A ( 記者[^/]+/.+?電 )>
x,
qr<\A 匯流新聞網記者 (\p{Letter}+) /(?:\p{Letter}+)報導 >
x,
qr<\A 匯流新聞網記者\s*/\s*(\p{Letter}+)綜合報導>
x,
qr<((中央社[记記]者 \S+ 日 專?[電电] | 大纪元记者\p{Letter}+报导 | 記者.+?報導/.+?))>
x,
qr< \( ( \p{Letter}+ / \p{Letter}+ 報導 ) \) >
x,
qr<\A 文:記者(\p{Letter}+) \n>
x,
qr<( (譯者:.+?/核稿:.+?) )[0-9]+(?:\n|\z)>
x,
qr< \(記者 (.+?) \) \z >
x,
qr<^(編譯[^/]+?/.+?報導)$>
xsm,
qr<(( (?:譯者|編輯):.+) ) (?:[0-9]{7})? \z >
x,
qr<(記者 (\p{Letter}+) ) \z>
x,
qr< (記者 (\p{Letter}+) 綜合報導)\s+ ( (責任編輯:\p{Letter}+) ) \z>
x,
qr< ( (責任編輯:\p{Letter}+) )\z>
x,
qr< \s (公民記者 .+ 採訪報導) \z>
x,
qr<\A 【大成報記者 (\p{Letter}+) / .+報導】 >
x,
qr<\A 記者 (\p{Letter}+) /報導 >
x,
qr<\A \[ (記者.+報導) \] >
x,
qr<\A ( (記者.+報導) ) >
x,
qr<\A 【(本報記者.+報導)】 >
x,
qr<\b ﹝記者(\p{Letter}+?)/.+?報導﹞ \b>
x,
qr<\A〔新網記者 ( \p{Letter}+ (?:報導|特稿))〕\b>
x,
);
for
my
$pat
(
@patterns
) {
(
$ret
) =
$content_text
=~ m/
$pat
/;
last
if
$ret
;
}
unless
(
$ret
) {
my
(
$guess
) =
$content_text
=~ m{((\p{Letter}+))\z}xsm;
if
(
$guess
&&
$dom
->descendant_nodes->first(
sub
{
$_
->type eq
'text'
&&
$_
->content =~ m<記者${guess}\b> })) {
$ret
=
$guess
}
}
}
if
(
$ret
) {
$ret
= normalize_whitespace(
$ret
);
$ret
=
""
if
is_NewspaperName(
$ret
);
}
return
$ret
;
}
sub
content_text {
my
(
$self
) =
@_
;
my
(
$el
,
$html
);
$self
->dom->find(
'script, style, p.appE1121, div.sexmask, div.cat-list, div#marquee, #setting_weather'
)->
map
(
'remove'
);
my
$extractor
= HTML::ExtractContent->new;
if
(
$el
=
$self
->dom->at(
'article'
)) {
$html
=
$extractor
->extract(
"$el"
)->as_html;
}
else
{
$html
=
$extractor
->extract(
$self
->dom->to_string )->as_html;
}
my
$text
= html2text(
$html
);
my
@paragraphs
=
split
(/\n\n/,
$text
) or
return
undef
;
if
(
my
$site_name
=
$self
->site_name) {
$paragraphs
[-1] =~ s/\A \s* \p{Punct}? \s* ${site_name} \s* \p{Punct}? \s* \z//x;
$paragraphs
[-1] =~ s/${site_name}//x;
}
$paragraphs
[-1] =~ s/\A \s* \p{Punct}? \s*
$RE
{newspaper_names} \s* \p{Punct}? \s* \z//x;
if
(max(
map
{
length
(
$_
) }
@paragraphs
) < 30) {
return
undef
;
}
return
join
"\n\n"
,
@paragraphs
;
}
1;