
#! /usr/bin/env perl
# Planet Jupiter is a feed aggregator that creates a single HTML file
# Copyright (C) 2020–2021 Alex Schroeder <alex@gnu.org>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
package Jupiter;
use Encode;
use utf8;
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
=encoding UTF8
=head1 NAME
jupiter - turn a list of feeds into an HTML page, a river of news
=head1 SYNOPSIS
To update the feeds from one or more OPML files:
B<jupiter update> I<feed.opml> … [I</regex/> …]
To generate F<index.html>:
B<jupiter html> I<feed.opml>
=head1 DESCRIPTION
Planet Jupiter is used to pull together the latest updates from a bunch of other
sites and display them on a single web page, the "river of news". The sites we
get our updates from are defined in an OPML file.
A river of news, according to Dave Winer, is a feed aggregator. New items appear
at the top and old items disappear at the bottom. When it's gone, it's gone.
There is no count of unread items. The goal is to fight the I<fear of missing
out> (FOMO).
Each item looks similar to every other: headline, link, an extract, maybe a date
and an author. Extracts contain only the beginning of the article's text; all
markup is removed, and there are no images. The goal is to make the page easy to skim.
Scroll down until you find something interesting and follow the link to the
original article if you want to read it.
=head2 The OPML file
You B<need> an OPML file. It's an XML file linking to I<feeds>. Here's an
example listing just one feed. In order to add more, add more C<outline>
elements with the C<xmlUrl> attribute. The exact order and nesting does not
matter. People can I<import> these OPML files into their own feed readers and
thus it may make sense to spend a bit more effort in making it presentable.
  <opml version="2.0">
    <body>
      <outline title="Alex Schroeder"
               xmlUrl="https://alexschroeder.ch/wiki?action=rss"/>
    </body>
  </opml>
=head2 Update the feeds in your cache
This is how you update the feeds in a file called C<feed.opml>. It downloads all
the feeds linked to in the OPML file and stores them in the cache directory.
jupiter update feed.opml
The directory used to keep a copy of all the feeds in the OPML file has the same
name as the OPML file but without the .opml extension. In other words, if your
OPML file is called C<feed.opml> then the cache directory is called C<feed>.
This operation takes a long time because it requests an update from all the sites
listed in your OPML file. Don't run it too often or you'll annoy the site
owners.
The OPML file must use the .opml extension. You can update the feeds for
multiple OPML files in one go.
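For example, this updates the caches for three OPML files (the same file names
used in the examples further down) in a single run:

  jupiter update osr.opml indie.opml other.opml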
=head2 Adding just one feed
After a while, the list of feeds in your OPML starts getting unwieldy. When you
add a new feed, you might not want to fetch all of them. In this case, provide a
regular expression surrounded by slashes to the C<update> command:
jupiter update feed.opml /example/
Assuming a feed with a URL or title that matches the regular expression is
listed in your OPML file, only that feed is going to get updated.
There is no need to escape slashes in the regular expression: C<//rss/> works
just fine. Beware shell escaping, however. Most likely, you need to surround the
regular expression with single quotes if it contains spaces:
jupiter update feed.opml '/Halberds & Helmets/'
Notice how we assume that named entities such as C<&amp;> have already been
parsed into the appropriate strings.
=head2 Generate the HTML
This is how you generate the C<index.html> file based on the feeds of your
C<feed.opml>. It assumes that you have already updated all the feeds (see
above).
jupiter html feed.opml
See L</OPTIONS> for ways to change how the HTML is generated.
=head2 Generate the RSS feed
This happens at the same time as the HTML generation. It takes all the entries
that are being added to the HTML and puts them into a feed.
See L</OPTIONS> for ways to change how the feed is generated.
=head2 Why separate the two steps?
The first reason is that tinkering with the templates involves running the
program again and again, and you don't want to contact all the sites whenever
you update your templates.
The other reason is that it allows you to create subsets. For example, you can
fetch the feeds for three different OPML files:
jupiter update osr.opml indie.opml other.opml
And then you can create three different HTML files:
jupiter html osr.html osr.opml
jupiter html indie.html indie.opml
jupiter html rpg.html osr.opml indie.opml other.opml
For an example of how it might look, check out the setup for the planets I run.
=head2 What about the JSON file?
There's a JSON file that gets generated and updated as you run Planet Jupiter.
Its name depends on the OPML files used. It records metadata for every feed in
the OPML file that isn't stored in the feeds themselves.
C<message> is the HTTP status message, or a similar message such as "No entry
newer than 90 days." This is set when updating the feeds in your cache.
C<code> is the HTTP status code; this code could be the real status code from
the server (such as 404 for a "not found" status) or one generated by Jupiter
such that it matches the status message (such as 206 for a "partial content"
status when there aren't any recent entries in the feed). This is set when
updating the feeds in your cache.
C<title> is the site's title. When you update the feeds in your cache, it is
taken from the OPML file. That's how the feed can have a title even if the
download failed. When you generate the HTML, the feeds in the cache are parsed
and if a title is provided, it is stored in the JSON file and overrides the
title in the OPML file.
C<link> is the site's link for humans. When you generate the HTML, the feeds in
the cache are parsed and if a link is provided, it is stored in the JSON file.
If the OPML element contained a C<htmlUrl> attribute, however, that takes
precedence. The reasoning is this: when a podcast is hosted on a platform that
generates a link you don't like, and you know the link to the human-readable
blog elsewhere, you can use the C<htmlUrl> attribute in the OPML file to
override the link taken from the feed.
C<last_modified> and C<etag> are two headers used for caching from the HTTP
response that cannot be changed by data in the feed.
If we run into problems downloading a feed, this setup allows us to still link
to the feeds that aren't working, using their correct names, and describing the
error we encountered.
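To illustrate, here is a sketch of what the JSON file might look like for a
single feed; the URL and all the values are invented for this example:

  {
    "https://example.org/feed.rss": {
      "title": "Example Blog",
      "link": "https://example.org/",
      "message": "OK",
      "code": 200,
      "last_modified": "Mon, 01 Feb 2021 12:00:00 GMT",
      "etag": "\"abc123\""
    }
  }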
=head2 Logging
Use the C<--log=LEVEL> option to set the log level. Valid values for I<LEVEL>
are C<debug>, C<info>, C<warn>, C<error>, and C<fatal>.
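For example, to see debug messages while updating the cache (the option can go
anywhere on the command line):

  jupiter update --log=debug feed.opml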
=head1 LICENSE
GNU Affero General Public License
=head1 INSTALLATION
Using C<cpan>:
cpan App::jupiter
Manual install:
perl Makefile.PL
make
make install
=head2 Dependencies
To run Jupiter on Debian we need:
C<libmodern-perl-perl> for L<Modern::Perl>
C<libmojolicious-perl> for L<Mojo::Template>, L<Mojo::UserAgent>, L<Mojo::Log>,
L<Mojo::JSON>, and L<Mojo::Util>
C<libxml-libxml-perl> for L<XML::LibXML>
C<libfile-slurper-perl> for L<File::Slurper>
C<libdatetime-perl> for L<DateTime>
C<libdatetime-format-mail-perl> for L<DateTime::Format::Mail>
C<libdatetime-format-iso8601-perl> for L<DateTime::Format::ISO8601>
Unfortunately, L<Mojo::UserAgent::Role::Queued> isn't packaged for Debian.
Therefore, let's build it and install it as a Debian package.
sudo apt-get install libmodule-build-tiny-perl
sudo apt-get install dh-make-perl
sudo dh-make-perl --build --cpan Mojo::UserAgent::Role::Queued
dpkg --install libmojo-useragent-role-queued-perl_1.15-1_all.deb
To generate the C<README.md> from the source file, you need F<pod2markdown>,
which comes in the C<libpod-markdown-perl> package.
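For example, assuming the source file in your checkout is called F<jupiter.pl>
(a hypothetical name; adjust it as necessary):

  pod2markdown jupiter.pl README.md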
=head1 FILES
There are a number of files in the F<share> directory which you can use as
starting points.
F<template.html> is the HTML template.
F<default.css> is a small CSS file used by F<template.html>.
F<personalize.js> is a small JavaScript file used by F<template.html> to allow
visitors to jump from one article to the next using C<J> and C<K>.
F<jupiter.png> is used by F<template.html> as the icon.
F<jupiter.svg> is used by F<template.html> as the logo.
F<feed.png> is used by F<template.html> as the icon for the feeds in the
sidebar.
F<feed.rss> is the feed template.
=head1 OPTIONS
HTML generation uses a template, C<template.html>. It is written for
C<Mojo::Template> and you can find it in the F<share> directory of your
distribution. The default templates use other files, such as the logo, the feed
icon, a CSS file, and a small JavaScript snippet to enable navigation using the
C<J> and C<K> keys (see above).
You can specify a different HTML file to generate:
B<jupiter html> I<your.html feed.opml>
If you specify two HTML files, the first is the HTML file to generate and the
second is the template to use. Both must use the C<.html> extension.
B<jupiter html> I<your.html your-template.html feed.opml>
Feed generation uses a template, C<feed.rss>. It writes all the entries into a
file called C<feed.xml>. Again, the template is written for C<Mojo::Template>.
You can specify up to two XML, RSS or Atom files. They must use one of these
three extensions: C<.xml>, C<.rss>, or C<.atom>. The first is the name of the
feed to generate, the second is the template to use:
B<jupiter html> I<atom.xml template.xml planet.html template.html feed.opml>
In the above case, Planet Jupiter will write a feed called F<atom.xml> based on
F<template.xml> and an HTML file called F<planet.html> based on F<template.html>,
using the cached entries matching the feeds in F<feed.opml>.
=cut
use Modern::Perl;
use File::Basename;
use File::Slurper qw(read_binary write_binary read_text write_text);
use List::Util qw(uniq min shuffle);
use Mojo::Log;
use Mojo::JSON qw(decode_json encode_json);
use Mojo::Template;
use Mojo::UserAgent;
use Mojo::Promise;
use Mojo::Util qw(slugify trim xml_escape html_unescape);
use XML::LibXML;
use Pod::Simple::Text;
use DateTime;
use DateTime::Format::Mail;
use DateTime::Format::ISO8601;
use File::ShareDir 'dist_file';
use vars qw($log);
our $log = Mojo::Log->new;
my $xpc = XML::LibXML::XPathContext->new;
$xpc->registerNs('atom', 'http://www.w3.org/2005/Atom');
$xpc->registerNs('html', 'http://www.w3.org/1999/xhtml');
$xpc->registerNs('dc', 'http://purl.org/dc/elements/1.1/');
$xpc->registerNs('itunes', 'http://www.itunes.com/dtds/podcast-1.0.dtd');
my $undefined_date = DateTime->from_epoch( epoch => 0 );
my (%wday, %month, $wday_re, $month_re);
%wday = qw (lun. Mon mar. Tue mer. Wed jeu. Thu ven. Fri sam. Sat dim. Sun);
%month = qw (janv. Jan févr. Feb mars Mar avr. Apr mai May juin Jun
juil. Jul août Aug sept. Sep oct. Oct nov. Nov déc. Dec);
$wday_re = join('|', map { quotemeta } keys %wday) unless $wday_re;
$month_re = join('|', map { quotemeta } keys %month) unless $month_re;
# Our tests don't want to call main
__PACKAGE__->main unless caller;
sub main {
my ($log_level) = grep /^--log=/, @ARGV;
$log->level(substr($log_level, 6)) if $log_level;
my ($command) = grep /^[a-z]+$/, @ARGV;
$command ||= 'help';
if ($command eq 'update') {
update_cache(@ARGV);
} elsif ($command eq 'html') {
make_html(@ARGV);
} else {
my $parser = Pod::Simple::Text->new();
$parser->parse_file($0);
}
}
sub update_cache {
my ($feeds, $files) = read_opml(@_);
make_directories($feeds);
load_feed_metadata($feeds, $files);
my $ua = Mojo::UserAgent->new->with_roles('+Queued')
->max_redirects(3)
->max_active(5);
make_promises($ua, $feeds);
fetch_feeds($feeds);
save_feed_metadata($feeds, $files);
cleanup_cache($feeds);
}
sub make_promises {
my $ua = shift;
my $feeds = shift;
for my $feed (@$feeds) {
my $url = html_unescape $feed->{url}; # undo xml_escape for the request
$ua->on(start => sub {
my ($ua, $tx) = @_;
$tx->req->headers->if_none_match($feed->{etag}) if ($feed->{etag});
$tx->req->headers->if_modified_since($feed->{last_modified}) if ($feed->{last_modified});
});
$feed->{promise} = $ua->get_p($url)
->catch(sub {
$feed->{message} = "@_";
$feed->{code} = 521;
# returning 0 in the case of an error is important
0; })
# sleeping to stop blogger.com from blocking us
->finally(sub { $log->debug($url); sleep 2; });
}
}
sub fetch_feeds {
my $feeds = shift;
$log->info("Fetching feeds...");
Mojo::Promise->all(map { $_->{promise} } @$feeds)->then(sub {
# all returns the values in the same order!
for (my $i = 0; $i < @_; $i++) {
my $feed = $feeds->[$i];
my $value = $_[$i];
my $tx = $value->[0];
# relies on catch returning 0 above
next unless $tx;
$feed->{message} = $tx->result->message;
$feed->{code} = $tx->result->code;
$feed->{last_modified} = $tx->result->headers->last_modified;
$feed->{etag} = $tx->result->headers->etag;
# save raw bytes if this is a success
eval { write_binary($feed->{cache_file}, $tx->result->body) } if $tx->result->is_success;
warn "Unable to write $feed->{cache_file}: $@\n" if $@;
}
})->catch(sub {
warn "Something went wrong: @_";
})->wait;
}
sub load_feed_metadata {
my $feeds = shift;
my $files = shift;
for my $file (@$files) {
my $filename = "$file->{path}/$file->{name}";
next unless -r "$filename.json";
my $data = decode_json read_binary("$filename.json");
# only look at the feeds that came from this OPML file
for my $feed (grep { $_->{opml_file} eq $file->{file} } @$feeds) {
my $url = $feed->{url};
next unless $url;
# don't overwrite title and htmlUrl from OPML file
$feed->{title} = $data->{$url}->{title} if $data->{$url}->{title};
$feed->{link} ||= $data->{$url}->{link};
# all the other metadata is loaded from the JSON file
$feed->{message} = $data->{$url}->{message};
$feed->{code} = $data->{$url}->{code};
$feed->{last_modified} = $data->{$url}->{last_modified};
$feed->{etag} = $data->{$url}->{etag};
}
}
}
sub save_feed_metadata {
my $feeds = shift;
my $files = shift;
for my $file (@$files) {
my $name = $file->{name};
my %messages = map {
my $feed = $_;
$feed->{url} => { map { $_ => $feed->{$_} } grep { $feed->{$_} } qw(title link message code last_modified etag) };
} grep { $_->{opml_file} eq $file->{file} } @$feeds;
write_binary("$file->{path}/$file->{name}.json", encode_json \%messages);
}
}
sub cleanup_cache {
my $feeds = shift;
my %good = map { $_ => 1 } @{cache_files($feeds)};
my @unused = grep { not $good{$_} } @{existing_files($feeds)};
if (@unused) {
$log->info("Removing unused files from the cache...");
foreach (@unused) { $log->info($_) }
unlink @unused;
}
}
sub existing_files {
my $feeds = shift;
my @files;
for my $dir (uniq map { $_->{cache_dir} } @$feeds) {
push(@files, <"$dir/*">) if $dir and -d $dir;
}
return \@files;
}
sub cache_files {
my $feeds = shift;
my @files = map { $_->{cache_file} } @$feeds;
return \@files;
}
sub make_directories {
my $feeds = shift;
for my $dir (uniq map { $_->{cache_dir} } @$feeds) {
if ($dir and not -d $dir) {
mkdir $dir;
}
}
}
sub make_html {
my ($feeds, $files) = read_opml(@_);
load_feed_metadata($feeds, $files); # load messages and codes for feeds
my $globals = globals($files);
my $entries = entries($feeds, 4); # set data for feeds, too
add_data($feeds, $entries); # extract data from the xml
save_feed_metadata($feeds, $files); # save title and link for feeds
$entries = limit($entries, 100);
write_text(html_file(@_), apply_template(read_text(html_template_file(@_)), $globals, $feeds, $entries));
write_text(feed_file(@_), apply_template(read_text(feed_template_file(@_)), $globals, $feeds, $entries));
}
sub html_file {
my ($html) = grep /\.html$/, @_;
return $html || 'index.html';
}
sub html_template_file {
my ($html, $template) = grep /\.html$/, @_;
$template ||= dist_file('App-jupiter', 'template.html');
die "HTML template $template not found\n" unless -r $template;
return $template;
}
sub feed_file {
my ($feed) = grep /\.(xml|rss|atom)$/, @_;
return $feed if $feed;
return 'feed.xml';
}
sub feed_template_file {
my ($feed, $template) = grep /\.(xml|rss|atom)$/, @_;
return $template if $template;
return dist_file('App-jupiter', 'feed.rss');
}
sub apply_template {
my $mnt = Mojo::Template->new;
return $mnt->render(@_);
}
=head1 TEMPLATES
The page template is called with three hash references: C<globals>, C<feeds>,
and C<entries>. The keys of these three hash references are documented below.
The values of these hashes are all I<escaped HTML> except where noted (dates and
file names, for example).
The technical details of how to write the templates are documented in the man
page for L<Mojo::Template>.
=head2 Globals
There are not many global keys.
B<date> is the publication date of the HTML page, in ISO date format:
YYYY-MM-DD.
B<files> is the list of OPML files used.
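As an illustration, here is a minimal template fragment (a sketch, not the
shipped F<template.html>) that prints the date. The program passes the three
hash references as template arguments, in this order:

  % my ($globals, $feeds, $entries) = @_;
  <p>Generated on <%= $globals->{date} %>.</p>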
=cut
sub globals {
my $files = shift;
my $today = DateTime->now->ymd;
return {date => $today, files => $files};
}
=head2 Writing templates for feeds
Feeds have the following keys available:
B<title> is the title of the feed.
B<url> is the URL of the feed (RSS or Atom). This is not the link to the site!
B<link> is the URL of the web page (HTML). This is the link to the site.
B<opml_file> is the file name where this feed is listed.
B<cache_dir> is the directory where this feed is cached.
B<message> is the HTTP status message or other warning or error that we got
while fetching the feed.
B<code> is the HTTP status code we got while fetching the feed.
B<doc> is the L<XML::LibXML::Document>. Could be either Atom or RSS!
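To illustrate, a hypothetical sidebar fragment that lists every feed and notes
any problem we ran into might look like this (the values are already escaped,
so they are inserted as-is):

  % my ($globals, $feeds, $entries) = @_;
  <ul>
  % for my $feed (@$feeds) {
  <li><a href="<%= $feed->{link} %>"><%= $feed->{title} %></a>
  % if ($feed->{code} and $feed->{code} != 200) {
  (<%= $feed->{message} %>)
  % }
  </li>
  % }
  </ul>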
=cut
# Creates the list of feeds. Each feed is a hash with keys title, url, link,
# opml_file, cache_dir and cache_file.
sub read_opml {
my (@feeds, @files);
my @filters = map { decode(locale => substr($_, 1, -1)) } grep /^\/.*\/$/, @_;
for my $file (grep /\.opml$/, @_) {
my $doc = XML::LibXML->load_xml(location => $file); # this better have no errors!
my @nodes = $doc->findnodes('//outline[./@xmlUrl]');
my ($name, $path) = fileparse($file, '.opml', '.xml');
for my $node (@nodes) {
my $title = xml_escape $node->getAttribute('title');
my $url = xml_escape $node->getAttribute('xmlUrl');
next if @filters > 0 and not grep { $url =~ /$_/ or $title =~ /$_/ } @filters;
my $link = xml_escape $node->getAttribute('htmlUrl');
push @feeds, {
title => $title, # title in the OPML file
url => $url, # feed URL in the OPML file
link => $link, # web URL in the OPML file
opml_file => $file,
cache_dir => "$path/$name",
cache_file => "$path/$name/" . slugify($url),
};
}
warn "No feeds found in the OPML file $file\n" unless @nodes;
push @files, { file => $file, path => $path, name => $name };
}
@feeds = shuffle @feeds;
return \@feeds, \@files;
}
sub entries {
my $feeds = shift;
my $limit = shift;
my $date = DateTime->now(time_zone => 'UTC')->subtract( days => 90 ); # compute once
my $now = DateTime->now(time_zone => 'UTC');
my @entries;
for my $feed (@$feeds) {
next unless -r $feed->{cache_file};
my $doc = eval { XML::LibXML->load_xml(recover => 2, location => $feed->{cache_file} )};
if (not $doc) {
$feed->{message} = xml_escape "Parsing error: $@";
$feed->{code} = 422; # unprocessable
next;
}
$feed->{doc} = $doc;
my @nodes = $xpc->findnodes("/rss/channel/item | /atom:feed/atom:entry", $doc);
if (not @nodes) {
$feed->{message} = "Empty feed";
$feed->{code} = 204; # no content
next;
}
# if this is an Atom feed, we need to sort the entries ourselves (older entries at the end)
my @candidates = map {
my $entry = {};
$entry->{element} = $_;
$entry->{id} = id($_);
$entry->{date} = updated($_) || $undefined_date;
$entry;
} @nodes;
@candidates = grep { DateTime->compare($_->{date}, $now) <= 0 } @candidates;
@candidates = unique(sort { DateTime->compare( $b->{date}, $a->{date} ) } @candidates);
@candidates = @candidates[0 .. min($#candidates, $limit - 1)];
# now that we have limited the candidates, let's add more metadata from the feed
for my $entry (@candidates) {
$entry->{feed} = $feed;
# these two are already escaped
$entry->{blog_title} = $feed->{title};
$entry->{blog_url} = $feed->{url};
}
add_age_warning($feed, \@candidates, $date);
push @entries, @candidates;
}
return \@entries;
}
sub add_age_warning {
my $feed = shift;
my $entries = shift;
my $date = shift;
# feed modification date is smaller than the date given
my ($node) = $xpc->findnodes("/rss/channel | /atom:feed", $feed->{doc});
my $feed_date = updated($node);
if ($feed_date and DateTime->compare($feed_date, $date) == -1) {
$feed->{message} = "No feed updates in 90 days";
$feed->{code} = 206; # partial content
return;
} else {
# or no entry found with a modification date equal or bigger than the date given
for my $entry (@$entries) {
return if DateTime->compare($entry->{date}, $date) >= 0;
}
$feed->{message} = "No entry newer than 90 days";
$feed->{code} = 206; # partial content
}
}
sub updated {
my $node = shift;
return unless $node;
my @nodes = $xpc->findnodes('pubDate | atom:published | atom:updated', $node) or return;
my $date = $nodes[0]->textContent;
my $dt = eval { DateTime::Format::Mail->parse_datetime($date) }
|| eval { DateTime::Format::ISO8601->parse_datetime($date) }
|| eval { DateTime::Format::Mail->parse_datetime(french($date)) };
return $dt;
}
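# Translate French weekday and month abbreviations into English so that
# DateTime::Format::Mail can parse the date, e.g. "ven. 06 mars 2020"
# becomes "Fri 06 Mar 2020".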
sub french {
my $date = shift;
$date =~ s/^($wday_re)/$wday{$1}/;
$date =~ s/\b($month_re)/$month{$1}/;
return $date;
}
sub id {
my $node = shift;
return unless $node;
my $id = $xpc->findvalue('guid | atom:id', $node); # id is mandatory for Atom
$id ||= $node->findvalue('link'); # one of the following three is mandatory for RSS
$id ||= $node->findvalue('title');
$id ||= $node->findvalue('description');
return $id;
}
sub unique {
my %seen;
my @unique;
for my $node (@_) {
next if $seen{$node->{id}};
$seen{$node->{id}} = 1;
push(@unique, $node);
}
return @unique;
}
sub limit {
my $entries = shift;
my $limit = shift;
# we want the most recent entries overall
@$entries = sort { DateTime->compare( $b->{date}, $a->{date} ) } unique(@$entries);
return [@$entries[0 .. min($#$entries, $limit - 1)]];
}
=head2 Writing templates for entries
Entries have the following keys available:
B<title> is the title of the post.
B<link> is the URL to the post on the web (probably an HTML page).
B<blog_title> is the title of the site.
B<blog_link> is the URL for the site on the web (probably an HTML page).
B<blog_url> is the URL for the site's feed (RSS or Atom).
B<authors> are the authors (or the Dublin Core contributor), a list of strings.
B<date> is the publication date, as a DateTime object.
B<day> is the publication date, in ISO date format: YYYY-MM-DD, for the UTC
timezone. The UTC timezone is picked so that the day doesn't jump back and forth
when sorting entries by date.
B<content> is the full post content, as a string or as encoded HTML.
B<excerpt> is the post content, limited to 500 characters, with paragraph
separators instead of HTML elements, as HTML. It is not encoded because the idea
is that it only gets added to the HTML and not to the feed, and the HTML it
contains is very controlled (only the pilcrow sign inside a span to indicate
paragraph breaks).
B<categories> are the categories, a list of strings.
B<element> is for internal use only. It contains the L<XML::LibXML::Element>
object. This could be RSS or Atom!
B<feed> is for internal use only. It's a reference to the feed this entry
belongs to.
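As an illustration, a hypothetical fragment of a page template that renders the
river itself could look as follows; titles come pre-escaped and the excerpt is
controlled HTML, so both are inserted as-is:

  % my ($globals, $feeds, $entries) = @_;
  % for my $entry (@$entries) {
  <div class="post">
  <h3><a href="<%= $entry->{link} %>"><%= $entry->{title} %></a></h3>
  <p><%= $entry->{excerpt} %></p>
  <p><%= $entry->{blog_title} %>, <%= $entry->{day} %></p>
  </div>
  % }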
=cut
sub add_data {
my $feeds = shift;
my $entries = shift;
# A note on the use of xml_escape: whenever we get data from the feed itself,
# it needs to be escaped if it gets printed into the HTML. For example: the
# feed contains a feed title of "Foo &amp; Bar". findvalue returns "Foo &
# Bar". When the template inserts the title, however, we want "Foo &amp; Bar",
# not "Foo & Bar". Thus: any text we get from the feed needs to be escaped
# if there's a chance we're going to print it again.
for my $feed (@$feeds) {
next unless $feed->{doc};
# title and url in the feed overrides title and xmlUrl set in the OPML (XML already escaped)
$feed->{title} = xml_escape($xpc->findvalue('/rss/channel/title | /atom:feed/atom:title', $feed->{doc})) || $feed->{title} || "";
$feed->{url} = xml_escape($xpc->findvalue('/atom:feed/atom:link[@rel="self"]/@href', $feed->{doc})) || $feed->{url} || "";
# link in the feed does not override htmlUrl in the OPML (XML already escaped)
$feed->{link} = $feed->{link} || xml_escape($xpc->findvalue('/rss/channel/link | /atom:feed/atom:link[@rel="alternate"][@type="text/html"]/@href', $feed->{doc})) || "";
# if they just pasted a domain "foo" then "//foo" is a valid URL
$feed->{link} = "//" . $feed->{link} unless $feed->{link} =~ /\/\//;
}
for my $entry (@$entries) {
# copy from the feed (XML is already escaped)
$entry->{blog_link} = $entry->{feed}->{link};
$entry->{blog_title} = $entry->{feed}->{title};
$entry->{blog_url} = $entry->{feed}->{url};
# parse the elements
my $element = $entry->{element};
$entry->{title} = xml_escape $xpc->findvalue('title | atom:title', $element) || "Untitled";
my @links = map { xml_escape $_->to_literal } map { $xpc->findnodes($_, $element) }
# sorted by preferences!
qw(link atom:link[@rel="alternate"][@type="text/html"]/@href atom:link/@href);
$entry->{link} = shift(@links) || "";
my @authors = map { without_email(xml_escape strip_html($_->to_literal)) } $xpc->findnodes(
'author | atom:author/atom:name | atom:contributor/atom:name | dc:creator | dc:contributor', $element);
@authors = map { without_email(xml_escape strip_html($_->to_literal)) } $xpc->findnodes(
'/atom:feed/atom:author/atom:name | '
. '/atom:feed/atom:contributor/atom:name | '
. '/rss/channel/dc:creator | '
. '/rss/channel/dc:contributor | '
. '/rss/channel/webMaster ', $element) unless @authors;
$entry->{authors} = @authors ? \@authors : undef; # key must exist in the hash
if (DateTime->compare($entry->{date}, $undefined_date) == 0) {
$entry->{day} = "(no date found)";
} else {
$entry->{day} = $entry->{date}->clone->set_time_zone('UTC')->ymd; # operate on a clone
}
my @categories = map { xml_escape strip_html($_->to_literal) } $xpc->findnodes('category | atom:category/@term', $element);
$entry->{categories} = @categories ? \@categories : undef; # key must exist in the hash
$entry->{excerpt} = '';
$entry->{content} = '';
my @nodes = $xpc->findnodes('description[text()!=""] | atom:content[text()!=""]', $element);
@nodes = $xpc->findnodes('summary[text()!=""] | atom:summary[text()!=""] | itunes:summary[text()!=""]', $element) unless @nodes;
my $content = shift(@nodes);
# The default is that the content is either plain text or escaped HTML,
# which is what we want for RSS. For Atom feeds, type="xhtml" means that the
# content is XHTML, so it needs to be escaped.
if ($content) {
my $is_xhtml = $content->hasAttribute("type") && $content->getAttribute("type") eq "xhtml";
$entry->{excerpt} = excerpt($content->to_literal);
for my $child ($content->childNodes()) {
my $c = $child->toString();
$c = xml_escape $c if $is_xhtml;
$entry->{content} .= $c;
}
}
}
}
sub excerpt {
my $content = shift;
return '(no excerpt)' unless $content;
my $doc = eval { XML::LibXML->load_html(recover => 2, string => $content) };
return '(no excerpt)' unless $doc; # the HTML parser can fail and leave $doc undefined
my $separator = "¶";
for my $node ($doc->findnodes('//style | //script | //time | //*[contains(@class, "post-header")] | //*[contains(@class, "post-share-buttons")] | //*[contains(@class, "post-footer")] | //*[contains(@class, "comments")]'),
){
$node->parentNode->removeChild($node);
}
for my $node ($doc->findnodes('//p | //br | //blockquote | //li | //td | //th | //div | //h1 | //h2 | //h3 | //h4 | //h5 | //h6')) {
$node->appendTextNode($separator);
}
my $text = strip_html($doc->textContent());
$text =~ s/( +|----+)/ /g;
# collapse whitespace and trim
$text =~ s/\s+/ /g;
$text = trim $text;
# replace a run of pilcrows (and the spaces around them) with a single pilcrow
$text =~ s/ ?¶( ?¶)* ?/¶/g;
$text =~ s/^¶//;
$text =~ s/¶$//;
my $len = length($text);
$text = substr($text, 0, 500);
$text .= "…" if $len > 500;
$text = xml_escape $text;
$text =~ s/¶/<span class="paragraph">¶ <\/span>/g;
return $text;
}
# When there's a value that's supposed to be text but isn't, then we can try to
# turn it to HTML and from there to text... This is an ugly hack and I wish it
# wasn't necessary.
sub strip_html {
my $str = shift;
return '' unless $str;
my $doc = eval { XML::LibXML->load_html(string => $str) };
return $str unless $doc;
return $doc->textContent();
}
# Strip RSS 2.0 emails
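# e.g. "joe@example.org (Joe Bloggs)" becomes "Joe Bloggs"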
sub without_email {
my $str = shift;
$str =~ s/\S+@\S+ \(([^\)]+)\)/$1/;
return $str;
}
=head1 SEE ALSO
River of News, by Dave Winer.
=cut
1;