#!perl -w
# RSS2Leafnode -- copy RSS feeds to a local news spool
# Copyright 2007, 2008, 2009, 2010, 2011, 2012, 2013 Kevin Ryde
#
# This file is part of RSS2Leafnode.
#
# RSS2Leafnode is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# RSS2Leafnode is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with RSS2Leafnode.  If not, see <http://www.gnu.org/licenses/>.
use 5.010;
use strict;
use warnings;
use App::RSS2Leafnode;
use Encode;           # for Encode::PERLQQ
use PerlIO::encoding; # for fallback
# version 0.06 for bug fix of a struct size for perl 5.10 (there's some
# fragile duplication)
use PerlIO::locale 0.06;
our $VERSION = 70;

# Text is held as wide chars internally and converted to the locale charset
# on the way out to the terminal.  An item's $subject, for example, can
# contain wide chars and is printed under --verbose.
{
  # $PerlIO::encoding::fallback is named only once in this file.
  no warnings 'once';

  # Fallback in force while the layers are pushed: \x{1234} style escapes.
  local $PerlIO::encoding::fallback = Encode::PERLQQ;

  # STDOUT first, then STDERR, stopping at the first failure.
  foreach my $handle (\*STDOUT, \*STDERR) {
    binmode ($handle, ':locale')
      or die "Cannot set :encoding on stdout/stderr: $!\n";
  }
}

# Run the application; the return from command_line() is the exit status.
my $app = App::RSS2Leafnode->new;
exit $app->command_line;
__END__
=for stopwords rss2leafnode rss leafnode NNTP config leafnode undef charset utf-8 non-ascii charsets builtins misconfigured Eg Unrendered Google pre-releases Ryde PNG libxml multibyte codings feed's NOAA XHTML unescaping X-From-Url X-RSS-Url X-RSS-Generator eg sn codepage unescape favicon kbytes repost r2l.perl conf ie GPL appication WordPress pre RDF reposted MHTML
=head1 NAME
rss2leafnode -- post RSS or Atom feeds and web pages to newsgroups
=head1 SYNOPSIS
 rss2leafnode [--options]
=head1 DESCRIPTION
RSS2Leafnode downloads RSS or Atom feeds and posts items as messages to an
NNTP news server.  It's designed to make text items available for reading in
local newsgroups, not propagating anywhere (though that's not enforced).
Desired feeds are given in a configuration file F<.rss2leafnode.conf> in
your home directory.  For example to put a feed in group "r2l.perl"
    fetch_rss ('r2l.perl', 'http://log.perl.org/atom.xml');
This is actually Perl code, so comment lines begin with C<#> and you can
write conditionals etc (see L<perlintro> or L<perlsyn>).  The target
newsgroup must exist (see for example L</Leafnode> below).  With that done,
run C<rss2leafnode> as
    rss2leafnode
You can automate with C<cron> or similar.  If you do it under user C<news>
it could be just after a normal news fetch.  The C<--config> option below
lets you run different config files at different times, etc.  (Code in the
conf file could do that too.)  See F<examples/rss2leafnode.conf> in the
RSS2Leafnode sources for a complete sample.
Messages are added to the news spool using NNTP "POST" commands.  When a
feed is re-downloaded any items previously added are not repeated.  Multiple
feeds can be put in a single newsgroup.  Each feed is posted as it's
downloaded, so the first feed's articles appear while other feeds are still
being downloaded.
The target news server follows the L<Net::NNTP> defaults, or the newsgroup
name can be in the form of a C<news:> or C<nntp:> URL of a server on a
different host or port.  For example a personal server on a high port
number,
    fetch_rss('news://somehost.mydomain.org:8119/r2l.weather',
              'http://feeds.feedburner.com/PTCC');
=head2 Web Pages
Plain web pages can be downloaded too.  Each time the page changes a new
article is injected.  This is good for a latest news or status page.  For
example
    fetch_html ('r2l.music',
      'http://www.abc.net.au/rage/playlist/print/saturday_print.htm');
The target can be an image or similar directly too.  It's simply put in a
news message with its indicated MIME type.  How well it displays depends on
your newsreader.
    fetch_html('r2l.weather',
               'http://www.bom.gov.au/difacs/IDX0604.gif');
The message "Subject" is the HTML C<< <title> >> or something better from
C<URI::Title> or C<Image::ExifTool> if you've got those.  C<URI::Title> has
special cases for a couple of unhelpful sites and C<Image::ExifTool> can get
a PNG image title.
If a web page isn't at a fixed location you can write some Perl code in
F<.rss2leafnode.conf> to construct a URL with a date etc.  It might be worth
attempting a couple of nearby dates if you're not certain when the new one
becomes available.
=head2 Re-Downloading
HTTP C<ETag> and C<Last-Modified> headers are used, if provided by the
server, to avoid re-downloading unchanged content (feeds or web pages).  RSS
C<< <thr:count> >> or C<< <slash:comments> >> are used to check for
unchanged comments feeds.  Values seen from the last run are saved in a
F<.rss2leafnode.status> file in your home directory.
If you've got C<XML::RSS::Timing> then it's used for RSS C<ttl>,
C<updateFrequency>, etc from a feed.  This means the feed is not
re-downloaded until its declared update times.  But only a few feeds have
good timing info, most merely have a C<ttl> advising for instance 5 minutes
between rechecks.
With C<--verbose> the next calculated update time is printed, in case you're
wondering why nothing is happening.  The easiest way to force a re-download
is to delete the F<~/.rss2leafnode.status> file.  Old status file entries
are automatically dropped if you don't fetch a particular feed for a while,
so that file should normally need no maintenance.
=head2 Leafnode
C<rss2leafnode> was originally created with the C<leafnode> program in mind,
but can be used with any server accepting posts.  It's your responsibility
to be careful where a target newsgroup propagates.  Don't make automated
postings to the world!
For leafnode version 2 see its F<README> file section "LOCAL NEWSGROUPS" on
creating local-only groups.  Add a line to the
F</etc/news/leafnode/local.groups> file like
    r2l.stuff   y       My various feeds
The group name is arbitrary and the description is optional, but note it
must be a tab character between the name and the "y" and between the "y" and
any description.  "y" means posting is allowed.
=head2 Small News
The Small News "sn" program is another possible local server.  Create
groups in it with command
    snnewgroup r2l.something
When running the C<snntpd> daemon from C<inetd> or similar don't forget a
logger program argument on the command line as described in its F<INSTALL.run>
file, otherwise log messages go to the client connection and will upset most
client program code, including C<Net::NNTP> used by C<rss2leafnode>.
=head2 Copyright
It's your responsibility to check the terms of use for any feeds or web
pages you download with C<rss2leafnode>.  Pay particular attention if
propagating or re-transmitting resulting messages.
Copyright or license statements in a feed are included in the messages as
C<X-Copyright> headers.  Unless the content is in the public domain such
copyright notices should be retained.
The transformations RSS2Leafnode makes to turn feed items into messages are
purely mechanical and the author believes they don't cause the program's
terms (ie. GPL, per L</"LICENSE"> below) to be imposed on the results.
=head1 COMMAND LINE OPTIONS
The command line options are
=over 4
=item C<--config=/some/filename>
Read the specified configuration file instead of F<~/.rss2leafnode.conf>.
=item C<--help>
Print some brief help information.
=item C<--verbose>
Print some diagnostics about what's being done.  With C<--verbose=2> print
various technical details.
=item C<--version>
Print the program version number and exit.
=back
=head1 CONFIG OPTIONS
The following config options can be set either in global variables, or on a
per-feed basis in an individual C<fetch_rss()> or C<fetch_html()>.
=over 4
=item $rss_get_links (default 0)
=item C<fetch_rss ("group", "url", rss_get_links =E<gt> G)>
If true then download links in each item and include the content in the news
message.  For example,
    $rss_get_links = 1;
    fetch_rss ('r2l.finance',
      'http://au.biz.yahoo.com/financenews/htt/financenews.xml');
Not all feeds have interesting things at their link.  Sometimes the RSS has
the full item text already.  But if the RSS is a summary then
C<$rss_get_links> makes the full article ready to read immediately, instead
of having to click through from the message.
Only the immediate link target URL is retrieved.  No images within the page
are downloaded, which is often a good thing to reduce bloat or avoid
occasional advertising in feeds.  You'll probably have trouble if the link
target uses frames (a set of HTML pages instead of just one).
=item $rss_get_comments (default 0)
=item C<fetch_rss ("group", "url", rss_get_comments =E<gt> G)>
If true then download the comments feeds for items and post as followup news
articles.  For example,
    fetch_rss ('r2l.food',
      'http://wickedgooddinner.blogspot.com/feeds/posts/default',
      rss_get_comments => 1);
To send a followup comment you usually must go to the links in the original
article (or the followups) and use some sort of web form.  Posting a message
to the newsgroup goes nowhere.
When a feed is available in both Atom and RSS formats sometimes only the
Atom one includes a comments feed URL.
Comments feeds are followed for as long as an article appears in the feed,
though in the current implementation it might be checked for new comments
only when the originating feed changes.
=item $render (default 0)
=item C<fetch_rss ("group", "url", render =E<gt> R)>
=item C<fetch_html ("group", "url", render =E<gt> R)>
If true then render HTML as plain text in the news messages.  Normally item
text, downloaded parts from C<$rss_get_links>, and C<fetch_html()> pages are
all presented as C<text/html>.  If your newsreader doesn't handle HTML very
well then C<render> is a good way to see just the text.  Setting C<1> uses
C<HTML::FormatText>,
    $render = 1;
    fetch_rss ('r2l.weather',
      'http://xml.weather.yahoo.com/forecastrss?p=ASXX0001&u=f');
Setting C<"WithLinks"> uses the C<HTML::FormatText::WithLinks> variant (you
must have that module) which shows HTML links as footnotes.
    fetch_rss ('r2l.stuff',
               'http://rss.sciam.com/sciam/basic-science',
               render => 'WithLinks');
Settings C<elinks>, C<lynx> or C<w3m> dump through the respective external
program (you must have C<HTML::FormatExternal> and the program).
    fetch_rss ('r2l.sport',
               'http://fr.news.yahoo.com/rss/rugby.xml',
               rss_get_links => 1,
               render => 'lynx');
=item $render_width (default 60)
The number of columns to use when rendering HTML to plain text or when
wrapping Atom text.  You can set this to whatever you find easiest to read,
or any special width needed by a particular feed.
=item $get_icon (default 0)
=item C<fetch_rss ("group", "url", get_icon =E<gt> G)>
=item C<fetch_html ("group", "url", get_icon =E<gt> G)>
Download an RSS/Atom icon or HTML favicon as an image for the C<Face>
header.  The C<Face> header is shown by Gnus and perhaps only a few other
news readers.  In Gnus it appears with "From:" in the article mode on a
graphical screen.  It can be a good visual cue to the origin, but may not
always be worth the extra download.
    $get_icon = 1;
    fetch_rss ('r2l.whatsnew',
               'http://www.archive.org/services/collection-rss.php');
C<Image::Magick> is required to process images.  Banners much wider than
high are suppressed as probably advertising and in any case not suited to
the 48x48 size limit of the C<Face> header specification.  A 48x48 image might
add perhaps 4 kbytes or more to each message.
For plain RSS and Atom feeds an image is normally a per-channel attribute so
it's the same for all articles from the feed.  An C<itunes:image> or
C<activity:actor> can be per-item and is used if present.
=item $rss_newest_only (default 0 for all)
=item C<fetch_rss ("group", "url", rss_newest_only =E<gt> $count)>
=item C<fetch_rss ("group", "url", rss_newest_only =E<gt> $period)>
Take only newest items from an RSS feed.  The default is 0 which means take
all items from the feed.  The value is either a number for the latest few
items, eg. 10 items,
  fetch_rss('r2l.test',
            'http://www.cpantesters.org/author/K/KRYDE-nopass.rss',
            rss_newest_only => 10);
Or it can be a string giving a period of time.  Only items newer than this
are taken
    "60 minutes"
    "1 hour"    "36 hours"
    "1 day"     "2 days"
    "1 month"   "5 months"
    "1 year"    "0.75 years"
C<rss_newest_only> can be good if you're only interested in the most recent
item from a status or weather feed, or if you only want to get a few items
as a random taste of a feed.
If a feed goes back further than the news server retains then giving a
period such as "90 days" or whatever corresponding to the server time will
prevent old articles being re-added when the server discards them.  (It'd be
better if the news server could be asked for its retention time, but this
option is better than nothing.)
=item $html_extract_main (default 0 for all)
=item C<fetch_html ("group", "url", html_extract_main =E<gt> 1)>
=item C<fetch_rss ("group", "url", html_extract_main =E<gt> 1)>
Use L<HTML::ExtractMain> on downloaded HTML to pick out the "main" text from
the page.  For C<fetch_rss()> this is applied to downloaded link parts (per
C<rss_get_links> above).
C<HTML::ExtractMain> is good for removing headers or side columns of
boilerplate bumf on a page.  For a plain text view like RSS2Leafnode such
things tend to waste space and may come out looking particularly poor from a
non-tables renderer such as C<HTML::FormatText> or C<lynx>.
The algorithm in C<HTML::ExtractMain> is a simple paragraph scoring system
(as of its version 0.62).  It does a surprisingly good job but you might
want to check how much it discards, in case something good has not been
reckoned part of the main text.  There's no configuration for ExtractMain,
but perhaps in the future the "1" here to enable could be some settings or
whatever.
=back
=head2 Obscure Options
=over 4
=item $rss_charset_override (default undef)
=item C<fetch_rss ("group", "url", rss_charset_override =E<gt> "CHARSET")>
If set then force RSS content to be interpreted in this charset,
irrespective of what the document says.  See L<XML::Parser/ENCODINGS> for
the charsets supported (some builtins plus F<.enc> files under
F</usr/lib/perl5/XML/Parser/Encodings/>).
Use this option if the document is wrong or has no charset specified and
isn't the XML default utf-8.  Usually you'll only want this for a particular
offending feed.  For example,
    # AIR is latin-1, but doesn't have a <?xml> saying that
    fetch_rss ('r2l.finance',
               'http://www.aireview.com.au/rss.php',
               rss_charset_override => 'iso-8859-1');
By default RSS2Leafnode tries to cope with bad multibyte sequences by
re-coding to the feed's claimed charset.  If that works then the text will
have some substitute characters (either U+FFFD or question marks "?") and a
warning is given like
    Feed http://example.org/feed.xml
      recoded utf-8 to parse, expect substitutions for bad non-ascii
      (line 214, column 75, byte 13196)
Bad single-byte codings generally aren't detected and will just go through
to display something incorrect (eg. if MS-DOS codepage 1252 is used where
Latin-1 is claimed).  Nose around the raw feed as necessary to see where it
goes wrong.
=item $html_charset_from_content (default 0)
=item C<fetch_rss ("group", "url", html_charset_from_content =E<gt> H)>
=item C<fetch_html ("group", "url", html_charset_from_content =E<gt> H)>
If true then the charset used for HTML content is taken from the HTML
itself, rather than the server's HTTP headers.  Normally the server should
be believed, but if a particular server is misconfigured then you can try
this.
    fetch_html ('r2l.stuff',
                'http://www.somebadserver.com/newspage.html',
                html_charset_from_content => 1);
=back
=head2 Config Extent
Variables take effect from the point they're set, through to the end of the
file, or until a new setting. 
Options like C<render =E<gt> 'lynx'> in a particular C<fetch_rss()> or
C<fetch_html()> override the global settings, just for that call.
The Perl C<local> feature and a braces block can confine a variable setting
to a group of particular feeds.  Eg.
    { local $rss_get_links = 1;
      fetch_rss ('r2l.debian',
                 'http://www.debian.org/News/weekly/dwn.en.rdf');
      fetch_rss ('r2l.finance',
                 ...);
    }
=head2 Emacs
In Emacs, F<.rss2leafnode.conf> can be put into C<perl-mode> with the usual
mode setup in the file
    # -*- mode: perl-mode -*-
Or an C<auto-mode-alist> setup in your F<.emacs>,
    (add-to-list 'auto-mode-alist
                 '("/\\.rss2leafnode\\.conf\\'" . perl-mode))
The Debian package of C<rss2leafnode> has this setup, plus a completions
ignore for the C<.rss2leafnode.status> file.  See
F</etc/emacs/site-start.d/50rss2leafnode.el> in the package, or
F<debian/emacsen-startup> in the RSS2Leafnode sources.
=head1 OTHER DETAILS
Non-ascii RSS and Atom text and rendered HTML text are coded as utf-8 in the
generated messages so for non-ascii content you'll need a newsreader which
supports that.  Unrendered HTML is left in the charset the server gave, to
ensure it matches any C<< <meta http-equiv> >> in the document.  In all
cases the charset is specified in the MIME message headers or attachment
parts.  Transfer coding in the message body is chosen by C<MIME::Entity>
which normally means quoted-printable if any non-ascii or any very long
lines.  Atom C<< <content> >> already in base64 is left that way.
Links are shown at the end of each message for
    <link>                 RSS and Atom
    <enclosure>            RSS
    <comments>             RSS
    <content>              Atom externals, except other XML feeds
    <source>               RSS and Atom
    <wfw:comment>          well-formed web
    <wiki:diff> 
    <wiki:history>
    <sioc:has_creator>
    <sioc:has_discussion>
    <sioc:links_to>
    <sioc:reply_of>
    Author <url>           Atom and wiki, not downloaded
Comment or reply links show a count of replies from any of
    <thr:total>
    count="123"         \ attribute of <link>
    thr:count="123"     /
    <slash:comments>    sub-element of <comments>
RSS comment feeds for C<$rss_get_comments> are as follows.  "appication" is
a typo from WordPress pre 2.5 still sometimes found in use (as of Oct 2012).
    <wfw:commentRss>
    <link rel='replies' type='application/atom+xml' ...>
    <link rel='replies' type='appication/atom+xml' ...>
Comments links are shown as "Replies" or "RSS Replies".  If an RSS comment
feed hasn't been detected as RSS it may show up as plain "Replies" instead
of "RSS Replies".  In that case it won't be downloaded by the
C<rss_get_comments> option.
C<< <media:group> >> links are shown as blocks of links.  Not sure about the
quality of the formatting yet, and they're not downloaded by
C<rss_get_links>.
Common Alerts Protocol (CAP) fields for weather alerts etc are shown if
present (eg. from the US NOAA).  This can have more detail than just the
text.  Pseudo-link footnotes are shown for,
    <geo:lat>,<geo:long>
    <geo:Point>
    <georss:point>
    <statusnet:origin>      possibly with URL target too
    <media:credit>
Unrecognised item fields are shown in XML at the end of the message.  This
is a bit technical but tries not to drop information, and might suggest
extra things RSS2Leafnode could present or interpret.
An attempt is made to repair bad XML from a feed with C<XML::Liberal> if you
have that module.  It uses C<XML::LibXML> and the C<libxml> library and
often succeeds on annoying things like bad C<&foo;> entities, at least
enough to present something.  On hopelessly malformed data it might be a bit
slow.
The most common XML problem is too much or too little C<&foo;> entity
escaping.  Too little can turn HTML markup into nested XML elements and
RSS2Leafnode treats that as if it was XHTML style sub-elements, though the
result is likely to be imperfect.  Too much escaping results in raw or
semi-raw HTML C<< <p> >> or C<&foo;> coming through.  C<&apos;> may be XHTML
instead of HTML, though many browsers support it anyway.  An option for
extra unescaping might improve some bad feeds but in practice is unlikely to
be wholly successful.  Every bad feed tends to be bad in its own special
way.
=head2 Message Headers
For reference the message header fields are generated roughly as follows,
=over
=item From:
First non-empty of
    <author>
    <jf:author>
    <slate:author>
    <dc:creator>
    <dc:contributor>
    <wiki:username>
    <itunes:author>
    <managingEditor>
    <webMaster>
    <dc:publisher>
    <itunes:owner>
    channel <title>
The C<dc> bits in RDF might have sub-elements
C<< <rdf:Description><rdf:value> >> containing the actual text.
    <dc:contributor>
      <rdf:Description ...>
        <rdf:value>Mary McConnell</rdf:value>
      </rdf:Description>
    </dc:contributor>
Atom has C<< <name> >> and C<< <email> >> sub-elements.
C<< <itunes:owner> >> may have an C<< <itunes:email> >> sub-element.  Such
sub-elements can be checked without worrying whether the feed is supposed to
be Atom or RSS etc.  When there's no sub-elements the text is free-form and
might be things like
    Name
    Name <foo@bar.com>
    foo@bar.com (Name)
If there's no identifiable email mailbox part in the text or C<< <email> >>
then C<nobody@HOSTNAME> is added to make a valid RFC822 address.
The channel C<< <title> >> as a final fallback is meant to at least show
something about where the message came from if there's no author identified.
An author's home page is included in the message links described above.
Sometimes there may be multiple C<< <dc:creator> >> elements.  They're
combined as a multi-author C<From> per RFC5322 (though without picking out a
C<Sender> from among them).
=item Subject:
First present of
    <title>
    <dc:title>
    <dc:subject>
C<< <dc:subject> >> is normally only a keyword but might be better than
nothing.
=item Date:
First present of
    <pubDate>
    <dc:date>
    <jf:creationDate>
    <modified>
    <updated>
    <issued>
    <dcterms:issued>
    <created>
    <lastBuildDate>
    <published>
C<dc:date> is ISO format "2000-01-01T12:00:00Z" etc and anything in that
form is converted to RFC822 style for the messages.  An unrecognised form is
put through unmodified.
C<< <jf:creationDate> >> is not used.  It's apparently meant to be
locale-based for human readability and is probably accompanied by
C<< <pubDate> >> anyway so not needed.
=item Date-Received:
The date/time when C<rss2leafnode> made the message.
=item Message-ID:
First of
    <id>                         (Atom)
    <guid isPermaLink="true">
    <link>                       Yahoo Finance special case
    <guid isPermaLink="false">   and feed URL
    MD5 hash                     of various fields and feed URL
Yahoo Finance items repeated in different feeds are noticed using a special
match of the C<< <link> >> so that just one copy is posted.  (As of March
2010 those items don't offer RSS C<guid> identifiers.)
=item Keywords:
All of
    <category>
    <itunes:category>
    <cap:category>
    <itunes:keywords>
    <media:keywords>
    <dc:subject>
    <slash:section>
    <slate:topic>
The sub-category system of C<< <itunes:category> >> is not currently put
through.
Some blog feeds seem to give a big set of categories, an aggregate of
everything in the blog or some such, making an unattractively long
C<Keywords:> header.  It's kept in full for the sake of completeness, but if
viewing it in a newsreader then some sort of line limit might be wanted.
=item In-Reply-To:
C<< <thr:in-reply-to> >> elements (per RFC 4685) turned into Message-IDs the
same way as an Atom C<< <id> >>.  This might help thread display in a news reader
if the parent item was downloaded too.
C<< <sioc:reply_of> >> is not used.  It'd be a possibility, but would
probably need a hard-coded mapping of URL to Message-ID.  For now it's just
shown as a link as described above.
=item Content-Location:
The URL of a C<fetch_html()> or a C<$rss_get_links> attachment part.  Good
newsreaders can use this to resolve relative links in a HTML part.
This same URL and any C<xml:base> attribute is used as a C<< <base
href=""> >> for including a HTML fragment, so the location is present when
saving a message body (and when rendering it to plain text).
=item Content-Language:
First of
    <language>
    <dc:language>
    <twitter:lang>
    xml:lang=""
    HTTP response Content-Language header
C<xml:lang> is the standard XML attribute present on any element and
sometimes found on Atom C<< <content> >> text.
The language code is also added to a generated HTML body in HTML4 style,
though whether any renderers/browsers do much with it is another matter.
    <html lang="en">
=item Content-MD5:
From the corresponding HTTP header of a C<fetch_html()> or C<$rss_get_links>
download part, though in practice this is almost never sent by HTTP servers.
=item Importance:
=item Priority:
These headers are only supposed to be for X.400 inter-operation.  Common
Alerts Protocol and Wiki (L<http://www.meatballwiki.org/wiki/ModWiki>) are
treated as
    <cap:severity> "Extreme" and "Severe"
       -> "Importance: high" and "Priority: urgent"
    <wiki:importance> "minor"
       -> "Importance: low"
=item Precedence:
"list" for certain Google Groups lists, identified by their link URLs per
C<List-Post> below.  Perhaps other feeds which come from mailing lists could
be identified too.
=item Face:
Per the C<$get_icon> option described above, the first item or channel
element
     <image>           RSS
     <icon>            Atom
     <logo>            Atom
     <itunes:image>
     <statusnet:postIcon>
     <media:thumbnail>
     <activity:actor><link rel="avatar">
     <author><gd:image>
     HTML favicon      for fetch_html()
Gnus and perhaps other newsreaders can display C<Face:>, see
L<http://quimby.gnus.org/circus/face>.
It'd be possible to generate an C<X-Face:> as well or instead, but it's
black and white and converting a colour image from the feeds is unlikely to
look good.
=item List-Post:
Mailbox of a Google Groups mailing list feed such as
L<http://groups.google.com/group/cake-php/feed/rss_v2_0_msgs.xml>.  This may
help post a followup to the list, depending on the newsreader.  (A followup
to an C<rss2leafnode> newsgroup will normally go nowhere.)
=item PICS-Label:
Channel C<< <rating> >>.  Perhaps C<< <itunes:explicit> >> or
C<< <media:adult> >> could be turned into a rating too.
=item X-Mailer:
"RSS2Leafnode/VERSION" plus the usual from C<MIME::Entity> (see
L<MIME::Entity/build PARAMHASH>).
=item X-Copyright:
An RSS2Leafnode extension, being all of the following.  See L</Copyright> above.
    <rights>                           Atom
    <copyright>                        RSS
    <dc:rights>
    <dcterms:license>
    <creativeCommons:license>
    <link rel="license" href="...">    Atom
These are sought in the channel, the item, and also any Atom style
C<< <source> >> within the item.
=item X-RSS-Url:
An RSS2Leafnode extension, being the originating C<fetch_rss()> feed URL
downloaded.  This is handy if an item has come out badly and you want to
check the raw feed.
=item X-RSS-Generator:
An RSS2Leafnode extension, being the channel C<< <generator> >>.  This might
help assign blame for bad feed content etc.
=back
Of course all this conversion wouldn't be necessary if RSS had been news in
the first place.  A news server already serves short messages, either
read-only or with followups, and if news servers hadn't got a well-deserved
reputation for being a pain to administer, and if news hadn't been based on
transferring gigabytes of "full feed" instead of on-demand, then RSS might
never have been wanted.  Of course the other side is that if you're a web
page author accustomed to HTTP then everything looks like a HTTP, and if you
like HTML then a ridiculous edifice like XML to encapsulate a half dozen
bits of text might even seem like a good idea.
=head1 BUGS
The way Message-IDs are checked on the news server means that the server
should be set up to retain messages for at least as long as the feed retains
items, or as long as the C<rss_newest_only> option you select for the feed.
If that's not so then old articles will be re-posted by the next
C<fetch_rss()> and will look like new articles to a newsreader.  (Letting
the news server track articles keeps down the amount of state
C<rss2leafnode> must maintain and means multiple users can insert a feed
without duplication.)
No retries are attempted if a news server disconnects, at least not unless
posting to a different news server then coming back.  Not sure if that's
good or bad, but the current repeated error messages for a disconnect are
unattractive.
Some pre-releases of leafnode 2 might have trouble posting to local
newsgroups while a C<fetchnews> run is in progress.  The local articles
don't show up until after a subsequent further C<fetchnews>.  Or was this
only for the C<rnews> inject?
No attention is paid to C<< <atom:updated> >> or other changes in an item.
Should an updated item be re-posted?  Is the C<Supersedes:> header better,
to replace the article?  Something allowing readers to see or not see
updates according to user preference might be good.  Currently the item is
reposted if C<< <atom:id> >> changes or if there's no C<id> and the content
changes enough to make a new MD5 hash.  Is C<id> supposed to stay the same
for an update?
The way C<$rss_get_links> only gets the immediate link target could perhaps
be extended to fetch images, frame sub-parts, etc of a HTML page and include
them in the message as RFC 2557 style "MHTML".  But do any news readers
actually display that?
Perhaps there should be a limit on the size of links to be downloaded.
Sometimes podcast links have both a html page and a full audio link.  If the
audio is bigger than some threshold then you might like to download the html but
not the audio.
The entire XML feed is read into memory, which might be a little too much
for large feeds.  RSS was conceived as a "site summary" but is used for
bigger content too.  Twig has a partial-tree parse for one item at a time,
though applying the C<rss_newest_only> option would require a first pass to
choose items.  A progressive parse might help show at least the first few
items if there's a fatal syntax error or truncation part-way through.  Some
care would be needed that small changes by the automated charset recoding or
C<XML::Liberal> etc don't cause duplicated posts.
=head1 ENVIRONMENT VARIABLES
=over 4
=item C<NNTPSERVER>
=item C<NEWSHOST>
Default news server as per C<Net::NNTP>.
=back
=head1 FILES
=over 4
=item F<~/.rss2leafnode.conf>
Configuration file.
=item F<~/.rss2leafnode.status>
Status file, recording "last modified" dates for downloads.  This can be
deleted if something bad seems to have happened to it; the next
C<rss2leafnode> run will recreate it.
=item C</etc/perl/Net/libnet.cfg>
=item C<~/.libnet.cfg>
Defaults per C<Net::NNTP> and C<Net::Config>.
=back
=head1 SEE ALSO
L<leafnode(8)>,
L<HTML::FormatText>, L<HTML::FormatText::WithLinks>, L<HTML::FormatExternal>,
L<lynx(1)>,
L<URI::Title>, L<XML::Parser>, L<XML::Liberal>, L<Image::Magick>,
C<Net::NNTP>, C<Net::Config>,
L<Plagger>, L<feed2imap(1)>, L<rss2email(1)>, L<rssdrop(1)>, L<toursst(1)>,
L<http://www.gwene.org>
=head1 HOME PAGE
L<http://user42.tuxfamily.org/rss2leafnode/index.html>
=head1 LICENSE
Copyright 2007, 2008, 2009, 2010, 2011, 2012, 2013 Kevin Ryde
RSS2Leafnode is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.
RSS2Leafnode is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
RSS2Leafnode.  If not, see L<http://www.gnu.org/licenses/>.
=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	Go to GitHub issues (only if GitHub is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)