package Plucene::SearchEngine::Index::RSS;
use base 'Plucene::SearchEngine::Index::Base';
__PACKAGE__->register_handler(qw( rss rdf application/rss+xml application/rdf+xml ));
use 5.006;
use strict;
use warnings;
use XML::RSS;
use Date::Parse;
our $VERSION = '0.02';

sub gather_data_from_file {
    my ($self, $filename) = @_;
    my $xml = XML::RSS->new;
    eval{  $xml->parsefile($filename) }; return if $@;
    my @articles;
    my $x;
    for my $art_xml (@{$xml->{'items'}}) {
        my $art = (ref $self)->new; 
        $art->add_data("modified", "Date", 
            Time::Piece->new(str2time(
                $art_xml->{dc}{date} || $xml->{dc}{date} ||
                $xml->channel("pubDate")
            ))
        );
        if ($art_xml->{dc}{creator}) {
            $art->add_data("creator", "Text", $art_xml->{dc}{creator});
        }
        $art->add_data("feed", "Text", $xml->channel("title"));
        $art->add_data("id", "Keyword", $art_xml->{link}." in ".$self->{id}{data}[0]);
        $art->add_data("text", "UnStored", $art_xml->{description}
            || $art_xml->{"http://purl.org/rss/1.0/modules/content/"}{encoded}
        );
        $art->add_data("title", "Text", $art_xml->{title});
        push @articles, $art;
    }
    return @articles;
}

=head1 NAME

Plucene::SearchEngine::Index::RSS - Index RSS files

=head1 SYNOPSIS

    my @articles = Plucene::SearchEngine::Index::URL->(
        "http://planet.perl.org/rss10.xml"
    );
    $indexer->index($_->document) for @articles;

=head1 DESCRIPTION

This examines RSS files and creates document hashes for individual items
in the feed. The objects have the following Plucene fields:

=over 3

=item modified

The date that this article was published.

=item creator

The creator, if one was specified.

=item feed

The name of the feed from which this was taken.

=item id

The URL that the article links to, and the URL of the feed.

=item text

The text of the article.

=item title

The title of the article.

=back

=head1 WARNING

Since C<Plucene::SearchEngine::Index> uses MIME types to determine the
type of a file, this module doesn't work particularly well using the
C<File> frontend. It works OK with the C<URL> frontend if the webserver
sends the right content type header. If not, you may have to fudge it by
registering your own handlers:

    Plucene::SearchEngine::Index::RSS->register_handler("text/xml");
    # For instance

=head1 SEE ALSO

L<Plucene::SearchEngine::Index>.

=head1 AUTHOR

Simon Cozens, E<lt>simon@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Simon Cozens

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut