#!/usr/bin/perl
package XML::RSSLite;
use strict;
use vars qw($VERSION);

$VERSION = 0.17;

sub import{
  no strict 'refs';
  shift;
  my $pkg = scalar caller();
  *{"${pkg}::parseRSS"} = \&parseRSS;
  *{"${pkg}::parseXML"} = \&parseXML if grep($_ eq 'parseXML', @_);
}


sub parseRSS {
  my($rr, $cref, $strip) = @_;

  die "$rr is not a hash reference"     unless ref($rr)   eq 'HASH';
  die "$cref is not a scalar reference" unless ref($cref) eq 'SCALAR';

  # Gotta have some content to parse
  return unless $$cref;

  preprocess($cref, $strip);
  {
    _parseRSS($rr, $cref), last if index(${$cref}, '<rss')+1;
    _parseRDF($rr, $cref), last if index(${$cref}, '<rdf:RDF')+1;
    _parseSN( $rr, $cref), last if index(${$cref}, '<scriptingnews')+1;
    _parseWL( $rr, $cref), last if index(${$cref}, '<weblog')+1;
    die "Content must be RSS|RDF|ScriptingNews|Weblog|reasonably close";
  }
  postprocess($rr);
}

sub preprocess {
  my($cref, $strip) = @_;
  $$cref =~ y/\r\n/\n/s;

  if( ! defined($strip) ){
    $$cref =~ y{\n\t ~0-9\-+!@#$%^&*()_=a-zA-Z[]\\;':",./<>?}{ }cs;
  }
  elsif($strip eq '1' ){
    $$cref =~ s/[[:^print:]]/ /g;
  }

  #XXX $$cref =~ s/&(?!0[a-zA-Z0-9]+|#\d+);/amp/gs;
  #XXX Do we wish to (re)allow escaped HTML?!
  $$cref =~ s{(?:<|&lt;)/?(?:b|i|h[1-6]|p|center|quote|strong)(?:>|&gt;)}{}gsi;
}

sub _parseRSS {
  parseXML($_[0], $_[1], 'channel', 0); 
  $_[0]->{'items'} = $_[0]->{'item'};
}

sub _parseRDF {
  my ($rr, $cref) = @_;

  $rr->{'items'} = [];
  my $item; 

  parseXML($_[0], $_[1], 'rdf:RDF', 0); 

  # Alias RDF to RSS
  if( exists($rr->{'item'}) ){
    $rr->{'items'} = $rr->{'item'};
  }
  else{
    my $li = $_[0]->{'rdf:li'} || $_[0]->{'rdf:Seq'}->{'rdf:li'};
    foreach $item ( @{$li} ){
      my %ia;
      if (exists $item->{'dc:description'}) {
	$ia{'description'} = $item->{'dc:description'};
      }  
      if (exists $item->{'dc:title'}) {
	$ia{'title'} = $item->{'dc:title'};
      }  
      if (exists $item->{'dc:identifier'}) {
	$ia{'link'} = delete($item->{'dc:identifier'});
      }  
      
      push(@{$rr->{'items'}}, \%ia); 
    }
  }
}

sub _parseSN {
  my ($rr, $cref) = @_;
  
  $rr->{'items'} = ();
  my $item; 

  parseXML($rr, $cref, 'channel', 0); 
  
  # Alias SN to RSS terms
  foreach $item ( @{$_[0]->{'rdf:li'}} ){
    my %ia;
    if (exists $item->{'text'}) {
      $ia{'description'} = $item->{'text'};
    }  
    if (exists $item->{'linetext'}) {
      $ia{'title'} = $item->{'linetext'};
    }  
    if (exists $item->{'url'}) {
      $ia{'link'} = $item->{'url'};
    }  

    push(@{$rr->{'items'}}, \%ia); 
  }
}


sub _parseWL {
  my ($rr, $cref) = @_;

  $rr->{'items'} = ();
  my $item; 

  #XXX is this the right tag to parse for?
  parseXML($rr, $cref, 'channel', 0); 
  
  # Alias WL to RSS
  foreach $item ( @{$_[0]->{'rdf:li'}} ){
    my %ia;
    if (exists $item->{'url'}) {
      $ia{'link'} = delete($item->{'url'});
    }

    push(@{$rr->{'items'}}, \%ia); 
  }
}


sub postprocess {
  my $rr = shift;

  #XXX Not much to do, what about un-munging URL's in source, etc.?!
  return unless defined($rr->{'items'});
  $rr->{'items'} = [$rr->{'items'}] unless ref($rr->{'items'}) eq 'ARRAY';

  foreach my $i (@{$rr->{'items'}}) {
    $i->{description} = $i->{description}->{'<>'} if ref($i->{description});

    # Put stuff into the right name if necessary
    if( not $i->{'link'} ){
      if( defined($i->{'url'}) ){
	$i->{'link'} = delete($i->{'url'}); }
      # See if you can use misplaced url in title for empty links
      elsif( exists($i->{'title'}) ){
	# The next case would trap this, but try to short-circuit the gathering
	if ($i->{'title'} =~ /^(?:https?|ftp):/) {
	  $i->{'link'} = $i->{'title'};
	}
	elsif ($i->{'title'} =~ /"((?:https?|ftp).*?)"/) {
	  $i->{'link'} = $1;
	  $i->{'title'} =~ s/<.*?>//;
	}
	else {
	  next;
	}
      }
    }
    
    # Trim whitespace
    $i->{'link'} =~ s/\s+$//;
    $i->{'link'} =~ s/^\s+//;

    # Make sure you've got an http/ftp link
    if( exists( $i->{'link'}) && $i->{'link'} !~ m{^(https?|ftp)://}i) {
      ## Rip link out of anchor tag
      if( ref($i->{'link'}) && $i->{'link'}->{a}->{href} ){
	$i->{'link'} = $i->{'link'}->{a}->{href} }
      ## Smells like a relative url
      elsif( $i->{'link'}  =~ m{^[#/]} and $rr->{'link'} =~ m{^https?://} ){
	if (substr($i->{'link'}, 0, 1) ne '/') {
	  $i->{'link'} = '/' . $i->{'link'};
	}
	$i->{'link'} = $rr->{'link'} . $i->{'link'};
      }
      else {
	next;
      }
    }
    
    #If we don't have a title, use the link
    unless( defined($i->{'title'}) ){
      $i->{'title'} = $i->{'link'};
    }
    
    if( exists($i->{'link'}) ){
#XXX      # Fix pre-process munging
#      $i->{'link'} =~ s/&amp;/&/gi;
      $i->{'link'} =~ s/ /%20/g;
    }
  }
}

sub parseXML{
  my($hash, $xml, $tag, $comments) = @_;
  my($begin, $end, @comments);
  local $_;

  #Kill comments
  while( ($begin =  index(${$xml}, '<!--')) > -1 &&
	${$xml} =~ m%<!--.*?--(>)%sg ){
    my $str = substr(${$xml}, $begin, pos(${$xml})-$begin, '');
    
    #Save them if requested
    do{ unshift @comments, [$begin, substr($str, 4, length($str)-7)] }
      if $comments;
  }

  _parseXML($hash, $xml, $tag);

#  #XXX Context of comment is lost!
#  #Expose comments if requested
#  do{ push(@$comments, $_->[1]) for @comments } if ref($comments) eq 'ARRAY';
  if( $comments ){
    #Restore comments if requested
    substr(${$xml}, $_->[0], 0, '<!--'.$_->[1].'-->') for @comments;

    #Expose comments if requested
    do{ push(@$comments, $_->[1]) for @comments } if ref($comments) eq 'ARRAY';
  }
}

sub _parseXML{
  my($hash, $xml, $tag, $index) = @_;
  my($begin, $end);

  #Find topTag and set pos to start matching from there
  ${$xml} =~ /<$tag(?:>|\s)/g;
  ($begin, $end) = (0, pos(${$xml})||0);

  #Match either <foo></foo> or <bar />, optional attributes, stash tag name
  while( ${$xml} =~ m%<([^\s>]+)(?:\s+[^>]*?)?(?:/|>.*?</\1)>%sg ){

    #Save the tag name, we'll need it
    $tag = $1 || $2;

    #Save the new beginning and end
    ($begin, $end) = ($end, pos(${$xml}));

    #Get the bit we just matched.
    my $str = substr(${$xml}, $begin, $end-$begin);
    
    #Extract the actual attributes and contents of the tag
   $str =~ m%<\Q$tag\E\s*([^>]*?)?>(.*?)</\Q$tag\E>%s ||
#XXX pointed out by hv
#    $str =~ s%^.*?<$tag\s*([^>]*?)?>(.*?)</$tag>%<$tag>$2</$tag>%s ||
      $str =~ m%<\Q$tag\E\s*([^>]*?)?\s*/>%;
    my($attr, $content) = ($1, $2);

    #Did we get attributes? clean them up and chuck them in a hash.
    if( $attr ){
      ($_, $attr) = ($attr, {});
      $attr->{$1} = $3 while m/([^\s=]+)\s*=\s*(['"]?)([^\2>]*?)(?:\2|$)/g;
    }

    my $inhash;
    #Recurse if contents has more tags, replace contents with reference we get
    if( $content && index($content, '<') > -1 ){
      _parseXML($inhash={}, \$str, $tag);
      #Was there any data in the contents? We should extract that...
      if( $str =~ />[^><]+</ ){
	#The odd RE above shortcircuits unnecessary entry

	#Clean whitespace between tags
	#$str =~ s%(?<=>)?\s*(?=<)%%g; #XXX ~same speed, wacko warning
	#$str =~ s%(>?)\s*<%$1<%g;
#XXX    #$str =~ s%(?:^|(?<=>))\s*(?:(?=<)|\z)%%g

	my $qr = qr{@{[join('|', map { quotemeta } keys %{$inhash})]}};
	$content =~ s%<($qr)\s*(?:[^>]*?)?(?:/|>.*?</\1)>%%sg;

	$inhash->{'<>'} = $content if $content =~ /\S/;
      }
    }

    if( ref($inhash) ){
      #We have attributes? Then we should merge them.
      if( ref($attr) ){
	for( keys %{$attr} ){
	  $inhash->{$_} = exists($inhash->{$_})   ?
	    (ref($inhash->{$_})  eq 'ARRAY'       ?
	     [@{$inhash->{$_}}, $attr->{$_}]   :
	     [  $inhash->{$_},  $attr->{$_}] ) : $attr->{$_};
	}
      }
    }
    elsif( ref($attr) ){
      $inhash = $attr;
    }
    else{
      #Otherwise save our content
      $inhash = $content;
    }
    
    $hash->{$tag} = exists($hash->{$tag}) ?
      (ref($hash->{$tag})  eq 'ARRAY'     ?
	[@{$hash->{$tag}}, $inhash]       :
	[  $hash->{$tag},  $inhash]  )    : $inhash;
  }
}

1;
__END__

=pod

=head1 NAME

XML::RSSLite - lightweight, "relaxed" RSS (and XML-ish) parser

=head1 SYNOPSIS

  use XML::RSSLite;

  parseRSS(\%result, \$content);

  print "=== Channel ===\n",
        "Title: $result{'title'}\n",
        "Desc:  $result{'description'}\n",
        "Link:  $result{'link'}\n\n";

  foreach $item (@{$result{'items'}}) {
  print "  --- Item ---\n",
        "  Title: $item->{'title'}\n",
        "  Desc:  $item->{'description'}\n",
        "  Link:  $item->{'link'}\n\n";
  }

=head1 DESCRIPTION

This module attempts to extract the maximum amount of content from
available documents, and is less concerned with XML compliance than
alternatives. Rather than rely on XML::Parser, it uses heuristics and good
old-fashioned Perl regular expressions. It stores the data in a simple
hash structure, and "aliases" certain tags so that when done, you can
count on having the minimal data necessary for re-constructing a valid
RSS file. This means you get the basic title, description, and link for a
channel and its items.

This module extracts more usable links by parsing "scriptingNews" and
"weblog" formats in addition to RDF & RSS. It also "sanitizes" the
output for best results. The munging includes:

=over

=item Remove html tags to leave plain text

=item Remove leading whitespace from URIs

=item By defaul strips characters except 0-9~!@#$%^&*()-+=a-zA-Z[];',.:"<>?\s

=item Use <url> tags when <link> is empty

=item Use misplaced urls in <title> when <link> is empty 

=item Exract links from <a href=...> if required   

=item Limit links to ftp and http(s)

=item Join relative item urls (beginning with / or #) to the site base

=back

=head2 EXPORT

=over

=item parseRSS($outHashRef, $inScalarRef, [$strip])

=over

=item inScalarRef - required

Reference to a scalar containing the document to be parsed. NOTE: The
contents will effectively be destroyed. Make a deep copy first if you care.

=item outHashRef - required

Reference to the hash within which to store the parsed content.

=item strip - optional

An expression indicating the level of winnowing to be performed on the
characters permitted in the results.

=over

=item 1 strip non-printable characters

=item 0 no characters are removed

=item undefined (Default) strip everything but:

0-9~!@#$%^&*()-+= a-zA-Z[];',.:"<>?\t\n

=back

=back

=back

=head2 EXPORTABLE

=over

=item parseXML(\%parsedTree, \$parseThis, 'topTag', $comments);

=over

=item parsedTree - required

Reference to hash to store the parsed document within.

=item parseThis  - required

Reference to scalar containing the document to parse.

=item topTag     - optional

Tag to consider the root node, leaving this undefined is not recommended.

=item comments   - optional

=over

=item false will remove contents from parseThis

=item true will not remove comments from parseThis

=item array reference is true, comments are stored here

=back

=back

=back

=head2 CAVEATS

This is not a conforming parser. It does not handle the following

=over

=item

  <foo bar=">">

=item

  <foo><bar> <bar></bar> <bar></bar> </bar></foo>

=item

  <![CDATA[ ]]>

=item

  PI

=back

It's non-validating, without a DTD the following cannot be properly addressed

=over

=item entities

=item namespaces

This may or may not be arriving in some future release.

=back

=back

=head1 SEE ALSO

perl(1), C<XML::RSS>, C<XML::SAX::PurePerl>,
C<XML::Parser::Lite>, <XML::Parser>

=head1 AUTHOR

Jerrad Pierce <jpierce@cpan.org>.

Scott Thomason <scott@thomasons.org>

=head1 LICENSE

Portions Copyright (c) 2002,2003,2009 Jerrad Pierce, (c) 2000 Scott Thomason.
All rights reserved. This program is free software; you can redistribute it 
and/or modify it under the same terms as Perl itself.

=cut