NAME
WWW::Scraper::Lite
VERSION
$LastChangedRevision: 15 $
SYNOPSIS
my $scraper = WWW::Scraper::Lite->new();
$scraper ->crawl( $domain ,
{
'//a' => sub {
my ( $scraper , $nodes ) = @_ ;
$scraper ->enqueue( grep { $_ =~ m{^$domain} }
map { $scraper ->url_remove_anchor( $_ ) }
map { $scraper ->url_make_absolute( $_ ) }
map { $_ ->{href} }
@{ $nodes });
},
'/*' => sub {
my ( $scraper , $nodes ) = @_ ;
print $scraper ->{current}->{response}->content;
},
}
);
|
DESCRIPTION
SUBROUTINES/METHODS
new - constructor, initialises fetch-queue and seen-URL hash
my $oScraper = WWW::Scraper::Lite->new();
|
ua - new/cached LWP::UserAgent object
my $oUA = $oScraper ->ua();
|
crawl - start crawling a given URL with a given set of XPath callbacks
$oScraper ->crawl( $sStartURL , $hrCallbacks );
|
enqueue - push one or more URLs onto the fetch queue
$oScraper ->enqueue( @aURLs );
|
dequeue - shift a URL off the fetch queue
my $sURL = $oScraper ->dequeue();
|
current - a hashref containing information on the current page
my $hrCurrentData = $oScraper ->current;
|
url_remove_anchor - strip '#anchor' text from a URL string
my $sURLout = $oScraper ->url_remove_anchor( $sURLin );
|
url_make_absolute - add the current domain to a URL to make it absolute
my $sURLout = $oScraper ->url_make_absolute( $sURLin );
|
DIAGNOSTICS
CONFIGURATION AND ENVIRONMENT
DEPENDENCIES
- strict
-
- warnings
-
- LWP::UserAgent
-
- HTML::TreeBuilder::XPath
-
- Carp
-
INCOMPATIBILITIES
BUGS AND LIMITATIONS
AUTHOR
$Author: Roger Pettett,,,$
LICENSE AND COPYRIGHT
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.