NAME

FEAR::API - There's no fear with this elegant site scraper

DESCRIPTION

FEAR::API is a tool that helps you cut down the time spent writing site-scraping scripts, and lets you do so in a much more elegant way. It combines strong and powerful features from various CPAN modules, such as LWP::UserAgent, WWW::Mechanize, Template::Extract, Encode, HTML::Parser, etc., and digests them into a deeper Zen.
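
For a feel of what gets condensed, here is roughly the same fetch-and-print step written directly against one of the modules named above, WWW::Mechanize; the URL is only a placeholder.

    use strict;
    use warnings;
    use WWW::Mechanize;

    # Plain WWW::Mechanize: build an agent, fetch a page, print its body.
    my $mech = WWW::Mechanize->new( autocheck => 1 );
    $mech->get("http://www.google.com/");
    print $mech->content;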

More documentation will come sooooooner or later.

EXAMPLES

    use FEAR::API -base;
    

Fetch a page and store it in a scalar

    fetch("google.com") > my $content;
    
    my $content = fetch("google.com")->document->as_string;

Fetch a page and print to STDOUT

    getprint("google.com");
   
    print fetch("google.com")->document->as_string;

    fetch("google.com");
    print $$_;    

    fetch("google.com") | _print;

Fetch a page and save it to a file

    getstore("google.com");

    url("google.com")->() | _save_as("google.html");
    
    fetch("google.com") | io('google.html');
    url("google.com")->() >> _self;
    &$_ while $_;
    (url("google.com")->() >> _self) | _save_as_tree("./root");
    $_->() | _save_as_tree("./root") while $_;

Recursively get web pages from Google

    url("google.com")->() >> _self;
    &$_ >> _self while $_;

Recursively get web pages from Google

    (url("google.com")->() >> _self) | _save_as_tree("./root");
    while($_){
      (&$_ | _save_as_tree("./root")) >> _self;
    }
    url("google.com")->()->follow_link(n => 2);
    print Dumper fetch("google.com")->links;
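
The ">> _self" idiom in the recursive examples above feeds harvested links back into the agent's own queue, which is what drives the crawl. Here is a rough plain-Perl sketch of the same breadth-first idea using WWW::Mechanize directly; the explicit queue, the %seen hash, and the page limit are assumptions of this illustration, not FEAR::API internals.

    use strict;
    use warnings;
    use WWW::Mechanize;

    my $mech  = WWW::Mechanize->new( autocheck => 0 );
    my @queue = ("http://www.google.com/");
    my (%seen, $count);

    while (@queue and ++$count <= 50) {               # stop after 50 pages
        my $url = shift @queue;
        next if $seen{$url}++;
        my $res = $mech->get($url);
        next unless $res and $res->is_success and $mech->is_html;
        # Queue every absolute link found on the page.
        push @queue, map { $_->url_abs->as_string } $mech->links;
    }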

Submit a query to Google

    url("google.com")->();
    submit_form(
                form_number => 1,
                fields => { q => "Kill Bush" }
                );
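
submit_form() here takes the same arguments as WWW::Mechanize's method of the same name, one of the modules the DESCRIPTION lists; a direct Mechanize version of this query would be:

    use WWW::Mechanize;

    my $mech = WWW::Mechanize->new;
    $mech->get("http://www.google.com/");
    $mech->submit_form(
        form_number => 1,
        fields      => { q => "Kill Bush" },
    );
    print $mech->content;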
    url("[% FOREACH i = ['a'..'z'] %]
         http://some.site/[% i %]
         [% END %]");
    &$_ while $_;
    

Get pages in parallel

    url("google.com")->() >> _self;
    pfetch(sub{
               local $_ = shift;
               print join q/ /, title, current_url, document->size, $/;
           });
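
pfetch() takes care of the forking and fetching and hands each finished page to the callback. Purely to illustrate the idea of fetching in parallel (this says nothing about how FEAR::API implements it), a hand-rolled version with Parallel::ForkManager and LWP::Simple might look like:

    use strict;
    use warnings;
    use Parallel::ForkManager;
    use LWP::Simple qw(get);

    my @urls = map { "http://some.site/$_" } 'a' .. 'e';   # placeholder URLs
    my $pm   = Parallel::ForkManager->new(5);              # up to 5 workers

    for my $url (@urls) {
        $pm->start and next;          # parent: move on; child: fetch one URL
        my $doc = get($url);
        printf "%s %d bytes\n", $url, defined $doc ? length $doc : 0;
        $pm->finish;
    }
    $pm->wait_all_children;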

Minimal: dispatch links with an array ref of rules

    url("google.com")->()
      >> [
          qr(^http:) => _self,
          qr(google) => \my @l,
          qr(google) => sub {  print ">>>".$_->[0],$/ }
         ];
    $_->() while $_;
    print Dumper \@l;

Verbose: the same rules via report_links

    fetch("http://google.com")
    ->report_links(
                   qr(^http:) => _self,
                   qr(google) => \my @l,
                   qr(google) => sub {  print ">>>".$_->[0],$/ }
                  );
    fetch while has_more_urls;
    print Dumper \@l;

Minimal: dispatch links with a hash ref of rules

    url("google.com")->()
      >> {
          qr(^http:) => _self,
          qr(google) => \my @l,
          qr(google) => sub {  print ">>>".$_->[0],$/ }
         };
    $_->() while $_;
    print Dumper \@l;

Verbose: the same rules with fallthrough_report enabled

    fetch("http://google.com")
    ->fallthrough_report(1)
    ->report_links(
                   qr(^http:) => _self,
                   qr(google) => \my @l,
                   qr(google) => sub {  print ">>>".$_->[0],$/ }
                  );
    fetch while has_more_urls;
    print Dumper \@l;
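
In both the minimal and verbose forms each rule pairs a regex with a target: the agent itself (_self), an array to collect matches into, or a callback. As a rough plain-Perl illustration of that classify-links-by-regex idea (not of FEAR::API's dispatch internals):

    use WWW::Mechanize;

    my $mech = WWW::Mechanize->new;
    $mech->get("http://www.google.com/");

    my (@to_follow, @collected);
    for my $link ($mech->links) {
        my $url = $link->url_abs->as_string;
        push @to_follow, $url if $url =~ /^http:/;    # would go back to the agent
        push @collected, $url if $url =~ /google/;    # collected into an array
        print ">>>$url\n"     if $url =~ /google/;    # handled by a callback
    }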

Extraction

Extract data from CPAN

    url("http://search.cpan.org/recent")->();
    submit_form(
            form_name => "f",
            fields => {
                       query => "perl"
                      });
    template("<!--item-->[% p %]<!--end item-->");
    extract;
    print Dumper extresult;
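
template() and extract() build on Template::Extract, listed in the DESCRIPTION; the equivalent direct usage is roughly the following, with the page source assumed to be in $html:

    use Template::Extract;
    use Data::Dumper;

    my $html   = '...';   # page source fetched elsewhere
    my $ext    = Template::Extract->new;
    my $result = $ext->extract("<!--item-->[% p %]<!--end item-->", $html);
    print Dumper $result; # hashref keyed by the template variables, or undef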

Extract data from CPAN after some HTML cleanup

    url("http://search.cpan.org/recent")->();
    submit_form(
            form_name => "f",
            fields => {
                       query => "perl"
                      });
    preproc(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s));
    print document->as_string;    # print content to STDOUT
    template("<!--item-->[% p %]<!--end item-->");
    extract;
    print Dumper extresult;

HTML cleanup, extract data, and refine results

    url("http://search.cpan.org/recent")->();
    submit_form(
            form_name => "f",
            fields => {
                       query => "perl"
                      });
    preproc(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s));
    print $$_;    # print content to STDOUT
    template("<!--item-->[% rec %]<!--end item-->");
    extract;
    postproc(q($_->{rec} =~ s/<.+?>//g));     # Strip HTML tags
    print Dumper extresult;

Use filtering syntax

    fetch("http://search.cpan.org/recent");
    submit_form(
                form_name => "f",
                fields => {
                           query => "perl"
                })
       | _doc_filter(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s))
       | _template("<!--item-->[% rec %]<!--end item-->")
       | _result_filter(q($_->{rec} =~ s/<.+?>//g));
    print Dumper \@$_;

Invoke handler for extracted results

    fetch("http://search.cpan.org/recent");
    submit_form(
                form_name => "f",
                fields => {
                           query => "perl"
                })
       | _doc_filter(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s))
       | "<!--item-->[% rec %]<!--end item-->"
       | _result_filter(q($_->{rec} =~ s/<.+?>//g));
    invoke_handler('Data::Dumper');

Preprocess document

    url("google.com")->()
    | _preproc(use => "html_to_null")
    | _preproc(use => "decode_entities")
    | _print;
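
The decode_entities filter shares its name with HTML::Entities' function; html_to_null presumably removes markup, and the naive regex standing in for it below is only an assumption of this sketch:

    use HTML::Entities qw(decode_entities);

    my $doc = '<b>Tom &amp; Jerry</b>';
    $doc =~ s/<[^>]*>//g;            # crude stand-in for html_to_null
    print decode_entities($doc);     # prints "Tom & Jerry"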

Postprocess extraction results

    fetch("http://search.cpan.org/recent");
    submit_form(
                form_name => "f",
                fields => {
                           query => "perl"
                })
       | _doc_filter(q(s/\A.+<!--results-->(.+)<!--end results-->.+\Z/$1/s))
       | _template("<!--item-->[% rec %]<!--end item-->")
       | _result_filter(use => "html_to_null",    qw(rec))
       | _result_filter(use => "decode_entities", qw(rec));
    print Dumper \@$_;

AUTHOR & COPYRIGHT

Copyright (C) 2006 by Yung-chung Lin (a.k.a. xern) <xern@cpan.org>

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.