# $File: //depot/OurNet-Query/Query.pm $ $Author: autrijus $
# $Revision: #4 $ $Change: 1925 $ $DateTime: 2001/09/28 15:12:40 $

package OurNet::Query;
require 5.005;

$OurNet::Query::VERSION = '1.56';

use strict;

use OurNet::Site;
use HTTP::Request::Common;
use LWP::Parallel::UserAgent;

=head1 NAME

OurNet::Query - Scriptable queries with template extraction

=head1 SYNOPSIS

    use OurNet::Query;

    # Set query parameters
    my ($query, $hits) = ('autrijus', 10);
    my @sites = ('google', 'google'); # XXX: write more templates!
    my %found;

    # Generate a new Query object
    my $bot = OurNet::Query->new($query, $hits, @sites);

    # Perform a query
    my $found = $bot->begin(\&callback, 30); # Timeout after 30 seconds

    print '*** ' . ($found ? $found : 'No') . ' match(es) found.';

    sub callback {
        my %entry = @_;
        my $entry = \%entry;

        unless ($found{$entry{url}}) {
            print "*** [$entry->{title}]" .
                     " ($entry->{score})" .
                   " - [$entry->{id}]\n"  .
             "    URL: [$entry->{url}]\n";
        }

        $found{$entry{url}}++;
    }

=head1 DESCRIPTION

This module provides an easy interface to perform multiple queries
to internet services, and I<wraps> them into your own format at once.
The results are processed on-the-fly and are returned via callback
functions.

Its interface resembles that of I<WWW::Search>, but implements it
in a different fashion. While I<WWW::Search> relies on additional
subclasses to parse returned results, I<OurNet::Query> uses I<site
descriptors> for each search engine, which makes it much easier
to add new backends.

Site descriptors may be written in XML, I<Template> toolkit format,
or the I<.fmt> format from the commercial Inforia Quest product.

=head1 CAVEATS

The only confirmed, working site descriptor currently is F<google.tt2>.
The majority of F<*.xml> descriptors are outdated, and need volunteers
to either correct them, or convert them to C<.tt2> format.

This package is supposed to I<magically> turn your web pages built
with Template Toolkit into web services overnight, using I<diff>-based
induction heuristics; but this is not happening yet. Stay tuned.

There should be instructions on how to write templates in various
formats.

=head1 COMPONENTS

Most Query Toolkit components are independently useful; they rely on
several front-end interfaces to glue themselves together. 

=head2 Full-Text Search Engine (FuzzyIndex)

The indexing module I<MUST> implement an indexing mechanism suitable
for handling variable-byte encoding charsets, e.g. big-5 or utf8. Its
index file I<SHOULD NOT> require the original data to be present, nor
exceed the original data size on average.

=head2 Interactive Queries (ChatBot)

The interactive query module I<MUST> accept context-free queries against
any indexed database generated by the Search Engine, and provide feedback
based on the entries contained within. It I<MUST> develop a heuristic to
accumulate user input, and build connections between entries based on
relevancy.

=head2 Template Extraction (Template::Extract)

This component I<MUST> support the C<Template(3)> Toolkit format, and
I<MAY> support additional template formats. It I<MUST> be capable of
taking a document and the original template used to generate it,
and produce the original parameter list.

All simple assignment and loop directives I<MUST> be supported; it
I<SHOULD> also accept nested loops and structure elements.

=head2 Site Descriptors (Site)

This includes a collection of oft-used web sites, akin to the
C<WWW::Search> or Inforia Quest collection. It I<SHOULD> also support
basic validation and variable interpolation within the descriptors.

=head2 Template Generation (Template::Generate)

This module I<MUST> be able to generate the original template, based
on two or more distinct outputs. It I<SHOULD> operate without any
prompt of original structures, but I<MAY> draw on such information to
increase its accuracy.

=head2 Front-End Interface (bin/*)

All above components I<MUST> come with at least one command-line
utility, capable of exporting most of their functions to the normal
user. The utilities I<SHOULD> assume a common look-and-feel.

=head2 Documentation (pod/*)

The Query Toolkit Manual I<MUST> contain a tutorial, an overview
of functions, and guides on how to embed Query components into
existing programs.

=head1 MILESTONES

=head2 Milestone 0 - v1.56 - 2001/09/01

This milestone represents the raw, unconnected state of all tools.
It provides all basic functionalities except for template generation,
yet offers only fzindex / fzquery as useful user-accessible interfaces.

    FuzzyIndex	big-5 & latin-1 support
    ChatBot	automatic building of default database 
    T::Extract	template toolkit support; nested fetch
    Site	google (as proof-of-concept)
    bin/*	all above interfaces
    pod/*	overview of functions

=head2 Milestone 1 - v1.6 - 2001/10/15

This milestone aims to export a consistent interface to other developers,
by populating the missing descriptors and documentation.

    FuzzyIndex	gb-1312 support
    Site	all major search engines and news sources
    T::Generate	simple diff-based heuristic framework
    bin/*	a parallel, configurable sitequery coupled with fzindex
    pod/*	embed-howto, including win32 COM+ port

=head2 Milestone 2 - v1.7 - 2002/01/01

This milestone will be the first feature-complete release of Query Toolkit,
capable of being used in a more diverse environment.

=cut

# ---------------
# Variable Fields
# ---------------
# Attribute names of a query object, declared through the 'fields' pragma:
#   callback - code reference invoked by callmeback() for each result entry
#   pua      - the LWP::Parallel::UserAgent instance created in new()
#   timeout  - value handed to the user agent's wait() in begin()
#   query    - query given to new(); passed to each site's geturl()
#   sites    - array reference holding the sites given to new()
#   bots     - OurNet::Site objects, one per entry in 'sites'
#   hits     - hit count given to new(); passed to each site's geturl()
#   found    - number of entries reported to the callback; reset by begin()
use fields qw/callback pua timeout query sites bots hits found/;

# -----------------
# Package Constants
# -----------------
# Warning messages emitted by new() and begin(); each one is prefixed
# with this package's name.
use constant ERROR_QUERY_NEEDED    => __PACKAGE__ . ' needs a query';
use constant ERROR_HITS_NEEDED     => __PACKAGE__ . ' needs sufficient hits';
use constant ERROR_SITES_NEEDED    => __PACKAGE__ . ' needs one or more sites';
use constant ERROR_CALLBACK_NEEDED => __PACKAGE__ . ' needs a callback sub';
use constant ERROR_PROTOCOL_UNDEF  => __PACKAGE__ . ' cannot use the protocol';

# -------------------------------------
# Subroutine new($query, $hits, @sites)
# -------------------------------------
# Constructs a query object.
#   $query - the query; must be a true value
#   $hits  - hit count handed to each site; must be a true value
#   @sites - one or more sites, each later passed to OurNet::Site->new
# Returns the new object. If an argument is missing, emits the matching
# ERROR_*_NEEDED warning and returns nothing (undef in scalar context).
sub new {
    my $class = shift;

    # Perls newer than 5.005_62 get the object from fields::new(); older
    # ones bless a hand-built pseudo-hash around the class's %FIELDS table.
    my $self  = ($] > 5.00562) ? fields::new($class)
                               : do { no strict 'refs';
                                      bless [\%{"$class\::FIELDS"}], $class };

    $self->{query} = shift  or (warn(ERROR_QUERY_NEEDED), return);
    $self->{hits}  = shift  or (warn(ERROR_HITS_NEEDED),  return);

    # An array reference is always true, so the remaining argument list
    # itself must be tested to notice that no site was supplied.
    unless (@_) {
        warn(ERROR_SITES_NEEDED);
        return;
    }

    $self->{sites} = [ @_ ];
    $self->{pua}   = LWP::Parallel::UserAgent->new;

    return $self;
}

# ---------------------------------------------
# Subroutine begin($self, \&callback, $timeout)
# ---------------------------------------------
# Sends the query to every configured site through the parallel user agent
# and waits for the responses.
#   \&callback - code reference that receives each result entry as a flat
#                key/value list; optional when an earlier call stored one
#   $timeout   - handed to the user agent's wait(); when omitted, the
#                previously stored value is reused
# Returns the number of entries reported to the callback, or nothing (after
# an ERROR_CALLBACK_NEEDED warning) if no callback is available. A site
# whose request the user agent refuses to register is skipped with an
# ERROR_PROTOCOL_UNDEF warning; the remaining sites are still queried.
sub begin {
    my $self = shift;

    $self->{callback} = ($_[0] ? $_[0] : $self->{callback})
        or (warn(ERROR_CALLBACK_NEEDED), return);
    $self->{timeout}  = ($_[1] ? $_[1] : $self->{timeout});
    $self->{pua}->initialize;

    foreach my $count (0 .. $#{$self->{sites}}) {
        $self->{bots}[$count] = OurNet::Site->new(
            $self->{sites}[$count]
        );

        my $siteurl = $self->{bots}[$count]->geturl(
            $self->{query}, $self->{hits}
        );

        # A URL of the form "post:<location>?<form data>" becomes an HTTP
        # POST to "http:<location>"; anything else is fetched with GET.
        my $request;
        if ($siteurl =~ m|^post:([^\?]+)\?(.+)|) {
            my ($target, $form) = ($1, $2);

            # Split into pairs first and only then at the first '=' of
            # each pair, so that empty values and values containing '='
            # cannot shift the key/value alignment.
            my @fields = map {
                my ($key, $value) = split(/=/, $_, 2);
                ($key, defined($value) ? $value : '');
            } grep { length } split(/[&;]/, $form);

            $request = POST("http:$target", [ @fields ]);
        }
        else {
            $request = GET($siteurl);
        }

        # Closure is not something that most Perl programmers need
        # trouble themselves about to begin with. (perlref.pod)
        # The handler closes over $self and $count so that incoming data
        # is dispatched to the bot that belongs to this request.
        my $failure = $self->{pua}->register($request, sub {
            $self->{bots}[$count]->callme($self, $count,
                                            $_[0], \&callmeback);
            return;
        });

        # register() hands back an error response when it cannot accept
        # the request and nothing on success; POST()/GET() themselves
        # always return an object, so this is where a failure shows up.
        if ($failure) {
            warn(ERROR_PROTOCOL_UNDEF);
            next;
        }
    }

    $self->{found} = 0;
    $self->{pua}->wait($self->{timeout});

    return $self->{found};
}

# --------------------------------------
# Subroutine callmeback($self, $himself)
# --------------------------------------
# Walks the entries in $himself->{response} and passes every entry that
# still has a 'url' key to the stored callback as a flat key/value list.
# The 'url' key is removed afterwards so the same entry is not reported
# again on a later call, and $self->{found} is incremented per report.
sub callmeback {
    my ($self, $himself) = @_;

    for my $hit (@{ $himself->{response} }) {
        # Entries without a url were either reported already or are not
        # reportable; leave them alone.
        next unless exists $hit->{url};

        $self->{callback}->(%$hit);
        delete $hit->{url};

        $self->{found}++;
    }
}

1;

=head1 SEE ALSO

L<OurNet::Site>

=head1 AUTHORS

Autrijus Tang E<lt>autrijus@autrijus.org>

=head1 COPYRIGHT

Copyright 2001 by Autrijus Tang E<lt>autrijus@autrijus.org>.

This program is free software; you can redistribute it and/or 
modify it under the same terms as Perl itself.

See L<http://www.perl.com/perl/misc/Artistic.html>

=cut