# $File: //depot/OurNet-Query/Query.pm $ $Author: autrijus $
# $Revision: #4 $ $Change: 1925 $ $DateTime: 2001/09/28 15:12:40 $
package OurNet::Query;
require 5.005;
$OurNet::Query::VERSION = '1.56';
use strict;
use OurNet::Site;
use HTTP::Request::Common;
use LWP::Parallel::UserAgent;
=head1 NAME
OurNet::Query - Scriptable queries with template extraction
=head1 SYNOPSIS
use OurNet::Query;
# Set query parameters
my ($query, $hits) = ('autrijus', 10);
my @sites = ('google', 'google'); # XXX: write more templates!
my %found;
# Generate a new Query object
my $bot = OurNet::Query->new($query, $hits, @sites);
# Perform a query
my $found = $bot->begin(\&callback, 30); # Timeout after 30 seconds
print '*** ' . ($found ? $found : 'No') . ' match(es) found.';
sub callback {
my %entry = @_;
my $entry = \%entry;
unless ($found{$entry{url}}) {
print "*** [$entry->{title}]" .
" ($entry->{score})" .
" - [$entry->{id}]\n" .
" URL: [$entry->{url}]\n";
}
$found{$entry{url}}++;
}
=head1 DESCRIPTION
This module provides an easy interface to perform multiple queries
to internet services, and I<wraps> them into your own format at once.
The results are processed on-the-fly and are returned via callback
functions.
Its interface resembles that of I<WWW::Search>, but implements it
in a different fashion. While I<WWW::Search> relies on additional
subclasses to parse returned results, I<OurNet::Query> uses I<site
descriptors> for each search engine, which makes it much easier
to add new backends.
Site descriptors may be written in XML, I<Template> toolkit format,
or the I<.fmt> format from the commercial Inforia Quest product.
=head1 CAVEATS
The only confirmed, working site descriptor currently is F<google.tt2>.
The majority of F<*.xml> descriptors are outdated, and need volunteers
to either correct them, or convert them to C<.tt2> format.
This package is supposed to I<magically> turn your web pages built
with Template Toolkit into web services overnight, using I<diff>-based
induction heuristics; but this is not happening yet. Stay tuned.
There should be instructions on how to write templates in various
formats.
=head1 COMPONENTS
Most Query Toolkit components are independently useful; they rely on
several front-end interfaces to glue themselves together.
=head2 Full-Text Search Engine (FuzzyIndex)
The indexing module I<MUST> implement an indexing mechanism suitable
to handle variable-byte encoding charsets, e.g. big-5 or utf8. Its
index file I<SHOULD NOT> require the original data to be present, nor
exceed the original data size on average.
=head2 Interactive Queries (ChatBot)
The interactive query module I<MUST> accept context-free queries against
any indexed database generated by the Search Engine, and provide feedback
based on the entries contained within. It I<MUST> develop a heuristic to
accumulate user input, and build connections between entries based on
relevancy.
=head2 Template Extraction (Template::Extract)
This component I<MUST> support the C<Template(3)> Toolkit format, and
I<MAY> support additional template formats. It I<MUST> be capable of
taking a document and the original template used to generate it,
and produce the original parameter list.
All simple assignment and loop directives I<MUST> be supported; it
I<SHOULD> also accept nested loops and structure elements.
=head2 Site Descriptors (Site)
This includes a collection of oft-used web sites, akin to the
C<WWW::Search> or Inforia Quest collection. It I<SHOULD> also support
basic validation and variable interpolation within the descriptors.
=head2 Template Generation (Template::Generate)
This module I<MUST> be able to generate the original template, based
on two or more distinct outputs. It I<SHOULD> operate without any
prompt of original structures, but I<MAY> draw on such information to
increase its accuracy.
=head2 Front-End Interface (bin/*)
All above components I<MUST> come with at least one command-line
utility, capable of exporting most of their functions to the normal
user. The utilities I<SHOULD> assume a common look-and-feel.
=head2 Documentation (pod/*)
The Query Toolkit Manual I<MUST> contain a tutorial, an overview
of functions, and guides on how to embed Query components into
existing programs.
=head1 MILESTONES
=head2 Milestone 0 - v1.56 - 2001/09/01
This milestone represents the raw, unconnected state of all tools.
It provides all basic functionalities except for template generation,
yet offers only fzindex / fzquery as useful user-accessible interfaces.
FuzzyIndex big-5 & latin-1 support
ChatBot automatic building of default database
T::Extract template toolkit support; nested fetch
Site google (as proof-of-concept)
bin/* all above interfaces
pod/* overview of functions
=head2 Milestone 1 - v1.6 - 2001/10/15
This milestone aims to export a consistent interface to other developers,
by populating the missing descriptor and documents.
FuzzyIndex gb-1312 support
Site all major search engines and news sources
T::Generate simple diff-based heuristic framework
bin/* a parallel, configurable sitequery coupled with fzindex
pod/* embed-howto, including win32 COM+ port
=head2 Milestone 2 - v1.7 - 2002/01/01
This milestone will be the first feature-complete release of Query Toolkit,
capable of being used in a more diverse environment.
=cut
# ---------------
# Variable Fields
# ---------------
# Attribute slots of an OurNet::Query object, as used in this file:
#   callback - code reference invoked once per result entry (see callmeback)
#   pua      - the LWP::Parallel::UserAgent instance created in new()
#   timeout  - value handed to the user agent's wait() in begin()
#   query    - the query passed to each site's geturl()
#   sites    - array reference of the site arguments given to new()
#   bots     - one OurNet::Site object per entry in 'sites', built in begin()
#   hits     - the hit count passed to each site's geturl()
#   found    - running count of entries reported to the callback
use fields qw/callback pua timeout query sites bots hits found/;
# -----------------
# Package Constants
# -----------------
# Diagnostic messages passed to warn() by new() and begin(); each one is
# prefixed with the package name.
use constant ERROR_QUERY_NEEDED => __PACKAGE__ . ' needs a query';
use constant ERROR_HITS_NEEDED => __PACKAGE__ . ' needs sufficient hits';
use constant ERROR_SITES_NEEDED => __PACKAGE__ . ' needs one or more sites';
use constant ERROR_CALLBACK_NEEDED => __PACKAGE__ . ' needs a callback sub';
use constant ERROR_PROTOCOL_UNDEF => __PACKAGE__ . ' cannot use the protocol';
# -------------------------------------
# Subroutine new($query, $hits, @sites)
# -------------------------------------
# Constructor.
#
#   $query - the query to send to every site; must be a true value
#   $hits  - the number of hits to ask for; must be a true value
#   @sites - one or more site arguments, later handed one by one to
#            OurNet::Site->new() in begin()
#
# Returns the new object, or nothing (after emitting a warning) when any
# of the three arguments is missing.
sub new {
    my $class = shift;

    # On perls newer than 5.005_62 let fields::new() build the object;
    # otherwise construct the pseudo-hash by hand from the class's
    # %FIELDS table.
    my $self = ($] > 5.00562) ? fields::new($class)
                              : do { no strict 'refs';
                                     bless [\%{"$class\::FIELDS"}], $class };

    $self->{query} = shift or (warn(ERROR_QUERY_NEEDED), return);
    $self->{hits}  = shift or (warn(ERROR_HITS_NEEDED),  return);

    # An anonymous array reference is always true, even when empty, so the
    # remaining argument list itself has to be tested to notice that no
    # site was supplied.
    @_ or (warn(ERROR_SITES_NEEDED), return);
    $self->{sites} = [ @_ ];

    $self->{pua} = LWP::Parallel::UserAgent->new;

    return $self;
}
# ---------------------------------------------
# Subroutine begin($self, \&callback, $timeout)
# ---------------------------------------------
# Builds one request per configured site, registers them all with the
# parallel user agent, and waits for the responses.
#
#   \&callback - code reference called for each result entry (through
#                callmeback) with the entry as a flat key/value list;
#                when omitted, a callback stored by an earlier call is reused
#   $timeout   - passed to the user agent's wait(); when omitted, a
#                previously stored timeout is reused
#
# Returns the number of entries reported to the callback, or nothing
# (after a warning) if no callback is available.
sub begin {
    my $self = shift;

    $self->{callback} = ($_[0] ? $_[0] : $self->{callback})
        or (warn(ERROR_CALLBACK_NEEDED), return);
    $self->{timeout} = ($_[1] ? $_[1] : $self->{timeout});

    $self->{pua}->initialize;

    foreach my $count (0 .. $#{$self->{sites}}) {
        $self->{bots}[$count] = OurNet::Site->new(
            $self->{sites}[$count]
        );

        my $siteurl = $self->{bots}[$count]->geturl(
            $self->{query}, $self->{hits}
        );

        # A URL of the form "post:<location>?<params>" is sent as an HTTP
        # POST to "http:<location>"; anything else is fetched with GET.
        my $request;
        if ($siteurl =~ m|^post:([^\?]+)\?(.+)|) {
            my ($target, $params) = ($1, $2);

            # Parse the parameters pair by pair.  Splitting the whole string
            # on [&;=] at once drops trailing empty values and breaks values
            # that contain '=', which shifts every later key/value out of
            # alignment.
            my @form = map {
                my ($key, $value) = split(/=/, $_, 2);
                ($key, defined($value) ? $value : '');
            } grep { length } split(/[&;]/, $params);

            $request = POST("http:$target", \@form);
        }
        else {
            $request = GET($siteurl);
        }

        # Closure is not something that most Perl programmers need
        # trouble themselves about to begin with. (perlref.pod)
        #
        # The handler forwards its first argument (presumably the received
        # content -- confirm against LWP::Parallel::UserAgent) to the
        # site object, together with callmeback as the reporting hook.
        my $failure = $self->{pua}->register($request, sub {
            $self->{bots}[$count]->callme($self, $count,
                                          $_[0], \&callmeback);
            return;
        });

        # register() hands back an error response when it cannot accept the
        # request and a false value otherwise.  Report the failure but keep
        # going so that the remaining sites are still queried.
        warn(ERROR_PROTOCOL_UNDEF) if $failure;
    }

    $self->{found} = 0;
    $self->{pua}->wait($self->{timeout});

    return $self->{found};
}
# --------------------------------------
# Subroutine callmeback($self, $himself)
# --------------------------------------
# Reporting hook: walks the entries in the second argument's 'response'
# list and, for every entry that still carries a 'url' key, calls the
# query object's callback with the entry as a flat key/value list.  The
# 'url' key is removed afterwards so that the same entry is not reported
# again on a later call, and the query's 'found' counter is incremented.
sub callmeback {
    my ($query, $site) = @_;

    for my $entry (@{ $site->{response} }) {
        # Entries without a URL are skipped.
        next unless exists $entry->{url};

        $query->{callback}->(%$entry);
        delete $entry->{url};
        $query->{found}++;
    }
}
1;
=head1 SEE ALSO
L<OurNet::Site>
=head1 AUTHORS
Autrijus Tang E<lt>autrijus@autrijus.org>
=head1 COPYRIGHT
Copyright 2001 by Autrijus Tang E<lt>autrijus@autrijus.org>.
This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
See L<http://www.perl.com/perl/misc/Artistic.html>
=cut