The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.

NAME

KinoSearch::Docs::Tutorial - sample indexing and search applications

DESCRIPTION

The following sample code for invindexer.plx and search.cgi can be used to create a simple search engine. It requires the HTML presentation of the US Constitution that is included in the KinoSearch distribution, under t/us_constitution.

Taint mode is enabled on these scripts by default, since both indexers and search scripts typically handle untrusted data -- see perlsec for an explanation of why you really, really shouldn't turn it off. You'll either have to set the script as an executable and run it via ./invindexer.plx, or add the -T flag on the command line: perl -T invindexer.plx

Note that a proper indexer for HTML documents would not rely on quick-n-dirty regular expressions for stripping tags, as this one does for the sake of brevity -- it would use a dedicated parsing module such as HTML::Parser.

invindexer.plx

    #!/usr/bin/perl -T
    use strict;
    use warnings;
    
    use File::Spec;
    use KinoSearch::InvIndexer;
    use KinoSearch::Analysis::PolyAnalyzer;
    
    ### In order for invindexer.plx to work correctly, you must modify 
    ### $source_dir, $path_to_invindex, and possibly $base_url.
    ###
    ### $source_dir must lead to the directory containing the US
    ### Constitution html files.
    ###
    ### $path_to_invindex is the future location of the invindex.
    ###
    ### $base_url should reflect the location of the us_constitution directory
    ### when accessed via a web browser.
    my $source_dir       = '';
    my $path_to_invindex = '';
    my $base_url         = '/us_constitution';
    
    ### Collect the names of the html files found in $source_dir.
    ### A lexical directory handle is used so that no global bareword handle
    ### is created, and the pattern is anchored with \z so that only names
    ### which actually end in ".html" are selected.
    opendir( my $constitution_dh, $source_dir )
        or die "Couldn't open directory '$source_dir': $!";
    my @filenames = grep { /\.html\z/ } readdir $constitution_dh;
    closedir $constitution_dh;
    
    ### STEP 1: Choose an Analyzer.
    ### A PolyAnalyzer is constructed with the language code 'en'.
    my $analyzer
        = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
    
    ### STEP 2: Create an InvIndexer object.
    ### The constructor receives the analyzer from step 1, the invindex
    ### location configured above, and create => 1.
    my @invindexer_args = (
        analyzer => $analyzer,
        invindex => $path_to_invindex,
        create   => 1,
    );
    my $invindexer = KinoSearch::InvIndexer->new(@invindexer_args);
    
    ### STEP 3: Define fields.
    ### 'title' and 'bodytext' are specified by name only; 'url' is
    ### additionally given indexed => 0.
    for my $field_name (qw( title bodytext )) {
        $invindexer->spec_field( name => $field_name );
    }
    $invindexer->spec_field( name => 'url', indexed => 0 );
    
    ### Index every html file found in $source_dir, one document per file.
    foreach my $filename (@filenames) {
        # index.html is deliberately left out of the invindex
        next if $filename eq 'index.html';
    
        # slurp the whole file, then release the handle right away
        my $filepath = File::Spec->catfile( $source_dir, $filename );
        open( my $fh, '<', $filepath )
            or die "couldn't open file '$filepath': $!";
        my $content = do { local $/; <$fh> };
        close $fh
            or die "couldn't close file '$filepath': $!";
    
        ### STEP 4: Start a new document.
        my $doc = $invindexer->new_doc;
    
        # isolate the contents of the <title> element
        $content =~ m#<title>(.*?)</title>#s
            or die "couldn't isolate title in '$filepath'";
        my $title = $1;
    
        # isolate the div which is closed by the <!--bodytext--> marker
        $content =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
            or die "couldn't isolate bodytext in '$filepath'";
        my $bodytext = $1;
        $bodytext =~ s/<.*?>/ /gsm;    # quick and dirty tag stripping
    
        ### STEP 5: Set the value for each field.
        $doc->set_value( url      => "$base_url/$filename" );
        $doc->set_value( title    => $title );
        $doc->set_value( bodytext => $bodytext );
    
        ### STEP 6: Add the document to the invindex.
        $invindexer->add_doc($doc);
    
        ### STEP 7: Repeat steps 4-6 for each document in the collection.
    }
    
    ### STEP 8: Finalize the invindex.
    $invindexer->finish;

search.cgi

    #!/usr/bin/perl -T
    use strict;
    use warnings;
    
    use CGI;
    use List::Util qw( max min );
    use POSIX qw( ceil );
    use KinoSearch::Searcher;
    use KinoSearch::Analysis::PolyAnalyzer;
    
    ### Read the request parameters.
    ### 'q' is the query string typed by the user; 'offset' is the number of
    ### the first hit to display.
    my $cgi           = CGI->new;
    my $q             = $cgi->param('q');
    my $offset        = $cgi->param('offset');
    my $hits_per_page = 10;
    $q = '' unless defined $q;
    
    # 'offset' comes straight from the request and is later used in
    # arithmetic, so only a plain non-negative integer is accepted; anything
    # else (missing, empty, negative, non-numeric) falls back to 0.
    $offset = ( defined($offset) && $offset =~ /\A(\d+)\z/ ) ? $1 : 0;
    
    ### Before search.cgi can work, edit the two settings below:
    ### $path_to_invindex has to point at the invindex which invindexer.plx
    ### created, and $base_url may need adjusting so that it matches the
    ### place where a web browser finds the us_constitution directory.
    my $path_to_invindex = '';
    my $base_url         = '/us_constitution';
    
    ### STEP 1: Specify the same Analyzer used to create the invindex.
    my $analyzer
        = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
    
    ### STEP 2: Create a Searcher object.
    my @searcher_args = (
        invindex => $path_to_invindex,
        analyzer => $analyzer,
    );
    my $searcher = KinoSearch::Searcher->new(@searcher_args);
    
    ### STEP 3: Feed a query to the Searcher object.
    my $hits = $searcher->search($q);
    
    ### STEP 4: Process the search.
    ### seek() is given the starting offset and the number of hits wanted.
    $hits->seek( $offset, $hits_per_page );
    
    ### STEP 5: Format the results however you like.
    
    # Build one html fragment per hit, then glue the fragments together.
    my @result_entries;
    while ( my ( $doc, $raw_score ) = $hits->fetch_hit_hashref ) {
        my $shown_score = sprintf( "%0.3f", $raw_score );
    
        # The excerpt is the opening 60-80 characters of the body text plus
        # one word character, ending at a word boundary; it stays empty when
        # the body text does not match.
        my $snippet
            = $doc->{bodytext} =~ /^(.{60,80}\w\b)/xsm
            ? "$1..."
            : '';
    
        push @result_entries, qq|
            <p>
                <a href="$doc->{url}"><strong>$doc->{title}</strong></a>
                <em>$shown_score</em>
                <br>
                $snippet
            </p>
            |;
    }
    my $report = join '', @result_entries;
    
    # From here on $q is only used for display, so make it safe to embed in
    # the page: escapeHTML() turns the HTML metacharacters (&, <, >, ") into
    # entities.  Escaping the double quote alone would let markup typed into
    # the search box reach the browser unaltered.
    $q = $cgi->escapeHTML($q);
    
    # display info about the number of hits, paging links
    my $total_hits = $hits->total_hits;
    my $num_hits_info;
    if ( !length $q ) {
        # no query, no display
        $num_hits_info = '';
    }
    elsif ( $total_hits == 0 ) {
        # alert the user that their search failed
        $num_hits_info = qq|<p>No matches for <strong>$q</strong></p>|;
    }
    else {
        # calculate the nums for the first and last hit to display
        my $last_result = min( ( $offset + $hits_per_page ), $total_hits );
        my $first_result = min( ( $offset + 1 ), $last_result );
    
        # display the result nums, start paging info
        $num_hits_info = qq|
            <p>
                Results <strong>$first_result-$last_result</strong> 
                of <strong>$total_hits</strong> for <strong>$q</strong>.
            </p>
            <p>
                Results Page:
            |;
    
        # calculate first and last hits pages to display / link to:
        # at most 9 pages before and 10 pages after the current page
        my $current_page = int( $first_result / $hits_per_page ) + 1;
        my $last_page    = ceil( $total_hits / $hits_per_page );
        my $first_page   = max( 1, ( $current_page - 9 ) );
        $last_page = min( $last_page, ( $current_page + 10 ) );
    
        # create a url for use in paging links; make sure it carries an
        # offset parameter so that the substitutions below have something
        # to replace
        my $href = $cgi->url( -relative => 1 ) . "?" . $cgi->query_string;
        $href .= ";offset=0" unless $href =~ /offset=/;
    
        # generate the "Prev" link;
        if ( $current_page > 1 ) {
            my $new_offset = ( $current_page - 2 ) * $hits_per_page;
            $href =~ s/(?<=offset=)\d+/$new_offset/;
            $num_hits_info .= qq|<a href="$href">&lt;= Prev</a>\n|;
        }
    
        # generate paging links; the current page is shown as plain text
        for my $page_num ( $first_page .. $last_page ) {
            if ( $page_num == $current_page ) {
                $num_hits_info .= qq|$page_num \n|;
            }
            else {
                my $new_offset = ( $page_num - 1 ) * $hits_per_page;
                $href =~ s/(?<=offset=)\d+/$new_offset/;
                $num_hits_info .= qq|<a href="$href">$page_num</a>\n|;
            }
        }
    
        # generate the "Next" link
        if ( $current_page != $last_page ) {
            my $new_offset = $current_page * $hits_per_page;
            $href =~ s/(?<=offset=)\d+/$new_offset/;
            $num_hits_info .= qq|<a href="$href">Next =&gt;</a>\n|;
        }
    
        # finish paging links
        $num_hits_info .= "</p>\n";
    }
    
    # Send the response in one go: first the CGI header (closed off by an
    # empty line), then the page, into which $base_url, $q, $report and
    # $num_hits_info are interpolated.
    print "Content-type: text/html\n\n", <<END_HTML;
    <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        "http://www.w3.org/TR/html4/loose.dtd">
    <html>
    <head>
        <meta http-equiv="Content-type" 
            content="text/html;charset=ISO-8859-1">
        <link rel="stylesheet" type="text/css" href="$base_url/uscon.css">
        <title>KinoSearch: $q</title>
    </head>
    
    <body>
    
        <div id="navigation">
            <form id="usconSearch" action="">
                <strong>
                Search the <a href="$base_url/index.html">US Constitution</a>:
                </strong>
                <input type="text" name="q" id="q" value="$q">
                <input type="submit" value="=&gt;">
                <input type="hidden" name="offset" value="0">
            </form>
        </div><!--navigation-->
    
        <div id="bodytext">
    
        $report
    
        $num_hits_info
    
        <p style="font-size: smaller; color: #666">
            <em>Powered by 
                <a href="http://www.rectangular.com/kinosearch/">
                    KinoSearch
                </a>
            </em>
        </p>
        </div><!--bodytext-->
    
    </body>
    
    </html>
    END_HTML

COPYRIGHT

Copyright 2005-2006 Marvin Humphrey

LICENSE, DISCLAIMER, BUGS, etc.

See KinoSearch version 0.05_03.