package CPAN::Search::Lite::Populate;
use strict;
use warnings;
no warnings qw(redefine);
use CPAN::Search::Lite::Util qw($table_id has_data);
use CPAN::Search::Lite::DBI::Index;
use CPAN::Search::Lite::DBI qw($dbh);
use File::Find;
use File::Basename;
use File::Spec::Functions;
use File::Path;
use AI::Categorizer;
use AI::Categorizer::Learner::NaiveBayes;
use AI::Categorizer::Document;
use AI::Categorizer::KnowledgeSet;
use Lingua::StopWords;

our $dbh = $CPAN::Search::Lite::DBI::dbh;

my ($setup, $no_ppm);
my $DEBUG = 1;
our $VERSION = 0.77;

my %tbl2obj;
$tbl2obj{$_} = __PACKAGE__ . '::' . $_ 
    for (qw(dists mods auths ppms chaps reqs));
my %obj2tbl  = reverse %tbl2obj;

sub new {
  my ($class, %args) = @_;
  
  foreach (qw(db user passwd) ) {
    die "Must supply a '$_' argument" unless defined $args{$_};
  }
    
  $setup = $args{setup};
  $no_ppm = $args{no_ppm};

  my $index = $args{index};
  my @tables = qw(dists mods auths);
  push @tables, 'ppms' unless $no_ppm;
  foreach my $table (@tables) {
      my $obj = $index->{$table};
      die "Please supply a CPAN::Search::Lite::Index::$table object"
          unless ($obj and ref($obj) eq "CPAN::Search::Lite::Index::$table");
  }
  my $state = $args{state};
  unless ($setup) {
      die "Please supply a CPAN::Search::Lite::State object"
          unless ($state and ref($state) eq 'CPAN::Search::Lite::State');
  }

  my $cdbi = CPAN::Search::Lite::DBI::Index->new(%args);

  my $no_mirror = $args{no_mirror};
  my $html_root = $args{html_root};
  my $pod_root = $args{pod_root};
  my $cat_threshold = $args{cat_threshold} || 0.998;
  my $no_cat = $args{no_cat};

  unless ($no_mirror) {
      die "Please supply the html root" unless $html_root;
      die "Please supply the pod root" unless $pod_root;
  }
  my $self = {index => $index,
              state => $state,
              obj => {},
              no_mirror => $no_mirror,
              html_root => $html_root,
              pod_root => $pod_root,
              cat_threshold => $cat_threshold,
              no_cat => $no_cat,
              cdbi => $cdbi,
             };
  bless $self, $class;
}

sub populate {
    my $self = shift;

    if ($setup) {
        unless ($self->{cdbi}->create_tables(setup => $setup)) {
            warn "Creating tables failed";
            return;
        }
    }
    unless ($self->create_objs()) {
        warn "Cannot create objects";
        return;
    }
    unless ($self->populate_tables()) {
        warn "Populating tables failed";
        return;
    }
    return 1;
}

sub create_objs {
    my $self = shift;
    my @tables = qw(dists auths mods reqs chaps);
    push @tables, 'ppms' unless $no_ppm;

    foreach my $table (@tables) {
        my $obj;
        my $pack = $tbl2obj{$table};
        my $index = $self->{index}->{$table};
        if ($index and ref($index) eq "CPAN::Search::Lite::Index::$table") {
            my $info = $index->{info};
	    return unless has_data($info);
            $obj = $pack->new(info => $info, 
                              cdbi => $self->{cdbi}->{objs}->{$table});
          }
        else {
            $obj = $pack->new(cdbi => $self->{cdbi}->{objs}->{$table});
        }
        $self->{obj}->{$table} = $obj;
    }
    foreach my $table (@tables) {
        my $obj = $self->{obj}->{$table};
        foreach (@tables) {
            next if ref($obj) eq $tbl2obj{$_};
            $obj->{obj}->{$_} = $self->{obj}->{$_};
        }
    }

    my $pack = __PACKAGE__ . '::cat';
    my $obj = $pack->new(cat_threshold => $self->{cat_threshold});
    foreach (qw(dists auths mods)) {
        $obj->{obj}->{$_} = $self->{obj}->{$_};
    }
    $self->{obj}->{cat} = $obj;

    unless ($setup) {
        my $state = $self->{state};
        my @tables = qw(auths dists mods);
        push @tables, 'ppms' unless $no_ppm;
        my @data = qw(ids insert update delete);

        foreach my $table (@tables) {
            my $state_obj = $state->{obj}->{$table};
            my $pop_obj = $self->{obj}->{$table};
            $pop_obj->{$_} = $state_obj->{$_} for (@data);
        }
    }
    return 1;
}

sub populate_tables {
    my $self = shift;
    my @methods = $setup ? qw(insert) : qw(insert update delete);
    my @tables = qw(auths dists mods reqs chaps);
    push @tables, 'ppms' unless $no_ppm;
    for my $method (@methods) {
        for my $table (@tables) {
            my $obj = $self->{obj}->{$table};
            unless ($obj->$method()) {
                if (my $error = $obj->{error_msg}) {
                    print "Fatal error from ", ref($obj), ": ", $error, $/;
                    return;
                }
                else {
                    my $info = $obj->{info_msg};
                    print "Info from ", ref($obj), ": ", $info, $/;
                }
            }
        }
    }

    unless ($self->{no_cat}) {
        my $cat = $self->{obj}->{cat};
        unless ($cat->categorize()) {
            if (my $error = $cat->{error_msg}) {
                print "Fatal error from ", ref($cat), ": ", $error, $/;
                return;
            }
            else {
                my $info = $cat->{info_msg};
                print "Info from ", ref($cat), ": ", $info, $/;
            }
        }
    }

    return 1;
}

package CPAN::Search::Lite::Populate::auths;
use base qw(CPAN::Search::Lite::Populate);
use CPAN::Search::Lite::Util qw(has_data);

sub new {
  my ($class, %args) = @_;
  my $info = $args{info};
  die "No author info available" unless has_data($info);
  my $cdbi = $args{cdbi};
  die "No dbi object available"
    unless ($cdbi and ref($cdbi) eq 'CPAN::Search::Lite::DBI::Index::auths');
  my $self = {
              info => $info,
              insert => {},
              update => {},
              delete => {},
              ids => {},
              obj => {},
              cdbi => $cdbi,
              error_msg => '',
              info_msg => '',
             };
  bless $self, $class;
}

sub insert {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
    return;
  }
  my $info = $self->{info};
  my $cdbi = $self->{cdbi};
  my $data = $setup ? $info : $self->{insert};
  unless (has_data($data)) {
    $self->{info_msg} = q{No author data to insert};
    return;
  }
  my $auth_ids = $self->{ids};
  my @fields = qw(cpanid email fullname);
  my $sth = $cdbi->sth_insert(\@fields) or do {
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  foreach my $cpanid (keys %$data) {
    my $values = $info->{$cpanid};
    next unless ($values and $cpanid);
    print "Inserting author $cpanid\n";
    $sth->execute($cpanid, $values->{email}, $values->{fullname})
      or do {
        $cdbi->db_error($sth);
        $self->{error_msg} = $cdbi->{error_msg};
        return;
      };
    $auth_ids->{$cpanid} = $sth->{mysql_insertid};
  }
  $dbh->commit or do {
    $cdbi->db_error($sth);
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  $sth->finish();
  return 1;
}

sub update {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
    return;
  }
  my $data = $self->{update};
  my $cdbi = $self->{cdbi};
  unless (has_data($data)) {
    $self->{info_msg} = q{No author data to update};
    return;
  }
  
  my $info = $self->{info};
  
  my @fields = qw(cpanid email fullname);
  foreach my $cpanid (keys %$data) {
    print "Updating author $cpanid\n";
    next unless $data->{$cpanid};
    my $sth = $cdbi->sth_update(\@fields, $data->{$cpanid});
    my $values = $info->{$cpanid};
    next unless ($cpanid and $values);
    $sth->execute($cpanid, $values->{email}, $values->{fullname})
      or do {
        $cdbi->db_error($sth);
        $self->{error_msg} = $cdbi->{error_msg};
        return;
      };
    $sth->finish();
  }
  $dbh->commit or do {
    $cdbi->db_error();
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  return 1;
}

sub delete {
  my $self = shift;
  $self->{info_msg} = q{No author data to delete};
  return;
}

package CPAN::Search::Lite::Populate::dists;
use base qw(CPAN::Search::Lite::Populate);
use CPAN::Search::Lite::Util qw(has_data);

sub new {
  my ($class, %args) = @_;
  my $info = $args{info};
  die "No dist info available" unless has_data($info);
  my $cdbi = $args{cdbi};
  die "No dbi object available"
    unless ($cdbi and ref($cdbi) eq 'CPAN::Search::Lite::DBI::Index::dists');
  my $self = {
              info => $info,
              insert => {},
              update => {},
              delete => {},
              ids => {},
              obj => {},
              cdbi => $cdbi,
              error_msg => '',
              info_msg => '',
  };
  bless $self, $class;
}

sub insert {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
    return;
  }
  return unless my $auth_obj = $self->{obj}->{auths};
  my $cdbi = $self->{cdbi};
  my $auth_ids = $auth_obj->{ids};
  my $dists = $self->{info};
  my $data = $setup ? $dists : $self->{insert};
  unless (has_data($data)) {
    $self->{info_msg} = q{No dist data to insert};
    return;
  }
  unless ($dists and $auth_ids) {
    $self->{error_msg}->{index} = q{No dist index data available};
    return;
  }
  
  my $dist_ids = $self->{ids};
  my @fields = qw(auth_id dist_name dist_file dist_vers
                  dist_abs size birth readme changes meta install md5);
  my $sth = $cdbi->sth_insert(\@fields) or do {
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  foreach my $distname (keys %$data) {
    my $values = $dists->{$distname};
    my $cpanid = $values->{cpanid};
    next unless ($values and $cpanid and $auth_ids->{$cpanid});
    print "Inserting $distname of $cpanid\n";
    $sth->execute($auth_ids->{$cpanid}, $distname, 
                  $values->{filename}, $values->{version}, 
                  $values->{description}, $values->{size}, 
                  $values->{date}, $values->{readme}, 
                  $values->{changes}, $values->{meta},
                  $values->{install}, $values->{md5}) 
      or do {
        $cdbi->db_error($sth);
        $self->{error_msg} = $cdbi->{error_msg};
        return;
      };
    $dist_ids->{$distname} = $sth->{mysql_insertid};
  }
  $dbh->commit or do {
      $cdbi->db_error($sth);
      $self->{error_msg} = $cdbi->{error_msg};
      return;
  };
  $sth->finish();
  return 1;
}

sub update {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
    return;
  }
  my $cdbi = $self->{cdbi};
  my $data = $self->{update};
  unless (has_data($data)) {
    $self->{info_msg} = q{No dist data to update};
    return;
  }
  return unless my $auth_obj = $self->{obj}->{auths};
  my $auth_ids = $auth_obj->{ids};
  my $dists = $self->{info};
  unless ($dists and $auth_ids) {
    $self->{error_msg} = q{No dist index data available};
    return;
  }
  
  my @fields = qw(auth_id dist_name dist_file dist_vers
                  dist_abs size birth readme changes meta install md5);
  foreach my $distname (keys %$data) {
      next unless $data->{$distname};
      my $sth = $cdbi->sth_update(\@fields, $data->{$distname});
      my $values = $dists->{$distname};
      my $cpanid = $values->{cpanid};
      next unless ($values and $cpanid and $auth_ids->{$cpanid});
      print "Updating $distname of $cpanid\n";
      $sth->execute($auth_ids->{$values->{cpanid}}, $distname, 
                    $values->{filename}, $values->{version}, 
                    $values->{description}, $values->{size}, 
                    $values->{date}, $values->{readme}, 
                    $values->{changes}, $values->{meta},
                    $values->{install}, $values->{md5}) 
          or do {
              $cdbi->db_error($sth);
              $self->{error_msg} = $cdbi->{error_msg};
              return;
          };
      $sth->finish();
  }
  $dbh->commit or do {
    $cdbi->db_error();
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  return 1;
}

sub delete {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
    return;
  }
  my $cdbi = $self->{cdbi};
  my $data = $self->{delete};
  unless (has_data($data)) {
    $self->{info_msg} = q{No dist data to delete};
    return;
  }
  
  my $sth = $cdbi->sth_delete('dist_id');
  foreach my $distname(keys %$data) {
    print "Deleting $distname\n";
    $sth->execute($data->{$distname}) or do {
      $cdbi->db_error($sth);
      $self->{error_msg} = $cdbi->{error_msg};
      return;
    };
  }
  $sth->finish();
  $dbh->commit or do {
    $cdbi->db_error();
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  return 1;
}

package CPAN::Search::Lite::Populate::mods;
use base qw(CPAN::Search::Lite::Populate);
use CPAN::Search::Lite::Util qw(has_data);

sub new {
  my ($class, %args) = @_;
  my $info = $args{info};
  die "No module info available" unless has_data($info);
  my $cdbi = $args{cdbi};
  die "No dbi object available"
    unless ($cdbi and ref($cdbi) eq 'CPAN::Search::Lite::DBI::Index::mods');
  my $self = {
              info => $info,
              insert => {},
              update => {},
              delete => {},
              ids => {},
              obj => {},
              cdbi => $cdbi,
              error_msg => '',
              info_msg => '',
             };
  bless $self, $class;
}

sub insert {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
    return;
  }
  return unless my $dist_obj = $self->{obj}->{dists};
  my $cdbi = $self->{cdbi};
  my $dist_ids = $dist_obj->{ids};
  my $mods = $self->{info};
  my $data = $setup ? $mods : $self->{insert};
  unless (has_data($data)) {
    $self->{info_msg} = q{No module data to insert};
    return;
  }
  unless ($mods and $dist_ids) {
    $self->{error_msg} = q{No module index data available};
    return;
  }
  
  my $mod_ids = $self->{ids};
  my @fields = qw(dist_id mod_name mod_abs doc src
                  mod_vers dslip chapterid);
  my $sth = $cdbi->sth_insert(\@fields) or do {
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  foreach my $modname(keys %$data) {
    my $values = $mods->{$modname};
    next unless ($values and $dist_ids->{$values->{dist}});
    $sth->execute($dist_ids->{$values->{dist}}, $modname,
                  $values->{description}, $values->{doc},
                  $values->{src}, $values->{version},
                  $values->{dslip}, $values->{chapterid})
      or do {
        $cdbi->db_error($sth);
        $self->{error_msg} = $cdbi->{error_msg};
        return;
      };
    $mod_ids->{$modname} = $sth->{mysql_insertid};
  }
  $dbh->commit or do {
    $cdbi->db_error($sth);
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  $sth->finish();
  return 1;
}

sub update {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
    return;
  }
  my $cdbi = $self->{cdbi};
  my $data = $self->{update};
  unless (has_data($data)) {
    $self->{info_msg} = q{No module data to update};
    return;
  }
  return unless my $dist_obj = $self->{obj}->{dists};
  my $dist_ids = $dist_obj->{ids};
  my $mods = $self->{info};
  unless ($dist_ids and $mods) {
    $self->{error_msg} = q{No module index data available};
    return;
  }
  
  my @fields = qw(dist_id mod_name mod_abs doc src
                  mod_vers dslip chapterid);
  foreach my $modname (keys %$data) {
      next unless $data->{$modname};
      print "Updating $modname\n";
      my $sth = $cdbi->sth_update(\@fields, $data->{$modname});
      my $values = $mods->{$modname};
      next unless ($values and $dist_ids->{$values->{dist}});
      $sth->execute($dist_ids->{$values->{dist}}, $modname,
                    $values->{description}, $values->{doc},
                    $values->{src}, $values->{version},
                    $values->{dslip}, $values->{chapterid})
          or do {
              $cdbi->db_error($sth);
              $self->{error_msg} = $cdbi->{error_msg};
              return;
          };
      $sth->finish();
  }
  $dbh->commit or do {
    $cdbi->db_error();
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  return 1;
}

sub delete {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
        return;
  }
  return unless my $dist_obj = $self->{obj}->{dists};
  my $cdbi = $self->{cdbi};
  my $data = $dist_obj->{delete};
  if (has_data($data)) {
    my $sth = $cdbi->sth_delete('dist_id');
    foreach my $distname(keys %$data) {
      $sth->execute($data->{$distname}) or do {
        $cdbi->db_error($sth);
        $self->{error_msg} = $cdbi->{error_msg};
        return;
      };
    }
    $sth->finish();
  }

  $data = $self->{delete};
  if (has_data($data)) {
    my $sth = $cdbi->sth_delete('mod_id');
    foreach my $modname(keys %$data) {
      $sth->execute($data->{$modname}) or do {
        $cdbi->db_error($sth);
        $self->{error_msg} = $cdbi->{error_msg};
        return;
      };
      print "Deleting $modname\n";
    }
  }

  $dbh->commit or do {
    $cdbi->db_error();
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  return 1;
}

package CPAN::Search::Lite::Populate::chaps;
use base qw(CPAN::Search::Lite::Populate);
use CPAN::Search::Lite::Util qw(has_data);

sub new {
  my ($class, %args) = @_;
  my $cdbi = $args{cdbi};
  die "No dbi object available"
    unless ($cdbi and ref($cdbi) eq 'CPAN::Search::Lite::DBI::Index::chaps');
  my $self = {
              obj => {},
              cdbi => $cdbi,
              error_msg => '',
              info_msg => '',
             };
  bless $self, $class;
}

sub insert {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
    return;
  }
  return unless my $dist_obj = $self->{obj}->{dists};
  my $cdbi = $self->{cdbi};
  my $dist_insert = $dist_obj->{insert};
  my $dists = $dist_obj->{info};
  my $dist_ids = $dist_obj->{ids};
  my $data = $setup ? $dists : $dist_insert;
  unless (has_data($data)) {
    $self->{info_msg} = q{No chap data to insert};
    return;
  }
  unless ($dists and $dist_ids) {
    $self->{error_msg} = q{No chap index data available};
    return;
  }
  
  my @fields = qw(chapterid dist_id subchapter);
  my $sth = $cdbi->sth_insert(\@fields) or do {
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  foreach my $dist (keys %$data) {
    my $values = $dists->{$dist};
    next unless defined $values->{chapterid};
    foreach my $chap_id(keys %{$values->{chapterid}}) {
      foreach my $sub_chap(keys %{$values->{chapterid}->{$chap_id}}) {
        next unless $dist_ids->{$dist};
        $sth->execute($chap_id, $dist_ids->{$dist}, $sub_chap)
          or do {
            $cdbi->db_error($sth);
            $self->{error_msg} = $cdbi->{error_msg};
            return;
          };
      }
    }
  }
  $dbh->commit or do {
    $cdbi->db_error($sth);
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  $sth->finish();
  return 1;
}

sub update {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
    return;
  }
  my $cdbi = $self->{cdbi};
  return unless my $dist_obj = $self->{obj}->{dists};
  my $dists = $dist_obj->{info};
  my $dist_ids = $dist_obj->{ids};
  my $data = $dist_obj->{update};
  unless (has_data($data)) {
    $self->{info_msg} = q{No chap data to update};
    return;
  }
  unless ($dist_ids and $dists) {
    $self->{error_msg} = q{No chap index data available};
    return;
  }
  
  my $sth = $cdbi->sth_delete('dist_id');
  foreach my $distname(keys %$data) {
      next unless $data->{$distname};
      $sth->execute($data->{$distname}) or do {
          $cdbi->db_error($sth);
          $self->{error_msg} = $cdbi->{error_msg};
          return;
      };
  }
  $sth->finish();
  
  my @fields = qw(chapterid dist_id subchapter);
  $sth = $cdbi->sth_insert(\@fields);
  foreach my $dist (keys %$data) {
    my $values = $dists->{$dist};
    next unless defined $values->{chapterid};
    foreach my $chap_id(keys %{$values->{chapterid}}) {
      foreach my $sub_chap(keys %{$values->{chapterid}->{$chap_id}}) {
        next unless $dist_ids->{$dist};
        $sth->execute($chap_id, $dist_ids->{$dist}, $sub_chap)
          or do {
            $cdbi->db_error($sth);
            $self->{error_msg} = $cdbi->{error_msg};
            return;
          };
      }
    }
  }
  $dbh->commit or do {
    $cdbi->db_error($sth);
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  $sth->finish();
  return 1;
}

sub delete {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
        return;
  }
  return unless my $dist_obj = $self->{obj}->{dists};
  my $cdbi = $self->{cdbi};
  my $data = $dist_obj->{delete};
  unless (has_data($data)) {
    $self->{info_msg} = q{No chap data to delete};
    return;
  }
  
  my $sth = $cdbi->sth_delete('dist_id');
  foreach my $distname(keys %$data) {
    $sth->execute($data->{$distname}) or do {
      $cdbi->db_error($sth);
      $self->{error_msg} = $cdbi->{error_msg};
      return;
    };
  }
  $sth->finish();
  $dbh->commit or do {
    $cdbi->db_error();
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  return 1;
}

package CPAN::Search::Lite::Populate::reqs;
use base qw(CPAN::Search::Lite::Populate);
use CPAN::Search::Lite::Util qw(has_data);

sub new {
  my ($class, %args) = @_;
  my $cdbi = $args{cdbi};
  die "No dbi object available"
    unless ($cdbi and ref($cdbi) eq 'CPAN::Search::Lite::DBI::Index::reqs');
  my $self = {
              obj => {},
              error_msg => '',
              info_msg => '',
              cdbi => $cdbi,
             };
  bless $self, $class;
}

sub insert {
    my $self = shift;
    unless ($dbh) {
        $self->{error_msg} = q{No db handle available};
        return;
    }
    return unless my $dist_obj = $self->{obj}->{dists};
    return unless my $mod_obj = $self->{obj}->{mods};
    my $cdbi = $self->{cdbi};
    my $dist_insert = $dist_obj->{insert};
    my $dists = $dist_obj->{info};
    my $dist_ids = $dist_obj->{ids};
    my $mod_ids = $mod_obj->{ids};
    my $data = $setup ? $dists : $dist_insert;
    unless (has_data($data)) {
        $self->{info_msg} = q{No req data to insert};
        return;
    }
    unless ($dist_ids and $mod_ids and $dists) {
        $self->{error_msg} = q{No req index data available};
        return;
    }
    
    my @fields = qw(dist_id mod_id req_vers);
    my $sth = $cdbi->sth_insert(\@fields) or do {
        $self->{error_msg} = $cdbi->{error_msg};
        return;
    };
    foreach my $dist (keys %$data) {
        my $values = $dists->{$dist};
        my $requires = $values->{requires};
        next unless (defined $requires);
        if ( ref($requires) eq 'HASH')  {
            foreach my $module (keys %{$requires}) {
                next unless ($dist_ids->{$dist} and $mod_ids->{$module});
                $sth->execute($dist_ids->{$dist}, $mod_ids->{$module}, 
                              $requires->{$module})
                    or do {
                        $cdbi->db_error($sth);
                        $self->{error_msg} = $cdbi->{error_msg};
                        return;
                    };
            }
        }
        else {
            my $module = $requires;
            next unless ($dist_ids->{$dist} and $mod_ids->{$module});
            $sth->execute($dist_ids->{$dist}, $mod_ids->{$module}, 0)
                or do {
                    $cdbi->db_error($sth);
                    $self->{error_msg} = $cdbi->{error_msg};
                    return;
                };
        }
    }
    $dbh->commit or do {
        $cdbi->db_error($sth);
        $self->{error_msg} = $cdbi->{error_msg};
        return;
    };
    $sth->finish();
    return 1;
}

sub update {
    my $self = shift;
    unless ($dbh) {
        $self->{error_msg} = q{No db handle available};
        return;
    }
    my $cdbi = $self->{cdbi};
    return unless my $dist_obj = $self->{obj}->{dists};
    return unless my $mod_obj = $self->{obj}->{mods};
    my $dists = $dist_obj->{info};
    my $dist_ids = $dist_obj->{ids};
    my $mod_ids = $mod_obj->{ids};
    my $data = $dist_obj->{update};
    unless (has_data($data)) {
        $self->{info_msg} = q{No req data to update};
        return;
    }
    unless ($dist_ids and $mod_ids and $dists) {
        $self->{error_msg} = q{No author index data available};
        return;
    }
    
    my $sth = $cdbi->sth_delete('dist_id');
    foreach my $distname(keys %$data) {
        next unless $data->{$distname};
        $sth->execute($data->{$distname}) or do {
            $cdbi->db_error($sth);
            $self->{error_msg} = $cdbi->{error_msg};
            return;
        };
    }
    $sth->finish();
    
    my @fields = qw(dist_id mod_id req_vers);
    $sth = $cdbi->sth_insert(\@fields);
    foreach my $dist (keys %$data) {
        my $values = $dists->{$dist};
        my $requires = $values->{requires};
        next unless defined $requires;
        if (ref($requires) eq 'HASH') {
            foreach my $module (keys %{$requires}) {
                next unless ($dist_ids->{$dist} and $mod_ids->{$module});
                $sth->execute($dist_ids->{$dist}, $mod_ids->{$module},
                              $requires->{$module})
                    or do {
                        $cdbi->db_error($sth);
                        $self->{error_msg} = $cdbi->{error_msg};
                        return;
                    };
            }
        }
        else {
            my $module = $requires;
            next unless ($dist_ids->{$dist} and $mod_ids->{$module});
            $sth->execute($dist_ids->{$dist}, $mod_ids->{$module}, 0)
                or do {
                    $cdbi->db_error($sth);
                    $self->{error_msg} = $cdbi->{error_msg};
                    return;
                };
        }
    }
    $dbh->commit or do {
        $cdbi->db_error($sth);
        $self->{error_msg} = $cdbi->{error_msg};
        return;
    };
    $sth->finish();
    return 1;
}

sub delete {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
    return;
  }
  return unless my $dist_obj = $self->{obj}->{dists};
  return unless my $mod_obj = $self->{obj}->{mods};
  my $cdbi = $self->{cdbi};
  my $data = $dist_obj->{delete};
  if (has_data($data)) {  
    my $sth = $cdbi->sth_delete('dist_id');
    foreach my $distname(keys %$data) {
      $sth->execute($data->{$distname}) or do {
        $cdbi->db_error($sth);
        $self->{error_msg} = $cdbi->{error_msg};
        return;
      };
    }
    $sth->finish();
  }

  $data = $mod_obj->{delete};
  if (has_data($data)) {
    my $sth = $cdbi->sth_delete('mod_id');
    foreach my $modname(keys %$data) {
      $sth->execute($data->{$modname}) or do {
        $cdbi->db_error($sth);
        $self->{error_msg} = $cdbi->{error_msg};
        return;
      };
    }
  }

  $dbh->commit or do {
    $cdbi->db_error();
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  return 1;
}

package CPAN::Search::Lite::Populate::ppms;
use base qw(CPAN::Search::Lite::Populate);
use CPAN::Search::Lite::Util qw(has_data);

sub new {
  my ($class, %args) = @_;
  my $info = $args{info};
  die "No ppm info available" unless has_data($info);
  my $cdbi = $args{cdbi};
  die "No dbi object available"
    unless ($cdbi and ref($cdbi) eq 'CPAN::Search::Lite::DBI::Index::ppms');
  my $self = {
              info => $info,
              insert => {},
              update => {},
              delete => {},
              ids => {},
              obj => {},
              cdbi => $cdbi,
              error_msg => '',
              info_msg => '',
             };
  bless $self, $class;
}

sub insert {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
    return;
  }
  return unless my $dist_obj = $self->{obj}->{dists};
  my $cdbi = $self->{cdbi};
  my $dist_ids = $dist_obj->{ids};
  my $ppms = $self->{info};
  my $data = $setup ? $ppms : $self->{insert};
  unless (has_data($data)) {
    $self->{info_msg} = q{No ppm data to insert};
    return;
  }
  unless ($ppms and $dist_ids) {
      $self->{error_msg} = q{No ppm index data available};
      return;
  }
  
  my @fields = qw(dist_id rep_id ppm_vers);
  my $sth = $cdbi->sth_insert(\@fields) or do {
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  foreach my $rep_id (keys %$data) {
      my $values = $data->{$rep_id};
      unless (has_data($values)) {
	print "No data to insert for rep_id=$rep_id\n";
	next;
      }
      foreach my $package (keys %{$values}) {
          print "Inserting $package for rep_id=$rep_id\n";
          $sth->execute($dist_ids->{$package}, 
                        $rep_id, 
                        $values->{$package}->{version})
              or do {
                  $cdbi->db_error($sth);
                  $self->{error_msg} = $cdbi->{error_msg};
                  return;
              };
      }
  }
  $dbh->commit or do {
    $cdbi->db_error($sth);
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  $sth->finish();
  return 1;
}

sub update {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
    return;
  }
  my $cdbi = $self->{cdbi};
  my $data = $self->{update};
  unless (has_data($data)) {
    $self->{info_msg} = q{No ppm data to update};
    return;
  }
  foreach my $rep_id (keys %$data) {
    my $values = $data->{$rep_id};
    unless (has_data($values)) {
      print "No data to update for rep_id=$rep_id\n";
      next;
    }
    foreach my $package (keys %{$values}) {
      print "Updating $package for rep_id=$rep_id\n";
      my $dist_id = $values->{$package}->{dist_id};
      my $ppm_vers = $values->{$package}->{ppm_vers};
      next unless ($dist_id and $rep_id);
      my $sql = q{UPDATE LOW_PRIORITY } .
        q{ ppms SET ppm_vers = ? } .
          qq{ WHERE dist_id = $dist_id } .
            qq { AND rep_id = $rep_id };
      my $sth = $dbh->prepare($sql) or do {
        $self->db_error();
        return;
      };
      $sth->execute($ppm_vers) or do {
        $self->db_error($sth);
        return;
      };
      $sth->finish;
    }
  }
  $dbh->commit or do {
    $self->db_error();
    return;
  };
  return 1;
}

sub delete {
  my $self = shift;
  unless ($dbh) {
    $self->{error_msg} = q{No db handle available};
    return;
  }
  my $cdbi = $self->{cdbi};
  my $data = $self->{delete};
  unless (has_data($data)) {
    $self->{info_msg} = q{No ppm data to delete};
    return;
  }
  foreach my $rep_id (keys %$data) {
      next unless $rep_id;
      my $values = $data->{$rep_id};
      unless (has_data($values)) {
	print "No data to delete for rep_id=$rep_id\n";
	next;
      }
      my $sth = $cdbi->sth_delete('dist_id', $rep_id);
      foreach my $package (keys %{$values}) {
          print "Deleting $package from rep_id=$rep_id\n";
          $sth->execute($values->{$package}) or do {
              $cdbi->db_error($sth);
              $self->{error_msg} = $cdbi->{error_msg};
              return;
          };
      }
      $sth->finish();
  }
  $dbh->commit or do {
    $cdbi->db_error();
    $self->{error_msg} = $cdbi->{error_msg};
    return;
  };
  return 1;
}

package CPAN::Search::Lite::Populate::cat;
use base qw(CPAN::Search::Lite::Populate);
use CPAN::Search::Lite::Util qw(has_data);

my %features = (content_weights => {
                                    subject => 2,
                                    body => 1,
                                   },
                stopwords => Lingua::StopWords::getStopWords('en'),
                stemming => 'porter',
               );

my $chaps = {
  2 => {subject => q{Perl Core Modules},
        body => q{Perl Core Modules},
       },
  3 => {subject => q{Development Support},
        body => q{Development Support},
       },
  4 => {subject => q{Operating System Interfaces},
        body => q{Operating System Interfaces},
       },
  5 => {subject => q{Networking Devices IPC},
        body => q{Network Devices IPC FTP Socket},
       },
  6 => {subject => q{Data Type Utilities},
        body => q{Data Type Utilities Date Time Math Tie List Tree Class Algorithm Sort Statistics},
       },
  7 => {subject => q{Database Interfaces},
        body => q{Database Interfaces DBD DBI SQL},
       },
  8 => {subject => q{User Interfaces},
        body => q{User Interfaces Tk Term Curses Dialogue Log},
       },
  9 => {subject => q{Language Interfaces},
        body => q{Language Interfaces},
       },
  10 => {subject => q{File Names Systems Locking},
         body => q{File Name System Locking Directory Dir Stat cwd},
        },
  11 => {subject => q{String Lang Text Proc},
         body => q{String Language Text Processing XML Parse},
        },
  12 => {subject => q{Opt Arg Param Proc},
         body => q{Option Argument Parameters Processing Argv Config Getopt},
        },
  13 => {subject => q{Internationalization Locale},
         body => q{Internationalization Locale Unicode I18N},
        },
  14 => {subject => q{Security and Encryption},
         body => q{Security Encryption Authentication Authen Crypt Digest PGP Des},
        },
  15 => {subject => q{World Wide Web HTML HTTP CGI},
         body => q{World Wide Web HTML HTTP CGI WWW Apache MIME Kwiki URI URL},
        },
  16 => {subject => q{Server and Daemon Utilities},
         body => q{Server Daemon Utilties Event},
        },
  17 => {subject => q{Archiving and Compression},
         body => q{Archive Compress File tar gzip gz zip bzip},
        },
  18 => {subject => q{Images Pixmaps Bitmaps},
         body => q{Image Pixmap Bitmap Chart Graph Graphic},
        },
  19 => {subject => q{Mail and Usenet News},
         body => q{Mail Usenet News Sendmail NNTP SMTP IMAP POP3 MIME},
        },
  20 => {subject => q{Control Flow Utilities},
         body => q{Control Flow Utilities callback exception hook},
        },
  21 => {subject => q{File Handle Input Output},
         body => q{File Handle Input Output Dir Directory Log IO},
        },
  22 => {subject => q{Microsoft Windows Modules},
         body => q{Microsoft Windows Modules Win32 Win32API},
        },
  23 => {subject => q{Miscellaneous Modules},
         body => q{Miscellaneous Modules},
        },
  24 => {subject => q{Commercial Software Interfaces},
         body => q{Commercial Software Interfaces},
        },
  26 => {subject => q{Documentation},
         body => q{Documentation},
        },
  27 => {subject => q{Pragma},
         body => q{Pragma},
        },
  28 => {subject => q{Perl6},
         body => q{Perl6},
        },
  99 => {subject => q{Not In Modulelist},
         body => q{Not In Modulelist},
        },
};

sub new {
  my ($class, %args) = @_;
  my $self = {
              obj => {},
              error_msg => '',
              info_msg => '',
              learner => {},
              missing => {},
              cat_threshold => $args{cat_threshold},
             };
  bless $self, $class;
}

sub categorize {
    my $self = shift;
    $self->train() or return;
    $self->missing() or return;
    $self->insert_and_update() or return;
    return 1;
}

sub train {
    my $self = shift;
    return unless my $mod_obj = $self->{obj}->{mods};
    my $mod_info = $mod_obj->{info};
    my ($docs);

    foreach my $mod_name (%$mod_info) {
        (my $subject = $mod_name) =~ s{::}{ }g;
        my $body = '';
        my $abs = $mod_info->{$mod_name}->{description};
        ($body = $abs) =~ s{::}{ }g if $abs;
        my $chapterid = $mod_info->{$mod_name}->{chapterid};
        if ($chapterid) {
            $docs->{$mod_name} = {categories => [$chapterid],
                                  content => {subject => $subject,
                                              body => $body,
                                             },
                                 };
        }
    }

    foreach my $cat(keys %$chaps) {
        $docs->{$cat} = {categories => [$cat],
                         content => {subject => $chaps->{$cat}->{subject},
                                     body => $chaps->{$cat}->{body},
                                    },
                        };
    }
    my $c = 
        AI::Categorizer->new(
                             knowledge_set => 
                             AI::Categorizer::KnowledgeSet->new( name => 'CSL',
                                                               ),
                             verbose => 1,
                            );
    while (my ($name, $data) = each %$docs) {
        $c->knowledge_set->make_document(name => $name, %$data, %features);
    }

    my $learner = $c->learner;
    $learner->train;
    $self->{learner} = $learner;
    return 1;
}

sub missing {
    my $self = shift;
    unless ($dbh) {
        $self->{error_msg} = q{No db handle available};
        return;
    }
    return unless my $dist_obj = $self->{obj}->{dists};
    my $dist_info = $dist_obj->{info};
    my $missing_mods;
    my $sql = 'SELECT mod_name,mod_id,mod_abs,dist_id ' .
        ' FROM mods WHERE chapterid IS NULL ';
    my $sth = $dbh->prepare($sql) or do {
        $self->db_error();
        return;
    };
    $sth->execute() or do {
        $self->db_error($sth);
        return;
    };
    while (my ($mod_name,$mod_id,$mod_abs,$dist_id,$dist_name) = 
           $sth->fetchrow_array) {
        (my $subject = $mod_name) =~ s{::}{ }g;
        my $body = '';
        ($body = $mod_abs) =~ s{::}{ }g if $mod_abs;
        $missing_mods->{$mod_name} = {content => {subject => $subject,
                                                  body => $body,
                                                 },
                                      dist_id => $dist_id,
                                      mod_id => $mod_id,
                                 };
    }
    $sth->finish;

    my $cat_dists;
    $sql = 'SELECT chapterid,dist_id,subchapter FROM chaps';
    $sth = $dbh->prepare($sql) or do {
        $self->db_error();
        return;
    };
    $sth->execute() or do {
        $self->db_error($sth);
        return;
    };
    while (my ($chapterid, $dist_id, $subchapter) = $sth->fetchrow_array) {
        $cat_dists->{$dist_id}->{$chapterid}->{$subchapter}++;
    }
    $sth->finish;

    my $learner = $self->{learner};
    my $insert_mods;
    my $cat_threshold = $self->{cat_threshold};
    while (my ($name, $data) = each %$missing_mods) {
        my $doc = AI::Categorizer::Document->new( name => $name,
                                                  content => $data->{content},
                                                  %features);
        my $r = $learner->categorize($doc);
        my $b = $r->best_category;
        next unless ($b and $r->scores($b) > $cat_threshold);
        $insert_mods->{$name} = {chapterid => $b,
                                 dist_id => $data->{dist_id},
                                 mod_id => $data->{mod_id},
                                };
    }

    my $insert_dists;
    foreach my $dist (keys %$dist_info) {
        my $dist_id;
        foreach my $module (keys %{$dist_info->{$dist}->{modules}}) {
            my $chapterid = $insert_mods->{$module}->{chapterid};
            next unless defined $chapterid;
            $dist_id = $insert_mods->{$module}->{dist_id};
            next unless defined $dist_id;
            (my $subchapter = $module) =~ s!^([^:]+).*!$1!;
            next unless $subchapter;
            next if $cat_dists->{$dist_id}->{$chapterid}->{$subchapter};
            $insert_dists->{$dist_id}->{$chapterid}->{$subchapter}++;
        }
    }
    $self->{missing} = {mods => $insert_mods, dists => $insert_dists};
    return 1;
}

sub insert_and_update {
    my $self = shift;
    unless ($dbh) {
        $self->{error_msg} = q{No db handle available};
        return;
    }
    return unless my $mod_obj = $self->{obj}->{mods};
    my $mod_ids = $mod_obj->{ids};
    return unless my $dist_obj = $self->{obj}->{dists};
    my $dist_ids = $dist_obj->{ids};
    my %dist_names = reverse %$dist_ids;

    my $update = $self->{missing}->{mods};
    foreach my $module (keys %$update) {
        next unless $update->{$module};
        next unless (my $chapterid = $update->{$module}->{chapterid});
        next unless (my $mod_id = $update->{$module}->{mod_id});
        my $sql = q{UPDATE LOW_PRIORITY } .
            qq{ mods SET chapterid = $chapterid } .
                qq{ WHERE mod_id = $mod_id };
        my $sth = $dbh->prepare($sql) or do {
            $self->db_error();
            return;
        };
        $sth->execute() or do {
            $self->db_error($sth);
            return;
        };
        print "Inserting chapterid = $chapterid for $module\n";
        $sth->finish;
    }
    $dbh->commit or do {
        $self->db_error();
        return;
    };

    my $insert = $self->{missing}->{dists};
    my @fields = qw(chapterid dist_id subchapter);
    my $flds = join ',', @fields;
    my $vals = join ',', map '?', @fields;
    my $sql = q{INSERT LOW_PRIORITY INTO chaps } .
        qq{ ($flds) VALUES ($vals) };
    my $sth = $dbh->prepare($sql) or do {
        $self->db_error();
        return;
    };
    foreach my $dist_id (keys %$insert) {
        foreach my $chapterid (keys %{$insert->{$dist_id}} ) {
            foreach my $subchapter (keys %{$insert->{$dist_id}->{$chapterid}}) {
                $sth->execute($chapterid, $dist_id, $subchapter)
                    or do {
                        $self->db_error($sth);
                        return;
                    };
                print "Inserting chapter info: $chapterid/$subchapter for $dist_names{$dist_id}\n";
            }
        }
    }
    $dbh->commit or do {
        $self->db_error($sth);
        return;
    };
    $sth->finish();
    return 1;
}

package CPAN::Search::Lite::Populate;
use CPAN::Search::Lite::Util qw(has_data);

sub db_error {
  my ($obj, $sth) = @_;
  return unless $dbh;
  $sth->finish if $sth;
  $obj->{error_msg} = q{Database error: } . $dbh->errstr;
}

1;

__END__

=head1 NAME

CPAN::Search::Lite::Populate - create and populate database tables

=head1 DESCRIPTION

This module is responsible for creating the tables
(if C<setup> is passed as an option) and then for 
inserting, updating, or deleting (as appropriate) the
relevant information from the indices of
I<CPAN::Search::Lite::Info> and I<CPAN::Search::Lite::PPM> and the
state information from I<CPAN::Search::Lite::State>. It does
this through the C<insert>, C<update>, and C<delete>
methods associated with each table.

Note that the tables are created with the C<setup> argument
passed into the C<new> method when creating the
C<CPAN::Search::Lite::Index> object; existing tables will be
dropped.

=head1 TABLES

The tables used are described below.

=head2 mods

This table contains module information, and is created as

  mod_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT
  dist_id SMALLINT UNSIGNED NOT NULL
  mod_name VARCHAR(100) NOT NULL
  mod_abs TINYTEXT
  doc bool
  mod_vers VARCHAR(10)
  dslip CHAR(5)
  chapterid TINYINT(2) UNSIGNED
  PRIMARY KEY (mod_id)
  FULLTEXT (mod_abs)
  KEY (dist_id)
  KEY (mod_name(100))

=over 3

=item * mod_id

This is the primary (unique) key of the table.

=item * dist_id

This key corresponds to the id of the associated distribution
in the C<dists> table.

=item * mod_name

This is the module's name.

=item * mod_abs

This is a description, if available, of the module.

=item * doc

This value, if true, signifies that documentation for the
module exists, and is located, eg, in F<dist_name/Foo/Bar.pm>
for a module C<Foo::Bar> in the C<dist_name> distribution.

=item * src

This value, if true, signifies that the source code for the
module exists, and is located, eg, in F<dist_name/Foo/Bar.pm>
for a module C<Foo::Bar> in the C<dist_name> distribution.

=item * mod_vers

This value, if present, gives the version of the module.

=item * dslip

This is a 5 character string expressing the dslip
(development, support, language, interface, public
license) information.

=item * chapterid

This number corresponds to the chapter id of the module,
if present.

=back

=head2 dists

This table contains distribution information, and is created as

  dist_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT
  stamp TIMESTAMP(8)
  auth_id SMALLINT UNSIGNED NOT NULL
  dist_name VARCHAR(90) NOT NULL
  dist_file VARCHAR(110) NOT NULL
  dist_vers VARCHAR(20)
  dist_abs TINYTEXT
  size MEDIUMINT UNSIGNED NOT NULL
  birth DATE NOT NULL
  readme bool
  changes bool
  meta bool
  install bool
  PRIMARY KEY (dist_id)
  FULLTEXT (dist_abs)
  KEY (auth_id)
  KEY (dist_name(90))

=over 3

=item * dist_id

This is the primary (unique) key of the table.

=item * stamp

This is a timestamp for the table indicating when the
entry was either inserted or last updated.

=item * auth_id

This corresponds to the CPAN author id of the distribution
in the C<auths> table.

=item * dist_name

This corresponds to the distribution name (eg, for
F<My-Distname-0.22.tar.gz>, C<dist_name> will be C<My-Distname>).

=item * dist_file

This corresponds to the CPAN file name.

=item * dist_vers

This is the version of the CPAN file (eg, for
F<My-Distname-0.22.tar.gz>, C<dist_vers> will be C<0.22>).

=item * dist_abs

This is a description of the distribtion. If not directly
supplied, the description for, eg, C<Foo::Bar>, if present, will 
be used for the C<Foo-Bar> distribution.

=item * size

This corresponds to the size of the distribution, in bytes.

=item * birth

This corresponds to the last modified time
of the distribution, in the form I<YYYY/MM/DD>.

=item * readme

This value, if true, indicates that a F<README> file for
the distribution is available.

=item * changes

This value, if true, indicates that a F<Changes> file for
the distribution is available.

=item * meta

This value, if true, indicates that a F<META.yml> file for
the distribution is available.

=item * install

This value, if true, indicates that an F<INSTALL> file for
the distribution is available.

=back

=head2 auths

This table contains CPAN author information, and is created as

  auth_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT
  cpanid VARCHAR(20) NOT NULL
  fullname VARCHAR(40) NOT NULL
  email TINYTEXT
  PRIMARY KEY (auth_id)
  FULLTEXT (fullname)
  KEY (cpanid(20))

=over 3

=item * auth_id

This is the primary (unique) key of the table.

=item * cpanid

This gives the CPAN author id.

=item * fullname

This is the full name of the author.

=item * email

This is the supplied email address of the author.

=back

=head2 chaps

This table contains chapter information associated with
distributions. PAUSE allows one, when registering modules,
to associate a chapter id with each module (see the C<mods>
table). This information is used here to associate chapters
(and subchapters) with distributions in the following manner.
Suppose a distribution C<Quantum-Theory> contains a module
C<Beta::Decay> with chapter id C<55>, and
another module C<Laser> with chapter id C<87>. The
C<Quantum-Theory> distribution will then have two
entries in this table - C<chapterid> of I<55> and
C<subchapter> of I<Beta>, and C<chapterid> of I<87> and
C<subchapter> of I<Laser>.

The table is created as follows.

  chap_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT
  chapterid TINYINT UNSIGNED NOT NULL
  dist_id SMALLINT UNSIGNED NOT NULL
  subchapter TINYTEXT
  KEY (dist_id)

=over 3

=item * chap_id

This is the primary (unique) key of the table.

=item * chapterid

This number corresponds to the chapter id.

=item * dist_id

This is the id corresponding to the distribution in the
C<dists> table.

=item * subchapter

This is the subchapter.

=back

=head2 reqs

This table lists the prerequisites of the distribution,
as found in the F<META.yml> file (if supplied - note that
only relatively recent versions of C<ExtUtils::MakeMaker>
or C<Module::Build> generate this file when making a
distribution). The table is created as

  req_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT
  dist_id SMALLINT UNSIGNED NOT NULL
  mod_id SMALLINT UNSIGNED NOT NULL
  req_vers VARCHAR(10)
  KEY (dist_id)

=over 3

=item * req_id

This is the primary (unique) key of the table.

=item * dist_id

This corresponds to the id of the distribution in the
C<dists> table.

=item * mod_id

This corresponds to the id of the prerequisite module
in the C<mods> table.

=item * req_vers

This is the version of the prerequisite module, if specified.

=back

=head2 ppms

This table contains information on Win32 ppm
packages available in the repositories specified
in C<$repositories> of L<CPAN::Search::Lite::Util>.
The table is created as

  ppm_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT
  dist_id SMALLINT UNSIGNED NOT NULL
  rep_id TINYINT(2) UNSIGNED NOT NULL
  ppm_vers VARCHAR(20)
  KEY (dist_id)

=over 3

=item * ppm_id

This is the primary (unique) key of the table.

=item * dist_id

This is the id of the distribution appearing in the
C<dists> table.

=item * rep_id

This is the id of the repository appearing in the
C<$repositories> data structure.

=item * ppm_vers

This is the version of the ppm package found.

=back

=head2 reps

This table contains information on the Win32 ppm
repositories specified in C<$repositories> of 
L<CPAN::Search::Lite::Util>.
The table is created as

  rep_id SMALLINT UNSIGNED NOT NULL
  abs TINYTEXT
  browse TINYTEXT
  perl VARCHAR(10)
  alias VARCHAR(20)
  KEY (rep_id)

=over 3

=item * rep_id

This is the primary (unique) key of the table, and
corresponds to the C<rep_id> of the C<ppms> table.

=item * abs

This is a description of the repository.

=item * browse

This is a URL where one can browse the repository.

=item * perl

This specifies the perl version the repository corresponds to.

=item * alias

This specifies a short alias for the repository.

=back

=head2 chapters

This contains information on the chapters.
The table is created as

  chapterid SMALLINT UNSIGNED NOT NULL
  chap_link TINYTEXT
  KEY (chapterid)

=over 3

=item * chapterid

This is the id of the distribution appearing in the
C<dists> table.

This is the primary (unique) key of the table, and
corresponds to the C<chapterid> of the C<dists>, C<mods>,
and C<chaps> table.

=item * chap_link

This is a description of the chapter that C<chapterid> corresponds
to (eg, C<File_Handle_Input_Output>).

=back

=head1 CATEGORIES

When uploading a module to PAUSE, there exists an option
to assign it to one of 24 broad categories. However, many
modules have not been assigned such a category, for one
reason or another. When populating the tables, the
I<AI::Categorizer> module is used to guess a possible
category for those modules that haven't been assigned one,
based on a training set based on the modules that have been
assigned a category (see <AI::Categorizer> for general
details). If this guess is above a configurable
threshold (see L<CPAN::Search::Lite::Index>, the guess is
accepted and subsequently inserted into the database, as
well as updating the categories associated with the
module's distribution.

=head1 SEE ALSO

L<CPAN::Search::Lite::Index>

=cut