#!/usr/local/perl5.005_56.Mar06/bin/perl -w
eval 'exec perl -w -S $0 "$@"'
    if 0;    # perlrun idiom: lets a shell that ignored "#!" re-run us under perl

# Build a WAIT full-text index over the HTML documents found below a fixed
# document root, or (with --clean) drop an existing table and exit.
#
# Options:
#   --database NAME     database name                  (default 'DB')
#   --dir DIR           directory holding the database
#                       (default $WAIT::Config->{WAIT_home}, else '/tmp')
#   --table NAME        table name                     (default 'kbox')
#   --clean / --noclean if the database directory exists: drop the table,
#                       remove its directory and exit
#   --remove            delete the documents from the index instead of
#                       inserting them
#
# NOTE(review): positional arguments are accepted but ignored; the document
# root passed to WAIT::Document::Find below is hard-coded.

use strict;

# GetOptions() and rmtree() are called below, so load their modules here.
# NOTE(review): the WAIT::* classes and $WAIT::Config are used as well, but
# no `use` line for them is visible in this file -- confirm they are loaded.
use Getopt::Long;
use File::Path qw(rmtree);

my %OPT = (
    database => 'DB',
    dir      => $WAIT::Config->{WAIT_home} || '/tmp',
    table    => 'kbox',
    clean    => 0,
    remove   => 0,
);

GetOptions(
    \%OPT,
    'database=s',
    'dir=s',
    'table=s',
    'clean!',
    'remove',
) || die "Usage: ...\n";

my $db_path = "$OPT{dir}/$OPT{database}";

# --clean: drop the table, remove what is left of it on disk, and stop.
# This is best effort (the script still exits normally), but a failure is
# now reported instead of being silently discarded with the eval.
if ($OPT{clean} and -d $db_path) {
    eval {
        my $tmp = WAIT::Database->open(
            name      => $OPT{database},
            directory => $OPT{dir},
        ) or die "Could not open database $OPT{database}: $@";

        my $tbl = $tmp->table(name => $OPT{table});
        $tbl->drop if $tbl;
        $tmp->close;

        my $tbl_path = "$db_path/$OPT{table}";
        rmtree($tbl_path, 1, 1) if -d $tbl_path;
    };
    warn "Cleanup of table $OPT{table} failed: $@" if $@;
    exit;
}

# Open the database if its directory exists, otherwise create it.
my $db;
if (-d $db_path) {
    $db = WAIT::Database->open(
        name      => $OPT{database},
        directory => $OPT{dir},
    ) or die "Could not open database $OPT{database}: $@";
}
else {
    $db = WAIT::Database->create(
        name      => $OPT{database},
        directory => $OPT{dir},
    ) or die "Could not create database $OPT{database}: $@";
}

my $layout = WAIT::Parse::HTML->new;

# Filter pipelines handed to create_table() as inverted-index definitions.
# $stem and $text differ only in the final 'Stem' step.
my $stem = [
    {
        'prefix'    => ['isotr', 'isolc'],
        'intervall' => ['isotr', 'isolc'],
    },
    'decode_entities', 'isotr', 'isolc', 'split2', 'stop', 'Stem',
];
my $text = [
    {
        'prefix'    => ['isotr', 'isolc'],
        'intervall' => ['isotr', 'isolc'],
    },
    'decode_entities', 'isotr', 'isolc', 'split2', 'stop',
];

# %D is tied to WAIT::Document::Find; the main loop below iterates it with
# each() and treats every pair as (document id, content).
# NOTE(review): presumably the predicate receives a file name and selects
# the files to index -- confirm against WAIT::Document::Find.
my %D;
my $access = tie(
    %D,
    'WAIT::Document::Find',
    sub { $_[0] =~ /\.htm/; },
    "/usr/local/etc/httpd/htdocs/berlin"
);
die $@ unless defined $access;

# Reuse the table when it already exists, otherwise create it.
# 'title' is listed twice in invindex on purpose: once with the stemming
# pipeline and once with the plain-text pipeline.
my $tb = $db->table(name => $OPT{table})
    || $db->create_table(
        name     => $OPT{table},
        attr     => ['docid', 'headline', 'size'],
        keyset   => [['docid']],
        layout   => $layout,
        access   => $access,
        invindex => [
            'text'  => $stem,
            'title' => $stem,
            'title' => $text,
        ],
    );
die "Could not open or create table $OPT{table}" unless $tb;

# Running count of processed documents; incremented by the index() sub.
my $NO = 0;

while (my ($path, $content) = each %D) {
    # The "&" sigil is required: a plain index(...) would call Perl's
    # builtin string-search function instead of the sub defined in this file.
    &index($path, $content);
}

$db->close();
exit;
# Insert one document into the table, or delete it when --remove is set.
#
# Arguments:
#   $did   - document id; also used as a file name for the size check
#   $value - document content; undef is reported as "unavailable"
#
# Reads the file-level $tb, $layout and %OPT and increments $NO.
# Prints a one-line status to STDOUT ("duplicate", "missing", "too small",
# "unavailable", or the headline cut to 80 characters) and a running
# "ok [N]" counter to STDERR for every document that is processed.
#
# Returns the result of $tb->delete / $tb->insert, or nothing when the
# document was skipped.
sub index {
    my ($did, $value) = @_;

    # Inserting needs an unknown docid; removing needs a known one.
    my $known = $tb->have('docid' => $did);
    if ($OPT{remove}) {
        unless ($known) {
            print "missing\n";
            return;
        }
    }
    elsif ($known) {
        print "duplicate\n";
        return;
    }

    # -s yields undef when the file cannot be stat'ed; treat that as size 0
    # so the comparison does not warn about an uninitialized value under -w.
    my $bytes = (-s $did) || 0;
    if ($bytes < 100) {
        print "too small\n";
        return;
    }

    unless (defined $value) {
        print "unavailable\n";
        return;
    }

    printf STDERR "ok [%d]\n", ++$NO;

    my $record = $layout->split($value);
    $record->{size} = length($value);

    # Headline: the parsed title (or the docid as a fallback) with runs of
    # whitespace collapsed and leading whitespace removed.
    my $headline = $record->{title} || $did;
    $headline =~ s/\s+/ /g;
    $headline =~ s/^\s+//;
    printf "%s\n", substr($headline, 0, 80);

    # Same argument list for both operations; kept as a flat list so the
    # table method receives exactly what it did before.
    my @row = ('docid' => $did, headline => $headline, %{$record});

    return $OPT{remove} ? $tb->delete(@row) : $tb->insert(@row);
}