lib/File/Tabular/Web/Attachments/Indexed.pm


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
—
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
              package File::Tabular::Web::Attachments::Indexed;
use base qw/File::Tabular::Web::Attachments/;
use strict;
use warnings;
no warnings 'uninitialized';
use Search::Indexer;
use Search::QueryParser 0.92;
use locale;
#----------------------------------------------------------------------
sub app_initialize {
#----------------------------------------------------------------------
  # Runs the parent initialization, then records which upload field (if
  # any) is flagged as "indexed" in the configuration.
  #
  # Side effect : sets $self->{app}{indexed_field} to the name of the
  # indexed upload field, or to undef when no field is flagged.
  # Dies when more than one upload field is flagged as indexed.
  my $self = shift;
  $self->SUPER::app_initialize;

  # indexed fields are specified as "[fields]upload field=indexed" in config
  # (presumably upload_fields maps field name => config value; the lookup
  # in the grep block below relies on that)
  my $upld_ref = $self->{app}{upload_fields};

  # iterate on field NAMES (hash keys) : the grep block looks up the
  # config value of each name, so iterating on values would never match
  my @indexed = grep {$upld_ref->{$_} =~ /indexed/} keys %$upld_ref;
  @indexed < 2 or die "currently no support for multiple indexed fields";
  $self->{app}{indexed_field} = $indexed[0];
}
#======================================================================
#                 REQUEST HANDLING : SEARCH METHODS                   #
#======================================================================
#----------------------------------------------------------------------
sub words_queried {
#----------------------------------------------------------------------
  # Returns the words (runs of word characters and slashes) found in
  # the metadata query and in the fulltext query of the current request.
  my ($self) = @_;

  # both queries, separated by a space, are scanned in one pass
  my $queries = join ' ', @{$self}{qw/search_string_orig search_fulltext/};
  return $queries =~ m([\w/]+)g;
}
#----------------------------------------------------------------------
sub log_search {
#----------------------------------------------------------------------
  # Writes one "info" line to the logger with the metadata query, the
  # fulltext query and the user name. Does nothing when there is no
  # logger in $self->{logger}.
  my $self = shift;
  return if not $self->{logger};

  # the user name is passed as a sprintf ARGUMENT, not interpolated into
  # the format : a '%' within the name must be logged verbatim instead
  # of being interpreted as a conversion
  my $msg = sprintf "[%s][%s] %s",
    $self->{search_string_orig},
    $self->{search_fulltext},
    $self->{user};
  $self->{logger}->info($msg);
}
#----------------------------------------------------------------------
sub before_search {
#----------------------------------------------------------------------
  # Runs the parent hook, then performs the fulltext query (if any) and
  # folds the ids of matching documents into $self->{search_string}.
  # Returns $self after a fulltext search; returns nothing when there is
  # no fulltext query or when the fulltext search has no scores.
  my $self = shift;
  $self->SUPER::before_search;

  # searches into the fulltext index are passed through param 'SFT';
  # without it, forget any fulltext result and stop here
  my $fulltext_query = $self->param('SFT');
  $self->{search_fulltext} = $fulltext_query;
  if (not $fulltext_query) {
    delete $self->{fulltext_result};
    return;
  }

  # the indexer is created on first use and kept in $self->{app}{indexer}
  $self->{app}{indexer} ||= Search::Indexer->new(
    dir       => $self->{app}{dir},
    preMatch  => $self->{cfg}->get('preMatch'),
    postMatch => $self->{cfg}->get('postMatch'),
  );

  # MAYBE add some logging here (time for fulltext search, #docs found)
  my $found = $self->{app}{indexer}->search($fulltext_query, "implicit_plus");

  if ($found) {                 # nonempty results
    $self->{results}{regex}       = $found->{regex};
    $self->{results}{killedWords} = join ", ", @{$found->{killedWords}};

    # HACK : the ids of all matching documents are listed in the search
    # string. Not efficient if the result set is large; a better
    # solution needs more clever handling in
    # File::Tabular::compile_query, with some representation for sets
    # of integers (bit vectors or Set::IntSpan::Fast).
    # ASSUMES the document number is the record key, stored in first field
    my $id_list = join ",", keys %{$found->{scores}};
    return unless $id_list;     # no scores, no results

    # combine with the metadata query when there is one
    my $metadata_query = $self->{search_string_orig};
    $self->{search_string} = $metadata_query
                               ? "#$id_list AND ($metadata_query)"
                               : "#$id_list";
  }

  $self->{fulltext_result} = $found;
  return $self;
}
#----------------------------------------------------------------------
sub search { # parent search(), then copy fulltext scores into records
#----------------------------------------------------------------------
  my $self = shift;
  my ($search_string) = @_;
  $self->SUPER::search($search_string);

  # nothing to merge unless before_search() stored a fulltext result
  my $fulltext = $self->{fulltext_result};
  return unless $fulltext;

  # new field for storing 'score', then one score per record, looked up
  # by record key
  $self->{data}->ht->add('score');
  for my $rec (@{$self->{results}{records}}) {
    $rec->{score} = $fulltext->{scores}{ $self->key($rec) };
  }

  # default sorting by decreasing scores
  $self->{orderBy} ||= "score : -num";
}
#----------------------------------------------------------------------
sub sort_and_slice {
#----------------------------------------------------------------------
  # Lets the parent class sort and slice the results, then adds excerpts
  # to the resulting records (see add_excerpts).
  my $self = shift;
  $self->SUPER::sort_and_slice;
  $self->add_excerpts;
}
#----------------------------------------------------------------------
sub add_excerpts { # decorate result records with excerpts of attached files
#----------------------------------------------------------------------
  my $self = shift;

  # excerpts only make sense after a fulltext search
  return unless $self->{fulltext_result};

  # the Hash::Type needs a new field to store the excerpts
  $self->{data}->ht->add('excerpts');

  # regex stored by before_search(), passed on to the indexer
  my $highlight_regex = $self->{results}->{regex};

  # one excerpts string for each displayed record
  for my $rec (@{$self->{results}{records}}) {
    my $text = $self->indexed_doc_content($rec);
    my $fragments = $self->{app}{indexer}->excerpts($text, $highlight_regex);
    $rec->{excerpts} = join ' / ', @$fragments;
  }
}
#----------------------------------------------------------------------
sub params_for_next_slice {
#----------------------------------------------------------------------
  # Returns the parameters of the parent class for slice $start,
  # preceded by the fulltext query ("SFT=...").
  my $self  = shift;
  my $start = shift;
  return ('SFT=' . $self->{search_fulltext},
          $self->SUPER::params_for_next_slice($start));
}
#======================================================================
#                        HANDLING ATTACHMENTS                         #
#======================================================================
#----------------------------------------------------------------------
sub after_add_attachment {
#----------------------------------------------------------------------
  # When $field is the indexed field, adds the content of the attached
  # document to the fulltext index, under the key of $record.
  # $path is received but not used here.
  my ($self, $record, $field, $path) = @_;

  if ($field eq $self->{app}{indexed_field}) {
    my $text = $self->indexed_doc_content($record);

    # drop the indexer kept for searches before opening the index in
    # write mode
    delete $self->{app}{indexer};
    my %writer_args = (dir => $self->{app}{dir}, writeMode => 1);
    my $writer      = Search::Indexer->new(%writer_args);
    $writer->add($self->key($record), $text);
  }
}
#----------------------------------------------------------------------
sub before_delete_attachment {
#----------------------------------------------------------------------
  # When $field is the indexed field, removes the document stored under
  # the key of $record from the fulltext index.
  # $path is received but not used here.
  my ($self, $record, $field, $path) = @_;

  if ($field eq $self->{app}{indexed_field}) {
    # drop the indexer kept for searches before opening the index in
    # write mode
    delete $self->{app}{indexer};
    my %writer_args = (dir => $self->{app}{dir}, writeMode => 1);
    my $writer      = Search::Indexer->new(%writer_args);
    $writer->remove($self->key($record));
  }
}
#----------------------------------------------------------------------
sub indexed_doc_content {
#----------------------------------------------------------------------
  # Returns the text to be indexed for the document attached to $record,
  # as one single string. Dies if the attached file cannot be opened.
  #
  # this is the default implementation, MOST PROBABLY INADEQUATE
  # should be overridden in subclasses to perform appropriate
  # conversions from Html, Pdf, Word, etc.
  my ($self, $record) = @_;

  # app_initialize() stores the name of the indexed field under {app};
  # $self->{indexed_field} is only kept as a fallback for code that
  # may have set it directly
  my $field = defined $self->{app}{indexed_field} ? $self->{app}{indexed_field}
                                                  : $self->{indexed_field};
  my $path = $self->upload_fullpath($record, $field);

  # 3-arg open : characters in the path can never be taken as an open
  # mode or as a pipe
  open my $fh, '<', $path or die "open $path: $!";
  local $/;                     # slurp mode
  my $content = <$fh>;          # just return the file content
  close $fh;
  return $content;
}
1;
__END__
=head1 NAME
File::Tabular::Web::Attachments::Indexed - Fulltext indexing in documents attached to File::Tabular::Web
=head1 DESCRIPTION
This abstract class adds support for 
fulltext indexing in documents attached to a
L<File::Tabular::Web|File::Tabular::Web> application.
Queries into the fulltext index should be passed under the
C<SFT> ("search full text") parameter, in addition to the 
usual C<S> parameter (search in metadata record). So for
example
  http://my/app.ftw?S=2007&SFT=perl
will search records containing the word "2007" and having an attached
document in which there is the word "perl". Queries can of course be
much more complex, with boolean operators, parentheses, excluded words, etc.
--- see L<Search::Indexer> and L<Search::QueryParser>.
Indexing requires some mechanism to convert attached documents 
into plain text. This cannot be guessed by the present class,
so you should write a subclass that implements such
conversions; see the L</SUBCLASSING> section below.
=head1 RESERVED FIELD NAMES
Records retrieved from a fulltext search will have two 
additional fields : C<score> (how well the document 
matched the query) and C<excerpts> (strings
of text fragments close to the searched words).
Therefore those field names should not be present
as regular fields in the data file.
=head1 CONFIGURATION
=head2 [fields]
  upload fieldname1
  upload fieldname2 = indexed
Currently only one single upload field can be indexed
within a given application.
=head1 SUBCLASSING
This class relies on the L</indexed_doc_content> method 
for converting attached documents into plain text, which
is a prerequisite to perform the indexing. The default
implementation of L</indexed_doc_content> just returns 
the raw file content, so it is most likely inappropriate
to suit your needs; therefore you should write a subclass
that overrides this method, and then associate this subclass
to your application within the configuration file :
  [application]
  class = My::Subclass::Of::File::Tabular::Web::Attachments::Indexed
=head2 Asynchronous indexing
If your uploaded documents are Microsoft Office or OpenOffice
documents, it may be too costly to convert them on the fly, while
answering the HTTP request. A way to deal with this is to 
override the L</after_add_attachment> and 
L</before_delete_attachment> methods : instead of 
performing immediate adds or deletions into the index, 
these methods can write indexing requests into an event queue.
A separate process then reads the event queue and 
performs the indexing operations.
=head1 METHODS
=head2 app_initialize
Calls the L<parent method|File::Tabular::Web::Attachments/app_initialize>,
then records the name of the indexed field
in C<< $self->{app}{indexed_field} >>.
=head2 words_queried
Returns a list of words queried either in the C<S> or C<SFT> parameters.
=head2 log_search
Logs both the C<S> and C<SFT> parameters.
=head2 before_search
Performs the fulltext search, and combines the results
into the usual search string coming from the C<S> parameter.
=head2 search
Calls the L<parent method|File::Tabular::Web/search>
and adds a C<score> field into each record.
=head2 sort_and_slice
Calls the L<parent method|File::Tabular::Web/sort_and_slice>
and adds excerpts of the searched words from attached documents
into each record of the slice.
=head2 add_excerpts
Finds excerpts of the searched words within
attached documents and adds them into the result set.
=head2 params_for_next_slice
Returns a list of strings repeating the search parameters
(including C<SFT>), for generating URLs to the next or previous slice.
=head2 after_add_attachment
Performs the indexing of the attached document.
=head2 before_delete_attachment
Removes the document from the index.
=head2 indexed_doc_content
  my $plain_text = $self->indexed_doc_content($record);
Returns the plain text representation of the document attached
to C<$record>. To get to the actual file, your implementation 
can access 
  my $path = $self->upload_fullpath($record, $self->{app}{indexed_field});
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)