lib/CWB/Web/Search.pm - metacpan.org


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
—
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
              package CWB::Web::Search;
## -*-cperl-*-
use Carp;
use CWB::CL;
use HTML::Entities;
#
#  CWB::Web::Search::MatchList objects (used internally)
#
package CWB::Web::Search::MatchList;
use Carp;
use HTML::Entities;
use CWB::CL;
# CWB::Web::Search::MatchList object = reference to list of structs:
# $ml->[$i]->{'center'}     = center of match (mean of keywords)
# $ml->[$i]->{'variance'}   = variance of match (-> quality)
# $ml->[$i]->{'n'}          = number of keywords in this match
# $ml->[$i]->{'first'}      = first corpus position of minimal match
# $ml->[$i]->{'last'}       = last corpus position of minimal match
# $ml->[$i]->{'keyword'}->{$cpos}  = code of keyword at c.p. $cpos
# $ml->[$i]->{'quality'}    = relevance of match (must be explicitly recomputed)
sub new {
  my $class = shift;
  my $self = [];
  bless $self, $class;
  $self->add(@_);               # optional initialisation
  return $self;
}
# find insertion point by 'center' value (this _would_ be correct
# if the match list were ordered); returns index into list
sub find_point {
  my $self = shift;
  my $center = shift;           # center of element to insert
  my $i = shift;                # optional arg: start index
  $i = 0 unless defined $i;
  my $size = @$self;
  $i++
    while ($i < $size and $center > $self->[$i]->{'center'});
  $i--
    while ($i > 0 and $center < $self->[$i-1]->{'center'});
  return $i;
}
# add new matches to list, initialised to size 1, variance 0
# Usage: $self->add($keyword, @cpos);
sub add {
  my $self = shift;
  my $keyword = shift;          # code of keyword
  my $i = 0;
  foreach my $center (@_) {
    $i = $self->find_point($center, $i);        # should be fast if cpos list is ordered
    splice @$self, $i, 0, {
                           'center' => $center,
                           'variance' => 0,
                           'n' => 1,
                           'first' => $center,
                           'last' => $center,
                           'keyword' => { $center => $keyword },
                           'quality' => 0,
                           };
  }
}
# returns matchlist size
sub size {
  my $self = shift;
  return scalar @$self;
}
# retrieve single match: returns
# (first, last, center, variance, quality, %keyword [as cpos => keycode pairs])
sub match {
  my $self = shift;
  my $n = shift;
  my $m = $self->[$n];
  return ($m->{'first'}, $m->{'last'}, $m->{'center'}, $m->{'variance'}, $m->{'quality'}, %{$m->{'keyword'}});
}
# summarize (single) match  (maximal length defaults to 160 words)
# Usage: $summary = $ml->summarize($n, $word_att, $start_tag, $end_tag [, $maximal_length]);
sub summarize {
  my $self = shift;
  my $n = shift;
  my $word_att = shift;
  my $start_tag = shift;
  my $end_tag = shift;
  my $maximal_length = shift;
  $maximal_length = 160 unless defined $maximal_length;
  $maximal_length = int ($maximal_length / 2) + 1; # summary is symmetric to center of match
  my $match = $self->[$n];
  my $length = int(sqrt($match->{'variance'})*2 + 15); # default summary extends 2 * s.d. + 15 tokens to each side from center
  $length = $maximal_length if $length > $maximal_length;
  $length = 5 if $length < 5;   # minimum size is 11 words
  my @words = ();               # summary as word list
  # summary must be contained in context, i.e. use this matches 'first' and 'last' as hard boundaries
  my $first = $match->{'center'} - $length;
  my $pre_ell = "... ";         # ellipsis before summary if not start of document
  if ($first < $match->{'first'}) {
    $first = $match->{'first'};
    $pre_ell = "";
  }
  my $last = $match->{'center'} + $length;
  my $post_ell = " ...";        # ellipsis after summary if not end of document
  if ($last > $match->{'last'}) {
    $last = $match->{'last'};
    $post_ell = "";
  }
  foreach my $cpos ($first .. $last) {
    my $token = encode_entities($word_att->cpos2str($cpos));
    $token = $start_tag.$token.$end_tag
      if (defined $match->{'keyword'}->{$cpos});
    push @words, $token;
  }
  return $pre_ell."@words".$post_ell;
}
# (re-)compute quality values of matches
# Usage: $ml->compute_quality($no_of_keywords [, @preferred ]);
sub compute_quality {
  my $self = shift;
  my $no_of_keywords = shift;
  my @preferred = @_;           # list of s-attribute handles for preferred regions -> higher quality score
  # each keyword present in the match scores 1 point; proximity of the
  # keywords scores up to $proximity_points points (scaled by standard deviation)
  my $proximity_points =                       # we'll probably want to make that configurable
    ($no_of_keywords > 2) ? 2 : 1;
  my $preferred_points = (@preferred) ?        # ... and that as well
    (($no_of_keywords > 3) ? 2 : 1)   : 0; 
  my $total_points = ($no_of_keywords - 1) + $proximity_points + $preferred_points;
   
  foreach my $match (@$self) {
    my $variance = $match->{'variance'};
    $variance = 4 if $variance < 4; # scale proximity score to 0 .. $proximity_points range
    my $sd = sqrt($variance/4);
    my $cnt_preferred = 0; # a match gets higher score, if one or more of its keywords fall into preferred environments
    foreach my $keyword (keys %{$match->{'keyword'}}) {
      $cnt_preferred++ 
        if grep { defined $_->cpos2struc($keyword) } @preferred;
    }
    my $quality = 
      (($match->{'n'} - 1) + ($proximity_points / $sd) + ($preferred_points * $cnt_preferred / $no_of_keywords)) 
       / $total_points;
    $match->{'quality'} = int ($quality*100 + 0.5); # rounded percentage
  }
}
# sort matchlist by one the fields
#   'first'  ...  sort by first position in match
#   'quality' ... sort by quality values (make sure you've called compute_quality() beforehand!)
sub sort {
  my $self = shift;
  my $method = shift;
  my %sort = (
              'first' => sub {return sort {$a->{'first'} <=> $b->{'first'}} @_},
              'quality' => sub {return sort {$b->{'quality'} <=> $a->{'quality'}} @_},
              );
  croak "Usage:  \$ml->sort('".join("' | '", keys %sort)."');"
    unless defined $sort{$method};
  @$self = &{$sort{$method}}(@$self);
}
# remove overlapping matches ("cull" the one with highest quality rating)
# NB don't forget to $ml->compute_quality; $ml->sort('first') before!
sub cull {
  my $self = shift;
 I_LOOP:
  for (my $i = 0; $i < @$self; $i++) {
    my $i_first = $self->[$i]->{'first'};
    my $i_last = $self->[$i]->{'last'};
  J_LOOP:
    for (my $j = $i+1; $j < @$self; $j++) {
      last unless $j < @$self;  # check again for 'redo'
      my $j_first = $self->[$j]->{'first'};
      my $j_last = $self->[$j]->{'last'};
      # compare i-th and j-th element of match list
      last J_LOOP # assume ml is ordered by 'first' -> no more overlaps possible after this point
        if $j_first > $i_last; 
      if ($i_last >= $j_first) {
        if ($self->[$i]->{'quality'} >= $self->[$j]->{'quality'}) {
          # i-th match is better -> keep it, delete j-th match
          splice @$self, $j, 1;
          redo J_LOOP;          # must re-consider j-th match
        }
        else {
          # j-th match is better -> delete i-th match and restart outer loop
          splice @$self, $i, 1;
          redo I_LOOP;
        }
      }
    }
  }
  # meself had better be culled nicely now ...
}
# **
# cleanup: to be done
# find overlapping matches & select the one with higher quality
# **
# expand to desired context setting
# template for the context_expansion_func() function:
#    ($first, $last) = context_expansion_func($first, $last);
sub expand {
  my $self = shift;
  my $context_expansion = shift;
  croak 'Usage: $ml->expand(\&context_expansion_func);'
    unless defined $context_expansion and ref $context_expansion eq 'CODE';
  foreach my $match (@$self) {
    ($match->{'first'}, $match->{'last'}) = 
      &{$context_expansion}($match->{'first'}, $match->{'last'});
  }
}
# this is the big one: check positive/negative constraint on matches; or join keyword list
# template for the check_match_size() function:
#    $ok = check_match_size($first, $last);
sub constraint {
  my $self = shift;
  my $mode = lc shift;          # mode is positive, negative, or join
  croak "Usage: \$ml->constraint('positive'|'negative'|'optional'|'join', \$keyword, \\&check_match_size, \@cpos_list);"
    unless $mode =~ /^((posi|nega)tive|join|optional)$/ and @_ >= 2;
  my $keyword = shift;          # code of keyword (unused for negative constraint)
  my $check_match_size = shift; # pointer to function used for checking match size
  # @_ now contains the list of corpus positions for the constraint
  my %used = ();                # in join mode, remember which positions from list have been used
   
  # apply constraint to all matches in list
  my ($i, $j) = (0, 0);
  while ($i < @$self) {         # list size may changed (negative constraint)
    my $match = $self->[$i];
    $j = find_closest(\@_, $match->{'center'}, $j);
    # $first .. $last is minimal match region with the new keyword included
    my $ok = (defined $j) ? 1 : 0; # if @_ == 0, can't find closest -> not ok -> proceed as if check_match_size() had returned false
    my ($new_position, $first, $last);
    if ($ok) {
      $new_position = $_[$j];
      ($first, $last) = new_boundaries($match->{'first'}, $match->{'last'}, $new_position);
      $ok = &{$check_match_size}($first, $last);
    }
    if ($ok) {
      if ($mode eq 'negative') {
        # negative constraint -> remove match if keyword is within range
        splice @$self, $i, 1;
        next;                   # don't increment $i when we've removed a match
      }
      else {
        # positive constraint / join -> adjoin new keyword to match
        $match->{'first'} = $first;
        $match->{'last'} = $last;
        my $n = $match->{'n'}; # adjust center and variance values
        my $center = $match->{'center'};
        my $variance = $match->{'variance'};
        # I believe that the equations below are correct ... ;o)
        $match->{'center'} = ($n * $center + $new_position) / ($n+1);
        $match->{'variance'} = ($n/($n+1)) * 
          ($variance + (1/($n+1)) * ($new_position - $center)**2);
        $match->{'n'} = $n + 1;
        $match->{'keyword'}->{$new_position} = $keyword;
        # in join mode, don't add keyword later if it was 'used up' now
        $used{$new_position} = 1;
      }
    }
    else {
      # no keyword within range -> remove match if positive constraint
      if ($mode eq 'positive') {
        splice @$self, $i, 1;
        next;                   # don't increment $i after removing match
      }
    }
     
    $i++;                       # go to next match in list
  }
   
  # in join mode, add unused keywords as new matches to list 
  if ($mode eq 'join') {
    foreach my $cpos (grep {not defined $used{$_}} @_) {
      $self->add($keyword, $cpos) # check match size (e.g. 'header') for new single-keyword matches
        if &{$check_match_size}($cpos, $cpos);
    }
  }
}
# internal function: modify boundaries so that $pos is included in $start .. $end interval
# Usage: ($start, $end) = new_boundaries($start, $end, $pos);
sub new_boundaries {
  my ($start, $end, $pos) = @_;
  $start = ($pos < $start) ? $pos : $start;
  $end = ($pos > $end) ? $pos : $end;
  return ($start, $end);
}
# internal function: find element in sorted list closest to given value; returns index
# Usage: $index = find_closest(\@list, $val [, $index]);  
sub find_closest {
  my $listref = shift;          # reference to list
  my $value = shift;            # comparison value
  my $i = shift;                # optional: start index (may speed up search)
  $i = 0 unless defined $i;
  my $size = @$listref;
  $i = $size - 1 if $i >= $size;
  return undef if $size == 0;   # can't find anything in empty list
  while (1) {
    if ($i > 0 and $i < $size 
        and abs($listref->[$i-1] - $value) < abs($listref->[$i] - $value)) {
      $i--;
    }
    elsif ($i < ($size - 1) 
           and abs($listref->[$i+1] - $value) < abs($listref->[$i] - $value)) {
      $i++;
    }
    else {
      last;                     # found local minimum
    }
  }
  return $i;
}
# debugging ...
sub print {
  my $self = shift;
  my $match;
  for (my $i = 0; $i < @$self; $i++) {
    $match = $self->[$i];
    print "MatchList entry #",($i+1),"   [",$match->{'n'}," keywords]\n";
    print "  center:   ",(int $match->{'center'}),"\n";
    print "  variance: ",(int $match->{'variance'}),"\n";
    print "  first:    ",$match->{'first'},"\n";
    print "  last:     ",$match->{'last'},"\n";
    my @keys = map {$_."(".$match->{'keyword'}->{$_}.")"} sort keys %{$match->{'keyword'}};
    print "  keys:     @keys\n";
    print "  quality:  ",$match->{'quality'},"%\n";
  }
}
#
#  the main CWB::Web::Search package
#
package CWB::Web::Search;
# error handler: print to STDERR and exit :or: user-defined callback
sub error {
  my $self = shift;
  if (defined $self->{'error_handler'}) {
    &{$self->{'error_handler'}}(@_);
    exit 1;                     # user-defined error handler should exit
  }
  else {
    print STDERR "\nCWB::Web::Search Error:\n";
    grep { chomp(my $l=$_); print STDERR "\t$l\n"; } @_;
    exit 1;
  }
}
# CWB::Web::Search object struct:
# {
#   'error_handler' => user-defined error handler (standard fallback if undefined)
#   'corpus'     => name of query corpus
#   'ch'         => CWB::CL::Corpus object
#   'word'       => CWB::CL::Attribute object (word attribute)
#   'lemma'      => CWB::CL::Attribute object (lemma attribute)
#   'check_size' => reference to subroutine 
#                   that checks whether all keywords fall within specified range
#   'expand'     => reference to subroutine
#                   that expands match to full context (textual representation/HTML)
#                   with keywords highlighted
#   'cull'       => remove overlapping matches 'never', 'before' or 'after' context expansion
#   'hlstart'    => HTML tag (with attributes) used for keyword highlighting (default: <B>)
#   'hlend'      => closing tag for keyword highlighting (default: </B>)
#   'data'       => s-attributes whose values will be returned (separately) 
#                   {
#                     <attribute> => CL::Attribute object,
#                     ...
#                   }
#   'ignore_case' => 'always', 'auto', 'never'  (ignore case)
#   'ignore_diac' => 'always', 'auto', 'never'  (ignore diacritics)
#   'preferred'  => [ CL::Attribute object, ... ]  (list of preferred environments,
#                   such as <title> or <header>, which increase match relevance)
#   'ml'         => CWB::Web::Search::MatchList object
# }
# object constructor:  opens CL corpus/attributes & initialises variables
sub new {
  croak "Usage: \$s = new CWB::Web::Search 'HGC';"
    unless @_ == 2;
  my $class = shift;
  my $corpus = lc shift;
  my $self = {};
  bless($self, $class);         # so we can use object methods for initialisation
  $self->{'error_handler'} = undef;
  # open specified corpus
  my $ch = new CWB::CL::Corpus $corpus;              
  $self->error("Can't open corpus '$corpus'.")
    unless defined $ch;
  $self->{'ch'} = $ch;
  # open word & lemma attributes
  my $word = $ch->attribute("word", 'p');
  $self->error("Can't access 'word' attribute of corpus '$corpus'.")
    unless defined $word;
  $self->{'word'} = $word;
  my $lemma = $ch->attribute("lemma", 'p');
  $self->error("Can't access 'lemma' attribute of corpus '$corpus'.")
    unless defined $lemma;
  $self->{'lemma'} = $lemma;
  # init some variables to default values
  $self->{'check_size'} = $self->make_check_size_sub("30 tokens");
  $self->{'expand'} = $self->make_expand_sub("50 tokens");
  $self->{'data'} = {};         # by default, no data is returned
  $self->{'ignore_case'} = 'auto'; # ignore case/diacritics set to automatic by default
  $self->{'ignore_diac'} = 'auto';
  $self->{'ml'} = new CWB::Web::Search::MatchList; # empty match list
  $self->{'cull'} = 'never';    # don't cull() by default
  $self->highlight("<B>");      # default highlighting is <B> .. </B>
  $self->preferred();           # preferred environments depend on specific corpus
  return $self;
}
# object destructor: undef backends
sub DESTROY {
  my $self = shift;
  delete $self->{'data'};       # undef's attribute handles
  delete $self->{'word'};
  delete $self->{'lemma'};
  delete $self->{'check_size'};
  delete $self->{'expand'};
  delete $self->{'ch'};
  delete $self->{'ml'};
}
# error handler
sub on_error {
  my $self = shift;
  my $handler = shift;
  croak 'Usage: $s->on_error(\&my_error_handler);'
    unless (not defined $handler or ref $handler eq 'CODE') and @_ == 0;
  $self->{'error_handler'} = $handler;
}
# create match size checking subroutine
# Usage:  $coderef = make_check_size_sub("2 s");
#         ok() if &$coderef($first, $last); 
sub make_check_size_sub {
  my ($self, $context) = @_;
  if ($context =~ /^\s*([0-9]+)\s*(words?|tokens?)?\s*$/) {
    my $words = $1;
    return sub {
      my ($first, $last) = @_;
      return ($last-$first) <= $words;
    }
  }
  elsif ($context =~ /^\s*([0-9]*)\s*([A-Za-z\xc0-\xff_-]+)\s*$/) {
    my ($nr, $env) = ($1, $2);  # size is $nr <$env>...</$env> regions
    $nr = 1 if $nr eq "";       # "s" means "1 s"
    my $att = $self->{'ch'}->attribute($env, 's');
    $self->error("Invalid match size '$context'.", "(no <$env> regions in corpus)")
      unless defined $att;
    return sub {
      my ($first, $last) = $att->cpos2struc(@_);
      return (defined $first and defined $last and ($last-$first) < $nr);
    }
  }
  else {
    $self->error("Invalid match size '$context'.");
  }
}
# create context expansion subroutine
# Usage:  $coderef = make_expand_sub("s");
#         ($from, $to) = &$coderef($first, $last);
sub make_expand_sub {
  my ($self, $context) = @_;
  if ($context =~ /^\s*([0-9]+)\s*(words?|tokens?)?\s*$/) {
    my $words = $1;
    my $size = $self->{'word'}->max_cpos;
    return sub {
      my ($first, $last) = @_;
      $first -= $words; $last += $words;
      $first = 0 if $first < 0;
      $last = $size - 1 if $last >= $size;
      return ($first, $last);
    }
  }
  elsif ($context =~ /^\s*([0-9]*)\s*([A-Za-z\xc0-\xff_-]+)\s*$/) {
    my ($nr, $env) = ($1, $2);  # size is $nr <$env>...</$env> regions
    $nr = 1 if $nr eq "";       # "s" means "1 s"
    my $att = $self->{'ch'}->attribute($env, 's');
    my $size = $att->max_struc;
    $self->error("Invalid context size '$context'.", "(no <$env> regions in corpus)")
      unless defined $att;
    return sub {
      my ($first, $last) = @_;
      my $dummy;
      my $struc = $att->cpos2struc($first);
      if (defined $struc) {
        $struc -= $nr - 1;
        $struc = 0 if $struc < 0;
        ($first, $dummy) = $att->struc2cpos($struc);
      }
      $struc = $att->cpos2struc($last);
      if (defined $struc) {
        $struc += $nr - 1;
        $struc = $size - 1 if $struc >= $size;
        ($dummy, $last) = $att->struc2cpos($struc); 
      }
      return ($first, $last);
    }
  }
  else {
    $self->error("Invalid context size '$context'.");
  }
}
# search engine settings: search window
sub window {
  my $self = shift;
  croak 'Usage: $s->window($window_size);'
    unless @_ == 1;
  my $window = shift;
  $self->{'check_size'} = $self->make_check_size_sub($window);
}
# search engine settings: context returned
sub context {
  my $self = shift;
  croak 'Usage: $s->context($context_size);'
    unless @_ == 1;
  my $context = shift;
  $self->{'expand'} = $self->make_expand_sub($context);
}
# regex flags settings: ignore case / ignore diacritics
sub ignore_case {
  my $self = shift;
  my $val = lc shift;
  if ($val =~ /^[01]$/) {
    carp "Usage is now: \$s->ignore_case('always'|'auto'|'never');";
    $val = ($val) ? 'always' : 'never';
  }
  croak "Usage: \$s->ignore_case('always'|'auto'|'never');"
    unless $val =~ /^(always|auto|never)$/;
  $self->{'ignore_case'} = $val;
}
sub ignore_diacritics {
  my $self = shift;
  my $val = lc shift;
  if ($val =~ /^[01]$/) {
    carp "Usage is now: \$s->ignore_diacritics('always'|'auto'|'never');";
    $val = ($val) ? 'always' : 'never';
  }
  croak "Usage: \$s->ignore_diacritics('always'|'auto'|'never');"
    unless $val =~ /^(always|auto|never)$/;
  $self->{'ignore_diac'} = $val;
}
# select culling strategy
sub cull {
  my $self = shift;
  my $method = shift;
  croak "Usage: \$search->cull('never' | 'before' | 'after');"
    unless defined $method and ($method = lc $method) =~ /^(never|before|after)$/;
  $self->{'cull'} = $method;
}
# set preferred environments
sub preferred {
  my $self = shift;
  my $ch = $self->{'ch'};
  my @env = @_;
  my @att = ();                 # compile list of s-attribute handles
  foreach my $env (@env) {
    croak "Usage: \$search->preferred(\$region_name, ... );"
      unless $env =~ /^[A-Za-z0-9_-]+$/;
    my $att = $ch->attribute($env, 's');
    $self->error("Preferred environment: no <$env>..</$env> regions in corpus.")
      unless defined $att;
    push @att, $att;
  }
  $self->{'preferred'} = [@att];
}
# HTML tag used for keyword highlighting (open tag, with optional attributes)
sub highlight {
  my $self = shift;
  my $tag = shift;
  croak 'Usage: $search->highlight($open_tag);'
    unless defined $tag and $tag =~ /^<([A-Za-z0-9_-]+)(\s.*>|>)$/;
  my $name = $1;
  $self->{'hlstart'} = $tag;
  $self->{'hlend'} = "</$name>";
}
# search engine settings: data (region values)
sub data {
  my $self = shift;
  croak 'Usage: $search->data($att1, ...);'
    unless @_ > 0;
  my %atts = ();                # will be stored in $self->{'data'} field
  foreach my $att (@_) {
    my $att_handle = $self->{'ch'}->attribute($att, 's');
    $self->error("Data attribute '$att' does not exist.")
      unless defined $att_handle;
    $self->error("Data attribute '$att' has no annotated values.")
      unless $att_handle->struc_values;
    $atts{$att} = $att_handle;
  }
  $self->{'data'} = \%atts;
}
# return number of matches found
sub size {
  my $self = shift;
  return $self->{'ml'}->size;
}
# return data for match (returns result struct)
sub match {
  my $self = shift;
  my $n = shift;
  my $flag = shift;
  croak "Usage: \$search->match(\$n [, 'context']);"
    if defined $flag and (lc $flag ne 'context');
  my $word_att = $self->{'word'};
  my $start_tag = $self->{'hlstart'};
  my $end_tag = $self->{'hlend'};
  my $ml = $self->{'ml'};
  my $size = $ml->size;
  return undef
    unless $n >= 0 and $n < $size;
  my $result = {};              # build result struct
  my ($first, $last, $center, $variance, $quality, %keyword) = $ml->match($n);
  $result->{'cpos'} = int ($center + 0.5);
  $result->{'quality'} = $quality;
  $result->{'summary'} = $ml->summarize($n, $word_att, $start_tag, $end_tag);
  if (defined $flag) {
    # build full match text (including context)
    my @words = ();
    foreach my $cpos ($first .. $last) {
      my $token = encode_entities($word_att->cpos2str($cpos));
      $token = $start_tag.$token.$end_tag
        if defined $keyword{$cpos};
      push @words, $token;
    }
    $result->{'context'} = "@words";
  }
  foreach my $att (keys %{$self->{'data'}}) {
    my $data = $self->{'data'}->{$att}->cpos2struc2str($center);
    $result->{'data'}->{$att} = (defined $data) ? $data : "";
  }
  return $result;
}
# front-end to the query function; parses query string entered by user
sub query_string {
  my $self = shift;
  croak "Usage \$search->query_string(\$string);"
    unless @_ == 1;
  my $query = shift;
  # double quotes ("...") mark multi-word keys (exact match)
  # -> convert spaces to underlines, so we don't split them when tokenising the string
  while ($query =~ /(^|[^\\])\"(.*?[^\\])\"/) {
    my ($pre, $multiword, $post) = ($`.$1, $2, $');
    #    print "SUBST '$query' -> ($pre | $multiword | $post)\n";
    # if either one of the quot
    $multiword =~ tr[ ][_];
    $query = $pre.$multiword.$post;
  }
  my @keywords = split " ", $query;
   
  # a single keyword is implicitly required -- make that explicit (because of the
  # special tricks for multi-word keys below)
  if (@keywords == 1 and $keywords[0] !~ /^[+-]/) {
    $keywords[0] = "+".$keywords[0];
  }
  # now convert _ to SPC in multi-word keys, with extra post-processing
  my @extra = ();
  foreach my $keyword (@keywords) {
    if ($keyword =~ /_/) {
      $keyword =~ tr[_][ ];
      my @components = split " ", $keyword;
      if ($keyword =~ /^\+/) {
        # only the last token in a multi-word key is added to the set of keywords
        # (for highlighting and match relevance); however, for required multi-word keys, 
        # we can add the other tokens as additional required keywords to the query
        pop @components;        # last token get's listed anyway
        push @keywords, map {(/^\+/) ? "$_" : "+$_"} @components; # add other tokens explicitly
      }
    }
  }
  # now we can run the query with the tokenised list of keywords
  return $self->query(@keywords);
}
# the search query function ... 
sub query {
  my $self = shift;
  $self->error("No search keywords specified.")
    unless @_;
  my @required_keywords = grep {/^\+/} @_; # +computer => required keyword
  my @excluded_keywords = grep {/^-/} @_;  # -windows => must _not_ be in match
  my @optional_keywords = grep {not /^[-+]/} @_; # the remaining keywords are optional
  map {s/^\+//;} @required_keywords; # remove '+' and '-' marks
  map {s/^-//;} @excluded_keywords;
  my $no_of_keywords = @required_keywords + @optional_keywords; # max. no of keywords in a match
  $self->error("Query containing only negative constraints not allowed.")
    unless $no_of_keywords > 0;
   
  # reset matchlist
  my $ml = $self->{'ml'} = new CWB::Web::Search::MatchList;
  my @list = ();                # list of current keyword's corpus positions
  my $keyword = "";             # keyword we're currently processing
  my $has_required_keywords = @required_keywords > 0;
  # start with required keywords (if there are any)
  if ($has_required_keywords) {
    $keyword = shift @required_keywords;
    $ml->constraint("join", $keyword, $self->{'check_size'},
                    $self->get_positions($keyword));    # initial matchlist (add to empty list)
    foreach $keyword (@required_keywords) {
      $ml->constraint('positive', $keyword, $self->{'check_size'}, 
                      $self->get_positions($keyword));
    }
  }
  # add optional keywords now
  foreach $keyword (@optional_keywords) {
    if ($has_required_keywords) {
      # if there were any required keywords, optional keywords can 
      # only be adjoined to the existing matches
      $ml->constraint('optional', $keyword, $self->{'check_size'}, 
                    $self->get_positions($keyword));
    }
    else {
      # otherwise, optional keywords may introduce new matches
      $ml->constraint('join', $keyword, $self->{'check_size'},
                      $self->get_positions($keyword));
    }
  }
  # finally, apply excluded keywords
  foreach $keyword (@excluded_keywords) {
    $ml->constraint('negative', $keyword, $self->{'check_size'}, 
                    $self->get_positions($keyword));
  }
  # now, compute quality, expand the match list & sort it
  $ml->compute_quality($no_of_keywords, @{$self->{'preferred'}});
  if ($self->{'cull'} eq 'before') {
    $ml->sort('first');
    $ml->cull;
  }
  $ml->expand($self->{'expand'});
  if ($self->{'cull'} eq 'after') {
    $ml->sort('first');
    $ml->cull;
  }
  $ml->sort('quality');
  # finally, return number of matches
  return $ml->size;
}
# internal method: get IDs (word OR lemma) matching given keyword
# returns ($attribute, @id_list)
sub get_IDs {
  my $self = shift;
  my $keyword = shift;
   
  my $word_att = $self->{'word'};
  my $lemma_att = $self->{'lemma'};
  my $flags = '';               # construct ignore case/diacritics flags for regex2id()
  if (($self->{'ignore_case'} eq 'always') or
      ($self->{'ignore_case'} eq 'auto' and $keyword !~ /[A-Z\xc0-\xd6\xd8-\xde]/)) {
    $flags .= 'c';
  }
  if (($self->{'ignore_diac'} eq 'always') or
      ($self->{'ignore_diac'} eq 'auto' and $keyword !~ /[\xc0-\xd6\xd8-\xf6\xf8-\xff]/)) {
    $flags .= 'd';
  }
  if (not $flags and $keyword =~ /^[A-za-z0-9\xc0-\xff_-]+$/) {
    # regular word -> look up in index (use lemma attribute if possible)
    my $lemma_id = $lemma_att->str2id($keyword);
    my $word_id = $word_att->str2id($keyword);
    my $lemma_freq = (defined $lemma_id) ? $lemma_att->id2freq($lemma_id) : 0;
    my $word_freq = (defined $word_id) ? $word_att->id2freq($word_id) : 0;
    if ($lemma_freq > $word_freq) {
      # user entered lemma form -> look up in lemma index
      return ($lemma_att, $lemma_id);
      # NB: $lemma_id must be defined because $lemma_freq > 0
    }
    else {
      if ($word_freq > 0) {
        # user entered an inflected form or incorrect lemma
        # the word form was found in the corpus, so let's try to lemmatise it
        my @cpos = $word_att->idlist2cpos($word_id);
        my %lemma_id = ();              # collect lemma ID's cooccuring with word form
        for (my $i = 0; $i < @cpos and $i < 100; $i++) { # try first 100 occurrences only
          $lemma_id{$lemma_att->cpos2id($cpos[$i])} = 1;
        }
        my @lemma_ids = grep {$lemma_att->id2str($_) ne "<unknown>"} keys %lemma_id;
        if (@lemma_ids) {
          # we've found one or more possible lemmatisations -> pick most frequent one
          ($lemma_id) = sort {$lemma_att->id2freq($b) <=> $lemma_att->id2freq($a)} @lemma_ids;
          $lemma_freq = $lemma_att->id2freq($lemma_id);
          # now check again if lemma is more 'general' than word form
          if ($lemma_freq > $word_freq) {
            return ($lemma_att, $lemma_id);
          }
        }
        # if lemmatisation attempt failed, fall back on word form
        return ($word_att, $word_id);
      }
      else {
        # not in corpus; return word attribute with empty ID list
        return ($word_att);
      }
    }            
  }
  else {
    # regular expression -> match on lemma or word forms
    # (if ignore case/diacritics was requested, we must always use regexps)
    my @lemma_ids = $lemma_att->regex2id($keyword, $flags);
    my @word_ids = $word_att->regex2id($keyword, $flags);
    my $lemma_freq = $lemma_att->idlist2freq(@lemma_ids);
    my $word_freq = $word_att->idlist2freq(@word_ids);
    if ($lemma_freq > $word_freq) {
      return ($lemma_att, @lemma_ids);
    }
    else {
      return ($word_att, @word_ids);
    }
  }
   
}
# internal method: get positions of a given keyword
sub get_positions {
  my $self = shift;
  my $keyword = shift;
  my ($attribute, @id_list);
  if ($keyword =~ /\s+/) {
    # keyword contains blanks -> multi-word string
    my (@keywords) = split /\s+/, $keyword;
    $keyword = shift @keywords; # match first keyword
    ($attribute, @id_list) = $self->get_IDs($keyword);
    my @cpos_list = $attribute->idlist2cpos(@id_list);
    foreach $keyword (@keywords) {
      ($attribute, @id_list) = $self->get_IDs($keyword);
      my %matches_keyword = map {($_ => 1)} @id_list; # make lookup hash for matching IDs
      foreach my $cpos (@cpos_list) {
        my $id = $attribute->cpos2id($cpos+1);
        if (defined $id and $matches_keyword{$id}) { 
          $cpos = $cpos + 1;    # tokens in multi-word string must be adjacent ...
        }
        else {
          $id = $attribute->cpos2id($cpos+2);
          if (defined $id and $matches_keyword{$id}) {
            $cpos = $cpos + 2;  # ... allowing at most 1 intervening corpus position
          }
          else {
            $cpos = -1;         # delete this candidate
          }
        }
      }
      @cpos_list = grep {$_ >= 0} @cpos_list; # remove deleted candidates from list
    }
    return @cpos_list;
  } 
  else {
    ($attribute, @id_list) = $self->get_IDs($keyword);
    return $attribute->idlist2cpos(@id_list);
  }
}
return 1;
__END__
=head1 NAME
  CWB::Web::Search - A WWW search style front-end to the CWB
=head1 SYNOPSIS
  use CWB::Web::Search;
  # typically, a search object is used for a single search only
  $search = new CWB::Web::Search 'WEB-SITE-INDEX';
  # here, 'web-site-index' is a CWB-encoded corpus containing the
  # textual content of an indexed WWW site
  $search->window("1 document"); # search window 
  $search->context("document");  # match context returned as HTML
  # window and context size are specified in CQP syntax
  $search->data("url", "date");  # values of s-attributes are returned
  # here, markup is <url http://...> and <date 13 Oct 1999>
  $search->ignore_case(1);       # case-insensitive search
  $search->ignore_diacritics(1); # search ignores diacritics
  $search->cull('after');        # remove duplicate documents (context)
  $search->highlight('<font color=red>'); # HTML highlighting tag
  # run query - returns number of matches (for convenience)
  $nr_matches = $search->query("+editor", "free", "GNU", "-Microsoft");
  # look for documents containing the word 'editor', preferably
  # 'free' or 'GNU' as well, and not containing 'Microsoft'
  $nr_matches = $search->size;  # same as number returned by query()
  # alternatively, let WebSearch::Search parse the query string
  $nr_matches = $search->query_string("+editor free GNU -Microsoft");
  # typical result processing loop
  for ($i = 0; $i < $nr_matches; $i++) {
    $nr = $i + 1;               # match number
    $m = $search->match($i);    # returns result struct without 'context'
    $m->{'cpos'};               # corpus position of match centre
    $m->{'quality'};            # relevance of this match
    $m->{'summary'};            # summary of match (HTML encoded)
    $m->{'data'}->{'url'};      # requested data values
    $m->{'data'}->{'date'};
    if ($want_context) {
      $m = $search->match($i, 'context');
      $m->{'context'};          # match with context (HTML encoded)
    }
  }
  undef $search;
=head1 DESCRIPTION
The I<CWB::Web::Search> module executes simple queries similar to
commercial Web search engines on CWB-encoded corpora. The I<query()> method
returns I<keywords> found in the corpus with the requested amount of
context in HTML format. Additionally, data stored in structural
attributes can be returned. Typically, a CGI script will create a
I<CWB::Web::Search> object for a single query.
=head1 ERRORS
If the I<CWB::Web::Search> module encounters an error condition, an error
message is printed on C<STDERR> and the program is terminated. A user-defined
error handler can be installed with the I<on_error()> method. In this case,
the error callback function is passed the error message generated by the module
as a list of strings.
=head1 CORPUS REGISTRY
If you need to use a registry other than the default corpus registry,
you should change the setting directly in the L<CWB::CL|CWB::CL> module.
  use CWB::CL;
  $CWB::CL::Registry = "/path/to/my/registry";
This will affect all new I<CWB::Web::Search> objects.
=head1 RESULT STRUCTURE
The search module's I<match()> method return a I<result struct> for
the n-th match of the last query executed. A CGI script will usually
iterate through all matches with a loop similar to this:
    $nr_matches = $search->query(...);
    for ($n = 0; $n < $nr_maches; $n++) {
      $m = $search->match($n);
      # code for processing match data in result struct $m 
    }
A I<result struct> $m has the following fields:
=over 4
=item $m->{'cpos'}
I<Corpus  position> of the I<centre>  of  this match (the I<centre> is
computed from the positions of all search I<keywords> in a match).
=item $m->{'quality'}
An estimate of the I<relevance> of this match. This ranking is given as a
percentage with 100% corresponding to a "perfect match". The matches found
by the I<query()> method are sorted according to their 'quality' value. 
=item $m->{'summary'}
A text segment from the corpus containing most of the <keywords> found
in this match (up to a reasonable maxium length). It is returned in
HTML format with the I<keywords> highlighted.
=item $m->{'context'}
The text segment from the corpus containing all <keywords> found in
this match, expanded according to the I<context()> setting. It is
returned in HTML format with the I<keywords> highlighted.
B<NB> The I<context> field is only included if the C<'context'> switch
was passed to the I<match()> method:
    $m = $search->match($n, 'context');
See the remarks on I<virtual context> in the description of the
I<cull()> method below.
=item $m->{'data'}
The values of the structural attributes requested by the I<data()> 
method are returned in the subfields of the 'data' field. A typical
CGI application will use the 'data' field to retrieve document URLs,
e.g.
    $match_url = $m->{'data'}->{'url'};
where the search corpus contains regions like the following
    <url http://www.ims.uni-stuttgart.de/> ... </url>
The values stored in the 'data' field are not HTML encoded.
=back
=head1 METHODS
=over 4
=item $search = new CWB::Web::Search $corpus;
Create I<CWB::Web::Search> object for WWW search queries on the
CWB corpus C<$corpus>.
=item @results = $search->query($key1, $key2, ... );
Searches corpus for the specified I<keywords> and returns a list
of matches sorted by (decreasing) relevance. 
  
See L<"RESULT STRUCTURE"> for the format of the C<@results> list. 
=back
=head1 COPYRIGHT
Copyright (C) 1999-2020 Stefan Evert [http::/purl.org/stefan.evert]
This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.
=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)