lib/Bio/Polloc/GroupCriteria.pm


            
              —
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
—
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
—
86
87
88
89
90
91
92
93
94
95
96
97
—
98
99
100
101
102
103
104
105
106
107
108
109
—
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
—
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
—
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
—
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
—
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
—
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
—
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
—
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
—
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
—
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
—
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
—
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
—
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
—
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
—
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
—
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
—
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
—
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
              
=head1 NAME
Bio::Polloc::GroupCriteria - Rules to group loci
=head1 DESCRIPTION
Takes loci and returns groups of loci based on certain
rules.  If created via .bme (.cfg) files, it is defined
in the C<[ RuleGroup ]> and C<[ GroupExtension ]>
namespaces.
=head1 AUTHOR - Luis M. Rodriguez-R
Email lmrodriguezr at gmail dot com
=head1 IMPLEMENTS OR EXTENDS
=over
=item *
L<Bio::Polloc::Polloc::Root>
=back
=cut
package Bio::Polloc::GroupCriteria;
use strict;
use base qw(Bio::Polloc::Polloc::Root);
use List::Util qw(min max first);
use Bio::Polloc::Polloc::IO;
use Bio::Polloc::LociGroup;
use Bio::Polloc::GroupCriteria::operator;
use Bio::Polloc::GroupCriteria::operator::cons;
use Bio::Seq;
use Error qw(:try);
our $VERSION = $Bio::Polloc::Polloc::Root::VERSION;
#
=head1 APPENDIX
Methods provided by the package
=cut
=head2 new
Generic initialization method
B<Arguments>
=over
=item -souce I<str>
See L<source()>
=item -target I<str>
See L<target()>
=item -features I<Bio::Polloc::LociGroup>
Alias of C<-loci>
=item -loci I<Bio::Polloc::LociGroup>
See L<locigroup()>
=back
B<Returns>
The C<Bio::Polloc::GroupCriteria> object
=cut
sub new {
   my($caller,@args) = @_;
   my $self = $caller->SUPER::new(@args);
   $self->_initialize(@args);
   return $self;
}
=head2 source
Sets/gets the type of source loci (see L<Bio::Polloc::LocusI->family()>
=cut
sub source {
   my($self, $value) = @_;
   $self->{'_source'} = $value if defined $value;
   return $self->{'_source'};
}
=head2 target
Sets/gets the type of target loci (see L<Bio::Polloc::LocusI->family()>
=cut
sub target {
   my($self, $value) = @_;
   $self->{'_target'} = $value if defined $value;
   return $self->{'_target'};
}
=head2 locigroup
Gets/sets the input L<Bio::Polloc::LociGroup> object containing
all the loci to evaluate.
=cut
sub locigroup {
   my($self, $value) = @_;
   if(defined $value){
      $self->{'_locigroup'} = $value;
      $self->{'_reorder'} = 1;
   }
   return $self->{'_locigroup'};
}
=head2 condition
Sets/gets the conditions set to evaluate.
=cut
sub condition {
   my($self, $value) = @_;
   if(defined $value){
      $self->throw('Unexpected type of condition', $value)
                unless UNIVERSAL::can($value, 'isa') and $value->isa('Bio::Polloc::GroupCriteria::operator');
      $self->{'_condition'} = $value;
   }
   return $self->{'_condition'};
}
=head2 evaluate
Compares two loci based on the defined conditions
B<Parameters>
=over
=item *
The first locus (a L<Bio::Polloc::LocusI> object)
=item *
The second locus (a L<Bio::Polloc::LocusI> object)
=back
B<Returns>
Boolean
B<Throws>
L<Bio::Polloc::Polloc::Error> if unexpected input or undefined condition, source or
target
=cut
sub evaluate {
   my($self, $feat1, $feat2) = @_;
   # Test the input
   $feat1->isa('Bio::Polloc::LocusI') or
        $self->throw("First feature of illegal class", $feat1);
    
   $feat2->isa('Bio::Polloc::LocusI') or
        $self->throw("Second feature of illegal class", $feat2);
    
   defined $self->condition or
        $self->throw("Undefined condition, impossible to group");
    
   $self->condition->type eq 'bool' or
        $self->throw("Unexpected type of condition", $self->condition);
    
   $self->throw("Undefined source features") unless defined $self->source;
   $self->throw("Undefined target features") unless defined $self->target;
    
   # Run
   return 0 unless $feat1->family eq $self->source;
   return 0 unless $feat2->family eq $self->target;
   $Bio::Polloc::GroupCriteria::operator::cons::OP_CONS->{'FEAT1'} = $feat1;
   $Bio::Polloc::GroupCriteria::operator::cons::OP_CONS->{'FEAT2'} = $feat2;
   my $o = $self->condition->operate;
   $Bio::Polloc::GroupCriteria::operator::cons::OP_CONS->{'FEAT1'} = undef;
   $Bio::Polloc::GroupCriteria::operator::cons::OP_CONS->{'FEAT2'} = undef;
   return $o;
}
=head2 get_loci
Gets the stored loci
B<Note>
The stored loci can also be obtained with C<$object-E<gt>locigroup-E<gt>loci>,
but this function ensures a consistent order in the loci for its evaluation.
=cut
sub get_loci {
   my($self,@args) = @_;
   $self->{'_features'} = $self->locigroup->loci
        if defined $self->locigroup and not defined $self->{'_features'};
   $self->{'_features'} = [] unless defined $self->{'_features'};
   if($self->{'_reorder'} && $self->source ne $self->target){
      my @src = ();
      my @tgt = ();
      my @oth = ();
      for my $ft (@{$self->locigroup->loci}){
            if($ft->family eq $self->source){ push (@src, $ft) }
         elsif($ft->family eq $self->target){ push (@tgt, $ft) }
         else{ push @oth, $ft }
      }
      $self->{'_features'} = [];
      push @{$self->{'_features'}}, @tgt, @src, @oth;
      $self->{'_reorder'} = 0;
   }
   return $self->{'_features'};
}
=head2 get_locus
Get the locus with the specified index.
B<Arguments>
The index (int, mandatory).
B<Returns>
A L<Bio::Polloc::LocusI> object or undef.
B<Note>
This is a lazzy method, and should be used B<ONLY> after C<get_loci()>
were called at least once.  Otherwise, the order might not be the expected,
and weird results would appear.
=cut
sub get_locus {
   my($self, $index) = @_;
   return unless defined $index;
   return unless defined $self->{'_features'};
   return $self->{'_features'}->[$index];
}
=head2 extension
Sets the conditions for group extensions.
B<Arguments>
Array, hash or string with C<-key =E<gt> value> pairs.  Supported values are:
=over
=item -function I<str>
=over
=item C<context>
Searches the flanking regions in the target sequence.
=back
=item -upstream I<int>
Extension in number of residues upstream the feature.
=item -downstream I<int>
Extension in number of residues downstream the feature.
=item -detectstrand I<bool (int)>
Should I detect the proper strand?  Otherwise, the stored strand
is trusted.  This is useful for non-directed features like repeats,
which context is actually directed.
=item -alldetected I<bool (int)>
Include all detected features (even these overlapping with input features).
=item -feature I<bool (int)>
Should I include the feature region in the search? 0 by default.
=item -lensd I<float>
Number of Standar Deviations (SD) tolerated as half of the range of lengths
for a feature.  The average (Avg) and the standard deviation of the length
are calculated based on all the stored features, and the Avg+(SD*lensd) is
considered as the largest possible new feature.  No minimum length constraint
is given, unless explicitly set with -minlen.  This argument is ignored if
C<-maxlen> is explicitly set.  Default is 1.5.
=item -maxlen I<int>
Maximum length of a new feature in number of residues.  If zero (0) evaluates
C<-lensd> instead.  Default is 0.
=item -minlen I<int>
Minimum length of a new feature in number of residues.  Default is 0.
=item -similarity I<float>
Minimum fraction of similarity to include a found region. 0.8 by default.
=item -oneside I<bool (int)>
Should I consider features with only one of the sides?  Takes effect only if
both -upstream and -downstream are defined. 0 by default.
=item -algorithm I<str>
=over
=item C<blast>
Use BLAST to search (after multiple alignment and consensus calculation of
queries).  Default algorithm.
=item C<hmmer>
Use HMMer to search (after multiple alignment and C<hmmbuild> of query
sequences).
=back
=item -score I<int>
Minimum score for either algorithms B<blast> and B<hmmer>. 20 by default.
=item -consensusperc I<float>
Minimum percentage a residue must appear in order to include it in the
consensus used as query.  60 by default.  Only if -algorithm blast.
=item -e I<float>
If C<-algorithm> B<blast>, maximum e-value.  0.1 by default.
=item -p I<str>
If C<-algorithm> B<blast>, program used (C<[t]blast[npx]>).  B<blastn> by
default.
=back
B<Throws>
L<Bio::Polloc::Polloc::Error> if unexpected input,
=cut
sub extension {
   my ($self, @args) = @_;
   return $self->{'_groupextension'} unless $#args>=0;
   @args = split /\s+/, $args[0] if $#args == 0;
   $self->throw("Odd number of elements, impossible to build key-value pairs", \@args)
        unless $#args%2;
   my %f = @args;
   $f{'-function'} ||= 'context';
   $f{'-algorithm'} ||= 'blast';
   ($f{'-feature'} ||= 0) += 0;
   ($f{'-downstream'} ||= 0) += 0;
   ($f{'-upstream'} ||= 0) += 0;
   ($f{'-detectstrand'} ||= 0) += 0;
   ($f{'-alldetected'} ||= 0) += 0;
   ($f{'-oneside'} ||= 0) += 0;
   $f{'-lensd'} = defined $f{'-lensd'} ? $f{'-lensd'}+0 : 1.5;
   $f{'-maxlen'} = defined $f{'-maxlen'} ? $f{'-maxlen'}+0 : 0;
   $f{'-minlen'} = defined $f{'-minlen'} ? $f{'-minlen'}+0 : 0;
   $f{'-similarity'} = defined $f{'-similarity'} ? $f{'-similarity'}+0 : 0.8;
   $f{'-score'} = defined $f{'-score'} ? $f{'-score'}+0 : 20;
   $f{'-consensusperc'} = defined $f{'-consensusperc'} ? $f{'-consensusperc'}+0 : 60;
   $f{'-e'} = defined $f{'-e'} ? $f{'-e'}+0 : 0.1;
   $f{'-p'} = 'blastn' unless defined $f{'-p'};
   $self->{'_groupextension'} = \%f;
   return $self->{'_groupextension'};
}
=head2 extend
Extends a group based on the arguments provided by L<Bio::Polloc::GroupCriteria->extension()>.
B<Arguments>
=over
=item -loci I<Bio::Polloc::LociGroup>
The L<Bio::Polloc::LociGroup> containing the loci in the group to extend.
=back
B<Returns>
A L<Bio::Polloc::LociGroup> object containing the updated group, i.e. the
original group PLUS the extended features.
B<Throws>
L<Bio::Polloc::Polloc::Error> if unexpected input or weird extension definition.
=cut
sub extend {
   my ($self, @args) = @_;
   my ($loci) = $self->_rearrange([qw(LOCI)], @args);
   # Check input
   my $ext = $self->{'_groupextension'};
   return unless defined $ext;
   $self->throw("The loci are not into an object", $loci)
        unless defined $loci and ref($loci) and UNIVERSAL::can($loci,'isa');
   $self->throw("Unexpected type for the group of loci", $loci)
        unless $loci->isa('Bio::Polloc::LociGroup');
   return unless $#{$loci->loci}>=0;
   # Set ID base
   my $group_id = $self->_next_group_id;
   # Run
   my @new = ();
   $self->debug("--- Extending group (based on ".($#{$loci->loci}+1)." loci) ---");
   if(lc($ext->{'-function'}) eq 'context'){
      my ($up_pos, $down_pos, $in_pos);
      $loci->fix_strands(max($ext->{'-downstream'}, $ext->{'-upstream'}))
        if $ext->{'-detectstrand'} and ($ext->{'-upstream'} or $ext->{'-downstream'});
      # Search
      my $eval_feature = $ext->{'-feature'} ? 1 : 0;
      $up_pos = $self->_search_aln_seqs($loci->align_context(-1, $ext->{'-upstream'}, 0))
        if $ext->{'-upstream'};
      $down_pos = $self->_search_aln_seqs($loci->align_context(1, $ext->{'-downstream'},0))
        if $ext->{'-downstream'};
      $in_pos = $self->_search_aln_seqs($loci->align_context(0, 0, 0)) if $ext->{'-feature'};
      # Determine maximum size
      my $max_len = $ext->{'-maxlen'};
      unless($max_len){
         my($len_avg, $len_sd) = $self->locigroup->avg_length;
         $self->warn("Building size constrains based in one sequence only")
                if $#{$self->locigroup->loci}<1;
         $max_len = $len_avg + $len_sd*$ext->{'-lensd'};
      }
      $self->debug("Comparing results with maximum feature's length of $max_len");
      # Evaluate/pair
      if($ext->{'-upstream'} and $ext->{'-downstream'}){
         # Detect border pairs
         push @new, $self->_detect_border_pairs($up_pos, $down_pos, $max_len);
         if($eval_feature){
            $self->debug("Filtering results with in-feature sequences");
            my @prefilter = @new;
            @new = ();
            BORDER: for my $br (@prefilter){
               push @new,
                  first { $br->[0] ne $_->[0] # != ctg
                     and $br->[3] == $_->[3] # upstream's strand
                     and $br->[3]*$_->[1] < $br->[3]*$br->[2] # no overlap
                     and $br->[3]*$_->[2] > $br->[3]*$br->[1] # no overlap
                  } @$in_pos;
               #WITHIN: for my $in (@$in_pos){
        #         $self->throw("Unexpected array structure (in-feature)", $in)
        #                       unless defined $in->[0] and defined $in->[4];
        #         next WITHIN   if $br->[0] eq $in->[0] # == ctg
        #                       or $br->[3] != $in->[3] # upstream's strand
#                               or $br->[3]*$in->[1] >= $br->[3]*$br->[2] # overlap
#                               or $br->[3]*$in->[2] <= $br->[3]*$br->[1]; # overlap
#                 # Good!
#                 $br->[4] = (2*$br->[4] + $in->[4])/3;
#                 push @new, $br;
#                 next BORDER;
              # }
            }
         }
      }elsif($eval_feature){ @new = @$in_pos }else{
         $self->throw('Nothing to evaluate!  '.
                        'I need either the two borders or the middle sequence (or both)');
      }
   }else{
      $self->throw('Unsupported function for group extension', $ext->{'-function'});
   }
   # And finally, create the detected features, discarding loci overlapping input loci
   $self->debug("Found ".($#new+1)." loci, creating extend features");
   my $comments = "Based on group $group_id: ";
   for my $locus (@{$loci->loci}) { $comments.= $locus->id . ", " if defined $locus->id }
   $comments = substr $comments, 0, -2;
    
   my $newloci = Bio::Polloc::LociGroup->new();
   $newloci->name($loci->name."-ext") if defined $loci->name;
   $newloci->featurename($loci->featurename) if defined $loci->featurename;
   $newloci->genomes($loci->genomes) if defined $loci->genomes;
   NEW: for my $itemk (0 .. $#new){
      my $item = $new[$itemk];
      ($item->[1], $item->[2]) = (min($item->[1], $item->[2]), max($item->[1], $item->[2]));
      unless($ext->{'-alldetected'}){
         OLD: for my $locus (@{$loci->loci}){
            # Not new! :
            next NEW if $item->[1]<$locus->to and $item->[2]>$locus->from;
         }
      }
      my $seq;
      my($Gk, $acc) = split /:/, $item->[0], 2;
      $Gk+=0;
      for my $ck (0 .. $#{$self->genomes->[$Gk]->get_sequences}){
         my $id = $self->genomes->[$Gk]->get_sequences->[$ck]->display_id;
         if($id eq $acc or $id =~ m/\|$acc(\.\d+)?(\||\s*$)/){
            $seq = [$Gk,$ck]; last;
         }
      }
      $self->warn('I can not find the sequence', $acc) unless defined $seq;
      $self->throw('Undefined genome-contig pair', $acc, 'UnexpectedException')
                unless defined $self->genomes->[$seq->[0]]->get_sequences->[$seq->[1]];
      my $id = $self->source . "-ext:".($Gk+1).".$group_id.".($#{$newloci->loci}+2);
      $newloci->add_loci(Bio::Polloc::LocusI->new(
                -type=>'extend',
                -from=>$item->[1],
                -to=>$item->[2],
                -id=>(defined $id ? $id : ''),
                -strand=>($item->[3]==-1 ? '+' : '-'),
                #                      Gk:Genome                   ck:Contig
                -seq=>$self->genomes->[$seq->[0]]->get_sequences->[$seq->[1]],
                -score=>$item->[4],
                -basefeature=>$loci->loci->[0],
                -comments=>$comments,
                -genome=>$self->genomes->[$Gk]
      ));
   }
   return $newloci;
}
=head2 build_bin
Compares all the included loci and returns the identity matrix
B<Arguments>
=over
=item -complete I<bool (int)>
If true, calculates the complete matrix instead of only the bottom-left triangle.
=back
B<Returns>
A reference to a boolean 2-dimensional array (only left-down triangle)
B<Note>
B<WARNING!>  The order of the output is not allways the same of the input.
Please use C<get_loci()> instead, as source features B<MUST> be after
target features in the array.  Otherwise, it is not possible to have the
full picture without building the full matrix (instead of half).
=cut
sub build_bin {
   my($self,@args) = @_;
   my $bin = [];
   my($complete) = $self->_rearrange([qw(COMPLETE)], @args);
   for my $i (0 .. $#{$self->get_loci}){
      $bin->[$i] = [];
      my $lim = $complete ? $#{$self->get_loci} : $i;
      for my $j (0 .. $lim){
         $bin->[$i]->[$j] = $self->evaluate(
                $self->get_loci->[$i],
                $self->get_loci->[$j]
         );
      }
   }
   return $bin;
}
=head2 bin_build_groups
Builds groups of loci based on a binary matrix
B<Arguments>
A matrix as returned by L<Bio::Polloc::GroupCriteria::build_bin()>
B<Returns>
A 2-D arrayref.
B<Note>
This method is intended to build groups providing information on all-vs-all
comparisons.  If you do not need this information, use the much more
efficient L<Bio::Polloc::GroupCriteria::build_groups()> method, that relies on
transitive property of groups to avoid unnecessary comparisons.  Please note
that this function also relies on transitivity, but gives you the option to
examine all the paired comparisons and even write your own grouping function.
=cut
sub bin_build_groups {
   my($self,$bin) = @_;
   my $groups = [];
   FEAT: for my $f (0 .. $#{$self->get_loci}){
      GROUP: for my $g (0 .. $#{$groups}){
         MEMBER: for my $m (0 .. $#{$groups->[$g]}){
            if($bin->[$f]->[$groups->[$g]->[$m]] ){
               push @{$groups->[$g]}, $f;
               next FEAT;
            }
         }
      }
      push @{$groups}, [$f]; # If not found in previous groups
   }
   # Change indexes by Bio::Polloc::LocusI objects
   return $self->_feat_index2obj($groups);
}
=head2 build_groups
This is the main method, creates groups of loci.
B<Arguments>
=over
=item -cpus I<int>
If defined, attempts to distribute the work among the specified number of
cores. B<Warning>: This parameter is experimental, and relies on
C<Parallel::ForkManager>.  It can be used in production with certain
confidence, but it is highly probable to B<NOT> work in parallel (to avoid
errors, this method ignores the command at ANY possible error).
B<Unimplemented>: This argument is currently ignored. Some algorithmic
considerations must be addressed before using it. B<TODO>.
=item -advance I<coderef>
A reference to a function to call at every new pair.  The function is called
with three arguments, the first is the index of the first locus, the second
is the index of the second locus and the third is the total number of loci.
Note that this function is called B<BEFORE> running the comparison.
=over
B<Returns>
An arrayref of L<Bio::Polloc::LociGroup> objects, each containing one consistent
group of loci.
B<Note>
This method is faster than combining C<build_bin()> and C<build_groups_bin()>,
and it should be used whenever transitivity can be freely assumed and you do
not need the all-vs-all matrix for further evaluation (for example, manual
inspection).
=cut
sub build_groups {
   my($self,@args) = @_;
   my ($cpus, $advance) = $self->_rearrange([qw(CPUS ADVANCE)], @args);
    
   my $groups = [[0]]; #<- this is bcs first feature is ignored in FEAT1
   my $loci = $self->get_loci;
   my $l_max = $#$loci;
   $self->debug("Building groups for ".($l_max+1)." loci");
   $self->warn('Nothing to do, any stored loci') unless $l_max>=0;
   FEAT1: for my $i (1 .. $l_max){
      FEAT2: for my $j (0 .. $i-1){
         $self->debug("Evaluate [$i vs $j]");
         &$advance($i, $j, $l_max+1) if defined $advance;
         next FEAT2 unless $self->evaluate(
                $loci->[$i],
                $loci->[$j]
         );
         # --> If I am here, FEAT1 ~ FEAT2 <--
         GROUP: for my $g (0 .. $#{$groups}){
            MEMBER: for my $m (0 .. $#{$groups->[$g]}){
               if($j == $groups->[$g]->[$m]){
                  # I.e., if FEAT2 is member of GROUP
                  push @{$groups->[$g]}, $i;
                  next FEAT1; #<- This is why the current method is way more efficient
               }
            }#MEMBER
         }#GROUP
      }#FEAT2
      # --> If I am here, FEAT1 belongs to a new group <--
      push @{$groups}, [$i];
   }#FEAT1
   my $out = [];
   for my $gk (0 .. $#$groups){
      my $group = Bio::Polloc::LociGroup->new(-name=>sprintf("%04s", $gk+1)); #+++ ToDo: Is ID ok?
      $group->genomes($self->genomes);
      for my $lk (0 .. $#{$groups->[$gk]}){
         my $locus = $loci->[ $groups->[$gk]->[$lk] ];
         # Paranoid bugbuster:
         $self->throw('Impossible to gather the locus back:'.
                ' $groups->['.$gk.']->['.$lk.']: '.$groups->[$gk]->[$lk],
                $loci, 'Bio::Polloc::Polloc::UnexpectedException')
                unless defined $locus;
         $group->add_loci($locus);
      }
      push @$out, $group;
   }
   return $out;
}
=head2 genomes
Gets the genomes of the base group of loci.  This function is similar
to calling C<locigroup()-E<gt>genomes()>, but is read-only.
=cut
sub genomes {
   my ($self, $value) = @_;
   $self->warn("Attempting to set the genomes from a read-only function")
        if defined $value;
   return unless defined $self->locigroup;
   return $self->locigroup->genomes;
}
=head1 INTERNAL METHODS
Methods intended to be used only within the scope of Bio::Polloc::*
=head2 _detect_border_pairs
=cut
sub _detect_border_pairs {
   my($self, $up_pos, $down_pos, $max_len) = @_;
   return unless $up_pos and $down_pos;
   my $ext = $self->{'_groupextension'};
   my @out = ();
   US: for my $us (@$up_pos){
      $self->throw("Unexpected array structure (upstream): ", $us)
                  unless defined $us->[0] and defined $us->[4];
      $self->debug(" US: ", join(':', @$us));
      my $found;
      my $pair = [];
      DS: for my $ds (@$down_pos){
         $self->throw("Unexpected array structure (downstream): ", $ds)
                          unless defined $ds->[0] and defined $ds->[4];
         $self->debug("  DS: ", join(':', @$ds));
         next DS if $us->[0] ne $ds->[0] # != ctg
                 or $us->[3] == $ds->[3] # == strand
                 or abs($ds->[2]-$us->[2]) > $max_len # too large
                 or abs($ds->[2]-$us->[2]) < $ext->{'-minlen'} # too short
                 or defined $found and abs($us->[2]-$ds->[2]) > $found; # prev better
         # Good!
         $self->debug("Saving pair ".$us->[1]."..".$us->[2]."/".$ds->[1]."..".$ds->[2]);
         $found = abs($us->[2]-$ds->[2]);
         $pair = [$us->[0], $us->[2], $ds->[2], $us->[3], ($us->[4]+$ds->[4])/2];
      }
      push @out, $pair if $#$pair>1;
   }
   return @out;
}
=head2 _next_group_id
Returns an incremental ID that attempts to identify the group used as basis
of extension.  Please note that this method DOES NOT check if the group's ID
is the right one, and it is basically intended to keep track of how many
times the C<extend()> function has been called.
=cut
sub _next_group_id {
   my $self = shift;
   $self->{'_next_group_id'}||= 0;
   return ++$self->{'_next_group_id'};
}
=head2 _build_subseq
B<Arguments>
All the following arguments are mandatory and must be passed in that order.
The strand will be determined by the relative position of from/to:
=over
=item seq I<Bio::Seq object>
The sequence
=item from I<int>
The B<from> position
=item to I<int>
The B<to> position
=back
=back
=back
B<Returns>
A L<Bio::Seq> object.
B<Comments>
This method should be located at a higher hierarchy module (Root?).
This method is static.
=cut
sub _build_subseq {
   my($self, $seq, $from, $to) = @_;
   $self->throw("No main sequence", $seq)
        unless defined $seq and UNIVERSAL::can($seq, 'isa') and $seq->isa('Bio::Seq');
   my ($start, $end) = (min($to, $from), max($to, $from));
   $start = max($start, 1);
   $end = min($end, $seq->length);
   return unless $start != $end;
   my $seqstr = $seq->subseq($start, $end);
   my $cleanstr = $seqstr;
   $cleanstr =~ s/^N*//;
   $cleanstr =~ s/N*$//;
   return unless length $cleanstr > 0; # See issue BME#5
   my $subseq = Bio::Seq->new(-seq=>$seqstr);
   $subseq = $subseq->revcom if $from < $to;
   return $subseq;
}
=head2 _search_aln_seqs
Uses an alignment to search in the sequences of the collection of genomes
B<Arguments>
A Bio::SimpleAlign object
B<Returns>
A reference to an array of references to arrays, each with structure:
C<["genome-key:acc", from, to, strand, score]>
=cut
sub _search_aln_seqs {
   my ($self, $aln) = @_;
   my $ext = $self->{'_groupextension'};
   return unless defined $ext;
   return unless defined $self->genomes;
   my $pos = [];
   return $pos unless defined $aln; #<- For example, if zero sequences.  To gracefully exit.
   my $alg = lc $ext->{'-algorithm'};
   if($alg eq 'blast' or $alg eq 'hmmer'){ # ------------------------------- BLAST & HMMer
      # -------------------------------------------------------------------- Setup DB
      unless(defined $self->{'_seqsdb'}){
         $self->{'_seqsdb'} = Bio::Polloc::Polloc::IO->tempdir();
         $self->debug("Creating DB at ".$self->{'_seqsdb'});
         for my $genomek (0 .. $#{$self->genomes}){
            my $file = $self->{'_seqsdb'}."/$genomek";
            my $fasta = Bio::SeqIO->new(-file=>">$file", -format=>'Fasta');
            for my $ctg (@{$self->genomes->[$genomek]->get_sequences}){ $fasta->write_seq($ctg) }
            # BLAST requires a formatdb (not only the fasta)
            if($alg eq 'blast'){
               my $run = Bio::Polloc::Polloc::IO->new(-file=>"formatdb -p F -i '$file' 2>&1 |");
               while($run->_readline) {} # just run ;o)
               $run->close;
            }
         }
      }
      # -------------------------------------------------------------------- Predefine vars
      my $factory;
      my $query;
      if($alg eq 'blast'){
         $self->_load_module('Bio::Tools::Run::StandAloneBlast');
         my $cons_seq = $aln->consensus_string($ext->{'-consensusperc'});
         $cons_seq =~ s/\?/N/g;
         $query = Bio::Seq->new(-seq=>$cons_seq);
      }elsif($alg eq 'hmmer'){
         $self->_load_module('Bio::Tools::Run::Hmmer');
         my $tmpio = Bio::Polloc::Polloc::IO->new();
         # The following lines should be addressed with a three-lines code,
         # but the buggy AUTOLOAD of Bio::Tools::Run::Hmmer let us no option
         # -lrr
         $factory = Bio::Tools::Run::Hmmer->new();
         $factory->hmm($tmpio->tempfile);
         $factory->program('hmmbuild');
         $factory->run($aln);
         #$factory->calibrate();
      }
      # -------------------------------------------------------------------- Search
      $self->debug("Searching... alg:$alg, sim:".$ext->{'-similarity'}." score:".$ext->{'-score'}." e:".$ext->{'-e'});
      GENOME: for my $Gk (0 .. $#{$self->genomes}){
         my $report;
         if($alg eq 'blast'){
            next GENOME if ($query->seq =~ tr/N//) > 0.25*$query->length; # issue#14
            $factory = Bio::Tools::Run::StandAloneBlast->new(
                '-e'=>$ext->{'-e'}, '-program'=>$ext->{'-p'},
                '-db'=>$self->{'_seqsdb'}."/$Gk" );
            # Try to handle issue#14 and possible undocumented related issues:
            # (still causing some problems in the STDERR output)
            try { $report = $factory->blastall($query); }
            catch Error with {
               $self->debug("Launch BLAST with query: ".$query->seq());
               $self->warn("BLAST failed, skipping query and attempting to continue");
               next GENOME;
            }
            otherwise {
               $self->throw("BLAST failed", $_, 'Bio::Polloc::Polloc::UnexpectedException');
            };
         }elsif($alg eq 'hmmer'){
            $factory->program('hmmsearch');
            $report = $factory->run($self->{'_seqsdb'}."/$Gk");
         }
         # ----------------------------------------------------------------- Parse search
         RESULT: while(my $res = $report->next_result){
            HIT: while(my $hit = $res->next_hit){
               HSP: while(my $hsp = $hit->next_hsp){
                  # -------------------------------------------------------- Eval criteria
                  if(   ($alg eq 'blast'
                                and $hsp->frac_identical('query') >= $ext->{'-similarity'}
                                and $hsp->score >= $ext->{'-score'})
                  or
                        ($alg eq 'hmmer'
                                and $hsp->score >= $ext->{'-score'}
                                and $hsp->evalue <= $ext->{'-e'})
                  ){
                        # -------------------------------------------------- Save result
                        $self->debug("Found: sim:".$hsp->frac_identical('query').", score:".
                                $hsp->score.", e:".$hsp->evalue);
                        my $r_pos = ["$Gk:".$hit->accession,
                                $hsp->strand('hit')!=$hsp->strand('query')?
                                                $hsp->start('hit'):$hsp->end('hit'),
                                $hsp->strand('hit')!=$hsp->strand('query')?
                                                $hsp->end('hit'):$hsp->start('hit'),
                                $hsp->strand('hit')!=$hsp->strand('query')?
                                                -1 : 1,
                                $hsp->bits];
                        push @$pos, $r_pos;
                  }
               } # HSP
            } # HIT
         } # RESULT
      } # GENOME
   }else{ # ---------------------------------------------------------------- UNSUPPORTED
      $self->throw('Unsupported search algorithm', $ext->{'-algorithm'});
   }
   return $pos;
}
=head2 _feat_index2obj
Takes an index 2D matrix and returns it as the equivalent L<Bio::Polloc::LocusI> objects
B<Arguments>
2D matrix of integers (arrayref)
B<Returns>
2D matrix of L<Bio::Polloc::LocusI> objects (ref)
=cut
sub _feat_index2obj{
   my($self,$groups) = @_;
   for my $g (0 .. $#{$groups}){
      for my $m (0 .. $#{$groups->[$g]}){
         $groups->[$g]->[$m] = $self->get_locus($groups->[$g]->[$m]);
      }
   }
   return $groups;
}
=head2 _grouprules_cleanup
=cut
# Issue #7
sub _grouprules_cleanup {
   my $self = shift;
   if(defined $self->{'_seqsdb'}) {
      my $tmp = $self->{'_seqsdb'};
      for my $k (0 .. $#{$self->genomes || []}){
         while(<$tmp/$k.*>){
            unlink $_ or $self->throw("Impossible to delete '$_'", $!);
         }
         unlink "$tmp/$k" or $self->throw("Impossible to delete '$tmp/$k'", $!);
      }
      rmdir $tmp;
   }
}
=head2 _initialize
=cut
sub _initialize {
   my($self, @args) = @_;
   $self->_register_cleanup_method(\&_grouprules_cleanup);
   my($source, $target, $features, $loci) =
        $self->_rearrange([qw(SOURCE TARGET FEATURES LOCI)], @args);
   # $self->throw('Discouraged use of -features flag, use -loci instead');
   $self->source($source);
   $self->target($target);
   $loci = $features if defined $features and not defined $loci;
   $self->locigroup($loci);
}
1;
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)