Perl x Open Food Facts Hackathon: Paris, France - May 24-25 Learn more

=head1 NAME
Bio::Polloc::LociGroup - A group of loci
=head1 AUTHOR - Luis M. Rodriguez-R
Email lmrodriguezr at gmail dot com
=head1 IMPLEMENTS OR EXTENDS
=over
=item *
L<Bio::Polloc::Polloc::Root>
=back
=cut
use strict;
=head1 PUBLIC METHODS
Methods provided by the package
=cut
=head2 new
The basic initialization method
=cut
sub new {
my($caller,@args) = @_;
my $self = $caller->SUPER::new(@args);
$self->_initialize(@args);
return $self;
}
=head2 add_locus
Alias of C<add_loci()>
=cut
sub add_locus { return shift->add_loci(@_) }
=head2 add_loci
Adds loci to the collection on the specified
genome's space
=head3 Throws
A L<Bio::Polloc::Polloc::Error> if an argument is not
a L<Bio::Polloc::LocusI> object.
=head3 Arguments
The first argument B<can> be the identifier of
the genome's space (int). All the following are
expected to be L<Bio::Polloc::LocusI> objects.
=cut
sub add_loci {
my ($self,@l) = @_;
my $space;
if(defined $l[0] and not ref $l[0]){
$space = 0 + shift @l;
}
$self->{'_loci'} = [] unless defined $self->{'_loci'};
for my $locus (@l){
$self->debug("Saving locus (".($#l+1)." loci, cur:".($#{$self->{'_loci'}}+1).")");
$self->throw('Expecting a Bio::Polloc::LocusI object', $locus)
unless UNIVERSAL::can($locus, 'isa') and $locus->isa('Bio::Polloc::LocusI');
$locus->genome($self->genomes->[$space]) if defined $space;
push @{ $self->{'_loci'} }, $locus;
}
}
=head2 loci
Gets the loci
=cut
sub loci {
my $self = shift;
$self->{'_loci'} = [] unless defined $self->{'_loci'};
return $self->{'_loci'};
}
=head2 structured_loci
Returns a two-dimensional array where the first key corresponds
to the number of the genome space and the second key is an
incremental for each locus.
=head3 Note
This function is provided for convenience in some output formating,
but its use should be avoided as it causes a huge processing time
penalty.
=head3 Warning
Loci without defined genome will not be included in the output.
=cut
sub structured_loci {
my $self = shift;
return unless defined $self->genomes;
my $struct = [];
for my $locus (@{$self->loci}){
next unless defined $locus->genome;
my $space = 0;
for my $genome (@{$self->genomes}){
$struct->[$space] = [] unless defined $struct->[$space];
if($genome->name eq $locus->genome->name){
push @{ $struct->[$space] }, $locus;
}
$space++;
}
}
return $struct;
}
=head2 locus
Get a locus by ID
=head3 Arguments
The ID of the locus (str).
=cut
sub locus {
my ($self, $id) = @_;
for my $locus (@{$self->loci}){ return $locus if $locus->id eq $id }
return;
}
=head2 name
Gets/sets the name of the group. This is supposed
to be unique!
=head3 Note
Future implementations could assume unique naming
for getting/setting/initializing groups of loci
by name.
=cut
sub name {
my ($self, $value) = @_;
$self->{'_name'} = $value if defined $value;
return $self->{'_name'};
}
=head2 genomes
Gets/sets the genomes to be used as analysis base.
=head3 Arguments
A reference to an array of L<Bio::Polloc::Genome> objects.
=cut
sub genomes {
my($self, $value) = @_;
$self->{'_genomes'} = $value if defined $value;
return unless defined $self->{'_genomes'};
$self->throw("Unexpected type of genomes collection", $self->{'_genomes'})
unless ref($self->{'_genomes'}) and ref($self->{'_genomes'})=~m/ARRAY/i;
return $self->{'_genomes'};
}
=head2 featurename
Gets/Sets the name of the feature common to all the
loci in the group.
=cut
sub featurename {
my ($self, $value) = @_;
$self->{'_featurename'} = $value if defined $value;
return $self->{'_featurename'};
}
=head2 avg_length
Gets the average length of the stored loci.
=head3 Returns
The average length (float) or an array containing the
average length (float) and the standard deviation (float),
depending on the expected output.
=head3 Syntax
my $len = $locigroup->length;
Or,
my($len, $sd) = $locigroup->length;
=cut
sub avg_length {
my $self = shift;
my $len_avg = 0;
my $len_sd = 0;
if($#{$self->loci} >= 1){
# AVG
$len_avg+= abs($_->from - $_->to) for @{$self->loci};
$len_avg = $len_avg/($#{$self->loci}+1);
return $len_avg unless wantarray;
# SD
$len_sd+= (abs($_->from - $_->to) - $len_avg)**2 for @{$self->loci};
$len_sd = sqrt($len_sd/$#{$self->loci}); # n-1, not n (unbiased SD)
}elsif($#{$self->loci}==0){
$len_avg = abs($self->loci->[0]->from - $self->loci->[0]->to);
}
return wantarray ? ($len_avg, $len_sd) : $len_avg;
}
=head2 align_context
=head3 Arguments
Arguments work in the same way L<Bio::Polloc::LocusI-E<gt>context_seq()>
arguments do.
=over
=item 1
Ref: Int, reference position.
=item 2
From: Int, the I<from> position.
=item 3
To: Int, the I<to> position.
=back
=cut
sub align_context {
my ($self, $ref, $from, $to) = @_;
$from+=0; $to+=0; $ref+=0;
return if $from == $to and $ref!=0;
my $factory = Bio::Tools::Run::Alignment::Muscle->new();
$factory->quiet(1);
my @seqs = ();
LOCUS: for my $locus (@{$self->loci}){
# Get the sequence
my $seq = $locus->context_seq($ref, $from, $to);
next LOCUS unless defined $seq;
$seq->display_id($locus->id) if defined $locus->id;
push @seqs, $seq;
} #LOCUS
return unless $#seqs>-1; # Impossible without sequences
# small trick to build an alignment, even if there is only one sequence:
push @seqs, Bio::Seq->new(-seq=>$seqs[0]->seq, -id=>'dup-seq') unless $#seqs>0;
$self->debug("Aligning context sequences");
return $factory->align(\@seqs);
}
=head2 fix_strands
Fixes the strand of the loci based on the flanking regions, to have all the
loci in the group with the same orientation.
=head3 Arguments
=over
=item -size I<int>
Context size (500 by default)
=item -force I<bool (int)>
Force the detection, even if it was previously detected.
=back
=cut
sub fix_strands {
my ($self, @args) = @_;
my ($size, $force) = $self->_rearrange([qw(SIZE FORCE)], @args);
return if not $force and defined $self->{'_fixed_strands'} and $self->{'_fixed_strands'} == $#{$self->loci};
$self->{'_fixed_strands'} = $#{$self->loci};
$self->_load_module('Bio::Polloc::GroupCriteria');
return unless $#{$self->loci}>0; # No need to check
$size ||= 500;
my $factory = Bio::Tools::Run::Alignment::Muscle->new();
$factory->quiet(1);
# Find a suitable reference
my $ref = [undef, undef];
LOCUS: for my $lk (1 .. $#{$self->loci}){
my $ref_test = [
Bio::Polloc::GroupCriteria->_build_subseq(
$self->loci->[$lk]->seq,
$self->loci->[$lk]->from - $size,
$self->loci->[$lk]->from),
Bio::Polloc::GroupCriteria->_build_subseq(
$self->loci->[$lk]->seq,
$self->loci->[$lk]->to,
$self->loci->[$lk]->to + $size)
];
if(defined $ref->[0] and defined $ref->[1]){
# Longer pair:
$ref = $ref_test
if defined $ref_test->[0] and defined $ref_test->[1]
and $ref_test->[0]->length >= $ref->[0]->length
and $ref_test->[1]->length >= $ref->[1]->length;
}elsif(defined $ref->[0] or defined $ref->[1]){
# Both sequences defined:
$ref = $ref_test if defined $ref_test->[0] and defined $ref_test->[1];
}else{
# At least one sequence defined:
$ref = $ref_test if defined $ref_test->[0] or defined $ref_test->[1];
}
}
unless(defined $ref->[0] or defined $ref->[1]){
$self->debug('Impossible to find a suitable reference');
return;
}
$ref = defined $ref->[0] ?
( defined $ref->[1] ?
Bio::Seq->new(-seq=>$ref->[0]->seq . ("N"x20) . $ref->[1]->seq)
: $ref->[0]
) : $ref->[1];
$ref->id('ref');
$self->loci->[0]->strand('+');
# Compare
LOCUS: for my $k (0 .. $#{$self->loci}){
my $tgt = Bio::Polloc::GroupCriteria->_build_subseq(
$self->loci->[$k]->seq,
$self->loci->[$k]->from-$size,
$self->loci->[$k]->to+$size);
next LOCUS unless $tgt; # <- This may be way too paranoic!
$tgt->id('tgt');
my $tgtrc = $tgt->revcom;
$self->debug("Setting strand for ".$self->loci->[$k]->id) if defined $self->loci->[$k]->id;
my $eval_fun = 'average_percentage_identity';
#$eval_fun = 'overall_percentage_identity';
if($factory->align([$ref, $tgt])->$eval_fun
< $factory->align([$ref,$tgtrc])->$eval_fun){
$self->debug("Assuming negative strand, setting locus orientation");
$self->loci->[$k]->strand('-');
}else{
$self->debug("Assuming positive strand, setting locus orientation");
$self->loci->[$k]->strand('+');
}
} # LOCUS
}
=head1 INTERNAL METHODS
Methods intended to be used only within the scope of Bio::Polloc::*
=head2 _initialize
=cut
sub _initialize {
my ($self, @args) = @_;
my($name, $featurename, $genomes) = $self->_rearrange([qw(NAME FEATURENAME GENOMES)], @args);
$self->name($name);
$self->featurename($featurename);
$self->genomes($genomes);
}
1;