=head1 NAME Bio::Polloc::LociGroup - A group of loci =head1 AUTHOR - Luis M. Rodriguez-R Email lmrodriguezr at gmail dot com =head1 IMPLEMENTS OR EXTENDS =over =item * L<Bio::Polloc::Polloc::Root> =back =cut package Bio::Polloc::LociGroup; use strict; use Bio::Polloc::Polloc::IO; use base qw(Bio::Polloc::Polloc::Root); =head1 PUBLIC METHODS Methods provided by the package =cut =head2 new The basic initialization method =cut sub new { my($caller,@args) = @_; my $self = $caller->SUPER::new(@args); $self->_initialize(@args); return $self; } =head2 add_locus Alias of C<add_loci()> =cut sub add_locus { return shift->add_loci(@_) } =head2 add_loci Adds loci to the collection on the specified genome's space =head3 Throws A L<Bio::Polloc::Polloc::Error> if an argument is not a L<Bio::Polloc::LocusI> object. =head3 Arguments The first argument B<can> be the identifier of the genome's space (int). All the following are expected to be L<Bio::Polloc::LocusI> objects. =cut sub add_loci { my ($self,@l) = @_; my $space; if(defined $l[0] and not ref $l[0]){ $space = 0 + shift @l; } $self->{'_loci'} = [] unless defined $self->{'_loci'}; for my $locus (@l){ $self->debug("Saving locus (".($#l+1)." loci, cur:".($#{$self->{'_loci'}}+1).")"); $self->throw('Expecting a Bio::Polloc::LocusI object', $locus) unless UNIVERSAL::can($locus, 'isa') and $locus->isa('Bio::Polloc::LocusI'); $locus->genome($self->genomes->[$space]) if defined $space; push @{ $self->{'_loci'} }, $locus; } } =head2 loci Gets the loci =cut sub loci { my $self = shift; $self->{'_loci'} = [] unless defined $self->{'_loci'}; return $self->{'_loci'}; } =head2 structured_loci Returns a two-dimensional array where the first key corresponds to the number of the genome space and the second key is an incremental for each locus. =head3 Note This function is provided for convenience in some output formating, but its use should be avoided as it causes a huge processing time penalty. =head3 Warning Loci without defined genome will not be included in the output. =cut sub structured_loci { my $self = shift; return unless defined $self->genomes; my $struct = []; for my $locus (@{$self->loci}){ next unless defined $locus->genome; my $space = 0; for my $genome (@{$self->genomes}){ $struct->[$space] = [] unless defined $struct->[$space]; if($genome->name eq $locus->genome->name){ push @{ $struct->[$space] }, $locus; } $space++; } } return $struct; } =head2 locus Get a locus by ID =head3 Arguments The ID of the locus (str). =cut sub locus { my ($self, $id) = @_; for my $locus (@{$self->loci}){ return $locus if $locus->id eq $id } return; } =head2 name Gets/sets the name of the group. This is supposed to be unique! =head3 Note Future implementations could assume unique naming for getting/setting/initializing groups of loci by name. =cut sub name { my ($self, $value) = @_; $self->{'_name'} = $value if defined $value; return $self->{'_name'}; } =head2 genomes Gets/sets the genomes to be used as analysis base. =head3 Arguments A reference to an array of L<Bio::Polloc::Genome> objects. =cut sub genomes { my($self, $value) = @_; $self->{'_genomes'} = $value if defined $value; return unless defined $self->{'_genomes'}; $self->throw("Unexpected type of genomes collection", $self->{'_genomes'}) unless ref($self->{'_genomes'}) and ref($self->{'_genomes'})=~m/ARRAY/i; return $self->{'_genomes'}; } =head2 featurename Gets/Sets the name of the feature common to all the loci in the group. =cut sub featurename { my ($self, $value) = @_; $self->{'_featurename'} = $value if defined $value; return $self->{'_featurename'}; } =head2 avg_length Gets the average length of the stored loci. =head3 Returns The average length (float) or an array containing the average length (float) and the standard deviation (float), depending on the expected output. =head3 Syntax my $len = $locigroup->length; Or, my($len, $sd) = $locigroup->length; =cut sub avg_length { my $self = shift; my $len_avg = 0; my $len_sd = 0; if($#{$self->loci} >= 1){ # AVG $len_avg+= abs($_->from - $_->to) for @{$self->loci}; $len_avg = $len_avg/($#{$self->loci}+1); return $len_avg unless wantarray; # SD $len_sd+= (abs($_->from - $_->to) - $len_avg)**2 for @{$self->loci}; $len_sd = sqrt($len_sd/$#{$self->loci}); # n-1, not n (unbiased SD) }elsif($#{$self->loci}==0){ $len_avg = abs($self->loci->[0]->from - $self->loci->[0]->to); } return wantarray ? ($len_avg, $len_sd) : $len_avg; } =head2 align_context =head3 Arguments Arguments work in the same way L<Bio::Polloc::LocusI-E<gt>context_seq()> arguments do. =over =item 1 Ref: Int, reference position. =item 2 From: Int, the I<from> position. =item 3 To: Int, the I<to> position. =back =cut sub align_context { my ($self, $ref, $from, $to) = @_; $from+=0; $to+=0; $ref+=0; return if $from == $to and $ref!=0; my $factory = Bio::Tools::Run::Alignment::Muscle->new(); $factory->quiet(1); my @seqs = (); LOCUS: for my $locus (@{$self->loci}){ # Get the sequence my $seq = $locus->context_seq($ref, $from, $to); next LOCUS unless defined $seq; $seq->display_id($locus->id) if defined $locus->id; push @seqs, $seq; } #LOCUS return unless $#seqs>-1; # Impossible without sequences # small trick to build an alignment, even if there is only one sequence: push @seqs, Bio::Seq->new(-seq=>$seqs[0]->seq, -id=>'dup-seq') unless $#seqs>0; $self->debug("Aligning context sequences"); return $factory->align(\@seqs); } =head2 fix_strands Fixes the strand of the loci based on the flanking regions, to have all the loci in the group with the same orientation. =head3 Arguments =over =item -size I<int> Context size (500 by default) =item -force I<bool (int)> Force the detection, even if it was previously detected. =back =cut sub fix_strands { my ($self, @args) = @_; my ($size, $force) = $self->_rearrange([qw(SIZE FORCE)], @args); return if not $force and defined $self->{'_fixed_strands'} and $self->{'_fixed_strands'} == $#{$self->loci}; $self->{'_fixed_strands'} = $#{$self->loci}; $self->_load_module('Bio::Polloc::GroupCriteria'); return unless $#{$self->loci}>0; # No need to check $size ||= 500; my $factory = Bio::Tools::Run::Alignment::Muscle->new(); $factory->quiet(1); # Find a suitable reference my $ref = [undef, undef]; LOCUS: for my $lk (1 .. $#{$self->loci}){ my $ref_test = [ Bio::Polloc::GroupCriteria->_build_subseq( $self->loci->[$lk]->seq, $self->loci->[$lk]->from - $size, $self->loci->[$lk]->from), Bio::Polloc::GroupCriteria->_build_subseq( $self->loci->[$lk]->seq, $self->loci->[$lk]->to, $self->loci->[$lk]->to + $size) ]; if(defined $ref->[0] and defined $ref->[1]){ # Longer pair: $ref = $ref_test if defined $ref_test->[0] and defined $ref_test->[1] and $ref_test->[0]->length >= $ref->[0]->length and $ref_test->[1]->length >= $ref->[1]->length; }elsif(defined $ref->[0] or defined $ref->[1]){ # Both sequences defined: $ref = $ref_test if defined $ref_test->[0] and defined $ref_test->[1]; }else{ # At least one sequence defined: $ref = $ref_test if defined $ref_test->[0] or defined $ref_test->[1]; } } unless(defined $ref->[0] or defined $ref->[1]){ $self->debug('Impossible to find a suitable reference'); return; } $ref = defined $ref->[0] ? ( defined $ref->[1] ? Bio::Seq->new(-seq=>$ref->[0]->seq . ("N"x20) . $ref->[1]->seq) : $ref->[0] ) : $ref->[1]; $ref->id('ref'); $self->loci->[0]->strand('+'); #Â Compare LOCUS: for my $k (0 .. $#{$self->loci}){ my $tgt = Bio::Polloc::GroupCriteria->_build_subseq( $self->loci->[$k]->seq, $self->loci->[$k]->from-$size, $self->loci->[$k]->to+$size); next LOCUS unless $tgt; # <- This may be way too paranoic! $tgt->id('tgt'); my $tgtrc = $tgt->revcom; $self->debug("Setting strand for ".$self->loci->[$k]->id) if defined $self->loci->[$k]->id; my $eval_fun = 'average_percentage_identity'; #$eval_fun = 'overall_percentage_identity'; if($factory->align([$ref, $tgt])->$eval_fun < $factory->align([$ref,$tgtrc])->$eval_fun){ $self->debug("Assuming negative strand, setting locus orientation"); $self->loci->[$k]->strand('-'); }else{ $self->debug("Assuming positive strand, setting locus orientation"); $self->loci->[$k]->strand('+'); } } # LOCUS } =head1 INTERNAL METHODS Methods intended to be used only within the scope of Bio::Polloc::* =head2 _initialize =cut sub _initialize { my ($self, @args) = @_; my($name, $featurename, $genomes) = $self->_rearrange([qw(NAME FEATURENAME GENOMES)], @args); $self->name($name); $self->featurename($featurename); $self->genomes($genomes); } 1;