package Bio::MUST::Core::Roles::Listable;
# ABSTRACT: Listable Moose role for objects with implied id lists
$Bio::MUST::Core::Roles::Listable::VERSION = '0.191300';
use Moose::Role;
use autodie;
use feature qw(say);
use Carp;
use Const::Fast;
use Date::Format;
use List::AllUtils;
use POSIX qw(ceil floor);
use Bio::MUST::Core::Types;
use Bio::MUST::Core::Constants qw(:seqids);
requires 'all_seq_ids';
# IdList factory methods
# alias for std_list emphasizing its use as a lookup
sub new_lookup {
return shift->_list_from_seq_ids(0);
}
sub std_list {
return shift->_list_from_seq_ids(0);
}
sub alphabetical_list {
return shift->_list_from_seq_ids(1);
}
sub _list_from_seq_ids {
my $self = shift;
my $sort = shift;
my @ids = map { $_->full_id } $self->all_seq_ids;
@ids = sort @ids if $sort; # optionally sort list
return Bio::MUST::Core::IdList->new( ids => \@ids );
}
around qw(complete_seq_list len_mapper) => sub {
my $method = shift;
my $self = shift;
# ensure that seqs are available (e.g., the object is an Ali)
unless ( $self->can('all_seqs') ) {
carp '[BMC] Warning: cannot proceed without seqs; returning undef!';
return;
}
return $self->$method(@_);
};
sub complete_seq_list {
my $self = shift;
my $min_res = shift;
# get (non-missing char) lengths of all seqs and record max_len
my @lengths = map { $_->nomiss_seq_len } $self->all_seqs;
my $max_len = List::AllUtils::max @lengths;
# convert fractional min_res to conservative integer (if needed)
$min_res = ceil($min_res * $max_len)
if 0 < $min_res && $min_res < 1;
# filter out seqs with less than min_res non-missing chars
my @ids = map { $_->full_id } $self->all_seq_ids;
my @indices = grep { $lengths[$_] >= $min_res } 0..$#ids;
return Bio::MUST::Core::IdList->new( ids => [ @ids[@indices] ] );
}
# IdMapper factory methods
sub std_mapper {
my $self = shift;
my $prefix = shift // 'seq';
my @seq_ids = $self->all_seq_ids;
return Bio::MUST::Core::IdMapper->new(
long_ids => [ map { $_->full_id } @seq_ids ], # list context
abbr_ids => [ map { $prefix . $_ } 1..@seq_ids ], # scalar context
);
}
sub acc_mapper {
my $self = shift;
my $prefix = shift // q{};
# Note: this mapper could fail with non-GenBank Seqs
my @seq_ids = $self->all_seq_ids;
return Bio::MUST::Core::IdMapper->new(
long_ids => [ map { $_->full_id } @seq_ids ],
abbr_ids => [ map { $prefix . $_->accession } @seq_ids ],
);
}
sub len_mapper {
my $self = shift;
my @seq_ids = $self->all_seq_ids;
my @lengths = map { $_->nomiss_seq_len } $self->all_seqs;
return Bio::MUST::Core::IdMapper->new(
long_ids => [ map { $_->full_id . '@' . shift @lengths } @seq_ids ],
abbr_ids => [ map { $_->full_id } @seq_ids ],
);
}
sub regex_mapper {
my $self = shift;
my $prefix = shift // q{};
my $regex = shift // $DEF_ID;
my @long_ids = map { $_->full_id } $self->all_seq_ids;
# extract unique id component and substitute forbidden chars
# Note: this implementation was definitely too smart...
# my @abbr_ids = map { $prefix . $_ }
# map { $_ =~ s{$NOID_CHARS}{_}g; $_ }
# map { $_ =~ $regex; $1 } @long_ids;
my @abbr_ids;
for my $long_id (@long_ids) {
my @ids = $long_id =~ $regex; # capture original id(s)
s{$NOID_CHARS}{_}xmsg for @ids; # substitute forbidden chars
push @abbr_ids, join q{}, $prefix, @ids;
}
return Bio::MUST::Core::IdMapper->new(
long_ids => \@long_ids,
abbr_ids => \@abbr_ids
);
}
sub org_mapper_from_long_ids {
my $self = shift;
my $mapper = shift; # mapper long_org => abbr_org
my @long_ids;
my @abbr_ids;
ID:
for my $seq_id ( $self->all_seq_ids ) {
next ID if $seq_id->is_foreign;
push @long_ids, $seq_id->full_id;
push @abbr_ids, $mapper->abbr_id_for( $seq_id->full_org )
. '|' . $seq_id->accession;
}
return Bio::MUST::Core::IdMapper->new(
long_ids => \@long_ids,
abbr_ids => \@abbr_ids
);
}
sub org_mapper_from_abbr_ids {
my $self = shift;
my $mapper = shift; # mapper long_org => abbr_org
my @long_ids;
my @abbr_ids;
ID:
for my $seq_id ( $self->all_seq_ids ) {
my $abbr_id = $seq_id->full_id;
my ($abbr_org, $accession) = split /\|/xms, $abbr_id, 2;
next ID unless $abbr_org;
push @long_ids, $mapper->long_id_for($abbr_org) . '@' . $accession;
push @abbr_ids, $abbr_id;
}
return Bio::MUST::Core::IdMapper->new(
long_ids => \@long_ids,
abbr_ids => \@abbr_ids
);
}
const my $NBS_ID_LEN => 79;
sub store_nbs {
my $self = shift;
my $outfile = shift;
# #Sequences extracted from c111_78.ali of the 5 May 2009 at 11 hours 40
# #File c111_78.nbs created on Tuesday 5 May 2009 at 11 hours 40
# #184 positions remain on the 184 aligned positions
# #life.col,life.nom
# #Here is the list of the 78 species used:
# Aciduliprofundum_boonei_T469___________________________________________________ Aeropyrum_pernix_K1____________________________________________________________
# Archaeoglobus_fulgidus_DSM_4304________________________________________________ Archaeoglobus_profundus_Av18__DSM_5631_________________________________________
# ...
my @ids = $self->all_seq_ids;
open my $out, '>', $outfile;
# print minimum header
print {$out} "#File $outfile created on " . ctime(time);
say {$out} '#Here is the list of the ' . scalar @ids . ' species used:';
# print padded ids on two columns
for my $i (0..$#ids) {
my $id = $ids[$i]->foreign_id;
my $pad_id = $id . '_' x ($NBS_ID_LEN - length $id);
my $term = $i % 2 ? "\n" : q{ };
print {$out} $pad_id . $term;
}
say {$out} q{};
return;
}
no Moose::Role;
1;
__END__
=pod
=head1 NAME
Bio::MUST::Core::Roles::Listable - Listable Moose role for objects with implied id lists
=head1 VERSION
version 0.191300
=head1 SYNOPSIS
# TODO
=head1 DESCRIPTION
# TODO
=head1 METHODS
=head2 new_lookup
=head2 std_list
=head2 alphabetical_list
=head2 complete_seq_list
=head2 std_mapper
=head2 acc_mapper
=head2 len_mapper
=head2 regex_mapper
=head2 org_mapper_from_long_ids
=head2 org_mapper_from_abbr_ids
=head2 store_nbs
=head1 AUTHOR
Denis BAURAIN <denis.baurain@uliege.be>
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut