Hide Show 131 lines of Pod
use
vars
qw(%CONSERVATION_GROUPS)
;
BEGIN {
%CONSERVATION_GROUPS
= (
'strong'
=> [
qw(
STA
NEQK
NHQK
NDEQ
QHRK
MILV
MILF
HY
FYW )
],
'weak'
=> [
qw(
CSA
ATV
SAG
STNK
STPA
SGND
SNDEQK
NDEQHK
NEQHRK
FVLIM
HFY )
],);
}
FAST::Bio::FeatureHolderI)
;
Hide Show 16 lines of Pod
sub
new {
my
(
$class
,
@args
) =
@_
;
my
$self
=
$class
->SUPER::new(
@args
);
my
(
$src
,
$score
,
$id
,
$acc
,
$desc
,
$seqs
,
$feats
,
$coll
,
$sa
,
$con
,
$cmeta
) =
$self
->_rearrange([
qw(
SOURCE
SCORE
ID
ACCESSION
DESCRIPTION
SEQS
FEATURES
ANNOTATION
SEQ_ANNOTATION
CONSENSUS
CONSENSUS_META
)
],
@args
);
$src
&&
$self
->source(
$src
);
defined
$score
&&
$self
->score(
$score
);
$self
->{
'_seq'
} = {};
$self
->{
'_order'
} = {};
$self
->{
'_start_end_lists'
} = {};
$self
->{
'_dis_name'
} = {};
$self
->{
'_id'
} =
'NoName'
;
$id
&&
$self
->id(
$id
);
$acc
&&
$self
->accession(
$acc
);
$desc
&&
$self
->description(
$desc
);
$coll
&&
$self
->annotation(
$coll
);
if
(
$sa
) {
$self
->throw(
"Must supply an alignment-based annotation collection (-annotation) "
.
"with a sequence annotation collection"
)
if
!
$coll
;
$coll
->add_Annotation(
'seq_annotation'
,
$sa
);
}
if
(
$feats
&&
ref
$feats
eq
'ARRAY'
) {
for
my
$feat
(
@$feats
) {
$self
->add_SeqFeature(
$feat
);
}
}
$con
&&
$self
->consensus(
$con
);
$cmeta
&&
$self
->consensus_meta(
$cmeta
);
if
(
$seqs
&&
ref
(
$seqs
) eq
'ARRAY'
) {
for
my
$seq
(
@$seqs
) {
$self
->add_seq(
$seq
);
}
}
return
$self
;
}
Hide Show 23 lines of Pod
sub
addSeq {
my
$self
=
shift
;
$self
->deprecated(
"addSeq - deprecated method. Use add_seq() instead."
);
$self
->add_seq(
@_
);
}
sub
add_seq {
my
$self
=
shift
;
my
@args
=
@_
;
my
(
$seq
,
$order
) =
$self
->_rearrange([
qw(SEQ ORDER)
],
@args
);
my
(
$name
,
$id
,
$start
,
$end
);
unless
(
$seq
) {
$self
->throw(
"LocatableSeq argument required"
);
}
if
( !
ref
$seq
|| !
$seq
->isa(
'FAST::Bio::LocatableSeq'
) ) {
$self
->throw(
"Unable to process non locatable sequences ["
.
ref
(
$seq
).
"]"
);
}
!
defined
(
$order
) and
$order
= 1 +
keys
%{
$self
->{
'_seq'
}};
$order
--;
if
(
$order
< 0) {
$self
->throw(
"User-specified value for ORDER must be >= 1"
);
}
$id
=
$seq
->id() ||
$seq
->display_id ||
$seq
->primary_id;
$name
=
$seq
->get_nse;
if
(
$self
->{
'_seq'
}->{
$name
} ) {
$self
->
warn
(
"Replacing one sequence [$name]\n"
)
unless
$self
->verbose < 0;
}
else
{
$self
->debug(
"Assigning $name to $order\n"
);
my
$ordh
=
$self
->{
'_order'
};
if
(
$ordh
->{
$order
}) {
my
$c
;
$c
=
sub
{
return
((
$_
[1]-
$_
[0] == 1) ? (
$c
->(
@_
[1..
$#_
]),
$_
[0]) :
$_
[0]); };
map
{
$ordh
->{
$_
+1} =
$ordh
->{
$_
}
}
$c
->(
sort
{
$a
<=>
$b
}
grep
{
$_
>=
$order
}
keys
%{
$ordh
});
}
$ordh
->{
$order
} =
$name
;
unless
(
exists
(
$self
->{
'_start_end_lists'
}->{
$id
})) {
$self
->{
'_start_end_lists'
}->{
$id
} = [];
}
push
@{
$self
->{
'_start_end_lists'
}->{
$id
}},
$seq
;
}
$self
->{
'_seq'
}->{
$name
} =
$seq
;
}
Hide Show 10 lines of Pod
sub
removeSeq {
my
$self
=
shift
;
$self
->deprecated(
"removeSeq - deprecated method. Use remove_seq() instead."
);
$self
->remove_seq(
@_
);
}
sub
remove_seq {
my
$self
=
shift
;
my
$seq
=
shift
;
my
(
$name
,
$id
,
$start
,
$end
);
$self
->throw(
"Need FAST::Bio::Locatable seq argument "
)
unless
ref
$seq
&&
$seq
->isa(
'FAST::Bio::LocatableSeq'
);
$id
=
$seq
->id();
$start
=
$seq
->start();
$end
=
$seq
->end();
$name
=
sprintf
(
"%s/%d-%d"
,
$id
,
$start
,
$end
);
if
( !
exists
$self
->{
'_seq'
}->{
$name
} ) {
$self
->throw(
"Sequence $name does not exist in the alignment to remove!"
);
}
delete
$self
->{
'_seq'
}->{
$name
};
if
(
exists
$self
->{
'_start_end_lists'
}->{
$id
}) {
my
(
$i
,
$found
);;
for
(
$i
=0;
$i
< @{
$self
->{
'_start_end_lists'
}->{
$id
}};
$i
++) {
if
(${
$self
->{
'_start_end_lists'
}->{
$id
}}[
$i
] eq
$seq
) {
$found
= 1;
last
;
}
}
if
(
$found
) {
splice
@{
$self
->{
'_start_end_lists'
}->{
$id
}},
$i
, 1;
}
else
{
$self
->throw(
"Could not find the sequence to remoce from the start-end list"
);
}
}
else
{
$self
->throw(
"There is no seq list for the name $id"
);
}
my
%rev_order
=
reverse
%{
$self
->{
'_order'
}};
my
$no
=
$rev_order
{
$name
};
my
$num_sequences
=
$self
->num_sequences;
for
(;
$no
<
$num_sequences
;
$no
++) {
$self
->{
'_order'
}->{
$no
} =
$self
->{
'_order'
}->{
$no
+1};
}
delete
$self
->{
'_order'
}->{
$no
};
return
1;
}
Hide Show 12 lines of Pod
sub
purge {
my
(
$self
,
$perc
) =
@_
;
my
(
%duplicate
,
@dups
);
my
@seqs
=
$self
->each_seq();
for
(
my
$i
=0;
$i
<
@seqs
- 1;
$i
++ ) {
my
$seq
=
$seqs
[
$i
];
next
if
exists
$duplicate
{
$seq
->display_id} ;
my
$one
=
$seq
->seq();
my
@one
=
split
''
,
$one
;
for
(
my
$j
=
$i
+1;
$j
<
@seqs
;
$j
++) {
my
$seq2
=
$seqs
[
$j
];
next
if
exists
$duplicate
{
$seq2
->display_id} ;
my
$two
=
$seq2
->seq();
my
@two
=
split
''
,
$two
;
my
$count
= 0;
my
$res
= 0;
for
(
my
$k
=0;
$k
<
@one
;
$k
++) {
if
(
$one
[
$k
] ne
'.'
&&
$one
[
$k
] ne
'-'
&&
defined
(
$two
[
$k
]) &&
$one
[
$k
] eq
$two
[
$k
]) {
$count
++;
}
if
(
$one
[
$k
] ne
'.'
&&
$one
[
$k
] ne
'-'
&&
defined
(
$two
[
$k
]) &&
$two
[
$k
] ne
'.'
&&
$two
[
$k
] ne
'-'
) {
$res
++;
}
}
my
$ratio
= 0;
$ratio
=
$count
/
$res
unless
$res
== 0;
if
(
$ratio
>
$perc
) {
$self
->
warn
(
"duplicate: "
,
$seq2
->display_id)
if
$self
->verbose > 0;
$duplicate
{
$seq2
->display_id} = 1;
push
@dups
,
$seq2
;
}
}
}
foreach
my
$seq
(
@dups
) {
$self
->remove_seq(
$seq
);
}
return
@dups
;
}
Hide Show 11 lines of Pod
sub
sort_alphabetically {
my
$self
=
shift
;
my
(
$seq
,
$nse
,
@arr
,
%hash
,
$count
);
foreach
$seq
(
$self
->each_seq() ) {
$nse
=
$seq
->get_nse;
$hash
{
$nse
} =
$seq
;
}
$count
= 0;
%{
$self
->{
'_order'
}} = ();
foreach
$nse
(
sort
_alpha_startend
keys
%hash
) {
$self
->{
'_order'
}->{
$count
} =
$nse
;
$count
++;
}
1;
}
Hide Show 10 lines of Pod
sub
sort_by_list {
my
(
$self
,
$list
) =
@_
;
my
(
@seq
,
@ids
,
%order
);
foreach
my
$seq
(
$self
->each_seq() ) {
push
@seq
,
$seq
;
push
@ids
,
$seq
->display_id;
}
my
$ct
=1;
open
(
my
$listfh
,
'<'
,
$list
) ||
$self
->throw(
"can't open file for reading: $list"
);
while
(<
$listfh
>) {
chomp
;
my
$name
=
$_
;
$self
->throw(
"Not found in alignment: $name"
)
unless
&_in_aln
(
$name
, \
@ids
);
$order
{
$name
}=
$ct
++;
}
close
(
$listfh
);
my
@sorted
=
map
{
$_
->[1] }
sort
{
$a
->[0] <=>
$b
->[0] }
map
{ [
$order
{
$_
->id()},
$_
] }
@seq
;
my
$aln
=
$self
->new;
foreach
(
@sorted
) {
$aln
->add_seq(
$_
) }
return
$aln
;
}
Hide Show 14 lines of Pod
sub
set_new_reference {
my
(
$self
,
$seqid
) =
@_
;
my
$aln
=
$self
->new;
my
(
@seq
,
@ids
,
@new_seq
);
my
$is_num
=0;
foreach
my
$seq
(
$self
->each_seq() ) {
push
@seq
,
$seq
;
push
@ids
,
$seq
->display_id;
}
if
(
$seqid
=~ /^\d+$/) {
$is_num
=1;
$self
->throw(
"The new reference sequence number has to be a positive integer >1 and <= num_sequences "
)
if
(
$seqid
<= 1 ||
$seqid
>
$self
->num_sequences);
}
else
{
$self
->throw(
"The new reference sequence not in alignment "
)
unless
&_in_aln
(
$seqid
, \
@ids
);
}
for
(
my
$i
=0;
$i
<=
$#seq
;
$i
++) {
my
$pos
=
$i
+1;
if
( (
$is_num
&&
$pos
==
$seqid
) || (
$seqid
eq
$seq
[
$i
]->display_id) ) {
unshift
@new_seq
,
$seq
[
$i
];
}
else
{
push
@new_seq
,
$seq
[
$i
];
}
}
foreach
(
@new_seq
) {
$aln
->add_seq(
$_
); }
return
$aln
;
}
sub
_in_aln {
my
(
$str
,
$ref
) =
@_
;
foreach
(
@$ref
) {
return
1
if
$str
eq
$_
;
}
return
0;
}
Hide Show 20 lines of Pod
sub
uniq_seq {
my
(
$self
,
$seqid
) =
@_
;
my
$aln
=
$self
->new;
my
(
%member
,
%order
,
@seq
,
@uniq_str
,
$st
);
my
$order
=0;
my
$len
=
$self
->
length
();
$st
= {};
foreach
my
$seq
(
$self
->each_seq() ) {
my
$str
=
$seq
->seq();
$str
=~ s/n/\?/gi
if
$str
=~ /^[atcgn-]+$/i;
$str
=
&_convert_leading_ending_gaps
(
$str
,
'-'
,
'?'
);
local
$FAST::Bio::LocatableSeq::GAP_SYMBOLS
=
'-\?'
;
my
$new
= FAST::Bio::LocatableSeq->new(
-id
=>
$seq
->id(),
-alphabet
=>
$seq
->alphabet,
-seq
=>
$str
,
-start
=>
$seq
->start,
-end
=>
$seq
->end
);
push
@seq
,
$new
;
}
foreach
my
$seq
(
@seq
) {
my
$str
=
$seq
->seq();
my
(
$seen
,
$key
) =
&_check_uniq
(
$str
, \
@uniq_str
,
$len
);
if
(
$seen
) {
my
@memb
= @{
$member
{
$key
}};
push
@memb
,
$seq
;
$member
{
$key
} = \
@memb
;
}
else
{
push
@uniq_str
,
$key
;
$order
++;
$member
{
$key
} = [ (
$seq
) ];
$order
{
$key
} =
$order
;
}
}
foreach
my
$str
(
sort
{
$order
{
$a
} <=>
$order
{
$b
}}
keys
%order
) {
my
$str2
=
&_convert_leading_ending_gaps
(
$str
,
'?'
,
'-'
);
$str2
=~ s/\?/N/g
if
$str2
=~ /^[atcg\-\?]+$/i;
my
$gap
=
'-'
;
my
$end
= CORE::
length
(
$str2
);
$end
-= CORE::
length
($1)
while
$str2
=~ m/(
$gap
+)/g;
my
$new
= FAST::Bio::LocatableSeq->new(
-id
=>
"ST"
.
$order
{
$str
},
-seq
=>
$str2
,
-start
=>1,
-end
=>
$end
);
$aln
->add_seq(
$new
);
foreach
(@{
$member
{
$str
}}) {
push
@{
$$st
{
$order
{
$str
}}},
$_
->id();
$self
->debug(
$_
->id(),
"\t"
,
"ST"
,
$order
{
$str
},
"\n"
);
}
}
return
wantarray
? (
$aln
,
$st
) :
$aln
;
}
sub
_check_uniq {
my
(
$str1
,
$ref
,
$length
) =
@_
;
my
@char1
=
split
//,
$str1
;
my
@array
=
@$ref
;
return
(0,
$str1
)
if
@array
==0;
foreach
my
$str2
(
@array
) {
my
$diff
=0;
my
@char2
=
split
//,
$str2
;
for
(
my
$i
=0;
$i
<=
$length
-1;
$i
++) {
next
if
$char1
[
$i
] eq
'?'
;
next
if
$char2
[
$i
] eq
'?'
;
$diff
++
if
$char1
[
$i
] ne
$char2
[
$i
];
}
return
(1,
$str2
)
if
$diff
== 0;
}
return
(0,
$str1
);
}
sub
_convert_leading_ending_gaps {
my
$s
=
shift
;
my
$sym1
=
shift
;
my
$sym2
=
shift
;
my
@array
=
split
//,
$s
;
for
(
my
$i
=0;
$i
<=
$#array
;
$i
++) {
(
$array
[
$i
] eq
$sym1
) ? (
$array
[
$i
] =
$sym2
):(
last
);
}
for
(
my
$i
=
$#array
;
$i
>= 0;
$i
--) {
(
$array
[
$i
] eq
$sym1
) ? (
$array
[
$i
] =
$sym2
):(
last
);
}
my
$s_new
=
join
''
,
@array
;
return
$s_new
;
}
Hide Show 14 lines of Pod
sub
eachSeq {
my
$self
=
shift
;
$self
->deprecated(
"eachSeq - deprecated method. Use each_seq() instead."
);
$self
->each_seq();
}
sub
each_seq {
my
$self
=
shift
;
my
(
@arr
,
$order
);
foreach
$order
(
sort
{
$a
<=>
$b
}
keys
%{
$self
->{
'_order'
}} ) {
if
(
exists
$self
->{
'_seq'
}->{
$self
->{
'_order'
}->{
$order
}} ) {
push
(
@arr
,
$self
->{
'_seq'
}->{
$self
->{
'_order'
}->{
$order
}});
}
}
return
@arr
;
}
Hide Show 12 lines of Pod
sub
each_alphabetically {
my
$self
=
shift
;
my
(
$seq
,
$nse
,
@arr
,
%hash
,
$count
);
foreach
$seq
(
$self
->each_seq() ) {
$nse
=
$seq
->get_nse;
$hash
{
$nse
} =
$seq
;
}
foreach
$nse
(
sort
_alpha_startend
keys
%hash
) {
push
(
@arr
,
$hash
{
$nse
});
}
return
@arr
;
}
sub
_alpha_startend {
my
(
$aname
,
$astart
,
$bname
,
$bstart
);
(
$aname
,
$astart
) =
split
(/-/,
$a
);
(
$bname
,
$bstart
) =
split
(/-/,
$b
);
if
(
$aname
eq
$bname
) {
return
$astart
<=>
$bstart
;
}
else
{
return
$aname
cmp
$bname
;
}
}
Hide Show 12 lines of Pod
sub
eachSeqWithId {
my
$self
=
shift
;
$self
->deprecated(
"eachSeqWithId - deprecated method. Use each_seq_with_id() instead."
);
$self
->each_seq_with_id(
@_
);
}
sub
each_seq_with_id {
my
$self
=
shift
;
my
$id
=
shift
;
$self
->throw(
"Method each_seq_with_id needs a sequence name argument"
)
unless
defined
$id
;
my
(
@arr
,
$seq
);
if
(
exists
(
$self
->{
'_start_end_lists'
}->{
$id
})) {
@arr
= @{
$self
->{
'_start_end_lists'
}->{
$id
}};
}
return
@arr
;
}
Hide Show 12 lines of Pod
sub
get_seq_by_pos {
my
$self
=
shift
;
my
(
$pos
) =
@_
;
$self
->throw(
"Sequence position has to be a positive integer, not [$pos]"
)
unless
$pos
=~ /^\d+$/ and
$pos
> 0;
$self
->throw(
"No sequence at position [$pos]"
)
unless
$pos
<=
$self
->num_sequences ;
my
$nse
=
$self
->{
'_order'
}->{--
$pos
};
return
$self
->{
'_seq'
}->{
$nse
};
}
Hide Show 11 lines of Pod
sub
get_seq_by_id {
my
(
$self
,
$name
) =
@_
;
unless
(
defined
$name
) {
$self
->
warn
(
"Must provide a sequence name"
);
return
;
}
for
my
$seq
(
values
%{
$self
->{
'_seq'
}} ) {
if
(
$seq
->id eq
$name
) {
return
$seq
;
}
}
return
;
}
Hide Show 34 lines of Pod
sub
seq_with_features{
my
(
$self
,
%arg
) =
@_
;
$self
->throw(
"must provide a -pos argument"
)
unless
$arg
{-
pos
};
$self
->splice_by_seq_pos(
$arg
{-
pos
});
my
$consensus_string
=
$self
->consensus_string(
$arg
{-consensus});
$consensus_string
=
$arg
{-mask}->(
$consensus_string
)
if
defined
(
$arg
{-mask});
my
(
@bs
,
@es
);
push
@bs
, 1
if
$consensus_string
=~ /^[^?]/;
while
(
$consensus_string
=~ /\?[^?]/g){
push
@bs
,
pos
(
$consensus_string
);
}
while
(
$consensus_string
=~ /[^?]\?/g){
push
@es
,
pos
(
$consensus_string
);
}
push
@es
, CORE::
length
(
$consensus_string
)
if
$consensus_string
=~ /[^?]$/;
my
$seq
= FAST::Bio::Seq->new();
while
(
my
$b
=
shift
@bs
){
my
$e
=
shift
@es
;
$seq
->add_SeqFeature(
FAST::Bio::SeqFeature::Generic->new(
-start
=>
$b
- 1 +
$self
->get_seq_by_pos(
$arg
{-
pos
})->start,
-end
=>
$e
- 1 +
$self
->get_seq_by_pos(
$arg
{-
pos
})->start,
-source_tag
=>
$self
->source ||
'MSA'
,
)
);
}
return
$seq
;
}
Hide Show 18 lines of Pod
sub
select
{
my
$self
=
shift
;
my
(
$start
,
$end
) =
@_
;
$self
->throw(
"Select start has to be a positive integer, not [$start]"
)
unless
$start
=~ /^\d+$/ and
$start
> 0;
$self
->throw(
"Select end has to be a positive integer, not [$end]"
)
unless
$end
=~ /^\d+$/ and
$end
> 0;
$self
->throw(
"Select $start [$start] has to be smaller than or equal to end [$end]"
)
unless
$start
<=
$end
;
my
$aln
=
$self
->new;
foreach
my
$pos
(
$start
..
$end
) {
$aln
->add_seq(
$self
->get_seq_by_pos(
$pos
));
}
$aln
->id(
$self
->id);
return
$aln
;
}
Hide Show 23 lines of Pod
sub
select_noncont {
my
$self
=
shift
;
my
$nosort
= 0;
my
(
@pos
) =
@_
;
if
(
$pos
[0] !~ m{^\d+$}) {
my
$sortcmd
=
shift
@pos
;
if
(
$sortcmd
eq
'nosort'
) {
$nosort
= 1;
}
else
{
$self
->throw(
"Command not recognized: $sortcmd. Only 'nosort' implemented at this time."
);
}
}
my
$end
=
$self
->num_sequences;
foreach
(
@pos
) {
$self
->throw(
"position must be a positive integer, > 0 and <= $end not [$_]"
)
unless
( /^\d+$/ &&
$_
> 0 &&
$_
<=
$end
);
}
@pos
=
sort
{
$a
<=>
$b
}
@pos
unless
$nosort
;
my
$aln
=
$self
->new;
foreach
my
$p
(
@pos
) {
$aln
->add_seq(
$self
->get_seq_by_pos(
$p
));
}
$aln
->id(
$self
->id);
return
$aln
;
}
Hide Show 18 lines of Pod
sub
slice {
my
$self
=
shift
;
my
(
$start
,
$end
,
$keep_gap_only
) =
@_
;
$self
->throw(
"Slice start has to be a positive integer, not [$start]"
)
unless
$start
=~ /^\d+$/ and
$start
> 0;
$self
->throw(
"Slice end has to be a positive integer, not [$end]"
)
unless
$end
=~ /^\d+$/ and
$end
> 0;
$self
->throw(
"Slice start [$start] has to be smaller than or equal to end [$end]"
)
unless
$start
<=
$end
;
$self
->throw(
"This alignment has only "
.
$self
->
length
.
" residues. Slice start "
.
"[$start] is too big."
)
if
$start
>
$self
->
length
;
my
$cons_meta
=
$self
->consensus_meta;
my
$aln
=
$self
->new;
$aln
->id(
$self
->id);
foreach
my
$seq
(
$self
->each_seq() ) {
my
$new_seq
=
$seq
->isa(
'FAST::Bio::Seq::MetaI'
) ?
FAST::Bio::Seq::Meta->new
(
-id
=>
$seq
->id,
-alphabet
=>
$seq
->alphabet,
-strand
=>
$seq
->strand,
-verbose
=>
$self
->verbose) :
FAST::Bio::LocatableSeq->new
(
-id
=>
$seq
->id,
-alphabet
=>
$seq
->alphabet,
-strand
=>
$seq
->strand,
-verbose
=>
$self
->verbose);
my
$seq_end
=
$end
;
$seq_end
=
$seq
->
length
if
(
$end
>
$seq
->
length
);
my
$slice_seq
=
$seq
->subseq(
$start
,
$seq_end
);
$new_seq
->seq(
$slice_seq
);
$slice_seq
=~ s/\W//g;
if
(
$start
> 1) {
my
$pre_start_seq
=
$seq
->subseq(1,
$start
- 1);
$pre_start_seq
=~ s/\W//g;
if
(!
defined
(
$seq
->strand)) {
$new_seq
->start(
$seq
->start + CORE::
length
(
$pre_start_seq
) );
}
elsif
(
$seq
->strand < 0){
$new_seq
->start(
$seq
->end - CORE::
length
(
$pre_start_seq
) - CORE::
length
(
$slice_seq
) + 1);
}
else
{
$new_seq
->start(
$seq
->start + CORE::
length
(
$pre_start_seq
) );
}
}
else
{
if
((
defined
$seq
->strand)&&(
$seq
->strand < 0)){
$new_seq
->start(
$seq
->end - CORE::
length
(
$slice_seq
) + 1);
}
else
{
$new_seq
->start(
$seq
->start);
}
}
if
(
$new_seq
->isa(
'FAST::Bio::Seq::MetaI'
)) {
for
my
$meta_name
(
$seq
->meta_names) {
$new_seq
->named_meta(
$meta_name
,
$seq
->named_submeta(
$meta_name
,
$start
,
$end
));
}
}
$new_seq
->end(
$new_seq
->start + CORE::
length
(
$slice_seq
) - 1 );
if
(
$new_seq
->start and
$new_seq
->end >=
$new_seq
->start) {
$aln
->add_seq(
$new_seq
);
}
else
{
if
(
$keep_gap_only
) {
$aln
->add_seq(
$new_seq
);
}
else
{
my
$nse
=
$seq
->get_nse();
$self
->
warn
(
"Slice [$start-$end] of sequence [$nse] contains no residues."
.
" Sequence excluded from the new alignment."
);
}
}
}
if
(
$cons_meta
) {
my
$new
= FAST::Bio::Seq::Meta->new();
for
my
$meta_name
(
$cons_meta
->meta_names) {
$new
->named_meta(
$meta_name
,
$cons_meta
->named_submeta(
$meta_name
,
$start
,
$end
));
}
$aln
->consensus_meta(
$new
);
}
$aln
->annotation(
$self
->annotation);
return
$aln
;
}
Hide Show 15 lines of Pod
sub
remove_columns {
my
(
$self
,
@args
) =
@_
;
@args
||
$self
->throw(
"Must supply column ranges or column types"
);
my
$aln
;
if
(
$args
[0][0] =~ /^[a-z_]+$/i) {
$aln
=
$self
->_remove_columns_by_type(
$args
[0]);
}
elsif
(
$args
[0][0] =~ /^\d+$/) {
$aln
=
$self
->_remove_columns_by_num(\
@args
);
}
else
{
$self
->throw(
"You must pass array references to remove_columns(), not @args"
);
}
$aln
;
}
Hide Show 16 lines of Pod
sub
remove_gaps {
my
(
$self
,
$gapchar
,
$all_gaps_columns
) =
@_
;
my
$gap_line
;
if
(
$all_gaps_columns
) {
$gap_line
=
$self
->all_gap_line(
$gapchar
);
}
else
{
$gap_line
=
$self
->gap_line(
$gapchar
);
}
my
$aln
=
$self
->new;
my
@remove
;
my
$length
= 0;
my
$del_char
=
$gapchar
||
$self
->gap_char;
while
(
$gap_line
=~ m/[
$del_char
]/g) {
my
$start
=
pos
(
$gap_line
)-1;
$gap_line
=~/\G[
$del_char
]+/gc;
my
$end
=
pos
(
$gap_line
)-1;
$start
-=
$length
;
$end
-=
$length
;
$length
+= (
$end
-
$start
+1);
push
@remove
, [
$start
,
$end
];
}
$aln
=
$#remove
>= 0 ?
$self
->_remove_col(
$aln
,\
@remove
) :
$self
;
return
$aln
;
}
sub
_remove_col {
my
(
$self
,
$aln
,
$remove
) =
@_
;
my
@new
;
my
$gap
=
$self
->gap_char;
foreach
my
$seq
(
$self
->each_seq){
my
$new_seq
= FAST::Bio::LocatableSeq->new(
-id
=>
$seq
->id,
-alphabet
=>
$seq
->alphabet,
-strand
=>
$seq
->strand,
-verbose
=>
$self
->verbose);
my
$sequence
=
$seq
->seq;
foreach
my
$pair
(@{
$remove
}){
my
$start
=
$pair
->[0];
my
$end
=
$pair
->[1];
$sequence
=
$seq
->seq
unless
$sequence
;
my
$orig
=
$sequence
;
my
$head
=
$start
> 0 ?
substr
(
$sequence
, 0,
$start
) :
''
;
my
$tail
= (
$end
+ 1) >= CORE::
length
(
$sequence
) ?
''
:
substr
(
$sequence
,
$end
+ 1);
$sequence
=
$head
.
$tail
;
unless
(
defined
$new_seq
->start) {
if
(
$start
== 0) {
my
$start_adjust
= () =
substr
(
$orig
, 0,
$end
+ 1) =~ /
$gap
/g;
$new_seq
->start(
$seq
->start +
$end
+ 1 -
$start_adjust
);
}
else
{
my
$start_adjust
=
$orig
=~ /^
$gap
+/;
if
(
$start_adjust
) {
$start_adjust
= $+[0] ==
$start
;
}
$new_seq
->start(
$seq
->start +
$start_adjust
);
}
}
if
((
$end
+ 1) >= CORE::
length
(
$orig
)) {
my
$end_adjust
= () =
substr
(
$orig
,
$start
) =~ /
$gap
/g;
$new_seq
->end(
$seq
->end - (CORE::
length
(
$orig
) -
$start
) +
$end_adjust
);
}
else
{
$new_seq
->end(
$seq
->end);
}
}
if
(
$new_seq
->end <
$new_seq
->start) {
$new_seq
->start(0);
$new_seq
->end(0);
}
$new_seq
->seq(
$sequence
)
if
$sequence
;
push
@new
,
$new_seq
;
}
foreach
my
$new
(
@new
){
$aln
->add_seq(
$new
);
}
return
$aln
;
}
sub
_remove_columns_by_type {
my
(
$self
,
$type
) =
@_
;
my
$aln
=
$self
->new;
my
@remove
;
my
$gap
=
$self
->gap_char
if
(
grep
{
$_
eq
'gaps'
} @{
$type
});
my
$all_gaps_columns
=
$self
->gap_char
if
(
grep
/all_gaps_columns/,@{
$type
});
my
%matchchars
= (
'match'
=>
'\*'
,
'weak'
=>
'\.'
,
'strong'
=>
':'
,
'mismatch'
=>
' '
,
'gaps'
=>
''
,
'all_gaps_columns'
=>
''
);
my
$del_char
;
foreach
my
$type
(@{
$type
}){
$del_char
.=
$matchchars
{
$type
};
}
my
$length
= 0;
my
$match_line
=
$self
->match_line;
if
(
$del_char
){
while
(
$match_line
=~ m/[
$del_char
]/g ){
my
$start
=
pos
(
$match_line
)-1;
$match_line
=~/\G[
$del_char
]+/gc;
my
$end
=
pos
(
$match_line
)-1;
$start
-=
$length
;
$end
-=
$length
;
$length
+= (
$end
-
$start
+1);
push
@remove
, [
$start
,
$end
];
}
}
$aln
=
$#remove
>= 0 ?
$self
->_remove_col(
$aln
,\
@remove
) :
$self
;
$aln
=
$aln
->remove_gaps()
if
$gap
;
$aln
=
$aln
->remove_gaps(
''
, 1)
if
$all_gaps_columns
;
$aln
;
}
sub
_remove_columns_by_num {
my
(
$self
,
$positions
) =
@_
;
my
$aln
=
$self
->new;
@$positions
=
sort
{
$a
->[0] <=>
$b
->[0] }
@$positions
;
my
@remove
;
my
$length
= 0;
foreach
my
$pos
(@{
$positions
}) {
my
(
$start
,
$end
) = @{
$pos
};
$start
-=
$length
;
$end
-=
$length
;
$length
+= (
$end
-
$start
+1);
push
@remove
, [
$start
,
$end
];
}
$aln
=
$#remove
>= 0 ?
$self
->_remove_col(
$aln
,\
@remove
) :
$self
;
$aln
;
}
Hide Show 18 lines of Pod
sub
splice_by_seq_pos{
my
(
$self
,
$pos
) =
@_
;
my
$guide
=
$self
->get_seq_by_pos(
$pos
);
my
$guide_seq
=
$guide
->seq;
$guide_seq
=~ s/\./\-/g;
my
@gaps
= ();
$pos
= -1;
while
((
$pos
=
index
(
$guide_seq
,
'-'
,
$pos
)) > -1 ){
unshift
@gaps
,
$pos
;
$pos
++;
}
foreach
my
$seq
(
$self
->each_seq){
my
@bases
=
split
''
,
$seq
->seq;
splice
(
@bases
,
$_
, 1)
foreach
@gaps
;
$seq
->seq(
join
(
''
,
@bases
));
}
1;
}
Hide Show 16 lines of Pod
sub
map_chars {
my
$self
=
shift
;
my
$from
=
shift
;
my
$to
=
shift
;
my
(
$seq
,
$temp
);
$self
->throw(
"Need exactly two arguments"
)
unless
defined
$from
and
defined
$to
;
foreach
$seq
(
$self
->each_seq() ) {
$temp
=
$seq
->seq();
$temp
=~ s/
$from
/
$to
/g;
$seq
->seq(
$temp
);
}
return
1;
}
Hide Show 10 lines of Pod
sub
uppercase {
my
$self
=
shift
;
my
$seq
;
my
$temp
;
foreach
$seq
(
$self
->each_seq() ) {
$temp
=
$seq
->seq();
$temp
=~
tr
/[a-z]/[A-Z]/;
$seq
->seq(
$temp
);
}
return
1;
}
Hide Show 14 lines of Pod
sub
cigar_line {
my
$self
=
shift
;
my
$thr
=
shift
||100;
my
%cigars
;
my
@consensus
=
split
""
,(
$self
->consensus_string(
$thr
));
my
$len
=
$self
->
length
;
my
$gapchar
=
$self
->gap_char;
foreach
my
$seq
(
$self
->each_seq ) {
my
@seq
=
split
""
,
uc
(
$seq
->seq);
my
$pos
= 1;
for
(
my
$x
= 0 ;
$x
<
$len
;
$x
++ ) {
if
(
$seq
[
$x
] eq
$consensus
[
$x
]) {
push
@{
$cigars
{
$seq
->get_nse}},
$pos
;
$pos
++;
}
elsif
(
$seq
[
$x
] ne
$gapchar
) {
$pos
++;
}
}
}
for
my
$name
(
keys
%cigars
) {
splice
@{
$cigars
{
$name
}}, 1, 0, ${
$cigars
{
$name
}}[0]
if
( ${
$cigars
{
$name
}}[0] + 1 < ${
$cigars
{
$name
}}[1] );
push
@{
$cigars
{
$name
}}, ${
$cigars
{
$name
}}[$
( ${
$cigars
{
$name
}}[($
${
$cigars
{
$name
}}[$
for
(
my
$x
= 1 ;
$x
< $
if
(${
$cigars
{
$name
}}[
$x
- 1] + 1 < ${
$cigars
{
$name
}}[
$x
] &&
${
$cigars
{
$name
}}[
$x
+ 1] > ${
$cigars
{
$name
}}[
$x
] + 1) {
splice
@{
$cigars
{
$name
}},
$x
, 0, ${
$cigars
{
$name
}}[
$x
];
}
}
}
for
my
$name
(
keys
%cigars
) {
my
@remove
;
for
(
my
$x
= 0 ;
$x
< $
if
( ${
$cigars
{
$name
}}[
$x
] == ${
$cigars
{
$name
}}[(
$x
- 1)] + 1 &&
${
$cigars
{
$name
}}[
$x
] == ${
$cigars
{
$name
}}[(
$x
+ 1)] - 1 ) {
unshift
@remove
,
$x
;
}
}
for
my
$pos
(
@remove
) {
splice
@{
$cigars
{
$name
}},
$pos
, 1;
}
}
for
my
$name
(
keys
%cigars
) {
my
(
$start
,
$end
,
$str
) =
""
;
while
( (
$start
,
$end
) =
splice
@{
$cigars
{
$name
}}, 0, 2 ) {
$str
.= (
$start
.
","
.
$end
.
":"
);
}
$str
=~ s/:$//;
$cigars
{
$name
} =
$str
;
}
%cigars
;
}
Hide Show 13 lines of Pod
sub
match_line {
my
(
$self
,
$matchlinechar
,
$strong
,
$weak
) =
@_
;
my
%matchchars
= (
'match'
=>
$matchlinechar
||
'*'
,
'weak'
=>
$weak
||
'.'
,
'strong'
=>
$strong
||
':'
,
'mismatch'
=>
' '
,
);
my
@seqchars
;
my
$alphabet
;
foreach
my
$seq
(
$self
->each_seq ) {
push
@seqchars
, [
split
(//,
uc
(
$seq
->seq)) ];
$alphabet
=
$seq
->alphabet
unless
defined
$alphabet
;
}
my
$refseq
=
shift
@seqchars
;
my
$matchline
;
POS:
foreach
my
$pos
( 0..
$self
->
length
) {
my
$refchar
=
$refseq
->[
$pos
];
my
$char
=
$matchchars
{
'mismatch'
};
unless
(
defined
$refchar
) {
last
if
$pos
==
$self
->
length
;
goto
bottom;
}
my
%col
= (
$refchar
=> 1);
my
$dash
= (
$refchar
eq
'-'
||
$refchar
eq
'.'
||
$refchar
eq
' '
);
foreach
my
$seq
(
@seqchars
) {
next
if
$pos
>=
scalar
@$seq
;
$dash
= 1
if
(
$seq
->[
$pos
] eq
'-'
||
$seq
->[
$pos
] eq
'.'
||
$seq
->[
$pos
] eq
' '
);
$col
{
$seq
->[
$pos
]}++
if
defined
$seq
->[
$pos
];
}
my
@colresidues
=
sort
keys
%col
;
if
(
$dash
) {
$char
=
$matchchars
{
'mismatch'
} }
elsif
(
@colresidues
== 1 ) {
$char
=
$matchchars
{
'match'
} }
elsif
(
$alphabet
eq
'protein'
) {
TYPE:
foreach
my
$type
(
qw(strong weak)
) {
my
%groups
;
foreach
my
$c
(
@colresidues
) {
foreach
my
$f
(
grep
{
index
(
$_
,
$c
) >= 0 } @{
$CONSERVATION_GROUPS
{
$type
}} ) {
push
@{
$groups
{
$f
}},
$c
;
}
}
GRP:
foreach
my
$cols
(
values
%groups
) {
@$cols
=
sort
@$cols
;
next
if
(
scalar
@$cols
!=
scalar
@colresidues
);
for
(
$_
=0;
$_
< (
scalar
@$cols
);
$_
++ ) {
next
GRP
if
(
$cols
->[
$_
] ne
$colresidues
[
$_
] );
}
$char
=
$matchchars
{
$type
};
last
TYPE;
}
}
}
bottom:
$matchline
.=
$char
;
}
return
$matchline
;
}
Hide Show 11 lines of Pod
sub
gap_line {
my
(
$self
,
$gapchar
) =
@_
;
$gapchar
=
$gapchar
||
$self
->gap_char;
my
%gap_hsh
;
foreach
my
$seq
(
$self
->each_seq ) {
my
$i
= 0;
map
{
$gap_hsh
{
$_
->[0]} =
undef
}
grep
{
$_
->[1] eq
$gapchar
}
map
{[
$i
++,
$_
]}
split
(//,
uc
(
$seq
->seq));
}
my
$gap_line
;
foreach
my
$pos
( 0..
$self
->
length
-1 ) {
$gap_line
.= (
exists
$gap_hsh
{
$pos
}) ?
$gapchar
:
'.'
;
}
return
$gap_line
;
}
Hide Show 11 lines of Pod
sub
all_gap_line {
my
(
$self
,
$gapchar
) =
@_
;
$gapchar
=
$gapchar
||
$self
->gap_char;
my
%gap_hsh
;
my
@seqs
=
$self
->each_seq;
foreach
my
$seq
(
@seqs
) {
my
$i
= 0;
map
{
$gap_hsh
{
$_
->[0]}++}
grep
{
$_
->[1] eq
$gapchar
}
map
{[
$i
++,
$_
]}
split
(//,
uc
(
$seq
->seq));
}
my
$gap_line
;
foreach
my
$pos
( 0..
$self
->
length
-1 ) {
if
(
exists
$gap_hsh
{
$pos
} &&
$gap_hsh
{
$pos
} ==
scalar
@seqs
) {
$gap_line
.=
$gapchar
;
}
else
{
$gap_line
.=
'.'
;
}
}
return
$gap_line
;
}
Hide Show 12 lines of Pod
sub
gap_col_matrix {
my
(
$self
,
$gapchar
) =
@_
;
$gapchar
=
$gapchar
||
$self
->gap_char;
my
%gap_hsh
;
my
@cols
;
foreach
my
$seq
(
$self
->each_seq ) {
my
$i
= 0;
my
$str
=
$seq
->seq;
my
$len
=
$seq
->
length
;
my
$ch
;
my
$id
=
$seq
->display_id;
while
(
$i
<
$len
) {
$ch
=
substr
(
$str
,
$i
, 1);
$cols
[
$i
++]->{
$id
} = (
$ch
eq
$gapchar
);
}
}
return
\
@cols
;
}
Hide Show 17 lines of Pod
sub
match {
my
(
$self
,
$match
) =
@_
;
$match
||=
'.'
;
my
(
$matching_char
) =
$match
;
$matching_char
=
"\\$match"
if
$match
=~ /[\^.$|()\[\]]/ ;
$self
->map_chars(
$matching_char
,
'-'
);
my
@seqs
=
$self
->each_seq();
return
1
unless
scalar
@seqs
> 1;
my
$refseq
=
shift
@seqs
;
my
@refseq
=
split
//,
$refseq
->seq;
my
$gapchar
=
$self
->gap_char;
foreach
my
$seq
(
@seqs
) {
my
@varseq
=
split
//,
$seq
->seq();
for
(
my
$i
=0;
$i
<
scalar
@varseq
;
$i
++) {
$varseq
[
$i
] =
$match
if
defined
$refseq
[
$i
] &&
(
$refseq
[
$i
] =~ /[A-Za-z\*]/ ||
$refseq
[
$i
] =~ /
$gapchar
/ )
&&
$refseq
[
$i
] eq
$varseq
[
$i
];
}
$seq
->seq(
join
''
,
@varseq
);
}
$self
->match_char(
$match
);
return
1;
}
Hide Show 12 lines of Pod
sub
unmatch {
my
(
$self
,
$match
) =
@_
;
$match
||=
'.'
;
my
@seqs
=
$self
->each_seq();
return
1
unless
scalar
@seqs
> 1;
my
$refseq
=
shift
@seqs
;
my
@refseq
=
split
//,
$refseq
->seq;
my
$gapchar
=
$self
->gap_char;
foreach
my
$seq
(
@seqs
) {
my
@varseq
=
split
//,
$seq
->seq();
for
(
my
$i
=0;
$i
<
scalar
@varseq
;
$i
++) {
$varseq
[
$i
] =
$refseq
[
$i
]
if
defined
$refseq
[
$i
] &&
(
$refseq
[
$i
] =~ /[A-Za-z\*]/ ||
$refseq
[
$i
] =~ /
$gapchar
/ ) &&
$varseq
[
$i
] eq
$match
;
}
$seq
->seq(
join
''
,
@varseq
);
}
$self
->match_char(
''
);
return
1;
}
Hide Show 18 lines of Pod
sub
id {
my
(
$self
,
$name
) =
@_
;
if
(
defined
(
$name
)) {
$self
->{
'_id'
} =
$name
;
}
return
$self
->{
'_id'
};
}
Hide Show 10 lines of Pod
sub
accession {
my
(
$self
,
$acc
) =
@_
;
if
(
defined
(
$acc
)) {
$self
->{
'_accession'
} =
$acc
;
}
return
$self
->{
'_accession'
};
}
Hide Show 10 lines of Pod
sub
description {
my
(
$self
,
$name
) =
@_
;
if
(
defined
(
$name
)) {
$self
->{
'_description'
} =
$name
;
}
return
$self
->{
'_description'
};
}
Hide Show 12 lines of Pod
sub
missing_char {
my
(
$self
,
$char
) =
@_
;
if
(
defined
$char
) {
$self
->throw(
"Single missing character, not [$char]!"
)
if
CORE::
length
(
$char
) > 1;
$self
->{
'_missing_char'
} =
$char
;
}
return
$self
->{
'_missing_char'
};
}
Hide Show 10 lines of Pod
sub
match_char {
my
(
$self
,
$char
) =
@_
;
if
(
defined
$char
) {
$self
->throw(
"Single match character, not [$char]!"
)
if
CORE::
length
(
$char
) > 1;
$self
->{
'_match_char'
} =
$char
;
}
return
$self
->{
'_match_char'
};
}
Hide Show 10 lines of Pod
sub
gap_char {
my
(
$self
,
$char
) =
@_
;
if
(
defined
$char
|| !
defined
$self
->{
'_gap_char'
} ) {
$char
=
'-'
unless
defined
$char
;
$self
->throw(
"Single gap character, not [$char]!"
)
if
CORE::
length
(
$char
) > 1;
$self
->{
'_gap_char'
} =
$char
;
}
return
$self
->{
'_gap_char'
};
}
Hide Show 10 lines of Pod
sub
symbol_chars{
my
(
$self
,
$includeextra
) =
@_
;
unless
(
$self
->{
'_symbols'
}) {
foreach
my
$seq
(
$self
->each_seq) {
map
{
$self
->{
'_symbols'
}->{
$_
} = 1; }
split
(//,
$seq
->seq);
}
}
my
%copy
= %{
$self
->{
'_symbols'
}};
if
( !
$includeextra
) {
foreach
my
$char
(
$self
->gap_char,
$self
->match_char,
$self
->missing_char) {
delete
$copy
{
$char
}
if
(
defined
$char
);
}
}
return
keys
%copy
;
}
Hide Show 15 lines of Pod
sub
score {
my
$self
=
shift
;
$self
->{score} =
shift
if
@_
;
return
$self
->{score};
}
Hide Show 14 lines of Pod
sub
consensus_string {
my
$self
=
shift
;
my
$threshold
=
shift
;
my
$out
=
""
;
my
$len
=
$self
->
length
- 1;
foreach
( 0 ..
$len
) {
$out
.=
$self
->_consensus_aa(
$_
,
$threshold
);
}
return
$out
;
}
sub
_consensus_aa {
my
$self
=
shift
;
my
$point
=
shift
;
my
$threshold_percent
=
shift
|| -1 ;
my
(
$seq
,
%hash
,
$count
,
$letter
,
$key
);
my
$gapchar
=
$self
->gap_char;
foreach
$seq
(
$self
->each_seq() ) {
$letter
=
substr
(
$seq
->seq,
$point
,1);
$self
->throw(
"--$point-----------"
)
if
$letter
eq
''
;
(
$letter
eq
$gapchar
||
$letter
=~ /\./) &&
next
;
$hash
{
$letter
}++;
}
my
$number_of_sequences
=
$self
->num_sequences();
my
$threshold
=
$number_of_sequences
*
$threshold_percent
/ 100. ;
$count
= -1;
$letter
=
'?'
;
foreach
$key
(
sort
keys
%hash
) {
if
(
$hash
{
$key
} >
$count
&&
$hash
{
$key
} >=
$threshold
) {
$letter
=
$key
;
$count
=
$hash
{
$key
};
}
}
return
$letter
;
}
Hide Show 18 lines of Pod
sub
consensus_iupac {
my
$self
=
shift
;
my
$out
=
""
;
my
$len
=
$self
->
length
-1;
foreach
my
$seq
(
$self
->each_seq() ) {
$self
->throw(
"Seq ["
.
$seq
->get_nse.
"] is a protein"
)
if
$seq
->alphabet eq
'protein'
;
}
foreach
my
$count
( 0 ..
$len
) {
$out
.=
$self
->_consensus_iupac(
$count
);
}
return
$out
;
}
sub
_consensus_iupac {
my
(
$self
,
$column
) =
@_
;
my
(
$string
,
$char
,
$rna
);
foreach
my
$seq
(
$self
->each_seq() ) {
$string
.=
substr
(
$seq
->seq,
$column
, 1);
}
$string
=
uc
$string
;
if
(
$string
=~ /N/) {
$string
=~ /\W/ ?
return
'n'
:
return
'N'
;
}
return
'-'
if
$string
=~ /^\W+$/;
if
(
$string
=~ /U/) {
$string
=~ s/U/T/;
$rna
= 1;
}
if
(
$string
=~ /[VDHB]/) {
$string
=~ s/V/AGC/;
$string
=~ s/D/AGT/;
$string
=~ s/H/ACT/;
$string
=~ s/B/CTG/;
}
if
(
$string
=~ /[SKYRWM]/) {
$string
=~ s/S/GC/;
$string
=~ s/K/GT/;
$string
=~ s/Y/CT/;
$string
=~ s/R/AG/;
$string
=~ s/W/AT/;
$string
=~ s/M/AC/;
}
if
(
$string
=~ /A/) {
$char
=
'A'
;
if
(
$string
=~ /G/) {
$char
=
'R'
;
if
(
$string
=~ /C/) {
$char
=
'V'
;
if
(
$string
=~ /T/) {
$char
=
'N'
;
}
}
elsif
(
$string
=~ /T/) {
$char
=
'D'
;
}
}
elsif
(
$string
=~ /C/) {
$char
=
'M'
;
if
(
$string
=~ /T/) {
$char
=
'H'
;
}
}
elsif
(
$string
=~ /T/) {
$char
=
'W'
;
}
}
elsif
(
$string
=~ /C/) {
$char
=
'C'
;
if
(
$string
=~ /T/) {
$char
=
'Y'
;
if
(
$string
=~ /G/) {
$char
=
'B'
;
}
}
elsif
(
$string
=~ /G/) {
$char
=
'S'
;
}
}
elsif
(
$string
=~ /G/) {
$char
=
'G'
;
if
(
$string
=~ /C/) {
$char
=
'S'
;
}
elsif
(
$string
=~ /T/) {
$char
=
'K'
;
}
}
elsif
(
$string
=~ /T/) {
$char
=
'T'
;
}
$char
=
'U'
if
$rna
and
$char
eq
'T'
;
$char
=
lc
$char
if
$string
=~ /\W/;
return
$char
;
}
Hide Show 12 lines of Pod
sub
consensus_meta {
my
(
$self
,
$meta
) =
@_
;
if
(
$meta
&& (!
ref
$meta
|| !
$meta
->isa(
'FAST::Bio::Seq::MetaI'
))) {
$self
->throw(
'Not a FAST::Bio::Seq::MetaI object'
);
}
return
$self
->{
'_aln_meta'
} =
$meta
if
$meta
;
return
$self
->{
'_aln_meta'
}
}
Hide Show 11 lines of Pod
sub
is_flush {
my
(
$self
,
$report
) =
@_
;
my
$seq
;
my
$length
= (-1);
my
$temp
;
foreach
$seq
(
$self
->each_seq() ) {
if
(
$length
== (-1) ) {
$length
= CORE::
length
(
$seq
->seq());
next
;
}
$temp
= CORE::
length
(
$seq
->seq());
if
(
$temp
!=
$length
) {
$self
->
warn
(
"expecting $length not $temp from "
.
$seq
->display_id)
if
(
$report
);
$self
->debug(
"expecting $length not $temp from "
.
$seq
->display_id);
$self
->debug(
$seq
->seq().
"\n"
);
return
0;
}
}
return
1;
}
Hide Show 11 lines of Pod
sub
length_aln {
my
$self
=
shift
;
$self
->deprecated(
"length_aln - deprecated method. Use length() instead."
);
$self
->
length
(
@_
);
}
sub
length
{
my
$self
=
shift
;
my
$seq
;
my
$length
= -1;
my
$temp
;
foreach
$seq
(
$self
->each_seq() ) {
$temp
=
$seq
->
length
();
if
(
$temp
>
$length
) {
$length
=
$temp
;
}
}
return
$length
;
}
Hide Show 11 lines of Pod
sub
maxname_length {
my
$self
=
shift
;
$self
->deprecated(
"maxname_length - deprecated method."
.
" Use maxdisplayname_length() instead."
);
$self
->maxdisplayname_length();
}
sub
maxnse_length {
my
$self
=
shift
;
$self
->deprecated(
"maxnse_length - deprecated method."
.
" Use maxnse_length() instead."
);
$self
->maxdisplayname_length();
}
sub
maxdisplayname_length {
my
$self
=
shift
;
my
$maxname
= (-1);
my
(
$seq
,
$len
);
foreach
$seq
(
$self
->each_seq() ) {
$len
= CORE::
length
$self
->displayname(
$seq
->get_nse());
if
(
$len
>
$maxname
) {
$maxname
=
$len
;
}
}
return
$maxname
;
}
Hide Show 12 lines of Pod
sub
max_metaname_length {
my
$self
=
shift
;
my
$maxname
= (-1);
my
(
$seq
,
$len
);
for
$seq
(
$self
->each_seq() ) {
next
if
!
$seq
->isa(
'FAST::Bio::Seq::MetaI'
|| !
$seq
->meta_names);
for
my
$mtag
(
$seq
->meta_names) {
$len
= CORE::
length
$mtag
;
if
(
$len
>
$maxname
) {
$maxname
=
$len
;
}
}
}
for
my
$meta
(
$self
->consensus_meta) {
next
unless
$meta
;
for
my
$name
(
$meta
->meta_names) {
$len
= CORE::
length
$name
;
if
(
$len
>
$maxname
) {
$maxname
=
$len
;
}
}
}
return
$maxname
;
}
Hide Show 11 lines of Pod
sub
num_residues {
my
$self
=
shift
;
my
$count
= 0;
foreach
my
$seq
(
$self
->each_seq) {
my
$str
=
$seq
->seq();
$count
+= (
$str
=~ s/[A-Za-z]//g);
}
return
$count
;
}
Hide Show 11 lines of Pod
sub
num_sequences {
my
$self
=
shift
;
return
scalar
(
$self
->each_seq);
}
Hide Show 16 lines of Pod
sub
average_percentage_identity{
my
(
$self
,
@args
) =
@_
;
my
@alphabet
= (
'A'
,
'B'
,
'C'
,
'D'
,
'E'
,
'F'
,
'G'
,
'H'
,
'I'
,
'J'
,
'K'
,
'L'
,
'M'
,
'N'
,
'O'
,
'P'
,
'Q'
,
'R'
,
'S'
,
'T'
,
'U'
,
'V'
,
'W'
,
'X'
,
'Y'
,
'Z'
);
my
(
$len
,
$total
,
$subtotal
,
$divisor
,
$subdivisor
,
@seqs
,
@countHashes
);
if
(!
$self
->is_flush()) {
$self
->throw(
"All sequences in the alignment must be the same length"
);
}
@seqs
=
$self
->each_seq();
$len
=
$self
->
length
();
for
(
my
$index
=0;
$index
<
$len
;
$index
++) {
foreach
my
$letter
(
@alphabet
) {
$countHashes
[
$index
]->{
$letter
} = 0;
}
}
foreach
my
$seq
(
@seqs
) {
my
@seqChars
=
split
//,
$seq
->seq();
for
(
my
$column
=0;
$column
<
@seqChars
;
$column
++ ) {
my
$char
=
uc
(
$seqChars
[
$column
]);
if
(
exists
$countHashes
[
$column
]->{
$char
}) {
$countHashes
[
$column
]->{
$char
}++;
}
}
}
$total
= 0;
$divisor
= 0;
for
(
my
$column
=0;
$column
<
$len
;
$column
++) {
my
%hash
= %{
$countHashes
[
$column
]};
$subdivisor
= 0;
foreach
my
$res
(
keys
%hash
) {
$total
+=
$hash
{
$res
}*(
$hash
{
$res
} - 1);
$subdivisor
+=
$hash
{
$res
};
}
$divisor
+=
$subdivisor
* (
$subdivisor
- 1);
}
return
$divisor
> 0 ? (
$total
/
$divisor
)*100.0 : 0;
}
Hide Show 11 lines of Pod
sub
percentage_identity {
my
$self
=
shift
;
return
$self
->average_percentage_identity();
}
Hide Show 16 lines of Pod
sub
overall_percentage_identity{
my
(
$self
,
$length_measure
) =
@_
;
my
%alphabet
=
map
{
$_
=>
undef
}
qw (A
C G T U B D E F H I J K L M N O P Q R S V W X Y Z);
my
%enum
=
map
{
$_
=>
undef
}
qw (align
short long);
$self
->throw(
"Unknown argument [$length_measure]"
)
if
$length_measure
and not
exists
$enum
{
$length_measure
};
$length_measure
||=
'align'
;
if
(!
$self
->is_flush()) {
$self
->throw(
"All sequences in the alignment must be the same length"
);
}
my
$len
;
my
$total
= 0;
my
@countHashes
;
my
@seqs
=
$self
->each_seq;
my
$nof_seqs
=
scalar
@seqs
;
my
$aln_len
=
$self
->
length
();
for
my
$seq
(
@seqs
) {
my
$seqstr
=
$seq
->seq;
for
my
$column
(0 ..
$aln_len
-1) {
my
$char
=
uc
(
substr
(
$seqstr
,
$column
, 1) );
if
(
exists
$alphabet
{
$char
} ) {
if
(
defined
$countHashes
[
$column
]->{
$char
} ) {
$countHashes
[
$column
]->{
$char
}++;
}
else
{
$countHashes
[
$column
]->{
$char
} = 1;
}
if
(
$countHashes
[
$column
]->{
$char
} ==
$nof_seqs
) {
$total
++;
}
}
}
if
(
$length_measure
eq
'short'
||
$length_measure
eq
'long'
) {
my
$seq_len
=
$seqstr
=~
tr
/[A-Za-z]//;
if
(
$length_measure
eq
'short'
) {
if
( (not
defined
$len
) || (
$seq_len
<
$len
) ) {
$len
=
$seq_len
;
}
}
elsif
(
$length_measure
eq
'long'
) {
if
( (not
defined
$len
) || (
$seq_len
>
$len
) ) {
$len
=
$seq_len
;
}
}
}
}
if
(
$length_measure
eq
'align'
) {
$len
=
$aln_len
;
}
return
(
$total
/
$len
) * 100.0;
}
Hide Show 45 lines of Pod
sub
column_from_residue_number {
my
(
$self
,
$name
,
$resnumber
) =
@_
;
$self
->throw(
"No sequence with name [$name]"
)
unless
$self
->{
'_start_end_lists'
}->{
$name
};
$self
->throw(
"Second argument residue number missing"
)
unless
$resnumber
;
foreach
my
$seq
(
$self
->each_seq_with_id(
$name
)) {
my
$col
;
eval
{
$col
=
$seq
->column_from_residue_number(
$resnumber
);
};
next
if
$@;
return
$col
;
}
$self
->throw(
"Could not find a sequence segment in $name "
.
"containing residue number $resnumber"
);
}
Hide Show 17 lines of Pod
sub
get_displayname {
my
$self
=
shift
;
$self
->deprecated(
"get_displayname - deprecated method. Use displayname() instead."
);
$self
->displayname(
@_
);
}
sub
set_displayname {
my
$self
=
shift
;
$self
->deprecated(
"set_displayname - deprecated method. Use displayname() instead."
);
$self
->displayname(
@_
);
}
sub
displayname {
my
(
$self
,
$name
,
$disname
) =
@_
;
$self
->throw(
"No sequence with name [$name]"
)
unless
defined
$self
->{
'_seq'
}->{
$name
};
if
(
$disname
and
$name
) {
$self
->{
'_dis_name'
}->{
$name
} =
$disname
;
return
$disname
;
}
elsif
(
defined
$self
->{
'_dis_name'
}->{
$name
} ) {
return
$self
->{
'_dis_name'
}->{
$name
};
}
else
{
return
$name
;
}
}
Hide Show 11 lines of Pod
sub
set_displayname_count {
my
$self
=
shift
;
my
(
@arr
,
$name
,
$seq
,
$count
,
$temp
,
$nse
);
foreach
$seq
(
$self
->each_alphabetically() ) {
$nse
=
$seq
->get_nse();
if
(
defined
$name
and
$name
eq (
$seq
->id()) ) {
$temp
=
sprintf
(
"%s_%s"
,
$name
,
$count
);
$self
->displayname(
$nse
,
$temp
);
$count
++;
}
else
{
$count
= 1;
$name
=
$seq
->id();
$temp
=
sprintf
(
"%s_%s"
,
$name
,
$count
);
$self
->displayname(
$nse
,
$temp
);
$count
++;
}
}
return
1;
}
Hide Show 11 lines of Pod
sub
set_displayname_flat {
my
$self
=
shift
;
my
(
$nse
,
$seq
);
foreach
$seq
(
$self
->each_seq() ) {
$nse
=
$seq
->get_nse();
$self
->displayname(
$nse
,
$seq
->id());
}
return
1;
}
Hide Show 10 lines of Pod
sub
set_displayname_normal {
my
$self
=
shift
;
my
(
$nse
,
$seq
);
foreach
$seq
(
$self
->each_seq() ) {
$nse
=
$seq
->get_nse();
$self
->displayname(
$nse
,
$nse
);
}
return
1;
}
Hide Show 12 lines of Pod
sub
source{
my
(
$self
,
$value
) =
@_
;
if
(
defined
$value
) {
$self
->{
'_source'
} =
$value
;
}
return
$self
->{
'_source'
};
}
Hide Show 14 lines of Pod
sub
set_displayname_safe {
my
$self
=
shift
;
my
$idlength
=
shift
|| 10;
my
(
$seq
,
%phylip_name
);
my
$ct
=0;
my
$new
=FAST::Bio::SimpleAlign->new();
foreach
$seq
(
$self
->each_seq() ) {
$ct
++;
my
$pname
=
"S"
.
sprintf
"%0"
. (
$idlength
-1) .
"s"
,
$ct
;
$phylip_name
{
$pname
}=
$seq
->id();
my
$new_seq
= FAST::Bio::LocatableSeq->new(
-id
=>
$pname
,
-seq
=>
$seq
->seq(),
-alphabet
=>
$seq
->alphabet,
-start
=>
$seq
->{_start},
-end
=>
$seq
->{_end}
);
$new
->add_seq(
$new_seq
);
}
$self
->debug(
"$ct seq names changed. Restore names by using restore_displayname."
);
return
(
$new
, \
%phylip_name
);
}
Hide Show 11 lines of Pod
sub
restore_displayname {
my
$self
=
shift
;
my
$ref
=
shift
;
my
%name
=
%$ref
;
my
$new
=FAST::Bio::SimpleAlign->new();
foreach
my
$seq
(
$self
->each_seq() ) {
$self
->throw(
"No sequence with name"
)
unless
defined
$name
{
$seq
->id()};
my
$new_seq
= FAST::Bio::LocatableSeq->new(
-id
=>
$name
{
$seq
->id()},
-seq
=>
$seq
->seq(),
-alphabet
=>
$seq
->alphabet,
-start
=>
$seq
->{_start},
-end
=>
$seq
->{_end}
);
$new
->add_seq(
$new_seq
);
}
return
$new
;
}
Hide Show 11 lines of Pod
sub
sort_by_start {
my
$self
=
shift
;
my
(
$seq
,
$nse
,
@arr
,
%hash
,
$count
);
foreach
$seq
(
$self
->each_seq() ) {
$nse
=
$seq
->get_nse;
$hash
{
$nse
} =
$seq
;
}
$count
= 0;
%{
$self
->{
'_order'
}} = ();
foreach
$nse
(
sort
_startend
keys
%hash
) {
$self
->{
'_order'
}->{
$count
} =
$nse
;
$count
++;
}
1;
}
sub
_startend
{
my
(
$aname
,
$arange
) =
split
(/[\/]/,
$a
);
my
(
$bname
,
$brange
) =
split
(/[\/]/,
$b
);
my
(
$astart
,
$aend
) =
split
(/\-/,
$arange
);
my
(
$bstart
,
$bend
) =
split
(/\-/,
$brange
);
return
$astart
<=>
$bstart
;
}
Hide Show 46 lines of Pod
sub
bracket_string {
my
(
$self
,
@args
) =
@_
;
my
(
$ref
,
$a1
,
$a2
,
$delim
,
$sep
) =
$self
->_rearrange([
qw(refseq allele1 allele2 delimiters separator)
],
@args
);
$self
->throw(
'Missing refseq/allele1/allele2'
)
if
(!
$a1
|| !
$a2
|| !
$ref
);
my
(
$ld
,
$rd
);
(
$ld
,
$rd
) =
split
(
''
,
$delim
, 2)
if
$delim
;
$ld
||=
'['
;
$rd
||=
']'
;
$sep
||=
'/'
;
my
(
$refseq
,
$allele1
,
$allele2
) =
map
{(
$self
->each_seq_with_id(
$_
) )} (
$ref
,
$a1
,
$a2
);
if
(!
$refseq
|| !
$allele1
|| !
$allele2
) {
$self
->throw(
"One of your refseq/allele IDs is invalid!"
);
}
my
$len
=
$self
->
length
-1;
my
$bic
=
''
;
for
my
$column
( 0 ..
$len
) {
my
$string
;
my
(
$compres
,
$res1
,
$res2
) =
map
{
substr
(
$_
->seq,
$column
, 1)} (
$refseq
,
$allele1
,
$allele2
);
$string
= (
$compres
eq
$res1
&&
$compres
eq
$res2
) ?
$compres
:
$ld
.
$res1
.
$sep
.
$res2
.
$rd
;
$bic
.=
$string
;
}
return
$bic
;
}
Hide Show 17 lines of Pod
sub
get_SeqFeatures {
my
$self
=
shift
;
my
$filter_cb
=
shift
;
$self
->throw(
"Arg (filter callback) must be a coderef"
)
unless
!
defined
(
$filter_cb
) or
ref
(
$filter_cb
) eq
'CODE'
;
if
( !
defined
$self
->{
'_as_feat'
} ) {
$self
->{
'_as_feat'
} = [];
}
if
(
$filter_cb
) {
return
grep
{
$filter_cb
->(
$_
) } @{
$self
->{
'_as_feat'
}};
}
return
@{
$self
->{
'_as_feat'
}};
}
Hide Show 12 lines of Pod
sub
add_SeqFeature {
my
(
$self
,
@feat
) =
@_
;
$self
->{
'_as_feat'
} = []
unless
$self
->{
'_as_feat'
};
foreach
my
$feat
(
@feat
) {
if
( !
$feat
->isa(
"FAST::Bio::SeqFeatureI"
) ) {
$self
->throw(
"$feat is not a SeqFeatureI and that's what we expect..."
);
}
push
(@{
$self
->{
'_as_feat'
}},
$feat
);
}
return
1;
}
Hide Show 11 lines of Pod
sub
remove_SeqFeatures {
my
$self
=
shift
;
return
()
unless
$self
->{
'_as_feat'
};
my
@feats
= @{
$self
->{
'_as_feat'
}};
$self
->{
'_as_feat'
} = [];
return
@feats
;
}
Hide Show 10 lines of Pod
sub
feature_count {
my
(
$self
) =
@_
;
if
(
defined
(
$self
->{
'_as_feat'
})) {
return
($
}
else
{
return
0;
}
}
Hide Show 31 lines of Pod
sub
annotation {
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->throw(
"object of class "
.
ref
(
$value
).
" does not implement "
.
"FAST::Bio::AnnotationCollectionI. Too bad."
)
unless
$value
->isa(
"FAST::Bio::AnnotationCollectionI"
);
$obj
->{
'_annotation'
} =
$value
;
}
elsif
( !
defined
$obj
->{
'_annotation'
}) {
$obj
->{
'_annotation'
} = FAST::Bio::Annotation::Collection->new();
}
return
$obj
->{
'_annotation'
};
}
Hide Show 15 lines of Pod
sub
no_residues {
my
$self
=
shift
;
$self
->deprecated(
-warn_version
=> 1.0069,
-throw_version
=> 1.0075,
-message
=>
'Use of method no_residues() is deprecated, use num_residues() instead'
);
$self
->num_residues(
@_
);
}
Hide Show 11 lines of Pod
sub
no_sequences {
my
$self
=
shift
;
$self
->deprecated(
-warn_version
=> 1.0069,
-throw_version
=> 1.0075,
-message
=>
'Use of method no_sequences() is deprecated, use num_sequences() instead'
);
$self
->num_sequences(
@_
);
}
Hide Show 19 lines of Pod
sub
mask_columns {
my
$self
=
shift
;
my
$nonres
=
$FAST::Bio::LocatableSeq::GAP_SYMBOLS
.
$FAST::Bio::LocatableSeq::FRAMESHIFT_SYMBOLS
;
my
(
$start
,
$end
,
$mask_char
) =
@_
;
unless
(
defined
$mask_char
) {
$mask_char
=
'N'
}
$self
->throw(
"Mask start has to be a positive integer and less than "
.
"alignment length, not [$start]"
)
unless
$start
=~ /^\d+$/ &&
$start
> 0 &&
$start
<=
$self
->
length
;
$self
->throw(
"Mask end has to be a positive integer and less than "
.
"alignment length, not [$end]"
)
unless
$end
=~ /^\d+$/ &&
$end
> 0 &&
$end
<=
$self
->
length
;
$self
->throw(
"Mask start [$start] has to be smaller than or equal to "
.
"end [$end]"
)
unless
$start
<=
$end
;
$self
->throw(
"Mask character $mask_char has to be a single character "
.
"and not a gap or frameshift symbol"
)
unless
CORE::
length
(
$mask_char
) == 1 &&
$mask_char
!~ m{
$nonres
};
my
$aln
=
$self
->new;
$aln
->id(
$self
->id);
foreach
my
$seq
(
$self
->each_seq() ) {
my
$new_seq
= FAST::Bio::LocatableSeq->new(
-id
=>
$seq
->id,
-alphabet
=>
$seq
->alphabet,
-strand
=>
$seq
->strand,
-verbose
=>
$self
->verbose);
my
$masked_string
=
substr
(
$seq
->seq,
$start
- 1,
$end
-
$start
+ 1);
$masked_string
=~ s{[^
$nonres
]}{
$mask_char
}g;
my
$new_dna_string
=
substr
(
$seq
->seq,0,
$start
-1) .
$masked_string
.
substr
(
$seq
->seq,
$end
);
$new_seq
->seq(
$new_dna_string
);
$aln
->add_seq(
$new_seq
);
}
return
$aln
;
}
1;