Hide Show 279 lines of Pod
use
vars
qw(%DNAChanges @Nucleotides %NucleotideIndexes
$GapChars $SeqCount $DefaultGapPenalty %DistanceMethods
$CODONS %synchanges $synsites $Precision $GCChhars)
;
BEGIN {
$GapChars
=
'[\.\-]'
;
$GCChhars
=
'[GCS]'
;
@Nucleotides
=
qw(A G T C)
;
$SeqCount
= 2;
$Precision
= 5;
%NucleotideIndexes
= (
'A'
=> 0,
'T'
=> 1,
'C'
=> 2,
'G'
=> 3,
'AT'
=> 0,
'AC'
=> 1,
'AG'
=> 2,
'CT'
=> 3,
'GT'
=> 4,
'CG'
=> 5,
);
$DefaultGapPenalty
= 0;
%DNAChanges
= (
'Transversions'
=> {
'A'
=> [
'T'
,
'C'
],
'T'
=> [
'A'
,
'G'
],
'C'
=> [
'A'
,
'G'
],
'G'
=> [
'C'
,
'T'
],
},
'Transitions'
=> {
'A'
=> [
'G'
],
'G'
=> [
'A'
],
'C'
=> [
'T'
],
'T'
=> [
'C'
],
},
);
%DistanceMethods
= (
'jc|jukes|jukescantor|jukes\-cantor'
=>
'JukesCantor'
,
'jcuncor|uncorrected'
=>
'Uncorrected'
,
'f81|felsenstein81'
=>
'F81'
,
'k2|k2p|k80|kimura'
=>
'Kimura'
,
't92|tamura|tamura92'
=>
'Tamura'
,
'f84|felsenstein84'
=>
'F84'
,
'tajimanei|tajima\-nei'
=>
'TajimaNei'
,
'jinnei|jin\-nei'
=>
'JinNei'
);
}
$CODONS
= get_codons();
my
@t
=
split
''
,
"FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"
;
$synsites
= get_syn_sites();
%synchanges
= get_syn_changes();
Hide Show 11 lines of Pod
sub
new {
my
(
$class
,
@args
) =
@_
;
my
$self
=
$class
->SUPER::new(
@args
);
$self
->pairwise_stats( new Bio::Align::PairwiseStatistics());
return
$self
;
}
Hide Show 15 lines of Pod
sub
distance{
my
(
$self
,
@args
) =
@_
;
my
(
$aln
,
$method
) =
$self
->_rearrange([
qw(ALIGN METHOD)
],
@args
);
if
( !
defined
$aln
|| !
ref
(
$aln
) || !
$aln
->isa(
'Bio::Align::AlignI'
) ) {
$self
->throw(
"Must supply a valid Bio::Align::AlignI for the -align parameter in distance"
);
}
$method
||=
'JukesCantor'
;
foreach
my
$m
(
keys
%DistanceMethods
) {
if
(
defined
$m
&&
$method
=~ /
$m
/i ) {
my
$mtd
=
"D_$DistanceMethods{$m}"
;
return
$self
->
$mtd
(
$aln
);
}
}
$self
->
warn
(
"Unrecognized distance method $method must be one of ["
.
join
(
','
,
$self
->available_distance_methods()).
"]"
);
return
;
}
Hide Show 11 lines of Pod
sub
available_distance_methods{
my
(
$self
,
@args
) =
@_
;
return
values
%DistanceMethods
;
}
Hide Show 5 lines of Pod
Hide Show 13 lines of Pod
sub
D_JukesCantor{
my
(
$self
,
$aln
,
$gappenalty
) =
@_
;
return
0
unless
$self
->_check_arg(
$aln
);
$gappenalty
=
$DefaultGapPenalty
unless
defined
$gappenalty
;
my
(
@seqs
,
@names
,
@values
,
%dist
);
my
$seqct
= 0;
foreach
my
$seq
(
$aln
->each_seq) {
push
@names
,
$seq
->display_id;
push
@seqs
,
uc
$seq
->seq();
$seqct
++;
}
my
$precisionstr
=
"%.$Precision"
.
"f"
;
for
(
my
$i
= 0;
$i
<
$seqct
-1;
$i
++ ) {
$dist
{
$names
[
$i
]}->{
$names
[
$i
]} = [
$i
,
$i
];
$values
[
$i
][
$i
] =
sprintf
(
$precisionstr
,0);
for
(
my
$j
=
$i
+1;
$j
<
$seqct
;
$j
++ ) {
my
(
$matrix
,
$pfreq
,
$gaps
) =
$self
->_build_nt_matrix(
$seqs
[
$i
],
$seqs
[
$j
]);
my
$m
= (
$matrix
->[0]->[0] +
$matrix
->[1]->[1] +
$matrix
->[2]->[2] +
$matrix
->[3]->[3] );
my
$D
= 1 - (
$m
/ (
$aln
->
length
-
$gaps
+ (
$gaps
*
$gappenalty
)));
my
$d
= (- 3 / 4) *
log
( 1 - (4 *
$D
/ 3));
$dist
{
$names
[
$i
]}->{
$names
[
$j
]} = [
$i
,
$j
];
$dist
{
$names
[
$j
]}->{
$names
[
$i
]} = [
$i
,
$j
];
$values
[
$j
][
$i
] =
$values
[
$i
][
$j
] =
sprintf
(
$precisionstr
,
$d
);
$dist
{
$names
[
$j
]}->{
$names
[
$j
]} = [
$j
,
$j
];
$values
[
$j
][
$j
] =
sprintf
(
$precisionstr
,0);
}
}
return
Bio::Matrix::PhylipDist->new(
-program
=>
'bioperl_DNAstats'
,
-matrix
=> \
%dist
,
-names
=> \
@names
,
-values
=> \
@values
);
}
Hide Show 14 lines of Pod
sub
D_F81{
my
(
$self
,
$aln
,
$gappenalty
) =
@_
;
return
0
unless
$self
->_check_arg(
$aln
);
$gappenalty
=
$DefaultGapPenalty
unless
defined
$gappenalty
;
my
(
@seqs
,
@names
,
@values
,
%dist
);
my
$seqct
= 0;
foreach
my
$seq
(
$aln
->each_seq) {
push
@names
,
$seq
->display_id;;
push
@seqs
,
uc
$seq
->seq();
$seqct
++;
}
my
$precisionstr
=
"%.$Precision"
.
"f"
;
for
(
my
$i
= 0;
$i
<
$seqct
-1;
$i
++ ) {
$dist
{
$names
[
$i
]}->{
$names
[
$i
]} = [
$i
,
$i
];
$values
[
$i
][
$i
] =
sprintf
(
$precisionstr
,0);
for
(
my
$j
=
$i
+1;
$j
<
$seqct
;
$j
++ ) {
my
(
$matrix
,
$pfreq
,
$gaps
) =
$self
->_build_nt_matrix(
$seqs
[
$i
],
$seqs
[
$j
]);
my
$m
= (
$matrix
->[0]->[0] +
$matrix
->[1]->[1] +
$matrix
->[2]->[2] +
$matrix
->[3]->[3] );
my
$D
= 1 - (
$m
/ (
$aln
->
length
-
$gaps
+ (
$gaps
*
$gappenalty
)));
my
$d
= (- 3 / 4) *
log
( 1 - (4 *
$D
/ 3));
$dist
{
$names
[
$i
]}->{
$names
[
$j
]} = [
$i
,
$j
];
$dist
{
$names
[
$j
]}->{
$names
[
$i
]} = [
$i
,
$j
];
$values
[
$j
][
$i
] =
$values
[
$i
][
$j
] =
sprintf
(
$precisionstr
,
$d
);
$dist
{
$names
[
$j
]}->{
$names
[
$j
]} = [
$j
,
$j
];
$values
[
$j
][
$j
] =
sprintf
(
$precisionstr
,0);
}
}
return
Bio::Matrix::PhylipDist->new(
-program
=>
'bioperl_DNAstats'
,
-matrix
=> \
%dist
,
-names
=> \
@names
,
-values
=> \
@values
);
}
Hide Show 12 lines of Pod
sub
D_Uncorrected {
my
(
$self
,
$aln
,
$gappenalty
) =
@_
;
$gappenalty
=
$DefaultGapPenalty
unless
defined
$gappenalty
;
return
0
unless
$self
->_check_arg(
$aln
);
my
(
@seqs
,
@names
,
@values
,
%dist
);
my
$seqct
= 0;
foreach
my
$seq
(
$aln
->each_seq) {
push
@names
,
$seq
->display_id;
push
@seqs
,
uc
$seq
->seq();
$seqct
++;
}
my
$precisionstr
=
"%.$Precision"
.
"f"
;
my
$len
=
$aln
->
length
;
for
(
my
$i
= 0;
$i
<
$seqct
-1;
$i
++ ) {
$dist
{
$names
[
$i
]}->{
$names
[
$i
]} = [
$i
,
$i
];
$values
[
$i
][
$i
] =
sprintf
(
$precisionstr
,0);
for
(
my
$j
=
$i
+1;
$j
<
$seqct
;
$j
++ ) {
my
(
$matrix
,
$pfreq
,
$gaps
) =
$self
->_build_nt_matrix(
$seqs
[
$i
],
$seqs
[
$j
]);
my
$m
= (
$matrix
->[0]->[0] +
$matrix
->[1]->[1] +
$matrix
->[2]->[2] +
$matrix
->[3]->[3] );
my
$D
= 1 - (
$m
/ (
$len
-
$gaps
+ (
$gaps
*
$gappenalty
)));
$dist
{
$names
[
$i
]}->{
$names
[
$j
]} = [
$i
,
$j
];
$dist
{
$names
[
$j
]}->{
$names
[
$i
]} = [
$i
,
$j
];
$values
[
$j
][
$i
] =
$values
[
$i
][
$j
] =
sprintf
(
$precisionstr
,
$D
);
$dist
{
$names
[
$j
]}->{
$names
[
$j
]} = [
$j
,
$j
];
$values
[
$j
][
$j
] =
sprintf
(
$precisionstr
,0);
}
}
return
Bio::Matrix::PhylipDist->new(
-program
=>
'bioperl_DNAstats'
,
-matrix
=> \
%dist
,
-names
=> \
@names
,
-values
=> \
@values
);
}
Hide Show 12 lines of Pod
sub
D_Kimura {
my
(
$self
,
$aln
) =
@_
;
return
0
unless
$self
->_check_arg(
$aln
);
my
(
@names
,
@values
,
%dist
);
my
$seqct
= 0;
foreach
my
$seq
(
$aln
->each_seq) {
push
@names
,
$seq
->display_id;
$seqct
++;
}
my
$precisionstr
=
"%.$Precision"
.
"f"
;
for
(
my
$i
= 0;
$i
<
$seqct
-1;
$i
++ ) {
$dist
{
$names
[
$i
]}->{
$names
[
$i
]} = [
$i
,
$i
];
$values
[
$i
][
$i
] =
sprintf
(
$precisionstr
,0);
for
(
my
$j
=
$i
+1;
$j
<
$seqct
;
$j
++ ) {
my
$pairwise
=
$aln
->select_noncont(
$i
+1,
$j
+1);
my
$L
=
$self
->pairwise_stats->number_of_comparable_bases(
$pairwise
);
unless
(
$L
) {
$L
= 1;
}
my
$P
=
$self
->transitions(
$pairwise
) /
$L
;
my
$Q
=
$self
->transversions(
$pairwise
) /
$L
;
my
$K
= 0;
my
$a
= 1 / ( 1 - (2 *
$P
) -
$Q
);
my
$b
= 1 / ( 1 - 2 *
$Q
);
if
(
$a
< 0 ||
$b
< 0 ) {
$K
= -1;
}
else
{
$K
= (1/2) *
log
(
$a
) + (1/4) *
log
(
$b
);
}
$dist
{
$names
[
$i
]}->{
$names
[
$j
]} = [
$i
,
$j
];
$dist
{
$names
[
$j
]}->{
$names
[
$i
]} = [
$i
,
$j
];
$values
[
$j
][
$i
] =
$values
[
$i
][
$j
] =
sprintf
(
$precisionstr
,
$K
);
$dist
{
$names
[
$j
]}->{
$names
[
$j
]} = [
$j
,
$j
];
$values
[
$j
][
$j
] =
sprintf
(
$precisionstr
,0);
}
}
return
Bio::Matrix::PhylipDist->new(
-program
=>
'bioperl_DNAstats'
,
-matrix
=> \
%dist
,
-names
=> \
@names
,
-values
=> \
@values
);
}
Hide Show 14 lines of Pod
sub
D_Kimura_variance {
my
(
$self
,
$aln
) =
@_
;
return
0
unless
$self
->_check_arg(
$aln
);
my
(
@names
,
@values
,
%dist
,
@var
);
my
$seqct
= 0;
foreach
my
$seq
(
$aln
->each_seq) {
push
@names
,
$seq
->display_id;
$seqct
++;
}
my
$precisionstr
=
"%.$Precision"
.
"f"
;
for
(
my
$i
= 0;
$i
<
$seqct
-1;
$i
++ ) {
$dist
{
$names
[
$i
]}->{
$names
[
$i
]} = [
$i
,
$i
];
$values
[
$i
][
$i
] =
sprintf
(
$precisionstr
,0);
for
(
my
$j
=
$i
+1;
$j
<
$seqct
;
$j
++ ) {
my
$pairwise
=
$aln
->select_noncont(
$i
+1,
$j
+1);
my
$L
=
$self
->pairwise_stats->number_of_comparable_bases(
$pairwise
);
unless
(
$L
) {
$L
= 1;
}
my
$P
=
$self
->transitions(
$pairwise
) /
$L
;
my
$Q
=
$self
->transversions(
$pairwise
) /
$L
;
my
(
$a
,
$b
,
$K
,
$var_k
);
my
$a_denom
= ( 1 - (2 *
$P
) -
$Q
);
my
$b_denom
= 1 - 2 *
$Q
;
unless
(
$a_denom
> 0 &&
$b_denom
> 0 ) {
$a
= 1;
$b
= 1;
$K
= -1;
$var_k
= -1;
}
else
{
$a
= 1 /
$a_denom
;
$b
= 1 /
$b_denom
;
$K
= (1/2) *
log
(
$a
) + (1/4) *
log
(
$b
);
my
$c
= (
$a
-
$b
) / 2;
my
$d
= (
$a
+
$b
) / 2;
$var_k
= (
$a
**2 *
$P
+
$d
**2 *
$Q
- (
$a
*
$P
+
$d
*
$Q
)**2 ) /
$L
;
}
$dist
{
$names
[
$i
]}->{
$names
[
$j
]} = [
$i
,
$j
];
$dist
{
$names
[
$j
]}->{
$names
[
$i
]} = [
$i
,
$j
];
$values
[
$j
][
$i
] =
$values
[
$i
][
$j
] =
sprintf
(
$precisionstr
,
$K
);
$dist
{
$names
[
$j
]}->{
$names
[
$j
]} = [
$j
,
$j
];
$values
[
$j
]->[
$j
] =
sprintf
(
$precisionstr
,0);
$var
[
$j
]->[
$i
] =
$var
[
$i
]->[
$j
] =
sprintf
(
$precisionstr
,
$var_k
);
$var
[
$j
]->[
$j
] =
$values
[
$j
]->[
$j
];
}
}
return
( Bio::Matrix::PhylipDist->new(
-program
=>
'bioperl_DNAstats'
,
-matrix
=> \
%dist
,
-names
=> \
@names
,
-values
=> \
@values
),
Bio::Matrix::PhylipDist->new(
-program
=>
'bioperl_DNAstats'
,
-matrix
=> \
%dist
,
-names
=> \
@names
,
-values
=> \
@var
)
);
}
Hide Show 11 lines of Pod
sub
D_Tamura {
my
(
$self
,
$aln
) =
@_
;
return
0
unless
$self
->_check_arg(
$aln
);
my
(
@seqs
,
@names
,
@values
,
%dist
,
$i
,
$j
);
my
$seqct
= 0;
my
$length
=
$aln
->
length
;
foreach
my
$seq
(
$aln
->each_seq) {
push
@names
,
$seq
->display_id;;
push
@seqs
,
uc
$seq
->seq();
$seqct
++;
}
my
$precisionstr
=
"%.$Precision"
.
"f"
;
my
(
@gap
,
@gc
,
@trans
,
@tranv
,
@score
);
$i
= 0;
for
my
$t1
(
@seqs
) {
$j
= 0;
for
my
$t2
(
@seqs
) {
$gap
[
$i
][
$j
] = 0;
for
(
my
$k
= 0;
$k
<
$length
;
$k
++ ) {
my
(
$c1
,
$c2
) = (
substr
(
$seqs
[
$i
],
$k
,1),
substr
(
$seqs
[
$j
],
$k
,1) );
if
(
$c1
=~ /^
$GapChars
$/ ||
$c2
=~ /^
$GapChars
$/ ) {
$gap
[
$i
][
$j
]++;
}
elsif
(
$c2
=~ /^
$GCChhars
$/i ) {
$gc
[
$i
][
$j
]++;
}
}
$gc
[
$i
][
$j
] = (
$gc
[
$i
][
$j
] /
(
$length
-
$gap
[
$i
][
$j
]) );
$j
++;
}
$i
++;
}
for
(
$i
= 0;
$i
<
$seqct
-1;
$i
++ ) {
$dist
{
$names
[
$i
]}->{
$names
[
$i
]} = [
$i
,
$i
];
$values
[
$i
][
$i
] =
sprintf
(
$precisionstr
,0);
for
(
$j
=
$i
+1;
$j
<
$seqct
;
$j
++ ) {
my
$pairwise
=
$aln
->select_noncont(
$i
+1,
$j
+1);
my
$L
=
$self
->pairwise_stats->number_of_comparable_bases(
$pairwise
);
my
$P
=
$self
->transitions(
$pairwise
) /
$L
;
my
$Q
=
$self
->transversions(
$pairwise
) /
$L
;
my
$C
=
$gc
[
$i
][
$j
] +
$gc
[
$j
][
$i
]-
( 2 *
$gc
[
$i
][
$j
] *
$gc
[
$j
][
$i
] );
if
(
$P
) {
$P
=
$P
/
$C
;
}
my
$d
= -(
$C
*
log
(1-
$P
-
$Q
)) -(0.5* ( 1 -
$C
) *
log
(1 - 2 *
$Q
));
$dist
{
$names
[
$i
]}->{
$names
[
$j
]} = [
$i
,
$j
];
$dist
{
$names
[
$j
]}->{
$names
[
$i
]} = [
$i
,
$j
];
$values
[
$j
][
$i
] =
$values
[
$i
][
$j
] =
sprintf
(
$precisionstr
,
$d
);
$dist
{
$names
[
$j
]}->{
$names
[
$j
]} = [
$j
,
$j
];
$values
[
$j
][
$j
] =
sprintf
(
$precisionstr
,0);
}
}
return
Bio::Matrix::PhylipDist->new(
-program
=>
'bioperl_DNAstats'
,
-matrix
=> \
%dist
,
-names
=> \
@names
,
-values
=> \
@values
);
}
Hide Show 12 lines of Pod
sub
D_F84 {
my
(
$self
,
$aln
,
$gappenalty
) =
@_
;
return
0
unless
$self
->_check_arg(
$aln
);
$self
->throw_not_implemented();
my
(
@seqs
,
@names
,
@values
,
%dist
);
my
$seqct
= 0;
foreach
my
$seq
(
$aln
->each_seq) {
my
$id
=
$seq
->display_id;
if
( !
length
(
$id
) ||
$id
=~ /^\s+$/ ) {
$id
=
$seqct
+1;
}
push
@names
,
$id
;
push
@seqs
,
uc
$seq
->seq();
$seqct
++;
}
my
$precisionstr
=
"%.$Precision"
.
"f"
;
for
(
my
$i
= 0;
$i
<
$seqct
-1;
$i
++ ) {
$dist
{
$names
[
$i
]}->{
$names
[
$i
]} = [
$i
,
$i
];
$values
[
$i
][
$i
] =
sprintf
(
$precisionstr
,0);
for
(
my
$j
=
$i
+1;
$j
<
$seqct
;
$j
++ ) {
}
}
}
Hide Show 12 lines of Pod
sub
D_TajimaNei{
my
(
$self
,
$aln
) =
@_
;
return
0
unless
$self
->_check_arg(
$aln
);
my
(
@seqs
,
@names
,
@values
,
%dist
);
my
$seqct
= 0;
foreach
my
$seq
(
$aln
->each_seq) {
push
@names
,
$seq
->display_id;
push
@seqs
,
uc
$seq
->seq();
$seqct
++;
}
my
$precisionstr
=
"%.$Precision"
.
"f"
;
my
(
$i
,
$j
,
$bs
);
for
(
$i
=0;
$i
<
$seqct
-1;
$i
++ ) {
$dist
{
$names
[
$i
]}->{
$names
[
$i
]} = [
$i
,
$i
];
$values
[
$i
][
$i
] =
sprintf
(
$precisionstr
,0);
for
(
$j
=
$i
+1;
$j
<
$seqct
;
$j
++ ) {
my
(
$matrix
,
$pfreq
,
$gaps
) =
$self
->_build_nt_matrix(
$seqs
[
$i
],
$seqs
[
$j
]);
my
$pairwise
=
$aln
->select_noncont(
$i
+1,
$j
+1);
my
$slen
=
$self
->pairwise_stats->number_of_comparable_bases(
$pairwise
);
my
$fij2
= 0;
for
(
$bs
= 0;
$bs
< 4;
$bs
++ ) {
my
$fi
= 0;
map
{
$fi
+=
$matrix
->[
$bs
]->[
$_
] } 0..3;
my
$fj
= 0;
map
{
$fj
+=
$matrix
->[
$_
]->[
$bs
] } 0..3;
my
$fij
= (
$fi
&&
$fj
) ? (
$fi
+
$fj
) /( 2 *
$slen
) : 0;
$fij2
+=
$fij
**2;
}
my
(
$pair
,
$h
) = (0,0);
for
(
$bs
= 0;
$bs
< 3;
$bs
++ ) {
for
(
my
$bs1
=
$bs
+1;
$bs1
<= 3;
$bs1
++ ) {
my
$fij
=
$pfreq
->[
$pair
++] /
$slen
;
if
(
$fij
) {
my
(
$ci1
,
$ci2
,
$cj1
,
$cj2
) = (0,0,0,0);
map
{
$ci1
+=
$matrix
->[
$_
]->[
$bs
] } 0..3;
map
{
$cj1
+=
$matrix
->[
$bs
]->[
$_
] } 0..3;
map
{
$ci2
+=
$matrix
->[
$_
]->[
$bs1
] } 0..3;
map
{
$cj2
+=
$matrix
->[
$bs1
]->[
$_
] } 0..3;
if
(
$fij
) {
$h
+= ( (
$fij
**2) / 2 ) /
( ( (
$ci1
+
$cj1
) / (2 *
$slen
) ) *
( (
$ci2
+
$cj2
) / (2 *
$slen
) )
);
}
$self
->debug(
"slen is $slen h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n"
);
}
}
}
my
$m
= (
$matrix
->[0]->[0] +
$matrix
->[1]->[1] +
$matrix
->[2]->[2] +
$matrix
->[3]->[3] );
my
$D
= 1 - (
$m
/
$slen
);
my
$d
;
if
(
$h
== 0 ) {
$d
= -1;
}
else
{
my
$b
= (1 -
$fij2
+ ((
$D
**2)/
$h
)) / 2;
my
$c
= 1-
$D
/
$b
;
if
(
$c
< 0 ) {
$d
= -1;
}
else
{
$d
= (-1 *
$b
) *
log
(
$c
);
}
}
$dist
{
$names
[
$i
]}->{
$names
[
$j
]} = [
$i
,
$j
];
$dist
{
$names
[
$j
]}->{
$names
[
$i
]} = [
$i
,
$j
];
$values
[
$j
][
$i
] =
$values
[
$i
][
$j
] =
sprintf
(
$precisionstr
,
$d
);
$dist
{
$names
[
$j
]}->{
$names
[
$j
]} = [
$j
,
$j
];
$values
[
$j
][
$j
] =
sprintf
(
$precisionstr
,0);
}
}
return
Bio::Matrix::PhylipDist->new(
-program
=>
'bioperl_DNAstats'
,
-matrix
=> \
%dist
,
-names
=> \
@names
,
-values
=> \
@values
);
}
Hide Show 12 lines of Pod
sub
D_JinNei{
my
(
$self
,
@args
) =
@_
;
$self
->
warn
(
"JinNei implementation not completed"
);
return
;
}
Hide Show 12 lines of Pod
sub
transversions{
my
(
$self
,
$aln
) =
@_
;
return
$self
->_trans_count_helper(
$aln
,
$DNAChanges
{
'Transversions'
});
}
Hide Show 11 lines of Pod
sub
transitions{
my
(
$self
,
$aln
) =
@_
;
return
$self
->_trans_count_helper(
$aln
,
$DNAChanges
{
'Transitions'
});
}
sub
_trans_count_helper {
my
(
$self
,
$aln
,
$type
) =
@_
;
return
0
unless
(
$self
->_check_arg(
$aln
) );
if
( !
$aln
->is_flush ) {
$self
->throw(
"must be flush"
) }
my
(
@tcount
);
my
(
$first
,
$second
) = (
uc
$aln
->get_seq_by_pos(1)->seq(),
uc
$aln
->get_seq_by_pos(2)->seq() );
my
$alen
=
$aln
->
length
;
for
(
my
$i
= 0;
$i
<
$alen
;
$i
++ ) {
my
(
$c1
,
$c2
) = (
substr
(
$first
,
$i
,1),
substr
(
$second
,
$i
,1) );
if
(
$c1
ne
$c2
) {
foreach
my
$nt
( @{
$type
->{
$c1
}} ) {
if
(
$nt
eq
$c2
) {
$tcount
[
$i
]++;
}
}
}
}
my
$sum
= 0;
map
{
if
(
$_
) {
$sum
+=
$_
} }
@tcount
;
return
$sum
;
}
sub
_build_nt_matrix {
my
(
$self
,
$seqa
,
$seqb
) =
@_
;
my
$basect_matrix
= [ [
qw(0 0 0 0)
],
[
qw(0 0 0 0)
],
[
qw(0 0 0 0)
],
[
qw(0 0 0 0)
] ];
my
$gaps
= 0;
my
$pfreq
= [
qw( 0 0 0 0 0 0)
];
my
$len_a
=
length
(
$seqa
);
for
(
my
$i
= 0;
$i
<
$len_a
;
$i
++) {
my
(
$ti
,
$tj
) = (
substr
(
$seqa
,
$i
,1),
substr
(
$seqb
,
$i
,1));
$ti
=~
tr
/U/T/;
$tj
=~
tr
/U/T/;
if
(
$ti
=~ /^
$GapChars
$/) {
$gaps
++;
next
; }
if
(
$tj
=~ /^
$GapChars
$/) {
$gaps
++;
next
}
my
$ti_index
=
$NucleotideIndexes
{
$ti
};
my
$tj_index
=
$NucleotideIndexes
{
$tj
};
if
( !
defined
$ti_index
) {
print
"ti_index not defined for $ti\n"
;
next
;
}
$basect_matrix
->[
$ti_index
]->[
$tj_index
]++;
if
(
$ti
ne
$tj
) {
$pfreq
->[
$NucleotideIndexes
{
join
(
''
,
sort
(
$ti
,
$tj
))}]++;
}
}
return
(
$basect_matrix
,
$pfreq
,
$gaps
);
}
sub
_check_ambiguity_nucleotide {
my
(
$base1
,
$base2
) =
@_
;
my
%iub
= Bio::Tools::IUPAC->iupac_iub();
my
@amb1
= @{
$iub
{
uc
(
$base1
)} };
my
@amb2
= @{
$iub
{
uc
(
$base2
)} };
my
(
$pmatch
) = (0);
for
my
$amb
(
@amb1
) {
if
(
grep
{
$amb
eq
$_
}
@amb2
) {
$pmatch
= 1;
last
;
}
}
if
(
$pmatch
) {
return
(1 /
scalar
@amb1
) * (1 /
scalar
@amb2
);
}
else
{
return
0;
}
}
sub
_check_arg {
my
(
$self
,
$aln
) =
@_
;
if
( !
defined
$aln
|| !
$aln
->isa(
'Bio::Align::AlignI'
) ) {
$self
->
warn
(
"Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics"
);
return
0;
}
elsif
(
$aln
->get_seq_by_pos(1)->alphabet ne
'dna'
) {
$self
->
warn
(
"Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a "
.
$aln
->get_seq_by_pos(1)->alphabet);
return
0;
}
return
1;
}
Hide Show 15 lines of Pod
sub
pairwise_stats{
my
(
$self
,
$value
) =
@_
;
if
(
defined
$value
) {
$self
->{
'_pairwise_stats'
} =
$value
;
}
return
$self
->{
'_pairwise_stats'
};
}
Hide Show 14 lines of Pod
sub
calc_KaKs_pair {
my
(
$self
,
$aln
,
$seq1_id
,
$seq2_id
) =
@_
;
$self
->throw(
"Needs 3 arguments - an alignment object, and 2 sequence ids"
)
if
@_
!= 4;
$self
->throw (
"This calculation needs a Bio::Align::AlignI compatible object, not a [ "
.
ref
(
$aln
) .
" ]object"
)
unless
$aln
->isa(
'Bio::Align::AlignI'
);
my
@seqs
= (
{
id
=>
$seq1_id
,
seq
=>
uc
((
$aln
->each_seq_with_id(
$seq1_id
))[0]->seq)},
{
id
=>
$seq2_id
,
seq
=>
uc
((
$aln
->each_seq_with_id(
$seq2_id
))[0]->seq)}
) ;
if
(
length
(
$seqs
[0]{
'seq'
}) !=
length
(
$seqs
[1]{
'seq'
})) {
$self
->throw(
" aligned sequences must be of equal length!"
);
}
my
$results
= [];
$self
->_get_av_ds_dn(\
@seqs
,
$results
);
return
$results
;
}
Hide Show 13 lines of Pod
sub
calc_all_KaKs_pairs {
my
(
$self
,
$aln
) =
@_
;
$self
->throw (
"This calculation needs a Bio::Align::AlignI compatible object, not a [ "
.
ref
(
$aln
) .
" ]object"
)
unless
$aln
->isa(
'Bio::Align::AlignI'
);
my
@seqs
;
for
my
$seq
(
$aln
->each_seq) {
push
@seqs
, {
id
=>
$seq
->display_id,
seq
=>
$seq
->seq};
}
my
$results
;
$results
=
$self
->_get_av_ds_dn(\
@seqs
,
$results
);
return
$results
;
}
Hide Show 13 lines of Pod
sub
calc_average_KaKs {
my
(
$self
,
$aln
,
$bootstrap_rpt
) =
@_
;
$bootstrap_rpt
||= 1000;
$self
->throw (
"This calculation needs a Bio::Align::AlignI compatible object, not a [ "
.
ref
(
$aln
) .
" ]object"
)
unless
$aln
->isa(
'Bio::Align::AlignI'
);
my
@seqs
;
for
my
$seq
(
$aln
->each_seq) {
push
@seqs
, {
id
=>
$seq
->display_id,
seq
=>
$seq
->seq};
}
my
$results
;
my
(
$ds_orig
,
$dn_orig
) =
$self
->_get_av_ds_dn(\
@seqs
);
$results
= {
D_s
=>
$ds_orig
,
D_n
=>
$dn_orig
};
$self
->_run_bootstrap(\
@seqs
,
$results
,
$bootstrap_rpt
);
return
$results
;
}
sub
_run_bootstrap {
my
(
$self
,
$seq_ref
,
$results
,
$bootstrap_rpt
) =
@_
;
my
@seqs
=
@$seq_ref
;
my
@btstrp_aoa
;
my
%bootstrap_values
= (
ds
=> [],
dn
=>[]);
my
$c
= 0;
while
(
$c
<
length
$seqs
[0]{
'seq'
}) {
for
(0..
$#seqs
) {
push
@{
$btstrp_aoa
[
$_
]},
substr
(
$seqs
[
$_
]{
'seq'
},
$c
, 3);
}
$c
+=3;
}
for
(1..
$bootstrap_rpt
) {
my
$sampled
= _resample (\
@btstrp_aoa
);
my
(
$ds
,
$dn
) =
$self
->_get_av_ds_dn (
$sampled
) ;
push
@{
$bootstrap_values
{
'ds'
}},
$ds
;
push
@{
$bootstrap_values
{
'dn'
}},
$dn
;
}
$results
->{
'D_s_var'
} = sampling_variance(
$bootstrap_values
{
'ds'
});
$results
->{
'D_n_var'
} = sampling_variance(
$bootstrap_values
{
'dn'
});
$results
->{
'z_score'
} = (
$results
->{
'D_n'
} -
$results
->{
'D_s'
}) /
sqrt
(
$results
->{
'D_s_var'
} +
$results
->{
'D_n_var'
} );
}
sub
_resample {
my
$ref
=
shift
;
my
$codon_num
=
scalar
(@{
$ref
->[0]});
my
@altered
;
for
(0..
$codon_num
-1) {
my
$rand
=
int
(
rand
(
$codon_num
));
for
(0..
$#$ref
) {
push
@{
$altered
[
$_
]},
$ref
->[
$_
][
$rand
];
}
}
my
@stringed
=
map
{
join
''
,
@$_
}
@altered
;
my
@return
;
for
(
@stringed
) {
push
@return
, {
id
=>
'1'
,
seq
=>
$_
};
}
return
\
@return
;
}
sub
_get_av_ds_dn {
my
$self
=
shift
;
my
$seq_ref
=
shift
;
my
$result
=
shift
if
@_
;
my
@caller
=
caller
(1);
my
@seqarray
=
@$seq_ref
;
my
$bootstrap_score_list
;
my
%dsfor_average
= (
ds
=> [],
dn
=> []);
for
(
my
$i
= 0;
$i
<
scalar
@seqarray
;
$i
++) {
for
(
my
$j
=
$i
+1;
$j
<
scalar
@seqarray
;
$j
++ ){
if
(
length
(
$seqarray
[
$i
]{
'seq'
}) !=
length
(
$seqarray
[
$j
]{
'seq'
})) {
$self
->
warn
(
" aligned sequences must be of equal length!"
);
next
;
}
my
$syn_site_count
= count_syn_sites(
$seqarray
[
$i
]{
'seq'
},
$synsites
);
my
$syn_site_count2
= count_syn_sites(
$seqarray
[
$j
]{
'seq'
},
$synsites
);
my
(
$syn_count
,
$non_syn_count
,
$gap_cnt
) = analyse_mutations(
$seqarray
[
$i
]{
'seq'
},
$seqarray
[
$j
]{
'seq'
});
my
$av_s_site
= (
$syn_site_count
+
$syn_site_count2
)/2;
my
$av_ns_syn_site
=
length
(
$seqarray
[
$i
]{
'seq'
}) -
$gap_cnt
-
$av_s_site
;
my
$syn_prop
=
$syn_count
/
$av_s_site
;
my
$nc_prop
=
$non_syn_count
/
$av_ns_syn_site
;
my
$d_syn
=
$self
->jk(
$syn_prop
);
my
$d_nc
=
$self
->jk(
$nc_prop
);
next
unless
$d_nc
>=0 &&
$d_syn
>=0;
push
@{
$dsfor_average
{
'ds'
}},
$d_syn
;
push
@{
$dsfor_average
{
'dn'
}},
$d_nc
;
if
(
$caller
[3] =~ /calc_KaKs_pair/ ||
$caller
[3] =~ /calc_all_KaKs_pairs/) {
my
$d_syn_var
= jk_var(
$syn_prop
,
length
(
$seqarray
[
$i
]{
'seq'
}) -
$gap_cnt
);
my
$d_nc_var
= jk_var(
$nc_prop
,
length
(
$seqarray
[
$i
]{
'seq'
}) -
$gap_cnt
);
my
$z
= (
$d_syn_var
+
$d_nc_var
) ?
(
$d_nc
-
$d_syn
) /
sqrt
(
$d_syn_var
+
$d_nc_var
) : 0;
push
@$result
, {
S
=>
$av_s_site
,
N
=>
$av_ns_syn_site
,
S_d
=>
$syn_count
,
N_d
=>
$non_syn_count
,
P_s
=>
$syn_prop
,
P_n
=>
$nc_prop
,
D_s
=> @{
$dsfor_average
{
'ds'
}}[-1],
D_n
=> @{
$dsfor_average
{
'dn'
}}[-1],
D_n_var
=>
$d_nc_var
,
D_s_var
=>
$d_syn_var
,
Seq1
=>
$seqarray
[
$i
]{
'id'
},
Seq2
=>
$seqarray
[
$j
]{
'id'
},
z_score
=>
$z
,
};
$self
->
warn
(
" number of mutations too small to justify normal test for $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA"
)
if
(
$syn_count
< 10 ||
$non_syn_count
< 10 ) &&
$self
->verbose > -1 ;
}
}
}
return
$result
if
$caller
[3]=~ /calc_all_KaKs/ ||
$caller
[3] =~ /calc_KaKs_pair/;
return
( mean (
$dsfor_average
{
'ds'
}),mean (
$dsfor_average
{
'dn'
})) ;
}
sub
jk {
my
(
$self
,
$p
) =
@_
;
if
(
$p
> 0.75) {
$self
->
warn
(
" Jukes Cantor won't work -too divergent!"
);
return
-1;
}
return
-1 * (3/4) * (
log
(1 - (4/3) *
$p
));
}
sub
jk_var {
my
(
$p
,
$n
) =
@_
;
return
(9 *
$p
* (1 -
$p
))/(((3 - 4
*$p
) **2) *
$n
);
}
sub
analyse_mutations {
my
(
$seq1
,
$seq2
) =
@_
;
my
%mutator
= (
2
=> {
0
=>[[1,2],
[2,1]],
1
=>[[0,2],
[2,0]],
2
=>[[0,1],
[1,0]],
},
3
=> [ [0,1,2],
[1,0,2],
[0,2,1],
[1,2,0],
[2,0,1],
[2,1,0] ],
);
my
$TOTAL
= 0;
my
$TOTAL_n
= 0;
my
$gap_cnt
= 0;
my
%input
;
my
$seqlen
=
length
(
$seq1
);
for
(
my
$j
=0;
$j
<
$seqlen
;
$j
+=3) {
$input
{
'cod1'
} =
substr
(
$seq1
,
$j
,3);
$input
{
'cod2'
} =
substr
(
$seq2
,
$j
,3);
if
(
$input
{
'cod1'
} =~ /\-/ ||
$input
{
'cod2'
} =~ /\-/){
$gap_cnt
+= 3;
next
;
}
my
(
$diff_cnt
,
$same
) = count_diffs(\
%input
);
next
if
$diff_cnt
== 0 ;
if
(
$diff_cnt
== 1) {
$TOTAL
+=
$synchanges
{
$input
{
'cod1'
}}{
$input
{
'cod2'
}};
$TOTAL_n
+= 1 -
$synchanges
{
$input
{
'cod1'
}}{
$input
{
'cod2'
}};
}
elsif
(
$diff_cnt
==2) {
my
$s_cnt
= 0;
my
$n_cnt
= 0;
my
$tot_muts
= 4;
OUTER:
for
my
$perm
(@{
$mutator
{
'2'
}{
$same
}}) {
my
$altered
=
$input
{
'cod1'
};
my
$prev
=
$altered
;
for
my
$mut_i
(
@$perm
) {
substr
(
$altered
,
$mut_i
,1) =
substr
(
$input
{
'cod2'
},
$mut_i
, 1);
if
(
$t
[
$CODONS
->{
$altered
}] eq
'*'
) {
$tot_muts
-=2;
next
OUTER;
}
else
{
$s_cnt
+=
$synchanges
{
$prev
}{
$altered
};
}
$prev
=
$altered
;
}
}
if
(
$tot_muts
!= 0) {
$TOTAL
+= (
$s_cnt
/(
$tot_muts
/2));
$TOTAL_n
+= (
$tot_muts
-
$s_cnt
)/ (
$tot_muts
/ 2);
}
}
elsif
(
$diff_cnt
==3 ) {
my
$s_cnt
= 0;
my
$n_cnt
= 0;
my
$tot_muts
= 18;
OUTER:
for
my
$perm
(@{
$mutator
{
'3'
}}) {
my
$altered
=
$input
{
'cod1'
};
my
$prev
=
$altered
;
for
my
$mut_i
(
@$perm
) {
substr
(
$altered
,
$mut_i
,1) =
substr
(
$input
{
'cod2'
},
$mut_i
, 1);
if
(
$t
[
$CODONS
->{
$altered
}] eq
'*'
) {
$tot_muts
-=3;
next
OUTER;
}
else
{
$s_cnt
+=
$synchanges
{
$prev
}{
$altered
};
}
$prev
=
$altered
;
}
}
if
(
$tot_muts
!= 0) {
$TOTAL
+= (
$s_cnt
/ (
$tot_muts
/3));
$TOTAL_n
+= 3 - (
$s_cnt
/ (
$tot_muts
/3));
}
}
}
return
(
$TOTAL
,
$TOTAL_n
,
$gap_cnt
);
}
sub
count_diffs {
my
$ref
=
shift
;
my
$cnt
= 0;
my
$same
=
undef
;
for
(0..2) {
if
(
substr
(
$ref
->{
'cod1'
},
$_
,1) ne
substr
(
$ref
->{
'cod2'
},
$_
, 1)){
$cnt
++;
}
else
{
$same
=
$_
;
}
}
return
(
$cnt
,
$same
);
}
Hide Show 19 lines of Pod
sub
get_syn_changes {
my
%results
;
my
@codons
= _make_codons ();
my
$arr_len
=
scalar
@codons
;
for
(
my
$i
= 0;
$i
<
$arr_len
-1;
$i
++) {
my
$cod1
=
$codons
[
$i
];
for
(
my
$j
=
$i
+1;
$j
<
$arr_len
;
$j
++) {
my
$diff_cnt
= 0;
for
my
$pos
(0..2) {
$diff_cnt
++
if
substr
(
$cod1
,
$pos
, 1) ne
substr
(
$codons
[
$j
],
$pos
, 1);
}
next
if
$diff_cnt
!=1;
if
(
$t
[
$CODONS
->{
$cod1
}] eq
$t
[
$CODONS
->{
$codons
[
$j
]}]) {
$results
{
$cod1
}{
$codons
[
$j
]} =1;
$results
{
$codons
[
$j
]}{
$cod1
} = 1;
}
elsif
(
$t
[
$CODONS
->{
$cod1
}] eq
'*'
or
$t
[
$CODONS
->{
$codons
[
$j
]}] eq
'*'
) {
$results
{
$cod1
}{
$codons
[
$j
]} = -1;
$results
{
$codons
[
$j
]}{
$cod1
} = -1;
}
else
{
$results
{
$cod1
}{
$codons
[
$j
]} = 0;
$results
{
$codons
[
$j
]}{
$cod1
} = 0;
}
}
}
return
%results
;
}
Hide Show 11 lines of Pod
sub
dnds_pattern_number{
my
(
$self
,
$aln
) =
@_
;
return
(
$aln
->remove_gaps->
length
)/3;
}
sub
count_syn_sites {
my
(
$seq
,
$synsite
) =
@_
;
__PACKAGE__->throw(
"not integral number of codons"
)
if
length
(
$seq
) % 3 != 0;
my
$S
= 0;
for
(
my
$i
= 0;
$i
<
length
(
$seq
);
$i
+=3) {
my
$cod
=
substr
(
$seq
,
$i
, 3);
next
if
$cod
=~ /\-/;
$S
+=
$synsite
->{
$cod
}{
's'
};
}
return
$S
;
}
sub
get_syn_sites {
my
@nucs
=
qw(T C A G)
;
my
%raw_results
;
for
my
$i
(
@nucs
) {
for
my
$j
(
@nucs
) {
for
my
$k
(
@nucs
) {
my
$cod
=
"$i$j$k"
;
my
$aa
=
$t
[
$CODONS
->{
$cod
}];
for
my
$i
(
qw(0 1 2)
){
my
$s
= 0;
my
$n
= 3;
for
my
$nuc
(
qw(A T C G)
) {
next
if
substr
(
$cod
,
$i
,1) eq
$nuc
;
my
$test
=
$cod
;
substr
(
$test
,
$i
, 1) =
$nuc
;
if
(
$t
[
$CODONS
->{
$test
}] eq
$aa
) {
$s
++;
}
if
(
$t
[
$CODONS
->{
$test
}] eq
'*'
) {
$n
--;
}
}
$raw_results
{
$cod
}[
$i
] = {
's'
=>
$s
,
'n'
=>
$n
};
}
}
}
}
my
%final_results
;
for
my
$cod
(
sort
keys
%raw_results
) {
my
$t
= 0;
map
{
$t
+= (
$_
->{
's'
} /
$_
->{
'n'
})} @{
$raw_results
{
$cod
}};
$final_results
{
$cod
} = {
's'
=>
$t
,
'n'
=> 3 -
$t
};
}
return
\
%final_results
;
}
sub
_make_codons {
my
@nucs
=
qw(T C A G)
;
my
@codons
;
for
my
$i
(
@nucs
) {
for
my
$j
(
@nucs
) {
for
my
$k
(
@nucs
) {
push
@codons
,
"$i$j$k"
;
}
}
}
return
@codons
;
}
sub
get_codons {
my
$x
= 0;
my
$CODONS
= {};
for
my
$codon
(_make_codons) {
$CODONS
->{
$codon
} =
$x
;
$x
++;
}
return
$CODONS
;
}
sub
mean {
my
$ref
=
shift
;
my
$el_num
=
scalar
@$ref
;
my
$tot
= 0;
map
{
$tot
+=
$_
}
@$ref
;
return
(
$tot
/
$el_num
);
}
sub
variance {
my
$ref
=
shift
;
my
$mean
= mean(
$ref
);
my
$sum_of_squares
= 0;
map
{
$sum_of_squares
+= (
$_
-
$mean
) **2}
@$ref
;
return
$sum_of_squares
;
}
sub
sampling_variance {
my
$ref
=
shift
;
return
variance(
$ref
) / (
scalar
@$ref
-1);
}
1;