#!/usr/bin/perl -w
use
Getopt::Long
qw(:config bundling require_order auto_version)
;
use
vars
qw($VERSION $DESC $NAME $COMMAND $DATE)
;
$VERSION
=
$FAST::VERSION
;
$DESC
=
"Calculate molecular population genetic statistics from DNA alignments.\n"
;
$NAME
= $0;
$NAME
=~ s/^.*\///;
$COMMAND
=
join
" "
,
$NAME
,
@ARGV
;
$DATE
= POSIX::strftime(
"%c"
,
localtime
());
use
constant
hlPI
=>
log
(4 *
atan2
1, 1)/2;
my
$def_format
=
$FAST::DEF_FORMAT
;
my
$def_logname
=
$FAST::DEF_LOGNAME
;
my
$man
=
undef
;
my
$help
=
undef
;
my
$moltype
=
undef
;
my
$format
=
$def_format
;
my
$log
=
undef
;
my
$logname
=
$def_logname
;
my
$comment
=
undef
;
my
$suppress
=
undef
;
my
$latex
=
undef
;
my
$pairwise
=
undef
;
my
$slide_window
=
undef
;
my
$upblue
=
undef
;
my
$absolute
=
undef
;
my
$label_input
=
undef
;
my
$L
;
my
$s
;
my
$S
;
my
$Theta_w
;
my
$a1
;
my
$a2
;
my
$e1
;
my
$e2
;
my
$Theta_w_per_site
;
my
$Num_alleles
;
my
$H
;
my
$nseq
;
my
$Exp_num_alleles
;
my
$Prob_allele_partition
;
my
$Pi
;
my
$pi
;
my
$eta_S
;
my
$eta
;
my
$Tajima_D
;
my
$FuLi_D_star
;
my
$FuLi_F_star
;
my
$anp1
;
my
$SE_pi_no_rec
;
my
$SE_pi_LE
;
my
$SE_Theta_w_no_recomb_per_site
;
my
$SE_Theta_w_LE_per_site
;
my
$TL
;
GetOptions(
'help|h'
=> \
$help
,
'man'
=> \
$man
,
'moltype|m=s'
=>
sub
{
my
(
undef
,
$val
) =
@_
;
die
"$NAME: --moltype or -m option argument must be \"dna\", \"rna\" or \"protein\""
unless
$val
=~ /dna|rna|protein/i;
$moltype
=
$val
;
},
'format=s'
=> \
$format
,
'log|l'
=> \
$log
,
'logname|L=s'
=> \
$logname
,
'comment|C=s'
=> \
$comment
,
'suppress|s'
=> \
$suppress
,
'latex|x'
=> \
$latex
,
'pairwise|p'
=> \
$pairwise
,
'window|w=s'
=>
sub
{
my
(
undef
,
$val
) =
@_
;
my
(
$ws
,
$ss
,
$stat
) =
split
/:/,
$val
;
die
"$NAME: --window or -w option expects string argument <window-size>:<step-size>:<statistic>\nin which <window-size> and <step-size> are positive integers\nand <statistic> is either \"p\", \"w\" or \"d\"\nselecting pi, Watterson's estimator per-site or Tajima's D respectively.\n"
unless
(
$ws
&&
$ws
=~ /^\d+$/ &&
$ws
> 0 &&
$ss
&&
$ws
=~ /^\d+$/ &&
$ss
> 0 &&
$stat
=~ /^[pwd]$/);
$slide_window
=
$val
;
},
'upblue|u'
=> \
$upblue
,
'absolute'
=> \
$absolute
,
'label=s'
=> \
$label_input
,
)
or pod2usage(2);
pod2usage(1)
if
$help
;
pod2usage(
-verbose
=> 2)
if
$man
;
my
$fromSTDIN
= ((-t STDIN) ? false : true);
pod2usage(
"$NAME: Requires at least one argument FILE [FILE2…FILEN] unless input from STDIN.\n"
)
if
(!(
$fromSTDIN
) && (
@ARGV
== 0));
pod2usage(
"$NAME: Requires exactly zero arguments if input is from STDIN.\n"
)
if
(
$fromSTDIN
&& (
@ARGV
!= 0));
&FAST::log
(
$logname
,
$DATE
,
$COMMAND
,
$comment
)
if
(
$log
);
my
$label
=
$label_input
||
' '
;
$^ =
'LATEX_TOP'
if
(
$latex
);
$^ =
'SILENT_TOP'
if
(
$suppress
);
$~ = (
$latex
?
'STDOUT_LATEX'
:
'STDOUT'
);
$^ =
'ABS_TOP'
if
(
$absolute
);
$~ =
'STDOUT_ABS'
if
(
$absolute
);
my
$IN
;
unless
(
@ARGV
) {
if
(
$moltype
) {
$IN
= FAST::Bio::AlignIO->new(
-fh
=>
*STDIN
{IO},
'-format'
=>
$def_format
,
'-alphabet'
=>
$moltype
);
}
else
{
$IN
= FAST::Bio::AlignIO->new(
-fh
=>
*STDIN
{IO},
'-format'
=>
$def_format
);
}
}
while
(
$IN
or
@ARGV
) {
if
(
@ARGV
) {
my
$file
=
shift
(
@ARGV
);
unless
(-e
$file
) {
warn
"$NAME: Could not find file $file. Skipping.\n"
;
next
;
}
elsif
(
$moltype
) {
$IN
= FAST::Bio::AlignIO->new(
-file
=>
$file
,
'-format'
=>
$def_format
,
'-alphabet'
=>
$moltype
);
}
else
{
$IN
= FAST::Bio::AlignIO->new(
-file
=>
$file
,
'-format'
=>
$def_format
);
}
}
if
(
$IN
) {
while
(
my
$aln
=
$IN
->next_aln){
my
@seqs
=
$aln
->each_seq();
my
$ua
= FAST::Bio::UnivAln->new(
'-seqs'
=> [
$aln
->each_seq()]);
$TL
=
$aln
->
length
();
$nseq
= (
scalar
(
@seqs
));
$a1
=
$a2
= 0;
for
my
$i
(1..(
$nseq
- 1)) {
$a1
+= (1 /
$i
);
$a2
+= (1 / (
$i
** 2));
}
my
$b1
= (
$nseq
+ 1) / (3 * (
$nseq
-1));
my
$b2
= 2 * ((
$nseq
** 2) +
$nseq
+ 3) / (9 *
$nseq
* (
$nseq
- 1));
my
$c1
= (
$b1
- (1 /
$a1
));
my
$c2
=
$b2
- ((
$nseq
+ 2) / (
$a1
*
$nseq
)) + (
$a2
/ (
$a1
** 2));
$e1
=
$c1
/
$a1
;
$e2
=
$c2
/ (
$a1
** 2 +
$a2
);
if
(
$slide_window
) {
my
(
$win_sz
,
$step_sz
,
$output
) =
split
/:/,
$slide_window
;
my
$gfi
=
$ua
->gap_free_inds();
my
$inds
=
$gfi
;
if
(
$pairwise
) {
$output
=
'p'
;
my
@ids
= @{
$ua
->row_ids() };
for
my
$i
(0..
$#ids
) {
for
my
$j
((
$i
+1)..
$#ids
) {
printf
"%10s "
,
$ids
[
$i
];
}
}
print
"\n"
;
for
my
$i
(0..
$#ids
) {
for
my
$j
((
$i
+1)..
$#ids
) {
printf
"%10s "
,
$ids
[
$j
];
}
}
print
"\n"
;
}
for
(
my
$b
= 0; (
$b
+
$win_sz
- 1) <
$#$inds
;
$b
+=
$step_sz
) {
my
$first
=
$$inds
[
$b
];
my
$last
=
$$inds
[(
$b
+
$win_sz
- 1)];
my
$midpoint
=
$first
+ ((
$last
-
$first
) / 2);
my
$slice
= [
@$inds
[
$b
..(
$b
+
$win_sz
-1)]];
my
$sz
=
scalar
(
@$slice
);
warn
"ERROR: window size is $sz instead of $win_sz.\n"
unless
(
$sz
==
$win_sz
);
unless
(
$pairwise
) {
my
$subaln
= new FAST::Bio::UnivAln(
-seqs
=>
scalar
(
$ua
->seqs([],
$slice
)));
my
$return_val
= calculate(
$subaln
,
$output
);
printf
"%d-%d %d %7.6f\n"
,
$first
,
$last
,
$midpoint
,
$return_val
;
}
else
{
printf
"%d "
,
$midpoint
;
for
my
$i
(1..
$nseq
) {
for
my
$j
((
$i
+1)..
$nseq
) {
my
$subaln
= new FAST::Bio::UnivAln(
-seqs
=>
scalar
(
$ua
->seqs([
$i
,
$j
],
$slice
)));
my
$return_val
= calculate(
$subaln
,
$output
);
printf
"%10.6f "
,
$return_val
;
}
}
print
"\n"
;
}
}
}
elsif
(
$pairwise
) {
my
$output
=
'p'
;
my
@ids
= @{
$ua
->row_ids() };
print
(
' '
x 11);
for
my
$i
(0..(
$#ids
-1)) {
print
(
$latex
?
'& '
:
' '
);
printf
"%10s "
,
$ids
[
$i
];
}
print
(
$latex
?
"\\\\\n"
:
"\n"
);
for
my
$i
(1..
$nseq
) {
printf
"%10s "
,
$ids
[(
$i
-1)];
for
my
$j
(1..(
$i
-1)) {
print
(
$latex
?
'& '
:
' '
);
my
$pair
= new FAST::Bio::UnivAln(
-seqs
=>
scalar
(
$ua
->seqs([
$i
,
$j
],[])));
my
$return_val
=
&calculate
(
$pair
,
$output
);
printf
"%10.6f "
,
$return_val
;
}
if
(
$latex
) {
for
my
$j
(
$i
..(
$nseq
-1)) {
print
"&"
,(
' '
x 12);
}
}
print
(
$latex
?
" \\\\\n"
:
"\n"
);
}
}
elsif
(
$upblue
) {
print
$nseq
,
"\n"
;
for
my
$i
(1..
$nseq
) {
for
my
$j
((
$i
+1)..
$nseq
) {
my
$pair
= new FAST::Bio::UnivAln(
-seqs
=>
scalar
(
$ua
->seqs([
$i
,
$j
],[])));
print
join
' '
,
$i
,
$j
;
my
$return_val
=
&calculate
(
$pair
,
'p'
);
}
}
}
else
{
&calculate
(
$ua
);
}
}
undef
$IN
;
}
}
sub
get_pattern {
my
@chars
= @{
$_
[0] };
my
$count
= {};
map
{
$$count
{
$_
}++}
@chars
;
return
$count
;
}
sub
Ramanujan_logfact{
my
$lf
;
my
$n
=
shift
;
$lf
= (
$n
*
log
$n
) -
$n
;
$lf
+=
log
(
$n
+ (4 *
$n
** 2) + (8 *
$n
** 3)) / 6;
$lf
+= hlPI;
}
sub
calculate {
my
$ua
=
shift
;
my
$output
=
shift
;
if
(
$output
&&
$output
!~ /[pwd]/) {
die
"$NAME: Error, output (third parameter to -w) must be equal to \'p\', \'w\', or \'d\´.\n"
;
}
my
$gfs
=
$ua
->gap_free_sites();
my
$gf
= FAST::Bio::UnivAln->new(
'-seqs'
=>
$gfs
);
$L
=
$gf
->width();
my
$nseqs
=
$gf
->height();
my
$vs
=
$gf
->var_sites();
my
$v
= FAST::Bio::UnivAln->new(
'-seqs'
=>
$vs
);
$S
=
$v
->width();
$s
=
$S
/ (
$L
|| 1);
$Theta_w
=
$S
/
$a1
;
$Theta_w_per_site
=
$Theta_w
/ (
$L
|| 1);
return
$Theta_w_per_site
if
(
$output
&&
$output
eq
'w'
);
print
join
' '
,
''
,
$S
,
"\n"
if
(
$upblue
);
my
@seqs
=
split
/\n/,
$vs
;
my
%alleles
;
for
my
$i
(0..
$#seqs
) {
$alleles
{
$seqs
[
$i
]}++;
}
$Num_alleles
=
scalar
keys
%alleles
;
$H
= 1;
my
%a
;
map
{
$H
-= ((
$_
/
$nseqs
) ** 2)}
values
%alleles
;
if
(!
$slide_window
and !
$pairwise
and
$Theta_w
> 0) {
$Exp_num_alleles
= 1 /
$Theta_w
;
for
my
$i
(1 .. (
$#seqs
)) {
$Exp_num_alleles
+= 1 / (
$Theta_w
+
$i
);
}
$Exp_num_alleles
*=
$Theta_w
;
map
{
$a
{
$_
}++}
values
%alleles
;
my
$log_prob
=
$Num_alleles
*
log
(
$Theta_w
);
$log_prob
+= Ramanujan_logfact(
$Num_alleles
);
foreach
my
$i
(0 .. (
$nseqs
- 1)) {
$log_prob
-=
log
(
$Theta_w
+
$i
);
}
foreach
(
keys
%a
) {
$log_prob
-= (
$a
{
$_
} *
log
(
$_
)) + Ramanujan_logfact(
$a
{
$_
});
}
$Prob_allele_partition
=
exp
$log_prob
;
}
else
{
$Exp_num_alleles
= 1;
$Prob_allele_partition
= 1;
}
undef
%alleles
;
$Pi
= 0;
$eta_S
= 0;
$eta
= 0;
my
@pats
=
$v
->map_c(\
&get_pattern
);
foreach
my
$pat
(
@pats
) {
$eta_S
+=
scalar
grep
{
$_
== 1}
values
%$pat
;
$eta
+= (
scalar
values
%$pat
) - 1;
my
$F
= 0;
map
{
$F
+= ((
$_
/
$nseqs
) ** 2)}
values
%$pat
;
$Pi
+= (1 -
$F
);
}
$Pi
and
$Pi
*= (
$nseqs
/ (
$nseqs
- 1));
$pi
=
$Pi
/ (
$L
|| 1);
return
$pi
if
(
$output
&&
$output
eq
'p'
);
my
$Tajima_D_num
=
$Pi
-
$Theta_w
;
my
$Tajima_D_den
=
sqrt
( (
$e1
*
$S
) + (
$e2
*
$S
* (
$S
- 1)));
$Tajima_D
=
$Tajima_D_num
/ (
$Tajima_D_den
|| 1);
return
$Tajima_D
if
(
$output
&&
$output
eq
'd'
);
if
(
$nseq
> 2) {
my
$cn
= (
$nseq
== 2 ? 1 : (2 * (((
$nseq
*
$a1
) - (2 * (
$nseq
- 1)))
/ ((
$nseq
- 1) * (
$nseq
- 2)))));
$anp1
=
$a1
+ (1 / (
$nseq
));
my
$dn
= (
$cn
+
((
$nseq
- 2) / ((
$nseq
- 1) ** 2)) +
((2 / (
$nseq
- 1)) *
(3/2 - (((2 *
$anp1
) - 3) / (
$nseq
- 2)) - (1 /
$nseq
))
)
);
my
$v_D_st
= ((((
$nseq
/ (
$nseq
- 1)) ** 2) *
$a2
) +
((
$a1
** 2) *
$dn
) -
((2 *
$nseq
*
$a1
* (
$a1
+ 1)) / ((
$nseq
- 1) ** 2)))
/
( (
$a1
** 2) +
$a2
);
my
$u_D_st
= ((
$nseq
/ (
$nseq
- 1)) * (
$a1
- (
$nseq
/ (
$nseq
- 1)))) -
$v_D_st
;
my
$FuLi_D_star_num
= (
$eta
*
$nseqs
/ (
$nseqs
- 1)) - (
$eta_S
*
$a1
);
my
$FuLi_D_star_den
=
sqrt
( (
$u_D_st
*
$eta
) + (
$v_D_st
* (
$eta
** 2)));
$FuLi_D_star
=
$FuLi_D_star_num
/ (
$FuLi_D_star_den
|| 1);
}
else
{
$FuLi_D_star
=
"NA"
;
}
my
$v_F_st
= ( ((2 * (
$nseq
** 3) + (110 * (
$nseq
** 2)) - (255 *
$nseq
) + 153)
/ (9 * (
$nseq
** 2) * (
$nseq
- 1)))
+ ((2 * (
$nseq
- 1) *
$a1
) / (
$nseq
** 2))
- (8 *
$a2
/
$nseq
)
)
/ ((
$a1
** 2) +
$a2
);
my
$u_F_st
= ((((4 * (
$nseq
** 2))
+ (19 *
$nseq
)
+ 3
- (12 * (
$nseq
+ 1) *
$anp1
)
)
/
(3 *
$nseq
* (
$nseq
- 1)))
/
$a1
)
-
$v_F_st
;
my
$FuLi_F_star_num
=
$Pi
- (
$eta_S
* (
$nseq
- 1) /
$nseq
);
my
$FuLi_F_star_den
=
sqrt
( (
$u_F_st
*
$eta
) + (
$v_F_st
* (
$eta
** 2)));
$FuLi_F_star
=
$FuLi_F_star_num
/ (
$FuLi_F_star_den
|| 1);
my
$Var_Pi_denom
= ((11 * (
$nseqs
** 2)) - (7 *
$nseqs
) + 6);
my
$Total_Variance_Pi_no_rec
= ((3 *
$nseqs
* (
$nseqs
+ 1) *
$Pi
) + (2 * ((
$nseqs
** 2) +
$nseqs
+ 3) * (
$Pi
** 2))) /
$Var_Pi_denom
;
my
$SE_Pi_no_rec
= (
sqrt
$Total_Variance_Pi_no_rec
);
$SE_pi_no_rec
=
$SE_Pi_no_rec
/ (
$L
|| 1);
my
$Total_Variance_Pi_LE
=
$Pi
* (
$nseqs
+ 1) / (3 * (
$nseqs
- 1));
my
$SE_Pi_LE
= (
sqrt
$Total_Variance_Pi_LE
);
$SE_pi_LE
=
$SE_Pi_LE
/ (
$L
|| 1);
my
$Variance_Theta_w_no_recomb
= (((
$a1
** 2) *
$S
) + (
$a2
* (
$S
** 2))) / ((
$a1
** 4) + ((
$a1
** 2) *
$a2
));
my
$Variance_Theta_w_LE
=
$S
/ (
$a1
** 2);
$SE_Theta_w_no_recomb_per_site
= (
sqrt
$Variance_Theta_w_no_recomb
) / (
$L
|| 1);
$SE_Theta_w_LE_per_site
= (
sqrt
$Variance_Theta_w_LE
) / (
$L
|| 1);
write
;
}
format
STDOUT_TOP =
%
% 0.n 0.k 0.H 0.E(k|Th_w) 0.Pr(ai..ak) 1.L 2.L(gf) 3.S 4.s 5.Th_w_ps 6.SE_Thwps_LD 7.SE_Thwps_LE 8.pi 9.SE_pi_LD 10.SE_pi_LE 11.Tajim_D 12.FuLi_D* 13.FuLi_F* 14.label
% --- --- ---- ----------- ----------- ----- ----- ----- ------- -------- ------------- ------------- -------- ----------- ---------- ---------- ---------- ---------- --------
.
format
STDOUT =
@>> @>> @.
$nseq
,
$Num_alleles
,
$H
,
$Exp_num_alleles
,
$Prob_allele_partition
,
$TL
,
$L
,
$S
,
$s
,
$Theta_w_per_site
,
$SE_Theta_w_no_recomb_per_site
,
$SE_Theta_w_LE_per_site
,
$pi
,
$SE_pi_no_rec
,
$SE_pi_LE
,
$Tajima_D
,
$FuLi_D_star
,
$FuLi_F_star
,
$label
.
format
ABS_TOP =
0.n 1.k 2.Len 3.S 4.Th_W 5. Pi 6.EtaS 7.Eta 8.label
--- --- ----- ----- ------- -------- ----- ----- -------
.
format
STDOUT_ABS =
@>> @>> @>>>> @>>>> @
$nseq
,
$Num_alleles
,
$L
,
$S
,
$Theta_w
,
$Pi
,
$eta_S
,
$eta
,
$label
.
format
STDOUT_LATEX =
@<<<<<< & @>> & @>>>> (@>>>>) & @>>>> & @
$label
,
$nseq
,
$TL
,
$L
,
$S
,
$s
,
$Theta_w_per_site
,
$SE_Theta_w_no_recomb_per_site
,
$pi
,
$SE_pi_no_rec
,
$Tajima_D
.
format
LATEX_TOP =
\documentclass[12pt,letterpaper,oneside]{article}
\begin{document}
\begin{table}[h]
\begin{center}
\begin{tabular}{lccccccc}
\hline
& n & Len (gf) & S & s & $\hat{\theta}_W \pm SE$ & $\hat{pi} \pm SE$ & Tajima's D \\
\hline
.
format
SILENT_TOP =
.
Hide Show 234 lines of Pod