#!/usr/bin/perl -w
use POSIX;
use Getopt::Long qw(:config bundling require_order auto_version);
use FAST;
use strict;
use vars qw($VERSION $DESC $NAME $COMMAND $DATE);
$VERSION = $FAST::VERSION;
$DESC = "Calculate molecular population genetic statistics from DNA alignments.\n";
$NAME = $0;
$NAME =~ s/^.*\///;
$COMMAND = join " ",$NAME,@ARGV;
$DATE = POSIX::strftime("%c",localtime());
use constant { true => 1, false => 0 };
use constant hlPI => log(4 * atan2 1, 1)/2;
## DEFAULT OPTION VALUES
my $def_format = $FAST::DEF_FORMAT; #7/1/13 "fasta";
my $def_logname = $FAST::DEF_LOGNAME; #7/1/13 "FAST.log.txt";
## OPTION VARIABLES
my $man = undef; # --man
my $help = undef; # -h
my $moltype = undef; # -m, in case bioperl can't tell
my $format = $def_format; # --format
my $log = undef; # -l
my $logname = $def_logname; # -L
my $comment = undef; # -C
my $suppress = undef; #-s
my $latex = undef; #-x
my $pairwise = undef;
my $slide_window = undef; #-w
my $upblue = undef; #-u
my $absolute = undef; #--absolute
my $label_input = undef;
my $L;
my $s;
my $S;
my $Theta_w;
my $a1;
my $a2;
my $e1;
my $e2;
my $Theta_w_per_site;
my $Num_alleles;
my $H;
my $nseq;
my $Exp_num_alleles;
my $Prob_allele_partition;
my $Pi;
my $pi;
my $eta_S;
my $eta;
my $Tajima_D;
my $FuLi_D_star;
my $FuLi_F_star;
my $anp1;
my $SE_pi_no_rec;
my $SE_pi_LE;
my $SE_Theta_w_no_recomb_per_site;
my $SE_Theta_w_LE_per_site;
my $TL;
GetOptions('help|h' => \$help,
'man' => \$man,
'moltype|m=s' => sub{ my (undef,$val) = @_;
die "$NAME: --moltype or -m option argument must be \"dna\", \"rna\" or \"protein\""
unless $val =~ /dna|rna|protein/i;
$moltype = $val;
},
'format=s' => \$format,
'log|l' => \$log,
'logname|L=s' => \$logname,
'comment|C=s' => \$comment,
'suppress|s' => \$suppress,
'latex|x' => \$latex,
'pairwise|p' => \$pairwise,
'window|w=s' => sub{ my (undef,$val) = @_;
my ($ws,$ss,$stat) = split /:/,$val;
die "$NAME: --window or -w option expects string argument <window-size>:<step-size>:<statistic>\nin which <window-size> and <step-size> are positive integers\nand <statistic> is either \"p\", \"w\" or \"d\"\nselecting pi, Watterson's estimator per-site or Tajima's D respectively.\n"
unless ($ws && $ws =~ /^\d+$/ && $ws > 0 && $ss && $ws =~ /^\d+$/ && $ss > 0 && $stat =~ /^[pwd]$/);
$slide_window = $val;
},
'upblue|u' => \$upblue,
'absolute' => \$absolute,
'label=s' => \$label_input,
)
or pod2usage(2);
pod2usage(1) if $help;
pod2usage(-verbose => 2) if $man;
my $fromSTDIN = ((-t STDIN) ? false : true);
pod2usage("$NAME: Requires at least one argument FILE [FILE2…FILEN] unless input from STDIN.\n") if (!($fromSTDIN) && (@ARGV == 0));
pod2usage("$NAME: Requires exactly zero arguments if input is from STDIN.\n") if ($fromSTDIN && (@ARGV != 0));
&FAST::log($logname, $DATE, $COMMAND, $comment) if ($log);
my $label = $label_input || ' ';
$^ = 'LATEX_TOP' if ($latex);
$^ = 'SILENT_TOP' if ($suppress);
$~ = ($latex ? 'STDOUT_LATEX' : 'STDOUT' );
$^ = 'ABS_TOP' if ($absolute);
$~ = 'STDOUT_ABS' if ($absolute);
my $IN;
unless (@ARGV) {
if ($moltype) {
$IN = FAST::Bio::AlignIO->new(-fh => *STDIN{IO}, '-format' => $def_format, '-alphabet' => $moltype);
}
else {
$IN = FAST::Bio::AlignIO->new(-fh => *STDIN{IO}, '-format' => $def_format);
}
}
while ($IN or @ARGV) {
if (@ARGV) {
my $file = shift (@ARGV);
unless (-e $file) {
warn "$NAME: Could not find file $file. Skipping.\n";
next;
}
elsif ($moltype) {
$IN = FAST::Bio::AlignIO->new(-file => $file, '-format' => $def_format, '-alphabet' => $moltype);
}
else {
$IN = FAST::Bio::AlignIO->new(-file => $file, '-format' => $def_format);
}
}
if ($IN) {
while ( my $aln = $IN->next_aln){
my @seqs = $aln->each_seq();
my $ua = FAST::Bio::UnivAln->new('-seqs' => [$aln->each_seq()]);
$TL = $aln->length();
## CALCULATION OF NEEDED PARAMETERS THAT DEPEND ON NUMBER OF SEQS
$nseq = (scalar (@seqs));
## $nseq_stat = ($opt_p ? 2 : scalar (@seqs));
$a1 = $a2 = 0;
for my $i (1..($nseq - 1)) {
$a1 += (1 / $i);
$a2 += (1 / ($i ** 2)); ## a2 = bn in Fu Li 93
}
my $b1 = ($nseq + 1) / (3 * ($nseq -1));
my $b2 = 2 * (($nseq ** 2) + $nseq + 3) / (9 * $nseq * ($nseq - 1));
my $c1 = ($b1 - (1 / $a1));
my $c2 = $b2 - (($nseq + 2) / ($a1 * $nseq)) + ($a2 / ($a1 ** 2));
$e1 = $c1 / $a1;
$e2 = $c2 / ($a1 ** 2 + $a2);
if ($slide_window) { ## SLIDING-WINDOW MODE
my ($win_sz,$step_sz,$output) = split /:/,$slide_window;
my $gfi = $ua->gap_free_inds(); ## IGNORE GAP-FREE SITES FROM WHOLE ALIGNMENT
my $inds = $gfi;
if ($pairwise) { ## PAIRWISE-PI-WINDOW MODE
$output = 'p'; ## FORCE OUTPUT TO BE PI
## OUTPUT PAIRWISE-MODE HEADER
my @ids = @{ $ua->row_ids() };
for my $i (0..$#ids) {
for my $j (($i+1)..$#ids) {
printf "%10s ",$ids[$i];
}
}
print "\n";
for my $i (0..$#ids) {
for my $j (($i+1)..$#ids) {
printf "%10s ",$ids[$j];
}
}
print "\n";
} ## END PAIRWISE-MODE HEADER
## BEGIN SLIDING WINDOWS
for (my $b = 0; ($b + $win_sz - 1) < $#$inds; $b += $step_sz) {
my $first = $$inds[$b];
my $last = $$inds[($b + $win_sz - 1)];
my $midpoint = $first + (($last - $first) / 2);
my $slice = [ @$inds[$b..($b+$win_sz-1)]];
my $sz = scalar(@$slice);
warn "ERROR: window size is $sz instead of $win_sz.\n" unless ($sz == $win_sz);
unless ($pairwise) {
my $subaln = new FAST::Bio::UnivAln(-seqs=>scalar($ua->seqs([],$slice)));
my $return_val = calculate($subaln,$output);
printf "%d-%d %d %7.6f\n",$first,$last,$midpoint, $return_val;
}
else { ##PAIRWISE MODE
printf "%d ",$midpoint;
for my $i (1..$nseq) {
for my $j (($i+1)..$nseq) {
my $subaln = new FAST::Bio::UnivAln(-seqs=>scalar($ua->seqs([$i,$j],$slice)));
my $return_val = calculate($subaln,$output);
printf "%10.6f ",$return_val;
}
}
print "\n";
}
}
}
elsif ($pairwise) { ## PAIRWISE-MATRIX-MODE
## WE PUT OPT_U HERE TO MAXIMIZE SEGSITES BETWEEN PAIRS
## INSTEAD OF REMOVING ALL SITES CONTAINING A GAP IN ANY
## ALLELE
my $output = 'p'; ## output is 'pi', modifies behavior of &calculate
my @ids = @{ $ua->row_ids() };
## PRINT HEADER
print (' ' x 11);
for my $i (0..($#ids-1)) {
print ($latex ? '& ' : ' '); ## LATEX MODE
printf "%10s ",$ids[$i];
}
print ($latex ? "\\\\\n" : "\n");
## PRINT MATRIX
for my $i (1..$nseq) {
printf "%10s ",$ids[($i-1)];
for my $j (1..($i-1)) {
print ($latex ? '& ' : ' '); ## LATEX MODE
my $pair = new FAST::Bio::UnivAln(-seqs=>scalar($ua->seqs([$i,$j],[])));
my $return_val = &calculate($pair,$output);
printf "%10.6f ",$return_val;
}
if ($latex) {
for my $j ($i..($nseq-1)) {
print "&",(' ' x 12);
}
}
print ($latex ? " \\\\\n" : "\n");
}
}
elsif ($upblue) {
print $nseq,"\n";
for my $i (1..$nseq) {
for my $j (($i+1)..$nseq) {
my $pair = new FAST::Bio::UnivAln(-seqs=>scalar($ua->seqs([$i,$j],[])));
print join ' ',$i,$j;
my $return_val = &calculate($pair,'p');
}
}
}
else {
&calculate($ua);
}
}
undef $IN;
}
}
## THIS FUNCTION WILL BE MAPPED OVER ALIGNMENT COLUMNS TO COUNT STATES
sub get_pattern {
my @chars = @{ $_[0] };
my $count = {};
map {$$count{$_}++} @chars;
return $count;
}
## THIS APPROXIMATION FOR THE LOG FACTORIAL IS DUE TO RAMANUJAN TAKEN FROM WIKIPEDIA ON MARCH 13 2009
## NEEDS TO BE VERIFIED AND COMPARED TO THE MATH::GSL::SF AND BIGFLOAT FUNCTIONS
## INPUT: INTEGER, FRACTIONAL PART WILL BE FLOORED AWAY
sub Ramanujan_logfact{
my $lf;
my $n = shift;
$lf = ($n * log $n) - $n;
$lf += log ($n + (4 * $n ** 2) + (8 * $n ** 3)) / 6;
$lf += hlPI; #this constant is half-log PI defined at the top
}
sub calculate {
my $ua = shift;
my $output = shift;
if ($output && $output !~ /[pwd]/) {
die "$NAME: Error, output (third parameter to -w) must be equal to \'p\', \'w\', or \'d\´.\n";
}
my $gfs = $ua->gap_free_sites();
my $gf = FAST::Bio::UnivAln->new('-seqs' => $gfs);
$L = $gf->width(); ## NOT my BECAUSE IT IS PRINTED GLOBAL
my $nseqs = $gf->height(); ## LOCAL DISTINCT VARIABLE BECAUSE OF PAIRWISE MODE
my $vs = $gf->var_sites();
my $v = FAST::Bio::UnivAln->new('-seqs' => $vs);
$S = $v->width();
$s = $S / ($L || 1);
$Theta_w = $S / $a1;
$Theta_w_per_site = $Theta_w / ($L || 1);
return $Theta_w_per_site if ($output && $output eq 'w');
print join ' ','',$S,"\n" if ($upblue);
## NEW IMPLEMENTATION $opt_u NOT TESTED; REQUIRES PRINTING NUMBER PW DIFF EACH PAIR OF SEQS!?
my @seqs = split /\n/,$vs; ## NEWLINES GO AWAY
my %alleles;
for my $i (0..$#seqs) {
$alleles{$seqs[$i]}++; ## FOR THE ALLELE CONFIGURATION
}
## COMPUTE PROBABILITY OF ALLELIC CONFIGURATION USING KARLIN MACGREGOR 1972
$Num_alleles = scalar keys %alleles;
$H = 1;
my %a;
map {$H -= (($_/$nseqs) ** 2)} values %alleles;
if (!$slide_window and !$pairwise and $Theta_w > 0) {
$Exp_num_alleles = 1 / $Theta_w;
for my $i (1 .. ($#seqs )) {
$Exp_num_alleles += 1 / ($Theta_w + $i);
}
$Exp_num_alleles *= $Theta_w; ## Ewen's Sampling formula
map {$a{$_}++} values %alleles; ## THIS MAKES THE a_i
my $log_prob = $Num_alleles * log ($Theta_w);
# $log_prob += Math::Gsl::Sf::sf_lnfact($Num_alleles);
$log_prob += Ramanujan_logfact($Num_alleles);
foreach my $i (0 .. ($nseqs - 1)) {
$log_prob -= log ($Theta_w + $i);
}
foreach (keys %a) {
# $log_prob -= ($a{$_} * log ($_)) + Math::Gsl::Sf::sf_lnfact($a{$_});
$log_prob -= ($a{$_} * log ($_)) + Ramanujan_logfact($a{$_});
}
$Prob_allele_partition = exp $log_prob;
}
else {
$Exp_num_alleles = 1;
$Prob_allele_partition = 1;
}
undef %alleles;
## end COMPUTE PROBABILITY OF ALLELIC CONFIGURATION USING KARLIN MACGREGOR 1972
$Pi = 0;
$eta_S = 0; ## FOR fu li D* and F*
$eta = 0;
my @pats = $v->map_c(\&get_pattern);
foreach my $pat (@pats) {
$eta_S += scalar grep {$_ == 1} values %$pat;
$eta += (scalar values %$pat) - 1;
my $F = 0;
map {$F += (($_ / $nseqs) ** 2)} values %$pat;
$Pi += (1 - $F);
}
$Pi and $Pi *= ($nseqs / ($nseqs - 1)); ## for stochastic and sampling variance see Tajima 93
## USED TO BE $nseqs_stat -- for pairwise pi mode. Why?
$pi = $Pi / ($L || 1); ## nuc diversity
return $pi if ($output && $output eq 'p');
my $Tajima_D_num = $Pi - $Theta_w;
my $Tajima_D_den = sqrt ( ($e1 * $S) + ($e2 * $S * ($S - 1)));
$Tajima_D = $Tajima_D_num / ($Tajima_D_den || 1);
return $Tajima_D if ($output && $output eq 'd');
if ($nseq > 2) {
my $cn = ($nseq == 2 ? 1 : (2 * ((($nseq * $a1) - (2 * ($nseq - 1)))
/ (($nseq - 1) * ($nseq - 2))))); ## FU LI eq 14. p 695
$anp1 = $a1 + (1 / ($nseq));
my $dn = ($cn +
(($nseq - 2) / (($nseq - 1) ** 2)) +
((2 / ($nseq - 1)) *
(3/2 - (((2 * $anp1) - 3) / ($nseq - 2)) - (1 / $nseq))
)
);
my $v_D_st = (((($nseq / ($nseq - 1)) ** 2) * $a2) +
(($a1 ** 2) * $dn) -
((2 * $nseq * $a1 * ($a1 + 1)) / (($nseq - 1) ** 2)))
/
( ($a1 ** 2) + $a2 );
my $u_D_st = (($nseq / ($nseq - 1)) * ($a1 - ($nseq / ($nseq - 1)))) - $v_D_st;
my $FuLi_D_star_num = ($eta * $nseqs / ($nseqs - 1)) - ($eta_S * $a1);
my $FuLi_D_star_den = sqrt ( ($u_D_st * $eta) + ($v_D_st * ($eta ** 2)));
$FuLi_D_star = $FuLi_D_star_num / ($FuLi_D_star_den || 1);
}
else {
$FuLi_D_star = "NA";
}
## THE BELOW EXPRESSION IS FROM FU LI 93
# $v_F_st = ( $dn + (((2 * (($nseq ** 2) + $nseq + 3))) / (9 * $nseq * ($nseq - 1)))
# - (2 / ($nseq - 1) * ((4 * $a2) - 6 + (8 / $nseq))))
# /
# ( ($a1 ** 2) + $a2 );
# $u_F_st = ((($nseq / ($nseq - 1)) + (($nseq + 1 ) / (3 * ($nseq - 1)))
# - (4 / $nseq / ($nseq - 1))
# + (2 * ($nseq + 1)
# / (($nseq - 1) ** 2)
# * ($anp1 - ((2 * $nseq) / ($nseq + 1)))
# )
# )
# / $a1)
# - $v_F_st;
## THE BELOW EXPRESSION IS FROM SIMONSEN ET AL P 428
my $v_F_st = ( ((2 * ($nseq ** 3) + (110 * ($nseq ** 2)) - (255 * $nseq) + 153)
/ (9 * ($nseq ** 2) * ($nseq - 1)))
+ ((2 * ($nseq - 1) * $a1) / ($nseq ** 2))
- (8 * $a2 / $nseq)
)
/ (($a1 ** 2) + $a2);
# ## THE BELOW EXPRESSION IS FROM SIMONSEN ET AL P 428
my $u_F_st = ((((4 * ($nseq ** 2))
+ (19 * $nseq)
+ 3
- (12 * ($nseq + 1) * $anp1)
)
/
(3 * $nseq * ($nseq - 1)))
/ $a1)
- $v_F_st;
my $FuLi_F_star_num = $Pi - ($eta_S * ($nseq - 1) / $nseq);
my $FuLi_F_star_den = sqrt ( ($u_F_st * $eta) + ($v_F_st * ($eta ** 2)));
$FuLi_F_star = $FuLi_F_star_num / ($FuLi_F_star_den || 1);
## THESE STATISTICS BELOW CORRESPOND TO FU AND LI "other tests" AND
## ARE CORRECTED IN SIMONSEN ET AL 95
# $v_D_st1 = (($a2 / ($a1 ** 2)) - ( (2/$nseq) * ( 1 + (1 / $a1) - $a1 + ($a1 / $nseq)))
# - (1 / ($nseq ** 2)))
# / ( ($a1 ** 2) + $a2 );
# $u_D_st1 = (($a1 * ($nseq - 1)) - $nseq) / ($nseq * ($a1 ** 2)) - $v_D_st1;
# $FuLi_D_star1_num = $Theta_w - ($eta_S * ($nseqs - 1) / $nseqs); ## Simonsen Et Al 95
# $FuLi_D_star1_den = sqrt ( ($u_D_st1 * $S) + ($v_D_st1 * ($S ** 2)));
# $FuLi_D_star = $FuLi_D_star1_num / ($FuLi_D_star1_den || 1);
my $Var_Pi_denom = ((11 * ($nseqs ** 2)) - (7 * $nseqs) + 6);
my $Total_Variance_Pi_no_rec = ((3 * $nseqs * ($nseqs + 1) * $Pi) + (2 * (($nseqs ** 2) + $nseqs + 3) * ($Pi ** 2))) / $Var_Pi_denom;
my $SE_Pi_no_rec = (sqrt $Total_Variance_Pi_no_rec);
$SE_pi_no_rec = $SE_Pi_no_rec / ($L || 1);
my $Total_Variance_Pi_LE = $Pi * ($nseqs + 1) / (3 * ($nseqs - 1));
my $SE_Pi_LE = (sqrt $Total_Variance_Pi_LE);
$SE_pi_LE = $SE_Pi_LE / ($L || 1);
my $Variance_Theta_w_no_recomb = ((($a1 ** 2) * $S) + ($a2 * ($S ** 2))) / (($a1 ** 4) + (($a1 ** 2) * $a2));
my $Variance_Theta_w_LE = $S / ($a1 ** 2);
# $SE_Theta_w_no_recomb = (sqrt $Variance_Theta_w_no_recomb);
# $SE_Theta_w_LE = (sqrt $Variance_Theta_w_LE);
$SE_Theta_w_no_recomb_per_site = (sqrt $Variance_Theta_w_no_recomb) / ($L || 1);
$SE_Theta_w_LE_per_site = (sqrt $Variance_Theta_w_LE) / ($L || 1);
write;
}
format STDOUT_TOP =
%
% 0.n 0.k 0.H 0.E(k|Th_w) 0.Pr(ai..ak) 1.L 2.L(gf) 3.S 4.s 5.Th_w_ps 6.SE_Thwps_LD 7.SE_Thwps_LE 8.pi 9.SE_pi_LD 10.SE_pi_LE 11.Tajim_D 12.FuLi_D* 13.FuLi_F* 14.label
% --- --- ---- ----------- ----------- ----- ----- ----- ------- -------- ------------- ------------- -------- ----------- ---------- ---------- ---------- ---------- --------
.
format STDOUT =
@>> @>> @.### @.#### @.#### @>>>> @>>>> @>>>> @#.#### @#.##### @#.##### @#.##### @#.##### @#.##### @#.##### @#.##### @#.##### @#.##### @<<<<<<
$nseq,$Num_alleles,$H,$Exp_num_alleles,$Prob_allele_partition,$TL,$L,$S,$s,$Theta_w_per_site,$SE_Theta_w_no_recomb_per_site,$SE_Theta_w_LE_per_site,$pi,$SE_pi_no_rec,$SE_pi_LE,$Tajima_D,$FuLi_D_star,$FuLi_F_star,$label
.
format ABS_TOP =
0.n 1.k 2.Len 3.S 4.Th_W 5. Pi 6.EtaS 7.Eta 8.label
--- --- ----- ----- ------- -------- ----- ----- -------
.
format STDOUT_ABS =
@>> @>> @>>>> @>>>> @#.#### @#.##### @>>>> @>>>> @<<<<<<
$nseq,$Num_alleles,$L,$S,$Theta_w,$Pi,$eta_S,$eta,$label
.
format STDOUT_LATEX =
@<<<<<< & @>> & @>>>> (@>>>>) & @>>>> & @#.#### & \(@#.#### \pm @#.#### \) & \(@#.#### \pm @#.#### \) & @#.#### \\
$label,$nseq,$TL,$L,$S,$s,$Theta_w_per_site,$SE_Theta_w_no_recomb_per_site,$pi,$SE_pi_no_rec,$Tajima_D
.
format LATEX_TOP =
\documentclass[12pt,letterpaper,oneside]{article}
\begin{document}
\begin{table}[h]
\begin{center}
\begin{tabular}{lccccccc}
\hline
& n & Len (gf) & S & s & $\hat{\theta}_W \pm SE$ & $\hat{pi} \pm SE$ & Tajima's D \\
\hline
.
format SILENT_TOP =
.
__END__
=head1 alnpi
B<alnpi> - calculate molecular population genetic statistics from DNA alignments
=head1 SYNOPSIS
B<alnpi> [OPTIONS] [MULTIFASTA-FILE...]
=head1 DESCRIPTION
B<alnpi> takes multifasta format alignment data as input, and outputs
molecular population genetic statistics. Options modulate the type of
statistics calculated, the mode of calculation, and the style of
output. By default, per-site statistics are calculated over the entire
output, specifically after gap-containing sites are
removed. Optionally, statistics may be calculated pair-wise across
sequences, or in sliding windows of specified length. The statistics
calculated by default include:
=over 4
=item 1. Number of sequences (n)
=item 2. Number of alleles/distinct sequences (k)
=item 3. Heterozygosity (H)
=item 4. Expected number of alleles given Watterson estimator (Ewens, 1972)
=item 5. Probability of allelic configuration (H, Karlin and MacGregor 1972)
=item 6. Total alignment length (L)
=item 7. Number of gap-free sites (L_gf)
=item 8. Number of gap-free segregating sites (S)
=item 9. Fraction of gap-free segregating sites (s)
=item 10. Watterson estimator per gap-free site (Th_w_ps, Watterson 1975)
=item 11. Standard Error of Th_w_ps assuming no recombination (SE_Thwps_LD)
=item 12. Standard Error of Th_w_ps assuming free recombination (SE_Thwps_LE)
=item 13. Nucleotide Diversity in gap-free sites (pi, Nei and Li 1979)
=item 14. Standard Error of pi assuming no recombination (SE_pi_LD)
=item 15. Standard Error of pi assuming free recombination (SE_pi_LE)
=item 16. Tajima's D (Tajima, 1989)
=item 17. Fu and Li's D* (Fu and Li, 1993)
=item 18. Fu and Li's F* (Fu and Li, 1993, Simonsen et al. 1995)
=back
Invoked in absolute mode with B<alnpi --absolute>, B<alnpi> outputs these
statistics instead:
=over 4
=item 1. Number of sequences (n)
=item 2. Number of alleles/distinct sequences (k)
=item 3. Number of gap-free sites (Len)
=item 4. Number of segregating gap-free sites (S)
=item 5. Watterson estimator for gap-free sites (Th_W, Watterson 1975)
=item 6. Total alignment length (L)
=item 7. Average pairwise number of differences among gap-free sites (Pi)
=item 8. Eta S, (Fu and Li, 1993)
=item 9. Eta, (Fu and Li, 1993)
=back
Sliding window analysis supports output for only for nucleotide
diversity, Watterson estimator per-site, and Tajima's D.
Options specific to B<alnpi>:
B<-s>, B<--suppress> suppress table header output
B<-x>, B<--latex> print table with Latex formating
B<-p>, B<--pairwise> calculate stats pairwise across sequences
B<-w>, B<--window>=<string> calculate stats in sliding windows
B<--absolute> output statistics not normalized per-site
B<--label>=<string> label input/output descriptively with <string>
Options general to FAST:
B<-h>, B<--help> print a brief help message
B<--man> print full documentation
B<--version> print version
B<-l>, B<--log> create/append to logfile
B<-L>, B<--logname>=<string> use logfile name <string>
B<-C>, B<--comment>=<string> save comment <string> to log
B<--format>=<format> use alternative format for input
B<--moltype>=<[dna|rna|protein] specify input sequence type
=head1 INPUT AND OUTPUT
B<alnpi> is part of FAST, the FAST Analysis of Sequences Toolbox, based
on Bioperl. Most core FAST utilities expect input and return output in
multifasta format. Input can occur in one or more files or on
STDIN. Output occurs to STDOUT. The FAST utility B<fasconvert> can
reformat other formats to and from multifasta.
=head1 OPTIONS
=over 8
=item B<-s>,
B<--suppress>
Supress header output
=item B<-x>,
B<--latex>
LaTeX-style output
=item B<-p>,
B<--pairwise>
Statistics are calculated pairwise over all sequences
=item B<-w>, B<--window>=<string>
Sliding window analysis. Option argument <string> expected to be in
the form "window-size:step-size:statistic" where window-size and
step-size are positive integers and "statistic" may be one of "p", "s"
or "d" for nucleotide diversity, Watterson estimator or Tajima's D
respectively.
=item B<--absolute>
Output a smaller set of statistics not normalized by number of
gap-free sites.
=item B<--label>
Text label for the input data, to be placed in the output.
=item B<-h>,
B<--help>
Print a brief help message and exit.
=item B<--man>
Print the manual page and exit.
=item B<--version>
Print version information and exit.
=item B<-l>,
B<--log>
Creates, or appends to, a generic FAST logfile in the current working
directory. The logfile records date/time of execution, full command
with options and arguments, and an optional comment.
=item B<-L [string]>,
B<--logname=[string]>
Use [string] as the name of the logfile. Default is "FAST.log.txt".
=item B<-C [string]>,
B<--comment=[string]>
Include comment [string] in logfile. No comment is saved by default.
=item B<--format=[format]>
Use alternative format for input. See man page for "fasconvert" for
allowed formats. This is for convenience; the FAST tools are designed
to exchange data in Fasta format, and "fasta" is the default format
for this tool.
=item B<-m [dna|rna|protein]>,
B<--moltype=[dna|rna|protein]>
Specify the type of sequence on input (should not be needed in most
cases, but sometimes Bioperl cannot guess and complains when
processing data).
=back
=head1 EXAMPLES
Generate sliding window of Tajima's D, the data plotted in Fig. 4A of
Ardell et al. (2003) Genetics 165:1761. The data files ship with FAST:
B<fasgrep> -v "(AF194|349[06])" t/data/ArdellEtAl03_ncbi_popset_32329588.fas | B<alncut> -g | B<fastr> --strict -N - | B<alnpi> --window 100:25:d
Statistics for 5'UTRs, the last line in Table 1 of Ardell et al. (2003) Genetics 165:1761:
B<gbfalncut> -k t/data/ArdellEtAl03_ncbi_popset_32329588.fas t/data/AF194338.1.gb 5.UTR | fasgrep -v "(AF194|349[06])" | B<alncut> -g | B<fastr> --strict -a "-" | B<alnpi>
=head1 SEE ALSO
=over 8
=item C<man perlre>
=item C<perldoc perlre>
Documentation on perl regular expressions.
=item C<man FAST>
=item C<perldoc FAST>
Introduction and cookbook for FAST
=item L<The FAST Home Page|http://compbio.ucmerced.edu/ardell/FAST>"
=back
=head1 CITING
If you use FAST, please cite I<Ardell (2013). FAST: FAST Analysis of
Sequences Toolbox. Bioinformatics> and Bioperl I<Stajich et al.>.
=cut