# Cared for by Jason Stajich <jason-at-bioperl-dot-org>
#
# Copyright Jason Stajich
#
# You may distribute this module under the same terms as perl itself
# POD documentation - main docs before the code
=head1 NAME
Bio::PopGen::Statistics - Population Genetics statistical tests
=head1 SYNOPSIS
use Bio::PopGen::Statistics;
use Bio::AlignIO;
use Bio::PopGen::IO;
use Bio::PopGen::Simulation::Coalescent;
my $sim = new Bio::PopGen::Simulation::Coalescent( -sample_size => 12);
my $tree = $sim->next_tree;
$sim->add_Mutations($tree,20);
my $stats = new Bio::PopGen::Statistics();
my $individuals = [ $tree->get_leaf_nodes];
my $pi = $stats->pi($individuals);
my $D = $stats->tajima_D($individuals);
# Alternatively to do this on input data from
# See the tests in t/PopGen.t for more examples
my $parser = new Bio::PopGen::IO(-format => 'prettybase',
-file => 't/data/popstats.prettybase');
my $pop = $parser->next_population;
# Note that you can also call the stats as a class method if you like
# the only reason to instantiate it (as above) is if you want
# to set the verbosity for debugging
$pi = Bio::PopGen::Statistics->pi($pop);
$theta = Bio::PopGen::Statistics->theta($pop);
# Pi and Theta also take additional arguments,
# see the documentation for more information
use Bio::PopGen::Utilities;
use Bio::AlignIO;
my $in = new Bio::AlignIO(-file => 't/data/t7.aln',
-format => 'clustalw');
my $aln = $in->next_aln;
# get a population, each sequence is an individual and
# for the default case, every site which is not monomorphic
# is a 'marker'. Each individual will have a 'genotype' for the
# site which will be the specific base in the alignment at that
# site
my $pop = Bio::PopGen::Utilities->aln_to_population(-alignment => $aln);
=head1 DESCRIPTION
This object is intended to provide implementations some standard
population genetics statistics about alleles in populations.
This module was previously named Bio::Tree::Statistics.
This object is a place to accumulate routines for calculating various
statistics from the coalescent simulation, marker/allele, or from
aligned sequence data given that you can calculate alleles, number of
segregating sites.
Currently implemented:
Fu and Li's D (fu_and_li_D)
Fu and Li's D* (fu_and_li_D_star)
Fu and Li's F (fu_and_li_F)
Fu and Li's F* (fu_and_li_F_star)
Tajima's D (tajima_D)
Watterson's theta (theta)
pi (pi) - number of pairwise differences
composite_LD (composite_LD)
Count based methods also exist in case you have already calculated the key statistics (seg sites, num individuals, etc) and just want to compute the statistic.
In all cases where a the method expects an arrayref of
L<Bio::PopGen::IndividualI> objects and L<Bio::PopGen::PopulationI>
object will also work.
=head2 REFERENCES
Fu Y.X and Li W.H. (1993) "Statistical Tests of Neutrality of
Mutations." Genetics 133:693-709.
Fu Y.X. (1996) "New Statistical Tests of Neutrality for DNA samples
from a Population." Genetics 143:557-570.
Tajima F. (1989) "Statistical method for testing the neutral mutation
hypothesis by DNA polymorphism." Genetics 123:585-595.
=head2 CITING THIS WORK
Please see this reference for use of this implementation.
Stajich JE and Hahn MW "Disentangling the Effects of Demography and Selection in Human History." (2005) Mol Biol Evol 22(1):63-73.
If you use these Bio::PopGen modules please cite the Bioperl
publication (see FAQ) and the above reference.
=head1 FEEDBACK
=head2 Mailing Lists
User feedback is an integral part of the evolution of this and other
Bioperl modules. Send your comments and suggestions preferably to
the Bioperl mailing list. Your participation is much appreciated.
$self->warn("no genotype for marker $marker_name for individual ". $ind->unique_id. "\n");
next;
}
my@alleles= sort$genotype->get_Alleles;
nextif( scalar@alleles!= 2);
my$genostr= join(',', @alleles);
$allelef{$alleles[0]}++;
$allelef{$alleles[1]}++;
}
# we should check for cases where there > 2 alleles or
# only 1 allele and throw out those markers.
my@alleles= sortkeys%allelef;
my$allele_count= scalar@alleles;
# test if site is polymorphic
if( $allele_count!= 2) {
# only really warn if we're seeing multi-allele
$self->warn("Skipping $marker_name because it has $allele_count alleles (".join(',',@alleles)."), \ncomposite_LD will currently only work for biallelic markers") if$allele_count> 2;
next; # skip this marker
}
# Need to do something here to detect alleles which aren't
# a single character
if( length($alleles[0]) != 1 ||
length($alleles[1]) != 1 ) {
$self->warn("An individual has an allele which is not a single base, this is currently not supported in composite_LD - consider recoding the allele as a single character");
next;
}
# fix the call for allele 1 (A or B) and
# allele 2 (a or b) in terms of how we'll do the
# N square from Weir p.126
$self->debug( "$alleles[0] is 1, $alleles[1] is 2 for $marker_name\n");
$lookup{$marker_name}->{'1'} = $alleles[0];
$lookup{$marker_name}->{'2'} = $alleles[1];
}
@marker_names= sortkeys%lookup;
my$site_count= scalar@marker_names;
# where the final data will be stored
my%stats_for_sites;
# standard way of generating pairwise combos
# LD is done by comparing all the pairwise site (marker)
# combinations and keeping track of the genotype and
# pairwise genotype (ie genotypes of the 2 sites) frequencies