NAME
Bio::Minimizer - minimizer package
Based on the ideas put forth by Roberts et al. (2004): https://academic.oup.com/bioinformatics/article/20/18/3363/202143
SYNOPSIS
my $minimizer  = Bio::Minimizer->new($sequenceString);
my $kmers      = $minimizer->{kmers};      # hash of minimizer => kmer
my $minimizers = $minimizer->{minimizers}; # hash of minimizer => [kmer1,kmer2,...]
# hash of minimizer => [start1,start2,...]
# Start coordinates are on the fwd strand even when
# matched against the rev strand.
my $starts = $minimizer->{starts};
# With more options
my $minimizer2 = Bio::Minimizer->new($sequenceString, {k=>31, l=>21});
DESCRIPTION
Creates a set of minimizers from a sequence string.
EXAMPLES
Example: sort a fastq file by minimizer, potentially shrinking its gzip-compressed size.
This is implemented in this package's scripts/sort*.pl scripts.
use Bio::Minimizer;
# Read fastq file via stdin, in this example
while (my $id = <>) {
    # Grab an entry
    ($seq, $plus, $qual) = (scalar(<>), scalar(<>), scalar(<>));
    chomp($id, $seq, $plus, $qual);

    # minimizer object
    $MINIMIZER = Bio::Minimizer->new($seq, {k=>length($seq)});
    # The only minimizer in this entry because k==length(seq)
    $minMinimizer = (values(%{$$MINIMIZER{minimizers}}))[0];

    # combine the minimum minimizer with the entry, for
    # sorting later.
    # Save the entry as a string so that we don't have to
    # parse it later.
    my $entry = [$minMinimizer, "$id\n$seq\n$plus\n$qual\n"];
    push(@entry, $entry);
}
# Print the saved entries in minimizer order
for my $e (sort { $$a[0] cmp $$b[0] } @entry) {
    print $$e[1];
}