—#!/usr/bin/env perl
# PODNAME: fu-uniq
# ABSTRACT: Dereplicate sequences and generate abundance information
use
5.012;
use
warnings;
use
File::Basename;
use
FASTX::Reader;
use
FASTX::Seq;
use
Data::Dumper;
use
Try::Tiny;
# The following placeholder is to be programmatically replaced with 'use lib "$RealBin/../lib"' if needed
#~loclib~
if
( -e
"$RealBin/../lib/Proch/N50.pm"
and -e
"$RealBin/../Changes"
) {
}
use
Proch::Seqfu;
my
$VERSION
=
$Proch::Seqfu::VERSION
//
"<Dev>"
;
my
$BASENAME
= basename($0);
my
$AUTHOR
=
'Andrea Telatin'
;
my
$DESC
=
'Print unique sequence with USEARCH labels'
;
# Default settings from environment or hardcoded values
my
$opt_def_qual
;
my
$opt_line_length
=
$ENV
{
'FU_LINE_LENGTH'
} // 80;
# Command line options
my
(
@Options
,
$opt_sep_size
,
# Print cluster size as a comment not in sequence name
$opt_min_size
,
# Print only clusters of size >= N
$opt_no_label
,
# Do not print cluster size
$opt_keep_name
,
# Use first sequence name as cluster Name
$opt_prefix
,
# Default cluster name prefix
$opt_separator
,
# Default cluster name separator
$opt_fasta
,
# Force FASTA output (IGNORED)
$opt_fastq
,
# Force FASTQ output (IGNORED)
$opt_strip_comm
,
# Strip comments (IGNORED)
$opt_upper
,
# Convert to uppercase
$opt_revcompl
,
# Reverse complement (IGNORED)
$opt_quiet
,
# Quiet mode
$opt_debug
,
# Debug mode
$opt_citation
,
# Show citation
$outdir
,
$opt_version
,
$force
,
);
setOptions();
sub
version {
say
$BASENAME
,
" "
,
$VERSION
;
say
STDERR
"Using Proch::Seqfu="
,
$Proch::Seqfu::VERSION
,
" and FASTX::Reader="
,
$FASTX::Reader::VERSION
;
exit
(0);
}
version()
if
(
$opt_version
);
my
$optional_spacer
=
$opt_sep_size
?
"\t"
:
";"
;
# Initialize storage
my
%counter
;
# Store sequence counts
my
%labels
;
# Store sequence labels
my
$total_seqs
= 0;
my
$unique_seqs
= 0;
# Process each input file
foreach
my
$file
(
@ARGV
) {
debug(
"Reading $file"
);
# Handle input validation
if
(
$file
ne
'-'
) {
croak
"ERROR: File $file does not exist"
unless
-e
$file
;
croak
"ERROR: File $file is not readable"
unless
-r
$file
;
}
my
$input_file
=
$file
eq
'-'
?
"{{STDIN}}"
:
$file
;
# Initialize FASTX reader with error handling
my
$FASTX
;
try
{
$FASTX
= FASTX::Reader->new({
filename
=>
$input_file
});
}
catch
{
croak
"ERROR: Failed to initialize FASTX reader for $file: $_"
;
};
# Process sequences
while
(
my
$seq
=
$FASTX
->getRead()) {
$total_seqs
++;
# Extract size if present, default to 1
my
$size
=
$seq
->{seq} =~ /size=(\d+)/ ? $1 : 1;
# Store uppercase sequence and count
my
$sequence
=
uc
(
$seq
->{seq});
$counter
{
$sequence
} +=
$size
;
# Store label if keeping names
if
(
$opt_keep_name
) {
$labels
{
$sequence
} //=
$seq
->{id};
# Store first occurrence only
}
}
}
debug(
"Total sequences processed: $total_seqs"
);
# Output sequences
my
$seq_counter
= 0;
for
my
$seq
(
sort
{
$counter
{
$b
} <=>
$counter
{
$a
} }
keys
%counter
) {
# Skip if below minimum size
next
if
(
$counter
{
$seq
} <
$opt_min_size
);
$seq_counter
++;
$unique_seqs
++;
# Generate sequence name
my
$name
=
$opt_keep_name
?
$labels
{
$seq
}
:
$opt_prefix
.
$opt_separator
.
$seq_counter
;
# Add size information unless disabled
my
$size_string
=
$opt_no_label
?
''
:
$optional_spacer
.
'size='
.
$counter
{
$seq
} .
";"
;
# Output sequence
try
{
'>'
,
$name
,
$size_string
,
"\n"
,
$seq
,
"\n"
;
}
catch
{
croak
"ERROR: Failed to write sequence: $_"
;
};
}
debug(
sprintf
(
"Found %d unique sequences from %d total sequences"
,
$unique_seqs
,
$total_seqs
));
sub
ver {
say
"$BASENAME $VERSION"
;
exit
(0);
}
sub
setOptions {
@Options
= (
'Options:'
,
{
OPT
=>
"k|keepname!"
,
VAR
=>\
$opt_keep_name
,
DESC
=>
"Use first sequence name as cluster name"
},
{
OPT
=>
"p|prefix=s"
,
VAR
=>\
$opt_prefix
,
DEFAULT
=>
'seq'
,
DESC
=>
"Sequence prefix"
},
{
OPT
=>
"s|separator=s"
,
VAR
=>\
$opt_separator
,
DEFAULT
=>
'.'
,
DESC
=>
"Prefix and counter separator"
},
{
OPT
=>
"m|min-size=i"
,
VAR
=>\
$opt_min_size
,
DEFAULT
=>0,
DESC
=>
"Print only sequences found at least N times"
},
{
OPT
=>
'size-as-comment!'
,
VAR
=>\
$opt_sep_size
,
DEFAULT
=>0,
DESC
=>
"Add size as comment, not as part of sequence name"
},
'General:'
,
{
OPT
=>
"help"
,
VAR
=>\
&usage
,
DESC
=>
"This help"
},
{
OPT
=>
"version"
,
VAR
=>\
&version
,
DESC
=>
"Print version and exit"
},
{
OPT
=>
"citation"
,
VAR
=>\
&show_citation
,
DESC
=>
"Print citation for seqfu"
},
{
OPT
=>
"quiet!"
,
VAR
=>\
$opt_quiet
,
DEFAULT
=>0,
DESC
=>
"No screen output"
},
{
OPT
=>
"debug!"
,
VAR
=>\
$opt_debug
,
DEFAULT
=>0,
DESC
=>
"Debug mode: keep all temporary files"
},
'Common seqfu options:'
,
{
OPT
=>
"w|line-width=i"
,
VAR
=>\
$opt_line_length
,
DEFAULT
=>80,
DESC
=>
"FASTA line size (0 for unlimited)"
},
{
OPT
=>
"strip"
,
VAR
=>\
$opt_strip_comm
,
DESC
=>
"Strip comments"
},
{
OPT
=>
"fasta"
,
VAR
=>\
$opt_fasta
,
DESC
=>
"Force FASTA output"
},
{
OPT
=>
"fastq"
,
VAR
=>\
$opt_fastq
,
DESC
=>
"Force FASTQ output"
},
{
OPT
=>
"rc"
,
VAR
=>\
$opt_revcompl
,
DESC
=>
"Print reverse complementary"
},
{
OPT
=>
'q|qual=f'
,
VAR
=>\
$opt_def_qual
,
DEFAULT
=>32,
DESC
=>
"Default quality for FASTQ files"
},
{
OPT
=>
'upper'
,
VAR
=>\
$opt_upper
,
DESC
=>
"Convert sequence to uppercase"
},
);
(!
@ARGV
) && (usage(1));
GetOptions(
map
{
$_
->{OPT},
$_
->{VAR}}
grep
{
ref
}
@Options
) || usage(1);
# Validate parameters
croak
"ERROR: Please specify either --fasta or --fastq, not both"
if
$opt_fasta
and
$opt_fastq
;
croak
"ERROR: Minimum size must be non-negative"
if
defined
$opt_min_size
&&
$opt_min_size
< 0;
if
(
$opt_line_length
< 1) {
$opt_line_length
= 1_000_000_000_000_000;
}
# Set default values
foreach
(
@Options
) {
if
(
ref
$_
&&
defined
(
$_
->{DEFAULT}) && !
defined
(${
$_
->{VAR}})) {
${
$_
->{VAR}} =
$_
->{DEFAULT};
}
}
}
sub
debug {
say
STDERR
'#'
,
$_
[0]
if
$opt_debug
;
}
# Usage function preserved exactly as in original for API compatibility
sub
usage {
my
(
$exitcode
) =
@_
;
$exitcode
||= 0;
$exitcode
= 0
if
$exitcode
eq
'help'
;
select
STDERR
if
$exitcode
;
"Name:\n "
,
ucfirst
(
$BASENAME
),
" $VERSION by $AUTHOR\n"
,
"Synopsis:\n $DESC\n"
,
"Usage:\n $BASENAME [options] filename (or '-' for STDIN)\n"
;
foreach
(
@Options
) {
if
(
ref
) {
my
$def
=
defined
(
$_
->{DEFAULT}) ?
" (default '$_->{DEFAULT}')"
:
""
;
$def
= (
$def
?
' (default OFF)'
:
'(default ON)'
)
if
$_
->{OPT} =~ m/!$/;
my
$opt
=
$_
->{OPT};
$opt
=~ s/!$//;
$opt
=~ s/=s$/ [X]/;
$opt
=~ s/=i$/ [N]/;
$opt
=~ s/=f$/ [n.n]/;
printf
STDERR
" --%-16s %s%s\n"
,
$opt
,
$_
->{DESC},
$def
;
}
else
{
"$_\n"
;
}
}
exit
(
$exitcode
);
}
__END__
=pod
=encoding UTF-8
=head1 NAME
fu-uniq - Dereplicate sequences and generate abundance information
=head1 VERSION
version 1.7.0
=head1 SYNOPSIS
fu-uniq [options] input.fa > uniq.fa
=head1 DESCRIPTION
fu-uniq is a tool for dereplicating DNA sequences and generating abundance
information. It identifies unique sequences and can track their abundance
using USEARCH-style labels. The tool supports both exact sequence matching
and customizable output formats.
Key features:
- Dereplicates sequences while maintaining abundance information
- Supports USEARCH-style size annotations
- Flexible sequence naming options
- Handles both FASTA and FASTQ inputs
- Processes gzipped files automatically
=head1 NAME
fu-uniq - Dereplicate sequences and generate abundance information
=head1 OPTIONS
=head2 Sequence Processing
=over 4
=item B<-k>, B<--keepname>
Use the name of the first occurrence of each unique sequence as the cluster name.
This is useful for maintaining meaningful identifiers. Default: ON
=item B<-m>, B<--min-size> I<N>
Only output sequences that appear at least N times. This helps filter out
rare sequences or potential sequencing errors. Default: 0 (no filtering)
=item B<--size-as-comment>
Add size information as a comment rather than part of the sequence name.
This affects the format of the output headers. Default: OFF
Example with option OFF:
>seq1;size=10;
Example with option ON:
>seq1 size=10;
=back
=head2 Output Formatting
=over 4
=item B<-p>, B<--prefix> I<STR>
Prefix for sequence names when not using --keepname. Default: 'seq'
=item B<-s>, B<--separator> I<STR>
Character(s) to separate prefix from sequence number. Default: '.'
=item B<-w>, B<--line-width> I<N>
Width for wrapping FASTA sequence lines. Use 0 for single-line sequences.
Default: 80
=back
=head1 EXAMPLES
Basic deduplication:
# Find unique sequences and add abundance information
fu-uniq input.fa > uniq.fa
Keep only abundant sequences:
# Keep sequences that appear at least 10 times
fu-uniq -m 10 input.fa > abundant.fa
Custom sequence naming:
# Use custom prefix and separator
fu-uniq -p 'cluster' -s '_' input.fa > clusters.fa
Process multiple files:
# Combine and deduplicate multiple files
fu-uniq file1.fa file2.fa > combined_uniq.fa
Add size as comment:
# Place size information in sequence comment
fu-uniq --size-as-comment input.fa > commented.fa
=head1 NOTES
=over 4
=item * Memory usage scales with the number of unique sequences
=item * Sequence comparison is case-insensitive
=item * Size annotations in input files (;size=N;) are respected and combined
=back
=head1 MODERN ALTERNATIVE
This suite of tools has been superseded by B<SeqFu>, a compiled
program providing faster and safer tools for sequence analysis.
This suite is maintained for the higher portability of Perl scripts
under certain circumstances.
SeqFu is available at L<https://github.com/telatin/seqfu2>, and
can be installed with BioConda C<conda install -c bioconda seqfu>
=head1 CITING
Telatin A, Fariselli P, Birolo G.
I<SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files>.
Bioengineering 2021, 8, 59. L<https://doi.org/10.3390/bioengineering8050059>
=cut
=head1 AUTHOR
Andrea Telatin <andrea@telatin.com>
=head1 COPYRIGHT AND LICENSE
This software is Copyright (c) 2018-2027 by Quadram Institute Bioscience.
This is free software, licensed under:
The MIT (X11) License
=cut