#!/usr/bin/env perl # PODNAME: fu-uniq # ABSTRACT: Dereplicate sequences and generate abundance information use 5.012; use warnings; use Getopt::Long qw(:config no_ignore_case); use File::Basename; use FASTX::Reader; use FASTX::Seq; use Data::Dumper; use Digest::MD5 qw(md5_hex); use Carp qw(croak); use Try::Tiny; use FindBin qw($RealBin); # The following placeholder is to be programmatically replaced with 'use lib "$RealBin/../lib"' if needed #~loclib~ if ( -e "$RealBin/../lib/Proch/N50.pm" and -e "$RealBin/../Changes" ) { use lib "$RealBin/../lib"; } use Proch::Seqfu; my $VERSION = $Proch::Seqfu::VERSION // "<Dev>"; my $BASENAME = basename($0); my $AUTHOR = 'Andrea Telatin'; my $DESC = 'Print unique sequence with USEARCH labels'; # Default settings from environment or hardcoded values my $opt_def_qual; my $opt_line_length = $ENV{'FU_LINE_LENGTH'} // 80; # Command line options my( @Options, $opt_sep_size, # Print cluster size as a comment not in sequence name $opt_min_size, # Print only clusters of size >= N $opt_no_label, # Do not print cluster size $opt_keep_name, # Use first sequence name as cluster Name $opt_prefix, # Default cluster name prefix $opt_separator, # Default cluster name separator $opt_fasta, # Force FASTA output (IGNORED) $opt_fastq, # Force FASTQ output (IGNORED) $opt_strip_comm, # Strip comments (IGNORED) $opt_upper, # Convert to uppercase $opt_revcompl, # Reverse complement (IGNORED) $opt_quiet, # Quiet mode $opt_debug, # Debug mode $opt_citation, # Show citation $outdir, $opt_version, $force, ); setOptions(); sub version { say $BASENAME, " ", $VERSION; say STDERR "Using Proch::Seqfu=", $Proch::Seqfu::VERSION, " and FASTX::Reader=", $FASTX::Reader::VERSION; exit(0); } version() if ($opt_version); my $optional_spacer = $opt_sep_size ? "\t" : ";"; # Initialize storage my %counter; # Store sequence counts my %labels; # Store sequence labels my $total_seqs = 0; my $unique_seqs = 0; # Process each input file foreach my $file (@ARGV) { debug("Reading $file"); # Handle input validation if ($file ne '-') { croak "ERROR: File $file does not exist" unless -e $file; croak "ERROR: File $file is not readable" unless -r $file; } my $input_file = $file eq '-' ? "{{STDIN}}" : $file; # Initialize FASTX reader with error handling my $FASTX; try { $FASTX = FASTX::Reader->new({ filename => $input_file }); } catch { croak "ERROR: Failed to initialize FASTX reader for $file: $_"; }; # Process sequences while (my $seq = $FASTX->getRead()) { $total_seqs++; # Extract size if present, default to 1 my $size = $seq->{seq} =~ /size=(\d+)/ ? $1 : 1; # Store uppercase sequence and count my $sequence = uc($seq->{seq}); $counter{$sequence} += $size; # Store label if keeping names if ($opt_keep_name) { $labels{$sequence} //= $seq->{id}; # Store first occurrence only } } } debug("Total sequences processed: $total_seqs"); # Output sequences my $seq_counter = 0; for my $seq (sort { $counter{$b} <=> $counter{$a} } keys %counter) { # Skip if below minimum size next if ($counter{$seq} < $opt_min_size); $seq_counter++; $unique_seqs++; # Generate sequence name my $name = $opt_keep_name ? $labels{$seq} : $opt_prefix . $opt_separator . $seq_counter; # Add size information unless disabled my $size_string = $opt_no_label ? '' : $optional_spacer . 'size=' . $counter{$seq} . ";"; # Output sequence try { print '>', $name, $size_string, "\n", $seq, "\n"; } catch { croak "ERROR: Failed to write sequence: $_"; }; } debug(sprintf("Found %d unique sequences from %d total sequences", $unique_seqs, $total_seqs)); sub ver { say "$BASENAME $VERSION"; exit(0); } sub setOptions { @Options = ( 'Options:', {OPT=>"k|keepname!", VAR=>\$opt_keep_name, DESC=>"Use first sequence name as cluster name"}, {OPT=>"p|prefix=s", VAR=>\$opt_prefix, DEFAULT=>'seq', DESC=>"Sequence prefix"}, {OPT=>"s|separator=s", VAR=>\$opt_separator, DEFAULT=>'.', DESC=>"Prefix and counter separator"}, {OPT=>"m|min-size=i", VAR=>\$opt_min_size, DEFAULT=>0, DESC=>"Print only sequences found at least N times"}, {OPT=>'size-as-comment!', VAR=>\$opt_sep_size, DEFAULT=>0, DESC=>"Add size as comment, not as part of sequence name"}, 'General:', {OPT=>"help", VAR=>\&usage, DESC=>"This help"}, {OPT=>"version", VAR=>\&version, DESC=>"Print version and exit"}, {OPT=>"citation", VAR=>\&show_citation, DESC=>"Print citation for seqfu"}, {OPT=>"quiet!", VAR=>\$opt_quiet, DEFAULT=>0, DESC=>"No screen output"}, {OPT=>"debug!", VAR=>\$opt_debug, DEFAULT=>0, DESC=>"Debug mode: keep all temporary files"}, 'Common seqfu options:', {OPT=>"w|line-width=i", VAR=>\$opt_line_length, DEFAULT=>80, DESC=>"FASTA line size (0 for unlimited)"}, {OPT=>"strip", VAR=>\$opt_strip_comm, DESC=>"Strip comments"}, {OPT=>"fasta", VAR=>\$opt_fasta, DESC=>"Force FASTA output"}, {OPT=>"fastq", VAR=>\$opt_fastq, DESC=>"Force FASTQ output"}, {OPT=>"rc", VAR=>\$opt_revcompl, DESC=>"Print reverse complementary"}, {OPT=>'q|qual=f', VAR=>\$opt_def_qual, DEFAULT=>32, DESC=>"Default quality for FASTQ files"}, {OPT=>'upper', VAR=>\$opt_upper, DESC=>"Convert sequence to uppercase"}, ); (!@ARGV) && (usage(1)); GetOptions(map {$_->{OPT}, $_->{VAR}} grep { ref } @Options) || usage(1); # Validate parameters croak "ERROR: Please specify either --fasta or --fastq, not both" if $opt_fasta and $opt_fastq; croak "ERROR: Minimum size must be non-negative" if defined $opt_min_size && $opt_min_size < 0; if ($opt_line_length < 1) { $opt_line_length = 1_000_000_000_000_000; } # Set default values foreach (@Options) { if (ref $_ && defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) { ${$_->{VAR}} = $_->{DEFAULT}; } } } sub debug { say STDERR '#', $_[0] if $opt_debug; } # Usage function preserved exactly as in original for API compatibility sub usage { my($exitcode) = @_; $exitcode ||= 0; $exitcode = 0 if $exitcode eq 'help'; select STDERR if $exitcode; print "Name:\n ", ucfirst($BASENAME), " $VERSION by $AUTHOR\n", "Synopsis:\n $DESC\n", "Usage:\n $BASENAME [options] filename (or '-' for STDIN)\n"; foreach (@Options) { if (ref) { my $def = defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : ""; $def = ($def ? ' (default OFF)' : '(default ON)') if $_->{OPT} =~ m/!$/; my $opt = $_->{OPT}; $opt =~ s/!$//; $opt =~ s/=s$/ [X]/; $opt =~ s/=i$/ [N]/; $opt =~ s/=f$/ [n.n]/; printf STDERR " --%-16s %s%s\n", $opt, $_->{DESC}, $def; } else { print "$_\n"; } } exit($exitcode); } __END__ =pod =encoding UTF-8 =head1 NAME fu-uniq - Dereplicate sequences and generate abundance information =head1 VERSION version 1.7.0 =head1 SYNOPSIS fu-uniq [options] input.fa > uniq.fa =head1 DESCRIPTION fu-uniq is a tool for dereplicating DNA sequences and generating abundance information. It identifies unique sequences and can track their abundance using USEARCH-style labels. The tool supports both exact sequence matching and customizable output formats. Key features: - Dereplicates sequences while maintaining abundance information - Supports USEARCH-style size annotations - Flexible sequence naming options - Handles both FASTA and FASTQ inputs - Processes gzipped files automatically =head1 NAME fu-uniq - Dereplicate sequences and generate abundance information =head1 OPTIONS =head2 Sequence Processing =over 4 =item B<-k>, B<--keepname> Use the name of the first occurrence of each unique sequence as the cluster name. This is useful for maintaining meaningful identifiers. Default: ON =item B<-m>, B<--min-size> I<N> Only output sequences that appear at least N times. This helps filter out rare sequences or potential sequencing errors. Default: 0 (no filtering) =item B<--size-as-comment> Add size information as a comment rather than part of the sequence name. This affects the format of the output headers. Default: OFF Example with option OFF: >seq1;size=10; Example with option ON: >seq1 size=10; =back =head2 Output Formatting =over 4 =item B<-p>, B<--prefix> I<STR> Prefix for sequence names when not using --keepname. Default: 'seq' =item B<-s>, B<--separator> I<STR> Character(s) to separate prefix from sequence number. Default: '.' =item B<-w>, B<--line-width> I<N> Width for wrapping FASTA sequence lines. Use 0 for single-line sequences. Default: 80 =back =head1 EXAMPLES Basic deduplication: # Find unique sequences and add abundance information fu-uniq input.fa > uniq.fa Keep only abundant sequences: # Keep sequences that appear at least 10 times fu-uniq -m 10 input.fa > abundant.fa Custom sequence naming: # Use custom prefix and separator fu-uniq -p 'cluster' -s '_' input.fa > clusters.fa Process multiple files: # Combine and deduplicate multiple files fu-uniq file1.fa file2.fa > combined_uniq.fa Add size as comment: # Place size information in sequence comment fu-uniq --size-as-comment input.fa > commented.fa =head1 NOTES =over 4 =item * Memory usage scales with the number of unique sequences =item * Sequence comparison is case-insensitive =item * Size annotations in input files (;size=N;) are respected and combined =back =head1 MODERN ALTERNATIVE This suite of tools has been superseded by B<SeqFu>, a compiled program providing faster and safer tools for sequence analysis. This suite is maintained for the higher portability of Perl scripts under certain circumstances. SeqFu is available at L<https://github.com/telatin/seqfu2>, and can be installed with BioConda C<conda install -c bioconda seqfu> =head1 CITING Telatin A, Fariselli P, Birolo G. I<SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files>. Bioengineering 2021, 8, 59. L<https://doi.org/10.3390/bioengineering8050059> =cut =head1 AUTHOR Andrea Telatin <andrea@telatin.com> =head1 COPYRIGHT AND LICENSE This software is Copyright (c) 2018-2027 by Quadram Institute Bioscience. This is free software, licensed under: The MIT (X11) License =cut