bin/fu-uniq - metacpan.org


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
—
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
              #!/usr/bin/env perl
# PODNAME: fu-uniq
# ABSTRACT: Dereplicate sequences and generate abundance information
use 5.012;
use warnings;
use Getopt::Long qw(:config no_ignore_case);
use File::Basename;
use FASTX::Reader;
use FASTX::Seq;
use Data::Dumper;
use Digest::MD5 qw(md5_hex);
use Carp qw(croak);
use Try::Tiny;
use FindBin qw($RealBin);
# The following placeholder is to be programmatically replaced with 'use lib "$RealBin/../lib"' if needed
#~loclib~
# Prefer the in-tree library when running from a source checkout.
# `use lib` is executed at compile time regardless of any surrounding `if`,
# so the test has to live in a BEGIN block and use the run-time equivalent
# (require + import) for the condition to actually guard the @INC change.
BEGIN {
    if ( -e "$RealBin/../lib/Proch/N50.pm" and -e "$RealBin/../Changes" ) {
        require lib;
        lib->import("$RealBin/../lib");
    }
}
use Proch::Seqfu;
# Identity of this script, shown by the usage and version screens
my $VERSION  = $Proch::Seqfu::VERSION // "<Dev>";
my $BASENAME = basename($0);
my $AUTHOR   = 'Andrea Telatin';
my $DESC     = 'Print unique sequence with USEARCH labels';

# Settings seeded from the environment; setOptions() fills in what is left unset
my $opt_def_qual;
my $opt_line_length = $ENV{'FU_LINE_LENGTH'} // 80;

# Option table: built by setOptions(), read again by usage()
my @Options;

# Clustering and naming
my $opt_sep_size;       # Print cluster size as a comment not in sequence name
my $opt_min_size;       # Print only clusters of size >= N
my $opt_no_label;       # Do not print cluster size
my $opt_keep_name;      # Use first sequence name as cluster Name
my $opt_prefix;         # Default cluster name prefix
my $opt_separator;      # Default cluster name separator

# Common seqfu switches (several are accepted but IGNORED by this script)
my $opt_fasta;          # Force FASTA output (IGNORED)
my $opt_fastq;          # Force FASTQ output (IGNORED)
my $opt_strip_comm;     # Strip comments (IGNORED)
my $opt_upper;          # Convert to uppercase
my $opt_revcompl;       # Reverse complement (IGNORED)

# General behaviour
my $opt_quiet;          # Quiet mode
my $opt_debug;          # Debug mode
my $opt_citation;       # Show citation
my $outdir;
my $opt_version;
my $force;

# Parse the command line before anything else runs
setOptions();
# Print the program name and version on STDOUT, the versions of the
# supporting modules on STDERR, then exit with status 0.
sub version {
    my @banner  = ($BASENAME, " ", $VERSION);
    my @modules = (
        "Using Proch::Seqfu=", $Proch::Seqfu::VERSION,
        " and FASTX::Reader=", $FASTX::Reader::VERSION,
    );
    say {*STDOUT} @banner;
    say {*STDERR} @modules;
    exit(0);
}
# --version is bound to the version() callback in setOptions(), so this check
# only fires if $opt_version is set by some other path.
version() if ($opt_version);

# Separator placed between the cluster name and its "size=" annotation
my $optional_spacer = $opt_sep_size ? "\t" : ";";

# Initialize storage
my %counter;  # Uppercased sequence => cumulative abundance
my %labels;   # Uppercased sequence => name of its first occurrence
my $total_seqs = 0;
my $unique_seqs = 0;

# Process each input file
foreach my $file (@ARGV) {
    debug("Reading $file");

    # Handle input validation ('-' means standard input and is not a path)
    if ($file ne '-') {
        croak "ERROR: File $file does not exist" unless -e $file;
        croak "ERROR: File $file is not readable" unless -r $file;
    }

    my $input_file = $file eq '-' ? "{{STDIN}}" : $file;

    # Initialize FASTX reader with error handling
    my $FASTX;
    try {
        $FASTX = FASTX::Reader->new({ filename => $input_file });
    } catch {
        croak "ERROR: Failed to initialize FASTX reader for $file: $_";
    };

    # Process sequences
    while (my $seq = $FASTX->getRead()) {
        $total_seqs++;

        # The record header lives in 'name' (and 'comment'); 'id' is only kept
        # as a fallback for records that might carry that key instead.
        my $name    = $seq->{name} // $seq->{id} // '';
        my $comment = $seq->{comment} // '';

        # Extract size if present, default to 1. The annotation is part of the
        # header (name or comment), never of the nucleotide sequence itself.
        my $size = "$name $comment" =~ /size=(\d+)/ ? $1 : 1;

        # Store uppercase sequence and count (comparison is case-insensitive)
        my $sequence = uc($seq->{seq});
        $counter{$sequence} += $size;

        # Store label if keeping names
        if ($opt_keep_name) {
            # Drop any incoming ";size=N;" so the output header does not end
            # up with both the old and the recomputed annotation.
            (my $label = $name) =~ s/;size=\d+;?//;
            $labels{$sequence} //= $label;  # Store first occurrence only
        }
    }
}
debug("Total sequences processed: $total_seqs");

# Output sequences, most abundant first; ties are broken on the sequence so
# that the output order is reproducible between runs.
# NOTE(review): $opt_line_length is not applied here, sequences are printed on
# a single line regardless of -w/--line-width.
my $seq_counter = 0;
for my $seq (sort { $counter{$b} <=> $counter{$a} or $a cmp $b } keys %counter) {
    # Skip if below minimum size
    next if ($counter{$seq} < $opt_min_size);

    $seq_counter++;
    $unique_seqs++;

    # Generate sequence name
    my $name = $opt_keep_name
        ? $labels{$seq}
        : $opt_prefix . $opt_separator . $seq_counter;

    # Add size information unless disabled
    my $size_string = $opt_no_label ? '' : $optional_spacer . 'size=' . $counter{$seq} . ";";

    # Output sequence. print() reports failure through its return value, it
    # does not throw, so the result is checked directly.
    print('>', $name, $size_string, "\n", $seq, "\n")
        or croak "ERROR: Failed to write sequence: $!";
}
debug(sprintf("Found %d unique sequences from %d total sequences", $unique_seqs, $total_seqs));
# Print "<program> <version>" on STDOUT and exit with status 0.
sub ver {
    my $banner = join(' ', $BASENAME, $VERSION);
    say $banner;
    exit(0);
}
# Build the option table, parse the command line, validate the values and
# fill in the defaults of every option the user left unset.
#
# Takes no arguments and returns nothing useful: it works on the file-level
# @Options table and the $opt_* variables that the table points to.
# Leaves through usage(1) when no arguments or no input files are given or
# when GetOptions() rejects the command line; croaks on invalid combinations.
sub setOptions {
    @Options = (
        'Options:',
        {OPT=>"k|keepname!",      VAR=>\$opt_keep_name,                  DESC=>"Use first sequence name as cluster name"},
        {OPT=>"p|prefix=s",       VAR=>\$opt_prefix,     DEFAULT=>'seq', DESC=>"Sequence prefix"},
        {OPT=>"s|separator=s",    VAR=>\$opt_separator,  DEFAULT=>'.',   DESC=>"Prefix and counter separator"},
        {OPT=>"m|min-size=i",     VAR=>\$opt_min_size,   DEFAULT=>0,    DESC=>"Print only sequences found at least N times"},
        {OPT=>'size-as-comment!', VAR=>\$opt_sep_size,   DEFAULT=>0,    DESC=>"Add size as comment, not as part of sequence name"},

        'General:',
        {OPT=>"help",      VAR=>\&usage,                 DESC=>"This help"},
        {OPT=>"version",   VAR=>\&version,               DESC=>"Print version and exit"},
        {OPT=>"citation",  VAR=>\&show_citation,         DESC=>"Print citation for seqfu"},
        {OPT=>"quiet!",    VAR=>\$opt_quiet, DEFAULT=>0, DESC=>"No screen output"},
        {OPT=>"debug!",    VAR=>\$opt_debug, DEFAULT=>0, DESC=>"Debug mode: keep all temporary files"},

        'Common seqfu options:',
        {OPT=>"w|line-width=i", VAR=>\$opt_line_length, DEFAULT=>80, DESC=>"FASTA line size (0 for unlimited)"},
        {OPT=>"strip",          VAR=>\$opt_strip_comm,               DESC=>"Strip comments"},
        {OPT=>"fasta",          VAR=>\$opt_fasta,                   DESC=>"Force FASTA output"},
        {OPT=>"fastq",          VAR=>\$opt_fastq,                   DESC=>"Force FASTQ output"},
        {OPT=>"rc",             VAR=>\$opt_revcompl,                DESC=>"Print reverse complementary"},
        {OPT=>'q|qual=f',       VAR=>\$opt_def_qual,   DEFAULT=>32, DESC=>"Default quality for FASTQ files"},
        {OPT=>'upper',          VAR=>\$opt_upper,                   DESC=>"Convert sequence to uppercase"},
    );

    # Nothing at all on the command line: show the help as an error
    (!@ARGV) && (usage(1));

    # Only the hash entries describe options; plain strings are section titles
    GetOptions(map {$_->{OPT}, $_->{VAR}} grep { ref } @Options) || usage(1);

    # Switches were given but no input file is left: treat it like the
    # "no arguments" case instead of silently producing empty output.
    usage(1) unless @ARGV;

    # Validate parameters
    croak "ERROR: Please specify either --fasta or --fastq, not both"
        if $opt_fasta and $opt_fastq;
    croak "ERROR: Minimum size must be non-negative"
        if defined $opt_min_size && $opt_min_size < 0;

    # A width below 1 means "unlimited", stored as a very large number
    if ($opt_line_length < 1) {
        $opt_line_length = 1_000_000_000_000_000;
    }

    # Set default values for options that are still undefined
    foreach my $entry (@Options) {
        next unless ref $entry && defined $entry->{DEFAULT};
        next if defined ${ $entry->{VAR} };
        ${ $entry->{VAR} } = $entry->{DEFAULT};
    }
}

# Callback for --citation: print the SeqFu reference on STDOUT and exit with
# status 0. The option table refers to this sub, so it must exist.
sub show_citation {
    say 'Telatin A, Fariselli P, Birolo G.';
    say 'SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files.';
    say 'Bioengineering 2021, 8, 59. https://doi.org/10.3390/bioengineering8050059';
    exit(0);
}
# Write a '#'-prefixed diagnostic line to STDERR, but only when --debug is on.
sub debug {
    return unless $opt_debug;
    my ($message) = @_;
    say {*STDERR} '#', $message;
}
# Print the help screen and exit.
#
# Argument: the exit code. A false value, or the string 'help' (what
# Getopt::Long passes when this sub is used as the --help callback), means a
# successful exit with the help on STDOUT; any other true value sends the
# whole screen to STDERR and is used as the exit status.
sub usage {
    my($exitcode) = @_;
    $exitcode ||= 0;
    $exitcode = 0 if $exitcode eq 'help';
    select STDERR if $exitcode;
    print
        "Name:\n  ", ucfirst($BASENAME), " $VERSION by $AUTHOR\n",
        "Synopsis:\n  $DESC\n",
        "Usage:\n  $BASENAME [options] filename (or '-' for STDIN)\n";

    foreach my $entry (@Options) {
        if (ref $entry) {
            my $def;
            if ($entry->{OPT} =~ m/!$/) {
                # Negatable switch: it is ON only when its default is true
                $def = $entry->{DEFAULT} ? ' (default ON)' : ' (default OFF)';
            }
            else {
                $def = defined($entry->{DEFAULT}) ? " (default '$entry->{DEFAULT}')" : "";
            }
            # Turn the Getopt::Long specification into a readable placeholder
            my $opt = $entry->{OPT};
            $opt =~ s/!$//;
            $opt =~ s/=s$/ [X]/;
            $opt =~ s/=i$/ [N]/;
            $opt =~ s/=f$/ [n.n]/;
            # Use the selected handle so the whole screen goes to one stream
            printf "  --%-16s %s%s\n", $opt, $entry->{DESC}, $def;
        }
        else {
            # Plain strings in the table are section titles
            print "$entry\n";
        }
    }
    exit($exitcode);
}
__END__
=pod
=encoding UTF-8
=head1 NAME
fu-uniq - Dereplicate sequences and generate abundance information
=head1 VERSION
version 1.7.0
=head1 SYNOPSIS
    fu-uniq [options] input.fa > uniq.fa
=head1 DESCRIPTION
fu-uniq is a tool for dereplicating DNA sequences and generating abundance
information. It identifies unique sequences and can track their abundance
using USEARCH-style labels. The tool supports both exact sequence matching
and customizable output formats.
Key features:
- Dereplicates sequences while maintaining abundance information
- Supports USEARCH-style size annotations
- Flexible sequence naming options
- Handles both FASTA and FASTQ inputs
- Processes gzipped files automatically
=head1 NAME
fu-uniq - Dereplicate sequences and generate abundance information
=head1 OPTIONS
=head2 Sequence Processing
=over 4
=item B<-k>, B<--keepname>
Use the name of the first occurrence of each unique sequence as the cluster name.
This is useful for maintaining meaningful identifiers. Default: OFF
=item B<-m>, B<--min-size> I<N>
Only output sequences that appear at least N times. This helps filter out
rare sequences or potential sequencing errors. Default: 0 (no filtering)
=item B<--size-as-comment>
Add size information as a comment rather than part of the sequence name.
This affects the format of the output headers. Default: OFF
Example with option OFF:
    >seq.1;size=10;
Example with option ON (name and size are separated by a tab):
    >seq.1	size=10;
=back
=head2 Output Formatting
=over 4
=item B<-p>, B<--prefix> I<STR>
Prefix for sequence names when not using --keepname. Default: 'seq'
=item B<-s>, B<--separator> I<STR>
Character(s) to separate prefix from sequence number. Default: '.'
=item B<-w>, B<--line-width> I<N>
Width for wrapping FASTA sequence lines. Use 0 for single-line sequences.
Default: 80
=back
=head1 EXAMPLES
Basic deduplication:
    # Find unique sequences and add abundance information
    fu-uniq input.fa > uniq.fa
Keep only abundant sequences:
    # Keep sequences that appear at least 10 times
    fu-uniq -m 10 input.fa > abundant.fa
Custom sequence naming:
    # Use custom prefix and separator
    fu-uniq -p 'cluster' -s '_' input.fa > clusters.fa
Process multiple files:
    # Combine and deduplicate multiple files
    fu-uniq file1.fa file2.fa > combined_uniq.fa
Add size as comment:
    # Place size information in sequence comment
    fu-uniq --size-as-comment input.fa > commented.fa
=head1 NOTES
=over 4
=item * Memory usage scales with the number of unique sequences
=item * Sequence comparison is case-insensitive
=item * Size annotations in input files (;size=N;) are respected and combined
=back
=head1 MODERN ALTERNATIVE
This suite of tools has been superseded by B<SeqFu>, a compiled
program providing faster and safer tools for sequence analysis.
This suite is maintained for the higher portability of Perl scripts
under certain circumstances.
SeqFu is available at L<https://github.com/telatin/seqfu2>, and
can be installed with BioConda C<conda install -c bioconda seqfu>
=head1 CITING
Telatin A, Fariselli P, Birolo G.
I<SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files>.
Bioengineering 2021, 8, 59. L<https://doi.org/10.3390/bioengineering8050059>
=cut
=head1 AUTHOR
Andrea Telatin <andrea@telatin.com>
=head1 COPYRIGHT AND LICENSE
This software is Copyright (c) 2018-2027 by Quadram Institute Bioscience.
This is free software, licensed under:
  The MIT (X11) License
=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)