From Code to Community: Sponsoring The Perl and Raku Conference 2025 Learn more

#!/usr/bin/env perl
# ABSTRACT: A script to calculate N50 from one or multiple FASTA/FASTQ files.
# PODNAME: n50
use 5.012;
use Term::ANSIColor qw(:constants colorvalid colored);
use FindBin qw($RealBin);
# The following placeholder is to be programmatically replaced with 'use lib "$RealBin/../lib"' if needed
#~loclib~
if ( -e "$RealBin/../lib/Proch/N50.pm" and -e "$RealBin/../Changes" ) {
use lib "$RealBin/../lib";
}
our %program = (
'NAME' => 'FASTX N50',
'AUTHOR' => 'Andrea Telatin',
'MAIL' => 'andrea.telatin@gmail.com',
'VERSION' => '4.2.0',
);
my $hasJSON = undef;
my $t;
local $Term::ANSIColor::AUTORESET = 1;
my $opt_separator = "\t";
my $opt_format = 'default';
my %formats = (
'default' => 'Prints only N50 for single file, TSV for multiple files',
'tsv' => 'Tab separated output (file, seqs, total size, N50, min, max)',
'json' => 'JSON (JavaScript Object Notation) output',
'csv' => 'Alias for tsv and --separator ","',
'custom' => 'Custom format with --template STRING',
'screen' => 'Screen friendly table (requires Text::ASCIITable)',
'short' => 'Not implemented',
'full' => 'Not implemented',
);
my (
$opt_help,
$opt_version,
$opt_debug,
$opt_color,
$opt_nonewline,
$opt_noheader,
$opt_pretty,
$opt_basename,
$opt_template,
$opt_fullpath,
$opt_thousand_separator,
$opt_ne,
$opt_format_screen, # x: same as -f screen
$opt_format_json, # j: same as -f json
$opt_sort_by, # o
$opt_reverse_sort, # r
);
$opt_sort_by = 'N50';
$opt_reverse_sort = undef;
my %valid_sort_keys = (
'auN' => 1,
'Ne' => 1,
'N90' => 1,
'N75' => 1,
'N50' => 1,
'min' => 1,
'max' => 1,
'seqs' => 1,
'size' => 1,
'path' => 1,
);
$valid_sort_keys{"N$opt_ne"} = 1 if (defined $opt_ne);
my $tab = "\t";
my $new = "\n";
my $_opt = GetOptions(
'a|abspath' => \$opt_fullpath,
'b|basename' => \$opt_basename,
'c|color' => \$opt_color,
'd|debug' => \$opt_debug,
'e|Ne=i' => \$opt_ne,
'f|format=s' => \$opt_format,
'h|help' => \$opt_help,
'j|json' => \$opt_format_json,
'n|nonewline' => \$opt_nonewline,
'o|sortby=s' => \$opt_sort_by,
'p|pretty' => \$opt_pretty,
'r|reverse' => \$opt_reverse_sort,
's|separator=s' => \$opt_separator,
'q|thousands-sep' => \$opt_thousand_separator,
't|template=s' => \$opt_template,
'u|noheader' => \$opt_noheader,
'v|version' => \$opt_version,
'x|screen' => \$opt_format_screen,
);
$opt_sort_by = 'N50' if ( $opt_sort_by eq 'n50' );
pod2usage( { -exitval => 0, -verbose => 2 } ) if $opt_help;
version() if defined $opt_version;
our %output_object;
our %output_print;
# Added in v1.5: list accepted output formats programmatically
# [-f list] or [--format list] will print a list of accepted formats
if ( $opt_format eq 'list' ) {
say STDERR "AVAILABLE OUTPUT FORMATS:";
for my $f ( sort keys %formats ) {
# Use colors if allowed
if ($opt_color) {
print BOLD, $f, "\t";
# Print in RED unimplemented format
if ( $formats{$f} eq 'Not implemented' ) {
say RED $formats{$f}, RESET;
}
else {
say RESET, $formats{$f};
}
}
else {
say "$f\t$formats{$f}";
}
}
exit;
}
# No files provided, no version
unless ( defined $ARGV[0] or $opt_version ) {
say STDERR "\n Usage: n50 file.fa ... \n Type n50 --help for full manual";
}
# Shot output formats
die "Error: Please specify either -x (--screen) or -j (--json)\n"
if ( $opt_format_screen and $opt_format_json );
die "Error: -x (--screen) or -j (--json) are incompatible with -f (--format)\n"
if ( $opt_format ne 'default' and ( $opt_format_screen or $opt_format_json ) );
$opt_format = 'screen' if ($opt_format_screen);
$opt_format = 'json' if ($opt_format_json);
# Sorting by / reverse
unless ( defined $valid_sort_keys{$opt_sort_by} ) {
say STDERR " FATAL ERROR: Invalid sort key for -o ($opt_sort_by)";
say STDERR " Valid sort keys are: ", join( ', ', keys %valid_sort_keys );
die;
}
my %sorters = (
asc => sub {
$output_object{$b}{$opt_sort_by} <=> $output_object{$a}{$opt_sort_by};
},
desc => sub {
$output_object{$a}{$opt_sort_by} <=> $output_object{$b}{$opt_sort_by};
},
string_asc => sub { $a cmp $b },
string_desc => sub { $b cmp $a },
);
my $sorting_order;
if ( $opt_sort_by eq 'path' ) {
$sorting_order = 'string_asc';
$sorting_order = 'string_desc' if ($opt_reverse_sort);
}
else {
$sorting_order = 'asc';
$sorting_order = 'desc' if ($opt_reverse_sort);
}
debug("Sorting by <$opt_sort_by>: order=$sorting_order");
die("Unexpected fatal error:\nInvalid sort function \"$sorting_order\".\n")
if ( not defined $sorters{$sorting_order} );
my $sort_function = $sorters{$sorting_order};
if ( defined $opt_format ) {
$opt_format = lc($opt_format);
if ( !$formats{$opt_format} ) {
my @list = sort keys(%formats);
die " FATAL ERROR:\n Output format not valid (--format '$opt_format').\n Use one of the following: "
. join( ', ', @list ) . ".\n";
}
# IMPORT JSON ONLY IF NEEDED
if ( $opt_format eq 'json' ) {
$hasJSON = eval {
require JSON::PP;
JSON::PP->import();
1;
};
die "FATAL ERROR: Please install perl module JSON::PP first [e.g. cpanm JSON::PP]\n"
unless ($hasJSON);
}
# IMPORT ASCII TABLE ONLY IF NEEDE
if ( $opt_format eq 'screen' ) {
my $has_table = eval {
Text::ASCIITable->import();
$t = Text::ASCIITable->new();
my @cols = ('File', 'Seqs', 'Total bp', 'N50', 'min', 'max', 'N75', 'N90','auN');
push @cols, "N$opt_ne" if (defined $opt_ne);
$t->setCols( @cols );
1;
};
if ( !$has_table ) {
die
"ERROR:\nFormat 'screen' requires Text::ASCIITable installed.\n";
}
}
if ( $opt_format eq 'custom' and !defined $opt_template ) {
print STDERR " [WARNING] There is no default template at the moment!\n";
print STDERR
" Specify a --template STRING with --format custom or no output will be printed\n";
}
if ( $formats{$opt_format} eq 'Not implemented' ) {
print STDERR
" WARNING: Format '$opt_format' not implemented yet. Switching to 'tsv'.\n";
$opt_format = 'tsv';
}
}
foreach my $file (@ARGV) {
# Check if file exists / check if '-' supplied read STDIN
if ( ( ! -e "$file" ) and ( $file ne '-' ) ) {
die " FATAL ERROR:\n File not found ($file).\n";
} elsif (-d "$file") {
print STDERR "WARNING: Ignoring directory $file.\n";
next;
} elsif ( $file eq '-' ) {
# Set file to <STDIN>
$file = '-';
}
else {
# Open filehandle with $file
open STDIN, '<', "$file"
|| die " FATAL ERROR:\n Unable to open file for reading ($file).\n";
}
my $JSON = 0;
$JSON = 1 if (defined $opt_format and $opt_format =~ /JSON/ );
my $FileStats = Proch::N50::getStats( $file, $JSON, $opt_ne ) || 0;
# Validate answer: check {status}==1
if ( !$FileStats->{status} ) {
print STDERR "[WARNING]\tError parsing \"$file\". Skipped.\n";
if ($opt_debug) {
print Dumper $FileStats;
}
next;
}
say Dumper $FileStats if ($opt_debug);
if ( !defined $FileStats->{auN} ) {
say STDERR
"Fatal error: 'auN' statistics not calculated parsing \"$file\". Proch::N50 > 1.2.0 required, ", $Proch::N50::VERSION , " found.";
say STDERR Dumper $FileStats;
say STDERR $Proch::N50::VERSION;
die;
}
my $n50 = $FileStats->{N50} + 0;
my $n = $FileStats->{seqs} + 0;
my $slen = $FileStats->{size} + 0;
my $min = $FileStats->{min} + 0;
my $max = $FileStats->{max} + 0;
my $n75 = $FileStats->{N75} + 0;
my $n90 = $FileStats->{N90} + 0;
my $auN = $FileStats->{auN} + 0;
my $Ne = defined $opt_ne ? $FileStats->{Ne} : undef;
say STDERR "[$file]\tTotalSize:$slen;N50:$n50;Sequences:$n" if ($opt_debug);
$file = basename($file) if ($opt_basename);
$file = File::Spec->rel2abs($file) if ($opt_fullpath);
my %metrics = (
'seqs' => $n,
'N50' => $n50,
'size' => $slen,
'min' => $min,
'max' => $max,
'auN' => $auN,
'N75' => $n75,
'N90' => $n90,
);
$metrics{"N$opt_ne"} = $Ne if (defined $opt_ne);
if ( defined $output_object{$file} ) {
print STDERR " WARNING: ",
"Overwriting '$file': multiple items with the same filename. Try not using -b/--basename.\n";
}
$output_object{$file} = \%metrics;
$output_print{$file} = {};
for my $key ( keys %{ $output_object{$file} } ) {
if ( $opt_thousand_separator or $opt_format eq 'screen' ) {
$output_print{$file}{$key} = thousands($output_object{$file}{$key} );
} else {
$output_print{$file}{$key} = $output_object{$file}{$key} ;
}
}
}
my $file_num = scalar keys %output_object;
# Format Output
if ( not $opt_format or $opt_format eq 'default' ) {
debug("Activating format <default>");
# DEFAULT: format
if ( $file_num == 1 ) {
# If only one file is supplied, just return N50 (to allow easy pipeline parsing)
my @keys = keys %output_object;
if ($opt_nonewline) {
print $opt_thousand_separator
? thousands( $output_object{ $keys[0] }{'N50'} )
: $output_object{ $keys[0] }{'N50'};
}
else {
say $opt_thousand_separator
? thousands( $output_object{ $keys[0] }{'N50'} )
: $output_object{ $keys[0] }{'N50'};
}
}
else {
# Print table
foreach my $r ( sort $sort_function keys %output_object ) {
say $r, $opt_separator,
$opt_thousand_separator
? thousands( $output_print{$r}{'N50'} )
: $output_print{$r}{'N50'};
}
}
}
elsif ( $opt_format eq 'json' ) {
# Print JSON object
my $json = JSON::PP->new->ascii->allow_nonref;
my $json_printed = $json->encode( \%output_print );
if ($opt_pretty) {
$json_printed = $json->pretty->encode( \%output_print );
}
say $json_printed;
}
elsif ( $opt_format eq 'tsv' or $opt_format eq 'csv' ) {
$opt_separator = ',' if ( $opt_format eq 'csv' );
# TSV format
my @fields = ( 'path', 'seqs', 'size', 'N50', 'min', 'max', 'N75', 'N90','auN');
push(@fields, "N$opt_ne") if (defined $opt_ne);
say '#', join( $opt_separator, @fields ) if ( !defined $opt_noheader );
foreach my $r ( sort $sort_function keys %output_object ) {
print $r, $opt_separator;
for ( my $i = 1 ; $i <= $#fields ; $i++ ) {
if ($opt_thousand_separator) {
print '"' if ( $opt_format eq 'csv' or $opt_separator eq ',' );
print thousands( $output_object{$r}{ $fields[$i] } )
if ( defined $output_object{$r}{ $fields[$i] } );
print '"' if ( $opt_format eq 'csv' or $opt_separator eq ',' );
}
else {
print $output_object{$r}{ $fields[$i] }
if ( defined $output_object{$r}{ $fields[$i] } );
}
if ( ( $i == $#fields ) and ( not $opt_nonewline ) ) {
print "\n";
}
else {
print $opt_separator;
}
}
}
}
elsif ( $opt_format eq 'custom' ) {
my @fields = ( 'seqs', 'size', 'N50', 'min', 'max','N75', 'N90','auN' );
push(@fields, "N$opt_ne") if (defined $opt_ne);
# Format: custom (use tags {new}{tab} {path} ...)
foreach my $r ( sort $sort_function keys %output_object ) {
my $output_string = '';
$output_string .= $opt_template if ( defined $opt_template );
$output_string =~ s/(\{new\}|\{n\}|\\n)/$new/g
if ( $output_string =~ /(\{new\}|\{n\}|\\n)/ );
$output_string =~ s/(\{tab\}|\{t\}|\\t)/$tab/g
if ( $output_string =~ /(\{tab\}|{t}|\\t)/ );
$output_string =~ s/\{path\}/$r/g if ( $output_string =~ /\{path\}/ );
foreach my $f (@fields) {
$output_string =~ s/\{$f\}/$output_print{$r}{$f}/g;
}
print $output_string;
}
}
elsif ( $opt_format eq 'screen' ) {
my @fields = ( 'path', 'seqs', 'size', 'N50', 'min', 'max', 'N75', 'N90','auN' );
push(@fields, "N$opt_ne") if (defined $opt_ne);
#my $field = 'N50';
foreach my $r ( sort $sort_function keys %output_object ) {
my @array;
push( @array, $r );
for ( my $i = 1 ; $i <= $#fields ; $i++ ) {
if ( defined $output_print{$r}{ $fields[$i] } ) {
push( @array, $output_print{$r}{ $fields[$i] } )
} else {
say Dumper $output_print{$r};
die "$fields[$i] not defined";
}
}
$t->addRow(@array);
}
print $t;
}
# Print debug information
sub debug {
return unless defined $opt_debug;
my ( $message, $title ) = @_;
$title = 'INFO' unless defined $title;
$title = uc($title);
printMessage( $message, $title, 'green', 'yellow' );
return 1;
}
# Print message with colors unless --nocolor
sub printMessage {
my ( $message, $title, $title_color, $message_color ) = @_;
if (! defined $title_color) {
$title_color = 'reset';
} elsif (! colorvalid($title_color)) {
say STDERR "$title_color not valid title color";
$title_color = 'reset';
}
if (! defined $message_color) {
$message_color = 'reset';
} elsif (! colorvalid($message_color)) {
say STDERR "$message_color not valid title color";
$message_color = 'reset';
}
print STDERR colored( "$title", $title_color ), "\t" if ($title);
say colored( "$message", $message_color );
return 1;
}
sub version {
printMessage( "$program{NAME}, ver. $program{VERSION}",
'', undef, 'bold blue' );
printMessage("Program to calculate N50 from multiple FASTA/FASTQ files,\n" .
);
printMessage("Using Proch::N50 $Proch::N50::VERSION", '', undef, 'red');
return $program{VERSION};
}
# Calculate N50 from a hash of contig lengths and their counts
sub n50fromHash {
my ( $hash_ref, $total ) = @_;
my $tlen = 0;
foreach my $s ( sort { $a <=> $b } keys %{$hash_ref} ) {
$tlen += $s * ${$hash_ref}{$s};
# In my original implementation it was >=, here > to comply with 'seqkit'
return $s if ( $tlen > ( $total / 2 ) );
}
return 0;
}
sub thousands {
my ($number) = @_;
my ($int, $dec) = split /\./, $number;
$int =~ s/(\d{1,3}?)(?=(\d{3})+$)/$1,/g;
$dec = $dec ? ".$dec" : '';
return "$int$dec";
}
# Heng Li's subroutine (edited)
sub readfq {
my ( $fh, $aux ) = @_;
@$aux = [ undef, 0 ] if ( !(@$aux) );
return if ( $aux->[1] );
if ( !defined( $aux->[0] ) ) {
while (<$fh>) {
chomp;
if ( substr( $_, 0, 1 ) eq '>' || substr( $_, 0, 1 ) eq '@' ) {
$aux->[0] = $_;
last;
}
}
if ( !defined( $aux->[0] ) ) {
$aux->[1] = 1;
return;
}
}
my $name = '';
if ( defined $_ ) {
$name = /^.(\S+)/ ? $1 : '';
}
my $seq = '';
my $c;
$aux->[0] = undef;
while (<$fh>) {
chomp;
$c = substr( $_, 0, 1 );
last if ( $c eq '>' || $c eq '@' || $c eq '+' );
$seq .= $_;
}
$aux->[0] = $_;
$aux->[1] = 1 if ( !defined( $aux->[0] ) );
return ( $name, $seq ) if ( $c ne '+' );
my $qual = '';
while (<$fh>) {
chomp;
$qual .= $_;
if ( length($qual) >= length($seq) ) {
$aux->[0] = undef;
return ( $name, $seq, $qual );
}
}
$aux->[1] = 1;
return ( $name, $seq );
}
__END__
=pod
=encoding UTF-8
=head1 NAME
n50 - A script to calculate N50 from one or multiple FASTA/FASTQ files.
=head1 VERSION
version 1.7.0
=head1 SYNOPSIS
n50.pl [options] [FILE1 FILE2 FILE3...]
=head1 DESCRIPTION
This program parses a list of FASTA/FASTQ files calculating for each one the
number of sequences, the sum of sequences lengths and the N50, N75, N90 and auN*.
It will print the result in different formats, by default only the N50 is printed
for a single file and all metrics in TSV format for multiple files.
=head1 PARAMETERS
=over 12
=item I<-o, --sortby>
Sort by field: 'N50' (default), 'min', 'max', 'seqs', 'size', 'path'.
By default will be descending for numeric fields, ascending for 'path'.
See C<-r, --reverse>.
=item I<-r, --reverse>
Reverse sort (see: C<-o>);
=item I<-f, --format>
Output format: default, tsv, json, custom, screen.
See below for format specific switches. Specify "list" to list available formats.
=item I<-e>
Also calculate a custom N{e} metric. Expecting an integer 0 < e < 100.
=item I<-s, --separator>
Separator to be used in 'tsv' output. Default: tab.
The 'tsv' format will print a header line, followed by a line for each file
given as input with: file path, as received, total number of sequences,
total size in bp, and finally N50.
=item I<-b, --basename>
Instead of printing the path of each file, will only print
the filename, stripping relative or absolute paths to it. See C<-a>.
Warning: if you are reading multiple files with the same basename, only one will be printed.
This is the intended behaviour and you will only receive a warning.
=item I<-a, --abspath>
Instead of printing the path of each file, as supplied by
the user (can be relative), it will the absolute path.
Will override -b (basename). See C<-b>.
=item I<-u, --noheader>
When used with 'tsv' output format, will suppress header
line.
=item I<-n, --nonewline>
If used with 'default' (or 'csv' output format), will NOT print the
newline character after the N50 for a single file. Useful in bash scripting:
n50=$(n50.pl filename);
=item I<-t, --template>
String to be used with 'custom' format. Will be used as template
string for each sample, replacing {new} with newlines, {tab} with
tab and {N50}, {seqs}, {size}, {path} with sample's N50, number of sequences,
total size in bp and file path respectively (the latter will
respect --basename if used).
=item I<-q, --thousands-sep>
Add the thousands separator in all the printed numbers. Enabled by default
with --format screen (-x).
=item I<-p, --pretty>
If used with 'json' output format, will format the JSON
in pretty print mode. Example:
{
"file1.fa" : {
"size" : 290,
"N50" : 290,
"seqs" : 2
},
"file2.fa" : {
"N50" : 456,
"size" : 456,
"seqs" : 2
}
}
=item I<-h, --help>
Will display this full help message and quit, even if other
arguments are supplied.
=back
=head2 Output formats
These are the values for C<--format>.
=over 4
=item I<tsv> (tab separated values)
#path seqs size N50 min max
test2.fa 8 825 189 4 256
reads.fa 5 247 100 6 102
small.fa 6 130 65 4 65
=item I<csv> (comma separated values)
Same as C<--format tsv> and C<--separator ,>:
#path,seqs,size,N50,min,max
test.fa,8,825,189,4,256
reads.fa,5,247,100,6,102
small_test.fa,6,130,65,4,65
=item I<screen> (screen friendly)
Use C<-x> as shortcut for C<--format screen>. Enables --thousands-sep (-q) by default.
.-----------------------------------------------------------------------------------------.
| File | Seqs | Total bp | N50 | min | max | N75 | N90 | auN |
+---------------+------+----------+--------+-------+--------+-------+-------+-------------+
| big.fa | 4 | 18,359 | 11,840 | 2,167 | 11,840 | 2,176 | 2,167 | 8923.21,984 |
| sim1.fa | 39 | 18,864 | 679 | 20 | 971 | 408 | 313 | 733.51,389 |
| sim2.fa | 21 | 7,530 | 493 | 68 | 989 | 330 | 174 | 575.47,012 |
| test.fa | 8 | 825 | 189 | 4 | 256 | 168 | 168 | 260.99,515 |
'---------------+------+----------+--------+-------+--------+-------+-------+-------------'
=item I<json> (JSON)
Use C<-j> as shortcut for C<--format json>.
{
"data/sim1.fa" : {
"seqs" : 39,
"N50" : 679,
"max" : 971,
"N90" : 313,
"min" : 20,
"size" : 18864,
"auN" : 733.51389,
"N75" : 408
},
"data/sim2.fa" : {
"max" : 989,
"seqs" : 21,
"N50" : 493,
"N90" : 174,
"min" : 68,
"auN" : 575.47012,
"N75" : 330,
"size" : 7530
}
}
=item I<custom>
Will print the output using the template string provided with -t TEMPLATE.
Fields are in the C<{field_name}> format. C<{new}>/C<{n}>/C<\n> is the newline, C<{tab}>/C<{t}>/C<\t> is a tab.
All the keys of the JSON object are valid fields: C<{seqs}>, C<{N50}>, C<{min}>, C<{max}>, C<{size}>.
=back
=head1 EXAMPLE USAGES
Screen friendly table (C<-x> is a shortcut for C<--format screen>), sorted by N50 descending (default):
n50.pl -x files/*.fa
Screen friendly table, sorted by total contig length (C<--sortby max>) ascending (C<--reverse>):
n50.pl -x -o max -r files/*.fa
Tabular (tsv) output is default:
n50.pl -o max -r files/*.fa
A custom output format:
n50.pl data/*.fa -f custom -t '{path}{tab}N50={N50};Sum={size}{new}'
=head1 CITING
Telatin A, Fariselli P, Birolo G.
I<SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files>.
Bioengineering 2021, 8, 59. L<https://doi.org/10.3390/bioengineering8050059>
=head1 CONTRIBUTING, BUGS
The repository of this project is available at
=head1 AUTHOR
Andrea Telatin <andrea@telatin.com>
=head1 COPYRIGHT AND LICENSE
This software is Copyright (c) 2018-2027 by Quadram Institute Bioscience.
This is free software, licensed under:
The MIT (X11) License
=cut