—#!/usr/bin/perl -w
use
POSIX;
use
Pod::Usage;
use
FAST;
use
FAST::Bio::SeqIO;
use
FAST::List::Gen
qw/range/
;
## version 0.974 with changes to tests not not fail w perl >= 5.19.0.
use
strict;
$VERSION
=
$FAST::VERSION
;
$DESC
=
"Select and reorder sequence record data based on index lists and ranges.\n"
;
$NAME
= $0;
$NAME
=~ s/^.*\///;
$COMMAND
=
join
" "
,
$NAME
,
@ARGV
;
$DATE
= POSIX::strftime(
"%c"
,
localtime
());
## DEFAULT OPTION VALUES
my
$def_format
=
$FAST::DEF_FORMAT
;
#7/1/13 "fasta";
my
$def_logname
=
$FAST::DEF_LOGNAME
;
#7/1/13 "FAST.log.txt";
my
$def_split_field_regex
=
$FAST::DEF_SPLIT_REGEX
;
# ' '
my
$def_split_other_regex
=
''
;
my
$def_field_join
=
$FAST::DEF_JOIN_STRING
;
# " "
my
$def_other_join
=
''
;
## OPTION VARIABLES
my
$man
=
undef
;
# --man
my
$format
=
$def_format
;
my
$help
=
undef
;
# -h
my
$moltype
=
undef
;
# -m, in case bioperl can't tell
my
$log
=
undef
;
# -l
my
$logname
=
$def_logname
;
# -L
my
$comment
=
undef
;
# -C
my
$strict
=
undef
;
# --strict
my
$identifier
=
undef
;
# -i
my
$description
=
undef
;
# -d
my
$field
=
undef
;
# -f
my
$split_on_regex
=
undef
;
# -S
my
$annotate
=
undef
;
# -a
my
$join
=
undef
;
# -j
GetOptions(
'help|h'
=> \
$help
,
'man'
=> \
$man
,
'moltype|m=s'
=>
sub
{
my
(
undef
,
$val
) =
@_
;
die
"$NAME: --moltype or -m option argument must be \"dna\", \"rna\" or \"protein\""
unless
$val
=~ /dna|rna|protein/i;
$moltype
=
$val
;
},
'log|l'
=> \
$log
,
'logname|L=s'
=> \
$logname
,
'comment|C=s'
=> \
$comment
,
's|strict'
=> \
$strict
,
'identifier|i'
=> \
$identifier
,
'description|d'
=> \
$description
,
'field|f'
=> \
$field
,
'split-on-regex|S=s'
=> \
$split_on_regex
,
'join|j=s'
=> \
$join
,
'annotate|a'
=> \
$annotate
,
'fastq|q'
=>
sub
{
$format
=
'fastq'
;},
)
or
exit
(1);
pod2usage(
-verbose
=> 1)
if
$help
;
pod2usage(
-verbose
=> 2)
if
$man
;
my
$fromSTDIN
= ((-t STDIN) ? false : true);
pod2usage(
"$NAME: expects one range-string argument and at least one input filename or glob.\n"
)
if
(!(
$fromSTDIN
) && (
@ARGV
< 2));
pod2usage(
"$NAME: expects one range-string argument when input is on STDIN.\n"
)
if
(
$fromSTDIN
&&
@ARGV
!= 1);
&FAST::log
(
$logname
,
$DATE
,
$COMMAND
,
$comment
,
$fromSTDIN
)
if
(
$log
);
##PARSE MULTICOORDS
my
@RANGES
= ();
my
$ranges
=
shift
@ARGV
;
if
(
$ranges
=~ /,/) {
@RANGES
=
split
(/,/,
$ranges
);
}
else
{
$RANGES
[0] =
$ranges
;
}
my
@ranges
= ();
foreach
my
$range
(
@RANGES
) {
my
(
$b
,
$e
,
$by
);
if
(
$range
=~ /^(-?\d+)?(\.\.|:|-)(-?\d+)?(:(-?\d+))?$/) {
if
($1) {
$b
= $1 + 0;
}
else
{
$b
= 1;
}
if
($3) {
$e
= $3 + 0;
}
else
{
$e
= -1;
}
if
($5) {
$by
= $5 + 0;
}
else
{
$by
=
undef
;
}
}
elsif
(
$range
=~ /^([\d-]+)$/){
$b
= $1 + 0;
$e
=
$b
;
$by
=
undef
;
}
else
{
die
"$NAME: Bad range \"$range\"\n"
;
}
push
@ranges
, [
$b
,
$e
,
$by
];
}
my
(
$selector
,
$type
);
if
(
$field
) {
$selector
=
"desc"
;
$type
=
"description field"
;
unless
(
defined
$split_on_regex
) {
$split_on_regex
=
$def_split_field_regex
;
}
}
elsif
(
$description
) {
$selector
=
"desc"
;
$type
=
"description"
;
unless
(
defined
$split_on_regex
) {
$split_on_regex
=
$def_split_other_regex
;
}
}
elsif
(
$identifier
) {
$selector
=
"id"
;
$type
=
"identifier"
;
unless
(
defined
$split_on_regex
) {
$split_on_regex
=
$def_split_other_regex
;
}
}
else
{
$selector
=
"seq"
;
$type
=
"sequence"
;
unless
(
defined
$split_on_regex
) {
$split_on_regex
=
$def_split_other_regex
;
}
}
my
$split_re
=
qr/$split_on_regex/
;
my
$OUT
= FAST::Bio::SeqIO->newFh(
'-format'
=>
$format
);
my
$IN
;
unless
(
@ARGV
) {
if
(
$moltype
) {
$IN
= FAST::Bio::SeqIO->new(
-fh
=>
*STDIN
{IO},
'-format'
=>
$format
,
'-alphabet'
=>
$moltype
);
}
else
{
$IN
= FAST::Bio::SeqIO->new(
-fh
=>
*STDIN
{IO},
'-format'
=>
$format
);
}
}
while
(
$IN
or
@ARGV
) {
if
(
@ARGV
) {
my
$file
=
shift
(
@ARGV
);
unless
(-e
$file
) {
warn
"$NAME: Could not find file $file. Skipping.\n"
;
next
;
}
elsif
(
$moltype
) {
$IN
= FAST::Bio::SeqIO->new(
-file
=>
$file
,
'-format'
=>
$format
,
'-alphabet'
=>
$moltype
);
}
else
{
$IN
= FAST::Bio::SeqIO->new(
-file
=>
$file
,
'-format'
=>
$format
);
}
}
if
(
$IN
) {
record:
while
(
my
$seq
=
$IN
->next_seq()) {
my
$olddata
=
$seq
->
$selector
();
my
$seqid
=
$seq
->id();
# for error messages if necessary
my
$length
;
my
@olddata
= ();
my
@oldqual
= ();
#used to store quality data if cutting sequence
if
(
$field
) {
@olddata
=
split
$split_re
,
$olddata
;
#if ($selector eq 'seq' and $format eq 'fastq'){
#my $length_seq;
#my @oldqual_split = split ' ', $seq->qual_text(); #store qual data for cutting
#foreach my $seq_length (@olddata) {
# $length_seq = length($seq_length);
# push @oldqual, splice (@oldqual_split,,$length_seq);
#}
#}
$length
=
scalar
@olddata
;
}
else
{
@olddata
=
split
$split_re
,
$olddata
;
$length
=
length
$olddata
;
if
(
$selector
eq
'seq'
and
$format
eq
'fastq'
){
@oldqual
=
split
' '
,
$seq
->qual_text();
#store qual data for cutting
}
}
my
@newdata
= ();
my
@newqual
= ();
my
$newdata
;
my
$newqual
;
my
$newseq
;
foreach
my
$range
(
@ranges
){
# if ($spawn) {
# @newdata = ();
# }
my
(
$b
,
$e
,
$by
) =
@$range
;
## interpret negative indices
$b
= (
$b
< 0 ?
$length
+
$b
+ 1 :
$b
);
$e
= (
$e
< 0 ?
$length
+
$e
+ 1 :
$e
);
## check length bounds
if
(
$strict
) {
## strict; warns and skips sequence if indices are out of bounds,
if
(!
$by
&&
$e
<
$b
) {
warn
"$NAME: Skipped sequence $seqid with $type length $length: bad range with end $e less than $b.\n"
;
next
record;
}
if
(
$b
< 1 or
$e
< 1 or
$b
>
$length
or
$e
>
$length
) {
warn
"$NAME: Skipped sequence $seqid with $type length $length: indices $b and/or $e out-of-bounds.\n"
;
next
record;
}
}
else
{
## otherwise, returns longest possible result within range
if
(!
$by
&&
$e
<
$b
) {
next
;
}
if
(
$b
< 1) {
$b
= 1;
}
elsif
(
$b
>
$length
) {
next
;
}
if
(
$e
>
$length
) {
$e
=
$length
;
}
elsif
(
$e
< 0) {
next
;
}
}
$b
-= 1;
$e
-= 1;
my
@slice
= ();
if
(
$b
==
$e
) {
push
@newdata
,
$olddata
[
$b
];
if
(
$selector
eq
'seq'
&&
$format
eq
'fastq'
){
push
@newqual
,
$oldqual
[
$b
];
}
}
elsif
(
$by
) {
push
@newdata
,
@olddata
[@{ FAST::List::Gen::range (
$b
,
$e
,
$by
) }];
if
(
$selector
eq
'seq'
&&
$format
eq
'fastq'
){
push
@newqual
,
@oldqual
[@{ FAST::List::Gen::range (
$b
,
$e
,
$by
) }];
}
}
else
{
push
@newdata
,
@olddata
[@{ FAST::List::Gen::range (
$b
,
$e
) }];
if
(
$selector
eq
'seq'
&&
$format
eq
'fastq'
){
push
@newqual
,
@oldqual
[@{ FAST::List::Gen::range (
$b
,
$e
) }];
}
}
}
## if no @olddata, don't print
if
(
@newdata
== 0 ||
@newdata
== 1 && not
defined
$newdata
[0]) {
warn
"$NAME: empty data slice on sequence $seqid. Skipping.\n"
;
next
;
}
if
(
$field
&&
defined
$join
&&
$join
&&
$join
eq
'\t'
){
$join
=
"\t"
;
}
elsif
(not
defined
$join
) {
if
(
$field
) {
$join
=
$def_field_join
;
}
else
{
$join
=
$def_other_join
;
}
}
$newdata
=
join
$join
,
@newdata
;
if
(
$format
eq
'fastq'
){
if
(
$moltype
) {
$newseq
= new FAST::Bio::Seq::Quality(
-seq
=>
$seq
->seq,
-id
=>
$seq
->id,
-desc
=>
$seq
->desc,
-qual
=>
$seq
->qual,
-alphabet
=>
$moltype
);
}
else
{
$newseq
= new FAST::Bio::Seq::Quality(
-seq
=>
$seq
->seq,
-id
=>
$seq
->id,
-desc
=>
$seq
->desc,
-qual
=>
$seq
->qual);
}
}
else
{
if
(
$moltype
) {
$newseq
= new FAST::Bio::Seq(
-seq
=>
$seq
->seq,
-id
=>
$seq
->id,
-desc
=>
$seq
->desc,
-alphabet
=>
$moltype
);
}
else
{
$newseq
= new FAST::Bio::Seq(
-seq
=>
$seq
->seq,
-id
=>
$seq
->id,
-desc
=>
$seq
->desc);
}
}
$newseq
->
$selector
(
$newdata
);
if
(
$selector
eq
'seq'
&&
$format
eq
'fastq'
) {
$newseq
->qual(\
@newqual
);}
$OUT
$newseq
;
}
undef
$IN
;
}
}
__END__
=head1 NAME
B<fascut> - select biosequence record data by character or field ranges
=head1 SYNOPSIS
B<fascut> [OPTION]... [INDEX-SET] [MULTIFASTA-FILE]...
=head1 RANGE SPECIFICATION
RANGE-LIST : { RANGE | RANGE,RANGE-LIST }
RANGE : { INDEX | FROM>SEP<TO | FROM>SEP<TO:BY }
>SEP< : { .. | : | - }
INDEX : nonzero integer
FROM,TO : nonzero integer or the empty string
-- positive integers count from data first character or field
-- negative integers count back from data last character or field
BY : nonzero integer
-- positive integers step forward
-- negative integers step backward
=head1 DESCRIPTION
B<fascut> takes biological sequence records on input, and on output,
transforms one component (e.g. sequence, description or identifier) of
each record as the concatenation of index- and range-based selections
of input data. Data is indexed by character or optionally by field. By
default, B<fascut> operates character-wise on sequence data. The
sequence of data selections is specified in the first argument as a
comma-separated list of ranges. A range is either a single index
integer or a range of integers, with reversals and variable step sizes
allowed. Arbitrary repetition of indices is allowed across ranges in
the range-list. B<fascut> outputs the concatenation of the ordered
data selections specified by each range.
The one mandatory argument to B<fascut> is a sequence of indices or
ranges separated by commas (,). The ranges may be specified in
Perl-style (or Genbank coordinate style)llike "from..to", in
R/Octave-style like "from:to" or UNIX B<cut>-style as in "from-to". If
the index bounds of a simple range are missing, "from" defaults to "1"
and "to" defaults to "-1". An optional ":by" suffix specifies a
non-zero integer step-increment, which may be positive or
negative. Negative step-increments imply a reversal of input data.
B<One-based indexing> applies consistently in B<fascut> whether
indexing data by character (like B<cut -c>) or by field. B<fascut -f>
cuts descriptions by field, after descriptions are split by strings of
white-space, or optionally a user-defined Perl regex. Negative
indices, starting with -1, count backwards from last characters and
fields.
After converting negative indices to positive indices for any given
sequence on input, B<fascut> expects the "from" parameter to be less
than its respective "to" parameter for every range unless a negative
"by" parameter is given, in which case "from" is expected to be
greater than "to". If any of this is violated for a given range and
sequence, then that range will be skipped for that sequence. If after
conversions, "from" is negative, it defaults to "1". If "to" is
greater than a sequence length, it defaults to "-1". However, if
strict-mode (-s) is enabled, then both types of bad range
specifications will abort processing of a sequence record with no
output for that sequence except a warning.
When the range-list argument starts with a negative index integer, you
will need to terminate option processing with "--" before supplying
the range-string argument to B<fascut>.
Options specific to B<fascut>:
-B<i>, B<--identifier> cuts identifiers (by character)
-B<d>, B<--description> cuts descriptions (by character)
-B<f>, B<--field> cuts descriptions (by field)
-B<S>, B<--split-on-regex>=<regex> split description to fields using <regex>
-B<j>, B<--join>=<string> join selected field ranges using <string>
-B<s>, B<--strict> strict range-checking; skips sequences with
warnings if ranges are out-of-bounds
Options general to FAST:
-B<h>, B<--help> print a brief help message
B<--man> print full documentation
B<--version> print version
B<-l>, B<--log> create/append to logfile
-B<L>, B<--logname>=<string> use logfile name <string>
-B<C>, B<--comment>=<string> save comment <string> to log
B<--format>=<format> use alternative format for input
B<--moltype>=<[dna|rna|protein]> specify input sequence type
B<-q>, B<--fastq> use fastq format as input and output
=head1 INPUT AND OUTPUT
B<fascut> is part of FAST, the FAST Analysis of Sequences Toolbox, based
on Bioperl. Most core FAST utilities expect input and return output in
multifasta format. Input can occur in one or more files or on
STDIN. Output occurs to STDOUT. The FAST utility B<fasconvert> can
reformat other formats to and from multifasta.
=head1 OPTIONS
=over 8
=item B<-i>
B<--identifier>
Cut identifiers by character. Use the B<-S> option to alter how the
identifier data is split.
=item B<-d>
B<--description>
Cut descriptions by character. Use the B<-f> option to split
descriptions by strings of whitespace instead. Use the B<-S> option to
split the description with an arbitrary regex.
=item B<-f>
B<--field>
Cut descriptions by field. By default, the description is split on
strings of white-space.
=item B<-S regex >
B<--split-on-regex regex>
Use B<regex> to split data. Special characters in the regex option
argument must be quoted to protect them from the shell.
=item B<-j string>
B<--join=string>
Paste selected fields together with string B<string> for new
description. Default is a single space character " ". Use "\t" to
indicate a tab-character.
=item B<-s>
B<--strict>
This option will implement strict range checking on the
coordinates. When used this option will skip any sequences for which
the coordinates are out of range (the default is to output the longest
valid subsequence contained within the range).
=item B<-h>,
B<--help>
Print a brief help message and exit.
=item B<--man>
Print the manual page and exit.
=item B<--version>
Print version information and exit.
=item B<-l>,
B<--log>
Creates, or appends to, a generic FAST logfile in the current working
directory. The logfile records date/time of execution, full command
with options and arguments, and an optional comment.
=item B<-L [string]>,
B<--logname=[string]>
Use [string] as the name of the logfile. Default is "FAST.log.txt".
=item B<-C [string]>,
B<--comment=[string]>
Include comment [string] in logfile. No comment is saved by default.
=item B<--format=[format]>
Use alternative format for input. See man page for "fasconvert" for
allowed formats. This is for convenience; the FAST tools are designed
to exchange data in Fasta format, and "fasta" is the default format
for this tool.
=item B<-m [dna|rna|protein]>,
B<--moltype=[dna|rna|protein]>
Specify the type of sequence on input (should not be needed in most
cases, but sometimes Bioperl cannot guess and complains when
processing data).
=item B<-q>
use fastq format as input and output.
=back
=head1 EXAMPLES
Example 1:
=over 8
B<fascut> 4 < in.fas > out.fas
=back
Example 2:
=over 8
B<fascut> 1..8 < in.fas > out.fas
=back
Get all but the first 3 bases/aas:
=over 8
B<fascut> 4..-1 < in.fas > out.fas
=back
Get the last 3 bases/aas:
=over 8
B<fascut> -- -3..-1 < in.fas > out.fas
=back
Get the last 3 bases/aas to position 1000 if possible:
=over 8
B<fascut> -- -3..1000 < in.fas > out.fas
=back
Example 3:
=over 8
B<fascut> 1..4,2..6,-1 < in.fas > out.fas
=back
=head1 SEE ALSO
=over 8
=item C<man perlre>
=item C<perldoc perlre>
Documentation on perl regular expressions.
=item C<man FAST>
=item C<perldoc FAST>
Introduction and cookbook for FAST
=item L<The FAST Home Page|http://compbio.ucmerced.edu/ardell/FAST>"
=back
=head1 CITING
If you use FAST, please cite I<Lawrence et al. (2015). FAST: FAST Analysis of
Sequences Toolbox.> and Bioperl I<Stajich et al.>.
=cut