bin/faeq-seq - metacpan.org


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
—
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
              #!/usr/bin/env perl
#Copyright (c) 2010 Joachim Bargsten <code at bargsten dot org>. All rights reserved.
use warnings;
use strict;
use 5.010;
use Carp;
use Getopt::Long;
use Pod::Usage;
use Data::Dumper;
use Bio::Gonzales::Seq::Filter::DuplicateSeqs;
use Bio::Gonzales::Seq::IO qw(faiterate);
our ( $help, $filter, $eval );
GetOptions( "help|h|?" => \$help, "filter|f" => \$filter, "eval|e=s" => \$eval )
  or pod2usage( -verbose => 2 );
pod2usage( -verbose => 2, -noperldoc => 1 ) if $help;
our @files = @ARGV;
die "no fasta files supplied, try '$0 --help' for help" unless ( @files > 0 );
for (@files) {
  die "this is not a file, try '$0 --help' for help" unless ( -f $_ );
}
#flush output text imediately
local $| = 1;
my $counter = 0;
print STDERR "Finding duplicates";
our $dup_seq = Bio::Gonzales::Seq::Filter::DuplicateSeqs->new;
$dup_seq->normalizer(
  sub {
    local $_ = $_[0]->seq;
    eval $eval;
    return $_;
  }
) if ( defined $eval );
for my $file (@files) {
  $dup_seq->namespace($file) if ( @files > 1 );
  my $fit = faiterate($file);
  while ( my $so = $fit->() ) {
    $dup_seq->add_seq($so);
    print STDERR "." if $counter++ % 100 == 0;
  }
}
print STDERR "\n";
#filter out unique seqs
#my @deleted_seqs = map { delete $eq{$_} if ( @{ $eq{$_} } < 2 ) } keys %eq;
my @deleted_seqs = @{ $dup_seq->seq_groups_wo_duplicates };
print STDERR "Writing to stdout";
#print unique ids if filter is active
if ($filter) {
  for my $del_seq_objs (@deleted_seqs) {
    print_filtered($del_seq_objs);
    print STDERR "." if $counter++ % 100 == 0;
  }
}
#calculate also some statistics
my $num_identical_seqs = $dup_seq->num_duplicate_seqs;
my $num_unique_seqs    = $dup_seq->num_groups_w_duplicates;
#my %stats;
#update_stats($seq_objs, ...)
for my $seq_objs ( @{ $dup_seq->seq_groups_w_duplicates } ) {
  if ($filter) {
    print_filtered($seq_objs);
  } else {
    print_ids($seq_objs);
  }
  print STDERR "." if $counter++ % 100 == 0;
}
print STDERR "\n";
#init only if needed
our $seq_writer;
sub print_filtered {
  my ($seq_objs) = @_;
  $seq_writer = Bio::SeqIO->new(
    -format => 'fasta',
    -fh     => \*STDOUT,
  ) unless ($seq_writer);
  return unless ($seq_objs);
  $seq_writer->write_seq($seq_objs->[0] );
}
#FIXME better statistics
#sub update_stats {
  #my ($seq_objs, $stats, $is_duplicated) = @_;
  #my %tmp_stat;
  #if($is_duplicated) {
    ## count number of duplicated sequences per namespace
    #$tmp_stat
  #}
  #for my $so ( @{$seq_objs} ) {
    #$stats{$so->info->{namespace}} //= { };
    #if($is_duplicated) {
    #}
    #print "\t" if ( not $first );
    #say stringify_seq_obj($seq_obj);
    #undef $first;
  #}
#}
sub print_ids {
  my ($seq_objs) = @_;
  my $first = 1;
  for my $seq_obj ( @{$seq_objs} ) {
    print "\t" if ( not $first );
    say stringify_seq_obj($seq_obj);
    undef $first;
  }
}
sub stringify_seq_obj {
  my ($so) = @_;
  return join "\t", ( $so->id, ( $so->desc // "" ), ( $so->info->{namespace} // "" ) );
}
say STDERR
  "Found a total number of $num_identical_seqs identical sequences, which can be reduced to $num_unique_seqs. ";
say STDERR ( $num_identical_seqs - $num_unique_seqs ) . " sequences redundant.";
__END__
=head1 NAME
fafind-eq-seq - find equal sequences
=head1 SYNOPSIS
    ./fafind-eq-seq [--help] [--eval 'perlcode'] <file1> [<file2> ... <fileN>] >file_with_results.txt
    ./fafind-eq-seq [--help] [--eval 'perlcode'] --filter <file1> [<file2> ... <fileN>] >file_with_only_unique_seqs.fasta
=head1 DESCRIPTION
Find identical / equal sequences in a given set of fasta files. Info messages
go to standard error (stderr), results to standard output (stdout).
The result output of F<file_with_results.txt> consists of lines following the pattern
    <ID> <DESCRIPTION><TAB><FILE>
    <TAB><ID> <DESCRIPTION><TAB><FILE>
    <TAB><ID> <DESCRIPTION><TAB><FILE>
    <ID> <DESCRIPTION><TAB><FILE>
    <TAB><ID> <DESCRIPTION><TAB><FILE>
    <TAB><ID> <DESCRIPTION><TAB><FILE>
    <ID> <DESCRIPTION><TAB><FILE>
    <TAB><ID> <DESCRIPTION><TAB><FILE>
    <TAB><ID> <DESCRIPTION><TAB><FILE>
whereas each unindented line and the following <TAB>-indented lines mark one
group of identical sequences.
=head1 OPTIONS
=over 4
=item B<--filter>
Do not print the groups but the sequences in fasta format instead. Duplicated
sequences are omitted. The resulting fasta output is not checked for identical
ids, etc.
I<Synonyms:> B<-f>
=item B<--help>
Display this message.
I<Synonyms:> B<-?>, B<-h>
=item B<--eval>
Manipulate input sequences on the fly. The current sequence string is set to
C<$_>.
This doesn't change the actual output sequence, e.g. on filtering.
Can be very handy for comparing aa-sequences from two different files, at
which one file uses * as stop codon and the other file not:
    ./fafind-eq-seq --eval 's/\*$//' <file1> <file2> >file_with_results.txt
I<Synonyms:> B<-e>
=back
=head1 AUTHOR
jw bargsten, C<< <joachim.bargsten at wur.nl> >>
=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)