#!/usr/bin/env perl
use strict;
use Carp;
use Pod::Usage;

=head1 NAME

fasta-decoy.pl - decoy input databanks following several moethods


Reads input fasta file and produce a decoyed databank with several methods:

=over 4

=item reverse: simply reverse each sequence

=item shuffle: shuffle AA in each sequence

=item shuffle & avoid known cleaved peptides: shuffle sequence but avoid producing known tryptic peptides

=item Markov model: learn Markov model chain distribution of a given level, then produces entries corresponding to this distribution



  #reverse sequences for a local (optionaly compressed) file
  fasta-decoy.pl --in=/tmp/uniprot_sprot.fasta.gz --method=reverse

  #download databanks from the web | uncompress it and shuffle the sequence
  wget -silent -O - ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz | zcat |  databatanks-decoy.pl --method=shuffle

  #use a .dat file (with splice forms) as an input
  uniprotdat2fasta.pl --in=uniprot_sprot_human.dat | fasta-decoy.pl --method=markovmodel

  #reversing each sequence
  fasta-decoy.pl --ac-prefix=DECOY_ --in=mitoch.fasta --method=reverse --out=mitoch-reverse.fasta

  #drawing amino acid following distribution in original fasta (end of sequence is considered as a learned random event)
  fasta-decoy.pl --ac-prefix=DECOY_ --in=mitoch.fasta --method=markovmodel --markovmodel-level=0 --out=mitoch-markovmodel_0.fasta

  #drawing amino acid with a markov model (here of length 3)
  fasta-decoy.pl --ac-prefix=DECOY_ --in=mitoch.fasta --method=markovmodel --markovmodel-level=3 --out=mitoch-markovmodel_3.fasta

  #each sequence is randomly shuffled
  fasta-decoy.pl --ac-prefix=DECOY_ --in=mitoch.fasta --method=shuffle --out=mitoch-shuffle.fasta

  #idem, but no tryptic peptide (of length>=6) from the original bank must be found in the random one;
  => see script fasta-shuffle-notryptic.pl


=head3 --in=infile.fasta

An input fasta file (will be uncompressed if ending with gz)

=head3 -out=outfile.fasta

A .fasta file [default is stdout]

=head3 --method=(reverse|shuffle|markovmodel)

Set the decoying method

=head1 OPTIONS

=head2 --ac-prefix=string

Set a key to be prepended before the AC in the randomized bank. By default, it will be dependent on the choosen method.

=head2 --method=shuffle options

=head2 --method=markovmodel options

=head3 --markovmodel-level=int [default 3]

Set length of the model (0 means only AA distrbution will be respected, 3 means chains of length 3  distribution etc.). Setting a length >3 can deal to memory burnout.

=head2 misc

=head3 --norandomseed

Random generator seed is set to 0, so 2 run on same data will produce the same result

=head3 --noprogressbar

do not display terminal progress bar (if possible)

=head3 --help

=head3 --man

=head3 --verbose

Setting an environment variable DO_NOT_DELETE_TEMP=1 will keep the temporay file after the script exit

=head1 EXAMPLE


Copyright (C) 2004-2006  Geneva Bioinformatics www.genebio.com

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

=head1 AUTHORS

Alexandre Masselot, www.genebio.com


use SelectSaver;
use File::Basename;
use List::Util qw/shuffle/;

use Getopt::Long;
my ($acPrefix, $method, $noProgressBar);

my $markovmodel_level=2;

my $inFile='-';
my $outFile='-';
my $inFD;

my $outLength;
my $noSeed;

my($help, $man, $verbose);

if (!GetOptions(
		"out=s" => \$outFile,

		"method=s" => \$method,
		"ac-prefix=s" =>\$acPrefix,



		"noprogressbar" => \$noProgressBar,

                "help" => \$help,
                "man" => \$man,
                "verbose" => \$verbose,
    || $help || $man){

  pod2usage(-verbose=>2, -exitval=>2) if(defined $man);
  pod2usage(-verbose=>1, -exitval=>2);

my $nbentries=0;
my $nbpept;
srand(1)  if $noSeed;

die "no --method=methodname argument (see --help)" unless $method;

#init parsing progress bar
my ($pg, $size, $nextpgupdate, $readsize);
my $imaxreshuffle=1000;
my $imaxreshuffleFinal=10000;


#set output on default
my $saver;
if ($outFile ne '-'){
    open FDOUT, ">:gzip", $outFile or die "cannot open for writing gziped [$outFile]: $!";
    $saver=new SelectSaver(\*FDOUT);
    open (FDOUT, ">$outFile") or die "could not open for writing [$outFile]:$!";
    $saver=new SelectSaver(\*FDOUT);

if($method eq 'reverse'){
  $acPrefix ||='REV_';
  while((my ($head, $seq)=__nextEntry())[0]){
    $head=~s/>/>$acPrefix/ or die "entry header does not start with '>': $head";
    $seq=reverse $seq;
    print  "$head\n";
    print __prettySeq($seq)."\n";
  print STDERR "reverted $nbentries\n" if $verbose;
} elsif ($method eq 'markovmodel') {
  $acPrefix ||='MARKOVMODEL_';
  die "--markovmodel-level=int [$markovmodel_level] should be between 0 and 3" unless $markovmodel_level>=0 && $markovmodel_level<=3;

  my %chain;			#count and prob 
  while ((my ($head, $seq)=__nextEntry())[0]) {
    my $buf='';
    while ($seq=~/(.)/g) {
      my $p=pos $seq;
      my $q=$p-$markovmodel_level-1;
      $q=0 if $q<0;
      my $prec=substr $seq, $q, $p-$q-1;
      $chain{$prec || '^'}{$1}++;
  foreach my $h (values %chain) {
    my $tot=0;
    $tot+=$_ foreach values %$h;
    $h->{$_}=$h->{$_}*1.0/$tot foreach keys %$h;
    if ( -t STDIN && -t STDOUT){
      require Term::ProgressBar;
      $size=(stat $inFile)[7];
      $pg=Term::ProgressBar->new ({name=> "markovmodel generation",

  foreach (1..$nbentries) {
    $nextpgupdate=$pg->update($readsize) if $pg && $readsize>=$nextpgupdate;

    print ">$acPrefix$_\n";
    my $seq='';
    my $prec='';
    while (1) {
      my $hNext;
      my $crand=rand;
      my $nextAA='?';
      foreach $_ (keys %$hNext) {
	if ($crand<=0) {
      last if $nextAA eq '*';
      if ($markovmodel_level>0) {
      die "NO next AA generated" if $nextAA eq '?';
    print __prettySeq($seq)."\n";
} elsif ($method eq 'shuffle') {
  $acPrefix ||='SHFL_';
  while ((my ($head, $seq)=__nextEntry())[0]) {
    $head=~s/>/>$acPrefix/ or die "entry header does not start with '>': $head";
    $seq=join ('', shuffle  split(//, $seq));
    print  "$head\n";
    print __prettySeq($seq)."\n";
  print STDERR "shuffled $nbentries\n" if $verbose;
} else {
  die "unimplemented method [$method]";

sub __nextEntry{
  lock ($inFD);
  local $/="\n>";
  my $contents=<$inFD>;
  return undef unless $contents;

  $readsize+=length $contents;
  $nextpgupdate=$pg->update($readsize) if $pg && $readsize>=$nextpgupdate;

  chomp $contents;
  $contents=">$contents" unless $contents=~/^>/;
  my ($head, $seq)=split /\n/,$contents,2;
  return ($head, $seq);

sub __prettySeq{
  return $_;

sub __setInput{
    if ((!$noProgressBar) && ($inFile !~ /gz$/i) && ($inFile ne '-')&& -t STDIN && -t STDOUT){
      require Term::ProgressBar;
      $size=(stat $inFile)[7];
      $pg=Term::ProgressBar->new ({name=> "parsing ".basename($inFile),
  if ($@){
    warn "could not use Term::ProgressBar (consider installing the module for more interactive use";
  #set input
  if ($inFile eq '-'){
    require PerlIO::gzip;
    open $inFD, "<:gzip", "$inFile" or die $!;
    open ($inFD, "<$inFile") or die "cannot open for reading [$inFile]: $!";