#!/usr/bin/perl

# Mass spectrometry Perl program for splitting a peptSpectra.xml file into two files (training/test sets)

# Copyright (C) 2006 Jacques Colinge

# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.

# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# Contact:
#  Prof. Jacques Colinge
#  Upper Austria University of Applied Sciences at Hagenberg
#  Hauptstrasse 117
#  A-4232 Hagenberg, Austria
#  www.fh-hagenberg.at

=head1 NAME

splitPeptSpectraAtRandom.pl - Splits a .peptSpectra.xml file in two .peptSpectra.xml files

=head1 SYNOPSIS

splitPeptSpectraAtRandom.pl [options] file.peptSpectra.xml

=head1 OPTIONS

Use splitPeptSpectraAtRandom.pl -h

=head1 DESCRIPTION

The script randomly splits one .peptSpectra.xml file into two .peptSpectra.xml files. It is mainly
used for creating a training set and a test set from one original set of peptide/spectrum matches.

The respective sizes of the two created files can be defined via a proportion or by setting a
fixed number of requested matches for the first output file.

=head1 EXAMPLE

./splitPeptSpectraAtRandom.pl --proportion=0.5 --out=test example.peptSpectra.xml

=head1 AUTHOR

Jacques Colinge

=cut

use strict;
use Getopt::Long;
use Math::Random::TT800;

# Command line options. -h/-help request the usage text; --number or
# --proportion sets the size of the first set; the output files are named
# either from a common base (--out) or individually (--set1 and --set2).
my ($help, $number, $proportion, $out, $setOne, $setTwo);

my $optionsOk = GetOptions('help' => \$help,
                           'h' => \$help,
                           'number=i' => \$number,
                           'proportion=f' => \$proportion,
                           'out=s' => \$out,
                           'set1=s' => \$setOne,
                           'set2=s' => \$setTwo);

# A usable call needs a size for the first set, a destination for both
# sets, and an input file (whatever GetOptions left in @ARGV).
my $sizeGiven = defined($number) || defined($proportion);
my $destinationGiven = defined($out) || (defined($setOne) && defined($setTwo));
my $inputGiven = @ARGV > 0;

if (!$optionsOk || defined($help) || !$sizeGiven || !$destinationGiven || !$inputGiven){
  print STDERR "Usage: splitPeptSpectraAtRandom.pl [options] peptSpectra.xml
\t-h
\t-help
\t--number=int           [number of spectra in the first set]
\t--proportion=float     [proportion of spectra in the first set]
\t--out=fname            [base file name for the two output files]
\t--set1=fname           [file name for set one]
\t--set2=fname           [file name for set two]\n";
  # Asking for help is not an error; a malformed call is.
  exit(defined($help) ? 0 : 1);
}

# Counts the peptides: one per input line containing an <idi:sequence> tag.
# NOTE(review): the dispatch loop further down numbers entries per closing
# </idi:OneIdentification> tag, so this presumably relies on exactly one
# <idi:sequence> line per identification -- confirm against the file format.
my $count = 0;
# Three-argument open so that the file name can never be read as a mode
open(my $countFh, '<', $ARGV[0]) or CORE::die("Cannot open [$ARGV[0]]: $!");
while (my $line = <$countFh>){
  $count++ if ($line =~ /<idi:sequence>/);
}
close($countFh);

# Size of the first set: --number wins over --proportion when both are
# given, and the result is clamped to the range 0..$count.
my $first = defined($number) ? $number : int($proportion*$count);
$first = $count if ($first > $count);
$first = 0 if ($first < 0);

# Generates the random selection: draws $first distinct indices out of
# 0..$count-1 without replacement and flags each of them in %selected.
# NOTE(review): the generator is created without an explicit seed, so the
# same split is presumably produced on every run -- confirm against the
# Math::Random::TT800 documentation if a different split per run is wanted.
my (%selected, @remain);
my $tt = Math::Random::TT800->new;
@remain = (0 .. $count-1);
for my $drawn (0 .. $first-1){
  # Only the first $count-$drawn slots of @remain still hold undrawn indices
  my $active = $count - $drawn;
  my $sel = int($tt->next()*$active);
  $selected{$remain[$sel]} = 1;
  # Fill the hole with the last undrawn index; the active part shrinks by one
  $remain[$sel] = $remain[$active-1];
}

# Output file names: a base name given with --out takes precedence over
# the individual names given with --set1/--set2.
if (defined($out)){
  $setOne = "$out-1.peptSpectra.xml";
  $setTwo = "$out-2.peptSpectra.xml";
}

# Three-argument opens with lexical handles: the file names are never
# interpreted as part of the open mode.
open(my $setOneFh, '>', $setOne) or CORE::die("Cannot create [$setOne]: $!");
open(my $setTwoFh, '>', $setTwo) or CORE::die("Cannot create [$setTwo]: $!");
open(my $inFh, '<', $ARGV[0]) or CORE::die("Cannot open [$ARGV[0]]: $!");

# Prints the header: everything up to and including the opening
# <idi:Identifications> line goes to both output files.
while (my $line = <$inFh>){
  print {$setOneFh} $line;
  print {$setTwoFh} $line;
  last if ($line =~ /<idi:Identifications>/);
}

# Dispatch spectra: lines are accumulated until a closing
# </idi:OneIdentification> tag, then the whole entry is written to set one
# if its running number was selected and to set two otherwise.
my $oneIdentification = '';
my $num = 0;
my $closingLine;
while (my $line = <$inFh>){
  if ($line =~ /<\/idi:Identifications>/){
    # Remember the closing line; it belongs to the trailer of both files
    $closingLine = $line;
    last;
  }

  $oneIdentification .= $line;
  if ($line =~ /<\/idi:OneIdentification>/){
    my $destination = $selected{$num} ? $setOneFh : $setTwoFh;
    print {$destination} $oneIdentification;
    $num++;
    $oneIdentification = '';
  }
}

# Prints the end of the file: the closing </idi:Identifications> line (when
# the input had one) and every line after it go to both output files.
if (defined($closingLine)){
  print {$setOneFh} $closingLine;
  print {$setTwoFh} $closingLine;
}
while (my $line = <$inFh>){
  print {$setOneFh} $line;
  print {$setTwoFh} $line;
}
close($inFh);
# Buffered write errors only surface at close time, so check them
close($setOneFh) or CORE::die("Cannot close [$setOne]: $!");
close($setTwoFh) or CORE::die("Cannot close [$setTwo]: $!");