#!/usr/bin/env perl
package Bio::MLST::Bin::GetEmmSequenceType;
$Bio::MLST::Bin::GetEmmSequenceType::VERSION = '2.1.1706216';
# ABSTRACT: Given an assembly find the MLST sequence type of the emm gene from the CDC.
# PODNAME: get_emm_sequence_type


BEGIN { unshift( @INC, '../lib' ) }
use lib "/software/pathogen/internal/prod/lib";
use Moose;
use Getopt::Long;
use Cwd;
use Bio::MLST::Check;
use Bio::MLST::Databases;
use Bio::MLST::Validate::Executable;
use Bio::MLST::SearchForFiles;
use Bio::MLST::CheckMultipleSpecies;

# Targets for the command-line options. Each one stays undef unless the
# corresponding option is given; defaults are filled in after parsing.
my (
    $species,              $output_fasta_files, $output_directory,
    $output_phylip_files,  $available_databases, $base_directory,
    $makeblastdb_exec,     $blastn_exec,        $spreadsheet_basename,
    $parallel_processes,   $help,               $version
);

# Parse the command line. Options are removed from @ARGV, leaving only the
# input FASTA files behind.
# GetOptions warns on STDERR and returns false for unknown options or values
# of the wrong type (e.g. a non-integer -d), so stop here rather than carry on
# with a partially parsed command line.
GetOptions(
    's|species=s'              => \$species,
    'o|output_directory=s'     => \$output_directory,
    'c|output_fasta_files'     => \$output_fasta_files,
    'y|output_phylip_files'    => \$output_phylip_files,
    'a|available_databases'    => \$available_databases,
    'b|mlst_base_directory=s'  => \$base_directory,
    'm|makeblastdb_exec=s'     => \$makeblastdb_exec,
    'n|blastn_exec=s'          => \$blastn_exec,
    'p|spreadsheet_basename=s' => \$spreadsheet_basename,
    'd|parallel_processes=i'   => \$parallel_processes,
    'h|help'                   => \$help,
    'v|version'                => \$version,
) or die "Error in command line arguments. Run '$0 -h' for usage.\n";

# -v: report the script version and stop.
die "$0 version " . Bio::MLST::Bin::GetEmmSequenceType->VERSION . "\n" if $version;

# Print the usage message and stop when help was requested, or when there is
# nothing to do (no input files and -a was not given).
if ( $help || !( ( @ARGV > 0 ) || defined($available_databases) ) ) {
    die <<USAGE;
Usage: get_emm_sequence_type [options] *.fasta

   -s STR Species of MLST scheme (0 or more comma separated)
   -d INT Number of parallel processes [2]
   -c     Output a FASTA file of concatenated alleles and unknown sequences 
   -y     Output a phylip file of concatenated alleles and unknown sequences
   -o STR Output directory [.]
   -p STR Basename of the results spreadsheet [emm_results]
   -b STR Base directory containing the MLST databases
   -m STR Path to the makeblastdb executable
   -n STR Path to the blastn executable
   -a     Print out all available MLST schemes and exit
   -h     Print this message and exit
   -v     Print version number and exit

USAGE
}

# Locate the MLST databases: an explicit -b wins, then the MLST_DATABASES
# environment variable, then the site-wide default path.
$base_directory ||= $ENV{MLST_DATABASES};
$base_directory ||= '/lustre/scratch108/pathogen/pathpipe/mlst';

# -a: list the databases found under the base directory and exit, as the usage
# text promises. This runs before the BLAST executables are looked up because
# listing the databases does not need them.
if ($available_databases) {
    Bio::MLST::Databases->new( base_directory => $base_directory )->print_db_list;
    exit(0);
}

# Several BLAST+ installations may coexist, so each tool gets an ordered list
# of fallbacks: the pinned 2.2.28+ build first, then whatever is on the PATH.
# Warns if the user's preferred executable cannot be found; errors if defaults are also missing
my @makeblastdb_fallbacks = (
    '/software/pubseq/bin/ncbi-blast-2.2.28+/bin/makeblastdb',
    'makeblastdb',
);
my @blastn_fallbacks = (
    '/software/pubseq/bin/ncbi-blast-2.2.28+/bin/blastn',
    'blastn',
);

my $validator = Bio::MLST::Validate::Executable->new();

# The user's choice (possibly undef) is passed first, then the fallbacks.
$makeblastdb_exec =
  $validator->preferred_executable( $makeblastdb_exec, \@makeblastdb_fallbacks );
$blastn_exec =
  $validator->preferred_executable( $blastn_exec, \@blastn_fallbacks );

# Apply defaults to every option that was not supplied (or was supplied with
# a false value such as 0 or the empty string).
$spreadsheet_basename = 'emm_results' unless $spreadsheet_basename;
$output_directory     = getcwd()      unless $output_directory;      # current working directory
$output_fasta_files   = 0             unless $output_fasta_files;
$output_phylip_files  = 0             unless $output_phylip_files;
$parallel_processes   = 2             unless $parallel_processes;
$species              = 'Streptococcus pyogenes emm' unless $species;

# Collect the settings for the MLST check. The input FASTA files are whatever
# remains in @ARGV after option parsing.
my %check_settings = (
    species               => $species,
    base_directory        => $base_directory,
    raw_input_fasta_files => \@ARGV,
    makeblastdb_exec      => $makeblastdb_exec,
    blastn_exec           => $blastn_exec,
    output_directory      => $output_directory,
    output_fasta_files    => $output_fasta_files,
    spreadsheet_basename  => $spreadsheet_basename,
    parallel_processes    => $parallel_processes,
    output_phylip_files   => $output_phylip_files,

    # NOTE(review): presumably keeps alternative matches in the report rather
    # than a contamination flag; confirm against Bio::MLST::Check.
    show_contamination_instead_of_alt_matches => 0,
);

# Build the checker and have it produce the result files.
my $emm_check = Bio::MLST::Check->new(%check_settings);
$emm_check->create_result_files;

__END__

=pod

=encoding UTF-8

=head1 NAME

get_emm_sequence_type - Given an assembly find the MLST sequence type of the emm gene from the CDC.

=head1 VERSION

version 2.1.1706216

=head1 SYNOPSIS

Given FASTA files and a species regex, look up the relevant MLST database and output the sequence type to a file.
It requires NCBI BLAST+ to be available in your PATH.

   # Basic usage, sequence type result written to my_assembly.fa.st
   get_emm_sequence_type my_assembly.fa

   # Multiple fasta files
   get_emm_sequence_type myfasta.fa anotherfasta.fa yetanother.fa

   # Split into 8 parallel processes (much faster), default is 2
   get_emm_sequence_type -d 8 *.fa

   # output a fasta file with the concatenated alleles and unknown sequences
   get_emm_sequence_type -c  my_assembly.fa

   # output a phylip file with the concatenated alleles and unknown sequences
   get_emm_sequence_type -y  my_assembly.fa

   # Specify an output directory
   get_emm_sequence_type  -o /path/to/results my_assembly.fa

   # This help message
   get_emm_sequence_type -h

   # print version
   get_emm_sequence_type -v

=head1 AUTHOR

Andrew J. Page <ap13@sanger.ac.uk>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2012 by Wellcome Trust Sanger Institute.

This is free software, licensed under:

  The GNU General Public License, Version 3, June 2007

=cut