#!perl -w
# $Id: search2gff.PLS,v 1.14 2006/01/19 22:56:02 lapp Exp $
# Author: Jason Stajich <jason-at-bioperl-dot-org>
# Description: Turn SearchIO parseable report(s) into a GFF report
#
=head1 NAME
search2gff - Turn SearchIO parseable reports(s) into a GFF report
=head1 SYNOPSIS
Usage:
search2gff [-o outputfile] [-f reportformat] [-i inputfilename] OR file1 file2 ..
=head1 DESCRIPTION
This script will turn a protein Search report (BLASTP, FASTP, SSEARCH,
AXT, WABA) into a GFF File.
The options are:
-i infilename - (optional) inputfilename, will read
either ARGV files or from STDIN
-o filename - the output filename [default STDOUT]
-f format - search result format (blast, fasta,waba,axt)
(ssearch is fasta format). default is blast.
-t/--type seqtype - if you want to see query or hit information
in the GFF report
-s/--source - specify the source (will be algorithm name
otherwise like BLASTN)
--method - the method tag (primary_tag) of the features
(default is similarity)
--scorefunc - a string or a file that when parsed evaluates
to a closure which will be passed a feature
object and that returns the score to be printed
--locfunc - a string or a file that when parsed evaluates
to a closure which will be passed two
features, query and hit, and returns the
location (Bio::LocationI compliant) for the
GFF3 feature created for each HSP; the closure
may use the clone_loc() and create_loc()
functions for convenience, see their PODs
--onehsp - only print the first HSP feature for each hit
-p/--parent - the parent to which HSP features should refer
if not the name of the hit or query (depending
on --type)
--target/--notarget - whether to always add the Target tag or not
-h - this help menu
--version - GFF version to use (put a 3 here to use gff 3)
--component - generate GFF component fields (chromosome)
-m/--match - generate a 'match' line which is a container
of all the similarity HSPs
--addid - add ID tag in the absence of --match
-c/--cutoff - specify an evalue cutoff
Additionally specify the filenames you want to process on the
command-line. If no files are specified then STDIN input is assumed.
You specify this by doing: search2gff E<lt> file1 file2 file3
=head1 AUTHOR
Jason Stajich, jason-at-bioperl-dot-org
=head1 Contributors
Hilmar Lapp, hlapp-at-gmx-dot-net
=cut
use strict;
use Bio::Tools::GFF;
use Getopt::Long;
use Bio::SearchIO;
use Bio::Location::Simple; # pre-declare to simplify $locfunc implementations
use Bio::Location::Atomic; # pre-declare to simplify $locfunc implementations
use Storable qw(dclone); # for cloning location objects
use Bio::Factory::FTLocationFactory;
my ($output, # output file (if not stdout)
$input, # name of the input file
$format, # format of the input file, defauly is blast
$type, # 'query' or 'hit'
$cutoff, # cut-off value for e-value filter
$sourcetag, # explicit source tag (will be taken from program
# otherwise
$methodtag, # primary tag (a.k.a. method), default 'similarity'
$gffver, # GFF version (dialect) to write
$scorefunc, # closure returning the score for a passed feature
$locfunc, # closure returning a location object for a passed
# query and hit feature
$addid, # flag: whether to always add the ID for $match == 0
$parent, # the name of the parent to use; if set and $match == 0
# will always add the target
$comp, # flag: whether to print a component feature
$addtarget, # flag: whether to always add the Target tag, default
# is true
$match, # flag: whether to print match lines as containers
$onehsp, # flag: whether to consider only the first HSP for a hit
$quiet, # flag: run quietly
$help); # flag: show help screen
# set defaults:
$format = 'blast';
$type = 'query';
$gffver = 2;
$methodtag = "similarity";
$addtarget = 1;
GetOptions(
'i|input:s' => \$input,
'component' => \$comp,
'm|match' => \$match,
'o|output:s' => \$output,
'f|format:s' => \$format,
's|source:s' => \$sourcetag,
'method=s' => \$methodtag,
'addid' => \$addid,
'scorefunc=s'=> \$scorefunc,
'locfunc=s' => \$locfunc,
'p|parent=s' => \$parent,
'target!' => \$addtarget,
'onehsp' => \$onehsp,
't|type:s' => \$type,
'c|cutoff:s' => \$cutoff,
'v|version:i'=> \$gffver,
'q|quiet' => \$quiet,
'h|help' => sub{ exec('perldoc',$0);
exit(0)
},
);
$type = lc($type);
if( $type =~ /target/ ) { $type = 'hit' }
elsif( $type ne 'query' && $type ne 'hit' ) {
die("seqtype must be either 'query' or 'hit'");
}
# custom or default function returning the score
$scorefunc = defined($scorefunc) ? parse_code($scorefunc) : sub {shift->score};
# custom or default function returning the location
$locfunc = defined($locfunc) ? parse_code($locfunc) : sub { shift->location };
# if --match is given then $addid needs to be disabled
$addid = undef if $addid && $match;
# if no input is provided STDIN will be used
my $parser = new Bio::SearchIO(-format => $format,
-verbose => $quiet ? -1 : 0,
-file => $input);
my $out;
if( defined $output ) {
$out = new Bio::Tools::GFF(-gff_version => $gffver,
-file => ">$output");
} else {
$out = new Bio::Tools::GFF(-gff_version => $gffver); # STDOUT
}
my (%seen_hit,%seen);
my $other = $type eq 'query' ? 'hit' : 'query';
while( my $result = $parser->next_result ) {
my $qname = $result->query_name;
if ( $comp && $type eq 'query' &&
$result->query_length ) {
$out->write_feature(Bio::SeqFeature::Generic->new
(-start => 1,
-end => $result->query_length,
-seq_id => $qname,
-source_tag => 'chromosome',
-primary_tag => 'Component',
-tag => {
'Sequence' => $qname
}));
}
while( my $hit = $result->next_hit ) {
next if( defined $cutoff && $hit->significance > $cutoff);
my $acc = $qname;
if( $seen{$qname."-".$hit->name}++ ) {
$acc = $qname."-".$seen{$qname.'-'.$hit->name};
}
if( $comp && $type eq 'hit' && $hit->length &&
! $seen_hit{$hit->name}++ ) {
$out->write_feature(Bio::SeqFeature::Generic->new
(-start => 1,
-end => $hit->length,
-seq_id => $hit->name,
-source_tag => 'chromosome',
-primary_tag => 'Component',
-tag => {
'Sequence' => $hit->name
}));
}
my (%min,%max,$seqid,$name,$st);
while( my $hsp = $hit->next_hsp ) {
my $feature = new Bio::SeqFeature::Generic;
my ($proxyfor,$otherf);
if( $type eq 'query' ) {
($proxyfor,$otherf) = ($hsp->query,
$hsp->hit);
$name ||= $hit->name;
} else {
($otherf,$proxyfor) = ($hsp->query,
$hsp->hit);
$name ||= $acc;
}
$proxyfor->score($hit->bits) unless( $proxyfor->score );
if (($gffver == 3) && ($match || $parent)) {
$feature->add_tag_value('Parent', $parent || $name);
}
$min{$type} = $proxyfor->start
unless defined $min{$type} && $min{$type} < $proxyfor->start;
$max{$type} = $proxyfor->end
unless defined $max{$type} && $max{$type} > $proxyfor->end;
$min{$other} = $otherf->start
unless defined $min{$other} && $min{$other} < $otherf->start;
$max{$other} = $otherf->end
unless defined $max{$other} && $max{$other} > $otherf->end;
if ($addtarget || $match) {
$feature->add_tag_value('Target', 'Sequence:'.$name);
$feature->add_tag_value('Target', $otherf->start);
$feature->add_tag_value('Target', $otherf->end);
}
if ($addid) {
$feature->add_tag_value('ID', $name);
}
$feature->location(&$locfunc($proxyfor,$otherf));
# strand for feature is always going to be product of
# query & hit strands so that target can always be just
# '+'
$feature->strand ( $proxyfor->strand * $otherf->strand);
if( $sourcetag ) {
$feature->source_tag($sourcetag);
} else {
$feature->source_tag($proxyfor->source_tag);
}
$feature->score(&$scorefunc($proxyfor));
$feature->frame($proxyfor->frame);
$feature->seq_id($proxyfor->seq_id );
$feature->primary_tag($methodtag);
# add annotation if encoded in the query description
my $desc = $result->query_description;
while ($desc =~ /\/([^=]+)=(\S+)/g) {
$feature->add_tag_value($1,$2);
}
$seqid ||= $proxyfor->seq_id;
$out->write_feature($feature);
$st ||= $sourcetag || $proxyfor->source_tag;
last if $onehsp;
}
if( $match ) {
my $matchf = Bio::SeqFeature::Generic->new
(-start => $min{$type},
-end => $max{$type},
-strand=> $hit->strand($type)*$hit->strand($other),
-primary_tag => 'match',
-source_tag => $st,
-score => $hit->bits,
-seq_id => $seqid);
if( $gffver == 3 ) {
$matchf->add_tag_value('ID', $name);
}
$matchf->add_tag_value('Target', "Sequence:$name");
$out->write_feature($matchf);
}
}
}
sub parse_code{
my $src = shift;
my $code;
# file or subroutine?
if(-r $src) {
if(! (($code = do $src) && (ref($code) eq "CODE"))) {
die "error in parsing code block $src: $@" if $@;
die "unable to read file $src: $!" if $!;
die "failed to run $src, or it failed to return a closure";
}
} else {
$code = eval $src;
die "error in parsing code block \"$src\": $@" if $@;
die "\"$src\" fails to return a closure"
unless ref($code) eq "CODE";
}
return $code;
}
=head2 clone_loc
Title : clone_loc
Usage : my $l = clone_loc($feature->location);
Function: Helper function to simplify the task of cloning locations
for --locfunc closures.
Presently simply implemented using Storable::dclone().
Example :
Returns : A L<Bio::LocationI> object of the same type and with the
same properties as the argument, but physically different.
All structured properties will be cloned as well.
Args : A L<Bio::LocationI> compliant object
=cut
sub clone_loc{
return dclone(shift);
}
=head2 create_loc
Title : create_loc
Usage : my $l = create_loc("10..12");
Function: Helper function to simplify the task of creating locations
for --locfunc closures. Creates a location from a feature-
table formatted string.
Example :
Returns : A L<Bio::LocationI> object representing the location given
as formatted string.
Args : A GenBank feature-table formatted string.
=cut
sub create_loc{
return Bio::Factory::FTLocationFactory->from_string(shift);
}