#
# BioPerl module for FAST::Bio::AlignIO::msf
# based on the FAST::Bio::SeqIO::msf module
# by Ewan Birney <birney@ebi.ac.uk>
# and Lincoln Stein <lstein@cshl.org>
#
# and the SimpleAlign.pm module of Ewan Birney
#
# Copyright Peter Schattner
#
# You may distribute this module under the same terms as perl itself
# _history
# September 5, 2000
# POD documentation - main docs before the code
=head1 NAME
FAST::Bio::AlignIO::msf - msf sequence input/output stream
=head1 SYNOPSIS
Do not use this module directly. Use it via the L<FAST::Bio::AlignIO> class.
=head1 DESCRIPTION
This object can transform L<FAST::Bio::Align::AlignI> objects to and from msf
flat file databases.
=head1 FEEDBACK
=head2 Support
Please direct usage questions or support issues to the mailing list:
I<bioperl-l@bioperl.org>
rather than to the module maintainer directly. Many experienced and
reponsive experts will be able look at the problem and quickly
address it. Please include a thorough description of the problem
with code and data examples if at all possible.
=head2 Reporting Bugs
Report bugs to the Bioperl bug tracking system to help us keep track
the bugs and their resolution. Bug reports can be submitted via the
web:
=head1 AUTHORS - Peter Schattner
Email: schattner@alum.mit.edu
=head1 APPENDIX
The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _
=cut
# Let the code begin...
use vars qw(%valid_type);
use strict;
use FAST::Bio::SeqIO::gcg; # for GCG_checksum()
BEGIN {
%valid_type = qw( dna N rna N protein P );
}
=head2 next_aln
Title : next_aln
Usage : $aln = $stream->next_aln()
Function: returns the next alignment in the stream. Tries to read *all* MSF
It reads all non whitespace characters in the alignment
area. For MSFs with weird gaps (eg ~~~) map them by using
$aln->map_chars('~','-')
Returns : FAST::Bio::Align::AlignI object
Args : NONE
=cut
sub next_aln {
my $self = shift;
my $entry;
my (%hash,$name,$str,@names,$seqname,$start,$end,$count,$seq);
my $aln = FAST::Bio::SimpleAlign->new(-source => 'gcg' );
while( $entry = $self->_readline) {
$entry =~ m{//} && last; # move to alignment section
$entry =~ /Name:\s+(\S+)/ && do { $name = $1;
$hash{$name} = ""; # blank line
push(@names,$name); # we need it ordered!
};
# otherwise - skip
}
# alignment section
while( $entry = $self->_readline) {
next if ( $entry =~ /^\s+(\d+)/ ) ;
$entry =~ /^\s*(\S+)\s+(.*)$/ && do {
$name = $1;
$str = $2;
if( ! exists $hash{$name} ) {
$self->throw("$name exists as an alignment line but not in the header. Not confident of what is going on!");
}
$str =~ s/\s//g;
$str =~ s/~/-/g;
$hash{$name} .= $str;
};
}
# return 0 if scalar @names < 1;
if (scalar(@names) < 1) {
undef $aln;
return $aln;
}
# now got this as a name - sequence hash. Let's make some sequences!
for $name ( @names ) {
if( $name =~ m{(\S+)/(\d+)-(\d+)} ) {
$seqname = $1;
$start = $2;
$end = $3;
} else {
$seqname = $name;
$start = 1;
$str = $hash{$name};
$str =~ s/[^0-9A-Za-z$FAST::Bio::LocatableSeq::OTHER_SYMBOLS]//g;
$end = length($str);
}
$seq = FAST::Bio::LocatableSeq->new('-seq' => $hash{$name},
'-display_id' => $seqname,
'-start' => $start,
'-end' => $end,
'-alphabet' => $self->alphabet,
);
$aln->add_seq($seq);
# If $end <= 0, we have either reached the end of
# file in <> or we have encountered some other error
}
return $aln if $aln->num_sequences;
return;
}
=head2 write_aln
Title : write_aln
Usage : $stream->write_aln(@aln)
Function: writes the $aln object into the stream in MSF format
Sequence type of the alignment is determined by the first sequence.
Returns : 1 for success and 0 for error
Args : FAST::Bio::Align::AlignI object
=cut
sub write_aln {
my ($self,@aln) = @_;
my $msftag;
my $type;
my $count = 0;
my $maxname;
my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index);
foreach my $aln (@aln) {
if( ! $aln || ! $aln->isa('FAST::Bio::Align::AlignI') ) {
$self->warn("Must provide a FAST::Bio::Align::AlignI object when calling write_aln");
next;
}
$date = localtime(time);
$msftag = "MSF";
$type = $valid_type{$aln->get_seq_by_pos(1)->alphabet};
$maxname = $aln->maxdisplayname_length();
$length = $aln->length();
$name = $aln->id();
if( !defined $name ) {
$name = "Align";
}
$self->_print (sprintf("\n%s MSF: %d Type: %s %s Check: 00 ..\n\n",
$name, $aln->num_sequences, $type, $date));
my $seqCountFormat = "%".($maxname > 20 ? $maxname + 2: 22)."s%-27d%27d\n";
my $seqNameFormat = "%-".($maxname > 20 ? $maxname : 20)."s ";
foreach $seq ( $aln->each_seq() ) {
$name = $aln->displayname($seq->get_nse());
$miss = $maxname - length ($name);
$miss += 2;
$pad = " " x $miss;
$self->_print (sprintf(" Name: %s%sLen: %d Check: %d Weight: 1.00\n",$name,$pad,length $seq->seq(), FAST::Bio::SeqIO::gcg->GCG_checksum($seq)));
$hash{$name} = $seq->seq();
push(@arr,$name);
}
# ok - heavy handed, but there you go.
#
$self->_print ("\n//\n\n\n");
while( $count < $length ) {
# there is another block to go!
$self->_print (sprintf($seqCountFormat,' ',$count+1,$count+50));
foreach $name ( @arr ) {
$self->_print (sprintf($seqNameFormat,$name));
$tempcount = $count;
$index = 0;
while( ($tempcount + 10 < $length) && ($index < 5) ) {
$self->_print (sprintf("%s ",substr($hash{$name},
$tempcount,10)));
$tempcount += 10;
$index++;
} #
# ok, could be the very last guy ;)
#
if( $index < 5) {
# space to print!
#
$self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
$tempcount += 10;
}
$self->_print ("\n");
}
$self->_print ("\n\n");
$count = $tempcount;
}
}
$self->flush if $self->_flush_on_write && defined $self->_fh;
return 1;
}
1;