use
vars
qw(%valid_type)
;
BEGIN {
%valid_type
=
qw( dna N rna N protein P )
;
}
sub
next_aln {
my
$self
=
shift
;
my
$entry
;
my
(
%hash
,
$name
,
$str
,
@names
,
$seqname
,
$start
,
$end
,
$count
,
$seq
);
my
$aln
= FAST::Bio::SimpleAlign->new(
-source
=>
'gcg'
);
while
(
$entry
=
$self
->_readline) {
$entry
=~ m{//} &&
last
;
$entry
=~ /Name:\s+(\S+)/ &&
do
{
$name
= $1;
$hash
{
$name
} =
""
;
push
(
@names
,
$name
);
};
}
while
(
$entry
=
$self
->_readline) {
next
if
(
$entry
=~ /^\s+(\d+)/ ) ;
$entry
=~ /^\s*(\S+)\s+(.*)$/ &&
do
{
$name
= $1;
$str
= $2;
if
( !
exists
$hash
{
$name
} ) {
$self
->throw(
"$name exists as an alignment line but not in the header. Not confident of what is going on!"
);
}
$str
=~ s/\s//g;
$str
=~ s/~/-/g;
$hash
{
$name
} .=
$str
;
};
}
if
(
scalar
(
@names
) < 1) {
undef
$aln
;
return
$aln
;
}
for
$name
(
@names
) {
if
(
$name
=~ m{(\S+)/(\d+)-(\d+)} ) {
$seqname
= $1;
$start
= $2;
$end
= $3;
}
else
{
$seqname
=
$name
;
$start
= 1;
$str
=
$hash
{
$name
};
$str
=~ s/[^0-9A-Za-z
$FAST::Bio::LocatableSeq::OTHER_SYMBOLS
]//g;
$end
=
length
(
$str
);
}
$seq
= FAST::Bio::LocatableSeq->new(
'-seq'
=>
$hash
{
$name
},
'-display_id'
=>
$seqname
,
'-start'
=>
$start
,
'-end'
=>
$end
,
'-alphabet'
=>
$self
->alphabet,
);
$aln
->add_seq(
$seq
);
}
return
$aln
if
$aln
->num_sequences;
return
;
}
sub
write_aln {
my
(
$self
,
@aln
) =
@_
;
my
$msftag
;
my
$type
;
my
$count
= 0;
my
$maxname
;
my
(
$length
,
$date
,
$name
,
$seq
,
$miss
,
$pad
,
%hash
,
@arr
,
$tempcount
,
$index
);
foreach
my
$aln
(
@aln
) {
if
( !
$aln
|| !
$aln
->isa(
'FAST::Bio::Align::AlignI'
) ) {
$self
->
warn
(
"Must provide a FAST::Bio::Align::AlignI object when calling write_aln"
);
next
;
}
$date
=
localtime
(
time
);
$msftag
=
"MSF"
;
$type
=
$valid_type
{
$aln
->get_seq_by_pos(1)->alphabet};
$maxname
=
$aln
->maxdisplayname_length();
$length
=
$aln
->
length
();
$name
=
$aln
->id();
if
( !
defined
$name
) {
$name
=
"Align"
;
}
$self
->_print (
sprintf
(
"\n%s MSF: %d Type: %s %s Check: 00 ..\n\n"
,
$name
,
$aln
->num_sequences,
$type
,
$date
));
my
$seqCountFormat
=
"%"
.(
$maxname
> 20 ?
$maxname
+ 2: 22).
"s%-27d%27d\n"
;
my
$seqNameFormat
=
"%-"
.(
$maxname
> 20 ?
$maxname
: 20).
"s "
;
foreach
$seq
(
$aln
->each_seq() ) {
$name
=
$aln
->displayname(
$seq
->get_nse());
$miss
=
$maxname
-
length
(
$name
);
$miss
+= 2;
$pad
=
" "
x
$miss
;
$self
->_print (
sprintf
(
" Name: %s%sLen: %d Check: %d Weight: 1.00\n"
,
$name
,
$pad
,
length
$seq
->seq(), FAST::Bio::SeqIO::gcg->GCG_checksum(
$seq
)));
$hash
{
$name
} =
$seq
->seq();
push
(
@arr
,
$name
);
}
$self
->_print (
"\n//\n\n\n"
);
while
(
$count
<
$length
) {
$self
->_print (
sprintf
(
$seqCountFormat
,
' '
,
$count
+1,
$count
+50));
foreach
$name
(
@arr
) {
$self
->_print (
sprintf
(
$seqNameFormat
,
$name
));
$tempcount
=
$count
;
$index
= 0;
while
( (
$tempcount
+ 10 <
$length
) && (
$index
< 5) ) {
$self
->_print (
sprintf
(
"%s "
,
substr
(
$hash
{
$name
},
$tempcount
,10)));
$tempcount
+= 10;
$index
++;
}
if
(
$index
< 5) {
$self
->_print (
sprintf
(
"%s "
,
substr
(
$hash
{
$name
},
$tempcount
)));
$tempcount
+= 10;
}
$self
->_print (
"\n"
);
}
$self
->_print (
"\n\n"
);
$count
=
$tempcount
;
}
}
$self
->flush
if
$self
->_flush_on_write &&
defined
$self
->_fh;
return
1;
}
1;