sub
new {
my
(
$class
,
@args
)=
@_
;
my
$self
=
$class
->SUPER::new(
@args
);
my
(
%instances
,
@header
,
$n
);
my
(
$file
)=
$self
->_rearrange([
'FILE'
],
@args
);
$self
->{file} =
$file
;
$self
->{_factor}=1;
$self
->_initialize_io(
@args
) ||
warn
"Did you intend to use STDIN?"
;
$self
->{_end}=0;
undef
$self
->{hid};
return
$self
if
(
$file
=~/^>/);
my
$buf
=
$self
->_readline;
$self
->throw(
'Cannot parse HTML format yet'
)
if
(
$buf
=~/^<HTML>/);
while
(
defined
(
$buf
=
$self
->_readline)) {
chomp
(
$buf
);
if
(
$buf
=~/DATABASE AND MOTIFS/) {
while
(
$buf
=
$self
->_readline) {
if
(
$buf
=~/DATABASE/) {
$buf
=~s/^[\s\t]+//;
chomp
$buf
;
(
$n
,
$self
->{_dbname},
$self
->{_dbtype})=
split
(/\s/,
$buf
);
$self
->{_dbtype}=~s/[\(\)]//g;
}
if
(
$buf
=~/MOTIFS/) {
$buf
=~s/^[\s\t]+//;
chomp
$buf
;
(
$n
,
$self
->{_mrsc},
$self
->{_msrctype})=
split
(/\s/,
$buf
);
$self
->{_msrctype}=~s/[\(\)]//g;
last
;
}
}
if
(
$self
->{_msrctype} ne
$self
->{_dbtype}) {
$self
->{_factor}=3;
$self
->{_mixquery}=1;
}
}
if
(
$buf
=~m/MOTIF WIDTH BEST POSSIBLE MATCH/) {
$self
->_readline;
while
(
defined
(
$buf
=
$self
->_readline)) {
last
if
(
$buf
!~/\w/);
$buf
=~s/\t+//g;
$buf
=~s/^\s+//g;
my
(
$id
,
$width
,
$seq
)=
split
(/\s+/,
$buf
);
push
@{
$self
->{hid}},
$id
;
$self
->{
length
}->{
$id
}=
$width
;
$self
->{seq}->{
$id
}=
$seq
;
}
next
;
}
if
(
$buf
=~m/section i:/i) {
$self
->_readline;
$self
->_readline;
$self
->_readline;
%instances
=_get_genes(
$self
);
$self
->{instances}=\
%instances
;
if
(!(
%instances
)) {
$self
->
warn
(
"Your MAST analysis did not find any matches satisfying the current threshold.\nSee MAST documentation for more information.\n"
);
return
$self
;
}
next
;
}
if
(
$buf
=~m/section ii:/i) {
$self
->_readline;
$self
->_readline;
$self
->_readline;
last
;
}
$buf
=~s/[\t+\s+]/ /g;
push
@header
,
$buf
unless
((
$buf
=~/\*{10,}/)||(
$buf
!~/\w/));
}
$self
->throw(
'Could not read Section I, probably wrong format, make sure it is not HTML, giving up...'
)
if
!(
%instances
);
$self
->
warn
(
"This file might be an unreadable version, proceed with caution!\n"
)
if
(!
grep
(/\s+MAST\s+version\s+3/,
@header
));
$self
->{unstructured} = \
@header
;
$self
->_initialize;
return
$self
;
}
sub
_get_genes {
my
$self
=
shift
;
my
%llid
;
my
$ok
=0;
my
$i
=0;
my
%instances
;
while
(
my
$line
=
$self
->_readline) {
last
if
(
$line
=~/^[\s\t*]/);
chomp
(
$line
);
$i
++;
next
if
(
$line
eq
''
);
$line
=~s/\s+/,/g;
my
(
$id
,
$key
,
$eval
,
$len
)=
split
(/,/,
$line
);
unless
(
$len
) {
warn
"Malformed data found: $line\n"
;
next
;
}
$instances
{
$id
}=new Bio::Matrix::PSM::InstanceSite (
-id
=>
$id
,
-desc
=>
$key
,
-score
=>
$eval
,
-width
=>
$len
,
-seq
=>
'ACGT'
);
}
return
%instances
;
}
sub
next_psm {
my
$self
=
shift
;
return
if
(
$self
->{_end}==1);
my
(
@lmotifsm
,
%index
,
$eval
,
$scheme
,
$sid
);
%index
= %{
$self
->{
length
}};
my
(
@instances
,
%instances
);
my
$line
=
$self
->_readline;
$line
=~s/[\t\n]//;
if
(
$line
=~ /\*{10,}/) {
$self
->{_end}=1;
return
;
}
do
{
if
(
$line
!~/^\s/) {
(
$sid
,
$eval
,
$scheme
)=
split
(/\s+/,
$line
,3);
}
else
{
$scheme
.=
$line
; }
$line
=
$self
->_readline;
$line
=~s/[\t\n]//;
}
until
(
$line
!~/^\s/);
my
$pos
=1;
$scheme
=~s/\s+//g;
$scheme
=~s/\n//g;
my
@motifs
=
split
(/_/,
$scheme
);
while
(
@motifs
) {
my
$next
=
shift
(
@motifs
);
if
(!(
$next
=~/\D/)) {
last
if
(!
@motifs
);
$pos
+=
$next
;
next
;
}
my
$id
=
$next
;
my
$score
=
$id
=~m/\[/ ?
'strong'
:
'weak'
;
my
$frame
;
my
$strand
=
$id
=~ m/\-\d/ ? -1 : 1 ;
if
(
$self
->{_mixquery}) {
$frame
= 0
if
$id
=~ m/\d+a/ ;
$frame
= 1
if
$id
=~ m/\d+b/ ;
$frame
= 2
if
$id
=~ m/\d+c/ ;
}
$id
=~s/\D+//g;
my
@s
;
my
$width
=
$index
{
$id
};
my
$seq
=
'N'
x (
$width
*$self
->{_factor});
my
$instance
=new Bio::Matrix::PSM::InstanceSite
(
-id
=>
"$id\@$sid"
,
-mid
=>
$id
,
-accession_number
=>
$sid
,
-desc
=>
"Motif $id occurrance in $sid"
,
-score
=>
$score
,
-seq
=>
$seq
,
-alphabet
=>
'dna'
,
-start
=>
$pos
,
-strand
=>
$strand
);
$instance
->frame(
$frame
)
if
(
$self
->{_mixquery});
push
@instances
,
$instance
;
$pos
+=
$index
{
$id
}
*$self
->{_factor};
}
my
$psm
= new Bio::Matrix::PSM::Psm (
-instances
=> \
@instances
,
-e_val
=>
$eval
,
-id
=>
$sid
);
$self
->_pushback(
$line
);
return
$psm
;
}
sub
write_psm {
my
(
$self
,
$matrix
)=
@_
;
my
$w
=
$matrix
->width;
my
$header
=
"ALPHABET= ACGT\nlog-odds matrix: alength= 4 w= $w\n"
;
$self
->_print(
$header
);
unless
(
$matrix
->get_logs_array(
'A'
)) {
warn
"No log-odds data, available, using normal distribution to recalculate the PWM"
;
$matrix
->calc_weight({
A
=>0.25,
C
=>0.25,
G
=>0.25,
T
=>0.25});
}
while
(
my
%h
=
$matrix
->next_pos) {
$self
->_print (
join
(
"\t"
,
$h
{lA},
$h
{lC},
$h
{lG},
$h
{lT},
"\n"
));
}
}
1;