our
$VERSION
=
$Bio::Polloc::Polloc::Root::VERSION
;
sub
new {
my
(
$caller
,
@args
) =
@_
;
my
$self
=
$caller
->SUPER::new(
@args
);
$self
->_initialize(
@args
);
return
$self
;
}
sub
source {
my
(
$self
,
$value
) =
@_
;
$self
->{
'_source'
} =
$value
if
defined
$value
;
return
$self
->{
'_source'
};
}
sub
target {
my
(
$self
,
$value
) =
@_
;
$self
->{
'_target'
} =
$value
if
defined
$value
;
return
$self
->{
'_target'
};
}
sub
locigroup {
my
(
$self
,
$value
) =
@_
;
if
(
defined
$value
){
$self
->{
'_locigroup'
} =
$value
;
$self
->{
'_reorder'
} = 1;
}
return
$self
->{
'_locigroup'
};
}
sub
condition {
my
(
$self
,
$value
) =
@_
;
if
(
defined
$value
){
$self
->throw(
'Unexpected type of condition'
,
$value
)
unless
UNIVERSAL::can(
$value
,
'isa'
) and
$value
->isa(
'Bio::Polloc::GroupCriteria::operator'
);
$self
->{
'_condition'
} =
$value
;
}
return
$self
->{
'_condition'
};
}
sub
evaluate {
my
(
$self
,
$feat1
,
$feat2
) =
@_
;
$feat1
->isa(
'Bio::Polloc::LocusI'
) or
$self
->throw(
"First feature of illegal class"
,
$feat1
);
$feat2
->isa(
'Bio::Polloc::LocusI'
) or
$self
->throw(
"Second feature of illegal class"
,
$feat2
);
defined
$self
->condition or
$self
->throw(
"Undefined condition, impossible to group"
);
$self
->condition->type eq
'bool'
or
$self
->throw(
"Unexpected type of condition"
,
$self
->condition);
$self
->throw(
"Undefined source features"
)
unless
defined
$self
->source;
$self
->throw(
"Undefined target features"
)
unless
defined
$self
->target;
return
0
unless
$feat1
->family eq
$self
->source;
return
0
unless
$feat2
->family eq
$self
->target;
$Bio::Polloc::GroupCriteria::operator::cons::OP_CONS
->{
'FEAT1'
} =
$feat1
;
$Bio::Polloc::GroupCriteria::operator::cons::OP_CONS
->{
'FEAT2'
} =
$feat2
;
my
$o
=
$self
->condition->operate;
$Bio::Polloc::GroupCriteria::operator::cons::OP_CONS
->{
'FEAT1'
} =
undef
;
$Bio::Polloc::GroupCriteria::operator::cons::OP_CONS
->{
'FEAT2'
} =
undef
;
return
$o
;
}
sub
get_loci {
my
(
$self
,
@args
) =
@_
;
$self
->{
'_features'
} =
$self
->locigroup->loci
if
defined
$self
->locigroup and not
defined
$self
->{
'_features'
};
$self
->{
'_features'
} = []
unless
defined
$self
->{
'_features'
};
if
(
$self
->{
'_reorder'
} &&
$self
->source ne
$self
->target){
my
@src
= ();
my
@tgt
= ();
my
@oth
= ();
for
my
$ft
(@{
$self
->locigroup->loci}){
if
(
$ft
->family eq
$self
->source){
push
(
@src
,
$ft
) }
elsif
(
$ft
->family eq
$self
->target){
push
(
@tgt
,
$ft
) }
else
{
push
@oth
,
$ft
}
}
$self
->{
'_features'
} = [];
push
@{
$self
->{
'_features'
}},
@tgt
,
@src
,
@oth
;
$self
->{
'_reorder'
} = 0;
}
return
$self
->{
'_features'
};
}
sub
get_locus {
my
(
$self
,
$index
) =
@_
;
return
unless
defined
$index
;
return
unless
defined
$self
->{
'_features'
};
return
$self
->{
'_features'
}->[
$index
];
}
sub
extension {
my
(
$self
,
@args
) =
@_
;
return
$self
->{
'_groupextension'
}
unless
$#args
>=0;
@args
=
split
/\s+/,
$args
[0]
if
$#args
== 0;
$self
->throw(
"Odd number of elements, impossible to build key-value pairs"
, \
@args
)
unless
$#args
%2;
my
%f
=
@args
;
$f
{
'-function'
} ||=
'context'
;
$f
{
'-algorithm'
} ||=
'blast'
;
(
$f
{
'-feature'
} ||= 0) += 0;
(
$f
{
'-downstream'
} ||= 0) += 0;
(
$f
{
'-upstream'
} ||= 0) += 0;
(
$f
{
'-detectstrand'
} ||= 0) += 0;
(
$f
{
'-alldetected'
} ||= 0) += 0;
(
$f
{
'-oneside'
} ||= 0) += 0;
$f
{
'-lensd'
} =
defined
$f
{
'-lensd'
} ?
$f
{
'-lensd'
}+0 : 1.5;
$f
{
'-maxlen'
} =
defined
$f
{
'-maxlen'
} ?
$f
{
'-maxlen'
}+0 : 0;
$f
{
'-minlen'
} =
defined
$f
{
'-minlen'
} ?
$f
{
'-minlen'
}+0 : 0;
$f
{
'-similarity'
} =
defined
$f
{
'-similarity'
} ?
$f
{
'-similarity'
}+0 : 0.8;
$f
{
'-score'
} =
defined
$f
{
'-score'
} ?
$f
{
'-score'
}+0 : 20;
$f
{
'-consensusperc'
} =
defined
$f
{
'-consensusperc'
} ?
$f
{
'-consensusperc'
}+0 : 60;
$f
{
'-e'
} =
defined
$f
{
'-e'
} ?
$f
{
'-e'
}+0 : 0.1;
$f
{
'-p'
} =
'blastn'
unless
defined
$f
{
'-p'
};
$self
->{
'_groupextension'
} = \
%f
;
return
$self
->{
'_groupextension'
};
}
sub
extend {
my
(
$self
,
@args
) =
@_
;
my
(
$loci
) =
$self
->_rearrange([
qw(LOCI)
],
@args
);
my
$ext
=
$self
->{
'_groupextension'
};
return
unless
defined
$ext
;
$self
->throw(
"The loci are not into an object"
,
$loci
)
unless
defined
$loci
and
ref
(
$loci
) and UNIVERSAL::can(
$loci
,
'isa'
);
$self
->throw(
"Unexpected type for the group of loci"
,
$loci
)
unless
$loci
->isa(
'Bio::Polloc::LociGroup'
);
return
unless
$
my
$group_id
=
$self
->_next_group_id;
my
@new
= ();
$self
->debug(
"--- Extending group (based on "
.($
if
(
lc
(
$ext
->{
'-function'
}) eq
'context'
){
my
(
$up_pos
,
$down_pos
,
$in_pos
);
$loci
->fix_strands(max(
$ext
->{
'-downstream'
},
$ext
->{
'-upstream'
}))
if
$ext
->{
'-detectstrand'
} and (
$ext
->{
'-upstream'
} or
$ext
->{
'-downstream'
});
my
$eval_feature
=
$ext
->{
'-feature'
} ? 1 : 0;
$up_pos
=
$self
->_search_aln_seqs(
$loci
->align_context(-1,
$ext
->{
'-upstream'
}, 0))
if
$ext
->{
'-upstream'
};
$down_pos
=
$self
->_search_aln_seqs(
$loci
->align_context(1,
$ext
->{
'-downstream'
},0))
if
$ext
->{
'-downstream'
};
$in_pos
=
$self
->_search_aln_seqs(
$loci
->align_context(0, 0, 0))
if
$ext
->{
'-feature'
};
my
$max_len
=
$ext
->{
'-maxlen'
};
unless
(
$max_len
){
my
(
$len_avg
,
$len_sd
) =
$self
->locigroup->avg_length;
$self
->
warn
(
"Building size constrains based in one sequence only"
)
if
$
$max_len
=
$len_avg
+
$len_sd
*$ext
->{
'-lensd'
};
}
$self
->debug(
"Comparing results with maximum feature's length of $max_len"
);
if
(
$ext
->{
'-upstream'
} and
$ext
->{
'-downstream'
}){
push
@new
,
$self
->_detect_border_pairs(
$up_pos
,
$down_pos
,
$max_len
);
if
(
$eval_feature
){
$self
->debug(
"Filtering results with in-feature sequences"
);
my
@prefilter
=
@new
;
@new
= ();
BORDER:
for
my
$br
(
@prefilter
){
push
@new
,
first {
$br
->[0] ne
$_
->[0]
and
$br
->[3] ==
$_
->[3]
and
$br
->[3]
*$_
->[1] <
$br
->[3]
*$br
->[2]
and
$br
->[3]
*$_
->[2] >
$br
->[3]
*$br
->[1]
}
@$in_pos
;
}
}
}
elsif
(
$eval_feature
){
@new
=
@$in_pos
}
else
{
$self
->throw(
'Nothing to evaluate! '
.
'I need either the two borders or the middle sequence (or both)'
);
}
}
else
{
$self
->throw(
'Unsupported function for group extension'
,
$ext
->{
'-function'
});
}
$self
->debug(
"Found "
.(
$#new
+1).
" loci, creating extend features"
);
my
$comments
=
"Based on group $group_id: "
;
for
my
$locus
(@{
$loci
->loci}) {
$comments
.=
$locus
->id .
", "
if
defined
$locus
->id }
$comments
=
substr
$comments
, 0, -2;
my
$newloci
= Bio::Polloc::LociGroup->new();
$newloci
->name(
$loci
->name.
"-ext"
)
if
defined
$loci
->name;
$newloci
->featurename(
$loci
->featurename)
if
defined
$loci
->featurename;
$newloci
->genomes(
$loci
->genomes)
if
defined
$loci
->genomes;
NEW:
for
my
$itemk
(0 ..
$#new
){
my
$item
=
$new
[
$itemk
];
(
$item
->[1],
$item
->[2]) = (min(
$item
->[1],
$item
->[2]), max(
$item
->[1],
$item
->[2]));
unless
(
$ext
->{
'-alldetected'
}){
OLD:
for
my
$locus
(@{
$loci
->loci}){
next
NEW
if
$item
->[1]<
$locus
->to and
$item
->[2]>
$locus
->from;
}
}
my
$seq
;
my
(
$Gk
,
$acc
) =
split
/:/,
$item
->[0], 2;
$Gk
+=0;
for
my
$ck
(0 .. $
my
$id
=
$self
->genomes->[
$Gk
]->get_sequences->[
$ck
]->display_id;
if
(
$id
eq
$acc
or
$id
=~ m/\|
$acc
(\.\d+)?(\||\s*$)/){
$seq
= [
$Gk
,
$ck
];
last
;
}
}
$self
->
warn
(
'I can not find the sequence'
,
$acc
)
unless
defined
$seq
;
$self
->throw(
'Undefined genome-contig pair'
,
$acc
,
'UnexpectedException'
)
unless
defined
$self
->genomes->[
$seq
->[0]]->get_sequences->[
$seq
->[1]];
my
$id
=
$self
->source .
"-ext:"
.(
$Gk
+1).
".$group_id."
.($
$newloci
->add_loci(Bio::Polloc::LocusI->new(
-type
=>
'extend'
,
-from
=>
$item
->[1],
-to
=>
$item
->[2],
-id
=>(
defined
$id
?
$id
:
''
),
-strand
=>(
$item
->[3]==-1 ?
'+'
:
'-'
),
-seq
=>
$self
->genomes->[
$seq
->[0]]->get_sequences->[
$seq
->[1]],
-score
=>
$item
->[4],
-basefeature
=>
$loci
->loci->[0],
-comments
=>
$comments
,
-genome
=>
$self
->genomes->[
$Gk
]
));
}
return
$newloci
;
}
sub
build_bin {
my
(
$self
,
@args
) =
@_
;
my
$bin
= [];
my
(
$complete
) =
$self
->_rearrange([
qw(COMPLETE)
],
@args
);
for
my
$i
(0 .. $
$bin
->[
$i
] = [];
my
$lim
=
$complete
? $
for
my
$j
(0 ..
$lim
){
$bin
->[
$i
]->[
$j
] =
$self
->evaluate(
$self
->get_loci->[
$i
],
$self
->get_loci->[
$j
]
);
}
}
return
$bin
;
}
sub
bin_build_groups {
my
(
$self
,
$bin
) =
@_
;
my
$groups
= [];
FEAT:
for
my
$f
(0 .. $
GROUP:
for
my
$g
(0 .. $
MEMBER:
for
my
$m
(0 .. $
if
(
$bin
->[
$f
]->[
$groups
->[
$g
]->[
$m
]] ){
push
@{
$groups
->[
$g
]},
$f
;
next
FEAT;
}
}
}
push
@{
$groups
}, [
$f
];
}
return
$self
->_feat_index2obj(
$groups
);
}
sub
build_groups {
my
(
$self
,
@args
) =
@_
;
my
(
$cpus
,
$advance
) =
$self
->_rearrange([
qw(CPUS ADVANCE)
],
@args
);
my
$groups
= [[0]];
my
$loci
=
$self
->get_loci;
my
$l_max
=
$#$loci
;
$self
->debug(
"Building groups for "
.(
$l_max
+1).
" loci"
);
$self
->
warn
(
'Nothing to do, any stored loci'
)
unless
$l_max
>=0;
FEAT1:
for
my
$i
(1 ..
$l_max
){
FEAT2:
for
my
$j
(0 ..
$i
-1){
$self
->debug(
"Evaluate [$i vs $j]"
);
&$advance
(
$i
,
$j
,
$l_max
+1)
if
defined
$advance
;
next
FEAT2
unless
$self
->evaluate(
$loci
->[
$i
],
$loci
->[
$j
]
);
GROUP:
for
my
$g
(0 .. $
MEMBER:
for
my
$m
(0 .. $
if
(
$j
==
$groups
->[
$g
]->[
$m
]){
push
@{
$groups
->[
$g
]},
$i
;
next
FEAT1;
}
}
}
}
push
@{
$groups
}, [
$i
];
}
my
$out
= [];
for
my
$gk
(0 ..
$#$groups
){
my
$group
= Bio::Polloc::LociGroup->new(
-name
=>
sprintf
(
"%04s"
,
$gk
+1));
$group
->genomes(
$self
->genomes);
for
my
$lk
(0 .. $
my
$locus
=
$loci
->[
$groups
->[
$gk
]->[
$lk
] ];
$self
->throw(
'Impossible to gather the locus back:'
.
' $groups->['
.
$gk
.
']->['
.
$lk
.
']: '
.
$groups
->[
$gk
]->[
$lk
],
$loci
,
'Bio::Polloc::Polloc::UnexpectedException'
)
unless
defined
$locus
;
$group
->add_loci(
$locus
);
}
push
@$out
,
$group
;
}
return
$out
;
}
sub
genomes {
my
(
$self
,
$value
) =
@_
;
$self
->
warn
(
"Attempting to set the genomes from a read-only function"
)
if
defined
$value
;
return
unless
defined
$self
->locigroup;
return
$self
->locigroup->genomes;
}
sub
_detect_border_pairs {
my
(
$self
,
$up_pos
,
$down_pos
,
$max_len
) =
@_
;
return
unless
$up_pos
and
$down_pos
;
my
$ext
=
$self
->{
'_groupextension'
};
my
@out
= ();
US:
for
my
$us
(
@$up_pos
){
$self
->throw(
"Unexpected array structure (upstream): "
,
$us
)
unless
defined
$us
->[0] and
defined
$us
->[4];
$self
->debug(
" US: "
,
join
(
':'
,
@$us
));
my
$found
;
my
$pair
= [];
DS:
for
my
$ds
(
@$down_pos
){
$self
->throw(
"Unexpected array structure (downstream): "
,
$ds
)
unless
defined
$ds
->[0] and
defined
$ds
->[4];
$self
->debug(
" DS: "
,
join
(
':'
,
@$ds
));
next
DS
if
$us
->[0] ne
$ds
->[0]
or
$us
->[3] ==
$ds
->[3]
or
abs
(
$ds
->[2]-
$us
->[2]) >
$max_len
or
abs
(
$ds
->[2]-
$us
->[2]) <
$ext
->{
'-minlen'
}
or
defined
$found
and
abs
(
$us
->[2]-
$ds
->[2]) >
$found
;
$self
->debug(
"Saving pair "
.
$us
->[1].
".."
.
$us
->[2].
"/"
.
$ds
->[1].
".."
.
$ds
->[2]);
$found
=
abs
(
$us
->[2]-
$ds
->[2]);
$pair
= [
$us
->[0],
$us
->[2],
$ds
->[2],
$us
->[3], (
$us
->[4]+
$ds
->[4])/2];
}
push
@out
,
$pair
if
$#$pair
>1;
}
return
@out
;
}
sub
_next_group_id {
my
$self
=
shift
;
$self
->{
'_next_group_id'
}||= 0;
return
++
$self
->{
'_next_group_id'
};
}
sub
_build_subseq {
my
(
$self
,
$seq
,
$from
,
$to
) =
@_
;
$self
->throw(
"No main sequence"
,
$seq
)
unless
defined
$seq
and UNIVERSAL::can(
$seq
,
'isa'
) and
$seq
->isa(
'Bio::Seq'
);
my
(
$start
,
$end
) = (min(
$to
,
$from
), max(
$to
,
$from
));
$start
= max(
$start
, 1);
$end
= min(
$end
,
$seq
->
length
);
return
unless
$start
!=
$end
;
my
$seqstr
=
$seq
->subseq(
$start
,
$end
);
my
$cleanstr
=
$seqstr
;
$cleanstr
=~ s/^N*//;
$cleanstr
=~ s/N*$//;
return
unless
length
$cleanstr
> 0;
my
$subseq
= Bio::Seq->new(
-seq
=>
$seqstr
);
$subseq
=
$subseq
->revcom
if
$from
<
$to
;
return
$subseq
;
}
sub
_search_aln_seqs {
my
(
$self
,
$aln
) =
@_
;
my
$ext
=
$self
->{
'_groupextension'
};
return
unless
defined
$ext
;
return
unless
defined
$self
->genomes;
my
$pos
= [];
return
$pos
unless
defined
$aln
;
my
$alg
=
lc
$ext
->{
'-algorithm'
};
if
(
$alg
eq
'blast'
or
$alg
eq
'hmmer'
){
unless
(
defined
$self
->{
'_seqsdb'
}){
$self
->{
'_seqsdb'
} = Bio::Polloc::Polloc::IO->tempdir();
$self
->debug(
"Creating DB at "
.
$self
->{
'_seqsdb'
});
for
my
$genomek
(0 .. $
my
$file
=
$self
->{
'_seqsdb'
}.
"/$genomek"
;
my
$fasta
= Bio::SeqIO->new(
-file
=>
">$file"
,
-format
=>
'Fasta'
);
for
my
$ctg
(@{
$self
->genomes->[
$genomek
]->get_sequences}){
$fasta
->write_seq(
$ctg
) }
if
(
$alg
eq
'blast'
){
my
$run
= Bio::Polloc::Polloc::IO->new(
-file
=>
"formatdb -p F -i '$file' 2>&1 |"
);
while
(
$run
->_readline) {}
$run
->
close
;
}
}
}
my
$factory
;
my
$query
;
if
(
$alg
eq
'blast'
){
$self
->_load_module(
'Bio::Tools::Run::StandAloneBlast'
);
my
$cons_seq
=
$aln
->consensus_string(
$ext
->{
'-consensusperc'
});
$cons_seq
=~ s/\?/N/g;
$query
= Bio::Seq->new(
-seq
=>
$cons_seq
);
}
elsif
(
$alg
eq
'hmmer'
){
$self
->_load_module(
'Bio::Tools::Run::Hmmer'
);
my
$tmpio
= Bio::Polloc::Polloc::IO->new();
$factory
= Bio::Tools::Run::Hmmer->new();
$factory
->hmm(
$tmpio
->tempfile);
$factory
->program(
'hmmbuild'
);
$factory
->run(
$aln
);
}
$self
->debug(
"Searching... alg:$alg, sim:"
.
$ext
->{
'-similarity'
}.
" score:"
.
$ext
->{
'-score'
}.
" e:"
.
$ext
->{
'-e'
});
GENOME:
for
my
$Gk
(0 .. $
my
$report
;
if
(
$alg
eq
'blast'
){
next
GENOME
if
(
$query
->seq =~
tr
/N//) > 0.25
*$query
->
length
;
$factory
= Bio::Tools::Run::StandAloneBlast->new(
'-e'
=>
$ext
->{
'-e'
},
'-program'
=>
$ext
->{
'-p'
},
'-db'
=>
$self
->{
'_seqsdb'
}.
"/$Gk"
);
try
{
$report
=
$factory
->blastall(
$query
); }
catch
Error
with
{
$self
->debug(
"Launch BLAST with query: "
.
$query
->seq());
$self
->
warn
(
"BLAST failed, skipping query and attempting to continue"
);
next
GENOME;
}
otherwise {
$self
->throw(
"BLAST failed"
,
$_
,
'Bio::Polloc::Polloc::UnexpectedException'
);
};
}
elsif
(
$alg
eq
'hmmer'
){
$factory
->program(
'hmmsearch'
);
$report
=
$factory
->run(
$self
->{
'_seqsdb'
}.
"/$Gk"
);
}
RESULT:
while
(
my
$res
=
$report
->next_result){
HIT:
while
(
my
$hit
=
$res
->next_hit){
HSP:
while
(
my
$hsp
=
$hit
->next_hsp){
if
( (
$alg
eq
'blast'
and
$hsp
->frac_identical(
'query'
) >=
$ext
->{
'-similarity'
}
and
$hsp
->score >=
$ext
->{
'-score'
})
or
(
$alg
eq
'hmmer'
and
$hsp
->score >=
$ext
->{
'-score'
}
and
$hsp
->evalue <=
$ext
->{
'-e'
})
){
$self
->debug(
"Found: sim:"
.
$hsp
->frac_identical(
'query'
).
", score:"
.
$hsp
->score.
", e:"
.
$hsp
->evalue);
my
$r_pos
= [
"$Gk:"
.
$hit
->accession,
$hsp
->strand(
'hit'
)!=
$hsp
->strand(
'query'
)?
$hsp
->start(
'hit'
):
$hsp
->end(
'hit'
),
$hsp
->strand(
'hit'
)!=
$hsp
->strand(
'query'
)?
$hsp
->end(
'hit'
):
$hsp
->start(
'hit'
),
$hsp
->strand(
'hit'
)!=
$hsp
->strand(
'query'
)?
-1 : 1,
$hsp
->bits];
push
@$pos
,
$r_pos
;
}
}
}
}
}
}
else
{
$self
->throw(
'Unsupported search algorithm'
,
$ext
->{
'-algorithm'
});
}
return
$pos
;
}
sub
_feat_index2obj{
my
(
$self
,
$groups
) =
@_
;
for
my
$g
(0 .. $
for
my
$m
(0 .. $
$groups
->[
$g
]->[
$m
] =
$self
->get_locus(
$groups
->[
$g
]->[
$m
]);
}
}
return
$groups
;
}
sub
_grouprules_cleanup {
my
$self
=
shift
;
if
(
defined
$self
->{
'_seqsdb'
}) {
my
$tmp
=
$self
->{
'_seqsdb'
};
for
my
$k
(0 .. $
while
(<
$tmp
/
$k
.*>){
unlink
$_
or
$self
->throw(
"Impossible to delete '$_'"
, $!);
}
unlink
"$tmp/$k"
or
$self
->throw(
"Impossible to delete '$tmp/$k'"
, $!);
}
rmdir
$tmp
;
}
}
sub
_initialize {
my
(
$self
,
@args
) =
@_
;
$self
->_register_cleanup_method(\
&_grouprules_cleanup
);
my
(
$source
,
$target
,
$features
,
$loci
) =
$self
->_rearrange([
qw(SOURCE TARGET FEATURES LOCI)
],
@args
);
$self
->source(
$source
);
$self
->target(
$target
);
$loci
=
$features
if
defined
$features
and not
defined
$loci
;
$self
->locigroup(
$loci
);
}
1;