$Bio::MUST::Apps::Leel::OrgProcessor::VERSION
=
'0.213470'
;
has
'ali_proc'
=> (
is
=>
'ro'
,
isa
=>
'Bio::MUST::Apps::Leel::AliProcessor'
,
required
=> 1,
);
has
'protein_seqs'
=> (
is
=>
'ro'
,
isa
=>
'Bio::MUST::Core::Ali::Temporary'
,
init_arg
=>
undef
,
lazy
=> 1,
builder
=>
'_build_protein_seqs'
,
handles
=> [
qw(abbr_id_for long_id_for)
],
);
has
'protein_seq_mapper'
=> (
is
=>
'ro'
,
isa
=>
'Bio::MUST::Core::IdMapper'
,
init_arg
=>
undef
,
lazy
=> 1,
builder
=>
'_build_protein_seq_mapper'
,
handles
=> {
accession_for
=>
'long_id_for'
,
},
);
has
$_
.
'_seqs'
=> (
is
=>
'ro'
,
isa
=>
'Bio::MUST::Core::Ali'
,
init_arg
=>
undef
,
lazy
=> 1,
builder
=>
'_build_'
.
$_
.
'_seqs'
,
)
for
qw(transcript aligned)
;
sub
_build_protein_seqs {
my
$self
=
shift
;
my
$ap
=
$self
->ali_proc;
my
$org
=
$self
->org;
my
$ali
=
$ap
->ali;
my
@seqs
=
$ali
->filter_seqs(
sub
{
$_
->full_org eq
$org
} );
carp
"[ORG] Warning: no seq for org $org; skipping org!"
unless
@seqs
;
return
Temporary->new(
seqs
=> \
@seqs
);
}
sub
_build_protein_seq_mapper {
my
$self
=
shift
;
my
@protein_seq_ids
=
$self
->protein_seqs->all_seq_ids;
my
@abbr_ids
=
map
{
$self
->abbr_id_for(
$_
->full_id )
}
@protein_seq_ids
;
my
@long_ids
=
map
{
$_
->accession
=~ s{
$TAIL_42
}{}xmsgr
}
@protein_seq_ids
;
return
IdMapper->new(
long_ids
=> \
@long_ids
,
abbr_ids
=> \
@abbr_ids
,
);
}
sub
_build_transcript_seqs {
my
$self
=
shift
;
my
$ap
=
$self
->ali_proc;
my
$rp
=
$ap
->run_proc;
my
$args
=
$rp
->blast_args_for(
'transcripts'
) // {};
$args
->{-outfmt} = 5;
$args
->{-max_target_seqs} = 5
unless
defined
$args
->{-max_target_seqs};
$args
->{-db_gencode} =
$self
->code;
my
@seqs
;
for
my
$bank
(
$self
->all_banks) {
my
$file
= file(
$rp
->bank_dir,
$bank
);
my
$blastdb
= Bio::MUST::Drivers::Blast::Database->new(
file
=>
$file
);
my
$parser
=
$blastdb
->tblastn(
$self
->protein_seqs,
$args
);
my
$bo
=
$parser
->blast_output;
tie
my
%mask_for
,
'Tie::IxHash'
;
my
%hit_for
;
PROTEIN:
for
my
$protein
(
$bo
->all_iterations) {
my
$protein_acc
=
$self
->accession_for(
$protein
->query_def );
TRANSCRIPT:
for
my
$transcript
(
$protein
->all_hits) {
my
@fields
=
split
/\|/xms,
$transcript
->id;
my
$transcript_acc
=
pop
@fields
;
my
$rank
=
$transcript
->num;
unless
(
$protein_acc
eq
$transcript_acc
) {
next
TRANSCRIPT
if
$rp
->id_match_mode eq
'enforce'
;
}
elsif
(
$rank
> 1) {
}
$self
->_collect_hsps(
protein
=>
$protein
,
transcript
=>
$transcript
,
mask_for
=> \
%mask_for
,
hit_for
=> \
%hit_for
,
);
next
PROTEIN;
}
}
$parser
->remove
unless
$rp
->debug_mode;
my
@hits
=
$self
->_fetch_and_trim_hits(
blastdb
=>
$blastdb
,
mask_for
=> \
%mask_for
,
hit_for
=> \
%hit_for
,
);
push
@seqs
,
@hits
}
my
$missing_n
=
$self
->protein_seqs->count_seqs -
@seqs
;
carp
"[ORG] Warning: cannot find $missing_n transcript(s):"
.
' skipping it (them)!'
if
$missing_n
;
return
Ali->new(
seqs
=> \
@seqs
);
}
sub
_build_aligned_seqs {
my
$self
=
shift
;
my
$ap
=
$self
->ali_proc;
my
$rp
=
$ap
->run_proc;
my
$aligned_seqs
= Ali->new();
TRANSCRIPT:
for
my
$transcript_seq
(
$self
->transcript_seqs->all_seqs ) {
my
$protein_seq
=
$ap
->protein_for(
$transcript_seq
->full_id );
my
$exo
= Bio::MUST::Drivers::Exonerate::Aligned->new(
dna_seq
=>
$transcript_seq
,
pep_seq
=>
$protein_seq
->clone->degap,
code
=>
$self
->code,
);
my
$new_seq
=
$exo
->target_seq;
unless
(
$new_seq
->seq_len) {
next
TRANSCRIPT;
}
$aligned_seqs
->add_seq(
$ap
->integrator->align(
new_seq
=>
$new_seq
,
subject
=>
$exo
->query_seq,
template
=>
$protein_seq
,
cds_seq
=>
$exo
->spliced_seq,
start
=>
$exo
->query_start,
)
);
}
return
$aligned_seqs
;
}
sub
BUILD {
my
$self
=
shift
;
my
$ap
=
$self
->ali_proc;
my
$rp
=
$ap
->run_proc;
return
unless
$self
->protein_seqs->all_long_ids;
return
unless
$self
->transcript_seqs->all_seq_ids;
if
(
$rp
->aligner_mode eq
'off'
) {
$ap
->cds_ali->add_seq(
$self
->transcript_seqs->all_seqs );
return
;
}
return
unless
$self
->aligned_seqs->all_seq_ids;
$ap
->cds_ali->add_seq(
$self
->aligned_seqs->all_seqs );
return
;
}
__PACKAGE__->meta->make_immutable;
1;