$Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder::VERSION
=
'112.0.0_57'
;
$Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder::VERSION
=
'112.0.057'
;
no
warnings
'uninitialized'
;
our
@ISA
=
qw(Bio::EnsEMBL::IdMapping::ScoreBuilder)
;
sub
score_transcripts {
my
$self
=
shift
;
my
$exon_matrix
=
shift
;
unless
(
$exon_matrix
and
$exon_matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
)) {
throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
$self
->logger->info(
"-- Scoring transcripts...\n\n"
, 0,
'stamped'
);
my
$matrix
=
$self
->scores_from_exon_scores(
$exon_matrix
);
if
(
$self
->logger->loglevel eq
'debug'
) {
$matrix
->
log
(
'transcript'
,
$self
->conf->param(
'basedir'
));
}
my
$fmt
=
"%-40s%10.0f\n"
;
$self
->logger->info(
"Scoring matrix:\n"
);
$self
->logger->info(
sprintf
(
$fmt
,
"Total source transcripts:"
,
$self
->cache->get_count_by_name(
'transcripts_by_id'
,
'source'
)), 1);
$self
->logger->info(
sprintf
(
$fmt
,
"Total target transcripts:"
,
$self
->cache->get_count_by_name(
'transcripts_by_id'
,
'target'
)), 1);
$self
->log_matrix_stats(
$matrix
);
$self
->logger->info(
"\nDone with transcript scoring.\n\n"
);
return
$matrix
;
}
sub
scores_from_exon_scores {
my
$self
=
shift
;
my
$exon_matrix
=
shift
;
unless
(
$exon_matrix
and
$exon_matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
)) {
throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
my
$dump_path
= path_append(
$self
->conf->param(
'basedir'
),
'matrix'
);
my
$matrix
= Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
-DUMP_PATH
=>
$dump_path
,
-CACHE_FILE
=>
'transcript_matrix.ser'
,
);
my
$transcript_cache
=
$matrix
->cache_file;
if
(-s
$transcript_cache
) {
$self
->logger->info(
"Reading transcript scoring matrix from file...\n"
, 0,
'stamped'
);
$self
->logger->debug(
"Cache file $transcript_cache.\n"
, 1);
$matrix
->read_from_file;
$self
->logger->info(
"Done.\n\n"
, 0,
'stamped'
);
}
else
{
$self
->logger->info(
"No transcript scoring matrix found. Will build new one.\n"
);
$self
->logger->info(
"Transcript scoring...\n"
, 0,
'stamped'
);
$matrix
=
$self
->build_scores(
$matrix
,
$exon_matrix
);
$self
->logger->info(
"Done.\n\n"
, 0,
'stamped'
);
$matrix
->write_to_file;
}
return
$matrix
;
}
sub
build_scores {
my
$self
=
shift
;
my
$matrix
=
shift
;
my
$exon_matrix
=
shift
;
unless
(
$matrix
and
$matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
)) {
throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
unless
(
$exon_matrix
and
$exon_matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
)) {
throw(
'Need a exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
$self
->flag_matrix_from_exon_scores(
$matrix
,
$exon_matrix
);
my
$final_matrix
=
$self
->score_matrix_from_flag_matrix(
$matrix
,
$exon_matrix
);
return
$final_matrix
;
}
sub
flag_matrix_from_exon_scores {
my
$self
=
shift
;
my
$matrix
=
shift
;
my
$exon_matrix
=
shift
;
my
$i
;
my
$num_transcripts
=
scalar
(
keys
%{
$self
->cache->get_by_name(
'transcripts_by_id'
,
'source'
) });
my
$progress_id
=
$self
->logger->init_progress(
$num_transcripts
, 100);
$self
->logger->info(
"Creating flag matrix...\n"
, 1);
foreach
my
$source_transcript
(
values
%{
$self
->cache->get_by_name(
'transcripts_by_id'
,
'source'
) }) {
$self
->logger->log_progress(
$progress_id
, ++
$i
, 1);
foreach
my
$source_exon
(@{
$source_transcript
->get_all_Exons }) {
foreach
my
$target_exon_id
(@{
$exon_matrix
->get_targets_for_source(
$source_exon
->id) }) {
foreach
my
$target_transcript
(@{
$self
->cache->get_by_key(
'transcripts_by_exon_id'
,
'target'
,
$target_exon_id
) }) {
$matrix
->add_score(
$source_transcript
->id,
$target_transcript
->id, 1);
}
}
}
}
$self
->logger->info(
"\n"
);
return
$matrix
;
}
sub
score_matrix_from_flag_matrix {
my
$self
=
shift
;
my
$flag_matrix
=
shift
;
my
$exon_matrix
=
shift
;
unless
(
$flag_matrix
and
$flag_matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
)) {
throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
unless
(
$exon_matrix
and
$exon_matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
)) {
throw(
'Need an exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
my
$transcript_score_threshold
=
$self
->conf->param(
'transcript_score_threshold'
) || 0;
my
$matrix
= Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
-DUMP_PATH
=>
$flag_matrix
->dump_path,
-CACHE_FILE
=>
$flag_matrix
->cache_file_name,
);
my
$i
;
my
$num_transcripts
=
scalar
(
keys
%{
$self
->cache->get_by_name(
'transcripts_by_id'
,
'source'
) });
my
$progress_id
=
$self
->logger->init_progress(
$num_transcripts
, 100);
$self
->logger->info(
"Creating score matrix from flag matrix...\n"
, 1);
my
$fmt_d1
=
"%-14s%-15s%-14s%-14s%-14s\n"
;
my
$fmt_d2
=
"%.6f"
;
foreach
my
$source_transcript
(
values
%{
$self
->cache->get_by_name(
'transcripts_by_id'
,
'source'
) }) {
$self
->logger->log_progress(
$progress_id
, ++
$i
, 1);
my
%source_exons
=
map
{
$_
->
id
=> 1 }
@{
$source_transcript
->get_all_Exons };
my
$source_transcript_length
=
$source_transcript
->
length
;
foreach
my
$target_transcript_id
(@{
$flag_matrix
->get_targets_for_source(
$source_transcript
->id) }) {
my
$target_transcript
=
$self
->cache->get_by_key(
'transcripts_by_id'
,
'target'
,
$target_transcript_id
);
my
$source_transcript_score
= 0;
my
$target_transcript_score
= 0;
my
$target_transcript_length
=
$target_transcript
->
length
;
my
%target_exons
=
map
{
$_
->
id
=> 1 }
@{
$target_transcript
->get_all_Exons };
foreach
my
$source_exon
(@{
$source_transcript
->get_all_Exons }) {
my
$max_source_score
= -1;
foreach
my
$target_exon_id
(@{
$exon_matrix
->get_targets_for_source(
$source_exon
->id) }) {
next
unless
(
$target_exons
{
$target_exon_id
});
my
$score
=
$exon_matrix
->get_score(
$source_exon
->id,
$target_exon_id
);
$max_source_score
=
$score
if
(
$score
>
$max_source_score
);
}
if
(
$max_source_score
> 0) {
$source_transcript_score
+= (
$max_source_score
*
$source_exon
->
length
);
}
}
foreach
my
$target_exon
(@{
$target_transcript
->get_all_Exons }) {
my
$max_target_score
= -1;
foreach
my
$source_exon_id
(@{
$exon_matrix
->get_sources_for_target(
$target_exon
->id) }) {
next
unless
(
$source_exons
{
$source_exon_id
});
my
$score
=
$exon_matrix
->get_score(
$source_exon_id
,
$target_exon
->id);
$max_target_score
=
$score
if
(
$score
>
$max_target_score
);
}
if
(
$max_target_score
> 0) {
$target_transcript_score
+= (
$max_target_score
*
$target_exon
->
length
);
}
}
if
((
$source_transcript_length
+
$target_transcript_length
) > 0) {
if
((
$source_transcript_score
>
$source_transcript_length
) or
(
$target_transcript_score
>
$target_transcript_length
)) {
$self
->logger->warning(
"Score > length for source ($source_transcript_score <> $source_transcript_length) or target ($target_transcript_score <> $target_transcript_length).\n"
, 1);
}
else
{
my
$source_transcript_biotype_group
=
$self
->get_biotype_group(
$source_transcript
->biotype());
my
$target_transcript_biotype_group
=
$self
->get_biotype_group(
$target_transcript
->biotype());
$self
->logger->info(
$source_transcript
->id.
":"
.
$target_transcript
->id.
" source score: $source_transcript_score"
.
" source length: $source_transcript_length"
.
" source biotype:"
.
$source_transcript
->biotype() .
" source group: $source_transcript_biotype_group"
.
" target score: $target_transcript_score"
.
" target biotype:"
.
$target_transcript
->biotype() .
" target group: $target_transcript_biotype_group"
.
" target length: $target_transcript_length\n"
);
my
$transcript_score
=
(
$source_transcript_score
+
$target_transcript_score
) /
(
$source_transcript_length
+
$target_transcript_length
);
if
(
$source_transcript
->biotype() ne
$target_transcript
->biotype()) {
$transcript_score
=
$transcript_score
* 0.9;
}
if
(
$source_transcript_biotype_group
ne
$target_transcript_biotype_group
) {
$transcript_score
=
$transcript_score
* 0.8;
}
if
(
$transcript_score
>
$transcript_score_threshold
) {
$matrix
->add_score(
$source_transcript
->id,
$target_transcript
->id,
$transcript_score
);
}
}
}
else
{
$self
->logger->warning(
"Combined length of source ("
.
$source_transcript
->id.
") and target ("
.
$target_transcript
->id.
") transcript is zero!\n"
, 1);
}
}
}
$self
->logger->info(
"\n"
);
return
$matrix
;
}
sub
biotype_transcript_rescore {
my
(
$self
,
$matrix
) =
@_
;
if
(
defined
(
$matrix
) &&
!
$matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
) )
{
throw(
'Exprected Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
);
}
my
$i
= 0;
foreach
my
$entry
( @{
$matrix
->get_all_Entries } ) {
my
$source_transcript
=
$self
->cache->get_by_key(
'transcripts_by_id'
,
'source'
,
$entry
->source() );
my
$target_transcript
=
$self
->cache->get_by_key(
'transcripts_by_id'
,
'target'
,
$entry
->target() );
if
(
$source_transcript
->biotype() ne
$target_transcript
->biotype() )
{
$matrix
->set_score(
$entry
->source(),
$entry
->target(),
0.9
*$entry
->score() );
$i
++;
}
}
$self
->logger->debug(
"Scored transcripts with biotype mismatch: $i\n"
,
1 );
}
sub
different_translation_rescore {
my
$self
=
shift
;
my
$matrix
=
shift
;
unless
(
$matrix
and
$matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
) )
{
throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
my
$i
= 0;
foreach
my
$entry
(
sort
{
$b
->score <=>
$a
->score }
@{
$matrix
->get_all_Entries } )
{
last
if
(
$entry
->score < 1 );
my
$source_tl
=
$self
->cache->get_by_key(
'transcripts_by_id'
,
'source'
,
$entry
->source )->translation;
my
$target_tl
=
$self
->cache->get_by_key(
'transcripts_by_id'
,
'target'
,
$entry
->target )->translation;
next
if
( !
$source_tl
and !
$target_tl
);
if
( !
$source_tl
or !
$target_tl
or (
$source_tl
->seq ne
$target_tl
->seq ) )
{
$matrix
->set_score(
$entry
->source(),
$entry
->target(),
0.9
*$entry
->score() );
$i
++;
}
}
$self
->logger->debug(
"Non-perfect translations on perfect transcripts: $i\n"
,
1 );
}
sub
non_mapped_gene_rescore {
my
$self
=
shift
;
my
$matrix
=
shift
;
my
$gene_mappings
=
shift
;
unless
(
$matrix
and
$matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
) )
{
throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
unless
(
$gene_mappings
and
$gene_mappings
->isa(
'Bio::EnsEMBL::IdMapping::MappingList'
) )
{
throw(
'Need a gene Bio::EnsEMBL::IdMapping::MappingList.'
);
}
my
%gene_lookup
=
map
{
$_
->
source
=>
$_
->target }
@{
$gene_mappings
->get_all_Entries };
my
$i
= 0;
foreach
my
$entry
( @{
$matrix
->get_all_Entries } ) {
my
$source_gene
=
$self
->cache->get_by_key(
'genes_by_transcript_id'
,
'source'
,
$entry
->source );
my
$target_gene
=
$self
->cache->get_by_key(
'genes_by_transcript_id'
,
'target'
,
$entry
->target );
my
$mapped_target
=
$gene_lookup
{
$source_gene
->id };
if
( !
$mapped_target
or (
$mapped_target
!=
$target_gene
->id ) ) {
$matrix
->set_score(
$entry
->source(),
$entry
->target(),
0.9
*$entry
->score() );
$i
++;
}
}
$self
->logger->debug(
"Scored transcripts in non-mapped genes: $i\n"
,
1 );
}
sub
get_biotype_group {
my
(
$self
,
$biotype
) =
@_
;
my
$dba
=
$self
->cache->get_DBAdaptor(
'target'
);
my
$biotype_adaptor
=
$dba
->get_BiotypeAdaptor();
my
$biotype_object
=
$biotype_adaptor
->fetch_by_name_object_type(
$biotype
,
'transcript'
);
return
$biotype_object
->biotype_group();
}
1;