$Bio::EnsEMBL::IdMapping::GeneScoreBuilder::VERSION
=
'112.0.0'
;
no
warnings
'uninitialized'
;
our
@ISA
=
qw(Bio::EnsEMBL::IdMapping::ScoreBuilder)
;
sub
score_genes {
my
$self
=
shift
;
my
$transcript_matrix
=
shift
;
unless
(
$transcript_matrix
and
$transcript_matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
)) {
throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
$self
->logger->info(
"-- Scoring genes...\n\n"
, 0,
'stamped'
);
my
$matrix
=
$self
->scores_from_transcript_scores(
$transcript_matrix
);
if
(
$self
->logger->loglevel eq
'debug'
) {
$matrix
->
log
(
'gene'
,
$self
->conf->param(
'basedir'
));
}
my
$fmt
=
"%-40s%10.0f\n"
;
$self
->logger->info(
"Scoring matrix:\n"
);
$self
->logger->info(
sprintf
(
$fmt
,
"Total source genes:"
,
$self
->cache->get_count_by_name(
'genes_by_id'
,
'source'
)), 1);
$self
->logger->info(
sprintf
(
$fmt
,
"Total target genes:"
,
$self
->cache->get_count_by_name(
'genes_by_id'
,
'target'
)), 1);
$self
->log_matrix_stats(
$matrix
);
$self
->logger->info(
"\nDone with gene scoring.\n\n"
);
return
$matrix
;
}
sub
scores_from_transcript_scores {
my
$self
=
shift
;
my
$transcript_matrix
=
shift
;
unless
(
$transcript_matrix
and
$transcript_matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
)) {
throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
my
$dump_path
= path_append(
$self
->conf->param(
'basedir'
),
'matrix'
);
my
$matrix
= Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
-DUMP_PATH
=>
$dump_path
,
-CACHE_FILE
=>
'gene_matrix.ser'
,
);
my
$gene_cache
=
$matrix
->cache_file;
if
(-s
$gene_cache
) {
$self
->logger->info(
"Reading gene scoring matrix from file...\n"
, 0,
'stamped'
);
$self
->logger->debug(
"Cache file $gene_cache.\n"
, 1);
$matrix
->read_from_file;
$self
->logger->info(
"Done.\n\n"
, 0,
'stamped'
);
}
else
{
$self
->logger->info(
"No gene scoring matrix found. Will build new one.\n"
);
$self
->logger->info(
"Gene scoring...\n"
, 0,
'stamped'
);
$matrix
=
$self
->build_scores(
$matrix
,
$transcript_matrix
);
$self
->logger->info(
"Done.\n\n"
, 0,
'stamped'
);
$matrix
->write_to_file;
}
return
$matrix
;
}
sub
build_scores {
my
$self
=
shift
;
my
$matrix
=
shift
;
my
$transcript_matrix
=
shift
;
unless
(
$matrix
and
$matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
)) {
throw(
'Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
unless
(
$transcript_matrix
and
$transcript_matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
)) {
throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
$self
->flag_matrix_from_transcript_scores(
$matrix
,
$transcript_matrix
);
my
$final_matrix
=
$self
->score_matrix_from_flag_matrix(
$matrix
,
$transcript_matrix
);
return
$final_matrix
;
}
sub
flag_matrix_from_transcript_scores {
my
$self
=
shift
;
my
$matrix
=
shift
;
my
$transcript_matrix
=
shift
;
unless
(
$matrix
and
$matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
)) {
throw(
'Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
unless
(
$transcript_matrix
and
$transcript_matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
)) {
throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
my
$i
;
my
$num_entries
=
$transcript_matrix
->get_entry_count;
my
$progress_id
=
$self
->logger->init_progress(
$num_entries
, 100);
$self
->logger->info(
"Creating flag matrix...\n"
, 1);
foreach
my
$entry
(@{
$transcript_matrix
->get_all_Entries }) {
$self
->logger->log_progress(
$progress_id
, ++
$i
, 1);
my
$source_gene
=
$self
->cache->get_by_key(
'genes_by_transcript_id'
,
'source'
,
$entry
->source);
my
$target_gene
=
$self
->cache->get_by_key(
'genes_by_transcript_id'
,
'target'
,
$entry
->target);
$matrix
->add_score(
$source_gene
->id,
$target_gene
->id, 1);
}
$self
->logger->info(
"\n"
);
return
$matrix
;
}
sub
score_matrix_from_flag_matrix {
my
$self
=
shift
;
my
$flag_matrix
=
shift
;
my
$transcript_matrix
=
shift
;
my
$simple_scoring
=
shift
;
unless
(
$flag_matrix
and
$flag_matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
) )
{
throw(
'Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
unless
(
$transcript_matrix
and
$transcript_matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
)
)
{
throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
my
$matrix
=
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
-DUMP_PATH
=>
$flag_matrix
->dump_path,
-CACHE_FILE
=>
$flag_matrix
->cache_file_name,
);
my
$i
;
my
$num_entries
=
$flag_matrix
->get_entry_count;
my
$progress_id
=
$self
->logger->init_progress(
$num_entries
, 100 );
$self
->logger->info(
"Creating score matrix from flag matrix...\n"
,
1 );
my
$gene_score_threshold
=
$self
->conf()->param(
'gene_score_threshold'
) || 0;
foreach
my
$entry
( @{
$flag_matrix
->get_all_Entries } ) {
$self
->logger->log_progress(
$progress_id
, ++
$i
, 1 );
my
$score
= 0;
my
$source_gene
=
$self
->cache->get_by_key(
'genes_by_id'
,
'source'
,
$entry
->source );
my
$target_gene
=
$self
->cache->get_by_key(
'genes_by_id'
,
'target'
,
$entry
->target );
if
(
$simple_scoring
) {
$score
=
$self
->simple_gene_gene_score(
$source_gene
,
$target_gene
,
$transcript_matrix
);
}
else
{
$score
=
$self
->complex_gene_gene_score(
$source_gene
,
$target_gene
,
$transcript_matrix
);
}
if
(
$score
>
$gene_score_threshold
) {
$matrix
->add_score(
$entry
->source,
$entry
->target,
$score
);
}
}
$self
->logger->info(
"\n"
);
return
$matrix
;
}
sub
complex_gene_gene_score {
my
$self
=
shift
;
my
$source_gene
=
shift
;
my
$target_gene
=
shift
;
my
$transcript_matrix
=
shift
;
my
$source_gene_score
= 0;
my
$target_gene_score
= 0;
my
$source_gene_accum_length
= 0;
my
$target_gene_accum_length
= 0;
my
%target_transcripts
=
map
{
$_
->
id
=> 1 }
@{
$target_gene
->get_all_Transcripts };
foreach
my
$source_transcript
(@{
$source_gene
->get_all_Transcripts }) {
my
$max_source_score
= -1;
foreach
my
$target_transcript_id
(@{
$transcript_matrix
->get_targets_for_source(
$source_transcript
->id) }) {
next
unless
(
$target_transcripts
{
$target_transcript_id
});
my
$score
=
$transcript_matrix
->get_score(
$source_transcript
->id,
$target_transcript_id
);
$max_source_score
=
$score
if
(
$score
>
$max_source_score
);
}
if
(
$max_source_score
> 0) {
$source_gene_score
+= (
$max_source_score
*
$source_transcript
->
length
);
}
$source_gene_accum_length
+=
$source_transcript
->
length
;
}
my
%source_transcripts
=
map
{
$_
->
id
=> 1 }
@{
$source_gene
->get_all_Transcripts };
foreach
my
$target_transcript
(@{
$target_gene
->get_all_Transcripts }) {
my
$max_target_score
= -1;
foreach
my
$source_transcript_id
(@{
$transcript_matrix
->get_sources_for_target(
$target_transcript
->id) }) {
next
unless
(
$source_transcripts
{
$source_transcript_id
});
my
$score
=
$transcript_matrix
->get_score(
$source_transcript_id
,
$target_transcript
->id);
$max_target_score
=
$score
if
(
$score
>
$max_target_score
);
}
if
(
$max_target_score
> 0) {
$target_gene_score
+= (
$max_target_score
*
$target_transcript
->
length
);
}
$target_gene_accum_length
+=
$target_transcript
->
length
;
}
my
$gene_score
= 0;
if
((
$source_gene_accum_length
+
$target_gene_accum_length
) > 0) {
$gene_score
= (
$source_gene_score
+
$target_gene_score
) /
(
$source_gene_accum_length
+
$target_gene_accum_length
);
}
else
{
$self
->logger->warning(
"Combined transcript length of source ("
.
$source_gene
->id.
") and target ("
.
$target_gene
->id.
") gene is zero!\n"
, 1);
}
if
(
$gene_score
> 1) {
$self
->logger->warning(
"Illegal gene score: $gene_score ("
.
join
(
"|"
,
$source_gene_score
,
$target_gene_score
,
$source_gene_accum_length
,
$target_gene_accum_length
).
")\n"
, 1);
}
return
$gene_score
;
}
sub
simple_gene_gene_score {
my
$self
=
shift
;
my
$source_gene
=
shift
;
my
$target_gene
=
shift
;
my
$transcript_matrix
=
shift
;
my
$gene_score
= 0;
foreach
my
$source_transcript
(@{
$source_gene
->get_all_Transcripts }) {
foreach
my
$target_transcript
(@{
$target_gene
->get_all_Transcripts }) {
my
$score
=
$transcript_matrix
->get_score(
$source_transcript
->id,
$target_transcript
->id);
$gene_score
=
$score
if
(
$score
>
$gene_score
);
}
}
return
$gene_score
;
}
sub
simple_gene_rescore {
my
$self
=
shift
;
my
$gene_matrix
=
shift
;
my
$transcript_matrix
=
shift
;
$gene_matrix
=
$self
->score_matrix_from_flag_matrix(
$gene_matrix
,
$transcript_matrix
, 1);
}
sub
biotype_gene_rescore {
my
$self
=
shift
;
my
$matrix
=
shift
;
unless
(
$matrix
and
$matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
) )
{
throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
my
$i
= 0;
foreach
my
$entry
( @{
$matrix
->get_all_Entries } ) {
my
$source_gene
=
$self
->cache->get_by_key(
'genes_by_id'
,
'source'
,
$entry
->source );
my
$target_gene
=
$self
->cache->get_by_key(
'genes_by_id'
,
'target'
,
$entry
->target );
if
(
$source_gene
->biotype() ne
$target_gene
->biotype() ) {
$matrix
->set_score(
$entry
->source(),
$entry
->target(),
0.9
*$entry
->score() );
$i
++;
}
}
$self
->logger->debug(
"Scored genes with biotype mismatch: $i\n"
, 1 );
}
sub
location_gene_rescore {
my
$self
=
shift
;
my
$matrix
=
shift
;
unless
(
$matrix
and
$matrix
->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'
) )
{
throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
);
}
my
$i
= 0;
foreach
my
$entry
( @{
$matrix
->get_all_Entries } ) {
my
$source_gene
=
$self
->cache->get_by_key(
'genes_by_id'
,
'source'
,
$entry
->source );
my
$target_gene
=
$self
->cache->get_by_key(
'genes_by_id'
,
'target'
,
$entry
->target );
if
(
$source_gene
->seq_region_name() ne
$target_gene
->seq_region_name() ) {
$matrix
->set_score(
$entry
->source(),
$entry
->target(),
0.9
*$entry
->score() );
$i
++;
}
}
}
1;