# Package version. Two assignments are made in sequence: the first uses a
# string containing an underscore, the second immediately overwrites it with
# a plain decimal string, so '112.055' is the value left in $VERSION.
$Bio::EnsEMBL::IdMapping::StableIdMapper::VERSION
=
'112.0_55'
;
$Bio::EnsEMBL::IdMapping::StableIdMapper::VERSION
=
'112.055'
;
# Switch off "uninitialized value" warnings for the rest of this lexical
# scope; several subs below interpolate or compare values that may be undef.
no
warnings
'uninitialized'
;
# Inherit from the IdMapping base object.
our
@ISA
=
qw(Bio::EnsEMBL::IdMapping::BaseObject)
;
# File-scoped store of debug rows, keyed by object type. map_stable_ids()
# pushes [ source id, target id, stable id ] triples onto it and
# dump_debug_mappings() writes them out.
my
%debug_mappings
;
# Constructor.
#
# Delegates object construction to the parent class, then instantiates the
# stable ID generator class named by the 'plugin_stable_id_generator'
# configuration parameter (falling back to the EnsemblGeneric generator) and
# stores that instance on the new object.
#
# Arguments: passed through unchanged to SUPER::new.
# Returns:   the new object.
sub new {
    my ($caller, @args) = @_;

    # Allow construction from either a class name or an existing instance.
    my $class = ref($caller) || $caller;
    my $self  = $class->SUPER::new(@args);

    my $stable_id_generator =
        $self->conf->param('plugin_stable_id_generator');
    $stable_id_generator ||=
        'Bio::EnsEMBL::IdMapping::StableIdGenerator::EnsemblGeneric';

    $self->logger->debug(
        "Using $stable_id_generator to generate stable Ids.\n");

    # inject() is defined outside this block; presumably it loads the named
    # class at runtime so that ->new can be called on it -- TODO confirm.
    inject($stable_id_generator);

    # The generator shares this object's logger, configuration and cache.
    $self->stable_id_generator(
        $stable_id_generator->new(
            -LOGGER => $self->logger,
            -CONF   => $self->conf,
            -CACHE  => $self->cache
        )
    );

    return $self;
}
# Create the mapping_session record for this run (at most once per object).
#
# Sets the session date (epoch seconds and formatted string), determines the
# mapping_session_id, copies every existing mapping_session row from the
# source database into tables/mapping_session.txt and appends one new row
# describing the current session.
#
# Returns nothing useful; returns immediately if a session date is already
# set on this object.
sub generate_mapping_session {
    my ($self) = @_;

    # A session date being set means a session was generated earlier.
    if ($self->mapping_session_date) {
        return;
    }

    $self->logger->info("Generating new mapping_session...\n");

    # Stamp the session with the current time, raw and formatted.
    $self->mapping_session_date(time);
    $self->mapping_session_date_fmt(
        strftime("%Y-%m-%d %T", localtime($self->mapping_session_date)));

    my $s_dbh = $self->cache->get_DBAdaptor('source')->dbc->db_handle;
    my $t_dbh = $self->cache->get_DBAdaptor('target')->dbc->db_handle;

    # A configured session ID wins; otherwise use the highest ID found in
    # the source database plus one.
    my $mapping_session_id = $self->conf->param('mapping_session_id');

    if (!$mapping_session_id) {
        my $sql = qq(SELECT MAX(mapping_session_id)
FROM mapping_session);
        $mapping_session_id = $self->fetch_value_from_db($s_dbh, $sql);

        if (!$mapping_session_id) {
            $self->logger->debug("No previous mapping_session found.\n", 1);
        }

        $mapping_session_id++;
        $self->logger->debug(
            "Using mapping_session_id $mapping_session_id\n", 1);
    }
    else {
        $self->logger->debug(
            "Using manually configured mapping_session_id $mapping_session_id\n",
            1
        );
    }

    $self->mapping_session_id($mapping_session_id);

    # Copy the existing mapping_session rows to the output file, counting
    # them as we go.
    my $i;
    my $fh   = $self->get_filehandle('mapping_session.txt', 'tables');
    my $sth1 = $s_dbh->prepare("SELECT * FROM mapping_session");
    $sth1->execute;

    while (my @row = $sth1->fetchrow_array) {
        $i++;
        print {$fh} join("\t", @row) . "\n";
    }

    $sth1->finish;

    # Collect release and assembly information from both databases.
    my $release_sql = qq(
SELECT meta_value FROM meta WHERE meta_key = 'schema_version'
);
    my $old_release = $self->fetch_value_from_db($s_dbh, $release_sql);
    my $new_release = $self->fetch_value_from_db($t_dbh, $release_sql);

    my $assembly_sql = qq(
SELECT meta_value FROM meta WHERE meta_key = 'assembly.default'
);
    my $old_assembly = $self->fetch_value_from_db($s_dbh, $assembly_sql);
    my $new_assembly = $self->fetch_value_from_db($t_dbh, $assembly_sql);

    # Incomplete metadata is reported but does not stop the run.
    if (   !$old_release
        or !$new_release
        or !$old_assembly
        or !$new_assembly)
    {
        $self->logger->warning(
            "Not all data for new mapping_session found:\n", 1);
        $self->logger->info(
            "old_release: $old_release, new_release: $new_release");
        $self->logger->info(
            "old_assembly: $old_assembly, new_assembly $new_assembly\n", 2);
    }

    # Append the row for the session being created. The values are gathered
    # in list context, exactly as they would be inside join().
    my @session_row = (
        $mapping_session_id,
        $self->conf->param('sourcedbname'),
        $self->conf->param('targetdbname'),
        $old_release,
        $new_release,
        $old_assembly,
        $new_assembly,
        $self->mapping_session_date_fmt
    );
    print {$fh} join("\t", @session_row) . "\n";

    close($fh);

    # The reported count includes the row just appended.
    $i++;
    $self->logger->info(
        "Done writing " . $i . " mapping_session entries.\n\n");
}
# Assign stable IDs, versions and dates to all target objects of one type.
#
# Mapped targets inherit the stable ID and creation date of their source
# object and receive a version computed by the stable ID generator; unmapped
# targets get a freshly generated stable ID at version 1. Stable ID events
# are recorded for every type except 'exon', lost source objects are logged
# for genes and transcripts, and finally the stable ID table file and the
# mapping statistics are written.
#
# Arguments:
#   $mappings - Bio::EnsEMBL::IdMapping::MappingList for this object type
#   $type     - object type name; "${type}s_by_id" must be a cache name
# Throws if $mappings is missing or not a MappingList.
sub map_stable_ids {
    my ($self, $mappings, $type) = @_;

    if (!(      $mappings
            and $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')))
    {
        throw("Need a Bio::EnsEMBL::IdMapping::MappingList of ${type}s.");
    }

    # Make sure a mapping session exists (no-op when already generated).
    $self->generate_mapping_session;

    $self->logger->info("== Stable ID mapping for $type...\n\n", 0,
        'stamped');

    my %all_sources =
        %{ $self->cache->get_by_name("${type}s_by_id", 'source') };
    my %all_targets =
        %{ $self->cache->get_by_name("${type}s_by_id", 'target') };

    # Nothing to do without cached source objects.
    if (!%all_sources) {
        $self->logger->info("No cached ${type}s found.\n\n");
        return;
    }

    my %stats = ('mapped' => 0, 'new' => 0, 'lost' => 0);

    # Index the mappings in both directions, plus the score per target.
    my (%sources_mapped, %targets_mapped, %scores_by_target);

    for my $entry (@{ $mappings->get_all_Entries }) {
        $sources_mapped{ $entry->source }   = $entry->target;
        $targets_mapped{ $entry->target }   = $entry->source;
        $scores_by_target{ $entry->target } = $entry->score;
    }

    # First stable ID to hand out to targets that have no mapping.
    my $new_stable_id = $self->stable_id_generator->initial_stable_id($type);

    my $record_events = ($type ne 'exon');

    for my $tid (keys %all_targets) {
        my $t_obj = $all_targets{$tid};
        my $sid   = $targets_mapped{$tid};

        if (!$sid) {

            # Unmapped target: assign a brand new stable ID.
            $t_obj->stable_id($new_stable_id);
            $t_obj->version(1);
            $t_obj->created_date($self->mapping_session_date);
            $t_obj->modified_date($self->mapping_session_date);

            if ($record_events) {
                my $key = join("\t",
                    '\N', 0,
                    $t_obj->stable_id, $t_obj->version,
                    $self->mapping_session_id,
                    $type, 0);
                $self->add_stable_id_event('new', $key);
            }

            $new_stable_id = $self->stable_id_generator->increment_stable_id(
                $new_stable_id);
            $stats{'new'}++;
            next;
        }

        # Mapped target: carry over identity from the source object.
        my $s_obj = $all_sources{$sid};

        $t_obj->stable_id($s_obj->stable_id);
        $t_obj->created_date($s_obj->created_date);

        my $old_version = $s_obj->version();
        my $new_version =
            $self->stable_id_generator->calculate_version($s_obj, $t_obj);
        $t_obj->version($new_version);

        if ($old_version == $new_version) {
            $t_obj->modified_date($s_obj->modified_date);
        }
        else {
            $t_obj->modified_date($self->mapping_session_date);

            # A version change is recorded with a score just below 1
            # instead of a perfect 1.
            if ($scores_by_target{$tid} == 1) {
                $scores_by_target{$tid} = 0.99;
            }
        }

        if ($record_events) {

            # No event is stored for a mapping with identical stable ID,
            # identical version and a score above 0.9999.
            unless ($s_obj->stable_id eq $t_obj->stable_id
                && $s_obj->version == $t_obj->version
                && $scores_by_target{$tid} > 0.9999)
            {
                my $key = join("\t",
                    $s_obj->stable_id, $s_obj->version,
                    $t_obj->stable_id, $t_obj->version,
                    $self->mapping_session_id,
                    $type, $scores_by_target{$tid});
                $self->add_stable_id_event('new', $key);
            }
        }

        # Remember the mapping for dump_debug_mappings().
        push @{ $debug_mappings{$type} }, [ $sid, $tid, $t_obj->stable_id ];

        $stats{'mapped'}++;
    }

    # Lost source objects are only logged for genes and transcripts.
    my $log_lost = ($type eq 'gene' or $type eq 'transcript');
    my $fh;

    if ($log_lost) {
        $fh = $self->get_filehandle("${type}s_lost.txt", 'debug');
    }

    for my $sid (keys %all_sources) {
        my $s_obj = $all_sources{$sid};

        next if ($sources_mapped{$sid});

        # Source without a mapping: record an event with an empty target.
        # Note that it is stored under the 'new' event type as well.
        if ($record_events) {
            my $key = join("\t",
                $s_obj->stable_id, $s_obj->version,
                '\N', 0,
                $self->mapping_session_id,
                $type, 0);
            $self->add_stable_id_event('new', $key);
        }

        # $status is never assigned, so the second column written to the
        # lost file is always empty.
        my $status;
        $stats{'lost'}++;

        if ($log_lost) {
            print {$fh} $s_obj->stable_id, "\t$status\n";
        }
    }

    if (defined($fh)) {
        close($fh);
    }

    $self->write_stable_ids_to_file($type, \%all_targets);
    $self->generate_mapping_stats($type, \%stats);

    $self->logger->info("Done.\n\n");
}
# Record 'similarity' stable ID events for objects that were not mapped.
#
# Every source ID and every target ID known to $scores that does not take
# part in any mapping is examined. Its candidate pairs are ranked by score;
# if the best score is at least 0.75, an event is stored for each candidate
# scoring above 95% of that best score whose source and target are both
# unmapped.
#
# Both the source side and the target side are always examined, including
# when $mappings holds no entries at all (in which case nothing counts as
# mapped).
#
# Arguments:
#   $mappings - Bio::EnsEMBL::IdMapping::MappingList of accepted mappings
#   $scores   - Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of candidates
#   $type     - object type; selects the "${type}s_by_id" cache and is
#               written into each event
# Returns nothing.
# Throws if an argument is missing or has the wrong class.
sub generate_similarity_events {
    my ($self, $mappings, $scores, $type) = @_;

    unless ($mappings
        and $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList'))
    {
        throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
    }

    unless ($scores
        and $scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'))
    {
        throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
    }

    throw("Need a type (gene|transcript|translation).") unless ($type);

    # Lookup of mapped IDs per side. Both sides exist up front so that an
    # empty mapping list does not cause either side to be skipped below.
    my $mapped = { 'source' => {}, 'target' => {} };

    foreach my $e (@{ $mappings->get_all_Entries }) {
        $mapped->{'source'}->{ $e->source } = 1;
        $mapped->{'target'}->{ $e->target } = 1;
    }

    foreach my $dbtype (qw(source target)) {

        # Accessor names on the score matrix for this side.
        my $m1 = "get_all_${dbtype}s";
        my $m2 = "get_Entries_for_${dbtype}";

        foreach my $id (@{ $scores->$m1 }) {

            # Only unmapped objects are of interest.
            next if ($mapped->{$dbtype}->{$id});

            # Candidates for this object, best score first.
            my @entries =
                sort { $b->score <=> $a->score } @{ $scores->$m2($id) };
            next unless (@entries);

            my $top_score = $entries[0]->score;
            next if ($top_score < 0.75);

            foreach my $e (@entries) {

                # Entries are sorted by descending score, so once one falls
                # to or below the cutoff no later entry can pass it.
                last unless ($e->score > ($top_score * 0.95));

                # Skip candidates whose other end is already mapped.
                next if ($mapped->{'source'}->{ $e->source });
                next if ($mapped->{'target'}->{ $e->target });

                my $s_obj = $self->cache->get_by_key("${type}s_by_id",
                    'source', $e->source);
                my $t_obj = $self->cache->get_by_key("${type}s_by_id",
                    'target', $e->target);

                my $key = join("\t",
                    $s_obj->stable_id, $s_obj->version,
                    $t_obj->stable_id, $t_obj->version,
                    $self->mapping_session_id,
                    $type, $e->score);

                $self->add_stable_id_event('similarity', $key);
            }
        }
    }

    return;
}
# Build a copy of a transcript score matrix without same-gene entries.
#
# An entry is left out when the source and target transcripts belong to
# genes with the same stable ID and the source transcript's stable ID is
# also present among the target transcripts. All other entries are added to
# a new matrix.
#
# Arguments:
#   $transcript_scores - Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
# Returns: the new, filtered ScoredMappingMatrix.
# Throws if the argument is missing or of the wrong class.
sub filter_same_gene_transcript_similarities {
    my ($self, $transcript_scores) = @_;

    if (!(      $transcript_scores
            and $transcript_scores->isa(
                'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')))
    {
        throw(
            'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of transcripts.'
        );
    }

    # Result matrix, dumped under <basedir>/matrix.
    my $filtered_scores = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
        -DUMP_PATH  => path_append($self->conf->param('basedir'), 'matrix'),
        -CACHE_FILE => 'filtered_transcript_scores.ser',
    );

    # Set of stable IDs of all target transcripts.
    my %all_targets =
        map { ($_->stable_id, 1) }
        values %{ $self->cache->get_by_name("transcripts_by_id", 'target') };

    # Number of entries left out.
    my $i = 0;

    for my $entry (@{ $transcript_scores->get_all_Entries }) {
        my $s_tr = $self->cache->get_by_key('transcripts_by_id', 'source',
            $entry->source);
        my $s_gene = $self->cache->get_by_key('genes_by_transcript_id',
            'source', $entry->source);
        my $t_gene = $self->cache->get_by_key('genes_by_transcript_id',
            'target', $entry->target);

        if (    ($s_gene->stable_id eq $t_gene->stable_id)
            and $all_targets{ $s_tr->stable_id })
        {
            $i++;
        }
        else {
            $filtered_scores->add_Entry($entry);
        }
    }

    $self->logger->debug("Skipped $i same gene transcript mappings.\n");

    return $filtered_scores;
}
# Generate similarity events for translations from transcript scores.
#
# Each transcript score whose source and target transcripts both have a
# translation is copied into a new translation score matrix, keyed by the
# translations' IDs. That matrix is then handed to
# generate_similarity_events() with type 'translation'.
#
# Arguments:
#   $mappings          - Bio::EnsEMBL::IdMapping::MappingList
#   $transcript_scores - Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
# Throws if either argument is missing or of the wrong class.
sub generate_translation_similarity_events {
    my ($self, $mappings, $transcript_scores) = @_;

    if (!(      $mappings
            and $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')))
    {
        throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
    }

    if (!(      $transcript_scores
            and $transcript_scores->isa(
                'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')))
    {
        throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
    }

    # Matrix of translation scores, dumped under <basedir>/matrix.
    my $translation_scores =
        Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
        -DUMP_PATH  => path_append($self->conf->param('basedir'), 'matrix'),
        -CACHE_FILE => 'translation_scores.ser',
        );

    for my $entry (@{ $transcript_scores->get_all_Entries }) {
        my $s_tl = $self->cache->get_by_key('transcripts_by_id', 'source',
            $entry->source)->translation;
        my $t_tl = $self->cache->get_by_key('transcripts_by_id', 'target',
            $entry->target)->translation;

        # Only transcript pairs with a translation on both sides count.
        next unless ($s_tl and $t_tl);

        $translation_scores->add_score($s_tl->id, $t_tl->id, $entry->score);
    }

    $self->generate_similarity_events($mappings, $translation_scores,
        'translation');
}
# Write the stable ID table rows for all target objects of one type.
#
# One tab-separated line per object is written to
# tables/<type>_stable_id.txt, in ascending numeric order of the hash keys:
# id, stable ID, version, created date, modified date. Dates that are not
# set on an object fall back to the mapping session date.
#
# Arguments:
#   $type        - object type, used in the file name and log message
#   $all_targets - hashref of target objects; keys are compared numerically
sub write_stable_ids_to_file {
    my ($self, $type, $all_targets) = @_;

    $self->logger->info("Writing ${type} stable IDs to file...\n");

    my $fh = $self->get_filehandle("${type}_stable_id.txt", 'tables');

    # Objects ordered by numerically sorted key.
    my @sorted_keys    = sort { $a <=> $b } keys %{$all_targets};
    my @sorted_targets = @{$all_targets}{@sorted_keys};

    for my $obj (@sorted_targets) {
        my $created_date  = $obj->created_date  || $self->mapping_session_date;
        my $modified_date = $obj->modified_date || $self->mapping_session_date;

        my $row = join("\t",
            $obj->id,
            $obj->stable_id,
            $obj->version,
            strftime("%Y-%m-%d %T", localtime($created_date)),
            strftime("%Y-%m-%d %T", localtime($modified_date)));

        print {$fh} "$row\n";
    }

    close($fh);

    $self->logger->info(
        "Done writing " . scalar(@sorted_targets) . " entries.\n\n");
}
# Log and save a short summary of the mapping results for one object type.
#
# The summary holds the number of mapped and lost objects and the mapped
# percentage. It is sent to the logger at info level and written to
# stats/<type>_mapping_stats.txt.
#
# Arguments:
#   $type  - object type, used in the heading and the file name
#   $stats - hashref with 'mapped' and 'lost' counts; missing counts are
#            treated as 0
#
# When both counts are 0 the percentage is reported as 0 rather than
# attempting a division by zero.
sub generate_mapping_stats {
    my ($self, $type, $stats) = @_;

    my $result = ucfirst($type) . " mapping results:\n\n";

    # Heading row format and data row format.
    my $fmt1 = "%-10s%-10s%-10s%-10s\n";
    my $fmt2 = "%-10s%6.0f %6.0f %4.2f%%\n";

    $result .= sprintf($fmt1, qw(TYPE MAPPED LOST PERCENTAGE));
    $result .= ('-' x 40) . "\n";

    my $mapped_total = $stats->{'mapped'} || 0;
    my $lost_total   = $stats->{'lost'}   || 0;
    my $grand_total  = $mapped_total + $lost_total;

    # Guard the division: with nothing mapped and nothing lost there is no
    # meaningful ratio, so report 0%.
    my $percentage =
        $grand_total ? $mapped_total / $grand_total * 100 : 0;

    $result .= sprintf($fmt2, 'total', $mapped_total, $lost_total,
        $percentage);

    $self->logger->info($result . "\n");

    my $fh = $self->get_filehandle("${type}_mapping_stats.txt", 'stats');
    print $fh $result;
    close($fh);
}
# Write the mappings collected in %debug_mappings to debug files.
#
# For each object type present, one tab-separated line per collected row is
# written to debug/<type>_mappings.txt.
sub dump_debug_mappings {
    my ($self) = @_;

    for my $type (keys %debug_mappings) {
        $self->logger->debug(
            "Writing $type mappings to debug/${type}_mappings.txt...\n");

        my $fh = $self->get_filehandle("${type}_mappings.txt", 'debug');

        # Each row is an array reference of column values.
        for my $columns (@{ $debug_mappings{$type} }) {
            print {$fh} join("\t", @{$columns}) . "\n";
        }

        close($fh);

        $self->logger->debug("Done.\n");
    }
}
# Write all stored stable ID events of one kind to a table file.
#
# Each event string is written on its own line to
# tables/stable_id_event_<event_type>.txt.
#
# Arguments:
#   $event_type - event kind (the error message names 'new' and
#                 'similarity')
# Throws if no event type is given.
sub write_stable_id_events {
    my ($self, $event_type) = @_;

    if (!$event_type) {
        throw("Need an event type (new|similarity).");
    }

    $self->logger->debug(
        "Writing $event_type stable_id_events to file...\n");

    my $fh = $self->get_filehandle("stable_id_event_${event_type}.txt",
        'tables');

    my @events = @{ $self->get_all_stable_id_events($event_type) };

    for my $event (@events) {
        print {$fh} "$event\n";
    }

    # Number of events written.
    my $i = scalar(@events);

    close($fh);

    $self->logger->debug("Done writing $i entries.\n");
}
# Store a stable ID event string under the given event type.
#
# Events are kept as hash keys, so adding the same string twice for one
# type stores it only once.
#
# Arguments:
#   $type  - event kind (the error message names 'new' and 'similarity')
#   $event - the event string
# Throws if no event type is given.
sub add_stable_id_event {
    my $self  = shift;
    my $type  = shift;
    my $event = shift;

    if (!$type) {
        throw("Need an event type (new|similarity).");
    }

    $self->{'stable_id_events'}{$type}{$event} = 1;
}
# Fetch all stored stable ID events of one type.
#
# Arguments:
#   $type - event kind (the error message names 'new' and 'similarity')
# Returns: reference to a new array of event strings, in hash key order.
# Throws if no event type is given.
sub get_all_stable_id_events {
    my $self = shift;
    my $type = shift;

    if (!$type) {
        throw("Need an event type (new|similarity).");
    }

    # Events are stored as the keys of a per-type hash.
    my @events = keys %{ $self->{'stable_id_events'}->{$type} };

    return \@events;
}
# Getter/setter for the mapping session ID.
#
# With an argument, stores it (even if undef); always returns the current
# value.
sub mapping_session_id {
    my ($self, @new_value) = @_;

    if (@new_value) {
        $self->{'_mapping_session_id'} = $new_value[0];
    }

    return $self->{'_mapping_session_id'};
}
# Getter/setter for the mapping session date.
#
# generate_mapping_session() stores the value of time() here. With an
# argument, stores it (even if undef); always returns the current value.
sub mapping_session_date {
    my ($self, @new_value) = @_;

    if (@new_value) {
        $self->{'_mapping_session_date'} = $new_value[0];
    }

    return $self->{'_mapping_session_date'};
}
# Getter/setter for the formatted mapping session date string.
#
# With an argument, stores it (even if undef); always returns the current
# value.
sub mapping_session_date_fmt {
    my ($self, @new_value) = @_;

    if (@new_value) {
        $self->{'_mapping_session_date_fmt'} = $new_value[0];
    }

    return $self->{'_mapping_session_date_fmt'};
}
# Getter/setter for the stable ID generator instance.
#
# With an argument, stores it (even if undef); always returns the current
# value.
sub stable_id_generator {
    my ($self, @new_value) = @_;

    if (@new_value) {
        $self->{'_stable_id_generator'} = $new_value[0];
    }

    return $self->{'_stable_id_generator'};
}
1;