# Module version.
#
# The version is assigned twice on purpose-looking lines: first in an
# underscore form, then in a plain numeric form. The second assignment
# overwrites the first, so the value seen at runtime is '112.056'.
# NOTE(review): presumably the underscore form is kept for tools that scan
# the source text for a developer-release version string -- confirm before
# removing either line.
$Bio::EnsEMBL::Utils::TranscriptSelector::VERSION = '112.0_56';
$Bio::EnsEMBL::Utils::TranscriptSelector::VERSION = '112.056';
# Constructor.
#
# Arguments (positional):
#   $ccds_dba - optional handle stored under the 'ccds_dba' key; when it is
#               undefined a warning is issued and the object is still built.
#   $verbose  - optional flag stored under the 'verbose' key; other methods
#               print diagnostics when it is true.
#
# Returns the new object, a hash reference blessed into $class.
sub new {
    my ( $class, $ccds_dba, $verbose ) = @_;

    my $self = bless {
        'ccds_dba' => $ccds_dba,
        'verbose'  => $verbose,
    }, $class;

    # A missing CCDS handle is not fatal; it is only reported.
    warning("Running without CCDS DB") unless defined $self->{'ccds_dba'};

    return $self;
}
# Pick the canonical transcript of a gene.
#
# Every transcript of $gene is encoded with encode_transcript(), the
# encodings are ranked with sort_into_canonical_order(), and the transcript
# whose dbID comes first in that ranking is returned.
#
# Arguments:
#   $gene - object providing get_all_Transcripts() and stable_id().
#
# Returns:
#   the chosen transcript object, or nothing (bare return) after a warning
#   when the gene has no transcripts.
#
# Throws:
#   when $gene is false, or when the top-ranked dbID cannot be found again
#   among the gene's transcripts.
sub select_canonical_transcript_for_Gene {
    my ( $self, $gene ) = @_;

    if ( !$gene ) {
        throw('Cannot select canonical transcript without a gene.');
    }

    # An empty list is handled like a missing one: a reference to an empty
    # array is still true, and letting it through would end in the
    # misleading "Run out of transcripts" error below instead of the
    # warning that describes the real situation.
    my $transcript_array = $gene->get_all_Transcripts;
    if ( !$transcript_array || !@$transcript_array ) {
        warning( 'No transcripts attached to gene ' . $gene->stable_id );
        return;
    }
    my @transcripts = @$transcript_array;

    # One encoding (array reference, dbID first) per transcript.
    my @encoded;
    foreach my $transcript (@transcripts) {
        my $encoded_transcript = $self->encode_transcript($transcript);
        push @encoded, $encoded_transcript;
        if ( $self->{'verbose'} ) {
            printf "%s encoded to: [%s,%s,%s,%s,%s,%s,%s]\n",
                $transcript->stable_id, @$encoded_transcript;
        }
    }

    my $sorted_ids = $self->sort_into_canonical_order( \@encoded );
    if ( $self->{'verbose'} ) {
        print "Sorted order: \n";
        foreach my $dbID (@$sorted_ids) {
            print $dbID . "\n";
        }
    }

    # Map the winning dbID back to its transcript object.
    my $canonical_dbID = $sorted_ids->[0];
    foreach my $transcript (@transcripts) {
        if ( $transcript->dbID == $canonical_dbID ) {
            if ( $self->{'verbose'} ) {
                print 'Chosen transcript: ' . $transcript->stable_id . "\n";
            }
            return $transcript;
        }
    }

    throw('Run out of transcripts without finding selected canonical dbID.');
}
# Rank of each transcript source category; a lower number sorts first in
# sort_into_canonical_order().
my %source_priority = (
    'ccds'   => 1,
    'merged' => 2,
    'other'  => 3,
);

# Rank of each biotype category; a lower number sorts first. Any biotype
# not named here is treated as 'other' by encode_transcript().
my %biotype_priority = (
    'protein_coding'          => 1,
    'nonsense_mediated_decay' => 2,
    'non_stop_decay'          => 2,
    'polymorphic_pseudogene'  => 2,
    'protein_coding_LoF'      => 2,
    'other'                   => 3,
);

# Reduce a transcript to the values used for canonical ranking.
#
# Arguments:
#   $transcript - object providing dbID, stable_id, biotype, length,
#                 translate, analysis and slice.
#
# Returns an array reference:
#   [ dbID, translates (1/0), source rank, biotype rank,
#     translation length, transcript length, stable_id ]
#
# A translation whose sequence contains '*' is counted as length 0, which
# also makes the "translates" flag 0.
sub encode_transcript {
    my ( $self, $transcript ) = @_;

    # Source category. The CCDS check is only attempted when a CCDS handle
    # is configured and the transcript's slice reports is_reference().
    my $type = 'other';
    if (   $self->{'ccds_dba'}
        && $transcript->slice->is_reference()
        && $self->check_Ens_trans_against_CCDS($transcript) )
    {
        $type = 'ccds';
    }
    elsif ( $transcript->analysis->logic_name eq 'ensembl_havana_transcript' ) {
        $type = 'merged';
    }

    # Translation length, zeroed when there is no translation or when the
    # translated sequence contains a '*'.
    my $peptide            = $transcript->translate;
    my $translation_length = 0;
    if ( $peptide && $peptide->seq !~ /\*/ ) {
        $translation_length = $peptide->length;
    }
    my $translates = $translation_length != 0 ? 1 : 0;

    my $transcript_length = $transcript->length;

    # Biotypes without their own rank collapse into 'other'.
    my $reported_biotype = $transcript->biotype();
    my $biotype =
        exists $biotype_priority{$reported_biotype}
        ? $reported_biotype
        : 'other';

    return [
        $transcript->dbID,
        $translates,
        $source_priority{$type},
        $biotype_priority{$biotype},
        $translation_length,
        $transcript_length,
        $transcript->stable_id,
    ];
}
# Rank encoded transcripts and return their IDs, best first.
#
# Arguments:
#   $encoded_list_ref - reference to a list of array references. Element 0
#                       of each is the ID that is returned; elements 1-6
#                       are the sort keys.
#
# Sort keys, applied in this order:
#   [1] numeric, descending
#   [2] numeric, ascending
#   [3] numeric, ascending
#   [4] numeric, descending
#   [5] numeric, descending
#   [6] string,  ascending
#
# Returns a reference to the list of element-0 values in ranked order.
sub sort_into_canonical_order {
    my ( $self, $encoded_list_ref ) = @_;

    my @ranked = sort {
               $b->[1] <=> $a->[1]
            || $a->[2] <=> $b->[2]
            || $a->[3] <=> $b->[3]
            || $b->[4] <=> $a->[4]
            || $b->[5] <=> $a->[5]
            || $a->[6] cmp $b->[6]
    } @{$encoded_list_ref};

    my @sorted_ids;
    foreach my $entry (@ranked) {
        push @sorted_ids, $entry->[0];
    }

    return \@sorted_ids;
}
# Test whether a transcript's coding exons exactly match a transcript held
# in the database behind $self->{'ccds_dba'}.
#
# The region spanned by $transcript is fetched from that database and every
# transcript of every gene found there is compared: it matches when it has
# as many exons as $transcript has translateable exons and, position by
# position, each pair agrees on strand and on start/end (the coding-region
# start/end of the translateable exon versus the seq-region start/end of
# the other exon).
#
# Arguments:
#   $transcript - transcript to look up.
#
# Returns:
#   1 on the first match, 0 when nothing matches.
#
# Throws:
#   when a match is found on a gene whose stable ID does not start with
#   'CCDS'.
sub check_Ens_trans_against_CCDS {
    my ( $self, $transcript ) = @_;

    my @translateable_exons = @{ $transcript->get_all_translateable_Exons };

    my $seq_region_name  = $transcript->slice->seq_region_name;
    my $seq_region_start = $transcript->seq_region_start;
    my $seq_region_end   = $transcript->seq_region_end;

    my $ccds_dba  = $self->{'ccds_dba'};
    my $ext_slice = $ccds_dba->get_SliceAdaptor->fetch_by_region(
        'toplevel', $seq_region_name, $seq_region_start, $seq_region_end
    );

    # NOTE(review): fetch_by_region is expected to yield undef for a region
    # the other database does not know -- confirm against the adaptor docs.
    # Treat that as "no match" rather than calling a method on undef.
    return 0 if !defined $ext_slice;

    foreach my $ext_gene ( @{ $ext_slice->get_all_Genes } ) {
      EXT_TRANS:
        foreach my $ext_trans ( @{ $ext_gene->get_all_Transcripts } ) {
            my @ext_exons = @{ $ext_trans->get_all_Exons };

            # Different exon counts can never be an exact match.
            next EXT_TRANS
                if scalar(@translateable_exons) != scalar(@ext_exons);

            for my $i ( 0 .. $#translateable_exons ) {
                my $exon     = $translateable_exons[$i];
                my $ext_exon = $ext_exons[$i];
                if (   $exon->coding_region_start($transcript) != $ext_exon->seq_region_start
                    || $exon->strand != $ext_exon->strand
                    || $exon->coding_region_end($transcript) != $ext_exon->seq_region_end )
                {
                    next EXT_TRANS;
                }
            }

            print "Ensembl transcript "
                . $transcript->display_id
                . " found match "
                . $ext_gene->display_id
                . " in CCDS DB.\n"
                if $self->{'verbose'};

            # A match on a non-CCDS identifier means the database handle is
            # probably not pointing at CCDS data at all.
            if ( $ext_gene->stable_id !~ /^CCDS/ ) {
                throw(
                    sprintf(
                        "Database does not appear to contain CCDS IDs. Possible configuration problem with CCDS source. Found ID %s",
                        $ext_gene->stable_id()
                    )
                );
            }
            return 1;
        }
    }

    # Explicit false result: the value of a sub that ends by falling out of
    # a loop is unspecified in Perl.
    return 0;
}
1;