=head1 LICENSE
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2024] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=cut
=head1 CONTACT
Please email comments or questions to the public Ensembl
Questions may also be sent to the Ensembl help desk at
=cut
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=cut
use strict;
use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Utils::VegaCuration::Transcript);
=head2 check_CDS_start_end_remarks
Args : B::E::Transcript
Example : my $results = $support->check_CDS_end_remarks($transcript)
Description: identifies incorrect 'CDS end...' transcript remarks in a
otter-derived Vega database
Returntype : hashref
=cut
sub check_CDS_start_end_remarks {
my $self = shift;
my $trans = shift;
# info for checking
my @remarks = @{$trans->get_all_Attributes('remark')};
my $coding_end = $trans->cdna_coding_end;
my $coding_start = $trans->cdna_coding_start;
my $trans_end = $trans->length;
my $trans_seq = $trans->seq->seq;
my $stop_codon = substr($trans_seq, $coding_end-3, 3);
my $start_codon = substr($trans_seq, $coding_start-1, 3);
#hashref to return results
my $results;
#extra CDS end not found remarks
if (grep {$_->value eq 'CDS end not found'} @remarks) {
if ( ($coding_end != $trans_end)
&& ( grep {$_ eq $stop_codon} qw(TGA TAA TAG) ) ) {
$results->{'END_EXTRA'} = 1;
}
}
#missing CDS end not found remark
if ( $coding_end == $trans_end ) {
if (! grep {$_->value eq 'CDS end not found'} @remarks) {
if (grep {$_ eq $stop_codon} qw(TGA TAA TAG)) {
$results->{'END_MISSING_2'} = 1;
}
else {
$results->{'END_MISSING_1'} = $stop_codon;
}
}
}
#extra CDS start not found remark
if (grep {$_->value eq 'CDS start not found'} @remarks) {
if ( ($coding_start != 1)
&& ($start_codon eq 'ATG') ) {
$results->{'START_EXTRA'} = 1;
}
}
#missing CDS start not found remark
if ( $coding_start == 1) {
if ( ! grep {$_->value eq 'CDS start not found'} @remarks) {
if ($start_codon eq 'ATG') {
$results->{'START_MISSING_2'} = 1;
} else {
$results->{'START_MISSING_1'} = $start_codon;
}
}
}
return $results;
}
=head2 check_CDS_start_end_remarks_loutre
Args : B::E::Transcript
Example : my $results = $support->check_CDS_end_remarks($transcript)
Description: identifies incorrect 'CDS end...' transcript attribs in a loutre
of a loutre-derived Vega database.
Returntype : hashref
=cut
sub check_CDS_start_end_remarks_loutre {
my $self = shift;
my $trans = shift;
# info for checking
my @stops = qw(TGA TAA TAG);
my %attributes;
foreach my $attribute (@{$trans->get_all_Attributes()}) {
push @{$attributes{$attribute->code}}, $attribute;
}
# warn $trans->stable_id;
# warn Data::Dumper::Dumper(\%attributes);
my $coding_end = $trans->cdna_coding_end;
my $coding_start = $trans->cdna_coding_start;
my $trans_end = $trans->length;
my $trans_seq = $trans->seq->seq;
my $stop_codon_offset = 3 + $trans->translation->end_Exon->end_phase;
my $initial_exon_phase = $trans->translation->start_Exon->phase;
my $stop_codon = substr($trans_seq, $coding_end-3, 3);
my $start_codon = substr($trans_seq, $coding_start-1, 3);
my $start_codon_incorrect = 1;
if ($start_codon eq 'ATG' ) {
$start_codon_incorrect = 0;
}
elsif ($start_codon eq 'CTG') {
foreach my $attrib (@{$attributes{'remark'}}) {
if ($attrib->value =~ /non[- ]ATG start/) {
$start_codon_incorrect = 0;
}
}
}
# warn "$start_codon -- $initial_exon_phase -- $coding_start -- $start_codon_incorrect";
#hashref to return results
my $results;
#extra CDS end not found remarks
if ($attributes{'cds_end_NF'}) {
if ( ($attributes{'cds_end_NF'}->[0]->value == 1)
&& ($coding_end != $trans_end)
&& ( grep {$_ eq $stop_codon} @stops) ) {
# warn $trans->stable_id.": $coding_end--$trans_end--$stop_codon";
# warn $trans->translation->end_Exon->end_phase;
$results->{'END_EXTRA'} = $stop_codon;
}
}
#missing CDS end not found remark
if ( $coding_end == $trans_end ) {
if ($attributes{'cds_end_NF'}) {
if ($attributes{'cds_end_NF'}->[0]->value == 0 ) {
if (! grep {$_ eq $stop_codon} @stops) {
# warn $trans->translation->end_Exon->end_phase;
$results->{'END_MISSING'}{'WRONG'} = $stop_codon;
}
}
}
elsif (! grep {$_ eq $stop_codon} @stops) {
$results->{'END_MISSING'}{'ABSENT'} = $stop_codon;
}
}
#extra CDS start not found remark
if ( $attributes{'cds_start_NF'}) {
if ( ($attributes{'cds_start_NF'}->[0]->value == 1 )
&& (! $start_codon_incorrect)) {
unless ( ($coding_start == 1) && ( $initial_exon_phase > 0)) {
$results->{'START_EXTRA'} = $start_codon;
}
}
}
#missing CDS start not found remark
if ( $coding_start == 1) {
if ( $attributes{'cds_start_NF'} ) {
if ( $attributes{'cds_start_NF'}->[0]->value == 0 ) {
if ($start_codon_incorrect) {
$results->{'START_MISSING'}{'ABSENT'} = $start_codon;
}
elsif ($initial_exon_phase > 0) {
$results->{'START_MISSING'}{'INITIAL_PHASE'} = $initial_exon_phase;
}
}
}
elsif ($start_codon ne 'ATG') {
$results->{'START_MISSING'}{'ABSENT'} = $start_codon;
}
}
return $results;
}
=head2 get_havana_seleno_comments
Args : none
Example : my $results = $support->get_havana_seleno_comments
Description: parses the HEREDOC containing Havana comments in this module
Returntype : hashref
=cut
sub get_havana_seleno_comments {
my $seen_translations;
while (<DATA>) {
next if /^\s+$/ or /#+/;
my ($obj,$comment) = split /=/;
$obj =~ s/^\s+|\s+$//g;
$comment =~ s/^\s+|\s+$//g;
# We add the origin as now "seen" can come from a number of places, and have
# a number of consequences in different cases, not just discounted Secs from this method. -- ds23
$seen_translations->{$obj} = [ $comment,"notsec-havana" ];
}
return $seen_translations;
}
sub check_for_stops {
my $support = shift;
my ($gene,$seen_transcripts,$log_object) = @_;
my $transcripts;
my $has_log_object=defined($log_object);
if($has_log_object){
my @help = $log_object->species_params->get_trans($gene->stable_id);
$transcripts=\@help;
}else{
$log_object=$support;
$transcripts=$gene->get_all_Transcripts;
}
my $gname = $gene->get_all_Attributes('name')->[0]->value;
my $gsi = $gene->stable_id;
my $scodon = 'TGA';
my $mod_date = $support->date_format( $gene->modified_date,'%d/%m/%y' );
my $hidden_remak_ttributes;
TRANS:
foreach my $trans (@$transcripts) {
my $tsi = $trans->stable_id;
my $tID = $trans->dbID;
my $tname = $trans->get_all_Attributes('name')->[0]->value;
if($has_log_object){
$hidden_remak_ttributes=$log_object->species_params->get_attributes->{$tsi}->{'hidden_remark'};
}else{
$hidden_remak_ttributes=$trans->get_all_Attributes('hidden_remark');
}
foreach my $rem (@$hidden_remak_ttributes) {
if ($rem->value =~ /not_for_Vega/) {
#$support->log_verbose("Skipping transcript $tname ($tsi) since 'not_for_Vega'\n",1);
$log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Skipping transcript $tname ($tsi) since 'not_for_Vega'");
next TRANS;
}
}
# go no further if there is a ribosomal framshift attribute
foreach my $attrib (@{$trans->get_all_Attributes('_rib_frameshift')}) {
if ($attrib->value) {
$log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift attribute");
next TRANS;
}
}
foreach my $attrib (@{$trans->get_all_Attributes('hidden_remark')}) {
if ($attrib->value eq 'ribosomal_frameshift') {
$log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift hidden_remark");
next TRANS;
}
}
#$support->log_verbose("Studying transcript $tsi ($tname, $tID)\n",1);
$log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Studying transcript $tsi ($tname, $tID)");
my $peptide;
# go no further if the transcript doesn't translate or if there are no stops
next TRANS unless ($peptide = $trans->translate);
my $pseq = $peptide->seq;
my $orig_seq = $pseq;
# (translate method trims stops from sequence end)
next TRANS unless ($pseq =~ /\*/);
next TRANS if ($trans->biotype eq 'polymorphic_pseudogene' or
$trans->biotype =~ /processed_pseudogene$/);
# warn sprintf("Stop codon is '%s'\n",substr($trans->translateable_seq,-3));
#$support->log_verbose("Stops found in $tsi ($tname)\n",1);
$log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Stops found in $tsi ($tname)");
# find out where and how many stops there are
my @found_stops;
my $mrna = $trans->translateable_seq;
my $offset = 0;
my $tstop;
while ($pseq =~ /^([^\*]*)\*(.*)/) {
my $pseq1_f = $1;
$pseq = $2;
my $seq_flag = 0;
$offset += length($pseq1_f) * 3;
my $stop = substr($mrna, $offset, 3);
my $aaoffset = int($offset / 3)+1;
push(@found_stops, [ $stop, $aaoffset ]);
$tstop .= "$aaoffset ";
$offset += 3;
}
# are all stops TGA...?
my $num_stops = scalar(@found_stops);
my $num_tga = 0;
my $positions;
foreach my $stop (@found_stops) {
$positions .= $stop->[0]."(".$stop->[1].") ";
if ($stop->[0] eq $scodon) {
$num_tga++;
}
}
my $source = $gene->source;
#...no - an internal stop codon error in the database...
if ($num_tga < $num_stops) {
if ($source eq 'havana') {
#$support->log_warning("INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
$log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]: Sequence = $orig_seq Stops at $positions)");
}
else {
#$support->log_warning("INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
$log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]: Sequence = $orig_seq Stops at $positions)");
}
}
#...yes - check remarks
else {
my $flag_remark = 0; # 1 if word seleno has been used
my $flag_remark2 = 0; # 1 if existing remark has correct numbering
my $alabel = 'Annotation_remark- selenocysteine ';
my $alabel2 = 'selenocysteine ';
my $annot_stops;
my $remarks;
my $att;
#get both hidden_remarks and remarks
foreach my $remark_type ('remark','hidden_remark') {
if($has_log_object){
$att=$log_object->species_params->get_attributes->{$trans->stable_id}->{$remark_type};
}else{
$att=$trans->get_all_Attributes($remark_type)
}
foreach my $attrib ( @$att) {
push @{$remarks->{$remark_type}}, $attrib->value;
}
}
#parse remarks to check syntax for location of edits
while (my ($attrib,$remarks)= each %$remarks) {
foreach my $text (@{$remarks}) {
if ( ($attrib eq 'remark') && ($text=~/^$alabel(.*)/) ){
#$support->log_warning("seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]\n");
$log_object->_save_log('log_warning', '', $gsi, '', $tsi, 'VQCT_wrong_selC_coord', "seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]");
$annot_stops=$1;
}
elsif ($text =~ /^$alabel2(.*)/) {
my $maybe = $1;
if($maybe =~ /^\s*\d+(\s+\d+)*\s*$/) {
$annot_stops=$maybe;
} else {
$log_object->_save_log('log', '', $gene->stable_id, '', $tsi, '', "Maybe annotated stop in incorrect format, maybe just a remark that happens to begin '$alabel2'".
" -- might need to investigate: '$alabel2$maybe' [$mod_date]");
}
}
}
}
#check the location of the annotated edits matches actual stops in the sequence
my @annotated_stops;
if ($annot_stops){
my $i = 0;
foreach my $offset (split(/\s+/, $annot_stops)) {
#OK if it matches a known stop
if (
defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && ($found_stops[$i]->[1] == $offset)) {
push @annotated_stops, $offset;
}
# catch old annotations where number was in DNA not peptide coordinates
elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1] * 3) == $offset)) {
$log_object->_save_log('log_warning', '', $gene->stable_id, 'DNA', $tsi, 'VQCT_wrong_selC_coord', "DNA: Annotated stop for transcript tsi ($tname) is in DNA not peptide coordinates) [$mod_date]");
}
# catch old annotations where number off by one
elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1]) == $offset+1)) {
$log_object->_save_log('log_warning', '', $gene->stable_id, 'PEPTIDE', $tsi, 'VQCT_wrong_selC_coord', "PEPTIDE: Annotated stop for transcript $tsi ($tname) is out by one) [$mod_date]");
}
elsif (defined($offset) && ($offset=~/^\d+$/)){
if ($offset == length($orig_seq)+1) {
if($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-tga-stop') {
$log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a stop codon. Ok. [$mod_date]");
} elsif($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-terminal-sec') {
$log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a terminal Sec. Ok. [$mod_date]");
} else {
$log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) \"$offset\" matches actual stop codon yet has no entry in script config to disambiguate it. Please investigate and add appropriate entry to config arrays in add_selcys.pl. [$mod_date]");
}
}
else {
$log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, 'VQCT_wrong_selC_coord', "Annotated stop for transcript $tsi ($tname) \"$offset\" does not match a TGA codon) [$mod_date]");
push @annotated_stops, $offset;
}
}
$i++;
}
}
#check location of found stops matches annotated ones - any new ones are reported
foreach my $stop ( @found_stops ) {
my $pos = $stop->[1];
my $seq = $stop->[0];
unless ( grep { $pos == $_} @annotated_stops) {
if ($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'notsec-havana') {
#$support->log_verbose("Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators:\n\t".$seen_transcripts->{$tsi}.") [$mod_date]\n");
$log_object->_save_log('log_verbose', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators: ".$seen_transcripts->{$tsi}->[0].") [$mod_date]");
}
else {
#$support->log("POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]\n");
$log_object->_save_log('log', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]");
}
}
}
}
}
}
sub _save_log{
my $self=shift;
my $log_type = shift;
my $chrom_name=shift || '';
my $gsi=shift || '';
my $type=shift || '';
my $tsi=shift || '';
my $tag=shift || '';
my $txt=shift || '';
$self->$log_type($txt."\n");
}
#details of annotators comments
__DATA__
OTTHUMT00000144659 = FIXED- changed to transcript
OTTHUMT00000276377 = FIXED- changed to transcript
OTTHUMT00000257741 = FIXED- changed to nmd
OTTHUMT00000155694 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
OTTHUMT00000155695 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
OTTHUMT00000282573 = FIXED- changed to unprocessed pseudogene
OTTHUMT00000285227 = FIXED- changed start site
OTTHUMT00000151008 = FIXED- incorrect trimming of CDS, removed extra stop codon
OTTHUMT00000157999 = FIXED- changed incorrect stop
OTTHUMT00000150523 = FIXED- incorrect trimming of CDS
OTTHUMT00000150525 = FIXED- incorrect trimming of CDS
OTTHUMT00000150522 = FIXED- incorrect trimming of CDS
OTTHUMT00000150521 = FIXED- incorrect trimming of CDS
OTTHUMT00000246819 = FIXED- corrected frame
OTTHUMT00000314078 = FIXED- corrected frame
OTTHUMT00000080133 = FIXED- corrected frame
OTTHUMT00000286423 = FIXED- changed to transcript
OTTMUST00000055509 = FIXED- error
OTTMUST00000038729 = FIXED- corrected frame
OTTMUST00000021760 = FIXED- corrected frame
OTTMUST00000023057 = FIXED- corrected frame
OTTMUST00000015207 = FIXED- corrected frame
OTTMUST00000056646 = FIXED- error
OTTMUST00000059686 = FIXED- corrected frame
OTTMUST00000013426 = FIXED- corrected frame
OTTMUST00000044412 = FIXED- error