sub
new {
my
(
$caller
,
$seq
,
$seq_h
,
$ann_l
) =
@_
;
my
$class
=
ref
(
$caller
) ||
$caller
;
my
$self
=
bless
({
seq
=>
$seq
,
curr_feats
=> [],
curr_coords
=> [],
seq_h
=>
$seq_h
,
ann_l
=>
$ann_l
,
},
$class
);
return
$self
;
}
sub
add_source {
my
(
$self
,
$length
,
$tags
) =
@_
;
my
$feat
= FAST::Bio::SeqFeature::Generic->new(
-primary
=>
'source'
,
-start
=> 1,
-end
=>
$length
,
);
for
(
keys
%{
$tags
} ) {
for
my
$val
( @{
$tags
->{
$_
}} ) {
$feat
->add_tag_value(
$_
=>
$val
);
}
}
return
$feat
;
}
sub
has_gene {
my
(
$self
,
$gene
,
$gname
,
$id
) =
@_
;
unless
(
$gene
) {
if
(
defined
$self
->{curr_gene} ) {
return
$self
->{curr_gene};
}
else
{
return
0;
}
}
else
{
if
(
$id
&& !
$self
->{curr_ltag} ) {
$self
->{curr_ltag} =
$id
;
}
if
(
$gname
&& !
$self
->{curr_gname} ) {
$self
->{curr_gname} =
$gname
;
}
my
$tags
= {};
for
my
$child
( @{
$gene
->{Children}} ) {
my
$name
=
$child
->{Name};
if
(
$name
eq
'dbxref'
) {
$tags
->{dbxref} ||= [];
push
@{
$tags
->{dbxref}},
$self
->dbxref(
$child
);
}
elsif
(
$name
!~ /name/ ){
$self
->complain(
"Unrecognized element '$name'. I don't "
.
"know what to do with $name elements"
);
}
}
my
$feat
= FAST::Bio::SeqFeature::Generic->new(
-primary
=>
'gene'
,
);
my
%seen
;
for
(
keys
%{
$tags
} ) {
for
my
$val
( @{
$tags
->{
$_
}} ) {
$feat
->add_tag_value(
$_
=>
$val
)
unless
++
$seen
{
$_
.
$val
} > 1;
}
}
$self
->{curr_gene} =
$feat
;
return
$feat
;
}
}
sub
_has_CDS {
my
(
$self
,
$transcript
) =
@_
;
if
( !
$transcript
) {
if
(
defined
$self
->{curr_cds} ) {
return
$self
->{curr_cds};
}
else
{
return
0;
}
}
else
{
my
$tags
=
$self
->{curr_tags};
$self
->{curr_cds} =
$self
->_add_CDS(
$transcript
,
$tags
);
}
}
sub
add_annotation {
my
(
$self
,
$seq
,
$type
,
$id
,
$tags
,
$feats
) =
@_
;
unless
(
$self
->has_gene ) {
shift
;
$self
->_add_generic_annotation(
@_
);
return
0;
}
my
$feat
;
if
(
$type
eq
'gene'
) {
$feat
=
$self
->has_gene;
$feat
->add_tag_value(
gene
=> (
$self
->{curr_gname} ||
$id
) )
unless
$feat
->has_tag(
'gene'
);
}
else
{
$feat
= FAST::Bio::SeqFeature::Generic->new;
$feat
->primary_tag(
$type
);
my
$gene
=
$self
->has_gene;
$gene
->add_tag_value(
gene
=> (
$self
->{curr_gname} ||
$id
) )
unless
$gene
->has_tag(
'gene'
);
$feat
->add_tag_value(
gene
=> (
$self
->{curr_gname} ||
$id
) )
unless
$feat
->has_tag(
'gene'
);;
}
for
(
keys
%{
$tags
} ) {
if
(
$_
eq
'name'
&&
$tags
->{type}->[0] eq
'gene'
) {
$feat
->add_tag_value(
gene
=>
$tags
->{name}->[0] )
unless
$feat
->has_tag(
'gene'
);
delete
$tags
->{name};
}
else
{
next
if
$_
eq
'type'
&&
$tags
->{
$_
}->[0] eq
'gene'
;
next
if
$_
eq
'gene'
&&
$feat
->has_tag(
'gene'
);
for
my
$val
( @{
$tags
->{
$_
}} ) {
$feat
->add_tag_value(
$_
=>
$val
);
}
}
}
$feat
->strand(
$self
->{curr_strand} );
$feat
->start(
$self
->{curr_coords}->[0] );
$feat
->end(
$self
->{curr_coords}->[1] );
my
@annotations
= (
$feat
);
if
(
$self
->has_gene &&
$type
ne
'gene'
) {
my
$gene
=
$self
->has_gene;
$gene
->strand(
$self
->{curr_strand} );
$gene
->start(
$self
->{curr_coords}->[0] );
$gene
->end(
$self
->{curr_coords}->[-1] );
push
@annotations
,
$gene
;
$self
->{curr_gene} =
''
;
}
for
( @{
$feats
} ) {
$self
->complain(
"bad feature $_"
)
unless
ref
(
$_
) =~ /Bio/;
push
@annotations
,
$_
;
}
my
$seqid
=
$seq
->id;
my
$list
=
$self
->{ann_l};
if
(
$list
->[0] &&
$annotations
[0]->start <
$list
->[0]->start ) {
unshift
@{
$list
},
@annotations
;
}
else
{
push
@{
$list
},
@annotations
;
}
$self
->{curr_gene} =
''
;
$self
->{curr_ltag} =
''
;
$self
->{curr_gname} =
''
;
$self
->{curr_coords} = [];
$self
->{curr_feats} = [];
$self
->{curr_strand} = 0;
$self
->{ann_seq} =
$seq
;
$self
->flush;
}
sub
_add_generic_annotation {
my
(
$self
,
$seq
,
$type
,
$id
,
$tags
,
$feats
) =
@_
;
for
(
@$feats
) {
$_
->primary_tag(
$type
);
}
push
@{
$self
->{ann_l}},
@$feats
;
$self
->{curr_coords} = [];
$self
->{curr_feats} = [];
$self
->{curr_strand} = 0;
$self
->{ann_seq} =
$seq
;
$self
->flush;
}
sub
feature_set {
my
(
$self
,
$id
,
$gname
,
$set
,
$anntype
) =
@_
;
my
$stype
=
$set
->{_type}->{Characters};
$self
->{curr_loc} = [];
$self
->{curr_tags} = {};
$self
->{curr_subfeats} = [];
$self
->{curr_strand} = 0;
my
@feats
= ();
my
$tags
=
$self
->{curr_tags};
my
$sname
=
$set
->{_name}->{Characters} ||
$set
->{Attributes}->{id};
if
(
$set
->{Attributes}->{problem} ) {
$tags
->{problem} = [
$set
->{Attributes}->{problem}];
}
my
@fcount
=
grep
{
$_
->{Name} eq
'feature_span'
} @{
$set
->{Children}};
if
(
@fcount
== 1 ) {
$self
->_build_feature_set(
$set
, 1);
my
(
$feat
) = @{
$self
->{curr_subfeats}};
$feat
->primary_tag(
'transcript'
)
if
$feat
->primary_tag eq
'exon'
;
if
(
$feat
->primary_tag eq
'transcript'
) {
$feat
->add_tag_value(
gene
=> (
$gname
||
$id
) )
unless
$feat
->has_tag(
'gene'
);
}
my
%seen_tag
;
for
my
$tag
(
keys
%{
$tags
} ) {
for
my
$val
( @{
$tags
->{
$tag
}} ) {
$feat
->add_tag_value(
$tag
=>
$val
)
if
$val
&& ++
$seen_tag
{
$tag
.
$val
} < 2;
}
}
@feats
= (
$feat
);
}
else
{
$self
->{curr_ltag} =
$id
;
$self
->{curr_cds} =
''
;
$gname
=
$id
if
$gname
eq
'gene'
;
$self
->{curr_gname} =
$gname
;
if
(
$self
->has_gene ) {
unless
(
$anntype
=~/RNA/i ) {
$stype
=~ s/transcript/mRNA/;
}
}
$self
->{curr_feat} = FAST::Bio::SeqFeature::Generic->new(
-primary
=>
$stype
,
-id
=>
$id
,
);
my
$feat
=
$self
->{curr_feat};
$self
->_build_feature_set(
$set
);
my
$gene
=
$gname
||
$self
->{curr_ltag};
$feat
->add_tag_value(
gene
=>
$gene
)
unless
$feat
->has_tag(
'gene'
);
my
$cds
=
$self
->_has_CDS(
$feat
);
if
(
$cds
) {
$feat
->primary_tag(
'mRNA'
);
$cds
->remove_tag(
'standard_name'
)
if
$cds
->has_tag(
'standard_name'
);
$cds
->add_tag_value(
standard_name
=>
$sname
);
$cds
->remove_tag(
'gene'
)
if
$cds
->has_tag(
'gene'
);
$cds
->add_tag_value(
gene
=>
$gene
);
if
(
$cds
->has_tag(
'protein_id'
) && !
$cds
->get_tag_values(
'protein_id'
) ) {
my
$pid
=
$self
->protein_id(
$cds
,
$sname
);
$cds
->remove_tag(
'protein_id'
);
$cds
->add_tag_value(
protein_id
=>
$pid
);
}
my
@subfeats
= @{
$self
->{curr_subfeats}};
for
my
$sf
( @ subfeats ) {
$sf
->add_tag_value(
standard_name
=>
$sname
)
unless
$sf
->has_tag(
'standard_name'
);
$sf
->add_tag_value(
gene
=>
$gene
)
unless
$sf
->has_tag(
'gene'
);
}
$feat
->add_tag_value(
standard_name
=>
$sname
)
unless
$feat
->has_tag(
'standard_name'
);
$feat
->add_tag_value(
gene
=>
$gene
)
unless
$feat
->has_tag(
'gene'
);
my
%seen
;
if
(
$feat
->
length
==
$cds
->
length
) {
for
my
$t
(
$feat
->all_tags ) {
next
if
$t
=~ /gene|standard_name/;
$cds
->add_tag_value(
$t
=>
$feat
->get_tag_values(
$t
) );
}
undef
$feat
;
}
@feats
=
sort
{
$a
->start <=>
$b
->start } (
$cds
,
@subfeats
);
unshift
@feats
,
$feat
if
$feat
;
}
else
{
if
( @{
$self
->{curr_loc}} > 1 ) {
my
$loc
= FAST::Bio::Location::Split->new(
-splittype
=>
'JOIN'
);
my
@loc
=
sort
{
$a
->start <=>
$b
->start } @{
$self
->{curr_loc}};
for
(
@loc
) {
$loc
->add_sub_Location(
$_
)
}
$feat
->location(
$loc
);
}
else
{
$feat
->location(
$self
->{curr_loc}->[0] );
}
for
(
keys
%$tags
) {
next
if
/gene/ &&
$feat
->has_tag(
'gene'
);
for
my
$v
( @{
$tags
->{
$_
}} ) {
$feat
->add_tag_value(
$_
=>
$v
);
}
}
my
@subfeats
= @{
$self
->{curr_subfeats}};
for
my
$sf
( @ subfeats ) {
$sf
->add_tag_value(
standard_name
=>
$sname
)
unless
$sf
->has_tag(
'standard_name'
);
$sf
->add_tag_value(
gene
=>
$gene
)
unless
$sf
->has_tag(
'gene'
);
}
@feats
= (
$feat
,
@subfeats
);
}
}
$self
->{curr_coords}->[0] ||= 1000000000000;
$self
->{curr_coords}->[1] ||= -1000000000000;
for
(
@feats
) {
if
(
$self
->{curr_coords}->[0] >
$_
->start ) {
$self
->{curr_coords}->[0] =
$_
->start;
}
if
(
$self
->{curr_coords}->[1] <
$_
->end ) {
$self
->{curr_coords}->[1] =
$_
->end;
}
}
$self
->flush(
$set
);
return
@feats
;
}
sub
_build_feature_set {
my
(
$self
,
$set
,
$keep_subfeat
) =
@_
;
for
my
$child
( @{
$set
->{Children}} ) {
my
$name
=
$child
->{Name};
if
(
$name
eq
'date'
) {
$self
->date(
$child
);
}
elsif
(
$name
eq
'comment'
) {
$self
->comment(
$child
);
}
elsif
(
$name
eq
'evidence'
) {
$self
->evidence(
$child
);
}
elsif
(
$name
eq
'feature_span'
) {
$self
->_add_feature_span(
$child
,
$keep_subfeat
);
}
elsif
(
$name
eq
'property'
) {
$self
->property(
$child
);
}
elsif
(
$name
=~ /synonym|author|description/) {
$self
->{curr_tags}->{
$name
} = [
$child
->{Characters}];
}
elsif
(
$name
!~ /name|type|seq/ ){
$self
->complain(
"Unrecognized element '$name'. I don't "
.
"know what to do with $name elements"
);
}
}
}
sub
_add_feature_span {
my
(
$self
,
$el
,
$keep_subfeat
) =
@_
;
my
$tags
=
$self
->{curr_tags};
my
$feat
=
$self
->{curr_feat};
my
$type
=
$el
->{_type}->{Characters} ||
$el
->{Name};
my
$id
=
$el
->{Attributes}->{id} ||
$el
->{_name}->{Characters};
my
$seqr
=
$el
->{_seq_relationship};
my
$start
=
int
$seqr
->{_span}->{_start}->{Characters};
my
$end
=
int
$seqr
->{_span}->{_end}->{Characters};
my
$stype
=
$seqr
->{Attributes}->{type};
my
$seqid
=
$seqr
->{Attributes}->{seq};
push
@{
$self
->{seq_l}},
$self
->{seq_h}->{
$seqid
};
if
(
$start
>
$end
) {
$self
->{curr_strand} = -1;
(
$start
,
$end
) = (
$end
,
$start
);
}
else
{
$self
->{curr_strand} = 1;
}
if
(
$type
eq
'exon'
) {
my
$sl
= FAST::Bio::Location::Simple->new(
-start
=>
$start
,
-end
=>
$end
,
-strand
=>
$self
->{curr_strand} );
push
@{
$self
->{curr_loc}},
$sl
;
}
if
(
$type
=~ /start_codon|translate offset/ ) {
$self
->{curr_tags}->{codon_start} = [
$start
];
}
else
{
if
(
$type
eq
'exon'
) {
return
unless
$keep_subfeat
;
}
push
@{
$self
->{curr_subfeats}},
FAST::Bio::SeqFeature::Generic->new(
-start
=>
$start
,
-end
=>
$end
,
-strand
=>
$self
->{curr_strand},
-primary
=>
$type
);
}
my
$tscript
=
$el
->{Attributes}->{produces_seq};
if
(
$tscript
&&
$tscript
ne
'null'
) {
my
$subseq
=
$self
->{seq_h}->{
$el
->{Attributes}->{produces_seq}};
$self
->{curr_tags}->{product} = [
$el
->{Attributes}->{produces_seq}];
$self
->{curr_tags}->{translation} = [
$subseq
->seq]
if
$subseq
;
}
$self
->flush(
$el
);
}
sub
_add_CDS {
my
(
$self
,
$feat
,
$tags
) =
@_
;
my
$loc
= {};
my
$single
= 0;
if
( @{
$self
->{curr_loc}} > 1 ) {
$loc
= FAST::Bio::Location::Split->new;
my
@loc
=
sort
{
$a
->start <=>
$b
->start } @{
$self
->{curr_loc}};
for
(
@loc
) {
$loc
->add_sub_Location(
$_
);
}
}
else
{
$loc
=
$self
->{curr_loc}->[0];
$single
++;
}
my
@exons
=
$single
?
$loc
:
$loc
->sub_Location(1);
$feat
->location(
$loc
);
my
$seq
=
$self
->{seq_h}->{
$tags
->{protein_id}->[0] };
$seq
||=
$self
->{seq_h}->{
$tags
->{product}->[0] } ||
$self
->{seq_h}->{
$tags
->{gene}->[0] } ||
$self
->{seq_h}->{
$tags
->{standard_name}->[0] };
my
(
$start
,
$stop
,
$peptide
) = ();
if
(
$seq
) {
$peptide
=
$seq
->display_id;
my
$desc
=
$seq
->description ||
''
;
$desc
=~ s/,|\n//g;
$desc
=~ s/\)(\w)/\) $1/g;
if
(
$desc
=~ /cds_boundaries:.+?(\d+)\.\.(\d+)/ ) {
(
$start
,
$stop
) = ($1 -
$self
->{offset}, $2 -
$self
->{offset});
}
else
{
$start
=
$loc
->start;
$stop
=
$loc
->end;
}
}
else
{
$self
->
warn
(
"I did not find a protein sequence for "
.
$feat
->display_name);
}
delete
$tags
->{transcript};
my
@exons_to_add
= ();
for
(
@exons
) {
my
$exon
= FAST::Bio::Location::Simple->new;
if
(
$_
->end <
$start
||
$_
->start >
$stop
) {
next
;
}
if
(
$_
->start <
$start
&&
$_
->end >
$start
) {
$exon
->start(
$start
);
}
if
(
$_
->end >
$stop
&&
$_
->start <
$stop
) {
$exon
->end(
$stop
);
}
unless
(
$exon
->valid_Location) {
$exon
->start(
$_
->start );
$exon
->end(
$_
->end );
}
$exon
->strand (
$self
->{curr_strand} );
push
@exons_to_add
,
$exon
;
}
my
$cds_loc
;
if
(
@exons_to_add
> 1 ) {
$cds_loc
= FAST::Bio::Location::Split->new(
-splittype
=>
'JOIN'
);
for
(
@exons_to_add
) {
$cds_loc
->add_sub_Location(
$_
);
}
}
else
{
$cds_loc
=
$exons_to_add
[0];
}
my
$parent
=
$self
->{curr_gname} ||
$self
->{curr_ltag};
my
$cds_tags
= {};
for
my
$k
(
keys
%$tags
) {
if
(
$k
=~ /product|protein|translation|codon_start/ ) {
$cds_tags
->{
$k
} =
$tags
->{
$k
};
delete
$tags
->{
$k
};
}
}
for
(
keys
%$tags
) {
for
my
$v
( @{
$tags
->{
$_
}} ) {
$feat
->add_tag_value(
$_
=>
$v
)
unless
$feat
->has_tag(
$_
);
}
}
if
(
$self
->{curr_gname} ) {
$cds_tags
->{gene} = [
$self
->{curr_gname}];
}
my
$gene
=
$self
->has_gene;
my
$cds
= FAST::Bio::SeqFeature::Generic->new(
-primary
=>
'CDS'
,
-location
=>
$cds_loc
,
);
$cds_tags
->{translation} = [
$seq
->seq];
for
(
keys
%{
$cds_tags
} ) {
my
%seen
;
for
my
$val
(@{
$cds_tags
->{
$_
}}) {
next
if
++
$seen
{
$val
} > 1;
$cds
->add_tag_value(
$_
=>
$val
);
}
}
$cds
;
}
1;