my
$RESERVED_TAGS
=
"ID|Name|Alias|Parent|Target|Gap|Derives_from|Note|Dbxref|Ontology_term|Index"
;
sub
_initialize {
my
(
$self
,
%arg
) =
@_
;
$self
->SUPER::_initialize(
%arg
);
$self
->version(
$arg
{-version} || DEFAULT_VERSION);
$self
->validate(
$arg
{-validate_terms} || 0);
if
(
$arg
{-file} =~ /^>.*/ ) {
$self
->_print(
"##gff-version "
.
$self
->version() .
"\n"
);
}
else
{
my
$directive
;
while
((
$directive
=
$self
->_readline()) && (
$directive
=~ /^
$self
->_handle_directive(
$directive
);
}
$self
->_pushback(
$directive
);
}
if
(
$arg
{-file} =~ /^>.*/ ) {
$self
->_print(
"##gff-version "
.
$self
->version() .
"\n"
);
}
if
(
$self
->validate) {
$self
->so(
Bio::Ontology::OntologyStore->get_ontology(
'Sequence Ontology Feature Annotation'
)
);
}
}
sub
next_feature {
my
$self
=
shift
;
my
$gff_string
;
my
(
$f
) =
$self
->_buffer_feature();
if
(
$f
){
return
$f
;
}
return
if
$self
->fasta_mode();
while
((
$gff_string
=
$self
->_readline()) &&
defined
(
$gff_string
)) {
next
if
$gff_string
=~ /^\s*$/;
next
if
$gff_string
=~ /^\
last
;
}
return
unless
$gff_string
;
if
(
$gff_string
=~ /^>/){
$self
->_pushback(
$gff_string
);
$self
->fasta_mode(1);
return
;
}
elsif
(
$gff_string
=~ /^
$self
->_handle_directive(
$gff_string
);
return
$self
->next_feature();
}
else
{
return
$self
->_handle_feature(
$gff_string
);
}
}
sub
next_feature_group {
my
$self
=
shift
;
my
$feat
;
my
%seen_ids
;
my
@all_feats
;
my
@toplevel_feats
;
$self
->{group_not_done} = 1;
while
(
$self
->{group_not_done} && (
$feat
=
$self
->next_feature()) &&
defined
(
$feat
)) {
if
(
my
$anno_ID
=
$feat
->get_Annotations(
'ID'
)) {
my
$attr_ID
=
$anno_ID
->value;
$self
->throw(
"Oops! ID $attr_ID exists more than once in your file!"
)
if
(
exists
(
$seen_ids
{
$attr_ID
}));
$seen_ids
{
$attr_ID
} =
$feat
;
}
push
(
@all_feats
,
$feat
);
}
foreach
$feat
(
@all_feats
) {
my
@parents
=
$feat
->get_Annotations(
'Parent'
);
if
(
@parents
) {
foreach
my
$parent
(
@parents
) {
my
$parent_id
=
$parent
->value;
$self
->throw(
"Parent with ID $parent_id not found!"
)
unless
(
exists
(
$seen_ids
{
$parent_id
}));
$seen_ids
{
$parent_id
}->add_SeqFeature(
$feat
);
}
}
else
{
push
(
@toplevel_feats
,
$feat
);
}
}
return
@toplevel_feats
;
}
sub
next_seq() {
my
$self
=
shift
;
return
unless
$self
->fasta_mode();
if
(!
$self
->seqio){
$self
->seqio( Bio::SeqIO->new(
-format
=>
'fasta'
,
-fh
=>
$self
->_fh()) );
}
return
$self
->seqio->next_seq();
}
sub
write_feature {
my
(
$self
,
$feature
) =
@_
;
if
(!
$feature
) {
$self
->throw(
"gff.pm cannot write_feature unless you give a feature to write.\n"
);
}
$self
->throw(
"only Bio::SeqFeature::Annotated objects are writeable"
)
unless
$feature
->isa(
'Bio::SeqFeature::Annotated'
);
if
(
$self
->version == 1){
return
$self
->_write_feature_1(
$feature
);
}
elsif
(
$self
->version == 2){
return
$self
->_write_feature_2(
$feature
);
}
elsif
(
$self
->version == 2.5){
return
$self
->_write_feature_25(
$feature
);
}
elsif
(
$self
->version == 3){
return
$self
->_write_feature_3(
$feature
);
}
else
{
$self
->throw(
sprintf
(
"don't know how to write GFF version %s"
,
$self
->version));
}
}
sub
fasta_mode {
my
(
$self
,
$val
) =
@_
;
$self
->{
'fasta_mode'
} =
$val
if
defined
(
$val
);
return
$self
->{
'fasta_mode'
};
}
sub
seqio {
my
(
$self
,
$val
) =
@_
;
$self
->{
'seqio'
} =
$val
if
defined
(
$val
);
return
$self
->{
'seqio'
};
}
sub
sequence_region {
my
(
$self
,
$k
,
$v
) =
@_
;
if
(
defined
(
$k
) &&
defined
(
$v
)){
$self
->{
'sequence_region'
}{
$k
} =
$v
;
return
$v
;
}
elsif
(
defined
(
$k
)){
return
$self
->{
'sequence-region'
}{
$k
};
}
else
{
return
;
}
}
sub
so {
my
$self
=
shift
;
my
$val
=
shift
;
$self
->{so} =
$val
if
defined
(
$val
);
return
$self
->{so};
}
sub
validate {
my
(
$self
,
$val
) =
@_
;
$self
->{
'validate'
} =
$val
if
defined
(
$val
);
return
$self
->{
'validate'
};
}
sub
version {
my
$self
=
shift
;
my
$val
=
shift
;
my
%valid
=
map
{
$_
=>1} (1, 2, 2.5, 3);
if
(
defined
$val
&&
$valid
{
$val
}){
return
$self
->{
'version'
} =
$val
;
}
elsif
(
defined
(
$val
)){
$self
->throw(
'invalid version. valid versions: '
.
join
(
' '
,
sort
keys
%valid
));
}
return
$self
->{
'version'
};
}
sub
_buffer_feature {
my
(
$self
,
$f
) =
@_
;
if
(
$f
) {
push
@{
$self
->{
'buffer'
} },
$f
;
return
$f
;
}
elsif
(
$self
->{
'buffer'
} ) {
return
shift
@{
$self
->{
'buffer'
} };
}
else
{
return
;
}
}
sub
_handle_directive {
my
(
$self
,
$directive_string
) =
@_
;
$directive_string
=~ s/^
my
(
$directive
,
@arg
) =
split
/\s+/,
$directive_string
;
if
(
$directive
eq
'gff-version'
){
my
$version
=
$arg
[0];
if
(
$version
!= 3){
$self
->throw(
"this is not a gff version 3 document, it is version '$version'"
);
}
}
elsif
(
$directive
eq
'sequence-region'
){
my
$fta
= Bio::Annotation::OntologyTerm->new();
$fta
->name(
'region'
);
my
$f
= Bio::SeqFeature::Annotated->new();
$f
->seq_id(
$arg
[0] );
$f
->start(
$arg
[1] );
$f
->end(
$arg
[2] );
$f
->type(
$fta
);
$self
->sequence_region(
$f
->
seq_id
=>
$f
);
$self
->_buffer_feature(
$f
);
}
elsif
(
$directive
eq
'feature-ontology'
){
$self
->
warn
(
"'##$directive' directive handling not yet implemented"
);
}
elsif
(
$directive
eq
'attribute-ontology'
){
$self
->
warn
(
"'##$directive' directive handling not yet implemented"
);
}
elsif
(
$directive
eq
'source-ontology'
){
$self
->
warn
(
"'##$directive' directive handling not yet implemented"
);
}
elsif
(
$directive
eq
'FASTA'
or
$directive
=~ /^>/){
$self
->fasta_mode(1);
return
;
}
elsif
(
$directive
eq
'#'
){
$self
->{group_not_done} = 0;
}
else
{
$self
->throw(
"don't know what do do with directive: '##"
.
$directive
.
"'"
);
}
}
sub
_handle_feature {
my
(
$self
,
$feature_string
) =
@_
;
my
$feat
= Bio::SeqFeature::Annotated->new();
my
(
$seq
,
$source
,
$type
,
$start
,
$end
,
$score
,
$strand
,
$phase
,
$attribute_string
) =
split
/\t/,
$feature_string
;
$feat
->seq_id(
$seq
);
$feat
->source(
$source
);
$feat
->start(
$start
)
unless
$start
eq
'.'
;
$feat
->end(
$end
)
unless
$end
eq
'.'
;
$feat
->strand(
$strand
eq
'+'
? 1 :
$strand
eq
'-'
? -1 : 0);
$feat
->score(
$score
);
$feat
->phase(
$phase
);
my
$fta
= Bio::Annotation::OntologyTerm->new();
if
(
$self
->validate()){
if
(
$seq
=~ /[^a-zA-Z0-9\.\-\:\^\*\$\@\!\+\_\?]/) {
$self
->throw(
"Validation Error: seqid ($seq) contains characters that are not [a-zA-Z0-9.:^*\$\@!+_?\-] and not escaped"
);
}
if
(
$seq
=~ /\s/) {
$self
->throw(
"Validation Error: seqid ($seq) contains unescaped whitespace"
)
}
if
(
$seq
=~ /^>/) {
$self
->throw(
"Validation Error: seqid ($seq) begins with a >"
)
}
if
(
$start
>
$end
) {
$self
->throw(
"Validation Error: start ($start) must be less than or equal to end in $seq"
);
}
my
$region
=
$self
->sequence_region(
$seq
);
if
(
defined
(
$region
) &&
$start
<
$region
->start() ||
$end
>
$region
->end() ) {
$self
->throw(
"Validation Error: sequence location ($seq from $start to $end) does not appear to lie within a defined sequence-region"
)
}
$self
->throw(
"Validation Error: strand is not one of [+-.?] at $seq"
)
if
(
$strand
=~ /^[^\+\-\.\?]$/);
$self
->throw(
"Validation Error: phase is not one of [.012] at $seq"
)
if
(
$phase
=~ /^[^\.012]$/);
my
$feature_type
;
if
(
$type
=~ /^\D+:\d+$/){
(
$feature_type
) =
$self
->so->find_terms(
-identifier
=>
$type
);
}
else
{
(
$feature_type
) =
$self
->so->find_terms(
-name
=>
$type
);
}
if
(!
$feature_type
){
$self
->throw(
"Validation Error: couldn't find ontology term for '$type'."
);
}
$fta
->term(
$feature_type
);
}
else
{
if
(
$type
=~ /^\D+:\d+$/){
$fta
->identifier(
$type
)
}
else
{
$fta
->name(
$type
);
}
}
$feat
->type(
$fta
);
my
%attr
= ();
chomp
$attribute_string
;
unless
(
$attribute_string
eq
'.'
) {
my
@attributes
=
split
';'
,
$attribute_string
;
foreach
my
$attribute
(
@attributes
){
my
(
$key
,
$values
) =
split
'='
,
$attribute
;
$values
=~ s/^["']//;
$values
=~ s/["']$//;
my
@values
=
map
{uri_unescape(
$_
)}
split
','
,
$values
;
if
(
$attr
{
$key
}) {
my
@tmparray
= @{
$attr
{
$key
}};
push
@tmparray
,
@values
;
$attr
{
$key
} = [
@tmparray
];
}
else
{
$attr
{
$key
} = [
@values
];
}
}
}
if
(
$attr
{Dbxref}){
foreach
my
$value
(@{
$attr
{Dbxref} }){
my
$a
= Bio::Annotation::DBLink->new();
my
(
$db
,
$accession
) =
$value
=~ /^(.+?):(.+)$/;
if
(!
$db
or !
$accession
){
$self
->throw(
"Error in line:\n$feature_string\nDbxref value '$value' did not conform to GFF3 specification"
);
next
;
}
$a
->database(
$db
);
$a
->primary_id(
$accession
);
$feat
->add_Annotation(
'Dbxref'
,
$a
);
}
}
if
(
$attr
{Ontology_term}){
foreach
my
$id
(@{
$attr
{Ontology_term} }){
my
$a
= Bio::Annotation::OntologyTerm->new();
if
(
$self
->validate()){
my
$ont_name
= Bio::Ontology::OntologyStore->guess_ontology(
$id
);
my
$ont
= Bio::Ontology::OntologyStore->get_ontology(
$ont_name
);
my
(
$term
) =
$ont
->find_terms(
-identifier
=>
$id
);
$a
->term(
$term
);
}
else
{
$a
->identifier(
$id
);
}
$feat
->add_Annotation(
'Ontology_term'
,
$a
);
}
}
if
(
$attr
{Gap}){
for
my
$value
(@{
$attr
{Gap} }) {
my
$a
= Bio::Annotation::SimpleValue->new();
$a
->value(
$value
);
$feat
->add_Annotation(
'Gap'
,
$a
);
}
}
if
(
$attr
{Target}){
my
$target_collection
= Bio::Annotation::Collection->new();
foreach
my
$target_string
(@{
$attr
{Target} } ) {
$target_string
=~ s/\+/ /g
unless
$target_string
=~ / /;
my
(
$t_id
,
$tstart
,
$tend
,
$strand
,
$extra
) =
split
/\s+/,
$target_string
;
if
(!
$tend
||
$extra
) {
$self
->throw(
"The value in the Target string, $target_string, does not conform to the GFF3 specification"
);
}
my
$a
= Bio::Annotation::Target->new(
-target_id
=>
$t_id
,
-start
=>
$tstart
,
-end
=>
$tend
,
);
if
(
$strand
&&
$strand
eq
'+'
) {
$strand
= 1;
}
elsif
(
$strand
&&
$strand
eq
'-'
) {
$strand
= -1;
}
else
{
$strand
=
''
;
}
$a
->strand(
$strand
)
if
$strand
;
$feat
->add_Annotation(
'Target'
,
$a
);
}
}
if
(
$attr
{ID}){
if
(
scalar
( @{
$attr
{ID} } ) > 1){
$self
->throw(
"Error in line:\n$feature_string\nA feature may have at most one ID value"
);
}
if
(
$self
->{
'allIDs'
}->{${
$attr
{ID}}[0]} &&
$self
->validate()) {
$self
->throw(
"Validation Error: The ID ${$attr{ID}}[0] occurs more than once in the file, but should be unique"
);
}
$self
->{
'allIDs'
}->{${
$attr
{ID}}[0]} = 1;
my
$a
= Bio::Annotation::SimpleValue->new();
$a
->value( @{
$attr
{ID} }[0] );
$feat
->add_Annotation(
'ID'
,
$a
);
}
if
(
$attr
{Name}){
if
(
scalar
( @{
$attr
{Name} } ) > 1){
$self
->throw(
"Error in line:\n$feature_string\nA feature may have at most one Name value"
);
}
my
$a
= Bio::Annotation::SimpleValue->new();
$a
->value( @{
$attr
{Name} }[0] );
$feat
->add_Annotation(
'Name'
,
$a
);
}
foreach
my
$other_canonical
(
qw(Alias Parent Note Derives_from Index)
){
if
(
$attr
{
$other_canonical
}){
foreach
my
$value
(@{
$attr
{
$other_canonical
} }){
my
$a
= Bio::Annotation::SimpleValue->new();
$a
->value(
$value
);
$feat
->add_Annotation(
$other_canonical
,
$a
);
}
}
}
my
@non_reserved_tags
=
grep
{/^[a-z]/}
keys
%attr
;
foreach
my
$non_reserved_tag
(
@non_reserved_tags
) {
foreach
my
$value
(@{
$attr
{
$non_reserved_tag
} }){
$feat
=
$self
->_handle_non_reserved_tag(
$feat
,
$non_reserved_tag
,
$value
);
}
}
my
@illegal_tags
=
grep
{!/(
$RESERVED_TAGS
)/}
grep
{/^[A-Z]/}
keys
%attr
;
if
(
@illegal_tags
> 0) {
my
$tags
=
join
(
", "
,
@illegal_tags
);
$self
->throw(
"The following tag(s) are illegal and are causing this parser to die: $tags"
);
}
return
$feat
;
}
sub
_handle_non_reserved_tag {
my
$self
=
shift
;
my
(
$feat
,
$tag
,
$value
) =
@_
;
my
$a
= Bio::Annotation::SimpleValue->new();
$a
->value(
$value
);
$feat
->add_Annotation(
$tag
,
$a
);
return
$feat
;
}
sub
_write_feature_1 {
my
(
$self
,
$feature
) =
@_
;
$self
->throw(
sprintf
(
"write_feature unimplemented for GFF version %s"
,
$self
->version));
}
sub
_write_feature_2 {
my
(
$self
,
$feature
) =
@_
;
$self
->throw(
sprintf
(
"write_feature unimplemented for GFF version %s"
,
$self
->version));
}
sub
_write_feature_25 {
my
(
$self
,
$feature
,
$group
) =
@_
;
if
(!
defined
(
$group
)){
$group
= (
$feature
->get_Annotations(
'ID'
))[0]->value;
}
my
$seq
=
$feature
->seq_id->value;
my
$source
=
$feature
->source->value;
my
$type
=
$feature
->type->name;
$type
=
'EXON'
if
$type
eq
'exon'
;
my
$min
=
$feature
->start ||
'.'
;
my
$max
=
$feature
->end ||
'.'
;
my
$strand
=
$feature
->strand == 1 ?
'+'
:
$feature
->strand == -1 ?
'-'
:
'.'
;
my
$score
=
$feature
->score->value;
my
$phase
=
$feature
->phase->value;
if
(
$type
eq
'EXON'
or
$type
eq
'CDS'
or
$type
eq
'start_codon'
or
$type
eq
'stop_codon'
){
my
$attr
=
sprintf
(
'gene_id "%s"; transcript_id "%s";'
,
$group
,
$group
);
my
$outstring
=
sprintf
(
"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
,
$seq
,
$source
,
$type
,
$min
,
$max
,
$score
,
$strand
,
$phase
,
$attr
);
$self
->_print(
$outstring
);
}
foreach
my
$subfeat
(
$feature
->get_SeqFeatures){
$self
->_write_feature_25(
$subfeat
,
$group
);
}
}
sub
_write_feature_3 {
my
(
$self
,
$feature
) =
@_
;
my
$seq
=
$feature
->seq_id->value;
my
$source
;
if
(
$feature
->source()) {
$source
=
$feature
->source->value;
}
else
{
$source
=
$feature
->source() ||
"unknownsource"
;
}
my
$type
;
if
(
$feature
->type()) {
$type
=
$feature
->type->name; }
else
{
$type
=
"region"
; }
my
$min
=
$feature
->start ||
'.'
;
my
$max
=
$feature
->end ||
'.'
;
my
$strand
=
$feature
->strand == 1 ?
'+'
:
$feature
->strand == -1 ?
'-'
:
'.'
;
my
$score
=
$feature
->score->value;
my
$phase
=
$feature
->phase->value;
my
@attr
;
if
(
my
@v
= (
$feature
->get_Annotations(
'Name'
))){
my
$vstring
=
join
','
,
map
{uri_escape(
$_
->value)}
@v
;
push
@attr
,
"Name=$vstring"
;
}
if
(
my
@v
= (
$feature
->get_Annotations(
'ID'
))){
my
$vstring
=
join
','
,
map
{uri_escape(
$_
->value)}
@v
;
push
@attr
,
"ID=$vstring"
;
$self
->throw(
'GFF3 features may have at most one ID, feature with these IDs is invalid:\n'
.
$vstring
)
if
scalar
(
@v
) > 1;
}
if
(
my
@v
= (
$feature
->get_Annotations(
'Parent'
))){
my
$vstring
=
join
','
,
map
{uri_escape(
$_
->value)}
@v
;
push
@attr
,
"Parent=$vstring"
;
}
if
(
my
@v
= (
$feature
->get_Annotations(
'dblink'
))){
my
$vstring
=
join
','
,
map
{uri_escape(
$_
->database .
':'
.
$_
->primary_id)}
@v
;
push
@attr
,
"Dbxref=$vstring"
;
}
if
(
my
@v
= (
$feature
->get_Annotations(
'ontology_term'
))){
my
$vstring
=
join
','
,
map
{uri_escape(
$_
->identifier)}
@v
;
push
@attr
,
"Ontology_term=$vstring"
;
}
if
(
my
@v
= (
$feature
->get_Annotations(
'comment'
))){
my
$vstring
=
join
','
,
map
{uri_escape(
$_
->text)}
@v
;
push
@attr
,
"Note=$vstring"
;
}
if
(
my
@v
= (
$feature
->get_Annotations(
'Target'
))){
my
%strand_map
= (
1
=>
'+'
,
0
=>
''
,
-1
=>
'-'
,
'+'
=>
'+'
,
'-'
=>
'-'
);
my
$vstring
=
join
','
,
map
{
uri_escape(
$_
->target_id).
' '
.
$_
->start.
' '
.
$_
->end.(
defined
$_
->strand ?
' '
.
$strand_map
{
$_
->strand} :
''
)
}
@v
;
push
@attr
,
"Target=$vstring"
;
}
my
$attr
=
join
';'
,
@attr
;
my
$outstring
=
sprintf
(
"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
,
$seq
,
$source
,
$type
,
$min
,
$max
,
$score
,
$strand
,
$phase
,
$attr
);
$self
->_print(
$outstring
);
foreach
my
$subfeat
(
$feature
->get_SeqFeatures){
$self
->_write_feature_3(
$subfeat
);
}
}
1;