# Package-level default for the source tag. start_element passes this value
# as -source_tag to every feature object it creates while parsing.
use
vars
qw($Default_Source)
;
$Default_Source
=
'BSML'
;
# _initialize
#
# Object set-up hook. Forwards the remaining constructor arguments to the
# parent class, creates a SAX parser whose Handler is this very object (so
# the start_element/end_element/characters callbacks below receive the
# events), and installs a default sequence factory producing
# FAST::Bio::Seq::RichSeq objects when none has been configured yet.
#
# Returns nothing.
sub _initialize {
    my $self = shift;

    $self->SUPER::_initialize(@_);

    # The parser reports its events back to $self.
    my $parser = XML::SAX::ParserFactory->parser( 'Handler' => $self );
    $self->{'_parser'} = $parser;

    unless ( defined $self->sequence_factory ) {
        my $factory = FAST::Bio::Seq::SeqFactory->new(
            -verbose => $self->verbose(),
            -type    => 'FAST::Bio::Seq::RichSeq',
        );
        $self->sequence_factory($factory);
    }

    return;
}
# next_seq
#
# Returns the next parsed sequence object, or undef when none is left.
#
# Sequences are produced in bulk: the document is parsed in one go and the
# resulting objects are queued in $self->{'_seendata'}->{'_seqs'}. The parser
# is therefore only run when the queue is empty and the input handle has not
# reached end-of-file; in every case the head of the queue is handed back.
sub next_seq {
    my ($self) = @_;

    my $have_queued = @{ $self->{'_seendata'}->{'_seqs'} || [] };

    # Parse only when nothing is waiting and there is still input to read.
    unless ( $have_queued || eof( $self->_fh ) ) {
        $self->{'_parser'}->parse_file( $self->_fh );
    }

    return shift @{ $self->{'_seendata'}->{'_seqs'} };
}
# start_document
#
# SAX callback fired at the beginning of a document. Replaces any previous
# parse state with empty queues for sequences, authors and features, then
# passes the event on to the parent class and returns its result.
sub start_document {
    my $self = shift;
    my ($doc) = @_;

    my %fresh_state = (
        '_seqs'    => [],
        '_authors' => [],
        '_feats'   => [],
    );
    $self->{'_seendata'} = \%fresh_state;

    return $self->SUPER::start_document($doc);
}
# end_document
#
# SAX callback fired at the end of a document. No local bookkeeping is
# needed; the event is simply delegated to the parent class and its result
# is returned.
sub end_document {
    my $self = shift;
    my $doc  = shift;

    return $self->SUPER::end_document($doc);
}
# start_element
#
# SAX callback fired for every opening tag. Element and attribute names are
# compared case-insensitively (both are upper-cased first).
#
# Handled elements:
#   SEQUENCE     - creates a new sequence via the sequence factory and queues it
#   ATTRIBUTE    - applies version / organism / dblink / date information to
#                  the current (most recently queued) sequence
#   FEATURE      - pushes a new generic feature onto the feature stack
#   QUALIFIER    - adds a tag/value pair to the feature on top of the stack
#   INTERVAL-LOC - sets start, end and (if COMPLEMENT is true) strand -1 on
#                  the feature on top of the stack
#   REFERENCE    - pushes a new, empty reference annotation
# Every other element (BSML, DEFINITIONS, SEQUENCES, FEATURE-TABLES, ...) needs
# no action of its own.
#
# In all cases the upper-cased element name is pushed onto the '_state' stack
# (read by characters(), popped by end_element()) and the event is forwarded
# to the parent class, whose result is returned.
#
# Parameters:
#   $ele - SAX element hash; 'LocalName' and 'Attributes' are read. The keys
#          of 'Attributes' are upper-cased in place.
sub start_element {
    my ( $self, $ele ) = @_;

    my $name = uc( $ele->{'LocalName'} );
    my $attr = $ele->{'Attributes'} || {};

    # Normalise attribute keys to upper case. Keys that are already upper
    # case must be left alone: copying a key onto itself and then deleting
    # it would silently drop the attribute.
    for my $key ( keys %$attr ) {
        my $upper = uc $key;
        next if $upper eq $key;
        $attr->{$upper} = delete $attr->{$key};
    }

    # Value of an un-namespaced attribute, or undef when it is absent.
    # Looks the entry up without autovivifying it, so the attribute hash
    # handed on to the parent class is not polluted with empty entries.
    my $value_of = sub {
        my ($attr_name) = @_;
        my $entry = $attr->{ '{}' . $attr_name };
        return defined $entry ? $entry->{'Value'} : undef;
    };

    my $seen   = ( $self->{'_seendata'} ||= {} );
    my $curseq = ( $seen->{'_seqs'} || [] )->[-1];

    if ( $name eq 'SEQUENCE' ) {
        my ( $id, $acc, $desc, $length, $topology, $mol ) =
          map { $value_of->($_) }
          qw(ID IC-ACCKEY COMMENT LENGTH TOPOLOGY MOLECULE);

        # A sequence without a TOPOLOGY attribute is treated as linear;
        # any explicit value other than "linear" marks it circular.
        my $is_circular =
          ( !defined $topology || $topology =~ /^linear$/i ) ? 0 : 1;

        push @{ $seen->{'_seqs'} },
          $self->sequence_factory->create(
            -display_id       => $id,
            -accession_number => $acc,
            -description      => $desc,
            -length           => $length,
            -is_circular      => $is_circular,
            -molecule         => $mol,
          );
    }
    elsif ( $name eq 'ATTRIBUTE' ) {
        my ( $attr_name, $content ) =
          map { $value_of->($_) } qw(NAME CONTENT);

        # Attributes seen while no sequence is open (or lacking a NAME)
        # have nothing to attach to and are skipped.
        if ( defined $curseq && defined $attr_name ) {
            if ( $attr_name =~ /^version$/i ) {

                # "ACC.3" style content yields 3; anything else is used
                # verbatim.
                my $version =
                  ( defined $content && $content =~ /^[^\.]+\.(\d+)/ )
                  ? $1
                  : $content;
                $curseq->seq_version($version);
            }
            elsif ( $attr_name eq 'organism-species' ) {
                my ( $genus, $species, $subsp ) =
                  split( /\s+/, $content, 3 );
                $curseq->species(
                    FAST::Bio::Species->new(
                        -sub_species    => $subsp,
                        -classification => [ $species, $genus ],
                    )
                );
            }
            elsif ( $attr_name eq 'organism-classification' ) {

                # Needs the species object installed by a preceding
                # organism-species attribute.
                my $organism = $curseq->species;
                if ( defined $organism ) {
                    my @class = (
                        split( /\s*;\s*/, $content ),
                        $organism->species
                    );
                    $organism->classification( [ reverse @class ] );
                }
                else {
                    $self->warn(
"organism-classification seen before organism-species. Ignoring"
                    );
                }
            }
            elsif ( $attr_name eq 'database-xref' ) {
                my ( $db, $id ) = split( /:/, $content );
                $curseq->annotation->add_Annotation(
                    'dblink',
                    FAST::Bio::Annotation::DBLink->new(
                        -database   => $db,
                        -primary_id => $id,
                    )
                );
            }
            elsif ($attr_name eq 'date-created'
                || $attr_name eq 'date-last-updated' )
            {
                $curseq->add_date($content);
            }
        }
    }
    elsif ( $name eq 'FEATURE' ) {
        my ( $id, $type ) = map { $value_of->($_) } qw(ID VALUE-TYPE);
        my $seq_id = defined $curseq ? $curseq->display_id : undef;

        push @{ $seen->{'_feats'} },
          FAST::Bio::SeqFeature::Generic->new(
            -seq_id      => $seq_id,
            -source_tag  => $Default_Source,
            -primary_tag => $type,
            -tag         => { 'ID' => $id },
          );
    }
    elsif ( $name eq 'QUALIFIER' ) {
        my ( $type, $value ) = map { $value_of->($_) } qw(VALUE-TYPE VALUE);
        my $curfeat = ( $seen->{'_feats'} || [] )->[-1];

        if ( defined $curfeat ) {
            $curfeat->add_tag_value( $type, $value );
        }
        else {
            $self->warn("QUALIFIER element outside of a FEATURE. Ignoring");
        }
    }
    elsif ( $name eq 'INTERVAL-LOC' ) {
        my ( $start, $end, $strand ) =
          map { $value_of->($_) } qw(STARTPOS ENDPOS COMPLEMENT);
        my $curfeat = ( $seen->{'_feats'} || [] )->[-1];

        if ( defined $curfeat ) {
            $curfeat->start($start);
            $curfeat->end($end);
            $curfeat->strand(-1) if $strand;
        }
        else {
            $self->warn(
                "INTERVAL-LOC element outside of a FEATURE. Ignoring");
        }
    }
    elsif ( $name eq 'REFERENCE' ) {
        push @{ $seen->{'_annot'} },
          FAST::Bio::Annotation::Reference->new();
    }

    # Track the open element so characters() knows where its text belongs.
    push @{ $self->{'_state'} }, $name;

    return $self->SUPER::start_element($ele);
}
# end_element
#
# SAX callback fired for every closing tag. Pops the element from the
# '_state' stack and finalises objects that were opened in start_element:
#
#   REFERENCE - the reference annotation on top of the '_annot' stack is
#               removed and attached to the current sequence
#   FEATURE   - the feature on top of the '_feats' stack is removed and
#               attached to the current sequence
#
# If there is no current sequence, or nothing on the relevant stack, the
# element is reported through $self->warn and otherwise ignored rather than
# aborting the parse with a method call on an undefined value.
#
# The event is then forwarded to the parent class, whose result is returned.
sub end_element {
    my ( $self, $ele ) = @_;

    pop @{ $self->{'_state'} };

    my $name   = uc $ele->{'LocalName'};
    my $seen   = ( $self->{'_seendata'} ||= {} );
    my $curseq = ( $seen->{'_seqs'} || [] )->[-1];

    if ( $name eq 'REFERENCE' ) {
        my $ref = pop @{ $seen->{'_annot'} };
        if ( defined $curseq && defined $ref ) {
            $curseq->annotation->add_Annotation( 'reference', $ref );
        }
        else {
            $self->warn(
"Closing REFERENCE with no current sequence or open reference. Ignoring"
            );
        }
    }
    elsif ( $name eq 'FEATURE' ) {
        my $feat = pop @{ $seen->{'_feats'} };
        if ( defined $curseq && defined $feat ) {
            $curseq->add_SeqFeature($feat);
        }
        else {
            $self->warn(
"Closing FEATURE with no current sequence or open feature. Ignoring"
            );
        }
    }

    return $self->SUPER::end_element($ele);
}
# characters
#
# SAX callback fired for text content. The element the text belongs to is
# the one on top of the '_state' stack:
#
#   REFAUTHORS / REFTITLE / REFJOURNAL - text goes to the authors / title /
#       location of the reference annotation on top of the '_annot' stack
#   SEQ-DATA - all whitespace is stripped (in place, in $data) and the text
#       becomes residues of the current sequence
#
# A SAX parser may deliver the text of a single element in several
# consecutive calls, so each chunk is appended to what has been collected so
# far instead of replacing it. Text arriving while no element is open is
# reported through $self->warn; text whose target object is missing is
# skipped.
#
# The event is always forwarded to the parent class, whose result is
# returned.
#
# Parameters:
#   $data - SAX characters hash; the text is read from its 'Data' entry.
sub characters {
    my ( $self, $data ) = @_;

    my $state = $self->{'_state'} || [];

    if ( !@$state ) {
        $self->warn(
"Calling characters with no previous start_element call. Ignoring data"
        );
    }
    else {
        my $seen     = $self->{'_seendata'} || {};
        my $curseq   = ( $seen->{'_seqs'}  || [] )->[-1];
        my $curannot = ( $seen->{'_annot'} || [] )->[-1];
        my $name     = $state->[-1];

        # Reference sub-elements and the accessor that stores their text.
        my %ref_accessor = (
            'REFAUTHORS' => 'authors',
            'REFTITLE'   => 'title',
            'REFJOURNAL' => 'location',
        );

        if ( my $accessor = $ref_accessor{$name} ) {
            if ( defined $curannot && defined $data->{'Data'} ) {
                my $so_far = $curannot->$accessor();
                $so_far = '' unless defined $so_far;
                $curannot->$accessor( $so_far . $data->{'Data'} );
            }
        }
        elsif ( $name eq 'SEQ-DATA' ) {
            if ( defined $data->{'Data'} ) {
                $data->{'Data'} =~ s/\s+//g;
                if ( defined $curseq ) {

                    # Re-sets the whole sequence for every chunk; simple,
                    # at the price of repeated work on very long inputs.
                    my $so_far = $curseq->seq();
                    $so_far = '' unless defined $so_far;
                    $curseq->seq( $so_far . $data->{'Data'} );
                }
            }
        }
    }

    return $self->SUPER::characters($data);
}
1;