use
vars
qw(%FTQUAL_NO_QUOTE)
;
%FTQUAL_NO_QUOTE
=(
'anticodon'
=>1,
'citation'
=>1,
'codon'
=>1,
'codon_start'
=>1,
'cons_splice'
=>1,
'direction'
=>1,
'evidence'
=>1,
'label'
=>1,
'mod_base'
=> 1,
'number'
=> 1,
'rpt_type'
=> 1,
'rpt_unit'
=> 1,
'transl_except'
=> 1,
'transl_table'
=> 1,
'usedin'
=> 1,
);
sub
_initialize {
my
(
$self
,
@args
) =
@_
;
$self
->SUPER::_initialize(
@args
);
$self
->{
'_func_ftunit_hash'
} = {};
$self
->_show_dna(1);
if
( !
defined
$self
->sequence_factory ) {
$self
->sequence_factory(FAST::Bio::Seq::SeqFactory->new
(
-verbose
=>
$self
->verbose(),
-type
=>
'FAST::Bio::Seq::RichSeq'
));
}
}
sub
next_seq {
my
(
$self
,
@args
) =
@_
;
my
(
$pseq
,
$c
,
$line
,
$name
,
$desc
,
$acc
,
$seqc
,
$mol
,
$div
,
$date
,
$comment
,
@date_arr
);
my
(
$annotation
,
%params
,
@features
) =
FAST::Bio::Annotation::Collection->new();
$line
=
$self
->_readline;
if
( !
defined
$line
) {
return
;
}
if
(
$line
=~ /^\s+$/ ) {
while
(
defined
(
$line
=
$self
->_readline) ) {
$line
=~/^\S/ &&
last
;
}
return
unless
$line
;
}
$self
->throw(
"EMBL stream with no ID. Not embl in my book"
)
unless
$line
=~ /^ID\s+\S+/;
my
$alphabet
;
if
(
$line
=~
tr
/;/;/ == 6) {
my
$topology
;
my
$sv
;
if
(
$line
=~ m/^ID (\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) BP./) {
(
$name
,
$sv
,
$topology
,
$mol
,
$div
) = ($1, $2, $3, $4, $6);
}
if
(
defined
$sv
) {
$params
{
'-seq_version'
} =
$sv
;
$params
{
'-version'
} =
$sv
;
}
if
(
defined
$topology
&&
$topology
eq
'circular'
) {
$params
{
'-is_circular'
} = 1;
}
if
(
defined
$mol
) {
if
(
$mol
=~ /DNA/) {
$alphabet
=
'dna'
;
}
elsif
(
$mol
=~ /RNA/) {
$alphabet
=
'rna'
;
}
elsif
(
$mol
=~ /AA/) {
$alphabet
=
'protein'
;
}
}
}
else
{
if
(
$line
=~ /^ID\s+(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;/) {
(
$name
,
$mol
,
$div
) = ($1, $2, $3);
}
if
(
$mol
) {
if
(
$mol
=~ /circular/ ) {
$params
{
'-is_circular'
} = 1;
$mol
=~ s|circular ||;
}
if
(
defined
$mol
) {
if
(
$mol
=~ /DNA/) {
$alphabet
=
'dna'
;
}
elsif
(
$mol
=~ /RNA/) {
$alphabet
=
'rna'
;
}
elsif
(
$mol
=~ /AA/) {
$alphabet
=
'protein'
;
}
}
}
}
unless
(
defined
$name
&&
length
(
$name
) ) {
$name
=
"unknown_id"
;
}
my
$buffer
=
$line
;
local
$_
;
BEFORE_FEATURE_TABLE :
my
$ncbi_taxid
;
until
( !
defined
$buffer
) {
$_
=
$buffer
;
if
( /^(F[HT]|SQ)/ ) {
$self
->_pushback(
$_
)
if
( $1 eq
'SQ'
|| $1 eq
'FT'
);
last
;
}
if
(/^DE\s+(\S.*\S)/) {
$desc
.=
$desc
?
" $1"
: $1;
}
if
( /^AC\s+(.*)?/ || /^PA\s+(.*)?/) {
my
@accs
=
split
(/[; ]+/, $1);
$params
{
'-accession_number'
} =
shift
@accs
unless
defined
$params
{
'-accession_number'
};
push
@{
$params
{
'-secondary_accessions'
}},
@accs
;
}
if
( /^SV\s+\S+\.(\d+);?/ ) {
my
$sv
= $1;
$params
{
'-seq_version'
} =
$sv
;
$params
{
'-version'
} =
$sv
;
}
if
( /^DT\s+(.+)$/ ) {
my
$line
= $1;
my
(
$date
,
$version
) =
split
(
' '
,
$line
, 2);
$date
=~
tr
/,//d;
if
(
$version
) {
if
(
$version
=~ /\(Rel\. (\d+), Created\)/xms ) {
my
$release
= FAST::Bio::Annotation::SimpleValue->new(
-tagname
=>
'creation_release'
,
-value
=> $1
);
$annotation
->add_Annotation(
$release
);
}
elsif
(
$version
=~ /\(Rel\. (\d+), Last updated, Version (\d+)\)/xms ) {
my
$release
= FAST::Bio::Annotation::SimpleValue->new(
-tagname
=>
'update_release'
,
-value
=> $1
);
$annotation
->add_Annotation(
$release
);
my
$update
= FAST::Bio::Annotation::SimpleValue->new(
-tagname
=>
'update_version'
,
-value
=> $2
);
$annotation
->add_Annotation(
$update
);
}
}
push
@{
$params
{
'-dates'
}},
$date
;
}
if
( /^KW (.*)\S*$/ ) {
my
@kw
=
split
(/\s*\;\s*/,$1);
push
@{
$params
{
'-keywords'
}},
@kw
;
}
elsif
(/^O[SC]/) {
my
$species
=
$self
->_read_EMBL_Species(\
$buffer
,
$params
{
'-accession_number'
});
$params
{
'-species'
}=
$species
;
}
elsif
(/^OX/) {
if
(/NCBI_TaxID=(\d+)/) {
$ncbi_taxid
=$1;
}
my
@links
=
$self
->_read_EMBL_TaxID_DBLink(\
$buffer
);
foreach
my
$dblink
(
@links
) {
$annotation
->add_Annotation(
'dblink'
,
$dblink
);
}
}
elsif
(/^R/) {
my
@refs
=
$self
->_read_EMBL_References(\
$buffer
);
foreach
my
$ref
(
@refs
) {
$annotation
->add_Annotation(
'reference'
,
$ref
);
}
}
elsif
(/^DR/) {
my
@links
=
$self
->_read_EMBL_DBLink(\
$buffer
);
foreach
my
$dblink
(
@links
) {
$annotation
->add_Annotation(
'dblink'
,
$dblink
);
}
}
elsif
(/^CC\s+(.*)/) {
$comment
.= $1;
$comment
.=
" "
;
while
(
defined
(
$_
=
$self
->_readline) ) {
if
(/^CC\s+(.*)/) {
$comment
.= $1;
$comment
.=
" "
;
}
else
{
last
;
}
}
my
$commobj
= FAST::Bio::Annotation::Comment->new();
$commobj
->text(
$comment
);
$annotation
->add_Annotation(
'comment'
,
$commobj
);
$comment
=
""
;
}
$buffer
=
$self
->_readline;
}
while
(
defined
(
$_
=
$self
->_readline) ) {
/^FT\s{3}\w/ &&
last
;
/^SQ / &&
last
;
/^CO / &&
last
;
}
$buffer
=
$_
;
if
(
defined
(
$buffer
) &&
$buffer
=~ /^FT /) {
until
( !
defined
(
$buffer
) ) {
my
$ftunit
=
$self
->_read_FTHelper_EMBL(\
$buffer
);
my
$feat
=
$ftunit
->_generic_seqfeature(
$self
->location_factory(),
$name
);
if
(
$params
{
'-species'
} && (
$feat
->primary_tag eq
'source'
)
&&
$feat
->has_tag(
'db_xref'
)
&& (!
$params
{
'-species'
}->ncbi_taxid())) {
foreach
my
$tagval
(
$feat
->get_tag_values(
'db_xref'
)) {
if
(
index
(
$tagval
,
"taxon:"
) == 0) {
$params
{
'-species'
}->ncbi_taxid(
substr
(
$tagval
,6));
last
;
}
}
}
push
(
@features
,
$feat
);
if
(
$buffer
!~ /^FT/ ) {
last
;
}
}
}
if
(
$params
{
'-species'
} &&
defined
$ncbi_taxid
&& (!
$params
{
'-species'
}->ncbi_taxid())) {
$params
{
'-species'
}->ncbi_taxid(
$ncbi_taxid
);
}
while
(
defined
(
$buffer
) &&
$buffer
=~ /^XX/ ) {
$buffer
=
$self
->_readline();
}
if
(
$buffer
=~ /^CO/ ) {
while
(
defined
(
$buffer
) ) {
$annotation
->add_Annotation(
$_
)
for
$self
->_read_EMBL_Contig(\
$buffer
);
if
( !
$buffer
||
$buffer
!~ /^CO/ ) {
last
;
}
}
$buffer
||=
''
;
}
if
(
$buffer
!~ /^\/\//) {
if
(
$buffer
!~ /^SQ/ ) {
while
(
defined
(
$_
=
$self
->_readline) ) {
/^SQ/ &&
last
;
}
}
$seqc
=
""
;
while
(
defined
(
$_
=
$self
->_readline) ) {
m{^//} &&
last
;
$_
=
uc
(
$_
);
s/[^A-Za-z]//g;
$seqc
.=
$_
;
}
}
my
$seq
=
$self
->sequence_factory->create
(
-verbose
=>
$self
->verbose(),
-division
=>
$div
,
-seq
=>
$seqc
,
-desc
=>
$desc
,
-display_id
=>
$name
,
-annotation
=>
$annotation
,
-molecule
=>
$mol
,
-alphabet
=>
$alphabet
,
-features
=> \
@features
,
%params
);
return
$seq
;
}
sub
_write_ID_line {
my
(
$self
,
$seq
) =
@_
;
my
$id_line
;
if
(
$self
->_id_generation_func ) {
$id_line
=
"ID "
. &{
$self
->_id_generation_func}(
$seq
) .
"\nXX\n"
;
}
else
{
my
$name
=
$seq
->accession_number();
if
( not(
defined
$name
) ||
$name
eq
'unknown'
) {
$name
=
$seq
->id() ||
''
;
}
$self
->
warn
(
"No whitespace allowed in EMBL id ["
.
$name
.
"]"
)
if
$name
=~ /\s/;
my
$version
=
$seq
->version() || 1;
my
$len
=
$seq
->
length
();
my
$div
;
if
(
$seq
->can(
'division'
) &&
defined
(
$seq
->division) &&
$self
->_is_valid_division(
$seq
->division) ) {
$div
=
$seq
->division();
}
else
{
$div
||=
'UNC'
;
}
my
$mol
;
if
(
$seq
->can(
'molecule'
)
&&
defined
(
$seq
->molecule)
&&
$self
->_is_valid_molecule_type(
$seq
->molecule)
) {
$mol
=
$seq
->molecule();
}
elsif
(
$seq
->can(
'primary_seq'
) &&
defined
$seq
->primary_seq->alphabet) {
my
$alphabet
=
$seq
->primary_seq->alphabet;
if
(
$alphabet
eq
'dna'
) {
$mol
=
'unassigned DNA'
;
}
elsif
(
$alphabet
eq
'rna'
) {
$mol
=
'unassigned RNA'
;
}
elsif
(
$alphabet
eq
'protein'
) {
$self
->
warn
(
"Protein sequence found; EMBL is a nucleotide format."
);
$mol
=
'AA'
;
}
}
my
$topology
=
'linear'
;
if
(
$seq
->is_circular) {
$topology
=
'circular'
;
}
$mol
||=
''
;
$id_line
=
"ID $name; SV $version; $topology; $mol; STD; $div; $len BP.\nXX\n"
;
$self
->_print(
$id_line
);
}
}
sub
_is_valid_division {
my
(
$self
,
$division
) =
@_
;
my
%EMBL_divisions
= (
"PHG"
=> 1,
"ENV"
=> 1,
"FUN"
=> 1,
"HUM"
=> 1,
"INV"
=> 1,
"MAM"
=> 1,
"VRT"
=> 1,
"MUS"
=> 1,
"PLN"
=> 1,
"PRO"
=> 1,
"ROD"
=> 1,
"SYN"
=> 1,
"UNC"
=> 1,
"VRL"
=> 1
);
return
exists
(
$EMBL_divisions
{
$division
});
}
sub
_is_valid_molecule_type {
my
(
$self
,
$moltype
) =
@_
;
my
%EMBL_molecule_types
= (
"genomic DNA"
=> 1,
"genomic RNA"
=> 1,
"mRNA"
=> 1,
"tRNA"
=> 1,
"rRNA"
=> 1,
"snoRNA"
=> 1,
"snRNA"
=> 1,
"scRNA"
=> 1,
"pre-RNA"
=> 1,
"other RNA"
=> 1,
"other DNA"
=> 1,
"unassigned DNA"
=> 1,
"unassigned RNA"
=> 1
);
return
exists
(
$EMBL_molecule_types
{
$moltype
});
}
sub
write_seq {
my
(
$self
,
@seqs
) =
@_
;
foreach
my
$seq
(
@seqs
) {
$self
->throw(
"Attempting to write with no seq!"
)
unless
defined
$seq
;
unless
(
ref
$seq
&&
$seq
->isa(
'FAST::Bio::SeqI'
) ) {
$self
->
warn
(
"$seq is not a SeqI compliant sequence object!"
)
if
$self
->verbose >= 0;
unless
(
ref
$seq
&&
$seq
->isa(
'FAST::Bio::PrimarySeqI'
) ) {
$self
->throw(
"$seq is not a PrimarySeqI compliant sequence object!"
);
}
}
my
$str
=
$seq
->seq ||
''
;
$self
->_write_ID_line(
$seq
);
my
(
$acc
);
{
if
(
my
$func
=
$self
->_ac_generation_func ) {
$acc
= &{
$func
}(
$seq
);
}
elsif
(
$seq
->isa(
'FAST::Bio::Seq::RichSeqI'
) &&
defined
(
$seq
->accession_number) ) {
$acc
=
$seq
->accession_number;
$acc
=
join
(
"; "
,
$acc
,
$seq
->get_secondary_accessions);
}
elsif
(
$seq
->can(
'accession_number'
) ) {
$acc
=
$seq
->accession_number;
}
if
(
defined
$acc
) {
$self
->_print(
"AC $acc;\n"
,
"XX\n"
) ||
return
;
}
}
my
$switch
=0;
if
(
$seq
->can(
'get_dates'
) ) {
my
@dates
=
$seq
->get_dates();
my
$ct
= 1;
my
$date_flag
= 0;
my
(
$cr
) =
$seq
->annotation->get_Annotations(
"creation_release"
);
my
(
$ur
) =
$seq
->annotation->get_Annotations(
"update_release"
);
my
(
$uv
) =
$seq
->annotation->get_Annotations(
"update_version"
);
unless
(
$cr
&&
$ur
&&
$ur
) {
$date_flag
= 1;
}
foreach
my
$dt
(
@dates
) {
if
(!
$date_flag
) {
$self
->_write_line_EMBL_regex(
"DT "
,
"DT "
,
$dt
.
" (Rel. $cr, Created)"
,
'\s+|$'
,80)
if
$ct
== 1;
$self
->_write_line_EMBL_regex(
"DT "
,
"DT "
,
$dt
.
" (Rel. $ur, Last updated, Version $uv)"
,
'\s+|$'
,80)
if
$ct
== 2;
}
else
{
$self
->_write_line_EMBL_regex(
"DT "
,
"DT "
,
$dt
,
'\s+|$'
,80);
}
$switch
=1;
$ct
++;
}
if
(
$switch
== 1) {
$self
->_print(
"XX\n"
) ||
return
;
}
}
$self
->_write_line_EMBL_regex(
"DE "
,
"DE "
,
$seq
->desc(),
'\s+|$'
,80) ||
return
;
$self
->_print(
"XX\n"
) ||
return
;
{
my
(
$kw
);
if
(
my
$func
=
$self
->_kw_generation_func ) {
$kw
= &{
$func
}(
$seq
);
}
elsif
(
$seq
->can(
'keywords'
) ) {
$kw
=
$seq
->keywords;
}
if
(
defined
$kw
) {
$self
->_write_line_EMBL_regex(
"KW "
,
"KW "
,
$kw
,
'\s+|$'
, 80) ||
return
;
$self
->_print(
"XX\n"
) ||
return
;
}
}
if
(
$seq
->can(
'species'
) && (
my
$spec
=
$seq
->species)) {
my
@class
=
$spec
->classification();
shift
@class
;
my
$OS
=
$spec
->scientific_name;
if
(
$spec
->common_name) {
$OS
.=
' ('
.
$spec
->common_name.
')'
;
}
$self
->_print(
"OS $OS\n"
) ||
return
;
my
$OC
=
join
(
'; '
,
reverse
(
@class
)) .
'.'
;
$self
->_write_line_EMBL_regex(
"OC "
,
"OC "
,
$OC
,
'; |$'
,80) ||
return
;
if
(
$spec
->organelle) {
$self
->_write_line_EMBL_regex(
"OG "
,
"OG "
,
$spec
->organelle,
'; |$'
,80) ||
return
;
}
my
$ncbi_taxid
=
$spec
->ncbi_taxid;
if
(
$ncbi_taxid
) {
$self
->_print(
"OX NCBI_TaxID=$ncbi_taxid\n"
) ||
return
;
}
$self
->_print(
"XX\n"
) ||
return
;
}
my
$t
= 1;
if
(
$seq
->can(
'annotation'
) &&
defined
$seq
->annotation ) {
foreach
my
$ref
(
$seq
->annotation->get_Annotations(
'reference'
) ) {
$self
->_print(
"RN [$t]\n"
) ||
return
;
if
(
$ref
->comment) {
$self
->_write_line_EMBL_regex(
"RC "
,
"RC "
,
$ref
->comment,
'\s+|$'
, 80) ||
return
;
}
my
$start
=
$ref
->start;
my
$end
=
$ref
->end;
if
(
$start
and
$end
) {
$self
->_print(
"RP $start-$end\n"
) ||
return
;
}
elsif
(
$start
or
$end
) {
$self
->throw(
"Both start and end are needed for a valid RP line."
.
" Got: start='$start' end='$end'"
);
}
if
(
my
$med
=
$ref
->medline) {
$self
->_print(
"RX MEDLINE; $med.\n"
) ||
return
;
}
if
(
my
$pm
=
$ref
->pubmed) {
$self
->_print(
"RX PUBMED; $pm.\n"
) ||
return
;
}
my
$authors
=
$ref
->authors;
$authors
=~ s/([\w\.]) (\w)/$1
$self
->_write_line_EMBL_regex(
"RA "
,
"RA "
,
$authors
.
";"
,
'\s+|$'
, 80) ||
return
;
my
$ref_title
=
$ref
->title ||
''
;
$ref_title
=~ s/[\s;]*$/;/;
$self
->_write_line_EMBL_regex(
"RT "
,
"RT "
,
$ref_title
,
'\s+|$'
, 80) ||
return
;
$self
->_write_line_EMBL_regex(
"RL "
,
"RL "
,
$ref
->location,
'\s+|$'
, 80) ||
return
;
$self
->_print(
"XX\n"
) ||
return
;
$t
++;
}
if
(
my
@db_xref
=
$seq
->annotation->get_Annotations(
'dblink'
) ) {
for
my
$dr
(
@db_xref
) {
my
$db_name
=
$dr
->database;
my
$prim
=
$dr
->primary_id;
my
$opt
=
$dr
->optional_id ||
''
;
my
$line
=
$opt
?
"$db_name; $prim; $opt."
:
"$db_name; $prim."
;
$self
->_write_line_EMBL_regex(
"DR "
,
"DR "
,
$line
,
'\s+|$'
, 80) ||
return
;
}
$self
->_print(
"XX\n"
) ||
return
;
}
foreach
my
$comment
(
$seq
->annotation->get_Annotations(
'comment'
) ) {
$self
->_write_line_EMBL_regex(
"CC "
,
"CC "
,
$comment
->text,
'\s+|$'
, 80) ||
return
;
$self
->_print(
"XX\n"
) ||
return
;
}
}
$self
->_print(
"FH Key Location/Qualifiers\n"
) ||
return
;
$self
->_print(
"FH\n"
) ||
return
;
my
@feats
=
$seq
->can(
'top_SeqFeatures'
) ?
$seq
->top_SeqFeatures : ();
if
(
$feats
[0]) {
if
(
defined
$self
->_post_sort ) {
my
$post_sort_func
=
$self
->_post_sort();
my
@fth
;
foreach
my
$sf
(
@feats
) {
push
(
@fth
,FAST::Bio::SeqIO::FTHelper::from_SeqFeature(
$sf
,
$seq
));
}
@fth
=
sort
{
&$post_sort_func
(
$a
,
$b
) }
@fth
;
foreach
my
$fth
(
@fth
) {
$self
->_print_EMBL_FTHelper(
$fth
) ||
return
;
}
}
else
{
foreach
my
$sf
(
@feats
) {
my
@fth
= FAST::Bio::SeqIO::FTHelper::from_SeqFeature(
$sf
,
$seq
);
foreach
my
$fth
(
@fth
) {
if
(
$fth
->key eq
'CONTIG'
) {
$self
->_show_dna(0);
}
$self
->_print_EMBL_FTHelper(
$fth
) ||
return
;
}
}
}
}
if
(
$self
->_show_dna() == 0 ) {
$self
->_print(
"//\n"
) ||
return
;
return
;
}
$self
->_print(
"XX\n"
) ||
return
;
if
(
$seq
->can(
'annotation'
) &&
defined
$seq
->annotation) {
foreach
my
$ctg
(
$seq
->annotation->get_Annotations(
'contig'
) ) {
if
(
$ctg
->value) {
$self
->_write_line_EMBL_regex(
"CO "
,
"CO "
,
$ctg
->value,
'[,]|$'
, 80) ||
return
;
}
}
}
if
(
length
(
$str
)) {
$str
=~
tr
/A-Z/a-z/;
my
$alen
=
$str
=~
tr
/a/a/;
my
$clen
=
$str
=~
tr
/c/c/;
my
$glen
=
$str
=~
tr
/g/g/;
my
$tlen
=
$str
=~
tr
/t/t/;
my
$len
=
$seq
->
length
();
my
$olen
=
$seq
->
length
() - (
$alen
+
$tlen
+
$clen
+
$glen
);
if
(
$olen
< 0 ) {
$self
->
warn
(
"Weird. More atgc than bases. Problem!"
);
}
$self
->_print(
"SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n"
) ||
return
;
my
$nuc
= 60;
my
$whole_pat
=
'a10'
x 6;
my
$out_pat
=
'A11'
x 6;
my
$length
=
length
(
$str
);
my
$whole
=
int
(
$length
/
$nuc
) *
$nuc
;
my
(
$i
);
for
(
$i
= 0;
$i
<
$whole
;
$i
+=
$nuc
) {
my
$blocks
=
pack
$out_pat
,
unpack
$whole_pat
,
substr
(
$str
,
$i
,
$nuc
);
$self
->_print(
sprintf
(
" $blocks%9d\n"
,
$i
+
$nuc
)) ||
return
;
}
if
(
my
$last
=
substr
(
$str
,
$i
)) {
my
$last_len
=
length
(
$last
);
my
$last_pat
=
'a10'
x
int
(
$last_len
/ 10) .
'a'
.
$last_len
% 10;
my
$blocks
=
pack
$out_pat
,
unpack
(
$last_pat
,
$last
);
$self
->_print(
sprintf
(
" $blocks%9d\n"
,
$length
)) ||
return
;
}
}
$self
->_print(
"//\n"
) ||
return
;
$self
->flush
if
$self
->_flush_on_write &&
defined
$self
->_fh;
}
return
1;
}
sub
_print_EMBL_FTHelper {
my
(
$self
,
$fth
) =
@_
;
if
( !
ref
$fth
|| !
$fth
->isa(
'FAST::Bio::SeqIO::FTHelper'
) ) {
$fth
->
warn
(
"$fth is not a FTHelper class. Attempting to print, but there could be tears!"
);
}
if
(
$fth
->key eq
'CONTIG'
) {
$self
->_print(
"XX\n"
) ||
return
;
$self
->_write_line_EMBL_regex(
"CO "
,
"CO "
,
$fth
->loc,
'\,|$'
,80) ||
return
;
return
1;
}
$self
->_write_line_EMBL_regex(
sprintf
(
"FT %-15s "
,
$fth
->key),
"FT "
,
$fth
->loc,
'\,|$'
,80) ||
return
;
foreach
my
$tag
(
keys
%{
$fth
->field} ) {
if
( !
defined
$fth
->field->{
$tag
} ) {
next
;
}
foreach
my
$value
( @{
$fth
->field->{
$tag
}} ) {
$value
=~ s/\"/\"\"/g;
if
(
$value
eq
"_no_value"
) {
$self
->_write_line_EMBL_regex(
"FT "
,
"FT "
,
"/$tag"
,
'.|$'
,80) ||
return
;
}
elsif
(!
$FTQUAL_NO_QUOTE
{
$tag
} or
length
(
"/$tag=$value"
)>=60) {
my
$pat
=
$value
=~ /\s/ ?
'\s|\-|$'
:
'.|\-|$'
;
$self
->_write_line_EMBL_regex(
"FT "
,
"FT "
,
"/$tag=\"$value\""
,
$pat
,80) ||
return
;
}
else
{
$self
->_write_line_EMBL_regex(
"FT "
,
"FT "
,
"/$tag=$value"
,
'.|$'
,80) ||
return
;
}
}
}
return
1;
}
sub
_read_EMBL_Contig {
my
(
$self
,
$buffer
) =
@_
;
my
@ret
;
if
(
$$buffer
!~ /^CO/ ) {
warn
(
"Not parsing line '$$buffer' which maybe important"
);
}
$self
->_pushback(
$$buffer
);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^C/ ||
last
;
/^CO\s+(.*)/ &&
do
{
push
@ret
, FAST::Bio::Annotation::SimpleValue->new(
-tagname
=>
'contig'
,
-value
=> $1);
};
}
$$buffer
=
$_
;
return
@ret
;
}
sub
_read_EMBL_References {
my
(
$self
,
$buffer
) =
@_
;
my
(
@refs
);
if
(
$$buffer
!~ /^RN/ ) {
warn
(
"Not parsing line '$$buffer' which maybe important"
);
}
my
$b1
;
my
$b2
;
my
$title
;
my
$loc
;
my
$au
;
my
$med
;
my
$pm
;
my
$com
;
while
(
defined
(
$_
=
$self
->_readline) ) {
/^R/ ||
last
;
/^RP (\d+)-(\d+)/ &&
do
{
$b1
=$1;
$b2
=$2;};
/^RX MEDLINE;\s+(\d+)/ &&
do
{
$med
=$1};
/^RX PUBMED;\s+(\d+)/ &&
do
{
$pm
=$1};
/^RA (.*)/ &&
do
{
$au
=
$self
->_concatenate_lines(
$au
,$1);
next
;
};
/^RT (.*)/ &&
do
{
$title
=
$self
->_concatenate_lines(
$title
,$1);
next
;
};
/^RL (.*)/ &&
do
{
$loc
=
$self
->_concatenate_lines(
$loc
,$1);
next
;
};
/^RC (.*)/ &&
do
{
$com
=
$self
->_concatenate_lines(
$com
,$1);
next
;
};
}
my
$ref
= FAST::Bio::Annotation::Reference->new();
$au
=~ s/;\s*$//g;
$title
=~ s/;\s*$//g;
$ref
->start(
$b1
);
$ref
->end(
$b2
);
$ref
->authors(
$au
);
$ref
->title(
$title
);
$ref
->location(
$loc
);
$ref
->medline(
$med
);
$ref
->comment(
$com
);
$ref
->pubmed(
$pm
);
push
(
@refs
,
$ref
);
$$buffer
=
$_
;
return
@refs
;
}
sub
_read_EMBL_Species {
my
(
$self
,
$buffer
,
$acc
) =
@_
;
my
$org
;
$_
=
$$buffer
;
my
(
$sub_species
,
$species
,
$genus
,
$common
,
$sci_name
,
$class_lines
);
while
(
defined
(
$_
||=
$self
->_readline )) {
if
(/^OS\s+(.+)/) {
$sci_name
.= (
$sci_name
) ?
' '
.$1 : $1;
}
elsif
(s/^OC\s+(.+)$//) {
$class_lines
.= $1;
}
elsif
(/^OG\s+(.*)/) {
$org
= $1;
}
else
{
last
;
}
$_
=
undef
;
}
$self
->_pushback(
$_
);
$sci_name
=~ s{\.$}{};
$sci_name
||
return
;
my
@class
=
map
{ s/^\s+//; s/\s+$//; s/\s{2,}/ /g;
$_
; }
split
/(?<!subgen)[;\.]+/,
$class_lines
;
my
$possible_genus
=
$class
[-1];
$possible_genus
.=
"|$class[-2]"
if
$class
[-2];
if
(
$sci_name
=~ /^(
$possible_genus
)/) {
$genus
= $1;
(
$species
) =
$sci_name
=~ /^
$genus
\s+(.+)/;
}
else
{
$species
=
$sci_name
;
}
if
(
$genus
) {
return
if
$genus
=~ /^(Unknown|None)$/i;
}
if
(
$species
=~ /subsp\.|var\./) {
(
$species
,
$sub_species
) =
$species
=~ /(.+)\s+((?:subsp\.|var\.).+)/;
}
unless
(
$class
[-1] eq
'Viruses'
) {
(
$species
,
$common
) =
$species
=~ /^(.+)\s+\((.+)\)$/;
$sci_name
=~ s/\s+\(.+\)$//
if
$common
;
}
unless
(
$class
[-1] eq
$sci_name
) {
push
(
@class
,
$sci_name
);
}
@class
=
reverse
@class
;
$self
->throw(
"$acc seems to be missing its OS line: invalid."
)
unless
$sci_name
;
my
%names
;
foreach
my
$i
(0..
$#class
) {
my
$name
=
$class
[
$i
];
$names
{
$name
}++;
}
my
$make
= FAST::Bio::Species->new();
$make
->scientific_name(
$sci_name
);
$make
->classification(
@class
);
unless
(
$class
[-1] eq
'Viruses'
) {
$make
->genus(
$genus
)
if
$genus
;
$make
->species(
$species
)
if
$species
;
$make
->sub_species(
$sub_species
)
if
$sub_species
;
$make
->common_name(
$common
)
if
$common
;
}
$make
->organelle(
$org
)
if
$org
;
return
$make
;
}
sub
_read_EMBL_DBLink {
my
(
$self
,
$buffer
) =
@_
;
my
(
@db_link
);
$_
=
$$buffer
;
while
(
defined
(
$_
||=
$self
->_readline )) {
if
( /^DR ([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?\.$/) {
my
(
$databse
,
$prim_id
,
$sec_id
) = ($1,$2,$3);
my
$link
= FAST::Bio::Annotation::DBLink->new(
-database
=>
$databse
,
-primary_id
=>
$prim_id
,
-optional_id
=>
$sec_id
);
push
(
@db_link
,
$link
);
}
else
{
last
;
}
$_
=
undef
;
}
$$buffer
=
$_
;
return
@db_link
;
}
sub
_read_EMBL_TaxID_DBLink {
my
(
$self
,
$buffer
) =
@_
;
my
(
@db_link
);
$_
=
$$buffer
;
while
(
defined
(
$_
||=
$self
->_readline )) {
if
( /^OX (\S+)=(\d+);$/ ) {
my
(
$databse
,
$prim_id
) = ($1,$2);
my
$link
= FAST::Bio::Annotation::DBLink->new(
-database
=>
$databse
,
-primary_id
=>
$prim_id
,);
push
(
@db_link
,
$link
);
}
else
{
last
;
}
$_
=
undef
;
}
$$buffer
=
$_
;
return
@db_link
;
}
sub
_filehandle{
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'_filehandle'
} =
$value
;
}
return
$obj
->{
'_filehandle'
};
}
sub
_read_FTHelper_EMBL {
my
(
$self
,
$buffer
) =
@_
;
my
(
$key
,
$loc
,
@qual
,
);
if
(
$$buffer
=~ /^FT\s{3}(\S+)\s+(\S+)/ ) {
$key
= $1;
$loc
= $2;
while
(
defined
(
$_
=
$self
->_readline) ) {
if
(/^FT(\s+)(.+?)\s*$/) {
if
(
length
($1) > 4) {
if
(
@qual
) {
push
(
@qual
, $2);
}
elsif
(
substr
($2, 0, 1) eq
'/'
) {
@qual
= ($2);
}
else
{
$loc
.= $2;
}
}
else
{
last
;
}
}
else
{
last
;
}
}
}
elsif
(
$$buffer
=~ /^CO\s+(\S+)/) {
$key
=
'CONTIG'
;
$loc
= $1;
while
(
defined
(
$_
=
$self
->_readline) ) {
if
(/^CO\s+(\S+)\s*$/) {
$loc
.= $1;
}
else
{
last
;
}
}
}
else
{
return
;
}
$$buffer
=
$_
;
my
$out
= FAST::Bio::SeqIO::FTHelper->new();
$out
->verbose(
$self
->verbose());
$out
->key(
$key
);
$out
->loc(
$loc
);
QUAL:
for
(
my
$i
= 0;
$i
<
@qual
;
$i
++) {
$_
=
$qual
[
$i
];
my
(
$qualifier
,
$value
) = m{^/([^=]+)(?:=(.+))?}
or
$self
->throw(
"Can't see new qualifier in: $_\nfrom:\n"
.
join
(
''
,
map
"$_\n"
,
@qual
));
if
(
defined
$value
) {
if
(
substr
(
$value
, 0, 1) eq
'"'
) {
QUOTES:
while
(
$value
!~ /
"$/ or $value =~ tr/"
/
"/ % 2) { #"
$i
++;
my
$next
=
$qual
[
$i
];
if
(!
defined
(
$next
)) {
$self
->
warn
(
"Unbalanced quote in:\n"
.
join
(
"\n"
,
@qual
).
"\nAdding quote to close..."
.
"Check sequence quality!"
);
$value
.=
'"'
;
last
QUOTES;
}
if
(
$qualifier
eq
"translation"
) {
$value
.=
$next
;
}
else
{
$value
.=
" $next"
;
}
}
$value
=~ s/^
"|"
$//g;
$value
=~ s/
""
/
"/g; #"
}
}
else
{
$value
=
'_no_value'
;
}
$out
->field->{
$qualifier
} ||= [];
push
(@{
$out
->field->{
$qualifier
}},
$value
);
}
return
$out
;
}
sub
_write_line_EMBL {
my
(
$self
,
$pre1
,
$pre2
,
$line
,
$length
) =
@_
;
$length
||
$self
->throw(
"Miscalled write_line_EMBL without length. Programming error!"
);
my
$subl
=
$length
-
length
$pre2
;
my
$linel
=
length
$line
;
my
$i
;
my
$sub
=
substr
(
$line
,0,
$length
-
length
$pre1
);
$self
->_print(
"$pre1$sub\n"
) ||
return
;
for
(
$i
= (
$length
-
length
$pre1
);
$i
<
$linel
;) {
$sub
=
substr
(
$line
,
$i
,(
$subl
));
$self
->_print(
"$pre2$sub\n"
) ||
return
;
$i
+=
$subl
;
}
return
1;
}
sub
_write_line_EMBL_regex {
my
(
$self
,
$pre1
,
$pre2
,
$line
,
$regex
,
$length
) =
@_
;
$length
||
$self
->throw(
"Programming error - called write_line_EMBL_regex without length."
);
my
$subl
=
$length
- (
length
$pre1
) -1 ;
my
(
@lines
);
CHUNK:
while
(
$line
) {
foreach
my
$pat
(
$regex
,
'[,;\.\/-]\s|'
.
$regex
,
'[,;\.\/-]|'
.
$regex
) {
if
(
$line
=~ m/^(.{0,
$subl
})(
$pat
)(.*)/ ) {
my
$l
= $1.$2;
$l
=~ s/
if
$pre1
eq
"RA "
;
my
$newl
= $3;
$line
=
substr
(
$line
,
length
(
$l
));
$l
=~ s/\s+$//;
next
CHUNK
if
(
$l
eq
''
);
push
(
@lines
,
$l
);
next
CHUNK;
}
}
$self
->
warn
(
"trouble dissecting \"$line\"\n into chunks "
.
"of $subl chars or less - this tag won't print right"
);
$line
=
substr
(
$line
,0,
$subl
) .
" "
.
substr
(
$line
,
$subl
);
}
my
$s
=
shift
@lines
;
(
$self
->_print(
"$pre1$s\n"
) ||
return
)
if
$s
;
foreach
my
$s
(
@lines
) {
$self
->_print(
"$pre2$s\n"
) ||
return
;
}
return
1;
}
sub
_post_sort{
my
$obj
=
shift
;
if
(
@_
) {
my
$value
=
shift
;
$obj
->{
'_post_sort'
} =
$value
;
}
return
$obj
->{
'_post_sort'
};
}
sub
_show_dna{
my
$obj
=
shift
;
if
(
@_
) {
my
$value
=
shift
;
$obj
->{
'_show_dna'
} =
$value
;
}
return
$obj
->{
'_show_dna'
};
}
sub
_id_generation_func{
my
$obj
=
shift
;
if
(
@_
) {
my
$value
=
shift
;
$obj
->{
'_id_generation_func'
} =
$value
;
}
return
$obj
->{
'_id_generation_func'
};
}
sub
_ac_generation_func{
my
$obj
=
shift
;
if
(
@_
) {
my
$value
=
shift
;
$obj
->{
'_ac_generation_func'
} =
$value
;
}
return
$obj
->{
'_ac_generation_func'
};
}
sub
_sv_generation_func{
my
$obj
=
shift
;
if
(
@_
) {
my
$value
=
shift
;
$obj
->{
'_sv_generation_func'
} =
$value
;
}
return
$obj
->{
'_sv_generation_func'
};
}
sub
_kw_generation_func{
my
$obj
=
shift
;
if
(
@_
) {
my
$value
=
shift
;
$obj
->{
'_kw_generation_func'
} =
$value
;
}
return
$obj
->{
'_kw_generation_func'
};
}
1;