our
$FTQUAL_LINE_LENGTH
= 60;
our
%FTQUAL_NO_QUOTE
=
map
{
$_
=> 1}
qw(
anticodon citation
codon codon_start
cons_splice direction
evidence label
mod_base number
rpt_type rpt_unit
transl_except transl_table
usedin
)
;
our
%DBSOURCE
=
map
{
$_
=> 1}
qw(
EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs
TIGR GO InterPro Pfam PROSITE SGD GermOnline
HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG
H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR
MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome
UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE
PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE
Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase
TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP
PhotoList Gramene WormBase WormPep Genew ZFIN
PeroxiBase MaizeDB TAIR DrugBank REBASE HPA
swissprot GenBank GenPept REFSEQ embl PDB UniProtKB
DIP PeptideAtlas PRIDE CYGD HOGENOME Gene3D
Project)
;
our
%VALID_MOLTYPE
=
map
{
$_
=> 1}
qw(NA DNA RNA tRNA rRNA cDNA cRNA ms-DNA
mRNA uRNA ss-RNA ss-DNA snRNA snoRNA PRT)
;
our
%VALID_ALPHABET
= (
'bp'
=>
'dna'
,
'aa'
=>
'protein'
,
'rc'
=>
''
);
sub
_initialize {
my
(
$self
,
@args
) =
@_
;
$self
->SUPER::_initialize(
@args
);
$self
->{
'_func_ftunit_hash'
} = {};
$self
->_show_dna(1);
if
( !
defined
$self
->sequence_factory ) {
$self
->sequence_factory(FAST::Bio::Seq::SeqFactory->new
(
-verbose
=>
$self
->verbose(),
-type
=>
'FAST::Bio::Seq::RichSeq'
));
}
}
sub
next_seq {
my
(
$self
,
@args
) =
@_
;
my
%args
=
@args
;
my
$builder
=
$self
->sequence_builder();
my
$seq
;
my
%params
;
RECORDSTART:
while
(1) {
my
$buffer
;
my
(
@acc
,
@features
);
my
(
$display_id
,
$annotation
);
my
$species
;
@features
= ();
$annotation
=
undef
;
@acc
= ();
$species
=
undef
;
%params
= (
-verbose
=>
$self
->verbose );
local
($/) =
"\n"
;
while
(
defined
(
$buffer
=
$self
->_readline() ) ) {
last
if
index
(
$buffer
,
'LOCUS '
) == 0;
}
return
unless
defined
$buffer
;
$buffer
=~ /^LOCUS\s+(\S.*)$/o
||
$self
->throw(
"GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'"
);
my
@tokens
=
split
(
' '
, $1 );
$display_id
=
shift
(
@tokens
);
$params
{
'-display_id'
} =
$display_id
;
my
$seqlength
=
shift
(
@tokens
);
if
(
exists
$VALID_ALPHABET
{
$seqlength
} ) {
$self
->
warn
(
"Bad LOCUS name? Changing [$params{'-display_id'}] to 'unknown' and length to $display_id"
);
$params
{
'-display_id'
} =
'unknown'
;
$params
{
'-length'
} =
$display_id
;
unshift
@tokens
,
$seqlength
;
}
else
{
$params
{
'-length'
} =
$seqlength
;
}
my
$alphabet
=
lc
(
shift
@tokens
);
$params
{
'-alphabet'
} =
(
exists
$VALID_ALPHABET
{
$alphabet
} )
?
$VALID_ALPHABET
{
$alphabet
}
:
$self
->
warn
(
"Unknown alphabet: $alphabet"
);
if
(
$params
{
'-alphabet'
} eq
'protein'
) {
$params
{
'-molecule'
} =
'PRT'
;
}
else
{
$params
{
'-molecule'
} =
shift
(
@tokens
);
}
if
(
$params
{
'-molecule'
} eq
'dna'
||
$params
{
'-molecule'
} eq
'rna'
) {
$params
{
'-molecule'
} =
uc
$params
{
'-molecule'
};
}
$self
->debug(
"Unrecognized molecule type:"
.
$params
{
'-molecule'
} )
if
!
exists
(
$VALID_MOLTYPE
{
$params
{
'-molecule'
} } );
my
$circ
=
shift
(
@tokens
);
if
(
$circ
eq
'circular'
) {
$params
{
'-is_circular'
} = 1;
$params
{
'-division'
} =
shift
(
@tokens
);
}
else
{
$params
{
'-division'
} =
( CORE::
length
(
$circ
) == 3 ) ?
$circ
:
shift
(
@tokens
);
}
my
$date
=
join
(
' '
,
@tokens
);
if
(
$date
=~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/ ) {
if
(
length
(
$date
) < 11 ) {
my
(
$d
,
$m
,
$y
) = ( $2, $3, $4 );
if
(
length
(
$d
) == 1 ) {
$d
=
"0$d"
;
}
if
(
length
(
$y
) == 2 ) {
if
(
$y
> 60 ) {
$y
=
"19$y"
;
}
else
{
$y
=
"20$y"
;
}
$self
->
warn
(
"Date was malformed, guessing the century for $date to be $y\n"
);
}
$params
{
'-dates'
} = [
join
(
'-'
,
$d
,
$m
,
$y
) ];
}
else
{
$params
{
'-dates'
} = [
$date
];
}
}
$builder
->add_slot_value(
%params
);
%params
= ();
if
( !
$builder
->want_object() ) {
$builder
->make_object();
next
RECORDSTART;
}
if
(
$builder
->want_slot(
'annotation'
) ) {
$annotation
= FAST::Bio::Annotation::Collection->new();
}
$buffer
=
$self
->_readline();
until
( !
defined
(
$buffer
) ) {
$_
=
$buffer
;
if
(/^DEFINITION\s+(\S.*\S)/) {
my
@desc
= ($1);
while
(
defined
(
$_
=
$self
->_readline ) ) {
if
(/^\s+(.*)/) {
push
(
@desc
, $1 );
next
}
last
;
}
$builder
->add_slot_value(
-desc
=>
join
(
' '
,
@desc
) );
$buffer
=
$_
;
}
if
(/^ACCESSION\s+(\S.*\S)/) {
push
(
@acc
,
split
( /\s+/, $1 ) );
while
(
defined
(
$_
=
$self
->_readline ) ) {
/^\s+(.*)/ &&
do
{
push
(
@acc
,
split
( /\s+/, $1 ) );
next
};
last
;
}
$buffer
=
$_
;
next
;
}
elsif
(/^PID\s+(\S+)/) {
$params
{
'-pid'
} = $1;
}
elsif
(/^VERSION\s+(\S.+)$/) {
my
(
$acc
,
$gi
) =
split
(
' '
, $1 );
if
(
$acc
=~ /^\w+\.(\d+)/ ) {
$params
{
'-version'
} = $1;
$params
{
'-seq_version'
} = $1;
}
if
(
$gi
&& (
index
(
$gi
,
"GI:"
) == 0 ) ) {
$params
{
'-primary_id'
} =
substr
(
$gi
, 3 );
}
}
elsif
(/^KEYWORDS\s+(\S.*)/) {
my
@kw
=
split
( /\s*\;\s*/, $1 );
while
(
defined
(
$_
=
$self
->_readline ) ) {
chomp
;
/^\s+(.*)/
&&
do
{
push
(
@kw
,
split
( /\s*\;\s*/, $1 ) );
next
};
last
;
}
@kw
&&
$kw
[-1] =~ s/\.$//;
$params
{
'-keywords'
} = \
@kw
;
$buffer
=
$_
;
next
;
}
elsif
(/^SOURCE\s+\S/) {
if
(
$builder
->want_slot(
'species'
) ) {
$species
=
$self
->_read_GenBank_Species( \
$buffer
);
$builder
->add_slot_value(
-species
=>
$species
);
}
else
{
while
(
defined
(
$buffer
=
$self
->_readline() ) ) {
last
if
substr
(
$buffer
, 0, 1 ) ne
' '
;
}
}
next
;
}
elsif
(/^REFERENCE\s+\S/) {
if
(
$annotation
) {
my
@refs
=
$self
->_read_GenBank_References( \
$buffer
);
foreach
my
$ref
(
@refs
) {
$annotation
->add_Annotation(
'reference'
,
$ref
);
}
}
else
{
while
(
defined
(
$buffer
=
$self
->_readline() ) ) {
last
if
substr
(
$buffer
, 0, 1 ) ne
' '
;
}
}
next
;
}
elsif
(/^PROJECT\s+(\S.*)/) {
if
(
$annotation
) {
my
$project
=
FAST::Bio::Annotation::SimpleValue->new(
-value
=> $1 );
$annotation
->add_Annotation(
'project'
,
$project
);
}
}
elsif
(/^COMMENT\s+(\S.*)/) {
if
(
$annotation
) {
my
$comment
= $1;
while
(
defined
(
$_
=
$self
->_readline ) ) {
last
if
(/^\S/);
$comment
.=
$_
;
}
$comment
=~ s/\n/ /g;
$comment
=~ s/ +/ /g;
$annotation
->add_Annotation(
'comment'
,
FAST::Bio::Annotation::Comment->new(
-text
=>
$comment
,
-tagname
=>
'comment'
)
);
$buffer
=
$_
;
}
else
{
while
(
defined
(
$buffer
=
$self
->_readline() ) ) {
last
if
substr
(
$buffer
, 0, 1 ) ne
' '
;
}
}
next
;
}
elsif
(/^DB(?:SOURCE|LINK)\s+(\S.+)/) {
if
(
$annotation
) {
my
$dbsource
= $1;
while
(
defined
(
$_
=
$self
->_readline ) ) {
last
if
(/^\S/);
$dbsource
.=
$_
;
}
if
(
$dbsource
=~
s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,.+\n// )
{
$annotation
->add_Annotation(
'dblink'
,
FAST::Bio::Annotation::DBLink->new(
-primary_id
=> $2,
-database
=> $1,
-tagname
=>
'dblink'
)
);
if
(
$dbsource
=~ s/\s+created:\s+([^\.]+)\.\n// ) {
$annotation
->add_Annotation(
'swissprot_dates'
,
FAST::Bio::Annotation::SimpleValue->new(
-tagname
=>
'date_created'
,
-value
=> $1
)
);
}
while
(
$dbsource
=~
s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g
)
{
$annotation
->add_Annotation(
'swissprot_dates'
,
FAST::Bio::Annotation::SimpleValue->new(
-tagname
=>
'date_updated'
,
-value
=> $2
)
);
}
$dbsource
=~ s/\n/ /g;
if
(
$dbsource
=~
s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ )
{
my
$i
= 0;
for
my
$dbsrc
(
split
( /,\s+/, $1 ) ) {
if
(
$dbsrc
=~ /(\S+)\.(\d+)/
||
$dbsrc
=~ /(\S+)/ )
{
my
(
$id
,
$version
) = ( $1, $2 );
$version
=
''
unless
defined
$version
;
my
$db
;
if
(
$id
=~ /^\d\S{3}/ ) {
$db
=
'PDB'
;
}
else
{
$db
=
(
$i
++ % 2 ) ?
'GenPept'
:
'GenBank'
;
}
$annotation
->add_Annotation(
'dblink'
,
FAST::Bio::Annotation::DBLink->new(
-primary_id
=>
$id
,
-version
=>
$version
,
-database
=>
$db
,
-tagname
=>
'dblink'
)
);
}
}
}
elsif
(
$dbsource
=~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i )
{
my
$i
= 0;
for
my
$id
(
split
( /\,\s+/, $1 ) ) {
my
(
$acc
,
$db
);
if
(
$id
=~ /gi:\s+(\d+)/ ) {
$acc
= $1;
$db
= (
$i
++ % 2 ) ?
'GenPept'
:
'GenBank'
;
}
elsif
(
$id
=~ /pdb\s+accession\s+(\S+)/ ) {
$acc
= $1;
$db
=
'PDB'
;
}
else
{
$acc
=
$id
;
$db
=
''
;
}
$annotation
->add_Annotation(
'dblink'
,
FAST::Bio::Annotation::DBLink->new(
-primary_id
=>
$acc
,
-database
=>
$db
,
-tagname
=>
'dblink'
)
);
}
}
else
{
$self
->debug(
"Cannot match $dbsource\n"
);
}
if
(
$dbsource
=~
s/xrefs\s+\(non\-sequence\s+databases\):\s+
((?:\S+,\s+)+\S+)//x
)
{
for
my
$id
(
split
( /\,\s+/, $1 ) ) {
my
$db
;
$db
=
substr
(
$id
, 0,
index
(
$id
,
':'
) );
if
( !
exists
$DBSOURCE
{
$db
} ) {
$db
=
''
;
}
$id
=
substr
(
$id
,
index
(
$id
,
':'
) + 1 );
$annotation
->add_Annotation(
'dblink'
,
FAST::Bio::Annotation::DBLink->new(
-primary_id
=>
$id
,
-database
=>
$db
,
-tagname
=>
'dblink'
)
);
}
}
}
else
{
if
(
$dbsource
=~
/^(\S*?):?\s
*accession
\s+(\S+)\.(\d+)/ )
{
my
(
$db
,
$id
,
$version
) = ( $1, $2, $3 );
$annotation
->add_Annotation(
'dblink'
,
FAST::Bio::Annotation::DBLink->new(
-primary_id
=>
$id
,
-version
=>
$version
,
-database
=>
$db
||
'GenBank'
,
-tagname
=>
'dblink'
)
);
}
elsif
(
$dbsource
=~ /^(\S*?):?\s
*accession
\s+(\S+)/ ) {
my
(
$db
,
$id
) = ( $1, $2 );
$annotation
->add_Annotation(
'dblink'
,
FAST::Bio::Annotation::DBLink->new(
-primary_id
=>
$id
,
-database
=>
$db
||
'GenBank'
,
-tagname
=>
'dblink'
)
);
}
elsif
(
$dbsource
=~ /(\S+)([\.:])\s*(\S+)/ ) {
my
(
$db
,
$version
);
my
@ids
= ();
if
( $2 eq
':'
) {
$db
= $1;
@ids
=
split
( /,/, $3 );
}
else
{
(
$db
,
$version
) = (
'GenBank'
, $3 );
$ids
[0] = $1;
}
foreach
my
$id
(
@ids
) {
$annotation
->add_Annotation(
'dblink'
,
FAST::Bio::Annotation::DBLink->new(
-primary_id
=>
$id
,
-version
=>
$version
,
-database
=>
$db
,
-tagname
=>
'dblink'
)
);
}
}
else
{
$self
->
warn
(
"Unrecognized DBSOURCE data: $dbsource\n"
);
}
}
$buffer
=
$_
;
}
else
{
while
(
defined
(
$buffer
=
$self
->_readline() ) ) {
last
if
substr
(
$buffer
, 0, 1 ) ne
' '
;
}
}
next
;
}
last
if
(/^(FEATURES|ORIGIN)/);
$buffer
=
$self
->_readline;
}
return
unless
defined
$buffer
;
$builder
->add_slot_value(
-accession_number
=>
shift
(
@acc
),
-secondary_accessions
=> \
@acc
,
%params
);
$builder
->add_slot_value(
-annotation
=>
$annotation
)
if
$annotation
;
%params
= ();
if
( !
$builder
->want_object() ) {
$builder
->make_object();
next
RECORDSTART;
}
if
(
$builder
->want_slot(
'features'
) &&
defined
(
$_
) && /^FEATURES/o ) {
$buffer
=
$self
->_readline;
while
(
defined
(
$buffer
) ) {
last
if
(
$buffer
=~ /^BASE|ORIGIN|CONTIG|WGS/o );
my
$ftunit
=
$self
->_read_FTHelper_GenBank( \
$buffer
);
if
( !
defined
$ftunit
) {
$self
->
warn
(
"Unexpected error in feature table for "
.
$params
{
'-display_id'
}
.
" Skipping feature, attempting to recover"
);
unless
( (
$buffer
=~ /^\s{5,5}\S+/o )
or (
$buffer
=~ /^\S+/o ) )
{
$buffer
=
$self
->_readline();
}
next
;
}
my
$feat
=
$ftunit
->_generic_seqfeature(
$self
->location_factory(),
$display_id
);
if
(
$species
&& (
$feat
->primary_tag eq
'source'
)
&&
$feat
->has_tag(
'db_xref'
)
&& (
!
$species
->ncbi_taxid()
|| (
$species
->ncbi_taxid
&&
$species
->ncbi_taxid =~ /^list/ )
)
)
{
foreach
my
$tagval
(
$feat
->get_tag_values(
'db_xref'
) ) {
if
(
index
(
$tagval
,
"taxon:"
) == 0 ) {
$species
->ncbi_taxid(
substr
(
$tagval
, 6 ) );
last
;
}
}
}
push
(
@features
,
$feat
);
}
$builder
->add_slot_value(
-features
=> \
@features
);
$_
=
$buffer
;
}
if
(
defined
(
$_
) ) {
if
(/^CONTIG\s+(.*)/o) {
my
$ctg
= $1;
while
(
defined
(
$_
=
$self
->_readline)) {
last
if
m{^ORIGIN|//}o;
s/\s+(.*)/$1/;
$ctg
.=
$_
;
}
if
(
$ctg
) {
$annotation
->add_Annotation(
FAST::Bio::Annotation::SimpleValue->new(
-tagname
=>
'contig'
,
-value
=>
$ctg
)
);
}
}
elsif
(/^WGS|WGS_SCAFLD\s+/o) {
while
(
$_
=~ s/(^WGS|WGS_SCAFLD)\s+// ) {
chomp
;
$annotation
->add_Annotation(
FAST::Bio::Annotation::SimpleValue->new(
-value
=>
$_
,
-tagname
=>
lc
($1)
)
);
$_
=
$self
->_readline;
}
}
elsif
( !m{^ORIGIN|//}o ) {
while
(
defined
(
$_
=
$self
->_readline ) ) {
last
if
m{^(ORIGIN|//)};
}
}
}
if
( !
$builder
->want_object() ) {
$builder
->make_object();
next
RECORDSTART;
}
if
(
$builder
->want_slot(
'seq'
) ) {
if
(
defined
(
$_
) && s/^ORIGIN\s+// ) {
if
(
$annotation
&&
length
(
$_
) > 0 ) {
$annotation
->add_Annotation(
'origin'
,
FAST::Bio::Annotation::SimpleValue->new(
-tagname
=>
'origin'
,
-value
=>
$_
)
);
}
my
$seqc
=
''
;
while
(
defined
(
$_
=
$self
->_readline ) ) {
last
if
m{^//};
$_
=
uc
(
$_
);
s/[^A-Za-z]//g;
$seqc
.=
$_
;
}
$builder
->add_slot_value(
-seq
=>
$seqc
);
}
}
elsif
(
defined
(
$_
) && (
substr
(
$_
, 0, 2 ) ne
'//'
) ) {
while
(
defined
(
$_
=
$self
->_readline ) ) {
last
if
substr
(
$_
, 0, 2 ) eq
'//'
;
}
}
$seq
=
$builder
->make_object();
next
RECORDSTART
unless
$seq
;
last
RECORDSTART;
}
return
$seq
;
}
sub
write_seq {
my
(
$self
,
@seqs
) =
@_
;
foreach
my
$seq
(
@seqs
) {
$self
->throw(
"Attempting to write with no seq!"
)
unless
defined
$seq
;
if
( !
ref
$seq
|| !
$seq
->isa(
'FAST::Bio::SeqI'
) ) {
$self
->
warn
(
" $seq is not a SeqI compliant module. Attempting to dump, but may fail!"
);
}
my
$str
=
$seq
->seq;
my
(
$div
,
$mol
);
my
$len
=
$seq
->
length
();
if
(
$seq
->can(
'division'
) ) {
$div
=
$seq
->division;
}
if
( !
defined
$div
|| !
$div
) {
$div
=
'UNK'
; }
my
$alpha
=
$seq
->alphabet;
if
( !
$seq
->can(
'molecule'
) || !
defined
(
$mol
=
$seq
->molecule()) ) {
$mol
=
$alpha
||
'DNA'
;
}
my
$circular
=
'linear '
;
$circular
=
'circular'
if
$seq
->is_circular;
local
($^W) = 0;
my
$temp_line
;
if
(
$self
->_id_generation_func ) {
$temp_line
= &{
$self
->_id_generation_func}(
$seq
);
}
else
{
my
$date
=
''
;
if
(
$seq
->can(
'get_dates'
) ) {
(
$date
) =
$seq
->get_dates();
}
$self
->
warn
(
"No whitespace allowed in GenBank display id ["
.
$seq
->display_id.
"]"
)
if
$seq
->display_id =~ /\s/;
$temp_line
=
sprintf
(
"%-12s%-15s%13s %s%4s%-8s%-8s %3s %-s\n"
,
'LOCUS'
,
$seq
->id(),
$len
,
(
lc
(
$alpha
) eq
'protein'
) ? (
'aa'
,
''
,
''
) :
(
'bp'
,
''
,
$mol
),
$circular
,
$div
,
$date
);
}
$self
->_print(
$temp_line
);
$self
->_write_line_GenBank_regex(
"DEFINITION "
,
" "
,
$seq
->desc(),
"\\s\+\|\$"
,80);
if
(
$self
->_ac_generation_func ) {
$temp_line
= &{
$self
->_ac_generation_func}(
$seq
);
$self
->_print(
"ACCESSION $temp_line\n"
);
}
else
{
my
@acc
= ();
push
(
@acc
,
$seq
->accession_number());
if
(
$seq
->isa(
'FAST::Bio::Seq::RichSeqI'
) ) {
push
(
@acc
,
$seq
->get_secondary_accessions());
}
$self
->_print(
"ACCESSION "
,
join
(
" "
,
@acc
),
"\n"
);
}
if
(
$seq
->isa(
'FAST::Bio::Seq::RichSeqI'
) &&
$seq
->pid()) {
$self
->_print(
"PID "
,
$seq
->pid(),
"\n"
);
}
if
(
defined
$self
->_sv_generation_func() ) {
$temp_line
= &{
$self
->_sv_generation_func}(
$seq
);
if
(
$temp_line
) {
$self
->_print(
"VERSION $temp_line\n"
);
}
}
else
{
if
(
$seq
->isa(
'FAST::Bio::Seq::RichSeqI'
) &&
defined
(
$seq
->seq_version)) {
my
$id
=
$seq
->primary_id();
$self
->_print(
"VERSION "
,
$seq
->accession_number(),
"."
,
$seq
->seq_version,
(
$id
&& (
$id
=~ /^\d+$/) ?
" GI:"
.
$id
:
""
),
"\n"
);
}
}
for
my
$proj
(
$seq
->annotation->get_Annotations(
'project'
) ) {
$self
->_print(
"PROJECT "
.
$proj
->value.
"\n"
);
}
foreach
my
$ref
(
$seq
->annotation->get_Annotations(
'dblink'
) ) {
my
(
$db
,
$id
) = (
$ref
->database,
$ref
->primary_id);
my
$prefix
=
$db
eq
'Project'
?
'DBLINK'
:
'DBSOURCE'
;
my
$text
=
$db
eq
'GenBank'
?
''
:
$db
eq
'Project'
?
"$db:$id"
:
"$db accession $id"
;
$self
->_print(
sprintf
(
"%-11s %s\n"
,
$prefix
,
$text
));
}
if
(
defined
$self
->_kw_generation_func() ) {
$temp_line
= &{
$self
->_kw_generation_func}(
$seq
);
$self
->_print(
"KEYWORDS $temp_line\n"
);
}
else
{
if
(
$seq
->can(
'keywords'
) ) {
my
$kw
=
$seq
->keywords;
$kw
.=
'.'
if
(
$kw
!~ /\.$/ );
$self
->_print(
"KEYWORDS $kw\n"
);
}
}
foreach
my
$ref
(
$seq
->annotation->get_Annotations(
'segment'
) ) {
$self
->_print(
sprintf
(
"%-11s %s\n"
,
'SEGMENT'
,
$ref
->value));
}
if
(
my
$spec
=
$seq
->species) {
my
(
$on
,
$sn
,
$cn
) = (
$spec
->can(
'organelle'
) ?
$spec
->organelle :
''
,
$spec
->scientific_name,
$spec
->common_name);
my
@classification
;
if
(
$spec
->isa(
'FAST::Bio::Species'
)) {
@classification
=
$spec
->classification;
shift
(
@classification
);
}
else
{
my
$node
=
$spec
;
while
(
$node
) {
$node
=
$node
->ancestor ||
last
;
unshift
(
@classification
,
$node
->node_name);
}
@classification
=
reverse
@classification
;
}
my
$abname
=
$spec
->name(
'abbreviated'
) ?
$spec
->name(
'abbreviated'
)->[0] :
$sn
;
my
$sl
=
$on
?
"$on "
:
''
;
$sl
.=
$cn
?
$abname
.
" ($cn)"
:
"$abname"
;
$self
->_write_line_GenBank_regex(
"SOURCE "
,
' '
x12,
$sl
,
"\\s\+\|\$"
,80);
$self
->_print(
" ORGANISM "
,
$spec
->scientific_name,
"\n"
);
my
$OC
=
join
(
'; '
, (
reverse
(
@classification
))) .
'.'
;
$self
->_write_line_GenBank_regex(
' '
x12,
' '
x12,
$OC
,
"\\s\+\|\$"
,80);
}
my
$count
= 1;
foreach
my
$ref
(
$seq
->annotation->get_Annotations(
'reference'
) ) {
$temp_line
=
"REFERENCE $count"
;
if
(
$ref
->start) {
$temp_line
.=
sprintf
(
" (%s %d to %d)"
,
(
$seq
->alphabet() eq
"protein"
?
"residues"
:
"bases"
),
$ref
->start,
$ref
->end);
}
elsif
(
$ref
->gb_reference) {
$temp_line
.=
sprintf
(
" (%s)"
,
$ref
->gb_reference);
}
$self
->_print(
"$temp_line\n"
);
$self
->_write_line_GenBank_regex(
" AUTHORS "
,
' '
x12,
$ref
->authors,
"\\s\+\|\$"
,80);
$self
->_write_line_GenBank_regex(
" CONSRTM "
,
' '
x12,
$ref
->consortium,
"\\s\+\|\$"
,80)
if
$ref
->consortium;
$self
->_write_line_GenBank_regex(
" TITLE "
,
" "
x12,
$ref
->title,
"\\s\+\|\$"
,80);
$self
->_write_line_GenBank_regex(
" JOURNAL "
,
" "
x12,
$ref
->location,
"\\s\+\|\$"
,80);
if
(
$ref
->medline) {
$self
->_write_line_GenBank_regex(
" MEDLINE "
,
" "
x12,
$ref
->medline,
"\\s\+\|\$"
,80);
}
if
(
$ref
->pubmed ) {
$self
->_write_line_GenBank_regex(
" PUBMED "
,
" "
x12,
$ref
->pubmed,
"\\s\+\|\$"
,
80);
}
if
(
$ref
->comment) {
$self
->_write_line_GenBank_regex(
" REMARK "
,
" "
x12,
$ref
->comment,
"\\s\+\|\$"
,80);
}
$count
++;
}
foreach
my
$comment
(
$seq
->annotation->get_Annotations(
'comment'
) ) {
$self
->_write_line_GenBank_regex(
"COMMENT "
,
" "
x12,
$comment
->text,
"\\s\+\|\$"
,80);
}
$self
->_print(
"FEATURES Location/Qualifiers\n"
);
if
(
defined
$self
->_post_sort ) {
my
$post_sort_func
=
$self
->_post_sort();
my
@fth
;
foreach
my
$sf
(
$seq
->top_SeqFeatures ) {
push
(
@fth
,FAST::Bio::SeqIO::FTHelper::from_SeqFeature(
$sf
,
$seq
));
}
@fth
=
sort
{
&$post_sort_func
(
$a
,
$b
) }
@fth
;
foreach
my
$fth
(
@fth
) {
$self
->_print_GenBank_FTHelper(
$fth
);
}
}
else
{
foreach
my
$sf
(
$seq
->top_SeqFeatures ) {
my
@fth
= FAST::Bio::SeqIO::FTHelper::from_SeqFeature(
$sf
,
$seq
);
foreach
my
$fth
(
@fth
) {
if
( !
$fth
->isa(
'FAST::Bio::SeqIO::FTHelper'
) ) {
$sf
->throw(
"Cannot process FTHelper... $fth"
);
}
$self
->_print_GenBank_FTHelper(
$fth
);
}
}
}
if
(
$seq
->annotation->get_Annotations(
'wgs'
)) {
foreach
my
$wgs
(
map
{
$seq
->annotation->get_Annotations(
$_
)}
qw(wgs wgs_scaffold)
) {
$self
->_print(
sprintf
(
"%-11s %s\n"
,
uc
(
$wgs
->tagname),
$wgs
->value));
}
$self
->_show_dna(0);
}
if
(
$seq
->annotation->get_Annotations(
'contig'
)) {
my
$ct
= 0;
my
$cline
;
foreach
my
$contig
(
$seq
->annotation->get_Annotations(
'contig'
)) {
unless
(
$ct
) {
$cline
=
uc
(
$contig
->tagname).
" "
.
$contig
->value.
"\n"
;
}
else
{
$cline
=
" "
.
$contig
->value.
"\n"
;
}
$self
->_print(
$cline
);
$ct
++;
}
$self
->_show_dna(0);
}
if
(
$seq
->
length
== 0 ) {
$self
->_show_dna(0) }
if
(
$self
->_show_dna() == 0 ) {
$self
->_print(
"\n//\n"
);
return
;
}
$str
=~
tr
/A-Z/a-z/;
my
(
$o
) =
$seq
->annotation->get_Annotations(
'origin'
);
$self
->_print(
sprintf
(
"%-12s%s\n"
,
'ORIGIN'
,
$o
?
$o
->value :
''
));
my
$nuc
= 60;
my
$whole_pat
=
'a10'
x 6;
my
$out_pat
=
'A11'
x 6;
my
$length
=
length
(
$str
);
my
$whole
=
int
(
$length
/
$nuc
) *
$nuc
;
my
$i
;
for
(
$i
= 0;
$i
<
$whole
;
$i
+=
$nuc
) {
my
$blocks
=
pack
$out_pat
,
unpack
$whole_pat
,
substr
(
$str
,
$i
,
$nuc
);
chop
$blocks
;
$self
->_print(
sprintf
(
"%9d $blocks\n"
,
$i
+
$nuc
- 59));
}
if
(
my
$last
=
substr
(
$str
,
$i
)) {
my
$last_len
=
length
(
$last
);
my
$last_pat
=
'a10'
x
int
(
$last_len
/ 10) .
'a'
.
$last_len
% 10;
my
$blocks
=
pack
$out_pat
,
unpack
(
$last_pat
,
$last
);
$blocks
=~ s/ +$//;
$self
->_print(
sprintf
(
"%9d $blocks\n"
,
$length
-
$last_len
+ 1));
}
$self
->_print(
"//\n"
);
$self
->flush
if
$self
->_flush_on_write &&
defined
$self
->_fh;
return
1;
}
}
sub
_print_GenBank_FTHelper {
my
(
$self
,
$fth
) =
@_
;
if
( !
ref
$fth
|| !
$fth
->isa(
'FAST::Bio::SeqIO::FTHelper'
) ) {
$fth
->
warn
(
"$fth is not a FTHelper class. Attempting to print, but there could be tears!"
);
}
my
$spacer
= (
length
$fth
->key >= 15) ?
' '
:
''
;
$self
->_write_line_GenBank_regex(
sprintf
(
" %-16s%s"
,
$fth
->key,
$spacer
),
" "
x21,
$fth
->loc,
"\,\|\$"
,80);
foreach
my
$tag
(
keys
%{
$fth
->field} ) {
foreach
my
$value
( @{
$fth
->field->{
$tag
}} ) {
$value
=~ s/\"/\"\"/g;
if
(
$value
eq
"_no_value"
) {
$self
->_write_line_GenBank_regex(
" "
x21,
" "
x21,
"/$tag"
,
"\.\|\$"
,80);
}
elsif
(!
$FTQUAL_NO_QUOTE
{
$tag
} or
length
(
"/$tag=$value"
) >=
$FTQUAL_LINE_LENGTH
) {
my
(
$pat
) = (
$value
=~ /\s/ ?
'\s|$'
:
'.|$'
);
$self
->_write_line_GenBank_regex(
" "
x21,
" "
x21,
"/$tag=\"$value\""
,
$pat
,80);
}
else
{
$self
->_write_line_GenBank_regex(
" "
x21,
" "
x21,
"/$tag=$value"
,
"\.\|\$"
,80);
}
}
}
}
sub
_read_GenBank_References {
my
(
$self
,
$buffer
) =
@_
;
my
(
@refs
);
my
$ref
;
if
(
$$buffer
!~ /^REFERENCE/ ) {
warn
(
"Not parsing line '$$buffer' which maybe important"
);
}
$_
=
$$buffer
;
my
(
@title
,
@loc
,
@authors
,
@consort
,
@com
,
@medline
,
@pubmed
);
REFLOOP:
while
(
defined
(
$_
) ||
defined
(
$_
=
$self
->_readline) ) {
if
(/^\s{2}AUTHORS\s+(.*)/o) {
push
(
@authors
, $1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/o &&
do
{
push
(
@authors
, $1);
next
;};
last
;
}
$ref
->authors(
join
(
' '
,
@authors
));
}
if
(/^\s{2}CONSRTM\s+(.*)/o) {
push
(
@consort
, $1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/o &&
do
{
push
(
@consort
, $1);
next
;};
last
;
}
$ref
->consortium(
join
(
' '
,
@consort
));
}
if
(/^\s{2}TITLE\s+(.*)/o) {
push
(
@title
, $1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/o &&
do
{
push
(
@title
, $1);
next
;
};
last
;
}
$ref
->title(
join
(
' '
,
@title
));
}
if
(/^\s{2}JOURNAL\s+(.*)/o) {
push
(
@loc
, $1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/o &&
do
{
push
(
@loc
, $1);
next
;
};
last
;
}
$ref
->location(
join
(
' '
,
@loc
));
redo
REFLOOP;
}
if
(/^\s{2}REMARK\s+(.*)/o) {
push
(
@com
, $1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/o &&
do
{
push
(
@com
, $1);
next
;
};
last
;
}
$ref
->comment(
join
(
' '
,
@com
));
redo
REFLOOP;
}
if
( /^\s{2}MEDLINE\s+(.*)/ ) {
push
(
@medline
,$1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/ &&
do
{
push
(
@medline
, $1);
next
;
};
last
;
}
$ref
->medline(
join
(
' '
,
@medline
));
redo
REFLOOP;
}
if
( /^\s{3}PUBMED\s+(.*)/ ) {
push
(
@pubmed
,$1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/ &&
do
{
push
(
@pubmed
, $1);
next
;
};
last
;
}
$ref
->pubmed(
join
(
' '
,
@pubmed
));
redo
REFLOOP;
}
/^REFERENCE/o &&
do
{
$self
->_add_ref_to_array(\
@refs
,
$ref
)
if
defined
$ref
;
@authors
= ();
@title
= ();
@loc
= ();
@com
= ();
@pubmed
= ();
@medline
= ();
$ref
= FAST::Bio::Annotation::Reference->new(
-tagname
=>
'reference'
);
if
(/^REFERENCE\s+\d+\s+\([a-z]+ (\d+) to (\d+)\)/){
$ref
->start($1);
$ref
->end($2);
}
elsif
(/^REFERENCE\s+\d+\s+\((.*)\)/) {
$ref
->gb_reference($1);
}
};
/^(FEATURES)|(COMMENT)/o &&
last
;
$_
=
undef
;
}
$self
->_add_ref_to_array(\
@refs
,
$ref
)
if
defined
$ref
;
$$buffer
=
$_
;
return
@refs
;
}
sub
_add_ref_to_array {
my
(
$self
,
$refs
,
$ref
) =
@_
;
my
$au
=
$ref
->authors();
my
$title
=
$ref
->title();
$au
=~ s/;\s*$//g
if
$au
;
$title
=~ s/;\s*$//g
if
$title
;
$ref
->authors(
$au
);
$ref
->title(
$title
);
push
(@{
$refs
},
$ref
);
}
sub
_read_GenBank_Species {
my
(
$self
,
$buffer
) =
@_
;
my
@unkn_names
= (
'other'
,
'unknown organism'
,
'not specified'
,
'not shown'
,
'Unspecified'
,
'Unknown'
,
'None'
,
'unclassified'
,
'unidentified organism'
,
'not supplied'
);
my
@unkn_genus
= (
'unknown'
,
'unclassified'
,
'uncultured'
,
'unidentified'
);
my
$line
=
$$buffer
;
my
(
$sub_species
,
$species
,
$genus
,
$sci_name
,
$common
,
$class_lines
,
$source_flag
,
$abbr_name
,
$organelle
,
$sl
);
my
%source
=
map
{
$_
=> 1 }
qw(SOURCE ORGANISM CLASSIFICATION)
;
my
(
$ann
,
$tag
,
$data
);
while
(
defined
(
$line
) ||
defined
(
$line
=
$self
->_readline())) {
$line
=~ s{<[^>]+>}{}g;
if
(
$line
=~ m{^(?:\s{0,2})(\w+)\s+(.+)?$}ox) {
(
$tag
,
$data
) = ($1, $2 ||
''
);
last
if
(
$tag
&& !
exists
$source
{
$tag
});
}
else
{
return
unless
$tag
;
(
$data
=
$line
) =~ s{^\s+}{};
chomp
$data
;
$tag
=
'CLASSIFICATION'
if
(
$tag
ne
'CLASSIFICATION'
&&
$tag
eq
'ORGANISM'
&&
$line
=~ m{[;\.]+});
}
(
exists
$ann
->{
$tag
}) ? (
$ann
->{
$tag
} .=
' '
.
$data
) : (
$ann
->{
$tag
} .=
$data
);
$line
=
undef
;
}
(
$sl
,
$class_lines
,
$sci_name
) = (
$ann
->{SOURCE},
$ann
->{CLASSIFICATION},
$ann
->{ORGANISM});
$$buffer
=
$line
;
$sci_name
||
return
;
if
(
$sl
=~ m{^
(mitochondrion|chloroplast|plastid)?
\s*(.*?)
\s*(?: \( (.*?) \) )?\.?
$
}xms ){
(
$organelle
,
$abbr_name
,
$common
) = ($1, $2, $3);
}
else
{
$abbr_name
=
$sl
;
}
my
@class
=
map
{ s/^\s+//; s/\s+$//; s/\s{2,}/ /g;
$_
; }
split
/(?<!subgen)[;\.]+/,
$class_lines
;
my
$possible_genus
=
quotemeta
(
$class
[-1])
. (
$class
[-2] ?
"|"
.
quotemeta
(
$class
[-2]) :
''
);
if
(
$sci_name
=~ /^(
$possible_genus
)/) {
$genus
= $1;
(
$species
) =
$sci_name
=~ /^
$genus
\s+(.+)/;
}
else
{
$species
=
$sci_name
;
}
if
(
$species
&&
$species
=~ /(.+)\s+((?:subsp\.|var\.).+)/) {
(
$species
,
$sub_species
) = ($1, $2);
}
my
$unkn
=
grep
{
$_
eq
$sl
}
@unkn_names
;
return
unless
(
$species
||
$genus
) and
$unkn
== 0;
push
(
@class
,
$sci_name
);
@class
=
reverse
@class
;
my
$make
= FAST::Bio::Species->new();
$make
->scientific_name(
$sci_name
);
$make
->classification(
@class
)
if
@class
> 0;
$make
->common_name(
$common
)
if
$common
;
$make
->name(
'abbreviated'
,
$abbr_name
)
if
$abbr_name
;
$make
->organelle(
$organelle
)
if
$organelle
;
return
$make
;
}
sub
_read_FTHelper_GenBank {
my
(
$self
,
$buffer
) =
@_
;
my
(
$key
,
$loc
);
my
@qual
= ();
if
(
$$buffer
=~ /^\s{5}(\S+)\s+(.+?)\s*$/o) {
$key
= $1;
$loc
= $2;
while
(
defined
(
$_
=
$self
->_readline) ) {
if
(/^(\s+)(.+?)\s*$/o) {
if
(
length
($1) > 6) {
if
(
@qual
|| (
index
($2,
'/'
) == 0)) {
push
(
@qual
, $2);
}
else
{
$loc
.= $2;
}
}
else
{
last
;
}
}
else
{
last
;
}
}
}
else
{
$self
->debug(
"no feature key!\n"
);
$$buffer
=
$self
->_readline();
return
;
}
$$buffer
=
$_
;
my
$out
= FAST::Bio::SeqIO::FTHelper->new();
$out
->verbose(
$self
->verbose());
$out
->key(
$key
);
$out
->loc(
$loc
);
QUAL:
for
(
my
$i
= 0;
$i
<
@qual
;
$i
++) {
$_
=
$qual
[
$i
];
my
(
$qualifier
,
$value
) = (m{^/([^=]+)(?:=(.+))?})
or
$self
->
warn
(
"cannot see new qualifier in feature $key: "
.
$qual
[
$i
]);
$qualifier
=
''
unless
(
defined
$qualifier
);
if
(
defined
$value
) {
if
(
substr
(
$value
, 0, 1) eq
'"'
) {
while
(
$value
!~ /\
"$/ or $value =~ tr/"
/"/ % 2) {
if
(
$i
>=
$#qual
) {
$self
->
warn
(
"Unbalanced quote in:\n"
.
join
(
"\n"
,
@qual
) .
"No further qualifiers will "
.
"be added for this feature"
);
last
QUAL;
}
$i
++;
my
$next
=
$qual
[
$i
];
if
(
$qualifier
!~ /^translation$/i ) {
$value
.=
" "
;
}
$value
.=
$next
;
}
$value
=~ s/^
"|"
$//g;
$value
=~ s/
""
/\"/g;
}
elsif
(
$value
=~ /^\(/ ) {
my
$left
= (
$value
=~
tr
/\(/\(/);
my
$right
= (
$value
=~
tr
/\)/\)/);
while
(
$left
!=
$right
) {
if
(
$i
>=
$#qual
) {
$self
->
warn
(
"Unbalanced parens in:\n"
.
join
(
"\n"
,
@qual
).
"\nNo further qualifiers will "
.
"be added for this feature"
);
last
QUAL;
}
$i
++;
my
$next
=
$qual
[
$i
];
$value
.=
$next
;
$left
+= (
$next
=~
tr
/\(/\(/);
$right
+= (
$next
=~
tr
/\)/\)/);
}
}
}
else
{
$value
=
'_no_value'
;
}
$out
->field->{
$qualifier
} ||= [];
push
(@{
$out
->field->{
$qualifier
}},
$value
);
}
return
$out
;
}
sub
_write_line_GenBank {
my
(
$self
,
$pre1
,
$pre2
,
$line
,
$length
) =
@_
;
$length
||
$self
->throw(
"Miscalled write_line_GenBank without length. Programming error!"
);
my
$subl
=
$length
-
length
$pre2
;
my
$linel
=
length
$line
;
my
$i
;
my
$subr
=
substr
(
$line
,0,
$length
-
length
$pre1
);
$self
->_print(
"$pre1$subr\n"
);
for
(
$i
= (
$length
-
length
$pre1
);
$i
<
$linel
;
$i
+=
$subl
) {
$subr
=
substr
(
$line
,
$i
,
$subl
);
$self
->_print(
"$pre2$subr\n"
);
}
}
sub
_write_line_GenBank_regex {
my
(
$self
,
$pre1
,
$pre2
,
$line
,
$regex
,
$length
) =
@_
;
$length
||
$self
->throw(
"Miscalled write_line_GenBank without length. Programming error!"
);
my
$subl
=
$length
- (
length
$pre1
) - 2;
my
@lines
= ();
CHUNK:
while
(
$line
) {
foreach
my
$pat
(
$regex
,
'[,;\.\/-]\s|'
.
$regex
,
'[,;\.\/-]|'
.
$regex
) {
if
(
$line
=~ m/^(.{0,
$subl
})(
$pat
)(.*)/ ) {
my
$l
= $1.$2;
$line
=
substr
(
$line
,
length
(
$l
));
$l
=~ s/\s+$//;
next
CHUNK
if
(
$l
eq
''
);
push
(
@lines
,
$l
);
next
CHUNK;
}
}
$self
->
warn
(
"trouble dissecting \"$line\"\n into chunks "
.
"of $subl chars or less - this tag won't print right"
);
$line
=
substr
(
$line
,0,
$subl
) .
" "
.
substr
(
$line
,
$subl
);
}
my
$s
=
shift
@lines
;
$self
->_print(
"$pre1$s\n"
)
if
$s
;
foreach
my
$s
(
@lines
) {
$self
->_print(
"$pre2$s\n"
);
}
}
sub
_post_sort {
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'_post_sort'
} =
$value
;
}
return
$obj
->{
'_post_sort'
};
}
sub
_show_dna {
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'_show_dna'
} =
$value
;
}
return
$obj
->{
'_show_dna'
};
}
sub
_id_generation_func {
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'_id_generation_func'
} =
$value
;
}
return
$obj
->{
'_id_generation_func'
};
}
sub
_ac_generation_func {
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'_ac_generation_func'
} =
$value
;
}
return
$obj
->{
'_ac_generation_func'
};
}
sub
_sv_generation_func {
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'_sv_generation_func'
} =
$value
;
}
return
$obj
->{
'_sv_generation_func'
};
}
sub
_kw_generation_func {
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'_kw_generation_func'
} =
$value
;
}
return
$obj
->{
'_kw_generation_func'
};
}
1;