use
vars
qw(%FTQUAL_NO_QUOTE)
;
%FTQUAL_NO_QUOTE
=(
'anticodon'
=> 1,
'citation'
=> 1,
'codon'
=> 1,
'codon_start'
=> 1,
'cons_splice'
=> 1,
'direction'
=> 1,
'evidence'
=> 1,
'label'
=> 1,
'mod_base'
=> 1,
'number'
=> 1,
'rpt_type'
=> 1,
'rpt_unit'
=> 1,
'transl_except'
=> 1,
'transl_table'
=> 1,
'usedin'
=> 1,
);
sub
_initialize {
my
(
$self
,
@args
) =
@_
;
$self
->SUPER::_initialize(
@args
);
$self
->{
'_func_ftunit_hash'
} = {};
$self
->_show_dna(1);
if
( !
defined
$self
->sequence_factory ) {
$self
->sequence_factory(new Bio::Seq::SeqFactory
(
-verbose
=>
$self
->verbose(),
-type
=>
'Bio::Seq::RichSeq'
));
}
}
sub
next_seq {
my
(
$self
,
@args
) =
@_
;
my
$builder
=
$self
->sequence_builder();
my
$seq
;
my
%params
;
RECORDSTART:
while
(1) {
my
$buffer
;
my
(
@acc
,
@features
);
my
(
$display_id
,
$annotation
);
my
$species
;
@features
= ();
$annotation
=
undef
;
@acc
= ();
$species
=
undef
;
%params
= (
-verbose
=>
$self
->verbose);
local
($/) =
"\n"
;
while
(
defined
(
$buffer
=
$self
->_readline())) {
last
if
index
(
$buffer
,
'LOCUS '
) == 0;
}
return
unless
defined
$buffer
;
$buffer
=~ /^LOCUS\s+(\S.*)$/o ||
$self
->throw(
"GenBank stream with bad LOCUS line. Not GenBank in my book. Got '$buffer'"
);
my
@tokens
=
split
(
' '
, $1);
$display_id
=
shift
(
@tokens
);
$params
{
'-display_id'
} =
$display_id
;
$params
{
'-length'
} =
shift
(
@tokens
);
$params
{
'-alphabet'
} = (
lc
(
shift
@tokens
) eq
'bp'
) ?
'dna'
:
'protein'
;
if
((
$params
{
'-alphabet'
} eq
'dna'
) || (
@tokens
> 2)) {
$params
{
'-molecule'
} =
shift
(
@tokens
);
my
$circ
=
shift
(
@tokens
);
if
(
$circ
eq
'circular'
) {
$params
{
'-is_circular'
} = 1;
$params
{
'-division'
} =
shift
(
@tokens
);
}
else
{
$params
{
'-division'
} =
(CORE::
length
(
$circ
) == 3 ) ?
$circ
:
shift
(
@tokens
);
}
}
else
{
$params
{
'-molecule'
} =
'PRT'
if
(
$params
{
'-alphabet'
} eq
'aa'
);
$params
{
'-division'
} =
shift
(
@tokens
);
}
my
$date
=
join
(
' '
,
@tokens
);
if
(
$date
=~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) {
if
(
length
(
$date
) < 11 ) {
my
(
$d
,
$m
,
$y
) = ($2,$3,$4);
if
(
length
(
$d
) == 1 ) {
$d
=
"0$d"
;
}
if
(
length
(
$y
) == 2 ) {
if
(
$y
> 60 ) {
$y
=
"19$y"
;
}
else
{
$y
=
"20$y"
;
}
$self
->
warn
(
"Date was malformed, guessing the century for $date to be $y\n"
);
}
$params
{
'-dates'
} = [
join
(
'-'
,
$d
,
$m
,
$y
)];
}
else
{
$params
{
'-dates'
} = [
$date
];
}
}
$builder
->add_slot_value(
%params
);
%params
= ();
if
(!
$builder
->want_object()) {
$builder
->make_object();
next
RECORDSTART;
}
if
(
$builder
->want_slot(
'annotation'
)) {
$annotation
= new Bio::Annotation::Collection;
}
$buffer
=
$self
->_readline();
until
( !
defined
(
$buffer
) ) {
$_
=
$buffer
;
if
(/^DEFINITION\s+(\S.*\S)/) {
my
@desc
= ($1);
while
(
defined
(
$_
=
$self
->_readline) ) {
if
( /^\s+(.*)/ ) {
push
(
@desc
, $1);
next
};
last
;
}
$builder
->add_slot_value(
-desc
=>
join
(
' '
,
@desc
));
$buffer
=
$_
;
}
if
( /^ACCESSION\s+(\S.*\S)/ ) {
push
(
@acc
,
split
(/\s+/,$1));
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s+(.*)/ &&
do
{
push
(
@acc
,
split
(/\s+/,$1));
next
};
last
;
}
$buffer
=
$_
;
next
;
}
elsif
( /^PID\s+(\S+)/ ) {
$params
{
'-pid'
} = $1;
}
elsif
( /^VERSION\s+(.+)$/ ) {
my
(
$acc
,
$gi
) =
split
(
' '
,$1);
if
(
$acc
=~ /^\w+\.(\d+)/) {
$params
{
'-version'
} = $1;
$params
{
'-seq_version'
} = $1;
}
if
(
$gi
&& (
index
(
$gi
,
"GI:"
) == 0)) {
$params
{
'-primary_id'
} =
substr
(
$gi
,3);
}
}
elsif
( /^KEYWORDS\s+(.*)/ ) {
my
@kw
=
split
(/\s*\;\s*/,$1);
while
(
defined
(
$_
=
$self
->_readline) ) {
chomp
;
/^\s+(.*)/ &&
do
{
push
(
@kw
,
split
(/\s*\;\s*/,$1));
next
};
last
;
}
@kw
&&
$kw
[-1] =~ s/\.$//;
$params
{
'-keywords'
} = \
@kw
;
$buffer
=
$_
;
next
;
}
elsif
(/^SOURCE/) {
if
(
$builder
->want_slot(
'species'
)) {
$species
=
$self
->_read_GenBank_Species(\
$buffer
);
$builder
->add_slot_value(
-species
=>
$species
);
}
else
{
while
(
defined
(
$buffer
=
$self
->_readline())) {
last
if
substr
(
$buffer
,0,1) ne
' '
;
}
}
next
;
}
elsif
(/^REFERENCE/) {
if
(
$annotation
) {
my
@refs
=
$self
->_read_GenBank_References(\
$buffer
);
foreach
my
$ref
(
@refs
) {
$annotation
->add_Annotation(
'reference'
,
$ref
);
}
}
else
{
while
(
defined
(
$buffer
=
$self
->_readline())) {
last
if
substr
(
$buffer
,0,1) ne
' '
;
}
}
next
;
}
elsif
(/^COMMENT\s+(.*)/) {
if
(
$annotation
) {
my
$comment
= $1;
while
(
defined
(
$_
=
$self
->_readline)) {
last
if
(/^\S/);
$comment
.=
$_
;
}
$comment
=~ s/\n/ /g;
$comment
=~ s/ +/ /g;
$annotation
->add_Annotation(
'comment'
,
Bio::Annotation::Comment->new(
-text
=>
$comment
,
-tagname
=>
'comment'
));
$buffer
=
$_
;
}
else
{
while
(
defined
(
$buffer
=
$self
->_readline())) {
last
if
substr
(
$buffer
,0,1) ne
' '
;
}
}
next
;
}
elsif
( /^DBSOURCE\s+(.+)/ ) {
if
(
$annotation
) {
my
$dbsource
= $1;
while
(
defined
(
$_
=
$self
->_readline)) {
last
if
(/^\S/);
$dbsource
.=
$_
;
}
if
(
$dbsource
=~ s/swissprot:\s+locus\s+(\S+)\,.+\n// ) {
$annotation
->add_Annotation
(
'dblink'
,
Bio::Annotation::DBLink->new
(
-primary_id
=> $1,
-database
=>
'swissprot'
,
-tagname
=>
'dblink'
));
if
(
$dbsource
=~ s/\s+created:\s+([^\.]+)\.\n// ) {
$annotation
->add_Annotation
(
'swissprot_dates'
,
Bio::Annotation::SimpleValue->new
(
-tagname
=>
'date_created'
,
-value
=> $1));
}
while
(
$dbsource
=~ s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) {
$annotation
->add_Annotation
(
'swissprot_dates'
,
Bio::Annotation::SimpleValue->new
(
-tagname
=>
'date_updated'
,
-value
=> $1));
}
$dbsource
=~ s/\n/ /g;
if
(
$dbsource
=~ s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) {
my
$i
= 0;
for
my
$dbsrc
(
split
(/,\s+/,$1) ) {
if
(
$dbsrc
=~ /(\S+)\.(\d+)/ ||
$dbsrc
=~ /(\S+)/ ) {
my
(
$id
,
$version
) = ($1,$2);
$version
=
''
unless
defined
$version
;
my
$db
;
if
(
$id
=~ /^\d\S{3}/) {
$db
=
'PDB'
;
}
else
{
$db
= (
$i
++ % 2 ) ?
'GenPept'
:
'GenBank'
;
}
$annotation
->add_Annotation
(
'dblink'
,
Bio::Annotation::DBLink->new
(
-primary_id
=>
$id
,
-version
=>
$version
,
-database
=>
$db
,
-tagname
=>
'dblink'
));
}
}
}
elsif
(
$dbsource
=~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i ) {
my
$i
= 0;
for
my
$id
(
split
(/\,\s+/,$1) ) {
my
(
$acc
,
$db
);
if
(
$id
=~ /gi:\s+(\d+)/ ) {
$acc
= $1;
$db
= (
$i
++ % 2 ) ?
'GenPept'
:
'GenBank'
;
}
elsif
(
$id
=~ /pdb\s+accession\s+(\S+)/ ) {
$acc
= $1;
$db
=
'PDB'
;
}
else
{
$acc
=
$id
;
$db
=
''
;
}
$annotation
->add_Annotation
(
'dblink'
,
Bio::Annotation::DBLink->new
(
-primary_id
=>
$acc
,
-database
=>
$db
,
-tagname
=>
'dblink'
));
}
}
else
{
$self
->debug(
"Cannot match $dbsource\n"
);
}
if
(
$dbsource
=~ s/xrefs\s+\(non\-sequence\s+databases\):\s+
((?:\S+,\s+)+\S+)//x ) {
for
my
$id
(
split
(/\,\s+/,$1) ) {
my
$db
;
if
(
$id
=~ s/^(EchoBASE|IntAct|SWISS-2DPAGE|ECO2DBASE|ECOGENE|TIGRFAMs|TIGR|GO|InterPro|Pfam|PROSITE|SGD|GermOnline|HSSP|PhosSite)//i ) {
$db
= $1;
}
$annotation
->add_Annotation
(
'dblink'
,
Bio::Annotation::DBLink->new
(
-primary_id
=>
$id
,
-database
=>
$db
,
-tagname
=>
'dblink'
));
}
}
}
else
{
if
(
$dbsource
=~ /(\S+)\.(\d+)/ ) {
my
(
$id
,
$version
) = ($1,$2);
$annotation
->add_Annotation
(
'dblink'
,
Bio::Annotation::DBLink->new
(
-primary_id
=>
$id
,
-version
=>
$version
,
-database
=>
'GenBank'
,
-tagname
=>
'dblink'
));
}
}
$buffer
=
$_
;
}
else
{
while
(
defined
(
$buffer
=
$self
->_readline())) {
last
if
substr
(
$buffer
,0,1) ne
' '
;
}
}
next
;
}
last
if
( /^(FEATURES|ORIGIN)/ );
$buffer
=
$self
->_readline;
}
return
unless
defined
$buffer
;
$builder
->add_slot_value(
-accession_number
=>
shift
(
@acc
),
-secondary_accessions
=> \
@acc
,
%params
);
$builder
->add_slot_value(
-annotation
=>
$annotation
)
if
$annotation
;
%params
= ();
if
(!
$builder
->want_object()) {
$builder
->make_object();
next
RECORDSTART;
}
if
(
$builder
->want_slot(
'features'
) &&
defined
(
$_
) && /^FEATURES/o) {
$buffer
=
$self
->_readline;
while
(
defined
(
$buffer
) ) {
last
if
(
$buffer
=~ /^BASE|ORIGIN|CONTIG|WGS/o);
my
$ftunit
=
$self
->_read_FTHelper_GenBank(\
$buffer
);
if
( !
defined
$ftunit
) {
$self
->
warn
(
"Unexpected error in feature table for "
.
$params
{
'-display_id'
}.
" Skipping feature, attempting to recover"
);
unless
( (
$buffer
=~ /^\s{5,5}\S+/o) or
(
$buffer
=~ /^\S+/o)) {
$buffer
=
$self
->_readline();
}
next
;
}
my
$feat
=
$ftunit
->_generic_seqfeature(
$self
->location_factory(),
$display_id
);
if
(
$species
&& (
$feat
->primary_tag eq
'source'
) &&
$feat
->has_tag(
'db_xref'
) && (!
$species
->ncbi_taxid() ||
(
$species
->ncbi_taxid &&
$species
->ncbi_taxid =~ /^list/))) {
foreach
my
$tagval
(
$feat
->get_tag_values(
'db_xref'
)) {
if
(
index
(
$tagval
,
"taxon:"
) == 0) {
$species
->ncbi_taxid(
substr
(
$tagval
,6));
last
;
}
}
}
push
(
@features
,
$feat
);
}
$builder
->add_slot_value(
-features
=> \
@features
);
$_
=
$buffer
;
}
if
(
defined
(
$_
) ) {
if
( /^CONTIG/o ) {
my
@contig
;
while
(
$_
!~ m{^//}) {
$_
=~ /^(?:CONTIG)?\s+(.*)/;
$annotation
->add_Annotation(
Bio::Annotation::SimpleValue->new(
-value
=> $1,
-tagname
=>
'CONTIG'
));
$_
=
$self
->_readline;
}
$self
->_pushback(
$_
);
}
elsif
( /^WGS|WGS_SCAFLD\s+/o ) {
while
(
$_
=~ s/(^WGS|WGS_SCAFLD)\s+//){
chomp
;
$annotation
->add_Annotation(
Bio::Annotation::SimpleValue->new(
-value
=>
$_
,
-tagname
=> $1));
$_
=
$self
->_readline;
}
}
elsif
(! m{^(ORIGIN|//)} ) {
while
(
defined
(
$_
=
$self
->_readline) ) {
last
if
m{^(ORIGIN|//)};
}
}
}
if
(!
$builder
->want_object()) {
$builder
->make_object();
next
RECORDSTART;
}
if
(
$builder
->want_slot(
'seq'
)) {
if
(
defined
(
$_
) && s/^ORIGIN\s+//) {
chomp
;
if
(
$annotation
&&
length
(
$_
) > 0 ) {
$annotation
->add_Annotation(
'origin'
,
Bio::Annotation::SimpleValue->new(
-tagname
=>
'origin'
,
-value
=>
$_
));
}
my
$seqc
=
''
;
while
(
defined
(
$_
=
$self
->_readline) ) {
m{^//} &&
last
;
$_
=
uc
(
$_
);
s/[^A-Za-z]//g;
$seqc
.=
$_
;
}
$self
->debug(
"sequence length is "
.
length
(
$seqc
) .
"\n"
);
$builder
->add_slot_value(
-seq
=>
$seqc
);
}
}
elsif
(
defined
(
$_
) && (
substr
(
$_
,0,2) ne
'//'
)) {
while
(
defined
(
$_
=
$self
->_readline) ) {
last
if
substr
(
$_
,0,2) eq
'//'
;
}
}
$seq
=
$builder
->make_object();
next
RECORDSTART
unless
$seq
;
last
RECORDSTART;
}
return
$seq
;
}
sub
write_seq {
my
(
$self
,
@seqs
) =
@_
;
foreach
my
$seq
(
@seqs
) {
$self
->throw(
"Attempting to write with no seq!"
)
unless
defined
$seq
;
if
( !
ref
$seq
|| !
$seq
->isa(
'Bio::SeqI'
) ) {
$self
->
warn
(
" $seq is not a SeqI compliant module. Attempting to dump, but may fail!"
);
}
my
$str
=
$seq
->seq;
my
(
$div
,
$mol
);
my
$len
=
$seq
->
length
();
if
(
$seq
->can(
'division'
) ) {
$div
=
$seq
->division;
}
if
( !
defined
$div
|| !
$div
) {
$div
=
'UNK'
; }
my
$alpha
=
$seq
->alphabet;
if
( !
$seq
->can(
'molecule'
) || !
defined
(
$mol
=
$seq
->molecule()) ) {
$mol
=
$alpha
||
'DNA'
;
}
my
$circular
=
'linear '
;
$circular
=
'circular'
if
$seq
->is_circular;
local
($^W) = 0;
my
$temp_line
;
if
(
$self
->_id_generation_func ) {
$temp_line
= &{
$self
->_id_generation_func}(
$seq
);
}
else
{
my
$date
=
''
;
if
(
$seq
->can(
'get_dates'
) ) {
(
$date
) =
$seq
->get_dates();
}
$self
->
warn
(
"No whitespace allowed in GenBank display id ["
.
$seq
->display_id.
"]"
)
if
$seq
->display_id =~ /\s/;
$temp_line
=
sprintf
(
"%-12s%-15s%13s %s%4s%-8s%-8s %3s %-s"
,
'LOCUS'
,
$seq
->id(),
$len
,
(
lc
(
$alpha
) eq
'protein'
) ? (
'aa'
,
''
,
''
) :
(
'bp'
,
''
,
$mol
),
$circular
,
$div
,
$date
);
}
$self
->_print(
"$temp_line\n"
);
$self
->_write_line_GenBank_regex(
"DEFINITION "
,
" "
,
$seq
->desc(),
"\\s\+\|\$"
,80);
if
(
$self
->_ac_generation_func ) {
$temp_line
= &{
$self
->_ac_generation_func}(
$seq
);
$self
->_print(
"ACCESSION $temp_line\n"
);
}
else
{
my
@acc
= ();
push
(
@acc
,
$seq
->accession_number());
if
(
$seq
->isa(
'Bio::Seq::RichSeqI'
) ) {
push
(
@acc
,
$seq
->get_secondary_accessions());
}
$self
->_print(
"ACCESSION "
,
join
(
" "
,
@acc
),
"\n"
);
}
if
(
$seq
->isa(
'Bio::Seq::RichSeqI'
) &&
$seq
->pid()) {
$self
->_print(
"PID "
,
$seq
->pid(),
"\n"
);
}
if
(
defined
$self
->_sv_generation_func() ) {
$temp_line
= &{
$self
->_sv_generation_func}(
$seq
);
if
(
$temp_line
) {
$self
->_print(
"VERSION $temp_line\n"
);
}
}
else
{
if
(
$seq
->isa(
'Bio::Seq::RichSeqI'
) &&
defined
(
$seq
->seq_version)) {
my
$id
=
$seq
->primary_id();
$self
->_print(
"VERSION "
,
$seq
->accession_number(),
"."
,
$seq
->seq_version,
(
$id
&& (
$id
=~ /^\d+$/) ?
" GI:"
.
$id
:
""
),
"\n"
);
}
}
foreach
my
$ref
(
$seq
->annotation->get_Annotations(
'dblink'
) ) {
$self
->_print(
'DBSOURCE accession '
,
$ref
->primary_id,
"\n"
);
}
if
(
defined
$self
->_kw_generation_func() ) {
$temp_line
= &{
$self
->_kw_generation_func}(
$seq
);
$self
->_print(
"KEYWORDS $temp_line\n"
);
}
else
{
if
(
$seq
->can(
'keywords'
) ) {
my
$kw
=
$seq
->keywords;
$kw
.=
'.'
if
(
$kw
!~ /\.$/ );
$self
->_print(
"KEYWORDS $kw\n"
);
}
}
foreach
my
$ref
(
$seq
->annotation->get_Annotations(
'segment'
) ) {
$self
->_print(
sprintf
(
"%-11s %s\n"
,
'SEGMENT'
,
$ref
->value));
}
if
(
my
$spec
=
$seq
->species) {
my
(
$on
,
$sn
,
$cn
) = (
$spec
->organelle,
$spec
->scientific_name,
$spec
->common_name);
my
$abname
=
$spec
->name(
'abbreviated'
) ?
$spec
->name(
'abbreviated'
)->[0] :
$sn
;
my
$sl
=
$on
?
"$on "
:
''
;
$sl
.=
$cn
?
$abname
.
" ($cn)."
:
"$abname."
;
$self
->_write_line_GenBank_regex(
"SOURCE "
,
' '
x12,
$sl
,
"\\s\+\|\$"
,80);
$self
->_print(
" ORGANISM "
,
$spec
->scientific_name,
"\n"
);
my
@classification
=
$spec
->classification;
shift
(
@classification
);
my
$OC
=
join
(
'; '
, (
reverse
(
@classification
))) .
'.'
;
$self
->_write_line_GenBank_regex(
' '
x12,
' '
x12,
$OC
,
"\\s\+\|\$"
,80);
}
my
$count
= 1;
foreach
my
$ref
(
$seq
->annotation->get_Annotations(
'reference'
) ) {
$temp_line
=
"REFERENCE $count"
;
if
(
$ref
->start) {
$temp_line
.=
sprintf
(
" (%s %d to %d)"
,
(
$seq
->alphabet() eq
"protein"
?
"residues"
:
"bases"
),
$ref
->start,
$ref
->end);
}
elsif
(
$ref
->gb_reference) {
$temp_line
.=
sprintf
(
" (%s)"
,
$ref
->gb_reference);
}
$self
->_print(
"$temp_line\n"
);
$self
->_write_line_GenBank_regex(
" AUTHORS "
,
' '
x12,
$ref
->authors,
"\\s\+\|\$"
,80);
$self
->_write_line_GenBank_regex(
" CONSRTM "
,
' '
x12,
$ref
->consortium,
"\\s\+\|\$"
,80)
if
$ref
->consortium;
$self
->_write_line_GenBank_regex(
" TITLE "
,
" "
x12,
$ref
->title,
"\\s\+\|\$"
,80);
$self
->_write_line_GenBank_regex(
" JOURNAL "
,
" "
x12,
$ref
->location,
"\\s\+\|\$"
,80);
if
(
$ref
->medline) {
$self
->_write_line_GenBank_regex(
" MEDLINE "
,
" "
x12,
$ref
->medline,
"\\s\+\|\$"
,80);
}
if
(
$ref
->pubmed ) {
$self
->_write_line_GenBank_regex(
" PUBMED "
,
" "
x12,
$ref
->pubmed,
"\\s\+\|\$"
,
80);
}
if
(
$ref
->comment) {
$self
->_write_line_GenBank_regex(
" REMARK "
,
" "
x12,
$ref
->comment,
"\\s\+\|\$"
,80);
}
$count
++;
}
foreach
my
$comment
(
$seq
->annotation->get_Annotations(
'comment'
) ) {
$self
->_write_line_GenBank_regex(
"COMMENT "
,
" "
x12,
$comment
->text,
"\\s\+\|\$"
,80);
}
$self
->_print(
"FEATURES Location/Qualifiers\n"
);
if
(
defined
$self
->_post_sort ) {
my
$post_sort_func
=
$self
->_post_sort();
my
@fth
;
foreach
my
$sf
(
$seq
->top_SeqFeatures ) {
push
(
@fth
,Bio::SeqIO::FTHelper::from_SeqFeature(
$sf
,
$seq
));
}
@fth
=
sort
{
&$post_sort_func
(
$a
,
$b
) }
@fth
;
foreach
my
$fth
(
@fth
) {
$self
->_print_GenBank_FTHelper(
$fth
);
}
}
else
{
foreach
my
$sf
(
$seq
->top_SeqFeatures ) {
my
@fth
= Bio::SeqIO::FTHelper::from_SeqFeature(
$sf
,
$seq
);
foreach
my
$fth
(
@fth
) {
if
( !
$fth
->isa(
'Bio::SeqIO::FTHelper'
) ) {
$sf
->throw(
"Cannot process FTHelper... $fth"
);
}
$self
->_print_GenBank_FTHelper(
$fth
);
}
}
}
if
(
$seq
->annotation->get_Annotations(
'WGS'
)) {
foreach
my
$wgs
(
map
{
$seq
->annotation->get_Annotations(
$_
)}
qw(WGS WGS_SCAFLD)
) {
$self
->_print(
sprintf
(
"%-11s %s\n"
,
$wgs
->tagname,
$wgs
->value));
}
$self
->_show_dna(0);
}
if
(
$seq
->annotation->get_Annotations(
'CONTIG'
)) {
my
$ct
= 0;
my
$cline
;
foreach
my
$contig
(
$seq
->annotation->get_Annotations(
'CONTIG'
)) {
unless
(
$ct
) {
$cline
=
$contig
->tagname.
" "
.
$contig
->value.
"\n"
;
}
else
{
$cline
=
" "
.
$contig
->value.
"\n"
;
}
$self
->_print(
$cline
);
$ct
++;
}
$self
->_show_dna(0);
}
if
(
$seq
->
length
== 0 ) {
$self
->_show_dna(0) }
if
(
$self
->_show_dna() == 0 ) {
$self
->_print(
"\n//\n"
);
return
;
}
$str
=~
tr
/A-Z/a-z/;
unless
(
$mol
eq
'protein'
) {
my
$alen
=
$str
=~
tr
/a/a/;
my
$clen
=
$str
=~
tr
/c/c/;
my
$glen
=
$str
=~
tr
/g/g/;
my
$tlen
=
$str
=~
tr
/t/t/;
my
$olen
=
$len
- (
$alen
+
$tlen
+
$clen
+
$glen
);
if
(
$olen
< 0 ) {
$self
->
warn
(
"Weird. More atgc than bases. Problem!"
);
}
my
$base_count
=
sprintf
(
"BASE COUNT %8s a %6s c %6s g %6s t%s\n"
,
$alen
,
$clen
,
$glen
,
$tlen
,
(
$olen
> 0 ) ?
sprintf
(
"%6s others"
,
$olen
) :
''
);
$self
->_print(
$base_count
);
}
my
(
$o
) =
$seq
->annotation->get_Annotations(
'origin'
);
$self
->_print(
sprintf
(
"%-12s%s\n"
,
'ORIGIN'
,
$o
?
$o
->value :
''
));
my
$nuc
= 60;
my
$whole_pat
=
'a10'
x 6;
my
$out_pat
=
'A11'
x 6;
my
$length
=
length
(
$str
);
my
$whole
=
int
(
$length
/
$nuc
) *
$nuc
;
my
$i
;
for
(
$i
= 0;
$i
<
$whole
;
$i
+=
$nuc
) {
my
$blocks
=
pack
$out_pat
,
unpack
$whole_pat
,
substr
(
$str
,
$i
,
$nuc
);
chop
$blocks
;
$self
->_print(
sprintf
(
"%9d $blocks\n"
,
$i
+
$nuc
- 59));
}
if
(
my
$last
=
substr
(
$str
,
$i
)) {
my
$last_len
=
length
(
$last
);
my
$last_pat
=
'a10'
x
int
(
$last_len
/ 10) .
'a'
.
$last_len
% 10;
my
$blocks
=
pack
$out_pat
,
unpack
(
$last_pat
,
$last
);
$blocks
=~ s/ +$//;
$self
->_print(
sprintf
(
"%9d $blocks\n"
,
$length
-
$last_len
+ 1));
}
$self
->_print(
"//\n"
);
$self
->flush
if
$self
->_flush_on_write &&
defined
$self
->_fh;
return
1;
}
}
sub
_print_GenBank_FTHelper {
my
(
$self
,
$fth
) =
@_
;
if
( !
ref
$fth
|| !
$fth
->isa(
'Bio::SeqIO::FTHelper'
) ) {
$fth
->
warn
(
"$fth is not a FTHelper class. Attempting to print, but there could be tears!"
);
}
$self
->_write_line_GenBank_regex(
sprintf
(
" %-16s"
,
$fth
->key),
" "
x21,
$fth
->loc,
"\,\|\$"
,80);
foreach
my
$tag
(
keys
%{
$fth
->field} ) {
foreach
my
$value
( @{
$fth
->field->{
$tag
}} ) {
$value
=~ s/\"/\"\"/g;
if
(
$value
eq
"_no_value"
) {
$self
->_write_line_GenBank_regex(
" "
x21,
" "
x21,
"/$tag"
,
"\.\|\$"
,80);
}
elsif
(!
$FTQUAL_NO_QUOTE
{
$tag
}) {
my
(
$pat
) = (
$value
=~ /\s/ ?
'\s|$'
:
'.|$'
);
$self
->_write_line_GenBank_regex(
" "
x21,
" "
x21,
"/$tag=\"$value\""
,
$pat
,80);
}
else
{
$self
->_write_line_GenBank_regex(
" "
x21,
" "
x21,
"/$tag=$value"
,
"\.\|\$"
,80);
}
}
}
}
sub
_read_GenBank_References {
my
(
$self
,
$buffer
) =
@_
;
my
(
@refs
);
my
$ref
;
if
(
$$buffer
!~ /^REFERENCE/ ) {
warn
(
"Not parsing line '$$buffer' which maybe important"
);
}
$_
=
$$buffer
;
my
(
@title
,
@loc
,
@authors
,
@consort
,
@com
,
@medline
,
@pubmed
);
REFLOOP:
while
(
defined
(
$_
) ||
defined
(
$_
=
$self
->_readline) ) {
if
(/^\s{2}AUTHORS\s+(.*)/o) {
push
(
@authors
, $1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/o &&
do
{
push
(
@authors
, $1);
next
;};
last
;
}
$ref
->authors(
join
(
' '
,
@authors
));
}
if
(/^\s{2}CONSRTM\s+(.*)/o) {
push
(
@consort
, $1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/o &&
do
{
push
(
@consort
, $1);
next
;};
last
;
}
$ref
->consortium(
join
(
' '
,
@consort
));
}
if
(/^\s{2}TITLE\s+(.*)/o) {
push
(
@title
, $1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/o &&
do
{
push
(
@title
, $1);
next
;
};
last
;
}
$ref
->title(
join
(
' '
,
@title
));
}
if
(/^\s{2}JOURNAL\s+(.*)/o) {
push
(
@loc
, $1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/o &&
do
{
push
(
@loc
, $1);
next
;
};
last
;
}
$ref
->location(
join
(
' '
,
@loc
));
redo
REFLOOP;
}
if
(/^\s{2}REMARK\s+(.*)/o) {
push
(
@com
, $1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/o &&
do
{
push
(
@com
, $1);
next
;
};
last
;
}
$ref
->comment(
join
(
' '
,
@com
));
redo
REFLOOP;
}
if
( /^\s{2}MEDLINE\s+(.*)/ ) {
push
(
@medline
,$1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/ &&
do
{
push
(
@medline
, $1);
next
;
};
last
;
}
$ref
->medline(
join
(
' '
,
@medline
));
redo
REFLOOP;
}
if
( /^\s{3}PUBMED\s+(.*)/ ) {
push
(
@pubmed
,$1);
while
(
defined
(
$_
=
$self
->_readline) ) {
/^\s{9,}(.*)/ &&
do
{
push
(
@pubmed
, $1);
next
;
};
last
;
}
$ref
->pubmed(
join
(
' '
,
@pubmed
));
redo
REFLOOP;
}
/^REFERENCE/o &&
do
{
$self
->_add_ref_to_array(\
@refs
,
$ref
)
if
$ref
;
@authors
= ();
@title
= ();
@loc
= ();
@com
= ();
@pubmed
= ();
@medline
= ();
$ref
= Bio::Annotation::Reference->new(
-tagname
=>
'reference'
);
if
(/^REFERENCE\s+\d+\s+\([a-z]+ (\d+) to (\d+)\)/){
$ref
->start($1);
$ref
->end($2);
}
elsif
(/^REFERENCE\s+\d+\s+\((.*)\)/) {
$ref
->gb_reference($1);
}
};
/^(FEATURES)|(COMMENT)/o &&
last
;
$_
=
undef
;
}
$self
->_add_ref_to_array(\
@refs
,
$ref
)
if
$ref
;
$$buffer
=
$_
;
return
@refs
;
}
sub
_add_ref_to_array {
my
(
$self
,
$refs
,
$ref
) =
@_
;
my
$au
=
$ref
->authors();
my
$title
=
$ref
->title();
$au
=~ s/;\s*$//g
if
$au
;
$title
=~ s/;\s*$//g
if
$title
;
$ref
->authors(
$au
);
$ref
->title(
$title
);
push
(@{
$refs
},
$ref
);
}
sub
_read_GenBank_Species {
my
(
$self
,
$buffer
) =
@_
;
my
@unkn_names
= (
'other'
,
'unknown organism'
,
'not specified'
,
'not shown'
,
'Unspecified'
,
'Unknown'
,
'None'
,
'unclassified'
,
'unidentified organism'
,
'not supplied'
);
my
@unkn_genus
= (
'unknown'
,
'unclassified'
,
'uncultured'
,
'unidentified'
);
$_
=
$$buffer
;
my
(
$sub_species
,
$species
,
$genus
,
$sci_name
,
$common
,
$class_lines
,
$source_flag
,
$abbr_name
,
$organelle
,
$sl
);
while
(
defined
(
$_
) ||
defined
(
$_
=
$self
->_readline())) {
s/<[^>]+>//g;
if
( /^SOURCE\s+(.*)/o ) {
$sl
= $1;
$sl
=~ s/\.$//;
$source_flag
= 1;
}
elsif
( /^\s{2}ORGANISM/o ) {
$source_flag
= 0;
(
$sci_name
) =
$_
=~ /\w+\s+(.*)/o;
}
elsif
(
$source_flag
) {
$sl
.=
$_
;
$sl
=~ s/\n//g;
$sl
=~ s/\s+/ /g;
$source_flag
= 0;
}
elsif
( /^\s+(.+)/o ) {
my
$line
= $1;
if
(
$line
!~ /[;\.]$/) {
$sci_name
.=
' '
.
$line
;
}
else
{
$class_lines
.=
$line
;
}
}
else
{
last
;
}
$_
=
undef
;
}
$$buffer
=
$_
;
if
(
$sl
=~ m{^
(mitochondrion|chloroplast|plastid)?
\s*(.*?)
\s*(?: \( (.*?) \) )?\.?
$
}xms ){
(
$organelle
,
$abbr_name
,
$common
) = ($1, $2, $3);
}
else
{
$abbr_name
=
$sl
;
}
$sci_name
||
return
;
my
@class
=
map
{ s/^\s+//; s/\s+$//; s/\s{2,}/ /g;
$_
; }
split
/[;\.]+/,
$class_lines
;
my
$possible_genus
=
$class
[-1];
$possible_genus
.=
"|$class[-2]"
if
$class
[-2];
if
(
$sci_name
=~ /^(
$possible_genus
)/) {
$genus
= $1;
(
$species
) =
$sci_name
=~ /^
$genus
\s+(.+)/;
}
else
{
$species
=
$sci_name
;
}
if
(
$species
=~ /subsp\.|var\./) {
(
$species
,
$sub_species
) =
$species
=~ /(.+)\s+((?:subsp\.|var\.).+)/;
}
my
$unkn
=
grep
{
$_
eq
$sl
}
@unkn_names
;
return
unless
(
$species
||
$genus
) and
$unkn
== 0;
push
(
@class
,
$sci_name
);
@class
=
reverse
@class
;
my
$make
= Bio::Species->new();
$make
->scientific_name(
$sci_name
);
$make
->classification(
@class
)
if
@class
> 0;
$make
->common_name(
$common
)
if
$common
;
$make
->name(
'abbreviated'
,
$abbr_name
)
if
$abbr_name
;
$make
->organelle(
$organelle
)
if
$organelle
;
return
$make
;
}
sub
_read_FTHelper_GenBank {
my
(
$self
,
$buffer
) =
@_
;
my
(
$key
,
$loc
);
my
@qual
= ();
if
(
$$buffer
=~ /^\s{5}(\S+)\s+(.+?)\s*$/o) {
$key
= $1;
$loc
= $2;
while
(
defined
(
$_
=
$self
->_readline) ) {
if
(/^(\s+)(.+?)\s*$/o) {
if
(
length
($1) > 6) {
if
(
@qual
|| (
index
($2,
'/'
) == 0)) {
push
(
@qual
, $2);
}
else
{
$loc
.= $2;
}
}
else
{
last
;
}
}
else
{
last
;
}
}
}
else
{
$self
->debug(
"no feature key!\n"
);
$$buffer
=
$self
->_readline();
return
;
}
$$buffer
=
$_
;
my
$out
= new Bio::SeqIO::FTHelper();
$out
->verbose(
$self
->verbose());
$out
->key(
$key
);
$out
->loc(
$loc
);
QUAL:
for
(
my
$i
= 0;
$i
<
@qual
;
$i
++) {
$_
=
$qual
[
$i
];
my
(
$qualifier
,
$value
) = (m{^/([^=]+)(?:=(.+))?})
or
$self
->
warn
(
"cannot see new qualifier in feature $key: "
.
$qual
[
$i
]);
$qualifier
=
''
unless
(
defined
$qualifier
);
if
(
defined
$value
) {
if
(
substr
(
$value
, 0, 1) eq
'"'
) {
while
(
$value
!~ /\
"$/ or $value =~ tr/"
/"/ % 2) {
if
(
$i
>=
$#qual
) {
$self
->
warn
(
"Unbalanced quote in:\n"
.
join
(
"\n"
,
@qual
) .
"No further qualifiers will "
.
"be added for this feature"
);
last
QUAL;
}
$i
++;
my
$next
=
$qual
[
$i
];
if
(
$qualifier
ne
'translation'
) {
$value
.=
" "
;
}
$value
.=
$next
;
}
$value
=~ s/^
"|"
$//g;
$value
=~ s/
""
/\"/g;
}
elsif
(
$value
=~ /^\(/ ) {
my
$left
= (
$value
=~
tr
/\(/\(/);
my
$right
= (
$value
=~
tr
/\)/\)/);
while
(
$left
!=
$right
) {
if
(
$i
>=
$#qual
) {
$self
->
warn
(
"Unbalanced parens in:\n"
.
join
(
"\n"
,
@qual
).
"\nNo further qualifiers will "
.
"be added for this feature"
);
last
QUAL;
}
$i
++;
my
$next
=
$qual
[
$i
];
$value
.=
$next
;
$left
+= (
$next
=~
tr
/\(/\(/);
$right
+= (
$next
=~
tr
/\)/\)/);
}
}
}
else
{
$value
=
'_no_value'
;
}
$out
->field->{
$qualifier
} ||= [];
push
(@{
$out
->field->{
$qualifier
}},
$value
);
}
return
$out
;
}
sub
_write_line_GenBank {
my
(
$self
,
$pre1
,
$pre2
,
$line
,
$length
) =
@_
;
$length
||
$self
->throw(
"Miscalled write_line_GenBank without length. Programming error!"
);
my
$subl
=
$length
-
length
$pre2
;
my
$linel
=
length
$line
;
my
$i
;
my
$subr
=
substr
(
$line
,0,
$length
-
length
$pre1
);
$self
->_print(
"$pre1$subr\n"
);
for
(
$i
= (
$length
-
length
$pre1
);
$i
<
$linel
;
$i
+=
$subl
) {
$subr
=
substr
(
$line
,
$i
,
$subl
);
$self
->_print(
"$pre2$subr\n"
);
}
}
sub
_write_line_GenBank_regex {
my
(
$self
,
$pre1
,
$pre2
,
$line
,
$regex
,
$length
) =
@_
;
$length
||
$self
->throw(
"Miscalled write_line_GenBank without length. Programming error!"
);
my
$subl
=
$length
- (
length
$pre1
) - 2;
my
@lines
= ();
CHUNK:
while
(
$line
) {
foreach
my
$pat
(
$regex
,
'[,;\.\/-]\s|'
.
$regex
,
'[,;\.\/-]|'
.
$regex
) {
if
(
$line
=~ m/^(.{1,
$subl
})(
$pat
)(.*)/ ) {
my
$l
= $1.$2;
my
$newl
= $3;
$line
=
substr
(
$line
,
length
(
$l
));
$l
=~ s/\s+$//;
push
(
@lines
,
$l
);
next
CHUNK;
}
}
$self
->
warn
(
"trouble dissecting \"$line\"\n into chunks "
.
"of $subl chars or less - this tag won't print right"
);
$line
=
substr
(
$line
,0,
$subl
) .
" "
.
substr
(
$line
,
$subl
);
}
my
$s
=
shift
@lines
;
$self
->_print(
"$pre1$s\n"
)
if
$s
;
foreach
my
$s
(
@lines
) {
$self
->_print(
"$pre2$s\n"
);
}
}
sub
_post_sort {
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'_post_sort'
} =
$value
;
}
return
$obj
->{
'_post_sort'
};
}
sub
_show_dna {
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'_show_dna'
} =
$value
;
}
return
$obj
->{
'_show_dna'
};
}
sub
_id_generation_func {
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'_id_generation_func'
} =
$value
;
}
return
$obj
->{
'_id_generation_func'
};
}
sub
_ac_generation_func {
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'_ac_generation_func'
} =
$value
;
}
return
$obj
->{
'_ac_generation_func'
};
}
sub
_sv_generation_func {
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'_sv_generation_func'
} =
$value
;
}
return
$obj
->{
'_sv_generation_func'
};
}
sub
_kw_generation_func {
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'_kw_generation_func'
} =
$value
;
}
return
$obj
->{
'_kw_generation_func'
};
}
1;