use
constant
CONFIG_FILE_NAME
=>
'config.dat'
;
sub
new {
my
$class
=
shift
;
$class
=
ref
(
$class
)
if
ref
(
$class
);
my
$self
=
$class
->SUPER::new(
@_
);
my
(
$flat_directory
,
$dbname
,
$format
) =
$self
->_rearrange([
qw(DIRECTORY DBNAME FORMAT)
],
@_
);
defined
$flat_directory
or
$self
->throw(
'Please supply a -directory argument'
);
defined
$dbname
or
$self
->throw(
'Please supply a -dbname argument'
);
$self
->directory(
$flat_directory
);
$self
->dbname(
$dbname
);
$self
->throw(
"Base directory $flat_directory doesn't exist"
)
unless
-e
$flat_directory
;
$self
->throw(
"$flat_directory isn't a directory"
)
unless
-d _;
my
$dbpath
= Bio::Root::IO->catfile(
$flat_directory
,
$dbname
);
unless
(-d
$dbpath
) {
$self
->debug(
"creating db directory $dbpath\n"
);
mkdir
$dbpath
,0777 or
$self
->throw(
"Can't create $dbpath: $!"
);
}
$self
->_read_config();
$self
->_initialize(
@_
);
$self
->throw(
'you must specify an indexing scheme'
)
unless
$self
->indexing_scheme;
my
$index_type
=
$self
->indexing_scheme eq
'BerkeleyDB/1'
?
'BDB'
:
$self
->indexing_scheme eq
'flat/1'
?
'Binary'
:
$self
->throw(
"unknown indexing scheme: "
.
$self
->indexing_scheme);
$format
=
$self
->file_format;
if
(
$index_type
eq
'Binary'
) {
my
$child_class
=
'Bio::DB::Flat::BinarySearch'
;
eval
"use $child_class"
;
$self
->throw($@)
if
$@;
push
@_
, (
'-format'
,
$format
);
return
$child_class
->new(
@_
);
}
my
$child_class
=
"Bio\:\:DB\:\:Flat\:\:$index_type\:\:\L$format"
;
eval
"use $child_class"
;
$self
->throw($@)
if
$@;
bless
$self
,
$child_class
;
$self
->_initialize(
@_
);
$self
->_set_namespaces(
@_
);
$self
;
}
sub
_initialize {
my
$self
=
shift
;
my
(
$flat_write_flag
,
$dbname
,
$flat_indexing
,
$flat_verbose
,
$flat_outfile
,
$flat_format
)
=
$self
->_rearrange([
qw(WRITE_FLAG DBNAME INDEX VERBOSE OUT FORMAT)
],
@_
);
$self
->write_flag(
$flat_write_flag
)
if
defined
$flat_write_flag
;
if
(
defined
$flat_indexing
) {
$flat_indexing
=
'BerkeleyDB/1'
if
$flat_indexing
=~ /bdb/;
$flat_indexing
=
'flat/1'
if
$flat_indexing
=~ /^(flat|binary)/;
$self
->indexing_scheme(
$flat_indexing
);
}
$self
->verbose(
$flat_verbose
)
if
defined
$flat_verbose
;
$self
->dbname(
$dbname
)
if
defined
$dbname
;
$self
->out_file(
$flat_outfile
)
if
defined
$flat_outfile
;
$self
->file_format(
$flat_format
)
if
defined
$flat_format
;
}
sub
_set_namespaces {
my
$self
=
shift
;
$self
->primary_namespace(
$self
->default_primary_namespace)
unless
defined
$self
->{flat_primary_namespace};
$self
->secondary_namespaces(
$self
->default_secondary_namespaces)
unless
defined
$self
->{flat_secondary_namespaces};
$self
->file_format(
$self
->default_file_format)
unless
defined
$self
->{flat_format};
}
sub
new_from_registry {
my
(
$self
,
%config
) =
@_
;
my
$location
=
$config
{
'location'
} or
$self
->throw(
'location tag must be specified.'
);
my
$dbname
=
$config
{
'dbname'
} or
$self
->throw(
'dbname tag must be specified.'
);
my
$db
=
$self
->new(
-directory
=>
$location
,
-dbname
=>
$dbname
,
);
$db
;
}
sub
directory {
my
$self
=
shift
;
my
$d
=
$self
->{flat_directory};
$self
->{flat_directory} =
shift
if
@_
;
$d
;
}
sub
write_flag {
my
$self
=
shift
;
my
$d
=
$self
->{flat_write_flag};
$self
->{flat_write_flag} =
shift
if
@_
;
$d
;
}
sub
verbose {
my
$self
=
shift
;
my
$d
=
$self
->{flat_verbose};
$self
->{flat_verbose} =
shift
if
@_
;
$d
;
}
sub
out_file {
my
$self
=
shift
;
my
$d
=
$self
->{flat_outfile};
$self
->{flat_outfile} =
shift
if
@_
;
$d
;
}
sub
dbname {
my
$self
=
shift
;
my
$d
=
$self
->{flat_dbname};
$self
->{flat_dbname} =
shift
if
@_
;
$d
;
}
sub
primary_namespace {
my
$self
=
shift
;
my
$d
=
$self
->{flat_primary_namespace};
$self
->{flat_primary_namespace} =
shift
if
@_
;
$d
;
}
sub
secondary_namespaces {
my
$self
=
shift
;
my
$d
=
$self
->{flat_secondary_namespaces};
$self
->{flat_secondary_namespaces} = (
ref
(
$_
[0]) eq
'ARRAY'
?
shift
: [
@_
])
if
@_
;
return
unless
$d
;
$d
= [
$d
]
if
$d
&&
ref
(
$d
) ne
'ARRAY'
;
return
wantarray
?
@$d
:
$d
;
}
sub
file_format {
my
$self
=
shift
;
my
$d
=
$self
->{flat_format};
$self
->{flat_format} =
shift
if
@_
;
$d
;
}
sub
alphabet {
my
$self
=
shift
;
my
$d
=
$self
->{flat_alphabet};
$self
->{flat_alphabet} =
shift
if
@_
;
$d
;
}
sub
parse_one_record {
my
$self
=
shift
;
my
$fh
=
shift
;
my
$parser
=
$self
->{cached_parsers}{
fileno
(
$fh
)}
||= Bio::SeqIO->new(
-fh
=>
$fh
,
-format
=>
$self
->default_file_format);
my
$seq
=
$parser
->next_seq or
return
;
$self
->{flat_alphabet} ||=
$seq
->alphabet;
my
$ids
=
$self
->seq_to_ids(
$seq
);
return
$ids
;
}
sub
indexing_scheme {
my
$self
=
shift
;
my
$d
=
$self
->{flat_indexing};
$self
->{flat_indexing} =
shift
if
@_
;
$d
;
}
sub
add_flat_file {
my
$self
=
shift
;
my
(
$file_path
,
$file_length
,
$nf
) =
@_
;
unless
(File::Spec->file_name_is_absolute(
$file_path
)) {
$file_path
= File::Spec->rel2abs(
$file_path
);
}
-r
$file_path
or
$self
->throw(
"flat file $file_path cannot be read: $!"
);
my
$current_size
= -s _;
if
(
defined
$file_length
) {
$current_size
==
$file_length
or
$self
->throw(
"flat file $file_path has changed size. Was $file_length bytes; now $current_size"
);
}
else
{
$file_length
=
$current_size
;
}
unless
(
defined
$nf
) {
$self
->{flat_file_index} = 0
unless
exists
$self
->{flat_file_index};
$nf
=
$self
->{flat_file_index}++;
}
$self
->{flat_flat_file_path}{
$nf
} =
$file_path
;
$self
->{flat_flat_file_no}{
$file_path
} =
$nf
;
$nf
;
}
sub
write_config {
my
$self
=
shift
;
$self
->write_flag or
$self
->throw(
"cannot write configuration file because write_flag is not set"
);
my
$path
=
$self
->_config_path;
open
(
my
$F
,
">$path"
) or
$self
->throw(
"open error on $path: $!"
);
my
$index_type
=
$self
->indexing_scheme;
print
$F
"index\t$index_type\n"
;
my
$format
=
$self
->file_format;
my
$alphabet
=
$self
->alphabet;
my
$alpha
=
$alphabet
?
"/$alphabet"
:
''
;
print
$F
"format\tURN:LSID:open-bio.org:${format}${alpha}\n"
;
my
@filenos
=
$self
->_filenos or
$self
->throw(
"cannot write config file because no flat files defined"
);
for
my
$nf
(
@filenos
) {
my
$path
=
$self
->{flat_flat_file_path}{
$nf
};
my
$size
= -s
$path
;
print
$F
join
(
"\t"
,
"fileid_$nf"
,
$path
,
$size
),
"\n"
;
}
my
$primary_ns
=
$self
->primary_namespace
or
$self
->throw(
'cannot write config file because no primary namespace defined'
);
print
$F
join
(
"\t"
,
'primary_namespace'
,
$primary_ns
),
"\n"
;
my
@secondary
=
$self
->secondary_namespaces;
print
$F
join
(
"\t"
,
'secondary_namespaces'
,
@secondary
),
"\n"
;
close
$F
or
$self
->throw(
"close error on $path: $!"
);
}
sub
files {
my
$self
=
shift
;
return
unless
$self
->{flat_flat_file_no};
return
keys
%{
$self
->{flat_flat_file_no}};
}
sub
write_seq {
my
$self
=
shift
;
my
$seq
=
shift
;
$self
->write_flag or
$self
->throw(
"cannot write sequences because write_flag is not set"
);
my
$file
=
$self
->out_file or
$self
->throw(
'no outfile defined; use the -out argument to new()'
);
my
$seqio
=
$self
->{flat_cached_parsers}{
$file
}
||= Bio::SeqIO->new(
-Format
=>
$self
->file_format,
-file
=>
">$file"
)
or
$self
->throw(
"couldn't create Bio::SeqIO object"
);
my
$fh
=
$seqio
->_fh or
$self
->throw(
"couldn't get filehandle from Bio::SeqIO object"
);
my
$offset
=
tell
(
$fh
);
$seqio
->write_seq(
$seq
);
my
$length
=
tell
(
$fh
)-
$offset
;
my
$ids
=
$self
->seq_to_ids(
$seq
);
$self
->_store_index(
$ids
,
$file
,
$offset
,
$length
);
$self
->{flat_outfile_dirty}++;
}
sub
close
{
my
$self
=
shift
;
return
unless
$self
->{flat_outfile_dirty};
$self
->write_config;
delete
$self
->{flat_outfile_dirty};
delete
$self
->{flat_cached_parsers}{
$self
->out_file};
}
sub
_filenos {
my
$self
=
shift
;
return
unless
$self
->{flat_flat_file_path};
return
keys
%{
$self
->{flat_flat_file_path}};
}
sub
_read_config {
my
$self
=
shift
;
my
$path
=
$self
->_config_path;
return
unless
-e
$path
;
open
(
my
$F
,
$path
) or
$self
->throw(
"open error on $path: $!"
);
my
%config
;
while
(<
$F
>) {
chomp
;
my
(
$tag
,
@values
) =
split
"\t"
;
$config
{
$tag
} = \
@values
;
}
CORE::
close
$F
or
$self
->throw(
"close error on $path: $!"
);
$config
{
index
}[0] =~ m~(flat/1|BerkeleyDB/1)~
or
$self
->throw(
"invalid configuration file $path: no index line"
);
$self
->indexing_scheme($1);
if
(
$config
{
format
}) {
if
(
$config
{
format
}[0] =~ /^URN:LSID:
open
-bio\.org:(\w+)(?:\/(\w+))/) {
$self
->file_format($1);
$self
->alphabet($2);
}
else
{
$self
->file_format(
$config
{
format
}[0]);
}
}
my
$primary_namespace
=
$config
{primary_namespace}[0]
or
$self
->throw(
"invalid configuration file $path: no primary namespace defined"
);
$self
->primary_namespace(
$primary_namespace
);
$self
->secondary_namespaces(
$config
{secondary_namespaces});
my
@normalized_files
=
grep
{
$_
ne
''
}
map
{/^fileid_(\S+)/ && $1}
keys
%config
;
for
my
$nf
(
@normalized_files
) {
my
(
$file_path
,
$file_length
) = @{
$config
{
"fileid_${nf}"
}};
$self
->add_flat_file(
$file_path
,
$file_length
,
$nf
);
}
1;
}
sub
_config_path {
my
$self
=
shift
;
$self
->_catfile(
$self
->_config_name);
}
sub
_catfile {
my
$self
=
shift
;
my
$component
=
shift
;
Bio::Root::IO->catfile(
$self
->directory,
$self
->dbname,
$component
);
}
sub
_config_name { CONFIG_FILE_NAME }
sub
_path2fileno {
my
$self
=
shift
;
my
$path
=
shift
;
return
$self
->add_flat_file(
$path
)
unless
exists
$self
->{flat_flat_file_no}{
$path
};
$self
->{flat_flat_file_no}{
$path
};
}
sub
_fileno2path {
my
$self
=
shift
;
my
$fileno
=
shift
;
$self
->{flat_flat_file_path}{
$fileno
};
}
sub
_files {
my
$self
=
shift
;
my
$paths
=
$self
->{flat_flat_file_no};
return
keys
%$paths
;
}
sub
fetch {
shift
->get_Seq_by_id(
@_
) }
sub
get_Seq_by_id {
my
$self
=
shift
;
my
$id
=
shift
;
$self
->throw_not_implemented;
}
sub
get_Seq_by_acc {
my
$self
=
shift
;
return
$self
->get_Seq_by_id(
shift
)
if
@_
== 1;
my
(
$ns
,
$key
) =
@_
;
$self
->throw_not_implemented;
}
sub
fetch_raw {
my
(
$self
,
$id
,
$namespace
) =
@_
;
$self
->throw_not_implemented;
}
sub
default_file_format {
my
$self
=
shift
;
$self
->throw_not_implemented;
}
sub
_store_index {
my
$self
=
shift
;
my
(
$ids
,
$file
,
$offset
,
$length
) =
@_
;
$self
->throw_not_implemented;
}
sub
default_primary_namespace {
return
"ACC"
;
}
sub
default_secondary_namespaces {
return
;
}
sub
seq_to_ids {
my
$self
=
shift
;
my
$seq
=
shift
;
my
%ids
;
$ids
{
$self
->primary_namespace} =
$seq
->accession_number;
\
%ids
;
}
sub
DESTROY {
my
$self
=
shift
;
$self
->
close
;
}
1;