Hide Show 184 lines of Pod
use
Fcntl
qw(SEEK_END SEEK_CUR)
;
sub
systell{
sysseek
(
$_
[0], 0, SEEK_CUR) }
sub
syseof{
sysseek
(
$_
[0], 0, SEEK_END) }
use
constant
CONFIG_FILE_NAME
=>
'config.dat'
;
my
@formats
= [
'FASTA'
,
'SWISSPROT'
,
'EMBL'
];
Hide Show 40 lines of Pod
sub
new {
my
(
$class
,
@args
) =
@_
;
my
$self
=
$class
->SUPER::new(
@args
);
bless
$self
,
$class
;
my
(
$index_dir
,
$dbname
,
$format
,
$write_flag
,
$primary_pattern
,
$primary_namespace
,
$start_pattern
,
$secondary_patterns
) =
$self
->_rearrange([
qw(DIRECTORY
DBNAME
FORMAT
WRITE_FLAG
PRIMARY_PATTERN
PRIMARY_NAMESPACE
START_PATTERN
SECONDARY_PATTERNS)
],
@args
);
$self
->index_directory(
$index_dir
);
$self
->dbname(
$dbname
);
if
(
$self
->index_directory &&
$self
->read_config_file) {
my
$fh
=
$self
->primary_index_filehandle;
my
$record_width
=
$self
->read_header(
$fh
);
$self
->record_size(
$record_width
);
}
$format
||= DEFAULT_FORMAT;
$self
->
format
(
$format
);
$self
->write_flag (
$write_flag
);
if
(
$self
->write_flag && !
$primary_namespace
) {
(
$primary_namespace
,
$primary_pattern
,
$start_pattern
,
$secondary_patterns
) =
$self
->_guess_patterns(
$self
->
format
);
}
$self
->primary_pattern (
$primary_pattern
);
$self
->primary_namespace (
$primary_namespace
);
$self
->start_pattern (
$start_pattern
);
$self
->secondary_patterns(
$secondary_patterns
);
return
$self
;
}
sub
new_from_registry {
my
(
$self
,
%config
) =
@_
;
my
$dbname
=
$config
{
'dbname'
};
my
$location
=
$config
{
'location'
};
my
$index
= new Bio::DB::Flat::BinarySearch(
-dbname
=>
$dbname
,
-index_dir
=>
$location
,
);
}
Hide Show 11 lines of Pod
sub
get_Seq_by_id {
my
(
$self
,
$id
) =
@_
;
local
$^W = 0;
my
(
$fh
,
$length
) =
$self
->get_stream_by_id(
$id
);
unless
(
defined
(
$self
->
format
)) {
$self
->throw(
"Can't create sequence - format is not defined"
);
}
return
unless
$fh
;
unless
(
defined
(
$self
->{_seqio}) ) {
$self
->{_seqio} = new Bio::SeqIO(
-fh
=>
$fh
,
-format
=>
$self
->
format
);
}
else
{
$self
->{_seqio}->fh(
$fh
);
}
return
$self
->{_seqio}->next_seq;
}
Hide Show 11 lines of Pod
sub
get_entry_by_id {
my
(
$self
,
$id
) =
@_
;
my
(
$fh
,
$length
) =
$self
->get_stream_by_id(
$id
);
my
$entry
;
sysread
(
$fh
,
$entry
,
$length
);
return
$entry
;
}
Hide Show 11 lines of Pod
sub
get_stream_by_id {
my
(
$self
,
$id
) =
@_
;
unless
(
$self
->record_size ) {
if
(
$self
->index_directory &&
$self
->read_config_file) {
my
$fh
=
$self
->primary_index_filehandle;
my
$record_width
=
$self
->read_header(
$fh
);
$self
->record_size(
$record_width
);
}
}
my
$indexfh
=
$self
->primary_index_filehandle;
syseof (
$indexfh
);
my
$filesize
= systell(
$indexfh
);
$self
->throw(
"file was not parsed properly, record size is empty"
)
unless
$self
->record_size;
my
$end
= (
$filesize
-
$self
->{
'_start_pos'
}) /
$self
->record_size;
my
(
$newid
,
$rest
,
$fhpos
) =
$self
->find_entry(
$indexfh
,0,
$end
,
$id
,
$self
->record_size);
my
(
$fileid
,
$pos
,
$length
) =
split
(/\t/,
$rest
);
if
(!
$newid
) {
return
;
}
my
$fh
=
$self
->get_filehandle_by_fileid(
$fileid
);
my
$file
=
$self
->{_file}{
$fileid
};
open
(
my
$IN
,
"<$file"
);
my
$entry
;
sysseek
(
$IN
,
$pos
,0);
return
(
$IN
,
$length
);
}
Hide Show 11 lines of Pod
sub
get_Seq_by_acc {
my
(
$self
,
$acc
) =
@_
;
local
$^W = 0;
if
(
$self
->primary_namespace eq
"ACC"
) {
return
$self
->get_Seq_by_id(
$acc
);
}
else
{
return
$self
->get_Seq_by_secondary(
"ACC"
,
$acc
);
}
}
Hide Show 11 lines of Pod
sub
get_Seq_by_version {
my
(
$self
,
$acc
) =
@_
;
local
$^W = 0;
if
(
$self
->primary_namespace eq
"VERSION"
) {
return
$self
->get_Seq_by_id(
$acc
);
}
else
{
return
$self
->get_Seq_by_secondary(
"VERSION"
,
$acc
);
}
}
Hide Show 11 lines of Pod
sub
get_Seq_by_secondary {
my
(
$self
,
$name
,
$id
) =
@_
;
my
@names
=
$self
->secondary_namespaces;
my
$found
= 0;
foreach
my
$tmpname
(
@names
) {
if
(
$name
eq
$tmpname
) {
$found
= 1;
}
}
if
(
$found
== 0) {
$self
->throw(
"Secondary index for $name doesn't exist\n"
);
}
my
$fh
=
$self
->open_secondary_index(
$name
);
syseof (
$fh
);
my
$filesize
= systell(
$fh
);
my
$recsize
=
$self
->{
'_secondary_record_size'
}{
$name
};
my
$end
= (
$filesize
-
$self
->{
'_start_pos'
})/
$recsize
;
my
(
$newid
,
$primary_id
,
$pos
) =
$self
->find_entry(
$fh
,0,
$end
,
$id
,
$recsize
);
sysseek
(
$fh
,
$pos
,0);
my
$record
=
$newid
;
while
(
$record
=~ /^
$newid
/ &&
$pos
>= 0) {
$record
=
$self
->read_record(
$fh
,
$pos
,
$recsize
);
$pos
=
$pos
-
$recsize
;
}
$pos
+=
$recsize
;
my
$current_id
=
$newid
;
my
%primary_id
;
$primary_id
{
$primary_id
} = 1;
while
(
$current_id
eq
$newid
) {
$record
=
$self
->read_record(
$fh
,
$pos
,
$recsize
);
my
(
$secid
,
$primary_id
) =
split
(/\t/,
$record
,2);
$current_id
=
$secid
;
if
(
$current_id
eq
$newid
) {
$primary_id
=~ s/ //g;
$primary_id
{
$primary_id
} = 1;
$pos
=
$pos
+
$recsize
;
}
}
if
(!
defined
(
$newid
)) {
return
;
}
my
@entry
;
foreach
my
$id
(
keys
%primary_id
) {
push
@entry
,
$self
->get_Seq_by_id(
$id
);
}
return
wantarray
?
@entry
:
$entry
[0];
}
Hide Show 11 lines of Pod
sub
read_header {
my
(
$self
,
$fh
) =
@_
;
my
$record_width
;
sysread
(
$fh
,
$record_width
,HEADER_SIZE);
$self
->{
'_start_pos'
} = HEADER_SIZE;
$record_width
=~ s/ //g;
$record_width
=
$record_width
* 1;
return
$record_width
;
}
Hide Show 11 lines of Pod
sub
read_record {
my
(
$self
,
$fh
,
$pos
,
$len
) =
@_
;
sysseek
(
$fh
,
$pos
,0);
my
$record
;
sysread
(
$fh
,
$record
,
$len
);
return
$record
;
}
Hide Show 11 lines of Pod
sub
get_all_primary_ids {
my
$self
=
shift
;
my
$fh
=
$self
->primary_index_filehandle;
syseof(
$fh
);
my
$filesize
= systell(
$fh
);
my
$recsize
=
$self
->record_size;
my
$end
=
$filesize
;
my
@ids
;
for
(
my
$pos
=
$self
->{
'_start_pos'
};
$pos
<
$end
;
$pos
+=
$recsize
) {
my
$record
=
$self
->read_record(
$fh
,
$pos
,
$recsize
);
my
(
$entryid
) =
split
(/\t/,
$record
);
push
@ids
,
$entryid
;
}
@ids
;
}
Hide Show 11 lines of Pod
sub
find_entry {
my
(
$self
,
$fh
,
$start
,
$end
,
$id
,
$recsize
) =
@_
;
my
$mid
=
int
( (
$end
+1+
$start
) / 2);
my
$pos
= (
$mid
-1)
*$recsize
+
$self
->{
'_start_pos'
};
my
(
$record
) =
$self
->read_record(
$fh
,
$pos
,
$recsize
);
my
(
$entryid
,
$rest
) =
split
(/\t/,
$record
,2);
$rest
=~ s/\s+$//;
my
(
$first
,
$second
) =
$id
le
$entryid
? (
$id
,
$entryid
) : (
$entryid
,
$id
);
if
(
$id
eq
$entryid
) {
return
(
$id
,
$rest
,
$pos
-
$recsize
);
}
elsif
(
$first
eq
$id
) {
if
(
$end
-
$start
<= 1) {
return
;
}
my
$end
=
$mid
;
$self
->find_entry(
$fh
,
$start
,
$end
,
$id
,
$recsize
);
}
elsif
(
$second
eq
$id
) {
if
(
$end
-
$start
<= 1) {
return
;
}
$start
=
$mid
;
$self
->find_entry(
$fh
,
$start
,
$end
,
$id
,
$recsize
);
}
}
Hide Show 11 lines of Pod
sub
build_index {
my
(
$self
,
@files
) =
@_
;
$self
->write_flag or
$self
->throw(
'Cannot build index unless -write_flag is true'
);
my
$rootdir
=
$self
->index_directory;
if
(!
defined
(
$rootdir
)) {
$self
->throw(
"No index directory set - can't build indices"
);
}
if
(! -d
$rootdir
) {
$self
->throw(
"Index directory [$rootdir] is not a directory. Cant' build indices"
);
}
my
$dbpath
= Bio::Root::IO->catfile(
$rootdir
,
$self
->dbname);
if
(! -d
$dbpath
) {
warn
"Creating directory $dbpath\n"
;
mkdir
$dbpath
,0777 or
$self
->throw(
"Couldn't create $dbpath: $!"
);
}
unless
(
@files
) {
$self
->throw(
"Must enter an array of filenames to index"
);
}
foreach
my
$file
(
@files
) {
$file
= File::Spec->rel2abs(
$file
)
unless
File::Spec->file_name_is_absolute(
$file
);
unless
( -e
$file
) {
$self
->throw(
"Can't index file [$file] as it doesn't exist"
);
}
}
if
(
my
$filehash
=
$self
->{_dbfile}) {
push
@files
,
keys
%$filehash
;
}
my
%seen
;
@files
=
grep
{!
$seen
{
$_
}++}
@files
;
$self
->make_config_file(\
@files
);
my
$entries
= 0;
foreach
my
$file
(
@files
) {
$entries
+=
$self
->_index_file(
$file
);
}
$self
->make_config_file(\
@files
);
$self
->write_primary_index;
$self
->write_secondary_indices;
$entries
;
}
Hide Show 12 lines of Pod
sub
_index_file {
my
(
$self
,
$file
) =
@_
;
my
$v
=
$self
->verbose;
open
(
my
$FILE
,
"<"
,
$file
) ||
$self
->throw(
"Can't open file [$file]"
);
my
$recstart
= 0;
my
$fileid
=
$self
->get_fileid_by_filename(
$file
);
my
$found
= 0;
my
$id
;
my
$count
= 0;
my
$primary
=
$self
->primary_pattern;
my
$start_pattern
=
$self
->start_pattern;
my
$pos
= 0;
my
$new_primary_entry
;
my
$length
;
my
$fh
=
$FILE
;
my
$done
= -1;
my
@secondary_names
=
$self
->secondary_namespaces;
my
%secondary_id
;
my
$last_one
;
while
(<
$fh
>) {
$last_one
=
$_
;
$self
->{alphabet} ||=
$self
->guess_alphabet(
$_
);
if
(
$_
=~ /
$start_pattern
/) {
if
(
$done
== 0) {
$id
=
$new_primary_entry
;
$self
->{alphabet} ||=
$self
->guess_alphabet(
$_
);
my
$tmplen
= (
tell
$fh
) -
length
(
$_
);
$length
=
$tmplen
-
$pos
;
unless
(
defined
(
$id
)) {
$self
->throw(
"No id defined for sequence"
);
}
unless
(
defined
(
$fileid
)) {
$self
->throw(
"No fileid defined for file $file"
);
}
unless
(
defined
(
$pos
)) {
$self
->throw(
"No position defined for "
.
$id
.
"\n"
);
}
unless
(
defined
(
$length
)) {
$self
->throw(
"No length defined for "
.
$id
.
"\n"
);
}
$self
->_add_id_position(
$id
,
$pos
,
$fileid
,
$length
,\
%secondary_id
);
$pos
=
$tmplen
;
if
(
$count
> 0 &&
$count
%1000 == 0) {
$self
->debug(
"Indexed $count ids\n"
)
if
$v
> 0;
}
$count
++;
}
else
{
$done
= 0;
}
}
if
(
$_
=~ /
$primary
/) {
$new_primary_entry
= $1;
}
my
$secondary_patterns
=
$self
->secondary_patterns;
foreach
my
$sec
(
@secondary_names
) {
my
$pattern
=
$secondary_patterns
->{
$sec
};
if
(
$_
=~ /
$pattern
/) {
$secondary_id
{
$sec
} = $1;
}
}
}
$id
=
$new_primary_entry
;
my
$tmplen
= (
tell
$fh
) -
length
(
$last_one
);
$length
=
$tmplen
-
$pos
;
if
(!
defined
(
$id
)) {
$self
->throw(
"No id defined for sequence"
);
}
if
(!
defined
(
$fileid
)) {
$self
->throw(
"No fileid defined for file $file"
);
}
if
(!
defined
(
$pos
)) {
$self
->throw(
"No position defined for "
.
$id
.
"\n"
);
}
if
(!
defined
(
$length
)) {
$self
->throw(
"No length defined for "
.
$id
.
"\n"
);
}
$self
->_add_id_position(
$id
,
$pos
,
$fileid
,
$length
,\
%secondary_id
);
$count
++;
close
(FILE);
$count
;
}
Hide Show 12 lines of Pod
sub
write_primary_index {
my
(
$self
) =
@_
;
my
@ids
=
keys
%{
$self
->{_id}};
@ids
=
sort
{
$a
cmp
$b
}
@ids
;
open
(
my
$INDEX
,
">"
.
$self
->primary_index_file) ||
$self
->throw(
"Can't open primary index file ["
.
$self
->primary_index_file .
"]"
);
my
$recordlength
=
$self
->{_maxidlength} +
$self
->{_maxfileidlength} +
$self
->{_maxposlength} +
$self
->{_maxlengthlength} + 3;
print
$INDEX
sprintf
(
"%4d"
,
$recordlength
);
foreach
my
$id
(
@ids
) {
if
(!
defined
(
$self
->{_id}{
$id
}{_fileid})) {
$self
->throw(
"No fileid for $id\n"
);
}
if
(!
defined
(
$self
->{_id}{
$id
}{_pos})) {
$self
->throw(
"No position for $id\n"
);
}
if
(!
defined
(
$self
->{_id}{
$id
}{_length})) {
$self
->throw(
"No length for $id"
);
}
my
$record
=
$id
.
"\t"
.
$self
->{_id}{
$id
}{_fileid} .
"\t"
.
$self
->{_id}{
$id
}{_pos} .
"\t"
.
$self
->{_id}{
$id
}{_length};
print
$INDEX
sprintf
(
"%-${recordlength}s"
,
$record
);
}
}
Hide Show 12 lines of Pod
sub
write_secondary_indices {
my
(
$self
) =
@_
;
my
@names
=
keys
(%{
$self
->{_secondary_id}});
foreach
my
$name
(
@names
) {
my
@seconds
=
keys
%{
$self
->{_secondary_id}{
$name
}};
my
$length
= 0;
foreach
my
$second
(
@seconds
) {
my
$tmplen
=
length
(
$second
) + 1;
my
@prims
=
keys
%{
$self
->{_secondary_id}{
$name
}{
$second
}};
foreach
my
$prim
(
@prims
) {
my
$recordlen
=
$tmplen
+
length
(
$prim
);
if
(
$recordlen
>
$length
) {
$length
=
$recordlen
;
}
}
}
my
$fh
=
$self
->new_secondary_filehandle(
$name
);
print
$fh
sprintf
(
"%4d"
,
$length
);
@seconds
=
sort
@seconds
;
foreach
my
$second
(
@seconds
) {
my
@prims
=
keys
%{
$self
->{_secondary_id}{
$name
}{
$second
}};
my
$tmp
=
$second
;
foreach
my
$prim
(
@prims
) {
my
$record
=
$tmp
.
"\t"
.
$prim
;
if
(
length
(
$record
) >
$length
) {
$self
->throw(
"Something has gone horribly wrong - length of record is more than we thought [$length]\n"
);
}
else
{
print
$fh
sprintf
(
"%-${length}s"
,
$record
);
print
$fh
sprintf
(
"%-${length}s"
,
$record
);
}
}
}
close
(
$fh
);
}
}
Hide Show 12 lines of Pod
sub
new_secondary_filehandle {
my
(
$self
,
$name
) =
@_
;
my
$indexdir
=
$self
->_config_path;
my
$secindex
= Bio::Root::IO->catfile(
$indexdir
,
"id_$name.index"
);
open
(
my
$fh
,
">"
,
$secindex
) ||
$self
->throw($!);
return
$fh
;
}
Hide Show 12 lines of Pod
sub
open_secondary_index {
my
(
$self
,
$name
) =
@_
;
if
(!
defined
(
$self
->{_secondary_filehandle}{
$name
})) {
my
$indexdir
=
$self
->_config_path;
my
$secindex
=
$indexdir
.
"/id_$name.index"
;
if
(! -e
$secindex
) {
$self
->throw(
"Index is not present for namespace [$name]\n"
);
}
open
(
my
$newfh
,
"<"
,
$secindex
) ||
$self
->throw($!);
my
$reclen
=
$self
->read_header(
$newfh
);
$self
->{_secondary_filehandle} {
$name
} =
$newfh
;
$self
->{_secondary_record_size}{
$name
} =
$reclen
;
}
return
$self
->{_secondary_filehandle}{
$name
};
}
Hide Show 12 lines of Pod
sub
_add_id_position {
my
(
$self
,
$id
,
$pos
,
$fileid
,
$length
,
$secondary_id
) =
@_
;
if
(!
defined
(
$id
)) {
$self
->throw(
"No id defined. Can't add id position"
);
}
if
(!
defined
(
$pos
)) {
$self
->throw(
"No position defined. Can't add id position"
);
}
if
( !
defined
(
$fileid
)) {
$self
->throw(
"No fileid defined. Can't add id position"
);
}
if
(!
defined
(
$length
) ||
$length
<= 0) {
$self
->throw(
"No length defined or <= 0 [$length]. Can't add id position"
);
}
$self
->{_id}{
$id
}{_pos} =
$pos
;
$self
->{_id}{
$id
}{_length} =
$length
;
$self
->{_id}{
$id
}{_fileid} =
$fileid
;
foreach
my
$sec
(
keys
(
%$secondary_id
)) {
my
$value
=
$secondary_id
->{
$sec
};
$self
->{_secondary_id}{
$sec
}{
$value
}{
$id
} = 1;
}
$self
->{_maxidlength} =
length
(
$id
)
if
!
exists
$self
->{_maxidlength} or
length
(
$id
) >=
$self
->{_maxidlength};
$self
->{_maxfileidlength} =
length
(
$fileid
)
if
!
exists
$self
->{_maxfileidlength} or
length
(
$fileid
) >=
$self
->{_maxfileidlength};
$self
->{_maxposlength} =
length
(
$pos
)
if
!
exists
$self
->{_maxposlength} or
length
(
$pos
) >=
$self
->{_maxposlength};
$self
->{_maxlengthlength} =
length
(
$length
)
if
!
exists
$self
->{_maxlengthlength} or
length
(
$length
) >=
$self
->{_maxlengthlength};
}
Hide Show 11 lines of Pod
sub
make_config_file {
my
(
$self
,
$files
) =
@_
;
my
@files
=
@$files
;
my
$configfile
=
$self
->_config_file;
open
(
my
$CON
,
">"
,
$configfile
) ||
$self
->throw(
"Can't create config file [$configfile]"
);
print
$CON
"index\tflat/1\n"
;
my
$count
= 0;
foreach
my
$file
(
@files
) {
my
$size
= -s
$file
;
print
$CON
"fileid_$count\t$file\t$size\n"
;
my
$fh
;
open
(
$fh
,
"<"
,
$file
) ||
$self
->throw($!);
$self
->{_fileid}{
$count
} =
$fh
;
$self
->{_file} {
$count
} =
$file
;
$self
->{_dbfile}{
$file
} =
$count
;
$self
->{_size}{
$count
} =
$size
;
$count
++;
}
print
$CON
"primary_namespace\t"
.
$self
->primary_namespace.
"\n"
;
my
$second_patterns
=
$self
->secondary_patterns;
my
@second
=
keys
%$second_patterns
;
if
((
@second
)) {
print
$CON
"secondary_namespaces"
;
foreach
my
$second
(
@second
) {
print
$CON
"\t$second"
;
}
print
$CON
"\n"
;
}
unless
(
defined
(
$self
->
format
) ) {
$self
->throw(
"Format does not exist in module - can't write config file"
);
}
else
{
my
$format
=
$self
->
format
;
my
$alphabet
=
$self
->alphabet;
my
$alpha
=
$alphabet
?
"/$alphabet"
:
''
;
print
$CON
"format\t"
.
"$format\n"
;
}
close
(
$CON
);
}
Hide Show 11 lines of Pod
sub
read_config_file {
my
(
$self
) =
@_
;
my
$configfile
=
$self
->_config_file;
return
unless
-e
$configfile
;
open
(
my
$CON
,
"<"
,
$configfile
) ||
$self
->throw(
"Can't open configfile [$configfile]"
);
my
$line
= <
$CON
>;
chomp
(
$line
);
my
$version
;
if
(
$line
=~ m{
index
\tflat/(\d+)}) {
$version
= $1;
}
else
{
$self
->throw(
"First line not compatible with flat file index. Should be something like\n\nindex\tflat/1"
);
}
$self
->index_type(
"flat"
);
$self
->index_version(
$version
);
while
(<
$CON
>) {
chomp
;
if
(
$_
=~ /^fileid_(\d+)\t(\S+)\t(\d+)/) {
my
$fileid
= $1;
my
$filename
= $2;
my
$filesize
= $3;
if
(! -e
$filename
) {
$self
->throw(
"File [$filename] does not exist!"
);
}
if
(-s
$filename
!=
$filesize
) {
$self
->throw(
"Flatfile size for $filename differs from what the index thinks it is. Real size ["
. (-s
$filename
) .
"] Index thinks it is ["
.
$filesize
.
"]"
);
}
my
$fh
;
open
(
$fh
,
"<"
,
$filename
) ||
$self
->throw($!);
close
$fh
;
$self
->{_fileid}{
$fileid
} =
$fh
;
$self
->{_file} {
$fileid
} =
$filename
;
$self
->{_dbfile}{
$filename
} =
$fileid
;
$self
->{_size} {
$fileid
} =
$filesize
;
}
if
( /(.*)_namespaces?\t(.+)/ ) {
if
($1 eq
"primary"
) {
$self
->primary_namespace($2);
}
elsif
($1 eq
"secondary"
) {
$self
->secondary_namespaces(
split
"\t"
,$2);
}
else
{
$self
->throw(
"Unknown namespace name in config file [$1"
);
}
}
if
(
$_
=~ /
format
\t(\S+)/) {
my
$format
= $1;
if
(
$format
=~ /^URN:LSID:
open
-bio\.org:(\w+)(?:\/(\w+))?/) {
$self
->
format
($1);
$self
->alphabet($2);
}
else
{
$self
->
format
($1);
}
}
}
close
(
$CON
);
my
@fileid_keys
=
keys
(%{
$self
->{_fileid}});
if
(!(
@fileid_keys
)) {
$self
->throw(
"No flatfile fileid files in config - check the index has been made correctly"
);
}
if
(!
defined
(
$self
->primary_namespace)) {
$self
->throw(
"No primary namespace exists"
);
}
if
(! -e
$self
->primary_index_file) {
$self
->throw(
"Primary index file ["
.
$self
->primary_index_file .
"] doesn't exist"
);
}
1;
}
Hide Show 11 lines of Pod
sub
get_fileid_by_filename {
my
(
$self
,
$file
) =
@_
;
if
(!
defined
(
$self
->{_dbfile})) {
$self
->throw(
"No file to fileid mapping present. Has the fileid file been read?"
);
}
return
$self
->{_dbfile}{
$file
};
}
Hide Show 11 lines of Pod
sub
get_filehandle_by_fileid {
my
(
$self
,
$fileid
) =
@_
;
if
(!
defined
(
$self
->{_fileid}{
$fileid
})) {
$self
->throw(
"ERROR: undefined fileid in index [$fileid]"
);
}
return
$self
->{_fileid}{
$fileid
};
}
Hide Show 12 lines of Pod
sub
primary_index_file {
my
(
$self
) =
@_
;
return
Bio::Root::IO->catfile(
$self
->_config_path,
"key_"
.
$self
->primary_namespace .
".key"
);
}
Hide Show 12 lines of Pod
sub
primary_index_filehandle {
my
(
$self
) =
@_
;
unless
(
defined
(
$self
->{
'_primary_index_handle'
})) {
open
(
$self
->{
'_primary_index_handle'
},
"<"
.
$self
->primary_index_file) || self->throw($@);
}
return
$self
->{
'_primary_index_handle'
};
}
Hide Show 12 lines of Pod
sub
format
{
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'format'
} =
$value
;
}
return
$obj
->{
'format'
};
}
sub
alphabet{
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{alphabet} =
$value
;
}
return
$obj
->{alphabet};
}
Hide Show 12 lines of Pod
sub
write_flag{
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'write_flag'
} =
$value
;
}
return
$obj
->{
'write_flag'
};
}
Hide Show 11 lines of Pod
sub
dbname {
my
$self
=
shift
;
my
$d
=
$self
->{flat_dbname};
$self
->{flat_dbname} =
shift
if
@_
;
$d
;
}
Hide Show 12 lines of Pod
sub
index_directory {
my
(
$self
,
$arg
) =
@_
;
if
(
defined
(
$arg
)) {
if
(
$arg
!~ m{/$}) {
$arg
.=
"/"
;
}
$self
->{_index_directory} =
$arg
;
}
return
$self
->{_index_directory};
}
sub
_config_path {
my
$self
=
shift
;
my
$root
=
$self
->index_directory;
my
$dbname
=
$self
->dbname;
Bio::Root::IO->catfile(
$root
,
$dbname
);
}
sub
_config_file {
my
$self
=
shift
;
my
$path
=
$self
->_config_path;
Bio::Root::IO->catfile(
$path
,CONFIG_FILE_NAME);
}
Hide Show 12 lines of Pod
sub
record_size {
my
$self
=
shift
;
$self
->{_record_size} =
shift
if
@_
;
return
$self
->{_record_size};
}
Hide Show 11 lines of Pod
sub
primary_namespace {
my
$self
=
shift
;
$self
->{_primary_namespace} =
shift
if
@_
;
return
$self
->{_primary_namespace};
}
Hide Show 12 lines of Pod
sub
index_type {
my
$self
=
shift
;
$self
->{_index_type} =
shift
if
@_
;
return
$self
->{_index_type};
}
Hide Show 12 lines of Pod
sub
index_version {
my
$self
=
shift
;
$self
->{_index_version} =
shift
if
@_
;
return
$self
->{_index_version};
}
Hide Show 12 lines of Pod
sub
primary_pattern{
my
$obj
=
shift
;
$obj
->{
'primary_pattern'
} =
shift
if
@_
;
return
$obj
->{
'primary_pattern'
};
}
Hide Show 12 lines of Pod
sub
start_pattern{
my
$obj
=
shift
;
$obj
->{
'start_pattern'
} =
shift
if
@_
;
return
$obj
->{
'start_pattern'
};
}
Hide Show 12 lines of Pod
sub
secondary_patterns{
my
(
$obj
,
$value
) =
@_
;
if
(
defined
$value
) {
$obj
->{
'secondary_patterns'
} =
$value
;
my
@names
=
keys
%$value
;
foreach
my
$name
(
@names
) {
$obj
->secondary_namespaces(
$name
);
}
}
return
$obj
->{
'secondary_patterns'
};
}
Hide Show 12 lines of Pod
sub
secondary_namespaces {
my
(
$obj
,
@values
) =
@_
;
if
(
@values
) {
push
(@{
$obj
->{
'secondary_namespaces'
}},
@values
);
}
return
@{
$obj
->{
'secondary_namespaces'
} || []};
}
sub
new_SWISSPROT_index {
my
(
$self
,
$index_dir
,
@files
) =
@_
;
my
%secondary_patterns
;
my
$start_pattern
=
"^ID (\\S+)"
;
my
$primary_pattern
=
"^AC (\\S+)\\;"
;
$secondary_patterns
{
"ID"
} =
$start_pattern
;
my
$index
= new Bio::DB::Flat::BinarySearch
(
-index_dir
=>
$index_dir
,
-format
=>
'swissprot'
,
-primary_pattern
=>
$primary_pattern
,
-primary_namespace
=>
"ACC"
,
-start_pattern
=>
$start_pattern
,
-secondary_patterns
=> \
%secondary_patterns
);
$index
->build_index(
@files
);
}
sub
new_EMBL_index {
my
(
$self
,
$index_dir
,
@files
) =
@_
;
my
%secondary_patterns
;
my
$start_pattern
=
"^ID (\\S+)"
;
my
$primary_pattern
=
"^AC (\\S+)\\;"
;
my
$primary_namespace
=
"ACC"
;
$secondary_patterns
{
"ID"
} =
$start_pattern
;
my
$index
= new Bio::DB::Flat::BinarySearch
(
-index_dir
=>
$index_dir
,
-format
=>
'embl'
,
-primary_pattern
=>
$primary_pattern
,
-primary_namespace
=>
"ACC"
,
-start_pattern
=>
$start_pattern
,
-secondary_patterns
=> \
%secondary_patterns
);
$index
->build_index(
@files
);
return
$index
;
}
sub
new_FASTA_index {
my
(
$self
,
$index_dir
,
@files
) =
@_
;
my
%secondary_patterns
;
my
$start_pattern
=
"^>"
;
my
$primary_pattern
=
"^>(\\S+)"
;
my
$primary_namespace
=
"ACC"
;
$secondary_patterns
{
"ID"
} =
"^>\\S+ +(\\S+)"
;
my
$index
= new Bio::DB::Flat::BinarySearch
(
-index_dir
=>
$index_dir
,
-format
=>
'fasta'
,
-primary_pattern
=>
$primary_pattern
,
-primary_namespace
=>
"ACC"
,
-start_pattern
=>
$start_pattern
,
-secondary_patterns
=> \
%secondary_patterns
);
$index
->build_index(
@files
);
return
$index
;
}
sub
guess_alphabet {
my
$self
=
shift
;
my
$line
=
shift
;
my
$format
=
$self
->
format
;
return
'protein'
if
$format
eq
'swissprot'
;
if
(
$format
eq
'genbank'
) {
return
unless
$line
=~ /^LOCUS/;
return
'dna'
if
$line
=~ /\s+\d+\s+bp/i;
return
'protein'
;
}
if
(
$format
eq
'embl'
) {
return
unless
$line
=~ /^ID/;
return
'dna'
if
$line
=~ / DNA;/i;
return
'rna'
if
$line
=~ / RNA;/i;
return
'protein'
;
}
return
;
}
sub
_guess_patterns {
my
$self
=
shift
;
my
$format
=
shift
;
if
(
$format
=~ /swiss(prot)?/i) {
return
(
'ID'
,
"^ID (\\S+)"
,
"^ID (\\S+)"
,
{
ACC
=>
"^AC (\\S+);"
});
}
if
(
$format
=~ /embl/i) {
return
(
'ID'
,
"^ID (\\S+)"
,
"^ID (\\S+)"
,
{
ACC
=>
q/^AC (\S+);/
,
VERSION
=>
q/^SV\s+(\S+)/
});
}
if
(
$format
=~ /genbank/i) {
return
(
'ID'
,
q/^LOCUS\s+(\S+)/
,
q/^LOCUS/
,
{
ACC
=>
q/^ACCESSION\s+(\S+)/
,
VERSION
=>
q/^VERSION\s+(\S+)/
});
}
if
(
$format
=~ /fasta/i) {
return
(
'ACC'
,
'^>(\S+)'
,
'^>(\S+)'
,
);
}
$self
->throw(
"I can't handle format $format"
);
}
1;