use
vars
qw($DEFAULT_INDEX_DIR $DEFAULT_NODE_INDEX $DEFAULT_NAME2ID_INDEX $DEFAULT_ID2NAME_INDEX
$NCBI_TAXONOMY_HOSTNAME $DEFAULT_PARENT_INDEX
$NCBI_TAXONOMY_FILE @DIVISIONS)
;
$DEFAULT_INDEX_DIR
=
'/tmp'
;
$DEFAULT_NODE_INDEX
=
'nodes'
;
$DEFAULT_NAME2ID_INDEX
=
'names2id'
;
$DEFAULT_ID2NAME_INDEX
=
'id2names'
;
$DEFAULT_PARENT_INDEX
=
'parents'
;
$NCBI_TAXONOMY_HOSTNAME
=
'ftp.ncbi.nih.gov'
;
$NCBI_TAXONOMY_FILE
=
'/pub/taxonomy/taxdump.tar.gz'
;
$DB_BTREE
->{
'flags'
} = R_DUP;
@DIVISIONS
= ([
qw(BCT Bacteria)
],
[
qw(INV Invertebrates)
],
[
qw(MAM Mammals)
],
[
qw(PHG Phages)
],
[
qw(PLN Plants)
],
[
qw(PRI Primates)
],
[
qw(ROD Rodents)
],
[
qw(SYN Synthetic)
],
[
qw(UNA Unassigned)
],
[
qw(VRL Viruses)
],
[
qw(VRT Vertebrates)
],
[
qw(ENV 'Environmental samples')
]);
sub
new {
my
(
$class
,
@args
) =
@_
;
my
$self
=
$class
->SUPER::new(
@args
);
my
(
$dir
,
$nodesfile
,
$namesfile
,
$force
) =
$self
->_rearrange([
qw
(DIRECTORY NODESFILE NAMESFILE FORCE)], @args);
$self
->index_directory(
$dir
||
$DEFAULT_INDEX_DIR
);
if
(
$nodesfile
) {
$self
->_build_index(
$nodesfile
,
$namesfile
,
$force
);
}
$self
->_db_connect;
return
$self
;
}
sub
get_taxon {
my
(
$self
) =
shift
;
my
(
$taxonid
,
$name
);
if
(
@_
> 1) {
(
$taxonid
,
$name
) =
$self
->_rearrange([
qw(TAXONID NAME)
],
@_
);
if
(
$name
) {
(
$taxonid
,
my
@others
) =
$self
->get_taxonids(
$name
);
$self
->
warn
(
"There were multiple ids ($taxonid @others) matching '$name', using '$taxonid'"
)
if
@others
> 0;
}
}
else
{
$taxonid
=
shift
;
}
return
unless
$taxonid
;
$taxonid
=~ /^\d+$/ ||
return
;
my
$node
=
$self
->{
'_nodes'
}->[
$taxonid
] ||
return
;
length
(
$node
) ||
return
;
my
(
$taxid
,
undef
,
$rank
,
$code
,
$divid
,
$gen_code
,
$mito
) =
split
(SEPARATOR,
$node
);
last
unless
defined
$taxid
;
my
(
$taxon_names
) =
$self
->{
'_id2name'
}->[
$taxid
];
my
(
$sci_name
,
@common_names
) =
split
(SEPARATOR,
$taxon_names
);
my
$taxon
= FAST::Bio::Taxon->new(
-name
=>
$sci_name
,
-common_names
=> [
@common_names
],
-ncbi_taxid
=>
$taxid
,
-rank
=>
$rank
,
-division
=>
$DIVISIONS
[
$divid
]->[1],
-genetic_code
=>
$gen_code
,
-mito_genetic_code
=>
$mito
);
$taxon
->{
'db_handle'
} =
$self
;
$self
->_handle_internal_id(
$taxon
);
return
$taxon
;
}
*get_Taxonomy_Node
= \
&get_taxon
;
sub
get_taxonids {
my
(
$self
,
$query
) =
@_
;
my
$ids
=
$self
->{
'_name2id'
}->{
lc
(
$query
)};
unless
(
$ids
) {
if
(
$query
=~ /_/) {
$query
=~ s/_/ /g;
$ids
=
$self
->{
'_name2id'
}->{
lc
(
$query
)};
}
$ids
||
return
;
}
my
@ids
=
split
(SEPARATOR,
$ids
);
return
wantarray
() ?
@ids
:
shift
@ids
;
}
*get_taxonid
= \
&get_taxonids
;
sub
get_Children_Taxids {
my
(
$self
,
$node
) =
@_
;
$self
->
warn
(
"get_Children_Taxids is deprecated, use each_Descendent instead"
);
my
$id
;
if
(
ref
(
$node
) ) {
if
(
$node
->can(
'object_id'
) ) {
$id
=
$node
->object_id;
}
elsif
(
$node
->can(
'ncbi_taxid'
) ) {
$id
=
$node
->ncbi_taxid;
}
else
{
$self
->
warn
(
"Don't know how to extract a taxon id from the object of type "
.
ref
(
$node
).
"\n"
);
return
;
}
}
else
{
$id
=
$node
}
my
@vals
=
$self
->{
'_parentbtree'
}->get_dup(
$id
);
return
@vals
;
}
sub
ancestor {
my
(
$self
,
$taxon
) =
@_
;
$self
->throw(
"Must supply a FAST::Bio::Taxon"
)
unless
ref
(
$taxon
) &&
$taxon
->isa(
'FAST::Bio::Taxon'
);
$self
->throw(
"The supplied Taxon must belong to this database"
)
unless
$taxon
->db_handle &&
$taxon
->db_handle eq
$self
;
my
$id
=
$taxon
->id ||
$self
->throw(
"The supplied Taxon is missing its id!"
);
my
$node
=
$self
->{
'_nodes'
}->[
$id
];
if
(
length
(
$node
)) {
my
(
undef
,
$parent_id
) =
split
(SEPARATOR,
$node
);
$parent_id
||
return
;
$parent_id
eq
$id
&&
return
;
return
$self
->get_taxon(
$parent_id
);
}
return
;
}
sub
each_Descendent {
my
(
$self
,
$taxon
) =
@_
;
$self
->throw(
"Must supply a FAST::Bio::Taxon"
)
unless
ref
(
$taxon
) &&
$taxon
->isa(
'FAST::Bio::Taxon'
);
$self
->throw(
"The supplied Taxon must belong to this database"
)
unless
$taxon
->db_handle &&
$taxon
->db_handle eq
$self
;
my
$id
=
$taxon
->id ||
$self
->throw(
"The supplied Taxon is missing its id!"
);
my
@desc_ids
=
$self
->{
'_parentbtree'
}->get_dup(
$id
);
my
@descs
;
foreach
my
$desc_id
(
@desc_ids
) {
push
(
@descs
,
$self
->get_taxon(
$desc_id
) ||
next
);
}
return
@descs
;
}
sub
_build_index {
my
(
$self
,
$nodesfile
,
$namesfile
,
$force
) =
@_
;
my
(
$dir
) = (
$self
->index_directory);
my
$nodeindex
=
"$dir/$DEFAULT_NODE_INDEX"
;
my
$name2idindex
=
"$dir/$DEFAULT_NAME2ID_INDEX"
;
my
$id2nameindex
=
"$dir/$DEFAULT_ID2NAME_INDEX"
;
my
$parent2childindex
=
"$dir/$DEFAULT_PARENT_INDEX"
;
$self
->{
'_nodes'
} = [];
$self
->{
'_id2name'
} = [];
$self
->{
'_name2id'
} = {};
$self
->{
'_parent2children'
} = {};
if
(! -e
$nodeindex
||
$force
) {
my
(
%parent2children
,
@nodes
);
open
(NODES,
$nodesfile
) ||
$self
->throw(
"Cannot open node file '$nodesfile' for reading"
);
unlink
$nodeindex
;
unlink
$parent2childindex
;
my
$nh
=
tie
(
@nodes
,
'DB_File'
,
$nodeindex
, O_RDWR|O_CREAT, 0644,
$DB_RECNO
) ||
$self
->throw(
"Cannot open file '$nodeindex': $!"
);
my
$btree
=
tie
(
%parent2children
,
'DB_File'
,
$parent2childindex
, O_RDWR|O_CREAT, 0644,
$DB_BTREE
) ||
$self
->throw(
"Cannot open file '$parent2childindex': $!"
);
while
(<NODES>) {
chomp
;
my
(
$taxid
,
$parent
,
$rank
,
$code
,
$divid
,
undef
,
$gen_code
,
undef
,
$mito
) =
split
(/\t\|\t/,
$_
);
next
if
$taxid
== 1;
if
(
$parent
== 1) {
$parent
=
$taxid
;
}
$nodes
[
$taxid
] =
join
(SEPARATOR, (
$taxid
,
$parent
,
$rank
,
$code
,
$divid
,
$gen_code
,
$mito
));
$btree
->put(
$parent
,
$taxid
);
}
close
(NODES);
$nh
=
$btree
=
undef
;
untie
@nodes
;
untie
%parent2children
;
}
if
((! -e
$name2idindex
|| -z
$name2idindex
) || (! -e
$id2nameindex
|| -z
$id2nameindex
) ||
$force
) {
open
(NAMES,
$namesfile
) ||
$self
->throw(
"Cannot open names file '$namesfile' for reading"
);
unlink
$name2idindex
;
unlink
$id2nameindex
;
my
(
@id2name
,
%name2id
);
my
$idh
=
tie
(
@id2name
,
'DB_File'
,
$id2nameindex
, O_RDWR|O_CREAT, 0644,
$DB_RECNO
) ||
$self
->throw(
"Cannot open file '$id2nameindex': $!"
);
my
$nameh
=
tie
(
%name2id
,
'DB_File'
,
$name2idindex
, O_RDWR|O_CREAT, 0644,
$DB_HASH
) ||
$self
->throw(
"Cannot open file '$name2idindex': $!"
);
while
(<NAMES>) {
chomp
;
my
(
$taxid
,
$name
,
$unique_name
,
$class
) =
split
(/\t\|\t/,
$_
);
next
if
$taxid
== 1;
$class
=~ s/\s+\|\s*$//;
my
$lc_name
=
lc
(
$name
);
my
$orig_name
=
$name
;
if
(
$lc_name
=~ /\(class\)$/) {
$name2id
{
$lc_name
} =
$taxid
;
$name
=~ s/\s+\(class\)$//;
$lc_name
=
lc
(
$name
);
}
my
$taxids
=
$name2id
{
$lc_name
} ||
''
;
my
%taxids
=
map
{
$_
=> 1 }
split
(SEPARATOR,
$taxids
);
unless
(
exists
$taxids
{
$taxid
}) {
$taxids
{
$taxid
} = 1;
$name2id
{
$lc_name
} =
join
(SEPARATOR,
keys
%taxids
);
}
if
(
$unique_name
) {
$name2id
{
lc
(
$unique_name
)} =
$taxid
;
}
my
$names
=
$id2name
[
$taxid
] ||
''
;
my
@names
=
split
(SEPARATOR,
$names
);
if
(
$class
&&
$class
eq
'scientific name'
) {
unshift
(
@names
,
$name
);
push
(
@names
,
$orig_name
)
if
(
$orig_name
ne
$name
);
push
(
@names
,
$unique_name
)
if
$unique_name
;
}
else
{
push
(
@names
,
$name
);
push
(
@names
,
$orig_name
)
if
(
$orig_name
ne
$name
);
push
(
@names
,
$unique_name
)
if
$unique_name
;
}
$id2name
[
$taxid
] =
join
(SEPARATOR,
@names
);
}
close
(NAMES);
$idh
=
$nameh
=
undef
;
untie
(
%name2id
);
untie
(
@id2name
);
}
}
sub
_db_connect {
my
$self
=
shift
;
return
if
$self
->{
'_initialized'
};
$self
->{
'_nodes'
} = [];
$self
->{
'_id2name'
} = [];
$self
->{
'_name2id'
} = {};
my
(
$dir
) = (
$self
->index_directory);
my
$nodeindex
=
"$dir/$DEFAULT_NODE_INDEX"
;
my
$name2idindex
=
"$dir/$DEFAULT_NAME2ID_INDEX"
;
my
$id2nameindex
=
"$dir/$DEFAULT_ID2NAME_INDEX"
;
my
$parent2childindex
=
"$dir/$DEFAULT_PARENT_INDEX"
;
if
( ! -e
$nodeindex
||
! -e
$name2idindex
||
! -e
$id2nameindex
) {
$self
->
warn
(
"Index files have not been created"
);
return
0;
}
tie
( @{
$self
->{
'_nodes'
}},
'DB_File'
,
$nodeindex
, O_RDWR,
undef
,
$DB_RECNO
)
||
$self
->throw(
"$! $nodeindex"
);
tie
(@{
$self
->{
'_id2name'
}},
'DB_File'
,
$id2nameindex
,O_RDWR,
undef
,
$DB_RECNO
) ||
$self
->throw(
"$! $id2nameindex"
);
tie
( %{
$self
->{
'_name2id'
}},
'DB_File'
,
$name2idindex
, O_RDWR,
undef
,
$DB_HASH
) ||
$self
->throw(
"$! $name2idindex"
);
$self
->{
'_parentbtree'
} =
tie
( %{
$self
->{
'_parent2children'
}},
'DB_File'
,
$parent2childindex
,
O_RDWR, 0644,
$DB_BTREE
);
$self
->{
'_initialized'
} = 1;
}
sub
index_directory {
my
$self
=
shift
;
return
$self
->{
'index_directory'
} =
shift
if
@_
;
return
$self
->{
'index_directory'
};
}
1;