use
constant
MAX_SEGMENT
=> 1_000_000_000;
sub
new {
my
$class
=
shift
;
my
(
$file
,
$fasta
,
$dbdir
,
$preferred_groups
) = rearrange([
[
qw(GFF FILE)
],
'FASTA'
,
[
qw(DSN DB DIR DIRECTORY)
],
'PREFERRED_GROUPS'
,
],
@_
);
my
$self
=
bless
{
data
=> [] },
$class
;
$self
->preferred_groups(
$preferred_groups
)
if
defined
$preferred_groups
;
$file
||=
$dbdir
;
$fasta
||=
$dbdir
;
$self
->load_gff(
$file
)
if
$file
;
$self
->load_or_store_fasta(
$fasta
)
if
$fasta
;
return
$self
;
}
sub
load_or_store_fasta {
my
$self
=
shift
;
my
$fasta
=
shift
;
if
((-f
$fasta
&& -w dirname(
$fasta
))
or
(-d
$fasta
&& -w
$fasta
)) {
my
$dna_db
=
eval
{Bio::DB::Fasta->new(
$fasta
);}
or
warn
"$@\nCan't open sequence file(s). Use -gff instead of -dir if you wish to load features without sequence.\n"
;
$dna_db
&&
$self
->dna_db(
$dna_db
);
}
else
{
$self
->load_fasta(
$fasta
);
}
}
sub
dna_db {
my
$self
=
shift
;
my
$d
=
$self
->{dna_db};
$self
->{dna_db} =
shift
if
@_
;
$d
;
}
sub
insert_sequence {
my
$self
=
shift
;
my
(
$id
,
$offset
,
$seq
) =
@_
;
$self
->{dna}{
$id
} .=
$seq
;
}
sub
get_dna {
my
$self
=
shift
;
my
(
$id
,
$start
,
$stop
,
$class
) =
@_
;
if
(
my
$dna_db
=
$self
->dna_db) {
return
$dna_db
->seq(
$id
,
$start
=>
$stop
);
}
return
''
unless
$self
->{dna};
return
$self
->{dna}{
$id
}
unless
defined
$start
||
defined
$stop
;
$start
= 1
if
!
defined
$start
;
my
$reversed
= 0;
if
(
$start
>
$stop
) {
$reversed
++;
(
$start
,
$stop
) = (
$stop
,
$start
);
}
my
$dna
=
substr
(
$self
->{dna}{
$id
},
$start
-1,
$stop
-
$start
+1);
if
(
$reversed
) {
$dna
=~
tr
/gatcGATC/ctagCTAG/;
$dna
=
reverse
$dna
;
}
$dna
;
}
sub
setup_load {
my
$self
=
shift
;
$self
->{tmp} = {};
$self
->{data} = [];
1;
}
sub
finish_load {
my
$self
=
shift
;
my
$idx
= 0;
foreach
my
$arrayref
(
values
%{
$self
->{tmp}}) {
foreach
(
@$arrayref
) {
$_
->{feature_id} =
$idx
++; }
push
@{
$self
->{data}},
@$arrayref
;
}
1;
}
sub
load_gff_line {
my
$self
=
shift
;
my
$feature_hash
=
shift
;
$feature_hash
->{strand} =
''
if
$feature_hash
->{strand} &&
$feature_hash
->{strand} eq
'.'
;
$feature_hash
->{phase} =
''
if
$feature_hash
->{phase} &&
$feature_hash
->{phase} eq
'.'
;
$feature_hash
->{gclass} =
'Sequence'
unless
length
$feature_hash
->{gclass} > 0;
push
@{
$self
->{tmp}{
$feature_hash
->{gclass},
$feature_hash
->{gname}}},
$feature_hash
;
}
sub
get_abscoords {
my
$self
=
shift
;
my
(
$name
,
$class
,
$refseq
) =
@_
;
my
%refs
;
my
$regexp
;
if
(
$name
=~ /[*?]/) {
$name
=
quotemeta
(
$name
);
$name
=~ s/\\\*/.*/g;
$name
=~ s/\\\?/.?/g;
$regexp
++;
}
for
my
$feature
(@{
$self
->{data}}) {
my
$no_match_class_name
;
my
$empty_class_name
;
my
$class_matches
= !
defined
(
$feature
->{gclass}) ||
length
(
$feature
->{gclass}) == 0 ||
$feature
->{gclass} eq
$class
;
if
(
defined
$feature
->{gname}) {
my
$matches
=
$class_matches
&& (
$regexp
?
$feature
->{gname} =~ /
$name
/i :
lc
(
$feature
->{gname}) eq
lc
(
$name
));
$no_match_class_name
= !
$matches
;
}
else
{
$empty_class_name
= 1;
}
if
(
$no_match_class_name
){
my
$feature_attributes
=
$feature
->{attributes};
my
$attributes
= {
Alias
=>
$name
};
if
(!
$self
->_matching_attributes(
$feature_attributes
,
$attributes
)){
next
;
}
}
push
@{
$refs
{
$feature
->{
ref
}}},
$feature
;
}
if
(!
%refs
) {
$self
->error(
"$name not found in database"
);
return
;
}
my
(
$ref
) =
keys
%refs
;
my
@found
= @{
$refs
{
$ref
}};
my
(
$strand
,
$start
,
$stop
);
my
@found_segments
;
foreach
my
$ref
(
keys
%refs
) {
next
if
defined
(
$refseq
) and
lc
(
$ref
) ne
lc
(
$refseq
);
my
@found
= @{
$refs
{
$ref
}};
my
(
$strand
,
$start
,
$stop
,
$name
);
foreach
(
@found
) {
$strand
||=
$_
->{strand};
$strand
=
'+'
if
$strand
&&
$strand
eq
'.'
;
$start
=
$_
->{start}
if
!
defined
(
$start
) ||
$start
>
$_
->{start};
$stop
=
$_
->{stop}
if
!
defined
(
$stop
) ||
$stop
<
$_
->{stop};
$name
||=
$_
->{gname};
}
push
@found_segments
,[
$ref
,
$class
,
$start
,
$stop
,
$strand
,
$name
];
}
return
\
@found_segments
;
}
sub
search_notes {
my
$self
=
shift
;
my
(
$search_string
,
$limit
) =
@_
;
$search_string
=~
tr
/*?//d;
my
@results
;
my
@words
=
map
{
quotemeta
(
$_
)}
$search_string
=~ /(\w+)/g;
my
$search
=
join
'|'
,
@words
;
for
my
$feature
(@{
$self
->{data}}) {
next
unless
defined
$feature
->{gclass} &&
defined
$feature
->{gname};
next
unless
$feature
->{attributes};
my
@attributes
= @{
$feature
->{attributes}};
my
@values
=
map
{
$_
->[1]}
@attributes
;
my
$value
=
"@values"
;
my
$matches
= 0;
for
my
$w
(
@words
) {
my
@hits
=
$value
=~ /(
$w
)/ig;
$matches
+=
@hits
;
}
next
unless
$matches
;
my
$relevance
= 10 *
$matches
;
my
$featname
= Bio::DB::GFF::Featname->new(
$feature
->{gclass}=>
$feature
->{gname});
my
$note
;
$note
=
join
' '
,
map
{
$_
->[1]}
grep
{
$_
->[0] eq
'Note'
} @{
$feature
->{attributes}};
$note
.=
join
' '
,
grep
/
$search
/,
map
{
$_
->[1]}
grep
{
$_
->[0] ne
'Note'
} @{
$feature
->{attributes}};
push
@results
,[
$featname
,
$note
,
$relevance
];
last
if
defined
$limit
&&
@results
>=
$limit
;
}
my
$match_sub
=
'sub {'
;
foreach
(
split
/\s+/,
$search_string
) {
$match_sub
.=
"return unless \$_[0] =~ /\Q$_\E/i; "
;
}
$match_sub
.=
"};"
;
my
$match
=
eval
$match_sub
;
my
@matches
=
grep
{
$match
->(
$_
->[1]) }
@results
;
return
@matches
;
}
sub
_delete_features {
my
$self
=
shift
;
my
@feature_ids
=
sort
{
$b
<=>
$a
}
@_
;
my
$removed
= 0;
foreach
(
@feature_ids
) {
next
unless
$_
>= 0 &&
$_
< @{
$self
->{data}};
$removed
+=
defined
splice
(@{
$self
->{data}},
$_
,1);
}
$removed
;
}
sub
_delete {
my
$self
=
shift
;
my
$delete_spec
=
shift
;
my
$ranges
=
$delete_spec
->{segments} || [];
my
$types
=
$delete_spec
->{types} || [];
my
$force
=
$delete_spec
->{force};
my
$range_type
=
$delete_spec
->{range_type};
my
$deleted
= 0;
if
(
@$ranges
) {
my
@args
=
@$types
? (
-type
=>
$types
) : ();
push
@args
,(
-range_type
=>
$range_type
);
my
%ids_to_remove
=
map
{
$_
->
id
=> 1}
map
{
$_
->features(
@args
)}
@$ranges
;
$deleted
=
$self
->delete_features(
keys
%ids_to_remove
);
}
elsif
(
@$types
) {
my
%ids_to_remove
=
map
{
$_
->
id
=> 1}
$self
->features(
-type
=>
$types
);
$deleted
=
$self
->delete_features(
keys
%ids_to_remove
);
}
else
{
$self
->throw(
"This operation would delete all feature data and -force not specified"
)
unless
$force
;
$deleted
= @{
$self
->{data}};
@{
$self
->{data}} = ();
}
$deleted
;
}
sub
do_attributes{
my
$self
=
shift
;
my
(
$feature_id
,
$tag
) =
@_
;
my
$attr
;
my
$feature
=
$self
->_basic_features_by_id(
$feature_id
);
my
@result
;
for
my
$attr
(@{
$feature
->{attributes}}) {
my
(
$attr_name
,
$attr_value
) =
@$attr
;
if
(
defined
(
$tag
) &&
lc
(
$attr_name
) eq
lc
(
$tag
)){
push
@result
,
$attr_value
;}
elsif
(!
defined
(
$tag
)) {
push
@result
,(
$attr_name
,
$attr_value
);}
}
return
@result
;
}
sub
_feature_by_attribute{
my
$self
=
shift
;
my
(
$attributes
,
$callback
) =
@_
;
$callback
||
$self
->throw(
'must provide a callback argument'
);
my
$count
= 0;
my
$feature_id
= -1;
my
$feature_group_id
=
undef
;
for
my
$feature
(@{
$self
->{data}}) {
$feature_id
++;
for
my
$attr
(@{
$feature
->{attributes}}) {
my
(
$attr_name
,
$attr_value
) =
@$attr
;
foreach
(
keys
%$attributes
) {
if
(
lc
(
$_
) eq
lc
(
$attr_name
) &&
lc
(
$attributes
->{
$_
}) eq
lc
(
$attr_value
)) {
$callback
->(
$self
->_hash_to_array(
$feature
));
$count
++;
}
}
}
}
}
sub
get_features{
my
$self
=
shift
;
my
$count
= 0;
my
(
$search
,
$options
,
$callback
) =
@_
;
my
$found_features
;
$found_features
=
$self
->_get_features_by_search_options(
$search
,
$options
);
@{
$found_features
} =
sort
{
lc
(
"$a->{gclass}:$a->{gname}"
) cmp
lc
(
"$b->{gclass}:$b->{gname}"
)}
@{
$found_features
}
if
$options
->{sort_by_group} ;
for
my
$feature
(@{
$found_features
}) {
$count
++;
$callback
->(
$self
->_hash_to_array(
$feature
)
);
}
return
$count
;
}
sub
_feature_by_name {
my
$self
=
shift
;
my
(
$class
,
$name
,
$location
,
$callback
) =
@_
;
$callback
||
$self
->throw(
'must provide a callback argument'
);
my
$count
= 0;
my
$regexp
;
if
(
$name
=~ /[*?]/) {
$name
=
quotemeta
(
$name
);
$name
=~ s/\\\*/.*/g;
$name
=~ s/\\\?/.?/g;
$regexp
++;
}
for
my
$feature
(@{
$self
->{data}}) {
next
unless
(
$regexp
&&
$feature
->{gname} =~ /
$name
/i) ||
lc
(
$feature
->{gname}) eq
lc
(
$name
);
next
if
defined
(
$feature
->{gclass}) &&
length
(
$feature
->{gclass}) > 0 &&
$feature
->{gclass} ne
$class
;
if
(
$location
) {
next
if
$location
->[0] ne
$feature
->{
ref
};
next
if
$location
->[1] &&
$location
->[1] >
$feature
->{stop};
next
if
$location
->[2] &&
$location
->[2] <
$feature
->{start};
}
$count
++;
$callback
->(
$self
->_hash_to_array(
$feature
),0);
}
return
$count
;
}
sub
_feature_by_id{
my
$self
=
shift
;
my
(
$ids
,
$type
,
$callback
) =
@_
;
$callback
||
$self
->throw(
'must provide a callback argument'
);
my
$feature_group_id
=
undef
;
my
$count
= 0;
if
(
$type
eq
'feature'
){
for
my
$feature_id
(
@$ids
){
my
$feature
=
$self
->_basic_features_by_id(
$feature_id
);
$callback
->(
$self
->_hash_to_array(
$feature
))
if
$callback
;
$count
++;
}
}
}
sub
_basic_features_by_id{
my
$self
=
shift
;
my
(
$ids
) =
@_
;
$ids
= [
$ids
]
unless
ref
$ids
=~ /ARRAY/;
my
@result
;
for
my
$feature_id
(
@$ids
){
push
@result
, ${
$self
->{data}}[
$feature_id
];
}
return
wantarray
() ?
@result
:
$result
[0];
}
sub
get_features_iterator {
my
$self
=
shift
;
my
(
$search
,
$options
,
$callback
) =
@_
;
$callback
||
$self
->throw(
'must provide a callback argument'
);
my
$results
=
$self
->_get_features_by_search_options(
$search
,
$options
);
my
$results_array
=
$self
->_convert_feature_hash_to_array(
$results
);
return
Bio::DB::GFF::Adaptor::memory::iterator->new(
$results_array
,
$callback
);
}
sub
get_types {
my
$self
=
shift
;
my
(
$srcseq
,
$class
,
$start
,
$stop
,
$want_count
,
$typelist
) =
@_
;
my
(
%result
,
%obj
);
for
my
$feature
(@{
$self
->{data}}) {
my
$feature_start
=
$feature
->{start};
my
$feature_stop
=
$feature
->{stop};
my
$feature_ref
=
$feature
->{
ref
};
my
$feature_class
=
$feature
->{class};
my
$feature_method
=
$feature
->{method};
my
$feature_source
=
$feature
->{source};
if
(
defined
$srcseq
){
next
unless
lc
(
$feature_ref
) eq
lc
(
$srcseq
);
}
if
(
defined
$class
){
next
unless
defined
$feature_class
&&
$feature_class
eq
$class
;
}
if
(
defined
$start
or
defined
$stop
) {
$start
= 1
unless
defined
$start
;
$stop
= MAX_SEGMENT
unless
defined
$stop
;
next
unless
$feature_stop
>=
$start
&&
$feature_start
<=
$stop
;
}
if
(
defined
$typelist
&&
@$typelist
){
next
unless
$self
->_matching_typelist(
$feature_method
,
$feature_source
,
$typelist
);
}
my
$type
= Bio::DB::GFF::Typename->new(
$feature_method
,
$feature_source
);
$result
{
$type
}++;
$obj
{
$type
} =
$type
;
}
return
$want_count
?
%result
:
values
%obj
;
}
sub
classes {
my
$self
=
shift
;
my
%classes
;
for
my
$feature
(@{
$self
->{data}}) {
$classes
{
$feature
->{gclass}}++;
}
my
@classes
=
sort
keys
%classes
;
return
@classes
;
}
sub
_get_features_by_search_options{
my
$count
= 0;
my
(
$self
,
$search
,
$options
) =
@_
;
my
(
$rangetype
,
$refseq
,
$class
,
$start
,
$stop
,
$types
,
$sparse
,
$order_by_group
,
$attributes
) =
(@{
$search
}{
qw(rangetype refseq refclass start stop types)
},
@{
$options
}{
qw(sparse sort_by_group ATTRIBUTES)
}) ;
my
@found_features
;
my
$data
=
$self
->{data};
my
$feature_id
= -1 ;
my
$feature_group_id
=
undef
;
for
my
$feature
(@{
$data
}) {
$feature_id
++;
my
$feature_start
=
$feature
->{start};
my
$feature_stop
=
$feature
->{stop};
my
$feature_ref
=
$feature
->{
ref
};
if
(
defined
$refseq
){
next
unless
lc
(
$feature_ref
) eq
lc
(
$refseq
);
}
if
(
defined
$start
or
defined
$stop
) {
$start
= 0
unless
defined
(
$start
);
$stop
= MAX_SEGMENT
unless
defined
(
$stop
);
if
(
$rangetype
eq
'overlaps'
) {
next
unless
$feature_stop
>=
$start
&&
$feature_start
<=
$stop
;
}
elsif
(
$rangetype
eq
'contains'
) {
next
unless
$feature_start
>=
$start
&&
$feature_stop
<=
$stop
;
}
elsif
(
$rangetype
eq
'contained_in'
) {
next
unless
$feature_start
<=
$start
&&
$feature_stop
>=
$stop
;
}
else
{
next
unless
$feature_start
==
$start
&&
$feature_stop
==
$stop
;
}
}
my
$feature_source
=
$feature
->{source};
my
$feature_method
=
$feature
->{method};
if
(
defined
$types
&&
@$types
){
next
unless
$self
->_matching_typelist(
$feature_method
,
$feature_source
,
$types
);
}
my
$feature_attributes
=
$feature
->{attributes};
if
(
defined
$attributes
){
next
unless
$self
->_matching_attributes(
$feature_attributes
,
$attributes
);
}
my
$found_feature
=
$feature
;
$found_feature
->{feature_id} =
$feature_id
;
$found_feature
->{group_id} =
$feature_group_id
;
push
@found_features
,
$found_feature
;
}
return
\
@found_features
;
}
sub
_hash_to_array {
my
(
$self
,
$feature_hash
) =
@_
;
my
@array
= @{
$feature_hash
}{
@hash2array_map
};
return
wantarray
?
@array
: \
@array
;
}
sub
_convert_feature_hash_to_array{
my
(
$self
,
$feature_hash_array
) =
@_
;
my
@features_array_array
=
map
{
scalar
$self
->_hash_to_array(
$_
)}
@$feature_hash_array
;
return
\
@features_array_array
;
}
sub
_matching_typelist{
my
(
$self
,
$feature_method
,
$feature_source
,
$typelist
) =
@_
;
foreach
(
@$typelist
) {
my
(
$search_method
,
$search_source
) =
@$_
;
next
if
lc
(
$search_method
) ne
lc
(
$feature_method
);
next
if
defined
(
$search_source
) &&
lc
(
$search_source
) ne
lc
(
$feature_source
);
return
1;
}
return
0;
}
sub
_matching_attributes {
my
(
$self
,
$feature_attributes
,
$attributes
) =
@_
;
foreach
(
keys
%$attributes
) {
return
0
if
!_match_all_attr_in_feature(
$_
,
$attributes
->{
$_
},
$feature_attributes
)
}
return
1;
}
sub
_match_all_attr_in_feature{
my
(
$attr_name
,
$attr_value
,
$feature_attributes
) =
@_
;
for
my
$attr
(
@$feature_attributes
) {
my
(
$feature_attr_name
,
$feature_attr_value
) =
@$attr
;
next
if
(
$attr_name
ne
$feature_attr_name
||
$attr_value
ne
$feature_attr_value
);
return
1;
}
return
0;
}
sub
do_initialize { 1; }
sub
get_feature_by_group_id{ 1; }
1;