=head1 LICENSE
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2024] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=cut
=head1 CONTACT
Please email comments or questions to the public Ensembl
Questions may also be sent to the Ensembl help desk at
=cut
=head1 NAME
Bio::EnsEMBL::Utils::Tree::Interval::Immutable
=head1 SYNOPSIS
# define a set of intervals to be added to the tree
my $intervals = [ Bio::EnsEMBL::Utils::Interval->new(121626874, 122092717),
Bio::EnsEMBL::Utils::Interval->new(121637917, 121658918),
Bio::EnsEMBL::Utils::Interval->new(122096077, 124088369) ];
# initialise the tree with the above intervals
my $tree = Bio::EnsEMBL::Utils::Tree::Interval::Immutable->new($intervals);
# point query
my $results = $tree->query(121779004);
if (scalar @$results) {
print "Intervals contain 121779004\n";
}
# same query, but use interval query
my $results = $tree->query(121779004, 121779004);
if (scalar @$results) {
print "Found containing interval: [", $result->[0]->start, ', ', $result->[0]->end, "\n";
}
=head1 DESCRIPTION
An implementation of an immutable interval tree. Immutable means the tree is
initialised with a fixed set of intervals at creation time. Intervals cannot
be added to or removed from the tree during its life cycle.
Implementation heavily inspired by https://github.com/tylerkahn/intervaltree-python
This implementation does not support Intervals having a start > end - i.e.
intervals spanning the origin of a circular chromosome.
=head1 METHODS
=cut
$Bio::EnsEMBL::Utils::Tree::Interval::Immutable::VERSION = '112.0_52'; # TRIAL
$Bio::EnsEMBL::Utils::Tree::Interval::Immutable::VERSION = '112.052';
use strict;
use Bio::EnsEMBL::Utils::Scalar qw(assert_ref);
=head2 new
Arg [1] : Arrayref of Bio::EnsEMBL::Utils::Interval instances
Example : my $tree = Bio::EnsEMBL::Utils::Tree::Immutable([ $i1, $i2, $i3 ]);
Description : Constructor. Creates a new immutable tree instance
Returntype : Bio::EnsEMBL::Utils::Tree::Interval::Immutable
Exceptions : none
Caller : general
=cut
sub new {
my $caller = shift;
my $class = ref($caller) || $caller;
my $intervals = shift;
if (defined $intervals ) {
assert_ref($intervals, 'ARRAY');
}
my $self = bless({}, $class);
$self->{top_node} = $self->_divide_intervals($intervals);
return $self;
}
=head2 root
Arg [] : none
Example : my $root = $tree->root();
Description : Return the tree top node
Returntype : Bio::EnsEMBL::Utils::Tree::Interval::Immutable::Node
Exceptions : none
Caller : general
=cut
sub root {
return shift->{top_node};
}
=head2 query
Arg [1] : scalar, $start
Where the query interval begins
Arg [2] : (optional) scalar, $end
Where the query interval ends
Example : my $results = $tree->query(121626874, 122092717);
Description : Query the tree if its intervals overlap the interval whose start
and end points are specified by the argument list.
If end is not specified, it is assumed to be the same as start
so effectively making a point query.
Returntype : An arrayref of Bio::EnsEMBL::Utils::Interval instances
Exceptions : none
Caller : general
=cut
sub query {
my ($self, $start, $end) = @_;
my $interval;
if (defined $start) {
$end = $start unless defined $end;
$interval = Bio::EnsEMBL::Utils::Interval->new($start, $end);
}
return [] unless $interval;
return $self->_query_point($self->root, $interval->start, []) if $interval->is_point;
my $result = [];
return $result unless $self->root or $interval->is_empty;
my $node = $self->root;
while ($node) {
if ($interval->contains($node->x_center)) {
push @{$result}, @{$node->s_center_beg};
$self->_range_query_left($node->left, $interval, $result);
$self->_range_query_right($node->right, $interval, $result);
last;
}
if ($interval->is_left_of($node->x_center)) {
foreach my $s_beg (@{$node->s_center_beg}) {
last unless $interval->intersects($s_beg);
push @{$result}, $s_beg;
}
$node = $node->left;
} else {
foreach my $s_end (@{$node->s_center_end}) {
last unless $interval->intersects($s_end);
push @{$result}, $s_end;
}
$node = $node->right;
}
}
return sort_by_begin(uniq($result));
}
sub _query_point {
my ($self, $node, $point, $result) = @_;
return $result unless $node;
# if x is less than x_center, the leftmost set of intervals, S_left, is considered
if ($point <= $node->x_center) {
# if x is less than x_center, we know that all intervals in S_center end after x,
# or they could not also overlap x_center. Therefore, we need only find those intervals
# in S_center that begin before x. We can consult the lists of S_center that have already
# been constructed. Since we only care about the interval beginnings in this scenario,
# we can consult the list sorted by beginnings.
# Suppose we find the closest number no greater than x in this list. All ranges from the
# beginning of the list to that found point overlap x because they begin before x and end
# after x (as we know because they overlap x_center which is larger than x).
# Thus, we can simply start enumerating intervals in the list until the startpoint value exceeds x.
foreach my $s_beg (@{$node->s_center_beg}) {
last if $s_beg->is_right_of($point);
push @{$result}, $s_beg;
}
# since x < x_center, we also consider the leftmost set of intervals
return $self->_query_point($node->left, $point, $result);
} else {
# if x is greater than x_center, we know that all intervals in S_center must begin before x,
# so we find those intervals that end after x using the list sorted by interval endings.
foreach my $s_end (@{$node->s_center_end}) {
last if $s_end->is_left_of($point);
push @{$result}, $s_end;
}
# since x > x_center, we also consider the rightmost set of intervals
return $self->_query_point($node->right, $point, $result);
}
return sort_by_begin(uniq($result));
}
# This corresponds to the left branch of the range search, once we find a node, whose
# midpoint is contained in the query interval. All intervals in the left subtree of that node
# are guaranteed to intersect with the query, if they have an endpoint greater or equal than
# the start of the query interval. Basically, this means that every time we branch to the left
# in the binary search, we need to add the whole right subtree to the result set.
sub _range_query_left {
my ($self, $node, $interval, $result) = @_;
while ($node) {
if ($interval->contains($node->x_center)) {
push @{$result}, @{$node->s_center_beg};
if ($node->right) {
# in-order traversal of the right subtree to add all its intervals
$self->_in_order_traversal($node->right, $result);
}
$node = $node->left;
} else {
foreach my $seg_end (@{$node->s_center_end}) {
last if $seg_end->is_left_of($interval);
push @{$result}, $seg_end;
}
$node = $node->right;
}
}
}
# This corresponds to the right branch of the range search, once we find a node, whose
# midpoint is contained in the query interval. All intervals in the right subtree of that node
# are guaranteed to intersect with the query, if they have an endpoint smaller or equal than
# the end of the query interval. Basically, this means that every time we branch to the right
# in the binary search, we need to add the whole left subtree to the result set.
sub _range_query_right {
my ($self, $node, $interval, $result) = @_;
while ($node) {
if ($interval->contains($node->x_center)) {
push @{$result}, @{$node->s_center_beg};
if ($node->left) {
# in-order traversal of the left subtree to add all its intervals
$self->_in_order_traversal($node->left, $result);
}
$node = $node->right;
} else {
foreach my $seg_beg (@{$node->s_center_beg}) {
last if $seg_beg->is_right_of($interval);
push @{$result}, $seg_beg;
}
$node = $node->left;
}
}
}
sub in_order_traversal {
my ($self) = @_;
my $result = [];
$self->_in_order_traversal($self->root, $result);
return $result;
}
sub _in_order_traversal {
my ($self, $node, $result) = @_;
return unless $node;
$result ||= [];
$self->_in_order_traversal($node->left, $result);
push @{$result}, @{$node->s_center_beg};
$self->_in_order_traversal($node->right, $result);
}
sub _divide_intervals {
my ($self, $intervals, $sorted) = @_;
return undef unless scalar @{$intervals};
my $sorted_intervals;
if ($sorted) {
$sorted_intervals = $intervals;
} else {
$sorted_intervals = sort_by_begin($intervals);
}
my $x_center = $self->_center_sorted($sorted_intervals);
my ($s_center, $s_left, $s_right) = ([], [], []);
foreach my $sorted_interval (@{$sorted_intervals}) {
if ($sorted_interval->spans_origin) {
throw "Cannot build a tree containing an interval that spans the origin";
}
if ($sorted_interval->end < $x_center) {
push @{$s_left}, $sorted_interval;
} elsif ($sorted_interval->start > $x_center) {
push @{$s_right}, $sorted_interval;
} else {
push @{$s_center}, $sorted_interval;
}
}
my $node = Bio::EnsEMBL::Utils::Tree::Interval::Immutable::Node->new($x_center,
$s_center,
$self->_divide_intervals($s_left, 1),
$self->_divide_intervals($s_right, 1));
}
sub _center {
my ($self, $intervals) = @_;
my $sorted_intervals = sort_by_begin($intervals);
my $len = scalar @{$sorted_intervals};
return $sorted_intervals->[int($len/2)]->start;
}
sub _center_sorted {
my ($self, $sorted_intervals) = @_;
my $len = scalar @{$sorted_intervals};
return $sorted_intervals->[int($len/2)]->start;
}
sub sort_by_begin {
my $intervals = shift;
return [ map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [ $_->start, $_ ] } @{$intervals} ];
}
sub uniq {
my $intervals = shift;
tie my %seen, 'Tie::RefHash';
return [ grep { ! $seen{ $_ }++ } @{$intervals} ];
}
1;