lib/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm


            
              —
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
—
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
—
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
—
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
—
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
—
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
—
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
—
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
—
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
              
=head1 LICENSE
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2024] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=head1 CONTACT
  Please email comments or questions to the public Ensembl
  developers list at <dev@ensembl.org>.
  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.
=cut
=head1 NAME
Bio::EnsEMBL::DBSQL::SequenceAdaptor - produce sequence strings from locations
=head1 SYNOPSIS
  my $sa = $registry->get_adaptor( 'Human', 'Core', 'Sequence' );
  my $dna =
    ${ $sa->fetch_by_Slice_start_end_strand( $slice, 1, 1000, -1 ) };
=head1 DESCRIPTION
An adaptor for the retrieval of DNA sequence from the EnsEMBL database
=head1 METHODS
=cut
package Bio::EnsEMBL::DBSQL::SequenceAdaptor;
$Bio::EnsEMBL::DBSQL::SequenceAdaptor::VERSION = '112.0_54'; # TRIAL
$Bio::EnsEMBL::DBSQL::SequenceAdaptor::VERSION = '112.054';
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Exception qw(throw);
use Bio::EnsEMBL::Utils::Sequence  qw(reverse_comp);
use Bio::EnsEMBL::Utils::Scalar qw( assert_ref );
use DBI qw/:sql_types/;
use base qw(Bio::EnsEMBL::DBSQL::BaseAdaptor Bio::EnsEMBL::DBSQL::BaseSequenceAdaptor);
our @EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
=head2 new
  Arg [1]    : none
  Example    : my $sa = $db_adaptor->get_SequenceAdaptor();
  Description: Constructor. Calls superclass constructor and initialises
               internal cache structure.
  Returntype : Bio::EnsEMBL::DBSQL::SequenceAdaptor
  Exceptions : none
  Caller     : DBAdaptor::get_SequenceAdaptor
  Status     : Stable
=cut
sub new {
  my ($caller, $db, $chunk_power, $cache_size) = @_;
  my $class = ref($caller) || $caller;
  my $self = $class->SUPER::new($db);
  $self->_init_seq_instance($chunk_power, $cache_size);
  $self->_populate_seq_region_edits();
  return $self;
}
=head2 fetch_by_Slice_start_end_strand
  Arg  [1]   : Bio::EnsEMBL::Slice slice
               The slice from which you want the sequence
  Arg  [2]   : (optional) int startBasePair 
               The start base pair relative to the start of the slice. Negative
               values or values greater than the length of the slice are fine.
               default = 1
  Arg  [3]   : (optional) int endBasePair
               The end base pair relative to the start of the slice. Negative
               values or values greater than the length of the slice are fine,
               but the end must be greater than or equal to the start
               count from 1
               default = the length of the slice
  Arg  [4]   : (optional) int strand 
               1, -1
               default = 1
  Example    : $dna = $seq_adptr->fetch_by_Slice_start_end_strand($slice, 1, 
                                                                  1000, -1);
  Description: Retrieves from db the sequence for this slice
               uses AssemblyMapper to find the assembly
  Returntype : string 
  Exceptions : endBasePair should be less or equal to length of slice 
  Caller     : Bio::EnsEMBL::Slice::seq(), Slice::subseq() 
  Status     : Stable
=cut
sub fetch_by_Slice_start_end_strand {
   my ( $self, $slice, $start, $end, $strand ) = @_;
   if(!ref($slice) || !($slice->isa("Bio::EnsEMBL::Slice") or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
     throw("Slice argument is required.");
   }
   $start = 1 if(!defined($start));
  
   if ( ( !defined($end) || $start > $end || $start < 0 || $end < 0 || $slice->start> $slice->end ) && $slice->is_circular ) {
          
       if ( !defined($end) || ($start > $end ) ) {
     return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $start, $end, $strand );
       }
       if ( defined($end) && ($end < 0) ) {
     $end += $slice->seq_region_length;
       }
        
       if ($start < 0) {
           $start += $slice->seq_region_length;
       }
       if($slice->start> $slice->end) {
           return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $slice->start, $slice->end, $strand );
       }
  }
         
  if ( ( !defined($end) ) && (not $slice->is_circular) ) {
           $end = $slice->end() - $slice->start() + 1;
  }
  if ( $start > $end ) {
      throw("Start must be less than or equal to end.");
  }
   $strand ||= 1;
   #get a new slice that spans the exact region to retrieve dna from
   my $right_expand  = $end - $slice->length(); #negative is fine
   my $left_expand   = 1 - $start; #negative is fine
   if($right_expand || $left_expand) {
     $slice = $slice->expand($left_expand, $right_expand);
   }
    
   #retrieve normalized 'non-symlinked' slices
   #this allows us to support haplotypes and PARs
   my $slice_adaptor = $slice->adaptor();
   my @symproj=@{$slice_adaptor->fetch_normalized_slice_projection($slice)};
   if(@symproj == 0) {
     throw('Could not retrieve normalized Slices. Database contains ' .
           'incorrect assembly_exception information.');
   }
   #call this method again with any slices that were 'symlinked' to by this
   #slice
   if(@symproj != 1 || $symproj[0]->[2] != $slice) {
     my $seq;
     foreach my $segment (@symproj) {
       my $symlink_slice = $segment->[2];
       #get sequence from each symlinked area
       $seq .= ${$self->fetch_by_Slice_start_end_strand($symlink_slice,
                                                        1,undef,1)};
     }
     if($strand == -1) {
       reverse_comp(\$seq);
     }
     return \$seq;
   }
   # we need to project this slice onto the sequence coordinate system
   # even if the slice is in the same coord system, we want to trim out
   # flanking gaps (if the slice is past the edges of the seqregion)
   my $csa = $self->db->get_CoordSystemAdaptor();
   my $seqlevel = $csa->fetch_sequence_level();
   my @projection=@{$slice->project($seqlevel->name(), $seqlevel->version())};
   my $seq = '';
   my $total = 0;
   my $tmp_seq;
   #fetch sequence from each of the sequence regions projected onto
   foreach my $segment (@projection) {
     my ($start, $end, $seq_slice) = @$segment;
     #check for gaps between segments and pad them with Ns
     my $gap = $start - $total - 1;
     if($gap) {
       $seq .= 'N' x $gap;
     }
     my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);
     $tmp_seq = ${$self->_fetch_seq($seq_region_id,
                                    $seq_slice->start, $seq_slice->length())};
     if(!defined $tmp_seq) {
         throw('No sequence found for seq_region '.$seq_region_id.':'.$seq_slice->start);
     }
     #reverse compliment on negatively oriented slices
     if($seq_slice->strand == -1) {
       reverse_comp(\$tmp_seq);
     }
     $seq .= $tmp_seq;
     $total = $end;
   }
   #check for any remaining gaps at the end
   my $gap = $slice->length - $total;
   if($gap) {
     $seq .= 'N' x $gap;
   }
   #if the sequence is too short it is because we came in with a seqlevel
   #slice that was partially off of the seq_region.  Pad the end with Ns
   #to make long enough
   if(length($seq) != $slice->length()) {
     $seq .= 'N' x ($slice->length() - length($seq));
   }
   if(defined($self->{_rna_edits_cache}) and defined($self->{_rna_edits_cache}->{$slice->get_seq_region_id})){
     $self->_rna_edit($slice,\$seq);
   }
   #if they asked for the negative slice strand revcomp the whole thing
   reverse_comp(\$seq) if($strand == -1);
   return \$seq;
}
=head2 can_access_Slice
  Description : Returns 1 since we can access any Slice's data
=cut
sub can_access_Slice {
  return 1;
}
sub _fetch_by_Slice_start_end_strand_circular {
  my ( $self, $slice, $start, $end, $strand ) = @_;
  assert_ref( $slice, 'Bio::EnsEMBL::Slice' );
   
  $strand ||= 1;
  if ( !defined($start) ) {
    $start ||= 1;
  }
  if ( !defined($end) ) {
      $end = $slice->end() - $slice->start() + 1;
  }
  if ( $start > $end && $slice->is_circular() ) {
    my ($seq, $seq1, $seq2);
    my $midpoint = $slice->seq_region_length - $slice->start + 1;
    $seq1 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, 1,  $midpoint, 1 )};
    $seq2 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, $midpoint + 1, $slice->length(), 1 )};
    $seq = $slice->strand > 0 ? "$seq1$seq2" : "$seq2$seq1";
    reverse_comp( \$seq ) if ( $strand == -1 );
    return \$seq;
  }
  # Get a new slice that spans the exact region to retrieve dna from
  my $right_expand = $end - $slice->length();    #negative is fine
  my $left_expand  = 1 - $start;                 #negative is fine
  if ( $right_expand || $left_expand ) {
    $slice =
        $slice->strand > 0
      ? $slice->expand( $left_expand,  $right_expand )
      : $slice->expand( $right_expand, $left_expand );
  }
  # Retrieve normalized 'non-symlinked' slices.  This allows us to
  # support haplotypes and PARs.
  my $slice_adaptor = $slice->adaptor();
  my @symproj =
    @{ $slice_adaptor->fetch_normalized_slice_projection($slice) };
  if ( @symproj == 0 ) {
    throw(   'Could not retrieve normalized Slices. Database contains '
           . 'incorrect assembly_exception information.' );
  }
  # Call this method again with any slices that were 'symlinked' to by
  # this slice.
  if ( @symproj != 1 || $symproj[0]->[2] != $slice ) {
    my $seq;
    foreach my $segment (@symproj) {
      my $symlink_slice = $segment->[2];
      # Get sequence from each symlinked area.
      $seq .= ${
        $self->fetch_by_Slice_start_end_strand( $symlink_slice, 1,
                                                undef, 1 ) };
    }
    if ( $strand == -1 ) {
      reverse_comp( \$seq );
    }
    return \$seq;
  }
  # We need to project this slice onto the sequence coordinate system
  # even if the slice is in the same coord system, we want to trim out
  # flanking gaps (if the slice is past the edges of the seqregion).
  my $csa      = $self->db->get_CoordSystemAdaptor();
  my $seqlevel = $csa->fetch_sequence_level();
  my @projection =
    @{ $slice->project( $seqlevel->name(), $seqlevel->version() ) };
  my $seq   = '';
  my $total = 0;
  my $tmp_seq;
  # Fetch sequence from each of the sequence regions projected onto.
  foreach my $segment (@projection) {
    my ( $start, $end, $seq_slice ) = @{$segment};
    # Check for gaps between segments and pad them with Ns
    my $gap = $start - $total - 1;
    if ($gap) {
      $seq .= 'N' x $gap;
    }
    my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);
    $tmp_seq = ${
      $self->_fetch_seq( $seq_region_id, $seq_slice->start(),
                         $seq_slice->length() ) };
    # Reverse compliment on negatively oriented slices.
    if ( $seq_slice->strand == -1 ) {
      reverse_comp( \$tmp_seq );
    }
    $seq .= $tmp_seq;
    $total = $end;
  }
  # Check for any remaining gaps at the end.
  my $gap = $slice->length() - $total;
  if ($gap) {
    $seq .= 'N' x $gap;
  }
  # If the sequence is too short it is because we came in with a
  # seqlevel slice that was partially off of the seq_region.  Pad the
  # end with Ns to make long enough
  if ( length($seq) != $slice->length() ) {
    $seq .= 'N' x ( $slice->length() - length($seq) );
  }
  if ( defined( $self->{_rna_edits_cache} )
       && defined(
            $self->{_rna_edits_cache}->{ $slice->get_seq_region_id } ) )
  {
    $self->_rna_edit( $slice, \$seq );
  }
  return \$seq;
} ## end sub _fetch_by_Slice_start_end_strand_circular
=head2 _rna_edit
  Description : Performs within sequence region editting when 
                the underlying sequence is incorrect. Used by LRGs.
=cut
sub _rna_edit {
  my $self  = shift;
  my $slice = shift;
  my $seq   = shift; #reference to string
  my $s_start = $slice->start;   # substr start at 0 , but seq starts at 1 (so no -1 here)
  my $s_end = $s_start+length($$seq) - 1; # But we do need -1 here to keep the coords closed
  foreach my $edit (@{$self->{_rna_edits_cache}->{$slice->get_seq_region_id}}){
    # seq_region_attrib for an exit looks like
    # <start> <end> <edit>
    # 2568 2569 GT
    my ($start, $end, $txt) = split (/\s+/, $edit);
    # check that RNA edit is not outside the requested region : happens quite often with LRG regions
    next if ($end < $s_start);
    next if ($s_end < $start);
    # Length of the edit ($txt)
    my $edit_length = length($txt);
    # If the edit isn't fully encompassed by the slice, we need to extract the
    # edit's sub-sequence that we're patching on to the sequence
    if($start < $s_start || $end > $s_end) {
        my $edit_offset;
        # Find the offset for the start of the edit, eg.
        #      seq slice:   TC
        #      LRG edit :  CG
        # Offset should be 1 to extract starting at the G
        # Formula: offset = max(s_start - start, 0)
        $edit_offset = ($s_start - $start) < 0 ? 0 : $s_start - $start;
        # Find the length of the overlapping piece of the edit.
        # This needs to take in to account the offset
        #      seq slice:   TC
        #      LRG edit :    GA
        # Where we're want the length to be 1 from an existing offset of 1 to
        # extract just G
        # Forumla: length = length(edit) - offset - max(end - s_end, 0)
        $edit_length = length($txt) - $edit_offset - ($end - $s_end < 0 ? 0 : $end - $s_end);
        # Extract the overlapping edit
        $txt = substr($txt, $edit_offset, $edit_length);
    }
    # Apply the patch, we don't want negative offsets as that's totally wrong.
    # Do a max(start - s_start, 0) to prevent this
    substr($$seq,($start-$s_start < 0 ? 0 : $start-$s_start),$edit_length, $txt);
  }
  return;
}
=head2 _fetch_raw_seq
  Description : Communicates with the database to fetch back sequence
=cut
sub _fetch_raw_seq {
  my ($self, $id, $start, $length) = @_;
  my $sql = <<'SQL';
SELECT UPPER(SUBSTR(d.sequence, ?, ?))
FROM dna d
WHERE d.seq_region_id =?
SQL
  my $seq = $self->dbc()->sql_helper()->execute_single_result(
    -SQL => $sql, 
    -PARAMS => [[$start, SQL_INTEGER], [$length, SQL_INTEGER], [$id, SQL_INTEGER]],
    -NO_ERROR => 1
  );
  return \$seq;
}
=head2 store
  Arg [1]    : int $seq_region_id the id of the sequence region this dna
               will be associated with.
  Arg [2]    : string $sequence the dna sequence to be stored 
               in the database.  Note that the sequence passed in will be
               converted to uppercase.
  Example    : $seq_adaptor->store(11, 'ACTGGGTACCAAACAAACACAACA');
  Description: stores a dna sequence in the databases dna table and returns the
               database identifier for the new record.
  Returntype : none
  Exceptions : throw if the database insert fails
  Caller     : sequence loading scripts
  Status     : Stable
=cut
sub store {
  my ($self, $seq_region_id, $sequence) = @_;
  if(!$seq_region_id) {
    throw('seq_region_id is required');
  }
  $sequence = uc($sequence);
  my $statement = 
    $self->prepare("INSERT INTO dna(seq_region_id, sequence) VALUES(?,?)");
  $statement->bind_param(1,$seq_region_id,SQL_INTEGER);
  $statement->bind_param(2,$sequence,SQL_LONGVARCHAR);
  $statement->execute();
  $statement->finish();
  return;
}
=head2 remove   
    
  Arg [1]    : int $seq_region_id the id of the sequence region this dna   
               is associated with.   
  Example    : $seq_adaptor->remove(11);   
  Description: removes a dna sequence for a given seq_region_id    
  Returntype : none    
  Exceptions : throw if the database delete fails    
  Caller     : Internal    
  Status     : Stable    
    
=cut   
    
sub remove {   
  my ($self, $seq_region_id) = @_;   
    
  if(!$seq_region_id) {    
    throw('seq_region_id is required');    
  }    
    
  my $statement =    
    $self->prepare("DELETE FROM dna WHERE seq_region_id = ?");   
    
  $statement->bind_param(1,$seq_region_id,SQL_INTEGER);    
  $statement->execute();   
    
  $statement->finish();    
    
  return;    
}    
=head2 _populate_seq_region_edits
  Description:  Query the database for any _rna_edit attributes attached to a seq region
=cut
sub _populate_seq_region_edits {
  my ($self) = @_;
  my $sql;
  my @params = ('_rna_edit');
  if($self->db()->is_multispecies()) {
    $sql = <<'SQL';
select sra.seq_region_id, sra.value 
from seq_region_attrib sra 
join attrib_type using (attrib_type_id) 
join seq_region s using (seq_region_id)
join coord_system cs using (coord_system_id)
where code =?
and species_id =?
SQL
    push(@params, $self->db()->species_id());
  }
  else {
    $sql = <<'SQL';
select sra.seq_region_id, sra.value 
from seq_region_attrib sra join attrib_type using (attrib_type_id) 
where code = ?
SQL
  }
  my $mapper = sub {
    my ($row, $array) = @_;
    my ($seq_region_id, $value) = @{$row};
    my ($start, $end, $substring) = split (/\s+/, $value);
    my $edit_length = ($end - $start) + 1;
    my $substring_length = length($substring);
    if($edit_length != $substring_length) {
      throw "seq_region_id $seq_region_id has an attrib of type '_rna_edit' (value '$value'). Edit length ${edit_length} is not the same as the replacement's length ${substring_length}. Please fix. We only support substitutions via this mechanism";
    }
    if(defined $array) {
      push(@{$array}, $value);
      return;
    }
    return [$value];
  };
  my $edits = $self->dbc()->sql_helper->execute_into_hash(-SQL => $sql, -PARAMS => ['_rna_edit'], -CALLBACK => $mapper);
  $self->{_rna_edits_cache} = $edits if %{$edits};
  return;
}
1;
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)