$CLIPSeqTools::PreprocessApp::annotate_with_conservation::VERSION
=
'0.1.0'
;
use
PDL::Lite;
$PDL::BIGPDL
= 0;
$PDL::BIGPDL
++;
option
'rname_sizes'
=> (
is
=>
'rw'
,
isa
=>
'Str'
,
required
=> 1,
documentation
=>
'file with sizes for reference alignment sequences (rnames). Must be tab delimited (chromosome\tsize) with one line per rname.'
,
);
option
'cons_dir'
=> (
is
=>
'rw'
,
isa
=>
'Str'
,
required
=> 1,
documentation
=>
'directory with phastCons or phyloP files.'
,
);
option
'drop'
=> (
is
=>
'rw'
,
isa
=>
'Bool'
,
documentation
=>
'drop column if they already exist (not supported in SQlite).'
,
);
with
"CLIPSeqTools::Role::Option::Library"
=> {
-alias
=> {
validate_args
=>
'_validate_args_for_library'
},
-excludes
=>
'validate_args'
,
};
sub
validate_args {
my
(
$self
) =
@_
;
$self
->_validate_args_for_library;
}
sub
run {
my
(
$self
) =
@_
;
warn
"Starting: annotate_with_conservation\n"
;
warn
"Validating arguments\n"
if
$self
->verbose;
$self
->validate_args();
warn
"Reading sizes for reference alignment sequences\n"
if
$self
->verbose;
my
%rname_sizes
=
$self
->read_rname_sizes;
warn
"Opening reads collection\n"
if
$self
->verbose;
my
$reads_collection
=
$self
->reads_collection;
my
@rnames
=
$reads_collection
->rnames_for_all_strands;
$reads_collection
->schema->storage->debug(1)
if
$self
->verbose > 1;
warn
"Creating new column conservation if required\n"
if
$self
->verbose;
$self
->create_new_column_if_required(
$reads_collection
);
warn
"Looping on annotation file to annotate records.\nThis might take a long time. Relax...\n"
if
$self
->verbose;
foreach
my
$rname
(
@rnames
) {
warn
"Reading conservation data for $rname\n"
if
$self
->verbose;
my
$pdl
=
$self
->plylop_pdl_for(
$rname
,
$rname_sizes
{
$rname
});
warn
"Annotating records for $rname\n"
if
$self
->verbose;
$reads_collection
->schema->txn_do(
sub
{
$reads_collection
->foreach_record_on_rname_do(
$rname
,
sub
{
my
(
$record
) =
@_
;
my
$record_plylop_avg
=
$pdl
->slice([
$record
->start,
$record
->stop])->average();
$record
->conservation(
$record_plylop_avg
);
$record
->update();
return
0;
});
});
}
}
sub
create_new_column_if_required {
my
(
$self
,
$reads_collection
) =
@_
;
if
(
$self
->drop) {
warn
"Droping column conservation\n"
if
$self
->verbose;
try
{
$reads_collection
->schema->storage->dbh_do(
sub
{
my
(
$storage
,
$dbh
,
@cols
) =
@_
;
$dbh
->
do
(
'ALTER TABLE '
.
$self
->table.
' DROP COLUMN conservation'
);
});
}
catch
{
warn
"Warning: Column could not be dropped.\n"
if
$self
->verbose;
};
}
try
{
warn
"Creating column conservation\n"
if
$self
->verbose;
$reads_collection
->schema->storage->dbh_do(
sub
{
my
(
$storage
,
$dbh
,
@cols
) =
@_
;
$dbh
->
do
(
'ALTER TABLE '
.
$self
->table.
' ADD COLUMN conservation INT(5)'
);
});
}
catch
{
warn
"Warning: Column creation failed. Maybe column already exist.\n"
if
$self
->verbose;
warn
"$_\n"
if
$self
->verbose > 1;
};
my
$conservation_params
= {
data_type
=>
'decimal'
,
is_numeric
=> 1
};
$reads_collection
->resultset->result_source->add_columns(
'conservation'
=>
$conservation_params
);
$reads_collection
->resultset->result_class->add_columns(
'conservation'
=>
$conservation_params
);
$reads_collection
->resultset->result_class->register_column(
'conservation'
);
}
sub
read_rname_sizes {
my
(
$self
) =
@_
;
my
%rname_size
;
open
(
my
$CHRSIZE
,
'<'
,
$self
->rname_sizes);
while
(
my
$line
= <
$CHRSIZE
>) {
chomp
$line
;
my
(
$chr
,
$size
) =
split
(/\t/,
$line
);
$rname_size
{
$chr
} =
$size
;
}
close
$CHRSIZE
;
return
%rname_size
;
}
sub
plylop_pdl_for {
my
(
$self
,
$rname
,
$rname_size
) =
@_
;
my
$pdl
= PDL->zeros(PDL::short(),
$rname_size
);
my
@files
=
glob
$self
->cons_dir .
'/'
.
$rname
.
'.*'
;
die
"More than one matching files for $rname"
if
@files
> 1;
my
$file
=
$files
[0];
chomp
$file
;
open
(
my
$H
,
"gzip -dc $file |"
);
my
(
$start
,
$step
);
while
(
my
$line
=
$H
->getline) {
chomp
$line
;
if
(
$line
=~ /^fixedStep\schrom=(.+)\sstart=(.+)\sstep=(.+)/) {
(
$start
,
$step
) = ($2 - 1, $3);
}
else
{
my
$piddle_tag_region
=
$pdl
->slice([
$start
,
$start
+
$step
- 1]) .=
int
(1000 *
$line
);
$start
+=
$step
;
}
}
close
$H
;
return
$pdl
;
}
1;