#!/usr/bin/env perl
our
%opts
= (
config
=>
"$Bin/../share/config.yaml"
,
chromosome
=>
undef
,
reference
=>
undef
,
cb_bed
=>
undef
,
mut_num
=> 0,
mut_rate
=> 0,
seed
=>
"None"
,
mut_file
=>
"./snv_mutations.txt"
,
);
$opts
{
'scripts'
} = {
};
my
$result
= main();
exit
(
$result
);
sub
main {
GetOptions(
\
%opts
,
"help|?"
,
"man"
,
"config|c=s"
=> \
$opts
{
'config'
},
"chromosome|chrom=s"
=> \
$opts
{
'chromosome'
},
"reference|ref=s"
=> \
$opts
{
'reference'
},
"cb_bed=s"
=> \
$opts
{
'cb_bed'
},
"mut_num|n:i"
=> \
$opts
{
'mut_num'
},
"mut_rate:f"
=> \
$opts
{
'mut_rate'
},
"seed:i"
=> \
$opts
{
'seed'
},
"mut_file|mf:s"
=> \
$opts
{
'mut_file'
},
) or pod2usage(64);
if
(
$opts
{
'help'
}) { pod2usage(1) };
if
(
$opts
{
'man'
}) { pod2usage(
-exitstatus
=> 0,
-verbose
=> 2) };
while
(
my
(
$arg
,
$value
) =
each
(
%opts
)) {
if
(!
defined
$value
) {
print
(
"ERROR: Missing argument $arg\n"
);
pod2usage(128);
}
}
if
(
$opts
{
'seed'
} eq
'None'
) {
srand
;
}
else
{
srand
(
$opts
{
'seed'
});
random_set_seed_from_phrase(
$opts
{
'seed'
});
}
my
$config
= YAML::LoadFile(
$opts
{
'config'
});
my
$cb_bed
=
$opts
{
'cb_bed'
};
my
$chromosome
=
$opts
{
'chromosome'
};
my
$genome
= Bio::SeqIO->new(
-file
=>
"<$opts{'reference'}"
,
-format
=>
"largefasta"
);
my
$primary_id
;
my
$seq
;
while
(
$seq
=
$genome
->next_seq){
$primary_id
=
$seq
->primary_id;
last
if
(
$primary_id
eq
$chromosome
);
}
croak
"ERROR: chromosome $chromosome inputted not found in the reference genome given"
if
(
$primary_id
ne
$chromosome
);
my
$signature_file
= dist_file(
'NGS-Tools-BAMSurgeon'
,
'signatures.txt'
);
croak
"ERROR: no signatures file given, please use --picksomatic to generate random mutations"
if
(not
$signature_file
);
my
$signatures
=
&read_signature
(
$signature_file
);
my
(
$signature
,
$mut_rate
,
$vaf_param
) =
&read_config
(
$config
,
$signatures
,
$config
->{
'cancer'
}->{
'sig_vary'
});
my
$num_picks
;
my
$min_loc
;
my
$max_loc
;
my
$bases
;
if
(
$opts
{
'mut_rate'
} != 0){
$mut_rate
=
$opts
{
'mut_rate'
};
(
$num_picks
,
$min_loc
,
$max_loc
,
$bases
) =
&calculate_muts
(
$mut_rate
,
$cb_bed
,
$chromosome
);
}
elsif
(
$mut_rate
!= 0){
(
$num_picks
,
$min_loc
,
$max_loc
,
$bases
) =
&calculate_muts
(
$mut_rate
,
$cb_bed
,
$chromosome
);
}
else
{
(
$num_picks
,
$min_loc
,
$max_loc
,
$bases
) =
&calculate_muts
(0.0000002,
$cb_bed
,
$chromosome
);
}
if
(
$opts
{
'mut_num'
} != 0) {
$num_picks
=
$opts
{
'mut_num'
};
say
(
"INFO: choosing $num_picks mutations."
);
}
else
{
say
(
"INFO: choosing $num_picks mutations with mutation rate of $mut_rate."
);
}
my
$locations
= [];
my
%loc
= ();
my
$vcf_num
= 0;
if
(
$config
->{
'vcf_file'
}) {
my
$vcf_file
=
$config
->{
'vcf_file'
};
my
$vcf_mutations
=
&read_vcf
(
$vcf_file
,
$chromosome
);
my
@vcf_muts
;
foreach
my
$i
(0..
scalar
(@{
$vcf_mutations
})-1) {
if
(
exists
$loc
{${${
$vcf_mutations
}[
$i
]}[1]}) {
say
(
"WARN: duplicated location in vcf file, omitted"
);
}
else
{
my
$vaf
=
&choose_vaf
(
$vaf_param
);
push
(
@vcf_muts
, [${${
$vcf_mutations
}[
$i
]}[0], ${${
$vcf_mutations
}[
$i
]}[1], ${${
$vcf_mutations
}[
$i
]}[1],
$vaf
, ${${
$vcf_mutations
}[
$i
]}[2]]);
$loc
{${${
$vcf_mutations
}[
$i
]}[1]} = 1;
}
}
$vcf_num
=
scalar
(
@vcf_muts
);
say
(
"INFO: added $vcf_num SNVs from vcf file"
);
push
(@{
$locations
},
@vcf_muts
);
$num_picks
-=
$vcf_num
;
say
(
"INFO: choosing $num_picks after subtracting from vcf file"
);
}
my
$timing_effect
=
$config
->{
'rep_time_effect'
};
my
$rep_time_file
= dist_file(
'NGS-Tools-BAMSurgeon'
,
'imputed_byref.csv'
);
$timing_effect
= 1
if
((!
defined
$timing_effect
) & (
defined
$rep_time_file
));
my
$rep_time
=
&read_replication_time
(
$rep_time_file
,
$chromosome
,
$timing_effect
);
my
$i
= 1;
my
@alt_C
= (
'A'
,
'T'
,
'G'
);
my
@alt_T
= (
'A'
,
'C'
,
'G'
);
my
$current_signature
= {};
while
(
$i
<=
$num_picks
) {
my
$vaf
=
&choose_vaf
(
$vaf_param
);
my
$location
= [];
push
(@{
$location
},
$chromosome
);
my
(
$base
,
$tri_region
) =
&sample_base
(
$bases
,
$min_loc
,
$max_loc
,
$cb_bed
,
$seq
);
push
(@{
$location
},
$base
);
push
(@{
$location
},
substr
(
$tri_region
, 1, 1));
my
(
$mut_prob
,
$NA_use
) =
&mutation_probability
(
$base
,
$rep_time
);
if
(
$NA_use
){
say
(
"WARN: position with no replication information "
.
join
(
" "
, @{
$location
}).
" chosen at number $i, choosing again"
);
next
;
}
if
(
exists
$loc
{${
$location
}[1]}) {
say
(
"WARN: duplicated location "
.
join
(
" "
, @{
$location
}).
" chosen at number $i, choosing again"
);
next
;
}
my
$rand_num
= random_uniform(1, 0, 1);
next
if
(
$rand_num
>
$mut_prob
);
my
$alt
;
if
(
substr
(
$tri_region
, 1, 1) eq
'C'
){
$alt
=
$alt_C
[
rand
@alt_C
];
}
else
{
$alt
=
$alt_T
[
rand
@alt_T
];
}
if
(!
$current_signature
->{
$tri_region
}->{
$alt
}){
$current_signature
->{
$tri_region
}->{
$alt
} = 0;
}
my
$current_rate
=
$current_signature
->{
$tri_region
}->{
$alt
} /
$i
;
next
if
(
$current_rate
>=
$signature
->{
$tri_region
}->{
$alt
});
push
(@{
$location
},
$alt
);
$current_signature
->{
$tri_region
}->{
$alt
} += 1;
push
(@{
$locations
}, [${
$location
}[0], ${
$location
}[1], ${
$location
}[1],
$vaf
, ${
$location
}[3]]);
say
"INFO: "
.
$i
.
" mutations chosen"
if
(
$i
% 100 == 0);
$loc
{${
$location
}[1]} = 1;
$i
+= 1;
}
my
$mut_file
=
$opts
{
'mut_file'
};
@{
$locations
} =
sort
{
$a
->[1] <=>
$b
->[1] } @{
$locations
};
(
open
my
$mutation_file
,
">"
,
"$mut_file"
) or croak
"ERROR: unable to open mut_file file at: $mut_file"
;
foreach
my
$location
(@{
$locations
}) {
print
(
$mutation_file
join
(
"\t"
, @{
$location
}).
"\n"
);
}
close
(
$mutation_file
);
return
0;
}
sub
read_signature {
my
(
$signature_file
) =
@_
;
say
"INFO: reading signatures file $signature_file"
;
open
(
my
$sig_in
,
'<'
,
$signature_file
) or croak
"ERROR: unable to open signatures file at : $signature_file"
;
my
$signatures
= {};
my
$header
= <
$sig_in
>;
$header
=~ s/\W+$//;
my
@header
=
split
(
'\t'
,
$header
);
foreach
my
$signature
(
@header
){
if
(
$signature
=~ m/Signature/){
$signatures
->{
$signature
} = {};
}
else
{
say
(
"WARN: unvalidated signature $signature dropped"
)
if
(
$signature
=~ m/Signature/);
}
}
while
(<
$sig_in
>){
s/\W+$//;
my
%row
;
@row
{
@header
} =
split
(
'\t'
);
foreach
my
$signature
(
keys
%$signatures
){
$signatures
->{
$signature
}->{
$row
{
$header
[2]}} =
$row
{
$signature
};
}
}
close
(
$sig_in
);
foreach
my
$signature
(
keys
%{
$signatures
}) {
my
$sum
= 0;
foreach
my
$mutation
(
keys
%{
$signatures
->{
$signature
}}) {
$sum
+=
$signatures
->{
$signature
}->{
$mutation
};
}
$sum
=
int
((
$sum
* 10000.0) + 0.5) / 10000.0;
if
(
$sum
!= 1.0000) {
delete
$signatures
->{
$signature
};
say
(
"WARN: $signature dropped for probabilities not added up to 1"
);
print
$sum
;
}
}
return
(
$signatures
);
}
sub
read_config {
my
(
$config
,
$signatures
) =
@_
;
my
$proportions
= {};
my
$mut_rate
= 0.000001;
if
((not
$config
->{
'signatures'
}) && (not
$config
->{
'cancer'
}->{
'name'
})) {
croak
"ERROR: no signature or cancer type has been specified, please use randomsites.py to generate random mutations"
;
}
elsif
(not
$config
->{
'signatures'
}) {
my
(
$cancer_sigs
) =
&read_cancer
(
$config
,
$config
->{
'cancer'
}->{
'mut_vary'
});
my
$cancer
=
$config
->{
'cancer'
}->{
'name'
};
$cancer
=
lc
(
$cancer
);
my
$proportions_sum
= 0;
my
@sig_list
;
my
$pro_hash
;
if
((
defined
$config
->{
'cancer'
}->{
'sig_vary'
}) && (
$config
->{
'cancer'
}->{
'sig_vary'
}) == 1) {
my
$yaml
= YAML::Tiny->
read
(dist_file(
'NGS-Tools-BAMSurgeon'
,
'cancer_sig_vary.yaml'
));
my
$cancer_var
=
$yaml
->[0];
foreach
my
$cancer
(
keys
%{
$cancer_var
}) {
my
$mod_cancer
=
lc
(
$cancer
);
$cancer_var
->{
$mod_cancer
} =
$cancer_var
->{
$cancer
};
}
if
(
$cancer_var
->{
$cancer
}){
my
@cancer_pp
= @{
$cancer_var
->{
$cancer
}};
my
$chosen_pp
=
$cancer_pp
[
rand
@cancer_pp
];
@sig_list
= (
keys
$chosen_pp
);
$pro_hash
=
$chosen_pp
;
}
else
{
say
"WARN: sig_vary chosen as 1 but no sig_vary information available for "
.
$cancer
.
", using default signatures"
;
}
}
if
((!
@sig_list
) || (!
defined
$pro_hash
)){
croak
"ERROR: $cancer signature is not defined in the cancer files provided"
if
(
scalar
(
keys
%{
$cancer_sigs
->{
$cancer
}}) <= 1);
@sig_list
=
keys
%{
$cancer_sigs
->{
$cancer
}};
$pro_hash
=
$cancer_sigs
->{
$cancer
};
}
my
@requests
= ();
foreach
my
$request
(
@sig_list
) {
next
if
(
$request
eq
'Mut rate'
);
if
(
$signatures
->{
$request
}) {
push
(
@requests
,
$request
);
$proportions_sum
+=
$pro_hash
->{
$request
};
}
else
{
say
"WARN: $request is not defined in signatures file, proportions recalculated"
;
}
}
say
(
"INFO: using signature information of $cancer"
);
foreach
my
$request
(
@requests
) {
next
if
(
$request
eq
'Mut rate'
);
$proportions
->{
$request
}->{
'mutations'
} =
$signatures
->{
$request
};
$proportions
->{
$request
}->{
'proportion'
} = (
$pro_hash
->{
$request
})/
$proportions_sum
;
say
"INFO: $request with proportion "
.
$proportions
->{
$request
}->{
'proportion'
};
}
if
(!
$cancer_sigs
->{
$cancer
}->{
'Mut rate'
}){
say
"WARN: $cancer mutation rate is not defined in the cancer files provided, using default of 0.0000002 if necessary"
;
$mut_rate
= 0.0000002;
}
else
{
$mut_rate
=
$cancer_sigs
->{
$cancer
}->{
'Mut rate'
};
}
}
else
{
my
@requests
= (
keys
%{
$config
->{
'signatures'
}});
my
$proportions_sum
= 0;
say
(
"INFO: using specified signature information"
);
foreach
my
$request
(
@requests
) {
my
$mod_request
=
ucfirst
(
lc
(
$request
));
$mod_request
=~ s/1a$/1A/;
$mod_request
=~ s/1b$/1B/;
croak
"ERROR: $mod_request is not defined in signatures file"
if
(not
$signatures
->{
$mod_request
});
$proportions
->{
$request
}->{
'mutations'
} =
$signatures
->{
$mod_request
};
$proportions
->{
$request
}->{
'proportion'
} =
$config
->{
'signatures'
}->{
$request
};
$proportions_sum
+=
$proportions
->{
$request
}->{
'proportion'
};
say
"INFO: $request with proportion "
.
$proportions
->{
$request
}->{
'proportion'
};
}
croak
"ERROR: signatures proportions not adding up to 1"
if
(
$proportions_sum
!= 1);
}
my
$signature
=
&concat_signature
(
$proportions
);
my
$vaf_param
;
$vaf_param
->{
'minvaf'
} = (
defined
$config
->{
'minvaf'
})?
$config
->{
'minvaf'
} : 1;
$vaf_param
->{
'maxvaf'
} = (
defined
$config
->{
'maxvaf'
})?
$config
->{
'maxvaf'
} : 1;
$vaf_param
->{
'vafbeta1'
} = (
defined
$config
->{
'vafbeta1'
})?
$config
->{
'vafbeta1'
} : 2.0;
$vaf_param
->{
'vafbeta2'
} = (
defined
$config
->{
'vafbeta2'
})?
$config
->{
'vafbeta2'
} : 2.0;
return
(
$signature
,
$mut_rate
,
$vaf_param
);
}
sub
read_cancer {
my
(
$config
,
$mut_vary
) =
@_
;
my
$cancer_sigs
= YAML::LoadFile(dist_file(
'NGS-Tools-BAMSurgeon'
,
'cancer_sigmut.yaml'
));
foreach
my
$cancer
(
keys
%{
$cancer_sigs
}) {
my
$mod_cancer
=
lc
(
$cancer
);
$cancer_sigs
->{
$mod_cancer
} =
$cancer_sigs
->{
$cancer
};
}
my
@cancers
= (
keys
%{
$cancer_sigs
});
my
$cancer
=
$config
->{
'cancer'
}->{
'name'
};
$cancer
=
lc
(
$cancer
);
my
@full_names
=
grep
/
$cancer
/i,
@cancers
;
if
((
defined
$mut_vary
) &&
$mut_vary
== 1) {
my
$yaml
= YAML::Tiny->
read
(dist_file(
'NGS-Tools-BAMSurgeon'
,
'cancer_mut_vary.yaml'
));
my
$cancer_muts
=
$yaml
->[0];
foreach
my
$cancer_mut
(
keys
%{
$cancer_muts
}) {
my
$mod_cancer
=
lc
(
$cancer_mut
);
$cancer_muts
->{
$mod_cancer
} =
$cancer_muts
->{
$cancer_mut
};
}
if
(not
$cancer_muts
->{
$cancer
}){
say
"WARN: mut_vary chosen as 1 but no mut_vary information available for "
.
$cancer
.
", using default mutation rates"
;
}
else
{
foreach
my
$name
(
@full_names
) {
$cancer_sigs
->{
$name
}->{
'Mut rate'
} =
&cancer_mut_sim
(
$cancer_muts
->{
$cancer
});
}
$cancer_sigs
->{
$cancer
}->{
'Mut rate'
} =
&cancer_mut_sim
(
$cancer_muts
->{
$cancer
});
}
}
return
(
$cancer_sigs
);
}
sub
cancer_mut_sim {
my
(
$cancer_model
) =
@_
;
my
$mu
;
my
$sigma
;
my
$rate
;
my
$rand_num
= random_uniform(1, 0, 1);
if
(
$rand_num
<=
$cancer_model
->{
'lambda'
}->[0]){
$mu
=
$cancer_model
->{
'mu'
}->[0];
$sigma
=
$cancer_model
->{
'sigma'
}->[0];
}
else
{
$mu
=
$cancer_model
->{
'mu'
}->[1];
$sigma
=
$cancer_model
->{
'sigma'
}->[1];
}
my
$log_rate
= random_normal(1,
$mu
,
$sigma
);
$rate
= 10*
*$log_rate
/1000000;
return
(
$rate
);
}
sub
concat_signature {
my
(
$proportions
) =
@_
;
my
@nucleotides
= (
'A'
,
'T'
,
'C'
,
'G'
);
my
$signature
= {};
foreach
my
$first
(
@nucleotides
) {
foreach
my
$middle
(
'C'
,
'T'
) {
foreach
my
$last
(
@nucleotides
) {
$signature
->{
"$first$middle$last"
} = {};
}
}
}
foreach
my
$mut_sig
(
keys
%{
$proportions
}) {
foreach
my
$mutation
(
keys
%{
$proportions
->{
$mut_sig
}->{
'mutations'
}}){
$proportions
->{
'Total'
}->{
'mutations'
}->{
$mutation
} +=
$proportions
->{
$mut_sig
}->{
'mutations'
}->{
$mutation
} *
$proportions
->{
$mut_sig
}->{
'proportion'
};
}
}
my
$pro_max
= 0;
foreach
my
$mutation
(
keys
%{
$proportions
->{
'Total'
}->{
'mutations'
}}){
if
(
$proportions
->{
'Total'
}->{
'mutations'
}->{
$mutation
} >
$pro_max
){
$pro_max
=
$proportions
->{
'Total'
}->{
'mutations'
}->{
$mutation
};
}
}
foreach
my
$mutation
(
keys
%{
$proportions
->{
'Total'
}->{
'mutations'
}}){
my
$mut_tri
=
substr
(
$mutation
, 0, 1).
substr
(
$mutation
, 2, 1).
substr
(
$mutation
, 6, 1);
my
$alt_base
=
substr
(
$mutation
, 4, 1);
$signature
->{
$mut_tri
}->{
$alt_base
} +=
$proportions
->{
'Total'
}->{
'mutations'
}->{
$mutation
};
}
$signature
->{
'max'
} =
$pro_max
;
return
(
$signature
);
}
sub
sample_base {
my
(
$bases
,
$min_loc
,
$max_loc
,
$cb_bed
,
$seq
) =
@_
;
my
$flag
= 0;
my
$base
;
my
$tri_region
;
while
(!
$flag
){
my
$position
= random_uniform_integer(1, 1,
$bases
);
$base
=
&find_location
(
$position
,
$cb_bed
);
$tri_region
=
$seq
->subseq(
$base
-1,
$base
+1);
$tri_region
=
uc
(
$tri_region
);
my
$ref
=
substr
(
$tri_region
, 1, 1);
if
(
$ref
=~ m/A|G/) {
$tri_region
=~
tr
/ATCG/TAGC/;
$tri_region
=~ s/^(.)(.)(.)$/$3$2$1/;
}
$flag
= (
$tri_region
!~ m/N+/);
say
"WARN: callable trinucleotide region "
.
$base
.
" "
.
$tri_region
.
" contains N in reference genome, choosing again"
if
(!
$flag
);
}
return
(
$base
,
$tri_region
);
}
sub
find_location {
my
(
$position
,
$cb_bed
) =
@_
;
my
$base
;
my
$sum
= 0;
open
(
my
$bed_in
,
'<'
,
$cb_bed
) or croak
"ERROR: unable to open bed file at : $cb_bed"
;
while
(<
$bed_in
>) {
s/\W+$//;
my
@row
=
split
(
'\t'
);
my
$pos
=
$position
+
$row
[1] - 1;
if
(
$pos
>=
$row
[2]){
$position
=
$position
- (
$row
[2] -
$row
[1]);
}
else
{
$base
=
$pos
;
last
;
}
}
close
(
$bed_in
);
$base
=
$base
+ 1;
return
(
$base
);
}
sub
mutation_probability {
my
(
$base
,
$rep_time
) =
@_
;
my
$mut_prob
;
my
$NA_use
= 0;
my
$base_region
= ceil(
$base
/ 100000) - 1;
$mut_prob
=
$rep_time
->[
$base_region
];
if
(!
defined
$mut_prob
) {
$NA_use
= 1;
}
return
(
$mut_prob
,
$NA_use
);
}
sub
read_replication_time {
my
(
$rep_time_file
,
$chromosome
,
$timing_effect
) =
@_
;
my
$rep_time
= [];
my
@equal_time
= (1)x3000;
$chromosome
= 23
if
(
$chromosome
eq
'X'
);
if
(not
$rep_time_file
){
say
"WARN: no replication timing effect file given, treating all bases as with same replication time"
;
$rep_time
= \
@equal_time
;
return
$rep_time
;
}
if
(
$chromosome
eq
'Y'
){
say
"WARN: no replication timing available for chromosome Y, treating all bases as with same replication time"
;
$rep_time
= \
@equal_time
;
return
$rep_time
;
}
open
(
my
$csv_in
,
'<'
,
$rep_time_file
) or croak
"ERROR: unable to open vcf file at : $rep_time_file"
;
my
$header_line
= <
$csv_in
>;
$header_line
=~ s/\W+$//;
my
@header
=
split
(
'\t'
,
$header_line
);
my
(
$index
) =
grep
{
$header
[
$_
] eq
'corr'
} (0 ..
@header
-1);
if
(not
defined
$index
){
say
"WARN: replication timing effect file does not have \'corr\' column, treating all bases as same"
;
$rep_time
= \
@equal_time
;
close
(
$csv_in
);
return
$rep_time
;
}
my
$min
= 2000;
my
$max
= -2000;
while
(<
$csv_in
>){
s/\W+$//;
my
$row
= [
split
(
'\t'
)];
next
if
(${
$row
}[0] !=
$chromosome
);
if
(${
$row
}[
$index
] eq
"NA"
|| ${
$row
}[
$index
] eq
"NaN"
){
push
(@{
$rep_time
},
undef
);
}
else
{
my
$corr
= ${
$row
}[
$index
];
$corr
= 10*
*$corr
;
push
(@{
$rep_time
},
$corr
);
$max
=
$corr
if
(
$corr
>
$max
);
$min
=
$corr
if
(
$corr
<
$min
);
}
}
close
(
$csv_in
);
my
$diff
=
$max
-
$min
;
my
$scale
= (
$max
-
$diff
*
$timing_effect
) /
$max
;
foreach
my
$i
(0..
scalar
(@{
$rep_time
})-1) {
next
if
(!
defined
$rep_time
->[
$i
]);
$rep_time
->[
$i
] = ((1 -
$scale
)*(
$rep_time
->[
$i
] -
$min
)) /
$diff
+
$scale
;
}
return
(
$rep_time
);
}
sub
rep_mut_correlation {
my
(
$replication
,
$rep_mut_numbers
) =
@_
;
my
$slope
=
$rep_mut_numbers
->{
'rep_mut_regression'
}->{
'slope'
};
my
$intercept
=
$rep_mut_numbers
->{
'rep_mut_regression'
}->{
'intercept'
};
my
$se
=
$rep_mut_numbers
->{
'rep_mut_regression'
}->{
'se'
};
my
$rep_mut
=
$replication
*$slope
+
$intercept
;
return
(
$rep_mut
);
}
sub
choose_vaf {
my
(
$vaf_param
) =
@_
;
my
$vaf
= Math::Random::random_beta(1,
$vaf_param
->{
'vafbeta1'
},
$vaf_param
->{
'vafbeta2'
});
$vaf
=
$vaf
* (
$vaf_param
->{
'maxvaf'
}-
$vaf_param
->{
'minvaf'
}) +
$vaf_param
->{
'minvaf'
};
return
(
$vaf
);
}
sub
calculate_muts {
my
(
$mut_rate
,
$cb_bed
,
$chromosome
) =
@_
;
my
$bases
= 0;
my
$min_loc
= 0;
my
$max_loc
;
open
(
my
$bed_in
,
'<'
,
$cb_bed
) or croak
"ERROR: unable to open bed file at : $cb_bed"
;
my
$first_line
= <
$bed_in
>;
my
@first_row
=
split
(
'\t'
,
$first_line
);
croak
"ERROR: primary_id of fasta file not the same as chromosome of bed file given"
if
(
$first_row
[0] ne
$chromosome
);
$bases
+= (
$first_row
[2] -
$first_row
[1]);
$min_loc
=
$first_row
[1];
my
@last_row
;
while
(<
$bed_in
>) {
s/\W+$//;
my
@row
=
split
(
'\t'
);
croak
"ERROR: primary_id of fasta file not the same as chromosome of bed file given"
if
(
$row
[0] ne
$chromosome
);
$bases
+= (
$row
[2] -
$row
[1]);
@last_row
=
@row
;
}
close
(
$bed_in
);
$max_loc
=
$last_row
[2] - 1;
print
"INFO: Chromosome: $chromosome\tCovered bases: $bases\tMin: $min_loc\tMax: $max_loc\n"
;
my
$SNV_num
= 200;
if
(
$bases
){
$SNV_num
=
$bases
*
$mut_rate
;
$SNV_num
=
sprintf
(
"%.0f"
,
$SNV_num
);
}
else
{
say
(
"INFO: cannot obtain readcount information for chromosome $chromosome, picking 200 mutations."
);
}
return
(
$SNV_num
,
$min_loc
,
$max_loc
,
$bases
);
}
sub
check_location {
my
(
$location
,
$cb_bed
) =
@_
;
my
$flag
= 1;
my
$pos
=
$location
- 1;
open
(
my
$bed_in
,
'<'
,
$cb_bed
) or croak
"ERROR: unable to open bed file at : $cb_bed"
;
while
(<
$bed_in
>) {
s/\W+$//;
my
@row
=
split
(
'\t'
);
if
(
$pos
>=
$row
[1] &&
$pos
<
$row
[2]){
$flag
= 0;
last
;
}
}
close
(
$bed_in
);
return
(
$flag
);
}
sub
read_vcf {
my
(
$vcf_file
,
$chromosome
) =
@_
;
say
"INFO: reading vcf file $vcf_file"
;
open
(
my
$vcf_in
,
'<'
,
$vcf_file
) or croak
"ERROR: unable to open vcf file at : $vcf_file"
;
my
$vcf_mutations
= [];
while
(<
$vcf_in
>) {
s/\W+$//;
my
@row
=
split
(
'\t'
);
if
(
$row
[0] eq
$chromosome
) {
if
(
length
(
$row
[3]) == 1 &&
length
(
$row
[4]) == 1) {
push
(@{
$vcf_mutations
}, [
$chromosome
,
$row
[1],
$row
[4]]);
}
}
}
close
(
$vcf_in
);
return
(
$vcf_mutations
);
}