bin/generate_signature.pl


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
              #!/usr/bin/env perl
### generate_signature.pl ############################################################################
# Generate mutation input varfile according to the Trinucleotide Signature and Proportions specified
# - read the signatures file table for mutation types and probabilities
# - calculate mutation number and probabilities for each signature specified
# - randomly choose positions in the chromosome according to mutation type (BY INDEPENDENT SAMPLING)
# - specify chromosome, start, end, vaf, and alternative base in BEDfile format as output (varfile)
# - take in custome mutations vcf file and incorporate according to chromosome
# - take in cancer file and cancer name to generate cancer proportioned mutations and rates
# - take in covered bases bed file and only spike in mutation in bases with over 10x coverage
# Usage: <generate_signature.pl> [options] [file ...]
        # Options:
        # --help                                brief help message
        # --man                         full documentation
        # --config                      configuration file in YAML format containing signature information (Required)
        # --chromosome        the chromosome that mutations are generated for (Required)
        # --cb_bed            the bed file for the callable bases of the chromosome (default = "None")
        # --mut_num                     number of sites to generate (default = 0)
        # --mut_rate                    rate of mutation to generate (default = 0)
        # --seed              seed used for reproducible random mutation location generation (default = "None")
        # --mut_file          file to output mutations to (default = "./mut_file.txt")
        # --minvaf            minimum variant allele fraction (default = 0.25)
        # --maxvaf            maximum variant allele fraction (default = 0.5)
        # --vafbeta1          left shape parameter for beta distribution of VAFs (default = 2.0)
        # --vafbeta2          right shape parameter for beta distribution of VAFs (default = 2.0)
### HISTORY #######################################################################################
# Version               Date            Developer               Comments
# 0.01                  2015-05-15      lliu                    Initial code development.
#
# 0.02                  2016-01-10      lliu                    Incorporate replication timing effect in mutation rate
#                               
# 0.03                  2016-05-01      lliu                    Incorporate mutation rate and signature variations
#
#
### INCLUDES ######################################################################################
use NGS::Tools::BAMSurgeon::Helper;
use warnings;
use strict;
use Carp;
use Getopt::Long;
use Pod::Usage;
use File::Basename;
use File::Path qw(make_path);   # makes directories given paths
use FindBin qw($Bin);           # finds bin where this method was invoked
use Path::Class;
use YAML qw(LoadFile);
use YAML::Tiny;
use Data::Dumper;
use List::MoreUtils qw( each_array );
use POSIX qw(strftime);
use POSIX qw(ceil);
use Bio::SeqIO;                 # reads reference genome fasta file
use Bio::SeqIO::fasta;
use Math::Random;
use POSIX qw(ceil);
use File::ShareDir ':ALL';
### COMMAND LINE DEFAULT ARGUMENTS ################################################################
# list of arguments and default values go here as hash key/value pairs
our %opts = (
        config                              => "$Bin/../share/config.yaml",  ### the yaml config file
        chromosome                                      => undef,
        reference                                       => undef,
        cb_bed                      => undef,                                                        ### full path to chromosome callable bases bed file
        mut_num                     => 0,
        mut_rate                    => 0,                                                            
        seed                        => "None",                                                       ###seed to make reproducible random picks
        mut_file                    => "./snv_mutations.txt",                                        ###file to output mutations to
        );
$opts {'scripts'} = {
        };
### MAIN CALLER ###################################################################################
my $result = main();
exit($result);
### FUNCTIONS #####################################################################################
### main ##########################################################################################
# Description:
#               Main subroutine for program
# Input Variables:
#               %opts = command line arguments
# Output Variables:
#               N/A
sub main {
        GetOptions(
                \%opts,
                "help|?",
                "man",
                "config|c=s" => \$opts{'config'},            ### YAML file,
                "chromosome|chrom=s" => \$opts{'chromosome'},
                "reference|ref=s" => \$opts{'reference'},
                "cb_bed=s" => \$opts{'cb_bed'},
                "mut_num|n:i" => \$opts{'mut_num'},
                "mut_rate:f" => \$opts{'mut_rate'},
                "seed:i" => \$opts{'seed'},
                "mut_file|mf:s" => \$opts{'mut_file'},
                ) or pod2usage(64);
        if ($opts{'help'}) { pod2usage(1) };
        if ($opts{'man'}) { pod2usage(-exitstatus => 0, -verbose => 2) };
        ### check for undef arguments in %opts ###
        while(my ($arg, $value) = each(%opts)) {
                if (!defined $value) {
                        print("ERROR: Missing argument $arg\n");
                        pod2usage(128);         # prints out SYNOPSIS
                        }
                } # while
        #load seed
        if ($opts{'seed'} eq 'None') {
                srand;
        } else {
                srand($opts{'seed'});
                random_set_seed_from_phrase($opts{'seed'});
                }
        #read in files
        my $config = YAML::LoadFile($opts{'config'});
        my $cb_bed = $opts{'cb_bed'};
        my $chromosome = $opts{'chromosome'};
        #read in chromosome genome
        my $genome = Bio::SeqIO->new(-file => "<$opts{'reference'}",
                                                                        -format => "largefasta");
        my $primary_id;
        my $seq;
        while ($seq = $genome->next_seq){
                $primary_id = $seq->primary_id;
                last if ($primary_id eq $chromosome);
        } 
        croak "ERROR: chromosome $chromosome inputted not found in the reference genome given" if ($primary_id ne $chromosome);
        #read in signature file
        my $signature_file = dist_file('NGS-Tools-BAMSurgeon',  'signatures.txt');
        croak "ERROR: no signatures file given, please use --picksomatic to generate random mutations" if (not $signature_file);
        my $signatures = &read_signature($signature_file);
        #read in proportions from config file
        my ($signature, $mut_rate, $vaf_param) = &read_config($config, $signatures, $config->{'cancer'}->{'sig_vary'});
        #determine number of mutations to pick and basic information of the chromosome
        my $num_picks;
        my $min_loc; # smallest covered base location
        my $max_loc; # largest covered base location
        my $bases; # number of covered bases
        if ($opts{'mut_rate'} != 0){ #if mutation rate is given
                $mut_rate = $opts{'mut_rate'};
                ($num_picks, $min_loc, $max_loc, $bases) = &calculate_muts($mut_rate, $cb_bed, $chromosome);
        } elsif ($mut_rate != 0){ #if given cancer
                ($num_picks, $min_loc, $max_loc, $bases) = &calculate_muts($mut_rate, $cb_bed, $chromosome);
        } else { #use default mutation rate
                ($num_picks, $min_loc, $max_loc, $bases) = &calculate_muts(0.0000002, $cb_bed, $chromosome);
                }
        if ($opts{'mut_num'} != 0) { # if mutation number is given, takes precedence
                $num_picks = $opts{'mut_num'};
                say ("INFO: choosing $num_picks mutations.");
        } else {
                say ("INFO: choosing $num_picks mutations with mutation rate of $mut_rate.");
        }
         
        my $locations = []; #chosen mutations
        my %loc = (); #variable for testing for duplicate locations
        my $vcf_num = 0;
        #read vcf file and extract relevant mutations
        if ($config->{'vcf_file'}) {
                my $vcf_file = $config->{'vcf_file'};
                my $vcf_mutations = &read_vcf($vcf_file, $chromosome);
                my @vcf_muts;
                foreach my $i (0..scalar(@{$vcf_mutations})-1) {
                        if (exists $loc{${${$vcf_mutations}[$i]}[1]}) {
                                say ("WARN: duplicated location in vcf file, omitted");
                        } else {
                                my $vaf = &choose_vaf($vaf_param);
                                push (@vcf_muts, [${${$vcf_mutations}[$i]}[0], ${${$vcf_mutations}[$i]}[1], ${${$vcf_mutations}[$i]}[1], $vaf, ${${$vcf_mutations}[$i]}[2]]);
                                $loc{${${$vcf_mutations}[$i]}[1]} = 1;
                                }
                        }
                $vcf_num = scalar(@vcf_muts);
                say ("INFO: added $vcf_num SNVs from vcf file");
                push (@{$locations}, @vcf_muts);
                $num_picks -= $vcf_num;
                say ("INFO: choosing $num_picks after subtracting from vcf file");
                }
        #read in replication timing
        my $timing_effect = $config->{'rep_time_effect'};
        my $rep_time_file = dist_file('NGS-Tools-BAMSurgeon',  'imputed_byref.csv');
        $timing_effect = 1 if ((!defined $timing_effect) & (defined $rep_time_file));
        my $rep_time = &read_replication_time($rep_time_file, $chromosome, $timing_effect);
        my $i = 1; #number chosen so far
        my @alt_C = ('A', 'T', 'G');
        my @alt_T = ('A', 'C', 'G');
        my $current_signature = {};
        while ($i <= $num_picks) {
                my $vaf = &choose_vaf($vaf_param);
                my $location = []; #current mutation in construction
                push (@{$location}, $chromosome); #chrom[0]
                my ($base, $tri_region) = &sample_base($bases, $min_loc, $max_loc, $cb_bed, $seq);
                push (@{$location}, $base); #location[1]
                push (@{$location}, substr($tri_region, 1, 1)); #ref[2]
                my ($mut_prob, $NA_use) = &mutation_probability($base, $rep_time);
                if ($NA_use){
                        say ("WARN: position with no replication information ".join("  ", @{$location})." chosen at number $i, choosing again");
                        next;
                }
                if (exists $loc{${$location}[1]}) {
                        say ("WARN: duplicated location ".join("  ", @{$location})." chosen at number $i, choosing again");
                        next;
                }
                #Layer 1: replication time
                my $rand_num = random_uniform(1, 0, 1);
                next if ($rand_num > $mut_prob);
                #Layer 2: signature
                my $alt;
                if (substr($tri_region, 1, 1) eq 'C'){
                        $alt = $alt_C[rand @alt_C];
                } else {
                        $alt = $alt_T[rand @alt_T];
                }
                #my $alt_rand = random_uniform(1, 0, 1);
                #next if ($alt_rand > ($signature->{$tri_region}->{$alt} / $signature->{'max'}));
                if (!$current_signature->{$tri_region}->{$alt}){
                        $current_signature->{$tri_region}->{$alt} = 0;
                }
                my $current_rate = $current_signature->{$tri_region}->{$alt} / $i;
                next if ($current_rate >= $signature->{$tri_region}->{$alt});
                #Passed all layers
                push (@{$location}, $alt); #alt[3]
                $current_signature->{$tri_region}->{$alt} += 1;
                #actual format required by BAMsurgeon
                #'chr', 'start', 'end', 'vaf', 'alt'
                push (@{$locations}, [${$location}[0], ${$location}[1], ${$location}[1], $vaf, ${$location}[3]]);
                #used for testing with R script
                #'chr', 'start', 'ref', 'alt'
                #push (@{$locations}, $location); 
                say "INFO: ".$i." mutations chosen" if ($i % 100 == 0);
                $loc{${$location}[1]} = 1;
                $i += 1;
                }
  
        my $mut_file = $opts{'mut_file'};
        #sort mutations by ascending location
        @{$locations} = sort { $a->[1] <=> $b->[1] } @{$locations};
        (open my $mutation_file, ">", "$mut_file") or croak "ERROR: unable to open mut_file file at: $mut_file";
        foreach my $location (@{$locations}) {
                print ($mutation_file join("\t", @{$location})."\n");
                }
        close ($mutation_file);
        return 0;
        } #main
### read_signature #################################################################################
# Description:
#               Read the signature file provided assuming that the third column is "Somatic Mutation Type"
# Input Variables:
#               $signature_file = signature file
# Output Variables:
#               $signatures = all of the signatures stored in a hash of hash of hash
sub read_signature {
        my ($signature_file) = @_;
        say "INFO: reading signatures file $signature_file";
         
        open (my $sig_in, '<', $signature_file) or croak "ERROR: unable to open signatures file at : $signature_file";
        my $signatures = {};
        my $header = <$sig_in>;
        $header =~ s/\W+$//;
        my @header = split('\t', $header);
        #read the header line and make a key for each of the valid mutations
        foreach my $signature (@header){
                #if ($signature =~ m/Signature\s\d/) { #assume that signatures that begin with a number are validated
                if ($signature =~ m/Signature/){
                        $signatures->{$signature} = {};
                } else {
                        say ("WARN: unvalidated signature $signature dropped") if ($signature =~ m/Signature/);
                        }
                }
        #read each line and assign the probabilities of each mutation to each signature
        while (<$sig_in>){
                s/\W+$//;
                my %row;
                @row{@header} = split('\t');
                foreach my $signature (keys %$signatures){
                        $signatures->{$signature}->{$row{$header[2]}} = $row{$signature}; #assume that the 3rd column is 'somatic mutation type'
                        }
                }
        close($sig_in);
        #validate that each signature has probabilities rdding up to 1
        foreach my $signature (keys %{$signatures}) {
                my $sum = 0;
                foreach my $mutation (keys %{$signatures->{$signature}}) {
                        $sum += $signatures->{$signature}->{$mutation};
                        }
                $sum = int(($sum * 10000.0) + 0.5) / 10000.0;
                if ($sum != 1.0000) {
                        delete $signatures->{$signature};
                        say ("WARN: $signature dropped for probabilities not added up to 1");
                        print $sum;
                        }
                }
        return ($signatures);
        }
### read_config ####################################################################################
# Description:
#               Read the config hash specified by the YAML file
# Input Variables:
#               $config = config of signature type or cancer type to simulate
#               $signatures = the hash of probabilities of all signatures from file
#               $sig_vary = file containing variation in cancer signature
# Output Variables:
#               $signature = the overall trinucleotide signature to follow
#               $mut_rate = the mutation rate determined by the cancer or by default
sub read_config {
        my ($config, $signatures) = @_;
        my $proportions = {};
        my $mut_rate = 0.000001;
        #checking if signature or cancer information present, if both present signature info is used
        if ((not $config->{'signatures'}) && (not $config->{'cancer'}->{'name'})) {
                croak "ERROR: no signature or cancer type has been specified, please use randomsites.py to generate random mutations";
        } elsif (not $config->{'signatures'}) {
                #read in cancer related files
                my ($cancer_sigs) = &read_cancer($config, $config->{'cancer'}->{'mut_vary'});
                my $cancer = $config->{'cancer'}->{'name'};
                $cancer = lc ($cancer);
                my $proportions_sum = 0;
                my @sig_list;
                my $pro_hash;
                #TODO find better way to generation variation in signatures
                if ((defined $config->{'cancer'}->{'sig_vary'}) && ($config->{'cancer'}->{'sig_vary'}) == 1) {
                        #read in cancer signatures variation file
                        my $yaml = YAML::Tiny->read(dist_file('NGS-Tools-BAMSurgeon',  'cancer_sig_vary.yaml'));
                        my $cancer_var = $yaml->[0];
                        #save lowercase cancer keys
                        foreach my $cancer (keys %{$cancer_var}) {
                                my $mod_cancer = lc ($cancer);
                                $cancer_var->{$mod_cancer} = $cancer_var->{$cancer};
                        }
                        if ($cancer_var->{$cancer}){
                                #pick a signature proportion for the cancer at random
                                my @cancer_pp = @{$cancer_var->{$cancer}};
                                my $chosen_pp = $cancer_pp[rand @cancer_pp];
                                @sig_list = (keys $chosen_pp);
                                $pro_hash = $chosen_pp;
                                }
                        else {
                                say "WARN: sig_vary chosen as 1 but no sig_vary information available for ".$cancer.", using default signatures";
                                }
                        }
                if ((!@sig_list) || (!defined $pro_hash)){
                        croak "ERROR: $cancer signature is not defined in the cancer files provided" if (scalar (keys %{$cancer_sigs->{$cancer}}) <= 1);
                        @sig_list = keys %{$cancer_sigs->{$cancer}};
                        $pro_hash = $cancer_sigs->{$cancer};
                }
                my @requests = ();
                foreach my $request (@sig_list) {
                        next if ($request eq 'Mut rate');
                        if ($signatures->{$request}) {
                                push (@requests, $request);
                                $proportions_sum += $pro_hash->{$request};
                        } else {
                                say "WARN: $request is not defined in signatures file, proportions recalculated";
                        }
                }
                say ("INFO: using signature information of $cancer");
                foreach my $request (@requests) {
                        next if ($request eq 'Mut rate');
                        $proportions->{$request}->{'mutations'} = $signatures->{$request};
                        $proportions->{$request}->{'proportion'} = ($pro_hash->{$request})/$proportions_sum;
                        say "INFO:      $request with proportion ".$proportions->{$request}->{'proportion'};
                }
                if (!$cancer_sigs->{$cancer}->{'Mut rate'}){
                        say "WARN: $cancer mutation rate is not defined in the cancer files provided, using default of 0.0000002 if necessary";
                        $mut_rate = 0.0000002;
                } else {
                        $mut_rate = $cancer_sigs->{$cancer}->{'Mut rate'};
                }
        } else {
                my @requests = (keys %{$config->{'signatures'}});
                my $proportions_sum = 0;
                say ("INFO: using specified signature information");
                foreach my $request (@requests) {
                        my $mod_request = ucfirst (lc ($request));
                        $mod_request =~ s/1a$/1A/;
                        $mod_request =~ s/1b$/1B/;
                        croak "ERROR: $mod_request is not defined in signatures file" if (not $signatures->{$mod_request});
                        $proportions->{$request}->{'mutations'} = $signatures->{$mod_request};
                        $proportions->{$request}->{'proportion'} = $config->{'signatures'}->{$request};
                        $proportions_sum += $proportions->{$request}->{'proportion'};
                        say "INFO:      $request with proportion ".$proportions->{$request}->{'proportion'};
                }
                croak "ERROR: signatures proportions not adding up to 1" if ($proportions_sum != 1);
        }
        my $signature = &concat_signature($proportions);
        my $vaf_param;
        $vaf_param->{'minvaf'} = (defined $config->{'minvaf'})? $config->{'minvaf'} : 1;
        $vaf_param->{'maxvaf'} = (defined $config->{'maxvaf'})? $config->{'maxvaf'} : 1;
        $vaf_param->{'vafbeta1'} = (defined $config->{'vafbeta1'})? $config->{'vafbeta1'} : 2.0;
        $vaf_param->{'vafbeta2'} = (defined $config->{'vafbeta2'})? $config->{'vafbeta2'} : 2.0;
        return ($signature, $mut_rate, $vaf_param);
        }
### read_cancer ####################################################################################
# Description:
#               Read the config hash with regards to the cancer information
# Input Variables:
#               $config = config of signature type or cancer type to simulate
#               $mut_vary = file containing information of the mutation rate distribution of cancers
# Output Variables:
#               $cancer_sigs = the trinucleotide signatures and mutation rate information of cancers
sub read_cancer {
        my ($config, $mut_vary) = @_;
        my $cancer_sigs = YAML::LoadFile(dist_file('NGS-Tools-BAMSurgeon',  'cancer_sigmut.yaml'));
        #save lowercase cancer keys
        foreach my $cancer (keys %{$cancer_sigs}) {
                my $mod_cancer = lc ($cancer);
                $cancer_sigs->{$mod_cancer} = $cancer_sigs->{$cancer};
                }
        #alter chosen cancer mutation and signature according to vary options
        my @cancers = (keys %{$cancer_sigs});
        my $cancer = $config->{'cancer'}->{'name'};
        $cancer = lc ($cancer);
        my @full_names = grep/$cancer/i, @cancers;
        #TODO change variation to scale
        if ((defined $mut_vary) && $mut_vary == 1) {
                #read in mutation rate variation file
                my $yaml = YAML::Tiny->read(dist_file(  'NGS-Tools-BAMSurgeon',  'cancer_mut_vary.yaml'));
                my $cancer_muts = $yaml->[0];
                #save lowercase cancer keys
                foreach my $cancer_mut (keys %{$cancer_muts}) {
                        my $mod_cancer = lc ($cancer_mut);
                        $cancer_muts->{$mod_cancer} = $cancer_muts->{$cancer_mut};
                }
                if (not $cancer_muts->{$cancer}){
                        say "WARN: mut_vary chosen as 1 but no mut_vary information available for ".$cancer.", using default mutation rates";
                        }
                else {
                        foreach my $name (@full_names) {
                                $cancer_sigs->{$name}->{'Mut rate'} = &cancer_mut_sim($cancer_muts->{$cancer});
                                }
                        #also add in name in case it is not found in sigmut file
                        $cancer_sigs->{$cancer}->{'Mut rate'} = &cancer_mut_sim($cancer_muts->{$cancer});
                        }
                }
        return ($cancer_sigs);
        }
### cancer_mut_sim ###############################################################################
# Description:
#               Simulate the mutation rate of the cancer according to 2 Normal Gaussian mixture model *see R script
# Input Variables:
#               $cancer_model: the lambda, mu, sigma parameters of the specific cancer
# Output Variables:
#               $rate = the simulated mutation rate of the cancer
sub cancer_mut_sim {
        my ($cancer_model) = @_;
        my $mu;
        my $sigma;
        my $rate;
        my $rand_num = random_uniform(1, 0, 1);
        if ($rand_num <= $cancer_model->{'lambda'}->[0]){
                $mu = $cancer_model->{'mu'}->[0];
                $sigma = $cancer_model->{'sigma'}->[0];
        } else {
                $mu = $cancer_model->{'mu'}->[1];
                $sigma = $cancer_model->{'sigma'}->[1];
        }
        my $log_rate = random_normal(1, $mu, $sigma);
        $rate = 10**$log_rate/1000000;
        return ($rate);
        }
### concat_signature ###############################################################################
# Description:
#               Calcualte the overall trinucleotide signature
# Input Variables:
#               $proportion = the mutations and their proportion makeup of the signature
# Output Variables:
#               $signature = the overall trinucleotide signature to follow
sub concat_signature {
        my ($proportions) = @_;
        my @nucleotides = ('A', 'T', 'C', 'G');
        my $signature = {};
        #generate all trinucleodie combinations
        foreach my $first (@nucleotides) {
                foreach my $middle ('C', 'T') {
                        foreach my $last (@nucleotides) {
                                $signature->{"$first$middle$last"} = {};
                                }
                        }
                }
        #cumulate the effects of all signatures
        foreach my $mut_sig (keys %{$proportions}) {
                foreach my $mutation (keys %{$proportions->{$mut_sig}->{'mutations'}}){
                        $proportions->{'Total'}->{'mutations'}->{$mutation} += $proportions->{$mut_sig}->{'mutations'}->{$mutation} * $proportions->{$mut_sig}->{'proportion'};
                        }
                }
        #find the maximum for normalization
        my $pro_max = 0;
        foreach my $mutation (keys %{$proportions->{'Total'}->{'mutations'}}){
                if ($proportions->{'Total'}->{'mutations'}->{$mutation} > $pro_max){
                        $pro_max = $proportions->{'Total'}->{'mutations'}->{$mutation};
                        }
                }
        #assign to "XXX" -> "ALT" format
        foreach my $mutation (keys %{$proportions->{'Total'}->{'mutations'}}){
                        my $mut_tri = substr($mutation, 0, 1).substr($mutation, 2, 1).substr($mutation, 6, 1);
                        my $alt_base = substr($mutation, 4, 1);
                        $signature->{$mut_tri}->{$alt_base} += $proportions->{'Total'}->{'mutations'}->{$mutation};
                }
        $signature->{'max'} = $pro_max;
        return ($signature);
        }
### sample_base ###################################################################################
# Description:
#               Randomly sampling a base location in the genome, in the covered bases given
# Input Variables:
#               $bases = total number of callable bases
#               $min_loc = minimum location of covered base
#               $max_loc = maximum location of covered base
#               $cb_bed = cb_bed file
#               $seq = the genome sequence
# Output Variables:
#               $base = the randomly sampled location
#               $tri_region = the trinucleotide context of the location
sub sample_base {
        my ($bases, $min_loc, $max_loc, $cb_bed, $seq) = @_;
        my $flag = 0;
        my $base;
        my $tri_region;
        while (!$flag){
                my $position = random_uniform_integer(1, 1, $bases);
                $base = &find_location($position, $cb_bed);
                $tri_region = $seq->subseq($base-1,$base+1);
                $tri_region = uc($tri_region);
                my $ref = substr($tri_region, 1, 1);
                if ($ref =~ m/A|G/) {
                        $tri_region =~ tr/ATCG/TAGC/;
                        $tri_region =~ s/^(.)(.)(.)$/$3$2$1/;
                }
                 
                #skip regions that are not given in the reference genome
                $flag = ($tri_region !~ m/N+/);
                say "WARN: callable trinucleotide region ".$base." ".$tri_region." contains N in reference genome, choosing again" if (!$flag); 
        }
        return ($base, $tri_region);
        }
### find_location #############################################################################
# Description:
#               Find location of base in covered bases
# Input Variables:
#               $position = position of base in covered bases
#               $cb_bed = covered bases bed file
# Output Variables:
#               $base = location of base
sub find_location {
        my ($position, $cb_bed) = @_;
        my $base;
        my $sum = 0;
        open (my $bed_in, '<', $cb_bed) or croak "ERROR: unable to open bed file at : $cb_bed";
        while (<$bed_in>) {
                s/\W+$//;
                my @row = split('\t');
                my $pos = $position + $row[1] - 1;
                if ($pos >= $row[2]){
                        $position = $position - ($row[2] - $row[1]);
                } else {
                        $base = $pos;
                        last;
                        }
                }
        close($bed_in);
        #correct for the fact that bed files are 0 based
        $base = $base + 1;
        return ($base);
        }
### mutation_probability #########################################################################
# Description:
#               Calcualte the probability that the given base would be mutated, based on replication timing
# Input Variables:
#               $base = the location of the base
#               $rep_time = replicating timing data
# Output Variables:
#               $mut_prob = probability of mutation of given base
#               $NA_use = whether the region has no replication timing information
sub mutation_probability {
        my ($base, $rep_time) = @_;
        my $mut_prob;
        my $NA_use = 0;
        my $base_region = ceil($base / 100000) - 1;
        $mut_prob = $rep_time->[$base_region];
        if (!defined $mut_prob) {
                        $NA_use = 1;
                }
        return ($mut_prob, $NA_use);
        }
### read_replication_time #########################################################################
# Description:
#               Read and process file storing replication time of the genome
# Input Variables:
#               $rep_time_file = csv file storing chromosome location and replication time
#               $chromosome = the current chromosome
#               $timing_effect = how much the mutation rate of regions should be affected by replication
# Output Variables:
#               $rep_times = store replciation times of chromosome locations
sub read_replication_time {
        my ($rep_time_file, $chromosome, $timing_effect) = @_;
        my $rep_time = [];
        my @equal_time = (1)x3000;
        $chromosome = 23 if ($chromosome eq 'X');
        if (not $rep_time_file){
                say "WARN: no replication timing effect file given, treating all bases as with same replication time";
                $rep_time = \@equal_time;
                return $rep_time;
        }
        if ($chromosome eq 'Y'){
                say "WARN: no replication timing available for chromosome Y, treating all bases as with same replication time";
                $rep_time = \@equal_time;
                return $rep_time;
        }
        open (my $csv_in, '<', $rep_time_file) or croak "ERROR: unable to open vcf file at : $rep_time_file";
        my $header_line = <$csv_in>;
        $header_line =~ s/\W+$//;
        my @header = split('\t', $header_line);
        my ($index) = grep {$header[$_] eq 'corr' } (0 .. @header-1);
        if (not defined $index){
                say "WARN: replication timing effect file does not have \'corr\' column, treating all bases as same";
                $rep_time = \@equal_time;
                close($csv_in);
                return $rep_time;
        }
        my $min = 2000;
        my $max = -2000;
        #read each line and assign the mutation rate to the chromosome location
        while (<$csv_in>){
                s/\W+$//;
                my $row = [split('\t')];
                next if (${$row}[0] != $chromosome); #skip to current chromosome
                if (${$row}[$index] eq "NA" || ${$row}[$index] eq "NaN"){
                        push (@{$rep_time}, undef);
                } else{ 
                        my $corr = ${$row}[$index];
                        $corr = 10**$corr;
                        push (@{$rep_time}, $corr);
                        $max = $corr if ($corr > $max);
                        $min = $corr if ($corr < $min);
                        }
                }
        close($csv_in);
        my $diff = $max - $min;
        my $scale = ($max - $diff * $timing_effect) / $max;
        #my $scale = 1 - abs(($diff * $timing_effect) / $max);
        #scale the range by timing effect
        foreach my $i (0..scalar(@{$rep_time})-1) {
                next if (!defined $rep_time->[$i]);
                $rep_time->[$i] = ((1 - $scale)*($rep_time->[$i] - $min)) / $diff + $scale;
        }
        return ($rep_time);
        }
### rep_mut_correlation ###########################################################################
# Description:
#               Use correlation to calculate mutation rate of region by replication time
# Input Variables:
#               $replication = replication time of region
#               $$rep_mut_numbers = contains coefficients and values used to calculate mutation rate
# Output Variables:
#               $with_error = mutation of region calculated
sub rep_mut_correlation {
        my ($replication, $rep_mut_numbers) = @_;
        my $slope = $rep_mut_numbers->{'rep_mut_regression'}->{'slope'};
        my $intercept = $rep_mut_numbers->{'rep_mut_regression'}->{'intercept'};
        my $se = $rep_mut_numbers->{'rep_mut_regression'}->{'se'};
        my $rep_mut = $replication*$slope + $intercept;
        #my $with_error = random_normal(1, $rep_mut, $se);
        return ($rep_mut);
        }
### choose_vaf ###############################################################################
# Description:
#               Choose the variant allele fraction according to parameters described
# Input Variables:
#               $vaf_param = the parameters for generating vaf
# Output Variables:
#               $vafs = the vafs generated
sub choose_vaf {
        my ($vaf_param) = @_;
        my $vaf = Math::Random::random_beta(1, $vaf_param->{'vafbeta1'}, $vaf_param->{'vafbeta2'});
        $vaf = $vaf * ($vaf_param->{'maxvaf'}-$vaf_param->{'minvaf'}) + $vaf_param->{'minvaf'};
        return ($vaf);
        }
### calculate_muts #################################################################################
# Description:
#               Calculate the number of picks from the covered bases
# Input Variables:
#               $mut_rate = mutation rate per callable base
#               $cb_bed = bed file containing callable bases
#       $chromosome = the fasta chromosome
# Output Variables:
#               $SNV_num = number of SNVs to spike in
sub calculate_muts {
        my ($mut_rate, $cb_bed, $chromosome) = @_;
        my $bases = 0;
        my $min_loc = 0;
        my $max_loc;
        open (my $bed_in, '<', $cb_bed) or croak "ERROR: unable to open bed file at : $cb_bed";
        my $first_line = <$bed_in>;
        my @first_row = split('\t', $first_line);
        croak "ERROR: primary_id of fasta file not the same as chromosome of bed file given" if ($first_row[0] ne $chromosome);
        #bed file includes position at chromSTART and excludes position at chromEND
        #range calculated by END - START + 1 - 1
        $bases += ($first_row[2] - $first_row[1]);
        $min_loc = $first_row[1];
        my @last_row;
        while (<$bed_in>) {
                s/\W+$//;
                my @row = split('\t');
                croak "ERROR: primary_id of fasta file not the same as chromosome of bed file given" if ($row[0] ne $chromosome);
                 
                $bases += ($row[2] - $row[1]);
                @last_row = @row;
                }
        close($bed_in);
        $max_loc = $last_row[2] - 1;
        print "INFO: Chromosome: $chromosome\tCovered bases: $bases\tMin: $min_loc\tMax: $max_loc\n";
        my $SNV_num = 200;
        if ($bases){
                $SNV_num = $bases * $mut_rate;
                $SNV_num = sprintf("%.0f", $SNV_num);
        } else {
                say ("INFO: cannot obtain readcount information for chromosome $chromosome, picking 200 mutations.");
        }
        return ($SNV_num, $min_loc, $max_loc, $bases);
        }
### check_location ################################################################################
# Description:
#               Check if the location generated is one of the callable bases
# Input Variables:
#               $location = location to spike in mutation
#               $cb_bed = covered bases bed file
# Output Variables:
#               $flag = 0 for location exists, 1 for location not covered
sub check_location {
        my ($location, $cb_bed) = @_;
        # preset flag assuming that location is NOT found
        my $flag = 1;
        #correct for the fact that bed files are 0 based
        my $pos = $location - 1;
        open (my $bed_in, '<', $cb_bed) or croak "ERROR: unable to open bed file at : $cb_bed";
        while (<$bed_in>) {
                s/\W+$//;
                my @row = split('\t');
                #bed file includes position at chromSTART and excludes position at chromEND
                if ($pos >= $row[1] && $pos < $row[2]){
                        $flag = 0;
                        last;
                        }
                }
        close($bed_in);
        return ($flag);
        }
### read_vcf #################################################################################
# Description:
#               Read the vcf file with the required mutations and record those applicable to this chromosome
# Input Variables:
#               $vcf_file = vcf file
#               $chromosome = the current chromosome
# Output Variables:
#               $vcf_mutations = the mutations listed in the vcf file for the current chromosome
sub read_vcf {
        my ($vcf_file, $chromosome) = @_;
        say "INFO: reading vcf file $vcf_file";
         
        open (my $vcf_in, '<', $vcf_file) or croak "ERROR: unable to open vcf file at : $vcf_file";
        my $vcf_mutations = [];
        while (<$vcf_in>) {
                s/\W+$//;
                my @row = split('\t');
                #$row[0] =~ s/chr//;
                if ($row[0] eq $chromosome) {
                        # check that line represents a SNV
                        if (length($row[3]) == 1 && length($row[4]) == 1) {
                                push (@{$vcf_mutations}, [$chromosome, $row[1], $row[4]]);
                                } 
                        }
                }
        close($vcf_in);
        return ($vcf_mutations);
        }
__END__
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)