The Perl Toolchain Summit 2025 Needs You: You can help 🙏 Learn more

#!/usr/bin/env perl
#
# A script that creates a JSON array of simulated BFF/PXF
#
# Note: Check also Monarch Initiative:
#
# Last Modified: Nov/08/2024
#
# $VERSION taken from Pheno::Ranker
#
# Copyright (C) 2023-2024 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
#
# License: Artistic License 2.0
#
# If this program helps you in your research, please cite.
use strict;
use Getopt::Long qw(:config no_ignore_case);
##### Main #####
randomize_ga4gh();
################
exit;
sub randomize_ga4gh {
my $VERSION = '0.09';
my $format = 'bff';
my $number = 100;
my $output = 'individuals.json';
my ( $phenotypicFeatures, $diseases, $treatments, $procedures, $exposures,
$ethnicity )
= (1) x 6;
# Reading arguments
GetOptions(
'format|f=s' => \$format, # string
'number|n=i' => \$number, # string
'output|o=s' => \$output, # string
#
'diseases=i' => \$diseases, # integer
'exposures=i' => \$exposures, # integer
'phenotypicFeatures=i' => \$phenotypicFeatures, # integer
'procedures=i' => \$procedures, # integer
'treatments=i' => \$treatments, # integer
#
'max-diseases-pool=i' => \my $max_diseases_pool, # integer
'max-ethnicity-pool=i' => \my $max_ethnicity_pool, # integer
'max-exposures-pool=i' => \my $max_exposures_pool, # integer
'max-phenotypicFeatures-pool=i' => \my $max_phenotypicFeatures_pool, # integer
'max-treatments-pool=i' => \my $max_treatments_pool, # integer
'max-procedures-pool=i' => \my $max_procedures_pool, # integer
#
'random-seed=i' => \my $random_seed, # integer
'external-ontologies=s' => \my $ext_ontologies, # string
#
'help|?' => \my $help, # flag
'man' => \my $man, # flag
'debug=i' => \my $debug, # integer
'verbose|' => \my $verbose, # flag
'version|V' => sub { print "$0 Version $VERSION\n"; exit; }
) or pod2usage(2);
pod2usage(1) if $help;
pod2usage( -verbose => 2, -exitval => 0 ) if $man;
# Create object
my $randomize = Randomizer->new(
{
format => $format,
number => $number,
output => $output,
diseases => $diseases,
ethnicity => $ethnicity,
exposures => $exposures,
phenotypicFeatures => $phenotypicFeatures,
procedures => $procedures,
treatments => $treatments,
max_diseases_pool => $max_diseases_pool,
max_ethnicity_pool => $max_ethnicity_pool,
max_exposures_pool => $max_exposures_pool,
max_phenotypicFeatures_pool => $max_phenotypicFeatures_pool,
max_procedures_pool => $max_procedures_pool,
max_treatments_pool => $max_treatments_pool,
random_seed => $random_seed,
ext_ontologies => $ext_ontologies,
debug => $debug,
verbose => $verbose
}
);
# Run method
$randomize->run;
}
package Randomizer;
use strict;
use autodie;
use feature qw(say);
#use Data::Printer;
use List::Util 1.50 qw(head shuffle);
use Data::Fake qw(Core Company Dates Names);
use FindBin qw($Bin);
use lib $Bin;
qw($hpo_array $omim_array $rxnorm_array $ncit_procedures_array $ncit_exposures_array $ethnicity_array);
sub new {
my ( $class, $self ) = @_;
bless $self, $class;
return $self;
}
sub run {
my $self = shift;
my $number = $self->{number};
my $format = $self->{format};
my $output = $self->{output};
my $random_seed = $self->{random_seed};
my %func = (
bff => \&bff_generator,
pxf => \&pxf_generator
);
# Set seed if defined
srand($random_seed) if defined $random_seed; # user can set it to 0
# Load external ontologies file if present
$self->{ontologies_data} =
$self->{ext_ontologies}
? validate_json( $self->{ext_ontologies} )
: undef; # setter
#########
# START #
#########
my $json_data;
for ( my $i = 1 ; $i <= $number ; $i++ ) {
push @$json_data, $func{$format}->( $i, $self );
}
#######
# END #
#######
#p $json_data;
# Serialize the data and write
write_json( { filepath => $output, data => $json_data } );
}
sub write_json {
my $arg = shift;
my $file = $arg->{filepath};
my $json_data = $arg->{data};
# Note that canonical DOES not match the order of nsort from Sort::Naturally
my $json = JSON::XS->new->utf8->canonical->pretty->encode($json_data);
path($file)->spew_utf8($json);
return 1;
}
sub pxf_generator {
my ( $id, $self ) = @_;
my $result_hash = run_functions($self);
my $pxf = fake_hash(
{
id => "Phenopacket_" . $id,
subject => {
id => "IndividualId_" . $id,
age => {
iso8601duration =>
fake_template( "P%dY", fake_int_mod( 1, 99 ) )
},
sex => fake_pick_mod( [ 'MALE', 'FEMALE' ] )
},
diseases => $result_hash->{diseases},
phenotypicFeatures => $result_hash->{phenotypicFeatures},
medicalActions => merge_medical_actions($result_hash)
}
);
return $pxf->();
}
sub merge_medical_actions {
my $hash = shift;
# Initialize empty arrays for treatments and procedures
my @processed_treatments;
my @processed_procedures;
# Process treatments if defined
if ( defined $hash->{treatments} ) {
@processed_treatments =
map { { treatment => $_ } } @{ $hash->{treatments} };
}
# Process procedures if defined
if ( defined $hash->{procedures} ) {
@processed_procedures =
map { { procedure => $_ } } @{ $hash->{procedures} };
}
# Merge the processed arrays and return a reference
# NB: If undef no elements will be added
return [ @processed_treatments, @processed_procedures ];
}
sub bff_generator {
my ( $id, $self ) = @_;
my $default_array = [];
my $result_hash = run_functions($self);
my $bff = fake_hash(
{
id => "Beacon_" . $id,
ethnicity => $result_hash->{ethnicity},
sex => fake_pick_mod(
[
{ id => "NCIT:C20197", label => "Male" },
{ id => "NCIT:C16576", label => "Female" }
]
),
diseases => $result_hash->{diseases} // $default_array,
phenotypicFeatures => $result_hash->{phenotypicFeatures}
// $default_array,
treatments => $result_hash->{treatments} // $default_array,
interventionsOrProcedures => $result_hash->{procedures}
// $default_array,
exposures => $result_hash->{exposures} // $default_array
}
);
return $bff->();
}
######################
# START ARRAY TERMS #
######################
sub create_entries {
my ( $params, $ontologies_array, $n, $max ) = @_;
my $shuffled_slice = shuffle_slice( $max, $ontologies_array );
my $array;
for ( my $i = 0 ; $i < $n ; $i++ ) {
push @$array,
$params->{entry_creator}->( $shuffled_slice->[$i], $params );
}
return $array;
}
sub common_entry_creator {
my ( $element, $params ) = @_;
return {
$params->{type} => $element,
$params->{onset} => {
age => {
iso8601duration =>
fake_template( "P%dY", fake_int_mod( 1, 99 ) )
}
}
};
}
sub phenotypicFeatures {
my ( $format, $ontologies_array, $n, $max ) = @_;
my $params = {
type => $format eq 'bff' ? 'featureType' : 'type',
onset => $format eq 'bff' ? 'ageOfOnset' : 'onset',
entry_creator => \&common_entry_creator
};
return create_entries( $params, $ontologies_array, $n, $max );
}
sub diseases {
my ( $format, $ontologies_array, $n, $max ) = @_;
my $params = {
type => $format eq 'bff' ? 'diseaseCode' : 'term',
onset => $format eq 'bff' ? 'ageOfOnset' : 'onset',
entry_creator => \&common_entry_creator
};
return create_entries( $params, $ontologies_array, $n, $max );
}
sub treatments {
my ( $format, $ontologies_array, $n, $max ) = @_;
my $params = {
entry_creator => sub {
my ( $element, $p ) = @_;
return $format eq 'bff'
? { treatmentCode => $element }
: { agent => $element };
}
};
return create_entries( $params, $ontologies_array, $n, $max );
}
sub procedures {
my ( $format, $ontologies_array, $n, $max ) = @_;
my $params = {
type => $format eq 'bff' ? 'procedureCode' : 'term',
onset => $format eq 'bff' ? 'ageAtProcedure' : 'onset',
entry_creator => \&common_entry_creator
};
return create_entries( $params, $ontologies_array, $n, $max );
}
sub exposures {
my ( $format, $ontologies_array, $n, $max ) = @_;
my $default_duration = 'P999Y';
my $default_ontology_term =
{ id => 'NCIT:C126101', label => 'Not Available' };
my $params = {
type => $format eq 'bff' ? 'exposureCode' : 'term',
onset => $format eq 'bff' ? 'ageAtExposure' : 'onset',
entry_creator => \&common_entry_creator
};
my $entries = create_entries( $params, $ontologies_array, $n, $max );
# Add 'duration' 'and 'unit' to pass bff-validator
foreach my $entry (@$entries) {
$entry->{duration} = $default_duration;
$entry->{unit} = $default_ontology_term;
}
return $entries;
}
sub ethnicity {
my ( undef, $ontologies_array, undef, $max ) = @_;
my $shuffled_slice = shuffle_slice( $max, $ontologies_array );
return $shuffled_slice->[0];
}
####################
# END ARRAY TERMS #
####################
sub load_ontology_hash {
my $self = shift;
my %ont = (
diseases => $omim_array,
ethnicity => $ethnicity_array,
exposures => $ncit_exposures_array,
phenotypicFeatures => $hpo_array,
procedures => $ncit_procedures_array,
treatments => $rxnorm_array
);
return \%ont;
}
sub run_functions {
my $self = shift;
my $ontologies = load_ontology_hash($self);
my %func = (
diseases => \&diseases,
ethnicity => \&ethnicity,
exposures => \&exposures,
phenotypicFeatures => \&phenotypicFeatures,
procedures => \&procedures,
treatments => \&treatments
);
my %hash;
# *** IMPORTANT ***
# sort keys (below) is mandatory for reproducibility
for my $key ( sort keys %func ) {
my $ontologies_array =
exists $self->{ontologies_data}{$key}
? $self->{ontologies_data}{$key}
: $ontologies->{$key};
$hash{$key} = $func{$key}->(
$self->{format}, $ontologies_array, $self->{$key},
$self->{ 'max_' . $key . '_pool' }
);
}
return \%hash;
}
sub shuffle_slice {
my ( $max, $array ) = @_;
# head -> 1.50 List::Util (5.26 has 1.4602)
#my @items = sample $count, @values; # 1.54 List::Util
# *** IMPORTANT ***
# If $max was defined by the user then use it, otherwise @$array;
my @slice = defined $max ? head $max, @$array : @$array; # slice of refs
my @shuffled_slice = shuffle @slice;
return wantarray ? @shuffled_slice : \@shuffled_slice;
}
sub fake_int_mod {
# This subroutine was built because fake_int did not respond to srand
my ( $low, $high ) = @_;
my $range = $high - $low;
return int( rand($range) ) + 1;
}
sub fake_pick_mod {
# This subroutine was built because fake_pick did not respond to srand
# NB: The original from Data::Fake worked with array (not with arrayref)
my $array = shift;
return $array->[ int( rand(@$array) ) ];
}
sub validate_json {
my $file = shift;
my $data = read_yaml($file);
my $schema = {
type => "object",
properties => {
diseases => { '$ref' => '#/$defs/array' },
phenotypicFeatures => { '$ref' => '#/$defs/array' },
treatments => { '$ref' => '#/$defs/array' },
procedures => { '$ref' => '#/$defs/array' },
exposures => { '$ref' => '#/$defs/array' },
ethnicity => { '$ref' => '#/$defs/array' }
},
'$defs' => {
array => {
type => "array",
items => { '$ref' => '#/$defs/item' }
},
item => {
type => "object",
required => [ "id", "label" ],
properties => {
id => { type => "string", pattern => qq/^\\w[^:]+:.+\$/ },
label => { type => "string" }
}
}
}
};
# Load at runtime
require JSON::Validator;
# Create object and load schema
my $jv = JSON::Validator->new;
# Load schema in object
$jv->schema($schema);
# Validate data
my @errors = $jv->validate($data);
# Show error if any
say_errors( \@errors ) and die if @errors;
# return data if ok
return $data;
}
sub say_errors {
my $errors = shift;
if ( @{$errors} ) {
say join "\n", @{$errors};
}
return 1;
}
sub read_yaml {
# Load at runtime
require YAML::XS;
YAML::XS->import('LoadFile');
return LoadFile(shift); # Decode to Perl data structure
}
1;
=head1 NAME
bff-pxf-simulator: A script that creates a JSON array of simulated BFF/PXF
=head1 SYNOPSIS
bff-pxf-simulator [-options]
Options:
-f, --format <format> Format [bff|pxf]
-n, --number <number> Set the number of individuals to generate [100]
-o, --output <file> Output file [individuals.json]
--external-ontologies <file> Path to a YAML file containing ontology terms
--random-seed <seed> Initializes pseudorandom number sequences (seed must be an integer)
--diseases <number> Set the number of diseases per individual [1]
--exposures <number> Set the number of exposures per individual [1]
--phenotypicFeatures <number> Set the number of phenotypic features per individual [1]
--procedures <number> Set the number of procedures per individual [1]
--treatments <number> Set the number of treatments per individual [1]
--max-[term]-pool <size> Limit the selection to the first N elements of the term array
--max-ethnicity-pool <size> Restrict the ethnicity pool size; each individual will have only one ethnicity
Generic Options:
-debug <level> Print debugging (from 1 to 5, being 5 max)
-h, --help Brief help message
-man Full documentation
-v, --verbose Verbosity on
-V, --version Print version
=head1 DESCRIPTION
This script generates a JSON array of simulated BFF/PXF data. The files can be created based on pre-loaded ontologies or by utilizing an external YAML file.
=head1 SUMMARY
A script that creates a JSON array of simulated BFF/PXF.
Implemented array terms:
B<BFF:> C<diseases, exposures, interventionsOrProcedures, phenotypicFeatures, treatments>.
procedures = interventionsOrProcedures
B<PXF:> C<interventionsOrProcedures, medicalActions.procedure, medicalActions.treatment, phenotypicFeatures>.
procedures = medicalActions.procedure
treatments = medicalActions.treatment
=head1 INSTALLATION
(only needed if you did not install C<Pheno-Ranker>)
$ cpanm --installdeps .
=head3 System requirements
* Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOs, OpenSuse) should do as well.
* Perl 5 (>= 5.10 core; installed by default in most Linux distributions). Check the version with "perl -v"
* 1GB of RAM.
* 1 core (it only uses one core per job).
* At least 1GB HDD.
=head1 HOW TO RUN BFF-PXF-SIMULATOR
When run without any arguments, the software will use default settings. To modify any parameters, please refer to the synopsis for guidance.
If you prefer not to include a specific term in the analysis, set its value to zero. For example:
C<--treatments 0>
B<Examples:>
$ ./bff-pxf-simulator -f pxf # BFF with 100 samples
$ ./bff-pxf-simulator -f pxf -n 1000 -o pxf.json # PXF with 1K samples and saved to pxf.json
$ ./bff-pxf-simulator -phenotypicFeatures 10 # BFF with 100 samples and 10 pF each
$ ./bff-pxf-simulator -diseases 0 -exposures 0 -procedures 0 -phenotypicFeatures 0 -treatments 0 # Only sex and ethnicity
=head2 COMMON ERRORS AND SOLUTIONS
* Error message: Foo
Solution: Bar
* Error message: Foo
Solution: Bar
=head1 AUTHOR
Written by Manuel Rueda, PhD. Info about CNAG can be found at L<https://www.cnag.eu>.
=head1 COPYRIGHT AND LICENSE
This PERL file is copyrighted. See the LICENSE file included in this distribution.
=cut