our @EXPORT_OK = qw($VERSION write_json);

# Colorize warnings and fatal errors
$SIG{__WARN__} = sub { warn BOLD YELLOW "Warn: ", @_ };
$SIG{__DIE__}  = sub { die BOLD RED "Error: ", @_ };

$Data::Dumper::Sortkeys = 1;

our $VERSION = '1.04';

# Location of the distribution's shared files (config, HPO db, ...)
our $share_dir = dist_dir('Pheno-Ranker');

# Package-level settings loaded from the configuration file
my (
    $config_sort_by,                 $config_similarity_metric_cohort,
    $config_max_out,                 $config_max_number_vars,
    $config_max_matrix_items_in_ram, @config_allowed_terms
);

my $default_config_file = catfile( $share_dir, 'conf', 'config.yaml' );
has 'config_file' => (
    is  => 'ro',
    isa => sub {
        die "Config file '$_[0]' is not a valid file" unless -e $_[0];
    },
    default => $default_config_file,
    coerce  => sub { $_[0] // $default_config_file },
    trigger => sub {
        my ( $self, $config_file ) = @_;
        my $config = read_yaml($config_file);
        $self->_set_basic_config($config);
        $self->_validate_and_set_exclusive_config( $config, $config_file );
        $self->_set_additional_config( $config, $config_file );
        lock_hash(%$config);    # Make the loaded configuration read-only
    }
);
sub _set_basic_config {
    my ( $self, $config ) = @_;

    $config_sort_by                  = $config->{sort_by}                  // 'hamming';
    $config_similarity_metric_cohort = $config->{similarity_metric_cohort} // 'hamming';
    $config_max_out                  = $config->{max_out}                  // 50;
    $config_max_number_vars          = $config->{max_number_vars}          // 10_000;
    $config_max_matrix_items_in_ram  = $config->{max_matrix_items_in_ram}  // 5_000;
}
sub _validate_and_set_exclusive_config {
    my ( $self, $config, $config_file ) = @_;

    unless ( exists $config->{allowed_terms}
        && ArrayRef->check( $config->{allowed_terms} )
        && @{ $config->{allowed_terms} } )
    {
        die "No <allowed_terms> provided or not an array ref at $config_file\n";
    }
    @config_allowed_terms = @{ $config->{allowed_terms} };
}
sub _set_additional_config {
    my ( $self, $config, $config_file ) = @_;

    $self->{primary_key}             = $config->{primary_key}             // 'id';
    $self->{exclude_variables_regex} = $config->{exclude_variables_regex} // undef;
    $self->{exclude_variables_regex_qr} =
      defined $self->{exclude_variables_regex}
      ? qr/$self->{exclude_variables_regex}/
      : undef;
    $self->{array_terms}    = $config->{array_terms} // ['foo'];
    $self->{array_regex}    = $config->{array_regex} // '^([^:]+):(\d+)';
    $self->{array_regex_qr} = qr/$self->{array_regex}/;

    # Precompile a regex matching any configured array term, e.g. ^(term1|term2):
    $self->{array_terms_regex_str} =
      '^(' . join( '|', map { "\Q$_\E" } @{ $self->{array_terms} } ) . '):';
    $self->{array_terms_regex_qr} = qr/$self->{array_terms_regex_str}/;

    $self->{format} = $config->{format};
    $self->{seed} =
      ( defined $config->{seed} && Int->check( $config->{seed} ) )
      ? $config->{seed}
      : 123456789;

    # User-defined (non-default) array_terms require an <id_correspondence> mapping
    if ( $self->{array_terms}[0] ne 'foo' ) {
        unless ( exists $config->{id_correspondence}
            && HashRef->check( $config->{id_correspondence} ) )
        {
            die "No <id_correspondence> provided or not a hash ref at $config_file\n";
        }
        $self->{id_correspondence} = $config->{id_correspondence};

        if ( exists $config->{format} && Str->check( $config->{format} ) ) {
            die "<$config->{format}> does not match any key from <id_correspondence>\n"
              unless exists $config->{id_correspondence}{ $config->{format} };
        }
    }
}
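
# The keys read above imply a config.yaml shaped roughly like the sketch below.
# This is an illustrative, hand-written example, not the shipped conf/config.yaml;
# the term names and the id_correspondence payload are assumptions, while the
# numeric values mirror the in-code defaults.
#
#   primary_key: id
#   format: PXF
#   allowed_terms: [diseases, phenotypicFeatures]   # hypothetical term names
#   sort_by: hamming
#   similarity_metric_cohort: hamming
#   max_out: 50
#   max_number_vars: 10000
#   max_matrix_items_in_ram: 5000
#   array_terms: [phenotypicFeatures]               # non-default terms require id_correspondence
#   array_regex: '^([^:]+):(\d+)'
#   id_correspondence:
#     PXF: ...   # only the presence of a key matching <format> is checked here
#   seed: 123456789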
has sort_by => (
    default => $config_sort_by,
    is      => 'ro',
    coerce  => sub { $_[0] // $config_sort_by },
    lazy    => 1,
    isa     => Enum [qw(hamming jaccard)]
);

has similarity_metric_cohort => (
    default => $config_similarity_metric_cohort,
    is      => 'ro',
    coerce  => sub { $_[0] // $config_similarity_metric_cohort },
    lazy    => 1,
    isa     => Enum [qw(hamming jaccard)]
);

has max_out => (
    default => $config_max_out,
    is      => 'ro',
    coerce  => sub { $_[0] // $config_max_out },
    lazy    => 1,
    isa     => Int
);

has max_number_vars => (
    default => $config_max_number_vars,
    is      => 'ro',
    coerce  => sub { $_[0] // $config_max_number_vars },
    lazy    => 1,
    isa     => Int
);

has max_matrix_items_in_ram => (
    default => $config_max_matrix_items_in_ram,
    is      => 'ro',
    coerce  => sub { $_[0] // $config_max_matrix_items_in_ram },
    lazy    => 1,
    isa     => Int
);
has hpo_file => (
    default => catfile( $share_dir, 'db', 'hp.json' ),
    coerce  => sub { $_[0] // catfile( $share_dir, 'db', 'hp.json' ) },
    is      => 'ro',
    isa     => sub { die "Error <$_[0]> is not a valid file" unless -e $_[0] },
);

has poi_out_dir => (
    default => catdir('./'),
    coerce  => sub { $_[0] // catdir('./') },
    is      => 'ro',
    isa     => sub { die "<$_[0]> dir does not exist" unless -d $_[0] },
);
has [qw/include_terms exclude_terms/] => (
    is   => 'ro',
    lazy => 1,
    isa  => sub {
        my $value = shift;
        die "<--include_terms> and <--exclude_terms> must be an array ref\n"
          unless ref $value eq 'ARRAY';
        foreach my $term (@$value) {
            die "Invalid term '$term' in <--include_terms> or <--exclude_terms>. Allowed values are: "
              . join( ', ', @config_allowed_terms ) . "\n"
              unless grep { $_ eq $term } @config_allowed_terms;
        }
    },
    default => sub { [] },
);

has cli => (
    is      => 'ro',
    isa     => Bool,
    default => 0,
    coerce  => sub { $_[0] // 0 },
);

has [
    qw/target_file weights_file out_file include_hpo_ascendants
      retain_excluded_phenotypicFeatures align align_basename export export_basename
      log verbose age cytoscape_json graph_stats/
] => ( is => 'ro' );

has [qw/append_prefixes reference_files patients_of_interest/] => (
    default => sub { [] },
    is      => 'ro'
);

has [qw/glob_hash_file ref_hash_file ref_binary_hash_file/] => ( is => 'ro' );
sub BUILD {
    my $self = shift;

    if ( @{ $self->{append_prefixes} } ) {
        die "<--append_prefixes> requires at least 2 cohort files!\n"
          unless @{ $self->{reference_files} } > 1;
        die "The number of items in <--r> and <--append-prefixes> must match!\n"
          unless @{ $self->{reference_files} } == @{ $self->{append_prefixes} };
    }

    if ( @{ $self->{patients_of_interest} } ) {
        die "<--patients-of-interest> must be used with <--r>\n"
          unless @{ $self->{reference_files} };
    }
}
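
# A constructor call that satisfies BUILD's constraints might look like the sketch
# below (the class name Pheno::Ranker and the file names are assumptions, not taken
# from this file):
#
#   my $rk = Pheno::Ranker->new(
#       reference_files => [ 'cohort_A.json', 'cohort_B.json' ],  # >= 2 files when prefixing
#       append_prefixes => [ 'A', 'B' ],                          # one prefix per cohort file
#   );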
sub run {
    my $self = shift;

    # Unpack the attributes used throughout the run
    my $reference_files          = $self->{reference_files};
    my $target_file              = $self->{target_file};
    my $weights_file             = $self->{weights_file};
    my $export                   = $self->{export};
    my $export_basename          = $self->{export_basename};
    my $include_hpo_ascendants   = $self->{include_hpo_ascendants};
    my $hpo_file                 = $self->{hpo_file};
    my $align                    = $self->{align};
    my $align_basename           = $self->{align_basename};
    my $out_file                 = $self->{out_file};
    my $cytoscape_json           = $self->{cytoscape_json};
    my $graph_stats              = $self->{graph_stats};
    my $append_prefixes          = $self->{append_prefixes};
    my $primary_key              = $self->{primary_key};
    my $poi                      = $self->{patients_of_interest};
    my $poi_out_dir              = $self->{poi_out_dir};
    my $cli                      = $self->{cli};
    my $similarity_metric_cohort = $self->{similarity_metric_cohort};
    my $weight                   = undef;

    # Output directories for --align and --export must exist beforehand
    my $align_dir = defined $align ? dirname($align) : '.';
    die "Directory <$align_dir> does not exist (used with --align)\n"
      unless -d $align_dir;

    my $export_dir = defined $export ? dirname($export) : '.';
    die "Directory <$export_dir> does not exist (used with --export)\n"
      unless -d $export_dir;

    # Precomputed mode: all three serialized hashes must be supplied together
    my $has_precomputed =
         defined $self->{glob_hash_file}
      && defined $self->{ref_hash_file}
      && defined $self->{ref_binary_hash_file};

    my ( $glob_hash, $ref_hash, $ref_binary_hash, $hash2serialize );

    if ($has_precomputed) {
        say "Using precomputed data" if $self->{verbose};
        $glob_hash       = read_json( $self->{glob_hash_file} );
        $ref_hash        = read_json( $self->{ref_hash_file} );
        $ref_binary_hash = read_json( $self->{ref_binary_hash_file} );
        $self->add_attribute( 'format', 'PXF' );
        $hash2serialize = {
            glob_hash       => $glob_hash,
            ref_hash        => $ref_hash,
            ref_binary_hash => $ref_binary_hash,
        };
    }
    else {
        my $ref_data =
          $self->_load_reference_cohort_data( $reference_files, $primary_key,
            $append_prefixes );
        $weight = validate_json($weights_file);

        if ($include_hpo_ascendants) {
            my ( $nodes, $edges ) = parse_hpo_json( read_json($hpo_file) );
            $self->{nodes} = $nodes;
            $self->{edges} = $edges;
        }

        my (
            $coverage_stats,    $glob_hash_computed,
            $ref_hash_computed, $ref_binary_hash_computed,
            $hash2serialize_computed
          )
          = $self->_compute_cohort_metrics( $ref_data, $weight, $primary_key,
            $target_file );
        $glob_hash       = $glob_hash_computed;
        $ref_hash        = $ref_hash_computed;
        $ref_binary_hash = $ref_binary_hash_computed;
        $hash2serialize  = $hash2serialize_computed;
    }

    # Cohort (all-vs-all) mode unless a target (patient) file was given
    cohort_comparison( $ref_binary_hash, $self ) unless $target_file;

    my $graph =
      $self->_perform_graph_calculations( $out_file, $cytoscape_json,
        $graph_stats, $similarity_metric_cohort );

    # Patient (one-vs-all) mode
    if ($target_file) {
        $self->_process_patient_data(
            {
                target_file     => $target_file,
                primary_key     => $primary_key,
                weight          => $weight,
                glob_hash       => $glob_hash,
                ref_hash        => $ref_hash,
                ref_binary_hash => $ref_binary_hash,
                align           => $align,
                align_basename  => $align_basename,
                out_file        => $out_file,
                cli             => $cli,
                verbose         => $self->{verbose},
            },
            \$hash2serialize
        );
    }

    if ( defined $export ) {
        serialize_hashes(
            {
                data            => $hash2serialize,
                export_basename => $export ? $export : $export_basename
            }
        );
    }
    return 1;
}
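
# Usage sketch for run(): cohort mode (all-vs-all) when no target_file is given,
# patient mode (one-vs-all) when it is. The class name and file names below are
# illustrative assumptions:
#
#   # Cohort mode: pairwise matrix written to out_file
#   Pheno::Ranker->new( reference_files => ['cohort.json'], out_file => 'matrix.txt' )->run;
#
#   # Patient mode: ranks the individual in patient.json against the cohort
#   Pheno::Ranker->new(
#       reference_files => ['cohort.json'],
#       target_file     => 'patient.json',
#       out_file        => 'rank.txt',
#   )->run;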
sub _load_reference_cohort_data {
    my ( $self, $reference_files, $primary_key, $append_prefixes ) = @_;

    my $ref_data = [];
    for my $cohort_file ( @{$reference_files} ) {
        die "<$cohort_file> does not exist\n" unless -f $cohort_file;
        my $json_data =
          io_yaml_or_json( { filepath => $cohort_file, mode => 'read' } );

        # Every cohort (array of objects or a single object) must carry the primary key
        my $msg =
          "Sorry, <$cohort_file> does not contain primary_key <$primary_key>. Are you using the right configuration file?\n";
        if ( ref $json_data eq ref [] ) {
            die $msg unless exists $json_data->[0]->{$primary_key};
        }
        else {
            die $msg unless exists $json_data->{$primary_key};
        }
        push @$ref_data, $json_data;
    }

    $ref_data = append_and_rename_primary_key(
        {
            ref_data        => $ref_data,
            append_prefixes => $append_prefixes,
            primary_key     => $primary_key
        }
    );
    return $ref_data;
}
sub _compute_cohort_metrics {
    my ( $self, $ref_data, $weight, $primary_key, $target_file ) = @_;

    my $coverage_stats = coverage_stats($ref_data);
    die "--include-terms <@{$self->{include_terms}}> does not exist in the cohort(s)\n"
      unless check_existence_of_include_terms( $coverage_stats,
        $self->{include_terms} );

    # Auto-detect the input format unless it was set via the configuration file
    $self->add_attribute( 'format', check_format($ref_data) )
      unless defined $self->{format};
    restructure_pxf_interpretations( $ref_data, $self );

    my ( $glob_hash, $ref_hash ) =
      create_glob_and_ref_hashes( $ref_data, $weight, $self );

    # Cap the number of variables to keep the matrix tractable
    if ( keys %$glob_hash > $self->{max_number_vars} ) {
        $glob_hash = randomize_variables( $glob_hash, $self );
    }

    my $ref_binary_hash = create_binary_digit_string( $glob_hash, $ref_hash );
    my $hash2serialize  = {
        glob_hash       => $glob_hash,
        ref_hash        => $ref_hash,
        ref_binary_hash => $ref_binary_hash,
        coverage_stats  => $coverage_stats
    };

    return ( $coverage_stats, $glob_hash, $ref_hash, $ref_binary_hash,
        $hash2serialize );
}
sub _process_patient_data {
    my ( $self, $params, $hash2serialize_ref ) = @_;

    my $target_file     = $params->{target_file};
    my $primary_key     = $params->{primary_key};
    my $weight          = $params->{weight};
    my $glob_hash       = $params->{glob_hash};
    my $ref_hash        = $params->{ref_hash};
    my $ref_binary_hash = $params->{ref_binary_hash};
    my $align           = $params->{align};
    my $align_basename  = $params->{align_basename};
    my $out_file        = $params->{out_file};
    my $cli             = $params->{cli};
    my $verbose         = $params->{verbose};

    my $tar_data = array2object(
        io_yaml_or_json( { filepath => $target_file, mode => 'read' } ) );
    die "Sorry, <$target_file> does not contain primary_key <$primary_key>. Are you using the right config file?\n"
      unless exists $tar_data->{$primary_key};

    restructure_pxf_interpretations( $tar_data, $self );

    my $tar_data_id = $tar_data->{$primary_key};
    my $tar_hash    = {
        $tar_data_id => remap_hash(
            {
                hash   => $tar_data,
                weight => $weight,
                self   => $self
            }
        )
    };
    my $tar_binary_hash = create_binary_digit_string( $glob_hash, $tar_hash );

    my (
        $results_rank,        $results_align, $alignment_ascii,
        $alignment_dataframe, $alignment_csv
      )
      = compare_and_rank(
        {
            glob_hash       => $glob_hash,
            ref_hash        => $ref_hash,
            tar_hash        => $tar_hash,
            ref_binary_hash => $ref_binary_hash,
            tar_binary_hash => $tar_binary_hash,
            weight          => $weight,
            self            => $self
        }
      );

    # Print the ranked results to STDOUT in CLI mode and always write them to file
    say join "\n", @$results_rank if $cli;
    write_array2txt( { filepath => $out_file, data => $results_rank } );

    if ( defined $align ) {
        write_alignment(
            {
                align     => $align ? $align : $align_basename,
                ascii     => $alignment_ascii,
                dataframe => $alignment_dataframe,
                csv       => $alignment_csv
            }
        );
    }

    # Add the target data to the hashes that may be serialized with --export
    $$hash2serialize_ref->{tar_hash}        = $tar_hash;
    $$hash2serialize_ref->{tar_binary_hash} = $tar_binary_hash;
    $$hash2serialize_ref->{alignment_hash}  = $results_align if defined $align;

    return 1;
}
sub _perform_graph_calculations {
    my ( $self, $out_file, $cytoscape_json, $graph_stats,
        $similarity_metric_cohort )
      = @_;

    my $graph;
    if ($cytoscape_json) {
        $graph = matrix2graph(
            {
                matrix      => $out_file,
                json        => $cytoscape_json,
                graph_stats => 1,
                verbose     => $self->{verbose},
            }
        );
    }
    if ( defined $graph_stats ) {
        cytoscape2graph(
            {
                graph   => $graph,
                output  => $graph_stats,
                metric  => $similarity_metric_cohort,
                verbose => $self->{verbose},
            }
        );
    }
    return $graph;
}
sub add_attribute {
    my ( $self, $name, $value ) = @_;
    $self->{$name} = $value;
    return 1;
}

1;