bin/pheno-ranker - metacpan.org


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
—
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
              #!/usr/bin/env perl
#
#   A script that performs semantic similarity in PXF|BFF data structures
#
#   Last Modified: Aug/09/2024
#
#   $VERSION taken from Pheno::Ranker
#
#   Copyright (C) 2023-2025 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
#
#   License: Artistic License 2.0
#
#   If this program helps you in your research, please cite.
package main;
use strict;
use warnings;
use autodie;
use feature qw(say);
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;
use Data::Dumper;
use Sys::Hostname;
use POSIX qw(strftime);
use Term::ANSIColor qw(:constants);
use File::ShareDir::ProjectDistDir qw(dist_dir);
use FindBin qw($Bin);
use lib "$Bin/../lib";
use Pheno::Ranker qw($VERSION write_json);
# Defining a few variables
# Default output names; the one used depends on whether a target file is given
my $out_file_cohort      = 'matrix.txt';
my $out_file_patient     = 'rank.txt';
# Fallback file names for <--cytoscape-json> and <--graph-stats> when given without a value
my $out_file_graph       = 'graph.json';
my $out_file_graph_stats = 'graph_stats.txt';
# Basenames forwarded as-is to Pheno::Ranker (export_basename / align_basename)
my $export_basename      = 'export';
my $align_basename       = 'alignment';
# Log file name used when <--log> is given without a value
my $log_file             = 'pheno-ranker-log.json';
# Negatable flags (<--color|--no-color>, <--age|--no-age>) start from these values
my $color                = 1;
my $age                  = 0;
# Forwarded to Pheno::Ranker as 'cli'
my $cli                  = 1;
# Reading arguments
# NB: Option names are case-sensitive (no_ignore_case), so <-v> and <-V> are
#     different options. <-v> must be declared explicitly as an alias of
#     <--verbose>: otherwise it is only an abbreviation shared by <--verbose>
#     and <--version> and Getopt::Long rejects it as ambiguous.
GetOptions(
    'reference|r=s{1,}'              => \my @reference_files,             # array
    'target|t=s'                     => \my $target_file,                 # string
    'weights|w=s'                    => \my $weights_file,                # string
    'append-prefixes=s{1,}'          => \my @append_prefixes,             # array
    'out-file|o=s'                   => \my $out_file_arg,                # string
    'max-out:i'                      => \my $max_out,                     # integer
    'max-number-var:i'               => \my $max_number_var,              # integer
    'include-hpo-ascendants'         => \my $include_hpo_ascendants,      # flag
    'export|e:s'                     => \my $export,                      # opt-string (defined)
    'align|a:s'                      => \my $align,                       # opt-string (defined)
    'cytoscape-json:s'               => \my $cytoscape_json,              # opt-string (defined)
    'graph-stats:s'                  => \my $graph_stats,                 # opt-string (defined)
    'sort-by=s'                      => \my $sort_by,                     # string
    'similarity-metric-cohort=s'     => \my $similarity_metric_cohort,    # string
    'patients-of-interest|poi=s{1,}' => \my @patients_of_interest,        # array
    'poi-out-dir=s'                  => \my $poi_out_dir,                 # string
    'include-terms=s{1,11}'          => \my @include_terms,               # array
    'exclude-terms=s{1,11}'          => \my @exclude_terms,               # array
    'config=s'                       => \my $config_file,                 # string
    'age!'                           => \$age,                            # flag
    'help|?'                         => \my $help,                        # flag
    'log:s'                          => \my $log,                         # opt-string (defined)
    'man'                            => \my $man,                         # flag
    'debug=i'                        => \my $debug,                       # integer
    'verbose|v'                      => \my $verbose,                     # flag
    'color!'                         => \$color,                          # flag
    'version|V'                      => sub { say "$0 Version $VERSION"; exit; }
) or pod2usage(2);
# Help / manual requests take precedence over any other validation
pod2usage(1)                              if $help;
pod2usage( -verbose => 2, -exitval => 0 ) if $man;

# At least one reference cohort is mandatory
pod2usage(
    -message => "Please specify a reference-cohort(s) with <--r>\n",
    -exitval => 1
) unless @reference_files;

# <--graph-stats> is meaningless without <--cytoscape-json>
pod2usage(
    -message =>
      "<--graph-stats> only works in conjunction with <--cytoscape-json>\n",
    -exitval => 1
) if ( defined $graph_stats && !defined $cytoscape_json );

# Fail early when the weights file is not a regular file
pod2usage(
    -message =>
      "Weights file <$weights_file> does not exist\n",
    -exitval => 1
) if ( defined $weights_file && !-f $weights_file );
# Output file: an explicit <--out-file> wins; otherwise the default depends on
# whether a target file was given (patient mode) or not (cohort mode)
my $out_file = $out_file_arg;
$out_file //= $target_file ? $out_file_patient : $out_file_cohort;

# <--cytoscape-json> and <--graph-stats> share the same logic: they are
# rejected when a target file is present and fall back to a default file name
handle_option( @{$_} ) for (
    [
        \$cytoscape_json, "<--cytoscape-json> only works in cohort-mode",
        $target_file,     $out_file_graph
    ],
    [
        \$graph_stats, "<--graph-stats> only works in cohort-mode",
        $target_file,  $out_file_graph_stats
    ],
);

# <--no-color> switches ANSI colors off via the environment
if ( !$color ) {
    $ENV{'ANSI_COLORS_DISABLED'} = 1;
}

# Banner goes to STDOUT only in verbose mode
if ($verbose) {
    say BOLD CYAN program_header($VERSION), RESET;
}
######################
# START PHENO-RANKER #
######################
# Collect every setting in a single hashref; it is handed to the
# Pheno::Ranker constructor and later stored in the log file
my $data = {

    # Input files
    reference_files          => \@reference_files,
    target_file              => $target_file,
    weights_file             => $weights_file,
    include_hpo_ascendants   => $include_hpo_ascendants,
    hpo_file                 => undef,

    # Optional outputs
    align                    => $align,
    align_basename           => $align_basename,
    export                   => $export,
    export_basename          => $export_basename,
    out_file                 => $out_file,
    cytoscape_json           => $cytoscape_json,
    graph_stats              => $graph_stats,

    # Comparison settings
    max_out                  => $max_out,
    max_number_var           => $max_number_var,
    sort_by                  => $sort_by,
    similarity_metric_cohort => $similarity_metric_cohort,
    patients_of_interest     => \@patients_of_interest,
    poi_out_dir              => $poi_out_dir,
    include_terms            => \@include_terms,
    exclude_terms            => \@exclude_terms,
    config_file              => $config_file,
    age                      => $age,                        # Solution, use ageRange in PXF/BFF, measures' values more difficult

    # Generic settings
    cli                      => $cli,
    append_prefixes          => \@append_prefixes,
    log                      => $log,
    debug                    => $debug,
    verbose                  => $verbose
};

# Instantiate the ranker and launch the analysis
my $ranker = Pheno::Ranker->new($data);
$ranker->run();

# <--log> without a value yields an empty string: use the default name then
if ( defined $log ) {
    my $log_path = $log || $log_file;
    write_log( $log_path, $data, $VERSION );
}
####################
# END PHENO-RANKER #
####################
# Validate and normalise an optional-value, cohort-mode-only option.
#
#   $option_ref  : reference to the option value (modified in place)
#   $message     : usage message shown when the option is not allowed
#   $target_file : target file; when true the option is rejected
#   $default     : value assigned when the option was given without a value
#
# Nothing happens when the option was not given at all (undefined value).
sub handle_option {
    my ( $option_ref, $message, $target_file, $default ) = @_;
    return unless defined $$option_ref;

    # The option is not compatible with a target file
    pod2usage( -message => $message, -exitval => 1 ) if $target_file;

    # Only an empty string means "no value given". Testing plain truthiness
    # would also discard a legitimate value of "0".
    $$option_ref = $default unless length $$option_ref;
    return;
}
# Write a JSON log file containing run information plus the settings used.
#
#   $log     : path of the log file to write
#   $data    : hashref with the settings (stored under the key 'data')
#   $VERSION : version string (stored under info.version)
#
# The file is written through write_json.
sub write_log {
    my ( $log, $data, $VERSION ) = @_;

    # NB: Darwin does not have nproc to show #logical-cores, using sysctl instead
    my $os = $^O;
    my $ncpu_output =
        lc($os) eq 'darwin' ? qx{/usr/sbin/sysctl -n hw.logicalcpu}
      : $os eq 'MSWin32'    ? qx{wmic cpu get NumberOfLogicalProcessors}
      :                       qx{/usr/bin/nproc};

    # Take the first integer found in the output. This strips the trailing
    # newline and, for the Windows command, the "NumberOfLogicalProcessors"
    # header. If the command is missing or prints nothing usable we fall back
    # to 1 instead of warning about a non-numeric/undefined value.
    my ($ncpuhost) = ( $ncpu_output // '' ) =~ /(\d+)/;
    $ncpuhost = 0 + ( $ncpuhost // 1 );    # coercing it to be a number

    my $info = {
        date     => ( strftime "%a %b %e %H:%M:%S %Y", localtime ),
        ncpuhost => $ncpuhost,
        hostname => hostname,
        id       => time . substr( "00000$$", -5 ),    # string
        version  => $VERSION,
        user     => $ENV{'LOGNAME'}
          || $ENV{'USER'}
          || $ENV{'USERNAME'}
          || 'dummy-user'
    };

    # Saving file
    say BOLD GREEN "Writing <$log> file\n" if $data->{verbose};
    write_json(
        {
            filepath => $log,
            data     => { info => $info, data => $data }
        }
    );
}
# Build the banner printed at start-up.
#
#   $VERSION : version string shown inside the banner
#
# Returns the banner as a multi-line string (each line ends with a newline).
sub program_header {
    my $VERSION = shift;

    # Pad the version to a fixed-width field so that the right border stays
    # aligned whatever the length of the version string (up to 19 characters)
    my $version_field = sprintf '%-19s', $VERSION;
    my $str           = <<EOF;
****************************************
*   Rank against cohort(s) (BFF/PXF)   *
*          - PHENO-RANKER -            *
*          Version: $version_field*
*   (C) 2023-2025 Manuel Rueda, PhD    *
*       The Artistic License 2.0       *
****************************************
EOF
    return $str;
}
=head1 NAME
pheno-ranker: A script that performs semantic similarity in PXF/BFF data structures and beyond (JSON|YAML)
=head1 SYNOPSIS
 pheno-ranker -r <individuals.json> -t <patient.json> [-options]
   Arguments:
     * Cohort mode:
       -r, --reference <file>         BFF/PXF file(s) in JSON or YAML format (array or object)
     * Patient mode:
       -t, --target <file>            BFF/PXF file in JSON or YAML format (object or array of 1 object)
   Options:
     -age                             Include age-related variables; excludes agent-like terms (BFF/PXF-only) [>no-age|age]
     -a, --align [path/basename]      Write alignment file(s). If not specified, default filenames are used [default: alignment.*]
     -append-prefixes <prefixes>      Prefixes for primary_key when #cohorts >= 2 [default: C]
     -config <file>                   YAML config file to modify default parameters [default: share/conf/config.yaml]
     -cytoscape-json [file]           Serializes the pairwise comparison matrix as an undirected graph in JSON, compatible with Cytoscape [default: graph.json]
     -e, --export [path/basename]     Export miscellaneous JSON files. If not specified, default filenames are used [default: export.*]
     -exclude-terms <terms>           Exclude BFF/PXF terms (e.g., --exclude-terms sex, id) or column names in JSON-derived from CSV 
     -graph-stats [file]              Generates a text file with key graph metrics, for use with <-cytoscape-json> [default: graph_stats.txt]
     -include-hpo-ascendants          Include ascendant terms from the Human Phenotype Ontology (HPO)
     -include-terms <terms>           Include BFF/PXF terms (e.g., --include-terms diseases) or column names in JSON-derived from CSV
     -max-number-var <number>         Maximum variables for binary string [default: 10000]
     -max-out <number>                Print only N comparisons [default: 50]
     -o, --out-file <file>            Output file path [default: -r matrix.txt | -t rank.txt]
     -poi, --patients-of-interest <id_list>   Export JSON files for the selected individual IDs during a dry-run
     -poi-out-dir <directory>         Directory for JSON files (used with --poi)
     -similarity-metric-cohort <metric>  Similarity metric for cohort mode [>hamming|jaccard]
     -sort-by <metric>                Sort by Hamming distance or Jaccard index [>hamming|jaccard]
     -w, --weights <file>             YAML file with weights
   Generic Options:
     -debug <level>                   Print debugging (from 1 to 5, being 5 max)
     -h, --help                       Brief help message
     -log [file]                      Save log file [default: pheno-ranker-log.json]
     -man                             Full documentation
     -no-color                        Toggle color output [>color|no-color]
     -v, --verbose                    Verbosity on
     -V, --version                    Print version
=head1 SUMMARY
Pheno-Ranker is a lightweight, easy-to-install tool for performing semantic similarity analysis on phenotypic data in JSON/YAML formats, including Beacon v2 Models and Phenopackets v2. It also supports pre-processed CSV files prepared using the included C<csv2pheno-ranker> utility.
=head1 INSTALLATION
If you plan to only use C<pheno-ranker> CLI, we recommend installing it via CPAN. See details below.
=head2 Non containerized
The script runs on command-line Linux and it has been tested on Debian/RedHat/macOS based distributions (only showing commands for Debian). Perl 5 is installed by default on Linux, 
but we will install a few CPAN modules with C<cpanminus>.
=head3 Method 1: From CPAN
First install system level dependencies:
  sudo apt-get install cpanminus libperl-dev
Now you have to choose between one of the 2 options below:
B<Option 1:> System-level installation:
  cpanm --notest --sudo Pheno::Ranker
  pheno-ranker -h
B<Option 2:> Install Pheno-Ranker and the dependencies at C<~/perl5>
  cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
  cpanm --notest Pheno::Ranker
  pheno-ranker --help
To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
  echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
=head3 Method 2: From CPAN in a CONDA environment
Please follow L<these instructions|https://cnag-biomedical-informatics.github.io/pheno-ranker/download-and-installation/#__tabbed_1_2>.
=head3 Method 3: From GitHub
  git clone https://github.com/cnag-biomedical-informatics/pheno-ranker.git
  cd pheno-ranker
Install system level dependencies:
   
  sudo apt-get install cpanminus libperl-dev
Now you have to choose between one of the 2 options below:
B<Option 1:> Install dependencies (they're harmless to your system) as C<sudo>:
  cpanm --notest --sudo --installdeps .
  bin/pheno-ranker --help            
B<Option 2:> Install the dependencies at C<~/perl5>:
  cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
  cpanm --notest --installdeps .
  bin/pheno-ranker --help
To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
  echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
I<Optional:> If you want to use C<utils/barcode> or C<utils/bff_pxf_plot>:
  sudo apt-get install python3-pip libzbar0
  pip3 install -r requirements.txt
=head2 Containerized
=head3 Method 4: From Docker Hub
Download a docker image (latest version - amd64|x86-64) from L<Docker Hub|https://hub.docker.com/r/manuelrueda/pheno-ranker> by executing:
  docker pull manuelrueda/pheno-ranker:latest
  docker image tag manuelrueda/pheno-ranker:latest cnag/pheno-ranker:latest
See additional instructions below.
=head3 Method 5: With Dockerfile
Please download the C<Dockerfile> from the repo:
  wget https://raw.githubusercontent.com/cnag-biomedical-informatics/pheno-ranker/main/Dockerfile
And then run:
  # Docker Version 19.03 and Above (Supports buildx)
  docker buildx build -t cnag/pheno-ranker:latest .
  # Docker Version Older than 19.03 (Does Not Support buildx)
  docker build -t cnag/pheno-ranker:latest .
=head3 Additional instructions for Methods 4 and 5
To run the container (detached) execute:
  docker run -tid -e USERNAME=root --name pheno-ranker cnag/pheno-ranker:latest
To enter:
  docker exec -ti pheno-ranker bash
The command-line executable can be found at:
  /usr/share/pheno-ranker/bin/pheno-ranker
The default container user is C<root> but you can also run the container as C<$UID=1000> (C<dockeruser>). 
  docker run --user 1000 -tid --name pheno-ranker cnag/pheno-ranker:latest
  
=head3 Mounting volumes
Docker containers are fully isolated. If you need to mount a volume to the container please use the following syntax (C<-v host:container>).
Find an example below (note that you need to change the paths to match yours):
  docker run -tid --volume /media/mrueda/4TBT/data:/data --name pheno-ranker-mount cnag/pheno-ranker:latest
Then I will do something like this:
  # First I create an alias to simplify invocation (from the host)
  alias pheno-ranker='docker exec -ti pheno-ranker-mount /usr/share/pheno-ranker/bin/pheno-ranker'
  # Now I use the alias to run the command (note that I use the flag -o to specify the filepath)
  pheno-ranker -r /data/individuals.json -o /data/matrix.txt
=head3 System requirements
  * Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOS, OpenSUSE) should do as well.
    (It should also work on macOS and Windows Server, but we are only providing information for Linux here)
  * Perl 5 (>= 5.26 core; installed by default in most Linux distributions). Check the version with "perl -v".
  * >= 4GB of RAM
  * 1 core
  * At least 16GB HDD
=head1 HOW TO RUN PHENO-RANKER
To run pheno-ranker you will need one or more PXF/BFF files in JSON|YAML format. The reference cohort must be a JSON array in which each individual's data is consolidated into a single object.
You can download examples from L<this location|https://github.com/CNAG-Biomedical-Informatics/pheno-ranker/tree/main/share/ex>.
There are two modes of operation:
=over 4
=item Cohort mode:
  
B<Intra-cohort:> With C<--r> argument and 1 cohort.
B<Inter-cohort:> With C<--r> and multiple cohort files. It can be used in combination with C<--append-prefixes> to add prefixes to each individual id.
=item Patient Mode:
With C<-r> reference cohort(s) and C<--t> patient data.
=back
B<Examples:>
 $ ./pheno-ranker -r phenopackets.json  # intra-cohort
 $ ./pheno-ranker -r phenopackets.yaml -o my_matrix.txt # intra-cohort
 $ ./pheno-ranker -r phenopackets.json -w weights.yaml --exclude-terms sex ethnicity exposures # intra-cohort with weights
 $ $path/pheno-ranker -r individuals.json others.yaml --append-prefixes CANCER CONTROL  # inter-cohort
 $ $path/pheno-ranker -r individuals.json -t patient.yaml -max-out 100 # patient mode
=head2 COMMON ERRORS AND SOLUTIONS
 * Error message: R plotting
     Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,  : 
     line 1 did not have X elements
     Calls: as.matrix -> read.table -> scan
     Execution halted
   Solution: Make sure that the values of your primary key (e.g., "id") do not contain spaces (e.g., "my fav id" must be "my_fav_id")
 * Error message: Foo
   Solution: Bar
=head1 CITATION
The author requests that any published work that utilizes C<Pheno-Ranker> cites the following reference:
Leist, I.C. et al., (2024). Pheno-Ranker: a toolkit for comparison of phenotypic data stored in GA4GH standards and beyond. I<BMC Bioinformatics>. DOI: 10.1186/s12859-024-05993-2
=head1 AUTHOR 
Written by Manuel Rueda, PhD. Info about CNAG can be found at L<https://www.cnag.eu>.
=head1 COPYRIGHT AND LICENSE
This PERL file is copyrighted. See the LICENSE file included in this distribution.
=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)