``````#!/usr/bin/perl

#
# dbcolstatscores.pm
# Copyright (C) 1997-2018 by John Heidemann <johnh@isi.edu>
#
# public license, version 2.  See the file COPYING
# in \$dblibdir for details.
#

package Fsdb::Filter::dbcolstatscores;

dbcolstatscores - compute z-scores or t-scores for each value in a population

dbcolstatscores [-t] [--tmean=MEAN] [--tstddev=STDDEV] column

Compute statistics (z-score and optionally t-score) over a COLUMN of
numbers.  Creates new columns called "zscore", "tscore".
T-scores are only computed if requested with the C<-t> option,
or if C<--tmean> or C<--tstddev> are explicitly specified
(defaults are mean of 50, standard deviation of 10).

You may recall from your statistics class that a z-score is simply
the value normalized by mean and standard deviation, so that 0.0
is the mean and positive or negative values are multiples of the
standard deviation.
It assumes data follows a normal (Gaussian) distribution.

T-score scales the z-score to match a mean of 50 and a standard
deviation of 10.  This program allows generalized t-scores that use
any mean and standard deviation.

Other scales are sometimes used as well.  The Wechsler Adult
Intelligence Scale (one type of IQ test) is adjusted to a mean of 100
and a standard deviation of 15.  Other tests scale to other standard
deviations.

This program requires two passes over the data, and consumes
O(1) memory and O(number of rows) disk space.

=over 4

=item B<-a> or B<--include-non-numeric>

Compute stats over all records (treat non-numeric records
as zero rather than just ignoring them).

=item B<-t>

Compute t-scores in addition to z-scores.

=item B<--tmean MEAN>

Use the given MEAN for t-scores.

=item B<--tstddev STDDEV> or B<--tsd STDDEV>

Use the given STDDEV for the standard deviation of the t-scores.

=item B<-f FORMAT> or B<--format FORMAT>

Specify a L<printf(3)>-style format for output statistics.
Defaults to C<%.5g>.

=item B<-T TmpDir>

where to put tmp files.
Also uses environment variable TMPDIR, if -T is
not specified.
Default is /tmp.

=back

=for comment
begin_standard_fsdb_options

This module also supports the standard fsdb options:

=over 4

=item B<-d>

Enable debugging output.

=item B<-i> or B<--input> InputSource

Read from InputSource, typically a file name, or C<-> for standard input,
or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.

=item B<-o> or B<--output> OutputDestination

Write to OutputDestination, typically a file name, or C<-> for standard output,
or (if in Perl) a IO::Handle, Fsdb::IO or Fsdb::BoundedQueue objects.

=item B<--autorun> or B<--noautorun>

By default, programs process automatically,
but Fsdb::Filter objects in Perl do not run until you invoke
the run() method.
The C<--(no)autorun> option controls that behavior within Perl.

=item B<--help>

Show help.

=item B<--man>

Show full manual.

=back

=for comment
end_standard_fsdb_options

#fsdb name id test1
a 1 80
b 2 70
c 3 65
d 4 90
e 5 70
f 6 90

cat DATA/grades.fsdb | dbcolstatscores --tmean 50 --tstddev 10 test1 | dbcolneaten

#fsdb name id test1 zscore   tscore
a       1  80    0.23063  52.306
b       2  70    -0.69188 43.081
c       3  65    -1.1531  38.469
d       4  90    1.1531   61.531
e       5  70    -0.69188 43.081
f       6  90    1.1531   61.531
#  | dbcolstatscores --tmean 50 --tstddev 10 test1
#  | dbcolneaten

L<dbcolpercentile(1)>,
L<dbcolstats(1)>,
L<Fsdb>,
L<dbcolscorrelate>

=cut

@ISA = qw(Fsdb::Filter);
(\$VERSION) = 2.0;

use strict;
use Pod::Usage;
use Carp;

use Fsdb::Filter;
use Fsdb::IO::Writer;
use Fsdb::Filter::dbpipeline qw(dbpipeline_open2 dbpipeline_close2_hash dbcolstats);
use Fsdb::Support qw(\$is_numeric_regexp);
use Fsdb::Support::NamedTmpfile;

\$filter = new Fsdb::Filter::dbcolstatscores(@arguments);

Create a new dbcolstatscores object, taking command-line arguments.

=cut

sub new (\$@) {
my \$class = shift @_;
my \$self = \$class->SUPER::new(@_);
bless \$self, \$class;
\$self->set_defaults;
\$self->parse_options(@_);
\$self->SUPER::post_new();
return \$self;
}

\$filter->set_defaults();

Internal: set up defaults.

=cut

sub set_defaults (\$) {
my(\$self) = @_;
\$self->SUPER::set_defaults();
\$self->{_include_non_numeric} = undef;
\$self->{_do_tscores} = undef;
\$self->{_t_mean} = undef;
\$self->{_t_stddev} = undef;
\$self->{_format} = "%.5g";
\$self->set_default_tmpdir;
}

\$filter->parse_options(@ARGV);

Internal: parse command-line arguments.

=cut

sub parse_options (\$@) {
my \$self = shift @_;

my(@argv) = @_;
\$self->get_options(
\@argv,
'help|?' => sub { pod2usage(1); },
'man' => sub { pod2usage(-verbose => 2); },
'a|include-non-numeric!' => \\$self->{_include_non_numeric},
'autorun!' => \\$self->{_autorun},
'close!' => \\$self->{_close},
'd|debug+' => \\$self->{_debug},
'f|format=s' => \\$self->{_format},
'i|input=s' => sub { \$self->parse_io_option('input', @_); },
'log!' => \\$self->{_logprog},
'o|output=s' => sub { \$self->parse_io_option('output', @_); },
't!' => \\$self->{_do_tscores},
'tmean=f' => \\$self->{_t_mean},
'tstddev|tsd=f' => \\$self->{_t_stddev},
'T|tmpdir|tempdir=s' => \\$self->{_tmpdir},
) or pod2usage(2);
\$self->parse_target_column(\@argv);
}

\$filter->setup();

=cut

sub setup (\$) {
my(\$self) = @_;

pod2usage(2) if (!defined(\$self->{_target_column}));
\$self->{_do_tscores} = 1 if (defined(\$self->{_t_mean}));
\$self->{_do_tscores} = 1 if (defined(\$self->{_t_stddev}));
\$self->{_t_mean} ||= 50.0;
\$self->{_t_stddev} ||= 10.0;

\$self->{_target_coli} = \$self->{_in}->col_to_i(\$self->{_target_column});
croak(\$self->{_prog} . ": target column " . \$self->{_target_column} . " is not in input stream.\n")
if (!defined(\$self->{_target_coli}));

\$self->finish_io_option('output', -clone => \$self->{_in}, -outputheader => 'delay');
\$self->{_out}->col_create('zscore')
or croak(\$self->{_prog} . ": cannot create column zscore (maybe it already existed?)\n");
if (\$self->{_do_tscores}) {
\$self->{_out}->col_create('tscore')
or croak(\$self->{_prog} . ": cannot create column tscore (maybe it already existed?)\n");
};
}

\$filter->run();

Internal: run over each rows.

=cut
sub run (\$) {
my(\$self) = @_;

#
# Shunt the data to a separate file.
#
\$self->{_copy_filename} = Fsdb::Support::NamedTmpfile::alloc(\$self->{_tmpdir});
my \$copy_writer = new Fsdb::IO::Writer(-file => \$self->{_copy_filename},
-clone => \$self->{_in});

# and take stats
dbpipeline_open2([-cols => [qw(data)]], dbcolstats(qw(data)));

my \$target_coli = \$self->{_target_coli};
my \$fref;
# copy and send to stats
\$copy_writer->write_rowobj(\$fref);
\$stats_sink->write_row(\$fref->[\$target_coli]);
};
# close up both
\$copy_writer->close;
my \$stats_href = dbpipeline_close2_hash(\$stats_source_queue, \$stats_sink, \$stats_thread);
foreach (qw(mean stddev)) {
croak(\$self->{_prog} . ": internal error, missing stats field \$_.\n")
if (!defined(\$stats_href->{\$_}));
};
my \$mean = \$stats_href->{'mean'};
my \$stddev = \$stats_href->{'stddev'};

#
# now re-open the copy and generate the new data
#
\$self->{_in} = new Fsdb::IO::Reader(-file => \$self->{_copy_filename},
my \$new_target_coli = \$self->{_in}->col_to_i(\$self->{_target_column});
croak(\$self->{_prog} . ": internal error: old and new target column numbers don't match.\n")
if (\$target_coli != \$new_target_coli);
my \$write_fastpath_sub = \$self->{_out}->fastpath_sub(); # regenerate
my \$zscore_coli = \$self->{_out}->col_to_i('zscore');
my \$tscore_coli = \$self->{_do_tscores} ? \$self->{_out}->col_to_i('tscore') : undef;
my \$x = \$fref->[\$target_coli];
if (\$x =~ /\$is_numeric_regexp/) {
my \$zscore = (\$x - \$mean) / \$stddev;
\$fref->[\$zscore_coli] = \$self->numeric_formatting(\$zscore);
if (\$self->{_do_tscores}) {
my \$tscore = \$zscore * \$self->{_t_stddev} + \$self->{_t_mean};
\$fref->[\$tscore_coli] = \$self->numeric_formatting(\$tscore);
};
} else {
\$fref->[\$zscore_coli] = '-';
\$fref->[\$tscore_coli] = '-' if (\$self->{_do_tscores});
};
&\$write_fastpath_sub(\$fref);
};
}