package Algorithm::RegressionTree;
#--------------------------------------------------------------------------------------
# Copyright (c) 2017 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::RegressionTree is a Perl module for constructing regression trees. It calls
# on the main Algorithm::DecisionTree module for some of its functionality.
# -------------------------------------------------------------------------------------
#use lib 'blib/lib', 'blib/arch';
#use 5.10.0;
use strict;
use warnings;
use Carp;
use File::Basename;
use Algorithm::DecisionTree 3.43;
use List::Util qw(reduce min max pairmap sum);
use Math::GSL::Matrix;
use Graphics::GnuplotIF;
our $VERSION = '3.43';
@Algorithm::RegressionTree::ISA = ('Algorithm::DecisionTree');
############################################ Constructor ##############################################
sub new {
my ($class, %args) = @_;
my @params = keys %args;
croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
if check_for_illegal_params(@params) == 0;
my %dtargs = %args;
delete $dtargs{dependent_variable_column};
delete $dtargs{predictor_columns};
delete $dtargs{mse_threshold};
delete $dtargs{need_data_normalization};
delete $dtargs{jacobian_choice};
delete $dtargs{debug1_r};
delete $dtargs{debug2_r};
delete $dtargs{debug3_r};
my $instance = Algorithm::DecisionTree->new(%dtargs);
bless $instance, $class;
$instance->{_dependent_variable_column} = $args{dependent_variable_column} || undef;
$instance->{_predictor_columns} = $args{predictor_columns} || 0;
$instance->{_mse_threshold} = $args{mse_threshold} || 0.01;
$instance->{_jacobian_choice} = $args{jacobian_choice} || 0;
$instance->{_need_data_normalization} = $args{need_data_normalization} || 0;
$instance->{_dependent_var} = undef;
$instance->{_dependent_var_values} = undef;
$instance->{_samples_dependent_var_val_hash} = undef;
$instance->{_root_node} = undef;
$instance->{_debug1_r} = $args{debug1_r} || 0;
$instance->{_debug2_r} = $args{debug2_r} || 0;
$instance->{_debug3_r} = $args{debug3_r} || 0;
$instance->{_sample_points_for_dependent_var} = [];
$instance->{_output_for_plots} = {};
$instance->{_output_for_surface_plots} = {};
bless $instance, $class;
}
############################################## Methods #################################################
sub get_training_data_for_regression {
my $self = shift;
die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
my @dependent_var_values;
my %all_record_ids_with_dependent_var_values;
my $firstline;
my %data_hash;
$|++;
open FILEIN, $self->{_training_datafile};
my $record_index = 0;
while (<FILEIN>) {
next if /^[ ]*\r?\n?$/;
$_ =~ s/\r?\n?$//;
my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
if ($record_index == 0) {
$firstline = $record;
$record_index++;
next;
}
my @parts = split /,/, $record;
my $record_label = shift @parts;
$record_label =~ s/^\s*\"|\"\s*$//g;
$data_hash{$record_label} = \@parts;
push @dependent_var_values, $parts[$self->{_dependent_variable_column}-1];
$all_record_ids_with_dependent_var_values{$parts[0]} = $parts[$self->{_dependent_variable_column}-1];
print "." if $record_index % 10000 == 0;
$record_index++;
}
close FILEIN;
$self->{_how_many_total_training_samples} = $record_index; #it's less by 1 from total num of records; okay
print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1_r};
my @all_feature_names = grep $_, split /,/, substr($firstline, index($firstline,','));
my $dependent_var_column_heading = $all_feature_names[$self->{_dependent_variable_column} - 1];
my @feature_names = map {$all_feature_names[$_-1]} @{$self->{_predictor_columns}};
my %dependent_var_value_for_sample_hash = map {"sample_" . $_ => "$dependent_var_column_heading=" . $data_hash{$_}->[$self->{_dependent_variable_column} - 1 ] } keys %data_hash;
my @sample_names = map {"sample_$_"} keys %data_hash;
my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID => [map {my $fname = $all_feature_names[$_-1]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $data_hash{$sampleID}->[$_-1] } } @{$self->{_predictor_columns}} ] } keys %data_hash;
my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a-1] => [ map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_predictor_columns}};
my %numeric_features_valuerange_hash = ();
my %feature_values_how_many_uniques_hash = ();
my %features_and_unique_values_hash = ();
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
foreach my $feature (keys %features_and_values_hash) {
my %seen = ();
my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++} @{$features_and_values_hash{$feature}};
$feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
my $not_all_values_float = 0;
map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
if ($not_all_values_float == 0) {
my @minmaxvalues = minmax(\@unique_values_for_feature);
$numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
}
$features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
}
if ($self->{_debug1_r}) {
print "\nDependent var values: @dependent_var_values\n";
print "\nEach sample data record:\n";
foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
print "$kee => @{$feature_values_for_samples_hash{$kee}}\n";
}
print "\ndependent var value for each data sample:\n";
foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %dependent_var_value_for_sample_hash) {
print "$kee => $dependent_var_value_for_sample_hash{$kee}\n";
}
print "\nfeatures and the values taken by them:\n";
for my $kee (sort keys %features_and_values_hash) {
print "$kee => @{$features_and_values_hash{$kee}}\n";
}
print "\nnumeric features and their ranges:\n";
for my $kee (sort keys %numeric_features_valuerange_hash) {
print "$kee => @{$numeric_features_valuerange_hash{$kee}}\n";
}
print "\nnumber of unique values in each feature:\n";
for my $kee (sort keys %feature_values_how_many_uniques_hash) {
print "$kee => $feature_values_how_many_uniques_hash{$kee}\n";
}
}
$self->{_XMatrix} = undef;
$self->{_YVector} = undef;
$self->{_dependent_var} = $dependent_var_column_heading;
$self->{_dependent_var_values} = \@dependent_var_values;
$self->{_feature_names} = \@feature_names;
$self->{_samples_dependent_var_val_hash} = \%dependent_var_value_for_sample_hash;
$self->{_training_data_hash} = \%feature_values_for_samples_hash;
$self->{_features_and_values_hash} = \%features_and_values_hash;
$self->{_features_and_unique_values_hash} = \%features_and_unique_values_hash;
$self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
$self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
$self->SUPER::calculate_first_order_probabilities();
}
sub construct_XMatrix_and_YVector_all_data {
my $self = shift;
my $matrix_rows_as_lists = [ map {my @arr = @$_; [map substr($_,index($_,'=')+1), @arr] } map {$self->{_training_data_hash}->{$_}} sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}} ];
map {push @$_, 1} @{$matrix_rows_as_lists};
map {print "XMatrix row: @$_\n"} @{$matrix_rows_as_lists} if $self->{_debug1_r};
my $XMatrix = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, scalar @{$matrix_rows_as_lists->[0]});
pairmap {$XMatrix->set_row($a,$b)} ( 0..@{$matrix_rows_as_lists}-1, @{$matrix_rows_as_lists} )
[ map { $_, $_ + @{$matrix_rows_as_lists} } ( 0 .. @{$matrix_rows_as_lists}-1 ) ];
if ($self->{_debug1_r}) {
foreach my $rowindex (0..@{$matrix_rows_as_lists}-1) {
my @onerow = $XMatrix->row($rowindex)->as_list;
print "XMatrix row again: @onerow\n";
}
}
$self->{_XMatrix} = $XMatrix;
my @dependent_var_values = map {my $val = $self->{_samples_dependent_var_val_hash}->{$_}; substr($val,index($val,'=')+1)} sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_samples_dependent_var_val_hash}};
print "dependent var values: @dependent_var_values\n" if $self->{_debug1_r};
my $YVector = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, 1);
pairmap {$YVector->set_row($a,$b)} ( 0..@{$matrix_rows_as_lists}-1, map {[$_]} @dependent_var_values )
[ map { $_, $_ + @{$matrix_rows_as_lists} } ( 0 .. @{$matrix_rows_as_lists}-1 ) ];
if ($self->{_debug1_r}) {
foreach my $rowindex (0..@{$matrix_rows_as_lists}-1) {
my @onerow = $YVector->row($rowindex)->as_list;
print "YVector row: @onerow\n";
}
}
$self->{_YVector} = $YVector;
return ($XMatrix, $YVector);
}
sub estimate_regression_coefficients {
my $self = shift;
my ($XMatrix, $YVector, $display) = @_;
$display = 0 unless defined $display;
my ($nrows, $ncols) = $XMatrix->dim;
print "nrows=$nrows ncols=$ncols\n" if $self->{_debug2_r};
my $jacobian_choice = $self->{_jacobian_choice};
my $X = $XMatrix->copy();
my $y = $YVector->copy();
if ($self->{_need_data_normalization}) {
die "normalization feature is yet to be added to the module --- sorry";
}
my $beta0 = (transpose($X) * $X)->inverse() * transpose($X) * $y;
my ($betarows, $betacols) = $beta0->dim;
die "Something has gone wrong with the calculation of beta coefficients" if $betacols > 1;
if ($jacobian_choice == 0) {
# my $error = sum(abs_vector_as_list( $y - ($X * $beta) )) / $nrows;
my $error = sum( map abs, ($y - ($X * $beta0) )->col(0)->as_list ) / $nrows;
my $predictions = $X * $beta0;
if ($display) {
if ($ncols == 2) {
my @xvalues = $X->col(0)->as_list;
my @yvalues = $predictions->col(0)->as_list;
$self->{_output_for_plots}->{scalar(keys %{$self->{_output_for_plots}}) + 1} = [\@xvalues,\@yvalues];
} elsif ($ncols == 3) {
my @xvalues;
my @yvalues = $predictions->col(0)->as_list;
foreach my $row_index (0 .. $X->rows - 1) {
my @onerow = $X->row($row_index)->as_list;
pop @onerow;
push @xvalues, "@onerow";
}
$self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
} else {
print "no display when the overall dimensionality of the data exceeds 3\n";
}
}
return ($error, $beta0);
}
my $beta = $beta0;
if ($self->{_debug2_r}) {
print "\ndisplaying beta0 matrix\n";
display_matrix($beta);
}
my $gamma = 0.1;
my $iterate_again_flag = 1;
my $delta = 0.001;
my $master_interation_index = 0;
$|++;
while (1) {
print "*" unless $master_interation_index++ % 100;
last unless $iterate_again_flag;
$gamma *= 0.1;
$beta0 = 0.99 * $beta0;
print "\n\n======== starting iterations with gamma= $gamma ===========\n\n\n" if $self->{_debug2_r};
$beta = $beta0;
my $beta_old = Math::GSL::Matrix->new($betarows, 1)->zero;
my $error_old = sum( map abs, ($y - ($X * $beta_old) )->col(0)->as_list ) / $nrows;
my $error;
foreach my $iteration (0 .. 1499) {
print "." unless $iteration % 100;
$beta_old = $beta->copy;
my $jacobian;
if ($jacobian_choice == 1) {
$jacobian = $X;
} elsif ($jacobian_choice == 2) {
my $x_times_delta_beta = $delta * $X * $beta;
$jacobian = Math::GSL::Matrix->new($nrows, $ncols);
foreach my $i (0 .. $nrows - 1) {
my @row = ($x_times_delta_beta->get_elem($i,0)) x $ncols;
$jacobian->set_row($i, \@row);
}
$jacobian = (1.0/$delta) * $jacobian;
} else {
die "wrong choice for the jacobian_choice";
}
# $beta = $beta_old + 2 * $gamma * transpose($X) * ( $y - ($X * $beta) );
$beta = $beta_old + 2 * $gamma * transpose($jacobian) * ( $y - ($X * $beta) );
$error = sum( map abs, ($y - ($X * $beta) )->col(0)->as_list ) / $nrows;
if ($error > $error_old) {
if (vector_norm($beta - $beta_old) < (0.00001 * vector_norm($beta_old))) {
$iterate_again_flag = 0;
last;
} else {
last;
}
}
if ($self->{_debug2_r}) {
print "\n\niteration: $iteration gamma: $gamma current error: $error\n";
print "\nnew beta:\n";
display_matrix $beta;
}
if ( vector_norm($beta - $beta_old) < (0.00001 * vector_norm($beta_old)) ) {
print "iterations used: $iteration with gamma: $gamma\n" if $self->{_debug2_r};
$iterate_again_flag = 0;
last;
}
$error_old = $error;
}
}
display_matrix($beta) if $self->{_debug2_r};
my $predictions = $X * $beta;
my @error_distribution = ($y - ($X * $beta))->as_list;
my $squared_error = sum map abs, @error_distribution;
my $error = $squared_error / $nrows;
if ($display) {
if ($ncols == 2) {
my @xvalues = $X->col(0)->as_list;
my @yvalues = $predictions->col(0)->as_list;
$self->{_output_for_plots}->{scalar(keys %{$self->{_output_for_plots}}) + 1} = [\@xvalues,\@yvalues];
} elsif ($ncols == 3) {
my @xvalues;
my @yvalues = $predictions->col(0)->as_list;
foreach my $row_index (0 .. $X->rows - 1) {
my @onerow = $X->row($row_index)->as_list;
pop @onerow;
push @xvalues, "@onerow";
}
$self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
} else {
print "no display when the overall dimensionality of the data exceeds 3\n";
}
}
return ($error, $beta);
}
##------------------------------- Construct Regression Tree ------------------------------------
## At the root node, you do ordinary linear regression for the entire dataset so that you
## can later compare the linear regression fit with the results obtained through the
## regression tree. Subsequently, you call the recursive_descent() method to construct
## the tree.
sub construct_regression_tree {
my $self = shift;
print "\nConstructing regression tree...\n";
my $root_node = RTNode->new(undef, undef, undef, [], $self, 'root');
my ($XMatrix,$YVector) = $self->construct_XMatrix_and_YVector_all_data();
my ($error,$beta) = $self->estimate_regression_coefficients($XMatrix, $YVector, 1);
$root_node->set_node_XMatrix($XMatrix);
$root_node->set_node_YVector($YVector);
$root_node->set_node_error($error);
$root_node->set_node_beta($beta);
$root_node->set_num_data_points($XMatrix->cols);
print "\nerror at root: $error\n";
print "\nbeta at root:\n";
display_matrix($beta);
$self->{_root_node} = $root_node;
$self->recursive_descent($root_node) if $self->{_max_depth_desired} > 0;
return $root_node;
}
## We first look for a feature, along with its partitioning point, that yields the
## largest reduction in MSE compared to the MSE at the parent node. This feature and
## its partitioning point are then used to create two child nodes in the tree.
sub recursive_descent {
my $self = shift;
my $node = shift;
print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n";
my $node_serial_number = $node->get_serial_num();
my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
if (@features_and_values_or_thresholds_on_branch > 0) {
my ($error,$beta,$XMatrix,$YVector) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@copy_of_path_attributes);
$node->set_node_XMatrix($XMatrix);
$node->set_node_YVector($YVector);
$node->set_node_error($error);
$node->set_node_beta($beta);
$node->set_num_data_points($XMatrix->cols);
print "\nNODE SERIAL NUMBER: $node_serial_number\n";
print "\nFeatures and values or thresholds on branch: @features_and_values_or_thresholds_on_branch\n";
return if $error <= $self->{_mse_threshold};
}
my ($best_feature,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point) =
$self->best_feature_calculator(\@copy_of_path_attributes);
return unless defined $best_feature_partitioning_point;
print "\nBest feature found: $best_feature\n";
print "Best feature partitioning_point: $best_feature_partitioning_point\n";
print "Best minmax error at partitioning point: $best_minmax_error_at_partitioning_point\n";
$node->set_feature($best_feature);
$node->display_node() if $self->{_debug2_r};
return if (defined $self->{_max_depth_desired}) &&
(@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired});
if ($best_minmax_error_at_partitioning_point > $self->{_mse_threshold}) {
my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child =
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_feature_partitioning_point";
my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_feature_partitioning_point";
push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
$feature_threshold_combo_for_less_than;
push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
$feature_threshold_combo_for_greater_than;
my $left_child_node = RTNode->new(undef, undef, undef,
\@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
$node->add_child_link($left_child_node);
$self->recursive_descent($left_child_node);
my $right_child_node = RTNode->new(undef, undef, undef,
\@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
$node->add_child_link($right_child_node);
$self->recursive_descent($right_child_node);
}
}
## This is the heart of the regression tree constructor. Its main job is to figure
## out the best feature to use for partitioning the training data samples at the
## current node. The partitioning criterion is that the largest of the MSE's in
## the two partitions should be smaller than the error associated with the parent
## node.
sub best_feature_calculator {
my $self = shift;
my $features_and_values_or_thresholds_on_branch = shift;
my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
print "\n\nfeatures_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
if (@features_and_values_or_thresholds_on_branch == 0) {
my $best_partition_point_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
my $best_minmax_error_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
foreach my $i (0 .. @{$self->{_feature_names}}-1) {
my $feature_name = $self->{_feature_names}->[$i];
my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
my @partitioning_errors;
my %partitioning_error_hash;
foreach my $value (@values[10 .. $#values - 10]) {
my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
my @for_left_child;
my @for_right_child;
if (@features_and_values_or_thresholds_on_branch) {
@for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_left_child, $feature_and_less_than_value_string;
@for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_right_child, $feature_and_greater_than_value_string;
} else {
@for_left_child = ($feature_and_less_than_value_string);
@for_right_child = ($feature_and_greater_than_value_string);
}
my ($error1,$beta1,$XMatrix1,$YVector1) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_left_child);
my ($error2,$beta2,$XMatrix2,$YVector2) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_right_child);
my $partitioning_error = max($error1, $error2);
push @partitioning_errors, $partitioning_error;
$partitioning_error_hash{$partitioning_error} = $value;
}
my $min_max_error_for_feature = min(@partitioning_errors);
$best_partition_point_for_feature_hash->{$feature_name} =
$partitioning_error_hash{$min_max_error_for_feature};
$best_minmax_error_for_feature_hash->{$feature_name} = $min_max_error_for_feature;
}
my $best_feature_name;
my $best_feature_paritioning_point;
my $best_minmax_error_at_partitioning_point;
foreach my $feature (keys %{$best_minmax_error_for_feature_hash}) {
if (! defined $best_minmax_error_at_partitioning_point) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
} elsif ($best_minmax_error_at_partitioning_point > $best_minmax_error_for_feature_hash->{$feature}) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
}
}
my $best_feature_partitioning_point = $best_partition_point_for_feature_hash->{$best_feature_name};
return ($best_feature_name,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point);
} else {
my $pattern1 = '(.+)=(.+)';
my $pattern2 = '(.+)<(.+)';
my $pattern3 = '(.+)>(.+)';
my @true_numeric_types;
my @symbolic_types;
my @true_numeric_types_feature_names;
my @symbolic_types_feature_names;
foreach my $item (@features_and_values_or_thresholds_on_branch) {
if ($item =~ /$pattern2/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern3/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern1/) {
push @symbolic_types, $item;
push @symbolic_types_feature_names, $1;
} else {
die "format error in the representation of feature and values or thresholds";
}
}
my %seen = ();
@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
%seen = ();
@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
my @bounded_intervals_numeric_types =
@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
# Calculate the upper and the lower bounds to be used when searching for the best
# threshold for each of the numeric features that are in play at the current node:
my (%upperbound, %lowerbound);
foreach my $feature (@true_numeric_types_feature_names) {
$upperbound{$feature} = undef;
$lowerbound{$feature} = undef;
}
foreach my $item (@bounded_intervals_numeric_types) {
foreach my $feature_grouping (@$item) {
if ($feature_grouping->[1] eq '>') {
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
} else {
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
}
}
}
my $best_partition_point_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
my $best_minmax_error_for_feature_hash = { map {$_ => undef} @{$self->{_feature_names}} };
foreach my $i (0 .. @{$self->{_feature_names}}-1) {
my $feature_name = $self->{_feature_names}->[$i];
my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
my @newvalues;
if (contained_in($feature_name, @true_numeric_types_feature_names)) {
if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
$lowerbound{$feature_name} >= $upperbound{$feature_name}) {
next;
} elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
$lowerbound{$feature_name} < $upperbound{$feature_name}) {
foreach my $x (@values) {
push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
}
} elsif (defined($upperbound{$feature_name})) {
foreach my $x (@values) {
push @newvalues, $x if $x <= $upperbound{$feature_name};
}
} elsif (defined($lowerbound{$feature_name})) {
foreach my $x (@values) {
push @newvalues, $x if $x > $lowerbound{$feature_name};
}
} else {
die "Error is bound specifications in best feature calculator";
}
} else {
@newvalues = @{deep_copy_array(\@values)};
}
next if @newvalues < 30;
my @partitioning_errors;
my %partitioning_error_hash;
foreach my $value (@newvalues[10 .. $#newvalues - 10]) {
my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
my @for_left_child;
my @for_right_child;
if (@features_and_values_or_thresholds_on_branch) {
@for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_left_child, $feature_and_less_than_value_string;
@for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
push @for_right_child, $feature_and_greater_than_value_string;
} else {
@for_left_child = ($feature_and_less_than_value_string);
@for_right_child = ($feature_and_greater_than_value_string);
}
my ($error1,$beta1,$XMatrix1,$YVector1) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_left_child);
my ($error2,$beta2,$XMatrix2,$YVector2) =
$self->_error_for_given_sequence_of_features_and_values_or_thresholds(\@for_right_child);
my $partitioning_error = max($error1, $error2);
push @partitioning_errors, $partitioning_error;
$partitioning_error_hash{$partitioning_error} = $value;
}
my $min_max_error_for_feature = min(@partitioning_errors);
$best_partition_point_for_feature_hash->{$feature_name} =
$partitioning_error_hash{$min_max_error_for_feature};
$best_minmax_error_for_feature_hash->{$feature_name} = $min_max_error_for_feature;
}
my $best_feature_name;
my $best_feature_paritioning_point;
my $best_minmax_error_at_partitioning_point;
foreach my $feature (keys %{$best_minmax_error_for_feature_hash}) {
if (! defined $best_minmax_error_at_partitioning_point) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
} elsif ($best_minmax_error_at_partitioning_point > $best_minmax_error_for_feature_hash->{$feature}) {
$best_minmax_error_at_partitioning_point = $best_minmax_error_for_feature_hash->{$feature};
$best_feature_name = $feature;
}
}
my $best_feature_partitioning_point = $best_partition_point_for_feature_hash->{$best_feature_name};
return ($best_feature_name,$best_minmax_error_at_partitioning_point,$best_feature_partitioning_point);
}
}
## This method requires that all truly numeric types only be expressed as '<' or '>'
## constructs in the array of branch features and thresholds
sub _error_for_given_sequence_of_features_and_values_or_thresholds{
my $self = shift;
my $array_of_features_and_values_or_thresholds = shift;
if (@$array_of_features_and_values_or_thresholds == 0) {
my ($XMatrix,$YVector) = $self->construct_XMatrix_and_YVector_all_data();
my ($errors,$beta) = $self->estimate_regression_coefficients($XMatrix,$YVector);
return ($errors,$beta,$XMatrix,$YVector)
}
my $pattern1 = '(.+)=(.+)';
my $pattern2 = '(.+)<(.+)';
my $pattern3 = '(.+)>(.+)';
my @true_numeric_types;
my @symbolic_types;
my @true_numeric_types_feature_names;
my @symbolic_types_feature_names;
foreach my $item (@$array_of_features_and_values_or_thresholds) {
if ($item =~ /$pattern2/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern3/) {
push @true_numeric_types, $item;
push @true_numeric_types_feature_names, $1;
} elsif ($item =~ /$pattern1/) {
push @symbolic_types, $item;
push @symbolic_types_feature_names, $1;
} else {
die "format error in the representation of feature and values or thresholds";
}
}
my %seen = ();
@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
%seen = ();
@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
my @bounded_intervals_numeric_types =
@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
# Calculate the upper and the lower bounds to be used when searching for the best
# threshold for each of the numeric features that are in play at the current node:
my (%upperbound, %lowerbound);
foreach my $feature (@true_numeric_types_feature_names) {
$upperbound{$feature} = undef;
$lowerbound{$feature} = undef;
}
foreach my $item (@bounded_intervals_numeric_types) {
foreach my $feature_grouping (@$item) {
if ($feature_grouping->[1] eq '>') {
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
} else {
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
}
}
}
my %training_samples_at_node;
foreach my $feature_name (@true_numeric_types_feature_names) {
if ((defined $lowerbound{$feature_name}) && (defined $upperbound{$feature_name}) &&
($upperbound{$feature_name} <= $lowerbound{$feature_name})) {
return (undef,undef,undef,undef);
} elsif ((defined $lowerbound{$feature_name}) && (defined $upperbound{$feature_name})) {
foreach my $sample (keys %{$self->{_training_data_hash}}) {
my @feature_val_pairs = @{$self->{_training_data_hash}->{$sample}};
foreach my $feature_and_val (@feature_val_pairs) {
my $value_for_feature = substr($feature_and_val, index($feature_and_val,'=')+1 );
my $feature_involved = substr($feature_and_val, 0, index($feature_and_val,'=') );
if (($feature_name eq $feature_involved) &&
($lowerbound{$feature_name} < $value_for_feature) &&
($value_for_feature <= $upperbound{$feature_name})) {
$training_samples_at_node{$sample} = 1;
last;
}
}
}
} elsif ((defined $upperbound{$feature_name}) && (! defined $lowerbound{$feature_name})) {
foreach my $sample (keys %{$self->{_training_data_hash}}) {
my @feature_val_pairs = @{$self->{_training_data_hash}->{$sample}};
foreach my $feature_and_val (@feature_val_pairs) {
my $value_for_feature = substr($feature_and_val, index($feature_and_val,'=')+1 );
my $feature_involved = substr($feature_and_val, 0, index($feature_and_val,'=') );
if (($feature_name eq $feature_involved) &&
($value_for_feature <= $upperbound{$feature_name})) {
$training_samples_at_node{$sample} = 1;
last;
}
}
}
} elsif ((defined $lowerbound{$feature_name}) && (! defined $upperbound{$feature_name})) {
foreach my $sample (keys %{$self->{_training_data_hash}}) {
my @feature_val_pairs = @{$self->{_training_data_hash}->{$sample}};
foreach my $feature_and_val (@feature_val_pairs) {
my $value_for_feature = substr($feature_and_val, index($feature_and_val,'=')+1 );
my $feature_involved = substr($feature_and_val, 0, index($feature_and_val,'=') );
if (($feature_name eq $feature_involved) &&
($value_for_feature > $lowerbound{$feature_name})) {
$training_samples_at_node{$sample} = 1;
last;
}
}
}
} else {
die "Ill formatted call to the '_error_for_given_sequence_...' method";
}
}
foreach my $feature_and_value (@symbolic_types) {
if ($feature_and_value =~ /$pattern1/) {
my ($feature,$value) = ($1,$2);
foreach my $sample (keys %{$self->{_training_data_hash}}) {
my @feature_val_pairs = @{$self->{_training_data_hash}->{$sample}};
foreach my $feature_and_val (@feature_val_pairs) {
my $feature_in_sample = substr($feature_and_val, 0, index($feature_and_val,'=') );
my $value_in_sample = substr($feature_and_val, index($feature_and_val,'=')+1 );
if (($feature eq $feature_in_sample) && ($value eq $value_in_sample)) {
$training_samples_at_node{$sample} = 1;
last;
}
}
}
}
}
my @training_samples_at_node = keys %training_samples_at_node;
my $matrix_rows_as_lists = [ map {my @arr = @$_; [map substr($_,index($_,'=')+1), @arr] } map {$self->{_training_data_hash}->{$_}} sort {sample_index($a) <=> sample_index($b)} @training_samples_at_node ];;
map {push @$_, 1} @{$matrix_rows_as_lists};
my $XMatrix = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, scalar @{$matrix_rows_as_lists->[0]});
pairmap {$XMatrix->set_row($a,$b)} ( 0..@{$matrix_rows_as_lists}-1, @{$matrix_rows_as_lists} )
[ map { $_, $_ + @{$matrix_rows_as_lists} } ( 0 .. @{$matrix_rows_as_lists}-1 ) ];
if ($self->{_debug3_r}) {
print "\nXMatrix as its transpose:";
my $displayX = transpose($XMatrix);
display_matrix($displayX);
}
my @dependent_var_values = map {my $val = $self->{_samples_dependent_var_val_hash}->{$_}; substr($val,index($val,'=')+1)} sort {sample_index($a) <=> sample_index($b)} @training_samples_at_node;
print "dependent var values: @dependent_var_values\n" if $self->{_debug1_r};
my $YVector = Math::GSL::Matrix->new(scalar @{$matrix_rows_as_lists}, 1);
pairmap {$YVector->set_row($a,$b)} ( 0..@{$matrix_rows_as_lists}-1, map {[$_]} @dependent_var_values )
[ map { $_, $_ + @{$matrix_rows_as_lists} } ( 0 .. @{$matrix_rows_as_lists}-1 ) ];
if ($self->{_debug3_r}) {
print "\nYVector:";
my $displayY = transpose($YVector);
display_matrix($displayY);
}
my ($error,$beta) = $self->estimate_regression_coefficients($XMatrix, $YVector);
if ($self->{_debug3_r}) {
display_matrix($beta);
print("\n\nerror distribution at node: ", $error);
}
return ($error,$beta,$XMatrix,$YVector);
}
#----------------------------- Predict with Regression Tree ------------------------------
sub predictions_for_all_data_used_for_regression_estimation {
my $self = shift;
my $root_node = shift;
my %predicted_values;
my %leafnode_for_values;
my $ncols = $self->{_XMatrix}->cols;
if ($ncols == 2) {
foreach my $sample (keys %{$self->{_training_data_hash}}) {
my $pattern = '(\S+)\s*=\s*(\S+)';
$self->{_training_data_hash}->{$sample}->[0] =~ /$pattern/;
my ($feature,$value) = ($1, $2);
my $new_feature_and_value = "$feature=$value";
my $answer = $self->prediction_for_single_data_point($root_node, [$new_feature_and_value]);
$predicted_values{$value} = $answer->{'prediction'};
$leafnode_for_values{"$value,$predicted_values{$value}"} = $answer->{'solution_path'}->[-1];
}
my @leaf_nodes_used = keys %{{map {$_ => 1} values %leafnode_for_values}};
foreach my $leaf (@leaf_nodes_used) {
my @xvalues;
my @yvalues;
foreach my $x (sort {$a <=> $b} keys %predicted_values) {
if ($leaf == $leafnode_for_values{"$x,$predicted_values{$x}"}) {
push @xvalues, $x;
push @yvalues, $predicted_values{$x};
}
}
$self->{_output_for_plots}->{scalar(keys %{$self->{_output_for_plots}}) + 1} = [\@xvalues,\@yvalues];
}
} elsif ($ncols == 3) {
foreach my $sample (keys %{$self->{_training_data_hash}}) {
my $pattern = '(\S+)\s*=\s*(\S+)';
my @features_and_vals;
my @newvalues;
foreach my $feature_and_val (@{$self->{_training_data_hash}->{$sample}}) {
$feature_and_val =~ /$pattern/;
my ($feature,$value) = ($1, $2);
push @newvalues, $value;
my $new_feature_and_value = "$feature=$value";
push @features_and_vals, $new_feature_and_value;
}
my $answer = $self->prediction_for_single_data_point($root_node, \@features_and_vals);
$predicted_values{"@newvalues"} = $answer->{'prediction'};
$leafnode_for_values{"@newvalues"} = $answer->{'solution_path'}->[-1];
}
my @leaf_nodes_used = keys %{{map {$_ => 1} values %leafnode_for_values}};
foreach my $leaf (@leaf_nodes_used) {
my @xvalues;
my @yvalues;
foreach my $kee (keys %predicted_values) {
if ($leaf == $leafnode_for_values{$kee}) {
push @xvalues, $kee;
push @yvalues, $predicted_values{$kee};
}
}
$self->{_output_for_surface_plots}->{scalar(keys %{$self->{_output_for_surface_plots}}) + 1} = [\@xvalues,\@yvalues];
}
} else {
die "\nThe module does not yet include regression based predictions when you have more than " .
"two predictor variables --- on account of the difficulty of properly visualizing the " .
"quality of such predictions. Future versions of this modue may nonetheless allow for " .
"regression based predictions in such cases (depending on user feedback)";
}
}
sub bulk_predictions_for_data_in_a_csv_file {
my $self = shift;
my ($root_node, $filename, $columns) = @_;
die("Aborted. bulk_predictions_for_data_in_a_csv_file() is only for CSV files") unless $filename =~ /\.csv$/;
my $basefilename = basename($filename, '.csv');
my $out_file_name = $basefilename . "_output.csv";
unlink $out_file_name if -e $out_file_name;
open FILEOUT, "> $out_file_name" or die "Unable to open $out_file_name: $!";
$|++;
open FILEIN, $filename or die "Unable to open $filename: $!";
my $record_index = 0;
my @fieldnames;
while (<FILEIN>) {
next if /^[ ]*\r?\n?$/;
$_ =~ s/\r?\n?$//;
my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
if ($record_index == 0) {
@fieldnames = grep $_, split /,/, substr($record, index($record,','));
$record_index++;
next;
}
my @feature_vals = grep $_, split /,/, $record;
my @features_and_vals;
foreach my $col_index (@$columns) {
push @features_and_vals, "$fieldnames[$col_index-1]=$feature_vals[$col_index-1]"
}
my $answer = $self->prediction_for_single_data_point($root_node, \@features_and_vals);
print FILEOUT "$record => $answer->{'prediction'}\n";
}
close FILEIN;
close FILEOUT;
}
sub mse_for_tree_regression_for_all_training_samples {
my $self = shift;
my $root_node = shift;
my %predicted_values;
my %dependent_var_values;
my %leafnode_for_values;
my $total_error = 0.0;
my %samples_at_leafnode;
foreach my $sample (keys %{$self->{_training_data_hash}}) {
my $pattern = '(\S+)\s*=\s*(\S+)';
my @features_and_vals;
my $newvalue;
foreach my $feature_and_val (@{$self->{_training_data_hash}->{$sample}}) {
$feature_and_val =~ /$pattern/;
my ($feature,$value) = ($1, $2);
if (contained_in($feature, @{$self->{_feature_names}})) {
my $new_feature_and_value = "$feature=$value";
push @features_and_vals, "$feature=$value";
}
$newvalue = $value;
}
my $answer = $self->prediction_for_single_data_point($root_node, \@features_and_vals);
$predicted_values{"@features_and_vals"} = $answer->{'prediction'};
$self->{_samples_dependent_var_val_hash}->{$sample} =~ /$pattern/;
my ($dep_feature,$dep_value) = ($1, $2);
$dependent_var_values{"@features_and_vals"} = $dep_value;
my $leafnode_for_sample = $answer->{'solution_path'}->[-1];
$leafnode_for_values{"@features_and_vals"} = $answer->{'solution_path'}->[-1];
my $error_for_sample = abs($predicted_values{"@features_and_vals"} - $dependent_var_values{"@features_and_vals"});
$total_error += $error_for_sample;
if (exists $samples_at_leafnode{$leafnode_for_sample}) {
push @{$samples_at_leafnode{$leafnode_for_sample}}, $sample;
} else {
$samples_at_leafnode{$leafnode_for_sample} = [$sample];
}
}
my @leafnodes_used = keys %{{map {$_ => 1} values %leafnode_for_values}};
my $errors_at_leafnode = { map {$_ => 0.0} @leafnodes_used };
foreach my $kee (keys %predicted_values) {
foreach my $leaf (@leafnodes_used) {
$errors_at_leafnode->{$leaf} += abs($predicted_values{$kee} - $dependent_var_values{$kee})
if $leaf == $leafnode_for_values{$kee};
}
}
my $total_error_per_data_point = $total_error / (scalar keys %{$self->{_training_data_hash}});
print "\n\nTree Regression: Total MSE per sample with tree regression: $total_error_per_data_point\n";
foreach my $leafnode (@leafnodes_used) {
my $error_per_data_point = $errors_at_leafnode->{$leafnode} / @{$samples_at_leafnode{$leafnode}};
print " MSE per sample at leafnode $leafnode: $error_per_data_point\n";
}
my $error_with_linear_regression = $self->{_root_node}->get_node_error();
print "For comparision, the MSE per sample error with Linear Regression: $error_with_linear_regression\n";
}
## Calculated the predicted value for the dependent variable from a given value for all
## the predictor variables.
sub prediction_for_single_data_point {
my $self = shift;
my $root_node = shift;
my $features_and_values = shift;
die "Error in the names you have used for features and/or values when calling " .
"prediction_for_single_data_point()" unless $self->_check_names_used($features_and_values);
my $pattern = '(\S+)\s*=\s*(\S+)';
my @new_features_and_values;
foreach my $feature_and_value (@$features_and_values) {
$feature_and_value =~ /$pattern/;
my ($feature,$value) = ($1, $2);
push @new_features_and_values, "$feature=$value";
}
my %answer;
$answer{'solution_path'} = [];
my $prediction = $self->recursive_descent_for_prediction($root_node,\@new_features_and_values, \%answer);
$answer{'solution_path'} = [reverse( @{$answer{'solution_path'}} )];
return \%answer;
}
sub recursive_descent_for_prediction {
my $self = shift;
my $node = shift;
my $feature_and_values = shift;
my $answer = shift;
my @children = @{$node->get_children()};
if (@children == 0) {
my $leaf_node_prediction = $node->node_prediction_from_features_and_values($feature_and_values);
$answer->{'prediction'} = $leaf_node_prediction;
push @{$answer->{'solution_path'}}, $node->get_serial_num();
return $answer;
}
my $feature_tested_at_node = $node->get_feature();
print "\nFeature tested at node for prediction: $feature_tested_at_node\n" if $self->{_debug3};
my $value_for_feature;
my $path_found;
my $pattern = '(\S+)\s*=\s*(\S+)';
my ($feature,$value);
foreach my $feature_and_value (@$feature_and_values) {
$feature_and_value =~ /$pattern/;
my ($feature,$value) = ($1, $2);
if ($feature eq $feature_tested_at_node) {
$value_for_feature = $value;
}
}
if (! defined $value_for_feature) {
my $leaf_node_prediction = $node->node_prediction_from_features_and_values($feature_and_values);
$answer->{'prediction'} = $leaf_node_prediction;
push @{$answer->{'solution_path'}}, $node->get_serial_num();
return $answer;
}
foreach my $child (@children) {
my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
my $pattern1 = '(.+)<(.+)';
my $pattern2 = '(.+)>(.+)';
if ($last_feature_and_value_on_branch =~ /$pattern1/) {
my ($feature,$threshold) = ($1, $2);
if ($value_for_feature <= $threshold) {
$path_found = 1;
$answer = $self->recursive_descent_for_prediction($child, $feature_and_values, $answer);
push @{$answer->{'solution_path'}}, $node->get_serial_num();
last;
}
}
if ($last_feature_and_value_on_branch =~ /$pattern2/) {
my ($feature,$threshold) = ($1, $2);
if ($value_for_feature > $threshold) {
$path_found = 1;
$answer = $self->recursive_descent_for_prediction($child, $feature_and_values, $answer);
push @{$answer->{'solution_path'}}, $node->get_serial_num();
last;
}
}
}
return $answer if $path_found;
}
#-------------------------------------- Utility Methods ----------------------------------------
## This method is used to verify that you used legal feature names in the test
## sample that you want to classify with the decision tree.
sub _check_names_used {
my $self = shift;
my $features_and_values_test_data = shift;
my @features_and_values_test_data = @$features_and_values_test_data;
my $pattern = '(\S+)\s*=\s*(\S+)';
foreach my $feature_and_value (@features_and_values_test_data) {
$feature_and_value =~ /$pattern/;
my ($feature,$value) = ($1,$2);
die "Your test data has formatting error" unless defined($feature) && defined($value);
return 0 unless contained_in($feature, @{$self->{_feature_names}});
}
return 1;
}
sub display_all_plots {
my $self = shift;
my $ncols = $self->{_XMatrix}->cols;
unlink "regression_plots.png" if -e "regression_plots.png";
my $master_datafile = $self->{_training_datafile};
my $filename = basename($master_datafile);
my $temp_file = "__temp_" . $filename;
unlink $temp_file if -e $temp_file;
open OUTPUT, ">$temp_file"
or die "Unable to open a temp file in this directory: $!\n";
if ($ncols == 2) {
my @predictor_entries = $self->{_XMatrix}->col(0)->as_list;
my @dependent_val_vals = $self->{_YVector}->col(0)->as_list;
map {print OUTPUT "$predictor_entries[$_] $dependent_val_vals[$_]\n"} 0 .. $self->{_XMatrix}->rows - 1;
print OUTPUT "\n\n";
foreach my $plot (sort {$a <=> $b} keys %{$self->{_output_for_plots}}) {
map {print OUTPUT "$self->{_output_for_plots}->{$plot}->[0]->[$_] $self->{_output_for_plots}->{$plot}->[1]->[$_]\n"} 0 .. @{$self->{_output_for_plots}->{$plot}->[0]} - 1;
print OUTPUT "\n\n"
}
close OUTPUT;
my $gplot = Graphics::GnuplotIF->new( persist => 1 );
my $hardcopy_plot = Graphics::GnuplotIF->new();
$hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"regression_plots.png\"");
$gplot->gnuplot_cmd( "set noclip" );
$gplot->gnuplot_cmd( "set pointsize 2" );
my $arg_string = "";
foreach my $i (0 .. scalar(keys %{$self->{_output_for_plots}})) {
if ($i == 0) {
$arg_string .= "\"$temp_file\" index $i using 1:2 notitle with points lt -1 pt 1, ";
} elsif ($i == 1) {
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"linear regression\" with lines lt 1 lw 4, ";
} elsif ($i == 2) {
$arg_string .= "\"$temp_file\" index $i using 1:2 title \"tree regression\" with lines lt 3 lw 4, ";
} else {
$arg_string .= "\"$temp_file\" index $i using 1:2 notitle with lines lt 3 lw 4, ";
}
}
$arg_string = $arg_string =~ /^(.*),[ ]+$/;
$arg_string = $1;
$hardcopy_plot->gnuplot_cmd( "plot $arg_string" );
$gplot->gnuplot_cmd( "plot $arg_string" );
$gplot->gnuplot_pause(-1);
} elsif ($ncols == 3) {
my @dependent_val_vals = $self->{_YVector}->col(0)->as_list;
foreach my $i (0 .. $self->{_XMatrix}->rows - 1) {
my @onerow = $self->{_XMatrix}->row($i)->as_list;
pop @onerow;
print OUTPUT "@onerow $dependent_val_vals[$i]\n";
}
print OUTPUT "\n\n";
foreach my $plot (sort {$a <=> $b} keys %{$self->{_output_for_surface_plots}}) {
my @plot_data = @{$self->{_output_for_surface_plots}->{$plot}};
my @predictors = @{$plot_data[0]};
my @predictions = @{$plot_data[1]};
map {print OUTPUT "$predictors[$_] $predictions[$_]\n"} 0 .. @predictions - 1;
print OUTPUT "\n\n"
}
close OUTPUT;
my $gplot = Graphics::GnuplotIF->new( persist => 1 );
my $hardcopy_plot = Graphics::GnuplotIF->new();
$hardcopy_plot->gnuplot_cmd('set terminal png', "set output \"regression_plots.png\"");
$gplot->gnuplot_cmd( "set noclip" );
$gplot->gnuplot_cmd( "set pointsize 2" );
my $arg_string = "";
foreach my $i (0 .. scalar(keys %{$self->{_output_for_surface_plots}})) {
if ($i == 0) {
$arg_string .= "\"$temp_file\" index $i using 1:2:3 notitle with points lt -1 pt 1, ";
} elsif ($i == 1) {
$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"linear regression\" with points lt 1 pt 2, ";
} elsif ($i == 2) {
$arg_string .= "\"$temp_file\" index $i using 1:2:3 title \"tree regression\" with points lt 3 pt 3, ";
} else {
$arg_string .= "\"$temp_file\" index $i using 1:2:3 notitle with points lt 3 pt 3, ";
}
}
$arg_string = $arg_string =~ /^(.*),[ ]+$/;
$arg_string = $1;
$hardcopy_plot->gnuplot_cmd( "splot $arg_string" );
$gplot->gnuplot_cmd( "splot $arg_string" );
$gplot->gnuplot_pause(-1);
} else {
die "no visual displays for regression from more then 2 predictor vars";
}
}
sub DESTROY {
unlink glob "__temp_*";
}
############################################## Utility Routines ##########################################
# checks whether an element is in an array:
sub contained_in {
my $ele = shift;
my @array = @_;
my $count = 0;
map {$count++ if $ele eq $_} @array;
return $count;
}
sub minmax {
my $arr = shift;
my ($min, $max);
foreach my $i (0..@{$arr}-1) {
if ( (!defined $min) || ($arr->[$i] < $min) ) {
$min = $arr->[$i];
}
if ( (!defined $max) || ($arr->[$i] > $max) ) {
$max = $arr->[$i];
}
}
return ($min, $max);
}
sub sample_index {
my $arg = shift;
$arg =~ /_(.+)$/;
return $1;
}
sub check_for_illegal_params {
my @params = @_;
my @legal_params = qw / training_datafile
max_depth_desired
dependent_variable_column
predictor_columns
mse_threshold
need_data_normalization
jacobian_choice
csv_cleanup_needed
debug1_r
debug2_r
debug3_r
/;
my $found_match_flag;
foreach my $param (@params) {
foreach my $legal (@legal_params) {
$found_match_flag = 0;
if ($param eq $legal) {
$found_match_flag = 1;
last;
}
}
last if $found_match_flag == 0;
}
return $found_match_flag;
}
sub cleanup_csv {
my $line = shift;
$line =~ tr/\/:?()[]{}'/ /;
# my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;
for (@double_quoted) {
my $item = $_;
$item = substr($item, 1, -1);
$item =~ s/^\s+|,|\s+$//g;
$item = join '_', split /\s+/, $item;
substr($line, index($line, $_), length($_)) = $item;
}
my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
for (@white_spaced) {
my $item = $_;
$item =~ s/\s+/_/g;
$item =~ s/^\s*_|_\s*$//g;
substr($line, index($line, $_), length($_)) = $item;
}
$line =~ s/,\s*(?=,|$)/,NA/g;
return $line;
}
sub transpose {
my $matrix = shift;
my $num_rows = $matrix->rows();
my $num_cols = $matrix->cols();
my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
foreach my $i (0..$num_rows-1) {
my @row = $matrix->row($i)->as_list;
$transpose->set_col($i, \@row );
}
return $transpose;
}
sub vector_subtract {
my $vec1 = shift;
my $vec2 = shift;
die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
my @result;
foreach my $i (0..@$vec1-1){
push @result, $vec1->[$i] - $vec2->[$i];
}
return @result;
}
sub vector_norm {
my $vec = shift; # assume it to be a column vector
my ($rows, $cols) = $vec->dim;
die "vector_norm() can only be called for a single column matrix" if $cols > 1;
my @norm = (transpose($vec) * $vec)->as_list;
return sqrt($norm[0]);
}
sub display_matrix {
my $matrix = shift;
my $nrows = $matrix->rows();
my $ncols = $matrix->cols();
print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
foreach my $i (0..$nrows-1) {
my $row = $matrix->row($i);
my @row_as_list = $row->as_list;
map { printf("%.4f ", $_) } @row_as_list;
print "\n";
}
print "\n\n";
}
# Meant only for an array of strings (no nesting):
sub deep_copy_array {
my $ref_in = shift;
my $ref_out;
return [] if scalar @$ref_in == 0;
foreach my $i (0..@{$ref_in}-1) {
$ref_out->[$i] = $ref_in->[$i];
}
return $ref_out;
}
############################################# Class RTNode #############################################
# The nodes of a regression tree are instances of this class:
package RTNode;
use strict;
use Carp;
# $feature is the feature test at the current node. $branch_features_and_values is
# an anonymous array holding the feature names and corresponding values on the path
# from the root to the current node:
sub new {
my ($class, $feature, $error, $beta, $branch_features_and_values_or_thresholds, $rt, $root_or_not) = @_;
$root_or_not = '' if !defined $root_or_not;
if ($root_or_not eq 'root') {
$rt->{nodes_created} = -1;
$rt->{class_names} = undef;
}
my $self = {
_rt => $rt,
_feature => $feature,
_error => $error,
_beta => $beta,
_branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
_num_data_points => undef,
_XMatrix => undef,
_YVector => undef,
_linked_to => [],
};
bless $self, $class;
$self->{_serial_number} = $self->get_next_serial_num();
return $self;
}
sub node_prediction_from_features_and_values {
my $self = shift;
my $feature_and_values = shift;
my $ncols = $self->{_XMatrix}->cols;
my $pattern = '(\S+)\s*=\s*(\S+)';
my ($feature,$value);
my @Xlist;
foreach my $feature_name (@{$self->{_rt}->{_feature_names}}) {
foreach my $feature_and_value (@{$feature_and_values}) {
$feature_and_value =~ /$pattern/;
my ($feature, $value) = ($1, $2);
push @Xlist, $value if $feature_name eq $feature;
}
}
push @Xlist, 1;
my $dataMatrix = Math::GSL::Matrix->new(1, $ncols);
$dataMatrix->set_row(0, \@Xlist);
my $prediction = $dataMatrix * $self->get_node_beta();
return $prediction->get_elem(0,0);
}
sub node_prediction_from_data_as_matrix {
my $self = shift;
my $dataMatrix = shift;
my $prediction = $dataMatrix * $self->get_node_beta();
return $prediction->get_elem(0,0);
}
sub node_prediction_from_data_as_list {
my $self = shift;
my $data_as_list = shift;
my @data_arr = @{$data_as_list};
my $ncols = $self->{_XMatrix}->cols;
die "wrong number of elements in data list" if @data_arr != $ncols - 1;
push @data_arr, 1;
my $dataMatrix = Math::GSL::Matrix->new(1, $self->{_XMatrix}->cols);
my $prediction = $dataMatrix * $self->get_node_beta();
return $prediction->get_elem(0,0);
}
sub how_many_nodes {
my $self = shift;
return $self->{_rt}->{nodes_created} + 1;
}
sub get_num_data_points {
my $self = shift;
return $self->{_num_data_points};
}
sub set_num_data_points {
my $self = shift;
my $how_many = shift;
$self->{_num_data_points} = $how_many;
}
sub set_node_XMatrix {
my $self = shift;
my $xmatrix = shift;
$self->{_XMatrix} = $xmatrix;
}
sub get_node_XMatrix {
my $self = shift;
return $self->{_XMatrix};
}
sub set_node_YVector {
my $self = shift;
my $yvector = shift;
$self->{_YVector} = $yvector;
}
sub get_node_YVector {
my $self = shift;
return $self->{_YVector};
}
sub set_node_error {
my $self = shift;
my $error = shift;
$self->{_error} = $error;
}
sub get_node_error {
my $self = shift;
return $self->{_error};
}
sub set_node_beta {
my $self = shift;
my $beta = shift;
$self->{_beta} = $beta;
}
sub get_node_beta {
my $self = shift;
return $self->{_beta};
}
sub get_next_serial_num {
my $self = shift;
$self->{_rt}->{nodes_created} += 1;
return $self->{_rt}->{nodes_created};
}
sub get_serial_num {
my $self = shift;
$self->{_serial_number};
}
# this returns the feature test at the current node
sub get_feature {
my $self = shift;
return $self->{ _feature };
}
sub set_feature {
my $self = shift;
my $feature = shift;
$self->{_feature} = $feature;
}
sub get_branch_features_and_values_or_thresholds {
my $self = shift;
return $self->{_branch_features_and_values_or_thresholds};
}
sub get_children {
my $self = shift;
return $self->{_linked_to};
}
sub add_child_link {
my ($self, $new_node, ) = @_;
push @{$self->{_linked_to}}, $new_node;
}
sub delete_all_links {
my $self = shift;
$self->{_linked_to} = undef;
}
sub display_node {
my $self = shift;
my $feature_at_node = $self->get_feature() || " ";
my $serial_num = $self->get_serial_num();
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
print "\n\nNODE $serial_num" .
":\n Branch features and values to this node: @branch_features_and_values_or_thresholds" .
"\n Best feature test at current node: $feature_at_node\n\n";
$self->{_rt}->estimate_regression_coefficients($self->get_node_XMatrix(), $self->get_node_YVector(), 1);
}
sub display_regression_tree {
my $self = shift;
my $offset = shift;
my $serial_num = $self->get_serial_num();
if (@{$self->get_children()} > 0) {
my $feature_at_node = $self->get_feature() || " ";
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
print "$second_line_offset" . "Decision Feature: $feature_at_node\n\n";
$offset .= " ";
foreach my $child (@{$self->get_children()}) {
$child->display_regression_tree($offset);
}
} else {
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
}
}
1;