#UMLS::Association::Measures::Direct
# Computes the association between two sets of terms
# using Direct association, which is the association
# between sets A and C using direct co-occurrences
use lib '/home/henryst/UMLS-Association/lib';
use strict;
use warnings;
package UMLS::Association::Measures::Direct;
# Gets stats (n11,n1p,np1,npp) for each pairHash in the pairHashList
# using direct association
# Input:
#  $pairHashListRef - ref to an array of pairHashes
#  $matrixFileName - the fileName of the co-occurrence matrix
#  $noOrder - 0 if order is enforced, 1 if not (when true, co-occurrences
#             in either direction are counted)
# Output:
#  \@statsList - ref to an array of \@stats, refs to arrays
#                containing the ordered values: n11, n1p, np1, npp
#                for each of the pair hashes. The index of the
#                \@statsList corresponds to the index of the pairHash
#                in the input $pairHashListRef
sub getStats {
    my ($pairHashListRef, $matrixFileName, $noOrder) = @_;

    #read in the matrix of all values needed for all pair sets in the
    # pair hash list. readInMatrix also returns a second value (the
    # vocabulary size), which direct association does not need.
    # NOTE(review): this file does not load UMLS::Association::StatFinder
    # itself; presumably the caller has already loaded it -- confirm.
    my ($matrixRef) = UMLS::Association::StatFinder::readInMatrix(
        $pairHashListRef, $matrixFileName);

    #compute n1p, np1, and npp once for the whole matrix so they can be
    # reused for every pair hash
    my ($n1pRef, $np1Ref, $npp) = _getAllCounts($matrixRef);

    #compute n11,n1p,np1,npp for each pair hash and place the result
    # into the statsList, a parallel array to the pairHashList
    my @statsList = ();
    foreach my $pairHashRef (@{$pairHashListRef}) {
        push @statsList,
            _statsFromAllCounts($matrixRef, $n1pRef, $np1Ref, $npp,
                                $noOrder, $pairHashRef);
    }

    #each entry is an array ref holding n11, n1p, np1, and npp for the
    # pair hash at the corresponding index in the pairHashList
    return \@statsList;
}
# Tallies the marginal totals of the co-occurrence matrix in one pass
# Input:
#  $subGraphRef - ref to the subgraph or matrix read in, a hash of
#                 hashes where {$cui1}{$cui2} holds a count
# Output:
#  \%rowTotals - ref to a hash{$cui}=n1p, the sum of all counts where
#                that cui is the first key (order enforced)
#  \%colTotals - ref to a hash{$cui}=np1, the sum of all counts where
#                that cui is the second key (order enforced)
#  $grandTotal - npp, the sum of every count in the matrix
sub _getAllCounts {
    my ($subGraphRef) = @_;

    my (%rowTotals, %colTotals);
    my $grandTotal = 0;

    #visit every cell once, crediting its count to the row cui, the
    # column cui, and the overall total
    foreach my $rowCui (keys %{$subGraphRef}) {
        foreach my $colCui (keys %{ $subGraphRef->{$rowCui} }) {
            my $count = $subGraphRef->{$rowCui}{$colCui};
            $rowTotals{$rowCui} += $count;
            $colTotals{$colCui} += $count;
            $grandTotal         += $count;
        }
    }

    return (\%rowTotals, \%colTotals, $grandTotal);
}
# Computes n11, n1p, np1, and npp for the pairHash using
# the allCounts calculated from the _getAllCounts function
# Input:
#  $subGraphRef - ref to the subgraph or matrix read in, a hash of
#                 hashes where {$cui1}{$cui2} holds a count
#  $n1pRef - ref to a hash{$cui}=n1p for that cui, order enforced
#  $np1Ref - ref to a hash{$cui}=np1 for that cui, order enforced
#  $npp - npp for the subGraphRef
#  $noOrder - 0 if order is enforced, 1 if not
#  $pairHashRef - ref to a pairHash; its 'set1' and 'set2' entries are
#                 refs to arrays of cuis (a missing set is treated as empty)
# Output:
#  \@stats - ref to an array of (n11,n1p,np1,npp)
# Neither the matrix nor the pairHash is modified. Rows are fetched
# with a single-level lookup and checked before descending, because a
# nested lookup such as $subGraphRef->{$a}{$b} would autovivify an
# empty row for every cui that is not in the matrix.
sub _statsFromAllCounts {
    my ($subGraphRef, $n1pRef, $np1Ref, $npp, $noOrder, $pairHashRef) = @_;

    my $set1Ref = $pairHashRef->{'set1'} || [];
    my $set2Ref = $pairHashRef->{'set2'} || [];

    #NOTE: finding n11 is the bottleneck, it visits every set1 x set2 pair
    # NOTE(review): if two different cuis each appear in both set1 and
    # set2, the unordered n11 counts their cells once per pairing, as
    # it always has -- confirm that is intended.
    ############ calculate n11
    my $n11 = 0;
    foreach my $key1 (@{$set1Ref}) {
        my $rowRef = $subGraphRef->{$key1};
        foreach my $key2 (@{$set2Ref}) {
            if (defined $rowRef && defined $rowRef->{$key2}) {
                $n11 += $rowRef->{$key2};
            }
            #without order the reverse direction counts too. A cui paired
            # with itself is skipped, since the reverse cell is the same
            # cell and would otherwise be counted twice
            if ($noOrder && $key1 ne $key2) {
                my $reverseRowRef = $subGraphRef->{$key2};
                if (defined $reverseRowRef && defined $reverseRowRef->{$key1}) {
                    $n11 += $reverseRowRef->{$key1};
                }
            }
        }
    }

    ############ calculate n1p and np1
    #n1p uses the row totals of set1 (plus the column totals when order
    # is not enforced); np1 is the mirror image for set2
    my $n1p = _setMarginal($subGraphRef, $set1Ref, $n1pRef, $np1Ref, $noOrder);
    my $np1 = _setMarginal($subGraphRef, $set2Ref, $np1Ref, $n1pRef, $noOrder);

    #pack and return the stats
    my @stats = ($n11, $n1p, $np1, $npp);
    return \@stats;
}

# Sums the marginal count of a set of cuis
# Input:
#  $subGraphRef - ref to the subgraph or matrix read in
#  $setRef - ref to an array of cuis
#  $ownTotalsRef - ref to a hash{$cui}=total that is always added
#  $otherTotalsRef - ref to a hash{$cui}=total for the opposite
#                    direction, added only when $noOrder is true
#  $noOrder - 0 if order is enforced, 1 if not
# Output:
#  $total - the marginal count for the set
sub _setMarginal {
    my ($subGraphRef, $setRef, $ownTotalsRef, $otherTotalsRef, $noOrder) = @_;

    my $total = 0;
    foreach my $cui (@{$setRef}) {
        if (defined $ownTotalsRef->{$cui}) {
            $total += $ownTotalsRef->{$cui};
        }
        if ($noOrder && defined $otherTotalsRef->{$cui}) {
            $total += $otherTotalsRef->{$cui};
        }
    }

    #remove noorder double counts: a cell whose two cuis are both in the
    # set was added once through each direction's total, so take one back
    if ($noOrder) {
        foreach my $key1 (@{$setRef}) {
            my $rowRef = $subGraphRef->{$key1};
            next if !defined $rowRef;
            foreach my $key2 (@{$setRef}) {
                if (defined $rowRef->{$key2}) {
                    $total -= $rowRef->{$key2};
                }
            }
        }
    }

    return $total;
}
1;