#UMLS::Association::Measures::Direct # Computes the association between two sets of terms # using Direct association, which is the association # between sets A and C using direct co-occurrences use lib '/home/henryst/UMLS-Association/lib'; use strict; use warnings; package UMLS::Association::Measures::Direct; # Gets stats (n11,n1p,np1,npp) for each pairHash in the pairHashList # using direct association # Input: # $pairHashListRef - ref to an array of pairHashes # $matrixFileName - the fileName of the co-occurrence matrix # $noOrder - 1 if order is enforced, 0 if not # Output: # \@statsList - ref to an array of \@stats, refs to arrays # containing the ordered values: n11, n1p, np1, npp # for each of the pair hashes. The index of the # \@statsList corresponds to the index of the pairHash # in the input $pairHashListRef sub getStats { my $pairHashListRef = shift; my $matrixFileName = shift; my $noOrder = shift; #read in the matrix of all values needed for all # pair sets in the pair hash list my ($matrixRef, $vocabSize) = &UMLS::Association::StatFinder::readInMatrix($pairHashListRef, $matrixFileName); #compute n1p,np1, and npp for all values my ($n1pRef, $np1Ref, $npp) = &_getAllCounts($matrixRef); #compute n11,n1p,np1,npp for all pair hashes # and place into the statsList, a parallel array # of stats for that pair hash my @statsList = (); foreach my $pairHashRef (@{$pairHashListRef}) { push @statsList, &_statsFromAllCounts($matrixRef, $n1pRef, $np1Ref, $npp, $noOrder, $pairHashRef); } #return the stats list, an array of array refs # each array ref contains four values: # n11, n1p, np1, and npp for the pair hash at # the corresponding index in the pairHashList return \@statsList; } # Computes n1p, np1, and npp for every CUI in the subgraph # Input: # $subGraphRef - ref to the subgraph or matrix read in # Output: # \%n1p - ref to a hash{$cui}=n1p for that cui, order enforced # \%np1 - ref to a hash{$cui}=np1 for that cui, order enforced # $npp - npp for the subGraphRef sub _getAllCounts { my $subGraphRef = shift; #find stats by iterating over all keys my %n1p = (); my %np1 = (); my $npp = 0; foreach my $key1 (keys %{$subGraphRef}) { foreach my $key2 (keys %{${$subGraphRef}{$key1}}) { #grab the value from the sub graph my $value = ${${$subGraphRef}{$key1}}{$key2}; $n1p{$key1} += $value; $np1{$key2} += $value; $npp += $value; } } return \%n1p, \%np1, $npp; } # Computes n11, n1p, np1,and npp for the pairHash using # the allCounts calculated from the _getAllCounts function # Input: # $subGraphRef - ref to the subgraph or matrix read in # $n1pRef - ref to a hash{$cui}=n1p for that cui, order enforced # $np1Ref - ref to a hash{$cui}=np1 for that cui, order enforced # $npp - npp for the subGraphRef # $noOrder - 0 if order is enforced, 1 if not # $pairHashRef - ref to a pairHash # Output: # \@stats - ref to an array of (n11,n1p,np1,npp) sub _statsFromAllCounts { my $subGraphRef = shift; my $n1pRef = shift; my $np1Ref = shift; my $npp = shift; my $noOrder = shift; my $pairHashRef = shift; #NOTE: finding N11 is the bottleneck, but I don't think there is much I can do about it #find stats by iterating over all keys ############ calculate n11 my $n11 = 0; foreach my $key1 (@{${$pairHashRef}{'set1'}}) { foreach my $key2 (@{${$pairHashRef}{'set2'}}) { if (defined ${${$subGraphRef}{$key1}}{$key2}) { $n11 += ${${$subGraphRef}{$key1}}{$key2}; } if ($noOrder && defined ${${$subGraphRef}{$key2}}{$key1}) { $n11 += ${${$subGraphRef}{$key2}}{$key1}; } } } #remove noorder double counts (nodes pointing at themselves) if ($noOrder) { foreach my $key1 (@{${$pairHashRef}{'set1'}}) { if (exists ${${$subGraphRef}{$key1}}{$key1}) { #remove double counts, only if the key is in key2's set foreach my $key2 (@{${$pairHashRef}{'set2'}}) { if ($key1 eq $key2) { $n11 -= ${${$subGraphRef}{$key1}}{$key1}; } } } } } ################################## ############## calculate n1p my $n1p = 0; foreach my $key1 (@{${$pairHashRef}{'set1'}}) { #calculate n1p if (defined ${$n1pRef}{$key1}) { $n1p += ${$n1pRef}{$key1}; } if ($noOrder && defined ${$np1Ref}{$key1}) { $n1p += ${$np1Ref}{$key1}; } } #remove noorder double counts if ($noOrder) { foreach my $key1 (@{${$pairHashRef}{'set1'}}) { foreach my $key2 (@{${$pairHashRef}{'set1'}}) { if (defined ${${$subGraphRef}{$key1}}{$key2}) { $n1p -= ${${$subGraphRef}{$key1}}{$key2}; } } } } ##################################### ############## #calculate np1 my $np1 = 0; foreach my $key2 (@{${$pairHashRef}{'set2'}}) { #calculate np1 if (defined ${$np1Ref}{$key2}) { $np1 += ${$np1Ref}{$key2}; } if ($noOrder && defined ${$n1pRef}{$key2}) { $np1 += ${$n1pRef}{$key2}; } } #remove noorder double counts if ($noOrder) { foreach my $key1 (@{${$pairHashRef}{'set2'}}) { foreach my $key2 (@{${$pairHashRef}{'set2'}}) { if (defined ${${$subGraphRef}{$key1}}{$key2}) { $np1 -= ${${$subGraphRef}{$key1}}{$key2}; } } } } ############################## #pack and return the stats my @stats = ($n11, $n1p, $np1, $npp); return \@stats; } 1;