our
@EXPORT
=
qw(hd_fast jaccard_similarity jaccard_similarity_formatted estimate_hamming_stats z_score p_value_from_z_score _p_value add_stats)
;
my
$inline_dir
= catdir( File::HomeDir->my_home,
'.Inline'
);
unless
( -d
$inline_dir
) {
make_path(
$inline_dir
) or
die
"Cannot create directory $inline_dir: $!"
;
}
use
Inline
C
=>
Config
=>
directory
=>
$inline_dir
;
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
SV* c_jaccard_similarity(char* str1, char* str2, int length) {
int union_count = 0, intersection = 0, i;
for(i = 0; i < length; i++){
if(str1[i] == '1' || str2[i] == '1'){
union_count++;
if(str1[i] == '1' && str2[i] == '1')
intersection++;
}
}
double similarity = union_count ? ((double)intersection) / union_count : 0.0;
/* Create a new array (AV) and push the two results */
AV* av = newAV();
av_push(av, newSVnv(similarity)); /* push similarity (double) */
av_push(av, newSViv(intersection)); /* push intersection (int) */
/* Wrap the array in a reference and increment the reference count.
Do not call sv_2mortal on the resulting RV. */
SV* rv = newRV_inc((SV*)av);
return rv;
}
/*
This function computes the Hamming distance between two strings,
assuming they are both composed of '0' and '1' characters and have equal length.
It is defined as c_hd_fast to avoid a naming conflict with the Perl wrapper.
*/
int c_hd_fast(char* s1, char* s2, int len) {
int diff = 0;
int i;
for(i = 0; i < len; i++){
if (s1[i] != s2[i])
diff++;
}
return diff;
}
END_C
sub
hd_fast {
my
(
$s1
,
$s2
) =
@_
;
die
"Strings must be the same length"
if
length
(
$s1
) !=
length
(
$s2
);
return
c_hd_fast(
$s1
,
$s2
,
length
(
$s1
) );
}
sub
_hd_fast {
return
(
$_
[0] ^
$_
[1] ) =~
tr
/\001-\255//;
}
sub
jaccard_similarity {
my
(
$str1
,
$str2
) =
@_
;
my
$len
=
length
(
$str1
);
die
"Strings must be of equal length"
if
$len
!=
length
(
$str2
);
my
(
$jaccard
,
$intersection
) =
@{ c_jaccard_similarity(
$str1
,
$str2
,
$len
) };
return
(
$jaccard
,
$intersection
);
}
sub
_jaccard_similarity {
my
(
$str1
,
$str2
) =
@_
;
my
$len
=
length
(
$str1
);
die
"Strings must be of equal length"
if
$len
!=
length
(
$str2
);
my
(
$intersection
,
$union
) = ( 0, 0 );
for
my
$i
( 0 ..
$len
- 1 ) {
my
$b1
=
vec
(
$str1
,
$i
, 8 );
my
$b2
=
vec
(
$str2
,
$i
, 8 );
if
(
$b1
==
ord
(
'1'
) ||
$b2
==
ord
(
'1'
) ) {
$union
++;
$intersection
++
if
(
$b1
==
ord
(
'1'
) &&
$b2
==
ord
(
'1'
) );
}
}
return
$union
== 0 ? ( 0, 0 ) : (
$intersection
/
$union
,
$intersection
);
}
sub
jaccard_similarity_formatted {
my
(
$result
,
undef
) = jaccard_similarity(
@_
);
return
sprintf
(
"%.6f"
,
$result
);
}
sub
estimate_hamming_stats {
my
$length
=
shift
;
my
$probability_mismatch
= 0.5;
my
$estimated_average
=
$length
*
$probability_mismatch
;
my
$estimated_std_dev
=
sqrt
(
$length
*
$probability_mismatch
* ( 1 -
$probability_mismatch
) );
return
$estimated_average
,
$estimated_std_dev
;
}
sub
z_score {
my
(
$observed_value
,
$expected_value
,
$std_dev
) =
@_
;
return
0
if
$std_dev
== 0;
return
(
$observed_value
-
$expected_value
) /
$std_dev
;
}
sub
p_value_from_z_score {
return
pnorm(
shift
)
}
sub
add_stats {
my
$array
=
shift
;
my
%hash_out
;
my
$stat
= Statistics::Descriptive::Full->new();
$stat
->add_data(
$array
);
$hash_out
{mean} =
$stat
->mean();
$hash_out
{sd} =
$stat
->standard_deviation();
$hash_out
{count} =
$stat
->count();
$hash_out
{per25} =
$stat
->percentile(25);
$hash_out
{per75} =
$stat
->percentile(75);
$hash_out
{min} =
$stat
->min();
$hash_out
{max} =
$stat
->max();
$hash_out
{median} =
$stat
->median();
$hash_out
{sum} =
$stat
->sum();
return
\
%hash_out
;
}
1;