package Data::CosineSimilarity;
use strict;
use warnings;
our $VERSION = 0.02;
=head1 NAME
Data::CosineSimilarity - Compute the Cosine Similarity
=head1 SYNOPSIS
$cs = Data::CosineSimilarity->new;
$cs->add( label1 => { feature1 => 3, feature2 => 1, feature3 => 10 } );
$cs->add( label2 => ... );
$cs->add( label3 => ... );
# computes the cosine similarity
my $r = $cs->similarity( 'label1', 'label2' );
# the result object
my $cosine = $r->cosine;
my $radian = $r->radian;
my $degree = $r->degree;
my ($label1, $label2) = $r->labels;
# computes all the cosine similarity between 'label1' and the others.
my @all = $cs->all_for_label('label1');
# computes all, and returns the best
my ($best_label, $r) = $cs->best_for_label('label2');
# computes all, and returns the worst
my ($worst_label, $r) = $cs->worst_for_label('label2');
=head1 DESCRIPTION
Compute the cosine similarities between a set of vectors.
=head2 $class->new( %opts )
If all the feature vectors are normed then the computation of the cosine
becomes just the dot product of the vectors. In this case, specify the
option normed => 1, the performance will be greatly improved.
=cut
sub new {
my $class = shift;
my %opts = @_;
return bless {
normed => $opts{normed} ? 1 : 0,
labels => {},
}, $class;
}
=head2 $self->add( label => $features )
=cut
sub add {
my $self = shift;
my ($label, $features) = @_;
die 'label required' unless $label;
die 'features required' unless $features;
die 'features must be a hashref'
unless ref $features eq 'HASH';
die 'features must contain terms'
unless keys %$features;
my $norm = $self->{normed} ? 1 : _euclidean_norm($features);
die 'euclidean norm is null' if $norm == 0;
$self->{labels}{$label} = {
features => $features,
norm => $norm,
};
}
sub _euclidean_norm {
my ($features) = @_;
my $sum = 0;
$sum += $_**2 for values %$features;
return sqrt $sum;
}
sub _scalar_product {
my ($features1, $features2) = @_;
my $product = 0;
for (keys %$features1) {
my $c1 = $features1->{$_};
my $c2 = $features2->{$_} or next;
$product += $c1 * $c2;
}
return $product;
}
=head2 $self->similarity( $label1, $label2 )
=cut
sub similarity {
my $self = shift;
my ($label1, $label2) = @_;
my $product = _scalar_product(
$self->{labels}{$label1}{features},
$self->{labels}{$label2}{features}
);
my $cosine;
if ($self->{normed}) {
$cosine = $product;
}
else {
$cosine = $product / ( $self->{labels}{$label1}{norm} * $self->{labels}{$label2}{norm} );
}
return Data::CosineSimilarity::Result->_new(
labels => [ $label1, $label2 ],
cosine => $cosine,
);
}
=head2 $self->all_for_label( $label )
=cut
sub all_for_label {
my $self = shift;
my ($label) = @_;
my @result;
for (keys %{ $self->{labels} }) {
next if $_ eq $label;
push @result, $self->similarity($label, $_);
}
return sort { $b->cosine <=> $a->cosine } @result;
}
=head2 $self->best_for_label( $label )
=cut
sub best_for_label {
my $self = shift;
my ($label) = @_;
my @sorted = $self->all_for_label($label);
my $r = shift @sorted;
my (undef, $best) = $r->labels;
return ($best, $r);
}
=head2 $self->worst_for_label( $label )
=cut
sub worst_for_label {
my $self = shift;
my ($label) = @_;
my @sorted = $self->all_for_label($label);
my $r = pop @sorted;
my (undef, $worst) = $r->labels;
return ($worst, $r);
}
package Data::CosineSimilarity::Result;
use strict;
use warnings;
use Math::Trig;
sub _new {
my $class = shift;
my %args = @_;
return bless \%args, $class;
}
sub labels { @{ $_[0]->{labels} } }
sub cosine { $_[0]->{cosine} }
sub radian { acos( $_[0]->cosine ) }
sub degree { rad2deg( $_[0]->radian ) }
=head1 AUTHOR
Antoine Imbert, C<< <antoine.imbert at gmail.com> >>
=head1 LICENSE AND COPYRIGHT
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
=cut
1;