Perl x Open Food Facts Hackathon: Paris, France - May 24-25 Learn more

#!/usr/bin/perl -w
# This script is for testing Sereal decode speeds, with various
# generated test inputs (which are first encoded). Sample usages:
#
# decode.pl --build --output=data.srl
#
# will (1) build a "graph" (a hash of small strings, really,
# which can be seen as an adjacency list representation of
# a graph, the vertex and its neighbors) of 1e5 vertices
# (2) decode the encoded blob 5 times (the 'graph', 1e5, and 5
# being the defaults).
#
# Other inputs types (--type=T) are
# aoi (array of int) (value == key)
# aoir (array of int) (value == randomly shuffled key)
# aof (array of float) (rand())
# aos (array of string) (value eq key)
# hoi (hash of int)
# hof (hash of float)
# hos (hash of string)
#
# The 'base' number of elements in each case is controlled by --elem=N.
# For the array and hash the number of elements is trivial, for the graph
# the total number of elements (in its hash-of-hashes) is O(N log N).
#
# The number decode repeats is controlled by --repeat_decode=N and --repeat_decode=N.
#
# The encode input needs to be built only once, the --output tells
# where to save the encoded blob. The encode blob can be read back
# from the save file with --input, much faster, especially in the case
# of the graph input.
use strict;
use Fcntl qw[O_RDONLY O_WRONLY O_CREAT O_TRUNC];
use List::Util qw[shuffle];
sub MB () { 2**20 }
my %Opt;
my @Opt= (
'input=s', 'output=s', 'type=s', 'elem=f', 'build',
'repeat_encode=i', 'repeat_decode=i',
# If non-zero, will drop the minimum and maximum
# values before computing statistics IF the number
# of measurements is at least this limit. So with
# a value of 5 will leave 3 measurements. Lowers
# the stddev, should not affect avg/median (much).
# Helpful in reducing cache effects.
'min_max_drop_limit=i',
'size'
);
my %OptO= map { my ($n)= /^(\w+)/; $_ => \$Opt{$n} } @Opt;
my @OptU= map { "--$_" } @Opt;
GetOptions(%OptO) or die "GetOptions: @OptU\n";
my $data;
my $blob;
my $size;
my $data_size;
my $blob_size;
my $dt;
if ( defined $Opt{size} ) {
eval 'use Devel::Size qw[total_size]';
if ($@) {
die "$0: --size but Devel::Size=total_size not found\n";
}
}
if ( defined $Opt{build} ) {
die "$0: --input with --build makes no sense\n" if defined $Opt{input};
$Opt{elem} //= 1e5;
}
else {
die "$0: --output without --build makes no sense\n" if defined $Opt{output};
die "$0: --elem without --build makes no sense\n" if defined $Opt{elem};
die "$0: Must specify either --build or --input\n" unless defined $Opt{input};
}
if ( defined( $Opt{output} ) ) {
die "$0: --input with --output makes no sense\n" if defined $Opt{input};
}
$Opt{type} //= 'graph';
$Opt{repeat_encode} //= 1;
$Opt{repeat_decode} //= 5;
$Opt{min_max_drop_limit} //= 0;
my %TYPE= map { $_ => 1 } qw[aoi aoir aof aos hoi hof hos graph];
die "$0: Unexpected --type=$Opt{type}\n$0: Expected --type=@{[join('|', sort keys %TYPE)]}\n"
unless exists $TYPE{ $Opt{type} };
sub Times::new {
my $t= Time::HiRes::time();
my ( $u, $s, $cu, $cs )= times();
bless {
wall => $t,
usr => $u,
sys => $s,
cpu => $u + $s,
cusr => $cu,
csys => $cs,
},
$_[0];
}
sub Times::diff {
die "Unexpected diff(@_)\n" unless ref $_[0] eq ref $_[1];
bless { map { $_ => ( $_[0]->{$_} - $_[1]->{$_} ) } keys %{ $_[0] } }, ref $_[0];
}
sub Times::wall { $_[0]->{wall} }
sub Times::usr { $_[0]->{usr} }
sub Times::sys { $_[0]->{sys} }
sub Times::cpu { $_[0]->{cpu} }
# times() can often sum just a tad higher than wallclock.
sub Times::pct { 100 * ( $_[0]->cpu > $_[0]->wall ? 1 : $_[0]->cpu / $_[0]->wall ) }
sub timeit {
my $code= shift;
my $t0= Times->new();
my @res= $code->(@_);
my $t1= Times->new();
my $dt= $t1->diff($t0);
return $dt;
}
sub __stats {
# The caller is supposed to have done this sorting
# already, but let's be wasteful and paranoid.
my @v= sort { $a <=> $b } @_;
my $min= $v[0];
my $max= $v[-1];
my $med= @v % 2 ? $v[ @v / 2 ] : ( $v[ @v / 2 - 1 ] + $v[ @v / 2 ] ) / 2;
my $sum= 0;
for my $t (@_) {
$sum += $t;
}
my $avg= $sum / @_;
my $sqsum= 0;
for my $t (@_) {
$sqsum += ( $avg - $t )**2;
}
my $stddev= sqrt( $sqsum / @_ );
return (
avg => $avg,
stddev => $stddev,
rstddev => $avg ? $stddev / $avg : undef,
min => $min, med => $med, max => $max
);
}
sub stats {
my %stats;
for my $k (qw(wall cpu)) {
my @v= sort { $a <=> $b } map { $_->{$k} } @_;
if ( $Opt{min_max_drop_limit} > 0
&& @v >= $Opt{min_max_drop_limit} )
{
print "$k: dropping min and max ($v[0] and $v[-1])\n";
shift @v;
pop @v;
}
$stats{$k}= { __stats(@v) };
}
return %stats;
}
if ( defined $Opt{build} ) {
print "building data\n";
my $E;
if ( $Opt{type} eq 'graph' ) {
print "building graph\n";
my $V= $Opt{elem};
$E= int( $V * log($V) / log(2) );
printf(
"data of %d (%.1fM) vertices %d (%.1fM) edges\n",
$V, $V / MB, $E, $E / MB
);
$dt= timeit(
sub {
for my $i ( 1 .. $E ) {
my $a= int( rand($V) );
my $b= int( rand($V) );
$data->{$a}{$b}++;
}
} );
}
elsif ( $Opt{type} eq 'aoi' ) {
print "building aoi\n";
$E= $Opt{elem};
$dt= timeit(
sub {
for my $i ( 1 .. $E ) {
push @$data, $i;
}
} );
}
elsif ( $Opt{type} eq 'aoir' ) {
print "building aoir\n";
$E= $Opt{elem};
$dt= timeit(
sub {
for my $i ( shuffle 1 .. $E ) {
push @$data, $i;
}
} );
}
elsif ( $Opt{type} eq 'aof' ) {
print "building aof\n";
$E= $Opt{elem};
$dt= timeit(
sub {
for my $i ( 1 .. $E ) {
push @$data, rand();
}
} );
}
elsif ( $Opt{type} eq 'aos' ) {
print "building aos\n";
$E= $Opt{elem};
$dt= timeit(
sub {
for my $i ( 1 .. $E ) {
push @$data, rand() . $$;
}
} );
}
elsif ( $Opt{type} eq 'hoi' ) {
print "building hoi\n";
$E= $Opt{elem};
$dt= timeit(
sub {
for my $i ( 1 .. $E ) {
$data->{$i}= $i;
}
} );
}
elsif ( $Opt{type} eq 'hof' ) {
print "building hof\n";
$E= $Opt{elem};
$dt= timeit(
sub {
for my $i ( 1 .. $E ) {
$data->{$i}= rand();
}
} );
}
elsif ( $Opt{type} eq 'hos' ) {
print "building hos\n";
$E= $Opt{elem};
$dt= timeit(
sub {
for my $i ( 1 .. $E ) {
$data->{$i}= "$i";
}
} );
}
else {
die "$0: Unexpected type '$Opt{type}'\n";
}
printf(
"build %.2f sec %.2f usr %.2f sys %.2f cpu %3d%% (%.1f elements/sec)\n",
$dt->wall, $dt->usr, $dt->sys, $dt->cpu, $dt->pct, $E / $dt->wall
);
if ( $Opt{size} ) {
$dt= timeit( sub { $data_size= total_size($data); } );
printf(
"data size %d bytes (%.1fMB) %.1f sec\n",
$data_size, $data_size / MB, $dt->wall
);
}
my $encoder= Sereal::Encoder->new;
{
print "encoding data\n";
my @dt;
for my $i ( 1 .. $Opt{repeat_encode} ) {
$dt= timeit( sub { $blob= $encoder->encode($data); } );
$blob_size= length($blob);
printf(
"%d/%d: encode to %d bytes (%.1fMB) %.2f sec %.2f usr %.2f sys %.2f cpu %3d%% (%.1f MB/sec)\n",
$i, $Opt{repeat_encode}, $blob_size, $blob_size / MB, $dt->wall, $dt->usr,
$dt->sys, $dt->cpu, $dt->pct,
$blob_size / ( MB * $dt->wall ) );
push @dt, $dt;
}
if (@dt) {
my %stats= stats(@dt);
for my $k (qw(wall cpu)) {
my $avg= $stats{$k}{avg};
printf(
"encode %-4s avg %.2f sec (%.1f MB/sec) stddev %.2f sec (%.2f) min %.2f med %.2f max %.2f\n",
$k,
$avg, $avg ? $blob_size / ( MB * $avg ) : 0, $stats{$k}{stddev},
$avg ? $stats{$k}{rstddev} : 0,
$stats{$k}{min}, $stats{$k}{med}, $stats{$k}{max} );
}
}
}
if ( defined $Opt{output} ) {
print "opening output\n";
my $fh;
sysopen( $fh, $Opt{output}, O_WRONLY | O_CREAT | O_TRUNC )
or die qq[sysopen "$Opt{output}": $!\n];
print "writing blob\n";
$dt= timeit(
sub {
syswrite( $fh, $blob )
or die qq[syswrite "$Opt{otput}": $!\n];
} );
$blob_size= length($blob);
printf(
"wrote %d bytes (%.1f MB) %.2f sec %.2f usr %.2f sys %.2f cpu %3d%% (%.1f MB/sec)\n",
$blob_size, $blob_size / MB, $dt->wall, $dt->usr, $dt->sys, $dt->cpu, $dt->pct,
$blob_size / ( MB * $dt->wall ) );
}
}
elsif ( defined $Opt{input} ) {
print "opening input\n";
my $fh;
sysopen( $fh, $Opt{input}, O_RDONLY ) or die qq[sysopen "$Opt{input}": $!\n];
print "reading blob\n";
$dt= timeit(
sub {
sysread( $fh, $blob, -s $fh )
or die qq[sysread "$Opt{input}": $!\n];
} );
$blob_size= length($blob);
printf(
"read %d bytes (%.1f MB) %.2f sec %.2f usr %.2f sys %.2f cpu %3d%% (%.1f MB/sec)\n",
$blob_size, $blob_size / MB, $dt->wall, $dt->usr, $dt->sys, $dt->cpu, $dt->pct,
$blob_size / ( MB * $dt->wall ) );
}
my $decoder= Sereal::Decoder->new;
{
print "decoding blob\n";
$blob_size= length($blob);
my @dt;
for my $i ( 1 .. $Opt{repeat_decode} ) {
$dt= timeit( sub { $data= $decoder->decode($blob); } );
printf(
"%d/%d: decode from %d bytes (%.1fM) %.2f sec %.2f usr %.2f sys %.2f cpu %3d%% (%.1f MB/sec)\n",
$i, $Opt{repeat_decode}, $blob_size, $blob_size / MB,
$dt->wall, $dt->usr, $dt->sys, $dt->cpu, $dt->pct, $blob_size / ( MB * $dt->wall ) );
push @dt, $dt;
}
if ( ref $data eq 'HASH' ) {
printf( "data is hashref of %d elements\n", scalar keys %{$data} );
}
elsif ( ref $data eq 'ARRAY' ) {
printf( "data is hashref of %d elements\n", scalar @{$data} );
}
elsif ( ref $data ) {
printf( "data is ref of %s\n", ref $data );
}
else {
printf("data is of unexpected type\n");
}
if (@dt) {
my %stats= stats(@dt);
for my $k (qw(wall cpu)) {
my $avg= $stats{$k}{avg};
printf(
"decode %-4s avg %.2f sec (%.1f MB/sec) stddev %.2f sec (%.2f) min %.2f med %.2f max %.2f\n",
$k,
$avg, $avg ? $blob_size / ( MB * $stats{$k}{avg} ) : 0, $stats{$k}{stddev},
$avg ? $stats{$k}{rstddev} : 0,
$stats{$k}{min}, $stats{$k}{med}, $stats{$k}{max} );
}
}
if ( $Opt{size} ) {
$dt= timeit( sub { $data_size= total_size($data); } );
printf(
"data size %d bytes (%.1fMB) %.1f sec\n",
$data_size, $data_size / MB, $dt->wall
);
}
}
if ( $Opt{size} ) {
if ( $blob_size && $data_size ) {
printf( "data size / blob size %.2f\n", $data_size / $blob_size );
}
}
exit(0);