————————————package
Pg::Explain::StringAnonymizer;
# UTF8 boilerplace, per http://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default/
use
v5.18;
use
strict;
use
warnings;
use
utf8;
use
open
qw( :std :utf8 )
;
use
Unicode::Collate;
if
(
grep
/\P{ASCII}/ =>
@ARGV
) {
@ARGV
=
map
{ decode(
'UTF-8'
,
$_
) }
@ARGV
;
}
# UTF8 boilerplace, per http://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default/
use
Carp;
=head1 NAME
Pg::Explain::StringAnonymizer - Class to anonymize sets of strings
=head1 VERSION
Version 2.9
=cut
our
$VERSION
=
'2.9'
;
=head1 SYNOPSIS
This module provides a way to turn defined set of strings into anonymized version of it, that has 4 properties:
=over
=item * the same original string should give the same output string (within the same input set)
=item * strings shouldn't be very long
=item * it shouldn't be possible to reverse the operation
=item * generated strings should be easy to read, and easy to distinguish between themselves.
=back
Points first and third can be done easily with some hashing function (md5, sha), but generated hashes violate fourth point, and sometimes also second.
Example of usage:
my $anonymizer = Pg::Explain::StringAnonymizer->new();
$anonymizer->add( 'a', 'b', 'c');
$anonymizer->add( 'depesz' );
$anonymizer->add( [ "any strings, "are possible" ] );
$anonymizer->finalize();
print $anonymizer->anonymized( 'a' ), "\n";
my $full_dictionary = $anonymizer->anonymization_dictionary();
=head1 METHODS
=head2 new
Object constructor, doesn't take any arguments.
=cut
sub
new {
my
$class
=
shift
;
my
$self
=
bless
{},
$class
;
$self
->{
'strings'
} = {};
return
$self
;
}
=head2 add
Adds new string(s) to anonymization list.
Strings can be given either as list of ArrayRef.
It is important to note, that one cannot add() more elements to anonymized set after finalization (call to finalize() method).
If such call will be made (add() after finalize()) it will raise exception.
=cut
sub
add {
my
$self
=
shift
;
croak(
"Cannot run ->add() after finalization.\n"
)
if
$self
->{
'is_finalized'
};
my
@input
=
@_
;
@input
= @{
$input
[ 0 ] }
if
'ARRAY'
eq
ref
(
$input
[ 0 ] );
for
my
$string
(
@input
) {
next
if
$self
->{
'strings'
}->{
$string
};
$self
->{
'strings'
}->{
$string
} =
$self
->_hash(
$string
);
}
return
;
}
=head2 finalize
Finalizes string set creation, and creates anonymized versions.
It has to be called after some number of add() calls, so that it will have something to work on.
After running finalize() one cannot add() more string.
Also, before finalize() you cannot run anonymized() or anonymization_dictionary() methods.
=cut
sub
finalize {
my
$self
=
shift
;
return
if
$self
->{
'is_finalized'
};
$self
->{
'is_finalized'
} = 1;
$self
->_make_prefixes(
'keys'
=> [
keys
%{
$self
->{
'strings'
} } ],
'level'
=> 0,
);
$self
->_stringify();
my
@keys_sorted
=
sort
{
length
(
$b
) <=>
length
(
$a
) }
keys
%{
$self
->{
'strings'
} };
$self
->{
'keys_re'
} =
join
'|'
,
map
{
qr{\Q$_\E}
}
@keys_sorted
;
return
;
}
=head2 anonymized
Returns anonymized version of given string, or undef if the string wasn't previously added to anonymization set.
If it will be called before finalize() it will raise exception.
=cut
sub
anonymized {
my
$self
=
shift
;
croak(
"Cannot run ->anonymized() before finalization.\n"
)
unless
$self
->{
'is_finalized'
};
my
$input
=
shift
;
return
$self
->{
'strings'
}->{
$input
};
}
=head2 anonymize_text
Anonymize given text using loaded dictionary of substiturions.
=cut
sub
anonymize_text {
my
$self
=
shift
;
my
$text
=
shift
;
my
$re
=
$self
->{
'keys_re'
};
$text
=~ s{(\b|\s)(
$re
)(\b|\s)}{ $1 .
$self
->{
'strings'
}->{$2} . $3 }mge;
return
$text
;
}
=head2 anonymization_dictionary
Returns hash reference containing all input strings and their anonymized versions, like:
{
'original1' => 'anon1',
'original2' => 'anon2',
...
'originalN' => 'anonN',
}
If it will be called before finalize() it will raise exception.
=cut
sub
anonymization_dictionary {
my
$self
=
shift
;
croak(
"Cannot run ->anonymization_dictionary() before finalization.\n"
)
unless
$self
->{
'is_finalized'
};
return
$self
->{
'strings'
};
}
=head1 INTERNAL METHODS
=head2 _hash
Converts given string into array of 32 integers in range 0..31.
This is done by taking sha1 checksum of string, splitting it into 32 5-bit
long "segments", and transposing each segment into integer.
=cut
sub
_hash {
my
$self
=
shift
;
my
$input
=
shift
;
my
$hash
= sha1(
$input
);
# sha1() (20 bytes) to 32 integers (0..31) transformation thanks to
# mauke and LeoNerd on #perl on irc.freenode.net
my
$binary_hash
=
unpack
(
"B*"
,
$hash
);
my
@segments
=
unpack
"(a5)*"
,
$binary_hash
;
return
[
map
{
oct
"0b$_"
}
@segments
];
}
=head2 _word
Returns n-th word from number-to-word translation dictionary.
=cut
sub
_word {
my
$self
=
shift
;
my
$n
=
shift
;
$n
= 0
unless
defined
$n
;
$n
= 0
if
$n
< 0;
$n
= 31
if
$n
> 31;
my
@words
=
qw(
alpha bravo charlie delta
echo foxtrot golf hotel
india juliet kilo lima
mike november oscar papa
quebec romeo sierra tango
uniform victor whiskey xray
yankee zulu two three
four five six seven
)
;
return
$words
[
$n
];
}
=head2 _make_prefixes
Scan given keys, and changes their values (in ->{'strings'} hash) to
shortest unique prefix.
=cut
sub
_make_prefixes {
my
$self
=
shift
;
my
%args
=
@_
;
my
$S
=
$self
->{
'strings'
};
my
%unique_ints
= ();
for
my
$key
( @{
$args
{
'keys'
} } ) {
my
$KA
=
$S
->{
$key
};
my
$interesting_int
=
$KA
->[
$args
{
'level'
} ];
$unique_ints
{
$interesting_int
}++;
}
# At this moment, we know how many times given int happened at given
# level, so we can make sensible decisions
my
%to_redo
= ();
for
my
$key
( @{
$args
{
'keys'
} } ) {
my
$KA
=
$S
->{
$key
};
my
$interesting_int
=
$KA
->[
$args
{
'level'
} ];
if
( 1 ==
$unique_ints
{
$interesting_int
} ) {
splice
@{
$KA
}, 1 +
$args
{
'level'
};
next
;
}
push
@{
$to_redo
{
$interesting_int
} },
$key
;
}
# In to_redo, we have blocks of keys, that share prefix (up to given
# level), so they have to be further processed.
for
my
$key_group
(
values
%to_redo
) {
$self
->_make_prefixes(
'keys'
=>
$key_group
,
'level'
=>
$args
{
'level'
} + 1,
);
}
return
;
}
=head2 _stringify
Converts arrays of ints (prefixes for hashed words) into strings
=cut
sub
_stringify {
my
$self
=
shift
;
for
my
$key
(
keys
%{
$self
->{
'strings'
} } ) {
my
$ints
=
$self
->{
'strings'
}->{
$key
};
my
@words
=
map
{
$self
->_word(
$_
) } @{
$ints
};
$self
->{
'strings'
}->{
$key
} =
join
(
'_'
,
@words
);
}
}
=head1 AUTHOR
hubert depesz lubaczewski, C<< <depesz at depesz.com> >>
=head1 BUGS
Please report any bugs or feature requests to C<depesz at depesz.com>.
=head1 SUPPORT
You can find documentation for this module with the perldoc command.
perldoc Pg::Explain::StringAnonymizer
=head1 COPYRIGHT & LICENSE
Copyright 2008-2023 hubert depesz lubaczewski, all rights reserved.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
=cut
1;
# End of Pg::Explain::StringAnonymizer