lib/Pg/Explain/StringAnonymizer.pm


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
—
22
23
24
25
26
27
28
29
30
31
32
33
—
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
—
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
—
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
—
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
—
149
150
151
152
153
154
155
156
157
158
159
160
161
162
—
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
—
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
—
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
—
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
—
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
—
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
              package Pg::Explain::StringAnonymizer;
# UTF8 boilerplate, per http://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default/
use v5.18;
use strict;
use warnings;
use warnings qw( FATAL utf8 );
use utf8;
use open qw( :std :utf8 );
use Unicode::Normalize qw( NFC );
use Unicode::Collate;
use Encode qw( decode );
# If any command-line argument contains a non-ASCII character, replace @ARGV
# with its elements decoded from UTF-8 (all elements are decoded, not only the
# non-ASCII ones).
# NOTE(review): this is a top-level statement of the module, so it rewrites the
# global @ARGV of whatever program loads it - confirm that is intended.
if ( grep /\P{ASCII}/ => @ARGV ) {
    @ARGV = map { decode( 'UTF-8', $_ ) } @ARGV;
}
# UTF8 boilerplate, per http://stackoverflow.com/questions/6162484/why-does-modern-perl-avoid-utf-8-by-default/
use Carp;
use Digest::SHA qw( sha1 );
=head1 NAME
Pg::Explain::StringAnonymizer - Class to anonymize sets of strings
=head1 VERSION
Version 2.9
=cut
our $VERSION = '2.9';
=head1 SYNOPSIS
This module provides a way to turn a defined set of strings into anonymized versions of them, with 4 properties:
=over
=item * the same original string should give the same output string (within the same input set)
=item * strings shouldn't be very long
=item * it shouldn't be possible to reverse the operation
=item * generated strings should be easy to read, and easy to distinguish between themselves.
=back
The first and third points can be achieved easily with a hashing function (md5, sha), but the generated hashes violate the fourth point, and sometimes also the second.
Example of usage:
    my $anonymizer = Pg::Explain::StringAnonymizer->new();
    $anonymizer->add( 'a', 'b', 'c');
    $anonymizer->add( 'depesz' );
    $anonymizer->add( [ "any strings", "are possible" ] );
    $anonymizer->finalize();
    print $anonymizer->anonymized( 'a' ), "\n";
    my $full_dictionary = $anonymizer->anonymization_dictionary();
=head1 METHODS
=head2 new
Object constructor, doesn't take any arguments.
=cut
sub new {
    my ( $class ) = @_;

    # A fresh object only carries an empty 'strings' dictionary; add() fills
    # it, and finalize() later turns its values into readable strings.
    return bless { 'strings' => {} }, $class;
}
=head2 add
Adds new string(s) to anonymization list.
Strings can be given either as a list or as an ArrayRef.
It is important to note that one cannot add() more elements to the anonymized set after finalization (a call to the finalize() method).
If such a call is made (add() after finalize()), it will raise an exception.
=cut
sub add {
    my ( $self, @input ) = @_;
    croak( "Cannot run ->add() after finalization.\n" ) if $self->{ 'is_finalized' };

    # When the first argument is an ArrayRef, its elements are the strings to
    # add (any further arguments are ignored); otherwise the argument list
    # itself is used.
    my $to_add = 'ARRAY' eq ref( $input[ 0 ] ) ? $input[ 0 ] : \@input;

    # Each string is hashed only once - entries that already hold a value are
    # left untouched.
    for my $string ( @{ $to_add } ) {
        $self->{ 'strings' }->{ $string } ||= $self->_hash( $string );
    }
    return;
}
=head2 finalize
Finalizes string set creation, and creates anonymized versions.
It has to be called after some number of add() calls, so that it will have something to work on.
After running finalize() one cannot add() more strings.
Also, before finalize() you cannot run anonymized() or anonymization_dictionary() methods.
=cut
sub finalize {
    my $self = shift;

    # Finalization is a one-time operation; repeated calls are no-ops.
    return if $self->{ 'is_finalized' };
    $self->{ 'is_finalized' } = 1;

    # Shorten every hash (array of ints) to its shortest unique prefix, then
    # turn those prefixes into readable words.
    $self->_make_prefixes(
        'keys'  => [ keys %{ $self->{ 'strings' } } ],
        'level' => 0,
    );
    $self->_stringify();

    # Build the alternation used by anonymize_text(). Longest keys go first,
    # so that a key which is a prefix of another key cannot win over it.
    # Zero-length keys are left out: an empty alternative matches at every
    # word boundary, which would inject the anonymized form of '' all over the
    # anonymized text.
    my @keys_sorted =
        sort { length( $b ) <=> length( $a ) }
        grep { length( $_ ) } keys %{ $self->{ 'strings' } };

    # With nothing to search for, use a pattern that can never match instead
    # of an empty one (which would match everywhere).
    $self->{ 'keys_re' } = @keys_sorted
        ? join( '|', map { qr{\Q$_\E} } @keys_sorted )
        : '(?!)';
    return;
}
=head2 anonymized
Returns anonymized version of given string, or undef if the string wasn't previously added to anonymization set.
If it is called before finalize(), it will raise an exception.
=cut
sub anonymized {
    my ( $self, $input ) = @_;

    # Before finalize() the dictionary values are still raw hash segments, so
    # lookups are refused until the object is finalized.
    $self->{ 'is_finalized' }
        or croak( "Cannot run ->anonymized() before finalization.\n" );

    # Unknown strings simply yield undef.
    return $self->{ 'strings' }->{ $input };
}
=head2 anonymize_text

Anonymize given text using loaded dictionary of substitutions.

Every occurrence of a dictionary string that is delimited on both sides by a
word boundary or whitespace is replaced with its anonymized version. Text that
doesn't contain any of the dictionary strings is returned unchanged.

If it is called before finalize(), it will raise an exception.

=cut

sub anonymize_text {
    my $self = shift;

    # Same contract as anonymized() and anonymization_dictionary(): before
    # finalize() there is neither a search pattern nor a usable dictionary.
    croak( "Cannot run ->anonymize_text() before finalization.\n" ) unless $self->{ 'is_finalized' };
    my $text = shift;
    my $re   = $self->{ 'keys_re' };

    # Nothing to do for missing text, or when there is nothing to search for.
    # An empty pattern would match at every word boundary and try to
    # substitute the (non-existing) dictionary entry for ''.
    return $text unless defined $text;
    return $text unless defined $re && length $re;

    # $1 and $3 keep the delimiters around the replaced string intact.
    $text =~ s{(\b|\s)($re)(\b|\s)}{ $1 . $self->{'strings'}->{$2} . $3 }mge;
    return $text;
}
=head2 anonymization_dictionary
Returns hash reference containing all input strings and their anonymized versions, like:
    {
        'original1' => 'anon1',
        'original2' => 'anon2',
        ...
        'originalN' => 'anonN',
    }
If it is called before finalize(), it will raise an exception.
=cut
sub anonymization_dictionary {
    my ( $self ) = @_;

    # The dictionary only holds final, readable values after finalize().
    unless ( $self->{ 'is_finalized' } ) {
        croak( "Cannot run ->anonymization_dictionary() before finalization.\n" );
    }

    # This is the internal hash itself, not a copy.
    return $self->{ 'strings' };
}
=head1 INTERNAL METHODS
=head2 _hash
Converts given string into array of 32 integers in range 0..31.
This is done by taking sha1 checksum of string, splitting it into 32 5-bit
long "segments", and transposing each segment into integer.
=cut
sub _hash {
    my $self  = shift;
    my $input = shift;

    # sha1() works on bytes, and croaks when given a string that contains
    # characters above 0xFF. Such strings are UTF-8 encoded first (this
    # modifies only our local copy). Strings without wide characters are
    # passed as they are, so their hashes stay the same as they always were.
    utf8::encode( $input ) if $input =~ /[^\x00-\xFF]/;

    my $hash = sha1( $input );

    # sha1() (20 bytes) to 32 integers (0..31) transformation thanks to
    # mauke and LeoNerd on #perl on irc.freenode.net

    # 20 bytes => string of 160 '0'/'1' characters => 32 chunks of 5 bits,
    # each chunk converted to integer.
    my $binary_hash = unpack( "B*", $hash );
    my @segments    = unpack "(a5)*", $binary_hash;
    return [ map { oct "0b$_" } @segments ];
}
=head2 _word
Returns n-th word from number-to-word translation dictionary.
=cut
sub _word {
    my ( $self, $n ) = @_;

    # 32 words, one for every possible value of a 5-bit hash segment.
    my @dictionary = qw(
        alpha     bravo      charlie    delta
        echo      foxtrot    golf       hotel
        india     juliet     kilo       lima
        mike      november   oscar      papa
        quebec    romeo      sierra     tango
        uniform   victor     whiskey    xray
        yankee    zulu       two        three
        four      five       six        seven
    );

    # Missing index means first word; anything outside of 0..31 is clamped
    # to the nearest end of the dictionary.
    my $index = $n // 0;
    if ( $index < 0 ) {
        $index = 0;
    }
    elsif ( $index > 31 ) {
        $index = 31;
    }

    return $dictionary[ $index ];
}
=head2 _make_prefixes
Scans the given keys and changes their values (in the ->{'strings'} hash) to the
shortest unique prefix.
=cut
sub _make_prefixes {
    my ( $self, %args ) = @_;

    my $level   = $args{ 'level' };
    my @keys    = @{ $args{ 'keys' } };
    my $strings = $self->{ 'strings' };

    # First pass: count how many of the given keys have each int at position
    # $level of their hash.
    my %occurrences = ();
    $occurrences{ $strings->{ $_ }->[ $level ] }++ for @keys;

    # Second pass: a key whose int at this level is not shared with any other
    # key is already unique, so everything after this level gets cut off.
    # Keys that share the int are grouped together for further processing.
    my %shared = ();
    for my $key ( @keys ) {
        my $ints    = $strings->{ $key };
        my $current = $ints->[ $level ];
        if ( 1 != $occurrences{ $current } ) {
            push @{ $shared{ $current } }, $key;
            next;
        }
        splice @{ $ints }, $level + 1;
    }

    # Keys within each group are identical up to (and including) $level, so
    # each group has to be told apart one level deeper.
    for my $group ( values %shared ) {
        $self->_make_prefixes(
            'keys'  => $group,
            'level' => $level + 1,
        );
    }
    return;
}
=head2 _stringify
Converts arrays of ints (prefixes for hashed words) into strings
=cut
sub _stringify {
    my $self = shift;

    # values() gives aliases to the stored entries, so assigning to the loop
    # variable replaces each array of ints, in place, with its words joined
    # by underscores.
    for my $entry ( values %{ $self->{ 'strings' } } ) {
        $entry = join '_', map { $self->_word( $_ ) } @{ $entry };
    }
}
=head1 AUTHOR
hubert depesz lubaczewski, C<< <depesz at depesz.com> >>
=head1 BUGS
Please report any bugs or feature requests to C<depesz at depesz.com>.
=head1 SUPPORT
You can find documentation for this module with the perldoc command.
    perldoc Pg::Explain::StringAnonymizer
=head1 COPYRIGHT & LICENSE
Copyright 2008-2023 hubert depesz lubaczewski, all rights reserved.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
=cut
1;    # End of Pg::Explain::StringAnonymizer
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)