#!/usr/bin/env bash
# PODNAME: zxcvbn-build-data-leipzig
# ABSTRACT: generate word ranking data from uni-leipzig corpora

# usage:
#
# download Corpora Collection tarballs from
# https://wortschatz.uni-leipzig.de/en/download/
#
#  zxcvbn-build-data-leipzig $tarball_path > data/$whatever.txt
#
# then you can use those text files as input to
# Data::Password::zxcvbn::AuthorTools::BuildRankedDictionaries

# note: this is a bash script with embedded perl, instead of a perl
# script with IPC::Run, because IPC::Run kept dropping bits of the
# output and I got fed up with trying to fix it; this works
#
# also, since these data files are pretty big, using `sort` saves me
# from having to re-invent its mixed ram/disk storage strategy

# make a failure in any pipeline stage (e.g. `tar` on a broken
# tarball) visible in the exit status, instead of hiding it behind the
# status of the final `sort`
set -o pipefail

#######################################
# Print the usage text.
# Outputs: the help message, on stderr
#######################################
usage() {
  # the delimiter is quoted so that $tarball_path and $whatever are
  # printed literally instead of being expanded to empty strings
  cat >&2 <<'HELP'
Usage:

Download Corpora Collection tarballs from
https://wortschatz.uni-leipzig.de/en/download/

Then run:

 zxcvbn-build-data-leipzig $tarball_path > data/$whatever.txt

Then you can use those text files as input to
Data::Password::zxcvbn::AuthorTools::BuildRankedDictionaries
HELP
}

#######################################
# Reduce the corpus word list to simple, lowercase ASCII words.
# Inputs:  stdin, UTF-8 lines of `rank<TAB>word<TAB>count`
# Outputs: stdout, lines of `word<TAB>count`; a word may appear on
#          several lines, because different inputs can fold to the
#          same output
#######################################
normalise_words() {
  # NB: the perl code lives in a single-quoted shell string, so it
  # must not contain any single quote
  perl -E 'use v5.26;
    use strict; use warnings;
    use Text::Unidecode;
    binmode STDIN, ":utf8";
    # filter out non-simple words, fold everything to lowercase ASCII
    while (my $line = <>) {
      chomp $line;
      my ($rank, $word, $count) = split /\t/, $line;
      $word = unidecode($word);
      next unless $word =~ /^\w+$/;
      print fc($word), "\t$count\n";
    }
  '
}

#######################################
# Add up the counts of consecutive lines that carry the same word.
# Inputs:  stdin, lines of `word<TAB>count`, with equal words adjacent
# Outputs: stdout, one `word<TAB>total` line per run of equal words
#######################################
merge_counts() {
  # NB: the perl code lives in a single-quoted shell string, so it
  # must not contain any single quote
  perl -E 'use v5.26;
    use strict; use warnings;
    my ($current_word, $current_count) = ("", 0);
    # add up consecutive lines for the same word; this is needed
    # because the previous filter will produce identical output
    # for differing inputs, in different places; the `sort` brings
    # the identical words together
    #
    # the "do we have a word yet?" checks use `length`, not plain
    # truth: the string "0" is false in perl, and testing its truth
    # would silently drop the word "0" from the output
    while (my $line = <>) {
      chomp $line;
      my ($word, $count) = split /\t/, $line;
      if ($word eq $current_word) {
        $current_count += $count;
      }
      else {
        print "$current_word\t$current_count\n" if length $current_word;
        ($current_word, $current_count) = ($word, $count);
      }
    }
    print "$current_word\t$current_count\n" if length $current_word;
  '
}

#######################################
# Entry point: turn a Corpora Collection tarball into ranking data.
# Arguments: $1 - path to the tarball
# Outputs:   stdout, `word<TAB>count` lines, highest count first
# Returns:   exits 1 (after printing the usage) when $1 is not a
#            file; otherwise the status of the pipeline
#######################################
main() {
  local tarball="${1:-}"

  if [[ ! -f "$tarball" ]]; then
    usage
    exit 1
  fi

  # both `sort`s run in the C locale:
  #
  # * the first one must put all the lines for a given word next to
  #   each other; byte-wise comparison guarantees that, because the
  #   tab sorts before every character a word can contain, whereas
  #   under a locale's collation rules the tab may carry little or no
  #   weight, which can order `a<TAB>10`, `a1<TAB>5`, `a<TAB>20` in
  #   that sequence and leave the two `a` lines apart
  #
  # * the second one orders by descending count; the C locale makes
  #   the order of words with equal counts the same on every machine
  tar -x -O --wildcards -f "$tarball" '*-words.txt' \
    | normalise_words \
    | LC_ALL=C sort \
    | merge_counts \
    | LC_ALL=C sort -k2nr
}

main "$@"
exit

# POD goes here; it is wrapped in a quoted here-document fed to the
# no-op `:` so that the whole file stays valid shell syntax (for
# `bash -n` and linters); the `exit` above means it is never executed
: <<'__POD__'
__END__

=pod

=encoding UTF-8

=head1 NAME

zxcvbn-build-data-leipzig - generate word ranking data from uni-leipzig corpora

=head1 VERSION

version 1.0.2

=head1 AUTHOR

Gianni Ceccarelli <gianni.ceccarelli@broadbean.com>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2023 by BroadBean UK, a CareerBuilder Company.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut

__POD__