#!/usr/bin/bash

# PODNAME: zxcvbn-build-data-leipzig
# ABSTRACT: generate word ranking data from uni-leipzig corpora

# usage:
#
# download Corpora Collection tarballs from
# https://wortschatz.uni-leipzig.de/en/download/
#
#   zxcvbn-build-data-leipzig $tarball_path > data/$whatever.txt
#
# then you can use those text files as input to
# Data::Password::zxcvbn::AuthorTools::BuildRankedDictionaries

# Path of the Corpora Collection tarball to process (first positional
# argument).
tarball="$1"

# If the argument is missing or does not name an existing regular
# file, print the usage text on stderr and exit with status 1.
if [[ ! -f "$tarball" ]]; then
    # The heredoc delimiter is quoted so that the placeholders
    # $tarball_path and $whatever are printed literally; with an
    # unquoted delimiter the shell would expand them (to empty
    # strings) and the example command line would be garbled.
    >&2 cat <<'HELP'
Usage:

Download Corpora Collection tarballs from
https://wortschatz.uni-leipzig.de/en/download/

Then run:

  zxcvbn-build-data-leipzig $tarball_path > data/$whatever.txt

Then you can use those text files as input to
Data::Password::zxcvbn::AuthorTools::BuildRankedDictionaries

HELP

    exit 1
fi

# note: this is a bash script with embedded perl, instead of a perl
# script with IPC::Run, because IPC::Run kept dropping bits of the
# output and I got fed up with trying to fix it; this works
#
# also, since these data files are pretty big, using `sort` saves me
# from having to re-invent its mixed ram/disk storage strategy

# Make the pipeline fail if any of its stages fails (tar or one of the
# perl filters), instead of reporting only the status of the last
# `sort`; the bare `exit` at the end then propagates that status.
set -o pipefail

# Pipeline stages:
#   1. tar  - write every *-words.txt member of the tarball to stdout
#   2. perl - transliterate to ASCII, keep simple words, case-fold,
#             emit "word<TAB>count"
#   3. sort - bring lines for the same word next to each other
#   4. perl - add up the counts of adjacent identical words
#   5. sort - order by count, highest first
#
# Both sorts run with LC_ALL=C. For stage 3 this is required for
# correctness: stage 4 only merges *adjacent* identical words, and a
# locale-aware collation can give little or no weight to characters
# such as "_" or the tab separator, which can interleave lines of
# different words. In the C locale lines are compared byte by byte,
# and the tab sorts before every word character, so all lines of one
# word are contiguous. For stage 5 it makes the order of words with
# equal counts independent of the environment the script runs in.
tar -x --wildcards '*-words.txt' -O -f "$tarball" | \
    perl -E 'use v5.26; use strict; use warnings; use Text::Unidecode;
binmode STDIN, ":utf8";

# filter out non-simple words, fold everything to lowercase ASCII
while (my $line = <>) {
    chomp $line;
    # each input line is split on tabs into rank, word and count;
    # the rank is not used
    my ($rank, $word, $count) = split /\t/,$line;

    $word = unidecode($word);

    # keep only words made entirely of word characters
    next unless $word =~ /^\w+$/;

    print fc($word),"\t$count\n";
}
' | LC_ALL=C sort | \
    perl -E 'use v5.26; use strict; use warnings;

my ($current_word, $current_count) = ("",0);

# add up consecutive lines for the same word; this is needed
# because the previous filter will produce identical output
# for differing inputs, in different places; the `sort` brings
# the identical words together
while (my $line = <>) {
    chomp $line;
    my ($word, $count) = split /\t/, $line;

    if ($word eq $current_word) {
        $current_count += $count;
    }
    else {
        # `length` instead of plain truthiness: the word "0" is a
        # valid word but is false in boolean context, and would
        # otherwise be silently dropped; the empty string is only
        # the initial placeholder
        print "$current_word\t$current_count\n" if length $current_word;
        ($current_word, $current_count) = ($word, $count);
    }
}
# flush the last accumulated word
print "$current_word\t$current_count\n" if length $current_word;
' | LC_ALL=C sort -k2nr

# Stop here so that bash never tries to interpret the POD below; with
# no argument, `exit` returns the status of the pipeline above.
exit

# POD goes here

__END__

=pod

=encoding UTF-8

=head1 NAME

zxcvbn-build-data-leipzig - generate word ranking data from uni-leipzig corpora

=head1 VERSION

version 1.0.2

=head1 AUTHOR

Gianni Ceccarelli <gianni.ceccarelli@broadbean.com>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2023 by BroadBean UK, a CareerBuilder Company.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut