#!/usr/bin/perl
## -*-cperl-*-
## Author:  Stefan Evert
## Purpose: import existing sentence alignment into CWB
##
$| = 1;
use warnings;
use strict;

use CWB;
use CL;

use Getopt::Long;
use Pod::Usage;

## configuration variables
our $Verbose      = 0;      # -v ... show progress & status messages
our $Opt_Source   = undef;  # -l1 <id> ... source corpus (overrides alignment file header)
our $Opt_Target   = undef;  # -l2 <id> ... target corpus (overrides alignment file header)
our $Opt_Grid     = undef;  # -s <name> ... alignment grid attribute (usually <s>, overrides header)
our $Opt_Key      = undef;  # -k <pattern> ... key for identifying grid regions in alignment beads (overrides header)
our $Opt_NH       = 0;      # -nh ... alignment file does not have header (-l1, -l2, -s, -k are then required)
our $Opt_Inverse  = 0;      # -i ... encode "inverse" alignment (from target to source language)
our $Opt_Prune    = 0;      # -p ... automatically delete alignment beads if their keys are not found (implies -e)
our $Opt_Empty    = 0;      # -e ... allow 1:0 and 0:1 alignment beads (will be skipped)
our $Opt_Registry = undef;  # -r <dir> ... use registry directory <dir>
our $Opt_Help     = 0;      # -h ... show usage message

our $Opt_Test     = 0;      # -t .. use only first 100000 sentences for testing

OPTIONS_AND_USAGE:
{
  my $ok = GetOptions(
    "v|verbose"    => \$Verbose,
    "l1|source=s"  => \$Opt_Source,
    "l2|target=s"  => \$Opt_Target,
    "s|grid=s"     => \$Opt_Grid,
    "k|key=s"      => \$Opt_Key,
    "nh|no-header" => \$Opt_NH,
    "i|inverse"    => \$Opt_Inverse,
    "p|prune"      => \$Opt_Prune,
    "e|empty"      => \$Opt_Empty,
    "r|registry=s" => \$Opt_Registry,
    "h|help"       => \$Opt_Help,
    "t|test"       => \$Opt_Test,
  );
  pod2usage(-msg => "(Type 'perldoc cwb-align-import' for more information.)",
    -exitval => 0, -verbose => 1) if $Opt_Help;
  pod2usage(-msg => "(Type 'cwb-align-import -h' for more information.)",
    -exitval => 1, -verbose => 0) if $ok and @ARGV == 0;
  pod2usage(-msg => "SYNTAX ERROR.", 
    -exitval => 2, -verbose => 0) 
    unless $ok and @ARGV == 1;
  die "Flags -l1, -l2, -s and -k must be specified if -nh option is used.\n"
    if $Opt_NH and not($Opt_Source and $Opt_Target and $Opt_Grid and $Opt_Key);
  $Opt_Empty = 1 if $Opt_Prune;  # -p implies -e
}

## global variables
our ($C1_id, $C2_id, $C1_lc, $C2_lc, $S_id); # source and target corpus name (with lowercase variant) and alignment grid 
our ($align_file, $FH); # alignment file and file handle
our ($key_pattern);     # pattern used to generate keys that identify regions in the alignment grid
our (%R1, %R2);         # hashes mapping keys to [start, end] regions, in source and target corpus
our @Beads;             # list of alignment beads, with entries [$l1_start, $l1_end, $l2_start, $l2_end, ($annot)]

SETUP:
{
  $align_file = shift @ARGV;
  $FH = CWB::OpenFile $align_file;
  unless ($Opt_NH) {
    my $line = <$FH>;
    chomp $line;
    my @F = split /\t/, $line;
    die "Format error in alignment file header: ``$line''\n" unless @F == 4;
    ($C1_id, $C2_id, $S_id, $key_pattern) = @F;
  }
  $C1_id = $Opt_Source if $Opt_Source;
  $C2_id = $Opt_Target if $Opt_Target;
  $S_id  = $Opt_Grid if $Opt_Grid;
  $key_pattern = $Opt_Key if $Opt_Key;
  if ($Opt_Inverse) {
    ($C1_id, $C2_id) = ($C2_id, $C1_id); # swap source and target language with -i option
  }

  $C1_id = uc($C1_id);
  $C2_id = uc($C2_id);
  $C1_lc = lc($C1_id);
  $C2_lc = lc($C2_id);
}

MAKE_KEYS:
{
  print "Generating keys for grid regions:\n" if $Verbose;
  print "  - $C1_id "             if $Verbose;
  build_region_keys(\%R1, $C1_id, $S_id, $key_pattern);
  print " ok\n"   if $Verbose;
  print "  - $C2_id " if $Verbose;
  build_region_keys(\%R2, $C2_id, $S_id, $key_pattern);
  print " ok\n" if $Verbose;
}

READ_ALIGNMENT:
{
  print "Processing " if $Verbose;
  my $beads = 0;
  my $lines = 0;

  LINE:
  while (<$FH>) {
    $lines++;
    print "." if $Verbose and ($lines & 0xFFFF) == 1;  # 16 dots per 1M alignment beads

    chomp;
    my ($l1_keys, $l2_keys) = split /\t/;        # annotations are ignored so far
    if ($Opt_Inverse) {  
      ($l1_keys, $l2_keys) = ($l2_keys, $l1_keys); # swap source and target language with -i option
    }
    my @l1_keys = split " ", $l1_keys;
    my @l2_keys = split " ", $l2_keys;
    next LINE if $Opt_Empty and (@l1_keys == 0 xor @l2_keys == 0);    # skip 1:0 and 0:1 beads (only with -e or -p)
    die "ERROR: Syntax error on line #$.: ``$_''\n" unless @l1_keys and @l2_keys;

    ## need option to drop alignment beads with missing keys
    my @missing_l1 = grep { not exists $R1{$_} } @l1_keys;
    my @missing_l2 = grep { not exists $R2{$_} } @l2_keys;
    next LINE if $Opt_Prune and (@missing_l1 or @missing_l2);     # -p => prune alignment beads with missing keys
    die "ERROR: grid key(s) @missing_l1 not found in corpus $C1_id\n" if @missing_l1;
    die "ERROR: grid key(s) @missing_l2 not found in corpus $C2_id\n" if @missing_l2;

    ## extract regions for bead and check that multiple regions are contiguous
    my @l1_regions = sort { $a->[0] <=> $b->[0] } @R1{@l1_keys}; # ensure that regions are sorted correctly
    foreach my $i (1 .. $#l1_regions) {
      die "Error: alignment bead #$lines is non-contiguous in $C1_id\n\t(keys: @l1_keys)\n"
        unless $l1_regions[$i - 1][1] + 1 == $l1_regions[$i][0];
    }
    my @l2_regions = sort { $a->[0] <=> $b->[0] } @R2{@l2_keys};
    foreach my $i (1 .. $#l2_regions) {
      die "Error: alignment bead #$lines is non-contiguous in $C2_id\n\t(keys: @l2_keys)\n"
        unless $l2_regions[$i - 1][1] + 1 == $l2_regions[$i][0];
    }

    push @Beads, [$l1_regions[0][0], $l1_regions[-1][1], $l2_regions[0][0], $l2_regions[-1][1]];
    $beads++;
  }
  print " $beads / $lines alignment beads\n" if $Verbose;
}

our $n_beads = @Beads;
SORT_AND_VALIDATE:
{
  my $ok = 1;
  foreach my $i (1 .. $n_beads - 1) {
    if ($Beads[$i][0] <= $Beads[$i - 1][1]) {
      $ok = 0;  # overlapping or unordered source alignment regions => need to sort @Beads
      last;
    }
  }
  last SORT_AND_VALIDATE if $ok;    # regions are sorted properly and non-overlapping: skip rest of this block

  print "Sorting alignment beads ... " if $Verbose;
  @Beads = sort { $a->[0] <=> $b->[0] } @Beads;  # sort by ascending start position in source corpus

  print "checking ... " if $Verbose;
  foreach my $i (1 .. $n_beads - 1) {
    if ($Beads[$i][0] <= $Beads[$i - 1][1]) {
      die "Error: overlapping alignment beads in $C1_id\n",
        "\t(first overlap: " . $Beads[$i - 1][0] . ".." . $Beads[$i - 1][1] . " with " . $Beads[$i][0] . ".." . $Beads[$i][1] . ")\n";
    }
  }
  print "ok\n" if $Verbose;
}

EDIT_REGISTRY:
{
  my $rf_name = ($Opt_Registry) ? "$Opt_Registry/$C1_lc" : $C1_id;
  my $rf = new CWB::RegistryFile $rf_name;
  my $has_alignment = $rf->attribute($C2_lc);
  if (defined $has_alignment) {
    die "Error: attribute $C1_id.$C2_lc exists, but is not an alignment attribute. Can't continue.\n"
      unless $has_alignment eq "a";
    ## alignment attribute is already defined, nothing to do here
  }
  else {
    print "Registering alignment attribute $C1_id.$C2_lc ... " if $Verbose;
    $rf->add_attribute($C2_lc, "a");
    $rf->write;
    print "successful\n";
  }
}

ENCODE_ALIGNMENT:
{
  print "Writing alignment file ... " if $Verbose;
  my $temp = new CWB::TempFile "imported_alignment.gz"; # temporary input file for cwb-align-encode
  $temp->write(join("\t", $C1_id, $S_id, $C2_id, $S_id), "\n");
  foreach my $bead (@Beads) {
    $temp->write(join("\t", @$bead), "\n");
  }
  $temp->finish;
  
  print "encoding ... " if $Verbose;
  my $encode_cmd = "$CWB::AlignEncode -D ";
  $encode_cmd .= " -r '$Opt_Registry' " if $Opt_Registry;
  $encode_cmd .= $temp->name;
  CWB::Shell::Cmd($encode_cmd);
  $temp->close;
  
  print "done\n" if $Verbose;
}

print "Alignment $C1_id => $C2_id has been created with $n_beads non-empty beads.\n";
exit 0;


sub build_region_keys {
  my ($hashref, $corpus, $grid_att, $key_pattern) = @_;
  my $decode_cmd = "$CWB::SDecode";
  $decode_cmd .= " -r '$Opt_Registry'" if $Opt_Registry;
  my @regions = ();  # get vector of grid regions
  CWB::Shell::Cmd("$decode_cmd -v $corpus -S ${grid_att}", \@regions);
  print ".." if $Verbose;
  my @keys = ("") x @regions; # initialise vector of keys with empty strings
  while ($key_pattern =~ s/^( [^{}]* ) \{ ( [^}]+ ) \}//x) {
    my ($literal, $name) = ($1, $2);
    my @data = ();
    CWB::Shell::Cmd("$decode_cmd -n $corpus -S ${grid_att}_${name}", \@data);
    die "ERROR: number of regions for <${grid_att}> differs from number of regions for <${grid_att}_${name}>, error in specification of region keys.\n"
      unless @data == @keys;
    print "." if $Verbose;
    foreach my $idx (0 .. $#keys) {
      $keys[$idx] .= $literal.$data[$idx];
    }
    print "." if $Verbose;
  }
  if ($key_pattern ne "") {
    foreach my $idx (0 .. $#keys) {
      $keys[$idx] .= $key_pattern;
    }
    print "." if $Verbose;
  }
  foreach my $idx (0 .. $#keys) {
    my $k = $keys[$idx];
    die "ERROR: duplicate key '$k' found; please revise your key specification.\n"
      if exists $hashref->{$k};
    $hashref->{$k} = [ split /\t/, $regions[$idx] ];
  }
  return $hashref;
}


__END__

=head1 NAME

cwb-align-import - Import existing sentence alignment into a CWB corpus

=head1 SYNOPSIS

  cwb-align-import [options] <alignment_beads.txt>

Options:

  -r <dir>, --registry=<dir>    use registry directory <dir>
  -i, --inverse                 encode inverse alignment (target -> source)
  -p, --prune                   ignore alignment beads with ID errors
  -e, --empty                   allow 1:0 and 0:1 alignments (not encoded)
  -v, --verbose                 show progress messages during processing
  -h, --help                    display short help page

  -nh, --no-header              alignment file without header; must specify:
  -l1 <name>, --source=<name>   CWB name of source corpus
  -l2 <name>, --target=<name>   CWB name of target corpus
  -s <att>,   --grid=<att>      alignment grid (s-attribute, usually sentences)
  -k <spec>,  --key=<spec>      pattern for constructing unique sentence IDs
  
=head1 DESCRIPTION

B<Short description of what the module does>


=head1 OPTIONS

=over 4

=item --help, -h

Show usage and options summary.

=item --verbose, -v

Verbose mode (shows progress messages during processing).

==item --registry=I<dir>, -r I<dir>

Locate corpora in CWB registry directory I<dir>, overriding the default directory and the environment variable C<CORPUS_REGISTRY>.

=item --inverse, -i

Encode inverse alignment (from B<target> language to B<source> language).

=item --prune, -p

Automatically ignore alignment beads if sentence IDs are not found, either in the source or the target corpus.  Without C<-p>, B<cwb-align-import> will abort with an error message in this case.  Note that the C<-p> option implies C<-e> (see below).

=item --empty, -e

Allow 1:0 and 0:1 alignment beads, which will be silently ignored (without C<-e>, they cause a fatal error).

=item --no-header, -nh

Alignment file does not contain a header line.  In this case, the header information must be provided on the command line with the C<-l1>, C<-l2>, C<-s> and C<-k> flags (documented below).

=item --source=I<ID>, -l1 I<ID>

CWB corpus I<ID> of the source language corpus.  Overrides information in alignment file header, if present.

=item --target=I<ID>, -l2 I<ID>

CWB corpus I<ID> of the target language corpus.  Overrides information in alignment file header, if present.

=item --grid=I<attribute>, -s I<attribute>

CWB I<attribute> used as alignment grid (i.e., each alignment bead links I<n> grid regions in the source language to I<m> grid regions in the target language).  For the most common case of sentence alignment, the grid I<attribute> will usually be C<s>.  Note that the same attribute is used for both source and target language corpus.

=item --key=I<pattern>, -k I<pattern>



=back


=head1 DETAILS



=head1 AUTHOR

Stefan Evert E<lt>stefan.evert@uos.deE<gt>

=head1 COPYRIGHT

Copyright (C) 2007-2010 Stefan Evert [http::/purl.org/stefan.evert]

This software is provided AS IS and the author makes no warranty as to
its use and performance. You may use the software, redistribute and
modify it under the same terms as Perl itself.


=cut