#!/usr/bin/perl
package main;
use 5.008004;
use strict;
use warnings FATAL => 'all';

use Carp qw(croak);
use Pod::Usage;
use Getopt::Long qw(:config bundling);

our $VERSION = 0.002006; # X.YYYZZZ

my $verbose;

# Strictness levels.
my ($CONTEXT, $ADJACENT, $SURROUNDED) = (0..10);

my $usage =<<'END';
usage: git-autofixup [<options>] <revision>

-h                 show usage
--help             show manpage
--version          show version
-v, --verbose      increase verbosity (use up to 2 times)
-c N, --context N  set number of diff context lines (default 3)
-s N, --strict N   set strictness (default 0)
    Assign hunk to topic branch commit if:
    0: exactly one topic branch commit is blamed in hunk context or 
       changed lines are adjacent to exactly one topic branch commit
    1: changed lines are adjacent to exactly one topic branch commit
    2: changed lines are surrounded by exactly one topic branch commit
END

# Parse hunks out of `git diff` output. Return an array of hunk hashrefs.
sub parse_hunks {
    my $fh = shift;
    my ($file_a, $file_b);
    my @hunks;
    my $line;
    while ($line = <$fh>) {
        if ($line =~ /^--- (.*)/) {
            $file_a = $1;
        } elsif ($line =~ /^\+\+\+ (.*)/) {
            $file_b = $1;
        } elsif ($line =~ /^@@ -(\d+)(?:,(\d+))? \+\d+(?:,\d+)? @@/) {
            my $header = $line;

            for ($file_a, $file_b) {
                s#^[ab]/##;
            }

            next if $file_a ne $file_b; # Ignore creations and deletions.

            my $lines = [];
            while (1) {
                $line = <$fh>;
                if (!defined($line) || $line =~ /^[^ +-\\]/) {
                    last;
                }
                push @{$lines}, $line;
            }

            push(@hunks, {
                file => $file_a,
                start => $1,
                count => defined($2) ? $2 : 1,
                header => $header,
                lines => $lines,
            });
            # The next line after a hunk could be a header for the next commit
            # or hunk.
            redo if defined $line;
        }
    }
    return @hunks;
}

sub get_summary_for_commits {
    my $rev = shift;
    my %commits;
    for (qx(git log --no-merges --format=%H:%s $rev..)) {
        chomp;
        my ($sha, $msg) = split ':', $_, 2;
        $commits{$sha} = $msg;
    }
    return \%commits;
}

# Return targets of fixup!/squash! commits.
sub get_sha_aliases {
    my $summary_for = shift;
    my %aliases;
    my @targets = keys(%{$summary_for});
    for my $sha (@targets) {
        my $summary = $summary_for->{$sha};
        next if $summary !~ /^(?:fixup|squash)! (.*)/;
        my $prefix = $1;
        if ($prefix =~ /^(?:(?:fixup|squash)! ){2}/) {
            die "fixup commits for fixup commits aren't supported: $sha";
        }
        my @matches = grep {startswith($summary_for->{$_}, $prefix)} @targets;
        if (@matches > 1) {
            die "ambiguous fixup commit target: multiple commit summaries start with: $prefix\n";
        } elsif (@matches == 0) {
            die "no fixup target: $sha";
        } elsif (@matches == 1) {
            $aliases{$sha} = $matches[0];
        }
    }
    return \%aliases;
}

sub get_fixup_sha {
    my $args = shift;
    my $hunk = $args->{hunk};
    my $blame = $args->{blame};
    my $summary_for = $args->{summary_for};
    my $strict = $args->{strict};
    if (grep {!defined} ($hunk, $blame, $summary_for, $strict)) {
        croak 'missing argument';
    }

    my @targets;
    if ($args->{strict} == $CONTEXT) {
        @targets = get_fixup_targets_from_all_context($args);
        my @topic_targets = grep {defined $summary_for->{$_}} @targets;
        if (@topic_targets > 1) {
            # The context assignment is ambiguous, but an adjacency assignment
            # might not be.
            @targets = get_fixup_targets_from_adjacent_context($args);
        }
    } else {
        @targets = get_fixup_targets_from_adjacent_context($args);
    }

    my $upstream_is_blamed = grep {!defined $summary_for->{$_}} @targets;
    my @topic_targets = grep {defined $summary_for->{$_}} @targets;
    if ($strict && $upstream_is_blamed) {
        $verbose && print hunk_desc($hunk), " changes lines blamed on upstream\n";
        return;
    } elsif (@topic_targets > 1) {
        $verbose && print hunk_desc($hunk), " has multiple targets\n";
        return;
    } elsif (@topic_targets == 0) {
        $verbose && print hunk_desc($hunk), " has no targets\n";
        return;
    }
    if ($verbose) {
        printf "%s fixes %s %s\n",
            hunk_desc($hunk),
            substr($topic_targets[0], 0, 8),
            $summary_for->{$topic_targets[0]};
    }
    return $topic_targets[0];
}

sub hunk_desc {
    my $hunk = shift;
    return join " ", (
        $hunk->{file},
        $hunk->{header} =~ /(@@[^@]*@@)/,
    );
}

sub get_fixup_targets_from_all_context {
    my $args = shift;
    my ($hunk, $blame, $summary_for) = @{$args}{qw(hunk blame summary_for)};
    croak 'missing argument' if grep {!defined} ($hunk, $blame, $summary_for);

    my @targets = uniq(map {$_->{sha}} values(%{$blame}));
    return wantarray ? @targets : \@targets;
}

sub uniq {
    my %seen;
    return grep {!$seen{$_}++} @_;
}

sub get_fixup_targets_from_adjacent_context {
    my $args = shift;
    my $hunk = $args->{hunk};
    my $blame = $args->{blame};
    my $summary_for = $args->{summary_for};
    my $strict = $args->{strict};
    if (grep {!defined} ($hunk, $blame, $summary_for, $strict)) {
        croak 'missing argument';
    }

    my $blame_indexes = get_blame_indexes($hunk);

    my %blamed;
    my $diff = $hunk->{lines};
    for (my $di = 0; $di < @{$diff}; $di++) { # diff index
        my $bi = $blame_indexes->[$di];
        my $line = $diff->[$di];
        if (startswith($line, '-')) {
            my $sha = $blame->{$bi}{sha};
            $blamed{$sha} = 1;
        } elsif (startswith($line, '+')) {
            my @lines;
            if ($di > 0 && defined $blame->{$bi-1}) {
                push @lines, $bi-1;
            }
            if (defined $blame->{$bi}) {
                push @lines, $bi;
            }
            my @adjacent_shas = uniq(map {$_->{sha}} @{$blame}{@lines});
            my @target_shas = grep {defined $summary_for->{$_}} @adjacent_shas;
            # Note that lines at the beginning or end of a file can be
            # "surrounded" by a single line.
            my $is_surrounded = @target_shas > 0
                && @target_shas == @adjacent_shas
                && $target_shas[0] eq $target_shas[-1];
            my $is_adjacent = @target_shas == 1;
            if ($is_surrounded || ($strict < $SURROUNDED && $is_adjacent)) {
                $blamed{$target_shas[0]} = 1;
            }
            while ($di < @$diff-1 && startswith($diff->[$di+1], '+')) {
                $di++;
            }
        }
    }
    my @targets = keys %blamed;
    return wantarray ? @targets : \@targets;
}

sub startswith {
    my ($haystack, $needle) = @_;
    return index($haystack, $needle, 0) == 0;
}

# Map lines in a hunk's diff to the corresponding `git blame HEAD` output.
sub get_blame_indexes {
    my $hunk = shift;
    my @indexes;
    my $bi = $hunk->{start};
    for (my $di = 0; $di < @{$hunk->{lines}}; $di++) {
        push @indexes, $bi;
        my $first = substr($hunk->{lines}[$di], 0, 1);
        if ($first eq '-' or $first eq ' ') {
            $bi++;
        }
        # Don't increment $bi for added lines.
    }
    return \@indexes;
}

sub print_hunk_blamediff {
    my $args = shift;
    my $fh = $args->{fh};
    my $hunk = $args->{hunk};
    my $summary_for = $args->{summary_for};
    my $blame = $args->{blame};
    my $blame_indexes = $args->{blame_indexes};
    if (grep {!defined} ($fh, $hunk, $summary_for, $blame, $blame_indexes)) {
        croak 'missing argument';
    }

    my $format = "%-8.8s|%4.4s|%-30.30s|%-30.30s\n";
    for (my $i = 0; $i < @{$hunk->{lines}}; $i++) {
        my $line_r = $hunk->{lines}[$i];
        my $bi = $blame_indexes->[$i];
        my $sha = defined $blame->{$bi} ? $blame->{$bi}{sha} : undef;

        my $display_sha = defined($sha) ? $sha : q{};
        my $display_bi = $bi;
        if (startswith($line_r, '+')) {
            $display_sha = q{}; # For added lines.
            $display_bi = q{};
        }
        if (defined($sha) && !defined($summary_for->{$sha})) {
            # For lines from before the given upstream revision.
            $display_sha = '^';
        }

        my $line_l = '';
        if (defined $blame->{$bi} && !startswith($line_r, '+')) {
            $line_l = $blame->{$bi}{text};
        }

        for ($line_l, $line_r) {
            # For the table to line up, tabs need to be converted to a string of fixed width.
            s/\t/^I/g;
            # Remove trailing newlines and carriage returns. If more trailing
            # whitespace is removed, that's fine.
            $_ = rtrim($_);
        }

        printf {$fh} $format, $display_sha, $display_bi, $line_l, $line_r;
    }
    print {$fh} "\n";
    return;
}

sub rtrim {
    my $s = shift;
    $s =~ s/\s+\z//;
    return $s;
}

sub blame {
    my ($hunk, $alias_for) = @_;
    if ($hunk->{count} == 0) {
        return {};
    }
    my @cmd = (
        'git', 'blame', '--porcelain',
        '-L' => "$hunk->{start},+$hunk->{count}",
        'HEAD', '--',
        "$hunk->{file}");
    my %blame;
    my ($sha, $line_num);
    open(my $fh, '-|', @cmd) or die "git blame: $!\n";
    while (my $line = <$fh>) {
        if ($line =~ /^([0-9a-f]{40}) \d+ (\d+)/) {
             ($sha, $line_num) = ($1, $2);
        }
        if (startswith($line, "\t")) {
            if (defined $alias_for->{$sha}) {
                $sha = $alias_for->{$sha};
            }
            $blame{$line_num} = {sha => $sha, text => substr($line, 1)};
        }
    }
    close($fh) or die "git blame: non-zero exit code";
    return \%blame;
}

sub get_diff_hunks {
    my $num_context_lines = shift;
    my @cmd = (qw(git diff --ignore-submodules), "-U$num_context_lines");
    open(my $fh, '-|', @cmd) or die $!;
    my @hunks = parse_hunks($fh, keep_lines => 1);
    close($fh) or die "git diff: non-zero exit code";
    return wantarray ? @hunks : \@hunks;
}

sub commit_fixup {
    my ($sha, $hunks) = @_;
    open my $fh, '|-', 'git apply --unidiff-zero --cached -' or die "git apply: $!\n";
    for my $hunk (@{$hunks}) {
        print({$fh}
            "--- a/$hunk->{file}\n",
            "+++ b/$hunk->{file}\n",
            $hunk->{header},
            @{$hunk->{lines}},
        );
    }
    close $fh or die "git apply: non-zero exit code\n";
    system('git', 'commit', "--fixup=$sha") == 0 or die "git commit: $!\n";
    return;
}

sub is_index_dirty {
    open(my $fh, '-|', 'git status --porcelain') or die "git status: $!\n";
    my $dirty;
    while (my $line = <$fh>) {
        if ($line =~ /^[^?! ]/) {
            $dirty = 1;
            last;
        }
    }
    close $fh or die "git status: non-zero exit code\n";
    return $dirty;
}

sub get_fixup_hunks_by_sha {
    my $args = shift;
    my $hunks = $args->{hunks};
    my $blame_for = $args->{blame_for};
    my $summary_for = $args->{summary_for};
    my $strict = $args->{strict};
    if (grep {!defined} ($hunks, $blame_for, $summary_for, $strict)) {
        croak 'missing argument';
    }

    my %hunks_for;
    for my $hunk (@{$hunks}) {
        my $blame = $blame_for->{$hunk};
        my $sha = get_fixup_sha({
            hunk => $hunk,
            blame => $blame,
            summary_for => $summary_for,
            strict => $strict,
        });
        if ($verbose > 1) {
            print_hunk_blamediff({
                fh => *STDOUT,
                hunk => $hunk,
                summary_for => $summary_for,
                blame => $blame,
                blame_indexes => get_blame_indexes($hunk)
            });
        }
        next if !$sha;
        push @{$hunks_for{$sha}}, $hunk;
    }
    return \%hunks_for;
}

sub main {
    $verbose = 0;
    my $num_context_lines = 3;
    my $strict = $CONTEXT;
    my $man;

    my ($help, $show_version);
    GetOptions(
        'h' => \$help,
        'help' => \$man,
        'version' => \$show_version,
        'verbose|v+' => \$verbose,
        'strict|s=i' => \$strict,
        'context|c=i' => \$num_context_lines,
    ) or return 1;
    if ($help) {
        print $usage;
        return 0;
    }
    if ($show_version) {
        print "$VERSION\n";
        return 0;
    }
    if ($man) {
        pod2usage(-exitval => 0, -verbose => 2);
    }

    @ARGV == 1 or die "No upstream commit given.\n";
    my $upstream = shift @ARGV;
    qx(git rev-parse --verify ${upstream}^{commit});
    $? == 0 or die "Can't resolve given commit.\n";

    if ($num_context_lines < 0) {
        die "invalid number of context lines: $num_context_lines\n";
    }

    if ($strict < 0) {
        die "invalid strictness level: $strict\n";
    } elsif ($strict > 0 && $num_context_lines == 0) {
        die "strict hunk assignment requires context\n";
    }


    my $toplevel = qx(git rev-parse --show-toplevel);
    chomp $toplevel;
    $? == 0 or die "Can't get repo toplevel dir\n";
    chdir $toplevel or die $!;

    if (is_index_dirty()) {
        die "There are staged changes. Clean up the index and try again.\n";
    }

    my $hunks = get_diff_hunks($num_context_lines);
    my $summary_for = get_summary_for_commits($upstream);
    my $alias_for = get_sha_aliases($summary_for);
    my %blame_for = map {$_ => blame($_, $alias_for)} @{$hunks};
    my $hunks_for = get_fixup_hunks_by_sha({
        hunks => $hunks,
        blame_for => \%blame_for,
        summary_for => $summary_for,
        strict => $strict,
    });
    while (my ($sha, $fixup_hunks) = each %{$hunks_for}) {
        commit_fixup($sha, $fixup_hunks);
    }
    return 0;
}

if (!caller()) {
    exit main();
}
1;

__END__

=pod

=head1 NAME

App::Git::Autofixup - create fixup commits for topic branches

=head1 SYNOPSIS

    git-autofixup [<options>] <revision>

=head1 DESCRIPTION

F<git-autofixup> parses hunks of changes in the working directory out of C<git diff> output and uses C<git blame> to assign those hunks to commits in C<E<lt>revisionE<gt>..HEAD>, which will typically represent a topic branch, and then creates fixup commits to be used with C<git rebase --interactive --autosquash>. [See C<git help revisions> for information about git revision specification syntax.] It is assumed that unstaged changes near changes previously committed to the topic branch are related.

C<@{upstream}> or C<@{u}> is likely a convenient value to use for C<E<lt>revisionE<gt>> if the current branch has a tracking branch.

By default a hunk will be included in a fixup commit if all the lines in the hunk's context blamed on topic branch commits refer to the same commit, so there's no ambiguity about which commit the hunk corresponds to. If there is ambiguity the assignment behaviour used under C<--strict 1> will be used to attempt to resolve it. If C<--strict 1> is given the same topic branch commit must be blamed for every removed line and at least one of the lines adjacent to each added line, and added lines must not be adjacent to lines blamed on other topic branch commits. All the same restrictions apply when C<--strict 2> is given, but each added line must be surrounded by lines blamed on the same topic branch commit.

For example,  the added line in the hunk below is adjacent to lines committed by commits C<99f370af> and C<a1eadbe2>. If these are both topic branch commits then it's ambiguous which commit the added line is fixing up and the hunk will be ignored.

    COMMIT  |LINE|HEAD                          |WORKING DIRECTORY
    99f370af|   1|first line                    | first line
            |    |                              |+added line
    a1eadbe2|   2|second line                   | second line

But if that second line were instead blamed on an upstream commit (denoted by C<^>), the hunk would be added to a fixup commit for C<99f370af>:

    99f370af|   1|first line                    | first line
            |    |                              |+added line
    ^       |   2|second line                   | second line

Output similar to this example can be generated by setting verbosity to 2 or greater by using the verbosity option multiple times, eg. C<git-autofixup -vv>, and can be helpful in determining how a hunk will be handled.

F<git-autofixup> is not to be used mindlessly. Always inspect the created fixup commits to ensure hunks have been assigned correctly, especially when used on a working directory that has been changed with a mix of fixups and new work.

=head1 OPTIONS

=over

=item -h

Show usage.

=item --help

Show manpage.

=item --version

Show version.

=item -v, --verbose

Increase verbosity. Can be used up to two times.

=item -c N, --context N

Change the number of context lines C<git diff> uses around hunks. Default: 3. This can change how hunks are assigned to fixup commits, especially with C<--strict 0>.

=item -s N, --strict N

Set how strict F<git-autofixup> is about assigning hunks to fixup commits. Default: 0. Strictness levels are described under DESCRIPTION.

=back

=head1 INSTALLATION

If cpan is available, run C<cpan -i App::Git::Autofixup>. Otherwise, copy F<git-autofixup> to a directory in C<PATH> and ensure it has execute permissions. It can then be invoked as either C<git autofixup> or C<git-autofixup>, since git searches C<PATH> for appropriately named binaries.

Git is distributed with Perl 5 for platforms not expected to already have it installed, but installing modules with cpan requires other tools that might not be available, such as make. This script has no dependencies outside of the standard library, so it is hoped that it works on any platform that Git does without much trouble.

Requires a git supporting C<commit --fixup>: 1.7.4 or later.


=head1 BUGS/LIMITATIONS

If a topic branch adds some lines in one commit and subsequently removes some of them in another, a hunk in the working directory that re-adds those lines will be assigned to fixup the first commit, and during rebasing they'll be removed again by the later commit.

Not tested in F<cmd.exe> on Windows. Run it from Git Bash, Cygwin, or a similar Unix emulation environment.

=head1 ACKNOWLEDGEMENTS

F<git-autofixup> was inspired by a description of L<hg absorb|https://bitbucket.org/facebook/hg-experimental/src/38d6e5d7f355f58330cd707059baac38d69a1210/hgext3rd/absorb/__init__.py> in the L<Mercurial Sprint Notes|https://groups.google.com/forum/#!topic/mozilla.dev.version-control/nh4fITFlEMk>. While I was working on it I found L<git-superfixup|https://gist.github.com/oktal3700/cafe086b49c89f814be4a7507a32a3f7>, by oktal3700, which was helpful to examine.

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2017, Jordan Torbiak.

This program is free software; you can redistribute it and/or modify it under the terms of the Artistic License v2.0.

=cut