#!/usr/bin/perl
package main;
use 5.008004;
use strict;
use warnings FATAL => 'all';

use Carp qw(croak);
use Pod::Usage;
use Getopt::Long qw(:config bundling);
use File::Temp;
use File::Spec ();

our $VERSION = 0.003001; # X.YYYZZZ

my $VERBOSE;
my @GIT_OPTIONS;

# Strictness levels.
my ($CONTEXT, $ADJACENT, $SURROUNDED) = (0..10);

my $usage =<<'END';
usage: git-autofixup [<options>] <revision>

-h                 show usage
--help             show manpage
--version          show version
-v, --verbose      increase verbosity (use up to 2 times)
-c N, --context N  set number of diff context lines (default 3)
-e, --exit-code    use more detailed exit codes (see --help)
-s N, --strict N   set strictness (default 0)
    Assign a hunk to fixup a topic branch commit if:

    0: either only one topic branch commit is blamed in the hunk context or
       blocks of added lines are adjacent to exactly one topic branch commit.
       Removing upstream lines is allowed for this level.
    1: blocks of added lines are adjacent to exactly one topic branch commit
    2: blocks of added lines are surrounded by exactly one topic branch commit

    Regardless of strictness level, removed lines are correlated with the
    commit they're blamed on, and all the blocks of changed lines in a hunk
    must be correlated with the same topic branch commit in order to be
    assigned to it. See the --help for more details.
-g ARG, --gitopt ARG
    Specify option for git. Can be used multiple times.
END

# Parse hunks out of `git diff` output. Return an array of hunk hashrefs.
sub parse_hunks {
    my $fh = shift;
    my ($file_a, $file_b);
    my @hunks;
    my $line;
    while ($line = <$fh>) {
        if ($line =~ /^--- (.*)/) {
            $file_a = $1;
        } elsif ($line =~ /^\+\+\+ (.*)/) {
            $file_b = $1;
        } elsif ($line =~ /^@@ -(\d+)(?:,(\d+))? \+\d+(?:,\d+)? @@/) {
            my $header = $line;

            for ($file_a, $file_b) {
                s#^[abiwco]/##;
            }

            next if $file_a ne $file_b; # Ignore creations and deletions.

            my $lines = [];
            while (1) {
                $line = <$fh>;
                if (!defined($line) || $line =~ /^[^ +\\-]/) {
                    last;
                }
                push @{$lines}, $line;
            }

            push(@hunks, {
                file => $file_a,
                start => $1,
                count => defined($2) ? $2 : 1,
                header => $header,
                lines => $lines,
            });
            # The next line after a hunk could be a header for the next commit
            # or hunk.
            redo if defined $line;
        }
    }
    return @hunks;
}

sub git_cmd {
    return ('git', @GIT_OPTIONS, @_);
}

sub summary_for_commits {
    my $rev = shift;
    my %commits;
    for (qx(git log --no-merges --format=%H:%s $rev..)) {
        chomp;
        my ($sha, $msg) = split ':', $_, 2;
        $commits{$sha} = $msg;
    }
    return \%commits;
}

# Return targets of fixup!/squash! commits.
sub sha_aliases {
    my $summary_for = shift;
    my %aliases;
    my @targets = keys(%{$summary_for});
    for my $sha (@targets) {
        my $summary = $summary_for->{$sha};
        next if $summary !~ /^(?:fixup|squash)! (.*)/;
        my $prefix = $1;
        if ($prefix =~ /^(?:(?:fixup|squash)! ){2}/) {
            die "fixup commits for fixup commits aren't supported: $sha";
        }
        my @matches = grep {startswith($summary_for->{$_}, $prefix)} @targets;
        if (@matches > 1) {
            die "ambiguous fixup commit target: multiple commit summaries start with: $prefix\n";
        } elsif (@matches == 0) {
            die "no fixup target in topic branch: $sha\n";
        } elsif (@matches == 1) {
            $aliases{$sha} = $matches[0];
        }
    }
    return \%aliases;
}

sub fixup_sha {
    my $args = shift;
    my $hunk = $args->{hunk};
    my $blame = $args->{blame};
    my $summary_for = $args->{summary_for};
    my $strict = $args->{strict};
    if (grep {!defined} ($hunk, $blame, $summary_for, $strict)) {
        croak 'missing argument';
    }

    my @targets;
    if ($args->{strict} == $CONTEXT) {
        @targets = fixup_targets_from_all_context($args);
        my @topic_targets = grep {defined $summary_for->{$_}} @targets;
        if (@topic_targets > 1) {
            # The context assignment is ambiguous, but an adjacency assignment
            # might not be.
            @targets = fixup_targets_from_adjacent_context($args);
        }
    } else {
        @targets = fixup_targets_from_adjacent_context($args);
    }

    my $upstream_is_blamed = grep {!defined $summary_for->{$_}} @targets;
    my @topic_targets = grep {defined $summary_for->{$_}} @targets;
    if ($strict && $upstream_is_blamed) {
        $VERBOSE && print hunk_desc($hunk), " changes lines blamed on upstream\n";
        return;
    } elsif (@topic_targets > 1) {
        $VERBOSE && print hunk_desc($hunk), " has multiple targets\n";
        return;
    } elsif (@topic_targets == 0) {
        $VERBOSE && print hunk_desc($hunk), " has no targets\n";
        return;
    }
    return $topic_targets[0];
}

sub hunk_desc {
    my $hunk = shift;
    return join " ", (
        $hunk->{file},
        $hunk->{header} =~ /(@@[^@]*@@)/,
    );
}

sub fixup_targets_from_all_context {
    my $args = shift;
    my ($hunk, $blame, $summary_for) = @{$args}{qw(hunk blame summary_for)};
    croak 'missing argument' if grep {!defined} ($hunk, $blame, $summary_for);

    my @targets = uniq(map {$_->{sha}} values(%{$blame}));
    return wantarray ? @targets : \@targets;
}

sub uniq {
    my %seen;
    return grep {!$seen{$_}++} @_;
}

sub fixup_targets_from_adjacent_context {
    my $args = shift;
    my $hunk = $args->{hunk};
    my $blame = $args->{blame};
    my $summary_for = $args->{summary_for};
    my $strict = $args->{strict};
    if (grep {!defined} ($hunk, $blame, $summary_for, $strict)) {
        croak 'missing argument';
    }

    my $blame_indexes = blame_indexes($hunk);

    my %blamed;
    my $diff = $hunk->{lines};
    for (my $di = 0; $di < @{$diff}; $di++) { # diff index
        my $bi = $blame_indexes->[$di];
        my $line = $diff->[$di];
        if (startswith($line, '-')) {
            my $sha = $blame->{$bi}{sha};
            $blamed{$sha} = 1;
        } elsif (startswith($line, '+')) {
            my @lines;
            if ($di > 0 && defined $blame->{$bi-1}) {
                push @lines, $bi-1;
            }
            if (defined $blame->{$bi}) {
                push @lines, $bi;
            }
            my @adjacent_shas = uniq(map {$_->{sha}} @{$blame}{@lines});
            my @target_shas = grep {defined $summary_for->{$_}} @adjacent_shas;
            # Note that lines at the beginning or end of a file can be
            # "surrounded" by a single line.
            my $is_surrounded = @target_shas > 0
                && @target_shas == @adjacent_shas
                && $target_shas[0] eq $target_shas[-1];
            my $is_adjacent = @target_shas == 1;
            if ($is_surrounded || ($strict < $SURROUNDED && $is_adjacent)) {
                $blamed{$target_shas[0]} = 1;
            }
            while ($di < @$diff-1 && startswith($diff->[$di+1], '+')) {
                $di++;
            }
        }
    }
    my @targets = keys %blamed;
    return wantarray ? @targets : \@targets;
}

sub startswith {
    my ($haystack, $needle) = @_;
    return index($haystack, $needle, 0) == 0;
}

# Map lines in a hunk's diff to the corresponding `git blame HEAD` output.
sub blame_indexes {
    my $hunk = shift;
    my @indexes;
    my $bi = $hunk->{start};
    for (my $di = 0; $di < @{$hunk->{lines}}; $di++) {
        push @indexes, $bi;
        my $first = substr($hunk->{lines}[$di], 0, 1);
        if ($first eq '-' or $first eq ' ') {
            $bi++;
        }
        # Don't increment $bi for added lines.
    }
    return \@indexes;
}

sub print_hunk_blamediff {
    my $args = shift;
    my $fh = $args->{fh};
    my $hunk = $args->{hunk};
    my $summary_for = $args->{summary_for};
    my $blame = $args->{blame};
    my $blame_indexes = $args->{blame_indexes};
    if (grep {!defined} ($fh, $hunk, $summary_for, $blame, $blame_indexes)) {
        croak 'missing argument';
    }

    my $format = "%-8.8s|%4.4s|%-30.30s|%-30.30s\n";
    for (my $i = 0; $i < @{$hunk->{lines}}; $i++) {
        my $line_r = $hunk->{lines}[$i];
        my $bi = $blame_indexes->[$i];
        my $sha = defined $blame->{$bi} ? $blame->{$bi}{sha} : undef;

        my $display_sha = defined($sha) ? $sha : q{};
        my $display_bi = $bi;
        if (startswith($line_r, '+')) {
            $display_sha = q{}; # For added lines.
            $display_bi = q{};
        }
        if (defined($sha) && !defined($summary_for->{$sha})) {
            # For lines from before the given upstream revision.
            $display_sha = '^';
        }

        my $line_l = '';
        if (defined $blame->{$bi} && !startswith($line_r, '+')) {
            $line_l = $blame->{$bi}{text};
        }

        for ($line_l, $line_r) {
            # For the table to line up, tabs need to be converted to a string of fixed width.
            s/\t/^I/g;
            # Remove trailing newlines and carriage returns. If more trailing
            # whitespace is removed, that's fine.
            $_ = rtrim($_);
        }

        printf {$fh} $format, $display_sha, $display_bi, $line_l, $line_r;
    }
    print {$fh} "\n";
    return;
}

sub rtrim {
    my $s = shift;
    $s =~ s/\s+\z//;
    return $s;
}

sub blame {
    my ($hunk, $alias_for) = @_;
    if ($hunk->{count} == 0) {
        return {};
    }
    my @cmd = git_cmd(
        'blame', '--porcelain',
        '-L' => "$hunk->{start},+$hunk->{count}",
        'HEAD', '--',
        "$hunk->{file}");
    my %blame;
    my ($sha, $line_num);
    open(my $fh, '-|', @cmd) or die "git blame: $!\n";
    while (my $line = <$fh>) {
        if ($line =~ /^([0-9a-f]{40}) \d+ (\d+)/) {
             ($sha, $line_num) = ($1, $2);
        }
        if (startswith($line, "\t")) {
            if (defined $alias_for->{$sha}) {
                $sha = $alias_for->{$sha};
            }
            $blame{$line_num} = {sha => $sha, text => substr($line, 1)};
        }
    }
    close($fh) or die "git blame: non-zero exit code";
    return \%blame;
}

sub diff_hunks {
    my $num_context_lines = shift;
    my @cmd = git_cmd(qw(-c diff.noprefix=false diff --no-ext-diff --ignore-submodules), "-U$num_context_lines");
    if (is_index_dirty()) {
        push @cmd, "--cached";
    }
    open(my $fh, '-|', @cmd) or die $!;
    my @hunks = parse_hunks($fh, keep_lines => 1);
    close($fh) or die "git diff: non-zero exit code";
    return wantarray ? @hunks : \@hunks;
}

sub commit_fixup {
    my ($sha, $hunks) = @_;
    open my $fh, '|-', git_cmd(qw(apply --unidiff-zero --cached -)) or die "git apply: $!\n";
    for my $hunk (@{$hunks}) {
        print({$fh}
            "--- a/$hunk->{file}\n",
            "+++ b/$hunk->{file}\n",
            $hunk->{header},
            @{$hunk->{lines}},
        );
    }
    close $fh or die "git apply: non-zero exit code\n";
    system(git_cmd('commit', "--fixup=$sha")) == 0 or die "git commit: $!\n";
    return;
}

sub is_index_dirty {
    return system(git_cmd(qw(diff-index --cached HEAD --quiet))) != 0;
}

sub fixup_hunks_by_sha {
    my $args = shift;
    my $hunks = $args->{hunks};
    my $blame_for = $args->{blame_for};
    my $summary_for = $args->{summary_for};
    my $strict = $args->{strict};
    if (grep {!defined} ($hunks, $blame_for, $summary_for, $strict)) {
        croak 'missing argument';
    }

    my %hunks_for;
    for my $hunk (@{$hunks}) {
        my $blame = $blame_for->{$hunk};
        my $sha = fixup_sha({
            hunk => $hunk,
            blame => $blame,
            summary_for => $summary_for,
            strict => $strict,
        });
        if ($sha && $VERBOSE) {
            printf "%s fixes %s %s\n",
                hunk_desc($hunk),
                substr($sha, 0, 8),
                $summary_for->{$sha};
        }
        if ($VERBOSE > 1) {
            print_hunk_blamediff({
                fh => *STDOUT,
                hunk => $hunk,
                summary_for => $summary_for,
                blame => $blame,
                blame_indexes => blame_indexes($hunk)
            });
        }
        next if !$sha;
        push @{$hunks_for{$sha}}, $hunk;
    }
    return \%hunks_for;
}

# Return SHAs in some consistent order.
#
# Currently they're ordered by how early their assigned hunks appear in the
# diff output. This assumes $hunks is in the order it was parsed from the diff.
# This ordering seems nice since it'd be similar to the order a human would
# make commits in if they were working their way down the diff.
sub ordered_shas {
    my $hunks = shift;
    my $sha_for = shift;
    my @ordered = ();
    for my $hunk (@{$hunks}) {
        if (defined $sha_for->{$hunk}) {
            push @ordered, $sha_for->{$hunk};
        }
    }
    return uniq(@ordered);
}

# Reverse the sha->hunks hashef and return a hunk->sha hashref.
sub sha_for_hunk_map {
    my $hunks_for = shift;
    my %sha_for;
    for my $sha (keys %{$hunks_for}) {
        for my $hunk (@{$hunks_for->{$sha}}) {
            if (defined $sha_for{$hunk}) {
                die "multiple SHAs for hunk";  # This should never happen.
            }
            $sha_for{$hunk} = $sha;
        }
    }
    return \%sha_for;
}

sub exit_code {
    my ($hunks, $hunks_for) = @_;
    my $hunk_count = scalar @{$hunks};

    my $assigned_hunk_count = 0;
    for (values %{$hunks_for}) {
        $assigned_hunk_count += @{$_};
    }

    my $rc;
    if ($hunk_count == 0) {
        $rc = 3;  # no hunks to assign
    } elsif ($assigned_hunk_count == 0) {
        $rc = 2;  # hunks exist, but none assigned
    } elsif ($assigned_hunk_count < $hunk_count) {
        $rc = 1;  # not all hunks assigned
    } elsif ($hunk_count == $assigned_hunk_count) {
        $rc = 0;  # all hunks assigned
    } else {
        die "unexpected conditions when choosing exit code";
    }
    return $rc;
}

# Create a temporary index so we can craft commits with already-staged hunks.
# Return a File::Temp object so the caller has control over its lifetime.
sub create_temp_index {
    my $tempfile = File::Temp->new(
        TEMPLATE => 'git-autofixup_index.XXXXXX',
        DIR => File::Spec->tmpdir());
    $ENV{GIT_INDEX_FILE} = $tempfile->filename();
    # A blank index makes it look like we're deleting everything, so read
    # HEAD's tree into it.
    qx(git read-tree HEAD^{tree});
    $? == 0 or die "Can't read HEAD's tree into temp index.\n";
    return $tempfile;
}

sub main {
    $VERBOSE = 0;
    my $help;
    my $man;
    my $show_version;
    my $strict = $CONTEXT;
    my $num_context_lines = 3;
    my $dryrun;
    my $use_detailed_exit_codes;

    GetOptions(
        'h' => \$help,
        'help' => \$man,
        'version' => \$show_version,
        'verbose|v+' => \$VERBOSE,
        'strict|s=i' => \$strict,
        'context|c=i' => \$num_context_lines,
        'dryrun|n' => \$dryrun,
        'gitopt|g=s' => \@GIT_OPTIONS,
        'exit-code' => \$use_detailed_exit_codes,
    ) or return 1;
    if ($help) {
        print $usage;
        return 0;
    }
    if ($show_version) {
        print "$VERSION\n";
        return 0;
    }
    if ($man) {
        pod2usage(-exitval => 0, -verbose => 2);
    }

    @ARGV == 1 or die "No upstream commit given.\n";
    my $upstream = shift @ARGV;
    qx(git rev-parse --verify ${upstream}^{commit});
    $? == 0 or die "Can't resolve given commit.\n";

    if ($num_context_lines < 0) {
        die "invalid number of context lines: $num_context_lines\n";
    }

    if ($strict < 0) {
        die "invalid strictness level: $strict\n";
    } elsif ($strict > 0 && $num_context_lines == 0) {
        die "strict hunk assignment requires context\n";
    }


    my $toplevel = qx(git rev-parse --show-toplevel);
    chomp $toplevel;
    $? == 0 or die "Can't get repo toplevel dir\n";
    chdir $toplevel or die $!;

    my $hunks = diff_hunks($num_context_lines);
    my $summary_for = summary_for_commits($upstream);
    my $alias_for = sha_aliases($summary_for);
    my %blame_for = map {$_ => blame($_, $alias_for)} @{$hunks};
    my $hunks_for = fixup_hunks_by_sha({
        hunks => $hunks,
        blame_for => \%blame_for,
        summary_for => $summary_for,
        strict => $strict,
    });
    my @ordered_shas = ordered_shas($hunks, sha_for_hunk_map($hunks_for));

    if ($dryrun) {
        if ($use_detailed_exit_codes) {
            return exit_code($hunks, $hunks_for);
        }
        return 0;
    }

    local $ENV{GIT_INDEX_FILE};  # Throw away changes between main() calls.
    if (is_index_dirty()) {
        # Limit the tempfile's lifetime to the execution of main().
        my $tempfile = create_temp_index();
    }

    for my $sha (@ordered_shas) {
        my $fixup_hunks = $hunks_for->{$sha};
        commit_fixup($sha, $fixup_hunks);
    }
    if ($use_detailed_exit_codes) {
        return exit_code($hunks, $hunks_for);
    }
    return 0;
}

if (!caller()) {
    exit main();
}
1;

__END__

=pod

=head1 NAME

App::Git::Autofixup - create fixup commits for topic branches

=head1 SYNOPSIS

    git-autofixup [<options>] <revision>

=head1 DESCRIPTION

F<git-autofixup> parses hunks of changes in the working directory out of C<git diff> output and uses C<git blame> to assign those hunks to commits in C<E<lt>revisionE<gt>..HEAD>, which will typically represent a topic branch, and then creates fixup commits to be used with C<git rebase --interactive --autosquash>. It is assumed that hunks near changes that were previously committed to the topic branch are related.

C<@{upstream}> or C<@{u}> is likely a convenient value to use for C<E<lt>revisionE<gt>> if the current branch has a tracking branch. See C<git help revisions> for other ways to specify revisions.

If any changes have been staged to the index using C<git add>, then F<git-autofixup> will only consider staged hunks when trying to create fixup commits. A temporary index is used to create any resulting commits.

By default a hunk will be included in a fixup commit if all the lines in the hunk's context blamed on topic branch commits refer to the same commit, so there's no ambiguity about which commit the hunk corresponds to. If there is ambiguity the assignment behaviour used under C<--strict 1> will be used to attempt to resolve it. If C<--strict 1> is given the same topic branch commit must be blamed for every removed line and at least one of the lines adjacent to each added line, and added lines must not be adjacent to lines blamed on other topic branch commits. All the same restrictions apply when C<--strict 2> is given, but each added line must be surrounded by lines blamed on the same topic branch commit.

For example,  the added line in the hunk below is adjacent to lines committed by commits C<99f370af> and C<a1eadbe2>. If these are both topic branch commits then it's ambiguous which commit the added line is fixing up and the hunk will be ignored.

    COMMIT  |LINE|HEAD                          |WORKING DIRECTORY
    99f370af|   1|first line                    | first line
            |    |                              |+added line
    a1eadbe2|   2|second line                   | second line

But if that second line were instead blamed on an upstream commit (denoted by C<^>), the hunk would be added to a fixup commit for C<99f370af>:

    99f370af|   1|first line                    | first line
            |    |                              |+added line
    ^       |   2|second line                   | second line

Output similar to this example can be generated by setting verbosity to 2 or greater by using the verbosity option multiple times, eg. C<git-autofixup -vv>, and can be helpful in determining how a hunk will be handled.

F<git-autofixup> is not to be used mindlessly. Always inspect the created fixup commits to ensure hunks have been assigned correctly, especially when used on a working directory that has been changed with a mix of fixups and new work.

=head1 OPTIONS

=over

=item -h

Show usage.

=item --help

Show manpage.

=item --version

Show version.

=item -v, --verbose

Increase verbosity. Can be used up to two times.

=item -c N, --context N

Change the number of context lines C<git diff> uses around hunks. Default: 3. This can change how hunks are assigned to fixup commits, especially with C<--strict 0>.

=item -s N, --strict N

Set how strict F<git-autofixup> is about assigning hunks to fixup commits. Default: 0. Strictness levels are described under DESCRIPTION.

=item -g ARG, --gitopt ARG

Specify option for git. Can be used multiple times. Useful for testing, to override config options that break git-autofixup, or to override global diff options to tweak what git-autofixup considers a hunk.

Note ARG won't be wordsplit, so to give multiple arguments, such as for setting a config option like C<-c diff.algorithm>, this option must be used multiple times: C<-g -c -g diff.algorithm=patience>.

=item -e, --exit-code

Use more detailed exit codes:

=over

=item 0:

All hunks have been assigned.

=item 1:

Only some hunks have been assigned.

=item 2:

No hunks have been assigned.

=item 3:

There was nothing to be assigned.

=item 255:

Unexpected error occurred.

=back

=back

=head1 INSTALLATION

If cpan is available, run C<cpan -i App::Git::Autofixup>. Otherwise, copy F<git-autofixup> to a directory in C<PATH> and ensure it has execute permissions. It can then be invoked as either C<git autofixup> or C<git-autofixup>, since git searches C<PATH> for appropriately named binaries.

Git is distributed with Perl 5 for platforms not expected to already have it installed, but installing modules with cpan requires other tools that might not be available, such as make. This script has no dependencies outside of the standard library, so it is hoped that it works on any platform that Git does without much trouble.

Requires a git supporting C<commit --fixup>: 1.7.4 or later.


=head1 BUGS/LIMITATIONS

If a topic branch adds some lines in one commit and subsequently removes some of them in another, a hunk in the working directory that re-adds those lines will be assigned to fixup the first commit, and during rebasing they'll be removed again by the later commit.

Not tested in F<cmd.exe> on Windows. Run it from Git Bash, Cygwin, or a similar Unix emulation environment.

=head1 ACKNOWLEDGEMENTS

F<git-autofixup> was inspired by a description of L<hg absorb|https://bitbucket.org/facebook/hg-experimental/src/38d6e5d7f355f58330cd707059baac38d69a1210/hgext3rd/absorb/__init__.py> in the L<Mercurial Sprint Notes|https://groups.google.com/forum/#!topic/mozilla.dev.version-control/nh4fITFlEMk>. While I was working on it I found L<git-superfixup|https://gist.github.com/oktal3700/cafe086b49c89f814be4a7507a32a3f7>, by oktal3700, which was helpful to examine.

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2017, Jordan Torbiak.

This program is free software; you can redistribute it and/or modify it under the terms of the Artistic License v2.0.

=cut