#!/usr/bin/perl

=begin metadata

Name: uniq
Description: report or filter out repeated lines in a file
Author: Jonathan Feinberg, jdf@pobox.com
License: perl

=end metadata

=cut


# uniq - report or filter out repeated lines in a file

use strict;

use constant EX_SUCCESS => 0;
use constant EX_FAILURE => 1;

my $VERSION = '1.2';

END {
  # Close stdout explicitly so that a failure to flush buffered output is
  # reported instead of being lost at interpreter shutdown.
  unless (close STDOUT) {
    die "$0: can't close stdout: $!\n";
  }
  # An exit status of 255 is what die leaves behind; report it as 1.
  if ($? == 255) {
    $? = 1;
  }
}

# Print a synopsis of the command line and terminate with EX_FAILURE.
#
# This sub is called after a command-line error has been reported, so the
# synopsis goes to standard error: standard output is the data stream of
# this filter and must not be polluted with diagnostics.
#
# Does not return.
sub usage {
  print STDERR "usage: $0 [-c | -d | -u] [-f fields] [-s chars] ",
    "[input_file [output_file]]\n";
  exit EX_FAILURE;
}

# Print the program name and version on standard output, then exit with
# status 0.  Does not return.
sub version {
  printf "%s (Perl Power Tools) %s\n", $0, $VERSION;
  exit 0;
}

# Command-line option state; all start out undefined and are set by the
# argument-parsing loop.
my $optc;   # -c: prefix each output line with its occurrence count
my $optd;   # -d: print only lines that occur more than once in a row
my $optf;   # -f N (or -N): number of leading fields left out of comparisons
my $opts;   # -s N (or +N): number of leading characters left out of comparisons
my $optu;   # -u: print only lines that occur exactly once in a row

# Fetch and validate the value of an option that takes a numeric argument.
#
# On entry $_ holds whatever followed the option letter in the current
# argument (possibly the empty string).  When it is empty the value is
# taken from the next element of @ARGV, which is consumed.
#
#   $argname - option letter, used in the "requires an argument" message
#   $desc    - description of the quantity, used in the "invalid" message
#
# Returns the value, a non-empty string of decimal digits.  Dies if no
# value is available or if the value contains anything but digits.
sub get_numeric_arg {
  my ($argname, $desc) = @_;
  my $opt;
  if    (length) { $opt = $_ }
  elsif (@ARGV)  { $opt = shift @ARGV }
  else           { die "$0: option requires an argument -- $argname\n" }
  # Anchor with \z rather than \Z: \Z also matches before a trailing
  # newline, which would let a value such as "5\n" through.
  die "$0: invalid number of $desc: `$opt'\n" unless $opt =~ m/\A[0-9]+\z/;
  return $opt;
}

# Consume leading options from @ARGV.  An argument beginning with '-' or '+'
# is treated as an option until '--' (end of options) or a lone '-' (the
# standard-input operand) is met.
while (@ARGV && $ARGV[0] =~ /^[-+]/) {
  local $_ = shift;
  last if $_ eq '--';
  if ($_ eq '-') {
    # Not an option: put it back so it is picked up as input_file below.
    unshift @ARGV, $_;
    last;
  }
  version() if /^-v$/;                     # terminates
  if (/^-c$/)      { $optc++;    next }
  if (/^-d$/)      { $optd++;    next }
  if (/^-u$/)      { $optu++;    next }
  if (/^-(\d+)$/)  { $optf = $1; next }    # historic spelling of -f
  if (/^\+(\d+)$/) { $opts = $1; next }    # historic spelling of -s
  # For -f and -s the option letter is stripped first, leaving any attached
  # value in $_, which is where get_numeric_arg() looks before @ARGV.
  if (s/^-f//) { $optf = get_numeric_arg('f', 'fields to skip'); next }
  if (s/^-s//) { $opts = get_numeric_arg('s', 'bytes to skip');  next }

  warn "$0: invalid option -- $_\n";
  usage();
}

# What remains is at most an input file followed by an output file.
my $infile = shift;
my $outfile = shift;
if (@ARGV) {
  warn "$0: unexpected argument: '$ARGV[0]'\n";
  usage();
}

# An input_file operand of '-' denotes standard input.  Leaving $infile
# undefined is how "read standard input" is expressed to the code that
# opens the input.
undef $infile if defined $infile && $infile eq '-';
# Input and output handles, plus the working state of the comparison loop.
my ($fh, $out, $comp, $save_comp, $line, $save_line, $count, $eof);

# Input: standard input unless a file was named.  A directory is rejected
# up front with its own message.
if (!defined $infile) {
  $fh = *STDIN;
} elsif (-d $infile) {
  warn "$0: '$infile' is a directory\n";
  exit EX_FAILURE;
} elsif (!open($fh, '<', $infile)) {
  warn "$0: failed to open '$infile': $!\n";
  exit EX_FAILURE;
}

# Output: standard output unless a file was named.
if (!defined $outfile) {
  $out = *STDOUT;
} elsif (!open($out, '>', $outfile)) {
  warn "$0: failed to open '$outfile': $!\n";
  exit EX_FAILURE;
}

# Reduce an input line to the text that takes part in the comparison.
#
# The line terminator is removed first so that it never counts as content:
# a final line without a newline then compares equal to an otherwise
# identical terminated line, and -s cannot consume the newline as if it
# were a character.  After that the first $optf blank-separated fields and
# then the first $opts characters are dropped.  A line too short to have
# anything left yields the empty string.
sub comparison_key {
  my ($text) = @_;
  chomp $text;
  if ($optf) {
    # Limit the split to $optf+1 pieces so that piece number $optf is the
    # whole remainder of the line, not just the next field.
    ($text) = (split ' ', $text, $optf + 1)[$optf];
    $text = '' unless defined $text;
  }
  if ($opts) {
    $text = $opts < length($text) ? substr($text, $opts) : '';
  }
  return $text;
}

# prime the pump
$line = <$fh>;
exit EX_SUCCESS unless defined $line;    # empty input: nothing to report
$comp = comparison_key($line);

LINES:
while (!$eof) {
  # $line/$comp hold the first line of the next group: either the primed
  # line or the line that ended the previous group.
  $save_line = $line;
  $save_comp = $comp;
  $count = 1;
 DUPS:
  while (!($eof = eof($fh))) {
    $line = <$fh>;
    $comp = comparison_key($line);
    last DUPS if $comp ne $save_comp;
    ++$count;
  }
  # when we get here, $save_line is the first occurrence of a sequence
  # of duplicate lines, $count is the number of times it appears
  if    ($optc) { printf {$out} '%7d %s', $count, $save_line }
  elsif ($optd) { print  {$out} $save_line if $count >  1 }
  elsif ($optu) { print  {$out} $save_line if $count == 1 }
  else          { print  {$out} $save_line }
}

# Errors on buffered writes only surface when the handle is closed, so a
# named output file is closed explicitly and checked.  When writing to
# standard output the END block performs the equivalent check.
if (defined $outfile) {
  unless (close $out) {
    warn "$0: failed to close '$outfile': $!\n";
    exit EX_FAILURE;
  }
}

exit EX_SUCCESS;

__END__

=head1 NAME

uniq - report or filter out repeated lines in a file

=head1 SYNOPSIS

    uniq [B<-c> | B<-d> | B<-u>] [B<-f> I<fields>] [B<-s> I<chars>]
         [input_file [output_file]]

=head1 DESCRIPTION

The uniq utility reads the standard input comparing adjacent lines and
writes a copy of each unique input line to the standard output.  The
second and succeeding copies of identical adjacent input lines are not
written.  Repeated lines in the input will not be detected if they are not
adjacent, so it may be necessary to sort the files first.

The following options are available:

=over

=item -c

Precede each output line with the count of the number of times the
line occurred in the input, followed by a single space.

=item -d

Don't output lines that are not repeated in the input.

=item -f I<fields>

Ignore the first fields in each input line when doing comparisons.
A field is a string of non-blank characters separated
from adjacent fields by blanks.  Field numbers are one based,
i.e. the first field is field one.

=item -s I<chars>

Ignore the first chars characters in each input line when doing
comparisons.  If specified in conjunction with the B<-f> option, the
first chars characters after the first fields fields will be ignored.
Character numbers are one based, i.e. the first character is
character one.

=item -u

Don't output lines that are repeated in the input.

=back

If additional arguments are specified on the command line, the first is
used as the name of the input file and the second as the name of the
output file.

The uniq utility exits 0 on success or >0 if an error occurred.

=head1 COMPATIBILITY

The historic B<->I<number> and B<+>I<number> options are supported as
synonyms for B<-f> I<fields> and B<-s> I<chars>, respectively.

This version accepts 0 as a valid argument for the B<-f> and
B<-s> switches; some implementations of uniq do not.

=head1 SEE ALSO

sort(1)

=head1 BUGS

I<uniq> has no known bugs.

=head1 AUTHOR

The Perl implementation of I<uniq> was written by Jonathan Feinberg,
I<jdf@pobox.com>.

=head1 COPYRIGHT and LICENSE

This program is copyright (c) Jonathan Feinberg 1999.

This program is free and open software. You may use, modify, distribute,
and sell this program (and any modified variants) in any way you wish,
provided you do not restrict others from doing the same.