#!/usr/bin/perl =begin metadata Name: uniq Description: report or filter out repeated lines in a file Author: Jonathan Feinberg, jdf@pobox.com License: perl =end metadata =cut # uniq - report or filter out repeated lines in a file use strict; use constant EX_SUCCESS => 0; use constant EX_FAILURE => 1; my $VERSION = '1.2'; END { close STDOUT || die "$0: can't close stdout: $!\n"; $? = 1 if $? == 255; # from die } sub usage { print "usage: $0 [-c | -d | -u] [-f fields] [-s chars] ", "[input_file [output_file]]\n"; exit EX_FAILURE; } sub version { print "$0 (Perl Power Tools) $VERSION\n"; exit 0; } # options my ($optc, $optd, $optf, $opts, $optu); sub get_numeric_arg { # $_ contains current arg my ($argname, $desc) = @_; my $opt; if (length) { $opt = $_ } elsif (@ARGV) { $opt = shift @ARGV } else {die "$0: option requires an argument -- $argname\n"} die "$0: invalid number of $desc: `$opt'\n" unless $opt =~ m/\A[0-9]+\Z/; $opt; } while (@ARGV && $ARGV[0] =~ /^[-+]/) { local $_ = shift; last if ($_ eq '--'); /^-v$/ && version(); # terminates /^-c$/ && ($optc++, next); /^-d$/ && ($optd++, next); /^-u$/ && ($optu++, next); /^-(\d+)$/ && (($optf = $1), next); /^\+(\d+)$/ && (($opts = $1), next); s/^-f// && (($optf = get_numeric_arg('f', 'fields to skip')), next); s/^-s// && (($opts = get_numeric_arg('s', 'bytes to skip')), next); warn "$0: invalid option -- $_\n"; usage(); } my $infile = shift; my $outfile = shift; if (@ARGV) { warn "$0: unexpected argument: '$ARGV[0]'\n"; usage(); } my ($fh, $out, $comp, $save_comp, $line, $save_line, $count, $eof); if (defined $infile) { if (-d $infile) { warn "$0: '$infile' is a directory\n"; exit EX_FAILURE; } unless (open $fh, '<', $infile) { warn "$0: failed to open '$infile': $!\n"; exit EX_FAILURE; } } else { $fh = *STDIN; } if (defined $outfile) { unless (open $out, '>', $outfile) { warn "$0: failed to open '$outfile': $!\n"; exit EX_FAILURE; } } else { $out = *STDOUT; } # prime the pump $comp = $line = <$fh>; exit EX_SUCCESS unless defined $line; if ($optf) {($comp) = (split ' ', $comp, $optf+1)[$optf] } if ($opts) { $comp = substr($comp, $opts) } LINES: while (!$eof) { $save_line = $line; $save_comp = $comp; $count = 1; DUPS: while (!($eof = eof($fh))) { $comp = $line = <$fh>; if ($optf) {($comp) = (split ' ', $comp, $optf+1)[$optf] } if ($opts) { $comp = substr($comp, $opts) } last DUPS if $comp ne $save_comp; ++$count; } # when we get here, $save_line is the first occurrence of a sequence # of duplicate lines, $count is the number of times it appears if ($optc) { printf {$out} '%7d %s', $count, $save_line } elsif ($optd) { print {$out} $save_line if $count > 1 } elsif ($optu) { print {$out} $save_line if $count == 1 } else { print {$out} $save_line } } exit EX_SUCCESS; __END__ =head1 NAME uniq - report or filter out repeated lines in a file =head1 SYNOPSIS uniq [B<-c> | B<-d> | B<-u>] [B<-f> I<fields>] [B<-s> I<chars>] [input_file [output_file]] =head1 DESCRIPTION The uniq utility reads the standard input comparing adjacent lines and writes a copy of each unique input line to the standard output. The second and succeeding copies of identical adjacent input lines are not written. Repeated lines in the input will not be detected if they are not adjacent, so it may be necessary to sort the files first. The following options are available: =over =item -c Precede each output line with the count of the number of times the line occurred in the input, followed by a single space. =item -d Don't output lines that are not repeated in the input. =item -f I<fields> Ignore the first fields in each input line when doing comparisons. A field is a string of non-blank characters separated from adjacent fields by blanks. Field numbers are one based, i.e. the first field is field one. =item -s I<chars> Ignore the first chars characters in each input line when doing comparisons. If specified in conjunction with the B<-f> option, the first chars characters after the first fields fields will be ignored. Character numbers are one based, i.e. the first character is character one. =item -u Don't output lines that are repeated in the input. =back If additional arguments are specified on the command line, they are used as the names of input files. The uniq utility exits 0 on success or >0 if an error occurred. =head1 COMPATIBILITY The historic B<->I<number> and B<+>I<number> options are supported as synonyms for B<-f> I<fields> and B<-s> I<chars>, respectively. This version accepts 0 as a valid argument for the B<-f> and B<-s> switches; some implementations of uniq do not. =head1 SEE ALSO sort(1) =head1 BUGS I<uniq> has no known bugs. =head1 AUTHOR The Perl implementation of I<uniq> was written by Jonathan Feinberg, I<jdf@pobox.com>. =head1 COPYRIGHT and LICENSE This program is copyright (c) Jonathan Feinberg 1999. This program is free and open software. You may use, modify, distribute, and sell this program (and any modified variants) in any way you wish, provided you do not restrict others from doing the same.