# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#   Copyright (c) 2002-2003 Vivendi Universal Net USA
#
#   May be copied under the same terms as perl itself.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# Database-like operations on tab-delimited files.
#
# Given two files:
# band_data.tab with fields band_id, band_name, and band_status
# song_data.tab with fields song_id, band_id, song_title
#
# The following sequence is more or less equivalent to
#
#    SELECT song_id, band_data.band_id AS band_id, 
#           song_title, band_name, 
#           int(band_id/1000) AS band_dir
#     FROM song_data INNER JOIN band_data ON song_data.band_id=band_data.band_id
#     WHERE band_status = 'APPROVED' 
#  ORDER BY band_name
# INTO TABLE songband
#
#
#  $band_data = Text::TabTable->import_headered("band_data.tab") ;
#  $song_data = Text::TabTable->import_headered("song_data.tab") ;
#  $joined = $song_data->join($band_data, "band_id", "band_id", "INNER") ;
#  $selected = $joined->select(
#               [
#                 'song_id',
#                 ['band_data.band_id', 'band_id'],
#                 'song_title',
#                 'band_name',
#                 [ sub { int($_[0] / 1000) }, "band_dir", ["band_id"]],
#               ],
# 
#               sub { $_[0]->band_status eq 'APPROVED' },
#             ) ;
# $out = $selected->order("band_name") ;
# $out->export_headered("songband.tab") ;
#
##############################################################################
#
# You can speed up LEFT and INNER joins on primary keys if you create an index 
# for the primary key column on the *right-side* table using
# 
# $righttable->build_primary_index("band_id") ;
# $newtable = $lefttable->join($righttable, "band_id", "band_id", "LEFT") ;
#
# If both tables are already sorted by the primary key because order() was
# previously used, this will be slower.
#
# The index will not be used for RIGHT joins.

package Text::TabTable ;

use strict ;
use Carp ;
use Data::Dumper ;
use Fcntl qw(O_WRONLY O_EXCL O_CREAT) ;

use vars qw($SORT $JOIN $VERBOSE $TMPDIR $VERSION) ;

$VERSION = "1.02" ;

$TMPDIR = "." ;
$SORT = "/bin/sort" ;
$JOIN = "/usr/bin/join" ;

$VERBOSE=$ENV{TABTABLE_VERBOSE} ;

####
# Constructor.  Takes a tab delimited file with a field name
# header line and returns a TabTable object.  Parses the header line
# and creates a temporary file without the header line.
####
sub import_headered
{
  my ($package,$fname) = @_ ;
  my $newf = _make_tempfile() ;

  carp "importing $fname" if $VERBOSE ;
  open(F, $fname) || return undef ;
  open(NEWF, ">$newf") || return undef ;
  my $header = <F> ;
  my $buf ;

  # copy the unheadered version of the file to a new file.
  while( read(F, $buf, 2048) ) {
    print NEWF $buf ;
  }
  close F ;
  close NEWF ;

  chomp $header ;
  my @fieldnames = split(/\t/, $header) ;
  my @fields = map { Text::TabTable::Field->new($_) } @fieldnames ;

  my $name = $fname ;
  $name =~ s/\..*$// ;   # remove extensions
  $name =~ s@.*\/@@ ;   # remove path

  my $self = {
               filename => $newf,
	       fieldlist => Text::TabTable::FieldList->new(@fields),
	       name => $name,
	     } ;
  bless $self, $package ;
}

####
# Alternate constructor.  Takes a tab delimited file *without* a field name
# header line, plus the field names,  and returns a TabTable object.  This
# saves time because it doesn't require making a tempfile without the header.
####
sub import_unheadered
{
  my ($package,$fname, @fieldnames) = @_ ;
  my $newf = _make_tempfile() ;

  carp "importing $fname (unheadered)" if $VERBOSE ;
  return undef if !-f $fname || !-r $fname ;

  my @fields = map { Text::TabTable::Field->new($_) } @fieldnames ;

  my $name = $fname ;
  $name =~ s/\..*$// ;   # remove extensions
  $name =~ s@.*\/@@ ;   # remove path

  my $self = {
               filename => $fname,
	       dontdelete => 1,     # so we know it's not a tempfile.
	       fieldlist => Text::TabTable::FieldList->new(@fields),
	       name => $name,
	     } ;
  bless $self, $package ;
}


####
# Undoes the escaping done by MediaExtractor.
####
sub unescape
{
  my ($str) = @_ ;

  my $x = $str ;
  $str =~ s/\\\\/\xff/g ;
  $str =~ s/\\n/\n/g ;
  $str =~ s/\\t/\t/g ;
  $str =~ s/\xff/\\/g ;

  return $str ;
}

####
# This is the same escaping done by MediaExtractor.
####
sub escape
{
  my ($str) = @_ ;

  $str =~ s/\\/\\\\/g;
  $str =~ s/\t/\\t/g;
  $str =~ s/\n/\\n/g;


  return $str ;
}

####
# Writes out a table as a file with a header.
####
sub export_headered
{
  my ($self, $filename) = @_ ;
  carp "exporting $self->{name} to $filename" if $VERBOSE ;

  open(F, ">$filename") || croak "$filename: $!\n" ;
  print F $self->{fieldlist}->as_string(), "\n" ;
  close F ;
  system "cat $self->{filename} >> $filename" ;
}

####
# Writes out a table as a file without a header.
####
sub export_unheadered
{
  my ($self, $filename) = @_ ;
  carp "exporting $self->{name} to $filename" if $VERBOSE ;
  system "cp $self->{filename}  $filename" ;
  if ($?) {
    unlink $filename ;
    croak "can't export to $filename: $!" ;
  }
}

####
# Returns a new table that has only one of each value in the specified
# column.
####
sub uniq
{
  my ($table, $colname) = @_ ;

  my $colnum = $table->{fieldlist}->find_colnum($colname) ;
  croak "no field $colname in table" if !$colname ;

  if (!$table->{sorted_column} || $table->{sorted_column} != $colnum) {
    my $name = $table->name() ;
    $table = $table->order($colname) ;
    $table->name($name) ;
  }

  carp "uniquing $table->{name} by $colname" if $VERBOSE ;

  my $newf = _make_tempfile() ;

  open(OLDF, "<$table->{filename}") || die ;
  open(NEWF, ">$newf") || croak "$newf: $!\n" ;

  my $oldval = undef ;
  while (<OLDF>) {
    chomp ;
    my @f = split(/\t/, $_, -1) ;
    if ($oldval ne $f[$colnum-1] || !defined $oldval) {
      print NEWF $_, "\n" ;
      $oldval = $f[$colnum-1] ;
    }
  }

  close(OLDF) ;
  close(NEWF) ;

  my $newtable = {
               filename => $newf,
	       fieldlist => $table->{fieldlist}->deepcopy(),
	       sorted_colnum => $colnum,
	       name => $table->name(),
	     } ;

  if (!defined wantarray ) {
    carp "Warning: Useless uniq in void context." ;
  }

  bless $newtable, ref $table ;
}

####
# Export to a cdb file.
# There will be a special key "*FIELDNAMES*" whose value is a tab
# separated list of the names of the fields.
# The rest of the cdb file will be of the form key => tab-delimited-values.
#
# The key must be unique; however as a special case multiple blank keys
# are allowed to be present; only the first one is used.  This is a hack,
# but is too good an optimization to pass up.
####
sub export_cdb
{
  my ($self, $filename, $colname) = @_ ;
  require CDB_File ;
  carp "exporting $self->{name} to cdb $filename" if $VERBOSE ;

  my $t = CDB_File->new($filename, "$filename.new$$") or croak "$filename: $!" ;

  $t->insert("*FIELDNAMES*", $self->{fieldlist}->as_string()) ;

  open(F, "< $self->{filename}") || die "$self->{filename}: $!" ;
  my $colnum = $self->{fieldlist}->find_colnum($colname) ;
  $colnum-- ;

  # Create a regex to skip over the columns before the key column and 
  # collect the key column in $1.
  my $regex = '^' . ('[^\t]*\t' x $colnum) . '([^\t]*)' ;
  $regex = qr($regex) ;
  my $didblankkey = 0 ;
  while (<F>) {
    chomp ;
    /$regex/ || die ;
    my $key = $1 ;

    next if $key eq '' && $didblankkey++ ;

    $t->insert($key, $_) ;
  }

  $t->finish() ;
  close(F) ;
}

####
# Returns the named column of the table as an array.
####
sub export_column
{
  my ($table, $colname) = @_ ;
  carp "exporting $table->{name} column $colname" if $VERBOSE ;

  my $colnum = $table->{fieldlist}->find_colnum($colname) ;

  if (!defined $colnum) {
    croak "no column $colname" ;
  }

  my @arr ;
  open(CUT, "cut -f$colnum $table->{filename}|") || die ;
  while(defined ($_=<CUT>)) {
    chomp ;
    push @arr, $_ ;
  }
  close CUT ;

  return @arr ;
}

####
# Returns a new TabTable that is sorted by the requested column.  Dies
# if no such column.  You can specify -descending=>1 or -numeric =>1
# after the fieldname.
# 
# The sort is stable, so you can sort on multiple fields by doing
# multiple sorts, with the most important one last.
####
sub order
{
  my ($self, $fieldname, %args) = @_ ;
  carp "sorting $self->{name} by $fieldname" if $VERBOSE ;

  my $newf = _make_tempfile() ;

  # This is a flag that gets turned off if the sort is not alphabetic
  # and ascending.  In that case, the sort order is not correct for the
  # join() method, and so join() would have to re-sort.
  my $joinable_sort = 1 ;

  my $colnum = $self->{fieldlist}->find_colnum($fieldname) ;
  if (!$colnum) {
    unlink $newf ;
    croak "No such field $fieldname" ;
  }

  my @sortargs = ("-s",
		  "-T$TMPDIR",
                  "-t\t", 
		  "-k$colnum,$colnum", 
		  "-o$newf", 
		  $self->{filename}
		) ;
  if ($args{-descending}) {
    unshift @sortargs, "-r" ;
    $joinable_sort = 0 ;
  }
  if ($args{-numeric}) {
    unshift @sortargs, "-n" ;
    $joinable_sort = 0 ;
  }

  system $SORT, @sortargs ;

  if ($?) {
    unlink $newf ;
    croak "sort error" ;
  }

  my $newtable = {
               filename => $newf,
	       fieldlist => $self->{fieldlist}->deepcopy(),
	       sorted_colnum => $joinable_sort ? $colnum : undef,
	       name => $self->name(),
	     } ;

  if (!defined wantarray ) {
    carp "Warning: Useless order in void context." ;
  }

  bless $newtable, ref $self ;
}

####
# Returns a new table with two columns.  The first column will contain the
# unique values of the specified field, the second column will contain the
# number of occurrences of that value.
####
sub groupby_and_count
{
  my ($table, $fieldname, $newfieldname, %args) = @_ ;
  my $colnum = $table->{fieldlist}->find_colnum($fieldname) ;
  if (!$colnum) {
    croak "No such field $fieldname" ;
  }
  # Create a temporary table that is sorted by the specified column.
  my $sortedtable = $table->order($fieldname, %args);

  # Taken from uniq(), One pass through the file counting the number
  # of times the specified column appears and creating a new file
  # with the specified column and count.
  my $newf = _make_tempfile() ;
  open(OLDF, "<$sortedtable->{filename}") || die ;
  open(NEWF, ">$newf") || croak "$newf: $!\n" ;

  my $count = 0 ;
  my $oldval = undef ;
  while (<OLDF>) {
    chomp ;
    my @f = split(/\t/, $_, -1) ;
    if (!defined $oldval) {
      $oldval = $f[$colnum-1] ;
      $count = 1 ;
    } elsif ($oldval ne $f[$colnum-1]) {
      print NEWF $oldval, "\t", $count, "\n" ;
      $oldval = $f[$colnum-1] ;
      $count = 1 ;
    } else {
      $count++ ;
    }
  }
  if (defined $oldval) {
    print NEWF $oldval, "\t", $count, "\n" ;
  }

  close(OLDF) ;
  close(NEWF) ;

  my @newfieldnames = ($fieldname, $newfieldname) ;
  my @newfields = map { Text::TabTable::Field->new($_) } @newfieldnames ;
  my $newtable = {
    filename => $newf,
    fieldlist => Text::TabTable::FieldList->new(@newfields),
    sorted_colnum => 1,
    name => $table->name(),
  } ;

  bless $newtable, ref $table ;
}

####
# Takes a list of pairs of oldname=>newname and changes the names of fields
# of the table.  This wipes out the old field names entirely.
sub rename_fields
{
  my ($table, @renames) = @_ ;

  my $oldname ;
  my $newname ;
  my @fields = $table->{fieldlist}->fields() ;
  while ( ($oldname = shift(@renames)) && ($newname = shift(@renames)) ) {
    my $colnum = $table->{fieldlist}->find_colnum($oldname) ;
    $fields[$colnum-1]->set_name($newname) ;
  }
}

####
# Gets or sets the name of the table.
####
sub name
{
  my ($self, $name) = @_ ;
  if (defined $name) {
    $self->{name} = $name ;
  }
  return $self->{name} ;
}

####
# takes a table and exports it as a cdb file, creating a primary key
# index.  This can make joins go faster if this table is on the right side
# of the join, since neither table has to be sorted, and building a cdb
# is generally faster than sorting (~O(n) instead of O(nLogn).
####
sub build_primary_index
{
  my ($self, $colname) = @_ ;
  my $newf = _make_tempfile() ;

  $self->export_cdb($newf, $colname) ;
  my $colnum = $self->{fieldlist}->find_colnum($colname) ;

  $self->{cdb}{$colnum} = $newf ;
}

####
# Returns a new table created by joining the two tables on a specified
# column.  the $side parameter can be specified as LEFT or RIGHT to
# create LEFT/RIGHT joins, or can be INNER, OUTER, or undef.
# $leftfield/$rightfield are the field names to be used in the two tables
# for joining.
#
# If the right table has a primary index on the join column (created
# by build_primary_index()), and it's either a left or inner join, 
# a simpler join algorithm will be used that does not require sorting.
#
# Both tables must have names.  Tables get names either by setting them
# with the name() method, or from the filename in the import_headered
# method.
####
sub join
{
  my ($lefttable, $righttable, $leftfield, $rightfield, $side) = @_ ;

  if (!$lefttable->name() || !$righttable->name()) {
    croak "both tables must have name()s" ;
  }

  my $leftcol = $lefttable->{fieldlist}->find_colnum($leftfield) ;
  croak "no field $leftfield in left table" if !$leftfield ;
  my $rightcol = $righttable->{fieldlist}->find_colnum($rightfield) ;
  croak "no field $rightfield in right table" if !$rightfield ;

  if ($righttable->{cdb}{$rightcol} && $side ne 'RIGHT' && $side ne 'OUTER') {
    if ($VERBOSE) {
      carp "index joining $lefttable->{name} with $righttable->{name}" ;
    }
    return $lefttable->_join_using_index($righttable, 
                                         $leftcol, $rightcol, $side) ;
  }

  # tables must be sorted by field.
  if (!$lefttable->{sorted_colnum} || $lefttable->{sorted_colnum} ne $leftcol) {
    $lefttable = $lefttable->order($leftfield) ;
  }
  if (!$righttable->{sorted_colnum} || $righttable->{sorted_colnum} ne $rightcol) {
    $righttable = $righttable->order($rightfield) ;
  }

  carp "joining $lefttable->{name} with $righttable->{name}" if $VERBOSE ;


  # create a format string for join(1).
  # Looks like
  # 1.1,1.2,1.3,1.4, ... ,2.1,2.2,2.3, ...

  my $format = 
       join(",", map { "1.$_" } 1..$lefttable->{fieldlist}->fieldcount())
       . "," .
       join(",", map { "2.$_" } 1..$righttable->{fieldlist}->fieldcount()) ;

  
  
  my $command = "$JOIN -1 $leftcol -2 $rightcol -o $format -t '\t' " ;

  if ($side eq 'LEFT') {
    $command .= "-a 1 "
  } elsif ($side eq 'RIGHT') {
    $command .= "-a 2 " ;
  } elsif ($side eq 'OUTER') {
    $command .= "-a 1 -a 2 " ;
  } elsif (defined $side && $side ne 'INNER') {
    croak "invalid side argument" ;
  }

  $command .= $lefttable->{filename} . " " ;
  $command .= $righttable->{filename} . " " ;

  my $newf = _make_tempfile() ;
  $command .= "> $newf" ;

  system $command ;

  croak "join failed" if $? ;


  # We've now joined the files, so we just have to create a fieldlist
  # for the new table.

  my $leftlistcopy = $lefttable->{fieldlist}->deepcopy ;
  foreach my $field ($leftlistcopy->fields) {
    $field->add_name( $lefttable->name . "." . $field->name() ) ;
  }
  my $rightlistcopy = $righttable->{fieldlist}->deepcopy ;
  foreach my $field ($rightlistcopy->fields) {
    $field->add_name( $righttable->name . "." . $field->name() ) ;
  }

  # we've now got copies of the two fieldlists, with new aliases for
  # the field names of the form tablename.fieldname.  Construct
  # a final field list from these two lists.

  my @fields = ($leftlistcopy->fields, $rightlistcopy->fields) ;

  my $newtable = {
	       name => $lefttable->{name},
               filename => $newf,
	       fieldlist => Text::TabTable::FieldList->new(@fields),
	       sorted_colnum => $leftcol,
	     } ;
  
  if (!defined wantarray ) {
    carp "Warning: Useless join in void context." ;
  }
  bless $newtable, ref $lefttable ;
}

####
# called by ->join() to perform a join when there is an appropriate cdb index
# present on the right side table and it's not a right join.
# 
# The column numbers passed in are 1-based.
####
sub _join_using_index
{
  my ($lefttable, $righttable, $leftcol, $rightcol, $side) = @_ ;

  my $isleftjoin = $side eq 'LEFT' ;
  my $emptyright ;
  if ($isleftjoin) {
    $emptyright = "\t" x ($righttable->{fieldlist}->fieldcount() - 1) ;
  }

  open(LEFTF, $lefttable->{filename}) || croak ;
  require CDB_File ;

  my $newf = _make_tempfile() ;
  open(NEWF, ">$newf") || croak "$newf: $!" ;

  my %right ;

  tie (%right, 'CDB_File', $righttable->{cdb}{$rightcol}) || die ;

  # create a regex that will extract the join field from a tab delimited
  # line.
  my $regex = '^' . ('[^\t]*\t' x ($leftcol-1)) . '([^\t]*)' ;
  $regex = qr($regex) ;

  my $leftfieldcount = $lefttable->{fieldlist}->fieldcount() ;
  my $rightfieldcount = $righttable->{fieldlist}->fieldcount() ;

  while (<LEFTF>) {
    chomp ;
    _add_missing_tabs(\$_, $leftfieldcount) ;
    /$regex/ || die "malformed temp file in line $_" ;
    my $key = $1 ;


    if (exists $right{$key}) {
      # found a match.  Print a complete line.
      my $val = $right{$key} ;
      _add_missing_tabs(\$val, $rightfieldcount) ;
      print NEWF CORE::join("\t", $_, $val), "\n" ;
    } else {
      # didn't match.  print a line if it's a left join, otherwise skip it.
      if ($isleftjoin) {
        print NEWF CORE::join("\t", $_, $emptyright), "\n" ;
      }
    }
  }

  untie %right ;

  close LEFTF ;
  close NEWF ;

  # We've now joined the files, so we just have to create a fieldlist
  # for the new table.

  my $leftlistcopy = $lefttable->{fieldlist}->deepcopy ;
  foreach my $field ($leftlistcopy->fields) {
    $field->add_name( $lefttable->name . "." . $field->name() ) ;
  }
  my $rightlistcopy = $righttable->{fieldlist}->deepcopy ;
  foreach my $field ($rightlistcopy->fields) {
    $field->add_name( $righttable->name . "." . $field->name() ) ;
  }

  # we've now got copies of the two fieldlists, with new aliases for
  # the field names of the form tablename.fieldname.  Construct
  # a final field list from these two lists.

  my @fields = ($leftlistcopy->fields, $rightlistcopy->fields) ;

  my $newtable = {
	       name => $lefttable->{name},
               filename => $newf,
	       fieldlist => Text::TabTable::FieldList->new(@fields),
	     } ;
  
  if (!defined wantarray ) {
    carp "Warning: Useless join in void context." ;
  }
  bless $newtable, ref $lefttable ;

}

# Given a ref to a string that's supposed to have n columns, make sure there are
# n-1 tabs by adding more at the end.
sub _add_missing_tabs
{
  my ($strref, $n) = @_ ;

  my $tabcount = ($$strref =~ tr/\t/\t/) ;

  if ($tabcount < $n-1) {
    $$strref .= "\t" x ( $n-1-$tabcount ) ;
  }
}


####
# processes a table and creates a new one with different stuff.
#
# parameters:
#  table is a Text::TabTable object.
#
#  fieldspecs is a listref containing items of any of the following forms
#   fieldname    ( a simple scalar )
#   [fieldname, newfieldname]       ( for "cd_table.id as cd_id")
#   [sub {...}, newfieldname, [list of fieldnames]]
#            (for calculated fields.  The sub receives values for the
#             listed fields as parameters, and returns the new value)
#   fieldspecs can also be a simple "*", which returns all fields unchanged.
#  
#  wheresub is an optional subref.  It is passed an object with getvalue,
#    setvalue, and autoloaded field-name-named methods to get and set values
#    of fields by name.  It is expected to return a true value if the 
#    row should be included in the output.
####
sub select
{
  my ($table, $fieldspecs, $wheresub) = @_ ;

  carp "selecting from $table->{name}" if $VERBOSE ;

  my $newtable = { name => $table->{name} } ;

  # this gets set to zero if there is a where clause or calculated columns.
  # Otherwise it just runs /bin/cut to pick the right columns.
  my $cut_ok = 1 ;

  # create a field list for the new table based on the selected fields.
  # also create an array saying how to calculate each output field.
  my @fieldrules ;
  if (!ref $fieldspecs && ($fieldspecs eq '*' || !defined $fieldspecs)) {
    # simple case.  Just copy the fieldlist.
    undef $fieldspecs ;
    $newtable->{fieldlist} = $table->{fieldlist}->deepcopy() ;

    # we don't need any rules in this case; input = output
  } else {
    # make a new fieldlist, and rules.

    # @fields is the list of Field objects being built.
    my @fields ;

    foreach my $fieldspec (@$fieldspecs) {
      if (!ref $fieldspec) {
        # a simple scalar, representing a field name.
	my $colnum = $table->{fieldlist}->find_colnum($fieldspec) ;
	if (!$colnum) {
	  croak "no field $fieldspec in table" ;
	}
	# find_colnum returns 1-based column numbers.
	push @fieldrules, $colnum - 1 ;
	push @fields, Text::TabTable::Field->new($fieldspec) ;
      } elsif (@$fieldspec == 2) {
        # a field name to look up and what to call it in the output table.

	my $colnum = $table->{fieldlist}->find_colnum($fieldspec->[0]) ;
	if (!$colnum) {
	  croak "no field $fieldspec->[0] in table" ;
	}
	# find_colnum returns 1-based column numbers.
	push @fieldrules, $colnum - 1 ;
	push @fields, Text::TabTable::Field->new($fieldspec->[1]) ;
      } elsif (@$fieldspec == 3) {
        # A subref, a new column name, and a list of columns to pass to
	# the subref.
	my @paramcols ;

	# since we're doing a calculated column, we have to use perl instead
	# of /bin/cut.
	$cut_ok = 0 ;

	foreach my $fieldname (@{$fieldspec->[2]}) {
	  my $colnum = $table->{fieldlist}->find_colnum($fieldname) ;
	  if (!$colnum) {
	    croak "no field $fieldname in table" ;
	  }
	  push @paramcols, $colnum-1 ;
	}
	# create a rule consiting of the subref followed by a listref of
	# what columns to get parameters from.
	push @fieldrules, [ $fieldspec->[0], \@paramcols ] ;
	
	# fieldname is the new name passed in.
	push @fields,  Text::TabTable::Field->new($fieldspec->[1]) ;
      } else {
        croak "bad fieldspec" ;
      }
    }

    $newtable->{fieldlist} = Text::TabTable::FieldList->new(@fields) ;
  }

  $newtable->{filename} = _make_tempfile() ;

  # build a hash saying which field is in which position in the input.
  my %fieldloc ;
  $table->_build_fieldloc(\%fieldloc) ;

  # cut won't reorder columns, which angers me.  So if the columns aren't
  # sorted, don't use cut.
  my $test_unsort = CORE::join(" ", @fieldrules) ;
  my $test_sort = CORE::join(" ", sort {$a <=> $b } @fieldrules) ;
  if ($test_unsort ne $test_sort) {
    $cut_ok = 0 ;
  }

  # now $newtable->{fieldlist} contains the table names.  We're done with that.
  #     $newtable->{filename}  contains the name of the file to be created.
  #     @fieldrules tells us how to create each output column.
  #     %fieldloc says which column number a field name can be found in.
  # so it's time to start processing.


  if ($cut_ok && !$wheresub && $fieldspecs) {
    # there aren't any calculated columns, and there's no where clause,
    # so we can just use cut to pick the columns they wanted.

    # @fieldrules has zero-based column numbers.  Make one-based.
    my @cutfields = map { $_ + 1 } @fieldrules ;

    carp "...selecting using cut" if $VERBOSE ;
    system "cut -f" . CORE::join(',', @cutfields) . 
                      " $table->{filename} > $newtable->{filename}" ;
    if ($?) {
      unlink $newtable->{filename} ;
      croak "cut error in select" ;
    }
  } else {

    # process the file using perl.

    open(INFILE, $table->{filename}) || croak "can't open table file" ;
    open(OUTFILE, ">$newtable->{filename}") || croak "can't open output table file" ;

    while(<INFILE>) {
      chomp ;
      my @values = split(/\t/, $_, 999999) ;

      # run the where clause subroutine, if any, and skip if it says to.
      if ($wheresub) {
	my $rowdata = bless([\%fieldloc, \@values], 'Text::TabTable::DataRow') ;
	next if !&$wheresub($rowdata) ;
      }

      if (!$fieldspecs) {
	# select *.  Just print them out.
	print OUTFILE CORE::join("\t", @values), "\n" ;
      } else {
	my @outvals ;

	# use the @fieldrules to create @outvals from @values.
	foreach my $rule (@fieldrules) {
	  if (!ref $rule) {
	    push @outvals, $values[$rule] ;
	  } else {
	    # it's an arrayref containing a subref and a bunch of column
	    # numbers.  Call the subroutine with the values pointed to 
	    # by those column numbers and use the return value as the 
	    # output field value.
	    my @params = map { $values[$_] } @{$rule->[1]} ;
	    my $subref = $rule->[0] ;
	    push @outvals, scalar(&$subref(@params)) ;
	  }
	}

	print OUTFILE CORE::join("\t", @outvals), "\n" ;
      }
    }

    close OUTFILE ;
    close INFILE ;
  }

  if (!defined wantarray ) {
    carp "Warning: select used in void context." ;
  }

  return bless $newtable, ref $table ;
}

####
# Fills in a hash with a mapping from field name to column number.
####
sub _build_fieldloc
{
  my ($table, $hr_fieldloc) = @_ ;
  my $pos = 0 ;
  foreach my $field ($table->{fieldlist}->fields()) {
    foreach my $fieldname ($field->names()) {
      $hr_fieldloc->{$fieldname} = $pos if !exists $hr_fieldloc->{$fieldname} ;
    }
    $pos++ ;
  }
}

####
# Runs through the rows of a tab table, calling a subroutine for each
# line.  The subroutine has the same calling convention as the where
# part of a select() call.
#
# This is like a select(undef,sub {}) but does not return a new table.
####
sub iterate
{
  my ($table, $wheresub) = @_ ;
  carp "iterating over $table->{name}" if $VERBOSE ;

  open(INFILE, $table->{filename}) || croak "can't open table file" ;

  die if !$wheresub ;

  # build a hash saying which field is in which position in the input.
  my %fieldloc ;
  $table->_build_fieldloc(\%fieldloc) ;

  while(<INFILE>) {
    chomp ;
    my @values = split(/\t/, $_, 999999) ;

    my $rowdata = bless([\%fieldloc, \@values], 'Text::TabTable::DataRow') ;
    &$wheresub($rowdata) ;
  }

  close INFILE ;
}

####
# Returns an object with a next() method, which gives one row object each
# time next() is called.
#
# If -unescape=>1, the tab/backslash/newline escaping will be removed.
####
sub make_iterator
{
  my ($table, %args) = @_ ;
  carp "iterating over $table->{name}" if $VERBOSE ;

  if ($args{-unescape}) {
    return Text::TabTable::Iterator::Unescaping->new($table) ;
  } else {
    return Text::TabTable::Iterator->new($table) ;
  }
}

sub DESTROY
{
  my $self = shift ;
  if (!$self->{dontdelete}) {
    unlink $self->{filename} ;
  }

  foreach my $cdbfile ( values %{$self->{cdb}} ) {
    unlink $cdbfile ;
  }
}

####
# Creates a temporary file and returns its filename.
####
use vars qw(@TEMPFILES) ;
sub _make_tempfile
{
  my $watchdog = 0 ;
  while ($watchdog++ < 1000) {
    my $fname = "$TMPDIR/$$." . int(rand(9999999)) ;
    my $status = sysopen TEMPF, $fname, O_CREAT | O_WRONLY | O_EXCL, 0666 ;
    close(TEMPF) ;
    if (defined $status) {
      push @TEMPFILES, $fname ;
      return $fname ;
    }
  }
  die "couldn't create a temporary file\n" ;
}

END {
  # delete any tempfiles that didn't get deleted.  This shouldn't happen.
  foreach my $file (@TEMPFILES) {
    unlink $file ;
  }
}


###############################################################################
#               Text::TabTable::Field
###############################################################################
package Text::TabTable::Field ;

sub new
{
  my ($package, $fieldname) = @_ ;
  my $self = { names => [$fieldname] } ;
  bless $self, $package ;
}

sub name
{
  return $_[0]->{names}->[0] ;
}

####
# Return all the names for this field.
sub names
{
  return @{$_[0]->{names}} ;
}

sub has_name
{
  my $self = shift ;
  my $name = shift ;
  if (grep( $_ eq $name, @{$self->{names}})) {
    return 1 ;
  } else {
    return 0 ;
  }
}

####
# Add an alias name to a field.
####
sub add_name
{
  my ($self, @names) = @_ ;
  push @{$self->{names}}, @names ;
}

####
# sets the name of the field, wiping out all previous aliases.
####
sub set_name
{
  my ($self, $name) = @_ ;
  $self->{names} = [$name] ;
}

###############################################################################
#               Text::TabTable::FieldList
#
# Represents the list of Fields on a table.
###############################################################################
package Text::TabTable::FieldList ;

use Data::Dumper ;

sub new
{
  my ($package, @fields) = @_ ;
  
  bless { fields => \@fields }, $package ;
}

sub deepcopy
{
  my ($self) = @_ ;

  no strict ;
  my $newent = eval Dumper($self) ;
}

####
# Returns a 1-based column number for the given field, or undef
# if not present.
####
sub find_colnum
{
  my ($self, $fieldname) = @_ ;
  for (my $i = 0 ; $i < @{$self->{fields}} ; $i++) {
    if ($self->{fields}->[$i]->has_name($fieldname)) {
      return $i+1 ;
    }
  }
  return undef ;
}

####
# Return the number of fields.
####
sub fieldcount
{
  my $self = shift ;
  return scalar(@{$self->{fields}}) ;
}

####
# Return field names as a tab-delimited string.
####
sub as_string
{
  my $self = shift ;
  my @names = map {$_->name} @{$self->{fields}} ;
  return join("\t", @names) ;
}

sub fields
{
  return @{$_[0]->{fields}} ;
}

###############################################################################
#               Text::TabTable::Iterator
#               Text::TabTable::Iterator::Unescaping
#
# A thing that returns one row at a time from a table.
# The ::Unescaping version will run Text::TabTable::Unescape on all the
# data first.
###############################################################################
package Text::TabTable::Iterator ;

@Text::TabTable::Iterator::Unescaping::ISA = ('MP3Com::TabTable::Iterator') ;

use strict ;
use Carp ;

sub new
{
  my ($package, $table) = @_ ;

  require IO::File ;

  my %fieldloc ;
  $table->_build_fieldloc(\%fieldloc) ;

  my $fh = IO::File->new("<$table->{filename}") || croak ;

  my $self = {
  		fieldloc => \%fieldloc,
		fh => $fh
	      } ;
  bless $self, $package ;
}

sub next
{
  my ($self) = @_ ;
  my $line = $self->{fh}->getline() ;

  if (!$line) {
    return undef ;
    delete $self->{fh} ;
  }
  chomp $line ;
  my @values = split(/\t/, $line, -1) ;

  return bless([$self->{fieldloc}, \@values], 'Text::TabTable::DataRow') ;
}

sub Text::TabTable::Iterator::Unescaping::next
{
  my ($self) = @_ ;

  # get a row and unescape the data in it.

  my $row = $self->Text::TabTable::Iterator::next() ;
  return undef if !$row ;

  foreach my $val (@{$row->[1]}) {
    $val = Text::TabTable::unescape($val) ;
  }

  return $row ;
}

###############################################################################
#               Text::TabTable::DataRow
#
# Represents a row of data from a TabTable.
###############################################################################
package Text::TabTable::DataRow ;

use vars qw($AUTOLOAD) ;
use Carp ;

use strict ;

# This constructor is not actually used by select(); it blesses the
# right structure itself for speed purposes.
sub new
{
  my ($package, $name2colhash, $values) = @_ ;
  bless [$name2colhash, $values], $package ;
}

sub getvalue
{
  my $self = shift ;
  my $name = shift ;
  return $self->[1][  $self->[0]{$name}   ] ;
}

sub setvalue
{
  my $self = shift ;
  my $name = shift ;
  my $newval = shift ;
  $self->[1][$self->[0]{$name}] = $newval ;
}

# to save work for autoload.
sub DESTROY {} ;

####
# implements field-named methods for getting values.
####
sub AUTOLOAD
{
  my $self = shift ;

  my $name = $AUTOLOAD ;
  $name =~ s/.*:// ;

  if (!exists $self->[0]{$name}) {
    croak "No $name field in table" ;
  }

  # create a function to calculate it.
  eval <<EOT ;
    sub $name {
      return \$_[0]->[1][  \$_[0]->[0]{'$name'} ] ;
    }
EOT

  return $self->$name() ;
}


1;