package Archive::BagIt;

use strict;
use 5.006;
use warnings;


our $VERSION = '0.053.3'; # VERSION

use utf8;
use open ':std', ':utf8';
our @checksum_algos = qw(md5 sha1);
our $DEBUG=0;
use Encode qw(decode);
use File::Find;
use Data::Dumper;
#use Data::Printer;

sub new {
  my ($class,$bag_path) = @_;
  my $self = {};
  bless $self, $class;
  $bag_path=~s!/$!!;
  $self->{'bag_path'} = $bag_path || "";
  if($bag_path) {
    $self->_open();
  }
  return $self;
}

sub _open {
  my($self) = @_;

  $self->_load_manifests();
  $self->_load_tagmanifests();

  return $self;
}

sub _load_manifests {
  my ($self) = @_;

  my @manifests = $self->manifest_files();
  foreach my $manifest_file (@manifests) {
    die("Cannot open $manifest_file: $!") unless (open (my $MANIFEST,"<:encoding(utf8)", $manifest_file));
    while (my $line = <$MANIFEST>) {
        chomp($line);
        my ($digest,$file);
        ($digest, $file) = $line =~ /^([a-f0-9]+)\s+(.+)$/;
        if(!$file) {
          die ("This is not a valid manifest file");
        } else {
          print "file: $file \n" if $DEBUG;
          $self->{entries}->{$file} = $digest;
        }
    }
    close($MANIFEST);
  }

  return $self;

}

sub _load_tagmanifests {
  my ($self) = @_;

  my @tagmanifests = $self->tagmanifest_files();
  foreach my $tagmanifest_file (@tagmanifests) {
    die("Cannot open $tagmanifest_file: $!") unless (open(my $TAGMANIFEST,"<:encoding(utf8)", $tagmanifest_file));
    while (my $line = <$TAGMANIFEST>) {
      chomp($line);
      my($digest,$file) = split(/\s+/, $line, 2);
      $self->{tagentries}->{$file} = $digest;
    }
    close($TAGMANIFEST);

  }
  return $self;
}


sub make_bag {
  my ($class, $bag_dir) = @_;
  unless ( -d $bag_dir) { die ( "source bag directory doesn't exist"); }
  unless ( -d $bag_dir."/data") {
    rename ($bag_dir, $bag_dir.".tmp");
    mkdir  ($bag_dir);
    rename ($bag_dir.".tmp", $bag_dir."/data");
  }
  my $self=$class->new($bag_dir);
  $self->_write_bagit($bag_dir);
  $self->_write_baginfo($bag_dir);
  $self->_manifest_md5($bag_dir);
  $self->_tagmanifest_md5($bag_dir);
  $self->_open();
  return $self;
}

sub _write_bagit {
    my($self, $bagit) = @_;
    open(my $BAGIT, ">", $bagit."/bagit.txt") or die("Can't open $bagit/bagit.txt for writing: $!");
    print($BAGIT "BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8");
    close($BAGIT);
    return 1;
}



sub _write_baginfo {
    use POSIX;
    my($self, $bagit, %param) = @_;
    open(my $BAGINFO, ">", $bagit."/bag-info.txt") or die("Can't open $bagit/bag-info.txt for writing: $!");
    $param{'Bagging-Date'} = POSIX::strftime("%F", gmtime(time));
    $param{'Bag-Software-Agent'} = 'Archive::BagIt <http://search.cpan.org/~rjeschmi/Archive-BagIt>';
    while(my($key, $value) = each(%param)) {
        print($BAGINFO "$key: $value\n");
    }
    close($BAGINFO);
    return 1;
}

sub _manifest_crc32 {
    require String::CRC32;
    my($self,$bagit) = @_;
    my $manifest_file = "$bagit/manifest-crc32.txt";
    my $data_dir = "$bagit/data";

    # Generate MD5 digests for all of the files under ./data
    open(my $fh, ">:encoding(utf8)",$manifest_file) or die("Cannot create manifest-crc32.txt: $!\n");
    find(
        sub {
            $_=decode('utf8', $_);
            my $file = decode('utf8', $File::Find::name);
            if (-f $_) {
                open(my $DATA, "<:encoding(utf8)", $_) or die("Cannot read $_: $!");
                my $digest = sprintf("%010d",crc32($DATA));
                close($DATA);
                my $filename = substr($file, length($bagit) + 1);
                print($fh "$digest  $filename\n");
            }
        },
        $data_dir
    );
    close($fh);
}


sub _manifest_md5 {
    use Digest::MD5;
    my($self, $bagit) = @_;
    my $manifest_file = "$bagit/manifest-md5.txt";
    my $data_dir = "$bagit/data";
    #print "creating manifest: $data_dir\n";
    # Generate MD5 digests for all of the files under ./data
    open(my $md5_fh, ">:encoding(utf8)",$manifest_file) or die("Cannot create manifest-md5.txt: $!\n");
    find(
        sub {
            my $file = decode('utf8', $File::Find::name);
            if (-f $_) {
                open(my $DATA, "<:raw", "$_") or die("Cannot read $_: $!");
                my $digest = Digest::MD5->new->addfile($DATA)->hexdigest;
                close($DATA);
                my $filename = substr($file, length($bagit) + 1);
                print($md5_fh "$digest  $filename\n");
                #print "lineout: $digest $filename\n";
            }
        },
        $data_dir
    );
    close($md5_fh);
}

sub _tagmanifest_md5 {
  my ($self, $bagit) = @_;

  use Digest::MD5;

  my $tagmanifest_file= "$bagit/tagmanifest-md5.txt";

  open (my $md5_fh, ">:encoding(utf8)", $tagmanifest_file) or die ("Cannot create tagmanifest-md5.txt: $! \n");

  find (
    sub {
      $_ = decode('utf8',$_);
      my $file = decode('utf8',$File::Find::name);
      if ($_=~m/^data$/) {
        $File::Find::prune=1;
      }
      elsif ($_=~m/^tagmanifest-.*\.txt/) {
        # Ignore, we can't take digest from ourselves
      }
      elsif ( -f $_ ) {
        open(my $DATA, "<:raw", "$_") or die("Cannot read $_: $!");
        my $digest = Digest::MD5->new->addfile($DATA)->hexdigest;
        close($DATA);
        my $filename = substr($file, length($bagit) + 1);
        print($md5_fh "$digest  $filename\n");
      }
  }, $bagit);

  close($md5_fh);
}


sub verify_bag {
    my ($self,$opts) = @_;
    #removed the ability to pass in a bag in the parameters, but might want options
    #like $return all errors rather than dying on first one
    my $bagit = $self->{'bag_path'};
    my $manifest_file = "$bagit/manifest-md5.txt";
    my $payload_dir   = "$bagit/data";
    my %manifest      = ();
    my $return_all_errors = $opts->{return_all_errors};
    my %invalids;
    my @payload       = ();

    die("$manifest_file is not a regular file") unless -f ($manifest_file);
    die("$payload_dir is not a directory") unless -d ($payload_dir);

    unless ($self->version() > .95) {
        die ("Bag Version is unsupported");
    }

    # Read the manifest file
    #print Dumper($self->{entries});
    foreach my $entry (keys(%{$self->{entries}})) {
      $manifest{$entry} = $self->{entries}->{$entry};
    }

    # Compile a list of payload files
    find(sub{ push(@payload, decode('utf8',$File::Find::name))  }, $payload_dir);

    # Evaluate each file against the manifest
    my $digestobj = new Digest::MD5;
    foreach my $file (@payload) {
        next if (-d ($file));
        my $local_name = substr($file, length($bagit) + 1);
        my ($digest);
        #p %manifest;
        unless ($manifest{$local_name}) {
          die ("file found not in manifest: [$local_name]");
        }
        #my $start_time=time();
        open(my $fh, "<:raw", "$bagit/$local_name") or die ("Cannot open $local_name");
        $digest = $digestobj->addfile($fh)->hexdigest;
        close($fh);
        #print "$bagit/$local_name md5 in ".(time()-$start_time)."\n";
        unless ($digest eq $manifest{$local_name}) {
          if($return_all_errors) {
            $invalids{$local_name} = $digest;
          }
          else {
            die ("file: $local_name invalid");
          }
        }
        delete($manifest{$local_name});
    }
    if($return_all_errors && keys(%invalids) ) {
      foreach my $invalid (keys(%invalids)) {
        print "invalid: $invalid hash: ".$invalids{$invalid}."\n";
      }
      die ("bag verify failed with invalid files");
    }
    # Make sure there are no missing files
    if (keys(%manifest)) { die ("Missing files in bag"); }

    return 1;
}


sub get_checksum {
  my($self) =@_;
  my $bagit = $self->{'bag_path'};
  open(my $SRCFILE, "<:raw",  $bagit."/manifest-md5.txt");
  my $srchex=Digest::MD5->new->addfile($SRCFILE)->hexdigest;
  close($SRCFILE);
  return $srchex;
}


sub version {
    my($self) = @_;
    my $bagit = $self->{'bag_path'};
    my $file = join("/", $bagit, "bagit.txt");
    open(my $BAGIT, "<", $file) or die("Cannot read $file: $!");
    my $version_string = <$BAGIT>;
    my $encoding_string = <$BAGIT>;
    close($BAGIT);
    $version_string =~ /^BagIt-Version: ([0-9.]+)$/;
    return $1 || 0;
}


sub payload_files {
  my($self) = @_;
  my @payload = $self->_payload_files();
  return @payload;
}

sub _payload_files{
  my($self) = @_;

  my $payload_dir = join( "/", $self->{"bag_path"}, "data");

  my @payload=();
  File::Find::find( sub{

    push(@payload,decode('utf8',$File::Find::name));
    #print "name: ".$File::Find::name."\n";
  }, $payload_dir);

  return @payload;

}


sub non_payload_files{
  my ($self) = @_;
  my @non_payload = $self->_non_payload_files();
  return @non_payload;

}


sub _non_payload_files {
  my($self) = @_;

  my @payload = ();
  File::Find::find( sub {
    $File::Find::name = decode ('utf8', $File::Find::name);
    if(-f $File::Find::name) {
      my ($relpath) = ($File::Find::name=~m!$self->{"bag_path"}/(.*$)!);
      push(@payload, $relpath);
    }
    elsif(-d _ && $_ eq "data") {
      $File::Find::prune=1;
    }
    else {
      #directories in the root other than data?
    }
  }, $self->{"bag_path"});

  return @payload;

}



sub manifest_files {
  my($self) = @_;
  my @manifest_files;
  foreach my $algo (@checksum_algos) {
    my $manifest_file = $self->{"bag_path"}."/manifest-$algo.txt";
    if (-f $manifest_file) {
      push @manifest_files, $manifest_file;
    }
  }
  #print Dumper(@manifest_files);
  return @manifest_files;
}


sub tagmanifest_files {
  my ($self) = @_;
  my @tagmanifest_files;
  foreach my $algo (@checksum_algos) {
    my $tagmanifest_file = $self->{"bag_path"}."/tagmanifest-$algo.txt";
    if (-f $tagmanifest_file) {
      push @tagmanifest_files, $tagmanifest_file;
    }
  }
  return @tagmanifest_files;

}

1; # End of Archive::BagIt

__END__

=pod

=encoding UTF-8

=head1 NAME

Archive::BagIt

=head1 VERSION

version 0.053.3

=head1 SYNOPSIS

This modules will hopefully help with the basic commands needed to create
and verify a bag. My intention is not to be strict and enforce all of the
specification. The reference implementation is the java version
and I will endeavour to maintain compatibility with it.

    use Archive::BagIt;

    #read in an existing bag:
    my $bag_dir = "/path/to/bag";
    my $bag = Archive::BagIt->new($bag_dir);


    #construct bag in an existing directory
    my $bag2 = Archive::BagIt->make_bag($bag_dir);

    # Validate a BagIt archive against its manifest
    my $bag3 = Archive::BagIt->new($bag_dir);
    my $is_valid = $bag3->verify_bag();

=head1 WARNING

This is experimental software for the moment and under active development. I
hope to have a beta version available soon.

We use it fairly widely in-house, but it doesn't necessarily implement all of the specs.

Email me with anything you need done urgently.

Also: Check out Archive::BagIt::Fast if you are willing to add some extra dependencies to get
better speed by mmap-ing files.

=head1 NAME

Archive::BagIt - An interface to make and verify bags according to the BagIt standard

=head1 SUBROUTINES

=head2 new

An Object Oriented Interface to a bag. Opens an existing bag.

  my $bag = Archive::BagIt->new('/path/to/bag');

=head2 make_bag

A constructor that will make and return a bag from a directory

If a data directory exists, assume it is already a bag (no checking for invalid files in root)

=head2 verify_bag

An interface to verify a bag.

You might also want to check L<Archive::BagIt::Fast> to see a more direct way of
accessing files (and thus faster).

=head2 get_checksum

This is the checksum for the bag, md5 of the manifest-md5.txt

=head2 version

Returns the bagit version according to the bagit.txt file.

=head2 payload_files

Returns an array with all of the payload files (those files that are below the data directory)

=head2 non_payload_files

Returns an array with files that are in the root of the bag, non-manifest files

=head2 manifest_files

Return an array with the list of manifest files that exist in the bag

=head2 tagmanifest_files

Return an array with the list of tagmanifest files

=head1 AUTHOR

Robert Schmidt, E<lt>rjeschmi at gmail.comE<gt>
William Wueppelmann, E<lt>william at c7a.caE<gt>

=head1 BUGS

Please report any bugs or feature requests to C<bug-archive-bagit at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Archive-BagIt>.  I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Archive::BagIt

You can also look for information at:

=over 4

=item * RT: CPAN's request tracker (report bugs here)

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Archive-BagIt>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Archive-BagIt>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Archive-BagIt>

=item * Search CPAN

L<http://search.cpan.org/dist/Archive-BagIt/>

=back

=head1 COPYRIGHT

Copyright (c) 2012, the above named author(s).

=head1 LICENSE

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 AVAILABILITY

The latest version of this module is available from the Comprehensive Perl
Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN
site near you, or see L<https://metacpan.org/module/Archive::BagIt/>.

=head1 SOURCE

The development version is on github at L<https://github.com/rjeschmi/Archive-BagIt>
and may be cloned from L<git://github.com/rjeschmi/Archive-BagIt.git>

=head1 BUGS AND LIMITATIONS

You can make new bug reports, and view existing ones, through the
web interface at L<https://github.com/rjeschmi/Archive-BagIt/issues>.

=head1 AUTHOR

Rob Schmidt <rjeschmi@gmail.com>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2017 by Rob Schmidt and William Wueppelmann.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut