package EBook::Ishmael; use 5.016; our $VERSION = '1.05'; use strict; use warnings; use Encode qw(find_encoding encode); use File::Basename; use File::Path qw(remove_tree); use File::Temp qw(tempfile); use Getopt::Long; use List::Util qw(max); use JSON; use XML::LibXML; use EBook::Ishmael::EBook; use EBook::Ishmael::ImageID; use EBook::Ishmael::TextBrowserDump; use constant { MODE_TEXT => 0, MODE_META => 1, MODE_ID => 2, MODE_HTML => 3, MODE_RAW_TIME => 4, MODE_COVER => 5, MODE_IMAGE => 6, }; my $PRGNAM = 'ishmael'; my $PRGVER = $VERSION; my $HELP = <<"HERE"; $PRGNAM - $PRGVER Usage: $0 [options] file [output] Options: -d|--dumper=<dumper> Specify dumper to use for formatting text -e|--encoding=<enc> Print text output in specified encoding -f|--format=<format> Specify ebook format -w|--width=<width> Specify output line width -H|--html Dump ebook HTML -c|--cover Dump ebook cover image -g|--image Dump ebook images -i|--identify Identify ebook format -m|--metadata[=<form>] Print ebook metadata -r|--raw Dump the raw, unformatted ebook text -h|--help Print help message -v|--version Print version/copyright info HERE my $VERSION_MSG = <<"HERE"; $PRGNAM - $PRGVER Copyright (C) 2025 Samuel Young This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. HERE my $STDOUT = '-'; my %FORMAT_ALTS = ( 'fb2' => 'fictionbook2', 'azw' => 'mobi', 'azw3' => 'kf8', ); my %META_MODES = map { $_ => 1 } qw( ishmael json pjson xml pxml ); # Replace characters that cannot be encoded with empty strings. my $ENC_SUBST = sub { q[] }; sub _get_out { my $file = shift; if ($file ne $STDOUT) { open my $fh, '>', $file or die "Failed to open $file for writing: $!\n"; return $fh; } else { return *STDOUT; } } sub init { my $class = shift; my $self = { Ebook => undef, Mode => MODE_TEXT, Dumper => $ENV{ISHMAEL_DUMPER}, Encode => $ENV{ISHMAEL_ENCODING}, Format => undef, Output => undef, Width => 80, Meta => undef, }; Getopt::Long::config('bundling'); GetOptions( 'dumper|d=s' => \$self->{Dumper}, 'encoding|e=s' => \$self->{Encode}, 'format|f=s' => \$self->{Format}, 'width|w=i' => \$self->{Width}, 'html|H' => sub { $self->{Mode} = MODE_HTML }, 'cover|c' => sub { $self->{Mode} = MODE_COVER }, 'image|g' => sub { $self->{Mode} = MODE_IMAGE }, 'identify|i' => sub { $self->{Mode} = MODE_ID }, 'metadata|m:s' => sub { # Some DWIMery that if the given argument is not a valid metadata # format, assume the user meant for it be a file argument and put # it back into @ARGV. $self->{Mode} = MODE_META; if (!$_[1] or exists $META_MODES{ lc $_[1] }) { $self->{Meta} = lc $_[1] || 'ishmael'; } else { $self->{Meta} = 'ishmael'; unshift @ARGV, $_[1]; } }, 'raw|r' => sub { $self->{Mode} = MODE_RAW_TIME }, 'help|h' => sub { print $HELP; exit 0; }, 'version|v' => sub { print $VERSION_MSG; exit 0; }, ) or die "Error in command line arguments\n$HELP"; $self->{Ebook} = shift @ARGV or die $HELP; $self->{Output} = shift @ARGV; if ($self->{Mode} == MODE_COVER) { $self->{Output} //= (fileparse($self->{Ebook}, qr/\.[^.]*/))[0] . '.-'; } elsif ($self->{Mode} == MODE_IMAGE) { $self->{Output} //= (fileparse($self->{Ebook}, qr/\.[^.]*/))[0]; } else { $self->{Output} //= $STDOUT; } if (defined $self->{Format}) { $self->{Format} = lc $self->{Format}; if (exists $FORMAT_ALTS{ $self->{Format} }) { $self->{Format} = $FORMAT_ALTS{ $self->{Format} }; } unless (exists $EBOOK_FORMATS{ $self->{Format} }) { die "$self->{Format} is not a recognized ebook format\n"; } } if (defined $self->{Encode} and not defined find_encoding($self->{Encode})) { die "'$self->{Encode}' is an invalid character encoding\n"; } bless $self, $class; return $self; } sub text { my $self = shift; my $ebook = EBook::Ishmael::EBook->new( $self->{Ebook}, $self->{Format} ); my $tmp = do { my ($tf, $tp) = tempfile(UNLINK => 1); close $tf; $tp; }; $ebook->html($tmp); my $oh = _get_out($self->{Output}); unless (defined $self->{Encode}) { binmode $oh, ':utf8'; } my $dump = browser_dump( $tmp, { browser => $self->{Dumper}, width => $self->{Width}, } ); if (defined $self->{Encode}) { print { $oh } encode($self->{Encode}, $dump, $ENC_SUBST); } else { print { $oh } $dump; } close $oh unless $self->{Output} eq $STDOUT; 1; } sub meta { my $self = shift; if ($self->{Meta} eq 'ishmael') { $self->meta_ishmael; } elsif ($self->{Meta} eq 'json') { $self->meta_json(0); } elsif ($self->{Meta} eq 'pjson') { $self->meta_json(1); } elsif ($self->{Meta} eq 'xml') { $self->meta_xml(0); } elsif ($self->{Meta} eq 'pxml') { $self->meta_xml(1); } else { die "'$self->{Meta}' is not a valid metadata format\n"; } 1; } sub meta_ishmael { my $self = shift; my $ebook = EBook::Ishmael::EBook->new( $self->{Ebook}, $self->{Format} ); my %meta = %{ $ebook->metadata }; my $oh = _get_out($self->{Output}); binmode $oh, ':utf8'; my $klen = max(map { length } keys %meta) + 1; for my $k (sort keys %meta) { printf { $oh } "%-*s %s\n", $klen, "$k:", join ", ", @{ $meta{ $k } }; } close $oh unless $self->{Output} eq $STDOUT; 1; } sub meta_json { my $self = shift; my $pretty = shift // 0; my $ebook = EBook::Ishmael::EBook->new( $self->{Ebook}, $self->{Format} ); my $meta = $ebook->metadata; my $oh = _get_out($self->{Output}); for my $k (keys %{ $meta }) { # Flatten arrays that contain a single item if (@{ $meta->{ $k } } == 1) { $meta->{ $k } = $meta->{ $k }->[0]; } } say { $oh } to_json($meta, { utf8 => 1, pretty => $pretty, canonical => 1 }); close $oh unless $self->{Output} eq $STDOUT; 1; } sub meta_xml { my $self = shift; my $pretty = shift // 0; my $ebook = EBook::Ishmael::EBook->new( $self->{Ebook}, $self->{Format}, ); my $meta = $ebook->metadata; my $oh = _get_out($self->{Output}); my $dom = XML::LibXML::Document->new('1.0', 'UTF-8'); my $root = XML::LibXML::Element->new('ishmael'); $dom->setDocumentElement($root); $root->setAttribute('version', $PRGVER); my $metan = $root->appendChild( XML::LibXML::Element->new('metadata') ); for my $k (sort keys %$meta) { my $n = $metan->appendChild( XML::LibXML::Element->new(lc $k) ); for my $i (@{ $meta->{ $k } }) { my $in = $n->appendChild( XML::LibXML::Element->new('item') ); $in->appendChild( XML::LibXML::Text->new($i) ); } } $dom->toFH($oh, $pretty); close $oh unless $self->{Output} eq $STDOUT; 1; } sub id { my $self = shift; my $id = ebook_id($self->{Ebook}); say defined $id ? $id : "Could not identify format for $self->{Ebook}"; 1; } sub html { my $self = shift; my $ebook = EBook::Ishmael::EBook->new( $self->{Ebook}, $self->{Format} ); my $oh = _get_out($self->{Output}); unless (defined $self->{Encode}) { binmode $oh, ':utf8'; } my $html = $ebook->html; if (defined $self->{Encode}) { say { $oh } encode($self->{Encode}, $html, $ENC_SUBST); } else { say { $oh } $html; } close $oh unless $self->{Output} eq $STDOUT; 1; } sub raw { my $self = shift; my $ebook = EBook::Ishmael::EBook->new( $self->{Ebook}, $self->{Format} ); my $oh = _get_out($self->{Output}); unless (defined $self->{Encode}) { binmode $oh, ':utf8'; } my $raw = $ebook->raw; if (defined $self->{Encode}) { say { $oh } encode($self->{Encode}, $raw, $ENC_SUBST); } else { say { $oh } $raw; } close $oh unless $self->{Output} eq $STDOUT; 1; } sub cover { my $self = shift; my $ebook = EBook::Ishmael::EBook->new( $self->{Ebook}, $self->{Format} ); unless ($ebook->has_cover) { say "$self->{Ebook} does not have a cover"; return; } my $cover = $ebook->cover; my $fmt = image_id(\$cover); unless (defined $fmt) { die "Could not dump $self->{Ebook} cover; could not identify cover image format\n"; } if ($self->{Output} =~ /\.\*$/) { warn "Using '.*' for suffix substitution is deprecated; please use '.-' instead\n"; } $self->{Output} =~ s/\.[\-@]$/.$fmt/; my $oh = _get_out($self->{Output}); binmode $oh; print { $oh } $ebook->cover; close $oh unless $self->{Output} eq $STDOUT; 1; } sub image { my $self = shift; if ($self->{Output} eq $STDOUT) { die "Cannot dump images to stdout\n"; } my $ebook = EBook::Ishmael::EBook->new( $self->{Ebook}, $self->{Format} ); my $num = $ebook->image_num; unless ($num) { say "$self->{Ebook} has no images"; return; } my $base = basename($self->{Output}); my $pad = length $num; my $mkdir = 0; unless (-d $self->{Output}) { mkdir $self->{Output} or die "Failed to mkdir $self->{Output}: $!\n"; $mkdir = 1; } my @created; eval { for my $i (0 .. $num - 1) { my $ii = $i + 1; my $img = $ebook->image($i); my $id = image_id($img); unless (defined $id) { warn "Could not identify image #$ii\'s format, skipping\n"; next; } my $b = sprintf "%s-%0*d.%s", $base, $pad, $ii, $id; my $p = File::Spec->catfile($self->{Output}, $b); open my $fh, '>', $p or die "Failed to open $p for writing: $!\n"; binmode $fh; print { $fh } $$img; close $fh; push @created, $p; } 1; } or do { for my $c (@created) { unlink $c; } rmdir $self->{Output} if $mkdir; die $@; }; unless (@created) { rmdir $self->{Output} if $mkdir; die "Could not dump any images in $self->{Output}\n"; } say $self->{Output}; for my $c (map { basename($_) } @created) { say " $c"; } 1; } sub run { my $self = shift; if ($self->{Mode} == MODE_TEXT) { $self->text; } elsif ($self->{Mode} == MODE_META) { $self->meta; } elsif ($self->{Mode} == MODE_ID) { $self->id; } elsif ($self->{Mode} == MODE_HTML) { $self->html; } elsif ($self->{Mode} == MODE_RAW_TIME) { $self->raw; } elsif ($self->{Mode} == MODE_COVER) { $self->cover; } elsif ($self->{Mode} == MODE_IMAGE) { $self->image; } 1; } 1; =head1 NAME EBook::Ishmael - EBook dumper =head1 SYNOPSIS use EBook::Ishmael; my $ishmael = EBook::Ishmael->init(); $ishmael->run(); =head1 DESCRIPTION B<EBook::Ishmael> is the workhorse module for L<ishmael>. If you're looking for user documentation, you should consult its manual instead of this (this is developer documentation). =head1 METHODS =head2 $i = EBook::Ishmael->init() Reads C<@ARGV> and returns a blessed C<EBook::Ishmael> object. Consult the manual for L<ishmael> for a list of options that are available. =head2 $i->text() Dumps ebook file to text, default run mode. =head2 $i->meta() Dumps ebook metadata, C<--metadata> mode. =head2 $i->meta_ishmael() Dumps ebook metadata, C<--metadata=ishmael> mode. =head2 $i->meta_json($pretty) Dumps ebook metadata in JSON form, C<--metadata=p?json> mode. =head2 $i->meta_xml($pretty) Dumps ebook metadata in XML form, C<--metadata=p?xml> mode. =head2 $i->id() Identify the format of the given ebook, C<--identify> mode. =head2 $i->html() Dump the HTML-ified contents of a given ebook, C<--html> mode. =head2 $i->raw() Dump the raw, unformatted text contents of a given ebook, C<--raw> mode. =head2 $i->cover() Dump the binary data of the cover image of a given ebook, if one is present, C<--cover> mode. =head2 $i->run() Runs L<ishmael> based on the parameters processed during C<init()>. =head1 AUTHOR Written by Samuel Young, E<lt>samyoung12788@gmail.comE<gt>. This project's source can be found on its L<Codeberg Page|https://codeberg.org/1-1sam/ishmael>. Comments and pull requests are welcome! =head1 COPYRIGHT Copyright (C) 2025 Samuel Young This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. =head1 SEE ALSO L<ishmael> =cut