use
5.016;
our
$VERSION
=
'1.01'
;
my
$PDFTOHTML
= which
'pdftohtml'
;
my
$PDFINFO
= which
'pdfinfo'
;
my
$PDFTOPNG
= which
'pdftopng'
;
my
$PDFIMAGES
= which
'pdfimages'
;
our
$CAN_TEST
= (
defined
$PDFTOHTML
and
defined
$PDFINFO
and
defined
$PDFTOPNG
and
defined
$PDFIMAGES
);
my
$MAGIC
=
'%PDF'
;
sub
heuristic {
my
$class
=
shift
;
my
$file
=
shift
;
my
$fh
=
shift
;
read
$fh
,
my
(
$mag
), 4;
return
$mag
eq
$MAGIC
;
}
sub
_get_metadata {
my
$self
=
shift
;
my
%meta
= (
'Author'
=>
sub
{
author
=>
shift
},
'CreationDate'
=>
sub
{
created
=>
shift
},
'Creator'
=>
sub
{
contributor
=>
shift
},
'ModDate'
=>
sub
{
modified
=>
shift
},
'PDF version'
=>
sub
{
format
=>
'PDF '
.
shift
},
'Producer'
=>
sub
{
contributor
=>
shift
},
'Title'
=>
sub
{
title
=>
shift
},
);
unless
(
defined
$PDFINFO
) {
die
"Cannot read PDF $self->{Source}: pdfinfo not installed\n"
;
}
my
$info
=
qx/$PDFINFO '$self->{Source}'/
;
unless
($? >> 8 == 0) {
die
"Failed to run '$PDFINFO' on $self->{Source}\n"
;
}
for
my
$l
(
split
/\n/,
$info
) {
my
(
$field
,
$content
) =
split
/:\s*/,
$l
, 2;
unless
(
exists
$meta
{
$field
} and
$content
) {
next
;
}
my
(
$k
,
$v
) =
$meta
{
$field
}->(
$content
);
push
@{
$self
->{Metadata}->
$k
},
$v
;
}
return
1;
}
sub
_images {
my
$self
=
shift
;
$self
->{_imgdir} = tempdir(
CLEANUP
=> 1);
my
$root
= File::Spec->catfile(
$self
->{_imgdir},
'ishmael'
);
qx/$PDFIMAGES -png '$self->{Source}' '$root'/
;
unless
($? >> 8 == 0) {
die
"Failed to run '$PDFIMAGES' on $self->{Source}\n"
;
}
@{
$self
->{_images} } = dir(
$self
->{_imgdir});
return
1;
}
sub
new {
my
$class
=
shift
;
my
$file
=
shift
;
my
$self
= {
Source
=>
undef
,
Metadata
=> EBook::Ishmael::EBook::Metadata->new,
_imgdir
=>
undef
,
_images
=> [],
};
bless
$self
,
$class
;
$self
->{Source} = File::Spec->rel2abs(
$file
);
$self
->_get_metadata();
unless
(@{
$self
->{Metadata}->
format
}) {
$self
->{Metadata}->
format
([
'PDF'
]);
}
return
$self
;
}
sub
html {
my
$self
=
shift
;
my
$out
=
shift
;
unless
(
defined
$PDFTOHTML
) {
die
"Cannot read PDF $self->{Source}: pdftohtml not installed\n"
;
}
my
$html
=
''
;
open
my
$fh
,
'>'
,
$out
// \
$html
or
die
sprintf
"Failed to open %s for writing: $!\n"
,
$out
//
'in-memory scalar'
;
binmode
$fh
,
':utf8'
;
my
$raw
=
qx/$PDFTOHTML -i -s -stdout '$self->{Source}'/
;
unless
($? >> 8 == 0) {
die
"Failed to run '$PDFTOHTML' on $self->{Source}\n"
;
}
my
$dom
= XML::LibXML->load_html(
string
=>
$raw
);
my
(
$body
) =
$dom
->findnodes(
'/html/body'
);
print
{
$fh
}
map
{
$_
->toString() }
$body
->childNodes();
close
$fh
;
return
$out
//
$html
;
}
sub
raw {
my
$self
=
shift
;
my
$out
=
shift
;
unless
(
defined
$PDFTOHTML
) {
die
"Cannot read PDF $self->{Source}: pdftohtml not installed\n"
;
}
my
$raw
=
''
;
open
my
$fh
,
'>'
,
$out
// \
$raw
or
die
sprintf
"Failed to open %s for writing: $!\n"
,
$out
//
'in-memory scalar'
;
binmode
$fh
,
':utf8'
;
my
$rawml
=
qx/$PDFTOHTML -i -s -stdout '$self->{Source}'/
;
unless
($? >> 8 == 0) {
die
"Failed to run '$PDFTOHTML' on $self->{Source}\n"
;
}
my
$dom
= XML::LibXML->load_html(
string
=>
$rawml
);
my
(
$body
) =
$dom
->findnodes(
'/html/body'
);
print
{
$fh
}
$body
->textContent;
close
$fh
;
return
$out
//
$raw
;
}
sub
metadata {
my
$self
=
shift
;
return
$self
->{Metadata}->hash;
}
sub
has_cover { 1 }
sub
cover {
my
$self
=
shift
;
my
$out
=
shift
;
unless
(
defined
$PDFTOPNG
) {
die
"Cannot dump PDF $self->{Source} cover: pdftopng not installed\n"
;
}
my
$tmpdir
= tempdir(
CLEANUP
=> 1);
my
$tmproot
= File::Spec->catfile(
$tmpdir
,
'tmp'
);
qx/$PDFTOPNG -f 1 -l 1 '$self->{Source}' '$tmproot'/
;
unless
($? >> 8 == 0) {
die
"Failed to run '$PDFTOPNG' on $self->{Source}\n"
;
}
my
(
$png
) = dir(
$tmpdir
);
unless
(
defined
$png
) {
die
"'$PDFTOPNG' could not produce a cover image from $self->{Source}\n"
;
}
my
$bin
;
open
my
$wh
,
'>'
,
$out
// \
$bin
or
die
sprintf
"Failed to open %s for writing: $!\n"
,
$out
//
'in-memory scalar'
;
binmode
$wh
;
open
my
$rh
,
'<'
,
$png
or
die
"Failed to open $png for reading: $!\n"
;
binmode
$rh
;
while
(
read
$rh
,
my
(
$buf
), 1024) {
print
{
$wh
}
$buf
;
}
close
$wh
;
close
$rh
;
return
$out
//
$bin
;
}
sub
image_num {
my
$self
=
shift
;
unless
(
defined
$self
->{_imgdir}) {
$self
->_images;
}
return
scalar
@{
$self
->{_images} };
}
sub
image {
my
$self
=
shift
;
my
$n
=
shift
;
if
(
$n
>=
$self
->image_num) {
return
undef
;
}
open
my
$fh
,
'<'
,
$self
->{_images}[
$n
]
or
die
"Failed to open $self->{_images}[$n]: $!\n"
;
binmode
$fh
;
my
$img
=
do
{
local
$/ =
undef
;
readline
$fh
};
close
$fh
;
return
\
$img
;
}
1;