—package
Renard::API::MuPDF::mutool;
# ABSTRACT: Retrieve PDF image and text data via MuPDF's mutool
$Renard::API::MuPDF::mutool::VERSION
=
'0.006'
;
use
XML::Simple;
use
Alien::MuPDF 0.007;
use
Path::Tiny;
BEGIN {
our
$MUTOOL_PATH
= Alien::MuPDF->mutool_path;
}
fun _call_mutool(
@mutool_args
) {
my
@args
= (
$Renard::API::MuPDF::mutool::MUTOOL_PATH
,
@mutool_args
);
my
(
$stdout
,
$exit
);
# Note: The code below is marked as uncoverable because it only applies
# on Windows and we are currently only automatically checking coverage
# on Linux via Travis-CI.
# uncoverable branch true
if
( $^O eq
'MSWin32'
) {
# Need to redirect to a file for two reasons:
# - /SUBSYSTEM:WINDOWS closes stdin/stdout <https://github.com/project-renard/curie/issues/128>.
# - MuPDF does not set the mode on stdout to binary <http://bugs.ghostscript.com/show_bug.cgi?id=694954>.
my
$temp_fh
= File::Temp->new;
# uncoverable statement
close
$temp_fh
;
# to avoid Windows file locking # uncoverable statement
my
$output_param
= 0;
# uncoverable statement
for
my
$idx
(1..
@args
-2) {
# uncoverable statement
# uncoverable branch true
if
(
$args
[
$idx
] eq
'-o'
# uncoverable statement
&&
$args
[
$idx
+1] eq
'-'
) {
$args
[
$idx
+1] =
$temp_fh
->filename;
# uncoverable statement
$output_param
= 1;
# uncoverable statement
}
}
# uncoverable branch true
if
( not
$output_param
) {
# uncoverable statement
# redirect into a temp file
my
$cmd
=
join
" "
,
# uncoverable statement
map
{
$_
=~ /\s/ ?
"\"$_\""
:
$_
}
# uncoverable statement
@args
;
# uncoverable statement
my
$redir
=
$temp_fh
->filename;
# uncoverable statement
@args
= (
"$cmd > \"$redir\""
);
# uncoverable statement
}
$log
->infof(
"running mutool: %s"
, \
@args
);
# uncoverable statement
system
(
@args
);
# uncoverable statement
$stdout
= path(
$temp_fh
->filename )->slurp_raw;
# uncoverable statement
$exit
= $?;
# uncoverable statement
}
else
{
# Make sure STDOUT is :raw
open
my
$dup
,
">&="
,
*STDOUT
or
die
$!;
local
*STDOUT
;
open
(STDOUT,
">&="
,
$dup
);
binmode
*STDOUT
,
':raw'
;
(
$stdout
,
undef
,
$exit
) = capture {
$log
->infof(
"running mutool: %s"
, \
@args
);
system
(
@args
);
};
}
die
"Unexpected mutool exit: $exit"
if
$exit
;
return
$stdout
;
}
fun get_mutool_pdf_page_as_png(
$pdf_filename
,
$pdf_page_no
,
$zoom_level
) {
my
$stdout
= _call_mutool(
qw(draw)
,
qw( -r )
, (
$zoom_level
* MUPDF_DEFAULT_RESOLUTION),
# calculate the resolution
qw( -F png )
,
qw( -o -)
,
$pdf_filename
,
$pdf_page_no
,
);
return
$stdout
;
}
fun get_mutool_text_stext_raw(
$pdf_filename
,
$pdf_page_no
) {
my
$stdout
= _call_mutool(
qw(draw)
,
qw(-F stext)
,
qw(-o -)
,
$pdf_filename
,
$pdf_page_no
,
);
return
$stdout
;
}
fun get_mutool_text_stext_xml(
$pdf_filename
,
$pdf_page_no
) {
my
$stext_xml
= get_mutool_text_stext_raw(
$pdf_filename
,
$pdf_page_no
,
);
my
$stext
= XMLin(
$stext_xml
,
KeyAttr
=> [],
ForceArray
=> [
qw(page block line font char)
] );
return
$stext
;
}
fun get_mutool_page_info_raw(
$pdf_filename
) {
my
$stdout
= _call_mutool(
qw(pages)
,
$pdf_filename
);
# remove the first line
$stdout
=~ s/^[^\n]*\n//s;
# wraps the data with a root node
return
"<document>$stdout</document>"
}
fun get_mutool_page_info_xml(
$pdf_filename
) {
my
$page_info_xml
= get_mutool_page_info_raw(
$pdf_filename
);
my
$page_info
= XMLin(
$page_info_xml
,
KeyAttr
=> [],
ForceArray
=> [
qw(page)
] );
my
$root_media_box_p
= Renard::API::MuPDF::mutool::ObjectParser->new(
filename
=>
$pdf_filename
,
string
=> Renard::API::MuPDF::mutool::get_mutool_get_object_raw(
$pdf_filename
,
'Root/Pages/MediaBox'
),
is_toplevel
=> 0,
);
my
$root_media_box
;
if
(
$root_media_box_p
->data ) {
$root_media_box
->{l} =
$root_media_box_p
->data->[0];
$root_media_box
->{b} =
$root_media_box_p
->data->[1];
$root_media_box
->{r} =
$root_media_box_p
->data->[2];
$root_media_box
->{t} =
$root_media_box_p
->data->[3];
}
for
my
$page_hash
(@{
$page_info
->{page} }) {
unless
(
exists
$page_hash
->{CropBox} ) {
my
$media_box
=
exists
$page_hash
->{MediaBox} ?
$page_hash
->{MediaBox} :
$root_media_box
;
$page_hash
->{CropBox} = {
%$media_box
};
}
}
return
$page_info
;
}
fun get_mutool_outline_simple(
$pdf_filename
) {
my
$outline_text
= _call_mutool(
qw(show)
,
$pdf_filename
,
qw(outline)
);
my
@outline_items
= ();
utf8::upgrade(
$outline_text
);
open
my
$outline_fh
,
'<:crlf'
, \
$outline_text
;
while
(
defined
(
my
$line
= <
$outline_fh
> ) ) {
$line
=~ /^
(?<prefix>[+|-])
(?<indent>\t*)
"(?<text>.*)"
\t
(?<reference>
# #123,20,40
( \
# (?<page>\d+)(,(?<dx>-?\d+),(?<dy>-?\d+))? )
|
# #page=123&zoom=nan,20,40
# #page=123&view=Fit
( \
# page=(?<page>\d+)(&(view|zoom)=[^&,]+?)*(,(?<dx>-?\d+),(?<dy>-?\d+))? )
|
\Q(null)\E
)
$
/x;
my
%copy
= %+;
$copy
{level} =
length
(
$copy
{indent}) - 1;
$copy
{text} =~ s/\\x([0-9A-F]{2})/
chr
(
hex
($1))/ge;
$copy
{
open
} =
$copy
{prefix} eq
'-'
;
delete
$copy
{prefix};
delete
$copy
{indent};
delete
$copy
{reference};
# not storing the offsets yet and not every line has offsets
delete
@copy
{
qw(dx dy)
};
push
@outline_items
, \
%copy
;
}
return
\
@outline_items
;
}
fun get_mutool_get_trailer_raw(
$pdf_filename
) {
my
$trailer_text
= _call_mutool(
qw(show)
,
$pdf_filename
,
qw(trailer)
);
utf8::upgrade(
$trailer_text
);
open
my
$trailer_fh
,
'<:crlf'
, \
$trailer_text
;
do
{
local
$/ =
''
; <
$trailer_fh
> };
}
fun get_mutool_get_object_raw(
$pdf_filename
,
$object_id
) {
my
$object_text
= _call_mutool(
qw(show)
,
$pdf_filename
,
$object_id
,
);
utf8::upgrade(
$object_text
);
open
my
$object_fh
,
'<:crlf'
, \
$object_text
;
do
{
local
$/ =
''
; <
$object_fh
> };
}
fun get_mutool_get_info_object_parsed(
$pdf_filename
) {
my
$trailer
= Renard::API::MuPDF::mutool::ObjectParser->new(
filename
=>
$pdf_filename
,
string
=> Renard::API::MuPDF::mutool::get_mutool_get_trailer_raw(
$pdf_filename
),
);
my
$info
=
$trailer
->resolve_key(
'Info'
);
}
1;
__END__
=pod
=encoding UTF-8
=head1 NAME
Renard::API::MuPDF::mutool - Retrieve PDF image and text data via MuPDF's mutool
=head1 VERSION
version 0.006
=head1 FUNCTIONS
=head2 _call_mutool
_call_mutool( @args )
Helper function which calls C<mutool> with the contents of the C<@args> array.
Returns the captured C<STDOUT> of the call.
This function dies if C<mutool> unsuccessfully exits.
=head2 get_mutool_pdf_page_as_png
get_mutool_pdf_page_as_png($pdf_filename, $pdf_page_no)
This function returns a PNG stream that renders page number C<$pdf_page_no> of
the PDF file C<$pdf_filename>.
=head2 get_mutool_text_stext_raw
get_mutool_text_stext_raw($pdf_filename, $pdf_page_no)
This function returns an XML string that contains structured text from page
number C<$pdf_page_no> of the PDF file C<$pdf_filename>.
The XML format is defined by the output of C<mutool> looks like this (for page
23 of the C<pdf_reference_1-7.pdf> file):
<?xml version="1.0"?>
<document name="(null)">
<page height="666" width="531">
<block bbox="261.18 616.16397 269.77766 625.2532">
<line bbox="261.18 616.16397 269.77766 625.2532" dir="1 0" wmode="0">
<font name="MyriadPro-Semibold" size="7.98">
<char bbox="261.18 616.16397 265.45729 625.2532" c="2" x="261.18" y="623.2582"/>
<char bbox="265.50038 616.16397 269.77766 625.2532" c="3" x="265.50038" y="623.2582"/>
</font>
</line>
</block>
<block bbox="225.78 88.20229 305.18159 117.93829">
<line bbox="225.78 88.20229 305.18159 117.93829" dir="1 0" wmode="0">
<font name="MyriadPro-Bold" size="24">
<char bbox="225.78 88.20229 239.724 117.93829" c="P" x="225.78" y="111.93829"/>
<char bbox="239.5176 88.20229 248.63759 117.93829" c="r" x="239.5176" y="111.93829"/>
<char bbox="248.4552 88.20229 261.1272 117.93829" c="e" x="248.4552" y="111.93829"/>
<char bbox="261.1128 88.20229 269.29679 117.93829" c="f" x="261.1128" y="111.93829"/>
</font>
</line>
</block>
</page>
</document>
Simplified, the high-level structure looks like:
<page> -> [list of blocks]
<block> -> [list of blocks]
a block is either:
- stext
<line> -> [list of lines] (all have same baseline)
<font> -> [list of fonts] (horizontal spaces over a line)
<char> -> [list of chars]
- image
# TODO document the image data from mutool
=head2 get_mutool_text_stext_xml
get_mutool_text_stext_xml($pdf_filename, $pdf_page_no)
Returns a HashRef of the structured text from from page
number C<$pdf_page_no> of the PDF file C<$pdf_filename>.
See the function L<get_mutool_text_stext_raw|/get_mutool_text_stext_raw> for
details on the structure of this data.
=head2 get_mutool_page_info_raw
get_mutool_page_info_raw($pdf_filename)
Returns an XML string of the page bounding boxes of PDF file C<$pdf_filename>.
The data is in the form:
<document>
<page pagenum="1">
<MediaBox l="0" b="0" r="531" t="666" />
<CropBox l="0" b="0" r="531" t="666" />
<Rotate v="0" />
</page>
<page pagenum="2">
...
</page>
</document>
=head2 get_mutool_page_info_xml
get_mutool_page_info_xml($pdf_filename)
Returns a HashRef containing the page bounding boxes of PDF file
C<$pdf_filename>.
See function L<get_mutool_page_info_raw|/get_mutool_page_info_raw> for
information on the structure of the data.
=head2 get_mutool_outline_simple
fun get_mutool_outline_simple($pdf_filename)
Returns an array of the outline of the PDF file C<$pdf_filename> as an
C<ArrayRef[HashRef]> which corresponds to the C<items> attribute of
L<Renard::Incunabula::Outline>.
=head2 get_mutool_get_trailer_raw
fun get_mutool_get_trailer_raw($pdf_filename)
Returns the trailer of the PDF file C<$pdf_filename> as a string.
=head2 get_mutool_get_object_raw
fun get_mutool_get_object_raw($pdf_filename, $object_id)
Returns the object given by the ID C<$object_id> for PDF file C<$pdf_filename>
as a string.
=head2 get_mutool_get_info_object_parsed
fun get_mutool_get_info_object_parsed( $pdf_filename )
Returns the document information dictionary as a
L<Renard::API::MuPDF::mutool::ObjectParser> object.
See Table 10.2 on pg. 844 of the I<PDF Reference, version 1.7> to see the
entries that usually used (e.g., Title, Author).
=head1 SEE ALSO
L<Repository information|http://project-renard.github.io/doc/development/repo/p5-Renard-API-MuPDF-mutool/>
=head1 AUTHOR
Project Renard
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2017 by Project Renard.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut