#!/usr/bin/env perl
# /=====================================================================\ #
# | latexml | #
# | main conversion program | #
# |=====================================================================| #
# | Part of LaTeXML: | #
# | Public domain software, produced as part of work done by the | #
# | United States Government & not subject to copyright in the US. | #
# |---------------------------------------------------------------------| #
# | Bruce Miller <bruce.miller@nist.gov> #_# | #
# | http://dlmf.nist.gov/LaTeXML/ (o o) | #
# \=========================================================ooo==U==ooo=/ #
use strict;
use warnings;
use FindBin;
use Getopt::Long;
use Pod::Usage;
use LaTeXML::Core;
use Encode;
#**********************************************************************
# Parse command line
# Defaults for every command-line setting; the GetOptions call that follows
# overwrites these in place.
my $verbosity     = 0;        # --verbose increments, --quiet decrements
my $strict        = 0;
my $comments      = 1;        # comments are kept unless --nocomments is given
my $noparse       = 0;
my $includestyles = 0;
my $format        = 'xml';    # one of 'xml', 'tex' or 'box'
my $destination   = '';       # empty means "write to standard output"
my $help;
my $showversion;
my $preamble;
my $postamble;
my $documentid;
my $inputencoding;
my $mode;                     # left undefined here; decided once the source is known
my @paths;
my @preload;
# Parse the command line into the option variables.
# An unrecognized or malformed option prints the identity string plus a usage
# message on STDERR and exits with status 1.
GetOptions(
    # Options that take a value.
    "destination=s"   => \$destination,
    "output=s"        => \$destination,      # obsolete synonym for --destination
    "preload=s"       => \@preload,
    "path=s"          => \@paths,
    "preamble=s"      => \$preamble,
    "postamble=s"     => \$postamble,
    "inputencoding=s" => \$inputencoding,
    "documentid=s"    => \$documentid,

    # Repeatable verbosity adjusters.
    "quiet"   => sub { $verbosity--; },
    "verbose" => sub { $verbosity++; },

    # Output format switches: --xml, --tex and --box each select their own name.
    (map { my $chosen = $_; ($chosen => sub { $format = $chosen; }) } qw(xml tex box)),

    # Simple flags.
    "bibtex"        => sub { $mode = 'BibTeX'; },
    "strict"        => \$strict,
    "noparse"       => \$noparse,
    "includestyles" => \$includestyles,
    "comments!"     => \$comments,
    "VERSION"       => \$showversion,
    "help"          => \$help,

    # --debug=Package sets the package variable $LaTeXML::Package::DEBUG to 1;
    # the name is built at run time, hence the symbolic reference.
    "debug=s" => sub {
        my (undef, $package) = @_;
        no strict 'refs';
        ${ 'LaTeXML::' . $package . '::DEBUG' } = 1;
    },
  )
  or pod2usage(
    -message => $LaTeXML::IDENTITY,
    -exitval => 1,
    -verbose => 0,
    -output  => \*STDERR
  );
# --help: show the documentation on STDOUT and exit successfully.
pod2usage(
    -message => $LaTeXML::IDENTITY,
    -exitval => 0,
    -verbose => 2,
    -output  => \*STDOUT
) if $help;

# --VERSION: report the identity string on STDERR and stop with status 0.
if ($showversion) {
    print STDERR "$LaTeXML::IDENTITY\n";
    exit(0);
}

# A source argument is mandatory; without one, print usage on STDERR and exit 1.
pod2usage(
    -message => "$LaTeXML::IDENTITY\nMissing input TeX file",
    -exitval => 1,
    -verbose => 0,
    -output  => \*STDERR
) unless @ARGV;

# Only the first remaining argument is used as the source.
my $source = $ARGV[0];
#**********************************************************************
# Set up the processing.
# Announce the run on STDERR; skipped once --quiet has pushed the verbosity
# below zero.  localtime() is in scalar context here, so it yields a
# human-readable timestamp string.
if ($verbosity > -1) {
    print STDERR "$LaTeXML::IDENTITY\n";
    print STDERR "processing started " . localtime() . "\n";
}
# Pass every --path entry through pathname_canonical, then warn (without
# aborting) about any entries that are not existing directories.
@paths = map { pathname_canonical($_) } @paths;
{
    my @missing;
    for my $dir (@paths) {
        push @missing, $dir unless -d $dir;
    }
    warn "These path directories do not exist: " . join(', ', @missing) . "\n"
      if @missing;
}
# Build the converter from the parsed options.  Only --path entries that are
# existing directories are handed over, in the reverse of the order given.
my $latexml = do {
    my @searchable = grep { -d $_ } @paths;
    LaTeXML::Core->new(
        preload         => [@preload],
        searchpaths     => [reverse @searchable],
        graphicspaths   => ['.'],
        verbosity       => $verbosity,
        strict          => $strict,
        includecomments => $comments,
        inputencoding   => $inputencoding,
        includestyles   => $includestyles,
        documentid      => $documentid,
        nomathparse     => $noparse
    );
};
# Validate the destination up front, so that a bad output location is reported
# before any conversion work is done.
if ($destination) {
    $destination = pathname_canonical($destination);
    my $dir = pathname_directory($destination);
    if ($dir) {
        die "Couldn't create destination directory $dir: $!"
          unless pathname_mkdir($dir);
    }
}
# Diagnostics written to STDERR from here on are encoded as UTF-8.
binmode(STDERR, ":encoding(UTF-8)");

# When --bibtex was not given, choose the mode from the source name:
# a name ending in .bib is processed as BibTeX, anything else as TeX.
if (!defined $mode) {
    $mode = ($source =~ /\.bib$/) ? 'BibTeX' : 'TeX';
}
# ========================================
# First read and digest whatever we're given.
my $digested;
my $serialized;
eval {
    # A source of '-' means the document arrives on standard input: slurp it
    # whole (<> treats the '-' in @ARGV as STDIN) and pass it on as a literal.
    if ($source eq '-') {
        local $/ = undef;
        $source = "literal:" . <>;
    }
    $digested = $latexml->digestFile($source,
        mode      => $mode,
        preamble  => $preamble,
        postamble => $postamble);

    # ========================================
    # Now, convert to DOM, encode and output, if desired.
    if ($digested) {
        $latexml->withState(sub {
                if ($format eq 'tex') {
                    $serialized = LaTeXML::Core::Token::UnTeX($digested);
                }
                elsif ($format eq 'box') {
                    # --verbose selects stringify over toString for box output.
                    $serialized = ($verbosity > 0 ? $digested->stringify : $digested->toString);
                }
                else {
                    my $dom = $latexml->convertDocument($digested);
                    $serialized = $dom->toString(1);
                }
                # NOTE that we are serializing via LaTeXML's Document::serialize_aux
                # which has NOT been encoded into bytes, so we need an explicit
                # encode before printing/returning
                $serialized = Encode::encode('UTF-8', $serialized) if $serialized;
        });
    }
};

# Capture the failure right away: $@ is a global that any later eval (inside
# the method calls or I/O below) may overwrite, and the exit status at the end
# must reflect THIS conversion.
my $fatal_error = $@;
print STDERR $fatal_error if $fatal_error;    # Fatal error?

$latexml->showProfile();                      # Show profile (if any)
print STDERR "\nConversion complete: " . $latexml->getStatusMessage . ".\n";
print STDERR "processing finished " . localtime() . "\n" if $verbosity > -1;

if (!$serialized) {
    # Nothing was produced (fatal error or empty result): write nothing.
}
elsif ($destination) {
    open(my $OUTPUT, ">", $destination)
      or die "Couldn't open output file $destination: $!";
    # $serialized already holds UTF-8 bytes, so no encoding layer is needed.
    # Write errors can surface at print or only at close; check both.
    print {$OUTPUT} $serialized
      or die "Couldn't write output file $destination: $!";
    close($OUTPUT)
      or die "Couldn't close output file $destination: $!";
}
else {
    print $serialized;
}

# ========================================
# Now, unbind stuff, so we can clear memory
$latexml    = undef;
$digested   = undef;
$serialized = undef;

# non-zero exit code for fatal conversions
exit 1 if $fatal_error;
#**********************************************************************
__END__
=head1 NAME
C<latexml> - transforms a TeX/LaTeX file into XML.
=head1 SYNOPSIS
latexml [options] I<texfile>
Options:
--destination=file sets destination file (default stdout).
--output=file [obsolete synonym for --destination]
--preload=module requests loading of an optional module;
can be repeated
--preamble=file sets a preamble file which will
effectively be prepended to the main file.
--postamble=file sets a postamble file which will
effectively be appended to the main file.
--includestyles allows latexml to load raw *.sty files;
by default it avoids this.
--path=dir adds to the paths searched for files,
modules, etc;
--documentid=id assign an id to the document root.
--quiet suppress messages (can repeat)
--verbose more informative output (can repeat)
--strict makes latexml less forgiving of errors
--bibtex processes as a BibTeX bibliography.
--xml requests xml output (default).
--tex requests TeX output after expansion.
--box requests box output after expansion
and digestion.
--noparse suppresses parsing math
--nocomments omit comments from the output
--inputencoding=enc specify the input encoding.
--VERSION show version number.
--debug=package enables debugging output for the named
package
--help shows this help message.
If I<texfile> is '-', latexml reads the TeX source from standard input.
If I<texfile> has an explicit extension of C<.bib>, it is processed
as a BibTeX bibliography.
=head1 OPTIONS AND ARGUMENTS
=over 4
=item C<--destination>=I<file>
Specifies the destination file; by default the XML is written to stdout.
=item C<--preload>=I<module>
Requests the loading of an optional module or package. This may be useful if the TeX code
does not specifically require the module (e.g. through input or usepackage).
For example, use C<--preload=LaTeX.pool> to force LaTeX mode.
=item C<--preamble>=I<file>, C<--postamble>=I<file>
Specifies a file whose contents will effectively be prepended or appended
to the main document file's content. This can be useful when processing
TeX fragments, in which case the preamble would contain documentclass and begindocument
control sequences. This option is not used when processing BibTeX files.
=item C<--includestyles>
This option allows processing of style files (files with extensions C<sty>,
C<cls>, C<clo>, C<cnf>). By default, these files are ignored unless a latexml
implementation of them is found (with an extension of C<ltxml>).
These style files generally fall into two classes: Those
that merely affect document style are ignorable in the XML.
Others define new markup and document structure, often using
deeper LaTeX macros to achieve their ends. Although the omission
will lead to other errors (missing macro definitions), it is
unlikely that processing the TeX code in the style file will
lead to a correct document.
=item C<--path>=I<dir>
Add I<dir> to the search paths used when searching for files, modules, style files, etc;
somewhat like TEXINPUTS. This option can be repeated.
=item C<--documentid>=I<id>
Assigns an ID to the root element of the XML document. This ID is generally
inherited as the prefix of ID's on all other elements within the document.
This is useful when constructing a site of multiple documents so that
all nodes have unique IDs.
=item C<--quiet>
Reduces the verbosity of output during processing, used twice is pretty silent.
=item C<--verbose>
Increases the verbosity of output during processing, used twice is pretty chatty.
Can be useful for getting more details when errors occur.
=item C<--strict>
Specifies a strict processing mode. By default, undefined control sequences and
invalid document constructs (that violate the DTD) give warning messages, but attempt
to continue processing. Using --strict makes them generate fatal errors.
=item C<--bibtex>
Forces latexml to treat the file as a BibTeX bibliography.
Note that the timing is slightly different than the usual
case with BibTeX and LaTeX. In the latter case, BibTeX simply
selects and formats a subset of the bibliographic entries; the
actual TeX expansion is carried out when the result is included
in a LaTeX document. In contrast, latexml processes and expands
the entire bibliography; the selection of entries is done
during postprocessing. This also means that any packages
that define macros used in the bibliography must be
specified using the C<--preload> option.
=item C<--xml>
Requests XML output; this is the default.
=item C<--tex>
Requests TeX output for debugging purposes; processing is only carried out through expansion and digestion.
This may not be quite valid TeX, since Unicode may be introduced.
=item C<--box>
Requests Box output for debugging purposes; processing is carried out through expansion and digestion,
and the result is printed.
=item C<--nocomments>
Normally latexml preserves comments from the source file, and adds a comment every 25 lines as
an aid in tracking the source. The option --nocomments discards such comments.
=item C<--inputencoding=>I<encoding>
Specify the input encoding, eg. C<--inputencoding=iso-8859-1>.
The encoding must be one known to Perl's Encode package.
Note that this only enables the translation of the input bytes to
UTF-8 used internally by LaTeXML, but does not affect catcodes.
It is usually better to use LaTeX's inputenc package.
Note that this does not affect the output encoding, which is
always UTF-8.
=item C<--VERSION>
Shows the version number of the LaTeXML package.
=item C<--debug>=I<package>
Enables debugging output for the named package. The package is given without the leading LaTeXML::.
=item C<--help>
Shows this help message.
=back
=head1 SEE ALSO
L<latexmlpost>, L<latexmlmath>, L<LaTeXML>
=cut