Sponsoring The Perl Toolchain Summit 2025: Help make this important event another success Learn more

# /=====================================================================\ #
# | LaTeXML::Core::Mouth | #
# | Analog of TeX's Mouth: Tokenizes strings & files | #
# |=====================================================================| #
# | Part of LaTeXML: | #
# | Public domain software, produced as part of work done by the | #
# | United States Government & not subject to copyright in the US. | #
# |---------------------------------------------------------------------| #
# | Bruce Miller <bruce.miller@nist.gov> #_# | #
# \=========================================================ooo==U==ooo=/ #
use strict;
use Encode qw(decode);
our $READLINE_PROGRESS_QUANTUM = 25;
# Factory method;
# Create an appropriate Mouth
# options are
# quiet,
# atletter,
# content
sub create {
my ($class, $source, %options) = @_;
if ($options{content}) { # we've cached the content of this source
my ($dir, $name, $ext) = pathname_split($source);
$options{source} = $source;
$options{shortsource} = "$name.$ext";
return $class->new($options{content}, %options); }
elsif ($source =~ s/^literal://) { # we've supplied literal data
$options{source} = ''; # the source does not have a corresponding file name
return $class->new($source, %options); }
elsif (!defined $source) {
return $class->new('', %options); }
else {
my $type = pathname_protocol($source);
my $newclass = "LaTeXML::Core::Mouth::$type";
if (!$newclass->can('new')) { # not already defined somewhere?
require "LaTeXML/Core/Mouth/$type.pm"; } # Load it!
return $newclass->new($source, %options); } }
sub new {
my ($class, $string, %options) = @_;
$string = q{} unless defined $string;
#$options{source} = "Anonymous String" unless defined $options{source};
#$options{shortsource} = "String" unless defined $options{shortsource};
my $self = bless { source => $options{source},
shortsource => $options{shortsource},
fordefinitions => ($options{fordefinitions} ? 1 : 0),
notes => ($options{notes} ? 1 : 0),
}, $class;
$self->openString($string);
$self->initialize;
return $self; }
sub openString {
my ($self, $string) = @_;
# if (0){
if (defined $string) {
if (utf8::is_utf8($string)) { } # If already utf7
elsif (my $encoding = $STATE->lookupValue('PERL_INPUT_ENCODING')) {
# Note that if chars in the input cannot be decoded, they are replaced by \x{FFFD}
# I _think_ that for TeX's behaviour we actually should turn such un-decodeable chars in to space(?).
$string = decode($encoding, $string, Encode::FB_DEFAULT);
if ($string =~ s/\x{FFFD}/ /g) { # Just remove the replacement chars, and warn (or Info?)
Info('misdefined', $encoding, $self, "input isn't valid under encoding $encoding"); } } }
$$self{string} = $string;
$$self{buffer} = [(defined $string ? splitLines($string) : ())];
return; }
sub initialize {
my ($self) = @_;
$$self{lineno} = 0;
$$self{colno} = 0;
$$self{chars} = [];
$$self{nchars} = 0;
if ($$self{notes}) {
my $source = defined($$self{source}) ? ($$self{source} || 'Literal String') : 'Anonymous String';
$$self{note_message} = "Processing " . ($$self{fordefinitions} ? "definitions" : "content")
. " " . $source;
ProgressSpinup($$self{note_message}); }
if ($$self{fordefinitions}) {
$$self{saved_at_cc} = $STATE->lookupCatcode('@');
$$self{SAVED_INCLUDE_COMMENTS} = $STATE->lookupValue('INCLUDE_COMMENTS');
$STATE->assignCatcode('@' => CC_LETTER);
$STATE->assignValue(INCLUDE_COMMENTS => 0); }
return; }
sub finish {
my ($self) = @_;
return if $$self{finished};
$$self{finished} = 1;
$$self{buffer} = [];
$$self{lineno} = 0;
$$self{colno} = 0;
$$self{chars} = [];
$$self{nchars} = 0;
if ($$self{fordefinitions}) {
$STATE->assignCatcode('@' => $$self{saved_at_cc});
$STATE->assignValue(INCLUDE_COMMENTS => $$self{SAVED_INCLUDE_COMMENTS}); }
if ($$self{notes}) {
ProgressSpindown($$self{note_message}); }
return; }
# This is (hopefully) a platform independent way of splitting a string
# into "lines" ending with CRLF, CR or LF (DOS, Mac or Unix).
# Note that TeX considers newlines to be \r, ie CR, ie ^^M
sub splitLines {
my ($string) = @_;
my @lines = split(/\r\n|\r|\n/s, $string, -1);
# split returns an extra empty string if $string ends with an EOL
# this must be removed
if (@lines && $lines[-1] eq '') { pop(@lines); }
return @lines; }
# This is (hopefully) a correct way to split a line into "chars",
# or what is probably more desired is "Grapheme clusters" (even "extended")
# These are unicode characters that include any following combining chars, accents & such.
# I am thinking that when we deal with unicode this may be the most correct way?
# If it's not the way XeTeX does it, perhaps, it must be that ALL combining chars
# have to be converted to the proper accent control sequences!
sub splitChars {
my ($line) = @_;
return [$line =~ m/\X/g]; }
sub getNextLine {
my ($self) = @_;
return unless scalar(@{ $$self{buffer} });
my $line = shift(@{ $$self{buffer} });
return $line; }
sub hasMoreInput {
my ($self) = @_;
return !$self->isEOL || scalar(@{ $$self{buffer} }); }
# Get the next character & it's catcode from the input,
# handling TeX's "^^" encoding.
# Note that this is the only place where catcode lookup is done,
# and that it is somewhat `inlined'.
sub getNextChar {
my ($self) = @_;
if ($$self{colno} < $$self{nchars}) {
my $ch = $$self{chars}[$$self{colno}++];
my $cc = $$STATE{catcode}{$ch}[0] // CC_OTHER; # $STATE->lookupCatcode($ch); OPEN CODED!
if (($cc == CC_SUPER) # Possible convert ^^x
&& ($$self{colno} + 1 < $$self{nchars}) && ($ch eq $$self{chars}[$$self{colno}])) {
my ($c1, $c2);
if (($$self{colno} + 2 < $$self{nchars}) # ^^ followed by TWO LOWERCASE Hex digits???
&& (($c1 = $$self{chars}[$$self{colno} + 1]) =~ /^[0-9a-f]$/)
&& (($c2 = $$self{chars}[$$self{colno} + 2]) =~ /^[0-9a-f]$/)) {
$ch = chr(hex($c1 . $c2));
splice(@{ $$self{chars} }, $$self{colno} - 1, 4, $ch);
$$self{nchars} -= 3; }
else { # OR ^^ followed by a SINGLE Control char type code???
my $c = $$self{chars}[$$self{colno} + 1];
my $cn = ord($c);
$ch = chr($cn + ($cn >= 64 ? -64 : 64));
splice(@{ $$self{chars} }, $$self{colno} - 1, 3, $ch);
$$self{nchars} -= 2; }
$cc = $STATE->lookupCatcode($ch) // CC_OTHER; }
return ($ch, $cc); }
else {
return (undef, undef); } }
sub stringify {
my ($self) = @_;
return "Mouth[<string>\@$$self{lineno}x$$self{colno}]"; }
#**********************************************************************
sub getLocator {
my ($self) = @_;
my ($toLine, $toCol, $fromLine, $fromCol) = ($$self{lineno}, $$self{colno});
my $maxCol = ($$self{nchars} ? $$self{nchars} - 1 : 0); #There is always a trailing EOL char
if ((defined $toCol) && ($toCol >= $maxCol)) {
$fromLine = $toLine;
$fromCol = 0; }
else {
$fromLine = $toLine;
$fromCol = $toCol; }
return LaTeXML::Common::Locator->new($$self{source}, $fromLine, $fromCol, $toLine, $toCol); }
sub getSource {
my ($self) = @_;
return $$self{source}; }
#**********************************************************************
# See The TeXBook, Chapter 8, The Characters You Type, pp.46--47.
#**********************************************************************
sub handle_escape { # Read control sequence
my ($self) = @_;
# NOTE: We're using control sequences WITH the \ prepended!!!
my ($ch, $cc) = getNextChar($self);
# Knuth, p.46 says that Newlines are converted to spaces,
# Bit I believe that he does NOT mean within control sequences
my $cs = "\\" . $ch; # I need this standardized to be able to lookup tokens (A better way???)
if ((defined $cc) && ($cc == CC_LETTER)) { # For letter, read more letters for csname.
while ((($ch, $cc) = getNextChar($self)) && $ch && ($cc == CC_LETTER)) {
$cs .= $ch; }
# We WILL skip spaces, but not till next token is read (in case catcode changes!!!!)
$$self{skipping_spaces} = 1;
$$self{colno}-- if (defined $cc) && ($cc != CC_LETTER); }
return T_CS($cs); }
sub handle_EOL {
my ($self) = @_;
# Note that newines should be converted to space (with " " for content)
# but it makes nicer XML with occasional \n. Hopefully, this is harmless?
my $token = ($$self{colno} == 1
? T_CS('\par')
: ($STATE->lookupValue('PRESERVE_NEWLINES') ? Token("\n", CC_SPACE) : T_SPACE));
$$self{colno} = $$self{nchars}; # Ignore any remaining characters after EOL
return $token; }
sub handle_space {
my ($self) = @_;
my ($ch, $cc);
# Skip any following spaces!
while ((($ch, $cc) = getNextChar($self)) && (defined $ch) && (($cc == CC_SPACE) || ($cc == CC_EOL))) { }
$$self{colno}-- if ($$self{colno} <= $$self{nchars}) && (defined $ch); # backup at nonspace/eol
return T_SPACE; }
sub handle_comment {
my ($self) = @_;
my $n = $$self{colno};
$$self{colno} = $$self{nchars};
my $comment = join('', @{ $$self{chars} }[$n .. $$self{nchars} - 1]);
$comment =~ s/^\s+//; $comment =~ s/\s+$//;
if ($comment && $STATE->lookupValue('INCLUDE_COMMENTS')) {
return T_COMMENT($comment); }
elsif (($STATE->lookupValue('PRESERVE_NEWLINES') || 0) > 1) {
return T_MARKER('EOL'); } # Required EOL during \read
else {
return; } }
# These cache the (presumably small) set of distinct letters, etc
# converted to Tokens.
# Note that this gets filled during runtime and carries over to through Daemon frames.
# However, since the values don't depend on any particular document, bindings, etc,
# they should be safe.
my %LETTER = ();
my %OTHER = ();
my %ACTIVE = ();
# # Dispatch table for catcodes.
# Possibly want to think about caching (common) letters, etc to keep from
# creating tokens like crazy... or making them more compact... or ???
my @DISPATCH = ( # [CONSTANT]
\&handle_escape, # T_ESCAPE
sub { ($_[1] eq '{' ? T_BEGIN : Token($_[1], CC_BEGIN)) }, # T_BEGIN
sub { ($_[1] eq '}' ? T_END : Token($_[1], CC_END)) }, # T_END
sub { ($_[1] eq '$' ? T_MATH : Token($_[1], CC_MATH)) }, # T_MATH
sub { ($_[1] eq '&' ? T_ALIGN : Token($_[1], CC_ALIGN)) }, # T_ALIGN
\&handle_EOL, # T_EOL
sub { ($_[1] eq '#' ? T_PARAM : Token($_[1], CC_PARAM)) }, # T_PARAM
sub { ($_[1] eq '^' ? T_SUPER : Token($_[1], CC_SUPER)) }, # T_SUPER
sub { ($_[1] eq '_' ? T_SUB : Token($_[1], CC_SUB)) }, # T_SUB
sub { undef; }, # T_IGNORE (we'll read next token)
\&handle_space, # T_SPACE
sub { $LETTER{ $_[1] } || ($LETTER{ $_[1] } = T_LETTER($_[1])); }, # T_LETTER
sub { $OTHER{ $_[1] } || ($OTHER{ $_[1] } = T_OTHER($_[1])); }, # T_OTHER
sub { $ACTIVE{ $_[1] } || ($ACTIVE{ $_[1] } = T_ACTIVE($_[1])); }, # T_ACTIVE
\&handle_comment, # T_COMMENT
sub { T_OTHER($_[1]); } # T_INVALID (we could get unicode!)
);
# Read the next token, or undef if exhausted.
# Note that this also returns COMMENT tokens containing source comments,
# and also locator comments (file, line# info).
# LaTeXML::Core::Gullet intercepts them and passes them on at appropriate times.
sub readToken {
my ($self) = @_;
while (1) { # Iterate till we find a token, or run out. (use return)
# ===== Get next line, if we need to.
if ($$self{colno} >= $$self{nchars}) {
$$self{lineno}++;
$$self{colno} = 0;
my $line = $self->getNextLine;
# For \read, we have to return something for EOL, and handle implicit final newline
my $read_mode = (($STATE->lookupValue('PRESERVE_NEWLINES') || 0) > 1);
my $eolch = "\r";
if (my $eol = $STATE->lookupDefinition(T_CS('\endlinechar'))) {
$eol = $eol->valueOf()->valueOf;
$eolch = (($eol > 0) && ($eol <= 255) ? chr($eol) : undef); }
if (!defined $line) { # Exhausted the input.
my $eolcc = ((defined $eolch) && $STATE->lookupCatcode($eolch)) // CC_OTHER;
my $eoftoken = $read_mode && (defined $eolch) && !$$self{at_eof} && $$self{source}
&& ($eolcc == CC_EOL ? T_CS('\par')
: Token($eolch, $eolcc));
$$self{at_eof} = 1;
$$self{chars} = [];
$$self{nchars} = 0;
return $eoftoken if $eoftoken;
return; }
# Remove trailing spaces from external sources
if ($$self{source}) { $line =~ s/ *$//s; }
# Then append the appropriate \endlinechar, or "\r";
$line .= $eolch if defined $eolch;
$$self{chars} = splitChars($line);
$$self{nchars} = scalar(@{ $$self{chars} });
# In state N, skip spaces
while (($$self{colno} < $$self{nchars})
# DIRECT ACCESS to $STATE's catcode table!!!
&& (($$STATE{catcode}{ $$self{chars}[$$self{colno}] }[0] || CC_OTHER) == CC_SPACE)) {
$$self{colno}++; }
# If upcoming line is empty, and there is no recognizable EOL, fake one
return T_MARKER('EOL') if $read_mode
&& ($$self{colno} >= $$self{nchars}) && ((!defined $eolch) || ($eolch ne "\r"));
# Sneak a comment out, every so often.
if ((($$self{lineno} % $READLINE_PROGRESS_QUANTUM) == 0) && $STATE->lookupValue('INCLUDE_COMMENTS')) {
return T_COMMENT("**** " . ($$self{shortsource} || 'String') . " Line $$self{lineno} ****"); }
}
if ($$self{skipping_spaces}) { # In state S, skip spaces
my ($ch, $cc);
while ((($ch, $cc) = getNextChar($self)) && (defined $ch) && ($cc == CC_SPACE)) { }
$$self{colno}-- if ($$self{colno} <= $$self{nchars}) && (defined $cc) && ($cc != CC_SPACE);
if ((defined $cc) && ($cc == CC_EOL)) { # If we've got an EOL
getNextChar($self);
$$self{colno}-- if ($$self{colno} < $$self{nchars}); }
$$self{skipping_spaces} = 0; }
# ==== Extract next token from line.
my ($ch, $cc) = getNextChar($self);
my $token = (defined $cc ? $DISPATCH[$cc] : undef);
$token = &$token($self, $ch) if ref $token eq 'CODE';
return $token if defined $token; # Else, repeat till we get something or run out.
}
return; }
#**********************************************************************
# Read all tokens until a token equal to $until (if given), or until exhausted.
# Returns an empty Tokens list, if there is no input
sub readTokens {
my ($self) = @_;
my @tokens = ();
while (defined(my $token = $self->readToken())) {
push(@tokens, $token); }
while (@tokens && $tokens[-1]->getCatcode == CC_SPACE) { # Remove trailing space
pop(@tokens); }
return Tokens(@tokens); }
#**********************************************************************
# Read a raw line; there are so many variants of how it should end,
# that the Mouth API is left as simple as possible.
# Alas: $noread true means NOT to read a new line, but only return
# the remainder of the current line, if any. This is useful when combining
# with previously peeked tokens from the Gullet.
sub readRawLine {
my ($self, $noread) = @_;
my $line;
if ($$self{colno} < $$self{nchars}) {
$line = join('', @{ $$self{chars} }[$$self{colno} .. $$self{nchars} - 1]);
# strip the final carriage return, if it has been added back
$line =~ s/\r$//s;
$$self{colno} = $$self{nchars}; }
elsif ($noread) {
$line = ''; }
else {
$line = $self->getNextLine;
if (!defined $line) {
$$self{at_eof} = 1;
$$self{chars} = []; $$self{nchars} = 0; $$self{colno} = 0; }
else {
$line =~ s/ *$//s;
$$self{lineno}++;
$$self{chars} = splitChars($line);
$$self{nchars} = scalar(@{ $$self{chars} });
$$self{colno} = $$self{nchars}; } }
return $line; }
sub isEOL {
my ($self) = @_;
my $savecolno = $$self{colno};
# We have to peek past any to-be-skipped spaces!!!!
if ($$self{skipping_spaces}) {
my ($ch, $cc);
while ((($ch, $cc) = getNextChar($self)) && (defined $ch) && ($cc == CC_SPACE)) { }
$$self{colno}-- if ($$self{colno} <= $$self{nchars}) && (defined $cc) && ($cc != CC_SPACE);
if ((defined $cc) && ($cc == CC_EOL)) { # If we've got an EOL
getNextChar($self);
$$self{colno}-- if ($$self{colno} < $$self{nchars}); } }
my $eol = $$self{colno} >= $$self{nchars};
$$self{colno} = $savecolno;
return $eol; }
#======================================================================
1;
__END__
=pod
=head1 NAME
C<LaTeXML::Core::Mouth> - tokenize the input.
=head1 DESCRIPTION
A C<LaTeXML::Core::Mouth> (and subclasses) is responsible for I<tokenizing>, ie.
converting plain text and strings into L<LaTeXML::Core::Token>s according to the
current category codes (catcodes) stored in the C<LaTeXML::Core::State>.
It extends L<LaTeXML::Common::Object>.
=head2 Creating Mouths
=over 4
=item C<< $mouth = LaTeXML::Core::Mouth->create($source, %options); >>
Creates a new Mouth of the appropriate class for reading from C<$source>.
=item C<< $mouth = LaTeXML::Core::Mouth->new($string, %options); >>
Creates a new Mouth reading from C<$string>.
=back
=head2 Methods
=over 4
=item C<< $token = $mouth->readToken; >>
Returns the next L<LaTeXML::Core::Token> from the source.
=item C<< $boole = $mouth->hasMoreInput; >>
Returns whether there is more data to read.
=item C<< $string = $mouth->getLocator; >>
Return a description of current position in the source, for reporting errors.
=item C<< $tokens = $mouth->readTokens; >>
Reads all remaining tokens in the mouth, removing any trailing space catcode tokens
=item C<< $lines = $mouth->readRawLine; >>
Reads a raw (untokenized) line from C<$mouth>, or undef if none is found.
=back
=head1 AUTHOR
Bruce Miller <bruce.miller@nist.gov>
=head1 COPYRIGHT
Public domain software, produced as part of work done by the
United States Government & not subject to copyright in the US.
=cut