############################################################################### # # LaTeX::TOM::Parser # # The parsing class # ############################################################################### package LaTeX::TOM::Parser; use strict; use base qw( LaTeX::TOM::Node LaTeX::TOM::Tree ); use constant true => 1; use constant false => 0; use Carp qw(carp croak); use File::Basename qw(fileparse); our $VERSION = '0.13'; my %error_handlers = ( 0 => sub { warn "parse error: $_[0].\n" }, 1 => sub { die "parse error: $_[0].\n" }, 2 => sub {}, ); # Constructor # sub _new { my $class = shift; no strict 'refs'; my $self = bless { config => { BRACELESS => \%{'LaTeX::TOM::BRACELESS'}, INNERCMDS => \%{'LaTeX::TOM::INNERCMDS'}, MATHENVS => \%{'LaTeX::TOM::MATHENVS'}, MATHBRACKETS => \%{'LaTeX::TOM::MATHBRACKETS'}, PARSE_ERRORS_FATAL => ${'LaTeX::TOM::PARSE_ERRORS_FATAL'}, TEXTENVS => \%{'LaTeX::TOM::TEXTENVS'}, }, }; $self->_init(@_); return $self; } sub new { # XXX deprecated as of 2023-01-30 carp 'Direct use of LaTeX::TOM::Parser constructor is deprecated and will be removed in future version'; shift->_new(@_); } # Set/reset "globals" # sub _init { my $parser = shift; my ($parse_errors_fatal, $readinputs, $applymappings) = @_; my $retrieve_opt_default = sub { my ($opt, $default) = @_; return $opt if defined $opt; return $default; }; # set user options # $parser->{readinputs} = $retrieve_opt_default->($readinputs, 0); $parser->{applymappings} = $retrieve_opt_default->($applymappings, 0); $parser->{PARSE_ERRORS_FATAL} = $retrieve_opt_default->($parse_errors_fatal, $parser->{config}{PARSE_ERRORS_FATAL}); # init internal stuff # $parser->{MATHBRACKETS} = $parser->{config}{MATHBRACKETS}; # this will hold a running list/hash of commands that have been remapped $parser->{MAPPEDCMDS} = {}; # this will hold a running list/hash of commands that have been used. We dont # bother apply mappings except to commands that have been used. $parser->{USED_COMMANDS} = {}; # no file yet $parser->{file} = undef; } # Parse a LaTeX file, return a tree. You probably want this method. # sub parseFile { my $parser = shift; my $filename = shift; # init variables # $parser->{file} = $filename; # file name member data my $tree = {}; # init output tree # read in text from file or bomb out # my $text = _readFile($filename, true); # do the parse # $tree = $parser->parse($text); return $tree; } # main parsing entrypoint # sub parse { my $parser = shift; my ($text) = @_; # first half of parsing (goes up to finding commands, reading inputs) # my ($tree, $bracehash) = $parser->_parseA($text); _debug( 'done with _parseA', sub { $tree->_warn() }, ); # handle mappings # $parser->_applyMappings($tree) if $parser->{applymappings}; _debug( 'done with _applyMappings', sub { $tree->_warn() }, ); # second half of parsing (environments) # $parser->_parseB($tree); _debug( 'done with _parseB', sub { $tree->_warn() }, ); # once all the above is done we can propegate math/plaintext modes down # $parser->_propegateModes($tree, 0, 0); # math = 0, plaintext = 0 _debug( 'done with _propegateModes', sub { $tree->_warn() }, ); # handle kooky \[ \] math mode # if (not exists $parser->{MAPPEDCMDS}->{'\\['}) { # math mode (\[ \], \( \)) $parser->_stage5($tree, {'\\[' => '\\]', '\\(' => '\\)'}, 1); $parser->_propegateModes($tree, 0, 0); # have to do this again of course $parser->{MATHBRACKETS}->{'\\['} = '\\]'; # put back in brackets list for $parser->{MATHBRACKETS}->{'\\('} = '\\)'; # printing purposes. } _debug( undef, sub { $tree->_warn() }, ); $tree->listify; # add linked-list stuff return $tree; } # Parsing with no mappings and no externally accessible parser object. # sub _basicparse { my $parser = shift; # @_ would break code my $text = shift; my $parse_errors_fatal = (defined $_[0] ? $_[0] : $parser->{config}{PARSE_ERRORS_FATAL}); my $readinputs = (defined $_[1] ? $_[1] : 1); $parser = LaTeX::TOM::Parser->_new($parse_errors_fatal, $readinputs); my ($tree, $bracehash) = $parser->_parseA($text); $parser->_parseB($tree); $tree->listify; # add linked-list stuff return ($tree, $bracehash); } # start the tree. separate out comment and text nodes. # sub _stage1 { my $parser = shift; my $text = shift; my @nodes = _getTextAndCommentNodes($text, 0, length($text)); return LaTeX::TOM::Tree->_new([@nodes]); } # this stage parses the braces ({}) and adds the corresponding structure to # the tree. # sub _stage2 { my $parser = shift; my $tree = shift; my $bracehash = shift || undef; my $startidx = shift || 0; # last two params for starting at some specific my $startpos = shift || 0; # node and offset. my %blankhash; if (not defined $bracehash) { $bracehash = {%blankhash}; } my $leftidx = -1; my $leftpos = -1; my $leftcount = 0; # loop through the nodes for (my $i = $startidx; $i < @{$tree->{nodes}}; $i++) { my $node = $tree->{nodes}[$i]; my $spos = $node->{start}; # get text start position # set position placeholder within the text block my $pos = ($i == $startidx) ? $startpos : 0; if ($node->{type} eq 'TEXT') { _debug("parseStage2: looking at text node: [$node->{content}]", undef); my ($nextpos, $brace) = _findbrace($node->{content}, $pos); while ($nextpos != -1) { $pos = $nextpos + 1; # update position pointer # handle left brace if ($brace eq '{') { _debug("found '{' at position $nextpos, leftcount is $leftcount", undef); if ($leftcount == 0) { $leftpos = $nextpos; $leftidx = $i } $leftcount++; } # handle right brance elsif ($brace eq '}') { _debug("found '}' at position $nextpos, leftcount is $leftcount", undef); my $rightpos = $nextpos; $leftcount--; # found the corresponding right brace to our starting left brace if ($leftcount == 0) { # see if we have to split the text node into 3 parts # if ($leftidx == $i) { my ($leftside, $textnode3) = $node->split($rightpos, $rightpos); my ($textnode1, $textnode2) = $leftside->split($leftpos, $leftpos); # make the new GROUP node my $groupnode = LaTeX::TOM::Node->_new( {type => 'GROUP', start => $textnode2->{start} - 1, end => $textnode2->{end} + 1, children => LaTeX::TOM::Tree->_new([$textnode2]), }); # splice the new subtree into the old location splice @{$tree->{nodes}}, $i, 1, $textnode1, $groupnode, $textnode3; # add to the brace-pair lookup table $bracehash->{$groupnode->{start}} = $groupnode->{end}; $bracehash->{$groupnode->{end}} = $groupnode->{start}; # recur into new child node $parser->_stage2($groupnode->{children}, $bracehash); $i++; # skip to textnode3 for further processing } # split across nodes # else { my ($textnode1, $textnode2) = $tree->{nodes}[$leftidx]->split($leftpos, $leftpos); my ($textnode3, $textnode4) = $node->split($rightpos, $rightpos); # remove nodes in between the node we found '{' in and the node # we found '}' in # my @removed = splice @{$tree->{nodes}}, $leftidx+1, $i-$leftidx-1; # create a group node that contains the text after the left brace, # then all the nodes up until the next text node, then the text # before the right brace. # my $groupnode = LaTeX::TOM::Node->_new( {type => 'GROUP', start => $textnode2->{start} - 1, end => $textnode3->{end} + 1, children => LaTeX::TOM::Tree->_new( [$textnode2, @removed, $textnode3]), }); # replace the two original text nodes with the leftover left and # right portions, as well as the group node with everything in # the middle. # splice @{$tree->{nodes}}, $leftidx, 2, $textnode1, $groupnode, $textnode4; # add to the brace-pair lookup table $bracehash->{$groupnode->{start}} = $groupnode->{end}; $bracehash->{$groupnode->{end}} = $groupnode->{start}; # recur into new child nodes $parser->_stage2($groupnode->{children}, $bracehash); # step back to textnode4 on this level for further processing $i -= scalar @removed; } $leftpos = -1; # reset left data $leftidx = -1; last; } # $leftcount == 0 # check for '}'-based error # if ($leftcount < 0) { $error_handlers{$parser->{PARSE_ERRORS_FATAL}}->("'}' before '{' at " . ($spos + $rightpos)); $leftcount = 0; # reset and continue } } # right brace ($nextpos, $brace) = _findbrace($node->{content}, $pos); } # while (braces left) } # if TEXT } # loop over all nodes # check for extra '{' parse error # if ($leftcount > 0) { my $spos = $tree->{nodes}[$leftidx]->{start}; # get text start position $error_handlers{$parser->{PARSE_ERRORS_FATAL}}->("unmatched '{' at " . ($spos + $leftpos)); # try to continue on, after the offending brace $parser->_stage2($tree, $bracehash, $leftidx, $leftpos + 1); } return $bracehash; } # this stage finds LaTeX commands and accordingly turns GROUP nodes into # command nodes, labeled with the command # sub _stage3 { my $parser = shift; my $tree = shift; my $parent = shift; for (my $i = 0; $i < @{$tree->{nodes}}; $i++) { my $node = $tree->{nodes}[$i]; # check text node for command tag if ($node->{type} eq 'TEXT') { my $text = $node->{content}; # inner command (such as {\command text text}). our regexp checks to see # if this text chunk begins with \command, since that would be the case # due to the previous parsing stages. if found, the parent node is # promoted to a command. # if ($text =~ /^\s*\\(\w+\*?)/ && defined $parent && $parser->{config}{INNERCMDS}->{$1}) { my $command = $1; # if the parent is already a command node, we have to make a new # nested command node # if ($parent->{type} eq 'COMMAND') { # make a new command node my $newnode = LaTeX::TOM::Node->_new( {type => 'COMMAND', command => $command, start => $parent->{start}, end => $parent->{end}, position => 'inner', children => $parent->{children} }); # point parent to it $parent->{children} = LaTeX::TOM::Tree->_new([$newnode]); # start over at this level (get additional inner commands) $parent = $newnode; $i = -1; $parser->{USED_COMMANDS}->{$newnode->{command}} = 1; } # parent is a naked group, we can make it into a command node # elsif ($parent->{type} eq 'GROUP') { $parent->{type} = 'COMMAND'; $parent->{command} = $command; $parent->{position} = 'inner'; # start over at this level $i = -1; $parser->{USED_COMMANDS}->{$parent->{command}} = 1; } $node->{content} =~ s/^\s*\\(?:\w+\*?)//o; } # outer command (such as \command{parameters}). our regexp checks to # see if this text chunk ends in \command, since that would be the case # due to the previous parsing stages. # if ($text =~ /(?:^|[^\\])(\\\w+\*?(\s*\[.*?\])?)\s*$/os && defined $tree->{nodes}[$i+1] && $tree->{nodes}[$i+1]->{type} eq 'GROUP') { my $tag = $1; _debug("found text node [$text] with command tag [$tag]", undef); # remove the text $node->{content} =~ s/\\\w+\*?\s*(?:\[.*?\])?\s*$//os; # parse it for command and ops $tag =~ /^\\(\w+\*?)\s*(?:\[(.*?)\])?$/os; my $command = $1; my $opts = $2; # make the next node a command node with the above data my $next = $tree->{nodes}[$i+1]; $next->{type} = 'COMMAND'; $next->{command} = $command; $next->{opts} = $opts; $next->{position} = 'outer'; $parser->{USED_COMMANDS}->{$next->{command}} = 1; } # recognize braceless commands # if ($text =~ /(\\(\w+\*?)[ \t]+(\S+))/gso || $text =~ /(\\(\w+)(\d+))/gso) { my $all = $1; my $command = $2; my $param = $3; if ($parser->{config}{BRACELESS}->{$command}) { # warn "found braceless command $command with param $param"; # get location to split from node text my $a = index $node->{content}, $all, 0; my $b = $a + length($all) - 1; # make all the new nodes # new left and right text nodes my ($leftnode, $rightnode) = $node->split($a, $b); # param contents node my $pstart = index $node->{content}, $param, $a; my $newchild = LaTeX::TOM::Node->_new( {type => 'TEXT', start => $node->{start} + $pstart, end => $node->{start} + $pstart + length($param) - 1, content => $param }); # new command node my $commandnode = LaTeX::TOM::Node->_new( {type => 'COMMAND', braces => 0, command => $command, start => $node->{start} + $a, end => $node->{start} + $b, children => LaTeX::TOM::Tree->_new([$newchild]), }); $parser->{USED_COMMANDS}->{$commandnode->{command}} = 1; # splice these all into the original array splice @{$tree->{nodes}}, $i, 1, $leftnode, $commandnode, $rightnode; # make the rightnode the node we're currently analyzing $node = $rightnode; # make sure outer loop will continue parsing *after* rightnode $i += 2; } } } # recur if ($node->{type} eq 'GROUP' || $node->{type} eq 'COMMAND') { $parser->_stage3($node->{children}, $node); } } } # this stage finds \begin{x} \end{x} environments and shoves their contents # down into a new child node, with a parent node of ENVIRONMENT type. # # this has the effect of making the tree deeper, since much of the structure # is in environment tags and will now be picked up. # # for ENVIRONMENTs, "start" means the ending } on the \begin tag, # "end" means the starting \ on the \end tag, # "ostart" is the starting \ on the "begin" tag, # "oend" is the ending } on the "end" tag, and # and "class" is the "x" from above. # sub _stage4 { my $parser = shift; my $tree = shift; my $bcount = 0; # \begin "stack count" my $class = ""; # environment class my $bidx = 0; # \begin array index. for (my $i = 0; $i < @{$tree->{nodes}}; $i++) { my $node = $tree->{nodes}->[$i]; # see if this is a "\begin" command node if ($node->{type} eq 'COMMAND' && $node->{command} eq 'begin') { _debug("parseStage4: found a begin COMMAND node, $node->{children}->{nodes}[0]->{content} @ $node->{start}", undef); # start a new "stack" if ($bcount == 0) { $bidx = $i; $bcount++; $class = $node->{children}->{nodes}->[0]->{content}; _debug("parseStage4: opening environment tag found, class = $class", undef); } # add to the "stack" elsif ($node->{children}->{nodes}->[0]->{content} eq $class) { $bcount++; _debug("parseStage4: incrementing tag count for $class", undef); } } # handle "\end" command nodes elsif ($node->{type} eq 'COMMAND' && $node->{command} eq 'end' && $node->{children}->{nodes}->[0]->{content} eq $class) { $bcount--; _debug("parseStage4: decrementing tag count for $class", undef); # we found our closing "\end" tag. replace everything with the proper # ENVIRONMENT tag and subtree. # if ($bcount == 0) { _debug("parseStage4: closing environment $class", undef); # first we must take everything between the "\begin" and "\end" # nodes and put them in a new array, removing them from the old one my @newarray = splice @{$tree->{nodes}}, $bidx+1, $i - ($bidx + 1); # make the ENVIRONMENT node my $start = $tree->{nodes}[$bidx]->{end}; my $end = $node->{start}; my $envnode = LaTeX::TOM::Node->_new( {type => 'ENVIRONMENT', class => $class, start => $start, # "inner" start and end end => $end, ostart => $start - length('begin') - length($class) - 2, oend => $end + length('end') + length($class) + 2, children => LaTeX::TOM::Tree->_new([@newarray]), }); if ($parser->{config}{MATHENVS}->{$envnode->{class}}) { $envnode->{math} = 1; } # replace the \begin and \end COMMAND nodes with the single # environment node splice @{$tree->{nodes}}, $bidx, 2, $envnode; $class = ""; # reset class. # i is going to change by however many nodes we removed $i -= scalar @newarray; # recur into the children $parser->_stage4($envnode->{children}); } } # recur in general elsif ($node->{children}) { $parser->_stage4($node->{children}); } } # parse error if we're missing an "\end" tag. if ($bcount > 0) { $error_handlers{$parser->{PARSE_ERRORS_FATAL}}->( "missing \\end{$class} for \\begin{$class} at position $tree->{nodes}[$bidx]->{end}" ); } } # This is the "math" stage: here we grab simple-delimiter math modes from # the text they are embedded in, and turn those into new groupings, with the # "math" flag set. # # having this top level to go over all the bracket types prevents some pretty # bad combinatorial explosion # sub _stage5 { my $parser = shift; my $tree = shift; my $caremath = shift || 0; my $brackets = $parser->{MATHBRACKETS}; # loop through all the different math mode bracket types foreach my $left (sort {length($b) <=> length($a)} keys %$brackets) { my $right = $brackets->{$left}; $parser->_stage5_r($tree, $left, $right, $caremath); } } # recursive meat of above # sub _stage5_r { my $parser = shift; my $tree = shift; my $left = shift; my $right = shift; my $caremath = shift || 0; # do we care if we're already in math mode? # this matters for \( \), \[ \] my $leftpos = -1; # no text pos for found left brace yet. my $leftidx = -1; # no array index for found left brace yet. # loop through the nodes for (my $i = 0; $i < scalar @{$tree->{nodes}}; $i++) { my $node = $tree->{nodes}[$i]; my $pos = 0; # position placeholder within the text block my $spos = $node->{start}; # get text start position if ($node->{type} eq 'TEXT' && (!$caremath || (!$node->{math} && $caremath))) { # search for left brace if we haven't started a pair yet if ($leftidx == -1) { $leftpos = _findsymbol($node->{content}, $left, $pos); if ($leftpos != -1) { _debug("found (left) $left in [$node->{content}]", undef); $leftidx = $i; $pos = $leftpos + 1; # next pos to search from } } # search for a right brace if ($leftpos != -1) { my $rightpos = _findsymbol($node->{content}, $right, $pos); # found if ($rightpos != -1) { # we have to split the text node into 3 parts if ($leftidx == $i) { _debug("splitwithin: found (right) $right in [$node->{content}]", undef); my ($leftnode, $textnode3) = $node->split($rightpos, $rightpos + length($right) - 1); my ($textnode1, $textnode2) = $leftnode->split($leftpos, $leftpos + length($left) - 1); my $startpos = $spos; # get text start position # make the math ENVIRONMENT node my $mathnode = LaTeX::TOM::Node->_new( {type => 'ENVIRONMENT', class => $left, # use left delim as class math => 1, start => $startpos + $leftpos, ostart => $startpos + $leftpos - length($left) + 1, end => $startpos + $rightpos, oend => $startpos + $rightpos + length($right) - 1, children => LaTeX::TOM::Tree->_new([$textnode2]), }); splice @{$tree->{nodes}}, $i, 1, $textnode1, $mathnode, $textnode3; $i++; # skip ahead two nodes, so we'll be parsing textnode3 } # split across nodes else { _debug("splitacross: found (right) $right in [$node->{content}]", undef); # create new set of 4 smaller text nodes from the original two # that contain the left and right delimiters # my ($textnode1, $textnode2) = $tree->{nodes}[$leftidx]->split($leftpos, $leftpos + length($left) - 1); my ($textnode3, $textnode4) = $tree->{nodes}[$i]->split($rightpos, $rightpos + length($right) - 1); # nodes to remove "from the middle" (between the left and right # text nodes which contain the delimiters) # my @remnodes = splice @{$tree->{nodes}}, $leftidx+1, $i - $leftidx - 1; # create a math node that contains the text after the left brace, # then all the nodes up until the next text node, then the text # before the right brace. # my $mathnode = LaTeX::TOM::Node->_new( {type => 'ENVIRONMENT', class => $left, math => 1, start => $textnode2->{start} - 1, end => $textnode3->{end} + 1, ostart => $textnode2->{start} - 1 - length($left) + 1, oend => $textnode3->{end} + 1 + length($right) - 1, children => LaTeX::TOM::Tree->_new( [$textnode2, @remnodes, $textnode3]), }); # replace (TEXT_A, ... , TEXT_B) with the mathnode created above splice @{$tree->{nodes}}, $leftidx, 2, $textnode1, $mathnode, $textnode4; # do all nodes again but the very leftmost # $i = $leftidx; } $leftpos = -1; # reset left data $leftidx = -1; } # right brace } # left brace else { my $rightpos = _findsymbol($node->{content}, $right, $pos); if ($rightpos != -1) { my $startpos = $node->{start}; # get text start position $error_handlers{$parser->{PARSE_ERRORS_FATAL}}->("unmatched '$right' at " . ($startpos + $rightpos)); } } } # if TEXT # recur, but not into verbatim environments! # elsif ($node->{children} && !( ($node->{type} eq 'COMMAND' && $node->{command} =~ /^verb/) || ($node->{type} eq 'ENVIRONMENT' && $node->{class} =~ /^verbatim/))) { if ($LaTeX::TOM::DEBUG) { my $message = "Recurring into $node->{type} node "; $message .= $node->{command} if ($node->{type} eq 'COMMAND'); $message .= $node->{class} if ($node->{type} eq 'ENVIRONMENT'); _debug($message, undef); } $parser->_stage5_r($node->{children}, $left, $right, $caremath); } } # loop over text blocks if ($leftpos != -1) { my $startpos = $tree->{nodes}[$leftidx]->{start}; # get text start position $error_handlers{$parser->{PARSE_ERRORS_FATAL}}->("unmatched '$left' at " . ($startpos + $leftpos)); } } # This stage propegates the math mode flag and plaintext flags downward. # # After this is done, we can make the claim that only text nodes marked with # the plaintext flag should be printed. math nodes will have the "math" flag, # and also plantext = 0. # sub _propegateModes { my $parser = shift; my $tree = shift; my $math = shift; # most likely want to call this with 0 my $plaintext = shift; # ditto this-- default to nothing visible. foreach my $node (@{$tree->{nodes}}) { # handle text nodes on this level. set flags. # if ($node->{type} eq 'TEXT') { $node->{math} = $math; $node->{plaintext} = $plaintext; } # propegate flags downward, possibly modified # elsif (defined $node->{children}) { my $mathflag = $math; # math propegates down by default my $plaintextflag = 0; # plaintext flag does NOT propegate by default # handle math or plain text forcing envs # if ($node->{type} eq 'ENVIRONMENT' || $node->{type} eq 'COMMAND') { if (defined $node->{class} && ( $parser->{config}{MATHENVS}->{$node->{class}} || $parser->{config}{MATHENVS}->{"$node->{class}*"}) ) { $mathflag = 1; $plaintextflag = 0; } elsif (($node->{type} eq 'COMMAND' && ($parser->{config}{TEXTENVS}->{$node->{command}} || $parser->{config}{TEXTENVS}->{"$node->{command}*"})) || ($node->{type} eq 'ENVIRONMENT' && ($parser->{config}{TEXTENVS}->{$node->{class}} || $parser->{config}{TEXTENVS}{"$node->{command}*"})) ) { $mathflag = 0; $plaintextflag = 1; } } # groupings change nothing # elsif ($node->{type} eq 'GROUP') { $mathflag = $math; $plaintextflag = $plaintext; } # recur $parser->_propegateModes($node->{children}, $mathflag, $plaintextflag); } } } # apply a mapping to text nodes in a tree # # for newcommands and defs: mapping is a hash: # # {name, nparams, template, type} # # name is a string # nparams is an integer # template is a tree fragement containing text nodes with #x flags, where # parameters will be replaced. # type is "command" # # for newenvironments: # # {name, nparams, btemplate, etemplate, type} # # same as above, except type is "environment" and there are two templates, # btemplate and etemplate. # sub _applyMapping { my $parser = shift; my $tree = shift; my $mapping = shift; my $i = shift || 0; # index to start with, in tree. my $applications = 0; # keep track of # of applications for (; $i < @{$tree->{nodes}}; $i++) { my $node = $tree->{nodes}[$i]; # begin environment nodes # if ($node->{type} eq 'COMMAND' && $node->{command} eq 'begin' && $node->{children}->{nodes}[0]->{content} eq $mapping->{name} ) { # grab the nparams next group nodes as parameters # my @params = (); my $remain = $mapping->{nparams}; my $j = 1; while ($remain > 0 && ($i + $j) < scalar @{$tree->{nodes}}) { my $node = $tree->{nodes}[$i + $j]; # grab group node if ($node->{type} eq 'GROUP') { push @params, $node->{children}; $remain--; } $j++; } # if we didn't get enough group nodes, bomb out next if $remain; # otherwise make new subtree my $applied = _applyParamsToTemplate($mapping->{btemplate}, @params); # splice in the result splice @{$tree->{nodes}}, $i, $j, @{$applied->{nodes}}; # skip past all the new stuff $i += scalar @{$applied->{nodes}} - 1; } # end environment nodes # elsif ($node->{type} eq 'COMMAND' && $node->{command} eq 'end' && $node->{children}->{nodes}[0]->{content} eq $mapping->{name} ) { # make new subtree (no params) my $applied = $mapping->{etemplate}->copy(); # splice in the result splice @{$tree->{nodes}}, $i, 1, @{$applied->{nodes}}; # skip past all the new stuff $i += scalar @{$applied->{nodes}} - 1; $applications++; # only count end environment nodes } # newcommand nodes # elsif ($node->{type} eq 'COMMAND' && $node->{command} eq $mapping->{name} && $mapping->{nparams} ) { my @params = (); # children of COMMAND node will be first parameter push @params, $node->{children}; # find next nparams GROUP nodes and push their children onto @params my $remain = $mapping->{nparams} - 1; my $j = 1; while ($remain > 0 && ($i + $j) < scalar @{$tree->{nodes}}) { my $node = $tree->{nodes}[$i + $j]; # grab group node if ($node->{type} eq 'GROUP') { push @params, $node->{children}; $remain--; } $j++; } # if we didn't get enough group nodes, bomb out next if ($remain > 0); # apply the params to the template my $applied = _applyParamsToTemplate($mapping->{template}, @params); # splice in the result splice @{$tree->{nodes}}, $i, $j, @{$applied->{nodes}}; # skip past all the new stuff $i += scalar @{$applied->{nodes}} - 1; $applications++; } # find 0-param mappings elsif ($node->{type} eq 'TEXT' && !$mapping->{nparams}) { my $text = $node->{content}; my $command = $mapping->{name}; # find occurrences of the mapping command # my $wordend = ($command =~ /\w$/ ? 1 : 0); while (($wordend && $text =~ /\\\Q$command\E(\W|$)/g) || (!$wordend && $text =~ /\\\Q$command\E/g)) { _debug("found occurrence of mapping $command", undef); my $idx = index $node->{content}, '\\' . $command, 0; # split the text node at that command my ($leftnode, $rightnode) = $node->split($idx, $idx + length($command)); # copy the mapping template my $applied = $mapping->{template}->copy(); # splice the new nodes in splice @{$tree->{nodes}}, $i, 1, $leftnode, @{$applied->{nodes}}, $rightnode; # adjust i so we end up on rightnode when we're done $i += scalar @{$applied->{nodes}} + 1; # get the next node $node = $tree->{nodes}[$i]; # count application $applications++; } } # recur elsif ($node->{children}) { $applications += $parser->_applyMapping($node->{children}, $mapping); } } return $applications; } # find and apply all mappings in the tree, progressively and recursively. # a mapping applies to the entire tree and subtree consisting of nodes AFTER # itself in the level array. # sub _applyMappings { my $parser = shift; my $tree = shift; for (my $i = 0; $i < @{$tree->{nodes}}; $i++) { my $prev = $tree->{nodes}[$i-1]; my $node = $tree->{nodes}[$i]; # find newcommands if ($node->{type} eq 'COMMAND' && $node->{command} =~ /^(re)?newcommand$/) { my $mapping = _makeMapping($tree, $i); next if (!$mapping->{name}); # skip fragged commands if ($parser->{USED_COMMANDS}->{$mapping->{name}}) { _debug("applying (nc) mapping $mapping->{name}", undef); } else { _debug("NOT applying (nc) mapping $mapping->{name}", undef); next; } # add to mappings list # $parser->{MAPPEDCMDS}->{"\\$mapping->{name}"} = 1; _debug("found a mapping with name $mapping->{name}, $mapping->{nparams} params", undef); # remove the mapping declaration # splice @{$tree->{nodes}}, $i, $mapping->{skip} + 1; # apply the mapping my $count = $parser->_applyMapping($tree, $mapping, $i); if ($count > 0) { _debug("printing altered subtree", sub { $tree->_warn() }); } $i--; # since we removed the cmd node, check this index again } # handle "\newenvironment" mappings elsif ($node->{type} eq 'COMMAND' && $node->{command} =~ /^(re)?newenvironment$/) { # make a mapping hash # my $mapping = $parser->_makeEnvMapping($tree, $i); next if (!$mapping->{name}); # skip fragged commands. _debug("applying (ne) mapping $mapping->{name}", undef); # remove the mapping declaration # splice @{$tree->{nodes}}, $i, $mapping->{skip} + 1; # apply the mapping # my $count = $parser->_applyMapping($tree, $mapping, $i); } # handle "\def" stype commands. elsif ($node->{type} eq 'COMMAND' && defined $prev && $prev->{type} eq 'TEXT' && $prev->{content} =~ /\\def\s*$/o) { _debug("found def style mapping $node->{command}", undef); # remove the \def $prev->{content} =~ s/\\def\s*$//o; # make the mapping my $mapping = {name => $node->{command}, nparams => 0, template => $node->{children}->copy(), type => 'command'}; next if (!$mapping->{name}); # skip fragged commands if ($parser->{USED_COMMANDS}->{$mapping->{name}}) { _debug("applying (def) mapping $mapping->{name}", undef); } else { _debug("NOT applying (def) mapping $mapping->{name}", undef); next; } # add to mappings list # $parser->{MAPPEDCMDS}->{"\\$mapping->{name}"} = 1; _debug("template is", sub { $mapping->{template}->_warn() }); # remove the command node splice @{$tree->{nodes}}, $i, 1; # apply the mapping my $count = $parser->_applyMapping($tree, $mapping, $i); $i--; # check this index again } # recur elsif ($node->{children}) { $parser->_applyMappings($node->{children}); } } } # read files from \input commands and place into the tree, parsed # # also include bibliographies # sub _addInputs { my $parser = shift; my $tree = shift; for (my $i = 0; $i < @{$tree->{nodes}}; $i++) { my $node = $tree->{nodes}[$i]; if ($node->{type} eq 'COMMAND' && $node->{command} eq 'input' ) { my $file = $node->{children}->{nodes}[0]->{content}; next if $file =~ /pstex/; # ignore pstex images _debug("reading input file $file", undef); my $contents; my $filename = fileparse($file); my $has_extension = qr/\.\S+$/; # read in contents of file if (-e $file && $filename =~ $has_extension) { $contents = _readFile($file); } elsif ($filename !~ $has_extension) { $file = "$file.tex"; $contents = _readFile($file) if -e $file; } # dump Psfig/TeX files, they aren't useful to us and have # nonconforming syntax. Use declaration line as our heuristic. # if (defined $contents && $contents =~ m!^ \% \s*? Psfig/TeX \s* $!mx ) { undef $contents; carp "ignoring Psfig input `$file'"; } # actually do the parse of the sub-content # if (defined $contents) { # parse into a tree my ($subtree,) = $parser->_basicparse($contents, $parser->{PARSE_ERRORS_FATAL}); # replace \input command node with subtree splice @{$tree->{nodes}}, $i, 1, @{$subtree->{nodes}}; # step back $i--; } } elsif ($node->{type} eq 'COMMAND' && $node->{command} eq 'bibliography' ) { # try to find a .bbl file # foreach my $file (<*.bbl>) { my $contents = _readFile($file); if (defined $contents) { my ($subtree,) = $parser->_basicparse($contents, $parser->{PARSE_ERRORS_FATAL}); splice @{$tree->{nodes}}, $i, 1, @{$subtree->{nodes}}; $i--; } } } # recur if ($node->{children}) { $parser->_addInputs($node->{children}); } } } # do pre-mapping parsing # sub _parseA { my $parser = shift; my $text = shift; my $tree = $parser->_stage1($text); my $bracehash = $parser->_stage2($tree); $parser->_stage3($tree); $parser->_addInputs($tree) if $parser->{readinputs}; return ($tree, $bracehash); } # do post-mapping parsing (make environments) # sub _parseB { my $parser = shift; my $tree = shift; $parser->_stage4($tree); _debug("done with parseStage4", undef); $parser->_stage5($tree, 0); _debug("done with parseStage5", undef); } ############################################################################### # # Parser "Static" Subroutines # ############################################################################### # find next unescaped char in some text # sub _uindex { my $text = shift; my $char = shift; my $pos = shift; my $realbrace = 0; my $idx = -1; # get next opening brace do { $realbrace = 1; $idx = index $text, $char, $pos; if ($idx != -1) { $pos = $idx + 1; my $prevchar = substr $text, $idx - 1, 1; if ($prevchar eq '\\') { $realbrace = 0; $idx = -1; } } } while (!$realbrace); return $idx; } sub _find { my ($text, $symbol, $pos) = @_; my ($found, $index); # get next occurrence of the symbol do { $found = true; $index = index $text, $symbol, $pos; if ($symbol eq '}' && $index - 1 >= 0 && substr($text, $index - 1, 1) eq ' ') { #$pos = $index + 1; $index = -1; } if ($index != -1) { $pos = $index + 1; # make sure this occurrence isn't escaped. this is imperfect. my $prev_char = ($index - 1 >= 0) ? (substr $text, $index - 1, 1) : ''; my $pprev_char = ($index - 2 >= 0) ? (substr $text, $index - 2, 1) : ''; if ($prev_char eq '\\' && $pprev_char ne '\\') { $found = false; } } } until ($found); return $index; } # support function: find the next occurrence of some symbol which is # not escaped. # sub _findsymbol { return _find(@_); } # support function: find the earliest next brace in some (flat) text # sub _findbrace { my ($text, $pos) = @_; my $index_o = _find($text, '{', $pos); my $index_c = _find($text, '}', $pos); # handle all find cases if ($index_o == -1 && $index_c == -1) { return (-1, ''); } elsif ($index_c == -1 || ($index_o != -1 && $index_o < $index_c)) { return ($index_o, '{'); } elsif ($index_o == -1 || $index_c < $index_o) { return ($index_c, '}'); } } # skip "blank nodes" in a tree, starting at some position. will finish # at the first non-blank node. (ie, not a comment or whitespace TEXT node. # sub _skipBlankNodes { my ($tree, $i) = @_; my $node = $tree->{nodes}[$$i]; while ($node->{type} eq 'COMMENT' || ($node->{type} eq 'TEXT' && $node->{content} =~ /^\s*$/s) ) { $node = $tree->{nodes}[++$$i]; } } # is the passed-in node a valid parameter node? for this to be true, it must # either be a GROUP or a position = inner command. # sub _validParamNode { my ($node) = @_; if ($node->{type} eq 'GROUP' || ($node->{type} eq 'COMMAND' && $node->{position} eq 'inner')) { return true; } return false; } # duplicate a valid param node. This means for a group, copy the child tree. # for a command, make a new tree with just the command node and its child tree. # sub _duplicateParam { my $parser = shift; my $node = shift; if ($node->{type} eq 'GROUP') { return $node->{children}->copy(); } elsif ($node->{type} eq 'COMMAND') { my $subtree = $node->{children}->copy(); # copy child subtree my $nodecopy = $node->copy(); # make a new node with old data $nodecopy->{children} = $subtree; # set the child pointer to new subtree # return a new tree with the new node (subtree) as its only element return LaTeX::TOM::Tree->_new([$nodecopy]); } return undef; } sub _getMapping { my ($type, $tree, $i) = @_; my $node = $tree->{nodes}[$$i]; if ($node->{type} ne 'COMMAND' || ($node->{command} ne "new$type" && $node->{command} ne "renew$type") ) { return (); } # figure out command (first child, text node) my $command = $node->{children}->{nodes}[0]->{content}; if ($command =~ /^\s* \\(\S+) \s*$/x) { $command = $1; } $node = $tree->{nodes}[++$$i]; # figure out number of params my $nparams = 0; if ($node->{type} eq 'TEXT') { my $text = $node->{content}; if ($text =~ /^\s* \[\s* ([0-9]+) \s*\] \s*$/x) { $nparams = $1; } $$i++; } return ($command, $nparams); } # make a mapping from a newenvironment fragment # # newenvironments have the following syntax: # # \newenvironment{name}[nparams]?{beginTeX}{endTeX} # sub _makeEnvMapping { my $parser = shift; my ($tree, $index) = @_; my $i = $index; my ($command, $nparams) = _getMapping('environment', $tree, \$i) or return undef; # default templates-- just repeat the declarations # my ($btemplate) = $parser->_basicparse("\\begin{$command}", 2, 0); my ($etemplate) = $parser->_basicparse("\\end{$command}", 2, 0); my $end_pos = $i; # get two group subtrees... one for the begin and one for the end # templates. we only ignore whitespace TEXT nodes and comments # _skipBlankNodes($tree, \$i); my $node = $tree->{nodes}[$i]; if (_validParamNode($node)) { $btemplate = $parser->_duplicateParam($node); $i++; _skipBlankNodes($tree, \$i); $node = $tree->{nodes}[$i]; if (_validParamNode($node)) { $etemplate = $parser->_duplicateParam($node); $end_pos = $i; } } # build and return the mapping hash # return { type => 'environment', name => $command, nparams => $nparams, btemplate => $btemplate, # begin template etemplate => $etemplate, # end template skip => $end_pos - $index, }; } # make a mapping from a newcommand fragment # takes tree pointer and index of command node # # newcommands have the following syntax: # # \newcommand{\name}[nparams]?{anyTeX} # sub _makeMapping { my ($tree, $index) = @_; my $i = $index; my ($command, $nparams) = _getMapping('command', $tree, \$i) or return undef; # grab subtree template (array ref) # my $node = $tree->{nodes}[$i]; my $template; if ($node->{type} eq 'GROUP') { $template = $node->{children}->copy(); } else { return undef; } # build and return the mapping hash # return { type => 'command', name => $command, nparams => $nparams, template => $template, skip => $i - $index, }; } # this sub is the main entry point for the sub that actually takes a set of # parameter trees and inserts them into a template tree. the return result, # newly allocated, should be plopped back into the original tree where the # parameters (along with the initial command invocation) # sub _applyParamsToTemplate { my $template = shift; my @params = @_; # have to copy the template to a freshly allocated tree # my $applied = $template->copy(); # now recursively apply the params. # _applyParamsToTemplate_r($applied, @params); return $applied; } # recursive helper for above # sub _applyParamsToTemplate_r { my $template = shift; my @params = @_; for (my $i = 0; $i < @{$template->{nodes}}; $i++) { my $node = $template->{nodes}[$i]; if ($node->{type} eq 'TEXT') { my $text = $node->{content}; # find occurrences of the parameter flags # if ($text =~ /(#([0-9]+))/) { my $all = $1; my $num = $2; # get the index of the flag we just found # my $idx = index $text, $all, 0; # split the node on the location of the flag # my ($leftnode, $rightnode) = $node->split($idx, $idx + length($all) - 1); # make a copy of the param we want # my $param = $params[$num - 1]->copy(); # splice the new text nodes, along with the parameter subtree, into # the old location # splice @{$template->{nodes}}, $i, 1, $leftnode, @{$param->{nodes}}, $rightnode; # skip forward to where $rightnode is in $template on next iteration # $i += scalar @{$param->{nodes}}; } } # recur elsif (defined $node->{children}) { _applyParamsToTemplate_r($node->{children}, @params); } } } # This sub takes a chunk of the document text between two points and makes # it into a list of TEXT nodes and COMMENT nodes, as we would expect from # '%' prefixed LaTeX comment lines # sub _getTextAndCommentNodes { my ($text, $begins, $ends) = @_; my $node_text = substr $text, $begins, $ends - $begins; _debug("getTextAndCommentNodes: looking at [$node_text]", undef); my $make_node = sub { my ($mode_type, $begins, $start_pos, $output) = @_; return LaTeX::TOM::Node->_new({ type => uc $mode_type, start => $begins + $start_pos, end => $begins + $start_pos + length($output) - 1, content => $output, }); }; my @lines = split (/( (?:\s* # whitespace (?($mode_type, $begins, $start_pos, $output); $start_pos += length($output); # update start position $output = $line; $mode_type = $line_type; } } push @nodes, $make_node->($mode_type, $begins, $start_pos, $output) if defined $output; return @nodes; } # Read in the contents of a text file on disk. Return in string scalar. # sub _readFile { my ($file, $raise_error) = @_; $raise_error ||= false; my $opened = open(my $fh, '<', $file); unless ($opened) { croak "Cannot open `$file': $!" if $raise_error; return undef; } my $contents = do { local $/; <$fh> }; close($fh); return $contents; } sub _debug { my ($message, $code) = @_; my $DEBUG = $LaTeX::TOM::DEBUG; return unless $DEBUG >= 1 && $DEBUG <= 2; my ($filename, $line) = (caller)[1,2]; my $caller = join ':', (fileparse($filename))[0], $line; warn "$caller: $message\n" if $DEBUG >= 1 && defined $message; $code->() if $DEBUG == 2 && defined $code; } 1;