/*
regex - Regular expression pattern matching and replacement
By: Ozan S. Yigit (oz)
Dept. of Computer Science
York University
These routines are the PUBLIC DOMAIN equivalents of regex
routines as found in 4.nBSD UN*X, with minor extensions.
These routines are derived from various implementations found
in software tools books, and Conroy's grep. They are NOT derived
from licensed/restricted software.
For more interesting/academic/complicated implementations,
see Henry Spencer's regexp routines, or GNU Emacs pattern
matching module.
Modification history:
$Log: regex.c,v $
Revision 1.4 1991/10/17 03:56:42 oz
miscellaneous changes, small cleanups etc.
Revision 1.3 1989/04/01 14:18:09 oz
Change all references to a dfa: this is actually an nfa.
Revision 1.2 88/08/28 15:36:04 oz
Use a complement bitmap to represent NCL.
This removes the need to have seperate
code in the pmatch case block - it is
just CCL code now.
Use the actual CCL code in the CLO
section of ematch. No need for a recursive
pmatch call.
Use a bitmap table to set char bits in an
8-bit chunk.
Interfaces:
re_comp: compile a regular expression into a NFA.
char *re_comp(s)
char *s;
re_exec: execute the NFA to match a pattern.
int re_exec(s)
char *s;
Regular Expressions:
[1] char matches itself, unless it is a special
character (metachar): . \ [ ] * + ^ $
[2] . matches any character.
[3] \ matches the character following it.
It is used as an escape character for all
other meta-characters, and itself. When used
in a set ([4]), it is treated as an ordinary
character.
[4] [set] matches one of the characters in the set.
If the first character in the set is "^",
it matches a character NOT in the set, i.e.
complements the set. A shorthand S-E is
used to specify a set of characters S upto
E, inclusive. The special characters "]" and
"-" have no special meaning if they appear
as the first chars in the set.
examples: match:
[a-z] any lowercase alpha
[^]-] any char except ] and -
[^A-Z] any char except uppercase alpha
[a-zA-Z] any alpha
[5] * any regular expression form [1] to [4], followed by
closure char (*) matches zero or more matches of
that form.
[6] + same as [5], except it matches one or more.
[10] a composite regular expression xy where x and y
are in the form [1] to [10] matches the longest
match of x followed by a match for y.
[11] ^ a regular expression starting with a ^ character
$ and/or ending with a $ character, restricts the
pattern matching to the beginning of the line,
or the end of line. [anchors] Elsewhere in the
pattern, ^ and $ are treated as ordinary characters.
Acknowledgements:
HCR's Hugh Redelmeier has been most helpful in various
stages of development. He convinced me to include BOW
and EOW constructs, originally invented by Rob Pike at
the University of Toronto.
References:
Software tools Kernighan & Plauger
Software tools in Pascal Kernighan & Plauger
Grep [rsx-11 C dist] David Conroy
ed - text editor Un*x Programmer's Manual
Advanced editing on Un*x B. W. Kernighan
RegExp routines Henry Spencer
Notes:
This implementation uses a bit-set representation for character
classes for speed and compactness. Each character is represented
by one bit in a 128-bit block. Thus, CCL always takes a
constant 16 bytes in the internal nfa, and re_exec does a single
bit comparison to locate the character in the set.
Examples:
pattern: foo*.*
compile: CHR f CHR o CLO CHR o END CLO ANY END END
matches: fo foo fooo foobar fobar foxx ...
pattern: fo[ob]a[rz]
compile: CHR f CHR o CCL bitset CHR a CCL bitset END
matches: fobar fooar fobaz fooaz
pattern: foo\\+
compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
matches: foo\ foo\\ foo\\\ ...
pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
matches: foo1foo foo2foo foo3foo
pattern: \(fo.*\)-\1
compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
matches: foo-foo fo-fo fob-fob foobar-foobar ...
*/
#include <config.h>
#include <stdio.h> /* printf() */
#include "dwarf.h"
#include "libdwarf.h"
#include "dd_regex.h"
#include "dd_safe_strcpy.h"
#include "dd_checkutil.h"
#include "dd_glflags.h"
/* This version corrects two bugs (see 'code by davea'
below).
Compared to the original source:
Regular expression grouping with ()
is ignored and word checking is ignored.
Except when using ^ or $ in the regex
we find any matching substring.
David Anderson. 2 September 2021.
*/
#ifndef DW_DLV_OK
/* Makes testing easier, these
must match libdwarf.h definitions */
#define DW_DLV_NO_ENTRY -1
#define DW_DLV_OK 0
#define DW_DLV_ERROR 1
#endif /* DW_DLV_OK */
#define inascii(x) (0177&(x))
#define MAXNFA 2048
#define OKP 1
#define NOP 0
#define CHR 1
#define ANY 2
#define CCL 3 /* character class [ */
#define BOL 4 /* beginning of line, ^ */
#define EOL 5 /* end of line, 5 */
#define REF 10
#define CLO 11
#define END 0
#define MAXPREDEF 11
/*
The following defines are not meant to be changeable.
They are for readability only.
*/
#define MAXCHR 128
#define CHRBIT 8
#define BITBLK MAXCHR/CHRBIT
#define BLKIND 0170
#define BITIND 07
#define ASCIIB 0177
#ifdef NO_UCHAR
typedef
char
CHAR
;
#else
typedef
unsigned
char
CHAR
;
#endif
static
CHAR
nfa[MAXNFA];
/* automaton.. */
static
int
sta = NOP;
/* status of lastpat */
static
CHAR
bittab[BITBLK];
/* bit table for CCL */
/* pre-set bits... */
static
CHAR
bitarr[] = {1,2,4,8,16,32,64,128};
#ifdef DEBUG
static
void
nfadump(
CHAR
*);
static
void
symbolic(
char
*);
#endif
static
void
chset(
CHAR
c)
{
bittab[(
CHAR
) ((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
}
#define badpat (nfa[0] = END)
#define store(x) do { \
if
( mp > endpoint){ \
badpat; \
printf
(
"ERROR: Pattern too large to store "
\
"in regex\n"
); \
glflags.gf_count_major_errors++; \
return
DW_DLV_ERROR; \
} \
*mp++ = (x); \
}
while
(0)
static
void
resetbittab(
void
)
{
int
j = 0;
for
(j = 0; j < BITBLK; ++j) {
bittab[j] = 0;
}
for
(j = 0; j < MAXNFA; ++j) {
nfa[j] = 0;
}
sta = NOP;
}
#if 0
/* Some may not be alone,
. \ [ ] * + ^ $
but some can be.
*/
static
int
dangermetachar(
CHAR
c)
{
switch
(c) {
case
'?'
:
case
'\\'
:
case
'['
:
case
'+'
:
case
'^'
:
case
'$'
:
return
1;
default
:
return
0;
}
}
#endif
int
dd_re_comp(
const
char
*pat)
{
const
char
*p = 0;
/* pattern pointer */
CHAR
*mp = nfa;
CHAR
*lp = 0;
/* saved pointer.. */
CHAR
*sp = nfa;
/* another one.. */
CHAR
*endpoint = nfa+MAXNFA - 2;
int
n = 0;
CHAR
mask = 0;
/* xor mask -CCL/NCL */
int
c1 = 0;
int
c2 = 0;
resetbittab();
if
(!pat || !*pat) {
badpat;
printf
(
"ERROR: Empty or NULL not a valid "
"regular expression\n"
);
glflags.gf_count_major_errors++;
return
DW_DLV_NO_ENTRY;
}
for
(p = pat; *p; p++) {
int
a = (unsigned
char
)*p;
int
b = inascii(*p);
if
(b != a) {
badpat;
printf
(
"ERROR: Only the ASCII subset of "
" characters are supported in dwarfdump "
"regular expressions\n"
);
glflags.gf_count_major_errors++;
return
DW_DLV_NO_ENTRY;
}
}
for
(p = pat; *p; p++) {
lp = mp;
switch
(*p) {
case
'.'
:
/* match any char.. */
store(ANY);
break
;
case
'^'
:
/* match beginning.. */
if
(p == pat)
store(BOL);
else
{
store(CHR);
store(*p);
}
break
;
case
'$'
:
/* match endofline.. */
if
(!*(p+1)) {
store(EOL);
}
else
{
store(CHR);
store(*p);
}
break
;
case
'['
:
/* match char class..*/
store(CCL);
if
(*++p ==
'^'
) {
mask = 0377;
p++;
}
else
{
mask = 0;
}
if
(*p ==
'-'
) {
/* real dash */
chset(*p++);
}
if
(*p ==
']'
) {
/* real brac */
chset(*p++);
}
while
(*p && *p !=
']'
) {
if
(*p ==
'-'
&& *(p+1) && *(p+1) !=
']'
) {
p++;
c1 = *(p-2) + 1;
c2 = *p++;
while
(c1 <= c2)
chset((
CHAR
)c1++);
}
#ifdef EXTEND
else
if
(*p ==
'\\'
&& *(p+1)) {
p++;
chset(*p++);
}
#endif /* EXTEND */
else
{
chset(*p++);
}
}
if
(!*p) {
badpat;
printf
(
"ERROR: Regular expression %s missing ]\n"
,
pat) ;
glflags.gf_count_major_errors++;
return
DW_DLV_ERROR;
}
/* Storing the bitmask into nfa */
for
(n = 0; n < BITBLK; bittab[n++] =
(
char
) 0) {
store(mask ^ bittab[n]);
}
break
;
case
'*'
:
/* match 0 or more.. */
case
'+'
:
/* match 1 or more.. */
if
(p == pat) {
badpat;
printf
(
"ERROR: Regular expression %s has empty * + "
"Closure\n"
,pat);
glflags.gf_count_major_errors++;
return
DW_DLV_ERROR;
}
lp = sp;
/* previous opcode */
if
(*lp == CLO) {
/* equivalence.. */
break
;
}
switch
(*lp) {
case
BOL:
case
REF:
badpat;
printf
(
"ERROR Regular expression %s has illegal "
"* + closure\n"
,pat);
glflags.gf_count_major_errors++;
return
DW_DLV_ERROR;
default
:
break
;
}
if
(*p ==
'+'
) {
for
(sp = mp; lp < sp; lp++) {
store(*lp);
}
}
store(END);
store(END);
sp = mp;
while
(--mp > lp) {
*mp = mp[-1];
}
store(CLO);
mp = sp;
break
;
case
'\\'
:
/* tags, backrefs .. */
++p;
if
(!*p) {
badpat;
printf
(
"ERROR Regular expression backslash"
" missing its operand. Erroneous regex\n"
);
glflags.gf_count_major_errors++;
return
DW_DLV_ERROR;
}
store(CHR);
store(*p);
break
;
default
:
/* an ordinary char */
store(CHR);
store(*p);
break
;
}
sp = lp;
}
store(END);
sta = OKP;
#ifdef DEBUG
symbolic(
"Final nfa"
);
#endif /* DEBUG */
return
DW_DLV_OK;
}
static
char
*bol;
static
int
dd_pmatch(
const
char
*,
CHAR
*,
char
**str_out,
int
level);
/*
re_exec:
execute nfa to find a match.
special cases: (nfa[0])
BOL Match only once, starting from the
beginning.
CHR First locate the character without
calling pmatch, and if found, call
pmatch for the remaining string.
END re_comp failed, poor luser did not
check for it. Fail fast.
If a match is found, bopat[0] and eopat[0] are set
to the beginning and the end of the matched fragment,
respectively.
return DW_DLV_OK, DW_DLV_NO_ENTRY or DW_DLV_ERROR
*/
int
dd_re_exec(
char
*lp)
{
CHAR
c = 0;
char
*ep = 0;
CHAR
*ap = nfa;
int
res = 0;
int
level = 0;
bol = lp;
switch
(*ap) {
case
BOL:
/* anchored: match from BOL only */
res = dd_pmatch(lp,ap,&ep,level);
if
(res != DW_DLV_OK) {
return
res;
}
break
;
case
CHR:
/* ordinary char: locate it fast */
c = *(ap+1);
while
(*lp && *lp != c) {
lp++;
}
if
(!*lp) {
/* if EOS, fail, else fall thru. */
return
DW_DLV_NO_ENTRY;
}
goto
regmatch;
/* GO TO avoids a fall-through
warning from gcc */
default
: {
/* regular matching all the way. */
regmatch:
do
{
res = dd_pmatch(lp,ap,&ep,level);
if
(res == DW_DLV_ERROR) {
return
res;
}
if
(res == DW_DLV_OK) {
break
;
}
lp++;
}
while
(*lp);
break
;
}
case
END:
/* munged automaton. fail always */
badpat;
printf
(
"ERROR in in regex automaton. "
"END out of place\n"
);
glflags.gf_count_major_errors++;
return
DW_DLV_ERROR;
}
if
(res != DW_DLV_OK) {
return
res;
}
return
DW_DLV_OK;
}
/*
dd_pmatch: internal routine for the hard part
This code is partly snarfed from an early grep written by
David Conroy. The backref and tag stuff, and various other
innovations are by oz.
special case optimizations: (nfa[n], nfa[n+1])
CLO ANY
We KNOW .* will match everything upto the
end of line. Thus, directly go to the end of
line, without recursive pmatch calls. As in
the other closure cases, the remaining pattern
must be matched by moving backwards on the
string recursively, to find a match for xy
(x is ".*" and y is the remaining pattern)
where the match satisfies the LONGEST match for
x followed by a match for y.
CLO CHR
We can again scan the string forward for the
single char and at the point of failure, we
execute the remaining nfa recursively, same as
above.
At the end of a successful match, bopat[n] and eopat[n]
are set to the beginning and end of subpatterns matched
by tagged expressions (n = 1 to 9).
returns DW_DLV_OK for a match, and sets the ep pointer
to point into the input at the next char to check
DW_DLV_NO_ENTRY for a non-match
DW_DLV_NO_ERROR for an internal error
*/
/*
character classification table for word boundary operators BOW
and EOW. the reason for not using ctype macros is that we can
let the user add into our own table. see re_modw. This table
is not in the bitset form, since we may wish to extend it in the
future for other character classifications.
TRUE for 0-9 A-Z a-z _
*/
#if 0 /* For debugging only */
static
char
*
naming(
CHAR
c)
{
#define CHR 1
#define ANY 2
#define CCL 3 /* character class [ */
#define BOL 4 /* beginning of line, ^ */
#define EOL 5 /* end of line, 5 */
#define BOT 6 /* beginning of tag, ( */
#define EOT 7 /* end of tag, ) */
#define BOW 8 /* nonstandard beginning of ? < */
#define EOW 9 /* nonstandard end of ? > */
static
int
use1;
/* Two result arrays so we can allow naming called twice
in one printf. Turns any UCHAR into a useful string.
Array index must match the #define value for each name.*/
static
char
nam1[10];
static
char
nam2[10];
static
char
* n[] = {
"END(0)"
,
"CHR"
,
"ANY"
,
"CCL"
,
"BOL"
,
"EOL"
,
"BOT"
,
"EOT"
,
"BOW"
,
"EOW"
,
"REF"
,
"CLO"
,
"bogus"
,
};
char
*outtab= 0;
if
(use1) {
outtab = nam1;
}
else
{
outtab = nam2;
}
outtab[0] = 0;
if
(c && c <= MAXPREDEF) {
dd_safe_strcpy(outtab,
sizeof
(nam1),n[c],
strlen
(n[c]));
}
else
{
sprintf
(outtab,
"0x%x"
,c);
/* debugging only */
}
use1 = !use1;
return
outtab;
}
#define iswordc(x) chrtyp[inascii(x)]
#endif /* 0 */
#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
/* skip values for CLO XXX to skip past the closure */
#define ANYSKIP 2 /* [CLO] ANY END ... */
#define CHRSKIP 3 /* [CLO] CHR chr END ... */
#define CCLSKIP 18 /* [CLO] CCL 16bytes END ... */
static
int
dd_pmatch(
const
char
*lp,
CHAR
*ap,
char
**end_ptr,
int
level)
{
int
op = 0;
int
c = 0;
int
n = 0;
char
*e = 0;
/* extra pointer for CLO */
int
res = 0;
while
((op = *ap) != END) {
ap++;
switch
(op) {
case
CHR:
if
(*lp++ != *ap++) {
return
DW_DLV_NO_ENTRY;
}
break
;
case
ANY:
if
(!*lp++) {
return
DW_DLV_NO_ENTRY;
}
break
;
case
CCL:
c = *lp++;
if
(!isinset(ap,c)) {
return
DW_DLV_NO_ENTRY;
}
ap += BITBLK;
break
;
case
BOL:
if
(lp != bol) {
return
DW_DLV_NO_ENTRY;
}
break
;
case
EOL:
if
(*lp) {
return
DW_DLV_NO_ENTRY;
}
break
;
case
CLO: {
const
char
*are = lp;
switch
(*ap) {
case
ANY:
while
(*lp) {
lp++;
}
n = ANYSKIP;
break
;
case
CHR:
c = *(ap+1);
while
(*lp && c == *lp) {
lp++;
}
n = CHRSKIP;
break
;
case
CCL:
for
(c = *lp ; c && isinset(ap+1,c); c = *lp) {
lp++;
}
n = CCLSKIP;
break
;
default
:
badpat;
printf
(
"ERROR Regular expression has illegal "
"closure: bad nfa\n"
);
glflags.gf_count_major_errors++;
return
DW_DLV_ERROR;
}
ap += n;
if
(lp == are) {
/* [code by davea]
Empty closure is fine for early accept iff
it is the end of the regex too. Else
this is a NO_ENTRY */
if
(!*lp && !*ap) {
/* early accept. Nothing left to check. */
return
DW_DLV_OK;
}
return
DW_DLV_NO_ENTRY;
}
while
(lp >= are) {
res = dd_pmatch(lp, ap,&e,level+1);
if
(res == DW_DLV_ERROR) {
return
res;
}
if
(res == DW_DLV_OK) {
*end_ptr = e;
return
res;
}
/* NO_ENTRY so far */
--lp;
}
}
return
DW_DLV_NO_ENTRY;
default
:
badpat;
printf
(
"ERROR Regular expression has illegal "
"dd_re_exec: bad nfa.\n"
);
glflags.gf_count_major_errors++;
return
DW_DLV_ERROR;
}
}
*end_ptr = (
char
*)lp;
return
DW_DLV_OK;
}
#ifdef DEBUG
/* symbolic - produce a symbolic dump of the nfa */
static
void
symbolic(
char
*s)
{
printf
(
"pattern: %s\n"
, s);
printf
(
"nfacode:\n"
);
nfadump(nfa);
}
static
void
nfadump(
CHAR
*ap)
{
register
int
n = 0;
while
(*ap != END)
switch
(*ap++) {
case
CLO:
printf
(
"CLOSURE"
);
nfadump(ap);
switch
(*ap) {
case
CHR:
n = CHRSKIP;
break
;
case
ANY:
n = ANYSKIP;
break
;
case
CCL:
n = CCLSKIP;
break
;
default
:
break
;
}
ap += n;
break
;
case
CHR:
printf
(
"\tCHR %c\n"
,*ap++);
break
;
case
ANY:
printf
(
"\tANY .\n"
);
break
;
case
BOL:
printf
(
"\tBOL -\n"
);
break
;
case
EOL:
printf
(
"\tEOL -\n"
);
break
;
case
CCL:
printf
(
"\tCCL ["
);
for
(n = 0; n < MAXCHR; n++)
if
(isinset(ap,(
CHAR
)n)) {
if
(n <
' '
)
printf
(
"^%c"
, n ^ 0x040);
else
printf
(
"%c"
, n);
}
printf
(
"]\n"
);
ap += BITBLK;
break
;
default
:
printf
(
"bad nfa. opcode %o\n"
, ap[-1]);
exit
(EXIT_FAILURE);
break
;
}
}
#endif /* DEBUG */