#include "Regexp.h"
#include <limits.h>
#include <panda/from_chars.h>
namespace panda { namespace uri { namespace router {
%%{
machine regexp_parser;
action mark {
mark = p;
}
action expression_start {
data->re->expressions.push_back({});
data->expression = &data->re->expressions.back();
}
action expression_end {
data->expression = nullptr;
}
action element_start {
data->expression->elements.push_back({});
data->element = &data->expression->elements.back();
}
action element_end {
maybe_join(data->expression->elements);
data->element = nullptr;
}
action literal_end {
data->element->token.type = Regexp::Token::Type::Literal;
assign_literal(data->element->token.literal, mark, p);
}
action symclass_end {
data->element->token.type = Regexp::Token::Type::Symclass;
parse_symclass(string_view(mark, p - mark), data->element->token.symclass);
}
action special {
data->element->token.type = Regexp::Token::Type::Symclass;
switch (*(p-1)) {
case '.':
data->element->token.symclass.ranges.push_back({CHAR_MIN, CHAR_MAX});
break;
}
}
action escaped_special {
data->element->token.type = Regexp::Token::Type::Symclass;
auto is_special = symclass_from_escaped(*(p-1), data->element->token.symclass);
assert(is_special);
}
action group_start {
if (p < pe-2 && p[1] == '?' && p[2] == ':') {
data->element->token.type = Regexp::Token::Type::Group;
p += 2;
} else {
data->element->token.type = Regexp::Token::Type::Capture;
}
data->element->token.regexp = std::make_unique<Regexp>();
data_stack.push_back({data->element->token.regexp.get()});
data = &data_stack.back();
stack.resize(top+1);
fcall group_regexp;
}
action group_end {
data_stack.pop_back();
data = &data_stack.back();
stack.pop_back();
fret;
}
action quant_end {
data->element->quant.min = num[0];
data->element->quant.max = num[1];
}
escaped = "\\" (any - 'x');
number = digit+;
symclass_char = (any - "\\" - "]") | escaped;
symclass = ("[" (symclass_char+) >mark %symclass_end "]");
qnt_maybe = "?" %{ num[0] = 0; num[1] = 1; };
qnt_any = "*" %{ num[0] = 0; num[1] = -1; };
qnt_anynn = "+" %{ num[0] = 1; num[1] = -1; };
qnt_num0 = number >mark %{ nsave(num[0]); };
qnt_num1 = number >mark %{ nsave(num[1]); };
qnt_exact = ("{" qnt_num0 "}") %{ num[1] = num[0]; };
qnt_upto = ("{," qnt_num1 "}") %{ num[0] = 0; };
qnt_from = ("{" qnt_num0 ",}") %{ num[1] = -1; };
qnt_minmax = ("{" qnt_num0 "," qnt_num1 "}");
quant = (qnt_maybe | qnt_any | qnt_anynn | qnt_exact | qnt_upto | qnt_from | qnt_minmax) %quant_end;
escaped_special = "\\" [dDwWsStnr] %escaped_special;
special = [.] %special;
lichar = (any - [{}()+*?|] - "\\" - "[" - "]" - special) | (escaped - escaped_special);
literal = (lichar+) >mark %literal_end;
group = '(' @group_start;
token = literal | symclass | special | escaped_special | group;
element = (token quant?) >element_start %element_end;
expression = (element+) >expression_start %expression_end;
regexp = (expression ('|' expression)*)?;
group_regexp := regexp ')' @group_end;
main := regexp;
write data;
}%%
static void assign_literal (string& dst, const char* p, const char* pe) {
while (p < pe) {
if (*p == '\\') {
++p;
assert(p < pe);
}
dst += *p++;
}
}
static void maybe_join (std::vector<Regexp::Element>& v) {
if (v.size() < 2) return;
auto& e1 = v[v.size()-2];
auto& e2 = v[v.size()-1];
if (e1.token.type != Regexp::Token::Type::Literal || e1.token.type != e2.token.type) return;
if (!e1.quant.is_default() || !e2.quant.is_default()) return;
e1.token.literal += e2.token.literal;
v.pop_back();
}
static bool symclass_from_escaped (char c, Regexp::Symclass& s) {
switch (c) {
case 'd':
s.ranges.push_back({'0', '9'});
return true;
case 'D':
s.ranges.push_back({CHAR_MIN, '0' - 1});
s.ranges.push_back({'9' + 1, CHAR_MAX});
return true;
case 'w':
s.ranges.push_back({'a', 'z'});
s.ranges.push_back({'A', 'Z'});
s.ranges.push_back({'0', '9'});
s.chars += '_';
return true;
case 'W':
s.ranges.push_back({CHAR_MIN, '0' - 1});
s.ranges.push_back({'9' + 1, 'A' - 1});
s.ranges.push_back({'Z' + 1, '_' - 1});
s.ranges.push_back({'_' + 1, 'a' - 1});
s.ranges.push_back({'z' + 1, CHAR_MAX});
return true;
case 't':
s.chars += "\t\v";
return true;
case 'n':
s.chars += "\n";
return true;
case 'r':
s.chars += "\r";
return true;
case 's':
s.chars += " \n\r\t\v";
return true;
case 'S':
s.ranges.push_back({CHAR_MIN, '\t' - 1});
s.ranges.push_back({'\n' + 1, '\r' - 1});
s.ranges.push_back({'\r' + 1, ' ' - 1});
s.ranges.push_back({' ' + 1, '\v' - 1});
s.ranges.push_back({'\v' + 1, CHAR_MAX});
return true;
}
return false;
}
static void parse_symclass (string_view str, Regexp::Symclass& s) {
//printf("parse symclass: %s\n", string(str).c_str());
auto p = str.data();
auto pe = p + str.length();
if (p != pe && *p == '^') {
s.inverse = true;
++p;
}
if (p == pe) throw std::logic_error("empty symclass");
bool has_char_after_range = false;
while (p < pe) {
if (*p == '-' && has_char_after_range && (p+1) < pe) {
s.ranges.push_back({s.chars.back(), *++p});
s.chars.pop_back();
has_char_after_range = false;
} else if (*p == '\\') {
++p;
assert(p < pe);
if (!symclass_from_escaped(*p, s)) {
s.chars.push_back(*p);
has_char_after_range = true;
}
} else {
s.chars.push_back(*p);
has_char_after_range = true;
}
++p;
}
}
RegexpPtr Regexp::parse (string_view str) {
const char* ps = str.data();
const char* p = ps;
const char* pe = p + str.length();
const char* eof = pe;
int cs = regexp_parser_start;
int top = 0;
std::vector<int> stack;
stack.resize(8);
const char* mark = nullptr;
int num[2];
auto ret = std::make_unique<Regexp>();
struct Data {
Regexp* re;
Regexp::Expression* expression = nullptr;
Regexp::Element* element = nullptr;
};
std::vector<Data> data_stack = {{ret.get()}};
auto data = &data_stack.back();
auto nsave = [&](int& dest) {
auto res = from_chars(mark, p, dest);
assert(!res.ec);
};
%% write exec;
if (cs < regexp_parser_first_final) {
throw std::logic_error("bad regexp");
}
return ret;
}
}}}