#define lutf8lib_c
#define LUA_LIB
#include "lprefix.h"
#include <assert.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include "lua.h"
#include "lauxlib.h"
#include "lualib.h"
/* upper bound (inclusive) accepted for code points by this library */
#define MAXUNICODE 0x10FFFF
/* true when the byte at 'p' has the bit pattern 10xxxxxx, i.e. it is a
   UTF-8 continuation byte; 'p' is evaluated once */
#define iscont(p) ((*(p) & 0xC0) == 0x80)
/*
** Translate a relative string position into an absolute one.
** A non-negative 'pos' is returned unchanged.  A negative 'pos' counts
** backward from the end of a string of length 'len' (-1 is the last
** byte); if it reaches before the start of the string the result is 0.
*/
static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
  if (pos < 0) {
    /* the magnitude of 'pos' is computed in unsigned arithmetic */
    return (0u - (size_t)pos > len) ? 0 : (lua_Integer)len + pos + 1;
  }
  return pos;
}
/*
** Decode one UTF-8 sequence starting at 'o'.
** On success, stores the decoded code point in '*val' (when 'val' is not
** NULL) and returns a pointer to the byte just after the sequence.
** Returns NULL when the sequence is invalid: a missing or malformed
** continuation byte, more than three continuation bytes, an overlong
** encoding, a stray continuation byte in lead position, or a value
** above MAXUNICODE.
*/
static const char *utf8_decode (const char *o, int *val) {
  /* limits[k]: a sequence with k continuation bytes must decode to a
     value strictly greater than limits[k]; entry 0 makes every
     non-ASCII byte without continuations fail the check */
  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
  const unsigned char *s = (const unsigned char *)o;
  unsigned int c = s[0];
  unsigned int res = 0;
  if (c < 0x80)  /* single-byte sequence? */
    res = c;
  else {
    int count = 0;  /* continuation bytes consumed so far */
    while (c & 0x40) {  /* lead byte announces one more byte? */
      int cc;
      /* A 4th continuation byte can never yield a valid result.  Bail
         out now so 'count' stays <= 3 and the shift below is always
         smaller than the width of 'unsigned int'. */
      if (count == 3)
        return NULL;
      cc = s[++count];  /* read next byte */
      if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
        return NULL;
      res = (res << 6) | (cc & 0x3F);  /* append its 6 payload bits */
      c <<= 1;  /* move on to the next length-marker bit */
    }
    res |= ((c & 0x7F) << (count * 5));  /* add payload of the lead byte */
    if (res > MAXUNICODE || res <= limits[count])
      return NULL;  /* out of range or overlong */
    s += count;  /* skip the continuation bytes just read */
  }
  if (val) *val = (int)res;
  return (const char *)s + 1;  /* +1 accounts for the lead byte */
}
/*
** Lua function registered as "len": utf8.len(s [, i [, j]]).
** Counts the UTF-8 sequences that start between byte positions 'i'
** (default 1) and 'j' (default -1), both inclusive.
** Pushes the count, or nil plus the 1-based position of the first byte
** that does not start a valid sequence.
** Raises an argument error when 'i' or 'j' lies outside the string.
*/
static int utflen (lua_State *L) {
  /* counter has the same width as string positions, so a very long
     subject cannot overflow it */
  lua_Integer n = 0;
  size_t len;
  const char *s = marpa_luaL_checklstring(L, 1, &len);
  lua_Integer posi = u_posrelat(marpa_luaL_optinteger(L, 2, 1), len);
  lua_Integer posj = u_posrelat(marpa_luaL_optinteger(L, 3, -1), len);
  /* the decrements inside the checks turn both positions into 0-based
     offsets */
  marpa_luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
                      "initial position out of string");
  marpa_luaL_argcheck(L, --posj < (lua_Integer)len, 3,
                      "final position out of string");
  while (posi <= posj) {
    const char *s1 = utf8_decode(s + posi, NULL);
    if (s1 == NULL) {  /* conversion error? */
      marpa_lua_pushnil(L);  /* return nil ... */
      marpa_lua_pushinteger(L, posi + 1);  /* ... and current position */
      return 2;
    }
    posi = s1 - s;  /* offset of the next sequence */
    n++;
  }
  marpa_lua_pushinteger(L, n);
  return 1;
}
/*
** Lua function registered as "codepoint": utf8.codepoint(s [, i [, j]]).
** Pushes the code points of all sequences that start between byte
** positions 'i' (default 1) and 'j' (default 'i'), inclusive, and
** returns how many were pushed.
** Raises an error for out-of-range positions, for a slice too long to
** fit on the stack, or for an invalid sequence.
*/
static int codepoint (lua_State *L) {
  size_t len;
  const char *s = marpa_luaL_checklstring(L, 1, &len);
  lua_Integer posi = u_posrelat(marpa_luaL_optinteger(L, 2, 1), len);
  lua_Integer pose = u_posrelat(marpa_luaL_optinteger(L, 3, posi), len);
  const char *cur;   /* next byte to decode */
  const char *stop;  /* one past the last byte where a sequence may start */
  int pushed = 0;    /* number of results on the stack */
  marpa_luaL_argcheck(L, posi >= 1, 2, "out of range");
  marpa_luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range");
  if (posi > pose)
    return 0;  /* empty interval; return no values */
  if (pose - posi >= INT_MAX)  /* (lua_Integer -> int) overflow? */
    return marpa_luaL_error(L, "string slice too long");
  /* reserve one stack slot per byte of the slice (an upper bound on the
     number of code points) */
  marpa_luaL_checkstack(L, (int)(pose - posi) + 1, "string slice too long");
  cur = s + (posi - 1);
  stop = s + pose;
  while (cur < stop) {
    int code;
    cur = utf8_decode(cur, &code);
    if (cur == NULL)
      return marpa_luaL_error(L, "invalid UTF-8 code");
    marpa_lua_pushinteger(L, code);
    pushed++;
  }
  return pushed;
}
/*
** Check that argument 'arg' is an integer in the range [0, MAXUNICODE]
** and push the string produced by the "%U" format for it.
** Raises an argument error when the value is out of range.
*/
static void pushutfchar (lua_State *L, int arg) {
  lua_Integer cp = marpa_luaL_checkinteger(L, arg);
  marpa_luaL_argcheck(L, 0 <= cp && cp <= MAXUNICODE, arg,
                      "value out of range");
  marpa_lua_pushfstring(L, "%U", (long)cp);
}
/*
** Lua function registered as "char": utf8.char(n1, n2, ...).
** Converts every argument with 'pushutfchar' and pushes the
** concatenation of the results as a single string.
*/
static int utfchar (lua_State *L) {
  int nargs = marpa_lua_gettop(L);  /* number of arguments */
  if (nargs != 1) {
    /* zero or several arguments: accumulate the pieces in a buffer */
    luaL_Buffer buf;
    int idx = 1;
    marpa_luaL_buffinit(L, &buf);
    while (idx <= nargs) {
      pushutfchar(L, idx);
      marpa_luaL_addvalue(&buf);
      idx++;
    }
    marpa_luaL_pushresult(&buf);
  }
  else  /* single argument: no buffer needed */
    pushutfchar(L, 1);
  return 1;
}
/*
** Lua function registered as "offset": utf8.offset(s, n [, i]).
** Starting at byte position 'i' (default 1 when n >= 0, otherwise
** len + 1):
**   n == 0  backs up to the first byte of the sequence containing 'i';
**   n < 0   moves backward over |n| sequences;
**   n > 0   moves forward over n - 1 sequences.
** Pushes the resulting 1-based byte position, or nil when the string
** ends before the requested sequence is reached.
** Raises an error when 'i' is out of range or, for n ~= 0, when 'i'
** points at a continuation byte.
*/
static int byteoffset (lua_State *L) {
  size_t len;
  const char *s = marpa_luaL_checklstring(L, 1, &len);
  lua_Integer n = marpa_luaL_checkinteger(L, 2);
  lua_Integer posi = (n >= 0) ? 1 : (lua_Integer)len + 1;
  posi = u_posrelat(marpa_luaL_optinteger(L, 3, posi), len);
  /* the decrement inside the check turns 'posi' into a 0-based offset */
  marpa_luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
                      "position out of range");
  if (n == 0) {
    /* find the beginning of the current byte sequence */
    while (posi > 0 && iscont(s + posi)) posi--;
  }
  else {
    if (iscont(s + posi))
      return marpa_luaL_error(L, "initial position is a continuation byte");
    if (n < 0) {
      while (n < 0 && posi > 0) {  /* move back */
        do {  /* find the beginning of the previous sequence */
          posi--;
        } while (posi > 0 && iscont(s + posi));
        n++;
      }
    }
    else {
      n--;  /* do not move for the 1st sequence */
      while (n > 0 && posi < (lua_Integer)len) {
        do {  /* find the beginning of the next sequence */
          posi++;
        } while (iscont(s + posi));  /* (cannot pass the final '\0') */
        n--;
      }
    }
  }
  if (n == 0)  /* did it find the requested sequence? */
    marpa_lua_pushinteger(L, posi + 1);
  else  /* no such sequence */
    marpa_lua_pushnil(L);
  return 1;
}
/*
** Iteration step for the "codes" iterator.
** Argument 1 is the subject string; argument 2 is the control value,
** the 1-based position returned by the previous step (0 or less on the
** first step).
** Pushes the position and code point of the next sequence, or returns
** no values once the end of the string is reached.
** Raises an error on an invalid sequence.
*/
static int iter_aux (lua_State *L) {
  size_t len;
  const char *s = marpa_luaL_checklstring(L, 1, &len);
  lua_Integer ctl = marpa_lua_tointeger(L, 2);
  lua_Integer n = 0;  /* 0-based offset of the sequence to decode */
  /* Test the sign before subtracting: 'ctl - 1' would overflow when the
     function is called directly with the minimum integer. */
  if (ctl > 0) {  /* not the first iteration? */
    n = ctl - 1;
    if (n < (lua_Integer)len) {
      n++;  /* skip current byte */
      while (iscont(s + n)) n++;  /* and its continuations */
    }
  }
  if (n >= (lua_Integer)len)
    return 0;  /* no more values for iteration */
  else {
    int code;
    const char *next = utf8_decode(s + n, &code);
    /* a continuation byte right after the sequence is also an error */
    if (next == NULL || iscont(next))
      return marpa_luaL_error(L, "invalid UTF-8 code");
    marpa_lua_pushinteger(L, n + 1);
    marpa_lua_pushinteger(L, code);
    return 2;
  }
}
/*
** Lua function registered as "codes": utf8.codes(s).
** Checks that argument 1 is a string and pushes the three values used
** by a generic 'for' loop: the step function 'iter_aux', the subject
** string, and the initial control value 0.
*/
static int iter_codes (lua_State *L) {
  marpa_luaL_checkstring(L, 1);
  marpa_lua_pushcfunction(L, iter_aux);  /* iterator function */
  marpa_lua_pushvalue(L, 1);             /* state: the subject string */
  marpa_lua_pushinteger(L, 0);           /* initial control value */
  return 3;
}
/* Lua pattern stored in the "charpattern" field: one lead byte in the
   range 0x00-0x7F or 0xC2-0xF4 followed by zero or more bytes in the
   range 0x80-0xBF.  The literal contains an embedded '\0', so its
   length must be taken with sizeof, not strlen. */
#define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
/* registration table: field name in the library table -> C function */
static
const
luaL_Reg funcs[] = {
{
"offset"
, byteoffset},
{
"codepoint"
, codepoint},
{
"char"
, utfchar},
{
"len"
, utflen},
{
"codes"
, iter_codes},
/* placeholder: marpa_luaopen_utf8 sets this field to the pattern string */
{
"charpattern"
, NULL},
/* sentinel marking the end of the table */
{NULL, NULL}
};
/*
** Open the library: create the library table from 'funcs', then store
** the UTF8PATT string in its "charpattern" field.
** Leaves the library table on the stack and returns 1.
*/
LUAMOD_API int marpa_luaopen_utf8 (lua_State *L) {
  marpa_luaL_newlib(L, funcs);
  /* sizeof counts the embedded '\0' inside the pattern; the -1 drops
     only the terminating one */
  marpa_lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT) - 1);
  marpa_lua_setfield(L, -2, "charpattern");
  return 1;
}