#include "spvm_utf8.h"
/* Non-zero when byte `ch` has the bit pattern 10xxxxxx (a UTF-8 continuation byte). */
#define utf_cont(ch) (((ch) & 0xc0) == 0x80)
/* Value returned by SPVM_UTF8_iterate for a malformed or truncated sequence. */
#define SPVM_UTF8PROC_ERROR_INVALIDUTF8 -3

/*
 * Decode a single UTF-8 encoded code point from the beginning of `str`.
 *
 * str    : bytes to decode; must not be NULL when `strlen` is non-zero.
 * strlen : number of bytes available at `str`. 0 means "nothing to read".
 *          A negative value means the length is not given; at most 4 bytes
 *          are then examined (presumably the caller guarantees a
 *          NUL-terminated buffer in that case -- a NUL byte is never a
 *          continuation byte, so decoding stops at it).
 * dst    : receives the decoded code point, or -1 when nothing was decoded.
 *
 * Returns the number of bytes consumed (1 to 4), 0 when `strlen` is 0, or
 * SPVM_UTF8PROC_ERROR_INVALIDUTF8 when the input is not well-formed UTF-8
 * (bad lead byte, missing/invalid continuation byte, overlong form,
 * surrogate, or a value above U+10FFFF).
 */
int32_t SPVM_UTF8_iterate(const uint8_t *str, int32_t strlen, int32_t *dst) {
  uint32_t uc;
  int32_t avail;

  *dst = -1;
  if (!strlen) {
    return 0;
  }

  /*
   * Track the readable length as a count instead of an end pointer.
   * Forming `str + 4` or `str + 2` beyond one-past-the-end of the buffer
   * is undefined behaviour even if the pointer is never dereferenced.
   */
  avail = (strlen < 0) ? 4 : strlen;

  uc = str[0];

  /* 1-byte sequence (ASCII). */
  if (uc < 0x80) {
    *dst = (int32_t)uc;
    return 1;
  }

  /* A lead byte must lie in 0xc2..0xf4; 0x80..0xc1 and 0xf5..0xff are never valid. */
  if (uc < 0xc2 || uc > 0xf4) {
    return SPVM_UTF8PROC_ERROR_INVALIDUTF8;
  }

  /* 2-byte sequence. */
  if (uc < 0xe0) {
    if (avail < 2 || !utf_cont(str[1])) {
      return SPVM_UTF8PROC_ERROR_INVALIDUTF8;
    }
    *dst = (int32_t)(((uc & 0x1f) << 6) | (uint32_t)(str[1] & 0x3f));
    return 2;
  }

  /* 3-byte sequence. */
  if (uc < 0xf0) {
    if (avail < 3 || !utf_cont(str[1]) || !utf_cont(str[2])) {
      return SPVM_UTF8PROC_ERROR_INVALIDUTF8;
    }
    /* 0xed followed by 0xa0..0xbf would decode to a surrogate (U+D800..U+DFFF). */
    if (uc == 0xed && str[1] > 0x9f) {
      return SPVM_UTF8PROC_ERROR_INVALIDUTF8;
    }
    uc = ((uc & 0xf) << 12) | ((uint32_t)(str[1] & 0x3f) << 6) | (uint32_t)(str[2] & 0x3f);
    /* Values below 0x800 fit in fewer bytes: overlong encoding. */
    if (uc < 0x800) {
      return SPVM_UTF8PROC_ERROR_INVALIDUTF8;
    }
    *dst = (int32_t)uc;
    return 3;
  }

  /* 4-byte sequence. */
  if (avail < 4 || !utf_cont(str[1]) || !utf_cont(str[2]) || !utf_cont(str[3])) {
    return SPVM_UTF8PROC_ERROR_INVALIDUTF8;
  }
  if (uc == 0xf0) {
    /* Second byte below 0x90 would decode to less than 0x10000 (overlong). */
    if (str[1] < 0x90) {
      return SPVM_UTF8PROC_ERROR_INVALIDUTF8;
    }
  }
  else if (uc == 0xf4) {
    /* Second byte above 0x8f would decode to more than 0x10FFFF. */
    if (str[1] > 0x8f) {
      return SPVM_UTF8PROC_ERROR_INVALIDUTF8;
    }
  }
  *dst = (int32_t)(((uc & 7) << 18)
                   | ((uint32_t)(str[1] & 0x3f) << 12)
                   | ((uint32_t)(str[2] & 0x3f) << 6)
                   | (uint32_t)(str[3] & 0x3f));
  return 4;
}
/*
 * Report whether `code_point` can be represented in UTF-8.
 *
 * This is a thin alias: the answer is exactly whatever
 * SPVM_UTF8_is_unicode_scalar_value returns for the same argument.
 */
int32_t SPVM_UTF8_is_valid_utf8_code_point(int32_t code_point) {
  int32_t is_valid = SPVM_UTF8_is_unicode_scalar_value(code_point);

  return is_valid;
}
/*
 * Report whether `code_point` is a Unicode scalar value.
 *
 * Returns 1 when the value lies in 0..0x10FFFF and is outside the
 * surrogate range 0xD800..0xDFFF; returns 0 otherwise.
 */
int32_t SPVM_UTF8_is_unicode_scalar_value(int32_t code_point) {
  /* Outside the code space entirely. */
  if (code_point < 0 || code_point > 0x10FFFF) {
    return 0;
  }

  /* Surrogates are code points but not scalar values. */
  if (code_point >= 0xD800 && code_point <= 0xDFFF) {
    return 0;
  }

  return 1;
}
/*
 * Encode the code point `uc` as UTF-8 into `dst`.
 *
 * uc  : code point to encode.
 * dst : output buffer; up to 4 bytes are written, so it must have room for
 *       at least 4 bytes. No terminating NUL is written.
 *
 * Returns the number of bytes written (1 to 4), or 0 when `uc` cannot be
 * encoded: negative values, values above 0x10FFFF, and the surrogate range
 * 0xD800..0xDFFF. Nothing is written to `dst` when 0 is returned.
 */
int32_t SPVM_UTF8_convert_unicode_codepoint_to_utf8_character(int32_t uc, uint8_t* dst) {
  if (uc < 0x00) {
    return 0;
  }

  if (uc < 0x80) {
    dst[0] = (uint8_t)uc;
    return 1;
  }

  if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 + (uc >> 6));
    dst[1] = (uint8_t)(0x80 + (uc & 0x3F));
    return 2;
  }

  if (uc < 0x10000) {
    /*
     * Surrogates have no well-formed UTF-8 encoding; emitting three bytes
     * for them would produce output that the decoder in this file rejects.
     */
    if (uc >= 0xD800 && uc <= 0xDFFF) {
      return 0;
    }
    dst[0] = (uint8_t)(0xE0 + (uc >> 12));
    dst[1] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
    dst[2] = (uint8_t)(0x80 + (uc & 0x3F));
    return 3;
  }

  if (uc < 0x110000) {
    dst[0] = (uint8_t)(0xF0 + (uc >> 18));
    dst[1] = (uint8_t)(0x80 + ((uc >> 12) & 0x3F));
    dst[2] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
    dst[3] = (uint8_t)(0x80 + (uc & 0x3F));
    return 4;
  }

  /* Above U+10FFFF: outside the Unicode code space. */
  return 0;
}