// Copyright (c) 2023 Yuki Kimoto
// MIT License
#include "spvm_utf8.h"
// True when ch is a UTF-8 continuation byte (bit pattern 10xxxxxx).
#define utf_cont(ch) (((ch) & 0xc0) == 0x80)
// Negative status returned by SPVM_UTF8_iterate for malformed input.
#define SPVM_UTF8PROC_ERROR_INVALIDUTF8 -3

// Decode the single UTF-8 encoded code point that starts at str.
//
// Parameters:
//   str    - first byte of the sequence to decode.
//   strlen - number of readable bytes at str. 0 means "nothing to decode".
//            A negative value means the length is unknown; at most 4 bytes
//            are examined (presumably the caller passes NUL-terminated
//            input in that case - confirm against callers).
//   dst    - receives the decoded code point, or -1 when nothing was decoded.
//
// Returns the number of bytes consumed (1 to 4), 0 when strlen is 0, or
// SPVM_UTF8PROC_ERROR_INVALIDUTF8 when the bytes are not a well-formed
// sequence (bad lead byte, missing or invalid continuation byte, overlong
// form, surrogate, or value above 0x10ffff).
int32_t SPVM_UTF8_iterate(const uint8_t *str, int32_t strlen, int32_t *dst) {
  uint32_t lead;
  int32_t avail;

  *dst = -1;
  if (!strlen) return 0;

  // Track the readable byte count as an integer instead of an end pointer:
  // forming str + 4 (or str + k past a short buffer) is undefined behaviour
  // even if the resulting pointer is never dereferenced.
  avail = (strlen < 0) ? 4 : strlen;

  lead = str[0];
  if (lead < 0x80) {
    *dst = (int32_t)lead;
    return 1;
  }

  // Lead byte must be between 0xc2 and 0xf4 inclusive to be valid:
  // 0x80-0xbf are continuation bytes, 0xc0/0xc1 only start overlong
  // forms, and 0xf5+ would encode values above 0x10ffff.
  if (lead < 0xc2 || lead > 0xf4) return SPVM_UTF8PROC_ERROR_INVALIDUTF8;

  if (lead < 0xe0) { // 2-byte sequence
    // Must have a valid continuation byte. The length test comes first so
    // str[1] is only read when it is inside the buffer.
    if (avail < 2 || !utf_cont(str[1])) return SPVM_UTF8PROC_ERROR_INVALIDUTF8;
    *dst = (int32_t)(((lead & 0x1f) << 6) | (uint32_t)(str[1] & 0x3f));
    return 2;
  }

  if (lead < 0xf0) { // 3-byte sequence
    uint32_t cp;

    // Short-circuit order matters: each byte is read only after the
    // previous one proved to be a continuation byte.
    if (avail < 3 || !utf_cont(str[1]) || !utf_cont(str[2]))
      return SPVM_UTF8PROC_ERROR_INVALIDUTF8;
    // Reject surrogates (0xd800 - 0xdfff), which are encoded as ED A0..BF xx.
    if (lead == 0xed && str[1] > 0x9f)
      return SPVM_UTF8PROC_ERROR_INVALIDUTF8;
    cp = ((lead & 0xf) << 12) | ((uint32_t)(str[1] & 0x3f) << 6) | (uint32_t)(str[2] & 0x3f);
    // Reject overlong encodings of values that fit in fewer bytes.
    if (cp < 0x800)
      return SPVM_UTF8PROC_ERROR_INVALIDUTF8;
    *dst = (int32_t)cp;
    return 3;
  }

  // 4-byte sequence: must have 3 valid continuation bytes.
  if (avail < 4 || !utf_cont(str[1]) || !utf_cont(str[2]) || !utf_cont(str[3]))
    return SPVM_UTF8PROC_ERROR_INVALIDUTF8;
  // Make sure the value is in the correct range (0x10000 - 0x10ffff).
  if (lead == 0xf0) {
    if (str[1] < 0x90) return SPVM_UTF8PROC_ERROR_INVALIDUTF8; // overlong
  } else if (lead == 0xf4) {
    if (str[1] > 0x8f) return SPVM_UTF8PROC_ERROR_INVALIDUTF8; // above 0x10ffff
  }
  *dst = (int32_t)(((lead & 7) << 18) | ((uint32_t)(str[1] & 0x3f) << 12) |
                   ((uint32_t)(str[2] & 0x3f) << 6) | (uint32_t)(str[3] & 0x3f));
  return 4;
}
// Report whether code_point is acceptable as a UTF-8 code point.
// The decision is delegated entirely to SPVM_UTF8_is_unicode_scalar_value,
// and its result is returned unchanged.
int32_t SPVM_UTF8_is_valid_utf8_code_point(int32_t code_point) {
  const int32_t is_scalar_value = SPVM_UTF8_is_unicode_scalar_value(code_point);
  return is_scalar_value;
}
// Report whether code_point is a Unicode scalar value.
// Returns 1 for values in 0 .. 0x10FFFF that are not in the surrogate
// range 0xD800 .. 0xDFFF, and 0 for everything else.
int32_t SPVM_UTF8_is_unicode_scalar_value(int32_t code_point) {
  // Outside the 0 .. 0x10FFFF range.
  if (code_point < 0 || code_point > 0x10FFFF) {
    return 0;
  }

  // Surrogate range.
  if (code_point >= 0xD800 && code_point <= 0xDFFF) {
    return 0;
  }

  return 1;
}
// Encode the code point uc as UTF-8 into dst.
//
// Parameters:
//   uc  - code point to encode.
//   dst - output buffer; up to 4 bytes are written, no terminator is added.
//
// Returns the number of bytes written (1 to 4), or 0 when uc is negative
// or not below 0x110000, in which case dst is left untouched.
int32_t SPVM_UTF8_convert_unicode_codepoint_to_utf8_character(int32_t uc, uint8_t* dst) {
  // Values outside 0 .. 0x10FFFF are not encoded.
  if (uc < 0x00 || uc >= 0x110000) {
    return 0;
  }

  // 1 byte: 0xxxxxxx
  if (uc < 0x80) {
    dst[0] = (uint8_t)uc;
    return 1;
  }

  // 2 bytes: 110xxxxx 10xxxxxx
  if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 | (uc >> 6));
    dst[1] = (uint8_t)(0x80 | (uc & 0x3F));
    return 2;
  }

  // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
  // Note: we allow encoding 0xd800-0xdfff here, so as not to change
  // the API, however, these are actually invalid in UTF-8
  if (uc < 0x10000) {
    dst[0] = (uint8_t)(0xE0 | (uc >> 12));
    dst[1] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
    dst[2] = (uint8_t)(0x80 | (uc & 0x3F));
    return 3;
  }

  // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  dst[0] = (uint8_t)(0xF0 | (uc >> 18));
  dst[1] = (uint8_t)(0x80 | ((uc >> 12) & 0x3F));
  dst[2] = (uint8_t)(0x80 | ((uc >> 6) & 0x3F));
  dst[3] = (uint8_t)(0x80 | (uc & 0x3F));
  return 4;
}