#define PERL_NO_GET_CONTEXT
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "ppport.h"
#include "utf8_valid.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/*
uri_encode.c - functions for URI percent encoding / decoding
*/
/*
 * Percent-encoding lookup table, indexed by byte value (0x00-0xFF).
 *
 * Each byte value owns one 4-character slot:
 *   - "%XX\0"  for a byte that must be escaped, or
 *   - four NUL bytes (the _______ placeholder) for a byte that is copied
 *     through unchanged.
 *
 * The pass-through entries are exactly the ASCII letters, the digits and
 * '-', '.', '_', '~'; every other byte value has an escape.
 *
 * The array is sized as sizeof(U32) * 0x100 while the entries as written are
 * 4 characters each, so the layout presumes U32 is 4 bytes wide.
 * The final entry "%FF" has no explicit "\0": its fourth byte is the string
 * literal's own terminator, which fills the last byte of the array.
 */
#define _______ "\0\0\0\0"
static const char uri_encode_tbl[ sizeof(U32) * 0x100 ] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
"%00\0" "%01\0" "%02\0" "%03\0" "%04\0" "%05\0" "%06\0" "%07\0" "%08\0" "%09\0" "%0A\0" "%0B\0" "%0C\0" "%0D\0" "%0E\0" "%0F\0" /* 0: 0 ~ 15 */
"%10\0" "%11\0" "%12\0" "%13\0" "%14\0" "%15\0" "%16\0" "%17\0" "%18\0" "%19\0" "%1A\0" "%1B\0" "%1C\0" "%1D\0" "%1E\0" "%1F\0" /* 1: 16 ~ 31 */
"%20\0" "%21\0" "%22\0" "%23\0" "%24\0" "%25\0" "%26\0" "%27\0" "%28\0" "%29\0" "%2A\0" "%2B\0" "%2C\0" _______ _______ "%2F\0" /* 2: 32 ~ 47 */
_______ _______ _______ _______ _______ _______ _______ _______ _______ _______ "%3A\0" "%3B\0" "%3C\0" "%3D\0" "%3E\0" "%3F\0" /* 3: 48 ~ 63 */
"%40\0" _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ /* 4: 64 ~ 79 */
_______ _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ "%5B\0" "%5C\0" "%5D\0" "%5E\0" _______ /* 5: 80 ~ 95 */
"%60\0" _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ /* 6: 96 ~ 111 */
_______ _______ _______ _______ _______ _______ _______ _______ _______ _______ _______ "%7B\0" "%7C\0" "%7D\0" _______ "%7F\0" /* 7: 112 ~ 127 */
"%80\0" "%81\0" "%82\0" "%83\0" "%84\0" "%85\0" "%86\0" "%87\0" "%88\0" "%89\0" "%8A\0" "%8B\0" "%8C\0" "%8D\0" "%8E\0" "%8F\0" /* 8: 128 ~ 143 */
"%90\0" "%91\0" "%92\0" "%93\0" "%94\0" "%95\0" "%96\0" "%97\0" "%98\0" "%99\0" "%9A\0" "%9B\0" "%9C\0" "%9D\0" "%9E\0" "%9F\0" /* 9: 144 ~ 159 */
"%A0\0" "%A1\0" "%A2\0" "%A3\0" "%A4\0" "%A5\0" "%A6\0" "%A7\0" "%A8\0" "%A9\0" "%AA\0" "%AB\0" "%AC\0" "%AD\0" "%AE\0" "%AF\0" /* A: 160 ~ 175 */
"%B0\0" "%B1\0" "%B2\0" "%B3\0" "%B4\0" "%B5\0" "%B6\0" "%B7\0" "%B8\0" "%B9\0" "%BA\0" "%BB\0" "%BC\0" "%BD\0" "%BE\0" "%BF\0" /* B: 176 ~ 191 */
"%C0\0" "%C1\0" "%C2\0" "%C3\0" "%C4\0" "%C5\0" "%C6\0" "%C7\0" "%C8\0" "%C9\0" "%CA\0" "%CB\0" "%CC\0" "%CD\0" "%CE\0" "%CF\0" /* C: 192 ~ 207 */
"%D0\0" "%D1\0" "%D2\0" "%D3\0" "%D4\0" "%D5\0" "%D6\0" "%D7\0" "%D8\0" "%D9\0" "%DA\0" "%DB\0" "%DC\0" "%DD\0" "%DE\0" "%DF\0" /* D: 208 ~ 223 */
"%E0\0" "%E1\0" "%E2\0" "%E3\0" "%E4\0" "%E5\0" "%E6\0" "%E7\0" "%E8\0" "%E9\0" "%EA\0" "%EB\0" "%EC\0" "%ED\0" "%EE\0" "%EF\0" /* E: 224 ~ 239 */
"%F0\0" "%F1\0" "%F2\0" "%F3\0" "%F4\0" "%F5\0" "%F6\0" "%F7\0" "%F8\0" "%F9\0" "%FA\0" "%FB\0" "%FC\0" "%FD\0" "%FE\0" "%FF" /* F: 240 ~ 255 */
};
#undef _______
/*
 * Hex-digit value table, indexed by byte value (0x00-0xFF).
 *
 * '0'-'9' map to 0-9, 'A'-'F' and 'a'-'f' map to 10-15.  Every other byte
 * maps to the sentinel 0xFF (written as __ below), which marks "not a hex
 * digit".  Because valid values only use the low four bits, OR-ing two
 * lookups yields 0xFF exactly when at least one of them was invalid.
 */
#define __ 0xFF
static const unsigned char hexval[0x100] = {
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, /* 00-0F */
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, /* 10-1F */
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, /* 20-2F */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,__,__,__,__,__,__, /* 30-3F */
__,10,11,12,13,14,15,__,__,__,__,__,__,__,__,__, /* 40-4F */
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, /* 50-5F */
__,10,11,12,13,14,15,__,__,__,__,__,__,__,__,__, /* 60-6F */
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, /* 70-7F */
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, /* 80-8F */
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, /* 90-9F */
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, /* A0-AF */
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, /* B0-BF */
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, /* C0-CF */
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, /* D0-DF */
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, /* E0-EF */
__,__,__,__,__,__,__,__,__,__,__,__,__,__,__,__, /* F0-FF */
};
#undef __
size_t uri_encode (const char *src, const size_t len, char *dst)
{
size_t i = 0, j = 0;
while (i < len)
{
const char octet = src[i++];
const U32 code = ((U32*)uri_encode_tbl)[ (unsigned char)octet ];
if (code) {
*((U32*)&dst[j]) = code;
j += 3;
}
else dst[j++] = octet;
}
dst[j] = '\0';
return j;
}
/*
 * Decode percent-escapes in the `len` bytes of `src`, writing to `dst`.
 *
 * A '%' followed by two hex digits (both inside the input) becomes the single
 * byte they encode.  Anything else -- including a '%' that is not followed by
 * two valid hex digits -- is copied through verbatim.  `dst` is always
 * NUL-terminated.
 *
 * The output is never longer than the input, so `dst` needs len + 1 bytes.
 *
 * Returns the number of bytes written, not counting the terminator.
 */
size_t uri_decode (const char *src, const size_t len, char *dst)
{
  size_t in = 0, out = 0;
  while (in < len)
  {
    if (src[in] == '%' && in + 2 < len)
    {
      const unsigned char hi = hexval[ (unsigned char)src[in + 1] ];
      const unsigned char lo = hexval[ (unsigned char)src[in + 2] ];
      /* 0xFF only appears when at least one lookup hit a non-hex byte */
      if ((hi | lo) != 0xFF)
      {
        dst[out] = (hi << 4) | lo;
        out += 1;
        in += 3;
        continue;
      }
    }
    /* ordinary byte, or a '%' that does not start a valid escape */
    dst[out] = src[in];
    out += 1;
    in += 1;
  }
  dst[out] = '\0';
  return out;
}
/* Upper-case hex digits, indexed by nibble value (0-15); used when
   formatting offending bytes into error messages. */
static const char * const kHex = "0123456789ABCDEF";
/*
 * Croak unless the `len` bytes at `src` pass utf8_check().
 *
 * On failure the offending bytes -- as many as utf8_maximal_subpart()
 * reports, starting at the position utf8_check() stored in `cursor` -- are
 * rendered as space-separated upper-case hex pairs inside the error message.
 * Returns normally only when the input is accepted.
 */
static void THX_assert_wellformed_utf8(pTHX_ const char *src, size_t len)
{
  size_t cursor;
  if (!utf8_check(src, len, &cursor))
  {
    char sequence[sizeof("\\xHH \\xHH \\xHH")];
    /* each rendered byte needs two hex digits plus a separator/terminator */
    const size_t max_bytes = sizeof(sequence) / 3;
    char *d;
    /* Move to the ill-formed position and shrink the remaining length to
       match, so the subpart scan cannot look beyond the end of the input. */
    src += cursor;
    len -= cursor;
    len = utf8_maximal_subpart(src, len);
    /* defensive: never format more bytes than the buffer can hold */
    if (len > max_bytes)
      len = max_bytes;
    for (d = sequence; len > 0; len--)
    {
      const unsigned char c = (unsigned char)*src++;
      *d++ = kHex[c >> 4];
      *d++ = kHex[c & 15];
      if (len > 1)
        *d++ = ' ';
    }
    *d = 0;
    croak("Can't decode ill-formed UTF-8 octet sequence <%s>\n", sequence);
  }
}
/*
 * Croak unless the `len` bytes at `src` pass utf8_check().
 *
 * When the check fails, the bytes at the failing position are handed to
 * Perl's own decoder (any code point allowed, overlong forms rejected) to
 * pick the error message:
 *   - it decodes to a code point: report it as a surrogate (U+D800..U+DFFF)
 *     or otherwise as a "super" code point that UTF-8 cannot represent;
 *   - it does not decode: report the ill-formed byte sequence in hex.
 * Returns normally only when the input is accepted.
 */
static void THX_assert_wellformed_unicode(pTHX_ const char *src, size_t len)
{
  size_t cursor;
  if (!utf8_check(src, len, &cursor))
  {
    const U32 flags = (UTF8_ALLOW_ANYUV|UTF8_CHECK_ONLY) & ~UTF8_ALLOW_LONG;
    const unsigned char *cur = (const unsigned char *)src;
    UV ord;
    /* continue from the position utf8_check() flagged */
    cur += cursor;
    len -= cursor;
    /* cursor is reused as the decoder's returned length */
    ord = utf8n_to_uvuni(cur, len, &cursor, flags);
    if (cursor != (STRLEN) -1)
    {
      const char *fmt;
      /* Compare the whole value: masking only the low 16 bits would
         misreport e.g. 0x11D800 as a surrogate. */
      if (ord >= 0xD800 && ord <= 0xDFFF)
        fmt = "Can't represent surrogate code point U+%"UVXf" in UTF-8 encoding";
      else
        fmt = "Can't represent super code point \\x{%"UVXf"} in UTF-8 encoding";
      croak(fmt, ord);
    }
    else
    {
      /* two hex digits plus a separator/terminator per byte */
      char sequence[UTF8_MAXBYTES * 3];
      char *d;
      /* always report at least the first byte; extend over the continuation
         bytes that follow a start byte, without leaving the input */
      cursor = 1;
      if (UTF8_IS_START(*cur))
      {
        size_t skip = UTF8SKIP(cur);
        if (skip > len)
          skip = len;
        while (cursor < skip && UTF8_IS_CONTINUATION(cur[cursor]))
          cursor++;
      }
      for (d = sequence; cursor > 0; cursor--)
      {
        const unsigned char c = *cur++;
        *d++ = kHex[c >> 4];
        *d++ = kHex[c & 15];
        if (cursor > 1)
          *d++ = ' ';
      }
      *d = 0;
      croak("Can't decode ill-formed UTF-X octet sequence <%s>\n", sequence);
    }
  }
}
/*
 * Percent-encode `len` bytes of `src` into the string buffer of `dsv`.
 *
 * `dsv` is upgraded to a string-capable SV, grown to the worst-case encoded
 * size, filled by uri_encode(), and left flagged as a plain string whose
 * length is the encoded length.
 */
static void THX_uri_encode_dsv (pTHX_ const char *src, size_t len, SV *dsv)
{
  char *buf;
  size_t encoded_len;

  /* SvGROW needs at least an SVt_PV body to work on */
  SvUPGRADE(dsv, SVt_PV);

  /* worst case: every input byte becomes "%XX", plus the trailing NUL */
  buf = SvGROW(dsv, len * 3 + 1);
  encoded_len = uri_encode(src, len, buf);

  /* record the string length, then mark dsv as string-only (POK) */
  SvCUR_set(dsv, encoded_len);
  SvPOK_only(dsv);
}
/*
 * Percent-decode `len` bytes of `src` into the string buffer of `dsv`.
 *
 * `dsv` is upgraded to a string-capable SV, grown to hold the input length
 * plus a terminator, filled by uri_decode(), and left flagged as a plain
 * string whose length is the decoded length.
 */
static void THX_uri_decode_dsv (pTHX_ const char *src, size_t len, SV *dsv)
{
  char *buf;
  size_t decoded_len;

  /* SvGROW needs at least an SVt_PV body to work on */
  SvUPGRADE(dsv, SVt_PV);

  /* decoding never produces more bytes than it consumes; +1 for the NUL */
  buf = SvGROW(dsv, len + 1);
  decoded_len = uri_decode(src, len, buf);

  /* record the string length, then mark dsv as string-only (POK) */
  SvCUR_set(dsv, decoded_len);
  SvPOK_only(dsv);
}
/*
 * Convenience wrappers around the THX_* helpers above: each one forwards its
 * arguments unchanged and supplies the Perl interpreter context (aTHX_) as
 * the leading argument, so call sites do not have to spell it out.
 */
/* encode src/len into dsv */
#define uri_encode_dsv(src, len, dsv) \
THX_uri_encode_dsv(aTHX_ src, len, dsv)
/* decode src/len into dsv */
#define uri_decode_dsv(src, len, dsv) \
THX_uri_decode_dsv(aTHX_ src, len, dsv)
/* croak unless src/len is well-formed UTF-8 */
#define assert_wellformed_utf8(src, len) \
THX_assert_wellformed_utf8(aTHX_ src, len)
/* croak unless src/len passes the check used before encoding */
#define assert_wellformed_unicode(src, len) \
THX_assert_wellformed_unicode(aTHX_ src, len)
MODULE = URI::Encode::XS PACKAGE = URI::Encode::XS
PROTOTYPES: ENABLED
void
uri_encode(SV *uri)
PREINIT:
/* uri_encode(uri): return the percent-encoded form of uri's octets.
   Croaks when uri is undefined, or when it is flagged UTF-8 and holds a
   character that does not fit into a single octet. */
/* declare TARG, the SV that carries the return value */
dXSTARG;
const char *src;
size_t len;
PPCODE:
/* run get-magic once (e.g. FETCH on a tied variable) to populate the sv */
SvGETMAGIC(uri);
/* check for undef */
if (!SvOK(uri))
{
croak("uri_encode() requires a scalar argument to encode!");
}
/* fetch the string pointer and length; magic was already handled above */
src = SvPV_nomg_const(uri, len);
/* if the scalar is flagged as UTF-8, work on octets instead */
if (SvUTF8(uri))
{
/* make a mortal temp copy so the caller's scalar is not modified */
uri = sv_2mortal(newSVpvn(src, len));
/* turn on the utf8 flag on the copy */
SvUTF8_on(uri);
/* if any of the characters don't fit into an octet ... */
if (!sv_utf8_downgrade(uri, TRUE))
croak("Wide character in octet string");
/* re-fetch pointer and length from the downgraded copy */
src = SvPV_const(uri, len);
}
uri_encode_dsv(src, len, TARG);
/* push TARG into return stack */
PUSHTARG;
void
uri_encode_utf8(SV *uri)
PREINIT:
/* uri_encode_utf8(uri): return the percent-encoded form of uri's UTF-8
   bytes.  Croaks when uri is undefined or when the bytes to encode fail the
   well-formedness check. */
/* declare TARG, the SV that carries the return value */
dXSTARG;
const char *src;
size_t len;
PPCODE:
/* run get-magic once (e.g. FETCH on a tied variable) to populate the sv */
SvGETMAGIC(uri);
/* check for undef */
if (!SvOK(uri))
{
croak("uri_encode_utf8() requires a scalar argument to encode!");
}
/* fetch the string pointer and length; magic was already handled above */
src = SvPV_nomg_const(uri, len);
/* not flagged as UTF-8: encode a mortal copy to UTF-8 first, leaving the
   caller's scalar untouched */
if (!SvUTF8(uri))
{
uri = sv_2mortal(newSVpvn(src, len));
sv_utf8_encode(uri);
/* re-fetch pointer and length from the encoded copy */
src = SvPV_const(uri, len);
}
/* croaks on surrogates, super code points or ill-formed sequences */
assert_wellformed_unicode(src, len);
uri_encode_dsv(src, len, TARG);
/* push TARG into return stack */
PUSHTARG;
void
uri_decode(SV *uri)
ALIAS:
URI::Encode::XS::uri_decode = 0
URI::Encode::XS::uri_decode_utf8 = 1
PREINIT:
/* uri_decode(uri) / uri_decode_utf8(uri): return uri with its percent
   escapes decoded.  One body serves both names; ix (set by ALIAS) is 0 for
   uri_decode and 1 for uri_decode_utf8.  The utf8 variant additionally
   requires the decoded bytes to be well-formed UTF-8 and flags the result
   as UTF-8.  Croaks when uri is undefined, when it is flagged UTF-8 and
   holds a character that does not fit into an octet, or (utf8 variant) when
   the decoded bytes are ill-formed. */
/* declare TARG, the SV that carries the return value */
dXSTARG;
const char *src;
size_t len;
PPCODE:
/* run get-magic once (e.g. FETCH on a tied variable) to populate the sv */
SvGETMAGIC(uri);
/* check for undef */
if (!SvOK(uri))
{
croak("%s() requires a scalar argument to decode!",
ix == 0 ? "uri_decode" : "uri_decode_utf8");
}
/* fetch the string pointer and length; magic was already handled above */
src = SvPV_nomg_const(uri, len);
/* if the scalar is flagged as UTF-8, work on octets instead */
if (SvUTF8(uri))
{
/* make a mortal temp copy so the caller's scalar is not modified */
uri = sv_2mortal(newSVpvn(src, len));
/* turn on the utf8 flag on the copy */
SvUTF8_on(uri);
/* if any of the characters don't fit into an octet ... */
if (!sv_utf8_downgrade(uri, TRUE))
croak("Wide character in octet string");
/* re-fetch pointer and length from the downgraded copy */
src = SvPV_const(uri, len);
}
uri_decode_dsv(src, len, TARG);
/* uri_decode_utf8 only: validate the decoded bytes, then flag as UTF-8 */
if (ix == 1)
{
src = SvPV_const(TARG, len);
assert_wellformed_utf8(src, len);
SvUTF8_on(TARG);
}
/* push TARG into return stack */
PUSHTARG;