Add Lexbor files for URL handling (#18656)

Relates to #14461 and https://wiki.php.net/rfc/url_parsing_api
This commit is contained in:
Máté Kocsis 2025-05-27 13:06:02 +02:00 committed by GitHub
parent d585a5609d
commit 400b7b8c74
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 210811 additions and 1 deletions

View file

@ -7,6 +7,9 @@ ignore:
- "ext/lexbor/lexbor/html"
- "ext/lexbor/lexbor/ns"
- "ext/lexbor/lexbor/ports"
- "ext/lexbor/lexbor/punycode"
- "ext/lexbor/lexbor/tag"
- "ext/lexbor/lexbor/unicode"
- "ext/lexbor/lexbor/url"
- "ext/pcre/pcre2lib"
- "ext/uri/uriparser"

View file

@ -17,6 +17,7 @@ PHP_NEW_EXTENSION([lexbor], m4_normalize([
$LEXBOR_DIR/core/hash.c
$LEXBOR_DIR/core/mem.c
$LEXBOR_DIR/core/mraw.c
$LEXBOR_DIR/core/plog.c
$LEXBOR_DIR/core/print.c
$LEXBOR_DIR/core/serialize.c
$LEXBOR_DIR/core/shs.c
@ -174,7 +175,11 @@ PHP_NEW_EXTENSION([lexbor], m4_normalize([
$LEXBOR_DIR/html/tree/open_elements.c
$LEXBOR_DIR/ns/ns.c
$LEXBOR_DIR/ports/posix/lexbor/core/memory.c
$LEXBOR_DIR/punycode/punycode.c
$LEXBOR_DIR/tag/tag.c
$LEXBOR_DIR/unicode/idna.c
$LEXBOR_DIR/unicode/unicode.c
$LEXBOR_DIR/url/url.c
]),
[no],,
[-DZEND_ENABLE_STATIC_TSRMLS_CACHE=1 $PHP_LEXBOR_CFLAGS])
@ -193,7 +198,10 @@ PHP_ADD_BUILD_DIR([
$ext_builddir/$LEXBOR_DIR/html/tree/insertion_mode
$ext_builddir/$LEXBOR_DIR/ns
$ext_builddir/$LEXBOR_DIR/ports/posix/lexbor/core
$ext_builddir/$LEXBOR_DIR/punycode
$ext_builddir/$LEXBOR_DIR/tag
$ext_builddir/$LEXBOR_DIR/unicode
$ext_builddir/$LEXBOR_DIR/url
])
PHP_ADD_INCLUDE([$ext_srcdir])
PHP_INSTALL_HEADERS([ext/lexbor], m4_normalize([

View file

@ -3,7 +3,7 @@
EXTENSION("lexbor", "php_lexbor.c", false, "/I " + configure_module_dirname + " /DZEND_ENABLE_STATIC_TSRMLS_CACHE=1");
PHP_LEXBOR="yes";
ADD_SOURCES("ext/lexbor/lexbor/ports/windows_nt/lexbor/core", "memory.c", "lexbor");
ADD_SOURCES("ext/lexbor/lexbor/core", "array_obj.c array.c avl.c bst.c diyfp.c conv.c dobject.c dtoa.c hash.c mem.c mraw.c print.c serialize.c shs.c str.c strtod.c", "lexbor");
ADD_SOURCES("ext/lexbor/lexbor/core", "array_obj.c array.c avl.c bst.c diyfp.c conv.c dobject.c dtoa.c hash.c mem.c mraw.c plog.c print.c serialize.c shs.c str.c strtod.c", "lexbor");
ADD_SOURCES("ext/lexbor/lexbor/dom", "interface.c", "lexbor");
ADD_SOURCES("ext/lexbor/lexbor/dom/interfaces", "attr.c cdata_section.c character_data.c comment.c document.c document_fragment.c document_type.c element.c node.c processing_instruction.c shadow_root.c text.c", "lexbor");
ADD_SOURCES("ext/lexbor/lexbor/html/tokenizer", "error.c state_comment.c state_doctype.c state_rawtext.c state_rcdata.c state_script.c state.c", "lexbor");
@ -17,7 +17,10 @@ ADD_SOURCES("ext/lexbor/lexbor/css/selectors", "state.c selectors.c selector.c p
ADD_SOURCES("ext/lexbor/lexbor/css/syntax", "state.c parser.c syntax.c anb.c tokenizer.c token.c","lexbor");
ADD_SOURCES("ext/lexbor/lexbor/css/syntax/tokenizer", "error.c","lexbor");
ADD_SOURCES("ext/lexbor/lexbor/ns", "ns.c","lexbor");
ADD_SOURCES("ext/lexbor/lexbor/punycode", "punycode.c","lexbor");
ADD_SOURCES("ext/lexbor/lexbor/tag", "tag.c","lexbor");
ADD_SOURCES("ext/lexbor/lexbor/unicode", "idna.c unicode.c","lexbor");
ADD_SOURCES("ext/lexbor/lexbor/url", "url.c","lexbor");
ADD_FLAG("CFLAGS_LEXBOR", "/D LEXBOR_BUILDING /utf-8");
AC_DEFINE("HAVE_LEXBOR", 1, "Define to 1 if the PHP extension 'lexbor' is available.");

View file

@ -0,0 +1,30 @@
/*
* Copyright (C) 2023-2024 Alexander Borisov
*
* Author: Alexander Borisov <borisov@lexbor.com>
*/
#ifndef LEXBOR_PUNYCODE_BASE_H
#define LEXBOR_PUNYCODE_BASE_H
#ifdef __cplusplus
extern "C" {
#endif
#include "lexbor/core/base.h"
/* Numeric components of the punycode module version. */
#define LXB_PUNYCODE_VERSION_MAJOR 1
#define LXB_PUNYCODE_VERSION_MINOR 1
#define LXB_PUNYCODE_VERSION_PATCH 0
/*
* The three components above joined with "." separators
* (presumably LEXBOR_STRINGIZE turns each number into a string literal).
*
* NOTE(review): this macro carries the LEXBOR_ prefix while the components
* use LXB_; renaming it would change the public name, so it is left as is.
*/
#define LEXBOR_PUNYCODE_VERSION_STRING LEXBOR_STRINGIZE(LXB_PUNYCODE_VERSION_MAJOR) "." \
LEXBOR_STRINGIZE(LXB_PUNYCODE_VERSION_MINOR) "." \
LEXBOR_STRINGIZE(LXB_PUNYCODE_VERSION_PATCH)
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* LEXBOR_PUNYCODE_BASE_H */

View file

@ -0,0 +1,671 @@
/*
* Copyright (C) 2023-2024 Alexander Borisov
*
* Author: Alexander Borisov <borisov@lexbor.com>
*/
#include "lexbor/punycode/punycode.h"
#include "lexbor/encoding/encoding.h"
/*
* Bootstring parameters used by the encoder and decoder in this file.
* The values are the ones RFC 3492 (section 5) assigns to Punycode.
*/
enum {
LXB_PUNYCODE_BASE = 36,
LXB_PUNYCODE_TMIN = 1,
LXB_PUNYCODE_TMAX = 26,
LXB_PUNYCODE_SKEW = 38,
LXB_PUNYCODE_DAMP = 700,
LXB_PUNYCODE_INITIAL_BIAS = 72,
LXB_PUNYCODE_INITIAL_N = 0x80,
/* U+002D HYPHEN-MINUS: separates the basic code points from the digits. */
LXB_PUNYCODE_DELIMITER = 0x2D
};
static lxb_status_t
lxb_punycode_callback_cp(const lxb_codepoint_t *cps, size_t len, void *ctx);
/*
 * Doubles the capacity of the encoder output storage.
 *
 * When the storage is still the caller's fixed array (`buffer`), a heap block
 * is allocated and the current contents are copied into it; otherwise the
 * heap block is resized. On success `*buf` and `*end` describe the new
 * storage.
 *
 * `p` is not read: the returned write position is always the old capacity,
 * so callers are expected to call this only when the storage is full.
 *
 * @return New write position, or a null result on allocation failure. When
 * resizing an existing heap block fails, that block is released first.
 */
lxb_inline lxb_char_t *
lxb_punycode_encode_realloc(lxb_char_t *p, lxb_char_t **buf,
                            const lxb_char_t **end, const lxb_char_t *buffer)
{
    lxb_char_t *grown;
    const size_t old_size = *end - *buf;
    const size_t new_size = old_size * 2;

    if (*buf != buffer) {
        grown = lexbor_realloc(*buf, new_size);
        if (grown == NULL) {
            return lexbor_free(*buf);
        }
    }
    else {
        grown = lexbor_malloc(new_size);
        if (grown == NULL) {
            return NULL;
        }

        memcpy(grown, *buf, old_size);
    }

    *end = grown + new_size;
    *buf = grown;

    return grown + old_size;
}
/*
 * Doubles the capacity (counted in code points) of the decoder output storage.
 *
 * When the storage is still the caller's fixed array (`buffer`), a heap block
 * is allocated and the current contents are copied into it; otherwise the
 * heap block is resized. On success `*buf` and `*end` describe the new
 * storage.
 *
 * `p` is not read: the returned write position is always the old capacity,
 * so callers are expected to call this only when the storage is full.
 *
 * @return New write position, or a null result on allocation failure. When
 * resizing an existing heap block fails, that block is released first.
 */
lxb_inline lxb_codepoint_t *
lxb_punycode_decode_realloc(lxb_codepoint_t *p, lxb_codepoint_t **buf,
                            const lxb_codepoint_t **end,
                            const lxb_codepoint_t *buffer)
{
    lxb_codepoint_t *grown;
    const size_t old_count = *end - *buf;
    const size_t new_count = old_count * 2;

    if (*buf != buffer) {
        grown = lexbor_realloc(*buf, new_count * sizeof(lxb_codepoint_t));
        if (grown == NULL) {
            return lexbor_free(*buf);
        }
    }
    else {
        grown = lexbor_malloc(new_count * sizeof(lxb_codepoint_t));
        if (grown == NULL) {
            return NULL;
        }

        memcpy(grown, *buf, old_count * sizeof(lxb_codepoint_t));
    }

    *end = grown + new_count;
    *buf = grown;

    return grown + old_count;
}
/*
 * Maps a digit value to its ASCII character:
 * 0..25 become 'a'..'z' and 26..35 become '0'..'9'.
 */
static char
lxb_punycode_encode_digit(size_t d)
{
    if (d < 26) {
        /* 97 is 'a'. */
        return d + 97;
    }

    /* 26 + 22 == 48, which is '0'. */
    return d + 22;
}
/*
 * Maps an ASCII character to its digit value:
 * '0'..'9' give 26..35, 'A'..'Z' and 'a'..'z' give 0..25.
 * Any other code point gives LXB_PUNYCODE_BASE, which callers treat as invalid.
 */
static size_t
lxb_punycode_decode_digit(lxb_codepoint_t cp)
{
    /* '0'..'9' */
    if (cp >= 48 && cp <= 57) {
        return cp - 22;
    }

    /* 'A'..'Z' */
    if (cp >= 65 && cp <= 90) {
        return cp - 65;
    }

    /* 'a'..'z' */
    if (cp >= 97 && cp <= 122) {
        return cp - 97;
    }

    return LXB_PUNYCODE_BASE;
}
/*
 * Bias adaptation step of the Bootstring algorithm.
 *
 * @param delta     Delta that was just encoded or decoded.
 * @param numpoints Number of code points handled so far; must not be 0
 *                  because it is used as a divisor.
 * @param firsttime True for the first adaptation, which damps delta by
 *                  LXB_PUNYCODE_DAMP instead of halving it.
 *
 * @return The new bias.
 */
static size_t
lxb_punycode_adapt(size_t delta, size_t numpoints, bool firsttime)
{
    size_t k = 0;
    const size_t span = LXB_PUNYCODE_BASE - LXB_PUNYCODE_TMIN;
    const size_t limit = (span * LXB_PUNYCODE_TMAX) / 2;

    if (firsttime) {
        delta = delta / LXB_PUNYCODE_DAMP;
    }
    else {
        delta = delta / 2;
    }

    delta += delta / numpoints;

    while (delta > limit) {
        delta /= span;
        k += LXB_PUNYCODE_BASE;
    }

    return k + (span + 1) * delta / (delta + LXB_PUNYCODE_SKEW);
}
/*
 * Second half of the Punycode encoder: emits the delimiter and the
 * variable-length integers for all non-basic code points, then hands the
 * result to the callback.
 *
 * On entry the output storage [buf, p) already holds the basic (< 0x80) code
 * points of the input, in order; their count is used as the number of code
 * points that are already handled.
 *
 * @param[in] cps     First input code point.
 * @param[in] cps_end One past the last input code point.
 * @param[in] p       Current write position in the output storage.
 * @param[in] buf     Start of the output storage.
 * @param[in] end     One past the end of the output storage.
 * @param[in] buffer  The caller's fixed array; when buf differs from it the
 *                    storage is heap memory and is released here.
 * @param[in] cb      Called once with the encoded bytes. Its last argument is
 *                    true when the input had no non-basic code points.
 * @param[in] ctx     Passed to cb unchanged.
 *
 * @return Status returned by cb, LXB_STATUS_ERROR_OVERFLOW, or
 * LXB_STATUS_ERROR_MEMORY_ALLOCATION. cb is not called on error.
 */
static lxb_status_t
lxb_punycode_encode_body(const lxb_codepoint_t *cps, const lxb_codepoint_t *cps_end,
                         lxb_char_t *p, lxb_char_t *buf, const lxb_char_t *end,
                         const lxb_char_t *buffer, lxb_punycode_encode_cb_f cb,
                         void *ctx)
{
    bool unchanged;
    size_t h, b, n, q, k, t, delta, bias;
    lxb_status_t status;
    lxb_codepoint_t cp, m;
    const lxb_codepoint_t *cps_t, *cps_p;

    n = LXB_PUNYCODE_INITIAL_N;
    bias = LXB_PUNYCODE_INITIAL_BIAS;
    delta = 0;

    /* Number of basic code points already written by the caller. */
    b = p - buf;
    cps_p = cps + b;

    if (cps_p >= cps_end) {
        /* Nothing but basic code points: the output equals the input. */
        unchanged = true;
        goto done;
    }

    if (p > buf) {
        /*
         * The basic code points may have filled the storage exactly, so the
         * delimiter needs the same capacity check as every other write.
         */
        if (p >= end) {
            p = lxb_punycode_encode_realloc(p, &buf, &end, buffer);
            if (p == NULL) {
                return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
            }
        }

        *p++ = LXB_PUNYCODE_DELIMITER;
    }

    unchanged = false;

    while (cps_p < cps_end) {
        /* Find the smallest code point >= n that is still unhandled. */
        m = UINT32_MAX;
        cps_t = cps;

        while (cps_t < cps_end) {
            cp = *cps_t++;

            if (cp >= n && cp < m) {
                m = cp;
            }
        }

        h = (cps_p - cps) + 1;

        if (m - n > (UINT32_MAX - delta) / h) {
            status = LXB_STATUS_ERROR_OVERFLOW;
            goto failed;
        }

        delta += (m - n) * h;
        n = m;

        cps_t = cps;

        while (cps_t < cps_end) {
            cp = *cps_t++;

            if (cp < n) {
                if (++delta == 0) {
                    status = LXB_STATUS_ERROR_OVERFLOW;
                    goto failed;
                }
            }

            if (cp == n) {
                /* Emit delta as a generalized variable-length integer. */
                q = delta;
                k = LXB_PUNYCODE_BASE;

                for (;; k += LXB_PUNYCODE_BASE) {
                    t = k <= bias ? LXB_PUNYCODE_TMIN :
                        k >= bias + LXB_PUNYCODE_TMAX
                            ? LXB_PUNYCODE_TMAX : k - bias;

                    if (q < t) {
                        break;
                    }

                    if (p >= end) {
                        p = lxb_punycode_encode_realloc(p, &buf, &end, buffer);
                        if (p == NULL) {
                            return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
                        }
                    }

                    *p++ = lxb_punycode_encode_digit(t + (q - t)
                                                % (LXB_PUNYCODE_BASE - t));
                    q = (q - t) / (LXB_PUNYCODE_BASE - t);
                }

                h = cps_p - cps;

                if (p >= end) {
                    p = lxb_punycode_encode_realloc(p, &buf, &end, buffer);
                    if (p == NULL) {
                        return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
                    }
                }

                *p++ = lxb_punycode_encode_digit(q);

                bias = lxb_punycode_adapt(delta, h + 1, h == b);
                delta = 0;
                cps_p += 1;
            }
        }

        delta += 1;
        n += 1;
    }

done:

    status = cb(buf, p - buf, ctx, unchanged);

failed:

    /* The storage is heap memory once it has outgrown the caller's array. */
    if (buf != buffer) {
        (void) lexbor_free(buf);
    }

    return status;
}
/*
 * Punycode-encodes UTF-8 input.
 *
 * The input is decoded into a code point array (a fixed array for up to 4096
 * code points, heap memory beyond that), the basic (< 0x80) code points are
 * copied to the output, and lxb_punycode_encode_body() produces the rest and
 * invokes the callback.
 *
 * @param[in] data   UTF-8 input.
 * @param[in] length Length of data in bytes. Can be 0.
 * @param[in] cb     Called once with the encoded result.
 * @param[in] ctx    Passed to cb unchanged.
 *
 * @return Status of the encoding, LXB_STATUS_ERROR_UNEXPECTED_DATA for input
 * that cannot be decoded, or LXB_STATUS_ERROR_MEMORY_ALLOCATION.
 */
lxb_status_t
lxb_punycode_encode(const lxb_char_t *data, size_t length,
                    lxb_punycode_encode_cb_f cb, void *ctx)
{
    size_t cp_length;
    uint8_t len;
    lxb_char_t *p, *buf;
    lxb_status_t status;
    lxb_codepoint_t cp, *cps, *cps_p;
    const lxb_char_t *data_p, *data_end, *end;
    const lxb_codepoint_t *cps_end;
    lxb_char_t buffer[4096];
    lxb_codepoint_t input[4096];

    /*
     * Make GCC happy.
     * length variable can be 0.
     */
    input[0] = 0x00;

    p = buffer;
    buf = buffer;
    end = buffer + sizeof(buffer);

    data_p = data;
    data_end = data + length;
    cp_length = 0;

    /* First pass: count the code points to size the code point array. */
    while (data_p < data_end) {
        len = lxb_encoding_decode_utf_8_length(*data_p);

        /*
         * A sequence that claims more bytes than remain is rejected here so
         * that data_p never moves past data_end.
         */
        if (len == 0 || (size_t) (data_end - data_p) < len) {
            return LXB_STATUS_ERROR_UNEXPECTED_DATA;
        }

        data_p += len;
        cp_length += 1;
    }

    if (cp_length <= sizeof(input) / sizeof(lxb_codepoint_t)) {
        cps = input;
    }
    else {
        cps = lexbor_malloc(cp_length * sizeof(lxb_codepoint_t));
        if (cps == NULL) {
            return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
        }
    }

    data_p = data;
    cps_p = cps;
    cps_end = cps + cp_length;

    /* Second pass: decode, and copy basic code points to the output. */
    while (data_p < data_end) {
        cp = lxb_encoding_decode_valid_utf_8_single(&data_p, data_end);
        if (cp == LXB_ENCODING_DECODE_ERROR) {
            /*
             * The output may already have outgrown the fixed array; it is
             * not handed to lxb_punycode_encode_body() on this path, so it
             * has to be released here.
             */
            if (buf != buffer) {
                (void) lexbor_free(buf);
            }

            status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
            goto done;
        }

        *cps_p++ = cp;

        if (cp < 0x80) {
            if (p >= end) {
                /* On failure the helper has already released the storage. */
                p = lxb_punycode_encode_realloc(p, &buf, &end, buffer);
                if (p == NULL) {
                    status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
                    goto done;
                }
            }

            *p++ = cp;
        }
    }

    /* From here on the output storage is owned by the body function. */
    status = lxb_punycode_encode_body(cps, cps_end, p, buf, end, buffer,
                                      cb, ctx);

done:

    if (cps != input) {
        (void) lexbor_free(cps);
    }

    return status;
}
/*
 * Punycode-encodes input that is already a code point array.
 *
 * The basic (< 0x80) code points are copied to the output first; the rest of
 * the work, including the callback and releasing grown output storage, is
 * done by lxb_punycode_encode_body().
 *
 * @param[in] cps    Input code points.
 * @param[in] length Number of code points. Can be 0.
 * @param[in] cb     Called once with the encoded result.
 * @param[in] ctx    Passed to cb unchanged.
 *
 * @return Status of the encoding, or LXB_STATUS_ERROR_MEMORY_ALLOCATION.
 */
lxb_status_t
lxb_punycode_encode_cp(const lxb_codepoint_t *cps, size_t length,
                       lxb_punycode_encode_cb_f cb, void *ctx)
{
    size_t idx;
    lxb_char_t buffer[4096];
    lxb_char_t *out = buffer;
    lxb_char_t *out_begin = buffer;
    const lxb_char_t *out_end = buffer + sizeof(buffer);

    for (idx = 0; idx < length; idx++) {
        if (cps[idx] >= 0x80) {
            continue;
        }

        if (out >= out_end) {
            out = lxb_punycode_encode_realloc(out, &out_begin, &out_end,
                                              buffer);
            if (out == NULL) {
                return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
            }
        }

        *out++ = cps[idx];
    }

    return lxb_punycode_encode_body(cps, cps + length, out, out_begin,
                                    out_end, buffer, cb, ctx);
}
/*
 * Decodes Punycode bytes and delivers the result as UTF-8 bytes.
 *
 * The decoding is done by lxb_punycode_decode_cb_cp(); the code points it
 * produces are converted by lxb_punycode_callback_cp(), which then calls
 * the caller's cb with ctx.
 *
 * @return Status of the decoding or of cb.
 */
lxb_status_t
lxb_punycode_decode(const lxb_char_t *data, size_t length,
                    lexbor_serialize_cb_f cb, void *ctx)
{
    lexbor_serialize_ctx_t bridge = {0};

    bridge.cb = cb;
    bridge.ctx = ctx;

    return lxb_punycode_decode_cb_cp(data, length, lxb_punycode_callback_cp,
                                     &bridge);
}
/*
 * Converts decoded code points to UTF-8 and forwards them to the callback
 * stored in the lexbor_serialize_ctx_t passed as ctx.
 *
 * The UTF-8 size is computed first; a fixed 4096-byte array is used when it
 * is large enough, heap memory otherwise.
 *
 * @param[in] cps Code points to convert.
 * @param[in] len Number of code points. Can be 0.
 * @param[in] ctx Pointer to a lexbor_serialize_ctx_t.
 *
 * @return Status of the wrapped callback, LXB_STATUS_ERROR_UNEXPECTED_DATA
 * when a code point has no UTF-8 length, or
 * LXB_STATUS_ERROR_MEMORY_ALLOCATION.
 */
static lxb_status_t
lxb_punycode_callback_cp(const lxb_codepoint_t *cps, size_t len, void *ctx)
{
    uint8_t i;
    size_t length;
    lxb_status_t status;
    const lxb_codepoint_t *cps_p, *cps_end;
    lexbor_serialize_ctx_t *nctx = ctx;
    lxb_char_t *p, *buf, *end;
    lxb_char_t buffer[4096];

    /*
     * Make GCC happy.
     * len variable can be 0.
     */
    buffer[0] = 0x00;

    cps_p = cps;
    cps_end = cps_p + len;
    length = 0;

    while (cps_p < cps_end) {
        i = lxb_encoding_encode_utf_8_length(*cps_p++);
        if (i == 0) {
            return LXB_STATUS_ERROR_UNEXPECTED_DATA;
        }

        length += i;
    }

    buf = buffer;
    end = buffer + sizeof(buffer);

    /*
     * Compare sizes rather than pointers: "buffer + length" is not a valid
     * pointer once length exceeds the array size.
     */
    if (length > sizeof(buffer)) {
        buf = lexbor_malloc(length);
        if (buf == NULL) {
            return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
        }

        end = buf + length;
    }

    p = buf;
    cps_p = cps;

    while (cps_p < cps_end) {
        (void) lxb_encoding_encode_utf_8_single(NULL, &p, end, *cps_p++);
    }

    status = nctx->cb(buf, p - buf, nctx->ctx);

    if (buf != buffer) {
        (void) lexbor_free(buf);
    }

    return status;
}
/*
* Decodes Punycode given as code points into code points.
*
* The basic code points in front of the last delimiter are copied verbatim;
* the digits after it are then decoded into (position, code point) pairs and
* each code point is inserted at its position. The callback is called once
* with the final sequence and is not called when an error is returned.
*
* @param[in] data   Input code points.
* @param[in] length Number of input code points. Can be 0.
* @param[in] cb     Receives the decoded code points.
* @param[in] ctx    Passed to cb unchanged.
*
* @return Status returned by cb, LXB_STATUS_ERROR_UNEXPECTED_DATA for
* malformed input, LXB_STATUS_ERROR_OVERFLOW, or
* LXB_STATUS_ERROR_MEMORY_ALLOCATION.
*/
lxb_status_t
lxb_punycode_decode_cp(const lxb_codepoint_t *data, size_t length,
lexbor_serialize_cb_cp_f cb, void *ctx)
{
size_t buf_len, digit, oldi, bias, w, k, t, i, h, in;
const lxb_codepoint_t *delimiter, *data_p, *data_end;
lxb_status_t status;
lxb_codepoint_t cp, n;
lxb_codepoint_t *p, *buf;
const lxb_codepoint_t *end;
lxb_codepoint_t buffer[4096];
p = buffer;
buf = buffer;
buf_len = sizeof(buffer) / sizeof(lxb_codepoint_t);
end = buffer + buf_len;
data_p = data;
data_end = data + length;
/*
* Search backwards for the last delimiter. If none is found the loop stops
* with delimiter == data, the same state as a delimiter at index 0.
*/
delimiter = data_end;
while (delimiter != data) {
delimiter -= 1;
if (*delimiter == LXB_PUNYCODE_DELIMITER) {
break;
}
}
/* Copy the basic code points; anything >= 0x80 here is malformed input. */
while (data_p < delimiter) {
cp = *data_p++;
if (cp >= 0x80) {
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
goto done;
}
if (p >= end) {
/* On failure the helper has already released heap storage. */
p = lxb_punycode_decode_realloc(p, &buf, &end, buffer);
if (p == NULL) {
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
}
}
*p++ = cp;
}
i = 0;
n = LXB_PUNYCODE_INITIAL_N;
bias = LXB_PUNYCODE_INITIAL_BIAS;
/* Digits start after the delimiter, or at the beginning if there is none. */
data_p = (delimiter != data) ? delimiter + 1: data;
in = data_p - data;
/* Each outer iteration decodes one code point and inserts it; p counts them. */
for (; in < length; p++) {
/* Decode one generalized variable-length integer into i. */
for (oldi = i, w = 1, k = LXB_PUNYCODE_BASE; ; k += LXB_PUNYCODE_BASE) {
if (in >= length) {
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
goto done;
}
cp = data[in++];
digit = lxb_punycode_decode_digit(cp);
if (digit >= LXB_PUNYCODE_BASE) {
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
goto done;
}
if (digit > (UINT32_MAX - i) / w) {
status = LXB_STATUS_ERROR_OVERFLOW;
goto done;
}
i += digit * w;
t = k <= bias ? LXB_PUNYCODE_TMIN
: k >= bias + LXB_PUNYCODE_TMAX ? LXB_PUNYCODE_TMAX : k - bias;
if (digit < t) {
break;
}
if (w > UINT32_MAX / (LXB_PUNYCODE_BASE - t)) {
status = LXB_STATUS_ERROR_OVERFLOW;
goto done;
}
w *= (LXB_PUNYCODE_BASE - t);
}
/* h is the output length including the code point about to be inserted. */
h = (p - buf) + 1;
bias = lxb_punycode_adapt(i - oldi, h, oldi == 0);
if (i / h > UINT32_MAX - n) {
status = LXB_STATUS_ERROR_OVERFLOW;
goto done;
}
/* i encodes both the code point increment (i / h) and the position (i % h). */
n += i / h;
i %= h;
if (p >= end) {
p = lxb_punycode_decode_realloc(p, &buf, &end, buffer);
if (p == NULL) {
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
}
}
/* Open a gap at position i and store the new code point there. */
memmove(buf + i + 1, buf + i, ((h - 1) - i) * sizeof(lxb_codepoint_t));
buf[i++] = n;
}
status = cb(buf, p - buf, ctx);
done:
/* The storage is heap memory once it has outgrown the fixed array. */
if (buffer != buf) {
(void) lexbor_free(buf);
}
return status;
}
/*
* Decodes Punycode given as bytes into code points.
*
* Same procedure as lxb_punycode_decode_cp(), with a byte array as input:
* the basic characters in front of the last delimiter are copied verbatim;
* the digits after it are decoded into (position, code point) pairs and each
* code point is inserted at its position. The callback is called once with
* the final sequence and is not called when an error is returned.
*
* @param[in] data   Input bytes.
* @param[in] length Number of input bytes. Can be 0.
* @param[in] cb     Receives the decoded code points.
* @param[in] ctx    Passed to cb unchanged.
*
* @return Status returned by cb, LXB_STATUS_ERROR_UNEXPECTED_DATA for
* malformed input, LXB_STATUS_ERROR_OVERFLOW, or
* LXB_STATUS_ERROR_MEMORY_ALLOCATION.
*/
lxb_status_t
lxb_punycode_decode_cb_cp(const lxb_char_t *data, size_t length,
lexbor_serialize_cb_cp_f cb, void *ctx)
{
size_t buf_len, digit, oldi, bias, w, k, t, i, h, in;
const lxb_char_t *delimiter, *data_p, *data_end;
lxb_status_t status;
lxb_codepoint_t cp, n;
lxb_codepoint_t *p, *buf;
const lxb_codepoint_t *end;
lxb_codepoint_t buffer[4096];
p = buffer;
buf = buffer;
buf_len = sizeof(buffer) / sizeof(lxb_codepoint_t);
end = buffer + buf_len;
data_p = data;
data_end = data + length;
/*
* Search backwards for the last delimiter. If none is found the loop stops
* with delimiter == data, the same state as a delimiter at index 0.
*/
delimiter = data_end;
while (delimiter != data) {
delimiter -= 1;
if (*delimiter == LXB_PUNYCODE_DELIMITER) {
break;
}
}
/* Copy the basic characters; a byte >= 0x80 here is malformed input. */
while (data_p < delimiter) {
cp = *data_p++;
if (cp >= 0x80) {
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
goto done;
}
if (p >= end) {
/* On failure the helper has already released heap storage. */
p = lxb_punycode_decode_realloc(p, &buf, &end, buffer);
if (p == NULL) {
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
}
}
*p++ = cp;
}
i = 0;
n = LXB_PUNYCODE_INITIAL_N;
bias = LXB_PUNYCODE_INITIAL_BIAS;
/* Digits start after the delimiter, or at the beginning if there is none. */
data_p = (delimiter != data) ? delimiter + 1: data;
in = data_p - data;
/* Each outer iteration decodes one code point and inserts it; p counts them. */
for (; in < length; p++) {
/* Decode one generalized variable-length integer into i. */
for (oldi = i, w = 1, k = LXB_PUNYCODE_BASE; ; k += LXB_PUNYCODE_BASE) {
if (in >= length) {
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
goto done;
}
cp = data[in++];
digit = lxb_punycode_decode_digit(cp);
if (digit >= LXB_PUNYCODE_BASE) {
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
goto done;
}
if (digit > (UINT32_MAX - i) / w) {
status = LXB_STATUS_ERROR_OVERFLOW;
goto done;
}
i += digit * w;
t = k <= bias ? LXB_PUNYCODE_TMIN
: k >= bias + LXB_PUNYCODE_TMAX ? LXB_PUNYCODE_TMAX : k - bias;
if (digit < t) {
break;
}
if (w > UINT32_MAX / (LXB_PUNYCODE_BASE - t)) {
status = LXB_STATUS_ERROR_OVERFLOW;
goto done;
}
w *= (LXB_PUNYCODE_BASE - t);
}
/* h is the output length including the code point about to be inserted. */
h = (p - buf) + 1;
bias = lxb_punycode_adapt(i - oldi, h, oldi == 0);
if (i / h > UINT32_MAX - n) {
status = LXB_STATUS_ERROR_OVERFLOW;
goto done;
}
/* i encodes both the code point increment (i / h) and the position (i % h). */
n += i / h;
i %= h;
if (p >= end) {
p = lxb_punycode_decode_realloc(p, &buf, &end, buffer);
if (p == NULL) {
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
}
}
/* Open a gap at position i and store the new code point there. */
memmove(buf + i + 1, buf + i, ((h - 1) - i) * sizeof(lxb_codepoint_t));
buf[i++] = n;
}
status = cb(buf, p - buf, ctx);
done:
/* The storage is heap memory once it has outgrown the fixed array. */
if (buffer != buf) {
(void) lexbor_free(buf);
}
return status;
}

View file

@ -0,0 +1,109 @@
/*
* Copyright (C) 2023 Alexander Borisov
*
* Author: Alexander Borisov <borisov@lexbor.com>
*/
#ifndef LEXBOR_PUNYCODE_H
#define LEXBOR_PUNYCODE_H
#ifdef __cplusplus
extern "C" {
#endif
#include "lexbor/punycode/base.h"
/*
* Callback that receives the result of an encoding call.
*
* @param[in] data      Encoded bytes.
* @param[in] len       Number of bytes in data.
* @param[in] ctx       The context given to the encoding function.
* @param[in] unchanged True when the input consisted only of code points
*                      below 0x80, so no Punycode digits were produced.
*
* @return Status; it becomes the return value of the encoding function.
*/
typedef lxb_status_t
(*lxb_punycode_encode_cb_f)(const lxb_char_t *data, size_t len, void *ctx,
bool unchanged);
/*
* Punycode: A Bootstring encoding of Unicode
* for Internationalized Domain Names in Applications (IDNA).
*
* https://www.rfc-editor.org/rfc/inline-errata/rfc3492.html
*/
/*
* Encoding from characters to characters.
*
* @param[in] data   Input characters for encoding. Not NULL.
* @param[in] length Length of characters. Can be 0.
* @param[in] cb     Callback for results. Called only once, when encoding is
*                   complete.
* @param[in] ctx    Context for callback.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_punycode_encode(const lxb_char_t *data, size_t length,
lxb_punycode_encode_cb_f cb, void *ctx);
/*
* Encoding from code points to characters.
*
* Same as lxb_punycode_encode() only the input is code points.
*
* @param[in] cps    Input code points for encoding. Not NULL.
* @param[in] length Length of code points. Can be 0.
* @param[in] cb     Callback for results. Called only once, when encoding is
*                   complete.
* @param[in] ctx    Context for callback.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_punycode_encode_cp(const lxb_codepoint_t *cps, size_t length,
lxb_punycode_encode_cb_f cb, void *ctx);
/*
* Decoding from characters to characters.
*
* @param[in] data   Input characters for decoding. Not NULL.
* @param[in] length Length of characters. Can be 0.
* @param[in] cb     Callback for results. Called only once, when decoding is
*                   complete.
* @param[in] ctx    Context for callback.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_punycode_decode(const lxb_char_t *data, size_t length,
lexbor_serialize_cb_f cb, void *ctx);
/*
* Decoding from code points to code points.
*
* Same as lxb_punycode_decode() only the input/output is code points.
*
* @param[in] data   Input code points for decoding. Not NULL.
* @param[in] length Length of code points. Can be 0.
* @param[in] cb     Callback for results. Called only once, when decoding is
*                   complete.
* @param[in] ctx    Context for callback.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_punycode_decode_cp(const lxb_codepoint_t *data, size_t length,
lexbor_serialize_cb_cp_f cb, void *ctx);
/*
* Decoding from characters to code points.
*
* Same as lxb_punycode_decode() only the output is code points.
*
* @param[in] data   Input characters for decoding. Not NULL.
* @param[in] length Length of characters. Can be 0.
* @param[in] cb     Callback for results. Called only once, when decoding is
*                   complete.
* @param[in] ctx    Context for callback.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_punycode_decode_cb_cp(const lxb_char_t *data, size_t length,
lexbor_serialize_cb_cp_f cb, void *ctx);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* LEXBOR_PUNYCODE_H */

View file

@ -0,0 +1,157 @@
/*
* Copyright (C) 2023-2024 Alexander Borisov
*
* Author: Alexander Borisov <borisov@lexbor.com>
*/
#ifndef LEXBOR_UNICODE_BASE_H
#define LEXBOR_UNICODE_BASE_H
#ifdef __cplusplus
extern "C" {
#endif
#include "lexbor/core/base.h"
#include "lexbor/core/str.h"
/* Numeric components of the unicode module version. */
#define LXB_UNICODE_VERSION_MAJOR 0
#define LXB_UNICODE_VERSION_MINOR 3
#define LXB_UNICODE_VERSION_PATCH 0
/*
* The three components above joined with "." separators
* (presumably LEXBOR_STRINGIZE turns each number into a string literal).
*/
#define LXB_UNICODE_VERSION_STRING LEXBOR_STRINGIZE(LXB_UNICODE_VERSION_MAJOR) "." \
LEXBOR_STRINGIZE(LXB_UNICODE_VERSION_MINOR) "." \
LEXBOR_STRINGIZE(LXB_UNICODE_VERSION_PATCH)
/*
* Decomposition kinds.
* NOTE(review): the names look like the Unicode Decomposition_Type tags
* (<circle>, <compat>, <final>, ...) — confirm against the data generator.
*/
enum {
LXB_UNICODE_DECOMPOSITION_TYPE__UNDEF = 0x00,
LXB_UNICODE_DECOMPOSITION_TYPE_CIRCLE,
LXB_UNICODE_DECOMPOSITION_TYPE_COMPAT,
LXB_UNICODE_DECOMPOSITION_TYPE_FINAL,
LXB_UNICODE_DECOMPOSITION_TYPE_FONT,
LXB_UNICODE_DECOMPOSITION_TYPE_FRACTION,
LXB_UNICODE_DECOMPOSITION_TYPE_INITIAL,
LXB_UNICODE_DECOMPOSITION_TYPE_ISOLATED,
LXB_UNICODE_DECOMPOSITION_TYPE_MEDIAL,
LXB_UNICODE_DECOMPOSITION_TYPE_NARROW,
LXB_UNICODE_DECOMPOSITION_TYPE_NOBREAK,
LXB_UNICODE_DECOMPOSITION_TYPE_SMALL,
LXB_UNICODE_DECOMPOSITION_TYPE_SQUARE,
LXB_UNICODE_DECOMPOSITION_TYPE_SUB,
LXB_UNICODE_DECOMPOSITION_TYPE_SUPER,
LXB_UNICODE_DECOMPOSITION_TYPE_VERTICAL,
LXB_UNICODE_DECOMPOSITION_TYPE_WIDE,
/* Not a kind; one past the last real entry. */
LXB_UNICODE_DECOMPOSITION_TYPE__LAST_ENTRY
};
/*
* Bit 7 is handled separately from the rest of the value:
* - LXB_UNICODE_CANONICAL_SEPARATELY is the mask for bit 7;
* - LXB_UNICODE_IS_CANONICAL_SEPARATELY() shifts right by 7, which for an
*   8-bit value yields that bit;
* - LXB_UNICODE_DECOMPOSITION_TYPE() clears bit 7 and keeps the other bits
*   (presumably one of the enum values above).
*/
#define LXB_UNICODE_CANONICAL_SEPARATELY (1 << 7)
#define LXB_UNICODE_IS_CANONICAL_SEPARATELY(a) ((a) >> 7)
#define LXB_UNICODE_DECOMPOSITION_TYPE(a) ((a) & ~(1 << 7))
typedef uint8_t lxb_unicode_decomposition_type_t;
/*
* Quick-check flags. Every value is a distinct bit, so several can be
* combined in one lxb_unicode_quick_type_t.
* NOTE(review): presumably these mirror the Unicode NF*_Quick_Check
* properties ("Maybe" / "No" per normalization form) — confirm.
*/
enum {
LXB_UNICODE_QUICK__UNDEF = 0x00,
LXB_UNICODE_QUICK_NFC_MAYBE = 1 << 0,
LXB_UNICODE_QUICK_NFC_NO = 1 << 1,
LXB_UNICODE_QUICK_NFD_NO = 1 << 2,
LXB_UNICODE_QUICK_NFKC_MAYBE = 1 << 3,
LXB_UNICODE_QUICK_NFKC_NO = 1 << 4,
LXB_UNICODE_QUICK_NFKD_NO = 1 << 5
};
typedef uint8_t lxb_unicode_quick_type_t;
/*
* IDNA status of a code point.
* NOTE(review): presumably the status values of the UTS #46 IDNA mapping
* table — confirm against the data generator.
*/
enum {
LXB_UNICODE_IDNA__UNDEF = 0x00,
LXB_UNICODE_IDNA_DEVIATION,
LXB_UNICODE_IDNA_DISALLOWED,
LXB_UNICODE_IDNA_IGNORED,
LXB_UNICODE_IDNA_MAPPED,
LXB_UNICODE_IDNA_VALID
};
typedef uint8_t lxb_unicode_idna_type_t;
/* Forward declaration; the struct is defined at the end of this header. */
typedef struct lxb_unicode_normalizer lxb_unicode_normalizer_t;
/*
* One code point together with a "ccc" value.
* NOTE(review): ccc presumably is the Canonical Combining Class, as in
* lxb_unicode_normalization_entry_t — confirm.
*/
typedef struct {
lxb_codepoint_t cp;
uint8_t ccc;
}
lxb_unicode_buffer_t;
/*
* Handler that processes a chunk of input bytes for a normalizer and reports
* results through a serialize callback.
* NOTE(review): is_last presumably marks the final chunk of a stream —
* confirm against the implementation.
*/
typedef lxb_status_t
(*lxb_unicode_nf_handler_f)(lxb_unicode_normalizer_t *uc, const lxb_char_t *data,
size_t length, lexbor_serialize_cb_f cb, void *ctx,
bool is_last);
/*
* Decomposition handler; stored in lxb_unicode_normalizer.decomposition.
* Takes one code point plus the buffer start and end by reference and returns
* a buffer position.
* NOTE(review): buf/end are presumably passed by reference so the handler can
* grow the buffer — confirm.
*/
typedef lxb_unicode_buffer_t *
(*lxb_unicode_de_handler_f)(lxb_unicode_normalizer_t *uc, lxb_codepoint_t cp,
lxb_unicode_buffer_t **buf,
const lxb_unicode_buffer_t **end);
/* Composition handler; stored in lxb_unicode_normalizer.composition. */
typedef void
(*lxb_unicode_co_handler_f)(lxb_unicode_buffer_t *starter,
lxb_unicode_buffer_t *op, lxb_unicode_buffer_t *p);
/*
* Per-code-point record holding two 16-bit values.
* NOTE(review): per the field comments these presumably index tables of the
* named types — confirm against the generated data.
*/
typedef struct {
uint16_t normalization; /* lxb_unicode_normalization_t */
uint16_t idna; /* lxb_unicode_idna_t */
}
lxb_unicode_entry_t;
/* Normalization data of a code point. */
typedef struct {
lxb_unicode_decomposition_type_t type;
lxb_unicode_quick_type_t quick; /* Quick Check. */
uint8_t ccc; /* Canonical Combining Class. */
uint8_t length;
uint16_t decomposition; /* lxb_codepoint_t */
uint16_t composition; /* lxb_unicode_composition_entry_t */
}
lxb_unicode_normalization_entry_t;
/*
* IDNA data of a code point: its status plus a (length, index) pair.
* NOTE(review): length/index presumably locate the mapping code points in a
* shared table — confirm.
*/
typedef struct {
lxb_unicode_idna_type_t type;
uint8_t length;
uint16_t index;
}
lxb_unicode_idna_entry_t;
/* Composition data; see the field comments. */
typedef struct {
uint8_t length; /* Length in lxb_unicode_composition_cps_t */
uint16_t index; /* lxb_unicode_composition_cps_t */
lxb_codepoint_t cp; /* Begin code point in lxb_unicode_composition_cps_t */
}
lxb_unicode_composition_entry_t;
/* A code point with an "exclusion" flag. */
typedef struct {
lxb_codepoint_t cp;
bool exclusion;
}
lxb_unicode_composition_cp_t;
/*
* State of a normalizer object.
*
* NOTE(review): this header does not show how the fields are used; the notes
* below are inferred from names and types only — confirm against the
* implementation before relying on them.
*/
struct lxb_unicode_normalizer {
/* Handlers; presumably chosen according to the normalization form. */
lxb_unicode_de_handler_f decomposition;
lxb_unicode_co_handler_f composition;
lxb_unicode_buffer_t *starter;
/* Working buffer: start, end and a position inside it (presumably). */
lxb_unicode_buffer_t *buf;
const lxb_unicode_buffer_t *end;
lxb_unicode_buffer_t *p;
lxb_unicode_buffer_t *ican;
lxb_char_t tmp[4];
/*
* NOTE(review): "lenght" is a misspelling of "length"; the name is part of
* the struct layout visible to other files, so it is left unchanged.
*/
uint8_t tmp_lenght;
uint8_t quick_ccc;
lxb_unicode_quick_type_t quick_type;
size_t flush_cp;
};
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* LEXBOR_UNICODE_BASE_H */

View file

@ -0,0 +1,738 @@
/*
* Copyright (C) 2023 Alexander Borisov
*
* Author: Alexander Borisov <borisov@lexbor.com>
*/
#include "lexbor/unicode/idna.h"
#include "lexbor/unicode/unicode.h"
#include "lexbor/punycode/punycode.h"
#include "lexbor/encoding/encoding.h"
/*
* State carried through the normalization and punycode callbacks of one
* lxb_unicode_idna_processing*() call.
*/
typedef struct {
lxb_unicode_idna_cb_f cb; /* Caller's per-label callback. */
void *context; /* Caller's context, passed to cb. */
lxb_unicode_idna_flag_t flags; /* Flags given to the processing call. */
}
lxb_unicode_idna_ctx_t;
/*
* Output accumulator of the to-ASCII conversion: encoded labels are appended
* one after another, each followed by '.'.
*/
typedef struct {
lxb_char_t buffer[4096]; /* Inline storage used until it is too small. */
lxb_char_t *p; /* Current write position. */
lxb_char_t *buf; /* Start of storage: buffer or heap memory. */
const lxb_char_t *end; /* One past the end of the storage. */
lxb_unicode_idna_flag_t flags; /* Flags given to the to-ASCII call. */
}
lxb_unicode_idna_ascii_ctx_t;
static lxb_status_t
lxb_unicode_idna_processing_body(lxb_unicode_idna_t *idna, const void *data,
size_t len, lxb_unicode_idna_cb_f cb, void *ctx,
lxb_unicode_idna_flag_t flags, bool is_cp);
static lxb_status_t
lxb_unicode_idna_norm_c_cb(const lxb_codepoint_t *cps, size_t len, void *ctx);
static lxb_status_t
lxb_unicode_idna_norm_c_send(const lxb_codepoint_t *cps,
const lxb_codepoint_t *p,
lxb_unicode_idna_ctx_t *context);
static lxb_status_t
lxb_unicode_idna_punycode_cb(const lxb_codepoint_t *cps, size_t len, void *ctx);
static lxb_status_t
lxb_unicode_idna_to_ascii_cb(const lxb_codepoint_t *part, size_t len,
void *ctx, lxb_status_t status);
static lxb_status_t
lxb_unicode_idna_to_ascii_body(lxb_unicode_idna_t *idna, const void *data,
size_t length, lexbor_serialize_cb_f cb, void *ctx,
lxb_unicode_idna_flag_t flags, bool is_cp);
static lxb_status_t
lxb_unicode_idna_ascii_puny_cb(const lxb_char_t *data, size_t length, void *ctx,
bool unchanged);
static lxb_status_t
lxb_unicode_idna_to_unicode_cb(const lxb_codepoint_t *part, size_t len,
void *ctx, lxb_status_t status);
static lxb_status_t
lxb_unicode_idna_to_unicode_body(lxb_unicode_idna_t *idna, const void *data,
size_t length, lexbor_serialize_cb_f cb,
void *ctx, lxb_unicode_idna_flag_t flags,
bool is_cp);
static bool
lxb_unicode_idna_validity_criteria_h(const void *data, size_t length,
lxb_unicode_idna_flag_t flags, bool is_cp);
/*
 * Allocates an IDNA object. The memory is not initialized here; call
 * lxb_unicode_idna_init() on the result before use.
 *
 * @return The new object, or NULL if the allocation failed.
 */
lxb_unicode_idna_t *
lxb_unicode_idna_create(void)
{
    lxb_unicode_idna_t *idna;

    idna = lexbor_malloc(sizeof(lxb_unicode_idna_t));

    return idna;
}
/*
 * Initializes an IDNA object by setting up its normalizer for NFC.
 *
 * @return LXB_STATUS_ERROR_OBJECT_IS_NULL when idna is NULL, otherwise the
 * status of the normalizer initialization.
 */
lxb_status_t
lxb_unicode_idna_init(lxb_unicode_idna_t *idna)
{
    if (idna != NULL) {
        return lxb_unicode_normalizer_init(&idna->normalizer,
                                           LXB_UNICODE_NFC);
    }

    return LXB_STATUS_ERROR_OBJECT_IS_NULL;
}
/*
 * Cleans the normalizer embedded in the IDNA object.
 * idna is dereferenced without a NULL check.
 */
void
lxb_unicode_idna_clean(lxb_unicode_idna_t *idna)
{
    lxb_unicode_normalizer_clean(&(idna->normalizer));
}
/*
 * Destroys the embedded normalizer and optionally the IDNA object itself.
 *
 * @param[in] idna         Object to destroy. NULL is accepted.
 * @param[in] self_destroy When true the object's memory is freed as well.
 *
 * @return NULL when idna is NULL; the result of lexbor_free() when
 * self_destroy is true; otherwise idna.
 */
lxb_unicode_idna_t *
lxb_unicode_idna_destroy(lxb_unicode_idna_t *idna, bool self_destroy)
{
    if (idna != NULL) {
        (void) lxb_unicode_normalizer_destroy(&idna->normalizer, false);

        if (!self_destroy) {
            return idna;
        }

        return lexbor_free(idna);
    }

    return NULL;
}
/*
 * Grows a code point buffer to four times its capacity plus len.
 *
 * @param[in]     buf     Current start of the storage.
 * @param[in]     buffer  The caller's fixed array. While buf equals it the
 *                        storage is not heap memory, so growing means
 *                        allocating a heap block and copying into it.
 * @param[in,out] buf_p   Current write position; rebased onto the new storage.
 * @param[in,out] buf_end One past the end; set to the end of the new storage.
 * @param[in]     len     Extra capacity required beyond the growth factor.
 *
 * @return Start of the new storage, or a null result on allocation failure.
 * When resizing an existing heap block fails, that block is released first.
 */
lxb_codepoint_t *
lxb_unicode_idna_realloc(lxb_codepoint_t *buf, const lxb_codepoint_t *buffer,
                         lxb_codepoint_t **buf_p, lxb_codepoint_t **buf_end,
                         size_t len)
{
    size_t nlen, used;
    lxb_codepoint_t *tmp;

    nlen = ((*buf_end - buf) * 4) + len;
    used = *buf_p - buf;

    if (buf == buffer) {
        tmp = lexbor_malloc(nlen * sizeof(lxb_codepoint_t));
        if (tmp == NULL) {
            return NULL;
        }

        /*
         * Unlike realloc, a fresh allocation does not carry the old contents
         * over; the code points written so far must be copied explicitly.
         */
        memcpy(tmp, buf, used * sizeof(lxb_codepoint_t));
    }
    else {
        tmp = lexbor_realloc(buf, nlen * sizeof(lxb_codepoint_t));
        if (tmp == NULL) {
            return lexbor_free(buf);
        }
    }

    *buf_p = tmp + used;
    *buf_end = tmp + nlen;

    return tmp;
}
/*
 * Runs IDNA processing on byte input; cb is invoked per label.
 * Thin wrapper around lxb_unicode_idna_processing_body() with is_cp = false.
 */
lxb_status_t
lxb_unicode_idna_processing(lxb_unicode_idna_t *idna, const lxb_char_t *data,
                            size_t length, lxb_unicode_idna_cb_f cb, void *ctx,
                            lxb_unicode_idna_flag_t flags)
{
    const bool is_cp = false;

    return lxb_unicode_idna_processing_body(idna, data, length, cb, ctx,
                                            flags, is_cp);
}
/*
 * Runs IDNA processing on code point input; cb is invoked per label.
 * Thin wrapper around lxb_unicode_idna_processing_body() with is_cp = true.
 */
lxb_status_t
lxb_unicode_idna_processing_cp(lxb_unicode_idna_t *idna,
                               const lxb_codepoint_t *cps, size_t length,
                               lxb_unicode_idna_cb_f cb, void *ctx,
                               lxb_unicode_idna_flag_t flags)
{
    const bool is_cp = true;

    return lxb_unicode_idna_processing_body(idna, cps, length, cb, ctx,
                                            flags, is_cp);
}
/*
 * Shared implementation of lxb_unicode_idna_processing() and
 * lxb_unicode_idna_processing_cp().
 *
 * Step 1 maps every input code point by its IDNA type into a working buffer:
 * ignored ones are dropped, mapped ones are replaced by their mapping, all
 * others are copied. Deviation code points are treated as mapped when
 * LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING is set.
 * Step 2 normalizes the buffer if the quick check asks for it and hands the
 * result to lxb_unicode_idna_norm_c_cb(), which splits it into labels.
 *
 * @param[in] idna  IDNA object; its normalizer is used.
 * @param[in] data  Bytes (is_cp == false) or lxb_codepoint_t array.
 * @param[in] len   Number of bytes or of code points, depending on is_cp.
 * @param[in] cb    Per-label callback.
 * @param[in] ctx   Passed to cb.
 * @param[in] flags Processing flags.
 * @param[in] is_cp Selects how data is interpreted.
 *
 * @return Status of the processing, LXB_STATUS_ERROR_UNEXPECTED_DATA for
 * undecodable byte input, or LXB_STATUS_ERROR_MEMORY_ALLOCATION.
 */
static lxb_status_t
lxb_unicode_idna_processing_body(lxb_unicode_idna_t *idna, const void *data,
                                 size_t len, lxb_unicode_idna_cb_f cb, void *ctx,
                                 lxb_unicode_idna_flag_t flags, bool is_cp)
{
    bool renormalize;
    size_t idx, map_len, byte_len;
    lxb_status_t status;
    lxb_codepoint_t cp;
    lxb_codepoint_t *out, *out_p, *out_end;
    const lxb_char_t *cur, *stop;
    lxb_unicode_idna_type_t type;
    const lxb_unicode_idna_entry_t *entry;
    const lxb_codepoint_t *mapped;
    lxb_unicode_idna_ctx_t context;
    lxb_codepoint_t buffer[4096];

    out = buffer;
    out_p = buffer;
    out_end = buffer + (sizeof(buffer) / sizeof(lxb_codepoint_t));

    /* Walk the input as bytes in both modes. */
    byte_len = is_cp ? len * sizeof(lxb_codepoint_t) : len;
    cur = data;
    stop = (const lxb_char_t *) data + byte_len;

    while (cur < stop) {
        if (!is_cp) {
            cp = lxb_encoding_decode_valid_utf_8_single(&cur, stop);
            if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) {
                status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
                goto done;
            }
        }
        else {
            cp = *((const lxb_codepoint_t *) cur);
            cur += sizeof(lxb_codepoint_t);
        }

        type = lxb_unicode_idna_type(cp);

        if (type == LXB_UNICODE_IDNA_DEVIATION
            && (flags & LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING))
        {
            type = LXB_UNICODE_IDNA_MAPPED;
        }

        if (type == LXB_UNICODE_IDNA_IGNORED) {
            continue;
        }

        if (type == LXB_UNICODE_IDNA_MAPPED) {
            entry = lxb_unicode_idna_entry_by_cp(cp);
            mapped = lxb_unicode_idna_map(entry, &map_len);

            if (out_p + map_len > out_end) {
                out = lxb_unicode_idna_realloc(out, buffer, &out_p,
                                               &out_end, map_len);
                if (out == NULL) {
                    return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
                }
            }

            for (idx = 0; idx < map_len; idx++) {
                *out_p++ = mapped[idx];
            }

            continue;
        }

        /*
         * Deviation (non-transitional), disallowed, valid and any other type:
         * the code point is kept as is.
         */
        if (out_p >= out_end) {
            out = lxb_unicode_idna_realloc(out, buffer, &out_p, &out_end, 1);
            if (out == NULL) {
                return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
            }
        }

        *out_p++ = cp;
    }

    context.cb = cb;
    context.context = ctx;
    context.flags = flags;

    renormalize = lxb_unicode_quick_check_cp(&idna->normalizer, out,
                                             out_p - out, true);
    if (!renormalize) {
        status = lxb_unicode_idna_norm_c_cb(out, out_p - out, &context);
    }
    else {
        lxb_unicode_flush_count_set(&idna->normalizer, UINT32_MAX);

        status = lxb_unicode_normalize_cp(&idna->normalizer, out, out_p - out,
                                          lxb_unicode_idna_norm_c_cb,
                                          &context, true);
    }

done:

    if (out != buffer) {
        (void) lexbor_free(out);
    }

    return status;
}
/*
 * Splits a code point sequence into labels at U+002E ( . ) FULL STOP and
 * sends each label to lxb_unicode_idna_norm_c_send().
 *
 * A trailing FULL STOP produces one more, empty, label: "muuuu." results in
 * "muuuu" followed by "" (length 0).
 *
 * @param[in] cps Code points.
 * @param[in] len Number of code points.
 * @param[in] ctx Pointer to a lxb_unicode_idna_ctx_t.
 *
 * @return First non-OK status from sending a label, otherwise LXB_STATUS_OK.
 */
static lxb_status_t
lxb_unicode_idna_norm_c_cb(const lxb_codepoint_t *cps, size_t len, void *ctx)
{
    lxb_status_t status;
    lxb_unicode_idna_ctx_t *context = ctx;
    const lxb_codepoint_t *label = cps;
    const lxb_codepoint_t *cur;
    const lxb_codepoint_t *end = cps + len;

    for (cur = cps; cur < end; cur++) {
        /* U+002E ( . ) FULL STOP. */
        if (*cur != 0x002E) {
            continue;
        }

        status = lxb_unicode_idna_norm_c_send(label, cur, context);
        if (status != LXB_STATUS_OK) {
            return status;
        }

        label = cur + 1;
    }

    /* Remaining label, or the empty label after a trailing FULL STOP. */
    if (cur > label || (len >= 1 && cur[-1] == '.')) {
        return lxb_unicode_idna_norm_c_send(label, cur, context);
    }

    return LXB_STATUS_OK;
}
/*
 * Handles one label [cps, p).
 *
 * A label starting with "xn--" (either letter in any case) is Punycode
 * decoded; on success the decoded label has already been validated and
 * delivered by lxb_unicode_idna_punycode_cb(). Otherwise (no prefix, or
 * decoding failed) the label itself is validated and passed to the caller's
 * callback together with the decoding status.
 *
 * @return LXB_STATUS_ERROR_UNEXPECTED_RESULT when validation fails, otherwise
 * the status of the callback.
 */
static lxb_status_t
lxb_unicode_idna_norm_c_send(const lxb_codepoint_t *cps,
                             const lxb_codepoint_t *p,
                             lxb_unicode_idna_ctx_t *context)
{
    bool valid;
    lxb_status_t status = LXB_STATUS_OK;
    const lxb_codepoint_t *encoded;

    /* xn-- or Xn-- or xN-- or XN-- */
    if (p - cps >= 4
        && (cps[0] == 0x0078 || cps[0] == 0x0058)
        && (cps[1] == 0x006E || cps[1] == 0x004E)
        && cps[2] == 0x002D && cps[3] == 0x002D)
    {
        encoded = cps + 4;

        status = lxb_punycode_decode_cp(encoded, p - encoded,
                                        lxb_unicode_idna_punycode_cb,
                                        context);
        if (status == LXB_STATUS_OK) {
            return LXB_STATUS_OK;
        }
    }

    valid = lxb_unicode_idna_validity_criteria_cp(cps, p - cps,
                                                  context->flags);
    if (!valid) {
        return LXB_STATUS_ERROR_UNEXPECTED_RESULT;
    }

    return context->cb(cps, p - cps, context->context, status);
}
/*
 * Receives a Punycode-decoded label, validates it and forwards it to the
 * caller's callback.
 *
 * The flags are taken from the processing context itself. The caller's
 * context (context->context) is opaque here: it is whatever was passed to
 * lxb_unicode_idna_processing*(), so it must not be assumed to be a
 * lxb_unicode_idna_ascii_ctx_t.
 *
 * @param[in] cps Decoded code points of the label.
 * @param[in] len Number of code points.
 * @param[in] ctx Pointer to a lxb_unicode_idna_ctx_t.
 *
 * @return LXB_STATUS_ERROR_UNEXPECTED_RESULT when validation fails, otherwise
 * the status of the callback.
 */
static lxb_status_t
lxb_unicode_idna_punycode_cb(const lxb_codepoint_t *cps, size_t len, void *ctx)
{
    bool cr;
    lxb_unicode_idna_ctx_t *context = ctx;

    cr = lxb_unicode_idna_validity_criteria_cp(cps, len, context->flags);
    if (!cr) {
        return LXB_STATUS_ERROR_UNEXPECTED_RESULT;
    }

    return context->cb(cps, len, context->context, LXB_STATUS_OK);
}
/*
 * Converts a domain name given as bytes to its ASCII form.
 * Thin wrapper around lxb_unicode_idna_to_ascii_body() with is_cp = false.
 */
lxb_status_t
lxb_unicode_idna_to_ascii(lxb_unicode_idna_t *idna, const lxb_char_t *data,
                          size_t length, lexbor_serialize_cb_f cb, void *ctx,
                          lxb_unicode_idna_flag_t flags)
{
    const bool is_cp = false;

    return lxb_unicode_idna_to_ascii_body(idna, data, length, cb, ctx,
                                          flags, is_cp);
}
/*
 * Converts a domain name given as code points to its ASCII form.
 * Thin wrapper around lxb_unicode_idna_to_ascii_body() with is_cp = true.
 */
lxb_status_t
lxb_unicode_idna_to_ascii_cp(lxb_unicode_idna_t *idna, const lxb_codepoint_t *cps,
                             size_t length, lexbor_serialize_cb_f cb, void *ctx,
                             lxb_unicode_idna_flag_t flags)
{
    const bool is_cp = true;

    return lxb_unicode_idna_to_ascii_body(idna, cps, length, cb, ctx,
                                          flags, is_cp);
}
/*
 * Shared implementation of the to-ASCII conversion.
 *
 * IDNA processing is run with lxb_unicode_idna_to_ascii_cb() as the per-label
 * callback; it appends every encoded label plus a '.' to an accumulator.
 * Afterwards the final '.' is dropped and the result is passed to cb once.
 *
 * @param[in] idna   IDNA object.
 * @param[in] data   Bytes (is_cp == false) or lxb_codepoint_t array.
 * @param[in] length Number of bytes or of code points, depending on is_cp.
 * @param[in] cb     Receives the complete ASCII result.
 * @param[in] ctx    Passed to cb.
 * @param[in] flags  Processing flags.
 * @param[in] is_cp  Selects how data is interpreted.
 *
 * @return Status of the processing, or of cb when processing succeeded.
 */
static lxb_status_t
lxb_unicode_idna_to_ascii_body(lxb_unicode_idna_t *idna, const void *data,
                               size_t length, lexbor_serialize_cb_f cb, void *ctx,
                               lxb_unicode_idna_flag_t flags, bool is_cp)
{
    lxb_status_t status;
    lxb_unicode_idna_ascii_ctx_t context;

    context.buf = context.buffer;
    context.p = context.buffer;
    context.end = context.buf + sizeof(context.buffer);
    context.flags = flags;

    if (is_cp) {
        status = lxb_unicode_idna_processing_cp(idna, data, length,
                                                lxb_unicode_idna_to_ascii_cb,
                                                &context, flags);
    }
    else {
        status = lxb_unicode_idna_processing(idna, data, length,
                                             lxb_unicode_idna_to_ascii_cb,
                                             &context, flags);
    }

    if (status == LXB_STATUS_OK) {
        /* Remove last U+002E ( . ) FULL STOP. */
        if (context.p > context.buf) {
            context.p -= 1;
        }

        status = cb(context.buf, context.p - context.buf, ctx);
    }

    /* The accumulator is heap memory once it has outgrown its inline array. */
    if (context.buf != context.buffer) {
        (void) lexbor_free(context.buf);
    }

    return status;
}
/*
 * Per-part callback used by the ToASCII implementation.
 *
 * A failed status is returned unchanged; otherwise the part is Punycode
 * encoded, with lxb_unicode_idna_ascii_puny_cb() receiving the result.
 */
static lxb_status_t
lxb_unicode_idna_to_ascii_cb(const lxb_codepoint_t *part, size_t len,
                             void *ctx, lxb_status_t status)
{
    return (status == LXB_STATUS_OK)
           ? lxb_punycode_encode_cp(part, len, lxb_unicode_idna_ascii_puny_cb,
                                    ctx)
           : status;
}
/*
 * Receives one Punycode encoded part and appends it to the output buffer.
 *
 * Appended layout: optional "xn--" prefix (only when the part was changed by
 * the encoder), the part itself, U+002E ( . ) and a terminating 0x00 that is
 * not counted in the position.
 *
 * The output starts in the buffer embedded in the context. When it does not
 * fit, the data moves to heap memory; the bytes written so far are preserved
 * in both the first move (malloc + copy) and later growth (realloc).
 *
 * @param[in] Encoded part.
 * @param[in] Length of the encoded part.
 * @param[in] Output context (lxb_unicode_idna_ascii_ctx_t *).
 * @param[in] true if the encoder left the part unchanged.
 *
 * @return LXB_STATUS_OK if successful,
 *         LXB_STATUS_ERROR_MEMORY_ALLOCATION if the buffer cannot grow.
 */
static lxb_status_t
lxb_unicode_idna_ascii_puny_cb(const lxb_char_t *data, size_t length, void *ctx,
                               bool unchanged)
{
    size_t nlen, used;
    lxb_char_t *tmp;
    lxb_unicode_idna_ascii_ctx_t *asc = ctx;

    static const lexbor_str_t prefix = lexbor_str("xn--");

    /* 6 = "xn--" (4) + U+002E (1) + terminating 0x00 (1). */
    if (asc->p + length + 6 > asc->end) {
        used = asc->p - asc->buf;
        nlen = ((asc->end - asc->buf) * 4) + length + 6;

        if (asc->buf == asc->buffer) {
            tmp = lexbor_malloc(nlen);

            /* Embedded buffer cannot be realloc'ed: carry its content over. */
            if (tmp != NULL) {
                memcpy(tmp, asc->buf, used);
            }
        }
        else {
            tmp = lexbor_realloc(asc->buf, nlen);
        }

        if (tmp == NULL) {
            return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
        }

        asc->p = tmp + used;
        asc->buf = tmp;
        asc->end = tmp + nlen;
    }

    if (!unchanged) {
        memcpy(asc->p, prefix.data, prefix.length);
        asc->p += prefix.length;
    }

    memcpy(asc->p, data, length);
    asc->p += length;

    *asc->p++ = '.';
    *asc->p = 0x00;

    return LXB_STATUS_OK;
}
/*
 * Validity criteria check for a label given as characters (bytes).
 *
 * Delegates to the shared implementation with is_cp = false.
 */
bool
lxb_unicode_idna_validity_criteria(const lxb_char_t *data, size_t length,
                                   lxb_unicode_idna_flag_t flags)
{
    const bool input_is_cp = false;

    return lxb_unicode_idna_validity_criteria_h(data, length, flags,
                                                input_is_cp);
}
/*
 * Validity criteria check for a label given as code points.
 *
 * Delegates to the shared implementation with is_cp = true.
 */
bool
lxb_unicode_idna_validity_criteria_cp(const lxb_codepoint_t *data, size_t length,
                                      lxb_unicode_idna_flag_t flags)
{
    const bool input_is_cp = true;

    return lxb_unicode_idna_validity_criteria_h(data, length, flags,
                                                input_is_cp);
}
/*
 * Shared implementation of the validity criteria check for one label.
 *
 * Criteria checked (https://www.unicode.org/reports/tr46/#Validity_Criteria):
 *   - with LXB_UNICODE_IDNA_FLAG_CHECK_HYPHENS: the label must not contain
 *     U+002D HYPHEN-MINUS in both the third and fourth positions, and must
 *     neither begin nor end with U+002D;
 *   - without that flag: the label must not begin with "xn--"
 *     (ASCII case-insensitive);
 *   - the label must not contain U+002E ( . ) FULL STOP;
 *   - every code point must have IDNA type "valid", or "deviation" when
 *     LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING is not set.
 *
 * Hyphen positions are counted in code points while the label is decoded, so
 * the check is the same for UTF-8 and code point input and never reads
 * outside of the label.
 *
 * @param[in] Label: characters (is_cp = false) or code points (is_cp = true).
 *            Not NULL.
 * @param[in] Number of characters (bytes) or code points. Can be 0.
 * @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
 * @param[in] true if data is an array of lxb_codepoint_t.
 *
 * @return true if valid, otherwise false.
 */
static bool
lxb_unicode_idna_validity_criteria_h(const void *data, size_t length,
                                     lxb_unicode_idna_flag_t flags, bool is_cp)
{
    bool check_hyphens, third_is_hyphen;
    size_t idx;
    lxb_codepoint_t cp, last;
    const lxb_codepoint_t *cps;
    const lxb_char_t *p, *end;

    p = data;
    end = p + length * ((is_cp) ? sizeof(lxb_codepoint_t) : 1);

    check_hyphens = (flags & LXB_UNICODE_IDNA_FLAG_CHECK_HYPHENS) != 0;

    /* If not CheckHyphens, the label must not begin with "xn--". */
    if (!check_hyphens && length >= 4) {
        if (is_cp) {
            cps = data;

            if (   (cps[0] == 0x0078 || cps[0] == 0x0058)
                && (cps[1] == 0x006E || cps[1] == 0x004E)
                && cps[2] == 0x002D && cps[3] == 0x002D)
            {
                return false;
            }
        }
        else {
            if (   (p[0] == 0x0078 || p[0] == 0x0058)
                && (p[1] == 0x006E || p[1] == 0x004E)
                && p[2] == 0x002D && p[3] == 0x002D)
            {
                return false;
            }
        }
    }

    idx = 0;
    last = 0x0000;
    third_is_hyphen = false;

    while (p < end) {
        if (!is_cp) {
            cp = lxb_encoding_decode_valid_utf_8_single(&p, end);
            if (cp == LXB_ENCODING_DECODE_ERROR) {
                return false;
            }
        }
        else {
            cp = *((const lxb_codepoint_t *) p);
            p = (const lxb_char_t *) ((const lxb_codepoint_t *) p + 1);
        }

        /* U+002D HYPHEN-MINUS */
        if (check_hyphens && cp == 0x002D) {
            /* Must not begin with a hyphen. */
            if (idx == 0) {
                return false;
            }

            /* Must not have hyphens in both third and fourth positions. */
            if (idx == 2) {
                third_is_hyphen = true;
            }
            else if (idx == 3 && third_is_hyphen) {
                return false;
            }
        }

        last = cp;
        idx += 1;

        /* U+002E ( . ) FULL STOP */
        if (cp == 0x002E) {
            return false;
        }

        switch (lxb_unicode_idna_type(cp)) {
            case LXB_UNICODE_IDNA_VALID:
                break;

            case LXB_UNICODE_IDNA_DEVIATION:
                if (!(flags & LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING)) {
                    break;
                }
                /* Fall through. */

            case LXB_UNICODE_IDNA_DISALLOWED:
            case LXB_UNICODE_IDNA_IGNORED:
            case LXB_UNICODE_IDNA_MAPPED:
            default:
                return false;
        }
    }

    /* Must not end with a hyphen. */
    if (check_hyphens && idx > 0 && last == 0x002D) {
        return false;
    }

    return true;
}
/*
 * ToUnicode for a domain name given as characters (bytes).
 *
 * Delegates to the shared implementation, telling it that the input is not
 * an array of code points.
 */
lxb_status_t
lxb_unicode_idna_to_unicode(lxb_unicode_idna_t *idna, const lxb_char_t *data,
                            size_t length, lexbor_serialize_cb_f cb,
                            void *ctx, lxb_unicode_idna_flag_t flags)
{
    const bool input_is_cp = false;

    return lxb_unicode_idna_to_unicode_body(idna, data, length, cb, ctx, flags,
                                            input_is_cp);
}
/*
 * ToUnicode for a domain name given as code points.
 *
 * Delegates to the shared implementation, telling it that the input is an
 * array of code points.
 */
lxb_status_t
lxb_unicode_idna_to_unicode_cp(lxb_unicode_idna_t *idna,
                               const lxb_codepoint_t *cps,
                               size_t length, lexbor_serialize_cb_f cb,
                               void *ctx, lxb_unicode_idna_flag_t flags)
{
    const bool input_is_cp = true;

    return lxb_unicode_idna_to_unicode_body(idna, cps, length, cb, ctx, flags,
                                            input_is_cp);
}
/*
 * Shared implementation of lxb_unicode_idna_to_unicode() and
 * lxb_unicode_idna_to_unicode_cp().
 *
 * Runs IDNA processing with lxb_unicode_idna_to_unicode_cb() as the per-part
 * callback, which accumulates the result in a local output context. On
 * success the accumulated bytes, without the trailing U+002E, are handed to
 * the user's callback exactly once.
 *
 * @param[in] lxb_unicode_idna_t *.
 * @param[in] Input: characters (is_cp = false) or code points (is_cp = true).
 * @param[in] Length of the input.
 * @param[in] Callback for the result.
 * @param[in] Context for callback.
 * @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
 * @param[in] true if the input is an array of code points.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
static lxb_status_t
lxb_unicode_idna_to_unicode_body(lxb_unicode_idna_t *idna, const void *data,
                                 size_t length, lexbor_serialize_cb_f cb,
                                 void *ctx, lxb_unicode_idna_flag_t flags,
                                 bool is_cp)
{
    lxb_status_t rc;
    lxb_unicode_idna_ascii_ctx_t out;

    /* Start with the buffer embedded in the context. */
    out.buf = out.buffer;
    out.p = out.buf;
    out.end = out.buf + sizeof(out.buffer);
    out.flags = flags;

    if (is_cp) {
        rc = lxb_unicode_idna_processing_cp(idna, data, length,
                                            lxb_unicode_idna_to_unicode_cb,
                                            &out, flags);
    }
    else {
        rc = lxb_unicode_idna_processing(idna, data, length,
                                         lxb_unicode_idna_to_unicode_cb,
                                         &out, flags);
    }

    if (rc == LXB_STATUS_OK) {
        /* Remove last U+002E ( . ) FULL STOP. */
        if (out.p > out.buf) {
            out.p -= 1;
        }

        rc = cb(out.buf, out.p - out.buf, ctx);
    }

    /* The buffer is heap memory only if it was replaced during processing. */
    if (out.buf != out.buffer) {
        (void) lexbor_free(out.buf);
    }

    return rc;
}
/*
 * Per-part callback used by the ToUnicode implementation.
 *
 * Encodes the code points of the part as UTF-8 and appends them, followed by
 * U+002E ( . ) and a terminating 0x00 (not counted in the position), to the
 * output buffer.
 *
 * The output starts in the buffer embedded in the context. When it does not
 * fit, the data moves to heap memory; the bytes written so far are preserved
 * in both the first move (malloc + copy) and later growth (realloc).
 *
 * @param[in] Code points of the part.
 * @param[in] Number of code points.
 * @param[in] Output context (lxb_unicode_idna_ascii_ctx_t *).
 * @param[in] Status of processing for this part; returned unchanged if it is
 *            not LXB_STATUS_OK.
 *
 * @return LXB_STATUS_OK if successful,
 *         LXB_STATUS_ERROR_UNEXPECTED_DATA if a code point cannot be encoded,
 *         LXB_STATUS_ERROR_MEMORY_ALLOCATION if the buffer cannot grow.
 */
static lxb_status_t
lxb_unicode_idna_to_unicode_cb(const lxb_codepoint_t *part, size_t len,
                               void *ctx, lxb_status_t status)
{
    int8_t res;
    size_t length, nlen, used;
    lxb_char_t *tmp;
    const lxb_codepoint_t *p, *end;
    lxb_unicode_idna_ascii_ctx_t *asc = ctx;

    if (status != LXB_STATUS_OK) {
        return status;
    }

    p = part;
    end = part + len;
    length = 0;

    /* First pass: number of bytes needed for the UTF-8 form of the part. */
    while (p < end) {
        res = lxb_encoding_encode_utf_8_length(*p++);
        if (res == 0) {
            return LXB_STATUS_ERROR_UNEXPECTED_DATA;
        }

        length += res;
    }

    /* 2 = U+002E (1) + terminating 0x00 (1). */
    if (asc->p + length + 2 > asc->end) {
        used = asc->p - asc->buf;
        nlen = ((asc->end - asc->buf) * 4) + length + 2;

        if (asc->buf == asc->buffer) {
            tmp = lexbor_malloc(nlen);

            /* Embedded buffer cannot be realloc'ed: carry its content over. */
            if (tmp != NULL) {
                memcpy(tmp, asc->buf, used);
            }
        }
        else {
            tmp = lexbor_realloc(asc->buf, nlen);
        }

        if (tmp == NULL) {
            return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
        }

        asc->p = tmp + used;
        asc->buf = tmp;
        asc->end = tmp + nlen;
    }

    /* Second pass: encode; room for every code point was ensured above. */
    p = part;

    while (p < end) {
        (void) lxb_encoding_encode_utf_8_single(NULL, &asc->p, asc->end, *p++);
    }

    *asc->p++ = '.';
    *asc->p = 0x00;

    return LXB_STATUS_OK;
}

View file

@ -0,0 +1,264 @@
/*
* Copyright (C) 2023 Alexander Borisov
*
* Author: Alexander Borisov <borisov@lexbor.com>
*
* UNICODE IDNA COMPATIBILITY PROCESSING
* https://www.unicode.org/reports/tr46/
*/
#ifndef LEXBOR_UNICODE_IDNA_H
#define LEXBOR_UNICODE_IDNA_H
#ifdef __cplusplus
extern "C" {
#endif
#include "lexbor/unicode/base.h"
/*
* Callback type for lxb_unicode_idna_processing() and
* lxb_unicode_idna_processing_cp().
*
* @param[in] Code points of one part (level) of the domain name.
* @param[in] Number of code points in the part.
* @param[in] User context, as passed to the processing function.
* @param[in] Status for this part.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
typedef lxb_status_t
(*lxb_unicode_idna_cb_f)(const lxb_codepoint_t *part, size_t len,
void *ctx, lxb_status_t status);
/*
* IDNA processing flags.
*
* The values are distinct bits and are passed to the functions below as
* a bitmap. Bit 0 (1 << 0) is not assigned.
*
* NOTE(review): the names appear to correspond to the processing options of
* UTS #46 (https://www.unicode.org/reports/tr46/#Processing) -- confirm.
*/
typedef enum {
LXB_UNICODE_IDNA_FLAG_UNDEF = 0x00,
LXB_UNICODE_IDNA_FLAG_USE_STD3ASCII_RULES = 1 << 1,
LXB_UNICODE_IDNA_FLAG_CHECK_HYPHENS = 1 << 2,
LXB_UNICODE_IDNA_FLAG_CHECK_BIDI = 1 << 3, /* Not implemented. */
LXB_UNICODE_IDNA_FLAG_CHECK_JOINERS = 1 << 4, /* Not implemented. */
LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING = 1 << 5,
LXB_UNICODE_IDNA_FLAG_VERIFY_DNS_LENGTH = 1 << 6
}
lxb_unicode_idna_flag_t;
/*
* IDNA processing object.
*
* Holds the Unicode normalizer by value.
* NOTE(review): presumably it is the normalizer used for the Normalization
* (NFC) step of domain name processing -- confirm in idna.c.
*/
typedef struct {
lxb_unicode_normalizer_t normalizer;
}
lxb_unicode_idna_t;
/*
* Create lxb_unicode_idna_t object.
*
* @return lxb_unicode_idna_t * if successful, otherwise NULL.
*/
LXB_API lxb_unicode_idna_t *
lxb_unicode_idna_create(void);
/*
* Initialization of lxb_unicode_idna_t object.
*
* @param[in] lxb_unicode_idna_t *. May be NULL,
* LXB_STATUS_ERROR_OBJECT_IS_NULL status will be returned.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_idna_init(lxb_unicode_idna_t *idna);
/*
* Clears the object. Returns it to the state it had after initialization.
*
* @param[in] lxb_unicode_idna_t *
*/
LXB_API void
lxb_unicode_idna_clean(lxb_unicode_idna_t *idna);
/*
* Destroy lxb_unicode_idna_t object.
*
* Release of occupied resources.
*
* @param[in] lxb_unicode_idna_t *. Can be NULL.
* @param[in] if false: only destroys internal buffers.
* if true: destroys the lxb_unicode_idna_t object and all internal buffers.
*
* @return lxb_unicode_idna_t * if self_destroy = false, otherwise NULL.
*/
LXB_API lxb_unicode_idna_t *
lxb_unicode_idna_destroy(lxb_unicode_idna_t *idna, bool self_destroy);
/*
* Domain name processing.
*
* Mapping, Normalization (NFC), Converting, Validating.
*
* Callback will be invoked at each level of the domain name.
*
* For example:
* lexbor.com -- there will be two callbacks, for "lexbor" and "com".
*
* https://www.unicode.org/reports/tr46/#Processing
*
* @param[in] lxb_unicode_idna_t *.
* @param[in] Input characters for processing. Not NULL.
* @param[in] Length of characters. Can be 0.
* @param[in] Callback for results of processing.
* @param[in] Context for callback.
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_idna_processing(lxb_unicode_idna_t *idna, const lxb_char_t *data,
size_t length, lxb_unicode_idna_cb_f cb, void *ctx,
lxb_unicode_idna_flag_t flags);
/*
* Domain name processing for code points.
*
* This function is exactly the same as lxb_unicode_idna_processing() only it
* takes code points instead of characters as input.
*
* Please, see lxb_unicode_idna_processing() function.
*
* @param[in] lxb_unicode_idna_t *.
* @param[in] Input code points for processing. Not NULL.
* @param[in] Length of code points. Can be 0.
* @param[in] Callback for results of processing.
* @param[in] Context for callback.
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_idna_processing_cp(lxb_unicode_idna_t *idna,
const lxb_codepoint_t *cps, size_t length,
lxb_unicode_idna_cb_f cb, void *ctx,
lxb_unicode_idna_flag_t flags);
/*
* Processing and converting domain name to ASCII.
*
* Does the same thing as lxb_unicode_idna_processing() and additionally
* converts each part of the domain name to Punycode.
*
* Callback will be invoked only once, at the end of processing.
*
* https://www.unicode.org/reports/tr46/#ToASCII
*
* @param[in] lxb_unicode_idna_t *.
* @param[in] Input characters for processing. Not NULL.
* @param[in] Length of characters. Can be 0.
* @param[in] Callback for results of processing.
* @param[in] Context for callback.
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_idna_to_ascii(lxb_unicode_idna_t *idna, const lxb_char_t *data,
size_t length, lexbor_serialize_cb_f cb, void *ctx,
lxb_unicode_idna_flag_t flags);
/*
* Processing and converting domain name to ASCII for code points.
*
* This function is exactly the same as lxb_unicode_idna_to_ascii() only it
* takes code points instead of characters as input.
*
* Please, see lxb_unicode_idna_to_ascii() function.
*
* @param[in] lxb_unicode_idna_t *.
* @param[in] Input code points for processing. Not NULL.
* @param[in] Length of code points. Can be 0.
* @param[in] Callback for results of processing.
* @param[in] Context for callback.
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_idna_to_ascii_cp(lxb_unicode_idna_t *idna, const lxb_codepoint_t *cps,
size_t length, lexbor_serialize_cb_f cb, void *ctx,
lxb_unicode_idna_flag_t flags);
/*
* Processing and converting domain name to Unicode.
*
* Does the same thing as lxb_unicode_idna_processing().
*
* Callback will be invoked only once, at the end of processing.
*
* https://www.unicode.org/reports/tr46/#ToUnicode
*
* @param[in] lxb_unicode_idna_t *.
* @param[in] Input characters for processing. Not NULL.
* @param[in] Length of characters. Can be 0.
* @param[in] Callback for results of processing.
* @param[in] Context for callback.
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_idna_to_unicode(lxb_unicode_idna_t *idna, const lxb_char_t *data,
size_t length, lexbor_serialize_cb_f cb, void *ctx,
lxb_unicode_idna_flag_t flags);
/*
* Processing and converting domain name to Unicode for code points.
*
* This function is exactly the same as lxb_unicode_idna_to_unicode() only it
* takes code points instead of characters as input.
*
* Please, see lxb_unicode_idna_to_unicode() function.
*
* @param[in] lxb_unicode_idna_t *.
* @param[in] Input code points for processing. Not NULL.
* @param[in] Length of code points. Can be 0.
* @param[in] Callback for results of processing.
* @param[in] Context for callback.
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_idna_to_unicode_cp(lxb_unicode_idna_t *idna, const lxb_codepoint_t *cps,
size_t length, lexbor_serialize_cb_f cb, void *ctx,
lxb_unicode_idna_flag_t flags);
/*
* Validity Criteria.
*
* The function checks the domain name for validity according to a number of
* criteria.
*
* LXB_UNICODE_IDNA_FLAG_CHECK_BIDI and LXB_UNICODE_IDNA_FLAG_CHECK_JOINERS
* not implemented.
*
* https://www.unicode.org/reports/tr46/#Validity_Criteria
*
* @param[in] Input characters for processing. Not NULL.
* @param[in] Length of characters. Can be 0.
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
*
* @return true if valid, otherwise false.
*/
LXB_API bool
lxb_unicode_idna_validity_criteria(const lxb_char_t *data, size_t length,
lxb_unicode_idna_flag_t flags);
/*
* Validity Criteria.
*
* Same as lxb_unicode_idna_validity_criteria() only it takes codepoints as
* input.
*
* @param[in] Input codepoints for processing. Not NULL.
* @param[in] Length of codepoints. Can be 0.
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
*
* @return true if valid, otherwise false.
*/
LXB_API bool
lxb_unicode_idna_validity_criteria_cp(const lxb_codepoint_t *data, size_t length,
lxb_unicode_idna_flag_t flags);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* LEXBOR_UNICODE_IDNA_H */

201955
ext/lexbor/lexbor/unicode/res.h Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,405 @@
/*
* Copyright (C) 2023 Alexander Borisov
*
* Author: Alexander Borisov <borisov@lexbor.com>
*/
#ifndef LEXBOR_UNICODE_H
#define LEXBOR_UNICODE_H
#ifdef __cplusplus
extern "C" {
#endif
#include "lexbor/unicode/base.h"
#include "lexbor/unicode/idna.h"
#include "lexbor/core/array_obj.h"
/*
* Unicode normalization forms.
*
* Passed to lxb_unicode_normalizer_init() and
* lxb_unicode_normalization_form_set().
*
* https://www.unicode.org/reports/tr15/
*/
typedef enum {
LXB_UNICODE_NFC = 0x00, /* Normalization Form C (NFC). */
LXB_UNICODE_NFD = 0x01, /* Normalization Form D (NFD). */
LXB_UNICODE_NFKC = 0x02, /* Normalization Form KC (NFKC). */
LXB_UNICODE_NFKD = 0x03 /* Normalization Form KD (NFKD). */
}
lxb_unicode_form_t;
/*
* Create lxb_unicode_normalizer_t object.
*
* @return lxb_unicode_normalizer_t * if successful, otherwise NULL.
*/
LXB_API lxb_unicode_normalizer_t *
lxb_unicode_normalizer_create(void);
/*
* Initialization of lxb_unicode_normalizer_t object.
*
* Support normalization forms:
* Normalization Form D (NFD): LXB_UNICODE_NFD
* Normalization Form C (NFC): LXB_UNICODE_NFC
* Normalization Form KD (NFKD): LXB_UNICODE_NFKD
* Normalization Form KC (NFKC): LXB_UNICODE_NFKC
*
* https://www.unicode.org/reports/tr15/
*
* @param[in] lxb_unicode_normalizer_t *
* @param[in] Normalization form.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_normalizer_init(lxb_unicode_normalizer_t *uc,
lxb_unicode_form_t form);
/*
* Cleaning of lxb_unicode_normalizer_t object.
*
* Clears the object. Returns to states as after initialization.
*
* @param[in] lxb_unicode_normalizer_t *
*/
LXB_API void
lxb_unicode_normalizer_clean(lxb_unicode_normalizer_t *uc);
/*
* Destroy lxb_unicode_normalizer_t object.
*
* Release of occupied resources.
*
* @param[in] lxb_unicode_normalizer_t *. Can be NULL.
* @param[in] if false: only destroys internal buffers.
* if true: destroys the lxb_unicode_normalizer_t object and all internal buffers.
*
* @return lxb_unicode_normalizer_t * if self_destroy = false, otherwise NULL.
*/
LXB_API lxb_unicode_normalizer_t *
lxb_unicode_normalizer_destroy(lxb_unicode_normalizer_t *uc, bool self_destroy);
/*
* Unicode normalization forms.
*
* This is a function with an implementation of the unicode normalization
* algorithm.
*
* The function is designed to work with a stream (chunks).
*
* Please, see examples for this function in examples/lexbor/unicode directory.
*
* @param[in] lxb_unicode_normalizer_t *
* @param[in] Input characters for normalization. Not NULL.
* @param[in] Length of characters. Can be 0.
* @param[in] Callback for results of normalization.
* @param[in] Context for callback.
* @param[in] Set to true if the last chunk or the only one chunk is processed.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_normalize(lxb_unicode_normalizer_t *uc, const lxb_char_t *data,
size_t length, lexbor_serialize_cb_f cb, void *ctx,
bool is_last);
/*
* Unicode normalization end.
*
* The function is used to complete a normalization.
* Same as calling the lxb_unicode_normalize() function with is_last = true.
*
* Use this function only if you do not set is_last = true in
* the lxb_unicode_normalize() function.
*
* For example:
* status = lxb_unicode_normalize(uc, data, length, cb, NULL, false);
* status = lxb_unicode_normalize(uc, data, length, cb, NULL, false);
* status = lxb_unicode_normalize_end(uc, cb, NULL);
*
* The same as:
* status = lxb_unicode_normalize(uc, data, length, cb, NULL, false);
* status = lxb_unicode_normalize(uc, data, length, cb, NULL, true);
*
* @param[in] lxb_unicode_normalizer_t *
* @param[in] Callback for results of normalization.
* @param[in] Context for callback.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_normalize_end(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_f cb,
void *ctx);
/*
* Unicode normalization forms for code points.
*
* This function is exactly the same as lxb_unicode_normalize() only it takes
* code points instead of characters as input.
*
* Also, unlike the lxb_unicode_normalize() function, the callback will be
* called with code points, not characters.
*
* The function is designed to work with a stream (chunks).
*
* @param[in] lxb_unicode_normalizer_t *
* @param[in] Input code points for normalization. Not NULL.
* @param[in] Length of code points. Can be 0.
* @param[in] Callback for results of normalization.
* @param[in] Context for callback.
* @param[in] Set to true if the last chunk or the only one chunk is processed.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_normalize_cp(lxb_unicode_normalizer_t *uc, const lxb_codepoint_t *cps,
size_t length, lexbor_serialize_cb_cp_f cb, void *ctx,
bool is_last);
/*
* Unicode normalization end for code points.
*
* This function is completely similar to lxb_unicode_normalize_end(),
* only it takes a function with code points as a callback function.
*
* Same as calling the lxb_unicode_normalize_cp() function with is_last = true.
*
* Use this function only if you do not set is_last = true in
* the lxb_unicode_normalize_cp() function.
*
* For example:
* status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, false);
* status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, false);
* status = lxb_unicode_normalize_cp_end(uc, cb, NULL);
*
* The same as:
* status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, false);
* status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, true);
*
* @param[in] lxb_unicode_normalizer_t *
* @param[in] Callback for results of normalization.
* @param[in] Context for callback.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_normalize_cp_end(lxb_unicode_normalizer_t *uc,
lexbor_serialize_cb_cp_f cb, void *ctx);
/*
* Quick Check.
*
* The basic normalization algorithm is not simple and requires time
* and resources.
* This function checks relatively quickly if the text needs to be normalized.
*
* The function is designed to work with a stream (chunks).
*
* https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
*
* @param[in] lxb_unicode_normalizer_t *
* @param[in] Input characters for checks. Not NULL.
* @param[in] Length of characters. Can be 0.
* @param[in] Set to true if the last chunk or the only one chunk is processed.
*
* @return true if it needs to be normalized, otherwise false.
*/
LXB_API bool
lxb_unicode_quick_check(lxb_unicode_normalizer_t *uc, const lxb_char_t *data,
size_t length, bool is_last);
/*
* Quick Check End.
*
* The function is used to complete a quick check.
* Same as calling the lxb_unicode_quick_check() function with is_last = true.
*
* Use this function only if you do not set is_last = true in
* the lxb_unicode_quick_check() function.
*
* For example:
* is = lxb_unicode_quick_check(uc, data, length, false);
* is = lxb_unicode_quick_check(uc, data, length, false);
* is = lxb_unicode_quick_check_end(uc);
*
* The same as:
* is = lxb_unicode_quick_check(uc, data, length, false);
* is = lxb_unicode_quick_check(uc, data, length, true);
*
* @param[in] lxb_unicode_normalizer_t *
*
* @return true if it needs to be normalized, otherwise false.
*/
LXB_API bool
lxb_unicode_quick_check_end(lxb_unicode_normalizer_t *uc);
/*
* Quick Check for code points.
*
* Same as lxb_unicode_quick_check() only it takes code points as input.
*
* @param[in] lxb_unicode_normalizer_t *
* @param[in] Input code points for checks. Not NULL.
* @param[in] Length of code points. Can be 0.
* @param[in] Set to true if the last chunk or the only one chunk is processed.
*
* @return true if it needs to be normalized, otherwise false.
*/
LXB_API bool
lxb_unicode_quick_check_cp(lxb_unicode_normalizer_t *uc,
const lxb_codepoint_t *cps, size_t length,
bool is_last);
/*
* Quick Check End for code points.
*
* Same as lxb_unicode_quick_check_end().
*
* For example:
* is = lxb_unicode_quick_check_cp(uc, cps, length, false);
* is = lxb_unicode_quick_check_cp(uc, cps, length, false);
* is = lxb_unicode_quick_check_cp_end(uc);
*
* The same as:
* is = lxb_unicode_quick_check_cp(uc, cps, length, false);
* is = lxb_unicode_quick_check_cp(uc, cps, length, true);
*
* @param[in] lxb_unicode_normalizer_t *
*
* @return true if it needs to be normalized, otherwise false.
*/
LXB_API bool
lxb_unicode_quick_check_cp_end(lxb_unicode_normalizer_t *uc);
/*
* Flush.
*
* Force flush the buffer to the user's callback if possible.
*
* Please, see lxb_unicode_flush_count_set() function.
*
* @param[in] lxb_unicode_normalizer_t *.
* @param[in] Callback.
* @param[in] Callback context.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_flush(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_f cb,
void *ctx);
/*
* Flush for code points.
*
* Same as lxb_unicode_flush(), but it takes a callback with code points as
* input.
*
* @param[in] lxb_unicode_normalizer_t *.
* @param[in] Callback.
* @param[in] Callback context.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_flush_cp(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_cp_f cb,
void *ctx);
/*
* Change normalization form.
*
* You should only apply this function after one of the following actions:
* 1. The lxb_unicode_normalize() function was called with is_last = true.
* That is, the processing of the previous type was successfully
* completed.
* OR
* 2. The end of normalization function was called:
* lxb_unicode_normalize_end().
* OR
* 3. The lxb_unicode_normalizer_t object cleanup function was called:
* lxb_unicode_normalizer_clean().
*
*
* All this is to be able to normalize or quickly check text with different
* types without creating new objects.
*
* @param[in] lxb_unicode_normalizer_t *.
* @param[in] Normalization form.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_unicode_normalization_form_set(lxb_unicode_normalizer_t *uc,
lxb_unicode_form_t form);
LXB_API const lxb_unicode_entry_t *
lxb_unicode_entry(lxb_codepoint_t cp);
LXB_API const lxb_unicode_composition_cp_t *
lxb_unicode_compose_entry(lxb_codepoint_t first, lxb_codepoint_t second);
LXB_API lxb_unicode_idna_type_t
lxb_unicode_idna_type(lxb_codepoint_t cp);
LXB_API const lxb_unicode_composition_cp_t *
lxb_unicode_composition_cp(lxb_codepoint_t first, lxb_codepoint_t second);
LXB_API const lxb_unicode_normalization_entry_t *
lxb_unicode_normalization_entry(const lxb_unicode_entry_t *entry);
LXB_API const lxb_unicode_normalization_entry_t *
lxb_unicode_normalization_entry_by_cp(lxb_codepoint_t cp);
LXB_API const lxb_unicode_normalization_entry_t *
lxb_unicode_normalization_entry_by_index(uint16_t index);
LXB_API bool
lxb_unicode_normalization_is_null(const lxb_unicode_normalization_entry_t *entry);
LXB_API const lxb_codepoint_t *
lxb_unicode_full_canonical(const lxb_unicode_normalization_entry_t *entry,
size_t *out_length);
LXB_API const lxb_codepoint_t *
lxb_unicode_full_compatibility(const lxb_unicode_normalization_entry_t *entry,
size_t *out_length);
LXB_API const lxb_unicode_idna_entry_t *
lxb_unicode_idna_entry(const lxb_unicode_entry_t *entry);
LXB_API const lxb_unicode_idna_entry_t *
lxb_unicode_idna_entry_by_cp(lxb_codepoint_t cp);
LXB_API const lxb_unicode_idna_entry_t *
lxb_unicode_idna_entry_by_index(uint16_t index);
LXB_API const lxb_codepoint_t *
lxb_unicode_idna_map(const lxb_unicode_idna_entry_t *entry,
size_t *out_length);
/*
* Inline functions.
*/
/*
* Sets the buffer size for codepoints.
*
* By default, 4096 processed codepoints are accumulated before converting them
* to lxb_char_t and returning the result to the user via callback.
*
* If the count is set to 0, the user callback will be called for every
* codepoint processed. That is, it will be streaming without accumulation in
* the intermediate buffer.
*
* The value is stored in the flush_cp field of the normalizer.
*
* @param[in] lxb_unicode_normalizer_t *. Not NULL.
* @param[in] Count of codepoints in the buffer.
*/
lxb_inline void
lxb_unicode_flush_count_set(lxb_unicode_normalizer_t *uc, size_t count)
{
uc->flush_cp = count;
}
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* LEXBOR_UNICODE_H */

View file

@ -0,0 +1,32 @@
/*
* Copyright (C) 2023-2024 Alexander Borisov
*
* Author: Alexander Borisov <borisov@lexbor.com>
*/
#ifndef LEXBOR_URL_BASE_H
#define LEXBOR_URL_BASE_H
#ifdef __cplusplus
extern "C" {
#endif
#include "lexbor/core/base.h"
#include "lexbor/core/mraw.h"
#include "lexbor/core/str.h"
/*
* Version of the URL module.
*
* LXB_URL_VERSION_STRING is built from the three numeric parts joined
* with "." (presumably "0.3.0" here; LEXBOR_STRINGIZE is defined elsewhere --
* confirm that it expands its argument before stringizing).
*/
#define LXB_URL_VERSION_MAJOR 0
#define LXB_URL_VERSION_MINOR 3
#define LXB_URL_VERSION_PATCH 0
#define LXB_URL_VERSION_STRING LEXBOR_STRINGIZE(LXB_URL_VERSION_MAJOR) "." \
LEXBOR_STRINGIZE(LXB_URL_VERSION_MINOR) "." \
LEXBOR_STRINGIZE(LXB_URL_VERSION_PATCH)
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* LEXBOR_URL_BASE_H */

4845
ext/lexbor/lexbor/url/url.c Normal file

File diff suppressed because it is too large Load diff

551
ext/lexbor/lexbor/url/url.h Normal file
View file

@ -0,0 +1,551 @@
/*
* Copyright (C) 2023 Alexander Borisov
*
* Author: Alexander Borisov <borisov@lexbor.com>
*
* The URL Standard.
* By specification: https://url.spec.whatwg.org/
*/
#ifndef LEXBOR_URL_H
#define LEXBOR_URL_H
#ifdef __cplusplus
extern "C" {
#endif
#include "lexbor/url/base.h"
#include "lexbor/core/mraw.h"
#include "lexbor/core/plog.h"
#include "lexbor/encoding/encoding.h"
#include "lexbor/unicode/unicode.h"
/*
* Types of URL errors.
*
* NOTE(review): the names appear to mirror the validation errors of the
* URL Standard (https://url.spec.whatwg.org/) -- confirm.
*
* Values are sequential starting from 0x00, so
* LXB_URL_ERROR_TYPE__LAST_ENTRY is equal to the number of error types.
* It is not an error type itself.
*/
typedef enum {
LXB_URL_ERROR_TYPE_DOMAIN_TO_ASCII = 0x00,
LXB_URL_ERROR_TYPE_DOMAIN_TO_UNICODE,
LXB_URL_ERROR_TYPE_DOMAIN_INVALID_CODE_POINT,
LXB_URL_ERROR_TYPE_HOST_INVALID_CODE_POINT,
LXB_URL_ERROR_TYPE_IPV4_EMPTY_PART,
LXB_URL_ERROR_TYPE_IPV4_TOO_MANY_PARTS,
LXB_URL_ERROR_TYPE_IPV4_NON_NUMERIC_PART,
LXB_URL_ERROR_TYPE_IPV4_NON_DECIMAL_PART,
LXB_URL_ERROR_TYPE_IPV4_OUT_OF_RANGE_PART,
LXB_URL_ERROR_TYPE_IPV6_UNCLOSED,
LXB_URL_ERROR_TYPE_IPV6_INVALID_COMPRESSION,
LXB_URL_ERROR_TYPE_IPV6_TOO_MANY_PIECES,
LXB_URL_ERROR_TYPE_IPV6_MULTIPLE_COMPRESSION,
LXB_URL_ERROR_TYPE_IPV6_INVALID_CODE_POINT,
LXB_URL_ERROR_TYPE_IPV6_TOO_FEW_PIECES,
LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_TOO_MANY_PIECES,
LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_INVALID_CODE_POINT,
LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_OUT_OF_RANGE_PART,
LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_TOO_FEW_PARTS,
LXB_URL_ERROR_TYPE_INVALID_URL_UNIT,
LXB_URL_ERROR_TYPE_SPECIAL_SCHEME_MISSING_FOLLOWING_SOLIDUS,
LXB_URL_ERROR_TYPE_MISSING_SCHEME_NON_RELATIVE_URL,
LXB_URL_ERROR_TYPE_INVALID_REVERSE_SOLIDUS,
LXB_URL_ERROR_TYPE_INVALID_CREDENTIALS,
LXB_URL_ERROR_TYPE_HOST_MISSING,
LXB_URL_ERROR_TYPE_PORT_OUT_OF_RANGE,
LXB_URL_ERROR_TYPE_PORT_INVALID,
LXB_URL_ERROR_TYPE_FILE_INVALID_WINDOWS_DRIVE_LETTER,
LXB_URL_ERROR_TYPE_FILE_INVALID_WINDOWS_DRIVE_LETTER_HOST,
LXB_URL_ERROR_TYPE__LAST_ENTRY
}
lxb_url_error_type_t;
/*
* URL parser states.
*
* NOTE(review): the names appear to mirror the states of the basic URL parser
* of the URL Standard (https://url.spec.whatwg.org/#concept-basic-url-parser)
* -- confirm.
*
* LXB_URL_STATE__UNDEF (0x00) means that no state is set.
*/
typedef enum {
LXB_URL_STATE__UNDEF = 0x00,
LXB_URL_STATE_SCHEME_START_STATE,
LXB_URL_STATE_SCHEME_STATE,
LXB_URL_STATE_NO_SCHEME_STATE,
LXB_URL_STATE_SPECIAL_RELATIVE_OR_AUTHORITY_STATE,
LXB_URL_STATE_PATH_OR_AUTHORITY_STATE,
LXB_URL_STATE_RELATIVE_STATE,
LXB_URL_STATE_RELATIVE_SLASH_STATE,
LXB_URL_STATE_SPECIAL_AUTHORITY_SLASHES_STATE,
LXB_URL_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE,
LXB_URL_STATE_AUTHORITY_STATE,
LXB_URL_STATE_HOST_STATE,
LXB_URL_STATE_HOSTNAME_STATE,
LXB_URL_STATE_PORT_STATE,
LXB_URL_STATE_FILE_STATE,
LXB_URL_STATE_FILE_SLASH_STATE,
LXB_URL_STATE_FILE_HOST_STATE,
LXB_URL_STATE_PATH_START_STATE,
LXB_URL_STATE_PATH_STATE,
LXB_URL_STATE_OPAQUE_PATH_STATE,
LXB_URL_STATE_QUERY_STATE,
LXB_URL_STATE_FRAGMENT_STATE
}
lxb_url_state_t;
/*
* URL scheme types.
*
* New values must only be appended at the end of the list, immediately
* before LXB_URL_SCHEMEL_TYPE__LAST_ENTRY; existing values must keep their
* numbers.
*
* Please, see lxb_url_scheme_res in /lexbor/url/url.c.
*/
typedef enum {
LXB_URL_SCHEMEL_TYPE__UNDEF = 0x00,
LXB_URL_SCHEMEL_TYPE__UNKNOWN = 0x01,
LXB_URL_SCHEMEL_TYPE_HTTP = 0x02,
LXB_URL_SCHEMEL_TYPE_HTTPS = 0x03,
LXB_URL_SCHEMEL_TYPE_WS = 0x04,
LXB_URL_SCHEMEL_TYPE_WSS = 0x05,
LXB_URL_SCHEMEL_TYPE_FTP = 0x06,
LXB_URL_SCHEMEL_TYPE_FILE = 0x07,
LXB_URL_SCHEMEL_TYPE__LAST_ENTRY
}
lxb_url_scheme_type_t;
/*
* Description of a scheme: its name, a port and its type.
*
* NOTE(review): presumably an entry of the static table of known schemes
* (lxb_url_scheme_res in url.c) where `port` is the default port of the
* scheme -- confirm.
*/
typedef struct {
const lexbor_str_t name;
uint16_t port;
lxb_url_scheme_type_t type;
}
lxb_url_scheme_data_t;
/*
* Scheme of a URL: its name and type.
*/
typedef struct {
lexbor_str_t name;
lxb_url_scheme_type_t type;
}
lxb_url_scheme_t;
/*
* Types of a URL host.
*
* LXB_URL_HOST_TYPE__UNDEF (0x00) means that the type is not set.
*/
typedef enum {
LXB_URL_HOST_TYPE__UNDEF = 0x00,
LXB_URL_HOST_TYPE_DOMAIN = 0x01,
LXB_URL_HOST_TYPE_OPAQUE = 0x02,
LXB_URL_HOST_TYPE_IPV4 = 0x03,
LXB_URL_HOST_TYPE_IPV6 = 0x04,
LXB_URL_HOST_TYPE_EMPTY = 0x05
}
lxb_url_host_type_t;
/*
* Host of a URL.
*
* The value is stored in the union `u`.
* NOTE(review): presumably `type` tells which member of `u` is in use
* (ipv6 for LXB_URL_HOST_TYPE_IPV6, ipv4 for LXB_URL_HOST_TYPE_IPV4, and
* so on) -- confirm in url.c.
*/
typedef struct {
lxb_url_host_type_t type;
union {
uint16_t ipv6[8];
uint32_t ipv4;
lexbor_str_t opaque;
lexbor_str_t domain;
} u;
}
lxb_url_host_t;
/*
* Path of a URL.
*
* NOTE(review): the meaning of `length` (a field separate from str.length)
* is not evident from this header -- confirm in url.c.
* NOTE(review): `opaque` presumably marks an opaque path in terms of the
* URL Standard -- confirm.
*/
typedef struct {
lexbor_str_t str;
size_t length;
bool opaque;
}
lxb_url_path_t;
/*
* URL object.
*
* Contains the components of a URL: scheme, host, username, password, port,
* path, query and fragment.
*
* `mraw` is the pointer to the lexbor_mraw_t memory object bound to this URL.
*
* NOTE(review): presumably `port` is meaningful only when `has_port` is true
* -- confirm in url.c.
*/
typedef struct {
lxb_url_scheme_t scheme;
lxb_url_host_t host;
lexbor_str_t username;
lexbor_str_t password;
uint16_t port;
bool has_port;
lxb_url_path_t path;
lexbor_str_t query;
lexbor_str_t fragment;
lexbor_mraw_t *mraw;
}
lxb_url_t;
/*
* URL parser object.
*
* Created with lxb_url_parser_create() and initialized with
* lxb_url_parser_init().
*/
typedef struct {
/* URL object held by the parser; returned by lxb_url_get(). */
lxb_url_t *url;
/* Memory object for created URLs; see lxb_url_mraw() and lxb_url_mraw_set(). */
lexbor_mraw_t *mraw;
/* NOTE(review): presumably collects the parsing logs mentioned for the API functions -- confirm. */
lexbor_plog_t *log;
/* NOTE(review): presumably the IDNA context used for host processing -- confirm. */
lxb_unicode_idna_t *idna;
}
lxb_url_parser_t;
/*
* Creates an lxb_url_parser_t object.
*
* NOTE(review): the object presumably has to be initialized with
* lxb_url_parser_init() before it is used for parsing -- confirm.
*
* @return lxb_url_parser_t * if successful, otherwise NULL.
*/
LXB_API lxb_url_parser_t *
lxb_url_parser_create(void);
/*
* Initialization of lxb_url_parser_t object.
*
* The parser is not bound to the received URLs in any way. That is, after
* parsing, the lxb_url_parser_t object can be destroyed and we can continue
* working with the received URLs.
*
* Memory for created URLs is taken from a lexbor_mraw_t object, which you can
* pass during initialization of the lxb_url_parser_t object; if NULL is
* passed, a new lexbor_mraw_t object is created during initialization.
*
* Each created URL will have a pointer to the lexbor_mraw_t object.
*
* By destroying the lexbor_mraw_t object you destroy all the URL objects
* created by the parser. Use the lxb_url_destroy() function to destroy a
* specific URL.
*
* Destroying the lxb_url_parser_t object with lxb_url_parser_destroy() does
* not destroy the lexbor_mraw_t memory object.
*
* Please, see functions lxb_url_parser_memory_destroy(), lxb_url_destroy(),
* lxb_url_memory_destroy().
*
* @param[in] parser lxb_url_parser_t *.
* @param[in] mraw lexbor_mraw_t *. Can be NULL. If NULL is passed, the parser
* creates its own memory object, which will be bound to all created URLs.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_url_parser_init(lxb_url_parser_t *parser, lexbor_mraw_t *mraw);
/*
* Clears the object. Returns the object to the state it had right after
* initialization.
*
* This function must be called before the parsing functions can be reused.
*
* For example:
* lxb_url_parse()
* lxb_url_parser_clean()
* lxb_url_parse()
* lxb_url_destroy()
*
* @param[in] parser lxb_url_parser_t *.
*/
LXB_API void
lxb_url_parser_clean(lxb_url_parser_t *parser);
/*
* Destroy lxb_url_parser_t object.
*
* Release of occupied resources.
* The lexbor_mraw_t memory object is not destroyed in this function.
*
* @param[in] parser lxb_url_parser_t *. Can be NULL.
* @param[in] destroy_self if false: only destroys internal buffers.
* if true: destroys the lxb_url_parser_t object and all internal buffers.
*
* @return lxb_url_parser_t * if destroy_self = false, otherwise NULL.
*/
LXB_API lxb_url_parser_t *
lxb_url_parser_destroy(lxb_url_parser_t *parser, bool destroy_self);
/*
* Destroys the lexbor_mraw_t object, and thus all associated URLs.
*
* After that, new URLs cannot be parsed until a new lexbor_mraw_t object is
* assigned to the lxb_url_parser_t object, for example with
* lxb_url_mraw_set().
*
* @param[in] parser lxb_url_parser_t *.
*/
LXB_API void
lxb_url_parser_memory_destroy(lxb_url_parser_t *parser);
/*
* URL parser.
*
* This function is an implementation of URL parsing according to the WHATWG
* specification.
*
* @param[in] parser lxb_url_parser_t *.
* @param[in] base_url const lxb_url_t *. Base URL, can be NULL.
* @param[in] data Input characters. Not NULL.
* @param[in] length Length of characters. Can be 0.
*
* @return lxb_url_t * if successful, otherwise NULL.
*/
LXB_API lxb_url_t *
lxb_url_parse(lxb_url_parser_t *parser, const lxb_url_t *base_url,
const lxb_char_t *data, size_t length);
/*
* URL basic parser.
*
* This function is an implementation of URL parsing according to the WHATWG
* specification.
*
* Use the lxb_url_get() function to get the URL object.
*
* @param[in] parser lxb_url_parser_t *.
* @param[in] url lxb_url_t *. Can be NULL.
* @param[in] base_url const lxb_url_t *. Base URL, can be NULL.
* @param[in] data Input characters. Not NULL.
* @param[in] length Length of characters. Can be 0.
* @param[in] override_state lxb_url_state_t, by default set to
* LXB_URL_STATE__UNDEF.
* @param[in] encoding lxb_encoding_t, default (LXB_ENCODING_DEFAULT)
* LXB_ENCODING_UTF_8.
*
* @return LXB_STATUS_OK if successful, otherwise an error status value.
*/
LXB_API lxb_status_t
lxb_url_parse_basic(lxb_url_parser_t *parser, lxb_url_t *url,
const lxb_url_t *base_url,
const lxb_char_t *data, size_t length,
lxb_url_state_t override_state, lxb_encoding_t encoding);
/*
* Erase URL.
*
* Frees all internal memory occupied by the URL object, but does not destroy
* the object.
*
* This function does not return a value.
*
* @param[in] url lxb_url_t *.
*/
LXB_API void
lxb_url_erase(lxb_url_t *url);
/*
* Destroys URL.
*
* Destroys only the given URL; to destroy every URL that shares its
* lexbor_mraw_t memory object, see lxb_url_memory_destroy().
*
* @param[in] url lxb_url_t *.
*
* @return NULL.
*/
LXB_API lxb_url_t *
lxb_url_destroy(lxb_url_t *url);
/*
* Destroys the lexbor_mraw_t memory object.
*
* The function will destroy all URLs associated with the lexbor_mraw_t memory
* object, including the passed one.
*
* Keep in mind: if you have a live lxb_url_parser_t parsing object, after
* calling this function it will hold a pointer to garbage instead of a
* pointer to the lexbor_mraw_t object.
* In this case you need to assign a new lexbor_mraw_t memory object to the
* parser. Use the lxb_url_mraw_set() function.
*
* @param[in] url lxb_url_t *.
*/
LXB_API void
lxb_url_memory_destroy(lxb_url_t *url);
/*
* Below is an API for modifying the URL object according to the
* https://url.spec.whatwg.org/#api specification.
*
* It is not necessary to pass the lxb_url_parser_t object to API functions.
* You need to pass the parser if you want to have logs of parsing.
* lxb_url_api_username_set() and lxb_url_api_password_set() take no parser
* argument at all.
*
* All API functions can be passed NULL as "const lxb_char_t *" data.
*
* Each function takes the URL object to modify, the new value as a character
* buffer and the length of that buffer, and returns an lxb_status_t.
* NOTE(review): presumably LXB_STATUS_OK on success and an error status
* otherwise, as for the parser functions -- confirm in url.c.
*/
/* Setter for the "href" attribute of the specification. */
LXB_API lxb_status_t
lxb_url_api_href_set(lxb_url_t *url, lxb_url_parser_t *parser,
const lxb_char_t *href, size_t length);
/* Setter for the "protocol" attribute of the specification. */
LXB_API lxb_status_t
lxb_url_api_protocol_set(lxb_url_t *url, lxb_url_parser_t *parser,
const lxb_char_t *protocol, size_t length);
/* Setter for the "username" attribute of the specification. */
LXB_API lxb_status_t
lxb_url_api_username_set(lxb_url_t *url,
const lxb_char_t *username, size_t length);
/* Setter for the "password" attribute of the specification. */
LXB_API lxb_status_t
lxb_url_api_password_set(lxb_url_t *url,
const lxb_char_t *password, size_t length);
/* Setter for the "host" attribute of the specification. */
LXB_API lxb_status_t
lxb_url_api_host_set(lxb_url_t *url, lxb_url_parser_t *parser,
const lxb_char_t *host, size_t length);
/* Setter for the "hostname" attribute of the specification. */
LXB_API lxb_status_t
lxb_url_api_hostname_set(lxb_url_t *url, lxb_url_parser_t *parser,
const lxb_char_t *hostname, size_t length);
/* Setter for the "port" attribute of the specification. */
LXB_API lxb_status_t
lxb_url_api_port_set(lxb_url_t *url, lxb_url_parser_t *parser,
const lxb_char_t *port, size_t length);
/* Setter for the "pathname" attribute of the specification. */
LXB_API lxb_status_t
lxb_url_api_pathname_set(lxb_url_t *url, lxb_url_parser_t *parser,
const lxb_char_t *pathname, size_t length);
/* Setter for the "search" attribute of the specification. */
LXB_API lxb_status_t
lxb_url_api_search_set(lxb_url_t *url, lxb_url_parser_t *parser,
const lxb_char_t *search, size_t length);
/* Setter for the "hash" attribute of the specification. */
LXB_API lxb_status_t
lxb_url_api_hash_set(lxb_url_t *url, lxb_url_parser_t *parser,
const lxb_char_t *hash, size_t length);
/*
* Below are functions for serializing a URL object and its individual
* parameters.
*
* The serialized data is delivered through the callback `cb`, which is given
* the user context `ctx`.
*
* Note that the callback may be called more than once.
* For example, the lxb_url_serialize() function will call back multiple times:
* 1. http
* 2. ://
* 3. example.com
* and so on.
*
* Every function returns an lxb_status_t.
* NOTE(review): presumably LXB_STATUS_OK on success and an error status
* otherwise, as for the parser functions -- confirm in url.c.
*/
/*
* Serializes the whole URL.
* NOTE(review): exclude_fragment = true presumably omits the fragment from
* the output -- confirm.
*/
LXB_API lxb_status_t
lxb_url_serialize(const lxb_url_t *url, lexbor_serialize_cb_f cb, void *ctx,
bool exclude_fragment);
LXB_API lxb_status_t
lxb_url_serialize_scheme(const lxb_url_t *url,
lexbor_serialize_cb_f cb, void *ctx);
LXB_API lxb_status_t
lxb_url_serialize_username(const lxb_url_t *url,
lexbor_serialize_cb_f cb, void *ctx);
LXB_API lxb_status_t
lxb_url_serialize_password(const lxb_url_t *url,
lexbor_serialize_cb_f cb, void *ctx);
/* Takes the host itself, not the URL; use lxb_url_host() to obtain it. */
LXB_API lxb_status_t
lxb_url_serialize_host(const lxb_url_host_t *host,
lexbor_serialize_cb_f cb, void *ctx);
/*
* Like lxb_url_serialize_host(), but additionally takes an
* lxb_unicode_idna_t object.
* NOTE(review): presumably produces the Unicode form of a domain -- confirm.
*/
LXB_API lxb_status_t
lxb_url_serialize_host_unicode(lxb_unicode_idna_t *idna,
const lxb_url_host_t *host,
lexbor_serialize_cb_f cb, void *ctx);
/* Takes the address as a single 32-bit value, as in lxb_url_host_t.u.ipv4. */
LXB_API lxb_status_t
lxb_url_serialize_host_ipv4(uint32_t ipv4,
lexbor_serialize_cb_f cb, void *ctx);
/*
* Takes a pointer to 16-bit values.
* NOTE(review): presumably expects eight of them, as in
* lxb_url_host_t.u.ipv6 -- confirm.
*/
LXB_API lxb_status_t
lxb_url_serialize_host_ipv6(const uint16_t *ipv6,
lexbor_serialize_cb_f cb, void *ctx);
LXB_API lxb_status_t
lxb_url_serialize_port(const lxb_url_t *url,
lexbor_serialize_cb_f cb, void *ctx);
/* Takes the path itself, not the URL; use lxb_url_path() to obtain it. */
LXB_API lxb_status_t
lxb_url_serialize_path(const lxb_url_path_t *path,
lexbor_serialize_cb_f cb, void *ctx);
LXB_API lxb_status_t
lxb_url_serialize_query(const lxb_url_t *url,
lexbor_serialize_cb_f cb, void *ctx);
LXB_API lxb_status_t
lxb_url_serialize_fragment(const lxb_url_t *url,
lexbor_serialize_cb_f cb, void *ctx);
/*
* Creates a clone of the URL object.
*
* For lexbor_mraw_t *, use url->mraw or another lexbor_mraw_t * object.
*
* @param[in] mraw lexbor_mraw_t *. Memory object for the clone.
* @param[in] url lxb_url_t *. URL object to clone.
*
* @return a new URL object if successful, otherwise NULL value.
*/
LXB_API lxb_url_t *
lxb_url_clone(lexbor_mraw_t *mraw, lxb_url_t *url);
/*
* Inline functions.
*/
/*
 * Gives read-only access to the scheme name of the URL.
 *
 * @param[in] url const lxb_url_t *. Not NULL (it is dereferenced).
 *
 * @return pointer to url->scheme.name.
 */
lxb_inline const lexbor_str_t *
lxb_url_scheme(const lxb_url_t *url)
{
    const lexbor_str_t *scheme_name = &url->scheme.name;

    return scheme_name;
}
/*
 * Gives read-only access to the username of the URL.
 *
 * @param[in] url const lxb_url_t *. Not NULL (it is dereferenced).
 *
 * @return pointer to url->username.
 */
lxb_inline const lexbor_str_t *
lxb_url_username(const lxb_url_t *url)
{
    const lexbor_str_t *user = &url->username;

    return user;
}
/*
 * Gives read-only access to the password of the URL.
 *
 * @param[in] url const lxb_url_t *. Not NULL (it is dereferenced).
 *
 * @return pointer to url->password.
 */
lxb_inline const lexbor_str_t *
lxb_url_password(const lxb_url_t *url)
{
    const lexbor_str_t *pass = &url->password;

    return pass;
}
/*
 * Gives read-only access to the host of the URL.
 *
 * @param[in] url const lxb_url_t *. Not NULL (it is dereferenced).
 *
 * @return pointer to url->host.
 */
lxb_inline const lxb_url_host_t *
lxb_url_host(const lxb_url_t *url)
{
    const lxb_url_host_t *host = &url->host;

    return host;
}
/*
 * Reads the port number stored in the URL.
 *
 * See lxb_url_has_port() for the accompanying flag.
 *
 * @param[in] url const lxb_url_t *. Not NULL (it is dereferenced).
 *
 * @return value of url->port.
 */
lxb_inline uint16_t
lxb_url_port(const lxb_url_t *url)
{
    uint16_t port = url->port;

    return port;
}
/*
 * Reads the has_port flag of the URL.
 *
 * @param[in] url const lxb_url_t *. Not NULL (it is dereferenced).
 *
 * @return value of url->has_port.
 */
lxb_inline bool
lxb_url_has_port(const lxb_url_t *url)
{
    bool port_present = url->has_port;

    return port_present;
}
/*
 * Gives read-only access to the path object of the URL.
 *
 * @param[in] url const lxb_url_t *. Not NULL (it is dereferenced).
 *
 * @return pointer to url->path.
 */
lxb_inline const lxb_url_path_t *
lxb_url_path(const lxb_url_t *url)
{
    const lxb_url_path_t *path = &url->path;

    return path;
}
/*
 * Gives read-only access to the string stored in the path of the URL.
 *
 * @param[in] url const lxb_url_t *. Not NULL (it is dereferenced).
 *
 * @return pointer to url->path.str.
 */
lxb_inline const lexbor_str_t *
lxb_url_path_str(const lxb_url_t *url)
{
    const lexbor_str_t *path_string = &url->path.str;

    return path_string;
}
/*
 * Gives read-only access to the query of the URL.
 *
 * @param[in] url const lxb_url_t *. Not NULL (it is dereferenced).
 *
 * @return pointer to url->query.
 */
lxb_inline const lexbor_str_t *
lxb_url_query(const lxb_url_t *url)
{
    const lexbor_str_t *query = &url->query;

    return query;
}
/*
 * Gives read-only access to the fragment of the URL.
 *
 * @param[in] url const lxb_url_t *. Not NULL (it is dereferenced).
 *
 * @return pointer to url->fragment.
 */
lxb_inline const lexbor_str_t *
lxb_url_fragment(const lxb_url_t *url)
{
    const lexbor_str_t *fragment = &url->fragment;

    return fragment;
}
/*
 * Reads the memory object currently assigned to the parser.
 *
 * @param[in] parser lxb_url_parser_t *. Not NULL (it is dereferenced).
 *
 * @return value of parser->mraw.
 */
lxb_inline lexbor_mraw_t *
lxb_url_mraw(lxb_url_parser_t *parser)
{
    lexbor_mraw_t *memory = parser->mraw;

    return memory;
}
/*
 * Assigns a memory object to the parser.
 *
 * Only the pointer stored in the parser is overwritten; the previously
 * assigned memory object is not touched by this function.
 *
 * @param[in] parser lxb_url_parser_t *. Not NULL (it is dereferenced).
 * @param[in] mraw lexbor_mraw_t *. Stored as parser->mraw.
 */
lxb_inline void
lxb_url_mraw_set(lxb_url_parser_t *parser, lexbor_mraw_t *mraw)
{
    lexbor_mraw_t **slot = &parser->mraw;

    *slot = mraw;
}
/*
 * Reads the URL object currently held by the parser, for example after a
 * call to lxb_url_parse_basic().
 *
 * @param[in] parser lxb_url_parser_t *. Not NULL (it is dereferenced).
 *
 * @return value of parser->url.
 */
lxb_inline lxb_url_t *
lxb_url_get(lxb_url_parser_t *parser)
{
    lxb_url_t *current = parser->url;

    return current;
}
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* LEXBOR_URL_H */