mirror of
https://github.com/php/php-src.git
synced 2025-08-15 13:38:49 +02:00
Add Lexbor files for URL handling (#18656)
Relates to #14461 and https://wiki.php.net/rfc/url_parsing_api
This commit is contained in:
parent
d585a5609d
commit
400b7b8c74
15 changed files with 210811 additions and 1 deletions
|
@ -7,6 +7,9 @@ ignore:
|
|||
- "ext/lexbor/lexbor/html"
|
||||
- "ext/lexbor/lexbor/ns"
|
||||
- "ext/lexbor/lexbor/ports"
|
||||
- "ext/lexbor/lexbor/punycode"
|
||||
- "ext/lexbor/lexbor/tag"
|
||||
- "ext/lexbor/lexbor/unicode"
|
||||
- "ext/lexbor/lexbor/url"
|
||||
- "ext/pcre/pcre2lib"
|
||||
- "ext/uri/uriparser"
|
||||
|
|
|
@ -17,6 +17,7 @@ PHP_NEW_EXTENSION([lexbor], m4_normalize([
|
|||
$LEXBOR_DIR/core/hash.c
|
||||
$LEXBOR_DIR/core/mem.c
|
||||
$LEXBOR_DIR/core/mraw.c
|
||||
$LEXBOR_DIR/core/plog.c
|
||||
$LEXBOR_DIR/core/print.c
|
||||
$LEXBOR_DIR/core/serialize.c
|
||||
$LEXBOR_DIR/core/shs.c
|
||||
|
@ -174,7 +175,11 @@ PHP_NEW_EXTENSION([lexbor], m4_normalize([
|
|||
$LEXBOR_DIR/html/tree/open_elements.c
|
||||
$LEXBOR_DIR/ns/ns.c
|
||||
$LEXBOR_DIR/ports/posix/lexbor/core/memory.c
|
||||
$LEXBOR_DIR/punycode/punycode.c
|
||||
$LEXBOR_DIR/tag/tag.c
|
||||
$LEXBOR_DIR/unicode/idna.c
|
||||
$LEXBOR_DIR/unicode/unicode.c
|
||||
$LEXBOR_DIR/url/url.c
|
||||
]),
|
||||
[no],,
|
||||
[-DZEND_ENABLE_STATIC_TSRMLS_CACHE=1 $PHP_LEXBOR_CFLAGS])
|
||||
|
@ -193,7 +198,10 @@ PHP_ADD_BUILD_DIR([
|
|||
$ext_builddir/$LEXBOR_DIR/html/tree/insertion_mode
|
||||
$ext_builddir/$LEXBOR_DIR/ns
|
||||
$ext_builddir/$LEXBOR_DIR/ports/posix/lexbor/core
|
||||
$ext_builddir/$LEXBOR_DIR/punycode
|
||||
$ext_builddir/$LEXBOR_DIR/tag
|
||||
$ext_builddir/$LEXBOR_DIR/unicode
|
||||
$ext_builddir/$LEXBOR_DIR/url
|
||||
])
|
||||
PHP_ADD_INCLUDE([$ext_srcdir])
|
||||
PHP_INSTALL_HEADERS([ext/lexbor], m4_normalize([
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
EXTENSION("lexbor", "php_lexbor.c", false, "/I " + configure_module_dirname + " /DZEND_ENABLE_STATIC_TSRMLS_CACHE=1");
|
||||
PHP_LEXBOR="yes";
|
||||
ADD_SOURCES("ext/lexbor/lexbor/ports/windows_nt/lexbor/core", "memory.c", "lexbor");
|
||||
ADD_SOURCES("ext/lexbor/lexbor/core", "array_obj.c array.c avl.c bst.c diyfp.c conv.c dobject.c dtoa.c hash.c mem.c mraw.c print.c serialize.c shs.c str.c strtod.c", "lexbor");
|
||||
ADD_SOURCES("ext/lexbor/lexbor/core", "array_obj.c array.c avl.c bst.c diyfp.c conv.c dobject.c dtoa.c hash.c mem.c mraw.c plog.c print.c serialize.c shs.c str.c strtod.c", "lexbor");
|
||||
ADD_SOURCES("ext/lexbor/lexbor/dom", "interface.c", "lexbor");
|
||||
ADD_SOURCES("ext/lexbor/lexbor/dom/interfaces", "attr.c cdata_section.c character_data.c comment.c document.c document_fragment.c document_type.c element.c node.c processing_instruction.c shadow_root.c text.c", "lexbor");
|
||||
ADD_SOURCES("ext/lexbor/lexbor/html/tokenizer", "error.c state_comment.c state_doctype.c state_rawtext.c state_rcdata.c state_script.c state.c", "lexbor");
|
||||
|
@ -17,7 +17,10 @@ ADD_SOURCES("ext/lexbor/lexbor/css/selectors", "state.c selectors.c selector.c p
|
|||
ADD_SOURCES("ext/lexbor/lexbor/css/syntax", "state.c parser.c syntax.c anb.c tokenizer.c token.c","lexbor");
|
||||
ADD_SOURCES("ext/lexbor/lexbor/css/syntax/tokenizer", "error.c","lexbor");
|
||||
ADD_SOURCES("ext/lexbor/lexbor/ns", "ns.c","lexbor");
|
||||
ADD_SOURCES("ext/lexbor/lexbor/punycode", "punycode.c","lexbor");
|
||||
ADD_SOURCES("ext/lexbor/lexbor/tag", "tag.c","lexbor");
|
||||
ADD_SOURCES("ext/lexbor/lexbor/unicode", "idna.c unicode.c","lexbor");
|
||||
ADD_SOURCES("ext/lexbor/lexbor/url", "url.c","lexbor");
|
||||
ADD_FLAG("CFLAGS_LEXBOR", "/D LEXBOR_BUILDING /utf-8");
|
||||
|
||||
AC_DEFINE("HAVE_LEXBOR", 1, "Define to 1 if the PHP extension 'lexbor' is available.");
|
||||
|
|
30
ext/lexbor/lexbor/punycode/base.h
Normal file
30
ext/lexbor/lexbor/punycode/base.h
Normal file
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
* Copyright (C) 2023-2024 Alexander Borisov
|
||||
*
|
||||
* Author: Alexander Borisov <borisov@lexbor.com>
|
||||
*/
|
||||
|
||||
#ifndef LEXBOR_PUNYCODE_BASE_H
|
||||
#define LEXBOR_PUNYCODE_BASE_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "lexbor/core/base.h"
|
||||
|
||||
|
||||
#define LXB_PUNYCODE_VERSION_MAJOR 1
|
||||
#define LXB_PUNYCODE_VERSION_MINOR 1
|
||||
#define LXB_PUNYCODE_VERSION_PATCH 0
|
||||
|
||||
#define LEXBOR_PUNYCODE_VERSION_STRING LEXBOR_STRINGIZE(LXB_PUNYCODE_VERSION_MAJOR) "." \
|
||||
LEXBOR_STRINGIZE(LXB_PUNYCODE_VERSION_MINOR) "." \
|
||||
LEXBOR_STRINGIZE(LXB_PUNYCODE_VERSION_PATCH)
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* LEXBOR_PUNYCODE_BASE_H */
|
671
ext/lexbor/lexbor/punycode/punycode.c
Normal file
671
ext/lexbor/lexbor/punycode/punycode.c
Normal file
|
@ -0,0 +1,671 @@
|
|||
/*
|
||||
* Copyright (C) 2023-2024 Alexander Borisov
|
||||
*
|
||||
* Author: Alexander Borisov <borisov@lexbor.com>
|
||||
*/
|
||||
|
||||
#include "lexbor/punycode/punycode.h"
|
||||
#include "lexbor/encoding/encoding.h"
|
||||
|
||||
|
||||
/*
 * Bootstring parameter values used by this Punycode implementation
 * (see the RFC 3492 reference in punycode.h).
 */
enum {
    LXB_PUNYCODE_BASE         = 36,   /* Number of digit values (a-z, 0-9). */
    LXB_PUNYCODE_TMIN         = 1,    /* Lower clamp for the threshold t. */
    LXB_PUNYCODE_TMAX         = 26,   /* Upper clamp for the threshold t. */
    LXB_PUNYCODE_SKEW         = 38,   /* Used by the bias adaptation. */
    LXB_PUNYCODE_DAMP         = 700,  /* Divisor for the very first delta. */
    LXB_PUNYCODE_INITIAL_BIAS = 72,   /* Starting bias. */
    LXB_PUNYCODE_INITIAL_N    = 0x80, /* First non-basic code point. */
    LXB_PUNYCODE_DELIMITER    = 0x2D  /* '-' separates basic part and digits. */
};
|
||||
|
||||
|
||||
static lxb_status_t
|
||||
lxb_punycode_callback_cp(const lxb_codepoint_t *cps, size_t len, void *ctx);
|
||||
|
||||
|
||||
/*
 * Double the capacity of the encoder output buffer.
 *
 * While *buf still points at the caller's stack array (buffer), a fresh heap
 * block is allocated and the current content is copied into it; afterwards
 * the heap block is grown in place with realloc.
 *
 * @param[in]      p       Current write position (not used by the
 *                         computation; the returned position is derived
 *                         from the old capacity).
 * @param[in, out] buf     Start of the output buffer, updated on success.
 * @param[in, out] end     One-past-the-end of the buffer, updated on success.
 * @param[in]      buffer  The caller's initial stack buffer.
 *
 * @return New write position (new start + old capacity), or NULL on
 * allocation failure. When growing a heap block fails, that block is freed
 * before returning and *buf is left pointing at it.
 */
lxb_inline lxb_char_t *
lxb_punycode_encode_realloc(lxb_char_t *p, lxb_char_t **buf,
                            const lxb_char_t **end, const lxb_char_t *buffer)
{
    lxb_char_t *grown;
    size_t used = *end - *buf;
    size_t capacity = used * 2;

    (void) p;

    if (*buf != buffer) {
        /* Already on the heap: grow in place. */
        grown = lexbor_realloc(*buf, capacity);
        if (grown == NULL) {
            return lexbor_free(*buf);
        }
    }
    else {
        /* First growth: move from the stack array to the heap. */
        grown = lexbor_malloc(capacity);
        if (grown == NULL) {
            return NULL;
        }

        memcpy(grown, *buf, used);
    }

    *end = grown + capacity;
    *buf = grown;

    return grown + used;
}
|
||||
|
||||
/*
 * Double the capacity (counted in code points) of the decoder output buffer.
 *
 * While *buf still points at the caller's stack array (buffer), a fresh heap
 * block is allocated and the current content is copied into it; afterwards
 * the heap block is grown in place with realloc.
 *
 * @param[in]      p       Current write position (not used by the
 *                         computation; the returned position is derived
 *                         from the old capacity).
 * @param[in, out] buf     Start of the output buffer, updated on success.
 * @param[in, out] end     One-past-the-end of the buffer, updated on success.
 * @param[in]      buffer  The caller's initial stack buffer.
 *
 * @return New write position (new start + old capacity), or NULL on
 * allocation failure. When growing a heap block fails, that block is freed
 * before returning and *buf is left pointing at it.
 */
lxb_inline lxb_codepoint_t *
lxb_punycode_decode_realloc(lxb_codepoint_t *p, lxb_codepoint_t **buf,
                            const lxb_codepoint_t **end,
                            const lxb_codepoint_t *buffer)
{
    lxb_codepoint_t *grown;
    size_t used = *end - *buf;
    size_t capacity = used * 2;

    (void) p;

    if (*buf != buffer) {
        /* Already on the heap: grow in place. */
        grown = lexbor_realloc(*buf, capacity * sizeof(lxb_codepoint_t));
        if (grown == NULL) {
            return lexbor_free(*buf);
        }
    }
    else {
        /* First growth: move from the stack array to the heap. */
        grown = lexbor_malloc(capacity * sizeof(lxb_codepoint_t));
        if (grown == NULL) {
            return NULL;
        }

        memcpy(grown, *buf, used * sizeof(lxb_codepoint_t));
    }

    *end = grown + capacity;
    *buf = grown;

    return grown + used;
}
|
||||
|
||||
/*
 * Map a Punycode digit value to its ASCII character:
 * 0..25 become 'a'..'z' (0x61..0x7A), 26..35 become '0'..'9' (0x30..0x39).
 */
static char
lxb_punycode_encode_digit(size_t d)
{
    if (d < 26) {
        return d + 0x61;
    }

    return d + 0x30 - 26;
}
|
||||
|
||||
static size_t
|
||||
lxb_punycode_decode_digit(lxb_codepoint_t cp)
|
||||
{
|
||||
return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65
|
||||
: cp - 97 < 26 ? cp - 97 : LXB_PUNYCODE_BASE;
|
||||
}
|
||||
|
||||
static size_t
|
||||
lxb_punycode_adapt(size_t delta, size_t numpoints, bool firsttime)
|
||||
{
|
||||
size_t k;
|
||||
|
||||
delta = firsttime ? delta / LXB_PUNYCODE_DAMP : delta >> 1;
|
||||
delta += delta / numpoints;
|
||||
|
||||
for (k = 0;
|
||||
delta > ((LXB_PUNYCODE_BASE - LXB_PUNYCODE_TMIN) * LXB_PUNYCODE_TMAX) / 2;
|
||||
k += LXB_PUNYCODE_BASE)
|
||||
{
|
||||
delta /= LXB_PUNYCODE_BASE - LXB_PUNYCODE_TMIN;
|
||||
}
|
||||
|
||||
return k + (LXB_PUNYCODE_BASE - LXB_PUNYCODE_TMIN + 1)
|
||||
* delta / (delta + LXB_PUNYCODE_SKEW);
|
||||
}
|
||||
|
||||
static lxb_status_t
|
||||
lxb_punycode_encode_body(const lxb_codepoint_t *cps, const lxb_codepoint_t *cps_end,
|
||||
lxb_char_t *p, lxb_char_t *buf, const lxb_char_t *end,
|
||||
const lxb_char_t *buffer, lxb_punycode_encode_cb_f cb,
|
||||
void *ctx)
|
||||
{
|
||||
bool unchanged;
|
||||
size_t h, b, n, q, k, t, delta, bias;
|
||||
lxb_status_t status;
|
||||
lxb_codepoint_t cp, m;
|
||||
const lxb_codepoint_t *cps_t, *cps_p;
|
||||
|
||||
n = LXB_PUNYCODE_INITIAL_N;
|
||||
bias = LXB_PUNYCODE_INITIAL_BIAS;
|
||||
delta = 0;
|
||||
b = p - buf;
|
||||
cps_p = cps + b;
|
||||
|
||||
if (cps_p >= cps_end) {
|
||||
unchanged = true;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (p > buf) {
|
||||
*p++ = LXB_PUNYCODE_DELIMITER;
|
||||
}
|
||||
|
||||
unchanged = false;
|
||||
|
||||
while (cps_p < cps_end) {
|
||||
m = UINT32_MAX;
|
||||
cps_t = cps;
|
||||
|
||||
while (cps_t < cps_end) {
|
||||
cp = *cps_t++;
|
||||
|
||||
if (cp >= n && cp < m) {
|
||||
m = cp;
|
||||
}
|
||||
}
|
||||
|
||||
h = (cps_p - cps) + 1;
|
||||
|
||||
if (m - n > (UINT32_MAX - delta) / h) {
|
||||
status = LXB_STATUS_ERROR_OVERFLOW;
|
||||
goto failed;
|
||||
}
|
||||
|
||||
delta += (m - n) * h;
|
||||
n = m;
|
||||
|
||||
cps_t = cps;
|
||||
|
||||
while (cps_t < cps_end) {
|
||||
cp = *cps_t++;
|
||||
|
||||
if (cp < n) {
|
||||
if (++delta == 0) {
|
||||
status = LXB_STATUS_ERROR_OVERFLOW;
|
||||
goto failed;
|
||||
}
|
||||
}
|
||||
|
||||
if (cp == n) {
|
||||
q = delta;
|
||||
k = LXB_PUNYCODE_BASE;
|
||||
|
||||
for (;; k += LXB_PUNYCODE_BASE) {
|
||||
t = k <= bias ? LXB_PUNYCODE_TMIN :
|
||||
k >= bias + LXB_PUNYCODE_TMAX
|
||||
? LXB_PUNYCODE_TMAX : k - bias;
|
||||
|
||||
if (q < t) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (p >= end) {
|
||||
p = lxb_punycode_encode_realloc(p, &buf, &end, buffer);
|
||||
if (p == NULL) {
|
||||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
}
|
||||
}
|
||||
|
||||
*p++ = lxb_punycode_encode_digit(t + (q - t)
|
||||
% (LXB_PUNYCODE_BASE - t));
|
||||
q = (q - t) / (LXB_PUNYCODE_BASE - t);
|
||||
}
|
||||
|
||||
h = cps_p - cps;
|
||||
|
||||
if (p >= end) {
|
||||
p = lxb_punycode_encode_realloc(p, &buf, &end, buffer);
|
||||
if (p == NULL) {
|
||||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
}
|
||||
}
|
||||
|
||||
*p++ = lxb_punycode_encode_digit(q);
|
||||
bias = lxb_punycode_adapt(delta, h + 1, h == b);
|
||||
delta = 0;
|
||||
cps_p += 1;
|
||||
}
|
||||
}
|
||||
|
||||
delta += 1;
|
||||
n += 1;
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
status = cb(buf, p - buf, ctx, unchanged);
|
||||
|
||||
failed:
|
||||
|
||||
if (buf != buffer) {
|
||||
(void) lexbor_free(buf);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
lxb_status_t
|
||||
lxb_punycode_encode(const lxb_char_t *data, size_t length,
|
||||
lxb_punycode_encode_cb_f cb, void *ctx)
|
||||
{
|
||||
size_t cp_length;
|
||||
uint8_t len;
|
||||
lxb_char_t *p, *buf;
|
||||
lxb_status_t status;
|
||||
lxb_codepoint_t cp, *cps, *cps_p;
|
||||
const lxb_char_t *data_p, *data_end, *end;
|
||||
const lxb_codepoint_t *cps_end;
|
||||
lxb_char_t buffer[4096];
|
||||
lxb_codepoint_t input[4096];
|
||||
|
||||
/*
|
||||
* Make GCC happy.
|
||||
* length variable can be 0.
|
||||
*/
|
||||
input[0] = 0x00;
|
||||
|
||||
p = buffer;
|
||||
buf = buffer;
|
||||
end = buffer + sizeof(buffer);
|
||||
|
||||
data_p = data;
|
||||
data_end = data + length;
|
||||
cp_length = 0;
|
||||
|
||||
while (data_p < data_end) {
|
||||
len = lxb_encoding_decode_utf_8_length(*data_p);
|
||||
if (len == 0) {
|
||||
return LXB_STATUS_ERROR_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
data_p += len;
|
||||
cp_length += 1;
|
||||
}
|
||||
|
||||
if (cp_length <= sizeof(input) / sizeof(lxb_codepoint_t)) {
|
||||
cps = input;
|
||||
}
|
||||
else {
|
||||
cps = lexbor_malloc(cp_length * sizeof(lxb_codepoint_t));
|
||||
if (cps == NULL) {
|
||||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
}
|
||||
}
|
||||
|
||||
data_p = data;
|
||||
|
||||
cps_p = cps;
|
||||
cps_end = cps + cp_length;
|
||||
|
||||
while (data_p < data_end) {
|
||||
cp = lxb_encoding_decode_valid_utf_8_single(&data_p, data_end);
|
||||
if (cp == LXB_ENCODING_DECODE_ERROR) {
|
||||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
|
||||
goto done;
|
||||
}
|
||||
|
||||
*cps_p++ = cp;
|
||||
|
||||
if (cp < 0x80) {
|
||||
if (p >= end) {
|
||||
p = lxb_punycode_encode_realloc(p, &buf, &end, buffer);
|
||||
if (p == NULL) {
|
||||
status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
*p++ = cp;
|
||||
}
|
||||
}
|
||||
|
||||
status = lxb_punycode_encode_body(cps, cps_end, p, buf, end, buffer,
|
||||
cb, ctx);
|
||||
done:
|
||||
|
||||
if (cps != input) {
|
||||
(void) lexbor_free(cps);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
lxb_status_t
|
||||
lxb_punycode_encode_cp(const lxb_codepoint_t *cps, size_t length,
|
||||
lxb_punycode_encode_cb_f cb, void *ctx)
|
||||
{
|
||||
lxb_char_t *p, *buf;
|
||||
lxb_codepoint_t cp;
|
||||
const lxb_char_t *end;
|
||||
const lxb_codepoint_t *cps_p, *cps_end;
|
||||
lxb_char_t buffer[4096];
|
||||
|
||||
p = buffer;
|
||||
buf = buffer;
|
||||
end = buffer + sizeof(buffer);
|
||||
|
||||
cps_p = cps;
|
||||
cps_end = cps + length;
|
||||
|
||||
while (cps_p < cps_end) {
|
||||
cp = *cps_p++;
|
||||
|
||||
if (cp < 0x80) {
|
||||
if (p >= end) {
|
||||
p = lxb_punycode_encode_realloc(p, &buf, &end, buffer);
|
||||
if (p == NULL) {
|
||||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
}
|
||||
}
|
||||
|
||||
*p++ = cp;
|
||||
}
|
||||
}
|
||||
|
||||
return lxb_punycode_encode_body(cps, cps_end, p, buf, end, buffer, cb, ctx);
|
||||
}
|
||||
|
||||
lxb_status_t
|
||||
lxb_punycode_decode(const lxb_char_t *data, size_t length,
|
||||
lexbor_serialize_cb_f cb, void *ctx)
|
||||
{
|
||||
lexbor_serialize_ctx_t nctx = {.cb = cb, .ctx = ctx};
|
||||
|
||||
return lxb_punycode_decode_cb_cp(data, length, lxb_punycode_callback_cp,
|
||||
&nctx);
|
||||
}
|
||||
|
||||
static lxb_status_t
|
||||
lxb_punycode_callback_cp(const lxb_codepoint_t *cps, size_t len, void *ctx)
|
||||
{
|
||||
uint8_t i;
|
||||
size_t length;
|
||||
lxb_status_t status;
|
||||
const lxb_codepoint_t *cps_p, *cps_end;
|
||||
lexbor_serialize_ctx_t *nctx = ctx;
|
||||
lxb_char_t *p, *buf, *end;
|
||||
lxb_char_t buffer[4096];
|
||||
|
||||
/*
|
||||
* Make GCC happy.
|
||||
* len variable can be 0.
|
||||
*/
|
||||
buffer[0] = 0x00;
|
||||
|
||||
cps_p = cps;
|
||||
cps_end = cps_p + len;
|
||||
length = 0;
|
||||
|
||||
while (cps_p < cps_end) {
|
||||
i = lxb_encoding_encode_utf_8_length(*cps_p++);
|
||||
if (i == 0) {
|
||||
return LXB_STATUS_ERROR_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
length += i;
|
||||
}
|
||||
|
||||
buf = buffer;
|
||||
end = buffer + sizeof(buffer);
|
||||
|
||||
if (buf + length > end) {
|
||||
buf = lexbor_malloc(length);
|
||||
if (buf == NULL) {
|
||||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
}
|
||||
|
||||
end = buf + length;
|
||||
}
|
||||
|
||||
p = buf;
|
||||
cps_p = cps;
|
||||
|
||||
while (cps_p < cps_end) {
|
||||
(void) lxb_encoding_encode_utf_8_single(NULL, &p, end, *cps_p++);
|
||||
}
|
||||
|
||||
status = nctx->cb(buf, p - buf, nctx->ctx);
|
||||
|
||||
if (buf != buffer) {
|
||||
(void) lexbor_free(buf);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
lxb_status_t
|
||||
lxb_punycode_decode_cp(const lxb_codepoint_t *data, size_t length,
|
||||
lexbor_serialize_cb_cp_f cb, void *ctx)
|
||||
{
|
||||
size_t buf_len, digit, oldi, bias, w, k, t, i, h, in;
|
||||
const lxb_codepoint_t *delimiter, *data_p, *data_end;
|
||||
lxb_status_t status;
|
||||
lxb_codepoint_t cp, n;
|
||||
lxb_codepoint_t *p, *buf;
|
||||
const lxb_codepoint_t *end;
|
||||
lxb_codepoint_t buffer[4096];
|
||||
|
||||
p = buffer;
|
||||
buf = buffer;
|
||||
buf_len = sizeof(buffer) / sizeof(lxb_codepoint_t);
|
||||
end = buffer + buf_len;
|
||||
|
||||
data_p = data;
|
||||
data_end = data + length;
|
||||
delimiter = data_end;
|
||||
|
||||
while (delimiter != data) {
|
||||
delimiter -= 1;
|
||||
|
||||
if (*delimiter == LXB_PUNYCODE_DELIMITER) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
while (data_p < delimiter) {
|
||||
cp = *data_p++;
|
||||
|
||||
if (cp >= 0x80) {
|
||||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (p >= end) {
|
||||
p = lxb_punycode_decode_realloc(p, &buf, &end, buffer);
|
||||
if (p == NULL) {
|
||||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
}
|
||||
}
|
||||
|
||||
*p++ = cp;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
n = LXB_PUNYCODE_INITIAL_N;
|
||||
bias = LXB_PUNYCODE_INITIAL_BIAS;
|
||||
data_p = (delimiter != data) ? delimiter + 1: data;
|
||||
in = data_p - data;
|
||||
|
||||
for (; in < length; p++) {
|
||||
for (oldi = i, w = 1, k = LXB_PUNYCODE_BASE; ; k += LXB_PUNYCODE_BASE) {
|
||||
if (in >= length) {
|
||||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
|
||||
goto done;
|
||||
}
|
||||
|
||||
cp = data[in++];
|
||||
digit = lxb_punycode_decode_digit(cp);
|
||||
|
||||
if (digit >= LXB_PUNYCODE_BASE) {
|
||||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (digit > (UINT32_MAX - i) / w) {
|
||||
status = LXB_STATUS_ERROR_OVERFLOW;
|
||||
goto done;
|
||||
}
|
||||
|
||||
i += digit * w;
|
||||
t = k <= bias ? LXB_PUNYCODE_TMIN
|
||||
: k >= bias + LXB_PUNYCODE_TMAX ? LXB_PUNYCODE_TMAX : k - bias;
|
||||
|
||||
if (digit < t) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (w > UINT32_MAX / (LXB_PUNYCODE_BASE - t)) {
|
||||
status = LXB_STATUS_ERROR_OVERFLOW;
|
||||
goto done;
|
||||
}
|
||||
|
||||
w *= (LXB_PUNYCODE_BASE - t);
|
||||
}
|
||||
|
||||
h = (p - buf) + 1;
|
||||
|
||||
bias = lxb_punycode_adapt(i - oldi, h, oldi == 0);
|
||||
|
||||
if (i / h > UINT32_MAX - n) {
|
||||
status = LXB_STATUS_ERROR_OVERFLOW;
|
||||
goto done;
|
||||
}
|
||||
|
||||
n += i / h;
|
||||
i %= h;
|
||||
|
||||
if (p >= end) {
|
||||
p = lxb_punycode_decode_realloc(p, &buf, &end, buffer);
|
||||
if (p == NULL) {
|
||||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
}
|
||||
}
|
||||
|
||||
memmove(buf + i + 1, buf + i, ((h - 1) - i) * sizeof(lxb_codepoint_t));
|
||||
buf[i++] = n;
|
||||
}
|
||||
|
||||
status = cb(buf, p - buf, ctx);
|
||||
|
||||
done:
|
||||
|
||||
if (buffer != buf) {
|
||||
(void) lexbor_free(buf);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
lxb_status_t
|
||||
lxb_punycode_decode_cb_cp(const lxb_char_t *data, size_t length,
|
||||
lexbor_serialize_cb_cp_f cb, void *ctx)
|
||||
{
|
||||
size_t buf_len, digit, oldi, bias, w, k, t, i, h, in;
|
||||
const lxb_char_t *delimiter, *data_p, *data_end;
|
||||
lxb_status_t status;
|
||||
lxb_codepoint_t cp, n;
|
||||
lxb_codepoint_t *p, *buf;
|
||||
const lxb_codepoint_t *end;
|
||||
lxb_codepoint_t buffer[4096];
|
||||
|
||||
p = buffer;
|
||||
buf = buffer;
|
||||
buf_len = sizeof(buffer) / sizeof(lxb_codepoint_t);
|
||||
end = buffer + buf_len;
|
||||
|
||||
data_p = data;
|
||||
data_end = data + length;
|
||||
delimiter = data_end;
|
||||
|
||||
while (delimiter != data) {
|
||||
delimiter -= 1;
|
||||
|
||||
if (*delimiter == LXB_PUNYCODE_DELIMITER) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
while (data_p < delimiter) {
|
||||
cp = *data_p++;
|
||||
|
||||
if (cp >= 0x80) {
|
||||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (p >= end) {
|
||||
p = lxb_punycode_decode_realloc(p, &buf, &end, buffer);
|
||||
if (p == NULL) {
|
||||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
}
|
||||
}
|
||||
|
||||
*p++ = cp;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
n = LXB_PUNYCODE_INITIAL_N;
|
||||
bias = LXB_PUNYCODE_INITIAL_BIAS;
|
||||
data_p = (delimiter != data) ? delimiter + 1: data;
|
||||
in = data_p - data;
|
||||
|
||||
for (; in < length; p++) {
|
||||
for (oldi = i, w = 1, k = LXB_PUNYCODE_BASE; ; k += LXB_PUNYCODE_BASE) {
|
||||
if (in >= length) {
|
||||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
|
||||
goto done;
|
||||
}
|
||||
|
||||
cp = data[in++];
|
||||
digit = lxb_punycode_decode_digit(cp);
|
||||
|
||||
if (digit >= LXB_PUNYCODE_BASE) {
|
||||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (digit > (UINT32_MAX - i) / w) {
|
||||
status = LXB_STATUS_ERROR_OVERFLOW;
|
||||
goto done;
|
||||
}
|
||||
|
||||
i += digit * w;
|
||||
t = k <= bias ? LXB_PUNYCODE_TMIN
|
||||
: k >= bias + LXB_PUNYCODE_TMAX ? LXB_PUNYCODE_TMAX : k - bias;
|
||||
|
||||
if (digit < t) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (w > UINT32_MAX / (LXB_PUNYCODE_BASE - t)) {
|
||||
status = LXB_STATUS_ERROR_OVERFLOW;
|
||||
goto done;
|
||||
}
|
||||
|
||||
w *= (LXB_PUNYCODE_BASE - t);
|
||||
}
|
||||
|
||||
h = (p - buf) + 1;
|
||||
|
||||
bias = lxb_punycode_adapt(i - oldi, h, oldi == 0);
|
||||
|
||||
if (i / h > UINT32_MAX - n) {
|
||||
status = LXB_STATUS_ERROR_OVERFLOW;
|
||||
goto done;
|
||||
}
|
||||
|
||||
n += i / h;
|
||||
i %= h;
|
||||
|
||||
if (p >= end) {
|
||||
p = lxb_punycode_decode_realloc(p, &buf, &end, buffer);
|
||||
if (p == NULL) {
|
||||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
}
|
||||
}
|
||||
|
||||
memmove(buf + i + 1, buf + i, ((h - 1) - i) * sizeof(lxb_codepoint_t));
|
||||
buf[i++] = n;
|
||||
}
|
||||
|
||||
status = cb(buf, p - buf, ctx);
|
||||
|
||||
done:
|
||||
|
||||
if (buffer != buf) {
|
||||
(void) lexbor_free(buf);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
109
ext/lexbor/lexbor/punycode/punycode.h
Normal file
109
ext/lexbor/lexbor/punycode/punycode.h
Normal file
|
@ -0,0 +1,109 @@
|
|||
/*
|
||||
* Copyright (C) 2023 Alexander Borisov
|
||||
*
|
||||
* Author: Alexander Borisov <borisov@lexbor.com>
|
||||
*/
|
||||
|
||||
#ifndef LEXBOR_PUNYCODE_H
|
||||
#define LEXBOR_PUNYCODE_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "lexbor/punycode/base.h"
|
||||
|
||||
|
||||
typedef lxb_status_t
|
||||
(*lxb_punycode_encode_cb_f)(const lxb_char_t *data, size_t len, void *ctx,
|
||||
bool unchanged);
|
||||
|
||||
|
||||
/*
|
||||
* Punycode: A Bootstring encoding of Unicode
|
||||
* for Internationalized Domain Names in Applications (IDNA).
|
||||
*
|
||||
* https://www.rfc-editor.org/rfc/inline-errata/rfc3492.html
|
||||
*/
|
||||
|
||||
/*
|
||||
* Encoding from characters to characters.
|
||||
*
|
||||
* @param[in] Input characters for encoding. Not NULL.
|
||||
* @param[in] Length of characters. Can be 0.
|
||||
* @param[in] Callback for results. Called only once when encoding is complete.
|
||||
* @param[in] Context for callback.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_punycode_encode(const lxb_char_t *data, size_t length,
|
||||
lxb_punycode_encode_cb_f cb, void *ctx);
|
||||
|
||||
/*
|
||||
* Encoding from code points to characters.
|
||||
*
|
||||
* Same as lxb_punycode_encode() only the input is code points.
|
||||
*
|
||||
* @param[in] Input code points for encoding. Not NULL.
|
||||
* @param[in] Length of code points. Can be 0.
|
||||
* @param[in] Callback for results. Called only once when encoding is complete.
|
||||
* @param[in] Context for callback.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_punycode_encode_cp(const lxb_codepoint_t *cps, size_t length,
|
||||
lxb_punycode_encode_cb_f cb, void *ctx);
|
||||
|
||||
/*
|
||||
* Decoding from characters to characters.
|
||||
*
|
||||
* @param[in] Input characters for decoding. Not NULL.
|
||||
* @param[in] Length of characters. Can be 0.
|
||||
* @param[in] Callback for results. Called only once when decoding is complete.
|
||||
* @param[in] Context for callback.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_punycode_decode(const lxb_char_t *data, size_t length,
|
||||
lexbor_serialize_cb_f cb, void *ctx);
|
||||
|
||||
/*
|
||||
* Decoding from code points to code points.
|
||||
*
|
||||
* Same as lxb_punycode_decode() only the input/output is code points.
|
||||
*
|
||||
* @param[in] Input code points for decoding. Not NULL.
|
||||
* @param[in] Length of code points. Can be 0.
|
||||
* @param[in] Callback for results. Called only once when decoding is complete.
|
||||
* @param[in] Context for callback.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_punycode_decode_cp(const lxb_codepoint_t *data, size_t length,
|
||||
lexbor_serialize_cb_cp_f cb, void *ctx);
|
||||
|
||||
/*
|
||||
* Decoding from characters to code points.
|
||||
*
|
||||
* Same as lxb_punycode_decode() only the output is code points.
|
||||
*
|
||||
* @param[in] Input characters for decoding. Not NULL.
|
||||
* @param[in] Length of characters. Can be 0.
|
||||
* @param[in] Callback for results. Called only once when decoding is complete.
|
||||
* @param[in] Context for callback.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_punycode_decode_cb_cp(const lxb_char_t *data, size_t length,
|
||||
lexbor_serialize_cb_cp_f cb, void *ctx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* LEXBOR_PUNYCODE_H */
|
157
ext/lexbor/lexbor/unicode/base.h
Normal file
157
ext/lexbor/lexbor/unicode/base.h
Normal file
|
@ -0,0 +1,157 @@
|
|||
/*
|
||||
* Copyright (C) 2023-2024 Alexander Borisov
|
||||
*
|
||||
* Author: Alexander Borisov <borisov@lexbor.com>
|
||||
*/
|
||||
|
||||
#ifndef LEXBOR_UNICODE_BASE_H
|
||||
#define LEXBOR_UNICODE_BASE_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "lexbor/core/base.h"
|
||||
#include "lexbor/core/str.h"
|
||||
|
||||
|
||||
#define LXB_UNICODE_VERSION_MAJOR 0
|
||||
#define LXB_UNICODE_VERSION_MINOR 3
|
||||
#define LXB_UNICODE_VERSION_PATCH 0
|
||||
|
||||
#define LXB_UNICODE_VERSION_STRING LEXBOR_STRINGIZE(LXB_UNICODE_VERSION_MAJOR) "." \
|
||||
LEXBOR_STRINGIZE(LXB_UNICODE_VERSION_MINOR) "." \
|
||||
LEXBOR_STRINGIZE(LXB_UNICODE_VERSION_PATCH)
|
||||
|
||||
|
||||
enum {
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE__UNDEF = 0x00,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_CIRCLE,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_COMPAT,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_FINAL,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_FONT,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_FRACTION,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_INITIAL,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_ISOLATED,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_MEDIAL,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_NARROW,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_NOBREAK,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_SMALL,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_SQUARE,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_SUB,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_SUPER,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_VERTICAL,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE_WIDE,
|
||||
LXB_UNICODE_DECOMPOSITION_TYPE__LAST_ENTRY
|
||||
};
|
||||
#define LXB_UNICODE_CANONICAL_SEPARATELY (1 << 7)
|
||||
#define LXB_UNICODE_IS_CANONICAL_SEPARATELY(a) ((a) >> 7)
|
||||
#define LXB_UNICODE_DECOMPOSITION_TYPE(a) ((a) & ~(1 << 7))
|
||||
typedef uint8_t lxb_unicode_decomposition_type_t;
|
||||
|
||||
enum {
|
||||
LXB_UNICODE_QUICK__UNDEF = 0x00,
|
||||
LXB_UNICODE_QUICK_NFC_MAYBE = 1 << 0,
|
||||
LXB_UNICODE_QUICK_NFC_NO = 1 << 1,
|
||||
LXB_UNICODE_QUICK_NFD_NO = 1 << 2,
|
||||
LXB_UNICODE_QUICK_NFKC_MAYBE = 1 << 3,
|
||||
LXB_UNICODE_QUICK_NFKC_NO = 1 << 4,
|
||||
LXB_UNICODE_QUICK_NFKD_NO = 1 << 5
|
||||
};
|
||||
typedef uint8_t lxb_unicode_quick_type_t;
|
||||
|
||||
enum {
|
||||
LXB_UNICODE_IDNA__UNDEF = 0x00,
|
||||
LXB_UNICODE_IDNA_DEVIATION,
|
||||
LXB_UNICODE_IDNA_DISALLOWED,
|
||||
LXB_UNICODE_IDNA_IGNORED,
|
||||
LXB_UNICODE_IDNA_MAPPED,
|
||||
LXB_UNICODE_IDNA_VALID
|
||||
};
|
||||
typedef uint8_t lxb_unicode_idna_type_t;
|
||||
|
||||
typedef struct lxb_unicode_normalizer lxb_unicode_normalizer_t;
|
||||
|
||||
typedef struct {
|
||||
lxb_codepoint_t cp;
|
||||
uint8_t ccc;
|
||||
}
|
||||
lxb_unicode_buffer_t;
|
||||
|
||||
typedef lxb_status_t
|
||||
(*lxb_unicode_nf_handler_f)(lxb_unicode_normalizer_t *uc, const lxb_char_t *data,
|
||||
size_t length, lexbor_serialize_cb_f cb, void *ctx,
|
||||
bool is_last);
|
||||
|
||||
typedef lxb_unicode_buffer_t *
|
||||
(*lxb_unicode_de_handler_f)(lxb_unicode_normalizer_t *uc, lxb_codepoint_t cp,
|
||||
lxb_unicode_buffer_t **buf,
|
||||
const lxb_unicode_buffer_t **end);
|
||||
|
||||
typedef void
|
||||
(*lxb_unicode_co_handler_f)(lxb_unicode_buffer_t *starter,
|
||||
lxb_unicode_buffer_t *op, lxb_unicode_buffer_t *p);
|
||||
|
||||
|
||||
typedef struct {
|
||||
uint16_t normalization; /* lxb_unicode_normalization_t */
|
||||
uint16_t idna; /* lxb_unicode_idna_t */
|
||||
}
|
||||
lxb_unicode_entry_t;
|
||||
|
||||
typedef struct {
|
||||
lxb_unicode_decomposition_type_t type;
|
||||
lxb_unicode_quick_type_t quick; /* Quick Check. */
|
||||
uint8_t ccc; /* Canonical Combining Class. */
|
||||
uint8_t length;
|
||||
uint16_t decomposition; /* lxb_codepoint_t */
|
||||
uint16_t composition; /* lxb_unicode_composition_entry_t */
|
||||
}
|
||||
lxb_unicode_normalization_entry_t;
|
||||
|
||||
typedef struct {
|
||||
lxb_unicode_idna_type_t type;
|
||||
uint8_t length;
|
||||
uint16_t index;
|
||||
}
|
||||
lxb_unicode_idna_entry_t;
|
||||
|
||||
typedef struct {
|
||||
uint8_t length; /* Length in lxb_unicode_composition_cps_t */
|
||||
uint16_t index; /* lxb_unicode_composition_cps_t */
|
||||
lxb_codepoint_t cp; /* Begin code point in lxb_unicode_composition_cps_t */
|
||||
}
|
||||
lxb_unicode_composition_entry_t;
|
||||
|
||||
typedef struct {
|
||||
lxb_codepoint_t cp;
|
||||
bool exclusion;
|
||||
}
|
||||
lxb_unicode_composition_cp_t;
|
||||
|
||||
struct lxb_unicode_normalizer {
|
||||
lxb_unicode_de_handler_f decomposition;
|
||||
lxb_unicode_co_handler_f composition;
|
||||
|
||||
lxb_unicode_buffer_t *starter;
|
||||
|
||||
lxb_unicode_buffer_t *buf;
|
||||
const lxb_unicode_buffer_t *end;
|
||||
lxb_unicode_buffer_t *p;
|
||||
lxb_unicode_buffer_t *ican;
|
||||
|
||||
lxb_char_t tmp[4];
|
||||
uint8_t tmp_lenght;
|
||||
|
||||
uint8_t quick_ccc;
|
||||
lxb_unicode_quick_type_t quick_type;
|
||||
|
||||
size_t flush_cp;
|
||||
};
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* LEXBOR_UNICODE_BASE_H */
|
738
ext/lexbor/lexbor/unicode/idna.c
Normal file
738
ext/lexbor/lexbor/unicode/idna.c
Normal file
|
@ -0,0 +1,738 @@
|
|||
/*
|
||||
* Copyright (C) 2023 Alexander Borisov
|
||||
*
|
||||
* Author: Alexander Borisov <borisov@lexbor.com>
|
||||
*/
|
||||
|
||||
#include "lexbor/unicode/idna.h"
|
||||
#include "lexbor/unicode/unicode.h"
|
||||
#include "lexbor/punycode/punycode.h"
|
||||
#include "lexbor/encoding/encoding.h"
|
||||
|
||||
|
||||
typedef struct {
|
||||
lxb_unicode_idna_cb_f cb;
|
||||
void *context;
|
||||
lxb_unicode_idna_flag_t flags;
|
||||
}
|
||||
lxb_unicode_idna_ctx_t;
|
||||
|
||||
typedef struct {
|
||||
lxb_char_t buffer[4096];
|
||||
lxb_char_t *p;
|
||||
lxb_char_t *buf;
|
||||
const lxb_char_t *end;
|
||||
lxb_unicode_idna_flag_t flags;
|
||||
}
|
||||
lxb_unicode_idna_ascii_ctx_t;
|
||||
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_processing_body(lxb_unicode_idna_t *idna, const void *data,
|
||||
size_t len, lxb_unicode_idna_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags, bool is_cp);
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_norm_c_cb(const lxb_codepoint_t *cps, size_t len, void *ctx);
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_norm_c_send(const lxb_codepoint_t *cps,
|
||||
const lxb_codepoint_t *p,
|
||||
lxb_unicode_idna_ctx_t *context);
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_punycode_cb(const lxb_codepoint_t *cps, size_t len, void *ctx);
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_to_ascii_cb(const lxb_codepoint_t *part, size_t len,
|
||||
void *ctx, lxb_status_t status);
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_to_ascii_body(lxb_unicode_idna_t *idna, const void *data,
|
||||
size_t length, lexbor_serialize_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags, bool is_cp);
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_ascii_puny_cb(const lxb_char_t *data, size_t length, void *ctx,
|
||||
bool unchanged);
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_to_unicode_cb(const lxb_codepoint_t *part, size_t len,
|
||||
void *ctx, lxb_status_t status);
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_to_unicode_body(lxb_unicode_idna_t *idna, const void *data,
|
||||
size_t length, lexbor_serialize_cb_f cb,
|
||||
void *ctx, lxb_unicode_idna_flag_t flags,
|
||||
bool is_cp);
|
||||
|
||||
static bool
|
||||
lxb_unicode_idna_validity_criteria_h(const void *data, size_t length,
|
||||
lxb_unicode_idna_flag_t flags, bool is_cp);
|
||||
|
||||
/*
 * Allocate storage for an IDNA object.
 *
 * The memory comes straight from lexbor_malloc() and is not initialized
 * here; NULL is returned when the allocation fails.
 */
lxb_unicode_idna_t *
lxb_unicode_idna_create(void)
{
    lxb_unicode_idna_t *idna;

    idna = lexbor_malloc(sizeof(*idna));

    return idna;
}
|
||||
|
||||
lxb_status_t
|
||||
lxb_unicode_idna_init(lxb_unicode_idna_t *idna)
|
||||
{
|
||||
if (idna == NULL) {
|
||||
return LXB_STATUS_ERROR_OBJECT_IS_NULL;
|
||||
}
|
||||
|
||||
return lxb_unicode_normalizer_init(&idna->normalizer, LXB_UNICODE_NFC);
|
||||
}
|
||||
|
||||
void
|
||||
lxb_unicode_idna_clean(lxb_unicode_idna_t *idna)
|
||||
{
|
||||
lxb_unicode_normalizer_clean(&idna->normalizer);
|
||||
}
|
||||
|
||||
/*
 * Release the resources held by an IDNA object.
 *
 * The embedded normalizer is always destroyed in place. The object itself
 * is released with lexbor_free() only when `self_destroy` is true, in which
 * case the value returned by lexbor_free() is passed on to the caller.
 * A NULL `idna` is accepted and yields NULL.
 */
lxb_unicode_idna_t *
lxb_unicode_idna_destroy(lxb_unicode_idna_t *idna, bool self_destroy)
{
    if (idna != NULL) {
        (void) lxb_unicode_normalizer_destroy(&idna->normalizer, false);

        if (self_destroy) {
            idna = lexbor_free(idna);
        }
    }

    return idna;
}
|
||||
|
||||
/*
 * Grow a code point buffer that may still live in caller-provided storage.
 *
 * @param[in] buf       Current start of the buffer.
 * @param[in] buffer    Caller-provided (non-heap) storage. When `buf` equals
 *                      `buffer` a new heap block is allocated, otherwise
 *                      `buf` is reallocated.
 * @param[in,out] buf_p   Write cursor; rebased onto the new block.
 * @param[in,out] buf_end End of the buffer; updated to the new capacity.
 * @param[in] len       Extra number of code points requested on top of the
 *                      fourfold growth.
 *
 * @return Start of the new block, or NULL on allocation failure. On failure
 * a heap-allocated `buf` has been freed; `buffer` is never freed.
 */
lxb_codepoint_t *
lxb_unicode_idna_realloc(lxb_codepoint_t *buf, const lxb_codepoint_t *buffer,
                         lxb_codepoint_t **buf_p, lxb_codepoint_t **buf_end,
                         size_t len)
{
    size_t nlen, used;
    lxb_codepoint_t *tmp;

    /*
     * Take the offsets before any reallocation: the old pointer must not be
     * used for arithmetic once the block has been moved.
     */
    used = *buf_p - buf;
    nlen = ((*buf_end - buf) * 4) + len;

    if (buf == buffer) {
        tmp = lexbor_malloc(nlen * sizeof(lxb_codepoint_t));
        if (tmp == NULL) {
            return NULL;
        }

        /*
         * First switch from the caller's storage to the heap: the code
         * points collected so far have to be carried over, malloc does not
         * do that for us (realloc below does).
         */
        memcpy(tmp, buf, used * sizeof(lxb_codepoint_t));
    }
    else {
        tmp = lexbor_realloc(buf, nlen * sizeof(lxb_codepoint_t));
        if (tmp == NULL) {
            return lexbor_free(buf);
        }
    }

    *buf_p = tmp + used;
    *buf_end = tmp + nlen;

    return tmp;
}
|
||||
|
||||
lxb_status_t
|
||||
lxb_unicode_idna_processing(lxb_unicode_idna_t *idna, const lxb_char_t *data,
|
||||
size_t length, lxb_unicode_idna_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags)
|
||||
{
|
||||
return lxb_unicode_idna_processing_body(idna, data, length, cb, ctx,
|
||||
flags, false);
|
||||
}
|
||||
|
||||
lxb_status_t
|
||||
lxb_unicode_idna_processing_cp(lxb_unicode_idna_t *idna,
|
||||
const lxb_codepoint_t *cps, size_t length,
|
||||
lxb_unicode_idna_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags)
|
||||
{
|
||||
return lxb_unicode_idna_processing_body(idna, cps, length, cb, ctx,
|
||||
flags, true);
|
||||
}
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_processing_body(lxb_unicode_idna_t *idna, const void *data,
|
||||
size_t len, lxb_unicode_idna_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags, bool is_cp)
|
||||
{
|
||||
bool need;
|
||||
size_t i, length;
|
||||
lxb_status_t status;
|
||||
lxb_codepoint_t cp, *buf, *buf_p, *buf_end;
|
||||
const lxb_char_t *end, *p;
|
||||
lxb_unicode_idna_type_t type;
|
||||
const lxb_unicode_idna_entry_t *udata;
|
||||
const lxb_codepoint_t *maps;
|
||||
lxb_unicode_idna_ctx_t context;
|
||||
lxb_codepoint_t buffer[4096];
|
||||
|
||||
buf = buffer;
|
||||
buf_p = buffer;
|
||||
buf_end = buffer + (sizeof(buffer) / sizeof(lxb_codepoint_t));
|
||||
|
||||
p = data;
|
||||
len *= (is_cp) ? sizeof(lxb_codepoint_t) : 1;
|
||||
end = (const lxb_char_t *) data + len;
|
||||
|
||||
while (p < end) {
|
||||
if (is_cp) {
|
||||
cp = *((const lxb_codepoint_t *) p);
|
||||
p = (const lxb_char_t *) ((const lxb_codepoint_t *) p + 1);
|
||||
}
|
||||
else {
|
||||
cp = lxb_encoding_decode_valid_utf_8_single(&p, end);
|
||||
if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) {
|
||||
status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
type = lxb_unicode_idna_type(cp);
|
||||
|
||||
again:
|
||||
|
||||
switch (type) {
|
||||
case LXB_UNICODE_IDNA_IGNORED:
|
||||
break;
|
||||
|
||||
case LXB_UNICODE_IDNA_MAPPED:
|
||||
udata = lxb_unicode_idna_entry_by_cp(cp);
|
||||
maps = lxb_unicode_idna_map(udata, &length);
|
||||
|
||||
if (buf_p + length > buf_end) {
|
||||
buf = lxb_unicode_idna_realloc(buf, buffer, &buf_p,
|
||||
&buf_end, length);
|
||||
if (buf == NULL) {
|
||||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < length; i++) {
|
||||
*buf_p++ = maps[i];
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case LXB_UNICODE_IDNA_DEVIATION:
|
||||
if ((flags & LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING)) {
|
||||
type = LXB_UNICODE_IDNA_MAPPED;
|
||||
goto again;
|
||||
}
|
||||
|
||||
/* Fall through. */
|
||||
|
||||
case LXB_UNICODE_IDNA_DISALLOWED:
|
||||
/* Fall through. */
|
||||
|
||||
case LXB_UNICODE_IDNA_VALID:
|
||||
default:
|
||||
if (buf_p >= buf_end) {
|
||||
buf = lxb_unicode_idna_realloc(buf, buffer, &buf_p,
|
||||
&buf_end, 1);
|
||||
if (buf == NULL) {
|
||||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
}
|
||||
}
|
||||
|
||||
*buf_p++ = cp;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
context.cb = cb;
|
||||
context.context = ctx;
|
||||
context.flags = flags;
|
||||
|
||||
|
||||
need = lxb_unicode_quick_check_cp(&idna->normalizer, buf, buf_p - buf,
|
||||
true);
|
||||
if (need) {
|
||||
lxb_unicode_flush_count_set(&idna->normalizer, UINT32_MAX);
|
||||
|
||||
status = lxb_unicode_normalize_cp(&idna->normalizer, buf, buf_p - buf,
|
||||
lxb_unicode_idna_norm_c_cb,
|
||||
&context, true);
|
||||
}
|
||||
else {
|
||||
status = lxb_unicode_idna_norm_c_cb(buf, buf_p - buf, &context);
|
||||
}
|
||||
|
||||
done:
|
||||
|
||||
if (buf != buffer) {
|
||||
(void) lexbor_free(buf);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_norm_c_cb(const lxb_codepoint_t *cps, size_t len, void *ctx)
|
||||
{
|
||||
lxb_status_t status;
|
||||
lxb_unicode_idna_ctx_t *context = ctx;
|
||||
const lxb_codepoint_t *p, *end;
|
||||
|
||||
p = cps;
|
||||
end = cps + len;
|
||||
|
||||
while (p < end) {
|
||||
/* U+002E ( . ) FULL STOP. */
|
||||
|
||||
if (*p == 0x002E) {
|
||||
status = lxb_unicode_idna_norm_c_send(cps, p, context);
|
||||
if (status != LXB_STATUS_OK) {
|
||||
return status;
|
||||
}
|
||||
|
||||
cps = p + 1;
|
||||
}
|
||||
|
||||
p += 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* We need to call a zero-length callback if the last codepoint was a
|
||||
* U+002E ( . ) FULL STOP.
|
||||
*
|
||||
* For example, "muuuu." will call for two callbacks.
|
||||
* First: "muuuu".
|
||||
* Second: "" -- empty string with length = 0.
|
||||
*/
|
||||
|
||||
if (p > cps || (len >= 1 && p[-1] == '.')) {
|
||||
return lxb_unicode_idna_norm_c_send(cps, p, context);
|
||||
}
|
||||
|
||||
return LXB_STATUS_OK;
|
||||
}
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_norm_c_send(const lxb_codepoint_t *cps,
|
||||
const lxb_codepoint_t *p,
|
||||
lxb_unicode_idna_ctx_t *context)
|
||||
{
|
||||
bool cr;
|
||||
lxb_status_t status;
|
||||
|
||||
/* xn-- or Xn-- or xN-- or XN-- */
|
||||
|
||||
if (p - cps >= 4
|
||||
&& (cps[0] == 0x0078 || cps[0] == 0x0058)
|
||||
&& (cps[1] == 0x006E || cps[1] == 0x004E)
|
||||
&& cps[2] == 0x002D && cps[3] == 0x002D)
|
||||
{
|
||||
cps += 4;
|
||||
status = lxb_punycode_decode_cp(cps, p - cps,
|
||||
lxb_unicode_idna_punycode_cb,
|
||||
context);
|
||||
if (status == LXB_STATUS_OK) {
|
||||
return LXB_STATUS_OK;
|
||||
}
|
||||
|
||||
cps -= 4;
|
||||
}
|
||||
else {
|
||||
status = LXB_STATUS_OK;
|
||||
}
|
||||
|
||||
cr = lxb_unicode_idna_validity_criteria_cp(cps, p - cps, context->flags);
|
||||
if (!cr) {
|
||||
return LXB_STATUS_ERROR_UNEXPECTED_RESULT;
|
||||
}
|
||||
|
||||
return context->cb(cps, p - cps, context->context, status);
|
||||
}
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_punycode_cb(const lxb_codepoint_t *cps, size_t len, void *ctx)
|
||||
{
|
||||
bool cr;
|
||||
lxb_unicode_idna_ctx_t *context = ctx;
|
||||
lxb_unicode_idna_ascii_ctx_t *asc = context->context;
|
||||
|
||||
cr = lxb_unicode_idna_validity_criteria_cp(cps, len, asc->flags);
|
||||
if (!cr) {
|
||||
return LXB_STATUS_ERROR_UNEXPECTED_RESULT;
|
||||
}
|
||||
|
||||
return context->cb(cps, len, context->context, LXB_STATUS_OK);
|
||||
}
|
||||
|
||||
lxb_status_t
|
||||
lxb_unicode_idna_to_ascii(lxb_unicode_idna_t *idna, const lxb_char_t *data,
|
||||
size_t length, lexbor_serialize_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags)
|
||||
{
|
||||
return lxb_unicode_idna_to_ascii_body(idna, data, length, cb, ctx,
|
||||
flags, false);
|
||||
}
|
||||
|
||||
lxb_status_t
|
||||
lxb_unicode_idna_to_ascii_cp(lxb_unicode_idna_t *idna, const lxb_codepoint_t *cps,
|
||||
size_t length, lexbor_serialize_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags)
|
||||
{
|
||||
return lxb_unicode_idna_to_ascii_body(idna, cps, length, cb, ctx,
|
||||
flags, true);
|
||||
}
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_to_ascii_body(lxb_unicode_idna_t *idna, const void *data,
|
||||
size_t length, lexbor_serialize_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags, bool is_cp)
|
||||
{
|
||||
size_t len;
|
||||
lxb_status_t status;
|
||||
lxb_unicode_idna_ascii_ctx_t context;
|
||||
|
||||
context.p = context.buffer;
|
||||
context.buf = context.buffer;
|
||||
context.end = context.buf + sizeof(context.buffer);
|
||||
context.flags = flags;
|
||||
|
||||
if (!is_cp) {
|
||||
status = lxb_unicode_idna_processing(idna, data, length,
|
||||
lxb_unicode_idna_to_ascii_cb,
|
||||
&context, flags);
|
||||
}
|
||||
else {
|
||||
status = lxb_unicode_idna_processing_cp(idna, data, length,
|
||||
lxb_unicode_idna_to_ascii_cb,
|
||||
&context, flags);
|
||||
}
|
||||
|
||||
if (status != LXB_STATUS_OK) {
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Remove last U+002E ( . ) FULL STOP. */
|
||||
|
||||
if (context.p > context.buf) {
|
||||
context.p -= 1;
|
||||
}
|
||||
|
||||
len = context.p - context.buf;
|
||||
|
||||
status = cb(context.buf, len, ctx);
|
||||
|
||||
done:
|
||||
|
||||
if (context.buf != context.buffer) {
|
||||
(void) lexbor_free(context.buf);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_to_ascii_cb(const lxb_codepoint_t *part, size_t len,
|
||||
void *ctx, lxb_status_t status)
|
||||
{
|
||||
if (status != LXB_STATUS_OK) {
|
||||
return status;
|
||||
}
|
||||
|
||||
return lxb_punycode_encode_cp(part, len, lxb_unicode_idna_ascii_puny_cb,
|
||||
ctx);
|
||||
}
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_ascii_puny_cb(const lxb_char_t *data, size_t length, void *ctx,
|
||||
bool unchanged)
|
||||
{
|
||||
size_t nlen;
|
||||
lxb_char_t *tmp;
|
||||
lxb_unicode_idna_ascii_ctx_t *asc = ctx;
|
||||
|
||||
static const lexbor_str_t prefix = lexbor_str("xn--");
|
||||
|
||||
if (asc->p + length + 6 > asc->end) {
|
||||
nlen = ((asc->end - asc->buf) * 4) + length + 6;
|
||||
|
||||
if (asc->buf == asc->buffer) {
|
||||
tmp = lexbor_malloc(nlen);
|
||||
}
|
||||
else {
|
||||
tmp = lexbor_realloc(asc->buf, nlen);
|
||||
}
|
||||
|
||||
if (tmp == NULL) {
|
||||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
}
|
||||
|
||||
asc->p = tmp + (asc->p - asc->buf);
|
||||
asc->buf = tmp;
|
||||
asc->end = tmp + nlen;
|
||||
}
|
||||
|
||||
if (!unchanged) {
|
||||
memcpy(asc->p, prefix.data, prefix.length);
|
||||
asc->p += 4;
|
||||
}
|
||||
|
||||
memcpy(asc->p, data, length);
|
||||
|
||||
asc->p += length;
|
||||
*asc->p++ = '.';
|
||||
*asc->p = 0x00;
|
||||
|
||||
return LXB_STATUS_OK;
|
||||
}
|
||||
|
||||
bool
|
||||
lxb_unicode_idna_validity_criteria(const lxb_char_t *data, size_t length,
|
||||
lxb_unicode_idna_flag_t flags)
|
||||
{
|
||||
return lxb_unicode_idna_validity_criteria_h(data, length, flags, false);
|
||||
}
|
||||
|
||||
bool
|
||||
lxb_unicode_idna_validity_criteria_cp(const lxb_codepoint_t *data, size_t length,
|
||||
lxb_unicode_idna_flag_t flags)
|
||||
{
|
||||
return lxb_unicode_idna_validity_criteria_h(data, length, flags, true);
|
||||
}
|
||||
|
||||
static bool
|
||||
lxb_unicode_idna_validity_criteria_h(const void *data, size_t length,
|
||||
lxb_unicode_idna_flag_t flags, bool is_cp)
|
||||
{
|
||||
size_t len;
|
||||
lxb_codepoint_t cp;
|
||||
const lxb_codepoint_t *cps;
|
||||
const lxb_char_t *p, *end;
|
||||
lxb_unicode_idna_type_t type;
|
||||
|
||||
p = data;
|
||||
len = length * ((is_cp) ? sizeof(lxb_codepoint_t) : 1);
|
||||
end = (const lxb_char_t *) data + len;
|
||||
|
||||
if (flags & LXB_UNICODE_IDNA_FLAG_CHECK_HYPHENS) {
|
||||
/* U+002D HYPHEN-MINUS */
|
||||
|
||||
if (is_cp) {
|
||||
cps = data;
|
||||
|
||||
if (length > 4) {
|
||||
if (cps[3] == 0x002D || cps[4] == 0x002D) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (length >= 1) {
|
||||
if (cps[0] == 0x002D || cps[length - 1] == 0x002D) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (length > 4) {
|
||||
if (p[3] == 0x002D || p[4] == 0x002D) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (length >= 1) {
|
||||
if (p[0] == 0x002D || p[-1] == 0x002D) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (length >= 4) {
|
||||
if (is_cp) {
|
||||
cps = data;
|
||||
|
||||
if ( (cps[0] == 0x0078 || cps[0] == 0x0058)
|
||||
&& (cps[1] == 0x006E || cps[1] == 0x004E)
|
||||
&& cps[2] == 0x002D && cps[3] == 0x002D)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if ( (p[0] == 0x0078 || p[0] == 0x0058)
|
||||
&& (p[1] == 0x006E || p[1] == 0x004E)
|
||||
&& p[2] == 0x002D && p[3] == 0x002D)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while (p < end) {
|
||||
if (!is_cp) {
|
||||
cp = lxb_encoding_decode_valid_utf_8_single(&p, end);
|
||||
if (cp == LXB_ENCODING_DECODE_ERROR) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else {
|
||||
cp = *((const lxb_codepoint_t *) p);
|
||||
p = (const lxb_char_t *) ((const lxb_codepoint_t *) p + 1);
|
||||
}
|
||||
|
||||
/* U+002E ( . ) FULL STOP */
|
||||
|
||||
if (cp == 0x002E) {
|
||||
return false;
|
||||
}
|
||||
|
||||
type = lxb_unicode_idna_type(cp);
|
||||
|
||||
switch (type) {
|
||||
case LXB_UNICODE_IDNA_VALID:
|
||||
break;
|
||||
|
||||
case LXB_UNICODE_IDNA_DEVIATION:
|
||||
if (!(flags & LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING)) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* Fall through. */
|
||||
|
||||
case LXB_UNICODE_IDNA_DISALLOWED:
|
||||
case LXB_UNICODE_IDNA_IGNORED:
|
||||
case LXB_UNICODE_IDNA_MAPPED:
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
lxb_status_t
|
||||
lxb_unicode_idna_to_unicode(lxb_unicode_idna_t *idna, const lxb_char_t *data,
|
||||
size_t length, lexbor_serialize_cb_f cb,
|
||||
void *ctx, lxb_unicode_idna_flag_t flags)
|
||||
{
|
||||
return lxb_unicode_idna_to_unicode_body(idna, data, length, cb, ctx,
|
||||
flags, false);
|
||||
}
|
||||
|
||||
lxb_status_t
|
||||
lxb_unicode_idna_to_unicode_cp(lxb_unicode_idna_t *idna,
|
||||
const lxb_codepoint_t *cps,
|
||||
size_t length, lexbor_serialize_cb_f cb,
|
||||
void *ctx, lxb_unicode_idna_flag_t flags)
|
||||
{
|
||||
return lxb_unicode_idna_to_unicode_body(idna, cps, length, cb, ctx,
|
||||
flags, true);
|
||||
}
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_to_unicode_body(lxb_unicode_idna_t *idna, const void *data,
|
||||
size_t length, lexbor_serialize_cb_f cb,
|
||||
void *ctx, lxb_unicode_idna_flag_t flags,
|
||||
bool is_cp)
|
||||
{
|
||||
size_t len;
|
||||
lxb_status_t status;
|
||||
lxb_unicode_idna_ascii_ctx_t context;
|
||||
|
||||
context.p = context.buffer;
|
||||
context.buf = context.buffer;
|
||||
context.end = context.buf + sizeof(context.buffer);
|
||||
context.flags = flags;
|
||||
|
||||
if (!is_cp) {
|
||||
status = lxb_unicode_idna_processing(idna, data, length,
|
||||
lxb_unicode_idna_to_unicode_cb,
|
||||
&context, flags);
|
||||
}
|
||||
else {
|
||||
status = lxb_unicode_idna_processing_cp(idna, data, length,
|
||||
lxb_unicode_idna_to_unicode_cb,
|
||||
&context, flags);
|
||||
}
|
||||
|
||||
if (status != LXB_STATUS_OK) {
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Remove last U+002E ( . ) FULL STOP. */
|
||||
|
||||
if (context.p > context.buf) {
|
||||
context.p -= 1;
|
||||
}
|
||||
|
||||
len = context.p - context.buf;
|
||||
|
||||
status = cb(context.buf, len, ctx);
|
||||
|
||||
done:
|
||||
|
||||
if (context.buf != context.buffer) {
|
||||
(void) lexbor_free(context.buf);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
static lxb_status_t
|
||||
lxb_unicode_idna_to_unicode_cb(const lxb_codepoint_t *part, size_t len,
|
||||
void *ctx, lxb_status_t status)
|
||||
{
|
||||
int8_t res;
|
||||
size_t length, nlen;
|
||||
lxb_char_t *tmp;
|
||||
const lxb_codepoint_t *p, *end;
|
||||
lxb_unicode_idna_ascii_ctx_t *asc = ctx;
|
||||
|
||||
if (status != LXB_STATUS_OK) {
|
||||
return status;
|
||||
}
|
||||
|
||||
p = part;
|
||||
end = part + len;
|
||||
|
||||
length = 0;
|
||||
|
||||
while (p < end) {
|
||||
res = lxb_encoding_encode_utf_8_length(*p++);
|
||||
if (res == 0) {
|
||||
return LXB_STATUS_ERROR_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
length += res;
|
||||
}
|
||||
|
||||
if (asc->p + length + 2 > asc->end) {
|
||||
nlen = ((asc->end - asc->buf) * 4) + length + 2;
|
||||
|
||||
if (asc->buf == asc->buffer) {
|
||||
tmp = lexbor_malloc(nlen);
|
||||
}
|
||||
else {
|
||||
tmp = lexbor_realloc(asc->buf, nlen);
|
||||
}
|
||||
|
||||
if (tmp == NULL) {
|
||||
return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
|
||||
}
|
||||
|
||||
asc->p = tmp + (asc->p - asc->buf);
|
||||
asc->buf = tmp;
|
||||
asc->end = tmp + nlen;
|
||||
}
|
||||
|
||||
p = part;
|
||||
|
||||
while (p < end) {
|
||||
(void) lxb_encoding_encode_utf_8_single(NULL, &asc->p, asc->end, *p++);
|
||||
}
|
||||
|
||||
*asc->p++ = '.';
|
||||
*asc->p = 0x00;
|
||||
|
||||
return LXB_STATUS_OK;
|
||||
}
|
264
ext/lexbor/lexbor/unicode/idna.h
Normal file
264
ext/lexbor/lexbor/unicode/idna.h
Normal file
|
@ -0,0 +1,264 @@
|
|||
/*
|
||||
* Copyright (C) 2023 Alexander Borisov
|
||||
*
|
||||
* Author: Alexander Borisov <borisov@lexbor.com>
|
||||
*
|
||||
* UNICODE IDNA COMPATIBILITY PROCESSING
|
||||
* https://www.unicode.org/reports/tr46/
|
||||
*/
|
||||
|
||||
#ifndef LEXBOR_UNICODE_IDNA_H
|
||||
#define LEXBOR_UNICODE_IDNA_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "lexbor/unicode/base.h"
|
||||
|
||||
|
||||
typedef lxb_status_t
|
||||
(*lxb_unicode_idna_cb_f)(const lxb_codepoint_t *part, size_t len,
|
||||
void *ctx, lxb_status_t status);
|
||||
|
||||
/* Bit flags controlling IDNA processing; may be combined with bitwise OR. */
typedef enum {
    LXB_UNICODE_IDNA_FLAG_UNDEF                   = 0x00,
    LXB_UNICODE_IDNA_FLAG_USE_STD3ASCII_RULES     = 0x02,
    LXB_UNICODE_IDNA_FLAG_CHECK_HYPHENS           = 0x04,
    LXB_UNICODE_IDNA_FLAG_CHECK_BIDI              = 0x08, /* Not implemented. */
    LXB_UNICODE_IDNA_FLAG_CHECK_JOINERS           = 0x10, /* Not implemented. */
    LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING = 0x20,
    LXB_UNICODE_IDNA_FLAG_VERIFY_DNS_LENGTH       = 0x40
}
lxb_unicode_idna_flag_t;
|
||||
|
||||
typedef struct {
|
||||
lxb_unicode_normalizer_t normalizer;
|
||||
}
|
||||
lxb_unicode_idna_t;
|
||||
|
||||
|
||||
/*
|
||||
* Create lxb_unicode_idna_t object.
|
||||
*
|
||||
* @return lxb_unicode_idna_t * if successful, otherwise NULL.
|
||||
*/
|
||||
LXB_API lxb_unicode_idna_t *
|
||||
lxb_unicode_idna_create(void);
|
||||
|
||||
/*
|
||||
* Initialization of lxb_unicode_idna_t object.
|
||||
*
|
||||
* @param[in] lxb_unicode_idna_t *. May be NULL,
|
||||
* LXB_STATUS_ERROR_OBJECT_IS_NULL status will be returned.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_idna_init(lxb_unicode_idna_t *idna);
|
||||
|
||||
/*
|
||||
* Clears the object and returns it to the state it had right after initialization.
|
||||
*
|
||||
* @param[in] lxb_unicode_idna_t *
|
||||
*/
|
||||
LXB_API void
|
||||
lxb_unicode_idna_clean(lxb_unicode_idna_t *idna);
|
||||
|
||||
/*
|
||||
* Destroy lxb_unicode_idna_t object.
|
||||
*
|
||||
* Release of occupied resources.
|
||||
*
|
||||
* @param[in] lxb_unicode_idna_t *. Can be NULL.
|
||||
* @param[in] if false: only destroys internal buffers.
|
||||
* if true: destroys the lxb_unicode_idna_t object and all internal buffers.
|
||||
*
|
||||
* @return lxb_unicode_idna_t * if self_destroy = false, otherwise NULL.
|
||||
*/
|
||||
LXB_API lxb_unicode_idna_t *
|
||||
lxb_unicode_idna_destroy(lxb_unicode_idna_t *idna, bool self_destroy);
|
||||
|
||||
/*
|
||||
* Domain name processing.
|
||||
*
|
||||
* Mapping, Normalization (NFC), Converting, Validating.
|
||||
*
|
||||
* Callback will be invoked at each level of the domain name.
|
||||
*
|
||||
* For example:
|
||||
* lexbor.com -- there will be two callbacks, for "lexbor" and "com".
|
||||
*
|
||||
* https://www.unicode.org/reports/tr46/#Processing
|
||||
*
|
||||
* @param[in] lxb_unicode_idna_t *.
|
||||
* @param[in] Input characters for processing. Not NULL.
|
||||
* @param[in] Length of characters. Can be 0.
|
||||
* @param[in] Callback for results of processing.
|
||||
* @param[in] Context for callback.
|
||||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_idna_processing(lxb_unicode_idna_t *idna, const lxb_char_t *data,
|
||||
size_t length, lxb_unicode_idna_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags);
|
||||
|
||||
/*
|
||||
* Domain name processing for code points.
|
||||
*
|
||||
* This function is exactly the same as lxb_unicode_idna_processing() only it
|
||||
* takes code points instead of characters as input.
|
||||
*
|
||||
* * Please, see lxb_unicode_idna_processing() function.
|
||||
*
|
||||
* @param[in] lxb_unicode_idna_t *.
|
||||
* @param[in] Input code points for processing. Not NULL.
|
||||
* @param[in] Length of code points. Can be 0.
|
||||
* @param[in] Callback for results of processing.
|
||||
* @param[in] Context for callback.
|
||||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_idna_processing_cp(lxb_unicode_idna_t *idna,
|
||||
const lxb_codepoint_t *cps, size_t length,
|
||||
lxb_unicode_idna_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags);
|
||||
|
||||
/*
|
||||
* Processing and converting domain name to ASCII.
|
||||
*
|
||||
* Does the same thing as lxb_unicode_idna_processing() + converts each part
|
||||
* of the domain name to Punycode.
|
||||
*
|
||||
* Callback will be invoked only once, at the end of processing.
|
||||
*
|
||||
* https://www.unicode.org/reports/tr46/#ToASCII
|
||||
*
|
||||
* @param[in] lxb_unicode_idna_t *.
|
||||
* @param[in] Input characters for processing. Not NULL.
|
||||
* @param[in] Length of characters. Can be 0.
|
||||
* @param[in] Callback for results of processing.
|
||||
* @param[in] Context for callback.
|
||||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_idna_to_ascii(lxb_unicode_idna_t *idna, const lxb_char_t *data,
|
||||
size_t length, lexbor_serialize_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags);
|
||||
|
||||
/*
|
||||
* Processing and converting domain name to ASCII for code points.
|
||||
*
|
||||
* This function is exactly the same as lxb_unicode_idna_to_ascii() only it
|
||||
* takes code points instead of characters as input.
|
||||
*
|
||||
* Please, see lxb_unicode_idna_to_ascii() function.
|
||||
*
|
||||
* @param[in] lxb_unicode_idna_t *.
|
||||
* @param[in] Input code points for processing. Not NULL.
|
||||
* @param[in] Length of code points. Can be 0.
|
||||
* @param[in] Callback for results of processing.
|
||||
* @param[in] Context for callback.
|
||||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_idna_to_ascii_cp(lxb_unicode_idna_t *idna, const lxb_codepoint_t *cps,
|
||||
size_t length, lexbor_serialize_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags);
|
||||
|
||||
/*
|
||||
* Processing and converting domain name to Unicode.
|
||||
*
|
||||
* Does the same thing as lxb_unicode_idna_processing().
|
||||
*
|
||||
* Callback will be invoked only once, at the end of processing.
|
||||
*
|
||||
* https://www.unicode.org/reports/tr46/#ToUnicode
|
||||
*
|
||||
* @param[in] lxb_unicode_idna_t *.
|
||||
* @param[in] Input characters for processing. Not NULL.
|
||||
* @param[in] Length of characters. Can be 0.
|
||||
* @param[in] Callback for results of processing.
|
||||
* @param[in] Context for callback.
|
||||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_idna_to_unicode(lxb_unicode_idna_t *idna, const lxb_char_t *data,
|
||||
size_t length, lexbor_serialize_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags);
|
||||
|
||||
/*
|
||||
* Processing and converting domain name to Unicode for code points.
|
||||
*
|
||||
* This function is exactly the same as lxb_unicode_idna_to_unicode() only it
|
||||
* takes code points instead of characters as input.
|
||||
*
|
||||
* Please, see lxb_unicode_idna_to_unicode() function.
|
||||
*
|
||||
* @param[in] lxb_unicode_idna_t *.
|
||||
* @param[in] Input code points for processing. Not NULL.
|
||||
* @param[in] Length of code points. Can be 0.
|
||||
* @param[in] Callback for results of processing.
|
||||
* @param[in] Context for callback.
|
||||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_idna_to_unicode_cp(lxb_unicode_idna_t *idna, const lxb_codepoint_t *cps,
|
||||
size_t length, lexbor_serialize_cb_f cb, void *ctx,
|
||||
lxb_unicode_idna_flag_t flags);
|
||||
|
||||
/*
|
||||
* Validity Criteria.
|
||||
*
|
||||
* The function checks the domain name for validity according to a number of
|
||||
* criteria.
|
||||
*
|
||||
* LXB_UNICODE_IDNA_FLAG_CHECK_BIDI and LXB_UNICODE_IDNA_FLAG_CHECK_JOINERS
|
||||
* not implemented.
|
||||
*
|
||||
* https://www.unicode.org/reports/tr46/#Validity_Criteria
|
||||
*
|
||||
* @param[in] Input characters for processing. Not NULL.
|
||||
* @param[in] Length of characters. Can be 0.
|
||||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
|
||||
*
|
||||
* @return true if valid, otherwise false.
|
||||
*/
|
||||
LXB_API bool
|
||||
lxb_unicode_idna_validity_criteria(const lxb_char_t *data, size_t length,
|
||||
lxb_unicode_idna_flag_t flags);
|
||||
|
||||
/*
|
||||
* Validity Criteria.
|
||||
*
|
||||
* Same as lxb_unicode_idna_validity_criteria() only it takes codepoints as
|
||||
* input.
|
||||
*
|
||||
* @param[in] Input codepoints for processing. Not NULL.
|
||||
* @param[in] Length of codepoints. Can be 0.
|
||||
* @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
|
||||
*
|
||||
* @return true if valid, otherwise false.
|
||||
*/
|
||||
LXB_API bool
|
||||
lxb_unicode_idna_validity_criteria_cp(const lxb_codepoint_t *data, size_t length,
|
||||
lxb_unicode_idna_flag_t flags);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* LEXBOR_UNICODE_IDNA_H */
|
201955
ext/lexbor/lexbor/unicode/res.h
Normal file
201955
ext/lexbor/lexbor/unicode/res.h
Normal file
File diff suppressed because it is too large
Load diff
1039
ext/lexbor/lexbor/unicode/unicode.c
Normal file
1039
ext/lexbor/lexbor/unicode/unicode.c
Normal file
File diff suppressed because it is too large
Load diff
405
ext/lexbor/lexbor/unicode/unicode.h
Normal file
405
ext/lexbor/lexbor/unicode/unicode.h
Normal file
|
@ -0,0 +1,405 @@
|
|||
/*
|
||||
* Copyright (C) 2023 Alexander Borisov
|
||||
*
|
||||
* Author: Alexander Borisov <borisov@lexbor.com>
|
||||
*/
|
||||
|
||||
#ifndef LEXBOR_UNICODE_H
|
||||
#define LEXBOR_UNICODE_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "lexbor/unicode/base.h"
|
||||
#include "lexbor/unicode/idna.h"
|
||||
#include "lexbor/core/array_obj.h"
|
||||
|
||||
|
||||
/* Unicode normalization forms understood by the normalizer. */
typedef enum {
    /* Normalization Form C (NFC). */
    LXB_UNICODE_NFC  = 0x00,
    /* Normalization Form D (NFD). */
    LXB_UNICODE_NFD  = 0x01,
    /* Normalization Form KC (NFKC). */
    LXB_UNICODE_NFKC = 0x02,
    /* Normalization Form KD (NFKD). */
    LXB_UNICODE_NFKD = 0x03
}
lxb_unicode_form_t;
|
||||
|
||||
|
||||
/*
|
||||
* Create lxb_unicode_normalizer_t object.
|
||||
*
|
||||
* @return lxb_unicode_normalizer_t * if successful, otherwise NULL.
|
||||
*/
|
||||
LXB_API lxb_unicode_normalizer_t *
|
||||
lxb_unicode_normalizer_create(void);
|
||||
|
||||
/*
|
||||
* Initialization of lxb_unicode_normalizer_t object.
|
||||
*
|
||||
* Support normalization forms:
|
||||
* Normalization Form D (NFD): LXB_UNICODE_NFD
|
||||
* Normalization Form C (NFC): LXB_UNICODE_NFC
|
||||
* Normalization Form KD (NFKD): LXB_UNICODE_NFKD
|
||||
* Normalization Form KC (NFKC): LXB_UNICODE_NFKC
|
||||
*
|
||||
* https://www.unicode.org/reports/tr15/
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *
|
||||
* @param[in] Normalization form.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_normalizer_init(lxb_unicode_normalizer_t *uc,
|
||||
lxb_unicode_form_t form);
|
||||
|
||||
/*
|
||||
* Cleaning of lxb_unicode_normalizer_t object.
|
||||
*
|
||||
* Clears the object and returns it to the state it had right after initialization.
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *
|
||||
*/
|
||||
LXB_API void
|
||||
lxb_unicode_normalizer_clean(lxb_unicode_normalizer_t *uc);
|
||||
|
||||
/*
|
||||
* Destroy lxb_unicode_normalizer_t object.
|
||||
*
|
||||
* Release of occupied resources.
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *. Can be NULL.
|
||||
* @param[in] if false: only destroys internal buffers.
|
||||
* if true: destroys the lxb_unicode_normalizer_t object and all internal buffers.
|
||||
*
|
||||
* @return lxb_unicode_normalizer_t * if self_destroy = false, otherwise NULL.
|
||||
*/
|
||||
LXB_API lxb_unicode_normalizer_t *
|
||||
lxb_unicode_normalizer_destroy(lxb_unicode_normalizer_t *uc, bool self_destroy);
|
||||
|
||||
/*
|
||||
* Unicode normalization forms.
|
||||
*
|
||||
* This is a function with an implementation of the unicode normalization
|
||||
* algorithm.
|
||||
*
|
||||
* The function is designed to work with a stream (chunks).
|
||||
*
|
||||
* Please, see examples for this function in examples/lexbor/unicode directory.
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *
|
||||
* @param[in] Input characters for normalization. Not NULL.
|
||||
* @param[in] Length of characters. Can be 0.
|
||||
* @param[in] Callback for results of normalization.
|
||||
* @param[in] Context for callback.
|
||||
* @param[in] Set to true if the last chunk or the only one chunk is processed.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_normalize(lxb_unicode_normalizer_t *uc, const lxb_char_t *data,
|
||||
size_t length, lexbor_serialize_cb_f cb, void *ctx,
|
||||
bool is_last);
|
||||
|
||||
/*
|
||||
* Unicode normalization end.
|
||||
*
|
||||
* The function is used to complete a normalization.
|
||||
* Same as calling the lxb_unicode_normalize() function with is_last = true.
|
||||
*
|
||||
* Use this function only if you do not set is_last = true in
|
||||
* the lxb_unicode_normalize() function.
|
||||
*
|
||||
* For example:
|
||||
* status = lxb_unicode_normalize(uc, data, length, cb, NULL, false);
|
||||
* status = lxb_unicode_normalize(uc, data, length, cb, NULL, false);
|
||||
* status = lxb_unicode_normalize_end(uc, cb, NULL);
|
||||
*
|
||||
* The same as:
|
||||
* status = lxb_unicode_normalize(uc, data, length, cb, NULL, false);
|
||||
* status = lxb_unicode_normalize(uc, data, length, cb, NULL, true);
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *
|
||||
* @param[in] Callback for results of normalization.
|
||||
* @param[in] Context for callback.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_normalize_end(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_f cb,
|
||||
void *ctx);
|
||||
|
||||
/*
|
||||
* Unicode normalization forms for code points.
|
||||
*
|
||||
* This function is exactly the same as lxb_unicode_normalize() only it takes
|
||||
* code points instead of characters as input.
|
||||
*
|
||||
* Also, unlike the lxb_unicode_normalize() function, a callback will be called
|
||||
* to return code points, not characters.
|
||||
*
|
||||
* The function is designed to work with a stream (chunks).
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *
|
||||
* @param[in] Input code points for normalization. Not NULL.
|
||||
* @param[in] Length of code points. Can be 0.
|
||||
* @param[in] Callback for results of normalization.
|
||||
* @param[in] Context for callback.
|
||||
* @param[in] Set to true if the last chunk or the only chunk is being processed.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_normalize_cp(lxb_unicode_normalizer_t *uc, const lxb_codepoint_t *cps,
|
||||
size_t length, lexbor_serialize_cb_cp_f cb, void *ctx,
|
||||
bool is_last);
|
||||
|
||||
/*
|
||||
* Unicode normalization end for code points.
|
||||
*
|
||||
* This function is completely similar to lxb_unicode_normalize_end(),
|
||||
* only it takes a function with code points as a callback function.
|
||||
*
|
||||
* Same as calling the lxb_unicode_normalize_cp() function with is_last = true.
|
||||
*
|
||||
* Use this function only if you do not set is_last = true in
|
||||
* the lxb_unicode_normalize_cp() function.
|
||||
*
|
||||
* For example:
|
||||
* status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, false);
|
||||
* status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, false);
|
||||
* status = lxb_unicode_normalize_cp_end(uc, cb, NULL);
|
||||
*
|
||||
* The same as:
|
||||
* status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, false);
|
||||
* status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, true);
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *
|
||||
* @param[in] Callback for results of normalization.
|
||||
* @param[in] Context for callback.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_normalize_cp_end(lxb_unicode_normalizer_t *uc,
|
||||
lexbor_serialize_cb_cp_f cb, void *ctx);
|
||||
|
||||
/*
|
||||
* Quick Check.
|
||||
*
|
||||
* The basic normalization algorithm is not simple and requires time
|
||||
* and resources.
|
||||
* This function checks relatively quickly if the text needs to be normalized.
|
||||
*
|
||||
* The function is designed to work with a stream (chunks).
|
||||
*
|
||||
* https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *
|
||||
* @param[in] Input characters for checks. Not NULL.
|
||||
* @param[in] Length of characters. Can be 0.
|
||||
* @param[in] Set to true if the last chunk or the only chunk is being processed.
|
||||
*
|
||||
* @return true if it needs to be normalized, otherwise false.
|
||||
*/
|
||||
LXB_API bool
|
||||
lxb_unicode_quick_check(lxb_unicode_normalizer_t *uc, const lxb_char_t *data,
|
||||
size_t length, bool is_last);
|
||||
|
||||
/*
|
||||
* Quick Check End.
|
||||
*
|
||||
* The function is used to complete a quick check.
|
||||
* Same as calling the lxb_unicode_quick_check() function with is_last = true.
|
||||
*
|
||||
* Use this function only if you do not set is_last = true in
|
||||
* the lxb_unicode_quick_check() function.
|
||||
*
|
||||
* For example:
|
||||
* is = lxb_unicode_quick_check(uc, data, length, false);
|
||||
* is = lxb_unicode_quick_check(uc, data, length, false);
|
||||
* is = lxb_unicode_quick_check_end(uc);
|
||||
*
|
||||
* The same as:
|
||||
* is = lxb_unicode_quick_check(uc, data, length, false);
|
||||
* is = lxb_unicode_quick_check(uc, data, length, true);
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *
|
||||
*
|
||||
* @return true if it needs to be normalized, otherwise false.
|
||||
*/
|
||||
LXB_API bool
|
||||
lxb_unicode_quick_check_end(lxb_unicode_normalizer_t *uc);
|
||||
|
||||
/*
|
||||
* Quick Check for code points.
|
||||
*
|
||||
* Same as lxb_unicode_quick_check() only it takes code points as input.
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *
|
||||
* @param[in] Input code points for checks. Not NULL.
|
||||
* @param[in] Length of code points. Can be 0.
|
||||
* @param[in] Set to true if the last chunk or the only chunk is being processed.
|
||||
*
|
||||
* @return true if it needs to be normalized, otherwise false.
|
||||
*/
|
||||
LXB_API bool
|
||||
lxb_unicode_quick_check_cp(lxb_unicode_normalizer_t *uc,
|
||||
const lxb_codepoint_t *cps, size_t length,
|
||||
bool is_last);
|
||||
|
||||
/*
|
||||
* Quick Check End for code points.
|
||||
*
|
||||
* Same as lxb_unicode_quick_check_end().
|
||||
*
|
||||
* For example:
|
||||
* is = lxb_unicode_quick_check_cp(uc, cps, length, false);
|
||||
* is = lxb_unicode_quick_check_cp(uc, cps, length, false);
|
||||
* is = lxb_unicode_quick_check_cp_end(uc);
|
||||
*
|
||||
* The same as:
|
||||
* is = lxb_unicode_quick_check_cp(uc, cps, length, false);
|
||||
* is = lxb_unicode_quick_check_cp(uc, cps, length, true);
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *
|
||||
*
|
||||
* @return true if it needs to be normalized, otherwise false.
|
||||
*/
|
||||
LXB_API bool
|
||||
lxb_unicode_quick_check_cp_end(lxb_unicode_normalizer_t *uc);
|
||||
|
||||
/*
|
||||
* Flush.
|
||||
*
|
||||
* Force flush the buffer to the user's callback if possible.
|
||||
*
|
||||
* Please, see lxb_unicode_flush_count_set() function.
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *.
|
||||
* @param[in] Callback.
|
||||
* @param[in] Callback context.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_flush(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_f cb,
|
||||
void *ctx);
|
||||
|
||||
/*
|
||||
* Flush for code points.
|
||||
*
|
||||
* Same as lxb_unicode_flush(), but it takes a callback with code points as
|
||||
* input.
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *.
|
||||
* @param[in] Callback.
|
||||
* @param[in] Callback context.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_flush_cp(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_cp_f cb,
|
||||
void *ctx);
|
||||
|
||||
/*
|
||||
* Change normalization form.
|
||||
*
|
||||
* You should only apply this function after one of the following actions:
|
||||
* 1. The lxb_unicode_normalize() function was called with is_last = true.
|
||||
* That is, the processing of the previous type was successfully
|
||||
* completed.
|
||||
* OR
|
||||
* 2. The end of normalization function was called:
|
||||
* lxb_unicode_normalize_end().
|
||||
* OR
|
||||
* 3. The lxb_unicode_normalizer_t object cleanup function was called:
|
||||
* lxb_unicode_normalizer_clean().
|
||||
*
|
||||
*
|
||||
* All this is to be able to normalize or quickly check text with different
|
||||
* types without creating new objects.
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *.
|
||||
* @param[in] Normalization form.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_unicode_normalization_form_set(lxb_unicode_normalizer_t *uc,
|
||||
lxb_unicode_form_t form);
|
||||
|
||||
LXB_API const lxb_unicode_entry_t *
|
||||
lxb_unicode_entry(lxb_codepoint_t cp);
|
||||
|
||||
LXB_API const lxb_unicode_composition_cp_t *
|
||||
lxb_unicode_compose_entry(lxb_codepoint_t first, lxb_codepoint_t second);
|
||||
|
||||
LXB_API lxb_unicode_idna_type_t
|
||||
lxb_unicode_idna_type(lxb_codepoint_t cp);
|
||||
|
||||
LXB_API const lxb_unicode_composition_cp_t *
|
||||
lxb_unicode_composition_cp(lxb_codepoint_t first, lxb_codepoint_t second);
|
||||
|
||||
LXB_API const lxb_unicode_normalization_entry_t *
|
||||
lxb_unicode_normalization_entry(const lxb_unicode_entry_t *entry);
|
||||
|
||||
LXB_API const lxb_unicode_normalization_entry_t *
|
||||
lxb_unicode_normalization_entry_by_cp(lxb_codepoint_t cp);
|
||||
|
||||
LXB_API const lxb_unicode_normalization_entry_t *
|
||||
lxb_unicode_normalization_entry_by_index(uint16_t index);
|
||||
|
||||
LXB_API bool
|
||||
lxb_unicode_normalization_is_null(const lxb_unicode_normalization_entry_t *entry);
|
||||
|
||||
LXB_API const lxb_codepoint_t *
|
||||
lxb_unicode_full_canonical(const lxb_unicode_normalization_entry_t *entry,
|
||||
size_t *out_length);
|
||||
|
||||
LXB_API const lxb_codepoint_t *
|
||||
lxb_unicode_full_compatibility(const lxb_unicode_normalization_entry_t *entry,
|
||||
size_t *out_length);
|
||||
|
||||
LXB_API const lxb_unicode_idna_entry_t *
|
||||
lxb_unicode_idna_entry(const lxb_unicode_entry_t *entry);
|
||||
|
||||
LXB_API const lxb_unicode_idna_entry_t *
|
||||
lxb_unicode_idna_entry_by_cp(lxb_codepoint_t cp);
|
||||
|
||||
LXB_API const lxb_unicode_idna_entry_t *
|
||||
lxb_unicode_idna_entry_by_index(uint16_t index);
|
||||
|
||||
LXB_API const lxb_codepoint_t *
|
||||
lxb_unicode_idna_map(const lxb_unicode_idna_entry_t *entry,
|
||||
size_t *out_length);
|
||||
|
||||
/*
|
||||
* Inline functions.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Sets the buffer size for codepoints.
|
||||
*
|
||||
* By default, 4096 processed codepoints are accumulated before converting them
|
||||
* to lxb_char_t and returning the result to the user via callback.
|
||||
*
|
||||
* If the count is set to 0, the user callback will be called for every codepoint
|
||||
* processed. That is, it will be streaming without accumulation in
|
||||
* the intermediate buffer.
|
||||
*
|
||||
* @param[in] lxb_unicode_normalizer_t *.
|
||||
* @param[in] Count of codepoints in the buffer.
|
||||
*/
|
||||
lxb_inline void
|
||||
lxb_unicode_flush_count_set(lxb_unicode_normalizer_t *uc, size_t count)
|
||||
{
|
||||
/* Only stores the new threshold in uc->flush_cp; nothing is flushed here. */
/* uc is dereferenced without a NULL check, so it must not be NULL. */
uc->flush_cp = count;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* LEXBOR_UNICODE_H */
|
32
ext/lexbor/lexbor/url/base.h
Normal file
32
ext/lexbor/lexbor/url/base.h
Normal file
|
@ -0,0 +1,32 @@
|
|||
/*
|
||||
* Copyright (C) 2023-2024 Alexander Borisov
|
||||
*
|
||||
* Author: Alexander Borisov <borisov@lexbor.com>
|
||||
*/
|
||||
|
||||
#ifndef LEXBOR_URL_BASE_H
|
||||
#define LEXBOR_URL_BASE_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "lexbor/core/base.h"
|
||||
#include "lexbor/core/mraw.h"
|
||||
#include "lexbor/core/str.h"
|
||||
|
||||
|
||||
/* Version components of the URL module (0.3.0 as written here). */
#define LXB_URL_VERSION_MAJOR 0
|
||||
#define LXB_URL_VERSION_MINOR 3
|
||||
#define LXB_URL_VERSION_PATCH 0
|
||||
|
||||
/*
 * Dotted version string assembled from the three components above through
 * adjacent string-literal concatenation.
 * NOTE(review): LEXBOR_STRINGIZE is not defined in this file; presumably it
 * stringizes the expanded macro value -- confirm in lexbor/core.
 */
#define LXB_URL_VERSION_STRING LEXBOR_STRINGIZE(LXB_URL_VERSION_MAJOR) "." \
|
||||
LEXBOR_STRINGIZE(LXB_URL_VERSION_MINOR) "." \
|
||||
LEXBOR_STRINGIZE(LXB_URL_VERSION_PATCH)
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* LEXBOR_URL_BASE_H */
|
4845
ext/lexbor/lexbor/url/url.c
Normal file
4845
ext/lexbor/lexbor/url/url.c
Normal file
File diff suppressed because it is too large
Load diff
551
ext/lexbor/lexbor/url/url.h
Normal file
551
ext/lexbor/lexbor/url/url.h
Normal file
|
@ -0,0 +1,551 @@
|
|||
/*
|
||||
* Copyright (C) 2023 Alexander Borisov
|
||||
*
|
||||
* Author: Alexander Borisov <borisov@lexbor.com>
|
||||
*
|
||||
* The URL Standard.
|
||||
* By specification: https://url.spec.whatwg.org/
|
||||
*/
|
||||
|
||||
#ifndef LEXBOR_URL_H
|
||||
#define LEXBOR_URL_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "lexbor/url/base.h"
|
||||
#include "lexbor/core/mraw.h"
|
||||
#include "lexbor/core/plog.h"
|
||||
#include "lexbor/encoding/encoding.h"
|
||||
#include "lexbor/unicode/unicode.h"
|
||||
|
||||
|
||||
/*
 * Error kinds of the URL module.
 *
 * Only the first enumerator has an explicit value (0x00); the rest follow
 * sequentially, so LXB_URL_ERROR_TYPE__LAST_ENTRY equals the number of
 * error kinds listed before it.
 *
 * NOTE(review): the names appear to mirror the validation errors of the
 * WHATWG URL Standard -- confirm against url.c and the specification.
 */
typedef enum {
|
||||
LXB_URL_ERROR_TYPE_DOMAIN_TO_ASCII = 0x00,
|
||||
LXB_URL_ERROR_TYPE_DOMAIN_TO_UNICODE,
|
||||
LXB_URL_ERROR_TYPE_DOMAIN_INVALID_CODE_POINT,
|
||||
LXB_URL_ERROR_TYPE_HOST_INVALID_CODE_POINT,
|
||||
LXB_URL_ERROR_TYPE_IPV4_EMPTY_PART,
|
||||
LXB_URL_ERROR_TYPE_IPV4_TOO_MANY_PARTS,
|
||||
LXB_URL_ERROR_TYPE_IPV4_NON_NUMERIC_PART,
|
||||
LXB_URL_ERROR_TYPE_IPV4_NON_DECIMAL_PART,
|
||||
LXB_URL_ERROR_TYPE_IPV4_OUT_OF_RANGE_PART,
|
||||
LXB_URL_ERROR_TYPE_IPV6_UNCLOSED,
|
||||
LXB_URL_ERROR_TYPE_IPV6_INVALID_COMPRESSION,
|
||||
LXB_URL_ERROR_TYPE_IPV6_TOO_MANY_PIECES,
|
||||
LXB_URL_ERROR_TYPE_IPV6_MULTIPLE_COMPRESSION,
|
||||
LXB_URL_ERROR_TYPE_IPV6_INVALID_CODE_POINT,
|
||||
LXB_URL_ERROR_TYPE_IPV6_TOO_FEW_PIECES,
|
||||
LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_TOO_MANY_PIECES,
|
||||
LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_INVALID_CODE_POINT,
|
||||
LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_OUT_OF_RANGE_PART,
|
||||
LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_TOO_FEW_PARTS,
|
||||
LXB_URL_ERROR_TYPE_INVALID_URL_UNIT,
|
||||
LXB_URL_ERROR_TYPE_SPECIAL_SCHEME_MISSING_FOLLOWING_SOLIDUS,
|
||||
LXB_URL_ERROR_TYPE_MISSING_SCHEME_NON_RELATIVE_URL,
|
||||
LXB_URL_ERROR_TYPE_INVALID_REVERSE_SOLIDUS,
|
||||
LXB_URL_ERROR_TYPE_INVALID_CREDENTIALS,
|
||||
LXB_URL_ERROR_TYPE_HOST_MISSING,
|
||||
LXB_URL_ERROR_TYPE_PORT_OUT_OF_RANGE,
|
||||
LXB_URL_ERROR_TYPE_PORT_INVALID,
|
||||
LXB_URL_ERROR_TYPE_FILE_INVALID_WINDOWS_DRIVE_LETTER,
|
||||
LXB_URL_ERROR_TYPE_FILE_INVALID_WINDOWS_DRIVE_LETTER_HOST,
|
||||
LXB_URL_ERROR_TYPE__LAST_ENTRY
|
||||
}
|
||||
lxb_url_error_type_t;
|
||||
|
||||
/*
 * Parser states; this is the type of the override_state argument of
 * lxb_url_parse_basic(), whose documentation names LXB_URL_STATE__UNDEF
 * (0x00) as the default.
 *
 * Only the first enumerator has an explicit value; the rest follow
 * sequentially.
 *
 * NOTE(review): the names appear to follow the state names of the WHATWG
 * URL basic parser -- confirm against the specification.
 */
typedef enum {
|
||||
LXB_URL_STATE__UNDEF = 0x00,
|
||||
LXB_URL_STATE_SCHEME_START_STATE,
|
||||
LXB_URL_STATE_SCHEME_STATE,
|
||||
LXB_URL_STATE_NO_SCHEME_STATE,
|
||||
LXB_URL_STATE_SPECIAL_RELATIVE_OR_AUTHORITY_STATE,
|
||||
LXB_URL_STATE_PATH_OR_AUTHORITY_STATE,
|
||||
LXB_URL_STATE_RELATIVE_STATE,
|
||||
LXB_URL_STATE_RELATIVE_SLASH_STATE,
|
||||
LXB_URL_STATE_SPECIAL_AUTHORITY_SLASHES_STATE,
|
||||
LXB_URL_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE,
|
||||
LXB_URL_STATE_AUTHORITY_STATE,
|
||||
LXB_URL_STATE_HOST_STATE,
|
||||
LXB_URL_STATE_HOSTNAME_STATE,
|
||||
LXB_URL_STATE_PORT_STATE,
|
||||
LXB_URL_STATE_FILE_STATE,
|
||||
LXB_URL_STATE_FILE_SLASH_STATE,
|
||||
LXB_URL_STATE_FILE_HOST_STATE,
|
||||
LXB_URL_STATE_PATH_START_STATE,
|
||||
LXB_URL_STATE_PATH_STATE,
|
||||
LXB_URL_STATE_OPAQUE_PATH_STATE,
|
||||
LXB_URL_STATE_QUERY_STATE,
|
||||
LXB_URL_STATE_FRAGMENT_STATE
|
||||
}
|
||||
lxb_url_state_t;
|
||||
|
||||
/*
|
||||
* New values can only be added downwards.
|
||||
* Before LXB_URL_SCHEMEL_TYPE__LAST_ENTRY.
|
||||
*
|
||||
* Please, see lxb_url_scheme_res in /lexbor/url/url.c.
|
||||
*/
|
||||
/*
 * Scheme identifiers with explicit consecutive values 0x00..0x07.
 * LXB_URL_SCHEMEL_TYPE__LAST_ENTRY has no explicit value and therefore
 * takes the next one after LXB_URL_SCHEMEL_TYPE_FILE.
 */
typedef enum {
|
||||
LXB_URL_SCHEMEL_TYPE__UNDEF = 0x00,
|
||||
LXB_URL_SCHEMEL_TYPE__UNKNOWN = 0x01,
|
||||
LXB_URL_SCHEMEL_TYPE_HTTP = 0x02,
|
||||
LXB_URL_SCHEMEL_TYPE_HTTPS = 0x03,
|
||||
LXB_URL_SCHEMEL_TYPE_WS = 0x04,
|
||||
LXB_URL_SCHEMEL_TYPE_WSS = 0x05,
|
||||
LXB_URL_SCHEMEL_TYPE_FTP = 0x06,
|
||||
LXB_URL_SCHEMEL_TYPE_FILE = 0x07,
|
||||
LXB_URL_SCHEMEL_TYPE__LAST_ENTRY
|
||||
}
|
||||
lxb_url_scheme_type_t;
|
||||
|
||||
/*
 * Description of one scheme: its name, a port number and its
 * lxb_url_scheme_type_t identifier.
 *
 * NOTE(review): `port` is presumably the scheme's default port, and these
 * records presumably make up lxb_url_scheme_res in url.c -- confirm there.
 */
typedef struct {
|
||||
const lexbor_str_t name;
|
||||
uint16_t port;
|
||||
lxb_url_scheme_type_t type;
|
||||
}
|
||||
lxb_url_scheme_data_t;
|
||||
|
||||
/*
 * Scheme as stored in lxb_url_t: the scheme name together with its
 * lxb_url_scheme_type_t identifier.
 */
typedef struct {
|
||||
lexbor_str_t name;
|
||||
lxb_url_scheme_type_t type;
|
||||
}
|
||||
lxb_url_scheme_t;
|
||||
|
||||
/*
 * Kinds of host, with explicit values 0x00..0x05. This is the type of the
 * `type` member of lxb_url_host_t.
 */
typedef enum {
|
||||
LXB_URL_HOST_TYPE__UNDEF = 0x00,
|
||||
LXB_URL_HOST_TYPE_DOMAIN = 0x01,
|
||||
LXB_URL_HOST_TYPE_OPAQUE = 0x02,
|
||||
LXB_URL_HOST_TYPE_IPV4 = 0x03,
|
||||
LXB_URL_HOST_TYPE_IPV6 = 0x04,
|
||||
LXB_URL_HOST_TYPE_EMPTY = 0x05
|
||||
}
|
||||
lxb_url_host_type_t;
|
||||
|
||||
/*
 * Host of a URL: a kind tag plus a union holding the value.
 *
 * NOTE(review): `type` presumably selects which member of `u` is valid
 * (e.g. LXB_URL_HOST_TYPE_IPV4 -> u.ipv4) -- confirm in url.c before
 * reading a member.
 */
typedef struct {
|
||||
lxb_url_host_type_t type;
|
||||
|
||||
union {
|
||||
/* IPv6 address held as eight 16-bit values. */
uint16_t ipv6[8];
|
||||
/* IPv4 address held as a single 32-bit value. */
uint32_t ipv4;
|
||||
lexbor_str_t opaque;
|
||||
lexbor_str_t domain;
|
||||
} u;
|
||||
}
|
||||
lxb_url_host_t;
|
||||
|
||||
/*
 * Path of a URL.
 *
 * NOTE(review): how `length` relates to `str.length` is not visible in this
 * header, and `opaque` presumably marks an opaque path in the sense of the
 * URL Standard -- confirm both in url.c.
 */
typedef struct {
|
||||
lexbor_str_t str;
|
||||
size_t length;
|
||||
bool opaque;
|
||||
}
|
||||
lxb_url_path_t;
|
||||
|
||||
/*
 * URL object. Its members are exposed read-only through the lxb_url_*()
 * inline accessors declared later in this header.
 */
typedef struct {
|
||||
lxb_url_scheme_t scheme;
|
||||
|
||||
lxb_url_host_t host;
|
||||
|
||||
lexbor_str_t username;
|
||||
lexbor_str_t password;
|
||||
|
||||
/* NOTE(review): presumably `port` is only meaningful when `has_port` is true -- confirm. */
uint16_t port;
|
||||
bool has_port;
|
||||
|
||||
lxb_url_path_t path;
|
||||
|
||||
lexbor_str_t query;
|
||||
lexbor_str_t fragment;
|
||||
|
||||
/* Memory object bound to this URL (see the lxb_url_parser_init() notes). */
lexbor_mraw_t *mraw;
|
||||
}
|
||||
lxb_url_t;
|
||||
|
||||
/*
 * URL parser object.
 */
typedef struct {
|
||||
/* Returned by lxb_url_get(). */
lxb_url_t *url;
|
||||
/* Read by lxb_url_mraw() and replaced by lxb_url_mraw_set(). */
lexbor_mraw_t *mraw;
|
||||
/* NOTE(review): presumably the parsing log mentioned in the API notes -- confirm. */
lexbor_plog_t *log;
|
||||
|
||||
/* NOTE(review): presumably used for IDNA processing of domains -- confirm in url.c. */
lxb_unicode_idna_t *idna;
|
||||
}
|
||||
lxb_url_parser_t;
|
||||
|
||||
|
||||
/*
|
||||
* Create lxb_url_parser_t object.
|
||||
*
|
||||
* @return lxb_url_parser_t * if successful, otherwise NULL.
|
||||
*/
|
||||
LXB_API lxb_url_parser_t *
|
||||
lxb_url_parser_create(void);
|
||||
|
||||
/*
|
||||
* Initialization of lxb_url_parser_t object.
|
||||
*
|
||||
* The parser is not bound to the received URLs in any way. That is, after
|
||||
* parsing the lxb_url_parser_t object can be destroyed and we can continue
|
||||
* working with the received URLs.
|
||||
*
|
||||
* Memory for created URLs is taken from lexbor_mraw_t object, which you can
|
||||
* pass during initialization of lxb_url_parser_t object, or a new lexbor_mraw_t
|
||||
* object will be created during initialization if NULL is passed.
|
||||
*
|
||||
* Each created URL will have a pointer to the lexbor_mraw_t object.
|
||||
*
|
||||
* By destroying the lexbor_mraw_t object you destroy all the URL objects
|
||||
* created by the parser. Use the lxb_url_destroy() function to destroy a
|
||||
* specific URL.
|
||||
*
|
||||
* Destroying the lxb_url_parser_t object with lxb_url_parser_destroy() does
|
||||
* not destroy the lexbor_mraw_t memory object.
|
||||
*
|
||||
* Please, see functions lxb_url_parser_memory_destroy(), lxb_url_destroy(),
|
||||
* lxb_url_memory_destroy().
|
||||
*
|
||||
* @param[in] lxb_url_parser_t *
|
||||
* @param[in] lexbor_mraw_t *. Can be NULL. If NULL is passed, it will create its own
|
||||
* memory object inside parser and it will be bound to all created URLs.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_parser_init(lxb_url_parser_t *parser, lexbor_mraw_t *mraw);
|
||||
|
||||
/*
|
||||
* Clears the object. Returns the object to the state it had right after initialization.
|
||||
*
|
||||
* This function must be called before the parsing functions can be reused.
|
||||
*
|
||||
* For example:
|
||||
* lxb_url_parse()
|
||||
* lxb_url_parser_clean()
|
||||
* lxb_url_parse()
|
||||
* lxb_url_destroy()
|
||||
*
|
||||
* @param[in] lxb_url_parser_t *
|
||||
*/
|
||||
LXB_API void
|
||||
lxb_url_parser_clean(lxb_url_parser_t *parser);
|
||||
|
||||
/*
|
||||
* Destroy lxb_url_parser_t object.
|
||||
*
|
||||
* Release of occupied resources.
|
||||
* The lexbor_mraw_t memory object is not destroyed in this function.
|
||||
*
|
||||
* @param[in] lxb_url_parser_t *. Can be NULL.
|
||||
* @param[in] if false: only destroys internal buffers.
|
||||
* if true: destroys the lxb_url_parser_t object and all internal buffers.
|
||||
*
|
||||
* @return lxb_url_parser_t * if destroy_self = false, otherwise NULL.
|
||||
*/
|
||||
LXB_API lxb_url_parser_t *
|
||||
lxb_url_parser_destroy(lxb_url_parser_t *parser, bool destroy_self);
|
||||
|
||||
/*
|
||||
* Destroys the lexbor_mraw_t object, and thus all associated URLs.
|
||||
*
|
||||
* After that, new URLs cannot be parsed until a new lexbor_mraw_t object is
|
||||
* assigned to the lxb_url_parser_t object.
|
||||
*
|
||||
* @param[in] lxb_url_parser_t *.
|
||||
*/
|
||||
LXB_API void
|
||||
lxb_url_parser_memory_destroy(lxb_url_parser_t *parser);
|
||||
|
||||
/*
|
||||
* URL parser.
|
||||
*
|
||||
* This function is an implementation of URL parsing according to the WHATWG
|
||||
* specification.
|
||||
*
|
||||
* @param[in] lxb_url_parser_t *.
|
||||
* @param[in] const lxb_url_t *. Base URL, can be NULL.
|
||||
* @param[in] Input characters. Not NULL.
|
||||
* @param[in] Length of characters. Can be 0.
|
||||
*
|
||||
* @return lxb_url_t * if successful, otherwise NULL.
|
||||
*/
|
||||
LXB_API lxb_url_t *
|
||||
lxb_url_parse(lxb_url_parser_t *parser, const lxb_url_t *base_url,
|
||||
const lxb_char_t *data, size_t length);
|
||||
|
||||
/*
|
||||
* URL basic parser.
|
||||
*
|
||||
* This function is an implementation of URL parsing according to the WHATWG
|
||||
* specification.
|
||||
*
|
||||
* Use the lxb_url_get() function to get the URL object.
|
||||
*
|
||||
* @param[in] lxb_url_parser_t *.
|
||||
* @param[in] lxb_url_t *. Can be NULL.
|
||||
* @param[in] const lxb_url_t *. Base URL, can be NULL.
|
||||
* @param[in] Input characters. Not NULL.
|
||||
* @param[in] Length of characters. Can be 0.
|
||||
* @param[in] lxb_url_state_t, for default set to LXB_URL_STATE__UNDEF.
|
||||
* @param[in] lxb_encoding_t, default (LXB_ENCODING_DEFAULT) LXB_ENCODING_UTF_8.
|
||||
*
|
||||
* @return LXB_STATUS_OK if successful, otherwise an error status value.
|
||||
*/
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_parse_basic(lxb_url_parser_t *parser, lxb_url_t *url,
|
||||
const lxb_url_t *base_url,
|
||||
const lxb_char_t *data, size_t length,
|
||||
lxb_url_state_t override_state, lxb_encoding_t encoding);
|
||||
|
||||
/*
|
||||
* Erase URL.
|
||||
*
|
||||
* Frees all internal memory occupied by the URL object, but does not destroy
|
||||
* the object.
|
||||
*
|
||||
* @param[in] lxb_url_t *.
|
||||
*
|
||||
* This function does not return a value.
|
||||
*/
|
||||
LXB_API void
|
||||
lxb_url_erase(lxb_url_t *url);
|
||||
|
||||
/*
|
||||
* Destroys URL.
|
||||
*
|
||||
* @param[in] lxb_url_t *.
|
||||
*
|
||||
* @return NULL.
|
||||
*/
|
||||
LXB_API lxb_url_t *
|
||||
lxb_url_destroy(lxb_url_t *url);
|
||||
|
||||
/*
|
||||
* Destroys the lexbor_mraw_t memory object.
|
||||
*
|
||||
* The function will destroy all URLs associated with the lexbor_mraw_t memory
|
||||
* object, including the passed one.
|
||||
*
|
||||
* Keep in mind, if you have a live lxb_url_parser_t parsing object, you will
|
||||
* have a pointer to garbage after calling this function instead of a pointer
|
||||
* to the lexbor_mraw_t object.
|
||||
* In this case you need to assign a new memory object lexbor_mraw_t for the
|
||||
* parser. Use the lxb_url_mraw_set() function.
|
||||
*
|
||||
* @param[in] lxb_url_t *.
|
||||
*/
|
||||
LXB_API void
|
||||
lxb_url_memory_destroy(lxb_url_t *url);
|
||||
|
||||
|
||||
/*
|
||||
* Below is an API for modifying the URL object according to the
|
||||
* https://url.spec.whatwg.org/#api specification.
|
||||
*
|
||||
* It is not necessary to pass the lxb_url_parser_t object to API functions.
|
||||
* You need to pass the parser if you want to have logs of parsing.
|
||||
*
|
||||
* All API functions can be passed NULL as "const lxb_char_t *" data.
|
||||
*/
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_api_href_set(lxb_url_t *url, lxb_url_parser_t *parser,
|
||||
const lxb_char_t *href, size_t length);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_api_protocol_set(lxb_url_t *url, lxb_url_parser_t *parser,
|
||||
const lxb_char_t *protocol, size_t length);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_api_username_set(lxb_url_t *url,
|
||||
const lxb_char_t *username, size_t length);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_api_password_set(lxb_url_t *url,
|
||||
const lxb_char_t *password, size_t length);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_api_host_set(lxb_url_t *url, lxb_url_parser_t *parser,
|
||||
const lxb_char_t *host, size_t length);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_api_hostname_set(lxb_url_t *url, lxb_url_parser_t *parser,
|
||||
const lxb_char_t *hostname, size_t length);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_api_port_set(lxb_url_t *url, lxb_url_parser_t *parser,
|
||||
const lxb_char_t *port, size_t length);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_api_pathname_set(lxb_url_t *url, lxb_url_parser_t *parser,
|
||||
const lxb_char_t *pathname, size_t length);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_api_search_set(lxb_url_t *url, lxb_url_parser_t *parser,
|
||||
const lxb_char_t *search, size_t length);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_api_hash_set(lxb_url_t *url, lxb_url_parser_t *parser,
|
||||
const lxb_char_t *hash, size_t length);
|
||||
|
||||
|
||||
/*
|
||||
* Below are functions for serializing a URL object and its individual
|
||||
* parameters.
|
||||
*
|
||||
* Note that the callback may be called more than once.
|
||||
* For example, the lxb_url_serialize() function will callback multiple times:
|
||||
* 1. http
|
||||
* 2. ://
|
||||
* 3. example.com
|
||||
* and so on.
|
||||
*/
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_serialize(const lxb_url_t *url, lexbor_serialize_cb_f cb, void *ctx,
|
||||
bool exclude_fragment);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_serialize_scheme(const lxb_url_t *url,
|
||||
lexbor_serialize_cb_f cb, void *ctx);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_serialize_username(const lxb_url_t *url,
|
||||
lexbor_serialize_cb_f cb, void *ctx);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_serialize_password(const lxb_url_t *url,
|
||||
lexbor_serialize_cb_f cb, void *ctx);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_serialize_host(const lxb_url_host_t *host,
|
||||
lexbor_serialize_cb_f cb, void *ctx);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_serialize_host_unicode(lxb_unicode_idna_t *idna,
|
||||
const lxb_url_host_t *host,
|
||||
lexbor_serialize_cb_f cb, void *ctx);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_serialize_host_ipv4(uint32_t ipv4,
|
||||
lexbor_serialize_cb_f cb, void *ctx);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_serialize_host_ipv6(const uint16_t *ipv6,
|
||||
lexbor_serialize_cb_f cb, void *ctx);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_serialize_port(const lxb_url_t *url,
|
||||
lexbor_serialize_cb_f cb, void *ctx);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_serialize_path(const lxb_url_path_t *path,
|
||||
lexbor_serialize_cb_f cb, void *ctx);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_serialize_query(const lxb_url_t *url,
|
||||
lexbor_serialize_cb_f cb, void *ctx);
|
||||
|
||||
LXB_API lxb_status_t
|
||||
lxb_url_serialize_fragment(const lxb_url_t *url,
|
||||
lexbor_serialize_cb_f cb, void *ctx);
|
||||
|
||||
/*
|
||||
* Creates a clone of the URL object.
|
||||
*
|
||||
* For lexbor_mraw_t *, use url->mraw or another lexbor_mraw_t * object.
|
||||
*
|
||||
* @param[in] lexbor_mraw_t *.
|
||||
* @param[in] lxb_url_t *.
|
||||
*
|
||||
* @return a new URL object if successful, otherwise NULL value.
|
||||
*/
|
||||
LXB_API lxb_url_t *
|
||||
lxb_url_clone(lexbor_mraw_t *mraw, lxb_url_t *url);
|
||||
|
||||
/*
|
||||
* Inline functions.
|
||||
*/
|
||||
|
||||
/*
 * Returns a pointer to the scheme name stored inside the URL object.
 * No copy is made. `url` is dereferenced without a NULL check.
 */
lxb_inline const lexbor_str_t *
|
||||
lxb_url_scheme(const lxb_url_t *url)
|
||||
{
|
||||
return &url->scheme.name;
|
||||
}
|
||||
|
||||
/*
 * Returns a pointer to the username string stored inside the URL object.
 * No copy is made. `url` is dereferenced without a NULL check.
 */
lxb_inline const lexbor_str_t *
|
||||
lxb_url_username(const lxb_url_t *url)
|
||||
{
|
||||
return &url->username;
|
||||
}
|
||||
|
||||
/*
 * Returns a pointer to the password string stored inside the URL object.
 * No copy is made. `url` is dereferenced without a NULL check.
 */
lxb_inline const lexbor_str_t *
|
||||
lxb_url_password(const lxb_url_t *url)
|
||||
{
|
||||
return &url->password;
|
||||
}
|
||||
|
||||
/*
 * Returns a pointer to the host structure stored inside the URL object.
 * No copy is made. `url` is dereferenced without a NULL check.
 */
lxb_inline const lxb_url_host_t *
|
||||
lxb_url_host(const lxb_url_t *url)
|
||||
{
|
||||
return &url->host;
|
||||
}
|
||||
|
||||
/*
 * Returns the port value stored in the URL object as is.
 * NOTE(review): presumably only meaningful when lxb_url_has_port() returns
 * true -- confirm in url.c.
 */
lxb_inline uint16_t
|
||||
lxb_url_port(const lxb_url_t *url)
|
||||
{
|
||||
return url->port;
|
||||
}
|
||||
|
||||
/*
 * Returns the has_port flag stored in the URL object.
 */
lxb_inline bool
|
||||
lxb_url_has_port(const lxb_url_t *url)
|
||||
{
|
||||
return url->has_port;
|
||||
}
|
||||
|
||||
/*
 * Returns a pointer to the path structure stored inside the URL object.
 * No copy is made. `url` is dereferenced without a NULL check.
 */
lxb_inline const lxb_url_path_t *
|
||||
lxb_url_path(const lxb_url_t *url)
|
||||
{
|
||||
return &url->path;
|
||||
}
|
||||
|
||||
/*
 * Returns a pointer to the string member of the path stored inside the
 * URL object (url->path.str). No copy is made.
 */
lxb_inline const lexbor_str_t *
|
||||
lxb_url_path_str(const lxb_url_t *url)
|
||||
{
|
||||
return &url->path.str;
|
||||
}
|
||||
|
||||
/*
 * Returns a pointer to the query string stored inside the URL object.
 * No copy is made. `url` is dereferenced without a NULL check.
 */
lxb_inline const lexbor_str_t *
|
||||
lxb_url_query(const lxb_url_t *url)
|
||||
{
|
||||
return &url->query;
|
||||
}
|
||||
|
||||
/*
 * Returns a pointer to the fragment string stored inside the URL object.
 * No copy is made. `url` is dereferenced without a NULL check.
 */
lxb_inline const lexbor_str_t *
|
||||
lxb_url_fragment(const lxb_url_t *url)
|
||||
{
|
||||
return &url->fragment;
|
||||
}
|
||||
|
||||
/*
 * Returns the memory object pointer currently held by the parser.
 * Note that the argument is the parser, not a URL object.
 */
lxb_inline lexbor_mraw_t *
|
||||
lxb_url_mraw(lxb_url_parser_t *parser)
|
||||
{
|
||||
return parser->mraw;
|
||||
}
|
||||
|
||||
/*
 * Replaces the memory object pointer held by the parser.
 * Only the pointer is overwritten; the previously assigned object is not
 * released by this function.
 */
lxb_inline void
|
||||
lxb_url_mraw_set(lxb_url_parser_t *parser, lexbor_mraw_t *mraw)
|
||||
{
|
||||
parser->mraw = mraw;
|
||||
}
|
||||
|
||||
/*
 * Returns the URL pointer currently stored in the parser (parser->url).
 * The lxb_url_parse_basic() documentation points to this accessor for
 * obtaining the URL object.
 */
lxb_inline lxb_url_t *
|
||||
lxb_url_get(lxb_url_parser_t *parser)
|
||||
{
|
||||
return parser->url;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* LEXBOR_URL_H */
|
Loading…
Add table
Add a link
Reference in a new issue