Add Lexbor files for URL handling (#18656)

Relates to #14461 and https://wiki.php.net/rfc/url_parsing_api
2025-08-15 13:38:49 +02:00 · 2025-05-27 13:06:02 +02:00 · 2025-05-27 13:06:02 +02:00 · 400b7b8c74
commit 400b7b8c74
parent d585a5609d
15 changed files with 210811 additions and 1 deletions
--- a/codecov.yml
+++ b/codecov.yml
@ -7,6 +7,9 @@ ignore:
  - "ext/lexbor/lexbor/html"
  - "ext/lexbor/lexbor/ns"
  - "ext/lexbor/lexbor/ports"
  - "ext/lexbor/lexbor/punycode"
  - "ext/lexbor/lexbor/tag"
  - "ext/lexbor/lexbor/unicode"
  - "ext/lexbor/lexbor/url"
  - "ext/pcre/pcre2lib"
  - "ext/uri/uriparser"
--- a/ext/lexbor/config.m4
+++ b/ext/lexbor/config.m4
@ -17,6 +17,7 @@ PHP_NEW_EXTENSION([lexbor], m4_normalize([
    $LEXBOR_DIR/core/hash.c
    $LEXBOR_DIR/core/mem.c
    $LEXBOR_DIR/core/mraw.c
    $LEXBOR_DIR/core/plog.c
    $LEXBOR_DIR/core/print.c
    $LEXBOR_DIR/core/serialize.c
    $LEXBOR_DIR/core/shs.c
@ -174,7 +175,11 @@ PHP_NEW_EXTENSION([lexbor], m4_normalize([
    $LEXBOR_DIR/html/tree/open_elements.c
    $LEXBOR_DIR/ns/ns.c
    $LEXBOR_DIR/ports/posix/lexbor/core/memory.c
    $LEXBOR_DIR/punycode/punycode.c
    $LEXBOR_DIR/tag/tag.c
    $LEXBOR_DIR/unicode/idna.c
    $LEXBOR_DIR/unicode/unicode.c
    $LEXBOR_DIR/url/url.c
  ]),
  [no],,
  [-DZEND_ENABLE_STATIC_TSRMLS_CACHE=1 $PHP_LEXBOR_CFLAGS])
@ -193,7 +198,10 @@ PHP_ADD_BUILD_DIR([
  $ext_builddir/$LEXBOR_DIR/html/tree/insertion_mode
  $ext_builddir/$LEXBOR_DIR/ns
  $ext_builddir/$LEXBOR_DIR/ports/posix/lexbor/core
  $ext_builddir/$LEXBOR_DIR/punycode
  $ext_builddir/$LEXBOR_DIR/tag
  $ext_builddir/$LEXBOR_DIR/unicode
  $ext_builddir/$LEXBOR_DIR/url
 ])
 PHP_ADD_INCLUDE([$ext_srcdir])
 PHP_INSTALL_HEADERS([ext/lexbor], m4_normalize([
--- a/ext/lexbor/config.w32
+++ b/ext/lexbor/config.w32
@ -3,7 +3,7 @@
 EXTENSION("lexbor", "php_lexbor.c", false, "/I " + configure_module_dirname + " /DZEND_ENABLE_STATIC_TSRMLS_CACHE=1");
 PHP_LEXBOR="yes";
 ADD_SOURCES("ext/lexbor/lexbor/ports/windows_nt/lexbor/core", "memory.c", "lexbor");
-ADD_SOURCES("ext/lexbor/lexbor/core", "array_obj.c array.c avl.c bst.c diyfp.c conv.c dobject.c dtoa.c hash.c mem.c mraw.c print.c serialize.c shs.c str.c strtod.c", "lexbor");
+ADD_SOURCES("ext/lexbor/lexbor/core", "array_obj.c array.c avl.c bst.c diyfp.c conv.c dobject.c dtoa.c hash.c mem.c mraw.c plog.c print.c serialize.c shs.c str.c strtod.c", "lexbor");
 ADD_SOURCES("ext/lexbor/lexbor/dom", "interface.c", "lexbor");
 ADD_SOURCES("ext/lexbor/lexbor/dom/interfaces", "attr.c cdata_section.c character_data.c comment.c document.c document_fragment.c document_type.c element.c node.c processing_instruction.c shadow_root.c text.c", "lexbor");
 ADD_SOURCES("ext/lexbor/lexbor/html/tokenizer", "error.c state_comment.c state_doctype.c state_rawtext.c state_rcdata.c state_script.c state.c", "lexbor");
@ -17,7 +17,10 @@ ADD_SOURCES("ext/lexbor/lexbor/css/selectors", "state.c selectors.c selector.c p
 ADD_SOURCES("ext/lexbor/lexbor/css/syntax", "state.c parser.c syntax.c anb.c tokenizer.c token.c","lexbor");
 ADD_SOURCES("ext/lexbor/lexbor/css/syntax/tokenizer", "error.c","lexbor");
 ADD_SOURCES("ext/lexbor/lexbor/ns", "ns.c","lexbor");
 ADD_SOURCES("ext/lexbor/lexbor/punycode", "punycode.c","lexbor");
 ADD_SOURCES("ext/lexbor/lexbor/tag", "tag.c","lexbor");
 ADD_SOURCES("ext/lexbor/lexbor/unicode", "idna.c unicode.c","lexbor");
 ADD_SOURCES("ext/lexbor/lexbor/url", "url.c","lexbor");
 ADD_FLAG("CFLAGS_LEXBOR", "/D LEXBOR_BUILDING /utf-8");
 AC_DEFINE("HAVE_LEXBOR", 1, "Define to 1 if the PHP extension 'lexbor' is available.");
--- a/ext/lexbor/lexbor/punycode/base.h
+++ b/ext/lexbor/lexbor/punycode/base.h
@ -0,0 +1,30 @@
 /*
 * Copyright (C) 2023-2024 Alexander Borisov
 *
 * Author: Alexander Borisov <borisov@lexbor.com>
 */
 #ifndef LEXBOR_PUNYCODE_BASE_H
 #define LEXBOR_PUNYCODE_BASE_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "lexbor/core/base.h"
 /* Version of the Punycode module, split into major/minor/patch components. */
 #define LXB_PUNYCODE_VERSION_MAJOR 1
 #define LXB_PUNYCODE_VERSION_MINOR 1
 #define LXB_PUNYCODE_VERSION_PATCH 0
 /*
 * The three components above joined with dots into one string literal.
 * LEXBOR_STRINGIZE is defined outside this header; presumably it stringizes
 * the expanded macro value -- confirm at its definition.
 */
 #define LEXBOR_PUNYCODE_VERSION_STRING LEXBOR_STRINGIZE(LXB_PUNYCODE_VERSION_MAJOR) "." \
                                       LEXBOR_STRINGIZE(LXB_PUNYCODE_VERSION_MINOR) "." \
                                       LEXBOR_STRINGIZE(LXB_PUNYCODE_VERSION_PATCH)
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
 #endif /* LEXBOR_PUNYCODE_BASE_H */
--- a/ext/lexbor/lexbor/punycode/punycode.c
+++ b/ext/lexbor/lexbor/punycode/punycode.c
@ -0,0 +1,671 @@
 /*
 * Copyright (C) 2023-2024 Alexander Borisov
 *
 * Author: Alexander Borisov <borisov@lexbor.com>
 */
 #include "lexbor/punycode/punycode.h"
 #include "lexbor/encoding/encoding.h"
 /*
 * Tuning constants shared by the encoder and the decoder in this file.
 * NOTE(review): the values look like the Punycode parameters of RFC 3492,
 * section 5 -- confirm against the RFC.
 */
 enum {
    /* Number of digit values; lxb_punycode_decode_digit() returns it for a non-digit. */
    LXB_PUNYCODE_BASE = 36,
    /* Lower clamp of the per-digit threshold. */
    LXB_PUNYCODE_TMIN = 1,
    /* Upper clamp of the per-digit threshold. */
    LXB_PUNYCODE_TMAX = 26,
    /* Added to delta in the final division of lxb_punycode_adapt(). */
    LXB_PUNYCODE_SKEW = 38,
    /* Divisor applied to delta on the first lxb_punycode_adapt() call. */
    LXB_PUNYCODE_DAMP = 700,
    /* Bias value both directions start with. */
    LXB_PUNYCODE_INITIAL_BIAS = 72,
    /* Value of n both directions start with. */
    LXB_PUNYCODE_INITIAL_N = 0x80,
    /* ASCII '-': separates the verbatim part of a label from the encoded digits. */
    LXB_PUNYCODE_DELIMITER = 0x2D
 };
 /*
 * Adapter handed to lxb_punycode_decode_cb_cp() by lxb_punycode_decode():
 * serializes the decoded code points as UTF-8 and forwards the bytes to the
 * user callback. Defined further down in this file.
 */
 static lxb_status_t
 lxb_punycode_callback_cp(const lxb_codepoint_t *cps, size_t len, void *ctx);
 /*
 * Doubles the capacity of the encoder's output buffer.
 *
 * While *buf still points at the caller's stack storage (buffer), a heap
 * block is allocated and the old contents are copied into it; afterwards the
 * heap block is grown with realloc.
 *
 * @param[in]      p       Not consulted; the returned write position is the
 *                         old capacity offset, i.e. callers invoke this only
 *                         when the buffer is full.
 * @param[in, out] buf     Start of the output buffer; updated on success.
 * @param[in, out] end     One past the end of the buffer; updated on success.
 * @param[in]      buffer  Caller's stack storage, used to tell stack from heap.
 *
 * @return New write position, or NULL on allocation failure. On failure a
 *         heap-allocated *buf has already been freed (and *buf is left stale).
 */
 lxb_inline lxb_char_t *
 lxb_punycode_encode_realloc(lxb_char_t *p, lxb_char_t **buf,
                             const lxb_char_t **end, const lxb_char_t *buffer)
 {
    lxb_char_t *grown;
    lxb_char_t *current = *buf;
    size_t used = *end - current;
    size_t capacity = used * 2;

    if (current != buffer) {
        grown = lexbor_realloc(current, capacity);
        if (grown == NULL) {
            return lexbor_free(current);
        }
    }
    else {
        grown = lexbor_malloc(capacity);
        if (grown == NULL) {
            return NULL;
        }

        memcpy(grown, current, used);
    }

    *end = grown + capacity;
    *buf = grown;

    return grown + used;
 }
 /*
 * Doubles the capacity (counted in code points) of the decoder's output
 * buffer.
 *
 * While *buf still points at the caller's stack storage (buffer), a heap
 * block is allocated and the old contents are copied into it; afterwards the
 * heap block is grown with realloc.
 *
 * @param[in]      p       Not consulted; the returned write position is the
 *                         old capacity offset.
 * @param[in, out] buf     Start of the output buffer; updated on success.
 * @param[in, out] end     One past the end of the buffer; updated on success.
 * @param[in]      buffer  Caller's stack storage, used to tell stack from heap.
 *
 * @return New write position, or NULL on allocation failure. On failure a
 *         heap-allocated *buf has already been freed (and *buf is left stale).
 */
 lxb_inline lxb_codepoint_t *
 lxb_punycode_decode_realloc(lxb_codepoint_t *p, lxb_codepoint_t **buf,
                             const lxb_codepoint_t **end,
                             const lxb_codepoint_t *buffer)
 {
    lxb_codepoint_t *grown;
    lxb_codepoint_t *current = *buf;
    size_t used = *end - current;
    size_t capacity = used * 2;

    if (current != buffer) {
        grown = lexbor_realloc(current, capacity * sizeof(lxb_codepoint_t));
        if (grown == NULL) {
            return lexbor_free(current);
        }
    }
    else {
        grown = lexbor_malloc(capacity * sizeof(lxb_codepoint_t));
        if (grown == NULL) {
            return NULL;
        }

        memcpy(grown, current, used * sizeof(lxb_codepoint_t));
    }

    *end = grown + capacity;
    *buf = grown;

    return grown + used;
 }
 /*
 * Maps a digit value to its ASCII character:
 * 0..25 become 'a'..'z', 26..35 become '0'..'9'.
 *
 * @param[in] d  Digit value; callers pass values below LXB_PUNYCODE_BASE.
 *
 * @return The character for the digit.
 */
 static char
 lxb_punycode_encode_digit(size_t d)
 {
    size_t ch;

    if (d < 26) {
        /* 97 is 'a'. */
        ch = d + 97;
    }
    else {
        /* 22 is '0' - 26. */
        ch = d + 22;
    }

    return ch;
 }
 /*
 * Maps a character to its digit value: '0'..'9' give 26..35, 'A'..'Z' and
 * 'a'..'z' both give 0..25.
 *
 * The range tests rely on the subtraction wrapping around for values below
 * the range start, exactly as in the original expression.
 *
 * @param[in] cp  Character to classify.
 *
 * @return Digit value, or LXB_PUNYCODE_BASE when cp is not a digit.
 */
 static size_t
 lxb_punycode_decode_digit(lxb_codepoint_t cp)
 {
    /* '0'..'9' */
    if (cp - 48 < 10) {
        return cp - 22;
    }

    /* 'A'..'Z' */
    if (cp - 65 < 26) {
        return cp - 65;
    }

    /* 'a'..'z' */
    if (cp - 97 < 26) {
        return cp - 97;
    }

    return LXB_PUNYCODE_BASE;
 }
 /*
 * Computes the next bias from the delta just processed.
 *
 * @param[in] delta      Delta that was encoded/decoded for the last code point.
 * @param[in] numpoints  Number of code points handled so far. Must not be 0
 *                       (it is used as a divisor).
 * @param[in] firsttime  True for the first adaptation: delta is divided by
 *                       LXB_PUNYCODE_DAMP instead of being halved.
 *
 * @return The new bias.
 */
 static size_t
 lxb_punycode_adapt(size_t delta, size_t numpoints, bool firsttime)
 {
    size_t k = 0;
    const size_t span = LXB_PUNYCODE_BASE - LXB_PUNYCODE_TMIN;
    const size_t limit = (span * LXB_PUNYCODE_TMAX) / 2;

    if (firsttime) {
        delta /= LXB_PUNYCODE_DAMP;
    }
    else {
        delta >>= 1;
    }

    delta += delta / numpoints;

    while (delta > limit) {
        delta /= span;
        k += LXB_PUNYCODE_BASE;
    }

    return k + ((span + 1) * delta) / (delta + LXB_PUNYCODE_SKEW);
 }
 /*
 * Core of the encoder, shared by lxb_punycode_encode() and
 * lxb_punycode_encode_cp().
 *
 * On entry [buf, p) already holds every input code point below 0x80, in
 * input order. This routine appends the delimiter (only when at least one
 * such code point was copied) and the variable-length digits for the
 * remaining code points, then reports the result through cb exactly once.
 *
 * @param[in] cps      First input code point.
 * @param[in] cps_end  One past the last input code point.
 * @param[in] p        Current write position in the output buffer.
 * @param[in] buf      Start of the output buffer; may be heap memory.
 * @param[in] end      One past the end of the output buffer.
 * @param[in] buffer   Caller's stack storage; buf != buffer means heap.
 * @param[in] cb       Result callback. Its last argument is true when the
 *                     input contained nothing to encode (output is the
 *                     verbatim copy made by the caller).
 * @param[in] ctx      Context passed through to cb.
 *
 * Ownership: a heap-allocated buf is released before returning, on every
 * path (on allocation failure by lxb_punycode_encode_realloc() itself).
 *
 * @return The status returned by cb, LXB_STATUS_ERROR_OVERFLOW, or
 *         LXB_STATUS_ERROR_MEMORY_ALLOCATION (cb is not called on errors).
 */
 static lxb_status_t
 lxb_punycode_encode_body(const lxb_codepoint_t *cps, const lxb_codepoint_t *cps_end,
                          lxb_char_t *p, lxb_char_t *buf, const lxb_char_t *end,
                          const lxb_char_t *buffer, lxb_punycode_encode_cb_f cb,
                          void *ctx)
 {
    bool unchanged;
    size_t h, b, n, q, k, t, delta, bias;
    lxb_status_t status;
    lxb_codepoint_t cp, m;
    const lxb_codepoint_t *cps_t, *cps_p;

    n = LXB_PUNYCODE_INITIAL_N;
    bias = LXB_PUNYCODE_INITIAL_BIAS;
    delta = 0;

    /* Number of code points the caller already copied verbatim. */
    b = p - buf;

    /*
     * cps_p is used as a counter only: cps_p - cps is the number of code
     * points handled so far.
     */
    cps_p = cps + b;

    if (cps_p >= cps_end) {
        unchanged = true;
        goto done;
    }

    if (p > buf) {
        /*
         * The caller may have filled the buffer exactly (p == end), so the
         * delimiter needs the same capacity check as every other write.
         */
        if (p >= end) {
            p = lxb_punycode_encode_realloc(p, &buf, &end, buffer);
            if (p == NULL) {
                return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
            }
        }

        *p++ = LXB_PUNYCODE_DELIMITER;
    }

    unchanged = false;

    while (cps_p < cps_end) {
        /* Find the smallest code point that is >= n. */
        m = UINT32_MAX;
        cps_t = cps;

        while (cps_t < cps_end) {
            cp = *cps_t++;

            if (cp >= n && cp < m) {
                m = cp;
            }
        }

        h = (cps_p - cps) + 1;

        if (m - n > (UINT32_MAX - delta) / h) {
            status = LXB_STATUS_ERROR_OVERFLOW;
            goto failed;
        }

        delta += (m - n) * h;
        n = m;

        cps_t = cps;

        while (cps_t < cps_end) {
            cp = *cps_t++;

            if (cp < n) {
                if (++delta == 0) {
                    status = LXB_STATUS_ERROR_OVERFLOW;
                    goto failed;
                }
            }

            if (cp == n) {
                /* Emit delta as a variable-length integer. */
                q = delta;
                k = LXB_PUNYCODE_BASE;

                for (;; k += LXB_PUNYCODE_BASE) {
                    t = k <= bias ? LXB_PUNYCODE_TMIN :
                    k >= bias + LXB_PUNYCODE_TMAX
                    ? LXB_PUNYCODE_TMAX : k - bias;

                    if (q < t) {
                        break;
                    }

                    if (p >= end) {
                        p = lxb_punycode_encode_realloc(p, &buf, &end, buffer);
                        if (p == NULL) {
                            return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
                        }
                    }

                    *p++ = lxb_punycode_encode_digit(t + (q - t)
                                                     % (LXB_PUNYCODE_BASE - t));
                    q = (q - t) / (LXB_PUNYCODE_BASE - t);
                }

                h = cps_p - cps;

                if (p >= end) {
                    p = lxb_punycode_encode_realloc(p, &buf, &end, buffer);
                    if (p == NULL) {
                        return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
                    }
                }

                *p++ = lxb_punycode_encode_digit(q);

                bias = lxb_punycode_adapt(delta, h + 1, h == b);
                delta = 0;
                cps_p += 1;
            }
        }

        delta += 1;
        n += 1;
    }

 done:

    status = cb(buf, p - buf, ctx, unchanged);

 failed:

    if (buf != buffer) {
        (void) lexbor_free(buf);
    }

    return status;
 }
 /*
 * Encodes UTF-8 input with Punycode and reports the result through cb.
 *
 * The input is first measured (one code point per UTF-8 lead byte), then
 * decoded into a code point array; code points below 0x80 are copied to the
 * output as they are decoded, and lxb_punycode_encode_body() produces the
 * rest.
 *
 * @param[in] data    Input bytes. Not NULL.
 * @param[in] length  Number of input bytes. Can be 0.
 * @param[in] cb      Result callback, invoked by lxb_punycode_encode_body().
 * @param[in] ctx     Context passed through to cb.
 *
 * @return The status of lxb_punycode_encode_body(),
 *         LXB_STATUS_ERROR_UNEXPECTED_DATA for undecodable input, or
 *         LXB_STATUS_ERROR_MEMORY_ALLOCATION.
 */
 lxb_status_t
 lxb_punycode_encode(const lxb_char_t *data, size_t length,
                     lxb_punycode_encode_cb_f cb, void *ctx)
 {
    size_t cp_length;
    uint8_t len;
    lxb_char_t *p, *buf;
    lxb_status_t status;
    lxb_codepoint_t cp, *cps, *cps_p;
    const lxb_char_t *data_p, *data_end, *end;
    const lxb_codepoint_t *cps_end;
    lxb_char_t buffer[4096];
    lxb_codepoint_t input[4096];

    /*
     * Make GCC happy.
     * length variable can be 0.
     */
    input[0] = 0x00;

    p = buffer;
    buf = buffer;
    end = buffer + sizeof(buffer);

    data_p = data;
    data_end = data + length;
    cp_length = 0;

    /* First pass: count the code points to size the array. */
    while (data_p < data_end) {
        len = lxb_encoding_decode_utf_8_length(*data_p);
        if (len == 0) {
            return LXB_STATUS_ERROR_UNEXPECTED_DATA;
        }

        data_p += len;
        cp_length += 1;
    }

    if (cp_length <= sizeof(input) / sizeof(lxb_codepoint_t)) {
        cps = input;
    }
    else {
        cps = lexbor_malloc(cp_length * sizeof(lxb_codepoint_t));
        if (cps == NULL) {
            return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
        }
    }

    data_p = data;
    cps_p = cps;
    cps_end = cps + cp_length;

    /* Second pass: decode, copying code points below 0x80 to the output. */
    while (data_p < data_end) {
        cp = lxb_encoding_decode_valid_utf_8_single(&data_p, data_end);
        if (cp == LXB_ENCODING_DECODE_ERROR) {
            /*
             * The output buffer may already have moved to the heap while
             * copying; it is still valid here, so release it before leaving.
             */
            if (buf != buffer) {
                (void) lexbor_free(buf);
            }

            status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
            goto done;
        }

        *cps_p++ = cp;

        if (cp < 0x80) {
            if (p >= end) {
                p = lxb_punycode_encode_realloc(p, &buf, &end, buffer);
                if (p == NULL) {
                    /* A heap buf was already freed by the helper. */
                    status = LXB_STATUS_ERROR_MEMORY_ALLOCATION;
                    goto done;
                }
            }

            *p++ = cp;
        }
    }

    /* The body takes over buf and frees it when it lives on the heap. */
    status = lxb_punycode_encode_body(cps, cps_end, p, buf, end, buffer,
                                      cb, ctx);

 done:

    if (cps != input) {
        (void) lexbor_free(cps);
    }

    return status;
 }
 /*
 * Encodes an array of code points with Punycode and reports the result
 * through cb.
 *
 * Code points below 0x80 are copied to the output first, in input order;
 * lxb_punycode_encode_body() then appends the encoded remainder and invokes
 * the callback.
 *
 * @param[in] cps     Input code points. Not NULL.
 * @param[in] length  Number of code points. Can be 0.
 * @param[in] cb      Result callback, invoked by lxb_punycode_encode_body().
 * @param[in] ctx     Context passed through to cb.
 *
 * @return The status of lxb_punycode_encode_body(), or
 *         LXB_STATUS_ERROR_MEMORY_ALLOCATION.
 */
 lxb_status_t
 lxb_punycode_encode_cp(const lxb_codepoint_t *cps, size_t length,
                        lxb_punycode_encode_cb_f cb, void *ctx)
 {
    lxb_codepoint_t cp;
    lxb_char_t buffer[4096];
    lxb_char_t *buf = buffer;
    lxb_char_t *p = buffer;
    const lxb_char_t *end = buffer + sizeof(buffer);
    const lxb_codepoint_t *cps_end = cps + length;
    const lxb_codepoint_t *cur;

    for (cur = cps; cur < cps_end; cur++) {
        cp = *cur;

        if (cp >= 0x80) {
            continue;
        }

        if (p >= end) {
            p = lxb_punycode_encode_realloc(p, &buf, &end, buffer);
            if (p == NULL) {
                return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
            }
        }

        *p++ = cp;
    }

    return lxb_punycode_encode_body(cps, cps_end, p, buf, end, buffer, cb, ctx);
 }
 /*
 * Decodes Punycode bytes and delivers the result as UTF-8 bytes.
 *
 * Thin wrapper: the user callback and context are packed into a
 * lexbor_serialize_ctx_t, and lxb_punycode_decode_cb_cp() is run with
 * lxb_punycode_callback_cp() as the code point sink.
 *
 * @param[in] data    Input bytes. Not NULL.
 * @param[in] length  Number of input bytes. Can be 0.
 * @param[in] cb      Callback receiving the UTF-8 result.
 * @param[in] ctx     Context passed through to cb.
 *
 * @return The status of lxb_punycode_decode_cb_cp().
 */
 lxb_status_t
 lxb_punycode_decode(const lxb_char_t *data, size_t length,
                     lexbor_serialize_cb_f cb, void *ctx)
 {
    lxb_status_t status;
    lexbor_serialize_ctx_t wrapped = {.cb = cb, .ctx = ctx};

    status = lxb_punycode_decode_cb_cp(data, length,
                                       lxb_punycode_callback_cp, &wrapped);

    return status;
 }
 /*
 * Code point sink used by lxb_punycode_decode(): serializes the code points
 * as UTF-8 and forwards the bytes to the user callback stored in ctx.
 *
 * The total UTF-8 length is computed first, so the output is written into
 * either the stack buffer or a single heap block of exactly that size.
 *
 * @param[in] cps  Decoded code points.
 * @param[in] len  Number of code points. Can be 0.
 * @param[in] ctx  Pointer to the lexbor_serialize_ctx_t built by
 *                 lxb_punycode_decode().
 *
 * @return The status of the user callback,
 *         LXB_STATUS_ERROR_UNEXPECTED_DATA when a code point has no UTF-8
 *         length, or LXB_STATUS_ERROR_MEMORY_ALLOCATION.
 */
 static lxb_status_t
 lxb_punycode_callback_cp(const lxb_codepoint_t *cps, size_t len, void *ctx)
 {
    uint8_t i;
    size_t length;
    lxb_status_t status;
    const lxb_codepoint_t *cps_p, *cps_end;
    lexbor_serialize_ctx_t *nctx = ctx;
    lxb_char_t *p, *buf, *end;
    lxb_char_t buffer[4096];

    /*
     * Make GCC happy.
     * len variable can be 0.
     */
    buffer[0] = 0x00;

    cps_p = cps;
    cps_end = cps_p + len;
    length = 0;

    while (cps_p < cps_end) {
        i = lxb_encoding_encode_utf_8_length(*cps_p++);
        if (i == 0) {
            return LXB_STATUS_ERROR_UNEXPECTED_DATA;
        }

        length += i;
    }

    buf = buffer;
    end = buffer + sizeof(buffer);

    /*
     * Compare sizes instead of forming "buf + length": a pointer beyond one
     * past the end of the array is undefined behaviour.
     */
    if (length > sizeof(buffer)) {
        buf = lexbor_malloc(length);
        if (buf == NULL) {
            return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
        }

        end = buf + length;
    }

    p = buf;
    cps_p = cps;

    while (cps_p < cps_end) {
        (void) lxb_encoding_encode_utf_8_single(NULL, &p, end, *cps_p++);
    }

    status = nctx->cb(buf, p - buf, nctx->ctx);

    if (buf != buffer) {
        (void) lexbor_free(buf);
    }

    return status;
 }
 /*
 * Decodes Punycode given as code points and delivers the result as code
 * points.
 *
 * Everything before the last delimiter is copied verbatim (values of 0x80
 * and above are rejected there); the digits after it are decoded into
 * insertion positions and code point values.
 *
 * @param[in] data    Input code points. Not NULL.
 * @param[in] length  Number of input code points. Can be 0.
 * @param[in] cb      Callback receiving the decoded code points; called once
 *                    on success.
 * @param[in] ctx     Context passed through to cb.
 *
 * @return The status of cb, LXB_STATUS_ERROR_UNEXPECTED_DATA,
 *         LXB_STATUS_ERROR_OVERFLOW or LXB_STATUS_ERROR_MEMORY_ALLOCATION.
 */
 lxb_status_t
 lxb_punycode_decode_cp(const lxb_codepoint_t *data, size_t length,
                        lexbor_serialize_cb_cp_f cb, void *ctx)
 {
    size_t digit, start, bias, weight, k, threshold, i, out_len, consumed;
    const lxb_codepoint_t *delimiter, *src;
    const lxb_codepoint_t *end;
    lxb_status_t status;
    lxb_codepoint_t cp, n;
    lxb_codepoint_t *p, *buf;
    lxb_codepoint_t buffer[4096];

    buf = buffer;
    p = buffer;
    end = buffer + (sizeof(buffer) / sizeof(lxb_codepoint_t));

    /* Walk backwards to the last delimiter; stops at data when there is none. */
    for (delimiter = data + length; delimiter != data; ) {
        delimiter -= 1;

        if (*delimiter == LXB_PUNYCODE_DELIMITER) {
            break;
        }
    }

    /* Copy the verbatim part. */
    for (src = data; src < delimiter; src++) {
        cp = *src;

        if (cp >= 0x80) {
            status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
            goto done;
        }

        if (p >= end) {
            p = lxb_punycode_decode_realloc(p, &buf, &end, buffer);
            if (p == NULL) {
                return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
            }
        }

        *p++ = cp;
    }

    i = 0;
    n = LXB_PUNYCODE_INITIAL_N;
    bias = LXB_PUNYCODE_INITIAL_BIAS;

    /* Digits start right after the delimiter, or at the very beginning. */
    if (delimiter != data) {
        consumed = (delimiter + 1) - data;
    }
    else {
        consumed = 0;
    }

    while (consumed < length) {
        start = i;
        weight = 1;
        k = LXB_PUNYCODE_BASE;

        /* Decode one variable-length integer into i. */
        for (;;) {
            if (consumed >= length) {
                status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
                goto done;
            }

            cp = data[consumed++];
            digit = lxb_punycode_decode_digit(cp);

            if (digit >= LXB_PUNYCODE_BASE) {
                status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
                goto done;
            }

            if (digit > (UINT32_MAX - i) / weight) {
                status = LXB_STATUS_ERROR_OVERFLOW;
                goto done;
            }

            i += digit * weight;

            if (k <= bias) {
                threshold = LXB_PUNYCODE_TMIN;
            }
            else if (k >= bias + LXB_PUNYCODE_TMAX) {
                threshold = LXB_PUNYCODE_TMAX;
            }
            else {
                threshold = k - bias;
            }

            if (digit < threshold) {
                break;
            }

            if (weight > UINT32_MAX / (LXB_PUNYCODE_BASE - threshold)) {
                status = LXB_STATUS_ERROR_OVERFLOW;
                goto done;
            }

            weight *= (LXB_PUNYCODE_BASE - threshold);
            k += LXB_PUNYCODE_BASE;
        }

        /* Output length including the code point about to be inserted. */
        out_len = (p - buf) + 1;

        bias = lxb_punycode_adapt(i - start, out_len, start == 0);

        if (i / out_len > UINT32_MAX - n) {
            status = LXB_STATUS_ERROR_OVERFLOW;
            goto done;
        }

        n += i / out_len;
        i %= out_len;

        if (p >= end) {
            p = lxb_punycode_decode_realloc(p, &buf, &end, buffer);
            if (p == NULL) {
                return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
            }
        }

        /* Open a gap at position i and insert n there. */
        memmove(buf + i + 1, buf + i,
                ((out_len - 1) - i) * sizeof(lxb_codepoint_t));
        buf[i++] = n;

        p++;
    }

    status = cb(buf, p - buf, ctx);

 done:

    if (buffer != buf) {
        (void) lexbor_free(buf);
    }

    return status;
 }
 /*
 * Decodes Punycode given as bytes and delivers the result as code points.
 *
 * Everything before the last delimiter is copied verbatim (bytes of 0x80 and
 * above are rejected there); the digits after it are decoded into insertion
 * positions and code point values.
 *
 * @param[in] data    Input bytes. Not NULL.
 * @param[in] length  Number of input bytes. Can be 0.
 * @param[in] cb      Callback receiving the decoded code points; called once
 *                    on success.
 * @param[in] ctx     Context passed through to cb.
 *
 * @return The status of cb, LXB_STATUS_ERROR_UNEXPECTED_DATA,
 *         LXB_STATUS_ERROR_OVERFLOW or LXB_STATUS_ERROR_MEMORY_ALLOCATION.
 */
 lxb_status_t
 lxb_punycode_decode_cb_cp(const lxb_char_t *data, size_t length,
                           lexbor_serialize_cb_cp_f cb, void *ctx)
 {
    size_t digit, start, bias, weight, k, threshold, i, out_len, consumed;
    const lxb_char_t *delimiter, *src;
    const lxb_codepoint_t *end;
    lxb_status_t status;
    lxb_codepoint_t cp, n;
    lxb_codepoint_t *p, *buf;
    lxb_codepoint_t buffer[4096];

    buf = buffer;
    p = buffer;
    end = buffer + (sizeof(buffer) / sizeof(lxb_codepoint_t));

    /* Walk backwards to the last delimiter; stops at data when there is none. */
    for (delimiter = data + length; delimiter != data; ) {
        delimiter -= 1;

        if (*delimiter == LXB_PUNYCODE_DELIMITER) {
            break;
        }
    }

    /* Copy the verbatim part. */
    for (src = data; src < delimiter; src++) {
        cp = *src;

        if (cp >= 0x80) {
            status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
            goto done;
        }

        if (p >= end) {
            p = lxb_punycode_decode_realloc(p, &buf, &end, buffer);
            if (p == NULL) {
                return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
            }
        }

        *p++ = cp;
    }

    i = 0;
    n = LXB_PUNYCODE_INITIAL_N;
    bias = LXB_PUNYCODE_INITIAL_BIAS;

    /* Digits start right after the delimiter, or at the very beginning. */
    if (delimiter != data) {
        consumed = (delimiter + 1) - data;
    }
    else {
        consumed = 0;
    }

    while (consumed < length) {
        start = i;
        weight = 1;
        k = LXB_PUNYCODE_BASE;

        /* Decode one variable-length integer into i. */
        for (;;) {
            if (consumed >= length) {
                status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
                goto done;
            }

            cp = data[consumed++];
            digit = lxb_punycode_decode_digit(cp);

            if (digit >= LXB_PUNYCODE_BASE) {
                status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
                goto done;
            }

            if (digit > (UINT32_MAX - i) / weight) {
                status = LXB_STATUS_ERROR_OVERFLOW;
                goto done;
            }

            i += digit * weight;

            if (k <= bias) {
                threshold = LXB_PUNYCODE_TMIN;
            }
            else if (k >= bias + LXB_PUNYCODE_TMAX) {
                threshold = LXB_PUNYCODE_TMAX;
            }
            else {
                threshold = k - bias;
            }

            if (digit < threshold) {
                break;
            }

            if (weight > UINT32_MAX / (LXB_PUNYCODE_BASE - threshold)) {
                status = LXB_STATUS_ERROR_OVERFLOW;
                goto done;
            }

            weight *= (LXB_PUNYCODE_BASE - threshold);
            k += LXB_PUNYCODE_BASE;
        }

        /* Output length including the code point about to be inserted. */
        out_len = (p - buf) + 1;

        bias = lxb_punycode_adapt(i - start, out_len, start == 0);

        if (i / out_len > UINT32_MAX - n) {
            status = LXB_STATUS_ERROR_OVERFLOW;
            goto done;
        }

        n += i / out_len;
        i %= out_len;

        if (p >= end) {
            p = lxb_punycode_decode_realloc(p, &buf, &end, buffer);
            if (p == NULL) {
                return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
            }
        }

        /* Open a gap at position i and insert n there. */
        memmove(buf + i + 1, buf + i,
                ((out_len - 1) - i) * sizeof(lxb_codepoint_t));
        buf[i++] = n;

        p++;
    }

    status = cb(buf, p - buf, ctx);

 done:

    if (buffer != buf) {
        (void) lexbor_free(buf);
    }

    return status;
 }
--- a/ext/lexbor/lexbor/punycode/punycode.h
+++ b/ext/lexbor/lexbor/punycode/punycode.h
@ -0,0 +1,109 @@
 /*
 * Copyright (C) 2023 Alexander Borisov
 *
 * Author: Alexander Borisov <borisov@lexbor.com>
 */
 #ifndef LEXBOR_PUNYCODE_H
 #define LEXBOR_PUNYCODE_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "lexbor/punycode/base.h"
 /*
 * Callback receiving the result of the encoding functions.
 *
 * @param[in] data       Encoded output.
 * @param[in] len        Length of the output.
 * @param[in] ctx        User context given to the encoding function.
 * @param[in] unchanged  true when nothing had to be encoded, i.e. the output
 *                       is just the copy of the input code points below 0x80
 *                       with no delimiter or digits appended.
 *
 * @return Status that the encoding function returns to its caller.
 */
 typedef lxb_status_t
 (*lxb_punycode_encode_cb_f)(const lxb_char_t *data, size_t len, void *ctx,
                             bool unchanged);
 /*
 * Punycode: A Bootstring encoding of Unicode
 * for Internationalized Domain Names in Applications (IDNA).
 *
 * https://www.rfc-editor.org/rfc/inline-errata/rfc3492.html
 */
 /*
 * Encoding from characters to characters.
 *
 * The input is decoded as UTF-8 before it is encoded.
 *
 * @param[in] Input characters for encoding. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] Callback for results. Called only once when encoding is complete.
 * @param[in] Context for callback.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_punycode_encode(const lxb_char_t *data, size_t length,
                     lxb_punycode_encode_cb_f cb, void *ctx);
 /*
 * Encoding from code points to characters.
 *
 * Same as lxb_punycode_encode() only the input is code points.
 *
 * @param[in] Input code points for encoding. Not NULL.
 * @param[in] Length of code points. Can be 0.
 * @param[in] Callback for results. Called only once when encoding is complete.
 * @param[in] Context for callback.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_punycode_encode_cp(const lxb_codepoint_t *cps, size_t length,
                        lxb_punycode_encode_cb_f cb, void *ctx);
 /*
 * Decoding from characters to characters.
 *
 * The decoded code points are handed to the callback as UTF-8.
 *
 * @param[in] Input characters for decoding. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] Callback for results. Called only once when decoding is complete.
 * @param[in] Context for callback.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_punycode_decode(const lxb_char_t *data, size_t length,
                     lexbor_serialize_cb_f cb, void *ctx);
 /*
 * Decoding from code points to code points.
 *
 * Same as lxb_punycode_decode() only the input/output is code points.
 *
 * @param[in] Input code points for decoding. Not NULL.
 * @param[in] Length of code points. Can be 0.
 * @param[in] Callback for results. Called only once when decoding is complete.
 * @param[in] Context for callback.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_punycode_decode_cp(const lxb_codepoint_t *data, size_t length,
                        lexbor_serialize_cb_cp_f cb, void *ctx);
 /*
 * Decoding from characters to code points.
 *
 * Same as lxb_punycode_decode() only the output is code points.
 *
 * @param[in] Input characters for decoding. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] Callback for results. Called only once when decoding is complete.
 * @param[in] Context for callback.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_punycode_decode_cb_cp(const lxb_char_t *data, size_t length,
                           lexbor_serialize_cb_cp_f cb, void *ctx);
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
 #endif /* LEXBOR_PUNYCODE_H */
--- a/ext/lexbor/lexbor/unicode/base.h
+++ b/ext/lexbor/lexbor/unicode/base.h
@ -0,0 +1,157 @@
 /*
 * Copyright (C) 2023-2024 Alexander Borisov
 *
 * Author: Alexander Borisov <borisov@lexbor.com>
 */
 #ifndef LEXBOR_UNICODE_BASE_H
 #define LEXBOR_UNICODE_BASE_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "lexbor/core/base.h"
 #include "lexbor/core/str.h"
 /* Version of the Unicode module, split into major/minor/patch components. */
 #define LXB_UNICODE_VERSION_MAJOR 0
 #define LXB_UNICODE_VERSION_MINOR 3
 #define LXB_UNICODE_VERSION_PATCH 0
 /*
 * The three components above joined with dots into one string literal.
 * LEXBOR_STRINGIZE is defined outside this header; presumably it stringizes
 * the expanded macro value -- confirm at its definition.
 */
 #define LXB_UNICODE_VERSION_STRING LEXBOR_STRINGIZE(LXB_UNICODE_VERSION_MAJOR) "." \
                                   LEXBOR_STRINGIZE(LXB_UNICODE_VERSION_MINOR) "." \
                                   LEXBOR_STRINGIZE(LXB_UNICODE_VERSION_PATCH)
 /*
 * Decomposition type values. __UNDEF is 0; __LAST_ENTRY closes the list and
 * therefore equals the number of values before it.
 * NOTE(review): the names look like the Unicode decomposition tags
 * (<circle>, <compat>, <final>, ...) -- confirm against the data generator.
 */
 enum {
    LXB_UNICODE_DECOMPOSITION_TYPE__UNDEF = 0x00,
    LXB_UNICODE_DECOMPOSITION_TYPE_CIRCLE,
    LXB_UNICODE_DECOMPOSITION_TYPE_COMPAT,
    LXB_UNICODE_DECOMPOSITION_TYPE_FINAL,
    LXB_UNICODE_DECOMPOSITION_TYPE_FONT,
    LXB_UNICODE_DECOMPOSITION_TYPE_FRACTION,
    LXB_UNICODE_DECOMPOSITION_TYPE_INITIAL,
    LXB_UNICODE_DECOMPOSITION_TYPE_ISOLATED,
    LXB_UNICODE_DECOMPOSITION_TYPE_MEDIAL,
    LXB_UNICODE_DECOMPOSITION_TYPE_NARROW,
    LXB_UNICODE_DECOMPOSITION_TYPE_NOBREAK,
    LXB_UNICODE_DECOMPOSITION_TYPE_SMALL,
    LXB_UNICODE_DECOMPOSITION_TYPE_SQUARE,
    LXB_UNICODE_DECOMPOSITION_TYPE_SUB,
    LXB_UNICODE_DECOMPOSITION_TYPE_SUPER,
    LXB_UNICODE_DECOMPOSITION_TYPE_VERTICAL,
    LXB_UNICODE_DECOMPOSITION_TYPE_WIDE,
    LXB_UNICODE_DECOMPOSITION_TYPE__LAST_ENTRY
 };
 /*
 * Bit 7 is a separate flag stored next to the type value:
 * IS_CANONICAL_SEPARATELY() extracts it by shifting right by 7 (so it
 * expects a value that fits in 8 bits), DECOMPOSITION_TYPE() masks it off
 * and leaves the enum value.
 */
 #define LXB_UNICODE_CANONICAL_SEPARATELY        (1 << 7)
 #define LXB_UNICODE_IS_CANONICAL_SEPARATELY(a)  ((a) >> 7)
 #define LXB_UNICODE_DECOMPOSITION_TYPE(a)       ((a) & ~(1 << 7))
 /* Storage type for a decomposition type value (8 bits). */
 typedef uint8_t lxb_unicode_decomposition_type_t;
 /*
 * Quick-check flags. Every value occupies its own bit, so several can be
 * combined in one lxb_unicode_quick_type_t.
 * NOTE(review): presumably these mirror the Unicode Quick_Check properties
 * (NFC/NFD/NFKC/NFKD, "No"/"Maybe") -- confirm against the data generator.
 */
 enum {
    LXB_UNICODE_QUICK__UNDEF     = 0x00,
    LXB_UNICODE_QUICK_NFC_MAYBE  = 1 << 0,
    LXB_UNICODE_QUICK_NFC_NO     = 1 << 1,
    LXB_UNICODE_QUICK_NFD_NO     = 1 << 2,
    LXB_UNICODE_QUICK_NFKC_MAYBE = 1 << 3,
    LXB_UNICODE_QUICK_NFKC_NO    = 1 << 4,
    LXB_UNICODE_QUICK_NFKD_NO    = 1 << 5
 };
 /* Storage type for a combination of the flags above (8 bits). */
 typedef uint8_t lxb_unicode_quick_type_t;
 /*
 * IDNA type of a code point. __UNDEF is 0.
 * NOTE(review): presumably these are the status values of the IDNA mapping
 * table (UTS #46) -- confirm against the data generator.
 */
 enum {
    LXB_UNICODE_IDNA__UNDEF = 0x00,
    LXB_UNICODE_IDNA_DEVIATION,
    LXB_UNICODE_IDNA_DISALLOWED,
    LXB_UNICODE_IDNA_IGNORED,
    LXB_UNICODE_IDNA_MAPPED,
    LXB_UNICODE_IDNA_VALID
 };
 /* Storage type for one of the values above (8 bits). */
 typedef uint8_t lxb_unicode_idna_type_t;
 /* Normalizer state; the struct itself is defined at the end of this header. */
 typedef struct lxb_unicode_normalizer lxb_unicode_normalizer_t;
 /*
 * One element of the normalizer's working buffer: a code point together with
 * an 8-bit "ccc" value (presumably its Canonical Combining Class, as the
 * same field name is commented in lxb_unicode_normalization_entry_t).
 */
 typedef struct {
    lxb_codepoint_t cp;
    uint8_t         ccc;
 }
 lxb_unicode_buffer_t;
 /*
 * Signature of a normalization-form handler: takes the normalizer, a chunk
 * of input bytes, a serialize callback with its context, and an is_last flag.
 * NOTE(review): is_last presumably marks the final chunk of a stream --
 * confirm in the implementation.
 */
 typedef lxb_status_t
 (*lxb_unicode_nf_handler_f)(lxb_unicode_normalizer_t *uc, const lxb_char_t *data,
                             size_t length, lexbor_serialize_cb_f cb, void *ctx,
                             bool is_last);
 /*
 * Signature of a decomposition handler: takes one code point plus the buffer
 * start/end by reference and returns a buffer position.
 * NOTE(review): buf/end are passed by pointer, presumably so the handler can
 * grow the buffer -- confirm in the implementation.
 */
 typedef lxb_unicode_buffer_t *
 (*lxb_unicode_de_handler_f)(lxb_unicode_normalizer_t *uc, lxb_codepoint_t cp,
                             lxb_unicode_buffer_t **buf,
                             const lxb_unicode_buffer_t **end);
 /*
 * Signature of a composition handler working on three positions of the
 * buffer (starter, op, p); it returns nothing.
 */
 typedef void
 (*lxb_unicode_co_handler_f)(lxb_unicode_buffer_t *starter,
                             lxb_unicode_buffer_t *op, lxb_unicode_buffer_t *p);
 /*
 * Per-code-point record holding two 16-bit references.
 * NOTE(review): presumably indexes into the normalization and IDNA data
 * tables named in the field comments -- confirm against the table code.
 */
 typedef struct {
    uint16_t normalization; /* lxb_unicode_normalization_t */
    uint16_t idna;          /* lxb_unicode_idna_t */
 }
 lxb_unicode_entry_t;
 /*
 * Normalization data of one code point.
 * NOTE(review): "length" presumably counts the code points of the
 * decomposition that "decomposition" refers to -- confirm against the table
 * code.
 */
 typedef struct {
    lxb_unicode_decomposition_type_t type;
    lxb_unicode_quick_type_t         quick;         /* Quick Check.               */
    uint8_t                          ccc;           /* Canonical Combining Class. */
    uint8_t                          length;
    uint16_t                         decomposition; /* lxb_codepoint_t */
    uint16_t                         composition;   /* lxb_unicode_composition_entry_t */
 }
 lxb_unicode_normalization_entry_t;
 /*
 * IDNA data of one code point: its IDNA type plus a length/index pair.
 * NOTE(review): length and index presumably locate the mapped code points in
 * a separate table -- confirm against the table code.
 */
 typedef struct {
    lxb_unicode_idna_type_t type;
    uint8_t                 length;
    uint16_t                index;
 }
 lxb_unicode_idna_entry_t;
 /*
 * Describes a run of composition candidates: a length, an index and a first
 * code point, all referring to lxb_unicode_composition_cps_t as noted on the
 * fields.
 */
 typedef struct {
    uint8_t         length;  /* Length in lxb_unicode_composition_cps_t */
    uint16_t        index;   /* lxb_unicode_composition_cps_t */
    lxb_codepoint_t cp;      /* Begin code point in lxb_unicode_composition_cps_t */
 }
 lxb_unicode_composition_entry_t;
 /*
 * A code point with an "exclusion" flag.
 * NOTE(review): presumably a composed code point and whether it is excluded
 * from composition -- confirm against the composition code.
 */
 typedef struct {
    lxb_codepoint_t cp;
    bool            exclusion;
 }
 lxb_unicode_composition_cp_t;
 /*
 * State of a normalizer instance: the two handlers selected for it, the
 * working buffer of lxb_unicode_buffer_t elements with several positions
 * into it, a 4-byte scratch area, and quick-check bookkeeping.
 * NOTE(review): the roles of the individual fields are not visible in this
 * header; the notes below are inferred from names and types -- confirm in
 * unicode.c.
 */
 struct lxb_unicode_normalizer {
    lxb_unicode_de_handler_f   decomposition;
    lxb_unicode_co_handler_f   composition;
    lxb_unicode_buffer_t       *starter;
    /* buf/end delimit the working buffer; p is presumably the write position. */
    lxb_unicode_buffer_t       *buf;
    const lxb_unicode_buffer_t *end;
    lxb_unicode_buffer_t       *p;
    lxb_unicode_buffer_t       *ican;
    /* 4-byte scratch area, presumably for a code point split across chunks. */
    lxb_char_t                 tmp[4];
    /* Spelled "lenght" (sic); the name is visible to users of the struct. */
    uint8_t                    tmp_lenght;
    uint8_t                    quick_ccc;
    lxb_unicode_quick_type_t   quick_type;
    size_t                     flush_cp;
 };
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
 #endif /* LEXBOR_UNICODE_BASE_H */
--- a/ext/lexbor/lexbor/unicode/idna.c
+++ b/ext/lexbor/lexbor/unicode/idna.c
@ -0,0 +1,738 @@
 /*
 * Copyright (C) 2023 Alexander Borisov
 *
 * Author: Alexander Borisov <borisov@lexbor.com>
 */
 #include "lexbor/unicode/idna.h"
 #include "lexbor/unicode/unicode.h"
 #include "lexbor/punycode/punycode.h"
 #include "lexbor/encoding/encoding.h"
 /*
 * Internal context of one processing run.
 *
 * Filled by lxb_unicode_idna_processing_body() and passed as 'ctx' to the
 * normalization and Punycode callbacks of this file.
 */
 typedef struct {
    lxb_unicode_idna_cb_f   cb;       /* User callback, called for every part. */
    void                    *context; /* Opaque user context given to 'cb'. */
    lxb_unicode_idna_flag_t flags;    /* IDNA flags of this run. */
 }
 lxb_unicode_idna_ctx_t;
 /*
 * Output accumulator used by the ToASCII and ToUnicode implementations.
 *
 * Output is first written to the inline 'buffer'; the per-part callbacks
 * switch 'buf' to a heap block once the inline storage is too small.
 */
 typedef struct {
    lxb_char_t              buffer[4096]; /* Inline storage used first. */
    lxb_char_t              *p;           /* Current write position in 'buf'. */
    lxb_char_t              *buf;         /* 'buffer' or a heap block. */
    const lxb_char_t        *end;         /* End of the storage 'buf' points to. */
    lxb_unicode_idna_flag_t flags;        /* IDNA flags of this run. */
 }
 lxb_unicode_idna_ascii_ctx_t;
 /* Forward declarations of the file-local helpers defined below. */
 static lxb_status_t
 lxb_unicode_idna_processing_body(lxb_unicode_idna_t *idna, const void *data,
                                  size_t len, lxb_unicode_idna_cb_f cb, void *ctx,
                                  lxb_unicode_idna_flag_t flags, bool is_cp);
 static lxb_status_t
 lxb_unicode_idna_norm_c_cb(const lxb_codepoint_t *cps, size_t len, void *ctx);
 static lxb_status_t
 lxb_unicode_idna_norm_c_send(const lxb_codepoint_t *cps,
                              const lxb_codepoint_t *p,
                              lxb_unicode_idna_ctx_t *context);
 static lxb_status_t
 lxb_unicode_idna_punycode_cb(const lxb_codepoint_t *cps, size_t len, void *ctx);
 static lxb_status_t
 lxb_unicode_idna_to_ascii_cb(const lxb_codepoint_t *part, size_t len,
                              void *ctx, lxb_status_t status);
 static lxb_status_t
 lxb_unicode_idna_to_ascii_body(lxb_unicode_idna_t *idna, const void *data,
                                size_t length, lexbor_serialize_cb_f cb, void *ctx,
                                lxb_unicode_idna_flag_t flags, bool is_cp);
 static lxb_status_t
 lxb_unicode_idna_ascii_puny_cb(const lxb_char_t *data, size_t length, void *ctx,
                                bool unchanged);
 static lxb_status_t
 lxb_unicode_idna_to_unicode_cb(const lxb_codepoint_t *part, size_t len,
                                void *ctx, lxb_status_t status);
 static lxb_status_t
 lxb_unicode_idna_to_unicode_body(lxb_unicode_idna_t *idna, const void *data,
                                  size_t length, lexbor_serialize_cb_f cb,
                                  void *ctx, lxb_unicode_idna_flag_t flags,
                                  bool is_cp);
 static bool
 lxb_unicode_idna_validity_criteria_h(const void *data, size_t length,
                                      lxb_unicode_idna_flag_t flags, bool is_cp);
 /*
 * Allocate memory for an lxb_unicode_idna_t object.
 * Nothing is initialized here; the block is returned as allocated.
 */
 lxb_unicode_idna_t *
 lxb_unicode_idna_create(void)
 {
    lxb_unicode_idna_t *idna;
    idna = lexbor_malloc(sizeof(lxb_unicode_idna_t));
    return idna;
 }
 /*
 * Initialize the IDNA object: its normalizer is set up for NFC.
 * A NULL object yields LXB_STATUS_ERROR_OBJECT_IS_NULL.
 */
 lxb_status_t
 lxb_unicode_idna_init(lxb_unicode_idna_t *idna)
 {
    if (idna != NULL) {
        return lxb_unicode_normalizer_init(&idna->normalizer,
                                           LXB_UNICODE_NFC);
    }
    return LXB_STATUS_ERROR_OBJECT_IS_NULL;
 }
 /*
 * Clean the embedded normalizer.  No NULL check is made on 'idna'.
 */
 void
 lxb_unicode_idna_clean(lxb_unicode_idna_t *idna)
 {
    lxb_unicode_normalizer_t *normalizer = &idna->normalizer;
    lxb_unicode_normalizer_clean(normalizer);
 }
 /*
 * Release the resources of the embedded normalizer and, when 'self_destroy'
 * is true, the object itself.
 *
 * Returns the object when it was kept, otherwise what lexbor_free() returns.
 * A NULL object is accepted and returned as NULL.
 */
 lxb_unicode_idna_t *
 lxb_unicode_idna_destroy(lxb_unicode_idna_t *idna, bool self_destroy)
 {
    if (idna != NULL) {
        /* The normalizer is embedded, so it must not free itself. */
        (void) lxb_unicode_normalizer_destroy(&idna->normalizer, false);
        if (self_destroy) {
            idna = lexbor_free(idna);
        }
    }
    return idna;
 }
 /*
 * Grow the code point buffer used during mapping.
 *
 * The new capacity is four times the current one plus 'len'.  While 'buf' is
 * still the caller's fixed-size 'buffer', a heap block is allocated and the
 * code points written so far are copied into it; afterwards the heap block
 * is reallocated.
 *
 * @param[in] Current buffer: either 'buffer' or a heap block.
 * @param[in] The caller's initial fixed-size buffer.  It is never freed here.
 * @param[in, out] Write position inside 'buf'; rebased onto the new block.
 * @param[in, out] End of 'buf'; set to the end of the new block.
 * @param[in] Number of additional code points required.
 *
 * @return the new buffer, or NULL on allocation failure.  On failure a heap
 * 'buf' has already been released and must not be freed again by the caller.
 */
 lxb_codepoint_t *
 lxb_unicode_idna_realloc(lxb_codepoint_t *buf, const lxb_codepoint_t *buffer,
                          lxb_codepoint_t **buf_p, lxb_codepoint_t **buf_end,
                          size_t len)
 {
    size_t nlen, used;
    lxb_codepoint_t *tmp;
    /* Take the offset before 'buf' may be invalidated by realloc. */
    used = *buf_p - buf;
    nlen = ((*buf_end - buf) * 4) + len;
    if (buf == buffer) {
        tmp = lexbor_malloc(nlen * sizeof(lxb_codepoint_t));
        if (tmp == NULL) {
            return NULL;
        }
        /*
         * Unlike realloc, a fresh allocation does not carry the existing
         * data over; copy what was already written to the fixed buffer.
         */
        memcpy(tmp, buf, used * sizeof(lxb_codepoint_t));
    }
    else {
        tmp = lexbor_realloc(buf, nlen * sizeof(lxb_codepoint_t));
        if (tmp == NULL) {
            return lexbor_free(buf);
        }
    }
    *buf_p = tmp + used;
    *buf_end = tmp + nlen;
    return tmp;
 }
 /*
 * Domain name processing for byte input (decoded as UTF-8 by the shared
 * implementation).  See lxb_unicode_idna_processing_body().
 */
 lxb_status_t
 lxb_unicode_idna_processing(lxb_unicode_idna_t *idna, const lxb_char_t *data,
                             size_t length, lxb_unicode_idna_cb_f cb, void *ctx,
                             lxb_unicode_idna_flag_t flags)
 {
    const bool is_cp = false;
    lxb_status_t status;
    status = lxb_unicode_idna_processing_body(idna, data, length, cb, ctx,
                                              flags, is_cp);
    return status;
 }
 /*
 * Domain name processing for code point input.
 * See lxb_unicode_idna_processing_body().
 */
 lxb_status_t
 lxb_unicode_idna_processing_cp(lxb_unicode_idna_t *idna,
                                const lxb_codepoint_t *cps, size_t length,
                                lxb_unicode_idna_cb_f cb, void *ctx,
                                lxb_unicode_idna_flag_t flags)
 {
    const bool is_cp = true;
    lxb_status_t status;
    status = lxb_unicode_idna_processing_body(idna, cps, length, cb, ctx,
                                              flags, is_cp);
    return status;
 }
 /*
 * Shared implementation of lxb_unicode_idna_processing() and
 * lxb_unicode_idna_processing_cp().
 *
 * Steps:
 *   1. Every input code point is mapped according to its IDNA type into a
 *      code point buffer (on the stack first, on the heap once it is full).
 *   2. The buffer is normalized with the object's normalizer when
 *      lxb_unicode_quick_check_cp() returns true.
 *   3. The result goes to lxb_unicode_idna_norm_c_cb(), which splits it at
 *      U+002E and reports every part through 'cb'.
 *
 * @param[in] lxb_unicode_idna_t *.
 * @param[in] Input: bytes when is_cp is false, code points otherwise.
 * @param[in] Number of input elements (bytes or code points).
 * @param[in] Callback for results of processing.
 * @param[in] Context for callback.
 * @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
 * @param[in] true if 'data' is an array of lxb_codepoint_t.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 static lxb_status_t
 lxb_unicode_idna_processing_body(lxb_unicode_idna_t *idna, const void *data,
                                  size_t len, lxb_unicode_idna_cb_f cb, void *ctx,
                                  lxb_unicode_idna_flag_t flags, bool is_cp)
 {
    bool need;
    size_t i, length;
    lxb_status_t status;
    lxb_codepoint_t cp, *buf, *buf_p, *buf_end;
    const lxb_char_t *end, *p;
    lxb_unicode_idna_type_t type;
    const lxb_unicode_idna_entry_t *udata;
    const lxb_codepoint_t *maps;
    lxb_unicode_idna_ctx_t context;
    lxb_codepoint_t buffer[4096];
    buf = buffer;
    buf_p = buffer;
    buf_end = buffer + (sizeof(buffer) / sizeof(lxb_codepoint_t));
    p = data;
    /* Turn the element count into a byte count: 'p' and 'end' are byte
     * pointers in both input modes. */
    len *= (is_cp) ? sizeof(lxb_codepoint_t) : 1;
    end = (const lxb_char_t *) data + len;
    while (p < end) {
        if (is_cp) {
            cp = *((const lxb_codepoint_t *) p);
            p = (const lxb_char_t *) ((const lxb_codepoint_t *) p + 1);
        }
        else {
            cp = lxb_encoding_decode_valid_utf_8_single(&p, end);
            /* Anything above the maximum code point is treated as a
             * decoding failure. */
            if (cp > LXB_ENCODING_DECODE_MAX_CODEPOINT) {
                status = LXB_STATUS_ERROR_UNEXPECTED_DATA;
                goto done;
            }
        }
        type = lxb_unicode_idna_type(cp);
    again:
        switch (type) {
            case LXB_UNICODE_IDNA_IGNORED:
                /* Dropped: nothing is written to the output buffer. */
                break;
            case LXB_UNICODE_IDNA_MAPPED:
                /* Replace the code point with its mapping sequence. */
                udata = lxb_unicode_idna_entry_by_cp(cp);
                maps = lxb_unicode_idna_map(udata, &length);
                if (buf_p + length > buf_end) {
                    buf = lxb_unicode_idna_realloc(buf, buffer, &buf_p,
                                                   &buf_end, length);
                    if (buf == NULL) {
                        /* A heap buffer was already released by
                         * lxb_unicode_idna_realloc(). */
                        return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
                    }
                }
                for (i = 0; i < length; i++) {
                    *buf_p++ = maps[i];
                }
                break;
            case LXB_UNICODE_IDNA_DEVIATION:
                /* Deviation code points are mapped only with transitional
                 * processing; otherwise they are copied as is. */
                if ((flags & LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING)) {
                    type = LXB_UNICODE_IDNA_MAPPED;
                    goto again;
                }
                /* Fall through. */
            case LXB_UNICODE_IDNA_DISALLOWED:
                /* Copied here; the validity criteria reject it later. */
                /* Fall through. */
            case LXB_UNICODE_IDNA_VALID:
            default:
                if (buf_p >= buf_end) {
                    buf = lxb_unicode_idna_realloc(buf, buffer, &buf_p,
                                                   &buf_end, 1);
                    if (buf == NULL) {
                        /* A heap buffer was already released by
                         * lxb_unicode_idna_realloc(). */
                        return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
                    }
                }
                *buf_p++ = cp;
                break;
        }
    }
    context.cb = cb;
    context.context = ctx;
    context.flags = flags;
    need = lxb_unicode_quick_check_cp(&idna->normalizer, buf, buf_p - buf,
                                      true);
    if (need) {
        lxb_unicode_flush_count_set(&idna->normalizer, UINT32_MAX);
        status = lxb_unicode_normalize_cp(&idna->normalizer, buf, buf_p - buf,
                                          lxb_unicode_idna_norm_c_cb,
                                          &context, true);
    }
    else {
        /* No normalization pass: hand the mapped data over directly. */
        status = lxb_unicode_idna_norm_c_cb(buf, buf_p - buf, &context);
    }
 done:
    /* Only a heap buffer has to be released. */
    if (buf != buffer) {
        (void) lexbor_free(buf);
    }
    return status;
 }
 /*
 * Split the code points at every U+002E ( . ) FULL STOP and pass each part
 * to lxb_unicode_idna_norm_c_send().
 *
 * 'ctx' is the lxb_unicode_idna_ctx_t of the current run.  The first status
 * that is not LXB_STATUS_OK stops the splitting and is returned.
 */
 static lxb_status_t
 lxb_unicode_idna_norm_c_cb(const lxb_codepoint_t *cps, size_t len, void *ctx)
 {
    lxb_status_t status;
    const lxb_codepoint_t *cur;
    const lxb_codepoint_t *const last = cps + len;
    lxb_unicode_idna_ctx_t *context = ctx;
    for (cur = cps; cur < last; cur++) {
        /* Only U+002E ( . ) FULL STOP ends a part. */
        if (*cur != 0x002E) {
            continue;
        }
        status = lxb_unicode_idna_norm_c_send(cps, cur, context);
        if (status != LXB_STATUS_OK) {
            return status;
        }
        /* The next part begins right after the separator. */
        cps = cur + 1;
    }
    /*
     * A trailing U+002E ( . ) FULL STOP produces one more callback with a
     * zero-length part.
     *
     * For example, "muuuu." results in two callbacks:
     * First: "muuuu".
     * Second: "" -- empty string with length = 0.
     */
    if (cur > cps || (len >= 1 && cur[-1] == '.')) {
        return lxb_unicode_idna_norm_c_send(cps, cur, context);
    }
    return LXB_STATUS_OK;
 }
 /*
 * Handle one part [cps, p) of the domain name.
 *
 * A part starting with "xn--" (any letter case) is Punycode-decoded; on
 * success the decoder callback has already validated and reported it.  When
 * decoding fails, or there is no prefix, the part is validated as is and
 * reported to the user callback together with the decoding status.
 */
 static lxb_status_t
 lxb_unicode_idna_norm_c_send(const lxb_codepoint_t *cps,
                              const lxb_codepoint_t *p,
                              lxb_unicode_idna_ctx_t *context)
 {
    bool valid;
    const size_t length = p - cps;
    lxb_status_t status = LXB_STATUS_OK;
    /* xn-- or Xn-- or xN-- or XN-- */
    if (length >= 4
        && (cps[0] == 0x0078 || cps[0] == 0x0058)
        && (cps[1] == 0x006E || cps[1] == 0x004E)
        && cps[2] == 0x002D && cps[3] == 0x002D)
    {
        status = lxb_punycode_decode_cp(cps + 4, length - 4,
                                        lxb_unicode_idna_punycode_cb,
                                        context);
        if (status == LXB_STATUS_OK) {
            return LXB_STATUS_OK;
        }
        /* Decoding failed: continue with the whole part, prefix included. */
    }
    valid = lxb_unicode_idna_validity_criteria_cp(cps, length,
                                                  context->flags);
    if (!valid) {
        return LXB_STATUS_ERROR_UNEXPECTED_RESULT;
    }
    return context->cb(cps, length, context->context, status);
 }
 /*
 * Receives a Punycode-decoded part, validates it and reports it to the user
 * callback with LXB_STATUS_OK.
 *
 * @param[in] Decoded code points.
 * @param[in] Number of code points.
 * @param[in] The lxb_unicode_idna_ctx_t of the current run.
 *
 * @return LXB_STATUS_ERROR_UNEXPECTED_RESULT if the part does not pass the
 * validity criteria, otherwise the status returned by the user callback.
 */
 static lxb_status_t
 lxb_unicode_idna_punycode_cb(const lxb_codepoint_t *cps, size_t len, void *ctx)
 {
    bool cr;
    lxb_unicode_idna_ctx_t *context = ctx;
    /*
     * The flags are taken from our own context.  context->context is the
     * opaque user context and must not be interpreted here.
     */
    cr = lxb_unicode_idna_validity_criteria_cp(cps, len, context->flags);
    if (!cr) {
        return LXB_STATUS_ERROR_UNEXPECTED_RESULT;
    }
    return context->cb(cps, len, context->context, LXB_STATUS_OK);
 }
 /*
 * ToASCII for byte input.  See lxb_unicode_idna_to_ascii_body().
 */
 lxb_status_t
 lxb_unicode_idna_to_ascii(lxb_unicode_idna_t *idna, const lxb_char_t *data,
                           size_t length, lexbor_serialize_cb_f cb, void *ctx,
                           lxb_unicode_idna_flag_t flags)
 {
    const bool is_cp = false;
    lxb_status_t status;
    status = lxb_unicode_idna_to_ascii_body(idna, data, length, cb, ctx,
                                            flags, is_cp);
    return status;
 }
 /*
 * ToASCII for code point input.  See lxb_unicode_idna_to_ascii_body().
 */
 lxb_status_t
 lxb_unicode_idna_to_ascii_cp(lxb_unicode_idna_t *idna, const lxb_codepoint_t *cps,
                              size_t length, lexbor_serialize_cb_f cb, void *ctx,
                              lxb_unicode_idna_flag_t flags)
 {
    const bool is_cp = true;
    lxb_status_t status;
    status = lxb_unicode_idna_to_ascii_body(idna, cps, length, cb, ctx,
                                            flags, is_cp);
    return status;
 }
 /*
 * Shared implementation of ToASCII.
 *
 * Runs the domain name processing with lxb_unicode_idna_to_ascii_cb(), which
 * collects the parts, each followed by '.', in a local accumulator.  On
 * success the last '.' is dropped and 'cb' is called once with the whole
 * result.  On failure 'cb' is not called.
 */
 static lxb_status_t
 lxb_unicode_idna_to_ascii_body(lxb_unicode_idna_t *idna, const void *data,
                                size_t length, lexbor_serialize_cb_f cb, void *ctx,
                                lxb_unicode_idna_flag_t flags, bool is_cp)
 {
    size_t out_len;
    lxb_status_t status;
    lxb_unicode_idna_ascii_ctx_t context;
    context.buf = context.buffer;
    context.p = context.buf;
    context.end = context.buffer + sizeof(context.buffer);
    context.flags = flags;
    if (is_cp) {
        status = lxb_unicode_idna_processing_cp(idna, data, length,
                                                lxb_unicode_idna_to_ascii_cb,
                                                &context, flags);
    }
    else {
        status = lxb_unicode_idna_processing(idna, data, length,
                                             lxb_unicode_idna_to_ascii_cb,
                                             &context, flags);
    }
    if (status == LXB_STATUS_OK) {
        /* Every part was terminated by U+002E ( . ); drop the last one. */
        if (context.p > context.buf) {
            context.p--;
        }
        out_len = context.p - context.buf;
        status = cb(context.buf, out_len, ctx);
    }
    /* The accumulator may have moved to the heap while growing. */
    if (context.buf != context.buffer) {
        (void) lexbor_free(context.buf);
    }
    return status;
 }
 /*
 * Per-part callback of ToASCII: a part that arrived with an error status is
 * rejected with that status, any other part is Punycode-encoded into the
 * accumulator passed as 'ctx'.
 */
 static lxb_status_t
 lxb_unicode_idna_to_ascii_cb(const lxb_codepoint_t *part, size_t len,
                              void *ctx, lxb_status_t status)
 {
    if (status == LXB_STATUS_OK) {
        return lxb_punycode_encode_cp(part, len,
                                      lxb_unicode_idna_ascii_puny_cb, ctx);
    }
    return status;
 }
 /*
 * Append one Punycode-encoded part to the ToASCII accumulator.
 *
 * The part is prefixed with "xn--" unless 'unchanged' is true, and is
 * followed by '.' and a terminating 0x00 (the terminator is overwritten by
 * the next part).
 *
 * @param[in] Encoded part.
 * @param[in] Length of the encoded part in bytes.
 * @param[in] The lxb_unicode_idna_ascii_ctx_t accumulator.
 * @param[in] true if the encoder left the part as is (no prefix is added).
 *
 * @return LXB_STATUS_OK, or LXB_STATUS_ERROR_MEMORY_ALLOCATION.  On failure
 * the accumulator is left untouched and still owns its buffer.
 */
 static lxb_status_t
 lxb_unicode_idna_ascii_puny_cb(const lxb_char_t *data, size_t length, void *ctx,
                                bool unchanged)
 {
    size_t nlen, used;
    lxb_char_t *tmp;
    lxb_unicode_idna_ascii_ctx_t *asc = ctx;
    static const lexbor_str_t prefix = lexbor_str("xn--");
    /* Worst case: 4 (prefix) + length + 1 ('.') + 1 (0x00). */
    if (asc->p + length + 6 > asc->end) {
        used = asc->p - asc->buf;
        nlen = ((asc->end - asc->buf) * 4) + length + 6;
        if (asc->buf == asc->buffer) {
            tmp = lexbor_malloc(nlen);
            if (tmp != NULL) {
                /*
                 * First move from the inline buffer to the heap: the parts
                 * written so far have to be carried over by hand.
                 */
                memcpy(tmp, asc->buf, used);
            }
        }
        else {
            tmp = lexbor_realloc(asc->buf, nlen);
        }
        if (tmp == NULL) {
            return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
        }
        asc->p = tmp + used;
        asc->buf = tmp;
        asc->end = tmp + nlen;
    }
    if (!unchanged) {
        memcpy(asc->p, prefix.data, prefix.length);
        asc->p += prefix.length;
    }
    memcpy(asc->p, data, length);
    asc->p += length;
    *asc->p++ = '.';
    *asc->p = 0x00;
    return LXB_STATUS_OK;
 }
 /*
 * Validity criteria for byte input.
 * See lxb_unicode_idna_validity_criteria_h().
 */
 bool
 lxb_unicode_idna_validity_criteria(const lxb_char_t *data, size_t length,
                                    lxb_unicode_idna_flag_t flags)
 {
    const bool is_cp = false;
    bool valid;
    valid = lxb_unicode_idna_validity_criteria_h(data, length, flags, is_cp);
    return valid;
 }
 /*
 * Validity criteria for code point input.
 * See lxb_unicode_idna_validity_criteria_h().
 */
 bool
 lxb_unicode_idna_validity_criteria_cp(const lxb_codepoint_t *data, size_t length,
                                       lxb_unicode_idna_flag_t flags)
 {
    const bool is_cp = true;
    bool valid;
    valid = lxb_unicode_idna_validity_criteria_h(data, length, flags, is_cp);
    return valid;
 }
 /*
 * Shared implementation of the validity criteria for one part of a domain
 * name.
 *
 * Checks performed:
 *   - with LXB_UNICODE_IDNA_FLAG_CHECK_HYPHENS: hyphen positions;
 *   - without it: the part must not start with "xn--" (any letter case);
 *   - the part must not contain U+002E ( . ) FULL STOP;
 *   - every code point must be IDNA valid; deviation code points are
 *     accepted unless LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING is set.
 *
 * @param[in] Input: bytes when is_cp is false, code points otherwise.
 * @param[in] Number of input elements (bytes or code points).  Can be 0.
 * @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
 * @param[in] true if 'data' is an array of lxb_codepoint_t.
 *
 * @return true if valid, otherwise false.
 */
 static bool
 lxb_unicode_idna_validity_criteria_h(const void *data, size_t length,
                                      lxb_unicode_idna_flag_t flags, bool is_cp)
 {
    size_t len;
    lxb_codepoint_t cp;
    const lxb_codepoint_t *cps;
    const lxb_char_t *p, *end;
    lxb_unicode_idna_type_t type;
    p = data;
    len = length * ((is_cp) ? sizeof(lxb_codepoint_t) : 1);
    end = (const lxb_char_t *) data + len;
    if (flags & LXB_UNICODE_IDNA_FLAG_CHECK_HYPHENS) {
        /* U+002D HYPHEN-MINUS */
        /*
         * NOTE(review): UTS #46 words CheckHyphens in terms of the third and
         * fourth positions; the indexes 3 and 4 tested below are kept as they
         * were -- confirm that this is intended.
         */
        if (is_cp) {
            cps = data;
            if (length > 4) {
                if (cps[3] == 0x002D || cps[4] == 0x002D) {
                    return false;
                }
            }
            if (length >= 1) {
                if (cps[0] == 0x002D || cps[length - 1] == 0x002D) {
                    return false;
                }
            }
        }
        else {
            if (length > 4) {
                if (p[3] == 0x002D || p[4] == 0x002D) {
                    return false;
                }
            }
            if (length >= 1) {
                /*
                 * First and last byte of the part.  The last byte is at
                 * length - 1; p[-1] would be outside of the input.
                 */
                if (p[0] == 0x002D || p[length - 1] == 0x002D) {
                    return false;
                }
            }
        }
    }
    else if (length >= 4) {
        /* xn-- or Xn-- or xN-- or XN-- */
        if (is_cp) {
            cps = data;
            if (   (cps[0] == 0x0078 || cps[0] == 0x0058)
                && (cps[1] == 0x006E || cps[1] == 0x004E)
                &&  cps[2] == 0x002D && cps[3] == 0x002D)
            {
                return false;
            }
        }
        else {
            if (   (p[0] == 0x0078 || p[0] == 0x0058)
                && (p[1] == 0x006E || p[1] == 0x004E)
                &&  p[2] == 0x002D && p[3] == 0x002D)
            {
                return false;
            }
        }
    }
    while (p < end) {
        if (!is_cp) {
            cp = lxb_encoding_decode_valid_utf_8_single(&p, end);
            if (cp == LXB_ENCODING_DECODE_ERROR) {
                return false;
            }
        }
        else {
            cp = *((const lxb_codepoint_t *) p);
            p = (const lxb_char_t *) ((const lxb_codepoint_t *) p + 1);
        }
        /* U+002E ( . ) FULL STOP */
        if (cp == 0x002E) {
            return false;
        }
        type = lxb_unicode_idna_type(cp);
        switch (type) {
            case LXB_UNICODE_IDNA_VALID:
                break;
            case LXB_UNICODE_IDNA_DEVIATION:
                if (!(flags & LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING)) {
                    break;
                }
                /* Fall through. */
            case LXB_UNICODE_IDNA_DISALLOWED:
            case LXB_UNICODE_IDNA_IGNORED:
            case LXB_UNICODE_IDNA_MAPPED:
            default:
                return false;
        }
    }
    return true;
 }
 /*
 * ToUnicode for byte input.  See lxb_unicode_idna_to_unicode_body().
 */
 lxb_status_t
 lxb_unicode_idna_to_unicode(lxb_unicode_idna_t *idna, const lxb_char_t *data,
                             size_t length, lexbor_serialize_cb_f cb,
                             void *ctx, lxb_unicode_idna_flag_t flags)
 {
    const bool is_cp = false;
    lxb_status_t status;
    status = lxb_unicode_idna_to_unicode_body(idna, data, length, cb, ctx,
                                              flags, is_cp);
    return status;
 }
 /*
 * ToUnicode for code point input.  See lxb_unicode_idna_to_unicode_body().
 */
 lxb_status_t
 lxb_unicode_idna_to_unicode_cp(lxb_unicode_idna_t *idna,
                                const lxb_codepoint_t *cps,
                                size_t length, lexbor_serialize_cb_f cb,
                                void *ctx, lxb_unicode_idna_flag_t flags)
 {
    const bool is_cp = true;
    lxb_status_t status;
    status = lxb_unicode_idna_to_unicode_body(idna, cps, length, cb, ctx,
                                              flags, is_cp);
    return status;
 }
 /*
 * Shared implementation of ToUnicode.
 *
 * Runs the domain name processing with lxb_unicode_idna_to_unicode_cb(),
 * which collects the parts as UTF-8, each followed by '.', in a local
 * accumulator.  On success the last '.' is dropped and 'cb' is called once
 * with the whole result.  On failure 'cb' is not called.
 */
 static lxb_status_t
 lxb_unicode_idna_to_unicode_body(lxb_unicode_idna_t *idna, const void *data,
                                  size_t length, lexbor_serialize_cb_f cb,
                                  void *ctx, lxb_unicode_idna_flag_t flags,
                                  bool is_cp)
 {
    size_t out_len;
    lxb_status_t status;
    lxb_unicode_idna_ascii_ctx_t context;
    context.buf = context.buffer;
    context.p = context.buf;
    context.end = context.buffer + sizeof(context.buffer);
    context.flags = flags;
    if (is_cp) {
        status = lxb_unicode_idna_processing_cp(idna, data, length,
                                                lxb_unicode_idna_to_unicode_cb,
                                                &context, flags);
    }
    else {
        status = lxb_unicode_idna_processing(idna, data, length,
                                             lxb_unicode_idna_to_unicode_cb,
                                             &context, flags);
    }
    if (status == LXB_STATUS_OK) {
        /* Every part was terminated by U+002E ( . ); drop the last one. */
        if (context.p > context.buf) {
            context.p--;
        }
        out_len = context.p - context.buf;
        status = cb(context.buf, out_len, ctx);
    }
    /* The accumulator may have moved to the heap while growing. */
    if (context.buf != context.buffer) {
        (void) lexbor_free(context.buf);
    }
    return status;
 }
 /*
 * Per-part callback of ToUnicode: encodes the part as UTF-8 into the
 * accumulator, followed by '.' and a terminating 0x00 (the terminator is
 * overwritten by the next part).
 *
 * @param[in] Code points of the part.
 * @param[in] Number of code points.  Can be 0.
 * @param[in] The lxb_unicode_idna_ascii_ctx_t accumulator.
 * @param[in] Status of the part; anything but LXB_STATUS_OK is returned
 * unchanged and nothing is written.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.  On
 * allocation failure the accumulator is left untouched and still owns its
 * buffer.
 */
 static lxb_status_t
 lxb_unicode_idna_to_unicode_cb(const lxb_codepoint_t *part, size_t len,
                                void *ctx, lxb_status_t status)
 {
    int8_t res;
    size_t length, nlen, used;
    lxb_char_t *tmp;
    const lxb_codepoint_t *p, *end;
    lxb_unicode_idna_ascii_ctx_t *asc = ctx;
    if (status != LXB_STATUS_OK) {
        return status;
    }
    p = part;
    end = part + len;
    length = 0;
    /* First pass: size of the part in UTF-8. */
    while (p < end) {
        res = lxb_encoding_encode_utf_8_length(*p++);
        if (res == 0) {
            return LXB_STATUS_ERROR_UNEXPECTED_DATA;
        }
        length += res;
    }
    /* Room for the part plus '.' and 0x00. */
    if (asc->p + length + 2 > asc->end) {
        used = asc->p - asc->buf;
        nlen = ((asc->end - asc->buf) * 4) + length + 2;
        if (asc->buf == asc->buffer) {
            tmp = lexbor_malloc(nlen);
            if (tmp != NULL) {
                /*
                 * First move from the inline buffer to the heap: the parts
                 * written so far have to be carried over by hand.
                 */
                memcpy(tmp, asc->buf, used);
            }
        }
        else {
            tmp = lexbor_realloc(asc->buf, nlen);
        }
        if (tmp == NULL) {
            return LXB_STATUS_ERROR_MEMORY_ALLOCATION;
        }
        asc->p = tmp + used;
        asc->buf = tmp;
        asc->end = tmp + nlen;
    }
    /* Second pass: the space is reserved, so the result is not checked. */
    p = part;
    while (p < end) {
        (void) lxb_encoding_encode_utf_8_single(NULL, &asc->p, asc->end, *p++);
    }
    *asc->p++ = '.';
    *asc->p = 0x00;
    return LXB_STATUS_OK;
 }
--- a/ext/lexbor/lexbor/unicode/idna.h
+++ b/ext/lexbor/lexbor/unicode/idna.h
@ -0,0 +1,264 @@
 /*
 * Copyright (C) 2023 Alexander Borisov
 *
 * Author: Alexander Borisov <borisov@lexbor.com>
 *
 * UNICODE IDNA COMPATIBILITY PROCESSING
 * https://www.unicode.org/reports/tr46/
 */
 #ifndef LEXBOR_UNICODE_IDNA_H
 #define LEXBOR_UNICODE_IDNA_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "lexbor/unicode/base.h"
 /*
 * Callback invoked by the processing functions for each part of a domain
 * name.
 *
 * @param[in] Code points of the part.
 * @param[in] Number of code points in the part. Can be 0.
 * @param[in] Context given to the processing function.
 * @param[in] LXB_STATUS_OK, or the error status of a failed Punycode decoding
 * of a part starting with "xn--".
 *
 * @return LXB_STATUS_OK to go on, otherwise an error status value which stops
 * the processing and is returned to the caller.
 */
 typedef lxb_status_t
 (*lxb_unicode_idna_cb_f)(const lxb_codepoint_t *part, size_t len,
                          void *ctx, lxb_status_t status);
 /*
 * IDNA processing flags.  Values are distinct bits and can be combined.
 * Note that the first flag is bit 1 (1 << 1); bit 0 is not used.
 *
 * NOTE(review): idna.c appears to consult only CHECK_HYPHENS and
 * TRANSITIONAL_PROCESSING -- confirm whether the other flags are honoured
 * elsewhere.
 */
 typedef enum {
    LXB_UNICODE_IDNA_FLAG_UNDEF                   = 0x00,
    LXB_UNICODE_IDNA_FLAG_USE_STD3ASCII_RULES     = 1 << 1,
    LXB_UNICODE_IDNA_FLAG_CHECK_HYPHENS           = 1 << 2,
    LXB_UNICODE_IDNA_FLAG_CHECK_BIDI              = 1 << 3, /* Not implemented. */
    LXB_UNICODE_IDNA_FLAG_CHECK_JOINERS           = 1 << 4, /* Not implemented. */
    LXB_UNICODE_IDNA_FLAG_TRANSITIONAL_PROCESSING = 1 << 5,
    LXB_UNICODE_IDNA_FLAG_VERIFY_DNS_LENGTH       = 1 << 6
 }
 lxb_unicode_idna_flag_t;
 /*
 * IDNA object.  Wraps the normalizer used for the normalization step;
 * lxb_unicode_idna_init() sets it up for NFC.
 */
 typedef struct {
    lxb_unicode_normalizer_t normalizer;
 }
 lxb_unicode_idna_t;
 /*
 * Create lxb_unicode_idna_t object.
 *
 * Only memory is allocated; use lxb_unicode_idna_init() to initialize the
 * object.
 *
 * @return lxb_unicode_idna_t * if successful, otherwise NULL.
 */
 LXB_API lxb_unicode_idna_t *
 lxb_unicode_idna_create(void);
 /*
 * Initialization of lxb_unicode_idna_t object.
 *
 * @param[in] lxb_unicode_idna_t *.  May be NULL, in which case the
 * LXB_STATUS_ERROR_OBJECT_IS_NULL status is returned.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_idna_init(lxb_unicode_idna_t *idna);
 /*
 * Clears the object.  Returns it to the state it had after initialization.
 *
 * @param[in] lxb_unicode_idna_t *.  No NULL check is performed.
 */
 LXB_API void
 lxb_unicode_idna_clean(lxb_unicode_idna_t *idna);
 /*
 * Destroy lxb_unicode_idna_t object.
 *
 * Release of occupied resources.
 *
 * @param[in] lxb_unicode_idna_t *. Can be NULL.
 * @param[in] if false: only destroys internal buffers.
 * if true: destroys the lxb_unicode_idna_t object and all internal buffers.
 *
 * @return lxb_unicode_idna_t * if self_destroy = false, otherwise NULL.
 * NULL is also returned when the object itself is NULL.
 */
 LXB_API lxb_unicode_idna_t *
 lxb_unicode_idna_destroy(lxb_unicode_idna_t *idna, bool self_destroy);
 /*
 * Domain name processing.
 *
 * Mapping, Normalization (NFC), Converting, Validating.
 *
 * Callback will be invoked at each level of the domain name.
 *
 * For example:
 *     lexbor.com -- there will be two callbacks, for "lexbor" and "com".
 *     lexbor.com. -- there will be three callbacks, the last one with an
 *     empty part (length = 0).
 *
 * https://www.unicode.org/reports/tr46/#Processing
 *
 * @param[in] lxb_unicode_idna_t *.
 * @param[in] Input characters for processing. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] Callback for results of processing.
 * @param[in] Context for callback.
 * @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_idna_processing(lxb_unicode_idna_t *idna, const lxb_char_t *data,
                             size_t length, lxb_unicode_idna_cb_f cb, void *ctx,
                             lxb_unicode_idna_flag_t flags);
 /*
 * Domain name processing for code points.
 *
 * This function is exactly the same as lxb_unicode_idna_processing() only it
 * takes code points instead of characters as input.
 *
 * Please, see lxb_unicode_idna_processing() function.
 *
 * @param[in] lxb_unicode_idna_t *.
 * @param[in] Input code points for processing. Not NULL.
 * @param[in] Length of code points. Can be 0.
 * @param[in] Callback for results of processing.
 * @param[in] Context for callback.
 * @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_idna_processing_cp(lxb_unicode_idna_t *idna,
                                const lxb_codepoint_t *cps, size_t length,
                                lxb_unicode_idna_cb_f cb, void *ctx,
                                lxb_unicode_idna_flag_t flags);
 /*
 * Processing and converting domain name to ASCII.
 *
 * Does the same thing as lxb_unicode_idna_processing() + converts each part
 * of the domain name to Punycode.
 *
 * Callback will be invoked only once, at the end of processing, with the
 * whole result.  It is not invoked if the processing fails.
 *
 * https://www.unicode.org/reports/tr46/#ToASCII
 *
 * @param[in] lxb_unicode_idna_t *.
 * @param[in] Input characters for processing. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] Callback for results of processing.
 * @param[in] Context for callback.
 * @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_idna_to_ascii(lxb_unicode_idna_t *idna, const lxb_char_t *data,
                           size_t length, lexbor_serialize_cb_f cb, void *ctx,
                           lxb_unicode_idna_flag_t flags);
 /*
 * Processing and converting domain name to ASCII for code points.
 *
 * This function is exactly the same as lxb_unicode_idna_to_ascii() only it
 * takes code points instead of characters as input.
 *
 * Please, see lxb_unicode_idna_to_ascii() function.
 *
 * @param[in] lxb_unicode_idna_t *.
 * @param[in] Input code points for processing. Not NULL.
 * @param[in] Length of code points. Can be 0.
 * @param[in] Callback for results of processing.
 * @param[in] Context for callback.
 * @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_idna_to_ascii_cp(lxb_unicode_idna_t *idna, const lxb_codepoint_t *cps,
                              size_t length, lexbor_serialize_cb_f cb, void *ctx,
                              lxb_unicode_idna_flag_t flags);
 /*
 * Processing and converting domain name to Unicode.
 *
 * Does the same thing as lxb_unicode_idna_processing() + encodes each part
 * of the domain name to UTF-8.
 *
 * Callback will be invoked only once, at the end of processing, with the
 * whole result.  It is not invoked if the processing fails.
 *
 * https://www.unicode.org/reports/tr46/#ToUnicode
 *
 * @param[in] lxb_unicode_idna_t *.
 * @param[in] Input characters for processing. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] Callback for results of processing.
 * @param[in] Context for callback.
 * @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_idna_to_unicode(lxb_unicode_idna_t *idna, const lxb_char_t *data,
                             size_t length, lexbor_serialize_cb_f cb, void *ctx,
                             lxb_unicode_idna_flag_t flags);
 /*
 * Processing and converting domain name to Unicode for code points.
 *
 * This function is exactly the same as lxb_unicode_idna_to_unicode() only it
 * takes code points instead of characters as input.
 *
 * Please, see lxb_unicode_idna_to_unicode() function.
 *
 * @param[in] lxb_unicode_idna_t *.
 * @param[in] Input code points for processing. Not NULL.
 * @param[in] Length of code points. Can be 0.
 * @param[in] Callback for results of processing.
 * @param[in] Context for callback.
 * @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_idna_to_unicode_cp(lxb_unicode_idna_t *idna, const lxb_codepoint_t *cps,
                                size_t length, lexbor_serialize_cb_f cb, void *ctx,
                                lxb_unicode_idna_flag_t flags);
 /*
 * Validity Criteria.
 *
 * The function checks one part of a domain name for validity according to a
 * number of criteria.  A part containing U+002E ( . ) FULL STOP is not valid.
 *
 * LXB_UNICODE_IDNA_FLAG_CHECK_BIDI and LXB_UNICODE_IDNA_FLAG_CHECK_JOINERS
 * are not implemented.
 *
 * https://www.unicode.org/reports/tr46/#Validity_Criteria
 *
 * @param[in] Input characters for processing. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
 *
 * @return true if valid, otherwise false.
 */
 LXB_API bool
 lxb_unicode_idna_validity_criteria(const lxb_char_t *data, size_t length,
                                    lxb_unicode_idna_flag_t flags);
 /*
 * Validity Criteria for code points.
 *
 * Same as lxb_unicode_idna_validity_criteria() only it takes code points as
 * input.
 *
 * @param[in] Input code points for processing. Not NULL.
 * @param[in] Length of code points. Can be 0.
 * @param[in] Bitmap of IDNA flags (LXB_UNICODE_IDNA_FLAG_*).
 *
 * @return true if valid, otherwise false.
 */
 LXB_API bool
 lxb_unicode_idna_validity_criteria_cp(const lxb_codepoint_t *data, size_t length,
                                       lxb_unicode_idna_flag_t flags);
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
 #endif /* LEXBOR_UNICODE_IDNA_H */
--- a/ext/lexbor/lexbor/unicode/res.h
+++ b/ext/lexbor/lexbor/unicode/res.h
--- a/ext/lexbor/lexbor/unicode/unicode.c
+++ b/ext/lexbor/lexbor/unicode/unicode.c
--- a/ext/lexbor/lexbor/unicode/unicode.h
+++ b/ext/lexbor/lexbor/unicode/unicode.h
@ -0,0 +1,405 @@
 /*
 * Copyright (C) 2023 Alexander Borisov
 *
 * Author: Alexander Borisov <borisov@lexbor.com>
 */
 #ifndef LEXBOR_UNICODE_H
 #define LEXBOR_UNICODE_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "lexbor/unicode/base.h"
 #include "lexbor/unicode/idna.h"
 #include "lexbor/core/array_obj.h"
 /*
 * Unicode normalization forms.
 *
 * A value of this type is passed to lxb_unicode_normalizer_init() and
 * lxb_unicode_normalization_form_set() to select the form to apply.
 */
 typedef enum {
    LXB_UNICODE_NFC  = 0x00, /* Normalization Form C (NFC).   */
    LXB_UNICODE_NFD  = 0x01, /* Normalization Form D (NFD).   */
    LXB_UNICODE_NFKC = 0x02, /* Normalization Form KC (NFKC). */
    LXB_UNICODE_NFKD = 0x03  /* Normalization Form KD (NFKD). */
 }
 lxb_unicode_form_t;
 /*
 * Create lxb_unicode_normalizer_t object.
 *
 * @return lxb_unicode_normalizer_t * if successful, otherwise NULL.
 */
 LXB_API lxb_unicode_normalizer_t *
 lxb_unicode_normalizer_create(void);
 /*
 * Initialization of lxb_unicode_normalizer_t object.
 *
 * Support normalization forms:
 *     Normalization Form D (NFD):   LXB_UNICODE_NFD
 *     Normalization Form C (NFC):   LXB_UNICODE_NFC
 *     Normalization Form KD (NFKD): LXB_UNICODE_NFKD
 *     Normalization Form KC (NFKC): LXB_UNICODE_NFKC
 *
 * https://www.unicode.org/reports/tr15/
 *
 * @param[in] lxb_unicode_normalizer_t *
 * @param[in] Normalization form.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_normalizer_init(lxb_unicode_normalizer_t *uc,
                            lxb_unicode_form_t form);
 /*
 * Cleaning of lxb_unicode_normalizer_t object.
 *
 * Clears the object.  Returns it to the state it had after initialization.
 *
 * @param[in] lxb_unicode_normalizer_t *
 */
 LXB_API void
 lxb_unicode_normalizer_clean(lxb_unicode_normalizer_t *uc);
 /*
 * Destroy lxb_unicode_normalizer_t object.
 *
 * Release of occupied resources.
 *
 * @param[in] lxb_unicode_normalizer_t *. Can be NULL.
 * @param[in] if false: only destroys internal buffers.
 * if true: destroys the lxb_unicode_normalizer_t object and all internal buffers.
 *
 * @return lxb_unicode_normalizer_t * if self_destroy = false, otherwise NULL.
 */
 LXB_API lxb_unicode_normalizer_t *
 lxb_unicode_normalizer_destroy(lxb_unicode_normalizer_t *uc, bool self_destroy);
 /*
 * Unicode normalization forms.
 *
 * This is a function with an implementation of the unicode normalization
 * algorithm.
 *
 * The function is designed to work with a stream (chunks).
 *
 * Please, see examples for this function in examples/lexbor/unicode directory.
 *
 * @param[in] lxb_unicode_normalizer_t *
 * @param[in] Input characters for normalization. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] Callback for results of normalization.
 * @param[in] Context for callback.
 * @param[in] Set to true if the last chunk or the only one chunk is processed.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_normalize(lxb_unicode_normalizer_t *uc, const lxb_char_t *data,
                      size_t length, lexbor_serialize_cb_f cb, void *ctx,
                      bool is_last);
 /*
 * Unicode normalization end.
 *
 * The function is used to complete a normalization.
 * Same as calling the lxb_unicode_normalize() function with is_last = true.
 *
 * Use this function only if you do not set is_last = true in
 * the lxb_unicode_normalize() function.
 *
 * For example:
 *     status = lxb_unicode_normalize(uc, data, length, cb, NULL, false);
 *     status = lxb_unicode_normalize(uc, data, length, cb, NULL, false);
 *     status = lxb_unicode_normalize_end(uc, cb, NULL);
 *
 *     The same as:
 *     status = lxb_unicode_normalize(uc, data, length, cb, NULL, false);
 *     status = lxb_unicode_normalize(uc, data, length, cb, NULL, true);
 *
 * @param[in] lxb_unicode_normalizer_t *
 * @param[in] Callback for results of normalization.
 * @param[in] Context for callback.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_normalize_end(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_f cb,
                          void *ctx);
 /*
 * Unicode normalization forms for code points.
 *
 * This function is exactly the same as lxb_unicode_normalize() only it takes
 * code points instead of characters as input.
 *
 * Also, unlike the lxb_unicode_normalize() function, the callback will be
 * called with code points, not characters.
 *
 * The function is designed to work with a stream (chunks).
 *
 * @param[in] lxb_unicode_normalizer_t *
 * @param[in] Input code points for normalization. Not NULL.
 * @param[in] Length of code points. Can be 0.
 * @param[in] Callback for results of normalization.
 * @param[in] Context for callback.
 * @param[in] Set to true if the last chunk or the only one chunk is processed.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_normalize_cp(lxb_unicode_normalizer_t *uc, const lxb_codepoint_t *cps,
                         size_t length, lexbor_serialize_cb_cp_f cb, void *ctx,
                         bool is_last);
 /*
 * Unicode normalization end for code points.
 *
 * This function is completely similar to lxb_unicode_normalize_end(),
 * only it takes a function with code points as a callback function.
 *
 * Same as calling the lxb_unicode_normalize_cp() function with is_last = true.
 *
 * Use this function only if you do not set is_last = true in
 * the lxb_unicode_normalize_cp() function.
 *
 * For example:
 *     status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, false);
 *     status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, false);
 *     status = lxb_unicode_normalize_cp_end(uc, cb, NULL);
 *
 *     The same as:
 *     status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, false);
 *     status = lxb_unicode_normalize_cp(uc, cps, length, cb, NULL, true);
 *
 * @param[in] lxb_unicode_normalizer_t *
 * @param[in] Callback for results of normalization.
 * @param[in] Context for callback.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_normalize_cp_end(lxb_unicode_normalizer_t *uc,
                             lexbor_serialize_cb_cp_f cb, void *ctx);
 /*
 * Quick Check.
 *
 * The basic normalization algorithm is not simple and requires time
 * and resources.
 * This function checks relatively quickly if the text needs to be normalized.
 *
 * The function is designed to work with a stream (chunks).
 *
 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
 *
 * @param[in] lxb_unicode_normalizer_t *
 * @param[in] Input characters for checks. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] Set to true if the last chunk or the only one chunk is processed.
 *
 * @return true if it needs to be normalized, otherwise false.
 */
 LXB_API bool
 lxb_unicode_quick_check(lxb_unicode_normalizer_t *uc, const lxb_char_t *data,
                        size_t length, bool is_last);
 /*
 * Quick Check End.
 *
 * The function is used to complete a quick check.
 * Same as calling the lxb_unicode_quick_check() function with is_last = true.
 *
 * Use this function only if you do not set is_last = true in
 * the lxb_unicode_quick_check() function.
 *
 * For example:
 *     is = lxb_unicode_quick_check(uc, data, length, false);
 *     is = lxb_unicode_quick_check(uc, data, length, false);
 *     is = lxb_unicode_quick_check_end(uc);
 *
 *     The same as:
 *     is = lxb_unicode_quick_check(uc, data, length, false);
 *     is = lxb_unicode_quick_check(uc, data, length, true);
 *
 * @param[in] lxb_unicode_normalizer_t *
 *
 * @return true if it needs to be normalized, otherwise false.
 */
 LXB_API bool
 lxb_unicode_quick_check_end(lxb_unicode_normalizer_t *uc);
 /*
 * Quick Check for code points.
 *
 * Same as lxb_unicode_quick_check() only it takes code points as input.
 *
 * @param[in] lxb_unicode_normalizer_t *
 * @param[in] Input code points for checks. Not NULL.
 * @param[in] Length of code points. Can be 0.
 * @param[in] Set to true if the last chunk or the only one chunk is processed.
 *
 * @return true if it needs to be normalized, otherwise false.
 */
 LXB_API bool
 lxb_unicode_quick_check_cp(lxb_unicode_normalizer_t *uc,
                           const lxb_codepoint_t *cps, size_t length,
                           bool is_last);
 /*
 * Quick Check End for code points.
 *
 * Same as lxb_unicode_quick_check_end().
 *
 * For example:
 *     is = lxb_unicode_quick_check_cp(uc, cps, length, false);
 *     is = lxb_unicode_quick_check_cp(uc, cps, length, false);
 *     is = lxb_unicode_quick_check_cp_end(uc);
 *
 *     The same as:
 *     is = lxb_unicode_quick_check_cp(uc, cps, length, false);
 *     is = lxb_unicode_quick_check_cp(uc, cps, length, true);
 *
 * @param[in] lxb_unicode_normalizer_t *
 *
 * @return true if it needs to be normalized, otherwise false.
 */
 LXB_API bool
 lxb_unicode_quick_check_cp_end(lxb_unicode_normalizer_t *uc);
 /*
 * Flush.
 *
 * Force flush the buffer to the user's callback if it is possible.
 *
 * Please, see lxb_unicode_flush_count_set() function.
 *
 * @param[in] lxb_unicode_normalizer_t *.
 * @param[in] Callback.
 * @param[in] Callback context.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_flush(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_f cb,
                  void *ctx);
 /*
 * Flush for code points.
 *
 * Same as lxb_unicode_flush(), but it takes a callback with code points as
 * input.
 *
 * @param[in] lxb_unicode_normalizer_t *.
 * @param[in] Callback.
 * @param[in] Callback context.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_flush_cp(lxb_unicode_normalizer_t *uc, lexbor_serialize_cb_cp_f cb,
                     void *ctx);
 /*
 * Change normalization form.
 *
 * You should only apply this function after one of the following actions:
 *     1. The lxb_unicode_normalize() function was called with is_last = true.
 *        That is, the processing of the previous type was successfully
 *        completed.
 *  OR
 *     2. The end of normalization function was called:
 *        lxb_unicode_normalize_end().
 *  OR
 *     3. The lxb_unicode_normalizer_t object cleanup function was called:
 *        lxb_unicode_normalizer_clean().
 *
 *
 * All this is to be able to normalize or quickly check text with different
 * types without creating new objects.
 *
 * @param[in] lxb_unicode_normalizer_t *.
 * @param[in] Normalization form.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_unicode_normalization_form_set(lxb_unicode_normalizer_t *uc,
                                   lxb_unicode_form_t form);
 LXB_API const lxb_unicode_entry_t *
 lxb_unicode_entry(lxb_codepoint_t cp);
 LXB_API const lxb_unicode_composition_cp_t *
 lxb_unicode_compose_entry(lxb_codepoint_t first, lxb_codepoint_t second);
 LXB_API lxb_unicode_idna_type_t
 lxb_unicode_idna_type(lxb_codepoint_t cp);
 LXB_API const lxb_unicode_composition_cp_t *
 lxb_unicode_composition_cp(lxb_codepoint_t first, lxb_codepoint_t second);
 LXB_API const lxb_unicode_normalization_entry_t *
 lxb_unicode_normalization_entry(const lxb_unicode_entry_t *entry);
 LXB_API const lxb_unicode_normalization_entry_t *
 lxb_unicode_normalization_entry_by_cp(lxb_codepoint_t cp);
 LXB_API const lxb_unicode_normalization_entry_t *
 lxb_unicode_normalization_entry_by_index(uint16_t index);
 LXB_API bool
 lxb_unicode_normalization_is_null(const lxb_unicode_normalization_entry_t *entry);
 LXB_API const lxb_codepoint_t *
 lxb_unicode_full_canonical(const lxb_unicode_normalization_entry_t *entry,
                           size_t *out_length);
 LXB_API const lxb_codepoint_t *
 lxb_unicode_full_compatibility(const lxb_unicode_normalization_entry_t *entry,
                               size_t *out_length);
 LXB_API const lxb_unicode_idna_entry_t *
 lxb_unicode_idna_entry(const lxb_unicode_entry_t *entry);
 LXB_API const lxb_unicode_idna_entry_t *
 lxb_unicode_idna_entry_by_cp(lxb_codepoint_t cp);
 LXB_API const lxb_unicode_idna_entry_t *
 lxb_unicode_idna_entry_by_index(uint16_t index);
 LXB_API const lxb_codepoint_t *
 lxb_unicode_idna_map(const lxb_unicode_idna_entry_t *entry,
                     size_t *out_length);
 /*
 * Inline functions.
 */
 /*
 * Sets the buffer size for codepoints.
 *
 * By default, 4096 processed codepoints are accumulated before they are
 * converted to lxb_char_t and the result is returned to the user via callback.
 *
 * If the count is set to 0, the user callback will be called for every
 * codepoint processed.  That is, the output is streamed without accumulation
 * in the intermediate buffer.
 *
 * @param[in] lxb_unicode_normalizer_t *.
 * @param[in] Count of codepoints in the buffer.
 */
 lxb_inline void
 lxb_unicode_flush_count_set(lxb_unicode_normalizer_t *uc, size_t count)
 {
    /* Only stores the new threshold in the normalizer object. */
    uc->flush_cp = count;
 }
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
 #endif /* LEXBOR_UNICODE_H */
--- a/ext/lexbor/lexbor/url/base.h
+++ b/ext/lexbor/lexbor/url/base.h
@ -0,0 +1,32 @@
 /*
 * Copyright (C) 2023-2024 Alexander Borisov
 *
 * Author: Alexander Borisov <borisov@lexbor.com>
 */
 #ifndef LEXBOR_URL_BASE_H
 #define LEXBOR_URL_BASE_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "lexbor/core/base.h"
 #include "lexbor/core/mraw.h"
 #include "lexbor/core/str.h"
 #define LXB_URL_VERSION_MAJOR 0
 #define LXB_URL_VERSION_MINOR 3
 #define LXB_URL_VERSION_PATCH 0
 #define LXB_URL_VERSION_STRING LEXBOR_STRINGIZE(LXB_URL_VERSION_MAJOR) "."    \
                               LEXBOR_STRINGIZE(LXB_URL_VERSION_MINOR) "."    \
                               LEXBOR_STRINGIZE(LXB_URL_VERSION_PATCH)
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
 #endif /* LEXBOR_URL_BASE_H */
--- a/ext/lexbor/lexbor/url/url.c
+++ b/ext/lexbor/lexbor/url/url.c
--- a/ext/lexbor/lexbor/url/url.h
+++ b/ext/lexbor/lexbor/url/url.h
@ -0,0 +1,551 @@
 /*
 * Copyright (C) 2023 Alexander Borisov
 *
 * Author: Alexander Borisov <borisov@lexbor.com>
 *
 * The URL Standard.
 * By specification: https://url.spec.whatwg.org/
 */
 #ifndef LEXBOR_URL_H
 #define LEXBOR_URL_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "lexbor/url/base.h"
 #include "lexbor/core/mraw.h"
 #include "lexbor/core/plog.h"
 #include "lexbor/encoding/encoding.h"
 #include "lexbor/unicode/unicode.h"
 /*
 * Kinds of URL validation errors.
 *
 * The identifiers appear to mirror the validation error names of the WHATWG
 * URL Standard (https://url.spec.whatwg.org/).
 * NOTE(review): presumably these are the values recorded in the parser log;
 * confirm against url.c.
 *
 * Values are assigned sequentially starting from 0x00.
 */
 typedef enum {
    LXB_URL_ERROR_TYPE_DOMAIN_TO_ASCII = 0x00,
    LXB_URL_ERROR_TYPE_DOMAIN_TO_UNICODE,
    LXB_URL_ERROR_TYPE_DOMAIN_INVALID_CODE_POINT,
    LXB_URL_ERROR_TYPE_HOST_INVALID_CODE_POINT,
    LXB_URL_ERROR_TYPE_IPV4_EMPTY_PART,
    LXB_URL_ERROR_TYPE_IPV4_TOO_MANY_PARTS,
    LXB_URL_ERROR_TYPE_IPV4_NON_NUMERIC_PART,
    LXB_URL_ERROR_TYPE_IPV4_NON_DECIMAL_PART,
    LXB_URL_ERROR_TYPE_IPV4_OUT_OF_RANGE_PART,
    LXB_URL_ERROR_TYPE_IPV6_UNCLOSED,
    LXB_URL_ERROR_TYPE_IPV6_INVALID_COMPRESSION,
    LXB_URL_ERROR_TYPE_IPV6_TOO_MANY_PIECES,
    LXB_URL_ERROR_TYPE_IPV6_MULTIPLE_COMPRESSION,
    LXB_URL_ERROR_TYPE_IPV6_INVALID_CODE_POINT,
    LXB_URL_ERROR_TYPE_IPV6_TOO_FEW_PIECES,
    LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_TOO_MANY_PIECES,
    LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_INVALID_CODE_POINT,
    LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_OUT_OF_RANGE_PART,
    LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_TOO_FEW_PARTS,
    LXB_URL_ERROR_TYPE_INVALID_URL_UNIT,
    LXB_URL_ERROR_TYPE_SPECIAL_SCHEME_MISSING_FOLLOWING_SOLIDUS,
    LXB_URL_ERROR_TYPE_MISSING_SCHEME_NON_RELATIVE_URL,
    LXB_URL_ERROR_TYPE_INVALID_REVERSE_SOLIDUS,
    LXB_URL_ERROR_TYPE_INVALID_CREDENTIALS,
    LXB_URL_ERROR_TYPE_HOST_MISSING,
    LXB_URL_ERROR_TYPE_PORT_OUT_OF_RANGE,
    LXB_URL_ERROR_TYPE_PORT_INVALID,
    LXB_URL_ERROR_TYPE_FILE_INVALID_WINDOWS_DRIVE_LETTER,
    LXB_URL_ERROR_TYPE_FILE_INVALID_WINDOWS_DRIVE_LETTER_HOST,
    /* Not an error type; equals the number of error types listed above. */
    LXB_URL_ERROR_TYPE__LAST_ENTRY
 }
 lxb_url_error_type_t;
 /*
 * States of the URL parser.
 *
 * A value of this type is passed to lxb_url_parse_basic() as the state
 * override; LXB_URL_STATE__UNDEF is the default value to pass there.
 *
 * The identifiers appear to follow the state names of the basic URL parser in
 * the WHATWG URL Standard (https://url.spec.whatwg.org/).
 */
 typedef enum {
    LXB_URL_STATE__UNDEF = 0x00,
    LXB_URL_STATE_SCHEME_START_STATE,
    LXB_URL_STATE_SCHEME_STATE,
    LXB_URL_STATE_NO_SCHEME_STATE,
    LXB_URL_STATE_SPECIAL_RELATIVE_OR_AUTHORITY_STATE,
    LXB_URL_STATE_PATH_OR_AUTHORITY_STATE,
    LXB_URL_STATE_RELATIVE_STATE,
    LXB_URL_STATE_RELATIVE_SLASH_STATE,
    LXB_URL_STATE_SPECIAL_AUTHORITY_SLASHES_STATE,
    LXB_URL_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE,
    LXB_URL_STATE_AUTHORITY_STATE,
    LXB_URL_STATE_HOST_STATE,
    LXB_URL_STATE_HOSTNAME_STATE,
    LXB_URL_STATE_PORT_STATE,
    LXB_URL_STATE_FILE_STATE,
    LXB_URL_STATE_FILE_SLASH_STATE,
    LXB_URL_STATE_FILE_HOST_STATE,
    LXB_URL_STATE_PATH_START_STATE,
    LXB_URL_STATE_PATH_STATE,
    LXB_URL_STATE_OPAQUE_PATH_STATE,
    LXB_URL_STATE_QUERY_STATE,
    LXB_URL_STATE_FRAGMENT_STATE
 }
 lxb_url_state_t;
 /*
 * Scheme types.
 *
 * New values may only be appended at the end of the list, immediately before
 * LXB_URL_SCHEMEL_TYPE__LAST_ENTRY.  Existing values must keep their numbers.
 *
 * Please, see lxb_url_scheme_res in /lexbor/url/url.c.
 */
 typedef enum {
    LXB_URL_SCHEMEL_TYPE__UNDEF      = 0x00,
    LXB_URL_SCHEMEL_TYPE__UNKNOWN    = 0x01,
    LXB_URL_SCHEMEL_TYPE_HTTP        = 0x02,
    LXB_URL_SCHEMEL_TYPE_HTTPS       = 0x03,
    LXB_URL_SCHEMEL_TYPE_WS          = 0x04,
    LXB_URL_SCHEMEL_TYPE_WSS         = 0x05,
    LXB_URL_SCHEMEL_TYPE_FTP         = 0x06,
    LXB_URL_SCHEMEL_TYPE_FILE        = 0x07,
    /* Not a scheme; one past the last defined scheme type. */
    LXB_URL_SCHEMEL_TYPE__LAST_ENTRY
 }
 lxb_url_scheme_type_t;
 /*
 * Static description of a scheme: its name, a port number and its type.
 *
 * NOTE(review): presumably this is the entry type of the lxb_url_scheme_res
 * table in url.c and `port` is the scheme's default port; confirm there.
 */
 typedef struct {
    const lexbor_str_t    name;
    uint16_t              port;
    lxb_url_scheme_type_t type;
 }
 lxb_url_scheme_data_t;
 /*
 * Scheme of a URL object: its name together with its type.
 *
 * The `name` member is what lxb_url_scheme() returns a pointer to.
 */
 typedef struct {
    lexbor_str_t          name;
    lxb_url_scheme_type_t type;
 }
 lxb_url_scheme_t;
 /*
 * Kind of host stored in lxb_url_host_t.
 */
 typedef enum {
    LXB_URL_HOST_TYPE__UNDEF = 0x00,
    LXB_URL_HOST_TYPE_DOMAIN = 0x01,
    LXB_URL_HOST_TYPE_OPAQUE = 0x02,
    LXB_URL_HOST_TYPE_IPV4   = 0x03,
    LXB_URL_HOST_TYPE_IPV6   = 0x04,
    LXB_URL_HOST_TYPE_EMPTY  = 0x05
 }
 lxb_url_host_type_t;
 /*
 * Host of a URL object.
 *
 * The host value is stored in the union `u`; `type` tells the kind of host.
 * NOTE(review): presumably `type` selects which member of `u` is meaningful
 * (ipv6, ipv4, opaque or domain) and no member is meaningful for the
 * _UNDEF and EMPTY types; confirm against url.c.
 */
 typedef struct {
    lxb_url_host_type_t type;
    union {
        uint16_t     ipv6[8]; /* Eight 16-bit values. */
        uint32_t     ipv4;    /* Single 32-bit value. */
        lexbor_str_t opaque;
        lexbor_str_t domain;
    } u;
 }
 lxb_url_host_t;
 /*
 * Path of a URL object.
 *
 * The `str` member is what lxb_url_path_str() returns a pointer to.
 *
 * NOTE(review): the meaning of `length` is not evident from this header;
 * confirm in url.c.
 * NOTE(review): presumably `opaque` is true when the URL has an opaque path
 * (see LXB_URL_STATE_OPAQUE_PATH_STATE); confirm in url.c.
 */
 typedef struct {
    lexbor_str_t str;
    size_t       length;
    bool         opaque;
 }
 lxb_url_path_t;
 /*
 * URL object.
 *
 * The individual components can be read with the lxb_url_*() inline accessors
 * declared in this header.
 */
 typedef struct {
    lxb_url_scheme_t   scheme;
    lxb_url_host_t     host;
    lexbor_str_t       username;
    lexbor_str_t       password;
    uint16_t           port;
    /* NOTE(review): presumably tells whether `port` holds a value; confirm. */
    bool               has_port;
    lxb_url_path_t     path;
    lexbor_str_t       query;
    lexbor_str_t       fragment;
    /* Memory object from which the memory for this URL was taken. */
    lexbor_mraw_t      *mraw;
 }
 lxb_url_t;
 /*
 * URL parser object.
 *
 * Created with lxb_url_parser_create() and set up with lxb_url_parser_init().
 */
 typedef struct {
    /* URL object returned by lxb_url_get(). */
    lxb_url_t          *url;
    /* Memory object; read with lxb_url_mraw(), replaced with lxb_url_mraw_set(). */
    lexbor_mraw_t      *mraw;
    /* NOTE(review): presumably the log of parsing; confirm in url.c. */
    lexbor_plog_t      *log;
    /* NOTE(review): presumably used for IDNA processing of hosts; confirm. */
    lxb_unicode_idna_t *idna;
 }
 lxb_url_parser_t;
 /*
 * Create lxb_url_parser_t object.
 *
 * @return lxb_url_parser_t * if successful, otherwise NULL.
 */
 LXB_API lxb_url_parser_t *
 lxb_url_parser_create(void);
 /*
 * Initialization of lxb_url_parser_t object.
 *
 * The parser is not bound to the received URLs in any way. That is, after
 * parsing the lxb_url_parser_t object can be destroyed and we can continue
 * working with the received URLs.
 *
 * Memory for created URLs is taken from lexbor_mraw_t object, which you can
 * pass during initialization of lxb_url_parser_t object, or a new lexbor_mraw_t
 * object will be created during initialization if NULL is passed.
 *
 * Each created URL will have a pointer to the lexbor_mraw_t object.
 *
 * By destroying the lexbor_mraw_t object you destroy all the URL objects
 * created by the parser. Use the lxb_url_destroy() function to destroy a
 * specific URL.
 *
 * Destroying the lxb_url_parser_t object with lxb_url_parser_destroy() does
 * not destroy the lexbor_mraw_t memory object.
 *
 * Please, see functions lxb_url_parser_memory_destroy(), lxb_url_destroy(),
 * lxb_url_memory_destroy().
 *
 * @param[in] lxb_url_parser_t *
 * @param[in] lexbor_mraw_t *. Can be NULL. If NULL is passed, the parser
 * creates its own memory object, which will be bound to all created URLs.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_url_parser_init(lxb_url_parser_t *parser, lexbor_mraw_t *mraw);
 /*
 * Clears the object. Returns it to the state it had after initialization.
 *
 * This function must be called before the parsing functions can be reused.
 *
 * For example:
 *     lxb_url_parse()
 *     lxb_url_parser_clean()
 *     lxb_url_parse()
 *     lxb_url_destroy()
 *
 * @param[in] lxb_url_parser_t *
 */
 LXB_API void
 lxb_url_parser_clean(lxb_url_parser_t *parser);
 /*
 * Destroy lxb_url_parser_t object.
 *
 * Release of occupied resources.
 * The lexbor_mraw_t memory object is not destroyed in this function.
 *
 * @param[in] lxb_url_parser_t *. Can be NULL.
 * @param[in] if false: only destroys internal buffers.
 * if true: destroys the lxb_url_parser_t object and all internal buffers.
 *
 * @return lxb_url_parser_t * if destroy_self = false, otherwise NULL.
 */
 LXB_API lxb_url_parser_t *
 lxb_url_parser_destroy(lxb_url_parser_t *parser, bool destroy_self);
 /*
 * Destroys the lexbor_mraw_t object, and thus all associated URLs.
 *
 * After that, new URLs cannot be parsed until a new lexbor_mraw_t object is
 * assigned to the lxb_url_parser_t object.
 *
 * @param[in] lxb_url_parser_t *.
 */
 LXB_API void
 lxb_url_parser_memory_destroy(lxb_url_parser_t *parser);
 /*
 * URL parser.
 *
 * This function is an implementation of URL parsing according to the WHATWG
 * specification.
 *
 * @param[in] lxb_url_parser_t *.
 * @param[in] const lxb_url_t *. Base URL, can be NULL.
 * @param[in] Input characters. Not NULL.
 * @param[in] Length of characters. Can be 0.
 *
 * @return lxb_url_t * if successful, otherwise NULL.
 */
 LXB_API lxb_url_t *
 lxb_url_parse(lxb_url_parser_t *parser, const lxb_url_t *base_url,
              const lxb_char_t *data, size_t length);
 /*
 * URL basic parser.
 *
 * This function is an implementation of URL parsing according to the WHATWG
 * specification.
 *
 * Use the lxb_url_get() function to get the URL object.
 *
 * @param[in] lxb_url_parser_t *.
 * @param[in] lxb_url_t *. Can be NULL.
 * @param[in] const lxb_url_t *. Base URL, can be NULL.
 * @param[in] Input characters. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] lxb_url_state_t, for default set to LXB_URL_STATE__UNDEF.
 * @param[in] lxb_encoding_t, default (LXB_ENCODING_DEFAULT) LXB_ENCODING_UTF_8.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
 LXB_API lxb_status_t
 lxb_url_parse_basic(lxb_url_parser_t *parser, lxb_url_t *url,
                    const lxb_url_t *base_url,
                    const lxb_char_t *data, size_t length,
                    lxb_url_state_t override_state, lxb_encoding_t encoding);
 /*
 * Erase URL.
 *
 * Frees all internal memory occupied by the URL object, but does not destroy
 * the object.
 *
 * @param[in] lxb_url_t *.
 */
 LXB_API void
 lxb_url_erase(lxb_url_t *url);
 /*
 * Destroys URL.
 *
 * @param[in] lxb_url_t *.
 *
 * @return NULL.
 */
 LXB_API lxb_url_t *
 lxb_url_destroy(lxb_url_t *url);
 /*
 * Destroys the lexbor_mraw_t memory object.
 *
 * The function will destroy all URLs associated with the lexbor_mraw_t memory
 * object, including the passed one.
 *
 * Keep in mind, if you have a live lxb_url_parser_t parsing object, you will
 * have a pointer to garbage after calling this function instead of a pointer
 * to the lexbor_mraw_t object.
 * In this case you need to assign a new memory object lexbor_mraw_t for the
 * parser. Use the lxb_url_mraw_set() function.
 *
 * @param[in] lxb_url_t *.
 */
 LXB_API void
 lxb_url_memory_destroy(lxb_url_t *url);
 /*
 * Below is an API for modifying the URL object according to the
 * https://url.spec.whatwg.org/#api specification.
 *
 * It is not necessary to pass the lxb_url_parser_t object to API functions.
 * You need to pass the parser if you want to have logs of parsing.
 *
 * All API functions can be passed NULL as "const lxb_char_t *" data.
 */
 LXB_API lxb_status_t
 lxb_url_api_href_set(lxb_url_t *url, lxb_url_parser_t *parser,
                     const lxb_char_t *href, size_t length);
 LXB_API lxb_status_t
 lxb_url_api_protocol_set(lxb_url_t *url, lxb_url_parser_t *parser,
                         const lxb_char_t *protocol, size_t length);
 LXB_API lxb_status_t
 lxb_url_api_username_set(lxb_url_t *url,
                         const lxb_char_t *username, size_t length);
 LXB_API lxb_status_t
 lxb_url_api_password_set(lxb_url_t *url,
                         const lxb_char_t *password, size_t length);
 LXB_API lxb_status_t
 lxb_url_api_host_set(lxb_url_t *url, lxb_url_parser_t *parser,
                     const lxb_char_t *host, size_t length);
 LXB_API lxb_status_t
 lxb_url_api_hostname_set(lxb_url_t *url, lxb_url_parser_t *parser,
                         const lxb_char_t *hostname, size_t length);
 LXB_API lxb_status_t
 lxb_url_api_port_set(lxb_url_t *url, lxb_url_parser_t *parser,
                     const lxb_char_t *port, size_t length);
 LXB_API lxb_status_t
 lxb_url_api_pathname_set(lxb_url_t *url, lxb_url_parser_t *parser,
                         const lxb_char_t *pathname, size_t length);
 LXB_API lxb_status_t
 lxb_url_api_search_set(lxb_url_t *url, lxb_url_parser_t *parser,
                       const lxb_char_t *search, size_t length);
 LXB_API lxb_status_t
 lxb_url_api_hash_set(lxb_url_t *url, lxb_url_parser_t *parser,
                     const lxb_char_t *hash, size_t length);
 /*
 * Below are functions for serializing a URL object and its individual
 * parameters.
 *
 * Note that the callback may be called more than once.
 * For example, the lxb_url_serialize() function will callback multiple times:
 * 1. http
 * 2. ://
 * 3. example.com
 * and so on.
 */
 LXB_API lxb_status_t
 lxb_url_serialize(const lxb_url_t *url, lexbor_serialize_cb_f cb, void *ctx,
                  bool exclude_fragment);
 LXB_API lxb_status_t
 lxb_url_serialize_scheme(const lxb_url_t *url,
                         lexbor_serialize_cb_f cb, void *ctx);
 LXB_API lxb_status_t
 lxb_url_serialize_username(const lxb_url_t *url,
                           lexbor_serialize_cb_f cb, void *ctx);
 LXB_API lxb_status_t
 lxb_url_serialize_password(const lxb_url_t *url,
                           lexbor_serialize_cb_f cb, void *ctx);
 LXB_API lxb_status_t
 lxb_url_serialize_host(const lxb_url_host_t *host,
                       lexbor_serialize_cb_f cb, void *ctx);
 LXB_API lxb_status_t
 lxb_url_serialize_host_unicode(lxb_unicode_idna_t *idna,
                               const lxb_url_host_t *host,
                               lexbor_serialize_cb_f cb, void *ctx);
 LXB_API lxb_status_t
 lxb_url_serialize_host_ipv4(uint32_t ipv4,
                            lexbor_serialize_cb_f cb, void *ctx);
 LXB_API lxb_status_t
 lxb_url_serialize_host_ipv6(const uint16_t *ipv6,
                            lexbor_serialize_cb_f cb, void *ctx);
 LXB_API lxb_status_t
 lxb_url_serialize_port(const lxb_url_t *url,
                       lexbor_serialize_cb_f cb, void *ctx);
 LXB_API lxb_status_t
 lxb_url_serialize_path(const lxb_url_path_t *path,
                       lexbor_serialize_cb_f cb, void *ctx);
 LXB_API lxb_status_t
 lxb_url_serialize_query(const lxb_url_t *url,
                        lexbor_serialize_cb_f cb, void *ctx);
 LXB_API lxb_status_t
 lxb_url_serialize_fragment(const lxb_url_t *url,
                           lexbor_serialize_cb_f cb, void *ctx);
 /*
 * Creates a clone of the object's URL.
 *
 * For lexbor_mraw_t *, use url->mraw or another lexbor_mraw_t * object.
 *
 * @param[in] lexbor_mraw_t *.
 * @param[in] lxb_url_t *.
 *
 * @return a new URL object if successful, otherwise NULL value.
 */
 LXB_API lxb_url_t *
 lxb_url_clone(lexbor_mraw_t *mraw, lxb_url_t *url);
 /*
 * Inline functions.
 */
 lxb_inline const lexbor_str_t *
 lxb_url_scheme(const lxb_url_t *url)
 {
    return &url->scheme.name;
 }
 /*
 * Get the username of a URL.
 *
 * @param[in] const lxb_url_t *.
 *
 * @return pointer to the username string stored inside the URL object.
 */
 lxb_inline const lexbor_str_t *
 lxb_url_username(const lxb_url_t *url)
 {
    const lexbor_str_t *username = &url->username;

    return username;
 }
 /*
 * Get the password of a URL.
 *
 * @param[in] const lxb_url_t *.
 *
 * @return pointer to the password string stored inside the URL object.
 */
 lxb_inline const lexbor_str_t *
 lxb_url_password(const lxb_url_t *url)
 {
    const lexbor_str_t *password = &url->password;

    return password;
 }
 /*
 * Get the host of a URL.
 *
 * @param[in] const lxb_url_t *.
 *
 * @return pointer to the host structure stored inside the URL object.
 */
 lxb_inline const lxb_url_host_t *
 lxb_url_host(const lxb_url_t *url)
 {
    const lxb_url_host_t *host = &url->host;

    return host;
 }
 /*
 * Get the port number stored in a URL.
 *
 * The value is returned as stored; see also lxb_url_has_port().
 *
 * @param[in] const lxb_url_t *.
 *
 * @return the value of the port field of the URL object.
 */
 lxb_inline uint16_t
 lxb_url_port(const lxb_url_t *url)
 {
    const uint16_t port = url->port;

    return port;
 }
 /*
 * Get the has_port flag of a URL.
 *
 * @param[in] const lxb_url_t *.
 *
 * @return the value of the has_port field of the URL object.
 */
 lxb_inline bool
 lxb_url_has_port(const lxb_url_t *url)
 {
    const bool has_port = url->has_port;

    return has_port;
 }
 /*
 * Get the path of a URL.
 *
 * @param[in] const lxb_url_t *.
 *
 * @return pointer to the path structure stored inside the URL object.
 */
 lxb_inline const lxb_url_path_t *
 lxb_url_path(const lxb_url_t *url)
 {
    const lxb_url_path_t *path = &url->path;

    return path;
 }
 lxb_inline const lexbor_str_t *
 lxb_url_path_str(const lxb_url_t *url)
 {
    return &url->path.str;
 }
 /*
 * Get the query of a URL.
 *
 * @param[in] const lxb_url_t *.
 *
 * @return pointer to the query string stored inside the URL object.
 */
 lxb_inline const lexbor_str_t *
 lxb_url_query(const lxb_url_t *url)
 {
    const lexbor_str_t *query = &url->query;

    return query;
 }
 /*
 * Get the fragment of a URL.
 *
 * @param[in] const lxb_url_t *.
 *
 * @return pointer to the fragment string stored inside the URL object.
 */
 lxb_inline const lexbor_str_t *
 lxb_url_fragment(const lxb_url_t *url)
 {
    const lexbor_str_t *fragment = &url->fragment;

    return fragment;
 }
 /*
 * Get the memory object currently assigned to the parser.
 *
 * @param[in] lxb_url_parser_t *.
 *
 * @return the lexbor_mraw_t pointer stored in the parser.
 */
 lxb_inline lexbor_mraw_t *
 lxb_url_mraw(lxb_url_parser_t *parser)
 {
    lexbor_mraw_t *mraw = parser->mraw;

    return mraw;
 }
 /*
 * Assign a memory object to the parser.
 *
 * Only the pointer is stored; the pointer previously held by the parser is
 * overwritten and is not released by this function.
 *
 * @param[in] lxb_url_parser_t *.
 * @param[in] lexbor_mraw_t *.
 */
 lxb_inline void
 lxb_url_mraw_set(lxb_url_parser_t *parser, lexbor_mraw_t *mraw)
 {
    parser->mraw = mraw;
 }
 /*
 * Get the URL object held by the parser.
 *
 * Use it to obtain the URL object after calling lxb_url_parse_basic().
 *
 * @param[in] lxb_url_parser_t *.
 *
 * @return the lxb_url_t pointer stored in the parser.
 */
 lxb_inline lxb_url_t *
 lxb_url_get(lxb_url_parser_t *parser)
 {
    lxb_url_t *url = parser->url;

    return url;
 }
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
 #endif /* LEXBOR_URL_H */