php-src/ext/dom/html_document.c
Niels Dossche 2952e164a9
Fix GH-17481: UTF-8 corruption in \Dom\HTMLDocument
We need to properly handle the case when we return from having too few
bytes, this needs to be handled separately because the while loop
otherwise just performs a partial byte copy.

Closes GH-17489.
2025-01-17 16:25:08 +01:00

1700 lines
58 KiB
C

/*
+----------------------------------------------------------------------+
| Copyright (c) The PHP Group |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| https://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
| Authors: Niels Dossche <nielsdos@php.net> |
+----------------------------------------------------------------------+
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include "php.h"
#if defined(HAVE_LIBXML) && defined(HAVE_DOM)
#include "php_dom.h"
#include "infra.h"
#include "html5_parser.h"
#include "html5_serializer.h"
#include "namespace_compat.h"
#include "private_data.h"
#include "dom_properties.h"
#include <Zend/zend_smart_string.h>
#include <lexbor/html/encoding.h>
#include <lexbor/encoding/encoding.h>
/* Implementation defined, but as HTML5 defaults in all other cases to UTF-8, we'll do the same. */
#define DOM_FALLBACK_ENCODING_ID LXB_ENCODING_UTF_8
typedef struct dom_line_column_cache {
size_t last_line;
size_t last_column;
size_t last_offset;
} dom_line_column_cache;
typedef struct dom_lexbor_libxml2_bridge_application_data {
const char *input_name;
const lxb_codepoint_t *current_input_codepoints;
const char *current_input_characters;
size_t current_input_length;
size_t current_total_offset;
dom_line_column_cache cache_tokenizer;
bool html_no_implied;
} dom_lexbor_libxml2_bridge_application_data;
typedef struct dom_character_encoding_data {
const lxb_encoding_data_t *encoding_data;
size_t bom_shift;
} dom_character_encoding_data;
typedef zend_result (*dom_write_output)(void*, const char *, size_t);
typedef struct dom_output_ctx {
const lxb_encoding_data_t *encoding_data;
const lxb_encoding_data_t *decoding_data;
lxb_encoding_encode_t *encode;
lxb_encoding_decode_t *decode;
lxb_codepoint_t *codepoints;
lxb_char_t *encoding_output;
void *output_data;
dom_write_output write_output;
} dom_output_ctx;
typedef struct dom_decoding_encoding_ctx {
/* We can skip some conversion if the input and output encoding are both UTF-8,
* we only have to validate and substitute replacement characters */
bool fast_path; /* Put first, near the encode & decode structures, for cache locality */
lxb_encoding_encode_t encode;
lxb_encoding_decode_t decode;
const lxb_encoding_data_t *encode_data;
const lxb_encoding_data_t *decode_data;
lxb_char_t encoding_output[4096];
lxb_codepoint_t codepoints[4096];
} dom_decoding_encoding_ctx;
/* https://dom.spec.whatwg.org/#dom-document-implementation */
zend_result dom_modern_document_implementation_read(dom_object *obj, zval *retval)
{
const uint32_t PROP_INDEX = 0;
#if ZEND_DEBUG
zend_string *implementation_str = ZSTR_INIT_LITERAL("implementation", false);
const zend_property_info *prop_info = zend_get_property_info(dom_abstract_base_document_class_entry, implementation_str, 0);
zend_string_release_ex(implementation_str, false);
ZEND_ASSERT(OBJ_PROP_TO_NUM(prop_info->offset) == PROP_INDEX);
#endif
zval *cached_implementation = OBJ_PROP_NUM(&obj->std, PROP_INDEX);
if (Z_ISUNDEF_P(cached_implementation)) {
php_dom_create_implementation(cached_implementation, true);
}
ZVAL_OBJ_COPY(retval, Z_OBJ_P(cached_implementation));
return SUCCESS;
}
static void dom_decoding_encoding_ctx_init(dom_decoding_encoding_ctx *ctx)
{
ctx->decode_data = ctx->encode_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
ctx->fast_path = true;
(void) lxb_encoding_encode_init(
&ctx->encode,
ctx->encode_data,
ctx->encoding_output,
sizeof(ctx->encoding_output) / sizeof(*ctx->encoding_output)
);
(void) lxb_encoding_encode_replace_set(&ctx->encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
(void) lxb_encoding_decode_init(
&ctx->decode,
ctx->decode_data,
ctx->codepoints,
sizeof(ctx->codepoints) / sizeof(*ctx->codepoints)
);
(void) lxb_encoding_decode_replace_set(&ctx->decode, LXB_ENCODING_REPLACEMENT_BUFFER, LXB_ENCODING_REPLACEMENT_BUFFER_LEN);
}
static const char *dom_lexbor_tokenizer_error_code_to_string(lxb_html_tokenizer_error_id_t id)
{
switch (id) {
case LXB_HTML_TOKENIZER_ERROR_ABCLOFEMCO: return "abrupt-closing-of-empty-comment";
case LXB_HTML_TOKENIZER_ERROR_ABDOPUID: return "abrupt-doctype-public-identifier";
case LXB_HTML_TOKENIZER_ERROR_ABDOSYID: return "abrupt-doctype-system-identifier";
case LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE: return "absence-of-digits-in-numeric-character-reference";
case LXB_HTML_TOKENIZER_ERROR_CDINHTCO: return "cdata-in-html-content";
case LXB_HTML_TOKENIZER_ERROR_CHREOUUNRA: return "character-reference-outside-unicode-range";
case LXB_HTML_TOKENIZER_ERROR_COCHININST: return "control-character-in-input-stream";
case LXB_HTML_TOKENIZER_ERROR_COCHRE: return "control-character-reference";
case LXB_HTML_TOKENIZER_ERROR_ENTAWIAT: return "end-tag-with-attributes";
case LXB_HTML_TOKENIZER_ERROR_DUAT: return "duplicate-attribute";
case LXB_HTML_TOKENIZER_ERROR_ENTAWITRSO: return "end-tag-with-trailing-solidus";
case LXB_HTML_TOKENIZER_ERROR_EOBETANA: return "eof-before-tag-name";
case LXB_HTML_TOKENIZER_ERROR_EOINCD: return "eof-in-cdata";
case LXB_HTML_TOKENIZER_ERROR_EOINCO: return "eof-in-comment";
case LXB_HTML_TOKENIZER_ERROR_EOINDO: return "eof-in-doctype";
case LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE: return "eof-in-script-html-comment-like-text";
case LXB_HTML_TOKENIZER_ERROR_EOINTA: return "eof-in-tag";
case LXB_HTML_TOKENIZER_ERROR_INCLCO: return "incorrectly-closed-comment";
case LXB_HTML_TOKENIZER_ERROR_INOPCO: return "incorrectly-opened-comment";
case LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA: return "invalid-character-sequence-after-doctype-name";
case LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA: return "invalid-first-character-of-tag-name";
case LXB_HTML_TOKENIZER_ERROR_MIATVA: return "missing-attribute-value";
case LXB_HTML_TOKENIZER_ERROR_MIDONA: return "missing-doctype-name";
case LXB_HTML_TOKENIZER_ERROR_MIDOPUID: return "missing-doctype-public-identifier";
case LXB_HTML_TOKENIZER_ERROR_MIDOSYID: return "missing-doctype-system-identifier";
case LXB_HTML_TOKENIZER_ERROR_MIENTANA: return "missing-end-tag-name";
case LXB_HTML_TOKENIZER_ERROR_MIQUBEDOPUID: return "missing-quote-before-doctype-public-identifier";
case LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID: return "missing-quote-before-doctype-system-identifier";
case LXB_HTML_TOKENIZER_ERROR_MISEAFCHRE: return "missing-semicolon-after-character-reference";
case LXB_HTML_TOKENIZER_ERROR_MIWHAFDOPUKE: return "missing-whitespace-after-doctype-public-keyword";
case LXB_HTML_TOKENIZER_ERROR_MIWHAFDOSYKE: return "missing-whitespace-after-doctype-system-keyword";
case LXB_HTML_TOKENIZER_ERROR_MIWHBEDONA: return "missing-whitespace-before-doctype-name";
case LXB_HTML_TOKENIZER_ERROR_MIWHBEAT: return "missing-whitespace-between-attributes";
case LXB_HTML_TOKENIZER_ERROR_MIWHBEDOPUANSYID: return "missing-whitespace-between-doctype-public-and-system-identifiers";
case LXB_HTML_TOKENIZER_ERROR_NECO: return "nested-comment";
case LXB_HTML_TOKENIZER_ERROR_NOCHRE: return "noncharacter-character-reference";
case LXB_HTML_TOKENIZER_ERROR_NOININST: return "noncharacter-in-input-stream";
case LXB_HTML_TOKENIZER_ERROR_NOVOHTELSTTAWITRSO: return "non-void-html-element-start-tag-with-trailing-solidus";
case LXB_HTML_TOKENIZER_ERROR_NUCHRE: return "null-character-reference";
case LXB_HTML_TOKENIZER_ERROR_SUCHRE: return "surrogate-character-reference";
case LXB_HTML_TOKENIZER_ERROR_SUININST: return "surrogate-in-input-stream";
case LXB_HTML_TOKENIZER_ERROR_UNCHAFDOSYID: return "unexpected-character-after-doctype-system-identifier";
case LXB_HTML_TOKENIZER_ERROR_UNCHINATNA: return "unexpected-character-in-attribute-name";
case LXB_HTML_TOKENIZER_ERROR_UNCHINUNATVA: return "unexpected-character-in-unquoted-attribute-value";
case LXB_HTML_TOKENIZER_ERROR_UNEQSIBEATNA: return "unexpected-equals-sign-before-attribute-name";
case LXB_HTML_TOKENIZER_ERROR_UNNUCH: return "unexpected-null-character";
case LXB_HTML_TOKENIZER_ERROR_UNQUMAINOFTANA: return "unexpected-question-mark-instead-of-tag-name";
case LXB_HTML_TOKENIZER_ERROR_UNSOINTA: return "unexpected-solidus-in-tag";
case LXB_HTML_TOKENIZER_ERROR_UNNACHRE: return "unknown-named-character-reference";
default: return "unknown error";
}
}
static const char *dom_lexbor_tree_error_code_to_string(lxb_html_tree_error_id_t id)
{
switch (id) {
case LXB_HTML_RULES_ERROR_UNTO: return "unexpected-token";
case LXB_HTML_RULES_ERROR_UNCLTO: return "unexpected-closed-token";
case LXB_HTML_RULES_ERROR_NUCH: return "null-character";
case LXB_HTML_RULES_ERROR_UNCHTO: return "unexpected-character-token";
case LXB_HTML_RULES_ERROR_UNTOININMO: return "unexpected-token-in-initial-mode";
case LXB_HTML_RULES_ERROR_BADOTOININMO: return "bad-doctype-token-in-initial-mode";
case LXB_HTML_RULES_ERROR_DOTOINBEHTMO: return "doctype-token-in-before-html-mode";
case LXB_HTML_RULES_ERROR_UNCLTOINBEHTMO: return "unexpected-closed-token-in-before-html-mode";
case LXB_HTML_RULES_ERROR_DOTOINBEHEMO: return "doctype-token-in-before-head-mode";
case LXB_HTML_RULES_ERROR_UNCLTOINBEHEMO: return "unexpected-closed_token-in-before-head-mode";
case LXB_HTML_RULES_ERROR_DOTOINHEMO: return "doctype-token-in-head-mode";
case LXB_HTML_RULES_ERROR_NOVOHTELSTTAWITRSO: return "non-void-html-element-start-tag-with-trailing-solidus";
case LXB_HTML_RULES_ERROR_HETOINHEMO: return "head-token-in-head-mode";
case LXB_HTML_RULES_ERROR_UNCLTOINHEMO: return "unexpected-closed-token-in-head-mode";
case LXB_HTML_RULES_ERROR_TECLTOWIOPINHEMO: return "template-closed-token-without-opening-in-head-mode";
case LXB_HTML_RULES_ERROR_TEELISNOCUINHEMO: return "template-element-is-not-current-in-head-mode";
case LXB_HTML_RULES_ERROR_DOTOINHENOMO: return "doctype-token-in-head-noscript-mode";
case LXB_HTML_RULES_ERROR_DOTOAFHEMO: return "doctype-token-after-head-mode";
case LXB_HTML_RULES_ERROR_HETOAFHEMO: return "head-token-after-head-mode";
case LXB_HTML_RULES_ERROR_DOTOINBOMO: return "doctype-token-in-body-mode";
case LXB_HTML_RULES_ERROR_BAENOPELISWR: return "bad-ending-open-elements-is-wrong";
case LXB_HTML_RULES_ERROR_OPELISWR: return "open-elements-is-wrong";
case LXB_HTML_RULES_ERROR_UNELINOPELST: return "unexpected-element-in-open-elements-stack";
case LXB_HTML_RULES_ERROR_MIELINOPELST: return "missing-element-in-open-elements-stack";
case LXB_HTML_RULES_ERROR_NOBOELINSC: return "no-body-element-in-scope";
case LXB_HTML_RULES_ERROR_MIELINSC: return "missing-element-in-scope";
case LXB_HTML_RULES_ERROR_UNELINSC: return "unexpected-element-in-scope";
case LXB_HTML_RULES_ERROR_UNELINACFOST: return "unexpected-element-in-active-formatting-stack";
case LXB_HTML_RULES_ERROR_UNENOFFI: return "unexpected-end-of-file";
case LXB_HTML_RULES_ERROR_CHINTATE: return "characters-in-table-text";
case LXB_HTML_RULES_ERROR_DOTOINTAMO: return "doctype-token-in-table-mode";
case LXB_HTML_RULES_ERROR_DOTOINSEMO: return "doctype-token-in-select-mode";
case LXB_HTML_RULES_ERROR_DOTOAFBOMO: return "doctype-token-after-body-mode";
case LXB_HTML_RULES_ERROR_DOTOINFRMO: return "doctype-token-in-frameset-mode";
case LXB_HTML_RULES_ERROR_DOTOAFFRMO: return "doctype-token-after-frameset-mode";
case LXB_HTML_RULES_ERROR_DOTOFOCOMO: return "doctype-token-foreign-content-mode";
default: return "unknown error";
}
}
static const char *dom_lexbor_libxml2_bridge_status_code_to_string(lexbor_libxml2_bridge_status status)
{
switch (status) {
case LEXBOR_LIBXML2_BRIDGE_STATUS_CANNOT_INIT: return "cannot initialize data structures";
case LEXBOR_LIBXML2_BRIDGE_STATUS_FATAL_PARSE: return "fatal error in parsing";
case LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW: return "string length overflow";
case LEXBOR_LIBXML2_BRIDGE_STATUS_OOM: return "out of memory";
default: return "unknown error";
}
}
static void dom_reset_line_column_cache(dom_line_column_cache *cache)
{
cache->last_line = 1;
cache->last_column = 1;
cache->last_offset = 0;
}
static void dom_find_line_and_column_using_cache(
const dom_lexbor_libxml2_bridge_application_data *application_data,
dom_line_column_cache *cache,
size_t offset
)
{
offset -= application_data->current_total_offset;
if (offset > application_data->current_input_length) {
/* Possible with empty input, also just good for general safety */
offset = application_data->current_input_length;
}
size_t last_column = cache->last_column;
size_t last_line = cache->last_line;
size_t last_offset = cache->last_offset;
/* Either unicode or UTF-8 data */
if (application_data->current_input_codepoints != NULL) {
while (last_offset < offset) {
if (application_data->current_input_codepoints[last_offset] == 0x000A /* Unicode codepoint for line feed */) {
last_line++;
last_column = 1;
} else {
last_column++;
}
last_offset++;
}
} else {
while (last_offset < offset) {
const lxb_char_t current = application_data->current_input_characters[last_offset];
if (current == '\n') {
last_line++;
last_column = 1;
last_offset++;
} else {
/* See Lexbor tokenizer patch
* Note for future self: branchlessly computing the length and jumping by the length would be nice,
* however it takes so many instructions to do so that it is slower than this naive method. */
if ((current & 0b11000000) != 0b10000000) {
last_column++;
}
last_offset++;
}
}
}
cache->last_column = last_column;
cache->last_line = last_line;
cache->last_offset = last_offset;
}
static void dom_lexbor_libxml2_bridge_tokenizer_error_reporter(
void *application_data_voidptr,
lxb_html_tokenizer_error_t *error,
size_t offset
)
{
dom_lexbor_libxml2_bridge_application_data *application_data = application_data_voidptr;
dom_find_line_and_column_using_cache(application_data, &application_data->cache_tokenizer, offset);
php_libxml_pretend_ctx_error_ex(application_data->input_name, application_data->cache_tokenizer.last_line, application_data->cache_tokenizer.last_column, "tokenizer error %s in %s, line: %zu, column: %zu\n", dom_lexbor_tokenizer_error_code_to_string(error->id), application_data->input_name, application_data->cache_tokenizer.last_line, application_data->cache_tokenizer.last_column);
}
static void dom_lexbor_libxml2_bridge_tree_error_reporter(
void *application_data_voidptr,
lxb_html_tree_error_t *error,
size_t line,
size_t column,
size_t len
)
{
dom_lexbor_libxml2_bridge_application_data *application_data = application_data_voidptr;
if (line == 1 && application_data->html_no_implied && error->id == LXB_HTML_RULES_ERROR_UNTOININMO) {
/* For no implied mode, we want to mimick libxml's behaviour of not reporting an error for a lacking doctype. */
return;
}
if (len <= 1) {
/* Possible with EOF, or single-character tokens, don't use a range in the error display in this case */
php_libxml_pretend_ctx_error_ex(
application_data->input_name,
line,
column,
"tree error %s in %s, line: %zu, column: %zu\n",
dom_lexbor_tree_error_code_to_string(error->id),
application_data->input_name,
line,
column
);
} else {
php_libxml_pretend_ctx_error_ex(
application_data->input_name,
line,
column,
"tree error %s in %s, line: %zu, column: %zu-%zu\n",
dom_lexbor_tree_error_code_to_string(error->id),
application_data->input_name,
line,
column,
column + len - 1
);
}
}
static xmlNodePtr dom_search_child(xmlNodePtr parent, const char *searching_for)
{
xmlNodePtr node = parent->children;
while (node != NULL) {
if (node->type == XML_ELEMENT_NODE && strcmp((const char *) node->name, searching_for) == 0) {
return node;
}
node = node->next;
}
return NULL;
}
static void dom_place_remove_element_and_hoist_children(xmlNodePtr parent, const char *searching_for)
{
xmlNodePtr node = dom_search_child(parent, searching_for);
if (node != NULL) {
xmlUnlinkNode(node);
xmlNodePtr child = node->children;
while (child != NULL) {
xmlUnlinkNode(child);
xmlAddChild(parent, child);
child = node->children;
}
xmlFreeNode(node);
}
}
static void dom_post_process_html5_loading(
xmlDocPtr lxml_doc,
zend_long options,
const lexbor_libxml2_bridge_extracted_observations *observations
)
{
if (options & HTML_PARSE_NOIMPLIED) {
xmlNodePtr html_node = dom_search_child((xmlNodePtr) lxml_doc, "html");
if (!observations->has_explicit_head_tag) {
dom_place_remove_element_and_hoist_children(html_node, "head");
}
if (!observations->has_explicit_body_tag) {
dom_place_remove_element_and_hoist_children(html_node, "body");
}
if (!observations->has_explicit_html_tag) {
dom_place_remove_element_and_hoist_children((xmlNodePtr) lxml_doc, "html");
}
}
}
/* https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding */
static dom_character_encoding_data dom_determine_encoding(const char *source, size_t source_len)
{
dom_character_encoding_data result;
/* BOM sniffing */
if (source_len >= 3 && source[0] == '\xEF' && source[1] == '\xBB' && source[2] == '\xBF') {
result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
result.bom_shift = 3;
return result;
} else if (source_len >= 2) {
if (source[0] == '\xFE' && source[1] == '\xFF') {
result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_16BE);
result.bom_shift = 2;
return result;
} else if (source[0] == '\xFF' && source[1] == '\xFE') {
result.encoding_data = lxb_encoding_data(LXB_ENCODING_UTF_16LE);
result.bom_shift = 2;
return result;
}
}
/* Perform prescan */
lxb_html_encoding_t encoding;
lxb_status_t status = lxb_html_encoding_init(&encoding);
if (status != LXB_STATUS_OK) {
goto fallback_uninit;
}
/* This is the "wait either for 1024 bytes or 500ms" part */
if (source_len > 1024) {
source_len = 1024;
}
status = lxb_html_encoding_determine(&encoding, (const lxb_char_t *) source, (const lxb_char_t *) source + source_len);
if (status != LXB_STATUS_OK) {
goto fallback;
}
lxb_html_encoding_entry_t *entry = lxb_html_encoding_meta_entry(&encoding, 0);
if (entry == NULL) {
goto fallback;
}
result.encoding_data = lxb_encoding_data_by_pre_name(entry->name, entry->end - entry->name);
if (!result.encoding_data) {
goto fallback;
}
result.bom_shift = 0;
lxb_html_encoding_destroy(&encoding, false);
return result;
fallback:
lxb_html_encoding_destroy(&encoding, false);
fallback_uninit:
result.encoding_data = lxb_encoding_data(DOM_FALLBACK_ENCODING_ID);
result.bom_shift = 0;
return result;
}
static void dom_setup_parser_encoding_manually(const lxb_char_t *buf_start, const lxb_encoding_data_t *encoding_data, dom_decoding_encoding_ctx *decoding_encoding_ctx, dom_lexbor_libxml2_bridge_application_data *application_data)
{
static const lxb_codepoint_t replacement_codepoint = LXB_ENCODING_REPLACEMENT_CODEPOINT;
decoding_encoding_ctx->decode_data = encoding_data;
(void) lxb_encoding_decode_init(
&decoding_encoding_ctx->decode,
decoding_encoding_ctx->decode_data,
decoding_encoding_ctx->codepoints,
sizeof(decoding_encoding_ctx->codepoints) / sizeof(*decoding_encoding_ctx->codepoints)
);
(void) lxb_encoding_decode_replace_set(
&decoding_encoding_ctx->decode,
&replacement_codepoint,
LXB_ENCODING_REPLACEMENT_BUFFER_LEN
);
/* Note: encode_data is for UTF-8 */
decoding_encoding_ctx->fast_path = decoding_encoding_ctx->decode_data == decoding_encoding_ctx->encode_data;
if (decoding_encoding_ctx->fast_path) {
application_data->current_input_codepoints = NULL;
application_data->current_input_characters = (const char *) buf_start;
} else {
application_data->current_input_codepoints = decoding_encoding_ctx->codepoints;
application_data->current_input_characters = NULL;
}
}
static void dom_setup_parser_encoding_implicitly(
const lxb_char_t **buf_ref,
size_t *read,
dom_decoding_encoding_ctx *decoding_encoding_ctx,
dom_lexbor_libxml2_bridge_application_data *application_data
)
{
const char *buf_start = (const char *) *buf_ref;
dom_character_encoding_data dom_encoding_data = dom_determine_encoding(buf_start, *read);
*buf_ref += dom_encoding_data.bom_shift;
*read -= dom_encoding_data.bom_shift;
dom_setup_parser_encoding_manually((const lxb_char_t *) buf_start, dom_encoding_data.encoding_data, decoding_encoding_ctx, application_data);
}
static bool dom_process_parse_chunk(
lexbor_libxml2_bridge_parse_context *ctx,
lxb_html_document_t *document,
lxb_html_parser_t *parser,
size_t encoded_length,
const lxb_char_t *encoding_output,
size_t input_buffer_length,
size_t *tokenizer_error_offset,
size_t *tree_error_offset
)
{
dom_lexbor_libxml2_bridge_application_data *application_data = ctx->application_data;
application_data->current_input_length = input_buffer_length;
lexbor_status_t lexbor_status = lxb_html_document_parse_chunk(document, encoding_output, encoded_length);
if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
return false;
}
if (ctx->tokenizer_error_reporter || ctx->tree_error_reporter) {
lexbor_libxml2_bridge_report_errors(ctx, parser, encoding_output, application_data->current_total_offset, tokenizer_error_offset, tree_error_offset);
dom_find_line_and_column_using_cache(application_data, &application_data->cache_tokenizer, application_data->current_total_offset + input_buffer_length);
}
application_data->current_total_offset += input_buffer_length;
application_data->cache_tokenizer.last_offset = 0;
return true;
}
static bool dom_decode_encode_fast_path(
lexbor_libxml2_bridge_parse_context *ctx,
lxb_html_document_t *document,
lxb_html_parser_t *parser,
const lxb_char_t **buf_ref_ref,
const lxb_char_t *buf_end,
dom_decoding_encoding_ctx *decoding_encoding_ctx,
size_t *tokenizer_error_offset,
size_t *tree_error_offset
)
{
const lxb_char_t *buf_ref = *buf_ref_ref;
/* If we returned for needing more bytes, we need to finish up the buffer for the old codepoint. */
if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) {
lxb_char_t buf[4];
lxb_char_t *buf_ptr = buf;
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
if (lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint) > sizeof(buf)) {
buf_ptr = zend_mempcpy(buf, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
}
decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
if (!dom_process_parse_chunk(
ctx,
document,
parser,
buf_ptr - buf,
buf,
buf_ptr - buf,
tokenizer_error_offset,
tree_error_offset
)) {
goto fail_oom;
}
}
const lxb_char_t *last_output = buf_ref;
while (buf_ref != buf_end) {
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *buf_ref < 0x80) {
/* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
* need more UTF-8 bytes to complete a sequence.
* It might be tempting to use SIMD here, but it turns out that this is less efficient because
* we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */
buf_ref++;
continue;
}
const lxb_char_t *buf_ref_backup = buf_ref;
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
size_t skip = buf_ref - buf_ref_backup; /* Skip invalid data, it's replaced by the UTF-8 replacement bytes */
if (!dom_process_parse_chunk(
ctx,
document,
parser,
buf_ref - last_output - skip,
last_output,
buf_ref - last_output,
tokenizer_error_offset,
tree_error_offset
)) {
goto fail_oom;
}
if (codepoint == LXB_ENCODING_DECODE_CONTINUE) {
ZEND_ASSERT(buf_ref == buf_end);
/* The decoder needs more data but the entire buffer is consumed.
* All valid data is outputted, and if the remaining data for the code point
* is invalid, the next call will output the replacement bytes. */
*buf_ref_ref = buf_ref;
decoding_encoding_ctx->decode.status = LXB_STATUS_CONTINUE;
return true;
}
if (!dom_process_parse_chunk(
ctx,
document,
parser,
LXB_ENCODING_REPLACEMENT_SIZE,
LXB_ENCODING_REPLACEMENT_BYTES,
0,
tokenizer_error_offset,
tree_error_offset
)) {
goto fail_oom;
}
last_output = buf_ref;
}
}
if (buf_ref != last_output
&& !dom_process_parse_chunk(
ctx,
document,
parser,
buf_ref - last_output,
last_output,
buf_ref - last_output,
tokenizer_error_offset,
tree_error_offset
)) {
goto fail_oom;
}
*buf_ref_ref = buf_ref;
return true;
fail_oom:
*buf_ref_ref = buf_ref;
return false;
}
static bool dom_decode_encode_slow_path(
lexbor_libxml2_bridge_parse_context *ctx,
lxb_html_document_t *document,
lxb_html_parser_t *parser,
const lxb_char_t **buf_ref_ref,
const lxb_char_t *buf_end,
dom_decoding_encoding_ctx *decoding_encoding_ctx,
size_t *tokenizer_error_offset,
size_t *tree_error_offset
)
{
const lxb_char_t *buf_ref = *buf_ref_ref;
lexbor_status_t decode_status, encode_status;
do {
decode_status = decoding_encoding_ctx->decode_data->decode(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) decoding_encoding_ctx->codepoints;
size_t decoding_buffer_used = lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode);
const lxb_codepoint_t *codepoints_end = decoding_encoding_ctx->codepoints + decoding_buffer_used;
do {
encode_status = decoding_encoding_ctx->encode_data->encode(&decoding_encoding_ctx->encode, &codepoints_ref, codepoints_end);
ZEND_ASSERT(encode_status != LXB_STATUS_ERROR && "parameters and replacements should be valid");
if (!dom_process_parse_chunk(
ctx,
document,
parser,
lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode),
decoding_encoding_ctx->encoding_output,
decoding_buffer_used,
tokenizer_error_offset,
tree_error_offset
)) {
goto fail_oom;
}
lxb_encoding_encode_buf_used_set(&decoding_encoding_ctx->encode, 0);
} while (encode_status == LXB_STATUS_SMALL_BUFFER);
lxb_encoding_decode_buf_used_set(&decoding_encoding_ctx->decode, 0);
} while (decode_status == LXB_STATUS_SMALL_BUFFER);
*buf_ref_ref = buf_ref;
return true;
fail_oom:
*buf_ref_ref = buf_ref;
return false;
}
static bool dom_parse_decode_encode_step(
lexbor_libxml2_bridge_parse_context *ctx,
lxb_html_document_t *document,
lxb_html_parser_t *parser,
const lxb_char_t **buf_ref_ref,
const lxb_char_t *buf_end,
dom_decoding_encoding_ctx *decoding_encoding_ctx,
size_t *tokenizer_error_offset,
size_t *tree_error_offset
)
{
if (decoding_encoding_ctx->fast_path) {
return dom_decode_encode_fast_path(
ctx,
document,
parser,
buf_ref_ref,
buf_end,
decoding_encoding_ctx,
tokenizer_error_offset,
tree_error_offset
);
} else {
return dom_decode_encode_slow_path(
ctx,
document,
parser,
buf_ref_ref,
buf_end,
decoding_encoding_ctx,
tokenizer_error_offset,
tree_error_offset
);
}
}
static bool dom_parse_decode_encode_finish(
lexbor_libxml2_bridge_parse_context *ctx,
lxb_html_document_t *document,
lxb_html_parser_t *parser,
dom_decoding_encoding_ctx *decoding_encoding_ctx,
size_t *tokenizer_error_offset,
size_t *tree_error_offset
)
{
lxb_status_t status;
status = lxb_encoding_decode_finish(&decoding_encoding_ctx->decode);
ZEND_ASSERT(status == LXB_STATUS_OK);
size_t decoding_buffer_size = lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode);
if (decoding_buffer_size > 0) {
const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) decoding_encoding_ctx->codepoints;
const lxb_codepoint_t *codepoints_end = codepoints_ref + decoding_buffer_size;
status = decoding_encoding_ctx->encode_data->encode(&decoding_encoding_ctx->encode, &codepoints_ref, codepoints_end);
ZEND_ASSERT(status == LXB_STATUS_OK);
/* No need to produce output here, as we finish the encoder below and pass the chunk. */
}
status = lxb_encoding_encode_finish(&decoding_encoding_ctx->encode);
ZEND_ASSERT(status == LXB_STATUS_OK);
if (lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode)
&& !dom_process_parse_chunk(
ctx,
document,
parser,
lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode),
decoding_encoding_ctx->encoding_output,
lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode),
tokenizer_error_offset,
tree_error_offset
)) {
return false;
}
return true;
}
static bool check_options_validity(uint32_t arg_num, zend_long options)
{
const zend_long VALID_OPTIONS = XML_PARSE_NOERROR | XML_PARSE_COMPACT | HTML_PARSE_NOIMPLIED | DOM_HTML_NO_DEFAULT_NS;
if ((options & ~VALID_OPTIONS) != 0) {
zend_argument_value_error(arg_num, "contains invalid flags (allowed flags: "
"LIBXML_NOERROR, "
"LIBXML_COMPACT, "
"LIBXML_HTML_NOIMPLIED, "
"Dom\\NO_DEFAULT_NS)");
return false;
}
return true;
}
PHP_METHOD(Dom_HTMLDocument, createEmpty)
{
const char *encoding = "UTF-8";
size_t encoding_len = strlen("UTF-8");
if (zend_parse_parameters(ZEND_NUM_ARGS(), "|p", &encoding, &encoding_len) == FAILURE) {
RETURN_THROWS();
}
const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) encoding, encoding_len);
if (encoding_data == NULL) {
zend_argument_value_error(1, "must be a valid document encoding");
RETURN_THROWS();
}
xmlDocPtr lxml_doc = php_dom_create_html_doc();
if (UNEXPECTED(lxml_doc == NULL)) {
goto oom;
}
lxml_doc->encoding = xmlStrdup((const xmlChar *) encoding);
dom_object *intern = php_dom_instantiate_object_helper(
return_value,
dom_html_document_class_entry,
(xmlNodePtr) lxml_doc,
NULL
);
dom_set_xml_class(intern->document);
intern->document->private_data = php_dom_libxml_private_data_header(php_dom_private_data_create());
return;
oom:
php_dom_throw_error(INVALID_STATE_ERR, true);
RETURN_THROWS();
}
/* Only bother to register error handling when the error reports can become observable. */
static bool dom_should_register_error_handlers(zend_long options)
{
if (options & XML_PARSE_NOERROR) {
return false;
}
return php_libxml_uses_internal_errors() || ((EG(error_reporting) | EG(user_error_handler_error_reporting)) & E_WARNING);
}
PHP_METHOD(Dom_HTMLDocument, createFromString)
{
const char *source, *override_encoding = NULL;
size_t source_len, override_encoding_len;
zend_long options = 0;
if (zend_parse_parameters(
ZEND_NUM_ARGS(),
"s|lp!",
&source,
&source_len,
&options,
&override_encoding,
&override_encoding_len
) == FAILURE) {
RETURN_THROWS();
}
if (!check_options_validity(2, options)) {
RETURN_THROWS();
}
dom_lexbor_libxml2_bridge_application_data application_data;
application_data.input_name = "Entity";
application_data.current_total_offset = 0;
application_data.html_no_implied = options & HTML_PARSE_NOIMPLIED;
dom_reset_line_column_cache(&application_data.cache_tokenizer);
lexbor_libxml2_bridge_parse_context ctx;
lexbor_libxml2_bridge_parse_context_init(&ctx);
if (dom_should_register_error_handlers(options)) {
lexbor_libxml2_bridge_parse_set_error_callbacks(
&ctx,
dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
dom_lexbor_libxml2_bridge_tree_error_reporter
);
}
ctx.application_data = &application_data;
size_t tokenizer_error_offset = 0;
size_t tree_error_offset = 0;
/* Setup everything encoding & decoding related */
const lxb_char_t *buf_ref = (const lxb_char_t *) source;
dom_decoding_encoding_ctx decoding_encoding_ctx;
dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
if (override_encoding != NULL) {
const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
(const lxb_char_t *) override_encoding,
override_encoding_len
);
if (!encoding_data) {
zend_argument_value_error(3, "must be a valid document encoding");
RETURN_THROWS();
}
dom_setup_parser_encoding_manually(buf_ref, encoding_data, &decoding_encoding_ctx, &application_data);
} else {
dom_setup_parser_encoding_implicitly(&buf_ref, &source_len, &decoding_encoding_ctx, &application_data);
}
lxb_html_document_t *document = lxb_html_document_create();
if (UNEXPECTED(document == NULL)) {
goto fail_oom;
}
lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document);
if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
goto fail_oom;
}
lxb_html_parser_t *parser = document->dom_document.parser;
while (source_len > 0) {
size_t chunk_size = source_len;
const size_t MAX_CHUNK_SIZE = sizeof(decoding_encoding_ctx.encoding_output) / sizeof(*decoding_encoding_ctx.encoding_output);
if (chunk_size > MAX_CHUNK_SIZE) {
chunk_size = MAX_CHUNK_SIZE;
}
source_len -= chunk_size;
const lxb_char_t *buf_end = buf_ref + chunk_size;
bool result = dom_parse_decode_encode_step(
&ctx,
document,
parser,
&buf_ref,
buf_end,
&decoding_encoding_ctx,
&tokenizer_error_offset,
&tree_error_offset
);
if (!result) {
goto fail_oom;
}
/* In the string case we have a single buffer that acts as a sliding window.
* The `current_input_characters` field starts pointing at the start of the buffer, but needs to slide along the
* sliding window as well. */
if (application_data.current_input_characters) {
application_data.current_input_characters += chunk_size;
}
}
if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
goto fail_oom;
}
lexbor_status = lxb_html_document_parse_chunk_end(document);
if (lexbor_status != LXB_STATUS_OK) {
goto fail_oom;
}
php_dom_private_data *private_data = php_dom_private_data_create();
xmlDocPtr lxml_doc;
lexbor_libxml2_bridge_status bridge_status = lexbor_libxml2_bridge_convert_document(
document,
&lxml_doc,
options & XML_PARSE_COMPACT,
!(options & DOM_HTML_NO_DEFAULT_NS),
private_data
);
lexbor_libxml2_bridge_copy_observations(parser->tree, &ctx.observations);
if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) {
php_dom_private_data_destroy(private_data);
php_libxml_ctx_error(
NULL,
"%s in %s",
dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status),
application_data.input_name
);
lxb_html_document_destroy(document);
RETURN_FALSE;
}
lxb_html_document_destroy(document);
dom_post_process_html5_loading(lxml_doc, options, &ctx.observations);
if (decoding_encoding_ctx.decode_data) {
lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name);
} else {
lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
}
dom_object *intern = php_dom_instantiate_object_helper(
return_value,
dom_html_document_class_entry,
(xmlNodePtr) lxml_doc,
NULL
);
dom_set_xml_class(intern->document);
intern->document->quirks_mode = ctx.observations.quirks_mode;
intern->document->private_data = php_dom_libxml_private_data_header(private_data);
return;
fail_oom:
lxb_html_document_destroy(document);
php_dom_throw_error(INVALID_STATE_ERR, true);
RETURN_THROWS();
}
PHP_METHOD(Dom_HTMLDocument, createFromFile)
{
const char *filename, *override_encoding = NULL;
php_dom_private_data *private_data = NULL;
size_t filename_len, override_encoding_len;
zend_long options = 0;
php_stream *stream = NULL;
if (zend_parse_parameters(
ZEND_NUM_ARGS(),
"p|lp!",
&filename,
&filename_len,
&options,
&override_encoding,
&override_encoding_len
) == FAILURE) {
RETURN_THROWS();
}
/* See php_libxml_streams_IO_open_wrapper(), apparently this caused issues in the past. */
if (strstr(filename, "%00")) {
zend_argument_value_error(1, "must not contain percent-encoded NUL bytes");
RETURN_THROWS();
}
if (!check_options_validity(2, options)) {
RETURN_THROWS();
}
dom_lexbor_libxml2_bridge_application_data application_data;
application_data.input_name = filename;
application_data.current_total_offset = 0;
application_data.html_no_implied = options & HTML_PARSE_NOIMPLIED;
dom_reset_line_column_cache(&application_data.cache_tokenizer);
lexbor_libxml2_bridge_parse_context ctx;
lexbor_libxml2_bridge_parse_context_init(&ctx);
if (dom_should_register_error_handlers(options)) {
lexbor_libxml2_bridge_parse_set_error_callbacks(
&ctx,
dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
dom_lexbor_libxml2_bridge_tree_error_reporter
);
}
ctx.application_data = &application_data;
char buf[4096];
/* Setup everything encoding & decoding related */
dom_decoding_encoding_ctx decoding_encoding_ctx;
dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
bool should_determine_encoding_implicitly = true; /* First read => determine encoding implicitly */
if (override_encoding != NULL) {
const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
(const lxb_char_t *) override_encoding,
override_encoding_len
);
if (!encoding_data) {
zend_argument_value_error(3, "must be a valid document encoding");
RETURN_THROWS();
}
should_determine_encoding_implicitly = false;
dom_setup_parser_encoding_manually((const lxb_char_t *) buf, encoding_data, &decoding_encoding_ctx, &application_data);
}
zend_string *opened_path = NULL;
stream = php_stream_open_wrapper_ex(filename, "rb", REPORT_ERRORS, &opened_path, php_libxml_get_stream_context());
if (!stream) {
if (!EG(exception)) {
zend_throw_exception_ex(NULL, 0, "Cannot open file '%s'", filename);
}
RETURN_THROWS();
}
/* MIME sniff */
if (should_determine_encoding_implicitly) {
zend_string *charset = php_libxml_sniff_charset_from_stream(stream);
if (charset != NULL) {
const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
(const lxb_char_t *) ZSTR_VAL(charset),
ZSTR_LEN(charset)
);
if (encoding_data != NULL) {
should_determine_encoding_implicitly = false;
dom_setup_parser_encoding_manually(
(const lxb_char_t *) buf,
encoding_data,
&decoding_encoding_ctx,
&application_data
);
}
zend_string_release_ex(charset, false);
}
}
lxb_html_document_t *document = lxb_html_document_create();
if (UNEXPECTED(document == NULL)) {
goto fail_oom;
}
lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document);
if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
goto fail_oom;
}
size_t tokenizer_error_offset = 0;
size_t tree_error_offset = 0;
ssize_t read;
lxb_html_parser_t *parser = document->dom_document.parser;
while ((read = php_stream_read(stream, buf, sizeof(buf))) > 0) {
const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
if (should_determine_encoding_implicitly) {
should_determine_encoding_implicitly = false;
dom_setup_parser_encoding_implicitly(&buf_ref, (size_t *) &read, &decoding_encoding_ctx, &application_data);
}
const lxb_char_t *buf_end = buf_ref + read;
bool result = dom_parse_decode_encode_step(
&ctx,
document,
parser,
&buf_ref,
buf_end,
&decoding_encoding_ctx,
&tokenizer_error_offset,
&tree_error_offset
);
if (!result) {
goto fail_oom;
}
}
if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
goto fail_oom;
}
lexbor_status = lxb_html_document_parse_chunk_end(document);
if (lexbor_status != LXB_STATUS_OK) {
goto fail_oom;
}
private_data = php_dom_private_data_create();
xmlDocPtr lxml_doc;
lexbor_libxml2_bridge_status bridge_status = lexbor_libxml2_bridge_convert_document(
document,
&lxml_doc,
options & XML_PARSE_COMPACT,
!(options & DOM_HTML_NO_DEFAULT_NS),
private_data
);
lexbor_libxml2_bridge_copy_observations(parser->tree, &ctx.observations);
if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) {
php_libxml_ctx_error(NULL, "%s in %s", dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status), filename);
RETVAL_FALSE;
goto fail_general;
}
lxb_html_document_destroy(document);
dom_post_process_html5_loading(lxml_doc, options, &ctx.observations);
if (decoding_encoding_ctx.decode_data) {
lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name);
} else {
lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
}
if (stream->wrapper == &php_plain_files_wrapper && opened_path != NULL) {
xmlChar *converted = xmlPathToURI((const xmlChar *) ZSTR_VAL(opened_path));
if (UNEXPECTED(!converted)) {
goto fail_oom;
}
/* Check for "file:/" instead of "file://" because of libxml2 quirk */
if (strncmp((const char *) converted, "file:/", sizeof("file:/") - 1) != 0) {
xmlChar *buffer = xmlStrdup((const xmlChar *) "file://");
if (UNEXPECTED(!buffer)) {
xmlFree(converted);
goto fail_oom;
}
xmlChar *new_buffer = xmlStrcat(buffer, converted);
if (UNEXPECTED(!new_buffer)) {
xmlFree(buffer);
xmlFree(converted);
goto fail_oom;
}
xmlFree(converted);
lxml_doc->URL = new_buffer;
} else {
#ifdef PHP_WIN32
converted = php_dom_libxml_fix_file_path(converted);
#endif
lxml_doc->URL = converted;
}
} else {
lxml_doc->URL = xmlStrdup((const xmlChar *) filename);
}
if (opened_path != NULL) {
zend_string_release_ex(opened_path, false);
}
php_stream_close(stream);
stream = NULL;
dom_object *intern = php_dom_instantiate_object_helper(
return_value,
dom_html_document_class_entry,
(xmlNodePtr) lxml_doc,
NULL
);
dom_set_xml_class(intern->document);
intern->document->quirks_mode = ctx.observations.quirks_mode;
intern->document->private_data = php_dom_libxml_private_data_header(private_data);
return;
fail_oom:
php_dom_throw_error(INVALID_STATE_ERR, true);
fail_general:
if (private_data != NULL) {
php_dom_private_data_destroy(private_data);
}
lxb_html_document_destroy(document);
php_stream_close(stream);
if (opened_path != NULL) {
zend_string_release_ex(opened_path, false);
}
}
static zend_result dom_write_output_smart_str(void *ctx, const char *buf, size_t size)
{
smart_str_appendl((smart_str *) ctx, buf, size);
return SUCCESS;
}
static zend_result dom_write_output_stream(void *application_data, const char *buf, size_t len)
{
php_stream *stream = (php_stream *) application_data;
if (UNEXPECTED(php_stream_write(stream, buf, len) < 0)) {
return FAILURE;
}
return SUCCESS;
}
static zend_result dom_saveHTML_write_string_len(void *application_data, const char *buf, size_t len)
{
dom_output_ctx *output = (dom_output_ctx *) application_data;
lxb_status_t decode_status, encode_status;
const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
const lxb_char_t *buf_end = buf_ref + len;
do {
decode_status = output->decoding_data->decode(output->decode, &buf_ref, buf_end);
const lxb_codepoint_t *codepoints_ref = output->codepoints;
const lxb_codepoint_t *codepoints_end = codepoints_ref + lxb_encoding_decode_buf_used(output->decode);
do {
encode_status = output->encoding_data->encode(output->encode, &codepoints_ref, codepoints_end);
if (UNEXPECTED(output->write_output(
output->output_data,
(const char *) output->encoding_output,
lxb_encoding_encode_buf_used(output->encode)
) != SUCCESS)) {
return FAILURE;
}
lxb_encoding_encode_buf_used_set(output->encode, 0);
} while (encode_status == LXB_STATUS_SMALL_BUFFER);
lxb_encoding_decode_buf_used_set(output->decode, 0);
} while (decode_status == LXB_STATUS_SMALL_BUFFER);
return SUCCESS;
}
static zend_result dom_saveHTML_write_string(void *application_data, const char *buf)
{
return dom_saveHTML_write_string_len(application_data, buf, strlen(buf));
}
static zend_result dom_common_save(dom_output_ctx *output_ctx, dom_object *intern, const xmlDoc *docp, const xmlNode *node)
{
/* Initialize everything related to encoding & decoding */
const lxb_encoding_data_t *decoding_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
(const lxb_char_t *) docp->encoding,
strlen((const char *) docp->encoding)
);
lxb_encoding_encode_t encode;
lxb_encoding_decode_t decode;
lxb_char_t encoding_output[4096];
lxb_codepoint_t codepoints[4096];
(void) lxb_encoding_encode_init(&encode, encoding_data, encoding_output, sizeof(encoding_output) / sizeof(*encoding_output));
(void) lxb_encoding_decode_init(&decode, decoding_data, codepoints, sizeof(codepoints) / sizeof(*codepoints));
if (encoding_data->encoding == LXB_ENCODING_UTF_8) {
lxb_encoding_encode_replace_set(&encode, LXB_ENCODING_REPLACEMENT_BYTES, LXB_ENCODING_REPLACEMENT_SIZE);
} else {
/* Fallback if there is no replacement by default */
lxb_encoding_encode_replace_set(&encode, (const lxb_char_t *) "?", 1);
}
lxb_encoding_decode_replace_set(&decode, LXB_ENCODING_REPLACEMENT_BUFFER, LXB_ENCODING_REPLACEMENT_BUFFER_LEN);
output_ctx->encoding_data = encoding_data;
output_ctx->decoding_data = decoding_data;
output_ctx->encode = &encode;
output_ctx->decode = &decode;
output_ctx->codepoints = codepoints;
output_ctx->encoding_output = encoding_output;
dom_html5_serialize_context ctx;
ctx.write_string_len = dom_saveHTML_write_string_len;
ctx.write_string = dom_saveHTML_write_string;
ctx.application_data = output_ctx;
ctx.private_data = php_dom_get_private_data(intern);
if (UNEXPECTED(dom_html5_serialize_outer(&ctx, node) != SUCCESS)) {
return FAILURE;
}
(void) lxb_encoding_decode_finish(&decode);
if (lxb_encoding_decode_buf_used(&decode)) {
const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) codepoints;
(void) encoding_data->encode(&encode, &codepoints_ref, codepoints_ref + lxb_encoding_decode_buf_used(&decode));
if (UNEXPECTED(output_ctx->write_output(
output_ctx->output_data,
(const char *) encoding_output,
lxb_encoding_encode_buf_used(&encode)) != SUCCESS
)) {
return FAILURE;
}
}
(void) lxb_encoding_encode_finish(&encode);
if (lxb_encoding_encode_buf_used(&encode)) {
if (UNEXPECTED(output_ctx->write_output(
output_ctx->output_data,
(const char *) encoding_output,
lxb_encoding_encode_buf_used(&encode)) != SUCCESS
)) {
return FAILURE;
}
}
return SUCCESS;
}
PHP_METHOD(Dom_HTMLDocument, saveHtmlFile)
{
zval *id;
xmlDoc *docp;
size_t file_len;
dom_object *intern;
char *file;
id = ZEND_THIS;
if (zend_parse_parameters(ZEND_NUM_ARGS(), "p", &file, &file_len) == FAILURE) {
RETURN_THROWS();
}
if (file_len == 0) {
zend_argument_must_not_be_empty_error(1);
RETURN_THROWS();
}
php_stream *stream = php_stream_open_wrapper_ex(file, "wb", REPORT_ERRORS, /* opened_path */ NULL, php_libxml_get_stream_context());
if (!stream) {
RETURN_FALSE;
}
DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
dom_output_ctx output_ctx;
output_ctx.output_data = stream;
output_ctx.write_output = dom_write_output_stream;
if (UNEXPECTED(dom_common_save(&output_ctx, intern, docp, (const xmlNode *) docp) != SUCCESS)) {
php_stream_close(stream);
RETURN_FALSE;
}
zend_long bytes = php_stream_tell(stream);
php_stream_close(stream);
RETURN_LONG(bytes);
}
PHP_METHOD(Dom_HTMLDocument, saveHtml)
{
zval *nodep = NULL;
const xmlDoc *docp;
const xmlNode *node;
dom_object *intern, *nodeobj;
if (zend_parse_parameters(ZEND_NUM_ARGS(), "|O!", &nodep, dom_modern_node_class_entry) == FAILURE) {
RETURN_THROWS();
}
DOM_GET_OBJ(docp, ZEND_THIS, xmlDocPtr, intern);
if (nodep != NULL) {
DOM_GET_OBJ(node, nodep, xmlNodePtr, nodeobj);
if (node->doc != docp) {
php_dom_throw_error(WRONG_DOCUMENT_ERR, true);
RETURN_THROWS();
}
} else {
node = (const xmlNode *) docp;
}
smart_str buf = {0};
dom_output_ctx output_ctx;
output_ctx.output_data = &buf;
output_ctx.write_output = dom_write_output_smart_str;
/* Can't fail because dom_write_output_smart_str() can't fail. */
zend_result result = dom_common_save(&output_ctx, intern, docp, node);
ZEND_ASSERT(result == SUCCESS);
RETURN_STR(smart_str_extract(&buf));
}
zend_result dom_html_document_encoding_write(dom_object *obj, zval *newval)
{
DOM_PROP_NODE(xmlDocPtr, docp, obj);
/* Typed property, can only be IS_STRING. */
ZEND_ASSERT(Z_TYPE_P(newval) == IS_STRING);
zend_string *str = Z_STR_P(newval);
const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) ZSTR_VAL(str), ZSTR_LEN(str));
if (encoding_data != NULL) {
xmlFree(BAD_CAST docp->encoding);
docp->encoding = xmlStrdup((const xmlChar *) encoding_data->name);
} else {
zend_value_error("Invalid document encoding");
return FAILURE;
}
return SUCCESS;
}
static xmlNodePtr dom_html_document_element_read_raw(const xmlDoc *docp, bool (*accept)(const xmlChar *))
{
const xmlNode *root = xmlDocGetRootElement(docp);
if (root == NULL || !(php_dom_ns_is_fast(root, php_dom_ns_is_html_magic_token) && xmlStrEqual(root->name, BAD_CAST "html"))) {
return NULL;
}
xmlNodePtr cur = root->children;
while (cur != NULL) {
if (cur->type == XML_ELEMENT_NODE && php_dom_ns_is_fast(cur, php_dom_ns_is_html_magic_token) && accept(cur->name)) {
return cur;
}
cur = cur->next;
}
return NULL;
}
zend_result dom_html_document_element_read_helper(dom_object *obj, zval *retval, bool (*accept)(const xmlChar *))
{
DOM_PROP_NODE(const xmlDoc *, docp, obj);
const xmlNode *element = dom_html_document_element_read_raw(docp, accept);
php_dom_create_nullable_object((xmlNodePtr) element, retval, obj);
return SUCCESS;
}
static bool dom_accept_body_name(const xmlChar *name)
{
return xmlStrEqual(name, BAD_CAST "body") || xmlStrEqual(name, BAD_CAST "frameset");
}
static bool dom_accept_head_name(const xmlChar *name)
{
return xmlStrEqual(name, BAD_CAST "head");
}
/* https://html.spec.whatwg.org/#dom-document-body */
zend_result dom_html_document_body_read(dom_object *obj, zval *retval)
{
return dom_html_document_element_read_helper(obj, retval, dom_accept_body_name);
}
/* https://html.spec.whatwg.org/#dom-document-head */
zend_result dom_html_document_head_read(dom_object *obj, zval *retval)
{
return dom_html_document_element_read_helper(obj, retval, dom_accept_head_name);
}
/* https://html.spec.whatwg.org/#dom-document-body */
zend_result dom_html_document_body_write(dom_object *obj, zval *newval)
{
DOM_PROP_NODE(xmlDocPtr, docp, obj);
/* 1. If the new value is not a body or frameset element, then throw a "HierarchyRequestError" DOMException. */
if (Z_TYPE_P(newval) != IS_NULL) {
dom_object *newval_intern = Z_DOMOBJ_P(newval);
if (newval_intern->ptr != NULL) {
xmlNodePtr newval_node = ((php_libxml_node_ptr *) newval_intern->ptr)->node;
/* Note: because this property has type HTMLElement, we know the namespace is correct. */
if (dom_accept_body_name(newval_node->name)) {
/* 2. If the new value is the same as the body element, return. */
const xmlNode *current_body_element = dom_html_document_element_read_raw(docp, dom_accept_body_name);
if (current_body_element == newval_node) {
return SUCCESS;
}
/* 3. If the body element is not null, then replace the body element with the new value within the body element's parent and return. */
if (current_body_element != NULL) {
php_dom_adopt_node(newval_node, obj, docp);
xmlNodePtr old = xmlReplaceNode((xmlNodePtr) current_body_element, newval_node);
if (old != NULL && old->_private == NULL) {
php_libxml_node_free_resource(old);
}
return SUCCESS;
}
/* 4. If there is no document element, throw a "HierarchyRequestError" DOMException. */
xmlNodePtr root = xmlDocGetRootElement(docp);
if (root == NULL) {
php_dom_throw_error_with_message(HIERARCHY_REQUEST_ERR, "A body can only be set if there is a document element", true);
return FAILURE;
}
/* 5. Append the new value to the document element. */
php_dom_adopt_node(newval_node, obj, docp);
xmlAddChild(root, newval_node);
return SUCCESS;
}
}
}
php_dom_throw_error_with_message(HIERARCHY_REQUEST_ERR, "The new body must either be a body or a frameset tag", true);
return FAILURE;
}
/* https://dom.spec.whatwg.org/#concept-child-text-content */
static zend_string *dom_get_child_text_content(const xmlNode *node)
{
smart_str content = {0};
const xmlNode *text = node->children;
while (text != NULL) {
if ((text->type == XML_TEXT_NODE || text->type == XML_CDATA_SECTION_NODE) && text->content != NULL) {
smart_str_appends(&content, (const char *) text->content);
}
text = text->next;
}
return smart_str_extract(&content);
}
/* https://html.spec.whatwg.org/#the-title-element-2 */
static xmlNodePtr dom_get_title_element(const xmlDoc *doc)
{
xmlNodePtr node = doc->children;
while (node != NULL) {
if (node->type == XML_ELEMENT_NODE) {
if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token) && xmlStrEqual(node->name, BAD_CAST "title")) {
break;
}
}
node = php_dom_next_in_tree_order(node, NULL);
}
return node;
}
/* The subtle difference is that this is about the direct title descendant of the svg element,
* whereas the html variant of this function is about the first in-tree title element. */
static xmlNodePtr dom_get_svg_title_element(xmlNodePtr svg)
{
xmlNodePtr cur = svg->children;
while (cur != NULL) {
if (cur->type == XML_ELEMENT_NODE
&& php_dom_ns_is_fast(cur, php_dom_ns_is_svg_magic_token) && xmlStrEqual(cur->name, BAD_CAST "title")) {
break;
}
cur = cur->next;
}
return cur;
}
/* https://html.spec.whatwg.org/#document.title */
zend_result dom_html_document_title_read(dom_object *obj, zval *retval)
{
DOM_PROP_NODE(const xmlDoc *, docp, obj);
xmlNodePtr root = xmlDocGetRootElement(docp);
if (root == NULL) {
ZVAL_EMPTY_STRING(retval);
return SUCCESS;
}
zend_string *value = zend_empty_string;
/* 1. If the document element is an SVG svg element,
* then let value be the child text content of the first SVG title element that is a child of the document element. */
if (php_dom_ns_is_fast(root, php_dom_ns_is_svg_magic_token) && xmlStrEqual(root->name, BAD_CAST "svg")) {
const xmlNode *title = dom_get_svg_title_element(root);
if (title != NULL) {
value = dom_get_child_text_content(title);
}
} else {
/* 2. Otherwise, let value be the child text content of the title element,
* or the empty string if the title element is null. */
const xmlNode *title = dom_get_title_element(docp);
if (title != NULL) {
value = dom_get_child_text_content(title);
}
}
/* 3. Strip and collapse ASCII whitespace in value. */
value = dom_strip_and_collapse_ascii_whitespace(value);
/* 4. Return value. */
ZVAL_STR(retval, value);
return SUCCESS;
}
static void dom_string_replace_all(xmlDocPtr docp, xmlNodePtr element, zval *zv)
{
dom_remove_all_children(element);
xmlNode *text = xmlNewDocText(docp, BAD_CAST Z_STRVAL_P(zv));
xmlAddChild(element, text);
}
/* https://html.spec.whatwg.org/#document.title */
zend_result dom_html_document_title_write(dom_object *obj, zval *newval)
{
DOM_PROP_NODE(xmlDocPtr, docp, obj);
xmlNodePtr root = xmlDocGetRootElement(docp);
if (root == NULL) {
return SUCCESS;
}
/* If the document element is an SVG svg element */
if (php_dom_ns_is_fast(root, php_dom_ns_is_svg_magic_token) && xmlStrEqual(root->name, BAD_CAST "svg")) {
/* 1. If there is an SVG title element that is a child of the document element, let element be the first such element. */
xmlNodePtr element = dom_get_svg_title_element(root);
/* 2. Otherwise: */
if (element == NULL) {
/* 2.1. Let element be the result of creating an element given the document element's node document,
* title, and the SVG namespace. */
/* Annoyingly, we must create it in the svg namespace _without_ prefix... */
xmlNsPtr ns = root->ns;
if (ns->prefix != NULL) {
/* Slow path... */
php_dom_libxml_ns_mapper *ns_mapper = php_dom_get_ns_mapper(obj);
zend_string *href = ZSTR_INIT_LITERAL(DOM_SVG_NS_URI, false);
ns = php_dom_libxml_ns_mapper_get_ns(ns_mapper, zend_empty_string, href);
zend_string_release_ex(href, false);
}
element = xmlNewDocNode(docp, ns, BAD_CAST "title", NULL);
if (UNEXPECTED(element == NULL)) {
php_dom_throw_error(INVALID_STATE_ERR, true);
return FAILURE;
}
/* 2.2. Insert element as the first child of the document element. */
if (root->children == NULL) {
root->last = element;
} else {
element->next = root->children;
root->children->prev = element;
}
root->children = element;
element->parent = root;
}
/* 3. String replace all with the given value within element. */
dom_string_replace_all(docp, element, newval);
}
/* If the document element is in the HTML namespace */
else if (php_dom_ns_is_fast(root, php_dom_ns_is_html_magic_token)) {
/* 1. If the title element is null and the head element is null, then return. */
xmlNodePtr title = dom_get_title_element(docp);
xmlNodePtr head = dom_html_document_element_read_raw(docp, dom_accept_head_name);
if (title == NULL && head == NULL) {
return SUCCESS;
}
/* 2. If the title element is non-null, let element be the title element. */
xmlNodePtr element = title;
/* 3. Otherwise: */
if (element == NULL) {
/* 3.1. Let element be the result of creating an element given the document element's node document, title,
* and the HTML namespace. */
php_dom_libxml_ns_mapper *ns_mapper = php_dom_get_ns_mapper(obj);
element = xmlNewDocNode(docp, php_dom_libxml_ns_mapper_ensure_html_ns(ns_mapper), BAD_CAST "title", NULL);
if (UNEXPECTED(element == NULL)) {
php_dom_throw_error(INVALID_STATE_ERR, true);
return FAILURE;
}
/* 3.2. Append element to the head element. */
xmlAddChild(head, element);
}
/* 4. String replace all with the given value within element. */
dom_string_replace_all(docp, element, newval);
}
return SUCCESS;
}
#if ZEND_DEBUG
PHP_METHOD(Dom_HTMLDocument, debugGetTemplateCount)
{
xmlDocPtr doc;
dom_object *intern;
ZEND_PARSE_PARAMETERS_NONE();
DOM_GET_OBJ(doc, ZEND_THIS, xmlDocPtr, intern);
ZEND_IGNORE_VALUE(doc);
RETURN_LONG((zend_long) php_dom_get_template_count((const php_dom_private_data *) intern->document->private_data));
}
#endif
#endif /* HAVE_LIBXML && HAVE_DOM */