Use SWAR to seek for non-ASCII UTF-8 in DOM parsing (#16350)

GitHub FYP test case:
```
Benchmark 1: ./sapi/cli/php test.php
  Time (mean ± σ):     502.8 ms ±   6.2 ms    [User: 498.3 ms, System: 3.2 ms]
  Range (min … max):   495.2 ms … 509.8 ms    10 runs

Benchmark 2: ./sapi/cli/php_old test.php
  Time (mean ± σ):     518.4 ms ±   4.3 ms    [User: 513.9 ms, System: 3.2 ms]
  Range (min … max):   511.5 ms … 525.5 ms    10 runs

Summary
  ./sapi/cli/php test.php ran
    1.03 ± 0.02 times faster than ./sapi/cli/php_old test.php
```

Wikipedia English homepage test case:
```
Benchmark 1: ./sapi/cli/php test.php
  Time (mean ± σ):     301.1 ms ±   4.2 ms    [User: 295.5 ms, System: 4.8 ms]
  Range (min … max):   296.3 ms … 308.8 ms    10 runs

Benchmark 2: ./sapi/cli/php_old test.php
  Time (mean ± σ):     308.2 ms ±   1.7 ms    [User: 304.6 ms, System: 2.9 ms]
  Range (min … max):   306.9 ms … 312.8 ms    10 runs

Summary
  ./sapi/cli/php test.php ran
    1.02 ± 0.02 times faster than ./sapi/cli/php_old test.php
```
This commit is contained in:
Niels Dossche 2024-10-12 13:29:33 +02:00 committed by GitHub
parent 497dbaa2df
commit baa76be615
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -30,6 +30,7 @@
#include <Zend/zend_smart_string.h> #include <Zend/zend_smart_string.h>
#include <lexbor/html/encoding.h> #include <lexbor/html/encoding.h>
#include <lexbor/encoding/encoding.h> #include <lexbor/encoding/encoding.h>
#include <lexbor/core/swar.h>
/* Implementation defined, but as HTML5 defaults in all other cases to UTF-8, we'll do the same. */ /* Implementation defined, but as HTML5 defaults in all other cases to UTF-8, we'll do the same. */
#define DOM_FALLBACK_ENCODING_ID LXB_ENCODING_UTF_8 #define DOM_FALLBACK_ENCODING_ID LXB_ENCODING_UTF_8
@ -517,6 +518,30 @@ static bool dom_process_parse_chunk(
return true; return true;
} }
/* Advances *data over the leading ASCII bytes of a UTF-8 input, using SWAR techniques to
 * test sizeof(size_t) bytes per iteration.
 *
 * Returns true if the entire input was consumed without encountering non-ASCII; *data then
 * equals end. Returns false otherwise. Note that on a false return *data is not necessarily
 * positioned exactly on the offending byte: the word-sized loop stops at the start of the
 * word that contains it, so *data may point at up to sizeof(size_t) - 1 ASCII bytes that
 * precede the first non-ASCII byte. Callers must therefore still be able to handle ASCII
 * at the returned position. */
static zend_always_inline bool dom_seek_utf8_non_ascii(const lxb_char_t **data, const lxb_char_t *end)
{
	/* Compare the remaining length instead of computing `*data + sizeof(size_t)`:
	 * forming a pointer more than one element past the end of the buffer is undefined
	 * behaviour, which the additive form did whenever fewer than sizeof(size_t) bytes
	 * were left. The signed comparison keeps the loop skipped if *data is past end. */
	while (end - *data >= (ptrdiff_t) sizeof(size_t)) {
		size_t bytes;
		/* Load through memcpy so the read is valid regardless of the alignment of *data. */
		memcpy(&bytes, *data, sizeof(bytes));
		/* If the top bit is set, it's not ASCII.
		 * NOTE(review): presumably LEXBOR_SWAR_REPEAT(0x80) replicates 0x80 into every
		 * byte of a size_t, so this tests all loaded bytes at once -- confirm in swar.h. */
		if ((bytes & LEXBOR_SWAR_REPEAT(0x80)) != 0) {
			return false;
		}
		*data += sizeof(size_t);
	}
	/* Tail: fewer than sizeof(size_t) bytes remain, so check them one at a time. */
	while (*data < end) {
		if (**data & 0x80) {
			return false;
		}
		(*data)++;
	}
	return true;
}
static bool dom_decode_encode_fast_path( static bool dom_decode_encode_fast_path(
lexbor_libxml2_bridge_parse_context *ctx, lexbor_libxml2_bridge_parse_context *ctx,
lxb_html_document_t *document, lxb_html_document_t *document,
@ -534,13 +559,13 @@ static bool dom_decode_encode_fast_path(
const lxb_char_t *last_output = buf_ref; const lxb_char_t *last_output = buf_ref;
while (buf_ref != buf_end) { while (buf_ref != buf_end) {
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */ /* Fast path converts non-validated UTF-8 -> validated UTF-8 */
if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *buf_ref < 0x80) { if (decoding_encoding_ctx->decode.u.utf_8.need == 0) {
/* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we /* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
* need more UTF-8 bytes to complete a sequence. * need more UTF-8 bytes to complete a sequence. */
* It might be tempting to use SIMD here, but it turns out that this is less efficient because if (dom_seek_utf8_non_ascii(&buf_ref, buf_end)) {
* we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */ ZEND_ASSERT(buf_ref == buf_end);
buf_ref++; break;
continue; }
} }
const lxb_char_t *buf_ref_backup = buf_ref; const lxb_char_t *buf_ref_backup = buf_ref;
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end); lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);