PCRE: Only remember valid UTF-8 if start offset zero

PCRE only validates the string starting from the start offset
(minus maximum look-behind, but let's ignore that), so we can
only remember that the string is fully valid UTF-8 if the original
start offset is zero.
This commit is contained in:
Nikita Popov 2020-02-07 17:01:39 +01:00
parent c9e78e6d33
commit cd5591a28d
3 changed files with 19 additions and 4 deletions

View file

@ -1167,7 +1167,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
PCRE2_SPTR mark = NULL; /* Target for MARK name */
zval marks; /* Array of marks for PREG_PATTERN_ORDER */
pcre2_match_data *match_data;
PCRE2_SIZE start_offset2;
PCRE2_SIZE start_offset2, orig_start_offset;
char *subject = ZSTR_VAL(subject_str);
size_t subject_len = ZSTR_LEN(subject_str);
@ -1263,8 +1263,10 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
}
}
options = (pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, start_offset2)
? 0 : PCRE2_NO_UTF_CHECK;
orig_start_offset = start_offset2;
options =
(pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, orig_start_offset)
? 0 : PCRE2_NO_UTF_CHECK;
/* Execute the regular expression. */
#ifdef HAVE_PCRE_JIT_SUPPORT
@ -1454,7 +1456,8 @@ error:
if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
/* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */
if ((pce->compile_options & PCRE2_UTF) && !ZSTR_IS_INTERNED(subject_str)) {
if ((pce->compile_options & PCRE2_UTF)
&& !ZSTR_IS_INTERNED(subject_str) && orig_start_offset == 0) {
GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8);
}