PCRE: Check whether start offset is on char boundary

It is not enough for the whole string to be valid UTF-8; the start
offset must also lie on a character boundary. Check this by making
sure the byte at the start offset is not a UTF-8 continuation byte.
This commit is contained in:
Nikita Popov 2020-02-07 16:39:06 +01:00
parent 0d49cf4ed2
commit c9e78e6d33
2 changed files with 39 additions and 1 deletions

View file

@ -1130,6 +1130,22 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ *
}
/* }}} */
/* Decide whether the UTF-8 validity of a match starting at start_offset
 * is already established for subject_str.
 *
 * Returns 1 only when the string carries the IS_STR_VALID_UTF8 flag AND
 * start_offset is either the end of the string or a byte that is not a
 * UTF-8 continuation byte (10xxxxxx). Returns 0 otherwise. */
static zend_always_inline zend_bool is_known_valid_utf8(
		zend_string *subject_str, PCRE2_SIZE start_offset) {
	if (GC_FLAGS(subject_str) & IS_STR_VALID_UTF8) {
		if (start_offset == ZSTR_LEN(subject_str)) {
			/* Starting exactly at the end of the string: there is no byte
			 * to inspect, and the end is always a character boundary. */
			return 1;
		}

		/* The top two bits of a continuation byte are 10. Any other bit
		 * pattern means the offset sits on the first byte of a character. */
		return (ZSTR_VAL(subject_str)[start_offset] & 0xc0) != 0x80;
	}

	/* The flag is not set, so the validity of the string is unknown. */
	return 0;
}
/* {{{ php_pcre_match_impl() */
PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset)
@ -1247,7 +1263,7 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
}
}
options = (pce->compile_options & PCRE2_UTF) && !(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)
options = (pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, start_offset2)
? 0 : PCRE2_NO_UTF_CHECK;
/* Execute the regular expression. */

View file

@ -0,0 +1,22 @@
--TEST--
Bug #79241: Segmentation fault on preg_match()
--FILE--
<?php
// The subject is produced by json_decode() at runtime; if the same string
// is written directly as a literal, the issue does not reproduce.
// The JSON escape "\u2019" decodes to a single three-byte UTF-8 character
// (bytes E2 80 99), and keeps this file ASCII-only.
$text = json_decode('"\u2019"');
$pattern = '/\b/u';
// It has to be exactly two calls to preg_match(): the first at offset 0,
// the second at offset 1, which points at a continuation byte in the
// middle of the multi-byte character instead of at a character boundary.
var_dump(preg_match($pattern, $text, $matches, 0, 0));
var_dump(preg_match($pattern, $text, $matches, 0, 1));
var_dump(preg_last_error() == PREG_BAD_UTF8_OFFSET_ERROR);
?>
--EXPECT--
int(0)
bool(false)
bool(true)