Use fast text conversion filters for mb_strpos, mb_stripos, mb_substr, etc

This boosts the performance of mb_strpos, mb_stripos, mb_strrpos, mb_strripos, mb_strstr, mb_stristr, mb_strrchr, and mb_strrichr when used on non-UTF-8 strings. mb_substr is also faster. With UTF-8 input, there is no appreciable difference in performance for mb_strpos, mb_stripos, mb_strrpos, etc. This is expected, since the only real difference here (aside from shorter and simpler code) is that the new text conversion code is used when converting non-UTF-8 input strings to UTF-8. (This is done because internally, mb_strpos, etc. work only on UTF-8 text.) For ASCII, speed is boosted by 30-65%. For other legacy text encodings, the degree of performance improvement will depend on how slow the legacy conversion code was. One other minor, but notable difference is that strings encoded using UTF-8 variants from Japanese mobile vendors (SoftBank, KDDI, Docomo) will not undergo encoding conversion but will be processed "as is". It is expected that this will result in a large performance boost for such input strings; but realistically, the number of users who work with such strings is probably minute. I was not originally planning to include mb_substr in this commit, but fuzzing of the reimplemented mb_strstr revealed that mb_substr needed to be reimplemented, too; using the old mbfl_substr, which was based on the old text conversion filters, in combination with functions which use the new text conversion filters caused bugs. The performance boost for mb_substr varies from 10%-500%, depending on the encoding and input string used.
2025-08-16 05:58:45 +02:00 · 2022-10-04 17:34:33 +09:00 · 2022-10-04 17:34:33 +09:00 · 0c0774f5b4
commit 0c0774f5b4
parent b96b88b669
8 changed files with 265 additions and 555 deletions
--- a/ext/mbstring/libmbfl/mbfl/mbfilter.c
+++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c
@ -429,42 +429,6 @@ const mbfl_encoding *mbfl_identify_encoding(mbfl_string *string, const mbfl_enco
 	return enc;
 }

-/*
- *  strlen
- */
-size_t mbfl_strlen(const mbfl_string *string)
-{
-	size_t len = 0;
-	const mbfl_encoding *encoding = string->encoding;
-
-	if (encoding->flag & MBFL_ENCTYPE_SBCS) {
-		len = string->len;
-	} else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
-		len = string->len/2;
-	} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
-		len = string->len/4;
-	} else if (encoding->mblen_table) {
-		const unsigned char *mbtab = encoding->mblen_table;
-		unsigned char *p = string->val, *e = p + string->len;
-		while (p < e) {
-			p += mbtab[*p];
-			len++;
-		}
-	} else {
-		uint32_t wchar_buf[128];
-		unsigned char *in = string->val;
-		size_t in_len = string->len;
-		unsigned int state = 0;
-
-		while (in_len) {
-			len += encoding->to_wchar(&in, &in_len, wchar_buf, 128, &state);
-		}
-	}
-
-	return len;
-}
-
-
 /*
 *  strpos
 */
@ -528,136 +492,6 @@ retry:
 	return 0;
 }

-static const unsigned char *mbfl_find_offset_utf8(
-		const unsigned char *str, const unsigned char *end, ssize_t offset) {
-	if (offset < 0) {
-		const unsigned char *pos = end;
-		while (offset < 0) {
-			if (pos <= str) {
-				return NULL;
-			}
-
-			unsigned char c = *(--pos);
-			if (c < 0x80) {
-				++offset;
-			} else if ((c & 0xc0) != 0x80) {
-				++offset;
-			}
-		}
-		return pos;
-	} else {
-		const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
-		const unsigned char *pos = str;
-		while (offset-- > 0) {
-			if (pos >= end) {
-				return NULL;
-			}
-			pos += u8_tbl[*pos];
-		}
-		return pos;
-	}
-}
-
-static size_t mbfl_pointer_to_offset_utf8(const unsigned char *start, const unsigned char *pos) {
-	size_t result = 0;
-	while (pos > start) {
-		unsigned char c = *--pos;
-		if (c < 0x80) {
-			++result;
-		} else if ((c & 0xc0) != 0x80) {
-			++result;
-		}
-	}
-	return result;
-}
-
-size_t
-mbfl_strpos(
-    mbfl_string *haystack,
-    mbfl_string *needle,
-    ssize_t offset,
-    int reverse)
-{
-	size_t result;
-	mbfl_string _haystack_u8, _needle_u8;
-	const mbfl_string *haystack_u8, *needle_u8 = NULL;
-	const unsigned char *offset_pointer;
-
-	if (haystack->encoding->no_encoding != mbfl_no_encoding_utf8) {
-		mbfl_string_init(&_haystack_u8);
-		haystack_u8 = mbfl_convert_encoding(haystack, &_haystack_u8, &mbfl_encoding_utf8);
-		if (haystack_u8 == NULL) {
-			result = MBFL_ERROR_ENCODING;
-			goto out;
-		}
-	} else {
-		haystack_u8 = haystack;
-	}
-
-	if (needle->encoding->no_encoding != mbfl_no_encoding_utf8) {
-		mbfl_string_init(&_needle_u8);
-		needle_u8 = mbfl_convert_encoding(needle, &_needle_u8, &mbfl_encoding_utf8);
-		if (needle_u8 == NULL) {
-			result = MBFL_ERROR_ENCODING;
-			goto out;
-		}
-	} else {
-		needle_u8 = needle;
-	}
-
-	offset_pointer = mbfl_find_offset_utf8(
-		haystack_u8->val, haystack_u8->val + haystack_u8->len, offset);
-	if (!offset_pointer) {
-		result = MBFL_ERROR_OFFSET;
-		goto out;
-	}
-
-	result = MBFL_ERROR_NOT_FOUND;
-	if (haystack_u8->len < needle_u8->len) {
-		goto out;
-	}
-
-	const char *found_pos;
-	if (!reverse) {
-		found_pos = zend_memnstr(
-			(const char *) offset_pointer,
-			(const char *) needle_u8->val, needle_u8->len,
-			(const char *) haystack_u8->val + haystack_u8->len);
-	} else {
-		if (offset >= 0) {
-			found_pos = zend_memnrstr(
-				(const char *) offset_pointer,
-				(const char *) needle_u8->val, needle_u8->len,
-				(const char *) haystack_u8->val + haystack_u8->len);
-		} else {
-			size_t needle_len = mbfl_strlen(needle_u8);
-			offset_pointer = mbfl_find_offset_utf8(
-				offset_pointer, haystack_u8->val + haystack_u8->len, needle_len);
-			if (!offset_pointer) {
-				offset_pointer = haystack_u8->val + haystack_u8->len;
-			}
-
-			found_pos = zend_memnrstr(
-				(const char *) haystack_u8->val,
-				(const char *) needle_u8->val, needle_u8->len,
-				(const char *) offset_pointer);
-		}
-	}
-
-	if (found_pos) {
-		result = mbfl_pointer_to_offset_utf8(haystack_u8->val, (const unsigned char *) found_pos);
-	}
-
-out:
-	if (haystack_u8 == &_haystack_u8) {
-		mbfl_string_clear(&_haystack_u8);
-	}
-	if (needle_u8 == &_needle_u8) {
-		mbfl_string_clear(&_needle_u8);
-	}
-	return result;
-}
-
 /*
 *  substr_count
 */
@ -727,176 +561,6 @@ mbfl_substr_count(
 	return result;
 }

-/*
- *  substr
- */
-struct collector_substr_data {
-	mbfl_convert_filter *next_filter;
-	size_t start;
-	size_t stop;
-	size_t output;
-};
-
-static int
-collector_substr(int c, void* data)
-{
-	struct collector_substr_data *pc = (struct collector_substr_data*)data;
-
-	if (pc->output >= pc->stop) {
-		return -1;
-	}
-
-	if (pc->output >= pc->start) {
-		(*pc->next_filter->filter_function)(c, pc->next_filter);
-	}
-
-	pc->output++;
-
-	return 0;
-}
-
-mbfl_string *
-mbfl_substr(
-    mbfl_string *string,
-    mbfl_string *result,
-    size_t from,
-    size_t length)
-{
-	const mbfl_encoding *encoding = string->encoding;
-	size_t n, k, len, start, end;
-	unsigned m;
-	unsigned char *p, *w;
-
-	mbfl_string_init(result);
-	result->encoding = string->encoding;
-
-	if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) ||
-	   encoding->mblen_table != NULL) {
-		len = string->len;
-		if (encoding->flag & MBFL_ENCTYPE_SBCS) {
-			start = from;
-		} else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
-			start = from*2;
-		} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
-			start = from*4;
-		} else {
-			const unsigned char *mbtab = encoding->mblen_table;
-			start = 0;
-			n = 0;
-			k = 0;
-			p = string->val;
-			/* search start position */
-			while (k <= from) {
-				start = n;
-				if (n >= len) {
-					break;
-				}
-				m = mbtab[*p];
-				n += m;
-				p += m;
-				k++;
-			}
-		}
-
-		if (length == MBFL_SUBSTR_UNTIL_END) {
-			end = len;
-		} else if (encoding->flag & MBFL_ENCTYPE_SBCS) {
-			end = start + length;
-		} else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
-			end = start + length*2;
-		} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
-			end = start + length*4;
-		} else {
-			const unsigned char *mbtab = encoding->mblen_table;
-			end = start;
-			n = start;
-			k = 0;
-			p = string->val + start;
-			/* detect end position */
-			while (k <= length) {
-				end = n;
-				if (n >= len) {
-					break;
-				}
-				m = mbtab[*p];
-				n += m;
-				p += m;
-				k++;
-			}
-		}
-
-		if (start > len) {
-			start = len;
-		}
-		if (end > len) {
-			end = len;
-		}
-		if (start > end) {
-			start = end;
-		}
-
-		/* allocate memory and copy */
-		n = end - start;
-		result->len = 0;
-		result->val = w = (unsigned char*)emalloc(n + 1);
-		result->len = n;
-		memcpy(w, string->val + start, n);
-		w[n] = '\0';
-	} else {
-		mbfl_memory_device device;
-		struct collector_substr_data pc;
-		mbfl_convert_filter *decoder;
-		mbfl_convert_filter *encoder;
-
-		if (length == MBFL_SUBSTR_UNTIL_END) {
-			length = mbfl_strlen(string) - from;
-		}
-
-		mbfl_memory_device_init(&device, length + 1, 0);
-		mbfl_string_init(result);
-		result->encoding = string->encoding;
-		/* output code filter */
-		decoder = mbfl_convert_filter_new(
-		    &mbfl_encoding_wchar,
-		    string->encoding,
-		    mbfl_memory_device_output, 0, &device);
-		/* wchar filter */
-		encoder = mbfl_convert_filter_new(
-		    string->encoding,
-		    &mbfl_encoding_wchar,
-		    collector_substr, 0, &pc);
-		if (decoder == NULL || encoder == NULL) {
-			mbfl_convert_filter_delete(encoder);
-			mbfl_convert_filter_delete(decoder);
-			return NULL;
-		}
-		pc.next_filter = decoder;
-		pc.start = from;
-		pc.stop = from + length;
-		pc.output = 0;
-
-		/* feed data */
-		p = string->val;
-		n = string->len;
-		if (p != NULL) {
-			while (n > 0) {
-				if ((*encoder->filter_function)(*p++, encoder) < 0) {
-					break;
-				}
-				n--;
-			}
-		}
-
-		mbfl_convert_filter_flush(encoder);
-		mbfl_convert_filter_flush(decoder);
-		result = mbfl_memory_device_result(&device, result);
-		mbfl_convert_filter_delete(encoder);
-		mbfl_convert_filter_delete(decoder);
-	}
-
-	return result;
-}
-
 /*
 *  strcut
 */
--- a/ext/mbstring/libmbfl/mbfl/mbfilter.h
+++ b/ext/mbstring/libmbfl/mbfl/mbfilter.h
@ -190,24 +190,11 @@ static inline int mbfl_is_error(size_t len) {
 	return len >= (size_t) -16;
 }

-/*
- * strlen
- */
-MBFLAPI extern size_t
-mbfl_strlen(const mbfl_string *string);
-
 #define MBFL_ERROR_NOT_FOUND ((size_t) -1)
 #define MBFL_ERROR_ENCODING ((size_t) -4)
 #define MBFL_ERROR_EMPTY ((size_t) -8)
 #define MBFL_ERROR_OFFSET ((size_t) -16)

-/*
- * strpos.
- * Errors: MBFL_ERROR_NOT_FOUND, MBFL_ERROR_ENCODING, MBFL_ERROR_OFFSET
- */
-MBFLAPI extern size_t
-mbfl_strpos(mbfl_string *haystack, mbfl_string *needle, ssize_t offset, int reverse);
-
 /*
 * substr_count
 */
@ -219,12 +206,6 @@ mbfl_substr_count(mbfl_string *haystack, mbfl_string *needle);
 */
 #define MBFL_SUBSTR_UNTIL_END ((size_t) -1)

-/*
- * substr
- */
-MBFLAPI extern mbfl_string *
-mbfl_substr(mbfl_string *string, mbfl_string *result, size_t from, size_t length);
-
 /*
 * strcut
 */
--- a/ext/mbstring/libmbfl/mbfl/mbfl_consts.h
+++ b/ext/mbstring/libmbfl/mbfl/mbfl_consts.h
@ -32,8 +32,8 @@
 #define MBFL_CONSTS_H

 #define MBFL_ENCTYPE_SBCS		0x00000001 /* single-byte encoding */
-#define MBFL_ENCTYPE_WCS2		0x00000010 /* 2 bytes/char */
-#define MBFL_ENCTYPE_WCS4		0x00000100 /* 4 bytes/char */
+#define MBFL_ENCTYPE_WCS2		0x00000002 /* 2 bytes/char */
+#define MBFL_ENCTYPE_WCS4		0x00000004 /* 4 bytes/char */
 #define MBFL_ENCTYPE_GL_UNSAFE	0x00004000

 #define MBFL_WCSPLANE_UCS2MAX		0x00010000
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@ -1872,6 +1872,104 @@ PHP_FUNCTION(mb_strlen)
 }
 /* }}} */

+/* Tells whether `no_enc` lies in the range of enum values running from
+ * mbfl_no_encoding_utf8 up to and including mbfl_no_encoding_utf8_sb.
+ * See the mbfl_no_encoding definition for the list of UTF-8 encodings. */
+static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
+{
+	if (no_enc < mbfl_no_encoding_utf8) {
+		return false;
+	}
+	return no_enc <= mbfl_no_encoding_utf8_sb;
+}
+
+/* Translate a codepoint offset into a byte pointer within the UTF-8 text [str, end).
+ * A non-negative `offset` is counted forwards from `str`, stepping by the lead-byte
+ * lengths in the UTF-8 mblen table; a negative one is counted backwards from `end`,
+ * skipping over continuation bytes (10xxxxxx).
+ * Returns NULL when the string runs out before the offset has been consumed. */
+static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char *end, ssize_t offset) {
+	if (offset >= 0) {
+		const unsigned char *lengths = mbfl_encoding_utf8.mblen_table;
+		unsigned char *cursor = str;
+		for (; offset > 0; offset--) {
+			if (cursor >= end) {
+				return NULL;
+			}
+			cursor += lengths[*cursor];
+		}
+		return cursor;
+	}
+
+	unsigned char *cursor = end;
+	do {
+		if (cursor <= str) {
+			return NULL;
+		}
+		cursor--;
+		/* Every byte which is not a continuation byte starts a codepoint */
+		if ((*cursor & 0xC0) != 0x80) {
+			offset++;
+		}
+	} while (offset < 0);
+	return cursor;
+}
+
+/* Count the codepoints which begin in the UTF-8 byte range [start, pos);
+ * i.e. the number of bytes in that range which are not continuation bytes (10xxxxxx). */
+static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) {
+	size_t codepoints = 0;
+	for (unsigned char *p = start; p < pos; p++) {
+		if ((*p & 0xC0) != 0x80) {
+			codepoints++;
+		}
+	}
+	return codepoints;
+}
+
+/* Find the codepoint position of `needle` inside `haystack`; both are encoded in `enc`.
+ *
+ * Unless `enc` is one of the UTF-8 variants, both strings are first converted to UTF-8 and
+ * the search runs on the converted copies, which are freed again before returning.
+ * `offset` is a codepoint offset into the haystack; a negative value is counted back from
+ * the end of the haystack. If `reverse` is true, the search is done with zend_memnrstr
+ * instead of zend_memnstr.
+ *
+ * Returns the codepoint index of the match, MBFL_ERROR_OFFSET if `offset` lies outside
+ * the haystack, or MBFL_ERROR_NOT_FOUND if there is no match. */
+static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse)
+{
+	size_t result;
+	zend_string *haystack_u8 = NULL, *needle_u8 = NULL;
+	unsigned char *offset_pointer;
+
+	if (!php_mb_is_no_encoding_utf8(enc->no_encoding)) {
+		haystack_u8 = php_mb_convert_encoding_ex(ZSTR_VAL(haystack), ZSTR_LEN(haystack), &mbfl_encoding_utf8, enc);
+		needle_u8 = php_mb_convert_encoding_ex(ZSTR_VAL(needle), ZSTR_LEN(needle), &mbfl_encoding_utf8, enc);
+	} else {
+		haystack_u8 = haystack;
+		needle_u8 = needle;
+	}
+
+	offset_pointer = offset_to_pointer_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), offset);
+	if (!offset_pointer) {
+		result = MBFL_ERROR_OFFSET;
+		goto out;
+	}
+
+	result = MBFL_ERROR_NOT_FOUND;
+	if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
+		goto out;
+	}
+
+	const char *found_pos;
+	if (!reverse) {
+		found_pos = zend_memnstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
+	} else if (offset >= 0) {
+		found_pos = zend_memnrstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
+	} else {
+		/* Count the needle's codepoints on its UTF-8 form. The raw `needle` may be in any
+		 * encoding, and the continuation-byte test used by pointer_to_offset_utf8 only
+		 * yields a codepoint count for UTF-8 text. */
+		size_t needle_len = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(needle_u8), (unsigned char*)ZSTR_VAL(needle_u8) + ZSTR_LEN(needle_u8));
+		/* Extend the search limit by the needle's length, so that a match which begins
+		 * at the requested offset is still inside the searched range */
+		offset_pointer = offset_to_pointer_utf8(offset_pointer, (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), needle_len);
+		if (!offset_pointer) {
+			offset_pointer = (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8);
+		}
+
+		found_pos = zend_memnrstr(ZSTR_VAL(haystack_u8), ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)offset_pointer);
+	}
+
+	if (found_pos) {
+		result = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)found_pos);
+	}
+
+out:
+	/* Only free the strings if they are converted copies, not the caller's originals */
+	if (haystack_u8 != NULL && haystack_u8 != haystack) {
+		zend_string_free(haystack_u8);
+	}
+	if (needle_u8 != NULL && needle_u8 != needle) {
+		zend_string_free(needle_u8);
+	}
+	return result;
+}
+
 static void handle_strpos_error(size_t error) {
 	switch (error) {
 	case MBFL_ERROR_NOT_FOUND:
@ -1888,32 +1986,26 @@ static void handle_strpos_error(size_t error) {
 	}
 }

-/* {{{ Find position of first occurrence of a string within another */
 PHP_FUNCTION(mb_strpos)
 {
-	int reverse = 0;
 	zend_long offset = 0;
-	char *haystack_val, *needle_val;
-	mbfl_string haystack, needle;
+	zend_string *needle, *haystack;
 	zend_string *enc_name = NULL;

 	ZEND_PARSE_PARAMETERS_START(2, 4)
-		Z_PARAM_STRING(haystack_val, haystack.len)
-		Z_PARAM_STRING(needle_val, needle.len)
+		Z_PARAM_STR(haystack)
+		Z_PARAM_STR(needle)
 		Z_PARAM_OPTIONAL
 		Z_PARAM_LONG(offset)
 		Z_PARAM_STR_OR_NULL(enc_name)
 	ZEND_PARSE_PARAMETERS_END();

-	haystack.val = (unsigned char*)haystack_val;
-	needle.val = (unsigned char*)needle_val;
-
-	haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
-	if (!haystack.encoding) {
+	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
+	if (!enc) {
 		RETURN_THROWS();
 	}

-	size_t n = mbfl_strpos(&haystack, &needle, offset, reverse);
+	size_t n = mb_find_strpos(haystack, needle, enc, offset, false);
 	if (!mbfl_is_error(n)) {
 		RETVAL_LONG(n);
 	} else {
@ -1921,33 +2013,28 @@ PHP_FUNCTION(mb_strpos)
 		RETVAL_FALSE;
 	}
 }
-/* }}} */

 /* {{{ Find position of last occurrence of a string within another */
 PHP_FUNCTION(mb_strrpos)
 {
-	mbfl_string haystack, needle;
-	char *haystack_val, *needle_val;
-	zend_string *enc_name = NULL;
 	zend_long offset = 0;
+	zend_string *needle, *haystack;
+	zend_string *enc_name = NULL;

 	ZEND_PARSE_PARAMETERS_START(2, 4)
-		Z_PARAM_STRING(haystack_val, haystack.len)
-		Z_PARAM_STRING(needle_val, needle.len)
+		Z_PARAM_STR(haystack)
+		Z_PARAM_STR(needle)
 		Z_PARAM_OPTIONAL
 		Z_PARAM_LONG(offset)
 		Z_PARAM_STR_OR_NULL(enc_name)
 	ZEND_PARSE_PARAMETERS_END();

-	haystack.val = (unsigned char*)haystack_val;
-	needle.val = (unsigned char*)needle_val;
-
-	haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 4);
-	if (!haystack.encoding) {
+	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
+	if (!enc) {
 		RETURN_THROWS();
 	}

-	size_t n = mbfl_strpos(&haystack, &needle, offset, 1);
+	size_t n = mb_find_strpos(haystack, needle, enc, offset, true);
 	if (!mbfl_is_error(n)) {
 		RETVAL_LONG(n);
 	} else {
@ -1961,27 +2048,23 @@ PHP_FUNCTION(mb_strrpos)
 PHP_FUNCTION(mb_stripos)
 {
 	zend_long offset = 0;
-	mbfl_string haystack, needle;
-	char *haystack_val, *needle_val;
+	zend_string *haystack, *needle;
 	zend_string *from_encoding = NULL;

 	ZEND_PARSE_PARAMETERS_START(2, 4)
-		Z_PARAM_STRING(haystack_val, haystack.len)
-		Z_PARAM_STRING(needle_val, needle.len)
+		Z_PARAM_STR(haystack)
+		Z_PARAM_STR(needle)
 		Z_PARAM_OPTIONAL
 		Z_PARAM_LONG(offset)
 		Z_PARAM_STR_OR_NULL(from_encoding)
 	ZEND_PARSE_PARAMETERS_END();

-	haystack.val = (unsigned char*)haystack_val;
-	needle.val = (unsigned char*)needle_val;
-
 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
 	if (!enc) {
 		RETURN_THROWS();
 	}

-	size_t n = php_mb_stripos(0, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
+	size_t n = php_mb_stripos(false, haystack, needle, offset, enc);

 	if (!mbfl_is_error(n)) {
 		RETVAL_LONG(n);
@ -1996,27 +2079,23 @@ PHP_FUNCTION(mb_stripos)
 PHP_FUNCTION(mb_strripos)
 {
 	zend_long offset = 0;
-	mbfl_string haystack, needle;
-	char *haystack_val, *needle_val;
+	zend_string *haystack, *needle;
 	zend_string *from_encoding = NULL;

 	ZEND_PARSE_PARAMETERS_START(2, 4)
-		Z_PARAM_STRING(haystack_val, haystack.len)
-		Z_PARAM_STRING(needle_val, needle.len)
+		Z_PARAM_STR(haystack)
+		Z_PARAM_STR(needle)
 		Z_PARAM_OPTIONAL
 		Z_PARAM_LONG(offset)
 		Z_PARAM_STR_OR_NULL(from_encoding)
 	ZEND_PARSE_PARAMETERS_END();

-	haystack.val = (unsigned char*)haystack_val;
-	needle.val = (unsigned char*)needle_val;
-
 	const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
 	if (!enc) {
 		RETURN_THROWS();
 	}

-	size_t n = php_mb_stripos(1, (char *)haystack.val, haystack.len, (char *)needle.val, needle.len, offset, enc);
+	size_t n = php_mb_stripos(true, haystack, needle, offset, enc);

 	if (!mbfl_is_error(n)) {
 		RETVAL_LONG(n);
@ -2027,57 +2106,123 @@ PHP_FUNCTION(mb_strripos)
 }
 /* }}} */

+/* Extract a substring by decoding the input to codepoints and re-encoding the wanted ones.
+ *
+ * `in`/`in_len` is the input text in encoding `enc`; `from` is the number of codepoints
+ * to skip and `len` the maximum number of codepoints to keep (MBFL_SUBSTR_UNTIL_END to
+ * keep everything after `from`). The input is decoded in chunks of up to 128 codepoints
+ * with `enc->to_wchar`, and the selected codepoints are written out with `enc->from_wchar`. */
+static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t from, size_t len, const mbfl_encoding *enc)
+{
+	if (from >= in_len) {
+		/* No supported text encoding decodes to more than one codepoint per byte, so
+		 * nothing can remain after skipping `from` codepoints.
+		 * This also guards `in_len - from` (below) from underflow */
+		return zend_empty_string;
+	}
+
+	uint32_t wchar_buf[128];
+	unsigned int state = 0;
+
+	mb_convert_buf buf;
+	mb_convert_buf_init(&buf, MIN(len, in_len - from), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
+
+	while (in_len && len) {
+		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
+		ZEND_ASSERT(out_len <= 128);
+
+		if (from >= out_len) {
+			/* This whole chunk lies before the start of the substring */
+			from -= out_len;
+		} else {
+			/* Only the codepoints after the first `from` ones in this chunk are emitted,
+			 * so `len` must be reduced by that number and not by the chunk size */
+			size_t needed_codepoints = MIN(out_len - from, len);
+			/* Signal the end of output only when the input is exhausted or this chunk
+			 * supplies all the codepoints which are still wanted */
+			enc->from_wchar(wchar_buf + from, needed_codepoints, &buf, !in_len || needed_codepoints == len);
+			from = 0;
+			len -= needed_codepoints;
+		}
+	}
+
+	return mb_convert_buf_result(&buf);
+}
+
+/* Return the part of `input` (encoded in `enc`) which starts `from` codepoints into the
+ * string and is at most `len` codepoints long. Pass MBFL_SUBSTR_UNTIL_END as `len` to take
+ * everything up to the end of the string.
+ *
+ * Three strategies are used, depending on the encoding:
+ *  - fixed-width encodings: the byte range is computed arithmetically;
+ *  - encodings with an `mblen_table`: character lengths are looked up by lead byte;
+ *  - anything else: the text is decoded and re-encoded by mb_get_substr_slow. */
+static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
+{
+	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
+	size_t in_len = ZSTR_LEN(input);
+
+	if (from >= in_len) {
+		/* No supported text encoding decodes to more than one codepoint per byte
+		 * So if the number of codepoints to skip >= number of input bytes,
+		 * then definitely the output should be empty */
+		return zend_empty_string;
+	}
+
+	/* Does each codepoint have a fixed byte width? */
+	unsigned int flag = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
+	if (flag) {
+		/* The value of the flag is 1, 2 or 4: the number of bytes per codepoint
+		 * (this assumes an encoding sets at most one of the three flags) */
+		from *= flag;
+		if (from >= in_len) {
+			return zend_empty_string;
+		}
+		in += from;
+		in_len -= from;
+		/* Compare before multiplying: `len * flag` wraps around for very large `len`,
+		 * which could turn a request for "the rest of the string" into an empty one */
+		if (len > in_len / flag) {
+			len = in_len;
+		} else {
+			len *= flag;
+		}
+		return zend_string_init_fast((const char*)in, len);
+	} else if (enc->mblen_table != NULL) {
+		const unsigned char *mbtab = enc->mblen_table;
+		unsigned char *limit = in + in_len;
+		/* Skip `from` characters */
+		while (from && in < limit) {
+			in += mbtab[*in];
+			from--;
+		}
+		if (in >= limit) {
+			return zend_empty_string;
+		} else if (len == MBFL_SUBSTR_UNTIL_END) {
+			return zend_string_init_fast((const char*)in, limit - in);
+		}
+		/* Walk over `len` characters to find where the substring ends */
+		unsigned char *end = in;
+		while (len && end < limit) {
+			end += mbtab[*end];
+			len--;
+		}
+		/* A truncated final character may carry `end` past the input; clamp it */
+		if (end > limit) {
+			end = limit;
+		}
+		return zend_string_init_fast((const char*)in, end - in);
+	}
+
+	return mb_get_substr_slow(in, in_len, from, len, enc);
+}
+
 #define MB_STRSTR 1
 #define MB_STRRCHR 2
 #define MB_STRISTR 3
 #define MB_STRRICHR 4
-/* {{{ php_mb_strstr_variants */
+
 static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
 {
-	int reverse_mode = 0;
+	bool reverse_mode = false, part = false;
 	size_t n;
-	char *haystack_val, *needle_val;
-	mbfl_string haystack, needle, result, *ret = NULL;
+	zend_string *haystack, *needle;
 	zend_string *encoding_name = NULL;
-	bool part = 0;

 	ZEND_PARSE_PARAMETERS_START(2, 4)
-		Z_PARAM_STRING(haystack_val, haystack.len)
-		Z_PARAM_STRING(needle_val, needle.len)
+		Z_PARAM_STR(haystack)
+		Z_PARAM_STR(needle)
 		Z_PARAM_OPTIONAL
 		Z_PARAM_BOOL(part)
 		Z_PARAM_STR_OR_NULL(encoding_name)
 	ZEND_PARSE_PARAMETERS_END();

-	haystack.val = (unsigned char*)haystack_val;
-	needle.val = (unsigned char*)needle_val;
-	haystack.encoding = needle.encoding = php_mb_get_encoding(encoding_name, 4);
-	if (!haystack.encoding) {
+	const mbfl_encoding *enc = php_mb_get_encoding(encoding_name, 4);
+	if (!enc) {
 		RETURN_THROWS();
 	}

-	if (variant == MB_STRRCHR || variant == MB_STRRICHR) { reverse_mode = 1; }
+	if (variant == MB_STRRCHR || variant == MB_STRRICHR) {
+		reverse_mode = true;
+	}

 	if (variant == MB_STRISTR || variant == MB_STRRICHR) {
-		n = php_mb_stripos(reverse_mode, (char *)haystack.val, haystack.len, (char *)needle.val,
-			needle.len, 0, needle.encoding);
+		n = php_mb_stripos(reverse_mode, haystack, needle, 0, enc);
 	} else {
-		n = mbfl_strpos(&haystack, &needle, 0, reverse_mode);
+		n = mb_find_strpos(haystack, needle, enc, 0, reverse_mode);
 	}

 	if (!mbfl_is_error(n)) {
 		if (part) {
-			ret = mbfl_substr(&haystack, &result, 0, n);
-			ZEND_ASSERT(ret != NULL);
-			// TODO: avoid reallocation ???
-			RETVAL_STRINGL((char *)ret->val, ret->len);
-			efree(ret->val);
+			RETVAL_STR(mb_get_substr(haystack, 0, n, enc));
 		} else {
-			ret = mbfl_substr(&haystack, &result, n, MBFL_SUBSTR_UNTIL_END);
-			ZEND_ASSERT(ret != NULL);
-			// TODO: avoid reallocation ???
-			RETVAL_STRINGL((char *)ret->val, ret->len);
-			efree(ret->val);
+			RETVAL_STR(mb_get_substr(haystack, n, MBFL_SUBSTR_UNTIL_END, enc));
 		}
 	} else {
 		// FIXME use handle_strpos_error(n)
@ -2158,39 +2303,31 @@ PHP_FUNCTION(mb_substr_count)
 /* {{{ Returns part of a string */
 PHP_FUNCTION(mb_substr)
 {
-	char *str;
-	zend_string *encoding = NULL;
+	zend_string *str, *encoding = NULL;
 	zend_long from, len;
 	size_t real_from, real_len;
-	size_t str_len;
-	bool len_is_null = 1;
-	mbfl_string string, result, *ret;
+	bool len_is_null = true;

 	ZEND_PARSE_PARAMETERS_START(2, 4)
-		Z_PARAM_STRING(str, str_len)
+		Z_PARAM_STR(str)
 		Z_PARAM_LONG(from)
 		Z_PARAM_OPTIONAL
 		Z_PARAM_LONG_OR_NULL(len, len_is_null)
 		Z_PARAM_STR_OR_NULL(encoding)
 	ZEND_PARSE_PARAMETERS_END();

-	string.encoding = php_mb_get_encoding(encoding, 4);
-	if (!string.encoding) {
+	const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
+	if (!enc) {
 		RETURN_THROWS();
 	}

-	string.val = (unsigned char *)str;
-	string.len = str_len;
-
-	/* measures length */
 	size_t mblen = 0;
 	if (from < 0 || (!len_is_null && len < 0)) {
-		mblen = mbfl_strlen(&string);
+		mblen = mb_get_strlen(str, enc);
 	}

 	/* if "from" position is negative, count start position from the end
-	 * of the string
-	 */
+	 * of the string */
 	if (from >= 0) {
 		real_from = (size_t) from;
 	} else if (-from < mblen) {
@ -2200,8 +2337,7 @@ PHP_FUNCTION(mb_substr)
 	}

 	/* if "length" position is negative, set it to the length
-	 * needed to stop that many chars from the end of the string
-	 */
+	 * needed to stop that many chars from the end of the string */
 	if (len_is_null) {
 		real_len = MBFL_SUBSTR_UNTIL_END;
 	} else if (len >= 0) {
@ -2212,12 +2348,7 @@ PHP_FUNCTION(mb_substr)
 		real_len = 0;
 	}

-	ret = mbfl_substr(&string, &result, real_from, real_len);
-	ZEND_ASSERT(ret != NULL);
-
-	// TODO: avoid reallocation ???
-	RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
-	efree(ret->val);
+	RETVAL_STR(mb_get_substr(str, real_from, real_len, enc));
 }
 /* }}} */

@ -2347,64 +2478,6 @@ PHP_FUNCTION(mb_strwidth)
 	RETVAL_LONG(mb_get_strwidth(string, enc));
 }

-/* Cut 'n' codepoints from beginning of string
- * Remove this once mb_substr is implemented using the new conversion filters */
-static zend_string* mb_drop_chars(zend_string *input, const mbfl_encoding *enc, size_t n)
-{
-	if (n >= ZSTR_LEN(input)) {
-		/* No supported text encoding decodes to more than one codepoint per byte
-		 * So if the number of codepoints to drop >= number of input bytes,
-		 * then definitely the output should be empty
-		 * This also guards `ZSTR_LEN(input) - n` (below) from underflow */
-		return zend_empty_string;
-	}
-
-	uint32_t wchar_buf[128];
-	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
-	size_t in_len = ZSTR_LEN(input);
-	unsigned int state = 0;
-
-	mb_convert_buf buf;
-	mb_convert_buf_init(&buf, ZSTR_LEN(input) - n, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
-
-	while (in_len) {
-		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
-		ZEND_ASSERT(out_len <= 128);
-
-		if (n >= out_len) {
-			n -= out_len;
-		} else {
-			enc->from_wchar(wchar_buf + n, out_len - n, &buf, !in_len);
-			n = 0;
-		}
-	}
-
-	return mb_convert_buf_result(&buf);
-}
-
-/* Pick 'n' codepoints from beginning of string
- * Remove this once mb_substr is implemented using the new conversion filters */
-static zend_string* mb_pick_chars(zend_string *input, const mbfl_encoding *enc, size_t n)
-{
-	uint32_t wchar_buf[128];
-	unsigned char *in = (unsigned char*)ZSTR_VAL(input);
-	size_t in_len = ZSTR_LEN(input);
-	unsigned int state = 0;
-
-	mb_convert_buf buf;
-	mb_convert_buf_init(&buf, n, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
-
-	while (in_len && n) {
-		size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
-		ZEND_ASSERT(out_len <= 128);
-
-		enc->from_wchar(wchar_buf, MIN(out_len, n), &buf, !in_len || out_len >= n);
-		n -= MIN(out_len, n);
-	}
-
-	return mb_convert_buf_result(&buf);
-}
-
 static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, unsigned int from, int width)
 {
 	uint32_t wchar_buf[128];
@ -2454,11 +2527,19 @@ static zend_string* mb_trim_string(zend_string *input, zend_string *marker, cons
 	/* The input string fits in the requested width; we don't need to append the trim marker
 	 * However, if the string contains erroneous byte sequences, those should be converted
 	 * to error markers */
-	if (from == 0 && !input_err) {
-		/* This just increments the string's refcount; it doesn't really 'copy' it */
-		return zend_string_copy(input);
+	if (!input_err) {
+		if (from == 0) {
+			/* This just increments the string's refcount; it doesn't really 'copy' it */
+			return zend_string_copy(input);
+		} else {
+			return mb_get_substr(input, from, MBFL_SUBSTR_UNTIL_END, enc);
+		}
+	} else {
+		/* We can't use `mb_get_substr`, because it uses the fastest method possible of
+		 * picking out a substring, which may not include converting erroneous byte
+		 * sequences to error markers */
+		return mb_get_substr_slow((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), from, MBFL_SUBSTR_UNTIL_END, enc);
 	}
-	return mb_drop_chars(input, enc, from);

 	/* The input string is too wide; we need to build a new string which
 	 * includes some portion of the input string, with the trim marker
@ -2534,7 +2615,7 @@ PHP_FUNCTION(mb_strimwidth)
 		width += mb_get_strwidth(str, enc);

 		if (from > 0) {
-			zend_string *trimmed = mb_pick_chars(str, enc, from);
+			zend_string *trimmed = mb_get_substr(str, 0, from, enc);
 			width -= mb_get_strwidth(trimmed, enc);
 			zend_string_free(trimmed);
 		}
@ -2558,13 +2639,6 @@ static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_en
 			|| (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
 }

-
-/* See mbfl_no_encoding definition for list of UTF-8 encodings */
-static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
-{
-	return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
-}
-
 MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
 {
 	unsigned int num_errors = 0;
@ -4733,46 +4807,17 @@ MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nby
 	return last;
 }

-MBSTRING_API size_t php_mb_stripos(int mode, const char *old_haystack, size_t old_haystack_len, const char *old_needle, size_t old_needle_len, zend_long offset, const mbfl_encoding *enc)
+MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc)
 {
-	size_t n = (size_t)-1;
-	mbfl_string haystack, needle;
-	zend_string *haystack_converted = NULL, *needle_converted = NULL;
+	/* We're using simple case-folding here, because we'd have to deal with remapping of
+	 * offsets otherwise. */
+	zend_string *haystack_conv = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc);
+	zend_string *needle_conv = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, ZSTR_VAL(needle), ZSTR_LEN(needle), enc);

-	mbfl_string_init_set(&haystack, enc);
-	mbfl_string_init_set(&needle, enc);
+	size_t n = mb_find_strpos(haystack_conv, needle_conv, enc, offset, mode);

-	do {
-		/* We're using simple case-folding here, because we'd have to deal with remapping of
-		 * offsets otherwise. */
-		haystack_converted = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char*)old_haystack, old_haystack_len, enc);
-		haystack.val = (unsigned char*)ZSTR_VAL(haystack_converted);
-		haystack.len = ZSTR_LEN(haystack_converted);
-
-		if (!haystack.val) {
-			break;
-		}
-		if (haystack.len == 0) {
-			break;
-		}
-
-		needle_converted = mbstring_convert_case(PHP_UNICODE_CASE_FOLD_SIMPLE, (char*)old_needle, old_needle_len, enc);
-		needle.val = (unsigned char*)ZSTR_VAL(needle_converted);
-		needle.len = ZSTR_LEN(needle_converted);
-
-		if (!needle.val) {
-			break;
-		}
-
-		n = mbfl_strpos(&haystack, &needle, offset, mode);
-	} while(0);
-
-	if (haystack_converted) {
-		zend_string_free(haystack_converted);
-	}
-	if (needle_converted) {
-		zend_string_free(needle_converted);
-	}
+	zend_string_free(haystack_conv);
+	zend_string_free(needle_conv);

 	return n;
 }
--- a/ext/mbstring/mbstring.h
+++ b/ext/mbstring/mbstring.h
@ -64,7 +64,7 @@ MBSTRING_API zend_string* php_mb_convert_encoding(

 MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc);

-MBSTRING_API size_t php_mb_stripos(int mode, const char *old_haystack, size_t old_haystack_len, const char *old_needle, size_t old_needle_len, zend_long offset, const mbfl_encoding *encoding);
+MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc);
 MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding);

 ZEND_BEGIN_MODULE_GLOBALS(mbstring)
--- a/ext/mbstring/tests/mb_strrchr_basic.phpt
+++ b/ext/mbstring/tests/mb_strrchr_basic.phpt
@ -17,7 +17,6 @@ var_dump(bin2hex(mb_strrchr($string_ascii, 'd', false, 'ISO-8859-1')));
 var_dump(bin2hex(mb_strrchr($string_ascii, 'd')));
 var_dump(bin2hex(mb_strrchr($string_ascii, 'd', true)));

-
 echo "\n-- ASCII string: needle doesn't exist --\n";
 var_dump(mb_strrchr($string_ascii, '123'));

@ -27,11 +26,14 @@ var_dump(bin2hex(mb_strrchr($string_mb, $needle1)));
 var_dump(bin2hex(mb_strrchr($string_mb, $needle1, false, 'utf-8')));
 var_dump(bin2hex(mb_strrchr($string_mb, $needle1, true)));

-
 echo "\n-- Multibyte string: needle doesn't exist --\n";
 $needle2 = base64_decode('44GT44KT44Gr44Gh44Gv44CB5LiW55WM');
 var_dump(mb_strrchr($string_mb, $needle2));

+echo "\n-- Regression tests --\n";
+// Regression test from when mb_strrchr was being reimplemented
+var_dump(mb_strrchr("\x00t\x00", "", false, "UTF32"));
+
 ?>
 --EXPECT--
 *** Testing mb_strrchr() : basic functionality ***
@ -51,3 +53,6 @@ string(0) ""

 -- Multibyte string: needle doesn't exist --
 bool(false)
+
+-- Regression tests --
+string(0) ""
--- a/ext/mbstring/tests/mb_strstr.phpt
+++ b/ext/mbstring/tests/mb_strstr.phpt
@ -21,6 +21,11 @@ mb_internal_encoding("EUC-JP");
 var_dump(FROM_EUC_JP(mb_strstr(EUC_JP("あいうえおかきくけこ"), EUC_JP("おかき"))));
 var_dump(FROM_EUC_JP(mb_strstr(EUC_JP("あいうえおかきくけこ"), EUC_JP("おかき"), false)));
 var_dump(FROM_EUC_JP(mb_strstr(EUC_JP("あいうえおかきくけこ"), EUC_JP("おかき"), true)));
+
+// Regression test from when mb_strstr was being reimplemented
+var_dump(bin2hex(mb_strstr("\xdd\x00", "", false, 'UTF-8')));
+var_dump(bin2hex(mb_strstr("M\xff\xff\xff\x00", "\x00", false, "SJIS")));
+
 ?>
 --EXPECT--
 string(18) "おかきくけこ"
@ -31,3 +36,5 @@ string(12) "あいうえ"
 string(18) "おかきくけこ"
 string(18) "おかきくけこ"
 string(12) "あいうえ"
+string(4) "dd00"
+string(0) ""
--- a/ext/mbstring/tests/mb_substr.phpt
+++ b/ext/mbstring/tests/mb_substr.phpt
@ -85,6 +85,12 @@ print "3: " . mb_convert_encoding(mb_substr($utf7, -5, 3, 'UTF-7'), 'UTF-8', 'UT
 print "4: " . mb_convert_encoding(mb_substr($utf7, 1, null, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
 print "5:" . mb_convert_encoding(mb_substr($utf7, 10, 0, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";

+echo "Regression:\n";
+/* During development, one >= comparison in mb_get_substr was wrongly written as >.
+ * This was caught by libFuzzer. */
+$str = "\xbd\xbd\xbd\xbd\xbd\xbd\xbd\xbe\xbd\xbd\xbd\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x89\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x00\x00\x00\x00\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8b\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8b\x8b\x8b\xbd\xbd\xbd\xbd\xbd\xbd\xbd\xbe\x01:O\xaa\xd3";
+echo bin2hex(mb_substr($str, 0, 128, "JIS")), "\n";
+
 ?>
 --EXPECT--
 EUC-JP:
@ -136,3 +142,5 @@ UTF-7:
 3: йте
 4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
 5:
+Regression:
+1b28493d3d3d3d3d3d3d3e3d3d3d1b28423f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f000000003f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f1b28493d3d3d3d3d3d3d3e1b2842013a4f1b28492a1b2842