Avoid string copies in ext/intl after string conversion (#18636)

Introduces intl_convert_utf8_to_utf16_zstr() to convert a UTF-8 string
to a UTF-16 string zend_string* instance. This way we avoid a double
copy later from a UChar* into a zend_string*.
This commit is contained in:
Niels Dossche 2025-05-25 12:29:36 +02:00 committed by GitHub
parent 16e154a2c4
commit b8fa1657ff
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 66 additions and 27 deletions

View file

@ -526,6 +526,10 @@ PHP 8.5 UPGRADE NOTES
. Add OPcode specialization for `=== []` and `!== []` comparisons.
. Creating exception objects is now much faster.
- Intl:
. Now avoids creating extra string copies when converting strings
for use in the collator.
- ReflectionProperty:
. Improved performance of the following methods: getValue(), getRawValue(),
isInitialized(), setValue(), setRawValue().

View file

@ -38,8 +38,6 @@ static void collator_convert_hash_item_from_utf8_to_utf16(
{
const char* old_val;
size_t old_val_len;
UChar* new_val = NULL;
int32_t new_val_len = 0;
zval znew_val;
/* Process string values only. */
@ -49,17 +47,13 @@ static void collator_convert_hash_item_from_utf8_to_utf16(
old_val = Z_STRVAL_P( hashData );
old_val_len = Z_STRLEN_P( hashData );
/* Convert it from UTF-8 to UTF-16LE and save the result to new_val[_len]. */
intl_convert_utf8_to_utf16( &new_val, &new_val_len, old_val, old_val_len, status );
/* Convert it from UTF-8 to UTF-16LE. */
zend_string *zstr = intl_convert_utf8_to_utf16_zstr( old_val, old_val_len, status );
if( U_FAILURE( *status ) )
return;
/* Update current hash item with the converted value. */
ZVAL_STRINGL( &znew_val, (char*)new_val, UBYTES(new_val_len + 1) );
//???
efree(new_val);
/* hack to fix use of initialized value */
Z_STRLEN(znew_val) = Z_STRLEN(znew_val) - UBYTES(1);
ZVAL_NEW_STR( &znew_val, zstr );
if( hashKey)
{
@ -176,23 +170,19 @@ zval* collator_convert_zstr_utf16_to_utf8( zval* utf16_zval, zval *rv )
zend_string *collator_convert_zstr_utf8_to_utf16(zend_string *utf8_str)
{
UChar *ustr = NULL;
int32_t ustr_len = 0;
UErrorCode status = U_ZERO_ERROR;
/* Convert the string to UTF-16. */
intl_convert_utf8_to_utf16(
&ustr, &ustr_len,
zend_string *zstr = intl_convert_utf8_to_utf16_zstr(
ZSTR_VAL(utf8_str), ZSTR_LEN(utf8_str),
&status);
// FIXME Or throw error or use intl internal error handler
if (U_FAILURE(status)) {
php_error(E_WARNING,
"Error casting object to string in collator_convert_zstr_utf8_to_utf16()");
zstr = ZSTR_EMPTY_ALLOC();
}
zend_string *zstr = zend_string_init((char *) ustr, UBYTES(ustr_len), 0);
efree((char *)ustr);
return zstr;
}
@ -203,8 +193,6 @@ zval* collator_convert_object_to_string( zval* obj, zval *rv )
{
zval* zstr = NULL;
UErrorCode status = U_ZERO_ERROR;
UChar* ustr = NULL;
int32_t ustr_len = 0;
/* Bail out if it's not an object. */
if( Z_TYPE_P( obj ) != IS_OBJECT )
@ -229,25 +217,20 @@ zval* collator_convert_object_to_string( zval* obj, zval *rv )
}
/* Convert the string to UTF-16. */
intl_convert_utf8_to_utf16(
&ustr, &ustr_len,
zend_string *converted_str = intl_convert_utf8_to_utf16_zstr(
Z_STRVAL_P( zstr ), Z_STRLEN_P( zstr ),
&status );
// FIXME Or throw error or use intl internal error handler
if( U_FAILURE( status ) )
if( U_FAILURE( status ) ) {
php_error( E_WARNING, "Error casting object to string in collator_convert_object_to_string()" );
converted_str = ZSTR_EMPTY_ALLOC();
}
/* Cleanup zstr to hold utf16 string. */
zval_ptr_dtor_str( zstr );
/* Set string. */
ZVAL_STRINGL( zstr, (char*)ustr, UBYTES(ustr_len));
//???
efree((char *)ustr);
/* Don't free ustr cause it's set in zstr without copy.
* efree( ustr );
*/
ZVAL_STR( zstr, converted_str );
return zstr;
}

View file

@ -104,6 +104,54 @@ void intl_convert_utf8_to_utf16(
}
/* }}} */
/* Convert given string from UTF-8 to UTF-16 to a zend_string*
*
* @param src String to convert.
* @param src_len Length of the source string.
* @param status Conversion status.
*
* @return zend_string* on success and NULL on failure
*/
zend_string *intl_convert_utf8_to_utf16_zstr(
const char* src, size_t src_len,
UErrorCode* status )
{
int32_t dst_len = 0;
*status = U_ZERO_ERROR;
if(src_len > INT32_MAX) {
/* we cannot fit this string */
*status = U_BUFFER_OVERFLOW_ERROR;
return NULL;
}
/* Pre-flight */
u_strFromUTF8( NULL, 0, &dst_len, src, (int32_t)src_len, status );
if( *status != U_BUFFER_OVERFLOW_ERROR && *status != U_STRING_NOT_TERMINATED_WARNING )
return NULL;
/* Note: l=sizeof(UChar)-1 because we need sizeof(UChar) bytes for the NUL terminator instead of 1. */
zend_string *dst = zend_string_safe_alloc(sizeof(UChar), dst_len, sizeof(UChar) - 1, false);
UChar *dst_buf = (UChar *) ZSTR_VAL(dst);
ZEND_ASSERT((ZSTR_LEN(dst) - 1) / 2 == dst_len);
/* However, the length must not include the NUL terminator that we included previously. */
ZSTR_LEN(dst)--;
/* Convert source string from UTF-8 to UTF-16. */
*status = U_ZERO_ERROR;
u_strFromUTF8( dst_buf, dst_len + 1, NULL, src, src_len, status );
if( U_FAILURE( *status ) )
{
zend_string_efree( dst );
return NULL;
}
dst_buf[dst_len] = 0;
return dst;
}
/* {{{ intl_convert_utf16_to_utf8
* Convert given string from UTF-16 to UTF-8.
*

View file

@ -23,6 +23,10 @@ void intl_convert_utf8_to_utf16(
const char* src, size_t src_len,
UErrorCode* status );
zend_string *intl_convert_utf8_to_utf16_zstr(
const char* src, size_t src_len,
UErrorCode* status );
zend_string* intl_convert_utf16_to_utf8(
const UChar* src, int32_t src_len,
UErrorCode* status );