Add grapheme_levenshtein function. (#18087)

Measure the Levenshtein distance in units of grapheme clusters.
This commit is contained in:
tekimen 2025-04-28 16:22:52 +09:00 committed by GitHub
parent 6fa669a125
commit bdcea111f3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 359 additions and 1 deletions

1
NEWS
View file

@ -89,6 +89,7 @@ PHP NEWS
. Added Locale::isRightToLeft to check if a locale is written right to left. . Added Locale::isRightToLeft to check if a locale is written right to left.
(David Carlier) (David Carlier)
. Added null bytes presence in locale inputs for Locale class. (David Carlier) . Added null bytes presence in locale inputs for Locale class. (David Carlier)
. Added grapheme_levenshtein() function. (Yuya Hamada)
- MySQLi: - MySQLi:
. Fixed bugs GH-17900 and GH-8084 (calling mysqli::__construct twice). . Fixed bugs GH-17900 and GH-8084 (calling mysqli::__construct twice).

View file

@ -319,6 +319,8 @@ PHP 8.5 UPGRADE NOTES
- Intl: - Intl:
. Added locale_is_right_to_left/Locale::isRightToLeft, returns true if . Added locale_is_right_to_left/Locale::isRightToLeft, returns true if
the locale is written right to left (after its enrichment with likely subtags). the locale is written right to left (after its enrichment with likely subtags).
. Added grapheme_levenshtein() function.
RFC: https://wiki.php.net/rfc/grapheme_levenshtein
- Pdo\Sqlite: - Pdo\Sqlite:
. Added support for Pdo\Sqlite::setAuthorizer(), which is the equivalent of . Added support for Pdo\Sqlite::setAuthorizer(), which is the equivalent of

View file

@ -918,4 +918,219 @@ PHP_FUNCTION(grapheme_str_split)
ubrk_close(bi); ubrk_close(bi);
} }
PHP_FUNCTION(grapheme_levenshtein)
{
zend_string *string1, *string2;
zend_long cost_ins = 1;
zend_long cost_rep = 1;
zend_long cost_del = 1;
ZEND_PARSE_PARAMETERS_START(2, 5)
Z_PARAM_STR(string1)
Z_PARAM_STR(string2)
Z_PARAM_OPTIONAL
Z_PARAM_LONG(cost_ins)
Z_PARAM_LONG(cost_rep)
Z_PARAM_LONG(cost_del)
ZEND_PARSE_PARAMETERS_END();
if (cost_ins <= 0 || cost_ins > UINT_MAX / 4) {
zend_argument_value_error(3, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
RETURN_THROWS();
}
if (cost_rep <= 0 || cost_rep > UINT_MAX / 4) {
zend_argument_value_error(4, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
RETURN_THROWS();
}
if (cost_del <= 0 || cost_del > UINT_MAX / 4) {
zend_argument_value_error(5, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
RETURN_THROWS();
}
zend_long c0, c1, c2;
zend_long retval;
size_t i2;
char *pstr1, *pstr2;
UChar *ustring1 = NULL;
UChar *ustring2 = NULL;
int32_t ustring1_len = 0;
int32_t ustring2_len = 0;
UErrorCode ustatus = U_ZERO_ERROR;
/* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
* that the distance is symmetric. If string1 is shorter than string2 we can save memory (and CPU time)
* by having shorter rows (p1 & p2). */
if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) {
zend_string *tmp = string1;
string1 = string2;
string2 = tmp;
}
pstr1 = ZSTR_VAL(string1);
pstr2 = ZSTR_VAL(string2);
intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);
intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0);
efree(ustring1);
RETURN_FALSE;
}
intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);
intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0);
efree(ustring2);
efree(ustring1);
RETURN_FALSE;
}
UBreakIterator *bi1, *bi2;
int32_t strlen_1, strlen_2;
strlen_1 = grapheme_split_string(ustring1, ustring1_len, NULL, 0);
strlen_2 = grapheme_split_string(ustring2, ustring2_len, NULL, 0);
if (strlen_1 == 0) {
efree(ustring1);
efree(ustring2);
RETURN_LONG(strlen_2 * cost_ins);
}
if (strlen_2 == 0) {
efree(ustring1);
efree(ustring2);
RETURN_LONG(strlen_1 * cost_del);
}
unsigned char u_break_iterator_buffer1[U_BRK_SAFECLONE_BUFFERSIZE];
unsigned char u_break_iterator_buffer2[U_BRK_SAFECLONE_BUFFERSIZE];
bi1 = grapheme_get_break_iterator(u_break_iterator_buffer1, &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);
intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #1 ($string1)", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi1);
RETURN_FALSE;
}
bi2 = grapheme_get_break_iterator(u_break_iterator_buffer2, &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);
intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #2 ($string2)", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi2);
ubrk_close(bi1);
RETURN_FALSE;
}
ubrk_setText(bi1, ustring1, ustring1_len, &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);
intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #1 ($string1)", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi2);
ubrk_close(bi1);
RETURN_FALSE;
}
ubrk_setText(bi2, ustring2, ustring2_len, &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);
intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #2 ($string2)", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi2);
ubrk_close(bi1);
RETURN_FALSE;
}
UCollator *collator = ucol_open("", &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);
intl_error_set_custom_msg(NULL, "Error on ucol_open", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi2);
ubrk_close(bi1);
ucol_close(collator);
RETURN_FALSE;
}
zend_long *p1, *p2, *tmp;
p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
for (i2 = 0; i2 <= strlen_2; i2++) {
p1[i2] = i2 * cost_ins;
}
int32_t current1 = 0;
int32_t current2 = 0;
int32_t pos1 = 0;
int32_t pos2 = 0;
while (true) {
current1 = ubrk_current(bi1);
pos1 = ubrk_next(bi1);
if (pos1 == UBRK_DONE) {
break;
}
p2[0] = p1[0] + cost_del;
for (i2 = 0, pos2 = 0; pos2 != UBRK_DONE; i2++) {
current2 = ubrk_current(bi2);
pos2 = ubrk_next(bi2);
if (pos2 == UBRK_DONE) {
break;
}
if (ucol_strcoll(collator, ustring1 + current1, pos1 - current1, ustring2 + current2, pos2 - current2) == UCOL_EQUAL) {
c0 = p1[i2];
} else {
c0 = p1[i2] + cost_rep;
}
c1 = p1[i2 + 1] + cost_del;
if (c1 < c0) {
c0 = c1;
}
c2 = p2[i2] + cost_ins;
if (c2 < c0) {
c0 = c2;
}
p2[i2 + 1] = c0;
}
ubrk_first(bi2);
tmp = p1;
p1 = p2;
p2 = tmp;
}
ucol_close(collator);
ubrk_close(bi1);
ubrk_close(bi2);
efree(ustring1);
efree(ustring2);
retval = p1[strlen_2];
efree(p1);
efree(p2);
RETURN_LONG(retval);
}
/* }}} */ /* }}} */

View file

@ -447,6 +447,8 @@ function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle =
function grapheme_str_split(string $string, int $length = 1): array|false {} function grapheme_str_split(string $string, int $length = 1): array|false {}
function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1): int|false {}
/** @param int $next */ /** @param int $next */
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {} function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}

View file

@ -1,5 +1,5 @@
/* This is a generated file, edit the .stub.php file instead. /* This is a generated file, edit the .stub.php file instead.
* Stub hash: 4fb44fc170e74af2e9fb52c5a1029004f708fcda */ * Stub hash: adcf3b6ef720a518087efedbe2b62b10ad4b2624 */
ZEND_BEGIN_ARG_WITH_RETURN_OBJ_INFO_EX(arginfo_intlcal_create_instance, 0, 0, IntlCalendar, 1) ZEND_BEGIN_ARG_WITH_RETURN_OBJ_INFO_EX(arginfo_intlcal_create_instance, 0, 0, IntlCalendar, 1)
ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, timezone, "null") ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, timezone, "null")
@ -489,6 +489,14 @@ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_str_split, 0, 1, MAY_BE
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, length, IS_LONG, 0, "1") ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, length, IS_LONG, 0, "1")
ZEND_END_ARG_INFO() ZEND_END_ARG_INFO()
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_levenshtein, 0, 2, MAY_BE_LONG|MAY_BE_FALSE)
ZEND_ARG_TYPE_INFO(0, string1, IS_STRING, 0)
ZEND_ARG_TYPE_INFO(0, string2, IS_STRING, 0)
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, insertion_cost, IS_LONG, 0, "1")
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, replacement_cost, IS_LONG, 0, "1")
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, deletion_cost, IS_LONG, 0, "1")
ZEND_END_ARG_INFO()
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_extract, 0, 2, MAY_BE_STRING|MAY_BE_FALSE) ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_extract, 0, 2, MAY_BE_STRING|MAY_BE_FALSE)
ZEND_ARG_TYPE_INFO(0, haystack, IS_STRING, 0) ZEND_ARG_TYPE_INFO(0, haystack, IS_STRING, 0)
ZEND_ARG_TYPE_INFO(0, size, IS_LONG, 0) ZEND_ARG_TYPE_INFO(0, size, IS_LONG, 0)
@ -903,6 +911,7 @@ ZEND_FUNCTION(grapheme_substr);
ZEND_FUNCTION(grapheme_strstr); ZEND_FUNCTION(grapheme_strstr);
ZEND_FUNCTION(grapheme_stristr); ZEND_FUNCTION(grapheme_stristr);
ZEND_FUNCTION(grapheme_str_split); ZEND_FUNCTION(grapheme_str_split);
ZEND_FUNCTION(grapheme_levenshtein);
ZEND_FUNCTION(grapheme_extract); ZEND_FUNCTION(grapheme_extract);
ZEND_FUNCTION(idn_to_ascii); ZEND_FUNCTION(idn_to_ascii);
ZEND_FUNCTION(idn_to_utf8); ZEND_FUNCTION(idn_to_utf8);
@ -1091,6 +1100,7 @@ static const zend_function_entry ext_functions[] = {
ZEND_FE(grapheme_strstr, arginfo_grapheme_strstr) ZEND_FE(grapheme_strstr, arginfo_grapheme_strstr)
ZEND_FE(grapheme_stristr, arginfo_grapheme_stristr) ZEND_FE(grapheme_stristr, arginfo_grapheme_stristr)
ZEND_FE(grapheme_str_split, arginfo_grapheme_str_split) ZEND_FE(grapheme_str_split, arginfo_grapheme_str_split)
ZEND_FE(grapheme_levenshtein, arginfo_grapheme_levenshtein)
ZEND_FE(grapheme_extract, arginfo_grapheme_extract) ZEND_FE(grapheme_extract, arginfo_grapheme_extract)
ZEND_FE(idn_to_ascii, arginfo_idn_to_ascii) ZEND_FE(idn_to_ascii, arginfo_idn_to_ascii)
ZEND_FE(idn_to_utf8, arginfo_idn_to_utf8) ZEND_FE(idn_to_utf8, arginfo_idn_to_utf8)

View file

@ -0,0 +1,128 @@
--TEST--
grapheme_levenshtein() function test
--EXTENSIONS--
intl
--FILE--
<?php
echo '--- Equal ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('12345', '12345'));
echo '--- First string empty ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('', 'xyz'));
echo '--- Second string empty ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('xyz', ''));
echo '--- Both empty ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('', ''));
var_dump(grapheme_levenshtein('', '', 10, 10, 10));
echo '--- 1 character ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('1', '2'));
echo '--- 2 character swapped ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('12', '21'));
echo '--- Inexpensive deletion ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('2121', '11', 2));
echo '--- Expensive deletion ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('2121', '11', 2, 1, 5));
//
echo '--- Inexpensive insertion ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('11', '2121'));
echo '--- Expensive insertion ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('11', '2121', 5));
echo '--- Expensive replacement ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('111', '121', 2, 3, 2));
echo '--- Very expensive replacement ---' . \PHP_EOL;
var_dump(grapheme_levenshtein('111', '121', 2, 9, 2));
echo '--- 128 codepoints ---' . \PHP_EOL;
var_dump(grapheme_levenshtein(str_repeat("a", 128), str_repeat("a", 125) . "abc"));
echo '--- 128 codepoints over ---' . \PHP_EOL;
var_dump(grapheme_levenshtein(str_repeat("a", 128) . "abc", str_repeat("a", 128) . "aaa"));
var_dump(grapheme_levenshtein(str_repeat("a", 256) . "abc", "aaa"));
echo '--- 128 codepoints over only $string1 ---' . \PHP_EOL;
var_dump(grapheme_levenshtein(str_repeat("a", 128) . "abc", "aaa"));
echo '--- 128 codepoints over only $string2 ---' . \PHP_EOL;
var_dump(grapheme_levenshtein("abc", str_repeat("a", 128) . "aaa"));
echo '--- 128 codepoints over Hiragana ---' . \PHP_EOL;
var_dump(grapheme_levenshtein(str_repeat("あ", 128) . "あああ", str_repeat("あ", 128) . "あいう"));
echo '--- Variable selector ---' . \PHP_EOL;
$ka = "カ́";
var_dump(grapheme_levenshtein("カ", $ka));
// $nabe and $nabe_E0100 look identical when rendered.
// However, $nabe_E0100 is U+908A followed by the variation selector U+E0100,
// so the expected grapheme_levenshtein() result is 0.
$nabe = '邊';
$nabe_E0100 = "邊󠄀";
var_dump(grapheme_levenshtein($nabe, $nabe_E0100));
// combining character
var_dump(grapheme_levenshtein("\u{0065}\u{0301}", "\u{00e9}"));
// Corner case
echo '--- Corner case ---' . PHP_EOL;
try {
grapheme_levenshtein($nabe, $nabe_E0100, 0, 1, 1);
} catch (ValueError $e) {
echo $e->getMessage() . PHP_EOL;
}
try {
grapheme_levenshtein($nabe, $nabe_E0100, 1, 0, 1);
} catch (ValueError $e) {
echo $e->getMessage() . PHP_EOL;
}
try {
grapheme_levenshtein($nabe, $nabe_E0100, 1, 1, 0);
} catch (ValueError $e) {
echo $e->getMessage() . PHP_EOL;
}
?>
--EXPECTF--
--- Equal ---
int(0)
--- First string empty ---
int(3)
--- Second string empty ---
int(3)
--- Both empty ---
int(0)
int(0)
--- 1 character ---
int(1)
--- 2 character swapped ---
int(2)
--- Inexpensive deletion ---
int(2)
--- Expensive deletion ---
int(10)
--- Inexpensive insertion ---
int(2)
--- Expensive insertion ---
int(10)
--- Expensive replacement ---
int(3)
--- Very expensive replacement ---
int(4)
--- 128 codepoints ---
int(2)
--- 128 codepoints over ---
int(2)
int(256)
--- 128 codepoints over only $string1 ---
int(128)
--- 128 codepoints over only $string2 ---
int(130)
--- 128 codepoints over Hiragana ---
int(2)
--- Variable selector ---
int(1)
int(0)
int(0)
--- Corner case ---
grapheme_levenshtein(): Argument #3 ($insertion_cost) must be greater than 0 and less than or equal to %d
grapheme_levenshtein(): Argument #4 ($replacement_cost) must be greater than 0 and less than or equal to %d
grapheme_levenshtein(): Argument #5 ($deletion_cost) must be greater than 0 and less than or equal to %d