mirror of
https://github.com/php/php-src.git
synced 2025-08-15 13:38:49 +02:00
Add grapheme_levenshtein function. (#18087)
Measure levenshtein for grapheme cluster unit
This commit is contained in:
parent
6fa669a125
commit
bdcea111f3
6 changed files with 359 additions and 1 deletions
1
NEWS
1
NEWS
|
@ -89,6 +89,7 @@ PHP NEWS
|
|||
. Added Locale::isRightToLeft to check if a locale is written right to left.
|
||||
(David Carlier)
|
||||
. Added null bytes presence in locale inputs for Locale class. (David Carlier)
|
||||
. Added grapheme_levenshtein() function. (Yuya Hamada)
|
||||
|
||||
- MySQLi:
|
||||
. Fixed bugs GH-17900 and GH-8084 (calling mysqli::__construct twice).
|
||||
|
|
|
@ -319,6 +319,8 @@ PHP 8.5 UPGRADE NOTES
|
|||
- Intl:
|
||||
. Added locale_is_right_to_left/Locale::isRightToLeft, returns true if
|
||||
the locale is written right to left (after its enrichment with likely subtags).
|
||||
. Added grapheme_levenshtein() function.
|
||||
RFC: https://wiki.php.net/rfc/grapheme_levenshtein
|
||||
|
||||
- Pdo\Sqlite:
|
||||
. Added support for Pdo\Sqlite::setAuthorizer(), which is the equivalent of
|
||||
|
|
|
@ -918,4 +918,219 @@ PHP_FUNCTION(grapheme_str_split)
|
|||
ubrk_close(bi);
|
||||
}
|
||||
|
||||
PHP_FUNCTION(grapheme_levenshtein)
|
||||
{
|
||||
zend_string *string1, *string2;
|
||||
zend_long cost_ins = 1;
|
||||
zend_long cost_rep = 1;
|
||||
zend_long cost_del = 1;
|
||||
|
||||
ZEND_PARSE_PARAMETERS_START(2, 5)
|
||||
Z_PARAM_STR(string1)
|
||||
Z_PARAM_STR(string2)
|
||||
Z_PARAM_OPTIONAL
|
||||
Z_PARAM_LONG(cost_ins)
|
||||
Z_PARAM_LONG(cost_rep)
|
||||
Z_PARAM_LONG(cost_del)
|
||||
ZEND_PARSE_PARAMETERS_END();
|
||||
|
||||
if (cost_ins <= 0 || cost_ins > UINT_MAX / 4) {
|
||||
zend_argument_value_error(3, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
|
||||
RETURN_THROWS();
|
||||
}
|
||||
|
||||
if (cost_rep <= 0 || cost_rep > UINT_MAX / 4) {
|
||||
zend_argument_value_error(4, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
|
||||
RETURN_THROWS();
|
||||
}
|
||||
|
||||
if (cost_del <= 0 || cost_del > UINT_MAX / 4) {
|
||||
zend_argument_value_error(5, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
|
||||
RETURN_THROWS();
|
||||
}
|
||||
|
||||
zend_long c0, c1, c2;
|
||||
zend_long retval;
|
||||
size_t i2;
|
||||
char *pstr1, *pstr2;
|
||||
|
||||
UChar *ustring1 = NULL;
|
||||
UChar *ustring2 = NULL;
|
||||
|
||||
int32_t ustring1_len = 0;
|
||||
int32_t ustring2_len = 0;
|
||||
|
||||
UErrorCode ustatus = U_ZERO_ERROR;
|
||||
|
||||
/* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
|
||||
* that the distance is symmetric. If string1 is shorter than string2 we can save memory (and CPU time)
|
||||
* by having shorter rows (p1 & p2). */
|
||||
if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) {
|
||||
zend_string *tmp = string1;
|
||||
string1 = string2;
|
||||
string2 = tmp;
|
||||
}
|
||||
|
||||
pstr1 = ZSTR_VAL(string1);
|
||||
pstr2 = ZSTR_VAL(string2);
|
||||
|
||||
intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus);
|
||||
|
||||
if (U_FAILURE(ustatus)) {
|
||||
intl_error_set_code(NULL, ustatus);
|
||||
|
||||
intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0);
|
||||
efree(ustring1);
|
||||
RETURN_FALSE;
|
||||
}
|
||||
|
||||
intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus);
|
||||
|
||||
if (U_FAILURE(ustatus)) {
|
||||
intl_error_set_code(NULL, ustatus);
|
||||
|
||||
intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0);
|
||||
efree(ustring2);
|
||||
efree(ustring1);
|
||||
RETURN_FALSE;
|
||||
}
|
||||
|
||||
UBreakIterator *bi1, *bi2;
|
||||
|
||||
int32_t strlen_1, strlen_2;
|
||||
strlen_1 = grapheme_split_string(ustring1, ustring1_len, NULL, 0);
|
||||
strlen_2 = grapheme_split_string(ustring2, ustring2_len, NULL, 0);
|
||||
|
||||
if (strlen_1 == 0) {
|
||||
efree(ustring1);
|
||||
efree(ustring2);
|
||||
RETURN_LONG(strlen_2 * cost_ins);
|
||||
}
|
||||
if (strlen_2 == 0) {
|
||||
efree(ustring1);
|
||||
efree(ustring2);
|
||||
RETURN_LONG(strlen_1 * cost_del);
|
||||
}
|
||||
|
||||
unsigned char u_break_iterator_buffer1[U_BRK_SAFECLONE_BUFFERSIZE];
|
||||
unsigned char u_break_iterator_buffer2[U_BRK_SAFECLONE_BUFFERSIZE];
|
||||
bi1 = grapheme_get_break_iterator(u_break_iterator_buffer1, &ustatus);
|
||||
if (U_FAILURE(ustatus)) {
|
||||
intl_error_set_code(NULL, ustatus);
|
||||
intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #1 ($string1)", 0);
|
||||
efree(ustring2);
|
||||
efree(ustring1);
|
||||
ubrk_close(bi1);
|
||||
RETURN_FALSE;
|
||||
}
|
||||
|
||||
bi2 = grapheme_get_break_iterator(u_break_iterator_buffer2, &ustatus);
|
||||
if (U_FAILURE(ustatus)) {
|
||||
intl_error_set_code(NULL, ustatus);
|
||||
intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #2 ($string2)", 0);
|
||||
efree(ustring2);
|
||||
efree(ustring1);
|
||||
ubrk_close(bi2);
|
||||
ubrk_close(bi1);
|
||||
RETURN_FALSE;
|
||||
}
|
||||
ubrk_setText(bi1, ustring1, ustring1_len, &ustatus);
|
||||
|
||||
if (U_FAILURE(ustatus)) {
|
||||
intl_error_set_code(NULL, ustatus);
|
||||
|
||||
intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #1 ($string1)", 0);
|
||||
efree(ustring2);
|
||||
efree(ustring1);
|
||||
ubrk_close(bi2);
|
||||
ubrk_close(bi1);
|
||||
RETURN_FALSE;
|
||||
}
|
||||
|
||||
ubrk_setText(bi2, ustring2, ustring2_len, &ustatus);
|
||||
if (U_FAILURE(ustatus)) {
|
||||
intl_error_set_code(NULL, ustatus);
|
||||
|
||||
intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #2 ($string2)", 0);
|
||||
efree(ustring2);
|
||||
efree(ustring1);
|
||||
ubrk_close(bi2);
|
||||
ubrk_close(bi1);
|
||||
RETURN_FALSE;
|
||||
}
|
||||
UCollator *collator = ucol_open("", &ustatus);
|
||||
if (U_FAILURE(ustatus)) {
|
||||
intl_error_set_code(NULL, ustatus);
|
||||
|
||||
intl_error_set_custom_msg(NULL, "Error on ucol_open", 0);
|
||||
efree(ustring2);
|
||||
efree(ustring1);
|
||||
ubrk_close(bi2);
|
||||
ubrk_close(bi1);
|
||||
ucol_close(collator);
|
||||
RETURN_FALSE;
|
||||
}
|
||||
|
||||
zend_long *p1, *p2, *tmp;
|
||||
p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
|
||||
p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
|
||||
|
||||
for (i2 = 0; i2 <= strlen_2; i2++) {
|
||||
p1[i2] = i2 * cost_ins;
|
||||
}
|
||||
|
||||
int32_t current1 = 0;
|
||||
int32_t current2 = 0;
|
||||
int32_t pos1 = 0;
|
||||
int32_t pos2 = 0;
|
||||
|
||||
while (true) {
|
||||
current1 = ubrk_current(bi1);
|
||||
pos1 = ubrk_next(bi1);
|
||||
if (pos1 == UBRK_DONE) {
|
||||
break;
|
||||
}
|
||||
p2[0] = p1[0] + cost_del;
|
||||
for (i2 = 0, pos2 = 0; pos2 != UBRK_DONE; i2++) {
|
||||
current2 = ubrk_current(bi2);
|
||||
pos2 = ubrk_next(bi2);
|
||||
if (pos2 == UBRK_DONE) {
|
||||
break;
|
||||
}
|
||||
if (ucol_strcoll(collator, ustring1 + current1, pos1 - current1, ustring2 + current2, pos2 - current2) == UCOL_EQUAL) {
|
||||
c0 = p1[i2];
|
||||
} else {
|
||||
c0 = p1[i2] + cost_rep;
|
||||
}
|
||||
c1 = p1[i2 + 1] + cost_del;
|
||||
if (c1 < c0) {
|
||||
c0 = c1;
|
||||
}
|
||||
c2 = p2[i2] + cost_ins;
|
||||
if (c2 < c0) {
|
||||
c0 = c2;
|
||||
}
|
||||
p2[i2 + 1] = c0;
|
||||
}
|
||||
ubrk_first(bi2);
|
||||
tmp = p1;
|
||||
p1 = p2;
|
||||
p2 = tmp;
|
||||
}
|
||||
|
||||
ucol_close(collator);
|
||||
|
||||
ubrk_close(bi1);
|
||||
ubrk_close(bi2);
|
||||
|
||||
efree(ustring1);
|
||||
efree(ustring2);
|
||||
|
||||
retval = p1[strlen_2];
|
||||
|
||||
efree(p1);
|
||||
efree(p2);
|
||||
RETURN_LONG(retval);
|
||||
}
|
||||
|
||||
/* }}} */
|
||||
|
|
|
@ -447,6 +447,8 @@ function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle =
|
|||
|
||||
function grapheme_str_split(string $string, int $length = 1): array|false {}
|
||||
|
||||
function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1): int|false {}
|
||||
|
||||
/** @param int $next */
|
||||
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}
|
||||
|
||||
|
|
12
ext/intl/php_intl_arginfo.h
generated
12
ext/intl/php_intl_arginfo.h
generated
|
@ -1,5 +1,5 @@
|
|||
/* This is a generated file, edit the .stub.php file instead.
|
||||
* Stub hash: 4fb44fc170e74af2e9fb52c5a1029004f708fcda */
|
||||
* Stub hash: adcf3b6ef720a518087efedbe2b62b10ad4b2624 */
|
||||
|
||||
ZEND_BEGIN_ARG_WITH_RETURN_OBJ_INFO_EX(arginfo_intlcal_create_instance, 0, 0, IntlCalendar, 1)
|
||||
ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, timezone, "null")
|
||||
|
@ -489,6 +489,14 @@ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_str_split, 0, 1, MAY_BE
|
|||
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, length, IS_LONG, 0, "1")
|
||||
ZEND_END_ARG_INFO()
|
||||
|
||||
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_levenshtein, 0, 2, MAY_BE_LONG|MAY_BE_FALSE)
|
||||
ZEND_ARG_TYPE_INFO(0, string1, IS_STRING, 0)
|
||||
ZEND_ARG_TYPE_INFO(0, string2, IS_STRING, 0)
|
||||
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, insertion_cost, IS_LONG, 0, "1")
|
||||
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, replacement_cost, IS_LONG, 0, "1")
|
||||
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, deletion_cost, IS_LONG, 0, "1")
|
||||
ZEND_END_ARG_INFO()
|
||||
|
||||
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_extract, 0, 2, MAY_BE_STRING|MAY_BE_FALSE)
|
||||
ZEND_ARG_TYPE_INFO(0, haystack, IS_STRING, 0)
|
||||
ZEND_ARG_TYPE_INFO(0, size, IS_LONG, 0)
|
||||
|
@ -903,6 +911,7 @@ ZEND_FUNCTION(grapheme_substr);
|
|||
ZEND_FUNCTION(grapheme_strstr);
|
||||
ZEND_FUNCTION(grapheme_stristr);
|
||||
ZEND_FUNCTION(grapheme_str_split);
|
||||
ZEND_FUNCTION(grapheme_levenshtein);
|
||||
ZEND_FUNCTION(grapheme_extract);
|
||||
ZEND_FUNCTION(idn_to_ascii);
|
||||
ZEND_FUNCTION(idn_to_utf8);
|
||||
|
@ -1091,6 +1100,7 @@ static const zend_function_entry ext_functions[] = {
|
|||
ZEND_FE(grapheme_strstr, arginfo_grapheme_strstr)
|
||||
ZEND_FE(grapheme_stristr, arginfo_grapheme_stristr)
|
||||
ZEND_FE(grapheme_str_split, arginfo_grapheme_str_split)
|
||||
ZEND_FE(grapheme_levenshtein, arginfo_grapheme_levenshtein)
|
||||
ZEND_FE(grapheme_extract, arginfo_grapheme_extract)
|
||||
ZEND_FE(idn_to_ascii, arginfo_idn_to_ascii)
|
||||
ZEND_FE(idn_to_utf8, arginfo_idn_to_utf8)
|
||||
|
|
128
ext/intl/tests/grapheme_levenshtein.phpt
Normal file
128
ext/intl/tests/grapheme_levenshtein.phpt
Normal file
|
@ -0,0 +1,128 @@
|
|||
--TEST--
|
||||
grapheme_levenshtein() function test
|
||||
--EXTENSIONS--
|
||||
intl
|
||||
--FILE--
|
||||
<?php
|
||||
echo '--- Equal ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein('12345', '12345'));
|
||||
|
||||
echo '--- First string empty ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein('', 'xyz'));
|
||||
echo '--- Second string empty ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein('xyz', ''));
|
||||
echo '--- Both empty ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein('', ''));
|
||||
var_dump(grapheme_levenshtein('', '', 10, 10, 10));
|
||||
|
||||
echo '--- 1 character ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein('1', '2'));
|
||||
echo '--- 2 character swapped ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein('12', '21'));
|
||||
|
||||
echo '--- Inexpensive deletion ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein('2121', '11', 2));
|
||||
echo '--- Expensive deletion ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein('2121', '11', 2, 1, 5));
|
||||
|
||||
//
|
||||
echo '--- Inexpensive insertion ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein('11', '2121'));
|
||||
echo '--- Expensive insertion ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein('11', '2121', 5));
|
||||
|
||||
echo '--- Expensive replacement ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein('111', '121', 2, 3, 2));
|
||||
echo '--- Very expensive replacement ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein('111', '121', 2, 9, 2));
|
||||
|
||||
echo '--- 128 codepoints ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein(str_repeat("a", 128), str_repeat("a", 125) . "abc"));
|
||||
echo '--- 128 codepoints over ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein(str_repeat("a", 128) . "abc", str_repeat("a", 128) . "aaa"));
|
||||
var_dump(grapheme_levenshtein(str_repeat("a", 256) . "abc", "aaa"));
|
||||
echo '--- 128 codepoints over only $string1 ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein(str_repeat("a", 128) . "abc", "aaa"));
|
||||
echo '--- 128 codepoints over only $string2 ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein("abc", str_repeat("a", 128) . "aaa"));
|
||||
echo '--- 128 codepoints over Hiragana ---' . \PHP_EOL;
|
||||
var_dump(grapheme_levenshtein(str_repeat("あ", 128) . "あああ", str_repeat("あ", 128) . "あいう"));
|
||||
|
||||
echo '--- Variable selector ---' . \PHP_EOL;
|
||||
$ka = "カ́";
|
||||
var_dump(grapheme_levenshtein("カ", $ka));
|
||||
// $nabe and $nabe_E0100 look identical when rendered.
|
||||
// However, $nabe_E0100 is U+908A followed by the variation selector U+E0100.
|
||||
// So the grapheme_levenshtein() result is expected to be 0.
|
||||
$nabe = '邊';
|
||||
$nabe_E0100 = "邊󠄀";
|
||||
var_dump(grapheme_levenshtein($nabe, $nabe_E0100));
|
||||
|
||||
// combining character
|
||||
var_dump(grapheme_levenshtein("\u{0065}\u{0301}", "\u{00e9}"));
|
||||
|
||||
// Corner case
|
||||
echo '--- Corner case ---' . PHP_EOL;
|
||||
try {
|
||||
grapheme_levenshtein($nabe, $nabe_E0100, 0, 1, 1);
|
||||
} catch (ValueError $e) {
|
||||
echo $e->getMessage() . PHP_EOL;
|
||||
}
|
||||
|
||||
try {
|
||||
grapheme_levenshtein($nabe, $nabe_E0100, 1, 0, 1);
|
||||
} catch (ValueError $e) {
|
||||
echo $e->getMessage() . PHP_EOL;
|
||||
}
|
||||
|
||||
try {
|
||||
grapheme_levenshtein($nabe, $nabe_E0100, 1, 1, 0);
|
||||
} catch (ValueError $e) {
|
||||
echo $e->getMessage() . PHP_EOL;
|
||||
}
|
||||
?>
|
||||
--EXPECTF--
|
||||
--- Equal ---
|
||||
int(0)
|
||||
--- First string empty ---
|
||||
int(3)
|
||||
--- Second string empty ---
|
||||
int(3)
|
||||
--- Both empty ---
|
||||
int(0)
|
||||
int(0)
|
||||
--- 1 character ---
|
||||
int(1)
|
||||
--- 2 character swapped ---
|
||||
int(2)
|
||||
--- Inexpensive deletion ---
|
||||
int(2)
|
||||
--- Expensive deletion ---
|
||||
int(10)
|
||||
--- Inexpensive insertion ---
|
||||
int(2)
|
||||
--- Expensive insertion ---
|
||||
int(10)
|
||||
--- Expensive replacement ---
|
||||
int(3)
|
||||
--- Very expensive replacement ---
|
||||
int(4)
|
||||
--- 128 codepoints ---
|
||||
int(2)
|
||||
--- 128 codepoints over ---
|
||||
int(2)
|
||||
int(256)
|
||||
--- 128 codepoints over only $string1 ---
|
||||
int(128)
|
||||
--- 128 codepoints over only $string2 ---
|
||||
int(130)
|
||||
--- 128 codepoints over Hiragana ---
|
||||
int(2)
|
||||
--- Variable selector ---
|
||||
int(1)
|
||||
int(0)
|
||||
int(0)
|
||||
--- Corner case ---
|
||||
grapheme_levenshtein(): Argument #3 ($insertion_cost) must be greater than 0 and less than or equal to %d
|
||||
grapheme_levenshtein(): Argument #4 ($replacement_cost) must be greater than 0 and less than or equal to %d
|
||||
grapheme_levenshtein(): Argument #5 ($deletion_cost) must be greater than 0 and less than or equal to %d
|
Loading…
Add table
Add a link
Reference in a new issue