Refactor levenshtein()

Closes GH-5816
This commit is contained in:
George Peter Banyard 2020-07-06 19:04:52 +02:00
parent 91fbd12d57
commit c4a0ba8d6e
8 changed files with 158 additions and 129 deletions

View file

@ -985,7 +985,7 @@ function iptcparse(string $iptcblock): array|false {}
/* levenshtein.c */ /* levenshtein.c */
function levenshtein(string $str1, string $str2, $cost_ins = UNKNOWN, int $cost_rep = UNKNOWN, int $cost_del = UNKNOWN): int {} function levenshtein(string $str1, string $str2, int $cost_ins = 1, int $cost_rep = 1, int $cost_del = 1): int {}
/* link.c */ /* link.c */

View file

@ -1,5 +1,5 @@
/* This is a generated file, edit the .stub.php file instead. /* This is a generated file, edit the .stub.php file instead.
* Stub hash: 9cf2c691c081d9aaefaa9d22337a9e00efb0af77 */ * Stub hash: 05b740207a70a9d272b4c327882fd0b52016a0af */
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_set_time_limit, 0, 1, _IS_BOOL, 0) ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_set_time_limit, 0, 1, _IS_BOOL, 0)
ZEND_ARG_TYPE_INFO(0, seconds, IS_LONG, 0) ZEND_ARG_TYPE_INFO(0, seconds, IS_LONG, 0)
@ -1568,9 +1568,9 @@ ZEND_END_ARG_INFO()
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_levenshtein, 0, 2, IS_LONG, 0) ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_levenshtein, 0, 2, IS_LONG, 0)
ZEND_ARG_TYPE_INFO(0, str1, IS_STRING, 0) ZEND_ARG_TYPE_INFO(0, str1, IS_STRING, 0)
ZEND_ARG_TYPE_INFO(0, str2, IS_STRING, 0) ZEND_ARG_TYPE_INFO(0, str2, IS_STRING, 0)
ZEND_ARG_INFO(0, cost_ins) ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_ins, IS_LONG, 0, "1")
ZEND_ARG_TYPE_INFO(0, cost_rep, IS_LONG, 0) ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_rep, IS_LONG, 0, "1")
ZEND_ARG_TYPE_INFO(0, cost_del, IS_LONG, 0) ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, cost_del, IS_LONG, 0, "1")
ZEND_END_ARG_INFO() ZEND_END_ARG_INFO()
#if defined(HAVE_SYMLINK) || defined(PHP_WIN32) #if defined(HAVE_SYMLINK) || defined(PHP_WIN32)

View file

@ -15,42 +15,36 @@
*/ */
#include "php.h" #include "php.h"
#include <stdlib.h>
#include <errno.h>
#include <ctype.h>
#include "php_string.h" #include "php_string.h"
#define LEVENSHTEIN_MAX_LENGTH 255 #define LEVENSHTEIN_MAX_LENGTH 255
/* {{{ reference_levdist /* {{{ reference_levdist
* reference implementation, only optimized for memory usage, not speed */ * reference implementation, only optimized for memory usage, not speed */
static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, size_t l2, zend_long cost_ins, zend_long cost_rep, zend_long cost_del ) static zend_long reference_levdist(const zend_string *string1, const zend_string *string2, zend_long cost_ins, zend_long cost_rep, zend_long cost_del )
{ {
zend_long *p1, *p2, *tmp; zend_long *p1, *p2, *tmp;
zend_long c0, c1, c2; zend_long c0, c1, c2;
size_t i1, i2; size_t i1, i2;
if (l1 == 0) { if (ZSTR_LEN(string1) == 0) {
return l2 * cost_ins; return ZSTR_LEN(string2) * cost_ins;
} }
if (l2 == 0) { if (ZSTR_LEN(string2) == 0) {
return l1 * cost_del; return ZSTR_LEN(string1) * cost_del;
} }
if ((l1 > LEVENSHTEIN_MAX_LENGTH) || (l2 > LEVENSHTEIN_MAX_LENGTH)) { p1 = safe_emalloc((ZSTR_LEN(string2) + 1), sizeof(zend_long), 0);
return -1; p2 = safe_emalloc((ZSTR_LEN(string2) + 1), sizeof(zend_long), 0);
}
p1 = safe_emalloc((l2 + 1), sizeof(zend_long), 0);
p2 = safe_emalloc((l2 + 1), sizeof(zend_long), 0);
for (i2 = 0; i2 <= l2; i2++) { for (i2 = 0; i2 <= ZSTR_LEN(string2); i2++) {
p1[i2] = i2 * cost_ins; p1[i2] = i2 * cost_ins;
} }
for (i1 = 0; i1 < l1 ; i1++) { for (i1 = 0; i1 < ZSTR_LEN(string1) ; i1++) {
p2[0] = p1[0] + cost_del; p2[0] = p1[0] + cost_del;
for (i2 = 0; i2 < l2; i2++) { for (i2 = 0; i2 < ZSTR_LEN(string2); i2++) {
c0 = p1[i2] + ((s1[i1] == s2[i2]) ? 0 : cost_rep); c0 = p1[i2] + ((ZSTR_VAL(string1)[i1] == ZSTR_VAL(string2)[i2]) ? 0 : cost_rep);
c1 = p1[i2 + 1] + cost_del; c1 = p1[i2 + 1] + cost_del;
if (c1 < c0) { if (c1 < c0) {
c0 = c1; c0 = c1;
@ -65,7 +59,7 @@ static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, si
p1 = p2; p1 = p2;
p2 = tmp; p2 = tmp;
} }
c0 = p1[l2]; c0 = p1[ZSTR_LEN(string2)];
efree(p1); efree(p1);
efree(p2); efree(p2);
@ -74,56 +68,31 @@ static zend_long reference_levdist(const char *s1, size_t l1, const char *s2, si
} }
/* }}} */ /* }}} */
/* {{{ custom_levdist */
static int custom_levdist(char *str1, char *str2, char *callback_name)
{
php_error_docref(NULL, E_WARNING, "The general Levenshtein support is not there yet");
/* not there yet */
return -1;
}
/* }}} */
/* {{{ Calculate Levenshtein distance between two strings */ /* {{{ Calculate Levenshtein distance between two strings */
PHP_FUNCTION(levenshtein) PHP_FUNCTION(levenshtein)
{ {
int argc = ZEND_NUM_ARGS(); zend_string *string1, *string2;
char *str1, *str2; zend_long cost_ins = 1;
char *callback_name; zend_long cost_rep = 1;
size_t str1_len, str2_len, callback_len; zend_long cost_del = 1;
zend_long cost_ins, cost_rep, cost_del; zend_long distance = 0;
zend_long distance = -1;
switch (argc) { if (zend_parse_parameters(ZEND_NUM_ARGS(), "SS|lll", &string1, &string2, &cost_ins, &cost_rep, &cost_del) == FAILURE) {
case 2: /* just two strings: use maximum performance version */ RETURN_THROWS();
if (zend_parse_parameters(2, "ss", &str1, &str1_len, &str2, &str2_len) == FAILURE) {
RETURN_THROWS();
}
distance = reference_levdist(str1, str1_len, str2, str2_len, 1, 1, 1);
break;
case 5: /* more general version: calc cost by ins/rep/del weights */
if (zend_parse_parameters(5, "sslll", &str1, &str1_len, &str2, &str2_len, &cost_ins, &cost_rep, &cost_del) == FAILURE) {
RETURN_THROWS();
}
distance = reference_levdist(str1, str1_len, str2, str2_len, cost_ins, cost_rep, cost_del);
break;
case 3: /* most general version: calc cost by user-supplied function */
if (zend_parse_parameters(3, "sss", &str1, &str1_len, &str2, &str2_len, &callback_name, &callback_len) == FAILURE) {
RETURN_THROWS();
}
distance = custom_levdist(str1, str2, callback_name);
break;
default:
WRONG_PARAM_COUNT;
} }
if (distance < 0 && /* TODO */ ZEND_NUM_ARGS() != 3) { if (ZSTR_LEN(string1) > LEVENSHTEIN_MAX_LENGTH) {
php_error_docref(NULL, E_WARNING, "Argument string(s) too long"); zend_argument_value_error(1, "must be less than %d characters", LEVENSHTEIN_MAX_LENGTH + 1);
RETURN_THROWS();
} }
if (ZSTR_LEN(string2) > LEVENSHTEIN_MAX_LENGTH) {
zend_argument_value_error(2, "must be less than %d characters", LEVENSHTEIN_MAX_LENGTH + 1);
RETURN_THROWS();
}
distance = reference_levdist(string1, string2, cost_ins, cost_rep, cost_del);
RETURN_LONG(distance); RETURN_LONG(distance);
} }
/* }}} */ /* }}} */

View file

@ -3,76 +3,61 @@ levenshtein() function test
--FILE-- --FILE--
<?php <?php
function test_me($title,$expect,$text1,$text2,$cost1="",$cost2="",$cost3="") { echo '--- Equal ---' . \PHP_EOL;
var_dump(levenshtein('12345', '12345'));
if ($cost1=="") { echo '--- First string empty ---' . \PHP_EOL;
$result=levenshtein($text1,$text2); var_dump(levenshtein('', 'xyz'));
} echo '--- Second string empty ---' . \PHP_EOL;
elseif ($cost2=="") { var_dump(levenshtein('xyz', ''));
$result=levenshtein($text1,$text2,$cost1); echo '--- Both empty ---' . \PHP_EOL;
} var_dump(levenshtein('', ''));
else { var_dump(levenshtein('', '', 10, 10, 10));
$result=levenshtein($text1,$text2,$cost1,$cost2,$cost3);
}
if($result==$expect) return 0;
echo "$title: result is $result instead of $expect "; echo '--- 1 character ---' . \PHP_EOL;
echo "for '$text1'/'$text2' "; var_dump(levenshtein('1', '2'));
if($cost1) echo "($cost1:$cost2:$cost3)"; echo '--- 2 character swapped ---' . \PHP_EOL;
echo "\n"; var_dump(levenshtein('12', '21'));
return 1; echo '--- Inexpensive deletion ---' . \PHP_EOL;
} var_dump(levenshtein('2121', '11', 2));
echo '--- Expensive deletion ---' . \PHP_EOL;
var_dump(levenshtein('2121', '11', 2, 1, 5));
$n=0; echo '--- Inexpensive insertion ---' . \PHP_EOL;
var_dump(levenshtein('11', '2121'));
echo '--- Expensive insertion ---' . \PHP_EOL;
var_dump(levenshtein('11', '2121', 5));
$n += test_me("equal" , 0, "12345", "12345"); echo '--- Expensive replacement ---' . \PHP_EOL;
$n += test_me("1st empty" , 3, "", "xzy"); var_dump(levenshtein('111', '121', 2, 3, 2));
$n += test_me("2nd empty" , 3, "xzy", ""); echo '--- Very expensive replacement ---' . \PHP_EOL;
$n += test_me("both empty" , 0, "", ""); var_dump(levenshtein('111', '121', 2, 9, 2));
$n += test_me("1 char" , 1, "1", "2");
$n += test_me("2 char swap", 2, "12", "21");
$n += test_me("inexpensive delete", 2, "2121", "11", 2, 1, 1);
$n += test_me("expensive delete" , 10, "2121", "11", 2, 1, 5);
$n += test_me("inexpensive insert", 2, "11", "2121", 1, 1, 1);
$n += test_me("expensive insert" , 10, "11", "2121", 5, 1, 1);
$n += test_me("expensive replace" , 3, "111", "121", 2, 3, 2);
$n += test_me("very expensive replace", 4, "111", "121", 2, 9, 2);
$n += test_me("bug #7368", 2, "13458", "12345");
$n += test_me("bug #7368", 2, "1345", "1234");
$n += test_me("bug #6562", 1, "debugg", "debug");
$n += test_me("bug #6562", 1, "ddebug", "debug");
$n += test_me("bug #6562", 2, "debbbug", "debug");
$n += test_me("bug #6562", 1, "debugging", "debuging");
$n += test_me("bug #16473", 2, "a", "bc");
$n += test_me("bug #16473", 2, "xa", "xbc");
$n += test_me("bug #16473", 2, "xax", "xbcx");
$n += test_me("bug #16473", 2, "ax", "bcx");
$n += test_me("custom", -1, "111", "121", "my_levcode");
$n += test_me("lt maxlength1", 254, "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu", "A");
$n += test_me("gt maxlength1", -1, "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv", "A");
$n += test_me("lt maxlength2", 254, "A", "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu");
$n += test_me("gt maxlength2", -1, "A", "AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv");
echo ($n==0)?"all passed\n":"$n failed\n";
var_dump(levenshtein("", "", -1, -1, -1));
var_dump(levenshtein("", "", 10, 10, 10));
?> ?>
--EXPECTF-- --EXPECT--
Warning: levenshtein(): The general Levenshtein support is not there yet in %s on line %d --- Equal ---
int(0)
Warning: levenshtein(): Argument string(s) too long in %s on line %d --- First string empty ---
int(3)
Warning: levenshtein(): Argument string(s) too long in %s on line %d --- Second string empty ---
all passed int(3)
--- Both empty ---
int(0) int(0)
int(0) int(0)
--- 1 character ---
int(1)
--- 2 character swapped ---
int(2)
--- Inexpensive deletion ---
int(2)
--- Expensive deletion ---
int(10)
--- Inexpensive insertion ---
int(2)
--- Expensive insertion ---
int(10)
--- Expensive replacement ---
int(3)
--- Very expensive replacement ---
int(4)

View file

@ -0,0 +1,16 @@
--TEST--
levenshtein() bug 16473
--FILE--
<?php
var_dump(levenshtein('a', 'bc'));
var_dump(levenshtein('xa', 'xbc'));
var_dump(levenshtein('xax', 'xbcx'));
var_dump(levenshtein('ax', 'bcx'));
?>
--EXPECT--
int(2)
int(2)
int(2)
int(2)

View file

@ -0,0 +1,16 @@
--TEST--
levenshtein() bug 6562
--FILE--
<?php
var_dump(levenshtein('debugg', 'debug'));
var_dump(levenshtein('ddebug', 'debug'));
var_dump(levenshtein('debbbug', 'debug'));
var_dump(levenshtein('debugging', 'debuging'));
?>
--EXPECT--
int(1)
int(1)
int(2)
int(1)

View file

@ -0,0 +1,12 @@
--TEST--
levenshtein() bug 7368
--FILE--
<?php
var_dump(levenshtein('13458', '12345'));
var_dump(levenshtein('1345', '1234'));
?>
--EXPECT--
int(2)
int(2)

View file

@ -0,0 +1,31 @@
--TEST--
levenshtein() error conditions
--FILE--
<?php
echo '--- String 1 ---' . \PHP_EOL;
var_dump(levenshtein('AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu', 'A'));
try {
var_dump(levenshtein('AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv', 'A'));
} catch (\ValueError $e) {
echo $e->getMessage() . \PHP_EOL;
}
echo '--- String 2 ---' . \PHP_EOL;
var_dump(levenshtein('A', 'AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsu'));
try {
var_dump(levenshtein('A', 'AbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrstuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuvwxyzAbcdefghijklmnopqrtsuv'));
} catch (\ValueError $e) {
echo $e->getMessage() . \PHP_EOL;
}
// TODO ValueError for negative costs?
// var_dump(levenshtein("", "", -1, -1, -1));
?>
--EXPECT--
--- String 1 ---
int(254)
levenshtein(): Argument #1 ($str1) must be less than 256 characters
--- String 2 ---
int(254)
levenshtein(): Argument #2 ($str2) must be less than 256 characters