php-src/ext/mbstring/tests/encoding_tests.inc
Alex Dowad 3c73225125 New internal interface for fast text conversion in mbstring
When converting text to/from wchars, mbstring makes one function call
for each and every byte or wchar to be converted. Typically, each of
these conversion functions contains a state machine, and its state has
to be restored and then saved for every single one of these calls.
It doesn't take much to see that this is grossly inefficient.

Instead of converting one byte or wchar on each call, the new
conversion functions will either fill up or drain a whole buffer of
wchars on each call. In benchmarks, this is about 3-10× faster.

Adding the new, faster conversion functions for all supported legacy
text encodings still needs some work. Also, all the code which uses
the old-style conversion functions needs to be converted to use the
new ones. After that, the old code can be dropped. (The mailparse
extension will also have to be fixed up so it will still compile.)
2021-12-21 08:33:11 +02:00

246 lines
9.5 KiB
PHP

<?php
// Common code for tests which focus on conversion and verification of text
// in some specific encoding
/**
 * Load a character conversion table from a text file.
 *
 * Every data line is expected to look like `0xNN<TAB>0xNNNN`: first the code
 * of a character in the encoding under test, then its Unicode codepoint.
 * Lines starting with '#' and lines which do not match that format are
 * skipped. If the same character or codepoint appears more than once, the
 * last occurrence wins.
 *
 * @param string $path  Path of the conversion table file
 * @param array  $from  Receives a map: encoded character (binary string) => packed codepoint
 * @param array  $to    Receives the reverse map: packed codepoint => encoded character
 * @param bool   $utf32 If true, codepoints are packed as 4 big-endian bytes; otherwise as 2
 */
function readConversionTable($path, &$from, &$to, $utf32 = false) {
    $from = array();
    $to = array();

    // The table is only ever read; opening with 'r+' would needlessly require
    // write permission on the file
    $fp = fopen($path, 'r');
    if ($fp === false)
        die("Could not open conversion table: $path");

    // Compare against false explicitly so that a falsy line (such as a final
    // "0" with no newline) cannot end the loop early
    while (($line = fgets($fp, 256)) !== false) {
        if ($line[0] === '#')
            continue;

        if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) !== 2)
            continue;

        $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);

        if ($char === PHP_INT_MAX) {
            // We may be on a 32-bit machine and testing a text encoding with 4-byte codes
            // (which can't be represented in a PHP integer)
            // Re-parse the hex digits after the leading "0x" two at a time
            $char = "";
            $lineLength = strlen($line);
            for ($i = 2; $i < $lineLength; $i += 2) {
                $substr = substr($line, $i, 2);
                if (!ctype_xdigit($substr))
                    break;
                $char .= chr(hexdec($substr));
            }
        } else if ($char <= 0xFF) {
            $char = chr($char); // hex codes must not have leading zero bytes
        } else if ($char <= 0xFFFF) {
            $char = pack('n', $char);
        } else if ($char <= 0xFFFFFF) {
            $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
        } else {
            $char = pack('N', $char);
        }

        $from[$char] = $codepoint;
        $to[$codepoint] = $char;
    }

    fclose($fp);
}
/**
 * Format a string for use in a failure message.
 *
 * The result always ends with the bytes of `$str` in hex, in parentheses.
 * If `$str` passes mb_check_encoding() as ASCII, the hex dump is preceded by
 * the string itself in double quotes.
 */
function dbgPrint($str) {
    $isAscii = mb_check_encoding($str, 'ASCII');
    $hexDump = "(" . bin2hex($str) . ")";
    return $isAscii ? '"' . $str . '" ' . $hexDump : $hexDump;
}
/**
 * Check that mb_check_encoding() accepts `$goodString` as `$encoding`.
 * If it does not, the script is terminated with a diagnostic message.
 */
function identifyValidString($goodString, $encoding) {
    if (mb_check_encoding($goodString, $encoding)) {
        return;
    }
    die("mb_check_encoding failed on good $encoding string: " . dbgPrint($goodString));
}
/**
 * Check that mb_check_encoding() rejects `$badString` as `$encoding`.
 * If the string is accepted, the script is terminated with a diagnostic message.
 */
function identifyInvalidString($badString, $encoding) {
    if (!mb_check_encoding($badString, $encoding)) {
        return;
    }
    die("mb_check_encoding passed on bad $encoding string: " . dbgPrint($badString));
}
/**
 * Convert `$fromString` from `$fromEncoding` to `$toEncoding` with
 * mb_convert_encoding() and require the output to be identical to `$toString`.
 * On a mismatch, the script is terminated with a message showing the input,
 * the expected output and the actual output.
 */
function testConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    $actual = mb_convert_encoding($fromString, $toEncoding, $fromEncoding);
    if ($actual === $toString) {
        return;
    }
    die("mb_convert_encoding not working on $fromEncoding input: " . dbgPrint($fromString) . "\nExpected $toEncoding: " . dbgPrint($toString) . "\nActually got: " . dbgPrint($actual));
}
/**
 * Run testConversion() on a valid input string and additionally require that
 * the 'illegal_chars' value reported by mb_get_info() is unchanged afterwards.
 * If it changed, the script is terminated with a diagnostic message.
 */
function testValidConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    $countBefore = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    $countAfter = mb_get_info('illegal_chars');

    if ($countAfter === $countBefore) {
        return;
    }
    die("mb_convert_encoding incremented illegal_chars on valid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
}
/**
 * Check conversion of a valid string from `$fromEncoding` to `$toEncoding`.
 * Unless `$bothWays` is false, the reverse conversion (`$toString` back to
 * `$fromString`) is checked as well.
 */
function convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
    // Forward direction is always checked
    testValidConversion($fromString, $toString, $fromEncoding, $toEncoding);

    if (!$bothWays) {
        return;
    }

    // Reverse direction: swap both the strings and the encodings
    testValidConversion($toString, $fromString, $toEncoding, $fromEncoding);
}
/**
 * Run testConversion() on an invalid input string and additionally require
 * that the 'illegal_chars' value reported by mb_get_info() has grown.
 * If it has not, the script is terminated with a diagnostic message.
 */
function convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
    $countBefore = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    $countAfter = mb_get_info('illegal_chars');

    if ($countAfter > $countBefore) {
        return;
    }
    die("mb_convert_encoding did not increment illegal_chars on invalid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
}
/**
 * Full check of a valid string: first that it is recognized as valid in
 * `$fromEncoding`, then that it converts to `$toString` in `$toEncoding`
 * (and back again, unless `$bothWays` is false).
 */
function testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
    // Step 1: validity check
    identifyValidString($fromString, $fromEncoding);

    // Step 2: conversion check
    convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
}
/**
 * Full check of an invalid string: first that it is recognized as invalid in
 * `$fromEncoding`, then that converting it to `$toEncoding` yields `$toString`
 * and is counted as an illegal character.
 */
function testInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
    // Step 1: validity check
    identifyInvalidString($fromString, $fromEncoding);

    // Step 2: conversion check
    convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
}
/**
 * Test every character in `$charMap` (source character => expected
 * converted character), in shuffled order.
 *
 * All the characters are first tested concatenated into one long string, and
 * then again in short strings of 5 to 10 characters each (the last one may
 * be shorter).
 *
 * Only for encodings where valid characters can be concatenated together in
 * any way, without any escape sequences.
 */
function testAllValidChars($charMap, $fromEncoding, $toEncoding, $bothWays = true) {
    $pending = array_keys($charMap);
    shuffle($pending);

    $convert = function($char) use($charMap) {
        return $charMap[$char];
    };

    // Try a long string
    testValidString(
        implode('', $pending),
        implode('', array_map($convert, $pending)),
        $fromEncoding,
        $toEncoding,
        $bothWays
    );

    // Try various shorter ones
    while ($pending) {
        $batchSize = min(rand(5,10), count($pending));
        // Characters are consumed from the end of the list, last one first
        $batch = array_reverse(array_splice($pending, -$batchSize));
        testValidString(
            implode('', $batch),
            implode('', array_map($convert, $batch)),
            $fromEncoding,
            $toEncoding,
            $bothWays
        );
    }
}
/**
 * Test every key of `$badChars` as an invalid character.
 *
 * Each bad character is followed by one valid character taken from a
 * shuffled list of the keys of `$charMap` (the list is refilled and
 * reshuffled whenever it runs out). The expected output is `$replacement`
 * followed by the converted form of that valid character. Bad characters are
 * processed starting from the last key. Both validity detection and
 * conversion are checked.
 */
function testAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
    $goodPool = [];

    foreach (array_reverse(array_keys($badChars)) as $badChar) {
        if (!$goodPool) {
            $goodPool = array_keys($charMap);
            shuffle($goodPool);
        }
        $goodChar = array_pop($goodPool);

        testInvalidString(
            $badChar . $goodChar,
            $replacement . $charMap[$goodChar],
            $fromEncoding,
            $toEncoding
        );
    }
}
/**
 * Test conversion of every key of `$badChars` as an invalid character.
 *
 * Each bad character is followed by one valid character taken from a
 * shuffled list of the keys of `$charMap` (the list is refilled and
 * reshuffled whenever it runs out). The expected output is `$replacement`
 * followed by the converted form of that valid character. Bad characters are
 * processed starting from the last key. Only conversion is checked here, not
 * validity detection.
 */
function convertAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
    $goodPool = [];

    foreach (array_reverse(array_keys($badChars)) as $badChar) {
        if (!$goodPool) {
            $goodPool = array_keys($charMap);
            shuffle($goodPool);
        }
        $goodChar = array_pop($goodPool);

        convertInvalidString(
            $badChar . $goodChar,
            $replacement . $charMap[$goodChar],
            $fromEncoding,
            $toEncoding
        );
    }
}
/**
 * Test every key of `$truncated` as a complete input string: each one must be
 * detected as invalid in `$fromEncoding`, and must convert to exactly
 * `$replacement` in `$toEncoding`.
 */
function testTruncatedChars($truncated, $fromEncoding, $toEncoding, $replacement) {
    // Only the keys carry information; the values are ignored
    foreach ($truncated as $truncatedChar => $ignored) {
        testInvalidString($truncatedChar, $replacement, $fromEncoding, $toEncoding);
    }
}
/**
 * Enumerate invalid and truncated byte sequences for an encoding, given an
 * exhaustive list of all its valid characters (of any width) as the keys of
 * `$valid`.
 *
 * On return, the keys of `$invalid` are byte sequences which are neither a
 * valid character nor treated as the start of one, and the keys of
 * `$truncated` are byte sequences which are treated as an incomplete
 * character. All values are `true`.
 *
 * `$startBytes` maps from first-byte values to the corresponding character
 * length (for encodings where the first byte can tell you the length of a
 * multi-byte character). Note that `$startBytes` can be partial! For first
 * bytes which it does not cover, the possible continuations are derived from
 * the prefixes of the valid characters instead.
 */
function findInvalidChars($valid, &$invalid, &$truncated, $startBytes = array()) {
    $invalid = [];
    $truncated = [];

    /* All sequences which are not (but can start) a valid character */
    $prefixes = [];
    foreach ($valid as $char => $unicode) {
        $charLength = strlen($char);
        for ($len = 1; $len < $charLength; $len++)
            $prefixes[substr($char, 0, $len)] = true;
    }

    /* Classify one sequence whose length is not known in advance: if it is a
     * proper prefix of some valid character, record it as truncated and then
     * classify each of its one-byte extensions */
    $classify = function($str) use($valid, $prefixes, &$invalid, &$truncated, &$classify) {
        if (isset($valid[$str]))
            return;

        if (!isset($prefixes[$str])) {
            $invalid[$str] = true;
            return;
        }

        $truncated[$str] = true;
        for ($byte = 0; $byte < 256; $byte++)
            $classify($str . chr($byte));
    };

    /* Classify sequences whose total length is known: `$prefix` still needs
     * `$remaining` more bytes to be a complete character */
    $expand = function($prefix, $remaining) use($valid, &$invalid, &$truncated, &$expand) {
        if ($remaining == 0) {
            if (!isset($valid[$prefix]))
                $invalid[$prefix] = true;
            return;
        }

        $truncated[$prefix] = true;
        for ($i = 0; $i < 256; $i++) {
            $str = $prefix . chr($i);
            if ($remaining == 1) {
                /* `$str` is a full-length sequence; check it right here
                 * rather than through another recursive call */
                if (!isset($valid[$str]))
                    $invalid[$str] = true;
            } else {
                $expand($str, $remaining - 1);
            }
        }
    };

    for ($byte = 0; $byte < 256; $byte++) {
        if (isset($startBytes[$byte]))
            $expand(chr($byte), $startBytes[$byte] - 1);
        else
            $classify(chr($byte));
    }
}
/**
 * Test conversion between `$encoding` and UTF-16BE in both directions, driven
 * by the conversion table file at `$path`.
 *
 * `$replacement` is the output expected in `$encoding` for an invalid UTF-16BE
 * input. `$startBytes` is passed on to findInvalidChars() for `$encoding`.
 * A progress line is echoed after each direction has been tested.
 */
function testEncodingFromUTF16ConversionTable($path, $encoding, $replacement = '%', $startBytes = array()) {
    srand(1000); // Make results consistent
    mb_substitute_character(0x25); // '%'

    readConversionTable($path, $encodingToUtf16, $utf16ToEncoding);

    // Direction 1: $encoding -> UTF-16BE
    // "\x00%" is the substitute character '%' as a 2-byte big-endian unit
    findInvalidChars($encodingToUtf16, $invalidEncoded, $truncatedEncoded, $startBytes);
    testAllValidChars($encodingToUtf16, $encoding, 'UTF-16BE');
    testAllInvalidChars($invalidEncoded, $encodingToUtf16, $encoding, 'UTF-16BE', "\x00%");
    testTruncatedChars($truncatedEncoded, $encoding, 'UTF-16BE', "\x00%");
    echo "Tested $encoding -> UTF-16BE\n";

    // Direction 2: UTF-16BE -> $encoding
    // Every possible first byte is declared to start a 2-byte character
    $twoBytesForEveryStart = array_fill_keys(range(0,0xFF), 2);
    findInvalidChars($utf16ToEncoding, $invalidUtf16, $unused, $twoBytesForEveryStart);
    convertAllInvalidChars($invalidUtf16, $utf16ToEncoding, 'UTF-16BE', $encoding, $replacement);
    echo "Tested UTF-16BE -> $encoding\n";
}
?>