mirror of
https://github.com/php/php-src.git
synced 2025-08-16 05:58:45 +02:00

Since mb_decode_numericentity does not require all HTML entities to end with ';', but allows them to be terminated by ANY non-digit character, it doesn't make sense that valid entities which butt up against the end of the input string are not converted. As it turned out, supporting this case also made it possible to simplify the code nicely.
212 lines
13 KiB
PHP
212 lines
13 KiB
PHP
--TEST--
|
|
Test mb_decode_numericentity() function : Convert HTML entities to text
|
|
--EXTENSIONS--
|
|
mbstring
|
|
--FILE--
|
|
<?php
|
|
|
|
/**
 * Returns what var_dump() would print for a value, as a string.
 *
 * var_dump() writes straight to the output stream, so the call is wrapped in
 * an output buffer. The captured text is trimmed, which removes the trailing
 * newline that var_dump() emits.
 *
 * @param mixed $var Value to dump.
 * @return string Trimmed var_dump() output.
 */
function varDumpToString($var)
{
    ob_start();
    var_dump($var);
    return trim(ob_get_clean());
}
|
|
|
|
/**
 * Decodes $str with mb_decode_numericentity() and prints a one-line verdict.
 *
 * The line has the form `<desc>: <dump of input> => <dump of result>` followed
 * by ` (Good)` when the result is identical (===) to $expected, or by
 * ` (BAD; expected <dump of expected>)` otherwise. Dumps are produced by
 * varDumpToString().
 *
 * @param string $desc     Label printed at the start of the line.
 * @param string $str      Input passed to mb_decode_numericentity().
 * @param string $expected Result the call is expected to return.
 * @param array  $convmap  Conversion map passed to mb_decode_numericentity().
 * @param string $encoding Encoding name passed to mb_decode_numericentity().
 */
function test($desc, $str, $expected, $convmap, $encoding)
{
    $result = mb_decode_numericentity($str, $convmap, $encoding);

    echo $desc, ": ", varDumpToString($str), " => ", varDumpToString($result);

    if ($result === $expected) {
        echo " (Good)\n";
    } else {
        echo " (BAD; expected ", varDumpToString($expected), ")\n";
    }
}
|
|
|
|
/**
 * Same check as test(), but prints every string as hexadecimal via bin2hex().
 *
 * The line has the form `<desc>: <hex of input> => <hex of result>` followed
 * by ` (Good)` when the result is identical (===) to $expected, or by
 * ` (BAD; expected <hex of expected>)` otherwise.
 *
 * @param string $desc     Label printed at the start of the line.
 * @param string $str      Input passed to mb_decode_numericentity().
 * @param string $expected Result the call is expected to return.
 * @param array  $convmap  Conversion map passed to mb_decode_numericentity().
 * @param string $encoding Encoding name passed to mb_decode_numericentity().
 */
function testNonAscii($desc, $str, $expected, $convmap, $encoding)
{
    $result = mb_decode_numericentity($str, $convmap, $encoding);

    echo $desc, ": ", bin2hex($str), " => ", bin2hex($result);

    if ($result === $expected) {
        echo " (Good)\n";
    } else {
        echo " (BAD; expected ", bin2hex($expected), ")\n";
    }
}
|
|
|
|
$str1 = '¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ';
|
|
$str2 = 'ƒΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϒϖ•…′″‾⁄℘ℑℜ™ℵ←↑→↓↔↵⇐⇑⇒⇓⇔∀∂∃∅∇∈∉∋∏∑−∗√∝∞∠∧∨∩∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌈⌉⌊⌋〈〉◊♠♣♥♦';
|
|
$str3 = 'aŒbœcŠdše€fg';
|
|
|
|
// Typical convmap, typical numeric entity-encoded string
$convmap = [0x0, 0x2FFFF, 0, 0xFFFF];
echo "1: " . mb_decode_numericentity($str1, $convmap, "UTF-8") . "\n";
echo "2: " . mb_decode_numericentity($str2, $convmap, "UTF-8") . "\n";
echo "3: " . mb_decode_numericentity($str3, $convmap, "UTF-8") . "\n";
|
|
|
|
// Numeric entities which are truncated at end of string
|
|
echo "4: " . mb_decode_numericentity('�', $convmap), "\n"; // Entity is too big
|
|
echo "5: " . mb_decode_numericentity('�', $convmap), "\n"; // Entity is too big
|
|
echo "6: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
|
|
echo "7: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
|
|
echo "8: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
|
|
echo "9: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
|
|
echo "10: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK
|
|
echo "11: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK
|
|
// Try with hex, not just decimal entities
|
|
echo "11b: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK
|
|
echo "11c: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK
|
|
echo "11d: " . bin2hex(mb_decode_numericentity('𐀀', $convmap)), "\n"; // OK
|
|
echo "11e: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
|
|
|
|
// Large decimal entity, converting from non-ASCII input encoding
|
|
echo "12: " . bin2hex(mb_decode_numericentity(mb_convert_encoding('�', 'UCS-4', 'ASCII'), [0, 0x7FFFFFFF, 0, 0x7FFFFFFF], 'UCS-4')), "\n";
|
|
|
|
// Empty convmap: the string is printed exactly as mb_decode_numericentity()
// returns it for a map with no entries.
$convmap = [];
echo "13: " . mb_decode_numericentity('föo', $convmap, "UTF-8") . "\n";
|
|
|
|
// Convmap with 3 elements. If the call throws ValueError, the exception
// message is printed under the same "14: " label instead of a decoded string.
$convmap = [0x0, 0x2FFFF, 0];
try {
    echo "14: " . mb_decode_numericentity($str3, $convmap, "UTF-8") . "\n";
} catch (ValueError $ex) {
    echo "14: " . $ex->getMessage() . "\n";
}
|
|
|
|
// Entities with the value 0 decode to a single NUL byte (printed as "00").
// NOTE(review): the entity literals here were reconstructed; the decimal/hex
// spelling should be confirmed against the upstream test file.
echo "15: " . bin2hex(mb_decode_numericentity('&#0;', [0, 1, 0, 0xFFFF], 'UTF-8')) . "\n";
echo "16: " . bin2hex(mb_decode_numericentity('&#x0;', [0, 1, 0, 0xFFFF], 'UTF-8')) . "\n";
|
|
|
|
// Weird convmap
$convmap = [
    0, 0, 0, 0,         // Only one codepoint, empty mask
    100, 50, 0, 0xFFFF, // 'End' codepoint is before 'start' codepoint
];
echo "17: " . mb_decode_numericentity('föo', $convmap, "UTF-8") . "\n";
|
|
|
|
// Convmap with positive offset: the offset (1000) is subtracted from the
// entity value, so 1001..1003 decode to codepoints 1..3 (bytes 01 02 03).
// NOTE(review): the entity literals were reconstructed from the codepoints
// U+03E9..U+03EB; test 19 is assumed to be the hex spelling of test 18,
// confirm against the upstream test file.
$convmap = [0, 10, 1000, 0xFFFF];
echo "18: " . bin2hex(mb_decode_numericentity("&#1001;&#1002;&#1003;", $convmap, "UTF-8")) . "\n";
echo "19: " . bin2hex(mb_decode_numericentity("&#x3E9;&#x3EA;&#x3EB;", $convmap, "UTF-8")) . "\n";
|
|
|
|
echo "20: " . mb_decode_numericentity("{a;", [0, 0xFFFF, 0, 0xFFFF]) . "\n";
|
|
|
|
test("10 digits for decimal entity", "A", "A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
test("More than 10 digits for decimal entity", "¥", "¥", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
test("8 digits for hex entity", "A", "A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
test("More than 8 digits for hex entity", "Ł", "Ł", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
test("Single &", "&", "&", [0, 0xFFFF, 0, 0xFFFF], "ASCII");
|
|
|
|
// An entity can come right after a preceding ampersand
|
|
test("Successive &", "&A,", "&A,", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
// An entity can come right after a preceding &#
|
|
test("Successive &#", "&#2", "", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
test("Successive &#x", "&#x2", "", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
test("&#x only", "&#x;", "&#x;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
// The starting & of an entity can terminate a preceding entity
|
|
test("Successive A", "AA", "AA", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
test("Successive hex entities", "22", "22", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
// An entity can come right after an entity which is invalid because of being too long
|
|
test("Starting entity immediately after decimal entity which is too long", "�A", "�A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
test("Starting entity immediately after hex entity which is too long", "�A", "�", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
$ucs4_test1 = mb_convert_encoding("�A", 'UCS-4BE', 'ASCII');
|
|
testNonAscii("Starting entity immediately after valid decimal entity which is just within maximum length", $ucs4_test1, "\x3B\x9A\xCA\x00\x00\x00\x00A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');
|
|
$ucs4_test2 = mb_convert_encoding("�A", 'UCS-4BE', 'ASCII');
|
|
testNonAscii("Starting entity immediately after valid hex entity which is just within maximum length", $ucs4_test2, "\x11\x11\x11\x11\x00\x00\x00A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');
|
|
|
|
test("Starting entity immediately after invalid decimal entity", "�A", "�A", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
test("Starting entity immediately after invalid hex entity", "�A", "
", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
test("Starting entity immediately after too-big decimal entity", "�A", "�A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'ASCII');
|
|
|
|
// If the numeric entity decodes to 0xFFFFFFFF, that should be passed through
|
|
// Originally, the new implementation of mb_decode_numericentity used -1 as a marker indicating
|
|
// that the entity could not be successfully decoded, so if the entity decoded successfully to
|
|
// 0xFFFFFFFF (-1), it would be treated as an invalid entity
|
|
test("Regression test (entity which decodes to 0xFFFFFFFF)", "", "?", [0xFFFFFF86, 0xFFFFFFFF, 0xF, 0xFC015448], 'HZ');
|
|
|
|
// With the legacy conversion filters, a trailing & could be truncated by mb_decode_numericentity,
|
|
// because some text encodings did not properly invoke the next flush function in the chain
|
|
test("Regression test (truncation of successive & with JIS encoding)", "&&&", "&&&", [0x20FF37FF, 0x7202F569, 0xC4090023, 0xF160], "JIS");
|
|
|
|
// Previously, signed arithmetic was used on convmap entries
|
|
test("Regression test (convmap entries are now treated as unsigned)", ",", "?,", [0x22FFFF11, 0xBF111189, 0x67726511, 0x1161E719], "ASCII");
|
|
|
|
// Try with '&', '&#', or '&#x' at the end of a buffer of wchars, with more
// input still left to process in the next buffer
// (mb_decode_numericentity splits its input into 'chunks' and processes it one
// chunk at a time)
//
// Padding lengths 0..255 are tried so that the entity starts at every offset
// in that range.
$convmap = [0, 0xFFFF, 0, 0xFFFF];
for ($i = 0; $i < 256; $i++) {
    $padding = str_repeat("a", $i);

    // First try invalid decimal/hex entities; they must pass through unchanged
    if (mb_decode_numericentity($padding . "&#ZZZ", $convmap, 'UTF-8') !== $padding . "&#ZZZ") {
        die("&#ZZZ is broken when it spans two buffers!");
    }
    if (mb_decode_numericentity($padding . "&#xZZZ", $convmap, 'UTF-8') !== $padding . "&#xZZZ") {
        die("&#xZZZ is broken when it spans two buffers!");
    }

    // Now try valid decimal/hex entities; both must decode to "A"
    if (mb_decode_numericentity($padding . "&#65;", $convmap, 'UTF-8') !== $padding . "A") {
        die("&#65; is broken when it spans two buffers!");
    }
    if (mb_decode_numericentity($padding . "&#x41;", $convmap, 'UTF-8') !== $padding . "A") {
        die("&#x41; is broken when it spans two buffers!");
    }
}
|
|
|
|
// Try huge entities, big enough to fill an entire buffer.
// Each entity has 12..255 leading zeros before its real digits; the checks
// require the string to come back unchanged.
for ($i = 12; $i < 256; $i++) {
    $str = "&#" . str_repeat("0", $i) . "65";
    if (mb_decode_numericentity($str, $convmap, 'UTF-8') !== $str) {
        die("Decimal entity with huge number of digits broken");
    }

    $str = "&#x" . str_repeat("0", $i) . "41";
    if (mb_decode_numericentity($str, $convmap, 'UTF-8') !== $str) {
        die("Hexadecimal entity with huge number of digits broken");
    }
}
|
|
|
|
?>
|
|
--EXPECT--
|
|
1: ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ
|
|
2: ƒΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϒϖ•…′″‾⁄℘ℑℜ™ℵ←↑→↓↔↵⇐⇑⇒⇓⇔∀∂∃∅∇∈∉∋∏∑−∗√∝∞∠∧∨∩∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌈⌉⌊⌋〈〉◊♠♣♥♦
|
|
3: aŒbœcŠdše€fg
|
|
4: �
|
|
5: �
|
|
6: �
|
|
7: �
|
|
8: �
|
|
9: �
|
|
10: 00
|
|
11: 00
|
|
11b: 00
|
|
11c: 00
|
|
11d: f0908080
|
|
11e: �
|
|
12: 00bc614e
|
|
13: föo
|
|
14: mb_decode_numericentity(): Argument #2 ($map) must have a multiple of 4 elements
|
|
15: 00
|
|
16: 00
|
|
17: föo
|
|
18: 010203
|
|
19: 010203
|
|
20: {a;
|
|
10 digits for decimal entity: string(13) "A" => string(1) "A" (Good)
|
|
More than 10 digits for decimal entity: string(14) "¥" => string(14) "¥" (Good)
|
|
8 digits for hex entity: string(12) "A" => string(1) "A" (Good)
|
|
More than 8 digits for hex entity: string(13) "Ł" => string(13) "Ł" (Good)
|
|
Single &: string(1) "&" => string(1) "&" (Good)
|
|
Successive &: string(6) "&A," => string(3) "&A," (Good)
|
|
Successive &#: string(8) "&#2" => string(3) "" (Good)
|
|
Successive &#x: string(9) "&#x2" => string(4) "" (Good)
|
|
&#x only: string(4) "&#x;" => string(4) "&#x;" (Good)
|
|
Successive A: string(9) "AA" => string(2) "AA" (Good)
|
|
Successive hex entities: string(11) "22" => string(2) "22" (Good)
|
|
Starting entity immediately after decimal entity which is too long: string(18) "�A" => string(14) "�A" (Good)
|
|
Starting entity immediately after hex entity which is too long: string(17) "�A" => string(13) "�" (Good)
|
|
Starting entity immediately after valid decimal entity which is just within maximum length: 000000260000002300000031000000300000003000000030000000300000003000000030000000300000003000000030000000260000002300000036000000350000003b => 3b9aca0000000041 (Good)
|
|
Starting entity immediately after valid hex entity which is just within maximum length: 0000002600000023000000780000003100000031000000310000003100000031000000310000003100000031000000260000002300000036000000350000003b => 1111111100000041 (Good)
|
|
Starting entity immediately after invalid decimal entity: string(8) "�A" => string(4) "�A" (Good)
|
|
Starting entity immediately after invalid hex entity: string(9) "�A" => string(5) "
" (Good)
|
|
Starting entity immediately after too-big decimal entity: string(17) "�A" => string(13) "�A" (Good)
|
|
Regression test (entity which decodes to 0xFFFFFFFF): string(5) "" => string(1) "?" (Good)
|
|
Regression test (truncation of successive & with JIS encoding): string(3) "&&&" => string(3) "&&&" (Good)
|
|
Regression test (convmap entries are now treated as unsigned): string(4) "," => string(2) "?," (Good)
|