mirror of
https://github.com/php/php-src.git
synced 2025-08-16 05:58:45 +02:00
178 lines
9.9 KiB
PHP
178 lines
9.9 KiB
PHP
--TEST--
|
|
Test mb_decode_numericentity() function : Convert HTML entities to text
|
|
--EXTENSIONS--
|
|
mbstring
|
|
--FILE--
|
|
<?php
|
|
|
|
// Return what var_dump() prints for $var as a string instead of sending it to
// the output stream. The dump is collected in a temporary output buffer and
// leading/trailing whitespace (including var_dump's final newline) is trimmed.
function varDumpToString($var)
{
    ob_start();
    var_dump($var);
    $dump = ob_get_clean();

    return trim($dump);
}
|
|
|
|
// Decode $str with mb_decode_numericentity() using $convmap and $encoding and
// print one report line: the description, var_dump-style renderings of the
// input and of the decoded result, then " (Good)" when the result is identical
// (===) to $expected, or " (BAD; expected ...)" showing $expected otherwise.
function test($desc, $str, $expected, $convmap, $encoding) {
    $actual = mb_decode_numericentity($str, $convmap, $encoding);

    echo $desc, ": ", varDumpToString($str), " => ", varDumpToString($actual);
    echo ($actual === $expected)
        ? " (Good)\n"
        : (" (BAD; expected " . varDumpToString($expected) . ")\n");
}
|
|
|
|
// Same report as test(), but the input, the decoded result and (on mismatch)
// the expected value are printed as hexadecimal via bin2hex(), so byte
// sequences that are not printable ASCII can still be compared in the output.
function testNonAscii($desc, $str, $expected, $convmap, $encoding) {
    $actual = mb_decode_numericentity($str, $convmap, $encoding);

    echo $desc, ": ", bin2hex($str), " => ", bin2hex($actual);
    echo ($actual === $expected)
        ? " (Good)\n"
        : (" (BAD; expected " . bin2hex($expected) . ")\n");
}
|
|
|
|
// Input strings for cases 1-3.
// NOTE(review): the section comment that follows describes these as "numeric
// entity-encoded" strings, yet the literals below contain no "&#" sequences.
// They look as if the entities were already decoded in this copy of the file;
// confirm against the upstream test before relying on them.
$str1 = '¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ';
|
|
$str2 = 'ƒΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϒϖ•…′″‾⁄℘ℑℜ™ℵ←↑→↓↔↵⇐⇑⇒⇓⇔∀∂∃∅∇∈∉∋∏∑−∗√∝∞∠∧∨∩∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌈⌉⌊⌋〈〉◊♠♣♥♦';
|
|
$str3 = 'aŒbœcŠdše€fg';
|
|
|
|
// Typical convmap, typical numeric entity-encoded string
// Cases 1-3 decode $str1, $str2 and $str3 as UTF-8 with the same convmap and
// print each result on its own numbered line. $convmap stays set afterwards
// because the cases that follow reuse it.
$convmap = [0x0, 0x2FFFF, 0, 0xFFFF];
foreach ([1 => $str1, 2 => $str2, 3 => $str3] as $caseNo => $encoded) {
    echo "{$caseNo}: " . mb_decode_numericentity($encoded, $convmap, "UTF-8") . "\n";
}
|
|
|
|
// Numeric entities which are truncated at end of string
// Cases 4-11e reuse the current $convmap and pass no explicit encoding argument.
// NOTE(review): every input literal in this group except 11d is shown as the
// single character U+FFFD. Such a string contains no entity, so presumably it
// would come back unchanged and could not produce the "00" that the expected
// output lists for cases 10-11c. The original entity text appears to have been
// lost in this copy; restore it from the upstream test.
|
|
echo "4: " . mb_decode_numericentity('�', $convmap), "\n"; // Entity is too big
|
|
echo "5: " . mb_decode_numericentity('�', $convmap), "\n"; // Entity is too big
|
|
echo "6: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
|
|
echo "7: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
|
|
echo "8: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
|
|
echo "9: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
|
|
echo "10: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK
|
|
echo "11: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK
|
|
// Try with hex, not just decimal entities
|
|
echo "11b: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK
|
|
echo "11c: " . bin2hex(mb_decode_numericentity('�', $convmap)), "\n"; // OK
|
|
echo "11d: " . bin2hex(mb_decode_numericentity('𐀀', $convmap)), "\n"; // OK
|
|
echo "11e: " . mb_decode_numericentity('�', $convmap), "\n"; // Too many digits
|
|
|
|
// Large decimal entity, converting from non-ASCII input encoding
// The input is converted from ASCII to UCS-4 first and decoded as UCS-4; the
// result is printed as hex.
// NOTE(review): the literal passed to mb_convert_encoding() is shown as U+FFFD,
// which is not an ASCII entity; the original text appears to have been lost in
// this copy. Confirm against the upstream test.
|
|
echo "12: " . bin2hex(mb_decode_numericentity(mb_convert_encoding('�', 'UCS-4', 'ASCII'), [0, 0x7FFFFFFF, 0, 0x7FFFFFFF], 'UCS-4')), "\n";
|
|
|
|
// Empty convmap (case 13). Note that the numbering skips 14.
$convmap = [];
|
|
echo "13: " . mb_decode_numericentity('föo', $convmap, "UTF-8") . "\n";
|
|
|
|
// Cases 15 and 16 use a convmap whose range is only 0..1.
// NOTE(review): both inputs are shown as the same U+FFFD literal, so as written
// the two cases are identical; the original inputs were presumably different.
echo "15: " . bin2hex(mb_decode_numericentity('�', [0, 1, 0, 0xFFFF], 'UTF-8')) . "\n";
|
|
echo "16: " . bin2hex(mb_decode_numericentity('�', [0, 1, 0, 0xFFFF], 'UTF-8')) . "\n";
|
|
|
|
// Weird convmap
// Two four-integer entries, each unusual in the way its trailing comment describes.
|
|
$convmap = [
|
|
0, 0, 0, 0, // Only one codepoint, empty mask
|
|
100, 50, 0, 0xFFFF // 'End' codepoint is before 'start' codepoint
|
|
];
|
|
echo "17: " . mb_decode_numericentity('föo', $convmap, "UTF-8") . "\n";
|
|
|
|
// Convmap with positive offset
// NOTE(review): the expected output for cases 18 and 19 is "010203", but the
// literals below contain no entity and are identical to each other, so as
// written they could not decode to those bytes. They look like entities that
// were already decoded in this copy; restore them from the upstream test.
|
|
$convmap = [0, 10, 1000, 0xFFFF];
|
|
echo "18: " . bin2hex(mb_decode_numericentity("ϩϪϫ", $convmap, "UTF-8")) . "\n";
|
|
echo "19: " . bin2hex(mb_decode_numericentity("ϩϪϫ", $convmap, "UTF-8")) . "\n";
|
|
|
|
// Case 20 passes an inline convmap and no encoding argument.
echo "20: " . mb_decode_numericentity("{a;", [0, 0xFFFF, 0, 0xFFFF]) . "\n";
|
|
|
|
// Digit-count limits for decimal and hexadecimal entities, plus a lone "&".
// Each call prints one report line through test().
// NOTE(review): the expected output reports the first input as string(13) and
// the second as string(14), yet the literals here are single characters. The
// entity text appears to have been decoded in this copy of the file; restore it
// from the upstream test.
test("10 digits for decimal entity", "A", "A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
test("More than 10 digits for decimal entity", "¥", "¥", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
test("8 digits for hex entity", "A", "A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
test("More than 8 digits for hex entity", "Ł", "Ł", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
test("Single &", "&", "&", [0, 0xFFFF, 0, 0xFFFF], "ASCII");
|
|
|
|
// NOTE(review): in this group the expected output reports input lengths
// (e.g. string(6) for "Successive &", string(9) for "Successive A") that are
// longer than the literals shown here, so the entity text appears to have been
// decoded in this copy of the file. Restore it from the upstream test.
// An entity can come right after a preceding ampersand
|
|
test("Successive &", "&A,", "&A,", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
// An entity can come right after a preceding &#
|
|
test("Successive &#", "&#2", "", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
test("Successive &#x", "&#x2", "", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
// "&#x;" has no digits at all and is expected to come back unchanged
test("&#x only", "&#x;", "&#x;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
// The starting & of an entity can terminate a preceding entity
|
|
test("Successive A", "AA", "AA", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
test("Successive hex entities", "22", "22", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
// An entity can come right after an entity which is invalid because of being too long
// NOTE(review): the expected output reports these inputs as 17-18 bytes long
// (8-9 bytes for the last two cases), far longer than the literals shown here,
// and the final call has a raw line break inside its expected string. The
// entity text appears to have been decoded in this copy of the file; restore
// it from the upstream test.
|
|
test("Starting entity immediately after decimal entity which is too long", "�A", "�A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
test("Starting entity immediately after hex entity which is too long", "�A", "�", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
// The last two cases use a convmap whose range starts at 0x1 rather than 0
test("Starting entity immediately after invalid decimal entity", "�A", "�A", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
test("Starting entity immediately after invalid hex entity", "�A", "
", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
|
|
|
|
// Try with '&', '&#', or '&#x' at the end of a buffer of wchars, with more input
|
|
// still left to process in the next buffer
|
|
// (mb_decode_numericentity splits its input into 'chunks' and processes it one
|
|
// chunk at a time)
|
|
$convmap = [0, 0xFFFF, 0, 0xFFFF];
|
|
// Prefix each probe with 0..255 'a' characters so that it starts at every
// offset in that range; the script aborts via die() on the first mismatch.
for ($i = 0; $i < 256; $i++) {
|
|
$padding = str_repeat("a", $i);
|
|
// First try invalid decimal/hex entities
|
|
if (mb_decode_numericentity($padding . "&#ZZZ", $convmap, 'UTF-8') !== $padding . "&#ZZZ")
|
|
die("&#ZZZ is broken when it spans two buffers!");
|
|
if (mb_decode_numericentity($padding . "&#xZZZ", $convmap, 'UTF-8') !== $padding . "&#xZZZ")
|
|
die("&#xZZZ is broken when it spans two buffers!");
|
|
// Now try valid decimal/hex entities
// NOTE(review): the two checks below are identical as written and their input
// contains no entity, although they are described as the decimal and hex
// variants. The entity text (in the inputs and in the die() messages) appears
// to have been decoded in this copy; restore it from the upstream test.
|
|
if (mb_decode_numericentity($padding . "A", $convmap, 'UTF-8') !== $padding . "A")
|
|
die("A is broken when it spans two buffers!");
|
|
if (mb_decode_numericentity($padding . "A", $convmap, 'UTF-8') !== $padding . "A")
|
|
die("A is broken when it spans two buffers!");
|
|
}
|
|
|
|
// Try huge entities, big enough to fill an entire buffer.
// For every count of leading zeros from 12 to 255, build a decimal entity
// ending in "65" and a hexadecimal entity ending in "41" (neither has a closing
// ';') and require that decoding returns the input unchanged; the script
// aborts via die() on the first mismatch.
foreach (range(12, 255) as $zeroCount) {
    $zeros = str_repeat("0", $zeroCount);

    $decimalEntity = "&#" . $zeros . "65";
    if (mb_decode_numericentity($decimalEntity, $convmap, 'UTF-8') !== $decimalEntity) {
        die("Decimal entity with huge number of digits broken");
    }

    $hexEntity = "&#x" . $zeros . "41";
    if (mb_decode_numericentity($hexEntity, $convmap, 'UTF-8') !== $hexEntity) {
        die("Hexadecimal entity with huge number of digits broken");
    }
}
|
|
|
|
?>
|
|
--EXPECT--
|
|
1: ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ
|
|
2: ƒΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϒϖ•…′″‾⁄℘ℑℜ™ℵ←↑→↓↔↵⇐⇑⇒⇓⇔∀∂∃∅∇∈∉∋∏∑−∗√∝∞∠∧∨∩∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌈⌉⌊⌋〈〉◊♠♣♥♦
|
|
3: aŒbœcŠdše€fg
|
|
4: �
|
|
5: �
|
|
6: �
|
|
7: �
|
|
8: �
|
|
9: �
|
|
10: 00
|
|
11: 00
|
|
11b: 00
|
|
11c: 00
|
|
11d: f0908080
|
|
11e: �
|
|
12: 00bc614e
|
|
13: föo
|
|
15: 00
|
|
16: 00
|
|
17: föo
|
|
18: 010203
|
|
19: 010203
|
|
20: {a;
|
|
10 digits for decimal entity: string(13) "A" => string(1) "A" (Good)
|
|
More than 10 digits for decimal entity: string(14) "¥" => string(14) "¥" (Good)
|
|
8 digits for hex entity: string(12) "A" => string(1) "A" (Good)
|
|
More than 8 digits for hex entity: string(13) "Ł" => string(13) "Ł" (Good)
|
|
Single &: string(1) "&" => string(1) "&" (Good)
|
|
Successive &: string(6) "&A," => string(3) "&A," (Good)
|
|
Successive &#: string(8) "&#2" => string(3) "" (Good)
|
|
Successive &#x: string(9) "&#x2" => string(4) "" (Good)
|
|
&#x only: string(4) "&#x;" => string(4) "&#x;" (Good)
|
|
Successive A: string(9) "AA" => string(2) "AA" (Good)
|
|
Successive hex entities: string(11) "22" => string(2) "22" (Good)
|
|
Starting entity immediately after decimal entity which is too long: string(18) "�A" => string(14) "�A" (Good)
|
|
Starting entity immediately after hex entity which is too long: string(17) "�A" => string(13) "�" (Good)
|
|
Starting entity immediately after invalid decimal entity: string(8) "�A" => string(4) "�A" (Good)
|
|
Starting entity immediately after invalid hex entity: string(9) "�A" => string(5) "
" (Good)
|