- Fixed get_next_char(), used by htmlentities/htmlspecialchars, accepting

certain ill-formed UTF-8 sequences.
This commit is contained in:
Gustavo André dos Santos Lopes 2010-10-14 19:14:06 +00:00
parent c99251f87e
commit 30d6cc8919
5 changed files with 92 additions and 7 deletions

2
NEWS
View file

@ -33,6 +33,8 @@
and when there was data in the buffer before the emulation started. Also made and when there was data in the buffer before the emulation started. Also made
more consistent its behavior -- should return failure every time less data more consistent its behavior -- should return failure every time less data
than was requested was skipped. (Gustavo) than was requested was skipped. (Gustavo)
- Fixed htmlentities/htmlspecialchars accepting certain ill-formed UTF-8
sequences. (Gustavo)
- Fixed bug #53021 (In html_entity_decode, failure to convert numeric entities - Fixed bug #53021 (In html_entity_decode, failure to convert numeric entities
with ENT_NOQUOTES and ISO-8859-1). Fixed and extended the fix of ENT_NOQUOTES with ENT_NOQUOTES and ISO-8859-1). Fixed and extended the fix of ENT_NOQUOTES

View file

@ -540,7 +540,7 @@ inline static unsigned int get_next_char(enum entity_charset charset,
MB_WRITE(c); MB_WRITE(c);
this_char = c; this_char = c;
pos++; pos++;
} else if (c < 0xc0) { } else if (c < 0xc2) {
MB_FAILURE(pos); MB_FAILURE(pos);
} else if (c < 0xe0) { } else if (c < 0xe0) {
CHECK_LEN(pos, 2); CHECK_LEN(pos, 2);
@ -572,7 +572,7 @@ inline static unsigned int get_next_char(enum entity_charset charset,
MB_WRITE((unsigned char)str[pos + 1]); MB_WRITE((unsigned char)str[pos + 1]);
MB_WRITE((unsigned char)str[pos + 2]); MB_WRITE((unsigned char)str[pos + 2]);
pos += 3; pos += 3;
} else if (c < 0xf8) { } else if (c < 0xf5) {
CHECK_LEN(pos, 4); CHECK_LEN(pos, 4);
if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) { if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
MB_FAILURE(pos); MB_FAILURE(pos);
@ -584,7 +584,7 @@ inline static unsigned int get_next_char(enum entity_charset charset,
MB_FAILURE(pos); MB_FAILURE(pos);
} }
this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f); this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
if (this_char < 0x10000) { if (this_char < 0x10000 || this_char > 0x10FFFF) {
MB_FAILURE(pos); MB_FAILURE(pos);
} }
MB_WRITE((unsigned char)c); MB_WRITE((unsigned char)c);

View file

@ -50,8 +50,8 @@ foreach($strings as $string) {
%unicode|string%(16) "266561637574653b" %unicode|string%(16) "266561637574653b"
%unicode|string%(2) "79" %unicode|string%(2) "79"
%unicode|string%(2) "79" %unicode|string%(2) "79"
%unicode|string%(8) "f7bfbfbf" %unicode|string%(0) ""
%unicode|string%(8) "f7bfbfbf" %unicode|string%(0) ""
%unicode|string%(0) "" %unicode|string%(0) ""
%unicode|string%(0) "" %unicode|string%(0) ""
%unicode|string%(0) "" %unicode|string%(0) ""

View file

@ -0,0 +1,83 @@
--TEST--
Test get_next_char(), used by htmlentities()/htmlspecialchars(): validity of UTF-8 sequences
--FILE--
<?php
/*
 * Well-formed UTF-8 byte sequences, for conformance to Unicode 5.2,
 * section 3.9, D92.
 *
 * Each row describes one sequence shape. Element 0 is the inclusive
 * (min, max) range allowed for the lead byte; element N is the inclusive
 * range allowed for the byte at offset N. The number of elements in a row
 * is therefore the length, in bytes, of the sequences it describes.
 */
$val_ranges = array(
    /* one-byte sequences */
    array(array(0x00, 0x7F)),

    /* two-byte sequences */
    array(array(0xC2, 0xDF), array(0x80, 0xBF)),

    /* three-byte sequences */
    array(array(0xE0, 0xE0), array(0xA0, 0xBF), array(0x80, 0xBF)),
    array(array(0xE1, 0xEC), array(0x80, 0xBF), array(0x80, 0xBF)),
    array(array(0xED, 0xED), array(0x80, 0x9F), array(0x80, 0xBF)),
    array(array(0xEE, 0xEF), array(0x80, 0xBF), array(0x80, 0xBF)),

    /* four-byte sequences */
    array(array(0xF0, 0xF0), array(0x90, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)),
    array(array(0xF1, 0xF3), array(0x80, 0xBF), array(0x80, 0xBF), array(0x80, 0xBF)),
    array(array(0xF4, 0xF4), array(0x80, 0x8F), array(0x80, 0xBF), array(0x80, 0xBF)),
);
/**
 * Tells whether $seq, taken as a whole, is exactly one well-formed UTF-8
 * byte sequence according to the global $val_ranges table.
 *
 * The row whose lead-byte range contains the first byte of $seq decides the
 * outcome: $seq must have as many bytes as the row has entries, and every
 * following byte must lie inside the range listed for its position.
 *
 * @param string $seq Raw bytes to classify.
 * @return bool True if $seq matches a row of $val_ranges; false otherwise,
 *              including when $seq is empty, when its first byte is not a
 *              permitted lead byte, or when its length does not match the
 *              length required by that lead byte.
 */
function is_valid($seq) {
    global $val_ranges;

    $len = strlen($seq);
    if ($len === 0) {
        /* No lead byte to inspect; bail out instead of reading the
         * undefined offset $seq[0], which would raise a notice/warning. */
        return false;
    }

    $lead = ord($seq[0]);
    foreach ($val_ranges as $row) {
        if ($lead < $row[0][0] || $lead > $row[0][1]) {
            continue; /* lead byte belongs to another row (or to none) */
        }
        /* The lead byte fixes the sequence length: too short or too long
         * input is rejected. */
        if (count($row) !== $len) {
            return false;
        }
        for ($n = 1; $n < $len; $n++) {
            $byte = ord($seq[$n]);
            if ($byte < $row[$n][0] || $byte > $row[$n][1]) {
                return false;
            }
        }
        return true;
    }

    /* First byte is not in any lead-byte range of the table. */
    return false;
}
/**
 * Checks that htmlspecialchars() and is_valid() agree about $s.
 *
 * htmlspecialchars() is considered to have accepted $s when its output,
 * produced with ENT_QUOTES and the "UTF-8" charset, is non-empty.
 *
 * @param string $s Byte sequence under test.
 * @return bool True when both verdicts are the same, false when they differ.
 */
function concordance($s) {
    $escaped = htmlspecialchars($s, ENT_QUOTES, "UTF-8");
    $accepted = (strlen($escaped) !== 0);
    $well_formed = is_valid($s);

    return $accepted === $well_formed;
}
/*
 * Driver: build candidate sequences and report every one for which
 * htmlspecialchars() and is_valid() disagree. All bounds are inclusive so
 * that the last lead byte of each group (0xDF, 0xEF, 0xFF) and the last
 * second byte (0xBF) are exercised as well.
 */

/* Two-byte candidates: every lead byte 0xC0..0xDF with every second byte
 * 0x80..0xBF. */
for ($b1 = 0xC0; $b1 <= 0xDF; $b1++) {
    for ($b2 = 0x80; $b2 <= 0xBF; $b2++) {
        $s = chr($b1).chr($b2);
        if (!concordance($s))
            echo "Discordance for ".bin2hex($s),"\n";
    }
}

/* Three-byte candidates: lead bytes 0xE0..0xEF, second bytes 0x80..0xBF,
 * third byte at both ends of the 0x80..0xBF range. */
for ($b1 = 0xE0; $b1 <= 0xEF; $b1++) {
    for ($b2 = 0x80; $b2 <= 0xBF; $b2++) {
        foreach (array("\x80", "\xBF") as $tail) {
            $s = chr($b1).chr($b2).$tail;
            if (!concordance($s))
                echo "Discordance for ".bin2hex($s),"\n";
        }
    }
}

/* Four-byte candidates: lead bytes 0xF0..0xFF (is_valid() accepts none above
 * 0xF4), second bytes 0x80..0xBF, last two bytes at the corners of the
 * 0x80..0xBF range. */
for ($b1 = 0xF0; $b1 <= 0xFF; $b1++) {
    for ($b2 = 0x80; $b2 <= 0xBF; $b2++) {
        foreach (array("\x80\x80", "\xBF\x80", "\x80\xBF", "\xBF\xBF") as $tail) {
            $s = chr($b1).chr($b2).$tail;
            if (!concordance($s))
                echo "Discordance for ".bin2hex($s),"\n";
        }
    }
}
echo "Done.\n";
--EXPECT--
Done.

View file

@ -50,8 +50,8 @@ foreach($strings as $string) {
%unicode|string%(16) "266561637574653b" %unicode|string%(16) "266561637574653b"
%unicode|string%(0) "" %unicode|string%(0) ""
%unicode|string%(0) "" %unicode|string%(0) ""
%unicode|string%(8) "f7bfbfbf" %unicode|string%(0) ""
%unicode|string%(8) "f7bfbfbf" %unicode|string%(0) ""
%unicode|string%(0) "" %unicode|string%(0) ""
%unicode|string%(0) "" %unicode|string%(0) ""
%unicode|string%(0) "" %unicode|string%(0) ""