Fix new conversion filter for HTML entities

While fuzzing the new mb_decode_numericentity implementation, I discovered
that the fast conversion filter for 'HTML-ENTITIES' did not correctly
handle an empty named entity ('&;'), nor did it correctly handle
invalid named entities whose names were a prefix of a valid entity.
Also, it did not correctly handle the case where a named entity is
truncated and another named entity starts abruptly.
This commit is contained in:
Alex Dowad 2022-06-27 09:21:00 +02:00
parent 9c3972fb3d
commit fa83a8e15e
2 changed files with 16 additions and 5 deletions

View file

@ -334,6 +334,11 @@ void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter
memcpy(dest->opaque, src->opaque, html_enc_buffer_size+1); memcpy(dest->opaque, src->opaque, html_enc_buffer_size+1);
} }
/* Classify a single byte: returns true when `c` is an ASCII decimal digit,
 * an ASCII letter (either case), or the '#' character; false for every
 * other byte value. */
static bool is_html_entity_char(unsigned char c)
{
	if (c == '#') {
		return true;
	}
	if ('0' <= c && c <= '9') {
		return true;
	}
	if ('A' <= c && c <= 'Z') {
		return true;
	}
	return 'a' <= c && c <= 'z';
}
static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{ {
unsigned char *p = *in, *e = p + *in_len; unsigned char *p = *in, *e = p + *in_len;
@ -345,9 +350,9 @@ static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
if (c == '&') { if (c == '&') {
/* Find terminating ; for HTML entity */ /* Find terminating ; for HTML entity */
unsigned char *terminator = p; unsigned char *terminator = p;
while (terminator < e && *terminator != ';') while (terminator < e && is_html_entity_char(*terminator))
terminator++; terminator++;
if (terminator < e) { if (terminator < e && *terminator == ';') {
if (*p == '#' && (e - p) >= 2) { if (*p == '#' && (e - p) >= 2) {
/* Numeric entity */ /* Numeric entity */
unsigned int value = 0; unsigned int value = 0;
@ -390,11 +395,11 @@ static size_t mb_htmlent_to_wchar(unsigned char **in, size_t *in_len, uint32_t *
*out++ = value; *out++ = value;
p = terminator + 1; p = terminator + 1;
goto next_iteration; goto next_iteration;
} else { } else if (terminator > p && terminator < e) {
/* Named entity */ /* Named entity */
mbfl_html_entity_entry *entity = (mbfl_html_entity_entry*)mbfl_html_entity_list; mbfl_html_entity_entry *entity = (mbfl_html_entity_entry*)mbfl_html_entity_list;
while (entity->name) { while (entity->name) {
if (!strncmp((char*)p, entity->name, terminator - p)) { if (!strncmp((char*)p, entity->name, terminator - p) && strlen(entity->name) == terminator - p) {
*out++ = entity->code; *out++ = entity->code;
p = terminator + 1; p = terminator + 1;
goto next_iteration; goto next_iteration;
@ -409,7 +414,7 @@ bad_entity:
while (p < terminator && out < limit) { while (p < terminator && out < limit) {
*out++ = *p++; *out++ = *p++;
} }
if (terminator < e && out < limit) { if (terminator < e && *terminator == ';' && out < limit) {
*out++ = *p++; *out++ = *p++;
} }
} else { } else {

View file

@ -45,6 +45,12 @@ convertFromEntities("\x00", '&#00000;');
testConversion(str_repeat('あ', 100), str_repeat('&#12354;', 100)); testConversion(str_repeat('あ', 100), str_repeat('&#12354;', 100));
convertFromEntities("&;", "&;");
convertFromEntities("&f;", "&f;");
convertFromEntities("&A", "&&#65;");
convertFromEntities("&A", "&&#x41;");
echo "Done!\n"; echo "Done!\n";
?> ?>
--EXPECTF-- --EXPECTF--