php-src/ext/pcre/pcre2lib/pcre2_compile_class.c
2025-02-16 12:16:46 +01:00

2737 lines
74 KiB
C

/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2024 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre2_compile.h"
/* State bundle for compiling an extended character class (eclass).
NOTE(review): the code that fills and reads this structure is outside the
part of the file reviewed here; the notes below only restate the declared
types and the existing member comments - confirm details against the users. */
typedef struct {
/* Option bits for eclass. */
uint32_t options;
uint32_t xoptions;
/* Rarely used members. */
int *errorcodeptr;
compile_block *cb;
/* Bitmap is needed. */
BOOL needs_bitmap;
} eclass_context;
/* Expands to the case label(s) that end the handling of a class structure.
It is meant to be placed inside a switch statement over META_CODE() values.

In debug mode (PCRE2_DEBUG) every class start/end token and every eclass
operator token is listed explicitly, and any other value that reaches the
default label is asserted to be <= META_END. Consequently, when a new token
is added but not processed by all loops, and the token is equal to
a) one of the cases here:
the compiler will complain about a duplicated case value.
b) none of the cases here:
the loop without the handler will stop with an assertion failure.

In non-debug builds the macro is only a default label. */
#ifdef PCRE2_DEBUG
#define CLASS_END_CASES(meta) \
default: \
PCRE2_ASSERT((meta) <= META_END); \
/* Fall through */ \
case META_CLASS: \
case META_CLASS_NOT: \
case META_CLASS_EMPTY: \
case META_CLASS_EMPTY_NOT: \
case META_CLASS_END: \
case META_ECLASS_AND: \
case META_ECLASS_OR: \
case META_ECLASS_SUB: \
case META_ECLASS_XOR: \
case META_ECLASS_NOT:
#else
#define CLASS_END_CASES(meta) \
default:
#endif
#ifdef SUPPORT_WIDE_CHARS
/* Sift-down step of the heapsort that orders the range list.

The buffer stores (start, end) pairs in adjacent slots, so a heap node spans
two uint32_t items and every index used here is even. Nodes are compared by
their first item only. The node at index i is exchanged with its larger child
until no child has a greater first item.

Arguments:
  buffer     array of uint32_t pairs
  size       number of uint32_t items that form the heap
  i          index of the first item of the node to move down
*/
static void do_heapify(uint32_t *buffer, size_t size, size_t i)
{
for (;;)
  {
  const size_t left_child = 2 * i + 2;
  const size_t right_child = left_child + 2;
  size_t largest = i;
  uint32_t saved_start, saved_end;

  if (left_child < size && buffer[left_child] > buffer[largest])
    largest = left_child;
  if (right_child < size && buffer[right_child] > buffer[largest])
    largest = right_child;

  /* Neither child is greater: the heap property holds from here down. */
  if (largest == i) break;

  /* Exchange both items of the two pairs, then continue from the child. */
  saved_start = buffer[i];
  saved_end = buffer[i + 1];
  buffer[i] = buffer[largest];
  buffer[i + 1] = buffer[largest + 1];
  buffer[largest] = saved_start;
  buffer[largest + 1] = saved_end;
  i = largest;
  }
}
#ifdef SUPPORT_UNICODE
/* Flag bits of the 'options' argument passed to parse_class() and its
helper functions. compile_optimize_class() derives them from the compile
options as noted for each flag. */
/* PCRE2_UTF is set. */
#define PARSE_CLASS_UTF 0x1
/* PCRE2_CASELESS is set together with PCRE2_UTF or PCRE2_UCP. */
#define PARSE_CLASS_CASELESS_UTF 0x2
/* PCRE2_EXTRA_CASELESS_RESTRICT is set. */
#define PARSE_CLASS_RESTRICTED_UTF 0x4
/* PCRE2_EXTRA_TURKISH_CASING is set. */
#define PARSE_CLASS_TURKISH_UTF 0x8
/* Binary search in the PRIV(ucd_nocase_ranges) table. The function returns
the range of nocase characters which includes the 'c' character passed as
argument, or which directly follows 'c'.

The search only probes odd indexes, which hold the end values of the table
entries. It stops at the first entry whose end value is greater than 'c'; the
returned pointer addresses the item in front of that end value.

For c > MAX_UTF_CODE_POINT the search is skipped and a pointer to the item at
offset PRIV(ucd_nocase_ranges_size) is returned (presumably a terminating
entry of the table - confirm against the table generator). */
static const uint32_t*
get_nocase_range(uint32_t c)
{
const uint32_t *table = PRIV(ucd_nocase_ranges);
uint32_t low = 0;
uint32_t high = PRIV(ucd_nocase_ranges_size);

if (c > MAX_UTF_CODE_POINT) return table + high;

for (;;)
  {
  /* Force an odd index: the range end of the probed element. */
  const uint32_t probe = ((low + high) >> 1) | 0x1;

  if (table[probe] <= c)
    {
    /* The probed entry ends at or before c: search above it. */
    low = probe + 1;
    continue;
    }

  if (probe > 1 && table[probe - 2] > c)
    {
    /* The previous entry also ends after c: search below. */
    high = probe - 1;
    continue;
    }

  return table + (probe - 1);
  }
}
/* Get the other case characters that belong to the passed range, create
ranges from these characters, and append them to the buffer argument.

The range itself is always emitted as the last pair. Its bounds are widened
whenever an other case character is directly adjacent to the current bounds.
An other case character that lies inside the current bounds is ignored; any
remaining one is emitted as a single character pair (ch, ch).

Arguments:
  start      first character of the range
  end        last character of the range
  options    PARSE_CLASS_* flag bits
  buffer     destination for the (start, end) pairs, or NULL when only
               the required size is computed

Returns:     the number of uint32_t items appended (or required); this is
               always an even value and at least 2
*/
static size_t
utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options,
uint32_t *buffer)
{
uint32_t new_start = start;
uint32_t new_end = end;
uint32_t c = start;
const uint32_t *list;
uint32_t tmp[3];
/* The final (new_start, new_end) pair is counted up front. */
size_t result = 2;
/* Entry of the nocase range table for the first character; it is used
below to step over a stretch of characters at once. */
const uint32_t *skip_range = get_nocase_range(c);
uint32_t skip_start = skip_range[0];
#if PCRE2_CODE_UNIT_WIDTH == 8
PCRE2_ASSERT(options & PARSE_CLASS_UTF);
#endif
#if PCRE2_CODE_UNIT_WIDTH == 32
/* Only the scan limit is clamped; new_end keeps the original end value. */
if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT;
#endif
while (c <= end)
{
uint32_t co;
/* c is beyond skip_start: continue from skip_range[1] and switch to the
next entry of the nocase range table. */
if (c > skip_start)
{
c = skip_range[1];
skip_range += 2;
skip_start = skip_range[0];
continue;
}
/* Compute caseless set. */
/* Turkish casing (without caseless restriction) selects one of two
dedicated sets for the 'i' characters; co is an offset into
PRIV(ucd_caseless_sets) in both branches. */
if ((options & (PARSE_CLASS_TURKISH_UTF|PARSE_CLASS_RESTRICTED_UTF)) ==
PARSE_CLASS_TURKISH_UTF &&
UCD_ANY_I(c))
{
co = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
}
else if ((co = UCD_CASESET(c)) != 0 &&
(options & PARSE_CLASS_RESTRICTED_UTF) != 0 &&
PRIV(ucd_caseless_sets)[co] < 128)
{
co = 0; /* Ignore the caseless set if it's restricted. */
}
if (co != 0)
list = PRIV(ucd_caseless_sets) + co;
else
{
/* No caseless set: build a temporary NOTACHAR terminated list from c
and, when it differs, its single other case. */
co = UCD_OTHERCASE(c);
list = tmp;
tmp[0] = c;
tmp[1] = NOTACHAR;
if (co != c)
{
tmp[1] = co;
tmp[2] = NOTACHAR;
}
}
/* The list for the current character is selected, move to the next one. */
c++;
/* Add characters. */
/* A 'continue' inside this do/while jumps to the loop condition, that is,
it proceeds with the next list item. */
do
{
#if PCRE2_CODE_UNIT_WIDTH == 16
if (!(options & PARSE_CLASS_UTF) && *list > 0xffff) continue;
#endif
if (*list < new_start)
{
/* Directly below the range: extend the range downwards. */
if (*list + 1 == new_start)
{
new_start--;
continue;
}
}
else if (*list > new_end)
{
/* Directly above the range: extend the range upwards. */
if (*list - 1 == new_end)
{
new_end++;
continue;
}
}
/* Already covered by the range. */
else continue;
/* Not adjacent to the range: emit a single character pair. */
result += 2;
if (buffer != NULL)
{
buffer[0] = *list;
buffer[1] = *list;
buffer += 2;
}
}
while (*(++list) != NOTACHAR);
}
/* Emit the (possibly widened) range itself. */
if (buffer != NULL)
{
buffer[0] = new_start;
buffer[1] = new_end;
buffer += 2;
(void)buffer;
}
return result;
}
#endif
/* Add a character list to a buffer.

The list is an ascending, NOTACHAR terminated array. Each run of consecutive
values is stored as one (first, last) pair.

Arguments:
  p          the NOTACHAR terminated character list
  buffer     destination for the pairs, or NULL when only the required
               size is computed

Returns:     the number of uint32_t items appended (or required)
*/
static size_t
append_char_list(const uint32_t *p, uint32_t *buffer)
{
size_t item_count = 0;

for (; *p != NOTACHAR; item_count += 2)
  {
  const uint32_t *run_last = p;

  /* Advance to the last member of the run of consecutive values. */
  while (run_last[1] - 1 == run_last[0]) run_last++;

  PCRE2_ASSERT(*p < 0xffff);

  if (buffer != NULL)
    {
    *buffer++ = *p;
    *buffer++ = *run_last;
    }

  p = run_last + 1;
  }

return item_count;
}
/* Returns the highest character value used as the upper bound of an open
ended range. In the 8-bit library this is always MAX_UTF_CODE_POINT. In the
other libraries it depends on the PARSE_CLASS_UTF bit of options when Unicode
is supported, and it is MAX_UCHAR_VALUE otherwise. */
static uint32_t
get_highest_char(uint32_t options)
{
(void)options; /* Not referenced in every configuration. */

#if PCRE2_CODE_UNIT_WIDTH == 8
return MAX_UTF_CODE_POINT;
#elif defined SUPPORT_UNICODE
return GET_MAX_CHAR_VALUE((options & PARSE_CLASS_UTF) != 0);
#else
return MAX_UCHAR_VALUE;
#endif
}
/* Add a negated character list to a buffer.

The list is an ascending, NOTACHAR terminated array whose first value must
not be zero. The gaps between the runs of consecutive list values are stored
as (first, last) pairs: one pair in front of every run, and a closing pair
which lasts until get_highest_char(options).

Arguments:
  p          the NOTACHAR terminated character list
  options    PARSE_CLASS_* flag bits
  buffer     destination for the pairs, or NULL when only the required
               size is computed

Returns:     the number of uint32_t items appended (or required)
*/
static size_t
append_negated_char_list(const uint32_t *p, uint32_t options, uint32_t *buffer)
{
uint32_t gap_first = 0;
size_t item_count = 2;  /* The closing pair is always present. */

PCRE2_ASSERT(*p > 0);

while (*p != NOTACHAR)
  {
  const uint32_t *run_last = p;

  /* Advance to the last member of the run of consecutive values. */
  while (run_last[0] == run_last[1] - 1) run_last++;

  PCRE2_ASSERT(*p < 0xffff);

  if (buffer != NULL)
    {
    *buffer++ = gap_first;
    *buffer++ = *p - 1;
    }

  item_count += 2;
  gap_first = *run_last + 1;
  p = run_last + 1;
  }

if (buffer != NULL)
  {
  buffer[0] = gap_first;
  buffer[1] = get_highest_char(options);
  }

return item_count;
}
/* Stores the pair (0x100, get_highest_char(options)) in the buffer and
returns the position after the pair. A NULL buffer is returned unchanged;
the caller accounts for the two items in both cases. */
static uint32_t *
append_non_ascii_range(uint32_t options, uint32_t *buffer)
{
if (buffer != NULL)
  {
  *buffer++ = 0x100;
  *buffer++ = get_highest_char(options);
  }

return buffer;
}
/* Converts the parsed items of a character class into a list of
(start, end) character pairs.

The function is designed to be called twice: first with a NULL buffer to
compute the required size, then with a buffer of that size to store the
pairs. The pairs are neither sorted nor merged here.

Items handled:
  \D \W \S and negated POSIX classes add the range 0x100 .. highest char
  \h \H \v \V add the corresponding (negated) character list
  \p{Any} adds the range 0 .. highest char; other \p, \P items add nothing
  non-negated POSIX classes add nothing
  literals add a single character or, when followed by a range meta, a range
    (extended with other case characters for PARSE_CLASS_CASELESS_UTF)

Scanning stops at the first value >= META_END which is not handled above.

Arguments:
  ptr        points to the first parsed item of the class
  options    PARSE_CLASS_* flag bits
  buffer     destination for the pairs, or NULL

Returns:     the number of uint32_t items stored (or required)
*/
static size_t
parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer)
{
size_t total_size = 0;
size_t size;
uint32_t meta_arg;
uint32_t start_char;
while (TRUE)
{
switch (META_CODE(*ptr))
{
case META_ESCAPE:
meta_arg = META_DATA(*ptr);
switch (meta_arg)
{
case ESC_D:
case ESC_W:
case ESC_S:
/* Only the part above 0xff is recorded for the negated escapes. */
buffer = append_non_ascii_range(options, buffer);
total_size += 2;
break;
case ESC_h:
size = append_char_list(PRIV(hspace_list), buffer);
total_size += size;
if (buffer != NULL) buffer += size;
break;
case ESC_H:
size = append_negated_char_list(PRIV(hspace_list), options, buffer);
total_size += size;
if (buffer != NULL) buffer += size;
break;
case ESC_v:
size = append_char_list(PRIV(vspace_list), buffer);
total_size += size;
if (buffer != NULL) buffer += size;
break;
case ESC_V:
size = append_negated_char_list(PRIV(vspace_list), options, buffer);
total_size += size;
if (buffer != NULL) buffer += size;
break;
case ESC_p:
case ESC_P:
/* Step to the property data word; the ptr++ after this switch then
skips it. The property type is stored in the upper 16 bits. */
ptr++;
if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY)
{
if (buffer != NULL)
{
buffer[0] = 0;
buffer[1] = get_highest_char(options);
buffer += 2;
}
total_size += 2;
}
break;
}
ptr++;
continue;
case META_POSIX_NEG:
/* Two items are consumed: the meta code and the class value. */
buffer = append_non_ascii_range(options, buffer);
total_size += 2;
ptr += 2;
continue;
case META_POSIX:
ptr += 2;
continue;
case META_BIGVALUE:
/* Character literal */
ptr++;
break;
CLASS_END_CASES(*ptr)
/* Values below META_END are literals, anything else ends the class. */
if (*ptr >= META_END) return total_size;
break;
}
/* Literal character, which might be the start of a range. */
start_char = *ptr;
if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED)
{
/* Step over the range meta; ptr then points to the range end, which
may also be prefixed by META_BIGVALUE. */
ptr += 2;
PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE);
if (*ptr == META_BIGVALUE) ptr++;
#ifdef EBCDIC
#error "Missing EBCDIC support"
#endif
}
#ifdef SUPPORT_UNICODE
if (options & PARSE_CLASS_CASELESS_UTF)
{
/* The range and its other case characters are added together. */
size = utf_caseless_extend(start_char, *ptr++, options, buffer);
if (buffer != NULL) buffer += size;
total_size += size;
continue;
}
#endif
if (buffer != NULL)
{
buffer[0] = start_char;
buffer[1] = *ptr;
buffer += 2;
}
ptr++;
total_size += 2;
}
/* Not reached: the loop above is only left by the return statement in it. */
return total_size;
}
/* Extra uint32_t values for storing the lengths of range lists in
the worst case. Two uint32_t lengths and a range end for a range
starting before 255. compile_optimize_class() adds this amount to the
allocated buffer whenever the class has at least one range. */
#define CHAR_LIST_EXTRA_SIZE 3
/* Starting character values for each character list. The entries are read
front to back by compile_optimize_class(), which begins with the last
(highest) range of the sorted range list. The number of entries depends on
the code unit width and on Unicode support. */
static const uint32_t char_list_starts[] = {
#if PCRE2_CODE_UNIT_WIDTH == 32
XCL_CHAR_LIST_HIGH_32_START,
#endif
#if PCRE2_CODE_UNIT_WIDTH == 32 || defined SUPPORT_UNICODE
XCL_CHAR_LIST_LOW_32_START,
#endif
XCL_CHAR_LIST_HIGH_16_START,
/* Must be terminated by XCL_CHAR_LIST_LOW_16_START,
which also represents the end of the bitset. */
XCL_CHAR_LIST_LOW_16_START,
};
/* Pre-processes the characters of a class into a class_ranges structure.

Steps:
  1. The compile options are translated to PARSE_CLASS_* flag bits.
  2. parse_class() is called with a NULL buffer to get the required size,
     then the structure is allocated with the range buffer placed directly
     after it, and parse_class() is called again to fill the buffer.
  3. The (start, end) pairs are sorted by their start value using heapsort.
  4. Overlapping and adjacent ranges are merged in place.
  5. When enough ranges remain which are not entirely below 0x100, the
     ranges at or above XCL_CHAR_LIST_LOW_16_START are converted to
     character lists. The lists are written backwards, starting from the
     end of the buffer.

Arguments:
  start_ptr  points to the first parsed item of the class
  options    the options bits
  xoptions   the extra options bits
  cb         compile data; its memory allocator is used

Returns:     the new class_ranges structure, or NULL if the allocation
               fails. The structure is allocated with cb->cx->memctl.malloc;
               it is not released by this function.
*/
static class_ranges *
compile_optimize_class(uint32_t *start_ptr, uint32_t options,
uint32_t xoptions, compile_block *cb)
{
class_ranges* cranges;
uint32_t *ptr;
uint32_t *buffer;
uint32_t *dst;
uint32_t class_options = 0;
size_t range_list_size = 0, total_size, i;
/* tmp1 and tmp2 are swap temporaries during sorting. Later tmp1 counts the
items of the current character list, and tmp2 is the bit position of the
current list's field in char_lists_types. */
uint32_t tmp1, tmp2;
const uint32_t *char_list_next;
uint16_t *next_char;
uint32_t char_list_start, char_list_end;
uint32_t range_start, range_end;
#ifdef SUPPORT_UNICODE
if (options & PCRE2_UTF)
class_options |= PARSE_CLASS_UTF;
if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP)))
class_options |= PARSE_CLASS_CASELESS_UTF;
if (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT)
class_options |= PARSE_CLASS_RESTRICTED_UTF;
if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
class_options |= PARSE_CLASS_TURKISH_UTF;
#endif
/* Compute required space for the range. */
range_list_size = parse_class(start_ptr, class_options, NULL);
PCRE2_ASSERT((range_list_size & 0x1) == 0);
/* Allocate buffer. The total_size also represents the end of the buffer. */
total_size = range_list_size +
((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0);
cranges = cb->cx->memctl.malloc(
sizeof(class_ranges) + total_size * sizeof(uint32_t),
cb->cx->memctl.memory_data);
if (cranges == NULL) return NULL;
cranges->next = NULL;
/* NOTE(review): the size is narrowed to uint16_t here and below; confirm
that the size of a class is bounded accordingly elsewhere. */
cranges->range_list_size = (uint16_t)range_list_size;
cranges->char_lists_types = 0;
cranges->char_lists_size = 0;
cranges->char_lists_start = 0;
if (range_list_size == 0) return cranges;
/* The ranges are stored directly after the structure. */
buffer = (uint32_t*)(cranges + 1);
parse_class(start_ptr, class_options, buffer);
/* Using <= instead of == to help static analysis. */
if (range_list_size <= 2) return cranges;
/* In-place sorting of ranges. */
/* Build the heap: i is the index of the last pair which has a child; every
pair from there down to the first one is sifted down. */
i = (((range_list_size >> 2) - 1) << 1);
while (TRUE)
{
do_heapify(buffer, range_list_size, i);
if (i == 0) break;
i -= 2;
}
/* Repeatedly exchange the first pair of the heap with the last pair of the
shrinking heap, then restore the heap property. */
i = range_list_size - 2;
while (TRUE)
{
tmp1 = buffer[i];
tmp2 = buffer[i + 1];
buffer[i] = buffer[0];
buffer[i + 1] = buffer[1];
buffer[0] = tmp1;
buffer[1] = tmp2;
do_heapify(buffer, i, 0);
if (i == 0) break;
i -= 2;
}
/* Merge ranges whenever possible. */
/* dst is the last merged range, ptr is the next range to be processed. */
dst = buffer;
ptr = buffer + 2;
range_list_size -= 2;
/* The second condition is a very rare corner case, where the end of the last
range is the maximum character. This range cannot be extended further. */
while (range_list_size > 0 && dst[1] != ~(uint32_t)0)
{
if (dst[1] + 1 < ptr[0])
{
/* There is a gap between the two ranges: start a new merged range. */
dst += 2;
dst[0] = ptr[0];
dst[1] = ptr[1];
}
else if (dst[1] < ptr[1]) dst[1] = ptr[1];
ptr += 2;
range_list_size -= 2;
}
PCRE2_ASSERT(dst[1] <= get_highest_char(class_options));
/* When the number of ranges is less than six (not counting the leading
ranges which end below 0x100), they are not converted to range lists. */
ptr = buffer;
while (ptr < dst && ptr[1] < 0x100) ptr += 2;
if (dst - ptr < (2 * (6 - 1)))
{
cranges->range_list_size = (uint16_t)(dst + 2 - buffer);
return cranges;
}
/* Compute character lists structures. */
char_list_next = char_list_starts;
char_list_start = *char_list_next++;
#if PCRE2_CODE_UNIT_WIDTH == 32
char_list_end = XCL_CHAR_LIST_HIGH_32_END;
#elif defined SUPPORT_UNICODE
char_list_end = XCL_CHAR_LIST_LOW_32_END;
#else
char_list_end = XCL_CHAR_LIST_HIGH_16_END;
#endif
/* The character lists grow downwards from the end of the buffer. */
next_char = (uint16_t*)(buffer + total_size);
tmp1 = 0;
tmp2 = ((sizeof(char_list_starts) / sizeof(uint32_t)) - 1) * XCL_TYPE_BIT_LEN;
PCRE2_ASSERT(tmp2 <= 3 * XCL_TYPE_BIT_LEN && tmp2 >= XCL_TYPE_BIT_LEN);
/* The ranges are processed from the last (highest) one to the first. */
range_start = dst[0];
range_end = dst[1];
while (TRUE)
{
/* The range starts inside the current character list. */
if (range_start >= char_list_start)
{
if (range_start == range_end || range_end < char_list_end)
{
/* Store the end of the range (or a single character). Items are stored
as uint16_t values in the lists below XCL_CHAR_LIST_LOW_32_START, and as
uint32_t values (two uint16_t slots) otherwise. */
tmp1++;
next_char--;
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
*next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
else
*(uint32_t*)(--next_char) =
(range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
}
if (range_start < range_end)
{
if (range_start > char_list_start)
{
/* Store the start of the range. */
tmp1++;
next_char--;
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
*next_char = (uint16_t)(range_start << XCL_CHAR_SHIFT);
else
*(uint32_t*)(--next_char) = (range_start << XCL_CHAR_SHIFT);
}
else
/* The range starts exactly at the start of the list: the start is not
stored, only a flag is set. */
cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
}
/* The list items must not overwrite the ranges not yet processed. */
PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
if (dst > buffer)
{
dst -= 2;
range_start = dst[0];
range_end = dst[1];
continue;
}
/* All ranges are processed. */
range_start = 0;
range_end = 0;
}
/* The range starts below the current character list, but ends in it. */
if (range_end >= char_list_start)
{
PCRE2_ASSERT(range_start < char_list_start);
if (range_end < char_list_end)
{
tmp1++;
next_char--;
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
*next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
else
*(uint32_t*)(--next_char) =
(range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
}
cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
}
/* Close the current list: the item count is stored in the type field when
it is less than XCL_ITEM_COUNT_MASK; otherwise the field is set to
XCL_ITEM_COUNT_MASK and the count is stored as an extra list item. */
if (tmp1 >= XCL_ITEM_COUNT_MASK)
{
cranges->char_lists_types |= XCL_ITEM_COUNT_MASK << tmp2;
next_char--;
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
*next_char = (uint16_t)tmp1;
else
*(uint32_t*)(--next_char) = tmp1;
}
else
cranges->char_lists_types |= tmp1 << tmp2;
if (range_start < XCL_CHAR_LIST_LOW_16_START) break;
/* Continue with the next (lower) character list. */
PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN);
char_list_end = char_list_start - 1;
char_list_start = *char_list_next++;
tmp1 = 0;
tmp2 -= XCL_TYPE_BIT_LEN;
}
/* A range which starts below the character list area remains part of the
range list. */
if (dst[0] < XCL_CHAR_LIST_LOW_16_START) dst += 2;
PCRE2_ASSERT((uint16_t*)dst <= next_char);
/* Both char_lists_size and char_lists_start are byte values; the latter is
relative to the start of the range buffer. */
cranges->char_lists_size =
(size_t)((uint8_t*)(buffer + total_size) - (uint8_t*)next_char);
cranges->char_lists_start = (size_t)((uint8_t*)next_char - (uint8_t*)buffer);
cranges->range_list_size = (uint16_t)(dst - buffer);
return cranges;
}
#endif /* SUPPORT_WIDE_CHARS */
#ifdef SUPPORT_UNICODE
/* Adds the characters below 256 which have (or, when negated, do not have)
a given Unicode property to a class bitmap. Bits are only set by this
function, never cleared.

For PT_ANY the whole bitmap is set when not negated, and the bitmap is left
unchanged when negated.

Arguments:
  ptype      the property type (PT_ value)
  pdata      the property data; its meaning depends on ptype
  negated    TRUE when the characters without the property are added
  classbits  the 32-byte (256-bit) class bitmap to update
*/
void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated,
uint8_t *classbits)
{
/* Update PRIV(xclass) when this function is changed. */
int c, chartype;
const ucd_record *prop;
uint32_t gentype;
BOOL set_bit;
if (ptype == PT_ANY)
{
if (!negated) memset(classbits, 0xff, 32);
return;
}
for (c = 0; c < 256; c++)
{
prop = GET_UCD(c);
set_bit = FALSE;
(void)set_bit;
/* Decide whether character c has the property. */
switch (ptype)
{
case PT_LAMP:
chartype = prop->chartype;
set_bit = (chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt);
break;
case PT_GC:
set_bit = (PRIV(ucp_gentype)[prop->chartype] == pdata);
break;
case PT_PC:
set_bit = (prop->chartype == pdata);
break;
case PT_SC:
set_bit = (prop->script == pdata);
break;
case PT_SCX:
set_bit = (prop->script == pdata ||
MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
break;
case PT_ALNUM:
gentype = PRIV(ucp_gentype)[prop->chartype];
set_bit = (gentype == ucp_L || gentype == ucp_N);
break;
case PT_SPACE: /* Perl space */
case PT_PXSPACE: /* POSIX space */
switch(c)
{
HSPACE_BYTE_CASES:
VSPACE_BYTE_CASES:
set_bit = TRUE;
break;
default:
set_bit = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z);
break;
}
break;
case PT_WORD:
chartype = prop->chartype;
gentype = PRIV(ucp_gentype)[chartype];
set_bit = (gentype == ucp_L || gentype == ucp_N ||
chartype == ucp_Mn || chartype == ucp_Pc);
break;
case PT_UCNC:
set_bit = (c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
c == CHAR_GRAVE_ACCENT || c >= 0xa0);
break;
case PT_BIDICL:
set_bit = (UCD_BIDICLASS_PROP(prop) == pdata);
break;
case PT_BOOL:
set_bit = MAPBIT(PRIV(ucd_boolprop_sets) +
UCD_BPROPS_PROP(prop), pdata) != 0;
break;
case PT_PXGRAPH:
chartype = prop->chartype;
gentype = PRIV(ucp_gentype)[chartype];
set_bit = (gentype != ucp_Z && (gentype != ucp_C || chartype == ucp_Cf));
break;
case PT_PXPRINT:
chartype = prop->chartype;
set_bit = (chartype != ucp_Zl && chartype != ucp_Zp &&
(PRIV(ucp_gentype)[chartype] != ucp_C || chartype == ucp_Cf));
break;
case PT_PXPUNCT:
gentype = PRIV(ucp_gentype)[prop->chartype];
set_bit = (gentype == ucp_P || (c < 128 && gentype == ucp_S));
break;
default:
PCRE2_ASSERT(ptype == PT_PXXDIGIT);
set_bit = (c >= CHAR_0 && c <= CHAR_9) ||
(c >= CHAR_A && c <= CHAR_F) ||
(c >= CHAR_a && c <= CHAR_f);
break;
}
if (negated) set_bit = !set_bit;
/* classbits points to the byte of the current character; it is advanced
after every eighth character. */
if (set_bit) *classbits |= (uint8_t)(1 << (c & 0x7));
if ((c & 0x7) == 0x7) classbits++;
}
}
#endif /* SUPPORT_UNICODE */
#ifdef SUPPORT_WIDE_CHARS
/*************************************************
* XClass related properties *
*************************************************/
/* Flag bits collected in the xclass_props variable of
PRIV(compile_class_not_nested)() while a class is compiled. */
/* An XClass needs to be generated. */
#define XCLASS_REQUIRED 0x1
/* The XClass has at least one 8-bit (< 256) character. */
#define XCLASS_HAS_8BIT_CHARS 0x2
/* The XClass has properties. */
#define XCLASS_HAS_PROPS 0x4
/* The XClass has character lists. */
#define XCLASS_HAS_CHAR_LISTS 0x8
/* The XClass matches all characters >= 256. */
#define XCLASS_HIGH_ANY 0x10
#endif
/*************************************************
*   Internal entry point for add range to class  *
*************************************************/

/* This function sets the bits of cb->classbits for the part of the range
start..end which is below 256; characters above 255 are ignored here. When
PCRE2_CASELESS is set, the other case of each of these characters is added
to the bitmap first, provided that it is also below 256. Non-UTF case folding
is done with the cb->fcc table.

Arguments:
  options       the options bits
  xoptions      the extra options bits
  cb            compile data
  start         start of range character
  end           end of range character

Returns:        nothing; cb->classbits is updated
*/

static void
add_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
  uint32_t start, uint32_t end)
{
uint8_t *bitmap = cb->classbits.classbits;
const uint32_t last = (end > 0xff) ? 0xff : end;
uint32_t ch, full_first, full_limit;

/* For caseless matching the range is scanned and the alternate cases are
recorded. In Unicode, there are 8-bit characters whose alternate case is
greater than 255 and vice-versa (though these may be ignored if caseless
restriction is in force); only alternates below 256 are recorded here. */

if ((options & PCRE2_CASELESS) != 0)
  {
#ifdef SUPPORT_UNICODE
  /* UTF or UCP mode. This branch is also taken when wide characters are not
  supported (e.g. 8-bit library, without UTF) but the characters are treated
  as Unicode because the UCP flag is set. Only the case pairs in the 0-255
  code point range need to be added. */

  if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
    {
    const BOOL turkish_i =
      (xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
        PCRE2_EXTRA_TURKISH_CASING;

    /* Characters below 128 use the fcc table, except for the 'i' characters
    when Turkish casing is active. */

    if (start < 128)
      {
      const uint32_t ascii_last = (last < 127) ? last : 127;

      for (ch = start; ch <= ascii_last; ch++)
        {
        if (!(turkish_i && UCD_ANY_I(ch)))
          {
          SETBIT(bitmap, cb->fcc[ch]);
          }
        }
      }

    /* Characters from 128 to 255 use the Unicode other case, if it fits. */

    if (last >= 128)
      {
      for (ch = (start > 128) ? start : 128; ch <= last; ch++)
        {
        const uint32_t other_case = UCD_OTHERCASE(ch);

        if (other_case <= 0xff)
          {
          SETBIT(bitmap, other_case);
          }
        }
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Not UTF mode */

    {
    for (ch = start; ch <= last; ch++)
      {
      SETBIT(bitmap, cb->fcc[ch]);
      }
    }
  }

/* Set the bits of the range itself. The bytes of the bitmap which are fully
covered by the range are [full_first, full_limit). */

full_first = (start + 7) >> 3;
full_limit = (last + 1) >> 3;

if (full_first >= full_limit)
  {
  /* No byte is fully covered: set the bits one by one. Regardless of start,
  ch will always be <= 255. */
  for (ch = start; ch <= last; ch++)
    {
    SETBIT(bitmap, ch);
    }
  }
else
  {
  /* Whole bytes first. */
  for (ch = full_first; ch < full_limit; ch++)
    bitmap[ch] = 0xff;

  /* Leading bits before the first whole byte. */
  for (ch = start; ch < (full_first << 3); ch++)
    {
    SETBIT(bitmap, ch);
    }

  /* Trailing bits after the last whole byte. */
  for (ch = (full_limit << 3); ch <= last; ch++)
    {
    SETBIT(bitmap, ch);
    }
  }
}
#if PCRE2_CODE_UNIT_WIDTH == 8
/*************************************************
*   Internal entry point for add list to class   *
*************************************************/

/* This function adds a list of horizontal or vertical whitespace characters
to a class. The list must be in ascending order, because each run of
consecutive values is passed to add_to_class() as a single range. Processing
stops at the first list value which is 256 or greater (this includes the
NOTACHAR terminator).

Arguments:
  options       the options bits
  xoptions      the extra options bits
  cb            contains pointers to tables etc.
  p             points to row of 32-bit values, terminated by NOTACHAR

Returns:        nothing; cb->classbits is updated
*/

static void
add_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
  const uint32_t *p)
{
while (*p < 256)
  {
  const uint32_t *run_last = p;

  /* Find the last member of the run of consecutive values. */
  while (run_last[1] == run_last[0] + 1) run_last++;

  add_to_class(options, xoptions, cb, *p, *run_last);
  p = run_last + 1;
  }
}
/*************************************************
*    Add characters not in a list to a class     *
*************************************************/

/* This function adds the complement of a list of horizontal or vertical
whitespace to a class. The list must be in ascending order. The characters
in front of the first list value are added first; then, for every run of
consecutive list values below 256, the gap which follows the run is added.
The end of a gap is limited to 255.

Arguments:
  options       the options bits
  xoptions      the extra options bits
  cb            contains pointers to tables etc.
  p             points to row of 32-bit values, terminated by NOTACHAR

Returns:        nothing; cb->classbits is updated
*/

static void
add_not_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
  const uint32_t *p)
{
if (*p != 0)
  add_to_class(options, xoptions, cb, 0, *p - 1);

for (; *p < 256; p++)
  {
  uint32_t gap_first, gap_last;

  /* Move to the last member of the run of consecutive values. */
  while (p[1] == p[0] + 1) p++;

  gap_first = p[0] + 1;
  gap_last = (p[1] > 255) ? 255 : p[1] - 1;
  add_to_class(options, xoptions, cb, gap_first, gap_last);
  }
}
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
/*************************************************
* Main entry-point to compile a character class *
*************************************************/
/* This function consumes a "leaf", which is a set of characters that will
become a single OP_CLASS, OP_NCLASS, OP_XCLASS, or OP_ALLANY. */
uint32_t *
PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions,
uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL* has_bitmap,
int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr)
{
uint32_t *pptr = start_ptr;
PCRE2_UCHAR *code = *pcode;
BOOL should_flip_negation;
const uint8_t *cbits = cb->cbits;
/* Some functions such as add_to_class() or eclass processing
expects that the bitset is stored in cb->classbits.classbits. */
uint8_t *const classbits = cb->classbits.classbits;
#ifdef SUPPORT_UNICODE
BOOL utf = (options & PCRE2_UTF) != 0;
#else /* No Unicode support */
BOOL utf = FALSE;
#endif
/* Helper variables for OP_XCLASS opcode (for characters > 255). */
#ifdef SUPPORT_WIDE_CHARS
uint32_t xclass_props;
PCRE2_UCHAR *class_uchardata;
class_ranges* cranges;
#endif
/* If an XClass contains a negative special such as \S, we need to flip the
negation flag at the end, so that support for characters > 255 works correctly
(they are all included in the class). An XClass may need to insert specific
matching or non-matching code for wide characters.
*/
should_flip_negation = FALSE;
/* XClass will be used when characters > 255 might match. */
#ifdef SUPPORT_WIDE_CHARS
xclass_props = 0;
#if PCRE2_CODE_UNIT_WIDTH == 8
cranges = NULL;
if (utf)
#endif
{
if (lengthptr != NULL)
{
cranges = compile_optimize_class(pptr, options, xoptions, cb);
if (cranges == NULL)
{
*errorcodeptr = ERR21;
return NULL;
}
/* Caching the pre-processed character ranges. */
if (cb->next_cranges != NULL)
cb->next_cranges->next = cranges;
else
cb->cranges = cranges;
cb->next_cranges = cranges;
}
else
{
/* Reuse the pre-processed character ranges. */
cranges = cb->cranges;
PCRE2_ASSERT(cranges != NULL);
cb->cranges = cranges->next;
}
if (cranges->range_list_size > 0)
{
const uint32_t *ranges = (const uint32_t*)(cranges + 1);
if (ranges[0] <= 255)
xclass_props |= XCLASS_HAS_8BIT_CHARS;
if (ranges[cranges->range_list_size - 1] == GET_MAX_CHAR_VALUE(utf) &&
ranges[cranges->range_list_size - 2] <= 256)
xclass_props |= XCLASS_HIGH_ANY;
}
}
class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
#endif /* SUPPORT_WIDE_CHARS */
/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
in a temporary bit of memory, in case the class contains fewer than two
8-bit characters because in that case the compiled code doesn't use the bit
map. */
memset(classbits, 0, 32);
/* Process items until end_ptr is reached. */
while (TRUE)
{
uint32_t meta = *(pptr++);
BOOL local_negate;
int posix_class;
int taboffset, tabopt;
class_bits_storage pbits;
uint32_t escape, c;
/* Handle POSIX classes such as [:alpha:] etc. */
switch (META_CODE(meta))
{
case META_POSIX:
case META_POSIX_NEG:
local_negate = (meta == META_POSIX_NEG);
posix_class = *(pptr++);
if (local_negate) should_flip_negation = TRUE; /* Note negative special */
/* If matching is caseless, upper and lower are converted to alpha.
This relies on the fact that the class table starts with alpha,
lower, upper as the first 3 entries. */
if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
posix_class = 0;
/* When PCRE2_UCP is set, some of the POSIX classes are converted to
different escape sequences that use Unicode properties \p or \P.
Others that are not available via \p or \P have to generate
XCL_PROP/XCL_NOTPROP directly, which is done here. */
#ifdef SUPPORT_UNICODE
/* TODO This entire block of code here appears to be unreachable!? I simply
can't see how it can be hit, given that the frontend parser doesn't emit
META_POSIX for GRAPH/PRINT/PUNCT when UCP is set. */
if ((options & PCRE2_UCP) != 0 &&
(xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
{
uint32_t ptype;
switch(posix_class)
{
case PC_GRAPH:
case PC_PRINT:
case PC_PUNCT:
ptype = (posix_class == PC_GRAPH)? PT_PXGRAPH :
(posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT;
PRIV(update_classbits)(ptype, 0, local_negate, classbits);
if ((xclass_props & XCLASS_HIGH_ANY) == 0)
{
if (lengthptr != NULL)
*lengthptr += 3;
else
{
*class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
*class_uchardata++ = (PCRE2_UCHAR)ptype;
*class_uchardata++ = 0;
}
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
}
continue;
/* For the other POSIX classes (ex: ascii) we are going to
fall through to the non-UCP case and build a bit map for
characters with code points less than 256. However, if we are in
a negated POSIX class, characters with code points greater than
255 must either all match or all not match, depending on whether
the whole class is not or is negated. For example, for
[[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
they must not.
In the special case where there are no xclass items, this is
automatically handled by the use of OP_CLASS or OP_NCLASS, but an
explicit range is needed for OP_XCLASS. Setting a flag here
causes the range to be generated later when it is known that
OP_XCLASS is required. In the 8-bit library this is relevant only in
utf mode, since no wide characters can exist otherwise. */
default:
break;
}
}
#endif /* SUPPORT_UNICODE */
/* In the non-UCP case, or when UCP makes no difference, we build the
bit map for the POSIX class in a chunk of local store because we may
be adding and subtracting from it, and we don't want to subtract bits
that may be in the main map already. At the end we or the result into
the bit map that is being built. */
posix_class *= 3;
/* Copy in the first table (always present) */
memcpy(pbits.classbits, cbits + PRIV(posix_class_maps)[posix_class], 32);
/* If there is a second table, add or remove it as required. */
taboffset = PRIV(posix_class_maps)[posix_class + 1];
tabopt = PRIV(posix_class_maps)[posix_class + 2];
if (taboffset >= 0)
{
if (tabopt >= 0)
for (int i = 0; i < 32; i++)
pbits.classbits[i] |= cbits[i + taboffset];
else
for (int i = 0; i < 32; i++)
pbits.classbits[i] &= (uint8_t)(~cbits[i + taboffset]);
}
/* Now see if we need to remove any special characters. An option
value of 1 removes vertical space and 2 removes underscore. */
if (tabopt < 0) tabopt = -tabopt;
if (tabopt == 1) pbits.classbits[1] &= ~0x3c;
else if (tabopt == 2) pbits.classbits[11] &= 0x7f;
/* Add the POSIX table or its complement into the main table that is
being built and we are done. */
{
uint32_t *classwords = cb->classbits.classwords;
if (local_negate)
for (int i = 0; i < 8; i++)
classwords[i] |= (uint32_t)(~pbits.classwords[i]);
else
for (int i = 0; i < 8; i++)
classwords[i] |= pbits.classwords[i];
}
#ifdef SUPPORT_WIDE_CHARS
/* Every class contains at least one < 256 character. */
xclass_props |= XCLASS_HAS_8BIT_CHARS;
#endif
continue; /* End of POSIX handling */
/* Other than POSIX classes, the only items we should encounter are
\d-type escapes and literal characters (possibly as ranges). */
case META_BIGVALUE:
meta = *(pptr++);
break;
case META_ESCAPE:
escape = META_DATA(meta);
switch(escape)
{
case ESC_d:
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
break;
case ESC_D:
should_flip_negation = TRUE;
for (int i = 0; i < 32; i++)
classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
break;
case ESC_w:
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
break;
case ESC_W:
should_flip_negation = TRUE;
for (int i = 0; i < 32; i++)
classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
break;
/* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
previously set by something earlier in the character class.
Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
we could just adjust the appropriate bit. From PCRE 8.34 we no
longer treat \s and \S specially. */
case ESC_s:
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
break;
case ESC_S:
should_flip_negation = TRUE;
for (int i = 0; i < 32; i++)
classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
break;
/* When adding the horizontal or vertical space lists to a class, or
their complements, disable PCRE2_CASELESS, because it just wastes
time, and in the "not-x" UTF cases can create unwanted duplicates in
the XCLASS list (provoked by characters that have more than one other
case and by both cases being in the same "not-x" sublist). */
case ESC_h:
#if PCRE2_CODE_UNIT_WIDTH == 8
#ifdef SUPPORT_UNICODE
if (cranges != NULL) break;
#endif
add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
cb, PRIV(hspace_list));
#else
PCRE2_ASSERT(cranges != NULL);
#endif
break;
case ESC_H:
#if PCRE2_CODE_UNIT_WIDTH == 8
#ifdef SUPPORT_UNICODE
if (cranges != NULL) break;
#endif
add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
cb, PRIV(hspace_list));
#else
PCRE2_ASSERT(cranges != NULL);
#endif
break;
case ESC_v:
#if PCRE2_CODE_UNIT_WIDTH == 8
#ifdef SUPPORT_UNICODE
if (cranges != NULL) break;
#endif
add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
cb, PRIV(vspace_list));
#else
PCRE2_ASSERT(cranges != NULL);
#endif
break;
case ESC_V:
#if PCRE2_CODE_UNIT_WIDTH == 8
#ifdef SUPPORT_UNICODE
if (cranges != NULL) break;
#endif
add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
cb, PRIV(vspace_list));
#else
PCRE2_ASSERT(cranges != NULL);
#endif
break;
/* If Unicode is not supported, \P and \p are not allowed and are
faulted at parse time, so will never appear here. */
#ifdef SUPPORT_UNICODE
case ESC_p:
case ESC_P:
{
uint32_t ptype = *pptr >> 16;
uint32_t pdata = *(pptr++) & 0xffff;
/* The "Any" is processed by PRIV(update_classbits)(). */
if (ptype == PT_ANY)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
if (!utf && escape == ESC_p) memset(classbits, 0xff, 32);
#endif
continue;
}
PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits);
if ((xclass_props & XCLASS_HIGH_ANY) == 0)
{
if (lengthptr != NULL)
*lengthptr += 3;
else
{
*class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
*class_uchardata++ = ptype;
*class_uchardata++ = pdata;
}
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
}
}
continue;
#endif
}
#ifdef SUPPORT_WIDE_CHARS
/* Every non-property class contains at least one < 256 character. */
xclass_props |= XCLASS_HAS_8BIT_CHARS;
#endif
/* End handling \d-type escapes */
continue;
CLASS_END_CASES(meta)
/* Literals. */
if (meta < META_END) break;
/* Non-literals: end of class contents. */
goto END_PROCESSING;
}
/* A literal character may be followed by a range meta. At parse time
there are checks for out-of-order characters, for ranges where the two
characters are equal, and for hyphens that cannot indicate a range. At
this point, therefore, no checking is needed. */
c = meta;
/* Remember if \r or \n were explicitly used */
if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
/* Process a character range */
if (*pptr == META_RANGE_LITERAL || *pptr == META_RANGE_ESCAPED)
{
uint32_t d;
#ifdef EBCDIC
BOOL range_is_literal = (*pptr == META_RANGE_LITERAL);
#endif
++pptr;
d = *(pptr++);
if (d == META_BIGVALUE) d = *(pptr++);
/* Remember an explicit \r or \n, and add the range to the class. */
if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
#if PCRE2_CODE_UNIT_WIDTH == 8
#ifdef SUPPORT_UNICODE
if (cranges != NULL) continue;
xclass_props |= XCLASS_HAS_8BIT_CHARS;
#endif
/* In an EBCDIC environment, Perl treats alphabetic ranges specially
because there are holes in the encoding, and simply using the range
A-Z (for example) would include the characters in the holes. This
applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
#ifdef EBCDIC
if (range_is_literal &&
(cb->ctypes[c] & ctype_letter) != 0 &&
(cb->ctypes[d] & ctype_letter) != 0 &&
(c <= CHAR_z) == (d <= CHAR_z))
{
uint32_t uc = (d <= CHAR_z)? 0 : 64;
uint32_t C = c - uc;
uint32_t D = d - uc;
if (C <= CHAR_i)
{
add_to_class(options, xoptions, cb, C + uc,
((D < CHAR_i)? D : CHAR_i) + uc);
C = CHAR_j;
}
if (C <= D && C <= CHAR_r)
{
add_to_class(options, xoptions, cb, C + uc,
((D < CHAR_r)? D : CHAR_r) + uc);
C = CHAR_s;
}
if (C <= D)
add_to_class(options, xoptions, cb, C + uc, D + uc);
}
else
#endif
/* Not an EBCDIC special range */
add_to_class(options, xoptions, cb, c, d);
#else
PCRE2_ASSERT(cranges != NULL);
#endif
continue;
} /* End of range handling */
/* Character ranges are ignored when class_ranges is present. */
#if PCRE2_CODE_UNIT_WIDTH == 8
#ifdef SUPPORT_UNICODE
if (cranges != NULL) continue;
xclass_props |= XCLASS_HAS_8BIT_CHARS;
#endif
/* Handle a single character. */
add_to_class(options, xoptions, cb, meta, meta);
#else
PCRE2_ASSERT(cranges != NULL);
#endif
} /* End of main class-processing loop */
END_PROCESSING:
#ifdef SUPPORT_WIDE_CHARS
PCRE2_ASSERT((xclass_props & XCLASS_HAS_PROPS) == 0 ||
(xclass_props & XCLASS_HIGH_ANY) == 0);
if (cranges != NULL)
{
uint32_t *range = (uint32_t*)(cranges + 1);
uint32_t *end = range + cranges->range_list_size;
while (range < end && range[0] < 256)
{
PCRE2_ASSERT((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0);
/* Add range to bitset. If we are in UTF or UCP mode, then clear the
caseless bit, because the cranges handle caselessness (only) in this
condition; see the condition for PARSE_CLASS_CASELESS_UTF in
compile_optimize_class(). */
add_to_class(((options & (PCRE2_UTF|PCRE2_UCP)) != 0)?
(options & ~PCRE2_CASELESS) : options, xoptions, cb, range[0], range[1]);
if (range[1] > 255) break;
range += 2;
}
if (cranges->char_lists_size > 0)
{
/* The cranges structure is still used and freed later. */
PCRE2_ASSERT((xclass_props & XCLASS_HIGH_ANY) == 0);
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_CHAR_LISTS;
}
else
{
if ((xclass_props & XCLASS_HIGH_ANY) != 0)
{
PCRE2_ASSERT(range + 2 == end && range[0] <= 256 &&
range[1] >= GET_MAX_CHAR_VALUE(utf));
should_flip_negation = TRUE;
range = end;
}
while (range < end)
{
uint32_t range_start = range[0];
uint32_t range_end = range[1];
range += 2;
xclass_props |= XCLASS_REQUIRED;
if (range_start < 256) range_start = 256;
if (lengthptr != NULL)
{
#ifdef SUPPORT_UNICODE
if (utf)
{
*lengthptr += 1;
if (range_start < range_end)
*lengthptr += PRIV(ord2utf)(range_start, class_uchardata);
*lengthptr += PRIV(ord2utf)(range_end, class_uchardata);
continue;
}
#endif /* SUPPORT_UNICODE */
*lengthptr += range_start < range_end ? 3 : 2;
continue;
}
#ifdef SUPPORT_UNICODE
if (utf)
{
if (range_start < range_end)
{
*class_uchardata++ = XCL_RANGE;
class_uchardata += PRIV(ord2utf)(range_start, class_uchardata);
}
else
*class_uchardata++ = XCL_SINGLE;
class_uchardata += PRIV(ord2utf)(range_end, class_uchardata);
continue;
}
#endif /* SUPPORT_UNICODE */
/* Without UTF support, character values are constrained
by the bit length, and can only be > 256 for 16-bit and
32-bit libraries. */
#if PCRE2_CODE_UNIT_WIDTH != 8
if (range_start < range_end)
{
*class_uchardata++ = XCL_RANGE;
*class_uchardata++ = range_start;
}
else
*class_uchardata++ = XCL_SINGLE;
*class_uchardata++ = range_end;
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
}
if (lengthptr == NULL)
cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
}
}
#endif /* SUPPORT_WIDE_CHARS */
/* If there are characters with values > 255, or Unicode property settings
(\p or \P), we have to compile an extended class, with its own opcode,
unless there were no property settings and there was a negated special such
as \S in the class, and PCRE2_UCP is not set, because in that case all
characters > 255 are in or not in the class, so any that were explicitly
given as well can be ignored.
In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
present in a class, we either have to match or not match all wide
characters (depending on whether the whole class is or is not negated).
This requirement is indicated by match_all_or_no_wide_chars being true.
We do this by including an explicit range, which works in both cases.
This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
cannot be any wide characters in 8-bit non-UTF mode.
When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
class where \S etc is present without PCRE2_UCP, causing an extended class
to be compiled, we make sure that all characters > 255 are included by
forcing match_all_or_no_wide_chars to be true.
If, when generating an xclass, there are no characters < 256, we can omit
the bitmap in the actual compiled code. */
#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
if ((xclass_props & XCLASS_REQUIRED) != 0)
{
PCRE2_UCHAR *previous = code;
if ((xclass_props & XCLASS_HAS_CHAR_LISTS) == 0)
*class_uchardata++ = XCL_END; /* Marks the end of extra data */
*code++ = OP_XCLASS;
code += LINK_SIZE;
*code = negate_class? XCL_NOT:0;
if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP;
/* If the map is required, move up the extra data to make room for it;
otherwise just move the code pointer to the end of the extra data. */
if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0 || has_bitmap != NULL)
{
if (negate_class)
{
uint32_t *classwords = cb->classbits.classwords;
for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
}
if (has_bitmap == NULL)
{
*code++ |= XCL_MAP;
(void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
CU2BYTES(class_uchardata - code));
memcpy(code, classbits, 32);
code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
}
else
{
code = class_uchardata;
if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0)
*has_bitmap = TRUE;
}
}
else code = class_uchardata;
if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0)
{
/* Char lists size is an even number, because all items are 16 or 32
bit values. The character list data is always aligned to 32 bits. */
size_t char_lists_size = cranges->char_lists_size;
PCRE2_ASSERT((char_lists_size & 0x1) == 0 &&
(cb->char_lists_size & 0x3) == 0);
if (lengthptr != NULL)
{
char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
#if PCRE2_CODE_UNIT_WIDTH == 8
*lengthptr += 2 + LINK_SIZE;
#else
*lengthptr += 1 + LINK_SIZE;
#endif
cb->char_lists_size += char_lists_size;
char_lists_size /= sizeof(PCRE2_UCHAR);
/* Storage space for character lists is included
in the maximum pattern size. */
if (*lengthptr > MAX_PATTERN_SIZE ||
MAX_PATTERN_SIZE - *lengthptr < char_lists_size)
{
*errorcodeptr = ERR20; /* Pattern is too large */
return NULL;
}
}
else
{
uint8_t *data;
PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK);
#if PCRE2_CODE_UNIT_WIDTH == 8
/* Encode as high / low bytes. */
code[0] = (uint8_t)(XCL_LIST |
(cranges->char_lists_types >> 8));
code[1] = (uint8_t)cranges->char_lists_types;
code += 2;
#else
*code++ = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);
#endif
/* Character lists are stored in backwards direction from
byte code start. The non-dfa/dfa matchers can access these
lists using the byte code start stored in match blocks.
Each list is aligned to 32 bit with an optional unused
16 bit value at the beginning of the character list. */
cb->char_lists_size += char_lists_size;
data = (uint8_t*)cb->start_code - cb->char_lists_size;
memcpy(data, (uint8_t*)(cranges + 1) + cranges->char_lists_start,
char_lists_size);
/* Since character lists total size is less than MAX_PATTERN_SIZE,
their starting offset fits into a value which size is LINK_SIZE. */
char_lists_size = cb->char_lists_size;
PUT(code, 0, (uint32_t)(char_lists_size >> 1));
code += LINK_SIZE;
#if defined PCRE2_DEBUG || defined SUPPORT_VALGRIND
if ((char_lists_size & 0x2) != 0)
{
/* In debug the unused 16 bit value is set
to a fixed value and marked unused. */
((uint16_t*)data)[-1] = 0x5555;
#ifdef SUPPORT_VALGRIND
VALGRIND_MAKE_MEM_NOACCESS(data - 2, 2);
#endif
}
#endif
cb->char_lists_size =
CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
}
}
/* Now fill in the complete length of the item */
PUT(previous, 1, (int)(code - previous));
goto DONE; /* End of class handling */
}
#endif /* SUPPORT_WIDE_CHARS */
/* If there are no characters > 255, or they are all to be included or
excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
whole class was negated and whether there were negative specials such as \S
(non-UCP) in the class. Then copy the 32-byte map into the code vector,
negating it if necessary. */
if (negate_class)
{
uint32_t *classwords = cb->classbits.classwords;
for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
}
if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) &&
cb->classbits.classwords[0] == ~(uint32_t)0)
{
const uint32_t *classwords = cb->classbits.classwords;
int i;
for (i = 0; i < 8; i++)
if (classwords[i] != ~(uint32_t)0) break;
if (i == 8)
{
*code++ = OP_ALLANY;
goto DONE; /* End of class handling */
}
}
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
memcpy(code, classbits, 32);
code += 32 / sizeof(PCRE2_UCHAR);
DONE:
*pcode = code;
return pptr - 1;
}
/* ===================================================================*/
/* Here follows a block of ECLASS-compiling functions. You may well want to
read them from top to bottom; they are ordered from leafmost (at the top) to
outermost parser (at the bottom of the file). */
/* Apply logical negation to the single operand described by *pop_info. The
descriptor (and, outside the measuring pass, the operand's code) is updated
in place.

Arguments:
  pop_info            the operand to negate
  lengthptr           non-NULL during the length-measuring pass; in that case
                        no code is written and only lengths are adjusted
  preserve_classbits  when TRUE the operand's bitmap is left as it is (the
                        caller deals with it); when FALSE all eight 32-bit
                        words of the bitmap are complemented
*/

static void
fold_negation(eclass_op_info *pop_info, PCRE2_SIZE *lengthptr,
  BOOL preserve_classbits)
{
const BOOL emitting = (lengthptr == NULL);

if (pop_info->op_single_type == 0)
  {
  /* The operand is already a compound expression. Do not walk back into it
  to push the negation down (that would make compilation quadratic); simply
  append an explicit ECL_NOT after it. */

  if (emitting)
    pop_info->code_start[pop_info->length] = ECL_NOT;
  else
    *lengthptr += 1;
  pop_info->length += 1;
  }

/* The two constant operands simply swap over; no ECL_NOT is needed. */

else if (pop_info->op_single_type == ECL_ANY)
  {
  pop_info->op_single_type = ECL_NONE;
  if (emitting) pop_info->code_start[0] = pop_info->op_single_type;
  }

else if (pop_info->op_single_type == ECL_NONE)
  {
  pop_info->op_single_type = ECL_ANY;
  if (emitting) pop_info->code_start[0] = pop_info->op_single_type;
  }

/* The only remaining single-op kind is an XCLASS, whose flags unit (just
after the opcode and the link) carries its own negation bit. */

else
  {
  PCRE2_ASSERT(pop_info->op_single_type == ECL_XCLASS &&
               pop_info->length >= 1 + LINK_SIZE + 1);
  if (emitting) pop_info->code_start[1 + LINK_SIZE] ^= XCL_NOT;
  }

if (preserve_classbits) return;

  {
  uint32_t *word = pop_info->bits.classwords;
  uint32_t *stop = word + 8;
  for (; word < stop; word++) *word = ~(*word);
  }
}
/* Combine two operands with a binary operator (ECL_AND, ECL_OR or ECL_XOR),
constant-folding wherever one side is ECL_ANY or ECL_NONE. The combined
operand is described by *lhs_op_info on return; *rhs_op_info is only read.
When lengthptr is non-NULL (measuring pass) no code is written.

Each operator has a "neutral" constant, which leaves the other operand
unchanged, and a "dominant" constant:

  op    neutral   dominant   effect of a dominant operand
  --------------------------------------------------------
  AND   ANY       NONE       result is NONE
  OR    NONE      ANY        result is ANY
  XOR   NONE      ANY        result is the negation of the other operand

The tests are made in this order: RHS neutral, LHS neutral, RHS dominant,
LHS dominant, and otherwise the operator is emitted in postfix position
after the two operands. Finally the bitmaps are combined with the same
operator, whichever path was taken. */

static void
fold_binary(int op, eclass_op_info *lhs_op_info, eclass_op_info *rhs_op_info,
  PCRE2_SIZE *lengthptr)
{
const BOOL emitting = (lengthptr == NULL);
BOOL adopt_rhs = FALSE;    /* Replace the LHS chunk by the RHS chunk */
BOOL negate_lhs = FALSE;   /* Then negate whatever the LHS now holds */
int neutral;
int dominant;
uint32_t *lhs_words = lhs_op_info->bits.classwords;
const uint32_t *rhs_words = rhs_op_info->bits.classwords;

switch (op)
  {
  case ECL_AND:
  neutral = ECL_ANY;
  dominant = ECL_NONE;
  break;

  case ECL_OR:
  case ECL_XOR:
  neutral = ECL_NONE;
  dominant = ECL_ANY;
  break;

  default:
  PCRE2_DEBUG_UNREACHABLE();
  return;
  }

if (rhs_op_info->op_single_type == neutral)
  {
  /* Nothing to do: the RHS is simply dropped. */
  }

else if (lhs_op_info->op_single_type == neutral)
  {
  /* The LHS is dropped and the RHS takes its place. */
  adopt_rhs = TRUE;
  }

else if (rhs_op_info->op_single_type == dominant)
  {
  if (op == ECL_XOR)
    {
    /* X ^ ANY is !X. The bitmap is handled by the XOR loop below, so ask
    fold_negation() to leave it alone. */
    negate_lhs = TRUE;
    }
  else
    {
    /* The result is the dominant constant itself: overwrite the LHS. At
    this point the RHS type is exactly that constant. */
    lhs_op_info->op_single_type = rhs_op_info->op_single_type;
    lhs_op_info->length = 1;
    if (emitting) lhs_op_info->code_start[0] = lhs_op_info->op_single_type;
    }
  }

else if (lhs_op_info->op_single_type == dominant)
  {
  /* For AND and OR the LHS already is the result, and the RHS is dropped.
  For XOR the result is !RHS: move the RHS down, then negate it. */
  if (op == ECL_XOR)
    {
    adopt_rhs = TRUE;
    negate_lhs = TRUE;
    }
  }

else
  {
  /* Both sides are either ECL_XCLASS or compound expressions; the operator
  has to be emitted, directly after the RHS code. */
  if (emitting)
    {
    PCRE2_ASSERT(rhs_op_info->code_start ==
                 lhs_op_info->code_start + lhs_op_info->length);
    rhs_op_info->code_start[rhs_op_info->length] = (PCRE2_UCHAR)op;
    }
  else
    *lengthptr += 1;

  lhs_op_info->length += rhs_op_info->length + 1;
  lhs_op_info->op_single_type = 0;
  }

if (adopt_rhs)
  {
  if (emitting)
    memmove(lhs_op_info->code_start, rhs_op_info->code_start,
      CU2BYTES(rhs_op_info->length));
  lhs_op_info->length = rhs_op_info->length;
  lhs_op_info->op_single_type = rhs_op_info->op_single_type;
  }

if (negate_lhs) fold_negation(lhs_op_info, lengthptr, TRUE);

/* The bitmaps are always combined with the operator that was asked for. */

for (int i = 0; i < 8; i++)
  {
  if (op == ECL_AND)
    lhs_words[i] &= rhs_words[i];
  else if (op == ECL_OR)
    lhs_words[i] |= rhs_words[i];
  else
    lhs_words[i] ^= rhs_words[i];
  }
}
/* Forward declaration. compile_class_operand(), immediately below, calls
compile_eclass_nested() when it meets a bracketed nested extended class, but
the definition only appears further down the file, after the functions that
parse the operators. */
static BOOL
compile_eclass_nested(eclass_context *context, BOOL negated,
uint32_t **pptr, PCRE2_UCHAR **pcode,
eclass_op_info *pop_info, PCRE2_SIZE *lengthptr);
/* This function consumes one operand of an extended class expression and
describes it in *pop_info. An operand is one of:

  . an empty class (META_CLASS_EMPTY or META_CLASS_EMPTY_NOT);
  . a bracketed nested extended class, which is handed over to
    compile_eclass_nested();
  . a "leaf": a run of characters, ranges and properties (bracketed or not),
    compiled by PRIV(compile_class_not_nested)(). The OP_ALLANY, OP_CLASS,
    OP_NCLASS or OP_XCLASS that it produces is rewritten as ECL_ANY,
    ECL_NONE, ECL_ANY or ECL_XCLASS respectively, and the bitmap is hoisted
    into pop_info->bits.

The "negated" argument requests the complement of the operand; it is folded
into the operand here rather than emitted as an operator.

Returns TRUE on success, with *pptr advanced past the operand. On failure
FALSE is returned and neither *pptr nor *pcode is updated. When lengthptr is
non-NULL, the space required is added to *lengthptr and *pcode comes back
unchanged (this is asserted at DONE). */
static BOOL
compile_class_operand(eclass_context *context, BOOL negated,
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
PCRE2_SIZE *lengthptr)
{
uint32_t *ptr = *pptr;
uint32_t *prev_ptr;
PCRE2_UCHAR *code = *pcode;
PCRE2_UCHAR *code_start = code;
/* Snapshot of *lengthptr, used below to find out how much the leaf compiler
added to it. */
PCRE2_SIZE prev_length = (lengthptr != NULL)? *lengthptr : 0;
PCRE2_SIZE extra_length;
uint32_t meta = META_CODE(*ptr);
switch (meta)
{
/* Empty classes become a single constant: ECL_NONE for META_CLASS_EMPTY and
ECL_ANY for META_CLASS_EMPTY_NOT, the other way round when "negated" is set.
The bitmap is set to all zeros or all ones to match. */
case META_CLASS_EMPTY_NOT:
case META_CLASS_EMPTY:
++ptr;
pop_info->length = 1;
if ((meta == META_CLASS_EMPTY) == negated)
{
*code++ = pop_info->op_single_type = ECL_ANY;
memset(pop_info->bits.classbits, 0xff, 32);
}
else
{
*code++ = pop_info->op_single_type = ECL_NONE;
memset(pop_info->bits.classbits, 0, 32);
}
break;
case META_CLASS:
case META_CLASS_NOT:
/* A nested extended class fills in *pop_info completely by itself, so the
common tail after the switch is skipped by jumping straight to DONE. */
if ((*ptr & CLASS_IS_ECLASS) != 0)
{
if (!compile_eclass_nested(context, negated, &ptr, &code,
pop_info, lengthptr))
return FALSE;
PCRE2_ASSERT(*ptr == META_CLASS_END);
ptr++;
goto DONE;
}
/* An ordinary bracketed class: step over the opening meta and treat the
contents as a leaf. */
ptr++;
/* Fall through */
default:
/* Scan forward characters, ranges, and properties.
For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but
we still need to collect that fragment up into a "leaf" OP_CLASS. */
prev_ptr = ptr;
/* The negation passed down combines the caller's "negated" flag with the
class's own META_CLASS_NOT. */
ptr = PRIV(compile_class_not_nested)(
context->options, context->xoptions, ptr, &code,
(meta != META_CLASS_NOT) == negated, &context->needs_bitmap,
context->errorcodeptr, context->cb, lengthptr);
if (ptr == NULL) return FALSE;
/* We must have a 100% guarantee that ptr increases when
compile_class_operand() returns, even on Release builds, so that we can
statically prove our loops terminate. */
if (ptr <= prev_ptr)
{
PCRE2_DEBUG_UNREACHABLE();
return FALSE;
}
/* If we fell through above, consume the closing ']'. */
if (meta == META_CLASS || meta == META_CLASS_NOT)
{
PCRE2_ASSERT(*ptr == META_CLASS_END);
ptr++;
}
/* Regardless of whether (lengthptr == NULL), some data will still be written
out to *pcode, which we need: we have to peek at it, to transform the opcode
into the ECLASS version (since we need to hoist up the bitmaps). */
PCRE2_ASSERT(code > code_start);
/* This is whatever the leaf compiler added to *lengthptr, over and above the
code units it wrote at code_start. */
extra_length = (lengthptr != NULL)? *lengthptr - prev_length : 0;
/* Easiest case: convert OP_ALLANY to ECL_ANY */
if (*code_start == OP_ALLANY)
{
PCRE2_ASSERT(code - code_start == 1 && extra_length == 0);
pop_info->length = 1;
*code_start = pop_info->op_single_type = ECL_ANY;
memset(pop_info->bits.classbits, 0xff, 32);
}
/* For OP_CLASS and OP_NCLASS, we hoist out the bitmap and convert to
ECL_NONE / ECL_ANY respectively. */
else if (*code_start == OP_CLASS || *code_start == OP_NCLASS)
{
PCRE2_ASSERT(code - code_start == 1 + 32 / sizeof(PCRE2_UCHAR) &&
extra_length == 0);
pop_info->length = 1;
*code_start = pop_info->op_single_type =
(*code_start == OP_CLASS)? ECL_NONE : ECL_ANY;
memcpy(pop_info->bits.classbits, code_start + 1, 32);
/* Rewind the code pointer, but make sure we adjust *lengthptr, because we
do need to reserve that space (even though we only use it temporarily). */
if (lengthptr != NULL)
*lengthptr += code - (code_start + 1);
code = code_start + 1;
/* Record whether a bitmap is needed. If the flag is not yet set and this
operand is ECL_NONE, it is set only when the hoisted map has at least one
bit; in every other case (the flag is already set, or the operand is
ECL_ANY) it is set unconditionally. */
if (!context->needs_bitmap && *code_start == ECL_NONE)
{
uint32_t *classwords = pop_info->bits.classwords;
for (int i = 0; i < 8; i++)
if (classwords[i] != 0)
{
context->needs_bitmap = TRUE;
break;
}
}
else
context->needs_bitmap = TRUE;
}
/* Finally, for OP_XCLASS we hoist out the bitmap (if any), and convert to
ECL_XCLASS. */
else
{
PCRE2_ASSERT(*code_start == OP_XCLASS);
*code_start = pop_info->op_single_type = ECL_XCLASS;
PCRE2_ASSERT(code - code_start >= 1 + LINK_SIZE + 1);
/* The bitmap is taken from the compile block's classbits rather than from
the emitted code. NOTE(review): presumably the leaf compiler leaves the map
there instead of writing it into the XCLASS because it was given a non-NULL
bitmap flag pointer - confirm against PRIV(compile_class_not_nested)(). */
memcpy(pop_info->bits.classbits, context->cb->classbits.classbits, 32);
pop_info->length = (code - code_start) + extra_length;
}
break;
} /* End of switch(meta) */
/* During the measuring pass there is no real code, so code_start is recorded
as NULL; what was written is accounted for in *lengthptr and the code pointer
is rewound. */
pop_info->code_start = (lengthptr == NULL)? code_start : NULL;
if (lengthptr != NULL)
{
*lengthptr += code - code_start;
code = code_start;
}
DONE:
PCRE2_ASSERT(lengthptr == NULL || (code == code_start));
*pptr = ptr;
*pcode = code;
return TRUE;
}
/* Parse a run of operands that are unioned implicitly, simply by being
written next to one another. Each operand may be a group of characters,
ranges and properties, or a nested class. The run ends at META_CLASS_END or
at any explicit operator.

The combined result is described by *pop_info. TRUE is returned on success,
with *pptr and *pcode updated; on failure FALSE is returned and neither of
them is written. */

static BOOL
compile_class_juxtaposition(eclass_context *context, BOOL negated,
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
  PCRE2_SIZE *lengthptr)
{
/* See compile_class_binary_loose() for comments on compile-time folding of
the "negated" flag. The operator and the negation of every right-hand operand
depend only on that flag:

    A juxtapose B     ->   A || B
  !(A juxtapose B)    ->  !A && !B
*/

const uint32_t join_op = negated? ECL_AND : ECL_OR;
const BOOL rhs_negated = negated? TRUE : FALSE;

uint32_t *cursor = *pptr;
PCRE2_UCHAR *out = *pcode;
#ifdef PCRE2_DEBUG
PCRE2_UCHAR *start_code = *pcode;
#endif

/* Because it's a non-empty class, there must be an operand at the start. */

if (!compile_class_operand(context, negated, &cursor, &out, pop_info,
      lengthptr))
  return FALSE;

for (;;)
  {
  eclass_op_info rhs_op_info;

  if (*cursor == META_CLASS_END) break;
  if (*cursor >= META_ECLASS_AND && *cursor <= META_ECLASS_NOT) break;

  /* Anything else is another operand. */

  if (!compile_class_operand(context, rhs_negated, &cursor, &out,
        &rhs_op_info, lengthptr))
    return FALSE;

  /* Convert infix to postfix (RPN). */

  fold_binary(join_op, pop_info, &rhs_op_info, lengthptr);
  if (lengthptr == NULL)
    out = pop_info->code_start + pop_info->length;
  }

PCRE2_ASSERT(lengthptr == NULL || out == start_code);

*pptr = cursor;
*pcode = out;
return TRUE;
}
/* Parse any number of unary prefix NOT operators followed by a juxtaposed
group of operands. Each META_ECLASS_NOT just toggles the "negated" flag that
is passed down, so no code is emitted for the operators themselves.

*pptr is moved past the NOT operators before the operand is parsed, so it
stays advanced even if FALSE is returned. */

static BOOL
compile_class_unary(eclass_context *context, BOOL negated,
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
  PCRE2_SIZE *lengthptr)
{
#ifdef PCRE2_DEBUG
PCRE2_UCHAR *start_code = *pcode;
#endif

for (; **pptr == META_ECLASS_NOT; ++(*pptr))
  negated = !negated;

/* Because it's a non-empty class, there must be an operand. */

if (compile_class_juxtaposition(context, negated, pptr, pcode,
      pop_info, lengthptr))
  {
  PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);
  return TRUE;
  }

return FALSE;
}
/* Parse a chain of operands joined by the tightly-binding binary operator
(META_ECLASS_AND), applied left to right.

The combined result is described by *pop_info. TRUE is returned on success,
with *pptr and *pcode updated; on failure FALSE is returned and neither of
them is written. */

static BOOL
compile_class_binary_tight(eclass_context *context, BOOL negated,
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
  PCRE2_SIZE *lengthptr)
{
/* See compile_class_binary_loose() for comments on compile-time folding of
the "negated" flag. The operator and the negation of every right-hand operand
depend only on that flag:

    A && B     ->   A && B
  !(A && B)    ->  !A || !B
*/

const uint32_t join_op = negated? ECL_OR : ECL_AND;
const BOOL rhs_negated = negated? TRUE : FALSE;

uint32_t *cursor = *pptr;
PCRE2_UCHAR *out = *pcode;
#ifdef PCRE2_DEBUG
PCRE2_UCHAR *start_code = *pcode;
#endif

/* Because it's a non-empty class, there must be an operand at the start. */

if (!compile_class_unary(context, negated, &cursor, &out, pop_info,
      lengthptr))
  return FALSE;

while (*cursor == META_ECLASS_AND)
  {
  eclass_op_info rhs_op_info;

  cursor++;   /* Step over the operator */

  /* An operand must follow the operator. */

  if (!compile_class_unary(context, rhs_negated, &cursor, &out,
        &rhs_op_info, lengthptr))
    return FALSE;

  /* Convert infix to postfix (RPN). */

  fold_binary(join_op, pop_info, &rhs_op_info, lengthptr);
  if (lengthptr == NULL)
    out = pop_info->code_start + pop_info->length;
  }

PCRE2_ASSERT(lengthptr == NULL || out == start_code);

*pptr = cursor;
*pcode = out;
return TRUE;
}
/* Parse a chain of operands joined by the loosely-binding binary operators
(those in the range META_ECLASS_OR to META_ECLASS_XOR: union, subtraction and
symmetric difference), applied left to right.

The combined result is described by *pop_info. TRUE is returned on success,
with *pptr and *pcode updated; on failure FALSE is returned and neither of
them is written. */

static BOOL
compile_class_binary_loose(eclass_context *context, BOOL negated,
  uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
  PCRE2_SIZE *lengthptr)
{
uint32_t *cursor = *pptr;
PCRE2_UCHAR *out = *pcode;
#ifdef PCRE2_DEBUG
PCRE2_UCHAR *start_code = *pcode;
#endif

/* We really want to fold the negation operator, if at all possible, so that
simple cases can be reduced down. In particular, in 8-bit no-UTF mode, we
want to produce a fully-folded expression, so that we can guarantee not to
emit any OP_ECLASS codes (in the same way that we never emit OP_XCLASS in
this mode).

This has the consequence that with a little ingenuity, we can in fact avoid
emitting (nearly...) all cases of the "NOT" operator. Imagine that we have:

  !(A ...

We have parsed the preceding "!", and we are about to parse the "A" operand.
We don't know yet whether there will even be a following binary operand! Both
of these are possibilities for what follows:

  !(A && B)
  !(A)

However, we can still fold the "!" into the "A" operand, because no matter
what the following binary operator will be, we can produce an expression
which is equivalent. */

/* Because it's a non-empty class, there must be an operand at the start. */

if (!compile_class_binary_tight(context, negated, &cursor, &out,
      pop_info, lengthptr))
  return FALSE;

for (;;)
  {
  const uint32_t token = *cursor;
  uint32_t op;
  BOOL negate_result;
  BOOL rhs_negated;
  eclass_op_info rhs_op_info;

  if (token < META_ECLASS_OR || token > META_ECLASS_XOR) break;
  cursor++;   /* Step over the operator */

  /* Choose the operator that is emitted, and how the RHS is negated. When
  the whole expression is negated, the LHS has already been negated
  unconditionally, and we can recover whatever the operator turns out to be:

      A || B    ->   A || B
      A -- B    ->   A && !B
      A XOR B   ->   A XOR B

    !(A || B)   ->  !A && !B
    !(A -- B)   ->  !(A && !B)    ->  !A || B
    !(A XOR B)  ->  !(!A XOR !B)  ->  !A XNOR !B

  The XNOR is produced as an XOR whose result is then negated. */

  if (token == META_ECLASS_OR)
    op = negated? ECL_AND : ECL_OR;
  else if (token == META_ECLASS_SUB)
    op = negated? ECL_OR : ECL_AND;
  else
    op = ECL_XOR;

  negate_result = negated? (token == META_ECLASS_XOR) : FALSE;
  rhs_negated = negated? (token != META_ECLASS_SUB) :
                         (token == META_ECLASS_SUB);

  /* An operand must follow the operator. */

  if (!compile_class_binary_tight(context, rhs_negated, &cursor, &out,
        &rhs_op_info, lengthptr))
    return FALSE;

  /* Convert infix to postfix (RPN). */

  fold_binary(op, pop_info, &rhs_op_info, lengthptr);
  if (negate_result) fold_negation(pop_info, lengthptr, FALSE);

  if (lengthptr == NULL)
    out = pop_info->code_start + pop_info->length;
  }

PCRE2_ASSERT(lengthptr == NULL || out == start_code);

*pptr = cursor;
*pcode = out;
return TRUE;
}
/* Compile one bracketed extended class: the META codes at *pptr are turned
into opcodes written at *pcode, and the result is described by *pop_info.

On entry *pptr must point at a META_CLASS or META_CLASS_NOT that has the
CLASS_IS_ECLASS bit set. A META_CLASS_NOT inverts the "negated" flag that is
passed down. The contents are compiled as a left-associative sequence of
operator applications.

On success TRUE is returned and *pptr is left pointing at the matching
META_CLASS_END. */

static BOOL
compile_eclass_nested(eclass_context *context, BOOL negated,
  uint32_t **pptr, PCRE2_UCHAR **pcode,
  eclass_op_info *pop_info, PCRE2_SIZE *lengthptr)
{
const uint32_t opener = **pptr;
#ifdef PCRE2_DEBUG
PCRE2_UCHAR *start_code = *pcode;
#endif

/* The CLASS_IS_ECLASS bit must be set since it is a nested class. */

PCRE2_ASSERT(opener == (META_CLASS | CLASS_IS_ECLASS) ||
             opener == (META_CLASS_NOT | CLASS_IS_ECLASS));

if (opener == (META_CLASS_NOT | CLASS_IS_ECLASS))
  negated = !negated;

++(*pptr);   /* Step over the opening meta */

/* Because it's a non-empty class, there must be an operand at the start. */

if (!compile_class_binary_loose(context, negated, pptr, pcode,
      pop_info, lengthptr))
  return FALSE;

PCRE2_ASSERT(**pptr == META_CLASS_END);
PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);

return TRUE;
}
BOOL
PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions,
uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr,
compile_block *cb, PCRE2_SIZE *lengthptr)
{
eclass_context context;
eclass_op_info op_info;
PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0;
PCRE2_UCHAR *code = *pcode;
PCRE2_UCHAR *previous;
BOOL allbitsone = TRUE;
context.needs_bitmap = FALSE;
context.options = options;
context.xoptions = xoptions;
context.errorcodeptr = errorcodeptr;
context.cb = cb;
previous = code;
*code++ = OP_ECLASS;
code += LINK_SIZE;
*code++ = 0; /* Flags, currently zero. */
if (!compile_eclass_nested(&context, FALSE, pptr, &code, &op_info, lengthptr))
return FALSE;
if (lengthptr != NULL)
{
*lengthptr += code - previous;
code = previous;
/* (*lengthptr - previous_length) now holds the amount of buffer that
we require to make the call to compile_class_nested() with
lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out
before that call. */
}
/* Do some useful counting of what's in the bitmap. */
for (int i = 0; i < 8; i++)
if (op_info.bits.classwords[i] != 0xffffffff)
{
allbitsone = FALSE;
break;
}
/* After constant-folding the extended class syntax, it may turn out to be
a simple class after all. In that case, we can unwrap it from the
OP_ECLASS container - and in fact, we must do so, because in 8-bit
no-Unicode mode the matcher is compiled without support for OP_ECLASS. */
#ifndef SUPPORT_WIDE_CHARS
PCRE2_ASSERT(op_info.op_single_type != 0);
#else
if (op_info.op_single_type != 0)
#endif
{
/* Rewind back over the OP_ECLASS. */
code = previous;
/* If the bits are all ones, and the "high characters" are all matched
too, we use a special-cased encoding of OP_ALLANY. */
if (op_info.op_single_type == ECL_ANY && allbitsone)
{
/* Advancing code means rewinding lengthptr, at this point. */
if (lengthptr != NULL) *lengthptr -= 1;
*code++ = OP_ALLANY;
}
/* If the high bits are all matched / all not-matched, then we emit an
OP_NCLASS/OP_CLASS respectively. */
else if (op_info.op_single_type == ECL_ANY ||
op_info.op_single_type == ECL_NONE)
{
PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR));
if (lengthptr != NULL)
{
if (required_len > (*lengthptr - previous_length))
*lengthptr = previous_length + required_len;
}
/* Advancing code means rewinding lengthptr, at this point. */
if (lengthptr != NULL) *lengthptr -= required_len;
*code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS;
memcpy(code, op_info.bits.classbits, 32);
code += 32 / sizeof(PCRE2_UCHAR);
}
/* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data
there, but, we pulled out its bitmap into op_info, so now we have to
put that back into the OP_XCLASS. */
else
{
#ifndef SUPPORT_WIDE_CHARS
PCRE2_DEBUG_UNREACHABLE();
#else
BOOL need_map = context.needs_bitmap;
PCRE2_SIZE required_len;
PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS);
required_len = op_info.length + (need_map? 32/sizeof(PCRE2_UCHAR) : 0);
if (lengthptr != NULL)
{
/* Don't unconditionally request all the space we need - we may
already have asked for more during processing of the ECLASS. */
if (required_len > (*lengthptr - previous_length))
*lengthptr = previous_length + required_len;
/* The code we write out here won't be ignored, even during the
(lengthptr != NULL) phase, because if there's a following quantifier
it will peek backwards. So we do have to write out a (truncated)
OP_XCLASS, even on this branch. */
*lengthptr -= 1 + LINK_SIZE + 1;
*code++ = OP_XCLASS;
PUT(code, 0, 1 + LINK_SIZE + 1);
code += LINK_SIZE;
*code++ = 0;
}
else
{
PCRE2_UCHAR *rest;
PCRE2_SIZE rest_len;
PCRE2_UCHAR flags;
/* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */
PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1);
rest = op_info.code_start + 1 + LINK_SIZE + 1;
rest_len = (op_info.code_start + op_info.length) - rest;
/* First read any data we use, before memmove splats it. */
flags = op_info.code_start[1 + LINK_SIZE];
PCRE2_ASSERT((flags & XCL_MAP) == 0);
/* Next do the memmove before any writes. */
memmove(code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0),
rest, CU2BYTES(rest_len));
/* Finally write the header data. */
*code++ = OP_XCLASS;
PUT(code, 0, (int)required_len);
code += LINK_SIZE;
*code++ = flags | (need_map? XCL_MAP : 0);
if (need_map)
{
memcpy(code, op_info.bits.classbits, 32);
code += 32 / sizeof(PCRE2_UCHAR);
}
code += rest_len;
}
#endif /* SUPPORT_WIDE_CHARS */
}
}
/* Otherwise, we're going to keep the OP_ECLASS. However, again we need
to do some adjustment to insert the bitmap if we have one. */
#ifdef SUPPORT_WIDE_CHARS
else
{
BOOL need_map = context.needs_bitmap;
PCRE2_SIZE required_len =
1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0) + op_info.length;
if (lengthptr != NULL)
{
if (required_len > (*lengthptr - previous_length))
*lengthptr = previous_length + required_len;
/* As for the XCLASS branch above, we do have to write out a dummy
OP_ECLASS, because of the backwards peek by the quantifier code. Write
out a (truncated) OP_ECLASS, even on this branch. */
*lengthptr -= 1 + LINK_SIZE + 1;
*code++ = OP_ECLASS;
PUT(code, 0, 1 + LINK_SIZE + 1);
code += LINK_SIZE;
*code++ = 0;
}
else
{
if (need_map)
{
PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1;
previous[1 + LINK_SIZE] |= ECL_MAP;
memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start,
CU2BYTES(code - map_start));
memcpy(map_start, op_info.bits.classbits, 32);
code += 32 / sizeof(PCRE2_UCHAR);
}
PUT(previous, 1, (int)(code - previous));
}
}
#endif /* SUPPORT_WIDE_CHARS */
*pcode = code;
return TRUE;
}
/* End of pcre2_compile_class.c */