mirror of
https://github.com/php/php-src.git
synced 2025-08-15 21:48:51 +02:00
2737 lines
74 KiB
C
2737 lines
74 KiB
C
/*************************************************
|
|
* Perl-Compatible Regular Expressions *
|
|
*************************************************/
|
|
|
|
/* PCRE is a library of functions to support regular expressions whose syntax
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
Original API code Copyright (c) 1997-2012 University of Cambridge
|
|
New API code Copyright (c) 2016-2024 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are met:
|
|
|
|
* Redistributions of source code must retain the above copyright notice,
|
|
this list of conditions and the following disclaimer.
|
|
|
|
* Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
|
|
* Neither the name of the University of Cambridge nor the names of its
|
|
contributors may be used to endorse or promote products derived from
|
|
this software without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
POSSIBILITY OF SUCH DAMAGE.
|
|
-----------------------------------------------------------------------------
|
|
*/
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config.h"
|
|
#endif
|
|
|
|
#include "pcre2_compile.h"
|
|
|
|
typedef struct {
|
|
/* Option bits for eclass. */
|
|
uint32_t options;
|
|
uint32_t xoptions;
|
|
/* Rarely used members. */
|
|
int *errorcodeptr;
|
|
compile_block *cb;
|
|
/* Bitmap is needed. */
|
|
BOOL needs_bitmap;
|
|
} eclass_context;
|
|
|
|
/* Provides the "default:" label of a switch which processes the tokens of a
class. In a release build the macro is nothing more than that label.

In debug mode it also checks the tokens allowed at the end of a class
structure: the default label asserts that the token is not greater than
META_END, and every allowed terminating token is listed as an explicit case
which shares the statements that follow the macro. When a new token is not
processed by all loops, and the token is equal to

a) one of the cases here:
   the compiler will complain about a duplicated case value.
b) none of the cases here:
   the loop without the handler will stop with an assertion failure. */

#ifndef PCRE2_DEBUG
#define CLASS_END_CASES(meta) \
  default:
#else
#define CLASS_END_CASES(meta) \
  default: \
  PCRE2_ASSERT((meta) <= META_END); \
  /* Fall through */ \
  case META_CLASS: \
  case META_CLASS_NOT: \
  case META_CLASS_EMPTY: \
  case META_CLASS_EMPTY_NOT: \
  case META_CLASS_END: \
  case META_ECLASS_AND: \
  case META_ECLASS_OR: \
  case META_ECLASS_SUB: \
  case META_ECLASS_XOR: \
  case META_ECLASS_NOT:
#endif
|
|
|
|
#ifdef SUPPORT_WIDE_CHARS
|
|
|
|
/* Heapsort algorithm: the sift-down step.

The buffer contains pairs of uint32_t values, so every index used here refers
to the first item of a pair. The heap is ordered by the first item of each
pair, with the greatest one at the top. The pair at index i is exchanged with
its greater child until neither child is greater than it.

Arguments:
  buffer     the array of pairs
  size       number of uint32_t items which belong to the heap
  i          index of the first item of the pair to be moved down
*/

static void do_heapify(uint32_t *buffer, size_t size, size_t i)
{
for (;;)
  {
  size_t largest = i;
  size_t child = (i << 1) + 2;  /* Left child, the right one is two further. */
  uint32_t first, second;

  if (child < size && buffer[child] > buffer[largest]) largest = child;
  child += 2;
  if (child < size && buffer[child] > buffer[largest]) largest = child;

  /* The pair is already in its place. */
  if (largest == i) return;

  /* Exchange both items of the two pairs. */
  first = buffer[i];
  second = buffer[i + 1];
  buffer[i] = buffer[largest];
  buffer[i + 1] = buffer[largest + 1];
  buffer[largest] = first;
  buffer[largest + 1] = second;

  i = largest;
  }
}
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
|
|
/* Option bits passed to parse_class() and to the functions called by it.
They are computed by compile_optimize_class() from the compile options. */

/* PCRE2_UTF is set. */
#define PARSE_CLASS_UTF             0x1
/* PCRE2_CASELESS is set together with PCRE2_UTF or PCRE2_UCP. */
#define PARSE_CLASS_CASELESS_UTF    0x2
/* PCRE2_EXTRA_CASELESS_RESTRICT is set. */
#define PARSE_CLASS_RESTRICTED_UTF  0x4
/* PCRE2_EXTRA_TURKISH_CASING is set. */
#define PARSE_CLASS_TURKISH_UTF     0x8
|
|
|
|
/* Get the range of nocase characters which includes the
|
|
'c' character passed as argument, or directly follows 'c'. */
|
|
|
|
static const uint32_t*
|
|
get_nocase_range(uint32_t c)
|
|
{
|
|
uint32_t left = 0;
|
|
uint32_t right = PRIV(ucd_nocase_ranges_size);
|
|
uint32_t middle;
|
|
|
|
if (c > MAX_UTF_CODE_POINT) return PRIV(ucd_nocase_ranges) + right;
|
|
|
|
while (TRUE)
|
|
{
|
|
/* Range end of the middle element. */
|
|
middle = ((left + right) >> 1) | 0x1;
|
|
|
|
if (PRIV(ucd_nocase_ranges)[middle] <= c)
|
|
left = middle + 1;
|
|
else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c)
|
|
right = middle - 1;
|
|
else
|
|
return PRIV(ucd_nocase_ranges) + (middle - 1);
|
|
}
|
|
}
|
|
|
|
/* Get the list of othercase characters, which belongs to the passed range.
|
|
Create ranges from these characters, and append them to the buffer argument. */
|
|
|
|
static size_t
|
|
utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options,
|
|
uint32_t *buffer)
|
|
{
|
|
uint32_t new_start = start;
|
|
uint32_t new_end = end;
|
|
uint32_t c = start;
|
|
const uint32_t *list;
|
|
uint32_t tmp[3];
|
|
size_t result = 2;
|
|
const uint32_t *skip_range = get_nocase_range(c);
|
|
uint32_t skip_start = skip_range[0];
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
PCRE2_ASSERT(options & PARSE_CLASS_UTF);
|
|
#endif
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH == 32
|
|
if (end > MAX_UTF_CODE_POINT) end = MAX_UTF_CODE_POINT;
|
|
#endif
|
|
|
|
while (c <= end)
|
|
{
|
|
uint32_t co;
|
|
|
|
if (c > skip_start)
|
|
{
|
|
c = skip_range[1];
|
|
skip_range += 2;
|
|
skip_start = skip_range[0];
|
|
continue;
|
|
}
|
|
|
|
/* Compute caseless set. */
|
|
|
|
if ((options & (PARSE_CLASS_TURKISH_UTF|PARSE_CLASS_RESTRICTED_UTF)) ==
|
|
PARSE_CLASS_TURKISH_UTF &&
|
|
UCD_ANY_I(c))
|
|
{
|
|
co = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
|
|
}
|
|
else if ((co = UCD_CASESET(c)) != 0 &&
|
|
(options & PARSE_CLASS_RESTRICTED_UTF) != 0 &&
|
|
PRIV(ucd_caseless_sets)[co] < 128)
|
|
{
|
|
co = 0; /* Ignore the caseless set if it's restricted. */
|
|
}
|
|
|
|
if (co != 0)
|
|
list = PRIV(ucd_caseless_sets) + co;
|
|
else
|
|
{
|
|
co = UCD_OTHERCASE(c);
|
|
list = tmp;
|
|
tmp[0] = c;
|
|
tmp[1] = NOTACHAR;
|
|
|
|
if (co != c)
|
|
{
|
|
tmp[1] = co;
|
|
tmp[2] = NOTACHAR;
|
|
}
|
|
}
|
|
c++;
|
|
|
|
/* Add characters. */
|
|
do
|
|
{
|
|
#if PCRE2_CODE_UNIT_WIDTH == 16
|
|
if (!(options & PARSE_CLASS_UTF) && *list > 0xffff) continue;
|
|
#endif
|
|
|
|
if (*list < new_start)
|
|
{
|
|
if (*list + 1 == new_start)
|
|
{
|
|
new_start--;
|
|
continue;
|
|
}
|
|
}
|
|
else if (*list > new_end)
|
|
{
|
|
if (*list - 1 == new_end)
|
|
{
|
|
new_end++;
|
|
continue;
|
|
}
|
|
}
|
|
else continue;
|
|
|
|
result += 2;
|
|
if (buffer != NULL)
|
|
{
|
|
buffer[0] = *list;
|
|
buffer[1] = *list;
|
|
buffer += 2;
|
|
}
|
|
}
|
|
while (*(++list) != NOTACHAR);
|
|
}
|
|
|
|
if (buffer != NULL)
|
|
{
|
|
buffer[0] = new_start;
|
|
buffer[1] = new_end;
|
|
buffer += 2;
|
|
(void)buffer;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
#endif
|
|
|
|
/* Add a character list to a buffer. */
|
|
|
|
static size_t
|
|
append_char_list(const uint32_t *p, uint32_t *buffer)
|
|
{
|
|
const uint32_t *n;
|
|
size_t result = 0;
|
|
|
|
while (*p != NOTACHAR)
|
|
{
|
|
n = p;
|
|
while (n[0] == n[1] - 1) n++;
|
|
|
|
PCRE2_ASSERT(*p < 0xffff);
|
|
|
|
if (buffer != NULL)
|
|
{
|
|
buffer[0] = *p;
|
|
buffer[1] = *n;
|
|
buffer += 2;
|
|
}
|
|
|
|
result += 2;
|
|
p = n + 1;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
static uint32_t
|
|
get_highest_char(uint32_t options)
|
|
{
|
|
(void)options; /* Avoid compiler warning. */
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
return MAX_UTF_CODE_POINT;
|
|
#else
|
|
#ifdef SUPPORT_UNICODE
|
|
return GET_MAX_CHAR_VALUE((options & PARSE_CLASS_UTF) != 0);
|
|
#else
|
|
return MAX_UCHAR_VALUE;
|
|
#endif
|
|
#endif
|
|
}
|
|
|
|
/* Add a negated character list to a buffer. */
|
|
static size_t
|
|
append_negated_char_list(const uint32_t *p, uint32_t options, uint32_t *buffer)
|
|
{
|
|
const uint32_t *n;
|
|
uint32_t start = 0;
|
|
size_t result = 2;
|
|
|
|
PCRE2_ASSERT(*p > 0);
|
|
|
|
while (*p != NOTACHAR)
|
|
{
|
|
n = p;
|
|
while (n[0] == n[1] - 1) n++;
|
|
|
|
PCRE2_ASSERT(*p < 0xffff);
|
|
|
|
if (buffer != NULL)
|
|
{
|
|
buffer[0] = start;
|
|
buffer[1] = *p - 1;
|
|
buffer += 2;
|
|
}
|
|
|
|
result += 2;
|
|
start = *n + 1;
|
|
p = n + 1;
|
|
}
|
|
|
|
if (buffer != NULL)
|
|
{
|
|
buffer[0] = start;
|
|
buffer[1] = get_highest_char(options);
|
|
buffer += 2;
|
|
(void)buffer;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/* Appends the range which starts at 0x100 and ends with the highest
character. Nothing is stored when buffer is NULL.

Returns:     the position after the stored pair, or NULL if buffer is NULL
*/

static uint32_t *
append_non_ascii_range(uint32_t options, uint32_t *buffer)
{
if (buffer != NULL)
  {
  *buffer++ = 0x100;
  *buffer++ = get_highest_char(options);
  }

return buffer;
}
|
|
|
|
static size_t
|
|
parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer)
|
|
{
|
|
size_t total_size = 0;
|
|
size_t size;
|
|
uint32_t meta_arg;
|
|
uint32_t start_char;
|
|
|
|
while (TRUE)
|
|
{
|
|
switch (META_CODE(*ptr))
|
|
{
|
|
case META_ESCAPE:
|
|
meta_arg = META_DATA(*ptr);
|
|
switch (meta_arg)
|
|
{
|
|
case ESC_D:
|
|
case ESC_W:
|
|
case ESC_S:
|
|
buffer = append_non_ascii_range(options, buffer);
|
|
total_size += 2;
|
|
break;
|
|
|
|
case ESC_h:
|
|
size = append_char_list(PRIV(hspace_list), buffer);
|
|
total_size += size;
|
|
if (buffer != NULL) buffer += size;
|
|
break;
|
|
|
|
case ESC_H:
|
|
size = append_negated_char_list(PRIV(hspace_list), options, buffer);
|
|
total_size += size;
|
|
if (buffer != NULL) buffer += size;
|
|
break;
|
|
|
|
case ESC_v:
|
|
size = append_char_list(PRIV(vspace_list), buffer);
|
|
total_size += size;
|
|
if (buffer != NULL) buffer += size;
|
|
break;
|
|
|
|
case ESC_V:
|
|
size = append_negated_char_list(PRIV(vspace_list), options, buffer);
|
|
total_size += size;
|
|
if (buffer != NULL) buffer += size;
|
|
break;
|
|
|
|
case ESC_p:
|
|
case ESC_P:
|
|
ptr++;
|
|
if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY)
|
|
{
|
|
if (buffer != NULL)
|
|
{
|
|
buffer[0] = 0;
|
|
buffer[1] = get_highest_char(options);
|
|
buffer += 2;
|
|
}
|
|
total_size += 2;
|
|
}
|
|
break;
|
|
}
|
|
ptr++;
|
|
continue;
|
|
case META_POSIX_NEG:
|
|
buffer = append_non_ascii_range(options, buffer);
|
|
total_size += 2;
|
|
ptr += 2;
|
|
continue;
|
|
case META_POSIX:
|
|
ptr += 2;
|
|
continue;
|
|
case META_BIGVALUE:
|
|
/* Character literal */
|
|
ptr++;
|
|
break;
|
|
CLASS_END_CASES(*ptr)
|
|
if (*ptr >= META_END) return total_size;
|
|
break;
|
|
}
|
|
|
|
start_char = *ptr;
|
|
|
|
if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED)
|
|
{
|
|
ptr += 2;
|
|
PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE);
|
|
|
|
if (*ptr == META_BIGVALUE) ptr++;
|
|
|
|
#ifdef EBCDIC
|
|
#error "Missing EBCDIC support"
|
|
#endif
|
|
}
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
if (options & PARSE_CLASS_CASELESS_UTF)
|
|
{
|
|
size = utf_caseless_extend(start_char, *ptr++, options, buffer);
|
|
if (buffer != NULL) buffer += size;
|
|
total_size += size;
|
|
continue;
|
|
}
|
|
#endif
|
|
|
|
if (buffer != NULL)
|
|
{
|
|
buffer[0] = start_char;
|
|
buffer[1] = *ptr;
|
|
buffer += 2;
|
|
}
|
|
|
|
ptr++;
|
|
total_size += 2;
|
|
}
|
|
|
|
return total_size;
|
|
}
|
|
|
|
/* Extra uint32_t values for storing the lengths of range lists in
|
|
the worst case. Two uint32_t lengths and a range end for a range
|
|
starting before 255 */
|
|
#define CHAR_LIST_EXTRA_SIZE 3
|
|
|
|
/* Starting character values for each character list. */
|
|
|
|
static const uint32_t char_list_starts[] = {
|
|
#if PCRE2_CODE_UNIT_WIDTH == 32
|
|
XCL_CHAR_LIST_HIGH_32_START,
|
|
#endif
|
|
#if PCRE2_CODE_UNIT_WIDTH == 32 || defined SUPPORT_UNICODE
|
|
XCL_CHAR_LIST_LOW_32_START,
|
|
#endif
|
|
XCL_CHAR_LIST_HIGH_16_START,
|
|
/* Must be terminated by XCL_CHAR_LIST_LOW_16_START,
|
|
which also represents the end of the bitset. */
|
|
XCL_CHAR_LIST_LOW_16_START,
|
|
};
|
|
|
|
static class_ranges *
|
|
compile_optimize_class(uint32_t *start_ptr, uint32_t options,
|
|
uint32_t xoptions, compile_block *cb)
|
|
{
|
|
class_ranges* cranges;
|
|
uint32_t *ptr;
|
|
uint32_t *buffer;
|
|
uint32_t *dst;
|
|
uint32_t class_options = 0;
|
|
size_t range_list_size = 0, total_size, i;
|
|
uint32_t tmp1, tmp2;
|
|
const uint32_t *char_list_next;
|
|
uint16_t *next_char;
|
|
uint32_t char_list_start, char_list_end;
|
|
uint32_t range_start, range_end;
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
if (options & PCRE2_UTF)
|
|
class_options |= PARSE_CLASS_UTF;
|
|
|
|
if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP)))
|
|
class_options |= PARSE_CLASS_CASELESS_UTF;
|
|
|
|
if (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT)
|
|
class_options |= PARSE_CLASS_RESTRICTED_UTF;
|
|
|
|
if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
|
|
class_options |= PARSE_CLASS_TURKISH_UTF;
|
|
#endif
|
|
|
|
/* Compute required space for the range. */
|
|
|
|
range_list_size = parse_class(start_ptr, class_options, NULL);
|
|
PCRE2_ASSERT((range_list_size & 0x1) == 0);
|
|
|
|
/* Allocate buffer. The total_size also represents the end of the buffer. */
|
|
|
|
total_size = range_list_size +
|
|
((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0);
|
|
|
|
cranges = cb->cx->memctl.malloc(
|
|
sizeof(class_ranges) + total_size * sizeof(uint32_t),
|
|
cb->cx->memctl.memory_data);
|
|
|
|
if (cranges == NULL) return NULL;
|
|
|
|
cranges->next = NULL;
|
|
cranges->range_list_size = (uint16_t)range_list_size;
|
|
cranges->char_lists_types = 0;
|
|
cranges->char_lists_size = 0;
|
|
cranges->char_lists_start = 0;
|
|
|
|
if (range_list_size == 0) return cranges;
|
|
|
|
buffer = (uint32_t*)(cranges + 1);
|
|
parse_class(start_ptr, class_options, buffer);
|
|
|
|
/* Using <= instead of == to help static analysis. */
|
|
if (range_list_size <= 2) return cranges;
|
|
|
|
/* In-place sorting of ranges. */
|
|
|
|
i = (((range_list_size >> 2) - 1) << 1);
|
|
while (TRUE)
|
|
{
|
|
do_heapify(buffer, range_list_size, i);
|
|
if (i == 0) break;
|
|
i -= 2;
|
|
}
|
|
|
|
i = range_list_size - 2;
|
|
while (TRUE)
|
|
{
|
|
tmp1 = buffer[i];
|
|
tmp2 = buffer[i + 1];
|
|
buffer[i] = buffer[0];
|
|
buffer[i + 1] = buffer[1];
|
|
buffer[0] = tmp1;
|
|
buffer[1] = tmp2;
|
|
|
|
do_heapify(buffer, i, 0);
|
|
if (i == 0) break;
|
|
i -= 2;
|
|
}
|
|
|
|
/* Merge ranges whenever possible. */
|
|
dst = buffer;
|
|
ptr = buffer + 2;
|
|
range_list_size -= 2;
|
|
|
|
/* The second condition is a very rare corner case, where the end of the last
|
|
range is the maximum character. This range cannot be extended further. */
|
|
|
|
while (range_list_size > 0 && dst[1] != ~(uint32_t)0)
|
|
{
|
|
if (dst[1] + 1 < ptr[0])
|
|
{
|
|
dst += 2;
|
|
dst[0] = ptr[0];
|
|
dst[1] = ptr[1];
|
|
}
|
|
else if (dst[1] < ptr[1]) dst[1] = ptr[1];
|
|
|
|
ptr += 2;
|
|
range_list_size -= 2;
|
|
}
|
|
|
|
PCRE2_ASSERT(dst[1] <= get_highest_char(class_options));
|
|
|
|
/* When the number of ranges are less than six,
|
|
they are not converted to range lists. */
|
|
|
|
ptr = buffer;
|
|
while (ptr < dst && ptr[1] < 0x100) ptr += 2;
|
|
if (dst - ptr < (2 * (6 - 1)))
|
|
{
|
|
cranges->range_list_size = (uint16_t)(dst + 2 - buffer);
|
|
return cranges;
|
|
}
|
|
|
|
/* Compute character lists structures. */
|
|
|
|
char_list_next = char_list_starts;
|
|
char_list_start = *char_list_next++;
|
|
#if PCRE2_CODE_UNIT_WIDTH == 32
|
|
char_list_end = XCL_CHAR_LIST_HIGH_32_END;
|
|
#elif defined SUPPORT_UNICODE
|
|
char_list_end = XCL_CHAR_LIST_LOW_32_END;
|
|
#else
|
|
char_list_end = XCL_CHAR_LIST_HIGH_16_END;
|
|
#endif
|
|
next_char = (uint16_t*)(buffer + total_size);
|
|
|
|
tmp1 = 0;
|
|
tmp2 = ((sizeof(char_list_starts) / sizeof(uint32_t)) - 1) * XCL_TYPE_BIT_LEN;
|
|
PCRE2_ASSERT(tmp2 <= 3 * XCL_TYPE_BIT_LEN && tmp2 >= XCL_TYPE_BIT_LEN);
|
|
range_start = dst[0];
|
|
range_end = dst[1];
|
|
|
|
while (TRUE)
|
|
{
|
|
if (range_start >= char_list_start)
|
|
{
|
|
if (range_start == range_end || range_end < char_list_end)
|
|
{
|
|
tmp1++;
|
|
next_char--;
|
|
|
|
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
|
|
*next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
|
|
else
|
|
*(uint32_t*)(--next_char) =
|
|
(range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
|
|
}
|
|
|
|
if (range_start < range_end)
|
|
{
|
|
if (range_start > char_list_start)
|
|
{
|
|
tmp1++;
|
|
next_char--;
|
|
|
|
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
|
|
*next_char = (uint16_t)(range_start << XCL_CHAR_SHIFT);
|
|
else
|
|
*(uint32_t*)(--next_char) = (range_start << XCL_CHAR_SHIFT);
|
|
}
|
|
else
|
|
cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
|
|
}
|
|
|
|
PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
|
|
|
|
if (dst > buffer)
|
|
{
|
|
dst -= 2;
|
|
range_start = dst[0];
|
|
range_end = dst[1];
|
|
continue;
|
|
}
|
|
|
|
range_start = 0;
|
|
range_end = 0;
|
|
}
|
|
|
|
if (range_end >= char_list_start)
|
|
{
|
|
PCRE2_ASSERT(range_start < char_list_start);
|
|
|
|
if (range_end < char_list_end)
|
|
{
|
|
tmp1++;
|
|
next_char--;
|
|
|
|
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
|
|
*next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
|
|
else
|
|
*(uint32_t*)(--next_char) =
|
|
(range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
|
|
|
|
PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
|
|
}
|
|
|
|
cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
|
|
}
|
|
|
|
if (tmp1 >= XCL_ITEM_COUNT_MASK)
|
|
{
|
|
cranges->char_lists_types |= XCL_ITEM_COUNT_MASK << tmp2;
|
|
next_char--;
|
|
|
|
if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
|
|
*next_char = (uint16_t)tmp1;
|
|
else
|
|
*(uint32_t*)(--next_char) = tmp1;
|
|
}
|
|
else
|
|
cranges->char_lists_types |= tmp1 << tmp2;
|
|
|
|
if (range_start < XCL_CHAR_LIST_LOW_16_START) break;
|
|
|
|
PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN);
|
|
char_list_end = char_list_start - 1;
|
|
char_list_start = *char_list_next++;
|
|
tmp1 = 0;
|
|
tmp2 -= XCL_TYPE_BIT_LEN;
|
|
}
|
|
|
|
if (dst[0] < XCL_CHAR_LIST_LOW_16_START) dst += 2;
|
|
PCRE2_ASSERT((uint16_t*)dst <= next_char);
|
|
|
|
cranges->char_lists_size =
|
|
(size_t)((uint8_t*)(buffer + total_size) - (uint8_t*)next_char);
|
|
cranges->char_lists_start = (size_t)((uint8_t*)next_char - (uint8_t*)buffer);
|
|
cranges->range_list_size = (uint16_t)(dst - buffer);
|
|
return cranges;
|
|
}
|
|
|
|
#endif /* SUPPORT_WIDE_CHARS */
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
|
|
/* Adds the characters below 256 which are selected by a property to a
256-bit (32-byte) bitmap. Bits are only set by this function, never cleared.
For PT_ANY all bits are set unless the property is negated, in which case
the bitmap is left unchanged.

Arguments:
  ptype      the type of the property (PT_*)
  pdata      the value of the property, for the types which have one
  negated    TRUE to add the characters which do not have the property
  classbits  the bitmap to be updated
*/

void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated,
  uint8_t *classbits)
{
/* Update PRIV(xclass) when this function is changed. */
int c;

if (ptype == PT_ANY)
  {
  if (!negated) memset(classbits, 0xff, 32);
  return;
  }

for (c = 0; c < 256; c++)
  {
  const ucd_record *prop = GET_UCD(c);
  int chartype;
  uint32_t gentype;
  BOOL set_bit = FALSE;

  (void)set_bit;

  switch (ptype)
    {
    case PT_LAMP:
    chartype = prop->chartype;
    set_bit = (chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt);
    break;

    case PT_GC:
    set_bit = (PRIV(ucp_gentype)[prop->chartype] == pdata);
    break;

    case PT_PC:
    set_bit = (prop->chartype == pdata);
    break;

    case PT_SC:
    set_bit = (prop->script == pdata);
    break;

    case PT_SCX:
    set_bit = (prop->script == pdata ||
      MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
    break;

    case PT_ALNUM:
    gentype = PRIV(ucp_gentype)[prop->chartype];
    set_bit = (gentype == ucp_L || gentype == ucp_N);
    break;

    case PT_SPACE:    /* Perl space */
    case PT_PXSPACE:  /* POSIX space */
    switch(c)
      {
      HSPACE_BYTE_CASES:
      VSPACE_BYTE_CASES:
      set_bit = TRUE;
      break;

      default:
      set_bit = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z);
      break;
      }
    break;

    case PT_WORD:
    chartype = prop->chartype;
    gentype = PRIV(ucp_gentype)[chartype];
    set_bit = (gentype == ucp_L || gentype == ucp_N ||
               chartype == ucp_Mn || chartype == ucp_Pc);
    break;

    case PT_UCNC:
    set_bit = (c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
               c == CHAR_GRAVE_ACCENT || c >= 0xa0);
    break;

    case PT_BIDICL:
    set_bit = (UCD_BIDICLASS_PROP(prop) == pdata);
    break;

    case PT_BOOL:
    set_bit = MAPBIT(PRIV(ucd_boolprop_sets) +
                     UCD_BPROPS_PROP(prop), pdata) != 0;
    break;

    case PT_PXGRAPH:
    chartype = prop->chartype;
    gentype = PRIV(ucp_gentype)[chartype];
    set_bit = (gentype != ucp_Z && (gentype != ucp_C || chartype == ucp_Cf));
    break;

    case PT_PXPRINT:
    chartype = prop->chartype;
    set_bit = (chartype != ucp_Zl && chartype != ucp_Zp &&
       (PRIV(ucp_gentype)[chartype] != ucp_C || chartype == ucp_Cf));
    break;

    case PT_PXPUNCT:
    gentype = PRIV(ucp_gentype)[prop->chartype];
    set_bit = (gentype == ucp_P || (c < 128 && gentype == ucp_S));
    break;

    default:
    PCRE2_ASSERT(ptype == PT_PXXDIGIT);
    set_bit = (c >= CHAR_0 && c <= CHAR_9) ||
              (c >= CHAR_A && c <= CHAR_F) ||
              (c >= CHAR_a && c <= CHAR_f);
    break;
    }

  if (negated) set_bit = !set_bit;

  /* Each byte of the bitmap covers eight characters. */
  if (set_bit) classbits[c >> 3] |= (uint8_t)(1 << (c & 0x7));
  }
}
|
|
|
|
#endif /* SUPPORT_UNICODE */
|
|
|
|
|
|
|
|
#ifdef SUPPORT_WIDE_CHARS
|
|
|
|
/*************************************************
|
|
* XClass related properties *
|
|
*************************************************/
|
|
|
|
/* Bits of the xclass_props variable of PRIV(compile_class_not_nested). */

/* An XClass needs to be generated. */
#define XCLASS_REQUIRED        0x1
/* The XClass has 8 bit characters. */
#define XCLASS_HAS_8BIT_CHARS  0x2
/* The XClass has properties. */
#define XCLASS_HAS_PROPS       0x4
/* The XClass has character lists. */
#define XCLASS_HAS_CHAR_LISTS  0x8
/* The XClass matches all characters >= 256. */
#define XCLASS_HIGH_ANY        0x10
|
|
|
|
#endif
|
|
|
|
|
|
/*************************************************
|
|
* Internal entry point for add range to class *
|
|
*************************************************/
|
|
|
|
/* This function sets the overall range for characters < 256.
|
|
It also handles non-utf case folding.
|
|
|
|
Arguments:
|
|
options the options bits
|
|
xoptions the extra options bits
|
|
cb compile data
|
|
start start of range character
|
|
end end of range character
|
|
|
|
Returns: cb->classbits is updated
|
|
*/
|
|
|
|
static void
|
|
add_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
|
|
uint32_t start, uint32_t end)
|
|
{
|
|
uint8_t *classbits = cb->classbits.classbits;
|
|
uint32_t c, byte_start, byte_end;
|
|
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
|
|
|
|
/* If caseless matching is required, scan the range and process alternate
|
|
cases. In Unicode, there are 8-bit characters that have alternate cases that
|
|
are greater than 255 and vice-versa (though these may be ignored if caseless
|
|
restriction is in force). Sometimes we can just extend the original range. */
|
|
|
|
if ((options & PCRE2_CASELESS) != 0)
|
|
{
|
|
#ifdef SUPPORT_UNICODE
|
|
/* UTF mode. This branch is taken if we don't support wide characters (e.g.
|
|
8-bit library, without UTF), but we do treat those characters as Unicode
|
|
(if UCP flag is set). In this case, we only need to expand the character class
|
|
set to include the case pairs which are in the 0-255 codepoint range. */
|
|
if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
|
|
{
|
|
BOOL turkish_i = (xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
|
|
PCRE2_EXTRA_TURKISH_CASING;
|
|
if (start < 128)
|
|
{
|
|
uint32_t lo_end = (classbits_end < 127 ? classbits_end : 127);
|
|
for (c = start; c <= lo_end; c++)
|
|
{
|
|
if (turkish_i && UCD_ANY_I(c)) continue;
|
|
SETBIT(classbits, cb->fcc[c]);
|
|
}
|
|
}
|
|
if (classbits_end >= 128)
|
|
{
|
|
uint32_t hi_start = (start > 128 ? start : 128);
|
|
for (c = hi_start; c <= classbits_end; c++)
|
|
{
|
|
uint32_t co = UCD_OTHERCASE(c);
|
|
if (co <= 0xff) SETBIT(classbits, co);
|
|
}
|
|
}
|
|
}
|
|
|
|
else
|
|
#endif /* SUPPORT_UNICODE */
|
|
|
|
/* Not UTF mode */
|
|
{
|
|
for (c = start; c <= classbits_end; c++)
|
|
SETBIT(classbits, cb->fcc[c]);
|
|
}
|
|
}
|
|
|
|
/* Use the bitmap for characters < 256. Otherwise use extra data. */
|
|
|
|
byte_start = (start + 7) >> 3;
|
|
byte_end = (classbits_end + 1) >> 3;
|
|
|
|
if (byte_start >= byte_end)
|
|
{
|
|
for (c = start; c <= classbits_end; c++)
|
|
/* Regardless of start, c will always be <= 255. */
|
|
SETBIT(classbits, c);
|
|
return;
|
|
}
|
|
|
|
for (c = byte_start; c < byte_end; c++)
|
|
classbits[c] = 0xff;
|
|
|
|
byte_start <<= 3;
|
|
byte_end <<= 3;
|
|
|
|
for (c = start; c < byte_start; c++)
|
|
SETBIT(classbits, c);
|
|
|
|
for (c = byte_end; c <= classbits_end; c++)
|
|
SETBIT(classbits, c);
|
|
}
|
|
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
/*************************************************
|
|
* Internal entry point for add list to class *
|
|
*************************************************/
|
|
|
|
/* This function is used for adding a list of horizontal or vertical whitespace
|
|
characters to a class. The list must be in order so that ranges of characters
|
|
can be detected and handled appropriately. This function sets the overall range
|
|
so that the internal functions can try to avoid duplication when handling
|
|
case-independence.
|
|
|
|
Arguments:
|
|
options the options bits
|
|
xoptions the extra options bits
|
|
cb contains pointers to tables etc.
|
|
p points to row of 32-bit values, terminated by NOTACHAR
|
|
|
|
Returns: cb->classbits is updated
|
|
*/
|
|
|
|
static void
|
|
add_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
|
|
const uint32_t *p)
|
|
{
|
|
while (p[0] < 256)
|
|
{
|
|
unsigned int n = 0;
|
|
|
|
while(p[n+1] == p[0] + n + 1) n++;
|
|
add_to_class(options, xoptions, cb, p[0], p[n]);
|
|
|
|
p += n + 1;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/*************************************************
|
|
* Add characters not in a list to a class *
|
|
*************************************************/
|
|
|
|
/* This function is used for adding the complement of a list of horizontal or
|
|
vertical whitespace to a class. The list must be in order.
|
|
|
|
Arguments:
|
|
options the options bits
|
|
xoptions the extra options bits
|
|
cb contains pointers to tables etc.
|
|
p points to row of 32-bit values, terminated by NOTACHAR
|
|
|
|
Returns: cb->classbits is updated
|
|
*/
|
|
|
|
static void
|
|
add_not_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
|
|
const uint32_t *p)
|
|
{
|
|
if (p[0] > 0)
|
|
add_to_class(options, xoptions, cb, 0, p[0] - 1);
|
|
while (p[0] < 256)
|
|
{
|
|
while (p[1] == p[0] + 1) p++;
|
|
add_to_class(options, xoptions, cb, p[0] + 1, (p[1] > 255) ? 255 : p[1] - 1);
|
|
p++;
|
|
}
|
|
}
|
|
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
|
|
|
|
|
|
|
|
/*************************************************
|
|
* Main entry-point to compile a character class *
|
|
*************************************************/
|
|
|
|
/* This function consumes a "leaf", which is a set of characters that will
|
|
become a single OP_CLASS OP_NCLASS, OP_XCLASS, or OP_ALLANY. */
|
|
|
|
uint32_t *
|
|
PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions,
|
|
uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL* has_bitmap,
|
|
int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr)
|
|
{
|
|
uint32_t *pptr = start_ptr;
|
|
PCRE2_UCHAR *code = *pcode;
|
|
BOOL should_flip_negation;
|
|
const uint8_t *cbits = cb->cbits;
|
|
/* Some functions such as add_to_class() or eclass processing
|
|
expects that the bitset is stored in cb->classbits.classbits. */
|
|
uint8_t *const classbits = cb->classbits.classbits;
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
BOOL utf = (options & PCRE2_UTF) != 0;
|
|
#else /* No Unicode support */
|
|
BOOL utf = FALSE;
|
|
#endif
|
|
|
|
/* Helper variables for OP_XCLASS opcode (for characters > 255). */
|
|
|
|
#ifdef SUPPORT_WIDE_CHARS
|
|
uint32_t xclass_props;
|
|
PCRE2_UCHAR *class_uchardata;
|
|
class_ranges* cranges;
|
|
#endif
|
|
|
|
/* If an XClass contains a negative special such as \S, we need to flip the
|
|
negation flag at the end, so that support for characters > 255 works correctly
|
|
(they are all included in the class). An XClass may need to insert specific
|
|
matching or non-matching code for wide characters.
|
|
*/
|
|
|
|
should_flip_negation = FALSE;
|
|
|
|
/* XClass will be used when characters > 255 might match. */
|
|
|
|
#ifdef SUPPORT_WIDE_CHARS
|
|
xclass_props = 0;
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
cranges = NULL;
|
|
|
|
if (utf)
|
|
#endif
|
|
{
|
|
if (lengthptr != NULL)
|
|
{
|
|
cranges = compile_optimize_class(pptr, options, xoptions, cb);
|
|
|
|
if (cranges == NULL)
|
|
{
|
|
*errorcodeptr = ERR21;
|
|
return NULL;
|
|
}
|
|
|
|
/* Caching the pre-processed character ranges. */
|
|
if (cb->next_cranges != NULL)
|
|
cb->next_cranges->next = cranges;
|
|
else
|
|
cb->cranges = cranges;
|
|
|
|
cb->next_cranges = cranges;
|
|
}
|
|
else
|
|
{
|
|
/* Reuse the pre-processed character ranges. */
|
|
cranges = cb->cranges;
|
|
PCRE2_ASSERT(cranges != NULL);
|
|
cb->cranges = cranges->next;
|
|
}
|
|
|
|
if (cranges->range_list_size > 0)
|
|
{
|
|
const uint32_t *ranges = (const uint32_t*)(cranges + 1);
|
|
|
|
if (ranges[0] <= 255)
|
|
xclass_props |= XCLASS_HAS_8BIT_CHARS;
|
|
|
|
if (ranges[cranges->range_list_size - 1] == GET_MAX_CHAR_VALUE(utf) &&
|
|
ranges[cranges->range_list_size - 2] <= 256)
|
|
xclass_props |= XCLASS_HIGH_ANY;
|
|
}
|
|
}
|
|
|
|
class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
|
|
#endif /* SUPPORT_WIDE_CHARS */
|
|
|
|
/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
|
|
in a temporary bit of memory, in case the class contains fewer than two
|
|
8-bit characters because in that case the compiled code doesn't use the bit
|
|
map. */
|
|
|
|
memset(classbits, 0, 32);
|
|
|
|
/* Process items until end_ptr is reached. */
|
|
|
|
while (TRUE)
|
|
{
|
|
uint32_t meta = *(pptr++);
|
|
BOOL local_negate;
|
|
int posix_class;
|
|
int taboffset, tabopt;
|
|
class_bits_storage pbits;
|
|
uint32_t escape, c;
|
|
|
|
/* Handle POSIX classes such as [:alpha:] etc. */
|
|
switch (META_CODE(meta))
|
|
{
|
|
case META_POSIX:
|
|
case META_POSIX_NEG:
|
|
|
|
local_negate = (meta == META_POSIX_NEG);
|
|
posix_class = *(pptr++);
|
|
|
|
if (local_negate) should_flip_negation = TRUE; /* Note negative special */
|
|
|
|
/* If matching is caseless, upper and lower are converted to alpha.
|
|
This relies on the fact that the class table starts with alpha,
|
|
lower, upper as the first 3 entries. */
|
|
|
|
if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
|
|
posix_class = 0;
|
|
|
|
/* When PCRE2_UCP is set, some of the POSIX classes are converted to
|
|
different escape sequences that use Unicode properties \p or \P.
|
|
Others that are not available via \p or \P have to generate
|
|
XCL_PROP/XCL_NOTPROP directly, which is done here. */
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
/* TODO This entire block of code here appears to be unreachable!? I simply
|
|
can't see how it can be hit, given that the frontend parser doesn't emit
|
|
META_POSIX for GRAPH/PRINT/PUNCT when UCP is set. */
|
|
if ((options & PCRE2_UCP) != 0 &&
|
|
(xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0)
|
|
{
|
|
uint32_t ptype;
|
|
|
|
switch(posix_class)
|
|
{
|
|
case PC_GRAPH:
|
|
case PC_PRINT:
|
|
case PC_PUNCT:
|
|
ptype = (posix_class == PC_GRAPH)? PT_PXGRAPH :
|
|
(posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT;
|
|
|
|
PRIV(update_classbits)(ptype, 0, local_negate, classbits);
|
|
|
|
if ((xclass_props & XCLASS_HIGH_ANY) == 0)
|
|
{
|
|
if (lengthptr != NULL)
|
|
*lengthptr += 3;
|
|
else
|
|
{
|
|
*class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
|
|
*class_uchardata++ = (PCRE2_UCHAR)ptype;
|
|
*class_uchardata++ = 0;
|
|
}
|
|
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
|
|
}
|
|
continue;
|
|
|
|
/* For the other POSIX classes (ex: ascii) we are going to
|
|
fall through to the non-UCP case and build a bit map for
|
|
characters with code points less than 256. However, if we are in
|
|
a negated POSIX class, characters with code points greater than
|
|
255 must either all match or all not match, depending on whether
|
|
the whole class is not or is negated. For example, for
|
|
[[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
|
|
they must not.
|
|
|
|
In the special case where there are no xclass items, this is
|
|
automatically handled by the use of OP_CLASS or OP_NCLASS, but an
|
|
explicit range is needed for OP_XCLASS. Setting a flag here
|
|
causes the range to be generated later when it is known that
|
|
OP_XCLASS is required. In the 8-bit library this is relevant only in
|
|
utf mode, since no wide characters can exist otherwise. */
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
#endif /* SUPPORT_UNICODE */
|
|
|
|
/* In the non-UCP case, or when UCP makes no difference, we build the
|
|
bit map for the POSIX class in a chunk of local store because we may
|
|
be adding and subtracting from it, and we don't want to subtract bits
|
|
that may be in the main map already. At the end we or the result into
|
|
the bit map that is being built. */
|
|
|
|
posix_class *= 3;
|
|
|
|
/* Copy in the first table (always present) */
|
|
|
|
memcpy(pbits.classbits, cbits + PRIV(posix_class_maps)[posix_class], 32);
|
|
|
|
/* If there is a second table, add or remove it as required. */
|
|
|
|
taboffset = PRIV(posix_class_maps)[posix_class + 1];
|
|
tabopt = PRIV(posix_class_maps)[posix_class + 2];
|
|
|
|
if (taboffset >= 0)
|
|
{
|
|
if (tabopt >= 0)
|
|
for (int i = 0; i < 32; i++)
|
|
pbits.classbits[i] |= cbits[i + taboffset];
|
|
else
|
|
for (int i = 0; i < 32; i++)
|
|
pbits.classbits[i] &= (uint8_t)(~cbits[i + taboffset]);
|
|
}
|
|
|
|
/* Now see if we need to remove any special characters. An option
|
|
value of 1 removes vertical space and 2 removes underscore. */
|
|
|
|
if (tabopt < 0) tabopt = -tabopt;
|
|
if (tabopt == 1) pbits.classbits[1] &= ~0x3c;
|
|
else if (tabopt == 2) pbits.classbits[11] &= 0x7f;
|
|
|
|
/* Add the POSIX table or its complement into the main table that is
|
|
being built and we are done. */
|
|
|
|
{
|
|
uint32_t *classwords = cb->classbits.classwords;
|
|
|
|
if (local_negate)
|
|
for (int i = 0; i < 8; i++)
|
|
classwords[i] |= (uint32_t)(~pbits.classwords[i]);
|
|
else
|
|
for (int i = 0; i < 8; i++)
|
|
classwords[i] |= pbits.classwords[i];
|
|
}
|
|
|
|
#ifdef SUPPORT_WIDE_CHARS
|
|
/* Every class contains at least one < 256 character. */
|
|
xclass_props |= XCLASS_HAS_8BIT_CHARS;
|
|
#endif
|
|
continue; /* End of POSIX handling */
|
|
|
|
/* Other than POSIX classes, the only items we should encounter are
|
|
\d-type escapes and literal characters (possibly as ranges). */
|
|
case META_BIGVALUE:
|
|
meta = *(pptr++);
|
|
break;
|
|
|
|
case META_ESCAPE:
|
|
escape = META_DATA(meta);
|
|
|
|
switch(escape)
|
|
{
|
|
case ESC_d:
|
|
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
|
|
break;
|
|
|
|
case ESC_D:
|
|
should_flip_negation = TRUE;
|
|
for (int i = 0; i < 32; i++)
|
|
classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
|
|
break;
|
|
|
|
case ESC_w:
|
|
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
|
|
break;
|
|
|
|
case ESC_W:
|
|
should_flip_negation = TRUE;
|
|
for (int i = 0; i < 32; i++)
|
|
classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
|
|
break;
|
|
|
|
/* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
|
|
5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
|
|
previously set by something earlier in the character class.
|
|
Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
|
|
we could just adjust the appropriate bit. From PCRE 8.34 we no
|
|
longer treat \s and \S specially. */
|
|
|
|
case ESC_s:
|
|
for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
|
|
break;
|
|
|
|
case ESC_S:
|
|
should_flip_negation = TRUE;
|
|
for (int i = 0; i < 32; i++)
|
|
classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
|
|
break;
|
|
|
|
/* When adding the horizontal or vertical space lists to a class, or
|
|
their complements, disable PCRE2_CASELESS, because it just wastes
|
|
time, and in the "not-x" UTF cases can create unwanted duplicates in
|
|
the XCLASS list (provoked by characters that have more than one other
|
|
case and by both cases being in the same "not-x" sublist). */
|
|
|
|
case ESC_h:
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
#ifdef SUPPORT_UNICODE
|
|
if (cranges != NULL) break;
|
|
#endif
|
|
add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
|
|
cb, PRIV(hspace_list));
|
|
#else
|
|
PCRE2_ASSERT(cranges != NULL);
|
|
#endif
|
|
break;
|
|
|
|
case ESC_H:
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
#ifdef SUPPORT_UNICODE
|
|
if (cranges != NULL) break;
|
|
#endif
|
|
add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
|
|
cb, PRIV(hspace_list));
|
|
#else
|
|
PCRE2_ASSERT(cranges != NULL);
|
|
#endif
|
|
break;
|
|
|
|
case ESC_v:
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
#ifdef SUPPORT_UNICODE
|
|
if (cranges != NULL) break;
|
|
#endif
|
|
add_list_to_class(options & ~PCRE2_CASELESS, xoptions,
|
|
cb, PRIV(vspace_list));
|
|
#else
|
|
PCRE2_ASSERT(cranges != NULL);
|
|
#endif
|
|
break;
|
|
|
|
case ESC_V:
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
#ifdef SUPPORT_UNICODE
|
|
if (cranges != NULL) break;
|
|
#endif
|
|
add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions,
|
|
cb, PRIV(vspace_list));
|
|
#else
|
|
PCRE2_ASSERT(cranges != NULL);
|
|
#endif
|
|
break;
|
|
|
|
/* If Unicode is not supported, \P and \p are not allowed and are
|
|
faulted at parse time, so will never appear here. */
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
case ESC_p:
|
|
case ESC_P:
|
|
{
|
|
uint32_t ptype = *pptr >> 16;
|
|
uint32_t pdata = *(pptr++) & 0xffff;
|
|
|
|
/* The "Any" is processed by PRIV(update_classbits)(). */
|
|
if (ptype == PT_ANY)
|
|
{
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
if (!utf && escape == ESC_p) memset(classbits, 0xff, 32);
|
|
#endif
|
|
continue;
|
|
}
|
|
|
|
PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits);
|
|
|
|
if ((xclass_props & XCLASS_HIGH_ANY) == 0)
|
|
{
|
|
if (lengthptr != NULL)
|
|
*lengthptr += 3;
|
|
else
|
|
{
|
|
*class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
|
|
*class_uchardata++ = ptype;
|
|
*class_uchardata++ = pdata;
|
|
}
|
|
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS;
|
|
}
|
|
}
|
|
continue;
|
|
#endif
|
|
}
|
|
|
|
#ifdef SUPPORT_WIDE_CHARS
|
|
/* Every non-property class contains at least one < 256 character. */
|
|
xclass_props |= XCLASS_HAS_8BIT_CHARS;
|
|
#endif
|
|
/* End handling \d-type escapes */
|
|
continue;
|
|
|
|
CLASS_END_CASES(meta)
|
|
/* Literals. */
|
|
if (meta < META_END) break;
|
|
/* Non-literals: end of class contents. */
|
|
goto END_PROCESSING;
|
|
}
|
|
|
|
/* A literal character may be followed by a range meta. At parse time
|
|
there are checks for out-of-order characters, for ranges where the two
|
|
characters are equal, and for hyphens that cannot indicate a range. At
|
|
this point, therefore, no checking is needed. */
|
|
|
|
c = meta;
|
|
|
|
/* Remember if \r or \n were explicitly used */
|
|
|
|
if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
|
|
|
|
/* Process a character range */
|
|
|
|
if (*pptr == META_RANGE_LITERAL || *pptr == META_RANGE_ESCAPED)
|
|
{
|
|
uint32_t d;
|
|
|
|
#ifdef EBCDIC
|
|
BOOL range_is_literal = (*pptr == META_RANGE_LITERAL);
|
|
#endif
|
|
++pptr;
|
|
d = *(pptr++);
|
|
if (d == META_BIGVALUE) d = *(pptr++);
|
|
|
|
/* Remember an explicit \r or \n, and add the range to the class. */
|
|
|
|
if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
#ifdef SUPPORT_UNICODE
|
|
if (cranges != NULL) continue;
|
|
xclass_props |= XCLASS_HAS_8BIT_CHARS;
|
|
#endif
|
|
|
|
/* In an EBCDIC environment, Perl treats alphabetic ranges specially
|
|
because there are holes in the encoding, and simply using the range
|
|
A-Z (for example) would include the characters in the holes. This
|
|
applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
|
|
|
|
#ifdef EBCDIC
|
|
if (range_is_literal &&
|
|
(cb->ctypes[c] & ctype_letter) != 0 &&
|
|
(cb->ctypes[d] & ctype_letter) != 0 &&
|
|
(c <= CHAR_z) == (d <= CHAR_z))
|
|
{
|
|
uint32_t uc = (d <= CHAR_z)? 0 : 64;
|
|
uint32_t C = c - uc;
|
|
uint32_t D = d - uc;
|
|
|
|
if (C <= CHAR_i)
|
|
{
|
|
add_to_class(options, xoptions, cb, C + uc,
|
|
((D < CHAR_i)? D : CHAR_i) + uc);
|
|
C = CHAR_j;
|
|
}
|
|
|
|
if (C <= D && C <= CHAR_r)
|
|
{
|
|
add_to_class(options, xoptions, cb, C + uc,
|
|
((D < CHAR_r)? D : CHAR_r) + uc);
|
|
C = CHAR_s;
|
|
}
|
|
|
|
if (C <= D)
|
|
add_to_class(options, xoptions, cb, C + uc, D + uc);
|
|
}
|
|
else
|
|
#endif
|
|
/* Not an EBCDIC special range */
|
|
|
|
add_to_class(options, xoptions, cb, c, d);
|
|
#else
|
|
PCRE2_ASSERT(cranges != NULL);
|
|
#endif
|
|
continue;
|
|
} /* End of range handling */
|
|
|
|
/* Character ranges are ignored when class_ranges is present. */
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
#ifdef SUPPORT_UNICODE
|
|
if (cranges != NULL) continue;
|
|
xclass_props |= XCLASS_HAS_8BIT_CHARS;
|
|
#endif
|
|
/* Handle a single character. */
|
|
|
|
add_to_class(options, xoptions, cb, meta, meta);
|
|
#else
|
|
PCRE2_ASSERT(cranges != NULL);
|
|
#endif
|
|
} /* End of main class-processing loop */
|
|
|
|
END_PROCESSING:
|
|
|
|
#ifdef SUPPORT_WIDE_CHARS
|
|
PCRE2_ASSERT((xclass_props & XCLASS_HAS_PROPS) == 0 ||
|
|
(xclass_props & XCLASS_HIGH_ANY) == 0);
|
|
|
|
if (cranges != NULL)
|
|
{
|
|
uint32_t *range = (uint32_t*)(cranges + 1);
|
|
uint32_t *end = range + cranges->range_list_size;
|
|
|
|
while (range < end && range[0] < 256)
|
|
{
|
|
PCRE2_ASSERT((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0);
|
|
/* Add range to bitset. If we are in UTF or UCP mode, then clear the
|
|
caseless bit, because the cranges handle caselessness (only) in this
|
|
condition; see the condition for PARSE_CLASS_CASELESS_UTF in
|
|
compile_optimize_class(). */
|
|
add_to_class(((options & (PCRE2_UTF|PCRE2_UCP)) != 0)?
|
|
(options & ~PCRE2_CASELESS) : options, xoptions, cb, range[0], range[1]);
|
|
|
|
if (range[1] > 255) break;
|
|
range += 2;
|
|
}
|
|
|
|
if (cranges->char_lists_size > 0)
|
|
{
|
|
/* The cranges structure is still used and freed later. */
|
|
PCRE2_ASSERT((xclass_props & XCLASS_HIGH_ANY) == 0);
|
|
xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_CHAR_LISTS;
|
|
}
|
|
else
|
|
{
|
|
if ((xclass_props & XCLASS_HIGH_ANY) != 0)
|
|
{
|
|
PCRE2_ASSERT(range + 2 == end && range[0] <= 256 &&
|
|
range[1] >= GET_MAX_CHAR_VALUE(utf));
|
|
should_flip_negation = TRUE;
|
|
range = end;
|
|
}
|
|
|
|
while (range < end)
|
|
{
|
|
uint32_t range_start = range[0];
|
|
uint32_t range_end = range[1];
|
|
|
|
range += 2;
|
|
xclass_props |= XCLASS_REQUIRED;
|
|
|
|
if (range_start < 256) range_start = 256;
|
|
|
|
if (lengthptr != NULL)
|
|
{
|
|
#ifdef SUPPORT_UNICODE
|
|
if (utf)
|
|
{
|
|
*lengthptr += 1;
|
|
|
|
if (range_start < range_end)
|
|
*lengthptr += PRIV(ord2utf)(range_start, class_uchardata);
|
|
|
|
*lengthptr += PRIV(ord2utf)(range_end, class_uchardata);
|
|
continue;
|
|
}
|
|
#endif /* SUPPORT_UNICODE */
|
|
|
|
*lengthptr += range_start < range_end ? 3 : 2;
|
|
continue;
|
|
}
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
if (utf)
|
|
{
|
|
if (range_start < range_end)
|
|
{
|
|
*class_uchardata++ = XCL_RANGE;
|
|
class_uchardata += PRIV(ord2utf)(range_start, class_uchardata);
|
|
}
|
|
else
|
|
*class_uchardata++ = XCL_SINGLE;
|
|
|
|
class_uchardata += PRIV(ord2utf)(range_end, class_uchardata);
|
|
continue;
|
|
}
|
|
#endif /* SUPPORT_UNICODE */
|
|
|
|
/* Without UTF support, character values are constrained
|
|
by the bit length, and can only be > 256 for 16-bit and
|
|
32-bit libraries. */
|
|
#if PCRE2_CODE_UNIT_WIDTH != 8
|
|
if (range_start < range_end)
|
|
{
|
|
*class_uchardata++ = XCL_RANGE;
|
|
*class_uchardata++ = range_start;
|
|
}
|
|
else
|
|
*class_uchardata++ = XCL_SINGLE;
|
|
|
|
*class_uchardata++ = range_end;
|
|
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
|
|
}
|
|
|
|
if (lengthptr == NULL)
|
|
cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
|
|
}
|
|
}
|
|
#endif /* SUPPORT_WIDE_CHARS */
|
|
|
|
/* If there are characters with values > 255, or Unicode property settings
|
|
(\p or \P), we have to compile an extended class, with its own opcode,
|
|
unless there were no property settings and there was a negated special such
|
|
as \S in the class, and PCRE2_UCP is not set, because in that case all
|
|
characters > 255 are in or not in the class, so any that were explicitly
|
|
given as well can be ignored.
|
|
|
|
In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
|
|
present in a class, we either have to match or not match all wide
|
|
characters (depending on whether the whole class is or is not negated).
|
|
This requirement is indicated by match_all_or_no_wide_chars being true.
|
|
We do this by including an explicit range, which works in both cases.
|
|
This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
|
|
cannot be any wide characters in 8-bit non-UTF mode.
|
|
|
|
When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
|
|
class where \S etc is present without PCRE2_UCP, causing an extended class
|
|
to be compiled, we make sure that all characters > 255 are included by
|
|
forcing match_all_or_no_wide_chars to be true.
|
|
|
|
If, when generating an xclass, there are no characters < 256, we can omit
|
|
the bitmap in the actual compiled code. */
|
|
|
|
#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
|
|
if ((xclass_props & XCLASS_REQUIRED) != 0)
|
|
{
|
|
PCRE2_UCHAR *previous = code;
|
|
|
|
if ((xclass_props & XCLASS_HAS_CHAR_LISTS) == 0)
|
|
*class_uchardata++ = XCL_END; /* Marks the end of extra data */
|
|
*code++ = OP_XCLASS;
|
|
code += LINK_SIZE;
|
|
*code = negate_class? XCL_NOT:0;
|
|
if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP;
|
|
|
|
/* If the map is required, move up the extra data to make room for it;
|
|
otherwise just move the code pointer to the end of the extra data. */
|
|
|
|
if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0 || has_bitmap != NULL)
|
|
{
|
|
if (negate_class)
|
|
{
|
|
uint32_t *classwords = cb->classbits.classwords;
|
|
for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
|
|
}
|
|
|
|
if (has_bitmap == NULL)
|
|
{
|
|
*code++ |= XCL_MAP;
|
|
(void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
|
|
CU2BYTES(class_uchardata - code));
|
|
memcpy(code, classbits, 32);
|
|
code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
|
|
}
|
|
else
|
|
{
|
|
code = class_uchardata;
|
|
if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0)
|
|
*has_bitmap = TRUE;
|
|
}
|
|
}
|
|
else code = class_uchardata;
|
|
|
|
if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0)
|
|
{
|
|
/* Char lists size is an even number, because all items are 16 or 32
|
|
bit values. The character list data is always aligned to 32 bits. */
|
|
size_t char_lists_size = cranges->char_lists_size;
|
|
PCRE2_ASSERT((char_lists_size & 0x1) == 0 &&
|
|
(cb->char_lists_size & 0x3) == 0);
|
|
|
|
if (lengthptr != NULL)
|
|
{
|
|
char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
*lengthptr += 2 + LINK_SIZE;
|
|
#else
|
|
*lengthptr += 1 + LINK_SIZE;
|
|
#endif
|
|
|
|
cb->char_lists_size += char_lists_size;
|
|
|
|
char_lists_size /= sizeof(PCRE2_UCHAR);
|
|
|
|
/* Storage space for character lists is included
|
|
in the maximum pattern size. */
|
|
if (*lengthptr > MAX_PATTERN_SIZE ||
|
|
MAX_PATTERN_SIZE - *lengthptr < char_lists_size)
|
|
{
|
|
*errorcodeptr = ERR20; /* Pattern is too large */
|
|
return NULL;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
uint8_t *data;
|
|
|
|
PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK);
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
/* Encode as high / low bytes. */
|
|
code[0] = (uint8_t)(XCL_LIST |
|
|
(cranges->char_lists_types >> 8));
|
|
code[1] = (uint8_t)cranges->char_lists_types;
|
|
code += 2;
|
|
#else
|
|
*code++ = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types);
|
|
#endif
|
|
|
|
/* Character lists are stored in backwards direction from
|
|
byte code start. The non-dfa/dfa matchers can access these
|
|
lists using the byte code start stored in match blocks.
|
|
Each list is aligned to 32 bit with an optional unused
|
|
16 bit value at the beginning of the character list. */
|
|
|
|
cb->char_lists_size += char_lists_size;
|
|
data = (uint8_t*)cb->start_code - cb->char_lists_size;
|
|
|
|
memcpy(data, (uint8_t*)(cranges + 1) + cranges->char_lists_start,
|
|
char_lists_size);
|
|
|
|
/* Since character lists total size is less than MAX_PATTERN_SIZE,
|
|
their starting offset fits into a value which size is LINK_SIZE. */
|
|
|
|
char_lists_size = cb->char_lists_size;
|
|
PUT(code, 0, (uint32_t)(char_lists_size >> 1));
|
|
code += LINK_SIZE;
|
|
|
|
#if defined PCRE2_DEBUG || defined SUPPORT_VALGRIND
|
|
if ((char_lists_size & 0x2) != 0)
|
|
{
|
|
/* In debug the unused 16 bit value is set
|
|
to a fixed value and marked unused. */
|
|
((uint16_t*)data)[-1] = 0x5555;
|
|
#ifdef SUPPORT_VALGRIND
|
|
VALGRIND_MAKE_MEM_NOACCESS(data - 2, 2);
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
cb->char_lists_size =
|
|
CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t));
|
|
|
|
cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data);
|
|
}
|
|
}
|
|
|
|
/* Now fill in the complete length of the item */
|
|
|
|
PUT(previous, 1, (int)(code - previous));
|
|
goto DONE; /* End of class handling */
|
|
}
|
|
#endif /* SUPPORT_WIDE_CHARS */
|
|
|
|
/* If there are no characters > 255, or they are all to be included or
|
|
excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
|
|
whole class was negated and whether there were negative specials such as \S
|
|
(non-UCP) in the class. Then copy the 32-byte map into the code vector,
|
|
negating it if necessary. */
|
|
|
|
if (negate_class)
|
|
{
|
|
uint32_t *classwords = cb->classbits.classwords;
|
|
|
|
for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i];
|
|
}
|
|
|
|
if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) &&
|
|
cb->classbits.classwords[0] == ~(uint32_t)0)
|
|
{
|
|
const uint32_t *classwords = cb->classbits.classwords;
|
|
int i;
|
|
|
|
for (i = 0; i < 8; i++)
|
|
if (classwords[i] != ~(uint32_t)0) break;
|
|
|
|
if (i == 8)
|
|
{
|
|
*code++ = OP_ALLANY;
|
|
goto DONE; /* End of class handling */
|
|
}
|
|
}
|
|
|
|
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
|
|
memcpy(code, classbits, 32);
|
|
code += 32 / sizeof(PCRE2_UCHAR);
|
|
|
|
DONE:
|
|
*pcode = code;
|
|
return pptr - 1;
|
|
}
|
|
|
|
|
|
|
|
/* ===================================================================*/
|
|
/* Here follows a block of ECLASS-compiling functions. You may well want to
|
|
read them from top to bottom; they are ordered from leafmost (at the top) to
|
|
outermost parser (at the bottom of the file). */
|
|
|
|
/* This function folds one operand using the negation operator.
|
|
The new, combined chunk of stack code is written out to *pop_info. */
|
|
|
|
static void
|
|
fold_negation(eclass_op_info *pop_info, PCRE2_SIZE *lengthptr,
|
|
BOOL preserve_classbits)
|
|
{
|
|
/* If the chunk of stack code is already composed of multiple ops, we won't
|
|
descend in and try and propagate the negation down the tree. (That would lead
|
|
to O(n^2) compile-time, which could be exploitable with a malicious regex -
|
|
although maybe that's not really too much of a worry in a library that offers
|
|
an exponential-time matching function!) */
|
|
|
|
if (pop_info->op_single_type == 0)
|
|
{
|
|
if (lengthptr != NULL)
|
|
*lengthptr += 1;
|
|
else
|
|
pop_info->code_start[pop_info->length] = ECL_NOT;
|
|
pop_info->length += 1;
|
|
}
|
|
|
|
/* Otherwise, it's a nice single-op item, so we can easily fold in the negation
|
|
without needing to produce an ECL_NOT. */
|
|
|
|
else if (pop_info->op_single_type == ECL_ANY ||
|
|
pop_info->op_single_type == ECL_NONE)
|
|
{
|
|
pop_info->op_single_type = (pop_info->op_single_type == ECL_NONE)?
|
|
ECL_ANY : ECL_NONE;
|
|
if (lengthptr == NULL)
|
|
*(pop_info->code_start) = pop_info->op_single_type;
|
|
}
|
|
else
|
|
{
|
|
PCRE2_ASSERT(pop_info->op_single_type == ECL_XCLASS &&
|
|
pop_info->length >= 1 + LINK_SIZE + 1);
|
|
if (lengthptr == NULL)
|
|
pop_info->code_start[1 + LINK_SIZE] ^= XCL_NOT;
|
|
}
|
|
|
|
if (!preserve_classbits)
|
|
{
|
|
for (int i = 0; i < 8; i++)
|
|
pop_info->bits.classwords[i] = ~pop_info->bits.classwords[i];
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/* This function folds together two operands using a binary operator.
|
|
The new, combined chunk of stack code is written out to *lhs_op_info. */
|
|
|
|
static void
|
|
fold_binary(int op, eclass_op_info *lhs_op_info, eclass_op_info *rhs_op_info,
|
|
PCRE2_SIZE *lengthptr)
|
|
{
|
|
switch (op)
|
|
{
|
|
/* ECL_AND truth table:
|
|
|
|
LHS RHS RESULT
|
|
----------------
|
|
ANY * RHS
|
|
* ANY LHS
|
|
NONE * NONE
|
|
* NONE NONE
|
|
X Y X & Y
|
|
*/
|
|
|
|
case ECL_AND:
|
|
if (rhs_op_info->op_single_type == ECL_ANY)
|
|
{
|
|
/* no-op: drop the RHS */
|
|
}
|
|
else if (lhs_op_info->op_single_type == ECL_ANY)
|
|
{
|
|
/* no-op: drop the LHS, and memmove the RHS into its place */
|
|
if (lengthptr == NULL)
|
|
memmove(lhs_op_info->code_start, rhs_op_info->code_start,
|
|
CU2BYTES(rhs_op_info->length));
|
|
lhs_op_info->length = rhs_op_info->length;
|
|
lhs_op_info->op_single_type = rhs_op_info->op_single_type;
|
|
}
|
|
else if (rhs_op_info->op_single_type == ECL_NONE)
|
|
{
|
|
/* the result is ECL_NONE: write into the LHS */
|
|
if (lengthptr == NULL)
|
|
lhs_op_info->code_start[0] = ECL_NONE;
|
|
lhs_op_info->length = 1;
|
|
lhs_op_info->op_single_type = ECL_NONE;
|
|
}
|
|
else if (lhs_op_info->op_single_type == ECL_NONE)
|
|
{
|
|
/* the result is ECL_NONE: drop the RHS */
|
|
}
|
|
else
|
|
{
|
|
/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
|
|
if (lengthptr != NULL)
|
|
*lengthptr += 1;
|
|
else
|
|
{
|
|
PCRE2_ASSERT(rhs_op_info->code_start ==
|
|
lhs_op_info->code_start + lhs_op_info->length);
|
|
rhs_op_info->code_start[rhs_op_info->length] = ECL_AND;
|
|
}
|
|
lhs_op_info->length += rhs_op_info->length + 1;
|
|
lhs_op_info->op_single_type = 0;
|
|
}
|
|
|
|
for (int i = 0; i < 8; i++)
|
|
lhs_op_info->bits.classwords[i] &= rhs_op_info->bits.classwords[i];
|
|
break;
|
|
|
|
/* ECL_OR truth table:
|
|
|
|
LHS RHS RESULT
|
|
----------------
|
|
ANY * ANY
|
|
* ANY ANY
|
|
NONE * RHS
|
|
* NONE LHS
|
|
X Y X | Y
|
|
*/
|
|
|
|
case ECL_OR:
|
|
if (rhs_op_info->op_single_type == ECL_NONE)
|
|
{
|
|
/* no-op: drop the RHS */
|
|
}
|
|
else if (lhs_op_info->op_single_type == ECL_NONE)
|
|
{
|
|
/* no-op: drop the LHS, and memmove the RHS into its place */
|
|
if (lengthptr == NULL)
|
|
memmove(lhs_op_info->code_start, rhs_op_info->code_start,
|
|
CU2BYTES(rhs_op_info->length));
|
|
lhs_op_info->length = rhs_op_info->length;
|
|
lhs_op_info->op_single_type = rhs_op_info->op_single_type;
|
|
}
|
|
else if (rhs_op_info->op_single_type == ECL_ANY)
|
|
{
|
|
/* the result is ECL_ANY: write into the LHS */
|
|
if (lengthptr == NULL)
|
|
lhs_op_info->code_start[0] = ECL_ANY;
|
|
lhs_op_info->length = 1;
|
|
lhs_op_info->op_single_type = ECL_ANY;
|
|
}
|
|
else if (lhs_op_info->op_single_type == ECL_ANY)
|
|
{
|
|
/* the result is ECL_ANY: drop the RHS */
|
|
}
|
|
else
|
|
{
|
|
/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
|
|
if (lengthptr != NULL)
|
|
*lengthptr += 1;
|
|
else
|
|
{
|
|
PCRE2_ASSERT(rhs_op_info->code_start ==
|
|
lhs_op_info->code_start + lhs_op_info->length);
|
|
rhs_op_info->code_start[rhs_op_info->length] = ECL_OR;
|
|
}
|
|
lhs_op_info->length += rhs_op_info->length + 1;
|
|
lhs_op_info->op_single_type = 0;
|
|
}
|
|
|
|
for (int i = 0; i < 8; i++)
|
|
lhs_op_info->bits.classwords[i] |= rhs_op_info->bits.classwords[i];
|
|
break;
|
|
|
|
/* ECL_XOR truth table:
|
|
|
|
LHS RHS RESULT
|
|
----------------
|
|
ANY * !RHS
|
|
* ANY !LHS
|
|
NONE * RHS
|
|
* NONE LHS
|
|
X Y X ^ Y
|
|
*/
|
|
|
|
case ECL_XOR:
|
|
if (rhs_op_info->op_single_type == ECL_NONE)
|
|
{
|
|
/* no-op: drop the RHS */
|
|
}
|
|
else if (lhs_op_info->op_single_type == ECL_NONE)
|
|
{
|
|
/* no-op: drop the LHS, and memmove the RHS into its place */
|
|
if (lengthptr == NULL)
|
|
memmove(lhs_op_info->code_start, rhs_op_info->code_start,
|
|
CU2BYTES(rhs_op_info->length));
|
|
lhs_op_info->length = rhs_op_info->length;
|
|
lhs_op_info->op_single_type = rhs_op_info->op_single_type;
|
|
}
|
|
else if (rhs_op_info->op_single_type == ECL_ANY)
|
|
{
|
|
/* the result is !LHS: fold in the negation, and drop the RHS */
|
|
/* Preserve the classbits, because we promise to deal with them later. */
|
|
fold_negation(lhs_op_info, lengthptr, TRUE);
|
|
}
|
|
else if (lhs_op_info->op_single_type == ECL_ANY)
|
|
{
|
|
/* the result is !RHS: drop the LHS, memmove the RHS into its place, and
|
|
fold in the negation */
|
|
if (lengthptr == NULL)
|
|
memmove(lhs_op_info->code_start, rhs_op_info->code_start,
|
|
CU2BYTES(rhs_op_info->length));
|
|
lhs_op_info->length = rhs_op_info->length;
|
|
lhs_op_info->op_single_type = rhs_op_info->op_single_type;
|
|
|
|
/* Preserve the classbits, because we promise to deal with them later. */
|
|
fold_negation(lhs_op_info, lengthptr, TRUE);
|
|
}
|
|
else
|
|
{
|
|
/* Both of LHS & RHS are either ECL_XCLASS, or compound operations. */
|
|
if (lengthptr != NULL)
|
|
*lengthptr += 1;
|
|
else
|
|
{
|
|
PCRE2_ASSERT(rhs_op_info->code_start ==
|
|
lhs_op_info->code_start + lhs_op_info->length);
|
|
rhs_op_info->code_start[rhs_op_info->length] = ECL_XOR;
|
|
}
|
|
lhs_op_info->length += rhs_op_info->length + 1;
|
|
lhs_op_info->op_single_type = 0;
|
|
}
|
|
|
|
for (int i = 0; i < 8; i++)
|
|
lhs_op_info->bits.classwords[i] ^= rhs_op_info->bits.classwords[i];
|
|
break;
|
|
|
|
default:
|
|
PCRE2_DEBUG_UNREACHABLE();
|
|
break;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/* Forward declaration. compile_class_operand() below calls this function
when it meets a nested extended class, so it has to be declared ahead of that
use. */

static BOOL
|
|
compile_eclass_nested(eclass_context *context, BOOL negated,
|
|
uint32_t **pptr, PCRE2_UCHAR **pcode,
|
|
eclass_op_info *pop_info, PCRE2_SIZE *lengthptr);
|
|
|
|
/* This function consumes a group of implicitly-unioned class elements.
|
|
These can be characters, ranges, properties, or nested classes, as long
|
|
as they are all joined by being placed adjacently. */
|
|
|
|
static BOOL
|
|
compile_class_operand(eclass_context *context, BOOL negated,
|
|
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
|
|
PCRE2_SIZE *lengthptr)
|
|
{
|
|
uint32_t *ptr = *pptr;
|
|
uint32_t *prev_ptr;
|
|
PCRE2_UCHAR *code = *pcode;
|
|
PCRE2_UCHAR *code_start = code;
|
|
PCRE2_SIZE prev_length = (lengthptr != NULL)? *lengthptr : 0;
|
|
PCRE2_SIZE extra_length;
|
|
uint32_t meta = META_CODE(*ptr);
|
|
|
|
switch (meta)
|
|
{
|
|
case META_CLASS_EMPTY_NOT:
|
|
case META_CLASS_EMPTY:
|
|
++ptr;
|
|
pop_info->length = 1;
|
|
if ((meta == META_CLASS_EMPTY) == negated)
|
|
{
|
|
*code++ = pop_info->op_single_type = ECL_ANY;
|
|
memset(pop_info->bits.classbits, 0xff, 32);
|
|
}
|
|
else
|
|
{
|
|
*code++ = pop_info->op_single_type = ECL_NONE;
|
|
memset(pop_info->bits.classbits, 0, 32);
|
|
}
|
|
break;
|
|
|
|
case META_CLASS:
|
|
case META_CLASS_NOT:
|
|
if ((*ptr & CLASS_IS_ECLASS) != 0)
|
|
{
|
|
if (!compile_eclass_nested(context, negated, &ptr, &code,
|
|
pop_info, lengthptr))
|
|
return FALSE;
|
|
|
|
PCRE2_ASSERT(*ptr == META_CLASS_END);
|
|
ptr++;
|
|
goto DONE;
|
|
}
|
|
|
|
ptr++;
|
|
/* Fall through */
|
|
|
|
default:
|
|
/* Scan forward characters, ranges, and properties.
|
|
For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but
|
|
we still need to collect that fragment up into a "leaf" OP_CLASS. */
|
|
|
|
prev_ptr = ptr;
|
|
ptr = PRIV(compile_class_not_nested)(
|
|
context->options, context->xoptions, ptr, &code,
|
|
(meta != META_CLASS_NOT) == negated, &context->needs_bitmap,
|
|
context->errorcodeptr, context->cb, lengthptr);
|
|
if (ptr == NULL) return FALSE;
|
|
|
|
/* We must have a 100% guarantee that ptr increases when
|
|
compile_class_operand() returns, even on Release builds, so that we can
|
|
statically prove our loops terminate. */
|
|
if (ptr <= prev_ptr)
|
|
{
|
|
PCRE2_DEBUG_UNREACHABLE();
|
|
return FALSE;
|
|
}
|
|
|
|
/* If we fell through above, consume the closing ']'. */
|
|
if (meta == META_CLASS || meta == META_CLASS_NOT)
|
|
{
|
|
PCRE2_ASSERT(*ptr == META_CLASS_END);
|
|
ptr++;
|
|
}
|
|
|
|
/* Regardless of whether (lengthptr == NULL), some data will still be written
|
|
out to *pcode, which we need: we have to peek at it, to transform the opcode
|
|
into the ECLASS version (since we need to hoist up the bitmaps). */
|
|
PCRE2_ASSERT(code > code_start);
|
|
extra_length = (lengthptr != NULL)? *lengthptr - prev_length : 0;
|
|
|
|
/* Easiest case: convert OP_ALLANY to ECL_ANY */
|
|
|
|
if (*code_start == OP_ALLANY)
|
|
{
|
|
PCRE2_ASSERT(code - code_start == 1 && extra_length == 0);
|
|
pop_info->length = 1;
|
|
*code_start = pop_info->op_single_type = ECL_ANY;
|
|
memset(pop_info->bits.classbits, 0xff, 32);
|
|
}
|
|
|
|
/* For OP_CLASS and OP_NCLASS, we hoist out the bitmap and convert to
|
|
ECL_NONE / ECL_ANY respectively. */
|
|
|
|
else if (*code_start == OP_CLASS || *code_start == OP_NCLASS)
|
|
{
|
|
PCRE2_ASSERT(code - code_start == 1 + 32 / sizeof(PCRE2_UCHAR) &&
|
|
extra_length == 0);
|
|
pop_info->length = 1;
|
|
*code_start = pop_info->op_single_type =
|
|
(*code_start == OP_CLASS)? ECL_NONE : ECL_ANY;
|
|
memcpy(pop_info->bits.classbits, code_start + 1, 32);
|
|
/* Rewind the code pointer, but make sure we adjust *lengthptr, because we
|
|
do need to reserve that space (even though we only use it temporarily). */
|
|
if (lengthptr != NULL)
|
|
*lengthptr += code - (code_start + 1);
|
|
code = code_start + 1;
|
|
|
|
if (!context->needs_bitmap && *code_start == ECL_NONE)
|
|
{
|
|
uint32_t *classwords = pop_info->bits.classwords;
|
|
|
|
for (int i = 0; i < 8; i++)
|
|
if (classwords[i] != 0)
|
|
{
|
|
context->needs_bitmap = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
context->needs_bitmap = TRUE;
|
|
}
|
|
|
|
/* Finally, for OP_XCLASS we hoist out the bitmap (if any), and convert to
|
|
ECL_XCLASS. */
|
|
|
|
else
|
|
{
|
|
PCRE2_ASSERT(*code_start == OP_XCLASS);
|
|
*code_start = pop_info->op_single_type = ECL_XCLASS;
|
|
|
|
PCRE2_ASSERT(code - code_start >= 1 + LINK_SIZE + 1);
|
|
|
|
memcpy(pop_info->bits.classbits, context->cb->classbits.classbits, 32);
|
|
pop_info->length = (code - code_start) + extra_length;
|
|
}
|
|
|
|
break;
|
|
} /* End of switch(meta) */
|
|
|
|
pop_info->code_start = (lengthptr == NULL)? code_start : NULL;
|
|
|
|
if (lengthptr != NULL)
|
|
{
|
|
*lengthptr += code - code_start;
|
|
code = code_start;
|
|
}
|
|
|
|
DONE:
|
|
PCRE2_ASSERT(lengthptr == NULL || (code == code_start));
|
|
|
|
*pptr = ptr;
|
|
*pcode = code;
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
|
|
/* This function consumes a group of implicitly-unioned class elements.
|
|
These can be characters, ranges, properties, or nested classes, as long
|
|
as they are all joined by being placed adjacently. */
|
|
|
|
static BOOL
|
|
compile_class_juxtaposition(eclass_context *context, BOOL negated,
|
|
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
|
|
PCRE2_SIZE *lengthptr)
|
|
{
|
|
uint32_t *ptr = *pptr;
|
|
PCRE2_UCHAR *code = *pcode;
|
|
#ifdef PCRE2_DEBUG
|
|
PCRE2_UCHAR *start_code = *pcode;
|
|
#endif
|
|
|
|
/* See compile_class_binary_loose() for comments on compile-time folding of
|
|
the "negated" flag. */
|
|
|
|
/* Because it's a non-empty class, there must be an operand at the start. */
|
|
if (!compile_class_operand(context, negated, &ptr, &code, pop_info, lengthptr))
|
|
return FALSE;
|
|
|
|
while (*ptr != META_CLASS_END &&
|
|
!(*ptr >= META_ECLASS_AND && *ptr <= META_ECLASS_NOT))
|
|
{
|
|
uint32_t op;
|
|
BOOL rhs_negated;
|
|
eclass_op_info rhs_op_info;
|
|
|
|
if (negated)
|
|
{
|
|
/* !(A juxtapose B) -> !A && !B */
|
|
op = ECL_AND;
|
|
rhs_negated = TRUE;
|
|
}
|
|
else
|
|
{
|
|
/* A juxtapose B -> A || B */
|
|
op = ECL_OR;
|
|
rhs_negated = FALSE;
|
|
}
|
|
|
|
/* An operand must follow the operator. */
|
|
if (!compile_class_operand(context, rhs_negated, &ptr, &code,
|
|
&rhs_op_info, lengthptr))
|
|
return FALSE;
|
|
|
|
/* Convert infix to postfix (RPN). */
|
|
fold_binary(op, pop_info, &rhs_op_info, lengthptr);
|
|
if (lengthptr == NULL)
|
|
code = pop_info->code_start + pop_info->length;
|
|
}
|
|
|
|
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
|
|
|
|
*pptr = ptr;
|
|
*pcode = code;
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
|
|
/* This function consumes unary prefix operators. */
|
|
|
|
static BOOL
|
|
compile_class_unary(eclass_context *context, BOOL negated,
|
|
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
|
|
PCRE2_SIZE *lengthptr)
|
|
{
|
|
uint32_t *ptr = *pptr;
|
|
#ifdef PCRE2_DEBUG
|
|
PCRE2_UCHAR *start_code = *pcode;
|
|
#endif
|
|
|
|
while (*ptr == META_ECLASS_NOT)
|
|
{
|
|
++ptr;
|
|
negated = !negated;
|
|
}
|
|
|
|
*pptr = ptr;
|
|
/* Because it's a non-empty class, there must be an operand. */
|
|
if (!compile_class_juxtaposition(context, negated, pptr, pcode,
|
|
pop_info, lengthptr))
|
|
return FALSE;
|
|
|
|
PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
|
|
/* This function consumes tightly-binding binary operators. */
|
|
|
|
static BOOL
|
|
compile_class_binary_tight(eclass_context *context, BOOL negated,
|
|
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
|
|
PCRE2_SIZE *lengthptr)
|
|
{
|
|
uint32_t *ptr = *pptr;
|
|
PCRE2_UCHAR *code = *pcode;
|
|
#ifdef PCRE2_DEBUG
|
|
PCRE2_UCHAR *start_code = *pcode;
|
|
#endif
|
|
|
|
/* See compile_class_binary_loose() for comments on compile-time folding of
|
|
the "negated" flag. */
|
|
|
|
/* Because it's a non-empty class, there must be an operand at the start. */
|
|
if (!compile_class_unary(context, negated, &ptr, &code, pop_info, lengthptr))
|
|
return FALSE;
|
|
|
|
while (*ptr == META_ECLASS_AND)
|
|
{
|
|
uint32_t op;
|
|
BOOL rhs_negated;
|
|
eclass_op_info rhs_op_info;
|
|
|
|
if (negated)
|
|
{
|
|
/* !(A && B) -> !A || !B */
|
|
op = ECL_OR;
|
|
rhs_negated = TRUE;
|
|
}
|
|
else
|
|
{
|
|
/* A && B -> A && B */
|
|
op = ECL_AND;
|
|
rhs_negated = FALSE;
|
|
}
|
|
|
|
++ptr;
|
|
|
|
/* An operand must follow the operator. */
|
|
if (!compile_class_unary(context, rhs_negated, &ptr, &code,
|
|
&rhs_op_info, lengthptr))
|
|
return FALSE;
|
|
|
|
/* Convert infix to postfix (RPN). */
|
|
fold_binary(op, pop_info, &rhs_op_info, lengthptr);
|
|
if (lengthptr == NULL)
|
|
code = pop_info->code_start + pop_info->length;
|
|
}
|
|
|
|
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
|
|
|
|
*pptr = ptr;
|
|
*pcode = code;
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
|
|
/* This function consumes loosely-binding binary operators. */
|
|
|
|
static BOOL
|
|
compile_class_binary_loose(eclass_context *context, BOOL negated,
|
|
uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info,
|
|
PCRE2_SIZE *lengthptr)
|
|
{
|
|
uint32_t *ptr = *pptr;
|
|
PCRE2_UCHAR *code = *pcode;
|
|
#ifdef PCRE2_DEBUG
|
|
PCRE2_UCHAR *start_code = *pcode;
|
|
#endif
|
|
|
|
/* We really want to fold the negation operator, if at all possible, so that
|
|
simple cases can be reduced down. In particular, in 8-bit no-UTF mode, we want
|
|
to produce a fully-folded expression, so that we can guarantee not to emit any
|
|
OP_ECLASS codes (in the same way that we never emit OP_XCLASS in this mode).
|
|
|
|
This has the consequence that with a little ingenuity, we can in fact avoid
|
|
emitting (nearly...) all cases of the "NOT" operator. Imagine that we have:
|
|
!(A ...
|
|
We have parsed the preceding "!", and we are about to parse the "A" operand. We
|
|
don't know yet whether there will even be a following binary operand! Both of
|
|
these are possibilities for what follows:
|
|
!(A && B)
|
|
!(A)
|
|
However, we can still fold the "!" into the "A" operand, because no matter what
|
|
the following binary operator will be, we can produce an expression which is
|
|
equivalent. */
|
|
|
|
/* Because it's a non-empty class, there must be an operand at the start. */
|
|
if (!compile_class_binary_tight(context, negated, &ptr, &code,
|
|
pop_info, lengthptr))
|
|
return FALSE;
|
|
|
|
while (*ptr >= META_ECLASS_OR && *ptr <= META_ECLASS_XOR)
|
|
{
|
|
uint32_t op;
|
|
BOOL op_neg;
|
|
BOOL rhs_negated;
|
|
eclass_op_info rhs_op_info;
|
|
|
|
if (negated)
|
|
{
|
|
/* The whole expression is being negated; we respond by unconditionally
|
|
negating the LHS A, before seeing what follows. And hooray! We can recover,
|
|
no matter what follows. */
|
|
/* !(A || B) -> !A && !B */
|
|
/* !(A -- B) -> !(A && !B) -> !A || B */
|
|
/* !(A XOR B) -> !(!A XOR !B) -> !A XNOR !B */
|
|
op = (*ptr == META_ECLASS_OR )? ECL_AND :
|
|
(*ptr == META_ECLASS_SUB)? ECL_OR :
|
|
/*ptr == META_ECLASS_XOR*/ ECL_XOR;
|
|
op_neg = (*ptr == META_ECLASS_XOR);
|
|
rhs_negated = *ptr != META_ECLASS_SUB;
|
|
}
|
|
else
|
|
{
|
|
/* A || B -> A || B */
|
|
/* A -- B -> A && !B */
|
|
/* A XOR B -> A XOR B */
|
|
op = (*ptr == META_ECLASS_OR )? ECL_OR :
|
|
(*ptr == META_ECLASS_SUB)? ECL_AND :
|
|
/*ptr == META_ECLASS_XOR*/ ECL_XOR;
|
|
op_neg = FALSE;
|
|
rhs_negated = *ptr == META_ECLASS_SUB;
|
|
}
|
|
|
|
++ptr;
|
|
|
|
/* An operand must follow the operator. */
|
|
if (!compile_class_binary_tight(context, rhs_negated, &ptr, &code,
|
|
&rhs_op_info, lengthptr))
|
|
return FALSE;
|
|
|
|
/* Convert infix to postfix (RPN). */
|
|
fold_binary(op, pop_info, &rhs_op_info, lengthptr);
|
|
if (op_neg) fold_negation(pop_info, lengthptr, FALSE);
|
|
if (lengthptr == NULL)
|
|
code = pop_info->code_start + pop_info->length;
|
|
}
|
|
|
|
PCRE2_ASSERT(lengthptr == NULL || code == start_code);
|
|
|
|
*pptr = ptr;
|
|
*pcode = code;
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
|
|
/* This function converts the META codes in pptr into opcodes written to
|
|
pcode. The pptr must start at a META_CLASS or META_CLASS_NOT.
|
|
|
|
The class is compiled as a left-associative sequence of operator
|
|
applications.
|
|
|
|
The pptr will be left pointing at the matching META_CLASS_END. */
|
|
|
|
static BOOL
|
|
compile_eclass_nested(eclass_context *context, BOOL negated,
|
|
uint32_t **pptr, PCRE2_UCHAR **pcode,
|
|
eclass_op_info *pop_info, PCRE2_SIZE *lengthptr)
|
|
{
|
|
uint32_t *ptr = *pptr;
|
|
#ifdef PCRE2_DEBUG
|
|
PCRE2_UCHAR *start_code = *pcode;
|
|
#endif
|
|
|
|
/* The CLASS_IS_ECLASS bit must be set since it is a nested class. */
|
|
PCRE2_ASSERT(*ptr == (META_CLASS | CLASS_IS_ECLASS) ||
|
|
*ptr == (META_CLASS_NOT | CLASS_IS_ECLASS));
|
|
|
|
if (*ptr++ == (META_CLASS_NOT | CLASS_IS_ECLASS))
|
|
negated = !negated;
|
|
|
|
(*pptr)++;
|
|
|
|
/* Because it's a non-empty class, there must be an operand at the start. */
|
|
if (!compile_class_binary_loose(context, negated, pptr, pcode,
|
|
pop_info, lengthptr))
|
|
return FALSE;
|
|
|
|
PCRE2_ASSERT(**pptr == META_CLASS_END);
|
|
PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code);
|
|
return TRUE;
|
|
}
|
|
|
|
BOOL
|
|
PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions,
|
|
uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr,
|
|
compile_block *cb, PCRE2_SIZE *lengthptr)
|
|
{
|
|
eclass_context context;
|
|
eclass_op_info op_info;
|
|
PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0;
|
|
PCRE2_UCHAR *code = *pcode;
|
|
PCRE2_UCHAR *previous;
|
|
BOOL allbitsone = TRUE;
|
|
|
|
context.needs_bitmap = FALSE;
|
|
context.options = options;
|
|
context.xoptions = xoptions;
|
|
context.errorcodeptr = errorcodeptr;
|
|
context.cb = cb;
|
|
|
|
previous = code;
|
|
*code++ = OP_ECLASS;
|
|
code += LINK_SIZE;
|
|
*code++ = 0; /* Flags, currently zero. */
|
|
if (!compile_eclass_nested(&context, FALSE, pptr, &code, &op_info, lengthptr))
|
|
return FALSE;
|
|
|
|
if (lengthptr != NULL)
|
|
{
|
|
*lengthptr += code - previous;
|
|
code = previous;
|
|
/* (*lengthptr - previous_length) now holds the amount of buffer that
|
|
we require to make the call to compile_class_nested() with
|
|
lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out
|
|
before that call. */
|
|
}
|
|
|
|
/* Do some useful counting of what's in the bitmap. */
|
|
for (int i = 0; i < 8; i++)
|
|
if (op_info.bits.classwords[i] != 0xffffffff)
|
|
{
|
|
allbitsone = FALSE;
|
|
break;
|
|
}
|
|
|
|
/* After constant-folding the extended class syntax, it may turn out to be
|
|
a simple class after all. In that case, we can unwrap it from the
|
|
OP_ECLASS container - and in fact, we must do so, because in 8-bit
|
|
no-Unicode mode the matcher is compiled without support for OP_ECLASS. */
|
|
|
|
#ifndef SUPPORT_WIDE_CHARS
|
|
PCRE2_ASSERT(op_info.op_single_type != 0);
|
|
#else
|
|
if (op_info.op_single_type != 0)
|
|
#endif
|
|
{
|
|
/* Rewind back over the OP_ECLASS. */
|
|
code = previous;
|
|
|
|
/* If the bits are all ones, and the "high characters" are all matched
|
|
too, we use a special-cased encoding of OP_ALLANY. */
|
|
|
|
if (op_info.op_single_type == ECL_ANY && allbitsone)
|
|
{
|
|
/* Advancing code means rewinding lengthptr, at this point. */
|
|
if (lengthptr != NULL) *lengthptr -= 1;
|
|
*code++ = OP_ALLANY;
|
|
}
|
|
|
|
/* If the high bits are all matched / all not-matched, then we emit an
|
|
OP_NCLASS/OP_CLASS respectively. */
|
|
|
|
else if (op_info.op_single_type == ECL_ANY ||
|
|
op_info.op_single_type == ECL_NONE)
|
|
{
|
|
PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR));
|
|
|
|
if (lengthptr != NULL)
|
|
{
|
|
if (required_len > (*lengthptr - previous_length))
|
|
*lengthptr = previous_length + required_len;
|
|
}
|
|
|
|
/* Advancing code means rewinding lengthptr, at this point. */
|
|
if (lengthptr != NULL) *lengthptr -= required_len;
|
|
*code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS;
|
|
memcpy(code, op_info.bits.classbits, 32);
|
|
code += 32 / sizeof(PCRE2_UCHAR);
|
|
}
|
|
|
|
/* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data
|
|
there, but, we pulled out its bitmap into op_info, so now we have to
|
|
put that back into the OP_XCLASS. */
|
|
|
|
else
|
|
{
|
|
#ifndef SUPPORT_WIDE_CHARS
|
|
PCRE2_DEBUG_UNREACHABLE();
|
|
#else
|
|
BOOL need_map = context.needs_bitmap;
|
|
PCRE2_SIZE required_len;
|
|
|
|
PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS);
|
|
required_len = op_info.length + (need_map? 32/sizeof(PCRE2_UCHAR) : 0);
|
|
|
|
if (lengthptr != NULL)
|
|
{
|
|
/* Don't unconditionally request all the space we need - we may
|
|
already have asked for more during processing of the ECLASS. */
|
|
if (required_len > (*lengthptr - previous_length))
|
|
*lengthptr = previous_length + required_len;
|
|
|
|
/* The code we write out here won't be ignored, even during the
|
|
(lengthptr != NULL) phase, because if there's a following quantifier
|
|
it will peek backwards. So we do have to write out a (truncated)
|
|
OP_XCLASS, even on this branch. */
|
|
*lengthptr -= 1 + LINK_SIZE + 1;
|
|
*code++ = OP_XCLASS;
|
|
PUT(code, 0, 1 + LINK_SIZE + 1);
|
|
code += LINK_SIZE;
|
|
*code++ = 0;
|
|
}
|
|
else
|
|
{
|
|
PCRE2_UCHAR *rest;
|
|
PCRE2_SIZE rest_len;
|
|
PCRE2_UCHAR flags;
|
|
|
|
/* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */
|
|
PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1);
|
|
rest = op_info.code_start + 1 + LINK_SIZE + 1;
|
|
rest_len = (op_info.code_start + op_info.length) - rest;
|
|
|
|
/* First read any data we use, before memmove splats it. */
|
|
flags = op_info.code_start[1 + LINK_SIZE];
|
|
PCRE2_ASSERT((flags & XCL_MAP) == 0);
|
|
|
|
/* Next do the memmove before any writes. */
|
|
memmove(code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0),
|
|
rest, CU2BYTES(rest_len));
|
|
|
|
/* Finally write the header data. */
|
|
*code++ = OP_XCLASS;
|
|
PUT(code, 0, (int)required_len);
|
|
code += LINK_SIZE;
|
|
*code++ = flags | (need_map? XCL_MAP : 0);
|
|
if (need_map)
|
|
{
|
|
memcpy(code, op_info.bits.classbits, 32);
|
|
code += 32 / sizeof(PCRE2_UCHAR);
|
|
}
|
|
code += rest_len;
|
|
}
|
|
#endif /* SUPPORT_WIDE_CHARS */
|
|
}
|
|
}
|
|
|
|
/* Otherwise, we're going to keep the OP_ECLASS. However, again we need
|
|
to do some adjustment to insert the bitmap if we have one. */
|
|
|
|
#ifdef SUPPORT_WIDE_CHARS
|
|
else
|
|
{
|
|
BOOL need_map = context.needs_bitmap;
|
|
PCRE2_SIZE required_len =
|
|
1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0) + op_info.length;
|
|
|
|
if (lengthptr != NULL)
|
|
{
|
|
if (required_len > (*lengthptr - previous_length))
|
|
*lengthptr = previous_length + required_len;
|
|
|
|
/* As for the XCLASS branch above, we do have to write out a dummy
|
|
OP_ECLASS, because of the backwards peek by the quantifier code. Write
|
|
out a (truncated) OP_ECLASS, even on this branch. */
|
|
*lengthptr -= 1 + LINK_SIZE + 1;
|
|
*code++ = OP_ECLASS;
|
|
PUT(code, 0, 1 + LINK_SIZE + 1);
|
|
code += LINK_SIZE;
|
|
*code++ = 0;
|
|
}
|
|
else
|
|
{
|
|
if (need_map)
|
|
{
|
|
PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1;
|
|
previous[1 + LINK_SIZE] |= ECL_MAP;
|
|
memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start,
|
|
CU2BYTES(code - map_start));
|
|
memcpy(map_start, op_info.bits.classbits, 32);
|
|
code += 32 / sizeof(PCRE2_UCHAR);
|
|
}
|
|
PUT(previous, 1, (int)(code - previous));
|
|
}
|
|
}
|
|
#endif /* SUPPORT_WIDE_CHARS */
|
|
|
|
*pcode = code;
|
|
return TRUE;
|
|
}
|
|
|
|
/* End of pcre2_compile_class.c */
|