mirror of
https://github.com/php/php-src.git
synced 2025-08-15 21:48:51 +02:00
2280 lines
65 KiB
C
2280 lines
65 KiB
C
/*************************************************
|
|
* Perl-Compatible Regular Expressions *
|
|
*************************************************/
|
|
|
|
/* PCRE is a library of functions to support regular expressions whose syntax
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
This module by Zoltan Herczeg
|
|
Original API code Copyright (c) 1997-2012 University of Cambridge
|
|
New API code Copyright (c) 2016-2024 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are met:
|
|
|
|
* Redistributions of source code must retain the above copyright notice,
|
|
this list of conditions and the following disclaimer.
|
|
|
|
* Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
|
|
* Neither the name of the University of Cambridge nor the names of its
|
|
contributors may be used to endorse or promote products derived from
|
|
this software without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
POSSIBILITY OF SUCH DAMAGE.
|
|
-----------------------------------------------------------------------------
|
|
*/
|
|
|
|
/* XClass matching code. */
|
|
|
|
#ifdef SUPPORT_WIDE_CHARS
|
|
|
|
/* Register aliases used when an xclass is compiled with XCLASS_IS_ECLASS set.
   ECLASS_CHAR_DATA (STACK_TOP) is the register the character is copied back
   from into TMP1 in that mode; ECLASS_STACK_DATA (STACK_LIMIT) is the register
   that gets its lowest bit OR-ed in when the class matches. */
#define ECLASS_CHAR_DATA STACK_TOP
|
|
#define ECLASS_STACK_DATA STACK_LIMIT
|
|
|
|
/* Changes the bias applied to the character held in TMP1.

   The surrounding code keeps TMP1 equal to (character - charoffset). This
   macro emits a single add or subtract so that TMP1 becomes
   (character - value), and then records the new bias in the local variable
   "charoffset". Nothing is emitted when the bias is already "value".

   It relies on the names "charoffset", "compiler"/OP2 and TMP1 being available
   at the point of use. "value" is evaluated more than once, so it must not
   have side effects.

   The body is wrapped in do { } while (0) so that the macro behaves as one
   statement: it can be the body of an unbraced if / else without the
   charoffset update escaping the condition. */
#define SET_CHAR_OFFSET(value) \
  do \
    { \
    if ((value) != charoffset) \
      { \
      if ((value) < charoffset) \
        OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(charoffset - (value))); \
      else \
        OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)((value) - charoffset)); \
      } \
    charoffset = (value); \
    } \
  while (0)
|
|
|
|
/* Reads the next item of a character list into "destination" and advances
   the read pointer.

   Items of the first two lists (list_ind 0 and 1) are 16 bit wide, items of
   all later lists are 32 bit wide. The macro uses the caller's local
   variables "list_ind" (index of the current list) and "next_char"
   (const uint8_t* read position, which must be suitably aligned for the
   item width).

   The body is wrapped in do { } while (0) so the macro is a single statement
   and cannot capture a following "else" or break an enclosing if / else. */
#define READ_FROM_CHAR_LIST(destination) \
  do \
    { \
    if (list_ind <= 1) \
      { \
      destination = *(const uint16_t*)next_char; \
      next_char += 2; \
      } \
    else \
      { \
      destination = *(const uint32_t*)next_char; \
      next_char += 4; \
      } \
    } \
  while (0)
|
|
|
|
/* Sizes of the buffers embedded in xclass_ranges. local_ranges holds
   XCLASS_LOCAL_RANGES_SIZE * 2 values and local_stack holds
   XCLASS_LOCAL_RANGES_LOG2_SIZE items, so the second constant must stay
   log2 of the first (log2(32) == 5). xclass_compute_ranges allocates a
   separate buffer when its estimate exceeds XCLASS_LOCAL_RANGES_SIZE. */
#define XCLASS_LOCAL_RANGES_SIZE 32
|
|
#define XCLASS_LOCAL_RANGES_LOG2_SIZE 5
|
|
|
|
/* One entry of the work stack used by compile_xclass_matchingpath while it
   generates the range search code. */
typedef struct xclass_stack_item {
|
|
/* NOTE(review): presumably the index bounds (into xclass_ranges.ranges) of a
   sub-interval that still has to be processed - confirm against the part of
   compile_xclass_matchingpath that pushes and pops these items. */
sljit_u32 first_item;
|
|
sljit_u32 last_item;
|
|
/* Pending forward jump recorded for this entry (assigned from
   JUMP(SLJIT_GREATER) in compile_xclass_matchingpath). */
struct sljit_jump *jump;
|
|
} xclass_stack_item;
|
|
|
|
/* Result buffer of xclass_compute_ranges. The caller points "ranges" and
   "stack" at the embedded local arrays before the call;
   xclass_compute_ranges replaces both with one allocated block when the
   local arrays are too small, and sets both to NULL when that allocation
   fails. */
typedef struct xclass_ranges {
|
|
/* Number of uint32_t values stored in "ranges"; every range occupies two
   consecutive values (start, end), so this is always even. */
size_t range_count;
|
|
/* Pointer to ranges. A stack area is provided when a small buffer is enough. */
|
|
uint32_t *ranges;
|
|
uint32_t local_ranges[XCLASS_LOCAL_RANGES_SIZE * 2];
|
|
/* Stack size must be log2(ranges / 2). */
|
|
xclass_stack_item *stack;
|
|
xclass_stack_item local_stack[XCLASS_LOCAL_RANGES_LOG2_SIZE];
|
|
} xclass_ranges;
|
|
|
|
/* Converts the character items of an extended class into a flat array of
   (start, end) pairs.

   Arguments:
     common   compiler state; supplies utf, start and the sljit compiler
     cc       points to the first XCL_SINGLE / XCL_RANGE item, or to the
              header of a character list (value >= XCL_LIST)
     ranges   output; ranges->ranges and ranges->stack must point to the
              embedded local buffers on entry

   On return ranges->range_count is the number of values written (two per
   range). When the estimated number of ranges is larger than
   XCLASS_LOCAL_RANGES_SIZE, a single block holding both the stack and the
   range array is allocated with SLJIT_MALLOC and stored in ranges->stack /
   ranges->ranges. If that allocation fails, the compiler memory error is set
   and both pointers are NULL. */
static void xclass_compute_ranges(compiler_common *common, PCRE2_SPTR cc, xclass_ranges *ranges)
{
DEFINE_COMPILER;
/* Marker meaning "no range is currently open". */
const uint32_t no_start = ~(uint32_t)0;
size_t range_count = 0;
size_t est_range_count = 0;
size_t est_stack_size, tmp;
uint32_t type, est_type, list_ind;
uint32_t char_list_add, range_start, range_end;
const uint8_t *next_char;
const uint8_t *est_next_char;
#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
BOOL utf = common->utf;
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */

if (*cc == XCL_SINGLE || *cc == XCL_RANGE)
  {
  /* Short form: a sequence of explicit single characters and ranges,
  terminated by XCL_END. A single character is stored as a range whose
  start and end are equal. */
  for (;;)
    {
    type = *cc++;
    SLJIT_ASSERT(type == XCL_SINGLE || type == XCL_RANGE);

    GETCHARINCTEST(range_start, cc);
    range_end = range_start;

    /* The braces are required: GETCHARINCTEST may expand to more than
    one statement. */
    if (type == XCL_RANGE)
      {
      GETCHARINCTEST(range_end, cc);
      }

    ranges->ranges[range_count++] = range_start;
    ranges->ranges[range_count++] = range_end;

    if (*cc == XCL_END)
      break;
    }

  SLJIT_ASSERT(range_count <= XCLASS_LOCAL_RANGES_SIZE);
  ranges->range_count = range_count;
  return;
  }

/* Character list form: decode the list descriptor. */
SLJIT_ASSERT(cc[0] >= XCL_LIST);
#if PCRE2_CODE_UNIT_WIDTH == 8
type = (uint32_t)(cc[0] << 8) | cc[1];
cc += 2;
#else
type = cc[0];
cc++;
#endif /* CODE_UNIT_WIDTH */
type &= XCL_TYPE_MASK;

/* The list data is located before common->start; the offset stored in the
pattern is in two byte units. */
next_char = (const uint8_t*)common->start - (GET(cc, 0) << 1);

/* First pass: walk the per-list descriptors to get an upper estimate of
the number of ranges. Nothing is stored in this pass. */
est_next_char = next_char;

for (est_type = type, list_ind = 0; est_type != 0; est_type >>= XCL_TYPE_BIT_LEN, list_ind++)
  {
  /* Lists 0 and 1 hold 16 bit items, later lists hold 32 bit items. */
  const uint32_t item_shift = (list_ind <= 1) ? 1 : 2;
  uint32_t item_count = est_type & XCL_ITEM_COUNT_MASK;

  if (item_count == XCL_ITEM_COUNT_MASK)
    {
    /* The real item count is stored as the first item of the list. */
    if (item_shift == 1)
      item_count = *(const uint16_t*)est_next_char;
    else
      item_count = *(const uint32_t*)est_next_char;

    est_next_char += (size_t)1 << item_shift;
    }

  est_next_char += (size_t)item_count << item_shift;
  est_range_count += item_count + 1;
  }

if (est_range_count > XCLASS_LOCAL_RANGES_SIZE)
  {
  size_t stack_bytes, range_bytes;

  /* Number of bits needed to represent est_range_count - 1. */
  est_stack_size = 0;
  for (tmp = est_range_count - 1; tmp != 0; tmp >>= 1)
    est_stack_size++;

  /* One block: the stack items first, followed by the range values. */
  stack_bytes = sizeof(xclass_stack_item) * est_stack_size;
  range_bytes = (sizeof(uint32_t) << 1) * (size_t)est_range_count;

  ranges->stack = (xclass_stack_item*)SLJIT_MALLOC(stack_bytes + range_bytes, compiler->allocator_data);

  if (ranges->stack == NULL)
    {
    sljit_set_compiler_memory_error(compiler);
    ranges->ranges = NULL;
    return;
    }

  ranges->ranges = (uint32_t*)(ranges->stack + est_stack_size);
  }

/* Second pass: decode the items and emit the ranges. */
char_list_add = XCL_CHAR_LIST_LOW_16_ADD;
range_start = ((type & XCL_BEGIN_WITH_RANGE) != 0) ? XCL_CHAR_LIST_LOW_16_START : no_start;
list_ind = 0;

while (type != 0)
  {
  uint32_t item_count = type & XCL_ITEM_COUNT_MASK;
  uint32_t begins_with_range;

  if (item_count == XCL_ITEM_COUNT_MASK)
    {
    READ_FROM_CHAR_LIST(item_count);
    SLJIT_ASSERT(item_count >= XCL_ITEM_COUNT_MASK);
    }

  for (; item_count > 0; item_count--)
    {
    uint32_t item, decoded;

    READ_FROM_CHAR_LIST(item);
    decoded = char_list_add + (item >> XCL_CHAR_SHIFT);

    if ((item & XCL_CHAR_END) == 0)
      {
      /* This item opens a range; its end is supplied later. */
      range_start = decoded;
      continue;
      }

    /* This item closes a range, or is a single character when no range
    is open. */
    range_end = decoded;
    if (range_start == no_start)
      range_start = range_end;

    ranges->ranges[range_count] = range_start;
    ranges->ranges[range_count + 1] = range_end;
    range_count += 2;
    range_start = no_start;
    }

  /* Move to the next list; from here on "type" and "list_ind" describe
  the list that follows the one just processed. */
  list_ind++;
  type >>= XCL_TYPE_BIT_LEN;
  begins_with_range = type & XCL_BEGIN_WITH_RANGE;

  if (range_start != no_start)
    {
    if (begins_with_range == 0)
      {
      /* A range is still open but the next list does not continue it:
      close it at the last character of the list just processed. */
      switch (list_ind)
        {
        case 1:
        range_end = XCL_CHAR_LIST_LOW_16_END;
        break;

        case 2:
        range_end = XCL_CHAR_LIST_HIGH_16_END;
        break;

#if PCRE2_CODE_UNIT_WIDTH == 32
        case 3:
        range_end = XCL_CHAR_LIST_LOW_32_END;
        break;

        default:
        range_end = XCL_CHAR_LIST_HIGH_32_END;
        break;
#else
        default:
        range_end = XCL_CHAR_LIST_LOW_32_END;
        break;
#endif
        }

      ranges->ranges[range_count] = range_start;
      ranges->ranges[range_count + 1] = range_end;
      range_count += 2;
      range_start = no_start;
      }
    }
  else if (begins_with_range != 0)
    {
    /* No range is open and the next list starts inside a range: open it
    at the first character of that list. */
    switch (list_ind)
      {
      case 1:
      range_start = XCL_CHAR_LIST_HIGH_16_START;
      break;

#if PCRE2_CODE_UNIT_WIDTH == 32
      case 2:
      range_start = XCL_CHAR_LIST_LOW_32_START;
      break;

      default:
      range_start = XCL_CHAR_LIST_HIGH_32_START;
      break;
#else
      default:
      range_start = XCL_CHAR_LIST_LOW_32_START;
      break;
#endif
      }
    }

  /* Base value added to the items of the next list. */
  switch (list_ind)
    {
    case 1:
    char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;
    break;

#if PCRE2_CODE_UNIT_WIDTH == 32
    case 2:
    char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
    break;

    default:
    char_list_add = XCL_CHAR_LIST_HIGH_32_ADD;
    break;
#else
    default:
    char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
    break;
#endif
    }
  }

SLJIT_ASSERT(range_count > 0 && range_count <= (est_range_count << 1));
SLJIT_ASSERT(next_char <= (const uint8_t*)common->start);
ranges->range_count = range_count;
}
|
|
|
|
/* Emits the test of the character in TMP1 against a 256 bit class bitmap.

   Arguments:
     common      compiler state
     bitset      the 32 byte bitmap
     found       jump list that receives the "character is in the map" jumps
     backtracks  jump list that receives the "character is below 256 but is
                 not in the map" jump

   Characters above 255 skip the generated test and continue after it.
   TMP1 and TMP2 are overwritten by the bitmap lookup. */
static void xclass_check_bitset(compiler_common *common, const sljit_u8 *bitset, jump_list **found, jump_list **backtracks)
{
DEFINE_COMPILER;
struct sljit_jump *above_map;
/* State of the bit that belongs to character 255. */
BOOL last_bit_set = (bitset[31] & 0x80) != 0;

/* The bitmap only describes characters 0..255. */
above_map = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);

/* NOTE(review): optimize_class presumably emits a shorter comparison
sequence when it returns TRUE - confirm at its definition. The generic
lookup below is emitted only when it returns FALSE. */
if (optimize_class(common, bitset, last_bit_set, TRUE, found) == FALSE)
  {
  /* TMP2 = bit index inside the byte, TMP1 = byte index. */
  OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
  OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
  /* TMP1 = bitset[TMP1], TMP2 = 1 << TMP2. */
  OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)bitset);
  OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0);
  OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, TMP2, 0);
  add_jump(compiler, found, JUMP(SLJIT_NOT_ZERO));
  }

/* Below 256 and not accepted by the map. */
add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
JUMPHERE(above_map);
}
|
|
|
|
#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
|
|
|
|
/* Widens [*min_ptr, *max_ptr] so that it covers the smallest and largest
   character listed by the character items of an extended class.

   Arguments:
     common   compiler state (must be in UTF mode)
     cc       points to the first XCL_SINGLE / XCL_RANGE item, or to the
              header of a character list (value >= XCL_LIST)
     min_ptr  in/out: lowered when the class contains a smaller character
     max_ptr  in/out: raised when the class contains a larger character

   Only the first and the last item of the class are inspected when it is
   stored as character lists. */
static void xclass_update_min_max(compiler_common *common, PCRE2_SPTR cc, sljit_u32 *min_ptr, sljit_u32 *max_ptr)
{
sljit_u32 lowest = *min_ptr;
sljit_u32 highest = *max_ptr;
uint32_t list_type, list_no, ch, count, shift, base, last_item;
const uint8_t *items;
const uint8_t *first;
/* Referenced by GETCHARINCTEST. */
BOOL utf = TRUE;

/* This function is only useful in UTF-8 / UTF-16 mode. */
SLJIT_ASSERT(common->utf);

if (*cc == XCL_SINGLE || *cc == XCL_RANGE)
  {
  /* Short form: explicit single characters and ranges up to XCL_END. */
  for (;;)
    {
    list_type = *cc++;
    SLJIT_ASSERT(list_type == XCL_SINGLE || list_type == XCL_RANGE);

    GETCHARINCTEST(ch, cc);
    if (lowest > ch)
      lowest = ch;

    /* The braces are required: GETCHARINCTEST may expand to more than
    one statement. */
    if (list_type == XCL_RANGE)
      {
      GETCHARINCTEST(ch, cc);
      }

    if (highest < ch)
      highest = ch;

    if (*cc == XCL_END)
      break;
    }

  SLJIT_ASSERT(lowest <= MAX_UTF_CODE_POINT && highest <= MAX_UTF_CODE_POINT && lowest <= highest);
  *min_ptr = lowest;
  *max_ptr = highest;
  return;
  }

/* Character list form: decode the list descriptor. */
SLJIT_ASSERT(cc[0] >= XCL_LIST);
#if PCRE2_CODE_UNIT_WIDTH == 8
list_type = (uint32_t)(cc[0] << 8) | cc[1];
cc += 2;
#else
list_type = cc[0];
cc++;
#endif /* CODE_UNIT_WIDTH */
list_type &= XCL_TYPE_MASK;

/* The list data is located before common->start; the offset stored in the
pattern is in two byte units. */
items = (const uint8_t*)common->start - (GET(cc, 0) << 1);

SLJIT_ASSERT(list_type != 0);

/* Minimum: find the first list that has items or starts inside a range. */
list_no = 0;
while ((list_type & (XCL_BEGIN_WITH_RANGE | XCL_ITEM_COUNT_MASK)) == 0)
  {
  list_type >>= XCL_TYPE_BIT_LEN;
  list_no++;
  }

SLJIT_ASSERT(list_no <= 2);

if (list_no == 0)
  {
  base = XCL_CHAR_LIST_LOW_16_ADD;
  ch = XCL_CHAR_LIST_LOW_16_START;
  }
else if (list_no == 1)
  {
  base = XCL_CHAR_LIST_HIGH_16_ADD;
  ch = XCL_CHAR_LIST_HIGH_16_START;
  }
else
  {
  base = XCL_CHAR_LIST_LOW_32_ADD;
  ch = XCL_CHAR_LIST_LOW_32_START;
  }

/* When the list starts inside a range, the first character of the list
(already in ch) is the smallest one. Otherwise it is the first item. */
if ((list_type & XCL_BEGIN_WITH_RANGE) == 0)
  {
  shift = (list_no <= 1) ? 1 : 2;
  first = items;

  /* Skip the explicit item count stored in front of the items. */
  if ((list_type & XCL_ITEM_COUNT_MASK) == XCL_ITEM_COUNT_MASK)
    first += (size_t)1 << shift;

  ch = (list_no <= 1) ? *(const uint16_t*)first : *(const uint32_t*)first;
  ch = base + (ch >> XCL_CHAR_SHIFT);
  }

if (ch < lowest)
  lowest = ch;

/* Maximum: step over every list; the loop stops on the last one with
"items" pointing just after its final item. */
for (;;)
  {
  shift = (list_no <= 1) ? 1 : 2;
  count = list_type & XCL_ITEM_COUNT_MASK;

  if (count == XCL_ITEM_COUNT_MASK)
    {
    /* Explicit count: skip the count field and the items. */
    count = (shift == 1) ? *(const uint16_t*)items : *(const uint32_t*)items;
    count++;
    }

  items += count << shift;

  if ((list_type >> XCL_TYPE_BIT_LEN) == 0)
    break;

  list_no++;
  list_type >>= XCL_TYPE_BIT_LEN;
  }

SLJIT_ASSERT(list_no <= 2 && list_type != 0);

if (list_no == 0)
  {
  base = XCL_CHAR_LIST_LOW_16_ADD;
  ch = XCL_CHAR_LIST_LOW_16_END;
  }
else if (list_no == 1)
  {
  base = XCL_CHAR_LIST_HIGH_16_ADD;
  ch = XCL_CHAR_LIST_HIGH_16_END;
  }
else
  {
  base = XCL_CHAR_LIST_LOW_32_ADD;
  ch = XCL_CHAR_LIST_LOW_32_END;
  }

/* When the final item closes a range (or is a single character), it is the
largest character. Otherwise the range stays open and extends to the last
character of the list, which is already in ch. */
if ((list_type & XCL_ITEM_COUNT_MASK) != 0)
  {
  if (list_no <= 1)
    last_item = *(const uint16_t*)(items - 2);
  else
    last_item = *(const uint32_t*)(items - 4);

  if ((last_item & XCL_CHAR_END) != 0)
    ch = base + (last_item >> XCL_CHAR_SHIFT);
  }

if (ch > highest)
  highest = ch;

SLJIT_ASSERT(lowest <= MAX_UTF_CODE_POINT && highest <= MAX_UTF_CODE_POINT && lowest <= highest);
*min_ptr = lowest;
*max_ptr = highest;
}
|
|
|
|
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */
|
|
|
|
/* Bits of the "status" argument / local of compile_xclass_matchingpath.
   XCLASS_IS_ECLASS is supplied by the caller; the other bits are set by
   compile_xclass_matchingpath itself while it scans the class. */

/* The character is not read by read_char, and a successful match ORs 1 into
   ECLASS_STACK_DATA. */
#define XCLASS_IS_ECLASS 0x001
|
|
#ifdef SUPPORT_UNICODE
|
|
/* The character must be available again after the UCD record lookup (which
   overwrites TMP1), so it is copied back into TMP1 afterwards. */
#define XCLASS_SAVE_CHAR 0x002
|
|
/* At least one item needs the chartype field of the UCD record. */
#define XCLASS_HAS_TYPE 0x004
|
|
/* At least one item needs the script field (PT_SC, or a positive PT_SCX). */
#define XCLASS_HAS_SCRIPT 0x008
|
|
/* At least one PT_SCX item is present. */
#define XCLASS_HAS_SCRIPT_EXTENSION 0x010
|
|
/* At least one PT_BOOL item is present. */
#define XCLASS_HAS_BOOL 0x020
|
|
/* At least one PT_BIDICL item is present. */
#define XCLASS_HAS_BIDICL 0x040
|
|
/* Any of these bits means the UCD record of the character has to be located. */
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BOOL | XCLASS_HAS_BIDICL)
|
|
/* A negated (XCL_NOTPROP) PT_SCX item is present. */
#define XCLASS_SCRIPT_EXTENSION_NOTPROP 0x080
|
|
/* TMP2 was saved in RETURN_ADDR / LOCAL0 before the script extension checks
   and must be restored from there afterwards. */
#define XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR 0x100
|
|
#define XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0 0x200
|
|
#endif /* SUPPORT_UNICODE */
|
|
|
|
/* Forward declaration: compile_xclass_matchingpath calls this function (with
   OP_ALLANY) before its definition appears. */
static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr);
|
|
|
|
/* TMP3 must be preserved because it is used by compile_iterator_matchingpath. */
|
|
static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks, sljit_u32 status)
|
|
{
|
|
DEFINE_COMPILER;
|
|
jump_list *found = NULL;
|
|
jump_list *check_result = NULL;
|
|
jump_list **list = (cc[0] & XCL_NOT) == 0 ? &found : backtracks;
|
|
sljit_uw c, charoffset;
|
|
sljit_u32 max = READ_CHAR_MAX, min = 0;
|
|
struct sljit_jump *jump = NULL;
|
|
PCRE2_UCHAR flags;
|
|
PCRE2_SPTR ccbegin;
|
|
sljit_u32 compares, invertcmp, depth;
|
|
sljit_u32 first_item, last_item, mid_item;
|
|
sljit_u32 range_start, range_end;
|
|
xclass_ranges ranges;
|
|
BOOL has_cmov, last_range_set;
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
sljit_u32 category_list = 0;
|
|
sljit_u32 items;
|
|
int typereg = TMP1;
|
|
#endif /* SUPPORT_UNICODE */
|
|
|
|
SLJIT_ASSERT(common->locals_size >= SSIZE_OF(sw));
|
|
/* Scanning the necessary info. */
|
|
flags = *cc++;
|
|
ccbegin = cc;
|
|
compares = 0;
|
|
|
|
if (flags & XCL_MAP)
|
|
cc += 32 / sizeof(PCRE2_UCHAR);
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
|
|
{
|
|
compares++;
|
|
cc++;
|
|
|
|
items = 0;
|
|
|
|
switch(*cc)
|
|
{
|
|
case PT_LAMP:
|
|
items = UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt);
|
|
break;
|
|
|
|
case PT_GC:
|
|
items = UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1] * 2], PRIV(ucp_typerange)[(int)cc[1] * 2 + 1]);
|
|
break;
|
|
|
|
case PT_PC:
|
|
items = UCPCAT(cc[1]);
|
|
break;
|
|
|
|
case PT_WORD:
|
|
items = UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N;
|
|
break;
|
|
|
|
case PT_ALNUM:
|
|
items = UCPCAT_L | UCPCAT_N;
|
|
break;
|
|
|
|
case PT_SCX:
|
|
status |= XCLASS_HAS_SCRIPT_EXTENSION;
|
|
if (cc[-1] == XCL_NOTPROP)
|
|
{
|
|
status |= XCLASS_SCRIPT_EXTENSION_NOTPROP;
|
|
break;
|
|
}
|
|
compares++;
|
|
/* Fall through */
|
|
|
|
case PT_SC:
|
|
status |= XCLASS_HAS_SCRIPT;
|
|
break;
|
|
|
|
case PT_SPACE:
|
|
case PT_PXSPACE:
|
|
case PT_PXGRAPH:
|
|
case PT_PXPRINT:
|
|
case PT_PXPUNCT:
|
|
status |= XCLASS_SAVE_CHAR | XCLASS_HAS_TYPE;
|
|
break;
|
|
|
|
case PT_UCNC:
|
|
case PT_PXXDIGIT:
|
|
status |= XCLASS_SAVE_CHAR;
|
|
break;
|
|
|
|
case PT_BOOL:
|
|
status |= XCLASS_HAS_BOOL;
|
|
break;
|
|
|
|
case PT_BIDICL:
|
|
status |= XCLASS_HAS_BIDICL;
|
|
break;
|
|
|
|
default:
|
|
SLJIT_UNREACHABLE();
|
|
break;
|
|
}
|
|
|
|
if (items > 0)
|
|
{
|
|
if (cc[-1] == XCL_NOTPROP)
|
|
items ^= UCPCAT_ALL;
|
|
category_list |= items;
|
|
status |= XCLASS_HAS_TYPE;
|
|
compares--;
|
|
}
|
|
|
|
cc += 2;
|
|
}
|
|
|
|
if (category_list == UCPCAT_ALL)
|
|
{
|
|
/* All or no characters are accepted, same as dotall. */
|
|
if (status & XCLASS_IS_ECLASS)
|
|
{
|
|
if (list != backtracks)
|
|
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
|
|
return;
|
|
}
|
|
|
|
compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE);
|
|
if (list == backtracks)
|
|
add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
|
|
return;
|
|
}
|
|
|
|
if (category_list != 0)
|
|
compares++;
|
|
#endif
|
|
|
|
if (*cc != XCL_END)
|
|
{
|
|
#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
|
|
if (common->utf && compares == 0 && !(status & XCLASS_IS_ECLASS))
|
|
{
|
|
SLJIT_ASSERT(category_list == 0);
|
|
max = 0;
|
|
min = (flags & XCL_MAP) != 0 ? 0 : READ_CHAR_MAX;
|
|
xclass_update_min_max(common, cc, &min, &max);
|
|
}
|
|
#endif
|
|
compares++;
|
|
#ifdef SUPPORT_UNICODE
|
|
status |= XCLASS_SAVE_CHAR;
|
|
#endif /* SUPPORT_UNICODE */
|
|
}
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
SLJIT_ASSERT(compares > 0 || category_list != 0);
|
|
#else /* !SUPPORT_UNICODE */
|
|
SLJIT_ASSERT(compares > 0);
|
|
#endif /* SUPPORT_UNICODE */
|
|
|
|
/* We are not necessary in utf mode even in 8 bit mode. */
|
|
cc = ccbegin;
|
|
if (!(status & XCLASS_IS_ECLASS))
|
|
{
|
|
if ((flags & XCL_NOT) != 0)
|
|
read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR);
|
|
else
|
|
{
|
|
#ifdef SUPPORT_UNICODE
|
|
read_char(common, min, max, (status & XCLASS_NEEDS_UCD) ? backtracks : NULL, 0);
|
|
#else /* !SUPPORT_UNICODE */
|
|
read_char(common, min, max, NULL, 0);
|
|
#endif /* SUPPORT_UNICODE */
|
|
}
|
|
}
|
|
|
|
if ((flags & XCL_MAP) != 0)
|
|
{
|
|
SLJIT_ASSERT(!(status & XCLASS_IS_ECLASS));
|
|
xclass_check_bitset(common, (const sljit_u8 *)cc, &found, backtracks);
|
|
cc += 32 / sizeof(PCRE2_UCHAR);
|
|
}
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
if (status & XCLASS_NEEDS_UCD)
|
|
{
|
|
if ((status & (XCLASS_SAVE_CHAR | XCLASS_IS_ECLASS)) == XCLASS_SAVE_CHAR)
|
|
OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP1, 0);
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH == 32
|
|
if (!common->utf)
|
|
{
|
|
OP2U(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1);
|
|
SELECT(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, UNASSIGNED_UTF_CHAR, TMP1);
|
|
}
|
|
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
|
|
|
|
OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
|
|
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
|
|
OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1));
|
|
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK);
|
|
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
|
|
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
|
|
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2));
|
|
OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1);
|
|
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 3);
|
|
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);
|
|
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
|
|
|
|
ccbegin = cc;
|
|
|
|
if (status & XCLASS_HAS_BIDICL)
|
|
{
|
|
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx_bidiclass));
|
|
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BIDICLASS_SHIFT);
|
|
|
|
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
|
|
{
|
|
cc++;
|
|
|
|
if (*cc == PT_BIDICL)
|
|
{
|
|
compares--;
|
|
invertcmp = (compares == 0 && list != backtracks);
|
|
if (cc[-1] == XCL_NOTPROP)
|
|
invertcmp ^= 0x1;
|
|
jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]);
|
|
add_jump(compiler, compares > 0 ? list : backtracks, jump);
|
|
}
|
|
cc += 2;
|
|
}
|
|
|
|
cc = ccbegin;
|
|
}
|
|
|
|
if (status & XCLASS_HAS_BOOL)
|
|
{
|
|
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, bprops));
|
|
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BPROPS_MASK);
|
|
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
|
|
|
|
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
|
|
{
|
|
cc++;
|
|
if (*cc == PT_BOOL)
|
|
{
|
|
compares--;
|
|
invertcmp = (compares == 0 && list != backtracks);
|
|
if (cc[-1] == XCL_NOTPROP)
|
|
invertcmp ^= 0x1;
|
|
|
|
OP2U(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_boolprop_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)(1u << (cc[1] & 0x1f)));
|
|
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
|
|
}
|
|
cc += 2;
|
|
}
|
|
|
|
cc = ccbegin;
|
|
}
|
|
|
|
if (status & XCLASS_HAS_SCRIPT)
|
|
{
|
|
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
|
|
|
|
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
|
|
{
|
|
cc++;
|
|
|
|
switch (*cc)
|
|
{
|
|
case PT_SCX:
|
|
if (cc[-1] == XCL_NOTPROP)
|
|
break;
|
|
/* Fall through */
|
|
|
|
case PT_SC:
|
|
compares--;
|
|
invertcmp = (compares == 0 && list != backtracks);
|
|
if (cc[-1] == XCL_NOTPROP)
|
|
invertcmp ^= 0x1;
|
|
|
|
add_jump(compiler, compares > 0 ? list : backtracks, CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]));
|
|
}
|
|
cc += 2;
|
|
}
|
|
|
|
cc = ccbegin;
|
|
}
|
|
|
|
if (status & XCLASS_HAS_SCRIPT_EXTENSION)
|
|
{
|
|
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx_bidiclass));
|
|
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_SCRIPTX_MASK);
|
|
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
|
|
|
|
if (status & XCLASS_SCRIPT_EXTENSION_NOTPROP)
|
|
{
|
|
if (status & XCLASS_HAS_TYPE)
|
|
{
|
|
if ((status & (XCLASS_SAVE_CHAR | XCLASS_IS_ECLASS)) == XCLASS_SAVE_CHAR)
|
|
{
|
|
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL0, TMP2, 0);
|
|
status |= XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0;
|
|
}
|
|
else
|
|
{
|
|
OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP2, 0);
|
|
status |= XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR;
|
|
}
|
|
}
|
|
OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
|
|
}
|
|
|
|
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
|
|
{
|
|
cc++;
|
|
|
|
if (*cc == PT_SCX)
|
|
{
|
|
compares--;
|
|
invertcmp = (compares == 0 && list != backtracks);
|
|
|
|
jump = NULL;
|
|
if (cc[-1] == XCL_NOTPROP)
|
|
{
|
|
jump = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, (int)cc[1]);
|
|
if (invertcmp)
|
|
{
|
|
add_jump(compiler, backtracks, jump);
|
|
jump = NULL;
|
|
}
|
|
invertcmp ^= 0x1;
|
|
}
|
|
|
|
OP2U(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)(1u << (cc[1] & 0x1f)));
|
|
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
|
|
|
|
if (jump != NULL)
|
|
JUMPHERE(jump);
|
|
}
|
|
cc += 2;
|
|
}
|
|
|
|
if (status & XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0)
|
|
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOCAL0);
|
|
else if (status & XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR)
|
|
OP1(SLJIT_MOV, TMP2, 0, RETURN_ADDR, 0);
|
|
cc = ccbegin;
|
|
}
|
|
|
|
if (status & XCLASS_SAVE_CHAR)
|
|
OP1(SLJIT_MOV, TMP1, 0, (status & XCLASS_IS_ECLASS) ? ECLASS_CHAR_DATA : RETURN_ADDR, 0);
|
|
|
|
if (status & XCLASS_HAS_TYPE)
|
|
{
|
|
if (status & XCLASS_SAVE_CHAR)
|
|
typereg = RETURN_ADDR;
|
|
|
|
OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
|
|
OP2(SLJIT_SHL, typereg, 0, SLJIT_IMM, 1, TMP2, 0);
|
|
|
|
if (category_list > 0)
|
|
{
|
|
compares--;
|
|
invertcmp = (compares == 0 && list != backtracks);
|
|
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, category_list);
|
|
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
|
|
}
|
|
}
|
|
}
|
|
#endif /* SUPPORT_UNICODE */
|
|
|
|
/* Generating code. */
|
|
charoffset = 0;
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
|
|
{
|
|
compares--;
|
|
invertcmp = (compares == 0 && list != backtracks);
|
|
jump = NULL;
|
|
|
|
if (*cc == XCL_NOTPROP)
|
|
invertcmp ^= 0x1;
|
|
cc++;
|
|
switch(*cc)
|
|
{
|
|
case PT_LAMP:
|
|
case PT_GC:
|
|
case PT_PC:
|
|
case PT_SC:
|
|
case PT_SCX:
|
|
case PT_BOOL:
|
|
case PT_BIDICL:
|
|
case PT_WORD:
|
|
case PT_ALNUM:
|
|
compares++;
|
|
/* Already handled. */
|
|
break;
|
|
|
|
case PT_SPACE:
|
|
case PT_PXSPACE:
|
|
SET_CHAR_OFFSET(9);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xd - 0x9);
|
|
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
|
|
|
|
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x85 - 0x9);
|
|
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
|
|
|
|
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x180e - 0x9);
|
|
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
|
|
|
|
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Zl, ucp_Zs));
|
|
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_ZERO);
|
|
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
|
|
break;
|
|
|
|
case PT_UCNC:
|
|
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_DOLLAR_SIGN - charoffset));
|
|
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_COMMERCIAL_AT - charoffset));
|
|
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_GRAVE_ACCENT - charoffset));
|
|
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
|
|
|
|
SET_CHAR_OFFSET(0xa0);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(0xd7ff - charoffset));
|
|
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
|
|
SET_CHAR_OFFSET(0);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xe000 - 0);
|
|
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_GREATER_EQUAL);
|
|
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
|
|
break;
|
|
|
|
case PT_PXGRAPH:
|
|
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Cc, ucp_Cs) | UCPCAT_RANGE(ucp_Zl, ucp_Zs));
|
|
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
|
|
|
|
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(ucp_Cf));
|
|
jump = JUMP(SLJIT_ZERO);
|
|
|
|
c = charoffset;
|
|
/* In case of ucp_Cf, we overwrite the result. */
|
|
SET_CHAR_OFFSET(0x2066);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066);
|
|
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
|
|
|
|
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066);
|
|
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
|
|
|
|
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x180e - 0x2066);
|
|
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
|
|
|
|
/* Restore charoffset. */
|
|
SET_CHAR_OFFSET(c);
|
|
|
|
JUMPHERE(jump);
|
|
jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);
|
|
break;
|
|
|
|
case PT_PXPRINT:
|
|
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Cc, ucp_Cs) | UCPCAT2(ucp_Zl, ucp_Zp));
|
|
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
|
|
|
|
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(ucp_Cf));
|
|
jump = JUMP(SLJIT_ZERO);
|
|
|
|
c = charoffset;
|
|
/* In case of ucp_Cf, we overwrite the result. */
|
|
SET_CHAR_OFFSET(0x2066);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066);
|
|
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
|
|
|
|
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066);
|
|
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
|
|
|
|
/* Restore charoffset. */
|
|
SET_CHAR_OFFSET(c);
|
|
|
|
JUMPHERE(jump);
|
|
jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);
|
|
break;
|
|
|
|
case PT_PXPUNCT:
|
|
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Sc, ucp_So));
|
|
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
|
|
|
|
SET_CHAR_OFFSET(0);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x7f);
|
|
OP_FLAGS(SLJIT_AND, TMP2, 0, SLJIT_LESS_EQUAL);
|
|
|
|
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Pc, ucp_Ps));
|
|
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_ZERO);
|
|
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
|
|
break;
|
|
|
|
case PT_PXXDIGIT:
|
|
SET_CHAR_OFFSET(CHAR_A);
|
|
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, ~0x20);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP2, 0, SLJIT_IMM, CHAR_F - CHAR_A);
|
|
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
|
|
|
|
SET_CHAR_OFFSET(CHAR_0);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_9 - CHAR_0);
|
|
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
|
|
|
|
SET_CHAR_OFFSET(0xff10);
|
|
jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 0xff46 - 0xff10);
|
|
|
|
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff19 - 0xff10);
|
|
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
|
|
|
|
SET_CHAR_OFFSET(0xff21);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff26 - 0xff21);
|
|
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
|
|
|
|
SET_CHAR_OFFSET(0xff41);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff46 - 0xff41);
|
|
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
|
|
|
|
SET_CHAR_OFFSET(0xff10);
|
|
|
|
JUMPHERE(jump);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, 0);
|
|
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
|
|
break;
|
|
|
|
default:
|
|
SLJIT_UNREACHABLE();
|
|
break;
|
|
}
|
|
|
|
cc += 2;
|
|
|
|
if (jump != NULL)
|
|
add_jump(compiler, compares > 0 ? list : backtracks, jump);
|
|
}
|
|
|
|
if (compares == 0)
|
|
{
|
|
if (found != NULL)
|
|
set_jumps(found, LABEL());
|
|
|
|
if (status & XCLASS_IS_ECLASS)
|
|
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
|
|
return;
|
|
}
|
|
#endif /* SUPPORT_UNICODE */
|
|
|
|
SLJIT_ASSERT(compares == 1);
|
|
ranges.range_count = 0;
|
|
ranges.ranges = ranges.local_ranges;
|
|
ranges.stack = ranges.local_stack;
|
|
|
|
xclass_compute_ranges(common, cc, &ranges);
|
|
|
|
/* Memory error is set for the compiler. */
|
|
if (ranges.stack == NULL)
|
|
return;
|
|
|
|
#if (defined SLJIT_DEBUG && SLJIT_DEBUG) && \
|
|
defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
|
|
if (common->utf)
|
|
{
|
|
min = READ_CHAR_MAX;
|
|
max = 0;
|
|
xclass_update_min_max(common, cc, &min, &max);
|
|
SLJIT_ASSERT(ranges.ranges[0] == min && ranges.ranges[ranges.range_count - 1] == max);
|
|
}
|
|
#endif /* SLJIT_DEBUG && SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */
|
|
|
|
invertcmp = (list != backtracks);
|
|
|
|
if (ranges.range_count == 2)
|
|
{
|
|
range_start = ranges.ranges[0];
|
|
range_end = ranges.ranges[1];
|
|
|
|
if (range_start < range_end)
|
|
{
|
|
SET_CHAR_OFFSET(range_start);
|
|
jump = CMP(SLJIT_LESS_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));
|
|
}
|
|
else
|
|
jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));
|
|
|
|
add_jump(compiler, backtracks, jump);
|
|
|
|
SLJIT_ASSERT(ranges.stack == ranges.local_stack);
|
|
if (found != NULL)
|
|
set_jumps(found, LABEL());
|
|
|
|
if (status & XCLASS_IS_ECLASS)
|
|
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
|
|
return;
|
|
}
|
|
|
|
range_start = ranges.ranges[0];
|
|
SET_CHAR_OFFSET(range_start);
|
|
if (ranges.range_count >= 6)
|
|
{
|
|
/* Early fail. */
|
|
range_end = ranges.ranges[ranges.range_count - 1];
|
|
add_jump(compiler, (flags & XCL_NOT) == 0 ? backtracks : &found,
|
|
CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start)));
|
|
}
|
|
|
|
depth = 0;
|
|
first_item = 0;
|
|
last_item = ranges.range_count - 2;
|
|
has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV) != 0;
|
|
|
|
while (TRUE)
|
|
{
|
|
/* At least two items are present. */
|
|
SLJIT_ASSERT(first_item < last_item && charoffset == ranges.ranges[0]);
|
|
last_range_set = FALSE;
|
|
|
|
if (first_item + 6 <= last_item)
|
|
{
|
|
mid_item = ((first_item + last_item) >> 1) & ~(sljit_u32)1;
|
|
SLJIT_ASSERT(last_item >= mid_item + 4);
|
|
|
|
range_end = ranges.ranges[mid_item + 1];
|
|
if (first_item + 6 > mid_item && ranges.ranges[mid_item] == range_end)
|
|
{
|
|
OP2U(SLJIT_SUB | SLJIT_SET_GREATER | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - charoffset));
|
|
ranges.stack[depth].jump = JUMP(SLJIT_GREATER);
|
|
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
|
|
last_range_set = TRUE;
|
|
}
|
|
else
|
|
ranges.stack[depth].jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - charoffset));
|
|
|
|
ranges.stack[depth].first_item = (sljit_u32)(mid_item + 2);
|
|
ranges.stack[depth].last_item = (sljit_u32)last_item;
|
|
|
|
depth++;
|
|
SLJIT_ASSERT(ranges.stack == ranges.local_stack ?
|
|
depth <= XCLASS_LOCAL_RANGES_LOG2_SIZE : (ranges.stack + depth) <= (xclass_stack_item*)ranges.ranges);
|
|
|
|
last_item = mid_item;
|
|
if (!last_range_set)
|
|
continue;
|
|
|
|
last_item -= 2;
|
|
}
|
|
|
|
if (!last_range_set)
|
|
{
|
|
range_start = ranges.ranges[first_item];
|
|
range_end = ranges.ranges[first_item + 1];
|
|
|
|
if (range_start < range_end)
|
|
{
|
|
SET_CHAR_OFFSET(range_start);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));
|
|
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
|
|
}
|
|
else
|
|
{
|
|
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));
|
|
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
|
|
}
|
|
first_item += 2;
|
|
}
|
|
|
|
SLJIT_ASSERT(first_item <= last_item);
|
|
|
|
do
|
|
{
|
|
range_start = ranges.ranges[first_item];
|
|
range_end = ranges.ranges[first_item + 1];
|
|
|
|
if (range_start < range_end)
|
|
{
|
|
SET_CHAR_OFFSET(range_start);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));
|
|
|
|
if (has_cmov)
|
|
SELECT(SLJIT_LESS_EQUAL, TMP2, STR_END, 0, TMP2);
|
|
else
|
|
OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_LESS_EQUAL);
|
|
}
|
|
else
|
|
{
|
|
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));
|
|
|
|
if (has_cmov)
|
|
SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);
|
|
else
|
|
OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);
|
|
}
|
|
|
|
first_item += 2;
|
|
}
|
|
while (first_item <= last_item);
|
|
|
|
if (depth == 0) break;
|
|
|
|
add_jump(compiler, &check_result, JUMP(SLJIT_JUMP));
|
|
|
|
/* The charoffset resets after the end of a branch is reached. */
|
|
charoffset = ranges.ranges[0];
|
|
depth--;
|
|
first_item = ranges.stack[depth].first_item;
|
|
last_item = ranges.stack[depth].last_item;
|
|
JUMPHERE(ranges.stack[depth].jump);
|
|
}
|
|
|
|
if (check_result != NULL)
|
|
set_jumps(check_result, LABEL());
|
|
|
|
if (has_cmov)
|
|
jump = CMP(SLJIT_NOT_EQUAL ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);
|
|
else
|
|
{
|
|
sljit_set_current_flags(compiler, SLJIT_SET_Z);
|
|
jump = JUMP(SLJIT_NOT_EQUAL ^ invertcmp);
|
|
}
|
|
|
|
add_jump(compiler, backtracks, jump);
|
|
|
|
if (found != NULL)
|
|
set_jumps(found, LABEL());
|
|
|
|
if (status & XCLASS_IS_ECLASS)
|
|
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
|
|
|
|
if (ranges.stack != ranges.local_stack)
|
|
SLJIT_FREE(ranges.stack, compiler->allocator_data);
|
|
}
|
|
|
|
/* Generates the matching code for an extended class (eclass): a sequence of
ECL_XCLASS operands and ECL_AND / ECL_OR / ECL_XOR / ECL_NOT operators. The
sequence is evaluated with ECLASS_STACK_DATA used as a stack of one-bit
results, the most recent result being kept in bit 0.

Arguments:
  common      the JIT compiler state
  cc          start of the eclass data (GET(cc, 0) is read from it)
  backtracks  list receiving the jumps taken when the class does not match

Returns:      cc + GET(cc, 0) - 1, evaluated with the value cc has on entry
*/

static PCRE2_SPTR compile_eclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
{
DEFINE_COMPILER;
PCRE2_SPTR end = cc + GET(cc, 0) - 1;
PCRE2_SPTR begin;
jump_list *not_found;
jump_list *found = NULL;

cc += LINK_SIZE;

/* Should be optimized later. */
read_char(common, 0, READ_CHAR_MAX, backtracks, 0);

/* When ECL_MAP is set in the flag code unit, 32 bytes of bitmap data follow
it. The jumps the bitset check stores in found are resolved at the very end
of this function. */
if (((*cc++) & ECL_MAP) != 0)
  {
  xclass_check_bitset(common, (const sljit_u8 *)cc, &found, backtracks);
  cc += 32 / sizeof(PCRE2_UCHAR);
  }

begin = cc;

/* Save the current contents of the two work registers, clear the result
stack, and keep a copy of the character: TMP1 is reloaded from
ECLASS_CHAR_DATA before every operand except the first one. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL0, ECLASS_CHAR_DATA, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL1, ECLASS_STACK_DATA, 0);
OP1(SLJIT_MOV, ECLASS_STACK_DATA, 0, SLJIT_IMM, 0);
OP1(SLJIT_MOV, ECLASS_CHAR_DATA, 0, TMP1, 0);

/* All eclass must start with an xclass. */
SLJIT_ASSERT(*cc == ECL_XCLASS);

while (cc < end)
  {
  switch (*cc)
    {
    case ECL_AND:
    ++cc;
    /* TMP2 has every bit set except possibly bit 0 (the top result), so
    after the stack is shifted down only its new bit 0 can be cleared. */
    OP2(SLJIT_OR, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, ~(sljit_sw)1);
    OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    OP2(SLJIT_AND, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0);
    break;

    case ECL_OR:
    ++cc;
    /* TMP2 holds only the top result; it is merged into the next one. */
    OP2(SLJIT_AND, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0);
    break;

    case ECL_XOR:
    ++cc;
    OP2(SLJIT_AND, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    OP2(SLJIT_XOR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0);
    break;

    case ECL_NOT:
    ++cc;
    /* Invert the top result in place. */
    OP2(SLJIT_XOR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    break;

    default:
    SLJIT_ASSERT(*cc == ECL_XCLASS);
    if (cc != begin)
      {
      /* Not the first operand: reload the character and make room for a
      new result in bit 0. */
      OP1(SLJIT_MOV, TMP1, 0, ECLASS_CHAR_DATA, 0);
      OP2(SLJIT_SHL, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
      }

    /* A failing operand must not leave this function, so its backtrack
    jumps are collected locally and bound right after the operand code. */
    not_found = NULL;
    compile_xclass_matchingpath(common, cc + 1 + LINK_SIZE, &not_found, XCLASS_IS_ECLASS);
    set_jumps(not_found, LABEL());

    cc += GET(cc, 1);
    break;
    }
  }

/* Test the final result first, then restore the two work registers; the
conditional jump emitted after the restores uses the result of that test. */
OP2U(SLJIT_SUB | SLJIT_SET_Z, ECLASS_STACK_DATA, 0, SLJIT_IMM, 0);
OP1(SLJIT_MOV, ECLASS_CHAR_DATA, 0, SLJIT_MEM1(SLJIT_SP), LOCAL0);
OP1(SLJIT_MOV, ECLASS_STACK_DATA, 0, SLJIT_MEM1(SLJIT_SP), LOCAL1);
add_jump(compiler, backtracks, JUMP(SLJIT_EQUAL));
set_jumps(found, LABEL());
return end;
}
|
|
|
|
/* Generic character matching code. */
|
|
|
|
#undef SET_CHAR_OFFSET
|
|
#undef READ_FROM_CHAR_LIST
|
|
#undef XCLASS_LOCAL_RANGES_SIZE
|
|
#undef XCLASS_LOCAL_RANGES_LOG2_SIZE
|
|
|
|
#endif /* SUPPORT_WIDE_CHARS */
|
|
|
|
static PCRE2_SPTR byte_sequence_compare(compiler_common *common, BOOL caseless, PCRE2_SPTR cc,
|
|
compare_context *context, jump_list **backtracks)
|
|
{
|
|
DEFINE_COMPILER;
|
|
unsigned int othercasebit = 0;
|
|
PCRE2_SPTR othercasechar = NULL;
|
|
#ifdef SUPPORT_UNICODE
|
|
int utflength;
|
|
#endif
|
|
|
|
if (caseless && char_has_othercase(common, cc))
|
|
{
|
|
othercasebit = char_get_othercase_bit(common, cc);
|
|
SLJIT_ASSERT(othercasebit);
|
|
/* Extracting bit difference info. */
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
othercasechar = cc + (othercasebit >> 8);
|
|
othercasebit &= 0xff;
|
|
#elif PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
|
|
/* Note that this code only handles characters in the BMP. If there
|
|
ever are characters outside the BMP whose othercase differs in only one
|
|
bit from itself (there currently are none), this code will need to be
|
|
revised for PCRE2_CODE_UNIT_WIDTH == 32. */
|
|
othercasechar = cc + (othercasebit >> 9);
|
|
if ((othercasebit & 0x100) != 0)
|
|
othercasebit = (othercasebit & 0xff) << 8;
|
|
else
|
|
othercasebit &= 0xff;
|
|
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
|
|
}
|
|
|
|
if (context->sourcereg == -1)
|
|
{
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
|
|
if (context->length >= 4)
|
|
OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
|
|
else if (context->length >= 2)
|
|
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
|
|
else
|
|
#endif
|
|
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
|
|
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
|
#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
|
|
if (context->length >= 4)
|
|
OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
|
|
else
|
|
#endif
|
|
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
|
|
#elif PCRE2_CODE_UNIT_WIDTH == 32
|
|
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
|
|
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
|
|
context->sourcereg = TMP2;
|
|
}
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
utflength = 1;
|
|
if (common->utf && HAS_EXTRALEN(*cc))
|
|
utflength += GET_EXTRALEN(*cc);
|
|
|
|
do
|
|
{
|
|
#endif
|
|
|
|
context->length -= IN_UCHARS(1);
|
|
#if (defined SLJIT_UNALIGNED && SLJIT_UNALIGNED) && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
|
|
|
|
/* Unaligned read is supported. */
|
|
if (othercasebit != 0 && othercasechar == cc)
|
|
{
|
|
context->c.asuchars[context->ucharptr] = *cc | othercasebit;
|
|
context->oc.asuchars[context->ucharptr] = othercasebit;
|
|
}
|
|
else
|
|
{
|
|
context->c.asuchars[context->ucharptr] = *cc;
|
|
context->oc.asuchars[context->ucharptr] = 0;
|
|
}
|
|
context->ucharptr++;
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
if (context->ucharptr >= 4 || context->length == 0 || (context->ucharptr == 2 && context->length == 1))
|
|
#else
|
|
if (context->ucharptr >= 2 || context->length == 0)
|
|
#endif
|
|
{
|
|
if (context->length >= 4)
|
|
OP1(SLJIT_MOV_S32, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);
|
|
else if (context->length >= 2)
|
|
OP1(SLJIT_MOV_U16, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
else if (context->length >= 1)
|
|
OP1(SLJIT_MOV_U8, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);
|
|
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
|
|
context->sourcereg = context->sourcereg == TMP1 ? TMP2 : TMP1;
|
|
|
|
switch(context->ucharptr)
|
|
{
|
|
case 4 / sizeof(PCRE2_UCHAR):
|
|
if (context->oc.asint != 0)
|
|
OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asint);
|
|
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asint | context->oc.asint));
|
|
break;
|
|
|
|
case 2 / sizeof(PCRE2_UCHAR):
|
|
if (context->oc.asushort != 0)
|
|
OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asushort);
|
|
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asushort | context->oc.asushort));
|
|
break;
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
case 1:
|
|
if (context->oc.asbyte != 0)
|
|
OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asbyte);
|
|
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asbyte | context->oc.asbyte));
|
|
break;
|
|
#endif
|
|
|
|
default:
|
|
SLJIT_UNREACHABLE();
|
|
break;
|
|
}
|
|
context->ucharptr = 0;
|
|
}
|
|
|
|
#else
|
|
|
|
/* Unaligned read is unsupported or in 32 bit mode. */
|
|
if (context->length >= 1)
|
|
OP1(MOV_UCHAR, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);
|
|
|
|
context->sourcereg = context->sourcereg == TMP1 ? TMP2 : TMP1;
|
|
|
|
if (othercasebit != 0 && othercasechar == cc)
|
|
{
|
|
OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, othercasebit);
|
|
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, *cc | othercasebit));
|
|
}
|
|
else
|
|
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, *cc));
|
|
|
|
#endif
|
|
|
|
cc++;
|
|
#ifdef SUPPORT_UNICODE
|
|
utflength--;
|
|
}
|
|
while (utflength > 0);
|
|
#endif
|
|
|
|
return cc;
|
|
}
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH != 32
|
|
|
|
/* The code in this function copies the logic of the interpreter function that
|
|
is defined in the pcre2_extuni.c source. If that code is updated, this
|
|
function, and those below it, must be kept in step (note by PH, June 2024). */
|
|
|
|
static PCRE2_SPTR SLJIT_FUNC do_extuni_utf(jit_arguments *args, PCRE2_SPTR cc)
|
|
{
|
|
PCRE2_SPTR start_subject = args->begin;
|
|
PCRE2_SPTR end_subject = args->end;
|
|
int lgb, rgb, ricount;
|
|
PCRE2_SPTR prevcc, endcc, bptr;
|
|
BOOL first = TRUE;
|
|
BOOL was_ep_ZWJ = FALSE;
|
|
uint32_t c;
|
|
|
|
prevcc = cc;
|
|
endcc = NULL;
|
|
do
|
|
{
|
|
GETCHARINC(c, cc);
|
|
rgb = UCD_GRAPHBREAK(c);
|
|
|
|
if (first)
|
|
{
|
|
lgb = rgb;
|
|
endcc = cc;
|
|
first = FALSE;
|
|
continue;
|
|
}
|
|
|
|
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0)
|
|
break;
|
|
|
|
/* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was
|
|
preceded by Extended Pictographic. */
|
|
|
|
if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ)
|
|
break;
|
|
|
|
/* Not breaking between Regional Indicators is allowed only if there
|
|
are an even number of preceding RIs. */
|
|
|
|
if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)
|
|
{
|
|
ricount = 0;
|
|
bptr = prevcc;
|
|
|
|
/* bptr is pointing to the left-hand character */
|
|
while (bptr > start_subject)
|
|
{
|
|
bptr--;
|
|
BACKCHAR(bptr);
|
|
GETCHAR(c, bptr);
|
|
|
|
if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator)
|
|
break;
|
|
|
|
ricount++;
|
|
}
|
|
|
|
if ((ricount & 1) != 0) break; /* Grapheme break required */
|
|
}
|
|
|
|
/* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in
|
|
between; see next statement). */
|
|
|
|
was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ);
|
|
|
|
/* If Extend follows Extended_Pictographic, do not update lgb; this allows
|
|
any number of them before a following ZWJ. */
|
|
|
|
if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic)
|
|
lgb = rgb;
|
|
|
|
prevcc = endcc;
|
|
endcc = cc;
|
|
}
|
|
while (cc < end_subject);
|
|
|
|
return endcc;
|
|
}
|
|
|
|
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
|
|
|
|
/* The code in this function copies the logic of the interpreter function that
|
|
is defined in the pcre2_extuni.c source. If that code is updated, this
|
|
function, and the one below it, must be kept in step (note by PH, June 2024). */
|
|
|
|
static PCRE2_SPTR SLJIT_FUNC do_extuni_utf_invalid(jit_arguments *args, PCRE2_SPTR cc)
|
|
{
|
|
PCRE2_SPTR start_subject = args->begin;
|
|
PCRE2_SPTR end_subject = args->end;
|
|
int lgb, rgb, ricount;
|
|
PCRE2_SPTR prevcc, endcc, bptr;
|
|
BOOL first = TRUE;
|
|
BOOL was_ep_ZWJ = FALSE;
|
|
uint32_t c;
|
|
|
|
prevcc = cc;
|
|
endcc = NULL;
|
|
do
|
|
{
|
|
GETCHARINC_INVALID(c, cc, end_subject, break);
|
|
rgb = UCD_GRAPHBREAK(c);
|
|
|
|
if (first)
|
|
{
|
|
lgb = rgb;
|
|
endcc = cc;
|
|
first = FALSE;
|
|
continue;
|
|
}
|
|
|
|
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0)
|
|
break;
|
|
|
|
/* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was
|
|
preceded by Extended Pictographic. */
|
|
|
|
if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ)
|
|
break;
|
|
|
|
/* Not breaking between Regional Indicators is allowed only if there
|
|
are an even number of preceding RIs. */
|
|
|
|
if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)
|
|
{
|
|
ricount = 0;
|
|
bptr = prevcc;
|
|
|
|
/* bptr is pointing to the left-hand character */
|
|
while (bptr > start_subject)
|
|
{
|
|
GETCHARBACK_INVALID(c, bptr, start_subject, break);
|
|
|
|
if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator)
|
|
break;
|
|
|
|
ricount++;
|
|
}
|
|
|
|
if ((ricount & 1) != 0)
|
|
break; /* Grapheme break required */
|
|
}
|
|
|
|
/* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in
|
|
between; see next statement). */
|
|
|
|
was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ);
|
|
|
|
/* If Extend follows Extended_Pictographic, do not update lgb; this allows
|
|
any number of them before a following ZWJ. */
|
|
|
|
if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic)
|
|
lgb = rgb;
|
|
|
|
prevcc = endcc;
|
|
endcc = cc;
|
|
}
|
|
while (cc < end_subject);
|
|
|
|
return endcc;
|
|
}
|
|
|
|
/* The code in this function copies the logic of the interpreter function that
|
|
is defined in the pcre2_extuni.c source. If that code is updated, this
|
|
function must be kept in step (note by PH, June 2024). */
|
|
|
|
static PCRE2_SPTR SLJIT_FUNC do_extuni_no_utf(jit_arguments *args, PCRE2_SPTR cc)
|
|
{
|
|
PCRE2_SPTR start_subject = args->begin;
|
|
PCRE2_SPTR end_subject = args->end;
|
|
int lgb, rgb, ricount;
|
|
PCRE2_SPTR bptr;
|
|
uint32_t c;
|
|
BOOL was_ep_ZWJ = FALSE;
|
|
|
|
/* Patch by PH */
|
|
/* GETCHARINC(c, cc); */
|
|
c = *cc++;
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH == 32
|
|
if (c >= 0x110000)
|
|
return cc;
|
|
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
|
|
lgb = UCD_GRAPHBREAK(c);
|
|
|
|
while (cc < end_subject)
|
|
{
|
|
c = *cc;
|
|
#if PCRE2_CODE_UNIT_WIDTH == 32
|
|
if (c >= 0x110000)
|
|
break;
|
|
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
|
|
rgb = UCD_GRAPHBREAK(c);
|
|
|
|
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0)
|
|
break;
|
|
|
|
/* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was
|
|
preceded by Extended Pictographic. */
|
|
|
|
if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ)
|
|
break;
|
|
|
|
/* Not breaking between Regional Indicators is allowed only if there
|
|
are an even number of preceding RIs. */
|
|
|
|
if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator)
|
|
{
|
|
ricount = 0;
|
|
bptr = cc - 1;
|
|
|
|
/* bptr is pointing to the left-hand character */
|
|
while (bptr > start_subject)
|
|
{
|
|
bptr--;
|
|
c = *bptr;
|
|
#if PCRE2_CODE_UNIT_WIDTH == 32
|
|
if (c >= 0x110000)
|
|
break;
|
|
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
|
|
|
|
if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break;
|
|
|
|
ricount++;
|
|
}
|
|
|
|
if ((ricount & 1) != 0)
|
|
break; /* Grapheme break required */
|
|
}
|
|
|
|
/* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in
|
|
between; see next statement). */
|
|
|
|
was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ);
|
|
|
|
/* If Extend follows Extended_Pictographic, do not update lgb; this allows
|
|
any number of them before a following ZWJ. */
|
|
|
|
if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic)
|
|
lgb = rgb;
|
|
|
|
cc++;
|
|
}
|
|
|
|
return cc;
|
|
}
|
|
|
|
/* Emits the code that checks the next subject character against a caseless
set, i.e. a NOTACHAR terminated list taken from ucd_caseless_sets.

Arguments:
  common      the JIT compiler state
  cc          points to OP_PROP or another opcode (treated as the negated
                form), followed by PT_CLIST and the offset of the set
  backtracks  list receiving the failure jump
*/

static void compile_clist(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
{
DEFINE_COMPILER;
const sljit_u32 *set;
const sljit_u32 *item;
struct sljit_jump *jump;
sljit_u32 min = 0, max = READ_CHAR_MAX;
BOOL has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV) != 0;

SLJIT_ASSERT(cc[1] == PT_CLIST);

set = PRIV(ucd_caseless_sets) + cc[2];

/* Only the positive form narrows the range given to read_char; otherwise
the full 0..READ_CHAR_MAX range is used. */
if (cc[0] == OP_PROP)
  {
  min = set[0];
  max = min;

  for (item = set + 1; *item != NOTACHAR; item++)
    {
    if (*item > max) max = *item;
    if (*item < min) min = *item;
    }
  }

SLJIT_ASSERT(set[0] != NOTACHAR && set[1] != NOTACHAR);
/* The NOTACHAR is higher than any character. */
SLJIT_ASSERT(set[0] < set[1] && set[1] < set[2]);

read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR);

/* At least two characters are required.
   Otherwise this case would be handled by the normal code path. */
/* NOTACHAR is the unsigned maximum. */

item = set;

/* Optimizing character pairs, if their difference is power of 2. The first
test below always writes TMP2 with a plain move; every later test merges its
result into TMP2. */
if (is_powerof2(item[1] ^ item[0]))
  {
  /* One comparison covers the first two members. */
  OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(item[1] ^ item[0]));
  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, item[1]);
  OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
  item += 2;
  }
else if (is_powerof2(item[2] ^ item[1]))
  {
  SLJIT_ASSERT(item[2] != NOTACHAR);

  /* One comparison covers the second and third members, followed by a
  separate test for the first member. */
  OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(item[2] ^ item[1]));
  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, item[2]);
  OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);

  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)item[0]);

  if (has_cmov)
    SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);
  else
    OP_FLAGS(SLJIT_OR | ((item[3] == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);

  item += 3;
  }
else
  {
  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)item[0]);
  OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
  item++;
  }

/* The remaining members are tested one by one. Without conditional moves
the zero flag is requested only on the last test, because the final jump
below reads that flag directly. */
while (*item != NOTACHAR)
  {
  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)item[0]);
  item++;

  if (has_cmov)
    SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);
  else
    OP_FLAGS(SLJIT_OR | ((*item == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);
  }

/* OP_PROP fails when nothing matched (TMP2 is zero); any other opcode fails
when something matched. */
if (has_cmov)
  jump = CMP(cc[0] == OP_PROP ? SLJIT_EQUAL : SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0);
else
  jump = JUMP(cc[0] == OP_PROP ? SLJIT_ZERO : SLJIT_NOT_ZERO);

add_jump(compiler, backtracks, jump);
}
|
|
|
|
#endif /* SUPPORT_UNICODE */
|
|
|
|
static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr)
|
|
{
|
|
DEFINE_COMPILER;
|
|
int length;
|
|
unsigned int c, oc, bit;
|
|
compare_context context;
|
|
struct sljit_jump *jump[3];
|
|
jump_list *end_list;
|
|
#ifdef SUPPORT_UNICODE
|
|
PCRE2_UCHAR propdata[5];
|
|
#endif /* SUPPORT_UNICODE */
|
|
|
|
switch(type)
|
|
{
|
|
case OP_NOT_DIGIT:
|
|
case OP_DIGIT:
|
|
/* Digits are usually 0-9, so it is worth to optimize them. */
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
|
if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_digit, FALSE))
|
|
read_char7_type(common, backtracks, type == OP_NOT_DIGIT);
|
|
else
|
|
#endif
|
|
read_char8_type(common, backtracks, type == OP_NOT_DIGIT);
|
|
/* Flip the starting bit in the negative case. */
|
|
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_digit);
|
|
add_jump(compiler, backtracks, JUMP(type == OP_DIGIT ? SLJIT_ZERO : SLJIT_NOT_ZERO));
|
|
return cc;
|
|
|
|
case OP_NOT_WHITESPACE:
|
|
case OP_WHITESPACE:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
|
if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_space, FALSE))
|
|
read_char7_type(common, backtracks, type == OP_NOT_WHITESPACE);
|
|
else
|
|
#endif
|
|
read_char8_type(common, backtracks, type == OP_NOT_WHITESPACE);
|
|
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_space);
|
|
add_jump(compiler, backtracks, JUMP(type == OP_WHITESPACE ? SLJIT_ZERO : SLJIT_NOT_ZERO));
|
|
return cc;
|
|
|
|
case OP_NOT_WORDCHAR:
|
|
case OP_WORDCHAR:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
|
if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_word, FALSE))
|
|
read_char7_type(common, backtracks, type == OP_NOT_WORDCHAR);
|
|
else
|
|
#endif
|
|
read_char8_type(common, backtracks, type == OP_NOT_WORDCHAR);
|
|
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_word);
|
|
add_jump(compiler, backtracks, JUMP(type == OP_WORDCHAR ? SLJIT_ZERO : SLJIT_NOT_ZERO));
|
|
return cc;
|
|
|
|
case OP_ANY:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR);
|
|
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
|
|
{
|
|
jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
|
|
end_list = NULL;
|
|
if (common->mode != PCRE2_JIT_PARTIAL_HARD)
|
|
add_jump(compiler, &end_list, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
|
|
else
|
|
check_str_end(common, &end_list);
|
|
|
|
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
|
|
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline & 0xff));
|
|
set_jumps(end_list, LABEL());
|
|
JUMPHERE(jump[0]);
|
|
}
|
|
else
|
|
check_newlinechar(common, common->nltype, backtracks, TRUE);
|
|
return cc;
|
|
|
|
case OP_ALLANY:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
#ifdef SUPPORT_UNICODE
|
|
if (common->utf && common->invalid_utf)
|
|
{
|
|
read_char(common, 0, READ_CHAR_MAX, backtracks, READ_CHAR_UPDATE_STR_PTR);
|
|
return cc;
|
|
}
|
|
#endif /* SUPPORT_UNICODE */
|
|
|
|
skip_valid_char(common);
|
|
return cc;
|
|
|
|
case OP_ANYBYTE:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
|
|
return cc;
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
case OP_NOTPROP:
|
|
case OP_PROP:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
if (cc[0] == PT_CLIST)
|
|
{
|
|
compile_clist(common, cc - 1, backtracks);
|
|
return cc + 2;
|
|
}
|
|
|
|
propdata[0] = 0;
|
|
propdata[1] = type == OP_NOTPROP ? XCL_NOTPROP : XCL_PROP;
|
|
propdata[2] = cc[0];
|
|
propdata[3] = cc[1];
|
|
propdata[4] = XCL_END;
|
|
compile_xclass_matchingpath(common, propdata, backtracks, 0);
|
|
return cc + 2;
|
|
#endif
|
|
|
|
case OP_ANYNL:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
read_char(common, common->bsr_nlmin, common->bsr_nlmax, NULL, 0);
|
|
jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
|
|
/* We don't need to handle soft partial matching case. */
|
|
end_list = NULL;
|
|
if (common->mode != PCRE2_JIT_PARTIAL_HARD)
|
|
add_jump(compiler, &end_list, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
|
|
else
|
|
check_str_end(common, &end_list);
|
|
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
|
|
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, CHAR_NL);
|
|
OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
|
|
#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
|
|
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT);
|
|
#endif
|
|
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
|
|
jump[1] = JUMP(SLJIT_JUMP);
|
|
JUMPHERE(jump[0]);
|
|
check_newlinechar(common, common->bsr_nltype, backtracks, FALSE);
|
|
set_jumps(end_list, LABEL());
|
|
JUMPHERE(jump[1]);
|
|
return cc;
|
|
|
|
case OP_NOT_HSPACE:
|
|
case OP_HSPACE:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
|
|
if (type == OP_NOT_HSPACE)
|
|
read_char(common, 0x9, 0x3000, backtracks, READ_CHAR_UPDATE_STR_PTR);
|
|
else
|
|
read_char(common, 0x9, 0x3000, NULL, 0);
|
|
|
|
add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL));
|
|
sljit_set_current_flags(compiler, SLJIT_SET_Z);
|
|
add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
|
|
return cc;
|
|
|
|
case OP_NOT_VSPACE:
|
|
case OP_VSPACE:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
|
|
if (type == OP_NOT_VSPACE)
|
|
read_char(common, 0xa, 0x2029, backtracks, READ_CHAR_UPDATE_STR_PTR);
|
|
else
|
|
read_char(common, 0xa, 0x2029, NULL, 0);
|
|
|
|
add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL));
|
|
sljit_set_current_flags(compiler, SLJIT_SET_Z);
|
|
add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
|
|
return cc;
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
case OP_EXTUNI:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
|
|
SLJIT_ASSERT(TMP1 == SLJIT_R0 && STR_PTR == SLJIT_R1);
|
|
OP1(SLJIT_MOV, SLJIT_R0, 0, ARGUMENTS, 0);
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH != 32
|
|
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2(W, W, W), SLJIT_IMM,
|
|
common->utf ? (common->invalid_utf ? SLJIT_FUNC_ADDR(do_extuni_utf_invalid) : SLJIT_FUNC_ADDR(do_extuni_utf)) : SLJIT_FUNC_ADDR(do_extuni_no_utf));
|
|
if (common->invalid_utf)
|
|
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
|
|
#else
|
|
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2(W, W, W), SLJIT_IMM,
|
|
common->invalid_utf ? SLJIT_FUNC_ADDR(do_extuni_utf_invalid) : SLJIT_FUNC_ADDR(do_extuni_no_utf));
|
|
if (common->invalid_utf)
|
|
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
|
|
#endif
|
|
|
|
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
|
|
|
|
if (common->mode == PCRE2_JIT_PARTIAL_HARD)
|
|
{
|
|
jump[0] = CMP(SLJIT_LESS, SLJIT_RETURN_REG, 0, STR_END, 0);
|
|
/* Since we successfully read a char above, partial matching must occur. */
|
|
check_partial(common, TRUE);
|
|
JUMPHERE(jump[0]);
|
|
}
|
|
return cc;
|
|
#endif
|
|
|
|
case OP_CHAR:
|
|
case OP_CHARI:
|
|
length = 1;
|
|
#ifdef SUPPORT_UNICODE
|
|
if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);
|
|
#endif
|
|
|
|
if (check_str_ptr && common->mode != PCRE2_JIT_COMPLETE)
|
|
detect_partial_match(common, backtracks);
|
|
|
|
if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0)
|
|
{
|
|
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(length));
|
|
if (length > 1 || (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE))
|
|
add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0));
|
|
|
|
context.length = IN_UCHARS(length);
|
|
context.sourcereg = -1;
|
|
#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
|
|
context.ucharptr = 0;
|
|
#endif
|
|
return byte_sequence_compare(common, type == OP_CHARI, cc, &context, backtracks);
|
|
}
|
|
|
|
#ifdef SUPPORT_UNICODE
|
|
if (common->utf)
|
|
{
|
|
GETCHAR(c, cc);
|
|
}
|
|
else
|
|
#endif
|
|
c = *cc;
|
|
|
|
SLJIT_ASSERT(type == OP_CHARI && char_has_othercase(common, cc));
|
|
|
|
if (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE)
|
|
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
|
|
|
|
oc = char_othercase(common, c);
|
|
read_char(common, c < oc ? c : oc, c > oc ? c : oc, NULL, 0);
|
|
|
|
SLJIT_ASSERT(!is_powerof2(c ^ oc));
|
|
|
|
if (sljit_has_cpu_feature(SLJIT_HAS_CMOV))
|
|
{
|
|
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, oc);
|
|
SELECT(SLJIT_EQUAL, TMP1, SLJIT_IMM, c, TMP1);
|
|
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c));
|
|
}
|
|
else
|
|
{
|
|
jump[0] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c);
|
|
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc));
|
|
JUMPHERE(jump[0]);
|
|
}
|
|
return cc + length;
|
|
|
|
case OP_NOT:
|
|
case OP_NOTI:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
|
|
length = 1;
|
|
#ifdef SUPPORT_UNICODE
|
|
if (common->utf)
|
|
{
|
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
|
c = *cc;
|
|
if (c < 128 && !common->invalid_utf)
|
|
{
|
|
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
|
|
if (type == OP_NOT || !char_has_othercase(common, cc))
|
|
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));
|
|
else
|
|
{
|
|
/* Since UTF8 code page is fixed, we know that c is in [a-z] or [A-Z] range. */
|
|
OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x20);
|
|
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, c | 0x20));
|
|
}
|
|
/* Skip the variable-length character. */
|
|
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
|
|
jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
|
|
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
|
|
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
|
|
JUMPHERE(jump[0]);
|
|
return cc + 1;
|
|
}
|
|
else
|
|
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
|
|
{
|
|
GETCHARLEN(c, cc, length);
|
|
}
|
|
}
|
|
else
|
|
#endif /* SUPPORT_UNICODE */
|
|
c = *cc;
|
|
|
|
if (type == OP_NOT || !char_has_othercase(common, cc))
|
|
{
|
|
read_char(common, c, c, backtracks, READ_CHAR_UPDATE_STR_PTR);
|
|
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));
|
|
}
|
|
else
|
|
{
|
|
oc = char_othercase(common, c);
|
|
read_char(common, c < oc ? c : oc, c > oc ? c : oc, backtracks, READ_CHAR_UPDATE_STR_PTR);
|
|
bit = c ^ oc;
|
|
if (is_powerof2(bit))
|
|
{
|
|
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, bit);
|
|
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit));
|
|
}
|
|
else
|
|
{
|
|
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));
|
|
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, oc));
|
|
}
|
|
}
|
|
return cc + length;
|
|
|
|
case OP_CLASS:
|
|
case OP_NCLASS:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
|
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
|
bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 127 : 255;
|
|
if (type == OP_NCLASS)
|
|
read_char(common, 0, bit, backtracks, READ_CHAR_UPDATE_STR_PTR);
|
|
else
|
|
read_char(common, 0, bit, NULL, 0);
|
|
#else
|
|
if (type == OP_NCLASS)
|
|
read_char(common, 0, 255, backtracks, READ_CHAR_UPDATE_STR_PTR);
|
|
else
|
|
read_char(common, 0, 255, NULL, 0);
|
|
#endif
|
|
|
|
if (optimize_class(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks))
|
|
return cc + 32 / sizeof(PCRE2_UCHAR);
|
|
|
|
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
|
|
jump[0] = NULL;
|
|
if (common->utf)
|
|
{
|
|
jump[0] = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, bit);
|
|
if (type == OP_CLASS)
|
|
{
|
|
add_jump(compiler, backtracks, jump[0]);
|
|
jump[0] = NULL;
|
|
}
|
|
}
|
|
#elif PCRE2_CODE_UNIT_WIDTH != 8
|
|
jump[0] = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);
|
|
if (type == OP_CLASS)
|
|
{
|
|
add_jump(compiler, backtracks, jump[0]);
|
|
jump[0] = NULL;
|
|
}
|
|
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
|
|
|
|
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
|
|
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
|
|
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc);
|
|
OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0);
|
|
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, TMP2, 0);
|
|
add_jump(compiler, backtracks, JUMP(SLJIT_ZERO));
|
|
|
|
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
|
|
if (jump[0] != NULL)
|
|
JUMPHERE(jump[0]);
|
|
#endif
|
|
return cc + 32 / sizeof(PCRE2_UCHAR);
|
|
|
|
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
|
|
case OP_XCLASS:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
compile_xclass_matchingpath(common, cc + LINK_SIZE, backtracks, 0);
|
|
return cc + GET(cc, 0) - 1;
|
|
|
|
case OP_ECLASS:
|
|
if (check_str_ptr)
|
|
detect_partial_match(common, backtracks);
|
|
return compile_eclass_matchingpath(common, cc, backtracks);
|
|
#endif
|
|
}
|
|
SLJIT_UNREACHABLE();
|
|
return cc;
|
|
}
|
|
|
|
static SLJIT_INLINE PCRE2_SPTR compile_charn_matchingpath(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, jump_list **backtracks)
{
/* Emits the matching path for the literal character(s) starting at cc.
This function consumes at least one input character.

To decrease the number of length checks, consecutive OP_CHAR / OP_CHARI items
that have a fixed length are concatenated, so one STR_PTR advance and one
STR_END comparison cover the whole run.

Arguments:
  common      compiler state
  cc          pattern pointer, positioned on the first opcode
  ccend       opcodes at or beyond this pointer are never inspected
  backtracks  list that receives the jumps taken when matching fails

Returns the pattern pointer handed back by byte_sequence_compare() or by
compile_char1_matchingpath(). */
DEFINE_COMPILER;
PCRE2_SPTR first_op = cc;
compare_context context;
int units;

context.length = 0;

/* First pass: measure how much of the pattern can be compared as one
fixed-length block. Nothing is emitted here. */
for (;;)
  {
  if (cc >= ccend)
    break;

  units = 0;
  if (*cc == OP_CHAR)
    units = 1;
  else if (*cc == OP_CHARI)
    {
    /* A caseless character stays in the run unless it has an other case
    for which char_get_othercase_bit() reports 0. */
    if (!char_has_othercase(common, cc + 1) || char_get_othercase_bit(common, cc + 1) != 0)
      units = 1;
    }

#ifdef SUPPORT_UNICODE
  /* In UTF mode an accepted character may span extra code units. */
  if (units != 0 && common->utf && HAS_EXTRALEN(cc[1]))
    units += GET_EXTRALEN(cc[1]);
#endif

  cc += 1 + units;
  context.length += IN_UCHARS(units);

  /* Stop at the first item that cannot join the run, or once the
  accumulated length has gone past 128. */
  if (units <= 0 || context.length > 128)
    break;
  }

/* Second pass starts again from the first opcode. */
cc = first_op;

if (context.length <= 0)
  {
  /* A non-fixed length character will be checked if length == 0. */
  return compile_char1_matchingpath(common, *cc, cc + 1, backtracks, TRUE);
  }

/* We have a fixed-length byte sequence: advance STR_PTR over all of it and
check once that it does not run past STR_END. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, context.length);
add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0));

context.sourcereg = -1;
#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
context.ucharptr = 0;
#endif

/* Emit the comparisons one pattern character at a time; the loop ends when
context.length is no longer positive (presumably byte_sequence_compare()
decrements it as it goes - confirm against its definition). */
do
  {
  cc = byte_sequence_compare(common, *cc == OP_CHARI, cc + 1, &context, backtracks);
  }
while (context.length > 0);

return cc;
}
|
|
|
|
|