php-src/ext/pcre/pcre2lib/pcre2_jit_char_inc.h
2025-02-16 12:16:46 +01:00

2280 lines
65 KiB
C

/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
This module by Zoltan Herczeg
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2024 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* XClass matching code. */
#ifdef SUPPORT_WIDE_CHARS
/* Operand aliases used while an extended class (eclass) is compiled.
ECLASS_CHAR_DATA receives a copy of the character under test (it is loaded
from TMP1 and copied back to TMP1 before each nested xclass), and
ECLASS_STACK_DATA holds the result bits that the ECL_* operators combine.
Both reuse STACK_TOP / STACK_LIMIT, whose previous values are saved to
LOCAL0 / LOCAL1 by compile_eclass_matchingpath. */
#define ECLASS_CHAR_DATA STACK_TOP
#define ECLASS_STACK_DATA STACK_LIMIT
/* Rebase the character held in TMP1 so that TMP1 == (character - value).

The caller's local variable "charoffset" records how much the generated code
has already subtracted from TMP1. When the requested offset differs, exactly
one ADD or SUB instruction is emitted to adjust TMP1, and charoffset is then
set to the new offset.

The expansion is wrapped in do { } while (0) so that it forms a single
statement: it can be used as the body of an unbraced if / else, and the
invocation must be terminated by a semicolon. Note that "value" is evaluated
more than once, so it must not have side effects. */
#define SET_CHAR_OFFSET(value) \
  do \
    { \
    if ((value) != charoffset) \
      { \
      if ((value) < charoffset) \
        OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(charoffset - (value))); \
      else \
        OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)((value) - charoffset)); \
      } \
    charoffset = (value); \
    } \
  while (0)
/* Fetch the next item of a character list into "destination" and advance the
caller's "next_char" byte pointer past it.

Items of the first two lists (list_ind <= 1) are stored as 16 bit values, the
items of the later lists as 32 bit values. Both "list_ind" and "next_char" are
locals of the calling function. The data is accessed through a cast, so
next_char must be suitably aligned for the item size.

The expansion is wrapped in do { } while (0) so that it forms a single
statement (no dangling else); the invocation must end with a semicolon. */
#define READ_FROM_CHAR_LIST(destination) \
  do \
    { \
    if (list_ind <= 1) \
      { \
      destination = *(const uint16_t*)next_char; \
      next_char += 2; \
      } \
    else \
      { \
      destination = *(const uint32_t*)next_char; \
      next_char += 4; \
      } \
    } \
  while (0)
/* Number of ranges (start / end pairs) that fit into the local_ranges buffer
embedded in xclass_ranges; the buffer holds twice as many uint32_t values. */
#define XCLASS_LOCAL_RANGES_SIZE 32
/* log2(XCLASS_LOCAL_RANGES_SIZE): the number of entries of the embedded
local_stack used while the range search code is generated. */
#define XCLASS_LOCAL_RANGES_LOG2_SIZE 5
/* A postponed half of the range search generated by
compile_xclass_matchingpath. When a group of ranges is split, the upper half
is pushed as one of these items and processed after the lower half. */
typedef struct xclass_stack_item {
/* Index (into the ranges array) of the first range start of the postponed half. */
sljit_u32 first_item;
/* Index (into the ranges array) of the last range start of the postponed half. */
sljit_u32 last_item;
/* Jump taken when the character is greater than the split point; its target
is set (JUMPHERE) when the postponed half is generated. */
struct sljit_jump *jump;
} xclass_stack_item;
/* The character ranges of an xclass, filled in by xclass_compute_ranges, and
the work stack used when the search code for these ranges is generated. */
typedef struct xclass_ranges {
/* Number of uint32_t values stored in ranges (two values per range:
the start and the end of the range). */
size_t range_count;
/* Pointer to ranges. A stack area is provided when a small buffer is enough. */
uint32_t *ranges;
/* Embedded buffer for up to XCLASS_LOCAL_RANGES_SIZE ranges. */
uint32_t local_ranges[XCLASS_LOCAL_RANGES_SIZE * 2];
/* Stack size must be log2(ranges / 2). */
/* Points to local_stack, or to a heap block which also contains the ranges
(in that case it must be released with SLJIT_FREE). Set to NULL by
xclass_compute_ranges when the allocation fails. */
xclass_stack_item *stack;
/* Embedded stack, sufficient when the embedded range buffer is used. */
xclass_stack_item local_stack[XCLASS_LOCAL_RANGES_LOG2_SIZE];
} xclass_ranges;
/* Decodes the character items of an xclass into an array of inclusive
[start, end] pairs.

Arguments:
  common    the compiler data (common->utf and common->start are read)
  cc        points to the first character item of the class: either a sequence
            of XCL_SINGLE / XCL_RANGE items terminated by XCL_END, or a
            character list header (value >= XCL_LIST)
  ranges    receives the result; on entry ranges->ranges and ranges->stack
            must point to usable buffers (the caller passes the embedded
            local_ranges / local_stack arrays)

On return ranges->range_count is the number of uint32_t values stored (two per
range). When the estimated number of ranges does not fit into the embedded
buffer, a single heap block is allocated which holds the stack followed by the
ranges; ranges->stack then differs from ranges->local_stack and the caller has
to free it. When this allocation fails, the memory error is set for the
compiler, and both ranges->stack and ranges->ranges are NULL. */
static void xclass_compute_ranges(compiler_common *common, PCRE2_SPTR cc, xclass_ranges *ranges)
{
DEFINE_COMPILER;
size_t range_count = 0, est_range_count;
size_t est_stack_size, tmp;
uint32_t type, list_ind;
uint32_t est_type;
uint32_t char_list_add, range_start, range_end;
const uint8_t *next_char;
const uint8_t *est_next_char;
#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
/* NOTE(review): not referenced directly below; presumably read by the
GETCHARINCTEST macro - confirm against its definition. */
BOOL utf = common->utf;
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */
if (*cc == XCL_SINGLE || *cc == XCL_RANGE)
{
/* Only a few ranges are present. */
/* A single character is stored as a range whose start equals its end. */
do
{
type = *cc++;
SLJIT_ASSERT(type == XCL_SINGLE || type == XCL_RANGE);
GETCHARINCTEST(range_end, cc);
ranges->ranges[range_count] = range_end;
if (type == XCL_RANGE)
{
GETCHARINCTEST(range_end, cc);
}
ranges->ranges[range_count + 1] = range_end;
range_count += 2;
}
while (*cc != XCL_END);
SLJIT_ASSERT(range_count <= XCLASS_LOCAL_RANGES_SIZE);
ranges->range_count = range_count;
return;
}
/* Character list form: the header contains one type field per list. */
SLJIT_ASSERT(cc[0] >= XCL_LIST);
#if PCRE2_CODE_UNIT_WIDTH == 8
/* The header occupies two code units in 8 bit mode. */
type = (uint32_t)(cc[0] << 8) | cc[1];
cc += 2;
#else
type = cc[0];
cc++;
#endif /* CODE_UNIT_WIDTH */
/* Align characters. */
/* The list data is located before common->start; the stored offset is
counted in 2 byte units. */
next_char = (const uint8_t*)common->start - (GET(cc, 0) << 1);
type &= XCL_TYPE_MASK;
/* Estimate size. */
/* First pass: only the item counts are read to get an upper bound for the
number of ranges (see the assert at the end of the function). */
est_next_char = next_char;
est_type = type;
est_range_count = 0;
list_ind = 0;
while (est_type > 0)
{
uint32_t item_count = est_type & XCL_ITEM_COUNT_MASK;
/* An all-ones count field means the real count is stored as the first
value of the list data. */
if (item_count == XCL_ITEM_COUNT_MASK)
{
if (list_ind <= 1)
{
item_count = *(const uint16_t*)est_next_char;
est_next_char += 2;
}
else
{
item_count = *(const uint32_t*)est_next_char;
est_next_char += 4;
}
}
est_type >>= XCL_TYPE_BIT_LEN;
/* Skip the items: 2 bytes each in the first two lists, 4 bytes otherwise. */
est_next_char += (size_t)item_count << (list_ind <= 1 ? 1 : 2);
list_ind++;
est_range_count += item_count + 1;
}
if (est_range_count > XCLASS_LOCAL_RANGES_SIZE)
{
/* The embedded buffers are too small: allocate the stack and the ranges
as one heap block. */
est_stack_size = 0;
tmp = est_range_count - 1;
/* Compute log2(est_range_count) */
while (tmp > 0)
{
est_stack_size++;
tmp >>= 1;
}
ranges->stack = (xclass_stack_item*)SLJIT_MALLOC((sizeof(xclass_stack_item) * est_stack_size)
+ ((sizeof(uint32_t) << 1) * (size_t)est_range_count), compiler->allocator_data);
if (ranges->stack == NULL)
{
sljit_set_compiler_memory_error(compiler);
ranges->ranges = NULL;
return;
}
/* The ranges are stored right after the stack items. */
ranges->ranges = (uint32_t*)(ranges->stack + est_stack_size);
}
/* Second pass: decode the items. range_start == ~0 means that no range is
open at the moment. */
char_list_add = XCL_CHAR_LIST_LOW_16_ADD;
range_start = ~(uint32_t)0;
list_ind = 0;
if ((type & XCL_BEGIN_WITH_RANGE) != 0)
range_start = XCL_CHAR_LIST_LOW_16_START;
while (type > 0)
{
uint32_t item_count = type & XCL_ITEM_COUNT_MASK;
if (item_count == XCL_ITEM_COUNT_MASK)
{
READ_FROM_CHAR_LIST(item_count);
SLJIT_ASSERT(item_count >= XCL_ITEM_COUNT_MASK);
}
while (item_count > 0)
{
READ_FROM_CHAR_LIST(range_end);
if ((range_end & XCL_CHAR_END) != 0)
{
/* The item closes a range; when no range is open, it is a single character. */
range_end = char_list_add + (range_end >> XCL_CHAR_SHIFT);
if (range_start == ~(uint32_t)0)
range_start = range_end;
ranges->ranges[range_count] = range_start;
ranges->ranges[range_count + 1] = range_end;
range_count += 2;
range_start = ~(uint32_t)0;
}
else
/* The item opens a range, which is closed by a later item or at the end of the list. */
range_start = char_list_add + (range_end >> XCL_CHAR_SHIFT);
item_count--;
}
/* Move to the next list. */
list_ind++;
type >>= XCL_TYPE_BIT_LEN;
if (range_start == ~(uint32_t)0)
{
/* No open range: the next list may start with a range which begins at the
first character of that list. */
if ((type & XCL_BEGIN_WITH_RANGE) != 0)
{
if (list_ind == 1) range_start = XCL_CHAR_LIST_HIGH_16_START;
#if PCRE2_CODE_UNIT_WIDTH == 32
else if (list_ind == 2) range_start = XCL_CHAR_LIST_LOW_32_START;
else range_start = XCL_CHAR_LIST_HIGH_32_START;
#else
else range_start = XCL_CHAR_LIST_LOW_32_START;
#endif
}
}
else if ((type & XCL_BEGIN_WITH_RANGE) == 0)
{
/* An open range is not continued by the next list: it is closed at the
last character of the list which has just been processed. */
if (list_ind == 1) range_end = XCL_CHAR_LIST_LOW_16_END;
else if (list_ind == 2) range_end = XCL_CHAR_LIST_HIGH_16_END;
#if PCRE2_CODE_UNIT_WIDTH == 32
else if (list_ind == 3) range_end = XCL_CHAR_LIST_LOW_32_END;
else range_end = XCL_CHAR_LIST_HIGH_32_END;
#else
else range_end = XCL_CHAR_LIST_LOW_32_END;
#endif
ranges->ranges[range_count] = range_start;
ranges->ranges[range_count + 1] = range_end;
range_count += 2;
range_start = ~(uint32_t)0;
}
/* Select the value which is added to the items of the next list. */
if (list_ind == 1) char_list_add = XCL_CHAR_LIST_HIGH_16_ADD;
#if PCRE2_CODE_UNIT_WIDTH == 32
else if (list_ind == 2) char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
else char_list_add = XCL_CHAR_LIST_HIGH_32_ADD;
#else
else char_list_add = XCL_CHAR_LIST_LOW_32_ADD;
#endif
}
SLJIT_ASSERT(range_count > 0 && range_count <= (est_range_count << 1));
SLJIT_ASSERT(next_char <= (const uint8_t*)common->start);
ranges->range_count = range_count;
}
/* Generates the test of the character in TMP1 against the 256 bit map of a
class.

Arguments:
  common      the compiler data
  bitset      the 32 byte bit map
  found       list which receives the jumps taken on a match
  backtracks  list which receives the jump taken when a character
              below 256 is not in the map

Characters above 255 skip the whole test and continue after the generated
code. TMP1 and TMP2 are overwritten by the generic (non-optimized) test. */
static void xclass_check_bitset(compiler_common *common, const sljit_u8 *bitset, jump_list **found, jump_list **backtracks)
{
DEFINE_COMPILER;
/* The bit map has no information about characters above 255. */
struct sljit_jump *above_bitmap = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);
BOOL top_bit_set = (bitset[31] & 0x80) != 0;
BOOL optimized = optimize_class(common, bitset, top_bit_set, TRUE, found);

if (!optimized)
  {
  /* Generic test: TMP1 = bitset[chr >> 3], TMP2 = 1 << (chr & 0x7). */
  OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
  OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
  OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)bitset);
  OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0);
  OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, TMP2, 0);
  add_jump(compiler, found, JUMP(SLJIT_NOT_ZERO));
  }

/* A character below 256 which has not matched cannot match anything else. */
add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
JUMPHERE(above_bitmap);
}
#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
/* Extends the [*min_ptr, *max_ptr] interval so that it covers the characters
described by the character items of an xclass.

Arguments:
  common    the compiler data (common->start locates the character lists)
  cc        points to the first character item: a sequence of XCL_SINGLE /
            XCL_RANGE items terminated by XCL_END, or a character list header
  min_ptr   in / out: lowered when a smaller character is present
  max_ptr   in / out: raised when a greater character is present

For character lists only the first and the last item is inspected. */
static void xclass_update_min_max(compiler_common *common, PCRE2_SPTR cc, sljit_u32 *min_ptr, sljit_u32 *max_ptr)
{
/* NOTE(review): this local must keep its name, it is presumably read by the
GETCHARINCTEST macro - confirm against its definition. */
BOOL utf = TRUE;
sljit_u32 lowest = *min_ptr;
sljit_u32 highest = *max_ptr;
uint32_t kind, list_no, value, list_base, skip, unit_shift, last_item;
const uint8_t *list_data;

/* This function is pointless without utf 8/16. */
SLJIT_ASSERT(common->utf);

if (*cc == XCL_SINGLE || *cc == XCL_RANGE)
  {
  /* Short form: every item is visited. */
  for (;;)
    {
    kind = *cc++;
    SLJIT_ASSERT(kind == XCL_SINGLE || kind == XCL_RANGE);

    GETCHARINCTEST(value, cc);
    if (value < lowest)
      lowest = value;

    /* The second character of a range is its upper end. */
    if (kind == XCL_RANGE)
      {
      GETCHARINCTEST(value, cc);
      }

    if (value > highest)
      highest = value;

    if (*cc == XCL_END)
      break;
    }

  SLJIT_ASSERT(lowest <= MAX_UTF_CODE_POINT && highest <= MAX_UTF_CODE_POINT && lowest <= highest);
  *min_ptr = lowest;
  *max_ptr = highest;
  return;
  }

/* Character list form. */
SLJIT_ASSERT(cc[0] >= XCL_LIST);

#if PCRE2_CODE_UNIT_WIDTH == 8
kind = (uint32_t)(cc[0] << 8) | cc[1];
cc += 2;
#else
kind = cc[0];
cc++;
#endif /* CODE_UNIT_WIDTH */

/* The list data precedes common->start, the offset is in 2 byte units. */
list_data = (const uint8_t*)common->start - (GET(cc, 0) << 1);
kind &= XCL_TYPE_MASK;
SLJIT_ASSERT(kind != 0);

/* Minimum: find the first list which is not empty. */
for (list_no = 0; (kind & (XCL_BEGIN_WITH_RANGE | XCL_ITEM_COUNT_MASK)) == 0; list_no++)
  kind >>= XCL_TYPE_BIT_LEN;

SLJIT_ASSERT(list_no <= 2);

if (list_no == 0)
  {
  list_base = XCL_CHAR_LIST_LOW_16_ADD;
  value = XCL_CHAR_LIST_LOW_16_START;
  }
else if (list_no == 1)
  {
  list_base = XCL_CHAR_LIST_HIGH_16_ADD;
  value = XCL_CHAR_LIST_HIGH_16_START;
  }
else
  {
  list_base = XCL_CHAR_LIST_LOW_32_ADD;
  value = XCL_CHAR_LIST_LOW_32_START;
  }

if ((kind & XCL_BEGIN_WITH_RANGE) == 0)
  {
  /* The minimum is the first item. When the item count is stored in the
  list data, that count (one item wide) precedes the first item. */
  skip = 0;
  if ((kind & XCL_ITEM_COUNT_MASK) == XCL_ITEM_COUNT_MASK)
    skip = (list_no <= 1) ? 2 : 4;

  if (list_no <= 1)
    value = *(const uint16_t*)(list_data + skip);
  else
    value = *(const uint32_t*)(list_data + skip);

  value = list_base + (value >> XCL_CHAR_SHIFT);
  }
/* Otherwise the list starts with a range, so its first character is used. */

if (value < lowest)
  lowest = value;

/* Maximum: step over the data of each list until the last one is passed. */
for (;;)
  {
  unit_shift = (list_no <= 1) ? 1 : 2;
  value = kind & XCL_ITEM_COUNT_MASK;

  if (value == XCL_ITEM_COUNT_MASK)
    {
    /* The stored count, plus one for the count value itself. */
    if (list_no <= 1)
      value = *(const uint16_t*)list_data;
    else
      value = *(const uint32_t*)list_data;
    value++;
    }

  list_data += value << unit_shift;

  if ((kind >> XCL_TYPE_BIT_LEN) == 0)
    break;

  list_no++;
  kind >>= XCL_TYPE_BIT_LEN;
  }

SLJIT_ASSERT(list_no <= 2 && kind != 0);

if (list_no == 0)
  {
  list_base = XCL_CHAR_LIST_LOW_16_ADD;
  value = XCL_CHAR_LIST_LOW_16_END;
  }
else if (list_no == 1)
  {
  list_base = XCL_CHAR_LIST_HIGH_16_ADD;
  value = XCL_CHAR_LIST_HIGH_16_END;
  }
else
  {
  list_base = XCL_CHAR_LIST_LOW_32_ADD;
  value = XCL_CHAR_LIST_LOW_32_END;
  }

if ((kind & XCL_ITEM_COUNT_MASK) != 0)
  {
  /* The last item is right before the current position. If it does not
  close a range, the range extends to the end of the list. */
  if (list_no <= 1)
    last_item = *(const uint16_t*)(list_data - 2);
  else
    last_item = *(const uint32_t*)(list_data - 4);

  if ((last_item & XCL_CHAR_END) != 0)
    value = list_base + (last_item >> XCL_CHAR_SHIFT);
  }

if (value > highest)
  highest = value;

SLJIT_ASSERT(lowest <= MAX_UTF_CODE_POINT && highest <= MAX_UTF_CODE_POINT && lowest <= highest);
*min_ptr = lowest;
*max_ptr = highest;
}
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */
/* Bits of the "status" value of compile_xclass_matchingpath. Only
XCLASS_IS_ECLASS is passed in by compile_eclass_matchingpath; the other bits
are computed by compile_xclass_matchingpath itself while it scans the class. */

/* The xclass is part of an extended class: the character is already read,
and a match is recorded by setting the lowest bit of ECLASS_STACK_DATA. */
#define XCLASS_IS_ECLASS 0x001
#ifdef SUPPORT_UNICODE
/* The character value is needed after TMP1 has been overwritten, so it has to
be saved (or reloaded from ECLASS_CHAR_DATA in case of an eclass). */
#define XCLASS_SAVE_CHAR 0x002
/* The character type (general category) has to be loaded. */
#define XCLASS_HAS_TYPE 0x004
/* The script of the character has to be loaded. */
#define XCLASS_HAS_SCRIPT 0x008
/* The script extension set of the character has to be checked. */
#define XCLASS_HAS_SCRIPT_EXTENSION 0x010
/* A boolean property of the character has to be checked. */
#define XCLASS_HAS_BOOL 0x020
/* The bidi class of the character has to be checked. */
#define XCLASS_HAS_BIDICL 0x040
/* Any of these requires locating the Unicode data record of the character. */
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BOOL | XCLASS_HAS_BIDICL)
/* A negated script extension item is present. */
#define XCLASS_SCRIPT_EXTENSION_NOTPROP 0x080
/* The record offset in TMP2 has been saved to RETURN_ADDR / LOCAL0 during the
script extension checks, and must be restored from there afterwards. */
#define XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR 0x100
#define XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0 0x200
#endif /* SUPPORT_UNICODE */
/* Forward declaration: compile_xclass_matchingpath calls this function with
OP_ALLANY when a class accepts (or rejects) every character. */
static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr);
/* Generates the matching code of an xclass.

Arguments:
  common      the compiler data
  cc          points to the flag code unit of the xclass, which is followed by
              an optional 32 byte bit map (XCL_MAP), the property items
              (XCL_PROP / XCL_NOTPROP) and the character items
  backtracks  list which receives the jumps taken when the character
              does not match
  status      XCLASS_* bits; callers pass XCLASS_IS_ECLASS when the xclass is
              a member of an extended class, the other bits are computed here

Without XCLASS_IS_ECLASS the character is read by read_char(). With
XCLASS_IS_ECLASS the character is expected in TMP1 (and ECLASS_CHAR_DATA), and
a successful match sets the lowest bit of ECLASS_STACK_DATA.

The "compares" counter is the number of tests which still have to be
generated. All but the last test jump to "list" on success; the last test
jumps to backtracks on failure (its condition is inverted when needed).

TMP3 must be preserved because it is used by compile_iterator_matchingpath. */
static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks, sljit_u32 status)
{
DEFINE_COMPILER;
jump_list *found = NULL;
jump_list *check_result = NULL;
/* Target of a successful test: the found list for a normal class, the
backtrack list for a negated class. */
jump_list **list = (cc[0] & XCL_NOT) == 0 ? &found : backtracks;
sljit_uw c, charoffset;
sljit_u32 max = READ_CHAR_MAX, min = 0;
struct sljit_jump *jump = NULL;
PCRE2_UCHAR flags;
PCRE2_SPTR ccbegin;
sljit_u32 compares, invertcmp, depth;
sljit_u32 first_item, last_item, mid_item;
sljit_u32 range_start, range_end;
xclass_ranges ranges;
BOOL has_cmov, last_range_set;
#ifdef SUPPORT_UNICODE
sljit_u32 category_list = 0;
sljit_u32 items;
int typereg = TMP1;
#endif /* SUPPORT_UNICODE */
/* LOCAL0 may be used as a temporary storage below. */
SLJIT_ASSERT(common->locals_size >= SSIZE_OF(sw));
/* Scanning the necessary info. */
flags = *cc++;
ccbegin = cc;
compares = 0;
if (flags & XCL_MAP)
cc += 32 / sizeof(PCRE2_UCHAR);
#ifdef SUPPORT_UNICODE
/* First pass over the property items: count the required tests and collect
the status bits. Properties which only depend on the general category are
merged into the category_list bit set. */
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
{
compares++;
cc++;
items = 0;
switch(*cc)
{
case PT_LAMP:
items = UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt);
break;
case PT_GC:
items = UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1] * 2], PRIV(ucp_typerange)[(int)cc[1] * 2 + 1]);
break;
case PT_PC:
items = UCPCAT(cc[1]);
break;
case PT_WORD:
items = UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N;
break;
case PT_ALNUM:
items = UCPCAT_L | UCPCAT_N;
break;
case PT_SCX:
status |= XCLASS_HAS_SCRIPT_EXTENSION;
if (cc[-1] == XCL_NOTPROP)
{
status |= XCLASS_SCRIPT_EXTENSION_NOTPROP;
break;
}
/* A non-negated script extension item is tested twice: once by the script
checks and once by the script extension checks below. */
compares++;
/* Fall through */
case PT_SC:
status |= XCLASS_HAS_SCRIPT;
break;
case PT_SPACE:
case PT_PXSPACE:
case PT_PXGRAPH:
case PT_PXPRINT:
case PT_PXPUNCT:
status |= XCLASS_SAVE_CHAR | XCLASS_HAS_TYPE;
break;
case PT_UCNC:
case PT_PXXDIGIT:
status |= XCLASS_SAVE_CHAR;
break;
case PT_BOOL:
status |= XCLASS_HAS_BOOL;
break;
case PT_BIDICL:
status |= XCLASS_HAS_BIDICL;
break;
default:
SLJIT_UNREACHABLE();
break;
}
if (items > 0)
{
/* A negated item accepts the complement of its categories. The item does
not need its own test (see the single category test below). */
if (cc[-1] == XCL_NOTPROP)
items ^= UCPCAT_ALL;
category_list |= items;
status |= XCLASS_HAS_TYPE;
compares--;
}
cc += 2;
}
if (category_list == UCPCAT_ALL)
{
/* All or no characters are accepted, same as dotall. */
if (status & XCLASS_IS_ECLASS)
{
if (list != backtracks)
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
return;
}
compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE);
if (list == backtracks)
add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
return;
}
/* All merged category items are checked by a single test. */
if (category_list != 0)
compares++;
#endif
if (*cc != XCL_END)
{
/* Character items are present; they are checked by one (last) test. */
#if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
if (common->utf && compares == 0 && !(status & XCLASS_IS_ECLASS))
{
/* Only characters are checked: their minimum and maximum limit the range
passed to read_char. The bit map covers characters from zero. */
SLJIT_ASSERT(category_list == 0);
max = 0;
min = (flags & XCL_MAP) != 0 ? 0 : READ_CHAR_MAX;
xclass_update_min_max(common, cc, &min, &max);
}
#endif
compares++;
#ifdef SUPPORT_UNICODE
status |= XCLASS_SAVE_CHAR;
#endif /* SUPPORT_UNICODE */
}
#ifdef SUPPORT_UNICODE
SLJIT_ASSERT(compares > 0 || category_list != 0);
#else /* !SUPPORT_UNICODE */
SLJIT_ASSERT(compares > 0);
#endif /* SUPPORT_UNICODE */
/* We are not necessarily in UTF mode, even in 8 bit mode. */
cc = ccbegin;
if (!(status & XCLASS_IS_ECLASS))
{
if ((flags & XCL_NOT) != 0)
read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR);
else
{
#ifdef SUPPORT_UNICODE
read_char(common, min, max, (status & XCLASS_NEEDS_UCD) ? backtracks : NULL, 0);
#else /* !SUPPORT_UNICODE */
read_char(common, min, max, NULL, 0);
#endif /* SUPPORT_UNICODE */
}
}
if ((flags & XCL_MAP) != 0)
{
/* Characters below 256 are fully decided by the bit map. */
SLJIT_ASSERT(!(status & XCLASS_IS_ECLASS));
xclass_check_bitset(common, (const sljit_u8 *)cc, &found, backtracks);
cc += 32 / sizeof(PCRE2_UCHAR);
}
#ifdef SUPPORT_UNICODE
if (status & XCLASS_NEEDS_UCD)
{
/* TMP1 is overwritten by the record lookup, so the character is saved when
it is needed later (an eclass keeps it in ECLASS_CHAR_DATA). */
if ((status & (XCLASS_SAVE_CHAR | XCLASS_IS_ECLASS)) == XCLASS_SAVE_CHAR)
OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP1, 0);
#if PCRE2_CODE_UNIT_WIDTH == 32
if (!common->utf)
{
/* Values above MAX_UTF_CODE_POINT are replaced by UNASSIGNED_UTF_CHAR. */
OP2U(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1);
SELECT(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, UNASSIGNED_UTF_CHAR, TMP1);
}
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
/* Two stage table lookup: the stage1 entry is selected by the block of the
character, the stage2 entry by the stage1 value and the offset in the block. */
OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2));
OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1);
/* TMP2 = stage2 value * 12 (computed as * 8 + * 4), which is used as the byte
offset of the record in ucd_records. NOTE(review): 12 is presumably
sizeof(ucd_record) - confirm. */
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 3);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
ccbegin = cc;
/* Each property group below rescans the property items and generates the
tests which belong to that group. */
if (status & XCLASS_HAS_BIDICL)
{
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx_bidiclass));
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BIDICLASS_SHIFT);
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
{
cc++;
if (*cc == PT_BIDICL)
{
compares--;
/* The last test jumps to backtracks, so its condition is inverted
unless the class itself is negated. */
invertcmp = (compares == 0 && list != backtracks);
if (cc[-1] == XCL_NOTPROP)
invertcmp ^= 0x1;
jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]);
add_jump(compiler, compares > 0 ? list : backtracks, jump);
}
cc += 2;
}
cc = ccbegin;
}
if (status & XCLASS_HAS_BOOL)
{
/* TMP1 = byte offset of the boolean property set of the character. */
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, bprops));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BPROPS_MASK);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
{
cc++;
if (*cc == PT_BOOL)
{
compares--;
invertcmp = (compares == 0 && list != backtracks);
if (cc[-1] == XCL_NOTPROP)
invertcmp ^= 0x1;
/* Test bit (cc[1] & 0x1f) of the 32 bit word (cc[1] >> 5) of the set. */
OP2U(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_boolprop_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)(1u << (cc[1] & 0x1f)));
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
}
cc += 2;
}
cc = ccbegin;
}
if (status & XCLASS_HAS_SCRIPT)
{
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
{
cc++;
switch (*cc)
{
case PT_SCX:
/* The script of a negated script extension item is checked later. */
if (cc[-1] == XCL_NOTPROP)
break;
/* Fall through */
case PT_SC:
compares--;
invertcmp = (compares == 0 && list != backtracks);
if (cc[-1] == XCL_NOTPROP)
invertcmp ^= 0x1;
add_jump(compiler, compares > 0 ? list : backtracks, CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]));
}
cc += 2;
}
cc = ccbegin;
}
if (status & XCLASS_HAS_SCRIPT_EXTENSION)
{
/* TMP1 = byte offset of the script extension set of the character. */
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx_bidiclass));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_SCRIPTX_MASK);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
if (status & XCLASS_SCRIPT_EXTENSION_NOTPROP)
{
/* Negated items also need the script, which is loaded into TMP2. The record
offset in TMP2 is saved first when the character type is needed later:
to LOCAL0 if RETURN_ADDR holds the saved character, to RETURN_ADDR otherwise. */
if (status & XCLASS_HAS_TYPE)
{
if ((status & (XCLASS_SAVE_CHAR | XCLASS_IS_ECLASS)) == XCLASS_SAVE_CHAR)
{
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL0, TMP2, 0);
status |= XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0;
}
else
{
OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP2, 0);
status |= XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR;
}
}
OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
}
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
{
cc++;
if (*cc == PT_SCX)
{
compares--;
invertcmp = (compares == 0 && list != backtracks);
jump = NULL;
if (cc[-1] == XCL_NOTPROP)
{
/* When the script is equal, the negated item cannot match: either
backtrack immediately, or skip the set test below. */
jump = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, (int)cc[1]);
if (invertcmp)
{
add_jump(compiler, backtracks, jump);
jump = NULL;
}
invertcmp ^= 0x1;
}
OP2U(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)(1u << (cc[1] & 0x1f)));
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
if (jump != NULL)
JUMPHERE(jump);
}
cc += 2;
}
/* Restore the record offset if it has been saved above. */
if (status & XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0)
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOCAL0);
else if (status & XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR)
OP1(SLJIT_MOV, TMP2, 0, RETURN_ADDR, 0);
cc = ccbegin;
}
/* Reload the character into TMP1 for the remaining tests. */
if (status & XCLASS_SAVE_CHAR)
OP1(SLJIT_MOV, TMP1, 0, (status & XCLASS_IS_ECLASS) ? ECLASS_CHAR_DATA : RETURN_ADDR, 0);
if (status & XCLASS_HAS_TYPE)
{
/* The type bit goes to TMP1 unless TMP1 holds the character; in that case
RETURN_ADDR is used (the character has already been copied from it). */
if (status & XCLASS_SAVE_CHAR)
typereg = RETURN_ADDR;
OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
/* typereg = 1 << chartype, which can be tested against UCPCAT bit sets. */
OP2(SLJIT_SHL, typereg, 0, SLJIT_IMM, 1, TMP2, 0);
if (category_list > 0)
{
/* The single test of all merged category items. */
compares--;
invertcmp = (compares == 0 && list != backtracks);
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, category_list);
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
}
}
}
#endif /* SUPPORT_UNICODE */
/* Generating code. */
/* charoffset is the value which has been subtracted from the character
in TMP1 so far (see SET_CHAR_OFFSET). */
charoffset = 0;
#ifdef SUPPORT_UNICODE
/* Second pass over the property items: generate the tests which have not
been handled above. These tests compute their result in TMP2. */
while (*cc == XCL_PROP || *cc == XCL_NOTPROP)
{
compares--;
invertcmp = (compares == 0 && list != backtracks);
jump = NULL;
if (*cc == XCL_NOTPROP)
invertcmp ^= 0x1;
cc++;
switch(*cc)
{
case PT_LAMP:
case PT_GC:
case PT_PC:
case PT_SC:
case PT_SCX:
case PT_BOOL:
case PT_BIDICL:
case PT_WORD:
case PT_ALNUM:
/* Undo the decrease above: these items have been counted already. */
compares++;
/* Already handled. */
break;
case PT_SPACE:
case PT_PXSPACE:
/* Characters 0x9-0xd, 0x85, 0x180e, or any type between ucp_Zl and ucp_Zs. */
SET_CHAR_OFFSET(9);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xd - 0x9);
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x85 - 0x9);
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x180e - 0x9);
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Zl, ucp_Zs));
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_ZERO);
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
break;
case PT_UCNC:
/* Characters '$', '@', '`', 0xa0-0xd7ff, or any character from 0xe000. */
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_DOLLAR_SIGN - charoffset));
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_COMMERCIAL_AT - charoffset));
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_GRAVE_ACCENT - charoffset));
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
SET_CHAR_OFFSET(0xa0);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(0xd7ff - charoffset));
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
SET_CHAR_OFFSET(0);
OP2U(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xe000 - 0);
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_GREATER_EQUAL);
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
break;
case PT_PXGRAPH:
/* TMP2 is set to non-zero when the character is excluded; the final
comparison checks for a zero TMP2. */
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Cc, ucp_Cs) | UCPCAT_RANGE(ucp_Zl, ucp_Zs));
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(ucp_Cf));
jump = JUMP(SLJIT_ZERO);
c = charoffset;
/* In case of ucp_Cf, we overwrite the result. */
SET_CHAR_OFFSET(0x2066);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066);
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066);
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x180e - 0x2066);
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
/* Restore charoffset. */
/* Both paths must reach the label below with the same character offset. */
SET_CHAR_OFFSET(c);
JUMPHERE(jump);
jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);
break;
case PT_PXPRINT:
/* Same structure as PT_PXGRAPH with different excluded sets. */
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Cc, ucp_Cs) | UCPCAT2(ucp_Zl, ucp_Zp));
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(ucp_Cf));
jump = JUMP(SLJIT_ZERO);
c = charoffset;
/* In case of ucp_Cf, we overwrite the result. */
SET_CHAR_OFFSET(0x2066);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066);
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066);
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL);
/* Restore charoffset. */
SET_CHAR_OFFSET(c);
JUMPHERE(jump);
jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);
break;
case PT_PXPUNCT:
/* Types ucp_Sc-ucp_So are accepted only for characters up to 0x7f, types
ucp_Pc-ucp_Ps are accepted for any character. */
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Sc, ucp_So));
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
SET_CHAR_OFFSET(0);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x7f);
OP_FLAGS(SLJIT_AND, TMP2, 0, SLJIT_LESS_EQUAL);
OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Pc, ucp_Ps));
OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_ZERO);
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
break;
case PT_PXXDIGIT:
/* ASCII letters A-F (bit 0x20 is cleared to ignore the case), ASCII digits,
and the 0xff10-0xff19, 0xff21-0xff26, 0xff41-0xff46 ranges. */
SET_CHAR_OFFSET(CHAR_A);
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, ~0x20);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP2, 0, SLJIT_IMM, CHAR_F - CHAR_A);
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
SET_CHAR_OFFSET(CHAR_0);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_9 - CHAR_0);
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
SET_CHAR_OFFSET(0xff10);
/* The remaining range checks are skipped unless the character is in the
0xff10-0xff46 range. */
jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 0xff46 - 0xff10);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff19 - 0xff10);
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
SET_CHAR_OFFSET(0xff21);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff26 - 0xff21);
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
SET_CHAR_OFFSET(0xff41);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff46 - 0xff41);
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
/* Both paths must reach the label below with the same character offset. */
SET_CHAR_OFFSET(0xff10);
JUMPHERE(jump);
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, 0);
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
break;
default:
SLJIT_UNREACHABLE();
break;
}
cc += 2;
if (jump != NULL)
add_jump(compiler, compares > 0 ? list : backtracks, jump);
}
if (compares == 0)
{
/* No character items are present: the class is complete. */
if (found != NULL)
set_jumps(found, LABEL());
if (status & XCLASS_IS_ECLASS)
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
return;
}
#endif /* SUPPORT_UNICODE */
/* Only the test of the character items remains. */
SLJIT_ASSERT(compares == 1);
ranges.range_count = 0;
ranges.ranges = ranges.local_ranges;
ranges.stack = ranges.local_stack;
xclass_compute_ranges(common, cc, &ranges);
/* Memory error is set for the compiler. */
if (ranges.stack == NULL)
return;
#if (defined SLJIT_DEBUG && SLJIT_DEBUG) && \
defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
if (common->utf)
{
/* Debug only: the two range decoders must agree on the limits. */
min = READ_CHAR_MAX;
max = 0;
xclass_update_min_max(common, cc, &min, &max);
SLJIT_ASSERT(ranges.ranges[0] == min && ranges.ranges[ranges.range_count - 1] == max);
}
#endif /* SLJIT_DEBUG && SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */
/* This is the last test, so it jumps to backtracks on failure. */
invertcmp = (list != backtracks);
if (ranges.range_count == 2)
{
/* A single range (or character) is checked by one comparison. */
range_start = ranges.ranges[0];
range_end = ranges.ranges[1];
if (range_start < range_end)
{
SET_CHAR_OFFSET(range_start);
jump = CMP(SLJIT_LESS_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));
}
else
jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));
add_jump(compiler, backtracks, jump);
/* No heap block is expected here, so nothing has to be freed. */
SLJIT_ASSERT(ranges.stack == ranges.local_stack);
if (found != NULL)
set_jumps(found, LABEL());
if (status & XCLASS_IS_ECLASS)
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
return;
}
/* Multiple ranges: the character is made relative to the first range start. */
range_start = ranges.ranges[0];
SET_CHAR_OFFSET(range_start);
if (ranges.range_count >= 6)
{
/* Early fail. */
/* The character is greater than the end of the last range. */
range_end = ranges.ranges[ranges.range_count - 1];
add_jump(compiler, (flags & XCL_NOT) == 0 ? backtracks : &found,
CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start)));
}
/* The ranges are split into halves until a group is small enough to be
checked linearly. The upper halves are postponed using ranges.stack.
Each linear check leaves its result in TMP2. */
depth = 0;
first_item = 0;
last_item = ranges.range_count - 2;
has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV) != 0;
while (TRUE)
{
/* At least two items are present. */
SLJIT_ASSERT(first_item < last_item && charoffset == ranges.ranges[0]);
last_range_set = FALSE;
if (first_item + 6 <= last_item)
{
/* At least four ranges are present: split them at an even index. */
mid_item = ((first_item + last_item) >> 1) & ~(sljit_u32)1;
SLJIT_ASSERT(last_item >= mid_item + 4);
range_end = ranges.ranges[mid_item + 1];
if (first_item + 6 > mid_item && ranges.ranges[mid_item] == range_end)
{
/* The middle range is a single character and the lower half will be
checked linearly: one comparison is used both for the split and for
the check of the middle range. */
OP2U(SLJIT_SUB | SLJIT_SET_GREATER | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - charoffset));
ranges.stack[depth].jump = JUMP(SLJIT_GREATER);
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
last_range_set = TRUE;
}
else
ranges.stack[depth].jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - charoffset));
/* Postpone the upper half. */
ranges.stack[depth].first_item = (sljit_u32)(mid_item + 2);
ranges.stack[depth].last_item = (sljit_u32)last_item;
depth++;
SLJIT_ASSERT(ranges.stack == ranges.local_stack ?
depth <= XCLASS_LOCAL_RANGES_LOG2_SIZE : (ranges.stack + depth) <= (xclass_stack_item*)ranges.ranges);
last_item = mid_item;
/* Continue with splitting the lower half unless it is checked linearly. */
if (!last_range_set)
continue;
/* The middle range has been checked already. */
last_item -= 2;
}
if (!last_range_set)
{
/* The first range of the group initializes the result in TMP2. */
range_start = ranges.ranges[first_item];
range_end = ranges.ranges[first_item + 1];
if (range_start < range_end)
{
SET_CHAR_OFFSET(range_start);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL);
}
else
{
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
}
first_item += 2;
}
SLJIT_ASSERT(first_item <= last_item);
/* The other ranges of the group are merged into TMP2. With conditional move,
TMP2 is replaced by STR_END on a match (the final test expects it to be
non-zero); otherwise the flag is or-ed into TMP2, and the zero flag is
updated by the last operation. */
do
{
range_start = ranges.ranges[first_item];
range_end = ranges.ranges[first_item + 1];
if (range_start < range_end)
{
SET_CHAR_OFFSET(range_start);
OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start));
if (has_cmov)
SELECT(SLJIT_LESS_EQUAL, TMP2, STR_END, 0, TMP2);
else
OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_LESS_EQUAL);
}
else
{
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset));
if (has_cmov)
SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);
else
OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);
}
first_item += 2;
}
while (first_item <= last_item);
if (depth == 0) break;
/* The group is complete: jump to the final test of the result. */
add_jump(compiler, &check_result, JUMP(SLJIT_JUMP));
/* The charoffset resets after the end of a branch is reached. */
charoffset = ranges.ranges[0];
/* Continue with the most recently postponed upper half. */
depth--;
first_item = ranges.stack[depth].first_item;
last_item = ranges.stack[depth].last_item;
JUMPHERE(ranges.stack[depth].jump);
}
if (check_result != NULL)
set_jumps(check_result, LABEL());
/* Final test of the result in TMP2. */
if (has_cmov)
jump = CMP(SLJIT_NOT_EQUAL ^ invertcmp, TMP2, 0, SLJIT_IMM, 0);
else
{
/* The zero flag has been set by the last operation of each group. */
sljit_set_current_flags(compiler, SLJIT_SET_Z);
jump = JUMP(SLJIT_NOT_EQUAL ^ invertcmp);
}
add_jump(compiler, backtracks, jump);
if (found != NULL)
set_jumps(found, LABEL());
if (status & XCLASS_IS_ECLASS)
OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
/* Release the heap block allocated by xclass_compute_ranges. */
if (ranges.stack != ranges.local_stack)
SLJIT_FREE(ranges.stack, compiler->allocator_data);
}
/* Emits the matching path for an extended class. The class body is a postfix
expression: every nested xclass pushes one result bit onto a bit-stack kept in
ECLASS_STACK_DATA (bit 0 is the top), and the operator items combine or invert
the topmost bits. The character that was read is parked in ECLASS_CHAR_DATA so
that TMP1 can be reloaded before each further xclass operand.

  common      compiler state
  cc          points at the link-size field of the class
  backtracks  receives the jumps taken when the class does not match

Returns the address just past the class. */
static PCRE2_SPTR compile_eclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
{
DEFINE_COMPILER;
PCRE2_SPTR eclass_end = cc + GET(cc, 0) - 1;
PCRE2_SPTR first_item;
PCRE2_UCHAR item;
jump_list *no_match;
jump_list *matched = NULL;

cc += LINK_SIZE;

/* Should be optimized later. */
read_char(common, 0, READ_CHAR_MAX, backtracks, 0);

item = *cc;
cc++;
if ((item & ECL_MAP) != 0)
  {
  /* An optional bitmap precedes the expression items. */
  xclass_check_bitset(common, (const sljit_u8 *)cc, &matched, backtracks);
  cc += 32 / sizeof(PCRE2_UCHAR);
  }

first_item = cc;

/* Save the two registers borrowed below in the local frame, clear the
bit-stack and keep a copy of the character. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL0, ECLASS_CHAR_DATA, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL1, ECLASS_STACK_DATA, 0);
OP1(SLJIT_MOV, ECLASS_STACK_DATA, 0, SLJIT_IMM, 0);
OP1(SLJIT_MOV, ECLASS_CHAR_DATA, 0, TMP1, 0);

/* All eclass must start with an xclass. */
SLJIT_ASSERT(*cc == ECL_XCLASS);

while (cc < eclass_end)
  {
  item = *cc;

  if (item == ECL_NOT)
    {
    /* Invert the top bit. */
    cc++;
    OP2(SLJIT_XOR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    continue;
    }

  if (item == ECL_AND)
    {
    /* TMP2 = top bit with every other bit forced to one, so that the AND
    after the pop only affects the new top bit. */
    cc++;
    OP2(SLJIT_OR, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, ~(sljit_sw)1);
    OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    OP2(SLJIT_AND, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0);
    continue;
    }

  if (item == ECL_OR || item == ECL_XOR)
    {
    /* TMP2 = isolated top bit; pop it, then merge it into the new top. */
    cc++;
    OP2(SLJIT_AND, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    OP2(item == ECL_OR ? SLJIT_OR : SLJIT_XOR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0);
    continue;
    }

  /* Anything else must be a nested xclass operand. */
  SLJIT_ASSERT(item == ECL_XCLASS);
  if (cc != first_item)
    {
    /* Reload the character and open a fresh (zero) slot on the bit-stack. */
    OP1(SLJIT_MOV, TMP1, 0, ECLASS_CHAR_DATA, 0);
    OP2(SLJIT_SHL, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1);
    }

  /* A failing operand must not backtrack; it just continues here. */
  no_match = NULL;
  compile_xclass_matchingpath(common, cc + 1 + LINK_SIZE, &no_match, XCLASS_IS_ECLASS);
  set_jumps(no_match, LABEL());

  cc += GET(cc, 1);
  }

/* Test the final result first, restore the borrowed registers, and only then
branch on the zero flag produced by the test. */
OP2U(SLJIT_SUB | SLJIT_SET_Z, ECLASS_STACK_DATA, 0, SLJIT_IMM, 0);
OP1(SLJIT_MOV, ECLASS_CHAR_DATA, 0, SLJIT_MEM1(SLJIT_SP), LOCAL0);
OP1(SLJIT_MOV, ECLASS_STACK_DATA, 0, SLJIT_MEM1(SLJIT_SP), LOCAL1);
add_jump(compiler, backtracks, JUMP(SLJIT_EQUAL));
set_jumps(matched, LABEL());
return eclass_end;
}
/* Generic character matching code. */
#undef SET_CHAR_OFFSET
#undef READ_FROM_CHAR_LIST
#undef XCLASS_LOCAL_RANGES_SIZE
#undef XCLASS_LOCAL_RANGES_LOG2_SIZE
#endif /* SUPPORT_WIDE_CHARS */
/* Emits the comparison of one pattern character (all of its code units when
UTF is enabled) against the subject.

The subject is addressed at negative offsets from STR_PTR
(-context->length), so the code emitted here reads the units that lie
context->length bytes before STR_PTR.

  common      compiler state
  caseless    TRUE if the character is to be compared caselessly
  cc          points at the first code unit of the pattern character
  context     running state shared across consecutive calls (remaining
              length, the register that receives the next load, and the
              buffered units when unaligned reads are available)
  backtracks  receives the jumps taken on mismatch

Returns cc advanced past the compared code units. */
static PCRE2_SPTR byte_sequence_compare(compiler_common *common, BOOL caseless, PCRE2_SPTR cc,
compare_context *context, jump_list **backtracks)
{
DEFINE_COMPILER;
unsigned int othercasebit = 0;
PCRE2_SPTR othercasechar = NULL;
#ifdef SUPPORT_UNICODE
int utflength;
#endif
if (caseless && char_has_othercase(common, cc))
{
othercasebit = char_get_othercase_bit(common, cc);
SLJIT_ASSERT(othercasebit);
/* Extracting bit difference info. The upper part of the value is the offset
(from cc) of the code unit that carries the differing bit; the low 8 bits are
the bit mask itself. */
#if PCRE2_CODE_UNIT_WIDTH == 8
othercasechar = cc + (othercasebit >> 8);
othercasebit &= 0xff;
#elif PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
/* Note that this code only handles characters in the BMP. If there
ever are characters outside the BMP whose othercase differs in only one
bit from itself (there currently are none), this code will need to be
revised for PCRE2_CODE_UNIT_WIDTH == 32. */
othercasechar = cc + (othercasebit >> 9);
/* Bit 0x100 marks that the 8-bit mask applies to the high byte of the unit. */
if ((othercasebit & 0x100) != 0)
othercasebit = (othercasebit & 0xff) << 8;
else
othercasebit &= 0xff;
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
}
/* sourcereg == -1 marks the first call for a sequence: preload the first chunk
into TMP1 and make TMP2 the target of the next load. From here on the two
registers alternate, so the next chunk is loaded before the previous one is
compared. */
if (context->sourcereg == -1)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
/* With unaligned reads available, pick the widest load that the remaining
length allows. */
if (context->length >= 4)
OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
else if (context->length >= 2)
OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
else
#endif
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
#elif PCRE2_CODE_UNIT_WIDTH == 16
#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
if (context->length >= 4)
OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
else
#endif
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
#elif PCRE2_CODE_UNIT_WIDTH == 32
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
context->sourcereg = TMP2;
}
#ifdef SUPPORT_UNICODE
/* Number of code units that make up this pattern character. */
utflength = 1;
if (common->utf && HAS_EXTRALEN(*cc))
utflength += GET_EXTRALEN(*cc);
do
{
#endif
context->length -= IN_UCHARS(1);
#if (defined SLJIT_UNALIGNED && SLJIT_UNALIGNED) && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16)
/* Unaligned read is supported. Buffer the expected unit in context->c and its
case mask in context->oc; the compare is emitted once a full chunk has been
collected. */
if (othercasebit != 0 && othercasechar == cc)
{
context->c.asuchars[context->ucharptr] = *cc | othercasebit;
context->oc.asuchars[context->ucharptr] = othercasebit;
}
else
{
context->c.asuchars[context->ucharptr] = *cc;
context->oc.asuchars[context->ucharptr] = 0;
}
context->ucharptr++;
/* Flush when the buffer matches a chunk size that was loaded, or when the
sequence is exhausted. */
#if PCRE2_CODE_UNIT_WIDTH == 8
if (context->ucharptr >= 4 || context->length == 0 || (context->ucharptr == 2 && context->length == 1))
#else
if (context->ucharptr >= 2 || context->length == 0)
#endif
{
/* Load the following chunk (if any remains) into the idle register... */
if (context->length >= 4)
OP1(SLJIT_MOV_S32, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);
else if (context->length >= 2)
OP1(SLJIT_MOV_U16, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);
#if PCRE2_CODE_UNIT_WIDTH == 8
else if (context->length >= 1)
OP1(SLJIT_MOV_U8, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
/* ...then switch to the register that holds the chunk loaded earlier. */
context->sourcereg = context->sourcereg == TMP1 ? TMP2 : TMP1;
/* Compare according to the number of buffered units. When a case mask is
present it is OR-ed into the subject data, and the expected constant carries
the same bits. */
switch(context->ucharptr)
{
case 4 / sizeof(PCRE2_UCHAR):
if (context->oc.asint != 0)
OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asint);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asint | context->oc.asint));
break;
case 2 / sizeof(PCRE2_UCHAR):
if (context->oc.asushort != 0)
OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asushort);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asushort | context->oc.asushort));
break;
#if PCRE2_CODE_UNIT_WIDTH == 8
case 1:
if (context->oc.asbyte != 0)
OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asbyte);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asbyte | context->oc.asbyte));
break;
#endif
default:
SLJIT_UNREACHABLE();
break;
}
context->ucharptr = 0;
}
#else
/* Unaligned read is unsupported or in 32 bit mode: one code unit is loaded and
compared at a time, still alternating between the two registers. */
if (context->length >= 1)
OP1(MOV_UCHAR, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length);
context->sourcereg = context->sourcereg == TMP1 ? TMP2 : TMP1;
if (othercasebit != 0 && othercasechar == cc)
{
OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, othercasebit);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, *cc | othercasebit));
}
else
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, *cc));
#endif
cc++;
#ifdef SUPPORT_UNICODE
utflength--;
}
while (utflength > 0);
#endif
return cc;
}
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH != 32
/* Runtime helper for the extended grapheme cluster item when the subject is
UTF text that is decoded without validity checks. Starting at cc it consumes
one cluster and returns the address just past it.

The break rules are a copy of the interpreter logic in pcre2_extuni.c. If that
code is updated, this function, and those below it, must be kept in step
(note by PH, June 2024). */
static PCRE2_SPTR SLJIT_FUNC do_extuni_utf(jit_arguments *args, PCRE2_SPTR cc)
{
PCRE2_SPTR subject_begin = args->begin;
PCRE2_SPTR subject_end = args->end;
PCRE2_SPTR prev_start = cc;
PCRE2_SPTR cluster_end = NULL;
PCRE2_SPTR scan;
BOOL at_first_char = TRUE;
BOOL zwj_after_pictographic = FALSE;
int left, right, ri_run;
uint32_t ch;

for (;;)
  {
  GETCHARINC(ch, cc);
  right = UCD_GRAPHBREAK(ch);

  if (at_first_char)
    {
    /* The first character only seeds the left-hand property. */
    at_first_char = FALSE;
    left = right;
    cluster_end = cc;
    }
  else
    {
    /* The property table decides whether a break between left and right
    is mandatory. */
    if ((PRIV(ucp_gbtable)[left] & (1 << right)) == 0)
      break;

    /* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was
    preceded by Extended Pictographic. */
    if (left == ucp_gbZWJ && right == ucp_gbExtended_Pictographic && !zwj_after_pictographic)
      break;

    /* Two Regional Indicators stay together only if an even number of
    Regional Indicators precedes the left-hand one. */
    if (left == ucp_gbRegional_Indicator && right == ucp_gbRegional_Indicator)
      {
      /* scan starts at the left-hand character and walks backwards. */
      scan = prev_start;
      for (ri_run = 0; scan > subject_begin; ri_run++)
        {
        scan--;
        BACKCHAR(scan);
        GETCHAR(ch, scan);
        if (UCD_GRAPHBREAK(ch) != ucp_gbRegional_Indicator)
          break;
        }

      if ((ri_run & 1) != 0)
        break;  /* Grapheme break required */
      }

    /* Remember whether a ZWJ has just followed Extended Pictographic (with
    optional Extend in between; see the next statement). */
    zwj_after_pictographic = (left == ucp_gbExtended_Pictographic && right == ucp_gbZWJ);

    /* Extend after Extended_Pictographic leaves the left-hand property
    untouched, which allows any number of them before a following ZWJ. */
    if (!(right == ucp_gbExtend && left == ucp_gbExtended_Pictographic))
      left = right;

    prev_start = cluster_end;
    cluster_end = cc;
    }

  if (cc >= subject_end)
    break;
  }

return cluster_end;
}
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
/* Runtime helper for the extended grapheme cluster item when the subject may
contain invalid UTF. Starting at cc it consumes one cluster and returns the
address just past it. If the very first character cannot be decoded, the
result is NULL.

The break rules are a copy of the interpreter logic in pcre2_extuni.c. If that
code is updated, this function, and the one below it, must be kept in step
(note by PH, June 2024). */
static PCRE2_SPTR SLJIT_FUNC do_extuni_utf_invalid(jit_arguments *args, PCRE2_SPTR cc)
{
PCRE2_SPTR subject_begin = args->begin;
PCRE2_SPTR subject_end = args->end;
PCRE2_SPTR prev_start = cc;
PCRE2_SPTR cluster_end = NULL;
PCRE2_SPTR scan;
BOOL at_first_char = TRUE;
BOOL zwj_after_pictographic = FALSE;
int left, right, ri_run;
uint32_t ch;

for (;;)
  {
  /* An undecodable sequence ends the cluster at the last good position. */
  GETCHARINC_INVALID(ch, cc, subject_end, break);
  right = UCD_GRAPHBREAK(ch);

  if (at_first_char)
    {
    /* The first character only seeds the left-hand property. */
    at_first_char = FALSE;
    left = right;
    cluster_end = cc;
    }
  else
    {
    /* The property table decides whether a break between left and right
    is mandatory. */
    if ((PRIV(ucp_gbtable)[left] & (1 << right)) == 0)
      break;

    /* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was
    preceded by Extended Pictographic. */
    if (left == ucp_gbZWJ && right == ucp_gbExtended_Pictographic && !zwj_after_pictographic)
      break;

    /* Two Regional Indicators stay together only if an even number of
    Regional Indicators precedes the left-hand one. */
    if (left == ucp_gbRegional_Indicator && right == ucp_gbRegional_Indicator)
      {
      /* scan starts at the left-hand character and walks backwards; an
      undecodable sequence stops the count. */
      scan = prev_start;
      for (ri_run = 0; scan > subject_begin; ri_run++)
        {
        GETCHARBACK_INVALID(ch, scan, subject_begin, break);
        if (UCD_GRAPHBREAK(ch) != ucp_gbRegional_Indicator)
          break;
        }

      if ((ri_run & 1) != 0)
        break;  /* Grapheme break required */
      }

    /* Remember whether a ZWJ has just followed Extended Pictographic (with
    optional Extend in between; see the next statement). */
    zwj_after_pictographic = (left == ucp_gbExtended_Pictographic && right == ucp_gbZWJ);

    /* Extend after Extended_Pictographic leaves the left-hand property
    untouched, which allows any number of them before a following ZWJ. */
    if (!(right == ucp_gbExtend && left == ucp_gbExtended_Pictographic))
      left = right;

    prev_start = cluster_end;
    cluster_end = cc;
    }

  if (cc >= subject_end)
    break;
  }

return cluster_end;
}
/* Runtime helper for the extended grapheme cluster item when every code unit
is a complete character. Starting at cc it consumes one cluster and returns
the address just past it. In 32-bit mode a value of 0x110000 or above ends
the cluster.

The break rules are a copy of the interpreter logic in pcre2_extuni.c. If that
code is updated, this function must be kept in step (note by PH, June 2024). */
static PCRE2_SPTR SLJIT_FUNC do_extuni_no_utf(jit_arguments *args, PCRE2_SPTR cc)
{
PCRE2_SPTR subject_begin = args->begin;
PCRE2_SPTR subject_end = args->end;
PCRE2_SPTR scan;
BOOL zwj_after_pictographic = FALSE;
int left, right, ri_run;
uint32_t ch;

/* Patch by PH: the first unit is read directly instead of via GETCHARINC. */
ch = *cc;
cc++;
#if PCRE2_CODE_UNIT_WIDTH == 32
if (ch >= 0x110000)
  return cc;
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
left = UCD_GRAPHBREAK(ch);

for (; cc < subject_end; cc++)
  {
  ch = *cc;
#if PCRE2_CODE_UNIT_WIDTH == 32
  if (ch >= 0x110000)
    break;
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
  right = UCD_GRAPHBREAK(ch);

  /* The property table decides whether a break between left and right is
  mandatory. */
  if ((PRIV(ucp_gbtable)[left] & (1 << right)) == 0)
    break;

  /* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was
  preceded by Extended Pictographic. */
  if (left == ucp_gbZWJ && right == ucp_gbExtended_Pictographic && !zwj_after_pictographic)
    break;

  /* Two Regional Indicators stay together only if an even number of Regional
  Indicators precedes the left-hand one. */
  if (left == ucp_gbRegional_Indicator && right == ucp_gbRegional_Indicator)
    {
    /* scan starts at the left-hand character and walks backwards. */
    scan = cc - 1;
    for (ri_run = 0; scan > subject_begin; ri_run++)
      {
      scan--;
      ch = *scan;
#if PCRE2_CODE_UNIT_WIDTH == 32
      if (ch >= 0x110000)
        break;
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
      if (UCD_GRAPHBREAK(ch) != ucp_gbRegional_Indicator)
        break;
      }

    if ((ri_run & 1) != 0)
      break;  /* Grapheme break required */
    }

  /* Remember whether a ZWJ has just followed Extended Pictographic (with
  optional Extend in between; see the next statement). */
  zwj_after_pictographic = (left == ucp_gbExtended_Pictographic && right == ucp_gbZWJ);

  /* Extend after Extended_Pictographic leaves the left-hand property
  untouched, which allows any number of them before a following ZWJ. */
  if (!(right == ucp_gbExtend && left == ucp_gbExtended_Pictographic))
    left = right;
  }

return cc;
}
/* Emits the test for a caseless-set property (PT_CLIST). The character that
is read is compared with each member of the NOTACHAR-terminated set found at
PRIV(ucd_caseless_sets) + cc[2]; TMP2 collects the result.

  common      compiler state
  cc          points at the opcode: cc[0] is the opcode, cc[1] is PT_CLIST and
              cc[2] is the offset of the set
  backtracks  receives the jump taken on failure; for OP_PROP that is "no
              member matched", for any other opcode it is "a member matched" */
static void compile_clist(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
{
DEFINE_COMPILER;
const sljit_u32 *cur;
struct sljit_jump *fail_jump;
sljit_u32 lowest = 0, highest = READ_CHAR_MAX;
BOOL use_select = sljit_has_cpu_feature(SLJIT_HAS_CMOV) != 0;

SLJIT_ASSERT(cc[1] == PT_CLIST);

if (cc[0] == OP_PROP)
  {
  /* For the positive form the read can be limited to the span of the set. */
  cur = PRIV(ucd_caseless_sets) + cc[2];
  lowest = *cur;
  highest = lowest;

  for (cur++; *cur != NOTACHAR; cur++)
    {
    if (*cur > highest) highest = *cur;
    if (*cur < lowest) lowest = *cur;
    }
  }

cur = PRIV(ucd_caseless_sets) + cc[2];
SLJIT_ASSERT(cur[0] != NOTACHAR && cur[1] != NOTACHAR);
/* The NOTACHAR is higher than any character. */
SLJIT_ASSERT(cur[0] < cur[1] && cur[1] < cur[2]);

read_char(common, lowest, highest, backtracks, READ_CHAR_UPDATE_STR_PTR);

/* At least two characters are required.
   Otherwise this case would be handled by the normal code path. */
/* NOTACHAR is the unsigned maximum. */

/* Optimizing character pairs, if their difference is power of 2. */
if (is_powerof2(cur[1] ^ cur[0]))
  {
  /* Members 0 and 1 are tested together by forcing the differing bit. */
  OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(cur[1] ^ cur[0]));
  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, cur[1]);
  OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
  cur += 2;
  }
else if (is_powerof2(cur[2] ^ cur[1]))
  {
  SLJIT_ASSERT(cur[2] != NOTACHAR);

  /* Members 1 and 2 are tested together, then member 0 on its own. */
  OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(cur[2] ^ cur[1]));
  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, cur[2]);
  OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);

  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)cur[0]);

  if (use_select)
    {
    SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);
    }
  else
    {
    /* The zero flag is requested only on the last accumulation. */
    OP_FLAGS(SLJIT_OR | ((cur[3] == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);
    }

  cur += 3;
  }
else
  {
  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)cur[0]);
  OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL);
  cur++;
  }

/* Remaining members are tested one by one and accumulated into TMP2. */
while (*cur != NOTACHAR)
  {
  OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)cur[0]);

  if (use_select)
    {
    SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2);
    }
  else
    {
    /* The zero flag is requested only on the last accumulation. */
    OP_FLAGS(SLJIT_OR | ((cur[1] == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL);
    }

  cur++;
  }

if (use_select)
  fail_jump = CMP(cc[0] == OP_PROP ? SLJIT_EQUAL : SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0);
else
  fail_jump = JUMP(cc[0] == OP_PROP ? SLJIT_ZERO : SLJIT_NOT_ZERO);

add_jump(compiler, backtracks, fail_jump);
}
#endif /* SUPPORT_UNICODE */
/* Emits the matching path for a single item that consumes one character (or
one grapheme cluster, newline sequence or code unit, depending on the opcode).

  common         compiler state
  type           the opcode of the item
  cc             points at the data that follows the opcode
  backtracks     receives the jumps taken when the item does not match
  check_str_ptr  when TRUE, detect_partial_match() (or, for OP_CHAR/OP_CHARI in
                 complete mode, a direct comparison with STR_END) is emitted
                 before the subject is read

Returns cc advanced past the data belonging to the item. */
static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr)
{
DEFINE_COMPILER;
int length;
unsigned int c, oc, bit;
compare_context context;
struct sljit_jump *jump[3];
jump_list *end_list;
#ifdef SUPPORT_UNICODE
PCRE2_UCHAR propdata[5];
#endif /* SUPPORT_UNICODE */
switch(type)
{
case OP_NOT_DIGIT:
case OP_DIGIT:
/* Digits are usually 0-9, so it is worth optimizing them. */
if (check_str_ptr)
detect_partial_match(common, backtracks);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_digit, FALSE))
read_char7_type(common, backtracks, type == OP_NOT_DIGIT);
else
#endif
read_char8_type(common, backtracks, type == OP_NOT_DIGIT);
/* Flip the starting bit in the negative case. */
/* TMP1 is tested against the ctype_digit bit; the jump sense depends on
whether the positive or the negated opcode is being compiled. */
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_digit);
add_jump(compiler, backtracks, JUMP(type == OP_DIGIT ? SLJIT_ZERO : SLJIT_NOT_ZERO));
return cc;
case OP_NOT_WHITESPACE:
case OP_WHITESPACE:
/* Same scheme as the digit case, using the ctype_space bit. */
if (check_str_ptr)
detect_partial_match(common, backtracks);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_space, FALSE))
read_char7_type(common, backtracks, type == OP_NOT_WHITESPACE);
else
#endif
read_char8_type(common, backtracks, type == OP_NOT_WHITESPACE);
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_space);
add_jump(compiler, backtracks, JUMP(type == OP_WHITESPACE ? SLJIT_ZERO : SLJIT_NOT_ZERO));
return cc;
case OP_NOT_WORDCHAR:
case OP_WORDCHAR:
/* Same scheme as the digit case, using the ctype_word bit. */
if (check_str_ptr)
detect_partial_match(common, backtracks);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_word, FALSE))
read_char7_type(common, backtracks, type == OP_NOT_WORDCHAR);
else
#endif
read_char8_type(common, backtracks, type == OP_NOT_WORDCHAR);
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_word);
add_jump(compiler, backtracks, JUMP(type == OP_WORDCHAR ? SLJIT_ZERO : SLJIT_NOT_ZERO));
return cc;
case OP_ANY:
if (check_str_ptr)
detect_partial_match(common, backtracks);
read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR);
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
{
/* Fixed two-unit newline: the high byte of common->newline is compared with
the character just read, the low byte with the unit that follows it. */
jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
end_list = NULL;
if (common->mode != PCRE2_JIT_PARTIAL_HARD)
add_jump(compiler, &end_list, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
else
check_str_end(common, &end_list);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline & 0xff));
set_jumps(end_list, LABEL());
JUMPHERE(jump[0]);
}
else
check_newlinechar(common, common->nltype, backtracks, TRUE);
return cc;
case OP_ALLANY:
if (check_str_ptr)
detect_partial_match(common, backtracks);
#ifdef SUPPORT_UNICODE
/* With possibly invalid UTF the character is read through read_char() (given
the backtrack list) instead of skip_valid_char(). */
if (common->utf && common->invalid_utf)
{
read_char(common, 0, READ_CHAR_MAX, backtracks, READ_CHAR_UPDATE_STR_PTR);
return cc;
}
#endif /* SUPPORT_UNICODE */
skip_valid_char(common);
return cc;
case OP_ANYBYTE:
/* Advances STR_PTR by exactly one code unit. */
if (check_str_ptr)
detect_partial_match(common, backtracks);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
return cc;
#ifdef SUPPORT_UNICODE
case OP_NOTPROP:
case OP_PROP:
if (check_str_ptr)
detect_partial_match(common, backtracks);
if (cc[0] == PT_CLIST)
{
/* compile_clist() expects to be given the address of the opcode itself. */
compile_clist(common, cc - 1, backtracks);
return cc + 2;
}
/* Any other property is wrapped into a small temporary xclass body:
a zero flags unit, one (NOT)PROP item and the terminator. */
propdata[0] = 0;
propdata[1] = type == OP_NOTPROP ? XCL_NOTPROP : XCL_PROP;
propdata[2] = cc[0];
propdata[3] = cc[1];
propdata[4] = XCL_END;
compile_xclass_matchingpath(common, propdata, backtracks, 0);
return cc + 2;
#endif
case OP_ANYNL:
if (check_str_ptr)
detect_partial_match(common, backtracks);
read_char(common, common->bsr_nlmin, common->bsr_nlmax, NULL, 0);
jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
/* We don't need to handle soft partial matching case. */
end_list = NULL;
if (common->mode != PCRE2_JIT_PARTIAL_HARD)
add_jump(compiler, &end_list, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
else
check_str_end(common, &end_list);
/* After a CR: TMP1 becomes the result of "next unit == NL" as produced by
OP_FLAGS, scaled to the code unit size, and is added to STR_PTR so that a
following NL is consumed as well, without a branch. */
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, CHAR_NL);
OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
#if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT);
#endif
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
jump[1] = JUMP(SLJIT_JUMP);
JUMPHERE(jump[0]);
/* Not a CR: the character must be one of the other accepted newlines. */
check_newlinechar(common, common->bsr_nltype, backtracks, FALSE);
set_jumps(end_list, LABEL());
JUMPHERE(jump[1]);
return cc;
case OP_NOT_HSPACE:
case OP_HSPACE:
if (check_str_ptr)
detect_partial_match(common, backtracks);
if (type == OP_NOT_HSPACE)
read_char(common, 0x9, 0x3000, backtracks, READ_CHAR_UPDATE_STR_PTR);
else
read_char(common, 0x9, 0x3000, NULL, 0);
/* The test itself lives in a shared routine reached by a fast call; the
code below relies on that routine leaving its result in the zero flag. */
add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL));
sljit_set_current_flags(compiler, SLJIT_SET_Z);
add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
return cc;
case OP_NOT_VSPACE:
case OP_VSPACE:
/* Same scheme as the horizontal space case, using the vspace routine. */
if (check_str_ptr)
detect_partial_match(common, backtracks);
if (type == OP_NOT_VSPACE)
read_char(common, 0xa, 0x2029, backtracks, READ_CHAR_UPDATE_STR_PTR);
else
read_char(common, 0xa, 0x2029, NULL, 0);
add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL));
sljit_set_current_flags(compiler, SLJIT_SET_Z);
add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
return cc;
#ifdef SUPPORT_UNICODE
case OP_EXTUNI:
if (check_str_ptr)
detect_partial_match(common, backtracks);
/* The cluster is found by a C helper. The call takes ARGUMENTS in R0; the
assert documents that STR_PTR already is R1, the second argument. */
SLJIT_ASSERT(TMP1 == SLJIT_R0 && STR_PTR == SLJIT_R1);
OP1(SLJIT_MOV, SLJIT_R0, 0, ARGUMENTS, 0);
#if PCRE2_CODE_UNIT_WIDTH != 32
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2(W, W, W), SLJIT_IMM,
common->utf ? (common->invalid_utf ? SLJIT_FUNC_ADDR(do_extuni_utf_invalid) : SLJIT_FUNC_ADDR(do_extuni_utf)) : SLJIT_FUNC_ADDR(do_extuni_no_utf));
/* The invalid-UTF helper returns NULL when no character could be read. */
if (common->invalid_utf)
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
#else
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2(W, W, W), SLJIT_IMM,
common->invalid_utf ? SLJIT_FUNC_ADDR(do_extuni_utf_invalid) : SLJIT_FUNC_ADDR(do_extuni_no_utf));
if (common->invalid_utf)
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
#endif
/* The helper's return value is the new subject position. */
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
if (common->mode == PCRE2_JIT_PARTIAL_HARD)
{
jump[0] = CMP(SLJIT_LESS, SLJIT_RETURN_REG, 0, STR_END, 0);
/* Since we successfully read a char above, partial matching must occur. */
check_partial(common, TRUE);
JUMPHERE(jump[0]);
}
return cc;
#endif
case OP_CHAR:
case OP_CHARI:
length = 1;
#ifdef SUPPORT_UNICODE
if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);
#endif
if (check_str_ptr && common->mode != PCRE2_JIT_COMPLETE)
detect_partial_match(common, backtracks);
/* Exact characters, and caseless ones whose two cases can be described by
char_get_othercase_bit(), are compared code unit by code unit. */
if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0)
{
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(length));
if (length > 1 || (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE))
add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0));
context.length = IN_UCHARS(length);
context.sourcereg = -1;
#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
context.ucharptr = 0;
#endif
return byte_sequence_compare(common, type == OP_CHARI, cc, &context, backtracks);
}
/* Otherwise the whole character is read and compared with both cases. */
#ifdef SUPPORT_UNICODE
if (common->utf)
{
GETCHAR(c, cc);
}
else
#endif
c = *cc;
SLJIT_ASSERT(type == OP_CHARI && char_has_othercase(common, cc));
if (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE)
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
oc = char_othercase(common, c);
read_char(common, c < oc ? c : oc, c > oc ? c : oc, NULL, 0);
SLJIT_ASSERT(!is_powerof2(c ^ oc));
if (sljit_has_cpu_feature(SLJIT_HAS_CMOV))
{
/* A select is used to fold the other case into c, so that a single
comparison with c covers both cases. */
OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, oc);
SELECT(SLJIT_EQUAL, TMP1, SLJIT_IMM, c, TMP1);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c));
}
else
{
jump[0] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc));
JUMPHERE(jump[0]);
}
return cc + length;
case OP_NOT:
case OP_NOTI:
if (check_str_ptr)
detect_partial_match(common, backtracks);
length = 1;
#ifdef SUPPORT_UNICODE
if (common->utf)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
c = *cc;
if (c < 128 && !common->invalid_utf)
{
/* The excluded character is below 128, so only the first byte of the subject
character has to be compared. */
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
if (type == OP_NOT || !char_has_othercase(common, cc))
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));
else
{
/* Since UTF8 code page is fixed, we know that c is in [a-z] or [A-Z] range. */
OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x20);
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, c | 0x20));
}
/* Skip the variable-length character: one unit always, plus the count
looked up in utf8_table4 when the first byte is 0xc0 or above. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
JUMPHERE(jump[0]);
return cc + 1;
}
else
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
{
GETCHARLEN(c, cc, length);
}
}
else
#endif /* SUPPORT_UNICODE */
c = *cc;
/* General case: read the whole character and backtrack if it equals the
excluded character (or, caselessly, either of its cases). */
if (type == OP_NOT || !char_has_othercase(common, cc))
{
read_char(common, c, c, backtracks, READ_CHAR_UPDATE_STR_PTR);
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));
}
else
{
oc = char_othercase(common, c);
read_char(common, c < oc ? c : oc, c > oc ? c : oc, backtracks, READ_CHAR_UPDATE_STR_PTR);
bit = c ^ oc;
if (is_powerof2(bit))
{
/* The cases differ in a single bit: force it and compare once. */
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, bit);
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit));
}
else
{
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, oc));
}
}
return cc + length;
case OP_CLASS:
case OP_NCLASS:
if (check_str_ptr)
detect_partial_match(common, backtracks);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
/* bit is the upper limit passed to read_char(): 127 when is_char7_bitset()
accepts the bitmap in UTF mode, otherwise 255. */
bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 127 : 255;
if (type == OP_NCLASS)
read_char(common, 0, bit, backtracks, READ_CHAR_UPDATE_STR_PTR);
else
read_char(common, 0, bit, NULL, 0);
#else
if (type == OP_NCLASS)
read_char(common, 0, 255, backtracks, READ_CHAR_UPDATE_STR_PTR);
else
read_char(common, 0, 255, NULL, 0);
#endif
/* When optimize_class() reports success nothing more is emitted here. */
if (optimize_class(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks))
return cc + 32 / sizeof(PCRE2_UCHAR);
/* Characters above the bitmap range: OP_CLASS fails for them, OP_NCLASS
jumps over the bitmap test below. */
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
jump[0] = NULL;
if (common->utf)
{
jump[0] = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, bit);
if (type == OP_CLASS)
{
add_jump(compiler, backtracks, jump[0]);
jump[0] = NULL;
}
}
#elif PCRE2_CODE_UNIT_WIDTH != 8
jump[0] = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);
if (type == OP_CLASS)
{
add_jump(compiler, backtracks, jump[0]);
jump[0] = NULL;
}
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
/* Bitmap lookup: byte index = character >> 3, bit mask = 1 << (character & 7);
a clear bit means failure. */
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc);
OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0);
OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, TMP2, 0);
add_jump(compiler, backtracks, JUMP(SLJIT_ZERO));
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
if (jump[0] != NULL)
JUMPHERE(jump[0]);
#endif
return cc + 32 / sizeof(PCRE2_UCHAR);
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32
case OP_XCLASS:
if (check_str_ptr)
detect_partial_match(common, backtracks);
compile_xclass_matchingpath(common, cc + LINK_SIZE, backtracks, 0);
/* The length stored at cc is used to step over the whole class. */
return cc + GET(cc, 0) - 1;
case OP_ECLASS:
if (check_str_ptr)
detect_partial_match(common, backtracks);
return compile_eclass_matchingpath(common, cc, backtracks);
#endif
}
/* Every opcode handled by this function returns from inside the switch. */
SLJIT_UNREACHABLE();
return cc;
}
/* Emits the matching path starting at the opcode at cc, merging a run of
consecutive fixed-length character opcodes into one length check followed by
code-unit compares. When the first opcode is not such a character, the work is
handed to compile_char1_matchingpath().

  common      compiler state
  cc          points at the first opcode
  ccend       end of the opcodes that may be merged
  backtracks  receives the jumps taken on mismatch

Returns the address of the first opcode that was not compiled here. */
static SLJIT_INLINE PCRE2_SPTR compile_charn_matchingpath(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, jump_list **backtracks)
{
/* This function consumes at least one input character. */
/* To decrease the number of length checks, we try to concatenate the fixed length character sequences. */
DEFINE_COMPILER;
PCRE2_SPTR first_op = cc;
compare_context context;
int units;

context.length = 0;
for (;;)
  {
  if (cc >= ccend)
    break;

  /* Only OP_CHAR, and OP_CHARI whose case difference is either absent or
  expressible as a single bit, contribute to the fixed-length run. */
  units = 0;
  if (*cc == OP_CHAR
      || (*cc == OP_CHARI
          && !(char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0)))
    {
    units = 1;
#ifdef SUPPORT_UNICODE
    if (common->utf && HAS_EXTRALEN(cc[1]))
      units += GET_EXTRALEN(cc[1]);
#endif
    }

  cc += 1 + units;
  context.length += IN_UCHARS(units);

  /* Stop at the first opcode that does not fit, or once the run has grown
  beyond 128 bytes. */
  if (units <= 0 || context.length > 128)
    break;
  }

cc = first_op;

/* A non-fixed length character will be checked if length == 0. */
if (context.length <= 0)
  return compile_char1_matchingpath(common, *cc, cc + 1, backtracks, TRUE);

/* We have a fixed-length byte sequence: advance past it, check the subject
end once, then compare each character at a negative offset. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, context.length);
add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0));

context.sourcereg = -1;
#if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
context.ucharptr = 0;
#endif
do
  {
  cc = byte_sequence_compare(common, *cc == OP_CHARI, cc + 1, &context, backtracks);
  }
while (context.length > 0);

return cc;
}