mirror of
https://github.com/ruby/ruby.git
synced 2025-08-15 13:39:04 +02:00
update to Unicode 11.0.0 (main step, not complete yet)
- common.mk: Change Unicode version to 11.0.0, and Emoji version to 11.0 - test/ruby/enc/test_emoji_breaks.rb: update hard-coded Emoji version - enc/unicode/11.0.0, enc/unicode/11.0.0/casefold.h, enc/unicode/name2ctype.h: Add generated files. Files for Unicode 10.0.0 will be removed once we are sure 11.0.0 works. - lib/unicode_normalize/tables.rb: Updated table. - regparse.c: Almost completely reimplement grapheme cluster detection in function node_extended_grapheme_cluster(). git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@66213 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
78be4478d1
commit
66a6073859
6 changed files with 47704 additions and 407 deletions
483
regparse.c
483
regparse.c
|
@ -5831,6 +5831,7 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
|
|||
Node* list2 = NULL;
|
||||
Node* alt = NULL;
|
||||
Node* alt2 = NULL;
|
||||
Node* top_alt = NULL;
|
||||
BBuf *pbuf1 = NULL;
|
||||
int r = 0;
|
||||
int num1;
|
||||
|
@ -5845,9 +5846,9 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
|
|||
if (ONIGENC_IS_UNICODE(env->enc)) {
|
||||
/* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
|
||||
CClassNode* cc;
|
||||
OnigCodePoint sb_out = (ONIGENC_MBC_MINLEN(env->enc) > 1) ? 0x00 : 0x80;
|
||||
Node **seq = node_array; /* seq[5] */
|
||||
Node **alts = node_array+5; /* alts[4] */
|
||||
/* OnigCodePoint sb_out = (ONIGENC_MBC_MINLEN(env->enc) > 1) ? 0x00 : 0x80; */
|
||||
/* Node **seq = node_array; * seq[5] */
|
||||
/* Node **alts = node_array+5; * alts[4] */
|
||||
|
||||
for (i=0; i<NODE_ARRAY_SIZE; i++)
|
||||
node_array[i] = NULL_NODE;
|
||||
|
@ -5857,320 +5858,183 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
|
|||
* order the various expressions appear in the grammar)
|
||||
* in the old-style parts. It is forwards in the new-style
|
||||
* parts (in blocks ending with create_sequence_node()). */
|
||||
/* Unicode 10.0.0 */
|
||||
/* CRLF
|
||||
* | Prepend*
|
||||
* ( RI-sequence | Hangul-Syllable | !Control )
|
||||
* ( Grapheme_Extend | SpacingMark )*
|
||||
* | . */
|
||||
|
||||
/* Unicode 10.0.0 */
|
||||
/* ( Grapheme_Extend | SpacingMark )* */
|
||||
R_ERR(create_property_node(&np1, env, "Grapheme_Cluster_Break=Extend"));
|
||||
|
||||
cc = NCCLASS(np1);
|
||||
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=SpacingMark", 0, env));
|
||||
R_ERR(add_code_range(&(cc->mbuf), env, 0x200D, 0x200D));
|
||||
|
||||
R_ERR(quantify_node(&np1, 0, REPEAT_INFINITE));
|
||||
|
||||
tmp = node_new_list(np1, NULL_NODE);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
list = tmp;
|
||||
np1 = NULL;
|
||||
|
||||
/* Unicode 10.0.0 */
|
||||
/* ( RI-sequence | Hangul-Syllable | !Control ) */
|
||||
/* !Control */
|
||||
np1 = node_new_cclass();
|
||||
if (IS_NULL(np1)) goto err;
|
||||
cc = NCCLASS(np1);
|
||||
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env));
|
||||
if (! (ONIGENC_MBC_MINLEN(env->enc) > 1)) {
|
||||
BITSET_CLEAR_BIT(cc->bs, 0x0a);
|
||||
BITSET_CLEAR_BIT(cc->bs, 0x0d);
|
||||
}
|
||||
|
||||
tmp = onig_node_new_alt(np1, NULL_NODE);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
alt = tmp;
|
||||
np1 = NULL;
|
||||
|
||||
/* Unicode 10.0.0 */
|
||||
/* Hangul-Syllable
|
||||
* := L* V+ T*
|
||||
* | L* LV V* T*
|
||||
* | L* LVT T*
|
||||
* | L+
|
||||
* | T+ */
|
||||
/* Unicode 11.0.0 */
|
||||
/* Hangul-Syllable
|
||||
* := L* (V+ | LV V* | LVT) T*
|
||||
* | L+
|
||||
* | T+ */
|
||||
/* these are equivalent, so we leave things as is for the moment */
|
||||
|
||||
/* T+ */
|
||||
R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=T", '+'));
|
||||
|
||||
tmp = onig_node_new_alt(np1, alt);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
alt = tmp;
|
||||
np1 = NULL;
|
||||
|
||||
/* L+ */
|
||||
R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=L", '+'));
|
||||
|
||||
tmp = onig_node_new_alt(np1, alt);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
alt = tmp;
|
||||
np1 = NULL;
|
||||
|
||||
/* L* LVT T* */
|
||||
/* Unicode 11.0.0
|
||||
* CRLF (this is added last because it is common with non-Unicode encodings)
|
||||
* | [Control CR LF]
|
||||
* | precore* core postcore*
|
||||
* | . (to catch invalid stuff, because this seems to be spec for String#grapheme_clusters) */
|
||||
{
|
||||
R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=L", '*'));
|
||||
R_ERR(create_property_node(seq+1, env, "Grapheme_Cluster_Break=LVT"));
|
||||
R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=T", '*'));
|
||||
Node *alts[4];
|
||||
|
||||
R_ERR(create_sequence_node(&list2, seq));
|
||||
}
|
||||
|
||||
tmp = onig_node_new_alt(list2, alt);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
alt = tmp;
|
||||
list2 = NULL;
|
||||
|
||||
/* L* LV V* T* */
|
||||
{
|
||||
R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=L", '*'));
|
||||
R_ERR(create_property_node(seq+1, env, "Grapheme_Cluster_Break=LV"));
|
||||
R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=V", '*'));
|
||||
R_ERR(quantify_property_node(seq+3, env, "Grapheme_Cluster_Break=T", '*'));
|
||||
|
||||
R_ERR(create_sequence_node(&list2, seq));
|
||||
}
|
||||
|
||||
tmp = onig_node_new_alt(list2, alt);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
alt = tmp;
|
||||
list2 = NULL;
|
||||
|
||||
/* L* V+ T* */
|
||||
{
|
||||
R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=L", '*'));
|
||||
R_ERR(quantify_property_node(seq+1, env, "Grapheme_Cluster_Break=V", '+'));
|
||||
R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=T", '*'));
|
||||
|
||||
R_ERR(create_sequence_node(&list2, seq));
|
||||
}
|
||||
|
||||
tmp = onig_node_new_alt(list2, alt);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
alt = tmp;
|
||||
list2 = NULL;
|
||||
/* end of Hangul-Syllable */
|
||||
|
||||
/* Unicode 10.0.0 */
|
||||
/* Emoji sequence := (E_Base | EBG) Extend* E_Modifier?
|
||||
* (ZWJ (Glue_After_Zwj | EBG Extend* E_Modifier?) )* */
|
||||
/* ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?) */
|
||||
|
||||
{
|
||||
/* Unicode 10.0.0 */
|
||||
/* Emoji variation sequence
|
||||
* http://unicode.org/Public/emoji/4.0/emoji-zwj-sequences.txt
|
||||
*/
|
||||
/* Emoji U+FE0F */
|
||||
{
|
||||
seq[0] = node_new_cclass();
|
||||
if (IS_NULL(seq[0])) goto err;
|
||||
cc = NCCLASS(seq[0]);
|
||||
R_ERR(add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, onigenc_unicode_GCB_ranges_Emoji));
|
||||
|
||||
r = ONIGENC_CODE_TO_MBC(env->enc, 0xfe0f, buf); /* VARIATION SELECTOR-16 */
|
||||
if (r < 0) goto err;
|
||||
seq[1] = node_new_str_raw(buf, buf + r);
|
||||
if (IS_NULL(seq[1])) goto err;
|
||||
R_ERR(quantify_node(seq+1, 0, 1));
|
||||
|
||||
R_ERR(create_sequence_node(alts+0, seq));
|
||||
/* [Control CR LF] (CR and LF are not in the spec, but this is a conformed fix) */
|
||||
alts[0] = node_new_cclass();
|
||||
if (IS_NULL(alts[0])) goto err;
|
||||
cc = NCCLASS(alts[0]);
|
||||
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
|
||||
if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
|
||||
R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */
|
||||
R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */
|
||||
}
|
||||
else {
|
||||
BITSET_SET_BIT(cc->bs, 0x0a);
|
||||
BITSET_SET_BIT(cc->bs, 0x0d);
|
||||
}
|
||||
|
||||
/* Unicode 10.0.0 */
|
||||
/* Glue_After_Zwj */
|
||||
/* precore* core postcore* */
|
||||
{
|
||||
seq[0] = node_new_cclass();
|
||||
if (IS_NULL(seq[0])) goto err;
|
||||
cc = NCCLASS(seq[0]);
|
||||
R_ERR(add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, onigenc_unicode_GCB_ranges_GAZ));
|
||||
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Glue_After_Zwj", 0, env));
|
||||
Node *seq[4];
|
||||
|
||||
R_ERR(quantify_property_node(seq+1, env, "Grapheme_Cluster_Break=Extend", '*'));
|
||||
/* precore*; precore := Prepend */
|
||||
R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=Prepend", '*'));
|
||||
|
||||
/* core := hangul-syllable
|
||||
* | ri-sequence
|
||||
* | xpicto-sequence
|
||||
* | [^Control CR LF] */
|
||||
{
|
||||
Node *core_alts[7];
|
||||
|
||||
/* hangul-syllable :=
|
||||
* L* (V+ | LV V* | LVT) T*
|
||||
* | L+
|
||||
* | T+ */
|
||||
/* hangul-syllable is an alternative (would be called H_alt)
|
||||
* inside an alternative, but we flatten it into core_alts */
|
||||
|
||||
/* L* (V+ | LV V* | LVT) T* */
|
||||
{
|
||||
Node *H_seq[4];
|
||||
|
||||
R_ERR(quantify_property_node(H_seq+0, env, "Grapheme_Cluster_Break=L", '*'));
|
||||
|
||||
/* V+ | LV V* | LVT */
|
||||
{
|
||||
Node *H_alt2[4];
|
||||
|
||||
R_ERR(quantify_property_node(H_alt2+0, env, "Grapheme_Cluster_Break=V", '+'));
|
||||
|
||||
/* LV V* */
|
||||
{
|
||||
Node *H_seq2[3];
|
||||
|
||||
R_ERR(create_property_node(H_seq2+0, env, "Grapheme_Cluster_Break=LV"));
|
||||
R_ERR(quantify_property_node(H_seq2+1, env, "Grapheme_Cluster_Break=V", '*'));
|
||||
|
||||
H_seq2[2] = NULL_NODE;
|
||||
R_ERR(create_sequence_node(H_alt2+1, H_seq2));
|
||||
}
|
||||
|
||||
R_ERR(create_property_node(H_alt2+2, env, "Grapheme_Cluster_Break=LVT"));
|
||||
|
||||
H_alt2[3] = NULL_NODE;
|
||||
R_ERR(create_alternate_node(H_seq+1, H_alt2));
|
||||
}
|
||||
|
||||
R_ERR(quantify_property_node(H_seq+2, env, "Grapheme_Cluster_Break=T", '*'));
|
||||
|
||||
H_seq[3] = NULL_NODE;
|
||||
R_ERR(create_sequence_node(core_alts+0, H_seq));
|
||||
}
|
||||
/* end of L* (V+ | LV V* | LVT) T*, result is in core_alts[0] */
|
||||
|
||||
/* L+ */
|
||||
R_ERR(quantify_property_node(core_alts+1, env, "Grapheme_Cluster_Break=L", '+'));
|
||||
|
||||
/* T+ */
|
||||
R_ERR(quantify_property_node(core_alts+2, env, "Grapheme_Cluster_Break=T", '+'));
|
||||
/* end of hangul-syllable */
|
||||
|
||||
/* ri-sequence := RI RI */
|
||||
R_ERR(quantify_property_node(core_alts+3, env, "Regional_Indicator", '2'));
|
||||
|
||||
/* xpicto-sequence := \p{Extended_Pictographic} (Extend* ZWJ \p{Extended_Pictographic})* */
|
||||
{
|
||||
Node *XP_seq[3];
|
||||
|
||||
R_ERR(create_property_node(XP_seq+0, env, "Extended_Pictographic"));
|
||||
|
||||
/* (Extend* ZWJ \p{Extended_Pictographic})* */
|
||||
{
|
||||
Node *Ex_seq[4];
|
||||
|
||||
R_ERR(quantify_property_node(Ex_seq+0, env, "Grapheme_Cluster_Break=Extend", '*'));
|
||||
|
||||
/* ZWJ (ZERO WIDTH JOINER) */
|
||||
r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf);
|
||||
if (r < 0) goto err;
|
||||
Ex_seq[1] = node_new_str_raw(buf, buf + r);
|
||||
if (IS_NULL(Ex_seq[1])) goto err;
|
||||
|
||||
R_ERR(create_property_node(Ex_seq+2, env, "Extended_Pictographic"));
|
||||
|
||||
Ex_seq[3] = NULL_NODE;
|
||||
R_ERR(create_sequence_node(XP_seq+1, Ex_seq));
|
||||
}
|
||||
R_ERR(quantify_node(XP_seq+1, 0, REPEAT_INFINITE)); /* TODO: Check about node freeing */
|
||||
/* end of (Extend* ZWJ \p{Extended_Pictographic})* */
|
||||
|
||||
XP_seq[2] = NULL_NODE;
|
||||
R_ERR(create_sequence_node(core_alts+4, XP_seq));
|
||||
}
|
||||
/* end of xpicto-sequence, result is in core_alts[4] */
|
||||
|
||||
/* [^Control CR LF] */
|
||||
core_alts[5] = node_new_cclass();
|
||||
if (IS_NULL(core_alts[5])) goto err;
|
||||
cc = NCCLASS(core_alts[5]);
|
||||
if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
|
||||
BBuf *inverted_buf = NULL;
|
||||
|
||||
/* Start with a positive buffer and invert at the end,
|
||||
* because otherwise adding single-character ranges works the wrong way. */
|
||||
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
|
||||
R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */
|
||||
R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */
|
||||
R_ERR(not_code_range_buf(env->enc, cc->mbuf, &inverted_buf, env));
|
||||
cc->mbuf = inverted_buf; /* TODO: check what to do with buffer before inversion */
|
||||
}
|
||||
else {
|
||||
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env));
|
||||
BITSET_CLEAR_BIT(cc->bs, 0x0a);
|
||||
BITSET_CLEAR_BIT(cc->bs, 0x0d);
|
||||
}
|
||||
/* end of [^Control CR LF], result in core_alts[5] */
|
||||
|
||||
core_alts[6] = NULL_NODE;
|
||||
R_ERR(create_alternate_node(seq+1, core_alts));
|
||||
}
|
||||
/* end of core := hangul-syllable | ri-sequence | xpicto-sequence | [^Control CR LF],
|
||||
* result is in seq[1] */
|
||||
|
||||
/* postcore*; postcore = [Extend ZWJ SpacingMark] */
|
||||
R_ERR(create_property_node(seq+2, env, "Grapheme_Cluster_Break=Extend"));
|
||||
cc = NCCLASS(seq[2]);
|
||||
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=SpacingMark", 0, env));
|
||||
R_ERR(add_code_range(&(cc->mbuf), env, 0x200D, 0x200D));
|
||||
R_ERR(quantify_node(seq+2, 0, REPEAT_INFINITE));
|
||||
|
||||
seq[3] = NULL_NODE;
|
||||
R_ERR(create_sequence_node(alts+1, seq));
|
||||
}
|
||||
/* end of (precore* core postcore*), result is in alts[1] */
|
||||
|
||||
/* E_Base_GAZ Extend* E_Modifier? */
|
||||
{
|
||||
R_ERR(create_property_node(seq+0, env, "Grapheme_Cluster_Break=E_Base_GAZ"));
|
||||
R_ERR(quantify_property_node(seq+1, env, "Grapheme_Cluster_Break=Extend", '*'));
|
||||
R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=E_Modifier", '?'));
|
||||
/* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
|
||||
/* Not in spec, but added to catch invalid stuff,
|
||||
* because this is spec for String#grapheme_clusters. */
|
||||
np1 = node_new_anychar();
|
||||
if (IS_NULL(np1)) goto err;
|
||||
|
||||
R_ERR(create_sequence_node(alts+2, seq));
|
||||
}
|
||||
option = env->option;
|
||||
ONOFF(option, ONIG_OPTION_MULTILINE, 0);
|
||||
tmp = node_new_option(option);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
NENCLOSE(tmp)->target = np1;
|
||||
alts[2] = tmp;
|
||||
|
||||
R_ERR(create_alternate_node(&alt2, alts));
|
||||
}
|
||||
|
||||
tmp = node_new_list(alt2, NULL_NODE);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
list2 = tmp;
|
||||
alt2 = NULL;
|
||||
|
||||
/* ZWJ */
|
||||
r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); /* ZERO WIDTH JOINER (ZWJ) */
|
||||
if (r < 0) goto err;
|
||||
np1 = node_new_str_raw(buf, buf + r);
|
||||
if (IS_NULL(np1)) goto err;
|
||||
|
||||
tmp = node_new_list(np1, list2);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
list2 = tmp;
|
||||
np1 = NULL;
|
||||
|
||||
R_ERR(quantify_node(&list2, 0, REPEAT_INFINITE));
|
||||
np1 = list2;
|
||||
list2 = NULL;
|
||||
|
||||
tmp = node_new_list(np1, NULL_NODE);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
list2 = tmp;
|
||||
np1 = NULL;
|
||||
|
||||
/* E_Modifier? */
|
||||
R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=E_Modifier", '?'));
|
||||
|
||||
tmp = node_new_list(np1, list2);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
list2 = tmp;
|
||||
np1 = NULL;
|
||||
|
||||
/* Extend* */
|
||||
R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=Extend", '*'));
|
||||
|
||||
tmp = node_new_list(np1, list2);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
list2 = tmp;
|
||||
np1 = NULL;
|
||||
|
||||
/* (E_Base | EBG) */
|
||||
np1 = node_new_cclass();
|
||||
if (IS_NULL(np1)) goto err;
|
||||
cc = NCCLASS(np1);
|
||||
R_ERR(add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, onigenc_unicode_GCB_ranges_E_Base));
|
||||
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=E_Base", 0, env));
|
||||
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=E_Base_GAZ", 0, env));
|
||||
|
||||
tmp = node_new_list(np1, list2);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
list2 = tmp;
|
||||
np1 = NULL;
|
||||
|
||||
tmp = onig_node_new_alt(list2, alt);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
alt = tmp;
|
||||
list2 = NULL;
|
||||
|
||||
/* Unicode 10.0.0 */
|
||||
/* a sequence starting with ZWJ seems artificial, but GraphemeBreakTest
|
||||
* has such examples.
|
||||
* http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.html
|
||||
*/
|
||||
/* ZWJ (E_Base_GAZ | Glue_After_Zwj) E_Modifier? */
|
||||
{
|
||||
r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); /* ZERO WIDTH JOINER (ZWJ) */
|
||||
if (r < 0) goto err;
|
||||
seq[0] = node_new_str_raw(buf, buf + r);
|
||||
if (IS_NULL(seq[0])) goto err;
|
||||
|
||||
seq[1] = node_new_cclass();
|
||||
if (IS_NULL(seq[1])) goto err;
|
||||
cc = NCCLASS(seq[1]);
|
||||
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Glue_After_Zwj", 0, env));
|
||||
R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=E_Base_GAZ", 0, env));
|
||||
|
||||
R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=E_Modifier", '?'));
|
||||
|
||||
R_ERR(create_sequence_node(&list2, seq));
|
||||
} /* End of ZWJ (E_Base_GAZ | Glue_After_Zwj) E_Modifier? */
|
||||
|
||||
tmp = onig_node_new_alt(list2, alt);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
alt = tmp;
|
||||
list2 = NULL;
|
||||
|
||||
/* Unicode 10.0.0/11.0.0 */
|
||||
/* this is Regional_Indicator+ in the Unicode 10.0.0 regular expression,
|
||||
* but the segmentation rules and Unicode 11.0.0 use Regional_Indicator{2}, so no need to fix */
|
||||
/* RI-Sequence := Regional_Indicator{2} */
|
||||
R_ERR(quantify_property_node(&np1, env, "Regional_Indicator", '2'));
|
||||
|
||||
tmp = onig_node_new_alt(np1, alt);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
alt = tmp;
|
||||
np1 = NULL;
|
||||
|
||||
tmp = node_new_list(alt, list);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
list = tmp;
|
||||
alt = NULL;
|
||||
|
||||
/* Prepend* */
|
||||
R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=Prepend", '*'));
|
||||
|
||||
tmp = node_new_list(np1, list);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
list = tmp;
|
||||
np1 = NULL;
|
||||
|
||||
/* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
|
||||
np1 = node_new_anychar();
|
||||
if (IS_NULL(np1)) goto err;
|
||||
|
||||
option = env->option;
|
||||
ONOFF(option, ONIG_OPTION_MULTILINE, 0);
|
||||
tmp = node_new_option(option);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
NENCLOSE(tmp)->target = np1;
|
||||
np1 = tmp;
|
||||
|
||||
tmp = onig_node_new_alt(np1, NULL_NODE);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
alt = tmp;
|
||||
np1 = NULL;
|
||||
|
||||
/* Prepend+ ZWJ* */
|
||||
{
|
||||
R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=Prepend", '+'));
|
||||
|
||||
r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); /* does this belong to Prepend?? */
|
||||
if (r < 0) goto err;
|
||||
seq[1] = node_new_str_raw(buf, buf + r);
|
||||
if (IS_NULL(seq[1])) goto err;
|
||||
R_ERR(quantify_node(seq+1, 0, 1));
|
||||
|
||||
R_ERR(create_sequence_node(&list2, seq));
|
||||
}
|
||||
|
||||
tmp = onig_node_new_alt(list2, alt);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
alt = tmp;
|
||||
list2 = NULL;
|
||||
|
||||
tmp = onig_node_new_alt(list, alt);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
alt = tmp;
|
||||
list = NULL;
|
||||
alts[3] = NULL_NODE;
|
||||
R_ERR(create_alternate_node(&top_alt, alts));
|
||||
} /* end of (CRLF | Control | precore* core postcore*) (without CRLF!), result is in top_alt */
|
||||
}
|
||||
else
|
||||
#endif /* USE_UNICODE_PROPERTIES */
|
||||
|
@ -6186,11 +6050,12 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
|
|||
NENCLOSE(tmp)->target = np1;
|
||||
np1 = tmp;
|
||||
|
||||
alt = onig_node_new_alt(np1, NULL_NODE);
|
||||
if (IS_NULL(alt)) goto err;
|
||||
top_alt = onig_node_new_alt(np1, NULL_NODE);
|
||||
if (IS_NULL(top_alt)) goto err;
|
||||
np1 = NULL;
|
||||
}
|
||||
|
||||
/* add in CRLF to complete (CRLF | Control | precore* core postcore*) */
|
||||
/* \x0D\x0A */
|
||||
r = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
|
||||
if (r < 0) goto err;
|
||||
|
@ -6200,15 +6065,15 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
|
|||
np1 = node_new_str_raw(buf, buf + num1 + r);
|
||||
if (IS_NULL(np1)) goto err;
|
||||
|
||||
tmp = onig_node_new_alt(np1, alt);
|
||||
tmp = onig_node_new_alt(np1, top_alt);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
alt = tmp;
|
||||
top_alt = tmp;
|
||||
np1 = NULL;
|
||||
|
||||
/* (?>\x0D\x0A|...) */
|
||||
tmp = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
|
||||
if (IS_NULL(tmp)) goto err;
|
||||
NENCLOSE(tmp)->target = alt;
|
||||
NENCLOSE(tmp)->target = top_alt;
|
||||
np1 = tmp;
|
||||
|
||||
#ifdef USE_UNICODE_PROPERTIES
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue