Update IR

IR commit: 87cba9af675afd2ca20cbaab397ad1c83d700475
2025-08-15 21:48:51 +02:00 · 2024-03-27 00:06:15 +03:00 · 2024-03-27 00:06:15 +03:00 · 9fae55f5db
commit 9fae55f5db
parent 6316eb1b2c
5 changed files with 530 additions and 145 deletions
--- a/ext/opcache/jit/ir/ir.c
+++ b/ext/opcache/jit/ir/ir.c
@@ -1984,10 +1984,25 @@ void _ir_BEGIN(ir_ctx *ctx, ir_ref src)
 	}
 }

+ir_ref _ir_fold_condition(ir_ctx *ctx, ir_ref ref) /* returns op1 of "ref" if "ref" is IR_NE with an integer constant 0 as op2, otherwise "ref" itself */
+{
+	ir_insn *insn = &ctx->ir_base[ref];
+
+	if (insn->op == IR_NE && IR_IS_CONST_REF(insn->op2)) {
+		ir_insn *op2_insn = &ctx->ir_base[insn->op2];
+
+		if (IR_IS_TYPE_INT(op2_insn->type) && op2_insn->val.u64 == 0) {
+			return insn->op1; /* "x != 0": hand back x itself */
+		}
+	}
+	return ref;
+}
+
 ir_ref _ir_IF(ir_ctx *ctx, ir_ref condition)
 {
 	ir_ref if_ref;

+	condition = _ir_fold_condition(ctx, condition);
 	IR_ASSERT(ctx->control);
 	if (IR_IS_CONST_REF(condition)) {
 		condition = ir_ref_is_true(ctx, condition) ? IR_TRUE : IR_FALSE;
--- a/ext/opcache/jit/ir/ir.h
+++ b/ext/opcache/jit/ir/ir.h
@@ -531,8 +531,9 @@ void ir_strtab_free(ir_strtab *strtab);

 /* debug related */
 #ifdef IR_DEBUG
-# define IR_DEBUG_SCCP         (1<<27)
-# define IR_DEBUG_GCM          (1<<28)
+# define IR_DEBUG_SCCP         (1<<26)
+# define IR_DEBUG_GCM          (1<<27)
+# define IR_DEBUG_GCM_SPLIT    (1<<28)
 # define IR_DEBUG_SCHEDULE     (1<<29)
 # define IR_DEBUG_RA           (1<<30)
 #endif
--- a/ext/opcache/jit/ir/ir_gcm.c
+++ b/ext/opcache/jit/ir/ir_gcm.c
@@ -14,13 +14,14 @@
 #define IR_GCM_IS_SCHEDULED_EARLY(b) (((int32_t)(b)) < 0)
 #define IR_GCM_EARLY_BLOCK(b)        ((uint32_t)-((int32_t)(b)))

-static uint32_t ir_gcm_schedule_early(ir_ctx *ctx, ir_ref ref, ir_list *queue_rest)
+#define IR_GCM_SPLIT 1
+
+static uint32_t ir_gcm_schedule_early(ir_ctx *ctx, ir_ref ref, ir_list *queue_late)
 {
 	ir_ref n, *p, input;
 	ir_insn *insn;
 	uint32_t dom_depth;
 	uint32_t b, result;
-	bool reschedule_late = 1;

 	insn = &ctx->ir_base[ref];

@@ -38,25 +39,17 @@ static uint32_t ir_gcm_schedule_early(ir_ctx *ctx, ir_ref ref, ir_list *queue_re
 			if (IR_GCM_IS_SCHEDULED_EARLY(b)) {
 				b = IR_GCM_EARLY_BLOCK(b);
 			} else if (!b) {
-				b = ir_gcm_schedule_early(ctx, input, queue_rest);
+				b = ir_gcm_schedule_early(ctx, input, queue_late);
 			}
 			if (dom_depth < ctx->cfg_blocks[b].dom_depth) {
 				dom_depth = ctx->cfg_blocks[b].dom_depth;
 				result = b;
 			}
-			reschedule_late = 0;
 		}
 	}

 	ctx->cfg_map[ref] = IR_GCM_EARLY_BLOCK(result);
-	if (UNEXPECTED(reschedule_late)) {
-		/* Floating nodes that don't depend on other nodes
-		 * (e.g. only on constants), have to be scheduled to the
-		 * last common ancestor. Otherwise they always go to the
-		 * first block.
-		 */
-		ir_list_push_unchecked(queue_rest, ref);
-	}
+	ir_list_push_unchecked(queue_late, ref);
 	return result;
 }

@@ -80,64 +73,26 @@ static uint32_t ir_gcm_find_lca(ir_ctx *ctx, uint32_t b1, uint32_t b2)
 	return b2;
 }

-static void ir_gcm_schedule_late(ir_ctx *ctx, ir_ref ref, uint32_t b)
+static uint32_t ir_gcm_select_best_block(ir_ctx *ctx, ir_ref ref, uint32_t lca)
 {
-	ir_ref n, *p, use;
-	ir_use_list *use_list;
-	uint32_t lca = 0;
-
-	IR_ASSERT(ctx->ir_base[ref].op != IR_PARAM && ctx->ir_base[ref].op != IR_VAR);
-	IR_ASSERT(ctx->ir_base[ref].op != IR_PHI && ctx->ir_base[ref].op != IR_PI);
-
-	IR_ASSERT(IR_GCM_IS_SCHEDULED_EARLY(b));
-	b = IR_GCM_EARLY_BLOCK(b);
-	ctx->cfg_map[ref] = b;
-	use_list = &ctx->use_lists[ref];
-	n = use_list->count;
-
-	for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
-		use = *p;
-		b = ctx->cfg_map[use];
-		if (IR_GCM_IS_SCHEDULED_EARLY(b)) {
-			ir_gcm_schedule_late(ctx, use, b);
-			b = ctx->cfg_map[use];
-			IR_ASSERT(b != 0);
-		} else if (!b) {
-			continue;
-		} else if (ctx->ir_base[use].op == IR_PHI) {
-			ir_insn *insn = &ctx->ir_base[use];
-			ir_ref *p = insn->ops + 2; /* PHI data inputs */
-			ir_ref *q = ctx->ir_base[insn->op1].ops + 1; /* MERGE inputs */
-			ir_ref n = insn->inputs_count - 1;
-
-			for (;n > 0; p++, q++, n--) {
-				if (*p == ref) {
-					b = ctx->cfg_map[*q];
-					lca = !lca ? b : ir_gcm_find_lca(ctx, lca, b);
-				}
-			}
-			continue;
-		}
-		lca = !lca ? b : ir_gcm_find_lca(ctx, lca, b);
-	}
-
-	IR_ASSERT(lca != 0 && "No Common Ancestor");
-	b = lca;
-
-	if (b != ctx->cfg_map[ref]) {
-		ir_block *bb = &ctx->cfg_blocks[b];
+	ir_block *bb = &ctx->cfg_blocks[lca];
 	uint32_t loop_depth = bb->loop_depth;
+	uint32_t flags, best, b;

-		if (loop_depth) {
-			uint32_t flags;
+	if (!loop_depth) {
+		return lca;
+	}
+
+	if (ctx->ir_base[ref].op >= IR_EQ && ctx->ir_base[ref].op <= IR_UGT) {
+		ir_use_list *use_list = &ctx->use_lists[ref];

-			use_list = &ctx->use_lists[ref];
 		if (use_list->count == 1) {
-				use = ctx->use_edges[use_list->refs];
+			ir_ref use = ctx->use_edges[use_list->refs];
 			ir_insn *insn = &ctx->ir_base[use];
 			if (insn->op == IR_IF || insn->op == IR_GUARD || insn->op == IR_GUARD_NOT) {
-					ctx->cfg_map[ref] = b;
-					return;
+				/* Don't hoist invariant comparison */
+				return lca;
+			}
 		}
 	}

@@ -145,13 +100,16 @@ static void ir_gcm_schedule_late(ir_ctx *ctx, ir_ref ref, uint32_t b)
 	if ((flags & IR_BB_LOOP_WITH_ENTRY)
 	 && !(ctx->binding && ir_binding_find(ctx, ref))) {
 		/* Don't move loop invariant code across an OSR ENTRY if we can't restore it */
-			} else {
+		return lca;
+	}
+
+	best = b = lca;
 	do {
-					lca = bb->dom_parent;
-					bb = &ctx->cfg_blocks[lca];
+		b = bb->dom_parent;
+		bb = &ctx->cfg_blocks[b];
 		if (bb->loop_depth < loop_depth) {
 			if (!bb->loop_depth) {
-							b = lca;
+				best = b;
 				break;
 			}
 			flags = (bb->flags & IR_BB_LOOP_HEADER) ? bb->flags : ctx->cfg_blocks[bb->loop_header].flags;
@@ -160,24 +118,326 @@ static void ir_gcm_schedule_late(ir_ctx *ctx, ir_ref ref, uint32_t b)
 				break;
 			}
 			loop_depth = bb->loop_depth;
-						b = lca;
-					}
-				} while (lca != ctx->cfg_map[ref]);
+			best = b;
 		}
+	} while (b != ctx->cfg_map[ref]);
+
+	return best;
 }

-		ctx->cfg_map[ref] = b;
-		if (ctx->ir_base[ref + 1].op == IR_OVERFLOW) {
-			/* OVERFLOW is a projection and must be scheduled together with previous ADD/SUB/MUL_OV */
-			ctx->cfg_map[ref + 1] = b;
-		}
-	}
-}
+#if IR_GCM_SPLIT
+/* Partially Dead Code Elimination through splitting the node and sinking the clones
+ *
+ * This code is based on Benedikt Meurer's idea, first implemented in V8.
+ * See: https://codereview.chromium.org/899433005
+ */

-static void ir_gcm_schedule_rest(ir_ctx *ctx, ir_ref ref)
+typedef struct _ir_gcm_split_data {
+	ir_sparse_set totally_useful; /* block numbers where the node being split is TOTALLY_USEFUL */
+	ir_list       worklist; /* block numbers still to be checked by ir_split_partially_dead_node() */
+} ir_gcm_split_data;
+
+static void _push_predecessors(ir_ctx *ctx, ir_block *bb, ir_gcm_split_data *data) /* queues on data->worklist every predecessor of "bb" that is not in data->totally_useful */
 {
+	uint32_t *p, i, n = bb->predecessors_count;
+
+	IR_ASSERT(n > 0);
+	p = ctx->cfg_edges + bb->predecessors; /* the n predecessor block numbers are read consecutively from here */
+	do {
+		i = *p;
+		if (!ir_sparse_set_in(&data->totally_useful, i)) {
+			ir_list_push(&data->worklist, i); /* the same block may be queued more than once; nothing here de-duplicates */
+		}
+		p++;
+		n--;
+	} while (n > 0);
+}
+
+static bool _check_successors(ir_ctx *ctx, ir_block *bb, ir_gcm_split_data *data) /* returns 1 if "bb" has at most one successor or all of its successors are in data->totally_useful, 0 otherwise */
+{
+	uint32_t *p, i, n = bb->successors_count;
+
+	if (n <= 1) {
+		IR_ASSERT(ir_sparse_set_in(&data->totally_useful, ctx->cfg_edges[bb->successors])); /* debug-only check: the first successor slot must already be in the set */
+		return 1;
+	}
+
+	p = ctx->cfg_edges + bb->successors; /* the n successor block numbers are read consecutively from here */
+	do {
+		i = *p;
+		if (!ir_sparse_set_in(&data->totally_useful, i)) {
+			return 0; /* one successor outside the set is enough to fail */
+		}
+		p++;
+		n--;
+	} while (n > 0);
+
+	return 1;
+}
+
+static bool ir_split_partially_dead_node(ir_ctx *ctx, ir_ref ref, uint32_t b) /* returns 0 (IR untouched) if "ref" is TOTALLY_USEFUL in block "b", 1 after its uses were partitioned between clones */
+{
+	ir_use_list *use_list;
+	ir_insn *insn;
 	ir_ref n, *p, use;
-	uint32_t b = ctx->cfg_map[ref];
+	uint32_t i;
+	ir_gcm_split_data *data = ctx->data; /* scratch set and worklist installed by ir_gcm() */
+
+	IR_ASSERT(b > 0 && b <= ctx->cfg_blocks_count);
+
+	/* 1. Find a set of blocks where the node is TOTALLY_USEFUL (not PARTIALLY_DEAD)
+	 * 1.1. Collect the blocks where the node is really USED.
+	 */
+	ir_sparse_set_clear(&data->totally_useful);
+
+	use_list = &ctx->use_lists[ref];
+	n = use_list->count;
+	for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
+		use = *p;
+		insn = &ctx->ir_base[use];
+		if (insn->op == IR_PHI) {
+			ir_ref *p = insn->ops + 2; /* PHI data inputs */
+			ir_ref *q = ctx->ir_base[insn->op1].ops + 1; /* MERGE inputs */
+			ir_ref n = insn->inputs_count - 1;
+
+			for (;n > 0; p++, q++, n--) {
+				if (*p == ref) {
+					i = ctx->cfg_map[*q]; /* a PHI operand counts as a use in the block of the matching MERGE input */
+					IR_ASSERT(i > 0 && i <= ctx->cfg_blocks_count);
+					if (!ir_sparse_set_in(&data->totally_useful, i)) {
+						if (i == b) return 0; /* node is totally-useful in the scheduled block */
+						ir_sparse_set_add(&data->totally_useful, i);
+					}
+				}
+			}
+		} else {
+			i = ctx->cfg_map[use];
+			if (!i) {
+				continue; /* uses without a block assignment are ignored in this step */
+			}
+			IR_ASSERT(i > 0 && i <= ctx->cfg_blocks_count);
+			if (!ir_sparse_set_in(&data->totally_useful, i)) {
+				if (i == b) return 0; /* node is totally-useful in the scheduled block */
+				ir_sparse_set_add(&data->totally_useful, i);
+			}
+		}
+	}
+
+#ifdef IR_DEBUG
+	if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
+		bool first = 1;
+		fprintf(stderr, "*** Split partially dead node d_%d scheduled to BB%d\n", ref, b);
+		IR_SPARSE_SET_FOREACH(&data->totally_useful, i) {
+			if (first) {
+				fprintf(stderr, "\td_%d is USED in [BB%d", ref, i);
+				first = 0;
+			} else {
+				fprintf(stderr, ", BB%d", i);
+			}
+		} IR_SPARSE_SET_FOREACH_END();
+		fprintf(stderr, "]\n");
+	}
+#endif
+
+	/* 1.2. Iteratively check the predecessors of already found TOTALLY_USEFUL blocks and
+	 *      add them into TOTALLY_USEFUL set if all of their successors are already there.
+	 */
+	IR_SPARSE_SET_FOREACH(&data->totally_useful, i) {
+		_push_predecessors(ctx, &ctx->cfg_blocks[i], data);
+	} IR_SPARSE_SET_FOREACH_END();
+
+	while (ir_list_len(&data->worklist)) {
+		i = ir_list_pop(&data->worklist);
+		if (!ir_sparse_set_in(&data->totally_useful, i)) {
+			ir_block *bb = &ctx->cfg_blocks[i];
+
+			if (_check_successors(ctx, bb, data)) {
+				if (i == b) {
+					/* node is TOTALLY_USEFUL in the scheduled block */
+					ir_list_clear(&data->worklist);
+					return 0;
+				}
+				ir_sparse_set_add(&data->totally_useful, i);
+				_push_predecessors(ctx, bb, data);
+			}
+		}
+	}
+
+	IR_ASSERT(!ir_sparse_set_in(&data->totally_useful, b));
+
+#ifdef IR_DEBUG
+	if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
+		bool first = 1;
+		IR_SPARSE_SET_FOREACH(&data->totally_useful, i) {
+			if (first) {
+				fprintf(stderr, "\td_%d is TOTALLY_USEFUL in [BB%d", ref, i);
+				first = 0;
+			} else {
+				fprintf(stderr, ", BB%d", i);
+			}
+		} IR_SPARSE_SET_FOREACH_END();
+		fprintf(stderr, "]\n");
+	}
+#endif
+
+	/* 2. Split the USEs into partitions */
+	use_list = &ctx->use_lists[ref];
+	ir_hashtab hash;
+	uint32_t j, clone, clones_count = 0, uses_count = 0;
+	struct {
+		ir_ref   ref; /* the node itself for clone #0, the emitted copy otherwise */
+		uint32_t block; /* block the clone is mapped to */
+		uint32_t use_count; /* number of entries in this clone's list of uses */
+		uint32_t use; /* head of that list: index into "uses", (uint32_t)-1 when empty */
+	} *clones = ir_mem_malloc(sizeof(*clones) * use_list->count); /* NOTE(review): sized by the use count — assumes a PHI using "ref" in k operands appears k times in the use list; confirm */
+	struct {
+		ir_ref   ref; /* the using instruction */
+		uint32_t block; /* block where the use takes place */
+		uint32_t next; /* next entry of the same clone's list, (uint32_t)-1 at the end */
+	} *uses = ir_mem_malloc(sizeof(*uses) * use_list->count);
+
+	ir_hashtab_init(&hash, use_list->count);
+	n = use_list->count;
+	for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
+		use = *p;
+		insn = &ctx->ir_base[use];
+		if (insn->op == IR_PHI) {
+			ir_ref *p = insn->ops + 2; /* PHI data inputs */
+			ir_ref *q = ctx->ir_base[insn->op1].ops + 1; /* MERGE inputs */
+			ir_ref n = insn->inputs_count - 1;
+
+			/* PHIs must be processed once */
+			if (ir_hashtab_find(&hash, -use) != (ir_ref)IR_INVALID_VAL) {
+				continue;
+			}
+			ir_hashtab_add(&hash, -use, IR_NULL); /* the same table is keyed by negated PHI refs ("seen" marks) and by block numbers (clone indexes) */
+			for (;n > 0; p++, q++, n--) {
+				if (*p == ref) {
+					j = i = ctx->cfg_map[*q];
+					while (ir_sparse_set_in(&data->totally_useful, ctx->cfg_blocks[j].idom)) { /* walk up the idom chain while the idom is still in the TOTALLY_USEFUL set */
+						j = ctx->cfg_blocks[j].idom;
+					}
+					clone = ir_hashtab_find(&hash, j);
+					if (clone == IR_INVALID_VAL) {
+						clone = clones_count++;
+						ir_hashtab_add(&hash, j, clone);
+						clones[clone].block = j;
+						clones[clone].use_count = 0;
+						clones[clone].use = (uint32_t)-1;
+					}
+					uses[uses_count].ref = use;
+					uses[uses_count].block = i;
+					uses[uses_count].next = clones[clone].use;
+					clones[clone].use_count++;
+					clones[clone].use = uses_count++;
+				}
+			}
+		} else {
+			j = i = ctx->cfg_map[use];
+			IR_ASSERT(i > 0); /* NOTE(review): step 1.1 skips uses with cfg_map == 0, here they are only asserted against — confirm they cannot occur */
+			while (ir_sparse_set_in(&data->totally_useful, ctx->cfg_blocks[j].idom)) { /* walk up the idom chain while the idom is still in the TOTALLY_USEFUL set */
+				j = ctx->cfg_blocks[j].idom;
+			}
+			clone = ir_hashtab_find(&hash, j);
+			if (clone == IR_INVALID_VAL) {
+				clone = clones_count++;
+				ir_hashtab_add(&hash, j, clone);
+				clones[clone].block = j;
+				clones[clone].use_count = 0;
+				clones[clone].use = -1;
+			}
+			uses[uses_count].ref = use;
+			uses[uses_count].block = i;
+			uses[uses_count].next = clones[clone].use; /* prepend to the clone's list */
+			clones[clone].use_count++;
+			clones[clone].use = uses_count++;
+		}
+	}
+
+#ifdef IR_DEBUG
+	if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
+		for (i = 0; i < clones_count; i++) {
+			uint32_t u = clones[i].use;
+
+			fprintf(stderr, "\tCLONE #%d in BB%d USES(%d)=[d_%d/BB%d",
+				i, clones[i].block, clones[i].use_count, uses[u].ref, uses[u].block);
+			u = uses[u].next;
+			while (u != (uint32_t)-1) {
+				fprintf(stderr, ", d_%d/BB%d", uses[u].ref, uses[u].block);
+				u = uses[u].next;
+			}
+			fprintf(stderr, "]\n");
+		}
+	}
+#endif
+
+	/* Create Clones */
+	insn = &ctx->ir_base[ref];
+	clones[0].ref = ref; /* the original node serves as clone #0 */
+	for (i = 1; i < clones_count; i++) {
+		clones[i].ref = clone = ir_emit(ctx, insn->optx, insn->op1, insn->op2, insn->op3);
+		insn = &ctx->ir_base[ref]; /* re-fetched after ir_emit() — presumably because ir_base may be reallocated; confirm */
+		if (insn->op1 > 0) ir_use_list_add(ctx, insn->op1, clone);
+		if (insn->op2 > 0) ir_use_list_add(ctx, insn->op2, clone);
+		if (insn->op3 > 0) ir_use_list_add(ctx, insn->op3, clone);
+	}
+
+	/* Reconstruct IR: Update DEF->USE lists, CFG mapping, etc. */
+	ctx->use_lists = ir_mem_realloc(ctx->use_lists, ctx->insns_count * sizeof(ir_use_list)); /* presumably grows the array to cover the emitted clones — confirm */
+	ctx->cfg_map = ir_mem_realloc(ctx->cfg_map, ctx->insns_count * sizeof(uint32_t));
+	n = ctx->use_lists[ref].refs; /* the clones' use lists are written back-to-back starting at the original node's first use_edges slot */
+	for (i = 0; i < clones_count; i++) {
+		clone = clones[i].ref;
+		ctx->cfg_map[clone] = clones[i].block;
+		ctx->use_lists[clone].count = clones[i].use_count;
+		ctx->use_lists[clone].refs = n;
+
+		uint32_t u = clones[i].use;
+		while (u != (uint32_t)-1) {
+			use = uses[u].ref;
+			ctx->use_edges[n++] = use;
+			u = uses[u].next;
+			if (i > 0) { /* users of clone #0 already reference "ref" */
+				/* replace inputs */
+				ir_insn *insn = &ctx->ir_base[use];
+				ir_ref k, l = insn->inputs_count;
+
+				for (k = 1; k <= l; k++) {
+					if (ir_insn_op(insn, k) == ref) {
+						if (insn->op == IR_PHI) {
+							j = ctx->cfg_map[ir_insn_op(&ctx->ir_base[insn->op1], k - 1)];
+							while (ir_sparse_set_in(&data->totally_useful, ctx->cfg_blocks[j].idom)) {
+								j = ctx->cfg_blocks[j].idom;
+							}
+							if (j != clones[i].block) {
+								continue; /* this PHI operand belongs to another clone's partition */
+							}
+						}
+						ir_insn_set_op(insn, k, clone);
+						break; /* one operand is rewritten per "uses" entry */
+					}
+				}
+			}
+		}
+	}
+
+	ir_mem_free(uses);
+	ir_mem_free(clones);
+	ir_hashtab_free(&hash);
+
+#ifdef IR_DEBUG
+	if (ctx->flags & IR_DEBUG_GCM_SPLIT) {
+		ir_check(ctx);
+	}
+#endif
+
+	return 1;
+}
+#endif
+
+static void ir_gcm_schedule_late(ir_ctx *ctx, ir_ref ref, uint32_t b)
+{
+	ir_ref n, use;
 	uint32_t lca = 0;

 	IR_ASSERT(ctx->ir_base[ref].op != IR_PARAM && ctx->ir_base[ref].op != IR_VAR);
@@ -186,10 +446,9 @@ static void ir_gcm_schedule_rest(ir_ctx *ctx, ir_ref ref)
 	IR_ASSERT(IR_GCM_IS_SCHEDULED_EARLY(b));
 	b = IR_GCM_EARLY_BLOCK(b);
 	ctx->cfg_map[ref] = b;
-	n = ctx->use_lists[ref].count;

-	for (p = &ctx->use_edges[ctx->use_lists[ref].refs]; n > 0; p++, n--) {
-		use = *p;
+	for (n = 0; n < ctx->use_lists[ref].count; n++) {
+		use = ctx->use_edges[ctx->use_lists[ref].refs + n];
 		b = ctx->cfg_map[use];
 		if (IR_GCM_IS_SCHEDULED_EARLY(b)) {
 			ir_gcm_schedule_late(ctx, use, b);
@@ -215,7 +474,16 @@ static void ir_gcm_schedule_rest(ir_ctx *ctx, ir_ref ref)
 	}

 	IR_ASSERT(lca != 0 && "No Common Ancestor");
-	b = lca;
+
+#if IR_GCM_SPLIT
+	if (ctx->use_lists[ref].count > 1
+	 && ir_split_partially_dead_node(ctx, ref, lca)) {
+		return;
+	}
+#endif
+
+	if (lca != ctx->cfg_map[ref]) {
+		b = ir_gcm_select_best_block(ctx, ref, lca);

 		ctx->cfg_map[ref] = b;
 		if (ctx->ir_base[ref + 1].op == IR_OVERFLOW) {
@@ -223,6 +491,7 @@ static void ir_gcm_schedule_rest(ir_ctx *ctx, ir_ref ref)
 			ctx->cfg_map[ref + 1] = b;
 		}
 	}
+}

 int ir_gcm(ir_ctx *ctx)
 {
@@ -230,7 +499,6 @@ int ir_gcm(ir_ctx *ctx)
 	ir_block *bb;
 	ir_list queue_early;
 	ir_list queue_late;
-	ir_list queue_rest;
 	uint32_t *_blocks, b;
 	ir_insn *insn, *use_insn;
 	ir_use_list *use_list;
@@ -309,7 +577,6 @@ int ir_gcm(ir_ctx *ctx)
 			}
 			if (insn->type != IR_VOID) {
 				IR_ASSERT(ir_op_flags[insn->op] & IR_OP_FLAG_MEM);
-				ir_list_push_unchecked(&queue_late, ref);
 			}
 			ref = insn->op1; /* control predecessor */
 		}
@@ -328,27 +595,17 @@ int ir_gcm(ir_ctx *ctx)
 					if (EXPECTED(ctx->use_lists[ref].count != 0)) {
 						_blocks[ref] = b; /* pin to block */
 						ir_list_push_unchecked(&queue_early, ref);
-						ir_list_push_unchecked(&queue_late, ref);
 					}
 				} else if (use_insn->op == IR_PARAM) {
 					bb->flags |= IR_BB_HAS_PARAM;
 					_blocks[ref] = b; /* pin to block */
-					if (EXPECTED(ctx->use_lists[ref].count != 0)) {
-						ir_list_push_unchecked(&queue_late, ref);
-					}
 				} else if (use_insn->op == IR_VAR) {
 					bb->flags |= IR_BB_HAS_VAR;
 					_blocks[ref] = b; /* pin to block */
-					if (EXPECTED(ctx->use_lists[ref].count != 0)) {
-						/* This is necessary only for VADDR */
-						ir_list_push_unchecked(&queue_late, ref);
 				}
 			}
 		}
 	}
-	}
-
-	ir_list_init(&queue_rest, ctx->insns_count);

 	n = ir_list_len(&queue_early);
 	while (n > 0) {
@@ -359,7 +616,7 @@ int ir_gcm(ir_ctx *ctx)
 		for (p = insn->ops + 2; k > 0; p++, k--) {
 			ref = *p;
 			if (ref > 0 && _blocks[ref] == 0) {
-				ir_gcm_schedule_early(ctx, ref, &queue_rest);
+				ir_gcm_schedule_early(ctx, ref, &queue_late);
 			}
 		}
 	}
@@ -373,31 +630,32 @@ int ir_gcm(ir_ctx *ctx)
 	}
 #endif

+#if IR_GCM_SPLIT
+	ir_gcm_split_data data;
+
+	ir_sparse_set_init(&data.totally_useful, ctx->cfg_blocks_count + 1);
+	ir_list_init(&data.worklist, ctx->cfg_blocks_count + 1);
+	ctx->data = &data;
+#endif
+
 	n = ir_list_len(&queue_late);
 	while (n > 0) {
 		n--;
 		ref = ir_list_at(&queue_late, n);
-		use_list = &ctx->use_lists[ref];
-		k = use_list->count;
-		for (p = &ctx->use_edges[use_list->refs]; k > 0; p++, k--) {
-			ref = *p;
-			b = _blocks[ref];
+		b = ctx->cfg_map[ref];
 		if (IR_GCM_IS_SCHEDULED_EARLY(b)) {
 			ir_gcm_schedule_late(ctx, ref, b);
 		}
 	}
-	}

-	n = ir_list_len(&queue_rest);
-	while (n > 0) {
-		n--;
-		ref = ir_list_at(&queue_rest, n);
-		ir_gcm_schedule_rest(ctx, ref);
-	}
+#if IR_GCM_SPLIT
+	ir_list_free(&data.worklist);
+	ir_sparse_set_free(&data.totally_useful);
+	ctx->data = NULL;
+#endif

 	ir_list_free(&queue_early);
 	ir_list_free(&queue_late);
-	ir_list_free(&queue_rest);

 #ifdef IR_DEBUG
 	if (ctx->flags & IR_DEBUG_GCM) {
@@ -465,6 +723,7 @@ int ir_schedule(ir_ctx *ctx)
 	ir_ref i, j, k, n, *p, *q, ref, new_ref, prev_ref, insns_count, consts_count, use_edges_count;
 	ir_ref *_xlat;
 	ir_ref *edges;
+	ir_ref prev_b_end;
 	uint32_t b, prev_b;
 	uint32_t *_blocks = ctx->cfg_map;
 	ir_ref *_next = ir_mem_malloc(ctx->insns_count * sizeof(ir_ref));
@@ -475,14 +734,15 @@ int ir_schedule(ir_ctx *ctx)
 	ir_use_list *lists, *use_list, *new_list;

 	/* Create a double-linked list of nodes ordered by BB, respecting BB->start and BB->end */
-	prev_b = _blocks[1];
-	IR_ASSERT(prev_b);
+	IR_ASSERT(_blocks[1] == 1);
+	prev_b = 1;
+	prev_b_end = ctx->cfg_blocks[1].end;
 	_prev[1] = 0;
-	_prev[ctx->cfg_blocks[1].end] = 0;
+	_prev[prev_b_end] = 0;
 	for (i = 2, j = 1; i < ctx->insns_count; i++) {
 		b = _blocks[i];
 		IR_ASSERT((int32_t)b >= 0);
-		if (b == prev_b) {
+		if (b == prev_b && i <= prev_b_end) {
 			/* add to the end of the list */
 			_next[j] = i;
 			_prev[i] = j;
@@ -492,6 +752,7 @@ int ir_schedule(ir_ctx *ctx)
 			if (i == bb->start) {
 				IR_ASSERT(bb->end > bb->start);
 				prev_b = b;
+				prev_b_end = bb->end;
 				_prev[bb->end] = 0;
 				/* add to the end of the list */
 				_next[j] = i;
@@ -603,6 +864,29 @@ int ir_schedule(ir_ctx *ctx)
 				insn = &ctx->ir_base[i];
 			}
 		}
+		if (bb->successors_count > 1) {
+			ir_ref input, j = bb->end;
+			ir_insn *end = &ctx->ir_base[j];
+
+			if (end->op == IR_IF) {
+				/* Move condition closer to IF */
+				input = end->op2;
+				if (input > 0 && _blocks[input] == b && !_xlat[input] && _prev[j] != input) {
+					if (input == i) {
+						i = _next[i];
+						insn = &ctx->ir_base[i];
+					}
+					/* remove "input" */
+					_prev[_next[input]] = _prev[input];
+					_next[_prev[input]] = _next[input];
+					/* insert before "j" */
+					_prev[input] = _prev[j];
+					_next[input] = j;
+					_next[_prev[j]] = input;
+					_prev[j] = input;
+				}
+			}
+		}
 		while (i != bb->end) {
 			ir_ref n, j, *p, input;

--- a/ext/opcache/jit/ir/ir_private.h
+++ b/ext/opcache/jit/ir/ir_private.h
@@ -479,6 +479,91 @@ IR_ALWAYS_INLINE int ir_bitset_pop_first(ir_bitset set, uint32_t len)
 	} \
 } while (0)

+/* Sparse Set */
+typedef struct _ir_sparse_set {
+	uint32_t size; /* capacity given to ir_sparse_set_init(); also the offset of "data" inside the allocation */
+	uint32_t len; /* number of members; they occupy data[0 .. len-1] */
+	uint32_t *data; /* dense part at non-negative indexes, sparse part at negative indexes */
+} ir_sparse_set;
+
+#define IR_SPARSE_SET_DENSE(set,  n) (set)->data[n] /* n-th stored member */
+#define IR_SPARSE_SET_SPARSE(set, n) (set)->data[-1 - ((int32_t)(n))] /* slot that records the dense index of value n */
+
+IR_ALWAYS_INLINE void ir_sparse_set_init(ir_sparse_set *set, uint32_t size) /* creates an empty set with room for 2 * size uint32_t slots */
+{
+	set->size = size;
+	set->len = 0;
+	set->data = (uint32_t*)ir_mem_malloc(sizeof(uint32_t) * 2 * size) + size; /* point "size" elements into the buffer so negative indexes reach the sparse half; no slot is written here */
+}
+
+IR_ALWAYS_INLINE void ir_sparse_set_clear(ir_sparse_set *set) /* empties the set without touching the storage */
+{
+	set->len = 0;
+}
+
+IR_ALWAYS_INLINE void ir_sparse_set_free(ir_sparse_set *set) /* releases the storage of the set */
+{
+	ir_mem_free(set->data - set->size); /* undo the "+ size" offset applied in ir_sparse_set_init() */
+}
+
+IR_ALWAYS_INLINE bool ir_sparse_set_empty(const ir_sparse_set *set) /* true when the set has no members */
+{
+	return set->len == 0;
+}
+
+IR_ALWAYS_INLINE bool ir_sparse_set_in(const ir_sparse_set *set, uint32_t n) /* true when n is a member of the set */
+{
+	uint32_t idx = IR_SPARSE_SET_SPARSE(set, n); /* may be a stale or never-written value: init and clear do not write the sparse part */
+
+	return idx < set->len && IR_SPARSE_SET_DENSE(set, idx) == n; /* so the slot only counts if the dense part points back to n */
+}
+
+IR_ALWAYS_INLINE void ir_sparse_set_add(ir_sparse_set *set, uint32_t n) /* adds n; n must not be a member yet (checked by IR_ASSERT only) */
+{
+	uint32_t idx;
+
+	IR_ASSERT(!ir_sparse_set_in(set, n));
+	idx = set->len++; /* append at the end of the dense part */
+	IR_SPARSE_SET_DENSE(set,	idx) = n;
+	IR_SPARSE_SET_SPARSE(set, n) = idx;
+}
+
+IR_ALWAYS_INLINE void ir_sparse_set_del(ir_sparse_set *set, uint32_t n) /* removes n; n must be a member (checked by IR_ASSERT only) */
+{
+	uint32_t last;
+
+	IR_ASSERT(ir_sparse_set_in(set, n));
+	last = IR_SPARSE_SET_DENSE(set,	set->len - 1);
+	if (last != n) {
+		uint32_t idx = IR_SPARSE_SET_SPARSE(set, n);
+
+		IR_SPARSE_SET_DENSE(set, idx) = last; /* move the last member into the freed dense slot */
+		IR_SPARSE_SET_SPARSE(set, last) = idx;
+
+	}
+	set->len--;
+}
+
+IR_ALWAYS_INLINE uint32_t ir_sparse_set_pop(ir_sparse_set *set) /* removes and returns the member stored last in the dense part */
+{
+	if (set->len > 0) {
+		set->len--;
+		return IR_SPARSE_SET_DENSE(set, set->len);
+	}
+	return -1; /* empty set: the caller receives (uint32_t)-1 because of the unsigned return type */
+}
+
+#define IR_SPARSE_SET_FOREACH(set, bit) do { /* assigns each member to "bit" in dense order; "len" is sampled once, before the loop */ \
+	ir_sparse_set *_set = (set); \
+	uint32_t _i, _len = _set->len; \
+	uint32_t *_p = _set->data; \
+	for (_i = 0; _i < _len; _p++, _i++) { \
+		(bit) = *_p; \
+
+#define IR_SPARSE_SET_FOREACH_END() \
+	} \
+} while (0)
+
 /* Bit Queue */
 typedef struct _ir_bitqueue {
 	uint32_t  len;
--- a/ext/opcache/jit/ir/ir_save.c
+++ b/ext/opcache/jit/ir/ir_save.c
@@ -116,7 +116,7 @@ void ir_save(const ir_ctx *ctx, uint32_t save_flags, FILE *f)

 		if ((save_flags & IR_SAVE_CFG)
 		 && ctx->cfg_map
-		 && ctx->cfg_map[i]
+		 && (int32_t)ctx->cfg_map[i] > 0 /* the node may be scheduled incompletely */
 		 && ctx->cfg_blocks[ctx->cfg_map[i]].start == i) {
 			uint32_t b = ctx->cfg_map[i];
 			ir_block *bb = &ctx->cfg_blocks[b];
@@ -288,7 +288,7 @@ void ir_save(const ir_ctx *ctx, uint32_t save_flags, FILE *f)

 		if ((save_flags & IR_SAVE_CFG_MAP)
 		 && ctx->cfg_map
-		 && ctx->cfg_map[i]) {
+		 && ctx->cfg_map[i] > 0) { /* the node may be scheduled incompletely */
 			if (first) {
 				fprintf(f, " #");
 				first = 0;