diff --git a/ext/opcache/jit/ir/ir.c b/ext/opcache/jit/ir/ir.c index a9f55cc0e46..e90a5e80bf0 100644 --- a/ext/opcache/jit/ir/ir.c +++ b/ext/opcache/jit/ir/ir.c @@ -803,7 +803,9 @@ ir_ref ir_proto(ir_ctx *ctx, uint8_t flags, ir_type ret_type, uint32_t params_co proto->flags = flags; proto->ret_type = ret_type; proto->params_count = params_count; - memcpy(proto->param_types, param_types, params_count); + if (params_count) { + memcpy(proto->param_types, param_types, params_count); + } return ir_strl(ctx, (const char *)proto, offsetof(ir_proto_t, param_types) + params_count); } diff --git a/ext/opcache/jit/ir/ir.h b/ext/opcache/jit/ir/ir.h index ec5e57129c9..be8779e0194 100644 --- a/ext/opcache/jit/ir/ir.h +++ b/ext/opcache/jit/ir/ir.h @@ -854,6 +854,9 @@ void ir_gdb_unregister_all(void); bool ir_gdb_present(void); /* IR load API (implementation in ir_load.c) */ +#define IR_RESOLVE_SYM_ADD_THUNK (1<<0) +#define IR_RESOLVE_SYM_SILENT (1<<1) + struct _ir_loader { uint32_t default_func_flags; bool (*init_module) (ir_loader *loader, const char *name, const char *filename, const char *target); @@ -870,7 +873,7 @@ struct _ir_loader { bool (*sym_data_end) (ir_loader *loader, uint32_t flags); bool (*func_init) (ir_loader *loader, ir_ctx *ctx, const char *name); bool (*func_process) (ir_loader *loader, ir_ctx *ctx, const char *name); - void*(*resolve_sym_name) (ir_loader *loader, const char *name, bool add_thunk); + void*(*resolve_sym_name) (ir_loader *loader, const char *name, uint32_t flags); bool (*has_sym) (ir_loader *loader, const char *name); bool (*add_sym) (ir_loader *loader, const char *name, void *addr); }; diff --git a/ext/opcache/jit/ir/ir_aarch64.dasc b/ext/opcache/jit/ir/ir_aarch64.dasc index 772eea7a5d7..1d927cc8c72 100644 --- a/ext/opcache/jit/ir/ir_aarch64.dasc +++ b/ext/opcache/jit/ir/ir_aarch64.dasc @@ -4366,11 +4366,15 @@ static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn) ir_backend_data *data = ctx->data; dasm_State **Dst = &data->dasm_state; ir_type type = insn->type; - ir_reg def_reg = ctx->regs[def][0]; + ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]); ir_reg op2_reg = ctx->regs[def][2]; ir_reg tmp_reg = ctx->regs[def][3]; int32_t offset; + if (ctx->use_lists[def].count == 1) { + /* dead load */ + return; + } IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE); if (op2_reg != IR_REG_NONE) { if (IR_REG_SPILLED(op2_reg)) { @@ -4394,11 +4398,15 @@ static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn) ir_backend_data *data = ctx->data; dasm_State **Dst = &data->dasm_state; ir_type type = insn->type; - ir_reg def_reg = ctx->regs[def][0]; + ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]); ir_reg op2_reg = ctx->regs[def][2]; ir_reg tmp_reg = ctx->regs[def][3]; int32_t offset; + if (ctx->use_lists[def].count == 1) { + /* dead load */ + return; + } IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE); if (op2_reg != IR_REG_NONE) { if (IR_REG_SPILLED(op2_reg)) { @@ -4935,6 +4943,28 @@ static void ir_emit_tailcall(ir_ctx *ctx, ir_ref def, ir_insn *insn) return; } + /* Move op2 to a tmp register before epilogue if it's in + * used_preserved_regs, because it will be overridden. 
*/ + + ir_reg op2_reg = IR_REG_NONE; + if (!IR_IS_CONST_REF(insn->op2)) { + op2_reg = ctx->regs[def][2]; + IR_ASSERT(op2_reg != IR_REG_NONE); + + if (IR_REG_SPILLED(op2_reg)) { + op2_reg = IR_REG_INT_TMP; + ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2); + } else if (IR_REGSET_IN((ir_regset)ctx->used_preserved_regs, IR_REG_NUM(op2_reg))) { + ir_reg orig_op2_reg = op2_reg; + op2_reg = IR_REG_INT_TMP; + + ir_type type = ctx->ir_base[insn->op2].type; + | ASM_REG_REG_OP mov, type, op2_reg, IR_REG_NUM(orig_op2_reg) + } else { + op2_reg = IR_REG_NUM(op2_reg); + } + } + ir_emit_epilogue(ctx); if (IR_IS_CONST_REF(insn->op2)) { @@ -4947,13 +4977,8 @@ static void ir_emit_tailcall(ir_ctx *ctx, ir_ref def, ir_insn *insn) | br Rx(IR_REG_INT_TMP) } } else { - ir_reg op2_reg = ctx->regs[def][2]; - IR_ASSERT(op2_reg != IR_REG_NONE); - if (IR_REG_SPILLED(op2_reg)) { - op2_reg = IR_REG_NUM(op2_reg); - ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2); - } + IR_ASSERT(!IR_REGSET_IN((ir_regset)ctx->used_preserved_regs, op2_reg)); | br Rx(op2_reg) } } diff --git a/ext/opcache/jit/ir/ir_cfg.c b/ext/opcache/jit/ir/ir_cfg.c index 01532c8ea3e..34375b0a3b5 100644 --- a/ext/opcache/jit/ir/ir_cfg.c +++ b/ext/opcache/jit/ir/ir_cfg.c @@ -244,7 +244,6 @@ int ir_build_cfg(ir_ctx *ctx) _blocks[start] = b; _blocks[end] = b; IR_ASSERT(IR_IS_BB_START(insn->op)); - IR_ASSERT(end > start); bb->start = start; bb->end = end; bb->successors = count; @@ -583,7 +582,6 @@ static int ir_remove_unreachable_blocks(ir_ctx *ctx) return 1; } -#if 0 static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b) { uint32_t i, *p; @@ -607,34 +605,42 @@ static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b) /* Computes dominator tree using algorithm from "A Simple, Fast Dominance Algorithm" by * Cooper, Harvey and Kennedy. 
*/ -int ir_build_dominators_tree(ir_ctx *ctx) +static int ir_build_dominators_tree_slow(ir_ctx *ctx) { uint32_t blocks_count, b, postnum; ir_block *blocks, *bb; uint32_t *edges; bool changed; + blocks = ctx->cfg_blocks; + edges = ctx->cfg_edges; + blocks_count = ctx->cfg_blocks_count; + + /* Clear the dominators tree */ + for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) { + bb->idom = 0; + bb->dom_depth = 0; + bb->dom_child = 0; + bb->dom_next_child = 0; + } + ctx->flags2 &= ~IR_NO_LOOPS; postnum = 1; compute_postnum(ctx, &postnum, 1); - /* Find immediate dominators */ - blocks = ctx->cfg_blocks; - edges = ctx->cfg_edges; - blocks_count = ctx->cfg_blocks_count; + /* Find immediate dominators by iterative fixed-point algorithm */ blocks[1].idom = 1; do { changed = 0; /* Iterating in Reverse Post Order */ for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) { IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE)); + IR_ASSERT(bb->predecessors_count > 0); if (bb->predecessors_count == 1) { uint32_t pred_b = edges[bb->predecessors]; - if (blocks[pred_b].idom <= 0) { - //IR_ASSERT("Wrong blocks order: BB is before its single predecessor"); - } else if (bb->idom != pred_b) { + if (blocks[pred_b].idom > 0 && bb->idom != pred_b) { bb->idom = pred_b; changed = 1; } @@ -680,39 +686,37 @@ int ir_build_dominators_tree(ir_ctx *ctx) } } } while (changed); + + /* Build dominators tree */ blocks[1].idom = 0; blocks[1].dom_depth = 0; - - /* Construct dominators tree */ for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) { - IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE)); - if (bb->idom > 0) { - ir_block *idom_bb = &blocks[bb->idom]; + uint32_t idom = bb->idom; + ir_block *idom_bb = &blocks[idom]; - bb->dom_depth = idom_bb->dom_depth + 1; - /* Sort by block number to traverse children in pre-order */ - if (idom_bb->dom_child == 0) { - idom_bb->dom_child = b; - } else if (b < idom_bb->dom_child) { - bb->dom_next_child = idom_bb->dom_child; - idom_bb->dom_child = b; - } else { - int child = idom_bb->dom_child; - ir_block *child_bb = &blocks[child]; + bb->dom_depth = idom_bb->dom_depth + 1; + /* Sort by block number to traverse children in pre-order */ + if (idom_bb->dom_child == 0) { + idom_bb->dom_child = b; + } else if (b < idom_bb->dom_child) { + bb->dom_next_child = idom_bb->dom_child; + idom_bb->dom_child = b; + } else { + int child = idom_bb->dom_child; + ir_block *child_bb = &blocks[child]; - while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) { - child = child_bb->dom_next_child; - child_bb = &blocks[child]; - } - bb->dom_next_child = child_bb->dom_next_child; - child_bb->dom_next_child = b; + while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) { + child = child_bb->dom_next_child; + child_bb = &blocks[child]; } + bb->dom_next_child = child_bb->dom_next_child; + child_bb->dom_next_child = b; } } return 1; } -#else + /* A single pass modification of "A Simple, Fast Dominance Algorithm" by * Cooper, Harvey and Kennedy, that relays on IR block ordering. * It may fallback to the general slow fixed-point algorithm. */ @@ -747,7 +751,11 @@ int ir_build_dominators_tree(ir_ctx *ctx) if (UNEXPECTED(idom >= b)) { /* In rare cases, LOOP_BEGIN.op1 may be a back-edge. Skip back-edges. 
*/ ctx->flags2 &= ~IR_NO_LOOPS; - IR_ASSERT(k > 1 && "Wrong blocks order: BB is before its single predecessor"); +// IR_ASSERT(k > 1 && "Wrong blocks order: BB is before its single predecessor"); + if (UNEXPECTED(k <= 1)) { + ir_list_free(&worklist); + return ir_build_dominators_tree_slow(ctx); + } ir_list_push(&worklist, idom); while (1) { k--; @@ -942,7 +950,6 @@ static int ir_build_dominators_tree_iterative(ir_ctx *ctx) return 1; } -#endif static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2) { @@ -958,7 +965,7 @@ static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2) int ir_find_loops(ir_ctx *ctx) { - uint32_t i, j, n, count; + uint32_t b, j, n, count; uint32_t *entry_times, *exit_times, *sorted_blocks, time = 1; ir_block *blocks = ctx->cfg_blocks; uint32_t *edges = ctx->cfg_edges; @@ -983,13 +990,13 @@ int ir_find_loops(ir_ctx *ctx) int child; next: - i = ir_worklist_peek(&work); - if (!entry_times[i]) { - entry_times[i] = time++; + b = ir_worklist_peek(&work); + if (!entry_times[b]) { + entry_times[b] = time++; } - /* Visit blocks immediately dominated by i. */ - bb = &blocks[i]; + /* Visit blocks immediately dominated by "b". */ + bb = &blocks[b]; for (child = bb->dom_child; child > 0; child = blocks[child].dom_next_child) { if (ir_worklist_push(&work, child)) { goto next; @@ -999,17 +1006,17 @@ next: /* Visit join edges. */ if (bb->successors_count) { uint32_t *p = edges + bb->successors; - for (j = 0; j < bb->successors_count; j++,p++) { + for (j = 0; j < bb->successors_count; j++, p++) { uint32_t succ = *p; - if (blocks[succ].idom == i) { + if (blocks[succ].idom == b) { continue; } else if (ir_worklist_push(&work, succ)) { goto next; } } } - exit_times[i] = time++; + exit_times[b] = time++; ir_worklist_pop(&work); } @@ -1018,7 +1025,7 @@ next: j = 1; n = 2; while (j != n) { - i = j; + uint32_t i = j; j = n; for (; i < j; i++) { int child; @@ -1030,9 +1037,82 @@ next: count = n; /* Identify loops. See Sreedhar et al, "Identifying Loops Using DJ Graphs". 
*/ + uint32_t prev_dom_depth = blocks[sorted_blocks[n - 1]].dom_depth; + uint32_t prev_irreducible = 0; while (n > 1) { - i = sorted_blocks[--n]; - ir_block *bb = &blocks[i]; + b = sorted_blocks[--n]; + ir_block *bb = &blocks[b]; + + IR_ASSERT(bb->dom_depth <= prev_dom_depth); + if (UNEXPECTED(prev_irreducible) && bb->dom_depth != prev_dom_depth) { + /* process delyed irreducible loops */ + do { + b = sorted_blocks[prev_irreducible]; + bb = &blocks[b]; + if ((bb->flags & IR_BB_IRREDUCIBLE_LOOP) && !bb->loop_depth) { + /* process irreducible loop */ + uint32_t hdr = b; + + bb->loop_depth = 1; + if (ctx->ir_base[bb->start].op == IR_MERGE) { + ctx->ir_base[bb->start].op = IR_LOOP_BEGIN; + } + + /* find the closing edge(s) of the irreucible loop */ + IR_ASSERT(bb->predecessors_count > 1); + uint32_t *p = &edges[bb->predecessors]; + j = bb->predecessors_count; + do { + uint32_t pred = *p; + + if (entry_times[pred] > entry_times[b] && exit_times[pred] < exit_times[b]) { + if (!ir_worklist_len(&work)) { + ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work))); + } + blocks[pred].loop_header = 0; /* support for merged loops */ + ir_worklist_push(&work, pred); + } + p++; + } while (--j); + IR_ASSERT(ir_worklist_len(&work) != 0); + + /* collect members of the irreducible loop */ + while (ir_worklist_len(&work)) { + b = ir_worklist_pop(&work); + if (b != hdr) { + ir_block *bb = &blocks[b]; + bb->loop_header = hdr; + if (bb->predecessors_count) { + uint32_t *p = &edges[bb->predecessors]; + uint32_t n = bb->predecessors_count; + do { + uint32_t pred = *p; + while (blocks[pred].loop_header > 0) { + pred = blocks[pred].loop_header; + } + if (pred != hdr) { + if (entry_times[pred] > entry_times[hdr] && exit_times[pred] < exit_times[hdr]) { + /* "pred" is a descendant of "hdr" */ + ir_worklist_push(&work, pred); + } else { + /* another entry to the irreducible loop */ + bb->flags |= IR_BB_IRREDUCIBLE_LOOP; + if (ctx->ir_base[bb->start].op == IR_MERGE) { + ctx->ir_base[bb->start].op = IR_LOOP_BEGIN; + } + } + } + p++; + } while (--n); + } + } + } + } + } while (--prev_irreducible != n); + prev_irreducible = 0; + b = sorted_blocks[n]; + bb = &blocks[b]; + } if (bb->predecessors_count > 1) { bool irreducible = 0; @@ -1047,7 +1127,7 @@ next: if (bb->idom != pred) { /* In a loop back-edge (back-join edge), the successor dominates the predecessor. */ - if (ir_dominates(blocks, i, pred)) { + if (ir_dominates(blocks, b, pred)) { if (!ir_worklist_len(&work)) { ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work))); } @@ -1056,8 +1136,9 @@ next: } else { /* Otherwise it's a cross-join edge. See if it's a branch to an ancestor on the DJ spanning tree. */ - if (entry_times[pred] > entry_times[i] && exit_times[pred] < exit_times[i]) { + if (entry_times[pred] > entry_times[b] && exit_times[pred] < exit_times[b]) { irreducible = 1; + break; } } } @@ -1065,46 +1146,55 @@ next: } while (--j); if (UNEXPECTED(irreducible)) { - // TODO: Support for irreducible loops ??? 
- bb->flags |= IR_BB_IRREDUCIBLE_LOOP; - ctx->flags2 |= IR_IRREDUCIBLE_CFG; - while (ir_worklist_len(&work)) { - ir_worklist_pop(&work); + bb->flags |= IR_BB_LOOP_HEADER | IR_BB_IRREDUCIBLE_LOOP; + ctx->flags2 |= IR_CFG_HAS_LOOPS | IR_IRREDUCIBLE_CFG; + /* Remember the position of the first irreducible loop to process all the irreducible loops + * after the reducible loops with the same dominator tree depth + */ + if (!prev_irreducible) { + prev_irreducible = n; } + ir_list_clear(&work.l); } else if (ir_worklist_len(&work)) { + /* collect members of the reducible loop */ + uint32_t hdr = b; + bb->flags |= IR_BB_LOOP_HEADER; ctx->flags2 |= IR_CFG_HAS_LOOPS; bb->loop_depth = 1; + if (ctx->ir_base[bb->start].op == IR_MERGE) { + ctx->ir_base[bb->start].op = IR_LOOP_BEGIN; + } while (ir_worklist_len(&work)) { - j = ir_worklist_pop(&work); - while (blocks[j].loop_header > 0) { - j = blocks[j].loop_header; - } - if (j != i) { - ir_block *bb = &blocks[j]; - if (bb->idom == 0 && j != 1) { - /* Ignore blocks that are unreachable or only abnormally reachable. */ - continue; - } - bb->loop_header = i; + b = ir_worklist_pop(&work); + if (b != hdr) { + ir_block *bb = &blocks[b]; + bb->loop_header = hdr; if (bb->predecessors_count) { uint32_t *p = &edges[bb->predecessors]; - j = bb->predecessors_count; + uint32_t n = bb->predecessors_count; do { - ir_worklist_push(&work, *p); + uint32_t pred = *p; + while (blocks[pred].loop_header > 0) { + pred = blocks[pred].loop_header; + } + if (pred != hdr) { + ir_worklist_push(&work, pred); + } p++; - } while (--j); + } while (--n); } } } } } } + IR_ASSERT(!prev_irreducible); if (ctx->flags2 & IR_CFG_HAS_LOOPS) { for (n = 1; n < count; n++) { - i = sorted_blocks[n]; - ir_block *bb = &blocks[i]; + b = sorted_blocks[n]; + ir_block *bb = &blocks[b]; if (bb->loop_header > 0) { ir_block *loop = &blocks[bb->loop_header]; uint32_t loop_depth = loop->loop_depth; @@ -1389,7 +1479,7 @@ restart: goto restart; } } else if (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b) { - ir_dump_cfg(ctx, stderr); + /* not a loop back-edge */ IR_ASSERT(b == predecessor || ctx->cfg_blocks[predecessor].loop_header == b); } } diff --git a/ext/opcache/jit/ir/ir_check.c b/ext/opcache/jit/ir/ir_check.c index f12b4776fa1..a791baef5db 100644 --- a/ext/opcache/jit/ir/ir_check.c +++ b/ext/opcache/jit/ir/ir_check.c @@ -213,13 +213,18 @@ bool ir_check(const ir_ctx *ctx) ok = 0; } } - break; - case IR_OPND_CONTROL_DEP: if ((ctx->flags2 & IR_LINEAR) && use >= i && !(insn->op == IR_LOOP_BEGIN)) { fprintf(stderr, "ir_base[%d].ops[%d] invalid forward reference (%d)\n", i, j, use); ok = 0; + } + break; + case IR_OPND_CONTROL_DEP: + if ((ctx->flags2 & IR_LINEAR) + && use >= i) { + fprintf(stderr, "ir_base[%d].ops[%d] invalid forward reference (%d)\n", i, j, use); + ok = 0; } else if (insn->op == IR_PHI) { ir_insn *merge_insn = &ctx->ir_base[insn->op1]; if (merge_insn->op != IR_MERGE && merge_insn->op != IR_LOOP_BEGIN) { diff --git a/ext/opcache/jit/ir/ir_emit.c b/ext/opcache/jit/ir/ir_emit.c index c82655daf48..fab9f56228d 100644 --- a/ext/opcache/jit/ir/ir_emit.c +++ b/ext/opcache/jit/ir/ir_emit.c @@ -309,7 +309,7 @@ static void* ir_sym_addr(ir_ctx *ctx, const ir_insn *addr_insn) { const char *name = ir_get_str(ctx, addr_insn->val.name); void *addr = (ctx->loader && ctx->loader->resolve_sym_name) ? 
- ctx->loader->resolve_sym_name(ctx->loader, name, 0) : + ctx->loader->resolve_sym_name(ctx->loader, name, IR_RESOLVE_SYM_SILENT) : ir_resolve_sym_name(name); return addr; @@ -320,7 +320,7 @@ static void* ir_sym_val(ir_ctx *ctx, const ir_insn *addr_insn) { const char *name = ir_get_str(ctx, addr_insn->val.name); void *addr = (ctx->loader && ctx->loader->resolve_sym_name) ? - ctx->loader->resolve_sym_name(ctx->loader, name, addr_insn->op == IR_FUNC) : + ctx->loader->resolve_sym_name(ctx->loader, name, addr_insn->op == IR_FUNC ? IR_RESOLVE_SYM_ADD_THUNK : 0) : ir_resolve_sym_name(name); IR_ASSERT(addr); diff --git a/ext/opcache/jit/ir/ir_fold.h b/ext/opcache/jit/ir/ir_fold.h index 88539e52ab0..90112214d0c 100644 --- a/ext/opcache/jit/ir/ir_fold.h +++ b/ext/opcache/jit/ir/ir_fold.h @@ -1909,7 +1909,9 @@ IR_FOLD(SUB(_, SUB)) IR_FOLD(SUB(ADD, ADD)) { if (IR_IS_TYPE_INT(IR_OPT_TYPE(opt))) { - if (op1_insn->op1 == op2_insn->op1) { + if (op1 == op2) { + IR_FOLD_CONST_U(0); + } else if (op1_insn->op1 == op2_insn->op1) { /* (a + b) - (a + c) => b - c */ op1 = op1_insn->op2; op2 = op2_insn->op2; diff --git a/ext/opcache/jit/ir/ir_gcm.c b/ext/opcache/jit/ir/ir_gcm.c index 8bd6be5d10a..0d8a6c2d760 100644 --- a/ext/opcache/jit/ir/ir_gcm.c +++ b/ext/opcache/jit/ir/ir_gcm.c @@ -785,6 +785,139 @@ IR_ALWAYS_INLINE ir_ref ir_count_constant(ir_ref *_xlat, ir_ref ref) return 0; } +IR_ALWAYS_INLINE bool ir_is_good_bb_order(ir_ctx *ctx, uint32_t b, ir_block *bb, ir_ref start) +{ + ir_insn *insn = &ctx->ir_base[start]; + uint32_t n = insn->inputs_count; + ir_ref *p = insn->ops + 1; + + if (n == 1) { + return *p < start; + } else { + IR_ASSERT(n > 1); + for (; n > 0; p++, n--) { + ir_ref input = *p; + if (input < start) { + /* ordered */ + } else if ((bb->flags & IR_BB_LOOP_HEADER) + && (ctx->cfg_map[input] == b || ctx->cfg_blocks[ctx->cfg_map[input]].loop_header == b)) { + /* back-edge of reducible loop */ + } else if ((bb->flags & IR_BB_IRREDUCIBLE_LOOP) + && (ctx->cfg_blocks[ctx->cfg_map[input]].loop_header == ctx->cfg_blocks[b].loop_header)) { + /* closing edge of irreducible loop */ + } else { + return 0; + } + } + return 1; + } +} + +static IR_NEVER_INLINE void ir_fix_bb_order(ir_ctx *ctx, ir_ref *_prev, ir_ref *_next) +{ + uint32_t b, succ, count, *q, *xlat; + ir_block *bb; + ir_ref ref, n, prev; + ir_worklist worklist; + ir_block *new_blocks; + +#if 0 + for (b = 1, bb = ctx->cfg_blocks + 1; b <= ctx->cfg_blocks_count; b++, bb++) { + if (!ir_is_good_bb_order(ctx, b, bb, bb->start)) { + goto fix; + } + } + return; + +fix: +#endif + count = ctx->cfg_blocks_count + 1; + new_blocks = ir_mem_malloc(count * sizeof(ir_block)); + xlat = ir_mem_malloc(count * sizeof(uint32_t)); + ir_worklist_init(&worklist, count); + ir_worklist_push(&worklist, 1); + while (ir_worklist_len(&worklist) != 0) { +next: + b = ir_worklist_peek(&worklist); + bb = &ctx->cfg_blocks[b]; + n = bb->successors_count; + if (n == 1) { + succ = ctx->cfg_edges[bb->successors]; + if (ir_worklist_push(&worklist, succ)) { + goto next; + } + } else if (n > 1) { + uint32_t best = 0; + uint32_t best_loop_depth = 0; + + q = ctx->cfg_edges + bb->successors + n; + do { + q--; + succ = *q; + if (ir_bitset_in(worklist.visited, succ)) { + /* already processed */ + } else if ((ctx->cfg_blocks[succ].flags & IR_BB_LOOP_HEADER) + && (succ == b || ctx->cfg_blocks[b].loop_header == succ)) { + /* back-edge of reducible loop */ + } else if ((ctx->cfg_blocks[succ].flags & IR_BB_IRREDUCIBLE_LOOP) + && (ctx->cfg_blocks[succ].loop_header == ctx->cfg_blocks[b].loop_header)) 
{ + /* closing edge of irreducible loop */ + } else if (!best) { + best = succ; + best_loop_depth = ctx->cfg_blocks[best].loop_depth; + } else if (ctx->cfg_blocks[succ].loop_depth < best_loop_depth) { + /* prefer deeper loop */ + best = succ; + best_loop_depth = ctx->cfg_blocks[best].loop_depth; + } + n--; + } while (n > 0); + if (best) { + ir_worklist_push(&worklist, best); + goto next; + } + } + ir_worklist_pop(&worklist); + count--; + new_blocks[count] = *bb; + xlat[b] = count; + } + IR_ASSERT(count == 1); + xlat[0] = 0; + ir_worklist_free(&worklist); + + prev = 0; + for (b = 1, bb = new_blocks + 1; b <= ctx->cfg_blocks_count; b++, bb++) { + bb->idom = xlat[bb->idom]; + bb->loop_header = xlat[bb->loop_header]; + n = bb->successors_count; + if (n > 0) { + for (q = ctx->cfg_edges + bb->successors; n > 0; q++, n--) { + *q = xlat[*q]; + } + } + n = bb->predecessors_count; + if (n > 0) { + for (q = ctx->cfg_edges + bb->predecessors; n > 0; q++, n--) { + *q = xlat[*q]; + } + } + _next[prev] = bb->start; + _prev[bb->start] = prev; + prev = bb->end; + } + _next[0] = 0; + _next[prev] = 0; + + for (ref = 2; ref < ctx->insns_count; ref++) { + ctx->cfg_map[ref] = xlat[ctx->cfg_map[ref]]; + } + ir_mem_free(xlat); + + ir_mem_free(ctx->cfg_blocks); + ctx->cfg_blocks = new_blocks; +} + int ir_schedule(ir_ctx *ctx) { ir_ctx new_ctx; @@ -800,6 +933,7 @@ int ir_schedule(ir_ctx *ctx) ir_block *bb; ir_insn *insn, *new_insn; ir_use_list *lists, *use_list, *new_list; + bool bad_bb_order = 0; /* Create a double-linked list of nodes ordered by BB, respecting BB->start and BB->end */ IR_ASSERT(_blocks[1] == 1); @@ -818,27 +952,50 @@ int ir_schedule(ir_ctx *ctx) } else if (b > prev_b) { bb = &ctx->cfg_blocks[b]; if (i == bb->start) { - IR_ASSERT(bb->end > bb->start); - prev_b = b; - prev_b_end = bb->end; - _prev[bb->end] = 0; + if (bb->end > bb->start) { + prev_b = b; + prev_b_end = bb->end; + /* add to the end of the list */ + _next[j] = i; + _prev[i] = j; + j = i; + } else { + prev_b = 0; + prev_b_end = 0; + k = bb->end; + while (_blocks[_prev[k]] == b) { + k = _prev[k]; + } + /* insert before "k" */ + _prev[i] = _prev[k]; + _next[i] = k; + _next[_prev[k]] = i; + _prev[k] = i; + } + if (!ir_is_good_bb_order(ctx, b, bb, i)) { + bad_bb_order = 1; + } + } else if (i != bb->end) { + /* move down late (see the following loop) */ + _next[i] = _move_down; + _move_down = i; + } else { + IR_ASSERT(bb->start > bb->end); + prev_b = 0; + prev_b_end = 0; /* add to the end of the list */ _next[j] = i; _prev[i] = j; j = i; - } else { - IR_ASSERT(i != bb->end); - /* move down late (see the following loop) */ - _next[i] = _move_down; - _move_down = i; } } else if (b) { bb = &ctx->cfg_blocks[b]; IR_ASSERT(i != bb->start); - if (_prev[bb->end]) { + if (i > bb->end) { /* move up, insert before the end of the already scheduled BB */ k = bb->end; } else { + IR_ASSERT(i > bb->start); /* move up, insert at the end of the block */ k = ctx->cfg_blocks[b + 1].start; } @@ -883,6 +1040,10 @@ int ir_schedule(ir_ctx *ctx) } #endif + if (bad_bb_order) { + ir_fix_bb_order(ctx, _prev, _next); + } + _xlat = ir_mem_calloc((ctx->consts_count + ctx->insns_count), sizeof(ir_ref)); _xlat += ctx->consts_count; _xlat[IR_TRUE] = IR_TRUE; diff --git a/ext/opcache/jit/ir/ir_private.h b/ext/opcache/jit/ir/ir_private.h index 69a0101d24e..ac952e402f5 100644 --- a/ext/opcache/jit/ir/ir_private.h +++ b/ext/opcache/jit/ir/ir_private.h @@ -62,7 +62,7 @@ #define IR_MAX(a, b) (((a) > (b)) ? (a) : (b)) #define IR_MIN(a, b) (((a) < (b)) ? 
(a) : (b)) -#define IR_IS_POWER_OF_TWO(x) (!((x) & ((x) - 1))) +#define IR_IS_POWER_OF_TWO(x) ((x) && (!((x) & ((x) - 1)))) #define IR_LOG2(x) ir_ntzl(x) diff --git a/ext/opcache/jit/ir/ir_save.c b/ext/opcache/jit/ir/ir_save.c index b12cc267af6..595f2d9d6a2 100644 --- a/ext/opcache/jit/ir/ir_save.c +++ b/ext/opcache/jit/ir/ir_save.c @@ -140,6 +140,9 @@ void ir_save(const ir_ctx *ctx, uint32_t save_flags, FILE *f) fprintf(f, ", loop=BB%d(%d)", bb->loop_header, bb->loop_depth); } } + if (bb->flags & IR_BB_IRREDUCIBLE_LOOP) { + fprintf(f, ", IRREDUCIBLE"); + } if (bb->predecessors_count) { uint32_t i; diff --git a/ext/opcache/jit/ir/ir_sccp.c b/ext/opcache/jit/ir/ir_sccp.c index 2e006516df8..221a86a5ad8 100644 --- a/ext/opcache/jit/ir/ir_sccp.c +++ b/ext/opcache/jit/ir/ir_sccp.c @@ -1732,7 +1732,20 @@ static ir_ref ir_promote_i2i(ir_ctx *ctx, ir_type type, ir_ref ref, ir_ref use, ir_ref *p, n, input; if (IR_IS_CONST_REF(ref)) { - return ir_const(ctx, insn->val, type); + ir_val val; + + switch (type) { + case IR_I8: val.i64 = insn->val.i8; break; + case IR_U8: val.u64 = insn->val.u8; break; + case IR_I16: val.i64 = insn->val.i16; break; + case IR_U16: val.u64 = insn->val.u16; break; + case IR_I32: val.i64 = insn->val.i32; break; + case IR_U32: val.u64 = insn->val.u32; break; + case IR_CHAR:val.i64 = insn->val.i8; break; + case IR_BOOL:val.u64 = insn->val.u8 != 0; break; + default: IR_ASSERT(0); val.u64 = 0; + } + return ir_const(ctx, val, type); } else { ir_bitqueue_add(worklist, ref); switch (insn->op) { diff --git a/ext/opcache/jit/ir/ir_x86.dasc b/ext/opcache/jit/ir/ir_x86.dasc index 76602c2b4bc..d56cb8645e1 100644 --- a/ext/opcache/jit/ir/ir_x86.dasc +++ b/ext/opcache/jit/ir/ir_x86.dasc @@ -6868,7 +6868,24 @@ static void ir_emit_return_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn) ir_backend_data *data = ctx->data; dasm_State **Dst = &data->dasm_state; - if (op2_reg == IR_REG_NONE || IR_REG_SPILLED(op2_reg)) { + if (IR_IS_CONST_REF(insn->op2)) { + ir_insn *value = &ctx->ir_base[insn->op2]; + + if ((type == IR_FLOAT && value->val.f == 0.0) || (type == IR_DOUBLE && value->val.d == 0.0)) { + | fldz + } else if ((type == IR_FLOAT && value->val.f == 1.0) || (type == IR_DOUBLE && value->val.d == 1.0)) { + | fld1 + } else { + int label = ir_const_label(ctx, insn->op2); + + if (type == IR_DOUBLE) { + | fld qword [=>label] + } else { + IR_ASSERT(type == IR_FLOAT); + | fld dword [=>label] + } + } + } else if (op2_reg == IR_REG_NONE || IR_REG_SPILLED(op2_reg)) { ir_reg fp; int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op2, &fp); @@ -8442,11 +8459,15 @@ static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn) ir_backend_data *data = ctx->data; dasm_State **Dst = &data->dasm_state; ir_type type = insn->type; - ir_reg def_reg = ctx->regs[def][0]; + ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]); ir_reg op2_reg = ctx->regs[def][2]; ir_reg tmp_reg = ctx->regs[def][3]; int32_t offset; + if (ctx->use_lists[def].count == 1) { + /* dead load */ + return; + } IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE); if (op2_reg != IR_REG_NONE) { if (IR_REG_SPILLED(op2_reg)) { @@ -8471,11 +8492,15 @@ static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn) ir_backend_data *data = ctx->data; dasm_State **Dst = &data->dasm_state; ir_type type = insn->type; - ir_reg def_reg = ctx->regs[def][0]; + ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]); ir_reg op2_reg = ctx->regs[def][2]; ir_reg tmp_reg = ctx->regs[def][3]; int32_t offset; + if (ctx->use_lists[def].count == 1) { + /* dead load */ 
+ return; + } IR_ASSERT(def_reg != IR_REG_NONE&& tmp_reg != IR_REG_NONE); if (op2_reg != IR_REG_NONE) { if (IR_REG_SPILLED(op2_reg)) { @@ -9221,6 +9246,58 @@ static void ir_emit_tailcall(ir_ctx *ctx, ir_ref def, ir_insn *insn) return; } + /* Move op2 to a tmp register before epilogue if it's in + * used_preserved_regs, because it will be overridden. */ + + ir_reg op2_reg = IR_REG_NONE; + ir_mem mem = IR_MEM_B(IR_REG_NONE); + if (!IR_IS_CONST_REF(insn->op2)) { + op2_reg = ctx->regs[def][2]; + + ir_regset preserved_regs = (ir_regset)ctx->used_preserved_regs | IR_REGSET(IR_REG_STACK_POINTER); + if (ctx->flags & IR_USE_FRAME_POINTER) { + preserved_regs |= IR_REGSET(IR_REG_FRAME_POINTER); + } + + bool is_spill_slot = op2_reg != IR_REG_NONE + && IR_REG_SPILLED(op2_reg) + && ctx->vregs[insn->op2]; + + if (op2_reg != IR_REG_NONE && !is_spill_slot) { + if (IR_REGSET_IN(preserved_regs, IR_REG_NUM(op2_reg))) { + ir_ref orig_op2_reg = op2_reg; + op2_reg = IR_REG_RAX; + + if (IR_REG_SPILLED(orig_op2_reg)) { + ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2); + } else { + ir_type type = ctx->ir_base[insn->op2].type; + | ASM_REG_REG_OP mov, type, op2_reg, IR_REG_NUM(orig_op2_reg) + } + } else { + op2_reg = IR_REG_NUM(op2_reg); + } + } else { + if (ir_rule(ctx, insn->op2) & IR_FUSED) { + IR_ASSERT(op2_reg == IR_REG_NONE); + mem = ir_fuse_load(ctx, def, insn->op2); + } else { + mem = ir_ref_spill_slot(ctx, insn->op2); + } + ir_reg base = IR_MEM_BASE(mem); + ir_reg index = IR_MEM_INDEX(mem); + if ((base != IR_REG_NONE && IR_REGSET_IN(preserved_regs, base)) || + (index != IR_REG_NONE && IR_REGSET_IN(preserved_regs, index))) { + op2_reg = IR_REG_RAX; + + ir_type type = ctx->ir_base[insn->op2].type; + ir_emit_load_mem_int(ctx, type, op2_reg, mem); + } else { + op2_reg = IR_REG_NONE; + } + } + } + ir_emit_epilogue(ctx); if (IR_IS_CONST_REF(insn->op2)) { @@ -9246,22 +9323,10 @@ static void ir_emit_tailcall(ir_ctx *ctx, ir_ref def, ir_insn *insn) |.endif } } else { - ir_reg op2_reg = ctx->regs[def][2]; - if (op2_reg != IR_REG_NONE) { - if (IR_REG_SPILLED(op2_reg)) { - op2_reg = IR_REG_NUM(op2_reg); - ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2); - } + IR_ASSERT(!IR_REGSET_IN((ir_regset)ctx->used_preserved_regs, op2_reg)); | jmp Ra(op2_reg) } else { - ir_mem mem; - - if (ir_rule(ctx, insn->op2) & IR_FUSED) { - mem = ir_fuse_load(ctx, def, insn->op2); - } else { - mem = ir_ref_spill_slot(ctx, insn->op2); - } | ASM_TMEM_OP jmp, aword, mem } }
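
The ir.h hunk above replaces the boolean add_thunk argument of resolve_sym_name with a uint32_t flags bitmask: ir_emit.c now passes IR_RESOLVE_SYM_SILENT when it merely probes for a symbol address (ir_sym_addr) and IR_RESOLVE_SYM_ADD_THUNK when it resolves an IR_FUNC value (ir_sym_val). The following is only a sketch of how an embedder's loader callback might honour the new flags; my_lookup_symbol() and my_create_thunk() are hypothetical placeholders, the real thunk and error behaviour is up to the embedding loader, and the sketch assumes it is compiled next to the IR sources so that ir.h and the ir_loader typedef are available.

#include <stdint.h>
#include <stdio.h>
#include "ir.h"   /* ir_loader, IR_RESOLVE_SYM_ADD_THUNK, IR_RESOLVE_SYM_SILENT */

/* Hypothetical placeholders for the embedder's symbol table / thunk emitter. */
static void *my_lookup_symbol(const char *name) { (void)name; return NULL; }
static void *my_create_thunk(const char *name)  { (void)name; return NULL; }

static void *my_resolve_sym_name(ir_loader *loader, const char *name, uint32_t flags)
{
	void *addr = my_lookup_symbol(name);

	(void)loader;
	if (!addr && (flags & IR_RESOLVE_SYM_ADD_THUNK)) {
		/* Caller resolves a function value and accepts a lazy call thunk. */
		addr = my_create_thunk(name);
	}
	if (!addr && !(flags & IR_RESOLVE_SYM_SILENT)) {
		/* Silent probes (e.g. ir_sym_addr()) must not report a failure. */
		fprintf(stderr, "cannot resolve symbol \"%s\"\n", name);
	}
	return addr;
}

An embedder would install such a callback as the loader's resolve_sym_name member; callers that only test whether a symbol exists get a NULL back without any diagnostic.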
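
The ir_private.h hunk tightens IR_IS_POWER_OF_TWO: the old form !((x) & ((x) - 1)) also accepts x == 0, because 0 & (0 - 1) is 0, so zero was wrongly classified as a power of two; the patched macro adds an explicit (x) && guard. A tiny standalone check of the same property, with hypothetical helper names standing in for the macro:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Old form: wrongly classifies 0 as a power of two (0 & (0 - 1) == 0). */
static bool is_pow2_old(uint64_t x) { return !(x & (x - 1)); }

/* New form, mirroring the patched IR_IS_POWER_OF_TWO: require x != 0. */
static bool is_pow2_new(uint64_t x) { return x && !(x & (x - 1)); }

int main(void)
{
	assert(is_pow2_old(0));                    /* the old bug: 0 passes */
	assert(!is_pow2_new(0));                   /* fixed */
	assert(is_pow2_new(1) && is_pow2_new(64) && !is_pow2_new(48));
	return 0;
}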
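
The ir_sccp.c hunk stops ir_promote_i2i() from reusing a constant's full 64-bit payload when the constant is re-typed to a narrower integer: the stored value is now truncated and sign- or zero-extended according to the target type before a new constant is created. A standalone sketch of the same idea; val_t and truncate_to() are hypothetical stand-ins for IR's ir_val union and the switch added in the patch:

#include <assert.h>
#include <stdint.h>

/* Assumed stand-in for IR's ir_val: constants live in a 64-bit union. */
typedef union { int64_t i64; uint64_t u64; } val_t;

/* Re-extend the stored 64-bit value as the narrower target type,
 * the way the patched ir_promote_i2i() does for IR_I8..IR_U32. */
static val_t truncate_to(val_t v, int bits, int is_signed)
{
	val_t r;
	switch (bits) {
	case 8:  r.i64 = is_signed ? (int8_t)v.i64  : (uint8_t)v.u64;  break;
	case 16: r.i64 = is_signed ? (int16_t)v.i64 : (uint16_t)v.u64; break;
	case 32: r.i64 = is_signed ? (int32_t)v.i64 : (uint32_t)v.u64; break;
	default: r = v; break;
	}
	return r;
}

int main(void)
{
	val_t v = { .i64 = 300 };                     /* does not fit in 8 bits */
	assert(truncate_to(v, 8, 1).i64 == 44);       /* (int8_t)300 == 44 */
	v.i64 = -1;
	assert(truncate_to(v, 16, 0).i64 == 0xFFFF);  /* zero-extended as U16 */
	return 0;
}

Without this re-extension, promoting a constant such as 300 to an 8-bit type would silently keep the out-of-range 64-bit value in the new constant.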