diff --git a/ext/opcache/jit/ir/ir.c b/ext/opcache/jit/ir/ir.c index a9f55cc0e46..e90a5e80bf0 100644 --- a/ext/opcache/jit/ir/ir.c +++ b/ext/opcache/jit/ir/ir.c @@ -803,7 +803,9 @@ ir_ref ir_proto(ir_ctx *ctx, uint8_t flags, ir_type ret_type, uint32_t params_co proto->flags = flags; proto->ret_type = ret_type; proto->params_count = params_count; - memcpy(proto->param_types, param_types, params_count); + if (params_count) { + memcpy(proto->param_types, param_types, params_count); + } return ir_strl(ctx, (const char *)proto, offsetof(ir_proto_t, param_types) + params_count); } diff --git a/ext/opcache/jit/ir/ir.h b/ext/opcache/jit/ir/ir.h index ec5e57129c9..be8779e0194 100644 --- a/ext/opcache/jit/ir/ir.h +++ b/ext/opcache/jit/ir/ir.h @@ -854,6 +854,9 @@ void ir_gdb_unregister_all(void); bool ir_gdb_present(void); /* IR load API (implementation in ir_load.c) */ +#define IR_RESOLVE_SYM_ADD_THUNK (1<<0) +#define IR_RESOLVE_SYM_SILENT (1<<1) + struct _ir_loader { uint32_t default_func_flags; bool (*init_module) (ir_loader *loader, const char *name, const char *filename, const char *target); @@ -870,7 +873,7 @@ struct _ir_loader { bool (*sym_data_end) (ir_loader *loader, uint32_t flags); bool (*func_init) (ir_loader *loader, ir_ctx *ctx, const char *name); bool (*func_process) (ir_loader *loader, ir_ctx *ctx, const char *name); - void*(*resolve_sym_name) (ir_loader *loader, const char *name, bool add_thunk); + void*(*resolve_sym_name) (ir_loader *loader, const char *name, uint32_t flags); bool (*has_sym) (ir_loader *loader, const char *name); bool (*add_sym) (ir_loader *loader, const char *name, void *addr); }; diff --git a/ext/opcache/jit/ir/ir_aarch64.dasc b/ext/opcache/jit/ir/ir_aarch64.dasc index 772eea7a5d7..1d927cc8c72 100644 --- a/ext/opcache/jit/ir/ir_aarch64.dasc +++ b/ext/opcache/jit/ir/ir_aarch64.dasc @@ -4366,11 +4366,15 @@ static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn) ir_backend_data *data = ctx->data; dasm_State **Dst = &data->dasm_state; ir_type type = insn->type; - ir_reg def_reg = ctx->regs[def][0]; + ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]); ir_reg op2_reg = ctx->regs[def][2]; ir_reg tmp_reg = ctx->regs[def][3]; int32_t offset; + if (ctx->use_lists[def].count == 1) { + /* dead load */ + return; + } IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE); if (op2_reg != IR_REG_NONE) { if (IR_REG_SPILLED(op2_reg)) { @@ -4394,11 +4398,15 @@ static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn) ir_backend_data *data = ctx->data; dasm_State **Dst = &data->dasm_state; ir_type type = insn->type; - ir_reg def_reg = ctx->regs[def][0]; + ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]); ir_reg op2_reg = ctx->regs[def][2]; ir_reg tmp_reg = ctx->regs[def][3]; int32_t offset; + if (ctx->use_lists[def].count == 1) { + /* dead load */ + return; + } IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE); if (op2_reg != IR_REG_NONE) { if (IR_REG_SPILLED(op2_reg)) { @@ -4935,6 +4943,28 @@ static void ir_emit_tailcall(ir_ctx *ctx, ir_ref def, ir_insn *insn) return; } + /* Move op2 to a tmp register before epilogue if it's in + * used_preserved_regs, because it will be overridden. 
*/ + + ir_reg op2_reg = IR_REG_NONE; + if (!IR_IS_CONST_REF(insn->op2)) { + op2_reg = ctx->regs[def][2]; + IR_ASSERT(op2_reg != IR_REG_NONE); + + if (IR_REG_SPILLED(op2_reg)) { + op2_reg = IR_REG_INT_TMP; + ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2); + } else if (IR_REGSET_IN((ir_regset)ctx->used_preserved_regs, IR_REG_NUM(op2_reg))) { + ir_reg orig_op2_reg = op2_reg; + op2_reg = IR_REG_INT_TMP; + + ir_type type = ctx->ir_base[insn->op2].type; + | ASM_REG_REG_OP mov, type, op2_reg, IR_REG_NUM(orig_op2_reg) + } else { + op2_reg = IR_REG_NUM(op2_reg); + } + } + ir_emit_epilogue(ctx); if (IR_IS_CONST_REF(insn->op2)) { @@ -4947,13 +4977,8 @@ static void ir_emit_tailcall(ir_ctx *ctx, ir_ref def, ir_insn *insn) | br Rx(IR_REG_INT_TMP) } } else { - ir_reg op2_reg = ctx->regs[def][2]; - IR_ASSERT(op2_reg != IR_REG_NONE); - if (IR_REG_SPILLED(op2_reg)) { - op2_reg = IR_REG_NUM(op2_reg); - ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2); - } + IR_ASSERT(!IR_REGSET_IN((ir_regset)ctx->used_preserved_regs, op2_reg)); | br Rx(op2_reg) } } diff --git a/ext/opcache/jit/ir/ir_cfg.c b/ext/opcache/jit/ir/ir_cfg.c index 01532c8ea3e..34375b0a3b5 100644 --- a/ext/opcache/jit/ir/ir_cfg.c +++ b/ext/opcache/jit/ir/ir_cfg.c @@ -244,7 +244,6 @@ int ir_build_cfg(ir_ctx *ctx) _blocks[start] = b; _blocks[end] = b; IR_ASSERT(IR_IS_BB_START(insn->op)); - IR_ASSERT(end > start); bb->start = start; bb->end = end; bb->successors = count; @@ -583,7 +582,6 @@ static int ir_remove_unreachable_blocks(ir_ctx *ctx) return 1; } -#if 0 static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b) { uint32_t i, *p; @@ -607,34 +605,42 @@ static void compute_postnum(const ir_ctx *ctx, uint32_t *cur, uint32_t b) /* Computes dominator tree using algorithm from "A Simple, Fast Dominance Algorithm" by * Cooper, Harvey and Kennedy. 
*/ -int ir_build_dominators_tree(ir_ctx *ctx) +static int ir_build_dominators_tree_slow(ir_ctx *ctx) { uint32_t blocks_count, b, postnum; ir_block *blocks, *bb; uint32_t *edges; bool changed; + blocks = ctx->cfg_blocks; + edges = ctx->cfg_edges; + blocks_count = ctx->cfg_blocks_count; + + /* Clear the dominators tree */ + for (b = 0, bb = &blocks[0]; b <= blocks_count; b++, bb++) { + bb->idom = 0; + bb->dom_depth = 0; + bb->dom_child = 0; + bb->dom_next_child = 0; + } + ctx->flags2 &= ~IR_NO_LOOPS; postnum = 1; compute_postnum(ctx, &postnum, 1); - /* Find immediate dominators */ - blocks = ctx->cfg_blocks; - edges = ctx->cfg_edges; - blocks_count = ctx->cfg_blocks_count; + /* Find immediate dominators by iterative fixed-point algorithm */ blocks[1].idom = 1; do { changed = 0; /* Iterating in Reverse Post Order */ for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) { IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE)); + IR_ASSERT(bb->predecessors_count > 0); if (bb->predecessors_count == 1) { uint32_t pred_b = edges[bb->predecessors]; - if (blocks[pred_b].idom <= 0) { - //IR_ASSERT("Wrong blocks order: BB is before its single predecessor"); - } else if (bb->idom != pred_b) { + if (blocks[pred_b].idom > 0 && bb->idom != pred_b) { bb->idom = pred_b; changed = 1; } @@ -680,39 +686,37 @@ int ir_build_dominators_tree(ir_ctx *ctx) } } } while (changed); + + /* Build dominators tree */ blocks[1].idom = 0; blocks[1].dom_depth = 0; - - /* Construct dominators tree */ for (b = 2, bb = &blocks[2]; b <= blocks_count; b++, bb++) { - IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE)); - if (bb->idom > 0) { - ir_block *idom_bb = &blocks[bb->idom]; + uint32_t idom = bb->idom; + ir_block *idom_bb = &blocks[idom]; - bb->dom_depth = idom_bb->dom_depth + 1; - /* Sort by block number to traverse children in pre-order */ - if (idom_bb->dom_child == 0) { - idom_bb->dom_child = b; - } else if (b < idom_bb->dom_child) { - bb->dom_next_child = idom_bb->dom_child; - idom_bb->dom_child = b; - } else { - int child = idom_bb->dom_child; - ir_block *child_bb = &blocks[child]; + bb->dom_depth = idom_bb->dom_depth + 1; + /* Sort by block number to traverse children in pre-order */ + if (idom_bb->dom_child == 0) { + idom_bb->dom_child = b; + } else if (b < idom_bb->dom_child) { + bb->dom_next_child = idom_bb->dom_child; + idom_bb->dom_child = b; + } else { + int child = idom_bb->dom_child; + ir_block *child_bb = &blocks[child]; - while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) { - child = child_bb->dom_next_child; - child_bb = &blocks[child]; - } - bb->dom_next_child = child_bb->dom_next_child; - child_bb->dom_next_child = b; + while (child_bb->dom_next_child > 0 && b > child_bb->dom_next_child) { + child = child_bb->dom_next_child; + child_bb = &blocks[child]; } + bb->dom_next_child = child_bb->dom_next_child; + child_bb->dom_next_child = b; } } return 1; } -#else + /* A single pass modification of "A Simple, Fast Dominance Algorithm" by * Cooper, Harvey and Kennedy, that relays on IR block ordering. * It may fallback to the general slow fixed-point algorithm. */ @@ -747,7 +751,11 @@ int ir_build_dominators_tree(ir_ctx *ctx) if (UNEXPECTED(idom >= b)) { /* In rare cases, LOOP_BEGIN.op1 may be a back-edge. Skip back-edges. 
*/ ctx->flags2 &= ~IR_NO_LOOPS; - IR_ASSERT(k > 1 && "Wrong blocks order: BB is before its single predecessor"); +// IR_ASSERT(k > 1 && "Wrong blocks order: BB is before its single predecessor"); + if (UNEXPECTED(k <= 1)) { + ir_list_free(&worklist); + return ir_build_dominators_tree_slow(ctx); + } ir_list_push(&worklist, idom); while (1) { k--; @@ -942,7 +950,6 @@ static int ir_build_dominators_tree_iterative(ir_ctx *ctx) return 1; } -#endif static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2) { @@ -958,7 +965,7 @@ static bool ir_dominates(const ir_block *blocks, uint32_t b1, uint32_t b2) int ir_find_loops(ir_ctx *ctx) { - uint32_t i, j, n, count; + uint32_t b, j, n, count; uint32_t *entry_times, *exit_times, *sorted_blocks, time = 1; ir_block *blocks = ctx->cfg_blocks; uint32_t *edges = ctx->cfg_edges; @@ -983,13 +990,13 @@ int ir_find_loops(ir_ctx *ctx) int child; next: - i = ir_worklist_peek(&work); - if (!entry_times[i]) { - entry_times[i] = time++; + b = ir_worklist_peek(&work); + if (!entry_times[b]) { + entry_times[b] = time++; } - /* Visit blocks immediately dominated by i. */ - bb = &blocks[i]; + /* Visit blocks immediately dominated by "b". */ + bb = &blocks[b]; for (child = bb->dom_child; child > 0; child = blocks[child].dom_next_child) { if (ir_worklist_push(&work, child)) { goto next; @@ -999,17 +1006,17 @@ next: /* Visit join edges. */ if (bb->successors_count) { uint32_t *p = edges + bb->successors; - for (j = 0; j < bb->successors_count; j++,p++) { + for (j = 0; j < bb->successors_count; j++, p++) { uint32_t succ = *p; - if (blocks[succ].idom == i) { + if (blocks[succ].idom == b) { continue; } else if (ir_worklist_push(&work, succ)) { goto next; } } } - exit_times[i] = time++; + exit_times[b] = time++; ir_worklist_pop(&work); } @@ -1018,7 +1025,7 @@ next: j = 1; n = 2; while (j != n) { - i = j; + uint32_t i = j; j = n; for (; i < j; i++) { int child; @@ -1030,9 +1037,82 @@ next: count = n; /* Identify loops. See Sreedhar et al, "Identifying Loops Using DJ Graphs". 
*/ + uint32_t prev_dom_depth = blocks[sorted_blocks[n - 1]].dom_depth; + uint32_t prev_irreducible = 0; while (n > 1) { - i = sorted_blocks[--n]; - ir_block *bb = &blocks[i]; + b = sorted_blocks[--n]; + ir_block *bb = &blocks[b]; + + IR_ASSERT(bb->dom_depth <= prev_dom_depth); + if (UNEXPECTED(prev_irreducible) && bb->dom_depth != prev_dom_depth) { + /* process delyed irreducible loops */ + do { + b = sorted_blocks[prev_irreducible]; + bb = &blocks[b]; + if ((bb->flags & IR_BB_IRREDUCIBLE_LOOP) && !bb->loop_depth) { + /* process irreducible loop */ + uint32_t hdr = b; + + bb->loop_depth = 1; + if (ctx->ir_base[bb->start].op == IR_MERGE) { + ctx->ir_base[bb->start].op = IR_LOOP_BEGIN; + } + + /* find the closing edge(s) of the irreucible loop */ + IR_ASSERT(bb->predecessors_count > 1); + uint32_t *p = &edges[bb->predecessors]; + j = bb->predecessors_count; + do { + uint32_t pred = *p; + + if (entry_times[pred] > entry_times[b] && exit_times[pred] < exit_times[b]) { + if (!ir_worklist_len(&work)) { + ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work))); + } + blocks[pred].loop_header = 0; /* support for merged loops */ + ir_worklist_push(&work, pred); + } + p++; + } while (--j); + IR_ASSERT(ir_worklist_len(&work) != 0); + + /* collect members of the irreducible loop */ + while (ir_worklist_len(&work)) { + b = ir_worklist_pop(&work); + if (b != hdr) { + ir_block *bb = &blocks[b]; + bb->loop_header = hdr; + if (bb->predecessors_count) { + uint32_t *p = &edges[bb->predecessors]; + uint32_t n = bb->predecessors_count; + do { + uint32_t pred = *p; + while (blocks[pred].loop_header > 0) { + pred = blocks[pred].loop_header; + } + if (pred != hdr) { + if (entry_times[pred] > entry_times[hdr] && exit_times[pred] < exit_times[hdr]) { + /* "pred" is a descendant of "hdr" */ + ir_worklist_push(&work, pred); + } else { + /* another entry to the irreducible loop */ + bb->flags |= IR_BB_IRREDUCIBLE_LOOP; + if (ctx->ir_base[bb->start].op == IR_MERGE) { + ctx->ir_base[bb->start].op = IR_LOOP_BEGIN; + } + } + } + p++; + } while (--n); + } + } + } + } + } while (--prev_irreducible != n); + prev_irreducible = 0; + b = sorted_blocks[n]; + bb = &blocks[b]; + } if (bb->predecessors_count > 1) { bool irreducible = 0; @@ -1047,7 +1127,7 @@ next: if (bb->idom != pred) { /* In a loop back-edge (back-join edge), the successor dominates the predecessor. */ - if (ir_dominates(blocks, i, pred)) { + if (ir_dominates(blocks, b, pred)) { if (!ir_worklist_len(&work)) { ir_bitset_clear(work.visited, ir_bitset_len(ir_worklist_capasity(&work))); } @@ -1056,8 +1136,9 @@ next: } else { /* Otherwise it's a cross-join edge. See if it's a branch to an ancestor on the DJ spanning tree. */ - if (entry_times[pred] > entry_times[i] && exit_times[pred] < exit_times[i]) { + if (entry_times[pred] > entry_times[b] && exit_times[pred] < exit_times[b]) { irreducible = 1; + break; } } } @@ -1065,46 +1146,55 @@ next: } while (--j); if (UNEXPECTED(irreducible)) { - // TODO: Support for irreducible loops ??? 
- bb->flags |= IR_BB_IRREDUCIBLE_LOOP; - ctx->flags2 |= IR_IRREDUCIBLE_CFG; - while (ir_worklist_len(&work)) { - ir_worklist_pop(&work); + bb->flags |= IR_BB_LOOP_HEADER | IR_BB_IRREDUCIBLE_LOOP; + ctx->flags2 |= IR_CFG_HAS_LOOPS | IR_IRREDUCIBLE_CFG; + /* Remember the position of the first irreducible loop to process all the irreducible loops + * after the reducible loops with the same dominator tree depth + */ + if (!prev_irreducible) { + prev_irreducible = n; } + ir_list_clear(&work.l); } else if (ir_worklist_len(&work)) { + /* collect members of the reducible loop */ + uint32_t hdr = b; + bb->flags |= IR_BB_LOOP_HEADER; ctx->flags2 |= IR_CFG_HAS_LOOPS; bb->loop_depth = 1; + if (ctx->ir_base[bb->start].op == IR_MERGE) { + ctx->ir_base[bb->start].op = IR_LOOP_BEGIN; + } while (ir_worklist_len(&work)) { - j = ir_worklist_pop(&work); - while (blocks[j].loop_header > 0) { - j = blocks[j].loop_header; - } - if (j != i) { - ir_block *bb = &blocks[j]; - if (bb->idom == 0 && j != 1) { - /* Ignore blocks that are unreachable or only abnormally reachable. */ - continue; - } - bb->loop_header = i; + b = ir_worklist_pop(&work); + if (b != hdr) { + ir_block *bb = &blocks[b]; + bb->loop_header = hdr; if (bb->predecessors_count) { uint32_t *p = &edges[bb->predecessors]; - j = bb->predecessors_count; + uint32_t n = bb->predecessors_count; do { - ir_worklist_push(&work, *p); + uint32_t pred = *p; + while (blocks[pred].loop_header > 0) { + pred = blocks[pred].loop_header; + } + if (pred != hdr) { + ir_worklist_push(&work, pred); + } p++; - } while (--j); + } while (--n); } } } } } } + IR_ASSERT(!prev_irreducible); if (ctx->flags2 & IR_CFG_HAS_LOOPS) { for (n = 1; n < count; n++) { - i = sorted_blocks[n]; - ir_block *bb = &blocks[i]; + b = sorted_blocks[n]; + ir_block *bb = &blocks[b]; if (bb->loop_header > 0) { ir_block *loop = &blocks[bb->loop_header]; uint32_t loop_depth = loop->loop_depth; @@ -1389,7 +1479,7 @@ restart: goto restart; } } else if (b != predecessor && ctx->cfg_blocks[predecessor].loop_header != b) { - ir_dump_cfg(ctx, stderr); + /* not a loop back-edge */ IR_ASSERT(b == predecessor || ctx->cfg_blocks[predecessor].loop_header == b); } } diff --git a/ext/opcache/jit/ir/ir_check.c b/ext/opcache/jit/ir/ir_check.c index f12b4776fa1..a791baef5db 100644 --- a/ext/opcache/jit/ir/ir_check.c +++ b/ext/opcache/jit/ir/ir_check.c @@ -213,13 +213,18 @@ bool ir_check(const ir_ctx *ctx) ok = 0; } } - break; - case IR_OPND_CONTROL_DEP: if ((ctx->flags2 & IR_LINEAR) && use >= i && !(insn->op == IR_LOOP_BEGIN)) { fprintf(stderr, "ir_base[%d].ops[%d] invalid forward reference (%d)\n", i, j, use); ok = 0; + } + break; + case IR_OPND_CONTROL_DEP: + if ((ctx->flags2 & IR_LINEAR) + && use >= i) { + fprintf(stderr, "ir_base[%d].ops[%d] invalid forward reference (%d)\n", i, j, use); + ok = 0; } else if (insn->op == IR_PHI) { ir_insn *merge_insn = &ctx->ir_base[insn->op1]; if (merge_insn->op != IR_MERGE && merge_insn->op != IR_LOOP_BEGIN) { diff --git a/ext/opcache/jit/ir/ir_emit.c b/ext/opcache/jit/ir/ir_emit.c index c82655daf48..fab9f56228d 100644 --- a/ext/opcache/jit/ir/ir_emit.c +++ b/ext/opcache/jit/ir/ir_emit.c @@ -309,7 +309,7 @@ static void* ir_sym_addr(ir_ctx *ctx, const ir_insn *addr_insn) { const char *name = ir_get_str(ctx, addr_insn->val.name); void *addr = (ctx->loader && ctx->loader->resolve_sym_name) ? 
- ctx->loader->resolve_sym_name(ctx->loader, name, 0) : + ctx->loader->resolve_sym_name(ctx->loader, name, IR_RESOLVE_SYM_SILENT) : ir_resolve_sym_name(name); return addr; @@ -320,7 +320,7 @@ static void* ir_sym_val(ir_ctx *ctx, const ir_insn *addr_insn) { const char *name = ir_get_str(ctx, addr_insn->val.name); void *addr = (ctx->loader && ctx->loader->resolve_sym_name) ? - ctx->loader->resolve_sym_name(ctx->loader, name, addr_insn->op == IR_FUNC) : + ctx->loader->resolve_sym_name(ctx->loader, name, addr_insn->op == IR_FUNC ? IR_RESOLVE_SYM_ADD_THUNK : 0) : ir_resolve_sym_name(name); IR_ASSERT(addr); diff --git a/ext/opcache/jit/ir/ir_fold.h b/ext/opcache/jit/ir/ir_fold.h index 88539e52ab0..90112214d0c 100644 --- a/ext/opcache/jit/ir/ir_fold.h +++ b/ext/opcache/jit/ir/ir_fold.h @@ -1909,7 +1909,9 @@ IR_FOLD(SUB(_, SUB)) IR_FOLD(SUB(ADD, ADD)) { if (IR_IS_TYPE_INT(IR_OPT_TYPE(opt))) { - if (op1_insn->op1 == op2_insn->op1) { + if (op1 == op2) { + IR_FOLD_CONST_U(0); + } else if (op1_insn->op1 == op2_insn->op1) { /* (a + b) - (a + c) => b - c */ op1 = op1_insn->op2; op2 = op2_insn->op2; diff --git a/ext/opcache/jit/ir/ir_gcm.c b/ext/opcache/jit/ir/ir_gcm.c index 8bd6be5d10a..0d8a6c2d760 100644 --- a/ext/opcache/jit/ir/ir_gcm.c +++ b/ext/opcache/jit/ir/ir_gcm.c @@ -785,6 +785,139 @@ IR_ALWAYS_INLINE ir_ref ir_count_constant(ir_ref *_xlat, ir_ref ref) return 0; } +IR_ALWAYS_INLINE bool ir_is_good_bb_order(ir_ctx *ctx, uint32_t b, ir_block *bb, ir_ref start) +{ + ir_insn *insn = &ctx->ir_base[start]; + uint32_t n = insn->inputs_count; + ir_ref *p = insn->ops + 1; + + if (n == 1) { + return *p < start; + } else { + IR_ASSERT(n > 1); + for (; n > 0; p++, n--) { + ir_ref input = *p; + if (input < start) { + /* ordered */ + } else if ((bb->flags & IR_BB_LOOP_HEADER) + && (ctx->cfg_map[input] == b || ctx->cfg_blocks[ctx->cfg_map[input]].loop_header == b)) { + /* back-edge of reducible loop */ + } else if ((bb->flags & IR_BB_IRREDUCIBLE_LOOP) + && (ctx->cfg_blocks[ctx->cfg_map[input]].loop_header == ctx->cfg_blocks[b].loop_header)) { + /* closing edge of irreducible loop */ + } else { + return 0; + } + } + return 1; + } +} + +static IR_NEVER_INLINE void ir_fix_bb_order(ir_ctx *ctx, ir_ref *_prev, ir_ref *_next) +{ + uint32_t b, succ, count, *q, *xlat; + ir_block *bb; + ir_ref ref, n, prev; + ir_worklist worklist; + ir_block *new_blocks; + +#if 0 + for (b = 1, bb = ctx->cfg_blocks + 1; b <= ctx->cfg_blocks_count; b++, bb++) { + if (!ir_is_good_bb_order(ctx, b, bb, bb->start)) { + goto fix; + } + } + return; + +fix: +#endif + count = ctx->cfg_blocks_count + 1; + new_blocks = ir_mem_malloc(count * sizeof(ir_block)); + xlat = ir_mem_malloc(count * sizeof(uint32_t)); + ir_worklist_init(&worklist, count); + ir_worklist_push(&worklist, 1); + while (ir_worklist_len(&worklist) != 0) { +next: + b = ir_worklist_peek(&worklist); + bb = &ctx->cfg_blocks[b]; + n = bb->successors_count; + if (n == 1) { + succ = ctx->cfg_edges[bb->successors]; + if (ir_worklist_push(&worklist, succ)) { + goto next; + } + } else if (n > 1) { + uint32_t best = 0; + uint32_t best_loop_depth = 0; + + q = ctx->cfg_edges + bb->successors + n; + do { + q--; + succ = *q; + if (ir_bitset_in(worklist.visited, succ)) { + /* already processed */ + } else if ((ctx->cfg_blocks[succ].flags & IR_BB_LOOP_HEADER) + && (succ == b || ctx->cfg_blocks[b].loop_header == succ)) { + /* back-edge of reducible loop */ + } else if ((ctx->cfg_blocks[succ].flags & IR_BB_IRREDUCIBLE_LOOP) + && (ctx->cfg_blocks[succ].loop_header == ctx->cfg_blocks[b].loop_header)) 
{ + /* closing edge of irreducible loop */ + } else if (!best) { + best = succ; + best_loop_depth = ctx->cfg_blocks[best].loop_depth; + } else if (ctx->cfg_blocks[succ].loop_depth < best_loop_depth) { + /* prefer deeper loop */ + best = succ; + best_loop_depth = ctx->cfg_blocks[best].loop_depth; + } + n--; + } while (n > 0); + if (best) { + ir_worklist_push(&worklist, best); + goto next; + } + } + ir_worklist_pop(&worklist); + count--; + new_blocks[count] = *bb; + xlat[b] = count; + } + IR_ASSERT(count == 1); + xlat[0] = 0; + ir_worklist_free(&worklist); + + prev = 0; + for (b = 1, bb = new_blocks + 1; b <= ctx->cfg_blocks_count; b++, bb++) { + bb->idom = xlat[bb->idom]; + bb->loop_header = xlat[bb->loop_header]; + n = bb->successors_count; + if (n > 0) { + for (q = ctx->cfg_edges + bb->successors; n > 0; q++, n--) { + *q = xlat[*q]; + } + } + n = bb->predecessors_count; + if (n > 0) { + for (q = ctx->cfg_edges + bb->predecessors; n > 0; q++, n--) { + *q = xlat[*q]; + } + } + _next[prev] = bb->start; + _prev[bb->start] = prev; + prev = bb->end; + } + _next[0] = 0; + _next[prev] = 0; + + for (ref = 2; ref < ctx->insns_count; ref++) { + ctx->cfg_map[ref] = xlat[ctx->cfg_map[ref]]; + } + ir_mem_free(xlat); + + ir_mem_free(ctx->cfg_blocks); + ctx->cfg_blocks = new_blocks; +} + int ir_schedule(ir_ctx *ctx) { ir_ctx new_ctx; @@ -800,6 +933,7 @@ int ir_schedule(ir_ctx *ctx) ir_block *bb; ir_insn *insn, *new_insn; ir_use_list *lists, *use_list, *new_list; + bool bad_bb_order = 0; /* Create a double-linked list of nodes ordered by BB, respecting BB->start and BB->end */ IR_ASSERT(_blocks[1] == 1); @@ -818,27 +952,50 @@ int ir_schedule(ir_ctx *ctx) } else if (b > prev_b) { bb = &ctx->cfg_blocks[b]; if (i == bb->start) { - IR_ASSERT(bb->end > bb->start); - prev_b = b; - prev_b_end = bb->end; - _prev[bb->end] = 0; + if (bb->end > bb->start) { + prev_b = b; + prev_b_end = bb->end; + /* add to the end of the list */ + _next[j] = i; + _prev[i] = j; + j = i; + } else { + prev_b = 0; + prev_b_end = 0; + k = bb->end; + while (_blocks[_prev[k]] == b) { + k = _prev[k]; + } + /* insert before "k" */ + _prev[i] = _prev[k]; + _next[i] = k; + _next[_prev[k]] = i; + _prev[k] = i; + } + if (!ir_is_good_bb_order(ctx, b, bb, i)) { + bad_bb_order = 1; + } + } else if (i != bb->end) { + /* move down late (see the following loop) */ + _next[i] = _move_down; + _move_down = i; + } else { + IR_ASSERT(bb->start > bb->end); + prev_b = 0; + prev_b_end = 0; /* add to the end of the list */ _next[j] = i; _prev[i] = j; j = i; - } else { - IR_ASSERT(i != bb->end); - /* move down late (see the following loop) */ - _next[i] = _move_down; - _move_down = i; } } else if (b) { bb = &ctx->cfg_blocks[b]; IR_ASSERT(i != bb->start); - if (_prev[bb->end]) { + if (i > bb->end) { /* move up, insert before the end of the already scheduled BB */ k = bb->end; } else { + IR_ASSERT(i > bb->start); /* move up, insert at the end of the block */ k = ctx->cfg_blocks[b + 1].start; } @@ -883,6 +1040,10 @@ int ir_schedule(ir_ctx *ctx) } #endif + if (bad_bb_order) { + ir_fix_bb_order(ctx, _prev, _next); + } + _xlat = ir_mem_calloc((ctx->consts_count + ctx->insns_count), sizeof(ir_ref)); _xlat += ctx->consts_count; _xlat[IR_TRUE] = IR_TRUE; diff --git a/ext/opcache/jit/ir/ir_private.h b/ext/opcache/jit/ir/ir_private.h index 69a0101d24e..ac952e402f5 100644 --- a/ext/opcache/jit/ir/ir_private.h +++ b/ext/opcache/jit/ir/ir_private.h @@ -62,7 +62,7 @@ #define IR_MAX(a, b) (((a) > (b)) ? (a) : (b)) #define IR_MIN(a, b) (((a) < (b)) ? 
(a) : (b)) -#define IR_IS_POWER_OF_TWO(x) (!((x) & ((x) - 1))) +#define IR_IS_POWER_OF_TWO(x) ((x) && (!((x) & ((x) - 1)))) #define IR_LOG2(x) ir_ntzl(x) diff --git a/ext/opcache/jit/ir/ir_save.c b/ext/opcache/jit/ir/ir_save.c index b12cc267af6..595f2d9d6a2 100644 --- a/ext/opcache/jit/ir/ir_save.c +++ b/ext/opcache/jit/ir/ir_save.c @@ -140,6 +140,9 @@ void ir_save(const ir_ctx *ctx, uint32_t save_flags, FILE *f) fprintf(f, ", loop=BB%d(%d)", bb->loop_header, bb->loop_depth); } } + if (bb->flags & IR_BB_IRREDUCIBLE_LOOP) { + fprintf(f, ", IRREDUCIBLE"); + } if (bb->predecessors_count) { uint32_t i; diff --git a/ext/opcache/jit/ir/ir_sccp.c b/ext/opcache/jit/ir/ir_sccp.c index 2e006516df8..221a86a5ad8 100644 --- a/ext/opcache/jit/ir/ir_sccp.c +++ b/ext/opcache/jit/ir/ir_sccp.c @@ -1732,7 +1732,20 @@ static ir_ref ir_promote_i2i(ir_ctx *ctx, ir_type type, ir_ref ref, ir_ref use, ir_ref *p, n, input; if (IR_IS_CONST_REF(ref)) { - return ir_const(ctx, insn->val, type); + ir_val val; + + switch (type) { + case IR_I8: val.i64 = insn->val.i8; break; + case IR_U8: val.u64 = insn->val.u8; break; + case IR_I16: val.i64 = insn->val.i16; break; + case IR_U16: val.u64 = insn->val.u16; break; + case IR_I32: val.i64 = insn->val.i32; break; + case IR_U32: val.u64 = insn->val.u32; break; + case IR_CHAR:val.i64 = insn->val.i8; break; + case IR_BOOL:val.u64 = insn->val.u8 != 0; break; + default: IR_ASSERT(0); val.u64 = 0; + } + return ir_const(ctx, val, type); } else { ir_bitqueue_add(worklist, ref); switch (insn->op) { diff --git a/ext/opcache/jit/ir/ir_x86.dasc b/ext/opcache/jit/ir/ir_x86.dasc index 76602c2b4bc..d56cb8645e1 100644 --- a/ext/opcache/jit/ir/ir_x86.dasc +++ b/ext/opcache/jit/ir/ir_x86.dasc @@ -6868,7 +6868,24 @@ static void ir_emit_return_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn) ir_backend_data *data = ctx->data; dasm_State **Dst = &data->dasm_state; - if (op2_reg == IR_REG_NONE || IR_REG_SPILLED(op2_reg)) { + if (IR_IS_CONST_REF(insn->op2)) { + ir_insn *value = &ctx->ir_base[insn->op2]; + + if ((type == IR_FLOAT && value->val.f == 0.0) || (type == IR_DOUBLE && value->val.d == 0.0)) { + | fldz + } else if ((type == IR_FLOAT && value->val.f == 1.0) || (type == IR_DOUBLE && value->val.d == 1.0)) { + | fld1 + } else { + int label = ir_const_label(ctx, insn->op2); + + if (type == IR_DOUBLE) { + | fld qword [=>label] + } else { + IR_ASSERT(type == IR_FLOAT); + | fld dword [=>label] + } + } + } else if (op2_reg == IR_REG_NONE || IR_REG_SPILLED(op2_reg)) { ir_reg fp; int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op2, &fp); @@ -8442,11 +8459,15 @@ static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn) ir_backend_data *data = ctx->data; dasm_State **Dst = &data->dasm_state; ir_type type = insn->type; - ir_reg def_reg = ctx->regs[def][0]; + ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]); ir_reg op2_reg = ctx->regs[def][2]; ir_reg tmp_reg = ctx->regs[def][3]; int32_t offset; + if (ctx->use_lists[def].count == 1) { + /* dead load */ + return; + } IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE); if (op2_reg != IR_REG_NONE) { if (IR_REG_SPILLED(op2_reg)) { @@ -8471,11 +8492,15 @@ static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn) ir_backend_data *data = ctx->data; dasm_State **Dst = &data->dasm_state; ir_type type = insn->type; - ir_reg def_reg = ctx->regs[def][0]; + ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]); ir_reg op2_reg = ctx->regs[def][2]; ir_reg tmp_reg = ctx->regs[def][3]; int32_t offset; + if (ctx->use_lists[def].count == 1) { + /* dead load */ 
+ return; + } IR_ASSERT(def_reg != IR_REG_NONE&& tmp_reg != IR_REG_NONE); if (op2_reg != IR_REG_NONE) { if (IR_REG_SPILLED(op2_reg)) { @@ -9221,6 +9246,58 @@ static void ir_emit_tailcall(ir_ctx *ctx, ir_ref def, ir_insn *insn) return; } + /* Move op2 to a tmp register before epilogue if it's in + * used_preserved_regs, because it will be overridden. */ + + ir_reg op2_reg = IR_REG_NONE; + ir_mem mem = IR_MEM_B(IR_REG_NONE); + if (!IR_IS_CONST_REF(insn->op2)) { + op2_reg = ctx->regs[def][2]; + + ir_regset preserved_regs = (ir_regset)ctx->used_preserved_regs | IR_REGSET(IR_REG_STACK_POINTER); + if (ctx->flags & IR_USE_FRAME_POINTER) { + preserved_regs |= IR_REGSET(IR_REG_FRAME_POINTER); + } + + bool is_spill_slot = op2_reg != IR_REG_NONE + && IR_REG_SPILLED(op2_reg) + && ctx->vregs[insn->op2]; + + if (op2_reg != IR_REG_NONE && !is_spill_slot) { + if (IR_REGSET_IN(preserved_regs, IR_REG_NUM(op2_reg))) { + ir_ref orig_op2_reg = op2_reg; + op2_reg = IR_REG_RAX; + + if (IR_REG_SPILLED(orig_op2_reg)) { + ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2); + } else { + ir_type type = ctx->ir_base[insn->op2].type; + | ASM_REG_REG_OP mov, type, op2_reg, IR_REG_NUM(orig_op2_reg) + } + } else { + op2_reg = IR_REG_NUM(op2_reg); + } + } else { + if (ir_rule(ctx, insn->op2) & IR_FUSED) { + IR_ASSERT(op2_reg == IR_REG_NONE); + mem = ir_fuse_load(ctx, def, insn->op2); + } else { + mem = ir_ref_spill_slot(ctx, insn->op2); + } + ir_reg base = IR_MEM_BASE(mem); + ir_reg index = IR_MEM_INDEX(mem); + if ((base != IR_REG_NONE && IR_REGSET_IN(preserved_regs, base)) || + (index != IR_REG_NONE && IR_REGSET_IN(preserved_regs, index))) { + op2_reg = IR_REG_RAX; + + ir_type type = ctx->ir_base[insn->op2].type; + ir_emit_load_mem_int(ctx, type, op2_reg, mem); + } else { + op2_reg = IR_REG_NONE; + } + } + } + ir_emit_epilogue(ctx); if (IR_IS_CONST_REF(insn->op2)) { @@ -9246,22 +9323,10 @@ static void ir_emit_tailcall(ir_ctx *ctx, ir_ref def, ir_insn *insn) |.endif } } else { - ir_reg op2_reg = ctx->regs[def][2]; - if (op2_reg != IR_REG_NONE) { - if (IR_REG_SPILLED(op2_reg)) { - op2_reg = IR_REG_NUM(op2_reg); - ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2); - } + IR_ASSERT(!IR_REGSET_IN((ir_regset)ctx->used_preserved_regs, op2_reg)); | jmp Ra(op2_reg) } else { - ir_mem mem; - - if (ir_rule(ctx, insn->op2) & IR_FUSED) { - mem = ir_fuse_load(ctx, def, insn->op2); - } else { - mem = ir_ref_spill_slot(ctx, insn->op2); - } | ASM_TMEM_OP jmp, aword, mem } }
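
The ir.h hunk above replaces the boolean add_thunk argument of resolve_sym_name with a uint32_t flags bitmask: ir_emit.c now passes IR_RESOLVE_SYM_SILENT when it merely probes for a symbol address (ir_sym_addr) and IR_RESOLVE_SYM_ADD_THUNK when it resolves an IR_FUNC value (ir_sym_val). The following is only a sketch of how an embedder's loader callback might honour the new flags; my_lookup_symbol() and my_create_thunk() are hypothetical placeholders, the real thunk and error behaviour is up to the embedding loader, and the sketch assumes it is compiled next to the IR sources so that ir.h and the ir_loader typedef are available.

#include <stdint.h>
#include <stdio.h>
#include "ir.h"   /* ir_loader, IR_RESOLVE_SYM_ADD_THUNK, IR_RESOLVE_SYM_SILENT */

/* Hypothetical placeholders for the embedder's symbol table / thunk emitter. */
static void *my_lookup_symbol(const char *name) { (void)name; return NULL; }
static void *my_create_thunk(const char *name)  { (void)name; return NULL; }

static void *my_resolve_sym_name(ir_loader *loader, const char *name, uint32_t flags)
{
	void *addr = my_lookup_symbol(name);

	(void)loader;
	if (!addr && (flags & IR_RESOLVE_SYM_ADD_THUNK)) {
		/* Caller resolves a function value and accepts a lazy call thunk. */
		addr = my_create_thunk(name);
	}
	if (!addr && !(flags & IR_RESOLVE_SYM_SILENT)) {
		/* Silent probes (e.g. ir_sym_addr()) must not report a failure. */
		fprintf(stderr, "cannot resolve symbol \"%s\"\n", name);
	}
	return addr;
}

An embedder would install such a callback as the loader's resolve_sym_name member; callers that only test whether a symbol exists get a NULL back without any diagnostic.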
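
The ir_private.h hunk tightens IR_IS_POWER_OF_TWO: the old form !((x) & ((x) - 1)) also accepts x == 0, because 0 & (0 - 1) is 0, so zero was wrongly classified as a power of two; the patched macro adds an explicit (x) && guard. A tiny standalone check of the same property, with hypothetical helper names standing in for the macro:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Old form: wrongly classifies 0 as a power of two (0 & (0 - 1) == 0). */
static bool is_pow2_old(uint64_t x) { return !(x & (x - 1)); }

/* New form, mirroring the patched IR_IS_POWER_OF_TWO: require x != 0. */
static bool is_pow2_new(uint64_t x) { return x && !(x & (x - 1)); }

int main(void)
{
	assert(is_pow2_old(0));                    /* the old bug: 0 passes */
	assert(!is_pow2_new(0));                   /* fixed */
	assert(is_pow2_new(1) && is_pow2_new(64) && !is_pow2_new(48));
	return 0;
}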
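
The ir_sccp.c hunk stops ir_promote_i2i() from reusing a constant's full 64-bit payload when the constant is re-typed to a narrower integer: the stored value is now truncated and sign- or zero-extended according to the target type before a new constant is created. A standalone sketch of the same idea; val_t and truncate_to() are hypothetical stand-ins for IR's ir_val union and the switch added in the patch:

#include <assert.h>
#include <stdint.h>

/* Assumed stand-in for IR's ir_val: constants live in a 64-bit union. */
typedef union { int64_t i64; uint64_t u64; } val_t;

/* Re-extend the stored 64-bit value as the narrower target type,
 * the way the patched ir_promote_i2i() does for IR_I8..IR_U32. */
static val_t truncate_to(val_t v, int bits, int is_signed)
{
	val_t r;
	switch (bits) {
	case 8:  r.i64 = is_signed ? (int8_t)v.i64  : (uint8_t)v.u64;  break;
	case 16: r.i64 = is_signed ? (int16_t)v.i64 : (uint16_t)v.u64; break;
	case 32: r.i64 = is_signed ? (int32_t)v.i64 : (uint32_t)v.u64; break;
	default: r = v; break;
	}
	return r;
}

int main(void)
{
	val_t v = { .i64 = 300 };                     /* does not fit in 8 bits */
	assert(truncate_to(v, 8, 1).i64 == 44);       /* (int8_t)300 == 44 */
	v.i64 = -1;
	assert(truncate_to(v, 16, 0).i64 == 0xFFFF);  /* zero-extended as U16 */
	return 0;
}

Without this re-extension, promoting a constant such as 300 to an 8-bit type would silently keep the out-of-range 64-bit value in the new constant.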