8333684: C2 SuperWord: multiple smaller refactorings in preparation for JDK-8332163

Reviewed-by: chagedorn, kvn
Emanuel Peter 2024-06-13 06:35:26 +00:00
parent 301bd70856
commit 5d2a19def1
6 changed files with 224 additions and 185 deletions

src/hotspot/share/opto/phasetype.hpp

@@ -68,9 +68,10 @@
   flags(AFTER_RANGE_CHECK_ELIMINATION, "After Range Check Elimination") \
   flags(BEFORE_PRE_MAIN_POST, "Before Pre/Main/Post Loops") \
   flags(AFTER_PRE_MAIN_POST, "After Pre/Main/Post Loops") \
-  flags(SUPERWORD1_BEFORE_SCHEDULE, "Superword 1, Before Schedule") \
-  flags(SUPERWORD2_BEFORE_OUTPUT, "Superword 2, Before Output") \
-  flags(SUPERWORD3_AFTER_OUTPUT, "Superword 3, After Output") \
+  flags(AUTO_VECTORIZATION1_BEFORE_APPLY, "AutoVectorization 1, Before Apply") \
+  flags(AUTO_VECTORIZATION2_AFTER_REORDER, "AutoVectorization 2, After Apply Memop Reordering") \
+  flags(AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, "AutoVectorization 3, After Adjusting Pre-Loop Limit") \
+  flags(AUTO_VECTORIZATION4_AFTER_APPLY, "AutoVectorization 4, After Apply") \
   flags(BEFORE_CLOOPS, "Before CountedLoop") \
   flags(AFTER_CLOOPS, "After CountedLoop") \
   flags(PHASEIDEAL_BEFORE_EA, "PhaseIdealLoop before EA") \
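These tags name the print_method/IGV snapshots C2 can emit. As a hedged usage sketch only: assuming a debug build with ideal-graph printing support, and assuming the PrintIdealPhase compile command accepts these tag names (both are assumptions here, and Foo::bar is a placeholder method), the new snapshots could be requested like:

  java -XX:CompileCommand=PrintIdealPhase,Foo::bar,AUTO_VECTORIZATION1_BEFORE_APPLY,AUTO_VECTORIZATION4_AFTER_APPLY ...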

src/hotspot/share/opto/superword.cpp

@@ -481,10 +481,9 @@ bool SuperWord::SLP_extract() {
   filter_packs_for_profitable();
   DEBUG_ONLY(verify_packs();)
+  DEBUG_ONLY(verify_no_extract());
 
-  schedule();
-  return output();
+  return schedule_and_apply();
 }
 
 // Find the "seed" memops pairs. These are pairs that we strongly suspect would lead to vectorization.
@@ -1466,7 +1465,7 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(const Node_List* pac
 // that the packs impose. Remove packs that do not have a compatible solution.
 void SuperWord::filter_packs_for_alignment() {
   // We do not need to filter if no alignment is required.
-  if (!vectors_should_be_aligned()) {
+  if (!VLoop::vectors_should_be_aligned()) {
     return;
   }
@@ -1592,20 +1591,12 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const {
   } else if (p0->is_Cmp()) {
     // Cmp -> Bool -> Cmove
     retValue = UseVectorCmov;
-  } else if (requires_long_to_int_conversion(opc)) {
-    // Java API for Long.bitCount/numberOfLeadingZeros/numberOfTrailingZeros
-    // returns int type, but Vector API for them returns long type. To unify
-    // the implementation in backend, superword splits the vector implementation
-    // for Java API into an execution node with long type plus another node
-    // converting long to int.
+  } else if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc)) {
+    // Requires extra vector long -> int conversion.
     retValue = VectorNode::implemented(opc, size, T_LONG) &&
                VectorCastNode::implemented(Op_ConvL2I, size, T_LONG, T_INT);
   } else {
-    // Vector unsigned right shift for signed subword types behaves differently
-    // from Java Spec. But when the shift amount is a constant not greater than
-    // the number of sign extended bits, the unsigned right shift can be
-    // vectorized to a signed right shift.
-    if (VectorNode::can_transform_shift_op(p0, velt_basic_type(p0))) {
+    if (VectorNode::can_use_RShiftI_instead_of_URShiftI(p0, velt_basic_type(p0))) {
       opc = Op_RShiftI;
     }
     retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
@@ -1630,36 +1621,87 @@ uint SuperWord::max_implemented_size(const Node_List* pack) {
   }
 }
 
-// Java API for Long.bitCount/numberOfLeadingZeros/numberOfTrailingZeros
-// returns int type, but Vector API for them returns long type. To unify
-// the implementation in backend, superword splits the vector implementation
-// for Java API into an execution node with long type plus another node
-// converting long to int.
-bool SuperWord::requires_long_to_int_conversion(int opc) {
-  switch(opc) {
-    case Op_PopCountL:
-    case Op_CountLeadingZerosL:
-    case Op_CountTrailingZerosL:
-      return true;
-    default:
-      return false;
-  }
-}
-
-//------------------------------same_inputs--------------------------
-// For pack p, are all idx operands the same?
-bool SuperWord::same_inputs(const Node_List* p, int idx) const {
-  Node* p0 = p->at(0);
-  uint vlen = p->size();
-  Node* p0_def = p0->in(idx);
-  for (uint i = 1; i < vlen; i++) {
-    Node* pi = p->at(i);
-    Node* pi_def = pi->in(idx);
-    if (p0_def != pi_def) {
-      return false;
-    }
-  }
-  return true;
-}
+// If the j-th input for all nodes in the pack is the same input: return it, else nullptr.
+Node* PackSet::same_inputs_at_index_or_null(const Node_List* pack, const int index) const {
+  Node* p0_in = pack->at(0)->in(index);
+  for (uint i = 1; i < pack->size(); i++) {
+    if (pack->at(i)->in(index) != p0_in) {
+      return nullptr; // not same
+    }
+  }
+  return p0_in;
+}
+
+VTransformBoolTest PackSet::get_bool_test(const Node_List* bool_pack) const {
+  BoolNode* bol = bool_pack->at(0)->as_Bool();
+  BoolTest::mask mask = bol->_test._test;
+  bool is_negated = false;
+  assert(mask == BoolTest::eq ||
+         mask == BoolTest::ne ||
+         mask == BoolTest::ge ||
+         mask == BoolTest::gt ||
+         mask == BoolTest::lt ||
+         mask == BoolTest::le,
+         "Bool should be one of: eq, ne, ge, gt, lt, le");
+
+#ifdef ASSERT
+  for (uint j = 0; j < bool_pack->size(); j++) {
+    Node* m = bool_pack->at(j);
+    assert(m->as_Bool()->_test._test == mask,
+           "all bool nodes must have same test");
+  }
+#endif
+
+  CmpNode* cmp0 = bol->in(1)->as_Cmp();
+  assert(get_pack(cmp0) != nullptr, "Bool must have matching Cmp pack");
+
+  if (cmp0->Opcode() == Op_CmpF || cmp0->Opcode() == Op_CmpD) {
+    // If we have a Float or Double comparison, we must be careful with
+    // handling NaN's correctly. CmpF and CmpD have a return code, as
+    // they are based on the java bytecodes fcmpl/dcmpl:
+    // -1: cmp_in1 <  cmp_in2, or at least one of the two is a NaN
+    //  0: cmp_in1 == cmp_in2 (no NaN)
+    //  1: cmp_in1 >  cmp_in2 (no NaN)
+    //
+    // The "mask" selects which of the [-1, 0, 1] cases lead to "true".
+    //
+    // Note: ordered   (O) comparison returns "false" if either input is NaN.
+    //       unordered (U) comparison returns "true"  if either input is NaN.
+    //
+    // The VectorMaskCmpNode does a comparison directly on in1 and in2, in the java
+    // standard way (all comparisons are ordered, except NEQ is unordered).
+    //
+    // In the following, "mask" already matches the cmp code for VectorMaskCmpNode:
+    //   BoolTest::eq:  Case 0     -> EQ_O
+    //   BoolTest::ne:  Case -1, 1 -> NEQ_U
+    //   BoolTest::ge:  Case 0, 1  -> GE_O
+    //   BoolTest::gt:  Case 1     -> GT_O
+    //
+    // But the lt and le comparisons must be converted from unordered to ordered:
+    //   BoolTest::lt:  Case -1    -> LT_U -> VectorMaskCmp would interpret lt as LT_O
+    //   BoolTest::le:  Case -1, 0 -> LE_U -> VectorMaskCmp would interpret le as LE_O
+    //
+    if (mask == BoolTest::lt || mask == BoolTest::le) {
+      // Negating the mask gives us the negated result, since all non-NaN cases are
+      // negated, and the unordered (U) comparisons are turned into ordered (O) comparisons.
+      //          VectorMaskCmp(LT_U, in1_cmp, in2_cmp)
+      // <==> NOT VectorMaskCmp(GE_O, in1_cmp, in2_cmp)
+      //          VectorMaskCmp(LE_U, in1_cmp, in2_cmp)
+      // <==> NOT VectorMaskCmp(GT_O, in1_cmp, in2_cmp)
+      //
+      // When a VectorBlend uses the negated mask, it can simply swap its blend-inputs:
+      //      VectorBlend(    VectorMaskCmp(LT_U, in1_cmp, in2_cmp), in1_blend, in2_blend)
+      // <==> VectorBlend(NOT VectorMaskCmp(GE_O, in1_cmp, in2_cmp), in1_blend, in2_blend)
+      // <==> VectorBlend(    VectorMaskCmp(GE_O, in1_cmp, in2_cmp), in2_blend, in1_blend)
+      //      VectorBlend(    VectorMaskCmp(LE_U, in1_cmp, in2_cmp), in1_blend, in2_blend)
+      // <==> VectorBlend(NOT VectorMaskCmp(GT_O, in1_cmp, in2_cmp), in1_blend, in2_blend)
+      // <==> VectorBlend(    VectorMaskCmp(GT_O, in1_cmp, in2_cmp), in2_blend, in1_blend)
+      mask = bol->_test.negate();
+      is_negated = true;
+    }
+  }
+  return VTransformBoolTest(mask, is_negated);
 }
 
 //------------------------------profitable---------------------------
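The lt/le rewrite above can be checked lane-wise with scalar Java. This is a hedged, self-contained sketch, not HotSpot code: ltU, geO and blend are hypothetical helpers modeling the LT_U and GE_O comparisons and a VectorBlend lane select.

import java.util.Arrays;

public class BoolTestNegationDemo {
  // LT_U: true for fcmpl result -1, i.e. a < b, or either input is NaN.
  static boolean ltU(float a, float b) { return a < b || Float.isNaN(a) || Float.isNaN(b); }
  // GE_O: ordered >=, false if either input is NaN (plain Java comparison).
  static boolean geO(float a, float b) { return a >= b; }
  // blend: pick the first value when the mask lane is true, like VectorBlend.
  static float blend(boolean mask, float t, float f) { return mask ? t : f; }

  public static void main(String[] args) {
    float[][] pairs = {{1f, 2f}, {2f, 1f}, {1f, 1f}, {Float.NaN, 1f}, {1f, Float.NaN}};
    for (float[] p : pairs) {
      float lhs = blend(ltU(p[0], p[1]), 10f, 20f); // VectorBlend(LT_U, in1, in2)
      float rhs = blend(geO(p[0], p[1]), 20f, 10f); // VectorBlend(GE_O, in2, in1)
      System.out.println(Arrays.toString(p) + " -> " + (lhs == rhs)); // true for every pair
    }
  }
}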
@@ -1696,10 +1738,9 @@ bool SuperWord::profitable(const Node_List* p) const {
       // case (different shift counts) because it is not supported yet.
       Node* cnt = p0->in(2);
       Node_List* cnt_pk = get_pack(cnt);
-      if (cnt_pk != nullptr)
-        return false;
-      if (!same_inputs(p, 2))
-        return false;
+      if (cnt_pk != nullptr || _packset.same_inputs_at_index_or_null(p, 2) == nullptr) {
+        return false;
+      }
     }
     if (!p0->is_Store()) {
       // For now, return false if not all uses are vector.
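For illustration, two hypothetical Java loops (not from the patch) that hit the two sides of this check:

public class ShiftCountDemo {
  // Invariant scalar count: every packed shift shares the same count input,
  // so same_inputs_at_index_or_null(p, 2) is non-null and the pack survives.
  static void shiftByScalar(int[] a, int[] r, int s) {
    for (int i = 0; i < a.length; i++) { r[i] = a[i] >> s; }
  }
  // Per-element count: the counts form their own pack (cnt_pk != nullptr),
  // i.e. a vector shift count, which this check still rejects.
  static void shiftByVector(int[] a, int[] b, int[] r) {
    for (int i = 0; i < a.length; i++) { r[i] = a[i] >> b[i]; }
  }
}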
@@ -2042,7 +2083,9 @@ public:
   }
 };
 
-// The C2 graph (specifically the memory graph), needs to be re-ordered.
+// We want to replace the packed scalars from the PackSet and replace them
+// with vector operations. This requires scheduling and re-ordering the memory
+// graph. We take these steps:
 // (1) Build the PacksetGraph. It combines the dependency graph with the
 //     packset. The PacksetGraph gives us the dependencies that must be
 //     respected after scheduling.
@@ -2050,10 +2093,11 @@ public:
 //     a linear order of all memops in the body. The order respects the
 //     dependencies of the PacksetGraph.
 // (3) If the PacksetGraph has cycles, we cannot schedule. Abort.
-// (4) Use the memops_schedule to re-order the memops in all slices.
-void SuperWord::schedule() {
-  if (_packset.length() == 0) {
-    return; // empty packset
+// (4) Apply the vectorization, including re-ordering the memops and replacing
+//     packed scalars with vector operations.
+bool SuperWord::schedule_and_apply() {
+  if (_packset.is_empty()) {
+    return false;
   }
 
   ResourceMark rm;
@@ -2079,27 +2123,40 @@ void SuperWord::schedule() {
   }
 #endif
     _packset.clear();
-    return;
+    return false;
   }
 
+  // (4) Apply the vectorization, including re-ordering the memops.
+  return apply(memops_schedule);
+}
+
+bool SuperWord::apply(Node_List& memops_schedule) {
+  Compile* C = phase()->C;
+  CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
+  C->print_method(PHASE_AUTO_VECTORIZATION1_BEFORE_APPLY, 4, cl);
+
+  apply_memops_reordering_with_schedule(memops_schedule);
+  C->print_method(PHASE_AUTO_VECTORIZATION2_AFTER_REORDER, 4, cl);
+
+  adjust_pre_loop_limit_to_align_main_loop_vectors();
+  C->print_method(PHASE_AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, 4, cl);
+
+  bool is_success = apply_vectorization();
+  C->print_method(PHASE_AUTO_VECTORIZATION4_AFTER_APPLY, 4, cl);
+
+  return is_success;
+}
+
+// Reorder the memory graph for all slices in parallel. We walk over the schedule once,
+// and track the current memory state of each slice.
+void SuperWord::apply_memops_reordering_with_schedule(Node_List& memops_schedule) {
 #ifndef PRODUCT
   if (is_trace_superword_info()) {
-    tty->print_cr("SuperWord::schedule: memops_schedule:");
+    tty->print_cr("\nSuperWord::apply_memops_reordering_with_schedule:");
     memops_schedule.dump();
   }
 #endif
 
-  CountedLoopNode* cl = lpt()->_head->as_CountedLoop();
-  phase()->C->print_method(PHASE_SUPERWORD1_BEFORE_SCHEDULE, 4, cl);
-
-  // (4) Use the memops_schedule to re-order the memops in all slices.
-  schedule_reorder_memops(memops_schedule);
-}
-
-// Reorder the memory graph for all slices in parallel. We walk over the schedule once,
-// and track the current memory state of each slice.
-void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
   int max_slices = phase()->C->num_alias_types();
   // When iterating over the memops_schedule, we keep track of the current memory state,
   // which is the Phi or a store in the loop.
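Steps (1)-(3) above amount to a topological sort with cycle detection. A hedged, standalone sketch in Java (Kahn's algorithm; an illustration of the idea, not the PacksetGraph implementation):

import java.util.*;

public class ScheduleSketch {
  // Returns a linear order of nodes 0..n-1 respecting edges (u -> v means
  // u must come before v), or null if the graph has a cycle (step (3): abort).
  static List<Integer> linearize(int n, int[][] edges) {
    int[] indeg = new int[n];
    List<List<Integer>> succ = new ArrayList<>();
    for (int i = 0; i < n; i++) { succ.add(new ArrayList<>()); }
    for (int[] e : edges) { succ.get(e[0]).add(e[1]); indeg[e[1]]++; }

    Deque<Integer> ready = new ArrayDeque<>();
    for (int i = 0; i < n; i++) { if (indeg[i] == 0) { ready.add(i); } }

    List<Integer> order = new ArrayList<>();
    while (!ready.isEmpty()) {
      int u = ready.poll();
      order.add(u);
      for (int v : succ.get(u)) { if (--indeg[v] == 0) { ready.add(v); } }
    }
    return order.size() == n ? order : null; // null <=> cycle detected
  }

  public static void main(String[] args) {
    System.out.println(linearize(3, new int[][]{{0, 1}, {1, 2}})); // [0, 1, 2]
    System.out.println(linearize(2, new int[][]{{0, 1}, {1, 0}})); // null
  }
}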
@@ -2180,32 +2237,24 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
   }
 }
 
-//------------------------------output---------------------------
 // Convert packs into vector node operations
 // At this point, all correctness and profitability checks have passed.
 // We start the irreversible process of editing the C2 graph. Should
 // there be an unexpected situation (assert fails), then we can only
 // bail out of the compilation, as the graph has already been partially
 // modified. We bail out, and retry without SuperWord.
-bool SuperWord::output() {
+bool SuperWord::apply_vectorization() {
   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
   assert(cl->is_main_loop(), "SLP should only work on main loops");
   Compile* C = phase()->C;
-  if (_packset.is_empty()) {
-    return false;
-  }
+  assert(!_packset.is_empty(), "vectorization requires non-empty packset");
 
 #ifndef PRODUCT
   if (TraceLoopOpts) {
-    tty->print("SuperWord::output ");
+    tty->print("SuperWord::apply_vectorization ");
     lpt()->dump_head();
   }
 #endif
 
-  phase()->C->print_method(PHASE_SUPERWORD2_BEFORE_OUTPUT, 4, cl);
-
-  adjust_pre_loop_limit_to_align_main_loop_vectors();
-
-  DEBUG_ONLY(verify_no_extract());
-
   uint max_vlen_in_bytes = 0;
   uint max_vlen = 0;
@@ -2214,7 +2263,7 @@ bool SuperWord::output() {
     Node* n = body().at(i);
     Node_List* p = get_pack(n);
     if (p != nullptr && n == p->at(p->size()-1)) {
-      // After schedule_reorder_memops, we know that the memops have the same order in the pack
+      // After apply_memops_reordering_with_schedule, we know that the memops have the same order in the pack
       // as in the memory slice. Hence, "first" is the first memop in the slice from the pack,
       // and "n" is the last node in the slice from the pack.
       Node* first = p->at(0);
@@ -2294,79 +2343,32 @@ bool SuperWord::output() {
       BoolNode* bol = n->in(1)->as_Bool();
       assert(bol != nullptr, "must have Bool above CMove");
-      BoolTest::mask bol_test = bol->_test._test;
-      assert(bol_test == BoolTest::eq ||
-             bol_test == BoolTest::ne ||
-             bol_test == BoolTest::ge ||
-             bol_test == BoolTest::gt ||
-             bol_test == BoolTest::lt ||
-             bol_test == BoolTest::le,
-             "CMove bool should be one of: eq,ne,ge,ge,lt,le");
-      Node_List* p_bol = get_pack(bol);
-      assert(p_bol != nullptr, "CMove must have matching Bool pack");
-
-#ifdef ASSERT
-      for (uint j = 0; j < p_bol->size(); j++) {
-        Node* m = p_bol->at(j);
-        assert(m->as_Bool()->_test._test == bol_test,
-               "all bool nodes must have same test");
-      }
-#endif
+      Node_List* bool_pack = get_pack(bol);
+      assert(bool_pack != nullptr, "CMove must have matching Bool pack");
 
       CmpNode* cmp = bol->in(1)->as_Cmp();
       assert(cmp != nullptr, "must have cmp above CMove");
-      Node_List* p_cmp = get_pack(cmp);
-      assert(p_cmp != nullptr, "Bool must have matching Cmp pack");
+      Node_List* cmp_pack = get_pack(cmp);
+      assert(cmp_pack != nullptr, "Bool must have matching Cmp pack");
 
-      Node* cmp_in1 = vector_opd(p_cmp, 1);
-      Node* cmp_in2 = vector_opd(p_cmp, 2);
+      Node* cmp_in1 = vector_opd(cmp_pack, 1);
+      Node* cmp_in2 = vector_opd(cmp_pack, 2);
       Node* blend_in1 = vector_opd(p, 2);
       Node* blend_in2 = vector_opd(p, 3);
 
-      if (cmp->Opcode() == Op_CmpF || cmp->Opcode() == Op_CmpD) {
-        // If we have a Float or Double comparison, we must be careful with
-        // handling NaN's correctly. CmpF and CmpD have a return code, as
-        // they are based on the java bytecodes fcmpl/dcmpl:
-        // -1: cmp_in1 <  cmp_in2, or at least one of the two is a NaN
-        //  0: cmp_in1 == cmp_in2 (no NaN)
-        //  1: cmp_in1 >  cmp_in2 (no NaN)
-        //
-        // The "bol_test" selects which of the [-1, 0, 1] cases lead to "true".
-        //
-        // Note: ordered   (O) comparison returns "false" if either input is NaN.
-        //       unordered (U) comparison returns "true"  if either input is NaN.
-        //
-        // The VectorMaskCmpNode does a comparison directly on in1 and in2, in the java
-        // standard way (all comparisons are ordered, except NEQ is unordered).
-        //
-        // In the following, "bol_test" already matches the cmp code for VectorMaskCmpNode:
-        //   BoolTest::eq:  Case 0     -> EQ_O
-        //   BoolTest::ne:  Case -1, 1 -> NEQ_U
-        //   BoolTest::ge:  Case 0, 1  -> GE_O
-        //   BoolTest::gt:  Case 1     -> GT_O
-        //
-        // But the lt and le comparisons must be converted from unordered to ordered:
-        //   BoolTest::lt:  Case -1    -> LT_U -> VectorMaskCmp would interpret lt as LT_O
-        //   BoolTest::le:  Case -1, 0 -> LE_U -> VectorMaskCmp would interpret le as LE_O
-        //
-        if (bol_test == BoolTest::lt || bol_test == BoolTest::le) {
-          // Negating the bol_test and swapping the blend-inputs leaves all non-NaN cases equal,
-          // but converts the unordered (U) to an ordered (O) comparison.
-          //      VectorBlend(VectorMaskCmp(LT_U, in1_cmp, in2_cmp), in1_blend, in2_blend)
-          // <==> VectorBlend(VectorMaskCmp(GE_O, in1_cmp, in2_cmp), in2_blend, in1_blend)
-          //      VectorBlend(VectorMaskCmp(LE_U, in1_cmp, in2_cmp), in1_blend, in2_blend)
-          // <==> VectorBlend(VectorMaskCmp(GT_O, in1_cmp, in2_cmp), in2_blend, in1_blend)
-          bol_test = bol->_test.negate();
-          swap(blend_in1, blend_in2);
-        }
+      VTransformBoolTest bool_test = _packset.get_bool_test(bool_pack);
+      BoolTest::mask test_mask = bool_test._mask;
+      if (bool_test._is_negated) {
+        // We can cancel out the negation by swapping the blend inputs.
+        swap(blend_in1, blend_in2);
       }
 
       // VectorMaskCmp
-      ConINode* bol_test_node = igvn().intcon((int)bol_test);
+      ConINode* test_mask_node = igvn().intcon((int)test_mask);
       BasicType bt = velt_basic_type(cmp);
       const TypeVect* vt = TypeVect::make(bt, vlen);
-      VectorNode* mask = new VectorMaskCmpNode(bol_test, cmp_in1, cmp_in2, bol_test_node, vt);
+      VectorNode* mask = new VectorMaskCmpNode(test_mask, cmp_in1, cmp_in2, test_mask_node, vt);
       phase()->register_new_node_with_ctrl_of(mask, p->at(0));
       igvn()._worklist.push(mask);
@@ -2408,40 +2410,23 @@ bool SuperWord::output() {
           vlen_in_bytes = in2->as_Vector()->length_in_bytes();
         }
       } else {
-        // Vector unsigned right shift for signed subword types behaves differently
-        // from Java Spec. But when the shift amount is a constant not greater than
-        // the number of sign extended bits, the unsigned right shift can be
-        // vectorized to a signed right shift.
-        if (VectorNode::can_transform_shift_op(n, velt_basic_type(n))) {
+        if (VectorNode::can_use_RShiftI_instead_of_URShiftI(n, velt_basic_type(n))) {
          opc = Op_RShiftI;
        }
        vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
      }
-    } else if (opc == Op_SqrtF || opc == Op_SqrtD ||
-               opc == Op_AbsF || opc == Op_AbsD ||
-               opc == Op_AbsI || opc == Op_AbsL ||
-               opc == Op_NegF || opc == Op_NegD ||
-               opc == Op_RoundF || opc == Op_RoundD ||
-               opc == Op_ReverseBytesI || opc == Op_ReverseBytesL ||
-               opc == Op_ReverseBytesUS || opc == Op_ReverseBytesS ||
-               opc == Op_ReverseI || opc == Op_ReverseL ||
-               opc == Op_PopCountI || opc == Op_CountLeadingZerosI ||
-               opc == Op_CountTrailingZerosI) {
+    } else if (VectorNode::is_scalar_unary_op_with_equal_input_and_output_types(opc)) {
       assert(n->req() == 2, "only one input expected");
       Node* in = vector_opd(p, 1);
       vn = VectorNode::make(opc, in, nullptr, vlen, velt_basic_type(n));
       vlen_in_bytes = vn->as_Vector()->length_in_bytes();
-    } else if (requires_long_to_int_conversion(opc)) {
-      // Java API for Long.bitCount/numberOfLeadingZeros/numberOfTrailingZeros
-      // returns int type, but Vector API for them returns long type. To unify
-      // the implementation in backend, superword splits the vector implementation
-      // for Java API into an execution node with long type plus another node
-      // converting long to int.
+    } else if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc)) {
       assert(n->req() == 2, "only one input expected");
       Node* in = vector_opd(p, 1);
       Node* longval = VectorNode::make(opc, in, nullptr, vlen, T_LONG);
       phase()->register_new_node_with_ctrl_of(longval, first);
+      // Requires extra vector long -> int conversion.
       vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen);
       vlen_in_bytes = vn->as_Vector()->length_in_bytes();
     } else if (VectorNode::is_convert_opcode(opc)) {
@@ -2525,8 +2510,6 @@ bool SuperWord::output() {
     }
   }
 
-  phase()->C->print_method(PHASE_SUPERWORD3_AFTER_OUTPUT, 4, cl);
-
   return true;
 }
@@ -2537,13 +2520,13 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
   uint vlen = p->size();
   Node* opd = p0->in(opd_idx);
   CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
-  bool have_same_inputs = same_inputs(p, opd_idx);
+  Node* same_input = _packset.same_inputs_at_index_or_null(p, opd_idx);
 
   // Insert index population operation to create a vector of increasing
   // indices starting from the iv value. In some special unrolled loops
   // (see JDK-8286125), we need scalar replications of the iv value if
   // all inputs are the same iv, so we do a same inputs check here.
-  if (opd == iv() && !have_same_inputs) {
+  if (opd == iv() && same_input == nullptr) {
     BasicType p0_bt = velt_basic_type(p0);
     BasicType iv_bt = is_subword_type(p0_bt) ? p0_bt : T_INT;
     assert(VectorNode::is_populate_index_supported(iv_bt), "Should support");

@@ -2554,7 +2537,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
     return vn;
   }
 
-  if (have_same_inputs) {
+  if (same_input != nullptr) {
     if (opd->is_Vector() || opd->is_LoadVector()) {
       if (opd_idx == 2 && VectorNode::is_shift(p0)) {
         assert(false, "shift's count can't be vector");
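Two hypothetical Java loops (not from the patch) for the cases these branches distinguish:

public class IvOperandDemo {
  // The operand is the iv and differs per lane: needs a populate-index
  // vector [i, i+1, i+2, ...] starting from the iv value.
  static void storeIndex(int[] r) {
    for (int i = 0; i < r.length; i++) { r[i] = i; }
  }
  // All packed nodes share one input (same_input != nullptr): a single
  // broadcast/replication of the scalar suffices.
  static void addInvariant(int[] r, int x) {
    for (int i = 0; i < r.length; i++) { r[i] += x; }
  }
}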
@@ -2849,7 +2832,7 @@ bool SuperWord::is_velt_basic_type_compatible_use_def(Node* use, Node* def) cons
   assert(is_java_primitive(def_bt), "sanity %s", type2name(def_bt));
 
   // Nodes like Long.bitCount: expect long input, and int output.
-  if (requires_long_to_int_conversion(use->Opcode())) {
+  if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(use->Opcode())) {
     return type2aelembytes(def_bt) == 8 &&
            type2aelembytes(use_bt) == 4;
   }
@@ -2996,7 +2979,7 @@ VStatus VLoopBody::construct() {
 BasicType SuperWord::longer_type_for_conversion(Node* n) const {
   if (!(VectorNode::is_convert_opcode(n->Opcode()) ||
-        requires_long_to_int_conversion(n->Opcode())) ||
+        VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(n->Opcode())) ||
       !in_bb(n->in(1))) {
     return T_ILLEGAL;
   }
@@ -3173,7 +3156,7 @@ LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
 // determined by SuperWord::filter_packs_for_alignment().
 void SuperWord::determine_mem_ref_and_aw_for_main_loop_alignment() {
   if (_mem_ref_for_main_loop_alignment != nullptr) {
-    assert(vectors_should_be_aligned(), "mem_ref only set if filtered for alignment");
+    assert(VLoop::vectors_should_be_aligned(), "mem_ref only set if filtered for alignment");
     return;
   }

src/hotspot/share/opto/superword.hpp

@@ -362,6 +362,10 @@ public:
     }
   }
 
+  Node* same_inputs_at_index_or_null(const Node_List* pack, const int index) const;
+
+  VTransformBoolTest get_bool_test(const Node_List* bool_pack) const;
+
 private:
   SplitStatus split_pack(const char* split_name, Node_List* pack, SplitTask task);
 public:
@@ -545,12 +549,6 @@ class SuperWord : public ResourceObj {
   // Accessors
   Arena* arena() { return &_arena; }
 
-  // should we align vector memory references on this platform?
-  bool vectors_should_be_aligned() { return !Matcher::misaligned_vectors_ok() || AlignVector; }
-
-  // For pack p, are all idx operands the same?
-  bool same_inputs(const Node_List* p, int idx) const;
-
   // CloneMap utilities
   bool same_origin_idx(Node* a, Node* b) const;
   bool same_generation(Node* a, Node* b) const;
@@ -600,13 +598,10 @@ private:
   DEBUG_ONLY(void verify_packs() const;)
 
-  // Adjust the memory graph for the packed operations
-  void schedule();
-  // Helper function for schedule, that reorders all memops, slice by slice, according to the schedule
-  void schedule_reorder_memops(Node_List &memops_schedule);
-  // Convert packs into vector node operations
-  bool output();
+  bool schedule_and_apply();
+  bool apply(Node_List& memops_schedule);
+  void apply_memops_reordering_with_schedule(Node_List& memops_schedule);
+  bool apply_vectorization();
 
   // Create a vector operand for the nodes in pack p for operand: in(opd_idx)
   Node* vector_opd(Node_List* p, int opd_idx);
@@ -632,8 +627,6 @@ private:
   // Return the longer type for vectorizable type-conversion node or illegal type for other nodes.
   BasicType longer_type_for_conversion(Node* n) const;
 
-  static bool requires_long_to_int_conversion(int opc);
-
   bool is_velt_basic_type_compatible_use_def(Node* use, Node* def) const;
 
   static LoadNode::ControlDependency control_dependency(Node_List* p);

src/hotspot/share/opto/vectorization.hpp

@@ -129,6 +129,9 @@ public:
   int estimated_body_length() const { return lpt()->_body.size(); };
   int estimated_node_count() const { return (int)(1.10 * phase()->C->unique()); };
 
+  // Should we align vector memory references on this platform?
+  static bool vectors_should_be_aligned() { return !Matcher::misaligned_vectors_ok() || AlignVector; }
+
 #ifndef PRODUCT
   const VTrace& vtrace() const { return _vtrace; }
@@ -1320,4 +1323,12 @@ private:
 #endif
 };
 
+struct VTransformBoolTest {
+  const BoolTest::mask _mask;
+  const bool _is_negated;
+
+  VTransformBoolTest(const BoolTest::mask mask, bool is_negated) :
+    _mask(mask), _is_negated(is_negated) {}
+};
+
 #endif // SHARE_OPTO_VECTORIZATION_HPP

src/hotspot/share/opto/vectornode.cpp

@@ -507,7 +507,11 @@ bool VectorNode::is_shift_opcode(int opc) {
   }
 }
 
-bool VectorNode::can_transform_shift_op(Node* n, BasicType bt) {
+// Vector unsigned right shift for signed subword types behaves differently
+// from Java Spec. But when the shift amount is a constant not greater than
+// the number of sign extended bits, the unsigned right shift can be
+// vectorized to a signed right shift.
+bool VectorNode::can_use_RShiftI_instead_of_URShiftI(Node* n, BasicType bt) {
   if (n->Opcode() != Op_URShiftI) {
     return false;
   }
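A hedged scalar illustration in Java of why this is sound for subword types (a standalone snippet, not HotSpot code): a byte is sign-extended to int, so for a constant count no greater than the 24 sign-extension bits, `>>>` and `>>` agree on the low 8 bits that a byte vector lane keeps.

public class SubwordShiftDemo {
  public static void main(String[] args) {
    byte b = (byte) 0x80;                 // -128, sign-extended to 0xffffff80 as int
    int unsignedShift = b >>> 3;          // 0x1ffffff0: zeros shifted in at the top
    int signedShift   = b >> 3;           // 0xfffffff0: sign bits shifted in at the top
    // Only the low 8 bits survive in a byte lane; both shifts agree there.
    System.out.println(Integer.toHexString(unsignedShift & 0xff)); // f0
    System.out.println(Integer.toHexString(signedShift & 0xff));   // f0
  }
}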
@@ -920,6 +924,50 @@ bool VectorNode::is_vector_bitwise_not_pattern(Node* n) {
   return false;
 }
 
+bool VectorNode::is_scalar_unary_op_with_equal_input_and_output_types(int opc) {
+  switch (opc) {
+    case Op_SqrtF:
+    case Op_SqrtD:
+    case Op_AbsF:
+    case Op_AbsD:
+    case Op_AbsI:
+    case Op_AbsL:
+    case Op_NegF:
+    case Op_NegD:
+    case Op_RoundF:
+    case Op_RoundD:
+    case Op_ReverseBytesI:
+    case Op_ReverseBytesL:
+    case Op_ReverseBytesUS:
+    case Op_ReverseBytesS:
+    case Op_ReverseI:
+    case Op_ReverseL:
+    case Op_PopCountI:
+    case Op_CountLeadingZerosI:
+    case Op_CountTrailingZerosI:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Java API for Long.bitCount/numberOfLeadingZeros/numberOfTrailingZeros
+// returns int type, but Vector API for them returns long type. To unify
+// the implementation in backend, AutoVectorization splits the vector
+// implementation for Java API into an execution node with long type plus
+// another node converting long to int.
+bool VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(int opc) {
+  switch (opc) {
+    case Op_PopCountL:
+    case Op_CountLeadingZerosL:
+    case Op_CountTrailingZerosL:
+      return true;
+    default:
+      return false;
+  }
+}
+
 Node* VectorNode::try_to_gen_masked_vector(PhaseGVN* gvn, Node* node, const TypeVect* vt) {
   int vopc = node->Opcode();
   uint vlen = vt->length();
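The scalar semantics this comment refers to, in a hedged standalone Java snippet: Long.bitCount consumes a long but produces an int, which is why the vectorized form needs a long-lane popcount followed by the VectorCastL2X down to int lanes seen in apply_vectorization above.

import java.util.Arrays;

public class LongBitCountDemo {
  public static void main(String[] args) {
    long[] in = {0xffL, -1L, 0L};
    int[] out = new int[in.length];
    for (int i = 0; i < in.length; i++) {
      // Scalar op: long input, int result. Vectorized, this becomes a
      // long-lane popcount plus a long -> int vector cast.
      out[i] = Long.bitCount(in[i]);
    }
    System.out.println(Arrays.toString(out)); // [8, 64, 0]
  }
}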

src/hotspot/share/opto/vectornode.hpp

@@ -84,7 +84,7 @@ class VectorNode : public TypeNode {
   static VectorNode* make_mask_node(int vopc, Node* n1, Node* n2, uint vlen, BasicType bt);
 
   static bool is_shift_opcode(int opc);
-  static bool can_transform_shift_op(Node* n, BasicType bt);
+  static bool can_use_RShiftI_instead_of_URShiftI(Node* n, BasicType bt);
   static bool is_convert_opcode(int opc);
   static bool is_minmax_opcode(int opc);
@@ -130,6 +130,9 @@ class VectorNode : public TypeNode {
     return is_vector_shift_count(n->Opcode());
   }
 
+  static bool is_scalar_unary_op_with_equal_input_and_output_types(int opc);
+  static bool is_scalar_op_that_returns_int_but_vector_op_returns_long(int opc);
+
   static void trace_new_vector(Node* n, const char* context) {
 #ifdef ASSERT
     if (TraceNewVectors) {