8284960: Integration of JEP 426: Vector API (Fourth Incubator)

Co-authored-by: Jatin Bhateja <jbhateja@openjdk.org>
Co-authored-by: Paul Sandoz <psandoz@openjdk.org>
Co-authored-by: Sandhya Viswanathan <sviswanathan@openjdk.org>
Co-authored-by: Smita Kamath <svkamath@openjdk.org>
Co-authored-by: Joshua Zhu <jzhu@openjdk.org>
Co-authored-by: Xiaohong Gong <xgong@openjdk.org>
Co-authored-by: John R Rose <jrose@openjdk.org>
Co-authored-by: Eric Liu <eliu@openjdk.org>
Co-authored-by: Ningsheng Jian <njian@openjdk.org>
Reviewed-by: ngasson, vlivanov, mcimadamore, jlahoda, kvn
Jatin Bhateja 2022-05-31 16:02:09 +00:00
parent 171a7cdd5d
commit 6f6486e977
227 changed files with 20949 additions and 21221 deletions

View file

@ -2468,6 +2468,9 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
break;
case Op_LoadVectorGather:
case Op_StoreVectorScatter:
case Op_CompressV:
case Op_CompressM:
case Op_ExpandV:
return false;
default:
break;
@ -8658,7 +8661,6 @@ instruct countTrailingZerosL(iRegINoSp dst, iRegL src) %{
//
instruct popCountI(iRegINoSp dst, iRegIorL2I src, vRegF tmp) %{
predicate(UsePopCountInstruction);
match(Set dst (PopCountI src));
effect(TEMP tmp);
ins_cost(INSN_COST * 13);
@ -8680,7 +8682,6 @@ instruct popCountI(iRegINoSp dst, iRegIorL2I src, vRegF tmp) %{
%}
instruct popCountI_mem(iRegINoSp dst, memory4 mem, vRegF tmp) %{
predicate(UsePopCountInstruction);
match(Set dst (PopCountI (LoadI mem)));
effect(TEMP tmp);
ins_cost(INSN_COST * 13);
@ -8703,7 +8704,6 @@ instruct popCountI_mem(iRegINoSp dst, memory4 mem, vRegF tmp) %{
// Note: Long.bitCount(long) returns an int.
instruct popCountL(iRegINoSp dst, iRegL src, vRegD tmp) %{
predicate(UsePopCountInstruction);
match(Set dst (PopCountL src));
effect(TEMP tmp);
ins_cost(INSN_COST * 13);
@ -8723,7 +8723,6 @@ instruct popCountL(iRegINoSp dst, iRegL src, vRegD tmp) %{
%}
instruct popCountL_mem(iRegINoSp dst, memory8 mem, vRegD tmp) %{
predicate(UsePopCountInstruction);
match(Set dst (PopCountL (LoadL mem)));
effect(TEMP tmp);
ins_cost(INSN_COST * 13);

View file

@ -5683,14 +5683,58 @@ instruct vround2D_reg(vecX dst, vecX src, immI rmode) %{
ins_pipe(vdop_fp128);
%}
instruct vpopcount4I(vecX dst, vecX src) %{
predicate(UsePopCountInstruction && n->as_Vector()->length() == 4);
instruct vpopcountID(vecD dst, vecD src) %{
predicate(n->as_Vector()->length_in_bytes() < 16);
match(Set dst (PopCountVI src));
format %{
"cnt $dst, $src\t# vector (16B)\n\t"
"uaddlp $dst, $dst\t# vector (16B)\n\t"
"uaddlp $dst, $dst\t# vector (8H)"
ins_cost(3 * INSN_COST);
format %{ "vpopcountI $dst, $src\t# vector (8B/4H/2S)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
__ cnt(as_FloatRegister($dst$$reg), __ T8B,
as_FloatRegister($src$$reg));
if (bt == T_SHORT || bt == T_INT) {
__ uaddlp(as_FloatRegister($dst$$reg), __ T8B,
as_FloatRegister($dst$$reg));
if (bt == T_INT) {
__ uaddlp(as_FloatRegister($dst$$reg), __ T4H,
as_FloatRegister($dst$$reg));
}
}
%}
ins_pipe(pipe_class_default);
%}
instruct vpopcountIX(vecX dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() == 16);
match(Set dst (PopCountVI src));
ins_cost(3 * INSN_COST);
format %{ "vpopcountI $dst, $src\t# vector (16B/8H/4S)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
__ cnt(as_FloatRegister($dst$$reg), __ T16B,
as_FloatRegister($src$$reg));
if (bt == T_SHORT || bt == T_INT) {
__ uaddlp(as_FloatRegister($dst$$reg), __ T16B,
as_FloatRegister($dst$$reg));
if (bt == T_INT) {
__ uaddlp(as_FloatRegister($dst$$reg), __ T8H,
as_FloatRegister($dst$$reg));
}
}
%}
ins_pipe(pipe_class_default);
%}
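The two rules above implement PopCountVI by counting bits per byte with cnt and then folding adjacent lanes with unsigned pairwise widening adds (uaddlp) until the lane width matches the element type. A minimal scalar sketch of that folding for a single int lane (plain Java, illustrative helper only):

static int popCountViaPairwiseAdds(int x) {
    // cnt: popcount of each of the four bytes
    int b0 = Integer.bitCount(x & 0xFF);
    int b1 = Integer.bitCount((x >>> 8) & 0xFF);
    int b2 = Integer.bitCount((x >>> 16) & 0xFF);
    int b3 = Integer.bitCount((x >>> 24) & 0xFF);
    // uaddlp B -> H: add adjacent byte counts
    int h0 = b0 + b1, h1 = b2 + b3;
    // uaddlp H -> S: add adjacent halfword counts
    return h0 + h1; // equals Integer.bitCount(x)
}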
// If the PopCountVL is generated by auto-vectorization, the dst basic
// type is T_INT. Once we have unified the type definition for the
// Vector API and auto-vectorization, this rule can be merged with the
// "vpopcountLX" rule.
instruct vpopcountLD(vecD dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() < 16 &&
n->bottom_type()->is_vect()->element_basic_type() == T_INT);
match(Set dst (PopCountVL src));
ins_cost(5 * INSN_COST);
format %{ "vpopcountL $dst, $src\t# vector (2S)" %}
ins_encode %{
__ cnt(as_FloatRegister($dst$$reg), __ T16B,
as_FloatRegister($src$$reg));
@ -5698,24 +5742,28 @@ instruct vpopcount4I(vecX dst, vecX src) %{
as_FloatRegister($dst$$reg));
__ uaddlp(as_FloatRegister($dst$$reg), __ T8H,
as_FloatRegister($dst$$reg));
__ uaddlp(as_FloatRegister($dst$$reg), __ T4S,
as_FloatRegister($dst$$reg));
__ xtn(as_FloatRegister($dst$$reg), __ T2S,
as_FloatRegister($dst$$reg), __ T2D);
%}
ins_pipe(pipe_class_default);
%}
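The comment above refers to the auto-vectorized form of Long.bitCount, which returns an int; the vector result therefore has int lanes and has to be narrowed, which is what the final xtn from 2D to 2S does. A scalar sketch of that loop shape (plain Java, illustration only):

static void bitCounts(long[] src, int[] dst) {
    for (int i = 0; i < src.length; i++) {
        dst[i] = Long.bitCount(src[i]); // long lanes in, int lanes out
    }
}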
instruct vpopcount2I(vecD dst, vecD src) %{
predicate(UsePopCountInstruction && n->as_Vector()->length() == 2);
match(Set dst (PopCountVI src));
format %{
"cnt $dst, $src\t# vector (8B)\n\t"
"uaddlp $dst, $dst\t# vector (8B)\n\t"
"uaddlp $dst, $dst\t# vector (4H)"
%}
instruct vpopcountLX(vecX dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() == 16 &&
n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
match(Set dst (PopCountVL src));
ins_cost(4 * INSN_COST);
format %{ "vpopcountL $dst, $src\t# vector (2D)" %}
ins_encode %{
__ cnt(as_FloatRegister($dst$$reg), __ T8B,
__ cnt(as_FloatRegister($dst$$reg), __ T16B,
as_FloatRegister($src$$reg));
__ uaddlp(as_FloatRegister($dst$$reg), __ T8B,
__ uaddlp(as_FloatRegister($dst$$reg), __ T16B,
as_FloatRegister($dst$$reg));
__ uaddlp(as_FloatRegister($dst$$reg), __ T4H,
__ uaddlp(as_FloatRegister($dst$$reg), __ T8H,
as_FloatRegister($dst$$reg));
__ uaddlp(as_FloatRegister($dst$$reg), __ T4S,
as_FloatRegister($dst$$reg));
%}
ins_pipe(pipe_class_default);
@ -5921,3 +5969,131 @@ instruct vmask_tolong16B(iRegLNoSp dst, vecX src) %{
%}
ins_pipe(pipe_slow);
%}
//------------------------- CountLeadingZerosV -----------------------------
instruct countLeadingZerosVD(vecD dst, vecD src) %{
predicate(n->as_Vector()->length_in_bytes() == 8);
match(Set dst (CountLeadingZerosV src));
ins_cost(INSN_COST);
format %{ "countLeadingZerosV $dst, $src\t# vector (8B/4H/2S)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_Arrangement size = __ esize2arrangement((unsigned)type2aelembytes(bt), false);
__ clz(as_FloatRegister($dst$$reg), size, as_FloatRegister($src$$reg));
%}
ins_pipe(pipe_slow);
%}
instruct countLeadingZerosVX(vecX dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() == 16);
match(Set dst (CountLeadingZerosV src));
ins_cost(INSN_COST);
format %{ "countLeadingZerosV $dst, $src\t# vector (16B/8H/4S/2D)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_Arrangement size = __ esize2arrangement((unsigned)type2aelembytes(bt), true);
if (bt != T_LONG) {
__ clz(as_FloatRegister($dst$$reg), size, as_FloatRegister($src$$reg));
} else {
__ umov(rscratch1, as_FloatRegister($src$$reg), __ D, 0);
__ clz(rscratch1, rscratch1);
__ mov(as_FloatRegister($dst$$reg), __ D, 0, rscratch1);
__ umov(rscratch1, as_FloatRegister($src$$reg), __ D, 1);
__ clz(rscratch1, rscratch1);
__ mov(as_FloatRegister($dst$$reg), __ D, 1, rscratch1);
}
%}
ins_pipe(pipe_slow);
%}
//------------------------- CountTrailingZerosV ----------------------------
instruct countTrailingZerosVD(vecD dst, vecD src) %{
predicate(n->as_Vector()->length_in_bytes() == 8);
match(Set dst (CountTrailingZerosV src));
ins_cost(3 * INSN_COST);
format %{ "countTrailingZerosV $dst, $src\t# vector (8B/4H/2S)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_Arrangement size = __ esize2arrangement((unsigned)type2aelembytes(bt), false);
__ neon_reverse_bits(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), bt, false);
__ clz(as_FloatRegister($dst$$reg), size, as_FloatRegister($dst$$reg));
%}
ins_pipe(pipe_slow);
%}
instruct countTrailingZerosVX(vecX dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() == 16);
match(Set dst (CountTrailingZerosV src));
ins_cost(3 * INSN_COST);
format %{ "countTrailingZerosV $dst, $src\t# vector (16B/8H/4S/2D)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_Arrangement size = __ esize2arrangement((unsigned)type2aelembytes(bt), true);
__ neon_reverse_bits(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), bt, true);
if (bt != T_LONG) {
__ clz(as_FloatRegister($dst$$reg), size, as_FloatRegister($dst$$reg));
} else {
__ umov(rscratch1, as_FloatRegister($dst$$reg), __ D, 0);
__ clz(rscratch1, rscratch1);
__ mov(as_FloatRegister($dst$$reg), __ D, 0, rscratch1);
__ umov(rscratch1, as_FloatRegister($dst$$reg), __ D, 1);
__ clz(rscratch1, rscratch1);
__ mov(as_FloatRegister($dst$$reg), __ D, 1, rscratch1);
}
%}
ins_pipe(pipe_slow);
%}
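Both CountTrailingZerosV rules rely on the identity ntz(x) == nlz(reverse(x)): the input is bit-reversed (neon_reverse_bits) and the leading zeros are counted. For 2D lanes, where NEON clz has no 64-bit element form, each lane is moved to a general-purpose register and counted there. A scalar check of the identity (plain Java, illustrative helper only):

static int trailingZerosViaReverse(long x) {
    // equals Long.numberOfTrailingZeros(x) for every long x
    return Long.numberOfLeadingZeros(Long.reverse(x));
}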
//------------------------------ ReverseV -----------------------------------
instruct vreverseD(vecD dst, vecD src) %{
predicate(n->as_Vector()->length_in_bytes() == 8);
match(Set dst (ReverseV src));
ins_cost(2 * INSN_COST);
format %{ "ReverseV $dst, $src\t# vector (D)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
__ neon_reverse_bits(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), bt, false);
%}
ins_pipe(pipe_slow);
%}
instruct vreverseX(vecX dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() == 16);
match(Set dst (ReverseV src));
ins_cost(2 * INSN_COST);
format %{ "ReverseV $dst, $src\t# vector (X)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
__ neon_reverse_bits(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), bt, true);
%}
ins_pipe(pipe_slow);
%}
//---------------------------- ReverseBytesV --------------------------------
instruct vreverseBytesD(vecD dst, vecD src) %{
predicate(n->as_Vector()->length_in_bytes() == 8);
match(Set dst (ReverseBytesV src));
ins_cost(INSN_COST);
format %{ "ReverseBytesV $dst, $src\t# vector (D)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
__ neon_reverse_bytes(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), bt, false);
%}
ins_pipe(pipe_slow);
%}
instruct vreverseBytesX(vecX dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() == 16);
match(Set dst (ReverseBytesV src));
ins_cost(INSN_COST);
format %{ "ReverseBytesV $dst, $src\t# vector (X)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
__ neon_reverse_bytes(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), bt, true);
%}
ins_pipe(pipe_slow);
%}
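Per lane, ReverseV and ReverseBytesV match the scalar reverse and reverseBytes operations; for byte lanes, reversing the bytes of a one-byte element is the identity, which is why that case reduces to a plain register move in the neon_reverse_bytes helper. Scalar equivalents for an int lane (plain Java, illustration only):

static int reverseLane(int lane)      { return Integer.reverse(lane);      } // ReverseV
static int reverseBytesLane(int lane) { return Integer.reverseBytes(lane); } // ReverseBytesV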

View file

@ -2445,28 +2445,50 @@ instruct vround2D_reg(vecX dst, vecX src, immI rmode) %{
ins_pipe(vdop_fp128);
%}
dnl
define(`VPOPCOUNT', `
instruct vpopcount$1$2`'(vec$5 dst, vec$5 src) %{
predicate(UsePopCountInstruction && n->as_Vector()->length() == $1);
match(Set dst (PopCountVI src));
format %{
"cnt $dst, $src\t# vector ($3B)\n\t"
"uaddlp $dst, $dst\t# vector ($3B)\n\t"
"uaddlp $dst, $dst\t# vector ($4H)"
%}
ins_encode %{
__ cnt(as_FloatRegister($dst$$reg), __ T$3B,
as_FloatRegister($src$$reg));
__ uaddlp(as_FloatRegister($dst$$reg), __ T$3B,
define(`VPOPCOUNT', `dnl
ifelse($1$2, `LD', `
// If the PopCountVL is generated by auto-vectorization, the dst basic
// type is T_INT. Once we have unified the type definition for the
// Vector API and auto-vectorization, this rule can be merged with the
// "vpopcountLX" rule.', `')
instruct vpopcount$1$2`'(vec$2 dst, vec$3 src) %{
predicate(n->as_Vector()->length_in_bytes() $4 16`'ifelse($1$2, `LD', ` &&
n->bottom_type()->is_vect()->element_basic_type() == T_INT', $1$2, `LX', ` &&
n->bottom_type()->is_vect()->element_basic_type() == T_LONG', `'));
match(Set dst (PopCountV$1 src));
ins_cost($5 * INSN_COST);
format %{ "vpopcount$1 $dst, $src\t# vector ($6)" %}
ins_encode %{dnl
ifelse($1, `I', `
BasicType bt = Matcher::vector_element_basic_type(this);', `')
__ cnt(as_FloatRegister($dst$$reg), __ T`'ifelse($3, D, 8, 16)B,
as_FloatRegister($src$$reg));dnl
ifelse($1, `L', `
__ uaddlp(as_FloatRegister($dst$$reg), __ T16B,
as_FloatRegister($dst$$reg));
__ uaddlp(as_FloatRegister($dst$$reg), __ T$4H,
__ uaddlp(as_FloatRegister($dst$$reg), __ T8H,
as_FloatRegister($dst$$reg));
__ uaddlp(as_FloatRegister($dst$$reg), __ T4S,
as_FloatRegister($dst$$reg));', `
if (bt == T_SHORT || bt == T_INT) {
__ uaddlp(as_FloatRegister($dst$$reg), __ T`'ifelse($2, D, 8, 16)B,
as_FloatRegister($dst$$reg));
if (bt == T_INT) {
__ uaddlp(as_FloatRegister($dst$$reg), __ T`'ifelse($2, D, 4, 8)H,
as_FloatRegister($dst$$reg));
}
}')dnl
ifelse($1$2, `LD', `
__ xtn(as_FloatRegister($dst$$reg), __ T2S,
as_FloatRegister($dst$$reg), __ T2D);', `')
%}
ins_pipe(pipe_class_default);
%}')dnl
dnl $1 $2 $3 $4 $5
VPOPCOUNT(4, I, 16, 8, X)
VPOPCOUNT(2, I, 8, 4, D)
dnl $1 $2 $3 $4 $5 $6
VPOPCOUNT(I, D, D, <, 3, 8B/4H/2S)
VPOPCOUNT(I, X, X, ==, 3, 16B/8H/4S)
VPOPCOUNT(L, D, X, <, 5, 2S)
VPOPCOUNT(L, X, X, ==, 4, 2D)
dnl
dnl VMASK_TRUECOUNT($1, $2 )
dnl VMASK_TRUECOUNT(suffix, reg)
@ -2647,3 +2669,81 @@ instruct vmask_tolong16B(iRegLNoSp dst, vecX src) %{
%}
ins_pipe(pipe_slow);
%}
dnl
dnl CLTZ_D($1 )
dnl CLTZ_D(op_name)
define(`CLTZ_D', `
instruct count$1D(vecD dst, vecD src) %{
predicate(n->as_Vector()->length_in_bytes() == 8);
match(Set dst (Count$1 src));
ins_cost(ifelse($1, `TrailingZerosV', `3 * ', `')INSN_COST);
format %{ "count$1 $dst, $src\t# vector (8B/4H/2S)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_Arrangement size = __ esize2arrangement((unsigned)type2aelembytes(bt), false);dnl
ifelse($1, `TrailingZerosV', `
__ neon_reverse_bits(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), bt, false);', `')
__ clz(as_FloatRegister($dst$$reg), size, as_FloatRegister($ifelse($1, `TrailingZerosV', dst, src)$$reg));
%}
ins_pipe(pipe_slow);
%}')dnl
dnl
dnl CLTZ_X($1 )
dnl CLTZ_X(op_name)
define(`CLTZ_X', `
instruct count$1X(vecX dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() == 16);
match(Set dst (Count$1 src));
ins_cost(ifelse($1, `TrailingZerosV', `3 * ', `')INSN_COST);
format %{ "count$1 $dst, $src\t# vector (16B/8H/4S/2D)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_Arrangement size = __ esize2arrangement((unsigned)type2aelembytes(bt), true);dnl
ifelse($1, `TrailingZerosV', `
__ neon_reverse_bits(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), bt, true);', `')
if (bt != T_LONG) {
__ clz(as_FloatRegister($dst$$reg), size, as_FloatRegister($ifelse($1, `TrailingZerosV', dst, src)$$reg));
} else {
__ umov(rscratch1, as_FloatRegister($ifelse($1, `TrailingZerosV', dst, src)$$reg), __ D, 0);
__ clz(rscratch1, rscratch1);
__ mov(as_FloatRegister($dst$$reg), __ D, 0, rscratch1);
__ umov(rscratch1, as_FloatRegister($ifelse($1, `TrailingZerosV', dst, src)$$reg), __ D, 1);
__ clz(rscratch1, rscratch1);
__ mov(as_FloatRegister($dst$$reg), __ D, 1, rscratch1);
}
%}
ins_pipe(pipe_slow);
%}')dnl
dnl
//------------------------- CountLeadingZerosV -----------------------------
CLTZ_D(LeadingZerosV)
CLTZ_X(LeadingZerosV)
//------------------------- CountTrailingZerosV ----------------------------
CLTZ_D(TrailingZerosV)
CLTZ_X(TrailingZerosV)
dnl
dnl REVERSE($1, $2, $3, $4 )
dnl REVERSE(insn_name, op_name, type, insn)
define(`REVERSE', `
instruct $1(vec$3 dst, vec$3 src) %{
predicate(n->as_Vector()->length_in_bytes() == ifelse($3, D, 8, 16));
match(Set dst ($2 src));
ins_cost(ifelse($2, `ReverseV', `2 * ', `')INSN_COST);
format %{ "$2 $dst, $src\t# vector ($3)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
__ $4(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), bt, ifelse($3, D, false, true));
%}
ins_pipe(pipe_slow);
%}')dnl
dnl
//------------------------------ ReverseV -----------------------------------
REVERSE(vreverseD, ReverseV, D, neon_reverse_bits)
REVERSE(vreverseX, ReverseV, X, neon_reverse_bits)
//---------------------------- ReverseBytesV --------------------------------
REVERSE(vreverseBytesD, ReverseBytesV, D, neon_reverse_bytes)
REVERSE(vreverseBytesX, ReverseBytesV, X, neon_reverse_bytes)

View file

@ -149,6 +149,8 @@ source %{
case Op_LoadVector:
case Op_StoreVector:
return Matcher::vector_size_supported(bt, vlen);
case Op_ExpandV:
if (UseSVE < 2 || is_subword_type(bt)) return false;
case Op_VectorMaskToLong:
if (vlen > 64) return false;
default:
@ -2199,14 +2201,83 @@ instruct vnegD_masked(vReg dst_src, pRegGov pg) %{
ins_pipe(pipe_slow);
%}
// popcount vector
// vector popcount
instruct vpopcountI(vReg dst, vReg src) %{
predicate(UseSVE > 0);
predicate(UseSVE > 0 &&
!n->as_Vector()->is_predicated_vector());
match(Set dst (PopCountVI src));
format %{ "sve_cnt $dst, $src\t# vector (sve) (S)\n\t" %}
ins_cost(SVE_COST);
format %{ "sve_cnt $dst, $src\t# vector (sve) (B/H/S)" %}
ins_encode %{
__ sve_cnt(as_FloatRegister($dst$$reg), __ S, ptrue, as_FloatRegister($src$$reg));
BasicType bt = Matcher::vector_element_basic_type(this);
__ sve_cnt(as_FloatRegister($dst$$reg), __ elemType_to_regVariant(bt),
ptrue, as_FloatRegister($src$$reg));
%}
ins_pipe(pipe_slow);
%}
instruct vpopcountL(vReg dst, vReg src) %{
predicate(UseSVE > 0 &&
!n->as_Vector()->is_predicated_vector() &&
n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
match(Set dst (PopCountVL src));
ins_cost(SVE_COST);
format %{ "sve_cnt $dst, $src\t# vector (sve) (D)" %}
ins_encode %{
__ sve_cnt(as_FloatRegister($dst$$reg), __ D,
ptrue, as_FloatRegister($src$$reg));
%}
ins_pipe(pipe_slow);
%}
// If the PopCountVL is generated by auto-vectorization, the dst basic
// type is T_INT. Once we have unified the type definition for the
// Vector API and auto-vectorization, this rule can be merged with the
// "vpopcountL" rule.
instruct vpopcountLI(vReg dst, vReg src, vReg vtmp) %{
predicate(UseSVE > 0 &&
!n->as_Vector()->is_predicated_vector() &&
n->bottom_type()->is_vect()->element_basic_type() == T_INT);
match(Set dst (PopCountVL src));
effect(TEMP_DEF dst, TEMP vtmp);
ins_cost(3 * SVE_COST);
format %{ "sve_cnt $dst, $src\n\t"
"sve_dup $vtmp, #0\n\t"
"sve_uzp1 $dst, $dst, $vtmp\t# vector (sve) (S)" %}
ins_encode %{
__ sve_cnt(as_FloatRegister($dst$$reg), __ D,
ptrue, as_FloatRegister($src$$reg));
__ sve_vector_narrow(as_FloatRegister($dst$$reg), __ S,
as_FloatRegister($dst$$reg), __ D, as_FloatRegister($vtmp$$reg));
%}
ins_pipe(pipe_slow);
%}
// vector popcount - predicated
instruct vpopcountI_masked(vReg dst_src, pRegGov pg) %{
predicate(UseSVE > 0);
match(Set dst_src (PopCountVI dst_src pg));
ins_cost(SVE_COST);
format %{ "sve_cnt $dst_src, $pg, $dst_src\t# vector (sve) (B/H/S)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
__ sve_cnt(as_FloatRegister($dst_src$$reg), __ elemType_to_regVariant(bt),
as_PRegister($pg$$reg), as_FloatRegister($dst_src$$reg));
%}
ins_pipe(pipe_slow);
%}
instruct vpopcountL_masked(vReg dst_src, pRegGov pg) %{
predicate(UseSVE > 0 &&
n->bottom_type()->is_vect()->element_basic_type() == T_LONG);
match(Set dst_src (PopCountVL dst_src pg));
ins_cost(SVE_COST);
format %{ "sve_cnt $dst_src, $pg, $dst_src\t# vector (sve) (D)" %}
ins_encode %{
__ sve_cnt(as_FloatRegister($dst_src$$reg), __ D,
as_PRegister($pg$$reg), as_FloatRegister($dst_src$$reg));
%}
ins_pipe(pipe_slow);
%}
@ -5767,6 +5838,104 @@ instruct vloadmask_extend(pRegGov dst, vReg src, vReg tmp, rFlagsReg cr) %{
ins_pipe(pipe_slow);
%}
// ---------------------------- Compress/Expand Operations ---------------------------
instruct mcompress(pReg dst, pReg pg, rFlagsReg cr) %{
predicate(UseSVE > 0);
match(Set dst (CompressM pg));
effect(KILL cr);
ins_cost(2 * SVE_COST);
format %{ "sve_cntp rscratch1, $pg\n\t"
"sve_whilelo $dst, zr, rscratch1\t# mask compress (B/H/S/D)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_cntp(rscratch1, size, ptrue, as_PRegister($pg$$reg));
__ sve_whilelo(as_PRegister($dst$$reg), size, zr, rscratch1);
%}
ins_pipe(pipe_slow);
%}
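CompressM packs the active lanes of a mask toward lane 0, so the result is simply a prefix mask whose length is the number of active lanes; that is exactly what sve_cntp (count the active lanes) plus sve_whilelo (build a prefix predicate) compute. A sketch over a mask held as a long bitset, one bit per lane (plain Java, illustrative helper only):

static long compressMask(long maskBits) {
    int trueCount = Long.bitCount(maskBits);              // sve_cntp
    return trueCount == 64 ? -1L : (1L << trueCount) - 1; // sve_whilelo: prefix of trueCount ones
}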
instruct vcompress(vReg dst, vReg src, pRegGov pg) %{
predicate(UseSVE > 0 &&
(n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT ||
n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
match(Set dst (CompressV src pg));
ins_cost(SVE_COST);
format %{ "sve_compact $dst, $src, $pg\t# vector compress (S/D)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_compact(as_FloatRegister($dst$$reg), size, as_FloatRegister($src$$reg), as_PRegister($pg$$reg));
%}
ins_pipe(pipe_slow);
%}
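Per lane, CompressV (sve_compact) packs the active elements toward lane 0 and zero-fills the rest. Scalar sketch for int lanes (plain Java, illustration only):

static int[] compress(int[] src, boolean[] mask) {
    int[] dst = new int[src.length];
    int j = 0;
    for (int i = 0; i < src.length; i++) {
        if (mask[i]) {
            dst[j++] = src[i];  // active lanes move toward index 0
        }
    }
    return dst;                 // lanes j .. length-1 remain zero
}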
instruct vcompressB(vReg dst, vReg src, pReg pg, vReg vtmp1, vReg vtmp2, vReg vtmp3, vReg vtmp4,
pReg ptmp, pRegGov pgtmp) %{
predicate(UseSVE > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
effect(TEMP_DEF dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP ptmp, TEMP pgtmp);
match(Set dst (CompressV src pg));
ins_cost(13 * SVE_COST);
format %{ "sve_compact $dst, $src, $pg\t# vector compress (B)" %}
ins_encode %{
__ sve_compress_byte(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_PRegister($pg$$reg),
as_FloatRegister($vtmp1$$reg),as_FloatRegister($vtmp2$$reg),
as_FloatRegister($vtmp3$$reg),as_FloatRegister($vtmp4$$reg),
as_PRegister($ptmp$$reg), as_PRegister($pgtmp$$reg));
%}
ins_pipe(pipe_slow);
%}
instruct vcompressS(vReg dst, vReg src, pReg pg, vReg vtmp1, vReg vtmp2, pRegGov pgtmp) %{
predicate(UseSVE > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
effect(TEMP_DEF dst, TEMP vtmp1, TEMP vtmp2, TEMP pgtmp);
match(Set dst (CompressV src pg));
ins_cost(38 * SVE_COST);
format %{ "sve_compact $dst, $src, $pg\t# vector compress (H)" %}
ins_encode %{
__ sve_compress_short(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_PRegister($pg$$reg),
as_FloatRegister($vtmp1$$reg),as_FloatRegister($vtmp2$$reg), as_PRegister($pgtmp$$reg));
%}
ins_pipe(pipe_slow);
%}
instruct vexpand(vReg dst, vReg src, pRegGov pg) %{
match(Set dst (ExpandV src pg));
effect(TEMP_DEF dst);
ins_cost(4 * SVE_COST);
format %{ "sve_dup $dst, S/D, 0\n\t"
"sve_histcnt $dst, S/D, $pg, $dst, $dst\n\t"
"sve_sub $dst, S/D, 1\n\t"
"sve_tbl $dst, S/D, $src, $dst\t# vector expand (S/D)" %}
ins_encode %{
// Example input: src = 1 2 3 4 5 6 7 8
// pg = 1 0 0 1 1 0 1 1
// Expected result: dst = 4 0 0 5 6 0 7 8
// The basic idea is to use TBL which can shuffle the elements in the given
// vector flexibly. HISTCNT + SUB is used to generate the second source input
// for TBL whose value is used to select the indexed element from src vector.
BasicType bt = Matcher::vector_element_basic_type(this);
assert(UseSVE == 2 && !is_subword_type(bt), "unsupported");
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
// dst = 0 0 0 0 0 0 0 0
__ sve_dup(as_FloatRegister($dst$$reg), size, 0);
// dst = 5 0 0 4 3 0 2 1
__ sve_histcnt(as_FloatRegister($dst$$reg), size, as_PRegister($pg$$reg),
as_FloatRegister($dst$$reg), as_FloatRegister($dst$$reg));
// dst = 4 -1 -1 3 2 -1 1 0
__ sve_sub(as_FloatRegister($dst$$reg), size, 1);
// dst = 4 0 0 5 6 0 7 8
__ sve_tbl(as_FloatRegister($dst$$reg), size, as_FloatRegister($src$$reg),
as_FloatRegister($dst$$reg));
%}
ins_pipe(pipe_slow);
%}
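ExpandV is the inverse operation: source elements are consumed from lane 0 upward and written only into the active destination lanes, with inactive lanes zeroed. This matches the example in the comment above, reading lane 0 as the rightmost element in that notation. Scalar sketch (plain Java, illustration only):

static int[] expand(int[] src, boolean[] mask) {
    int[] dst = new int[src.length];
    int j = 0;
    for (int i = 0; i < dst.length; i++) {
        dst[i] = mask[i] ? src[j++] : 0; // consume src lanes in order, zero inactive lanes
    }
    return dst;
}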
instruct vmask_gen(pRegGov pg, iRegL len, rFlagsReg cr) %{
predicate(UseSVE > 0);
match(Set pg (VectorMaskGen len));
@ -5780,3 +5949,147 @@ instruct vmask_gen(pRegGov pg, iRegL len, rFlagsReg cr) %{
%}
ins_pipe(pipe_slow);
%}
// ------------------------------ CountLeadingZerosV ------------------------------
instruct vcountLeadingZeros(vReg dst, vReg src) %{
predicate(UseSVE > 0 &&
!n->as_Vector()->is_predicated_vector());
match(Set dst (CountLeadingZerosV src));
ins_cost(SVE_COST);
format %{ "sve_clz $dst, $src\t# vector (sve)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_clz(as_FloatRegister($dst$$reg), size, ptrue, as_FloatRegister($src$$reg));
%}
ins_pipe(pipe_slow);
%}
// The dst and src should use the same register to make sure the
// inactive lanes in dst save the same elements as src.
instruct vcountLeadingZeros_masked(vReg dst_src, pRegGov pg) %{
predicate(UseSVE > 0);
match(Set dst_src (CountLeadingZerosV dst_src pg));
ins_cost(SVE_COST);
format %{ "sve_clz $dst_src, $pg, $dst_src\t# vector (sve)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_clz(as_FloatRegister($dst_src$$reg), size,
as_PRegister($pg$$reg), as_FloatRegister($dst_src$$reg));
%}
ins_pipe(pipe_slow);
%}
// ------------------------------ CountTrailingZerosV -----------------------------
instruct vcountTrailingZeros(vReg dst, vReg src) %{
predicate(UseSVE > 0 &&
!n->as_Vector()->is_predicated_vector());
match(Set dst (CountTrailingZerosV src));
ins_cost(2 * SVE_COST);
format %{ "sve_rbit $dst, $src\n\t"
"sve_clz $dst, $dst\t# vector (sve)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_rbit(as_FloatRegister($dst$$reg), size, ptrue, as_FloatRegister($src$$reg));
__ sve_clz(as_FloatRegister($dst$$reg), size, ptrue, as_FloatRegister($dst$$reg));
%}
ins_pipe(pipe_slow);
%}
// The dst and src should use the same register to make sure the
// inactive lanes in dst save the same elements as src.
instruct vcountTrailingZeros_masked(vReg dst_src, pRegGov pg) %{
predicate(UseSVE > 0);
match(Set dst_src (CountTrailingZerosV dst_src pg));
ins_cost(2 * SVE_COST);
format %{ "sve_rbit $dst_src, $pg, $dst_src\n\t"
"sve_clz $dst_src, $pg, $dst_src\t# vector (sve)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_rbit(as_FloatRegister($dst_src$$reg), size,
as_PRegister($pg$$reg), as_FloatRegister($dst_src$$reg));
__ sve_clz(as_FloatRegister($dst_src$$reg), size,
as_PRegister($pg$$reg), as_FloatRegister($dst_src$$reg));
%}
ins_pipe(pipe_slow);
%}
// ---------------------------------- ReverseV ------------------------------------
instruct vreverse(vReg dst, vReg src) %{
predicate(UseSVE > 0 &&
!n->as_Vector()->is_predicated_vector());
match(Set dst (ReverseV src));
ins_cost(SVE_COST);
format %{ "sve_rbit $dst, $src\t# vector (sve)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_rbit(as_FloatRegister($dst$$reg), size, ptrue, as_FloatRegister($src$$reg));
%}
ins_pipe(pipe_slow);
%}
// The dst and src should use the same register to make sure the
// inactive lanes in dst save the same elements as src.
instruct vreverse_masked(vReg dst_src, pRegGov pg) %{
predicate(UseSVE > 0);
match(Set dst_src (ReverseV dst_src pg));
ins_cost(SVE_COST);
format %{ "sve_rbit $dst_src, $pg, $dst_src\t# vector (sve)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_rbit(as_FloatRegister($dst_src$$reg), size,
as_PRegister($pg$$reg), as_FloatRegister($dst_src$$reg));
%}
ins_pipe(pipe_slow);
%}
// -------------------------------- ReverseBytesV ---------------------------------
instruct vreverseBytes(vReg dst, vReg src) %{
predicate(UseSVE > 0 &&
!n->as_Vector()->is_predicated_vector());
match(Set dst (ReverseBytesV src));
ins_cost(SVE_COST);
format %{ "sve_revb $dst, $src\t# vector (sve)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
if (bt == T_BYTE) {
if (as_FloatRegister($dst$$reg) != as_FloatRegister($src$$reg)) {
__ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_FloatRegister($src$$reg));
}
} else {
__ sve_revb(as_FloatRegister($dst$$reg), size, ptrue, as_FloatRegister($src$$reg));
}
%}
ins_pipe(pipe_slow);
%}
// The dst and src should use the same register to make sure the
// inactive lanes in dst save the same elements as src.
instruct vreverseBytes_masked(vReg dst_src, pRegGov pg) %{
predicate(UseSVE > 0);
match(Set dst_src (ReverseBytesV dst_src pg));
ins_cost(SVE_COST);
format %{ "sve_revb $dst_src, $pg, $dst_src\t# vector (sve)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
if (bt == T_BYTE) {
// do nothing
} else {
__ sve_revb(as_FloatRegister($dst_src$$reg), size,
as_PRegister($pg$$reg), as_FloatRegister($dst_src$$reg));
}
%}
ins_pipe(pipe_slow);
%}

View file

@ -144,6 +144,8 @@ source %{
case Op_LoadVector:
case Op_StoreVector:
return Matcher::vector_size_supported(bt, vlen);
case Op_ExpandV:
if (UseSVE < 2 || is_subword_type(bt)) return false;
case Op_VectorMaskToLong:
if (vlen > 64) return false;
default:
@ -1172,18 +1174,75 @@ UNARY_OP_PREDICATE(vnegL, NegVL, D, sve_neg)
UNARY_OP_PREDICATE(vnegF, NegVF, S, sve_fneg)
UNARY_OP_PREDICATE(vnegD, NegVD, D, sve_fneg)
// popcount vector
dnl
dnl VPOPCOUNT($1, $2 )
dnl VPOPCOUNT(name_suffix, size)
define(`VPOPCOUNT', `
instruct vpopcount$1(vReg dst, vReg src) %{
predicate(UseSVE > 0 &&
!n->as_Vector()->is_predicated_vector()`'ifelse($1, `L', ` &&
n->bottom_type()->is_vect()->element_basic_type() == T_LONG', `'));
match(Set dst (PopCountV$1 src));
ins_cost(SVE_COST);
format %{ "sve_cnt $dst, $src\t# vector (sve) ($2)" %}
ins_encode %{dnl
ifelse($1, `I', `
BasicType bt = Matcher::vector_element_basic_type(this);', `')
__ sve_cnt(as_FloatRegister($dst$$reg), ifelse($1, `I', `__ elemType_to_regVariant(bt)', `__ D'),
ptrue, as_FloatRegister($src$$reg));
%}
ins_pipe(pipe_slow);
%}')dnl
dnl
// vector popcount
VPOPCOUNT(I, B/H/S)
VPOPCOUNT(L, D)
instruct vpopcountI(vReg dst, vReg src) %{
predicate(UseSVE > 0);
match(Set dst (PopCountVI src));
format %{ "sve_cnt $dst, $src\t# vector (sve) (S)\n\t" %}
// If the PopCountVL is generated by auto-vectorization, the dst basic
// type is T_INT. Once we have unified the type definition for the
// Vector API and auto-vectorization, this rule can be merged with the
// "vpopcountL" rule.
instruct vpopcountLI(vReg dst, vReg src, vReg vtmp) %{
predicate(UseSVE > 0 &&
!n->as_Vector()->is_predicated_vector() &&
n->bottom_type()->is_vect()->element_basic_type() == T_INT);
match(Set dst (PopCountVL src));
effect(TEMP_DEF dst, TEMP vtmp);
ins_cost(3 * SVE_COST);
format %{ "sve_cnt $dst, $src\n\t"
"sve_dup $vtmp, #0\n\t"
"sve_uzp1 $dst, $dst, $vtmp\t# vector (sve) (S)" %}
ins_encode %{
__ sve_cnt(as_FloatRegister($dst$$reg), __ S, ptrue, as_FloatRegister($src$$reg));
__ sve_cnt(as_FloatRegister($dst$$reg), __ D,
ptrue, as_FloatRegister($src$$reg));
__ sve_vector_narrow(as_FloatRegister($dst$$reg), __ S,
as_FloatRegister($dst$$reg), __ D, as_FloatRegister($vtmp$$reg));
%}
ins_pipe(pipe_slow);
%}
dnl
dnl VPOPCOUNT_PREDICATE($1, $2 )
dnl VPOPCOUNT_PREDICATE(name_suffix, size)
define(`VPOPCOUNT_PREDICATE', `
instruct vpopcount$1_masked(vReg dst_src, pRegGov pg) %{
predicate(UseSVE > 0`'ifelse($1, `L', ` &&
n->bottom_type()->is_vect()->element_basic_type() == T_LONG', `'));
match(Set dst_src (PopCountV$1 dst_src pg));
ins_cost(SVE_COST);
format %{ "sve_cnt $dst_src, $pg, $dst_src\t# vector (sve) ($2)" %}
ins_encode %{dnl
ifelse($1, `I', `
BasicType bt = Matcher::vector_element_basic_type(this);', `')
__ sve_cnt(as_FloatRegister($dst_src$$reg), ifelse($1, `I', `__ elemType_to_regVariant(bt)', `__ D'),
as_PRegister($pg$$reg), as_FloatRegister($dst_src$$reg));
%}
ins_pipe(pipe_slow);
%}')dnl
// vector popcount - predicated
VPOPCOUNT_PREDICATE(I, B/H/S)
VPOPCOUNT_PREDICATE(L, D)
// vector blend
instruct vblend(vReg dst, vReg src1, vReg src2, pRegGov pg) %{
@ -3234,6 +3293,104 @@ instruct vloadmask_extend(pRegGov dst, vReg src, vReg tmp, rFlagsReg cr) %{
ins_pipe(pipe_slow);
%}
// ---------------------------- Compress/Expand Operations ---------------------------
instruct mcompress(pReg dst, pReg pg, rFlagsReg cr) %{
predicate(UseSVE > 0);
match(Set dst (CompressM pg));
effect(KILL cr);
ins_cost(2 * SVE_COST);
format %{ "sve_cntp rscratch1, $pg\n\t"
"sve_whilelo $dst, zr, rscratch1\t# mask compress (B/H/S/D)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_cntp(rscratch1, size, ptrue, as_PRegister($pg$$reg));
__ sve_whilelo(as_PRegister($dst$$reg), size, zr, rscratch1);
%}
ins_pipe(pipe_slow);
%}
instruct vcompress(vReg dst, vReg src, pRegGov pg) %{
predicate(UseSVE > 0 &&
(n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT ||
n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
match(Set dst (CompressV src pg));
ins_cost(SVE_COST);
format %{ "sve_compact $dst, $src, $pg\t# vector compress (S/D)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_compact(as_FloatRegister($dst$$reg), size, as_FloatRegister($src$$reg), as_PRegister($pg$$reg));
%}
ins_pipe(pipe_slow);
%}
instruct vcompressB(vReg dst, vReg src, pReg pg, vReg vtmp1, vReg vtmp2, vReg vtmp3, vReg vtmp4,
pReg ptmp, pRegGov pgtmp) %{
predicate(UseSVE > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
effect(TEMP_DEF dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP ptmp, TEMP pgtmp);
match(Set dst (CompressV src pg));
ins_cost(13 * SVE_COST);
format %{ "sve_compact $dst, $src, $pg\t# vector compress (B)" %}
ins_encode %{
__ sve_compress_byte(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_PRegister($pg$$reg),
as_FloatRegister($vtmp1$$reg),as_FloatRegister($vtmp2$$reg),
as_FloatRegister($vtmp3$$reg),as_FloatRegister($vtmp4$$reg),
as_PRegister($ptmp$$reg), as_PRegister($pgtmp$$reg));
%}
ins_pipe(pipe_slow);
%}
instruct vcompressS(vReg dst, vReg src, pReg pg, vReg vtmp1, vReg vtmp2, pRegGov pgtmp) %{
predicate(UseSVE > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
effect(TEMP_DEF dst, TEMP vtmp1, TEMP vtmp2, TEMP pgtmp);
match(Set dst (CompressV src pg));
ins_cost(38 * SVE_COST);
format %{ "sve_compact $dst, $src, $pg\t# vector compress (H)" %}
ins_encode %{
__ sve_compress_short(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_PRegister($pg$$reg),
as_FloatRegister($vtmp1$$reg),as_FloatRegister($vtmp2$$reg), as_PRegister($pgtmp$$reg));
%}
ins_pipe(pipe_slow);
%}
instruct vexpand(vReg dst, vReg src, pRegGov pg) %{
match(Set dst (ExpandV src pg));
effect(TEMP_DEF dst);
ins_cost(4 * SVE_COST);
format %{ "sve_dup $dst, S/D, 0\n\t"
"sve_histcnt $dst, S/D, $pg, $dst, $dst\n\t"
"sve_sub $dst, S/D, 1\n\t"
"sve_tbl $dst, S/D, $src, $dst\t# vector expand (S/D)" %}
ins_encode %{
// Example input: src = 1 2 3 4 5 6 7 8
// pg = 1 0 0 1 1 0 1 1
// Expected result: dst = 4 0 0 5 6 0 7 8
// The basic idea is to use TBL which can shuffle the elements in the given
// vector flexibly. HISTCNT + SUB is used to generate the second source input
// for TBL whose value is used to select the indexed element from src vector.
BasicType bt = Matcher::vector_element_basic_type(this);
assert(UseSVE == 2 && !is_subword_type(bt), "unsupported");
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
// dst = 0 0 0 0 0 0 0 0
__ sve_dup(as_FloatRegister($dst$$reg), size, 0);
// dst = 5 0 0 4 3 0 2 1
__ sve_histcnt(as_FloatRegister($dst$$reg), size, as_PRegister($pg$$reg),
as_FloatRegister($dst$$reg), as_FloatRegister($dst$$reg));
// dst = 4 -1 -1 3 2 -1 1 0
__ sve_sub(as_FloatRegister($dst$$reg), size, 1);
// dst = 4 0 0 5 6 0 7 8
__ sve_tbl(as_FloatRegister($dst$$reg), size, as_FloatRegister($src$$reg),
as_FloatRegister($dst$$reg));
%}
ins_pipe(pipe_slow);
%}
instruct vmask_gen(pRegGov pg, iRegL len, rFlagsReg cr) %{
predicate(UseSVE > 0);
match(Set pg (VectorMaskGen len));
@ -3247,3 +3404,79 @@ instruct vmask_gen(pRegGov pg, iRegL len, rFlagsReg cr) %{
%}
ins_pipe(pipe_slow);
%}
dnl
dnl BITWISE_UNARY($1, $2, $3 )
dnl BITWISE_UNARY(insn_name, op_name, insn)
define(`BITWISE_UNARY', `
instruct $1(vReg dst, vReg src) %{
predicate(UseSVE > 0 &&
!n->as_Vector()->is_predicated_vector());
match(Set dst ($2 src));
ins_cost(ifelse($2, `CountTrailingZerosV', `2 * ', `')SVE_COST);
format %{ ifelse($2, `CountTrailingZerosV', `"sve_rbit $dst, $src\n\t"
"$3 $dst, $dst', `"$3 $dst, $src')\t# vector (sve)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);dnl
ifelse($2, `CountTrailingZerosV', `
__ sve_rbit(as_FloatRegister($dst$$reg), size, ptrue, as_FloatRegister($src$$reg));', `')dnl
ifelse($2, `ReverseBytesV', `
if (bt == T_BYTE) {
if (as_FloatRegister($dst$$reg) != as_FloatRegister($src$$reg)) {
__ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_FloatRegister($src$$reg));
}
} else {
__ $3(as_FloatRegister($dst$$reg), size, ptrue, as_FloatRegister($src$$reg));
}', `
__ $3(as_FloatRegister($dst$$reg), size, ptrue, as_FloatRegister($ifelse($2, `CountTrailingZerosV', dst, src)$$reg));')
%}
ins_pipe(pipe_slow);
%}')dnl
dnl
dnl BITWISE_UNARY_PREDICATE($1, $2, $3 )
dnl BITWISE_UNARY_PREDICATE(insn_name, op_name, insn)
define(`BITWISE_UNARY_PREDICATE', `
// The dst and src should use the same register to make sure the
// inactive lanes in dst save the same elements as src.
instruct $1_masked(vReg dst_src, pRegGov pg) %{
predicate(UseSVE > 0);
match(Set dst_src ($2 dst_src pg));
ins_cost(ifelse($2, `CountTrailingZerosV', `2 * ', `')SVE_COST);
format %{ ifelse($2, `CountTrailingZerosV', `"sve_rbit $dst_src, $pg, $dst_src\n\t"
"$3 $dst_src, $pg, $dst_src', `"$3 $dst_src, $pg, $dst_src')\t# vector (sve)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);dnl
ifelse($2, `CountTrailingZerosV', `
__ sve_rbit(as_FloatRegister($dst_src$$reg), size,
as_PRegister($pg$$reg), as_FloatRegister($dst_src$$reg));', `')dnl
ifelse($2, `ReverseBytesV', `
if (bt == T_BYTE) {
// do nothing
} else {
__ $3(as_FloatRegister($dst_src$$reg), size,
as_PRegister($pg$$reg), as_FloatRegister($dst_src$$reg));
}', `
__ $3(as_FloatRegister($dst_src$$reg), size,
as_PRegister($pg$$reg), as_FloatRegister($dst_src$$reg));')
%}
ins_pipe(pipe_slow);
%}')dnl
dnl
// ------------------------------ CountLeadingZerosV ------------------------------
BITWISE_UNARY(vcountLeadingZeros, CountLeadingZerosV, sve_clz)
BITWISE_UNARY_PREDICATE(vcountLeadingZeros, CountLeadingZerosV, sve_clz)
// ------------------------------ CountTrailingZerosV -----------------------------
BITWISE_UNARY(vcountTrailingZeros, CountTrailingZerosV, sve_clz)
BITWISE_UNARY_PREDICATE(vcountTrailingZeros, CountTrailingZerosV, sve_clz)
// ---------------------------------- ReverseV ------------------------------------
BITWISE_UNARY(vreverse, ReverseV, sve_rbit)
BITWISE_UNARY_PREDICATE(vreverse, ReverseV, sve_rbit)
// -------------------------------- ReverseBytesV ---------------------------------
BITWISE_UNARY(vreverseBytes, ReverseBytesV, sve_revb)
BITWISE_UNARY_PREDICATE(vreverseBytes, ReverseBytesV, sve_revb)

View file

@ -3134,6 +3134,7 @@ public:
INSN(sve_andv, 0b00000100, 0b011010001); // bitwise and reduction to scalar
INSN(sve_asr, 0b00000100, 0b010000100); // vector arithmetic shift right
INSN(sve_bic, 0b00000100, 0b011011000); // vector bitwise clear
INSN(sve_clz, 0b00000100, 0b011001101); // vector count leading zero bits
INSN(sve_cnt, 0b00000100, 0b011010101); // count non-zero bits
INSN(sve_cpy, 0b00000101, 0b100000100); // copy scalar to each active vector element
INSN(sve_eor, 0b00000100, 0b011001000); // vector eor
@ -3793,6 +3794,19 @@ void sve_fcm(Condition cond, PRegister Pd, SIMD_RegVariant T,
INSN(sve_lastb, 0b1);
#undef INSN
// SVE reverse within elements
#define INSN(NAME, opc, cond) \
void NAME(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg, FloatRegister Zn) { \
starti; \
assert(cond, "invalid size"); \
f(0b00000101, 31, 24), f(T, 23, 22), f(0b1001, 21, 18), f(opc, 17, 16); \
f(0b100, 15, 13), pgrf(Pg, 10), rf(Zn, 5), rf(Zd, 0); \
}
INSN(sve_revb, 0b00, T == H || T == S || T == D);
INSN(sve_rbit, 0b11, T != Q);
#undef INSN
// SVE Create index starting from general-purpose register and incremented by immediate
void sve_index(FloatRegister Zd, SIMD_RegVariant T, Register Rn, int imm) {
starti;
@ -3819,6 +3833,23 @@ void sve_fcm(Condition cond, PRegister Pd, SIMD_RegVariant T,
f(0b001100, 15, 10), rf(Zn, 5), rf(Zd, 0);
}
// Shuffle active elements of vector to the right and fill with zero
void sve_compact(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, PRegister Pg) {
starti;
assert(T == S || T == D, "invalid size");
f(0b00000101, 31, 24), f(T, 23, 22), f(0b100001100, 21, 13);
pgrf(Pg, 10), rf(Zn, 5), rf(Zd, 0);
}
// SVE2 Count matching elements in vector
void sve_histcnt(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg,
FloatRegister Zn, FloatRegister Zm) {
starti;
assert(T == S || T == D, "invalid size");
f(0b01000101, 31, 24), f(T, 23, 22), f(0b1, 21), rf(Zm, 16);
f(0b110, 15, 13), pgrf(Pg, 10), rf(Zn, 5), rf(Zd, 0);
}
// SVE2 bitwise permute
#define INSN(NAME, opc) \
void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \

View file

@ -1113,10 +1113,12 @@ void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst
sve_uzp1(dst, S, src, tmp);
break;
case H:
assert_different_registers(dst, tmp);
sve_uzp1(dst, S, src, tmp);
sve_uzp1(dst, H, dst, tmp);
break;
case B:
assert_different_registers(dst, tmp);
sve_uzp1(dst, S, src, tmp);
sve_uzp1(dst, H, dst, tmp);
sve_uzp1(dst, B, dst, tmp);
@ -1128,6 +1130,7 @@ void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst
if (dst_size == H) {
sve_uzp1(dst, H, src, tmp);
} else { // B
assert_different_registers(dst, tmp);
sve_uzp1(dst, H, src, tmp);
sve_uzp1(dst, B, dst, tmp);
}
@ -1311,6 +1314,154 @@ void C2_MacroAssembler::sve_ptrue_lanecnt(PRegister dst, SIMD_RegVariant size, i
}
}
// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
PRegister pgtmp) {
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(dst, src, vtmp1, vtmp2);
assert_different_registers(mask, pgtmp);
// Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111
// mask = 0001 0000 0000 0001 0001 0000 0001 0001
// Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111
sve_dup(vtmp2, H, 0);
// Extend lowest half to type INT.
// dst = 00004444 00003333 00002222 00001111
sve_uunpklo(dst, S, src);
// pgtmp = 00000001 00000000 00000001 00000001
sve_punpklo(pgtmp, mask);
// Pack the active INT-sized elements to the right,
// and fill the remaining elements with zero.
// dst = 00000000 00004444 00002222 00001111
sve_compact(dst, S, dst, pgtmp);
// Narrow the result back to type SHORT.
// dst = 0000 0000 0000 0000 0000 4444 2222 1111
sve_uzp1(dst, H, dst, vtmp2);
// Count the active elements of lowest half.
// rscratch1 = 3
sve_cntp(rscratch1, S, ptrue, pgtmp);
// Repeat for the high half.
// pgtmp = 00000001 00000000 00000000 00000001
sve_punpkhi(pgtmp, mask);
// vtmp1 = 00008888 00007777 00006666 00005555
sve_uunpkhi(vtmp1, S, src);
// vtmp1 = 00000000 00000000 00008888 00005555
sve_compact(vtmp1, S, vtmp1, pgtmp);
// vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
sve_uzp1(vtmp1, H, vtmp1, vtmp2);
// Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111
// Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
// Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
// TRUE_CNT is the number of active elements in the compressed low part.
neg(rscratch1, rscratch1);
// vtmp2 = {4 3 2 1 0 -1 -2 -3}
sve_index(vtmp2, H, rscratch1, 1);
// vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
sve_tbl(vtmp1, H, vtmp1, vtmp2);
// Combine the compressed high part (after the shift) with the compressed low part.
// dst = 0000 0000 0000 8888 5555 4444 2222 1111
sve_orr(dst, dst, vtmp1);
}
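The cross-lane "left shift by TRUE_CNT lanes" above is done with a table lookup: sve_index builds indices starting at -TRUE_CNT and stepping by 1, and sve_tbl selects the indexed element from the compressed high part, with indices outside the vector (here the negative ones) producing zero. A scalar sketch of that single step (plain Java, illustrative helper only):

static short[] shiftLanesUp(short[] v, int trueCnt) {
    short[] out = new short[v.length];
    for (int lane = 0; lane < v.length; lane++) {
        int idx = lane - trueCnt;                              // sve_index(-TRUE_CNT, 1)
        out[lane] = (idx >= 0 && idx < v.length) ? v[idx] : 0; // sve_tbl: out of range -> 0
    }
    return out;
}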
// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
FloatRegister vtmp3, FloatRegister vtmp4,
PRegister ptmp, PRegister pgtmp) {
assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
assert_different_registers(mask, ptmp, pgtmp);
// Example input: src = 88 77 66 55 44 33 22 11
// mask = 01 00 00 01 01 00 01 01
// Expected result: dst = 00 00 00 88 55 44 22 11
sve_dup(vtmp4, B, 0);
// Extend lowest half to type SHORT.
// vtmp1 = 0044 0033 0022 0011
sve_uunpklo(vtmp1, H, src);
// ptmp = 0001 0000 0001 0001
sve_punpklo(ptmp, mask);
// Count the active elements of lowest half.
// rscratch2 = 3
sve_cntp(rscratch2, H, ptrue, ptmp);
// Pack the active SHORT-sized elements to the right,
// and fill the remaining elements with zero.
// dst = 0000 0044 0022 0011
sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
// Narrow the result back to type BYTE.
// dst = 00 00 00 00 00 44 22 11
sve_uzp1(dst, B, dst, vtmp4);
// Repeat for the high half.
// ptmp = 0001 0000 0000 0001
sve_punpkhi(ptmp, mask);
// vtmp2 = 0088 0077 0066 0055
sve_uunpkhi(vtmp2, H, src);
// vtmp1 = 0000 0000 0088 0055
sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
sve_dup(vtmp4, B, 0);
// vtmp1 = 00 00 00 00 00 00 88 55
sve_uzp1(vtmp1, B, vtmp1, vtmp4);
// Compressed low: dst = 00 00 00 00 00 44 22 11
// Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
// Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
// TRUE_CNT is the number of active elements in the compressed low part.
neg(rscratch2, rscratch2);
// vtmp2 = {4 3 2 1 0 -1 -2 -3}
sve_index(vtmp2, B, rscratch2, 1);
// vtmp1 = 00 00 00 88 55 00 00 00
sve_tbl(vtmp1, B, vtmp1, vtmp2);
// Combine the compressed high part (after the shift) with the compressed low part.
// dst = 00 00 00 88 55 44 22 11
sve_orr(dst, dst, vtmp1);
}
void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
SIMD_Arrangement size = isQ ? T16B : T8B;
if (bt == T_BYTE) {
rbit(dst, size, src);
} else {
neon_reverse_bytes(dst, src, bt, isQ);
rbit(dst, size, dst);
}
}
void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
SIMD_Arrangement size = isQ ? T16B : T8B;
switch (bt) {
case T_BYTE:
if (dst != src) {
orr(dst, size, src, src);
}
break;
case T_SHORT:
rev16(dst, size, src);
break;
case T_INT:
rev32(dst, size, src);
break;
case T_LONG:
rev64(dst, size, src);
break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
}
}
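neon_reverse_bits composes a byte reverse (rev16/rev32/rev64) with a bit reverse inside each byte (rbit on byte lanes); for a whole element this is equivalent to a plain bit reverse. A scalar check for an int element (plain Java, illustrative helper only):

static int reverseBitsInEachByte(int x) {
    int r = 0;
    for (int i = 0; i < 4; i++) {
        int b = (x >>> (8 * i)) & 0xFF;
        r |= (Integer.reverse(b) >>> 24) << (8 * i); // rbit on one byte lane
    }
    return r;
}
// For any int x: Integer.reverse(x) == reverseBitsInEachByte(Integer.reverseBytes(x))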
// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, SIMD_RegVariant size, FloatRegister src, int idx,

View file

@ -105,4 +105,20 @@
FloatRegister tmp2, PRegister ptmp,
SIMD_RegVariant T);
// Pack active elements of src, under the control of mask, into the
// lowest-numbered elements of dst. Any remaining elements of dst will
// be filled with zero.
void sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
FloatRegister vtmp3, FloatRegister vtmp4,
PRegister ptmp, PRegister pgtmp);
void sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
FloatRegister vtmp1, FloatRegister vtmp2,
PRegister pgtmp);
void neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ);
void neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ);
#endif // CPU_AARCH64_C2_MACROASSEMBLER_AARCH64_HPP

View file

@ -4838,18 +4838,54 @@ void Assembler::popcntl(Register dst, Register src) {
emit_int24(0x0F, (unsigned char)0xB8, (0xC0 | encode));
}
void Assembler::vpopcntd(XMMRegister dst, XMMRegister src, int vector_len) {
assert(VM_Version::supports_avx512_vpopcntdq(), "must support vpopcntdq feature");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
void Assembler::evpopcntb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512_bitalg(), "must support avx512bitalg feature");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x54, (0xC0 | encode));
}
void Assembler::evpopcntw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512_bitalg(), "must support avx512bitalg feature");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x54, (0xC0 | encode));
}
void Assembler::evpopcntd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512_vpopcntdq(), "must support vpopcntdq feature");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x55, (0xC0 | encode));
}
void Assembler::vpopcntq(XMMRegister dst, XMMRegister src, int vector_len) {
void Assembler::evpopcntq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512_vpopcntdq(), "must support vpopcntdq feature");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x55, (0xC0 | encode));
}
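The masked EVEX encoders above embed an opmask register and select between merge-masking (reset_is_clear_context when merge is true) and zero-masking. The per-lane effect, sketched for the popcount case (plain Java, illustration only):

static void maskedPopCount(long[] dst, long[] src, boolean[] mask, boolean merge) {
    for (int i = 0; i < dst.length; i++) {
        if (mask[i]) {
            dst[i] = Long.bitCount(src[i]);
        } else if (!merge) {
            dst[i] = 0;         // zero-masking clears inactive lanes
        }                       // merge-masking leaves inactive lanes unchanged
    }
}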
@ -7894,6 +7930,32 @@ void Assembler::evprorvq(XMMRegister dst, XMMRegister src, XMMRegister shift, in
emit_int16(0x14, (unsigned char)(0xC0 | encode));
}
void Assembler::evplzcntd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512cd(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x44, (0xC0 | encode));
}
void Assembler::evplzcntq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512cd(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x44, (0xC0 | encode));
}
void Assembler::vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len) {
assert(VM_Version::supports_evex(), "requires EVEX support");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support");
@ -7930,6 +7992,84 @@ void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegis
emit_int8(imm8);
}
void Assembler::evexpandps(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0x88, (0xC0 | encode));
}
void Assembler::evexpandpd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0x88, (0xC0 | encode));
}
void Assembler::evpexpandb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512_vbmi2(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x62, (0xC0 | encode));
}
void Assembler::evpexpandw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512_vbmi2(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x62, (0xC0 | encode));
}
void Assembler::evpexpandd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0x89, (0xC0 | encode));
}
void Assembler::evpexpandq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0x89, (0xC0 | encode));
}
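
For reference only (not part of this change): a minimal scalar sketch of what the masked expand forms encoded above do per lane. The function name and pointer-based signature are made up for illustration.

#include <cstdint>

// Scalar sketch of masked expand: consecutive source elements are written to
// the destination lanes whose mask bit is set; unselected lanes keep their old
// value under merge-masking or are zeroed under zero-masking.
static void expand_model(int64_t* dst, const int64_t* src, uint64_t mask, bool merge, int lanes) {
  int k = 0;                       // index of the next packed source element
  for (int i = 0; i < lanes; i++) {
    if ((mask >> i) & 1) {
      dst[i] = src[k++];           // place the next packed element into lane i
    } else if (!merge) {
      dst[i] = 0;                  // zero-masking clears unselected lanes
    }                              // merge-masking leaves them untouched
  }
}
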
// vinserti forms
void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
@ -7973,7 +8113,7 @@ void Assembler::vinserti32x4(XMMRegister dst, XMMRegister nds, XMMRegister src,
}
void Assembler::vinserti32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
assert(VM_Version::supports_avx(), "");
assert(VM_Version::supports_evex(), "");
assert(dst != xnoreg, "sanity");
assert(imm8 <= 0x03, "imm8: %u", imm8);
InstructionMark im(this);
@ -8032,7 +8172,7 @@ void Assembler::vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8
}
void Assembler::vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
assert(VM_Version::supports_avx2(), "");
assert(VM_Version::supports_evex(), "");
assert(imm8 <= 0x03, "imm8: %u", imm8);
InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@ -8045,7 +8185,7 @@ void Assembler::vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src,
}
void Assembler::vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
assert(VM_Version::supports_avx(), "");
assert(VM_Version::supports_evex(), "");
assert(dst != xnoreg, "sanity");
assert(imm8 <= 0x03, "imm8: %u", imm8);
InstructionMark im(this);
@ -8346,6 +8486,20 @@ void Assembler::vpsadbw(XMMRegister dst, XMMRegister nds, XMMRegister src, int v
emit_int16((unsigned char)0xF6, (0xC0 | encode));
}
void Assembler::vpunpckhwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int16(0x69, (0xC0 | encode));
}
void Assembler::vpunpcklwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int16(0x61, (0xC0 | encode));
}
void Assembler::vpunpckhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
@ -9862,6 +10016,14 @@ void Assembler::evpternlogq(XMMRegister dst, int imm8, KRegister mask, XMMRegist
emit_int8(imm8);
}
void Assembler::vgf2p8affineqb(XMMRegister dst, XMMRegister src2, XMMRegister src3, int imm8, int vector_len) {
assert(VM_Version::supports_gfni(), "requires GFNI support");
assert(VM_Version::supports_sse(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src3->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int24((unsigned char)0xCE, (unsigned char)(0xC0 | encode), imm8);
}
// duplicate 4-byte integer data from src into programmed locations in dest : requires AVX512VL
void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
assert(UseAVX >= 2, "");
@ -11555,6 +11717,85 @@ void Assembler::evpmovm2b(XMMRegister dst, KRegister src, int vector_len) {
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x28, (0xC0 | encode));
}
void Assembler::evpcompressb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512_vbmi2(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0x63, (0xC0 | encode));
}
void Assembler::evpcompressw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512_vbmi2(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0x63, (0xC0 | encode));
}
void Assembler::evpcompressd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0x8B, (0xC0 | encode));
}
void Assembler::evpcompressq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0x8B, (0xC0 | encode));
}
void Assembler::evcompressps(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0x8A, (0xC0 | encode));
}
void Assembler::evcompresspd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0x8A, (0xC0 | encode));
}
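
As with expand, a scalar sketch (illustrative only, hypothetical names) of the compress semantics the encoders above emit:

#include <cstdint>

// Scalar sketch of masked compress: source lanes whose mask bit is set are
// packed contiguously into the low lanes of the destination; the remaining
// lanes are left untouched under merge-masking or zeroed otherwise.
static void compress_model(int64_t* dst, const int64_t* src, uint64_t mask, bool merge, int lanes) {
  int k = 0;                       // next destination slot
  for (int i = 0; i < lanes; i++) {
    if ((mask >> i) & 1) {
      dst[k++] = src[i];           // pack the selected lane
    }
  }
  for (; k < lanes; k++) {
    if (!merge) {
      dst[k] = 0;                  // zero-masking clears the tail
    }
  }
}
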
#ifndef _LP64
void Assembler::incl(Register dst) {


@ -1878,8 +1878,10 @@ private:
void popcntl(Register dst, Address src);
void popcntl(Register dst, Register src);
void vpopcntd(XMMRegister dst, XMMRegister src, int vector_len);
void vpopcntq(XMMRegister dst, XMMRegister src, int vector_len);
void evpopcntb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evpopcntw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evpopcntd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evpopcntq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
#ifdef _LP64
void popcntq(Register dst, Address src);
@ -1945,6 +1947,12 @@ private:
void punpckldq(XMMRegister dst, Address src);
void vpunpckldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
// Interleave High Word
void vpunpckhwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
// Interleave Low Word
void vpunpcklwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
// Interleave High Doublewords
void vpunpckhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@ -2206,9 +2214,8 @@ private:
void shrxq(Register dst, Register src1, Register src2);
void bzhiq(Register dst, Register src1, Register src2);
void pdep(Register dst, Register src1, Register src2);
void pext(Register dst, Register src1, Register src2);
void pdep(Register dst, Register src1, Register src2);
//====================VECTOR ARITHMETIC=====================================
// Add Packed Floating-Point Values
@ -2437,6 +2444,8 @@ private:
void evpternlogq(XMMRegister dst, int imm8, KRegister mask, XMMRegister src2, XMMRegister src3, bool merge, int vector_len);
void evpternlogq(XMMRegister dst, int imm8, KRegister mask, XMMRegister src2, Address src3, bool merge, int vector_len);
void evplzcntd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evplzcntq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
// Sub packed integers
void psubb(XMMRegister dst, XMMRegister src);
@ -2581,6 +2590,21 @@ private:
void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len);
void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
// Vector compress/expand instructions.
void evpcompressb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evpcompressw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evpcompressd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evpcompressq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evcompressps(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evcompresspd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evpexpandb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evpexpandw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evpexpandd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evpexpandq(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evexpandps(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evexpandpd(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
// Vector Rotate Left/Right instruction.
void evprolvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
void evprolvq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
@ -2737,6 +2761,10 @@ private:
void evpblendmw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
void evpblendmd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
void evpblendmq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
// Galois field affine transformation instructions.
void vgf2p8affineqb(XMMRegister dst, XMMRegister src2, XMMRegister src3, int imm8, int vector_len);
protected:
// Next instructions require address alignment 16 bytes SSE mode.
// They should be called only from corresponding MacroAssembler instructions.


@ -2292,7 +2292,7 @@ void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister sr
case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
default: assert(false, "%s", type2name(elem_bt));
default: fatal("Unsupported type %s", type2name(elem_bt)); return;
}
}
@ -2309,7 +2309,7 @@ void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register
case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
default: assert(false, "%s", type2name(elem_bt));
default: fatal("Unsupported type %s", type2name(elem_bt)); return;
}
} else {
assert(vlen_enc != Assembler::AVX_512bit, "required");
@ -2321,7 +2321,7 @@ void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register
case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
default: assert(false, "%s", type2name(elem_bt));
default: fatal("Unsupported type %s", type2name(elem_bt)); return;
}
}
}
@ -2348,7 +2348,9 @@ void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMR
vcvtdq2pd(dst, dst, vlen_enc);
break;
}
default: assert(false, "%s", type2name(to_elem_bt));
default:
fatal("Unsupported type %s", type2name(to_elem_bt));
break;
}
}
@ -4496,6 +4498,71 @@ void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister
vector_mask_operation_helper(opc, dst, tmp, masklen);
}
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
Register rtmp2, int mask_len) {
kmov(rtmp1, src);
andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
mov64(rtmp2, -1L);
pext(rtmp2, rtmp2, rtmp1);
kmov(dst, rtmp2);
}
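
A scalar model of the kmov/pext sequence above, shown for reference only (the helper name is hypothetical, and a GCC/Clang builtin is used for brevity): because the PEXT source operand is all ones, the result is a contiguous run of ones whose length is the population count of the length-limited source mask.

#include <cstdint>

// Assumes 1 <= mask_len <= 64.
static uint64_t mask_compress_model(uint64_t src_mask, int mask_len) {
  uint64_t masked = src_mask & (~0ULL >> (64 - mask_len));   // keep the low mask_len bits
  int bits = __builtin_popcountll(masked);                   // number of selected lanes
  return (bits == 64) ? ~0ULL : ((1ULL << bits) - 1);        // packed run of ones from bit 0
}
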
void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
bool merge, BasicType bt, int vec_enc) {
if (opcode == Op_CompressV) {
switch(bt) {
case T_BYTE:
evpcompressb(dst, mask, src, merge, vec_enc);
break;
case T_CHAR:
case T_SHORT:
evpcompressw(dst, mask, src, merge, vec_enc);
break;
case T_INT:
evpcompressd(dst, mask, src, merge, vec_enc);
break;
case T_FLOAT:
evcompressps(dst, mask, src, merge, vec_enc);
break;
case T_LONG:
evpcompressq(dst, mask, src, merge, vec_enc);
break;
case T_DOUBLE:
evcompresspd(dst, mask, src, merge, vec_enc);
break;
default:
fatal("Unsupported type %s", type2name(bt));
break;
}
} else {
assert(opcode == Op_ExpandV, "");
switch(bt) {
case T_BYTE:
evpexpandb(dst, mask, src, merge, vec_enc);
break;
case T_CHAR:
case T_SHORT:
evpexpandw(dst, mask, src, merge, vec_enc);
break;
case T_INT:
evpexpandd(dst, mask, src, merge, vec_enc);
break;
case T_FLOAT:
evexpandps(dst, mask, src, merge, vec_enc);
break;
case T_LONG:
evpexpandq(dst, mask, src, merge, vec_enc);
break;
case T_DOUBLE:
evexpandpd(dst, mask, src, merge, vec_enc);
break;
default:
fatal("Unsupported type %s", type2name(bt));
break;
}
}
}
#endif
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
@ -4559,6 +4626,34 @@ void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, in
}
}
void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
int lane_size = type2aelembytes(bt);
bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
if ((is_LP64 || lane_size < 8) &&
((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
(is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
movptr(rtmp, imm32);
switch(lane_size) {
case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
fatal("Unsupported lane size %d", lane_size);
break;
}
} else {
movptr(rtmp, imm32);
LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
switch(lane_size) {
case 1 : vpbroadcastb(dst, dst, vec_enc); break;
case 2 : vpbroadcastw(dst, dst, vec_enc); break;
case 4 : vpbroadcastd(dst, dst, vec_enc); break;
case 8 : vpbroadcastq(dst, dst, vec_enc); break;
fatal("Unsupported lane size %d", lane_size);
break;
}
}
}
//
// Following is the lookup-table-based popcount computation algorithm:
@ -4589,62 +4684,98 @@ void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, in
// f. Perform step e. for upper 128bit vector lane.
// g. Pack the bitset count of quadwords back to double word.
// h. Unpacking and packing operations are not needed for 64bit vector lane.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp, int vec_enc) {
assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
vpsrlw(dst, src, 4, vec_enc);
vpand(dst, dst, xtmp1, vec_enc);
vpand(xtmp1, src, xtmp1, vec_enc);
vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), rtmp, vec_enc);
vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
vpshufb(dst, xtmp2, dst, vec_enc);
vpaddb(dst, dst, xtmp1, vec_enc);
}
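
For reference (illustrative only, not part of the patch), a scalar model of the vpshufb-based byte popcount above: the 16-entry table holds the bit count of each 4-bit value, and a byte's popcount is the sum of the counts of its two nibbles.

#include <cstdint>

static uint8_t popcount_byte_model(uint8_t b) {
  static const uint8_t lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
  return uint8_t(lut[b & 0x0F] + lut[b >> 4]);   // lo-nibble count + hi-nibble count
}
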
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
int vec_enc) {
if (VM_Version::supports_avx512_vpopcntdq()) {
vpopcntd(dst, src, vec_enc);
} else {
assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
movl(rtmp, 0x0F0F0F0F);
movdl(xtmp1, rtmp);
vpbroadcastd(xtmp1, xtmp1, vec_enc);
if (Assembler::AVX_512bit == vec_enc) {
evmovdqul(xtmp2, k0, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), false, vec_enc, rtmp);
} else {
vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), rtmp);
}
vpand(xtmp3, src, xtmp1, vec_enc);
vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);
vpsrlw(dst, src, 4, vec_enc);
vpand(dst, dst, xtmp1, vec_enc);
vpshufb(dst, xtmp2, dst, vec_enc);
vpaddb(xtmp3, dst, xtmp3, vec_enc);
vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
vpunpckhdq(dst, xtmp3, xtmp1, vec_enc);
vpsadbw(dst, dst, xtmp1, vec_enc);
vpunpckldq(xtmp2, xtmp3, xtmp1, vec_enc);
vpsadbw(xtmp2, xtmp2, xtmp1, vec_enc);
vpackuswb(dst, xtmp2, dst, vec_enc);
}
XMMRegister xtmp2, Register rtmp, int vec_enc) {
vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
// Following code is as per steps e,f,g and h of above algorithm.
vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
vpsadbw(dst, dst, xtmp2, vec_enc);
vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
vpackuswb(dst, xtmp1, dst, vec_enc);
}
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp, int vec_enc) {
vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
// Add the popcount of upper and lower bytes of word.
vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
vpsrlw(dst, xtmp1, 8, vec_enc);
vpand(xtmp1, xtmp1, xtmp2, vec_enc);
vpaddw(dst, dst, xtmp1, vec_enc);
}
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
int vec_enc) {
if (VM_Version::supports_avx512_vpopcntdq()) {
vpopcntq(dst, src, vec_enc);
} else if (vec_enc == Assembler::AVX_512bit) {
assert(VM_Version::supports_avx512bw(), "");
movl(rtmp, 0x0F0F0F0F);
movdl(xtmp1, rtmp);
vpbroadcastd(xtmp1, xtmp1, vec_enc);
evmovdqul(xtmp2, k0, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), true, vec_enc, rtmp);
vpandq(xtmp3, src, xtmp1, vec_enc);
vpshufb(xtmp3, xtmp2, xtmp3, vec_enc);
vpsrlw(dst, src, 4, vec_enc);
vpandq(dst, dst, xtmp1, vec_enc);
vpshufb(dst, xtmp2, dst, vec_enc);
vpaddb(xtmp3, dst, xtmp3, vec_enc);
vpxorq(xtmp1, xtmp1, xtmp1, vec_enc);
vpsadbw(dst, xtmp3, xtmp1, vec_enc);
} else {
// We do not see any performance benefit of running
// above instruction sequence on 256 bit vector which
// can operate over maximum 4 long elements.
ShouldNotReachHere();
XMMRegister xtmp2, Register rtmp, int vec_enc) {
vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
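
The new vector_popcount_long computes per-byte counts and then uses vpsadbw against a zero vector to sum the eight byte counts of each quadword. A rough scalar model of that summing step (illustrative only, hypothetical name):

#include <cstdint>

// Given the per-byte popcounts of a quadword (one count per byte), summing the
// eight byte counts yields the popcount of the whole 64-bit lane.
static uint64_t sum_byte_counts_model(uint64_t per_byte_counts) {
  uint64_t sum = 0;
  for (int i = 0; i < 8; i++) {
    sum += (per_byte_counts >> (8 * i)) & 0xFF;  // add each byte's count
  }
  return sum;
}
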
void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp, int vec_enc) {
switch(bt) {
case T_LONG:
vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
break;
case T_INT:
vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
break;
case T_CHAR:
case T_SHORT:
vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
break;
case T_BYTE:
case T_BOOLEAN:
vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
break;
default:
fatal("Unsupported type %s", type2name(bt));
break;
}
}
void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
KRegister mask, bool merge, int vec_enc) {
assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
switch(bt) {
case T_LONG:
assert(VM_Version::supports_avx512_vpopcntdq(), "");
evpopcntq(dst, mask, src, merge, vec_enc);
break;
case T_INT:
assert(VM_Version::supports_avx512_vpopcntdq(), "");
evpopcntd(dst, mask, src, merge, vec_enc);
break;
case T_CHAR:
case T_SHORT:
assert(VM_Version::supports_avx512_bitalg(), "");
evpopcntw(dst, mask, src, merge, vec_enc);
break;
case T_BYTE:
case T_BOOLEAN:
assert(VM_Version::supports_avx512_bitalg(), "");
evpopcntb(dst, mask, src, merge, vec_enc);
break;
default:
fatal("Unsupported type %s", type2name(bt));
break;
}
evpmovqd(dst, dst, vec_enc);
}
#ifndef _LP64
@ -4655,6 +4786,374 @@ void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src,
}
#endif
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte-level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reversed bit sequence
// corresponding to a 4-bit value. Thus the reversed bit sequence for a byte
// is obtained by swapping the reversed bit sequences of its upper and lower
// nibbles.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp, int vec_enc) {
if (VM_Version::supports_avx512vlbw()) {
// Get the reverse bit sequence of lower nibble of each byte.
vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), rtmp, vec_enc);
vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
vpandq(dst, xtmp2, src, vec_enc);
vpshufb(dst, xtmp1, dst, vec_enc);
vpsllq(dst, dst, 4, vec_enc);
// Get the reverse bit sequence of upper nibble of each byte.
vpandn(xtmp2, xtmp2, src, vec_enc);
vpsrlq(xtmp2, xtmp2, 4, vec_enc);
vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
// OR the left-shifted reversed bit sequence of the lower nibble with the right-shifted
// reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
vporq(xtmp2, dst, xtmp2, vec_enc);
vector_reverse_byte(bt, dst, xtmp2, rtmp, vec_enc);
} else if(vec_enc == Assembler::AVX_512bit) {
// Shift based bit reversal.
assert(bt == T_LONG || bt == T_INT, "");
// Swap lower and upper nibble of each byte.
vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
// Swap two least and most significant bits of each nibble.
vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
// Swap adjacent pair of bits.
evmovdqul(xtmp1, k0, dst, true, vec_enc);
vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
evmovdqul(xtmp1, k0, dst, true, vec_enc);
vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
} else {
vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), rtmp, vec_enc);
vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
// Get the reverse bit sequence of lower nibble of each byte.
vpand(dst, xtmp2, src, vec_enc);
vpshufb(dst, xtmp1, dst, vec_enc);
vpsllq(dst, dst, 4, vec_enc);
// Get the reverse bit sequence of upper nibble of each byte.
vpandn(xtmp2, xtmp2, src, vec_enc);
vpsrlq(xtmp2, xtmp2, 4, vec_enc);
vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
// OR the left-shifted reversed bit sequence of the lower nibble with the right-shifted
// reversed bit sequence of the upper nibble to obtain the reversed bit sequence of each byte.
vpor(xtmp2, dst, xtmp2, vec_enc);
vector_reverse_byte(bt, dst, xtmp2, rtmp, vec_enc);
}
}
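
For reference, a scalar model of the nibble-LUT byte reversal that the sequence above vectorizes (multi-byte types then get the byte-level reversal in vector_reverse_byte). Illustrative only; the name is made up.

#include <cstdint>

static uint8_t reverse_bits_byte_model(uint8_t b) {
  static const uint8_t rev4[16] = {0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE,
                                   0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF};
  // Reversed low nibble becomes the high nibble and vice versa.
  return uint8_t((rev4[b & 0x0F] << 4) | rev4[b >> 4]);
}
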
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src,
XMMRegister xtmp, AddressLiteral mask, Register rtmp, int vec_enc) {
// Galois field instruction based bit reversal based on following algorithm.
// http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
assert(VM_Version::supports_gfni(), "");
vpbroadcastq(xtmp, mask, vec_enc, rtmp);
vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
vector_reverse_byte(bt, dst, xtmp, rtmp, vec_enc);
}
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
XMMRegister xtmp1, Register rtmp, int vec_enc) {
vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
vpandq(dst, xtmp1, src, vec_enc);
vpsllq(dst, dst, nbits, vec_enc);
vpandn(xtmp1, xtmp1, src, vec_enc);
vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
vporq(dst, dst, xtmp1, vec_enc);
}
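
The helper above implements a classic field-swap idiom; a one-line scalar equivalent, shown for reference only:

#include <cstdint>

// Swap adjacent n-bit fields selected by a repeating bitmask, e.g. nbits = 4
// with mask 0x0F0F0F0F... swaps the two nibbles of every byte.
static uint64_t swap_nbits_model(uint64_t x, int nbits, uint64_t mask) {
  return ((x & mask) << nbits) | ((x & ~mask) >> nbits);
}
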
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp, int vec_enc) {
// Shift based bit reversal.
assert(VM_Version::supports_evex(), "");
switch(bt) {
case T_LONG:
// Swap upper and lower double word of each quad word.
evprorq(xtmp1, k0, src, 32, true, vec_enc);
evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
break;
case T_INT:
// Swap upper and lower word of each double word.
evprord(xtmp1, k0, src, 16, true, vec_enc);
vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
break;
case T_SHORT:
// Swap upper and lower byte of each word.
vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
break;
case T_BYTE:
evmovdquq(dst, k0, src, true, vec_enc);
break;
default:
fatal("Unsupported type %s", type2name(bt));
break;
}
}
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, Register rtmp, int vec_enc) {
if (bt == T_BYTE) {
if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
evmovdquq(dst, k0, src, true, vec_enc);
} else {
vmovdqu(dst, src);
}
return;
}
// Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
// pre-computed shuffle indices.
switch(bt) {
case T_LONG:
vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), rtmp, vec_enc);
break;
case T_INT:
vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), rtmp, vec_enc);
break;
case T_SHORT:
vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), rtmp, vec_enc);
break;
default:
fatal("Unsupported type %s", type2name(bt));
break;
}
vpshufb(dst, src, dst, vec_enc);
}
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
assert(is_integral_type(bt), "");
assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
assert(VM_Version::supports_avx512cd(), "");
switch(bt) {
case T_LONG:
evplzcntq(dst, ktmp, src, merge, vec_enc);
break;
case T_INT:
evplzcntd(dst, ktmp, src, merge, vec_enc);
break;
case T_SHORT:
vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
vpunpckhwd(dst, xtmp1, src, vec_enc);
evplzcntd(dst, ktmp, dst, merge, vec_enc);
vpackusdw(dst, xtmp2, dst, vec_enc);
break;
case T_BYTE:
// T1 = Compute leading zero counts of 4 LSB bits of each byte by
// accessing the lookup table.
// T2 = Compute leading zero counts of 4 MSB bits of each byte by
// accessing the lookup table.
// Add T1 to T2 if 4 MSB bits of byte are all zeros.
assert(VM_Version::supports_avx512bw(), "");
evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
vpand(xtmp2, dst, src, vec_enc);
vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
vpsrlw(xtmp3, src, 4, vec_enc);
vpand(xtmp3, dst, xtmp3, vec_enc);
vpshufb(dst, xtmp1, xtmp3, vec_enc);
vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
break;
default:
fatal("Unsupported type %s", type2name(bt));
break;
}
}
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
// T1 = Compute leading zero counts of 4 LSB bits of each byte by
// accessing the lookup table.
vpand(dst, xtmp2, src, vec_enc);
vpshufb(dst, xtmp1, dst, vec_enc);
// T2 = Compute leading zero counts of 4 MSB bits of each byte by
// accessing the lookup table.
vpsrlw(xtmp3, src, 4, vec_enc);
vpand(xtmp3, xtmp2, xtmp3, vec_enc);
vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
// Add T1 to T2 if 4 MSB bits of byte are all zeros.
vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
vpaddb(dst, dst, xtmp2, vec_enc);
vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
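
A rough scalar model (not in the patch, hypothetical name) of the per-byte CLZ the sequence above computes, using the same nibble table and the high-nibble-zero blend:

#include <cstdint>

static uint8_t clz_byte_model(uint8_t b) {
  // Table maps a nibble to its leading zero count.
  static const uint8_t lut[16] = {4,3,2,2, 1,1,1,1, 0,0,0,0, 0,0,0,0};
  uint8_t hi = uint8_t(b >> 4), lo = uint8_t(b & 0x0F);
  // If the high nibble is zero, its count (4) is added to the low nibble's count;
  // otherwise the high nibble's count alone is the byte's CLZ.
  return (hi == 0) ? uint8_t(lut[hi] + lut[lo]) : lut[hi];
}
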
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
// Add zero counts of lower byte and upper byte of a word if
// upper byte holds a zero value.
vpsrlw(xtmp3, src, 8, vec_enc);
// xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
vpsllw(xtmp2, dst, 8, vec_enc);
vpaddw(xtmp2, xtmp2, dst, vec_enc);
vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
vpsrlw(dst, dst, 8, vec_enc);
}
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
// Since the IEEE 754 floating point format represents the mantissa in a normalized
// 1.x form, the biased exponent can be used to compute the leading zero count as
// per the following formula:
// LZCNT = 32 - (biased_exp - 127)
// Special handling has been introduced for zero, max_int and negative source values.
// Broadcast 0xFF
vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
vpsrld(xtmp1, xtmp1, 24, vec_enc);
// Extract biased exponent.
vcvtdq2ps(dst, src, vec_enc);
vpsrld(dst, dst, 23, vec_enc);
vpand(dst, dst, xtmp1, vec_enc);
// Broadcast 127.
vpsrld(xtmp1, xtmp1, 1, vec_enc);
// Exponent = biased_exp - 127
vpsubd(dst, dst, xtmp1, vec_enc);
// Exponent = Exponent + 1
vpsrld(xtmp3, xtmp1, 6, vec_enc);
vpaddd(dst, dst, xtmp3, vec_enc);
// Replace -ve exponent with zero, exponent is -ve when src
// lane contains a zero value.
vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
vblendvps(dst, dst, xtmp2, dst, vec_enc);
// Rematerialize broadcast 32.
vpslld(xtmp1, xtmp3, 5, vec_enc);
// Exponent is 32 if corresponding source lane contains max_int value.
vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
// LZCNT = 32 - exponent
vpsubd(dst, xtmp1, dst, vec_enc);
// Replace LZCNT with a value 1 if corresponding source lane
// contains max_int value.
vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
// Replace biased_exp with 0 if source lane value is less than zero.
vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
vblendvps(dst, dst, xtmp2, src, vec_enc);
}
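
A rough scalar sketch of the core identity only, with a hypothetical name; it ignores the rounding of the int-to-float conversion near powers of two, which is one reason the vector code above blends in fixes for zero, negative and max_int lanes:

#include <cstdint>
#include <cstring>

static int clz_int_sketch(int32_t x) {
  if (x <= 0) return (x == 0) ? 32 : 0;          // special cases handled by the blends above
  float f = (float)x;                            // biased exponent encodes floor(log2(x))
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  int biased_exp = int((bits >> 23) & 0xFF);
  return 31 - (biased_exp - 127);                // i.e. 32 - (biased_exp - 127 + 1)
}
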
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
// Add zero counts of lower word and upper word of a double word if
// upper word holds a zero value.
vpsrld(xtmp3, src, 16, vec_enc);
// xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
vpslld(xtmp2, dst, 16, vec_enc);
vpaddd(xtmp2, xtmp2, dst, vec_enc);
vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
vpsrld(dst, dst, 16, vec_enc);
// Add zero counts of lower doubleword and upper doubleword of a
// quadword if upper doubleword holds a zero value.
vpsrlq(xtmp3, src, 32, vec_enc);
vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
vpsllq(xtmp2, dst, 32, vec_enc);
vpaddq(xtmp2, xtmp2, dst, vec_enc);
vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
vpsrlq(dst, dst, 32, vec_enc);
}
void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
Register rtmp, int vec_enc) {
assert(is_integral_type(bt), "unexpected type");
assert(vec_enc < Assembler::AVX_512bit, "");
switch(bt) {
case T_LONG:
vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
break;
case T_INT:
vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
break;
case T_SHORT:
vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
break;
case T_BYTE:
vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
break;
default:
fatal("Unsupported type %s", type2name(bt));
break;
}
}
void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
switch(bt) {
case T_BYTE:
vpsubb(dst, src1, src2, vec_enc);
break;
case T_SHORT:
vpsubw(dst, src1, src2, vec_enc);
break;
case T_INT:
vpsubd(dst, src1, src2, vec_enc);
break;
case T_LONG:
vpsubq(dst, src1, src2, vec_enc);
break;
default:
fatal("Unsupported type %s", type2name(bt));
break;
}
}
// Trailing zero count computation is based on the leading zero count operation, as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
assert(is_integral_type(bt), "");
// xtmp = -1
vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
// xtmp = xtmp + src
vpadd(bt, xtmp4, xtmp4, src, vec_enc);
// xtmp = xtmp & ~src
vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
vpsub(bt, dst, xtmp4, dst, vec_enc);
}
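
A scalar model of the identity named in the comment above (illustrative only, naive CLZ loop): (x - 1) & ~x is a run of ones strictly below the lowest set bit of x (all ones when x == 0), so its leading zero count determines the trailing zero count of x.

#include <cstdint>

static int ctz32_model(uint32_t x) {
  uint32_t low_run = (x - 1) & ~x;
  int clz = 32;
  for (uint32_t v = low_run; v != 0; v >>= 1) {
    clz--;                                       // naive CLZ of low_run
  }
  return 32 - clz;                               // CTZ = WIDTH - CLZ(low_run)
}
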
// Trailing zero count computation for AVX2 targets is based on the popcount operation, as per the following equation:
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
assert(is_integral_type(bt), "");
// xtmp = 0
vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
// xtmp = 0 - src
vpsub(bt, xtmp3, xtmp3, src, vec_enc);
// xtmp = xtmp | src
vpor(xtmp3, xtmp3, src, vec_enc);
vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
vpsub(bt, dst, xtmp1, dst, vec_enc);
}
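
Similarly, a scalar model of the AVX2 identity (illustrative only): x | -x sets every bit from the lowest set bit of x up to the MSB (and is zero for x == 0), so its popcount is WIDTH - CTZ(x).

#include <cstdint>

static int ctz32_popcount_model(uint32_t x) {
  uint32_t m = x | (0u - x);
  int pc = 0;
  for (uint32_t v = m; v != 0; v &= v - 1) {
    pc++;                                        // Kernighan popcount
  }
  return 32 - pc;                                // CTZ = WIDTH - POPC(x | -x)
}
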
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
Label done;
Label neg_divisor_fastpath;
@ -4817,4 +5316,3 @@ void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, R
bind(done);
}
#endif


@ -88,6 +88,11 @@ public:
XMMRegister zero, XMMRegister one,
Register scratch);
void vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
bool merge, BasicType bt, int vec_enc);
void vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, Register rtmp2, int mask_len);
void vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len);
void vextendbw(bool sign, XMMRegister dst, XMMRegister src);
void vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len);
@ -137,7 +142,6 @@ public:
#ifdef _LP64
void vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc);
#endif
void vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc);
// blend
void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch = rscratch1);
@ -341,34 +345,89 @@ public:
AddressLiteral new_mxcsr, Register scratch, int vec_enc);
#endif
void udivI(Register rax, Register divisor, Register rdx);
void umodI(Register rax, Register divisor, Register rdx);
void udivmodI(Register rax, Register divisor, Register rdx, Register tmp);
#ifdef _LP64
void udivL(Register rax, Register divisor, Register rdx);
void umodL(Register rax, Register divisor, Register rdx);
void udivmodL(Register rax, Register divisor, Register rdx, Register tmp);
#endif
void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
bool merge, BasicType bt, int vlen_enc);
void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
bool merge, BasicType bt, int vlen_enc);
void vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp, int vec_enc);
void vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp,
AddressLiteral mask, Register rtmp, int vec_enc);
void vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, Register rtmp, int vec_enc);
void vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp, int vec_enc);
void vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp, int vec_enc);
void vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp, int vec_enc);
void vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp, int vec_enc);
void vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp, int vec_enc);
void vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
KRegister mask, bool merge, int vec_enc);
void vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc);
void vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp, int vec_enc);
void vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
KRegister ktmp, Register rtmp, bool merge, int vec_enc);
void vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc);
void vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc);
void vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc);
void vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc);
void vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc);
void vpadd(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc);
void vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc);
void vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, KRegister ktmp,
Register rtmp, int vec_enc);
void vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
XMMRegister xtmp1, Register rtmp, int vec_enc);
void vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc);
void vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
XMMRegister xtmp1, int vec_enc);
void vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
KRegister ktmp1, int vec_enc);
void udivI(Register rax, Register divisor, Register rdx);
void umodI(Register rax, Register divisor, Register rdx);
void udivmodI(Register rax, Register divisor, Register rdx, Register tmp);
#ifdef _LP64
void udivL(Register rax, Register divisor, Register rdx);
void umodL(Register rax, Register divisor, Register rdx);
void udivmodL(Register rax, Register divisor, Register rdx, Register tmp);
#endif
void vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
int vec_enc);
void vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
int vec_enc);
#endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP


@ -2577,8 +2577,9 @@ void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scrat
}
void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg, int vector_len) {
assert(vector_len <= AVX_256bit, "AVX2 vector length");
if (vector_len == AVX_256bit) {
if (vector_len == AVX_512bit) {
evmovdquq(dst, src, AVX_512bit, scratch_reg);
} else if (vector_len == AVX_256bit) {
vmovdqu(dst, src, scratch_reg);
} else {
movdqu(dst, src, scratch_reg);
@ -3229,6 +3230,15 @@ void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_l
Assembler::vpbroadcastw(dst, src, vector_len);
}
void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
if (reachable(src)) {
Assembler::vpbroadcastq(dst, as_Address(src), vector_len);
} else {
lea(rscratch, src);
Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len);
}
}
void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
if (reachable(src)) {
Assembler::vbroadcastsd(dst, as_Address(src), vector_len);


@ -1347,6 +1347,11 @@ public:
using Assembler::vbroadcastsd;
void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1);
void vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1);
void vpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpbroadcastq(dst, src, vector_len); }
void vpbroadcastq(XMMRegister dst, Address src, int vector_len) { Assembler::vpbroadcastq(dst, src, vector_len); }
void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);


@ -186,15 +186,29 @@
// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
switch(vopc) {
default: return 0;
case Op_PopCountVI: return VM_Version::supports_avx512_vpopcntdq() ? 0 : 50;
case Op_PopCountVL: return VM_Version::supports_avx512_vpopcntdq() ? 0 : 40;
default:
return 0;
case Op_CountTrailingZerosV:
case Op_CountLeadingZerosV:
return VM_Version::supports_avx512cd() && (ety == T_INT || ety == T_LONG) ? 0 : 40;
case Op_PopCountVI:
if (is_subword_type(ety)) {
return VM_Version::supports_avx512_bitalg() ? 0 : 50;
} else {
assert(ety == T_INT, "sanity"); // for documentation purposes
return VM_Version::supports_avx512_vpopcntdq() ? 0 : 50;
}
case Op_PopCountVL:
return VM_Version::supports_avx512_vpopcntdq() ? 0 : 40;
case Op_ReverseV:
return VM_Version::supports_gfni() ? 0 : 30;
case Op_RoundVF: // fall through
case Op_RoundVD: {
return 30;
}
}
}
// Returns pre-selection estimated size of a scalar operation.
static int scalar_op_pre_select_sz_estimate(int vopc, BasicType ety) {
switch(vopc) {


@ -588,6 +588,30 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
address generate_count_leading_zeros_lut(const char *stub_name) {
__ align64();
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
__ emit_data(0x02020304, relocInfo::none, 0);
__ emit_data(0x01010101, relocInfo::none, 0);
__ emit_data(0x00000000, relocInfo::none, 0);
__ emit_data(0x00000000, relocInfo::none, 0);
__ emit_data(0x02020304, relocInfo::none, 0);
__ emit_data(0x01010101, relocInfo::none, 0);
__ emit_data(0x00000000, relocInfo::none, 0);
__ emit_data(0x00000000, relocInfo::none, 0);
__ emit_data(0x02020304, relocInfo::none, 0);
__ emit_data(0x01010101, relocInfo::none, 0);
__ emit_data(0x00000000, relocInfo::none, 0);
__ emit_data(0x00000000, relocInfo::none, 0);
__ emit_data(0x02020304, relocInfo::none, 0);
__ emit_data(0x01010101, relocInfo::none, 0);
__ emit_data(0x00000000, relocInfo::none, 0);
__ emit_data(0x00000000, relocInfo::none, 0);
return start;
}
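
The 16-byte pattern emitted above (repeated to fill 64 bytes) maps a nibble value to its leading zero count, matching bytes 04 03 02 02 01 01 01 01 00 ... 00 in memory. A sketch of how such a table could be generated for verification, shown for illustration only and not part of the stub:

#include <cstdint>

static void build_clz_nibble_lut(uint8_t lut[16]) {
  for (int i = 0; i < 16; i++) {
    int clz = 4;
    for (int v = i; v != 0; v >>= 1) {
      clz--;                        // 4-bit leading zero count of i
    }
    lut[i] = (uint8_t)clz;          // {4,3,2,2,1,1,1,1,0,...,0}
  }
}
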
address generate_popcount_avx_lut(const char *stub_name) {
__ align64();
StubCodeMark mark(this, "StubRoutines", stub_name);
@ -635,6 +659,98 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
address generate_vector_reverse_bit_lut(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
__ emit_data(0x0C040800, relocInfo::none, 0);
__ emit_data(0x0E060A02, relocInfo::none, 0);
__ emit_data(0x0D050901, relocInfo::none, 0);
__ emit_data(0x0F070B03, relocInfo::none, 0);
__ emit_data(0x0C040800, relocInfo::none, 0);
__ emit_data(0x0E060A02, relocInfo::none, 0);
__ emit_data(0x0D050901, relocInfo::none, 0);
__ emit_data(0x0F070B03, relocInfo::none, 0);
__ emit_data(0x0C040800, relocInfo::none, 0);
__ emit_data(0x0E060A02, relocInfo::none, 0);
__ emit_data(0x0D050901, relocInfo::none, 0);
__ emit_data(0x0F070B03, relocInfo::none, 0);
__ emit_data(0x0C040800, relocInfo::none, 0);
__ emit_data(0x0E060A02, relocInfo::none, 0);
__ emit_data(0x0D050901, relocInfo::none, 0);
__ emit_data(0x0F070B03, relocInfo::none, 0);
return start;
}
address generate_vector_reverse_byte_perm_mask_long(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
__ emit_data(0x04050607, relocInfo::none, 0);
__ emit_data(0x00010203, relocInfo::none, 0);
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
__ emit_data(0x08090A0B, relocInfo::none, 0);
__ emit_data(0x04050607, relocInfo::none, 0);
__ emit_data(0x00010203, relocInfo::none, 0);
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
__ emit_data(0x08090A0B, relocInfo::none, 0);
__ emit_data(0x04050607, relocInfo::none, 0);
__ emit_data(0x00010203, relocInfo::none, 0);
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
__ emit_data(0x08090A0B, relocInfo::none, 0);
__ emit_data(0x04050607, relocInfo::none, 0);
__ emit_data(0x00010203, relocInfo::none, 0);
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
__ emit_data(0x08090A0B, relocInfo::none, 0);
return start;
}
address generate_vector_reverse_byte_perm_mask_int(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
__ emit_data(0x00010203, relocInfo::none, 0);
__ emit_data(0x04050607, relocInfo::none, 0);
__ emit_data(0x08090A0B, relocInfo::none, 0);
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
__ emit_data(0x00010203, relocInfo::none, 0);
__ emit_data(0x04050607, relocInfo::none, 0);
__ emit_data(0x08090A0B, relocInfo::none, 0);
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
__ emit_data(0x00010203, relocInfo::none, 0);
__ emit_data(0x04050607, relocInfo::none, 0);
__ emit_data(0x08090A0B, relocInfo::none, 0);
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
__ emit_data(0x00010203, relocInfo::none, 0);
__ emit_data(0x04050607, relocInfo::none, 0);
__ emit_data(0x08090A0B, relocInfo::none, 0);
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
return start;
}
address generate_vector_reverse_byte_perm_mask_short(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
__ emit_data(0x02030001, relocInfo::none, 0);
__ emit_data(0x06070405, relocInfo::none, 0);
__ emit_data(0x0A0B0809, relocInfo::none, 0);
__ emit_data(0x0E0F0C0D, relocInfo::none, 0);
__ emit_data(0x02030001, relocInfo::none, 0);
__ emit_data(0x06070405, relocInfo::none, 0);
__ emit_data(0x0A0B0809, relocInfo::none, 0);
__ emit_data(0x0E0F0C0D, relocInfo::none, 0);
__ emit_data(0x02030001, relocInfo::none, 0);
__ emit_data(0x06070405, relocInfo::none, 0);
__ emit_data(0x0A0B0809, relocInfo::none, 0);
__ emit_data(0x0E0F0C0D, relocInfo::none, 0);
__ emit_data(0x02030001, relocInfo::none, 0);
__ emit_data(0x06070405, relocInfo::none, 0);
__ emit_data(0x0A0B0809, relocInfo::none, 0);
__ emit_data(0x0E0F0C0D, relocInfo::none, 0);
return start;
}
address generate_vector_byte_shuffle_mask(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
@ -4090,8 +4206,13 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFF);
StubRoutines::x86::_vector_int_mask_cmp_bits = generate_vector_mask("vector_int_mask_cmp_bits", 0x00000001);
StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
StubRoutines::x86::_vector_count_leading_zeros_lut = generate_count_leading_zeros_lut("count_leading_zeros_lut");
StubRoutines::x86::_vector_reverse_bit_lut = generate_vector_reverse_bit_lut("reverse_bit_lut");
StubRoutines::x86::_vector_reverse_byte_perm_mask_long = generate_vector_reverse_byte_perm_mask_long("perm_mask_long");
StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int");
StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short");
if (UsePopCountInstruction && VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hackers' Delight.
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
}


@ -807,6 +807,21 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
address generate_count_leading_zeros_lut(const char *stub_name) {
__ align64();
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
__ emit_data64(0x0101010102020304, relocInfo::none);
__ emit_data64(0x0000000000000000, relocInfo::none);
__ emit_data64(0x0101010102020304, relocInfo::none);
__ emit_data64(0x0000000000000000, relocInfo::none);
__ emit_data64(0x0101010102020304, relocInfo::none);
__ emit_data64(0x0000000000000000, relocInfo::none);
__ emit_data64(0x0101010102020304, relocInfo::none);
__ emit_data64(0x0000000000000000, relocInfo::none);
return start;
}
address generate_popcount_avx_lut(const char *stub_name) {
__ align64();
StubCodeMark mark(this, "StubRoutines", stub_name);
@ -837,6 +852,66 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
address generate_vector_reverse_bit_lut(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
__ emit_data64(0x0E060A020C040800, relocInfo::none);
__ emit_data64(0x0F070B030D050901, relocInfo::none);
__ emit_data64(0x0E060A020C040800, relocInfo::none);
__ emit_data64(0x0F070B030D050901, relocInfo::none);
__ emit_data64(0x0E060A020C040800, relocInfo::none);
__ emit_data64(0x0F070B030D050901, relocInfo::none);
__ emit_data64(0x0E060A020C040800, relocInfo::none);
__ emit_data64(0x0F070B030D050901, relocInfo::none);
return start;
}
address generate_vector_reverse_byte_perm_mask_long(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
__ emit_data64(0x0001020304050607, relocInfo::none);
__ emit_data64(0x08090A0B0C0D0E0F, relocInfo::none);
__ emit_data64(0x0001020304050607, relocInfo::none);
__ emit_data64(0x08090A0B0C0D0E0F, relocInfo::none);
__ emit_data64(0x0001020304050607, relocInfo::none);
__ emit_data64(0x08090A0B0C0D0E0F, relocInfo::none);
__ emit_data64(0x0001020304050607, relocInfo::none);
__ emit_data64(0x08090A0B0C0D0E0F, relocInfo::none);
return start;
}
address generate_vector_reverse_byte_perm_mask_int(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
__ emit_data64(0x0405060700010203, relocInfo::none);
__ emit_data64(0x0C0D0E0F08090A0B, relocInfo::none);
__ emit_data64(0x0405060700010203, relocInfo::none);
__ emit_data64(0x0C0D0E0F08090A0B, relocInfo::none);
__ emit_data64(0x0405060700010203, relocInfo::none);
__ emit_data64(0x0C0D0E0F08090A0B, relocInfo::none);
__ emit_data64(0x0405060700010203, relocInfo::none);
__ emit_data64(0x0C0D0E0F08090A0B, relocInfo::none);
return start;
}
address generate_vector_reverse_byte_perm_mask_short(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
__ emit_data64(0x0607040502030001, relocInfo::none);
__ emit_data64(0x0E0F0C0D0A0B0809, relocInfo::none);
__ emit_data64(0x0607040502030001, relocInfo::none);
__ emit_data64(0x0E0F0C0D0A0B0809, relocInfo::none);
__ emit_data64(0x0607040502030001, relocInfo::none);
__ emit_data64(0x0E0F0C0D0A0B0809, relocInfo::none);
__ emit_data64(0x0607040502030001, relocInfo::none);
__ emit_data64(0x0E0F0C0D0A0B0809, relocInfo::none);
return start;
}
address generate_vector_byte_shuffle_mask(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
@ -7955,8 +8030,13 @@ address generate_avx_ghash_processBlocks() {
StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask("vector_long_shuffle_mask", 0x0000000100000000);
StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
StubRoutines::x86::_vector_count_leading_zeros_lut = generate_count_leading_zeros_lut("count_leading_zeros_lut");
StubRoutines::x86::_vector_reverse_bit_lut = generate_vector_reverse_bit_lut("reverse_bit_lut");
StubRoutines::x86::_vector_reverse_byte_perm_mask_long = generate_vector_reverse_byte_perm_mask_long("perm_mask_long");
StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int");
StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short");
if (UsePopCountInstruction && VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
// LUT implementation influenced by the counting-1s algorithm from section 5-1 of Hacker's Delight.
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
}
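
The popcount_lut set up here corresponds to the classic nibble lookup from Hacker's Delight: a byte's bit count is LUT[low nibble] + LUT[high nibble], which maps well onto a byte shuffle. A scalar Java sketch of the idea, for illustration only:

public class NibblePopCount {
    // 16-entry table: LUT[i] == number of set bits in the nibble i
    static final byte[] LUT = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};

    static int popcountByte(int b) {
        return LUT[b & 0x0F] + LUT[(b >>> 4) & 0x0F];
    }

    public static void main(String[] args) {
        for (int b = 0; b < 256; b++) {
            assert popcountByte(b) == Integer.bitCount(b);
        }
        System.out.println(popcountByte(0xA7)); // 5
    }
}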

View file

@ -59,7 +59,12 @@ address StubRoutines::x86::_vector_double_sign_flip = NULL;
address StubRoutines::x86::_vector_byte_perm_mask = NULL;
address StubRoutines::x86::_vector_long_sign_mask = NULL;
address StubRoutines::x86::_vector_iota_indices = NULL;
address StubRoutines::x86::_vector_reverse_bit_lut = NULL;
address StubRoutines::x86::_vector_reverse_byte_perm_mask_long = NULL;
address StubRoutines::x86::_vector_reverse_byte_perm_mask_int = NULL;
address StubRoutines::x86::_vector_reverse_byte_perm_mask_short = NULL;
address StubRoutines::x86::_vector_popcount_lut = NULL;
address StubRoutines::x86::_vector_count_leading_zeros_lut = NULL;
address StubRoutines::x86::_vector_32_bit_mask = NULL;
address StubRoutines::x86::_vector_64_bit_mask = NULL;
#ifdef _LP64

View file

@ -178,6 +178,11 @@ class x86 {
static address _vector_long_shuffle_mask;
static address _vector_iota_indices;
static address _vector_popcount_lut;
static address _vector_count_leading_zeros_lut;
static address _vector_reverse_bit_lut;
static address _vector_reverse_byte_perm_mask_long;
static address _vector_reverse_byte_perm_mask_int;
static address _vector_reverse_byte_perm_mask_short;
#ifdef _LP64
static juint _k256_W[];
static address _k256_W_adr;
@ -341,6 +346,26 @@ class x86 {
return _vector_iota_indices;
}
static address vector_count_leading_zeros_lut() {
return _vector_count_leading_zeros_lut;
}
static address vector_reverse_bit_lut() {
return _vector_reverse_bit_lut;
}
static address vector_reverse_byte_perm_mask_long() {
return _vector_reverse_byte_perm_mask_long;
}
static address vector_reverse_byte_perm_mask_int() {
return _vector_reverse_byte_perm_mask_int;
}
static address vector_reverse_byte_perm_mask_short() {
return _vector_reverse_byte_perm_mask_short;
}
static address vector_popcount_lut() {
return _vector_popcount_lut;
}

View file

@ -922,6 +922,7 @@ void VM_Version::get_processor_features() {
_features &= ~CPU_AVX512_VNNI;
_features &= ~CPU_AVX512_VBMI;
_features &= ~CPU_AVX512_VBMI2;
_features &= ~CPU_AVX512_BITALG;
}
if (UseAVX < 2)
@ -951,6 +952,8 @@ void VM_Version::get_processor_features() {
_features &= ~CPU_AVX512_VBMI2;
_features &= ~CPU_CLWB;
_features &= ~CPU_FLUSHOPT;
_features &= ~CPU_GFNI;
_features &= ~CPU_AVX512_BITALG;
}
}

View file

@ -370,10 +370,11 @@ protected:
decl(AVX512_VBMI, "avx512_vbmi", 45) /* Vector BMI instructions */ \
decl(HV, "hv", 46) /* Hypervisor instructions */ \
decl(SERIALIZE, "serialize", 47) /* CPU SERIALIZE */ \
\
decl(RDTSCP, "rdtscp", 48) /* RDTSCP instruction */ \
decl(RDPID, "rdpid", 49) /* RDPID instruction */ \
decl(FSRM, "fsrm", 50) /* Fast Short REP MOV */
decl(FSRM, "fsrm", 50) /* Fast Short REP MOV */ \
decl(GFNI, "gfni", 51) /* Vector GFNI instructions */ \
decl(AVX512_BITALG, "avx512_bitalg", 52) /* Vector sub-word popcount and bit gather instructions */
#define DECLARE_CPU_FEATURE_FLAG(id, name, bit) CPU_##id = (1ULL << bit),
CPU_FEATURE_FLAGS(DECLARE_CPU_FEATURE_FLAG)
@ -603,8 +604,12 @@ protected:
result |= CPU_AVX512_VPCLMULQDQ;
if (_cpuid_info.sef_cpuid7_ecx.bits.vaes != 0)
result |= CPU_AVX512_VAES;
if (_cpuid_info.sef_cpuid7_ecx.bits.gfni != 0)
result |= CPU_GFNI;
if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vnni != 0)
result |= CPU_AVX512_VNNI;
if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_bitalg != 0)
result |= CPU_AVX512_BITALG;
if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vbmi != 0)
result |= CPU_AVX512_VBMI;
if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vbmi2 != 0)
@ -918,7 +923,9 @@ public:
static bool supports_avx512_vpopcntdq() { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; }
static bool supports_avx512_vpclmulqdq() { return (_features & CPU_AVX512_VPCLMULQDQ) != 0; }
static bool supports_avx512_vaes() { return (_features & CPU_AVX512_VAES) != 0; }
static bool supports_gfni() { return (_features & CPU_GFNI) != 0; }
static bool supports_avx512_vnni() { return (_features & CPU_AVX512_VNNI) != 0; }
static bool supports_avx512_bitalg() { return (_features & CPU_AVX512_BITALG) != 0; }
static bool supports_avx512_vbmi() { return (_features & CPU_AVX512_VBMI) != 0; }
static bool supports_avx512_vbmi2() { return (_features & CPU_AVX512_VBMI2) != 0; }
static bool supports_hv() { return (_features & CPU_HV) != 0; }

View file

@ -1241,10 +1241,20 @@ static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use
return vector_length_encoding(def);
}
static inline bool is_vector_popcount_predicate(BasicType bt) {
return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
(is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
}
static inline bool is_unsigned_booltest_pred(int bt) {
return ((bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare);
}
static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
(VM_Version::supports_avx512vl() || vlen_bytes == 64);
}
class Node::PD {
public:
enum NodeFlags {
@ -1405,12 +1415,12 @@ const bool Matcher::match_rule_supported(int opcode) {
}
break;
case Op_PopCountVI:
if (!UsePopCountInstruction || (UseAVX < 2)) {
if (UseAVX < 2) {
return false;
}
break;
case Op_PopCountVL:
if (!UsePopCountInstruction || (UseAVX <= 2)) {
if (UseAVX < 2) {
return false;
}
break;
@ -1630,6 +1640,17 @@ const bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_CompressM:
if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
return false;
}
break;
case Op_CompressV:
case Op_ExpandV:
if (!VM_Version::supports_avx512vl()) {
return false;
}
break;
case Op_SqrtF:
if (UseSSE < 1) {
return false;
@ -1651,6 +1672,11 @@ const bool Matcher::match_rule_supported(int opcode) {
//------------------------------------------------------------------------
static inline bool is_pop_count_instr_target(BasicType bt) {
return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
(is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
}
// Identify extra cases that we might want to provide match rules for vector nodes and
// other intrinsics guarded with vector length (vlen) and element type (bt).
const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
@ -1860,7 +1886,7 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
case Op_LoadVectorGatherMasked:
case Op_StoreVectorScatterMasked:
case Op_StoreVectorScatter:
if(is_subword_type(bt)) {
if (is_subword_type(bt)) {
return false;
} else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
return false;
@ -1887,6 +1913,23 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
return false;
}
break;
case Op_CompressM:
if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
return false;
}
break;
case Op_CompressV:
case Op_ExpandV:
if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
return false;
}
if (size_in_bits < 128) {
return false;
}
if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
return false;
}
break;
case Op_VectorLongToMask:
if (UseAVX < 1 || !is_LP64) {
return false;
@ -1902,14 +1945,22 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
}
break;
case Op_PopCountVI:
if (!VM_Version::supports_avx512_vpopcntdq() &&
(vlen == 16) && !VM_Version::supports_avx512bw()) {
case Op_PopCountVL: {
if (!is_pop_count_instr_target(bt) &&
(size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
return false;
}
}
break;
case Op_ReverseV:
case Op_ReverseBytesV:
if (UseAVX < 2) {
return false;
}
break;
case Op_PopCountVL:
if (!VM_Version::supports_avx512_vpopcntdq() &&
((vlen <= 4) || ((vlen == 8) && !VM_Version::supports_avx512bw()))) {
case Op_CountTrailingZerosV:
case Op_CountLeadingZerosV:
if (UseAVX < 2) {
return false;
}
break;
@ -2057,9 +2108,20 @@ const bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, Bas
}
return true;
case Op_PopCountVI:
case Op_PopCountVL:
if (!is_pop_count_instr_target(bt)) {
return false;
}
return true;
case Op_MaskAll:
return true;
case Op_CountLeadingZerosV:
if (is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd()) {
return true;
}
default:
return false;
}
@ -8705,58 +8767,151 @@ instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
// --------------------------------- PopCount --------------------------------------
instruct vpopcountI_popcntd(vec dst, vec src) %{
predicate(VM_Version::supports_avx512_vpopcntdq());
instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
match(Set dst (PopCountVI src));
format %{ "vector_popcount_int $dst, $src\t! vector popcount packedI" %}
match(Set dst (PopCountVL src));
ins_cost(400);
format %{ "vector_popcount_integral $dst, $src" %}
ins_encode %{
assert(UsePopCountInstruction, "not enabled");
int vlen_enc = vector_length_encoding(this);
__ vector_popcount_int($dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg, noreg, vlen_enc);
int opcode = this->ideal_Opcode();
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
__ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
// TODO: Once auto-vectorizer supports ConvL2I operation, PopCountVL
// should be succeeded by its corresponding vector IR and following
// special handling should be removed.
if (opcode == Op_PopCountVL && Matcher::vector_element_basic_type(this) == T_INT) {
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
}
%}
ins_pipe( pipe_slow );
%}
instruct vpopcountI(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp, rFlagsReg cc) %{
predicate(!VM_Version::supports_avx512_vpopcntdq());
instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
match(Set dst (PopCountVI src mask));
match(Set dst (PopCountVL src mask));
format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
__ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
__ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
match(Set dst (PopCountVI src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, KILL cc);
format %{ "vector_popcount_int $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
match(Set dst (PopCountVL src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
ins_encode %{
assert(UsePopCountInstruction, "not enabled");
int vlen_enc = vector_length_encoding(this);
__ vector_popcount_int($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
$xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
int opcode = this->ideal_Opcode();
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
__ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
$xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
// TODO: Once auto-vectorizer supports ConvL2I operation, PopCountVL
// should be succeeded by its corresponding vector IR and following
// special handling should be removed.
if (opcode == Op_PopCountVL && Matcher::vector_element_basic_type(this) == T_INT) {
if (VM_Version::supports_avx512vl()) {
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
} else {
assert(VM_Version::supports_avx2(), "");
__ vpshufd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
__ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
}
}
%}
ins_pipe( pipe_slow );
%}
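
At the Java level these PopCountVI/PopCountVL rules serve both auto-vectorized Integer.bitCount/Long.bitCount loops and the Vector API's lanewise bit count. A short usage sketch against the incubator API (assuming the JDK 19 shape of VectorOperators.BIT_COUNT):

import jdk.internal.vm.annotation.ForceInline; // not needed; shown code uses only the public API
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;

public class VectorBitCount {
    static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_PREFERRED;

    public static void main(String[] args) {
        int[] src = {0, 1, 3, 7, 0xFF, -1, 0x80000000, 12345};
        int[] dst = new int[src.length];
        for (int i = 0; i < SPECIES.loopBound(src.length); i += SPECIES.length()) {
            IntVector v = IntVector.fromArray(SPECIES, src, i);
            v.lanewise(VectorOperators.BIT_COUNT).intoArray(dst, i);  // PopCountVI on supporting CPUs
        }
        for (int i = SPECIES.loopBound(src.length); i < src.length; i++) {
            dst[i] = Integer.bitCount(src[i]);                        // scalar tail
        }
        System.out.println(java.util.Arrays.toString(dst));           // [0, 1, 2, 3, 8, 32, 1, 6]
    }
}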
instruct vpopcountL_popcntd(vec dst, vec src) %{
predicate(VM_Version::supports_avx512_vpopcntdq());
match(Set dst (PopCountVL src));
format %{ "vector_popcount_long $dst, $src\t! vector popcount packedL" %}
// --------------------------------- Vector Trailing Zeros Count --------------------------------------
instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
Matcher::vector_length_in_bytes(n->in(1))));
match(Set dst (CountTrailingZerosV src));
effect(TEMP dst, TEMP xtmp, TEMP rtmp);
ins_cost(400);
format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp and $rtmp as TEMP" %}
ins_encode %{
assert(UsePopCountInstruction, "not enabled");
int vlen_enc = vector_length_encoding(this, $src);
__ vector_popcount_long($dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg, noreg, vlen_enc);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
BasicType rbt = Matcher::vector_element_basic_type(this);
__ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
// TODO: Once auto-vectorizer supports ConvL2I operation, CountTrailingZerosV
// should be succeeded by its corresponding vector IR and following
// special handling should be removed.
if (bt == T_LONG && rbt == T_INT) {
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
}
%}
ins_pipe( pipe_slow );
%}
instruct vpopcountL(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp, rFlagsReg cc) %{
predicate(!VM_Version::supports_avx512_vpopcntdq());
match(Set dst (PopCountVL src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, KILL cc);
format %{ "vector_popcount_long $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
VM_Version::supports_avx512cd() &&
(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
match(Set dst (CountTrailingZerosV src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
ins_cost(400);
format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
ins_encode %{
assert(UsePopCountInstruction, "not enabled");
int vlen_enc = vector_length_encoding(this, $src);
__ vector_popcount_long($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
$xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
__ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
$xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
match(Set dst (CountTrailingZerosV src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
ins_cost(400);
format %{ "vector_count_trailing_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
__ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
$ktmp$$KRegister, $rtmp$$Register, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
match(Set dst (CountTrailingZerosV src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
BasicType rbt = Matcher::vector_element_basic_type(this);
__ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
// TODO: Once auto-vectorizer supports ConvL2I operation, CountTrailingZerosV
// should be succeeded by its corresponding vector IR and following
// special handling should be removed.
if (bt == T_LONG && rbt == T_INT) {
assert(VM_Version::supports_avx2(), "");
__ vpshufd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
__ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
}
%}
ins_pipe( pipe_slow );
%}
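
Where a lane type has no direct trailing-zero instruction, a standard reduction is to isolate the lowest set bit and reuse the leading-zero count: tzcnt(x) = W - 1 - lzcnt(x & -x), with x == 0 special-cased. The macro-assembler helpers may differ in detail; the scalar Java sketch below only illustrates the identity:

public class TrailingZerosViaLzcnt {
    static int tzcnt32(int x) {
        if (x == 0) {
            return Integer.SIZE;                       // all 32 bits count as trailing zeros
        }
        int lowestBit = x & -x;                        // isolate the lowest set bit
        return Integer.SIZE - 1 - Integer.numberOfLeadingZeros(lowestBit);
    }

    public static void main(String[] args) {
        for (int x : new int[] {0, 1, 8, 0x80000000, 0xFFF0, -2}) {
            assert tzcnt32(x) == Integer.numberOfTrailingZeros(x);
        }
        System.out.println(tzcnt32(0xFFF0)); // 4
    }
}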
// --------------------------------- Bitwise Ternary Logic ----------------------------------
instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
@ -9031,8 +9186,200 @@ instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp,
%}
ins_pipe( pipe_slow );
%}
// --------------------------------- Compress/Expand Operations ---------------------------
instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
match(Set dst (CompressV src mask));
match(Set dst (ExpandV src mask));
format %{ "vector_compress_expand $dst, $src, $mask" %}
ins_encode %{
int opcode = this->ideal_Opcode();
int vector_len = vector_length_encoding(this);
BasicType bt = Matcher::vector_element_basic_type(this);
__ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
%}
ins_pipe( pipe_slow );
%}
instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
match(Set dst (CompressM mask));
effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
ins_encode %{
assert(this->in(1)->bottom_type()->isa_vectmask(), "");
int mask_len = Matcher::vector_length(this);
__ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
%}
ins_pipe( pipe_slow );
%}
#endif // _LP64
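
Lane-wise, CompressV gathers the elements at set mask positions into the low lanes and zeroes the rest, ExpandV performs the inverse placement, and CompressM applies the same packing to the mask bits themselves. A scalar Java reference of those semantics (a sketch, not the EVEX implementation):

public class CompressExpandSemantics {
    static int[] compress(int[] v, boolean[] m) {
        int[] r = new int[v.length];
        int j = 0;
        for (int i = 0; i < v.length; i++) {
            if (m[i]) r[j++] = v[i];       // selected lanes packed toward lane 0
        }
        return r;                          // remaining lanes stay zero
    }

    static int[] expand(int[] v, boolean[] m) {
        int[] r = new int[v.length];
        int j = 0;
        for (int i = 0; i < v.length; i++) {
            if (m[i]) r[i] = v[j++];       // low lanes of the source scattered to set positions
        }
        return r;
    }

    public static void main(String[] args) {
        int[] v = {10, 20, 30, 40};
        boolean[] m = {true, false, true, false};
        System.out.println(java.util.Arrays.toString(compress(v, m))); // [10, 30, 0, 0]
        System.out.println(java.util.Arrays.toString(expand(v, m)));   // [10, 0, 20, 0]
    }
}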
// -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
predicate(!VM_Version::supports_gfni());
match(Set dst (ReverseV src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
format %{ "vector_reverse_bit_evex $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
ins_encode %{
int vec_enc = vector_length_encoding(this);
BasicType bt = Matcher::vector_element_basic_type(this);
__ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
$xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp, rRegI rtmp) %{
predicate(VM_Version::supports_gfni());
match(Set dst (ReverseV src));
effect(TEMP dst, TEMP xtmp, TEMP rtmp);
format %{ "vector_reverse_bit_gfni $dst, $src!\t using $rtmp and $xtmp as TEMP" %}
ins_encode %{
int vec_enc = vector_length_encoding(this);
BasicType bt = Matcher::vector_element_basic_type(this);
InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
__ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
addr, $rtmp$$Register, vec_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vreverse_byte_reg(vec dst, vec src, rRegI rtmp) %{
predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
match(Set dst (ReverseBytesV src));
effect(TEMP dst, TEMP rtmp);
format %{ "vector_reverse_byte $dst, $src!\t using $rtmp as TEMP" %}
ins_encode %{
int vec_enc = vector_length_encoding(this);
BasicType bt = Matcher::vector_element_basic_type(this);
__ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, $rtmp$$Register, vec_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
match(Set dst (ReverseBytesV src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
format %{ "vector_reverse_byte $dst, $src!\t using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
ins_encode %{
int vec_enc = vector_length_encoding(this);
BasicType bt = Matcher::vector_element_basic_type(this);
__ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
$xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
%}
ins_pipe( pipe_slow );
%}
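
Per lane, ReverseV matches the scalar Integer.reverse/Long.reverse semantics. The GFNI rule above presumably uses the 0x8040201008040201 bit matrix to reverse the bits inside each byte, leaving only a byte reordering; the Java sketch below illustrates that decomposition for a 32-bit lane (an assumption about the approach, not the exact assembler sequence):

public class ReverseLaneSketch {
    // Reverse the bits within every byte of a 32-bit value.
    static int reverseBitsPerByte(int x) {
        int r = 0;
        for (int byteIdx = 0; byteIdx < 4; byteIdx++) {
            int b = (x >>> (8 * byteIdx)) & 0xFF;
            int rb = Integer.reverse(b) >>> 24;        // bit-reverse of one byte
            r |= rb << (8 * byteIdx);
        }
        return r;
    }

    public static void main(String[] args) {
        int x = 0x12345678;
        // Full bit reversal == per-byte bit reversal followed by byte reversal.
        int viaDecomposition = Integer.reverseBytes(reverseBitsPerByte(x));
        assert viaDecomposition == Integer.reverse(x);
        System.out.printf("0x%08X%n", viaDecomposition); // 0x1E6A2C48
    }
}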
// ---------------------------------- Vector Count Leading Zeros -----------------------------------
instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
Matcher::vector_length_in_bytes(n->in(1))));
match(Set dst (CountLeadingZerosV src));
format %{ "vector_count_leading_zeros $dst, $src" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
BasicType rbt = Matcher::vector_element_basic_type(this);
__ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
xnoreg, xnoreg, k0, noreg, true, vlen_enc);
// TODO: Once auto-vectorizer supports ConvL2I operation, CountLeadingZerosV
// should be succeeded by its corresponding vector IR and following
// special handling should be removed.
if (rbt == T_INT && bt == T_LONG) {
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
}
%}
ins_pipe( pipe_slow );
%}
instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
Matcher::vector_length_in_bytes(n->in(1))));
match(Set dst (CountLeadingZerosV src mask));
format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
__ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
__ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
VM_Version::supports_avx512cd() &&
(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
match(Set dst (CountLeadingZerosV src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1 and $xtmp2 as TEMP" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
__ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
$xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
match(Set dst (CountLeadingZerosV src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
format %{ "vector_count_leading_zeros $dst, $src!\t using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
__ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
$rtmp$$Register, true, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
match(Set dst (CountLeadingZerosV src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
__ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
match(Set dst (CountLeadingZerosV src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
BasicType rbt = Matcher::vector_element_basic_type(this);
__ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
// TODO: Once auto-vectorizer supports ConvL2I operation, CountLeadingZerosV
// should be succeeded by its corresponding vector IR and following
// special handling should be removed.
if (rbt == T_INT && bt == T_LONG) {
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
}
%}
ins_pipe( pipe_slow );
%}
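
CountLeadingZerosV follows the scalar Integer/Long.numberOfLeadingZeros semantics per lane. For subword lanes one simple reference is to zero-extend to 32 bits, count there, and subtract the extra width; the sketch below documents the expected lane results, not the table-lookup strategy the AVX rules above implement:

public class SubwordLeadingZeros {
    static int lzcntShort(short x) {
        int widened = x & 0xFFFF;                             // zero-extend to 32 bits
        return Integer.numberOfLeadingZeros(widened) - 16;    // drop the 16 extra high zero bits
    }

    static int lzcntByte(byte x) {
        int widened = x & 0xFF;
        return Integer.numberOfLeadingZeros(widened) - 24;
    }

    public static void main(String[] args) {
        System.out.println(lzcntShort((short) 0x0001)); // 15
        System.out.println(lzcntShort((short) 0));      // 16
        System.out.println(lzcntByte((byte) 0x10));     // 3
    }
}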
// ---------------------------------- Vector Masked Operations ------------------------------------
instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{

View file

@ -4212,6 +4212,7 @@ bool MatchRule::is_vector() const {
"SqrtVD","SqrtVF",
"AndV" ,"XorV" ,"OrV",
"MaxV", "MinV",
"CompressV", "ExpandV", "CompressM",
"AddReductionVI", "AddReductionVL",
"AddReductionVF", "AddReductionVD",
"MulReductionVI", "MulReductionVL",
@ -4223,7 +4224,7 @@ bool MatchRule::is_vector() const {
"LShiftVB","LShiftVS","LShiftVI","LShiftVL",
"RShiftVB","RShiftVS","RShiftVI","RShiftVL",
"URShiftVB","URShiftVS","URShiftVI","URShiftVL",
"ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD","PopulateIndex",
"ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD","ReverseV","ReverseBytesV",
"RoundDoubleModeV","RotateLeftV" , "RotateRightV", "LoadVector","StoreVector",
"LoadVectorGather", "StoreVectorScatter", "LoadVectorGatherMasked", "StoreVectorScatterMasked",
"VectorTest", "VectorLoadMask", "VectorStoreMask", "VectorBlend", "VectorInsert",
@ -4232,7 +4233,8 @@ bool MatchRule::is_vector() const {
"VectorCastL2X", "VectorCastF2X", "VectorCastD2X",
"VectorUCastB2X", "VectorUCastS2X", "VectorUCastI2X",
"VectorMaskWrapper","VectorMaskCmp","VectorReinterpret","LoadVectorMasked","StoreVectorMasked",
"FmaVD","FmaVF","PopCountVI", "PopCountVL", "SignumVF", "SignumVD", "VectorLongToMask",
"FmaVD","FmaVF","PopCountVI","PopCountVL","PopulateIndex","VectorLongToMask",
"CountLeadingZerosV", "CountTrailingZerosV", "SignumVF", "SignumVD",
// Next are vector mask ops.
"MaskAll", "AndVMask", "OrVMask", "XorVMask", "VectorMaskCast",
"RoundVF", "RoundVD",

View file

@ -935,7 +935,7 @@ class methodHandle;
"Ljava/lang/Object;" \
"J" \
"Ljava/lang/Object;" \
"I" \
"J" \
"Ljdk/internal/vm/vector/VectorSupport$VectorSpecies;" \
"Ljdk/internal/vm/vector/VectorSupport$LoadOperation;)" \
"Ljdk/internal/vm/vector/VectorSupport$VectorPayload;") \
@ -950,7 +950,7 @@ class methodHandle;
"J" \
"Ljdk/internal/vm/vector/VectorSupport$VectorMask;" \
"Ljava/lang/Object;" \
"I" \
"J" \
"Ljdk/internal/vm/vector/VectorSupport$VectorSpecies;" \
"Ljdk/internal/vm/vector/VectorSupport$LoadVectorMaskedOperation;)" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;") \
@ -962,8 +962,10 @@ class methodHandle;
"I" \
"Ljava/lang/Object;" \
"J" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;" \
"Ljava/lang/Object;ILjdk/internal/vm/vector/VectorSupport$StoreVectorOperation;)" \
"Ljdk/internal/vm/vector/VectorSupport$VectorPayload;" \
"Ljava/lang/Object;" \
"J" \
"Ljdk/internal/vm/vector/VectorSupport$StoreVectorOperation;)" \
"V") \
do_name(vector_store_op_name, "store") \
\
@ -977,7 +979,7 @@ class methodHandle;
"Ljdk/internal/vm/vector/VectorSupport$Vector;" \
"Ljdk/internal/vm/vector/VectorSupport$VectorMask;" \
"Ljava/lang/Object;" \
"I" \
"J" \
"Ljdk/internal/vm/vector/VectorSupport$StoreVectorMaskedOperation;)" \
"V") \
do_name(vector_store_masked_op_name, "storeMasked") \
@ -1137,6 +1139,17 @@ class methodHandle;
"J") \
do_name(vector_mask_oper_name, "maskReductionCoerced") \
\
do_intrinsic(_VectorCompressExpand, jdk_internal_vm_vector_VectorSupport, vector_compress_expand_op_name, vector_compress_expand_op_sig, F_S)\
do_signature(vector_compress_expand_op_sig, "(I" \
"Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"I" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;" \
"Ljdk/internal/vm/vector/VectorSupport$VectorMask;" \
"Ljdk/internal/vm/vector/VectorSupport$CompressExpandOperation;)" \
"Ljdk/internal/vm/vector/VectorSupport$VectorPayload;") \
do_name(vector_compress_expand_op_name, "compressExpandOp") \
/* (2) Bytecode intrinsics */ \
\
do_intrinsic(_park, jdk_internal_misc_Unsafe, park_name, park_signature, F_RN) \
@ -1245,7 +1258,7 @@ enum class vmIntrinsicID : int {
__IGNORE_CLASS, __IGNORE_NAME, __IGNORE_SIGNATURE, __IGNORE_ALIAS)
ID_LIMIT,
LAST_COMPILER_INLINE = _VectorMaskOp,
LAST_COMPILER_INLINE = _VectorCompressExpand,
FIRST_MH_SIG_POLY = _invokeGeneric,
FIRST_MH_STATIC = _linkToVirtual,
LAST_MH_SIG_POLY = _linkToNative,

View file

@ -715,6 +715,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
case vmIntrinsics::_Continuation_doYield:
break;
case vmIntrinsics::_VectorCompressExpand:
case vmIntrinsics::_VectorUnaryOp:
case vmIntrinsics::_VectorBinaryOp:
case vmIntrinsics::_VectorTernaryOp:

View file

@ -51,6 +51,7 @@ macro(ReverseBytesI)
macro(ReverseBytesL)
macro(ReverseBytesUS)
macro(ReverseBytesS)
macro(ReverseBytesV)
macro(CProj)
macro(CacheWB)
macro(CacheWBPreSync)
@ -74,6 +75,8 @@ macro(Catch)
macro(CatchProj)
macro(CheckCastPP)
macro(ClearArray)
macro(CompressBits)
macro(ExpandBits)
macro(ConstraintCast)
macro(CMoveD)
macro(CMoveVD)
@ -152,8 +155,10 @@ macro(LongCountedLoop)
macro(LongCountedLoopEnd)
macro(CountLeadingZerosI)
macro(CountLeadingZerosL)
macro(CountLeadingZerosV)
macro(CountTrailingZerosI)
macro(CountTrailingZerosL)
macro(CountTrailingZerosV)
macro(CreateEx)
macro(DecodeN)
macro(DecodeNKlass)
@ -285,6 +290,9 @@ macro(RShiftL)
macro(Region)
macro(Rethrow)
macro(Return)
macro(ReverseI)
macro(ReverseL)
macro(ReverseV)
macro(Root)
macro(RoundDouble)
macro(RoundDoubleMode)
@ -424,6 +432,9 @@ macro(MinV)
macro(MaxV)
macro(MinReductionV)
macro(MaxReductionV)
macro(CompressV)
macro(CompressM)
macro(ExpandV)
macro(LoadVector)
macro(LoadVectorGather)
macro(LoadVectorGatherMasked)

View file

@ -700,6 +700,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
return inline_vector_insert();
case vmIntrinsics::_VectorExtract:
return inline_vector_extract();
case vmIntrinsics::_VectorCompressExpand:
return inline_vector_compress_expand();
case vmIntrinsics::_getObjectSize:
return inline_getObjectSize();

View file

@ -344,6 +344,8 @@ class LibraryCallKit : public GraphKit {
bool inline_vector_convert();
bool inline_vector_extract();
bool inline_vector_insert();
bool inline_vector_compress_expand();
Node* gen_call_to_svml(int vector_api_op_id, BasicType bt, int num_elem, Node* opd1, Node* opd2);
enum VectorMaskUseType {

View file

@ -974,6 +974,9 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
case Op_RoundD: {
body_size += Matcher::scalar_op_pre_select_sz_estimate(n->Opcode(), n->bottom_type()->basic_type());
} break;
case Op_CountTrailingZerosV:
case Op_CountLeadingZerosV:
case Op_ReverseV:
case Op_RoundVF:
case Op_RoundVD:
case Op_PopCountVI:

View file

@ -2254,6 +2254,9 @@ bool Matcher::find_shared_visit(MStack& mstack, Node* n, uint opcode, bool& mem_
case Op_MacroLogicV:
case Op_LoadVectorMasked:
case Op_VectorCmpMasked:
case Op_CompressV:
case Op_CompressM:
case Op_ExpandV:
case Op_VectorLoadMask:
set_shared(n); // Force result into register (it will be anyways)
break;

View file

@ -389,4 +389,20 @@ public:
virtual uint ideal_reg() const { return Op_RegI; }
};
//------------------------------CompressBitsNode-------------------------------
// CompressBits placeholder node
class CompressBitsNode : public Node {
public:
CompressBitsNode(Node *in1, Node *in2) : Node(0,in1,in2) {}
virtual int Opcode() const;
};
//------------------------------ExpandBitsNode---------------------------------
// ExpandBits placeholder node
class ExpandBitsNode : public Node {
public:
ExpandBitsNode(Node *in1, Node *in2) : Node(0,in1,in2) {}
virtual int Opcode() const;
};
#endif // SHARE_OPTO_MULNODE_HPP
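
These placeholder nodes correspond to scalar bit compress/expand (PEXT/PDEP-style), presumably matching the semantics of the Integer.compress and Integer.expand library methods; the vop2ideal mapping later in this diff returns them, while VectorNode::opcode still reports them as unimplemented. A small Java reference of the bit-level semantics, as a sketch:

public class BitCompressExpand {
    // Gather the bits of x selected by mask into the low-order bits of the result (PEXT-like).
    static int compressBits(int x, int mask) {
        int result = 0, outBit = 0;
        for (int m = mask; m != 0; m &= m - 1) {       // iterate over the set bits of mask
            int bit = m & -m;
            if ((x & bit) != 0) {
                result |= 1 << outBit;
            }
            outBit++;
        }
        return result;
    }

    // Scatter the low-order bits of x to the positions selected by mask (PDEP-like).
    static int expandBits(int x, int mask) {
        int result = 0, inBit = 0;
        for (int m = mask; m != 0; m &= m - 1) {
            int bit = m & -m;
            if ((x & (1 << inBit)) != 0) {
                result |= bit;
            }
            inBit++;
        }
        return result;
    }

    public static void main(String[] args) {
        System.out.printf("0x%X%n", compressBits(0xCAFE, 0x0F0F)); // 0xAE
        System.out.printf("0x%X%n", expandBits(0xAE, 0x0F0F));     // 0xA0E
    }
}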

View file

@ -175,6 +175,9 @@ class VectorUnboxNode;
class VectorSet;
class VectorReinterpretNode;
class ShiftVNode;
class ExpandVNode;
class CompressVNode;
class CompressMNode;
#ifndef OPTO_DU_ITERATOR_ASSERT
@ -704,6 +707,9 @@ public:
DEFINE_CLASS_ID(VectorUnbox, Vector, 1)
DEFINE_CLASS_ID(VectorReinterpret, Vector, 2)
DEFINE_CLASS_ID(ShiftV, Vector, 3)
DEFINE_CLASS_ID(CompressV, Vector, 4)
DEFINE_CLASS_ID(ExpandV, Vector, 5)
DEFINE_CLASS_ID(CompressM, Vector, 6)
DEFINE_CLASS_ID(Proj, Node, 3)
DEFINE_CLASS_ID(CatchProj, Proj, 0)
@ -777,7 +783,8 @@ public:
Flag_is_predicated_vector = 1 << 14,
Flag_for_post_loop_opts_igvn = 1 << 15,
Flag_is_removed_by_peephole = 1 << 16,
_last_flag = Flag_is_removed_by_peephole
Flag_is_predicated_using_blend = 1 << 17,
_last_flag = Flag_is_predicated_using_blend
};
class PD;
@ -931,7 +938,10 @@ public:
DEFINE_CLASS_QUERY(Vector)
DEFINE_CLASS_QUERY(VectorMaskCmp)
DEFINE_CLASS_QUERY(VectorUnbox)
DEFINE_CLASS_QUERY(VectorReinterpret);
DEFINE_CLASS_QUERY(VectorReinterpret)
DEFINE_CLASS_QUERY(CompressV)
DEFINE_CLASS_QUERY(ExpandV)
DEFINE_CLASS_QUERY(CompressM)
DEFINE_CLASS_QUERY(LoadVector)
DEFINE_CLASS_QUERY(LoadVectorGather)
DEFINE_CLASS_QUERY(StoreVector)
@ -989,6 +999,8 @@ public:
bool is_predicated_vector() const { return (_flags & Flag_is_predicated_vector) != 0; }
bool is_predicated_using_blend() const { return (_flags & Flag_is_predicated_using_blend) != 0; }
// Used in lcm to mark nodes that have scheduled
bool is_scheduled() const { return (_flags & Flag_is_scheduled) != 0; }

View file

@ -548,4 +548,24 @@ public:
virtual uint ideal_reg() const { return Op_RegI; }
};
//-------------------------------ReverseINode--------------------------------
// reverse bits of an int
class ReverseINode : public Node {
public:
ReverseINode(Node *c, Node *in1) : Node(c, in1) {}
virtual int Opcode() const;
const Type *bottom_type() const { return TypeInt::INT; }
virtual uint ideal_reg() const { return Op_RegI; }
};
//-------------------------------ReverseLNode--------------------------------
// reverse bits of a long
class ReverseLNode : public Node {
public:
ReverseLNode(Node *c, Node *in1) : Node(c, in1) {}
virtual int Opcode() const;
const Type *bottom_type() const { return TypeLong::LONG; }
virtual uint ideal_reg() const { return Op_RegL; }
};
#endif // SHARE_OPTO_SUBNODE_HPP

View file

@ -2585,7 +2585,9 @@ bool SuperWord::output() {
opc == Op_AbsI || opc == Op_AbsL ||
opc == Op_NegF || opc == Op_NegD ||
opc == Op_RoundF || opc == Op_RoundD ||
opc == Op_PopCountI || opc == Op_PopCountL) {
opc == Op_PopCountI || opc == Op_PopCountL ||
opc == Op_CountLeadingZerosI || opc == Op_CountLeadingZerosL ||
opc == Op_CountTrailingZerosI || opc == Op_CountTrailingZerosL) {
assert(n->req() == 2, "only one input expected");
Node* in = vector_opd(p, 1);
vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
@ -3092,9 +3094,9 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) {
return true;
}
if (VectorNode::is_vpopcnt_long(use)) {
// VPOPCNT_LONG takes long and produces int - hence the special checks
// on alignment and size.
if (VectorNode::is_type_transition_long_to_int(use)) {
// PopCountL/CountLeadingZerosL/CountTrailingZerosL takes long and produces
// int - hence the special checks on alignment and size.
if (u_pk->size() != d_pk->size()) {
return false;
}
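
The size check above exists because Long.bitCount, Long.numberOfLeadingZeros and Long.numberOfTrailingZeros take a long but return an int, so the def packs and use packs carry different element sizes when SuperWord vectorizes a loop of the following shape (illustrative example):

public class LongToIntTransition {
    // Candidate for auto-vectorization into PopCountVL feeding int stores:
    // the input lanes are 64-bit, the produced counts are 32-bit.
    static void bitCounts(long[] src, int[] dst) {
        for (int i = 0; i < src.length; i++) {
            dst[i] = Long.bitCount(src[i]);
        }
    }

    public static void main(String[] args) {
        long[] src = {0L, 1L, -1L, 0x8000000000000000L};
        int[] dst = new int[src.length];
        bitCounts(src, dst);
        System.out.println(java.util.Arrays.toString(dst)); // [0, 1, 64, 1]
    }
}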

View file

@ -80,9 +80,12 @@ bool LibraryCallKit::arch_supports_vector_rotate(int opc, int num_elem, BasicTyp
}
if (is_supported) {
// Check whether mask unboxing is supported.
// Check if mask unboxing is supported. This is a two-step process: the contents of a boolean
// array are first loaded into a vector, followed by either lane expansion to match the lane size
// of the masked vector operation or population of the predicate register.
if ((mask_use_type & VecMaskUseLoad) != 0) {
if (!Matcher::match_rule_supported_vector(Op_VectorLoadMask, num_elem, elem_bt)) {
if (!Matcher::match_rule_supported_vector(Op_VectorLoadMask, num_elem, elem_bt) ||
!Matcher::match_rule_supported_vector(Op_LoadVector, num_elem, T_BOOLEAN)) {
#ifndef PRODUCT
if (C->print_intrinsics()) {
tty->print_cr(" ** Rejected vector mask loading (%s,%s,%d) because architecture does not support it",
@ -260,9 +263,12 @@ bool LibraryCallKit::arch_supports_vector(int sopc, int num_elem, BasicType type
return false;
}
// Check whether mask unboxing is supported.
// Check if mask unboxing is supported. This is a two-step process: the contents of a boolean
// array are first loaded into a vector, followed by either lane expansion to match the lane size
// of the masked vector operation or population of the predicate register.
if ((mask_use_type & VecMaskUseLoad) != 0) {
if (!Matcher::match_rule_supported_vector(Op_VectorLoadMask, num_elem, type)) {
if (!Matcher::match_rule_supported_vector(Op_VectorLoadMask, num_elem, type) ||
!Matcher::match_rule_supported_vector(Op_LoadVector, num_elem, T_BOOLEAN)) {
#ifndef PRODUCT
if (C->print_intrinsics()) {
tty->print_cr(" ** Rejected vector mask loading (%s,%s,%d) because architecture does not support it",
@ -273,9 +279,12 @@ bool LibraryCallKit::arch_supports_vector(int sopc, int num_elem, BasicType type
}
}
// Check whether mask boxing is supported.
// Check if mask boxing is supported. This is a two-step process: the contents of the mask vector /
// predicate register are first stored into a boolean vector, followed by a vector store operation
// to transfer the contents to the underlying storage of the mask box, which is a boolean array.
if ((mask_use_type & VecMaskUseStore) != 0) {
if (!Matcher::match_rule_supported_vector(Op_VectorStoreMask, num_elem, type)) {
if (!Matcher::match_rule_supported_vector(Op_VectorStoreMask, num_elem, type) ||
!Matcher::match_rule_supported_vector(Op_StoreVector, num_elem, T_BOOLEAN)) {
#ifndef PRODUCT
if (C->print_intrinsics()) {
tty->print_cr("Rejected vector mask storing (%s,%s,%d) because architecture does not support it",
@ -560,6 +569,7 @@ bool LibraryCallKit::inline_vector_nary_operation(int n) {
operation->add_req(mask);
operation->add_flag(Node::Flag_is_predicated_vector);
} else {
operation->add_flag(Node::Flag_is_predicated_using_blend);
operation = gvn().transform(operation);
operation = new VectorBlendNode(opd1, operation, mask);
}
@ -695,16 +705,8 @@ bool LibraryCallKit::inline_vector_mask_operation() {
ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
BasicType elem_bt = elem_type->basic_type();
if (!arch_supports_vector(Op_LoadVector, num_elem, T_BOOLEAN, VecMaskNotUsed)) {
if (C->print_intrinsics()) {
tty->print_cr(" ** not supported: arity=1 op=cast#%d/3 vlen2=%d etype2=%s",
Op_LoadVector, num_elem, type2name(T_BOOLEAN));
}
return false; // not supported
}
int mopc = VectorSupport::vop2ideal(oper->get_con(), elem_bt);
if (!arch_supports_vector(mopc, num_elem, elem_bt, VecMaskNotUsed)) {
if (!arch_supports_vector(mopc, num_elem, elem_bt, VecMaskUseLoad)) {
if (C->print_intrinsics()) {
tty->print_cr(" ** not supported: arity=1 op=cast#%d/3 vlen2=%d etype2=%s",
mopc, num_elem, type2name(elem_bt));
@ -937,7 +939,7 @@ static bool elem_consistent_with_arr(BasicType elem_bt, const TypeAryPtr* arr_ty
// S extends VectorSpecies<E>>
// VM load(Class<? extends VM> vmClass, Class<E> elementType, int length,
// Object base, long offset, // Unsafe addressing
// C container, int index, S s, // Arguments for default implementation
// C container, long index, S s, // Arguments for default implementation
// LoadOperation<C, VM, E, S> defaultImpl)
//
// public static
@ -946,7 +948,7 @@ static bool elem_consistent_with_arr(BasicType elem_bt, const TypeAryPtr* arr_ty
// void store(Class<?> vectorClass, Class<?> elementType, int length,
// Object base, long offset, // Unsafe addressing
// V v,
// C container, int index, // Arguments for default implementation
// C container, long index, // Arguments for default implementation
// StoreVectorOperation<C, V> defaultImpl)
bool LibraryCallKit::inline_vector_mem_operation(bool is_store) {
@ -1049,16 +1051,6 @@ bool LibraryCallKit::inline_vector_mem_operation(bool is_store) {
}
}
if (is_mask) {
if (!arch_supports_vector(Op_LoadVector, num_elem, T_BOOLEAN, VecMaskNotUsed)) {
if (C->print_intrinsics()) {
tty->print_cr(" ** not supported: arity=%d op=%s/mask vlen=%d etype=bit ismask=no",
is_store, is_store ? "store" : "load",
num_elem);
}
set_map(old_map);
set_sp(old_sp);
return false; // not supported
}
if (!is_store) {
if (!arch_supports_vector(Op_LoadVector, num_elem, elem_bt, VecMaskUseLoad)) {
set_map(old_map);
@ -1096,7 +1088,9 @@ bool LibraryCallKit::inline_vector_mem_operation(bool is_store) {
const TypeVect* to_vect_type = TypeVect::make(T_BYTE, store_num_elem);
val = gvn().transform(new VectorReinterpretNode(val, val->bottom_type()->is_vect(), to_vect_type));
}
if (is_mask) {
val = gvn().transform(VectorStoreMaskNode::make(gvn(), val, elem_bt, num_elem));
}
Node* vstore = gvn().transform(StoreVectorNode::make(0, control(), memory(addr), addr, addr_type, val, store_num_elem));
set_memory(vstore, addr_type);
} else {
@ -1138,7 +1132,7 @@ bool LibraryCallKit::inline_vector_mem_operation(bool is_store) {
// M extends VectorMask<E>>
// V loadMasked(Class<? extends V> vectorClass, Class<M> maskClass, Class<E> elementType,
// int length, Object base, long offset, M m,
// C container, int index, S s, // Arguments for default implementation
// C container, long index, S s, // Arguments for default implementation
// LoadVectorMaskedOperation<C, V, S, M> defaultImpl) {
//
// public static
@ -1149,7 +1143,7 @@ bool LibraryCallKit::inline_vector_mem_operation(bool is_store) {
// void storeMasked(Class<? extends V> vectorClass, Class<M> maskClass, Class<E> elementType,
// int length, Object base, long offset,
// V v, M m,
// C container, int index, // Arguments for default implementation
// C container, long index, // Arguments for default implementation
// StoreVectorMaskedOperation<C, V, M, E> defaultImpl) {
//
bool LibraryCallKit::inline_vector_mem_masked_operation(bool is_store) {
@ -2736,3 +2730,97 @@ bool LibraryCallKit::inline_vector_extract() {
return true;
}
// public static
// <V extends Vector<E>,
// M extends VectorMask<E>,
// E>
// V compressExpandOp(int opr,
// Class<? extends V> vClass, Class<? extends M> mClass, Class<E> eClass,
// int length, V v, M m,
// CompressExpandOperation<V, M> defaultImpl)
bool LibraryCallKit::inline_vector_compress_expand() {
const TypeInt* opr = gvn().type(argument(0))->isa_int();
const TypeInstPtr* vector_klass = gvn().type(argument(1))->isa_instptr();
const TypeInstPtr* mask_klass = gvn().type(argument(2))->isa_instptr();
const TypeInstPtr* elem_klass = gvn().type(argument(3))->isa_instptr();
const TypeInt* vlen = gvn().type(argument(4))->isa_int();
if (vector_klass == NULL || elem_klass == NULL || mask_klass == NULL || vlen == NULL ||
vector_klass->const_oop() == NULL || mask_klass->const_oop() == NULL ||
elem_klass->const_oop() == NULL || !vlen->is_con() || !opr->is_con()) {
if (C->print_intrinsics()) {
tty->print_cr(" ** missing constant: opr=%s vclass=%s mclass=%s etype=%s vlen=%s",
NodeClassNames[argument(0)->Opcode()],
NodeClassNames[argument(1)->Opcode()],
NodeClassNames[argument(2)->Opcode()],
NodeClassNames[argument(3)->Opcode()],
NodeClassNames[argument(4)->Opcode()]);
}
return false; // not enough info for intrinsification
}
if (!is_klass_initialized(vector_klass) || !is_klass_initialized(mask_klass)) {
if (C->print_intrinsics()) {
tty->print_cr(" ** klass argument not initialized");
}
return false;
}
ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
if (!elem_type->is_primitive_type()) {
if (C->print_intrinsics()) {
tty->print_cr(" ** not a primitive bt=%d", elem_type->basic_type());
}
return false; // should be primitive type
}
int num_elem = vlen->get_con();
BasicType elem_bt = elem_type->basic_type();
int opc = VectorSupport::vop2ideal(opr->get_con(), elem_bt);
if (!arch_supports_vector(opc, num_elem, elem_bt, VecMaskUseLoad)) {
if (C->print_intrinsics()) {
tty->print_cr(" ** not supported: opc=%d vlen=%d etype=%s ismask=useload",
opc, num_elem, type2name(elem_bt));
}
return false; // not supported
}
Node* opd1 = NULL;
const TypeInstPtr* vbox_type = NULL;
if (opc != Op_CompressM) {
ciKlass* vbox_klass = vector_klass->const_oop()->as_instance()->java_lang_Class_klass();
vbox_type = TypeInstPtr::make_exact(TypePtr::NotNull, vbox_klass);
opd1 = unbox_vector(argument(5), vbox_type, elem_bt, num_elem);
if (opd1 == NULL) {
if (C->print_intrinsics()) {
tty->print_cr(" ** unbox failed vector=%s",
NodeClassNames[argument(5)->Opcode()]);
}
return false;
}
}
ciKlass* mbox_klass = mask_klass->const_oop()->as_instance()->java_lang_Class_klass();
assert(is_vector_mask(mbox_klass), "argument(6) should be a mask class");
const TypeInstPtr* mbox_type = TypeInstPtr::make_exact(TypePtr::NotNull, mbox_klass);
Node* mask = unbox_vector(argument(6), mbox_type, elem_bt, num_elem);
if (mask == NULL) {
if (C->print_intrinsics()) {
tty->print_cr(" ** unbox failed mask=%s",
NodeClassNames[argument(6)->Opcode()]);
}
return false;
}
const TypeVect* vt = TypeVect::make(elem_bt, num_elem, opc == Op_CompressM);
Node* operation = gvn().transform(VectorNode::make(opc, opd1, mask, vt));
// Wrap it up in VectorBox to keep object type information.
const TypeInstPtr* box_type = opc == Op_CompressM ? mbox_type : vbox_type;
Node* vbox = box_vector(operation, box_type, elem_bt, num_elem);
set_result(vbox);
C->set_max_vector_size(MAX2(C->max_vector_size(), (uint)(num_elem * type2aelembytes(elem_bt))));
return true;
}
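
This intrinsic backs the compress/expand methods the Vector API gains in this JEP. A usage sketch against the JDK 19 incubator module (run with --add-modules jdk.incubator.vector; the lane values in the comments assume IntVector.SPECIES_256, i.e. eight int lanes):

import jdk.incubator.vector.*;

public class CompressExpandDemo {
    static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_256;

    public static void main(String[] args) {
        int[] data = {5, -1, 7, -3, 9, -8, 2, -6};
        IntVector v = IntVector.fromArray(SPECIES, data, 0);
        VectorMask<Integer> m = v.compare(VectorOperators.GT, 0);

        IntVector packed   = v.compress(m);    // CompressV: positives packed into the low lanes
        IntVector restored = packed.expand(m); // ExpandV: placed back at the masked positions
        VectorMask<Integer> packedMask = m.compress(); // CompressM: set mask lanes moved low

        System.out.println(packed);     // [5, 7, 9, 2, 0, 0, 0, 0]
        System.out.println(restored);   // [5, 0, 7, 0, 9, 0, 2, 0]
        System.out.println(packedMask); // first four lanes set (toString format may differ)
    }
}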

View file

@ -162,11 +162,22 @@ int VectorNode::opcode(int sopc, BasicType bt) {
case Op_RoundD:
return (bt == T_LONG ? Op_RoundVD : 0);
case Op_PopCountI:
// Unimplemented for subword types since bit count changes
// depending on size of lane (and sign bit).
return (bt == T_INT ? Op_PopCountVI : 0);
return Op_PopCountVI;
case Op_PopCountL:
return Op_PopCountVL;
case Op_ReverseI:
case Op_ReverseL:
return (is_integral_type(bt) ? Op_ReverseV : 0);
case Op_ReverseBytesS:
case Op_ReverseBytesI:
case Op_ReverseBytesL:
return (is_integral_type(bt) ? Op_ReverseBytesV : 0);
case Op_CompressBits:
// Not implemented. Returning 0 temporarily
return 0;
case Op_ExpandBits:
// Not implemented. Returning 0 temporarily
return 0;
case Op_LShiftI:
switch (bt) {
case T_BOOLEAN:
@ -245,6 +256,12 @@ int VectorNode::opcode(int sopc, BasicType bt) {
return Op_VectorCastF2X;
case Op_ConvD2L:
return Op_VectorCastD2X;
case Op_CountLeadingZerosI:
case Op_CountLeadingZerosL:
return Op_CountLeadingZerosV;
case Op_CountTrailingZerosI:
case Op_CountTrailingZerosL:
return Op_CountTrailingZerosV;
case Op_SignumF:
return Op_SignumVF;
case Op_SignumD:
@ -317,16 +334,17 @@ bool VectorNode::is_muladds2i(Node* n) {
return false;
}
bool VectorNode::is_vpopcnt_long(Node* n) {
if (n->Opcode() == Op_PopCountL) {
return true;
bool VectorNode::is_type_transition_long_to_int(Node* n) {
switch(n->Opcode()) {
case Op_PopCountL:
case Op_CountLeadingZerosL:
case Op_CountTrailingZerosL:
return true;
default:
return false;
}
return false;
}
bool VectorNode::is_roundopD(Node* n) {
if (n->Opcode() == Op_RoundDoubleMode) {
return true;
@ -595,6 +613,9 @@ VectorNode* VectorNode::make(int vopc, Node* n1, Node* n2, const TypeVect* vt, b
case Op_NegVF: return new NegVFNode(n1, vt);
case Op_NegVD: return new NegVDNode(n1, vt);
case Op_ReverseV: return new ReverseVNode(n1, vt);
case Op_ReverseBytesV: return new ReverseBytesVNode(n1, vt);
case Op_SqrtVF: return new SqrtVFNode(n1, vt);
case Op_SqrtVD: return new SqrtVDNode(n1, vt);
@ -628,6 +649,12 @@ VectorNode* VectorNode::make(int vopc, Node* n1, Node* n2, const TypeVect* vt, b
case Op_RoundDoubleModeV: return new RoundDoubleModeVNode(n1, n2, vt);
case Op_MulAddVS2VI: return new MulAddVS2VINode(n1, n2, vt);
case Op_ExpandV: return new ExpandVNode(n1, n2, vt);
case Op_CompressV: return new CompressVNode(n1, n2, vt);
case Op_CompressM: assert(n1 == NULL, ""); return new CompressMNode(n2, vt);
case Op_CountLeadingZerosV: return new CountLeadingZerosVNode(n1, vt);
case Op_CountTrailingZerosV: return new CountTrailingZerosVNode(n1, vt);
default:
fatal("Missed vector creation for '%s'", NodeClassNames[vopc]);
return NULL;
@ -1669,6 +1696,38 @@ Node* NegVNode::Ideal(PhaseGVN* phase, bool can_reshape) {
return NULL;
}
Node* ReverseBytesVNode::Identity(PhaseGVN* phase) {
if (is_predicated_using_blend()) {
return this;
}
// ReverseBytesV (ReverseBytesV X , MASK) , MASK => X
if (in(1)->Opcode() == Op_ReverseBytesV) {
if (is_predicated_vector() && in(1)->is_predicated_vector() && in(2) == in(1)->in(2)) {
return in(1)->in(1);
} else {
// ReverseBytesV (ReverseBytesV X) => X
return in(1)->in(1);
}
}
return this;
}
Node* ReverseVNode::Identity(PhaseGVN* phase) {
if (is_predicated_using_blend()) {
return this;
}
// ReverseV (ReverseV X , MASK) , MASK => X
if (in(1)->Opcode() == Op_ReverseV) {
if (is_predicated_vector() && in(1)->is_predicated_vector() && in(2) == in(1)->in(2)) {
return in(1)->in(1);
} else {
// ReverseV (ReverseV X) => X
return in(1)->in(1);
}
}
return this;
}
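
Both Identity hooks exploit the fact that reversal is an involution per lane, so back-to-back reversals under the same predicate cancel. The scalar analogue, for illustration:

public class ReverseInvolution {
    public static void main(String[] args) {
        long x = 0x0123456789ABCDEFL;
        // Applying the same reversal twice restores the input; the vector Identity
        // transforms above fold ReverseV(ReverseV(x)) and ReverseBytesV(ReverseBytesV(x)) to x.
        assert Long.reverse(Long.reverse(x)) == x;
        assert Long.reverseBytes(Long.reverseBytes(x)) == x;
        System.out.println(Long.toHexString(Long.reverseBytes(x))); // efcdab8967452301
    }
}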
#ifndef PRODUCT
void VectorBoxAllocateNode::dump_spec(outputStream *st) const {
CallStaticJavaNode::dump_spec(st);

View file

@ -93,7 +93,7 @@ class VectorNode : public TypeNode {
static bool is_type_transition_short_to_int(Node* n);
static bool is_type_transition_to_int(Node* n);
static bool is_muladds2i(Node* n);
static bool is_vpopcnt_long(Node* n);
static bool is_type_transition_long_to_int(Node* n);
static bool is_roundopD(Node* n);
static bool is_scalar_rotate(Node* n);
static bool is_vector_rotate_supported(int opc, uint vlen, BasicType bt);
@ -769,6 +769,37 @@ public:
virtual int Opcode() const;
};
//------------------------------CompressVNode--------------------------------------
// Vector compress
class CompressVNode: public VectorNode {
public:
CompressVNode(Node* vec, Node* mask, const TypeVect* vt) :
VectorNode(vec, mask, vt) {
init_class_id(Class_CompressV);
}
virtual int Opcode() const;
};
class CompressMNode: public VectorNode {
public:
CompressMNode(Node* mask, const TypeVect* vt) :
VectorNode(mask, vt) {
init_class_id(Class_CompressM);
}
virtual int Opcode() const;
};
//------------------------------ExpandVNode--------------------------------------
// Vector expand
class ExpandVNode: public VectorNode {
public:
ExpandVNode(Node* vec, Node* mask, const TypeVect* vt) :
VectorNode(vec, mask, vt) {
init_class_id(Class_ExpandV);
}
virtual int Opcode() const;
};
//================================= M E M O R Y ===============================
//------------------------------LoadVectorNode---------------------------------
@ -1392,7 +1423,6 @@ class VectorBlendNode : public VectorNode {
public:
VectorBlendNode(Node* vec1, Node* vec2, Node* mask)
: VectorNode(vec1, vec2, mask, vec1->bottom_type()->is_vect()) {
// assert(mask->is_VectorMask(), "VectorBlendNode requires that third argument be a mask");
}
virtual int Opcode() const;
@ -1675,6 +1705,40 @@ public:
Node* Ideal(PhaseGVN* phase, bool can_reshape);
};
class CountLeadingZerosVNode : public VectorNode {
public:
CountLeadingZerosVNode(Node* in, const TypeVect* vt)
: VectorNode(in, vt) {}
virtual int Opcode() const;
};
class CountTrailingZerosVNode : public VectorNode {
public:
CountTrailingZerosVNode(Node* in, const TypeVect* vt)
: VectorNode(in, vt) {}
virtual int Opcode() const;
};
class ReverseVNode : public VectorNode {
public:
ReverseVNode(Node* in, const TypeVect* vt)
: VectorNode(in, vt) {}
virtual Node* Identity(PhaseGVN* phase);
virtual int Opcode() const;
};
class ReverseBytesVNode : public VectorNode {
public:
ReverseBytesVNode(Node* in, const TypeVect* vt)
: VectorNode(in, vt) {}
virtual Node* Identity(PhaseGVN* phase);
virtual int Opcode() const;
};
class SignumVFNode : public VectorNode {
public:
SignumVFNode(Node* in1, Node* zero, Node* one, const TypeVect* vt)
@ -1690,4 +1754,5 @@ public:
virtual int Opcode() const;
};
#endif // SHARE_OPTO_VECTORNODE_HPP

View file

@ -443,6 +443,109 @@ int VectorSupport::vop2ideal(jint id, BasicType bt) {
}
break;
}
case VECTOR_OP_EXPAND: {
switch (bt) {
case T_BYTE: // fall-through
case T_SHORT: // fall-through
case T_INT: // fall-through
case T_LONG: // fall-through
case T_FLOAT: // fall-through
case T_DOUBLE: return Op_ExpandV;
default: fatal("EXPAND: %s", type2name(bt));
}
break;
}
case VECTOR_OP_COMPRESS: {
switch (bt) {
case T_BYTE: // fall-through
case T_SHORT: // fall-through
case T_INT: // fall-through
case T_LONG: // fall-through
case T_FLOAT: // fall-through
case T_DOUBLE: return Op_CompressV;
default: fatal("COMPRESS: %s", type2name(bt));
}
break;
}
case VECTOR_OP_MASK_COMPRESS: {
switch (bt) {
case T_BYTE: // fall-through
case T_SHORT: // fall-through
case T_INT: // fall-through
case T_LONG: // fall-through
case T_FLOAT: // fall-through
case T_DOUBLE: return Op_CompressM;
default: fatal("MASK_COMPRESS: %s", type2name(bt));
}
break;
}
case VECTOR_OP_BIT_COUNT: {
switch (bt) {
case T_BYTE: // Returning Op_PopCountI
case T_SHORT: // for byte and short types temporarily
case T_INT: return Op_PopCountI;
case T_LONG: return Op_PopCountL;
default: fatal("BIT_COUNT: %s", type2name(bt));
}
break;
}
case VECTOR_OP_TZ_COUNT: {
switch (bt) {
case T_BYTE:
case T_SHORT:
case T_INT: return Op_CountTrailingZerosI;
case T_LONG: return Op_CountTrailingZerosL;
default: fatal("TZ_COUNT: %s", type2name(bt));
}
break;
}
case VECTOR_OP_LZ_COUNT: {
switch (bt) {
case T_BYTE:
case T_SHORT:
case T_INT: return Op_CountLeadingZerosI;
case T_LONG: return Op_CountLeadingZerosL;
default: fatal("LZ_COUNT: %s", type2name(bt));
}
break;
}
case VECTOR_OP_REVERSE: {
switch (bt) {
case T_BYTE: // Temporarily returning
case T_SHORT: // Op_ReverseI for byte and short
case T_INT: return Op_ReverseI;
case T_LONG: return Op_ReverseL;
default: fatal("REVERSE: %s", type2name(bt));
}
break;
}
case VECTOR_OP_REVERSE_BYTES: {
switch (bt) {
case T_BYTE:
case T_SHORT:
case T_INT: return Op_ReverseBytesI;
case T_LONG: return Op_ReverseBytesL;
default: fatal("REVERSE_BYTES: %s", type2name(bt));
}
break;
}
case VECTOR_OP_COMPRESS_BITS: {
switch (bt) {
case T_INT:
case T_LONG: return Op_CompressBits;
default: fatal("COMPRESS_BITS: %s", type2name(bt));
}
break;
}
case VECTOR_OP_EXPAND_BITS: {
switch (bt) {
case T_INT:
case T_LONG: return Op_ExpandBits;
default: fatal("EXPAND_BITS: %s", type2name(bt));
}
break;
}
case VECTOR_OP_TAN:
case VECTOR_OP_TANH:
case VECTOR_OP_SIN:

View file

@ -54,6 +54,7 @@ class VectorSupport : AllStatic {
VECTOR_OP_ABS = 0,
VECTOR_OP_NEG = 1,
VECTOR_OP_SQRT = 2,
VECTOR_OP_BIT_COUNT = 3,
// Binary
VECTOR_OP_ADD = 4,
@ -89,6 +90,17 @@ class VectorSupport : AllStatic {
VECTOR_OP_LROTATE = 24,
VECTOR_OP_RROTATE = 25,
VECTOR_OP_COMPRESS = 26,
VECTOR_OP_EXPAND = 27,
VECTOR_OP_MASK_COMPRESS = 28,
VECTOR_OP_TZ_COUNT = 29,
VECTOR_OP_LZ_COUNT = 30,
VECTOR_OP_REVERSE = 31,
VECTOR_OP_REVERSE_BYTES = 32,
VECTOR_OP_COMPRESS_BITS = 33,
VECTOR_OP_EXPAND_BITS = 34,
// Vector Math Library
VECTOR_OP_TAN = 101,
VECTOR_OP_TANH = 102,

View file

@ -1775,6 +1775,9 @@
declare_c2_type(FmaVFNode, VectorNode) \
declare_c2_type(CMoveVFNode, VectorNode) \
declare_c2_type(CMoveVDNode, VectorNode) \
declare_c2_type(CompressVNode, VectorNode) \
declare_c2_type(CompressMNode, VectorNode) \
declare_c2_type(ExpandVNode, VectorNode) \
declare_c2_type(MulReductionVDNode, ReductionNode) \
declare_c2_type(DivVFNode, VectorNode) \
declare_c2_type(DivVDNode, VectorNode) \
@ -1866,6 +1869,10 @@
declare_c2_type(VectorUnboxNode, VectorNode) \
declare_c2_type(VectorReinterpretNode, VectorNode) \
declare_c2_type(VectorMaskCastNode, VectorNode) \
declare_c2_type(CountLeadingZerosVNode, VectorNode) \
declare_c2_type(CountTrailingZerosVNode, VectorNode) \
declare_c2_type(ReverseBytesVNode, VectorNode) \
declare_c2_type(ReverseVNode, VectorNode) \
declare_c2_type(MaskAllNode, VectorNode) \
declare_c2_type(AndVMaskNode, VectorNode) \
declare_c2_type(OrVMaskNode, VectorNode) \


@ -721,6 +721,10 @@ inline bool is_integral_type(BasicType t) {
return is_subword_type(t) || t == T_INT || t == T_LONG;
}
inline bool is_non_subword_integral_type(BasicType t) {
return t == T_INT || t == T_LONG;
}
inline bool is_floating_point_type(BasicType t) {
return (t == T_FLOAT || t == T_DOUBLE);
}


@ -32,8 +32,6 @@ import java.lang.annotation.Target;
import java.lang.foreign.MemorySegment;
import java.lang.ref.Reference;
import java.io.FileDescriptor;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import jdk.internal.access.JavaNioAccess;
import jdk.internal.access.SharedSecrets;
@ -313,59 +311,25 @@ public class ScopedMemoryAccess {
}
}
// ByteBuffer vector access ops
// Buffer access constants, to be initialized when required.
// Avoids a null value for NIO_ACCESS, due to class initialization dependencies
static final class BufferAccess {
// Buffer.address
static final long BUFFER_ADDRESS
= UNSAFE.objectFieldOffset(Buffer.class, "address");
// ByteBuffer.hb
static final long BYTE_BUFFER_HB
= UNSAFE.objectFieldOffset(ByteBuffer.class, "hb");
static final long BYTE_BUFFER_IS_READ_ONLY
= UNSAFE.objectFieldOffset(ByteBuffer.class, "isReadOnly");
@ForceInline
static Object bufferBase(ByteBuffer bb) {
return UNSAFE.getReference(bb, BYTE_BUFFER_HB);
}
@ForceInline
static long bufferAddress(ByteBuffer bb, long offset) {
return UNSAFE.getLong(bb, BUFFER_ADDRESS) + offset;
}
static final JavaNioAccess NIO_ACCESS = SharedSecrets.getJavaNioAccess();
@ForceInline
static MemorySessionImpl session(ByteBuffer bb) {
MemorySegment segment = NIO_ACCESS.bufferSegment(bb);
return segment != null ?
((AbstractMemorySegmentImpl)segment).sessionImpl() : null;
}
}
@ForceInline
public static boolean isReadOnly(ByteBuffer bb) {
return UNSAFE.getBoolean(bb, BufferAccess.BYTE_BUFFER_IS_READ_ONLY);
}
// MemorySegment vector access ops
@ForceInline
public static
<V extends VectorSupport.Vector<E>, E, S extends VectorSupport.VectorSpecies<E>>
V loadFromByteBuffer(Class<? extends V> vmClass, Class<E> e, int length,
ByteBuffer bb, int offset,
S s,
VectorSupport.LoadOperation<ByteBuffer, V, S> defaultImpl) {
V loadFromMemorySegment(Class<? extends V> vmClass, Class<E> e, int length,
AbstractMemorySegmentImpl msp, long offset,
S s,
VectorSupport.LoadOperation<AbstractMemorySegmentImpl, V, S> defaultImpl) {
// @@@ Smarter alignment checking if accessing heap segment backing non-byte[] array
if (msp.maxAlignMask() > 1) {
throw new IllegalArgumentException();
}
try {
return loadFromByteBufferScoped(
BufferAccess.session(bb),
return loadFromMemorySegmentScopedInternal(
msp.sessionImpl(),
vmClass, e, length,
bb, offset,
msp, offset,
s,
defaultImpl);
} catch (ScopedAccessError ex) {
@ -377,22 +341,18 @@ public class ScopedMemoryAccess {
@ForceInline
private static
<V extends VectorSupport.Vector<E>, E, S extends VectorSupport.VectorSpecies<E>>
V loadFromByteBufferScoped(MemorySessionImpl session,
Class<? extends V> vmClass, Class<E> e, int length,
ByteBuffer bb, int offset,
S s,
VectorSupport.LoadOperation<ByteBuffer, V, S> defaultImpl) {
V loadFromMemorySegmentScopedInternal(MemorySessionImpl session,
Class<? extends V> vmClass, Class<E> e, int length,
AbstractMemorySegmentImpl msp, long offset,
S s,
VectorSupport.LoadOperation<AbstractMemorySegmentImpl, V, S> defaultImpl) {
try {
if (session != null) {
session.checkValidState();
}
final byte[] base = (byte[]) BufferAccess.bufferBase(bb);
session.checkValidState();
return VectorSupport.load(vmClass, e, length,
base, BufferAccess.bufferAddress(bb, offset),
bb, offset, s,
defaultImpl);
msp.unsafeGetBase(), msp.unsafeGetOffset() + offset,
msp, offset, s,
defaultImpl);
} finally {
Reference.reachabilityFence(session);
}
@ -402,17 +362,22 @@ public class ScopedMemoryAccess {
public static
<V extends VectorSupport.Vector<E>, E, S extends VectorSupport.VectorSpecies<E>,
M extends VectorSupport.VectorMask<E>>
V loadFromByteBufferMasked(Class<? extends V> vmClass, Class<M> maskClass, Class<E> e,
int length, ByteBuffer bb, int offset, M m, S s,
VectorSupport.LoadVectorMaskedOperation<ByteBuffer, V, S, M> defaultImpl) {
V loadFromMemorySegmentMasked(Class<? extends V> vmClass, Class<M> maskClass, Class<E> e,
int length, AbstractMemorySegmentImpl msp, long offset, M m, S s,
VectorSupport.LoadVectorMaskedOperation<AbstractMemorySegmentImpl, V, S, M> defaultImpl) {
// @@@ Smarter alignment checking if accessing heap segment backing non-byte[] array
if (msp.maxAlignMask() > 1) {
throw new IllegalArgumentException();
}
try {
return loadFromByteBufferMaskedScoped(
BufferAccess.session(bb),
return loadFromMemorySegmentMaskedScopedInternal(
msp.sessionImpl(),
vmClass, maskClass, e, length,
bb, offset, m,
msp, offset, m,
s,
defaultImpl);
} catch (ScopedMemoryAccess.ScopedAccessError ex) {
} catch (ScopedAccessError ex) {
throw new IllegalStateException("This segment is already closed");
}
}
@ -422,19 +387,17 @@ public class ScopedMemoryAccess {
private static
<V extends VectorSupport.Vector<E>, E, S extends VectorSupport.VectorSpecies<E>,
M extends VectorSupport.VectorMask<E>>
V loadFromByteBufferMaskedScoped(MemorySessionImpl session, Class<? extends V> vmClass,
Class<M> maskClass, Class<E> e, int length,
ByteBuffer bb, int offset, M m,
S s,
VectorSupport.LoadVectorMaskedOperation<ByteBuffer, V, S, M> defaultImpl) {
V loadFromMemorySegmentMaskedScopedInternal(MemorySessionImpl session, Class<? extends V> vmClass,
Class<M> maskClass, Class<E> e, int length,
AbstractMemorySegmentImpl msp, long offset, M m,
S s,
VectorSupport.LoadVectorMaskedOperation<AbstractMemorySegmentImpl, V, S, M> defaultImpl) {
try {
if (session != null) {
session.checkValidState();
}
session.checkValidState();
return VectorSupport.loadMasked(vmClass, maskClass, e, length,
BufferAccess.bufferBase(bb), BufferAccess.bufferAddress(bb, offset), m,
bb, offset, s,
msp.unsafeGetBase(), msp.unsafeGetOffset() + offset, m,
msp, offset, s,
defaultImpl);
} finally {
Reference.reachabilityFence(session);
@ -444,16 +407,21 @@ public class ScopedMemoryAccess {
@ForceInline
public static
<V extends VectorSupport.Vector<E>, E>
void storeIntoByteBuffer(Class<? extends V> vmClass, Class<E> e, int length,
V v,
ByteBuffer bb, int offset,
VectorSupport.StoreVectorOperation<ByteBuffer, V> defaultImpl) {
void storeIntoMemorySegment(Class<? extends V> vmClass, Class<E> e, int length,
V v,
AbstractMemorySegmentImpl msp, long offset,
VectorSupport.StoreVectorOperation<AbstractMemorySegmentImpl, V> defaultImpl) {
// @@@ Smarter alignment checking if accessing heap segment backing non-byte[] array
if (msp.maxAlignMask() > 1) {
throw new IllegalArgumentException();
}
try {
storeIntoByteBufferScoped(
BufferAccess.session(bb),
storeIntoMemorySegmentScopedInternal(
msp.sessionImpl(),
vmClass, e, length,
v,
bb, offset,
msp, offset,
defaultImpl);
} catch (ScopedAccessError ex) {
throw new IllegalStateException("This segment is already closed");
@ -464,23 +432,19 @@ public class ScopedMemoryAccess {
@ForceInline
private static
<V extends VectorSupport.Vector<E>, E>
void storeIntoByteBufferScoped(MemorySessionImpl session,
Class<? extends V> vmClass, Class<E> e, int length,
V v,
ByteBuffer bb, int offset,
VectorSupport.StoreVectorOperation<ByteBuffer, V> defaultImpl) {
void storeIntoMemorySegmentScopedInternal(MemorySessionImpl session,
Class<? extends V> vmClass, Class<E> e, int length,
V v,
AbstractMemorySegmentImpl msp, long offset,
VectorSupport.StoreVectorOperation<AbstractMemorySegmentImpl, V> defaultImpl) {
try {
if (session != null) {
session.checkValidState();
}
final byte[] base = (byte[]) BufferAccess.bufferBase(bb);
session.checkValidState();
VectorSupport.store(vmClass, e, length,
base, BufferAccess.bufferAddress(bb, offset),
v,
bb, offset,
defaultImpl);
msp.unsafeGetBase(), msp.unsafeGetOffset() + offset,
v,
msp, offset,
defaultImpl);
} finally {
Reference.reachabilityFence(session);
}
@ -489,18 +453,23 @@ public class ScopedMemoryAccess {
@ForceInline
public static
<V extends VectorSupport.Vector<E>, E, M extends VectorSupport.VectorMask<E>>
void storeIntoByteBufferMasked(Class<? extends V> vmClass, Class<M> maskClass, Class<E> e,
int length, V v, M m,
ByteBuffer bb, int offset,
VectorSupport.StoreVectorMaskedOperation<ByteBuffer, V, M> defaultImpl) {
void storeIntoMemorySegmentMasked(Class<? extends V> vmClass, Class<M> maskClass, Class<E> e,
int length, V v, M m,
AbstractMemorySegmentImpl msp, long offset,
VectorSupport.StoreVectorMaskedOperation<AbstractMemorySegmentImpl, V, M> defaultImpl) {
// @@@ Smarter alignment checking if accessing heap segment backing non-byte[] array
if (msp.maxAlignMask() > 1) {
throw new IllegalArgumentException();
}
try {
storeIntoByteBufferMaskedScoped(
BufferAccess.session(bb),
storeIntoMemorySegmentMaskedScopedInternal(
msp.sessionImpl(),
vmClass, maskClass, e, length,
v, m,
bb, offset,
msp, offset,
defaultImpl);
} catch (ScopedMemoryAccess.ScopedAccessError ex) {
} catch (ScopedAccessError ex) {
throw new IllegalStateException("This segment is already closed");
}
}
@ -509,20 +478,18 @@ public class ScopedMemoryAccess {
@ForceInline
private static
<V extends VectorSupport.Vector<E>, E, M extends VectorSupport.VectorMask<E>>
void storeIntoByteBufferMaskedScoped(MemorySessionImpl session,
Class<? extends V> vmClass, Class<M> maskClass,
Class<E> e, int length, V v, M m,
ByteBuffer bb, int offset,
VectorSupport.StoreVectorMaskedOperation<ByteBuffer, V, M> defaultImpl) {
void storeIntoMemorySegmentMaskedScopedInternal(MemorySessionImpl session,
Class<? extends V> vmClass, Class<M> maskClass,
Class<E> e, int length, V v, M m,
AbstractMemorySegmentImpl msp, long offset,
VectorSupport.StoreVectorMaskedOperation<AbstractMemorySegmentImpl, V, M> defaultImpl) {
try {
if (session != null) {
session.checkValidState();
}
session.checkValidState();
VectorSupport.storeMasked(vmClass, maskClass, e, length,
BufferAccess.bufferBase(bb), BufferAccess.bufferAddress(bb, offset),
msp.unsafeGetBase(), msp.unsafeGetOffset() + offset,
v, m,
bb, offset,
msp, offset,
defaultImpl);
} finally {
Reference.reachabilityFence(session);
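(Illustration, not part of the patch: the msp.maxAlignMask() > 1 guards above are what reject heap segments whose backing array is not byte[], which the new fromMemorySegment/intoMemorySegment javadoc later in this patch documents as an IllegalArgumentException. A minimal sketch, assuming a JDK 19 build with --enable-preview and --add-modules jdk.incubator.vector; class and variable names are made up for the example.)

import java.lang.foreign.MemorySegment;
import java.nio.ByteOrder;
import jdk.incubator.vector.ByteVector;

public class HeapSegmentGuardDemo {
    public static void main(String[] args) {
        var species = ByteVector.SPECIES_128;

        // byte[]-backed heap segment: accepted.
        MemorySegment byteBacked = MemorySegment.ofArray(new byte[64]);
        System.out.println(ByteVector.fromMemorySegment(species, byteBacked, 0, ByteOrder.nativeOrder()));

        // int[]-backed heap segment: maxAlignMask() > 1, so the vector load is rejected.
        MemorySegment intBacked = MemorySegment.ofArray(new int[16]);
        try {
            ByteVector.fromMemorySegment(species, intBacked, 0, ByteOrder.nativeOrder());
        } catch (IllegalArgumentException expected) {
            System.out.println("rejected: heap segment not backed by byte[]");
        }
    }
}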


@ -41,6 +41,7 @@ public class VectorSupport {
public static final int VECTOR_OP_ABS = 0;
public static final int VECTOR_OP_NEG = 1;
public static final int VECTOR_OP_SQRT = 2;
public static final int VECTOR_OP_BIT_COUNT = 3;
// Binary
public static final int VECTOR_OP_ADD = 4;
@ -76,6 +77,23 @@ public class VectorSupport {
public static final int VECTOR_OP_LROTATE = 24;
public static final int VECTOR_OP_RROTATE = 25;
// Compression expansion operations
public static final int VECTOR_OP_COMPRESS = 26;
public static final int VECTOR_OP_EXPAND = 27;
public static final int VECTOR_OP_MASK_COMPRESS = 28;
// Leading/Trailing zeros count operations
public static final int VECTOR_OP_TZ_COUNT = 29;
public static final int VECTOR_OP_LZ_COUNT = 30;
// Reverse operation
public static final int VECTOR_OP_REVERSE = 31;
public static final int VECTOR_OP_REVERSE_BYTES = 32;
// Compress and Expand Bits operation
public static final int VECTOR_OP_COMPRESS_BITS = 33;
public static final int VECTOR_OP_EXPAND_BITS = 34;
// Math routines
public static final int VECTOR_OP_TAN = 101;
public static final int VECTOR_OP_TANH = 102;
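(Illustration, not part of the patch: the new operator ids above back lanewise operators that JEP 426 exposes publicly through VectorOperators. A minimal sketch, assuming a JDK 19 build with --add-modules jdk.incubator.vector; expected lane values are noted in comments.)

import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorOperators;

public class NewLanewiseOpsDemo {
    public static void main(String[] args) {
        var species = IntVector.SPECIES_PREFERRED;
        IntVector v = IntVector.broadcast(species, 0x00F0_0000);
        System.out.println(v.lanewise(VectorOperators.BIT_COUNT).lane(0));             // 4
        System.out.println(v.lanewise(VectorOperators.TRAILING_ZEROS_COUNT).lane(0));  // 20
        System.out.println(v.lanewise(VectorOperators.LEADING_ZEROS_COUNT).lane(0));   // 8
        System.out.println(Integer.toHexString(
                v.lanewise(VectorOperators.REVERSE_BYTES).lane(0)));                   // f000
        // COMPRESS_BITS/EXPAND_BITS are binary ops on int and long lanes,
        // mirroring Integer.compress/Integer.expand.
        IntVector bits = IntVector.broadcast(species, 0b1010_1010);
        IntVector mask = IntVector.broadcast(species, 0b1111_0000);
        System.out.println(Integer.toBinaryString(
                bits.lanewise(VectorOperators.COMPRESS_BITS, mask).lane(0)));          // 1010
    }
}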
@ -363,7 +381,7 @@ public class VectorSupport {
public interface LoadOperation<C,
VM extends VectorPayload,
S extends VectorSpecies<?>> {
VM load(C container, int index, S s);
VM load(C container, long index, S s);
}
@IntrinsicCandidate
@ -375,7 +393,7 @@ public class VectorSupport {
VM load(Class<? extends VM> vmClass, Class<E> eClass,
int length,
Object base, long offset,
C container, int index, S s,
C container, long index, S s,
LoadOperation<C, VM, S> defaultImpl) {
assert isNonCapturingLambda(defaultImpl) : defaultImpl;
return defaultImpl.load(container, index, s);
@ -387,7 +405,7 @@ public class VectorSupport {
V extends Vector<?>,
S extends VectorSpecies<?>,
M extends VectorMask<?>> {
V load(C container, int index, S s, M m);
V load(C container, long index, S s, M m);
}
@IntrinsicCandidate
@ -400,7 +418,7 @@ public class VectorSupport {
V loadMasked(Class<? extends V> vClass, Class<M> mClass, Class<E> eClass,
int length,
Object base, long offset,
M m, C container, int index, S s,
M m, C container, long index, S s,
LoadVectorMaskedOperation<C, V, S, M> defaultImpl) {
assert isNonCapturingLambda(defaultImpl) : defaultImpl;
return defaultImpl.load(container, index, s, m);
@ -437,18 +455,18 @@ public class VectorSupport {
/* ============================================================================ */
public interface StoreVectorOperation<C,
V extends Vector<?>> {
void store(C container, int index, V v);
V extends VectorPayload> {
void store(C container, long index, V v);
}
@IntrinsicCandidate
public static
<C,
V extends Vector<?>>
V extends VectorPayload>
void store(Class<?> vClass, Class<?> eClass,
int length,
Object base, long offset,
V v, C container, int index,
V v, C container, long index,
StoreVectorOperation<C, V> defaultImpl) {
assert isNonCapturingLambda(defaultImpl) : defaultImpl;
defaultImpl.store(container, index, v);
@ -457,7 +475,7 @@ public class VectorSupport {
public interface StoreVectorMaskedOperation<C,
V extends Vector<?>,
M extends VectorMask<?>> {
void store(C container, int index, V v, M m);
void store(C container, long index, V v, M m);
}
@IntrinsicCandidate
@ -469,7 +487,7 @@ public class VectorSupport {
void storeMasked(Class<? extends V> vClass, Class<M> mClass, Class<E> eClass,
int length,
Object base, long offset,
V v, M m, C container, int index,
V v, M m, C container, long index,
StoreVectorMaskedOperation<C, V, M> defaultImpl) {
assert isNonCapturingLambda(defaultImpl) : defaultImpl;
defaultImpl.store(container, index, v, m);
@ -626,6 +644,26 @@ public class VectorSupport {
/* ============================================================================ */
public interface CompressExpandOperation<V extends Vector<?>,
M extends VectorMask<?>> {
VectorPayload apply(V v, M m);
}
@IntrinsicCandidate
public static
<V extends Vector<E>,
M extends VectorMask<E>,
E>
VectorPayload compressExpandOp(int opr,
Class<? extends V> vClass, Class<? extends M> mClass, Class<E> eClass,
int length, V v, M m,
CompressExpandOperation<V, M> defaultImpl) {
assert isNonCapturingLambda(defaultImpl) : defaultImpl;
return defaultImpl.apply(v, m);
}
/* ============================================================================ */
@IntrinsicCandidate
public static
<VP extends VectorPayload>


@ -158,6 +158,8 @@ module java.base {
jdk.jartool,
jdk.jlink,
jdk.net;
exports jdk.internal.foreign to
jdk.incubator.vector;
exports jdk.internal.event to
jdk.jfr;
exports jdk.internal.jimage to


@ -38,6 +38,7 @@ import com.sun.tools.javac.util.JCDiagnostic.SimpleDiagnosticPosition;
import com.sun.tools.javac.util.JCDiagnostic.Warning;
import com.sun.tools.javac.util.Log;
import com.sun.tools.javac.util.MandatoryWarningHandler;
import com.sun.tools.javac.util.Names;
import com.sun.tools.javac.util.Options;
import javax.tools.JavaFileObject;
@ -78,6 +79,7 @@ public class Preview {
private final Set<JavaFileObject> sourcesWithPreviewFeatures = new HashSet<>();
private final Names names;
private final Lint lint;
private final Log log;
private final Source source;
@ -95,6 +97,7 @@ public class Preview {
Preview(Context context) {
context.put(previewKey, this);
Options options = Options.instance(context);
names = Names.instance(context);
enabled = options.isSet(PREVIEW);
log = Log.instance(context);
lint = Lint.instance(context);
@ -115,7 +118,22 @@ public class Preview {
}
}
return majorVersionToSource;
}
}
/**
* Returns true if {@code s} is deemed to participate in the preview of {@code previewSymbol}, and
* therefore no warnings or errors will be produced.
*
* @param s the symbol depending on the preview symbol
* @param previewSymbol the preview symbol marked with @Preview
* @return true if {@code s} is participating in the preview of {@code previewSymbol}
*/
public boolean participatesInPreview(Symbol s, Symbol previewSymbol) {
// Hardcode the incubating vector API module for now
// Will generalize with an annotation, @PreviewParticipating say, later
return previewSymbol.packge().modle == s.packge().modle ||
s.packge().modle.name == names.jdk_incubator_vector;
}
/**
* Report usage of a preview feature. Usages reported through this method will affect the


@ -3600,7 +3600,7 @@ public class Check {
}
void checkPreview(DiagnosticPosition pos, Symbol other, Symbol s) {
if ((s.flags() & PREVIEW_API) != 0 && s.packge().modle != other.packge().modle) {
if ((s.flags() & PREVIEW_API) != 0 && !preview.participatesInPreview(other, s)) {
if ((s.flags() & PREVIEW_REFLECTIVE) == 0) {
if (!preview.isEnabled()) {
log.error(pos, Errors.IsPreview(s));


@ -124,6 +124,7 @@ public class Names {
// module names
public final Name java_base;
public final Name jdk_unsupported;
public final Name jdk_incubator_vector;
// attribute names
public final Name Annotation;
@ -305,6 +306,7 @@ public class Names {
// module names
java_base = fromString("java.base");
jdk_unsupported = fromString("jdk.unsupported");
jdk_incubator_vector = fromString("jdk.incubator.vector");
// attribute names
Annotation = fromString("Annotation");


@ -28,6 +28,10 @@ import java.util.Objects;
import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.misc.Unsafe;
import jdk.internal.vm.vector.VectorSupport;
import static jdk.incubator.vector.VectorOperators.*;
abstract class AbstractMask<E> extends VectorMask<E> {
@ -77,7 +81,15 @@ abstract class AbstractMask<E> extends VectorMask<E> {
@Override
public void intoArray(boolean[] bits, int i) {
System.arraycopy(getBits(), 0, bits, i, length());
AbstractSpecies<E> vsp = (AbstractSpecies<E>) vectorSpecies();
int laneCount = vsp.laneCount();
i = VectorIntrinsics.checkFromIndexSize(i, laneCount, bits.length);
VectorSupport.store(
vsp.maskType(), vsp.elementType(), laneCount,
bits, (long) i + Unsafe.ARRAY_BOOLEAN_BASE_OFFSET,
this, bits, i,
(c, idx, s) -> System.arraycopy(s.getBits(), 0, c, (int) idx, s.length()));
}
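(Illustration, not part of the patch: VectorMask.intoArray now routes through VectorSupport.store so it can be intrinsified; a minimal round-trip sketch, assuming a JDK 19 build with --add-modules jdk.incubator.vector.)

import java.util.Arrays;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorMask;

public class MaskIntoArrayDemo {
    public static void main(String[] args) {
        var species = IntVector.SPECIES_256;                  // 8 int lanes
        boolean[] in = {true, false, true, true, false, false, true, false};
        VectorMask<Integer> m = VectorMask.fromArray(species, in, 0);
        boolean[] out = new boolean[species.length()];
        m.intoArray(out, 0);                                  // exercises the store path above
        System.out.println(Arrays.equals(in, out));           // true
    }
}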
@Override
@ -192,6 +204,15 @@ abstract class AbstractMask<E> extends VectorMask<E> {
return this.andNot(badMask);
}
@Override
@ForceInline
public VectorMask<E> indexInRange(long offset, long limit) {
int vlength = length();
Vector<E> iota = vectorSpecies().zero().addIndex(1);
VectorMask<E> badMask = checkIndex0(offset, limit, iota, vlength);
return this.andNot(badMask);
}
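(Illustration, not part of the patch: the long-typed indexInRange added here, together with the long-typed loopBound on VectorSpecies later in this patch, supports loops over MemorySegment sizes that need not fit in an int. A minimal sketch, assuming a JDK 19 build with --enable-preview and --add-modules jdk.incubator.vector; names are made up for the example.)

import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.ByteOrder;
import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.VectorMask;

public class SegmentTailLoop {
    static void addOne(MemorySegment ms) {
        var species = ByteVector.SPECIES_PREFERRED;
        long upper = species.loopBound(ms.byteSize());        // long overload
        long i = 0;
        for (; i < upper; i += species.vectorByteSize()) {
            ByteVector v = ByteVector.fromMemorySegment(species, ms, i, ByteOrder.nativeOrder());
            v.add((byte) 1).intoMemorySegment(ms, i, ByteOrder.nativeOrder());
        }
        // Masked tail: lanes past byteSize() are switched off by indexInRange(long, long).
        VectorMask<Byte> m = species.indexInRange(i, ms.byteSize());
        ByteVector v = ByteVector.fromMemorySegment(species, ms, i, ByteOrder.nativeOrder(), m);
        v.add((byte) 1).intoMemorySegment(ms, i, ByteOrder.nativeOrder(), m);
    }

    public static void main(String[] args) {
        MemorySegment ms = MemorySegment.ofArray(new byte[100]);
        addOne(ms);
        System.out.println(ms.get(ValueLayout.JAVA_BYTE, 99));   // 1
    }
}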
/*package-private*/
@ForceInline
AbstractVector<E>
@ -215,7 +236,7 @@ abstract class AbstractMask<E> extends VectorMask<E> {
*/
/*package-private*/
@ForceInline
void checkIndexByLane(int offset, int alength,
void checkIndexByLane(int offset, int length,
Vector<E> iota,
int esize) {
if (VectorIntrinsics.VECTOR_ACCESS_OOB_CHECK == 0) {
@ -229,15 +250,15 @@ abstract class AbstractMask<E> extends VectorMask<E> {
int vlength = length();
VectorMask<E> badMask;
if (esize == 1) {
badMask = checkIndex0(offset, alength, iota, vlength);
badMask = checkIndex0(offset, length, iota, vlength);
} else if (offset >= 0) {
// Masked access to multi-byte lanes in byte array.
// It could be aligned anywhere.
int elemCount = Math.min(vlength, (alength - offset) / esize);
int elemCount = Math.min(vlength, (length - offset) / esize);
badMask = checkIndex0(0, elemCount, iota, vlength);
} else {
int clipOffset = Math.max(offset, -(vlength * esize));
badMask = checkIndex0(clipOffset, alength,
badMask = checkIndex0(clipOffset, length,
iota.lanewise(VectorOperators.MUL, esize),
vlength * esize);
}
@ -245,20 +266,20 @@ abstract class AbstractMask<E> extends VectorMask<E> {
if (badMask.anyTrue()) {
int badLane = badMask.firstTrue();
throw ((AbstractMask<E>)badMask)
.checkIndexFailed(offset, badLane, alength, esize);
.checkIndexFailed(offset, badLane, length, esize);
}
}
private
@ForceInline
VectorMask<E> checkIndex0(int offset, int alength,
VectorMask<E> checkIndex0(int offset, int length,
Vector<E> iota, int vlength) {
// An active lane is bad if its number is greater than
// alength-offset, since when added to offset it will step off
// length-offset, since when added to offset it will step off
// of the end of the array. To avoid overflow when
// converting, clip the comparison value to [0..vlength]
// inclusive.
int indexLimit = Math.max(0, Math.min(alength - offset, vlength));
int indexLimit = Math.max(0, Math.min(length - offset, vlength));
VectorMask<E> badMask =
iota.compare(GE, iota.broadcast(indexLimit));
if (offset < 0) {
@ -280,14 +301,90 @@ abstract class AbstractMask<E> extends VectorMask<E> {
return badMask;
}
private IndexOutOfBoundsException checkIndexFailed(int offset, int lane,
int alength, int esize) {
/**
* Test if a masked memory access at a given offset into an array
* of the given length will stay within the array.
* The per-lane offsets are iota*esize.
*/
/*package-private*/
@ForceInline
void checkIndexByLane(long offset, long length,
Vector<E> iota,
int esize) {
if (VectorIntrinsics.VECTOR_ACCESS_OOB_CHECK == 0) {
return;
}
// Although the specification is simple, the implementation is
// tricky, because the value iota*esize might possibly
// overflow. So we calculate our test values as scalars,
// clipping to the range [-1..VLENGTH], and test them against
// the unscaled iota vector, whose values are in [0..VLENGTH-1].
int vlength = length();
VectorMask<E> badMask;
if (esize == 1) {
badMask = checkIndex0(offset, length, iota, vlength);
} else if (offset >= 0) {
// Masked access to multi-byte lanes in byte array.
// It could be aligned anywhere.
// 0 <= elemCount <= vlength
int elemCount = (int) Math.min(vlength, (length - offset) / esize);
badMask = checkIndex0(0, elemCount, iota, vlength);
} else {
// -vlength * esize <= clipOffset <= 0
int clipOffset = (int) Math.max(offset, -(vlength * esize));
badMask = checkIndex0(clipOffset, length,
iota.lanewise(VectorOperators.MUL, esize),
vlength * esize);
}
badMask = badMask.and(this);
if (badMask.anyTrue()) {
int badLane = badMask.firstTrue();
throw ((AbstractMask<E>)badMask)
.checkIndexFailed(offset, badLane, length, esize);
}
}
private
@ForceInline
VectorMask<E> checkIndex0(long offset, long length,
Vector<E> iota, int vlength) {
// An active lane is bad if its number is greater than
// length-offset, since when added to offset it will step off
// of the end of the array. To avoid overflow when
// converting, clip the comparison value to [0..vlength]
// inclusive.
// 0 <= indexLimit <= vlength
int indexLimit = (int) Math.max(0, Math.min(length - offset, vlength));
VectorMask<E> badMask =
iota.compare(GE, iota.broadcast(indexLimit));
if (offset < 0) {
// An active lane is bad if its number is less than
// -offset, because when added to offset it will then
// address an array element at a negative index. To avoid
// overflow when converting, clip the comparison value at
// vlength. This specific expression works correctly even
// when offset is Integer.MIN_VALUE.
// 0 <= firstGoodIndex <= vlength
int firstGoodIndex = (int) -Math.max(offset, -vlength);
VectorMask<E> badMask2 =
iota.compare(LT, iota.broadcast(firstGoodIndex));
if (indexLimit >= vlength) {
badMask = badMask2; // 1st badMask is all true
} else {
badMask = badMask.or(badMask2);
}
}
return badMask;
}
private IndexOutOfBoundsException checkIndexFailed(long offset, int lane,
long length, int esize) {
String msg = String.format("Masked range check failed: "+
"vector mask %s out of bounds at "+
"index %d+%d in array of length %d",
this, offset, lane * esize, alength);
"index %d+%d for length %d",
this, offset, lane * esize, length);
if (esize != 1) {
msg += String.format(" (each lane spans %d array elements)", esize);
msg += String.format(" (each lane spans %d elements)", esize);
}
throw new IndexOutOfBoundsException(msg);
}


@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2017, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -24,10 +24,11 @@
*/
package jdk.incubator.vector;
import java.lang.foreign.MemorySegment;
import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.annotation.Stable;
import java.nio.ByteOrder;
import java.lang.reflect.Array;
import java.nio.ByteOrder;
import java.util.Arrays;
import java.util.function.Function;
import java.util.function.IntUnaryOperator;
@ -203,12 +204,24 @@ abstract class AbstractSpecies<E> extends jdk.internal.vm.vector.VectorSupport.V
return VectorIntrinsics.roundDown(length, laneCount);
}
@Override
@ForceInline
public final long loopBound(long length) {
return VectorIntrinsics.roundDown(length, laneCount);
}
@Override
@ForceInline
public final VectorMask<E> indexInRange(int offset, int limit) {
return maskAll(true).indexInRange(offset, limit);
}
@Override
@ForceInline
public final VectorMask<E> indexInRange(long offset, long limit) {
return maskAll(true).indexInRange(offset, limit);
}
@Override
@ForceInline
public final <F> VectorSpecies<F> withLanes(Class<F> newType) {
@ -349,9 +362,9 @@ abstract class AbstractSpecies<E> extends jdk.internal.vm.vector.VectorSupport.V
@ForceInline
@Override
public final Vector<E> fromByteArray(byte[] a, int offset, ByteOrder bo) {
public final Vector<E> fromMemorySegment(MemorySegment ms, long offset, ByteOrder bo) {
return dummyVector()
.fromByteArray0(a, offset)
.fromMemorySegment0(ms, offset)
.maybeSwap(bo);
}


@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -24,10 +24,10 @@
*/
package jdk.incubator.vector;
import java.lang.foreign.MemorySegment;
import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.vector.VectorSupport;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.function.IntUnaryOperator;
@ -194,7 +194,7 @@ abstract class AbstractVector<E> extends Vector<E> {
abstract AbstractShuffle<E> shuffleFromOp(IntUnaryOperator fn);
/*package-private*/
abstract AbstractVector<E> fromByteArray0(byte[] a, int offset);
abstract AbstractVector<E> fromMemorySegment0(MemorySegment ms, long offset);
/*package-private*/
abstract AbstractVector<E> maybeSwap(ByteOrder bo);
@ -504,23 +504,23 @@ abstract class AbstractVector<E> extends Vector<E> {
AbstractVector<F> defaultReinterpret(AbstractSpecies<F> rsp) {
int blen = Math.max(this.bitSize(), rsp.vectorBitSize()) / Byte.SIZE;
ByteOrder bo = ByteOrder.nativeOrder();
ByteBuffer bb = ByteBuffer.allocate(blen);
this.intoByteBuffer(bb, 0, bo);
MemorySegment ms = MemorySegment.ofArray(new byte[blen]);
this.intoMemorySegment(ms, 0, bo);
VectorMask<F> m = rsp.maskAll(true);
// enum-switches don't optimize properly JDK-8161245
switch (rsp.laneType.switchKey) {
case LaneType.SK_BYTE:
return ByteVector.fromByteBuffer(rsp.check(byte.class), bb, 0, bo, m.check(byte.class)).check0(rsp);
return ByteVector.fromMemorySegment(rsp.check(byte.class), ms, 0, bo, m.check(byte.class)).check0(rsp);
case LaneType.SK_SHORT:
return ShortVector.fromByteBuffer(rsp.check(short.class), bb, 0, bo, m.check(short.class)).check0(rsp);
return ShortVector.fromMemorySegment(rsp.check(short.class), ms, 0, bo, m.check(short.class)).check0(rsp);
case LaneType.SK_INT:
return IntVector.fromByteBuffer(rsp.check(int.class), bb, 0, bo, m.check(int.class)).check0(rsp);
return IntVector.fromMemorySegment(rsp.check(int.class), ms, 0, bo, m.check(int.class)).check0(rsp);
case LaneType.SK_LONG:
return LongVector.fromByteBuffer(rsp.check(long.class), bb, 0, bo, m.check(long.class)).check0(rsp);
return LongVector.fromMemorySegment(rsp.check(long.class), ms, 0, bo, m.check(long.class)).check0(rsp);
case LaneType.SK_FLOAT:
return FloatVector.fromByteBuffer(rsp.check(float.class), bb, 0, bo, m.check(float.class)).check0(rsp);
return FloatVector.fromMemorySegment(rsp.check(float.class), ms, 0, bo, m.check(float.class)).check0(rsp);
case LaneType.SK_DOUBLE:
return DoubleVector.fromByteBuffer(rsp.check(double.class), bb, 0, bo, m.check(double.class)).check0(rsp);
return DoubleVector.fromMemorySegment(rsp.check(double.class), ms, 0, bo, m.check(double.class)).check0(rsp);
default:
throw new AssertionError(rsp.toString());
}
@ -730,15 +730,6 @@ abstract class AbstractVector<E> extends Vector<E> {
throw new AssertionError();
}
// Byte buffer wrappers.
static ByteBuffer wrapper(ByteBuffer bb, ByteOrder bo) {
return bb.duplicate().order(bo);
}
static ByteBuffer wrapper(byte[] a, ByteOrder bo) {
return ByteBuffer.wrap(a).order(bo);
}
static {
// Recode uses of VectorSupport.reinterpret if this assertion fails:
assert(REGISTER_ENDIAN == ByteOrder.LITTLE_ENDIAN);


@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class Byte128Vector extends ByteVector {
(Byte128Vector) v); // specialize
}
@Override
@ForceInline
public Byte128Vector compress(VectorMask<Byte> m) {
return (Byte128Vector)
super.compressTemplate(Byte128Mask.class,
(Byte128Mask) m); // specialize
}
@Override
@ForceInline
public Byte128Vector expand(VectorMask<Byte> m) {
return (Byte128Vector)
super.expandTemplate(Byte128Mask.class,
(Byte128Mask) m); // specialize
}
@Override
@ForceInline
public Byte128Vector selectFrom(Vector<Byte> v) {
@ -677,6 +693,15 @@ final class Byte128Vector extends ByteVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Byte128Mask compress() {
return (Byte128Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Byte128Vector.class, Byte128Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -876,29 +901,15 @@ final class Byte128Vector extends ByteVector {
@ForceInline
@Override
final
ByteVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
ByteVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteArray0(byte[] a, int offset, VectorMask<Byte> m) {
return super.fromByteArray0Template(Byte128Mask.class, a, offset, (Byte128Mask) m); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Byte> m) {
return super.fromByteBuffer0Template(Byte128Mask.class, bb, offset, (Byte128Mask) m); // specialize
ByteVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Byte> m) {
return super.fromMemorySegment0Template(Byte128Mask.class, ms, offset, (Byte128Mask) m); // specialize
}
@ForceInline
@ -926,22 +937,8 @@ final class Byte128Vector extends ByteVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Byte> m) {
super.intoByteArray0Template(Byte128Mask.class, a, offset, (Byte128Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Byte> m) {
super.intoByteBuffer0Template(Byte128Mask.class, bb, offset, (Byte128Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Byte> m) {
super.intoMemorySegment0Template(Byte128Mask.class, ms, offset, (Byte128Mask) m);
}
@ -950,3 +947,4 @@ final class Byte128Vector extends ByteVector {
// ================================================
}


@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class Byte256Vector extends ByteVector {
(Byte256Vector) v); // specialize
}
@Override
@ForceInline
public Byte256Vector compress(VectorMask<Byte> m) {
return (Byte256Vector)
super.compressTemplate(Byte256Mask.class,
(Byte256Mask) m); // specialize
}
@Override
@ForceInline
public Byte256Vector expand(VectorMask<Byte> m) {
return (Byte256Vector)
super.expandTemplate(Byte256Mask.class,
(Byte256Mask) m); // specialize
}
@Override
@ForceInline
public Byte256Vector selectFrom(Vector<Byte> v) {
@ -709,6 +725,15 @@ final class Byte256Vector extends ByteVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Byte256Mask compress() {
return (Byte256Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Byte256Vector.class, Byte256Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -908,29 +933,15 @@ final class Byte256Vector extends ByteVector {
@ForceInline
@Override
final
ByteVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
ByteVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteArray0(byte[] a, int offset, VectorMask<Byte> m) {
return super.fromByteArray0Template(Byte256Mask.class, a, offset, (Byte256Mask) m); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Byte> m) {
return super.fromByteBuffer0Template(Byte256Mask.class, bb, offset, (Byte256Mask) m); // specialize
ByteVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Byte> m) {
return super.fromMemorySegment0Template(Byte256Mask.class, ms, offset, (Byte256Mask) m); // specialize
}
@ForceInline
@ -958,22 +969,8 @@ final class Byte256Vector extends ByteVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Byte> m) {
super.intoByteArray0Template(Byte256Mask.class, a, offset, (Byte256Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Byte> m) {
super.intoByteBuffer0Template(Byte256Mask.class, bb, offset, (Byte256Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Byte> m) {
super.intoMemorySegment0Template(Byte256Mask.class, ms, offset, (Byte256Mask) m);
}
@ -982,3 +979,4 @@ final class Byte256Vector extends ByteVector {
// ================================================
}


@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class Byte512Vector extends ByteVector {
(Byte512Vector) v); // specialize
}
@Override
@ForceInline
public Byte512Vector compress(VectorMask<Byte> m) {
return (Byte512Vector)
super.compressTemplate(Byte512Mask.class,
(Byte512Mask) m); // specialize
}
@Override
@ForceInline
public Byte512Vector expand(VectorMask<Byte> m) {
return (Byte512Vector)
super.expandTemplate(Byte512Mask.class,
(Byte512Mask) m); // specialize
}
@Override
@ForceInline
public Byte512Vector selectFrom(Vector<Byte> v) {
@ -773,6 +789,15 @@ final class Byte512Vector extends ByteVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Byte512Mask compress() {
return (Byte512Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Byte512Vector.class, Byte512Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -972,29 +997,15 @@ final class Byte512Vector extends ByteVector {
@ForceInline
@Override
final
ByteVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
ByteVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteArray0(byte[] a, int offset, VectorMask<Byte> m) {
return super.fromByteArray0Template(Byte512Mask.class, a, offset, (Byte512Mask) m); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Byte> m) {
return super.fromByteBuffer0Template(Byte512Mask.class, bb, offset, (Byte512Mask) m); // specialize
ByteVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Byte> m) {
return super.fromMemorySegment0Template(Byte512Mask.class, ms, offset, (Byte512Mask) m); // specialize
}
@ForceInline
@ -1022,22 +1033,8 @@ final class Byte512Vector extends ByteVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Byte> m) {
super.intoByteArray0Template(Byte512Mask.class, a, offset, (Byte512Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Byte> m) {
super.intoByteBuffer0Template(Byte512Mask.class, bb, offset, (Byte512Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Byte> m) {
super.intoMemorySegment0Template(Byte512Mask.class, ms, offset, (Byte512Mask) m);
}
@ -1046,3 +1043,4 @@ final class Byte512Vector extends ByteVector {
// ================================================
}


@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class Byte64Vector extends ByteVector {
(Byte64Vector) v); // specialize
}
@Override
@ForceInline
public Byte64Vector compress(VectorMask<Byte> m) {
return (Byte64Vector)
super.compressTemplate(Byte64Mask.class,
(Byte64Mask) m); // specialize
}
@Override
@ForceInline
public Byte64Vector expand(VectorMask<Byte> m) {
return (Byte64Vector)
super.expandTemplate(Byte64Mask.class,
(Byte64Mask) m); // specialize
}
@Override
@ForceInline
public Byte64Vector selectFrom(Vector<Byte> v) {
@ -661,6 +677,15 @@ final class Byte64Vector extends ByteVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Byte64Mask compress() {
return (Byte64Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Byte64Vector.class, Byte64Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -860,29 +885,15 @@ final class Byte64Vector extends ByteVector {
@ForceInline
@Override
final
ByteVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
ByteVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteArray0(byte[] a, int offset, VectorMask<Byte> m) {
return super.fromByteArray0Template(Byte64Mask.class, a, offset, (Byte64Mask) m); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Byte> m) {
return super.fromByteBuffer0Template(Byte64Mask.class, bb, offset, (Byte64Mask) m); // specialize
ByteVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Byte> m) {
return super.fromMemorySegment0Template(Byte64Mask.class, ms, offset, (Byte64Mask) m); // specialize
}
@ForceInline
@ -910,22 +921,8 @@ final class Byte64Vector extends ByteVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Byte> m) {
super.intoByteArray0Template(Byte64Mask.class, a, offset, (Byte64Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Byte> m) {
super.intoByteBuffer0Template(Byte64Mask.class, bb, offset, (Byte64Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Byte> m) {
super.intoMemorySegment0Template(Byte64Mask.class, ms, offset, (Byte64Mask) m);
}
@ -934,3 +931,4 @@ final class Byte64Vector extends ByteVector {
// ================================================
}


@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class ByteMaxVector extends ByteVector {
(ByteMaxVector) v); // specialize
}
@Override
@ForceInline
public ByteMaxVector compress(VectorMask<Byte> m) {
return (ByteMaxVector)
super.compressTemplate(ByteMaxMask.class,
(ByteMaxMask) m); // specialize
}
@Override
@ForceInline
public ByteMaxVector expand(VectorMask<Byte> m) {
return (ByteMaxVector)
super.expandTemplate(ByteMaxMask.class,
(ByteMaxMask) m); // specialize
}
@Override
@ForceInline
public ByteMaxVector selectFrom(Vector<Byte> v) {
@ -647,6 +663,15 @@ final class ByteMaxVector extends ByteVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public ByteMaxMask compress() {
return (ByteMaxMask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
ByteMaxVector.class, ByteMaxMask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -846,29 +871,15 @@ final class ByteMaxVector extends ByteVector {
@ForceInline
@Override
final
ByteVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
ByteVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteArray0(byte[] a, int offset, VectorMask<Byte> m) {
return super.fromByteArray0Template(ByteMaxMask.class, a, offset, (ByteMaxMask) m); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
ByteVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Byte> m) {
return super.fromByteBuffer0Template(ByteMaxMask.class, bb, offset, (ByteMaxMask) m); // specialize
ByteVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Byte> m) {
return super.fromMemorySegment0Template(ByteMaxMask.class, ms, offset, (ByteMaxMask) m); // specialize
}
@ForceInline
@ -896,22 +907,8 @@ final class ByteMaxVector extends ByteVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Byte> m) {
super.intoByteArray0Template(ByteMaxMask.class, a, offset, (ByteMaxMask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Byte> m) {
super.intoByteBuffer0Template(ByteMaxMask.class, bb, offset, (ByteMaxMask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Byte> m) {
super.intoMemorySegment0Template(ByteMaxMask.class, ms, offset, (ByteMaxMask) m);
}
@ -920,3 +917,4 @@ final class ByteMaxVector extends ByteVector {
// ================================================
}


@ -24,14 +24,14 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.ByteOrder;
import java.nio.ReadOnlyBufferException;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.Function;
import java.util.function.UnaryOperator;
import jdk.internal.foreign.AbstractMemorySegmentImpl;
import jdk.internal.misc.ScopedMemoryAccess;
import jdk.internal.misc.Unsafe;
import jdk.internal.vm.annotation.ForceInline;
@ -57,6 +57,8 @@ public abstract class ByteVector extends AbstractVector<Byte> {
static final int FORBID_OPCODE_KIND = VO_ONLYFP;
static final ValueLayout.OfByte ELEMENT_LAYOUT = ValueLayout.JAVA_BYTE.withBitAlignment(8);
@ForceInline
static int opCode(Operator op) {
return VectorOperators.opCode(op, VO_OPCODE_VALID, FORBID_OPCODE_KIND);
@ -351,6 +353,45 @@ public abstract class ByteVector extends AbstractVector<Byte> {
return vectorFactory(res);
}
/*package-private*/
interface FLdLongOp {
byte apply(MemorySegment memory, long offset, int i);
}
/*package-private*/
@ForceInline
final
ByteVector ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
//dummy; no vec = vec();
byte[] res = new byte[length()];
for (int i = 0; i < res.length; i++) {
res[i] = f.apply(memory, offset, i);
}
return vectorFactory(res);
}
/*package-private*/
@ForceInline
final
ByteVector ldLongOp(MemorySegment memory, long offset,
VectorMask<Byte> m,
FLdLongOp f) {
//byte[] vec = vec();
byte[] res = new byte[length()];
boolean[] mbits = ((AbstractMask<Byte>)m).getBits();
for (int i = 0; i < res.length; i++) {
if (mbits[i]) {
res[i] = f.apply(memory, offset, i);
}
}
return vectorFactory(res);
}
static byte memorySegmentGet(MemorySegment ms, long o, int i) {
return ms.get(ELEMENT_LAYOUT, o + i * 1L);
}
interface FStOp<M> {
void apply(M memory, int offset, int i, byte a);
}
@ -381,6 +422,40 @@ public abstract class ByteVector extends AbstractVector<Byte> {
}
}
interface FStLongOp {
void apply(MemorySegment memory, long offset, int i, byte a);
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
FStLongOp f) {
byte[] vec = vec();
for (int i = 0; i < vec.length; i++) {
f.apply(memory, offset, i, vec[i]);
}
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
VectorMask<Byte> m,
FStLongOp f) {
byte[] vec = vec();
boolean[] mbits = ((AbstractMask<Byte>)m).getBits();
for (int i = 0; i < vec.length; i++) {
if (mbits[i]) {
f.apply(memory, offset, i, vec[i]);
}
}
}
static void memorySegmentSet(MemorySegment ms, long o, int i, byte e) {
ms.set(ELEMENT_LAYOUT, o + i * 1L, e);
}
// Binary test
/*package-private*/
@ -431,6 +506,36 @@ public abstract class ByteVector extends AbstractVector<Byte> {
return ((byte)bits);
}
static ByteVector expandHelper(Vector<Byte> v, VectorMask<Byte> m) {
VectorSpecies<Byte> vsp = m.vectorSpecies();
ByteVector r = (ByteVector) vsp.zero();
ByteVector vi = (ByteVector) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(i, vi.lane(j++));
}
}
return r;
}
static ByteVector compressHelper(Vector<Byte> v, VectorMask<Byte> m) {
VectorSpecies<Byte> vsp = m.vectorSpecies();
ByteVector r = (ByteVector) vsp.zero();
ByteVector vi = (ByteVector) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(j++, vi.lane(i));
}
}
return r;
}
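(Illustration, not part of the patch: lane-level behaviour that expandHelper/compressHelper above define, and that VECTOR_OP_COMPRESS, VECTOR_OP_EXPAND and VECTOR_OP_MASK_COMPRESS intrinsify. A minimal sketch, assuming a JDK 19 build with --add-modules jdk.incubator.vector.)

import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorMask;

public class CompressExpandDemo {
    public static void main(String[] args) {
        var species = IntVector.SPECIES_128;                   // 4 int lanes
        IntVector v = IntVector.fromArray(species, new int[] {10, 20, 30, 40}, 0);
        VectorMask<Integer> m = VectorMask.fromValues(species, true, false, true, false);

        System.out.println(v.compress(m));   // [10, 30, 0, 0]  selected lanes packed to the low end
        System.out.println(v.expand(m));     // [10, 0, 20, 0]  low lanes scattered to the set positions

        VectorMask<Integer> packed = m.compress();             // first trueCount() lanes set
        System.out.println(packed.laneIsSet(0) + " " + packed.laneIsSet(1) + " "
                + packed.laneIsSet(2) + " " + packed.laneIsSet(3));   // true true false false
    }
}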
// Static factories (other than memory operations)
// Note: A surprising behavior in javadoc
@ -620,6 +725,16 @@ public abstract class ByteVector extends AbstractVector<Byte> {
v0.uOp(m, (i, a) -> (byte) -a);
case VECTOR_OP_ABS: return (v0, m) ->
v0.uOp(m, (i, a) -> (byte) Math.abs(a));
case VECTOR_OP_BIT_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> (byte) bitCount(a));
case VECTOR_OP_TZ_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> (byte) numberOfTrailingZeros(a));
case VECTOR_OP_LZ_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> (byte) numberOfLeadingZeros(a));
case VECTOR_OP_REVERSE: return (v0, m) ->
v0.uOp(m, (i, a) -> reverse(a));
case VECTOR_OP_REVERSE_BYTES: return (v0, m) ->
v0.uOp(m, (i, a) -> a);
default: return null;
}
}
@ -1746,6 +1861,25 @@ public abstract class ByteVector extends AbstractVector<Byte> {
return lanewise(ABS);
}
static int bitCount(byte a) {
return Integer.bitCount((int)a & 0xFF);
}
static int numberOfTrailingZeros(byte a) {
return a != 0 ? Integer.numberOfTrailingZeros(a) : 8;
}
static int numberOfLeadingZeros(byte a) {
return a >= 0 ? Integer.numberOfLeadingZeros(a) - 24 : 0;
}
static byte reverse(byte a) {
if (a == 0 || a == -1) return a;
byte b = rotateLeft(a, 4);
b = (byte) (((b & 0x55) << 1) | ((b & 0xAA) >>> 1));
b = (byte) (((b & 0x33) << 2) | ((b & 0xCC) >>> 2));
return b;
}
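(Illustration, not part of the patch: equivalent scalar arithmetic for the package-private byte helpers above, evaluated for the sample lane value 0xF0.)

public class ByteLaneBitOps {
    public static void main(String[] args) {
        byte a = (byte) 0xF0;
        System.out.println(Integer.bitCount(a & 0xFF));                           // 4
        System.out.println(a != 0 ? Integer.numberOfTrailingZeros(a) : 8);        // 4
        System.out.println(a >= 0 ? Integer.numberOfLeadingZeros(a) - 24 : 0);    // 0 (sign bit set)
        System.out.println(Integer.toHexString(Integer.reverse(a << 24) & 0xFF)); // f (0xF0 reversed is 0x0F)
    }
}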
// not (~)
/**
* Computes the bitwise logical complement ({@code ~})
@ -2372,6 +2506,45 @@ public abstract class ByteVector extends AbstractVector<Byte> {
ByteVector::toShuffle0);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
ByteVector compress(VectorMask<Byte> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<Byte>>
ByteVector compressTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return (ByteVector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_COMPRESS, getClass(), masktype,
byte.class, length(), this, m,
(v1, m1) -> compressHelper(v1, m1));
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
ByteVector expand(VectorMask<Byte> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<Byte>>
ByteVector expandTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return (ByteVector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_EXPAND, getClass(), masktype,
byte.class, length(), this, m,
(v1, m1) -> expandHelper(v1, m1));
}
/**
* {@inheritDoc} <!--workaround-->
*/
@ -2784,90 +2957,6 @@ public abstract class ByteVector extends AbstractVector<Byte> {
return res;
}
/**
* Loads a vector from a byte array starting at an offset.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
*/
@ForceInline
public static
ByteVector fromByteArray(VectorSpecies<Byte> species,
byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), a.length);
ByteSpecies vsp = (ByteSpecies) species;
return vsp.dummyVector().fromByteArray0(a, offset).maybeSwap(bo);
}
/**
* Loads a vector from a byte array starting at an offset
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code byte} (zero).
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
* where the mask is set
*/
@ForceInline
public static
ByteVector fromByteArray(VectorSpecies<Byte> species,
byte[] a, int offset,
ByteOrder bo,
VectorMask<Byte> m) {
ByteSpecies vsp = (ByteSpecies) species;
if (offset >= 0 && offset <= (a.length - species.vectorByteSize())) {
return vsp.dummyVector().fromByteArray0(a, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, 1, a.length);
ByteBuffer wb = wrapper(a, bo);
return vsp.ldOp(wb, offset, (AbstractMask<Byte>)m,
(wb_, o, i) -> wb_.get(o + i * 1));
}
/**
* Loads a vector from an array of type {@code byte[]}
* starting at an offset.
@ -3174,44 +3263,49 @@ public abstract class ByteVector extends AbstractVector<Byte> {
}
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer.
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* {@link #fromMemorySegment(VectorSpecies,MemorySegment,long,ByteOrder,VectorMask)
* fromMemorySegment()} as follows:
* <pre>{@code
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* return fromMemorySegment(species, ms, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*1 < 0}
* or {@code offset+N*1 >= bb.limit()}
* or {@code offset+N*1 >= ms.byteSize()}
* for any lane {@code N} in the vector
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
ByteVector fromByteBuffer(VectorSpecies<Byte> species,
ByteBuffer bb, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), bb.limit());
ByteVector fromMemorySegment(VectorSpecies<Byte> species,
MemorySegment ms, long offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), ms.byteSize());
ByteSpecies vsp = (ByteSpecies) species;
return vsp.dummyVector().fromByteBuffer0(bb, offset).maybeSwap(bo);
return vsp.dummyVector().fromMemorySegment0(ms, offset).maybeSwap(bo);
}
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code byte} (zero).
@ -3222,12 +3316,11 @@ public abstract class ByteVector extends AbstractVector<Byte> {
* <p>
* The following pseudocode illustrates the behavior:
* <pre>{@code
* ByteBuffer eb = bb.duplicate()
* .position(offset);
* var slice = ms.asSlice(offset);
* byte[] ar = new byte[species.length()];
* for (int n = 0; n < ar.length; n++) {
* if (m.laneIsSet(n)) {
* ar[n] = eb.get(n);
* ar[n] = slice.getAtIndex(ValueLayout.JAVA_BYTE.withBitAlignment(8), n);
* }
* }
* ByteVector r = ByteVector.fromArray(species, ar, 0);
@ -3236,33 +3329,36 @@ public abstract class ByteVector extends AbstractVector<Byte> {
* The byte order argument is ignored.
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*1 < 0}
* or {@code offset+N*1 >= bb.limit()}
* or {@code offset+N*1 >= ms.byteSize()}
* for any lane {@code N} in the vector
* where the mask is set
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
ByteVector fromByteBuffer(VectorSpecies<Byte> species,
ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<Byte> m) {
ByteVector fromMemorySegment(VectorSpecies<Byte> species,
MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<Byte> m) {
ByteSpecies vsp = (ByteSpecies) species;
if (offset >= 0 && offset <= (bb.limit() - species.vectorByteSize())) {
return vsp.dummyVector().fromByteBuffer0(bb, offset, m).maybeSwap(bo);
if (offset >= 0 && offset <= (ms.byteSize() - species.vectorByteSize())) {
return vsp.dummyVector().fromMemorySegment0(ms, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, 1, bb.limit());
ByteBuffer wb = wrapper(bb, bo);
return vsp.ldOp(wb, offset, (AbstractMask<Byte>)m,
(wb_, o, i) -> wb_.get(o + i * 1));
checkMaskFromIndexSize(offset, vsp, m, 1, ms.byteSize());
return vsp.ldLongOp(ms, offset, m, ByteVector::memorySegmentGet);
}
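// Editor's sketch (illustrative only, not part of this change): a hypothetical caller
// of the new fromMemorySegment entry points, using a heap segment over a byte[] as
// required by the @throws note above; "backing", "seg", "v", "m" and "tail" are
// made-up names.
//
//   byte[] backing = new byte[ByteVector.SPECIES_128.vectorByteSize()];
//   MemorySegment seg = MemorySegment.ofArray(backing);
//   ByteVector v = ByteVector.fromMemorySegment(ByteVector.SPECIES_128,
//                                               seg, 0L, ByteOrder.nativeOrder());
//   // Masked variant, e.g. for a partial tail: unset lanes are filled with zero.
//   VectorMask<Byte> m = ByteVector.SPECIES_128.indexInRange(0, 13);
//   ByteVector tail = ByteVector.fromMemorySegment(ByteVector.SPECIES_128,
//                                                  seg, 0L, ByteOrder.nativeOrder(), m);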
// Memory store operations
@ -3292,7 +3388,7 @@ public abstract class ByteVector extends AbstractVector<Byte> {
this,
a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
@ -3443,7 +3539,7 @@ public abstract class ByteVector extends AbstractVector<Byte> {
normalized,
a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_ + i] = (e & 1) != 0));
}
@ -3581,67 +3677,40 @@ public abstract class ByteVector extends AbstractVector<Byte> {
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, byteSize(), a.length);
maybeSwap(bo).intoByteArray0(a, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo,
VectorMask<Byte> m) {
if (m.allTrue()) {
intoByteArray(a, offset, bo);
} else {
ByteSpecies vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, 1, a.length);
maybeSwap(bo).intoByteArray0(a, offset, m);
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo) {
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
offset = checkFromIndexSize(offset, byteSize(), ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo) {
if (ScopedMemoryAccess.isReadOnly(bb)) {
throw new ReadOnlyBufferException();
}
offset = checkFromIndexSize(offset, byteSize(), bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<Byte> m) {
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<Byte> m) {
if (m.allTrue()) {
intoByteBuffer(bb, offset, bo);
intoMemorySegment(ms, offset, bo);
} else {
if (bb.isReadOnly()) {
throw new ReadOnlyBufferException();
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
ByteSpecies vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, 1, bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset, m);
checkMaskFromIndexSize(offset, vsp, m, 1, ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset, m);
}
}
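// Editor's sketch (illustrative only, reusing "v" and "m" from the load sketch above):
// the new store entry points mirror the loads, and writing to a read-only segment now
// fails with UnsupportedOperationException instead of ReadOnlyBufferException.
//
//   MemorySegment out = MemorySegment.ofArray(new byte[ByteVector.SPECIES_128.vectorByteSize()]);
//   v.intoMemorySegment(out, 0L, ByteOrder.nativeOrder());
//   v.intoMemorySegment(out, 0L, ByteOrder.nativeOrder(), m);   // masked store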
@ -3675,7 +3744,7 @@ public abstract class ByteVector extends AbstractVector<Byte> {
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> s.ldOp(arr, off,
(arr, off, s) -> s.ldOp(arr, (int) off,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -3692,7 +3761,7 @@ public abstract class ByteVector extends AbstractVector<Byte> {
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> s.ldOp(arr, off, vm,
(arr, off, s, vm) -> s.ldOp(arr, (int) off, vm,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -3709,7 +3778,7 @@ public abstract class ByteVector extends AbstractVector<Byte> {
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, booleanArrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> s.ldOp(arr, off,
(arr, off, s) -> s.ldOp(arr, (int) off,
(arr_, off_, i) -> (byte) (arr_[off_ + i] ? 1 : 0)));
}
@ -3726,78 +3795,37 @@ public abstract class ByteVector extends AbstractVector<Byte> {
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, booleanArrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> s.ldOp(arr, off, vm,
(arr, off, s, vm) -> s.ldOp(arr, (int) off, vm,
(arr_, off_, i) -> (byte) (arr_[off_ + i] ? 1 : 0)));
}
@Override
abstract
ByteVector fromByteArray0(byte[] a, int offset);
ByteVector fromMemorySegment0(MemorySegment ms, long offset);
@ForceInline
final
ByteVector fromByteArray0Template(byte[] a, int offset) {
ByteVector fromMemorySegment0Template(MemorySegment ms, long offset) {
ByteSpecies vsp = vspecies();
return VectorSupport.load(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.get(o + i * 1));
});
}
abstract
ByteVector fromByteArray0(byte[] a, int offset, VectorMask<Byte> m);
@ForceInline
final
<M extends VectorMask<Byte>>
ByteVector fromByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
ByteSpecies vsp = vspecies();
m.check(vsp);
return VectorSupport.loadMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.get(o + i * 1));
});
}
abstract
ByteVector fromByteBuffer0(ByteBuffer bb, int offset);
@ForceInline
final
ByteVector fromByteBuffer0Template(ByteBuffer bb, int offset) {
ByteSpecies vsp = vspecies();
return ScopedMemoryAccess.loadFromByteBuffer(
return ScopedMemoryAccess.loadFromMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
bb, offset, vsp,
(buf, off, s) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.get(o + i * 1));
(AbstractMemorySegmentImpl) ms, offset, vsp,
(msp, off, s) -> {
return s.ldLongOp((MemorySegment) msp, off, ByteVector::memorySegmentGet);
});
}
abstract
ByteVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Byte> m);
ByteVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Byte> m);
@ForceInline
final
<M extends VectorMask<Byte>>
ByteVector fromByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
ByteVector fromMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
ByteSpecies vsp = vspecies();
m.check(vsp);
return ScopedMemoryAccess.loadFromByteBufferMasked(
return ScopedMemoryAccess.loadFromMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
bb, offset, m, vsp,
(buf, off, s, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.get(o + i * 1));
(AbstractMemorySegmentImpl) ms, offset, m, vsp,
(msp, off, s, vm) -> {
return s.ldLongOp((MemorySegment) msp, off, vm, ByteVector::memorySegmentGet);
});
}
@ -3816,7 +3844,7 @@ public abstract class ByteVector extends AbstractVector<Byte> {
a, arrayAddress(a, offset),
this, a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_+i] = e));
}
@ -3833,7 +3861,7 @@ public abstract class ByteVector extends AbstractVector<Byte> {
a, arrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm)
-> v.stOp(arr, off, vm,
-> v.stOp(arr, (int) off, vm,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
@ -3852,75 +3880,37 @@ public abstract class ByteVector extends AbstractVector<Byte> {
a, booleanArrayAddress(a, offset),
normalized, m, a, offset,
(arr, off, v, vm)
-> v.stOp(arr, off, vm,
-> v.stOp(arr, (int) off, vm,
(arr_, off_, i, e) -> arr_[off_ + i] = (e & 1) != 0));
}
abstract
void intoByteArray0(byte[] a, int offset);
@ForceInline
final
void intoByteArray0Template(byte[] a, int offset) {
void intoMemorySegment0(MemorySegment ms, long offset) {
ByteSpecies vsp = vspecies();
VectorSupport.store(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, a, offset,
(arr, off, v) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off,
(tb_, o, i, e) -> tb_.put(o + i * 1, e));
});
}
abstract
void intoByteArray0(byte[] a, int offset, VectorMask<Byte> m);
@ForceInline
final
<M extends VectorMask<Byte>>
void intoByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
ByteSpecies vsp = vspecies();
m.check(vsp);
VectorSupport.storeMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(tb_, o, i, e) -> tb_.put(o + i * 1, e));
});
}
@ForceInline
final
void intoByteBuffer0(ByteBuffer bb, int offset) {
ByteSpecies vsp = vspecies();
ScopedMemoryAccess.storeIntoByteBuffer(
ScopedMemoryAccess.storeIntoMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
this, bb, offset,
(buf, off, v) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off,
(wb_, o, i, e) -> wb_.put(o + i * 1, e));
this,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v) -> {
v.stLongOp((MemorySegment) msp, off, ByteVector::memorySegmentSet);
});
}
abstract
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Byte> m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Byte> m);
@ForceInline
final
<M extends VectorMask<Byte>>
void intoByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
void intoMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
ByteSpecies vsp = vspecies();
m.check(vsp);
ScopedMemoryAccess.storeIntoByteBufferMasked(
ScopedMemoryAccess.storeIntoMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
this, m, bb, offset,
(buf, off, v, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(wb_, o, i, e) -> wb_.put(o + i * 1, e));
this, m,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v, vm) -> {
v.stLongOp((MemorySegment) msp, off, vm, ByteVector::memorySegmentSet);
});
}
@ -3937,6 +3927,16 @@ public abstract class ByteVector extends AbstractVector<Byte> {
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
private static
void checkMaskFromIndexSize(long offset,
ByteSpecies vsp,
VectorMask<Byte> m,
int scale,
long limit) {
((AbstractMask<Byte>)m)
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
@ForceInline
private void conditionalStoreNYI(int offset,
ByteSpecies vsp,
@ -4256,6 +4256,21 @@ public abstract class ByteVector extends AbstractVector<Byte> {
return dummyVector().ldOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
ByteVector ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
ByteVector ldLongOp(MemorySegment memory, long offset,
VectorMask<Byte> m,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
<M> void stOp(M memory, int offset, FStOp<M> f) {
@ -4270,6 +4285,20 @@ public abstract class ByteVector extends AbstractVector<Byte> {
dummyVector().stOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset, FStLongOp f) {
dummyVector().stLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset,
AbstractMask<Byte> m,
FStLongOp f) {
dummyVector().stLongOp(memory, offset, m, f);
}
// N.B. Make sure these constant vectors and
// masks load up correctly into registers.
//
@ -4383,3 +4412,4 @@ public abstract class ByteVector extends AbstractVector<Byte> {
public static final VectorSpecies<Byte> SPECIES_PREFERRED
= (ByteSpecies) VectorSpecies.ofPreferred(byte.class);
}


@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -461,6 +461,22 @@ final class Double128Vector extends DoubleVector {
(Double128Vector) v); // specialize
}
@Override
@ForceInline
public Double128Vector compress(VectorMask<Double> m) {
return (Double128Vector)
super.compressTemplate(Double128Mask.class,
(Double128Mask) m); // specialize
}
@Override
@ForceInline
public Double128Vector expand(VectorMask<Double> m) {
return (Double128Vector)
super.expandTemplate(Double128Mask.class,
(Double128Mask) m); // specialize
}
@Override
@ForceInline
public Double128Vector selectFrom(Vector<Double> v) {
@ -638,6 +654,15 @@ final class Double128Vector extends DoubleVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Double128Mask compress() {
return (Double128Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Double128Vector.class, Double128Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
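// Editor's note (illustrative): mask compression packs the set lanes to the front of
// the mask, which is what the iota().compare(LT, trueCount()) fallback above computes.
// For this 2-lane species:
//
//   VectorMask<Double> m = VectorMask.fromValues(DoubleVector.SPECIES_128, false, true);
//   m.compress();   // -> {true, false}: one set lane, moved to lane 0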
// Binary operations
@Override
@ -830,29 +855,15 @@ final class Double128Vector extends DoubleVector {
@ForceInline
@Override
final
DoubleVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
DoubleVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteArray0(byte[] a, int offset, VectorMask<Double> m) {
return super.fromByteArray0Template(Double128Mask.class, a, offset, (Double128Mask) m); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Double> m) {
return super.fromByteBuffer0Template(Double128Mask.class, bb, offset, (Double128Mask) m); // specialize
DoubleVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Double> m) {
return super.fromMemorySegment0Template(Double128Mask.class, ms, offset, (Double128Mask) m); // specialize
}
@ForceInline
@ -880,22 +891,8 @@ final class Double128Vector extends DoubleVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Double> m) {
super.intoByteArray0Template(Double128Mask.class, a, offset, (Double128Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Double> m) {
super.intoByteBuffer0Template(Double128Mask.class, bb, offset, (Double128Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Double> m) {
super.intoMemorySegment0Template(Double128Mask.class, ms, offset, (Double128Mask) m);
}
@ -904,3 +901,4 @@ final class Double128Vector extends DoubleVector {
// ================================================
}


@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -461,6 +461,22 @@ final class Double256Vector extends DoubleVector {
(Double256Vector) v); // specialize
}
@Override
@ForceInline
public Double256Vector compress(VectorMask<Double> m) {
return (Double256Vector)
super.compressTemplate(Double256Mask.class,
(Double256Mask) m); // specialize
}
@Override
@ForceInline
public Double256Vector expand(VectorMask<Double> m) {
return (Double256Vector)
super.expandTemplate(Double256Mask.class,
(Double256Mask) m); // specialize
}
@Override
@ForceInline
public Double256Vector selectFrom(Vector<Double> v) {
@ -642,6 +658,15 @@ final class Double256Vector extends DoubleVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Double256Mask compress() {
return (Double256Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Double256Vector.class, Double256Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -834,29 +859,15 @@ final class Double256Vector extends DoubleVector {
@ForceInline
@Override
final
DoubleVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
DoubleVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteArray0(byte[] a, int offset, VectorMask<Double> m) {
return super.fromByteArray0Template(Double256Mask.class, a, offset, (Double256Mask) m); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Double> m) {
return super.fromByteBuffer0Template(Double256Mask.class, bb, offset, (Double256Mask) m); // specialize
DoubleVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Double> m) {
return super.fromMemorySegment0Template(Double256Mask.class, ms, offset, (Double256Mask) m); // specialize
}
@ForceInline
@ -884,22 +895,8 @@ final class Double256Vector extends DoubleVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Double> m) {
super.intoByteArray0Template(Double256Mask.class, a, offset, (Double256Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Double> m) {
super.intoByteBuffer0Template(Double256Mask.class, bb, offset, (Double256Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Double> m) {
super.intoMemorySegment0Template(Double256Mask.class, ms, offset, (Double256Mask) m);
}
@ -908,3 +905,4 @@ final class Double256Vector extends DoubleVector {
// ================================================
}


@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -461,6 +461,22 @@ final class Double512Vector extends DoubleVector {
(Double512Vector) v); // specialize
}
@Override
@ForceInline
public Double512Vector compress(VectorMask<Double> m) {
return (Double512Vector)
super.compressTemplate(Double512Mask.class,
(Double512Mask) m); // specialize
}
@Override
@ForceInline
public Double512Vector expand(VectorMask<Double> m) {
return (Double512Vector)
super.expandTemplate(Double512Mask.class,
(Double512Mask) m); // specialize
}
@Override
@ForceInline
public Double512Vector selectFrom(Vector<Double> v) {
@ -650,6 +666,15 @@ final class Double512Vector extends DoubleVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Double512Mask compress() {
return (Double512Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Double512Vector.class, Double512Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -842,29 +867,15 @@ final class Double512Vector extends DoubleVector {
@ForceInline
@Override
final
DoubleVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
DoubleVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteArray0(byte[] a, int offset, VectorMask<Double> m) {
return super.fromByteArray0Template(Double512Mask.class, a, offset, (Double512Mask) m); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Double> m) {
return super.fromByteBuffer0Template(Double512Mask.class, bb, offset, (Double512Mask) m); // specialize
DoubleVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Double> m) {
return super.fromMemorySegment0Template(Double512Mask.class, ms, offset, (Double512Mask) m); // specialize
}
@ForceInline
@ -892,22 +903,8 @@ final class Double512Vector extends DoubleVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Double> m) {
super.intoByteArray0Template(Double512Mask.class, a, offset, (Double512Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Double> m) {
super.intoByteBuffer0Template(Double512Mask.class, bb, offset, (Double512Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Double> m) {
super.intoMemorySegment0Template(Double512Mask.class, ms, offset, (Double512Mask) m);
}
@ -916,3 +913,4 @@ final class Double512Vector extends DoubleVector {
// ================================================
}


@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -461,6 +461,22 @@ final class Double64Vector extends DoubleVector {
(Double64Vector) v); // specialize
}
@Override
@ForceInline
public Double64Vector compress(VectorMask<Double> m) {
return (Double64Vector)
super.compressTemplate(Double64Mask.class,
(Double64Mask) m); // specialize
}
@Override
@ForceInline
public Double64Vector expand(VectorMask<Double> m) {
return (Double64Vector)
super.expandTemplate(Double64Mask.class,
(Double64Mask) m); // specialize
}
@Override
@ForceInline
public Double64Vector selectFrom(Vector<Double> v) {
@ -636,6 +652,15 @@ final class Double64Vector extends DoubleVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Double64Mask compress() {
return (Double64Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Double64Vector.class, Double64Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -828,29 +853,15 @@ final class Double64Vector extends DoubleVector {
@ForceInline
@Override
final
DoubleVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
DoubleVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteArray0(byte[] a, int offset, VectorMask<Double> m) {
return super.fromByteArray0Template(Double64Mask.class, a, offset, (Double64Mask) m); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Double> m) {
return super.fromByteBuffer0Template(Double64Mask.class, bb, offset, (Double64Mask) m); // specialize
DoubleVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Double> m) {
return super.fromMemorySegment0Template(Double64Mask.class, ms, offset, (Double64Mask) m); // specialize
}
@ForceInline
@ -878,22 +889,8 @@ final class Double64Vector extends DoubleVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Double> m) {
super.intoByteArray0Template(Double64Mask.class, a, offset, (Double64Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Double> m) {
super.intoByteBuffer0Template(Double64Mask.class, bb, offset, (Double64Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Double> m) {
super.intoMemorySegment0Template(Double64Mask.class, ms, offset, (Double64Mask) m);
}
@ -902,3 +899,4 @@ final class Double64Vector extends DoubleVector {
// ================================================
}


@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -461,6 +461,22 @@ final class DoubleMaxVector extends DoubleVector {
(DoubleMaxVector) v); // specialize
}
@Override
@ForceInline
public DoubleMaxVector compress(VectorMask<Double> m) {
return (DoubleMaxVector)
super.compressTemplate(DoubleMaxMask.class,
(DoubleMaxMask) m); // specialize
}
@Override
@ForceInline
public DoubleMaxVector expand(VectorMask<Double> m) {
return (DoubleMaxVector)
super.expandTemplate(DoubleMaxMask.class,
(DoubleMaxMask) m); // specialize
}
@Override
@ForceInline
public DoubleMaxVector selectFrom(Vector<Double> v) {
@ -635,6 +651,15 @@ final class DoubleMaxVector extends DoubleVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public DoubleMaxMask compress() {
return (DoubleMaxMask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
DoubleMaxVector.class, DoubleMaxMask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -827,29 +852,15 @@ final class DoubleMaxVector extends DoubleVector {
@ForceInline
@Override
final
DoubleVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
DoubleVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteArray0(byte[] a, int offset, VectorMask<Double> m) {
return super.fromByteArray0Template(DoubleMaxMask.class, a, offset, (DoubleMaxMask) m); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
DoubleVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Double> m) {
return super.fromByteBuffer0Template(DoubleMaxMask.class, bb, offset, (DoubleMaxMask) m); // specialize
DoubleVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Double> m) {
return super.fromMemorySegment0Template(DoubleMaxMask.class, ms, offset, (DoubleMaxMask) m); // specialize
}
@ForceInline
@ -877,22 +888,8 @@ final class DoubleMaxVector extends DoubleVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Double> m) {
super.intoByteArray0Template(DoubleMaxMask.class, a, offset, (DoubleMaxMask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Double> m) {
super.intoByteBuffer0Template(DoubleMaxMask.class, bb, offset, (DoubleMaxMask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Double> m) {
super.intoMemorySegment0Template(DoubleMaxMask.class, ms, offset, (DoubleMaxMask) m);
}
@ -901,3 +898,4 @@ final class DoubleMaxVector extends DoubleVector {
// ================================================
}


@ -24,14 +24,14 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.ByteOrder;
import java.nio.ReadOnlyBufferException;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.Function;
import java.util.function.UnaryOperator;
import jdk.internal.foreign.AbstractMemorySegmentImpl;
import jdk.internal.misc.ScopedMemoryAccess;
import jdk.internal.misc.Unsafe;
import jdk.internal.vm.annotation.ForceInline;
@ -57,6 +57,8 @@ public abstract class DoubleVector extends AbstractVector<Double> {
static final int FORBID_OPCODE_KIND = VO_NOFP;
static final ValueLayout.OfDouble ELEMENT_LAYOUT = ValueLayout.JAVA_DOUBLE.withBitAlignment(8);
@ForceInline
static int opCode(Operator op) {
return VectorOperators.opCode(op, VO_OPCODE_VALID, FORBID_OPCODE_KIND);
@ -351,6 +353,45 @@ public abstract class DoubleVector extends AbstractVector<Double> {
return vectorFactory(res);
}
/*package-private*/
interface FLdLongOp {
double apply(MemorySegment memory, long offset, int i);
}
/*package-private*/
@ForceInline
final
DoubleVector ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
//dummy; no vec = vec();
double[] res = new double[length()];
for (int i = 0; i < res.length; i++) {
res[i] = f.apply(memory, offset, i);
}
return vectorFactory(res);
}
/*package-private*/
@ForceInline
final
DoubleVector ldLongOp(MemorySegment memory, long offset,
VectorMask<Double> m,
FLdLongOp f) {
//double[] vec = vec();
double[] res = new double[length()];
boolean[] mbits = ((AbstractMask<Double>)m).getBits();
for (int i = 0; i < res.length; i++) {
if (mbits[i]) {
res[i] = f.apply(memory, offset, i);
}
}
return vectorFactory(res);
}
static double memorySegmentGet(MemorySegment ms, long o, int i) {
return ms.get(ELEMENT_LAYOUT, o + i * 8L);
}
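// Editor's note (illustrative): each lane i is read at byte offset o + i * 8L using the
// 8-bit-aligned JAVA_DOUBLE layout declared above, so arbitrary segment offsets are
// accepted; non-native byte orders are handled afterwards by maybeSwap(bo) in the
// public fromMemorySegment methods.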
interface FStOp<M> {
void apply(M memory, int offset, int i, double a);
}
@ -381,6 +422,40 @@ public abstract class DoubleVector extends AbstractVector<Double> {
}
}
interface FStLongOp {
void apply(MemorySegment memory, long offset, int i, double a);
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
FStLongOp f) {
double[] vec = vec();
for (int i = 0; i < vec.length; i++) {
f.apply(memory, offset, i, vec[i]);
}
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
VectorMask<Double> m,
FStLongOp f) {
double[] vec = vec();
boolean[] mbits = ((AbstractMask<Double>)m).getBits();
for (int i = 0; i < vec.length; i++) {
if (mbits[i]) {
f.apply(memory, offset, i, vec[i]);
}
}
}
static void memorySegmentSet(MemorySegment ms, long o, int i, double e) {
ms.set(ELEMENT_LAYOUT, o + i * 8L, e);
}
// Binary test
/*package-private*/
@ -420,6 +495,36 @@ public abstract class DoubleVector extends AbstractVector<Double> {
return Double.longBitsToDouble((long)bits);
}
static DoubleVector expandHelper(Vector<Double> v, VectorMask<Double> m) {
VectorSpecies<Double> vsp = m.vectorSpecies();
DoubleVector r = (DoubleVector) vsp.zero();
DoubleVector vi = (DoubleVector) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(i, vi.lane(j++));
}
}
return r;
}
static DoubleVector compressHelper(Vector<Double> v, VectorMask<Double> m) {
VectorSpecies<Double> vsp = m.vectorSpecies();
DoubleVector r = (DoubleVector) vsp.zero();
DoubleVector vi = (DoubleVector) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(j++, vi.lane(i));
}
}
return r;
}
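// Editor's sketch (illustrative only): the scalar fallbacks above pin down the
// cross-lane semantics of compress/expand; with a 4-lane species and mask {T,F,T,F}:
//
//   VectorSpecies<Double> sp = DoubleVector.SPECIES_256;                  // 4 lanes
//   DoubleVector v = DoubleVector.fromArray(sp, new double[]{1, 2, 3, 4}, 0);
//   VectorMask<Double> m = VectorMask.fromValues(sp, true, false, true, false);
//   v.compress(m);   // [1.0, 3.0, 0.0, 0.0]  selected lanes packed to the front
//   v.expand(m);     // [1.0, 0.0, 2.0, 0.0]  leading lanes scattered to the set positions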
// Static factories (other than memory operations)
// Note: A surprising behavior in javadoc
@ -1594,6 +1699,7 @@ public abstract class DoubleVector extends AbstractVector<Double> {
}
// sqrt
/**
* Computes the square root of this vector.
@ -2241,6 +2347,45 @@ public abstract class DoubleVector extends AbstractVector<Double> {
DoubleVector::toShuffle0);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
DoubleVector compress(VectorMask<Double> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<Double>>
DoubleVector compressTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return (DoubleVector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_COMPRESS, getClass(), masktype,
double.class, length(), this, m,
(v1, m1) -> compressHelper(v1, m1));
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
DoubleVector expand(VectorMask<Double> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<Double>>
DoubleVector expandTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return (DoubleVector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_EXPAND, getClass(), masktype,
double.class, length(), this, m,
(v1, m1) -> expandHelper(v1, m1));
}
/**
* {@inheritDoc} <!--workaround-->
*/
@ -2609,90 +2754,6 @@ public abstract class DoubleVector extends AbstractVector<Double> {
return toArray();
}
/**
* Loads a vector from a byte array starting at an offset.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
*/
@ForceInline
public static
DoubleVector fromByteArray(VectorSpecies<Double> species,
byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), a.length);
DoubleSpecies vsp = (DoubleSpecies) species;
return vsp.dummyVector().fromByteArray0(a, offset).maybeSwap(bo);
}
/**
* Loads a vector from a byte array starting at an offset
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code double} (positive zero).
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
* where the mask is set
*/
@ForceInline
public static
DoubleVector fromByteArray(VectorSpecies<Double> species,
byte[] a, int offset,
ByteOrder bo,
VectorMask<Double> m) {
DoubleSpecies vsp = (DoubleSpecies) species;
if (offset >= 0 && offset <= (a.length - species.vectorByteSize())) {
return vsp.dummyVector().fromByteArray0(a, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, 8, a.length);
ByteBuffer wb = wrapper(a, bo);
return vsp.ldOp(wb, offset, (AbstractMask<Double>)m,
(wb_, o, i) -> wb_.getDouble(o + i * 8));
}
/**
* Loads a vector from an array of type {@code double[]}
* starting at an offset.
@ -2883,44 +2944,49 @@ public abstract class DoubleVector extends AbstractVector<Double> {
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer.
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* {@link #fromMemorySegment(VectorSpecies,MemorySegment,long,ByteOrder,VectorMask)
* fromMemorySegment()} as follows:
* <pre>{@code
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* return fromMemorySegment(species, ms, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*8 < 0}
* or {@code offset+N*8 >= bb.limit()}
* or {@code offset+N*8 >= ms.byteSize()}
* for any lane {@code N} in the vector
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
DoubleVector fromByteBuffer(VectorSpecies<Double> species,
ByteBuffer bb, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), bb.limit());
DoubleVector fromMemorySegment(VectorSpecies<Double> species,
MemorySegment ms, long offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), ms.byteSize());
DoubleSpecies vsp = (DoubleSpecies) species;
return vsp.dummyVector().fromByteBuffer0(bb, offset).maybeSwap(bo);
return vsp.dummyVector().fromMemorySegment0(ms, offset).maybeSwap(bo);
}
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code double} (positive zero).
@ -2931,13 +2997,11 @@ public abstract class DoubleVector extends AbstractVector<Double> {
* <p>
* The following pseudocode illustrates the behavior:
* <pre>{@code
* DoubleBuffer eb = bb.duplicate()
* .position(offset)
* .order(bo).asDoubleBuffer();
* var slice = ms.asSlice(offset);
* double[] ar = new double[species.length()];
* for (int n = 0; n < ar.length; n++) {
* if (m.laneIsSet(n)) {
* ar[n] = eb.get(n);
* ar[n] = slice.getAtIndex(ValueLayout.JAVA_DOUBLE.withBitAlignment(8), n);
* }
* }
* DoubleVector r = DoubleVector.fromArray(species, ar, 0);
@ -2951,33 +3015,36 @@ public abstract class DoubleVector extends AbstractVector<Double> {
* the bytes of lane values.
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*8 < 0}
* or {@code offset+N*8 >= bb.limit()}
* or {@code offset+N*8 >= ms.byteSize()}
* for any lane {@code N} in the vector
* where the mask is set
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
DoubleVector fromByteBuffer(VectorSpecies<Double> species,
ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<Double> m) {
DoubleVector fromMemorySegment(VectorSpecies<Double> species,
MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<Double> m) {
DoubleSpecies vsp = (DoubleSpecies) species;
if (offset >= 0 && offset <= (bb.limit() - species.vectorByteSize())) {
return vsp.dummyVector().fromByteBuffer0(bb, offset, m).maybeSwap(bo);
if (offset >= 0 && offset <= (ms.byteSize() - species.vectorByteSize())) {
return vsp.dummyVector().fromMemorySegment0(ms, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, 8, bb.limit());
ByteBuffer wb = wrapper(bb, bo);
return vsp.ldOp(wb, offset, (AbstractMask<Double>)m,
(wb_, o, i) -> wb_.getDouble(o + i * 8));
checkMaskFromIndexSize(offset, vsp, m, 8, ms.byteSize());
return vsp.ldLongOp(ms, offset, m, DoubleVector::memorySegmentGet);
}
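// Editor's sketch (illustrative only): loading doubles through the new segment API;
// note the heap segment must still be byte[]-backed, per the @throws note above.
// "seg" and "dv" are made-up names.
//
//   MemorySegment seg = MemorySegment.ofArray(new byte[DoubleVector.SPECIES_256.vectorByteSize()]);
//   DoubleVector dv = DoubleVector.fromMemorySegment(DoubleVector.SPECIES_256,
//                                                    seg, 0L, ByteOrder.nativeOrder());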
// Memory store operations
@ -3007,7 +3074,7 @@ public abstract class DoubleVector extends AbstractVector<Double> {
this,
a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
@ -3167,67 +3234,40 @@ public abstract class DoubleVector extends AbstractVector<Double> {
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, byteSize(), a.length);
maybeSwap(bo).intoByteArray0(a, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo,
VectorMask<Double> m) {
if (m.allTrue()) {
intoByteArray(a, offset, bo);
} else {
DoubleSpecies vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, 8, a.length);
maybeSwap(bo).intoByteArray0(a, offset, m);
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo) {
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
offset = checkFromIndexSize(offset, byteSize(), ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo) {
if (ScopedMemoryAccess.isReadOnly(bb)) {
throw new ReadOnlyBufferException();
}
offset = checkFromIndexSize(offset, byteSize(), bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<Double> m) {
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<Double> m) {
if (m.allTrue()) {
intoByteBuffer(bb, offset, bo);
intoMemorySegment(ms, offset, bo);
} else {
if (bb.isReadOnly()) {
throw new ReadOnlyBufferException();
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
DoubleSpecies vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, 8, bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset, m);
checkMaskFromIndexSize(offset, vsp, m, 8, ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset, m);
}
}
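// Editor's sketch (illustrative only, reusing "seg" and "dv" from the load sketch above):
//
//   dv.intoMemorySegment(seg, 0L, ByteOrder.nativeOrder());
//   // A read-only segment (e.g. seg.asReadOnly()) makes this throw
//   // UnsupportedOperationException, matching the check above.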
@ -3261,7 +3301,7 @@ public abstract class DoubleVector extends AbstractVector<Double> {
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> s.ldOp(arr, off,
(arr, off, s) -> s.ldOp(arr, (int) off,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -3278,7 +3318,7 @@ public abstract class DoubleVector extends AbstractVector<Double> {
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> s.ldOp(arr, off, vm,
(arr, off, s, vm) -> s.ldOp(arr, (int) off, vm,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -3336,74 +3376,33 @@ public abstract class DoubleVector extends AbstractVector<Double> {
@Override
abstract
DoubleVector fromByteArray0(byte[] a, int offset);
DoubleVector fromMemorySegment0(MemorySegment ms, long offset);
@ForceInline
final
DoubleVector fromByteArray0Template(byte[] a, int offset) {
DoubleVector fromMemorySegment0Template(MemorySegment ms, long offset) {
DoubleSpecies vsp = vspecies();
return VectorSupport.load(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.getDouble(o + i * 8));
});
}
abstract
DoubleVector fromByteArray0(byte[] a, int offset, VectorMask<Double> m);
@ForceInline
final
<M extends VectorMask<Double>>
DoubleVector fromByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
DoubleSpecies vsp = vspecies();
m.check(vsp);
return VectorSupport.loadMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.getDouble(o + i * 8));
});
}
abstract
DoubleVector fromByteBuffer0(ByteBuffer bb, int offset);
@ForceInline
final
DoubleVector fromByteBuffer0Template(ByteBuffer bb, int offset) {
DoubleSpecies vsp = vspecies();
return ScopedMemoryAccess.loadFromByteBuffer(
return ScopedMemoryAccess.loadFromMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
bb, offset, vsp,
(buf, off, s) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.getDouble(o + i * 8));
(AbstractMemorySegmentImpl) ms, offset, vsp,
(msp, off, s) -> {
return s.ldLongOp((MemorySegment) msp, off, DoubleVector::memorySegmentGet);
});
}
abstract
DoubleVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Double> m);
DoubleVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Double> m);
@ForceInline
final
<M extends VectorMask<Double>>
DoubleVector fromByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
DoubleVector fromMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
DoubleSpecies vsp = vspecies();
m.check(vsp);
return ScopedMemoryAccess.loadFromByteBufferMasked(
return ScopedMemoryAccess.loadFromMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
bb, offset, m, vsp,
(buf, off, s, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.getDouble(o + i * 8));
(AbstractMemorySegmentImpl) ms, offset, m, vsp,
(msp, off, s, vm) -> {
return s.ldLongOp((MemorySegment) msp, off, vm, DoubleVector::memorySegmentGet);
});
}
@ -3422,7 +3421,7 @@ public abstract class DoubleVector extends AbstractVector<Double> {
a, arrayAddress(a, offset),
this, a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_+i] = e));
}
@ -3439,7 +3438,7 @@ public abstract class DoubleVector extends AbstractVector<Double> {
a, arrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm)
-> v.stOp(arr, off, vm,
-> v.stOp(arr, (int) off, vm,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
@ -3497,71 +3496,33 @@ public abstract class DoubleVector extends AbstractVector<Double> {
}
abstract
void intoByteArray0(byte[] a, int offset);
@ForceInline
final
void intoByteArray0Template(byte[] a, int offset) {
void intoMemorySegment0(MemorySegment ms, long offset) {
DoubleSpecies vsp = vspecies();
VectorSupport.store(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, a, offset,
(arr, off, v) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off,
(tb_, o, i, e) -> tb_.putDouble(o + i * 8, e));
});
}
abstract
void intoByteArray0(byte[] a, int offset, VectorMask<Double> m);
@ForceInline
final
<M extends VectorMask<Double>>
void intoByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
DoubleSpecies vsp = vspecies();
m.check(vsp);
VectorSupport.storeMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(tb_, o, i, e) -> tb_.putDouble(o + i * 8, e));
});
}
@ForceInline
final
void intoByteBuffer0(ByteBuffer bb, int offset) {
DoubleSpecies vsp = vspecies();
ScopedMemoryAccess.storeIntoByteBuffer(
ScopedMemoryAccess.storeIntoMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
this, bb, offset,
(buf, off, v) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off,
(wb_, o, i, e) -> wb_.putDouble(o + i * 8, e));
this,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v) -> {
v.stLongOp((MemorySegment) msp, off, DoubleVector::memorySegmentSet);
});
}
abstract
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Double> m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Double> m);
@ForceInline
final
<M extends VectorMask<Double>>
void intoByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
void intoMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
DoubleSpecies vsp = vspecies();
m.check(vsp);
ScopedMemoryAccess.storeIntoByteBufferMasked(
ScopedMemoryAccess.storeIntoMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
this, m, bb, offset,
(buf, off, v, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(wb_, o, i, e) -> wb_.putDouble(o + i * 8, e));
this, m,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v, vm) -> {
v.stLongOp((MemorySegment) msp, off, vm, DoubleVector::memorySegmentSet);
});
}
@ -3578,6 +3539,16 @@ public abstract class DoubleVector extends AbstractVector<Double> {
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
private static
void checkMaskFromIndexSize(long offset,
DoubleSpecies vsp,
VectorMask<Double> m,
int scale,
long limit) {
((AbstractMask<Double>)m)
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
@ForceInline
private void conditionalStoreNYI(int offset,
DoubleSpecies vsp,
@ -3888,6 +3859,21 @@ public abstract class DoubleVector extends AbstractVector<Double> {
return dummyVector().ldOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
DoubleVector ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
DoubleVector ldLongOp(MemorySegment memory, long offset,
VectorMask<Double> m,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
<M> void stOp(M memory, int offset, FStOp<M> f) {
@ -3902,6 +3888,20 @@ public abstract class DoubleVector extends AbstractVector<Double> {
dummyVector().stOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset, FStLongOp f) {
dummyVector().stLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset,
AbstractMask<Double> m,
FStLongOp f) {
dummyVector().stLongOp(memory, offset, m, f);
}
// N.B. Make sure these constant vectors and
// masks load up correctly into registers.
//
@ -4015,3 +4015,4 @@ public abstract class DoubleVector extends AbstractVector<Double> {
public static final VectorSpecies<Double> SPECIES_PREFERRED
= (DoubleSpecies) VectorSpecies.ofPreferred(double.class);
}


@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -461,6 +461,22 @@ final class Float128Vector extends FloatVector {
(Float128Vector) v); // specialize
}
@Override
@ForceInline
public Float128Vector compress(VectorMask<Float> m) {
return (Float128Vector)
super.compressTemplate(Float128Mask.class,
(Float128Mask) m); // specialize
}
@Override
@ForceInline
public Float128Vector expand(VectorMask<Float> m) {
return (Float128Vector)
super.expandTemplate(Float128Mask.class,
(Float128Mask) m); // specialize
}
@Override
@ForceInline
public Float128Vector selectFrom(Vector<Float> v) {
@ -642,6 +658,15 @@ final class Float128Vector extends FloatVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Float128Mask compress() {
return (Float128Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Float128Vector.class, Float128Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -834,29 +859,15 @@ final class Float128Vector extends FloatVector {
@ForceInline
@Override
final
FloatVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
FloatVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteArray0(byte[] a, int offset, VectorMask<Float> m) {
return super.fromByteArray0Template(Float128Mask.class, a, offset, (Float128Mask) m); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Float> m) {
return super.fromByteBuffer0Template(Float128Mask.class, bb, offset, (Float128Mask) m); // specialize
FloatVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Float> m) {
return super.fromMemorySegment0Template(Float128Mask.class, ms, offset, (Float128Mask) m); // specialize
}
@ForceInline
@ -884,22 +895,8 @@ final class Float128Vector extends FloatVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Float> m) {
super.intoByteArray0Template(Float128Mask.class, a, offset, (Float128Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Float> m) {
super.intoByteBuffer0Template(Float128Mask.class, bb, offset, (Float128Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Float> m) {
super.intoMemorySegment0Template(Float128Mask.class, ms, offset, (Float128Mask) m);
}
@ -908,3 +905,4 @@ final class Float128Vector extends FloatVector {
// ================================================
}


@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -461,6 +461,22 @@ final class Float256Vector extends FloatVector {
(Float256Vector) v); // specialize
}
@Override
@ForceInline
public Float256Vector compress(VectorMask<Float> m) {
return (Float256Vector)
super.compressTemplate(Float256Mask.class,
(Float256Mask) m); // specialize
}
@Override
@ForceInline
public Float256Vector expand(VectorMask<Float> m) {
return (Float256Vector)
super.expandTemplate(Float256Mask.class,
(Float256Mask) m); // specialize
}
@Override
@ForceInline
public Float256Vector selectFrom(Vector<Float> v) {
@ -650,6 +666,15 @@ final class Float256Vector extends FloatVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Float256Mask compress() {
return (Float256Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Float256Vector.class, Float256Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -842,29 +867,15 @@ final class Float256Vector extends FloatVector {
@ForceInline
@Override
final
FloatVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
FloatVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteArray0(byte[] a, int offset, VectorMask<Float> m) {
return super.fromByteArray0Template(Float256Mask.class, a, offset, (Float256Mask) m); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Float> m) {
return super.fromByteBuffer0Template(Float256Mask.class, bb, offset, (Float256Mask) m); // specialize
FloatVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Float> m) {
return super.fromMemorySegment0Template(Float256Mask.class, ms, offset, (Float256Mask) m); // specialize
}
@ForceInline
@ -892,22 +903,8 @@ final class Float256Vector extends FloatVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Float> m) {
super.intoByteArray0Template(Float256Mask.class, a, offset, (Float256Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Float> m) {
super.intoByteBuffer0Template(Float256Mask.class, bb, offset, (Float256Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Float> m) {
super.intoMemorySegment0Template(Float256Mask.class, ms, offset, (Float256Mask) m);
}
@ -916,3 +913,4 @@ final class Float256Vector extends FloatVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -461,6 +461,22 @@ final class Float512Vector extends FloatVector {
(Float512Vector) v); // specialize
}
@Override
@ForceInline
public Float512Vector compress(VectorMask<Float> m) {
return (Float512Vector)
super.compressTemplate(Float512Mask.class,
(Float512Mask) m); // specialize
}
@Override
@ForceInline
public Float512Vector expand(VectorMask<Float> m) {
return (Float512Vector)
super.expandTemplate(Float512Mask.class,
(Float512Mask) m); // specialize
}
@Override
@ForceInline
public Float512Vector selectFrom(Vector<Float> v) {
@ -666,6 +682,15 @@ final class Float512Vector extends FloatVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Float512Mask compress() {
return (Float512Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Float512Vector.class, Float512Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -858,29 +883,15 @@ final class Float512Vector extends FloatVector {
@ForceInline
@Override
final
FloatVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
FloatVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteArray0(byte[] a, int offset, VectorMask<Float> m) {
return super.fromByteArray0Template(Float512Mask.class, a, offset, (Float512Mask) m); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Float> m) {
return super.fromByteBuffer0Template(Float512Mask.class, bb, offset, (Float512Mask) m); // specialize
FloatVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Float> m) {
return super.fromMemorySegment0Template(Float512Mask.class, ms, offset, (Float512Mask) m); // specialize
}
@ForceInline
@ -908,22 +919,8 @@ final class Float512Vector extends FloatVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Float> m) {
super.intoByteArray0Template(Float512Mask.class, a, offset, (Float512Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Float> m) {
super.intoByteBuffer0Template(Float512Mask.class, bb, offset, (Float512Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Float> m) {
super.intoMemorySegment0Template(Float512Mask.class, ms, offset, (Float512Mask) m);
}
@ -932,3 +929,4 @@ final class Float512Vector extends FloatVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -461,6 +461,22 @@ final class Float64Vector extends FloatVector {
(Float64Vector) v); // specialize
}
@Override
@ForceInline
public Float64Vector compress(VectorMask<Float> m) {
return (Float64Vector)
super.compressTemplate(Float64Mask.class,
(Float64Mask) m); // specialize
}
@Override
@ForceInline
public Float64Vector expand(VectorMask<Float> m) {
return (Float64Vector)
super.expandTemplate(Float64Mask.class,
(Float64Mask) m); // specialize
}
@Override
@ForceInline
public Float64Vector selectFrom(Vector<Float> v) {
@ -638,6 +654,15 @@ final class Float64Vector extends FloatVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Float64Mask compress() {
return (Float64Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Float64Vector.class, Float64Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -830,29 +855,15 @@ final class Float64Vector extends FloatVector {
@ForceInline
@Override
final
FloatVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
FloatVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteArray0(byte[] a, int offset, VectorMask<Float> m) {
return super.fromByteArray0Template(Float64Mask.class, a, offset, (Float64Mask) m); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Float> m) {
return super.fromByteBuffer0Template(Float64Mask.class, bb, offset, (Float64Mask) m); // specialize
FloatVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Float> m) {
return super.fromMemorySegment0Template(Float64Mask.class, ms, offset, (Float64Mask) m); // specialize
}
@ForceInline
@ -880,22 +891,8 @@ final class Float64Vector extends FloatVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Float> m) {
super.intoByteArray0Template(Float64Mask.class, a, offset, (Float64Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Float> m) {
super.intoByteBuffer0Template(Float64Mask.class, bb, offset, (Float64Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Float> m) {
super.intoMemorySegment0Template(Float64Mask.class, ms, offset, (Float64Mask) m);
}
@ -904,3 +901,4 @@ final class Float64Vector extends FloatVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -461,6 +461,22 @@ final class FloatMaxVector extends FloatVector {
(FloatMaxVector) v); // specialize
}
@Override
@ForceInline
public FloatMaxVector compress(VectorMask<Float> m) {
return (FloatMaxVector)
super.compressTemplate(FloatMaxMask.class,
(FloatMaxMask) m); // specialize
}
@Override
@ForceInline
public FloatMaxVector expand(VectorMask<Float> m) {
return (FloatMaxVector)
super.expandTemplate(FloatMaxMask.class,
(FloatMaxMask) m); // specialize
}
@Override
@ForceInline
public FloatMaxVector selectFrom(Vector<Float> v) {
@ -635,6 +651,15 @@ final class FloatMaxVector extends FloatVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public FloatMaxMask compress() {
return (FloatMaxMask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
FloatMaxVector.class, FloatMaxMask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -827,29 +852,15 @@ final class FloatMaxVector extends FloatVector {
@ForceInline
@Override
final
FloatVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
FloatVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteArray0(byte[] a, int offset, VectorMask<Float> m) {
return super.fromByteArray0Template(FloatMaxMask.class, a, offset, (FloatMaxMask) m); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
FloatVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Float> m) {
return super.fromByteBuffer0Template(FloatMaxMask.class, bb, offset, (FloatMaxMask) m); // specialize
FloatVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Float> m) {
return super.fromMemorySegment0Template(FloatMaxMask.class, ms, offset, (FloatMaxMask) m); // specialize
}
@ForceInline
@ -877,22 +888,8 @@ final class FloatMaxVector extends FloatVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Float> m) {
super.intoByteArray0Template(FloatMaxMask.class, a, offset, (FloatMaxMask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Float> m) {
super.intoByteBuffer0Template(FloatMaxMask.class, bb, offset, (FloatMaxMask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Float> m) {
super.intoMemorySegment0Template(FloatMaxMask.class, ms, offset, (FloatMaxMask) m);
}
@ -901,3 +898,4 @@ final class FloatMaxVector extends FloatVector {
// ================================================
}

View file

@ -24,14 +24,14 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.ByteOrder;
import java.nio.ReadOnlyBufferException;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.Function;
import java.util.function.UnaryOperator;
import jdk.internal.foreign.AbstractMemorySegmentImpl;
import jdk.internal.misc.ScopedMemoryAccess;
import jdk.internal.misc.Unsafe;
import jdk.internal.vm.annotation.ForceInline;
@ -57,6 +57,8 @@ public abstract class FloatVector extends AbstractVector<Float> {
static final int FORBID_OPCODE_KIND = VO_NOFP;
static final ValueLayout.OfFloat ELEMENT_LAYOUT = ValueLayout.JAVA_FLOAT.withBitAlignment(8);
@ForceInline
static int opCode(Operator op) {
return VectorOperators.opCode(op, VO_OPCODE_VALID, FORBID_OPCODE_KIND);
@ -351,6 +353,45 @@ public abstract class FloatVector extends AbstractVector<Float> {
return vectorFactory(res);
}
/*package-private*/
interface FLdLongOp {
float apply(MemorySegment memory, long offset, int i);
}
/*package-private*/
@ForceInline
final
FloatVector ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
//dummy; no vec = vec();
float[] res = new float[length()];
for (int i = 0; i < res.length; i++) {
res[i] = f.apply(memory, offset, i);
}
return vectorFactory(res);
}
/*package-private*/
@ForceInline
final
FloatVector ldLongOp(MemorySegment memory, long offset,
VectorMask<Float> m,
FLdLongOp f) {
//float[] vec = vec();
float[] res = new float[length()];
boolean[] mbits = ((AbstractMask<Float>)m).getBits();
for (int i = 0; i < res.length; i++) {
if (mbits[i]) {
res[i] = f.apply(memory, offset, i);
}
}
return vectorFactory(res);
}
static float memorySegmentGet(MemorySegment ms, long o, int i) {
return ms.get(ELEMENT_LAYOUT, o + i * 4L);
}
interface FStOp<M> {
void apply(M memory, int offset, int i, float a);
}
@ -381,6 +422,40 @@ public abstract class FloatVector extends AbstractVector<Float> {
}
}
interface FStLongOp {
void apply(MemorySegment memory, long offset, int i, float a);
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
FStLongOp f) {
float[] vec = vec();
for (int i = 0; i < vec.length; i++) {
f.apply(memory, offset, i, vec[i]);
}
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
VectorMask<Float> m,
FStLongOp f) {
float[] vec = vec();
boolean[] mbits = ((AbstractMask<Float>)m).getBits();
for (int i = 0; i < vec.length; i++) {
if (mbits[i]) {
f.apply(memory, offset, i, vec[i]);
}
}
}
static void memorySegmentSet(MemorySegment ms, long o, int i, float e) {
ms.set(ELEMENT_LAYOUT, o + i * 4L, e);
}
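The ldLongOp/stLongOp helpers above are the lane-by-lane fallback used when the memory-segment intrinsic is not taken: lane i lives at offset + i * 4 and is accessed through the byte-aligned float layout. A minimal sketch of that access pattern, assuming the JDK 19 preview java.lang.foreign API; the class name and values are illustrative.

// Illustrative only; JDK 19 with --enable-preview (java.lang.foreign is a preview API there).
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.util.Arrays;

class SegmentLaneAccessSketch {
    // Same shape as ELEMENT_LAYOUT above: a float layout relaxed to byte alignment.
    static final ValueLayout.OfFloat F = ValueLayout.JAVA_FLOAT.withBitAlignment(8);

    public static void main(String[] args) {
        MemorySegment ms = MemorySegment.ofArray(new byte[16]);            // room for 4 floats
        for (int i = 0; i < 4; i++) {
            ms.set(F, i * 4L, i + 0.5f);                                   // what stLongOp does per lane
        }
        float[] lanes = new float[4];
        for (int i = 0; i < 4; i++) {
            lanes[i] = ms.get(F, i * 4L);                                  // what ldLongOp does per lane
        }
        System.out.println(Arrays.toString(lanes));                        // [0.5, 1.5, 2.5, 3.5]
    }
}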
// Binary test
/*package-private*/
@ -420,6 +495,36 @@ public abstract class FloatVector extends AbstractVector<Float> {
return Float.intBitsToFloat((int)bits);
}
static FloatVector expandHelper(Vector<Float> v, VectorMask<Float> m) {
VectorSpecies<Float> vsp = m.vectorSpecies();
FloatVector r = (FloatVector) vsp.zero();
FloatVector vi = (FloatVector) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(i, vi.lane(j++));
}
}
return r;
}
static FloatVector compressHelper(Vector<Float> v, VectorMask<Float> m) {
VectorSpecies<Float> vsp = m.vectorSpecies();
FloatVector r = (FloatVector) vsp.zero();
FloatVector vi = (FloatVector) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(j++, vi.lane(i));
}
}
return r;
}
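A short worked trace of the two fallbacks above, with hypothetical lane values:

// For v = [a, b, c, d] and a mask m with lanes {0, 2} set (m.trueCount() == 2):
//   compressHelper(v, m) packs the selected lanes toward lane 0    -> [a, c, 0, 0]
//   expandHelper(v, m) scatters the low lanes to the set positions -> [a, 0, b, 0]
// When m.allTrue(), both return v unchanged (the early-out above).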
// Static factories (other than memory operations)
// Note: A surprising behavior in javadoc
@ -1602,6 +1707,7 @@ public abstract class FloatVector extends AbstractVector<Float> {
}
// sqrt
/**
* Computes the square root of this vector.
@ -2253,6 +2359,45 @@ public abstract class FloatVector extends AbstractVector<Float> {
FloatVector::toShuffle0);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
FloatVector compress(VectorMask<Float> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<Float>>
FloatVector compressTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return (FloatVector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_COMPRESS, getClass(), masktype,
float.class, length(), this, m,
(v1, m1) -> compressHelper(v1, m1));
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
FloatVector expand(VectorMask<Float> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<Float>>
FloatVector expandTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return (FloatVector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_EXPAND, getClass(), masktype,
float.class, length(), this, m,
(v1, m1) -> expandHelper(v1, m1));
}
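A rough usage sketch (not part of this change) of the new compress/expand entry points declared above; the species and lane values are illustrative.

// Illustrative only; run with --add-modules jdk.incubator.vector.
import jdk.incubator.vector.*;

class CompressExpandSketch {
    public static void main(String[] args) {
        VectorSpecies<Float> sp = FloatVector.SPECIES_128;                 // 4 lanes
        FloatVector v = FloatVector.fromArray(sp, new float[]{1f, 2f, 3f, 4f}, 0);
        VectorMask<Float> m = VectorMask.fromValues(sp, true, false, true, false);
        System.out.println(v.compress(m));  // [1.0, 3.0, 0.0, 0.0] - selected lanes packed low
        System.out.println(v.expand(m));    // [1.0, 0.0, 2.0, 0.0] - low lanes scattered to the set lanes
    }
}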
/**
* {@inheritDoc} <!--workaround-->
*/
@ -2633,90 +2778,6 @@ public abstract class FloatVector extends AbstractVector<Float> {
return res;
}
/**
* Loads a vector from a byte array starting at an offset.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
*/
@ForceInline
public static
FloatVector fromByteArray(VectorSpecies<Float> species,
byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), a.length);
FloatSpecies vsp = (FloatSpecies) species;
return vsp.dummyVector().fromByteArray0(a, offset).maybeSwap(bo);
}
/**
* Loads a vector from a byte array starting at an offset
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code float} (positive zero).
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
* where the mask is set
*/
@ForceInline
public static
FloatVector fromByteArray(VectorSpecies<Float> species,
byte[] a, int offset,
ByteOrder bo,
VectorMask<Float> m) {
FloatSpecies vsp = (FloatSpecies) species;
if (offset >= 0 && offset <= (a.length - species.vectorByteSize())) {
return vsp.dummyVector().fromByteArray0(a, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, 4, a.length);
ByteBuffer wb = wrapper(a, bo);
return vsp.ldOp(wb, offset, (AbstractMask<Float>)m,
(wb_, o, i) -> wb_.getFloat(o + i * 4));
}
/**
* Loads a vector from an array of type {@code float[]}
* starting at an offset.
@ -2889,44 +2950,49 @@ public abstract class FloatVector extends AbstractVector<Float> {
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer.
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* {@link #fromMemorySegment(VectorSpecies,MemorySegment,long,ByteOrder,VectorMask)
* fromMemorySegment()} as follows:
* <pre>{@code
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* return fromMemorySegment(species, ms, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*4 < 0}
* or {@code offset+N*4 >= bb.limit()}
* or {@code offset+N*4 >= ms.byteSize()}
* for any lane {@code N} in the vector
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
FloatVector fromByteBuffer(VectorSpecies<Float> species,
ByteBuffer bb, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), bb.limit());
FloatVector fromMemorySegment(VectorSpecies<Float> species,
MemorySegment ms, long offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), ms.byteSize());
FloatSpecies vsp = (FloatSpecies) species;
return vsp.dummyVector().fromByteBuffer0(bb, offset).maybeSwap(bo);
return vsp.dummyVector().fromMemorySegment0(ms, offset).maybeSwap(bo);
}
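A rough usage sketch of the unmasked load above, assuming a heap segment backed by a byte[] (which the specification above permits) and the byte-aligned float layout used elsewhere in this class; names and values are illustrative.

// Illustrative only; JDK 19 with --enable-preview --add-modules jdk.incubator.vector.
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.ByteOrder;
import jdk.incubator.vector.*;

class FromSegmentSketch {
    public static void main(String[] args) {
        VectorSpecies<Float> sp = FloatVector.SPECIES_128;                 // 4 lanes
        ValueLayout.OfFloat F = ValueLayout.JAVA_FLOAT.withBitAlignment(8);
        MemorySegment ms = MemorySegment.ofArray(new byte[sp.vectorByteSize()]);
        for (int i = 0; i < sp.length(); i++) {
            ms.setAtIndex(F, i, i + 1.0f);                                 // 1.0, 2.0, 3.0, 4.0
        }
        FloatVector v = FloatVector.fromMemorySegment(sp, ms, 0L, ByteOrder.nativeOrder());
        System.out.println(v);                                             // [1.0, 2.0, 3.0, 4.0]
    }
}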
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code float} (positive zero).
@ -2937,13 +3003,11 @@ public abstract class FloatVector extends AbstractVector<Float> {
* <p>
* The following pseudocode illustrates the behavior:
* <pre>{@code
* FloatBuffer eb = bb.duplicate()
* .position(offset)
* .order(bo).asFloatBuffer();
* var slice = ms.asSlice(offset);
* float[] ar = new float[species.length()];
* for (int n = 0; n < ar.length; n++) {
* if (m.laneIsSet(n)) {
* ar[n] = eb.get(n);
* ar[n] = slice.getAtIndex(ValueLayout.JAVA_FLOAT.withBitAlignment(8), n);
* }
* }
* FloatVector r = FloatVector.fromArray(species, ar, 0);
@ -2957,33 +3021,36 @@ public abstract class FloatVector extends AbstractVector<Float> {
* the bytes of lane values.
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*4 < 0}
* or {@code offset+N*4 >= bb.limit()}
* or {@code offset+N*4 >= ms.byteSize()}
* for any lane {@code N} in the vector
* where the mask is set
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
FloatVector fromByteBuffer(VectorSpecies<Float> species,
ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<Float> m) {
FloatVector fromMemorySegment(VectorSpecies<Float> species,
MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<Float> m) {
FloatSpecies vsp = (FloatSpecies) species;
if (offset >= 0 && offset <= (bb.limit() - species.vectorByteSize())) {
return vsp.dummyVector().fromByteBuffer0(bb, offset, m).maybeSwap(bo);
if (offset >= 0 && offset <= (ms.byteSize() - species.vectorByteSize())) {
return vsp.dummyVector().fromMemorySegment0(ms, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, 4, bb.limit());
ByteBuffer wb = wrapper(bb, bo);
return vsp.ldOp(wb, offset, (AbstractMask<Float>)m,
(wb_, o, i) -> wb_.getFloat(o + i * 4));
checkMaskFromIndexSize(offset, vsp, m, 4, ms.byteSize());
return vsp.ldLongOp(ms, offset, m, FloatVector::memorySegmentGet);
}
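The masked load above also covers tail handling: when a full vector would run past the end of the segment, the lane-wise fallback with the per-lane bounds check is taken instead of the intrinsic path. A sketch of loading a 2-element tail, with illustrative names and values.

// Illustrative only; JDK 19 with --enable-preview --add-modules jdk.incubator.vector.
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.ByteOrder;
import jdk.incubator.vector.*;

class MaskedSegmentLoadSketch {
    public static void main(String[] args) {
        VectorSpecies<Float> sp = FloatVector.SPECIES_128;                 // 4 lanes
        ValueLayout.OfFloat F = ValueLayout.JAVA_FLOAT.withBitAlignment(8);
        MemorySegment ms = MemorySegment.ofArray(new byte[6 * Float.BYTES]); // 4 full lanes + a 2-element tail
        for (int i = 0; i < 6; i++) {
            ms.setAtIndex(F, i, i + 1.0f);
        }
        int tail = 4;                                                      // element index where the tail begins
        VectorMask<Float> m = sp.indexInRange(tail, 6);                    // lanes {0, 1} set
        FloatVector v = FloatVector.fromMemorySegment(sp, ms, tail * 4L, ByteOrder.nativeOrder(), m);
        System.out.println(v);                                             // [5.0, 6.0, 0.0, 0.0]
    }
}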
// Memory store operations
@ -3013,7 +3080,7 @@ public abstract class FloatVector extends AbstractVector<Float> {
this,
a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
@ -3154,67 +3221,40 @@ public abstract class FloatVector extends AbstractVector<Float> {
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, byteSize(), a.length);
maybeSwap(bo).intoByteArray0(a, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo,
VectorMask<Float> m) {
if (m.allTrue()) {
intoByteArray(a, offset, bo);
} else {
FloatSpecies vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, 4, a.length);
maybeSwap(bo).intoByteArray0(a, offset, m);
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo) {
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
offset = checkFromIndexSize(offset, byteSize(), ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo) {
if (ScopedMemoryAccess.isReadOnly(bb)) {
throw new ReadOnlyBufferException();
}
offset = checkFromIndexSize(offset, byteSize(), bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<Float> m) {
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<Float> m) {
if (m.allTrue()) {
intoByteBuffer(bb, offset, bo);
intoMemorySegment(ms, offset, bo);
} else {
if (bb.isReadOnly()) {
throw new ReadOnlyBufferException();
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
FloatSpecies vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, 4, bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset, m);
checkMaskFromIndexSize(offset, vsp, m, 4, ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset, m);
}
}
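A rough round-trip sketch of the stores above (a read-only segment would throw UnsupportedOperationException, as coded); names and values are illustrative.

// Illustrative only; JDK 19 with --enable-preview --add-modules jdk.incubator.vector.
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.ByteOrder;
import jdk.incubator.vector.*;

class IntoSegmentSketch {
    public static void main(String[] args) {
        VectorSpecies<Float> sp = FloatVector.SPECIES_128;                 // 4 lanes
        ValueLayout.OfFloat F = ValueLayout.JAVA_FLOAT.withBitAlignment(8);
        FloatVector v = FloatVector.fromArray(sp, new float[]{1f, 2f, 3f, 4f}, 0);
        MemorySegment ms = MemorySegment.ofArray(new byte[sp.vectorByteSize()]);
        v.intoMemorySegment(ms, 0L, ByteOrder.nativeOrder());              // unmasked store of all lanes
        System.out.println(ms.getAtIndex(F, 2));                           // 3.0
        VectorMask<Float> m = VectorMask.fromValues(sp, true, true, false, false);
        FloatVector.zero(sp).intoMemorySegment(ms, 0L, ByteOrder.nativeOrder(), m);  // overwrites lanes 0-1 only
        System.out.println(ms.getAtIndex(F, 0) + " " + ms.getAtIndex(F, 3));         // 0.0 4.0
    }
}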
@ -3248,7 +3288,7 @@ public abstract class FloatVector extends AbstractVector<Float> {
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> s.ldOp(arr, off,
(arr, off, s) -> s.ldOp(arr, (int) off,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -3265,7 +3305,7 @@ public abstract class FloatVector extends AbstractVector<Float> {
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> s.ldOp(arr, off, vm,
(arr, off, s, vm) -> s.ldOp(arr, (int) off, vm,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -3305,74 +3345,33 @@ public abstract class FloatVector extends AbstractVector<Float> {
@Override
abstract
FloatVector fromByteArray0(byte[] a, int offset);
FloatVector fromMemorySegment0(MemorySegment bb, long offset);
@ForceInline
final
FloatVector fromByteArray0Template(byte[] a, int offset) {
FloatVector fromMemorySegment0Template(MemorySegment ms, long offset) {
FloatSpecies vsp = vspecies();
return VectorSupport.load(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.getFloat(o + i * 4));
});
}
abstract
FloatVector fromByteArray0(byte[] a, int offset, VectorMask<Float> m);
@ForceInline
final
<M extends VectorMask<Float>>
FloatVector fromByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
FloatSpecies vsp = vspecies();
m.check(vsp);
return VectorSupport.loadMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.getFloat(o + i * 4));
});
}
abstract
FloatVector fromByteBuffer0(ByteBuffer bb, int offset);
@ForceInline
final
FloatVector fromByteBuffer0Template(ByteBuffer bb, int offset) {
FloatSpecies vsp = vspecies();
return ScopedMemoryAccess.loadFromByteBuffer(
return ScopedMemoryAccess.loadFromMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
bb, offset, vsp,
(buf, off, s) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.getFloat(o + i * 4));
(AbstractMemorySegmentImpl) ms, offset, vsp,
(msp, off, s) -> {
return s.ldLongOp((MemorySegment) msp, off, FloatVector::memorySegmentGet);
});
}
abstract
FloatVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Float> m);
FloatVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Float> m);
@ForceInline
final
<M extends VectorMask<Float>>
FloatVector fromByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
FloatVector fromMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
FloatSpecies vsp = vspecies();
m.check(vsp);
return ScopedMemoryAccess.loadFromByteBufferMasked(
return ScopedMemoryAccess.loadFromMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
bb, offset, m, vsp,
(buf, off, s, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.getFloat(o + i * 4));
(AbstractMemorySegmentImpl) ms, offset, m, vsp,
(msp, off, s, vm) -> {
return s.ldLongOp((MemorySegment) msp, off, vm, FloatVector::memorySegmentGet);
});
}
@ -3391,7 +3390,7 @@ public abstract class FloatVector extends AbstractVector<Float> {
a, arrayAddress(a, offset),
this, a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_+i] = e));
}
@ -3408,7 +3407,7 @@ public abstract class FloatVector extends AbstractVector<Float> {
a, arrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm)
-> v.stOp(arr, off, vm,
-> v.stOp(arr, (int) off, vm,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
@ -3447,71 +3446,33 @@ public abstract class FloatVector extends AbstractVector<Float> {
}
abstract
void intoByteArray0(byte[] a, int offset);
@ForceInline
final
void intoByteArray0Template(byte[] a, int offset) {
void intoMemorySegment0(MemorySegment ms, long offset) {
FloatSpecies vsp = vspecies();
VectorSupport.store(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, a, offset,
(arr, off, v) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off,
(tb_, o, i, e) -> tb_.putFloat(o + i * 4, e));
});
}
abstract
void intoByteArray0(byte[] a, int offset, VectorMask<Float> m);
@ForceInline
final
<M extends VectorMask<Float>>
void intoByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
FloatSpecies vsp = vspecies();
m.check(vsp);
VectorSupport.storeMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(tb_, o, i, e) -> tb_.putFloat(o + i * 4, e));
});
}
@ForceInline
final
void intoByteBuffer0(ByteBuffer bb, int offset) {
FloatSpecies vsp = vspecies();
ScopedMemoryAccess.storeIntoByteBuffer(
ScopedMemoryAccess.storeIntoMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
this, bb, offset,
(buf, off, v) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off,
(wb_, o, i, e) -> wb_.putFloat(o + i * 4, e));
this,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v) -> {
v.stLongOp((MemorySegment) msp, off, FloatVector::memorySegmentSet);
});
}
abstract
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Float> m);
void intoMemorySegment0(MemorySegment bb, long offset, VectorMask<Float> m);
@ForceInline
final
<M extends VectorMask<Float>>
void intoByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
void intoMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
FloatSpecies vsp = vspecies();
m.check(vsp);
ScopedMemoryAccess.storeIntoByteBufferMasked(
ScopedMemoryAccess.storeIntoMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
this, m, bb, offset,
(buf, off, v, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(wb_, o, i, e) -> wb_.putFloat(o + i * 4, e));
this, m,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v, vm) -> {
v.stLongOp((MemorySegment) msp, off, vm, FloatVector::memorySegmentSet);
});
}
@ -3528,6 +3489,16 @@ public abstract class FloatVector extends AbstractVector<Float> {
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
private static
void checkMaskFromIndexSize(long offset,
FloatSpecies vsp,
VectorMask<Float> m,
int scale,
long limit) {
((AbstractMask<Float>)m)
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
@ForceInline
private void conditionalStoreNYI(int offset,
FloatSpecies vsp,
@ -3838,6 +3809,21 @@ public abstract class FloatVector extends AbstractVector<Float> {
return dummyVector().ldOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
FloatVector ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
FloatVector ldLongOp(MemorySegment memory, long offset,
VectorMask<Float> m,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
<M> void stOp(M memory, int offset, FStOp<M> f) {
@ -3852,6 +3838,20 @@ public abstract class FloatVector extends AbstractVector<Float> {
dummyVector().stOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset, FStLongOp f) {
dummyVector().stLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset,
AbstractMask<Float> m,
FStLongOp f) {
dummyVector().stLongOp(memory, offset, m, f);
}
// N.B. Make sure these constant vectors and
// masks load up correctly into registers.
//
@ -3965,3 +3965,4 @@ public abstract class FloatVector extends AbstractVector<Float> {
public static final VectorSpecies<Float> SPECIES_PREFERRED
= (FloatSpecies) VectorSpecies.ofPreferred(float.class);
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class Int128Vector extends IntVector {
(Int128Vector) v); // specialize
}
@Override
@ForceInline
public Int128Vector compress(VectorMask<Integer> m) {
return (Int128Vector)
super.compressTemplate(Int128Mask.class,
(Int128Mask) m); // specialize
}
@Override
@ForceInline
public Int128Vector expand(VectorMask<Integer> m) {
return (Int128Vector)
super.expandTemplate(Int128Mask.class,
(Int128Mask) m); // specialize
}
@Override
@ForceInline
public Int128Vector selectFrom(Vector<Integer> v) {
@ -653,6 +669,15 @@ final class Int128Vector extends IntVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Int128Mask compress() {
return (Int128Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Int128Vector.class, Int128Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -845,29 +870,15 @@ final class Int128Vector extends IntVector {
@ForceInline
@Override
final
IntVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
IntVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
IntVector fromByteArray0(byte[] a, int offset, VectorMask<Integer> m) {
return super.fromByteArray0Template(Int128Mask.class, a, offset, (Int128Mask) m); // specialize
}
@ForceInline
@Override
final
IntVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
IntVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Integer> m) {
return super.fromByteBuffer0Template(Int128Mask.class, bb, offset, (Int128Mask) m); // specialize
IntVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Integer> m) {
return super.fromMemorySegment0Template(Int128Mask.class, ms, offset, (Int128Mask) m); // specialize
}
@ForceInline
@ -895,22 +906,8 @@ final class Int128Vector extends IntVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Integer> m) {
super.intoByteArray0Template(Int128Mask.class, a, offset, (Int128Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Integer> m) {
super.intoByteBuffer0Template(Int128Mask.class, bb, offset, (Int128Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Integer> m) {
super.intoMemorySegment0Template(Int128Mask.class, ms, offset, (Int128Mask) m);
}
@ -919,3 +916,4 @@ final class Int128Vector extends IntVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class Int256Vector extends IntVector {
(Int256Vector) v); // specialize
}
@Override
@ForceInline
public Int256Vector compress(VectorMask<Integer> m) {
return (Int256Vector)
super.compressTemplate(Int256Mask.class,
(Int256Mask) m); // specialize
}
@Override
@ForceInline
public Int256Vector expand(VectorMask<Integer> m) {
return (Int256Vector)
super.expandTemplate(Int256Mask.class,
(Int256Mask) m); // specialize
}
@Override
@ForceInline
public Int256Vector selectFrom(Vector<Integer> v) {
@ -661,6 +677,15 @@ final class Int256Vector extends IntVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Int256Mask compress() {
return (Int256Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Int256Vector.class, Int256Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -853,29 +878,15 @@ final class Int256Vector extends IntVector {
@ForceInline
@Override
final
IntVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
IntVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
IntVector fromByteArray0(byte[] a, int offset, VectorMask<Integer> m) {
return super.fromByteArray0Template(Int256Mask.class, a, offset, (Int256Mask) m); // specialize
}
@ForceInline
@Override
final
IntVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
IntVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Integer> m) {
return super.fromByteBuffer0Template(Int256Mask.class, bb, offset, (Int256Mask) m); // specialize
IntVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Integer> m) {
return super.fromMemorySegment0Template(Int256Mask.class, ms, offset, (Int256Mask) m); // specialize
}
@ForceInline
@ -903,22 +914,8 @@ final class Int256Vector extends IntVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Integer> m) {
super.intoByteArray0Template(Int256Mask.class, a, offset, (Int256Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Integer> m) {
super.intoByteBuffer0Template(Int256Mask.class, bb, offset, (Int256Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Integer> m) {
super.intoMemorySegment0Template(Int256Mask.class, ms, offset, (Int256Mask) m);
}
@ -927,3 +924,4 @@ final class Int256Vector extends IntVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class Int512Vector extends IntVector {
(Int512Vector) v); // specialize
}
@Override
@ForceInline
public Int512Vector compress(VectorMask<Integer> m) {
return (Int512Vector)
super.compressTemplate(Int512Mask.class,
(Int512Mask) m); // specialize
}
@Override
@ForceInline
public Int512Vector expand(VectorMask<Integer> m) {
return (Int512Vector)
super.expandTemplate(Int512Mask.class,
(Int512Mask) m); // specialize
}
@Override
@ForceInline
public Int512Vector selectFrom(Vector<Integer> v) {
@ -677,6 +693,15 @@ final class Int512Vector extends IntVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Int512Mask compress() {
return (Int512Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Int512Vector.class, Int512Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -869,29 +894,15 @@ final class Int512Vector extends IntVector {
@ForceInline
@Override
final
IntVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
IntVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
IntVector fromByteArray0(byte[] a, int offset, VectorMask<Integer> m) {
return super.fromByteArray0Template(Int512Mask.class, a, offset, (Int512Mask) m); // specialize
}
@ForceInline
@Override
final
IntVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
IntVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Integer> m) {
return super.fromByteBuffer0Template(Int512Mask.class, bb, offset, (Int512Mask) m); // specialize
IntVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Integer> m) {
return super.fromMemorySegment0Template(Int512Mask.class, ms, offset, (Int512Mask) m); // specialize
}
@ForceInline
@ -919,22 +930,8 @@ final class Int512Vector extends IntVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Integer> m) {
super.intoByteArray0Template(Int512Mask.class, a, offset, (Int512Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Integer> m) {
super.intoByteBuffer0Template(Int512Mask.class, bb, offset, (Int512Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Integer> m) {
super.intoMemorySegment0Template(Int512Mask.class, ms, offset, (Int512Mask) m);
}
@ -943,3 +940,4 @@ final class Int512Vector extends IntVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class Int64Vector extends IntVector {
(Int64Vector) v); // specialize
}
@Override
@ForceInline
public Int64Vector compress(VectorMask<Integer> m) {
return (Int64Vector)
super.compressTemplate(Int64Mask.class,
(Int64Mask) m); // specialize
}
@Override
@ForceInline
public Int64Vector expand(VectorMask<Integer> m) {
return (Int64Vector)
super.expandTemplate(Int64Mask.class,
(Int64Mask) m); // specialize
}
@Override
@ForceInline
public Int64Vector selectFrom(Vector<Integer> v) {
@ -649,6 +665,15 @@ final class Int64Vector extends IntVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Int64Mask compress() {
return (Int64Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Int64Vector.class, Int64Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -841,29 +866,15 @@ final class Int64Vector extends IntVector {
@ForceInline
@Override
final
IntVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
IntVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
IntVector fromByteArray0(byte[] a, int offset, VectorMask<Integer> m) {
return super.fromByteArray0Template(Int64Mask.class, a, offset, (Int64Mask) m); // specialize
}
@ForceInline
@Override
final
IntVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
IntVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Integer> m) {
return super.fromByteBuffer0Template(Int64Mask.class, bb, offset, (Int64Mask) m); // specialize
IntVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Integer> m) {
return super.fromMemorySegment0Template(Int64Mask.class, ms, offset, (Int64Mask) m); // specialize
}
@ForceInline
@ -891,22 +902,8 @@ final class Int64Vector extends IntVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Integer> m) {
super.intoByteArray0Template(Int64Mask.class, a, offset, (Int64Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Integer> m) {
super.intoByteBuffer0Template(Int64Mask.class, bb, offset, (Int64Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Integer> m) {
super.intoMemorySegment0Template(Int64Mask.class, ms, offset, (Int64Mask) m);
}
@ -915,3 +912,4 @@ final class Int64Vector extends IntVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class IntMaxVector extends IntVector {
(IntMaxVector) v); // specialize
}
@Override
@ForceInline
public IntMaxVector compress(VectorMask<Integer> m) {
return (IntMaxVector)
super.compressTemplate(IntMaxMask.class,
(IntMaxMask) m); // specialize
}
@Override
@ForceInline
public IntMaxVector expand(VectorMask<Integer> m) {
return (IntMaxVector)
super.expandTemplate(IntMaxMask.class,
(IntMaxMask) m); // specialize
}
@Override
@ForceInline
public IntMaxVector selectFrom(Vector<Integer> v) {
@ -647,6 +663,15 @@ final class IntMaxVector extends IntVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public IntMaxMask compress() {
return (IntMaxMask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
IntMaxVector.class, IntMaxMask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -850,29 +875,15 @@ final class IntMaxVector extends IntVector {
@ForceInline
@Override
final
IntVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
IntVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
IntVector fromByteArray0(byte[] a, int offset, VectorMask<Integer> m) {
return super.fromByteArray0Template(IntMaxMask.class, a, offset, (IntMaxMask) m); // specialize
}
@ForceInline
@Override
final
IntVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
IntVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Integer> m) {
return super.fromByteBuffer0Template(IntMaxMask.class, bb, offset, (IntMaxMask) m); // specialize
IntVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Integer> m) {
return super.fromMemorySegment0Template(IntMaxMask.class, ms, offset, (IntMaxMask) m); // specialize
}
@ForceInline
@ -900,22 +911,8 @@ final class IntMaxVector extends IntVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Integer> m) {
super.intoByteArray0Template(IntMaxMask.class, a, offset, (IntMaxMask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Integer> m) {
super.intoByteBuffer0Template(IntMaxMask.class, bb, offset, (IntMaxMask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Integer> m) {
super.intoMemorySegment0Template(IntMaxMask.class, ms, offset, (IntMaxMask) m);
}
@ -924,3 +921,4 @@ final class IntMaxVector extends IntVector {
// ================================================
}

View file

@ -24,14 +24,14 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.ByteOrder;
import java.nio.ReadOnlyBufferException;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.Function;
import java.util.function.UnaryOperator;
import jdk.internal.foreign.AbstractMemorySegmentImpl;
import jdk.internal.misc.ScopedMemoryAccess;
import jdk.internal.misc.Unsafe;
import jdk.internal.vm.annotation.ForceInline;
@ -57,6 +57,8 @@ public abstract class IntVector extends AbstractVector<Integer> {
static final int FORBID_OPCODE_KIND = VO_ONLYFP;
static final ValueLayout.OfInt ELEMENT_LAYOUT = ValueLayout.JAVA_INT.withBitAlignment(8);
@ForceInline
static int opCode(Operator op) {
return VectorOperators.opCode(op, VO_OPCODE_VALID, FORBID_OPCODE_KIND);
@ -351,6 +353,45 @@ public abstract class IntVector extends AbstractVector<Integer> {
return vectorFactory(res);
}
/*package-private*/
interface FLdLongOp {
int apply(MemorySegment memory, long offset, int i);
}
/*package-private*/
@ForceInline
final
IntVector ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
//dummy; no vec = vec();
int[] res = new int[length()];
for (int i = 0; i < res.length; i++) {
res[i] = f.apply(memory, offset, i);
}
return vectorFactory(res);
}
/*package-private*/
@ForceInline
final
IntVector ldLongOp(MemorySegment memory, long offset,
VectorMask<Integer> m,
FLdLongOp f) {
//int[] vec = vec();
int[] res = new int[length()];
boolean[] mbits = ((AbstractMask<Integer>)m).getBits();
for (int i = 0; i < res.length; i++) {
if (mbits[i]) {
res[i] = f.apply(memory, offset, i);
}
}
return vectorFactory(res);
}
static int memorySegmentGet(MemorySegment ms, long o, int i) {
return ms.get(ELEMENT_LAYOUT, o + i * 4L);
}
interface FStOp<M> {
void apply(M memory, int offset, int i, int a);
}
@ -381,6 +422,40 @@ public abstract class IntVector extends AbstractVector<Integer> {
}
}
interface FStLongOp {
void apply(MemorySegment memory, long offset, int i, int a);
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
FStLongOp f) {
int[] vec = vec();
for (int i = 0; i < vec.length; i++) {
f.apply(memory, offset, i, vec[i]);
}
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
VectorMask<Integer> m,
FStLongOp f) {
int[] vec = vec();
boolean[] mbits = ((AbstractMask<Integer>)m).getBits();
for (int i = 0; i < vec.length; i++) {
if (mbits[i]) {
f.apply(memory, offset, i, vec[i]);
}
}
}
static void memorySegmentSet(MemorySegment ms, long o, int i, int e) {
ms.set(ELEMENT_LAYOUT, o + i * 4L, e);
}
// Binary test
/*package-private*/
@ -431,6 +506,36 @@ public abstract class IntVector extends AbstractVector<Integer> {
return ((int)bits);
}
static IntVector expandHelper(Vector<Integer> v, VectorMask<Integer> m) {
VectorSpecies<Integer> vsp = m.vectorSpecies();
IntVector r = (IntVector) vsp.zero();
IntVector vi = (IntVector) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(i, vi.lane(j++));
}
}
return r;
}
static IntVector compressHelper(Vector<Integer> v, VectorMask<Integer> m) {
VectorSpecies<Integer> vsp = m.vectorSpecies();
IntVector r = (IntVector) vsp.zero();
IntVector vi = (IntVector) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(j++, vi.lane(i));
}
}
return r;
}
// Static factories (other than memory operations)
// Note: A surprising behavior in javadoc
@ -620,6 +725,16 @@ public abstract class IntVector extends AbstractVector<Integer> {
v0.uOp(m, (i, a) -> (int) -a);
case VECTOR_OP_ABS: return (v0, m) ->
v0.uOp(m, (i, a) -> (int) Math.abs(a));
case VECTOR_OP_BIT_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> (int) Integer.bitCount(a));
case VECTOR_OP_TZ_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> (int) Integer.numberOfTrailingZeros(a));
case VECTOR_OP_LZ_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> (int) Integer.numberOfLeadingZeros(a));
case VECTOR_OP_REVERSE: return (v0, m) ->
v0.uOp(m, (i, a) -> (int) Integer.reverse(a));
case VECTOR_OP_REVERSE_BYTES: return (v0, m) ->
v0.uOp(m, (i, a) -> (int) Integer.reverseBytes(a));
default: return null;
}
}
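A rough sketch of the new lanewise unary operators whose scalar fallbacks appear above; it assumes the corresponding VectorOperators constants introduced by this change (BIT_COUNT, TRAILING_ZEROS_COUNT, REVERSE_BYTES), with illustrative values.

// Illustrative only; run with --add-modules jdk.incubator.vector.
import jdk.incubator.vector.*;

class IntUnaryOpsSketch {
    public static void main(String[] args) {
        VectorSpecies<Integer> sp = IntVector.SPECIES_128;                 // 4 lanes
        IntVector v = IntVector.fromArray(sp, new int[]{1, 2, 0x80, 0xFF00}, 0);
        System.out.println(v.lanewise(VectorOperators.BIT_COUNT));             // [1, 1, 1, 8]
        System.out.println(v.lanewise(VectorOperators.TRAILING_ZEROS_COUNT));  // [0, 1, 7, 8]
        System.out.println(v.lanewise(VectorOperators.REVERSE_BYTES));         // [16777216, 33554432, -2147483648, 16711680]
    }
}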
@ -760,6 +875,10 @@ public abstract class IntVector extends AbstractVector<Integer> {
v0.bOp(v1, vm, (i, a, n) -> rotateLeft(a, (int)n));
case VECTOR_OP_RROTATE: return (v0, v1, vm) ->
v0.bOp(v1, vm, (i, a, n) -> rotateRight(a, (int)n));
case VECTOR_OP_COMPRESS_BITS: return (v0, v1, vm) ->
v0.bOp(v1, vm, (i, a, n) -> Integer.compress(a, n));
case VECTOR_OP_EXPAND_BITS: return (v0, v1, vm) ->
v0.bOp(v1, vm, (i, a, n) -> Integer.expand(a, n));
default: return null;
}
}
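Similarly, a sketch of the new lanewise COMPRESS_BITS/EXPAND_BITS operators, which the fallback above maps onto Integer.compress/Integer.expand (both new in Java 19); the VectorOperators constant names are assumed from this change and the values are illustrative.

// Illustrative only; run with --add-modules jdk.incubator.vector.
import jdk.incubator.vector.*;

class IntBitCompressSketch {
    public static void main(String[] args) {
        VectorSpecies<Integer> sp = IntVector.SPECIES_128;
        IntVector v    = IntVector.broadcast(sp, 0b1010_1010);
        IntVector bits = IntVector.broadcast(sp, 0b0000_1111);             // per-lane bit mask
        IntVector c = v.lanewise(VectorOperators.COMPRESS_BITS, bits);
        System.out.println(c);   // Integer.compress(0b1010_1010, 0b0000_1111) == 0b1010, i.e. 10 in every lane
        IntVector e = c.lanewise(VectorOperators.EXPAND_BITS, IntVector.broadcast(sp, 0b1111_0000));
        System.out.println(e);   // Integer.expand(0b1010, 0b1111_0000) == 0b1010_0000, i.e. 160 in every lane
    }
}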
@ -1745,6 +1864,7 @@ public abstract class IntVector extends AbstractVector<Integer> {
return lanewise(ABS);
}
// not (~)
/**
* Computes the bitwise logical complement ({@code ~})
@ -2371,6 +2491,45 @@ public abstract class IntVector extends AbstractVector<Integer> {
IntVector::toShuffle0);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
IntVector compress(VectorMask<Integer> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<Integer>>
IntVector compressTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return (IntVector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_COMPRESS, getClass(), masktype,
int.class, length(), this, m,
(v1, m1) -> compressHelper(v1, m1));
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
IntVector expand(VectorMask<Integer> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<Integer>>
IntVector expandTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return (IntVector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_EXPAND, getClass(), masktype,
int.class, length(), this, m,
(v1, m1) -> expandHelper(v1, m1));
}
/**
* {@inheritDoc} <!--workaround-->
*/
@ -2776,90 +2935,6 @@ public abstract class IntVector extends AbstractVector<Integer> {
return res;
}
/**
* Loads a vector from a byte array starting at an offset.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
*/
@ForceInline
public static
IntVector fromByteArray(VectorSpecies<Integer> species,
byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), a.length);
IntSpecies vsp = (IntSpecies) species;
return vsp.dummyVector().fromByteArray0(a, offset).maybeSwap(bo);
}
/**
* Loads a vector from a byte array starting at an offset
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code int} (zero).
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
* where the mask is set
*/
@ForceInline
public static
IntVector fromByteArray(VectorSpecies<Integer> species,
byte[] a, int offset,
ByteOrder bo,
VectorMask<Integer> m) {
IntSpecies vsp = (IntSpecies) species;
if (offset >= 0 && offset <= (a.length - species.vectorByteSize())) {
return vsp.dummyVector().fromByteArray0(a, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, 4, a.length);
ByteBuffer wb = wrapper(a, bo);
return vsp.ldOp(wb, offset, (AbstractMask<Integer>)m,
(wb_, o, i) -> wb_.getInt(o + i * 4));
}
/**
* Loads a vector from an array of type {@code int[]}
* starting at an offset.
@ -3032,44 +3107,49 @@ public abstract class IntVector extends AbstractVector<Integer> {
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer.
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* {@link #fromMemorySegment(VectorSpecies,MemorySegment,long,ByteOrder,VectorMask)
* fromMemorySegment()} as follows:
* <pre>{@code
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* return fromMemorySegment(species, ms, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*4 < 0}
* or {@code offset+N*4 >= bb.limit()}
* or {@code offset+N*4 >= ms.byteSize()}
* for any lane {@code N} in the vector
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
IntVector fromByteBuffer(VectorSpecies<Integer> species,
ByteBuffer bb, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), bb.limit());
IntVector fromMemorySegment(VectorSpecies<Integer> species,
MemorySegment ms, long offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), ms.byteSize());
IntSpecies vsp = (IntSpecies) species;
return vsp.dummyVector().fromByteBuffer0(bb, offset).maybeSwap(bo);
return vsp.dummyVector().fromMemorySegment0(ms, offset).maybeSwap(bo);
}
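A usage sketch for the new segment-based load, illustration only and not part of the diff; it assumes JDK 19 with --add-modules jdk.incubator.vector and --enable-preview, since java.lang.foreign is a preview API in this release:

import java.lang.foreign.MemorySegment;
import java.nio.ByteOrder;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorSpecies;

public class SegmentLoadDemo {
    public static void main(String[] args) {
        VectorSpecies<Integer> sp = IntVector.SPECIES_128;
        // Heap segments must be backed by a byte[] for vector access.
        MemorySegment ms = MemorySegment.ofArray(new byte[64]);
        IntVector v = IntVector.fromMemorySegment(sp, ms, 16L, ByteOrder.nativeOrder());
        System.out.println(v);   // four int lanes composed from bytes 16..31
    }
}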
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code int} (zero).
@ -3080,13 +3160,11 @@ public abstract class IntVector extends AbstractVector<Integer> {
* <p>
* The following pseudocode illustrates the behavior:
* <pre>{@code
* IntBuffer eb = bb.duplicate()
* .position(offset)
* .order(bo).asIntBuffer();
* var slice = ms.asSlice(offset);
* int[] ar = new int[species.length()];
* for (int n = 0; n < ar.length; n++) {
* if (m.laneIsSet(n)) {
* ar[n] = eb.get(n);
     *         ar[n] = slice.getAtIndex(ValueLayout.JAVA_INT.withBitAlignment(8), n);
* }
* }
* IntVector r = IntVector.fromArray(species, ar, 0);
@ -3100,33 +3178,36 @@ public abstract class IntVector extends AbstractVector<Integer> {
* the bytes of lane values.
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*4 < 0}
* or {@code offset+N*4 >= bb.limit()}
* or {@code offset+N*4 >= ms.byteSize()}
* for any lane {@code N} in the vector
* where the mask is set
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
IntVector fromByteBuffer(VectorSpecies<Integer> species,
ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<Integer> m) {
IntVector fromMemorySegment(VectorSpecies<Integer> species,
MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<Integer> m) {
IntSpecies vsp = (IntSpecies) species;
if (offset >= 0 && offset <= (bb.limit() - species.vectorByteSize())) {
return vsp.dummyVector().fromByteBuffer0(bb, offset, m).maybeSwap(bo);
if (offset >= 0 && offset <= (ms.byteSize() - species.vectorByteSize())) {
return vsp.dummyVector().fromMemorySegment0(ms, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, 4, bb.limit());
ByteBuffer wb = wrapper(bb, bo);
return vsp.ldOp(wb, offset, (AbstractMask<Integer>)m,
(wb_, o, i) -> wb_.getInt(o + i * 4));
checkMaskFromIndexSize(offset, vsp, m, 4, ms.byteSize());
return vsp.ldLongOp(ms, offset, m, IntVector::memorySegmentGet);
}
// Memory store operations
@ -3156,7 +3237,7 @@ public abstract class IntVector extends AbstractVector<Integer> {
this,
a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
@ -3297,67 +3378,40 @@ public abstract class IntVector extends AbstractVector<Integer> {
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, byteSize(), a.length);
maybeSwap(bo).intoByteArray0(a, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo,
VectorMask<Integer> m) {
if (m.allTrue()) {
intoByteArray(a, offset, bo);
} else {
IntSpecies vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, 4, a.length);
maybeSwap(bo).intoByteArray0(a, offset, m);
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo) {
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
offset = checkFromIndexSize(offset, byteSize(), ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo) {
if (ScopedMemoryAccess.isReadOnly(bb)) {
throw new ReadOnlyBufferException();
}
offset = checkFromIndexSize(offset, byteSize(), bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<Integer> m) {
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<Integer> m) {
if (m.allTrue()) {
intoByteBuffer(bb, offset, bo);
intoMemorySegment(ms, offset, bo);
} else {
if (bb.isReadOnly()) {
throw new ReadOnlyBufferException();
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
IntSpecies vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, 4, bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset, m);
checkMaskFromIndexSize(offset, vsp, m, 4, ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset, m);
}
}
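The corresponding masked store, which now rejects read-only segments with UnsupportedOperationException rather than the old ReadOnlyBufferException; illustration only, under the same assumptions as the load sketch above:

import java.lang.foreign.MemorySegment;
import java.nio.ByteOrder;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorMask;
import jdk.incubator.vector.VectorSpecies;

public class SegmentStoreDemo {
    public static void main(String[] args) {
        VectorSpecies<Integer> sp = IntVector.SPECIES_128;
        MemorySegment ms = MemorySegment.ofArray(new byte[64]);
        IntVector v = IntVector.broadcast(sp, 7);
        VectorMask<Integer> evens = VectorMask.fromValues(sp, true, false, true, false);
        v.intoMemorySegment(ms, 0L, ByteOrder.nativeOrder(), evens);   // writes lanes 0 and 2 only
        // A read-only view refuses writes:
        // v.intoMemorySegment(ms.asReadOnly(), 0L, ByteOrder.nativeOrder())
        // throws UnsupportedOperationException.
    }
}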
@ -3391,7 +3445,7 @@ public abstract class IntVector extends AbstractVector<Integer> {
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> s.ldOp(arr, off,
(arr, off, s) -> s.ldOp(arr, (int) off,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -3408,7 +3462,7 @@ public abstract class IntVector extends AbstractVector<Integer> {
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> s.ldOp(arr, off, vm,
(arr, off, s, vm) -> s.ldOp(arr, (int) off, vm,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -3448,74 +3502,33 @@ public abstract class IntVector extends AbstractVector<Integer> {
@Override
abstract
IntVector fromByteArray0(byte[] a, int offset);
    IntVector fromMemorySegment0(MemorySegment ms, long offset);
@ForceInline
final
IntVector fromByteArray0Template(byte[] a, int offset) {
IntVector fromMemorySegment0Template(MemorySegment ms, long offset) {
IntSpecies vsp = vspecies();
return VectorSupport.load(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.getInt(o + i * 4));
});
}
abstract
IntVector fromByteArray0(byte[] a, int offset, VectorMask<Integer> m);
@ForceInline
final
<M extends VectorMask<Integer>>
IntVector fromByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
IntSpecies vsp = vspecies();
m.check(vsp);
return VectorSupport.loadMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.getInt(o + i * 4));
});
}
abstract
IntVector fromByteBuffer0(ByteBuffer bb, int offset);
@ForceInline
final
IntVector fromByteBuffer0Template(ByteBuffer bb, int offset) {
IntSpecies vsp = vspecies();
return ScopedMemoryAccess.loadFromByteBuffer(
return ScopedMemoryAccess.loadFromMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
bb, offset, vsp,
(buf, off, s) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.getInt(o + i * 4));
(AbstractMemorySegmentImpl) ms, offset, vsp,
(msp, off, s) -> {
return s.ldLongOp((MemorySegment) msp, off, IntVector::memorySegmentGet);
});
}
abstract
IntVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Integer> m);
IntVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Integer> m);
@ForceInline
final
<M extends VectorMask<Integer>>
IntVector fromByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
IntVector fromMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
IntSpecies vsp = vspecies();
m.check(vsp);
return ScopedMemoryAccess.loadFromByteBufferMasked(
return ScopedMemoryAccess.loadFromMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
bb, offset, m, vsp,
(buf, off, s, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.getInt(o + i * 4));
(AbstractMemorySegmentImpl) ms, offset, m, vsp,
(msp, off, s, vm) -> {
return s.ldLongOp((MemorySegment) msp, off, vm, IntVector::memorySegmentGet);
});
}
@ -3534,7 +3547,7 @@ public abstract class IntVector extends AbstractVector<Integer> {
a, arrayAddress(a, offset),
this, a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_+i] = e));
}
@ -3551,7 +3564,7 @@ public abstract class IntVector extends AbstractVector<Integer> {
a, arrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm)
-> v.stOp(arr, off, vm,
-> v.stOp(arr, (int) off, vm,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
@ -3590,71 +3603,33 @@ public abstract class IntVector extends AbstractVector<Integer> {
}
abstract
void intoByteArray0(byte[] a, int offset);
@ForceInline
final
void intoByteArray0Template(byte[] a, int offset) {
void intoMemorySegment0(MemorySegment ms, long offset) {
IntSpecies vsp = vspecies();
VectorSupport.store(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, a, offset,
(arr, off, v) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off,
(tb_, o, i, e) -> tb_.putInt(o + i * 4, e));
});
}
abstract
void intoByteArray0(byte[] a, int offset, VectorMask<Integer> m);
@ForceInline
final
<M extends VectorMask<Integer>>
void intoByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
IntSpecies vsp = vspecies();
m.check(vsp);
VectorSupport.storeMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(tb_, o, i, e) -> tb_.putInt(o + i * 4, e));
});
}
@ForceInline
final
void intoByteBuffer0(ByteBuffer bb, int offset) {
IntSpecies vsp = vspecies();
ScopedMemoryAccess.storeIntoByteBuffer(
ScopedMemoryAccess.storeIntoMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
this, bb, offset,
(buf, off, v) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off,
(wb_, o, i, e) -> wb_.putInt(o + i * 4, e));
this,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v) -> {
v.stLongOp((MemorySegment) msp, off, IntVector::memorySegmentSet);
});
}
abstract
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Integer> m);
    void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Integer> m);
@ForceInline
final
<M extends VectorMask<Integer>>
void intoByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
void intoMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
IntSpecies vsp = vspecies();
m.check(vsp);
ScopedMemoryAccess.storeIntoByteBufferMasked(
ScopedMemoryAccess.storeIntoMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
this, m, bb, offset,
(buf, off, v, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(wb_, o, i, e) -> wb_.putInt(o + i * 4, e));
this, m,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v, vm) -> {
v.stLongOp((MemorySegment) msp, off, vm, IntVector::memorySegmentSet);
});
}
@ -3671,6 +3646,16 @@ public abstract class IntVector extends AbstractVector<Integer> {
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
private static
void checkMaskFromIndexSize(long offset,
IntSpecies vsp,
VectorMask<Integer> m,
int scale,
long limit) {
((AbstractMask<Integer>)m)
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
@ForceInline
private void conditionalStoreNYI(int offset,
IntSpecies vsp,
@ -3981,6 +3966,21 @@ public abstract class IntVector extends AbstractVector<Integer> {
return dummyVector().ldOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
IntVector ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
IntVector ldLongOp(MemorySegment memory, long offset,
VectorMask<Integer> m,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
<M> void stOp(M memory, int offset, FStOp<M> f) {
@ -3995,6 +3995,20 @@ public abstract class IntVector extends AbstractVector<Integer> {
dummyVector().stOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset, FStLongOp f) {
dummyVector().stLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset,
AbstractMask<Integer> m,
FStLongOp f) {
dummyVector().stLongOp(memory, offset, m, f);
}
// N.B. Make sure these constant vectors and
// masks load up correctly into registers.
//
@ -4108,3 +4122,4 @@ public abstract class IntVector extends AbstractVector<Integer> {
public static final VectorSpecies<Integer> SPECIES_PREFERRED
= (IntSpecies) VectorSpecies.ofPreferred(int.class);
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -464,6 +464,22 @@ final class Long128Vector extends LongVector {
(Long128Vector) v); // specialize
}
@Override
@ForceInline
public Long128Vector compress(VectorMask<Long> m) {
return (Long128Vector)
super.compressTemplate(Long128Mask.class,
(Long128Mask) m); // specialize
}
@Override
@ForceInline
public Long128Vector expand(VectorMask<Long> m) {
return (Long128Vector)
super.expandTemplate(Long128Mask.class,
(Long128Mask) m); // specialize
}
@Override
@ForceInline
public Long128Vector selectFrom(Vector<Long> v) {
@ -639,6 +655,15 @@ final class Long128Vector extends LongVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Long128Mask compress() {
return (Long128Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Long128Vector.class, Long128Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
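Mask compression packs the set lanes into the lowest lane positions while preserving trueCount(); a short sketch of the intended behavior, illustration only and not part of the diff:

import jdk.incubator.vector.LongVector;
import jdk.incubator.vector.VectorMask;
import jdk.incubator.vector.VectorSpecies;

public class MaskCompressDemo {
    public static void main(String[] args) {
        VectorSpecies<Long> sp = LongVector.SPECIES_128;             // 2 long lanes
        VectorMask<Long> m = VectorMask.fromValues(sp, false, true);
        VectorMask<Long> c = m.compress();
        System.out.println(c.laneIsSet(0));                          // true: the set lane moved to lane 0
        System.out.println(c.trueCount() == m.trueCount());          // true: lane count is preserved
    }
}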
// Binary operations
@Override
@ -831,29 +856,15 @@ final class Long128Vector extends LongVector {
@ForceInline
@Override
final
LongVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
LongVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
LongVector fromByteArray0(byte[] a, int offset, VectorMask<Long> m) {
return super.fromByteArray0Template(Long128Mask.class, a, offset, (Long128Mask) m); // specialize
}
@ForceInline
@Override
final
LongVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
LongVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Long> m) {
return super.fromByteBuffer0Template(Long128Mask.class, bb, offset, (Long128Mask) m); // specialize
LongVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Long> m) {
return super.fromMemorySegment0Template(Long128Mask.class, ms, offset, (Long128Mask) m); // specialize
}
@ForceInline
@ -881,22 +892,8 @@ final class Long128Vector extends LongVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Long> m) {
super.intoByteArray0Template(Long128Mask.class, a, offset, (Long128Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Long> m) {
super.intoByteBuffer0Template(Long128Mask.class, bb, offset, (Long128Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Long> m) {
super.intoMemorySegment0Template(Long128Mask.class, ms, offset, (Long128Mask) m);
}
@ -905,3 +902,4 @@ final class Long128Vector extends LongVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -464,6 +464,22 @@ final class Long256Vector extends LongVector {
(Long256Vector) v); // specialize
}
@Override
@ForceInline
public Long256Vector compress(VectorMask<Long> m) {
return (Long256Vector)
super.compressTemplate(Long256Mask.class,
(Long256Mask) m); // specialize
}
@Override
@ForceInline
public Long256Vector expand(VectorMask<Long> m) {
return (Long256Vector)
super.expandTemplate(Long256Mask.class,
(Long256Mask) m); // specialize
}
@Override
@ForceInline
public Long256Vector selectFrom(Vector<Long> v) {
@ -643,6 +659,15 @@ final class Long256Vector extends LongVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Long256Mask compress() {
return (Long256Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Long256Vector.class, Long256Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -835,29 +860,15 @@ final class Long256Vector extends LongVector {
@ForceInline
@Override
final
LongVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
LongVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
LongVector fromByteArray0(byte[] a, int offset, VectorMask<Long> m) {
return super.fromByteArray0Template(Long256Mask.class, a, offset, (Long256Mask) m); // specialize
}
@ForceInline
@Override
final
LongVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
LongVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Long> m) {
return super.fromByteBuffer0Template(Long256Mask.class, bb, offset, (Long256Mask) m); // specialize
LongVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Long> m) {
return super.fromMemorySegment0Template(Long256Mask.class, ms, offset, (Long256Mask) m); // specialize
}
@ForceInline
@ -885,22 +896,8 @@ final class Long256Vector extends LongVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Long> m) {
super.intoByteArray0Template(Long256Mask.class, a, offset, (Long256Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Long> m) {
super.intoByteBuffer0Template(Long256Mask.class, bb, offset, (Long256Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Long> m) {
super.intoMemorySegment0Template(Long256Mask.class, ms, offset, (Long256Mask) m);
}
@ -909,3 +906,4 @@ final class Long256Vector extends LongVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -464,6 +464,22 @@ final class Long512Vector extends LongVector {
(Long512Vector) v); // specialize
}
@Override
@ForceInline
public Long512Vector compress(VectorMask<Long> m) {
return (Long512Vector)
super.compressTemplate(Long512Mask.class,
(Long512Mask) m); // specialize
}
@Override
@ForceInline
public Long512Vector expand(VectorMask<Long> m) {
return (Long512Vector)
super.expandTemplate(Long512Mask.class,
(Long512Mask) m); // specialize
}
@Override
@ForceInline
public Long512Vector selectFrom(Vector<Long> v) {
@ -651,6 +667,15 @@ final class Long512Vector extends LongVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Long512Mask compress() {
return (Long512Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Long512Vector.class, Long512Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -843,29 +868,15 @@ final class Long512Vector extends LongVector {
@ForceInline
@Override
final
LongVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
LongVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
LongVector fromByteArray0(byte[] a, int offset, VectorMask<Long> m) {
return super.fromByteArray0Template(Long512Mask.class, a, offset, (Long512Mask) m); // specialize
}
@ForceInline
@Override
final
LongVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
LongVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Long> m) {
return super.fromByteBuffer0Template(Long512Mask.class, bb, offset, (Long512Mask) m); // specialize
LongVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Long> m) {
return super.fromMemorySegment0Template(Long512Mask.class, ms, offset, (Long512Mask) m); // specialize
}
@ForceInline
@ -893,22 +904,8 @@ final class Long512Vector extends LongVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Long> m) {
super.intoByteArray0Template(Long512Mask.class, a, offset, (Long512Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Long> m) {
super.intoByteBuffer0Template(Long512Mask.class, bb, offset, (Long512Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Long> m) {
super.intoMemorySegment0Template(Long512Mask.class, ms, offset, (Long512Mask) m);
}
@ -917,3 +914,4 @@ final class Long512Vector extends LongVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -464,6 +464,22 @@ final class Long64Vector extends LongVector {
(Long64Vector) v); // specialize
}
@Override
@ForceInline
public Long64Vector compress(VectorMask<Long> m) {
return (Long64Vector)
super.compressTemplate(Long64Mask.class,
(Long64Mask) m); // specialize
}
@Override
@ForceInline
public Long64Vector expand(VectorMask<Long> m) {
return (Long64Vector)
super.expandTemplate(Long64Mask.class,
(Long64Mask) m); // specialize
}
@Override
@ForceInline
public Long64Vector selectFrom(Vector<Long> v) {
@ -637,6 +653,15 @@ final class Long64Vector extends LongVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Long64Mask compress() {
return (Long64Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Long64Vector.class, Long64Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -829,29 +854,15 @@ final class Long64Vector extends LongVector {
@ForceInline
@Override
final
LongVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
LongVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
LongVector fromByteArray0(byte[] a, int offset, VectorMask<Long> m) {
return super.fromByteArray0Template(Long64Mask.class, a, offset, (Long64Mask) m); // specialize
}
@ForceInline
@Override
final
LongVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
LongVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Long> m) {
return super.fromByteBuffer0Template(Long64Mask.class, bb, offset, (Long64Mask) m); // specialize
LongVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Long> m) {
return super.fromMemorySegment0Template(Long64Mask.class, ms, offset, (Long64Mask) m); // specialize
}
@ForceInline
@ -879,22 +890,8 @@ final class Long64Vector extends LongVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Long> m) {
super.intoByteArray0Template(Long64Mask.class, a, offset, (Long64Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Long> m) {
super.intoByteBuffer0Template(Long64Mask.class, bb, offset, (Long64Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Long> m) {
super.intoMemorySegment0Template(Long64Mask.class, ms, offset, (Long64Mask) m);
}
@ -903,3 +900,4 @@ final class Long64Vector extends LongVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -464,6 +464,22 @@ final class LongMaxVector extends LongVector {
(LongMaxVector) v); // specialize
}
@Override
@ForceInline
public LongMaxVector compress(VectorMask<Long> m) {
return (LongMaxVector)
super.compressTemplate(LongMaxMask.class,
(LongMaxMask) m); // specialize
}
@Override
@ForceInline
public LongMaxVector expand(VectorMask<Long> m) {
return (LongMaxVector)
super.expandTemplate(LongMaxMask.class,
(LongMaxMask) m); // specialize
}
@Override
@ForceInline
public LongMaxVector selectFrom(Vector<Long> v) {
@ -637,6 +653,15 @@ final class LongMaxVector extends LongVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public LongMaxMask compress() {
return (LongMaxMask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
LongMaxVector.class, LongMaxMask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -829,29 +854,15 @@ final class LongMaxVector extends LongVector {
@ForceInline
@Override
final
LongVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
LongVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
LongVector fromByteArray0(byte[] a, int offset, VectorMask<Long> m) {
return super.fromByteArray0Template(LongMaxMask.class, a, offset, (LongMaxMask) m); // specialize
}
@ForceInline
@Override
final
LongVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
LongVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Long> m) {
return super.fromByteBuffer0Template(LongMaxMask.class, bb, offset, (LongMaxMask) m); // specialize
LongVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Long> m) {
return super.fromMemorySegment0Template(LongMaxMask.class, ms, offset, (LongMaxMask) m); // specialize
}
@ForceInline
@ -879,22 +890,8 @@ final class LongMaxVector extends LongVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Long> m) {
super.intoByteArray0Template(LongMaxMask.class, a, offset, (LongMaxMask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Long> m) {
super.intoByteBuffer0Template(LongMaxMask.class, bb, offset, (LongMaxMask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Long> m) {
super.intoMemorySegment0Template(LongMaxMask.class, ms, offset, (LongMaxMask) m);
}
@ -903,3 +900,4 @@ final class LongMaxVector extends LongVector {
// ================================================
}

View file

@ -24,14 +24,14 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.ByteOrder;
import java.nio.ReadOnlyBufferException;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.Function;
import java.util.function.UnaryOperator;
import jdk.internal.foreign.AbstractMemorySegmentImpl;
import jdk.internal.misc.ScopedMemoryAccess;
import jdk.internal.misc.Unsafe;
import jdk.internal.vm.annotation.ForceInline;
@ -57,6 +57,8 @@ public abstract class LongVector extends AbstractVector<Long> {
static final int FORBID_OPCODE_KIND = VO_ONLYFP;
static final ValueLayout.OfLong ELEMENT_LAYOUT = ValueLayout.JAVA_LONG.withBitAlignment(8);
@ForceInline
static int opCode(Operator op) {
return VectorOperators.opCode(op, VO_OPCODE_VALID, FORBID_OPCODE_KIND);
@ -351,6 +353,45 @@ public abstract class LongVector extends AbstractVector<Long> {
return vectorFactory(res);
}
/*package-private*/
interface FLdLongOp {
long apply(MemorySegment memory, long offset, int i);
}
/*package-private*/
@ForceInline
final
LongVector ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
        // no existing lane values are needed; each lane is read directly from the segment
long[] res = new long[length()];
for (int i = 0; i < res.length; i++) {
res[i] = f.apply(memory, offset, i);
}
return vectorFactory(res);
}
/*package-private*/
@ForceInline
final
LongVector ldLongOp(MemorySegment memory, long offset,
VectorMask<Long> m,
FLdLongOp f) {
        // existing lane values are not needed; unset lanes default to zero
long[] res = new long[length()];
boolean[] mbits = ((AbstractMask<Long>)m).getBits();
for (int i = 0; i < res.length; i++) {
if (mbits[i]) {
res[i] = f.apply(memory, offset, i);
}
}
return vectorFactory(res);
}
static long memorySegmentGet(MemorySegment ms, long o, int i) {
return ms.get(ELEMENT_LAYOUT, o + i * 8L);
}
interface FStOp<M> {
void apply(M memory, int offset, int i, long a);
}
@ -381,6 +422,40 @@ public abstract class LongVector extends AbstractVector<Long> {
}
}
interface FStLongOp {
void apply(MemorySegment memory, long offset, int i, long a);
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
FStLongOp f) {
long[] vec = vec();
for (int i = 0; i < vec.length; i++) {
f.apply(memory, offset, i, vec[i]);
}
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
VectorMask<Long> m,
FStLongOp f) {
long[] vec = vec();
boolean[] mbits = ((AbstractMask<Long>)m).getBits();
for (int i = 0; i < vec.length; i++) {
if (mbits[i]) {
f.apply(memory, offset, i, vec[i]);
}
}
}
static void memorySegmentSet(MemorySegment ms, long o, int i, long e) {
ms.set(ELEMENT_LAYOUT, o + i * 8L, e);
}
// Binary test
/*package-private*/
@ -431,6 +506,36 @@ public abstract class LongVector extends AbstractVector<Long> {
return ((long)bits);
}
static LongVector expandHelper(Vector<Long> v, VectorMask<Long> m) {
VectorSpecies<Long> vsp = m.vectorSpecies();
LongVector r = (LongVector) vsp.zero();
LongVector vi = (LongVector) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(i, vi.lane(j++));
}
}
return r;
}
static LongVector compressHelper(Vector<Long> v, VectorMask<Long> m) {
VectorSpecies<Long> vsp = m.vectorSpecies();
LongVector r = (LongVector) vsp.zero();
LongVector vi = (LongVector) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(j++, vi.lane(i));
}
}
return r;
}
// Static factories (other than memory operations)
// Note: A surprising behavior in javadoc
@ -578,6 +683,16 @@ public abstract class LongVector extends AbstractVector<Long> {
v0.uOp(m, (i, a) -> (long) -a);
case VECTOR_OP_ABS: return (v0, m) ->
v0.uOp(m, (i, a) -> (long) Math.abs(a));
case VECTOR_OP_BIT_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> (long) Long.bitCount(a));
case VECTOR_OP_TZ_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> (long) Long.numberOfTrailingZeros(a));
case VECTOR_OP_LZ_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> (long) Long.numberOfLeadingZeros(a));
case VECTOR_OP_REVERSE: return (v0, m) ->
v0.uOp(m, (i, a) -> (long) Long.reverse(a));
case VECTOR_OP_REVERSE_BYTES: return (v0, m) ->
v0.uOp(m, (i, a) -> (long) Long.reverseBytes(a));
default: return null;
}
}
@ -718,6 +833,10 @@ public abstract class LongVector extends AbstractVector<Long> {
v0.bOp(v1, vm, (i, a, n) -> rotateLeft(a, (int)n));
case VECTOR_OP_RROTATE: return (v0, v1, vm) ->
v0.bOp(v1, vm, (i, a, n) -> rotateRight(a, (int)n));
case VECTOR_OP_COMPRESS_BITS: return (v0, v1, vm) ->
v0.bOp(v1, vm, (i, a, n) -> Long.compress(a, n));
case VECTOR_OP_EXPAND_BITS: return (v0, v1, vm) ->
v0.bOp(v1, vm, (i, a, n) -> Long.expand(a, n));
default: return null;
}
}
@ -1658,6 +1777,7 @@ public abstract class LongVector extends AbstractVector<Long> {
return lanewise(ABS);
}
// not (~)
/**
* Computes the bitwise logical complement ({@code ~})
@ -2237,6 +2357,45 @@ public abstract class LongVector extends AbstractVector<Long> {
LongVector::toShuffle0);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
LongVector compress(VectorMask<Long> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<Long>>
LongVector compressTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return (LongVector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_COMPRESS, getClass(), masktype,
long.class, length(), this, m,
(v1, m1) -> compressHelper(v1, m1));
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
LongVector expand(VectorMask<Long> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<Long>>
LongVector expandTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return (LongVector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_EXPAND, getClass(), masktype,
long.class, length(), this, m,
(v1, m1) -> expandHelper(v1, m1));
}
/**
* {@inheritDoc} <!--workaround-->
*/
@ -2637,90 +2796,6 @@ public abstract class LongVector extends AbstractVector<Long> {
return res;
}
/**
* Loads a vector from a byte array starting at an offset.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
*/
@ForceInline
public static
LongVector fromByteArray(VectorSpecies<Long> species,
byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), a.length);
LongSpecies vsp = (LongSpecies) species;
return vsp.dummyVector().fromByteArray0(a, offset).maybeSwap(bo);
}
/**
* Loads a vector from a byte array starting at an offset
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code long} (zero).
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
* where the mask is set
*/
@ForceInline
public static
LongVector fromByteArray(VectorSpecies<Long> species,
byte[] a, int offset,
ByteOrder bo,
VectorMask<Long> m) {
LongSpecies vsp = (LongSpecies) species;
if (offset >= 0 && offset <= (a.length - species.vectorByteSize())) {
return vsp.dummyVector().fromByteArray0(a, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, 8, a.length);
ByteBuffer wb = wrapper(a, bo);
return vsp.ldOp(wb, offset, (AbstractMask<Long>)m,
(wb_, o, i) -> wb_.getLong(o + i * 8));
}
/**
* Loads a vector from an array of type {@code long[]}
* starting at an offset.
@ -2911,44 +2986,49 @@ public abstract class LongVector extends AbstractVector<Long> {
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer.
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* {@link #fromMemorySegment(VectorSpecies,MemorySegment,long,ByteOrder,VectorMask)
* fromMemorySegment()} as follows:
* <pre>{@code
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* return fromMemorySegment(species, ms, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*8 < 0}
* or {@code offset+N*8 >= bb.limit()}
* or {@code offset+N*8 >= ms.byteSize()}
* for any lane {@code N} in the vector
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
LongVector fromByteBuffer(VectorSpecies<Long> species,
ByteBuffer bb, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), bb.limit());
LongVector fromMemorySegment(VectorSpecies<Long> species,
MemorySegment ms, long offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), ms.byteSize());
LongSpecies vsp = (LongSpecies) species;
return vsp.dummyVector().fromByteBuffer0(bb, offset).maybeSwap(bo);
return vsp.dummyVector().fromMemorySegment0(ms, offset).maybeSwap(bo);
}
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code long} (zero).
@ -2959,13 +3039,11 @@ public abstract class LongVector extends AbstractVector<Long> {
* <p>
* The following pseudocode illustrates the behavior:
* <pre>{@code
* LongBuffer eb = bb.duplicate()
* .position(offset)
* .order(bo).asLongBuffer();
* var slice = ms.asSlice(offset);
* long[] ar = new long[species.length()];
* for (int n = 0; n < ar.length; n++) {
* if (m.laneIsSet(n)) {
* ar[n] = eb.get(n);
     *         ar[n] = slice.getAtIndex(ValueLayout.JAVA_LONG.withBitAlignment(8), n);
* }
* }
* LongVector r = LongVector.fromArray(species, ar, 0);
@ -2979,33 +3057,36 @@ public abstract class LongVector extends AbstractVector<Long> {
* the bytes of lane values.
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*8 < 0}
* or {@code offset+N*8 >= bb.limit()}
* or {@code offset+N*8 >= ms.byteSize()}
* for any lane {@code N} in the vector
* where the mask is set
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
LongVector fromByteBuffer(VectorSpecies<Long> species,
ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<Long> m) {
LongVector fromMemorySegment(VectorSpecies<Long> species,
MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<Long> m) {
LongSpecies vsp = (LongSpecies) species;
if (offset >= 0 && offset <= (bb.limit() - species.vectorByteSize())) {
return vsp.dummyVector().fromByteBuffer0(bb, offset, m).maybeSwap(bo);
if (offset >= 0 && offset <= (ms.byteSize() - species.vectorByteSize())) {
return vsp.dummyVector().fromMemorySegment0(ms, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, 8, bb.limit());
ByteBuffer wb = wrapper(bb, bo);
return vsp.ldOp(wb, offset, (AbstractMask<Long>)m,
(wb_, o, i) -> wb_.getLong(o + i * 8));
checkMaskFromIndexSize(offset, vsp, m, 8, ms.byteSize());
return vsp.ldLongOp(ms, offset, m, LongVector::memorySegmentGet);
}
// Memory store operations
@ -3035,7 +3116,7 @@ public abstract class LongVector extends AbstractVector<Long> {
this,
a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
@ -3195,67 +3276,40 @@ public abstract class LongVector extends AbstractVector<Long> {
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, byteSize(), a.length);
maybeSwap(bo).intoByteArray0(a, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo,
VectorMask<Long> m) {
if (m.allTrue()) {
intoByteArray(a, offset, bo);
} else {
LongSpecies vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, 8, a.length);
maybeSwap(bo).intoByteArray0(a, offset, m);
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo) {
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
offset = checkFromIndexSize(offset, byteSize(), ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo) {
if (ScopedMemoryAccess.isReadOnly(bb)) {
throw new ReadOnlyBufferException();
}
offset = checkFromIndexSize(offset, byteSize(), bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<Long> m) {
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<Long> m) {
if (m.allTrue()) {
intoByteBuffer(bb, offset, bo);
intoMemorySegment(ms, offset, bo);
} else {
if (bb.isReadOnly()) {
throw new ReadOnlyBufferException();
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
LongSpecies vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, 8, bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset, m);
checkMaskFromIndexSize(offset, vsp, m, 8, ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset, m);
}
}
@ -3289,7 +3343,7 @@ public abstract class LongVector extends AbstractVector<Long> {
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> s.ldOp(arr, off,
(arr, off, s) -> s.ldOp(arr, (int) off,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -3306,7 +3360,7 @@ public abstract class LongVector extends AbstractVector<Long> {
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> s.ldOp(arr, off, vm,
(arr, off, s, vm) -> s.ldOp(arr, (int) off, vm,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -3364,74 +3418,33 @@ public abstract class LongVector extends AbstractVector<Long> {
@Override
abstract
LongVector fromByteArray0(byte[] a, int offset);
    LongVector fromMemorySegment0(MemorySegment ms, long offset);
@ForceInline
final
LongVector fromByteArray0Template(byte[] a, int offset) {
LongVector fromMemorySegment0Template(MemorySegment ms, long offset) {
LongSpecies vsp = vspecies();
return VectorSupport.load(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.getLong(o + i * 8));
});
}
abstract
LongVector fromByteArray0(byte[] a, int offset, VectorMask<Long> m);
@ForceInline
final
<M extends VectorMask<Long>>
LongVector fromByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
LongSpecies vsp = vspecies();
m.check(vsp);
return VectorSupport.loadMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.getLong(o + i * 8));
});
}
abstract
LongVector fromByteBuffer0(ByteBuffer bb, int offset);
@ForceInline
final
LongVector fromByteBuffer0Template(ByteBuffer bb, int offset) {
LongSpecies vsp = vspecies();
return ScopedMemoryAccess.loadFromByteBuffer(
return ScopedMemoryAccess.loadFromMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
bb, offset, vsp,
(buf, off, s) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.getLong(o + i * 8));
(AbstractMemorySegmentImpl) ms, offset, vsp,
(msp, off, s) -> {
return s.ldLongOp((MemorySegment) msp, off, LongVector::memorySegmentGet);
});
}
abstract
LongVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Long> m);
LongVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Long> m);
@ForceInline
final
<M extends VectorMask<Long>>
LongVector fromByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
LongVector fromMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
LongSpecies vsp = vspecies();
m.check(vsp);
return ScopedMemoryAccess.loadFromByteBufferMasked(
return ScopedMemoryAccess.loadFromMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
bb, offset, m, vsp,
(buf, off, s, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.getLong(o + i * 8));
(AbstractMemorySegmentImpl) ms, offset, m, vsp,
(msp, off, s, vm) -> {
return s.ldLongOp((MemorySegment) msp, off, vm, LongVector::memorySegmentGet);
});
}
@ -3450,7 +3463,7 @@ public abstract class LongVector extends AbstractVector<Long> {
a, arrayAddress(a, offset),
this, a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_+i] = e));
}
@ -3467,7 +3480,7 @@ public abstract class LongVector extends AbstractVector<Long> {
a, arrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm)
-> v.stOp(arr, off, vm,
-> v.stOp(arr, (int) off, vm,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
@ -3525,71 +3538,33 @@ public abstract class LongVector extends AbstractVector<Long> {
}
abstract
void intoByteArray0(byte[] a, int offset);
@ForceInline
final
void intoByteArray0Template(byte[] a, int offset) {
void intoMemorySegment0(MemorySegment ms, long offset) {
LongSpecies vsp = vspecies();
VectorSupport.store(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, a, offset,
(arr, off, v) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off,
(tb_, o, i, e) -> tb_.putLong(o + i * 8, e));
});
}
abstract
void intoByteArray0(byte[] a, int offset, VectorMask<Long> m);
@ForceInline
final
<M extends VectorMask<Long>>
void intoByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
LongSpecies vsp = vspecies();
m.check(vsp);
VectorSupport.storeMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(tb_, o, i, e) -> tb_.putLong(o + i * 8, e));
});
}
@ForceInline
final
void intoByteBuffer0(ByteBuffer bb, int offset) {
LongSpecies vsp = vspecies();
ScopedMemoryAccess.storeIntoByteBuffer(
ScopedMemoryAccess.storeIntoMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
this, bb, offset,
(buf, off, v) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off,
(wb_, o, i, e) -> wb_.putLong(o + i * 8, e));
this,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v) -> {
v.stLongOp((MemorySegment) msp, off, LongVector::memorySegmentSet);
});
}
abstract
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Long> m);
    void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Long> m);
@ForceInline
final
<M extends VectorMask<Long>>
void intoByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
void intoMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
LongSpecies vsp = vspecies();
m.check(vsp);
ScopedMemoryAccess.storeIntoByteBufferMasked(
ScopedMemoryAccess.storeIntoMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
this, m, bb, offset,
(buf, off, v, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(wb_, o, i, e) -> wb_.putLong(o + i * 8, e));
this, m,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v, vm) -> {
v.stLongOp((MemorySegment) msp, off, vm, LongVector::memorySegmentSet);
});
}
@ -3606,6 +3581,16 @@ public abstract class LongVector extends AbstractVector<Long> {
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
private static
void checkMaskFromIndexSize(long offset,
LongSpecies vsp,
VectorMask<Long> m,
int scale,
long limit) {
((AbstractMask<Long>)m)
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
@ForceInline
private void conditionalStoreNYI(int offset,
LongSpecies vsp,
@ -3907,6 +3892,21 @@ public abstract class LongVector extends AbstractVector<Long> {
return dummyVector().ldOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
LongVector ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
LongVector ldLongOp(MemorySegment memory, long offset,
VectorMask<Long> m,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
<M> void stOp(M memory, int offset, FStOp<M> f) {
@ -3921,6 +3921,20 @@ public abstract class LongVector extends AbstractVector<Long> {
dummyVector().stOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset, FStLongOp f) {
dummyVector().stLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset,
AbstractMask<Long> m,
FStLongOp f) {
dummyVector().stLongOp(memory, offset, m, f);
}
// N.B. Make sure these constant vectors and
// masks load up correctly into registers.
//
@ -4034,3 +4048,4 @@ public abstract class LongVector extends AbstractVector<Long> {
public static final VectorSpecies<Long> SPECIES_PREFERRED
= (LongSpecies) VectorSpecies.ofPreferred(long.class);
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class Short128Vector extends ShortVector {
(Short128Vector) v); // specialize
}
@Override
@ForceInline
public Short128Vector compress(VectorMask<Short> m) {
return (Short128Vector)
super.compressTemplate(Short128Mask.class,
(Short128Mask) m); // specialize
}
@Override
@ForceInline
public Short128Vector expand(VectorMask<Short> m) {
return (Short128Vector)
super.expandTemplate(Short128Mask.class,
(Short128Mask) m); // specialize
}
@Override
@ForceInline
public Short128Vector selectFrom(Vector<Short> v) {
@ -661,6 +677,15 @@ final class Short128Vector extends ShortVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Short128Mask compress() {
return (Short128Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Short128Vector.class, Short128Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -860,29 +885,15 @@ final class Short128Vector extends ShortVector {
@ForceInline
@Override
final
ShortVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
ShortVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteArray0(byte[] a, int offset, VectorMask<Short> m) {
return super.fromByteArray0Template(Short128Mask.class, a, offset, (Short128Mask) m); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Short> m) {
return super.fromByteBuffer0Template(Short128Mask.class, bb, offset, (Short128Mask) m); // specialize
ShortVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Short> m) {
return super.fromMemorySegment0Template(Short128Mask.class, ms, offset, (Short128Mask) m); // specialize
}
@ForceInline
@ -904,22 +915,8 @@ final class Short128Vector extends ShortVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Short> m) {
super.intoByteArray0Template(Short128Mask.class, a, offset, (Short128Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Short> m) {
super.intoByteBuffer0Template(Short128Mask.class, bb, offset, (Short128Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Short> m) {
super.intoMemorySegment0Template(Short128Mask.class, ms, offset, (Short128Mask) m);
}
@ForceInline
@ -934,3 +931,4 @@ final class Short128Vector extends ShortVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class Short256Vector extends ShortVector {
(Short256Vector) v); // specialize
}
@Override
@ForceInline
public Short256Vector compress(VectorMask<Short> m) {
return (Short256Vector)
super.compressTemplate(Short256Mask.class,
(Short256Mask) m); // specialize
}
@Override
@ForceInline
public Short256Vector expand(VectorMask<Short> m) {
return (Short256Vector)
super.expandTemplate(Short256Mask.class,
(Short256Mask) m); // specialize
}
@Override
@ForceInline
public Short256Vector selectFrom(Vector<Short> v) {
@ -677,6 +693,15 @@ final class Short256Vector extends ShortVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Short256Mask compress() {
return (Short256Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Short256Vector.class, Short256Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -876,29 +901,15 @@ final class Short256Vector extends ShortVector {
@ForceInline
@Override
final
ShortVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
ShortVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteArray0(byte[] a, int offset, VectorMask<Short> m) {
return super.fromByteArray0Template(Short256Mask.class, a, offset, (Short256Mask) m); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Short> m) {
return super.fromByteBuffer0Template(Short256Mask.class, bb, offset, (Short256Mask) m); // specialize
ShortVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Short> m) {
return super.fromMemorySegment0Template(Short256Mask.class, ms, offset, (Short256Mask) m); // specialize
}
@ForceInline
@ -920,22 +931,8 @@ final class Short256Vector extends ShortVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Short> m) {
super.intoByteArray0Template(Short256Mask.class, a, offset, (Short256Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Short> m) {
super.intoByteBuffer0Template(Short256Mask.class, bb, offset, (Short256Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Short> m) {
super.intoMemorySegment0Template(Short256Mask.class, ms, offset, (Short256Mask) m);
}
@ForceInline
@ -950,3 +947,4 @@ final class Short256Vector extends ShortVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class Short512Vector extends ShortVector {
(Short512Vector) v); // specialize
}
@Override
@ForceInline
public Short512Vector compress(VectorMask<Short> m) {
return (Short512Vector)
super.compressTemplate(Short512Mask.class,
(Short512Mask) m); // specialize
}
@Override
@ForceInline
public Short512Vector expand(VectorMask<Short> m) {
return (Short512Vector)
super.expandTemplate(Short512Mask.class,
(Short512Mask) m); // specialize
}
@Override
@ForceInline
public Short512Vector selectFrom(Vector<Short> v) {
@ -709,6 +725,15 @@ final class Short512Vector extends ShortVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Short512Mask compress() {
return (Short512Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Short512Vector.class, Short512Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -908,29 +933,15 @@ final class Short512Vector extends ShortVector {
@ForceInline
@Override
final
ShortVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
ShortVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteArray0(byte[] a, int offset, VectorMask<Short> m) {
return super.fromByteArray0Template(Short512Mask.class, a, offset, (Short512Mask) m); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Short> m) {
return super.fromByteBuffer0Template(Short512Mask.class, bb, offset, (Short512Mask) m); // specialize
ShortVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Short> m) {
return super.fromMemorySegment0Template(Short512Mask.class, ms, offset, (Short512Mask) m); // specialize
}
@ForceInline
@ -952,22 +963,8 @@ final class Short512Vector extends ShortVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Short> m) {
super.intoByteArray0Template(Short512Mask.class, a, offset, (Short512Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Short> m) {
super.intoByteBuffer0Template(Short512Mask.class, bb, offset, (Short512Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Short> m) {
super.intoMemorySegment0Template(Short512Mask.class, ms, offset, (Short512Mask) m);
}
@ForceInline
@ -982,3 +979,4 @@ final class Short512Vector extends ShortVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class Short64Vector extends ShortVector {
(Short64Vector) v); // specialize
}
@Override
@ForceInline
public Short64Vector compress(VectorMask<Short> m) {
return (Short64Vector)
super.compressTemplate(Short64Mask.class,
(Short64Mask) m); // specialize
}
@Override
@ForceInline
public Short64Vector expand(VectorMask<Short> m) {
return (Short64Vector)
super.expandTemplate(Short64Mask.class,
(Short64Mask) m); // specialize
}
@Override
@ForceInline
public Short64Vector selectFrom(Vector<Short> v) {
@ -653,6 +669,15 @@ final class Short64Vector extends ShortVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public Short64Mask compress() {
return (Short64Mask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
Short64Vector.class, Short64Mask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -852,29 +877,15 @@ final class Short64Vector extends ShortVector {
@ForceInline
@Override
final
ShortVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
ShortVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteArray0(byte[] a, int offset, VectorMask<Short> m) {
return super.fromByteArray0Template(Short64Mask.class, a, offset, (Short64Mask) m); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Short> m) {
return super.fromByteBuffer0Template(Short64Mask.class, bb, offset, (Short64Mask) m); // specialize
ShortVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Short> m) {
return super.fromMemorySegment0Template(Short64Mask.class, ms, offset, (Short64Mask) m); // specialize
}
@ForceInline
@ -896,22 +907,8 @@ final class Short64Vector extends ShortVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Short> m) {
super.intoByteArray0Template(Short64Mask.class, a, offset, (Short64Mask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Short> m) {
super.intoByteBuffer0Template(Short64Mask.class, bb, offset, (Short64Mask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Short> m) {
super.intoMemorySegment0Template(Short64Mask.class, ms, offset, (Short64Mask) m);
}
@ForceInline
@ -926,3 +923,4 @@ final class Short64Vector extends ShortVector {
// ================================================
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -474,6 +474,22 @@ final class ShortMaxVector extends ShortVector {
(ShortMaxVector) v); // specialize
}
@Override
@ForceInline
public ShortMaxVector compress(VectorMask<Short> m) {
return (ShortMaxVector)
super.compressTemplate(ShortMaxMask.class,
(ShortMaxMask) m); // specialize
}
@Override
@ForceInline
public ShortMaxVector expand(VectorMask<Short> m) {
return (ShortMaxVector)
super.expandTemplate(ShortMaxMask.class,
(ShortMaxMask) m); // specialize
}
@Override
@ForceInline
public ShortMaxVector selectFrom(Vector<Short> v) {
@ -647,6 +663,15 @@ final class ShortMaxVector extends ShortVector {
return xor(maskAll(true));
}
@Override
@ForceInline
public ShortMaxMask compress() {
return (ShortMaxMask)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
ShortMaxVector.class, ShortMaxMask.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -846,29 +871,15 @@ final class ShortMaxVector extends ShortVector {
@ForceInline
@Override
final
ShortVector fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
ShortVector fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteArray0(byte[] a, int offset, VectorMask<Short> m) {
return super.fromByteArray0Template(ShortMaxMask.class, a, offset, (ShortMaxMask) m); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
ShortVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Short> m) {
return super.fromByteBuffer0Template(ShortMaxMask.class, bb, offset, (ShortMaxMask) m); // specialize
ShortVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Short> m) {
return super.fromMemorySegment0Template(ShortMaxMask.class, ms, offset, (ShortMaxMask) m); // specialize
}
@ForceInline
@ -890,22 +901,8 @@ final class ShortMaxVector extends ShortVector {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<Short> m) {
super.intoByteArray0Template(ShortMaxMask.class, a, offset, (ShortMaxMask) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Short> m) {
super.intoByteBuffer0Template(ShortMaxMask.class, bb, offset, (ShortMaxMask) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<Short> m) {
super.intoMemorySegment0Template(ShortMaxMask.class, ms, offset, (ShortMaxMask) m);
}
@ForceInline
@ -920,3 +917,4 @@ final class ShortMaxVector extends ShortVector {
// ================================================
}

View file

@ -24,14 +24,14 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.ByteOrder;
import java.nio.ReadOnlyBufferException;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.Function;
import java.util.function.UnaryOperator;
import jdk.internal.foreign.AbstractMemorySegmentImpl;
import jdk.internal.misc.ScopedMemoryAccess;
import jdk.internal.misc.Unsafe;
import jdk.internal.vm.annotation.ForceInline;
@ -57,6 +57,8 @@ public abstract class ShortVector extends AbstractVector<Short> {
static final int FORBID_OPCODE_KIND = VO_ONLYFP;
static final ValueLayout.OfShort ELEMENT_LAYOUT = ValueLayout.JAVA_SHORT.withBitAlignment(8);
@ForceInline
static int opCode(Operator op) {
return VectorOperators.opCode(op, VO_OPCODE_VALID, FORBID_OPCODE_KIND);
@ -351,6 +353,45 @@ public abstract class ShortVector extends AbstractVector<Short> {
return vectorFactory(res);
}
/*package-private*/
interface FLdLongOp {
short apply(MemorySegment memory, long offset, int i);
}
/*package-private*/
@ForceInline
final
ShortVector ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
//dummy; no vec = vec();
short[] res = new short[length()];
for (int i = 0; i < res.length; i++) {
res[i] = f.apply(memory, offset, i);
}
return vectorFactory(res);
}
/*package-private*/
@ForceInline
final
ShortVector ldLongOp(MemorySegment memory, long offset,
VectorMask<Short> m,
FLdLongOp f) {
//short[] vec = vec();
short[] res = new short[length()];
boolean[] mbits = ((AbstractMask<Short>)m).getBits();
for (int i = 0; i < res.length; i++) {
if (mbits[i]) {
res[i] = f.apply(memory, offset, i);
}
}
return vectorFactory(res);
}
static short memorySegmentGet(MemorySegment ms, long o, int i) {
return ms.get(ELEMENT_LAYOUT, o + i * 2L);
}
interface FStOp<M> {
void apply(M memory, int offset, int i, short a);
}
@ -381,6 +422,40 @@ public abstract class ShortVector extends AbstractVector<Short> {
}
}
interface FStLongOp {
void apply(MemorySegment memory, long offset, int i, short a);
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
FStLongOp f) {
short[] vec = vec();
for (int i = 0; i < vec.length; i++) {
f.apply(memory, offset, i, vec[i]);
}
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
VectorMask<Short> m,
FStLongOp f) {
short[] vec = vec();
boolean[] mbits = ((AbstractMask<Short>)m).getBits();
for (int i = 0; i < vec.length; i++) {
if (mbits[i]) {
f.apply(memory, offset, i, vec[i]);
}
}
}
static void memorySegmentSet(MemorySegment ms, long o, int i, short e) {
ms.set(ELEMENT_LAYOUT, o + i * 2L, e);
}
// Binary test
/*package-private*/
@ -431,6 +506,36 @@ public abstract class ShortVector extends AbstractVector<Short> {
return ((short)bits);
}
static ShortVector expandHelper(Vector<Short> v, VectorMask<Short> m) {
VectorSpecies<Short> vsp = m.vectorSpecies();
ShortVector r = (ShortVector) vsp.zero();
ShortVector vi = (ShortVector) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(i, vi.lane(j++));
}
}
return r;
}
static ShortVector compressHelper(Vector<Short> v, VectorMask<Short> m) {
VectorSpecies<Short> vsp = m.vectorSpecies();
ShortVector r = (ShortVector) vsp.zero();
ShortVector vi = (ShortVector) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(j++, vi.lane(i));
}
}
return r;
}
// Static factories (other than memory operations)
// Note: A surprising behavior in javadoc
@ -620,6 +725,16 @@ public abstract class ShortVector extends AbstractVector<Short> {
v0.uOp(m, (i, a) -> (short) -a);
case VECTOR_OP_ABS: return (v0, m) ->
v0.uOp(m, (i, a) -> (short) Math.abs(a));
case VECTOR_OP_BIT_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> (short) bitCount(a));
case VECTOR_OP_TZ_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> (short) numberOfTrailingZeros(a));
case VECTOR_OP_LZ_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> (short) numberOfLeadingZeros(a));
case VECTOR_OP_REVERSE: return (v0, m) ->
v0.uOp(m, (i, a) -> reverse(a));
case VECTOR_OP_REVERSE_BYTES: return (v0, m) ->
v0.uOp(m, (i, a) -> (short) Short.reverseBytes(a));
default: return null;
}
}
@ -1746,6 +1861,26 @@ public abstract class ShortVector extends AbstractVector<Short> {
return lanewise(ABS);
}
static int bitCount(short a) {
return Integer.bitCount((int)a & 0xFFFF);
}
static int numberOfTrailingZeros(short a) {
return a != 0 ? Integer.numberOfTrailingZeros(a) : 16;
}
static int numberOfLeadingZeros(short a) {
return a >= 0 ? Integer.numberOfLeadingZeros(a) - 16 : 0;
}
static short reverse(short a) {
if (a == 0 || a == -1) return a;
short b = rotateLeft(a, 8);
b = (short) (((b & 0x5555) << 1) | ((b & 0xAAAA) >>> 1));
b = (short) (((b & 0x3333) << 2) | ((b & 0xCCCC) >>> 2));
b = (short) (((b & 0x0F0F) << 4) | ((b & 0xF0F0) >>> 4));
return b;
}
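The reverse helper above swaps the two bytes with an 8-bit rotate, then exchanges adjacent bits, bit pairs, and nibbles with three masked shift passes; together the four steps reverse all 16 bits. A quick editorial sanity check (a sketch only; the helper is package-private, so this would run inside the class):
    short a = (short) 0x1234;
    short viaHelper  = reverse(a);                                    // helper defined above
    short viaInteger = (short) (Integer.reverse(a & 0xFFFF) >>> 16);  // reference result
    assert viaHelper == viaInteger && viaHelper == (short) 0x2C48;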
// not (~)
/**
* Computes the bitwise logical complement ({@code ~})
@ -2372,6 +2507,45 @@ public abstract class ShortVector extends AbstractVector<Short> {
ShortVector::toShuffle0);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
ShortVector compress(VectorMask<Short> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<Short>>
ShortVector compressTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return (ShortVector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_COMPRESS, getClass(), masktype,
short.class, length(), this, m,
(v1, m1) -> compressHelper(v1, m1));
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
ShortVector expand(VectorMask<Short> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<Short>>
ShortVector expandTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return (ShortVector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_EXPAND, getClass(), masktype,
short.class, length(), this, m,
(v1, m1) -> expandHelper(v1, m1));
}
/**
* {@inheritDoc} <!--workaround-->
*/
@ -2784,90 +2958,6 @@ public abstract class ShortVector extends AbstractVector<Short> {
return res;
}
/**
* Loads a vector from a byte array starting at an offset.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
*/
@ForceInline
public static
ShortVector fromByteArray(VectorSpecies<Short> species,
byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), a.length);
ShortSpecies vsp = (ShortSpecies) species;
return vsp.dummyVector().fromByteArray0(a, offset).maybeSwap(bo);
}
/**
* Loads a vector from a byte array starting at an offset
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code short} (zero).
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
* where the mask is set
*/
@ForceInline
public static
ShortVector fromByteArray(VectorSpecies<Short> species,
byte[] a, int offset,
ByteOrder bo,
VectorMask<Short> m) {
ShortSpecies vsp = (ShortSpecies) species;
if (offset >= 0 && offset <= (a.length - species.vectorByteSize())) {
return vsp.dummyVector().fromByteArray0(a, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, 2, a.length);
ByteBuffer wb = wrapper(a, bo);
return vsp.ldOp(wb, offset, (AbstractMask<Short>)m,
(wb_, o, i) -> wb_.getShort(o + i * 2));
}
/**
* Loads a vector from an array of type {@code short[]}
* starting at an offset.
@ -3167,44 +3257,49 @@ public abstract class ShortVector extends AbstractVector<Short> {
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer.
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* {@link #fromMemorySegment(VectorSpecies,MemorySegment,long,ByteOrder,VectorMask)
* fromMemorySegment()} as follows:
* <pre>{@code
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* return fromMemorySegment(species, ms, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*2 < 0}
* or {@code offset+N*2 >= bb.limit()}
* or {@code offset+N*2 >= ms.byteSize()}
* for any lane {@code N} in the vector
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
ShortVector fromByteBuffer(VectorSpecies<Short> species,
ByteBuffer bb, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), bb.limit());
ShortVector fromMemorySegment(VectorSpecies<Short> species,
MemorySegment ms, long offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), ms.byteSize());
ShortSpecies vsp = (ShortSpecies) species;
return vsp.dummyVector().fromByteBuffer0(bb, offset).maybeSwap(bo);
return vsp.dummyVector().fromMemorySegment0(ms, offset).maybeSwap(bo);
}
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code short} (zero).
@ -3215,13 +3310,11 @@ public abstract class ShortVector extends AbstractVector<Short> {
* <p>
* The following pseudocode illustrates the behavior:
* <pre>{@code
* ShortBuffer eb = bb.duplicate()
* .position(offset)
* .order(bo).asShortBuffer();
* var slice = ms.asSlice(offset);
* short[] ar = new short[species.length()];
* for (int n = 0; n < ar.length; n++) {
* if (m.laneIsSet(n)) {
* ar[n] = eb.get(n);
* ar[n] = slice.getAtIndex(ValueLayout.JAVA_SHORT.withBitAlignment(8), n);
* }
* }
* ShortVector r = ShortVector.fromArray(species, ar, 0);
@ -3235,33 +3328,36 @@ public abstract class ShortVector extends AbstractVector<Short> {
* the bytes of lane values.
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*2 < 0}
* or {@code offset+N*2 >= bb.limit()}
* or {@code offset+N*2 >= ms.byteSize()}
* for any lane {@code N} in the vector
* where the mask is set
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
ShortVector fromByteBuffer(VectorSpecies<Short> species,
ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<Short> m) {
ShortVector fromMemorySegment(VectorSpecies<Short> species,
MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<Short> m) {
ShortSpecies vsp = (ShortSpecies) species;
if (offset >= 0 && offset <= (bb.limit() - species.vectorByteSize())) {
return vsp.dummyVector().fromByteBuffer0(bb, offset, m).maybeSwap(bo);
if (offset >= 0 && offset <= (ms.byteSize() - species.vectorByteSize())) {
return vsp.dummyVector().fromMemorySegment0(ms, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, 2, bb.limit());
ByteBuffer wb = wrapper(bb, bo);
return vsp.ldOp(wb, offset, (AbstractMask<Short>)m,
(wb_, o, i) -> wb_.getShort(o + i * 2));
checkMaskFromIndexSize(offset, vsp, m, 2, ms.byteSize());
return vsp.ldLongOp(ms, offset, m, ShortVector::memorySegmentGet);
}
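A short usage sketch of the two segment-based factories above (editorial illustration, not part of this patch; assumes the JDK 19 incubator vector module and the preview FFM API, with `count` shorts held in a caller-allocated native segment):
    try (MemorySession session = MemorySession.openConfined()) {
        int count = 100;
        MemorySegment seg = MemorySegment.allocateNative((long) count * Short.BYTES, session);
        VectorSpecies<Short> SP = ShortVector.SPECIES_PREFERRED;
        int i = 0;
        for (; i < SP.loopBound(count); i += SP.length()) {
            ShortVector v = ShortVector.fromMemorySegment(SP, seg, (long) i * Short.BYTES,
                                                          ByteOrder.nativeOrder());
            v.add((short) 1).intoMemorySegment(seg, (long) i * Short.BYTES, ByteOrder.nativeOrder());
        }
        // Masked tail: unset lanes are zero-filled on load and skipped on store.
        VectorMask<Short> m = SP.indexInRange(i, count);
        ShortVector tail = ShortVector.fromMemorySegment(SP, seg, (long) i * Short.BYTES,
                                                         ByteOrder.nativeOrder(), m);
        tail.add((short) 1).intoMemorySegment(seg, (long) i * Short.BYTES, ByteOrder.nativeOrder(), m);
    }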
// Memory store operations
@ -3291,7 +3387,7 @@ public abstract class ShortVector extends AbstractVector<Short> {
this,
a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
@ -3437,7 +3533,7 @@ public abstract class ShortVector extends AbstractVector<Short> {
this,
a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_ + i] = (char) e));
}
@ -3567,67 +3663,40 @@ public abstract class ShortVector extends AbstractVector<Short> {
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, byteSize(), a.length);
maybeSwap(bo).intoByteArray0(a, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo,
VectorMask<Short> m) {
if (m.allTrue()) {
intoByteArray(a, offset, bo);
} else {
ShortSpecies vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, 2, a.length);
maybeSwap(bo).intoByteArray0(a, offset, m);
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo) {
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
offset = checkFromIndexSize(offset, byteSize(), ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo) {
if (ScopedMemoryAccess.isReadOnly(bb)) {
throw new ReadOnlyBufferException();
}
offset = checkFromIndexSize(offset, byteSize(), bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<Short> m) {
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<Short> m) {
if (m.allTrue()) {
intoByteBuffer(bb, offset, bo);
intoMemorySegment(ms, offset, bo);
} else {
if (bb.isReadOnly()) {
throw new ReadOnlyBufferException();
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
ShortSpecies vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, 2, bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset, m);
checkMaskFromIndexSize(offset, vsp, m, 2, ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset, m);
}
}
@ -3661,7 +3730,7 @@ public abstract class ShortVector extends AbstractVector<Short> {
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> s.ldOp(arr, off,
(arr, off, s) -> s.ldOp(arr, (int) off,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -3678,7 +3747,7 @@ public abstract class ShortVector extends AbstractVector<Short> {
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> s.ldOp(arr, off, vm,
(arr, off, s, vm) -> s.ldOp(arr, (int) off, vm,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -3694,7 +3763,7 @@ public abstract class ShortVector extends AbstractVector<Short> {
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, charArrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> s.ldOp(arr, off,
(arr, off, s) -> s.ldOp(arr, (int) off,
(arr_, off_, i) -> (short) arr_[off_ + i]));
}
@ -3711,79 +3780,38 @@ public abstract class ShortVector extends AbstractVector<Short> {
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, charArrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> s.ldOp(arr, off, vm,
(arr, off, s, vm) -> s.ldOp(arr, (int) off, vm,
(arr_, off_, i) -> (short) arr_[off_ + i]));
}
@Override
abstract
ShortVector fromByteArray0(byte[] a, int offset);
ShortVector fromMemorySegment0(MemorySegment bb, long offset);
@ForceInline
final
ShortVector fromByteArray0Template(byte[] a, int offset) {
ShortVector fromMemorySegment0Template(MemorySegment ms, long offset) {
ShortSpecies vsp = vspecies();
return VectorSupport.load(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.getShort(o + i * 2));
});
}
abstract
ShortVector fromByteArray0(byte[] a, int offset, VectorMask<Short> m);
@ForceInline
final
<M extends VectorMask<Short>>
ShortVector fromByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
ShortSpecies vsp = vspecies();
m.check(vsp);
return VectorSupport.loadMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.getShort(o + i * 2));
});
}
abstract
ShortVector fromByteBuffer0(ByteBuffer bb, int offset);
@ForceInline
final
ShortVector fromByteBuffer0Template(ByteBuffer bb, int offset) {
ShortSpecies vsp = vspecies();
return ScopedMemoryAccess.loadFromByteBuffer(
return ScopedMemoryAccess.loadFromMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
bb, offset, vsp,
(buf, off, s) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.getShort(o + i * 2));
(AbstractMemorySegmentImpl) ms, offset, vsp,
(msp, off, s) -> {
return s.ldLongOp((MemorySegment) msp, off, ShortVector::memorySegmentGet);
});
}
abstract
ShortVector fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<Short> m);
ShortVector fromMemorySegment0(MemorySegment ms, long offset, VectorMask<Short> m);
@ForceInline
final
<M extends VectorMask<Short>>
ShortVector fromByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
ShortVector fromMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
ShortSpecies vsp = vspecies();
m.check(vsp);
return ScopedMemoryAccess.loadFromByteBufferMasked(
return ScopedMemoryAccess.loadFromMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
bb, offset, m, vsp,
(buf, off, s, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.getShort(o + i * 2));
(AbstractMemorySegmentImpl) ms, offset, m, vsp,
(msp, off, s, vm) -> {
return s.ldLongOp((MemorySegment) msp, off, vm, ShortVector::memorySegmentGet);
});
}
@ -3802,7 +3830,7 @@ public abstract class ShortVector extends AbstractVector<Short> {
a, arrayAddress(a, offset),
this, a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_+i] = e));
}
@ -3819,77 +3847,39 @@ public abstract class ShortVector extends AbstractVector<Short> {
a, arrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm)
-> v.stOp(arr, off, vm,
-> v.stOp(arr, (int) off, vm,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
abstract
void intoByteArray0(byte[] a, int offset);
@ForceInline
final
void intoByteArray0Template(byte[] a, int offset) {
void intoMemorySegment0(MemorySegment ms, long offset) {
ShortSpecies vsp = vspecies();
VectorSupport.store(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, a, offset,
(arr, off, v) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off,
(tb_, o, i, e) -> tb_.putShort(o + i * 2, e));
});
}
abstract
void intoByteArray0(byte[] a, int offset, VectorMask<Short> m);
@ForceInline
final
<M extends VectorMask<Short>>
void intoByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
ShortSpecies vsp = vspecies();
m.check(vsp);
VectorSupport.storeMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(tb_, o, i, e) -> tb_.putShort(o + i * 2, e));
});
}
@ForceInline
final
void intoByteBuffer0(ByteBuffer bb, int offset) {
ShortSpecies vsp = vspecies();
ScopedMemoryAccess.storeIntoByteBuffer(
ScopedMemoryAccess.storeIntoMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
this, bb, offset,
(buf, off, v) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off,
(wb_, o, i, e) -> wb_.putShort(o + i * 2, e));
this,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v) -> {
v.stLongOp((MemorySegment) msp, off, ShortVector::memorySegmentSet);
});
}
abstract
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<Short> m);
void intoMemorySegment0(MemorySegment bb, long offset, VectorMask<Short> m);
@ForceInline
final
<M extends VectorMask<Short>>
void intoByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
void intoMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
ShortSpecies vsp = vspecies();
m.check(vsp);
ScopedMemoryAccess.storeIntoByteBufferMasked(
ScopedMemoryAccess.storeIntoMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
this, m, bb, offset,
(buf, off, v, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(wb_, o, i, e) -> wb_.putShort(o + i * 2, e));
this, m,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v, vm) -> {
v.stLongOp((MemorySegment) msp, off, vm, ShortVector::memorySegmentSet);
});
}
@ -3907,7 +3897,7 @@ public abstract class ShortVector extends AbstractVector<Short> {
a, charArrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm)
-> v.stOp(arr, off, vm,
-> v.stOp(arr, (int) off, vm,
(arr_, off_, i, e) -> arr_[off_ + i] = (char) e));
}
@ -3923,6 +3913,16 @@ public abstract class ShortVector extends AbstractVector<Short> {
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
private static
void checkMaskFromIndexSize(long offset,
ShortSpecies vsp,
VectorMask<Short> m,
int scale,
long limit) {
((AbstractMask<Short>)m)
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
@ForceInline
private void conditionalStoreNYI(int offset,
ShortSpecies vsp,
@ -4250,6 +4250,21 @@ public abstract class ShortVector extends AbstractVector<Short> {
return dummyVector().ldOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
ShortVector ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
ShortVector ldLongOp(MemorySegment memory, long offset,
VectorMask<Short> m,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
<M> void stOp(M memory, int offset, FStOp<M> f) {
@ -4264,6 +4279,20 @@ public abstract class ShortVector extends AbstractVector<Short> {
dummyVector().stOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset, FStLongOp f) {
dummyVector().stLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset,
AbstractMask<Short> m,
FStLongOp f) {
dummyVector().stLongOp(memory, offset, m, f);
}
// N.B. Make sure these constant vectors and
// masks load up correctly into registers.
//
@ -4377,3 +4406,4 @@ public abstract class ShortVector extends AbstractVector<Short> {
public static final VectorSpecies<Short> SPECIES_PREFERRED
= (ShortSpecies) VectorSpecies.ofPreferred(short.class);
}

View file

@ -24,7 +24,8 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.nio.ByteOrder;
import java.util.Arrays;
@ -763,11 +764,11 @@ import java.util.Arrays;
* first vector lane value occupies the first position in memory, and so on,
* up to the length of the vector. Further, the memory order of stored
* vector lanes corresponds to increasing index values in a Java array or
* in a {@link java.nio.ByteBuffer}.
* in a {@link java.lang.foreign.MemorySegment}.
*
* <p> Byte order for lane storage is chosen such that the stored
* vector values can be read or written as single primitive values,
* within the array or buffer that holds the vector, producing the
* within the array or segment that holds the vector, producing the
* same values as the lane-wise values within the vector.
* This fact is independent of the convenient fiction that lane values
* inside of vectors are stored in little-endian order.
@ -1039,6 +1040,12 @@ import java.util.Arrays;
* can encode a mathematical permutation as well as many other
* patterns of data movement.
*
* <li>The {@link #compress(VectorMask)} and {@link #expand(VectorMask)}
* methods, which select up to {@code VLENGTH} lanes from an
* input vector, and assemble them in lane order. The selection of lanes
* is controlled by a {@code VectorMask}, whose set lanes map source lanes
* to destination lanes by compression or expansion in lane order.
*
* </ul>
* <p> Some vector operations are not lane-wise, but rather move data
* across lane boundaries. Such operations are typically rare in SIMD
@ -2689,6 +2696,46 @@ public abstract class Vector<E> extends jdk.internal.vm.vector.VectorSupport.Vec
*/
public abstract Vector<E> rearrange(VectorShuffle<E> s, Vector<E> v);
/**
* Compresses the lane elements of this vector selecting lanes
* under the control of a specific mask.
*
* This is a cross-lane operation that compresses the lane
* elements of this vector as selected by the specified mask.
*
* For each lane {@code N} of the mask, if the mask at
* lane {@code N} is set, the element at lane {@code N}
* of the input vector is selected and stored into the output
* vector contiguously starting from lane {@code 0}.
* All the upper remaining lanes, if any, of the output
* vector are set to zero.
*
* @param m the mask controlling the compression
* @return the compressed lane elements of this vector
* @since 19
*/
public abstract Vector<E> compress(VectorMask<E> m);
/**
* Expands the lane elements of this vector
* under the control of a specific mask.
*
* This is a cross-lane operation that expands the contiguous lane
* elements of this vector into lanes of an output vector
* as selected by the specified mask.
*
* For each lane {@code N} of the mask, if the mask at
* lane {@code N} is set, the next contiguous element of the input vector
* starting from lane {@code 0} is selected and stored into the output
* vector at lane {@code N}.
* All the remaining lanes, if any, of the output vector are set to zero.
*
* @param m the mask controlling the expansion
* @return the expanded lane elements of this vector
* @since 19
*/
public abstract Vector<E> expand(VectorMask<E> m);
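A concrete four-lane illustration of the two cross-lane operations above (editorial sketch; any species behaves the same way):
    VectorSpecies<Integer> SP = IntVector.SPECIES_128;                        // 4 int lanes
    IntVector v = IntVector.fromArray(SP, new int[] {10, 20, 30, 40}, 0);
    VectorMask<Integer> m = VectorMask.fromValues(SP, true, false, true, false);
    IntVector c = v.compress(m);   // [10, 30, 0, 0]  -- set-lane elements packed toward lane 0
    IntVector e = v.expand(m);     // [10, 0, 20, 0]  -- consecutive source elements scattered to the set lanes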
/**
* Using index values stored in the lanes of this vector,
* assemble values stored in second vector {@code v}.
@ -2854,9 +2901,8 @@ public abstract class Vector<E> extends jdk.internal.vm.vector.VectorSupport.Vec
* implementation costs.
*
* <p> The method behaves as if this vector is stored into a byte
* buffer or array using little-endian byte ordering and then the
* desired vector is loaded from the same byte buffer or array
* using the same ordering.
* array using little-endian byte ordering and then the desired vector is loaded from the same byte
* array using the same ordering.
*
* <p> The following pseudocode illustrates the behavior:
* <pre>{@code
@ -2865,15 +2911,15 @@ public abstract class Vector<E> extends jdk.internal.vm.vector.VectorSupport.Vec
* int M = (domSize > ranSize ? domSize / ranSize : ranSize / domSize);
* assert Math.abs(part) < M;
* assert (part == 0) || (part > 0) == (domSize > ranSize);
* byte[] ra = new byte[Math.max(domSize, ranSize)];
* MemorySegment ms = MemorySegment.ofArray(new byte[Math.max(domSize, ranSize)]);
* if (domSize > ranSize) { // expansion
* this.intoByteArray(ra, 0, ByteOrder.native());
* this.intoMemorySegment(ms, 0, ByteOrder.native());
* int origin = part * ranSize;
* return species.fromByteArray(ra, origin, ByteOrder.native());
* return species.fromMemorySegment(ms, origin, ByteOrder.native());
* } else { // contraction or size-invariant
* int origin = (-part) * domSize;
* this.intoByteArray(ra, origin, ByteOrder.native());
* return species.fromByteArray(ra, 0, ByteOrder.native());
* this.intoMemorySegment(ms, origin, ByteOrder.native());
* return species.fromMemorySegment(ms, 0, ByteOrder.native());
* }
* }</pre>
*
@ -2910,8 +2956,8 @@ public abstract class Vector<E> extends jdk.internal.vm.vector.VectorSupport.Vec
*
* @return a {@code ByteVector} with the same shape and information content
* @see Vector#reinterpretShape(VectorSpecies,int)
* @see IntVector#intoByteArray(byte[], int, ByteOrder)
* @see FloatVector#intoByteArray(byte[], int, ByteOrder)
* @see IntVector#intoMemorySegment(java.lang.foreign.MemorySegment, long, java.nio.ByteOrder)
* @see FloatVector#intoMemorySegment(java.lang.foreign.MemorySegment, long, java.nio.ByteOrder)
* @see VectorSpecies#withLanes(Class)
*/
public abstract ByteVector reinterpretAsBytes();
@ -3319,8 +3365,8 @@ public abstract class Vector<E> extends jdk.internal.vm.vector.VectorSupport.Vec
//Array stores
/**
* Stores this vector into a byte array starting at an offset
* using explicit byte order.
* Stores this vector into a {@linkplain MemorySegment memory segment}
* starting at an offset using explicit byte order.
* <p>
* Bytes are extracted from primitive lane elements according
* to the specified byte ordering.
@ -3328,88 +3374,33 @@ public abstract class Vector<E> extends jdk.internal.vm.vector.VectorSupport.Vec
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it calls
* {@link #intoByteBuffer(ByteBuffer,int,ByteOrder,VectorMask)
* intoByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* var m = maskAll(true);
* intoByteBuffer(bb, offset, bo, m);
* }</pre>
*
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
*/
public abstract void intoByteArray(byte[] a, int offset,
ByteOrder bo);
/**
* Stores this vector into a byte array starting at an offset
* using explicit byte order and a mask.
* <p>
* Bytes are extracted from primitive lane elements according
* to the specified byte ordering.
* The lanes are stored according to their
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it calls
* {@link #intoByteBuffer(ByteBuffer,int,ByteOrder,VectorMask)
* intoByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* intoByteBuffer(bb, offset, bo, m);
* }</pre>
*
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
* where the mask is set
*/
public abstract void intoByteArray(byte[] a, int offset,
ByteOrder bo,
VectorMask<E> m);
/**
* Stores this vector into a byte buffer starting at an offset
* using explicit byte order.
* <p>
* Bytes are extracted from primitive lane elements according
* to the specified byte ordering.
* The lanes are stored according to their
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it calls
* {@link #intoByteBuffer(ByteBuffer,int,ByteOrder,VectorMask)
* intoByteBuffer()} as follows:
* {@link #intoMemorySegment(MemorySegment,long,ByteOrder,VectorMask)
* intoMemorySegment()} as follows:
* <pre>{@code
* var m = maskAll(true);
* intoByteBuffer(bb, offset, bo, m);
* intoMemorySegment(ms, offset, bo, m);
* }</pre>
*
* @param bb the byte buffer
* @param offset the offset into the array
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > bb.limit()}
* or {@code offset+(N+1)*ESIZE > ms.byteSize()}
* for any lane {@code N} in the vector
* @throws java.nio.ReadOnlyBufferException
* if the byte buffer is read-only
* @throws UnsupportedOperationException
* if the memory segment is read-only
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
public abstract void intoByteBuffer(ByteBuffer bb, int offset, ByteOrder bo);
public abstract void intoMemorySegment(MemorySegment ms, long offset, ByteOrder bo);
/**
* Stores this vector into a byte buffer starting at an offset
* using explicit byte order and a mask.
* Stores this vector into a {@linkplain MemorySegment memory segment}
* starting at an offset using explicit byte order and a mask.
* <p>
* Bytes are extracted from primitive lane elements according
* to the specified byte ordering.
@ -3417,28 +3408,18 @@ public abstract class Vector<E> extends jdk.internal.vm.vector.VectorSupport.Vec
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* The following pseudocode illustrates the behavior, where
* the primitive element type is not of {@code byte},
* {@code EBuffer} is the primitive buffer type, {@code ETYPE} is the
* {@code JAVA_E} is the layout of the primitive element type, {@code ETYPE} is the
* primitive element type, and {@code EVector} is the primitive
* vector type for this vector:
* <pre>{@code
* EBuffer eb = bb.duplicate()
* .position(offset)
* .order(bo).asEBuffer();
* ETYPE[] a = this.toArray();
* var slice = ms.asSlice(offset);
* for (int n = 0; n < a.length; n++) {
* if (m.laneIsSet(n)) {
* eb.put(n, a[n]);
* slice.setAtIndex(ValueLayout.JAVA_E.withBitAlignment(8), n, a[n]);
* }
* }
* }</pre>
* When the primitive element type is of {@code byte} the primitive
* byte buffer is obtained as follows, where operation on the buffer
* remains the same as in the prior pseudocode:
* <pre>{@code
* ByteBuffer eb = bb.duplicate()
* .position(offset);
* }</pre>
*
* @implNote
* This operation is likely to be more efficient if
@ -3451,20 +3432,25 @@ public abstract class Vector<E> extends jdk.internal.vm.vector.VectorSupport.Vec
* {@code byte}, the byte order argument is
* ignored.
*
* @param bb the byte buffer
* @param offset the offset into the array
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > bb.limit()}
* or {@code offset+(N+1)*ESIZE > ms.byteSize()}
* for any lane {@code N} in the vector
* where the mask is set
* @throws java.nio.ReadOnlyBufferException
* if the byte buffer is read-only
* @throws UnsupportedOperationException
* if the memory segment is read-only
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
public abstract void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo, VectorMask<E> m);
public abstract void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo, VectorMask<E> m);
/**
* Returns a packed array containing all the lane values.

View file

@ -54,6 +54,16 @@ import java.util.Objects;
}
}
@ForceInline
static long checkFromIndexSize(long ix, long vlen, long length) {
switch (VectorIntrinsics.VECTOR_ACCESS_OOB_CHECK) {
case 0: return ix; // no range check
case 1: return Objects.checkFromIndexSize(ix, vlen, length);
case 2: return Objects.checkIndex(ix, length - (vlen - 1));
default: throw new InternalError();
}
}
@ForceInline
static IntVector checkIndex(IntVector vix, int length) {
switch (VectorIntrinsics.VECTOR_ACCESS_OOB_CHECK) {
@ -92,9 +102,30 @@ import java.util.Objects;
if (index >= 0) {
return index - (index % size);
} else {
return index - Math.floorMod(index, Math.abs(size));
return index - Math.floorMod(index, size);
}
}
// If the index is not already a multiple of size,
// round it down to the next smaller multiple of size.
// It is an error if size is less than zero.
@ForceInline
static long roundDown(long index, int size) {
if ((size & (size - 1)) == 0) {
// Size is zero or a power of two, so we got this.
return index & ~(size - 1);
} else {
return roundDownNPOT(index, size);
}
}
private static long roundDownNPOT(long index, int size) {
if (index >= 0) {
return index - (index % size);
} else {
return index - Math.floorMod(index, size);
}
}
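For illustration (editorial note), both paths round toward negative infinity and leave exact multiples unchanged:
    assert roundDown(10L, 4) ==  8L;    // power-of-two path: 10 & ~3
    assert roundDown(-1L, 4) == -4L;    // negative indexes still round downward, not toward zero
    assert roundDown(10L, 6) ==  6L;    // non-power-of-two path: 10 - (10 % 6)
    assert roundDown(-7L, 6) == -12L;   // -7 - floorMod(-7, 6) = -7 - 5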
@ForceInline
static int wrapToRange(int index, int size) {
if ((size & (size - 1)) == 0) {

View file

@ -210,7 +210,7 @@ public abstract class VectorMask<E> extends jdk.internal.vm.vector.VectorSupport
bits, (long) offset + Unsafe.ARRAY_BOOLEAN_BASE_OFFSET,
bits, offset, vsp,
(c, idx, s)
-> s.opm(n -> c[idx + n]));
-> s.opm(n -> c[((int) idx) + n]));
}
/**
@ -471,6 +471,39 @@ public abstract class VectorMask<E> extends jdk.internal.vm.vector.VectorSupport
*/
public abstract VectorMask<E> indexInRange(int offset, int limit);
/**
* Removes lanes numbered {@code N} from this mask where the
* adjusted index {@code N+offset} is not in the range
* {@code [0..limit-1]}.
*
* <p> In all cases the series of set and unset lanes is assigned
* as if by using infinite precision or {@code VLENGTH-}saturating
* additions or subtractions, without overflow or wrap-around.
*
* @apiNote
*
* This method performs a SIMD emulation of the check performed by
* {@link Objects#checkIndex(long,long)}, on the index numbers in
* the range {@code [offset..offset+VLENGTH-1]}. If an exception
* is desired, the resulting mask can be compared with the
* original mask; if they are not equal, then at least one lane
* was out of range, and exception processing can be performed.
*
* <p> A mask which is a series of {@code N} set lanes followed by
* a series of unset lanes can be obtained by calling
* {@code allTrue.indexInRange(0, N)}, where {@code allTrue} is a
* mask of all true bits. A mask of {@code N1} unset lanes
* followed by {@code N2} set lanes can be obtained by calling
* {@code allTrue.indexInRange(-N1, N2)}.
*
* @param offset the starting index
* @param limit the upper-bound (exclusive) of index range
* @return the original mask, with out-of-range lanes unset
* @see VectorSpecies#indexInRange(long, long)
* @since 19
*/
public abstract VectorMask<E> indexInRange(long offset, long limit);
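A minimal sketch of the partial-mask idiom described above, assuming the usual jdk.incubator.vector imports; the species and lane count are illustrative:

VectorSpecies<Integer> SP = IntVector.SPECIES_128;            // VLENGTH == 4
VectorMask<Integer> allTrue  = SP.maskAll(true);
VectorMask<Integer> firstTwo = allTrue.indexInRange(0L, 2L);  // T T F F
VectorMask<Integer> lastTwo  = allTrue.indexInRange(-2L, 2L); // F F T T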
/**
* Returns a vector representation of this mask, the
* lane bits of which are set or unset in correspondence
@ -621,6 +654,18 @@ public abstract class VectorMask<E> extends jdk.internal.vm.vector.VectorSupport
return Objects.hash(vectorSpecies(), Arrays.hashCode(toArray()));
}
/**
* Compresses set lanes from this mask.
*
* Returns a mask which is a series of {@code N} set lanes
* followed by a series of unset lanes, where {@code N} is
* the true count of this mask.
*
* @return the compressed mask of this mask
* @since 19
*/
public abstract VectorMask<E> compress();
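A minimal sketch of mask compression (the lane pattern is illustrative):

VectorSpecies<Integer> SP = IntVector.SPECIES_128;                            // 4 lanes
VectorMask<Integer> m = VectorMask.fromValues(SP, false, true, false, true);
VectorMask<Integer> c = m.compress();   // T T F F: trueCount() set lanes packed at the low end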
// ==== JROSE NAME CHANGES ====
// TYPE CHANGED

View file

@ -452,6 +452,26 @@ public abstract class VectorOperators {
public static final Unary ABS = unary("ABS", "abs", VectorSupport.VECTOR_OP_ABS, VO_ALL);
/** Produce {@code -a}. */
public static final Unary NEG = unary("NEG", "-a", VectorSupport.VECTOR_OP_NEG, VO_ALL|VO_SPECIAL);
/** Produce {@code bitCount(a)}
* @since 19
*/
public static final Unary BIT_COUNT = unary("BIT_COUNT", "bitCount", VectorSupport.VECTOR_OP_BIT_COUNT, VO_NOFP);
/** Produce {@code numberOfTrailingZeros(a)}
* @since 19
*/
public static final Unary TRAILING_ZEROS_COUNT = unary("TRAILING_ZEROS_COUNT", "numberOfTrailingZeros", VectorSupport.VECTOR_OP_TZ_COUNT, VO_NOFP);
/** Produce {@code numberOfLeadingZeros(a)}
* @since 19
*/
public static final Unary LEADING_ZEROS_COUNT = unary("LEADING_ZEROS_COUNT", "numberOfLeadingZeros", VectorSupport.VECTOR_OP_LZ_COUNT, VO_NOFP);
/** Produce {@code reverse(a)}
* @since 19
*/
public static final Unary REVERSE = unary("REVERSE", "reverse", VectorSupport.VECTOR_OP_REVERSE, VO_NOFP);
/** Produce {@code reverseBytes(a)}
* @since 19
*/
public static final Unary REVERSE_BYTES = unary("REVERSE_BYTES", "reverseBytes", VectorSupport.VECTOR_OP_REVERSE_BYTES, VO_NOFP);
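A minimal sketch of the new unary operators through lanewise, with an illustrative species and input; each lane follows the corresponding Integer method:

IntVector v     = IntVector.broadcast(IntVector.SPECIES_128, 0x00F0);
IntVector bits  = v.lanewise(VectorOperators.BIT_COUNT);             // 4 per lane
IntVector lead  = v.lanewise(VectorOperators.LEADING_ZEROS_COUNT);   // 24 per lane
IntVector trail = v.lanewise(VectorOperators.TRAILING_ZEROS_COUNT);  // 4 per lane
IntVector rev   = v.lanewise(VectorOperators.REVERSE_BYTES);         // 0xF0000000 per lane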
/** Produce {@code sin(a)}. Floating only.
* Not guaranteed to be semi-monotonic. See section "Operations on floating point vectors" above
@ -556,6 +576,14 @@ public abstract class VectorOperators {
public static final /*bitwise*/ Binary ROL = binary("ROL", "rotateLeft", VectorSupport.VECTOR_OP_LROTATE, VO_SHIFT);
/** Produce {@code rotateRight(a,n)}. Integral only. */
public static final /*bitwise*/ Binary ROR = binary("ROR", "rotateRight", VectorSupport.VECTOR_OP_RROTATE, VO_SHIFT);
/** Produce {@code compress(a,n)}. Integral, {@code int} and {@code long}, only.
* @since 19
*/
public static final /*bitwise*/ Binary COMPRESS_BITS = binary("COMPRESS_BITS", "compressBits", VectorSupport.VECTOR_OP_COMPRESS_BITS, VO_NOFP);
/** Produce {@code expand(a,n)}. Integral, {@code int} and {@code long}, only.
* @since 19
*/
public static final /*bitwise*/ Binary EXPAND_BITS = binary("EXPAND_BITS", "expandBits", VectorSupport.VECTOR_OP_EXPAND_BITS, VO_NOFP);
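A similarly hedged sketch for the new bitwise pair, which mirrors Integer.compress and Integer.expand lane by lane (values illustrative):

IntVector v    = IntVector.broadcast(IntVector.SPECIES_128, 0b1011);
IntVector mask = IntVector.broadcast(IntVector.SPECIES_128, 0b0110);
IntVector comp = v.lanewise(VectorOperators.COMPRESS_BITS, mask);    // Integer.compress(0b1011, 0b0110) == 0b01
IntVector expd = comp.lanewise(VectorOperators.EXPAND_BITS, mask);   // Integer.expand(0b01, 0b0110) == 0b0010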
/** Produce {@code atan2(a,b)}. Floating only.
* Not guaranteed to be semi-monotonic. See section "Operations on floating point vectors" above

View file

@ -24,6 +24,8 @@
*/
package jdk.incubator.vector;
import java.lang.foreign.MemorySegment;
import java.nio.ByteOrder;
import java.util.function.IntUnaryOperator;
@ -149,11 +151,37 @@ public interface VectorSpecies<E> {
* @return the largest multiple of the vector length not greater
* than the given length
* @throws IllegalArgumentException if the {@code length} is
negative and the result would overflow to a positive value
* negative and the result would overflow to a positive value
* @see Math#floorMod(int, int)
*/
int loopBound(int length);
/**
* Loop control function which returns the largest multiple of
* {@code VLENGTH} that is less than or equal to the given
* {@code length} value.
* Here, {@code VLENGTH} is the result of {@code this.length()},
* and {@code length} is interpreted as a number of lanes.
* The resulting value {@code R} satisfies this inequality:
* <pre>{@code R <= length < R+VLENGTH}
* </pre>
* <p> Specifically, this method computes
* {@code length - floorMod(length, VLENGTH)}, where
* {@link Math#floorMod(long,int) floorMod} computes a remainder
* value by rounding its quotient toward negative infinity.
* As long as {@code VLENGTH} is a power of two, then the result
* is also equal to {@code length & ~(VLENGTH - 1)}.
*
* @param length the input length
* @return the largest multiple of the vector length not greater
* than the given length
* @throws IllegalArgumentException if the {@code length} is
* negative and the result would overflow to a positive value
* @see Math#floorMod(long, int)
* @since 19
*/
long loopBound(long length);
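A minimal sketch of the strip-mining loop this method supports, assuming a byte[]-backed heap segment, the usual jdk.incubator.vector and java.lang.foreign imports, and illustrative sizes; the tail mask uses the indexInRange(long, long) overload declared further below:

VectorSpecies<Float> SP = FloatVector.SPECIES_PREFERRED;
MemorySegment ms = MemorySegment.ofArray(new byte[4012]);   // heap segments must be byte[]-backed
long count = ms.byteSize() / Float.BYTES;                   // number of float elements
long upper = SP.loopBound(count);                           // largest multiple of VLENGTH <= count
long i = 0;
for (; i < upper; i += SP.length()) {
    FloatVector v = FloatVector.fromMemorySegment(SP, ms, i * Float.BYTES, ByteOrder.nativeOrder());
    v.add(1.0f).intoMemorySegment(ms, i * Float.BYTES, ByteOrder.nativeOrder());
}
VectorMask<Float> tail = SP.indexInRange(i, count);         // masks off the out-of-range lanes
FloatVector v = FloatVector.fromMemorySegment(SP, ms, i * Float.BYTES, ByteOrder.nativeOrder(), tail);
v.add(1.0f).intoMemorySegment(ms, i * Float.BYTES, ByteOrder.nativeOrder(), tail);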
/**
* Returns a mask of this species where only
* the lanes at index N such that the adjusted index
@ -171,6 +199,24 @@ public interface VectorSpecies<E> {
*/
VectorMask<E> indexInRange(int offset, int limit);
/**
* Returns a mask of this species where only
* the lanes at index N such that the adjusted index
* {@code N+offset} is in the range {@code [0..limit-1]}
* are set.
*
* <p>
* This method returns the value of the expression
* {@code maskAll(true).indexInRange(offset, limit)}
*
* @param offset the starting index
* @param limit the upper-bound (exclusive) of index range
* @return a mask with out-of-range lanes unset
* @see VectorMask#indexInRange(long, long)
* @since 19
*/
VectorMask<E> indexInRange(long offset, long limit);
/**
* Checks that this species has the given element type,
* and returns this species unchanged.
@ -433,31 +479,31 @@ public interface VectorSpecies<E> {
// Defined when ETYPE is known.
/**
* Loads a vector of this species from a byte array starting
* at an offset.
* Loads a vector of this species from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* Equivalent to
* {@code IntVector.fromByteArray(this,a,offset,bo)}
* or an equivalent {@code fromByteArray} method,
* {@code IntVector.fromMemorySegment(this,ms,offset,bo)},
* on the vector type corresponding to
* this species.
*
* @param a a byte array
* @param offset the index of the first byte to load
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @return a vector of the given species filled from the byte array
* @return a vector of the given species filled from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > ms.byteSize()}
* for any lane {@code N} in the vector
* @see IntVector#fromByteArray(VectorSpecies,byte[],int,ByteOrder)
* @see FloatVector#fromByteArray(VectorSpecies,byte[],int,ByteOrder)
* @see IntVector#fromMemorySegment(VectorSpecies, java.lang.foreign.MemorySegment, long, java.nio.ByteOrder)
* @see FloatVector#fromMemorySegment(VectorSpecies, java.lang.foreign.MemorySegment, long, java.nio.ByteOrder)
* @since 19
*/
Vector<E> fromByteArray(byte[] a, int offset, ByteOrder bo);
Vector<E> fromMemorySegment(MemorySegment ms, long offset, ByteOrder bo);
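A minimal sketch of the species-level load, again assuming a byte[]-backed heap segment (sizes illustrative):

VectorSpecies<Integer> SP = IntVector.SPECIES_256;          // 8 int lanes, 32 bytes
MemorySegment seg = MemorySegment.ofArray(new byte[32]);
Vector<Integer> v = SP.fromMemorySegment(seg, 0, ByteOrder.nativeOrder());
// same result as IntVector.fromMemorySegment(SP, seg, 0, ByteOrder.nativeOrder())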
/**
* Returns a mask of this species

View file

@ -24,14 +24,14 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.nio.ByteOrder;
import java.nio.ReadOnlyBufferException;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.Function;
import java.util.function.UnaryOperator;
import jdk.internal.foreign.AbstractMemorySegmentImpl;
import jdk.internal.misc.ScopedMemoryAccess;
import jdk.internal.misc.Unsafe;
import jdk.internal.vm.annotation.ForceInline;
@ -61,6 +61,8 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
static final int FORBID_OPCODE_KIND = VO_ONLYFP;
#end[FP]
static final ValueLayout.Of$Type$ ELEMENT_LAYOUT = ValueLayout.JAVA_$TYPE$.withBitAlignment(8);
@ForceInline
static int opCode(Operator op) {
return VectorOperators.opCode(op, VO_OPCODE_VALID, FORBID_OPCODE_KIND);
@ -355,6 +357,45 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
return vectorFactory(res);
}
/*package-private*/
interface FLdLongOp {
$type$ apply(MemorySegment memory, long offset, int i);
}
/*package-private*/
@ForceInline
final
$abstractvectortype$ ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
//dummy; no vec = vec();
$type$[] res = new $type$[length()];
for (int i = 0; i < res.length; i++) {
res[i] = f.apply(memory, offset, i);
}
return vectorFactory(res);
}
/*package-private*/
@ForceInline
final
$abstractvectortype$ ldLongOp(MemorySegment memory, long offset,
VectorMask<$Boxtype$> m,
FLdLongOp f) {
//$type$[] vec = vec();
$type$[] res = new $type$[length()];
boolean[] mbits = ((AbstractMask<$Boxtype$>)m).getBits();
for (int i = 0; i < res.length; i++) {
if (mbits[i]) {
res[i] = f.apply(memory, offset, i);
}
}
return vectorFactory(res);
}
static $type$ memorySegmentGet(MemorySegment ms, long o, int i) {
return ms.get(ELEMENT_LAYOUT, o + i * $sizeInBytes$L);
}
interface FStOp<M> {
void apply(M memory, int offset, int i, $type$ a);
}
@ -385,6 +426,40 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
}
}
interface FStLongOp {
void apply(MemorySegment memory, long offset, int i, $type$ a);
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
FStLongOp f) {
$type$[] vec = vec();
for (int i = 0; i < vec.length; i++) {
f.apply(memory, offset, i, vec[i]);
}
}
/*package-private*/
@ForceInline
final
void stLongOp(MemorySegment memory, long offset,
VectorMask<$Boxtype$> m,
FStLongOp f) {
$type$[] vec = vec();
boolean[] mbits = ((AbstractMask<$Boxtype$>)m).getBits();
for (int i = 0; i < vec.length; i++) {
if (mbits[i]) {
f.apply(memory, offset, i, vec[i]);
}
}
}
static void memorySegmentSet(MemorySegment ms, long o, int i, $type$ e) {
ms.set(ELEMENT_LAYOUT, o + i * $sizeInBytes$L, e);
}
// Binary test
/*package-private*/
@ -445,6 +520,36 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
return {#if[FP]?$Type$.$bitstype$BitsTo$Type$}(($bitstype$)bits);
}
static $abstractvectortype$ expandHelper(Vector<$Boxtype$> v, VectorMask<$Boxtype$> m) {
VectorSpecies<$Boxtype$> vsp = m.vectorSpecies();
$abstractvectortype$ r = ($abstractvectortype$) vsp.zero();
$abstractvectortype$ vi = ($abstractvectortype$) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(i, vi.lane(j++));
}
}
return r;
}
static $abstractvectortype$ compressHelper(Vector<$Boxtype$> v, VectorMask<$Boxtype$> m) {
VectorSpecies<$Boxtype$> vsp = m.vectorSpecies();
$abstractvectortype$ r = ($abstractvectortype$) vsp.zero();
$abstractvectortype$ vi = ($abstractvectortype$) v;
if (m.allTrue()) {
return vi;
}
for (int i = 0, j = 0; i < vsp.length(); i++) {
if (m.laneIsSet(i)) {
r = r.withLane(j++, vi.lane(i));
}
}
return r;
}
// Static factories (other than memory operations)
// Note: A surprising behavior in javadoc
@ -646,6 +751,36 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
v0.uOp(m, (i, a) -> ($type$) -a);
case VECTOR_OP_ABS: return (v0, m) ->
v0.uOp(m, (i, a) -> ($type$) Math.abs(a));
#if[!FP]
#if[intOrLong]
case VECTOR_OP_BIT_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> ($type$) $Boxtype$.bitCount(a));
case VECTOR_OP_TZ_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> ($type$) $Boxtype$.numberOfTrailingZeros(a));
case VECTOR_OP_LZ_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> ($type$) $Boxtype$.numberOfLeadingZeros(a));
case VECTOR_OP_REVERSE: return (v0, m) ->
v0.uOp(m, (i, a) -> ($type$) $Boxtype$.reverse(a));
#else[intOrLong]
case VECTOR_OP_BIT_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> ($type$) bitCount(a));
case VECTOR_OP_TZ_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> ($type$) numberOfTrailingZeros(a));
case VECTOR_OP_LZ_COUNT: return (v0, m) ->
v0.uOp(m, (i, a) -> ($type$) numberOfLeadingZeros(a));
case VECTOR_OP_REVERSE: return (v0, m) ->
v0.uOp(m, (i, a) -> reverse(a));
#end[intOrLong]
#if[BITWISE]
#if[byte]
case VECTOR_OP_REVERSE_BYTES: return (v0, m) ->
v0.uOp(m, (i, a) -> a);
#else[byte]
case VECTOR_OP_REVERSE_BYTES: return (v0, m) ->
v0.uOp(m, (i, a) -> ($type$) $Boxtype$.reverseBytes(a));
#end[byte]
#end[BITWISE]
#end[!FP]
#if[FP]
case VECTOR_OP_SIN: return (v0, m) ->
v0.uOp(m, (i, a) -> ($type$) Math.sin(a));
@ -839,6 +974,12 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
v0.bOp(v1, vm, (i, a, n) -> rotateLeft(a, (int)n));
case VECTOR_OP_RROTATE: return (v0, v1, vm) ->
v0.bOp(v1, vm, (i, a, n) -> rotateRight(a, (int)n));
#if[intOrLong]
case VECTOR_OP_COMPRESS_BITS: return (v0, v1, vm) ->
v0.bOp(v1, vm, (i, a, n) -> $Boxtype$.compress(a, n));
case VECTOR_OP_EXPAND_BITS: return (v0, v1, vm) ->
v0.bOp(v1, vm, (i, a, n) -> $Boxtype$.expand(a, n));
#end[intOrLong]
#end[BITWISE]
#if[FP]
case VECTOR_OP_OR: return (v0, v1, vm) ->
@ -1987,6 +2128,56 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
return lanewise(ABS);
}
#if[!FP]
#if[!intOrLong]
static int bitCount($type$ a) {
#if[short]
return Integer.bitCount((int)a & 0xFFFF);
#else[short]
return Integer.bitCount((int)a & 0xFF);
#end[short]
}
#end[!intOrLong]
#end[!FP]
#if[!FP]
#if[!intOrLong]
static int numberOfTrailingZeros($type$ a) {
#if[short]
return a != 0 ? Integer.numberOfTrailingZeros(a) : 16;
#else[short]
return a != 0 ? Integer.numberOfTrailingZeros(a) : 8;
#end[short]
}
#end[!intOrLong]
#end[!FP]
#if[!FP]
#if[!intOrLong]
static int numberOfLeadingZeros($type$ a) {
#if[short]
return a >= 0 ? Integer.numberOfLeadingZeros(a) - 16 : 0;
#else[short]
return a >= 0 ? Integer.numberOfLeadingZeros(a) - 24 : 0;
#end[short]
}
static $type$ reverse($type$ a) {
if (a == 0 || a == -1) return a;
#if[short]
$type$ b = rotateLeft(a, 8);
b = ($type$) (((b & 0x5555) << 1) | ((b & 0xAAAA) >>> 1));
b = ($type$) (((b & 0x3333) << 2) | ((b & 0xCCCC) >>> 2));
b = ($type$) (((b & 0x0F0F) << 4) | ((b & 0xF0F0) >>> 4));
#else[short]
$type$ b = rotateLeft(a, 4);
b = ($type$) (((b & 0x55) << 1) | ((b & 0xAA) >>> 1));
b = ($type$) (((b & 0x33) << 2) | ((b & 0xCC) >>> 2));
#end[short]
return b;
}
#end[!intOrLong]
#end[!FP]
#if[BITWISE]
// not (~)
/**
@ -2695,6 +2886,45 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
$Type$Vector::toShuffle0);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
$Type$Vector compress(VectorMask<$Boxtype$> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<$Boxtype$>>
$Type$Vector compressTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return ($Type$Vector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_COMPRESS, getClass(), masktype,
$type$.class, length(), this, m,
(v1, m1) -> compressHelper(v1, m1));
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
public abstract
$Type$Vector expand(VectorMask<$Boxtype$> m);
/*package-private*/
@ForceInline
final
<M extends AbstractMask<$Boxtype$>>
$Type$Vector expandTemplate(Class<M> masktype, M m) {
m.check(masktype, this);
return ($Type$Vector) VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_EXPAND, getClass(), masktype,
$type$.class, length(), this, m,
(v1, m1) -> expandHelper(v1, m1));
}
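A minimal sketch of the cross-lane behavior these templates back, using the concrete IntVector specialization and illustrative lane values:

VectorSpecies<Integer> SP = IntVector.SPECIES_128;                        // 4 lanes
IntVector v = IntVector.fromArray(SP, new int[] {10, 20, 30, 40}, 0);
VectorMask<Integer> m = VectorMask.fromValues(SP, true, false, true, false);
IntVector c = v.compress(m);   // [10, 30, 0, 0]: selected lanes packed low, the rest zero
IntVector e = c.expand(m);     // [10, 0, 30, 0]: packed lanes scattered back to the set positions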
/**
* {@inheritDoc} <!--workaround-->
*/
@ -3302,90 +3532,6 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
}
#end[double]
/**
* Loads a vector from a byte array starting at an offset.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
*/
@ForceInline
public static
$abstractvectortype$ fromByteArray(VectorSpecies<$Boxtype$> species,
byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), a.length);
$Type$Species vsp = ($Type$Species) species;
return vsp.dummyVector().fromByteArray0(a, offset).maybeSwap(bo);
}
/**
* Loads a vector from a byte array starting at an offset
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code $type$} ({#if[FP]?positive }zero).
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* <pre>{@code
* var bb = ByteBuffer.wrap(a);
* return fromByteBuffer(species, bb, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param a the byte array
* @param offset the offset into the array
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte array
* @throws IndexOutOfBoundsException
* if {@code offset+N*ESIZE < 0}
* or {@code offset+(N+1)*ESIZE > a.length}
* for any lane {@code N} in the vector
* where the mask is set
*/
@ForceInline
public static
$abstractvectortype$ fromByteArray(VectorSpecies<$Boxtype$> species,
byte[] a, int offset,
ByteOrder bo,
VectorMask<$Boxtype$> m) {
$Type$Species vsp = ($Type$Species) species;
if (offset >= 0 && offset <= (a.length - species.vectorByteSize())) {
return vsp.dummyVector().fromByteArray0(a, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, $sizeInBytes$, a.length);
ByteBuffer wb = wrapper(a, bo);
return vsp.ldOp(wb, offset, (AbstractMask<$Boxtype$>)m,
(wb_, o, i) -> wb_.get{#if[byte]?(:$Type$(}o + i * $sizeInBytes$));
}
/**
* Loads a vector from an array of type {@code $type$[]}
* starting at an offset.
@ -3917,44 +4063,49 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
#end[byte]
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer.
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment.
* Bytes are composed into primitive lane elements according
* to the specified byte order.
* The vector is arranged into lanes according to
* <a href="Vector.html#lane-order">memory ordering</a>.
* <p>
* This method behaves as if it returns the result of calling
* {@link #fromByteBuffer(VectorSpecies,ByteBuffer,int,ByteOrder,VectorMask)
* fromByteBuffer()} as follows:
* {@link #fromMemorySegment(VectorSpecies,MemorySegment,long,ByteOrder,VectorMask)
* fromMemorySegment()} as follows:
* <pre>{@code
* var m = species.maskAll(true);
* return fromByteBuffer(species, bb, offset, bo, m);
* return fromMemorySegment(species, ms, offset, bo, m);
* }</pre>
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*$sizeInBytes$ < 0}
* or {@code offset+N*$sizeInBytes$ >= bb.limit()}
* or {@code offset+N*$sizeInBytes$ >= ms.byteSize()}
* for any lane {@code N} in the vector
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
$abstractvectortype$ fromByteBuffer(VectorSpecies<$Boxtype$> species,
ByteBuffer bb, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), bb.limit());
$abstractvectortype$ fromMemorySegment(VectorSpecies<$Boxtype$> species,
MemorySegment ms, long offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, species.vectorByteSize(), ms.byteSize());
$Type$Species vsp = ($Type$Species) species;
return vsp.dummyVector().fromByteBuffer0(bb, offset).maybeSwap(bo);
return vsp.dummyVector().fromMemorySegment0(ms, offset).maybeSwap(bo);
}
/**
* Loads a vector from a {@linkplain ByteBuffer byte buffer}
* starting at an offset into the byte buffer
* Loads a vector from a {@linkplain MemorySegment memory segment}
* starting at an offset into the memory segment
* and using a mask.
* Lanes where the mask is unset are filled with the default
* value of {@code $type$} ({#if[FP]?positive }zero).
@ -3965,15 +4116,11 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
* <p>
* The following pseudocode illustrates the behavior:
* <pre>{@code
* $Type$Buffer eb = bb.duplicate()
* .position(offset){#if[byte]?;}
#if[!byte]
* .order(bo).as$Type$Buffer();
#end[!byte]
* var slice = ms.asSlice(offset);
* $type$[] ar = new $type$[species.length()];
* for (int n = 0; n < ar.length; n++) {
* if (m.laneIsSet(n)) {
* ar[n] = eb.get(n);
* ar[n] = slice.getAtIndex(ValueLayout.JAVA_$TYPE$.withBitAlignment(8), n);
* }
* }
* $abstractvectortype$ r = $abstractvectortype$.fromArray(species, ar, 0);
@ -3991,33 +4138,36 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
#end[!byte]
*
* @param species species of desired vector
* @param bb the byte buffer
* @param offset the offset into the byte buffer
* @param ms the memory segment
* @param offset the offset into the memory segment
* @param bo the intended byte order
* @param m the mask controlling lane selection
* @return a vector loaded from a byte buffer
* @return a vector loaded from the memory segment
* @throws IndexOutOfBoundsException
* if {@code offset+N*$sizeInBytes$ < 0}
* or {@code offset+N*$sizeInBytes$ >= bb.limit()}
* or {@code offset+N*$sizeInBytes$ >= ms.byteSize()}
* for any lane {@code N} in the vector
* where the mask is set
* @throws IllegalArgumentException if the memory segment is a heap segment that is
* not backed by a {@code byte[]} array.
* @throws IllegalStateException if the memory segment's session is not alive,
* or if access occurs from a thread other than the thread owning the session.
* @since 19
*/
@ForceInline
public static
$abstractvectortype$ fromByteBuffer(VectorSpecies<$Boxtype$> species,
ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<$Boxtype$> m) {
$abstractvectortype$ fromMemorySegment(VectorSpecies<$Boxtype$> species,
MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<$Boxtype$> m) {
$Type$Species vsp = ($Type$Species) species;
if (offset >= 0 && offset <= (bb.limit() - species.vectorByteSize())) {
return vsp.dummyVector().fromByteBuffer0(bb, offset, m).maybeSwap(bo);
if (offset >= 0 && offset <= (ms.byteSize() - species.vectorByteSize())) {
return vsp.dummyVector().fromMemorySegment0(ms, offset, m).maybeSwap(bo);
}
// FIXME: optimize
checkMaskFromIndexSize(offset, vsp, m, $sizeInBytes$, bb.limit());
ByteBuffer wb = wrapper(bb, bo);
return vsp.ldOp(wb, offset, (AbstractMask<$Boxtype$>)m,
(wb_, o, i) -> wb_.get{#if[byte]?(:$Type$(}o + i * $sizeInBytes$));
checkMaskFromIndexSize(offset, vsp, m, $sizeInBytes$, ms.byteSize());
return vsp.ldLongOp(ms, offset, m, $abstractvectortype$::memorySegmentGet);
}
// Memory store operations
@ -4047,7 +4197,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
this,
a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
@ -4264,7 +4414,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
this,
a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_ + i] = (char) e));
}
@ -4423,7 +4573,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
normalized,
a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_ + i] = (e & 1) != 0));
}
@ -4562,67 +4712,40 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo) {
offset = checkFromIndexSize(offset, byteSize(), a.length);
maybeSwap(bo).intoByteArray0(a, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteArray(byte[] a, int offset,
ByteOrder bo,
VectorMask<$Boxtype$> m) {
if (m.allTrue()) {
intoByteArray(a, offset, bo);
} else {
$Type$Species vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, $sizeInBytes$, a.length);
maybeSwap(bo).intoByteArray0(a, offset, m);
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo) {
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
offset = checkFromIndexSize(offset, byteSize(), ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset);
}
/**
* {@inheritDoc} <!--workaround-->
* @since 19
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo) {
if (ScopedMemoryAccess.isReadOnly(bb)) {
throw new ReadOnlyBufferException();
}
offset = checkFromIndexSize(offset, byteSize(), bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset);
}
/**
* {@inheritDoc} <!--workaround-->
*/
@Override
@ForceInline
public final
void intoByteBuffer(ByteBuffer bb, int offset,
ByteOrder bo,
VectorMask<$Boxtype$> m) {
void intoMemorySegment(MemorySegment ms, long offset,
ByteOrder bo,
VectorMask<$Boxtype$> m) {
if (m.allTrue()) {
intoByteBuffer(bb, offset, bo);
intoMemorySegment(ms, offset, bo);
} else {
if (bb.isReadOnly()) {
throw new ReadOnlyBufferException();
if (ms.isReadOnly()) {
throw new UnsupportedOperationException("Attempt to write a read-only segment");
}
$Type$Species vsp = vspecies();
checkMaskFromIndexSize(offset, vsp, m, $sizeInBytes$, bb.limit());
maybeSwap(bo).intoByteBuffer0(bb, offset, m);
checkMaskFromIndexSize(offset, vsp, m, $sizeInBytes$, ms.byteSize());
maybeSwap(bo).intoMemorySegment0(ms, offset, m);
}
}
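A minimal sketch of a masked store through the concrete IntVector specialization, assuming a byte[]-backed heap segment and illustrative values; the heap-backing and read-only restrictions match the checks above:

byte[] backing = new byte[16];
MemorySegment seg = MemorySegment.ofArray(backing);
IntVector v = IntVector.broadcast(IntVector.SPECIES_128, 7);
VectorMask<Integer> m = IntVector.SPECIES_128.indexInRange(0, 3);   // lanes 0..2 only
v.intoMemorySegment(seg, 0, ByteOrder.nativeOrder(), m);
// backing now holds three ints of value 7; lane 3 is left untouched.
// A read-only segment would throw UnsupportedOperationException instead.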
@ -4656,7 +4779,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> s.ldOp(arr, off,
(arr, off, s) -> s.ldOp(arr, (int) off,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -4673,7 +4796,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, arrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> s.ldOp(arr, off, vm,
(arr, off, s, vm) -> s.ldOp(arr, (int) off, vm,
(arr_, off_, i) -> arr_[off_ + i]));
}
@ -4750,7 +4873,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, charArrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> s.ldOp(arr, off,
(arr, off, s) -> s.ldOp(arr, (int) off,
(arr_, off_, i) -> (short) arr_[off_ + i]));
}
@ -4767,7 +4890,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, charArrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> s.ldOp(arr, off, vm,
(arr, off, s, vm) -> s.ldOp(arr, (int) off, vm,
(arr_, off_, i) -> (short) arr_[off_ + i]));
}
#end[short]
@ -4784,7 +4907,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, booleanArrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> s.ldOp(arr, off,
(arr, off, s) -> s.ldOp(arr, (int) off,
(arr_, off_, i) -> (byte) (arr_[off_ + i] ? 1 : 0)));
}
@ -4801,79 +4924,38 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, booleanArrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> s.ldOp(arr, off, vm,
(arr, off, s, vm) -> s.ldOp(arr, (int) off, vm,
(arr_, off_, i) -> (byte) (arr_[off_ + i] ? 1 : 0)));
}
#end[byte]
@Override
abstract
$abstractvectortype$ fromByteArray0(byte[] a, int offset);
$abstractvectortype$ fromMemorySegment0(MemorySegment bb, long offset);
@ForceInline
final
$abstractvectortype$ fromByteArray0Template(byte[] a, int offset) {
$abstractvectortype$ fromMemorySegment0Template(MemorySegment ms, long offset) {
$Type$Species vsp = vspecies();
return VectorSupport.load(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
a, offset, vsp,
(arr, off, s) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.get{#if[byte]?(:$Type$(}o + i * $sizeInBytes$));
});
}
abstract
$abstractvectortype$ fromByteArray0(byte[] a, int offset, VectorMask<$Boxtype$> m);
@ForceInline
final
<M extends VectorMask<$Boxtype$>>
$abstractvectortype$ fromByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
$Type$Species vsp = vspecies();
m.check(vsp);
return VectorSupport.loadMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset), m,
a, offset, vsp,
(arr, off, s, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.get{#if[byte]?(:$Type$(}o + i * $sizeInBytes$));
});
}
abstract
$abstractvectortype$ fromByteBuffer0(ByteBuffer bb, int offset);
@ForceInline
final
$abstractvectortype$ fromByteBuffer0Template(ByteBuffer bb, int offset) {
$Type$Species vsp = vspecies();
return ScopedMemoryAccess.loadFromByteBuffer(
return ScopedMemoryAccess.loadFromMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
bb, offset, vsp,
(buf, off, s) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off,
(wb_, o, i) -> wb_.get{#if[byte]?(:$Type$(}o + i * $sizeInBytes$));
(AbstractMemorySegmentImpl) ms, offset, vsp,
(msp, off, s) -> {
return s.ldLongOp((MemorySegment) msp, off, $abstractvectortype$::memorySegmentGet);
});
}
abstract
$abstractvectortype$ fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<$Boxtype$> m);
$abstractvectortype$ fromMemorySegment0(MemorySegment ms, long offset, VectorMask<$Boxtype$> m);
@ForceInline
final
<M extends VectorMask<$Boxtype$>>
$abstractvectortype$ fromByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
$abstractvectortype$ fromMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
$Type$Species vsp = vspecies();
m.check(vsp);
return ScopedMemoryAccess.loadFromByteBufferMasked(
return ScopedMemoryAccess.loadFromMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
bb, offset, m, vsp,
(buf, off, s, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
return s.ldOp(wb, off, vm,
(wb_, o, i) -> wb_.get{#if[byte]?(:$Type$(}o + i * $sizeInBytes$));
(AbstractMemorySegmentImpl) ms, offset, m, vsp,
(msp, off, s, vm) -> {
return s.ldLongOp((MemorySegment) msp, off, vm, $abstractvectortype$::memorySegmentGet);
});
}
@ -4892,7 +4974,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
a, arrayAddress(a, offset),
this, a, offset,
(arr, off, v)
-> v.stOp(arr, off,
-> v.stOp(arr, (int) off,
(arr_, off_, i, e) -> arr_[off_+i] = e));
}
@ -4909,7 +4991,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
a, arrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm)
-> v.stOp(arr, off, vm,
-> v.stOp(arr, (int) off, vm,
(arr_, off_, i, e) -> arr_[off_ + i] = e));
}
@ -4990,76 +5072,38 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
a, booleanArrayAddress(a, offset),
normalized, m, a, offset,
(arr, off, v, vm)
-> v.stOp(arr, off, vm,
-> v.stOp(arr, (int) off, vm,
(arr_, off_, i, e) -> arr_[off_ + i] = (e & 1) != 0));
}
#end[byte]
abstract
void intoByteArray0(byte[] a, int offset);
@ForceInline
final
void intoByteArray0Template(byte[] a, int offset) {
void intoMemorySegment0(MemorySegment ms, long offset) {
$Type$Species vsp = vspecies();
VectorSupport.store(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, a, offset,
(arr, off, v) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off,
(tb_, o, i, e) -> tb_.put{#if[byte]?(:$Type$(}o + i * $sizeInBytes$, e));
});
}
abstract
void intoByteArray0(byte[] a, int offset, VectorMask<$Boxtype$> m);
@ForceInline
final
<M extends VectorMask<$Boxtype$>>
void intoByteArray0Template(Class<M> maskClass, byte[] a, int offset, M m) {
$Type$Species vsp = vspecies();
m.check(vsp);
VectorSupport.storeMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
a, byteArrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm) -> {
ByteBuffer wb = wrapper(arr, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(tb_, o, i, e) -> tb_.put{#if[byte]?(:$Type$(}o + i * $sizeInBytes$, e));
});
}
@ForceInline
final
void intoByteBuffer0(ByteBuffer bb, int offset) {
$Type$Species vsp = vspecies();
ScopedMemoryAccess.storeIntoByteBuffer(
ScopedMemoryAccess.storeIntoMemorySegment(
vsp.vectorType(), vsp.elementType(), vsp.laneCount(),
this, bb, offset,
(buf, off, v) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off,
(wb_, o, i, e) -> wb_.put{#if[byte]?(:$Type$(}o + i * $sizeInBytes$, e));
this,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v) -> {
v.stLongOp((MemorySegment) msp, off, $abstractvectortype$::memorySegmentSet);
});
}
abstract
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<$Boxtype$> m);
void intoMemorySegment0(MemorySegment bb, long offset, VectorMask<$Boxtype$> m);
@ForceInline
final
<M extends VectorMask<$Boxtype$>>
void intoByteBuffer0Template(Class<M> maskClass, ByteBuffer bb, int offset, M m) {
void intoMemorySegment0Template(Class<M> maskClass, MemorySegment ms, long offset, M m) {
$Type$Species vsp = vspecies();
m.check(vsp);
ScopedMemoryAccess.storeIntoByteBufferMasked(
ScopedMemoryAccess.storeIntoMemorySegmentMasked(
vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(),
this, m, bb, offset,
(buf, off, v, vm) -> {
ByteBuffer wb = wrapper(buf, NATIVE_ENDIAN);
v.stOp(wb, off, vm,
(wb_, o, i, e) -> wb_.put{#if[byte]?(:$Type$(}o + i * $sizeInBytes$, e));
this, m,
(AbstractMemorySegmentImpl) ms, offset,
(msp, off, v, vm) -> {
v.stLongOp((MemorySegment) msp, off, vm, $abstractvectortype$::memorySegmentSet);
});
}
@ -5078,7 +5122,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
a, charArrayAddress(a, offset),
this, m, a, offset,
(arr, off, v, vm)
-> v.stOp(arr, off, vm,
-> v.stOp(arr, (int) off, vm,
(arr_, off_, i, e) -> arr_[off_ + i] = (char) e));
}
#end[short]
@ -5095,6 +5139,16 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
private static
void checkMaskFromIndexSize(long offset,
$Type$Species vsp,
VectorMask<$Boxtype$> m,
int scale,
long limit) {
((AbstractMask<$Boxtype$>)m)
.checkIndexByLane(offset, limit, vsp.iota(), scale);
}
@ForceInline
private void conditionalStoreNYI(int offset,
$Type$Species vsp,
@ -5463,6 +5517,21 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
return dummyVector().ldOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
$abstractvectortype$ ldLongOp(MemorySegment memory, long offset,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
$abstractvectortype$ ldLongOp(MemorySegment memory, long offset,
VectorMask<$Boxtype$> m,
FLdLongOp f) {
return dummyVector().ldLongOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
<M> void stOp(M memory, int offset, FStOp<M> f) {
@ -5477,6 +5546,20 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
dummyVector().stOp(memory, offset, m, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset, FStLongOp f) {
dummyVector().stLongOp(memory, offset, f);
}
/*package-private*/
@ForceInline
void stLongOp(MemorySegment memory, long offset,
AbstractMask<$Boxtype$> m,
FStLongOp f) {
dummyVector().stLongOp(memory, offset, m, f);
}
// N.B. Make sure these constant vectors and
// masks load up correctly into registers.
//
@ -5590,3 +5673,4 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> {
public static final VectorSpecies<$Boxtype$> SPECIES_PREFERRED
= ($Type$Species) VectorSpecies.ofPreferred($type$.class);
}

View file

@ -24,7 +24,7 @@
*/
package jdk.incubator.vector;
import java.nio.ByteBuffer;
import java.lang.foreign.MemorySegment;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.IntUnaryOperator;
@ -480,6 +480,22 @@ final class $vectortype$ extends $abstractvectortype$ {
($vectortype$) v); // specialize
}
@Override
@ForceInline
public $vectortype$ compress(VectorMask<$Boxtype$> m) {
return ($vectortype$)
super.compressTemplate($masktype$.class,
($masktype$) m); // specialize
}
@Override
@ForceInline
public $vectortype$ expand(VectorMask<$Boxtype$> m) {
return ($vectortype$)
super.expandTemplate($masktype$.class,
($masktype$) m); // specialize
}
@Override
@ForceInline
public $vectortype$ selectFrom(Vector<$Boxtype$> v) {
@ -920,6 +936,15 @@ final class $vectortype$ extends $abstractvectortype$ {
return xor(maskAll(true));
}
@Override
@ForceInline
public $masktype$ compress() {
return ($masktype$)VectorSupport.compressExpandOp(VectorSupport.VECTOR_OP_MASK_COMPRESS,
$vectortype$.class, $masktype$.class, ETYPE, VLENGTH, null, this,
(v1, m1) -> VSPECIES.iota().compare(VectorOperators.LT, m1.trueCount()));
}
// Binary operations
@Override
@ -1159,29 +1184,15 @@ final class $vectortype$ extends $abstractvectortype$ {
@ForceInline
@Override
final
$abstractvectortype$ fromByteArray0(byte[] a, int offset) {
return super.fromByteArray0Template(a, offset); // specialize
$abstractvectortype$ fromMemorySegment0(MemorySegment ms, long offset) {
return super.fromMemorySegment0Template(ms, offset); // specialize
}
@ForceInline
@Override
final
$abstractvectortype$ fromByteArray0(byte[] a, int offset, VectorMask<$Boxtype$> m) {
return super.fromByteArray0Template($masktype$.class, a, offset, ($masktype$) m); // specialize
}
@ForceInline
@Override
final
$abstractvectortype$ fromByteBuffer0(ByteBuffer bb, int offset) {
return super.fromByteBuffer0Template(bb, offset); // specialize
}
@ForceInline
@Override
final
$abstractvectortype$ fromByteBuffer0(ByteBuffer bb, int offset, VectorMask<$Boxtype$> m) {
return super.fromByteBuffer0Template($masktype$.class, bb, offset, ($masktype$) m); // specialize
$abstractvectortype$ fromMemorySegment0(MemorySegment ms, long offset, VectorMask<$Boxtype$> m) {
return super.fromMemorySegment0Template($masktype$.class, ms, offset, ($masktype$) m); // specialize
}
@ForceInline
@ -1219,22 +1230,8 @@ final class $vectortype$ extends $abstractvectortype$ {
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset) {
super.intoByteArray0Template(a, offset); // specialize
}
@ForceInline
@Override
final
void intoByteArray0(byte[] a, int offset, VectorMask<$Boxtype$> m) {
super.intoByteArray0Template($masktype$.class, a, offset, ($masktype$) m); // specialize
}
@ForceInline
@Override
final
void intoByteBuffer0(ByteBuffer bb, int offset, VectorMask<$Boxtype$> m) {
super.intoByteBuffer0Template($masktype$.class, bb, offset, ($masktype$) m);
void intoMemorySegment0(MemorySegment ms, long offset, VectorMask<$Boxtype$> m) {
super.intoMemorySegment0Template($masktype$.class, ms, offset, ($masktype$) m);
}
#if[short]
@ -1251,3 +1248,4 @@ final class $vectortype$ extends $abstractvectortype$ {
// ================================================
}

View file

@ -224,6 +224,8 @@ public class AMD64 extends Architecture {
RDTSCP,
RDPID,
FSRM,
GFNI,
AVX512_BITALG,
}
private final EnumSet<CPUFeature> features;

View file

@ -1769,6 +1769,10 @@ generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);",
["lasta", "__ sve_lasta(v0, __ B, p0, z15);", "lasta\tb0, p0, z15.b"],
["lastb", "__ sve_lastb(v1, __ B, p1, z16);", "lastb\tb1, p1, z16.b"],
["index", "__ sve_index(z6, __ S, 1, 1);", "index\tz6.s, #1, #1"],
["index", "__ sve_index(z6, __ B, r5, 2);", "index\tz6.b, w5, #2"],
["index", "__ sve_index(z6, __ H, r5, 3);", "index\tz6.h, w5, #3"],
["index", "__ sve_index(z6, __ S, r5, 4);", "index\tz6.s, w5, #4"],
["index", "__ sve_index(z7, __ D, r5, 5);", "index\tz7.d, x5, #5"],
["cpy", "__ sve_cpy(z7, __ H, p3, r5);", "cpy\tz7.h, p3/m, w5"],
["tbl", "__ sve_tbl(z16, __ S, z17, z18);", "tbl\tz16.s, {z17.s}, z18.s"],
["ld1w", "__ sve_ld1w_gather(z15, p0, r5, z16);", "ld1w\t{z15.s}, p0/z, [x5, z16.s, uxtw #2]"],
@ -1811,7 +1815,12 @@ generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);",
["uzp2", "__ sve_uzp2(p0, __ D, p0, p1);", "uzp2\tp0.d, p0.d, p1.d"],
["punpklo", "__ sve_punpklo(p1, p0);", "punpklo\tp1.h, p0.b"],
["punpkhi", "__ sve_punpkhi(p1, p0);", "punpkhi\tp1.h, p0.b"],
["compact", "__ sve_compact(z16, __ S, z16, p1);", "compact\tz16.s, p1, z16.s"],
["compact", "__ sve_compact(z16, __ D, z16, p1);", "compact\tz16.d, p1, z16.d"],
["ext", "__ sve_ext(z17, z16, 63);", "ext\tz17.b, z17.b, z16.b, #63"],
# SVE2 instructions
["histcnt", "__ sve_histcnt(z16, __ S, p0, z16, z16);", "histcnt\tz16.s, p0/z, z16.s, z16.s"],
["histcnt", "__ sve_histcnt(z17, __ D, p0, z17, z17);", "histcnt\tz17.d, p0/z, z17.d, z17.d"],
])
print "\n// FloatImmediateOp"
@ -1855,6 +1864,7 @@ generate(SVEVectorOp, [["add", "ZZZ"],
["and", "ZPZ", "m", "dn"],
["asr", "ZPZ", "m", "dn"],
["bic", "ZPZ", "m", "dn"],
["clz", "ZPZ", "m"],
["cnt", "ZPZ", "m"],
["eor", "ZPZ", "m", "dn"],
["lsl", "ZPZ", "m", "dn"],
@ -1863,6 +1873,8 @@ generate(SVEVectorOp, [["add", "ZZZ"],
["neg", "ZPZ", "m"],
["not", "ZPZ", "m"],
["orr", "ZPZ", "m", "dn"],
["rbit", "ZPZ", "m"],
["revb", "ZPZ", "m"],
["smax", "ZPZ", "m", "dn"],
["smin", "ZPZ", "m", "dn"],
["sub", "ZPZ", "m", "dn"],

View file

@ -913,6 +913,10 @@
__ sve_lasta(v0, __ B, p0, z15); // lasta b0, p0, z15.b
__ sve_lastb(v1, __ B, p1, z16); // lastb b1, p1, z16.b
__ sve_index(z6, __ S, 1, 1); // index z6.s, #1, #1
__ sve_index(z6, __ B, r5, 2); // index z6.b, w5, #2
__ sve_index(z6, __ H, r5, 3); // index z6.h, w5, #3
__ sve_index(z6, __ S, r5, 4); // index z6.s, w5, #4
__ sve_index(z7, __ D, r5, 5); // index z7.d, x5, #5
__ sve_cpy(z7, __ H, p3, r5); // cpy z7.h, p3/m, w5
__ sve_tbl(z16, __ S, z17, z18); // tbl z16.s, {z17.s}, z18.s
__ sve_ld1w_gather(z15, p0, r5, z16); // ld1w {z15.s}, p0/z, [x5, z16.s, uxtw #2]
@ -955,7 +959,11 @@
__ sve_uzp2(p0, __ D, p0, p1); // uzp2 p0.d, p0.d, p1.d
__ sve_punpklo(p1, p0); // punpklo p1.h, p0.b
__ sve_punpkhi(p1, p0); // punpkhi p1.h, p0.b
__ sve_compact(z16, __ S, z16, p1); // compact z16.s, p1, z16.s
__ sve_compact(z16, __ D, z16, p1); // compact z16.d, p1, z16.d
__ sve_ext(z17, z16, 63); // ext z17.b, z17.b, z16.b, #63
__ sve_histcnt(z16, __ S, p0, z16, z16); // histcnt z16.s, p0/z, z16.s, z16.s
__ sve_histcnt(z17, __ D, p0, z17, z17); // histcnt z17.d, p0/z, z17.d, z17.d
// FloatImmediateOp
__ fmovd(v0, 2.0); // fmov d0, #2.0
@ -1144,57 +1152,60 @@
__ sve_and(z22, __ D, p5, z20); // and z22.d, p5/m, z22.d, z20.d
__ sve_asr(z28, __ S, p2, z13); // asr z28.s, p2/m, z28.s, z13.s
__ sve_bic(z7, __ H, p5, z28); // bic z7.h, p5/m, z7.h, z28.h
__ sve_cnt(z11, __ S, p3, z11); // cnt z11.s, p3/m, z11.s
__ sve_eor(z1, __ S, p6, z8); // eor z1.s, p6/m, z1.s, z8.s
__ sve_lsl(z13, __ S, p4, z17); // lsl z13.s, p4/m, z13.s, z17.s
__ sve_lsr(z4, __ H, p0, z3); // lsr z4.h, p0/m, z4.h, z3.h
__ sve_mul(z7, __ S, p3, z14); // mul z7.s, p3/m, z7.s, z14.s
__ sve_neg(z4, __ B, p3, z29); // neg z4.b, p3/m, z29.b
__ sve_not(z0, __ D, p2, z21); // not z0.d, p2/m, z21.d
__ sve_orr(z3, __ S, p0, z9); // orr z3.s, p0/m, z3.s, z9.s
__ sve_smax(z28, __ B, p2, z24); // smax z28.b, p2/m, z28.b, z24.b
__ sve_smin(z19, __ D, p1, z23); // smin z19.d, p1/m, z19.d, z23.d
__ sve_sub(z13, __ D, p5, z10); // sub z13.d, p5/m, z13.d, z10.d
__ sve_fabs(z12, __ D, p4, z30); // fabs z12.d, p4/m, z30.d
__ sve_fadd(z14, __ D, p0, z29); // fadd z14.d, p0/m, z14.d, z29.d
__ sve_fdiv(z21, __ D, p5, z7); // fdiv z21.d, p5/m, z21.d, z7.d
__ sve_fmax(z2, __ D, p0, z26); // fmax z2.d, p0/m, z2.d, z26.d
__ sve_fmin(z9, __ D, p4, z17); // fmin z9.d, p4/m, z9.d, z17.d
__ sve_fmul(z0, __ D, p1, z2); // fmul z0.d, p1/m, z0.d, z2.d
__ sve_fneg(z14, __ D, p1, z11); // fneg z14.d, p1/m, z11.d
__ sve_frintm(z14, __ S, p4, z29); // frintm z14.s, p4/m, z29.s
__ sve_frintn(z3, __ S, p0, z22); // frintn z3.s, p0/m, z22.s
__ sve_frintp(z3, __ S, p6, z27); // frintp z3.s, p6/m, z27.s
__ sve_fsqrt(z19, __ D, p5, z7); // fsqrt z19.d, p5/m, z7.d
__ sve_fsub(z21, __ S, p3, z5); // fsub z21.s, p3/m, z21.s, z5.s
__ sve_fmad(z25, __ D, p1, z21, z17); // fmad z25.d, p1/m, z21.d, z17.d
__ sve_fmla(z0, __ S, p0, z9, z19); // fmla z0.s, p0/m, z9.s, z19.s
__ sve_fmls(z7, __ D, p3, z14, z17); // fmls z7.d, p3/m, z14.d, z17.d
__ sve_fmsb(z11, __ D, p3, z24, z17); // fmsb z11.d, p3/m, z24.d, z17.d
__ sve_fnmad(z17, __ D, p2, z15, z14); // fnmad z17.d, p2/m, z15.d, z14.d
__ sve_fnmsb(z22, __ S, p7, z22, z7); // fnmsb z22.s, p7/m, z22.s, z7.s
__ sve_fnmla(z5, __ S, p7, z27, z10); // fnmla z5.s, p7/m, z27.s, z10.s
__ sve_fnmls(z14, __ S, p6, z21, z20); // fnmls z14.s, p6/m, z21.s, z20.s
__ sve_mla(z3, __ D, p5, z25, z5); // mla z3.d, p5/m, z25.d, z5.d
__ sve_mls(z29, __ H, p4, z17, z1); // mls z29.h, p4/m, z17.h, z1.h
__ sve_and(z14, z29, z13); // and z14.d, z29.d, z13.d
__ sve_eor(z17, z2, z30); // eor z17.d, z2.d, z30.d
__ sve_orr(z22, z21, z29); // orr z22.d, z21.d, z29.d
__ sve_bic(z8, z2, z0); // bic z8.d, z2.d, z0.d
__ sve_uzp1(z23, __ S, z22, z0); // uzp1 z23.s, z22.s, z0.s
__ sve_uzp2(z25, __ H, z26, z23); // uzp2 z25.h, z26.h, z23.h
__ sve_bext(z21, __ B, z21, z1); // bext z21.b, z21.b, z1.b
__ sve_clz(z11, __ S, p3, z11); // clz z11.s, p3/m, z11.s
__ sve_cnt(z1, __ S, p6, z8); // cnt z1.s, p6/m, z8.s
__ sve_eor(z13, __ S, p4, z17); // eor z13.s, p4/m, z13.s, z17.s
__ sve_lsl(z4, __ H, p0, z3); // lsl z4.h, p0/m, z4.h, z3.h
__ sve_lsr(z7, __ S, p3, z14); // lsr z7.s, p3/m, z7.s, z14.s
__ sve_mul(z4, __ B, p3, z29); // mul z4.b, p3/m, z4.b, z29.b
__ sve_neg(z0, __ D, p2, z21); // neg z0.d, p2/m, z21.d
__ sve_not(z3, __ S, p0, z9); // not z3.s, p0/m, z9.s
__ sve_orr(z28, __ B, p2, z24); // orr z28.b, p2/m, z28.b, z24.b
__ sve_rbit(z19, __ D, p1, z23); // rbit z19.d, p1/m, z23.d
__ sve_revb(z13, __ D, p5, z10); // revb z13.d, p5/m, z10.d
__ sve_smax(z12, __ S, p4, z30); // smax z12.s, p4/m, z12.s, z30.s
__ sve_smin(z14, __ S, p0, z29); // smin z14.s, p0/m, z14.s, z29.s
__ sve_sub(z21, __ S, p5, z7); // sub z21.s, p5/m, z21.s, z7.s
__ sve_fabs(z2, __ D, p0, z26); // fabs z2.d, p0/m, z26.d
__ sve_fadd(z9, __ D, p4, z17); // fadd z9.d, p4/m, z9.d, z17.d
__ sve_fdiv(z0, __ D, p1, z2); // fdiv z0.d, p1/m, z0.d, z2.d
__ sve_fmax(z14, __ D, p1, z11); // fmax z14.d, p1/m, z14.d, z11.d
__ sve_fmin(z14, __ S, p4, z29); // fmin z14.s, p4/m, z14.s, z29.s
__ sve_fmul(z3, __ S, p0, z22); // fmul z3.s, p0/m, z3.s, z22.s
__ sve_fneg(z3, __ S, p6, z27); // fneg z3.s, p6/m, z27.s
__ sve_frintm(z19, __ D, p5, z7); // frintm z19.d, p5/m, z7.d
__ sve_frintn(z21, __ S, p3, z5); // frintn z21.s, p3/m, z5.s
__ sve_frintp(z25, __ D, p1, z21); // frintp z25.d, p1/m, z21.d
__ sve_fsqrt(z17, __ S, p0, z3); // fsqrt z17.s, p0/m, z3.s
__ sve_fsub(z19, __ S, p3, z7); // fsub z19.s, p3/m, z19.s, z7.s
__ sve_fmad(z14, __ S, p4, z17, z11); // fmad z14.s, p4/m, z17.s, z11.s
__ sve_fmla(z24, __ S, p4, z30, z17); // fmla z24.s, p4/m, z30.s, z17.s
__ sve_fmls(z15, __ D, p3, z26, z22); // fmls z15.d, p3/m, z26.d, z22.d
__ sve_fmsb(z22, __ D, p2, z8, z5); // fmsb z22.d, p2/m, z8.d, z5.d
__ sve_fnmad(z27, __ D, p2, z0, z14); // fnmad z27.d, p2/m, z0.d, z14.d
__ sve_fnmsb(z21, __ D, p5, z0, z3); // fnmsb z21.d, p5/m, z0.d, z3.d
__ sve_fnmla(z25, __ D, p1, z25, z29); // fnmla z25.d, p1/m, z25.d, z29.d
__ sve_fnmls(z17, __ D, p0, z12, z14); // fnmls z17.d, p0/m, z12.d, z14.d
__ sve_mla(z13, __ D, p0, z17, z2); // mla z13.d, p0/m, z17.d, z2.d
__ sve_mls(z20, __ H, p5, z21, z29); // mls z20.h, p5/m, z21.h, z29.h
__ sve_and(z8, z2, z0); // and z8.d, z2.d, z0.d
__ sve_eor(z23, z22, z0); // eor z23.d, z22.d, z0.d
__ sve_orr(z25, z26, z23); // orr z25.d, z26.d, z23.d
__ sve_bic(z21, z21, z1); // bic z21.d, z21.d, z1.d
__ sve_uzp1(z10, __ S, z19, z11); // uzp1 z10.s, z19.s, z11.s
__ sve_uzp2(z23, __ D, z23, z8); // uzp2 z23.d, z23.d, z8.d
__ sve_bext(z17, __ S, z19, z19); // bext z17.s, z19.s, z19.s
// SVEReductionOp
__ sve_andv(v10, __ S, p5, z11); // andv s10, p5, z11.s
__ sve_orv(v23, __ D, p6, z8); // orv d23, p6, z8.d
__ sve_eorv(v17, __ S, p5, z19); // eorv s17, p5, z19.s
__ sve_smaxv(v4, __ D, p5, z13); // smaxv d4, p5, z13.d
__ sve_sminv(v22, __ D, p7, z30); // sminv d22, p7, z30.d
__ sve_fminv(v17, __ S, p4, z14); // fminv s17, p4, z14.s
__ sve_fmaxv(v12, __ S, p7, z20); // fmaxv s12, p7, z20.s
__ sve_fadda(v1, __ S, p3, z13); // fadda s1, p3, s1, z13.s
__ sve_uaddv(v7, __ S, p2, z11); // uaddv d7, p2, z11.s
__ sve_andv(v4, __ D, p5, z13); // andv d4, p5, z13.d
__ sve_orv(v22, __ D, p7, z30); // orv d22, p7, z30.d
__ sve_eorv(v17, __ H, p4, z14); // eorv h17, p4, z14.h
__ sve_smaxv(v12, __ B, p7, z20); // smaxv b12, p7, z20.b
__ sve_sminv(v1, __ B, p3, z13); // sminv b1, p3, z13.b
__ sve_fminv(v7, __ D, p2, z11); // fminv d7, p2, z11.d
__ sve_fmaxv(v4, __ S, p6, z15); // fmaxv s4, p6, z15.s
__ sve_fadda(v3, __ D, p7, z0); // fadda d3, p7, d3, z0.d
__ sve_uaddv(v5, __ D, p5, z30); // uaddv d5, p5, z30.d
__ bind(forth);
@ -1213,30 +1224,30 @@
0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061,
0x120cb166, 0x321764bc, 0x52174681, 0x720c0227,
0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01,
0x14000000, 0x17ffffd7, 0x140003e5, 0x94000000,
0x97ffffd4, 0x940003e2, 0x3400000a, 0x34fffa2a,
0x34007bea, 0x35000008, 0x35fff9c8, 0x35007b88,
0xb400000b, 0xb4fff96b, 0xb4007b2b, 0xb500001d,
0xb5fff91d, 0xb5007add, 0x10000013, 0x10fff8b3,
0x10007a73, 0x90000013, 0x36300016, 0x3637f836,
0x363079f6, 0x3758000c, 0x375ff7cc, 0x3758798c,
0x14000000, 0x17ffffd7, 0x140003f0, 0x94000000,
0x97ffffd4, 0x940003ed, 0x3400000a, 0x34fffa2a,
0x34007d4a, 0x35000008, 0x35fff9c8, 0x35007ce8,
0xb400000b, 0xb4fff96b, 0xb4007c8b, 0xb500001d,
0xb5fff91d, 0xb5007c3d, 0x10000013, 0x10fff8b3,
0x10007bd3, 0x90000013, 0x36300016, 0x3637f836,
0x36307b56, 0x3758000c, 0x375ff7cc, 0x37587aec,
0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc,
0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f,
0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016,
0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0,
0x54007760, 0x54000001, 0x54fff541, 0x54007701,
0x54000002, 0x54fff4e2, 0x540076a2, 0x54000002,
0x54fff482, 0x54007642, 0x54000003, 0x54fff423,
0x540075e3, 0x54000003, 0x54fff3c3, 0x54007583,
0x54000004, 0x54fff364, 0x54007524, 0x54000005,
0x54fff305, 0x540074c5, 0x54000006, 0x54fff2a6,
0x54007466, 0x54000007, 0x54fff247, 0x54007407,
0x54000008, 0x54fff1e8, 0x540073a8, 0x54000009,
0x54fff189, 0x54007349, 0x5400000a, 0x54fff12a,
0x540072ea, 0x5400000b, 0x54fff0cb, 0x5400728b,
0x5400000c, 0x54fff06c, 0x5400722c, 0x5400000d,
0x54fff00d, 0x540071cd, 0x5400000e, 0x54ffefae,
0x5400716e, 0x5400000f, 0x54ffef4f, 0x5400710f,
0x540078c0, 0x54000001, 0x54fff541, 0x54007861,
0x54000002, 0x54fff4e2, 0x54007802, 0x54000002,
0x54fff482, 0x540077a2, 0x54000003, 0x54fff423,
0x54007743, 0x54000003, 0x54fff3c3, 0x540076e3,
0x54000004, 0x54fff364, 0x54007684, 0x54000005,
0x54fff305, 0x54007625, 0x54000006, 0x54fff2a6,
0x540075c6, 0x54000007, 0x54fff247, 0x54007567,
0x54000008, 0x54fff1e8, 0x54007508, 0x54000009,
0x54fff189, 0x540074a9, 0x5400000a, 0x54fff12a,
0x5400744a, 0x5400000b, 0x54fff0cb, 0x540073eb,
0x5400000c, 0x54fff06c, 0x5400738c, 0x5400000d,
0x54fff00d, 0x5400732d, 0x5400000e, 0x54ffefae,
0x540072ce, 0x5400000f, 0x54ffef4f, 0x5400726f,
0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60,
0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f,
0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf,
@ -1401,7 +1412,8 @@
0x6554ac26, 0x6556ac26, 0x6552ac26, 0x65cbac85,
0x65caac01, 0x65dea833, 0x659ca509, 0x65d8a801,
0x65dcac01, 0x655cb241, 0x0520a1e0, 0x0521a601,
0x052281e0, 0x05238601, 0x04a14026, 0x0568aca7,
0x052281e0, 0x05238601, 0x04a14026, 0x042244a6,
0x046344a6, 0x04a444a6, 0x04e544a7, 0x0568aca7,
0x05b23230, 0x853040af, 0xc5b040af, 0xe57080af,
0xe5b080af, 0x25034440, 0x254054c4, 0x25034640,
0x25415a05, 0x25834440, 0x25c54489, 0x250b5d3a,
@ -1412,7 +1424,8 @@
0x25d8e104, 0x25d8e184, 0x2518e407, 0x05214800,
0x05614800, 0x05a14800, 0x05e14800, 0x05214c00,
0x05614c00, 0x05a14c00, 0x05e14c00, 0x05304001,
0x05314001, 0x05271e11, 0x1e601000, 0x1e603000,
0x05314001, 0x05a18610, 0x05e18610, 0x05271e11,
0x45b0c210, 0x45f1c231, 0x1e601000, 0x1e603000,
0x1e621000, 0x1e623000, 0x1e641000, 0x1e643000,
0x1e661000, 0x1e663000, 0x1e681000, 0x1e683000,
0x1e6a1000, 0x1e6a3000, 0x1e6c1000, 0x1e6c3000,
@ -1450,18 +1463,19 @@
0x25a1de96, 0x05808874, 0x05423bb1, 0x050030e4,
0x04680102, 0x04be0638, 0x658103c4, 0x65800993,
0x65910707, 0x04d6a53b, 0x04c00e17, 0x04da1696,
0x049089bc, 0x045b1787, 0x049aad6b, 0x04991901,
0x0493922d, 0x04518064, 0x04900dc7, 0x0417afa4,
0x04deaaa0, 0x04980123, 0x04080b1c, 0x04ca06f3,
0x04c1154d, 0x04dcb3cc, 0x65c083ae, 0x65cd94f5,
0x65c68342, 0x65c79229, 0x65c28440, 0x04dda56e,
0x6582b3ae, 0x6580a2c3, 0x6581bb63, 0x65cdb4f3,
0x65818cb5, 0x65f186b9, 0x65b30120, 0x65f12dc7,
0x65f1af0b, 0x65eec9f1, 0x65a7fed6, 0x65aa5f65,
0x65b47aae, 0x04c55723, 0x0441723d, 0x042d33ae,
0x04be3051, 0x047d32b6, 0x04e03048, 0x05a06ad7,
0x05776f59, 0x4501b2b5, 0x049a356a, 0x04d83917,
0x04993671, 0x04c835a4, 0x04ca3fd6, 0x658731d1,
0x65863e8c, 0x65982da1, 0x04812967,
0x049089bc, 0x045b1787, 0x0499ad6b, 0x049ab901,
0x0499122d, 0x04538064, 0x04918dc7, 0x04100fa4,
0x04d7aaa0, 0x049ea123, 0x04180b1c, 0x05e786f3,
0x05e4954d, 0x048813cc, 0x048a03ae, 0x048114f5,
0x04dca342, 0x65c09229, 0x65cd8440, 0x65c6856e,
0x658793ae, 0x658282c3, 0x049dbb63, 0x65c2b4f3,
0x6580acb5, 0x65c1a6b9, 0x658da071, 0x65818cf3,
0x65ab922e, 0x65b113d8, 0x65f62f4f, 0x65e5a916,
0x65eec81b, 0x65e3f415, 0x65fd4739, 0x65ee6191,
0x04c2422d, 0x045d76b4, 0x04203048, 0x04a032d7,
0x04773359, 0x04e132b5, 0x05ab6a6a, 0x05e86ef7,
0x4593b271, 0x04da35a4, 0x04d83fd6, 0x045931d1,
0x04083e8c, 0x040a2da1, 0x65c72967, 0x658639e4,
0x65d83c03, 0x04c137c5,
};
// END Generated code -- do not edit

View file

@ -1,5 +1,5 @@
/*
* Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
* Copyright (C) 2021, 2022, THL A29 Limited, a Tencent company. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -23,11 +23,13 @@
package compiler.vectorapi;
import java.lang.foreign.MemorySegment;
import jdk.incubator.vector.*;
import java.nio.ByteOrder;
/*
* @test
* @enablePreview
* @bug 8262998
* @summary Vector API intrinsics should not modify IR when bailing out
* @modules jdk.incubator.vector
@ -40,13 +42,15 @@ public class TestIntrinsicBailOut {
static final VectorSpecies<Double> SPECIES256 = DoubleVector.SPECIES_256;
static byte[] a = new byte[512];
static byte[] r = new byte[512];
static MemorySegment msa = MemorySegment.ofArray(a);
static MemorySegment msr = MemorySegment.ofArray(r);
static void test() {
DoubleVector av = DoubleVector.fromByteArray(SPECIES256, a, 0, ByteOrder.BIG_ENDIAN);
av.intoByteArray(r, 0, ByteOrder.BIG_ENDIAN);
DoubleVector av = DoubleVector.fromMemorySegment(SPECIES256, msa, 0, ByteOrder.BIG_ENDIAN);
av.intoMemorySegment(msr, 0, ByteOrder.BIG_ENDIAN);
DoubleVector bv = DoubleVector.fromByteArray(SPECIES256, a, 32, ByteOrder.LITTLE_ENDIAN);
bv.intoByteArray(r, 32, ByteOrder.LITTLE_ENDIAN);
DoubleVector bv = DoubleVector.fromMemorySegment(SPECIES256, msa, 32, ByteOrder.LITTLE_ENDIAN);
bv.intoMemorySegment(msr, 32, ByteOrder.LITTLE_ENDIAN);
}
public static void main(String[] args) {
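The hunk above migrates TestIntrinsicBailOut from the byte[]-based accessors (fromByteArray/intoByteArray) to the MemorySegment-based ones introduced by this change. A minimal standalone sketch of the same round trip, assuming JDK 19 with --add-modules=jdk.incubator.vector and --enable-preview (the class name is hypothetical, not part of the patch):

// Sketch only: mirrors the load/store pattern of the test above.
import java.lang.foreign.MemorySegment;
import java.nio.ByteOrder;
import jdk.incubator.vector.DoubleVector;
import jdk.incubator.vector.VectorSpecies;

public class MemorySegmentRoundTrip {
    static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_256;

    public static void main(String[] args) {
        byte[] src = new byte[512];
        byte[] dst = new byte[512];
        // Heap segments wrapping the backing arrays, as in the test.
        MemorySegment msSrc = MemorySegment.ofArray(src);
        MemorySegment msDst = MemorySegment.ofArray(dst);
        // Load 256 bits of doubles from offset 0 and store them back unchanged.
        DoubleVector v = DoubleVector.fromMemorySegment(SPECIES, msSrc, 0, ByteOrder.BIG_ENDIAN);
        v.intoMemorySegment(msDst, 0, ByteOrder.BIG_ENDIAN);
    }
}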

View file

@ -38,42 +38,42 @@ public class TestVectorErgonomics {
public static void main(String[] args) throws Throwable {
ProcessTools.executeTestJvm("--add-modules=jdk.incubator.vector", "-XX:+UnlockExperimentalVMOptions",
"-XX:+EnableVectorReboxing", "-Xlog:compilation", "-version")
"-XX:+EnableVectorReboxing", "-Xlog:compilation", "-version", "--enable-preview")
.shouldHaveExitValue(0)
.shouldContain("EnableVectorReboxing=true");
ProcessTools.executeTestJvm("--add-modules=jdk.incubator.vector", "-XX:+UnlockExperimentalVMOptions",
"-XX:+EnableVectorAggressiveReboxing", "-Xlog:compilation", "-version")
"-XX:+EnableVectorAggressiveReboxing", "-Xlog:compilation", "-version", "--enable-preview")
.shouldHaveExitValue(0)
.shouldContain("EnableVectorAggressiveReboxing=true");
ProcessTools.executeTestJvm("--add-modules=jdk.incubator.vector", "-XX:+UnlockExperimentalVMOptions",
"-XX:-EnableVectorReboxing", "-Xlog:compilation", "-version")
"-XX:-EnableVectorReboxing", "-Xlog:compilation", "-version", "--enable-preview")
.shouldHaveExitValue(0)
.shouldContain("EnableVectorReboxing=false")
.shouldContain("EnableVectorAggressiveReboxing=false");
ProcessTools.executeTestJvm("--add-modules=jdk.incubator.vector", "-XX:+UnlockExperimentalVMOptions",
"-XX:-EnableVectorAggressiveReboxing", "-Xlog:compilation", "-version")
"-XX:-EnableVectorAggressiveReboxing", "-Xlog:compilation", "-version", "--enable-preview")
.shouldHaveExitValue(0)
.shouldContain("EnableVectorAggressiveReboxing=false");
ProcessTools.executeTestJvm("--add-modules=jdk.incubator.vector", "-XX:+UnlockExperimentalVMOptions",
"-XX:-EnableVectorSupport", "-Xlog:compilation", "-version")
"-XX:-EnableVectorSupport", "-Xlog:compilation", "-version", "--enable-preview")
.shouldHaveExitValue(0)
.shouldContain("EnableVectorSupport=false")
.shouldContain("EnableVectorReboxing=false")
.shouldContain("EnableVectorAggressiveReboxing=false");
ProcessTools.executeTestJvm("--add-modules=jdk.incubator.vector", "-XX:+UnlockExperimentalVMOptions",
"-XX:-EnableVectorSupport", "-XX:+EnableVectorReboxing", "-Xlog:compilation", "-version")
"-XX:-EnableVectorSupport", "-XX:+EnableVectorReboxing", "-Xlog:compilation", "-version", "--enable-preview")
.shouldHaveExitValue(0)
.shouldContain("EnableVectorSupport=false")
.shouldContain("EnableVectorReboxing=false")
.shouldContain("EnableVectorAggressiveReboxing=false");
ProcessTools.executeTestJvm("--add-modules=jdk.incubator.vector", "-XX:+UnlockExperimentalVMOptions",
"-XX:-EnableVectorSupport", "-XX:+EnableVectorAggressiveReboxing", "-Xlog:compilation", "-version")
"-XX:-EnableVectorSupport", "-XX:+EnableVectorAggressiveReboxing", "-Xlog:compilation", "-version", "--enable-preview")
.shouldHaveExitValue(0)
.shouldContain("EnableVectorSupport=false")
.shouldContain("EnableVectorReboxing=false")

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2021, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2021, Rado Smogura. All rights reserved.
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@ -26,6 +26,7 @@
/*
* @test
* @enablePreview
* @summary Test if memory ordering is preserved
*
* @run main/othervm -XX:-TieredCompilation -XX:+UnlockDiagnosticVMOptions -XX:+AbortVMOnCompilationFailure
@ -36,8 +37,8 @@
package compiler.vectorapi;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.lang.foreign.MemorySegment;
import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.VectorSpecies;
@ -53,13 +54,13 @@ public class VectorMemoryAlias {
public static int test() {
byte arr[] = new byte[256];
final var bb = ByteBuffer.wrap(arr);
final var ms = MemorySegment.ofArray(arr);
final var ones = ByteVector.broadcast(SPECIES, 1);
var res = ByteVector.zero(SPECIES);
int result = 0;
result += arr[2];
res.add(ones).intoByteBuffer(bb, 0, ByteOrder.nativeOrder());
res.add(ones).intoMemorySegment(ms, 0L, ByteOrder.nativeOrder());
result += arr[2];
return result;
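VectorMemoryAlias now writes through a MemorySegment view of the same backing array instead of a ByteBuffer, so the compiler must still treat the segment store and the plain array read as aliasing. A minimal sketch of that pattern, assuming a 64-bit byte species (class name hypothetical):

// Sketch only: store through a heap segment, read back through the array it wraps.
import java.lang.foreign.MemorySegment;
import java.nio.ByteOrder;
import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.VectorSpecies;

public class SegmentAliasSketch {
    static final VectorSpecies<Byte> SPECIES = ByteVector.SPECIES_64;

    public static void main(String[] args) {
        byte[] arr = new byte[256];
        MemorySegment ms = MemorySegment.ofArray(arr);
        ByteVector ones = ByteVector.broadcast(SPECIES, 1);
        // Store 1s through the segment view; the first lane covers arr[2].
        ByteVector.zero(SPECIES).add(ones).intoMemorySegment(ms, 0L, ByteOrder.nativeOrder());
        System.out.println(arr[2]);  // expected: 1
    }
}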

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2021, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -20,30 +20,28 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
import jdk.incubator.vector.*;
import jdk.internal.vm.annotation.ForceInline;
import org.testng.Assert;
import org.testng.annotations.Test;
import org.testng.annotations.DataProvider;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.VarHandle;
import java.nio.ByteOrder;
import java.util.Arrays;
import java.util.List;
import java.util.function.IntFunction;
import java.util.function.IntUnaryOperator;
import jdk.incubator.vector.VectorShape;
import jdk.incubator.vector.VectorSpecies;
import java.lang.foreign.MemorySegment;
import jdk.incubator.vector.*;
import jdk.internal.vm.annotation.ForceInline;
/*
* @test id=Z
* @bug 8260473
* @enablePreview
* @requires vm.gc.Z
* @modules jdk.incubator.vector
* @modules java.base/jdk.internal.vm.annotation
* @run testng/othervm -XX:CompileCommand=compileonly,jdk/incubator/vector/ByteVector.fromByteBuffer
* @run testng/othervm -XX:CompileCommand=compileonly,jdk/incubator/vector/ByteVector.fromMemorySegment
* -XX:-TieredCompilation -XX:CICompilerCount=1 -XX:+UseZGC -Xbatch -Xmx256m VectorRebracket128Test
*/
@ -124,8 +122,10 @@ public class VectorRebracket128Test {
@ForceInline
static <E,F>
void testVectorRebracket(VectorSpecies<E> a, VectorSpecies<F> b, byte[] input, byte[] output) {
Vector<E> av = a.fromByteArray(input, 0, ByteOrder.nativeOrder());
void testVectorRebracket(VectorSpecies<E> a, VectorSpecies<F> b,
byte[] input, byte[] output,
MemorySegment msInput, MemorySegment msOutput) {
Vector<E> av = a.fromMemorySegment(msInput, 0, ByteOrder.nativeOrder());
int block;
assert(input.length == output.length);
@ -139,7 +139,7 @@ public class VectorRebracket128Test {
int part = 0;
Vector<F> bv = av.reinterpretShape(b, part);
bv.intoByteArray(output, 0, ByteOrder.nativeOrder());
bv.intoMemorySegment(msOutput, 0, ByteOrder.nativeOrder());
// in-place copy, no resize
expected = input;
origin = 0;
@ -152,10 +152,12 @@ public class VectorRebracket128Test {
static void testRebracket128(IntFunction<byte[]> fa) {
byte[] barr = fa.apply(128/Byte.SIZE);
byte[] bout = new byte[barr.length];
MemorySegment msin = MemorySegment.ofArray(barr);
MemorySegment msout = MemorySegment.ofArray(bout);
for (int i = 0; i < NUM_ITER; i++) {
testVectorRebracket(bspec128, bspec128, barr, bout);
testVectorRebracket(bspec128, sspec128, barr, bout);
testVectorRebracket(bspec128, ispec128, barr, bout);
testVectorRebracket(bspec128, bspec128, barr, bout, msin, msout);
testVectorRebracket(bspec128, sspec128, barr, bout, msin, msout);
testVectorRebracket(bspec128, ispec128, barr, bout, msin, msout);
}
}
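VectorRebracket128Test now passes pre-built MemorySegments into testVectorRebracket and uses them for the load and store around reinterpretShape. A minimal sketch of that rebracketing round trip (class name hypothetical; species choices are illustrative):

// Sketch only: load bytes from a segment, reinterpret the 128-bit payload as ints,
// store it back through another segment without changing any bits.
import java.lang.foreign.MemorySegment;
import java.nio.ByteOrder;
import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.Vector;

public class RebracketSketch {
    public static void main(String[] args) {
        byte[] input = new byte[16];
        byte[] output = new byte[16];
        for (int i = 0; i < input.length; i++) input[i] = (byte) i;
        MemorySegment msIn = MemorySegment.ofArray(input);
        MemorySegment msOut = MemorySegment.ofArray(output);

        Vector<Byte> av = ByteVector.SPECIES_128.fromMemorySegment(msIn, 0, ByteOrder.nativeOrder());
        // Same 128 bits viewed as 4 ints; part 0 covers the whole payload for equal sizes.
        Vector<Integer> bv = av.reinterpretShape(IntVector.SPECIES_128, 0);
        bv.intoMemorySegment(msOut, 0, ByteOrder.nativeOrder());
        // output now equals input byte-for-byte.
    }
}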

Some files were not shown because too many files have changed in this diff.