mirror of https://github.com/openjdk/jdk.git
synced 2025-09-20 02:54:35 +02:00

Merge
commit 9288ff53e7
1316 changed files with 58581 additions and 14455 deletions
@@ -2277,8 +2277,8 @@ void MacroAssembler::call(AddressLiteral entry) {
   }
 }
 
-void MacroAssembler::ic_call(address entry) {
-  RelocationHolder rh = virtual_call_Relocation::spec(pc());
+void MacroAssembler::ic_call(address entry, jint method_index) {
+  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
   movptr(rax, (intptr_t)Universe::non_oop_word());
   call(AddressLiteral(entry, rh));
 }
@@ -3058,50 +3058,6 @@ void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
   }
 }
 
-void MacroAssembler::pow_exp_core_encoding() {
-  // kills rax, rcx, rdx
-  subptr(rsp,sizeof(jdouble));
-  // computes 2^X. Stack: X ...
-  // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
-  // keep it on the thread's stack to compute 2^int(X) later
-  // then compute 2^(X-int(X)) as (2^(X-int(X))-1)+1
-  // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
-  fld_s(0);                 // Stack: X X ...
-  frndint();                // Stack: int(X) X ...
-  fsuba(1);                 // Stack: int(X) X-int(X) ...
-  fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
-  f2xm1();                  // Stack: 2^(X-int(X))-1 ...
-  fld1();                   // Stack: 1 2^(X-int(X))-1 ...
-  faddp(1);                 // Stack: 2^(X-int(X))
-  // computes 2^(int(X)): add exponent bias (1023) to int(X), then
-  // shift int(X)+1023 to exponent position.
-  // Exponent is limited to 11 bits: if int(X)+1023 does not fit in 11
-  // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
-  // values so detect them and set result to NaN.
-  movl(rax,Address(rsp,0));
-  movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
-  addl(rax, 1023);
-  movl(rdx,rax);
-  shll(rax,20);
-  // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
-  addl(rdx,1);
-  // Check that 1 < int(X)+1023+1 < 2048
-  // in 3 steps:
-  // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
-  // 2- (int(X)+1023+1)&-2048 != 0
-  // 3- (int(X)+1023+1)&-2048 != 1
-  // Do 2- first because addl just updated the flags.
-  cmov32(Assembler::equal,rax,rcx);
-  cmpl(rdx,1);
-  cmov32(Assembler::equal,rax,rcx);
-  testl(rdx,rcx);
-  cmov32(Assembler::notEqual,rax,rcx);
-  movl(Address(rsp,4),rax);
-  movl(Address(rsp,0),0);
-  fmul_d(Address(rsp,0));   // Stack: 2^X ...
-  addptr(rsp,sizeof(jdouble));
-}
-
 void MacroAssembler::increase_precision() {
   subptr(rsp, BytesPerWord);
   fnstcw(Address(rsp, 0));
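The deleted pow_exp_core_encoding computes 2^X by splitting X into integer and fractional parts: f2xm1 handles 2^(X-int(X)), while 2^int(X) is manufactured directly as an IEEE-754 double by writing int(X)+1023 into the 11-bit exponent field (shll(rax,20) positions it within the high 32-bit word, i.e. at bit 52 of the double). A minimal scalar C++ sketch of the same trick; the helper name two_to_the is ours, and std::exp2 stands in for the f2xm1/fld1/faddp sequence:

#include <cmath>
#include <cstdint>
#include <cstring>

// Sketch: 2^x = 2^int(x) * 2^(x - int(x)). The integer part is encoded
// straight into the exponent bits of a double, as the removed assembly
// does with shll(rax, 20) on the high 32-bit word (bit 52 overall).
static double two_to_the(double x) {
  double i = std::nearbyint(x);      // frndint
  double frac = x - i;               // fsuba(1), roughly in [-0.5, 0.5]
  int64_t e = (int64_t)i + 1023;     // add the exponent bias
  if (e <= 0 || e >= 0x7FF) {        // 0x000 and 0x7FF are reserved encodings
    return std::nan("");             // caller must fall back to a slow path
  }
  uint64_t bits = (uint64_t)e << 52; // exponent field of an IEEE-754 double
  double pow_int;
  std::memcpy(&pow_int, &bits, sizeof(pow_int)); // 2^int(x), exactly
  return pow_int * std::exp2(frac);  // stands in for f2xm1 + fld1 + faddp
}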
@@ -3117,194 +3073,6 @@ void MacroAssembler::restore_precision() {
   addptr(rsp, BytesPerWord);
 }
 
-void MacroAssembler::fast_pow() {
-  // computes X^Y = 2^(Y * log2(X))
-  // if fast computation is not possible, result is NaN. Requires
-  // fallback from user of this macro.
-  // increase precision for intermediate steps of the computation
-  BLOCK_COMMENT("fast_pow {");
-  increase_precision();
-  fyl2x();                 // Stack: (Y*log2(X)) ...
-  pow_exp_core_encoding(); // Stack: exp(X) ...
-  restore_precision();
-  BLOCK_COMMENT("} fast_pow");
-}
-
-void MacroAssembler::pow_or_exp(int num_fpu_regs_in_use) {
-  // kills rax, rcx, rdx
-  // pow and exp need 2 extra registers on the fpu stack.
-  Label slow_case, done;
-  Register tmp = noreg;
-  if (!VM_Version::supports_cmov()) {
-    // fcmp needs a temporary so preserve rdx,
-    tmp = rdx;
-  }
-  Register tmp2 = rax;
-  Register tmp3 = rcx;
-
-  // Stack: X Y
-  Label x_negative, y_not_2;
-
-  static double two = 2.0;
-  ExternalAddress two_addr((address)&two);
-
-  // constant maybe too far on 64 bit
-  lea(tmp2, two_addr);
-  fld_d(Address(tmp2, 0));    // Stack: 2 X Y
-  fcmp(tmp, 2, true, false);  // Stack: X Y
-  jcc(Assembler::parity, y_not_2);
-  jcc(Assembler::notEqual, y_not_2);
-
-  fxch(); fpop();             // Stack: X
-  fmul(0);                    // Stack: X*X
-
-  jmp(done);
-
-  bind(y_not_2);
-
-  fldz();                     // Stack: 0 X Y
-  fcmp(tmp, 1, true, false);  // Stack: X Y
-  jcc(Assembler::above, x_negative);
-
-  // X >= 0
-
-  fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
-  fld_s(1);                   // Stack: X Y X Y
-  fast_pow();                 // Stack: X^Y X Y
-  fcmp(tmp, 0, false, false); // Stack: X^Y X Y
-  // X^Y not equal to itself: X^Y is NaN, go to slow case.
-  jcc(Assembler::parity, slow_case);
-  // get rid of duplicate arguments. Stack: X^Y
-  if (num_fpu_regs_in_use > 0) {
-    fxch(); fpop();
-    fxch(); fpop();
-  } else {
-    ffree(2);
-    ffree(1);
-  }
-  jmp(done);
-
-  // X <= 0
-  bind(x_negative);
-
-  fld_s(1);                   // Stack: Y X Y
-  frndint();                  // Stack: int(Y) X Y
-  fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
-  jcc(Assembler::notEqual, slow_case);
-
-  subptr(rsp, 8);
-
-  // For X^Y, when X < 0, Y has to be an integer and the final
-  // result depends on whether it's odd or even. We just checked
-  // that int(Y) == Y. We move int(Y) to gp registers as a 64 bit
-  // integer to test its parity. If int(Y) is huge and doesn't fit
-  // in the 64 bit integer range, the integer indefinite value will
-  // end up in the gp registers. Huge numbers are all even, and the
-  // integer indefinite number is even, so it's fine.
-
-#ifdef ASSERT
-  // Let's check we don't end up with an integer indefinite number
-  // when not expected. First test for huge numbers: check whether
-  // int(Y)+1 == int(Y) which is true for very large numbers and
-  // those are all even. A 64 bit integer is guaranteed to not
-  // overflow for numbers where y+1 != y (when precision is set to
-  // double precision).
-  Label y_not_huge;
-
-  fld1();                     // Stack: 1 int(Y) X Y
-  fadd(1);                    // Stack: 1+int(Y) int(Y) X Y
-
-#ifdef _LP64
-  // trip to memory to force the precision down from double extended
-  // precision
-  fstp_d(Address(rsp, 0));
-  fld_d(Address(rsp, 0));
-#endif
-
-  fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
-#endif
-
-  // move int(Y) as 64 bit integer to thread's stack
-  fistp_d(Address(rsp,0));    // Stack: X Y
-
-#ifdef ASSERT
-  jcc(Assembler::notEqual, y_not_huge);
-
-  // Y is huge so we know it's even. It may not fit in a 64 bit
-  // integer and we don't want the debug code below to see the
-  // integer indefinite value so overwrite int(Y) on the thread's
-  // stack with 0.
-  movl(Address(rsp, 0), 0);
-  movl(Address(rsp, 4), 0);
-
-  bind(y_not_huge);
-#endif
-
-  fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
-  fld_s(1);                   // Stack: X Y X Y
-  fabs();                     // Stack: abs(X) Y X Y
-  fast_pow();                 // Stack: abs(X)^Y X Y
-  fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
-  // abs(X)^Y not equal to itself: abs(X)^Y is NaN, go to slow case.
-
-  pop(tmp2);
-  NOT_LP64(pop(tmp3));
-  jcc(Assembler::parity, slow_case);
-
-#ifdef ASSERT
-  // Check that int(Y) is not integer indefinite value (int
-  // overflow). Shouldn't happen because for values that would
-  // overflow, 1+int(Y)==Y which was tested earlier.
-#ifndef _LP64
-  {
-    Label integer;
-    testl(tmp2, tmp2);
-    jcc(Assembler::notZero, integer);
-    cmpl(tmp3, 0x80000000);
-    jcc(Assembler::notZero, integer);
-    STOP("integer indefinite value shouldn't be seen here");
-    bind(integer);
-  }
-#else
-  {
-    Label integer;
-    mov(tmp3, tmp2); // preserve tmp2 for parity check below
-    shlq(tmp3, 1);
-    jcc(Assembler::carryClear, integer);
-    jcc(Assembler::notZero, integer);
-    STOP("integer indefinite value shouldn't be seen here");
-    bind(integer);
-  }
-#endif
-#endif
-
-  // get rid of duplicate arguments. Stack: X^Y
-  if (num_fpu_regs_in_use > 0) {
-    fxch(); fpop();
-    fxch(); fpop();
-  } else {
-    ffree(2);
-    ffree(1);
-  }
-
-  testl(tmp2, 1);
-  jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
-  // X <= 0, Y odd: X^Y = -abs(X)^Y
-
-  fchs();                     // Stack: -abs(X)^Y Y
-  jmp(done);
-
-  // slow case: runtime call
-  bind(slow_case);
-
-  fpop();                     // pop incorrect result or int(Y)
-
-  fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), 2, num_fpu_regs_in_use);
-
-  // Come here with result in F-TOS
-  bind(done);
-}
-
 void MacroAssembler::fpop() {
   ffree();
   fincstp();
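The removed pow_or_exp stub short-circuits Y == 2 to X*X, computes X^Y as 2^(Y*log2(X)) for non-negative X, and for negative X requires Y to be an integer whose parity picks the sign of |X|^Y; anything else (including a NaN intermediate) falls back to SharedRuntime::dpow. A scalar C++ sketch of that decision tree, with std::pow standing in for the runtime slow case and the helper name ours:

#include <cmath>

// Sketch of the removed stub's control flow, not the assembly itself.
// std::pow plays the role of the SharedRuntime::dpow slow case.
static double pow_sketch(double x, double y) {
  if (y == 2.0) return x * x;                  // the y_not_2 shortcut
  if (x >= 0.0) {
    double r = std::exp2(y * std::log2(x));    // X^Y = 2^(Y * log2(X))
    return std::isnan(r) ? std::pow(x, y) : r; // NaN -> slow case
  }
  // X < 0: Y must be an integer; otherwise fall back.
  double iy = std::nearbyint(y);
  if (iy != y) return std::pow(x, y);          // jcc(notEqual, slow_case)
  double r = std::exp2(y * std::log2(-x));     // abs(X)^Y
  if (std::isnan(r)) return std::pow(x, y);
  // At or beyond 2^53 every representable double is an even integer,
  // which is what the deleted "huge numbers are all even" comment relies on.
  if (std::fabs(iy) >= 9007199254740992.0) return r;
  return ((long long)iy & 1) ? -r : r;         // parity of int(Y) picks the sign
}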
@@ -8014,9 +7782,15 @@ void MacroAssembler::string_compare(Register str1, Register str2,
                                     XMMRegister vec1, int ae) {
   ShortBranchVerifier sbv(this);
   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
+  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
+  int stride2x2 = 0x40;
   Address::ScaleFactor scale, scale1, scale2;
 
+  if (ae != StrIntrinsicNode::LL) {
+    stride2x2 = 0x20;
+  }
+
   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
     shrl(cnt2, 1);
   }
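The new stride2x2 constant is the per-iteration decrement of cnt2 in the 64-byte AVX-512 loop added further down; cnt2 counts string elements, so one reading of the arithmetic (a sketch with our own naming, not code from the patch):

// One zmm register holds 64 bytes; cnt2 counts string elements, so the
// AVX3 loop steps by 64 elements for Latin-1/Latin-1 (LL) comparisons
// and by 32 once either operand is UTF-16 (2 bytes per element).
static int stride2x2_sketch(bool latin1_latin1) {
  const int zmm_bytes = 64;
  const int element_bytes = latin1_latin1 ? 1 : 2;
  return zmm_bytes / element_bytes;  // 0x40 for LL, 0x20 otherwise
}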
@@ -8026,15 +7800,15 @@ void MacroAssembler::string_compare(Register str1, Register str2,
   movl(result, cnt1);
   subl(cnt1, cnt2);
   push(cnt1);
-  cmov32(Assembler::lessEqual, cnt2, result);
+  cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
 
   // Is the minimum length zero?
   testl(cnt2, cnt2);
   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
   if (ae == StrIntrinsicNode::LL) {
     // Load first bytes
-    load_unsigned_byte(result, Address(str1, 0));
-    load_unsigned_byte(cnt1, Address(str2, 0));
+    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
+    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1 = str2[0]
   } else if (ae == StrIntrinsicNode::UU) {
     // Load first characters
     load_unsigned_short(result, Address(str1, 0));
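The bookkeeping above sets up the usual contract: compare min(len1, len2) elements, and if they all match, return the length difference that was pushed on the stack. A plain C++ model of that contract (not the generated stub; byte strings stand in for the four encoding combinations):

#include <cstdint>

// result starts as len1; cnt1 becomes len1 - len2 and is pushed for the
// LENGTH_DIFF path; cnt2 becomes min(len1, len2), the number of element
// pairs actually compared.
static int string_compare_model(const uint8_t* s1, int len1,
                                const uint8_t* s2, int len2) {
  int length_diff = len1 - len2;          // push(cnt1)
  int n = (len1 < len2) ? len1 : len2;    // cmov32(lessEqual, cnt2, result)
  for (int i = 0; i < n; i++) {
    if (s1[i] != s2[i]) {
      return s1[i] - s2[i];               // first mismatching element decides
    }
  }
  return length_diff;                     // LENGTH_DIFF_LABEL
}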
@@ -8075,7 +7849,10 @@ void MacroAssembler::string_compare(Register str1, Register str2,
     assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
+    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
+    Label COMPARE_TAIL_LONG;
+    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
 
     int pcmpmask = 0x19;
     if (ae == StrIntrinsicNode::LL) {
       pcmpmask &= ~0x01;
@@ -8138,11 +7915,40 @@ void MacroAssembler::string_compare(Register str1, Register str2,
     }
     subl(result, stride2);
     subl(cnt2, stride2);
-    jccb(Assembler::zero, COMPARE_WIDE_TAIL);
+    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
     negptr(result);
 
     // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
     bind(COMPARE_WIDE_VECTORS_LOOP);
+
+#ifdef _LP64
+    if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
+      cmpl(cnt2, stride2x2);
+      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
+      testl(cnt2, stride2x2-1);  // cnt2 holds the vector count
+      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);  // means we cannot subtract by 0x40
+
+      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
+      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
+        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
+        evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
+      } else {
+        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
+        evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
+      }
+      kortestql(k7, k7);
+      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
+      addptr(result, stride2x2);  // update since we already compared at this addr
+      subl(cnt2, stride2x2);      // and sub the size too
+      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
+
+      vpxor(vec1, vec1);
+      jmpb(COMPARE_WIDE_TAIL);
+    }//if (VM_Version::supports_avx512vlbw())
+#endif // _LP64
+
+
+    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
       vmovdqu(vec1, Address(str1, result, scale));
       vpxor(vec1, Address(str2, result, scale));
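In the AVX3 loop, evpcmpeqb sets one mask bit per byte in k7 (1 where the bytes are equal), and kortestql sets the carry flag only when the OR of the masks is all ones, so jcc(Assembler::aboveEqual, ...), taken when carry is clear, detects a mismatch. A one-iteration sketch with compiler intrinsics (compile with -mavx512bw; the helper name is ours):

#include <immintrin.h>

// One 64-byte step of the AVX3 compare: a mask bit of 1 means the bytes
// in that lane are equal; any 0 bit is a mismatch (the kortestql test).
static bool block64_equal(const void* a, const void* b) {
  __m512i va = _mm512_loadu_si512(a);
  __m512i vb = _mm512_loadu_si512(b);
  __mmask64 eq = _mm512_cmpeq_epi8_mask(va, vb); // like evpcmpeqb into k7
  return eq == ~(__mmask64)0;  // all ones <=> carry set by kortestql
}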
@@ -8151,7 +7957,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
       vpxor(vec1, Address(str2, result, scale2));
     }
     vptest(vec1, vec1);
-    jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
+    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
     addptr(result, stride2);
     subl(cnt2, stride2);
     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
@@ -8166,7 +7972,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
     movl(result, stride2);
     movl(cnt2, result);
     negptr(result);
-    jmpb(COMPARE_WIDE_VECTORS_LOOP);
+    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
 
     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
     bind(VECTOR_NOT_EQUAL);
@@ -8310,6 +8116,34 @@ void MacroAssembler::string_compare(Register str1, Register str2,
     }
     jmpb(DONE_LABEL);
 
+#ifdef _LP64
+    if (VM_Version::supports_avx512vlbw()) {
+
+      bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
+
+      kmovql(cnt1, k7);
+      notq(cnt1);
+      bsfq(cnt2, cnt1);
+      if (ae != StrIntrinsicNode::LL) {
+        // Divide diff by 2 to get number of chars
+        sarl(cnt2, 1);
+      }
+      addq(result, cnt2);
+      if (ae == StrIntrinsicNode::LL) {
+        load_unsigned_byte(cnt1, Address(str2, result));
+        load_unsigned_byte(result, Address(str1, result));
+      } else if (ae == StrIntrinsicNode::UU) {
+        load_unsigned_short(cnt1, Address(str2, result, scale));
+        load_unsigned_short(result, Address(str1, result, scale));
+      } else {
+        load_unsigned_short(cnt1, Address(str2, result, scale2));
+        load_unsigned_byte(result, Address(str1, result, scale1));
+      }
+      subl(result, cnt1);
+      jmpb(POP_LABEL);
+    }//if (VM_Version::supports_avx512vlbw())
+#endif // _LP64
+
     // Discard the stored length difference
     bind(POP_LABEL);
     pop(cnt1);
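On a miscompare, k7 has a 0 bit at each unequal byte, so the fixup block inverts the mask and scans for the lowest set bit to get the byte offset of the first difference, halving it when the elements are 2-byte chars. A scalar sketch, with GCC/Clang's __builtin_ctzll standing in for bsfq and our own naming:

#include <cstdint>

// eq_mask: one bit per byte lane, 1 = equal (the k7 register). The caller
// guarantees at least one mismatch, so ~eq_mask is nonzero. Returns the
// element index of the first mismatch within the 64-byte block.
static int first_mismatch_index(uint64_t eq_mask, bool two_byte_chars) {
  uint64_t ne = ~eq_mask;                      // notq(cnt1)
  int byte_off = __builtin_ctzll(ne);          // bsfq(cnt2, cnt1)
  return two_byte_chars ? (byte_off >> 1)      // sarl(cnt2, 1): bytes -> chars
                        : byte_off;
}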
@@ -8319,6 +8153,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
   if (ae == StrIntrinsicNode::UL) {
     negl(result);
   }
 
 }
 
+// Search for Non-ASCII character (Negative byte value) in a byte array,
@@ -8510,13 +8345,53 @@ void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
     // Compare 32-byte vectors
     andl(result, 0x0000001f);   // tail count (in bytes)
     andl(limit, 0xffffffe0);    // vector count (in bytes)
-    jccb(Assembler::zero, COMPARE_TAIL);
+    jcc(Assembler::zero, COMPARE_TAIL);
 
     lea(ary1, Address(ary1, limit, Address::times_1));
     lea(ary2, Address(ary2, limit, Address::times_1));
     negptr(limit);
 
     bind(COMPARE_WIDE_VECTORS);
+
+#ifdef _LP64
+    if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
+      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
+
+      cmpl(limit, -64);
+      jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
+
+      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
+
+      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
+      evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
+      kortestql(k7, k7);
+      jcc(Assembler::aboveEqual, FALSE_LABEL);  // miscompare
+      addptr(limit, 64);  // update since we already compared at this addr
+      cmpl(limit, -64);
+      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
+
+      // At this point we may still need to compare -limit+result bytes.
+      // We could execute the next two instructions and just continue via non-wide path:
+      //  cmpl(limit, 0);
+      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
+      // But since we stopped at the points ary{1,2}+limit which are
+      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
+      // (|limit| <= 32 and result < 32),
+      // we may just compare the last 64 bytes.
+      //
+      addptr(result, -64);  // it is safe, bc we just came from this area
+      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
+      evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
+      kortestql(k7, k7);
+      jcc(Assembler::aboveEqual, FALSE_LABEL);  // miscompare
+
+      jmp(TRUE_LABEL);
+
+      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
+
+    }//if (VM_Version::supports_avx512vlbw())
+#endif //_LP64
+
     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
     vpxor(vec1, vec2);
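The long comment documents the trick: once the 64-byte loop stops within 64 bytes of the end, the stub compares the last 64 bytes outright instead of running narrower tail code, deliberately overlapping bytes that were already compared. A scalar sketch of the same shape, with memcmp standing in for the vector compare and our own helper name:

#include <cstddef>
#include <cstring>

// Shape of the AVX3 arrays_equals path: full 64-byte blocks in a loop,
// then one overlapped compare of the last 64 bytes instead of tail code.
// Re-checking bytes that are already known equal is harmless.
static bool bytes_equal_sketch(const unsigned char* a,
                               const unsigned char* b, std::size_t len) {
  if (len < 64) {
    for (std::size_t i = 0; i < len; i++) {
      if (a[i] != b[i]) return false;  // narrow tail, vectorized in the stub
    }
    return true;
  }
  std::size_t i = 0;
  for (; i + 64 <= len; i += 64) {
    if (std::memcmp(a + i, b + i, 64) != 0) return false;  // evpcmpeqb/kortestql
  }
  return std::memcmp(a + len - 64, b + len - 64, 64) == 0; // overlapped final block
}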
@@ -9454,13 +9329,184 @@ void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
   pop(tmp1);
 }
 
+void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
+                                         Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2) {
+  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
+  Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
+  Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
+  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
+  Label SAME_TILL_END, DONE;
+  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
+
+  // scale is in rcx in both Win64 and Unix
+  ShortBranchVerifier sbv(this);
+
+  shlq(length);
+  xorq(result, result);
+
+  cmpq(length, 8);
+  jcc(Assembler::equal, VECTOR8_LOOP);
+  jcc(Assembler::less, VECTOR4_TAIL);
+
+  if (UseAVX >= 2) {
+
+    cmpq(length, 16);
+    jcc(Assembler::equal, VECTOR16_LOOP);
+    jcc(Assembler::less, VECTOR8_LOOP);
+
+    cmpq(length, 32);
+    jccb(Assembler::less, VECTOR16_TAIL);
+
+    subq(length, 32);
+    bind(VECTOR32_LOOP);
+    vmovdqu(rymm0, Address(obja, result));
+    vmovdqu(rymm1, Address(objb, result));
+    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
+    vptest(rymm2, rymm2);
+    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);  // mismatch found
+    addq(result, 32);
+    subq(length, 32);
+    jccb(Assembler::greaterEqual, VECTOR32_LOOP);
+    addq(length, 32);
+    jcc(Assembler::equal, SAME_TILL_END);
+    // falling through if less than 32 bytes left // close the branch here.
+
+    bind(VECTOR16_TAIL);
+    cmpq(length, 16);
+    jccb(Assembler::less, VECTOR8_TAIL);
+    bind(VECTOR16_LOOP);
+    movdqu(rymm0, Address(obja, result));
+    movdqu(rymm1, Address(objb, result));
+    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
+    ptest(rymm2, rymm2);
+    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);  // mismatch found
+    addq(result, 16);
+    subq(length, 16);
+    jcc(Assembler::equal, SAME_TILL_END);
+    // falling through if less than 16 bytes left
+  } else {  // regular intrinsics
+
+    cmpq(length, 16);
+    jccb(Assembler::less, VECTOR8_TAIL);
+
+    subq(length, 16);
+    bind(VECTOR16_LOOP);
+    movdqu(rymm0, Address(obja, result));
+    movdqu(rymm1, Address(objb, result));
+    pxor(rymm0, rymm1);
+    ptest(rymm0, rymm0);
+    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);  // mismatch found
+    addq(result, 16);
+    subq(length, 16);
+    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
+    addq(length, 16);
+    jcc(Assembler::equal, SAME_TILL_END);
+    // falling through if less than 16 bytes left
+  }
+
+  bind(VECTOR8_TAIL);
+  cmpq(length, 8);
+  jccb(Assembler::less, VECTOR4_TAIL);
+  bind(VECTOR8_LOOP);
+  movq(tmp1, Address(obja, result));
+  movq(tmp2, Address(objb, result));
+  xorq(tmp1, tmp2);
+  testq(tmp1, tmp1);
+  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);  // mismatch found
+  addq(result, 8);
+  subq(length, 8);
+  jcc(Assembler::equal, SAME_TILL_END);
+  // falling through if less than 8 bytes left
+
+  bind(VECTOR4_TAIL);
+  cmpq(length, 4);
+  jccb(Assembler::less, BYTES_TAIL);
+  bind(VECTOR4_LOOP);
+  movl(tmp1, Address(obja, result));
+  xorl(tmp1, Address(objb, result));
+  testl(tmp1, tmp1);
+  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);  // mismatch found
+  addq(result, 4);
+  subq(length, 4);
+  jcc(Assembler::equal, SAME_TILL_END);
+  // falling through if less than 4 bytes left
+
+  bind(BYTES_TAIL);
+  bind(BYTES_LOOP);
+  load_unsigned_byte(tmp1, Address(obja, result));
+  load_unsigned_byte(tmp2, Address(objb, result));
+  xorl(tmp1, tmp2);
+  testl(tmp1, tmp1);
+  jccb(Assembler::notZero, BYTES_NOT_EQUAL);  // mismatch found
+  decq(length);
+  jccb(Assembler::zero, SAME_TILL_END);
+  incq(result);
+  load_unsigned_byte(tmp1, Address(obja, result));
+  load_unsigned_byte(tmp2, Address(objb, result));
+  xorl(tmp1, tmp2);
+  testl(tmp1, tmp1);
+  jccb(Assembler::notZero, BYTES_NOT_EQUAL);  // mismatch found
+  decq(length);
+  jccb(Assembler::zero, SAME_TILL_END);
+  incq(result);
+  load_unsigned_byte(tmp1, Address(obja, result));
+  load_unsigned_byte(tmp2, Address(objb, result));
+  xorl(tmp1, tmp2);
+  testl(tmp1, tmp1);
+  jccb(Assembler::notZero, BYTES_NOT_EQUAL);  // mismatch found
+  jmpb(SAME_TILL_END);
+
+  if (UseAVX >= 2) {
+    bind(VECTOR32_NOT_EQUAL);
+    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
+    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
+    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
+    vpmovmskb(tmp1, rymm0);
+    bsfq(tmp1, tmp1);
+    addq(result, tmp1);
+    shrq(result);
+    jmpb(DONE);
+  }
+
+  bind(VECTOR16_NOT_EQUAL);
+  if (UseAVX >= 2) {
+    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
+    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
+    pxor(rymm0, rymm2);
+  } else {
+    pcmpeqb(rymm2, rymm2);
+    pxor(rymm0, rymm1);
+    pcmpeqb(rymm0, rymm1);
+    pxor(rymm0, rymm2);
+  }
+  pmovmskb(tmp1, rymm0);
+  bsfq(tmp1, tmp1);
+  addq(result, tmp1);
+  shrq(result);
+  jmpb(DONE);
+
+  bind(VECTOR8_NOT_EQUAL);
+  bind(VECTOR4_NOT_EQUAL);
+  bsfq(tmp1, tmp1);
+  shrq(tmp1, 3);
+  addq(result, tmp1);
+  bind(BYTES_NOT_EQUAL);
+  shrq(result);
+  jmpb(DONE);
+
+  bind(SAME_TILL_END);
+  mov64(result, -1);
+
+  bind(DONE);
+}
+
+
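vectorized_mismatch receives its length in array elements; log2_array_indxscale (in rcx, as the comment notes) converts elements to bytes via shlq(length), and the byte offset of a mismatch is converted back with shrq(result), which shifts by the cl register. A scalar model of the contract (names are ours):

#include <cstdint>

// Scalar model of the intrinsic's contract: length and the return value
// are in array elements; log2scale is 0 for byte[], 1 for char[], etc.
// Returns -1 when the ranges are identical (SAME_TILL_END).
static long long mismatch_model(const uint8_t* a, const uint8_t* b,
                                long long length, int log2scale) {
  long long nbytes = length << log2scale;  // shlq(length)
  for (long long i = 0; i < nbytes; i++) {
    if (a[i] != b[i]) {
      return i >> log2scale;               // shrq(result) shifts by cl = scale
    }
  }
  return -1;                               // mov64(result, -1)
}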
+// Helper functions for square_to_len()
+
+/**
+ * Store the squares of x[], right shifted one bit (divided by 2), into z[].
+ * Preserves x and z and modifies the rest of the registers.
+ */
+
+void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+  // Perform square and right shift by 1
+  // Handle odd xlen case first, then for even xlen do the following