Mirror of https://github.com/openjdk/jdk.git

Merge pull request #102 from rivosinc/rivos/merge-jdk-22+0
Merge jdk-22+0 into rivos/main

Commit 6664815e5c: 581 changed files with 41,776 additions and 7,141 deletions.

bin/idea.sh (14 lines changed)
@@ -193,17 +193,7 @@ for root in $MODULE_ROOTS; do
      root=`wslpath -am $root`
    fi

    VM_CI="jdk.internal.vm.ci/share/classes"
    VM_COMPILER="src/jdk.internal.vm.compiler/share/classes"
    if test "${root#*$VM_CI}" != "$root" || test "${root#*$VM_COMPILER}" != "$root"; then
      for subdir in "$root"/*; do
        if [ -d "$subdir" ]; then
          SOURCES=$SOURCES" $SOURCE_PREFIX""$subdir"/src"$SOURCE_POSTFIX"
        fi
      done
    else
      SOURCES=$SOURCES" $SOURCE_PREFIX""$root""$SOURCE_POSTFIX"
    fi
    SOURCES=$SOURCES" $SOURCE_PREFIX""$root""$SOURCE_POSTFIX"
  done

  add_replacement "###SOURCE_ROOTS###" "$SOURCES"
@@ -274,4 +264,4 @@ $BOOT_JDK/bin/$JAVAC -d $JAVAC_CLASSES -sourcepath $JAVAC_SOURCE_PATH -cp $JAVAC

  if [ "x$WSL_DISTRO_NAME" != "x" ]; then
    rm -rf $ANT_TEMP
  fi
fi
@@ -685,6 +685,8 @@ does not automatically locate the platform FreeType files.</p>
Fontconfig</a> is required on all platforms except Windows and
macOS.</p>
<ul>
<li>To install on an apt-based Linux, try running
<code>sudo apt-get install libfontconfig-dev</code>.</li>
<li>To install on an rpm-based Linux, try running
<code>sudo yum install fontconfig-devel</code>.</li>
</ul>
@@ -480,6 +480,8 @@ if `configure` does not automatically locate the platform FreeType files.
Fontconfig from [freedesktop.org Fontconfig](http://fontconfig.org) is required
on all platforms except Windows and macOS.

* To install on an apt-based Linux, try running `sudo apt-get install
  libfontconfig-dev`.
* To install on an rpm-based Linux, try running `sudo yum install
  fontconfig-devel`.
@@ -98,6 +98,7 @@ define SetupInterimModule
      EXCLUDES := sun javax/tools/snippet-files, \
      EXCLUDE_FILES := $(TOPDIR)/src/$1/share/classes/module-info.java \
          $(TOPDIR)/src/$1/share/classes/javax/tools/ToolProvider.java \
          $(TOPDIR)/src/$1/share/classes/com/sun/tools/javac/launcher/Main.java \
          Standard.java, \
      EXTRA_FILES := $(BUILDTOOLS_OUTPUTDIR)/gensrc/$1.interim/module-info.java \
          $($1.interim_EXTRA_FILES), \
@@ -42,7 +42,6 @@ DOCS_MODULES= \
    jdk.hotspot.agent \
    jdk.httpserver \
    jdk.jpackage \
    jdk.incubator.concurrent \
    jdk.incubator.vector \
    jdk.jartool \
    jdk.javadoc \
@@ -43,7 +43,6 @@ BOOT_MODULES= \
    java.rmi \
    java.security.sasl \
    java.xml \
    jdk.incubator.concurrent \
    jdk.incubator.vector \
    jdk.internal.vm.ci \
    jdk.jfr \
@@ -181,6 +181,7 @@ JVM_NewArray
JVM_NewInstanceFromConstructor
JVM_NewMultiArray
JVM_PhantomReferenceRefersTo
JVM_PrintWarningAtDynamicAgentLoad
JVM_RaiseSignal
JVM_RawMonitorCreate
JVM_RawMonitorDestroy
@@ -46,6 +46,8 @@ $(eval $(call SetupJdkLibrary, BUILD_LIBNET, \
    DISABLED_WARNINGS_gcc_net_util_md.c := format-nonliteral, \
    DISABLED_WARNINGS_gcc_NetworkInterface.c := unused-function, \
    DISABLED_WARNINGS_clang_net_util_md.c := format-nonliteral, \
    DISABLED_WARNINGS_clang_aix_DefaultProxySelector.c := deprecated-non-prototype, \
    DISABLED_WARNINGS_clang_aix_NetworkInterface.c := gnu-pointer-arith, \
    DISABLED_WARNINGS_microsoft_InetAddress.c := 4244, \
    DISABLED_WARNINGS_microsoft_ResolverConfigurationImpl.c := 4996, \
    LDFLAGS := $(LDFLAGS_JDKLIB) \
@@ -198,6 +198,7 @@ ifeq ($(call isTargetOs, aix), true)
      OPTIMIZATION := HIGH, \
      CFLAGS := $(STATIC_LIBRARY_FLAGS) $(CFLAGS_JDKLIB) $(LIBJLI_CFLAGS) \
          $(addprefix -I, $(LIBJLI_SRC_DIRS)), \
      DISABLED_WARNINGS_clang_aix := format-nonliteral deprecated-non-prototype, \
      ARFLAGS := $(ARFLAGS), \
      OBJECT_DIR := $(SUPPORT_OUTPUTDIR)/native/$(MODULE)/libjli_static))

@@ -68,6 +68,7 @@ EXCLUDE_FILES += \
ifeq ($(call isTargetOs, macosx), true)
  # exclude all X11 on Mac.
  EXCLUDES += \
      sun/awt/screencast \
      sun/awt/X11 \
      sun/java2d/x11 \
      sun/java2d/jules \
@@ -191,6 +191,9 @@ ifeq ($(call isTargetOs, windows macosx), false)

  LIBAWT_XAWT_EXCLUDES := medialib debug

  LIBPIPEWIRE_HEADER_DIRS := \
      $(TOPDIR)/src/$(MODULE)/unix/native/libpipewire/include

  LIBAWT_XAWT_EXTRA_HEADER_DIRS := \
      $(LIBAWT_DEFAULT_HEADER_DIRS) \
      libawt_xawt/awt \

@@ -200,7 +203,7 @@ ifeq ($(call isTargetOs, windows macosx), false)
      common/font \
      common/java2d/opengl \
      common/java2d/x11 \
      #
      $(LIBPIPEWIRE_HEADER_DIRS)

  LIBAWT_XAWT_CFLAGS += -DXAWT -DXAWT_HACK \
      $(FONTCONFIG_CFLAGS) \

@@ -240,6 +243,14 @@ ifeq ($(call isTargetOs, windows macosx), false)
      DISABLED_WARNINGS_gcc_XRBackendNative.c := maybe-uninitialized, \
      DISABLED_WARNINGS_gcc_XToolkit.c := unused-result, \
      DISABLED_WARNINGS_gcc_XWindow.c := unused-function, \
      DISABLED_WARNINGS_clang_aix := deprecated-non-prototype, \
      DISABLED_WARNINGS_clang_aix_awt_Taskbar.c := parentheses, \
      DISABLED_WARNINGS_clang_aix_OGLPaints.c := format-nonliteral, \
      DISABLED_WARNINGS_clang_aix_OGLBufImgOps.c := format-nonliteral, \
      DISABLED_WARNINGS_clang_aix_gtk2_interface.c := parentheses logical-op-parentheses, \
      DISABLED_WARNINGS_clang_aix_gtk3_interface.c := parentheses logical-op-parentheses, \
      DISABLED_WARNINGS_clang_aix_sun_awt_X11_GtkFileDialogPeer.c := parentheses, \
      DISABLED_WARNINGS_clang_aix_awt_InputMethod.c := sign-compare, \
      LDFLAGS := $(LDFLAGS_JDKLIB) \
          $(call SET_SHARED_LIBRARY_ORIGIN) \
          -L$(INSTALL_LIBRARIES_HERE), \

@@ -446,7 +457,9 @@ else
  # Early re-canonizing has to be disabled to work around an internal XlC compiler error
  # when building libharfbuzz
  ifeq ($(call isTargetOs, aix), true)
    ifneq ($(TOOLCHAIN_TYPE), clang)
      HARFBUZZ_CFLAGS += -qdebug=necan
    endif
  endif

  # hb-ft.cc is not presently needed, and requires freetype 2.4.2 or later.

@@ -693,6 +706,20 @@ ifeq ($(ENABLE_HEADLESS_ONLY), false)
    endif
  endif

  # The external libpng submitted in the jdk is a reduced version
  # which does not contain .png_init_filter_functions_vsx.
  # Therefore we need to disable PNG_POWERPC_VSX_OPT explicitly by setting
  # it to 0. If this define is not set, it would be automatically set to 2,
  # because
  # "#if defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__)"
  # expands to true. That would mean .png_init_filter_functions_vsx is
  # needed in libpng.
  ifeq ($(call isTargetOs, aix), true)
    ifeq ($(TOOLCHAIN_TYPE), clang)
      LIBSPLASHSCREEN_CFLAGS += -DPNG_POWERPC_VSX_OPT=0
    endif
  endif
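Reviewer note on the PNG_POWERPC_VSX_OPT comment above: the auto-detection it refers to lives in libpng's pngpriv.h. A sketch of the logic being disabled (an outline based on the comment, not the verbatim libpng source):

    /* Illustrative sketch of libpng's default; exact libpng source may differ. */
    #ifndef PNG_POWERPC_VSX_OPT
    #  if defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__)
    #    define PNG_POWERPC_VSX_OPT 2  /* auto-enabled: VSX filter code is used */
    #  else
    #    define PNG_POWERPC_VSX_OPT 0
    #  endif
    #endif

    #if PNG_POWERPC_VSX_OPT > 0
    /* png_init_filter_functions_vsx gets referenced here, which the reduced
     * libpng bundled with the JDK does not provide; hence the explicit
     * -DPNG_POWERPC_VSX_OPT=0 above. */
    #endif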

  ifeq ($(call isTargetOs, macosx), true)
    LIBSPLASHSCREEN_CFLAGS += -DWITH_MACOSX
@@ -1,5 +1,5 @@
#
# Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
@@ -32,6 +32,7 @@ $(eval $(call SetupJdkLibrary, BUILD_LIBJ2GSS, \
    OPTIMIZATION := LOW, \
    CFLAGS := $(CFLAGS_JDKLIB), \
    DISABLED_WARNINGS_gcc := undef, \
    DISABLED_WARNINGS_clang_aix := undef, \
    LDFLAGS := $(LDFLAGS_JDKLIB) \
        $(call SET_SHARED_LIBRARY_ORIGIN), \
    LIBS := $(LIBDL), \
make/modules/java.xml/Copy.gmk (new file, 37 lines)
@@ -0,0 +1,37 @@
#
# Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation. Oracle designates this
# particular file as subject to the "Classpath" exception as provided
# by Oracle in the LICENSE file that accompanied this code.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#

include CopyCommon.gmk

################################################################################

XML_LIB_SRC := $(TOPDIR)/src/java.xml/share/conf

$(CONF_DST_DIR)/jaxp.properties: $(XML_LIB_SRC)/jaxp.properties
	$(call install-file)

TARGETS := $(CONF_DST_DIR)/jaxp.properties

################################################################################
@@ -1,5 +1,5 @@
#
# Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2011, 2023, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
@@ -30,6 +30,7 @@ include LibCommon.gmk
$(eval $(call SetupJdkLibrary, BUILD_LIBDT_SOCKET, \
    NAME := dt_socket, \
    OPTIMIZATION := LOW, \
    DISABLED_WARNINGS_clang_aix := missing-braces, \
    CFLAGS := $(CFLAGS_JDKLIB) $(LIBDT_SOCKET_CPPFLAGS), \
    EXTRA_HEADER_DIRS := \
        include \
@@ -5522,12 +5522,12 @@ instruct vstoremask_truecount_neon(iRegINoSp dst, vReg src, immI_gt_1 size, vReg
    // Input "src" is a vector mask represented as lanes with
    // 0/-1 as element values.
    uint esize = (uint)$size$$constant;
    if (esize == 8) {
      __ addpd($vtmp$$FloatRegister, $src$$FloatRegister);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    Assembler::SIMD_Arrangement arrangement = Assembler::esize2arrangement(esize,
                                                /* isQ */ length_in_bytes == 16);
    if (arrangement == __ T2D || arrangement == __ T2S) {
      __ addpv($vtmp$$FloatRegister, arrangement, $src$$FloatRegister, $src$$FloatRegister);
    } else {
      uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
      Assembler::SIMD_Arrangement arrangement = Assembler::esize2arrangement(esize,
                                                  /* isQ */ length_in_bytes == 16);
      __ addv($vtmp$$FloatRegister, arrangement, $src$$FloatRegister);
    }
    __ smov($dst$$Register, $vtmp$$FloatRegister, __ B, 0);
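Reviewer note: the 0/-1 lane-sum trick above is easier to follow in scalar form. A minimal sketch (hypothetical helper, not JDK code; assumes the usual all-bits-set encoding for true lanes and the small lane counts used here):

    #include <cstdint>

    // Each lane holds 0 (false) or -1 (true); a horizontal add (addv/addpv)
    // therefore yields -(number of true lanes) in the accumulated byte.
    int mask_truecount(const int8_t* lanes, int n) {
      int8_t sum = 0;
      for (int i = 0; i < n; i++) {
        sum += lanes[i];
      }
      // The stub extracts the summed byte with smov; the negation happens in
      // code after this excerpt (assumption: not shown in the hunk).
      return -sum;
    }
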
@@ -3832,12 +3832,12 @@ instruct vstoremask_truecount_neon(iRegINoSp dst, vReg src, immI_gt_1 size, vReg
    // Input "src" is a vector mask represented as lanes with
    // 0/-1 as element values.
    uint esize = (uint)$size$$constant;
    if (esize == 8) {
      __ addpd($vtmp$$FloatRegister, $src$$FloatRegister);
    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
    Assembler::SIMD_Arrangement arrangement = Assembler::esize2arrangement(esize,
                                                /* isQ */ length_in_bytes == 16);
    if (arrangement == __ T2D || arrangement == __ T2S) {
      __ addpv($vtmp$$FloatRegister, arrangement, $src$$FloatRegister, $src$$FloatRegister);
    } else {
      uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
      Assembler::SIMD_Arrangement arrangement = Assembler::esize2arrangement(esize,
                                                  /* isQ */ length_in_bytes == 16);
      __ addv($vtmp$$FloatRegister, arrangement, $src$$FloatRegister);
    }
    __ smov($dst$$Register, $vtmp$$FloatRegister, __ B, 0);

@@ -7046,6 +7046,171 @@ class StubGenerator: public StubCodeGenerator {
    return start;
  }

  // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
  // are represented as long[5], with BITS_PER_LIMB = 26.
  // Pack five 26-bit limbs into three 64-bit registers.
  void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
    __ ldp(dest0, rscratch1, Address(src, 0));            // 26 bits
    __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
    __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
    __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits

    __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
    __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
    __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
    __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits

    if (dest2->is_valid()) {
      __ add(dest2, zr, rscratch1, Assembler::LSR, 24);   // 2 bits
    } else {
#ifdef ASSERT
      Label OK;
      __ cmp(zr, rscratch1, Assembler::LSR, 24);          // 2 bits
      __ br(__ EQ, OK);
      __ stop("high bits of Poly1305 integer should be zero");
      __ should_not_reach_here();
      __ bind(OK);
#endif
    }
  }

  // As above, but return only a 128-bit integer, packed into two
  // 64-bit registers.
  void pack_26(Register dest0, Register dest1, Register src) {
    pack_26(dest0, dest1, noreg, src);
  }
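Reviewer note: a scalar model of the limb packing may help here. A sketch, under the assumption that each limb is already reduced to its low 26 bits (the stub uses add rather than orr so that small carries in a redundant representation still propagate):

    #include <cstdint>

    struct U192 { uint64_t w0, w1, w2; };  // three 64-bit words, low to high

    U192 pack_26(const uint64_t l[5]) {
      U192 r;
      r.w0 = l[0] | (l[1] << 26) | (l[2] << 52);          // 26 + 26 + low 12 bits of l[2]
      r.w1 = (l[2] >> 12) | (l[3] << 14) | (l[4] << 40);  // 14 + 26 + low 24 bits of l[4]
      r.w2 = l[4] >> 24;                                  // top 2 bits of the 130-bit value
      return r;
    }

The shift constants match the ones in the stub: 26, 52, 12, 14, 40, 24.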

  // Multiply and multiply-accumulate unsigned 64-bit registers.
  void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
    __ mul(prod_lo, n, m);
    __ umulh(prod_hi, n, m);
  }
  void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
    wide_mul(rscratch1, rscratch2, n, m);
    __ adds(sum_lo, sum_lo, rscratch1);
    __ adc(sum_hi, sum_hi, rscratch2);
  }
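In scalar terms these are a full 64x64 to 128-bit multiply and a multiply-accumulate. A sketch using the GCC/Clang unsigned __int128 extension:

    #include <cstdint>

    void wide_mul(uint64_t& lo, uint64_t& hi, uint64_t n, uint64_t m) {
      unsigned __int128 p = (unsigned __int128)n * m;  // mul = low 64, umulh = high 64
      lo = (uint64_t)p;
      hi = (uint64_t)(p >> 64);
    }

    void wide_madd(uint64_t& lo, uint64_t& hi, uint64_t n, uint64_t m) {
      unsigned __int128 acc = ((unsigned __int128)hi << 64) | lo;
      acc += (unsigned __int128)n * m;  // the adds/adc pair carries between halves
      lo = (uint64_t)acc;
      hi = (uint64_t)(acc >> 64);
    }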

  // Poly1305, RFC 7539

  // See https://loup-vaillant.fr/tutorials/poly1305-design for a
  // description of the tricks used to simplify and accelerate this
  // computation.

  address generate_poly1305_processBlocks() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
    address start = __ pc();
    Label here;
    __ enter();
    RegSet callee_saved = RegSet::range(r19, r28);
    __ push(callee_saved, sp);

    RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();

    // Arguments
    const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;

    // R_n is the 128-bit randomly-generated key, packed into two
    // registers. The caller passes this key to us as long[5], with
    // BITS_PER_LIMB = 26.
    const Register R_0 = *++regs, R_1 = *++regs;
    pack_26(R_0, R_1, r_start);

    // RR_n is (R_n >> 2) * 5
    const Register RR_0 = *++regs, RR_1 = *++regs;
    __ lsr(RR_0, R_0, 2);
    __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
    __ lsr(RR_1, R_1, 2);
    __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);

    // U_n is the current checksum
    const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
    pack_26(U_0, U_1, U_2, acc_start);

    static constexpr int BLOCK_LENGTH = 16;
    Label DONE, LOOP;

    __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
    __ br(Assembler::LT, DONE); {
      __ bind(LOOP);

      // S_n is to be the sum of U_n and the next block of data
      const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
      __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
      __ adds(S_0, U_0, S_0);
      __ adcs(S_1, U_1, S_1);
      __ adc(S_2, U_2, zr);
      __ add(S_2, S_2, 1);

      const Register U_0HI = *++regs, U_1HI = *++regs;

      // NB: this logic depends on some of the special properties of
      // Poly1305 keys. In particular, because we know that the top
      // four bits of R_0 and R_1 are zero, we can add together
      // partial products without any risk of needing to propagate a
      // carry out.
      wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
      wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
      __ andr(U_2, R_0, 3);
      __ mul(U_2, S_2, U_2);

      // Recycle registers S_0, S_1, S_2
      regs = (regs.remaining() + S_0 + S_1 + S_2).begin();

      // Partial reduction mod 2**130 - 5
      __ adds(U_1, U_0HI, U_1);
      __ adc(U_2, U_1HI, U_2);
      // Sum now in U_2:U_1:U_0.
      // Dead: U_0HI, U_1HI.
      regs = (regs.remaining() + U_0HI + U_1HI).begin();

      // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps

      // First, U_2:U_1:U_0 += (U_2 >> 2)
      __ lsr(rscratch1, U_2, 2);
      __ andr(U_2, U_2, (u8)3);
      __ adds(U_0, U_0, rscratch1);
      __ adcs(U_1, U_1, zr);
      __ adc(U_2, U_2, zr);
      // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
      __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
      __ adcs(U_1, U_1, zr);
      __ adc(U_2, U_2, zr);

      __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
      __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
      __ br(~ Assembler::LT, LOOP);
    }

    // Further reduce modulo 2^130 - 5
    __ lsr(rscratch1, U_2, 2);
    __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
    __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
    __ adcs(U_1, U_1, zr);
    __ andr(U_2, U_2, (u1)3);
    __ adc(U_2, U_2, zr);

    // Unpack the sum into five 26-bit limbs and write to memory.
    __ ubfiz(rscratch1, U_0, 0, 26);
    __ ubfx(rscratch2, U_0, 26, 26);
    __ stp(rscratch1, rscratch2, Address(acc_start));
    __ ubfx(rscratch1, U_0, 52, 12);
    __ bfi(rscratch1, U_1, 12, 14);
    __ ubfx(rscratch2, U_1, 14, 26);
    __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
    __ ubfx(rscratch1, U_1, 40, 24);
    __ bfi(rscratch1, U_2, 24, 3);
    __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));

    __ bind(DONE);
    __ pop(callee_saved, sp);
    __ leave();
    __ ret(lr);

    return start;
  }
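Reviewer note on the "(U_2 >> 2) * 5" steps: they rely on the identity 2^130 = (2^130 - 5) + 5, so for x = lo + 2^130 * hi we have x = lo + 5 * hi modulo p = 2^130 - 5. U_2 holds the bits from 128 upward, so hi = U_2 >> 2, and adding hi plus hi << 2 folds 5 * hi back into the low limbs. A toy single-word analogue (assumption: a deliberately tiny modulus, 2^6 - 5 = 59, chosen only so the identity is easy to check):

    #include <cstdint>

    uint32_t fold_mod_59(uint32_t x) {
      while (x >= 64) {
        x = (x & 63) + 5 * (x >> 6);  // fold the bits above 2^6 back in, times 5
      }
      return x >= 59 ? x - 59 : x;    // final conditional subtraction
    }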

#if INCLUDE_JFR

  static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
@@ -8133,6 +8298,10 @@ class StubGenerator: public StubCodeGenerator {

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

@@ -565,6 +565,10 @@ void VM_Version::initialize() {
  if (FLAG_IS_DEFAULT(AlignVector)) {
    AlignVector = AvoidUnalignedAccesses;
  }

  if (FLAG_IS_DEFAULT(UsePoly1305Intrinsics)) {
    FLAG_SET_DEFAULT(UsePoly1305Intrinsics, true);
  }
#endif

  _spin_wait = get_spin_wait_desc();

@@ -114,41 +114,46 @@ void C1_MacroAssembler::lock_object(Register Rmark, Register Roop, Register Rbox
    bne(CCR0, slow_int);
  }

  // ... and mark it unlocked.
  ori(Rmark, Rmark, markWord::unlocked_value);
  if (LockingMode == LM_LIGHTWEIGHT) {
    fast_lock(Roop, Rmark, Rscratch, slow_int);
  } else if (LockingMode == LM_LEGACY) {
    // ... and mark it unlocked.
    ori(Rmark, Rmark, markWord::unlocked_value);

    // Save unlocked object header into the displaced header location on the stack.
    std(Rmark, BasicLock::displaced_header_offset_in_bytes(), Rbox);
    // Save unlocked object header into the displaced header location on the stack.
    std(Rmark, BasicLock::displaced_header_offset_in_bytes(), Rbox);

    // Compare object markWord with Rmark and if equal exchange Rscratch with object markWord.
    assert(oopDesc::mark_offset_in_bytes() == 0, "cas must take a zero displacement");
    cmpxchgd(/*flag=*/CCR0,
             /*current_value=*/Rscratch,
             /*compare_value=*/Rmark,
             /*exchange_value=*/Rbox,
             /*where=*/Roop/*+0==mark_offset_in_bytes*/,
             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
             MacroAssembler::cmpxchgx_hint_acquire_lock(),
             noreg,
             &cas_failed,
             /*check without membar and ldarx first*/true);
    // If compare/exchange succeeded we found an unlocked object and we now have locked it
    // hence we are done.
    // Compare object markWord with Rmark and if equal exchange Rscratch with object markWord.
    assert(oopDesc::mark_offset_in_bytes() == 0, "cas must take a zero displacement");
    cmpxchgd(/*flag=*/CCR0,
             /*current_value=*/Rscratch,
             /*compare_value=*/Rmark,
             /*exchange_value=*/Rbox,
             /*where=*/Roop/*+0==mark_offset_in_bytes*/,
             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
             MacroAssembler::cmpxchgx_hint_acquire_lock(),
             noreg,
             &cas_failed,
             /*check without membar and ldarx first*/true);
    // If compare/exchange succeeded we found an unlocked object and we now have locked it
    // hence we are done.
  }
  b(done);

  bind(slow_int);
  b(slow_case); // far

  bind(cas_failed);
  // We did not find an unlocked object so see if this is a recursive case.
  sub(Rscratch, Rscratch, R1_SP);
  load_const_optimized(R0, (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
  and_(R0/*==0?*/, Rscratch, R0);
  std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), Rbox);
  bne(CCR0, slow_int);
  if (LockingMode == LM_LEGACY) {
    bind(cas_failed);
    // We did not find an unlocked object so see if this is a recursive case.
    sub(Rscratch, Rscratch, R1_SP);
    load_const_optimized(R0, (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    and_(R0/*==0?*/, Rscratch, R0);
    std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), Rbox);
    bne(CCR0, slow_int);
  }

  bind(done);

  inc_held_monitor_count(Rmark /*tmp*/);
}
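Reviewer note: the LM_LEGACY branch above is the classic displaced-header stack lock. A scalar sketch of its control flow (hypothetical types; std::atomic stands in for the cmpxchgd, and the page-mask test models the recursive-lock check):

    #include <atomic>
    #include <cstdint>

    bool legacy_fast_lock(std::atomic<uintptr_t>& mark, uintptr_t& box_displaced_header,
                          uintptr_t sp, uintptr_t page_size) {
      uintptr_t unlocked = mark.load() | 1;  // ori(Rmark, Rmark, unlocked_value)
      box_displaced_header = unlocked;       // save header into the on-stack box
      uintptr_t expected = unlocked;
      if (mark.compare_exchange_strong(expected, (uintptr_t)&box_displaced_header)) {
        return true;                         // CAS succeeded: we own the lock
      }
      // cas_failed: recursive if the current mark points into our own stack page
      if (((expected - sp) & (~(page_size - 1) | 3)) == 0) {
        box_displaced_header = 0;            // zero displaced header marks a recursive lock
        return true;
      }
      return false;                          // slow_int -> slow_case
    }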
@@ -161,33 +166,41 @@ void C1_MacroAssembler::unlock_object(Register Rmark, Register Roop, Register Rb
  Address mark_addr(Roop, oopDesc::mark_offset_in_bytes());
  assert(mark_addr.disp() == 0, "cas must take a zero displacement");

  // Test first if it is a fast recursive unlock.
  ld(Rmark, BasicLock::displaced_header_offset_in_bytes(), Rbox);
  cmpdi(CCR0, Rmark, 0);
  beq(CCR0, done);
  if (LockingMode != LM_LIGHTWEIGHT) {
    // Test first if it is a fast recursive unlock.
    ld(Rmark, BasicLock::displaced_header_offset_in_bytes(), Rbox);
    cmpdi(CCR0, Rmark, 0);
    beq(CCR0, done);
  }

  // Load object.
  ld(Roop, in_bytes(BasicObjectLock::obj_offset()), Rbox);
  verify_oop(Roop, FILE_AND_LINE);

  // Check if it is still a light weight lock, this is true if we see
  // the stack address of the basicLock in the markWord of the object.
  cmpxchgd(/*flag=*/CCR0,
           /*current_value=*/R0,
           /*compare_value=*/Rbox,
           /*exchange_value=*/Rmark,
           /*where=*/Roop,
           MacroAssembler::MemBarRel,
           MacroAssembler::cmpxchgx_hint_release_lock(),
           noreg,
           &slow_int);
  if (LockingMode == LM_LIGHTWEIGHT) {
    ld(Rmark, oopDesc::mark_offset_in_bytes(), Roop);
    andi_(R0, Rmark, markWord::monitor_value);
    bne(CCR0, slow_int);
    fast_unlock(Roop, Rmark, slow_int);
  } else if (LockingMode == LM_LEGACY) {
    // Check if it is still a light weight lock, this is true if we see
    // the stack address of the basicLock in the markWord of the object.
    cmpxchgd(/*flag=*/CCR0,
             /*current_value=*/R0,
             /*compare_value=*/Rbox,
             /*exchange_value=*/Rmark,
             /*where=*/Roop,
             MacroAssembler::MemBarRel,
             MacroAssembler::cmpxchgx_hint_release_lock(),
             noreg,
             &slow_int);
  }
  b(done);
  bind(slow_int);
  b(slow_case); // far

  // Done
  bind(done);

  dec_held_monitor_count(Rmark /*tmp*/);
}
@@ -924,7 +924,7 @@ void InterpreterMacroAssembler::lock_object(Register monitor, Register object) {
  if (LockingMode == LM_MONITOR) {
    call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter), monitor);
  } else {
    // template code:
    // template code (for LM_LEGACY):
    //
    // markWord displaced_header = obj->mark().set_unlocked();
    // monitor->lock()->set_displaced_header(displaced_header);
@@ -938,7 +938,7 @@ void InterpreterMacroAssembler::lock_object(Register monitor, Register object) {
    //   InterpreterRuntime::monitorenter(THREAD, monitor);
    // }

    const Register displaced_header = R7_ARG5;
    const Register header = R7_ARG5;
    const Register object_mark_addr = R8_ARG6;
    const Register current_header = R9_ARG7;
    const Register tmp = R10_ARG8;
@@ -946,12 +946,12 @@ void InterpreterMacroAssembler::lock_object(Register monitor, Register object) {
    Label count_locking, done;
    Label cas_failed, slow_case;

    assert_different_registers(displaced_header, object_mark_addr, current_header, tmp);
    assert_different_registers(header, object_mark_addr, current_header, tmp);

    // markWord displaced_header = obj->mark().set_unlocked();

    // Load markWord from object into displaced_header.
    ld(displaced_header, oopDesc::mark_offset_in_bytes(), object);
    // Load markWord from object into header.
    ld(header, oopDesc::mark_offset_in_bytes(), object);

    if (DiagnoseSyncOnValueBasedClasses != 0) {
      load_klass(tmp, object);
@@ -960,58 +960,64 @@ void InterpreterMacroAssembler::lock_object(Register monitor, Register object) {
      bne(CCR0, slow_case);
    }

    // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
    ori(displaced_header, displaced_header, markWord::unlocked_value);
    if (LockingMode == LM_LIGHTWEIGHT) {
      fast_lock(object, /* mark word */ header, tmp, slow_case);
      b(count_locking);
    } else if (LockingMode == LM_LEGACY) {

      // monitor->lock()->set_displaced_header(displaced_header);
      const int lock_offset = in_bytes(BasicObjectLock::lock_offset());
      const int mark_offset = lock_offset +
                              BasicLock::displaced_header_offset_in_bytes();
      // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
      ori(header, header, markWord::unlocked_value);

      // Initialize the box (Must happen before we update the object mark!).
      std(displaced_header, mark_offset, monitor);
      // monitor->lock()->set_displaced_header(displaced_header);
      const int lock_offset = in_bytes(BasicObjectLock::lock_offset());
      const int mark_offset = lock_offset +
                              BasicLock::displaced_header_offset_in_bytes();

      // if (Atomic::cmpxchg(/*addr*/obj->mark_addr(), /*cmp*/displaced_header, /*ex=*/monitor) == displaced_header) {
      // Initialize the box (Must happen before we update the object mark!).
      std(header, mark_offset, monitor);

      // Store stack address of the BasicObjectLock (this is monitor) into object.
      addi(object_mark_addr, object, oopDesc::mark_offset_in_bytes());
      // if (Atomic::cmpxchg(/*addr*/obj->mark_addr(), /*cmp*/displaced_header, /*ex=*/monitor) == displaced_header) {

      // Must fence, otherwise, preceding store(s) may float below cmpxchg.
      // CmpxchgX sets CCR0 to cmpX(current, displaced).
      cmpxchgd(/*flag=*/CCR0,
               /*current_value=*/current_header,
               /*compare_value=*/displaced_header, /*exchange_value=*/monitor,
               /*where=*/object_mark_addr,
               MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
               MacroAssembler::cmpxchgx_hint_acquire_lock(),
               noreg,
               &cas_failed,
               /*check without membar and ldarx first*/true);
      // Store stack address of the BasicObjectLock (this is monitor) into object.
      addi(object_mark_addr, object, oopDesc::mark_offset_in_bytes());

      // If the compare-and-exchange succeeded, then we found an unlocked
      // object and we have now locked it.
      b(count_locking);
      bind(cas_failed);
      // Must fence, otherwise, preceding store(s) may float below cmpxchg.
      // CmpxchgX sets CCR0 to cmpX(current, displaced).
      cmpxchgd(/*flag=*/CCR0,
               /*current_value=*/current_header,
               /*compare_value=*/header, /*exchange_value=*/monitor,
               /*where=*/object_mark_addr,
               MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
               MacroAssembler::cmpxchgx_hint_acquire_lock(),
               noreg,
               &cas_failed,
               /*check without membar and ldarx first*/true);

      // } else if (THREAD->is_lock_owned((address)displaced_header))
      //   // Simple recursive case.
      //   monitor->lock()->set_displaced_header(nullptr);
      // If the compare-and-exchange succeeded, then we found an unlocked
      // object and we have now locked it.
      b(count_locking);
      bind(cas_failed);

      // We did not see an unlocked object so try the fast recursive case.
      // } else if (THREAD->is_lock_owned((address)displaced_header))
      //   // Simple recursive case.
      //   monitor->lock()->set_displaced_header(nullptr);

      // Check if owner is self by comparing the value in the markWord of object
      // (current_header) with the stack pointer.
      sub(current_header, current_header, R1_SP);
      // We did not see an unlocked object so try the fast recursive case.

      assert(os::vm_page_size() > 0xfff, "page size too small - change the constant");
      load_const_optimized(tmp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
      // Check if owner is self by comparing the value in the markWord of object
      // (current_header) with the stack pointer.
      sub(current_header, current_header, R1_SP);

      and_(R0/*==0?*/, current_header, tmp);
      // If condition is true we are done and hence we can store 0 in the displaced
      // header indicating it is a recursive lock.
      bne(CCR0, slow_case);
      std(R0/*==0!*/, mark_offset, monitor);
      b(count_locking);
      assert(os::vm_page_size() > 0xfff, "page size too small - change the constant");
      load_const_optimized(tmp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);

      and_(R0/*==0?*/, current_header, tmp);
      // If condition is true we are done and hence we can store 0 in the displaced
      // header indicating it is a recursive lock.
      bne(CCR0, slow_case);
      std(R0/*==0!*/, mark_offset, monitor);
      b(count_locking);
    }

    // } else {
    //   // Slow path.
@@ -1020,7 +1026,11 @@ void InterpreterMacroAssembler::lock_object(Register monitor, Register object) {
    // None of the above fast optimizations worked so we have to get into the
    // slow case of monitor enter.
    bind(slow_case);
    call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter), monitor);
    if (LockingMode == LM_LIGHTWEIGHT) {
      call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter_obj), object);
    } else {
      call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter), monitor);
    }
    b(done);
    // }
    align(32, 12);
@@ -1042,7 +1052,7 @@ void InterpreterMacroAssembler::unlock_object(Register monitor) {
    call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorexit), monitor);
  } else {

    // template code:
    // template code (for LM_LEGACY):
    //
    // if ((displaced_header = monitor->displaced_header()) == nullptr) {
    //   // Recursive unlock. Mark the monitor unlocked by setting the object field to null.
@@ -1056,22 +1066,24 @@ void InterpreterMacroAssembler::unlock_object(Register monitor) {
    // }

    const Register object = R7_ARG5;
    const Register displaced_header = R8_ARG6;
    const Register header = R8_ARG6;
    const Register object_mark_addr = R9_ARG7;
    const Register current_header = R10_ARG8;

    Label free_slot;
    Label slow_case;

    assert_different_registers(object, displaced_header, object_mark_addr, current_header);
    assert_different_registers(object, header, object_mark_addr, current_header);

    // Test first if we are in the fast recursive case.
    ld(displaced_header, in_bytes(BasicObjectLock::lock_offset()) +
                         BasicLock::displaced_header_offset_in_bytes(), monitor);
    if (LockingMode != LM_LIGHTWEIGHT) {
      // Test first if we are in the fast recursive case.
      ld(header, in_bytes(BasicObjectLock::lock_offset()) +
                 BasicLock::displaced_header_offset_in_bytes(), monitor);

      // If the displaced header is zero, we have a recursive unlock.
      cmpdi(CCR0, displaced_header, 0);
      beq(CCR0, free_slot); // recursive unlock
      // If the displaced header is zero, we have a recursive unlock.
      cmpdi(CCR0, header, 0);
      beq(CCR0, free_slot); // recursive unlock
    }

    // } else if (Atomic::cmpxchg(obj->mark_addr(), monitor, displaced_header) == monitor) {
    //   // We swapped the unlocked mark in displaced_header into the object's mark word.
@@ -1081,20 +1093,41 @@ void InterpreterMacroAssembler::unlock_object(Register monitor) {

    // The object address from the monitor is in object.
    ld(object, in_bytes(BasicObjectLock::obj_offset()), monitor);
    addi(object_mark_addr, object, oopDesc::mark_offset_in_bytes());

    // We have the displaced header in displaced_header. If the lock is still
    // lightweight, it will contain the monitor address and we'll store the
    // displaced header back into the object's mark word.
    // CmpxchgX sets CCR0 to cmpX(current, monitor).
    cmpxchgd(/*flag=*/CCR0,
             /*current_value=*/current_header,
             /*compare_value=*/monitor, /*exchange_value=*/displaced_header,
             /*where=*/object_mark_addr,
             MacroAssembler::MemBarRel,
             MacroAssembler::cmpxchgx_hint_release_lock(),
             noreg,
             &slow_case);
    if (LockingMode == LM_LIGHTWEIGHT) {
      // Check for non-symmetric locking. This is allowed by the spec and the interpreter
      // must handle it.
      Register tmp = current_header;
      // First check for lock-stack underflow.
      lwz(tmp, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
      cmplwi(CCR0, tmp, (unsigned)LockStack::start_offset());
      ble(CCR0, slow_case);
      // Then check if the top of the lock-stack matches the unlocked object.
      addi(tmp, tmp, -oopSize);
      ldx(tmp, tmp, R16_thread);
      cmpd(CCR0, tmp, object);
      bne(CCR0, slow_case);

      ld(header, oopDesc::mark_offset_in_bytes(), object);
      andi_(R0, header, markWord::monitor_value);
      bne(CCR0, slow_case);
      fast_unlock(object, header, slow_case);
    } else {
      addi(object_mark_addr, object, oopDesc::mark_offset_in_bytes());

      // We have the displaced header in displaced_header. If the lock is still
      // lightweight, it will contain the monitor address and we'll store the
      // displaced header back into the object's mark word.
      // CmpxchgX sets CCR0 to cmpX(current, monitor).
      cmpxchgd(/*flag=*/CCR0,
               /*current_value=*/current_header,
               /*compare_value=*/monitor, /*exchange_value=*/header,
               /*where=*/object_mark_addr,
               MacroAssembler::MemBarRel,
               MacroAssembler::cmpxchgx_hint_release_lock(),
               noreg,
               &slow_case);
    }
    b(free_slot);

    // } else {
@@ -2629,8 +2629,7 @@ void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register
                                               Metadata* method_data,
                                               bool use_rtm, bool profile_rtm) {
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  assert(flag != CCR0, "bad condition register");
  Label cont;
  assert(LockingMode != LM_LIGHTWEIGHT || flag == CCR0, "bad condition register");
  Label object_has_monitor;
  Label cas_failed;
  Label success, failure;
@@ -2649,7 +2648,7 @@ void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
                      stack_rtm_counters, method_data, profile_rtm,
                      cont, object_has_monitor);
                      success, object_has_monitor);
  }
#endif // INCLUDE_RTM_OPT

@@ -2658,7 +2657,11 @@ void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register
  andi_(temp, displaced_header, markWord::monitor_value);
  bne(CCR0, object_has_monitor);

  if (LockingMode != LM_MONITOR) {
  if (LockingMode == LM_MONITOR) {
    // Set NE to indicate 'failure' -> take slow-path.
    crandc(flag, Assembler::equal, flag, Assembler::equal);
    b(failure);
  } else if (LockingMode == LM_LEGACY) {
    // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
    ori(displaced_header, displaced_header, markWord::unlocked_value);

@@ -2683,28 +2686,31 @@ void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register
    // If the compare-and-exchange succeeded, then we found an unlocked
    // object and we have now locked it.
    b(success);
  } else {
    // Set NE to indicate 'failure' -> take slow-path.
    crandc(flag, Assembler::equal, flag, Assembler::equal);

    bind(cas_failed);
    // We did not see an unlocked object so try the fast recursive case.

    // Check if the owner is self by comparing the value in the markWord of object
    // (current_header) with the stack pointer.
    sub(current_header, current_header, R1_SP);
    load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);

    and_(R0/*==0?*/, current_header, temp);
    // If condition is true we are cont and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);

    if (flag != CCR0) {
      mcrf(flag, CCR0);
    }
    beq(CCR0, success);
    b(failure);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "must be");
    fast_lock(oop, displaced_header, temp, failure);
    b(success);
  }

  bind(cas_failed);
  // We did not see an unlocked object so try the fast recursive case.

  // Check if the owner is self by comparing the value in the markWord of object
  // (current_header) with the stack pointer.
  sub(current_header, current_header, R1_SP);
  load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);

  and_(R0/*==0?*/, current_header, temp);
  // If condition is true we are cont and hence we can store 0 as the
  // displaced header in the box, which indicates that it is a recursive lock.
  mcrf(flag,CCR0);
  std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);

  b(cont);

  // Handle existing monitor.
  bind(object_has_monitor);
  // The object's monitor m is unlocked iff m->owner is null,
@@ -2714,7 +2720,8 @@ void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
                         rtm_counters, method_data, profile_rtm, cont);
                         rtm_counters, method_data, profile_rtm, success);
    bne(flag, failure);
  } else {
#endif // INCLUDE_RTM_OPT

@@ -2728,8 +2735,10 @@ void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register
           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock());

  // Store a non-null value into the box.
  std(box, BasicLock::displaced_header_offset_in_bytes(), box);
  if (LockingMode != LM_LIGHTWEIGHT) {
    // Store a non-null value into the box.
    std(box, BasicLock::displaced_header_offset_in_bytes(), box);
  }
  beq(flag, success);

  // Check for recursive locking.
@@ -2746,10 +2755,8 @@ void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register
  } // use_rtm()
#endif

  bind(cont);
  // flag == EQ indicates success, increment held monitor count
  // flag == NE indicates failure
  bne(flag, failure);
  bind(success);
  inc_held_monitor_count(temp);
  bind(failure);
@@ -2759,9 +2766,8 @@ void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Registe
                                                 Register temp, Register displaced_header, Register current_header,
                                                 bool use_rtm) {
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  assert(flag != CCR0, "bad condition register");
  Label object_has_monitor, notRecursive;
  Label success, failure;
  assert(LockingMode != LM_LIGHTWEIGHT || flag == CCR0, "bad condition register");
  Label success, failure, object_has_monitor, notRecursive;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
@@ -2776,7 +2782,7 @@ void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Registe
  }
#endif

  if (LockingMode != LM_MONITOR) {
  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

@@ -2792,7 +2798,11 @@ void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Registe
  andi_(R0, current_header, markWord::monitor_value);
  bne(CCR0, object_has_monitor);

  if (LockingMode != LM_MONITOR) {
  if (LockingMode == LM_MONITOR) {
    // Set NE to indicate 'failure' -> take slow-path.
    crandc(flag, Assembler::equal, flag, Assembler::equal);
    b(failure);
  } else if (LockingMode == LM_LEGACY) {
    // Check if it is still a light weight lock, this is true if we see
    // the stack address of the basicLock in the markWord of the object.
    // Cmpxchg sets flag to cmpd(current_header, box).
@@ -2808,9 +2818,9 @@ void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Registe
    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
    b(success);
  } else {
    // Set NE to indicate 'failure' -> take slow-path.
    crandc(flag, Assembler::equal, flag, Assembler::equal);
    b(failure);
    assert(LockingMode == LM_LIGHTWEIGHT, "must be");
    fast_unlock(oop, current_header, failure);
    b(success);
  }

  // Handle existing monitor.
@@ -2819,7 +2829,7 @@ void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Registe
  addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
  ld(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);

  // It's inflated.
  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
@@ -2832,15 +2842,20 @@ void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Registe
  }
#endif

  ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);

  // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0.
  // This is handled like owner thread mismatches: We take the slow path.
  cmpd(flag, temp, R16_thread);
  bne(flag, failure);

  ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);

  addic_(displaced_header, displaced_header, -1);
  blt(CCR0, notRecursive); // Not recursive if negative after decrement.
  std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
  b(success); // flag is already EQ here.
  if (flag == CCR0) { // Otherwise, flag is already EQ here.
    crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ
  }
  b(success);

  bind(notRecursive);
  ld(temp, in_bytes(ObjectMonitor::EntryList_offset()), current_header);
@@ -4410,6 +4425,7 @@ void MacroAssembler::pop_cont_fastpath() {
  bind(done);
}

// Note: Must preserve CCR0 EQ (invariant).
void MacroAssembler::inc_held_monitor_count(Register tmp) {
  ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
#ifdef ASSERT

@@ -4418,11 +4434,13 @@ void MacroAssembler::inc_held_monitor_count(Register tmp) {
  bge_predict_taken(CCR0, ok);
  stop("held monitor count is negative at increment");
  bind(ok);
  crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
#endif
  addi(tmp, tmp, 1);
  std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
}

// Note: Must preserve CCR0 EQ (invariant).
void MacroAssembler::dec_held_monitor_count(Register tmp) {
  ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
#ifdef ASSERT

@@ -4431,7 +4449,136 @@ void MacroAssembler::dec_held_monitor_count(Register tmp) {
  bgt_predict_taken(CCR0, ok);
  stop("held monitor count is <= 0 at decrement");
  bind(ok);
  crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
#endif
  addi(tmp, tmp, -1);
  std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
}

// Function to flip between unlocked and locked state (fast locking).
// Branches to failed if the state is not as expected with CCR0 NE.
// Falls through upon success with CCR0 EQ.
// This requires fewer instructions and registers and is easier to use than the
// cmpxchg based implementation.
void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
  assert_different_registers(obj, tmp, R0);
  Label retry;

  if (semantics & MemBarRel) {
    release();
  }

  bind(retry);
  STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
  if (!is_unlock) {
    ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
    xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
    andi_(R0, tmp, markWord::lock_mask_in_place);
    bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0)
  } else {
    ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
    andi_(R0, tmp, markWord::lock_mask_in_place);
    bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0)
    ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
  }
  stdcx_(tmp, obj);
  bne(CCR0, retry);

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }
}
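Reviewer note: in scalar form, atomically_flip_locked_state is a CAS loop that toggles the unlocked bit and fails out when the current state is not the expected one. A sketch (std::atomic stands in for the ldarx/stdcx_ reservation pair; barrier arguments are omitted, and the mark-word constants are assumptions matching the markWord names used above):

    #include <atomic>
    #include <cstdint>

    constexpr uintptr_t unlocked_value = 1;  // assumption: markWord::unlocked_value
    constexpr uintptr_t lock_mask = 3;       // assumption: markWord::lock_mask_in_place

    bool flip_locked_state(std::atomic<uintptr_t>& mark, bool is_unlock) {
      uintptr_t old = mark.load(std::memory_order_relaxed);
      for (;;) {
        // Locking expects the unlocked bit set; unlocking expects locked_value (0).
        if ((old & lock_mask) != (is_unlock ? 0 : unlocked_value))
          return false;                       // state not as expected -> slow path
        uintptr_t nv = old ^ unlocked_value;  // flip the unlocked bit either way
        if (mark.compare_exchange_weak(old, nv))
          return true;                        // stdcx_ succeeded
      }
    }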

// Implements fast-locking.
// Branches to slow upon failure to lock the object, with CCR0 NE.
// Falls through upon success with CCR0 EQ.
//
//  - obj: the object to be locked
//  - hdr: the header, already loaded from obj, will be destroyed
//  - t1: temporary register
void MacroAssembler::fast_lock(Register obj, Register hdr, Register t1, Label& slow) {
  assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
  assert_different_registers(obj, hdr, t1);

  // Check if we would have space on lock-stack for the object.
  lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
  cmplwi(CCR0, t1, LockStack::end_offset() - 1);
  bgt(CCR0, slow);

  // Quick check: Do not reserve cache line for atomic update if not unlocked.
  // (Similar to contention_hint in cmpxchg solutions.)
  xori(R0, hdr, markWord::unlocked_value); // flip unlocked bit
  andi_(R0, R0, markWord::lock_mask_in_place);
  bne(CCR0, slow); // failed if new header doesn't contain locked_value (which is 0)

  // Note: We're not publishing anything (like the displaced header in LM_LEGACY)
  // to other threads at this point. Hence, no release barrier here.
  // (The obj has been written to the BasicObjectLock at obj_offset() within the owning thread's stack.)
  atomically_flip_locked_state(/* is_unlock */ false, obj, hdr, slow, MacroAssembler::MemBarAcq);

  // After a successful lock, push the object on the lock-stack.
  stdx(obj, t1, R16_thread);
  addi(t1, t1, oopSize);
  stw(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
}
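Reviewer note: fast_lock/fast_unlock additionally maintain a small per-thread lock-stack. A sketch of that bookkeeping (hypothetical LockStack type with an illustrative capacity, not the JavaThread layout):

    #include <cstddef>

    struct LockStack {
      void*  entries[8];   // assumption: small fixed capacity
      size_t top = 0;      // mirrors JavaThread::lock_stack_top_offset()

      bool push(void* obj) {
        if (top >= 8) return false;  // no space -> caller takes the slow path
        entries[top++] = obj;        // the stdx plus bump of the top offset
        return true;
      }
      bool pop(void* obj) {
        if (top == 0 || entries[top - 1] != obj)
          return false;              // underflow or non-symmetric unlock -> slow path
        entries[--top] = nullptr;    // the ASSERT build also nulls the slot
        return true;
      }
    };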

// Implements fast-unlocking.
// Branches to slow upon failure, with CCR0 NE.
// Falls through upon success, with CCR0 EQ.
//
//  - obj: the object to be unlocked
//  - hdr: the (pre-loaded) header of the object, will be destroyed
void MacroAssembler::fast_unlock(Register obj, Register hdr, Label& slow) {
  assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
  assert_different_registers(obj, hdr);

#ifdef ASSERT
  {
    // Check that hdr is fast-locked.
    Label hdr_ok;
    andi_(R0, hdr, markWord::lock_mask_in_place);
    beq(CCR0, hdr_ok);
    stop("Header is not fast-locked");
    bind(hdr_ok);
  }
  Register t1 = hdr; // Reuse in debug build.
  {
    // The following checks rely on the fact that LockStack is only ever modified by
    // its owning thread, even if the lock got inflated concurrently; removal of LockStack
    // entries after inflation will be delayed in that case.

    // Check for lock-stack underflow.
    Label stack_ok;
    lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
    cmplwi(CCR0, t1, LockStack::start_offset());
    bgt(CCR0, stack_ok);
    stop("Lock-stack underflow");
    bind(stack_ok);
  }
  {
    // Check if the top of the lock-stack matches the unlocked object.
    Label tos_ok;
    addi(t1, t1, -oopSize);
    ldx(t1, t1, R16_thread);
    cmpd(CCR0, t1, obj);
    beq(CCR0, tos_ok);
    stop("Top of lock-stack does not match the unlocked object");
    bind(tos_ok);
  }
#endif

  // Release the lock.
  atomically_flip_locked_state(/* is_unlock */ true, obj, hdr, slow, MacroAssembler::MemBarRel);

  // After a successful unlock, pop the object from the lock-stack.
  Register t2 = hdr;
  lwz(t2, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
  addi(t2, t2, -oopSize);
#ifdef ASSERT
  li(R0, 0);
  stdx(R0, t2, R16_thread);
#endif
  stw(t2, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
}

@@ -606,6 +606,9 @@ class MacroAssembler: public Assembler {
  void pop_cont_fastpath();
  void inc_held_monitor_count(Register tmp);
  void dec_held_monitor_count(Register tmp);
  void atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics);
  void fast_lock(Register obj, Register hdr, Register t1, Label& slow);
  void fast_unlock(Register obj, Register hdr, Label& slow);

  // allocation (for C1)
  void tlab_allocate(

@@ -12139,7 +12139,7 @@ instruct partialSubtypeCheck(iRegPdst result, iRegP_N2P subklass, iRegP_N2P supe

// inlined locking and unlocking

instruct cmpFastLock(flagsReg crx, iRegPdst oop, iRegPdst box, iRegPdst tmp1, iRegPdst tmp2) %{
instruct cmpFastLock(flagsRegCR0 crx, iRegPdst oop, iRegPdst box, iRegPdst tmp1, iRegPdst tmp2) %{
  match(Set crx (FastLock oop box));
  effect(TEMP tmp1, TEMP tmp2);
  predicate(!Compile::current()->use_rtm());
@@ -12175,7 +12175,7 @@ instruct cmpFastLock_tm(flagsReg crx, iRegPdst oop, rarg2RegP box, iRegPdst tmp1
  ins_pipe(pipe_class_compare);
%}

instruct cmpFastUnlock(flagsReg crx, iRegPdst oop, iRegPdst box, iRegPdst tmp1, iRegPdst tmp2, iRegPdst tmp3) %{
instruct cmpFastUnlock(flagsRegCR0 crx, iRegPdst oop, iRegPdst box, iRegPdst tmp1, iRegPdst tmp2, iRegPdst tmp3) %{
  match(Set crx (FastUnlock oop box));
  effect(TEMP tmp1, TEMP tmp2, TEMP tmp3);
  predicate(!Compile::current()->use_rtm());

@@ -394,6 +394,10 @@ void VM_Version::initialize() {
    // high lock contention. For now we do not use it by default.
    vm_exit_during_initialization("UseRTMLocking flag should be only set on command line");
  }
  if (LockingMode != LM_LEGACY) {
    warning("UseRTMLocking requires LockingMode = 1");
    FLAG_SET_DEFAULT(UseRTMLocking, false);
  }
#else
  // Only C2 does RTM locking optimization.
  vm_exit_during_initialization("RTM locking optimization is not supported in this VM");

@@ -1406,7 +1406,6 @@ enum VectorMask {
  // Vector Floating-Point Sign-Injection Instructions
  INSN(vfsgnjx_vv, 0b1010111, 0b001, 0b001010);
  INSN(vfsgnjn_vv, 0b1010111, 0b001, 0b001001);
  INSN(vfsgnj_vv, 0b1010111, 0b001, 0b001000);

  // Vector Floating-Point MIN/MAX Instructions
  INSN(vfmax_vv, 0b1010111, 0b001, 0b000110);
@@ -1560,11 +1559,6 @@ enum VectorMask {
  INSN(vmfne_vf, 0b1010111, 0b101, 0b011100);
  INSN(vmfeq_vf, 0b1010111, 0b101, 0b011000);

  // Vector Floating-Point Sign-Injection Instructions
  INSN(vfsgnjx_vf, 0b1010111, 0b101, 0b001010);
  INSN(vfsgnjn_vf, 0b1010111, 0b101, 0b001001);
  INSN(vfsgnj_vf, 0b1010111, 0b101, 0b001000);

  // Vector Floating-Point MIN/MAX Instructions
  INSN(vfmax_vf, 0b1010111, 0b101, 0b000110);
  INSN(vfmin_vf, 0b1010111, 0b101, 0b000100);

@@ -1521,26 +1521,39 @@ void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Registe

// Compress char[] array to byte[].
// result: the array length if every element in array can be encoded; 0, otherwise.
void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len, Register result, Register tmp) {
void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
                                              Register result, Register tmp) {
  Label done;
  encode_iso_array_v(src, dst, len, result, tmp);
  encode_iso_array_v(src, dst, len, result, tmp, false);
  beqz(len, done);
  mv(result, zr);
  bind(done);
}

// result: the number of elements that have been encoded.
void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len, Register result, Register tmp) {
  Label loop, DIFFERENCE, DONE;
// Intrinsic for
//
// - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
//     return the number of characters copied.
// - java/lang/StringUTF16.compress
//     return zero (0) if copy fails, otherwise 'len'.
//
// This version always returns the number of characters copied. A successful
// copy will complete with the post-condition: 'res' == 'len', while an
// unsuccessful copy will exit with the post-condition: 0 <= 'res' < 'len'.
//
// Clobbers: src, dst, len, result, t0
void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register len,
                                           Register result, Register tmp, bool ascii) {
  Label loop, fail, done;

  BLOCK_COMMENT("encode_iso_array_v {");
  mv(result, 0);

  bind(loop);
  mv(tmp, 0xff);
  mv(tmp, ascii ? 0x7f : 0xff);
  vsetvli(t0, len, Assembler::e16, Assembler::m2);
  vle16_v(v2, src);
  // if element > 0xff, stop

  vmsgtu_vx(v1, v2, tmp);
  vfirst_m(tmp, v1);
  vmsbf_m(v0, v1);
@ -1549,18 +1562,19 @@ void C2_MacroAssembler::encode_iso_array_v(Register src, Register dst, Register
|
|||
vncvt_x_x_w(v1, v2, Assembler::v0_t);
|
||||
vse8_v(v1, dst, Assembler::v0_t);
|
||||
|
||||
bgez(tmp, DIFFERENCE);
|
||||
// fail if char > 0x7f/0xff
|
||||
bgez(tmp, fail);
|
||||
add(result, result, t0);
|
||||
add(dst, dst, t0);
|
||||
sub(len, len, t0);
|
||||
shadd(src, t0, src, t0, 1);
|
||||
bnez(len, loop);
|
||||
j(DONE);
|
||||
j(done);
|
||||
|
||||
bind(DIFFERENCE);
|
||||
bind(fail);
|
||||
add(result, result, tmp);
|
||||
|
||||
bind(DONE);
|
||||
bind(done);
|
||||
BLOCK_COMMENT("} encode_iso_array_v");
|
||||
}
|
||||
|
||||
|
|
|
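The contract described in the comment block above is easier to see in scalar form. A minimal C++ sketch of what the vectorized stub computes (illustrative names, not HotSpot code):

#include <cstddef>
#include <cstdint>

// Scalar model of encode_iso_array_v: copy 16-bit chars to 8-bit bytes,
// stopping at the first char above the limit (0x7f for ASCII, 0xff for
// ISO-8859-1). Returns the number of characters copied, so a successful
// copy returns len and a failed one returns a value in [0, len).
size_t encode_iso_array_model(const uint16_t* src, uint8_t* dst,
                              size_t len, bool ascii) {
  const uint16_t limit = ascii ? 0x7f : 0xff;
  size_t i = 0;
  for (; i < len; i++) {
    if (src[i] > limit) {
      break;  // post-condition on failure: 0 <= result < len
    }
    dst[i] = (uint8_t)src[i];
  }
  return i;    // post-condition on success: result == len
}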
@@ -134,7 +134,7 @@
 
   void unspill(VectorRegister v, int offset) {
     add(t0, sp, offset);
-    vl1re8_v(v, t0);
+    vl1r_v(v, t0);
   }
 
   void spill_copy_vector_stack_to_stack(int src_offset, int dst_offset, int vector_length_in_bytes) {
@@ -175,7 +175,7 @@
 
   void encode_iso_array_v(Register src, Register dst,
                           Register len, Register result,
-                          Register tmp);
+                          Register tmp, bool ascii);
 
   void count_positives_v(Register ary, Register len,
                          Register result, Register tmp);
@@ -1278,7 +1278,7 @@ int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
   int count = bitset_to_regs(bitset, regs);
 
   for (int i = count - 1; i >= 0; i--) {
-    vl1re8_v(as_VectorRegister(regs[i]), stack);
+    vl1r_v(as_VectorRegister(regs[i]), stack);
     add(stack, stack, vector_size_in_bytes);
   }
 
@@ -1274,6 +1274,10 @@ public:
   }
 
   // vector pseudo instructions
+  inline void vl1r_v(VectorRegister vd, Register rs) {
+    vl1re8_v(vd, rs);
+  }
+
   inline void vmnot_m(VectorRegister vd, VectorRegister vs) {
     vmnand_mm(vd, vs, vs);
   }

@@ -1290,6 +1294,10 @@ public:
     vfsgnjn_vv(vd, vs, vs, vm);
   }
 
+  inline void vfabs_v(VectorRegister vd, VectorRegister vs, VectorMask vm = unmasked) {
+    vfsgnjx_vv(vd, vs, vs, vm);
+  }
+
   inline void vmsgt_vv(VectorRegister vd, VectorRegister vs2, VectorRegister vs1, VectorMask vm = unmasked) {
     vmslt_vv(vd, vs1, vs2, vm);
   }
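The vfabs_v pseudo instruction above works because sign-injection with both source operands equal degenerates to an absolute value: the result sign is sign(x) XOR sign(x) = 0. A scalar C++ model of the trick (illustrative only, mirroring the RISC-V fsgnj semantics):

#include <cstdint>
#include <cstring>

// fsgnjn(x, x) flips the sign bit (negation, as in vfneg_v);
// fsgnjx(x, x) XORs the sign bit with itself, clearing it (vfabs_v).
float fsgnjn(float a, float b) {  // result sign := ~sign(b)
  uint32_t ua, ub, r;
  std::memcpy(&ua, &a, 4);
  std::memcpy(&ub, &b, 4);
  r = (ua & 0x7fffffffu) | (~ub & 0x80000000u);
  float f;
  std::memcpy(&f, &r, 4);
  return f;
}

float fsgnjx(float a, float b) {  // result sign := sign(a) ^ sign(b)
  uint32_t ua, ub, r;
  std::memcpy(&ua, &a, 4);
  std::memcpy(&ub, &b, 4);
  r = ua ^ (ub & 0x80000000u);
  float f;
  std::memcpy(&f, &r, 4);
  return f;
}
// For every x: fsgnjn(x, x) == -x and fsgnjx(x, x) == |x|.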
@@ -159,7 +159,7 @@
   }
 
   // Implements a variant of EncodeISOArrayNode that encode ASCII only
-  static const bool supports_encode_ascii_array = false;
+  static const bool supports_encode_ascii_array = true;
 
   // Some architecture needs a helper to check for alltrue vector
   static constexpr bool vectortest_needs_second_argument(bool is_alltrue, bool is_predicate) {
@@ -624,7 +624,7 @@ class NativeDeoptInstruction: public NativeInstruction {
 
   static bool is_deopt_at(address instr) {
     assert(instr != nullptr, "");
-    uint32_t value = *(uint32_t *) instr;
+    uint32_t value = Assembler::ld_instr(instr);
     // 0xc0201073 encodes CSRRW x0, instret, x0
     return value == 0xc0201073;
   }
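The motivation for replacing the raw pointer cast: with the RISC-V compressed (C) extension, instructions can start on a 2-byte boundary, so a direct 4-byte load may be misaligned. A hedged model of an alignment-safe instruction read (the real Assembler::ld_instr may be implemented differently):

#include <cstdint>
#include <cstring>

// Read a 32-bit instruction word from a possibly 2-byte-aligned pc.
// memcpy compiles to an ordinary load on targets that tolerate
// misalignment and to byte loads elsewhere; either way it is well-defined.
uint32_t ld_instr_model(const unsigned char* pc) {
  uint32_t value;
  std::memcpy(&value, pc, sizeof(value));
  return value;
}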
File diff suppressed because it is too large.
@@ -915,7 +915,10 @@ class StubGenerator: public StubCodeGenerator {
 
       __ vlex_v(v0, src, sew);
       __ sub(cnt, cnt, vl);
-      __ slli(vl, vl, (int)sew);
+      if (sew != Assembler::e8) {
+        // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
+        __ slli(vl, vl, sew);
+      }
       __ add(src, src, vl);
 
       __ vsex_v(v0, dst, sew);

@@ -927,7 +930,10 @@ class StubGenerator: public StubCodeGenerator {
 
     __ bind(loop_backward);
     __ sub(t0, cnt, vl);
-    __ slli(t0, t0, sew);
+    if (sew != Assembler::e8) {
+      // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
+      __ slli(t0, t0, sew);
+    }
     __ add(tmp1, s, t0);
     __ vlex_v(v0, tmp1, sew);
     __ add(tmp2, d, t0);
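The shift in these hunks converts an element count into a byte offset, and the sew encoding is exactly the shift amount: log2 of the element size in bytes (e8 → 0, e16 → 1, e32 → 2, e64 → 3). A small C++ sketch of the arithmetic being guarded (values illustrative):

#include <cassert>
#include <cstddef>

enum Sew { e8 = 0, e16 = 1, e32 = 2, e64 = 3 };

// elements << sew == bytes; when sew == e8 the shift amount is 0,
// so the instruction would be a nop and is skipped entirely.
size_t elements_to_bytes(size_t elements, Sew sew) {
  return elements << sew;
}

int main() {
  assert(elements_to_bytes(16, e8)  == 16);
  assert(elements_to_bytes(16, e16) == 32);
  assert(elements_to_bytes(16, e64) == 128);
}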
@@ -119,7 +119,7 @@ void CardTableBarrierSetAssembler::store_check(MacroAssembler* masm, Register ob
   if (UseCondCardMark) {
     Label L_already_dirty;
     __ cmpb(card_addr, dirty);
-    __ jcc(Assembler::equal, L_already_dirty);
+    __ jccb(Assembler::equal, L_already_dirty);
     __ movb(card_addr, dirty);
     __ bind(L_already_dirty);
   } else {
@@ -4300,6 +4300,125 @@ void MacroAssembler::lookup_interface_method(Register recv_klass,
   }
 }
 
+// Look up the method for a megamorphic invokeinterface call in a single pass over itable:
+// - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICHolder
+// - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
+// The target method is determined by <holder_klass, itable_index>.
+// The receiver klass is in recv_klass.
+// On success, the result will be in method_result, and execution falls through.
+// On failure, execution transfers to the given label.
+void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
+                                                  Register holder_klass,
+                                                  Register resolved_klass,
+                                                  Register method_result,
+                                                  Register scan_temp,
+                                                  Register temp_reg2,
+                                                  Register receiver,
+                                                  int itable_index,
+                                                  Label& L_no_such_interface) {
+  assert_different_registers(recv_klass, method_result, holder_klass, resolved_klass, scan_temp, temp_reg2, receiver);
+  Register temp_itbl_klass = method_result;
+  Register temp_reg = (temp_reg2 == noreg ? recv_klass : temp_reg2); // reuse recv_klass register on 32-bit x86 impl
+
+  int vtable_base = in_bytes(Klass::vtable_start_offset());
+  int itentry_off = in_bytes(itableMethodEntry::method_offset());
+  int scan_step = itableOffsetEntry::size() * wordSize;
+  int vte_size = vtableEntry::size_in_bytes();
+  int ioffset = in_bytes(itableOffsetEntry::interface_offset());
+  int ooffset = in_bytes(itableOffsetEntry::offset_offset());
+  Address::ScaleFactor times_vte_scale = Address::times_ptr;
+  assert(vte_size == wordSize, "adjust times_vte_scale");
+
+  Label L_loop_scan_resolved_entry, L_resolved_found, L_holder_found;
+
+  // temp_itbl_klass = recv_klass.itable[0]
+  // scan_temp = &recv_klass.itable[0] + step
+  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
+  movptr(temp_itbl_klass, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset));
+  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset + scan_step));
+  xorptr(temp_reg, temp_reg);
+
+  // Initial checks:
+  //   - if (holder_klass != resolved_klass), go to "scan for resolved"
+  //   - if (itable[0] == 0), no such interface
+  //   - if (itable[0] == holder_klass), shortcut to "holder found"
+  cmpptr(holder_klass, resolved_klass);
+  jccb(Assembler::notEqual, L_loop_scan_resolved_entry);
+  testptr(temp_itbl_klass, temp_itbl_klass);
+  jccb(Assembler::zero, L_no_such_interface);
+  cmpptr(holder_klass, temp_itbl_klass);
+  jccb(Assembler::equal, L_holder_found);
+
+  // Loop: Look for holder_klass record in itable
+  //   do {
+  //     tmp = itable[index];
+  //     index += step;
+  //     if (tmp == holder_klass) {
+  //       goto L_holder_found; // Found!
+  //     }
+  //   } while (tmp != 0);
+  //   goto L_no_such_interface // Not found.
+  Label L_scan_holder;
+  bind(L_scan_holder);
+  movptr(temp_itbl_klass, Address(scan_temp, 0));
+  addptr(scan_temp, scan_step);
+  cmpptr(holder_klass, temp_itbl_klass);
+  jccb(Assembler::equal, L_holder_found);
+  testptr(temp_itbl_klass, temp_itbl_klass);
+  jccb(Assembler::notZero, L_scan_holder);
+
+  jmpb(L_no_such_interface);
+
+  // Loop: Look for resolved_class record in itable
+  //   do {
+  //     tmp = itable[index];
+  //     index += step;
+  //     if (tmp == holder_klass) {
+  //       // Also check if we have met a holder klass
+  //       holder_tmp = itable[index-step-ioffset];
+  //     }
+  //     if (tmp == resolved_klass) {
+  //       goto L_resolved_found;  // Found!
+  //     }
+  //   } while (tmp != 0);
+  //   goto L_no_such_interface // Not found.
+  //
+  Label L_loop_scan_resolved;
+  bind(L_loop_scan_resolved);
+  movptr(temp_itbl_klass, Address(scan_temp, 0));
+  addptr(scan_temp, scan_step);
+  bind(L_loop_scan_resolved_entry);
+  cmpptr(holder_klass, temp_itbl_klass);
+  cmovl(Assembler::equal, temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
+  cmpptr(resolved_klass, temp_itbl_klass);
+  jccb(Assembler::equal, L_resolved_found);
+  testptr(temp_itbl_klass, temp_itbl_klass);
+  jccb(Assembler::notZero, L_loop_scan_resolved);
+
+  jmpb(L_no_such_interface);
+
+  Label L_ready;
+
+  // See if we already have a holder klass. If not, go and scan for it.
+  bind(L_resolved_found);
+  testptr(temp_reg, temp_reg);
+  jccb(Assembler::zero, L_scan_holder);
+  jmpb(L_ready);
+
+  bind(L_holder_found);
+  movl(temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
+
+  // Finally, temp_reg contains holder_klass vtable offset
+  bind(L_ready);
+  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
+  if (temp_reg2 == noreg) { // recv_klass register is clobbered for 32-bit x86 impl
+    load_klass(scan_temp, receiver, noreg);
+    movptr(method_result, Address(scan_temp, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
+  } else {
+    movptr(method_result, Address(recv_klass, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
+  }
+}
+
 
 // virtual method calling
 void MacroAssembler::lookup_virtual_method(Register recv_klass,
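The assembly above is dense; a hedged C++ rendering of the same single-pass scan may help (structures and field names are simplified stand-ins, not HotSpot types — the real itable interleaves interface/offset entries, modeled here as a null-terminated array):

#include <cstddef>

struct ItableEntry { const void* interface_klass; int vtable_offset; };

// Returns the holder's vtable offset, or -1 for "no such interface".
// One walk must both verify that resolved_klass appears (the receiver
// subtype check) and locate holder_klass (to pick the implementing
// method's vtable offset).
int scan_itable(const ItableEntry* itable,
                const void* resolved_klass, const void* holder_klass) {
  int holder_offset = 0;      // 0 == not seen yet (temp_reg in the stub)
  bool resolved_found = false;
  for (const ItableEntry* e = itable; e->interface_klass != nullptr; e++) {
    if (e->interface_klass == holder_klass)   holder_offset = e->vtable_offset;
    if (e->interface_klass == resolved_klass) resolved_found = true;
    if (resolved_found && holder_offset != 0) return holder_offset;
  }
  return -1;  // L_no_such_interface
}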
@@ -603,6 +603,16 @@ public:
                                Label& no_such_interface,
                                bool return_method = true);
 
+  void lookup_interface_method_stub(Register recv_klass,
+                                    Register holder_klass,
+                                    Register resolved_klass,
+                                    Register method_result,
+                                    Register scan_temp,
+                                    Register temp_reg2,
+                                    Register receiver,
+                                    int itable_index,
+                                    Label& L_no_such_interface);
+
   // virtual method calling
   void lookup_virtual_method(Register recv_klass,
                              RegisterOrConstant vtable_index,
@@ -179,14 +179,16 @@ VtableStub* VtableStubs::create_itable_stub(int itable_index) {
   // rax: CompiledICHolder
   // rcx: Receiver
 
-  // Most registers are in use; we'll use rax, rbx, rsi, rdi
+  // Most registers are in use; we'll use rax, rbx, rcx, rdx, rsi, rdi
   // (If we need to make rsi, rdi callee-save, do a push/pop here.)
   const Register recv_klass_reg = rsi;
   const Register holder_klass_reg = rax; // declaring interface klass (DECC)
-  const Register resolved_klass_reg = rbx; // resolved interface klass (REFC)
-  const Register temp_reg = rdi;
+  const Register resolved_klass_reg = rdi; // resolved interface klass (REFC)
+  const Register temp_reg = rdx;
+  const Register method = rbx;
+  const Register icholder_reg = rax;
+  const Register receiver = rcx;
 
-  const Register icholder_reg = rax;
   __ movptr(resolved_klass_reg, Address(icholder_reg, CompiledICHolder::holder_klass_offset()));
   __ movptr(holder_klass_reg, Address(icholder_reg, CompiledICHolder::holder_metadata_offset()));
 

@@ -198,35 +200,26 @@ VtableStub* VtableStubs::create_itable_stub(int itable_index) {
   __ load_klass(recv_klass_reg, rcx, noreg);
 
   start_pc = __ pc();
+  __ push(rdx); // temp_reg
 
-  // Receiver subtype check against REFC.
-  // Destroys recv_klass_reg value.
-  __ lookup_interface_method(// inputs: rec. class, interface
-                             recv_klass_reg, resolved_klass_reg, noreg,
-                             // outputs: scan temp. reg1, scan temp. reg2
-                             recv_klass_reg, temp_reg,
-                             L_no_such_interface,
-                             /*return_method=*/false);
-
-  const ptrdiff_t typecheckSize = __ pc() - start_pc;
-  start_pc = __ pc();
-
-  // Get selected method from declaring class and itable index
-  const Register method = rbx;
-  __ load_klass(recv_klass_reg, rcx, noreg); // restore recv_klass_reg
-  __ lookup_interface_method(// inputs: rec. class, interface, itable index
-                             recv_klass_reg, holder_klass_reg, itable_index,
-                             // outputs: method, scan temp. reg
-                             method, temp_reg,
-                             L_no_such_interface);
-
+  __ lookup_interface_method_stub(recv_klass_reg,     // input
+                                  holder_klass_reg,   // input
+                                  resolved_klass_reg, // input
+                                  method,             // output
+                                  temp_reg,
+                                  noreg,
+                                  receiver,           // input (x86_32 only: to restore recv_klass value)
+                                  itable_index,
+                                  L_no_such_interface);
   const ptrdiff_t lookupSize = __ pc() - start_pc;
 
   // We expect we need index_dependent_slop extra bytes. Reason:
   // The emitted code in lookup_interface_method changes when itable_index exceeds 31.
   // For windows, a narrow estimate was found to be 104. Other OSes not tested.
   const ptrdiff_t estimate = 104;
-  const ptrdiff_t codesize = typecheckSize + lookupSize + index_dependent_slop;
+  const ptrdiff_t codesize = lookupSize + index_dependent_slop;
   slop_delta = (int)(estimate - codesize);
   slop_bytes += slop_delta;
   assert(slop_delta >= 0, "itable #%d: Code size estimate (%d) for lookup_interface_method too small, required: %d", itable_index, (int)estimate, (int)codesize);

@@ -246,6 +239,7 @@ VtableStub* VtableStubs::create_itable_stub(int itable_index) {
   }
 #endif // ASSERT
 
+  __ pop(rdx);
   address ame_addr = __ pc();
   __ jmp(Address(method, Method::from_compiled_offset()));
 

@@ -255,6 +249,7 @@ VtableStub* VtableStubs::create_itable_stub(int itable_index) {
   // We force resolving of the call site by jumping to the "handle
   // wrong method" stub, and so let the interpreter runtime do all the
   // dirty work.
+  __ pop(rdx);
   __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub()));
 
   masm->flush();
@@ -175,10 +175,12 @@ VtableStub* VtableStubs::create_itable_stub(int itable_index) {
   // (various calling sequences use r[cd]x, r[sd]i, r[89]; stay away from them)
   const Register recv_klass_reg = r10;
   const Register holder_klass_reg = rax; // declaring interface klass (DECC)
-  const Register resolved_klass_reg = rbx; // resolved interface klass (REFC)
+  const Register resolved_klass_reg = r14; // resolved interface klass (REFC)
   const Register temp_reg = r11;
+  const Register temp_reg2 = r13;
+  const Register method = rbx;
+  const Register icholder_reg = rax;
 
-  const Register icholder_reg = rax;
   __ movptr(resolved_klass_reg, Address(icholder_reg, CompiledICHolder::holder_klass_offset()));
   __ movptr(holder_klass_reg, Address(icholder_reg, CompiledICHolder::holder_metadata_offset()));
 

@@ -192,25 +194,16 @@ VtableStub* VtableStubs::create_itable_stub(int itable_index) {
   start_pc = __ pc();
 
-  // Receiver subtype check against REFC.
-  // Destroys recv_klass_reg value.
-  __ lookup_interface_method(// inputs: rec. class, interface
-                             recv_klass_reg, resolved_klass_reg, noreg,
-                             // outputs: scan temp. reg1, scan temp. reg2
-                             recv_klass_reg, temp_reg,
-                             L_no_such_interface,
-                             /*return_method=*/false);
-
-  const ptrdiff_t typecheckSize = __ pc() - start_pc;
-  start_pc = __ pc();
-
-  // Get selected method from declaring class and itable index
-  const Register method = rbx;
-  __ load_klass(recv_klass_reg, j_rarg0, temp_reg); // restore recv_klass_reg
-  __ lookup_interface_method(// inputs: rec. class, interface, itable index
-                             recv_klass_reg, holder_klass_reg, itable_index,
-                             // outputs: method, scan temp. reg
-                             method, temp_reg,
-                             L_no_such_interface);
+  __ lookup_interface_method_stub(recv_klass_reg,     // input
+                                  holder_klass_reg,   // input
+                                  resolved_klass_reg, // input
+                                  method,             // output
+                                  temp_reg,
+                                  temp_reg2,
+                                  noreg,
+                                  itable_index,
+                                  L_no_such_interface);
 
   const ptrdiff_t lookupSize = __ pc() - start_pc;
 

@@ -218,7 +211,7 @@ VtableStub* VtableStubs::create_itable_stub(int itable_index) {
   // The emitted code in lookup_interface_method changes when itable_index exceeds 15.
   // For linux, a very narrow estimate would be 112, but Solaris requires some more space (130).
   const ptrdiff_t estimate = 136;
-  const ptrdiff_t codesize = typecheckSize + lookupSize + index_dependent_slop;
+  const ptrdiff_t codesize = lookupSize + index_dependent_slop;
   slop_delta = (int)(estimate - codesize);
   slop_bytes += slop_delta;
   assert(slop_delta >= 0, "itable #%d: Code size estimate (%d) for lookup_interface_method too small, required: %d", itable_index, (int)estimate, (int)codesize);
@@ -3002,3 +3002,9 @@ bool os::supports_map_sync() {
 }
 
 void os::print_memory_mappings(char* addr, size_t bytes, outputStream* st) {}
+
+#if INCLUDE_JFR
+
+void os::jfr_report_memory_info() {}
+
+#endif // INCLUDE_JFR
@@ -69,6 +69,9 @@
 #include "utilities/events.hpp"
 #include "utilities/growableArray.hpp"
 #include "utilities/vmError.hpp"
+#if INCLUDE_JFR
+#include "jfr/jfrEvents.hpp"
+#endif
 
 // put OS-includes here
 # include <dlfcn.h>

@@ -101,6 +104,7 @@
 #endif
 
 #ifdef __APPLE__
+  #include <mach/task_info.h>
   #include <mach-o/dyld.h>
 #endif
 

@@ -2453,3 +2457,31 @@ bool os::start_debugging(char *buf, int buflen) {
 }
 
 void os::print_memory_mappings(char* addr, size_t bytes, outputStream* st) {}
+
+#if INCLUDE_JFR
+
+void os::jfr_report_memory_info() {
+#ifdef __APPLE__
+  mach_task_basic_info info;
+  mach_msg_type_number_t count = MACH_TASK_BASIC_INFO_COUNT;
+
+  kern_return_t ret = task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &count);
+  if (ret == KERN_SUCCESS) {
+    // Send the RSS JFR event
+    EventResidentSetSize event;
+    event.set_size(info.resident_size);
+    event.set_peak(info.resident_size_max);
+    event.commit();
+  } else {
+    // Log a warning
+    static bool first_warning = true;
+    if (first_warning) {
+      log_warning(jfr)("Error fetching RSS values: task_info failed");
+      first_warning = false;
+    }
+  }
+
+#endif // __APPLE__
+}
+
+#endif // INCLUDE_JFR
@@ -77,6 +77,9 @@
 #include "utilities/macros.hpp"
 #include "utilities/powerOfTwo.hpp"
 #include "utilities/vmError.hpp"
+#if INCLUDE_JFR
+#include "jfr/jfrEvents.hpp"
+#endif
 
 // put OS-includes here
 # include <sys/types.h>

@@ -927,6 +930,15 @@ bool os::create_thread(Thread* thread, ThreadType thr_type,
   }
   assert(is_aligned(stack_size, os::vm_page_size()), "stack_size not aligned");
 
+  // Add an additional page to the stack size to reduce its chances of getting large page aligned
+  // so that the stack does not get backed by a transparent huge page.
+  size_t default_large_page_size = os::Linux::default_large_page_size();
+  if (default_large_page_size != 0 &&
+      stack_size >= default_large_page_size &&
+      is_aligned(stack_size, default_large_page_size)) {
+    stack_size += os::vm_page_size();
+  }
+
   int status = pthread_attr_setstacksize(&attr, stack_size);
   if (status != 0) {
     // pthread_attr_setstacksize() function can fail
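The logic of the hunk above: a thread stack whose size is an exact multiple of the THP size (typically 2M) is likely to end up THP-aligned and thus backed by huge pages, inflating RSS; bumping the size by one small page breaks that alignment. A C++ model of the computation (page sizes are illustrative):

#include <cassert>
#include <cstddef>

size_t adjust_stack_size(size_t stack_size,
                         size_t default_large_page_size,  // e.g. 2M, 0 if none
                         size_t vm_page_size) {           // e.g. 4K
  if (default_large_page_size != 0 &&
      stack_size >= default_large_page_size &&
      stack_size % default_large_page_size == 0) {
    stack_size += vm_page_size;  // no longer a huge-page multiple
  }
  return stack_size;
}

int main() {
  assert(adjust_stack_size(2 * 1024 * 1024, 2 * 1024 * 1024, 4096)
         == 2 * 1024 * 1024 + 4096);
  assert(adjust_stack_size(1 * 1024 * 1024, 2 * 1024 * 1024, 4096)
         == 1 * 1024 * 1024);  // below the THP size: unchanged
}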
@@ -2461,6 +2473,28 @@ void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
   print_sys_devices_cpu_info(st);
 }
 
+#if INCLUDE_JFR
+
+void os::jfr_report_memory_info() {
+  os::Linux::meminfo_t info;
+  if (os::Linux::query_process_memory_info(&info)) {
+    // Send the RSS JFR event
+    EventResidentSetSize event;
+    event.set_size(info.vmrss * K);
+    event.set_peak(info.vmhwm * K);
+    event.commit();
+  } else {
+    // Log a warning
+    static bool first_warning = true;
+    if (first_warning) {
+      log_warning(jfr)("Error fetching RSS values: query_process_memory_info failed");
+      first_warning = false;
+    }
+  }
+}
+
+#endif // INCLUDE_JFR
+
 #if defined(AMD64) || defined(IA32) || defined(X32)
 const char* search_string = "model name";
 #elif defined(M68K)
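On Linux the vmrss/vmhwm values are reported in kB, which is why the code above scales by K. A hedged sketch of where such numbers plausibly come from — reading VmRSS/VmHWM from /proc/self/status; this is illustrative and not necessarily how os::Linux::query_process_memory_info is implemented:

#include <cstdio>

// Read VmRSS (current resident set) and VmHWM (peak) in kB from
// /proc/self/status. Returns false if either field is missing.
bool read_rss_kb(long* vmrss, long* vmhwm) {
  FILE* f = std::fopen("/proc/self/status", "r");
  if (f == nullptr) return false;
  char line[256];
  bool got_rss = false, got_hwm = false;
  while (std::fgets(line, sizeof(line), f) != nullptr) {
    if (std::sscanf(line, "VmRSS: %ld kB", vmrss) == 1) got_rss = true;
    if (std::sscanf(line, "VmHWM: %ld kB", vmhwm) == 1) got_hwm = true;
  }
  std::fclose(f);
  return got_rss && got_hwm;
}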
@@ -3745,8 +3779,11 @@ bool os::Linux::setup_large_page_type(size_t page_size) {
 }
 
 void os::large_page_init() {
-  // 1) Handle the case where we do not want to use huge pages and hence
-  //    there is no need to scan the OS for related info
+  // Always initialize the default large page size even if large pages are not being used.
+  size_t default_large_page_size = scan_default_large_page_size();
+  os::Linux::_default_large_page_size = default_large_page_size;
+
+  // 1) Handle the case where we do not want to use huge pages
   if (!UseLargePages &&
       !UseTransparentHugePages &&
       !UseHugeTLBFS &&

@@ -3764,9 +3801,7 @@ void os::large_page_init() {
     return;
   }
 
-  // 2) Scan OS info
-  size_t default_large_page_size = scan_default_large_page_size();
-  os::Linux::_default_large_page_size = default_large_page_size;
+  // 2) check if large pages are configured
   if (default_large_page_size == 0) {
     // No large pages configured, return.
     warn_no_large_pages_configured();
@@ -78,6 +78,9 @@
 #include "utilities/macros.hpp"
 #include "utilities/vmError.hpp"
 #include "windbghelp.hpp"
+#if INCLUDE_JFR
+#include "jfr/jfrEvents.hpp"
+#endif
 
 #ifdef _DEBUG
 #include <crtdbg.h>
@@ -6022,6 +6025,33 @@ void os::print_memory_mappings(char* addr, size_t bytes, outputStream* st) {
   }
 }
 
+#if INCLUDE_JFR
+
+void os::jfr_report_memory_info() {
+  PROCESS_MEMORY_COUNTERS_EX pmex;
+  ZeroMemory(&pmex, sizeof(PROCESS_MEMORY_COUNTERS_EX));
+  pmex.cb = sizeof(pmex);
+
+  BOOL ret = GetProcessMemoryInfo(GetCurrentProcess(), (PROCESS_MEMORY_COUNTERS*) &pmex, sizeof(pmex));
+  if (ret != 0) {
+    // Send the RSS JFR event
+    EventResidentSetSize event;
+    event.set_size(pmex.WorkingSetSize);
+    event.set_peak(pmex.PeakWorkingSetSize);
+    event.commit();
+  } else {
+    // Log a warning
+    static bool first_warning = true;
+    if (first_warning) {
+      log_warning(jfr)("Error fetching RSS values: GetProcessMemoryInfo failed");
+      first_warning = false;
+    }
+  }
+}
+
+#endif // INCLUDE_JFR
+
 
 // File conventions
 const char* os::file_separator() { return "\\"; }
 const char* os::line_separator() { return "\r\n"; }
@@ -30,10 +30,18 @@
 // (see globals.hpp)
 
 define_pd_global(bool, DontYieldALot, false);
-define_pd_global(intx, ThreadStackSize, 2048); // 0 => use system default
-define_pd_global(intx, VMThreadStackSize, 2048);
 
-define_pd_global(intx, CompilerThreadStackSize, 2048);
+// Set default stack sizes < 2MB so as to prevent stacks from getting
+// large-page aligned and backed by THPs on systems where 2MB is the
+// default huge page size. For non-JavaThreads, glibc may add an additional
+// guard page to the total stack size, so to keep the default sizes same
+// for all the following flags, we set them to 2 pages less than 2MB. On
+// systems where 2MB is the default large page size, 4KB is most commonly
+// the regular page size.
+define_pd_global(intx, ThreadStackSize, 2040); // 0 => use system default
+define_pd_global(intx, VMThreadStackSize, 2040);
 
+define_pd_global(intx, CompilerThreadStackSize, 2040);
 
 define_pd_global(uintx,JVMInvokeMethodSlack, 8192);
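The arithmetic behind 2040 (stack sizes here are in KB): 2MB is 2048 KB, and staying two 4KB pages under it leaves room for the extra guard page glibc may add while keeping the total below the 2MB THP boundary. A one-line check:

#include <cassert>

int main() {
  const long kb = 1024;
  // 2048 KB (2MB) minus two 4KB pages == 2040 KB
  assert(2040 * kb == 2 * 1024 * kb - 2 * 4 * kb);
}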
@@ -184,6 +184,10 @@ public:
     return *this;
   }
 
+  RegSetIterator<RegImpl>& operator=(const RegSetIterator<RegImpl>& mit) {
+    _regs = mit._regs;
+    return *this;
+  }
   bool operator==(const RegSetIterator& rhs) const {
     return _regs.bits() == rhs._regs.bits();
   }

@@ -194,6 +198,10 @@ public:
   RegImpl operator*() {
     return _regs.first();
   }
 
+  AbstractRegSet<RegImpl> remaining() const {
+    return _regs;
+  }
+
 };
 
 template <class RegImpl>
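What the added operator= and remaining() enable: snapshotting the unvisited part of a register set mid-iteration. A simplified C++ stand-in for the HotSpot types (bit tricks mirror first()/bits(); __builtin_ctz is a GCC/Clang builtin; illustrative only):

#include <cstdint>

struct RegSet {
  uint32_t bits = 0;
  int first() const { return __builtin_ctz(bits); }          // lowest register
  RegSet without_first() const { return {bits & (bits - 1)}; } // drop it
};

struct RegSetIterator {
  RegSet regs;
  RegSetIterator& operator=(const RegSetIterator&) = default;  // now copyable
  int operator*() const { return regs.first(); }
  RegSetIterator& operator++() { regs = regs.without_first(); return *this; }
  RegSet remaining() const { return regs; }  // registers not yet visited
};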
@@ -148,6 +148,7 @@ ClassLoaderData::ClassLoaderData(Handle h_class_loader, bool has_class_mirror_ho
   _jmethod_ids(nullptr),
   _deallocate_list(nullptr),
   _next(nullptr),
+  _unloading_next(nullptr),
   _class_loader_klass(nullptr), _name(nullptr), _name_and_id(nullptr) {
 
   if (!h_class_loader.is_null()) {
@@ -153,15 +153,41 @@ class ClassLoaderData : public CHeapObj<mtClass> {
   GrowableArray<Metadata*>* _deallocate_list;
 
   // Support for walking class loader data objects
-  ClassLoaderData* _next; /// Next loader_datas created
+  //
+  // The ClassLoaderDataGraph maintains two lists to keep track of CLDs.
+  //
+  // The first list [_head, _next] is where new CLDs are registered. The CLDs
+  // are only inserted at the _head, and the _next pointers are only rewritten
+  // from unlink_next() which unlinks one unloading CLD by setting _next to
+  // _next->_next. This allows GCs to concurrently walk the list while the CLDs
+  // are being concurrently unlinked.
+  //
+  // The second list [_unloading_head, _unloading_next] is where dead CLDs get
+  // moved to during class unloading. See: ClassLoaderDataGraph::do_unloading().
+  // This list is never modified while other threads are iterating over it.
+  //
+  // After all dead CLDs have been moved to the unloading list, there's a
+  // synchronization point (handshake) to ensure that all threads reading these
+  // CLDs finish their work. This ensures that we don't have a use-after-free
+  // when we later delete the CLDs.
+  //
+  // And finally, when no threads are using the unloading CLDs anymore, we
+  // remove them from the class unloading list and delete them. See:
+  // ClassLoaderDataGraph::purge();
+  ClassLoaderData* _next;
+  ClassLoaderData* _unloading_next;
 
   Klass* _class_loader_klass;
   Symbol* _name;
   Symbol* _name_and_id;
   JFR_ONLY(DEFINE_TRACE_ID_FIELD;)
 
-  void set_next(ClassLoaderData* next) { Atomic::store(&_next, next); }
-  ClassLoaderData* next() const { return Atomic::load(&_next); }
+  void set_next(ClassLoaderData* next);
+  ClassLoaderData* next() const;
+  void unlink_next();
+
+  void set_unloading_next(ClassLoaderData* unloading_next);
+  ClassLoaderData* unloading_next() const;
 
   ClassLoaderData(Handle h_class_loader, bool has_class_mirror_holder);
   ~ClassLoaderData();
@@ -33,6 +33,29 @@
 #include "oops/oopHandle.inline.hpp"
 #include "oops/weakHandle.inline.hpp"
 
+inline void ClassLoaderData::set_next(ClassLoaderData* next) {
+  assert(this->next() == nullptr, "only link once");
+  Atomic::store(&_next, next);
+}
+
+inline ClassLoaderData* ClassLoaderData::next() const {
+  return Atomic::load(&_next);
+}
+
+inline void ClassLoaderData::unlink_next() {
+  assert(next()->is_unloading(), "only remove unloading clds");
+  Atomic::store(&_next, _next->_next);
+}
+
+inline void ClassLoaderData::set_unloading_next(ClassLoaderData* unloading_next) {
+  assert(this->unloading_next() == nullptr, "only link once");
+  _unloading_next = unloading_next;
+}
+
+inline ClassLoaderData* ClassLoaderData::unloading_next() const {
+  return _unloading_next;
+}
+
 inline oop ClassLoaderData::class_loader() const {
   assert(!_unloading, "This oop is not available to unloading class loader data");
   assert(_holder.is_null() || holder_no_keepalive() != nullptr, "This class loader data holder must be alive");
@@ -23,6 +23,7 @@
  */
 
 #include "precompiled.hpp"
+#include "classfile/classLoaderData.inline.hpp"
 #include "classfile/classLoaderDataGraph.inline.hpp"
 #include "classfile/dictionary.hpp"
 #include "classfile/javaClasses.hpp"

@@ -200,9 +201,9 @@ void ClassLoaderDataGraph::walk_metadata_and_clean_metaspaces() {
   clean_deallocate_lists(walk_all_metadata);
 }
 
-// GC root of class loader data created.
+// List head of all class loader data.
 ClassLoaderData* volatile ClassLoaderDataGraph::_head = nullptr;
-ClassLoaderData* ClassLoaderDataGraph::_unloading = nullptr;
+ClassLoaderData* ClassLoaderDataGraph::_unloading_head = nullptr;
 
 bool ClassLoaderDataGraph::_should_clean_deallocate_lists = false;
 bool ClassLoaderDataGraph::_safepoint_cleanup_needed = false;

@@ -268,16 +269,7 @@ inline void assert_is_safepoint_or_gc() {
          "Must be called by safepoint or GC");
 }
 
-void ClassLoaderDataGraph::cld_unloading_do(CLDClosure* cl) {
-  assert_is_safepoint_or_gc();
-  for (ClassLoaderData* cld = _unloading; cld != nullptr; cld = cld->next()) {
-    assert(cld->is_unloading(), "invariant");
-    cl->do_cld(cld);
-  }
-}
-
-// These are functions called by the GC, which require all of the CLDs, including the
-// unloading ones.
+// These are functions called by the GC, which require all of the CLDs, including not yet unlinked CLDs.
 void ClassLoaderDataGraph::cld_do(CLDClosure* cl) {
   assert_is_safepoint_or_gc();
   for (ClassLoaderData* cld = Atomic::load_acquire(&_head); cld != nullptr; cld = cld->next()) {

@@ -430,7 +422,7 @@ void ClassLoaderDataGraph::loaded_classes_do(KlassClosure* klass_closure) {
 
 void ClassLoaderDataGraph::classes_unloading_do(void f(Klass* const)) {
   assert_locked_or_safepoint(ClassLoaderDataGraph_lock);
-  for (ClassLoaderData* cld = _unloading; cld != nullptr; cld = cld->next()) {
+  for (ClassLoaderData* cld = _unloading_head; cld != nullptr; cld = cld->unloading_next()) {
     assert(cld->is_unloading(), "invariant");
     cld->classes_do(f);
   }

@@ -501,37 +493,32 @@ bool ClassLoaderDataGraph::is_valid(ClassLoaderData* loader_data) {
 bool ClassLoaderDataGraph::do_unloading() {
   assert_locked_or_safepoint(ClassLoaderDataGraph_lock);
 
-  ClassLoaderData* data = _head;
   ClassLoaderData* prev = nullptr;
   bool seen_dead_loader = false;
   uint loaders_processed = 0;
   uint loaders_removed = 0;
 
-  data = _head;
-  while (data != nullptr) {
+  for (ClassLoaderData* data = _head; data != nullptr; data = data->next()) {
     if (data->is_alive()) {
       prev = data;
-      data = data->next();
       loaders_processed++;
       continue;
     }
-    seen_dead_loader = true;
-    loaders_removed++;
-    ClassLoaderData* dead = data;
-    dead->unload();
-    data = data->next();
-    // Remove from loader list.
-    // This class loader data will no longer be found
-    // in the ClassLoaderDataGraph.
+    // Found dead CLD.
+    loaders_removed++;
+    seen_dead_loader = true;
+    data->unload();
+
+    // Move dead CLD to unloading list.
     if (prev != nullptr) {
-      prev->set_next(data);
+      prev->unlink_next();
     } else {
-      assert(dead == _head, "sanity check");
+      assert(data == _head, "sanity check");
       // The GC might be walking this concurrently
-      Atomic::store(&_head, data);
+      Atomic::store(&_head, data->next());
     }
-    dead->set_next(_unloading);
-    _unloading = dead;
+    data->set_unloading_next(_unloading_head);
+    _unloading_head = data;
   }
 
   log_debug(class, loader, data)("do_unloading: loaders processed %u, loaders removed %u", loaders_processed, loaders_removed);

@@ -563,13 +550,13 @@ void ClassLoaderDataGraph::clean_module_and_package_info() {
 }
 
 void ClassLoaderDataGraph::purge(bool at_safepoint) {
-  ClassLoaderData* list = _unloading;
-  _unloading = nullptr;
+  ClassLoaderData* list = _unloading_head;
+  _unloading_head = nullptr;
   ClassLoaderData* next = list;
   bool classes_unloaded = false;
   while (next != nullptr) {
    ClassLoaderData* purge_me = next;
-    next = purge_me->next();
+    next = purge_me->unloading_next();
     delete purge_me;
    classes_unloaded = true;
  }
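The key invariant behind unlink_next() above: a concurrent reader that already holds a pointer to a dead CLD can still follow its _next field safely, because unlinking only rewrites the predecessor's pointer and deletion is deferred past a handshake. A generic C++ model of the protocol (illustrative, not HotSpot code):

#include <atomic>

struct Node {
  std::atomic<Node*> next{nullptr};
  bool alive = true;
};

// Writers remove a dead node by rewriting the predecessor's next pointer;
// readers that already hold 'dead' still see its unchanged next field.
void unlink_dead(std::atomic<Node*>& head, Node* prev, Node* dead) {
  Node* after = dead->next.load(std::memory_order_acquire);
  if (prev != nullptr) {
    prev->next.store(after, std::memory_order_release);  // unlink_next()
  } else {
    head.store(after, std::memory_order_release);        // dead was the head
  }
  // NOTE: 'dead' is NOT freed here; it moves to the unloading list and is
  // only deleted once no concurrent reader can still be traversing it.
}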
@@ -41,9 +41,10 @@ class ClassLoaderDataGraph : public AllStatic {
   friend class ClassLoaderDataGraphIteratorBase;
   friend class VMStructs;
  private:
-  // All CLDs (except the null CLD) can be reached by walking _head->_next->...
+  // All CLDs (except unlinked CLDs) can be reached by walking _head->_next->...
   static ClassLoaderData* volatile _head;
-  static ClassLoaderData* _unloading;
+  // All unlinked CLDs
+  static ClassLoaderData* _unloading_head;
 
   // Set if there's anything to purge in the deallocate lists or previous versions
   // during a safepoint after class unloading in a full GC.

@@ -67,9 +68,8 @@ class ClassLoaderDataGraph : public AllStatic {
   static void clear_claimed_marks();
   static void clear_claimed_marks(int claim);
   static void verify_claimed_marks_cleared(int claim);
-  // Iteration through CLDG inside a safepoint; GC support
+  // Iteration through CLDG; GC support
   static void cld_do(CLDClosure* cl);
-  static void cld_unloading_do(CLDClosure* cl);
   static void roots_cld_do(CLDClosure* strong, CLDClosure* weak);
   static void always_strong_cld_do(CLDClosure* cl);
   // Iteration through CLDG not by GC.
@@ -118,6 +118,8 @@
   template(java_lang_StringBuilder, "java/lang/StringBuilder") \
   template(java_lang_CharSequence, "java/lang/CharSequence") \
   template(java_lang_SecurityManager, "java/lang/SecurityManager") \
+  template(java_lang_ScopedValue, "java/lang/ScopedValue") \
+  template(java_lang_ScopedValue_Carrier, "java/lang/ScopedValue$Carrier") \
   template(java_security_AccessControlContext, "java/security/AccessControlContext") \
   template(java_security_AccessController, "java/security/AccessController") \
   template(executePrivileged_name, "executePrivileged") \

@@ -157,8 +159,6 @@
   template(jdk_internal_loader_BuiltinClassLoader, "jdk/internal/loader/BuiltinClassLoader") \
   template(jdk_internal_loader_ClassLoaders_AppClassLoader, "jdk/internal/loader/ClassLoaders$AppClassLoader") \
   template(jdk_internal_loader_ClassLoaders_PlatformClassLoader, "jdk/internal/loader/ClassLoaders$PlatformClassLoader") \
-  template(jdk_incubator_concurrent_ScopedValue, "jdk/incubator/concurrent/ScopedValue") \
-  template(jdk_incubator_concurrent_ScopedValue_Carrier, "jdk/incubator/concurrent/ScopedValue$Carrier") \
   \
   /* Java runtime version access */ \
   template(java_lang_VersionProps, "java/lang/VersionProps") \
@@ -545,7 +545,7 @@ bool CompilerConfig::check_args_consistency(bool status) {
     FLAG_SET_DEFAULT(SegmentedCodeCache, false);
   }
 #if INCLUDE_JVMCI
-    if (EnableJVMCI) {
+    if (EnableJVMCI || UseJVMCICompiler) {
       if (!FLAG_IS_DEFAULT(EnableJVMCI) || !FLAG_IS_DEFAULT(UseJVMCICompiler)) {
         warning("JVMCI Compiler disabled due to -Xint.");
       }
@@ -101,7 +101,6 @@ public:
 #ifdef ASSERT
   bool is_default() const { return type() == NotInCSet; }
   bool is_valid() const { return (type() >= Optional && type() < Num); }
-  bool is_valid_gen() const { return (type() >= Young && type() <= Old); }
 #endif
 };
 
@@ -68,6 +68,10 @@ VM_GC_Operation::~VM_GC_Operation() {
   ch->soft_ref_policy()->set_all_soft_refs_clear(false);
 }
 
+const char* VM_GC_Operation::cause() const {
+  return GCCause::to_string(_gc_cause);
+}
+
 // The same dtrace probe can't be inserted in two different files, so we
 // have to call it here, so it's only in one file. Can't create new probes
 // for the other file anymore. The dtrace probes have to remain stable.

@@ -137,6 +137,8 @@ class VM_GC_Operation: public VM_GC_Sync_Operation {
   }
   ~VM_GC_Operation();
 
+  virtual const char* cause() const;
+
   // Acquire the Heap_lock and determine if this VM operation should be executed
   // (i.e. not skipped). Return this result, and also store it in _prologue_succeeded.
   virtual bool doit_prologue();
@@ -278,12 +278,7 @@ HeapWord* GenCollectedHeap::expand_heap_and_allocate(size_t size, bool is_tlab
 }
 
 HeapWord* GenCollectedHeap::mem_allocate_work(size_t size,
-                                              bool is_tlab,
-                                              bool* gc_overhead_limit_was_exceeded) {
-  // In general gc_overhead_limit_was_exceeded should be false so
-  // set it so here and reset it to true only if the gc time
-  // limit is being exceeded as checked below.
-  *gc_overhead_limit_was_exceeded = false;
+                                              bool is_tlab) {
 
   HeapWord* result = nullptr;
 

@@ -365,23 +360,6 @@ HeapWord* GenCollectedHeap::mem_allocate_work(size_t size,
       continue;  // Retry and/or stall as necessary.
     }
 
-    // Allocation has failed and a collection
-    // has been done. If the gc time limit was exceeded the
-    // this time, return null so that an out-of-memory
-    // will be thrown. Clear gc_overhead_limit_exceeded
-    // so that the overhead exceeded does not persist.
-
-    const bool limit_exceeded = size_policy()->gc_overhead_limit_exceeded();
-    const bool softrefs_clear = soft_ref_policy()->all_soft_refs_clear();
-
-    if (limit_exceeded && softrefs_clear) {
-      *gc_overhead_limit_was_exceeded = true;
-      size_policy()->set_gc_overhead_limit_exceeded(false);
-      if (op.result() != nullptr) {
-        CollectedHeap::fill_with_object(op.result(), size);
-      }
-      return nullptr;
-    }
     assert(result == nullptr || is_in_reserved(result),
            "result not in heap");
     return result;

@@ -418,8 +396,7 @@ HeapWord* GenCollectedHeap::attempt_allocation(size_t size,
 HeapWord* GenCollectedHeap::mem_allocate(size_t size,
                                          bool* gc_overhead_limit_was_exceeded) {
   return mem_allocate_work(size,
-                           false /* is_tlab */,
-                           gc_overhead_limit_was_exceeded);
+                           false /* is_tlab */);
 }
 
 bool GenCollectedHeap::must_clear_all_soft_refs() {

@@ -935,10 +912,8 @@ size_t GenCollectedHeap::unsafe_max_tlab_alloc(Thread* thr) const {
 HeapWord* GenCollectedHeap::allocate_new_tlab(size_t min_size,
                                               size_t requested_size,
                                               size_t* actual_size) {
-  bool gc_overhead_limit_was_exceeded;
   HeapWord* result = mem_allocate_work(requested_size /* size */,
-                                       true /* is_tlab */,
-                                       &gc_overhead_limit_was_exceeded);
+                                       true /* is_tlab */);
   if (result != nullptr) {
     *actual_size = requested_size;
   }
@@ -376,8 +376,7 @@ private:
   HeapWord* expand_heap_and_allocate(size_t size, bool is_tlab);
 
   HeapWord* mem_allocate_work(size_t size,
-                              bool is_tlab,
-                              bool* gc_overhead_limit_was_exceeded);
+                              bool is_tlab);
 
 #if INCLUDE_SERIALGC
   // For use by mark-sweep. As implemented, mark-sweep-compact is global
@@ -213,9 +213,11 @@ void ThreadLocalAllocBuffer::initialize() {
   set_desired_size(initial_desired_size());
 
   size_t capacity = Universe::heap()->tlab_capacity(thread()) / HeapWordSize;
-  // Keep alloc_frac as float and not double to avoid the double to float conversion
-  float alloc_frac = desired_size() * target_refills() / (float) capacity;
-  _allocation_fraction.sample(alloc_frac);
+  if (capacity > 0) {
+    // Keep alloc_frac as float and not double to avoid the double to float conversion
+    float alloc_frac = desired_size() * target_refills() / (float)capacity;
+    _allocation_fraction.sample(alloc_frac);
+  }
 
   set_refill_waste_limit(initial_refill_waste_limit());
 
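The guard above matters because the fraction desired_size * target_refills / capacity divides by the heap's reported TLAB capacity, which can legitimately be zero early in initialization. A small C++ model of the fixed computation:

#include <cassert>
#include <cstddef>

// Fraction of the TLAB-capable heap this thread expects to allocate.
// When capacity == 0 the old code divided by zero; the fix skips the sample.
float alloc_fraction(size_t desired_size, unsigned target_refills,
                     size_t capacity) {
  if (capacity == 0) {
    return 0.0f;  // no sample taken, mirroring the guarded code
  }
  return desired_size * target_refills / (float)capacity;
}

int main() {
  assert(alloc_fraction(1024, 50, 0) == 0.0f);            // no division by zero
  assert(alloc_fraction(1024, 50, 1024 * 100) == 0.5f);   // normal case
}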
@@ -593,6 +593,7 @@ void ShenandoahHeap::print_on(outputStream* st) const {
   MetaspaceUtils::print_on(st);
 
   if (Verbose) {
+    st->cr();
     print_heap_regions_on(st);
   }
 }

@@ -1021,10 +1022,13 @@ void ShenandoahHeap::trash_cset_regions() {
 
 void ShenandoahHeap::print_heap_regions_on(outputStream* st) const {
   st->print_cr("Heap Regions:");
-  st->print_cr("EU=empty-uncommitted, EC=empty-committed, R=regular, H=humongous start, HC=humongous continuation, CS=collection set, T=trash, P=pinned");
-  st->print_cr("BTE=bottom/top/end, U=used, T=TLAB allocs, G=GCLAB allocs, S=shared allocs, L=live data");
-  st->print_cr("R=root, CP=critical pins, TAMS=top-at-mark-start, UWM=update watermark");
-  st->print_cr("SN=alloc sequence number");
+  st->print_cr("Region state: EU=empty-uncommitted, EC=empty-committed, R=regular, H=humongous start, HP=pinned humongous start");
+  st->print_cr("              HC=humongous continuation, CS=collection set, TR=trash, P=pinned, CSP=pinned collection set");
+  st->print_cr("BTE=bottom/top/end, TAMS=top-at-mark-start");
+  st->print_cr("UWM=update watermark, U=used");
+  st->print_cr("T=TLAB allocs, G=GCLAB allocs");
+  st->print_cr("S=shared allocs, L=live data");
+  st->print_cr("CP=critical pins");
 
   for (size_t i = 0; i < num_regions(); i++) {
     get_region(i)->print_on(st);

@@ -2115,6 +2119,7 @@ void ShenandoahHeap::rebuild_free_set(bool concurrent) {
 
 void ShenandoahHeap::print_extended_on(outputStream *st) const {
   print_on(st);
+  st->cr();
   print_heap_regions_on(st);
 }
 

@@ -351,7 +351,7 @@ void ShenandoahHeapRegion::print_on(outputStream* st) const {
       st->print("|CS ");
       break;
     case _trash:
-      st->print("|T ");
+      st->print("|TR ");
      break;
    case _pinned:
      st->print("|P ");
@@ -398,14 +398,20 @@ const char* ZGeneration::phase_to_string() const {
 
 class VM_ZOperation : public VM_Operation {
 private:
-  const uint _gc_id;
-  bool _success;
+  const uint           _gc_id;
+  const GCCause::Cause _gc_cause;
+  bool                 _success;
 
 public:
-  VM_ZOperation()
+  VM_ZOperation(GCCause::Cause gc_cause)
     : _gc_id(GCId::current()),
+      _gc_cause(gc_cause),
       _success(false) {}
 
+  virtual const char* cause() const {
+    return GCCause::to_string(_gc_cause);
+  }
+
   virtual bool block_jni_critical() const {
     // Blocking JNI critical regions is needed in operations where we change
     // the bad mask or move objects. Changing the bad mask will invalidate all

@@ -558,6 +564,9 @@ void ZGenerationYoung::collect(ZYoungType type, ConcurrentGCTimer* timer) {
 
 class VM_ZMarkStartYoungAndOld : public VM_ZOperation {
 public:
+  VM_ZMarkStartYoungAndOld()
+    : VM_ZOperation(ZDriver::major()->gc_cause()) {}
+
   virtual VMOp_Type type() const {
     return VMOp_ZMarkStartYoungAndOld;
   }

@@ -578,7 +587,22 @@ public:
   }
 };
 
-class VM_ZMarkStartYoung : public VM_ZOperation {
+class VM_ZYoungOperation : public VM_ZOperation {
+private:
+  static ZDriver* driver() {
+    if (ZGeneration::young()->type() == ZYoungType::minor) {
+      return ZDriver::minor();
+    } else {
+      return ZDriver::major();
+    }
+  }
+
+public:
+  VM_ZYoungOperation()
+    : VM_ZOperation(driver()->gc_cause()) {}
+};
+
+class VM_ZMarkStartYoung : public VM_ZYoungOperation {
 public:
   virtual VMOp_Type type() const {
     return VMOp_ZMarkStartYoung;

@@ -626,7 +650,7 @@ void ZGenerationYoung::concurrent_mark() {
   mark_follow();
 }
 
-class VM_ZMarkEndYoung : public VM_ZOperation {
+class VM_ZMarkEndYoung : public VM_ZYoungOperation {
 public:
   virtual VMOp_Type type() const {
     return VMOp_ZMarkEndYoung;

@@ -785,7 +809,8 @@ void ZGenerationYoung::concurrent_select_relocation_set() {
   select_relocation_set(_id, promote_all);
 }
 
-class VM_ZRelocateStartYoung : public VM_ZOperation {
+class VM_ZRelocateStartYoung : public VM_ZYoungOperation {
+
 public:
   virtual VMOp_Type type() const {
     return VMOp_ZRelocateStartYoung;

@@ -1047,6 +1072,9 @@ void ZGenerationOld::concurrent_mark() {
 
 class VM_ZMarkEndOld : public VM_ZOperation {
 public:
+  VM_ZMarkEndOld()
+    : VM_ZOperation(ZDriver::major()->gc_cause()) {}
+
   virtual VMOp_Type type() const {
     return VMOp_ZMarkEndOld;
   }

@@ -1125,6 +1153,9 @@ void ZGenerationOld::concurrent_select_relocation_set() {
 
 class VM_ZRelocateStartOld : public VM_ZOperation {
 public:
+  VM_ZRelocateStartOld()
+    : VM_ZOperation(ZDriver::major()->gc_cause()) {}
+
   virtual VMOp_Type type() const {
     return VMOp_ZRelocateStartOld;
   }
@@ -375,16 +375,17 @@ void ZMark::follow_array_object(objArrayOop obj, bool finalizable) {
 
 void ZMark::follow_object(oop obj, bool finalizable) {
   if (_generation->is_old()) {
-    if (ZHeap::heap()->is_old(to_zaddress(obj))) {
-      if (finalizable) {
-        ZMarkBarrierFollowOopClosure<true /* finalizable */, ZGenerationIdOptional::old> cl;
-        ZIterator::oop_iterate(obj, &cl);
-      } else {
-        ZMarkBarrierFollowOopClosure<false /* finalizable */, ZGenerationIdOptional::old> cl;
-        ZIterator::oop_iterate(obj, &cl);
-      }
+    assert(ZHeap::heap()->is_old(to_zaddress(obj)), "Should only follow objects from old gen");
+    if (obj->is_stackChunk()) {
+      // No support for tracing through stack chunks as finalizably reachable
+      ZMarkBarrierFollowOopClosure<false /* finalizable */, ZGenerationIdOptional::old> cl;
+      ZIterator::oop_iterate(obj, &cl);
+    } else if (finalizable) {
+      ZMarkBarrierFollowOopClosure<true /* finalizable */, ZGenerationIdOptional::old> cl;
+      ZIterator::oop_iterate(obj, &cl);
     } else {
-      fatal("Catch me!");
+      ZMarkBarrierFollowOopClosure<false /* finalizable */, ZGenerationIdOptional::old> cl;
+      ZIterator::oop_iterate(obj, &cl);
     }
   } else {
     // Young gen must help out with old marking
@@ -276,6 +276,13 @@ void ZPhysicalMemoryManager::try_enable_uncommit(size_t min_capacity, size_t max
 }
 
 void ZPhysicalMemoryManager::nmt_commit(zoffset offset, size_t size) const {
+  // NMT expects a 1-to-1 mapping between virtual and physical memory.
+  // ZGC can temporarily have multiple virtual addresses pointing to
+  // the same physical memory.
+  //
+  // When this function is called we don't know where in the virtual memory
+  // this physical memory will be mapped. So we fake that the virtual memory
+  // address is the heap base + the given offset.
   const zaddress addr = ZOffset::address(offset);
   MemTracker::record_virtual_memory_commit((void*)untype(addr), size, CALLER_PC);
 }

@@ -320,6 +327,11 @@ bool ZPhysicalMemoryManager::commit(ZPhysicalMemory& pmem) {
 
     // Commit segment
     const size_t committed = _backing.commit(segment.start(), segment.size());
+
+    // Register with NMT
+    nmt_commit(segment.start(), committed);
+
+    // Register committed segment
     if (!pmem.commit_segment(i, committed)) {
       // Failed or partially failed
       return false;

@@ -341,6 +353,11 @@ bool ZPhysicalMemoryManager::uncommit(ZPhysicalMemory& pmem) {
 
     // Uncommit segment
    const size_t uncommitted = _backing.uncommit(segment.start(), segment.size());
+
+    // Unregister with NMT
+    nmt_uncommit(segment.start(), uncommitted);
+
+    // Deregister uncommitted segment
     if (!pmem.uncommit_segment(i, uncommitted)) {
       // Failed or partially failed
       return false;

@@ -351,12 +368,16 @@ bool ZPhysicalMemoryManager::uncommit(ZPhysicalMemory& pmem) {
   return true;
 }
 
-void ZPhysicalMemoryManager::pretouch_view(zaddress addr, size_t size) const {
+void ZPhysicalMemoryManager::pretouch(zoffset offset, size_t size) const {
+  const uintptr_t addr = untype(ZOffset::address(offset));
   const size_t page_size = ZLargePages::is_explicit() ? ZGranuleSize : os::vm_page_size();
-  os::pretouch_memory((void*)untype(addr), (void*)(untype(addr) + size), page_size);
+  os::pretouch_memory((void*)addr, (void*)(addr + size), page_size);
 }
 
-void ZPhysicalMemoryManager::map_view(zaddress_unsafe addr, const ZPhysicalMemory& pmem) const {
+// Map virtual memory to physical memory
+void ZPhysicalMemoryManager::map(zoffset offset, const ZPhysicalMemory& pmem) const {
+  const zaddress_unsafe addr = ZOffset::address_unsafe(offset);
+
   size_t size = 0;
 
   // Map segments

@@ -375,27 +396,9 @@ void ZPhysicalMemoryManager::map_view(zaddress_unsafe addr, const ZPhysicalMemor
   }
 }
 
-void ZPhysicalMemoryManager::unmap_view(zaddress_unsafe addr, size_t size) const {
-  _backing.unmap(addr, size);
-}
-
-void ZPhysicalMemoryManager::pretouch(zoffset offset, size_t size) const {
-  // Pre-touch all views
-  pretouch_view(ZOffset::address(offset), size);
-}
-
-void ZPhysicalMemoryManager::map(zoffset offset, const ZPhysicalMemory& pmem) const {
-  const size_t size = pmem.size();
-
-  // Map all views
-  map_view(ZOffset::address_unsafe(offset), pmem);
-
-  nmt_commit(offset, size);
-}
-
-void ZPhysicalMemoryManager::unmap(zoffset offset, size_t size) const {
-  nmt_uncommit(offset, size);
-
-  // Unmap all views
-  unmap_view(ZOffset::address_unsafe(offset), size);
-}
+// Unmap virtual memory from physical memory
+void ZPhysicalMemoryManager::unmap(zoffset offset, size_t size) const {
+  const zaddress_unsafe addr = ZOffset::address_unsafe(offset);
+
+  _backing.unmap(addr, size);
+}
@@ -1164,6 +1164,12 @@ JVM_VirtualThreadHideFrames(JNIEnv* env, jobject vthread, jboolean hide);
 JNIEXPORT jint JNICALL
 JVM_GetClassFileVersion(JNIEnv *env, jclass current);
 
+/*
+ * Return JNI_TRUE if warnings are printed when agents are dynamically loaded.
+ */
+JNIEXPORT jboolean JNICALL
+JVM_PrintWarningAtDynamicAgentLoad(void);
+
 /*
  * This structure is used by the launcher to get the default thread
  * stack size from the VM using JNI_GetDefaultJavaVMInitArgs() with a
@@ -73,6 +73,11 @@
     <Field type="string" name="name" label="Name" />
   </Event>
 
+  <Event name="ResidentSetSize" category="Java Virtual Machine, Memory" label="Resident Set Size" description="Resident set size of the process" thread="false" period="everyChunk">
+    <Field type="ulong" contentType="bytes" name="size" label="Resident Set Size" description="Resident set size of the process" />
+    <Field type="ulong" contentType="bytes" name="peak" label="Resident Set Size Peak Value" description="Resident set size peak value of the process" />
+  </Event>
+
   <!-- Ordinary and experimental events !-->
 
   <Event name="ThreadStart" category="Java Application" label="Java Thread Start" thread="true" startTime="false" stackTrace="true">
@@ -95,6 +95,10 @@ PeriodicType JfrPeriodicEventSet::type(void) {
   return _type;
 }
 
+TRACE_REQUEST_FUNC(ResidentSetSize) {
+  os::jfr_report_memory_info();
+}
+
 TRACE_REQUEST_FUNC(JVMInformation) {
   ResourceMark rm;
   EventJVMInformation event;
@@ -1966,7 +1966,12 @@ Method* JVMCIRuntime::get_method_by_index(const constantPoolHandle& cpool,
// ------------------------------------------------------------------
// Check for changes to the system dictionary during compilation
// class loads, evolution, breakpoints
JVMCI::CodeInstallResult JVMCIRuntime::validate_compile_task_dependencies(Dependencies* dependencies, JVMCICompileState* compile_state, char** failure_detail) {
JVMCI::CodeInstallResult JVMCIRuntime::validate_compile_task_dependencies(Dependencies* dependencies,
                                                                          JVMCICompileState* compile_state,
                                                                          char** failure_detail,
                                                                          bool& failing_dep_is_call_site)
{
  failing_dep_is_call_site = false;
  // If JVMTI capabilities were enabled during compile, the compilation is invalidated.
  if (compile_state != nullptr && compile_state->jvmti_state_changed()) {
    *failure_detail = (char*) "Jvmti state change during compilation invalidated dependencies";
@@ -1975,10 +1980,13 @@ JVMCI::CodeInstallResult JVMCIRuntime::validate_compile_task_dependencies(Depend

  CompileTask* task = compile_state == nullptr ? nullptr : compile_state->task();
  Dependencies::DepType result = dependencies->validate_dependencies(task, failure_detail);

  if (result == Dependencies::end_marker) {
    return JVMCI::ok;
  }

  if (result == Dependencies::call_site_target_value) {
    failing_dep_is_call_site = true;
  }
  return JVMCI::dependencies_failed;
}

@@ -2167,11 +2175,13 @@ JVMCI::CodeInstallResult JVMCIRuntime::register_method(JVMCIEnv* JVMCIENV,
  }

  // Check for {class loads, evolution, breakpoints} during compilation
  result = validate_compile_task_dependencies(dependencies, JVMCIENV->compile_state(), &failure_detail);
  JVMCICompileState* compile_state = JVMCIENV->compile_state();
  bool failing_dep_is_call_site;
  result = validate_compile_task_dependencies(dependencies, compile_state, &failure_detail, failing_dep_is_call_site);
  if (result != JVMCI::ok) {
    // While not a true deoptimization, it is a preemptive decompile.
    MethodData* mdp = method()->method_data();
    if (mdp != nullptr) {
    if (mdp != nullptr && !failing_dep_is_call_site) {
      mdp->inc_decompile_count();
#ifdef ASSERT
      if (mdp->decompile_count() > (uint)PerMethodRecompilationCutoff) {

@@ -428,7 +428,10 @@ class JVMCIRuntime: public CHeapObj<mtJVMCI> {

  // Helper routine for determining the validity of a compilation
  // with respect to concurrent class loading.
  static JVMCI::CodeInstallResult validate_compile_task_dependencies(Dependencies* target, JVMCICompileState* task, char** failure_detail);
  static JVMCI::CodeInstallResult validate_compile_task_dependencies(Dependencies* target,
                                                                     JVMCICompileState* task,
                                                                     char** failure_detail,
                                                                     bool& failing_dep_is_call_site);

  // Compiles `target` with the JVMCI compiler.
  void compile_method(JVMCIEnv* JVMCIENV, JVMCICompiler* compiler, const methodHandle& target, int entry_bci);

@@ -71,6 +71,7 @@ bool JVMCIGlobals::check_jvmci_flags_are_consistent() {
  JVMCI_FLAG_CHECKED(UseJVMCICompiler)
  JVMCI_FLAG_CHECKED(EnableJVMCI)
  JVMCI_FLAG_CHECKED(EnableJVMCIProduct)
  JVMCI_FLAG_CHECKED(UseGraalJIT)

  CHECK_NOT_SET(BootstrapJVMCI, UseJVMCICompiler)
  CHECK_NOT_SET(PrintBootstrap, UseJVMCICompiler)
@@ -164,7 +165,7 @@ bool JVMCIGlobals::check_jvmci_flags_are_consistent() {
}

// Convert JVMCI flags from experimental to product
bool JVMCIGlobals::enable_jvmci_product_mode(JVMFlagOrigin origin) {
bool JVMCIGlobals::enable_jvmci_product_mode(JVMFlagOrigin origin, bool use_graal_jit) {
  const char *JVMCIFlags[] = {
    "EnableJVMCI",
    "EnableJVMCIProduct",
@@ -201,6 +202,12 @@ bool JVMCIGlobals::enable_jvmci_product_mode(JVMFlagOrigin origin) {
  if (JVMFlagAccess::set_bool(jvmciEnableFlag, &value, origin) != JVMFlag::SUCCESS) {
    return false;
  }
  if (use_graal_jit) {
    JVMFlag *useGraalJITFlag = JVMFlag::find_flag("UseGraalJIT");
    if (JVMFlagAccess::set_bool(useGraalJITFlag, &value, origin) != JVMFlag::SUCCESS) {
      return false;
    }
  }

  // Effect of EnableJVMCIProduct on changing defaults of EnableJVMCI
  // and UseJVMCICompiler is deferred to check_jvmci_flags_are_consistent

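Taken together with the flag declaration in the next hunk, UseGraalJIT is sugar: when it is set, enable_jvmci_product_mode() force-sets the UseGraalJIT flag alongside the other JVMCI product flags. Since the flag is declared EXPERIMENTAL, a launch line presumably needs the usual unlock switch, e.g. "java -XX:+UnlockExperimentalVMOptions -XX:+UseGraalJIT ..." which, per the flag's own description below, is equivalent to -XX:+EnableJVMCIProduct -Djvmci.Compiler=graal.
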
@@ -48,6 +48,11 @@ class fileStream;
  product(bool, EnableJVMCI, false, EXPERIMENTAL,                           \
          "Enable JVMCI")                                                   \
                                                                            \
  product(bool, UseGraalJIT, false, EXPERIMENTAL,                           \
          "Select the Graal JVMCI compiler. This is an alias for: "         \
          " -XX:+EnableJVMCIProduct "                                       \
          " -Djvmci.Compiler=graal ")                                       \
                                                                            \
  product(bool, EnableJVMCIProduct, false, EXPERIMENTAL,                    \
          "Allow JVMCI to be used in product mode. This alters a subset of "\
          "JVMCI flags to be non-experimental, defaults UseJVMCICompiler "  \
@@ -185,7 +190,7 @@ class JVMCIGlobals {
  static bool check_jvmci_flags_are_consistent();

  // Convert JVMCI experimental flags to product
  static bool enable_jvmci_product_mode(JVMFlagOrigin);
  static bool enable_jvmci_product_mode(JVMFlagOrigin origin, bool use_graal_jit);

  // Returns true iff the GC fully supports JVMCI.
  static bool gc_supports_jvmci();

@@ -32,6 +32,7 @@
#include "jvmci/vmStructs_jvmci.hpp"
#include "oops/klassVtable.hpp"
#include "oops/objArrayKlass.hpp"
#include "prims/jvmtiThreadState.hpp"
#include "runtime/deoptimization.hpp"
#include "runtime/flags/jvmFlag.hpp"
#include "runtime/osThread.hpp"
@@ -215,6 +216,10 @@
  nonstatic_field(JavaThread, _poll_data, SafepointMechanism::ThreadData) \
  nonstatic_field(JavaThread, _stack_overflow_state._reserved_stack_activation, address) \
  nonstatic_field(JavaThread, _held_monitor_count, int64_t) \
  JVMTI_ONLY(nonstatic_field(JavaThread, _is_in_VTMS_transition, bool)) \
  JVMTI_ONLY(nonstatic_field(JavaThread, _is_in_tmp_VTMS_transition, bool)) \
  \
  JVMTI_ONLY(static_field(JvmtiVTMSTransitionDisabler, _VTMS_notify_jvmti_events, bool)) \
  \
  static_field(java_lang_Class, _klass_offset, int) \
  static_field(java_lang_Class, _array_klass_offset, int) \
@@ -366,6 +371,7 @@
  JFR_ONLY(nonstatic_field(Thread, _jfr_thread_local, JfrThreadLocal)) \
  \
  static_field(java_lang_Thread, _tid_offset, int) \
  static_field(java_lang_Thread, _jvmti_is_in_VTMS_transition_offset, int) \
  JFR_ONLY(static_field(java_lang_Thread, _jfr_epoch_offset, int)) \
  \
  JFR_ONLY(nonstatic_field(JfrThreadLocal, _vthread_id, traceid)) \
@@ -756,6 +762,10 @@
  declare_function(SharedRuntime::enable_stack_reserved_zone) \
  declare_function(SharedRuntime::frem) \
  declare_function(SharedRuntime::drem) \
  JVMTI_ONLY(declare_function(SharedRuntime::notify_jvmti_vthread_start)) \
  JVMTI_ONLY(declare_function(SharedRuntime::notify_jvmti_vthread_end)) \
  JVMTI_ONLY(declare_function(SharedRuntime::notify_jvmti_vthread_mount)) \
  JVMTI_ONLY(declare_function(SharedRuntime::notify_jvmti_vthread_unmount)) \
  \
  declare_function(os::dll_load) \
  declare_function(os::dll_lookup) \

@@ -551,7 +551,7 @@ void ReservedHeapSpace::initialize_compressed_heap(const size_t size, size_t ali
  const size_t class_space = align_up(CompressedClassSpaceSize, alignment);
  // For small heaps, save some space for compressed class pointer
  // space so it can be decoded with no base.
  if (UseCompressedClassPointers && !UseSharedSpaces &&
  if (UseCompressedClassPointers && !UseSharedSpaces && !DumpSharedSpaces &&
      OopEncodingHeapMax <= KlassEncodingMetaspaceMax &&
      (uint64_t)(aligned_heap_base_min_address + size + class_space) <= KlassEncodingMetaspaceMax) {
    zerobased_max = (char *)OopEncodingHeapMax - class_space;

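To make the zero-based test above concrete, a back-of-the-envelope reading under assumed defaults (the exact constants depend on object alignment and flag settings, so treat the numbers as illustrative):

// Assuming the default 8-byte object alignment (LogMinObjAlignmentInBytes == 3),
// narrow oops address 4G slots, so OopEncodingHeapMax = 4G << 3 = 32 GB.
// With an assumed CompressedClassSpaceSize of 1 GB, the branch above reserves
// the class space right behind the heap and needs
//   aligned_heap_base_min_address + heap_size + 1 GB <= KlassEncodingMetaspaceMax
// so zerobased_max for the heap end becomes 32 GB - 1 GB = 31 GB, letting both
// oops and class pointers be decoded without adding a base register.
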
@@ -54,6 +54,7 @@ class WeakHandle {
  inline oop peek() const;
  void release(OopStorage* storage) const;
  bool is_null() const { return _obj == nullptr; }
  void set_null() { _obj = nullptr; }

  void replace(oop with_obj);

@@ -1030,6 +1030,14 @@ const Type* XorLNode::Value(PhaseGVN* phase) const {
  return AddNode::Value(phase);
}

Node* build_min_max_int(Node* a, Node* b, bool is_max) {
  if (is_max) {
    return new MaxINode(a, b);
  } else {
    return new MinINode(a, b);
  }
}

Node* MaxNode::build_min_max(Node* a, Node* b, bool is_max, bool is_unsigned, const Type* t, PhaseGVN& gvn) {
  bool is_int = gvn.type(a)->isa_int();
  assert(is_int || gvn.type(a)->isa_long(), "int or long inputs");
@@ -1044,13 +1052,8 @@ Node* MaxNode::build_min_max(Node* a, Node* b, bool is_max, bool is_unsigned, co
  }
  Node* res = nullptr;
  if (is_int && !is_unsigned) {
    if (is_max) {
      res = gvn.transform(new MaxINode(a, b));
      assert(gvn.type(res)->is_int()->_lo >= t->is_int()->_lo && gvn.type(res)->is_int()->_hi <= t->is_int()->_hi, "type doesn't match");
    } else {
      Node* res = gvn.transform(new MinINode(a, b));
      assert(gvn.type(res)->is_int()->_lo >= t->is_int()->_lo && gvn.type(res)->is_int()->_hi <= t->is_int()->_hi, "type doesn't match");
    }
    res = gvn.transform(build_min_max_int(a, b, is_max));
    assert(gvn.type(res)->is_int()->_lo >= t->is_int()->_lo && gvn.type(res)->is_int()->_hi <= t->is_int()->_hi, "type doesn't match");
  } else {
    Node* cmp = nullptr;
    if (is_max) {
@@ -1095,6 +1098,113 @@ Node* MaxNode::build_min_max_diff_with_zero(Node* a, Node* b, bool is_max, const
  return res;
}

// Check if addition of an integer with type 't' and a constant 'c' can overflow.
static bool can_overflow(const TypeInt* t, jint c) {
  jint t_lo = t->_lo;
  jint t_hi = t->_hi;
  return ((c < 0 && (java_add(t_lo, c) > t_lo)) ||
          (c > 0 && (java_add(t_hi, c) < t_hi)));
}

// Let <x, x_off> = x_operands and <y, y_off> = y_operands.
// If x == y and neither add(x, x_off) nor add(y, y_off) overflow, return
// add(x, op(x_off, y_off)). Otherwise, return nullptr.
Node* MaxNode::extract_add(PhaseGVN* phase, ConstAddOperands x_operands, ConstAddOperands y_operands) {
  Node* x = x_operands.first;
  Node* y = y_operands.first;
  int opcode = Opcode();
  assert(opcode == Op_MaxI || opcode == Op_MinI, "Unexpected opcode");
  const TypeInt* tx = phase->type(x)->isa_int();
  jint x_off = x_operands.second;
  jint y_off = y_operands.second;
  if (x == y && tx != nullptr &&
      !can_overflow(tx, x_off) &&
      !can_overflow(tx, y_off)) {
    jint c = opcode == Op_MinI ? MIN2(x_off, y_off) : MAX2(x_off, y_off);
    return new AddINode(x, phase->intcon(c));
  }
  return nullptr;
}

// Try to cast n as an integer addition with a constant. Return:
//   <x, C>,       if n == add(x, C), where 'C' is a non-TOP constant;
//   <nullptr, 0>, if n == add(x, C), where 'C' is a TOP constant; or
//   <n, 0>,       otherwise.
static ConstAddOperands as_add_with_constant(Node* n) {
  if (n->Opcode() != Op_AddI) {
    return ConstAddOperands(n, 0);
  }
  Node* x = n->in(1);
  Node* c = n->in(2);
  if (!c->is_Con()) {
    return ConstAddOperands(n, 0);
  }
  const Type* c_type = c->bottom_type();
  if (c_type == Type::TOP) {
    return ConstAddOperands(nullptr, 0);
  }
  return ConstAddOperands(x, c_type->is_int()->get_con());
}

Node* MaxNode::IdealI(PhaseGVN* phase, bool can_reshape) {
  int opcode = Opcode();
  assert(opcode == Op_MinI || opcode == Op_MaxI, "Unexpected opcode");
  // Try to transform the following pattern, in any of its four possible
  // permutations induced by op's commutativity:
  //   op(op(add(inner, inner_off), inner_other), add(outer, outer_off))
  // into
  //   op(add(inner, op(inner_off, outer_off)), inner_other),
  // where:
  //   op is either MinI or MaxI, and
  //   inner == outer, and
  //   the additions cannot overflow.
  for (uint inner_op_index = 1; inner_op_index <= 2; inner_op_index++) {
    if (in(inner_op_index)->Opcode() != opcode) {
      continue;
    }
    Node* outer_add = in(inner_op_index == 1 ? 2 : 1);
    ConstAddOperands outer_add_operands = as_add_with_constant(outer_add);
    if (outer_add_operands.first == nullptr) {
      return nullptr; // outer_add has a TOP input, no need to continue.
    }
    // One operand is a MinI/MaxI and the other is an integer addition with
    // constant. Test the operands of the inner MinI/MaxI.
    for (uint inner_add_index = 1; inner_add_index <= 2; inner_add_index++) {
      Node* inner_op = in(inner_op_index);
      Node* inner_add = inner_op->in(inner_add_index);
      ConstAddOperands inner_add_operands = as_add_with_constant(inner_add);
      if (inner_add_operands.first == nullptr) {
        return nullptr; // inner_add has a TOP input, no need to continue.
      }
      // Try to extract the inner add.
      Node* add_extracted = extract_add(phase, inner_add_operands, outer_add_operands);
      if (add_extracted == nullptr) {
        continue;
      }
      Node* add_transformed = phase->transform(add_extracted);
      Node* inner_other = inner_op->in(inner_add_index == 1 ? 2 : 1);
      return build_min_max_int(add_transformed, inner_other, opcode == Op_MaxI);
    }
  }
  // Try to transform
  //   op(add(x, x_off), add(y, y_off))
  // into
  //   add(x, op(x_off, y_off)),
  // where:
  //   op is either MinI or MaxI, and
  //   inner == outer, and
  //   the additions cannot overflow.
  ConstAddOperands xC = as_add_with_constant(in(1));
  ConstAddOperands yC = as_add_with_constant(in(2));
  if (xC.first == nullptr || yC.first == nullptr) return nullptr;
  return extract_add(phase, xC, yC);
}

// Ideal transformations for MaxINode
Node* MaxINode::Ideal(PhaseGVN* phase, bool can_reshape) {
  return IdealI(phase, can_reshape);
}

//=============================================================================
//------------------------------add_ring---------------------------------------
// Supplied function returns the sum of the inputs.
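
The overflow guard above is subtle enough to deserve a standalone check. The following self-contained sketch uses plain wrapping unsigned arithmetic in place of HotSpot's java_add, and raw [lo, hi] bounds in place of TypeInt; the helper itself and the chosen test values are illustrative only:

#include <cassert>
#include <cstdint>

// Adding a constant c to every value in the int range [lo, hi] overflows
// exactly when the relevant end of the range wraps around.
static bool can_overflow(int32_t lo, int32_t hi, int32_t c) {
  // Two's-complement wrapping add, matching java_add's behavior.
  int32_t lo_plus_c = (int32_t)((uint32_t)lo + (uint32_t)c);
  int32_t hi_plus_c = (int32_t)((uint32_t)hi + (uint32_t)c);
  return (c < 0 && lo_plus_c > lo) ||  // negative c wrapped below INT32_MIN
         (c > 0 && hi_plus_c < hi);    // positive c wrapped above INT32_MAX
}

int main() {
  assert(!can_overflow(0, 1000, 5));       // comfortably inside the range
  assert(can_overflow(0, INT32_MAX, 1));   // hi + 1 wraps
  assert(can_overflow(INT32_MIN, 0, -1));  // lo - 1 wraps
  return 0;
}
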
@@ -1106,174 +1216,12 @@ const Type *MaxINode::add_ring( const Type *t0, const Type *t1 ) const {
  return TypeInt::make( MAX2(r0->_lo,r1->_lo), MAX2(r0->_hi,r1->_hi), MAX2(r0->_widen,r1->_widen) );
}

// Check if addition of an integer with type 't' and a constant 'c' can overflow
static bool can_overflow(const TypeInt* t, jint c) {
  jint t_lo = t->_lo;
  jint t_hi = t->_hi;
  return ((c < 0 && (java_add(t_lo, c) > t_lo)) ||
          (c > 0 && (java_add(t_hi, c) < t_hi)));
}

// Ideal transformations for MaxINode
Node* MaxINode::Ideal(PhaseGVN* phase, bool can_reshape) {
  // Force a right-spline graph
  Node* l = in(1);
  Node* r = in(2);
  // Transform MaxI1(MaxI2(a, b), c) into MaxI1(a, MaxI2(b, c))
  // to force a right-spline graph for the rest of MaxINode::Ideal().
  if (l->Opcode() == Op_MaxI) {
    assert(l != l->in(1), "dead loop in MaxINode::Ideal");
    r = phase->transform(new MaxINode(l->in(2), r));
    l = l->in(1);
    set_req_X(1, l, phase);
    set_req_X(2, r, phase);
    return this;
  }

  // Get left input & constant
  Node* x = l;
  jint x_off = 0;
  if (x->Opcode() == Op_AddI && // Check for "x+c0" and collect constant
      x->in(2)->is_Con()) {
    const Type* t = x->in(2)->bottom_type();
    if (t == Type::TOP) return nullptr; // No progress
    x_off = t->is_int()->get_con();
    x = x->in(1);
  }

  // Scan a right-spline-tree for MAXs
  Node* y = r;
  jint y_off = 0;
  // Check final part of MAX tree
  if (y->Opcode() == Op_AddI && // Check for "y+c1" and collect constant
      y->in(2)->is_Con()) {
    const Type* t = y->in(2)->bottom_type();
    if (t == Type::TOP) return nullptr; // No progress
    y_off = t->is_int()->get_con();
    y = y->in(1);
  }
  if (x->_idx > y->_idx && r->Opcode() != Op_MaxI) {
    swap_edges(1, 2);
    return this;
  }

  const TypeInt* tx = phase->type(x)->isa_int();

  if (r->Opcode() == Op_MaxI) {
    assert(r != r->in(2), "dead loop in MaxINode::Ideal");
    y = r->in(1);
    // Check final part of MAX tree
    if (y->Opcode() == Op_AddI &&// Check for "y+c1" and collect constant
        y->in(2)->is_Con()) {
      const Type* t = y->in(2)->bottom_type();
      if (t == Type::TOP) return nullptr; // No progress
      y_off = t->is_int()->get_con();
      y = y->in(1);
    }

    if (x->_idx > y->_idx)
      return new MaxINode(r->in(1), phase->transform(new MaxINode(l, r->in(2))));

    // Transform MAX2(x + c0, MAX2(x + c1, z)) into MAX2(x + MAX2(c0, c1), z)
    // if x == y and the additions can't overflow.
    if (x == y && tx != nullptr &&
        !can_overflow(tx, x_off) &&
        !can_overflow(tx, y_off)) {
      return new MaxINode(phase->transform(new AddINode(x, phase->intcon(MAX2(x_off, y_off)))), r->in(2));
    }
  } else {
    // Transform MAX2(x + c0, y + c1) into x + MAX2(c0, c1)
    // if x == y and the additions can't overflow.
    if (x == y && tx != nullptr &&
        !can_overflow(tx, x_off) &&
        !can_overflow(tx, y_off)) {
      return new AddINode(x, phase->intcon(MAX2(x_off, y_off)));
    }
  }
  return nullptr;
}

//=============================================================================
//------------------------------Idealize---------------------------------------
// MINs show up in range-check loop limit calculations. Look for
// "MIN2(x+c0,MIN2(y,x+c1))". Pick the smaller constant: "MIN2(x+c0,y)"
Node *MinINode::Ideal(PhaseGVN *phase, bool can_reshape) {
  Node *progress = nullptr;
  // Force a right-spline graph
  Node *l = in(1);
  Node *r = in(2);
  // Transform MinI1( MinI2(a,b), c) into MinI1( a, MinI2(b,c) )
  // to force a right-spline graph for the rest of MinINode::Ideal().
  if( l->Opcode() == Op_MinI ) {
    assert( l != l->in(1), "dead loop in MinINode::Ideal" );
    r = phase->transform(new MinINode(l->in(2),r));
    l = l->in(1);
    set_req_X(1, l, phase);
    set_req_X(2, r, phase);
    return this;
  }

  // Get left input & constant
  Node *x = l;
  jint x_off = 0;
  if( x->Opcode() == Op_AddI && // Check for "x+c0" and collect constant
      x->in(2)->is_Con() ) {
    const Type *t = x->in(2)->bottom_type();
    if( t == Type::TOP ) return nullptr; // No progress
    x_off = t->is_int()->get_con();
    x = x->in(1);
  }

  // Scan a right-spline-tree for MINs
  Node *y = r;
  jint y_off = 0;
  // Check final part of MIN tree
  if( y->Opcode() == Op_AddI && // Check for "y+c1" and collect constant
      y->in(2)->is_Con() ) {
    const Type *t = y->in(2)->bottom_type();
    if( t == Type::TOP ) return nullptr; // No progress
    y_off = t->is_int()->get_con();
    y = y->in(1);
  }
  if( x->_idx > y->_idx && r->Opcode() != Op_MinI ) {
    swap_edges(1, 2);
    return this;
  }

  const TypeInt* tx = phase->type(x)->isa_int();

  if( r->Opcode() == Op_MinI ) {
    assert( r != r->in(2), "dead loop in MinINode::Ideal" );
    y = r->in(1);
    // Check final part of MIN tree
    if( y->Opcode() == Op_AddI &&// Check for "y+c1" and collect constant
        y->in(2)->is_Con() ) {
      const Type *t = y->in(2)->bottom_type();
      if( t == Type::TOP ) return nullptr; // No progress
      y_off = t->is_int()->get_con();
      y = y->in(1);
    }

    if( x->_idx > y->_idx )
      return new MinINode(r->in(1),phase->transform(new MinINode(l,r->in(2))));

    // Transform MIN2(x + c0, MIN2(x + c1, z)) into MIN2(x + MIN2(c0, c1), z)
    // if x == y and the additions can't overflow.
    if (x == y && tx != nullptr &&
        !can_overflow(tx, x_off) &&
        !can_overflow(tx, y_off)) {
      return new MinINode(phase->transform(new AddINode(x, phase->intcon(MIN2(x_off, y_off)))), r->in(2));
    }
  } else {
    // Transform MIN2(x + c0, y + c1) into x + MIN2(c0, c1)
    // if x == y and the additions can't overflow.
    if (x == y && tx != nullptr &&
        !can_overflow(tx, x_off) &&
        !can_overflow(tx, y_off)) {
      return new AddINode(x,phase->intcon(MIN2(x_off,y_off)));
    }
  }
  return nullptr;
Node* MinINode::Ideal(PhaseGVN* phase, bool can_reshape) {
  return IdealI(phase, can_reshape);
}

//------------------------------add_ring---------------------------------------

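For a concrete view of what the shared MaxNode::IdealI (and the rewritten MinINode::Ideal above) buys, the identity it exploits can be checked with ordinary ints, provided the additions cannot overflow, which is exactly what can_overflow rules out. A minimal sketch with hypothetical values:

#include <algorithm>
#include <cassert>

// min(x + c0, min(x + c1, z)) collapses to min(x + min(c0, c1), z),
// saving one add and taking z out of the min chain.
int main() {
  const int x = 40, z = 37, c0 = 3, c1 = -5;
  int before = std::min(x + c0, std::min(x + c1, z));
  int after  = std::min(x + std::min(c0, c1), z);
  assert(before == after);  // both sides are min(35, 37) == 35
  return 0;
}
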
@@ -28,10 +28,12 @@
#include "opto/node.hpp"
#include "opto/opcodes.hpp"
#include "opto/type.hpp"
#include "utilities/pair.hpp"

// Portions of code courtesy of Clifford Click

class PhaseTransform;
typedef const Pair<Node*, jint> ConstAddOperands;

//------------------------------AddNode----------------------------------------
// Classic Add functionality. This covers all the usual 'add' behaviors for
@@ -252,12 +254,14 @@ class MaxNode : public AddNode {
private:
  static Node* build_min_max(Node* a, Node* b, bool is_max, bool is_unsigned, const Type* t, PhaseGVN& gvn);
  static Node* build_min_max_diff_with_zero(Node* a, Node* b, bool is_max, const Type* t, PhaseGVN& gvn);
  Node* extract_add(PhaseGVN* phase, ConstAddOperands x_operands, ConstAddOperands y_operands);

public:
  MaxNode( Node *in1, Node *in2 ) : AddNode(in1,in2) {}
  virtual int Opcode() const = 0;
  virtual int max_opcode() const = 0;
  virtual int min_opcode() const = 0;
  Node* IdealI(PhaseGVN* phase, bool can_reshape);

  static Node* unsigned_max(Node* a, Node* b, const Type* t, PhaseGVN& gvn) {
    return build_min_max(a, b, true, true, t, gvn);

@@ -672,6 +672,12 @@ class Compile : public Phase {
  void print_method(CompilerPhaseType cpt, int level, Node* n = nullptr);

#ifndef PRODUCT
  void dump_igv(const char* graph_name, int level = 3) {
    if (should_print_igv(level)) {
      _igv_printer->print_method(graph_name, level);
    }
  }

  void igv_print_method_to_file(const char* phase_name = "Debug", bool append = false);
  void igv_print_method_to_network(const char* phase_name = "Debug");
  static IdealGraphPrinter* debug_file_printer() { return _debug_file_printer; }

@@ -831,8 +831,9 @@ class Invariance : public StackObj {
// Returns true if the predicate of iff is in "scale*iv + offset u< load_range(ptr)" format
// Note: this function is particularly designed for loop predication. We require load_range
// and offset to be loop invariant computed on the fly by "invar"
bool IdealLoopTree::is_range_check_if(IfNode *iff, PhaseIdealLoop *phase, BasicType bt, Node *iv, Node *&range,
bool IdealLoopTree::is_range_check_if(IfProjNode* if_success_proj, PhaseIdealLoop *phase, BasicType bt, Node *iv, Node *&range,
                                      Node *&offset, jlong &scale) const {
  IfNode* iff = if_success_proj->in(0)->as_If();
  if (!is_loop_exit(iff)) {
    return false;
  }
@@ -840,7 +841,43 @@ bool IdealLoopTree::is_range_check_if(IfNode *iff, PhaseIdealLoop *phase, BasicT
    return false;
  }
  const BoolNode *bol = iff->in(1)->as_Bool();
  if (bol->_test._test != BoolTest::lt) {
  if (bol->_test._test != BoolTest::lt || if_success_proj->is_IfFalse()) {
    // We don't have the required range check pattern:
    //   if (scale*iv + offset <u limit) {
    //
    //   } else {
    //     trap();
    //   }
    //
    // Having the trap on the true projection:
    //   if (scale*iv + offset <u limit) {
    //     trap();
    //   }
    //
    // is not correct. We would need to flip the test to get the expected "trap on false path" pattern:
    //   if (scale*iv + offset >=u limit) {
    //
    //   } else {
    //     trap();
    //   }
    //
    // If we create a Hoisted Range Check Predicate for this wrong pattern, it could succeed at runtime (i.e. true
    // for the value of "scale*iv + offset" in the first loop iteration and true for the value of "scale*iv + offset"
    // in the last loop iteration) while the check to be hoisted could fail in other loop iterations.
    //
    // Example:
    // Loop: "for (int i = -1; i < 1000; i++)"
    // init = "scale*iv + offset" in the first loop iteration = 1*-1 + 0 = -1
    // last = "scale*iv + offset" in the last loop iteration = 1*999 + 0 = 999
    // limit = 100
    //
    // Hoisted Range Check Predicate is always true:
    //   init >=u limit && last >=u limit <=>
    //   -1 >=u 100 && 999 >=u 100
    //
    // But for 0 <= x < 100: x >=u 100 is false.
    // We would wrongly skip the branch with the trap() and possibly miss to execute some other statements inside that
    // trap() branch.
    return false;
  }
  if (!bol->in(1)->is_Cmp()) {

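The counter-example in the comment above can be replayed directly. A small self-contained sketch, where the ge_u/lt_u helpers stand in for the unsigned compares the compiler emits:

#include <cassert>
#include <cstdint>

static bool ge_u(int32_t a, int32_t b) { return (uint32_t)a >= (uint32_t)b; }
static bool lt_u(int32_t a, int32_t b) { return (uint32_t)a <  (uint32_t)b; }

int main() {
  const int32_t limit = 100;
  // For "for (int i = -1; i < 1000; i++)" with scale == 1, offset == 0:
  assert(ge_u(-1, limit));   // init: 0xFFFFFFFF >=u 100 holds
  assert(ge_u(999, limit));  // last: 999 >=u 100 holds
  // ... so the would-be hoisted predicate succeeds, yet the per-iteration
  // check still passes for 0 <= i < 100, meaning the trap branch would have
  // been wrongly skipped:
  assert(lt_u(5, limit));
  return 0;
}
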
@@ -871,14 +908,14 @@ bool IdealLoopTree::is_range_check_if(IfNode *iff, PhaseIdealLoop *phase, BasicT
  return true;
}

bool IdealLoopTree::is_range_check_if(IfNode *iff, PhaseIdealLoop *phase, Invariance& invar DEBUG_ONLY(COMMA ProjNode *predicate_proj)) const {
bool IdealLoopTree::is_range_check_if(IfProjNode* if_success_proj, PhaseIdealLoop *phase, Invariance& invar DEBUG_ONLY(COMMA ProjNode *predicate_proj)) const {
  Node* range = nullptr;
  Node* offset = nullptr;
  jlong scale = 0;
  Node* iv = _head->as_BaseCountedLoop()->phi();
  Compile* C = Compile::current();
  const uint old_unique_idx = C->unique();
  if (!is_range_check_if(iff, phase, T_INT, iv, range, offset, scale)) {
  if (!is_range_check_if(if_success_proj, phase, T_INT, iv, range, offset, scale)) {
    return false;
  }
  if (!invar.is_invariant(range)) {
@@ -931,10 +968,8 @@ bool IdealLoopTree::is_range_check_if(IfNode *iff, PhaseIdealLoop *phase, Invari
//     max(scale*i + offset) = scale*(limit-stride) + offset
// (2) stride*scale < 0
//     max(scale*i + offset) = scale*init + offset
BoolNode* PhaseIdealLoop::rc_predicate(IdealLoopTree *loop, Node* ctrl,
                                       int scale, Node* offset,
                                       Node* init, Node* limit, jint stride,
                                       Node* range, bool upper, bool &overflow, bool negate) {
BoolNode* PhaseIdealLoop::rc_predicate(IdealLoopTree* loop, Node* ctrl, int scale, Node* offset, Node* init,
                                       Node* limit, jint stride, Node* range, bool upper, bool& overflow) {
  jint con_limit = (limit != nullptr && limit->is_Con()) ? limit->get_int() : 0;
  jint con_init = init->is_Con() ? init->get_int() : 0;
  jint con_offset = offset->is_Con() ? offset->get_int() : 0;
@@ -1060,7 +1095,7 @@ BoolNode* PhaseIdealLoop::rc_predicate(IdealLoopTree *loop, Node* ctrl,
    cmp = new CmpUNode(max_idx_expr, range);
  }
  register_new_node(cmp, ctrl);
  BoolNode* bol = new BoolNode(cmp, negate ? BoolTest::ge : BoolTest::lt);
  BoolNode* bol = new BoolNode(cmp, BoolTest::lt);
  register_new_node(bol, ctrl);

  if (TraceLoopPredicate) {
@@ -1323,12 +1358,12 @@ void PhaseIdealLoop::loop_predication_follow_branches(Node *n, IdealLoopTree *lo
  } while (stack.size() > 0);
}

bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNode* if_proj,
bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNode* if_success_proj,
                                                  ParsePredicateSuccessProj* parse_predicate_proj, CountedLoopNode* cl,
                                                  ConNode* zero, Invariance& invar, Deoptimization::DeoptReason reason) {
  // Following are changed to nonnull when a predicate can be hoisted
  IfProjNode* new_predicate_proj = nullptr;
  IfNode* iff = if_proj->in(0)->as_If();
  IfNode* iff = if_success_proj->in(0)->as_If();
  Node* test = iff->in(1);
  if (!test->is_Bool()) { //Conv2B, ...
    return false;
@@ -1344,7 +1379,7 @@ bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNod

  // Negate test if necessary (Parse Predicates always have IfTrue as success projection and IfFalse as uncommon trap)
  bool negated = false;
  if (if_proj->is_IfFalse()) {
  if (if_success_proj->is_IfFalse()) {
    new_predicate_bol = new BoolNode(new_predicate_bol->in(1), new_predicate_bol->_test.negate());
    register_new_node(new_predicate_bol, ctrl);
    negated = true;
@@ -1361,8 +1396,9 @@ bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNod
      loop->dump_head();
    }
#endif
  } else if (cl != nullptr && loop->is_range_check_if(iff, this, invar DEBUG_ONLY(COMMA parse_predicate_proj))) {
  } else if (cl != nullptr && loop->is_range_check_if(if_success_proj, this, invar DEBUG_ONLY(COMMA parse_predicate_proj))) {
    // Range check for counted loops
    assert(if_success_proj->is_IfTrue(), "trap must be on false projection for a range check");
    const Node* cmp = bol->in(1)->as_Cmp();
    Node* idx = cmp->in(1);
    assert(!invar.is_invariant(idx), "index is variant");
@@ -1397,33 +1433,31 @@ bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNod
    }
    // If predicate expressions may overflow in the integer range, longs are used.
    bool overflow = false;
    // Negate test if necessary (Parse Predicates always have IfTrue as success projection and IfFalse as uncommon trap)
    const bool negate = (if_proj->is_IfFalse());

    // Test the lower bound
    BoolNode* lower_bound_bol = rc_predicate(loop, ctrl, scale, offset, init, limit, stride, rng, false, overflow, negate);
    BoolNode* lower_bound_bol = rc_predicate(loop, ctrl, scale, offset, init, limit, stride, rng, false, overflow);

    const int if_opcode = iff->Opcode();
    IfProjNode* lower_bound_proj = create_new_if_for_predicate(parse_predicate_proj, nullptr, reason, overflow ? Op_If : if_opcode);
    IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If();
    _igvn.hash_delete(lower_bound_iff);
    lower_bound_iff->set_req(1, lower_bound_bol);
    if (TraceLoopPredicate) tty->print_cr("lower bound check if: %s %d ", negate ? " negated" : "", lower_bound_iff->_idx);
    if (TraceLoopPredicate) tty->print_cr("lower bound check if: %d", lower_bound_iff->_idx);

    // Test the upper bound
    BoolNode* upper_bound_bol = rc_predicate(loop, lower_bound_proj, scale, offset, init, limit, stride, rng, true, overflow, negate);
    BoolNode* upper_bound_bol = rc_predicate(loop, lower_bound_proj, scale, offset, init, limit, stride, rng, true,
                                             overflow);

    IfProjNode* upper_bound_proj = create_new_if_for_predicate(parse_predicate_proj, nullptr, reason, overflow ? Op_If : if_opcode);
    assert(upper_bound_proj->in(0)->as_If()->in(0) == lower_bound_proj, "should dominate");
    IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If();
    _igvn.hash_delete(upper_bound_iff);
    upper_bound_iff->set_req(1, upper_bound_bol);
    if (TraceLoopPredicate) tty->print_cr("upper bound check if: %s %d ", negate ? " negated" : "", lower_bound_iff->_idx);
    if (TraceLoopPredicate) tty->print_cr("upper bound check if: %d", lower_bound_iff->_idx);

    // Fall through into rest of the cleanup code which will move any dependent nodes to the skeleton predicates of the
    // upper bound test. We always need to create skeleton predicates in order to properly remove dead loops when later
    // splitting the predicated loop into (unreachable) sub-loops (i.e. done by unrolling, peeling, pre/main/post etc.).
    new_predicate_proj = add_template_assertion_predicate(iff, loop, if_proj, parse_predicate_proj, upper_bound_proj, scale,
    new_predicate_proj = add_template_assertion_predicate(iff, loop, if_success_proj, parse_predicate_proj, upper_bound_proj, scale,
                                                          offset, init, limit, stride, rng, overflow, reason);

#ifndef PRODUCT
@@ -1439,10 +1473,10 @@ bool PhaseIdealLoop::loop_predication_impl_helper(IdealLoopTree* loop, IfProjNod
  }
  assert(new_predicate_proj != nullptr, "sanity");
  // Success - attach condition (new_predicate_bol) to predicate if
  invar.map_ctrl(if_proj, new_predicate_proj); // so that invariance test can be appropriate
  invar.map_ctrl(if_success_proj, new_predicate_proj); // so that invariance test can be appropriate

  // Eliminate the old If in the loop body
  dominated_by(new_predicate_proj, iff, if_proj->_con != new_predicate_proj->_con );
  dominated_by(new_predicate_proj, iff, if_success_proj->_con != new_predicate_proj->_con);

  C->set_major_progress();
  return true;
@@ -1459,7 +1493,8 @@ IfProjNode* PhaseIdealLoop::add_template_assertion_predicate(IfNode* iff, IdealL
  Node* opaque_init = new OpaqueLoopInitNode(C, init);
  register_new_node(opaque_init, upper_bound_proj);
  bool negate = (if_proj->_con != predicate_proj->_con);
  BoolNode* bol = rc_predicate(loop, upper_bound_proj, scale, offset, opaque_init, limit, stride, rng, (stride > 0) != (scale > 0), overflow, negate);
  BoolNode* bol = rc_predicate(loop, upper_bound_proj, scale, offset, opaque_init, limit, stride, rng,
                               (stride > 0) != (scale > 0), overflow);
  Node* opaque_bol = new Opaque4Node(C, bol, _igvn.intcon(1)); // This will go away once loop opts are over
  C->add_template_assertion_predicate_opaq(opaque_bol);
  register_new_node(opaque_bol, upper_bound_proj);
@@ -1481,7 +1516,8 @@ IfProjNode* PhaseIdealLoop::add_template_assertion_predicate(IfNode* iff, IdealL
    max_value = new CastIINode(max_value, loop->_head->as_CountedLoop()->phi()->bottom_type());
    register_new_node(max_value, predicate_proj);

    bol = rc_predicate(loop, new_proj, scale, offset, max_value, limit, stride, rng, (stride > 0) != (scale > 0), overflow, negate);
    bol = rc_predicate(loop, new_proj, scale, offset, max_value, limit, stride, rng, (stride > 0) != (scale > 0),
                       overflow);
    opaque_bol = new Opaque4Node(C, bol, _igvn.intcon(1));
    C->add_template_assertion_predicate_opaq(opaque_bol);
    register_new_node(opaque_bol, new_proj);
@@ -1799,6 +1835,9 @@ ParsePredicateNode* ParsePredicates::get_parse_predicate_or_null(Node* parse_pre
}

// Initialize the Parse Predicate projection field that matches the kind of the parent of `parse_predicate_proj`.
// Only initialize if Parse Predicate projection itself or any of the Parse Predicate projections coming further up
// in the graph are not already initialized (this would be a sign of repeated Parse Predicates which are not cleaned up,
// yet).
bool ParsePredicates::assign_predicate_proj(ParsePredicateSuccessProj* parse_predicate_proj) {
  ParsePredicateNode* parse_predicate = get_parse_predicate_or_null(parse_predicate_proj);
  assert(parse_predicate != nullptr, "must exist");
@@ -1811,13 +1850,16 @@ bool ParsePredicates::assign_predicate_proj(ParsePredicateSuccessProj* parse_pre
      _loop_predicate_proj = parse_predicate_proj;
      break;
    case Deoptimization::DeoptReason::Reason_profile_predicate:
      if (_profiled_loop_predicate_proj != nullptr) {
      if (_profiled_loop_predicate_proj != nullptr ||
          _loop_predicate_proj != nullptr) {
        return false;
      }
      _profiled_loop_predicate_proj = parse_predicate_proj;
      break;
    case Deoptimization::DeoptReason::Reason_loop_limit_check:
      if (_loop_limit_check_predicate_proj != nullptr) {
      if (_loop_limit_check_predicate_proj != nullptr ||
          _loop_predicate_proj != nullptr ||
          _profiled_loop_predicate_proj != nullptr) {
        return false;
      }
      _loop_limit_check_predicate_proj = parse_predicate_proj;

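The bound selection that rc_predicate encodes (see the max(scale*i + offset) comments earlier in this file) is plain arithmetic. A sketch with hypothetical loop constants, assuming (limit - init) is a multiple of stride so the last iteration really uses i == limit - stride:

#include <cassert>

int main() {
  const int init = 0, limit = 1000, stride = 4, scale = 2, offset = 8;
  // (1) stride*scale > 0: the maximum of scale*i + offset is at the last iteration.
  int max_index = scale * (limit - stride) + offset;  // 2*996 + 8 == 2000
  assert(max_index == 2000);
  // (2) stride*scale < 0 (e.g. scale == -2): the maximum is at the first iteration.
  int max_index_neg = -2 * init + offset;             // == 8
  assert(max_index_neg == 8);
  return 0;
}
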
@@ -2858,7 +2858,7 @@ Node* PhaseIdealLoop::add_range_check_elimination_assertion_predicate(IdealLoopT
                                                                      Node* value) {
  bool overflow = false;
  BoolNode* bol = rc_predicate(loop, ctrl, scale_con, offset, value, nullptr, stride_con,
                               limit, (stride_con > 0) != (scale_con > 0), overflow, false);
                               limit, (stride_con > 0) != (scale_con > 0), overflow);
  Node* opaque_bol = new Opaque4Node(C, bol, _igvn.intcon(1));
  register_new_node(opaque_bol, ctrl);
  IfNode* new_iff = nullptr;

@@ -1089,13 +1089,13 @@ int PhaseIdealLoop::extract_long_range_checks(const IdealLoopTree* loop, jlong s
  for (uint i = 0; i < loop->_body.size(); i++) {
    Node* c = loop->_body.at(i);
    if (c->is_IfProj() && c->in(0)->is_RangeCheck()) {
      CallStaticJavaNode* call = c->as_IfProj()->is_uncommon_trap_if_pattern(Deoptimization::Reason_none);
      IfProjNode* if_proj = c->as_IfProj();
      CallStaticJavaNode* call = if_proj->is_uncommon_trap_if_pattern(Deoptimization::Reason_none);
      if (call != nullptr) {
        Node* range = nullptr;
        Node* offset = nullptr;
        jlong scale = 0;
        RangeCheckNode* rc = c->in(0)->as_RangeCheck();
        if (loop->is_range_check_if(rc, this, T_LONG, phi, range, offset, scale) &&
        if (loop->is_range_check_if(if_proj, this, T_LONG, phi, range, offset, scale) &&
            loop->is_invariant(range) && loop->is_invariant(offset) &&
            original_iters_limit / ABS(scale * stride_con) >= min_iters) {
          reduced_iters_limit = MIN2(reduced_iters_limit, original_iters_limit/ABS(scale));

@@ -733,8 +733,8 @@ public:
  bool policy_range_check(PhaseIdealLoop* phase, bool provisional, BasicType bt) const;

  // Return TRUE if "iff" is a range check.
  bool is_range_check_if(IfNode *iff, PhaseIdealLoop *phase, Invariance& invar DEBUG_ONLY(COMMA ProjNode *predicate_proj)) const;
  bool is_range_check_if(IfNode* iff, PhaseIdealLoop* phase, BasicType bt, Node* iv, Node*& range, Node*& offset,
  bool is_range_check_if(IfProjNode* if_success_proj, PhaseIdealLoop* phase, Invariance& invar DEBUG_ONLY(COMMA ProjNode* predicate_proj)) const;
  bool is_range_check_if(IfProjNode* if_success_proj, PhaseIdealLoop* phase, BasicType bt, Node* iv, Node*& range, Node*& offset,
                         jlong& scale) const;

  // Estimate the number of nodes required when cloning a loop (body).
@@ -1366,15 +1366,12 @@ public:
  void register_control(Node* n, IdealLoopTree *loop, Node* pred, bool update_body = true);

  // Construct a range check for a predicate if
  BoolNode* rc_predicate(IdealLoopTree *loop, Node* ctrl,
                         int scale, Node* offset,
                         Node* init, Node* limit, jint stride,
                         Node* range, bool upper, bool &overflow,
                         bool negate);
  BoolNode* rc_predicate(IdealLoopTree* loop, Node* ctrl, int scale, Node* offset, Node* init, Node* limit,
                         jint stride, Node* range, bool upper, bool& overflow);

  // Implementation of the loop predication to promote checks outside the loop
  bool loop_predication_impl(IdealLoopTree *loop);
  bool loop_predication_impl_helper(IdealLoopTree* loop, IfProjNode* if_proj,
  bool loop_predication_impl_helper(IdealLoopTree* loop, IfProjNode* if_success_proj,
                                    ParsePredicateSuccessProj* parse_predicate_proj, CountedLoopNode* cl, ConNode* zero,
                                    Invariance& invar, Deoptimization::DeoptReason reason);
  bool loop_predication_should_follow_branches(IdealLoopTree* loop, IfProjNode* predicate_proj, float& loop_trip_cnt);
@@ -1486,7 +1483,7 @@ public:
  IfNode* insert_cmpi_loop_exit(IfNode* if_cmpu, IdealLoopTree *loop);
  void remove_cmpi_loop_exit(IfNode* if_cmp, IdealLoopTree *loop);
  // Utility to register node "n" with PhaseIdealLoop
  void register_node(Node* n, IdealLoopTree *loop, Node* pred, int ddepth);
  void register_node(Node* n, IdealLoopTree* loop, Node* pred, uint ddepth);
  // Utility to create an if-projection
  ProjNode* proj_clone(ProjNode* p, IfNode* iff);
  // Force the iff control output to be the live_proj

@@ -2711,7 +2711,7 @@ Node* PhaseIdealLoop::stay_in_loop( Node* n, IdealLoopTree *loop) {

//------------------------------ register_node -------------------------------------
// Utility to register node "n" with PhaseIdealLoop
void PhaseIdealLoop::register_node(Node* n, IdealLoopTree *loop, Node* pred, int ddepth) {
void PhaseIdealLoop::register_node(Node* n, IdealLoopTree* loop, Node* pred, uint ddepth) {
  _igvn.register_new_node_with_optimizer(n);
  loop->_body.push(n);
  if (n->is_CFG()) {
@@ -2770,7 +2770,7 @@ ProjNode* PhaseIdealLoop::insert_if_before_proj(Node* left, bool Signed, BoolTes
  IfNode* iff = proj->in(0)->as_If();
  IdealLoopTree *loop = get_loop(proj);
  ProjNode *other_proj = iff->proj_out(!proj->is_IfTrue())->as_Proj();
  int ddepth = dom_depth(proj);
  uint ddepth = dom_depth(proj);

  _igvn.rehash_node_delayed(iff);
  _igvn.rehash_node_delayed(proj);
@@ -2831,7 +2831,7 @@ RegionNode* PhaseIdealLoop::insert_region_before_proj(ProjNode* proj) {
  IfNode* iff = proj->in(0)->as_If();
  IdealLoopTree *loop = get_loop(proj);
  ProjNode *other_proj = iff->proj_out(!proj->is_IfTrue())->as_Proj();
  int ddepth = dom_depth(proj);
  uint ddepth = dom_depth(proj);

  _igvn.rehash_node_delayed(iff);
  _igvn.rehash_node_delayed(proj);

@@ -3712,8 +3712,12 @@ void SuperWord::compute_vector_element_type() {
          assert(nn->is_Cmp(), "always have Cmp above Bool");
        }
        if (nn->is_Cmp() && nn->in(0) == nullptr) {
          nn = nn->in(1);
          set_velt_type(n, velt_type(nn));
          assert(in_bb(nn->in(1)) || in_bb(nn->in(2)), "one of the inputs must be in the loop too");
          if (in_bb(nn->in(1))) {
            set_velt_type(n, velt_type(nn->in(1)));
          } else {
            set_velt_type(n, velt_type(nn->in(2)));
          }
        }
      }
#ifndef PRODUCT

@@ -3628,8 +3628,10 @@ static jint JNI_CreateJavaVM_inner(JavaVM **vm, void **penv, void *args) {
    // to continue.
    if (Universe::is_fully_initialized()) {
      // otherwise no pending exception possible - VM will already have aborted
      JavaThread* THREAD = JavaThread::current(); // For exception macros.
      if (HAS_PENDING_EXCEPTION) {
      Thread* current = Thread::current_or_null();
      if (current != nullptr) {
        JavaThread* THREAD = JavaThread::cast(current); // For exception macros.
        assert(HAS_PENDING_EXCEPTION, "must be - else no current thread exists");
        HandleMark hm(THREAD);
        vm_exit_during_initialization(Handle(THREAD, PENDING_EXCEPTION));
      }

@@ -1362,7 +1362,7 @@ class ScopedValueBindingsResolver {
public:
  InstanceKlass* Carrier_klass;
  ScopedValueBindingsResolver(JavaThread* THREAD) {
    Klass *k = SystemDictionary::resolve_or_fail(vmSymbols::jdk_incubator_concurrent_ScopedValue_Carrier(), true, THREAD);
    Klass *k = SystemDictionary::resolve_or_fail(vmSymbols::java_lang_ScopedValue_Carrier(), true, THREAD);
    Carrier_klass = InstanceKlass::cast(k);
  }
};
@@ -1395,7 +1395,7 @@ JVM_ENTRY(jobject, JVM_FindScopedValueBindings(JNIEnv *env, jclass cls))
    if (loc != -1) {
      javaVFrame *frame = vfst.asJavaVFrame();
      StackValueCollection* locals = frame->locals();
      StackValue* head_sv = locals->at(loc); // jdk/incubator/concurrent/ScopedValue$Snapshot
      StackValue* head_sv = locals->at(loc); // java/lang/ScopedValue$Snapshot
      Handle result = head_sv->get_obj();
      assert(!head_sv->obj_is_scalar_replaced(), "found scalar-replaced object");
      if (result() != nullptr) {
@@ -4024,3 +4024,10 @@ JVM_END
JVM_ENTRY(void, JVM_EnsureMaterializedForStackWalk_func(JNIEnv* env, jobject vthread, jobject value))
  JVM_EnsureMaterializedForStackWalk(env, value);
JVM_END

/*
 * Return JNI_TRUE if warnings are printed when agents are dynamically loaded.
 */
JVM_LEAF(jboolean, JVM_PrintWarningAtDynamicAgentLoad(void))
  return (EnableDynamicAgentLoading && !FLAG_IS_CMDLINE(EnableDynamicAgentLoading)) ? JNI_TRUE : JNI_FALSE;
JVM_END

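The expression in JVM_PrintWarningAtDynamicAgentLoad above encodes a three-way policy that is easy to misread; reading it off as a table (the "default" row assumes EnableDynamicAgentLoading defaults to true in this release):

// EnableDynamicAgentLoading                 FLAG_IS_CMDLINE   result
// true  (default, flag untouched)           false             JNI_TRUE  -> warn on dynamic attach
// true  (-XX:+EnableDynamicAgentLoading)    true              JNI_FALSE -> user opted in, no warning
// false (-XX:-EnableDynamicAgentLoading)    true              JNI_FALSE -> attach is refused elsewhere
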
@@ -640,6 +640,17 @@ Agent_OnLoad_L(JavaVM *vm, char *options, void *reserved)</example>
or implementation specific API, to attach to the running VM, and request it start a given
agent.
<p/>
The VM prints a warning on the standard error stream for each agent that it attempts
to start in the live phase. If an agent was previously started (in the <code>OnLoad</code>
phase or in the live phase), then it is implementation specific as to whether a
warning is printed when attempting to start the same agent a second or subsequent time.
Warnings can be disabled by means of an implementation-specific command line option.
<p/>
<b>Implementation Note:</b> For the HotSpot VM, the VM option
<code>-XX:+EnableDynamicAgentLoading</code> is used to opt-in to allow dynamic loading
of agents in the live phase. This option suppresses the warning to standard error when
starting an agent in the live phase.
<p/>
If an agent is started during the live phase then its agent library
must export a start-up function
with the following prototype:
@@ -15381,60 +15392,52 @@ typedef void (JNICALL *jvmtiEventVMInit)
  </change>
  <change date="13 October 2016" version="9.0.0">
    Support for modules:
      - The majorversion is 9 now
      - The ClassFileLoadHook events are not sent during the primordial phase anymore.
      - Allow CompiledMethodLoad events at start phase
      - Add new capabilities:
        - can_generate_early_vmstart
        - can_generate_early_class_hook_events
      - Add new functions:
        - GetAllModules
        - AddModuleReads, AddModuleExports, AddModuleOpens, AddModuleUses, AddModuleProvides
        - IsModifiableModule
    Clarified can_redefine_any_classes, can_retransform_any_classes and IsModifiableClass API to
    disallow some implementation defined classes.
    The majorversion is 9 now.
    The ClassFileLoadHook events are not sent during the primordial phase anymore.
    Allow CompiledMethodLoad events at start phase.
    Add new capabilities: can_generate_early_vmstart, can_generate_early_class_hook_events.
    Add new functions: GetAllModules, AddModuleReads, AddModuleExports,
    AddModuleOpens, AddModuleUses, AddModuleProvides, IsModifiableModule.
    Clarified can_redefine_any_classes, can_retransform_any_classes and
    IsModifiableClass API to disallow some implementation defined classes.
  </change>
  <change date="12 February 2017" version="9.0.0">
    Minor update for GetCurrentThread function:
      - The function may return NULL in the start phase if the
        can_generate_early_vmstart capability is enabled.
    The function may return NULL in the start phase if the
    can_generate_early_vmstart capability is enabled.
  </change>
  <change date="7 February 2018" version="11.0.0">
    Minor update for new class file NestHost and NestMembers attributes:
      - Specify that RedefineClasses and RetransformClasses are not allowed
        to change the class file NestHost and NestMembers attributes.
      - Add new error JVMTI_ERROR_UNSUPPORTED_REDEFINITION_CLASS_ATTRIBUTE_CHANGED
        that can be returned by RedefineClasses and RetransformClasses.
    Specify that RedefineClasses and RetransformClasses are not allowed
    to change the class file NestHost and NestMembers attributes;
    Add new error JVMTI_ERROR_UNSUPPORTED_REDEFINITION_CLASS_ATTRIBUTE_CHANGED
    that can be returned by RedefineClasses and RetransformClasses.
  </change>
  <change date="15 June 2018" version="11.0.0">
    Support for Low Overhead Heap Sampling:
      - Add new capability:
        - can_generate_sampled_object_alloc_events
      - Add new function:
        - SetHeapSamplingInterval
      - Add new event type:
        - JVMTI_EVENT_SAMPLED_OBJECT_ALLOC
    Add new capability: can_generate_sampled_object_alloc_events.
    Add new function: SetHeapSamplingInterval.
    Add new event type: JVMTI_EVENT_SAMPLED_OBJECT_ALLOC.
  </change>
  <change date="20 May 2019" version="13.0.0">
    Minor spec update for the capability "can_redefine_any_class".
    It now says:
    "RedefineClasses can be called on any modifiable class. See IsModifiableClass.
    (can_redefine_classes must also be set)"
    It now says: "RedefineClasses can be called on any modifiable class."
    See IsModifiableClass. (can_redefine_classes must also be set)
  </change>
  <change date="5 June 2019" version="13.0.0">
    Minor PopFrame spec update:
      - The specified thread must be suspended or must be the current thread.
        (It was not allowed to be the current thread before.)
    The specified thread must be suspended or must be the current thread.
    (It was not allowed to be the current thread before.)
  </change>
  <change date="10 October 2019" version="14.0.0">
    Minor update for new class file Record attribute:
      - Specify that RedefineClasses and RetransformClasses are not allowed
        to change the class file Record attribute.
    Specify that RedefineClasses and RetransformClasses are not allowed
    to change the class file Record attribute.
  </change>
  <change date="13 May 2020" version="15.0.0">
    Minor update for new class file PermittedSubclasses attribute:
      - Specify that RedefineClasses and RetransformClasses are not allowed
        to change the class file PermittedSubclasses attribute.
    Specify that RedefineClasses and RetransformClasses are not allowed
    to change the class file PermittedSubclasses attribute.
  </change>
  <change date="15 January 2021" version="17.0.0">
    Minor clarification in the section "Agent Shutdown" that says the
@@ -15445,17 +15448,15 @@ typedef void (JNICALL *jvmtiEventVMInit)
    Minor update to deprecate Heap functions 1.0.
  </change>
  <change date="27 April 2022" version="19.0.0">
    Support for virtual threads:
      - Add new capability:
        - can_support_virtual_threads
      - Add new functions:
        - SuspendAllVirtualThreads
        - ResumeAllVirtualThreads
      - Add new event types:
        - JVMTI_EVENT_VIRTUAL_THREAD_START
        - JVMTI_EVENT_VIRTUAL_THREAD_END
      - Add new error code:
        - JVMTI_ERROR_UNSUPPORTED_OPERATION
    Support for virtual threads (Preview):
    Add new capability: can_support_virtual_threads.
    Add new functions: SuspendAllVirtualThreads, ResumeAllVirtualThreads.
    Add new event types: JVMTI_EVENT_VIRTUAL_THREAD_START, JVMTI_EVENT_VIRTUAL_THREAD_END.
    Add new error code: JVMTI_ERROR_UNSUPPORTED_OPERATION.
  </change>
  <change date="7 June 2023" version="21.0.0">
    Virtual threads finalized to be a permanent feature.
    Agent start-up in the live phase now specified to print a warning.
  </change>
</changehistory>

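For reference alongside the spec text above, a dynamically attached agent only has to export Agent_OnAttach. The minimal sketch below assumes nothing beyond the standard jvmti.h; capability requests and error handling are elided, and any nonzero return value causes the attach operation to fail on the tool side:

#include <jvmti.h>

extern "C" JNIEXPORT jint JNICALL
Agent_OnAttach(JavaVM* vm, char* options, void* reserved) {
  jvmtiEnv* jvmti = nullptr;
  // Obtain a JVMTI environment from the running VM.
  if (vm->GetEnv(reinterpret_cast<void**>(&jvmti), JVMTI_VERSION) != JNI_OK) {
    return JNI_ERR;
  }
  // Request capabilities and register callbacks here as needed.
  return JNI_OK;
}
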
@@ -30,13 +30,16 @@
#include "jvmtifiles/jvmtiEnv.hpp"
#include "prims/jvmtiEnvBase.hpp"
#include "prims/jvmtiExport.hpp"
#include "prims/jvmtiAgentList.hpp"
#include "runtime/arguments.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/java.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/globals_extension.hpp"
#include "runtime/os.inline.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/defaultStream.hpp"

static inline const char* copy_string(const char* str) {
  return str != nullptr ? os::strdup(str, mtServiceability) : nullptr;
@@ -260,9 +263,9 @@ static void assert_preload(const JvmtiAgent* agent) {

// Check for a statically linked-in agent, i.e. in the executable.
// This should be the first function called when loading an agent. It is a bit special:
// For statically linked agents we cant't rely on os_lib == nullptr because
// For statically linked agents we can't rely on os_lib == nullptr because
// statically linked agents could have a handle of RTLD_DEFAULT which == 0 on some platforms.
// If this function returns true, then agent->is_static_lib().&& agent->is_loaded().
// If this function returns true, then agent->is_static_lib() && agent->is_loaded().
static bool load_agent_from_executable(JvmtiAgent* agent, const char* on_load_symbols[], size_t num_symbol_entries) {
  DEBUG_ONLY(assert_preload(agent);)
  assert(on_load_symbols != nullptr, "invariant");
@@ -483,7 +486,13 @@ extern "C" {
}

// Loading the agent by invoking Agent_OnAttach.
// This function is called before the agent is added to JvmtiAgentList.
static bool invoke_Agent_OnAttach(JvmtiAgent* agent, outputStream* st) {
  if (!EnableDynamicAgentLoading) {
    st->print_cr("Dynamic agent loading is not enabled. "
                 "Use -XX:+EnableDynamicAgentLoading to launch target VM.");
    return false;
  }
  DEBUG_ONLY(assert_preload(agent);)
  assert(agent->is_dynamic(), "invariant");
  assert(st != nullptr, "invariant");
@@ -491,7 +500,10 @@ static bool invoke_Agent_OnAttach(JvmtiAgent* agent, outputStream* st) {
  const char* on_attach_symbols[] = AGENT_ONATTACH_SYMBOLS;
  const size_t num_symbol_entries = ARRAY_SIZE(on_attach_symbols);
  void* library = nullptr;
  if (!load_agent_from_executable(agent, &on_attach_symbols[0], num_symbol_entries)) {
  bool previously_loaded;
  if (load_agent_from_executable(agent, &on_attach_symbols[0], num_symbol_entries)) {
    previously_loaded = JvmtiAgentList::is_static_lib_loaded(agent->name());
  } else {
    library = load_library(agent, &on_attach_symbols[0], num_symbol_entries, /* vm_exit_on_error */ false);
    if (library == nullptr) {
      st->print_cr("%s was not loaded.", agent->name());
@@ -503,7 +515,17 @@ static bool invoke_Agent_OnAttach(JvmtiAgent* agent, outputStream* st) {
    agent->set_os_lib_path(&buffer[0]);
    agent->set_os_lib(library);
    agent->set_loaded();
    previously_loaded = JvmtiAgentList::is_dynamic_lib_loaded(library);
  }

  // Print warning if agent was not previously loaded and EnableDynamicAgentLoading not enabled on the command line.
  if (!previously_loaded && !FLAG_IS_CMDLINE(EnableDynamicAgentLoading) && !agent->is_instrument_lib()) {
    jio_fprintf(defaultStream::error_stream(),
      "WARNING: A JVM TI agent has been loaded dynamically (%s)\n"
      "WARNING: If a serviceability tool is in use, please run with -XX:+EnableDynamicAgentLoading to hide this warning\n"
      "WARNING: Dynamic loading of agents will be disallowed by default in a future release\n", agent->name());
  }

  assert(agent->is_loaded(), "invariant");
  // The library was loaded so we attempt to lookup and invoke the Agent_OnAttach function.
  OnAttachEntry_t on_attach_entry = CAST_TO_FN_PTR(OnAttachEntry_t,

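Substituting an agent path into the jio_fprintf format string above, a dynamic attach without the opt-in flag (for example via "jcmd <pid> JVMTI.agent_load /tmp/libmyagent.so", with a hypothetical library path) produces three lines on standard error:

WARNING: A JVM TI agent has been loaded dynamically (/tmp/libmyagent.so)
WARNING: If a serviceability tool is in use, please run with -XX:+EnableDynamicAgentLoading to hide this warning
WARNING: Dynamic loading of agents will be disallowed by default in a future release
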
@@ -220,6 +220,30 @@ void JvmtiAgentList::unload_agents() {
  }
}

// Return true if a statically linked agent is on the list
bool JvmtiAgentList::is_static_lib_loaded(const char* name) {
  JvmtiAgentList::Iterator it = JvmtiAgentList::agents();
  while (it.has_next()) {
    JvmtiAgent* const agent = it.next();
    if (agent->is_static_lib() && strcmp(agent->name(), name) == 0) {
      return true;
    }
  }
  return false;
}

// Return true if a dynamically loaded agent library is on the list
bool JvmtiAgentList::is_dynamic_lib_loaded(void* os_lib) {
  JvmtiAgentList::Iterator it = JvmtiAgentList::agents();
  while (it.has_next()) {
    JvmtiAgent* const agent = it.next();
    if (!agent->is_static_lib() && agent->os_lib() == os_lib) {
      return true;
    }
  }
  return false;
}

static bool match(JvmtiEnv* env, const JvmtiAgent* agent, const void* os_module_address) {
  assert(env != nullptr, "invariant");
  assert(agent != nullptr, "invariant");
@@ -76,6 +76,9 @@ class JvmtiAgentList : AllStatic {
  static void load_xrun_agents() NOT_JVMTI_RETURN;
  static void unload_agents() NOT_JVMTI_RETURN;

  static bool is_static_lib_loaded(const char* name);
  static bool is_dynamic_lib_loaded(void* os_lib);

  static JvmtiAgent* lookup(JvmtiEnv* env, void* f_ptr);

  static Iterator agents() NOT_JVMTI({ Iterator it; return it; });

@@ -1780,9 +1780,9 @@ JvmtiEnv::GetAllStackTraces(jint max_frame_count, jvmtiStackInfo** stack_info_ptr)
 jvmtiError
 JvmtiEnv::GetThreadListStackTraces(jint thread_count, const jthread* thread_list, jint max_frame_count, jvmtiStackInfo** stack_info_ptr) {
   jvmtiError err = JVMTI_ERROR_NONE;
-  JvmtiVTMSTransitionDisabler disabler;
 
   if (thread_count == 1) {
+    JvmtiVTMSTransitionDisabler disabler;
+
     // Use direct handshake if we need to get only one stack trace.
     JavaThread *current_thread = JavaThread::current();

@@ -1344,13 +1344,15 @@ JvmtiEnvBase::current_thread_obj_or_resolve_external_guard(jthread thread) {
 }
 
 jvmtiError
-JvmtiEnvBase::get_threadOop_and_JavaThread(ThreadsList* t_list, jthread thread,
+JvmtiEnvBase::get_threadOop_and_JavaThread(ThreadsList* t_list, jthread thread, JavaThread* cur_thread,
                                            JavaThread** jt_pp, oop* thread_oop_p) {
-  JavaThread* cur_thread = JavaThread::current();
   JavaThread* java_thread = nullptr;
   oop thread_oop = nullptr;
 
   if (thread == nullptr) {
+    if (cur_thread == nullptr) { // cur_thread can be null when called from a VM_op
+      return JVMTI_ERROR_INVALID_THREAD;
+    }
     java_thread = cur_thread;
     thread_oop = get_vthread_or_thread_oop(java_thread);
     if (thread_oop == nullptr || !thread_oop->is_a(vmClasses::Thread_klass())) {

@@ -1381,6 +1383,14 @@ JvmtiEnvBase::get_threadOop_and_JavaThread(ThreadsList* t_list, jthread thread,
   return JVMTI_ERROR_NONE;
 }
 
+jvmtiError
+JvmtiEnvBase::get_threadOop_and_JavaThread(ThreadsList* t_list, jthread thread,
+                                           JavaThread** jt_pp, oop* thread_oop_p) {
+  JavaThread* cur_thread = JavaThread::current();
+  jvmtiError err = get_threadOop_and_JavaThread(t_list, thread, cur_thread, jt_pp, thread_oop_p);
+  return err;
+}
+
 // Check for JVMTI_ERROR_NOT_SUSPENDED and JVMTI_ERROR_OPAQUE_FRAME errors.
 // Used in PopFrame and ForceEarlyReturn implementations.
 jvmtiError

@@ -1931,13 +1941,15 @@ VM_GetThreadListStackTraces::doit() {
     jthread jt = _thread_list[i];
     JavaThread* java_thread = nullptr;
     oop thread_oop = nullptr;
-    jvmtiError err = JvmtiExport::cv_external_thread_to_JavaThread(tlh.list(), jt, &java_thread, &thread_oop);
+    jvmtiError err = JvmtiEnvBase::get_threadOop_and_JavaThread(tlh.list(), jt, nullptr, &java_thread, &thread_oop);
 
     if (err != JVMTI_ERROR_NONE) {
       // We got an error code so we don't have a JavaThread *, but
       // only return an error from here if we didn't get a valid
       // thread_oop.
-      // In the virtual thread case the cv_external_thread_to_JavaThread is expected to correctly set
-      // the thread_oop and return JVMTI_ERROR_INVALID_THREAD which we ignore here.
+      // In the virtual thread case the get_threadOop_and_JavaThread is expected to correctly set
+      // the thread_oop and return JVMTI_ERROR_THREAD_NOT_ALIVE which we ignore here.
       // The corresponding thread state will be recorded in the jvmtiStackInfo.state.
       if (thread_oop == nullptr) {
         _collector.set_result(err);
         return;

@@ -1952,7 +1964,7 @@ VM_GetThreadListStackTraces::doit() {
 void
 GetSingleStackTraceClosure::do_thread(Thread *target) {
   JavaThread *jt = JavaThread::cast(target);
-  oop thread_oop = jt->threadObj();
+  oop thread_oop = JNIHandles::resolve_external_guard(_jthread);
 
   if (!jt->is_exiting() && thread_oop != nullptr) {
     ResourceMark rm;

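The refactor above threads the current JavaThread through explicitly, so callers running inside a VM operation (where there is no current JavaThread) can pass nullptr, while a convenience overload keeps existing call sites unchanged. A generic sketch of the same pattern (hypothetical types, not HotSpot API):

#include <iostream>

struct Thread { const char* name; };

// Hypothetical analogue: thread-local "current thread", null off the Java threads.
thread_local Thread* g_current = nullptr;

// Core function takes the caller's thread explicitly; nullptr is a valid input
// (e.g. when invoked from a VM operation rather than a Java thread).
int resolve(Thread* cur, const char* what) {
  if (cur == nullptr) {
    std::cout << "resolving " << what << " without a current thread\n";
    return -1;  // analogous to returning JVMTI_ERROR_INVALID_THREAD
  }
  std::cout << cur->name << " resolving " << what << "\n";
  return 0;
}

// Convenience overload preserves the old signature for existing callers.
int resolve(const char* what) {
  return resolve(g_current, what);
}

int main() {
  Thread t{"main"};
  g_current = &t;
  resolve("thread A");           // uses the implicit current thread
  resolve(nullptr, "thread B");  // explicit nullptr, as from a VM op
  return 0;
}
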
@@ -214,6 +214,8 @@ class JvmtiEnvBase : public CHeapObj<mtInternal> {
     return result;
   }
 
+  static jvmtiError get_threadOop_and_JavaThread(ThreadsList* t_list, jthread thread, JavaThread* cur_thread,
+                                                 JavaThread** jt_pp, oop* thread_oop_p);
   static jvmtiError get_threadOop_and_JavaThread(ThreadsList* t_list, jthread thread,
                                                  JavaThread** jt_pp, oop* thread_oop_p);
 

@@ -2968,14 +2968,15 @@ void JvmtiTagMap::iterate_over_reachable_objects(jvmtiHeapRootCallback heap_root_callback,
                                                  jvmtiStackReferenceCallback stack_ref_callback,
                                                  jvmtiObjectReferenceCallback object_ref_callback,
                                                  const void* user_data) {
+  // VTMS transitions must be disabled before the EscapeBarrier.
+  JvmtiVTMSTransitionDisabler disabler;
+
   JavaThread* jt = JavaThread::current();
   EscapeBarrier eb(true, jt);
   eb.deoptimize_objects_all_threads();
   Arena dead_object_arena(mtServiceability);
   GrowableArray<jlong> dead_objects(&dead_object_arena, 10, 0, 0);
 
-  JvmtiVTMSTransitionDisabler disabler;
-
   {
     MutexLocker ml(Heap_lock);
     BasicHeapWalkContext context(heap_root_callback, stack_ref_callback, object_ref_callback);

@@ -3015,6 +3016,9 @@ void JvmtiTagMap::follow_references(jint heap_filter,
                                     const jvmtiHeapCallbacks* callbacks,
                                     const void* user_data)
 {
+  // VTMS transitions must be disabled before the EscapeBarrier.
+  JvmtiVTMSTransitionDisabler disabler;
+
   oop obj = JNIHandles::resolve(object);
   JavaThread* jt = JavaThread::current();
   Handle initial_object(jt, obj);

@@ -3027,8 +3031,6 @@ void JvmtiTagMap::follow_references(jint heap_filter,
   Arena dead_object_arena(mtServiceability);
   GrowableArray<jlong> dead_objects(&dead_object_arena, 10, 0, 0);
 
-  JvmtiVTMSTransitionDisabler disabler;
-
   {
     MutexLocker ml(Heap_lock);
     AdvancedHeapWalkContext context(heap_filter, klass, callbacks);

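Both tag-map hunks move the JvmtiVTMSTransitionDisabler ahead of the EscapeBarrier: since C++ destroys locals in reverse construction order, constructing the disabler first guarantees the barrier never exists while transitions are enabled. A toy illustration of why guard order matters (hypothetical guards, plain C++):

#include <iostream>

struct TransitionDisabler {
  TransitionDisabler()  { std::cout << "transitions disabled\n"; }
  ~TransitionDisabler() { std::cout << "transitions re-enabled\n"; }
};

struct Barrier {
  Barrier()  { std::cout << "barrier set up (requires transitions disabled)\n"; }
  ~Barrier() { std::cout << "barrier torn down\n"; }
};

int main() {
  // Correct order: the disabler is constructed first and destroyed last,
  // so the barrier's whole lifetime is covered. Declaring the disabler
  // after the barrier would tear it down while the barrier still exists.
  TransitionDisabler disabler;
  Barrier barrier;
  return 0;
}
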
@@ -1905,7 +1905,7 @@ bool Arguments::check_vm_args_consistency() {
 #endif
 
 
-#if !defined(X86) && !defined(AARCH64) && !defined(RISCV64) && !defined(ARM)
+#if !defined(X86) && !defined(AARCH64) && !defined(RISCV64) && !defined(ARM) && !defined(PPC64)
   if (LockingMode == LM_LIGHTWEIGHT) {
     FLAG_SET_CMDLINE(LockingMode, LM_LEGACY);
     warning("New lightweight locking not supported on this platform");

@@ -2827,28 +2827,42 @@ jint Arguments::parse_each_vm_init_arg(const JavaVMInitArgs* args, bool* patch_mod_javabase, JVMFlagOrigin origin) {
       return JNI_ERR;
 #endif // INCLUDE_MANAGEMENT
 #if INCLUDE_JVMCI
-    } else if (match_option(option, "-XX:-EnableJVMCIProduct")) {
+    } else if (match_option(option, "-XX:-EnableJVMCIProduct") || match_option(option, "-XX:-UseGraalJIT")) {
       if (EnableJVMCIProduct) {
         jio_fprintf(defaultStream::error_stream(),
-                  "-XX:-EnableJVMCIProduct cannot come after -XX:+EnableJVMCIProduct\n");
+                  "-XX:-EnableJVMCIProduct or -XX:-UseGraalJIT cannot come after -XX:+EnableJVMCIProduct or -XX:+UseGraalJIT\n");
         return JNI_EINVAL;
       }
-    } else if (match_option(option, "-XX:+EnableJVMCIProduct")) {
-      // Just continue, since "-XX:+EnableJVMCIProduct" has been specified before
+    } else if (match_option(option, "-XX:+EnableJVMCIProduct") || match_option(option, "-XX:+UseGraalJIT")) {
+      bool use_graal_jit = match_option(option, "-XX:+UseGraalJIT");
+      if (use_graal_jit) {
+        const char* jvmci_compiler = get_property("jvmci.Compiler");
+        if (jvmci_compiler != nullptr) {
+          if (strncmp(jvmci_compiler, "graal", strlen("graal")) != 0) {
+            jio_fprintf(defaultStream::error_stream(),
+                "Value of jvmci.Compiler incompatible with +UseGraalJIT: %s", jvmci_compiler);
+            return JNI_ERR;
+          }
+        } else if (!add_property("jvmci.Compiler=graal")) {
+          return JNI_ENOMEM;
+        }
+      }
+
+      // Just continue, since "-XX:+EnableJVMCIProduct" or "-XX:+UseGraalJIT" has been specified before
       if (EnableJVMCIProduct) {
         continue;
       }
       JVMFlag *jvmciFlag = JVMFlag::find_flag("EnableJVMCIProduct");
       // Allow this flag if it has been unlocked.
       if (jvmciFlag != nullptr && jvmciFlag->is_unlocked()) {
-        if (!JVMCIGlobals::enable_jvmci_product_mode(origin)) {
+        if (!JVMCIGlobals::enable_jvmci_product_mode(origin, use_graal_jit)) {
           jio_fprintf(defaultStream::error_stream(),
                     "Unable to enable JVMCI in product mode");
           return JNI_ERR;
         }
       }
       // The flag was locked so process normally to report that error
-      else if (!process_argument("EnableJVMCIProduct", args->ignoreUnrecognized, origin)) {
+      else if (!process_argument(use_graal_jit ? "UseGraalJIT" : "EnableJVMCIProduct", args->ignoreUnrecognized, origin)) {
        return JNI_EINVAL;
       }
 #endif // INCLUDE_JVMCI

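In effect, -XX:+UseGraalJIT is introduced here as an alias flag: it enables everything -XX:+EnableJVMCIProduct enables and additionally pins the jvmci.Compiler property to graal. A minimal argv-scanning sketch of alias handling (hypothetical option names, nothing like the real HotSpot parser):

#include <cstring>
#include <cstdio>

int main(int argc, char** argv) {
  bool enable_jvmci_product = false;
  bool use_graal_jit = false;
  for (int i = 1; i < argc; i++) {
    // The alias enables everything its target enables, plus its own extras.
    if (strcmp(argv[i], "-XX:+EnableJVMCIProduct") == 0 ||
        strcmp(argv[i], "-XX:+UseGraalJIT") == 0) {
      enable_jvmci_product = true;
      if (strcmp(argv[i], "-XX:+UseGraalJIT") == 0) {
        use_graal_jit = true;
      }
    }
  }
  if (use_graal_jit) {
    printf("would set jvmci.Compiler=graal\n");  // the alias's extra effect
  }
  printf("EnableJVMCIProduct=%s\n", enable_jvmci_product ? "true" : "false");
  return 0;
}
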
@@ -238,6 +238,8 @@ class VM_HandshakeAllThreads: public VM_Operation {
  public:
   VM_HandshakeAllThreads(HandshakeOperation* op) : _op(op) {}
 
+  const char* cause() const { return _op->name(); }
+
   bool evaluate_at_safepoint() const { return false; }
 
   void doit() {

@@ -276,7 +276,10 @@ ObjectMonitor::ObjectMonitor(oop object) :
 { }
 
 ObjectMonitor::~ObjectMonitor() {
-  _object.release(_oop_storage);
+  if (!_object.is_null()) {
+    // Release object's oop storage if it hasn't already been done.
+    release_object();
+  }
 }
 
 oop ObjectMonitor::object() const {

@@ -595,6 +598,9 @@ bool ObjectMonitor::deflate_monitor() {
     install_displaced_markword_in_object(obj);
   }
 
+  // Release object's oop storage since the ObjectMonitor has been deflated:
+  release_object();
+
   // We leave owner == DEFLATER_MARKER and contentions < 0
   // to force any racing threads to retry.
   return true;  // Success, ObjectMonitor has been deflated.

@@ -363,6 +363,7 @@ private:
   // Deflation support
   bool deflate_monitor();
   void install_displaced_markword_in_object(const oop obj);
+  void release_object() { _object.release(_oop_storage); _object.set_null(); }
 };
 
 #endif // SHARE_RUNTIME_OBJECTMONITOR_HPP

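The new release_object() both releases the storage and nulls the handle, which is what lets the destructor's is_null() check skip a double release after deflate_monitor() has already run. The same release-and-null idiom in standalone C++ (hypothetical resource type, not the HotSpot classes):

#include <cstdio>

class Holder {
  int* _slot = nullptr;
 public:
  explicit Holder(int v) : _slot(new int(v)) {}

  // Release the resource and null the handle so a later release is a no-op.
  void release_object() { delete _slot; _slot = nullptr; }

  ~Holder() {
    if (_slot != nullptr) {   // skip if release_object() already ran
      release_object();
    }
  }
};

int main() {
  Holder h(42);
  h.release_object();  // early release, e.g. at deflation time
  // The destructor sees _slot == nullptr and does nothing: no double free.
  std::printf("released once, destroyed safely\n");
  return 0;
}
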
@@ -162,7 +162,7 @@ char* os::iso8601_time(jlong milliseconds_since_19700101, char* buffer, size_t buffer_length, bool utc) {
   // No offset when dealing with UTC
   time_t UTC_to_local = 0;
   if (!utc) {
-#if defined(_ALLBSD_SOURCE) || defined(_GNU_SOURCE)
+#if (defined(_ALLBSD_SOURCE) || defined(_GNU_SOURCE)) && !defined(AIX)
     UTC_to_local = -(time_struct.tm_gmtoff);
 #elif defined(_WINDOWS)
     long zone;

@@ -878,8 +878,8 @@ bool os::print_function_and_library_name(outputStream* st,
   // this as a function descriptor for the reader (see below).
   if (!have_function_name && os::is_readable_pointer(addr)) {
     address addr2 = (address)os::resolve_function_descriptor(addr);
-    if (have_function_name = is_function_descriptor =
-        dll_address_to_function_name(addr2, p, buflen, &offset, demangle)) {
+    if ((have_function_name = is_function_descriptor =
+        dll_address_to_function_name(addr2, p, buflen, &offset, demangle))) {
       addr = addr2;
     }
   }

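The second os.cpp change only adds an extra pair of parentheses around the chained assignment: that is the conventional way to tell GCC and Clang (-Wparentheses) that an assignment inside an if condition is intentional rather than a mistyped comparison. A self-contained example of the idiom:

#include <cstdio>

static bool lookup(int* out) { *out = 7; return true; }

int main() {
  int value = 0;
  bool found = false;
  // Without the extra parentheses, -Wparentheses warns that this assignment
  // might have been meant as ==. The double parens assert intent.
  if ((found = lookup(&value))) {
    std::printf("found=%d value=%d\n", found, value);
  }
  return 0;
}
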
@@ -791,6 +791,9 @@ class os: AllStatic {
   static size_t lasterror(char *buf, size_t len);
   static int get_last_error();
 
+  // Send JFR memory info event
+  static void jfr_report_memory_info() NOT_JFR_RETURN();
+
   // Replacement for strerror().
   // Will return the english description of the error (e.g. "File not found", as
   // suggested in the POSIX standard.

@@ -1645,6 +1645,15 @@ public:
   };
 };
 
+static size_t delete_monitors(GrowableArray<ObjectMonitor*>* delete_list) {
+  size_t count = 0;
+  for (ObjectMonitor* monitor: *delete_list) {
+    delete monitor;
+    count++;
+  }
+  return count;
+}
+
 // This function is called by the MonitorDeflationThread to deflate
 // ObjectMonitors. It is also called via do_final_audit_and_print_stats()
 // and VM_ThreadDump::doit() by the VMThread.

@@ -1719,16 +1728,30 @@ size_t ObjectSynchronizer::deflate_idle_monitors(ObjectMonitorsHashtable* table) {
   }
 
   // After the handshake, safely free the ObjectMonitors that were
-  // deflated in this cycle.
-  for (ObjectMonitor* monitor: delete_list) {
-    delete monitor;
-    deleted_count++;
-
-    if (current->is_Java_thread()) {
-      // A JavaThread must check for a safepoint/handshake and honor it.
-      chk_for_block_req(JavaThread::cast(current), "deletion", "deleted_count",
-                        deleted_count, ls, &timer);
+  // deflated and unlinked in this cycle.
+  if (current->is_Java_thread()) {
+    if (ls != NULL) {
+      timer.stop();
+      ls->print_cr("before setting blocked: unlinked_count=" SIZE_FORMAT
+                   ", in_use_list stats: ceiling=" SIZE_FORMAT ", count="
+                   SIZE_FORMAT ", max=" SIZE_FORMAT,
+                   unlinked_count, in_use_list_ceiling(),
+                   _in_use_list.count(), _in_use_list.max());
+    }
+    // Mark the calling JavaThread blocked (safepoint safe) while we free
+    // the ObjectMonitors so we don't delay safepoints whilst doing that.
+    ThreadBlockInVM tbivm(JavaThread::cast(current));
+    if (ls != NULL) {
+      ls->print_cr("after setting blocked: in_use_list stats: ceiling="
+                   SIZE_FORMAT ", count=" SIZE_FORMAT ", max=" SIZE_FORMAT,
+                   in_use_list_ceiling(), _in_use_list.count(), _in_use_list.max());
+      timer.start();
+    }
+    deleted_count = delete_monitors(&delete_list);
+    // ThreadBlockInVM is destroyed here
+  } else {
+    // A non-JavaThread can just free the ObjectMonitors:
+    deleted_count = delete_monitors(&delete_list);
+  }
+  assert(unlinked_count == deleted_count, "must be");
 }

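The deflation hunk replaces per-monitor safepoint polling with a single blocked region around the whole bulk delete, after factoring the loop into delete_monitors(). A standalone sketch of the shape of that change (BlockedScope is a hypothetical stand-in for ThreadBlockInVM, plain C++):

#include <cstdio>
#include <vector>

struct Monitor { int id; };

// Factored-out helper, as in the patch: free everything, count as we go.
static size_t delete_monitors(std::vector<Monitor*>* delete_list) {
  size_t count = 0;
  for (Monitor* m : *delete_list) {
    delete m;
    count++;
  }
  return count;
}

// Hypothetical stand-in for ThreadBlockInVM: mark the thread "blocked"
// (safepoint safe) for the whole bulk operation instead of polling per item.
struct BlockedScope {
  BlockedScope()  { std::printf("thread marked blocked\n"); }
  ~BlockedScope() { std::printf("thread unblocked\n"); }
};

int main() {
  std::vector<Monitor*> delete_list = { new Monitor{1}, new Monitor{2} };
  size_t deleted;
  {
    BlockedScope blocked;                    // one guard around the whole loop
    deleted = delete_monitors(&delete_list); // no per-item safepoint checks
  }
  std::printf("deleted %zu monitors\n", deleted);
  return 0;
}
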
@@ -564,7 +564,12 @@ jint Threads::create_vm(JavaVMInitArgs* args, bool* canTryAgain) {
   status = init_globals2();
   if (status != JNI_OK) {
     Threads::remove(main_thread, false);
-    main_thread->smr_delete();
+    // It is possible that we managed to fully initialize Universe but have then
+    // failed by throwing an exception. In that case our caller JNI_CreateJavaVM
+    // will want to report it, so we can't delete the main thread.
+    if (!main_thread->has_pending_exception()) {
+      main_thread->smr_delete();
+    }
     *canTryAgain = false; // don't let caller call JNI_CreateJavaVM again
     return status;
   }

@@ -174,6 +174,8 @@ class VM_Operation : public StackObj {
     assert(type >= 0 && type < VMOp_Terminating, "invalid VM operation type");
     return _names[type];
   }
+  // Extra information about what triggered this operation.
+  virtual const char* cause() const { return nullptr; }
 #ifndef PRODUCT
   void print_on(outputStream* st) const { print_on_error(st); }
 #endif

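The new cause() follows a common pattern: a base-class virtual returning nullptr to mean "no extra information", overridden only where something meaningful exists (the VM_HandshakeAllThreads hunk above forwards its operation's name). A compact sketch of the pattern (hypothetical class names):

#include <cstdio>

class Operation {
 public:
  virtual ~Operation() = default;
  // Extra information about what triggered this operation; nullptr by default.
  virtual const char* cause() const { return nullptr; }
};

class HandshakeAll : public Operation {
  const char* _op_name;
 public:
  explicit HandshakeAll(const char* op_name) : _op_name(op_name) {}
  const char* cause() const override { return _op_name; }
};

static void report(const Operation& op) {
  const char* c = op.cause();
  std::printf("cause: %s\n", c != nullptr ? c : "(none)");
}

int main() {
  Operation plain;
  HandshakeAll hs("Deoptimize");
  report(plain);  // cause: (none)
  report(hs);     // cause: Deoptimize
  return 0;
}
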
Some files were not shown because too many files have changed in this diff.