Coleen Phillimore 2016-01-14 16:26:38 -05:00
commit f8800caf4f
1124 changed files with 35140 additions and 7456 deletions

View file

@ -342,3 +342,4 @@ f242d4332f563648426a1b0fa02d8741beba19ef jdk9-b92
4edcff1b9a8875eb6380a2165dfec599e8e3f7c0 jdk-9+97
d00ad2d9049ac60815f70bff445e95df85648bd2 jdk-9+98
f9bcdce2df26678c3fe468130b535c0342c69b89 jdk-9+99
4379223f8806626852c46c52d4e7a27a584b406e jdk-9+100

View file

@ -342,3 +342,4 @@ cf1dc4c035fb84693d4ae5ad818785cb4d1465d1 jdk9-b90
75c3897541ecb52ee16d001ea605b12971df7303 jdk-9+97
48987460c7d49a29013963ee44d090194396bb61 jdk-9+98
7c0577bea4c65d69c5bef67023a89d2efa4fb2f7 jdk-9+99
c1f30ac14db0eaff398429c04cd9fab92e1b4b2a jdk-9+100

View file

@ -251,6 +251,24 @@ AC_DEFUN([BPERF_SETUP_CCACHE_USAGE],
  fi
])
################################################################################
#
# Runs icecc-create-env once and prints the error if it fails
#
# $1: arguments to icecc-create-env
# $2: log file
#
AC_DEFUN([BPERF_RUN_ICECC_CREATE_ENV],
[
cd ${CONFIGURESUPPORT_OUTPUTDIR}/icecc \
&& ${ICECC_CREATE_ENV} $1 > $2 2>&1
if test "$?" != "0"; then
AC_MSG_NOTICE([icecc-create-env output:])
cat $2
AC_MSG_ERROR([Failed to create icecc compiler environment])
fi
])
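
For readers not fluent in autoconf, the new BPERF_RUN_ICECC_CREATE_ENV helper expands to a plain shell pattern: run icecc-create-env with all output captured in a log file, and dump that log before aborting if the command fails. A rough standalone shell equivalent (the SUPPORT_DIR and LOG_FILE names are illustrative, not taken from the patch):

    # run icecc-create-env, capturing stdout and stderr in a log file
    cd "$SUPPORT_DIR/icecc" \
        && icecc-create-env --gcc "$CC" "$CXX" > "$LOG_FILE" 2>&1
    if test "$?" != "0"; then
      echo "icecc-create-env output:"
      cat "$LOG_FILE"
      echo "Failed to create icecc compiler environment" >&2
      exit 1
    fi
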
################################################################################
#
# Optionally enable distributed compilation of native code using icecc/icecream
@ -271,16 +289,18 @@ AC_DEFUN([BPERF_SETUP_ICECC],
    # be sent to the other hosts in the icecream cluster.
    icecc_create_env_log="${CONFIGURESUPPORT_OUTPUTDIR}/icecc/icecc_create_env.log"
    ${MKDIR} -p ${CONFIGURESUPPORT_OUTPUTDIR}/icecc
-   AC_MSG_CHECKING([for icecc build environment for target compiler])
+   # Older versions of icecc does not have the --gcc parameter
+   if ${ICECC_CREATE_ENV} | $GREP -q -e --gcc; then
+     icecc_gcc_arg="--gcc"
+   fi
    if test "x${TOOLCHAIN_TYPE}" = "xgcc"; then
-     cd ${CONFIGURESUPPORT_OUTPUTDIR}/icecc \
-         && ${ICECC_CREATE_ENV} --gcc ${CC} ${CXX} > ${icecc_create_env_log}
+     BPERF_RUN_ICECC_CREATE_ENV([${icecc_gcc_arg} ${CC} ${CXX}], \
+         ${icecc_create_env_log})
    elif test "x$TOOLCHAIN_TYPE" = "xclang"; then
      # For clang, the icecc compilerwrapper is needed. It usually resides next
      # to icecc-create-env.
      BASIC_REQUIRE_PROGS(ICECC_WRAPPER, compilerwrapper)
-     cd ${CONFIGURESUPPORT_OUTPUTDIR}/icecc \
-         && ${ICECC_CREATE_ENV} --clang ${CC} ${ICECC_WRAPPER} > ${icecc_create_env_log}
+     BPERF_RUN_ICECC_CREATE_ENV([--clang ${CC} ${ICECC_WRAPPER}], ${icecc_create_env_log})
    else
      AC_MSG_ERROR([Can only create icecc compiler packages for toolchain types gcc and clang])
    fi
@ -289,24 +309,31 @@ AC_DEFUN([BPERF_SETUP_ICECC],
    # to find it.
    ICECC_ENV_BUNDLE_BASENAME="`${SED} -n '/^creating/s/creating //p' ${icecc_create_env_log}`"
    ICECC_ENV_BUNDLE="${CONFIGURESUPPORT_OUTPUTDIR}/icecc/${ICECC_ENV_BUNDLE_BASENAME}"
+   if test ! -f ${ICECC_ENV_BUNDLE}; then
+     AC_MSG_ERROR([icecc-create-env did not produce an environment ${ICECC_ENV_BUNDLE}])
+   fi
+   AC_MSG_CHECKING([for icecc build environment for target compiler])
    AC_MSG_RESULT([${ICECC_ENV_BUNDLE}])
    ICECC="ICECC_VERSION=${ICECC_ENV_BUNDLE} ICECC_CC=${CC} ICECC_CXX=${CXX} ${ICECC_CMD}"
    if test "x${COMPILE_TYPE}" = "xcross"; then
      # If cross compiling, create a separate env package for the build compiler
-     AC_MSG_CHECKING([for icecc build environment for build compiler])
      # Assume "gcc" or "cc" is gcc and "clang" is clang. Otherwise bail.
+     icecc_create_env_log_build="${CONFIGURESUPPORT_OUTPUTDIR}/icecc/icecc_create_env_build.log"
      if test "x${BUILD_CC##*/}" = "xgcc" || test "x${BUILD_CC##*/}" = "xcc"; then
-       cd ${CONFIGURESUPPORT_OUTPUTDIR}/icecc \
-           && ${ICECC_CREATE_ENV} --gcc ${BUILD_CC} ${BUILD_CXX} > ${icecc_create_env_log}
+       BPERF_RUN_ICECC_CREATE_ENV([${icecc_gcc_arg} ${BUILD_CC} ${BUILD_CXX}], \
+           ${icecc_create_env_log_build})
      elif test "x${BUILD_CC##*/}" = "xclang"; then
-       cd ${CONFIGURESUPPORT_OUTPUTDIR}/icecc \
-           && ${ICECC_CREATE_ENV} --clang ${BUILD_CC} ${ICECC_WRAPPER} > ${icecc_create_env_log}
+       BPERF_RUN_ICECC_CREATE_ENV([--clang ${BUILD_CC} ${ICECC_WRAPPER}], ${icecc_create_env_log_build})
      else
        AC_MSG_ERROR([Cannot create icecc compiler package for ${BUILD_CC}])
      fi
-     ICECC_ENV_BUNDLE_BASENAME="`${SED} -n '/^creating/s/creating //p' ${icecc_create_env_log}`"
+     ICECC_ENV_BUNDLE_BASENAME="`${SED} -n '/^creating/s/creating //p' ${icecc_create_env_log_build}`"
      ICECC_ENV_BUNDLE="${CONFIGURESUPPORT_OUTPUTDIR}/icecc/${ICECC_ENV_BUNDLE_BASENAME}"
+     if test ! -f ${ICECC_ENV_BUNDLE}; then
+       AC_MSG_ERROR([icecc-create-env did not produce an environment ${ICECC_ENV_BUNDLE}])
+     fi
+     AC_MSG_CHECKING([for icecc build environment for build compiler])
      AC_MSG_RESULT([${ICECC_ENV_BUNDLE}])
      BUILD_ICECC="ICECC_VERSION=${ICECC_ENV_BUNDLE} ICECC_CC=${BUILD_CC} \
          ICECC_CXX=${BUILD_CXX} ${ICECC_CMD}"
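
The ICECC and BUILD_ICECC values assembled above are later prepended to compiler command lines by the build system; icecream reads the ICECC_VERSION, ICECC_CC and ICECC_CXX environment variables to pick up the compiler package and the local compilers. Conceptually, a distributed compile ends up looking like the sketch below (the bundle path and source file are placeholders for illustration):

    # ICECC_VERSION points at the environment tarball produced by icecc-create-env
    ICECC_VERSION=$BUILD_DIR/configure-support/icecc/bundle.tar.gz \
    ICECC_CC=gcc ICECC_CXX=g++ \
        icecc g++ -c HelloWorld.cpp -o HelloWorld.o
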

View file

@ -128,6 +128,26 @@ AC_DEFUN_ONCE([FLAGS_SETUP_INIT_FLAGS],
  else
    COMPILER_TARGET_BITS_FLAG="-m"
    COMPILER_COMMAND_FILE_FLAG="@"
# The solstudio linker does not support @-files.
if test "x$TOOLCHAIN_TYPE" = xsolstudio; then
COMPILER_COMMAND_FILE_FLAG=
fi
# Check if @file is supported by gcc
if test "x$TOOLCHAIN_TYPE" = xgcc; then
AC_MSG_CHECKING([if @file is supported by gcc])
# Extra emtpy "" to prevent ECHO from interpreting '--version' as argument
$ECHO "" "--version" > command.file
if $CXX @command.file 2>&AS_MESSAGE_LOG_FD >&AS_MESSAGE_LOG_FD; then
AC_MSG_RESULT(yes)
COMPILER_COMMAND_FILE_FLAG="@"
else
AC_MSG_RESULT(no)
COMPILER_COMMAND_FILE_FLAG=
fi
rm -rf command.file
fi
  fi
  AC_SUBST(COMPILER_TARGET_BITS_FLAG)
  AC_SUBST(COMPILER_COMMAND_FILE_FLAG)
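
The @file probe above relies on the gcc driver convention that an argument of the form @file is replaced by the options read from that file; writing "--version" into a scratch file and invoking $CXX @command.file succeeds only when the driver understands the syntax. The same check as a standalone shell snippet (the compiler name is illustrative):

    # put a harmless option into a response file and see whether the
    # compiler accepts it via the @file syntax
    echo "" "--version" > command.file
    if g++ @command.file > /dev/null 2>&1; then
      echo "@file is supported"
    else
      echo "@file is not supported"
    fi
    rm -f command.file
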

View file

@ -3792,6 +3792,15 @@ ac_configure="$SHELL $ac_aux_dir/configure" # Please don't use this var.
################################################################################
#
# Runs icecc-create-env once and prints the error if it fails
#
# $1: arguments to icecc-create-env
# $2: log file
#
################################################################################ ################################################################################
# #
# Optionally enable distributed compilation of native code using icecc/icecream # Optionally enable distributed compilation of native code using icecc/icecream
@ -4308,7 +4317,7 @@ pkgadd_help() {
# #
# Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
# #
# This code is free software; you can redistribute it and/or modify it # This code is free software; you can redistribute it and/or modify it
@ -4801,7 +4810,7 @@ VS_SDK_PLATFORM_NAME_2013=
#CUSTOM_AUTOCONF_INCLUDE #CUSTOM_AUTOCONF_INCLUDE
# Do not change or remove the following line, it is needed for consistency checks: # Do not change or remove the following line, it is needed for consistency checks:
DATE_WHEN_GENERATED=1450277321 DATE_WHEN_GENERATED=1452261921
############################################################################### ###############################################################################
# #
@ -45930,6 +45939,29 @@ $as_echo "$tool_specified" >&6; }
else else
COMPILER_TARGET_BITS_FLAG="-m" COMPILER_TARGET_BITS_FLAG="-m"
COMPILER_COMMAND_FILE_FLAG="@" COMPILER_COMMAND_FILE_FLAG="@"
# The solstudio linker does not support @-files.
if test "x$TOOLCHAIN_TYPE" = xsolstudio; then
COMPILER_COMMAND_FILE_FLAG=
fi
# Check if @file is supported by gcc
if test "x$TOOLCHAIN_TYPE" = xgcc; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if @file is supported by gcc" >&5
$as_echo_n "checking if @file is supported by gcc... " >&6; }
# Extra emtpy "" to prevent ECHO from interpreting '--version' as argument
$ECHO "" "--version" > command.file
if $CXX @command.file 2>&5 >&5; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
COMPILER_COMMAND_FILE_FLAG="@"
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
COMPILER_COMMAND_FILE_FLAG=
fi
rm -rf command.file
fi
fi fi
@ -51742,12 +51774,28 @@ $as_echo "$as_me: WARNING: cups not used, so --with-cups[-*] is ignored" >&2;}
fi fi
if test "x${with_cups}" != x; then if test "x${with_cups}" != x; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for cups headers" >&5
$as_echo_n "checking for cups headers... " >&6; }
if test -s "${with_cups}/include/cups/cups.h"; then
CUPS_CFLAGS="-I${with_cups}/include" CUPS_CFLAGS="-I${with_cups}/include"
CUPS_FOUND=yes CUPS_FOUND=yes
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUPS_FOUND" >&5
$as_echo "$CUPS_FOUND" >&6; }
else
as_fn_error $? "Can't find 'include/cups/cups.h' under ${with_cups} given with the --with-cups option." "$LINENO" 5
fi
fi fi
if test "x${with_cups_include}" != x; then if test "x${with_cups_include}" != x; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for cups headers" >&5
$as_echo_n "checking for cups headers... " >&6; }
if test -s "${with_cups_include}/cups/cups.h"; then
CUPS_CFLAGS="-I${with_cups_include}" CUPS_CFLAGS="-I${with_cups_include}"
CUPS_FOUND=yes CUPS_FOUND=yes
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUPS_FOUND" >&5
$as_echo "$CUPS_FOUND" >&6; }
else
as_fn_error $? "Can't find 'cups/cups.h' under ${with_cups_include} given with the --with-cups-include option." "$LINENO" 5
fi
fi fi
if test "x$CUPS_FOUND" = xno; then if test "x$CUPS_FOUND" = xno; then
# Are the cups headers installed in the default /usr/include location? # Are the cups headers installed in the default /usr/include location?
@ -59543,11 +59591,23 @@ $as_echo "$tool_specified" >&6; }
# be sent to the other hosts in the icecream cluster. # be sent to the other hosts in the icecream cluster.
icecc_create_env_log="${CONFIGURESUPPORT_OUTPUTDIR}/icecc/icecc_create_env.log" icecc_create_env_log="${CONFIGURESUPPORT_OUTPUTDIR}/icecc/icecc_create_env.log"
${MKDIR} -p ${CONFIGURESUPPORT_OUTPUTDIR}/icecc ${MKDIR} -p ${CONFIGURESUPPORT_OUTPUTDIR}/icecc
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for icecc build environment for target compiler" >&5 # Older versions of icecc does not have the --gcc parameter
$as_echo_n "checking for icecc build environment for target compiler... " >&6; } if ${ICECC_CREATE_ENV} | $GREP -q -e --gcc; then
icecc_gcc_arg="--gcc"
fi
if test "x${TOOLCHAIN_TYPE}" = "xgcc"; then if test "x${TOOLCHAIN_TYPE}" = "xgcc"; then
cd ${CONFIGURESUPPORT_OUTPUTDIR}/icecc \ cd ${CONFIGURESUPPORT_OUTPUTDIR}/icecc \
&& ${ICECC_CREATE_ENV} --gcc ${CC} ${CXX} > ${icecc_create_env_log} && ${ICECC_CREATE_ENV} ${icecc_gcc_arg} ${CC} ${CXX} > \
${icecc_create_env_log} 2>&1
if test "$?" != "0"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: icecc-create-env output:" >&5
$as_echo "$as_me: icecc-create-env output:" >&6;}
cat \
${icecc_create_env_log}
as_fn_error $? "Failed to create icecc compiler environment" "$LINENO" 5
fi
elif test "x$TOOLCHAIN_TYPE" = "xclang"; then elif test "x$TOOLCHAIN_TYPE" = "xclang"; then
# For clang, the icecc compilerwrapper is needed. It usually resides next # For clang, the icecc compilerwrapper is needed. It usually resides next
# to icecc-create-env. # to icecc-create-env.
@ -59755,8 +59815,16 @@ $as_echo "$tool_specified" >&6; }
fi fi
cd ${CONFIGURESUPPORT_OUTPUTDIR}/icecc \ cd ${CONFIGURESUPPORT_OUTPUTDIR}/icecc \
&& ${ICECC_CREATE_ENV} --clang ${CC} ${ICECC_WRAPPER} > ${icecc_create_env_log} && ${ICECC_CREATE_ENV} --clang ${CC} ${ICECC_WRAPPER} > ${icecc_create_env_log} 2>&1
if test "$?" != "0"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: icecc-create-env output:" >&5
$as_echo "$as_me: icecc-create-env output:" >&6;}
cat ${icecc_create_env_log}
as_fn_error $? "Failed to create icecc compiler environment" "$LINENO" 5
fi
else else
as_fn_error $? "Can only create icecc compiler packages for toolchain types gcc and clang" "$LINENO" 5 as_fn_error $? "Can only create icecc compiler packages for toolchain types gcc and clang" "$LINENO" 5
fi fi
@ -59765,26 +59833,53 @@ $as_echo "$tool_specified" >&6; }
# to find it. # to find it.
ICECC_ENV_BUNDLE_BASENAME="`${SED} -n '/^creating/s/creating //p' ${icecc_create_env_log}`" ICECC_ENV_BUNDLE_BASENAME="`${SED} -n '/^creating/s/creating //p' ${icecc_create_env_log}`"
ICECC_ENV_BUNDLE="${CONFIGURESUPPORT_OUTPUTDIR}/icecc/${ICECC_ENV_BUNDLE_BASENAME}" ICECC_ENV_BUNDLE="${CONFIGURESUPPORT_OUTPUTDIR}/icecc/${ICECC_ENV_BUNDLE_BASENAME}"
if test ! -f ${ICECC_ENV_BUNDLE}; then
as_fn_error $? "icecc-create-env did not produce an environment ${ICECC_ENV_BUNDLE}" "$LINENO" 5
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for icecc build environment for target compiler" >&5
$as_echo_n "checking for icecc build environment for target compiler... " >&6; }
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${ICECC_ENV_BUNDLE}" >&5 { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${ICECC_ENV_BUNDLE}" >&5
$as_echo "${ICECC_ENV_BUNDLE}" >&6; } $as_echo "${ICECC_ENV_BUNDLE}" >&6; }
ICECC="ICECC_VERSION=${ICECC_ENV_BUNDLE} ICECC_CC=${CC} ICECC_CXX=${CXX} ${ICECC_CMD}" ICECC="ICECC_VERSION=${ICECC_ENV_BUNDLE} ICECC_CC=${CC} ICECC_CXX=${CXX} ${ICECC_CMD}"
if test "x${COMPILE_TYPE}" = "xcross"; then if test "x${COMPILE_TYPE}" = "xcross"; then
# If cross compiling, create a separate env package for the build compiler # If cross compiling, create a separate env package for the build compiler
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for icecc build environment for build compiler" >&5
$as_echo_n "checking for icecc build environment for build compiler... " >&6; }
# Assume "gcc" or "cc" is gcc and "clang" is clang. Otherwise bail. # Assume "gcc" or "cc" is gcc and "clang" is clang. Otherwise bail.
icecc_create_env_log_build="${CONFIGURESUPPORT_OUTPUTDIR}/icecc/icecc_create_env_build.log"
if test "x${BUILD_CC##*/}" = "xgcc" || test "x${BUILD_CC##*/}" = "xcc"; then if test "x${BUILD_CC##*/}" = "xgcc" || test "x${BUILD_CC##*/}" = "xcc"; then
cd ${CONFIGURESUPPORT_OUTPUTDIR}/icecc \ cd ${CONFIGURESUPPORT_OUTPUTDIR}/icecc \
&& ${ICECC_CREATE_ENV} --gcc ${BUILD_CC} ${BUILD_CXX} > ${icecc_create_env_log} && ${ICECC_CREATE_ENV} ${icecc_gcc_arg} ${BUILD_CC} ${BUILD_CXX} > \
${icecc_create_env_log_build} 2>&1
if test "$?" != "0"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: icecc-create-env output:" >&5
$as_echo "$as_me: icecc-create-env output:" >&6;}
cat \
${icecc_create_env_log_build}
as_fn_error $? "Failed to create icecc compiler environment" "$LINENO" 5
fi
elif test "x${BUILD_CC##*/}" = "xclang"; then elif test "x${BUILD_CC##*/}" = "xclang"; then
cd ${CONFIGURESUPPORT_OUTPUTDIR}/icecc \ cd ${CONFIGURESUPPORT_OUTPUTDIR}/icecc \
&& ${ICECC_CREATE_ENV} --clang ${BUILD_CC} ${ICECC_WRAPPER} > ${icecc_create_env_log} && ${ICECC_CREATE_ENV} --clang ${BUILD_CC} ${ICECC_WRAPPER} > ${icecc_create_env_log_build} 2>&1
if test "$?" != "0"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: icecc-create-env output:" >&5
$as_echo "$as_me: icecc-create-env output:" >&6;}
cat ${icecc_create_env_log_build}
as_fn_error $? "Failed to create icecc compiler environment" "$LINENO" 5
fi
else else
as_fn_error $? "Cannot create icecc compiler package for ${BUILD_CC}" "$LINENO" 5 as_fn_error $? "Cannot create icecc compiler package for ${BUILD_CC}" "$LINENO" 5
fi fi
ICECC_ENV_BUNDLE_BASENAME="`${SED} -n '/^creating/s/creating //p' ${icecc_create_env_log}`" ICECC_ENV_BUNDLE_BASENAME="`${SED} -n '/^creating/s/creating //p' ${icecc_create_env_log_build}`"
ICECC_ENV_BUNDLE="${CONFIGURESUPPORT_OUTPUTDIR}/icecc/${ICECC_ENV_BUNDLE_BASENAME}" ICECC_ENV_BUNDLE="${CONFIGURESUPPORT_OUTPUTDIR}/icecc/${ICECC_ENV_BUNDLE_BASENAME}"
if test ! -f ${ICECC_ENV_BUNDLE}; then
as_fn_error $? "icecc-create-env did not produce an environment ${ICECC_ENV_BUNDLE}" "$LINENO" 5
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for icecc build environment for build compiler" >&5
$as_echo_n "checking for icecc build environment for build compiler... " >&6; }
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${ICECC_ENV_BUNDLE}" >&5 { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${ICECC_ENV_BUNDLE}" >&5
$as_echo "${ICECC_ENV_BUNDLE}" >&6; } $as_echo "${ICECC_ENV_BUNDLE}" >&6; }
BUILD_ICECC="ICECC_VERSION=${ICECC_ENV_BUNDLE} ICECC_CC=${BUILD_CC} \ BUILD_ICECC="ICECC_VERSION=${ICECC_ENV_BUNDLE} ICECC_CC=${BUILD_CC} \

View file

@ -1,5 +1,5 @@
#
-# Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
@ -48,12 +48,24 @@ AC_DEFUN_ONCE([LIB_SETUP_CUPS],
  fi
  if test "x${with_cups}" != x; then
AC_MSG_CHECKING([for cups headers])
if test -s "${with_cups}/include/cups/cups.h"; then
      CUPS_CFLAGS="-I${with_cups}/include"
      CUPS_FOUND=yes
AC_MSG_RESULT([$CUPS_FOUND])
else
AC_MSG_ERROR([Can't find 'include/cups/cups.h' under ${with_cups} given with the --with-cups option.])
fi
  fi
  if test "x${with_cups_include}" != x; then
AC_MSG_CHECKING([for cups headers])
if test -s "${with_cups_include}/cups/cups.h"; then
      CUPS_CFLAGS="-I${with_cups_include}"
      CUPS_FOUND=yes
AC_MSG_RESULT([$CUPS_FOUND])
else
AC_MSG_ERROR([Can't find 'cups/cups.h' under ${with_cups_include} given with the --with-cups-include option.])
fi
  fi
  if test "x$CUPS_FOUND" = xno; then
    # Are the cups headers installed in the default /usr/include location?
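
The new checks make configure fail fast when the directory passed to --with-cups or --with-cups-include does not actually contain the CUPS headers, rather than silently falling through to the default locations. The underlying test is only a file probe; roughly, in plain shell (the path is illustrative):

    with_cups=/opt/cups
    if test -s "${with_cups}/include/cups/cups.h"; then
      CUPS_CFLAGS="-I${with_cups}/include"
      CUPS_FOUND=yes
    else
      echo "Can't find 'include/cups/cups.h' under ${with_cups}" >&2
      exit 1
    fi
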

View file

@ -342,3 +342,4 @@ feb1bd85d7990dcf5584ca9e53104269c01db006 jdk-9+96
10a482b863582376d4ca229090334b23b05159fc jdk-9+97
ea285530245cf4e0edf0479121a41347d3030eba jdk-9+98
180212ee1d8710691ba9944593dfc1ff3e4f1532 jdk-9+99
791d0d3ac0138faeb6110bd840a4545bc1950df2 jdk-9+100

View file

@ -502,3 +502,4 @@ a94bb7203596dd632486f1e3655fa5f70541dc08 jdk-9+96
de592ea5f7ba0f8a8c5afc03bd169f7690c72b6f jdk-9+97
e5b1a23be1e105417ba1c4c576ab373eb3fa2c2b jdk-9+98
f008e8cc10d5b3212fb22d58c96fa01d38654f19 jdk-9+99
bdb0acafc63c42e84d9d8195bf2e2b25ee9c3306 jdk-9+100

View file

@ -677,12 +677,6 @@ class JVMCIArchiveParticipant:
            assert service
            self.services.setdefault(service, []).append(provider)
            return True
elif arcname.endswith('_OptionDescriptors.class'):
# Need to create service files for the providers of the
# jdk.vm.ci.options.Options service created by
# jdk.vm.ci.options.processor.OptionProcessor.
provider = arcname[:-len('.class'):].replace('/', '.')
self.services.setdefault('jdk.vm.ci.options.OptionDescriptors', []).append(provider)
        return False
    def __addsrc__(self, arcname, contents):
@ -761,21 +755,6 @@ class JVMCI9JDKConfig(mx.JDKConfig):
        if jacocoArgs:
            args = jacocoArgs + args
# Support for -G: options
def translateGOption(arg):
if arg.startswith('-G:+'):
if '=' in arg:
mx.abort('Mixing + and = in -G: option specification: ' + arg)
arg = '-Djvmci.option.' + arg[len('-G:+'):] + '=true'
elif arg.startswith('-G:-'):
if '=' in arg:
mx.abort('Mixing - and = in -G: option specification: ' + arg)
arg = '-Djvmci.option.' + arg[len('-G:+'):] + '=false'
elif arg.startswith('-G:'):
arg = '-Djvmci.option.' + arg[len('-G:'):]
return arg
args = map(translateGOption, args)
        args = ['-Xbootclasspath/p:' + dep.classpath_repr() for dep in _jvmci_bootclasspath_prepends] + args
        jvmciModeArgs = _jvmciModes[_vm.jvmciMode]
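
The deleted translateGOption helper rewrote Graal-style -G: flags into jvmci.option.* system properties before launching the VM; it goes away together with the jdk.vm.ci.options projects removed elsewhere in this commit. For reference, a minimal shell sketch of the same textual rewrite (the option name is made up for illustration):

    # sketch only: mirrors the mapping performed by the removed Python helper
    translate_g_option() {
      case "$1" in
        -G:+*) printf '%s\n' "-Djvmci.option.${1#-G:+}=true" ;;
        -G:-*) printf '%s\n' "-Djvmci.option.${1#-G:-}=false" ;;
        -G:*)  printf '%s\n' "-Djvmci.option.${1#-G:}" ;;
        *)     printf '%s\n' "$1" ;;
      esac
    }
    translate_g_option -G:+MyOption    # prints -Djvmci.option.MyOption=true
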

View file

@ -109,7 +109,6 @@ suite = {
"jdk.vm.ci.code", "jdk.vm.ci.code",
], ],
"checkstyle" : "jdk.vm.ci.service", "checkstyle" : "jdk.vm.ci.service",
"annotationProcessors" : ["JVMCI_OPTIONS_PROCESSOR"],
"javaCompliance" : "1.8", "javaCompliance" : "1.8",
"workingSets" : "API,JVMCI", "workingSets" : "API,JVMCI",
}, },
@ -135,40 +134,17 @@ suite = {
"workingSets" : "JVMCI", "workingSets" : "JVMCI",
}, },
"jdk.vm.ci.options" : {
"subDir" : "src/jdk.vm.ci/share/classes",
"sourceDirs" : ["src"],
"checkstyle" : "jdk.vm.ci.service",
"dependencies" : ["jdk.vm.ci.inittimer"],
"javaCompliance" : "1.8",
"workingSets" : "JVMCI",
},
"jdk.vm.ci.options.processor" : {
"subDir" : "src/jdk.vm.ci/share/classes",
"sourceDirs" : ["src"],
"dependencies" : [
"jdk.vm.ci.options",
],
"checkstyle" : "jdk.vm.ci.service",
"javaCompliance" : "1.8",
"workingSets" : "JVMCI,Codegen",
},
"jdk.vm.ci.options.test" : {
"subDir" : "test/compiler/jvmci",
"sourceDirs" : ["src"],
"dependencies" : [
"jdk.vm.ci.options",
"mx:JUNIT",
],
"checkstyle" : "jdk.vm.ci.service",
"javaCompliance" : "1.8",
"workingSets" : "JVMCI",
},
# ------------- JVMCI:HotSpot ------------- # ------------- JVMCI:HotSpot -------------
"jdk.vm.ci.aarch64" : {
"subDir" : "src/jdk.vm.ci/share/classes",
"sourceDirs" : ["src"],
"dependencies" : ["jdk.vm.ci.code"],
"checkstyle" : "jdk.vm.ci.service",
"javaCompliance" : "1.8",
"workingSets" : "JVMCI,AArch64",
},
"jdk.vm.ci.amd64" : { "jdk.vm.ci.amd64" : {
"subDir" : "src/jdk.vm.ci/share/classes", "subDir" : "src/jdk.vm.ci/share/classes",
"sourceDirs" : ["src"], "sourceDirs" : ["src"],
@ -191,15 +167,12 @@ suite = {
"subDir" : "src/jdk.vm.ci/share/classes", "subDir" : "src/jdk.vm.ci/share/classes",
"sourceDirs" : ["src"], "sourceDirs" : ["src"],
"dependencies" : [ "dependencies" : [
"jdk.vm.ci.options",
"jdk.vm.ci.hotspotvmconfig", "jdk.vm.ci.hotspotvmconfig",
"jdk.vm.ci.common", "jdk.vm.ci.common",
"jdk.vm.ci.inittimer",
"jdk.vm.ci.runtime", "jdk.vm.ci.runtime",
"jdk.vm.ci.service", "jdk.vm.ci.service",
], ],
"annotationProcessors" : [
"JVMCI_OPTIONS_PROCESSOR",
],
"checkstyle" : "jdk.vm.ci.service", "checkstyle" : "jdk.vm.ci.service",
"javaCompliance" : "1.8", "javaCompliance" : "1.8",
"workingSets" : "JVMCI", "workingSets" : "JVMCI",
@ -213,6 +186,21 @@ suite = {
"workingSets" : "JVMCI,HotSpot", "workingSets" : "JVMCI,HotSpot",
}, },
"jdk.vm.ci.hotspot.aarch64" : {
"subDir" : "src/jdk.vm.ci/share/classes",
"sourceDirs" : ["src"],
"dependencies" : [
"jdk.vm.ci.aarch64",
"jdk.vm.ci.hotspot",
],
"checkstyle" : "jdk.vm.ci.service",
"annotationProcessors" : [
"JVMCI_SERVICE_PROCESSOR",
],
"javaCompliance" : "1.8",
"workingSets" : "JVMCI,HotSpot,AArch64",
},
"jdk.vm.ci.hotspot.amd64" : { "jdk.vm.ci.hotspot.amd64" : {
"subDir" : "src/jdk.vm.ci/share/classes", "subDir" : "src/jdk.vm.ci/share/classes",
"sourceDirs" : ["src"], "sourceDirs" : ["src"],
@ -258,22 +246,17 @@ suite = {
"dependencies" : ["jdk.vm.ci.service"], "dependencies" : ["jdk.vm.ci.service"],
}, },
"JVMCI_OPTIONS" : {
"subDir" : "src/jdk.vm.ci/share/classes",
"dependencies" : ["jdk.vm.ci.options"],
},
"JVMCI_API" : { "JVMCI_API" : {
"subDir" : "src/jdk.vm.ci/share/classes", "subDir" : "src/jdk.vm.ci/share/classes",
"dependencies" : [ "dependencies" : [
"jdk.vm.ci.inittimer", "jdk.vm.ci.inittimer",
"jdk.vm.ci.runtime", "jdk.vm.ci.runtime",
"jdk.vm.ci.common", "jdk.vm.ci.common",
"jdk.vm.ci.aarch64",
"jdk.vm.ci.amd64", "jdk.vm.ci.amd64",
"jdk.vm.ci.sparc", "jdk.vm.ci.sparc",
], ],
"distDependencies" : [ "distDependencies" : [
"JVMCI_OPTIONS",
"JVMCI_SERVICE", "JVMCI_SERVICE",
], ],
}, },
@ -288,6 +271,7 @@ suite = {
"JVMCI_HOTSPOT" : { "JVMCI_HOTSPOT" : {
"subDir" : "src/jdk.vm.ci/share/classes", "subDir" : "src/jdk.vm.ci/share/classes",
"dependencies" : [ "dependencies" : [
"jdk.vm.ci.hotspot.aarch64",
"jdk.vm.ci.hotspot.amd64", "jdk.vm.ci.hotspot.amd64",
"jdk.vm.ci.hotspot.sparc", "jdk.vm.ci.hotspot.sparc",
], ],
@ -301,7 +285,6 @@ suite = {
"JVMCI_TEST" : { "JVMCI_TEST" : {
"subDir" : "test/compiler/jvmci", "subDir" : "test/compiler/jvmci",
"dependencies" : [ "dependencies" : [
"jdk.vm.ci.options.test",
"jdk.vm.ci.runtime.test", "jdk.vm.ci.runtime.test",
], ],
"distDependencies" : [ "distDependencies" : [
@ -310,13 +293,6 @@ suite = {
"exclude" : ["mx:JUNIT"], "exclude" : ["mx:JUNIT"],
}, },
"JVMCI_OPTIONS_PROCESSOR" : {
"subDir" : "src/jdk.vm.ci/share/classes",
"dependencies" : ["jdk.vm.ci.options.processor"],
"distDependencies" : [
"JVMCI_OPTIONS",
],
},
"JVMCI_SERVICE_PROCESSOR" : { "JVMCI_SERVICE_PROCESSOR" : {
"subDir" : "src/jdk.vm.ci/share/classes", "subDir" : "src/jdk.vm.ci/share/classes",
@ -332,25 +308,23 @@ suite = {
"subDir" : "src/jdk.vm.ci/share/classes", "subDir" : "src/jdk.vm.ci/share/classes",
"overlaps" : [ "overlaps" : [
"JVMCI_API", "JVMCI_API",
"JVMCI_OPTIONS",
"JVMCI_SERVICE", "JVMCI_SERVICE",
"JVMCI_HOTSPOT", "JVMCI_HOTSPOT",
"JVMCI_HOTSPOTVMCONFIG", "JVMCI_HOTSPOTVMCONFIG",
"JVMCI_SERVICE_PROCESSOR", "JVMCI_SERVICE_PROCESSOR",
"JVMCI_OPTIONS_PROCESSOR"
], ],
"dependencies" : [ "dependencies" : [
"jdk.vm.ci.options",
"jdk.vm.ci.service", "jdk.vm.ci.service",
"jdk.vm.ci.inittimer", "jdk.vm.ci.inittimer",
"jdk.vm.ci.runtime", "jdk.vm.ci.runtime",
"jdk.vm.ci.common", "jdk.vm.ci.common",
"jdk.vm.ci.aarch64",
"jdk.vm.ci.amd64", "jdk.vm.ci.amd64",
"jdk.vm.ci.sparc", "jdk.vm.ci.sparc",
"jdk.vm.ci.hotspotvmconfig", "jdk.vm.ci.hotspotvmconfig",
"jdk.vm.ci.hotspot.aarch64",
"jdk.vm.ci.hotspot.amd64", "jdk.vm.ci.hotspot.amd64",
"jdk.vm.ci.hotspot.sparc", "jdk.vm.ci.hotspot.sparc",
"jdk.vm.ci.options.processor",
"jdk.vm.ci.service.processor" "jdk.vm.ci.service.processor"
], ],
}, },

View file

@ -1,6 +1,6 @@
#
# Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
-# Copyright 2012, 2013 SAP AG. All rights reserved.
+# Copyright 2012, 2015 SAP AG. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
@ -61,10 +61,6 @@ ifndef CC_INTERP
    FORCE_TIERED=1
  endif
endif
-# C1 is not ported on ppc64(le), so we cannot build a tiered VM:
-ifneq (,$(filter $(ARCH),ppc64 pp64le))
-  FORCE_TIERED=0
-endif
ifdef LP64
ifeq ("$(filter $(LP64_ARCH),$(BUILDARCH))","")

View file

@ -68,5 +68,5 @@ MAPFILE = $(GAMMADIR)/make/aix/makefiles/mapfile-vers-debug
LFLAGS_QIPA= LFLAGS_QIPA=
VERSION = optimized
-SYSDEFS += -DASSERT -DFASTDEBUG
+SYSDEFS += -DASSERT
PICFLAGS = DEFAULT

View file

@ -0,0 +1,32 @@
#
# Copyright (c) 2006, 2015, Oracle and/or its affiliates. All rights reserved.
# Copyright 2012, 2015 SAP AG. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#
#
# Sets make macros for making tiered version of VM
TYPE=TIERED
VM_SUBDIR = server
CFLAGS += -DCOMPILER2 -DCOMPILER1

View file

@ -107,8 +107,8 @@ ifeq ($(INCLUDE_NMT), false)
    memTracker.cpp nmtDCmd.cpp mallocSiteTable.cpp
endif
-ifneq (,$(findstring $(Platform_arch_model), x86_64, sparc))
-  # JVMCI is supported only on x86_64 and SPARC.
+ifneq (,$(findstring $(Platform_arch_model), aarch64, arm_64, sparc, x86_64))
+  # JVMCI is supported
else
  INCLUDE_JVMCI := false
endif

View file

@ -36,15 +36,6 @@ SRC_DIR := $(HOTSPOT_TOPDIR)/src/jdk.vm.ci/share/classes
################################################################################ ################################################################################
# Compile the annotation processor # Compile the annotation processor
$(eval $(call SetupJavaCompilation, BUILD_JVMCI_OPTIONS, \
SETUP := GENERATE_OLDBYTECODE, \
SRC := $(SRC_DIR)/jdk.vm.ci.options/src \
$(SRC_DIR)/jdk.vm.ci.options.processor/src \
$(SRC_DIR)/jdk.vm.ci.inittimer/src, \
BIN := $(BUILDTOOLS_OUTPUTDIR)/jvmci_options, \
JAR := $(BUILDTOOLS_OUTPUTDIR)/jdk.vm.ci.options.jar, \
))
$(eval $(call SetupJavaCompilation, BUILD_JVMCI_SERVICE, \ $(eval $(call SetupJavaCompilation, BUILD_JVMCI_SERVICE, \
SETUP := GENERATE_OLDBYTECODE, \ SETUP := GENERATE_OLDBYTECODE, \
SRC := $(SRC_DIR)/jdk.vm.ci.service/src \ SRC := $(SRC_DIR)/jdk.vm.ci.service/src \
@ -57,6 +48,7 @@ $(eval $(call SetupJavaCompilation, BUILD_JVMCI_SERVICE, \
PROC_SRC_SUBDIRS := \ PROC_SRC_SUBDIRS := \
jdk.vm.ci.hotspot \ jdk.vm.ci.hotspot \
jdk.vm.ci.hotspot.aarch64 \
jdk.vm.ci.hotspot.amd64 \ jdk.vm.ci.hotspot.amd64 \
jdk.vm.ci.hotspot.sparc \ jdk.vm.ci.hotspot.sparc \
jdk.vm.ci.runtime \ jdk.vm.ci.runtime \
@ -69,15 +61,15 @@ PROC_SRCS := $(filter %.java, $(call CacheFind, $(PROC_SRC_DIRS)))
ALL_SRC_DIRS := $(wildcard $(SRC_DIR)/*/src) ALL_SRC_DIRS := $(wildcard $(SRC_DIR)/*/src)
SOURCEPATH := $(call PathList, $(ALL_SRC_DIRS)) SOURCEPATH := $(call PathList, $(ALL_SRC_DIRS))
PROCESSOR_PATH := $(call PathList, \ PROCESSOR_PATH := $(call PathList, \
$(BUILDTOOLS_OUTPUTDIR)/jdk.vm.ci.options.jar \
$(BUILDTOOLS_OUTPUTDIR)/jdk.vm.ci.service.jar) $(BUILDTOOLS_OUTPUTDIR)/jdk.vm.ci.service.jar)
$(GENSRC_DIR)/_gensrc_proc_done: $(PROC_SRCS) \ $(GENSRC_DIR)/_gensrc_proc_done: $(PROC_SRCS) \
$(BUILD_JVMCI_OPTIONS) $(BUILD_JVMCI_SERVICE) $(BUILD_JVMCI_SERVICE)
$(MKDIR) -p $(@D) $(MKDIR) -p $(@D)
$(eval $(call ListPathsSafely,PROC_SRCS,$(@D)/_gensrc_proc_files)) $(eval $(call ListPathsSafely,PROC_SRCS,$(@D)/_gensrc_proc_files))
$(JAVA_SMALL) $(NEW_JAVAC) \ $(JAVA_SMALL) $(NEW_JAVAC) \
-XDignore.symbol.file \ -XDignore.symbol.file \
-bootclasspath $(JDK_OUTPUTDIR)/modules/java.base \
-sourcepath $(SOURCEPATH) \ -sourcepath $(SOURCEPATH) \
-implicit:none \ -implicit:none \
-proc:only \ -proc:only \
@ -91,15 +83,6 @@ TARGETS += $(GENSRC_DIR)/_gensrc_proc_done
################################################################################ ################################################################################
$(GENSRC_DIR)/META-INF/services/jdk.vm.ci.options.OptionDescriptors: \
$(GENSRC_DIR)/_gensrc_proc_done
$(MKDIR) -p $(@D)
$(FIND) $(GENSRC_DIR) -name '*_OptionDescriptors.java' | $(SED) 's:.*/jdk\.vm\.ci/\(.*\)\.java:\1:' | $(TR) '/' '.' > $@
TARGETS += $(GENSRC_DIR)/META-INF/services/jdk.vm.ci.options.OptionDescriptors
################################################################################
$(GENSRC_DIR)/_providers_converted: $(GENSRC_DIR)/_gensrc_proc_done $(GENSRC_DIR)/_providers_converted: $(GENSRC_DIR)/_gensrc_proc_done
$(MKDIR) -p $(GENSRC_DIR)/META-INF/services $(MKDIR) -p $(GENSRC_DIR)/META-INF/services
($(CD) $(GENSRC_DIR)/META-INF/jvmci.providers && \ ($(CD) $(GENSRC_DIR)/META-INF/jvmci.providers && \

View file

@ -57,14 +57,6 @@ ifndef CC_INTERP
    FORCE_TIERED=1
  endif
endif
-# C1 is not ported on ppc64, so we cannot build a tiered VM:
-# Notice: after 8046471 ARCH will be 'ppc' for top-level ppc64 builds but
-# 'ppc64' for HotSpot-only ppc64 builds. Need to detect both variants here!
-ifneq (,$(findstring $(ARCH), ppc ppc64))
-  ifeq ($(ARCH_DATA_MODEL), 64)
-    FORCE_TIERED=0
-  endif
-endif
ifdef LP64
ifeq ("$(filter $(LP64_ARCH),$(BUILDARCH))","")

View file

@ -46,6 +46,8 @@ BUILD_HOTSPOT_JTREG_NATIVE_SRC := \
    $(HOTSPOT_TOPDIR)/test/runtime/jni/8033445 \
    $(HOTSPOT_TOPDIR)/test/runtime/jni/ToStringInInterfaceTest \
    $(HOTSPOT_TOPDIR)/test/runtime/SameObject \
    $(HOTSPOT_TOPDIR)/test/compiler/floatingpoint/ \
    $(HOTSPOT_TOPDIR)/test/compiler/calls \
    #
# Add conditional directories here when needed.

View file

@ -3484,10 +3484,14 @@ int Matcher::regnum_to_fpu_offset(int regnum)
  return 0;
}
-bool Matcher::is_short_branch_offset(int rule, int br_size, int offset)
-{
-  Unimplemented();
-  return false;
-}
+// Is this branch offset short enough that a short branch can be used?
+//
+// NOTE: If the platform does not provide any short branch variants, then
+// this method should return false for offset 0.
+bool Matcher::is_short_branch_offset(int rule, int br_size, int offset) {
+  // The passed offset is relative to address of the branch.
+  return (-32768 <= offset && offset < 32768);
+}
const bool Matcher::isSimpleConstant64(jlong value) {
@ -4667,17 +4671,12 @@ encode %{
if (!_method) { if (!_method) {
// A call to a runtime wrapper, e.g. new, new_typeArray_Java, uncommon_trap. // A call to a runtime wrapper, e.g. new, new_typeArray_Java, uncommon_trap.
call = __ trampoline_call(Address(addr, relocInfo::runtime_call_type), &cbuf); call = __ trampoline_call(Address(addr, relocInfo::runtime_call_type), &cbuf);
} else if (_optimized_virtual) {
call = __ trampoline_call(Address(addr, relocInfo::opt_virtual_call_type), &cbuf);
} else { } else {
call = __ trampoline_call(Address(addr, relocInfo::static_call_type), &cbuf); int method_index = resolved_method_index(cbuf);
} RelocationHolder rspec = _optimized_virtual ? opt_virtual_call_Relocation::spec(method_index)
if (call == NULL) { : static_call_Relocation::spec(method_index);
ciEnv::current()->record_failure("CodeCache is full"); call = __ trampoline_call(Address(addr, rspec), &cbuf);
return;
}
if (_method) {
// Emit stub for static call // Emit stub for static call
address stub = CompiledStaticCall::emit_to_interp_stub(cbuf); address stub = CompiledStaticCall::emit_to_interp_stub(cbuf);
if (stub == NULL) { if (stub == NULL) {
@ -4685,11 +4684,16 @@ encode %{
return; return;
} }
} }
if (call == NULL) {
ciEnv::current()->record_failure("CodeCache is full");
return;
}
%} %}
enc_class aarch64_enc_java_dynamic_call(method meth) %{ enc_class aarch64_enc_java_dynamic_call(method meth) %{
MacroAssembler _masm(&cbuf); MacroAssembler _masm(&cbuf);
address call = __ ic_call((address)$meth$$method); int method_index = resolved_method_index(cbuf);
address call = __ ic_call((address)$meth$$method, method_index);
if (call == NULL) { if (call == NULL) {
ciEnv::current()->record_failure("CodeCache is full"); ciEnv::current()->record_failure("CodeCache is full");
return; return;
@ -13845,7 +13849,8 @@ instruct cmpP_narrowOop_imm0_branch(cmpOp cmp, iRegN oop, immP0 zero, label labl
// Test bit and Branch // Test bit and Branch
instruct cmpL_branch_sign(cmpOp cmp, iRegL op1, immL0 op2, label labl, rFlagsReg cr) %{ // Patterns for short (< 32KiB) variants
instruct cmpL_branch_sign(cmpOp cmp, iRegL op1, immL0 op2, label labl) %{
match(If cmp (CmpL op1 op2)); match(If cmp (CmpL op1 op2));
predicate(n->in(1)->as_Bool()->_test._test == BoolTest::lt predicate(n->in(1)->as_Bool()->_test._test == BoolTest::lt
|| n->in(1)->as_Bool()->_test._test == BoolTest::ge); || n->in(1)->as_Bool()->_test._test == BoolTest::ge);
@ -13855,16 +13860,15 @@ instruct cmpL_branch_sign(cmpOp cmp, iRegL op1, immL0 op2, label labl, rFlagsReg
format %{ "cb$cmp $op1, $labl # long" %} format %{ "cb$cmp $op1, $labl # long" %}
ins_encode %{ ins_encode %{
Label* L = $labl$$label; Label* L = $labl$$label;
Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode; Assembler::Condition cond =
if (cond == Assembler::LT) ((Assembler::Condition)$cmp$$cmpcode == Assembler::LT) ? Assembler::NE : Assembler::EQ;
__ tbnz($op1$$Register, 63, *L); __ tbr(cond, $op1$$Register, 63, *L);
else
__ tbz($op1$$Register, 63, *L);
%} %}
ins_pipe(pipe_cmp_branch); ins_pipe(pipe_cmp_branch);
ins_short_branch(1);
%} %}
instruct cmpI_branch_sign(cmpOp cmp, iRegIorL2I op1, immI0 op2, label labl, rFlagsReg cr) %{ instruct cmpI_branch_sign(cmpOp cmp, iRegIorL2I op1, immI0 op2, label labl) %{
match(If cmp (CmpI op1 op2)); match(If cmp (CmpI op1 op2));
predicate(n->in(1)->as_Bool()->_test._test == BoolTest::lt predicate(n->in(1)->as_Bool()->_test._test == BoolTest::lt
|| n->in(1)->as_Bool()->_test._test == BoolTest::ge); || n->in(1)->as_Bool()->_test._test == BoolTest::ge);
@ -13874,16 +13878,15 @@ instruct cmpI_branch_sign(cmpOp cmp, iRegIorL2I op1, immI0 op2, label labl, rFla
format %{ "cb$cmp $op1, $labl # int" %} format %{ "cb$cmp $op1, $labl # int" %}
ins_encode %{ ins_encode %{
Label* L = $labl$$label; Label* L = $labl$$label;
Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode; Assembler::Condition cond =
if (cond == Assembler::LT) ((Assembler::Condition)$cmp$$cmpcode == Assembler::LT) ? Assembler::NE : Assembler::EQ;
__ tbnz($op1$$Register, 31, *L); __ tbr(cond, $op1$$Register, 31, *L);
else
__ tbz($op1$$Register, 31, *L);
%} %}
ins_pipe(pipe_cmp_branch); ins_pipe(pipe_cmp_branch);
ins_short_branch(1);
%} %}
instruct cmpL_branch_bit(cmpOp cmp, iRegL op1, immL op2, immL0 op3, label labl, rFlagsReg cr) %{ instruct cmpL_branch_bit(cmpOp cmp, iRegL op1, immL op2, immL0 op3, label labl) %{
match(If cmp (CmpL (AndL op1 op2) op3)); match(If cmp (CmpL (AndL op1 op2) op3));
predicate((n->in(1)->as_Bool()->_test._test == BoolTest::ne predicate((n->in(1)->as_Bool()->_test._test == BoolTest::ne
|| n->in(1)->as_Bool()->_test._test == BoolTest::eq) || n->in(1)->as_Bool()->_test._test == BoolTest::eq)
@ -13896,15 +13899,13 @@ instruct cmpL_branch_bit(cmpOp cmp, iRegL op1, immL op2, immL0 op3, label labl,
Label* L = $labl$$label; Label* L = $labl$$label;
Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode; Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
int bit = exact_log2($op2$$constant); int bit = exact_log2($op2$$constant);
if (cond == Assembler::EQ) __ tbr(cond, $op1$$Register, bit, *L);
__ tbz($op1$$Register, bit, *L);
else
__ tbnz($op1$$Register, bit, *L);
%} %}
ins_pipe(pipe_cmp_branch); ins_pipe(pipe_cmp_branch);
ins_short_branch(1);
%} %}
instruct cmpI_branch_bit(cmpOp cmp, iRegIorL2I op1, immI op2, immI0 op3, label labl, rFlagsReg cr) %{ instruct cmpI_branch_bit(cmpOp cmp, iRegIorL2I op1, immI op2, immI0 op3, label labl) %{
match(If cmp (CmpI (AndI op1 op2) op3)); match(If cmp (CmpI (AndI op1 op2) op3));
predicate((n->in(1)->as_Bool()->_test._test == BoolTest::ne predicate((n->in(1)->as_Bool()->_test._test == BoolTest::ne
|| n->in(1)->as_Bool()->_test._test == BoolTest::eq) || n->in(1)->as_Bool()->_test._test == BoolTest::eq)
@ -13917,10 +13918,79 @@ instruct cmpI_branch_bit(cmpOp cmp, iRegIorL2I op1, immI op2, immI0 op3, label l
Label* L = $labl$$label; Label* L = $labl$$label;
Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode; Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
int bit = exact_log2($op2$$constant); int bit = exact_log2($op2$$constant);
if (cond == Assembler::EQ) __ tbr(cond, $op1$$Register, bit, *L);
__ tbz($op1$$Register, bit, *L); %}
else ins_pipe(pipe_cmp_branch);
__ tbnz($op1$$Register, bit, *L); ins_short_branch(1);
%}
// And far variants
instruct far_cmpL_branch_sign(cmpOp cmp, iRegL op1, immL0 op2, label labl) %{
match(If cmp (CmpL op1 op2));
predicate(n->in(1)->as_Bool()->_test._test == BoolTest::lt
|| n->in(1)->as_Bool()->_test._test == BoolTest::ge);
effect(USE labl);
ins_cost(BRANCH_COST);
format %{ "cb$cmp $op1, $labl # long" %}
ins_encode %{
Label* L = $labl$$label;
Assembler::Condition cond =
((Assembler::Condition)$cmp$$cmpcode == Assembler::LT) ? Assembler::NE : Assembler::EQ;
__ tbr(cond, $op1$$Register, 63, *L, /*far*/true);
%}
ins_pipe(pipe_cmp_branch);
%}
instruct far_cmpI_branch_sign(cmpOp cmp, iRegIorL2I op1, immI0 op2, label labl) %{
match(If cmp (CmpI op1 op2));
predicate(n->in(1)->as_Bool()->_test._test == BoolTest::lt
|| n->in(1)->as_Bool()->_test._test == BoolTest::ge);
effect(USE labl);
ins_cost(BRANCH_COST);
format %{ "cb$cmp $op1, $labl # int" %}
ins_encode %{
Label* L = $labl$$label;
Assembler::Condition cond =
((Assembler::Condition)$cmp$$cmpcode == Assembler::LT) ? Assembler::NE : Assembler::EQ;
__ tbr(cond, $op1$$Register, 31, *L, /*far*/true);
%}
ins_pipe(pipe_cmp_branch);
%}
instruct far_cmpL_branch_bit(cmpOp cmp, iRegL op1, immL op2, immL0 op3, label labl) %{
match(If cmp (CmpL (AndL op1 op2) op3));
predicate((n->in(1)->as_Bool()->_test._test == BoolTest::ne
|| n->in(1)->as_Bool()->_test._test == BoolTest::eq)
&& is_power_of_2(n->in(2)->in(1)->in(2)->get_long()));
effect(USE labl);
ins_cost(BRANCH_COST);
format %{ "tb$cmp $op1, $op2, $labl" %}
ins_encode %{
Label* L = $labl$$label;
Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
int bit = exact_log2($op2$$constant);
__ tbr(cond, $op1$$Register, bit, *L, /*far*/true);
%}
ins_pipe(pipe_cmp_branch);
%}
instruct far_cmpI_branch_bit(cmpOp cmp, iRegIorL2I op1, immI op2, immI0 op3, label labl) %{
match(If cmp (CmpI (AndI op1 op2) op3));
predicate((n->in(1)->as_Bool()->_test._test == BoolTest::ne
|| n->in(1)->as_Bool()->_test._test == BoolTest::eq)
&& is_power_of_2(n->in(2)->in(1)->in(2)->get_int()));
effect(USE labl);
ins_cost(BRANCH_COST);
format %{ "tb$cmp $op1, $op2, $labl" %}
ins_encode %{
Label* L = $labl$$label;
Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
int bit = exact_log2($op2$$constant);
__ tbr(cond, $op1$$Register, bit, *L, /*far*/true);
%} %}
ins_pipe(pipe_cmp_branch); ins_pipe(pipe_cmp_branch);
%} %}
@ -15318,6 +15388,124 @@ instruct vmul2D(vecX dst, vecX src1, vecX src2)
ins_pipe(pipe_class_default); ins_pipe(pipe_class_default);
%} %}
// --------------------------------- MLA --------------------------------------
instruct vmla4S(vecD dst, vecD src1, vecD src2)
%{
predicate(n->as_Vector()->length() == 2 ||
n->as_Vector()->length() == 4);
match(Set dst (AddVS dst (MulVS src1 src2)));
ins_cost(INSN_COST);
format %{ "mlav $dst,$src1,$src2\t# vector (4H)" %}
ins_encode %{
__ mlav(as_FloatRegister($dst$$reg), __ T4H,
as_FloatRegister($src1$$reg),
as_FloatRegister($src2$$reg));
%}
ins_pipe(pipe_class_default);
%}
instruct vmla8S(vecX dst, vecX src1, vecX src2)
%{
predicate(n->as_Vector()->length() == 8);
match(Set dst (AddVS dst (MulVS src1 src2)));
ins_cost(INSN_COST);
format %{ "mlav $dst,$src1,$src2\t# vector (8H)" %}
ins_encode %{
__ mlav(as_FloatRegister($dst$$reg), __ T8H,
as_FloatRegister($src1$$reg),
as_FloatRegister($src2$$reg));
%}
ins_pipe(pipe_class_default);
%}
instruct vmla2I(vecD dst, vecD src1, vecD src2)
%{
predicate(n->as_Vector()->length() == 2);
match(Set dst (AddVI dst (MulVI src1 src2)));
ins_cost(INSN_COST);
format %{ "mlav $dst,$src1,$src2\t# vector (2S)" %}
ins_encode %{
__ mlav(as_FloatRegister($dst$$reg), __ T2S,
as_FloatRegister($src1$$reg),
as_FloatRegister($src2$$reg));
%}
ins_pipe(pipe_class_default);
%}
instruct vmla4I(vecX dst, vecX src1, vecX src2)
%{
predicate(n->as_Vector()->length() == 4);
match(Set dst (AddVI dst (MulVI src1 src2)));
ins_cost(INSN_COST);
format %{ "mlav $dst,$src1,$src2\t# vector (4S)" %}
ins_encode %{
__ mlav(as_FloatRegister($dst$$reg), __ T4S,
as_FloatRegister($src1$$reg),
as_FloatRegister($src2$$reg));
%}
ins_pipe(pipe_class_default);
%}
// --------------------------------- MLS --------------------------------------
instruct vmls4S(vecD dst, vecD src1, vecD src2)
%{
predicate(n->as_Vector()->length() == 2 ||
n->as_Vector()->length() == 4);
match(Set dst (SubVS dst (MulVS src1 src2)));
ins_cost(INSN_COST);
format %{ "mlsv $dst,$src1,$src2\t# vector (4H)" %}
ins_encode %{
__ mlsv(as_FloatRegister($dst$$reg), __ T4H,
as_FloatRegister($src1$$reg),
as_FloatRegister($src2$$reg));
%}
ins_pipe(pipe_class_default);
%}
instruct vmls8S(vecX dst, vecX src1, vecX src2)
%{
predicate(n->as_Vector()->length() == 8);
match(Set dst (SubVS dst (MulVS src1 src2)));
ins_cost(INSN_COST);
format %{ "mlsv $dst,$src1,$src2\t# vector (8H)" %}
ins_encode %{
__ mlsv(as_FloatRegister($dst$$reg), __ T8H,
as_FloatRegister($src1$$reg),
as_FloatRegister($src2$$reg));
%}
ins_pipe(pipe_class_default);
%}
instruct vmls2I(vecD dst, vecD src1, vecD src2)
%{
predicate(n->as_Vector()->length() == 2);
match(Set dst (SubVI dst (MulVI src1 src2)));
ins_cost(INSN_COST);
format %{ "mlsv $dst,$src1,$src2\t# vector (2S)" %}
ins_encode %{
__ mlsv(as_FloatRegister($dst$$reg), __ T2S,
as_FloatRegister($src1$$reg),
as_FloatRegister($src2$$reg));
%}
ins_pipe(pipe_class_default);
%}
instruct vmls4I(vecX dst, vecX src1, vecX src2)
%{
predicate(n->as_Vector()->length() == 4);
match(Set dst (SubVI dst (MulVI src1 src2)));
ins_cost(INSN_COST);
format %{ "mlsv $dst,$src1,$src2\t# vector (4S)" %}
ins_encode %{
__ mlsv(as_FloatRegister($dst$$reg), __ T4S,
as_FloatRegister($src1$$reg),
as_FloatRegister($src2$$reg));
%}
ins_pipe(pipe_class_default);
%}
// --------------------------------- DIV -------------------------------------- // --------------------------------- DIV --------------------------------------
instruct vdiv2F(vecD dst, vecD src1, vecD src2) instruct vdiv2F(vecD dst, vecD src1, vecD src2)

View file

@ -139,11 +139,6 @@ REGISTER_DECLARATION(Register, rdispatch, r21);
// Java stack pointer // Java stack pointer
REGISTER_DECLARATION(Register, esp, r20); REGISTER_DECLARATION(Register, esp, r20);
// TODO : x86 uses rbp to save SP in method handle code
// we may need to do the same with fp
// JSR 292 fixed register usages:
//REGISTER_DECLARATION(Register, r_mh_SP_save, r29);
#define assert_cond(ARG1) assert(ARG1, #ARG1) #define assert_cond(ARG1) assert(ARG1, #ARG1)
namespace asm_util { namespace asm_util {
@ -551,6 +546,7 @@ class Address VALUE_OBJ_CLASS_SPEC {
size = 0; break; size = 0; break;
default: default:
ShouldNotReachHere(); ShouldNotReachHere();
size = 0; // unreachable
} }
} else { } else {
size = i->get(31, 31); size = i->get(31, 31);
@ -2041,6 +2037,8 @@ public:
INSN(addv, 0, 0b100001); INSN(addv, 0, 0b100001);
INSN(subv, 1, 0b100001); INSN(subv, 1, 0b100001);
INSN(mulv, 0, 0b100111); INSN(mulv, 0, 0b100111);
INSN(mlav, 0, 0b100101);
INSN(mlsv, 1, 0b100101);
INSN(sshl, 0, 0b010001); INSN(sshl, 0, 0b010001);
INSN(ushl, 1, 0b010001); INSN(ushl, 1, 0b010001);

View file

@ -173,6 +173,7 @@ static jlong as_long(LIR_Opr data) {
break; break;
default: default:
ShouldNotReachHere(); ShouldNotReachHere();
result = 0; // unreachable
} }
return result; return result;
} }
@ -720,6 +721,7 @@ void LIR_Assembler::const2mem(LIR_Opr src, LIR_Opr dest, BasicType type, CodeEmi
break; break;
default: default:
ShouldNotReachHere(); ShouldNotReachHere();
insn = &Assembler::str; // unreachable
} }
if (info) add_debug_info_for_null_check_here(info); if (info) add_debug_info_for_null_check_here(info);
@ -1110,6 +1112,7 @@ void LIR_Assembler::emit_opBranch(LIR_OpBranch* op) {
case lir_cond_greaterEqual: acond = (is_unordered ? Assembler::HS : Assembler::GE); break; case lir_cond_greaterEqual: acond = (is_unordered ? Assembler::HS : Assembler::GE); break;
case lir_cond_greater: acond = (is_unordered ? Assembler::HI : Assembler::GT); break; case lir_cond_greater: acond = (is_unordered ? Assembler::HI : Assembler::GT); break;
default: ShouldNotReachHere(); default: ShouldNotReachHere();
acond = Assembler::EQ; // unreachable
} }
} else { } else {
switch (op->cond()) { switch (op->cond()) {
@ -1122,6 +1125,7 @@ void LIR_Assembler::emit_opBranch(LIR_OpBranch* op) {
case lir_cond_belowEqual: acond = Assembler::LS; break; case lir_cond_belowEqual: acond = Assembler::LS; break;
case lir_cond_aboveEqual: acond = Assembler::HS; break; case lir_cond_aboveEqual: acond = Assembler::HS; break;
default: ShouldNotReachHere(); default: ShouldNotReachHere();
acond = Assembler::EQ; // unreachable
} }
} }
__ br(acond,*(op->label())); __ br(acond,*(op->label()));
@ -1313,7 +1317,9 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L
ciMethodData* md; ciMethodData* md;
ciProfileData* data; ciProfileData* data;
if (op->should_profile()) { const bool should_profile = op->should_profile();
if (should_profile) {
ciMethod* method = op->profiled_method(); ciMethod* method = op->profiled_method();
assert(method != NULL, "Should have method"); assert(method != NULL, "Should have method");
int bci = op->profiled_bci(); int bci = op->profiled_bci();
@ -1324,8 +1330,8 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L
assert(data->is_ReceiverTypeData(), "need ReceiverTypeData for type check"); assert(data->is_ReceiverTypeData(), "need ReceiverTypeData for type check");
} }
Label profile_cast_success, profile_cast_failure; Label profile_cast_success, profile_cast_failure;
Label *success_target = op->should_profile() ? &profile_cast_success : success; Label *success_target = should_profile ? &profile_cast_success : success;
Label *failure_target = op->should_profile() ? &profile_cast_failure : failure; Label *failure_target = should_profile ? &profile_cast_failure : failure;
if (obj == k_RInfo) { if (obj == k_RInfo) {
k_RInfo = dst; k_RInfo = dst;
@ -1341,7 +1347,7 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L
assert_different_registers(obj, k_RInfo, klass_RInfo); assert_different_registers(obj, k_RInfo, klass_RInfo);
if (op->should_profile()) { if (should_profile) {
Label not_null; Label not_null;
__ cbnz(obj, not_null); __ cbnz(obj, not_null);
// Object is null; update MDO and exit // Object is null; update MDO and exit
@ -1413,7 +1419,7 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L
// successful cast, fall through to profile or jump // successful cast, fall through to profile or jump
} }
} }
if (op->should_profile()) { if (should_profile) {
Register mdo = klass_RInfo, recv = k_RInfo; Register mdo = klass_RInfo, recv = k_RInfo;
__ bind(profile_cast_success); __ bind(profile_cast_success);
__ mov_metadata(mdo, md->constant_encoding()); __ mov_metadata(mdo, md->constant_encoding());
@ -1438,6 +1444,8 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L
void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) { void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) {
const bool should_profile = op->should_profile();
LIR_Code code = op->code(); LIR_Code code = op->code();
if (code == lir_store_check) { if (code == lir_store_check) {
Register value = op->object()->as_register(); Register value = op->object()->as_register();
@ -1452,7 +1460,7 @@ void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) {
ciMethodData* md; ciMethodData* md;
ciProfileData* data; ciProfileData* data;
if (op->should_profile()) { if (should_profile) {
ciMethod* method = op->profiled_method(); ciMethod* method = op->profiled_method();
assert(method != NULL, "Should have method"); assert(method != NULL, "Should have method");
int bci = op->profiled_bci(); int bci = op->profiled_bci();
@ -1463,10 +1471,10 @@ void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) {
assert(data->is_ReceiverTypeData(), "need ReceiverTypeData for type check"); assert(data->is_ReceiverTypeData(), "need ReceiverTypeData for type check");
} }
Label profile_cast_success, profile_cast_failure, done; Label profile_cast_success, profile_cast_failure, done;
Label *success_target = op->should_profile() ? &profile_cast_success : &done; Label *success_target = should_profile ? &profile_cast_success : &done;
Label *failure_target = op->should_profile() ? &profile_cast_failure : stub->entry(); Label *failure_target = should_profile ? &profile_cast_failure : stub->entry();
if (op->should_profile()) { if (should_profile) {
Label not_null; Label not_null;
__ cbnz(value, not_null); __ cbnz(value, not_null);
// Object is null; update MDO and exit // Object is null; update MDO and exit
@ -1502,7 +1510,7 @@ void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) {
__ cbzw(k_RInfo, *failure_target); __ cbzw(k_RInfo, *failure_target);
// fall through to the success case // fall through to the success case
if (op->should_profile()) { if (should_profile) {
Register mdo = klass_RInfo, recv = k_RInfo; Register mdo = klass_RInfo, recv = k_RInfo;
__ bind(profile_cast_success); __ bind(profile_cast_success);
__ mov_metadata(mdo, md->constant_encoding()); __ mov_metadata(mdo, md->constant_encoding());
@ -1621,9 +1629,10 @@ void LIR_Assembler::cmove(LIR_Condition condition, LIR_Opr opr1, LIR_Opr opr2, L
case lir_cond_lessEqual: acond = Assembler::LE; ncond = Assembler::GT; break; case lir_cond_lessEqual: acond = Assembler::LE; ncond = Assembler::GT; break;
case lir_cond_greaterEqual: acond = Assembler::GE; ncond = Assembler::LT; break; case lir_cond_greaterEqual: acond = Assembler::GE; ncond = Assembler::LT; break;
case lir_cond_greater: acond = Assembler::GT; ncond = Assembler::LE; break; case lir_cond_greater: acond = Assembler::GT; ncond = Assembler::LE; break;
case lir_cond_belowEqual: Unimplemented(); break; case lir_cond_belowEqual:
case lir_cond_aboveEqual: Unimplemented(); break; case lir_cond_aboveEqual:
default: ShouldNotReachHere(); default: ShouldNotReachHere();
acond = Assembler::EQ; ncond = Assembler::NE; // unreachable
} }
assert(result->is_single_cpu() || result->is_double_cpu(), assert(result->is_single_cpu() || result->is_double_cpu(),
@ -1724,6 +1733,7 @@ void LIR_Assembler::arith_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr
break; break;
default: default:
ShouldNotReachHere(); ShouldNotReachHere();
c = 0; // unreachable
break; break;
} }
@ -1926,6 +1936,7 @@ void LIR_Assembler::comp_op(LIR_Condition condition, LIR_Opr opr1, LIR_Opr opr2,
break; break;
default: default:
ShouldNotReachHere(); ShouldNotReachHere();
imm = 0; // unreachable
break; break;
} }
@ -3123,6 +3134,9 @@ void LIR_Assembler::atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr
break; break;
default: default:
ShouldNotReachHere(); ShouldNotReachHere();
lda = &MacroAssembler::ldaxr;
add = &MacroAssembler::add;
stl = &MacroAssembler::stlxr; // unreachable
} }
switch (code) { switch (code) {
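
The lda/add/stl member-function pointers chosen above are used by the surrounding exclusive-access retry loop (ldaxr ... stlxr, not visible in this hunk). As a behavioural sketch only, and under the assumption that lir_xadd is the case being modelled, that loop amounts to an acquire/release fetch-and-add; nothing below is taken from the patch itself:

#include <atomic>
#include <cstdint>

// Rough C++ equivalent of an ldaxr/add/stlxr retry loop for a 64-bit add:
// returns the previous value, like xadd.
int64_t atomic_xadd_sketch(std::atomic<int64_t>& cell, int64_t delta) {
  return cell.fetch_add(delta, std::memory_order_acq_rel);
}
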

View file

@ -238,6 +238,7 @@ LIR_Opr LIRGenerator::load_immediate(int x, BasicType type) {
} }
} else { } else {
ShouldNotReachHere(); ShouldNotReachHere();
r = NULL; // unreachable
} }
return r; return r;
} }

View file

@ -27,6 +27,7 @@
#define CPU_AARCH64_VM_C1_MACROASSEMBLER_AARCH64_HPP #define CPU_AARCH64_VM_C1_MACROASSEMBLER_AARCH64_HPP
using MacroAssembler::build_frame; using MacroAssembler::build_frame;
using MacroAssembler::null_check;
// C1_MacroAssembler contains high-level macros for C1 // C1_MacroAssembler contains high-level macros for C1

View file

@ -433,11 +433,11 @@ frame frame::sender_for_interpreter_frame(RegisterMap* map) const {
// This is the sp before any possible extension (adapter/locals). // This is the sp before any possible extension (adapter/locals).
intptr_t* unextended_sp = interpreter_frame_sender_sp(); intptr_t* unextended_sp = interpreter_frame_sender_sp();
#ifdef COMPILER2 #if defined(COMPILER2) || INCLUDE_JVMCI
if (map->update_map()) { if (map->update_map()) {
update_map_with_saved_link(map, (intptr_t**) addr_at(link_offset)); update_map_with_saved_link(map, (intptr_t**) addr_at(link_offset));
} }
#endif // COMPILER2 #endif // COMPILER2 || INCLUDE_JVMCI
return frame(sender_sp, unextended_sp, link(), sender_pc()); return frame(sender_sp, unextended_sp, link(), sender_pc());
} }

View file

@ -28,6 +28,10 @@
const int StackAlignmentInBytes = 16; const int StackAlignmentInBytes = 16;
// Indicates whether the C calling conventions require that
// 32-bit integer argument values are extended to 64 bits.
const bool CCallingConventionRequiresIntsAsLongs = false;
#define SUPPORTS_NATIVE_CX8 #define SUPPORTS_NATIVE_CX8
// The maximum B/BL offset range on AArch64 is 128MB. // The maximum B/BL offset range on AArch64 is 128MB.

View file

@ -40,14 +40,7 @@ define_pd_global(bool, ImplicitNullChecks, true); // Generate code for im
define_pd_global(bool, TrapBasedNullChecks, false); define_pd_global(bool, TrapBasedNullChecks, false);
define_pd_global(bool, UncommonNullCast, true); // Uncommon-trap NULLs passed to check cast define_pd_global(bool, UncommonNullCast, true); // Uncommon-trap NULLs passed to check cast
// See 4827828 for this change. There is no globals_core_i486.hpp. I can't #if defined(COMPILER2) || INCLUDE_JVMCI
// assign a different value for C2 without touching a number of files. Use
// #ifdef to minimize the change as it's late in Mantis. -- FIXME.
// c1 doesn't have this problem because the fix to 4858033 assures us
// the the vep is aligned at CodeEntryAlignment whereas c2 only aligns
// the uep and the vep doesn't get real alignment but just slops on by
// only assured that the entry instruction meets the 5 byte size requirement.
#ifdef COMPILER2
define_pd_global(intx, CodeEntryAlignment, 64); define_pd_global(intx, CodeEntryAlignment, 64);
#else #else
define_pd_global(intx, CodeEntryAlignment, 16); define_pd_global(intx, CodeEntryAlignment, 16);

View file

@ -1054,13 +1054,39 @@ void InterpreterMacroAssembler::profile_virtual_call(Register receiver,
bind(skip_receiver_profile); bind(skip_receiver_profile);
// The method data pointer needs to be updated to reflect the new target. // The method data pointer needs to be updated to reflect the new target.
#if INCLUDE_JVMCI
if (MethodProfileWidth == 0) {
update_mdp_by_constant(mdp, in_bytes(VirtualCallData::virtual_call_data_size()));
}
#else // INCLUDE_JVMCI
update_mdp_by_constant(mdp, update_mdp_by_constant(mdp,
in_bytes(VirtualCallData:: in_bytes(VirtualCallData::
virtual_call_data_size())); virtual_call_data_size()));
#endif // INCLUDE_JVMCI
bind(profile_continue); bind(profile_continue);
} }
} }
#if INCLUDE_JVMCI
void InterpreterMacroAssembler::profile_called_method(Register method, Register mdp, Register reg2) {
assert_different_registers(method, mdp, reg2);
if (ProfileInterpreter && MethodProfileWidth > 0) {
Label profile_continue;
// If no method data exists, go to profile_continue.
test_method_data_pointer(mdp, profile_continue);
Label done;
record_item_in_profile_helper(method, mdp, reg2, 0, done, MethodProfileWidth,
&VirtualCallData::method_offset, &VirtualCallData::method_count_offset, in_bytes(VirtualCallData::nonprofiled_receiver_count_offset()));
bind(done);
update_mdp_by_constant(mdp, in_bytes(VirtualCallData::virtual_call_data_size()));
bind(profile_continue);
}
}
#endif // INCLUDE_JVMCI
// This routine creates a state machine for updating the multi-row // This routine creates a state machine for updating the multi-row
// type profile at a virtual call site (or other type-sensitive bytecode). // type profile at a virtual call site (or other type-sensitive bytecode).
// The machine visits each row (of receiver/count) until the receiver type // The machine visits each row (of receiver/count) until the receiver type
@ -1080,14 +1106,36 @@ void InterpreterMacroAssembler::record_klass_in_profile_helper(
if (is_virtual_call) { if (is_virtual_call) {
increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset())); increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()));
} }
return; #if INCLUDE_JVMCI
else if (EnableJVMCI) {
increment_mdp_data_at(mdp, in_bytes(ReceiverTypeData::nonprofiled_receiver_count_offset()));
} }
#endif // INCLUDE_JVMCI
} else {
int non_profiled_offset = -1;
if (is_virtual_call) {
non_profiled_offset = in_bytes(CounterData::count_offset());
}
#if INCLUDE_JVMCI
else if (EnableJVMCI) {
non_profiled_offset = in_bytes(ReceiverTypeData::nonprofiled_receiver_count_offset());
}
#endif // INCLUDE_JVMCI
int last_row = VirtualCallData::row_limit() - 1; record_item_in_profile_helper(receiver, mdp, reg2, 0, done, TypeProfileWidth,
&VirtualCallData::receiver_offset, &VirtualCallData::receiver_count_offset, non_profiled_offset);
}
}
void InterpreterMacroAssembler::record_item_in_profile_helper(Register item, Register mdp,
Register reg2, int start_row, Label& done, int total_rows,
OffsetFunction item_offset_fn, OffsetFunction item_count_offset_fn,
int non_profiled_offset) {
int last_row = total_rows - 1;
assert(start_row <= last_row, "must be work left to do"); assert(start_row <= last_row, "must be work left to do");
// Test this row for both the receiver and for null. // Test this row for both the item and for null.
// Take any of three different outcomes: // Take any of three different outcomes:
// 1. found receiver => increment count and goto done // 1. found item => increment count and goto done
// 2. found null => keep looking for case 1, maybe allocate this cell // 2. found null => keep looking for case 1, maybe allocate this cell
// 3. found something else => keep looking for cases 1 and 2 // 3. found something else => keep looking for cases 1 and 2
// Case 3 is handled by a recursive call. // Case 3 is handled by a recursive call.
@ -1095,55 +1143,56 @@ void InterpreterMacroAssembler::record_klass_in_profile_helper(
Label next_test; Label next_test;
bool test_for_null_also = (row == start_row); bool test_for_null_also = (row == start_row);
// See if the receiver is receiver[n]. // See if the item is item[n].
int recvr_offset = in_bytes(VirtualCallData::receiver_offset(row)); int item_offset = in_bytes(item_offset_fn(row));
test_mdp_data_at(mdp, recvr_offset, receiver, test_mdp_data_at(mdp, item_offset, item,
(test_for_null_also ? reg2 : noreg), (test_for_null_also ? reg2 : noreg),
next_test); next_test);
// (Reg2 now contains the receiver from the CallData.) // (Reg2 now contains the item from the CallData.)
// The receiver is receiver[n]. Increment count[n]. // The item is item[n]. Increment count[n].
int count_offset = in_bytes(VirtualCallData::receiver_count_offset(row)); int count_offset = in_bytes(item_count_offset_fn(row));
increment_mdp_data_at(mdp, count_offset); increment_mdp_data_at(mdp, count_offset);
b(done); b(done);
bind(next_test); bind(next_test);
if (test_for_null_also) { if (test_for_null_also) {
Label found_null; Label found_null;
// Failed the equality check on receiver[n]... Test for null. // Failed the equality check on item[n]... Test for null.
if (start_row == last_row) { if (start_row == last_row) {
// The only thing left to do is handle the null case. // The only thing left to do is handle the null case.
if (is_virtual_call) { if (non_profiled_offset >= 0) {
cbz(reg2, found_null); cbz(reg2, found_null);
// Receiver did not match any saved receiver and there is no empty row for it. // Item did not match any saved item and there is no empty row for it.
// Increment total counter to indicate polymorphic case. // Increment total counter to indicate polymorphic case.
increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset())); increment_mdp_data_at(mdp, non_profiled_offset);
b(done); b(done);
bind(found_null); bind(found_null);
} else { } else {
cbz(reg2, done); cbnz(reg2, done);
} }
break; break;
} }
// Since null is rare, make it be the branch-taken case. // Since null is rare, make it be the branch-taken case.
cbz(reg2,found_null); cbz(reg2, found_null);
// Put all the "Case 3" tests here. // Put all the "Case 3" tests here.
record_klass_in_profile_helper(receiver, mdp, reg2, start_row + 1, done, is_virtual_call); record_item_in_profile_helper(item, mdp, reg2, start_row + 1, done, total_rows,
item_offset_fn, item_count_offset_fn, non_profiled_offset);
// Found a null. Keep searching for a matching receiver, // Found a null. Keep searching for a matching item,
// but remember that this is an empty (unused) slot. // but remember that this is an empty (unused) slot.
bind(found_null); bind(found_null);
} }
} }
// In the fall-through case, we found no matching receiver, but we // In the fall-through case, we found no matching item, but we
// observed the receiver[start_row] is NULL. // observed the item[start_row] is NULL.
// Fill in the receiver field and increment the count. // Fill in the item field and increment the count.
int recvr_offset = in_bytes(VirtualCallData::receiver_offset(start_row)); int item_offset = in_bytes(item_offset_fn(start_row));
set_mdp_data_at(mdp, recvr_offset, receiver); set_mdp_data_at(mdp, item_offset, item);
int count_offset = in_bytes(VirtualCallData::receiver_count_offset(start_row)); int count_offset = in_bytes(item_count_offset_fn(start_row));
mov(reg2, DataLayout::counter_increment); mov(reg2, DataLayout::counter_increment);
set_mdp_data_at(mdp, count_offset, reg2); set_mdp_data_at(mdp, count_offset, reg2);
if (start_row > 0) { if (start_row > 0) {
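
Read as a loop over the profile rows rather than as emitted branches, the walk above has roughly the following effect. The struct, the runtime loop and all names are invented for this standalone sketch; the real rows live in the MethodData and the generated code unrolls the walk into straight-line tests:

#include <cstdint>
#include <cstddef>

struct ProfileRow { const void* item; uint64_t count; };  // invented stand-in for an item/count pair

// Bump the count of a matching row, otherwise claim the first empty row,
// otherwise (when a non-profiled counter was supplied) count the polymorphic case.
void record_item_sketch(ProfileRow* rows, size_t total_rows,
                        const void* item, uint64_t* non_profiled_count) {
  for (size_t row = 0; row < total_rows; row++) {
    if (rows[row].item == item) { rows[row].count++; return; }  // 1. found item
    if (rows[row].item == nullptr) {                            // 2. found an empty slot
      rows[row].item = item;
      rows[row].count = 1;   // DataLayout::counter_increment in the real code
      return;
    }
    // 3. some other item: keep looking (the generated code recurses instead)
  }
  if (non_profiled_count != nullptr) {
    (*non_profiled_count)++;  // all rows taken by other items: polymorphic case
  }
}
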
@ -1347,9 +1396,8 @@ void InterpreterMacroAssembler::notify_method_entry() {
// the code to check if the event should be sent. // the code to check if the event should be sent.
if (JvmtiExport::can_post_interpreter_events()) { if (JvmtiExport::can_post_interpreter_events()) {
Label L; Label L;
ldr(r3, Address(rthread, JavaThread::interp_only_mode_offset())); ldrw(r3, Address(rthread, JavaThread::interp_only_mode_offset()));
tst(r3, ~0); cbzw(r3, L);
br(Assembler::EQ, L);
call_VM(noreg, CAST_FROM_FN_PTR(address, call_VM(noreg, CAST_FROM_FN_PTR(address,
InterpreterRuntime::post_method_entry)); InterpreterRuntime::post_method_entry));
bind(L); bind(L);

View file

@ -33,6 +33,7 @@
// This file specializes the assembler with interpreter-specific macros // This file specializes the assembler with interpreter-specific macros
typedef ByteSize (*OffsetFunction)(uint);
class InterpreterMacroAssembler: public MacroAssembler { class InterpreterMacroAssembler: public MacroAssembler {
protected: protected:
@ -234,6 +235,10 @@ class InterpreterMacroAssembler: public MacroAssembler {
void record_klass_in_profile_helper(Register receiver, Register mdp, void record_klass_in_profile_helper(Register receiver, Register mdp,
Register reg2, int start_row, Register reg2, int start_row,
Label& done, bool is_virtual_call); Label& done, bool is_virtual_call);
void record_item_in_profile_helper(Register item, Register mdp,
Register reg2, int start_row, Label& done, int total_rows,
OffsetFunction item_offset_fn, OffsetFunction item_count_offset_fn,
int non_profiled_offset);
void update_mdp_by_offset(Register mdp_in, int offset_of_offset); void update_mdp_by_offset(Register mdp_in, int offset_of_offset);
void update_mdp_by_offset(Register mdp_in, Register reg, int offset_of_disp); void update_mdp_by_offset(Register mdp_in, Register reg, int offset_of_disp);
@ -247,6 +252,7 @@ class InterpreterMacroAssembler: public MacroAssembler {
void profile_virtual_call(Register receiver, Register mdp, void profile_virtual_call(Register receiver, Register mdp,
Register scratch2, Register scratch2,
bool receiver_can_be_null = false); bool receiver_can_be_null = false);
void profile_called_method(Register method, Register mdp, Register reg2) NOT_JVMCI_RETURN;
void profile_ret(Register return_bci, Register mdp); void profile_ret(Register return_bci, Register mdp);
void profile_null_seen(Register mdp); void profile_null_seen(Register mdp);
void profile_typecheck(Register mdp, Register klass, Register scratch); void profile_typecheck(Register mdp, Register klass, Register scratch);

View file

@ -61,6 +61,7 @@ address JNI_FastGetField::generate_fast_get_int_field0(BasicType type) {
case T_FLOAT: name = "jni_fast_GetFloatField"; break; case T_FLOAT: name = "jni_fast_GetFloatField"; break;
case T_DOUBLE: name = "jni_fast_GetDoubleField"; break; case T_DOUBLE: name = "jni_fast_GetDoubleField"; break;
default: ShouldNotReachHere(); default: ShouldNotReachHere();
name = NULL; // unreachable
} }
ResourceMark rm; ResourceMark rm;
BufferBlob* blob = BufferBlob::create(name, BUFFER_SIZE); BufferBlob* blob = BufferBlob::create(name, BUFFER_SIZE);
@ -125,6 +126,7 @@ address JNI_FastGetField::generate_fast_get_int_field0(BasicType type) {
case T_FLOAT: slow_case_addr = jni_GetFloatField_addr(); break; case T_FLOAT: slow_case_addr = jni_GetFloatField_addr(); break;
case T_DOUBLE: slow_case_addr = jni_GetDoubleField_addr(); break; case T_DOUBLE: slow_case_addr = jni_GetDoubleField_addr(); break;
default: ShouldNotReachHere(); default: ShouldNotReachHere();
slow_case_addr = NULL; // unreachable
} }
{ {

View file

@ -678,7 +678,7 @@ address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
if (cbuf) cbuf->set_insts_mark(); if (cbuf) cbuf->set_insts_mark();
relocate(entry.rspec()); relocate(entry.rspec());
if (Assembler::reachable_from_branch_at(pc(), entry.target())) { if (!far_branches()) {
bl(entry.target()); bl(entry.target());
} else { } else {
bl(pc()); bl(pc());
@ -733,8 +733,8 @@ address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
return stub; return stub;
} }
address MacroAssembler::ic_call(address entry) { address MacroAssembler::ic_call(address entry, jint method_index) {
RelocationHolder rh = virtual_call_Relocation::spec(pc()); RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
// address const_ptr = long_constant((jlong)Universe::non_oop_word()); // address const_ptr = long_constant((jlong)Universe::non_oop_word());
// unsigned long offset; // unsigned long offset;
// ldr_constant(rscratch2, const_ptr); // ldr_constant(rscratch2, const_ptr);

View file

@ -410,7 +410,7 @@ class MacroAssembler: public Assembler {
#define WRAP(INSN) \ #define WRAP(INSN) \
void INSN(Register Rd, Register Rn, Register Rm, Register Ra) { \ void INSN(Register Rd, Register Rn, Register Rm, Register Ra) { \
if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_A53MAC) && Ra != zr) \ if ((VM_Version::features() & VM_Version::CPU_A53MAC) && Ra != zr) \
nop(); \ nop(); \
Assembler::INSN(Rd, Rn, Rm, Ra); \ Assembler::INSN(Rd, Rn, Rm, Ra); \
} }
@ -480,6 +480,32 @@ public:
orr(Vd, T, Vn, Vn); orr(Vd, T, Vn, Vn);
} }
public:
// Generalized Test Bit And Branch, including a "far" variety which
// spans more than 32KiB.
void tbr(Condition cond, Register Rt, int bitpos, Label &dest, bool far = false) {
assert(cond == EQ || cond == NE, "must be");
if (far)
cond = ~cond;
void (Assembler::* branch)(Register Rt, int bitpos, Label &L);
if (cond == Assembler::EQ)
branch = &Assembler::tbz;
else
branch = &Assembler::tbnz;
if (far) {
Label L;
(this->*branch)(Rt, bitpos, L);
b(dest);
bind(L);
} else {
(this->*branch)(Rt, bitpos, dest);
}
}
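
The near/far decision made by tbr() can be modelled in isolation. In the sketch below the function name, the mnemonic strings and the fixed 32 KiB threshold are illustrative assumptions; the real helper leaves the far decision to the caller and emits machine code, not text:

#include <cstdlib>
#include <string>
#include <vector>

// Near form: emit the test-bit-and-branch directly. Far form: invert the test,
// hop over an unconditional b, and let b cover the +/-128 MB range.
std::vector<std::string> emit_tbr_sketch(bool branch_if_zero, int reg, int bitpos,
                                         long offset_bytes) {
  const bool far = std::labs(offset_bytes) >= 32 * 1024;      // tbz/tbnz reach +/-32 KiB
  const std::string rt = "x" + std::to_string(reg);
  const std::string bit = "#" + std::to_string(bitpos);
  const char* near_op = branch_if_zero ? "tbz" : "tbnz";
  const char* inverted_op = branch_if_zero ? "tbnz" : "tbz";  // ~cond in the helper above
  if (!far) {
    return { std::string(near_op) + " " + rt + ", " + bit + ", dest" };
  }
  return { std::string(inverted_op) + " " + rt + ", " + bit + ", skip",
           "b dest",
           "skip:" };
}
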
// macro instructions for accessing and updating floating point // macro instructions for accessing and updating floating point
// status register // status register
// //
@ -976,7 +1002,7 @@ public:
} }
// Emit the CompiledIC call idiom // Emit the CompiledIC call idiom
address ic_call(address entry); address ic_call(address entry, jint method_index = 0);
public: public:

View file

@ -62,7 +62,6 @@ class NativeInstruction VALUE_OBJ_CLASS_SPEC {
inline bool is_jump_or_nop(); inline bool is_jump_or_nop();
inline bool is_cond_jump(); inline bool is_cond_jump();
bool is_safepoint_poll(); bool is_safepoint_poll();
inline bool is_mov_literal64();
bool is_movz(); bool is_movz();
bool is_movk(); bool is_movk();
bool is_sigill_zombie_not_entrant(); bool is_sigill_zombie_not_entrant();
@ -98,6 +97,14 @@ class NativeInstruction VALUE_OBJ_CLASS_SPEC {
static bool is_ldr_literal_at(address instr); static bool is_ldr_literal_at(address instr);
static bool is_ldrw_to_zr(address instr); static bool is_ldrw_to_zr(address instr);
static bool is_call_at(address instr) {
const uint32_t insn = (*(uint32_t*)instr);
return (insn >> 26) == 0b100101;
}
bool is_call() {
return is_call_at(addr_at(0));
}
static bool maybe_cpool_ref(address instr) { static bool maybe_cpool_ref(address instr) {
return is_adrp_at(instr) || is_ldr_literal_at(instr); return is_adrp_at(instr) || is_ldr_literal_at(instr);
} }
@ -157,11 +164,6 @@ class NativeCall: public NativeInstruction {
inline friend NativeCall* nativeCall_at(address address); inline friend NativeCall* nativeCall_at(address address);
inline friend NativeCall* nativeCall_before(address return_address); inline friend NativeCall* nativeCall_before(address return_address);
static bool is_call_at(address instr) {
const uint32_t insn = (*(uint32_t*)instr);
return (insn >> 26) == 0b100101;
}
static bool is_call_before(address return_address) { static bool is_call_before(address return_address) {
return is_call_at(return_address - NativeCall::return_address_offset); return is_call_at(return_address - NativeCall::return_address_offset);
} }
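
The relocated is_call_at() check keys off the AArch64 BL encoding, whose top six opcode bits are 0b100101. A standalone restatement, with sample encodings chosen for illustration rather than taken from the patch:

#include <cstdint>

// Same predicate as NativeInstruction::is_call_at, applied to a raw instruction word.
inline bool is_bl_encoding(uint32_t insn) {
  return (insn >> 26) == 0b100101;   // BL <label>: bits [31:26] == 100101
}

static_assert((0x94000000u >> 26) == 0b100101, "0x94000000 encodes BL and matches");
static_assert((0xd503201fu >> 26) != 0b100101, "0xd503201f encodes NOP and does not");
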

View file

@ -59,14 +59,20 @@ void Relocation::pd_set_data_value(address x, intptr_t o, bool verify_only) {
address Relocation::pd_call_destination(address orig_addr) { address Relocation::pd_call_destination(address orig_addr) {
assert(is_call(), "should be a call here"); assert(is_call(), "should be a call here");
if (is_call()) { if (NativeCall::is_call_at(addr())) {
address trampoline = nativeCall_at(addr())->get_trampoline(); address trampoline = nativeCall_at(addr())->get_trampoline();
if (trampoline) { if (trampoline) {
return nativeCallTrampolineStub_at(trampoline)->destination(); return nativeCallTrampolineStub_at(trampoline)->destination();
} }
} }
if (orig_addr != NULL) { if (orig_addr != NULL) {
return MacroAssembler::pd_call_destination(orig_addr); address new_addr = MacroAssembler::pd_call_destination(orig_addr);
// If call is branch to self, don't try to relocate it, just leave it
// as branch to self. This happens during code generation if the code
// buffer expands. It will be relocated to the trampoline above once
// code generation is complete.
new_addr = (new_addr == orig_addr) ? addr() : new_addr;
return new_addr;
} }
return MacroAssembler::pd_call_destination(addr()); return MacroAssembler::pd_call_destination(addr());
} }

View file

@ -39,10 +39,13 @@
#ifdef COMPILER1 #ifdef COMPILER1
#include "c1/c1_Runtime1.hpp" #include "c1/c1_Runtime1.hpp"
#endif #endif
#ifdef COMPILER2 #if defined(COMPILER2) || INCLUDE_JVMCI
#include "adfiles/ad_aarch64.hpp" #include "adfiles/ad_aarch64.hpp"
#include "opto/runtime.hpp" #include "opto/runtime.hpp"
#endif #endif
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif
#ifdef BUILTIN_SIM #ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp" #include "../../../../../../simulator/simulator.hpp"
@ -109,14 +112,14 @@ class RegisterSaver {
}; };
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) { OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
#ifdef COMPILER2 #if defined(COMPILER2) || INCLUDE_JVMCI
if (save_vectors) { if (save_vectors) {
// Save upper half of vector registers // Save upper half of vector registers
int vect_words = 32 * 8 / wordSize; int vect_words = 32 * 8 / wordSize;
additional_frame_words += vect_words; additional_frame_words += vect_words;
} }
#else #else
assert(!save_vectors, "vectors are generated only by C2"); assert(!save_vectors, "vectors are generated only by C2 and JVMCI");
#endif #endif
int frame_size_in_bytes = round_to(additional_frame_words*wordSize + int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
@ -166,7 +169,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) { void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
#ifndef COMPILER2 #ifndef COMPILER2
assert(!restore_vectors, "vectors are generated only by C2"); assert(!restore_vectors, "vectors are generated only by C2 and JVMCI");
#endif #endif
__ pop_CPU_state(restore_vectors); __ pop_CPU_state(restore_vectors);
__ leave(); __ leave();
@ -547,6 +550,18 @@ void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
// Pre-load the register-jump target early, to schedule it better. // Pre-load the register-jump target early, to schedule it better.
__ ldr(rscratch1, Address(rmethod, in_bytes(Method::from_compiled_offset()))); __ ldr(rscratch1, Address(rmethod, in_bytes(Method::from_compiled_offset())));
#if INCLUDE_JVMCI
if (EnableJVMCI) {
// check if this call should be routed towards a specific entry point
__ ldr(rscratch2, Address(rthread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
Label no_alternative_target;
__ cbz(rscratch2, no_alternative_target);
__ mov(rscratch1, rscratch2);
__ str(zr, Address(rthread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
__ bind(no_alternative_target);
}
#endif // INCLUDE_JVMCI
// Now generate the shuffle code. // Now generate the shuffle code.
for (int i = 0; i < total_args_passed; i++) { for (int i = 0; i < total_args_passed; i++) {
if (sig_bt[i] == T_VOID) { if (sig_bt[i] == T_VOID) {
@ -1075,7 +1090,7 @@ static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMR
} }
// Check GC_locker::needs_gc and enter the runtime if it's true. This // Check GCLocker::needs_gc and enter the runtime if it's true. This
// keeps a new JNI critical region from starting until a GC has been // keeps a new JNI critical region from starting until a GC has been
// forced. Save down any oops in registers and describe them in an // forced. Save down any oops in registers and describe them in an
// OopMap. // OopMap.
@ -1257,14 +1272,14 @@ static void gen_special_dispatch(MacroAssembler* masm,
// GetPrimitiveArrayCritical and disallow the use of any other JNI // GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions. The wrapper is expected to unpack the arguments before // functions. The wrapper is expected to unpack the arguments before
// passing them to the callee and perform checks before and after the // passing them to the callee and perform checks before and after the
// native call to ensure that the GC_locker // native call to ensure that the GCLocker
// lock_critical/unlock_critical semantics are followed. Some other // lock_critical/unlock_critical semantics are followed. Some other
// parts of JNI setup are skipped like the tear down of the JNI handle // parts of JNI setup are skipped like the tear down of the JNI handle
// block and the check for pending exceptions as it's impossible for them // block and the check for pending exceptions as it's impossible for them
// to be thrown. // to be thrown.
// //
// They are roughly structured like this: // They are roughly structured like this:
// if (GC_locker::needs_gc()) // if (GCLocker::needs_gc())
// SharedRuntime::block_for_jni_critical(); // SharedRuntime::block_for_jni_critical();
// transition to thread_in_native // transition to thread_in_native
// unpack array arguments and call native entry point // unpack array arguments and call native entry point
@ -2237,7 +2252,13 @@ void SharedRuntime::generate_deopt_blob() {
// Allocate space for the code // Allocate space for the code
ResourceMark rm; ResourceMark rm;
// Setup code generation tools // Setup code generation tools
CodeBuffer buffer("deopt_blob", 2048, 1024); int pad = 0;
#if INCLUDE_JVMCI
if (EnableJVMCI) {
pad += 512; // Increase the buffer size when compiling for JVMCI
}
#endif
CodeBuffer buffer("deopt_blob", 2048+pad, 1024);
MacroAssembler* masm = new MacroAssembler(&buffer); MacroAssembler* masm = new MacroAssembler(&buffer);
int frame_size_in_words; int frame_size_in_words;
OopMap* map = NULL; OopMap* map = NULL;
@ -2294,6 +2315,12 @@ void SharedRuntime::generate_deopt_blob() {
__ b(cont); __ b(cont);
int reexecute_offset = __ pc() - start; int reexecute_offset = __ pc() - start;
#if defined(INCLUDE_JVMCI) && !defined(COMPILER1)
if (EnableJVMCI && UseJVMCICompiler) {
// JVMCI does not use this kind of deoptimization
__ should_not_reach_here();
}
#endif
// Reexecute case // Reexecute case
// return address is the pc describes what bci to do re-execute at // return address is the pc describes what bci to do re-execute at
@ -2304,6 +2331,44 @@ void SharedRuntime::generate_deopt_blob() {
__ movw(rcpool, Deoptimization::Unpack_reexecute); // callee-saved __ movw(rcpool, Deoptimization::Unpack_reexecute); // callee-saved
__ b(cont); __ b(cont);
#if INCLUDE_JVMCI
Label after_fetch_unroll_info_call;
int implicit_exception_uncommon_trap_offset = 0;
int uncommon_trap_offset = 0;
if (EnableJVMCI) {
implicit_exception_uncommon_trap_offset = __ pc() - start;
__ ldr(lr, Address(rthread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
__ str(zr, Address(rthread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
uncommon_trap_offset = __ pc() - start;
// Save everything in sight.
RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
// fetch_unroll_info needs to call last_java_frame()
Label retaddr;
__ set_last_Java_frame(sp, noreg, retaddr, rscratch1);
__ ldrw(c_rarg1, Address(rthread, in_bytes(JavaThread::pending_deoptimization_offset())));
__ movw(rscratch1, -1);
__ strw(rscratch1, Address(rthread, in_bytes(JavaThread::pending_deoptimization_offset())));
__ movw(rcpool, (int32_t)Deoptimization::Unpack_reexecute);
__ mov(c_rarg0, rthread);
__ lea(rscratch1,
RuntimeAddress(CAST_FROM_FN_PTR(address,
Deoptimization::uncommon_trap)));
__ blrt(rscratch1, 2, 0, MacroAssembler::ret_type_integral);
__ bind(retaddr);
oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
__ reset_last_Java_frame(false, false);
__ b(after_fetch_unroll_info_call);
} // EnableJVMCI
#endif // INCLUDE_JVMCI
int exception_offset = __ pc() - start; int exception_offset = __ pc() - start;
// Prolog for exception case // Prolog for exception case
@ -2395,7 +2460,13 @@ void SharedRuntime::generate_deopt_blob() {
__ reset_last_Java_frame(false, true); __ reset_last_Java_frame(false, true);
// Load UnrollBlock* into rdi #if INCLUDE_JVMCI
if (EnableJVMCI) {
__ bind(after_fetch_unroll_info_call);
}
#endif
// Load UnrollBlock* into r5
__ mov(r5, r0); __ mov(r5, r0);
__ ldrw(rcpool, Address(r5, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes())); __ ldrw(rcpool, Address(r5, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
@ -2547,7 +2618,12 @@ void SharedRuntime::generate_deopt_blob() {
_deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
_deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
#if INCLUDE_JVMCI
if (EnableJVMCI) {
_deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
_deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
}
#endif
#ifdef BUILTIN_SIM #ifdef BUILTIN_SIM
if (NotifySimulator) { if (NotifySimulator) {
unsigned char *base = _deopt_blob->code_begin(); unsigned char *base = _deopt_blob->code_begin();
@ -2560,7 +2636,7 @@ uint SharedRuntime::out_preserve_stack_slots() {
return 0; return 0;
} }
#ifdef COMPILER2 #if defined(COMPILER2) || INCLUDE_JVMCI
//------------------------------generate_uncommon_trap_blob-------------------- //------------------------------generate_uncommon_trap_blob--------------------
void SharedRuntime::generate_uncommon_trap_blob() { void SharedRuntime::generate_uncommon_trap_blob() {
// Allocate space for the code // Allocate space for the code
@ -2943,7 +3019,7 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha
} }
#ifdef COMPILER2 #if defined(COMPILER2) || INCLUDE_JVMCI
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
// //
//------------------------------generate_exception_blob--------------------------- //------------------------------generate_exception_blob---------------------------

View file

@ -958,8 +958,8 @@ class StubGenerator: public StubCodeGenerator {
const Register t0 = r3, t1 = r4; const Register t0 = r3, t1 = r4;
if (is_backwards) { if (is_backwards) {
__ lea(s, Address(s, count, Address::uxtw(exact_log2(-step)))); __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
__ lea(d, Address(d, count, Address::uxtw(exact_log2(-step)))); __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
} }
Label done, tail; Label done, tail;
@ -1051,10 +1051,10 @@ class StubGenerator: public StubCodeGenerator {
__ cmp(rscratch2, count); __ cmp(rscratch2, count);
__ br(Assembler::HS, end); __ br(Assembler::HS, end);
if (size == (size_t)wordSize) { if (size == (size_t)wordSize) {
__ ldr(temp, Address(a, rscratch2, Address::uxtw(exact_log2(size)))); __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
__ verify_oop(temp); __ verify_oop(temp);
} else { } else {
__ ldrw(r16, Address(a, rscratch2, Address::uxtw(exact_log2(size)))); __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
__ decode_heap_oop(temp); // calls verify_oop __ decode_heap_oop(temp); // calls verify_oop
} }
__ add(rscratch2, rscratch2, size); __ add(rscratch2, rscratch2, size);
@ -1087,12 +1087,14 @@ class StubGenerator: public StubCodeGenerator {
__ align(CodeEntryAlignment); __ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name); StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc(); address start = __ pc();
__ enter();
if (entry != NULL) { if (entry != NULL) {
*entry = __ pc(); *entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory) // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:"); BLOCK_COMMENT("Entry:");
} }
__ enter();
if (is_oop) { if (is_oop) {
__ push(RegSet::of(d, count), sp); __ push(RegSet::of(d, count), sp);
// no registers are destroyed by this call // no registers are destroyed by this call
@ -1104,10 +1106,11 @@ class StubGenerator: public StubCodeGenerator {
if (VerifyOops) if (VerifyOops)
verify_oop_array(size, d, count, r16); verify_oop_array(size, d, count, r16);
__ sub(count, count, 1); // make an inclusive end pointer __ sub(count, count, 1); // make an inclusive end pointer
__ lea(count, Address(d, count, Address::uxtw(exact_log2(size)))); __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
gen_write_ref_array_post_barrier(d, count, rscratch1); gen_write_ref_array_post_barrier(d, count, rscratch1);
} }
__ leave(); __ leave();
__ mov(r0, zr); // return 0
__ ret(lr); __ ret(lr);
#ifdef BUILTIN_SIM #ifdef BUILTIN_SIM
{ {
@ -1140,11 +1143,16 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name); StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc(); address start = __ pc();
__ enter();
if (entry != NULL) {
*entry = __ pc();
// caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
__ cmp(d, s); __ cmp(d, s);
__ br(Assembler::LS, nooverlap_target); __ br(Assembler::LS, nooverlap_target);
__ enter();
if (is_oop) { if (is_oop) {
__ push(RegSet::of(d, count), sp); __ push(RegSet::of(d, count), sp);
// no registers are destroyed by this call // no registers are destroyed by this call
@ -1160,6 +1168,7 @@ class StubGenerator: public StubCodeGenerator {
gen_write_ref_array_post_barrier(d, count, rscratch1); gen_write_ref_array_post_barrier(d, count, rscratch1);
} }
__ leave(); __ leave();
__ mov(r0, zr); // return 0
__ ret(lr); __ ret(lr);
#ifdef BUILTIN_SIM #ifdef BUILTIN_SIM
{ {
@ -1559,7 +1568,29 @@ class StubGenerator: public StubCodeGenerator {
Register dst_pos, // destination position (c_rarg3) Register dst_pos, // destination position (c_rarg3)
Register length, Register length,
Register temp, Register temp,
Label& L_failed) { Unimplemented(); } Label& L_failed) {
BLOCK_COMMENT("arraycopy_range_checks:");
assert_different_registers(rscratch1, temp);
// if (src_pos + length > arrayOop(src)->length()) FAIL;
__ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
__ addw(temp, length, src_pos);
__ cmpw(temp, rscratch1);
__ br(Assembler::HI, L_failed);
// if (dst_pos + length > arrayOop(dst)->length()) FAIL;
__ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
__ addw(temp, length, dst_pos);
__ cmpw(temp, rscratch1);
__ br(Assembler::HI, L_failed);
// Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
__ movw(src_pos, src_pos);
__ movw(dst_pos, dst_pos);
BLOCK_COMMENT("arraycopy_range_checks done");
}
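
Stripped of register assignments, arraycopy_range_checks() performs the two usual bounds tests. Below is a plain-C++ restatement with invented names; it assumes, as the stub does after the earlier sign-bit tests, that positions and length are non-negative 32-bit values, so the sums cannot wrap:

#include <cstdint>

// if (src_pos + length > src->length()) FAIL;
// if (dst_pos + length > dst->length()) FAIL;
inline bool arraycopy_range_ok_sketch(uint32_t src_pos, uint32_t dst_pos, uint32_t length,
                                      uint32_t src_length, uint32_t dst_length) {
  if (src_pos + length > src_length) return false;   // the stub branches HI to L_failed
  if (dst_pos + length > dst_length) return false;   // likewise for the destination
  return true;
}
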
// These stubs get called from some dumb test routine. // These stubs get called from some dumb test routine.
// I'll write them properly when they're called from // I'll write them properly when they're called from
@ -1569,6 +1600,309 @@ class StubGenerator: public StubCodeGenerator {
} }
//
// Generate 'unsafe' array copy stub
// Though just as safe as the other stubs, it takes an unscaled
// size_t argument instead of an element count.
//
// Input:
// c_rarg0 - source array address
// c_rarg1 - destination array address
// c_rarg2 - byte count, treated as ssize_t, can be zero
//
// Examines the alignment of the operands and dispatches
// to a long, int, short, or byte copy loop.
//
address generate_unsafe_copy(const char *name,
address byte_copy_entry) {
#ifdef PRODUCT
return StubRoutines::_jbyte_arraycopy;
#else
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
__ enter(); // required for proper stackwalking of RuntimeStub frame
// bump this on entry, not on exit:
__ lea(rscratch2, ExternalAddress((address)&SharedRuntime::_unsafe_array_copy_ctr));
__ incrementw(Address(rscratch2));
__ b(RuntimeAddress(byte_copy_entry));
return start;
#endif
}
//
// Generate generic array copy stubs
//
// Input:
// c_rarg0 - src oop
// c_rarg1 - src_pos (32-bits)
// c_rarg2 - dst oop
// c_rarg3 - dst_pos (32-bits)
// c_rarg4 - element count (32-bits)
//
// Output:
// r0 == 0 - success
// r0 == -1^K - failure, where K is partial transfer count
//
address generate_generic_copy(const char *name,
address byte_copy_entry, address short_copy_entry,
address int_copy_entry, address oop_copy_entry,
address long_copy_entry, address checkcast_copy_entry) {
Label L_failed, L_failed_0, L_objArray;
Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
// Input registers
const Register src = c_rarg0; // source array oop
const Register src_pos = c_rarg1; // source position
const Register dst = c_rarg2; // destination array oop
const Register dst_pos = c_rarg3; // destination position
const Register length = c_rarg4;
StubCodeMark mark(this, "StubRoutines", name);
__ align(CodeEntryAlignment);
address start = __ pc();
__ enter(); // required for proper stackwalking of RuntimeStub frame
// bump this on entry, not on exit:
inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
//-----------------------------------------------------------------------
// Assembler stub will be used for this call to arraycopy
// if the following conditions are met:
//
// (1) src and dst must not be null.
// (2) src_pos must not be negative.
// (3) dst_pos must not be negative.
// (4) length must not be negative.
// (5) src klass and dst klass should be the same and not NULL.
// (6) src and dst should be arrays.
// (7) src_pos + length must not exceed length of src.
// (8) dst_pos + length must not exceed length of dst.
//
// if (src == NULL) return -1;
__ cbz(src, L_failed);
// if (src_pos < 0) return -1;
__ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
// if (dst == NULL) return -1;
__ cbz(dst, L_failed);
// if (dst_pos < 0) return -1;
__ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
// registers used as temp
const Register scratch_length = r16; // elements count to copy
const Register scratch_src_klass = r17; // array klass
const Register lh = r18; // layout helper
// if (length < 0) return -1;
__ movw(scratch_length, length); // length (elements count, 32-bits value)
__ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
__ load_klass(scratch_src_klass, src);
#ifdef ASSERT
// assert(src->klass() != NULL);
{
BLOCK_COMMENT("assert klasses not null {");
Label L1, L2;
__ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL
__ bind(L1);
__ stop("broken null klass");
__ bind(L2);
__ load_klass(rscratch1, dst);
__ cbz(rscratch1, L1); // this would be broken also
BLOCK_COMMENT("} assert klasses not null done");
}
#endif
// Load layout helper (32-bits)
//
// |array_tag| | header_size | element_type | |log2_element_size|
// 32 30 24 16 8 2 0
//
// array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
//
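
The bit diagram above maps directly onto a small decoder. The shift and mask values in this sketch follow the usual Klass::_lh_* layout (tag at bit 30, byte-wide header-size, element-type and log2-element-size fields) and should be read as assumptions of the sketch, not as constants quoted from the patch:

#include <cstdint>

struct LayoutHelperFieldsSketch {   // invented holder for the decoded fields
  int array_tag;                    // 0x3 = typeArray, 0x2 = objArray, 0x0 = non-array
  int header_size_in_bytes;
  int element_type;                 // a BasicType value
  int log2_element_size;
};

inline LayoutHelperFieldsSketch decode_layout_helper_sketch(int32_t lh) {
  LayoutHelperFieldsSketch f;
  f.array_tag            = (lh >> 30) & 0x3;
  f.header_size_in_bytes = (lh >> 16) & 0xff;
  f.element_type         = (lh >> 8)  & 0xff;
  f.log2_element_size    = lh & 0xff;
  return f;
}
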
const int lh_offset = in_bytes(Klass::layout_helper_offset());
// Handle objArrays completely differently...
const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
__ ldrw(lh, Address(scratch_src_klass, lh_offset));
__ movw(rscratch1, objArray_lh);
__ eorw(rscratch2, lh, rscratch1);
__ cbzw(rscratch2, L_objArray);
// if (src->klass() != dst->klass()) return -1;
__ load_klass(rscratch2, dst);
__ eor(rscratch2, rscratch2, scratch_src_klass);
__ cbnz(rscratch2, L_failed);
// if (!src->is_Array()) return -1;
__ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
// At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
{
BLOCK_COMMENT("assert primitive array {");
Label L;
__ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
__ cmpw(lh, rscratch2);
__ br(Assembler::GE, L);
__ stop("must be a primitive array");
__ bind(L);
BLOCK_COMMENT("} assert primitive array done");
}
#endif
arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
rscratch2, L_failed);
// TypeArrayKlass
//
// src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
// dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
//
const Register rscratch1_offset = rscratch1; // array offset
const Register r18_elsize = lh; // element size
__ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
__ add(src, src, rscratch1_offset); // src array offset
__ add(dst, dst, rscratch1_offset); // dst array offset
BLOCK_COMMENT("choose copy loop based on element size");
// next registers should be set before the jump to corresponding stub
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register count = c_rarg2; // elements count
// 'from', 'to', 'count' registers should be set in such order
// since they are the same as 'src', 'src_pos', 'dst'.
assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
// The possible values of elsize are 0-3, i.e. exact_log2(element
// size in bytes). We do a simple bitwise binary search.
__ BIND(L_copy_bytes);
__ tbnz(r18_elsize, 1, L_copy_ints);
__ tbnz(r18_elsize, 0, L_copy_shorts);
__ lea(from, Address(src, src_pos));// src_addr
__ lea(to, Address(dst, dst_pos));// dst_addr
__ movw(count, scratch_length); // length
__ b(RuntimeAddress(byte_copy_entry));
__ BIND(L_copy_shorts);
__ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
__ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
__ movw(count, scratch_length); // length
__ b(RuntimeAddress(short_copy_entry));
__ BIND(L_copy_ints);
__ tbnz(r18_elsize, 0, L_copy_longs);
__ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
__ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
__ movw(count, scratch_length); // length
__ b(RuntimeAddress(int_copy_entry));
__ BIND(L_copy_longs);
#ifdef ASSERT
{
BLOCK_COMMENT("assert long copy {");
Label L;
__ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
__ cmpw(r18_elsize, LogBytesPerLong);
__ br(Assembler::EQ, L);
__ stop("must be long copy, but elsize is wrong");
__ bind(L);
BLOCK_COMMENT("} assert long copy done");
}
#endif
__ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
__ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
__ movw(count, scratch_length); // length
__ b(RuntimeAddress(long_copy_entry));
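
The "simple bitwise binary search" mentioned earlier tests bit 1 and then bit 0 of log2(element size), exactly as the tbnz pairs above do. A standalone sketch with an invented enum and function name:

enum CopyLoopSketch { BYTE_LOOP, SHORT_LOOP, INT_LOOP, LONG_LOOP };

// log2_elsize is 0..3 for byte/short/int/long element sizes.
inline CopyLoopSketch pick_copy_loop_sketch(int log2_elsize) {
  if (log2_elsize & 2) {                                 // bit 1 set: int or long
    return (log2_elsize & 1) ? LONG_LOOP : INT_LOOP;
  }
  return (log2_elsize & 1) ? SHORT_LOOP : BYTE_LOOP;     // bit 1 clear: byte or short
}
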
// ObjArrayKlass
__ BIND(L_objArray);
// live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
Label L_plain_copy, L_checkcast_copy;
// test array classes for subtyping
__ load_klass(r18, dst);
__ cmp(scratch_src_klass, r18); // usual case is exact equality
__ br(Assembler::NE, L_checkcast_copy);
// Identically typed arrays can be copied without element-wise checks.
arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
rscratch2, L_failed);
__ lea(from, Address(src, src_pos, Address::lsl(3)));
__ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
__ lea(to, Address(dst, dst_pos, Address::lsl(3)));
__ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
__ movw(count, scratch_length); // length
__ BIND(L_plain_copy);
__ b(RuntimeAddress(oop_copy_entry));
__ BIND(L_checkcast_copy);
// live at this point: scratch_src_klass, scratch_length, r18 (dst_klass)
{
// Before looking at dst.length, make sure dst is also an objArray.
__ ldrw(rscratch1, Address(r18, lh_offset));
__ movw(rscratch2, objArray_lh);
__ eorw(rscratch1, rscratch1, rscratch2);
__ cbnzw(rscratch1, L_failed);
// It is safe to examine both src.length and dst.length.
arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
r18, L_failed);
const Register rscratch2_dst_klass = rscratch2;
__ load_klass(rscratch2_dst_klass, dst); // reload
// Marshal the base address arguments now, freeing registers.
__ lea(from, Address(src, src_pos, Address::lsl(3)));
__ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
__ lea(to, Address(dst, dst_pos, Address::lsl(3)));
__ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
__ movw(count, length); // length (reloaded)
Register sco_temp = c_rarg3; // this register is free now
assert_different_registers(from, to, count, sco_temp,
rscratch2_dst_klass, scratch_src_klass);
// assert_clean_int(count, sco_temp);
// Generate the type check.
const int sco_offset = in_bytes(Klass::super_check_offset_offset());
__ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
// assert_clean_int(sco_temp, r18);
generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
// Fetch destination element klass from the ObjArrayKlass header.
int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
__ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
__ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
// the checkcast_copy loop needs two extra arguments:
assert(c_rarg3 == sco_temp, "#3 already in place");
// Set up arguments for checkcast_copy_entry.
__ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass
__ b(RuntimeAddress(checkcast_copy_entry));
}
__ BIND(L_failed);
__ mov(r0, -1);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(lr);
return start;
}
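
Per the header comment, the stub returns 0 on success and -1^K when only K elements were transferred. A caller-side sketch of decoding that result; the function name and the bool/size_t interface are invented:

#include <cstddef>
#include <cstdint>

// r0 == 0  -> everything copied; r0 == ~K (that is, -1 ^ K) -> K elements copied before failing.
inline bool decode_generic_copy_result_sketch(int64_t r0, size_t* elements_copied) {
  if (r0 == 0) return true;            // full success
  *elements_copied = (size_t)~r0;      // partial transfer count K
  return false;
}
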
void generate_arraycopy_stubs() { void generate_arraycopy_stubs() {
address entry; address entry;
address entry_jbyte_arraycopy; address entry_jbyte_arraycopy;
@ -1655,6 +1989,18 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
/*dest_uninitialized*/true); /*dest_uninitialized*/true);
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
entry_jbyte_arraycopy);
StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
entry_jbyte_arraycopy,
entry_jshort_arraycopy,
entry_jint_arraycopy,
entry_oop_arraycopy,
entry_jlong_arraycopy,
entry_checkcast_arraycopy);
} }
void generate_math_stubs() { Unimplemented(); } void generate_math_stubs() { Unimplemented(); }
@ -1973,7 +2319,7 @@ class StubGenerator: public StubCodeGenerator {
// c_rarg4 - input length // c_rarg4 - input length
// //
// Output: // Output:
// x0 - input length // r0 - input length
// //
address generate_cipherBlockChaining_decryptAESCrypt() { address generate_cipherBlockChaining_decryptAESCrypt() {
assert(UseAES, "need AES instructions and misaligned SSE support"); assert(UseAES, "need AES instructions and misaligned SSE support");

View file

@ -248,6 +248,7 @@ void TemplateInterpreterGenerator::generate_transcendental_entry(AbstractInterpr
break; break;
default: default:
ShouldNotReachHere(); ShouldNotReachHere();
fn = NULL; // unreachable
} }
const int gpargs = 0, rtype = 3; const int gpargs = 0, rtype = 3;
__ mov(rscratch1, fn); __ mov(rscratch1, fn);
@ -436,6 +437,19 @@ address TemplateInterpreterGenerator::generate_deopt_entry_for(TosState state,
__ restore_constant_pool_cache(); __ restore_constant_pool_cache();
__ get_method(rmethod); __ get_method(rmethod);
#if INCLUDE_JVMCI
// Check if we need to take lock at entry of synchronized method.
if (UseJVMCICompiler) {
Label L;
__ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
__ cbz(rscratch1, L);
// Clear flag.
__ strb(zr, Address(rthread, JavaThread::pending_monitorenter_offset()));
// Take lock.
lock_method();
__ bind(L);
}
#endif
// handle exceptions // handle exceptions
{ {
Label L; Label L;
@ -580,7 +594,7 @@ void TemplateInterpreterGenerator::generate_counter_incr(
__ br(Assembler::LT, *profile_method_continue); __ br(Assembler::LT, *profile_method_continue);
// if no method data exists, go to profile_method // if no method data exists, go to profile_method
__ test_method_data_pointer(r0, *profile_method); __ test_method_data_pointer(rscratch2, *profile_method);
} }
{ {

View file

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, Red Hat Inc. All rights reserved. All rights reserved. * Copyright (c) 2014, Red Hat Inc. All rights reserved. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
@ -31,16 +31,8 @@
// referenced by vmStructs.cpp. // referenced by vmStructs.cpp.
#define VM_STRUCTS_CPU(nonstatic_field, static_field, unchecked_nonstatic_field, volatile_nonstatic_field, nonproduct_nonstatic_field, c2_nonstatic_field, unchecked_c1_static_field, unchecked_c2_static_field) \ #define VM_STRUCTS_CPU(nonstatic_field, static_field, unchecked_nonstatic_field, volatile_nonstatic_field, nonproduct_nonstatic_field, c2_nonstatic_field, unchecked_c1_static_field, unchecked_c2_static_field) \
\
/******************************/ \
/* JavaCallWrapper */ \
/******************************/ \
/******************************/ \
/* JavaFrameAnchor */ \
/******************************/ \
volatile_nonstatic_field(JavaFrameAnchor, _last_Java_fp, intptr_t*) volatile_nonstatic_field(JavaFrameAnchor, _last_Java_fp, intptr_t*)
#define VM_TYPES_CPU(declare_type, declare_toplevel_type, declare_oop_type, declare_integer_type, declare_unsigned_integer_type, declare_c1_toplevel_type, declare_c2_type, declare_c2_toplevel_type) #define VM_TYPES_CPU(declare_type, declare_toplevel_type, declare_oop_type, declare_integer_type, declare_unsigned_integer_type, declare_c1_toplevel_type, declare_c2_type, declare_c2_toplevel_type)
#define VM_INT_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) #define VM_INT_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant)

View file

@ -67,8 +67,6 @@ int VM_Version::_model2;
int VM_Version::_variant; int VM_Version::_variant;
int VM_Version::_revision; int VM_Version::_revision;
int VM_Version::_stepping; int VM_Version::_stepping;
int VM_Version::_cpuFeatures;
const char* VM_Version::_features_str = "";
static BufferBlob* stub_blob; static BufferBlob* stub_blob;
static const int stub_size = 550; static const int stub_size = 550;
@ -129,7 +127,7 @@ void VM_Version::get_processor_features() {
char buf[512]; char buf[512];
_cpuFeatures = auxv; _features = auxv;
int cpu_lines = 0; int cpu_lines = 0;
if (FILE *f = fopen("/proc/cpuinfo", "r")) { if (FILE *f = fopen("/proc/cpuinfo", "r")) {
@ -154,12 +152,12 @@ void VM_Version::get_processor_features() {
} }
// Enable vendor specific features // Enable vendor specific features
if (_cpu == CPU_CAVIUM && _variant == 0) _cpuFeatures |= CPU_DMB_ATOMICS; if (_cpu == CPU_CAVIUM && _variant == 0) _features |= CPU_DMB_ATOMICS;
if (_cpu == CPU_ARM && (_model == 0xd03 || _model2 == 0xd03)) _cpuFeatures |= CPU_A53MAC; if (_cpu == CPU_ARM && (_model == 0xd03 || _model2 == 0xd03)) _features |= CPU_A53MAC;
// If an old style /proc/cpuinfo (cpu_lines == 1) then if _model is an A57 (0xd07) // If an old style /proc/cpuinfo (cpu_lines == 1) then if _model is an A57 (0xd07)
// we assume the worst and assume we could be on a big little system and have // we assume the worst and assume we could be on a big little system and have
// undisclosed A53 cores which we could be swapped to at any stage // undisclosed A53 cores which we could be swapped to at any stage
if (_cpu == CPU_ARM && cpu_lines == 1 && _model == 0xd07) _cpuFeatures |= CPU_A53MAC; if (_cpu == CPU_ARM && cpu_lines == 1 && _model == 0xd07) _features |= CPU_A53MAC;
sprintf(buf, "0x%02x:0x%x:0x%03x:%d", _cpu, _variant, _model, _revision); sprintf(buf, "0x%02x:0x%x:0x%03x:%d", _cpu, _variant, _model, _revision);
if (_model2) sprintf(buf+strlen(buf), "(0x%03x)", _model2); if (_model2) sprintf(buf+strlen(buf), "(0x%03x)", _model2);
@ -169,7 +167,7 @@ void VM_Version::get_processor_features() {
if (auxv & HWCAP_SHA1) strcat(buf, ", sha1"); if (auxv & HWCAP_SHA1) strcat(buf, ", sha1");
if (auxv & HWCAP_SHA2) strcat(buf, ", sha256"); if (auxv & HWCAP_SHA2) strcat(buf, ", sha256");
_features_str = os::strdup(buf); _features_string = os::strdup(buf);
if (FLAG_IS_DEFAULT(UseCRC32)) { if (FLAG_IS_DEFAULT(UseCRC32)) {
UseCRC32 = (auxv & HWCAP_CRC32) != 0; UseCRC32 = (auxv & HWCAP_CRC32) != 0;
@ -182,6 +180,11 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseAdler32Intrinsics, true); FLAG_SET_DEFAULT(UseAdler32Intrinsics, true);
} }
if (UseVectorizedMismatchIntrinsic) {
warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
}
if (auxv & HWCAP_AES) { if (auxv & HWCAP_AES) {
UseAES = UseAES || FLAG_IS_DEFAULT(UseAES); UseAES = UseAES || FLAG_IS_DEFAULT(UseAES);
UseAESIntrinsics = UseAESIntrinsics =
@ -199,6 +202,11 @@ void VM_Version::get_processor_features() {
} }
} }
if (UseAESCTRIntrinsics) {
warning("AES/CTR intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
}
if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) { if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
UseCRC32Intrinsics = true; UseCRC32Intrinsics = true;
} }
@ -267,7 +275,7 @@ void VM_Version::get_processor_features() {
} }
if (FLAG_IS_DEFAULT(UseBarriersForVolatile)) { if (FLAG_IS_DEFAULT(UseBarriersForVolatile)) {
UseBarriersForVolatile = (_cpuFeatures & CPU_DMB_ATOMICS) != 0; UseBarriersForVolatile = (_features & CPU_DMB_ATOMICS) != 0;
} }
if (FLAG_IS_DEFAULT(UsePopCountInstruction)) { if (FLAG_IS_DEFAULT(UsePopCountInstruction)) {

View file

@ -30,7 +30,8 @@
#include "runtime/vm_version.hpp" #include "runtime/vm_version.hpp"
class VM_Version : public Abstract_VM_Version { class VM_Version : public Abstract_VM_Version {
public: friend class JVMCIVMStructs;
protected: protected:
static int _cpu; static int _cpu;
static int _model; static int _model;
@ -38,9 +39,6 @@ protected:
static int _variant; static int _variant;
static int _revision; static int _revision;
static int _stepping; static int _stepping;
static int _cpuFeatures; // features returned by the "cpuid" instruction
// 0 if this instruction is not available
static const char* _features_str;
static void get_processor_features(); static void get_processor_features();
@ -52,7 +50,7 @@ public:
static void assert_is_initialized() { static void assert_is_initialized() {
} }
enum { enum Family {
CPU_ARM = 'A', CPU_ARM = 'A',
CPU_BROADCOM = 'B', CPU_BROADCOM = 'B',
CPU_CAVIUM = 'C', CPU_CAVIUM = 'C',
@ -64,9 +62,9 @@ public:
CPU_QUALCOM = 'Q', CPU_QUALCOM = 'Q',
CPU_MARVELL = 'V', CPU_MARVELL = 'V',
CPU_INTEL = 'i', CPU_INTEL = 'i',
} cpuFamily; };
enum { enum Feature_Flag {
CPU_FP = (1<<0), CPU_FP = (1<<0),
CPU_ASIMD = (1<<1), CPU_ASIMD = (1<<1),
CPU_EVTSTRM = (1<<2), CPU_EVTSTRM = (1<<2),
@ -77,16 +75,13 @@ public:
CPU_CRC32 = (1<<7), CPU_CRC32 = (1<<7),
CPU_A53MAC = (1 << 30), CPU_A53MAC = (1 << 30),
CPU_DMB_ATOMICS = (1 << 31), CPU_DMB_ATOMICS = (1 << 31),
} cpuFeatureFlags; };
static const char* cpu_features() { return _features_str; }
static int cpu_family() { return _cpu; } static int cpu_family() { return _cpu; }
static int cpu_model() { return _model; } static int cpu_model() { return _model; }
static int cpu_model2() { return _model2; } static int cpu_model2() { return _model2; }
static int cpu_variant() { return _variant; } static int cpu_variant() { return _variant; }
static int cpu_revision() { return _revision; } static int cpu_revision() { return _revision; }
static int cpu_cpuFeatures() { return _cpuFeatures; }
}; };
#endif // CPU_AARCH64_VM_VM_VERSION_AARCH64_HPP #endif // CPU_AARCH64_VM_VM_VERSION_AARCH64_HPP

View file

@ -141,9 +141,9 @@ void AbstractInterpreter::layout_activation(Method* method,
intptr_t* locals_base = (caller->is_interpreted_frame()) ? intptr_t* locals_base = (caller->is_interpreted_frame()) ?
caller->interpreter_frame_esp() + caller_actual_parameters : caller->interpreter_frame_esp() + caller_actual_parameters :
caller->sp() + method->max_locals() - 1 + (frame::abi_minframe_size / Interpreter::stackElementSize) ; caller->sp() + method->max_locals() - 1 + (frame::abi_minframe_size / Interpreter::stackElementSize);
intptr_t* monitor_base = caller->sp() - frame::ijava_state_size / Interpreter::stackElementSize ; intptr_t* monitor_base = caller->sp() - frame::ijava_state_size / Interpreter::stackElementSize;
intptr_t* monitor = monitor_base - (moncount * frame::interpreter_frame_monitor_size()); intptr_t* monitor = monitor_base - (moncount * frame::interpreter_frame_monitor_size());
intptr_t* esp_base = monitor - 1; intptr_t* esp_base = monitor - 1;
intptr_t* esp = esp_base - tempcount - popframe_extra_args; intptr_t* esp = esp_base - tempcount - popframe_extra_args;

View file

@ -53,9 +53,6 @@ int AbstractAssembler::code_fill_byte() {
return 0x00; // illegal instruction 0x00000000 return 0x00; // illegal instruction 0x00000000
} }
void Assembler::print_instruction(int inst) {
Unimplemented();
}
// Patch instruction `inst' at offset `inst_pos' to refer to // Patch instruction `inst' at offset `inst_pos' to refer to
// `dest_pos' and return the resulting instruction. We should have // `dest_pos' and return the resulting instruction. We should have
@ -484,7 +481,7 @@ int Assembler::add_const_optimized(Register d, Register s, long x, Register tmp,
if (d != s) { mr(d, s); } if (d != s) { mr(d, s); }
return 0; return 0;
} }
if (return_simm16_rest) { if (return_simm16_rest && (d == s)) {
return xd; return xd;
} }
addi(d, s, xd); addi(d, s, xd);
View file
@ -31,10 +31,37 @@
// Address is an abstraction used to represent a memory location // Address is an abstraction used to represent a memory location
// as used in assembler instructions. // as used in assembler instructions.
// PPC instructions grok either baseReg + indexReg or baseReg + disp. // PPC instructions grok either baseReg + indexReg or baseReg + disp.
// So far we do not use this, as the simplification this class brings is
// small on PPC with its simple addressing modes. Use RegisterOrConstant to
// represent an offset.
class Address VALUE_OBJ_CLASS_SPEC { class Address VALUE_OBJ_CLASS_SPEC {
private:
Register _base; // Base register.
Register _index; // Index register.
intptr_t _disp; // Displacement.
public:
Address(Register b, Register i, address d = 0)
: _base(b), _index(i), _disp((intptr_t)d) {
assert(i == noreg || d == 0, "can't have both");
}
Address(Register b, address d = 0)
: _base(b), _index(noreg), _disp((intptr_t)d) {}
Address(Register b, intptr_t d)
: _base(b), _index(noreg), _disp(d) {}
Address(Register b, RegisterOrConstant roc)
: _base(b), _index(noreg), _disp(0) {
if (roc.is_constant()) _disp = roc.as_constant(); else _index = roc.as_register();
}
Address()
: _base(noreg), _index(noreg), _disp(0) {}
// accessors
Register base() const { return _base; }
Register index() const { return _index; }
int disp() const { return (int)_disp; }
bool is_const() const { return _base == noreg && _index == noreg; }
}; };
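A minimal standalone sketch (not HotSpot code) of the invariant the new Address class enforces: a memory operand carries either an index register or a displacement, never both. Register and noreg are simplified stand-ins for the real types:

#include <cassert>
#include <cstdint>

typedef int Register;              // simplified stand-in
const Register noreg = -1;

class MockAddress {
  Register _base;
  Register _index;
  intptr_t _disp;
 public:
  MockAddress(Register b, Register i, intptr_t d = 0)
    : _base(b), _index(i), _disp(d) {
    assert(i == noreg || d == 0);  // baseReg + indexReg or baseReg + disp, not both
  }
  bool is_const() const { return _base == noreg && _index == noreg; }
};

int main() {
  MockAddress disp_form(/*base=*/1, noreg, 16);    // baseReg + disp
  MockAddress index_form(/*base=*/1, /*index=*/2); // baseReg + indexReg
  return (disp_form.is_const() || index_form.is_const()) ? 1 : 0;
}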
class AddressLiteral VALUE_OBJ_CLASS_SPEC { class AddressLiteral VALUE_OBJ_CLASS_SPEC {
@ -164,10 +191,14 @@ struct FunctionDescriptor VALUE_OBJ_CLASS_SPEC {
}; };
#endif #endif
// The PPC Assembler: Pure assembler doing NO optimizations on the
// instruction level; i.e., what you write is what you get. The
// Assembler is generating code into a CodeBuffer.
class Assembler : public AbstractAssembler { class Assembler : public AbstractAssembler {
protected: protected:
// Displacement routines // Displacement routines
static void print_instruction(int inst);
static int patched_branch(int dest_pos, int inst, int inst_pos); static int patched_branch(int dest_pos, int inst, int inst_pos);
static int branch_destination(int inst, int pos); static int branch_destination(int inst, int pos);
@ -839,41 +870,38 @@ class Assembler : public AbstractAssembler {
enum Predict { pt = 1, pn = 0 }; // pt = predict taken enum Predict { pt = 1, pn = 0 }; // pt = predict taken
// instruction must start at passed address // Instruction must start at passed address.
static int instr_len(unsigned char *instr) { return BytesPerInstWord; } static int instr_len(unsigned char *instr) { return BytesPerInstWord; }
// instruction must be left-justified in argument
static int instr_len(unsigned long instr) { return BytesPerInstWord; }
// longest instructions // longest instructions
static int instr_maxlen() { return BytesPerInstWord; } static int instr_maxlen() { return BytesPerInstWord; }
// Test if x is within signed immediate range for nbits. // Test if x is within signed immediate range for nbits.
static bool is_simm(int x, unsigned int nbits) { static bool is_simm(int x, unsigned int nbits) {
assert(0 < nbits && nbits < 32, "out of bounds"); assert(0 < nbits && nbits < 32, "out of bounds");
const int min = -( ((int)1) << nbits-1 ); const int min = -(((int)1) << nbits-1);
const int maxplus1 = ( ((int)1) << nbits-1 ); const int maxplus1 = (((int)1) << nbits-1);
return min <= x && x < maxplus1; return min <= x && x < maxplus1;
} }
static bool is_simm(jlong x, unsigned int nbits) { static bool is_simm(jlong x, unsigned int nbits) {
assert(0 < nbits && nbits < 64, "out of bounds"); assert(0 < nbits && nbits < 64, "out of bounds");
const jlong min = -( ((jlong)1) << nbits-1 ); const jlong min = -(((jlong)1) << nbits-1);
const jlong maxplus1 = ( ((jlong)1) << nbits-1 ); const jlong maxplus1 = (((jlong)1) << nbits-1);
return min <= x && x < maxplus1; return min <= x && x < maxplus1;
} }
// Test if x is within unsigned immediate range for nbits // Test if x is within unsigned immediate range for nbits.
static bool is_uimm(int x, unsigned int nbits) { static bool is_uimm(int x, unsigned int nbits) {
assert(0 < nbits && nbits < 32, "out of bounds"); assert(0 < nbits && nbits < 32, "out of bounds");
const int maxplus1 = ( ((int)1) << nbits ); const unsigned int maxplus1 = (((unsigned int)1) << nbits);
return 0 <= x && x < maxplus1; return (unsigned int)x < maxplus1;
} }
static bool is_uimm(jlong x, unsigned int nbits) { static bool is_uimm(jlong x, unsigned int nbits) {
assert(0 < nbits && nbits < 64, "out of bounds"); assert(0 < nbits && nbits < 64, "out of bounds");
const jlong maxplus1 = ( ((jlong)1) << nbits ); const julong maxplus1 = (((julong)1) << nbits);
return 0 <= x && x < maxplus1; return (julong)x < maxplus1;
} }
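The same range checks restated as a small standalone program (jlong/julong replaced by long long/unsigned long long), which makes the effect of the new unsigned comparison in is_uimm visible:

#include <cassert>

static bool is_simm(long long x, unsigned int nbits) {
  assert(0 < nbits && nbits < 64);
  const long long min      = -(1LL << (nbits - 1));
  const long long maxplus1 =  (1LL << (nbits - 1));
  return min <= x && x < maxplus1;
}

static bool is_uimm(long long x, unsigned int nbits) {
  assert(0 < nbits && nbits < 64);
  const unsigned long long maxplus1 = 1ULL << nbits;
  return (unsigned long long)x < maxplus1;
}

int main() {
  // 16-bit signed immediates cover [-32768, 32767].
  assert(is_simm(-32768, 16) && is_simm(32767, 16) && !is_simm(32768, 16));
  // 16-bit unsigned immediates cover [0, 65535]; negative inputs wrap to huge
  // unsigned values and are rejected by the single unsigned comparison.
  assert(is_uimm(65535, 16) && !is_uimm(65536, 16) && !is_uimm(-1, 16));
  return 0;
}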
protected: protected:
@ -1196,6 +1224,8 @@ class Assembler : public AbstractAssembler {
inline void mullw_( Register d, Register a, Register b); inline void mullw_( Register d, Register a, Register b);
inline void mulhw( Register d, Register a, Register b); inline void mulhw( Register d, Register a, Register b);
inline void mulhw_( Register d, Register a, Register b); inline void mulhw_( Register d, Register a, Register b);
inline void mulhwu( Register d, Register a, Register b);
inline void mulhwu_(Register d, Register a, Register b);
inline void mulhd( Register d, Register a, Register b); inline void mulhd( Register d, Register a, Register b);
inline void mulhd_( Register d, Register a, Register b); inline void mulhd_( Register d, Register a, Register b);
inline void mulhdu( Register d, Register a, Register b); inline void mulhdu( Register d, Register a, Register b);
@ -1376,8 +1406,11 @@ class Assembler : public AbstractAssembler {
inline void orc( Register a, Register s, Register b); inline void orc( Register a, Register s, Register b);
inline void orc_( Register a, Register s, Register b); inline void orc_( Register a, Register s, Register b);
inline void extsb( Register a, Register s); inline void extsb( Register a, Register s);
inline void extsb_( Register a, Register s);
inline void extsh( Register a, Register s); inline void extsh( Register a, Register s);
inline void extsh_( Register a, Register s);
inline void extsw( Register a, Register s); inline void extsw( Register a, Register s);
inline void extsw_( Register a, Register s);
// extended mnemonics // extended mnemonics
inline void nop(); inline void nop();
@ -1767,6 +1800,8 @@ class Assembler : public AbstractAssembler {
inline void smt_yield(); inline void smt_yield();
inline void smt_mdoio(); inline void smt_mdoio();
inline void smt_mdoom(); inline void smt_mdoom();
// >= Power8
inline void smt_miso();
// trap instructions // trap instructions
inline void twi_0(Register a); // for load with acquire semantics use load+twi_0+isync (trap can't occur) inline void twi_0(Register a); // for load with acquire semantics use load+twi_0+isync (trap can't occur)
@ -2168,6 +2203,7 @@ class Assembler : public AbstractAssembler {
inline void load_const(Register d, void* a, Register tmp = noreg); inline void load_const(Register d, void* a, Register tmp = noreg);
inline void load_const(Register d, Label& L, Register tmp = noreg); inline void load_const(Register d, Label& L, Register tmp = noreg);
inline void load_const(Register d, AddressLiteral& a, Register tmp = noreg); inline void load_const(Register d, AddressLiteral& a, Register tmp = noreg);
inline void load_const32(Register d, int i); // load signed int (patchable)
// Load a 64 bit constant, optimized, not identifyable. // Load a 64 bit constant, optimized, not identifyable.
// Tmp can be used to increase ILP. Set return_simm16_rest = true to get a // Tmp can be used to increase ILP. Set return_simm16_rest = true to get a
View file
@ -117,6 +117,8 @@ inline void Assembler::mullw( Register d, Register a, Register b) { emit_int32(
inline void Assembler::mullw_( Register d, Register a, Register b) { emit_int32(MULLW_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(1)); } inline void Assembler::mullw_( Register d, Register a, Register b) { emit_int32(MULLW_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(1)); }
inline void Assembler::mulhw( Register d, Register a, Register b) { emit_int32(MULHW_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); } inline void Assembler::mulhw( Register d, Register a, Register b) { emit_int32(MULHW_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
inline void Assembler::mulhw_( Register d, Register a, Register b) { emit_int32(MULHW_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); } inline void Assembler::mulhw_( Register d, Register a, Register b) { emit_int32(MULHW_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); }
inline void Assembler::mulhwu( Register d, Register a, Register b) { emit_int32(MULHWU_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
inline void Assembler::mulhwu_(Register d, Register a, Register b) { emit_int32(MULHWU_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); }
inline void Assembler::mulhd( Register d, Register a, Register b) { emit_int32(MULHD_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); } inline void Assembler::mulhd( Register d, Register a, Register b) { emit_int32(MULHD_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
inline void Assembler::mulhd_( Register d, Register a, Register b) { emit_int32(MULHD_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); } inline void Assembler::mulhd_( Register d, Register a, Register b) { emit_int32(MULHD_OPCODE | rt(d) | ra(a) | rb(b) | rc(1)); }
inline void Assembler::mulhdu( Register d, Register a, Register b) { emit_int32(MULHDU_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); } inline void Assembler::mulhdu( Register d, Register a, Register b) { emit_int32(MULHDU_OPCODE | rt(d) | ra(a) | rb(b) | rc(0)); }
@ -206,8 +208,11 @@ inline void Assembler::andc_( Register a, Register s, Register b) { emit_in
inline void Assembler::orc( Register a, Register s, Register b) { emit_int32(ORC_OPCODE | rta(a) | rs(s) | rb(b) | rc(0)); } inline void Assembler::orc( Register a, Register s, Register b) { emit_int32(ORC_OPCODE | rta(a) | rs(s) | rb(b) | rc(0)); }
inline void Assembler::orc_( Register a, Register s, Register b) { emit_int32(ORC_OPCODE | rta(a) | rs(s) | rb(b) | rc(1)); } inline void Assembler::orc_( Register a, Register s, Register b) { emit_int32(ORC_OPCODE | rta(a) | rs(s) | rb(b) | rc(1)); }
inline void Assembler::extsb( Register a, Register s) { emit_int32(EXTSB_OPCODE | rta(a) | rs(s) | rc(0)); } inline void Assembler::extsb( Register a, Register s) { emit_int32(EXTSB_OPCODE | rta(a) | rs(s) | rc(0)); }
inline void Assembler::extsb_( Register a, Register s) { emit_int32(EXTSB_OPCODE | rta(a) | rs(s) | rc(1)); }
inline void Assembler::extsh( Register a, Register s) { emit_int32(EXTSH_OPCODE | rta(a) | rs(s) | rc(0)); } inline void Assembler::extsh( Register a, Register s) { emit_int32(EXTSH_OPCODE | rta(a) | rs(s) | rc(0)); }
inline void Assembler::extsh_( Register a, Register s) { emit_int32(EXTSH_OPCODE | rta(a) | rs(s) | rc(1)); }
inline void Assembler::extsw( Register a, Register s) { emit_int32(EXTSW_OPCODE | rta(a) | rs(s) | rc(0)); } inline void Assembler::extsw( Register a, Register s) { emit_int32(EXTSW_OPCODE | rta(a) | rs(s) | rc(0)); }
inline void Assembler::extsw_( Register a, Register s) { emit_int32(EXTSW_OPCODE | rta(a) | rs(s) | rc(1)); }
// extended mnemonics // extended mnemonics
inline void Assembler::nop() { Assembler::ori(R0, R0, 0); } inline void Assembler::nop() { Assembler::ori(R0, R0, 0); }
@ -609,6 +614,8 @@ inline void Assembler::smt_prio_high() { Assembler::or_unchecked(R3, R3,
inline void Assembler::smt_yield() { Assembler::or_unchecked(R27, R27, R27); } inline void Assembler::smt_yield() { Assembler::or_unchecked(R27, R27, R27); }
inline void Assembler::smt_mdoio() { Assembler::or_unchecked(R29, R29, R29); } inline void Assembler::smt_mdoio() { Assembler::or_unchecked(R29, R29, R29); }
inline void Assembler::smt_mdoom() { Assembler::or_unchecked(R30, R30, R30); } inline void Assembler::smt_mdoom() { Assembler::or_unchecked(R30, R30, R30); }
// >= Power8
inline void Assembler::smt_miso() { Assembler::or_unchecked(R26, R26, R26); }
inline void Assembler::twi_0(Register a) { twi_unchecked(0, a, 0);} inline void Assembler::twi_0(Register a) { twi_unchecked(0, a, 0);}
@ -967,12 +974,15 @@ inline void Assembler::load_const(Register d, Label& L, Register tmp) {
// Load a 64 bit constant encoded by an AddressLiteral. patchable. // Load a 64 bit constant encoded by an AddressLiteral. patchable.
inline void Assembler::load_const(Register d, AddressLiteral& a, Register tmp) { inline void Assembler::load_const(Register d, AddressLiteral& a, Register tmp) {
assert(d != R0, "R0 not allowed");
// First relocate (we don't change the offset in the RelocationHolder, // First relocate (we don't change the offset in the RelocationHolder,
// just pass a.rspec()), then delegate to load_const(Register, long). // just pass a.rspec()), then delegate to load_const(Register, long).
relocate(a.rspec()); relocate(a.rspec());
load_const(d, (long)a.value(), tmp); load_const(d, (long)a.value(), tmp);
} }
inline void Assembler::load_const32(Register d, int i) {
lis(d, i >> 16);
ori(d, d, i & 0xFFFF);
}
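A host-side sanity check of the hi/lo split that load_const32 feeds into lis/ori (a sketch of the arithmetic only; no instructions are emitted):

#include <cassert>
#include <cstdint>

int main() {
  int32_t  i  = 0x12345678;
  int16_t  hi = (int16_t)(i >> 16);       // lis operand: 0x1234
  uint16_t lo = (uint16_t)(i & 0xFFFF);   // ori operand: 0x5678
  // lis places hi in bits 16..31, ori then ORs lo into bits 0..15.
  int32_t rebuilt = ((int32_t)hi << 16) | lo;
  assert(rebuilt == i);
  return 0;
}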
#endif // CPU_PPC_VM_ASSEMBLER_PPC_INLINE_HPP #endif // CPU_PPC_VM_ASSEMBLER_PPC_INLINE_HPP
View file
@ -0,0 +1,527 @@
/*
* Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "c1/c1_CodeStubs.hpp"
#include "c1/c1_FrameMap.hpp"
#include "c1/c1_LIRAssembler.hpp"
#include "c1/c1_MacroAssembler.hpp"
#include "c1/c1_Runtime1.hpp"
#include "nativeInst_ppc.hpp"
#include "runtime/sharedRuntime.hpp"
#include "utilities/macros.hpp"
#include "vmreg_ppc.inline.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#endif // INCLUDE_ALL_GCS
#define __ ce->masm()->
RangeCheckStub::RangeCheckStub(CodeEmitInfo* info, LIR_Opr index,
bool throw_index_out_of_bounds_exception)
: _throw_index_out_of_bounds_exception(throw_index_out_of_bounds_exception)
, _index(index) {
assert(info != NULL, "must have info");
_info = new CodeEmitInfo(info);
}
void RangeCheckStub::emit_code(LIR_Assembler* ce) {
__ bind(_entry);
if (_info->deoptimize_on_exception()) {
address a = Runtime1::entry_for(Runtime1::predicate_failed_trap_id);
// May be used by optimizations like LoopInvariantCodeMotion or RangeCheckEliminator.
DEBUG_ONLY( __ untested("RangeCheckStub: predicate_failed_trap_id"); )
//__ load_const_optimized(R0, a);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(a));
__ mtctr(R0);
__ bctrl();
ce->add_call_info_here(_info);
ce->verify_oop_map(_info);
debug_only(__ illtrap());
return;
}
address stub = _throw_index_out_of_bounds_exception ? Runtime1::entry_for(Runtime1::throw_index_exception_id)
: Runtime1::entry_for(Runtime1::throw_range_check_failed_id);
//__ load_const_optimized(R0, stub);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
__ mtctr(R0);
Register index = R0; // pass in R0
if (_index->is_register()) {
__ extsw(index, _index->as_register());
} else {
__ load_const_optimized(index, _index->as_jint());
}
__ bctrl();
ce->add_call_info_here(_info);
ce->verify_oop_map(_info);
debug_only(__ illtrap());
}
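The stubs in this file reach their runtime entries TOC-relative: R0 = R29_TOC + offset_to_global_toc(target), then mtctr/bctrl. A host-side sketch of that offset arithmetic, with made-up addresses:

#include <cassert>
#include <cstdint>

int main() {
  uintptr_t toc_base = 0x00007f0000100000ULL;  // assumed global TOC (R29_TOC)
  uintptr_t stub     = 0x00007f00000f8000ULL;  // assumed Runtime1 stub entry
  intptr_t  offset   = (intptr_t)(stub - toc_base);  // offset_to_global_toc(stub)
  assert((uintptr_t)(toc_base + offset) == stub);    // R0 = R29_TOC + offset
  return 0;
}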
PredicateFailedStub::PredicateFailedStub(CodeEmitInfo* info) {
_info = new CodeEmitInfo(info);
}
void PredicateFailedStub::emit_code(LIR_Assembler* ce) {
__ bind(_entry);
address a = Runtime1::entry_for(Runtime1::predicate_failed_trap_id);
//__ load_const_optimized(R0, a);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(a));
__ mtctr(R0);
__ bctrl();
ce->add_call_info_here(_info);
ce->verify_oop_map(_info);
debug_only(__ illtrap());
}
void CounterOverflowStub::emit_code(LIR_Assembler* ce) {
__ bind(_entry);
// Parameter 1: bci
__ load_const_optimized(R0, _bci);
__ std(R0, -16, R1_SP);
// Parameter 2: Method*
Metadata *m = _method->as_constant_ptr()->as_metadata();
AddressLiteral md = __ constant_metadata_address(m); // Notify OOP recorder (don't need the relocation).
__ load_const_optimized(R0, md.value());
__ std(R0, -8, R1_SP);
address a = Runtime1::entry_for(Runtime1::counter_overflow_id);
//__ load_const_optimized(R0, a);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(a));
__ mtctr(R0);
__ bctrl();
ce->add_call_info_here(_info);
ce->verify_oop_map(_info);
__ b(_continuation);
}
void DivByZeroStub::emit_code(LIR_Assembler* ce) {
if (_offset != -1) {
ce->compilation()->implicit_exception_table()->append(_offset, __ offset());
}
__ bind(_entry);
address stub = Runtime1::entry_for(Runtime1::throw_div0_exception_id);
//__ load_const_optimized(R0, stub);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
__ mtctr(R0);
__ bctrl();
ce->add_call_info_here(_info);
ce->verify_oop_map(_info);
debug_only(__ illtrap());
}
void ImplicitNullCheckStub::emit_code(LIR_Assembler* ce) {
address a;
if (_info->deoptimize_on_exception()) {
// Deoptimize, do not throw the exception, because it is probably wrong to do it here.
a = Runtime1::entry_for(Runtime1::predicate_failed_trap_id);
} else {
a = Runtime1::entry_for(Runtime1::throw_null_pointer_exception_id);
}
if (ImplicitNullChecks || TrapBasedNullChecks) {
ce->compilation()->implicit_exception_table()->append(_offset, __ offset());
}
__ bind(_entry);
//__ load_const_optimized(R0, a);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(a));
__ mtctr(R0);
__ bctrl();
ce->add_call_info_here(_info);
ce->verify_oop_map(_info);
debug_only(__ illtrap());
}
// Implementation of SimpleExceptionStub
void SimpleExceptionStub::emit_code(LIR_Assembler* ce) {
__ bind(_entry);
address stub = Runtime1::entry_for(_stub);
//__ load_const_optimized(R0, stub);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
if (_obj->is_valid()) { __ mr_if_needed(/*tmp1 in do_CheckCast*/ R4_ARG2, _obj->as_register()); }
__ mtctr(R0);
__ bctrl();
ce->add_call_info_here(_info);
debug_only( __ illtrap(); )
}
// Implementation of NewInstanceStub
NewInstanceStub::NewInstanceStub(LIR_Opr klass_reg, LIR_Opr result, ciInstanceKlass* klass, CodeEmitInfo* info, Runtime1::StubID stub_id) {
_result = result;
_klass = klass;
_klass_reg = klass_reg;
_info = new CodeEmitInfo(info);
assert(stub_id == Runtime1::new_instance_id ||
stub_id == Runtime1::fast_new_instance_id ||
stub_id == Runtime1::fast_new_instance_init_check_id,
"need new_instance id");
_stub_id = stub_id;
}
void NewInstanceStub::emit_code(LIR_Assembler* ce) {
__ bind(_entry);
address entry = Runtime1::entry_for(_stub_id);
//__ load_const_optimized(R0, entry);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(entry));
__ mtctr(R0);
__ bctrl();
ce->add_call_info_here(_info);
ce->verify_oop_map(_info);
__ b(_continuation);
}
// Implementation of NewTypeArrayStub
NewTypeArrayStub::NewTypeArrayStub(LIR_Opr klass_reg, LIR_Opr length, LIR_Opr result, CodeEmitInfo* info) {
_klass_reg = klass_reg;
_length = length;
_result = result;
_info = new CodeEmitInfo(info);
}
void NewTypeArrayStub::emit_code(LIR_Assembler* ce) {
__ bind(_entry);
address entry = Runtime1::entry_for(Runtime1::new_type_array_id);
//__ load_const_optimized(R0, entry);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(entry));
__ mr_if_needed(/*op->tmp1()->as_register()*/ R5_ARG3, _length->as_register()); // already sign-extended
__ mtctr(R0);
__ bctrl();
ce->add_call_info_here(_info);
ce->verify_oop_map(_info);
__ b(_continuation);
}
// Implementation of NewObjectArrayStub
NewObjectArrayStub::NewObjectArrayStub(LIR_Opr klass_reg, LIR_Opr length, LIR_Opr result, CodeEmitInfo* info) {
_klass_reg = klass_reg;
_length = length;
_result = result;
_info = new CodeEmitInfo(info);
}
void NewObjectArrayStub::emit_code(LIR_Assembler* ce) {
__ bind(_entry);
address entry = Runtime1::entry_for(Runtime1::new_object_array_id);
//__ load_const_optimized(R0, entry);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(entry));
__ mr_if_needed(/*op->tmp1()->as_register()*/ R5_ARG3, _length->as_register()); // already sign-extended
__ mtctr(R0);
__ bctrl();
ce->add_call_info_here(_info);
ce->verify_oop_map(_info);
__ b(_continuation);
}
// Implementation of MonitorAccessStubs
MonitorEnterStub::MonitorEnterStub(LIR_Opr obj_reg, LIR_Opr lock_reg, CodeEmitInfo* info)
: MonitorAccessStub(obj_reg, lock_reg) {
_info = new CodeEmitInfo(info);
}
void MonitorEnterStub::emit_code(LIR_Assembler* ce) {
__ bind(_entry);
address stub = Runtime1::entry_for(ce->compilation()->has_fpu_code() ? Runtime1::monitorenter_id : Runtime1::monitorenter_nofpu_id);
//__ load_const_optimized(R0, stub);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
__ mr_if_needed(/*scratch_opr()->as_register()*/ R4_ARG2, _obj_reg->as_register());
assert(_lock_reg->as_register() == R5_ARG3, "");
__ mtctr(R0);
__ bctrl();
ce->add_call_info_here(_info);
ce->verify_oop_map(_info);
__ b(_continuation);
}
void MonitorExitStub::emit_code(LIR_Assembler* ce) {
__ bind(_entry);
if (_compute_lock) {
ce->monitor_address(_monitor_ix, _lock_reg);
}
address stub = Runtime1::entry_for(ce->compilation()->has_fpu_code() ? Runtime1::monitorexit_id : Runtime1::monitorexit_nofpu_id);
//__ load_const_optimized(R0, stub);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
assert(_lock_reg->as_register() == R4_ARG2, "");
__ mtctr(R0);
__ bctrl();
__ b(_continuation);
}
// Implementation of patching:
// - Copy the code at given offset to an inlined buffer (first the bytes, then the number of bytes).
// - Replace original code with a call to the stub.
// At Runtime:
// - call to stub, jump to runtime
// - in runtime: preserve all registers (especially objects, i.e., source and destination object)
// - in runtime: after initializing class, restore original code, reexecute instruction
int PatchingStub::_patch_info_offset = -(5 * BytesPerInstWord);
void PatchingStub::align_patch_site(MacroAssembler* ) {
// Patch sites on ppc are always properly aligned.
}
#ifdef ASSERT
inline void compare_with_patch_site(address template_start, address pc_start, int bytes_to_copy) {
address start = template_start;
for (int i = 0; i < bytes_to_copy; i++) {
address ptr = (address)(pc_start + i);
int a_byte = (*ptr) & 0xFF;
assert(a_byte == *start++, "should be the same code");
}
}
#endif
void PatchingStub::emit_code(LIR_Assembler* ce) {
// copy original code here
assert(NativeGeneralJump::instruction_size <= _bytes_to_copy && _bytes_to_copy <= 0xFF,
"not enough room for call");
assert((_bytes_to_copy & 0x3) == 0, "must copy a multiple of four bytes");
Label call_patch;
int being_initialized_entry = __ offset();
if (_id == load_klass_id) {
// Produce a copy of the load klass instruction for use by the being initialized case.
AddressLiteral addrlit((address)NULL, metadata_Relocation::spec(_index));
__ load_const(_obj, addrlit, R0);
DEBUG_ONLY( compare_with_patch_site(__ code_section()->start() + being_initialized_entry, _pc_start, _bytes_to_copy); )
} else if (_id == load_mirror_id || _id == load_appendix_id) {
// Produce a copy of the load mirror instruction for use by the being initialized case.
AddressLiteral addrlit((address)NULL, oop_Relocation::spec(_index));
__ load_const(_obj, addrlit, R0);
DEBUG_ONLY( compare_with_patch_site(__ code_section()->start() + being_initialized_entry, _pc_start, _bytes_to_copy); )
} else {
// Make a copy of the code which is going to be patched.
for (int i = 0; i < _bytes_to_copy; i++) {
address ptr = (address)(_pc_start + i);
int a_byte = (*ptr) & 0xFF;
__ emit_int8 (a_byte);
}
}
address end_of_patch = __ pc();
int bytes_to_skip = 0;
if (_id == load_mirror_id) {
int offset = __ offset();
__ block_comment(" being_initialized check");
// Static field accesses have special semantics while the class
// initializer is being run so we emit a test which can be used to
// check that this code is being executed by the initializing
// thread.
assert(_obj != noreg, "must be a valid register");
assert(_index >= 0, "must have oop index");
__ mr(R0, _obj); // spill
__ ld(_obj, java_lang_Class::klass_offset_in_bytes(), _obj);
__ ld(_obj, in_bytes(InstanceKlass::init_thread_offset()), _obj);
__ cmpd(CCR0, _obj, R16_thread);
__ mr(_obj, R0); // restore
__ bne(CCR0, call_patch);
// Load_klass patches may execute the patched code before it's
// copied back into place so we need to jump back into the main
// code of the nmethod to continue execution.
__ b(_patch_site_continuation);
// Make sure this extra code gets skipped.
bytes_to_skip += __ offset() - offset;
}
// Now emit the patch record telling the runtime how to find the
// pieces of the patch. We only need 3 bytes but it has to be
// aligned as an instruction so emit 4 bytes.
int sizeof_patch_record = 4;
bytes_to_skip += sizeof_patch_record;
// Emit the offsets needed to find the code to patch.
int being_initialized_entry_offset = __ offset() - being_initialized_entry + sizeof_patch_record;
// Emit the patch record. We need to emit a full word, so emit an extra empty byte.
__ emit_int8(0);
__ emit_int8(being_initialized_entry_offset);
__ emit_int8(bytes_to_skip);
__ emit_int8(_bytes_to_copy);
address patch_info_pc = __ pc();
assert(patch_info_pc - end_of_patch == bytes_to_skip, "incorrect patch info");
address entry = __ pc();
NativeGeneralJump::insert_unconditional((address)_pc_start, entry);
address target = NULL;
relocInfo::relocType reloc_type = relocInfo::none;
switch (_id) {
case access_field_id: target = Runtime1::entry_for(Runtime1::access_field_patching_id); break;
case load_klass_id: target = Runtime1::entry_for(Runtime1::load_klass_patching_id);
reloc_type = relocInfo::metadata_type; break;
case load_mirror_id: target = Runtime1::entry_for(Runtime1::load_mirror_patching_id);
reloc_type = relocInfo::oop_type; break;
case load_appendix_id: target = Runtime1::entry_for(Runtime1::load_appendix_patching_id);
reloc_type = relocInfo::oop_type; break;
default: ShouldNotReachHere();
}
__ bind(call_patch);
__ block_comment("patch entry point");
//__ load_const(R0, target); + mtctr + bctrl must have size -_patch_info_offset
__ load_const32(R0, MacroAssembler::offset_to_global_toc(target));
__ add(R0, R29_TOC, R0);
__ mtctr(R0);
__ bctrl();
assert(_patch_info_offset == (patch_info_pc - __ pc()), "must not change");
ce->add_call_info_here(_info);
__ b(_patch_site_entry);
if (_id == load_klass_id || _id == load_mirror_id || _id == load_appendix_id) {
CodeSection* cs = __ code_section();
address pc = (address)_pc_start;
RelocIterator iter(cs, pc, pc + 1);
relocInfo::change_reloc_info_for_address(&iter, (address) pc, reloc_type, relocInfo::none);
}
}
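For reference, a standalone sketch of reading back the 4-byte patch record emitted above (one zero pad byte followed by the three offsets, in emit order); the concrete values are made up:

#include <cassert>
#include <cstdint>

struct PatchRecord {
  uint8_t pad;                              // always 0
  uint8_t being_initialized_entry_offset;
  uint8_t bytes_to_skip;
  uint8_t bytes_to_copy;
};

int main() {
  const uint8_t emitted[4] = { 0, 24, 28, 8 };   // made-up example record
  PatchRecord r = { emitted[0], emitted[1], emitted[2], emitted[3] };
  assert(r.pad == 0);
  assert((r.bytes_to_copy & 0x3) == 0);     // whole instructions are copied
  return 0;
}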
void DeoptimizeStub::emit_code(LIR_Assembler* ce) {
__ bind(_entry);
address stub = Runtime1::entry_for(Runtime1::deoptimize_id);
//__ load_const_optimized(R0, stub);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
__ mtctr(R0);
__ load_const_optimized(R0, _trap_request); // Pass trap request in R0.
__ bctrl();
ce->add_call_info_here(_info);
debug_only(__ illtrap());
}
void ArrayCopyStub::emit_code(LIR_Assembler* ce) {
//---------------slow case: call to native-----------------
__ bind(_entry);
__ mr(R3_ARG1, src()->as_register());
__ extsw(R4_ARG2, src_pos()->as_register());
__ mr(R5_ARG3, dst()->as_register());
__ extsw(R6_ARG4, dst_pos()->as_register());
__ extsw(R7_ARG5, length()->as_register());
ce->emit_static_call_stub();
bool success = ce->emit_trampoline_stub_for_call(SharedRuntime::get_resolve_static_call_stub());
if (!success) { return; }
__ relocate(relocInfo::static_call_type);
// Note: At this point we do not have the address of the trampoline
// stub, and the entry point might be too far away for bl, so __ pc()
// serves as dummy and the bl will be patched later.
__ code()->set_insts_mark();
__ bl(__ pc());
ce->add_call_info_here(info());
ce->verify_oop_map(info());
#ifndef PRODUCT
const address counter = (address)&Runtime1::_arraycopy_slowcase_cnt;
const Register tmp = R3, tmp2 = R4;
int simm16_offs = __ load_const_optimized(tmp, counter, tmp2, true);
__ lwz(tmp2, simm16_offs, tmp);
__ addi(tmp2, tmp2, 1);
__ stw(tmp2, simm16_offs, tmp);
#endif
__ b(_continuation);
}
///////////////////////////////////////////////////////////////////////////////////
#if INCLUDE_ALL_GCS
void G1PreBarrierStub::emit_code(LIR_Assembler* ce) {
// At this point we know that marking is in progress.
// If do_load() is true then we have to emit the
// load of the previous value; otherwise it has already
// been loaded into _pre_val.
__ bind(_entry);
assert(pre_val()->is_register(), "Precondition.");
Register pre_val_reg = pre_val()->as_register();
if (do_load()) {
ce->mem2reg(addr(), pre_val(), T_OBJECT, patch_code(), info(), false /*wide*/, false /*unaligned*/);
}
__ cmpdi(CCR0, pre_val_reg, 0);
__ bc_far_optimized(Assembler::bcondCRbiIs1, __ bi0(CCR0, Assembler::equal), _continuation);
address stub = Runtime1::entry_for(Runtime1::g1_pre_barrier_slow_id);
//__ load_const_optimized(R0, stub);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
__ std(pre_val_reg, -8, R1_SP); // Pass pre_val on stack.
__ mtctr(R0);
__ bctrl();
__ b(_continuation);
}
void G1PostBarrierStub::emit_code(LIR_Assembler* ce) {
__ bind(_entry);
assert(addr()->is_register(), "Precondition.");
assert(new_val()->is_register(), "Precondition.");
Register addr_reg = addr()->as_pointer_register();
Register new_val_reg = new_val()->as_register();
__ cmpdi(CCR0, new_val_reg, 0);
__ bc_far_optimized(Assembler::bcondCRbiIs1, __ bi0(CCR0, Assembler::equal), _continuation);
address stub = Runtime1::entry_for(Runtime1::g1_post_barrier_slow_id);
//__ load_const_optimized(R0, stub);
__ add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
__ mtctr(R0);
__ mr(R0, addr_reg); // Pass addr in R0.
__ bctrl();
__ b(_continuation);
}
#endif // INCLUDE_ALL_GCS
///////////////////////////////////////////////////////////////////////////////////
#undef __
View file
@ -0,0 +1,76 @@
/*
* Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef CPU_PPC_VM_C1_DEFS_PPC_HPP
#define CPU_PPC_VM_C1_DEFS_PPC_HPP
// Native word offsets from memory address.
enum {
#if defined(VM_LITTLE_ENDIAN)
pd_lo_word_offset_in_bytes = 0,
pd_hi_word_offset_in_bytes = BytesPerInt
#else
pd_lo_word_offset_in_bytes = BytesPerInt,
pd_hi_word_offset_in_bytes = 0
#endif
};
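A host-side illustration of why the lo/hi word offsets above flip with endianness: the low 32-bit word of a 64-bit value lives at byte offset 0 on little-endian and at offset BytesPerInt (4) on big-endian:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint64_t v = 0x1122334455667788ULL;       // low word is 0x55667788
  uint32_t words[2];
  std::memcpy(words, &v, sizeof(v));
  const int lo_word_offset = (words[0] == 0x55667788u) ? 0 : 4;
  const int hi_word_offset = 4 - lo_word_offset;
  assert(lo_word_offset != hi_word_offset); // exactly one word at each offset
  return 0;
}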
// Explicit rounding operations are not required to implement the strictFP mode.
enum {
pd_strict_fp_requires_explicit_rounding = false
};
// registers
enum {
pd_nof_cpu_regs_frame_map = 32, // Number of registers used during code emission.
pd_nof_caller_save_cpu_regs_frame_map = 27, // Number of cpu registers killed by calls. (At least R3_ARG1 ... R10_ARG8, but using all like C2.)
pd_nof_cpu_regs_reg_alloc = 27, // Number of registers that are visible to register allocator.
pd_nof_cpu_regs_linearscan = 32, // Number of registers visible linear scan.
pd_first_callee_saved_reg = pd_nof_caller_save_cpu_regs_frame_map,
pd_last_callee_saved_reg = pd_nof_cpu_regs_reg_alloc - 1,
pd_first_cpu_reg = 0,
pd_last_cpu_reg = pd_nof_cpu_regs_reg_alloc - 1,
pd_nof_fpu_regs_frame_map = 32, // Number of registers used during code emission.
pd_nof_caller_save_fpu_regs_frame_map = 32, // Number of fpu registers killed by calls.
pd_nof_fpu_regs_reg_alloc = 32, // Number of registers that are visible to register allocator.
pd_nof_fpu_regs_linearscan = 32, // Number of registers visible to linear scan.
pd_first_fpu_reg = pd_nof_cpu_regs_frame_map,
pd_last_fpu_reg = pd_nof_cpu_regs_frame_map + pd_nof_fpu_regs_reg_alloc - 1,
pd_nof_xmm_regs_linearscan = 0,
pd_nof_caller_save_xmm_regs = 0,
pd_first_xmm_reg = -1,
pd_last_xmm_reg = -1
};
// For debug info: a float value in a register is saved in single precision by runtime stubs.
enum {
pd_float_saved_as_double = true
};
#endif // CPU_PPC_VM_C1_DEFS_PPC_HPP
View file
@ -0,0 +1,32 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef CPU_PPC_VM_C1_FPUSTACKSIM_PPC_HPP
#define CPU_PPC_VM_C1_FPUSTACKSIM_PPC_HPP
// No FPU stack on PPC.
class FpuStackSim;
#endif // CPU_PPC_VM_C1_FPUSTACKSIM_PPC_HPP
View file
@ -0,0 +1,394 @@
/*
* Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "c1/c1_FrameMap.hpp"
#include "c1/c1_LIR.hpp"
#include "runtime/sharedRuntime.hpp"
#include "vmreg_ppc.inline.hpp"
const int FrameMap::pd_c_runtime_reserved_arg_size = 7;
LIR_Opr FrameMap::map_to_opr(BasicType type, VMRegPair* reg, bool outgoing) {
LIR_Opr opr = LIR_OprFact::illegalOpr;
VMReg r_1 = reg->first();
VMReg r_2 = reg->second();
if (r_1->is_stack()) {
// Convert stack slot to an SP offset.
// The calling convention does not count the SharedRuntime::out_preserve_stack_slots() value
// so we must add it in here.
int st_off = (r_1->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
opr = LIR_OprFact::address(new LIR_Address(SP_opr, st_off + STACK_BIAS, type));
} else if (r_1->is_Register()) {
Register reg = r_1->as_Register();
//if (outgoing) {
// assert(!reg->is_in(), "should be using I regs");
//} else {
// assert(!reg->is_out(), "should be using O regs");
//}
if (r_2->is_Register() && (type == T_LONG || type == T_DOUBLE)) {
opr = as_long_opr(reg);
} else if (type == T_OBJECT || type == T_ARRAY) {
opr = as_oop_opr(reg);
} else {
opr = as_opr(reg);
}
} else if (r_1->is_FloatRegister()) {
assert(type == T_DOUBLE || type == T_FLOAT, "wrong type");
FloatRegister f = r_1->as_FloatRegister();
if (type == T_DOUBLE) {
opr = as_double_opr(f);
} else {
opr = as_float_opr(f);
}
}
return opr;
}
// FrameMap
//--------------------------------------------------------
FloatRegister FrameMap::_fpu_regs [FrameMap::nof_fpu_regs];
LIR_Opr FrameMap::R0_opr;
LIR_Opr FrameMap::R1_opr;
LIR_Opr FrameMap::R2_opr;
LIR_Opr FrameMap::R3_opr;
LIR_Opr FrameMap::R4_opr;
LIR_Opr FrameMap::R5_opr;
LIR_Opr FrameMap::R6_opr;
LIR_Opr FrameMap::R7_opr;
LIR_Opr FrameMap::R8_opr;
LIR_Opr FrameMap::R9_opr;
LIR_Opr FrameMap::R10_opr;
LIR_Opr FrameMap::R11_opr;
LIR_Opr FrameMap::R12_opr;
LIR_Opr FrameMap::R13_opr;
LIR_Opr FrameMap::R14_opr;
LIR_Opr FrameMap::R15_opr;
LIR_Opr FrameMap::R16_opr;
LIR_Opr FrameMap::R17_opr;
LIR_Opr FrameMap::R18_opr;
LIR_Opr FrameMap::R19_opr;
LIR_Opr FrameMap::R20_opr;
LIR_Opr FrameMap::R21_opr;
LIR_Opr FrameMap::R22_opr;
LIR_Opr FrameMap::R23_opr;
LIR_Opr FrameMap::R24_opr;
LIR_Opr FrameMap::R25_opr;
LIR_Opr FrameMap::R26_opr;
LIR_Opr FrameMap::R27_opr;
LIR_Opr FrameMap::R28_opr;
LIR_Opr FrameMap::R29_opr;
LIR_Opr FrameMap::R30_opr;
LIR_Opr FrameMap::R31_opr;
LIR_Opr FrameMap::R0_oop_opr;
//LIR_Opr FrameMap::R1_oop_opr;
LIR_Opr FrameMap::R2_oop_opr;
LIR_Opr FrameMap::R3_oop_opr;
LIR_Opr FrameMap::R4_oop_opr;
LIR_Opr FrameMap::R5_oop_opr;
LIR_Opr FrameMap::R6_oop_opr;
LIR_Opr FrameMap::R7_oop_opr;
LIR_Opr FrameMap::R8_oop_opr;
LIR_Opr FrameMap::R9_oop_opr;
LIR_Opr FrameMap::R10_oop_opr;
LIR_Opr FrameMap::R11_oop_opr;
LIR_Opr FrameMap::R12_oop_opr;
//LIR_Opr FrameMap::R13_oop_opr;
LIR_Opr FrameMap::R14_oop_opr;
LIR_Opr FrameMap::R15_oop_opr;
//LIR_Opr FrameMap::R16_oop_opr;
LIR_Opr FrameMap::R17_oop_opr;
LIR_Opr FrameMap::R18_oop_opr;
LIR_Opr FrameMap::R19_oop_opr;
LIR_Opr FrameMap::R20_oop_opr;
LIR_Opr FrameMap::R21_oop_opr;
LIR_Opr FrameMap::R22_oop_opr;
LIR_Opr FrameMap::R23_oop_opr;
LIR_Opr FrameMap::R24_oop_opr;
LIR_Opr FrameMap::R25_oop_opr;
LIR_Opr FrameMap::R26_oop_opr;
LIR_Opr FrameMap::R27_oop_opr;
LIR_Opr FrameMap::R28_oop_opr;
//LIR_Opr FrameMap::R29_oop_opr;
LIR_Opr FrameMap::R30_oop_opr;
LIR_Opr FrameMap::R31_oop_opr;
LIR_Opr FrameMap::R0_metadata_opr;
//LIR_Opr FrameMap::R1_metadata_opr;
LIR_Opr FrameMap::R2_metadata_opr;
LIR_Opr FrameMap::R3_metadata_opr;
LIR_Opr FrameMap::R4_metadata_opr;
LIR_Opr FrameMap::R5_metadata_opr;
LIR_Opr FrameMap::R6_metadata_opr;
LIR_Opr FrameMap::R7_metadata_opr;
LIR_Opr FrameMap::R8_metadata_opr;
LIR_Opr FrameMap::R9_metadata_opr;
LIR_Opr FrameMap::R10_metadata_opr;
LIR_Opr FrameMap::R11_metadata_opr;
LIR_Opr FrameMap::R12_metadata_opr;
//LIR_Opr FrameMap::R13_metadata_opr;
LIR_Opr FrameMap::R14_metadata_opr;
LIR_Opr FrameMap::R15_metadata_opr;
//LIR_Opr FrameMap::R16_metadata_opr;
LIR_Opr FrameMap::R17_metadata_opr;
LIR_Opr FrameMap::R18_metadata_opr;
LIR_Opr FrameMap::R19_metadata_opr;
LIR_Opr FrameMap::R20_metadata_opr;
LIR_Opr FrameMap::R21_metadata_opr;
LIR_Opr FrameMap::R22_metadata_opr;
LIR_Opr FrameMap::R23_metadata_opr;
LIR_Opr FrameMap::R24_metadata_opr;
LIR_Opr FrameMap::R25_metadata_opr;
LIR_Opr FrameMap::R26_metadata_opr;
LIR_Opr FrameMap::R27_metadata_opr;
LIR_Opr FrameMap::R28_metadata_opr;
//LIR_Opr FrameMap::R29_metadata_opr;
LIR_Opr FrameMap::R30_metadata_opr;
LIR_Opr FrameMap::R31_metadata_opr;
LIR_Opr FrameMap::SP_opr;
LIR_Opr FrameMap::R0_long_opr;
LIR_Opr FrameMap::R3_long_opr;
LIR_Opr FrameMap::F1_opr;
LIR_Opr FrameMap::F1_double_opr;
LIR_Opr FrameMap::_caller_save_cpu_regs[] = { 0, };
LIR_Opr FrameMap::_caller_save_fpu_regs[] = { 0, };
FloatRegister FrameMap::nr2floatreg (int rnr) {
assert(_init_done, "tables not initialized");
debug_only(fpu_range_check(rnr);)
return _fpu_regs[rnr];
}
// Returns true if reg could be smashed by a callee.
bool FrameMap::is_caller_save_register (LIR_Opr reg) {
if (reg->is_single_fpu() || reg->is_double_fpu()) { return true; }
if (reg->is_double_cpu()) {
return is_caller_save_register(reg->as_register_lo()) ||
is_caller_save_register(reg->as_register_hi());
}
return is_caller_save_register(reg->as_register());
}
bool FrameMap::is_caller_save_register (Register r) {
// not visible to allocator: R0: scratch, R1: SP
// r->encoding() < 2 + nof_caller_save_cpu_regs();
return true; // Currently all regs are caller save.
}
void FrameMap::initialize() {
assert(!_init_done, "once");
int i = 0;
// Put generally available registers at the beginning (allocated, saved for GC).
for (int j = 0; j < nof_cpu_regs; ++j) {
Register rj = as_Register(j);
if (reg_needs_save(rj)) {
map_register(i++, rj);
}
}
assert(i == nof_cpu_regs_reg_alloc, "number of allocated registers");
// The following registers are not normally available.
for (int j = 0; j < nof_cpu_regs; ++j) {
Register rj = as_Register(j);
if (!reg_needs_save(rj)) {
map_register(i++, rj);
}
}
assert(i == nof_cpu_regs, "number of CPU registers");
for (i = 0; i < nof_fpu_regs; i++) {
_fpu_regs[i] = as_FloatRegister(i);
}
_init_done = true;
R0_opr = as_opr(R0);
R1_opr = as_opr(R1);
R2_opr = as_opr(R2);
R3_opr = as_opr(R3);
R4_opr = as_opr(R4);
R5_opr = as_opr(R5);
R6_opr = as_opr(R6);
R7_opr = as_opr(R7);
R8_opr = as_opr(R8);
R9_opr = as_opr(R9);
R10_opr = as_opr(R10);
R11_opr = as_opr(R11);
R12_opr = as_opr(R12);
R13_opr = as_opr(R13);
R14_opr = as_opr(R14);
R15_opr = as_opr(R15);
R16_opr = as_opr(R16);
R17_opr = as_opr(R17);
R18_opr = as_opr(R18);
R19_opr = as_opr(R19);
R20_opr = as_opr(R20);
R21_opr = as_opr(R21);
R22_opr = as_opr(R22);
R23_opr = as_opr(R23);
R24_opr = as_opr(R24);
R25_opr = as_opr(R25);
R26_opr = as_opr(R26);
R27_opr = as_opr(R27);
R28_opr = as_opr(R28);
R29_opr = as_opr(R29);
R30_opr = as_opr(R30);
R31_opr = as_opr(R31);
R0_oop_opr = as_oop_opr(R0);
//R1_oop_opr = as_oop_opr(R1);
R2_oop_opr = as_oop_opr(R2);
R3_oop_opr = as_oop_opr(R3);
R4_oop_opr = as_oop_opr(R4);
R5_oop_opr = as_oop_opr(R5);
R6_oop_opr = as_oop_opr(R6);
R7_oop_opr = as_oop_opr(R7);
R8_oop_opr = as_oop_opr(R8);
R9_oop_opr = as_oop_opr(R9);
R10_oop_opr = as_oop_opr(R10);
R11_oop_opr = as_oop_opr(R11);
R12_oop_opr = as_oop_opr(R12);
//R13_oop_opr = as_oop_opr(R13);
R14_oop_opr = as_oop_opr(R14);
R15_oop_opr = as_oop_opr(R15);
//R16_oop_opr = as_oop_opr(R16);
R17_oop_opr = as_oop_opr(R17);
R18_oop_opr = as_oop_opr(R18);
R19_oop_opr = as_oop_opr(R19);
R20_oop_opr = as_oop_opr(R20);
R21_oop_opr = as_oop_opr(R21);
R22_oop_opr = as_oop_opr(R22);
R23_oop_opr = as_oop_opr(R23);
R24_oop_opr = as_oop_opr(R24);
R25_oop_opr = as_oop_opr(R25);
R26_oop_opr = as_oop_opr(R26);
R27_oop_opr = as_oop_opr(R27);
R28_oop_opr = as_oop_opr(R28);
//R29_oop_opr = as_oop_opr(R29);
R30_oop_opr = as_oop_opr(R30);
R31_oop_opr = as_oop_opr(R31);
R0_metadata_opr = as_metadata_opr(R0);
//R1_metadata_opr = as_metadata_opr(R1);
R2_metadata_opr = as_metadata_opr(R2);
R3_metadata_opr = as_metadata_opr(R3);
R4_metadata_opr = as_metadata_opr(R4);
R5_metadata_opr = as_metadata_opr(R5);
R6_metadata_opr = as_metadata_opr(R6);
R7_metadata_opr = as_metadata_opr(R7);
R8_metadata_opr = as_metadata_opr(R8);
R9_metadata_opr = as_metadata_opr(R9);
R10_metadata_opr = as_metadata_opr(R10);
R11_metadata_opr = as_metadata_opr(R11);
R12_metadata_opr = as_metadata_opr(R12);
//R13_metadata_opr = as_metadata_opr(R13);
R14_metadata_opr = as_metadata_opr(R14);
R15_metadata_opr = as_metadata_opr(R15);
//R16_metadata_opr = as_metadata_opr(R16);
R17_metadata_opr = as_metadata_opr(R17);
R18_metadata_opr = as_metadata_opr(R18);
R19_metadata_opr = as_metadata_opr(R19);
R20_metadata_opr = as_metadata_opr(R20);
R21_metadata_opr = as_metadata_opr(R21);
R22_metadata_opr = as_metadata_opr(R22);
R23_metadata_opr = as_metadata_opr(R23);
R24_metadata_opr = as_metadata_opr(R24);
R25_metadata_opr = as_metadata_opr(R25);
R26_metadata_opr = as_metadata_opr(R26);
R27_metadata_opr = as_metadata_opr(R27);
R28_metadata_opr = as_metadata_opr(R28);
//R29_metadata_opr = as_metadata_opr(R29);
R30_metadata_opr = as_metadata_opr(R30);
R31_metadata_opr = as_metadata_opr(R31);
SP_opr = as_pointer_opr(R1_SP);
R0_long_opr = LIR_OprFact::double_cpu(cpu_reg2rnr(R0), cpu_reg2rnr(R0));
R3_long_opr = LIR_OprFact::double_cpu(cpu_reg2rnr(R3), cpu_reg2rnr(R3));
F1_opr = as_float_opr(F1);
F1_double_opr = as_double_opr(F1);
// All the allocated cpu regs are caller saved.
for (int i = 0; i < max_nof_caller_save_cpu_regs; i++) {
_caller_save_cpu_regs[i] = LIR_OprFact::single_cpu(i);
}
// All the fpu regs are caller saved.
for (int i = 0; i < nof_caller_save_fpu_regs; i++) {
_caller_save_fpu_regs[i] = LIR_OprFact::single_fpu(i);
}
}
Address FrameMap::make_new_address(ByteSize sp_offset) const {
return Address(R1_SP, STACK_BIAS + in_bytes(sp_offset));
}
VMReg FrameMap::fpu_regname (int n) {
return as_FloatRegister(n)->as_VMReg();
}
LIR_Opr FrameMap::stack_pointer() {
return SP_opr;
}
// JSR 292
// On PPC64, there is no need to save the SP, because neither
// method handle intrinsics, nor compiled lambda forms modify it.
LIR_Opr FrameMap::method_handle_invoke_SP_save_opr() {
return LIR_OprFact::illegalOpr;
}
bool FrameMap::validate_frame() {
int max_offset = in_bytes(framesize_in_bytes());
int java_index = 0;
for (int i = 0; i < _incoming_arguments->length(); i++) {
LIR_Opr opr = _incoming_arguments->at(i);
if (opr->is_stack()) {
max_offset = MAX2(_argument_locations->at(java_index), max_offset);
}
java_index += type2size[opr->type()];
}
return Assembler::is_simm16(max_offset + STACK_BIAS);
}
View file
@ -0,0 +1,202 @@
/*
* Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef CPU_PPC_VM_C1_FRAMEMAP_PPC_HPP
#define CPU_PPC_VM_C1_FRAMEMAP_PPC_HPP
public:
enum {
nof_reg_args = 8, // Registers R3-R10 are available for parameter passing.
first_available_sp_in_frame = frame::jit_out_preserve_size,
frame_pad_in_bytes = 0
};
static const int pd_c_runtime_reserved_arg_size;
static LIR_Opr R0_opr;
static LIR_Opr R1_opr;
static LIR_Opr R2_opr;
static LIR_Opr R3_opr;
static LIR_Opr R4_opr;
static LIR_Opr R5_opr;
static LIR_Opr R6_opr;
static LIR_Opr R7_opr;
static LIR_Opr R8_opr;
static LIR_Opr R9_opr;
static LIR_Opr R10_opr;
static LIR_Opr R11_opr;
static LIR_Opr R12_opr;
static LIR_Opr R13_opr;
static LIR_Opr R14_opr;
static LIR_Opr R15_opr;
static LIR_Opr R16_opr;
static LIR_Opr R17_opr;
static LIR_Opr R18_opr;
static LIR_Opr R19_opr;
static LIR_Opr R20_opr;
static LIR_Opr R21_opr;
static LIR_Opr R22_opr;
static LIR_Opr R23_opr;
static LIR_Opr R24_opr;
static LIR_Opr R25_opr;
static LIR_Opr R26_opr;
static LIR_Opr R27_opr;
static LIR_Opr R28_opr;
static LIR_Opr R29_opr;
static LIR_Opr R30_opr;
static LIR_Opr R31_opr;
static LIR_Opr R0_oop_opr;
//R1: Stack pointer. Not an oop.
static LIR_Opr R2_oop_opr;
static LIR_Opr R3_oop_opr;
static LIR_Opr R4_oop_opr;
static LIR_Opr R5_oop_opr;
static LIR_Opr R6_oop_opr;
static LIR_Opr R7_oop_opr;
static LIR_Opr R8_oop_opr;
static LIR_Opr R9_oop_opr;
static LIR_Opr R10_oop_opr;
static LIR_Opr R11_oop_opr;
static LIR_Opr R12_oop_opr;
//R13: System thread register. Not usable.
static LIR_Opr R14_oop_opr;
static LIR_Opr R15_oop_opr;
//R16: Java thread register. Not an oop.
static LIR_Opr R17_oop_opr;
static LIR_Opr R18_oop_opr;
static LIR_Opr R19_oop_opr;
static LIR_Opr R20_oop_opr;
static LIR_Opr R21_oop_opr;
static LIR_Opr R22_oop_opr;
static LIR_Opr R23_oop_opr;
static LIR_Opr R24_oop_opr;
static LIR_Opr R25_oop_opr;
static LIR_Opr R26_oop_opr;
static LIR_Opr R27_oop_opr;
static LIR_Opr R28_oop_opr;
static LIR_Opr R29_oop_opr;
//R29: TOC register. Not an oop.
static LIR_Opr R30_oop_opr;
static LIR_Opr R31_oop_opr;
static LIR_Opr R0_metadata_opr;
//R1: Stack pointer. Not metadata.
static LIR_Opr R2_metadata_opr;
static LIR_Opr R3_metadata_opr;
static LIR_Opr R4_metadata_opr;
static LIR_Opr R5_metadata_opr;
static LIR_Opr R6_metadata_opr;
static LIR_Opr R7_metadata_opr;
static LIR_Opr R8_metadata_opr;
static LIR_Opr R9_metadata_opr;
static LIR_Opr R10_metadata_opr;
static LIR_Opr R11_metadata_opr;
static LIR_Opr R12_metadata_opr;
//R13: System thread register. Not usable.
static LIR_Opr R14_metadata_opr;
static LIR_Opr R15_metadata_opr;
//R16: Java thread register. Not metadata.
static LIR_Opr R17_metadata_opr;
static LIR_Opr R18_metadata_opr;
static LIR_Opr R19_metadata_opr;
static LIR_Opr R20_metadata_opr;
static LIR_Opr R21_metadata_opr;
static LIR_Opr R22_metadata_opr;
static LIR_Opr R23_metadata_opr;
static LIR_Opr R24_metadata_opr;
static LIR_Opr R25_metadata_opr;
static LIR_Opr R26_metadata_opr;
static LIR_Opr R27_metadata_opr;
static LIR_Opr R28_metadata_opr;
//R29: TOC register. Not metadata.
static LIR_Opr R30_metadata_opr;
static LIR_Opr R31_metadata_opr;
static LIR_Opr SP_opr;
static LIR_Opr R0_long_opr;
static LIR_Opr R3_long_opr;
static LIR_Opr F1_opr;
static LIR_Opr F1_double_opr;
private:
static FloatRegister _fpu_regs [nof_fpu_regs];
static LIR_Opr as_long_single_opr(Register r) {
return LIR_OprFact::double_cpu(cpu_reg2rnr(r), cpu_reg2rnr(r));
}
static LIR_Opr as_long_pair_opr(Register r) {
return LIR_OprFact::double_cpu(cpu_reg2rnr(r->successor()), cpu_reg2rnr(r));
}
public:
#ifdef _LP64
static LIR_Opr as_long_opr(Register r) {
return as_long_single_opr(r);
}
static LIR_Opr as_pointer_opr(Register r) {
return as_long_single_opr(r);
}
#else
static LIR_Opr as_long_opr(Register r) {
Unimplemented(); return 0;
// return as_long_pair_opr(r);
}
static LIR_Opr as_pointer_opr(Register r) {
Unimplemented(); return 0;
// return as_opr(r);
}
#endif
static LIR_Opr as_float_opr(FloatRegister r) {
return LIR_OprFact::single_fpu(r->encoding());
}
static LIR_Opr as_double_opr(FloatRegister r) {
return LIR_OprFact::double_fpu(r->encoding());
}
static FloatRegister nr2floatreg (int rnr);
static VMReg fpu_regname (int n);
static bool is_caller_save_register(LIR_Opr reg);
static bool is_caller_save_register(Register r);
static int nof_caller_save_cpu_regs() { return pd_nof_caller_save_cpu_regs_frame_map; }
static int last_cpu_reg() { return pd_last_cpu_reg; }
// Registers which need to be saved in the frames (e.g. for GC).
// Register usage:
// R0: scratch
// R1: sp
// R13: system thread id
// R16: java thread
// R29: global TOC
static bool reg_needs_save(Register r) { return r != R0 && r != R1 && r != R13 && r != R16 && r != R29; }
#endif // CPU_PPC_VM_C1_FRAMEMAP_PPC_HPP
File diff suppressed because it is too large
View file
@ -0,0 +1,69 @@
/*
* Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef CPU_PPC_VM_C1_LIRASSEMBLER_PPC_HPP
#define CPU_PPC_VM_C1_LIRASSEMBLER_PPC_HPP
private:
//////////////////////////////////////////////////////////////////////////////
// PPC64 load/store emission
//
// The PPC ld/st instructions cannot accommodate displacements > 16 bits long.
// The following "pseudo" instructions (load/store) make it easier to
// use the indexed addressing mode by allowing 32 bit displacements:
//
void explicit_null_check(Register addr, CodeEmitInfo* info);
int store(LIR_Opr from_reg, Register base, int offset, BasicType type, bool wide, bool unaligned);
int store(LIR_Opr from_reg, Register base, Register disp, BasicType type, bool wide);
int load(Register base, int offset, LIR_Opr to_reg, BasicType type, bool wide, bool unaligned);
int load(Register base, Register disp, LIR_Opr to_reg, BasicType type, bool wide);
int shift_amount(BasicType t);
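One common way such >16-bit displacements are handled on PPC is to fold the sign-compensated high half into the base first and keep only a simm16 low half for the d-form access. The arithmetic of that split, as a standalone sketch under that assumption (it is not the body of the pseudo-instructions declared above):

#include <cassert>
#include <cstdint>

int main() {
  int64_t base = 0x10000000;
  int32_t disp = 0x12345;                         // does not fit in simm16
  int16_t lo   = (int16_t)(disp & 0xFFFF);        // sign-extended low half
  int64_t hi   = ((int64_t)disp - lo) >> 16;      // compensates for lo's sign
  // addis-style adjustment of the base, then a 16-bit displacement access.
  assert(base + (hi << 16) + lo == base + disp);
  return 0;
}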
// Record the type of the receiver in ReceiverTypeData.
void type_profile_helper(Register mdo, int mdo_offset_bias,
ciMethodData *md, ciProfileData *data,
Register recv, Register tmp1, Label* update_done);
// Setup pointers to MDO, MDO slot, also compute offset bias to access the slot.
void setup_md_access(ciMethod* method, int bci,
ciMethodData*& md, ciProfileData*& data, int& mdo_offset_bias);
public:
static const ConditionRegister BOOL_RESULT;
// Emit trampoline stub for call. Call bailout() if failed. Return true on success.
bool emit_trampoline_stub_for_call(address target, Register Rtoc = noreg);
enum {
max_static_call_stub_size = 4 * BytesPerInstWord + MacroAssembler::b64_patchable_size,
call_stub_size = max_static_call_stub_size + MacroAssembler::trampoline_stub_size, // or smaller
exception_handler_size = MacroAssembler::b64_patchable_size, // or smaller
deopt_handler_size = MacroAssembler::bl64_patchable_size
};
#endif // CPU_PPC_VM_C1_LIRASSEMBLER_PPC_HPP
File diff suppressed because it is too large
View file
@ -0,0 +1,34 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "c1/c1_Instruction.hpp"
#include "c1/c1_LinearScan.hpp"
#include "utilities/bitMap.inline.hpp"
void LinearScan::allocate_fpu_stack() {
Unimplemented();
// No FPU stack on PPC
}

View file

@ -0,0 +1,73 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef CPU_PPC_VM_C1_LINEARSCAN_PPC_HPP
#define CPU_PPC_VM_C1_LINEARSCAN_PPC_HPP
inline bool LinearScan::is_processed_reg_num(int reg_num) {
assert(FrameMap::R0_opr->cpu_regnr() == FrameMap::last_cpu_reg() + 1, "wrong assumption below");
assert(FrameMap::R1_opr->cpu_regnr() == FrameMap::last_cpu_reg() + 2, "wrong assumption below");
assert(FrameMap::R13_opr->cpu_regnr() == FrameMap::last_cpu_reg() + 3, "wrong assumption below");
assert(FrameMap::R16_opr->cpu_regnr() == FrameMap::last_cpu_reg() + 4, "wrong assumption below");
assert(FrameMap::R29_opr->cpu_regnr() == FrameMap::last_cpu_reg() + 5, "wrong assumption below");
return reg_num <= FrameMap::last_cpu_reg() || reg_num >= pd_nof_cpu_regs_frame_map;
}
inline int LinearScan::num_physical_regs(BasicType type) {
return 1;
}
inline bool LinearScan::requires_adjacent_regs(BasicType type) {
return false;
}
inline bool LinearScan::is_caller_save(int assigned_reg) {
return true; // assigned_reg < pd_first_callee_saved_reg;
}
inline void LinearScan::pd_add_temps(LIR_Op* op) {
// No special case behaviours yet
}
inline bool LinearScanWalker::pd_init_regs_for_alloc(Interval* cur) {
if (allocator()->gen()->is_vreg_flag_set(cur->reg_num(), LIRGenerator::callee_saved)) {
assert(cur->type() != T_FLOAT && cur->type() != T_DOUBLE, "cpu regs only");
_first_reg = pd_first_callee_saved_reg;
_last_reg = pd_last_callee_saved_reg;
ShouldNotReachHere(); // Currently no callee saved regs.
return true;
} else if (cur->type() == T_INT || cur->type() == T_LONG || cur->type() == T_OBJECT ||
cur->type() == T_ADDRESS || cur->type() == T_METADATA) {
_first_reg = pd_first_cpu_reg;
_last_reg = pd_last_cpu_reg;
return true;
}
return false;
}
#endif // CPU_PPC_VM_C1_LINEARSCAN_PPC_HPP
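
Taken together with reg_needs_save() at the top of this section, the asserts above pin down the convention: R0, R1, R13, R16 and R29 get cpu_regnr values just past last_cpu_reg(), so is_processed_reg_num() excludes exactly those five registers from linear scan allocation (presumably the scratch register, the stack pointer, the system-reserved R13, the thread register and the TOC register), while everything at or below last_cpu_reg(), plus the non-CPU range, is processed normally.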

View file

@ -0,0 +1,486 @@
/*
* Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "c1/c1_MacroAssembler.hpp"
#include "c1/c1_Runtime1.hpp"
#include "classfile/systemDictionary.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "interpreter/interpreter.hpp"
#include "oops/arrayOop.hpp"
#include "oops/markOop.hpp"
#include "runtime/basicLock.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/os.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/sharedRuntime.hpp"
void C1_MacroAssembler::inline_cache_check(Register receiver, Register iCache) {
const Register temp_reg = R12_scratch2;
verify_oop(receiver);
load_klass(temp_reg, receiver);
if (TrapBasedICMissChecks) {
trap_ic_miss_check(temp_reg, iCache);
} else {
Label L;
cmpd(CCR0, temp_reg, iCache);
beq(CCR0, L);
//load_const_optimized(temp_reg, SharedRuntime::get_ic_miss_stub(), R0);
calculate_address_from_global_toc(temp_reg, SharedRuntime::get_ic_miss_stub(), true, true, false);
mtctr(temp_reg);
bctr();
align(32, 12);
bind(L);
}
}
void C1_MacroAssembler::explicit_null_check(Register base) {
Unimplemented();
}
void C1_MacroAssembler::build_frame(int frame_size_in_bytes, int bang_size_in_bytes) {
assert(bang_size_in_bytes >= frame_size_in_bytes, "stack bang size incorrect");
// Make sure there is enough stack space for this method's activation.
generate_stack_overflow_check(bang_size_in_bytes);
// Create the frame.
const Register return_pc = R0;
mflr(return_pc);
// Get caller's sp.
std(return_pc, _abi(lr), R1_SP); // SP->lr = return_pc
push_frame(frame_size_in_bytes, R0); // SP -= frame_size_in_bytes
}
void C1_MacroAssembler::unverified_entry(Register receiver, Register ic_klass) {
Unimplemented(); // Currently unused.
//if (C1Breakpoint) illtrap();
//inline_cache_check(receiver, ic_klass);
}
void C1_MacroAssembler::verified_entry() {
if (C1Breakpoint) illtrap();
// build frame
}
void C1_MacroAssembler::lock_object(Register Rmark, Register Roop, Register Rbox, Register Rscratch, Label& slow_case) {
assert_different_registers(Rmark, Roop, Rbox, Rscratch);
Label done, cas_failed, slow_int;
// The following move must be the first instruction emitted since debug
// information may be generated for it.
// Load object header.
ld(Rmark, oopDesc::mark_offset_in_bytes(), Roop);
verify_oop(Roop);
// Save object being locked into the BasicObjectLock...
std(Roop, BasicObjectLock::obj_offset_in_bytes(), Rbox);
if (UseBiasedLocking) {
biased_locking_enter(CCR0, Roop, Rmark, Rscratch, R0, done, &slow_int);
}
// ... and mark it unlocked.
ori(Rmark, Rmark, markOopDesc::unlocked_value);
// Save unlocked object header into the displaced header location on the stack.
std(Rmark, BasicLock::displaced_header_offset_in_bytes(), Rbox);
// Compare object markOop with Rmark and if equal exchange Rscratch with object markOop.
assert(oopDesc::mark_offset_in_bytes() == 0, "cas must take a zero displacement");
cmpxchgd(/*flag=*/CCR0,
/*current_value=*/Rscratch,
/*compare_value=*/Rmark,
/*exchange_value=*/Rbox,
/*where=*/Roop/*+0==mark_offset_in_bytes*/,
MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
MacroAssembler::cmpxchgx_hint_acquire_lock(),
noreg,
&cas_failed,
/*check without membar and ldarx first*/true);
// If compare/exchange succeeded we found an unlocked object and we now have locked it
// hence we are done.
b(done);
bind(slow_int);
b(slow_case); // far
bind(cas_failed);
// We did not find an unlocked object so see if this is a recursive case.
sub(Rscratch, Rscratch, R1_SP);
load_const_optimized(R0, (~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place));
and_(R0/*==0?*/, Rscratch, R0);
std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), Rbox);
bne(CCR0, slow_int);
bind(done);
}
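
The commented assembly above reads more easily next to a plain C++ model of the same fast path. This is a hedged, self-contained sketch, not HotSpot code: biased locking, the memory-ordering flags and the real markOop encoding are left out, and the constants are taken as parameters.

  #include <atomic>
  #include <cstdint>

  struct StackLockBox { std::uintptr_t displaced_header; };

  // Returns true if the lock is held after the fast path (newly taken or recursive),
  // false if the caller has to fall through to the slow path.
  bool fast_lock_model(std::atomic<std::uintptr_t>& mark_word, StackLockBox* box,
                       std::uintptr_t sp, std::uintptr_t page_size,
                       std::uintptr_t unlocked_value, std::uintptr_t lock_mask) {
    std::uintptr_t expected = mark_word.load() | unlocked_value;  // assume it is unlocked
    box->displaced_header = expected;                             // save the displaced header
    if (mark_word.compare_exchange_strong(expected, reinterpret_cast<std::uintptr_t>(box))) {
      return true;                                                // stack lock installed
    }
    // CAS failed: still ours if the observed mark points into our own stack region.
    std::uintptr_t masked = (expected - sp) & (~(page_size - 1) | lock_mask);
    box->displaced_header = masked;                               // 0 marks a recursive lock
    return masked == 0;
  }

unlock_object() below undoes this in reverse: a zero displaced header means a recursive exit and nothing to do, otherwise the saved header is CAS-ed back into the object's mark word.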
void C1_MacroAssembler::unlock_object(Register Rmark, Register Roop, Register Rbox, Label& slow_case) {
assert_different_registers(Rmark, Roop, Rbox);
Label slow_int, done;
Address mark_addr(Roop, oopDesc::mark_offset_in_bytes());
assert(mark_addr.disp() == 0, "cas must take a zero displacement");
if (UseBiasedLocking) {
// Load the object out of the BasicObjectLock.
ld(Roop, BasicObjectLock::obj_offset_in_bytes(), Rbox);
verify_oop(Roop);
biased_locking_exit(CCR0, Roop, R0, done);
}
// Test first if it is a fast recursive unlock.
ld(Rmark, BasicLock::displaced_header_offset_in_bytes(), Rbox);
cmpdi(CCR0, Rmark, 0);
beq(CCR0, done);
if (!UseBiasedLocking) {
// Load object.
ld(Roop, BasicObjectLock::obj_offset_in_bytes(), Rbox);
verify_oop(Roop);
}
// Check if it is still a lightweight lock; this is true if we see
// the stack address of the basicLock in the markOop of the object.
cmpxchgd(/*flag=*/CCR0,
/*current_value=*/R0,
/*compare_value=*/Rbox,
/*exchange_value=*/Rmark,
/*where=*/Roop,
MacroAssembler::MemBarRel,
MacroAssembler::cmpxchgx_hint_release_lock(),
noreg,
&slow_int);
b(done);
bind(slow_int);
b(slow_case); // far
// Done
bind(done);
}
void C1_MacroAssembler::try_allocate(
Register obj, // result: pointer to object after successful allocation
Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
int con_size_in_bytes, // object size in bytes if known at compile time
Register t1, // temp register, must be global register for incr_allocated_bytes
Register t2, // temp register
Label& slow_case // continuation point if fast allocation fails
) {
if (UseTLAB) {
tlab_allocate(obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
} else {
eden_allocate(obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
RegisterOrConstant size_in_bytes = var_size_in_bytes->is_valid()
? RegisterOrConstant(var_size_in_bytes)
: RegisterOrConstant(con_size_in_bytes);
incr_allocated_bytes(size_in_bytes, t1, t2);
}
}
void C1_MacroAssembler::initialize_header(Register obj, Register klass, Register len, Register t1, Register t2) {
assert_different_registers(obj, klass, len, t1, t2);
if (UseBiasedLocking && !len->is_valid()) {
ld(t1, in_bytes(Klass::prototype_header_offset()), klass);
} else {
load_const_optimized(t1, (intx)markOopDesc::prototype());
}
std(t1, oopDesc::mark_offset_in_bytes(), obj);
store_klass(obj, klass);
if (len->is_valid()) {
stw(len, arrayOopDesc::length_offset_in_bytes(), obj);
} else if (UseCompressedClassPointers) {
// Otherwise length is in the class gap.
store_klass_gap(obj);
}
}
void C1_MacroAssembler::initialize_body(Register base, Register index) {
assert_different_registers(base, index);
srdi(index, index, LogBytesPerWord);
clear_memory_doubleword(base, index);
}
void C1_MacroAssembler::initialize_body(Register obj, Register tmp1, Register tmp2,
int obj_size_in_bytes, int hdr_size_in_bytes) {
const int index = (obj_size_in_bytes - hdr_size_in_bytes) / HeapWordSize;
const int cl_size = VM_Version::L1_data_cache_line_size(),
cl_dwords = cl_size>>3,
cl_dw_addr_bits = exact_log2(cl_dwords);
const Register tmp = R0,
base_ptr = tmp1,
cnt_dwords = tmp2;
if (index <= 6) {
// Use explicit NULL stores.
if (index > 0) { li(tmp, 0); }
for (int i = 0; i < index; ++i) { std(tmp, hdr_size_in_bytes + i * HeapWordSize, obj); }
} else if (index < (2<<cl_dw_addr_bits)-1) {
// simple loop
Label loop;
li(cnt_dwords, index);
addi(base_ptr, obj, hdr_size_in_bytes); // Compute address of first element.
li(tmp, 0);
mtctr(cnt_dwords); // Load counter.
bind(loop);
std(tmp, 0, base_ptr); // Clear 8byte aligned block.
addi(base_ptr, base_ptr, 8);
bdnz(loop);
} else {
// like clear_memory_doubleword
Label startloop, fast, fastloop, restloop, done;
addi(base_ptr, obj, hdr_size_in_bytes); // Compute address of first element.
load_const_optimized(cnt_dwords, index);
rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
beq(CCR0, fast); // Already 128byte aligned.
subfic(tmp, tmp, cl_dwords);
mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
subf(cnt_dwords, tmp, cnt_dwords); // rest.
li(tmp, 0);
bind(startloop); // Clear at the beginning to reach 128byte boundary.
std(tmp, 0, base_ptr); // Clear 8byte aligned block.
addi(base_ptr, base_ptr, 8);
bdnz(startloop);
bind(fast); // Clear 128byte blocks.
srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0).
andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
mtctr(tmp); // Load counter.
bind(fastloop);
dcbz(base_ptr); // Clear 128byte aligned block.
addi(base_ptr, base_ptr, cl_size);
bdnz(fastloop);
cmpdi(CCR0, cnt_dwords, 0); // size 0?
beq(CCR0, done); // rest == 0
li(tmp, 0);
mtctr(cnt_dwords); // Load counter.
bind(restloop); // Clear rest.
std(tmp, 0, base_ptr); // Clear 8byte aligned block.
addi(base_ptr, base_ptr, 8);
bdnz(restloop);
bind(done);
}
}
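
As a worked example of the three strategies above (assuming the 128-byte L1 line the comments refer to): cl_dwords = 128 >> 3 = 16 and cl_dw_addr_bits = log2(16) = 4, so a body of up to 6 doublewords is cleared with individual std instructions, a body of fewer than (2 << 4) - 1 = 31 doublewords uses the simple counted store loop, and anything larger first stores doublewords up to the next 128-byte boundary, clears whole cache lines with dcbz, and finishes the remainder with ordinary stores.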
void C1_MacroAssembler::allocate_object(
Register obj, // result: pointer to object after successful allocation
Register t1, // temp register
Register t2, // temp register
Register t3, // temp register
int hdr_size, // object header size in words
int obj_size, // object size in words
Register klass, // object klass
Label& slow_case // continuation point if fast allocation fails
) {
assert_different_registers(obj, t1, t2, t3, klass);
// allocate space & initialize header
if (!is_simm16(obj_size * wordSize)) {
// Would need to use an extra register to load the
// object size => go to the slow case for now.
b(slow_case);
return;
}
try_allocate(obj, noreg, obj_size * wordSize, t2, t3, slow_case);
initialize_object(obj, klass, noreg, obj_size * HeapWordSize, t1, t2);
}
void C1_MacroAssembler::initialize_object(
Register obj, // result: pointer to object after successful allocation
Register klass, // object klass
Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
int con_size_in_bytes, // object size in bytes if known at compile time
Register t1, // temp register
Register t2 // temp register
) {
const int hdr_size_in_bytes = instanceOopDesc::header_size() * HeapWordSize;
initialize_header(obj, klass, noreg, t1, t2);
#ifdef ASSERT
{
lwz(t1, in_bytes(Klass::layout_helper_offset()), klass);
if (var_size_in_bytes != noreg) {
cmpw(CCR0, t1, var_size_in_bytes);
} else {
cmpwi(CCR0, t1, con_size_in_bytes);
}
asm_assert_eq("bad size in initialize_object", 0x753);
}
#endif
// Initialize body.
if (var_size_in_bytes != noreg) {
// Use a loop.
addi(t1, obj, hdr_size_in_bytes); // Compute address of first element.
addi(t2, var_size_in_bytes, -hdr_size_in_bytes); // Compute size of body.
initialize_body(t1, t2);
} else if (con_size_in_bytes > hdr_size_in_bytes) {
// Use a loop.
initialize_body(obj, t1, t2, con_size_in_bytes, hdr_size_in_bytes);
}
if (CURRENT_ENV->dtrace_alloc_probes()) {
Unimplemented();
// assert(obj == O0, "must be");
// call(CAST_FROM_FN_PTR(address, Runtime1::entry_for(Runtime1::dtrace_object_alloc_id)),
// relocInfo::runtime_call_type);
}
verify_oop(obj);
}
void C1_MacroAssembler::allocate_array(
Register obj, // result: pointer to array after successful allocation
Register len, // array length
Register t1, // temp register
Register t2, // temp register
Register t3, // temp register
int hdr_size, // object header size in words
int elt_size, // element size in bytes
Register klass, // object klass
Label& slow_case // continuation point if fast allocation fails
) {
assert_different_registers(obj, len, t1, t2, t3, klass);
// Determine alignment mask.
assert(!(BytesPerWord & 1), "must be a multiple of 2 for masking code to work");
int log2_elt_size = exact_log2(elt_size);
// Check for negative or excessive length.
size_t max_length = max_array_allocation_length >> log2_elt_size;
if (UseTLAB) {
size_t max_tlab = align_size_up(ThreadLocalAllocBuffer::max_size() >> log2_elt_size, 64*K);
if (max_tlab < max_length) { max_length = max_tlab; }
}
load_const_optimized(t1, max_length);
cmpld(CCR0, len, t1);
bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
// compute array size
// note: If 0 <= len <= max_length, len*elt_size + header + alignment is
// no larger than the largest integer; also, since top is always
// aligned, we can do the alignment here instead of at the end address
// computation.
const Register arr_size = t1;
Register arr_len_in_bytes = len;
if (elt_size != 1) {
sldi(t1, len, log2_elt_size);
arr_len_in_bytes = t1;
}
addi(arr_size, arr_len_in_bytes, hdr_size * wordSize + MinObjAlignmentInBytesMask); // Add space for header & alignment.
clrrdi(arr_size, arr_size, LogMinObjAlignmentInBytes); // Align array size.
// Allocate space & initialize header.
if (UseTLAB) {
tlab_allocate(obj, arr_size, 0, t2, slow_case);
} else {
eden_allocate(obj, arr_size, 0, t2, t3, slow_case);
}
initialize_header(obj, klass, len, t2, t3);
// Initialize body.
const Register base = t2;
const Register index = t3;
addi(base, obj, hdr_size * wordSize); // compute address of first element
addi(index, arr_size, -(hdr_size * wordSize)); // compute index = number of bytes to clear
initialize_body(base, index);
if (CURRENT_ENV->dtrace_alloc_probes()) {
Unimplemented();
//assert(obj == O0, "must be");
//call(CAST_FROM_FN_PTR(address, Runtime1::entry_for(Runtime1::dtrace_object_alloc_id)),
// relocInfo::runtime_call_type);
}
verify_oop(obj);
}
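
To make the length check and the no-overflow comment concrete (using the constants visible here, with 8-byte elements as the example): max_array_allocation_length is 0x40000000, so with log2_elt_size = 3 the bound compared against len is 0x40000000 >> 3 = 0x8000000 elements, possibly clamped further by the TLAB size. Because cmpld is an unsigned compare, a negative length also compares greater and takes the slow path. The byte size is then at most 0x40000000 plus hdr_size * wordSize plus MinObjAlignmentInBytesMask, which stays well below 2^31, so the addi/clrrdi size computation cannot overflow.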
#ifndef PRODUCT
void C1_MacroAssembler::verify_stack_oop(int stack_offset) {
verify_oop_addr((RegisterOrConstant)(stack_offset + STACK_BIAS), R1_SP, "broken oop in stack slot");
}
void C1_MacroAssembler::verify_not_null_oop(Register r) {
Label not_null;
cmpdi(CCR0, r, 0);
bne(CCR0, not_null);
stop("non-null oop required");
bind(not_null);
if (!VerifyOops) return;
verify_oop(r);
}
#endif // PRODUCT
void C1_MacroAssembler::null_check(Register r, Label* Lnull) {
if (TrapBasedNullChecks) { // SIGTRAP based
trap_null_check(r);
} else { // explicit
//const address exception_entry = Runtime1::entry_for(Runtime1::throw_null_pointer_exception_id);
assert(Lnull != NULL, "must have Label for explicit check");
cmpdi(CCR0, r, 0);
bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::equal), *Lnull);
}
}
address C1_MacroAssembler::call_c_with_frame_resize(address dest, int frame_resize) {
if (frame_resize) { resize_frame(-frame_resize, R0); }
#if defined(ABI_ELFv2)
address return_pc = call_c(dest, relocInfo::runtime_call_type);
#else
address return_pc = call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, dest), relocInfo::runtime_call_type);
#endif
if (frame_resize) { resize_frame(frame_resize, R0); }
return return_pc;
}

View file

@ -0,0 +1,93 @@
/*
* Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef CPU_PPC_VM_C1_MACROASSEMBLER_PPC_HPP
#define CPU_PPC_VM_C1_MACROASSEMBLER_PPC_HPP
void pd_init() { /* nothing to do */ }
public:
void try_allocate(
Register obj, // result: pointer to object after successful allocation
Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
int con_size_in_bytes, // object size in bytes if known at compile time
Register t1, // temp register
Register t2, // temp register
Label& slow_case // continuation point if fast allocation fails
);
void initialize_header(Register obj, Register klass, Register len, Register t1, Register t2);
void initialize_body(Register base, Register index);
void initialize_body(Register obj, Register tmp1, Register tmp2, int obj_size_in_bytes, int hdr_size_in_bytes);
// locking/unlocking
void lock_object (Register Rmark, Register Roop, Register Rbox, Register Rscratch, Label& slow_case);
void unlock_object(Register Rmark, Register Roop, Register Rbox, Label& slow_case);
void initialize_object(
Register obj, // result: pointer to object after successful allocation
Register klass, // object klass
Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
int con_size_in_bytes, // object size in bytes if known at compile time
Register t1, // temp register
Register t2 // temp register
);
// Allocation of fixed-size objects
// (Can also be used to allocate fixed-size arrays, by setting
// hdr_size correctly and storing the array length afterwards.)
void allocate_object(
Register obj, // result: pointer to object after successful allocation
Register t1, // temp register
Register t2, // temp register
Register t3, // temp register
int hdr_size, // object header size in words
int obj_size, // object size in words
Register klass, // object klass
Label& slow_case // continuation point if fast allocation fails
);
enum {
max_array_allocation_length = 0x40000000 // ppc friendly value, requires lis only
};
// Allocation of arrays
void allocate_array(
Register obj, // result: pointer to array after successful allocation
Register len, // array length
Register t1, // temp register
Register t2, // temp register
Register t3, // temp register
int hdr_size, // object header size in words
int elt_size, // element size in bytes
Register klass, // object klass
Label& slow_case // continuation point if fast allocation fails
);
void null_check(Register r, Label *Lnull = NULL);
address call_c_with_frame_resize(address dest, int frame_resize);
#endif // CPU_PPC_VM_C1_MACROASSEMBLER_PPC_HPP
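
On the "requires lis only" remark in the enum above: 0x40000000 equals 0x4000 << 16, so the constant can presumably be materialized with a single lis (load immediate shifted) instruction rather than a lis/ori pair, which is what makes it a PPC-friendly bound for the length check in allocate_array().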

File diff suppressed because it is too large

View file

@ -0,0 +1,68 @@
/*
* Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef CPU_PPC_VM_C1_GLOBALS_PPC_HPP
#define CPU_PPC_VM_C1_GLOBALS_PPC_HPP
#include "utilities/globalDefinitions.hpp"
#include "utilities/macros.hpp"
// Sets the default values for platform dependent flags used by the client compiler.
// (see c1_globals.hpp)
#ifndef TIERED
define_pd_global(bool, BackgroundCompilation, true );
define_pd_global(bool, CICompileOSR, true );
define_pd_global(bool, InlineIntrinsics, true );
define_pd_global(bool, PreferInterpreterNativeStubs, false);
define_pd_global(bool, ProfileTraps, false);
define_pd_global(bool, UseOnStackReplacement, true );
define_pd_global(bool, TieredCompilation, false);
define_pd_global(intx, CompileThreshold, 1000 );
define_pd_global(intx, OnStackReplacePercentage, 1400 );
define_pd_global(bool, UseTLAB, true );
define_pd_global(bool, ProfileInterpreter, false);
define_pd_global(intx, FreqInlineSize, 325 );
define_pd_global(bool, ResizeTLAB, true );
define_pd_global(intx, ReservedCodeCacheSize, 32*M );
define_pd_global(intx, CodeCacheExpansionSize, 32*K );
define_pd_global(uintx,CodeCacheMinBlockLength, 1);
define_pd_global(uintx,MetaspaceSize, 12*M );
define_pd_global(bool, NeverActAsServerClassMachine, true );
define_pd_global(intx, NewSizeThreadIncrease, 16*K );
define_pd_global(uint64_t,MaxRAM, 1ULL*G);
define_pd_global(intx, InitialCodeCacheSize, 160*K);
#endif // !TIERED
define_pd_global(bool, UseTypeProfile, false);
define_pd_global(bool, RoundFPResults, false);
define_pd_global(bool, LIRFillDelaySlots, false);
define_pd_global(bool, OptimizeSinglePrecision, false);
define_pd_global(bool, CSEArrayLength, true );
define_pd_global(bool, TwoOperandLIRForm, false);
#endif // CPU_PPC_VM_C1_GLOBALS_PPC_HPP

View file

@ -39,7 +39,7 @@ define_pd_global(bool, PreferInterpreterNativeStubs, false);
define_pd_global(bool, ProfileTraps, true); define_pd_global(bool, ProfileTraps, true);
define_pd_global(bool, UseOnStackReplacement, true); define_pd_global(bool, UseOnStackReplacement, true);
define_pd_global(bool, ProfileInterpreter, true); define_pd_global(bool, ProfileInterpreter, true);
define_pd_global(bool, TieredCompilation, false); define_pd_global(bool, TieredCompilation, true);
define_pd_global(intx, CompileThreshold, 10000); define_pd_global(intx, CompileThreshold, 10000);
define_pd_global(intx, OnStackReplacePercentage, 140); define_pd_global(intx, OnStackReplacePercentage, 140);

View file

@ -1,6 +1,6 @@
/* /*
* Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2013 SAP AG. All rights reserved. * Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -45,4 +45,14 @@ void Compile::pd_compiler2_init() {
FLAG_SET_ERGO(bool, InsertEndGroupPPC64, true); FLAG_SET_ERGO(bool, InsertEndGroupPPC64, true);
} }
} }
if (!VM_Version::has_isel() && FLAG_IS_DEFAULT(ConditionalMoveLimit)) {
FLAG_SET_ERGO(intx, ConditionalMoveLimit, 0);
}
if (OptimizeFill) {
warning("OptimizeFill is not supported on this CPU.");
FLAG_SET_DEFAULT(OptimizeFill, false);
}
} }

View file

@ -1,5 +1,6 @@
/* /*
* Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -129,13 +130,20 @@ address CompiledStaticCall::emit_to_interp_stub(CodeBuffer &cbuf, address mark/*
// - call // - call
__ calculate_address_from_global_toc(reg_scratch, __ method_toc()); __ calculate_address_from_global_toc(reg_scratch, __ method_toc());
AddressLiteral ic = __ allocate_metadata_address((Metadata *)NULL); AddressLiteral ic = __ allocate_metadata_address((Metadata *)NULL);
__ load_const_from_method_toc(as_Register(Matcher::inline_cache_reg_encode()), ic, reg_scratch); bool success = __ load_const_from_method_toc(as_Register(Matcher::inline_cache_reg_encode()),
ic, reg_scratch, /*fixed_size*/ true);
if (!success) {
return NULL; // CodeCache is full
}
if (ReoptimizeCallSequences) { if (ReoptimizeCallSequences) {
__ b64_patchable((address)-1, relocInfo::none); __ b64_patchable((address)-1, relocInfo::none);
} else { } else {
AddressLiteral a((address)-1); AddressLiteral a((address)-1);
__ load_const_from_method_toc(reg_scratch, a, reg_scratch); success = __ load_const_from_method_toc(reg_scratch, a, reg_scratch, /*fixed_size*/ true);
if (!success) {
return NULL; // CodeCache is full
}
__ mtctr(reg_scratch); __ mtctr(reg_scratch);
__ bctr(); __ bctr();
} }
@ -153,6 +161,7 @@ address CompiledStaticCall::emit_to_interp_stub(CodeBuffer &cbuf, address mark/*
return stub; return stub;
#else #else
ShouldNotReachHere(); ShouldNotReachHere();
return NULL;
#endif #endif
} }
#undef __ #undef __

View file

@ -1,6 +1,6 @@
/* /*
* Copyright (c) 2000, 2014, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2014 SAP AG. All rights reserved. * Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -236,39 +236,6 @@ void frame::describe_pd(FrameValues& values, int frame_no) {
} }
#endif #endif
void frame::adjust_unextended_sp() {
// If we are returning to a compiled MethodHandle call site, the
// saved_fp will in fact be a saved value of the unextended SP. The
// simplest way to tell whether we are returning to such a call site
// is as follows:
if (is_compiled_frame() && false /*is_at_mh_callsite()*/) { // TODO PPC port
// If the sender PC is a deoptimization point, get the original
// PC. For MethodHandle call site the unextended_sp is stored in
// saved_fp.
_unextended_sp = _fp - _cb->frame_size();
#ifdef ASSERT
nmethod *sender_nm = _cb->as_nmethod_or_null();
assert(sender_nm && *_sp == *_unextended_sp, "backlink changed");
intptr_t* sp = _unextended_sp; // check if stack can be walked from here
for (int x = 0; x < 5; ++x) { // check up to a couple of backlinks
intptr_t* prev_sp = *(intptr_t**)sp;
if (prev_sp == 0) break; // end of stack
assert(prev_sp>sp, "broken stack");
sp = prev_sp;
}
if (sender_nm->is_deopt_mh_entry(_pc)) { // checks for deoptimization
address original_pc = sender_nm->get_original_pc(this);
assert(sender_nm->insts_contains(original_pc), "original PC must be in nmethod");
assert(sender_nm->is_method_handle_return(original_pc), "must be");
}
#endif
}
}
intptr_t *frame::initial_deoptimization_info() { intptr_t *frame::initial_deoptimization_info() {
// unused... but returns fp() to minimize changes introduced by 7087445 // unused... but returns fp() to minimize changes introduced by 7087445
return fp(); return fp();

View file

@ -1,6 +1,6 @@
/* /*
* Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2014 SAP AG. All rights reserved. * Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -373,7 +373,6 @@
// The frame's stack pointer before it has been extended by a c2i adapter; // The frame's stack pointer before it has been extended by a c2i adapter;
// needed by deoptimization // needed by deoptimization
intptr_t* _unextended_sp; intptr_t* _unextended_sp;
void adjust_unextended_sp();
public: public:

View file

@ -39,9 +39,6 @@ inline void frame::find_codeblob_and_set_pc_and_deopt_state(address pc) {
_pc = pc; // Must be set for get_deopt_original_pc() _pc = pc; // Must be set for get_deopt_original_pc()
_fp = (intptr_t*)own_abi()->callers_sp; _fp = (intptr_t*)own_abi()->callers_sp;
// Use _fp - frame_size, needs to be done between _cb and _pc initialization
// and get_deopt_original_pc.
adjust_unextended_sp();
address original_pc = nmethod::get_deopt_original_pc(this); address original_pc = nmethod::get_deopt_original_pc(this);
if (original_pc != NULL) { if (original_pc != NULL) {

View file

@ -35,11 +35,18 @@ const int BytesPerInstWord = 4;
const int StackAlignmentInBytes = 16; const int StackAlignmentInBytes = 16;
// Indicates whether the C calling conventions require that
// 32-bit integer argument values are extended to 64 bits.
const bool CCallingConventionRequiresIntsAsLongs = true;
#define SUPPORTS_NATIVE_CX8 #define SUPPORTS_NATIVE_CX8
// The PPC CPUs are NOT multiple-copy-atomic. // The PPC CPUs are NOT multiple-copy-atomic.
#define CPU_NOT_MULTIPLE_COPY_ATOMIC #define CPU_NOT_MULTIPLE_COPY_ATOMIC
// The expected size in bytes of a cache line, used to pad data structures.
#define DEFAULT_CACHE_LINE_SIZE 128
#if defined(COMPILER2) && defined(AIX) #if defined(COMPILER2) && defined(AIX)
// Include Transactional Memory lock eliding optimization // Include Transactional Memory lock eliding optimization
#define INCLUDE_RTM_OPT 1 #define INCLUDE_RTM_OPT 1

View file

@ -87,9 +87,9 @@ void InterpreterMacroAssembler::dispatch_prolog(TosState state, int bcp_incr) {
// own dispatch. The dispatch address in R24_dispatch_addr is used for the // own dispatch. The dispatch address in R24_dispatch_addr is used for the
// dispatch. // dispatch.
void InterpreterMacroAssembler::dispatch_epilog(TosState state, int bcp_incr) { void InterpreterMacroAssembler::dispatch_epilog(TosState state, int bcp_incr) {
if (bcp_incr) { addi(R14_bcp, R14_bcp, bcp_incr); }
mtctr(R24_dispatch_addr); mtctr(R24_dispatch_addr);
addi(R14_bcp, R14_bcp, bcp_incr); bcctr(bcondAlways, 0, bhintbhBCCTRisNotPredictable);
bctr();
} }
void InterpreterMacroAssembler::check_and_handle_popframe(Register scratch_reg) { void InterpreterMacroAssembler::check_and_handle_popframe(Register scratch_reg) {
@ -207,9 +207,6 @@ void InterpreterMacroAssembler::dispatch_Lbyte_code(TosState state, Register byt
unimplemented("dispatch_Lbyte_code: verify"); // See Sparc Implementation to implement this unimplemented("dispatch_Lbyte_code: verify"); // See Sparc Implementation to implement this
} }
#ifdef FAST_DISPATCH
unimplemented("dispatch_Lbyte_code FAST_DISPATCH");
#else
assert_different_registers(bytecode, R11_scratch1); assert_different_registers(bytecode, R11_scratch1);
// Calc dispatch table address. // Calc dispatch table address.
@ -220,8 +217,7 @@ void InterpreterMacroAssembler::dispatch_Lbyte_code(TosState state, Register byt
// Jump off! // Jump off!
mtctr(R11_scratch1); mtctr(R11_scratch1);
bctr(); bcctr(bcondAlways, 0, bhintbhBCCTRisNotPredictable);
#endif
} }
void InterpreterMacroAssembler::load_receiver(Register Rparam_count, Register Rrecv_dst) { void InterpreterMacroAssembler::load_receiver(Register Rparam_count, Register Rrecv_dst) {
@ -544,8 +540,8 @@ void InterpreterMacroAssembler::index_check_without_pop(Register Rarray, Registe
sldi(RsxtIndex, RsxtIndex, index_shift); sldi(RsxtIndex, RsxtIndex, index_shift);
blt(CCR0, LnotOOR); blt(CCR0, LnotOOR);
// Index should be in R17_tos, array should be in R4_ARG2. // Index should be in R17_tos, array should be in R4_ARG2.
mr(R17_tos, Rindex); mr_if_needed(R17_tos, Rindex);
mr(R4_ARG2, Rarray); mr_if_needed(R4_ARG2, Rarray);
load_dispatch_table(Rtmp, (address*)Interpreter::_throw_ArrayIndexOutOfBoundsException_entry); load_dispatch_table(Rtmp, (address*)Interpreter::_throw_ArrayIndexOutOfBoundsException_entry);
mtctr(Rtmp); mtctr(Rtmp);
bctr(); bctr();
@ -842,7 +838,6 @@ void InterpreterMacroAssembler::lock_object(Register monitor, Register object) {
// Must fence, otherwise, preceding store(s) may float below cmpxchg. // Must fence, otherwise, preceding store(s) may float below cmpxchg.
// CmpxchgX sets CCR0 to cmpX(current, displaced). // CmpxchgX sets CCR0 to cmpX(current, displaced).
fence(); // TODO: replace by MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq ?
cmpxchgd(/*flag=*/CCR0, cmpxchgd(/*flag=*/CCR0,
/*current_value=*/current_header, /*current_value=*/current_header,
/*compare_value=*/displaced_header, /*exchange_value=*/monitor, /*compare_value=*/displaced_header, /*exchange_value=*/monitor,
@ -850,7 +845,8 @@ void InterpreterMacroAssembler::lock_object(Register monitor, Register object) {
MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
MacroAssembler::cmpxchgx_hint_acquire_lock(), MacroAssembler::cmpxchgx_hint_acquire_lock(),
noreg, noreg,
&cas_failed); &cas_failed,
/*check without membar and ldarx first*/true);
// If the compare-and-exchange succeeded, then we found an unlocked // If the compare-and-exchange succeeded, then we found an unlocked
// object and we have now locked it. // object and we have now locked it.
@ -868,9 +864,7 @@ void InterpreterMacroAssembler::lock_object(Register monitor, Register object) {
sub(current_header, current_header, R1_SP); sub(current_header, current_header, R1_SP);
assert(os::vm_page_size() > 0xfff, "page size too small - change the constant"); assert(os::vm_page_size() > 0xfff, "page size too small - change the constant");
load_const_optimized(tmp, load_const_optimized(tmp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
(address) (~(os::vm_page_size()-1) |
markOopDesc::lock_mask_in_place));
and_(R0/*==0?*/, current_header, tmp); and_(R0/*==0?*/, current_header, tmp);
// If condition is true we are done and hence we can store 0 in the displaced // If condition is true we are done and hence we can store 0 in the displaced
@ -1106,6 +1100,7 @@ void InterpreterMacroAssembler::verify_method_data_pointer() {
} }
void InterpreterMacroAssembler::test_invocation_counter_for_mdp(Register invocation_count, void InterpreterMacroAssembler::test_invocation_counter_for_mdp(Register invocation_count,
Register method_counters,
Register Rscratch, Register Rscratch,
Label &profile_continue) { Label &profile_continue) {
assert(ProfileInterpreter, "must be profiling interpreter"); assert(ProfileInterpreter, "must be profiling interpreter");
@ -1114,12 +1109,11 @@ void InterpreterMacroAssembler::test_invocation_counter_for_mdp(Register invocat
Label done; Label done;
// If no method data exists, and the counter is high enough, make one. // If no method data exists, and the counter is high enough, make one.
int ipl_offs = load_const_optimized(Rscratch, &InvocationCounter::InterpreterProfileLimit, R0, true); lwz(Rscratch, in_bytes(MethodCounters::interpreter_profile_limit_offset()), method_counters);
lwz(Rscratch, ipl_offs, Rscratch);
cmpdi(CCR0, R28_mdx, 0); cmpdi(CCR0, R28_mdx, 0);
// Test to see if we should create a method data oop. // Test to see if we should create a method data oop.
cmpd(CCR1, Rscratch /* InterpreterProfileLimit */, invocation_count); cmpd(CCR1, Rscratch, invocation_count);
bne(CCR0, done); bne(CCR0, done);
bge(CCR1, profile_continue); bge(CCR1, profile_continue);
@ -1132,15 +1126,15 @@ void InterpreterMacroAssembler::test_invocation_counter_for_mdp(Register invocat
bind(done); bind(done);
} }
void InterpreterMacroAssembler::test_backedge_count_for_osr(Register backedge_count, Register branch_bcp, Register Rtmp) { void InterpreterMacroAssembler::test_backedge_count_for_osr(Register backedge_count, Register method_counters,
assert_different_registers(backedge_count, Rtmp, branch_bcp); Register target_bcp, Register disp, Register Rtmp) {
assert_different_registers(backedge_count, target_bcp, disp, Rtmp, R4_ARG2);
assert(UseOnStackReplacement,"Must UseOnStackReplacement to test_backedge_count_for_osr"); assert(UseOnStackReplacement,"Must UseOnStackReplacement to test_backedge_count_for_osr");
Label did_not_overflow; Label did_not_overflow;
Label overflow_with_error; Label overflow_with_error;
int ibbl_offs = load_const_optimized(Rtmp, &InvocationCounter::InterpreterBackwardBranchLimit, R0, true); lwz(Rtmp, in_bytes(MethodCounters::interpreter_backward_branch_limit_offset()), method_counters);
lwz(Rtmp, ibbl_offs, Rtmp);
cmpw(CCR0, backedge_count, Rtmp); cmpw(CCR0, backedge_count, Rtmp);
blt(CCR0, did_not_overflow); blt(CCR0, did_not_overflow);
@ -1152,17 +1146,15 @@ void InterpreterMacroAssembler::test_backedge_count_for_osr(Register backedge_co
// the overflow function is called only once every overflow_frequency. // the overflow function is called only once every overflow_frequency.
if (ProfileInterpreter) { if (ProfileInterpreter) {
const int overflow_frequency = 1024; const int overflow_frequency = 1024;
li(Rtmp, overflow_frequency-1); andi_(Rtmp, backedge_count, overflow_frequency-1);
andr(Rtmp, Rtmp, backedge_count);
cmpwi(CCR0, Rtmp, 0);
bne(CCR0, did_not_overflow); bne(CCR0, did_not_overflow);
} }
// Overflow in loop, pass branch bytecode. // Overflow in loop, pass branch bytecode.
call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::frequency_counter_overflow), branch_bcp, true); subf(R4_ARG2, disp, target_bcp); // Compute branch bytecode (previous bcp).
call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::frequency_counter_overflow), R4_ARG2, true);
// Was an OSR adapter generated? // Was an OSR adapter generated?
// O0 = osr nmethod
cmpdi(CCR0, R3_RET, 0); cmpdi(CCR0, R3_RET, 0);
beq(CCR0, overflow_with_error); beq(CCR0, overflow_with_error);
@ -1323,7 +1315,7 @@ void InterpreterMacroAssembler::increment_backedge_counter(const Register Rcount
assert_different_registers(Rdst, Rtmp1); assert_different_registers(Rdst, Rtmp1);
const Register invocation_counter = Rtmp1; const Register invocation_counter = Rtmp1;
const Register counter = Rdst; const Register counter = Rdst;
// TODO ppc port assert(4 == InvocationCounter::sz_counter(), "unexpected field size."); // TODO: PPC port: assert(4 == InvocationCounter::sz_counter(), "unexpected field size.");
// Load backedge counter. // Load backedge counter.
lwz(counter, in_bytes(MethodCounters::backedge_counter_offset()) + lwz(counter, in_bytes(MethodCounters::backedge_counter_offset()) +
@ -1336,8 +1328,7 @@ void InterpreterMacroAssembler::increment_backedge_counter(const Register Rcount
addi(counter, counter, InvocationCounter::count_increment); addi(counter, counter, InvocationCounter::count_increment);
// Mask the invocation counter. // Mask the invocation counter.
li(Rscratch, InvocationCounter::count_mask_value); andi(invocation_counter, invocation_counter, InvocationCounter::count_mask_value);
andr(invocation_counter, invocation_counter, Rscratch);
// Store new counter value. // Store new counter value.
stw(counter, in_bytes(MethodCounters::backedge_counter_offset()) + stw(counter, in_bytes(MethodCounters::backedge_counter_offset()) +
@ -1822,15 +1813,13 @@ void InterpreterMacroAssembler::profile_return_type(Register ret, Register tmp1,
test_method_data_pointer(profile_continue); test_method_data_pointer(profile_continue);
if (MethodData::profile_return_jsr292_only()) { if (MethodData::profile_return_jsr292_only()) {
assert(Method::intrinsic_id_size_in_bytes() == 2, "assuming Method::_intrinsic_id is u2");
// If we don't profile all invoke bytecodes we must make sure // If we don't profile all invoke bytecodes we must make sure
// it's a bytecode we indeed profile. We can't go back to the // it's a bytecode we indeed profile. We can't go back to the
// beginning of the ProfileData we intend to update to check its // beginning of the ProfileData we intend to update to check its
// type because we're right after it and we don't know its // type because we're right after it and we don't know its
// length. // length.
lbz(tmp1, 0, R14_bcp); lbz(tmp1, 0, R14_bcp);
lhz(tmp2, Method::intrinsic_id_offset_in_bytes(), R19_method); lbz(tmp2, Method::intrinsic_id_offset_in_bytes(), R19_method);
cmpwi(CCR0, tmp1, Bytecodes::_invokedynamic); cmpwi(CCR0, tmp1, Bytecodes::_invokedynamic);
cmpwi(CCR1, tmp1, Bytecodes::_invokehandle); cmpwi(CCR1, tmp1, Bytecodes::_invokehandle);
cror(CCR0, Assembler::equal, CCR1, Assembler::equal); cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
@ -2224,9 +2213,7 @@ void InterpreterMacroAssembler::increment_invocation_counter(Register Rcounters,
// Load the backedge counter. // Load the backedge counter.
lwz(backedge_count, be_counter_offset, Rcounters); // is unsigned int lwz(backedge_count, be_counter_offset, Rcounters); // is unsigned int
// Mask the backedge counter. // Mask the backedge counter.
Register tmp = invocation_count; andi(backedge_count, backedge_count, InvocationCounter::count_mask_value);
li(tmp, InvocationCounter::count_mask_value);
andr(backedge_count, tmp, backedge_count); // Cannot use andi, need sign extension of count_mask_value.
// Load the invocation counter. // Load the invocation counter.
lwz(invocation_count, inv_counter_offset, Rcounters); // is unsigned int lwz(invocation_count, inv_counter_offset, Rcounters); // is unsigned int
@ -2282,7 +2269,7 @@ void InterpreterMacroAssembler::verify_oop_or_return_address(Register reg, Regis
bne(CCR0, test); bne(CCR0, test);
address fd = CAST_FROM_FN_PTR(address, verify_return_address); address fd = CAST_FROM_FN_PTR(address, verify_return_address);
const int nbytes_save = 11*8; // volatile gprs except R0 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
save_volatile_gprs(R1_SP, -nbytes_save); // except R0 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
save_LR_CR(Rtmp); // Save in old frame. save_LR_CR(Rtmp); // Save in old frame.
push_frame_reg_args(nbytes_save, Rtmp); push_frame_reg_args(nbytes_save, Rtmp);

View file

@ -195,7 +195,7 @@ class InterpreterMacroAssembler: public MacroAssembler {
void restore_interpreter_state(Register scratch, bool bcp_and_mdx_only = false); void restore_interpreter_state(Register scratch, bool bcp_and_mdx_only = false);
void increment_backedge_counter(const Register Rcounters, Register Rtmp, Register Rtmp2, Register Rscratch); void increment_backedge_counter(const Register Rcounters, Register Rtmp, Register Rtmp2, Register Rscratch);
void test_backedge_count_for_osr(Register backedge_count, Register branch_bcp, Register Rtmp); void test_backedge_count_for_osr(Register backedge_count, Register method_counters, Register target_bcp, Register disp, Register Rtmp);
void record_static_call_in_profile(Register Rentry, Register Rtmp); void record_static_call_in_profile(Register Rentry, Register Rtmp);
void record_receiver_call_in_profile(Register Rklass, Register Rentry, Register Rtmp); void record_receiver_call_in_profile(Register Rklass, Register Rentry, Register Rtmp);
@ -211,7 +211,7 @@ class InterpreterMacroAssembler: public MacroAssembler {
void set_method_data_pointer_for_bcp(); void set_method_data_pointer_for_bcp();
void test_method_data_pointer(Label& zero_continue); void test_method_data_pointer(Label& zero_continue);
void verify_method_data_pointer(); void verify_method_data_pointer();
void test_invocation_counter_for_mdp(Register invocation_count, Register Rscratch, Label &profile_continue); void test_invocation_counter_for_mdp(Register invocation_count, Register method_counters, Register Rscratch, Label &profile_continue);
void set_mdp_data_at(int constant, Register value); void set_mdp_data_at(int constant, Register value);

View file

@ -30,6 +30,7 @@
#include "gc/shared/collectedHeap.inline.hpp" #include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp" #include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp" #include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "prims/methodHandles.hpp" #include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp" #include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp" #include "runtime/icache.hpp"
@ -114,7 +115,7 @@ void MacroAssembler::calculate_address_from_global_toc(Register dst, address add
} }
if (hi16) { if (hi16) {
addis(dst, R29, MacroAssembler::largeoffset_si16_si16_hi(offset)); addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
} }
if (lo16) { if (lo16) {
if (add_relocation) { if (add_relocation) {
@ -256,7 +257,9 @@ narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
} }
#endif // _LP64 #endif // _LP64
void MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, Register toc) { // Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
Register toc, bool fixed_size) {
int toc_offset = 0; int toc_offset = 0;
// Use RelocationHolder::none for the constant pool entry, otherwise // Use RelocationHolder::none for the constant pool entry, otherwise
// we will end up with a failing NativeCall::verify(x) where x is // we will end up with a failing NativeCall::verify(x) where x is
@ -264,11 +267,13 @@ void MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
// FIXME: We should insert relocation information for oops at the constant // FIXME: We should insert relocation information for oops at the constant
// pool entries instead of inserting it at the loads; patching of a constant // pool entries instead of inserting it at the loads; patching of a constant
// pool entry should be less expensive. // pool entry should be less expensive.
address oop_address = address_constant((address)a.value(), RelocationHolder::none); address const_address = address_constant((address)a.value(), RelocationHolder::none);
if (const_address == NULL) { return false; } // allocation failure
// Relocate at the pc of the load. // Relocate at the pc of the load.
relocate(a.rspec()); relocate(a.rspec());
toc_offset = (int)(oop_address - code()->consts()->start()); toc_offset = (int)(const_address - code()->consts()->start());
ld_largeoffset_unchecked(dst, toc_offset, toc, true); ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
return true;
} }
bool MacroAssembler::is_load_const_from_method_toc_at(address a) { bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
@ -446,6 +451,15 @@ void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
assert(dest.is_bound() || target_pc == b_pc, "postcondition"); assert(dest.is_bound() || target_pc == b_pc, "postcondition");
} }
// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
bc(boint, biint, dest);
} else {
bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
}
}
bool MacroAssembler::is_bc_far_at(address instruction_addr) { bool MacroAssembler::is_bc_far_at(address instruction_addr) {
return is_bc_far_variant1_at(instruction_addr) || return is_bc_far_variant1_at(instruction_addr) ||
is_bc_far_variant2_at(instruction_addr) || is_bc_far_variant2_at(instruction_addr) ||
@ -496,7 +510,7 @@ void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address des
// variant 1, the 1st instruction contains the destination address: // variant 1, the 1st instruction contains the destination address:
// //
// bcxx DEST // bcxx DEST
// endgroup // nop
// //
const int instruction_1 = *(int*)(instruction_addr); const int instruction_1 = *(int*)(instruction_addr);
boint = inv_bo_field(instruction_1); boint = inv_bo_field(instruction_1);
@ -523,10 +537,10 @@ void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address des
// variant 1: // variant 1:
// //
// bcxx DEST // bcxx DEST
// endgroup // nop
// //
masm.bc(boint, biint, dest); masm.bc(boint, biint, dest);
masm.endgroup(); masm.nop();
} else { } else {
// variant 2: // variant 2:
// //
@ -810,7 +824,22 @@ void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
std(R9, offset, dst); offset += 8; std(R9, offset, dst); offset += 8;
std(R10, offset, dst); offset += 8; std(R10, offset, dst); offset += 8;
std(R11, offset, dst); offset += 8; std(R11, offset, dst); offset += 8;
std(R12, offset, dst); std(R12, offset, dst); offset += 8;
stfd(F0, offset, dst); offset += 8;
stfd(F1, offset, dst); offset += 8;
stfd(F2, offset, dst); offset += 8;
stfd(F3, offset, dst); offset += 8;
stfd(F4, offset, dst); offset += 8;
stfd(F5, offset, dst); offset += 8;
stfd(F6, offset, dst); offset += 8;
stfd(F7, offset, dst); offset += 8;
stfd(F8, offset, dst); offset += 8;
stfd(F9, offset, dst); offset += 8;
stfd(F10, offset, dst); offset += 8;
stfd(F11, offset, dst); offset += 8;
stfd(F12, offset, dst); offset += 8;
stfd(F13, offset, dst);
} }
// For verify_oops. // For verify_oops.
@ -825,7 +854,22 @@ void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
ld(R9, offset, src); offset += 8; ld(R9, offset, src); offset += 8;
ld(R10, offset, src); offset += 8; ld(R10, offset, src); offset += 8;
ld(R11, offset, src); offset += 8; ld(R11, offset, src); offset += 8;
ld(R12, offset, src); ld(R12, offset, src); offset += 8;
lfd(F0, offset, src); offset += 8;
lfd(F1, offset, src); offset += 8;
lfd(F2, offset, src); offset += 8;
lfd(F3, offset, src); offset += 8;
lfd(F4, offset, src); offset += 8;
lfd(F5, offset, src); offset += 8;
lfd(F6, offset, src); offset += 8;
lfd(F7, offset, src); offset += 8;
lfd(F8, offset, src); offset += 8;
lfd(F9, offset, src); offset += 8;
lfd(F10, offset, src); offset += 8;
lfd(F11, offset, src); offset += 8;
lfd(F12, offset, src); offset += 8;
lfd(F13, offset, src);
} }
void MacroAssembler::save_LR_CR(Register tmp) { void MacroAssembler::save_LR_CR(Register tmp) {
@ -908,7 +952,7 @@ void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
if (is_simm(-offset, 16)) { if (is_simm(-offset, 16)) {
stdu(R1_SP, -offset, R1_SP); stdu(R1_SP, -offset, R1_SP);
} else { } else {
load_const(tmp, -offset); load_const_optimized(tmp, -offset);
stdux(R1_SP, R1_SP, tmp); stdux(R1_SP, R1_SP, tmp);
} }
} }
@ -1090,20 +1134,21 @@ address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
assert(fd->entry() != NULL, "function must be linked"); assert(fd->entry() != NULL, "function must be linked");
AddressLiteral fd_entry(fd->entry()); AddressLiteral fd_entry(fd->entry());
load_const_from_method_toc(R11, fd_entry, toc); bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
mtctr(R11); mtctr(R11);
if (fd->env() == NULL) { if (fd->env() == NULL) {
li(R11, 0); li(R11, 0);
nop(); nop();
} else { } else {
AddressLiteral fd_env(fd->env()); AddressLiteral fd_env(fd->env());
load_const_from_method_toc(R11, fd_env, toc); success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
} }
AddressLiteral fd_toc(fd->toc()); AddressLiteral fd_toc(fd->toc());
load_toc_from_toc(R2_TOC, fd_toc, toc); // Set R2_TOC (load from toc)
// R2_TOC is killed. success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
bctrl(); bctrl();
_last_calls_return_pc = pc(); _last_calls_return_pc = pc();
if (!success) { return NULL; }
} else { } else {
// It's a friend function, load the entry point and don't care about // It's a friend function, load the entry point and don't care about
// toc and env. Use an optimizable call instruction, but ensure the // toc and env. Use an optimizable call instruction, but ensure the
@ -1367,11 +1412,6 @@ void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_valu
bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
int_flag_success != exchange_value && int_flag_success != addr_base); int_flag_success != exchange_value && int_flag_success != addr_base);
// release/fence semantics
if (semantics & MemBarRel) {
release();
}
if (use_result_reg && preset_result_reg) { if (use_result_reg && preset_result_reg) {
li(int_flag_success, 0); // preset (assume cas failed) li(int_flag_success, 0); // preset (assume cas failed)
} }
@ -1383,6 +1423,11 @@ void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_valu
bne(flag, failed); bne(flag, failed);
} }
// release/fence semantics
if (semantics & MemBarRel) {
release();
}
// atomic emulation loop // atomic emulation loop
bind(retry); bind(retry);
@ -1462,11 +1507,6 @@ void MacroAssembler::cmpxchgd(ConditionRegister flag,
int_flag_success!=exchange_value && int_flag_success!=addr_base); int_flag_success!=exchange_value && int_flag_success!=addr_base);
assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
// release/fence semantics
if (semantics & MemBarRel) {
release();
}
if (use_result_reg && preset_result_reg) { if (use_result_reg && preset_result_reg) {
li(int_flag_success, 0); // preset (assume cas failed) li(int_flag_success, 0); // preset (assume cas failed)
} }
@ -1478,6 +1518,11 @@ void MacroAssembler::cmpxchgd(ConditionRegister flag,
bne(flag, failed); bne(flag, failed);
} }
// release/fence semantics
if (semantics & MemBarRel) {
release();
}
// atomic emulation loop // atomic emulation loop
bind(retry); bind(retry);
@ -1501,8 +1546,6 @@ void MacroAssembler::cmpxchgd(ConditionRegister flag,
li(int_flag_success, 1); li(int_flag_success, 1);
} }
// POWER6 doesn't need isync in CAS.
// Always emit isync to be on the safe side.
if (semantics & MemBarFenceAfter) { if (semantics & MemBarFenceAfter) {
fence(); fence();
} else if (semantics & MemBarAcq) { } else if (semantics & MemBarAcq) {
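The cmpxchgw/cmpxchgd hunks above move the release barrier so it is skipped on the early-exit path and sits right before the ldarx/stdcx. loop; the resulting ordering is the usual release-before / acquire-after bracket around a CAS. A rough C++11 illustration of that bracket (std::atomic stands in for the emitted PPC sequence; this is a sketch, not the HotSpot code):

#include <atomic>

// Release fence before the CAS attempt, acquire (or full) fence after it,
// matching the MemBarRel / MemBarAcq / MemBarFenceAfter flags above.
bool cas_with_semantics(std::atomic<long>& where, long compare_value,
                        long exchange_value, bool bar_rel, bool bar_acq) {
  if (bar_rel) std::atomic_thread_fence(std::memory_order_release);    // lwsync
  long expected = compare_value;
  bool ok = where.compare_exchange_strong(expected, exchange_value,
                                          std::memory_order_relaxed);  // ldarx/stdcx. loop
  if (bar_acq) std::atomic_thread_fence(std::memory_order_acquire);    // isync/sync
  return ok;
}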
@ -1627,13 +1670,14 @@ void MacroAssembler::lookup_virtual_method(Register recv_klass,
} }
/////////////////////////////////////////// subtype checking //////////////////////////////////////////// /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
Register super_klass, Register super_klass,
Register temp1_reg, Register temp1_reg,
Register temp2_reg, Register temp2_reg,
Label& L_success, Label* L_success,
Label& L_failure) { Label* L_failure,
Label* L_slow_path,
RegisterOrConstant super_check_offset) {
const Register check_cache_offset = temp1_reg; const Register check_cache_offset = temp1_reg;
const Register cached_super = temp2_reg; const Register cached_super = temp2_reg;
@ -1643,6 +1687,18 @@ void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
int sco_offset = in_bytes(Klass::super_check_offset_offset()); int sco_offset = in_bytes(Klass::super_check_offset_offset());
int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
Label L_fallthrough;
int label_nulls = 0;
if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
assert(label_nulls <= 1 ||
(L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
"at most one NULL in the batch, usually");
// If the pointers are equal, we are done (e.g., String[] elements). // If the pointers are equal, we are done (e.g., String[] elements).
// This self-check enables sharing of secondary supertype arrays among // This self-check enables sharing of secondary supertype arrays among
// non-primary types such as array-of-interface. Otherwise, each such // non-primary types such as array-of-interface. Otherwise, each such
@ -1651,15 +1707,20 @@ void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
// type checks are in fact trivially successful in this manner, // type checks are in fact trivially successful in this manner,
// so we get a nicely predicted branch right at the start of the check. // so we get a nicely predicted branch right at the start of the check.
cmpd(CCR0, sub_klass, super_klass); cmpd(CCR0, sub_klass, super_klass);
beq(CCR0, L_success); beq(CCR0, *L_success);
// Check the supertype display: // Check the supertype display:
if (must_load_sco) {
// The super check offset is always positive...
lwz(check_cache_offset, sco_offset, super_klass); lwz(check_cache_offset, sco_offset, super_klass);
super_check_offset = RegisterOrConstant(check_cache_offset);
// super_check_offset is register.
assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
}
// The loaded value is the offset from KlassOopDesc. // The loaded value is the offset from KlassOopDesc.
ldx(cached_super, check_cache_offset, sub_klass); ld(cached_super, super_check_offset, sub_klass);
cmpd(CCR0, cached_super, super_klass); cmpd(CCR0, cached_super, super_klass);
beq(CCR0, L_success);
// This check has worked decisively for primary supers. // This check has worked decisively for primary supers.
// Secondary supers are sought in the super_cache ('super_cache_addr'). // Secondary supers are sought in the super_cache ('super_cache_addr').
@ -1672,9 +1733,39 @@ void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
// So if it was a primary super, we can just fail immediately. // So if it was a primary super, we can just fail immediately.
// Otherwise, it's the slow path for us (no success at this point). // Otherwise, it's the slow path for us (no success at this point).
cmpwi(CCR0, check_cache_offset, sc_offset); #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
bne(CCR0, L_failure);
// bind(slow_path); // fallthru if (super_check_offset.is_register()) {
beq(CCR0, *L_success);
cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
if (L_failure == &L_fallthrough) {
beq(CCR0, *L_slow_path);
} else {
bne(CCR0, *L_failure);
FINAL_JUMP(*L_slow_path);
}
} else {
if (super_check_offset.as_constant() == sc_offset) {
// Need a slow path; fast failure is impossible.
if (L_slow_path == &L_fallthrough) {
beq(CCR0, *L_success);
} else {
bne(CCR0, *L_slow_path);
FINAL_JUMP(*L_success);
}
} else {
// No slow path; it's a fast decision.
if (L_failure == &L_fallthrough) {
beq(CCR0, *L_success);
} else {
bne(CCR0, *L_failure);
FINAL_JUMP(*L_success);
}
}
}
bind(L_fallthrough);
#undef FINAL_JUMP
} }
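Stripped of the branch plumbing (FINAL_JUMP and the shared fall-through label), the rewritten fast path reduces to a three-way decision. A hedged C++ rendering of that decision, with simplified types and no secondary-supers details:

enum FastPathOutcome { FP_SUCCESS, FP_FAILURE, FP_SLOW_PATH };

// sub/super are Klass* in the real code; cached_super is the word loaded at
// super_check_offset inside sub; sc_offset stands for
// Klass::secondary_super_cache_offset(). All illustrative.
FastPathOutcome subtype_fast_path(const void* sub, const void* super,
                                  const void* cached_super,
                                  long super_check_offset, long sc_offset) {
  if (sub == super)          return FP_SUCCESS;  // trivial self-check
  if (cached_super == super) return FP_SUCCESS;  // supertype display / cache hit
  // A miss in the secondary-super-cache slot is not decisive: take the slow
  // path. A miss in any primary-display slot is a definite failure.
  return (super_check_offset == sc_offset) ? FP_SLOW_PATH : FP_FAILURE;
}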
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
@ -1698,7 +1789,7 @@ void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
ld(array_ptr, source_offset, sub_klass); ld(array_ptr, source_offset, sub_klass);
//assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
lwz(temp, length_offset, array_ptr); lwz(temp, length_offset, array_ptr);
cmpwi(CCR0, temp, 0); cmpwi(CCR0, temp, 0);
beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
@ -1719,8 +1810,9 @@ void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
bind(hit); bind(hit);
std(super_klass, target_offset, sub_klass); // save result to cache std(super_klass, target_offset, sub_klass); // save result to cache
if (result_reg != noreg) li(result_reg, 0); // load zero result (indicates a hit) if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
if (L_success != NULL) b(*L_success); if (L_success != NULL) { b(*L_success); }
else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
bind(fallthru); bind(fallthru);
} }
@ -1732,7 +1824,7 @@ void MacroAssembler::check_klass_subtype(Register sub_klass,
Register temp2_reg, Register temp2_reg,
Label& L_success) { Label& L_success) {
Label L_failure; Label L_failure;
check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, L_failure); check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
bind(L_failure); // Fallthru if not successful. bind(L_failure); // Fallthru if not successful.
} }
@ -1765,6 +1857,7 @@ RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
} }
} }
// Supports temp2_reg = R0.
void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
Register mark_reg, Register temp_reg, Register mark_reg, Register temp_reg,
Register temp2_reg, Label& done, Label* slow_case) { Register temp2_reg, Label& done, Label* slow_case) {
@ -1788,10 +1881,10 @@ void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj
"biased locking makes assumptions about bit layout"); "biased locking makes assumptions about bit layout");
if (PrintBiasedLockingStatistics) { if (PrintBiasedLockingStatistics) {
load_const(temp_reg, (address) BiasedLocking::total_entry_count_addr(), temp2_reg); load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
lwz(temp2_reg, 0, temp_reg); lwzx(temp_reg, temp2_reg);
addi(temp2_reg, temp2_reg, 1); addi(temp_reg, temp_reg, 1);
stw(temp2_reg, 0, temp_reg); stwx(temp_reg, temp2_reg);
} }
andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
@ -1809,10 +1902,10 @@ void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj
if (PrintBiasedLockingStatistics) { if (PrintBiasedLockingStatistics) {
Label l; Label l;
bne(cr_reg, l); bne(cr_reg, l);
load_const(mark_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
lwz(temp2_reg, 0, mark_reg); lwzx(mark_reg, temp2_reg);
addi(temp2_reg, temp2_reg, 1); addi(mark_reg, mark_reg, 1);
stw(temp2_reg, 0, mark_reg); stwx(mark_reg, temp2_reg);
// restore mark_reg // restore mark_reg
ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
bind(l); bind(l);
@ -1878,10 +1971,10 @@ void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj
// need to revoke that bias. The revocation will occur in the // need to revoke that bias. The revocation will occur in the
// interpreter runtime in the slow case. // interpreter runtime in the slow case.
if (PrintBiasedLockingStatistics) { if (PrintBiasedLockingStatistics) {
load_const(temp_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp2_reg); load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
lwz(temp2_reg, 0, temp_reg); lwzx(temp_reg, temp2_reg);
addi(temp2_reg, temp2_reg, 1); addi(temp_reg, temp_reg, 1);
stw(temp2_reg, 0, temp_reg); stwx(temp_reg, temp2_reg);
} }
b(done); b(done);
@ -1892,15 +1985,14 @@ void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj
// value as the comparison value when doing the cas to acquire the // value as the comparison value when doing the cas to acquire the
// bias in the current epoch. In other words, we allow transfer of // bias in the current epoch. In other words, we allow transfer of
// the bias from one thread to another directly in this situation. // the bias from one thread to another directly in this situation.
andi(temp_reg, mark_reg, markOopDesc::age_mask_in_place); load_klass(temp_reg, obj_reg);
orr(temp_reg, R16_thread, temp_reg); andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
load_klass(temp2_reg, obj_reg); orr(temp2_reg, R16_thread, temp2_reg);
ld(temp2_reg, in_bytes(Klass::prototype_header_offset()), temp2_reg); ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
orr(temp_reg, temp_reg, temp2_reg); orr(temp_reg, temp2_reg, temp_reg);
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
// CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
/*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
/*where=*/obj_reg, /*where=*/obj_reg,
@ -1913,10 +2005,10 @@ void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj
// need to revoke that bias. The revocation will occur in the // need to revoke that bias. The revocation will occur in the
// interpreter runtime in the slow case. // interpreter runtime in the slow case.
if (PrintBiasedLockingStatistics) { if (PrintBiasedLockingStatistics) {
load_const(temp_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp2_reg); load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
lwz(temp2_reg, 0, temp_reg); lwzx(temp_reg, temp2_reg);
addi(temp2_reg, temp2_reg, 1); addi(temp_reg, temp_reg, 1);
stw(temp2_reg, 0, temp_reg); stwx(temp_reg, temp2_reg);
} }
b(done); b(done);
@ -1952,10 +2044,10 @@ void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj
if (PrintBiasedLockingStatistics) { if (PrintBiasedLockingStatistics) {
Label l; Label l;
bne(cr_reg, l); bne(cr_reg, l);
load_const(temp_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp2_reg); load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
lwz(temp2_reg, 0, temp_reg); lwzx(temp_reg, temp2_reg);
addi(temp2_reg, temp2_reg, 1); addi(temp_reg, temp_reg, 1);
stw(temp2_reg, 0, temp_reg); stwx(temp_reg, temp2_reg);
bind(l); bind(l);
} }
@ -1977,6 +2069,109 @@ void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mar
beq(cr_reg, done); beq(cr_reg, done);
} }
// allocation (for C1)
void MacroAssembler::eden_allocate(
Register obj, // result: pointer to object after successful allocation
Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
int con_size_in_bytes, // object size in bytes if known at compile time
Register t1, // temp register
Register t2, // temp register
Label& slow_case // continuation point if fast allocation fails
) {
b(slow_case);
}
void MacroAssembler::tlab_allocate(
Register obj, // result: pointer to object after successful allocation
Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
int con_size_in_bytes, // object size in bytes if known at compile time
Register t1, // temp register
Label& slow_case // continuation point if fast allocation fails
) {
// make sure arguments make sense
assert_different_registers(obj, var_size_in_bytes, t1);
assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
const Register new_top = t1;
//verify_tlab(); not implemented
ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
if (var_size_in_bytes == noreg) {
addi(new_top, obj, con_size_in_bytes);
} else {
add(new_top, obj, var_size_in_bytes);
}
cmpld(CCR0, new_top, R0);
bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
#ifdef ASSERT
// make sure new free pointer is properly aligned
{
Label L;
andi_(R0, new_top, MinObjAlignmentInBytesMask);
beq(CCR0, L);
stop("updated TLAB free is not properly aligned", 0x934);
bind(L);
}
#endif // ASSERT
// update the tlab top pointer
std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
//verify_tlab(); not implemented
}
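tlab_allocate() above is a straight bump-pointer allocation against the thread-local buffer; a small sketch of the same logic in plain C++ (the struct and names are placeholders for the JavaThread tlab fields):

#include <cstddef>

struct TlabSketch { char* top; char* end; };   // stands in for tlab_top/tlab_end

// Returns the object start, or nullptr where the assembler branches to slow_case.
inline char* tlab_allocate_sketch(TlabSketch& tlab, size_t size_in_bytes) {
  char* obj     = tlab.top;
  char* new_top = obj + size_in_bytes;
  if (new_top > tlab.end) return nullptr;  // bc_far_optimized(..., slow_case)
  tlab.top = new_top;                      // std(new_top, tlab_top_offset, R16_thread)
  return obj;
}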
void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) {
unimplemented("tlab_refill");
}
void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
unimplemented("incr_allocated_bytes");
}
address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
int insts_call_instruction_offset, Register Rtoc) {
// Start the stub.
address stub = start_a_stub(64);
if (stub == NULL) { return NULL; } // CodeCache full: bail out
// Create a trampoline stub relocation which relates this trampoline stub
// with the call instruction at insts_call_instruction_offset in the
// instructions code-section.
relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
const int stub_start_offset = offset();
// For java_to_interp stubs we use R11_scratch1 as scratch register
// and in call trampoline stubs we use R12_scratch2. This way we
// can distinguish them (see is_NativeCallTrampolineStub_at()).
Register reg_scratch = R12_scratch2;
// Now, create the trampoline stub's code:
// - load the TOC
// - load the call target from the constant pool
// - call
if (Rtoc == noreg) {
calculate_address_from_global_toc(reg_scratch, method_toc());
Rtoc = reg_scratch;
}
ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
mtctr(reg_scratch);
bctr();
const address stub_start_addr = addr_at(stub_start_offset);
// Assert that the encoded destination_toc_offset can be identified and that it is correct.
assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
"encoded offset into the constant pool must match");
// Trampoline_stub_size should be good.
assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
// End the stub.
end_a_stub();
return stub;
}
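At run time the emitted trampoline only has to fetch the 64-bit call target out of the constant pool and branch through CTR; expressed as C++ for orientation (the function-pointer cast and names are illustrative only):

#include <cstdint>

typedef void (*CallTarget)();

// What the stub's instructions amount to:
//   ld    R12, destination_toc_offset(Rtoc)   -- load the call target
//   mtctr R12 ; bctr                          -- branch through CTR
inline void trampoline_sketch(const char* toc, int destination_toc_offset) {
  CallTarget target = *(const CallTarget*)(toc + destination_toc_offset);
  target();
}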
// TM on PPC64. // TM on PPC64.
void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
Label retry; Label retry;
@ -2387,17 +2582,16 @@ void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register
// Must fence, otherwise, preceding store(s) may float below cmpxchg. // Must fence, otherwise, preceding store(s) may float below cmpxchg.
// Compare object markOop with mark and if equal exchange scratch1 with object markOop. // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
// CmpxchgX sets cr_reg to cmpX(current, displaced).
membar(Assembler::StoreStore);
cmpxchgd(/*flag=*/flag, cmpxchgd(/*flag=*/flag,
/*current_value=*/current_header, /*current_value=*/current_header,
/*compare_value=*/displaced_header, /*compare_value=*/displaced_header,
/*exchange_value=*/box, /*exchange_value=*/box,
/*where=*/oop, /*where=*/oop,
MacroAssembler::MemBarAcq, MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
MacroAssembler::cmpxchgx_hint_acquire_lock(), MacroAssembler::cmpxchgx_hint_acquire_lock(),
noreg, noreg,
&cas_failed); &cas_failed,
/*check without membar and ldarx first*/true);
assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
// If the compare-and-exchange succeeded, then we found an unlocked // If the compare-and-exchange succeeded, then we found an unlocked
@ -2410,8 +2604,7 @@ void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register
// Check if the owner is self by comparing the value in the markOop of object // Check if the owner is self by comparing the value in the markOop of object
// (current_header) with the stack pointer. // (current_header) with the stack pointer.
sub(current_header, current_header, R1_SP); sub(current_header, current_header, R1_SP);
load_const_optimized(temp, (address) (~(os::vm_page_size()-1) | load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
markOopDesc::lock_mask_in_place));
and_(R0/*==0?*/, current_header, temp); and_(R0/*==0?*/, current_header, temp);
// If condition is true we are cont and hence we can store 0 as the // If condition is true we are cont and hence we can store 0 as the
@ -2437,8 +2630,6 @@ void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register
// Try to CAS m->owner from NULL to current thread. // Try to CAS m->owner from NULL to current thread.
addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value); addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
li(displaced_header, 0);
// CmpxchgX sets flag to cmpX(current, displaced).
cmpxchgd(/*flag=*/flag, cmpxchgd(/*flag=*/flag,
/*current_value=*/current_header, /*current_value=*/current_header,
/*compare_value=*/(intptr_t)0, /*compare_value=*/(intptr_t)0,
@ -2924,31 +3115,12 @@ void MacroAssembler::load_klass(Register dst, Register src) {
} }
} }
void MacroAssembler::load_klass_with_trap_null_check(Register dst, Register src) {
if (!os::zero_page_read_protected()) {
if (TrapBasedNullChecks) {
trap_null_check(src);
}
}
load_klass(dst, src);
}
void MacroAssembler::reinit_heapbase(Register d, Register tmp) {
if (Universe::heap() != NULL) {
load_const_optimized(R30, Universe::narrow_ptrs_base(), tmp);
} else {
// Heap not yet allocated. Load indirectly.
int simm16_offset = load_const_optimized(R30, Universe::narrow_ptrs_base_addr(), tmp, true);
ld(R30, simm16_offset, R30);
}
}
// Clear Array // Clear Array
// Kills both input registers. tmp == R0 is allowed. // Kills both input registers. tmp == R0 is allowed.
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) { void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
// Procedure for large arrays (uses data cache block zero instruction). // Procedure for large arrays (uses data cache block zero instruction).
Label startloop, fast, fastloop, small_rest, restloop, done; Label startloop, fast, fastloop, small_rest, restloop, done;
const int cl_size = VM_Version::get_cache_line_size(), const int cl_size = VM_Version::L1_data_cache_line_size(),
cl_dwords = cl_size>>3, cl_dwords = cl_size>>3,
cl_dw_addr_bits = exact_log2(cl_dwords), cl_dw_addr_bits = exact_log2(cl_dwords),
dcbz_min = 1; // Min count of dcbz executions, needs to be >0. dcbz_min = 1; // Min count of dcbz executions, needs to be >0.
@ -4021,7 +4193,7 @@ void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
bind(L_check_1); bind(L_check_1);
addi(idx, idx, 0x2); addi(idx, idx, 0x2);
andi_(idx, idx, 0x1) ; andi_(idx, idx, 0x1);
addic_(idx, idx, -1); addic_(idx, idx, -1);
blt(CCR0, L_post_third_loop_done); blt(CCR0, L_post_third_loop_done);
@ -4251,17 +4423,42 @@ void MacroAssembler::verify_oop(Register oop, const char* msg) {
address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
const Register tmp = R11; // Will be preserved. const Register tmp = R11; // Will be preserved.
const int nbytes_save = 11*8; // Volatile gprs except R0. const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
save_volatile_gprs(R1_SP, -nbytes_save); // except R0 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
if (oop == tmp) mr(R4_ARG2, oop); mr_if_needed(R4_ARG2, oop);
save_LR_CR(tmp); // save in old frame
push_frame_reg_args(nbytes_save, tmp);
// load FunctionDescriptor** / entry_address *
load_const_optimized(tmp, fd, R0);
// load FunctionDescriptor* / entry_address
ld(tmp, 0, tmp);
load_const_optimized(R3_ARG1, (address)msg, R0);
// Call destination for its side effect.
call_c(tmp);
pop_frame();
restore_LR_CR(tmp);
restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}
void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
if (!VerifyOops) {
return;
}
address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
const Register tmp = R11; // Will be preserved.
const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
save_volatile_gprs(R1_SP, -nbytes_save); // except R0
ld(R4_ARG2, offs, base);
save_LR_CR(tmp); // save in old frame save_LR_CR(tmp); // save in old frame
push_frame_reg_args(nbytes_save, tmp); push_frame_reg_args(nbytes_save, tmp);
// load FunctionDescriptor** / entry_address * // load FunctionDescriptor** / entry_address *
load_const_optimized(tmp, fd, R0); load_const_optimized(tmp, fd, R0);
// load FunctionDescriptor* / entry_address // load FunctionDescriptor* / entry_address
ld(tmp, 0, tmp); ld(tmp, 0, tmp);
if (oop != tmp) mr_if_needed(R4_ARG2, oop);
load_const_optimized(R3_ARG1, (address)msg, R0); load_const_optimized(R3_ARG1, (address)msg, R0);
// Call destination for its side effect. // Call destination for its side effect.
call_c(tmp); call_c(tmp);



@ -119,11 +119,8 @@ class MacroAssembler: public Assembler {
// Emits an oop const to the constant pool, loads the constant, and // Emits an oop const to the constant pool, loads the constant, and
// sets a relocation info with address current_pc. // sets a relocation info with address current_pc.
void load_const_from_method_toc(Register dst, AddressLiteral& a, Register toc); // Returns true if successful.
void load_toc_from_toc(Register dst, AddressLiteral& a, Register toc) { bool load_const_from_method_toc(Register dst, AddressLiteral& a, Register toc, bool fixed_size = false);
assert(dst == R2_TOC, "base register must be TOC");
load_const_from_method_toc(dst, a, toc);
}
static bool is_load_const_from_method_toc_at(address a); static bool is_load_const_from_method_toc_at(address a);
static int get_offset_of_load_const_from_method_toc_at(address a); static int get_offset_of_load_const_from_method_toc_at(address a);
@ -174,6 +171,7 @@ class MacroAssembler: public Assembler {
// optimize: flag for telling the conditional far branch to optimize // optimize: flag for telling the conditional far branch to optimize
// itself when relocated. // itself when relocated.
void bc_far(int boint, int biint, Label& dest, int optimize); void bc_far(int boint, int biint, Label& dest, int optimize);
void bc_far_optimized(int boint, int biint, Label& dest); // 1 or 2 instructions
// Relocation of conditional far branches. // Relocation of conditional far branches.
static bool is_bc_far_at(address instruction_addr); static bool is_bc_far_at(address instruction_addr);
static address get_dest_of_bc_far_at(address instruction_addr); static address get_dest_of_bc_far_at(address instruction_addr);
@ -262,6 +260,7 @@ class MacroAssembler: public Assembler {
// some ABI-related functions // some ABI-related functions
void save_nonvolatile_gprs( Register dst_base, int offset); void save_nonvolatile_gprs( Register dst_base, int offset);
void restore_nonvolatile_gprs(Register src_base, int offset); void restore_nonvolatile_gprs(Register src_base, int offset);
enum { num_volatile_regs = 11 + 14 }; // GPR + FPR
void save_volatile_gprs( Register dst_base, int offset); void save_volatile_gprs( Register dst_base, int offset);
void restore_volatile_gprs(Register src_base, int offset); void restore_volatile_gprs(Register src_base, int offset);
void save_LR_CR( Register tmp); // tmp contains LR on return. void save_LR_CR( Register tmp); // tmp contains LR on return.
@ -461,8 +460,10 @@ class MacroAssembler: public Assembler {
Register super_klass, Register super_klass,
Register temp1_reg, Register temp1_reg,
Register temp2_reg, Register temp2_reg,
Label& L_success, Label* L_success,
Label& L_failure); Label* L_failure,
Label* L_slow_path = NULL, // default fall through
RegisterOrConstant super_check_offset = RegisterOrConstant(-1));
// The rest of the type check; must be wired to a corresponding fast path. // The rest of the type check; must be wired to a corresponding fast path.
// It does not repeat the fast path logic, so don't use it standalone. // It does not repeat the fast path logic, so don't use it standalone.
@ -507,6 +508,28 @@ class MacroAssembler: public Assembler {
// biased locking exit case failed. // biased locking exit case failed.
void biased_locking_exit(ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done); void biased_locking_exit(ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done);
// allocation (for C1)
void eden_allocate(
Register obj, // result: pointer to object after successful allocation
Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
int con_size_in_bytes, // object size in bytes if known at compile time
Register t1, // temp register
Register t2, // temp register
Label& slow_case // continuation point if fast allocation fails
);
void tlab_allocate(
Register obj, // result: pointer to object after successful allocation
Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
int con_size_in_bytes, // object size in bytes if known at compile time
Register t1, // temp register
Label& slow_case // continuation point if fast allocation fails
);
void tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case);
void incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2);
enum { trampoline_stub_size = 6 * 4 };
address emit_trampoline_stub(int destination_toc_offset, int insts_call_instruction_offset, Register Rtoc = noreg);
void atomic_inc_ptr(Register addr, Register result, int simm16 = 1); void atomic_inc_ptr(Register addr, Register result, int simm16 = 1);
void atomic_ori_int(Register addr, Register result, int uimm16); void atomic_ori_int(Register addr, Register result, int uimm16);
@ -597,9 +620,7 @@ class MacroAssembler: public Assembler {
// Implicit or explicit null check, jumps to static address exception_entry. // Implicit or explicit null check, jumps to static address exception_entry.
inline void null_check_throw(Register a, int offset, Register temp_reg, address exception_entry); inline void null_check_throw(Register a, int offset, Register temp_reg, address exception_entry);
inline void null_check(Register a, int offset, Label *Lis_null); // implicit only if Lis_null not provided
// Check accessed object for null. Use SIGTRAP-based null checks on AIX.
inline void load_with_trap_null_check(Register d, int si16, Register s1);
// Load heap oop and decompress. Loaded oop may not be null. // Load heap oop and decompress. Loaded oop may not be null.
// Specify tmp to save one cycle. // Specify tmp to save one cycle.
@ -619,20 +640,17 @@ class MacroAssembler: public Assembler {
inline Register decode_heap_oop_not_null(Register d, Register src = noreg); inline Register decode_heap_oop_not_null(Register d, Register src = noreg);
// Null allowed. // Null allowed.
inline Register encode_heap_oop(Register d, Register src); // Prefer null check in GC barrier!
inline void decode_heap_oop(Register d); inline void decode_heap_oop(Register d);
// Load/Store klass oop from klass field. Compress. // Load/Store klass oop from klass field. Compress.
void load_klass(Register dst, Register src); void load_klass(Register dst, Register src);
void load_klass_with_trap_null_check(Register dst, Register src);
void store_klass(Register dst_oop, Register klass, Register tmp = R0); void store_klass(Register dst_oop, Register klass, Register tmp = R0);
void store_klass_gap(Register dst_oop, Register val = noreg); // Will store 0 if val not specified. void store_klass_gap(Register dst_oop, Register val = noreg); // Will store 0 if val not specified.
static int instr_size_for_decode_klass_not_null(); static int instr_size_for_decode_klass_not_null();
void decode_klass_not_null(Register dst, Register src = noreg); void decode_klass_not_null(Register dst, Register src = noreg);
Register encode_klass_not_null(Register dst, Register src = noreg); Register encode_klass_not_null(Register dst, Register src = noreg);
// Load common heap base into register.
void reinit_heapbase(Register d, Register tmp = noreg);
// SIGTRAP-based range checks for arrays. // SIGTRAP-based range checks for arrays.
inline void trap_range_check_l(Register a, Register b); inline void trap_range_check_l(Register a, Register b);
inline void trap_range_check_l(Register a, int si16); inline void trap_range_check_l(Register a, int si16);
@ -750,6 +768,7 @@ class MacroAssembler: public Assembler {
// Emit code to verify that reg contains a valid oop if +VerifyOops is set. // Emit code to verify that reg contains a valid oop if +VerifyOops is set.
void verify_oop(Register reg, const char* s = "broken oop"); void verify_oop(Register reg, const char* s = "broken oop");
void verify_oop_addr(RegisterOrConstant offs, Register base, const char* s = "contains broken oop");
// TODO: verify method and klass metadata (compare against vptr?) // TODO: verify method and klass metadata (compare against vptr?)
void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {} void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {}



@ -70,9 +70,11 @@ inline void MacroAssembler::endgroup_if_needed(bool needed) {
} }
inline void MacroAssembler::membar(int bits) { inline void MacroAssembler::membar(int bits) {
// TODO: use elemental_membar(bits) for Power 8 and disable optimization of acquire-release // Comment: Usage of elemental_membar(bits) is not recommended for Power 8.
// (Matcher::post_membar_release where we use PPC64_ONLY(xop == Op_MemBarRelease ||)) // If elemental_membar(bits) is used, disable optimization of acquire-release
if (bits & StoreLoad) sync(); else lwsync(); // (Matcher::post_membar_release where we use PPC64_ONLY(xop == Op_MemBarRelease ||))!
if (bits & StoreLoad) { sync(); }
else if (bits) { lwsync(); }
} }
inline void MacroAssembler::release() { membar(LoadStore | StoreStore); } inline void MacroAssembler::release() { membar(LoadStore | StoreStore); }
inline void MacroAssembler::acquire() { membar(LoadLoad | LoadStore); } inline void MacroAssembler::acquire() { membar(LoadLoad | LoadStore); }
@ -86,7 +88,7 @@ inline address MacroAssembler::global_toc() {
// Offset of given address to the global TOC. // Offset of given address to the global TOC.
inline int MacroAssembler::offset_to_global_toc(const address addr) { inline int MacroAssembler::offset_to_global_toc(const address addr) {
intptr_t offset = (intptr_t)addr - (intptr_t)MacroAssembler::global_toc(); intptr_t offset = (intptr_t)addr - (intptr_t)MacroAssembler::global_toc();
assert(Assembler::is_simm((long)offset, 31) && offset >= 0, "must be in range"); assert(Assembler::is_uimm((long)offset, 31), "must be in range");
return (int)offset; return (int)offset;
} }
@ -98,7 +100,7 @@ inline address MacroAssembler::method_toc() {
// Offset of given address to current method's TOC. // Offset of given address to current method's TOC.
inline int MacroAssembler::offset_to_method_toc(address addr) { inline int MacroAssembler::offset_to_method_toc(address addr) {
intptr_t offset = (intptr_t)addr - (intptr_t)method_toc(); intptr_t offset = (intptr_t)addr - (intptr_t)method_toc();
assert(is_simm((long)offset, 31) && offset >= 0, "must be in range"); assert(Assembler::is_uimm((long)offset, 31), "must be in range");
return (int)offset; return (int)offset;
} }
@ -190,13 +192,13 @@ inline bool MacroAssembler::is_bc_far_variant1_at(address instruction_addr) {
// Variant 1, the 1st instruction contains the destination address: // Variant 1, the 1st instruction contains the destination address:
// //
// bcxx DEST // bcxx DEST
// endgroup // nop
// //
const int instruction_1 = *(int*)(instruction_addr); const int instruction_1 = *(int*)(instruction_addr);
const int instruction_2 = *(int*)(instruction_addr + 4); const int instruction_2 = *(int*)(instruction_addr + 4);
return is_bcxx(instruction_1) && return is_bcxx(instruction_1) &&
(inv_bd_field(instruction_1, (intptr_t)instruction_addr) != (intptr_t)(instruction_addr + 2*4)) && (inv_bd_field(instruction_1, (intptr_t)instruction_addr) != (intptr_t)(instruction_addr + 2*4)) &&
is_endgroup(instruction_2); is_nop(instruction_2);
} }
// Relocation of conditional far branches. // Relocation of conditional far branches.
@ -302,13 +304,17 @@ inline void MacroAssembler::null_check_throw(Register a, int offset, Register te
} }
} }
inline void MacroAssembler::load_with_trap_null_check(Register d, int si16, Register s1) { inline void MacroAssembler::null_check(Register a, int offset, Label *Lis_null) {
if (!os::zero_page_read_protected()) { if (!ImplicitNullChecks || needs_explicit_null_check(offset) || !os::zero_page_read_protected()) {
if (TrapBasedNullChecks) { if (TrapBasedNullChecks) {
trap_null_check(s1); assert(UseSIGTRAP, "sanity");
trap_null_check(a);
} else if (Lis_null){
Label ok;
cmpdi(CCR0, a, 0);
beq(CCR0, *Lis_null);
} }
} }
ld(d, si16, s1);
} }
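The new null_check() chooses between three strategies; roughly, in standalone C++ (the predicates mirror ImplicitNullChecks, needs_explicit_null_check(offset) and os::zero_page_read_protected(), but are plain parameters here):

enum NullCheckStyle { NC_IMPLICIT, NC_TRAP, NC_EXPLICIT_BRANCH, NC_NONE };

// Sketch of the decision only; the real code emits instructions instead.
NullCheckStyle choose_null_check(bool implicit_checks_enabled,
                                 bool offset_needs_explicit_check,
                                 bool zero_page_protected,
                                 bool trap_based_checks,
                                 bool have_is_null_label) {
  if (implicit_checks_enabled && !offset_needs_explicit_check && zero_page_protected)
    return NC_IMPLICIT;                        // let the access itself fault on NULL
  if (trap_based_checks)  return NC_TRAP;            // trap_null_check(a)
  if (have_is_null_label) return NC_EXPLICIT_BRANCH; // cmpdi CCR0,a,0 ; beq Lis_null
  return NC_NONE;
}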
inline void MacroAssembler::load_heap_oop_not_null(Register d, RegisterOrConstant offs, Register s1, Register tmp) { inline void MacroAssembler::load_heap_oop_not_null(Register d, RegisterOrConstant offs, Register s1, Register tmp) {
@ -365,6 +371,26 @@ inline Register MacroAssembler::encode_heap_oop_not_null(Register d, Register sr
return current; // Encoded oop is in this register. return current; // Encoded oop is in this register.
} }
inline Register MacroAssembler::encode_heap_oop(Register d, Register src) {
if (Universe::narrow_oop_base() != NULL) {
if (VM_Version::has_isel()) {
cmpdi(CCR0, src, 0);
Register co = encode_heap_oop_not_null(d, src);
assert(co == d, "sanity");
isel_0(d, CCR0, Assembler::equal);
} else {
Label isNull;
or_(d, src, src); // move and compare 0
beq(CCR0, isNull);
encode_heap_oop_not_null(d, src);
bind(isNull);
}
return d;
} else {
return encode_heap_oop_not_null(d, src);
}
}
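Behaviourally, encode_heap_oop() computes the usual compressed-oop encoding while preserving NULL, either with isel or with a branch; a scalar sketch, where base and shift stand for Universe::narrow_oop_base()/narrow_oop_shift():

#include <cstdint>

// NULL stays NULL; everything else becomes (oop - base) >> shift.
inline uint32_t encode_heap_oop_sketch(uintptr_t oop, uintptr_t base, int shift) {
  if (oop == 0) return 0;                    // isel_0 / branch-around path
  return (uint32_t)((oop - base) >> shift);  // encode_heap_oop_not_null path
}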
inline Register MacroAssembler::decode_heap_oop_not_null(Register d, Register src) { inline Register MacroAssembler::decode_heap_oop_not_null(Register d, Register src) {
if (Universe::narrow_oop_base_disjoint() && src != noreg && src != d && if (Universe::narrow_oop_base_disjoint() && src != noreg && src != d &&
Universe::narrow_oop_shift() != 0) { Universe::narrow_oop_shift() != 0) {


@ -1,6 +1,6 @@
/* /*
* Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 SAP SE. All rights reserved. * Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -502,8 +502,7 @@ void trace_method_handle_stub(const char* adaptername,
frame cur_frame = os::current_frame(); frame cur_frame = os::current_frame();
// Robust search of trace_calling_frame (independant of inlining). // Robust search of trace_calling_frame (independant of inlining).
// Assumes saved_regs comes from a pusha in the trace_calling_frame. assert(cur_frame.sp() <= saved_regs, "registers not saved on stack ?");
assert(cur_frame.sp() < saved_regs, "registers not saved on stack ?");
frame trace_calling_frame = os::get_sender_for_C_frame(&cur_frame); frame trace_calling_frame = os::get_sender_for_C_frame(&cur_frame);
while (trace_calling_frame.fp() < saved_regs) { while (trace_calling_frame.fp() < saved_regs) {
trace_calling_frame = os::get_sender_for_C_frame(&trace_calling_frame); trace_calling_frame = os::get_sender_for_C_frame(&trace_calling_frame);
@ -537,7 +536,7 @@ void MethodHandles::trace_method_handle(MacroAssembler* _masm, const char* adapt
BLOCK_COMMENT("trace_method_handle {"); BLOCK_COMMENT("trace_method_handle {");
const Register tmp = R11; // Will be preserved. const Register tmp = R11; // Will be preserved.
const int nbytes_save = 11*8; // volatile gprs except R0 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
__ save_volatile_gprs(R1_SP, -nbytes_save); // except R0 __ save_volatile_gprs(R1_SP, -nbytes_save); // except R0
__ save_LR_CR(tmp); // save in old frame __ save_LR_CR(tmp); // save in old frame


@ -1,6 +1,6 @@
/* /*
* Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2014 SAP AG. All rights reserved. * Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -65,6 +65,9 @@ address NativeCall::destination() const {
address destination = Assembler::bxx_destination(addr); address destination = Assembler::bxx_destination(addr);
// Do we use a trampoline stub for this call? // Do we use a trampoline stub for this call?
// Trampoline stubs are located behind the main code.
if (destination > addr) {
// Filter out recursive method invocation (call to verified/unverified entry point).
CodeBlob* cb = CodeCache::find_blob_unsafe(addr); // Else we get assertion if nmethod is zombie. CodeBlob* cb = CodeCache::find_blob_unsafe(addr); // Else we get assertion if nmethod is zombie.
assert(cb && cb->is_nmethod(), "sanity"); assert(cb && cb->is_nmethod(), "sanity");
nmethod *nm = (nmethod *)cb; nmethod *nm = (nmethod *)cb;
@ -73,6 +76,7 @@ address NativeCall::destination() const {
const address trampoline_stub_addr = destination; const address trampoline_stub_addr = destination;
destination = NativeCallTrampolineStub_at(trampoline_stub_addr)->destination(nm); destination = NativeCallTrampolineStub_at(trampoline_stub_addr)->destination(nm);
} }
}
return destination; return destination;
} }
@ -267,7 +271,7 @@ void NativeMovConstReg::set_data(intptr_t data) {
oop_addr = r->oop_addr(); oop_addr = r->oop_addr();
*oop_addr = cast_to_oop(data); *oop_addr = cast_to_oop(data);
} else { } else {
assert(oop_addr == r->oop_addr(), "must be only one set-oop here") ; assert(oop_addr == r->oop_addr(), "must be only one set-oop here");
} }
} }
if (iter.type() == relocInfo::metadata_type) { if (iter.type() == relocInfo::metadata_type) {
@ -351,6 +355,27 @@ void NativeJump::verify() {
} }
#endif // ASSERT #endif // ASSERT
void NativeGeneralJump::insert_unconditional(address code_pos, address entry) {
CodeBuffer cb(code_pos, BytesPerInstWord + 1);
MacroAssembler* a = new MacroAssembler(&cb);
a->b(entry);
ICache::ppc64_flush_icache_bytes(code_pos, NativeGeneralJump::instruction_size);
}
// MT-safe patching of a jmp instruction.
void NativeGeneralJump::replace_mt_safe(address instr_addr, address code_buffer) {
// Bytes beyond offset NativeGeneralJump::instruction_size are copied by caller.
// Finally patch out the jump.
volatile juint *jump_addr = (volatile juint*)instr_addr;
// Release not needed because caller uses invalidate_range after copying the remaining bytes.
//OrderAccess::release_store(jump_addr, *((juint*)code_buffer));
*jump_addr = *((juint*)code_buffer); // atomically store code over branch instruction
ICache::ppc64_flush_icache_bytes(instr_addr, NativeGeneralJump::instruction_size);
}
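The MT-safety argument above rests on the patch being a single aligned 4-byte store followed by an icache flush; a compact restatement (the flush call is left as a comment because ICache is HotSpot-internal):

#include <cstdint>

// One aligned 32-bit store is atomic on PPC64, so concurrent executors see
// either the old branch or the new one, never a torn instruction.
inline void patch_branch_mt_safe(volatile uint32_t* instr_addr, uint32_t new_branch) {
  *instr_addr = new_branch;
  // followed by ICache::ppc64_flush_icache_bytes(instr_addr, 4) in the real code
}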
//------------------------------------------------------------------- //-------------------------------------------------------------------
// Call trampoline stubs. // Call trampoline stubs.
@ -364,10 +389,12 @@ void NativeJump::verify() {
// //
address NativeCallTrampolineStub::encoded_destination_addr() const { address NativeCallTrampolineStub::encoded_destination_addr() const {
address instruction_addr = addr_at(2 * BytesPerInstWord); address instruction_addr = addr_at(0 * BytesPerInstWord);
if (!MacroAssembler::is_ld_largeoffset(instruction_addr)) {
instruction_addr = addr_at(2 * BytesPerInstWord);
assert(MacroAssembler::is_ld_largeoffset(instruction_addr), assert(MacroAssembler::is_ld_largeoffset(instruction_addr),
"must be a ld with large offset (from the constant pool)"); "must be a ld with large offset (from the constant pool)");
}
return instruction_addr; return instruction_addr;
} }


@ -1,6 +1,6 @@
/* /*
* Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2013 SAP AG. All rights reserved. * Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -50,6 +50,8 @@ class NativeInstruction VALUE_OBJ_CLASS_SPEC {
friend class Relocation; friend class Relocation;
public: public:
bool is_jump() { return Assembler::is_b(long_at(0)); } // See NativeGeneralJump.
bool is_sigtrap_ic_miss_check() { bool is_sigtrap_ic_miss_check() {
assert(UseSIGTRAP, "precondition"); assert(UseSIGTRAP, "precondition");
return MacroAssembler::is_trap_ic_miss_check(long_at(0)); return MacroAssembler::is_trap_ic_miss_check(long_at(0));
@ -235,8 +237,8 @@ inline NativeFarCall* nativeFarCall_at(address instr) {
return call; return call;
} }
// An interface for accessing/manipulating native set_oop imm, reg instructions. // An interface for accessing/manipulating native set_oop imm, reg instructions
// (used to manipulate inlined data references, etc.) // (used to manipulate inlined data references, etc.).
class NativeMovConstReg: public NativeInstruction { class NativeMovConstReg: public NativeInstruction {
public: public:
@ -384,10 +386,21 @@ class NativeCallTrampolineStub : public NativeInstruction {
void set_destination(address new_destination); void set_destination(address new_destination);
}; };
// Note: Other stubs must not begin with this pattern.
inline bool is_NativeCallTrampolineStub_at(address address) { inline bool is_NativeCallTrampolineStub_at(address address) {
int first_instr = *(int*)address; int first_instr = *(int*)address;
return Assembler::is_addis(first_instr) && // calculate_address_from_global_toc and long form of ld_largeoffset_unchecked begin with addis with target R12
(Register)(intptr_t)Assembler::inv_rt_field(first_instr) == R12_scratch2; if (Assembler::is_addis(first_instr) &&
(Register)(intptr_t)Assembler::inv_rt_field(first_instr) == R12_scratch2) return true;
// short form of ld_largeoffset_unchecked is ld which is followed by mtctr
int second_instr = *((int*)address + 1);
if (Assembler::is_ld(first_instr) &&
(Register)(intptr_t)Assembler::inv_rt_field(first_instr) == R12_scratch2 &&
Assembler::is_mtctr(second_instr) &&
(Register)(intptr_t)Assembler::inv_rs_field(second_instr) == R12_scratch2) return true;
return false;
} }
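The recognizer now accepts two entry patterns for a trampoline stub; the shape of the check, with placeholder decoders standing in for the Assembler::is_*/inv_*_field helpers:

// Placeholder decode of one instruction word.
struct DecodedInsn { bool is_addis, is_ld, is_mtctr; int target_reg; };

// R12 is the scratch register reserved for call trampolines.
bool looks_like_trampoline(const DecodedInsn& first, const DecodedInsn& second,
                           int r12) {
  if (first.is_addis && first.target_reg == r12) return true;   // long form (addis ...)
  if (first.is_ld && first.target_reg == r12 &&
      second.is_mtctr && second.target_reg == r12) return true; // short form (ld ; mtctr)
  return false;
}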
inline NativeCallTrampolineStub* NativeCallTrampolineStub_at(address address) { inline NativeCallTrampolineStub* NativeCallTrampolineStub_at(address address) {
@ -395,4 +408,102 @@ inline NativeCallTrampolineStub* NativeCallTrampolineStub_at(address address) {
return (NativeCallTrampolineStub*)address; return (NativeCallTrampolineStub*)address;
} }
///////////////////////////////////////////////////////////////////////////////////////////////////
//-------------------------------------
// N a t i v e G e n e r a l J u m p
//-------------------------------------
// Despite the name, handles only simple branches.
class NativeGeneralJump;
inline NativeGeneralJump* nativeGeneralJump_at(address address);
// Currently only implemented as single unconditional branch.
class NativeGeneralJump: public NativeInstruction {
public:
enum PPC64_specific_constants {
instruction_size = 4
};
address instruction_address() const { return addr_at(0); }
// Creation.
friend inline NativeGeneralJump* nativeGeneralJump_at(address addr) {
NativeGeneralJump* jump = (NativeGeneralJump*)(addr);
DEBUG_ONLY( jump->verify(); )
return jump;
}
// Insertion of native general jump instruction.
static void insert_unconditional(address code_pos, address entry);
address jump_destination() const {
DEBUG_ONLY( verify(); )
return addr_at(0) + Assembler::inv_li_field(long_at(0));
}
void set_jump_destination(address dest) {
DEBUG_ONLY( verify(); )
insert_unconditional(addr_at(0), dest);
}
static void replace_mt_safe(address instr_addr, address code_buffer);
void verify() const { guarantee(Assembler::is_b(long_at(0)), "invalid NativeGeneralJump"); }
};
// An interface for accessing/manipulating native load int (load_const32).
class NativeMovRegMem;
inline NativeMovRegMem* nativeMovRegMem_at(address address);
class NativeMovRegMem: public NativeInstruction {
public:
enum PPC64_specific_constants {
instruction_size = 8
};
address instruction_address() const { return addr_at(0); }
intptr_t offset() const {
#ifdef VM_LITTLE_ENDIAN
short *hi_ptr = (short*)(addr_at(0));
short *lo_ptr = (short*)(addr_at(4));
#else
short *hi_ptr = (short*)(addr_at(0) + 2);
short *lo_ptr = (short*)(addr_at(4) + 2);
#endif
return ((*hi_ptr) << 16) | ((*lo_ptr) & 0xFFFF);
}
void set_offset(intptr_t x) {
#ifdef VM_LITTLE_ENDIAN
short *hi_ptr = (short*)(addr_at(0));
short *lo_ptr = (short*)(addr_at(4));
#else
short *hi_ptr = (short*)(addr_at(0) + 2);
short *lo_ptr = (short*)(addr_at(4) + 2);
#endif
*hi_ptr = x >> 16;
*lo_ptr = x & 0xFFFF;
ICache::ppc64_flush_icache_bytes(addr_at(0), NativeMovRegMem::instruction_size);
}
void add_offset_in_bytes(intptr_t radd_offset) {
set_offset(offset() + radd_offset);
}
void verify() const {
guarantee(Assembler::is_lis(long_at(0)), "load_const32 1st instr");
guarantee(Assembler::is_ori(long_at(4)), "load_const32 2nd instr");
}
private:
friend inline NativeMovRegMem* nativeMovRegMem_at(address address) {
NativeMovRegMem* test = (NativeMovRegMem*)address;
DEBUG_ONLY( test->verify(); )
return test;
}
};
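The offset()/set_offset() pair above just splits a 32-bit immediate across the lis/ori halfwords; as arithmetic (standalone sketch, endianness handling omitted):

#include <cstdint>

// High halfword goes into the lis immediate, low halfword into the ori immediate.
inline void split_const32(int32_t x, int16_t& hi, uint16_t& lo) {
  hi = (int16_t)(x >> 16);
  lo = (uint16_t)(x & 0xFFFF);
}

// Same value NativeMovRegMem::offset() reads back: sign-extend the high half,
// mask in the low half.
inline int32_t join_const32(int16_t hi, uint16_t lo) {
  return (int32_t)(((uint32_t)(int32_t)hi << 16) | lo);
}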
#endif // CPU_PPC_VM_NATIVEINST_PPC_HPP #endif // CPU_PPC_VM_NATIVEINST_PPC_HPP


@ -698,7 +698,7 @@ reg_class ctr_reg(SR_CTR);
// ---------------------------- // ----------------------------
reg_class flt_reg( reg_class flt_reg(
/*F0*/ // scratch F0,
F1, F1,
F2, F2,
F3, F3,
@ -735,7 +735,7 @@ reg_class flt_reg(
// Double precision float registers have virtual `high halves' that // Double precision float registers have virtual `high halves' that
// are needed by the allocator. // are needed by the allocator.
reg_class dbl_reg( reg_class dbl_reg(
/*F0, F0_H*/ // scratch F0, F0_H,
F1, F1_H, F1, F1_H,
F2, F2_H, F2, F2_H,
F3, F3_H, F3, F3_H,
@ -1040,8 +1040,6 @@ source_hpp %{ // Header information of the source block.
//---< Used for optimization in Compile::Shorten_branches >--- //---< Used for optimization in Compile::Shorten_branches >---
//-------------------------------------------------------------- //--------------------------------------------------------------
const uint trampoline_stub_size = 6 * BytesPerInstWord;
class CallStubImpl { class CallStubImpl {
public: public:
@ -1053,7 +1051,7 @@ class CallStubImpl {
// This doesn't need to be accurate to the byte, but it // This doesn't need to be accurate to the byte, but it
// must be larger than or equal to the real size of the stub. // must be larger than or equal to the real size of the stub.
static uint size_call_trampoline() { static uint size_call_trampoline() {
return trampoline_stub_size; return MacroAssembler::trampoline_stub_size;
} }
// number of relocations needed by a call trampoline stub // number of relocations needed by a call trampoline stub
@ -1079,46 +1077,10 @@ source %{
// branch via CTR (LR/link still points to the call-site above) // branch via CTR (LR/link still points to the call-site above)
void CallStubImpl::emit_trampoline_stub(MacroAssembler &_masm, int destination_toc_offset, int insts_call_instruction_offset) { void CallStubImpl::emit_trampoline_stub(MacroAssembler &_masm, int destination_toc_offset, int insts_call_instruction_offset) {
// Start the stub. address stub = __ emit_trampoline_stub(destination_toc_offset, insts_call_instruction_offset);
address stub = __ start_a_stub(Compile::MAX_stubs_size/2);
if (stub == NULL) { if (stub == NULL) {
ciEnv::current()->record_failure("CodeCache is full"); ciEnv::current()->record_out_of_memory_failure();
return;
} }
// For java_to_interp stubs we use R11_scratch1 as scratch register
// and in call trampoline stubs we use R12_scratch2. This way we
// can distinguish them (see is_NativeCallTrampolineStub_at()).
Register reg_scratch = R12_scratch2;
// Create a trampoline stub relocation which relates this trampoline stub
// with the call instruction at insts_call_instruction_offset in the
// instructions code-section.
__ relocate(trampoline_stub_Relocation::spec(__ code()->insts()->start() + insts_call_instruction_offset));
const int stub_start_offset = __ offset();
// Now, create the trampoline stub's code:
// - load the TOC
// - load the call target from the constant pool
// - call
__ calculate_address_from_global_toc(reg_scratch, __ method_toc());
__ ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, reg_scratch, false);
__ mtctr(reg_scratch);
__ bctr();
const address stub_start_addr = __ addr_at(stub_start_offset);
// FIXME: Assert that the trampoline stub can be identified and patched.
// Assert that the encoded destination_toc_offset can be identified and that it is correct.
assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
"encoded offset into the constant pool must match");
// Trampoline_stub_size should be good.
assert((uint)(__ offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
// End the stub.
__ end_a_stub();
} }
//============================================================================= //=============================================================================
@ -1156,6 +1118,10 @@ EmitCallOffsets emit_call_with_trampoline_stub(MacroAssembler &_masm, address en
if (!Compile::current()->in_scratch_emit_size()) { if (!Compile::current()->in_scratch_emit_size()) {
// Put the entry point as a constant into the constant pool. // Put the entry point as a constant into the constant pool.
const address entry_point_toc_addr = __ address_constant(entry_point, RelocationHolder::none); const address entry_point_toc_addr = __ address_constant(entry_point, RelocationHolder::none);
if (entry_point_toc_addr == NULL) {
ciEnv::current()->record_out_of_memory_failure();
return offsets;
}
const int entry_point_toc_offset = __ offset_to_method_toc(entry_point_toc_addr); const int entry_point_toc_offset = __ offset_to_method_toc(entry_point_toc_addr);
// Emit the trampoline stub which will be related to the branch-and-link below. // Emit the trampoline stub which will be related to the branch-and-link below.
@ -2474,6 +2440,10 @@ encode %{
// Create a non-oop constant, no relocation needed. // Create a non-oop constant, no relocation needed.
// If it is an IC, it has a virtual_call_Relocation. // If it is an IC, it has a virtual_call_Relocation.
const_toc_addr = __ long_constant((jlong)$src$$constant); const_toc_addr = __ long_constant((jlong)$src$$constant);
if (const_toc_addr == NULL) {
ciEnv::current()->record_out_of_memory_failure();
return;
}
// Get the constant's TOC offset. // Get the constant's TOC offset.
toc_offset = __ offset_to_method_toc(const_toc_addr); toc_offset = __ offset_to_method_toc(const_toc_addr);
@ -2495,6 +2465,10 @@ encode %{
// Create a non-oop constant, no relocation needed. // Create a non-oop constant, no relocation needed.
// If it is an IC, it has a virtual_call_Relocation. // If it is an IC, it has a virtual_call_Relocation.
const_toc_addr = __ long_constant((jlong)$src$$constant); const_toc_addr = __ long_constant((jlong)$src$$constant);
if (const_toc_addr == NULL) {
ciEnv::current()->record_out_of_memory_failure();
return;
}
// Get the constant's TOC offset. // Get the constant's TOC offset.
const int toc_offset = __ offset_to_method_toc(const_toc_addr); const int toc_offset = __ offset_to_method_toc(const_toc_addr);
@ -2631,6 +2605,10 @@ encode %{
const_toc_addr = __ long_constant((jlong)$src$$constant); const_toc_addr = __ long_constant((jlong)$src$$constant);
} }
if (const_toc_addr == NULL) {
ciEnv::current()->record_out_of_memory_failure();
return;
}
// Get the constant's TOC offset. // Get the constant's TOC offset.
toc_offset = __ offset_to_method_toc(const_toc_addr); toc_offset = __ offset_to_method_toc(const_toc_addr);
} }
@ -2660,6 +2638,10 @@ encode %{
const_toc_addr = __ long_constant((jlong)$src$$constant); const_toc_addr = __ long_constant((jlong)$src$$constant);
} }
if (const_toc_addr == NULL) {
ciEnv::current()->record_out_of_memory_failure();
return;
}
// Get the constant's TOC offset. // Get the constant's TOC offset.
const int toc_offset = __ offset_to_method_toc(const_toc_addr); const int toc_offset = __ offset_to_method_toc(const_toc_addr);
// Store the toc offset of the constant. // Store the toc offset of the constant.
@ -3408,13 +3390,19 @@ encode %{
// Put the entry point as a constant into the constant pool. // Put the entry point as a constant into the constant pool.
const address entry_point_toc_addr = __ address_constant(entry_point, RelocationHolder::none); const address entry_point_toc_addr = __ address_constant(entry_point, RelocationHolder::none);
if (entry_point_toc_addr == NULL) {
ciEnv::current()->record_out_of_memory_failure();
return;
}
const int entry_point_toc_offset = __ offset_to_method_toc(entry_point_toc_addr); const int entry_point_toc_offset = __ offset_to_method_toc(entry_point_toc_addr);
// Emit the trampoline stub which will be related to the branch-and-link below. // Emit the trampoline stub which will be related to the branch-and-link below.
CallStubImpl::emit_trampoline_stub(_masm, entry_point_toc_offset, start_offset); CallStubImpl::emit_trampoline_stub(_masm, entry_point_toc_offset, start_offset);
if (ciEnv::current()->failing()) { return; } // Code cache may be full. if (ciEnv::current()->failing()) { return; } // Code cache may be full.
__ relocate(_optimized_virtual ? int method_index = resolved_method_index(cbuf);
relocInfo::opt_virtual_call_type : relocInfo::static_call_type); __ relocate(_optimized_virtual ? opt_virtual_call_Relocation::spec(method_index)
: static_call_Relocation::spec(method_index));
} }
// The real call. // The real call.
@ -3433,76 +3421,6 @@ encode %{
} }
%} %}
// Emit a method handle call.
//
// Method handle calls from compiled to compiled are going thru a
// c2i -> i2c adapter, extending the frame for their arguments. The
// caller however, returns directly to the compiled callee, that has
// to cope with the extended frame. We restore the original frame by
// loading the callers sp and adding the calculated framesize.
enc_class enc_java_handle_call(method meth) %{
// TODO: PPC port $archOpcode(ppc64Opcode_compound);
MacroAssembler _masm(&cbuf);
address entry_point = (address)$meth$$method;
// Remember the offset not the address.
const int start_offset = __ offset();
// The trampoline stub.
if (!ra_->C->in_scratch_emit_size()) {
// No entry point given, use the current pc.
// Make sure branch fits into
if (entry_point == 0) entry_point = __ pc();
// Put the entry point as a constant into the constant pool.
const address entry_point_toc_addr = __ address_constant(entry_point, RelocationHolder::none);
const int entry_point_toc_offset = __ offset_to_method_toc(entry_point_toc_addr);
// Emit the trampoline stub which will be related to the branch-and-link below.
CallStubImpl::emit_trampoline_stub(_masm, entry_point_toc_offset, start_offset);
if (ra_->C->env()->failing()) { return; } // Code cache may be full.
assert(_optimized_virtual, "methodHandle call should be a virtual call");
__ relocate(relocInfo::opt_virtual_call_type);
}
// The real call.
// Note: At this point we do not have the address of the trampoline
// stub, and the entry point might be too far away for bl, so __ pc()
// serves as dummy and the bl will be patched later.
cbuf.set_insts_mark();
__ bl(__ pc()); // Emits a relocation.
assert(_method, "execute next statement conditionally");
// The stub for call to interpreter.
address stub = CompiledStaticCall::emit_to_interp_stub(cbuf);
if (stub == NULL) {
ciEnv::current()->record_failure("CodeCache is full");
return;
}
// Restore original sp.
__ ld(R11_scratch1, 0, R1_SP); // Load caller sp.
const long framesize = ra_->C->frame_slots() << LogBytesPerInt;
unsigned int bytes = (unsigned int)framesize;
long offset = Assembler::align_addr(bytes, frame::alignment_in_bytes);
if (Assembler::is_simm(-offset, 16)) {
__ addi(R1_SP, R11_scratch1, -offset);
} else {
__ load_const_optimized(R12_scratch2, -offset);
__ add(R1_SP, R11_scratch1, R12_scratch2);
}
#ifdef ASSERT
__ ld(R12_scratch2, 0, R1_SP); // Load from unextended_sp.
__ cmpd(CCR0, R11_scratch1, R12_scratch2);
__ asm_assert_eq("backlink changed", 0x8000);
#endif
// If fails should store backlink before unextending.
if (ra_->C->env()->failing()) {
return;
}
%}
// Second node of expanded dynamic call - the call. // Second node of expanded dynamic call - the call.
enc_class enc_java_dynamic_call_sched(method meth) %{ enc_class enc_java_dynamic_call_sched(method meth) %{
// TODO: PPC port $archOpcode(ppc64Opcode_bl); // TODO: PPC port $archOpcode(ppc64Opcode_bl);
@ -3513,6 +3431,10 @@ encode %{
// Create a call trampoline stub for the given method. // Create a call trampoline stub for the given method.
const address entry_point = !($meth$$method) ? 0 : (address)$meth$$method; const address entry_point = !($meth$$method) ? 0 : (address)$meth$$method;
const address entry_point_const = __ address_constant(entry_point, RelocationHolder::none); const address entry_point_const = __ address_constant(entry_point, RelocationHolder::none);
if (entry_point_const == NULL) {
ciEnv::current()->record_out_of_memory_failure();
return;
}
const int entry_point_const_toc_offset = __ offset_to_method_toc(entry_point_const); const int entry_point_const_toc_offset = __ offset_to_method_toc(entry_point_const);
CallStubImpl::emit_trampoline_stub(_masm, entry_point_const_toc_offset, __ offset()); CallStubImpl::emit_trampoline_stub(_masm, entry_point_const_toc_offset, __ offset());
if (ra_->C->env()->failing()) { return; } // Code cache may be full. if (ra_->C->env()->failing()) { return; } // Code cache may be full.
@ -3530,8 +3452,8 @@ encode %{
const address virtual_call_oop_addr = __ addr_at(virtual_call_oop_addr_offset); const address virtual_call_oop_addr = __ addr_at(virtual_call_oop_addr_offset);
assert(MacroAssembler::is_load_const_from_method_toc_at(virtual_call_oop_addr), assert(MacroAssembler::is_load_const_from_method_toc_at(virtual_call_oop_addr),
"should be load from TOC"); "should be load from TOC");
int method_index = resolved_method_index(cbuf);
__ relocate(virtual_call_Relocation::spec(virtual_call_oop_addr)); __ relocate(virtual_call_Relocation::spec(virtual_call_oop_addr, method_index));
} }
// At this point I do not have the address of the trampoline stub, // At this point I do not have the address of the trampoline stub,
@ -3564,6 +3486,7 @@ encode %{
call->_jvmadj = _jvmadj; call->_jvmadj = _jvmadj;
call->_in_rms = _in_rms; call->_in_rms = _in_rms;
call->_nesting = _nesting; call->_nesting = _nesting;
call->_override_symbolic_info = _override_symbolic_info;
// New call needs all inputs of old call. // New call needs all inputs of old call.
// Req... // Req...
@ -3620,7 +3543,11 @@ encode %{
address virtual_call_meta_addr = __ pc(); address virtual_call_meta_addr = __ pc();
// Load a clear inline cache. // Load a clear inline cache.
AddressLiteral empty_ic((address) Universe::non_oop_word()); AddressLiteral empty_ic((address) Universe::non_oop_word());
__ load_const_from_method_toc(ic_reg, empty_ic, Rtoc); bool success = __ load_const_from_method_toc(ic_reg, empty_ic, Rtoc, /*fixed_size*/ true);
if (!success) {
ciEnv::current()->record_out_of_memory_failure();
return;
}
// CALL to fixup routine. Fixup routine uses ScopeDesc info // CALL to fixup routine. Fixup routine uses ScopeDesc info
// to determine who we intended to call. // to determine who we intended to call.
__ relocate(virtual_call_Relocation::spec(virtual_call_meta_addr)); __ relocate(virtual_call_Relocation::spec(virtual_call_meta_addr));
@ -3676,7 +3603,11 @@ encode %{
__ calculate_address_from_global_toc(Rtoc, __ method_toc()); __ calculate_address_from_global_toc(Rtoc, __ method_toc());
// Put entry, env, toc into the constant pool, this needs up to 3 constant // Put entry, env, toc into the constant pool, this needs up to 3 constant
// pool entries; call_c_using_toc will optimize the call. // pool entries; call_c_using_toc will optimize the call.
__ call_c_using_toc(fd, relocInfo::runtime_call_type, Rtoc); bool success = __ call_c_using_toc(fd, relocInfo::runtime_call_type, Rtoc);
if (!success) {
ciEnv::current()->record_out_of_memory_failure();
return;
}
#endif #endif
// Check the ret_addr_offset. // Check the ret_addr_offset.
@ -6263,6 +6194,10 @@ instruct loadConF(regF dst, immF src, iRegLdst toc) %{
ins_encode %{ ins_encode %{
// TODO: PPC port $archOpcode(ppc64Opcode_lfs); // TODO: PPC port $archOpcode(ppc64Opcode_lfs);
address float_address = __ float_constant($src$$constant); address float_address = __ float_constant($src$$constant);
if (float_address == NULL) {
ciEnv::current()->record_out_of_memory_failure();
return;
}
__ lfs($dst$$FloatRegister, __ offset_to_method_toc(float_address), $toc$$Register); __ lfs($dst$$FloatRegister, __ offset_to_method_toc(float_address), $toc$$Register);
%} %}
ins_pipe(pipe_class_memory); ins_pipe(pipe_class_memory);
@ -6284,6 +6219,10 @@ instruct loadConFComp(regF dst, immF src, iRegLdst toc) %{
FloatRegister Rdst = $dst$$FloatRegister; FloatRegister Rdst = $dst$$FloatRegister;
Register Rtoc = $toc$$Register; Register Rtoc = $toc$$Register;
address float_address = __ float_constant($src$$constant); address float_address = __ float_constant($src$$constant);
if (float_address == NULL) {
ciEnv::current()->record_out_of_memory_failure();
return;
}
int offset = __ offset_to_method_toc(float_address); int offset = __ offset_to_method_toc(float_address);
int hi = (offset + (1<<15))>>16; int hi = (offset + (1<<15))>>16;
int lo = offset - hi * (1<<16); int lo = offset - hi * (1<<16);
@ -6318,7 +6257,12 @@ instruct loadConD(regD dst, immD src, iRegLdst toc) %{
size(4); size(4);
ins_encode %{ ins_encode %{
// TODO: PPC port $archOpcode(ppc64Opcode_lfd); // TODO: PPC port $archOpcode(ppc64Opcode_lfd);
int offset = __ offset_to_method_toc(__ double_constant($src$$constant)); address float_address = __ double_constant($src$$constant);
if (float_address == NULL) {
ciEnv::current()->record_out_of_memory_failure();
return;
}
int offset = __ offset_to_method_toc(float_address);
__ lfd($dst$$FloatRegister, offset, $toc$$Register); __ lfd($dst$$FloatRegister, offset, $toc$$Register);
%} %}
ins_pipe(pipe_class_memory); ins_pipe(pipe_class_memory);
@ -6340,6 +6284,10 @@ instruct loadConDComp(regD dst, immD src, iRegLdst toc) %{
FloatRegister Rdst = $dst$$FloatRegister; FloatRegister Rdst = $dst$$FloatRegister;
Register Rtoc = $toc$$Register; Register Rtoc = $toc$$Register;
address float_address = __ double_constant($src$$constant); address float_address = __ double_constant($src$$constant);
if (float_address == NULL) {
ciEnv::current()->record_out_of_memory_failure();
return;
}
int offset = __ offset_to_method_toc(float_address); int offset = __ offset_to_method_toc(float_address);
int hi = (offset + (1<<15))>>16; int hi = (offset + (1<<15))>>16;
int lo = offset - hi * (1<<16); int lo = offset - hi * (1<<16);
@ -10949,16 +10897,16 @@ instruct partialSubtypeCheck(iRegPdst result, iRegP_N2P subklass, iRegP_N2P supe
// inlined locking and unlocking // inlined locking and unlocking
instruct cmpFastLock(flagsReg crx, iRegPdst oop, iRegPdst box, iRegPdst tmp1, iRegPdst tmp2, iRegPdst tmp3) %{ instruct cmpFastLock(flagsReg crx, iRegPdst oop, iRegPdst box, iRegPdst tmp1, iRegPdst tmp2) %{
match(Set crx (FastLock oop box)); match(Set crx (FastLock oop box));
effect(TEMP tmp1, TEMP tmp2, TEMP tmp3); effect(TEMP tmp1, TEMP tmp2);
predicate(!Compile::current()->use_rtm()); predicate(!Compile::current()->use_rtm());
format %{ "FASTLOCK $oop, $box, $tmp1, $tmp2, $tmp3" %} format %{ "FASTLOCK $oop, $box, $tmp1, $tmp2" %}
ins_encode %{ ins_encode %{
// TODO: PPC port $archOpcode(ppc64Opcode_compound); // TODO: PPC port $archOpcode(ppc64Opcode_compound);
__ compiler_fast_lock_object($crx$$CondRegister, $oop$$Register, $box$$Register, __ compiler_fast_lock_object($crx$$CondRegister, $oop$$Register, $box$$Register,
$tmp3$$Register, $tmp1$$Register, $tmp2$$Register, $tmp1$$Register, $tmp2$$Register, /*tmp3*/ R0,
UseBiasedLocking && !UseOptoBiasInlining); UseBiasedLocking && !UseOptoBiasInlining);
// If locking was successful, crx should indicate 'EQ'. // If locking was successful, crx should indicate 'EQ'.
// The compiler generates a branch to the runtime call to // The compiler generates a branch to the runtime call to
@ -10977,7 +10925,7 @@ instruct cmpFastLock_tm(flagsReg crx, iRegPdst oop, rarg2RegP box, iRegPdst tmp1
ins_encode %{ ins_encode %{
// TODO: PPC port $archOpcode(ppc64Opcode_compound); // TODO: PPC port $archOpcode(ppc64Opcode_compound);
__ compiler_fast_lock_object($crx$$CondRegister, $oop$$Register, $box$$Register, __ compiler_fast_lock_object($crx$$CondRegister, $oop$$Register, $box$$Register,
$tmp3$$Register, $tmp1$$Register, $tmp2$$Register, $tmp1$$Register, $tmp2$$Register, $tmp3$$Register,
/*Biased Locking*/ false, /*Biased Locking*/ false,
_rtm_counters, _stack_rtm_counters, _rtm_counters, _stack_rtm_counters,
((Method*)(ra_->C->method()->constant_encoding()))->method_data(), ((Method*)(ra_->C->method()->constant_encoding()))->method_data(),
@ -10998,7 +10946,7 @@ instruct cmpFastUnlock(flagsReg crx, iRegPdst oop, iRegPdst box, iRegPdst tmp1,
ins_encode %{ ins_encode %{
// TODO: PPC port $archOpcode(ppc64Opcode_compound); // TODO: PPC port $archOpcode(ppc64Opcode_compound);
__ compiler_fast_unlock_object($crx$$CondRegister, $oop$$Register, $box$$Register, __ compiler_fast_unlock_object($crx$$CondRegister, $oop$$Register, $box$$Register,
$tmp3$$Register, $tmp1$$Register, $tmp2$$Register, $tmp1$$Register, $tmp2$$Register, $tmp3$$Register,
UseBiasedLocking && !UseOptoBiasInlining, UseBiasedLocking && !UseOptoBiasInlining,
false); false);
// If unlocking was successful, crx should indicate 'EQ'. // If unlocking was successful, crx should indicate 'EQ'.
@ -11017,7 +10965,7 @@ instruct cmpFastUnlock_tm(flagsReg crx, iRegPdst oop, iRegPdst box, iRegPdst tmp
ins_encode %{ ins_encode %{
// TODO: PPC port $archOpcode(ppc64Opcode_compound); // TODO: PPC port $archOpcode(ppc64Opcode_compound);
__ compiler_fast_unlock_object($crx$$CondRegister, $oop$$Register, $box$$Register, __ compiler_fast_unlock_object($crx$$CondRegister, $oop$$Register, $box$$Register,
$tmp3$$Register, $tmp1$$Register, $tmp2$$Register, $tmp1$$Register, $tmp2$$Register, $tmp3$$Register,
/*Biased Locking*/ false, /*TM*/ true); /*Biased Locking*/ false, /*TM*/ true);
// If unlocking was successful, crx should indicate 'EQ'. // If unlocking was successful, crx should indicate 'EQ'.
// The compiler generates a branch to the runtime call to // The compiler generates a branch to the runtime call to
@ -11790,7 +11738,6 @@ instruct safePoint_poll_conPollAddr(rscratch2RegP poll) %{
instruct CallStaticJavaDirect(method meth) %{ instruct CallStaticJavaDirect(method meth) %{
match(CallStaticJava); match(CallStaticJava);
effect(USE meth); effect(USE meth);
predicate(!((CallStaticJavaNode*)n)->is_method_handle_invoke());
ins_cost(CALL_COST); ins_cost(CALL_COST);
ins_num_consts(3 /* up to 3 patchable constants: inline cache, 2 call targets. */); ins_num_consts(3 /* up to 3 patchable constants: inline cache, 2 call targets. */);
@ -11801,20 +11748,6 @@ instruct CallStaticJavaDirect(method meth) %{
ins_pipe(pipe_class_call); ins_pipe(pipe_class_call);
%} %}
// Schedulable version of call static node.
instruct CallStaticJavaDirectHandle(method meth) %{
match(CallStaticJava);
effect(USE meth);
predicate(((CallStaticJavaNode*)n)->is_method_handle_invoke());
ins_cost(CALL_COST);
ins_num_consts(3 /* up to 3 patchable constants: inline cache, 2 call targets. */);
format %{ "CALL,static $meth \t// ==> " %}
ins_encode( enc_java_handle_call(meth) );
ins_pipe(pipe_class_call);
%}
// Call Java Dynamic Instruction // Call Java Dynamic Instruction
// Used by postalloc expand of CallDynamicJavaDirectSchedEx (actual call). // Used by postalloc expand of CallDynamicJavaDirectSchedEx (actual call).

View file

@ -609,11 +609,16 @@ REGISTER_DECLARATION(Register, R26_tmp6, R26);
REGISTER_DECLARATION(Register, R27_tmp7, R27); REGISTER_DECLARATION(Register, R27_tmp7, R27);
REGISTER_DECLARATION(Register, R28_tmp8, R28); REGISTER_DECLARATION(Register, R28_tmp8, R28);
REGISTER_DECLARATION(Register, R29_tmp9, R29); REGISTER_DECLARATION(Register, R29_tmp9, R29);
#ifndef CC_INTERP
REGISTER_DECLARATION(Register, R24_dispatch_addr, R24); REGISTER_DECLARATION(Register, R24_dispatch_addr, R24);
REGISTER_DECLARATION(Register, R25_templateTableBase, R25); REGISTER_DECLARATION(Register, R25_templateTableBase, R25);
REGISTER_DECLARATION(Register, R26_monitor, R26); REGISTER_DECLARATION(Register, R26_monitor, R26);
REGISTER_DECLARATION(Register, R27_constPoolCache, R27); REGISTER_DECLARATION(Register, R27_constPoolCache, R27);
REGISTER_DECLARATION(Register, R28_mdx, R28); REGISTER_DECLARATION(Register, R28_mdx, R28);
#endif // CC_INTERP
REGISTER_DECLARATION(Register, R19_inline_cache_reg, R19);
REGISTER_DECLARATION(Register, R29_TOC, R29);
#ifndef DONT_USE_REGISTER_DEFINES #ifndef DONT_USE_REGISTER_DEFINES
#define R21_tmp1 AS_REGISTER(Register, R21) #define R21_tmp1 AS_REGISTER(Register, R21)
@ -635,7 +640,11 @@ REGISTER_DECLARATION(Register, R28_mdx, R28);
#define R28_mdx AS_REGISTER(Register, R28) #define R28_mdx AS_REGISTER(Register, R28)
#endif #endif
#define R19_inline_cache_reg AS_REGISTER(Register, R19)
#define R29_TOC AS_REGISTER(Register, R29)
#define CCR4_is_synced AS_REGISTER(ConditionRegister, CCR4) #define CCR4_is_synced AS_REGISTER(ConditionRegister, CCR4)
#endif
// Scratch registers are volatile. // Scratch registers are volatile.
REGISTER_DECLARATION(Register, R11_scratch1, R11); REGISTER_DECLARATION(Register, R11_scratch1, R11);

View file

@ -84,13 +84,11 @@ address Relocation::pd_call_destination(address orig_addr) {
NativeConditionalFarBranch* branch = NativeConditionalFarBranch_at(inst_loc); NativeConditionalFarBranch* branch = NativeConditionalFarBranch_at(inst_loc);
return branch->branch_destination(); return branch->branch_destination();
} else { } else {
// There are two instructions at the beginning of a stub, therefore we
// load at orig_addr + 8.
orig_addr = nativeCall_at(inst_loc)->get_trampoline(); orig_addr = nativeCall_at(inst_loc)->get_trampoline();
if (orig_addr == NULL) { if (orig_addr == NULL) {
return (address) -1; return (address) -1;
} else { } else {
return (address) nativeMovConstReg_at(orig_addr + 8)->data(); return ((NativeCallTrampolineStub*)orig_addr)->destination();
} }
} }
} }

View file

@ -1,6 +1,6 @@
/* /*
* Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1998, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012, 2014 SAP AG. All rights reserved. * Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -45,16 +45,6 @@
#ifdef COMPILER2 #ifdef COMPILER2
// SP adjustment (must use unextended SP) for method handle call sites
// during exception handling.
static intptr_t adjust_SP_for_methodhandle_callsite(JavaThread *thread) {
RegisterMap map(thread, false);
// The frame constructor will do the correction for us (see frame::adjust_unextended_SP).
frame mh_caller_frame = thread->last_frame().sender(&map);
assert(mh_caller_frame.is_compiled_frame(), "Only may reach here for compiled MH call sites");
return (intptr_t) mh_caller_frame.unextended_sp();
}
//------------------------------generate_exception_blob--------------------------- //------------------------------generate_exception_blob---------------------------
// Creates exception blob at the end. // Creates exception blob at the end.
// Using exception blob, this code is jumped from a compiled method. // Using exception blob, this code is jumped from a compiled method.
@ -129,17 +119,10 @@ void OptoRuntime::generate_exception_blob() {
OopMapSet* oop_maps = new OopMapSet(); OopMapSet* oop_maps = new OopMapSet();
oop_maps->add_gc_map(calls_return_pc - start, map); oop_maps->add_gc_map(calls_return_pc - start, map);
// Get unextended_sp for method handle call sites.
Label mh_callsite, mh_done; // Use a 2nd c call if it's a method handle call site.
__ lwa(R4_ARG2, in_bytes(JavaThread::is_method_handle_return_offset()), R16_thread);
__ cmpwi(CCR0, R4_ARG2, 0);
__ bne(CCR0, mh_callsite);
__ mtctr(R3_RET); // Move address of exception handler to SR_CTR. __ mtctr(R3_RET); // Move address of exception handler to SR_CTR.
__ reset_last_Java_frame(); __ reset_last_Java_frame();
__ pop_frame(); __ pop_frame();
__ bind(mh_done);
// We have a handler in register SR_CTR (could be deopt blob). // We have a handler in register SR_CTR (could be deopt blob).
// Get the exception oop. // Get the exception oop.
@ -161,25 +144,6 @@ void OptoRuntime::generate_exception_blob() {
__ mtlr(R4_ARG2); __ mtlr(R4_ARG2);
__ bctr(); __ bctr();
// Same as above, but also set sp to unextended_sp.
__ bind(mh_callsite);
__ mr(R31, R3_RET); // Save branch address.
__ mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
__ call_c((address) adjust_SP_for_methodhandle_callsite, relocInfo::none);
#else
__ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, adjust_SP_for_methodhandle_callsite), relocInfo::none);
#endif
// Returns unextended_sp in R3_RET.
__ mtctr(R31); // Move address of exception handler to SR_CTR.
__ reset_last_Java_frame();
__ mr(R1_SP, R3_RET); // Set sp to unextended_sp.
__ b(mh_done);
// Make sure all code is generated. // Make sure all code is generated.
masm->flush(); masm->flush();

View file

@ -44,6 +44,8 @@
#include "opto/runtime.hpp" #include "opto/runtime.hpp"
#endif #endif
#include <alloca.h>
#define __ masm-> #define __ masm->
#ifdef PRODUCT #ifdef PRODUCT
@ -62,7 +64,7 @@ class RegisterSaver {
// Support different return pc locations. // Support different return pc locations.
enum ReturnPCLocation { enum ReturnPCLocation {
return_pc_is_lr, return_pc_is_lr,
return_pc_is_r4, return_pc_is_pre_saved,
return_pc_is_thread_saved_exception_pc return_pc_is_thread_saved_exception_pc
}; };
@ -242,15 +244,16 @@ OopMap* RegisterSaver::push_frame_reg_args_and_save_live_registers(MacroAssemble
__ std(R31, _abi(cr), R1_SP); __ std(R31, _abi(cr), R1_SP);
switch (return_pc_location) { switch (return_pc_location) {
case return_pc_is_lr: __ mflr(R31); break; case return_pc_is_lr: __ mflr(R31); break;
case return_pc_is_r4: __ mr(R31, R4); break; case return_pc_is_pre_saved: assert(return_pc_adjustment == 0, "unsupported"); break;
case return_pc_is_thread_saved_exception_pc: case return_pc_is_thread_saved_exception_pc: __ ld(R31, thread_(saved_exception_pc)); break;
__ ld(R31, thread_(saved_exception_pc)); break;
default: ShouldNotReachHere(); default: ShouldNotReachHere();
} }
if (return_pc_location != return_pc_is_pre_saved) {
if (return_pc_adjustment != 0) { if (return_pc_adjustment != 0) {
__ addi(R31, R31, return_pc_adjustment); __ addi(R31, R31, return_pc_adjustment);
} }
__ std(R31, _abi(lr), R1_SP); __ std(R31, _abi(lr), R1_SP);
}
// push a new frame // push a new frame
__ push_frame(frame_size_in_bytes, R31); __ push_frame(frame_size_in_bytes, R31);
@ -646,7 +649,7 @@ int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
return round_to(stk, 2); return round_to(stk, 2);
} }
#ifdef COMPILER2 #if defined(COMPILER1) || defined(COMPILER2)
// Calling convention for calling C code. // Calling convention for calling C code.
int SharedRuntime::c_calling_convention(const BasicType *sig_bt, int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
VMRegPair *regs, VMRegPair *regs,
@ -1474,7 +1477,7 @@ static void save_or_restore_arguments(MacroAssembler* masm,
} }
} }
// Check GC_locker::needs_gc and enter the runtime if it's true. This // Check GCLocker::needs_gc and enter the runtime if it's true. This
// keeps a new JNI critical region from starting until a GC has been // keeps a new JNI critical region from starting until a GC has been
// forced. Save down any oops in registers and describe them in an // forced. Save down any oops in registers and describe them in an
// OopMap. // OopMap.
@ -1486,9 +1489,9 @@ static void check_needs_gc_for_critical_native(MacroAssembler* masm,
VMRegPair* in_regs, VMRegPair* in_regs,
BasicType* in_sig_bt, BasicType* in_sig_bt,
Register tmp_reg ) { Register tmp_reg ) {
__ block_comment("check GC_locker::needs_gc"); __ block_comment("check GCLocker::needs_gc");
Label cont; Label cont;
__ lbz(tmp_reg, (RegisterOrConstant)(intptr_t)GC_locker::needs_gc_address()); __ lbz(tmp_reg, (RegisterOrConstant)(intptr_t)GCLocker::needs_gc_address());
__ cmplwi(CCR0, tmp_reg, 0); __ cmplwi(CCR0, tmp_reg, 0);
__ beq(CCR0, cont); __ beq(CCR0, cont);
@ -1687,14 +1690,14 @@ static void gen_special_dispatch(MacroAssembler* masm,
// GetPrimitiveArrayCritical and disallow the use of any other JNI // GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions. The wrapper is expected to unpack the arguments before // functions. The wrapper is expected to unpack the arguments before
// passing them to the callee and perform checks before and after the // passing them to the callee and perform checks before and after the
// native call to ensure that the GC_locker // native call to ensure that the GCLocker
// lock_critical/unlock_critical semantics are followed. Some other // lock_critical/unlock_critical semantics are followed. Some other
// parts of JNI setup are skipped like the tear down of the JNI handle // parts of JNI setup are skipped like the tear down of the JNI handle
// block and the check for pending exceptions since it's impossible for them // block and the check for pending exceptions since it's impossible for them
// to be thrown. // to be thrown.
// //
// They are roughly structured like this: // They are roughly structured like this:
// if (GC_locker::needs_gc()) // if (GCLocker::needs_gc())
// SharedRuntime::block_for_jni_critical(); // SharedRuntime::block_for_jni_critical();
// transition to thread_in_native // transition to thread_in_native
// unpack array arguments and call native entry point // unpack array arguments and call native entry point
@ -2566,7 +2569,7 @@ uint SharedRuntime::out_preserve_stack_slots() {
#endif #endif
} }
#ifdef COMPILER2 #if defined(COMPILER1) || defined(COMPILER2)
// Frame generation for deopt and uncommon trap blobs. // Frame generation for deopt and uncommon trap blobs.
static void push_skeleton_frame(MacroAssembler* masm, bool deopt, static void push_skeleton_frame(MacroAssembler* masm, bool deopt,
/* Read */ /* Read */
@ -2713,7 +2716,7 @@ void SharedRuntime::generate_deopt_blob() {
const address start = __ pc(); const address start = __ pc();
#ifdef COMPILER2 #if defined(COMPILER1) || defined(COMPILER2)
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
// Prolog for non exception case! // Prolog for non exception case!
@ -2762,28 +2765,43 @@ void SharedRuntime::generate_deopt_blob() {
BLOCK_COMMENT("Prolog for exception case"); BLOCK_COMMENT("Prolog for exception case");
// The RegisterSaves doesn't need to adjust the return pc for this situation.
const int return_pc_adjustment_exception = 0;
// Push the "unpack frame".
// Save everything in sight.
assert(R4 == R4_ARG2, "exception pc must be in r4");
RegisterSaver::push_frame_reg_args_and_save_live_registers(masm,
&first_frame_size_in_bytes,
/*generate_oop_map=*/ false,
return_pc_adjustment_exception,
RegisterSaver::return_pc_is_r4);
// Deopt during an exception. Save exec mode for unpack_frames.
__ li(exec_mode_reg, Deoptimization::Unpack_exception);
// Store exception oop and pc in thread (location known to GC). // Store exception oop and pc in thread (location known to GC).
// This is needed since the call to "fetch_unroll_info()" may safepoint. // This is needed since the call to "fetch_unroll_info()" may safepoint.
__ std(R3_ARG1, in_bytes(JavaThread::exception_oop_offset()), R16_thread); __ std(R3_ARG1, in_bytes(JavaThread::exception_oop_offset()), R16_thread);
__ std(R4_ARG2, in_bytes(JavaThread::exception_pc_offset()), R16_thread); __ std(R4_ARG2, in_bytes(JavaThread::exception_pc_offset()), R16_thread);
__ std(R4_ARG2, _abi(lr), R1_SP);
// Vanilla deoptimization with an exception pending in exception_oop.
int exception_in_tls_offset = __ pc() - start;
// Push the "unpack frame".
// Save everything in sight.
RegisterSaver::push_frame_reg_args_and_save_live_registers(masm,
&first_frame_size_in_bytes,
/*generate_oop_map=*/ false,
/*return_pc_adjustment_exception=*/ 0,
RegisterSaver::return_pc_is_pre_saved);
// Deopt during an exception. Save exec mode for unpack_frames.
__ li(exec_mode_reg, Deoptimization::Unpack_exception);
// fall through // fall through
int reexecute_offset = 0;
#ifdef COMPILER1
__ b(exec_mode_initialized);
// Reexecute entry, similar to c2 uncommon trap
reexecute_offset = __ pc() - start;
RegisterSaver::push_frame_reg_args_and_save_live_registers(masm,
&first_frame_size_in_bytes,
/*generate_oop_map=*/ false,
/*return_pc_adjustment_reexecute=*/ 0,
RegisterSaver::return_pc_is_pre_saved);
__ li(exec_mode_reg, Deoptimization::Unpack_reexecute);
#endif
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
__ BIND(exec_mode_initialized); __ BIND(exec_mode_initialized);
@ -2889,7 +2907,9 @@ void SharedRuntime::generate_deopt_blob() {
int exception_offset = __ pc() - start; int exception_offset = __ pc() - start;
#endif // COMPILER2 #endif // COMPILER2
_deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, 0, first_frame_size_in_bytes / wordSize); _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset,
reexecute_offset, first_frame_size_in_bytes / wordSize);
_deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
} }
#ifdef COMPILER2 #ifdef COMPILER2
@ -3196,3 +3216,245 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha
return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_bytes/wordSize, return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_bytes/wordSize,
oop_maps, true); oop_maps, true);
} }
//------------------------------Montgomery multiplication------------------------
//
// Subtract 0:b from carry:a. Return carry.
static unsigned long
sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
long i = 0;
unsigned long tmp, tmp2;
__asm__ __volatile__ (
"subfc %[tmp], %[tmp], %[tmp] \n" // pre-set CA
"mtctr %[len] \n"
"0: \n"
"ldx %[tmp], %[i], %[a] \n"
"ldx %[tmp2], %[i], %[b] \n"
"subfe %[tmp], %[tmp2], %[tmp] \n" // subtract extended
"stdx %[tmp], %[i], %[a] \n"
"addi %[i], %[i], 8 \n"
"bdnz 0b \n"
"addme %[tmp], %[carry] \n" // carry + CA - 1
: [i]"+b"(i), [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2)
: [a]"r"(a), [b]"r"(b), [carry]"r"(carry), [len]"r"(len)
: "ctr", "xer", "memory"
);
return tmp;
}
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
inline void MACC(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
unsigned long hi, lo;
__asm__ __volatile__ (
"mulld %[lo], %[A], %[B] \n"
"mulhdu %[hi], %[A], %[B] \n"
"addc %[T0], %[T0], %[lo] \n"
"adde %[T1], %[T1], %[hi] \n"
"addze %[T2], %[T2] \n"
: [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
: [A]"r"(A), [B]"r"(B)
: "xer"
);
}
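The inline assembly above multiplies A by B with mulld/mulhdu and adds the 128-bit product into the 192-bit accumulator T2:T1:T0 through the addc/adde/addze carry chain. A standalone, portable cross-check of that behaviour (not part of the patch, and assuming a compiler that provides unsigned __int128, e.g. gcc or clang):

    #include <cassert>

    static inline void macc_portable(unsigned long A, unsigned long B,
                                     unsigned long &T0, unsigned long &T1, unsigned long &T2) {
      unsigned __int128 prod = (unsigned __int128)A * B;              // full 128-bit product
      unsigned __int128 low  = ((unsigned __int128)T1 << 64) | T0;    // T1:T0 as one 128-bit value
      unsigned __int128 sum  = low + prod;
      T0 = (unsigned long)sum;
      T1 = (unsigned long)(sum >> 64);
      T2 += (sum < low);                                              // carry out of the low 128 bits
    }

    int main() {
      unsigned long t0 = 0, t1 = 0, t2 = 0;
      macc_portable(~0UL, ~0UL, t0, t1, t2);   // (2^64-1)^2 = 2^128 - 2^65 + 1
      assert(t0 == 1UL && t1 == ~0UL - 1 && t2 == 0);
      return 0;
    }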
// As above, but add twice the double-length result into the
// accumulator.
inline void MACC2(unsigned long A, unsigned long B, unsigned long &T0, unsigned long &T1, unsigned long &T2) {
unsigned long hi, lo;
__asm__ __volatile__ (
"mulld %[lo], %[A], %[B] \n"
"mulhdu %[hi], %[A], %[B] \n"
"addc %[T0], %[T0], %[lo] \n"
"adde %[T1], %[T1], %[hi] \n"
"addze %[T2], %[T2] \n"
"addc %[T0], %[T0], %[lo] \n"
"adde %[T1], %[T1], %[hi] \n"
"addze %[T2], %[T2] \n"
: [hi]"=&r"(hi), [lo]"=&r"(lo), [T0]"+r"(T0), [T1]"+r"(T1), [T2]"+r"(T2)
: [A]"r"(A), [B]"r"(B)
: "xer"
);
}
// Fast Montgomery multiplication. The derivation of the algorithm is
// in "A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237".
static void
montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
unsigned long m[], unsigned long inv, int len) {
unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
int i;
assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
for (i = 0; i < len; i++) {
int j;
for (j = 0; j < i; j++) {
MACC(a[j], b[i-j], t0, t1, t2);
MACC(m[j], n[i-j], t0, t1, t2);
}
MACC(a[i], b[0], t0, t1, t2);
m[i] = t0 * inv;
MACC(m[i], n[0], t0, t1, t2);
assert(t0 == 0, "broken Montgomery multiply");
t0 = t1; t1 = t2; t2 = 0;
}
for (i = len; i < 2*len; i++) {
int j;
for (j = i-len+1; j < len; j++) {
MACC(a[j], b[i-j], t0, t1, t2);
MACC(m[j], n[i-j], t0, t1, t2);
}
m[i-len] = t0;
t0 = t1; t1 = t2; t2 = 0;
}
while (t0) {
t0 = sub(m, n, t0, len);
}
}
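montgomery_multiply above and montgomery_square below both assert inv * n[0] == -1UL: the caller must supply the negated modular inverse of the low word of the modulus, modulo 2^64. For reference, a standalone sketch (not part of the patch) of how such an inverse can be computed by Newton/Hensel lifting; the function name is illustrative only:

    #include <cassert>
    #include <cstdio>

    static unsigned long montgomery_inverse(unsigned long n) {
      assert((n & 1) == 1);             // the modulus word must be odd
      unsigned long x = n;              // n is its own inverse modulo 8 (3 correct bits)
      for (int i = 0; i < 5; i++) {     // 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits of precision
        x *= 2 - n * x;                 // Newton step, arithmetic wraps modulo 2^64
      }
      return 0 - x;                     // negate so that inv * n == -1 (mod 2^64)
    }

    int main() {
      unsigned long n = 0xffffffffffffffc5UL;     // an (odd) low modulus word
      unsigned long inv = montgomery_inverse(n);
      assert(inv * n == (unsigned long)-1);       // the precondition asserted by the stubs
      printf("inv = %#lx\n", inv);
      return 0;
    }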
// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.
static void
montgomery_square(unsigned long a[], unsigned long n[],
unsigned long m[], unsigned long inv, int len) {
unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
int i;
assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
for (i = 0; i < len; i++) {
int j;
int end = (i+1)/2;
for (j = 0; j < end; j++) {
MACC2(a[j], a[i-j], t0, t1, t2);
MACC(m[j], n[i-j], t0, t1, t2);
}
if ((i & 1) == 0) {
MACC(a[j], a[j], t0, t1, t2);
}
for (; j < i; j++) {
MACC(m[j], n[i-j], t0, t1, t2);
}
m[i] = t0 * inv;
MACC(m[i], n[0], t0, t1, t2);
assert(t0 == 0, "broken Montgomery square");
t0 = t1; t1 = t2; t2 = 0;
}
for (i = len; i < 2*len; i++) {
int start = i-len+1;
int end = start + (len - start)/2;
int j;
for (j = start; j < end; j++) {
MACC2(a[j], a[i-j], t0, t1, t2);
MACC(m[j], n[i-j], t0, t1, t2);
}
if ((i & 1) == 0) {
MACC(a[j], a[j], t0, t1, t2);
}
for (; j < len; j++) {
MACC(m[j], n[i-j], t0, t1, t2);
}
m[i-len] = t0;
t0 = t1; t1 = t2; t2 = 0;
}
while (t0) {
t0 = sub(m, n, t0, len);
}
}
// The threshold at which squaring is advantageous was determined
// experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
// Doesn't seem to be relevant for Power8 so we use the same value.
#define MONTGOMERY_SQUARING_THRESHOLD 64
// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed.
static void reverse_words(unsigned long *s, unsigned long *d, int len) {
d += len;
while(len-- > 0) {
d--;
unsigned long s_val = *s;
// Swap words in a longword on little endian machines.
#ifdef VM_LITTLE_ENDIAN
s_val = (s_val << 32) | (s_val >> 32);
#endif
*d = s_val;
s++;
}
}
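For illustration (not part of the patch), the effect of reverse_words on two longwords, assuming a little-endian host (VM_LITTLE_ENDIAN defined): the longword order is reversed and the two 32-bit halves of each longword are swapped, which is how the jint-based intrinsic arguments are repacked into unsigned long limbs below.

    #include <cassert>

    int main() {
      unsigned long s[2] = { 0x0000000200000001UL, 0x0000000400000003UL };
      unsigned long d[2] = { 0, 0 };
      for (int i = 0; i < 2; i++) {        // same effect as reverse_words(s, d, 2)
        unsigned long v = s[i];
        v = (v << 32) | (v >> 32);         // swap the 32-bit halves of the longword
        d[2 - 1 - i] = v;                  // store in reversed longword order
      }
      assert(d[0] == 0x0000000300000004UL);
      assert(d[1] == 0x0000000100000002UL);
      return 0;
    }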
void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
jint len, jlong inv,
jint *m_ints) {
assert(len % 2 == 0, "array length in montgomery_multiply must be even");
int longwords = len/2;
assert(longwords > 0, "unsupported");
// Make very sure we don't use so much space that the stack might
// overflow. 512 jints correspond to a 16384-bit integer and
// will use here a total of 8k bytes of stack space.
int total_allocation = longwords * sizeof (unsigned long) * 4;
guarantee(total_allocation <= 8192, "must be");
unsigned long *scratch = (unsigned long *)alloca(total_allocation);
// Local scratch arrays
unsigned long
*a = scratch + 0 * longwords,
*b = scratch + 1 * longwords,
*n = scratch + 2 * longwords,
*m = scratch + 3 * longwords;
reverse_words((unsigned long *)a_ints, a, longwords);
reverse_words((unsigned long *)b_ints, b, longwords);
reverse_words((unsigned long *)n_ints, n, longwords);
::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
reverse_words(m, (unsigned long *)m_ints, longwords);
}
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
jint len, jlong inv,
jint *m_ints) {
assert(len % 2 == 0, "array length in montgomery_square must be even");
int longwords = len/2;
assert(longwords > 0, "unsupported");
// Make very sure we don't use so much space that the stack might
// overflow. 512 jints correspond to a 16384-bit integer and
// will use here a total of 6k bytes of stack space.
int total_allocation = longwords * sizeof (unsigned long) * 3;
guarantee(total_allocation <= 8192, "must be");
unsigned long *scratch = (unsigned long *)alloca(total_allocation);
// Local scratch arrays
unsigned long
*a = scratch + 0 * longwords,
*n = scratch + 1 * longwords,
*m = scratch + 2 * longwords;
reverse_words((unsigned long *)a_ints, a, longwords);
reverse_words((unsigned long *)n_ints, n, longwords);
if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
::montgomery_square(a, n, m, (unsigned long)inv, longwords);
} else {
::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
}
reverse_words(m, (unsigned long *)m_ints, longwords);
}

View file

@ -48,6 +48,12 @@
#define BLOCK_COMMENT(str) __ block_comment(str) #define BLOCK_COMMENT(str) __ block_comment(str)
#endif #endif
#if defined(ABI_ELFv2)
#define STUB_ENTRY(name) StubRoutines::name()
#else
#define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name())->entry()
#endif
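The macro folds the ABI_ELFv2 distinction that the arraycopy stubs previously spelled out inline (see the removed #if blocks further down). For a routine such as jbyte_disjoint_arraycopy it expands to:

    STUB_ENTRY(jbyte_disjoint_arraycopy)
      // ELFv2:  StubRoutines::jbyte_disjoint_arraycopy()
      // ELFv1:  ((FunctionDescriptor*)StubRoutines::jbyte_disjoint_arraycopy())->entry()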
class StubGenerator: public StubCodeGenerator { class StubGenerator: public StubCodeGenerator {
private: private:
@ -252,8 +258,7 @@ class StubGenerator: public StubCodeGenerator {
// //
// global toc register // global toc register
__ load_const(R29, MacroAssembler::global_toc(), R11_scratch1); __ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R11_scratch1);
// Remember the senderSP so the interpreter can pop c2i arguments off of the stack // Remember the senderSP so the interpreter can pop c2i arguments off of the stack
// when called via a c2i. // when called via a c2i.
@ -612,14 +617,17 @@ class StubGenerator: public StubCodeGenerator {
// Kills: // Kills:
// nothing // nothing
// //
void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1) { void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1,
Register preserve1 = noreg, Register preserve2 = noreg) {
BarrierSet* const bs = Universe::heap()->barrier_set(); BarrierSet* const bs = Universe::heap()->barrier_set();
switch (bs->kind()) { switch (bs->kind()) {
case BarrierSet::G1SATBCTLogging: case BarrierSet::G1SATBCTLogging:
// With G1, don't generate the call if we statically know that the target is uninitialized // With G1, don't generate the call if we statically know that the target is uninitialized
if (!dest_uninitialized) { if (!dest_uninitialized) {
const int spill_slots = 4 * wordSize; int spill_slots = 3;
const int frame_size = frame::abi_reg_args_size + spill_slots; if (preserve1 != noreg) { spill_slots++; }
if (preserve2 != noreg) { spill_slots++; }
const int frame_size = align_size_up(frame::abi_reg_args_size + spill_slots * BytesPerWord, frame::alignment_in_bytes);
Label filtered; Label filtered;
// Is marking active? // Is marking active?
@ -633,17 +641,23 @@ class StubGenerator: public StubCodeGenerator {
__ beq(CCR0, filtered); __ beq(CCR0, filtered);
__ save_LR_CR(R0); __ save_LR_CR(R0);
__ push_frame_reg_args(spill_slots, R0); __ push_frame(frame_size, R0);
__ std(from, frame_size - 1 * wordSize, R1_SP); int slot_nr = 0;
__ std(to, frame_size - 2 * wordSize, R1_SP); __ std(from, frame_size - (++slot_nr) * wordSize, R1_SP);
__ std(count, frame_size - 3 * wordSize, R1_SP); __ std(to, frame_size - (++slot_nr) * wordSize, R1_SP);
__ std(count, frame_size - (++slot_nr) * wordSize, R1_SP);
if (preserve1 != noreg) { __ std(preserve1, frame_size - (++slot_nr) * wordSize, R1_SP); }
if (preserve2 != noreg) { __ std(preserve2, frame_size - (++slot_nr) * wordSize, R1_SP); }
__ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count); __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
__ ld(from, frame_size - 1 * wordSize, R1_SP); slot_nr = 0;
__ ld(to, frame_size - 2 * wordSize, R1_SP); __ ld(from, frame_size - (++slot_nr) * wordSize, R1_SP);
__ ld(count, frame_size - 3 * wordSize, R1_SP); __ ld(to, frame_size - (++slot_nr) * wordSize, R1_SP);
__ pop_frame(); __ ld(count, frame_size - (++slot_nr) * wordSize, R1_SP);
if (preserve1 != noreg) { __ ld(preserve1, frame_size - (++slot_nr) * wordSize, R1_SP); }
if (preserve2 != noreg) { __ ld(preserve2, frame_size - (++slot_nr) * wordSize, R1_SP); }
__ addi(R1_SP, R1_SP, frame_size); // pop_frame()
__ restore_LR_CR(R0); __ restore_LR_CR(R0);
__ bind(filtered); __ bind(filtered);
@ -667,27 +681,22 @@ class StubGenerator: public StubCodeGenerator {
// //
// The input registers and R0 are overwritten. // The input registers and R0 are overwritten.
// //
void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, bool branchToEnd) { void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, Register preserve = noreg) {
BarrierSet* const bs = Universe::heap()->barrier_set(); BarrierSet* const bs = Universe::heap()->barrier_set();
switch (bs->kind()) { switch (bs->kind()) {
case BarrierSet::G1SATBCTLogging: case BarrierSet::G1SATBCTLogging:
{ {
if (branchToEnd) { int spill_slots = (preserve != noreg) ? 1 : 0;
const int frame_size = align_size_up(frame::abi_reg_args_size + spill_slots * BytesPerWord, frame::alignment_in_bytes);
__ save_LR_CR(R0); __ save_LR_CR(R0);
// We need this frame only to spill LR. __ push_frame(frame_size, R0);
__ push_frame_reg_args(0, R0); if (preserve != noreg) { __ std(preserve, frame_size - 1 * wordSize, R1_SP); }
__ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count); __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
__ pop_frame(); if (preserve != noreg) { __ ld(preserve, frame_size - 1 * wordSize, R1_SP); }
__ addi(R1_SP, R1_SP, frame_size); // pop_frame();
__ restore_LR_CR(R0); __ restore_LR_CR(R0);
} else {
// Tail call: fake call from stub caller by branching without linking.
address entry_point = (address)CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post);
__ mr_if_needed(R3_ARG1, addr);
__ mr_if_needed(R4_ARG2, count);
__ load_const(R11, entry_point, R0);
__ call_c_and_return_to_caller(R11);
}
} }
break; break;
case BarrierSet::CardTableForRS: case BarrierSet::CardTableForRS:
@ -722,12 +731,9 @@ class StubGenerator: public StubCodeGenerator {
__ addi(addr, addr, 1); __ addi(addr, addr, 1);
__ bdnz(Lstore_loop); __ bdnz(Lstore_loop);
__ bind(Lskip_loop); __ bind(Lskip_loop);
if (!branchToEnd) __ blr();
} }
break; break;
case BarrierSet::ModRef: case BarrierSet::ModRef:
if (!branchToEnd) __ blr();
break; break;
default: default:
ShouldNotReachHere(); ShouldNotReachHere();
@ -756,8 +762,10 @@ class StubGenerator: public StubCodeGenerator {
// Procedure for large arrays (uses data cache block zero instruction). // Procedure for large arrays (uses data cache block zero instruction).
Label dwloop, fast, fastloop, restloop, lastdword, done; Label dwloop, fast, fastloop, restloop, lastdword, done;
int cl_size=VM_Version::get_cache_line_size(), cl_dwords=cl_size>>3, cl_dwordaddr_bits=exact_log2(cl_dwords); int cl_size = VM_Version::L1_data_cache_line_size();
int min_dcbz=2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines. int cl_dwords = cl_size >> 3;
int cl_dwordaddr_bits = exact_log2(cl_dwords);
int min_dcbz = 2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
// Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16. // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
__ dcbtst(base_ptr_reg); // Indicate write access to first cache line ... __ dcbtst(base_ptr_reg); // Indicate write access to first cache line ...
@ -1074,7 +1082,6 @@ class StubGenerator: public StubCodeGenerator {
Register tmp1 = R6_ARG4; Register tmp1 = R6_ARG4;
Register tmp2 = R7_ARG5; Register tmp2 = R7_ARG5;
Label l_overlap;
#ifdef ASSERT #ifdef ASSERT
__ srdi_(tmp2, R5_ARG3, 31); __ srdi_(tmp2, R5_ARG3, 31);
__ asm_assert_eq("missing zero extend", 0xAFFE); __ asm_assert_eq("missing zero extend", 0xAFFE);
@ -1084,19 +1091,11 @@ class StubGenerator: public StubCodeGenerator {
__ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
__ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison! __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
__ cmpld(CCR1, tmp1, tmp2); __ cmpld(CCR1, tmp1, tmp2);
__ crand(CCR0, Assembler::less, CCR1, Assembler::less); __ crnand(CCR0, Assembler::less, CCR1, Assembler::less);
__ blt(CCR0, l_overlap); // Src before dst and distance smaller than size. // Overlaps if Src before dst and distance smaller than size.
// Branch to forward copy routine otherwise (within range of 32kB).
__ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CCR0, Assembler::less), no_overlap_target);
// need to copy forwards
if (__ is_within_range_of_b(no_overlap_target, __ pc())) {
__ b(no_overlap_target);
} else {
__ load_const(tmp1, no_overlap_target, tmp2);
__ mtctr(tmp1);
__ bctr();
}
__ bind(l_overlap);
// need to copy backwards // need to copy backwards
} }
@ -1241,6 +1240,7 @@ class StubGenerator: public StubCodeGenerator {
} }
__ bind(l_4); __ bind(l_4);
__ li(R3_RET, 0); // return 0
__ blr(); __ blr();
return start; return start;
@ -1262,15 +1262,9 @@ class StubGenerator: public StubCodeGenerator {
Register tmp2 = R7_ARG5; Register tmp2 = R7_ARG5;
Register tmp3 = R8_ARG6; Register tmp3 = R8_ARG6;
#if defined(ABI_ELFv2)
address nooverlap_target = aligned ? address nooverlap_target = aligned ?
StubRoutines::arrayof_jbyte_disjoint_arraycopy() : STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy) :
StubRoutines::jbyte_disjoint_arraycopy(); STUB_ENTRY(jbyte_disjoint_arraycopy);
#else
address nooverlap_target = aligned ?
((FunctionDescriptor*)StubRoutines::arrayof_jbyte_disjoint_arraycopy())->entry() :
((FunctionDescriptor*)StubRoutines::jbyte_disjoint_arraycopy())->entry();
#endif
array_overlap_test(nooverlap_target, 0); array_overlap_test(nooverlap_target, 0);
// Do reverse copy. We assume the case of actual overlap is rare enough // Do reverse copy. We assume the case of actual overlap is rare enough
@ -1285,6 +1279,7 @@ class StubGenerator: public StubCodeGenerator {
__ lbzx(tmp1, R3_ARG1, R5_ARG3); __ lbzx(tmp1, R3_ARG1, R5_ARG3);
__ bge(CCR0, l_1); __ bge(CCR0, l_1);
__ li(R3_RET, 0); // return 0
__ blr(); __ blr();
return start; return start;
@ -1467,6 +1462,7 @@ class StubGenerator: public StubCodeGenerator {
__ bdnz(l_5); __ bdnz(l_5);
} }
__ bind(l_4); __ bind(l_4);
__ li(R3_RET, 0); // return 0
__ blr(); __ blr();
return start; return start;
@ -1488,15 +1484,9 @@ class StubGenerator: public StubCodeGenerator {
Register tmp2 = R7_ARG5; Register tmp2 = R7_ARG5;
Register tmp3 = R8_ARG6; Register tmp3 = R8_ARG6;
#if defined(ABI_ELFv2)
address nooverlap_target = aligned ? address nooverlap_target = aligned ?
StubRoutines::arrayof_jshort_disjoint_arraycopy() : STUB_ENTRY(arrayof_jshort_disjoint_arraycopy) :
StubRoutines::jshort_disjoint_arraycopy(); STUB_ENTRY(jshort_disjoint_arraycopy);
#else
address nooverlap_target = aligned ?
((FunctionDescriptor*)StubRoutines::arrayof_jshort_disjoint_arraycopy())->entry() :
((FunctionDescriptor*)StubRoutines::jshort_disjoint_arraycopy())->entry();
#endif
array_overlap_test(nooverlap_target, 1); array_overlap_test(nooverlap_target, 1);
@ -1510,6 +1500,7 @@ class StubGenerator: public StubCodeGenerator {
__ lhzx(tmp2, R3_ARG1, tmp1); __ lhzx(tmp2, R3_ARG1, tmp1);
__ bge(CCR0, l_1); __ bge(CCR0, l_1);
__ li(R3_RET, 0); // return 0
__ blr(); __ blr();
return start; return start;
@ -1613,6 +1604,7 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name); StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry(); address start = __ function_entry();
generate_disjoint_int_copy_core(aligned); generate_disjoint_int_copy_core(aligned);
__ li(R3_RET, 0); // return 0
__ blr(); __ blr();
return start; return start;
} }
@ -1697,20 +1689,15 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name); StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry(); address start = __ function_entry();
#if defined(ABI_ELFv2)
address nooverlap_target = aligned ? address nooverlap_target = aligned ?
StubRoutines::arrayof_jint_disjoint_arraycopy() : STUB_ENTRY(arrayof_jint_disjoint_arraycopy) :
StubRoutines::jint_disjoint_arraycopy(); STUB_ENTRY(jint_disjoint_arraycopy);
#else
address nooverlap_target = aligned ?
((FunctionDescriptor*)StubRoutines::arrayof_jint_disjoint_arraycopy())->entry() :
((FunctionDescriptor*)StubRoutines::jint_disjoint_arraycopy())->entry();
#endif
array_overlap_test(nooverlap_target, 2); array_overlap_test(nooverlap_target, 2);
generate_conjoint_int_copy_core(aligned); generate_conjoint_int_copy_core(aligned);
__ li(R3_RET, 0); // return 0
__ blr(); __ blr();
return start; return start;
@ -1789,6 +1776,7 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name); StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry(); address start = __ function_entry();
generate_disjoint_long_copy_core(aligned); generate_disjoint_long_copy_core(aligned);
__ li(R3_RET, 0); // return 0
__ blr(); __ blr();
return start; return start;
@ -1871,19 +1859,14 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name); StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry(); address start = __ function_entry();
#if defined(ABI_ELFv2)
address nooverlap_target = aligned ? address nooverlap_target = aligned ?
StubRoutines::arrayof_jlong_disjoint_arraycopy() : STUB_ENTRY(arrayof_jlong_disjoint_arraycopy) :
StubRoutines::jlong_disjoint_arraycopy(); STUB_ENTRY(jlong_disjoint_arraycopy);
#else
address nooverlap_target = aligned ?
((FunctionDescriptor*)StubRoutines::arrayof_jlong_disjoint_arraycopy())->entry() :
((FunctionDescriptor*)StubRoutines::jlong_disjoint_arraycopy())->entry();
#endif
array_overlap_test(nooverlap_target, 3); array_overlap_test(nooverlap_target, 3);
generate_conjoint_long_copy_core(aligned); generate_conjoint_long_copy_core(aligned);
__ li(R3_RET, 0); // return 0
__ blr(); __ blr();
return start; return start;
@ -1903,15 +1886,9 @@ class StubGenerator: public StubCodeGenerator {
address start = __ function_entry(); address start = __ function_entry();
#if defined(ABI_ELFv2)
address nooverlap_target = aligned ? address nooverlap_target = aligned ?
StubRoutines::arrayof_oop_disjoint_arraycopy() : STUB_ENTRY(arrayof_oop_disjoint_arraycopy) :
StubRoutines::oop_disjoint_arraycopy(); STUB_ENTRY(oop_disjoint_arraycopy);
#else
address nooverlap_target = aligned ?
((FunctionDescriptor*)StubRoutines::arrayof_oop_disjoint_arraycopy())->entry() :
((FunctionDescriptor*)StubRoutines::oop_disjoint_arraycopy())->entry();
#endif
gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7); gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
@ -1927,7 +1904,9 @@ class StubGenerator: public StubCodeGenerator {
generate_conjoint_long_copy_core(aligned); generate_conjoint_long_copy_core(aligned);
} }
gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false); gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
__ li(R3_RET, 0); // return 0
__ blr();
return start; return start;
} }
@ -1957,11 +1936,460 @@ class StubGenerator: public StubCodeGenerator {
generate_disjoint_long_copy_core(aligned); generate_disjoint_long_copy_core(aligned);
} }
gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false); gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
__ li(R3_RET, 0); // return 0
__ blr();
return start; return start;
} }
// Helper for generating a dynamic type check.
// Smashes only the given temp registers.
void generate_type_check(Register sub_klass,
Register super_check_offset,
Register super_klass,
Register temp,
Label& L_success) {
assert_different_registers(sub_klass, super_check_offset, super_klass);
BLOCK_COMMENT("type_check:");
Label L_miss;
__ check_klass_subtype_fast_path(sub_klass, super_klass, temp, R0, &L_success, &L_miss, NULL,
super_check_offset);
__ check_klass_subtype_slow_path(sub_klass, super_klass, temp, R0, &L_success, NULL);
// Fall through on failure!
__ bind(L_miss);
}
// Generate stub for checked oop copy.
//
// Arguments for generated stub:
// from: R3
// to: R4
// count: R5 treated as signed
// ckoff: R6 (super_check_offset)
// ckval: R7 (super_klass)
// ret: R3 zero for success; (-1^K) where K is partial transfer count
//
address generate_checkcast_copy(const char *name, bool dest_uninitialized) {
const Register R3_from = R3_ARG1; // source array address
const Register R4_to = R4_ARG2; // destination array address
const Register R5_count = R5_ARG3; // elements count
const Register R6_ckoff = R6_ARG4; // super_check_offset
const Register R7_ckval = R7_ARG5; // super_klass
const Register R8_offset = R8_ARG6; // loop var, with stride wordSize
const Register R9_remain = R9_ARG7; // loop var, with stride -1
const Register R10_oop = R10_ARG8; // actual oop copied
const Register R11_klass = R11_scratch1; // oop._klass
const Register R12_tmp = R12_scratch2;
const Register R2_minus1 = R2;
//__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry();
// TODO: Assert that int is 64 bit sign extended and arrays are not conjoint.
gen_write_ref_array_pre_barrier(R3_from, R4_to, R5_count, dest_uninitialized, R12_tmp, /* preserve: */ R6_ckoff, R7_ckval);
//inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);
Label load_element, store_element, store_null, success, do_card_marks;
__ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
__ li(R8_offset, 0); // Offset from start of arrays.
__ li(R2_minus1, -1);
__ bne(CCR0, load_element);
// Empty array: Nothing to do.
__ li(R3_RET, 0); // Return 0 on (trivial) success.
__ blr();
// ======== begin loop ========
// (Entry is load_element.)
__ align(OptoLoopAlignment);
__ bind(store_element);
if (UseCompressedOops) {
__ encode_heap_oop_not_null(R10_oop);
__ bind(store_null);
__ stw(R10_oop, R8_offset, R4_to);
} else {
__ bind(store_null);
__ std(R10_oop, R8_offset, R4_to);
}
__ addi(R8_offset, R8_offset, heapOopSize); // Step to next offset.
__ add_(R9_remain, R2_minus1, R9_remain); // Decrement the count.
__ beq(CCR0, success);
// ======== loop entry is here ========
__ bind(load_element);
__ load_heap_oop(R10_oop, R8_offset, R3_from, &store_null); // Load the oop.
__ load_klass(R11_klass, R10_oop); // Query the object klass.
generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp,
// Branch to this on success:
store_element);
// ======== end loop ========
// It was a real error; we must depend on the caller to finish the job.
// Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
// Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
// and report their number to the caller.
__ subf_(R5_count, R9_remain, R5_count);
__ nand(R3_RET, R5_count, R5_count); // report (-1^K) to caller
__ bne(CCR0, do_card_marks);
__ blr();
__ bind(success);
__ li(R3_RET, 0);
__ bind(do_card_marks);
// Store check on R4_to[0..R5_count-1].
gen_write_ref_array_post_barrier(R4_to, R5_count, R12_tmp, /* preserve: */ R3_RET);
__ blr();
return start;
}
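The (-1^K) return convention documented above comes from the nand of R5_count with itself: nand(x, x) is ~x, and in two's complement ~K equals -1 ^ K. A standalone sketch (not part of the patch) of the encoding and its caller-side decoding:

    #include <cassert>

    int main() {
      long K = 3;                  // suppose 3 oops were copied before a type check failed
      long ret = ~(K & K);         // what "nand(R3_RET, R5_count, R5_count)" leaves in R3
      assert(ret == (-1L ^ K));    // the documented (-1^K) encoding
      assert(~ret == K);           // the caller recovers K by complementing the result
      return 0;
    }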
// Generate 'unsafe' array copy stub.
// Though just as safe as the other stubs, it takes an unscaled
// size_t argument instead of an element count.
//
// Arguments for generated stub:
// from: R3
// to: R4
// count: R5 byte count, treated as ssize_t, can be zero
//
// Examines the alignment of the operands and dispatches
// to a long, int, short, or byte copy loop.
//
address generate_unsafe_copy(const char* name,
address byte_copy_entry,
address short_copy_entry,
address int_copy_entry,
address long_copy_entry) {
const Register R3_from = R3_ARG1; // source array address
const Register R4_to = R4_ARG2; // destination array address
const Register R5_count = R5_ARG3; // byte count (treated as ssize_t, as long on PPC64)
const Register R6_bits = R6_ARG4; // test copy of low bits
const Register R7_tmp = R7_ARG5;
//__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry();
// Bump this on entry, not on exit:
//inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);
Label short_copy, int_copy, long_copy;
__ orr(R6_bits, R3_from, R4_to);
__ orr(R6_bits, R6_bits, R5_count);
__ andi_(R0, R6_bits, (BytesPerLong-1));
__ beq(CCR0, long_copy);
__ andi_(R0, R6_bits, (BytesPerInt-1));
__ beq(CCR0, int_copy);
__ andi_(R0, R6_bits, (BytesPerShort-1));
__ beq(CCR0, short_copy);
// byte_copy:
__ b(byte_copy_entry);
__ bind(short_copy);
__ srwi(R5_count, R5_count, LogBytesPerShort);
__ b(short_copy_entry);
__ bind(int_copy);
__ srwi(R5_count, R5_count, LogBytesPerInt);
__ b(int_copy_entry);
__ bind(long_copy);
__ srwi(R5_count, R5_count, LogBytesPerLong);
__ b(long_copy_entry);
return start;
}
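// The dispatch above relies on OR-ing both addresses and the byte count
// together: the low bits of the result are zero only if all three share that
// alignment. A rough C++ equivalent (illustrative only, not part of the stub):

#include <cstdint>
#include <cstddef>

static int unsafe_copy_log2_width(uintptr_t from, uintptr_t to, size_t bytes) {
  uintptr_t bits = from | to | bytes;
  if ((bits & (8 - 1)) == 0) return 3;   // long copy,  count = bytes >> 3
  if ((bits & (4 - 1)) == 0) return 2;   // int copy,   count = bytes >> 2
  if ((bits & (2 - 1)) == 0) return 1;   // short copy, count = bytes >> 1
  return 0;                              // byte copy
}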
// Perform range checks on the proposed arraycopy.
// Kills the two temps, but nothing else.
// Also, clean the sign bits of src_pos and dst_pos.
void arraycopy_range_checks(Register src, // source array oop
Register src_pos, // source position
Register dst, // destination array oop
Register dst_pos, // destination position
Register length, // length of copy
Register temp1, Register temp2,
Label& L_failed) {
BLOCK_COMMENT("arraycopy_range_checks:");
const Register array_length = temp1; // scratch
const Register end_pos = temp2; // scratch
// if (src_pos + length > arrayOop(src)->length() ) FAIL;
__ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
__ add(end_pos, src_pos, length); // src_pos + length
__ cmpd(CCR0, end_pos, array_length);
__ bgt(CCR0, L_failed);
// if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
__ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
__ add(end_pos, dst_pos, length); // dst_pos + length
__ cmpd(CCR0, end_pos, array_length);
__ bgt(CCR0, L_failed);
BLOCK_COMMENT("arraycopy_range_checks done");
}
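// In C terms the two checks above are simply (src_len/dst_len being the 32-bit
// array length fields loaded with lwa; names illustrative):

static bool arraycopy_in_range(long src_pos, long dst_pos, long length,
                               int src_len, int dst_len) {
  if (src_pos + length > src_len) return false;  // branch to L_failed
  if (dst_pos + length > dst_len) return false;  // branch to L_failed
  return true;
}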
//
// Generate generic array copy stubs
//
// Input:
// R3 - src oop
// R4 - src_pos
// R5 - dst oop
// R6 - dst_pos
// R7 - element count
//
// Output:
// R3 == 0 - success
// R3 == -1 - need to call System.arraycopy
//
address generate_generic_copy(const char *name,
address entry_jbyte_arraycopy,
address entry_jshort_arraycopy,
address entry_jint_arraycopy,
address entry_oop_arraycopy,
address entry_disjoint_oop_arraycopy,
address entry_jlong_arraycopy,
address entry_checkcast_arraycopy) {
Label L_failed, L_objArray;
// Input registers
const Register src = R3_ARG1; // source array oop
const Register src_pos = R4_ARG2; // source position
const Register dst = R5_ARG3; // destination array oop
const Register dst_pos = R6_ARG4; // destination position
const Register length = R7_ARG5; // elements count
// registers used as temp
const Register src_klass = R8_ARG6; // source array klass
const Register dst_klass = R9_ARG7; // destination array klass
const Register lh = R10_ARG8; // layout helper
const Register temp = R2;
//__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry();
// Bump this on entry, not on exit:
//inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
// In principle, the int arguments could be dirty.
//-----------------------------------------------------------------------
// Assembler stubs will be used for this call to arraycopy
// if the following conditions are met:
//
// (1) src and dst must not be null.
// (2) src_pos must not be negative.
// (3) dst_pos must not be negative.
// (4) length must not be negative.
// (5) src klass and dst klass should be the same and not NULL.
// (6) src and dst should be arrays.
// (7) src_pos + length must not exceed length of src.
// (8) dst_pos + length must not exceed length of dst.
BLOCK_COMMENT("arraycopy initial argument checks");
__ cmpdi(CCR1, src, 0); // if (src == NULL) return -1;
__ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
__ cmpdi(CCR5, dst, 0); // if (dst == NULL) return -1;
__ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
__ extsw_(dst_pos, dst_pos); // if (dst_pos < 0) return -1;
__ cror(CCR5, Assembler::equal, CCR0, Assembler::less);
__ extsw_(length, length); // if (length < 0) return -1;
__ cror(CCR1, Assembler::equal, CCR5, Assembler::equal);
__ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
__ beq(CCR1, L_failed);
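// The checks above fold all five conditions into a single branch: the cmpdi
// instructions record the NULL tests in CCR1/CCR5, each extsw_ records the
// sign of the value it just extended in CCR0, and the cror instructions OR
// those bits into CCR1's 'equal' bit, so one beq(CCR1, L_failed) fires if
// src == NULL, dst == NULL, src_pos < 0, dst_pos < 0 or length < 0.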
BLOCK_COMMENT("arraycopy argument klass checks");
__ load_klass(src_klass, src);
__ load_klass(dst_klass, dst);
// Load layout helper
//
//  |array_tag|     | header_size | element_type |     |log2_element_size|
// 32        30    24            16              8     2                 0
//
// array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
//
int lh_offset = in_bytes(Klass::layout_helper_offset());
// Load the 32-bit signed layout helper value.
__ lwz(lh, lh_offset, src_klass);
// Handle objArrays completely differently...
jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
__ load_const_optimized(temp, objArray_lh, R0);
__ cmpw(CCR0, lh, temp);
__ beq(CCR0, L_objArray);
__ cmpd(CCR5, src_klass, dst_klass); // if (src->klass() != dst->klass()) return -1;
__ cmpwi(CCR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;
__ crnand(CCR5, Assembler::equal, CCR6, Assembler::less);
__ beq(CCR5, L_failed);
// At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
{ Label L;
jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
__ load_const_optimized(temp, lh_prim_tag_in_place, R0);
__ cmpw(CCR0, lh, temp);
__ bge(CCR0, L);
__ stop("must be a primitive array");
__ bind(L);
}
#endif
arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
temp, dst_klass, L_failed);
// TypeArrayKlass
//
// src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
// dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
//
const Register offset = dst_klass; // array offset
const Register elsize = src_klass; // log2 element size
__ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
__ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
__ add(src, offset, src); // src array offset
__ add(dst, offset, dst); // dst array offset
// Next registers should be set before the jump to corresponding stub.
const Register from = R3_ARG1; // source array address
const Register to = R4_ARG2; // destination array address
const Register count = R5_ARG3; // elements count
// 'from', 'to', 'count' registers should be set in this order
// since they are the same as 'src', 'src_pos', 'dst'.
BLOCK_COMMENT("scale indexes to element size");
__ sld(src_pos, src_pos, elsize);
__ sld(dst_pos, dst_pos, elsize);
__ add(from, src_pos, src); // src_addr
__ add(to, dst_pos, dst); // dst_addr
__ mr(count, length); // length
BLOCK_COMMENT("choose copy loop based on element size");
// Using conditional branches with range 32kB.
const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CCR0, Assembler::equal);
__ cmpwi(CCR0, elsize, 0);
__ bc(bo, bi, entry_jbyte_arraycopy);
__ cmpwi(CCR0, elsize, LogBytesPerShort);
__ bc(bo, bi, entry_jshort_arraycopy);
__ cmpwi(CCR0, elsize, LogBytesPerInt);
__ bc(bo, bi, entry_jint_arraycopy);
#ifdef ASSERT
{ Label L;
__ cmpwi(CCR0, elsize, LogBytesPerLong);
__ beq(CCR0, L);
__ stop("must be long copy, but elsize is wrong");
__ bind(L);
}
#endif
__ b(entry_jlong_arraycopy);
// ObjArrayKlass
__ bind(L_objArray);
// live at this point: src_klass, dst_klass, src[_pos], dst[_pos], length
Label L_disjoint_plain_copy, L_checkcast_copy;
// test array classes for subtyping
__ cmpd(CCR0, src_klass, dst_klass); // usual case is exact equality
__ bne(CCR0, L_checkcast_copy);
// Identically typed arrays can be copied without element-wise checks.
arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
temp, lh, L_failed);
__ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
__ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
__ sldi(src_pos, src_pos, LogBytesPerHeapOop);
__ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
__ add(from, src_pos, src); // src_addr
__ add(to, dst_pos, dst); // dst_addr
__ mr(count, length); // length
__ b(entry_oop_arraycopy);
__ bind(L_checkcast_copy);
// live at this point: src_klass, dst_klass
{
// Before looking at dst.length, make sure dst is also an objArray.
__ lwz(temp, lh_offset, dst_klass);
__ cmpw(CCR0, lh, temp);
__ bne(CCR0, L_failed);
// It is safe to examine both src.length and dst.length.
arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
temp, lh, L_failed);
// Marshal the base address arguments now, freeing registers.
__ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
__ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
__ sldi(src_pos, src_pos, LogBytesPerHeapOop);
__ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
__ add(from, src_pos, src); // src_addr
__ add(to, dst_pos, dst); // dst_addr
__ mr(count, length); // length
Register sco_temp = R6_ARG4; // This register is free now.
assert_different_registers(from, to, count, sco_temp,
dst_klass, src_klass);
// Generate the type check.
int sco_offset = in_bytes(Klass::super_check_offset_offset());
__ lwz(sco_temp, sco_offset, dst_klass);
generate_type_check(src_klass, sco_temp, dst_klass,
temp, L_disjoint_plain_copy);
// Fetch destination element klass from the ObjArrayKlass header.
int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
// The checkcast_copy loop needs two extra arguments:
__ ld(R7_ARG5, ek_offset, dst_klass); // dest elem klass
__ lwz(R6_ARG4, sco_offset, R7_ARG5); // sco of elem klass
__ b(entry_checkcast_arraycopy);
}
__ bind(L_disjoint_plain_copy);
__ b(entry_disjoint_oop_arraycopy);
__ bind(L_failed);
__ li(R3_RET, -1); // return -1
__ blr();
return start;
}
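// For the typeArray path above, the layout helper carries everything needed to
// form the raw copy arguments. A compact C++ sketch of that decode (bit
// positions follow the diagram in the comments; the struct and names are
// illustrative only):

#include <cstdint>

struct PrimitiveCopyArgs { char* from; char* to; long count; int log2_size; };

static PrimitiveCopyArgs decode_primitive_copy(char* src, long src_pos,
                                               char* dst, long dst_pos,
                                               long length, int32_t lh) {
  int header = (lh >> 16) & 0xff;        // array_header_in_bytes()
  int elsize =  lh        & 0xff;        // log2(element size)
  PrimitiveCopyArgs a;
  a.from      = src + header + (src_pos << elsize);
  a.to        = dst + header + (dst_pos << elsize);
  a.count     = length;                  // element count, passed unscaled
  a.log2_size = elsize;                  // selects the jbyte/jshort/jint/jlong entry
  return a;
}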
void generate_arraycopy_stubs() { void generate_arraycopy_stubs() {
// Note: the disjoint stubs must be generated first, some of // Note: the disjoint stubs must be generated first, some of
// the conjoint stubs use them. // the conjoint stubs use them.
@ -1998,6 +2426,24 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false); StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", true); StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", true);
// special/generic versions
StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", false);
StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", true);
StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
STUB_ENTRY(jbyte_arraycopy),
STUB_ENTRY(jshort_arraycopy),
STUB_ENTRY(jint_arraycopy),
STUB_ENTRY(jlong_arraycopy));
StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
STUB_ENTRY(jbyte_arraycopy),
STUB_ENTRY(jshort_arraycopy),
STUB_ENTRY(jint_arraycopy),
STUB_ENTRY(oop_arraycopy),
STUB_ENTRY(oop_disjoint_arraycopy),
STUB_ENTRY(jlong_arraycopy),
STUB_ENTRY(checkcast_arraycopy));
// fill routines // fill routines
StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
@ -2228,6 +2674,15 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_multiplyToLen = generate_multiplyToLen(); StubRoutines::_multiplyToLen = generate_multiplyToLen();
} }
#endif #endif
if (UseMontgomeryMultiplyIntrinsic) {
StubRoutines::_montgomeryMultiply
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
}
if (UseMontgomerySquareIntrinsic) {
StubRoutines::_montgomerySquare
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
}
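// These shared-runtime entries back the BigInteger Montgomery intrinsics. For
// reference, the single-word REDC step they are built around can be sketched
// as below (illustrative only; assumes n odd, a and b already reduced mod n,
// and n < 2^63 so the 128-bit intermediate cannot overflow; __int128 is a
// GCC/Clang extension):

#include <cstdint>

static uint64_t montgomery_multiply_sketch(uint64_t a, uint64_t b,
                                           uint64_t n, uint64_t n_prime) {
  // n_prime = -n^-1 mod 2^64; result is a * b * 2^-64 mod n.
  unsigned __int128 t = (unsigned __int128)a * b;
  uint64_t m = (uint64_t)t * n_prime;
  unsigned __int128 u = (t + (unsigned __int128)m * n) >> 64;
  uint64_t r = (uint64_t)u;
  return (r >= n) ? r - n : r;
}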
} }
public: public:

View file

@ -34,7 +34,7 @@
// CRC32 Intrinsics. // CRC32 Intrinsics.
void StubRoutines::ppc64::generate_load_crc_table_addr(MacroAssembler* masm, Register table) { void StubRoutines::ppc64::generate_load_crc_table_addr(MacroAssembler* masm, Register table) {
__ load_const(table, StubRoutines::_crc_table_adr); __ load_const_optimized(table, StubRoutines::_crc_table_adr, R0);
} }
// CRC32 Intrinsics. // CRC32 Intrinsics.

View file

@ -749,34 +749,33 @@ void TemplateInterpreterGenerator::generate_counter_incr(Label* overflow, Label*
if (TieredCompilation) { if (TieredCompilation) {
const int increment = InvocationCounter::count_increment; const int increment = InvocationCounter::count_increment;
const int mask = ((1 << Tier0InvokeNotifyFreqLog) - 1) << InvocationCounter::count_shift;
Label no_mdo; Label no_mdo;
if (ProfileInterpreter) { if (ProfileInterpreter) {
const Register Rmdo = Rscratch1; const Register Rmdo = R3_counters;
// If no method data exists, go to profile_continue. // If no method data exists, go to profile_continue.
__ ld(Rmdo, in_bytes(Method::method_data_offset()), R19_method); __ ld(Rmdo, in_bytes(Method::method_data_offset()), R19_method);
__ cmpdi(CCR0, Rmdo, 0); __ cmpdi(CCR0, Rmdo, 0);
__ beq(CCR0, no_mdo); __ beq(CCR0, no_mdo);
// Increment backedge counter in the MDO. // Increment invocation counter in the MDO.
const int mdo_bc_offs = in_bytes(MethodData::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset()); const int mdo_ic_offs = in_bytes(MethodData::invocation_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
__ lwz(Rscratch2, mdo_bc_offs, Rmdo); __ lwz(Rscratch2, mdo_ic_offs, Rmdo);
__ lwz(Rscratch1, in_bytes(MethodData::invoke_mask_offset()), Rmdo);
__ addi(Rscratch2, Rscratch2, increment); __ addi(Rscratch2, Rscratch2, increment);
__ stw(Rscratch2, mdo_bc_offs, Rmdo); __ stw(Rscratch2, mdo_ic_offs, Rmdo);
__ load_const_optimized(Rscratch1, mask, R0);
__ and_(Rscratch1, Rscratch2, Rscratch1); __ and_(Rscratch1, Rscratch2, Rscratch1);
__ bne(CCR0, done); __ bne(CCR0, done);
__ b(*overflow); __ b(*overflow);
} }
// Increment counter in MethodCounters*. // Increment counter in MethodCounters*.
const int mo_bc_offs = in_bytes(MethodCounters::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset()); const int mo_ic_offs = in_bytes(MethodCounters::invocation_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
__ bind(no_mdo); __ bind(no_mdo);
__ get_method_counters(R19_method, R3_counters, done); __ get_method_counters(R19_method, R3_counters, done);
__ lwz(Rscratch2, mo_bc_offs, R3_counters); __ lwz(Rscratch2, mo_ic_offs, R3_counters);
__ lwz(Rscratch1, in_bytes(MethodCounters::invoke_mask_offset()), R3_counters);
__ addi(Rscratch2, Rscratch2, increment); __ addi(Rscratch2, Rscratch2, increment);
__ stw(Rscratch2, mo_bc_offs, R3_counters); __ stw(Rscratch2, mo_ic_offs, R3_counters);
__ load_const_optimized(Rscratch1, mask, R0);
__ and_(Rscratch1, Rscratch2, Rscratch1); __ and_(Rscratch1, Rscratch2, Rscratch1);
__ beq(CCR0, *overflow); __ beq(CCR0, *overflow);
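// The functional change above is that the notification mask is now loaded from
// MethodData/MethodCounters instead of being materialized as a compile-time
// constant; the test itself still amounts to (illustrative names):

static bool bump_counter_and_check(int* counter, int increment, int mask) {
  int c = *counter + increment;
  *counter = c;
  return (c & mask) == 0;   // true -> branch to the overflow/notification path
}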
@ -797,8 +796,7 @@ void TemplateInterpreterGenerator::generate_counter_incr(Label* overflow, Label*
// Check if we must create a method data obj. // Check if we must create a method data obj.
if (ProfileInterpreter && profile_method != NULL) { if (ProfileInterpreter && profile_method != NULL) {
const Register profile_limit = Rscratch1; const Register profile_limit = Rscratch1;
int pl_offs = __ load_const_optimized(profile_limit, &InvocationCounter::InterpreterProfileLimit, R0, true); __ lwz(profile_limit, in_bytes(MethodCounters::interpreter_profile_limit_offset()), R3_counters);
__ lwz(profile_limit, pl_offs, profile_limit);
// Test to see if we should create a method data oop. // Test to see if we should create a method data oop.
__ cmpw(CCR0, Rsum_ivc_bec, profile_limit); __ cmpw(CCR0, Rsum_ivc_bec, profile_limit);
__ blt(CCR0, *profile_method_continue); __ blt(CCR0, *profile_method_continue);
@ -808,9 +806,7 @@ void TemplateInterpreterGenerator::generate_counter_incr(Label* overflow, Label*
// Finally check for counter overflow. // Finally check for counter overflow.
if (overflow) { if (overflow) {
const Register invocation_limit = Rscratch1; const Register invocation_limit = Rscratch1;
int il_offs = __ load_const_optimized(invocation_limit, &InvocationCounter::InterpreterInvocationLimit, R0, true); __ lwz(invocation_limit, in_bytes(MethodCounters::interpreter_invocation_limit_offset()), R3_counters);
__ lwz(invocation_limit, il_offs, invocation_limit);
assert(4 == sizeof(InvocationCounter::InterpreterInvocationLimit), "unexpected field size");
__ cmpw(CCR0, Rsum_ivc_bec, invocation_limit); __ cmpw(CCR0, Rsum_ivc_bec, invocation_limit);
__ bge(CCR0, *overflow); __ bge(CCR0, *overflow);
} }

View file

@ -1624,12 +1624,13 @@ void TemplateTable::branch(bool is_jsr, bool is_wide) {
// -------------------------------------------------------------------------- // --------------------------------------------------------------------------
// Normal (non-jsr) branch handling // Normal (non-jsr) branch handling
// Bump bytecode pointer by displacement (take the branch).
__ add(R14_bcp, Rdisp, R14_bcp); // Add to bc addr.
const bool increment_invocation_counter_for_backward_branches = UseCompiler && UseLoopCounter; const bool increment_invocation_counter_for_backward_branches = UseCompiler && UseLoopCounter;
if (increment_invocation_counter_for_backward_branches) { if (increment_invocation_counter_for_backward_branches) {
//__ unimplemented("branch invocation counter");
Label Lforward; Label Lforward;
__ add(R14_bcp, Rdisp, R14_bcp); // Add to bc addr. __ dispatch_prolog(vtos);
// Check branch direction. // Check branch direction.
__ cmpdi(CCR0, Rdisp, 0); __ cmpdi(CCR0, Rdisp, 0);
@ -1640,7 +1641,6 @@ void TemplateTable::branch(bool is_jsr, bool is_wide) {
if (TieredCompilation) { if (TieredCompilation) {
Label Lno_mdo, Loverflow; Label Lno_mdo, Loverflow;
const int increment = InvocationCounter::count_increment; const int increment = InvocationCounter::count_increment;
const int mask = ((1 << Tier0BackedgeNotifyFreqLog) - 1) << InvocationCounter::count_shift;
if (ProfileInterpreter) { if (ProfileInterpreter) {
Register Rmdo = Rscratch1; Register Rmdo = Rscratch1;
@ -1652,7 +1652,7 @@ void TemplateTable::branch(bool is_jsr, bool is_wide) {
// Increment backedge counter in the MDO. // Increment backedge counter in the MDO.
const int mdo_bc_offs = in_bytes(MethodData::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset()); const int mdo_bc_offs = in_bytes(MethodData::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
__ lwz(Rscratch2, mdo_bc_offs, Rmdo); __ lwz(Rscratch2, mdo_bc_offs, Rmdo);
__ load_const_optimized(Rscratch3, mask, R0); __ lwz(Rscratch3, in_bytes(MethodData::backedge_mask_offset()), Rmdo);
__ addi(Rscratch2, Rscratch2, increment); __ addi(Rscratch2, Rscratch2, increment);
__ stw(Rscratch2, mdo_bc_offs, Rmdo); __ stw(Rscratch2, mdo_bc_offs, Rmdo);
__ and_(Rscratch3, Rscratch2, Rscratch3); __ and_(Rscratch3, Rscratch2, Rscratch3);
@ -1664,19 +1664,19 @@ void TemplateTable::branch(bool is_jsr, bool is_wide) {
const int mo_bc_offs = in_bytes(MethodCounters::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset()); const int mo_bc_offs = in_bytes(MethodCounters::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset());
__ bind(Lno_mdo); __ bind(Lno_mdo);
__ lwz(Rscratch2, mo_bc_offs, R4_counters); __ lwz(Rscratch2, mo_bc_offs, R4_counters);
__ load_const_optimized(Rscratch3, mask, R0); __ lwz(Rscratch3, in_bytes(MethodCounters::backedge_mask_offset()), R4_counters);
__ addi(Rscratch2, Rscratch2, increment); __ addi(Rscratch2, Rscratch2, increment);
__ stw(Rscratch2, mo_bc_offs, R19_method); __ stw(Rscratch2, mo_bc_offs, R4_counters);
__ and_(Rscratch3, Rscratch2, Rscratch3); __ and_(Rscratch3, Rscratch2, Rscratch3);
__ bne(CCR0, Lforward); __ bne(CCR0, Lforward);
__ bind(Loverflow); __ bind(Loverflow);
// Notify point for loop, pass branch bytecode. // Notify point for loop, pass branch bytecode.
__ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::frequency_counter_overflow), R14_bcp, true); __ subf(R4_ARG2, Rdisp, R14_bcp); // Compute branch bytecode (previous bcp).
__ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::frequency_counter_overflow), R4_ARG2, true);
// Was an OSR adapter generated? // Was an OSR adapter generated?
// O0 = osr nmethod
__ cmpdi(CCR0, R3_RET, 0); __ cmpdi(CCR0, R3_RET, 0);
__ beq(CCR0, Lforward); __ beq(CCR0, Lforward);
@ -1712,27 +1712,23 @@ void TemplateTable::branch(bool is_jsr, bool is_wide) {
__ increment_backedge_counter(R4_counters, invoke_ctr, Rscratch2, Rscratch3); __ increment_backedge_counter(R4_counters, invoke_ctr, Rscratch2, Rscratch3);
if (ProfileInterpreter) { if (ProfileInterpreter) {
__ test_invocation_counter_for_mdp(invoke_ctr, Rscratch2, Lforward); __ test_invocation_counter_for_mdp(invoke_ctr, R4_counters, Rscratch2, Lforward);
if (UseOnStackReplacement) { if (UseOnStackReplacement) {
__ test_backedge_count_for_osr(bumped_count, R14_bcp, Rscratch2); __ test_backedge_count_for_osr(bumped_count, R4_counters, R14_bcp, Rdisp, Rscratch2);
} }
} else { } else {
if (UseOnStackReplacement) { if (UseOnStackReplacement) {
__ test_backedge_count_for_osr(invoke_ctr, R14_bcp, Rscratch2); __ test_backedge_count_for_osr(invoke_ctr, R4_counters, R14_bcp, Rdisp, Rscratch2);
} }
} }
} }
__ bind(Lforward); __ bind(Lforward);
__ dispatch_epilog(vtos);
} else { } else {
// Bump bytecode pointer by displacement (take the branch).
__ add(R14_bcp, Rdisp, R14_bcp); // Add to bc addr.
}
// Continue with bytecode @ target.
// %%%%% Like Intel, could speed things up by moving bytecode fetch to code above,
// %%%%% and changing dispatch_next to dispatch_only.
__ dispatch_next(vtos); __ dispatch_next(vtos);
}
} }
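// On backedge-counter overflow the interpreter now notifies the runtime with
// the bcp of the branch bytecode (computed above as R14_bcp - Rdisp) and, if
// an OSR nmethod came back (the R3_RET != 0 check), migrates into it;
// otherwise it falls through to Lforward and keeps interpreting. Sketch with
// illustrative names:

static void on_backedge_overflow(const unsigned char* branch_bcp,
                                 void* (*frequency_counter_overflow)(const unsigned char*),
                                 void (*enter_osr)(void* osr_nmethod)) {
  void* osr = frequency_counter_overflow(branch_bcp);  // InterpreterRuntime call
  if (osr != nullptr) {
    enter_osr(osr);   // corresponds to the code following the R3_RET check
  }
}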
// Helper function for if_cmp* methods below. // Helper function for if_cmp* methods below.

View file

@ -37,9 +37,6 @@
# include <sys/sysinfo.h> # include <sys/sysinfo.h>
int VM_Version::_features = VM_Version::unknown_m;
int VM_Version::_measured_cache_line_size = 32; // pessimistic init value
const char* VM_Version::_features_str = "";
bool VM_Version::_is_determine_features_test_running = false; bool VM_Version::_is_determine_features_test_running = false;
@ -56,7 +53,7 @@ void VM_Version::initialize() {
// If PowerArchitecturePPC64 hasn't been specified explicitly determine from features. // If PowerArchitecturePPC64 hasn't been specified explicitly determine from features.
if (FLAG_IS_DEFAULT(PowerArchitecturePPC64)) { if (FLAG_IS_DEFAULT(PowerArchitecturePPC64)) {
if (VM_Version::has_lqarx()) { if (VM_Version::has_tcheck() && VM_Version::has_lqarx()) {
FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 8); FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 8);
} else if (VM_Version::has_popcntw()) { } else if (VM_Version::has_popcntw()) {
FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 7); FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 7);
@ -68,10 +65,19 @@ void VM_Version::initialize() {
FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 0); FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 0);
} }
} }
guarantee(PowerArchitecturePPC64 == 0 || PowerArchitecturePPC64 == 5 ||
PowerArchitecturePPC64 == 6 || PowerArchitecturePPC64 == 7 || bool PowerArchitecturePPC64_ok = false;
PowerArchitecturePPC64 == 8, switch (PowerArchitecturePPC64) {
"PowerArchitecturePPC64 should be 0, 5, 6, 7, or 8"); case 8: if (!VM_Version::has_tcheck() ) break;
if (!VM_Version::has_lqarx() ) break;
case 7: if (!VM_Version::has_popcntw()) break;
case 6: if (!VM_Version::has_cmpb() ) break;
case 5: if (!VM_Version::has_popcntb()) break;
case 0: PowerArchitecturePPC64_ok = true; break;
default: break;
}
guarantee(PowerArchitecturePPC64_ok, "PowerArchitecturePPC64 cannot be set to "
UINTX_FORMAT " on this machine", PowerArchitecturePPC64);
// Power 8: Configure Data Stream Control Register. // Power 8: Configure Data Stream Control Register.
if (PowerArchitecturePPC64 >= 8) { if (PowerArchitecturePPC64 >= 8) {
@ -122,7 +128,7 @@ void VM_Version::initialize() {
(has_tcheck() ? " tcheck" : "") (has_tcheck() ? " tcheck" : "")
// Make sure number of %s matches num_features! // Make sure number of %s matches num_features!
); );
_features_str = os::strdup(buf); _features_string = os::strdup(buf);
if (Verbose) { if (Verbose) {
print_features(); print_features();
} }
@ -132,9 +138,15 @@ void VM_Version::initialize() {
// and 'atomic long memory ops' (see Unsafe_GetLongVolatile). // and 'atomic long memory ops' (see Unsafe_GetLongVolatile).
_supports_cx8 = true; _supports_cx8 = true;
// Used by C1.
_supports_atomic_getset4 = true;
_supports_atomic_getadd4 = true;
_supports_atomic_getset8 = true;
_supports_atomic_getadd8 = true;
UseSSE = 0; // Only on x86 and x64 UseSSE = 0; // Only on x86 and x64
intx cache_line_size = _measured_cache_line_size; intx cache_line_size = L1_data_cache_line_size();
if (FLAG_IS_DEFAULT(AllocatePrefetchStyle)) AllocatePrefetchStyle = 1; if (FLAG_IS_DEFAULT(AllocatePrefetchStyle)) AllocatePrefetchStyle = 1;
@ -184,6 +196,11 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseAESIntrinsics, false); FLAG_SET_DEFAULT(UseAESIntrinsics, false);
} }
if (UseAESCTRIntrinsics) {
warning("AES/CTR intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
}
if (UseGHASHIntrinsics) { if (UseGHASHIntrinsics) {
warning("GHASH intrinsics are not available on this CPU"); warning("GHASH intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
@ -208,6 +225,18 @@ void VM_Version::initialize() {
if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
UseMultiplyToLenIntrinsic = true; UseMultiplyToLenIntrinsic = true;
} }
if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) {
UseMontgomeryMultiplyIntrinsic = true;
}
if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) {
UseMontgomerySquareIntrinsic = true;
}
if (UseVectorizedMismatchIntrinsic) {
warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
}
// Adjust RTM (Restricted Transactional Memory) flags. // Adjust RTM (Restricted Transactional Memory) flags.
if (UseRTMLocking) { if (UseRTMLocking) {
@ -276,11 +305,9 @@ void VM_Version::initialize() {
} }
} }
// This machine does not allow unaligned memory accesses // This machine allows unaligned memory accesses
if (UseUnalignedAccesses) { if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) {
if (!FLAG_IS_DEFAULT(UseUnalignedAccesses)) FLAG_SET_DEFAULT(UseUnalignedAccesses, true);
warning("Unaligned memory access is not available on this CPU");
FLAG_SET_DEFAULT(UseUnalignedAccesses, false);
} }
} }
@ -306,7 +333,7 @@ bool VM_Version::use_biased_locking() {
} }
void VM_Version::print_features() { void VM_Version::print_features() {
tty->print_cr("Version: %s cache_line_size = %d", cpu_features(), (int) get_cache_line_size()); tty->print_cr("Version: %s L1_data_cache_line_size=%d", features_string(), L1_data_cache_line_size());
} }
#ifdef COMPILER2 #ifdef COMPILER2
@ -607,7 +634,7 @@ void VM_Version::determine_features() {
int count = 0; // count zeroed bytes int count = 0; // count zeroed bytes
for (int i = 0; i < BUFFER_SIZE; i++) if (test_area[i] == 0) count++; for (int i = 0; i < BUFFER_SIZE; i++) if (test_area[i] == 0) count++;
guarantee(is_power_of_2(count), "cache line size needs to be a power of 2"); guarantee(is_power_of_2(count), "cache line size needs to be a power of 2");
_measured_cache_line_size = count; _L1_data_cache_line_size = count;
// Execute code. Illegal instructions will be replaced by 0 in the signal handler. // Execute code. Illegal instructions will be replaced by 0 in the signal handler.
VM_Version::_is_determine_features_test_running = true; VM_Version::_is_determine_features_test_running = true;
@ -705,7 +732,7 @@ void VM_Version::config_dscr() {
} }
} }
static int saved_features = 0; static uint64_t saved_features = 0;
void VM_Version::allow_all() { void VM_Version::allow_all() {
saved_features = _features; saved_features = _features;

View file

@ -62,11 +62,9 @@ protected:
vcipher_m = (1 << vcipher), vcipher_m = (1 << vcipher),
vpmsumb_m = (1 << vpmsumb), vpmsumb_m = (1 << vpmsumb),
tcheck_m = (1 << tcheck ), tcheck_m = (1 << tcheck ),
all_features_m = -1 all_features_m = (unsigned long)-1
}; };
static int _features;
static int _measured_cache_line_size;
static const char* _features_str;
static bool _is_determine_features_test_running; static bool _is_determine_features_test_running;
static void print_features(); static void print_features();
@ -97,10 +95,6 @@ public:
static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; } static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; }
static bool has_tcheck() { return (_features & tcheck_m) != 0; } static bool has_tcheck() { return (_features & tcheck_m) != 0; }
static const char* cpu_features() { return _features_str; }
static int get_cache_line_size() { return _measured_cache_line_size; }
// Assembler testing // Assembler testing
static void allow_all(); static void allow_all();
static void revert(); static void revert();

View file

@ -76,7 +76,8 @@ VtableStub* VtableStubs::create_vtable_stub(int vtable_index) {
// We might take an implicit NULL fault here. // We might take an implicit NULL fault here.
address npe_addr = __ pc(); // npe = null pointer exception address npe_addr = __ pc(); // npe = null pointer exception
__ load_klass_with_trap_null_check(rcvr_klass, R3); __ null_check(R3, oopDesc::klass_offset_in_bytes(), /*implicit only*/NULL);
__ load_klass(rcvr_klass, R3);
// Set method (in case of interpreted method), and destination address. // Set method (in case of interpreted method), and destination address.
int entry_offset = InstanceKlass::vtable_start_offset() + vtable_index*vtableEntry::size(); int entry_offset = InstanceKlass::vtable_start_offset() + vtable_index*vtableEntry::size();
@ -111,8 +112,8 @@ VtableStub* VtableStubs::create_vtable_stub(int vtable_index) {
// If the vtable entry is null, the method is abstract. // If the vtable entry is null, the method is abstract.
address ame_addr = __ pc(); // ame = abstract method error address ame_addr = __ pc(); // ame = abstract method error
__ null_check(R19_method, in_bytes(Method::from_compiled_offset()), /*implicit only*/NULL);
__ load_with_trap_null_check(R12_scratch2, in_bytes(Method::from_compiled_offset()), R19_method); __ ld(R12_scratch2, in_bytes(Method::from_compiled_offset()), R19_method);
__ mtctr(R12_scratch2); __ mtctr(R12_scratch2);
__ bctr(); __ bctr();
masm->flush(); masm->flush();
@ -158,7 +159,8 @@ VtableStub* VtableStubs::create_itable_stub(int vtable_index) {
// We might take an implicit NULL fault here. // We might take an implicit NULL fault here.
address npe_addr = __ pc(); // npe = null pointer exception address npe_addr = __ pc(); // npe = null pointer exception
__ load_klass_with_trap_null_check(rcvr_klass, R3_ARG1); __ null_check(R3_ARG1, oopDesc::klass_offset_in_bytes(), /*implicit only*/NULL);
__ load_klass(rcvr_klass, R3_ARG1);
BLOCK_COMMENT("Load start of itable entries into itable_entry."); BLOCK_COMMENT("Load start of itable entries into itable_entry.");
__ lwz(vtable_len, InstanceKlass::vtable_length_offset() * wordSize, rcvr_klass); __ lwz(vtable_len, InstanceKlass::vtable_length_offset() * wordSize, rcvr_klass);
@ -217,15 +219,7 @@ VtableStub* VtableStubs::create_itable_stub(int vtable_index) {
address ame_addr = __ pc(); // ame = abstract method error address ame_addr = __ pc(); // ame = abstract method error
// Must do an explicit check if implicit checks are disabled. // Must do an explicit check if implicit checks are disabled.
assert(!MacroAssembler::needs_explicit_null_check(in_bytes(Method::from_compiled_offset())), "sanity"); __ null_check(R19_method, in_bytes(Method::from_compiled_offset()), &throw_icce);
if (!ImplicitNullChecks || !os::zero_page_read_protected()) {
if (TrapBasedNullChecks) {
__ trap_null_check(R19_method);
} else {
__ cmpdi(CCR0, R19_method, 0);
__ beq(CCR0, throw_icce);
}
}
__ ld(R12_scratch2, in_bytes(Method::from_compiled_offset()), R19_method); __ ld(R12_scratch2, in_bytes(Method::from_compiled_offset()), R19_method);
__ mtctr(R12_scratch2); __ mtctr(R12_scratch2);
__ bctr(); __ bctr();

View file

@ -677,11 +677,8 @@ class Assembler : public AbstractAssembler {
protected: protected:
// Insert a nop if the previous instruction is a cbcond // Insert a nop if the previous instruction is a cbcond
void insert_nop_after_cbcond() { inline void insert_nop_after_cbcond();
if (UseCBCond && cbcond_before()) {
nop();
}
}
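// The body removed here presumably moves to an inline definition elsewhere
// (the destination file is not shown in this hunk); something along the lines
// of:

inline void Assembler::insert_nop_after_cbcond() {
  if (UseCBCond && cbcond_before()) {
    nop();
  }
}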
// Delay slot helpers // Delay slot helpers
// cti is called when emitting control-transfer instruction, // cti is called when emitting control-transfer instruction,
// BEFORE doing the emitting. // BEFORE doing the emitting.
@ -739,7 +736,7 @@ public:
} }
inline void emit_int32(int); // shadows AbstractAssembler::emit_int32 inline void emit_int32(int); // shadows AbstractAssembler::emit_int32
inline void emit_data(int x) { emit_int32(x); } inline void emit_data(int x);
inline void emit_data(int, RelocationHolder const&); inline void emit_data(int, RelocationHolder const&);
inline void emit_data(int, relocInfo::relocType rtype); inline void emit_data(int, relocInfo::relocType rtype);
// helper for above fcns // helper for above fcns
@ -754,31 +751,31 @@ public:
inline void add(Register s1, Register s2, Register d ); inline void add(Register s1, Register s2, Register d );
inline void add(Register s1, int simm13a, Register d ); inline void add(Register s1, int simm13a, Register d );
void addcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(add_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); } inline void addcc( Register s1, Register s2, Register d );
void addcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(add_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void addcc( Register s1, int simm13a, Register d );
void addc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(addc_op3 ) | rs1(s1) | rs2(s2) ); } inline void addc( Register s1, Register s2, Register d );
void addc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(addc_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void addc( Register s1, int simm13a, Register d );
void addccc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(addc_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); } inline void addccc( Register s1, Register s2, Register d );
void addccc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(addc_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void addccc( Register s1, int simm13a, Register d );
// 4-operand AES instructions // 4-operand AES instructions
void aes_eround01( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround01_op5) | fs2(s2, FloatRegisterImpl::D) ); } inline void aes_eround01( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d );
void aes_eround23( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround23_op5) | fs2(s2, FloatRegisterImpl::D) ); } inline void aes_eround23( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d );
void aes_dround01( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround01_op5) | fs2(s2, FloatRegisterImpl::D) ); } inline void aes_dround01( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d );
void aes_dround23( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround23_op5) | fs2(s2, FloatRegisterImpl::D) ); } inline void aes_dround23( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d );
void aes_eround01_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround01_l_op5) | fs2(s2, FloatRegisterImpl::D) ); } inline void aes_eround01_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d );
void aes_eround23_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround23_l_op5) | fs2(s2, FloatRegisterImpl::D) ); } inline void aes_eround23_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d );
void aes_dround01_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround01_l_op5) | fs2(s2, FloatRegisterImpl::D) ); } inline void aes_dround01_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d );
void aes_dround23_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround23_l_op5) | fs2(s2, FloatRegisterImpl::D) ); } inline void aes_dround23_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d );
void aes_kexpand1( FloatRegister s1, FloatRegister s2, int imm5a, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | u_field(imm5a, 13, 9) | op5(aes_kexpand1_op5) | fs2(s2, FloatRegisterImpl::D) ); } inline void aes_kexpand1( FloatRegister s1, FloatRegister s2, int imm5a, FloatRegister d );
// 3-operand AES instructions // 3-operand AES instructions
void aes_kexpand0( FloatRegister s1, FloatRegister s2, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes3_op3) | fs1(s1, FloatRegisterImpl::D) | opf(aes_kexpand0_opf) | fs2(s2, FloatRegisterImpl::D) ); } inline void aes_kexpand0( FloatRegister s1, FloatRegister s2, FloatRegister d );
void aes_kexpand2( FloatRegister s1, FloatRegister s2, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes3_op3) | fs1(s1, FloatRegisterImpl::D) | opf(aes_kexpand2_opf) | fs2(s2, FloatRegisterImpl::D) ); } inline void aes_kexpand2( FloatRegister s1, FloatRegister s2, FloatRegister d );
// pp 136 // pp 136
@ -816,6 +813,8 @@ public:
inline void call( address d, relocInfo::relocType rt = relocInfo::runtime_call_type ); inline void call( address d, relocInfo::relocType rt = relocInfo::runtime_call_type );
inline void call( Label& L, relocInfo::relocType rt = relocInfo::runtime_call_type ); inline void call( Label& L, relocInfo::relocType rt = relocInfo::runtime_call_type );
inline void call( address d, RelocationHolder const& rspec );
public: public:
// pp 150 // pp 150
@ -825,70 +824,70 @@ public:
// at address s1 is swapped with the data in d. If the values are not equal, // at address s1 is swapped with the data in d. If the values are not equal,
// the contents of memory at s1 are loaded into d, without the swap. // the contents of memory at s1 are loaded into d, without the swap.
void casa( Register s1, Register s2, Register d, int ia = -1 ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(casa_op3 ) | rs1(s1) | (ia == -1 ? immed(true) : imm_asi(ia)) | rs2(s2)); } inline void casa( Register s1, Register s2, Register d, int ia = -1 );
void casxa( Register s1, Register s2, Register d, int ia = -1 ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(casxa_op3) | rs1(s1) | (ia == -1 ? immed(true) : imm_asi(ia)) | rs2(s2)); } inline void casxa( Register s1, Register s2, Register d, int ia = -1 );
// pp 152 // pp 152
void udiv( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(udiv_op3 ) | rs1(s1) | rs2(s2)); } inline void udiv( Register s1, Register s2, Register d );
void udiv( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(udiv_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void udiv( Register s1, int simm13a, Register d );
void sdiv( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sdiv_op3 ) | rs1(s1) | rs2(s2)); } inline void sdiv( Register s1, Register s2, Register d );
void sdiv( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sdiv_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void sdiv( Register s1, int simm13a, Register d );
void udivcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(udiv_op3 | cc_bit_op3) | rs1(s1) | rs2(s2)); } inline void udivcc( Register s1, Register s2, Register d );
void udivcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(udiv_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void udivcc( Register s1, int simm13a, Register d );
void sdivcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sdiv_op3 | cc_bit_op3) | rs1(s1) | rs2(s2)); } inline void sdivcc( Register s1, Register s2, Register d );
void sdivcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sdiv_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void sdivcc( Register s1, int simm13a, Register d );
// pp 155 // pp 155
void done() { v9_only(); cti(); emit_int32( op(arith_op) | fcn(0) | op3(done_op3) ); } inline void done();
void retry() { v9_only(); cti(); emit_int32( op(arith_op) | fcn(1) | op3(retry_op3) ); } inline void retry();
// pp 156 // pp 156
void fadd( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | fs1(s1, w) | opf(0x40 + w) | fs2(s2, w)); } inline void fadd( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d );
void fsub( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | fs1(s1, w) | opf(0x44 + w) | fs2(s2, w)); } inline void fsub( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d );
// pp 157 // pp 157
void fcmp( FloatRegisterImpl::Width w, CC cc, FloatRegister s1, FloatRegister s2) { emit_int32( op(arith_op) | cmpcc(cc) | op3(fpop2_op3) | fs1(s1, w) | opf(0x50 + w) | fs2(s2, w)); } inline void fcmp( FloatRegisterImpl::Width w, CC cc, FloatRegister s1, FloatRegister s2);
void fcmpe( FloatRegisterImpl::Width w, CC cc, FloatRegister s1, FloatRegister s2) { emit_int32( op(arith_op) | cmpcc(cc) | op3(fpop2_op3) | fs1(s1, w) | opf(0x54 + w) | fs2(s2, w)); } inline void fcmpe( FloatRegisterImpl::Width w, CC cc, FloatRegister s1, FloatRegister s2);
// pp 159 // pp 159
void ftox( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { v9_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(fpop1_op3) | opf(0x80 + w) | fs2(s, w)); } inline void ftox( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d );
void ftoi( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(fpop1_op3) | opf(0xd0 + w) | fs2(s, w)); } inline void ftoi( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d );
// pp 160 // pp 160
void ftof( FloatRegisterImpl::Width sw, FloatRegisterImpl::Width dw, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, dw) | op3(fpop1_op3) | opf(0xc0 + sw + dw*4) | fs2(s, sw)); } inline void ftof( FloatRegisterImpl::Width sw, FloatRegisterImpl::Width dw, FloatRegister s, FloatRegister d );
// pp 161 // pp 161
void fxtof( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { v9_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x80 + w*4) | fs2(s, FloatRegisterImpl::D)); } inline void fxtof( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d );
void fitof( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0xc0 + w*4) | fs2(s, FloatRegisterImpl::S)); } inline void fitof( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d );
// pp 162 // pp 162
void fmov( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x00 + w) | fs2(s, w)); } inline void fmov( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d );
void fneg( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x04 + w) | fs2(s, w)); } inline void fneg( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d );
void fabs( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x08 + w) | fs2(s, w)); } inline void fabs( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d );
// pp 163 // pp 163
void fmul( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | fs1(s1, w) | opf(0x48 + w) | fs2(s2, w)); } inline void fmul( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d );
void fmul( FloatRegisterImpl::Width sw, FloatRegisterImpl::Width dw, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, dw) | op3(fpop1_op3) | fs1(s1, sw) | opf(0x60 + sw + dw*4) | fs2(s2, sw)); } inline void fmul( FloatRegisterImpl::Width sw, FloatRegisterImpl::Width dw, FloatRegister s1, FloatRegister s2, FloatRegister d );
void fdiv( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | fs1(s1, w) | opf(0x4c + w) | fs2(s2, w)); } inline void fdiv( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d );
// FXORs/FXORd instructions // FXORs/FXORd instructions
void fxor( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(flog3_op3) | fs1(s1, w) | opf(0x6E - w) | fs2(s2, w)); } inline void fxor( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d );
// pp 164 // pp 164
void fsqrt( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x28 + w) | fs2(s, w)); } inline void fsqrt( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d );
// pp 165 // pp 165
@ -897,17 +896,17 @@ public:
// pp 167 // pp 167
void flushw() { v9_only(); emit_int32( op(arith_op) | op3(flushw_op3) ); } void flushw();
// pp 168 // pp 168
void illtrap( int const22a) { if (const22a != 0) v9_only(); emit_int32( op(branch_op) | u_field(const22a, 21, 0) ); } void illtrap( int const22a);
// v8 unimp == illtrap(0) // v8 unimp == illtrap(0)
// pp 169 // pp 169
void impdep1( int id1, int const19a ) { v9_only(); emit_int32( op(arith_op) | fcn(id1) | op3(impdep1_op3) | u_field(const19a, 18, 0)); } void impdep1( int id1, int const19a );
void impdep2( int id1, int const19a ) { v9_only(); emit_int32( op(arith_op) | fcn(id1) | op3(impdep2_op3) | u_field(const19a, 18, 0)); } void impdep2( int id1, int const19a );
// pp 170 // pp 170
@ -927,8 +926,8 @@ public:
// 173 // 173
void ldfa( FloatRegisterImpl::Width w, Register s1, Register s2, int ia, FloatRegister d ) { v9_only(); emit_int32( op(ldst_op) | fd(d, w) | alt_op3(ldf_op3 | alt_bit_op3, w) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void ldfa( FloatRegisterImpl::Width w, Register s1, Register s2, int ia, FloatRegister d );
void ldfa( FloatRegisterImpl::Width w, Register s1, int simm13a, FloatRegister d ) { v9_only(); emit_int32( op(ldst_op) | fd(d, w) | alt_op3(ldf_op3 | alt_bit_op3, w) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void ldfa( FloatRegisterImpl::Width w, Register s1, int simm13a, FloatRegister d );
// pp 175, lduw is ld on v8 // pp 175, lduw is ld on v8
@ -951,119 +950,119 @@ public:
// pp 177 // pp 177
void ldsba( Register s1, Register s2, int ia, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(ldsb_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void ldsba( Register s1, Register s2, int ia, Register d );
void ldsba( Register s1, int simm13a, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(ldsb_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void ldsba( Register s1, int simm13a, Register d );
void ldsha( Register s1, Register s2, int ia, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(ldsh_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void ldsha( Register s1, Register s2, int ia, Register d );
void ldsha( Register s1, int simm13a, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(ldsh_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void ldsha( Register s1, int simm13a, Register d );
void ldswa( Register s1, Register s2, int ia, Register d ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(ldsw_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void ldswa( Register s1, Register s2, int ia, Register d );
void ldswa( Register s1, int simm13a, Register d ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(ldsw_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void ldswa( Register s1, int simm13a, Register d );
void lduba( Register s1, Register s2, int ia, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(ldub_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void lduba( Register s1, Register s2, int ia, Register d );
void lduba( Register s1, int simm13a, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(ldub_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void lduba( Register s1, int simm13a, Register d );
void lduha( Register s1, Register s2, int ia, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(lduh_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void lduha( Register s1, Register s2, int ia, Register d );
void lduha( Register s1, int simm13a, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(lduh_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void lduha( Register s1, int simm13a, Register d );
void lduwa( Register s1, Register s2, int ia, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(lduw_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void lduwa( Register s1, Register s2, int ia, Register d );
void lduwa( Register s1, int simm13a, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(lduw_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void lduwa( Register s1, int simm13a, Register d );
void ldxa( Register s1, Register s2, int ia, Register d ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(ldx_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void ldxa( Register s1, Register s2, int ia, Register d );
void ldxa( Register s1, int simm13a, Register d ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(ldx_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void ldxa( Register s1, int simm13a, Register d );
// pp 181 // pp 181
void and3( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(and_op3 ) | rs1(s1) | rs2(s2) ); } inline void and3( Register s1, Register s2, Register d );
void and3( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(and_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void and3( Register s1, int simm13a, Register d );
void andcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(and_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); } inline void andcc( Register s1, Register s2, Register d );
void andcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(and_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void andcc( Register s1, int simm13a, Register d );
void andn( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(andn_op3 ) | rs1(s1) | rs2(s2) ); } inline void andn( Register s1, Register s2, Register d );
void andn( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(andn_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void andn( Register s1, int simm13a, Register d );
void andncc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(andn_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); } inline void andncc( Register s1, Register s2, Register d );
void andncc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(andn_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void andncc( Register s1, int simm13a, Register d );
void or3( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(or_op3 ) | rs1(s1) | rs2(s2) ); } inline void or3( Register s1, Register s2, Register d );
void or3( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(or_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void or3( Register s1, int simm13a, Register d );
void orcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(or_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); } inline void orcc( Register s1, Register s2, Register d );
void orcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(or_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void orcc( Register s1, int simm13a, Register d );
void orn( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(orn_op3) | rs1(s1) | rs2(s2) ); } inline void orn( Register s1, Register s2, Register d );
void orn( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(orn_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void orn( Register s1, int simm13a, Register d );
void orncc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(orn_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); } inline void orncc( Register s1, Register s2, Register d );
void orncc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(orn_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void orncc( Register s1, int simm13a, Register d );
void xor3( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xor_op3 ) | rs1(s1) | rs2(s2) ); } inline void xor3( Register s1, Register s2, Register d );
void xor3( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xor_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void xor3( Register s1, int simm13a, Register d );
void xorcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xor_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); } inline void xorcc( Register s1, Register s2, Register d );
void xorcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xor_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void xorcc( Register s1, int simm13a, Register d );
void xnor( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xnor_op3 ) | rs1(s1) | rs2(s2) ); } inline void xnor( Register s1, Register s2, Register d );
void xnor( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xnor_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void xnor( Register s1, int simm13a, Register d );
void xnorcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xnor_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); } inline void xnorcc( Register s1, Register s2, Register d );
void xnorcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xnor_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void xnorcc( Register s1, int simm13a, Register d );
// pp 183 // pp 183
void membar( Membar_mask_bits const7a ) { v9_only(); emit_int32( op(arith_op) | op3(membar_op3) | rs1(O7) | immed(true) | u_field( int(const7a), 6, 0)); } inline void membar( Membar_mask_bits const7a );
// pp 185 // pp 185
void fmov( FloatRegisterImpl::Width w, Condition c, bool floatCC, CC cca, FloatRegister s2, FloatRegister d ) { v9_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fpop2_op3) | cond_mov(c) | opf_cc(cca, floatCC) | opf_low6(w) | fs2(s2, w)); } inline void fmov( FloatRegisterImpl::Width w, Condition c, bool floatCC, CC cca, FloatRegister s2, FloatRegister d );
// pp 189 // pp 189
void fmov( FloatRegisterImpl::Width w, RCondition c, Register s1, FloatRegister s2, FloatRegister d ) { v9_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fpop2_op3) | rs1(s1) | rcond(c) | opf_low5(4 + w) | fs2(s2, w)); } inline void fmov( FloatRegisterImpl::Width w, RCondition c, Register s1, FloatRegister s2, FloatRegister d );
// pp 191 // pp 191
void movcc( Condition c, bool floatCC, CC cca, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(movcc_op3) | mov_cc(cca, floatCC) | cond_mov(c) | rs2(s2) ); } inline void movcc( Condition c, bool floatCC, CC cca, Register s2, Register d );
void movcc( Condition c, bool floatCC, CC cca, int simm11a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(movcc_op3) | mov_cc(cca, floatCC) | cond_mov(c) | immed(true) | simm(simm11a, 11) ); } inline void movcc( Condition c, bool floatCC, CC cca, int simm11a, Register d );
// pp 195 // pp 195
void movr( RCondition c, Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(movr_op3) | rs1(s1) | rcond(c) | rs2(s2) ); } inline void movr( RCondition c, Register s1, Register s2, Register d );
void movr( RCondition c, Register s1, int simm10a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(movr_op3) | rs1(s1) | rcond(c) | immed(true) | simm(simm10a, 10) ); } inline void movr( RCondition c, Register s1, int simm10a, Register d );
// pp 196 // pp 196
void mulx( Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(mulx_op3 ) | rs1(s1) | rs2(s2) ); } inline void mulx( Register s1, Register s2, Register d );
void mulx( Register s1, int simm13a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(mulx_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void mulx( Register s1, int simm13a, Register d );
void sdivx( Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(sdivx_op3) | rs1(s1) | rs2(s2) ); } inline void sdivx( Register s1, Register s2, Register d );
void sdivx( Register s1, int simm13a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(sdivx_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void sdivx( Register s1, int simm13a, Register d );
void udivx( Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(udivx_op3) | rs1(s1) | rs2(s2) ); } inline void udivx( Register s1, Register s2, Register d );
void udivx( Register s1, int simm13a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(udivx_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void udivx( Register s1, int simm13a, Register d );
// pp 197 // pp 197
void umul( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(umul_op3 ) | rs1(s1) | rs2(s2) ); } inline void umul( Register s1, Register s2, Register d );
void umul( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(umul_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void umul( Register s1, int simm13a, Register d );
void smul( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(smul_op3 ) | rs1(s1) | rs2(s2) ); } inline void smul( Register s1, Register s2, Register d );
void smul( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(smul_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void smul( Register s1, int simm13a, Register d );
void umulcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(umul_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); } inline void umulcc( Register s1, Register s2, Register d );
void umulcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(umul_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void umulcc( Register s1, int simm13a, Register d );
void smulcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(smul_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); } inline void smulcc( Register s1, Register s2, Register d );
void smulcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(smul_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void smulcc( Register s1, int simm13a, Register d );
// pp 201 // pp 201
void nop() { emit_int32( op(branch_op) | op2(sethi_op2) ); } inline void nop();
void sw_count() { emit_int32( op(branch_op) | op2(sethi_op2) | 0x3f0 ); } inline void sw_count();
// pp 202 // pp 202
void popc( Register s, Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(popc_op3) | rs2(s)); } inline void popc( Register s, Register d);
void popc( int simm13a, Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(popc_op3) | immed(true) | simm(simm13a, 13)); } inline void popc( int simm13a, Register d);
// pp 203 // pp 203
void prefetch( Register s1, Register s2, PrefetchFcn f) { v9_only(); emit_int32( op(ldst_op) | fcn(f) | op3(prefetch_op3) | rs1(s1) | rs2(s2) ); } inline void prefetch( Register s1, Register s2, PrefetchFcn f);
void prefetch( Register s1, int simm13a, PrefetchFcn f) { v9_only(); emit_data( op(ldst_op) | fcn(f) | op3(prefetch_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); } inline void prefetch( Register s1, int simm13a, PrefetchFcn f);
void prefetcha( Register s1, Register s2, int ia, PrefetchFcn f ) { v9_only(); emit_int32( op(ldst_op) | fcn(f) | op3(prefetch_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void prefetcha( Register s1, Register s2, int ia, PrefetchFcn f );
void prefetcha( Register s1, int simm13a, PrefetchFcn f ) { v9_only(); emit_int32( op(ldst_op) | fcn(f) | op3(prefetch_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void prefetcha( Register s1, int simm13a, PrefetchFcn f );
// pp 208 // pp 208
// not implementing read privileged register // not implementing read privileged register
inline void rdy( Register d) { v9_dep(); emit_int32( op(arith_op) | rd(d) | op3(rdreg_op3) | u_field(0, 18, 14)); } inline void rdy( Register d);
inline void rdccr( Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(rdreg_op3) | u_field(2, 18, 14)); } inline void rdccr( Register d);
inline void rdasi( Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(rdreg_op3) | u_field(3, 18, 14)); } inline void rdasi( Register d);
inline void rdtick( Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(rdreg_op3) | u_field(4, 18, 14)); } // Spoon! inline void rdtick( Register d);
inline void rdpc( Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(rdreg_op3) | u_field(5, 18, 14)); } inline void rdpc( Register d);
inline void rdfprs( Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(rdreg_op3) | u_field(6, 18, 14)); } inline void rdfprs( Register d);
// pp 213 // pp 213
@@ -1072,47 +1071,43 @@ public:
// pp 214 // pp 214
void save( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(save_op3) | rs1(s1) | rs2(s2) ); } inline void save( Register s1, Register s2, Register d );
void save( Register s1, int simm13a, Register d ) { inline void save( Register s1, int simm13a, Register d );
// make sure frame is at least large enough for the register save area
assert(-simm13a >= 16 * wordSize, "frame too small");
emit_int32( op(arith_op) | rd(d) | op3(save_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) );
}
void restore( Register s1 = G0, Register s2 = G0, Register d = G0 ) { emit_int32( op(arith_op) | rd(d) | op3(restore_op3) | rs1(s1) | rs2(s2) ); } inline void restore( Register s1 = G0, Register s2 = G0, Register d = G0 );
void restore( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(restore_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void restore( Register s1, int simm13a, Register d );
// pp 216 // pp 216
void saved() { v9_only(); emit_int32( op(arith_op) | fcn(0) | op3(saved_op3)); } inline void saved();
void restored() { v9_only(); emit_int32( op(arith_op) | fcn(1) | op3(saved_op3)); } inline void restored();
// pp 217 // pp 217
inline void sethi( int imm22a, Register d, RelocationHolder const& rspec = RelocationHolder() ); inline void sethi( int imm22a, Register d, RelocationHolder const& rspec = RelocationHolder() );
// pp 218 // pp 218
void sll( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sll_op3) | rs1(s1) | sx(0) | rs2(s2) ); } inline void sll( Register s1, Register s2, Register d );
void sll( Register s1, int imm5a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sll_op3) | rs1(s1) | sx(0) | immed(true) | u_field(imm5a, 4, 0) ); } inline void sll( Register s1, int imm5a, Register d );
void srl( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(srl_op3) | rs1(s1) | sx(0) | rs2(s2) ); } inline void srl( Register s1, Register s2, Register d );
void srl( Register s1, int imm5a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(srl_op3) | rs1(s1) | sx(0) | immed(true) | u_field(imm5a, 4, 0) ); } inline void srl( Register s1, int imm5a, Register d );
void sra( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sra_op3) | rs1(s1) | sx(0) | rs2(s2) ); } inline void sra( Register s1, Register s2, Register d );
void sra( Register s1, int imm5a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sra_op3) | rs1(s1) | sx(0) | immed(true) | u_field(imm5a, 4, 0) ); } inline void sra( Register s1, int imm5a, Register d );
void sllx( Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(sll_op3) | rs1(s1) | sx(1) | rs2(s2) ); } inline void sllx( Register s1, Register s2, Register d );
void sllx( Register s1, int imm6a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(sll_op3) | rs1(s1) | sx(1) | immed(true) | u_field(imm6a, 5, 0) ); } inline void sllx( Register s1, int imm6a, Register d );
void srlx( Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(srl_op3) | rs1(s1) | sx(1) | rs2(s2) ); } inline void srlx( Register s1, Register s2, Register d );
void srlx( Register s1, int imm6a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(srl_op3) | rs1(s1) | sx(1) | immed(true) | u_field(imm6a, 5, 0) ); } inline void srlx( Register s1, int imm6a, Register d );
void srax( Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(sra_op3) | rs1(s1) | sx(1) | rs2(s2) ); } inline void srax( Register s1, Register s2, Register d );
void srax( Register s1, int imm6a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(sra_op3) | rs1(s1) | sx(1) | immed(true) | u_field(imm6a, 5, 0) ); } inline void srax( Register s1, int imm6a, Register d );
// pp 220 // pp 220
void sir( int simm13a ) { emit_int32( op(arith_op) | fcn(15) | op3(sir_op3) | immed(true) | simm(simm13a, 13)); } inline void sir( int simm13a );
// pp 221 // pp 221
void stbar() { emit_int32( op(arith_op) | op3(membar_op3) | u_field(15, 18, 14)); } inline void stbar();
// pp 222 // pp 222
@@ -1126,8 +1121,8 @@ public:
// pp 224 // pp 224
void stfa( FloatRegisterImpl::Width w, FloatRegister d, Register s1, Register s2, int ia ) { v9_only(); emit_int32( op(ldst_op) | fd(d, w) | alt_op3(stf_op3 | alt_bit_op3, w) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void stfa( FloatRegisterImpl::Width w, FloatRegister d, Register s1, Register s2, int ia );
void stfa( FloatRegisterImpl::Width w, FloatRegister d, Register s1, int simm13a ) { v9_only(); emit_int32( op(ldst_op) | fd(d, w) | alt_op3(stf_op3 | alt_bit_op3, w) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void stfa( FloatRegisterImpl::Width w, FloatRegister d, Register s1, int simm13a );
// p 226 // p 226
@@ -1144,28 +1139,28 @@ public:
// pp 177 // pp 177
void stba( Register d, Register s1, Register s2, int ia ) { emit_int32( op(ldst_op) | rd(d) | op3(stb_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void stba( Register d, Register s1, Register s2, int ia );
void stba( Register d, Register s1, int simm13a ) { emit_int32( op(ldst_op) | rd(d) | op3(stb_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void stba( Register d, Register s1, int simm13a );
void stha( Register d, Register s1, Register s2, int ia ) { emit_int32( op(ldst_op) | rd(d) | op3(sth_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void stha( Register d, Register s1, Register s2, int ia );
void stha( Register d, Register s1, int simm13a ) { emit_int32( op(ldst_op) | rd(d) | op3(sth_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void stha( Register d, Register s1, int simm13a );
void stwa( Register d, Register s1, Register s2, int ia ) { emit_int32( op(ldst_op) | rd(d) | op3(stw_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void stwa( Register d, Register s1, Register s2, int ia );
void stwa( Register d, Register s1, int simm13a ) { emit_int32( op(ldst_op) | rd(d) | op3(stw_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void stwa( Register d, Register s1, int simm13a );
void stxa( Register d, Register s1, Register s2, int ia ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(stx_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void stxa( Register d, Register s1, Register s2, int ia );
void stxa( Register d, Register s1, int simm13a ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(stx_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void stxa( Register d, Register s1, int simm13a );
void stda( Register d, Register s1, Register s2, int ia ) { emit_int32( op(ldst_op) | rd(d) | op3(std_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void stda( Register d, Register s1, Register s2, int ia );
void stda( Register d, Register s1, int simm13a ) { emit_int32( op(ldst_op) | rd(d) | op3(std_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void stda( Register d, Register s1, int simm13a );
// pp 230 // pp 230
void sub( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sub_op3 ) | rs1(s1) | rs2(s2) ); } inline void sub( Register s1, Register s2, Register d );
void sub( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sub_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void sub( Register s1, int simm13a, Register d );
void subcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sub_op3 | cc_bit_op3 ) | rs1(s1) | rs2(s2) ); } inline void subcc( Register s1, Register s2, Register d );
void subcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sub_op3 | cc_bit_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void subcc( Register s1, int simm13a, Register d );
void subc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(subc_op3 ) | rs1(s1) | rs2(s2) ); } inline void subc( Register s1, Register s2, Register d );
void subc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(subc_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void subc( Register s1, int simm13a, Register d );
void subccc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(subc_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); } inline void subccc( Register s1, Register s2, Register d );
void subccc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(subc_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void subccc( Register s1, int simm13a, Register d );
// pp 231 // pp 231
@@ -1174,86 +1169,80 @@ public:
// pp 232 // pp 232
void swapa( Register s1, Register s2, int ia, Register d ) { v9_dep(); emit_int32( op(ldst_op) | rd(d) | op3(swap_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); } inline void swapa( Register s1, Register s2, int ia, Register d );
void swapa( Register s1, int simm13a, Register d ) { v9_dep(); emit_int32( op(ldst_op) | rd(d) | op3(swap_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void swapa( Register s1, int simm13a, Register d );
// pp 234, note op in book is wrong, see pp 268 // pp 234, note op in book is wrong, see pp 268
void taddcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(taddcc_op3 ) | rs1(s1) | rs2(s2) ); } inline void taddcc( Register s1, Register s2, Register d );
void taddcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(taddcc_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void taddcc( Register s1, int simm13a, Register d );
// pp 235 // pp 235
void tsubcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(tsubcc_op3 ) | rs1(s1) | rs2(s2) ); } inline void tsubcc( Register s1, Register s2, Register d );
void tsubcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(tsubcc_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void tsubcc( Register s1, int simm13a, Register d );
// pp 237 // pp 237
void trap( Condition c, CC cc, Register s1, Register s2 ) { emit_int32( op(arith_op) | cond(c) | op3(trap_op3) | rs1(s1) | trapcc(cc) | rs2(s2)); } inline void trap( Condition c, CC cc, Register s1, Register s2 );
void trap( Condition c, CC cc, Register s1, int trapa ) { emit_int32( op(arith_op) | cond(c) | op3(trap_op3) | rs1(s1) | trapcc(cc) | immed(true) | u_field(trapa, 6, 0)); } inline void trap( Condition c, CC cc, Register s1, int trapa );
// simple uncond. trap // simple uncond. trap
void trap( int trapa ) { trap( always, icc, G0, trapa ); } inline void trap( int trapa );
// pp 239 omit write priv register for now // pp 239 omit write priv register for now
inline void wry( Register d) { v9_dep(); emit_int32( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(0, 29, 25)); } inline void wry( Register d);
inline void wrccr(Register s) { v9_only(); emit_int32( op(arith_op) | rs1(s) | op3(wrreg_op3) | u_field(2, 29, 25)); } inline void wrccr(Register s);
inline void wrccr(Register s, int simm13a) { v9_only(); emit_int32( op(arith_op) | inline void wrccr(Register s, int simm13a);
rs1(s) | inline void wrasi(Register d);
op3(wrreg_op3) |
u_field(2, 29, 25) |
immed(true) |
simm(simm13a, 13)); }
inline void wrasi(Register d) { v9_only(); emit_int32( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(3, 29, 25)); }
// wrasi(d, imm) stores (d xor imm) to asi // wrasi(d, imm) stores (d xor imm) to asi
inline void wrasi(Register d, int simm13a) { v9_only(); emit_int32( op(arith_op) | rs1(d) | op3(wrreg_op3) | inline void wrasi(Register d, int simm13a);
u_field(3, 29, 25) | immed(true) | simm(simm13a, 13)); } inline void wrfprs( Register d);
inline void wrfprs( Register d) { v9_only(); emit_int32( op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(6, 29, 25)); }
// VIS1 instructions // VIS1 instructions
void alignaddr( Register s1, Register s2, Register d ) { vis1_only(); emit_int32( op(arith_op) | rd(d) | op3(alignaddr_op3) | rs1(s1) | opf(alignaddr_opf) | rs2(s2)); } inline void alignaddr( Register s1, Register s2, Register d );
void faligndata( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(faligndata_op3) | fs1(s1, FloatRegisterImpl::D) | opf(faligndata_opf) | fs2(s2, FloatRegisterImpl::D)); } inline void faligndata( FloatRegister s1, FloatRegister s2, FloatRegister d );
void fzero( FloatRegisterImpl::Width w, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fzero_op3) | opf(0x62 - w)); } inline void fzero( FloatRegisterImpl::Width w, FloatRegister d );
void fsrc2( FloatRegisterImpl::Width w, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fsrc_op3) | opf(0x7A - w) | fs2(s2, w)); } inline void fsrc2( FloatRegisterImpl::Width w, FloatRegister s2, FloatRegister d );
void fnot1( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fnot_op3) | fs1(s1, w) | opf(0x6C - w)); } inline void fnot1( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister d );
void fpmerge( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(0x36) | fs1(s1, FloatRegisterImpl::S) | opf(0x4b) | fs2(s2, FloatRegisterImpl::S)); } inline void fpmerge( FloatRegister s1, FloatRegister s2, FloatRegister d );
void stpartialf( Register s1, Register s2, FloatRegister d, int ia = -1 ) { vis1_only(); emit_int32( op(ldst_op) | fd(d, FloatRegisterImpl::D) | op3(stpartialf_op3) | rs1(s1) | imm_asi(ia) | rs2(s2)); } inline void stpartialf( Register s1, Register s2, FloatRegister d, int ia = -1 );
// VIS2 instructions // VIS2 instructions
void edge8n( Register s1, Register s2, Register d ) { vis2_only(); emit_int32( op(arith_op) | rd(d) | op3(edge_op3) | rs1(s1) | opf(edge8n_opf) | rs2(s2)); } inline void edge8n( Register s1, Register s2, Register d );
void bmask( Register s1, Register s2, Register d ) { vis2_only(); emit_int32( op(arith_op) | rd(d) | op3(bmask_op3) | rs1(s1) | opf(bmask_opf) | rs2(s2)); } inline void bmask( Register s1, Register s2, Register d );
void bshuffle( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis2_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(bshuffle_op3) | fs1(s1, FloatRegisterImpl::D) | opf(bshuffle_opf) | fs2(s2, FloatRegisterImpl::D)); } inline void bshuffle( FloatRegister s1, FloatRegister s2, FloatRegister d );
// VIS3 instructions // VIS3 instructions
void movstosw( FloatRegister s, Register d ) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(mftoi_op3) | opf(mstosw_opf) | fs2(s, FloatRegisterImpl::S)); } inline void movstosw( FloatRegister s, Register d );
void movstouw( FloatRegister s, Register d ) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(mftoi_op3) | opf(mstouw_opf) | fs2(s, FloatRegisterImpl::S)); } inline void movstouw( FloatRegister s, Register d );
void movdtox( FloatRegister s, Register d ) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(mftoi_op3) | opf(mdtox_opf) | fs2(s, FloatRegisterImpl::D)); } inline void movdtox( FloatRegister s, Register d );
void movwtos( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(mftoi_op3) | opf(mwtos_opf) | rs2(s)); } inline void movwtos( Register s, FloatRegister d );
void movxtod( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(mftoi_op3) | opf(mxtod_opf) | rs2(s)); } inline void movxtod( Register s, FloatRegister d );
void xmulx(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulx_opf) | rs2(s2)); } inline void xmulx(Register s1, Register s2, Register d);
void xmulxhi(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2)); } inline void xmulxhi(Register s1, Register s2, Register d);
// Crypto SHA instructions // Crypto SHA instructions
void sha1() { sha1_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha1_opf)); } inline void sha1();
void sha256() { sha256_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha256_opf)); } inline void sha256();
void sha512() { sha512_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha512_opf)); } inline void sha512();
// CRC32C instruction // CRC32C instruction
void crc32c( FloatRegister s1, FloatRegister s2, FloatRegister d ) { crc32c_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(crc32c_op3) | fs1(s1, FloatRegisterImpl::D) | opf(crc32c_opf) | fs2(s2, FloatRegisterImpl::D)); } inline void crc32c( FloatRegister s1, FloatRegister s2, FloatRegister d );
// Creation // Creation
Assembler(CodeBuffer* code) : AbstractAssembler(code) { Assembler(CodeBuffer* code) : AbstractAssembler(code) {

View file

@@ -28,6 +28,12 @@
#include "asm/assembler.hpp" #include "asm/assembler.hpp"
inline void Assembler::insert_nop_after_cbcond() {
if (UseCBCond && cbcond_before()) {
nop();
}
}
inline void Assembler::check_delay() { inline void Assembler::check_delay() {
# ifdef CHECK_DELAY # ifdef CHECK_DELAY
guarantee( delay_state != at_delay_slot, "must say delayed() when filling delay slot"); guarantee( delay_state != at_delay_slot, "must say delayed() when filling delay slot");
@@ -40,6 +46,10 @@ inline void Assembler::emit_int32(int x) {
AbstractAssembler::emit_int32(x); AbstractAssembler::emit_int32(x);
} }
inline void Assembler::emit_data(int x) {
emit_int32(x);
}
inline void Assembler::emit_data(int x, relocInfo::relocType rtype) { inline void Assembler::emit_data(int x, relocInfo::relocType rtype) {
relocate(rtype); relocate(rtype);
emit_int32(x); emit_int32(x);
@@ -54,6 +64,29 @@ inline void Assembler::emit_data(int x, RelocationHolder const& rspec) {
inline void Assembler::add(Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(add_op3) | rs1(s1) | rs2(s2) ); } inline void Assembler::add(Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(add_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::add(Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(add_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); } inline void Assembler::add(Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(add_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::addcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(add_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::addcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(add_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::addc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(addc_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::addc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(addc_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::addccc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(addc_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::addccc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(addc_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::aes_eround01( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround01_op5) | fs2(s2, FloatRegisterImpl::D) ); }
inline void Assembler::aes_eround23( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround23_op5) | fs2(s2, FloatRegisterImpl::D) ); }
inline void Assembler::aes_dround01( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround01_op5) | fs2(s2, FloatRegisterImpl::D) ); }
inline void Assembler::aes_dround23( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround23_op5) | fs2(s2, FloatRegisterImpl::D) ); }
inline void Assembler::aes_eround01_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround01_l_op5) | fs2(s2, FloatRegisterImpl::D) ); }
inline void Assembler::aes_eround23_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_eround23_l_op5) | fs2(s2, FloatRegisterImpl::D) ); }
inline void Assembler::aes_dround01_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround01_l_op5) | fs2(s2, FloatRegisterImpl::D) ); }
inline void Assembler::aes_dround23_l( FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | fs3(s3, FloatRegisterImpl::D) | op5(aes_dround23_l_op5) | fs2(s2, FloatRegisterImpl::D) ); }
inline void Assembler::aes_kexpand1( FloatRegister s1, FloatRegister s2, int imm5a, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes4_op3) | fs1(s1, FloatRegisterImpl::D) | u_field(imm5a, 13, 9) | op5(aes_kexpand1_op5) | fs2(s2, FloatRegisterImpl::D) ); }
// 3-operand AES instructions
inline void Assembler::aes_kexpand0( FloatRegister s1, FloatRegister s2, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes3_op3) | fs1(s1, FloatRegisterImpl::D) | opf(aes_kexpand0_opf) | fs2(s2, FloatRegisterImpl::D) ); }
inline void Assembler::aes_kexpand2( FloatRegister s1, FloatRegister s2, FloatRegister d ) { aes_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(aes3_op3) | fs1(s1, FloatRegisterImpl::D) | opf(aes_kexpand2_opf) | fs2(s2, FloatRegisterImpl::D) ); }
inline void Assembler::bpr( RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt ) { v9_only(); insert_nop_after_cbcond(); cti(); emit_data( op(branch_op) | annul(a) | cond(c) | op2(bpr_op2) | wdisp16(intptr_t(d), intptr_t(pc())) | predict(p) | rs1(s1), rt); has_delay_slot(); } inline void Assembler::bpr( RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt ) { v9_only(); insert_nop_after_cbcond(); cti(); emit_data( op(branch_op) | annul(a) | cond(c) | op2(bpr_op2) | wdisp16(intptr_t(d), intptr_t(pc())) | predict(p) | rs1(s1), rt); has_delay_slot(); }
inline void Assembler::bpr( RCondition c, bool a, Predict p, Register s1, Label& L) { insert_nop_after_cbcond(); bpr( c, a, p, s1, target(L)); } inline void Assembler::bpr( RCondition c, bool a, Predict p, Register s1, Label& L) { insert_nop_after_cbcond(); bpr( c, a, p, s1, target(L)); }
@@ -76,9 +109,58 @@ inline void Assembler::cbcond(Condition c, CC cc, Register s1, int simm5, Label&
inline void Assembler::call( address d, relocInfo::relocType rt ) { insert_nop_after_cbcond(); cti(); emit_data( op(call_op) | wdisp(intptr_t(d), intptr_t(pc()), 30), rt); has_delay_slot(); assert(rt != relocInfo::virtual_call_type, "must use virtual_call_Relocation::spec"); } inline void Assembler::call( address d, relocInfo::relocType rt ) { insert_nop_after_cbcond(); cti(); emit_data( op(call_op) | wdisp(intptr_t(d), intptr_t(pc()), 30), rt); has_delay_slot(); assert(rt != relocInfo::virtual_call_type, "must use virtual_call_Relocation::spec"); }
inline void Assembler::call( Label& L, relocInfo::relocType rt ) { insert_nop_after_cbcond(); call( target(L), rt); } inline void Assembler::call( Label& L, relocInfo::relocType rt ) { insert_nop_after_cbcond(); call( target(L), rt); }
inline void Assembler::call( address d, RelocationHolder const& rspec ) { insert_nop_after_cbcond(); cti(); emit_data( op(call_op) | wdisp(intptr_t(d), intptr_t(pc()), 30), rspec); has_delay_slot(); assert(rspec.type() != relocInfo::virtual_call_type, "must use virtual_call_Relocation::spec"); }
inline void Assembler::casa( Register s1, Register s2, Register d, int ia ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(casa_op3 ) | rs1(s1) | (ia == -1 ? immed(true) : imm_asi(ia)) | rs2(s2)); }
inline void Assembler::casxa( Register s1, Register s2, Register d, int ia ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(casxa_op3) | rs1(s1) | (ia == -1 ? immed(true) : imm_asi(ia)) | rs2(s2)); }
inline void Assembler::udiv( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(udiv_op3 ) | rs1(s1) | rs2(s2)); }
inline void Assembler::udiv( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(udiv_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::sdiv( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sdiv_op3 ) | rs1(s1) | rs2(s2)); }
inline void Assembler::sdiv( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sdiv_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::udivcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(udiv_op3 | cc_bit_op3) | rs1(s1) | rs2(s2)); }
inline void Assembler::udivcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(udiv_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::sdivcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sdiv_op3 | cc_bit_op3) | rs1(s1) | rs2(s2)); }
inline void Assembler::sdivcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sdiv_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::done() { v9_only(); cti(); emit_int32( op(arith_op) | fcn(0) | op3(done_op3) ); }
inline void Assembler::retry() { v9_only(); cti(); emit_int32( op(arith_op) | fcn(1) | op3(retry_op3) ); }
inline void Assembler::fadd( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | fs1(s1, w) | opf(0x40 + w) | fs2(s2, w)); }
inline void Assembler::fsub( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | fs1(s1, w) | opf(0x44 + w) | fs2(s2, w)); }
inline void Assembler::fcmp( FloatRegisterImpl::Width w, CC cc, FloatRegister s1, FloatRegister s2) { emit_int32( op(arith_op) | cmpcc(cc) | op3(fpop2_op3) | fs1(s1, w) | opf(0x50 + w) | fs2(s2, w)); }
inline void Assembler::fcmpe( FloatRegisterImpl::Width w, CC cc, FloatRegister s1, FloatRegister s2) { emit_int32( op(arith_op) | cmpcc(cc) | op3(fpop2_op3) | fs1(s1, w) | opf(0x54 + w) | fs2(s2, w)); }
inline void Assembler::ftox( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { v9_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(fpop1_op3) | opf(0x80 + w) | fs2(s, w)); }
inline void Assembler::ftoi( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(fpop1_op3) | opf(0xd0 + w) | fs2(s, w)); }
inline void Assembler::ftof( FloatRegisterImpl::Width sw, FloatRegisterImpl::Width dw, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, dw) | op3(fpop1_op3) | opf(0xc0 + sw + dw*4) | fs2(s, sw)); }
inline void Assembler::fxtof( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { v9_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x80 + w*4) | fs2(s, FloatRegisterImpl::D)); }
inline void Assembler::fitof( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0xc0 + w*4) | fs2(s, FloatRegisterImpl::S)); }
inline void Assembler::fmov( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x00 + w) | fs2(s, w)); }
inline void Assembler::fneg( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x04 + w) | fs2(s, w)); }
inline void Assembler::fabs( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x08 + w) | fs2(s, w)); }
inline void Assembler::fmul( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | fs1(s1, w) | opf(0x48 + w) | fs2(s2, w)); }
inline void Assembler::fmul( FloatRegisterImpl::Width sw, FloatRegisterImpl::Width dw, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, dw) | op3(fpop1_op3) | fs1(s1, sw) | opf(0x60 + sw + dw*4) | fs2(s2, sw)); }
inline void Assembler::fdiv( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | fs1(s1, w) | opf(0x4c + w) | fs2(s2, w)); }
inline void Assembler::fxor( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(flog3_op3) | fs1(s1, w) | opf(0x6E - w) | fs2(s2, w)); }
inline void Assembler::fsqrt( FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d ) { emit_int32( op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x28 + w) | fs2(s, w)); }
inline void Assembler::flush( Register s1, Register s2) { emit_int32( op(arith_op) | op3(flush_op3) | rs1(s1) | rs2(s2)); } inline void Assembler::flush( Register s1, Register s2) { emit_int32( op(arith_op) | op3(flush_op3) | rs1(s1) | rs2(s2)); }
inline void Assembler::flush( Register s1, int simm13a) { emit_data( op(arith_op) | op3(flush_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); } inline void Assembler::flush( Register s1, int simm13a) { emit_data( op(arith_op) | op3(flush_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); }
inline void Assembler::flushw() { v9_only(); emit_int32( op(arith_op) | op3(flushw_op3) ); }
inline void Assembler::illtrap( int const22a) { if (const22a != 0) v9_only(); emit_int32( op(branch_op) | u_field(const22a, 21, 0) ); }
inline void Assembler::impdep1( int id1, int const19a ) { v9_only(); emit_int32( op(arith_op) | fcn(id1) | op3(impdep1_op3) | u_field(const19a, 18, 0)); }
inline void Assembler::impdep2( int id1, int const19a ) { v9_only(); emit_int32( op(arith_op) | fcn(id1) | op3(impdep2_op3) | u_field(const19a, 18, 0)); }
inline void Assembler::jmpl( Register s1, Register s2, Register d ) { insert_nop_after_cbcond(); cti(); emit_int32( op(arith_op) | rd(d) | op3(jmpl_op3) | rs1(s1) | rs2(s2)); has_delay_slot(); } inline void Assembler::jmpl( Register s1, Register s2, Register d ) { insert_nop_after_cbcond(); cti(); emit_int32( op(arith_op) | rd(d) | op3(jmpl_op3) | rs1(s1) | rs2(s2)); has_delay_slot(); }
inline void Assembler::jmpl( Register s1, int simm13a, Register d, RelocationHolder const& rspec ) { insert_nop_after_cbcond(); cti(); emit_data( op(arith_op) | rd(d) | op3(jmpl_op3) | rs1(s1) | immed(true) | simm(simm13a, 13), rspec); has_delay_slot(); } inline void Assembler::jmpl( Register s1, int simm13a, Register d, RelocationHolder const& rspec ) { insert_nop_after_cbcond(); cti(); emit_data( op(arith_op) | rd(d) | op3(jmpl_op3) | rs1(s1) | immed(true) | simm(simm13a, 13), rspec); has_delay_slot(); }
@@ -88,6 +170,9 @@ inline void Assembler::ldf(FloatRegisterImpl::Width w, Register s1, int simm13a,
inline void Assembler::ldxfsr( Register s1, Register s2) { v9_only(); emit_int32( op(ldst_op) | rd(G1) | op3(ldfsr_op3) | rs1(s1) | rs2(s2) ); } inline void Assembler::ldxfsr( Register s1, Register s2) { v9_only(); emit_int32( op(ldst_op) | rd(G1) | op3(ldfsr_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::ldxfsr( Register s1, int simm13a) { v9_only(); emit_data( op(ldst_op) | rd(G1) | op3(ldfsr_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); } inline void Assembler::ldxfsr( Register s1, int simm13a) { v9_only(); emit_data( op(ldst_op) | rd(G1) | op3(ldfsr_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); }
inline void Assembler::ldfa( FloatRegisterImpl::Width w, Register s1, Register s2, int ia, FloatRegister d ) { v9_only(); emit_int32( op(ldst_op) | fd(d, w) | alt_op3(ldf_op3 | alt_bit_op3, w) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::ldfa( FloatRegisterImpl::Width w, Register s1, int simm13a, FloatRegister d ) { v9_only(); emit_int32( op(ldst_op) | fd(d, w) | alt_op3(ldf_op3 | alt_bit_op3, w) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::ldsb( Register s1, Register s2, Register d) { emit_int32( op(ldst_op) | rd(d) | op3(ldsb_op3) | rs1(s1) | rs2(s2) ); } inline void Assembler::ldsb( Register s1, Register s2, Register d) { emit_int32( op(ldst_op) | rd(d) | op3(ldsb_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::ldsb( Register s1, int simm13a, Register d) { emit_data( op(ldst_op) | rd(d) | op3(ldsb_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); } inline void Assembler::ldsb( Register s1, int simm13a, Register d) { emit_data( op(ldst_op) | rd(d) | op3(ldsb_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); }
@@ -107,11 +192,134 @@ inline void Assembler::ldx( Register s1, int simm13a, Register d) { v9_only();
inline void Assembler::ldd( Register s1, Register s2, Register d) { v9_dep(); assert(d->is_even(), "not even"); emit_int32( op(ldst_op) | rd(d) | op3(ldd_op3) | rs1(s1) | rs2(s2) ); } inline void Assembler::ldd( Register s1, Register s2, Register d) { v9_dep(); assert(d->is_even(), "not even"); emit_int32( op(ldst_op) | rd(d) | op3(ldd_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::ldd( Register s1, int simm13a, Register d) { v9_dep(); assert(d->is_even(), "not even"); emit_data( op(ldst_op) | rd(d) | op3(ldd_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); } inline void Assembler::ldd( Register s1, int simm13a, Register d) { v9_dep(); assert(d->is_even(), "not even"); emit_data( op(ldst_op) | rd(d) | op3(ldd_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); }
inline void Assembler::ldsba( Register s1, Register s2, int ia, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(ldsb_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::ldsba( Register s1, int simm13a, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(ldsb_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::ldsha( Register s1, Register s2, int ia, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(ldsh_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::ldsha( Register s1, int simm13a, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(ldsh_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::ldswa( Register s1, Register s2, int ia, Register d ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(ldsw_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::ldswa( Register s1, int simm13a, Register d ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(ldsw_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::lduba( Register s1, Register s2, int ia, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(ldub_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::lduba( Register s1, int simm13a, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(ldub_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::lduha( Register s1, Register s2, int ia, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(lduh_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::lduha( Register s1, int simm13a, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(lduh_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::lduwa( Register s1, Register s2, int ia, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(lduw_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::lduwa( Register s1, int simm13a, Register d ) { emit_int32( op(ldst_op) | rd(d) | op3(lduw_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::ldxa( Register s1, Register s2, int ia, Register d ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(ldx_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::ldxa( Register s1, int simm13a, Register d ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(ldx_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::and3( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(and_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::and3( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(and_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::andcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(and_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::andcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(and_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::andn( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(andn_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::andn( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(andn_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::andncc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(andn_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::andncc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(andn_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::or3( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(or_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::or3( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(or_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::orcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(or_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::orcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(or_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::orn( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(orn_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::orn( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(orn_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::orncc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(orn_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::orncc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(orn_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::xor3( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xor_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::xor3( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xor_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::xorcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xor_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::xorcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xor_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::xnor( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xnor_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::xnor( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xnor_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::xnorcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xnor_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::xnorcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(xnor_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::membar( Membar_mask_bits const7a ) { v9_only(); emit_int32( op(arith_op) | op3(membar_op3) | rs1(O7) | immed(true) | u_field( int(const7a), 6, 0)); }
inline void Assembler::fmov( FloatRegisterImpl::Width w, Condition c, bool floatCC, CC cca, FloatRegister s2, FloatRegister d ) { v9_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fpop2_op3) | cond_mov(c) | opf_cc(cca, floatCC) | opf_low6(w) | fs2(s2, w)); }
inline void Assembler::fmov( FloatRegisterImpl::Width w, RCondition c, Register s1, FloatRegister s2, FloatRegister d ) { v9_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fpop2_op3) | rs1(s1) | rcond(c) | opf_low5(4 + w) | fs2(s2, w)); }
inline void Assembler::movcc( Condition c, bool floatCC, CC cca, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(movcc_op3) | mov_cc(cca, floatCC) | cond_mov(c) | rs2(s2) ); }
inline void Assembler::movcc( Condition c, bool floatCC, CC cca, int simm11a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(movcc_op3) | mov_cc(cca, floatCC) | cond_mov(c) | immed(true) | simm(simm11a, 11) ); }
inline void Assembler::movr( RCondition c, Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(movr_op3) | rs1(s1) | rcond(c) | rs2(s2) ); }
inline void Assembler::movr( RCondition c, Register s1, int simm10a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(movr_op3) | rs1(s1) | rcond(c) | immed(true) | simm(simm10a, 10) ); }
inline void Assembler::mulx( Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(mulx_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::mulx( Register s1, int simm13a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(mulx_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::sdivx( Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(sdivx_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::sdivx( Register s1, int simm13a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(sdivx_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::udivx( Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(udivx_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::udivx( Register s1, int simm13a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(udivx_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::umul( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(umul_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::umul( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(umul_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::smul( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(smul_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::smul( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(smul_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::umulcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(umul_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::umulcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(umul_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::smulcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(smul_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::smulcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(smul_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::nop() { emit_int32( op(branch_op) | op2(sethi_op2) ); }
inline void Assembler::sw_count() { emit_int32( op(branch_op) | op2(sethi_op2) | 0x3f0 ); }
inline void Assembler::popc( Register s, Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(popc_op3) | rs2(s)); }
inline void Assembler::popc( int simm13a, Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(popc_op3) | immed(true) | simm(simm13a, 13)); }
inline void Assembler::prefetch( Register s1, Register s2, PrefetchFcn f) { v9_only(); emit_int32( op(ldst_op) | fcn(f) | op3(prefetch_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::prefetch( Register s1, int simm13a, PrefetchFcn f) { v9_only(); emit_data( op(ldst_op) | fcn(f) | op3(prefetch_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); }
inline void Assembler::prefetcha( Register s1, Register s2, int ia, PrefetchFcn f ) { v9_only(); emit_int32( op(ldst_op) | fcn(f) | op3(prefetch_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::prefetcha( Register s1, int simm13a, PrefetchFcn f ) { v9_only(); emit_int32( op(ldst_op) | fcn(f) | op3(prefetch_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::rdy( Register d) { v9_dep(); emit_int32( op(arith_op) | rd(d) | op3(rdreg_op3) | u_field(0, 18, 14)); }
inline void Assembler::rdccr( Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(rdreg_op3) | u_field(2, 18, 14)); }
inline void Assembler::rdasi( Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(rdreg_op3) | u_field(3, 18, 14)); }
inline void Assembler::rdtick( Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(rdreg_op3) | u_field(4, 18, 14)); } // Spoon!
inline void Assembler::rdpc( Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(rdreg_op3) | u_field(5, 18, 14)); }
inline void Assembler::rdfprs( Register d) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(rdreg_op3) | u_field(6, 18, 14)); }
inline void Assembler::rett( Register s1, Register s2 ) { cti(); emit_int32( op(arith_op) | op3(rett_op3) | rs1(s1) | rs2(s2)); has_delay_slot(); } inline void Assembler::rett( Register s1, Register s2 ) { cti(); emit_int32( op(arith_op) | op3(rett_op3) | rs1(s1) | rs2(s2)); has_delay_slot(); }
inline void Assembler::rett( Register s1, int simm13a, relocInfo::relocType rt) { cti(); emit_data( op(arith_op) | op3(rett_op3) | rs1(s1) | immed(true) | simm(simm13a, 13), rt); has_delay_slot(); } inline void Assembler::rett( Register s1, int simm13a, relocInfo::relocType rt) { cti(); emit_data( op(arith_op) | op3(rett_op3) | rs1(s1) | immed(true) | simm(simm13a, 13), rt); has_delay_slot(); }
inline void Assembler::save( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(save_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::save( Register s1, int simm13a, Register d ) {
// make sure frame is at least large enough for the register save area
assert(-simm13a >= 16 * wordSize, "frame too small");
emit_int32( op(arith_op) | rd(d) | op3(save_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) );
}
inline void Assembler::restore( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(restore_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::restore( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(restore_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
// pp 216
inline void Assembler::saved() { v9_only(); emit_int32( op(arith_op) | fcn(0) | op3(saved_op3)); }
inline void Assembler::restored() { v9_only(); emit_int32( op(arith_op) | fcn(1) | op3(saved_op3)); }
inline void Assembler::sethi( int imm22a, Register d, RelocationHolder const& rspec ) { emit_data( op(branch_op) | rd(d) | op2(sethi_op2) | hi22(imm22a), rspec); } inline void Assembler::sethi( int imm22a, Register d, RelocationHolder const& rspec ) { emit_data( op(branch_op) | rd(d) | op2(sethi_op2) | hi22(imm22a), rspec); }
inline void Assembler::sll( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sll_op3) | rs1(s1) | sx(0) | rs2(s2) ); }
inline void Assembler::sll( Register s1, int imm5a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sll_op3) | rs1(s1) | sx(0) | immed(true) | u_field(imm5a, 4, 0) ); }
inline void Assembler::srl( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(srl_op3) | rs1(s1) | sx(0) | rs2(s2) ); }
inline void Assembler::srl( Register s1, int imm5a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(srl_op3) | rs1(s1) | sx(0) | immed(true) | u_field(imm5a, 4, 0) ); }
inline void Assembler::sra( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sra_op3) | rs1(s1) | sx(0) | rs2(s2) ); }
inline void Assembler::sra( Register s1, int imm5a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sra_op3) | rs1(s1) | sx(0) | immed(true) | u_field(imm5a, 4, 0) ); }
inline void Assembler::sllx( Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(sll_op3) | rs1(s1) | sx(1) | rs2(s2) ); }
inline void Assembler::sllx( Register s1, int imm6a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(sll_op3) | rs1(s1) | sx(1) | immed(true) | u_field(imm6a, 5, 0) ); }
inline void Assembler::srlx( Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(srl_op3) | rs1(s1) | sx(1) | rs2(s2) ); }
inline void Assembler::srlx( Register s1, int imm6a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(srl_op3) | rs1(s1) | sx(1) | immed(true) | u_field(imm6a, 5, 0) ); }
inline void Assembler::srax( Register s1, Register s2, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(sra_op3) | rs1(s1) | sx(1) | rs2(s2) ); }
inline void Assembler::srax( Register s1, int imm6a, Register d ) { v9_only(); emit_int32( op(arith_op) | rd(d) | op3(sra_op3) | rs1(s1) | sx(1) | immed(true) | u_field(imm6a, 5, 0) ); }
inline void Assembler::sir( int simm13a ) { emit_int32( op(arith_op) | fcn(15) | op3(sir_op3) | immed(true) | simm(simm13a, 13)); }
// pp 221
inline void Assembler::stbar() { emit_int32( op(arith_op) | op3(membar_op3) | u_field(15, 18, 14)); }
// pp 222 // pp 222
inline void Assembler::stf( FloatRegisterImpl::Width w, FloatRegister d, Register s1, Register s2) { emit_int32( op(ldst_op) | fd(d, w) | alt_op3(stf_op3, w) | rs1(s1) | rs2(s2) ); } inline void Assembler::stf( FloatRegisterImpl::Width w, FloatRegister d, Register s1, Register s2) { emit_int32( op(ldst_op) | fd(d, w) | alt_op3(stf_op3, w) | rs1(s1) | rs2(s2) ); }
@ -120,6 +328,9 @@ inline void Assembler::stf( FloatRegisterImpl::Width w, FloatRegister d, Regi
inline void Assembler::stxfsr( Register s1, Register s2) { v9_only(); emit_int32( op(ldst_op) | rd(G1) | op3(stfsr_op3) | rs1(s1) | rs2(s2) ); } inline void Assembler::stxfsr( Register s1, Register s2) { v9_only(); emit_int32( op(ldst_op) | rd(G1) | op3(stfsr_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::stxfsr( Register s1, int simm13a) { v9_only(); emit_data( op(ldst_op) | rd(G1) | op3(stfsr_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); } inline void Assembler::stxfsr( Register s1, int simm13a) { v9_only(); emit_data( op(ldst_op) | rd(G1) | op3(stfsr_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); }
inline void Assembler::stfa( FloatRegisterImpl::Width w, FloatRegister d, Register s1, Register s2, int ia ) { v9_only(); emit_int32( op(ldst_op) | fd(d, w) | alt_op3(stf_op3 | alt_bit_op3, w) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::stfa( FloatRegisterImpl::Width w, FloatRegister d, Register s1, int simm13a ) { v9_only(); emit_int32( op(ldst_op) | fd(d, w) | alt_op3(stf_op3 | alt_bit_op3, w) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
// p 226 // p 226
inline void Assembler::stb( Register d, Register s1, Register s2) { emit_int32( op(ldst_op) | rd(d) | op3(stb_op3) | rs1(s1) | rs2(s2) ); } inline void Assembler::stb( Register d, Register s1, Register s2) { emit_int32( op(ldst_op) | rd(d) | op3(stb_op3) | rs1(s1) | rs2(s2) ); }
@ -135,9 +346,103 @@ inline void Assembler::stx( Register d, Register s1, int simm13a) { v9_only();
inline void Assembler::std( Register d, Register s1, Register s2) { v9_dep(); assert(d->is_even(), "not even"); emit_int32( op(ldst_op) | rd(d) | op3(std_op3) | rs1(s1) | rs2(s2) ); } inline void Assembler::std( Register d, Register s1, Register s2) { v9_dep(); assert(d->is_even(), "not even"); emit_int32( op(ldst_op) | rd(d) | op3(std_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::std( Register d, Register s1, int simm13a) { v9_dep(); assert(d->is_even(), "not even"); emit_data( op(ldst_op) | rd(d) | op3(std_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); } inline void Assembler::std( Register d, Register s1, int simm13a) { v9_dep(); assert(d->is_even(), "not even"); emit_data( op(ldst_op) | rd(d) | op3(std_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); }
inline void Assembler::stba( Register d, Register s1, Register s2, int ia ) { emit_int32( op(ldst_op) | rd(d) | op3(stb_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::stba( Register d, Register s1, int simm13a ) { emit_int32( op(ldst_op) | rd(d) | op3(stb_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::stha( Register d, Register s1, Register s2, int ia ) { emit_int32( op(ldst_op) | rd(d) | op3(sth_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::stha( Register d, Register s1, int simm13a ) { emit_int32( op(ldst_op) | rd(d) | op3(sth_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::stwa( Register d, Register s1, Register s2, int ia ) { emit_int32( op(ldst_op) | rd(d) | op3(stw_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::stwa( Register d, Register s1, int simm13a ) { emit_int32( op(ldst_op) | rd(d) | op3(stw_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::stxa( Register d, Register s1, Register s2, int ia ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(stx_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::stxa( Register d, Register s1, int simm13a ) { v9_only(); emit_int32( op(ldst_op) | rd(d) | op3(stx_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::stda( Register d, Register s1, Register s2, int ia ) { emit_int32( op(ldst_op) | rd(d) | op3(std_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::stda( Register d, Register s1, int simm13a ) { emit_int32( op(ldst_op) | rd(d) | op3(std_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
// pp 230
inline void Assembler::sub( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sub_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::sub( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sub_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::subcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sub_op3 | cc_bit_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::subcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(sub_op3 | cc_bit_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::subc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(subc_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::subc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(subc_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
inline void Assembler::subccc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(subc_op3 | cc_bit_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::subccc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(subc_op3 | cc_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
// pp 231 // pp 231
inline void Assembler::swap( Register s1, Register s2, Register d) { v9_dep(); emit_int32( op(ldst_op) | rd(d) | op3(swap_op3) | rs1(s1) | rs2(s2) ); } inline void Assembler::swap( Register s1, Register s2, Register d) { v9_dep(); emit_int32( op(ldst_op) | rd(d) | op3(swap_op3) | rs1(s1) | rs2(s2) ); }
inline void Assembler::swap( Register s1, int simm13a, Register d) { v9_dep(); emit_data( op(ldst_op) | rd(d) | op3(swap_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); } inline void Assembler::swap( Register s1, int simm13a, Register d) { v9_dep(); emit_data( op(ldst_op) | rd(d) | op3(swap_op3) | rs1(s1) | immed(true) | simm(simm13a, 13)); }
inline void Assembler::swapa( Register s1, Register s2, int ia, Register d ) { v9_dep(); emit_int32( op(ldst_op) | rd(d) | op3(swap_op3 | alt_bit_op3) | rs1(s1) | imm_asi(ia) | rs2(s2) ); }
inline void Assembler::swapa( Register s1, int simm13a, Register d ) { v9_dep(); emit_int32( op(ldst_op) | rd(d) | op3(swap_op3 | alt_bit_op3) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
// pp 234, note op in book is wrong, see pp 268
inline void Assembler::taddcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(taddcc_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::taddcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(taddcc_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
// pp 235
inline void Assembler::tsubcc( Register s1, Register s2, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(tsubcc_op3 ) | rs1(s1) | rs2(s2) ); }
inline void Assembler::tsubcc( Register s1, int simm13a, Register d ) { emit_int32( op(arith_op) | rd(d) | op3(tsubcc_op3 ) | rs1(s1) | immed(true) | simm(simm13a, 13) ); }
// pp 237
inline void Assembler::trap( Condition c, CC cc, Register s1, Register s2 ) { emit_int32( op(arith_op) | cond(c) | op3(trap_op3) | rs1(s1) | trapcc(cc) | rs2(s2)); }
inline void Assembler::trap( Condition c, CC cc, Register s1, int trapa ) { emit_int32( op(arith_op) | cond(c) | op3(trap_op3) | rs1(s1) | trapcc(cc) | immed(true) | u_field(trapa, 6, 0)); }
// simple uncond. trap
inline void Assembler::trap( int trapa ) { trap( always, icc, G0, trapa ); }
inline void Assembler::wry(Register d) { v9_dep(); emit_int32(op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(0, 29, 25)); }
inline void Assembler::wrccr(Register s) { v9_only(); emit_int32(op(arith_op) | rs1(s) | op3(wrreg_op3) | u_field(2, 29, 25)); }
inline void Assembler::wrccr(Register s, int simm13a) { v9_only(); emit_int32(op(arith_op) | rs1(s) | op3(wrreg_op3) | u_field(2, 29, 25) | immed(true) | simm(simm13a, 13)); }
inline void Assembler::wrasi(Register d) { v9_only(); emit_int32(op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(3, 29, 25)); }
// wrasi(d, imm) stores (d xor imm) to asi
inline void Assembler::wrasi(Register d, int simm13a) { v9_only(); emit_int32(op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(3, 29, 25) | immed(true) | simm(simm13a, 13)); }
inline void Assembler::wrfprs(Register d) { v9_only(); emit_int32(op(arith_op) | rs1(d) | op3(wrreg_op3) | u_field(6, 29, 25)); }
inline void Assembler::alignaddr( Register s1, Register s2, Register d ) { vis1_only(); emit_int32( op(arith_op) | rd(d) | op3(alignaddr_op3) | rs1(s1) | opf(alignaddr_opf) | rs2(s2)); }
inline void Assembler::faligndata( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(faligndata_op3) | fs1(s1, FloatRegisterImpl::D) | opf(faligndata_opf) | fs2(s2, FloatRegisterImpl::D)); }
inline void Assembler::fzero( FloatRegisterImpl::Width w, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fzero_op3) | opf(0x62 - w)); }
inline void Assembler::fsrc2( FloatRegisterImpl::Width w, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fsrc_op3) | opf(0x7A - w) | fs2(s2, w)); }
inline void Assembler::fnot1( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fnot_op3) | fs1(s1, w) | opf(0x6C - w)); }
inline void Assembler::fpmerge( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(0x36) | fs1(s1, FloatRegisterImpl::S) | opf(0x4b) | fs2(s2, FloatRegisterImpl::S)); }
inline void Assembler::stpartialf( Register s1, Register s2, FloatRegister d, int ia ) { vis1_only(); emit_int32( op(ldst_op) | fd(d, FloatRegisterImpl::D) | op3(stpartialf_op3) | rs1(s1) | imm_asi(ia) | rs2(s2)); }
// VIS2 instructions
inline void Assembler::edge8n( Register s1, Register s2, Register d ) { vis2_only(); emit_int32( op(arith_op) | rd(d) | op3(edge_op3) | rs1(s1) | opf(edge8n_opf) | rs2(s2)); }
inline void Assembler::bmask( Register s1, Register s2, Register d ) { vis2_only(); emit_int32( op(arith_op) | rd(d) | op3(bmask_op3) | rs1(s1) | opf(bmask_opf) | rs2(s2)); }
inline void Assembler::bshuffle( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis2_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(bshuffle_op3) | fs1(s1, FloatRegisterImpl::D) | opf(bshuffle_opf) | fs2(s2, FloatRegisterImpl::D)); }
// VIS3 instructions
inline void Assembler::movstosw( FloatRegister s, Register d ) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(mftoi_op3) | opf(mstosw_opf) | fs2(s, FloatRegisterImpl::S)); }
inline void Assembler::movstouw( FloatRegister s, Register d ) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(mftoi_op3) | opf(mstouw_opf) | fs2(s, FloatRegisterImpl::S)); }
inline void Assembler::movdtox( FloatRegister s, Register d ) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(mftoi_op3) | opf(mdtox_opf) | fs2(s, FloatRegisterImpl::D)); }
inline void Assembler::movwtos( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(mftoi_op3) | opf(mwtos_opf) | rs2(s)); }
inline void Assembler::movxtod( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(mftoi_op3) | opf(mxtod_opf) | rs2(s)); }
inline void Assembler::xmulx(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulx_opf) | rs2(s2)); }
inline void Assembler::xmulxhi(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2)); }
// Crypto SHA instructions
inline void Assembler::sha1() { sha1_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha1_opf)); }
inline void Assembler::sha256() { sha256_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha256_opf)); }
inline void Assembler::sha512() { sha512_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha512_opf)); }
// CRC32C instruction
inline void Assembler::crc32c( FloatRegister s1, FloatRegister s2, FloatRegister d ) { crc32c_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(crc32c_op3) | fs1(s1, FloatRegisterImpl::D) | opf(crc32c_opf) | fs2(s2, FloatRegisterImpl::D)); }
#endif // CPU_SPARC_VM_ASSEMBLER_SPARC_INLINE_HPP #endif // CPU_SPARC_VM_ASSEMBLER_SPARC_INLINE_HPP

View file

@ -30,6 +30,10 @@ const int BytesPerInstWord = 4;
const int StackAlignmentInBytes = (2*wordSize); const int StackAlignmentInBytes = (2*wordSize);
// Indicates whether the C calling conventions require that
// 32-bit integer argument values are extended to 64 bits.
const bool CCallingConventionRequiresIntsAsLongs = false;
#define SUPPORTS_NATIVE_CX8 #define SUPPORTS_NATIVE_CX8
// The expected size in bytes of a cache line, used to pad data structures. // The expected size in bytes of a cache line, used to pad data structures.

View file

@ -73,7 +73,7 @@ void CodeInstaller::pd_patch_MetaspaceConstant(int pc_offset, Handle constant, T
NativeMovConstReg32* move = nativeMovConstReg32_at(pc); NativeMovConstReg32* move = nativeMovConstReg32_at(pc);
narrowKlass narrowOop = record_narrow_metadata_reference(constant, CHECK); narrowKlass narrowOop = record_narrow_metadata_reference(constant, CHECK);
move->set_data((intptr_t)narrowOop); move->set_data((intptr_t)narrowOop);
TRACE_jvmci_3("relocating (narrow metaspace constant) at %p/%p", pc, narrowOop); TRACE_jvmci_3("relocating (narrow metaspace constant) at " PTR_FORMAT "/0x%x", p2i(pc), narrowOop);
#else #else
JVMCI_ERROR("compressed Klass* on 32bit"); JVMCI_ERROR("compressed Klass* on 32bit");
#endif #endif
@ -81,7 +81,7 @@ void CodeInstaller::pd_patch_MetaspaceConstant(int pc_offset, Handle constant, T
NativeMovConstReg* move = nativeMovConstReg_at(pc); NativeMovConstReg* move = nativeMovConstReg_at(pc);
Metadata* reference = record_metadata_reference(constant, CHECK); Metadata* reference = record_metadata_reference(constant, CHECK);
move->set_data((intptr_t)reference); move->set_data((intptr_t)reference);
TRACE_jvmci_3("relocating (metaspace constant) at %p/%p", pc, reference); TRACE_jvmci_3("relocating (metaspace constant) at " PTR_FORMAT "/" PTR_FORMAT, p2i(pc), p2i(reference));
} }
} }

View file

@ -181,19 +181,6 @@ void MacroAssembler::null_check(Register reg, int offset) {
// Ring buffer jumps // Ring buffer jumps
#ifndef PRODUCT
void MacroAssembler::ret( bool trace ) { if (trace) {
mov(I7, O7); // traceable register
JMP(O7, 2 * BytesPerInstWord);
} else {
jmpl( I7, 2 * BytesPerInstWord, G0 );
}
}
void MacroAssembler::retl( bool trace ) { if (trace) JMP(O7, 2 * BytesPerInstWord);
else jmpl( O7, 2 * BytesPerInstWord, G0 ); }
#endif /* PRODUCT */
void MacroAssembler::jmp2(Register r1, Register r2, const char* file, int line ) { void MacroAssembler::jmp2(Register r1, Register r2, const char* file, int line ) {
assert_not_delayed(); assert_not_delayed();
@ -758,8 +745,8 @@ void MacroAssembler::set_vm_result(Register oop_result) {
} }
void MacroAssembler::ic_call(address entry, bool emit_delay) {
  RelocationHolder rspec = virtual_call_Relocation::spec(pc());
void MacroAssembler::ic_call(address entry, bool emit_delay, jint method_index) {
  RelocationHolder rspec = virtual_call_Relocation::spec(pc(), method_index);
patchable_set((intptr_t)Universe::non_oop_word(), G5_inline_cache_reg); patchable_set((intptr_t)Universe::non_oop_word(), G5_inline_cache_reg);
relocate(rspec); relocate(rspec);
call(entry, relocInfo::none); call(entry, relocInfo::none);
@ -768,7 +755,6 @@ void MacroAssembler::ic_call(address entry, bool emit_delay) {
} }
} }
void MacroAssembler::card_table_write(jbyte* byte_map_base, void MacroAssembler::card_table_write(jbyte* byte_map_base,
Register tmp, Register obj) { Register tmp, Register obj) {
#ifdef _LP64 #ifdef _LP64

View file

@ -680,8 +680,8 @@ class MacroAssembler : public Assembler {
inline int get_pc( Register d ); inline int get_pc( Register d );
// Sparc shorthands(pp 85, V8 manual, pp 289 V9 manual) // Sparc shorthands(pp 85, V8 manual, pp 289 V9 manual)
inline void cmp( Register s1, Register s2 ) { subcc( s1, s2, G0 ); }
inline void cmp( Register s1, Register s2 );
inline void cmp( Register s1, int simm13a ) { subcc( s1, simm13a, G0 ); }
inline void cmp( Register s1, int simm13a );
inline void jmp( Register s1, Register s2 ); inline void jmp( Register s1, Register s2 );
inline void jmp( Register s1, int simm13a, RelocationHolder const& rspec = RelocationHolder() ); inline void jmp( Register s1, int simm13a, RelocationHolder const& rspec = RelocationHolder() );
@ -689,7 +689,11 @@ class MacroAssembler : public Assembler {
// Check if the call target is out of wdisp30 range (relative to the code cache) // Check if the call target is out of wdisp30 range (relative to the code cache)
static inline bool is_far_target(address d); static inline bool is_far_target(address d);
inline void call( address d, relocInfo::relocType rt = relocInfo::runtime_call_type ); inline void call( address d, relocInfo::relocType rt = relocInfo::runtime_call_type );
inline void call( address d, RelocationHolder const& rspec);
inline void call( Label& L, relocInfo::relocType rt = relocInfo::runtime_call_type ); inline void call( Label& L, relocInfo::relocType rt = relocInfo::runtime_call_type );
inline void call( Label& L, RelocationHolder const& rspec);
inline void callr( Register s1, Register s2 ); inline void callr( Register s1, Register s2 );
inline void callr( Register s1, int simm13a, RelocationHolder const& rspec = RelocationHolder() ); inline void callr( Register s1, int simm13a, RelocationHolder const& rspec = RelocationHolder() );
@ -697,23 +701,10 @@ class MacroAssembler : public Assembler {
inline void iprefetch( address d, relocInfo::relocType rt = relocInfo::none ); inline void iprefetch( address d, relocInfo::relocType rt = relocInfo::none );
inline void iprefetch( Label& L); inline void iprefetch( Label& L);
inline void tst( Register s ) { orcc( G0, s, G0 ); }

#ifdef PRODUCT
inline void ret( bool trace = TraceJumps ) { if (trace) {
                                               mov(I7, O7); // traceable register
                                               JMP(O7, 2 * BytesPerInstWord);
                                             } else {
                                               jmpl( I7, 2 * BytesPerInstWord, G0 );
                                             }
                                           }
inline void retl( bool trace = TraceJumps ) { if (trace) JMP(O7, 2 * BytesPerInstWord);
                                              else jmpl( O7, 2 * BytesPerInstWord, G0 ); }
#else
void ret( bool trace = TraceJumps );
void retl( bool trace = TraceJumps );
#endif /* PRODUCT */

inline void tst( Register s );
inline void ret( bool trace = TraceJumps );
inline void retl( bool trace = TraceJumps );
// Required platform-specific helpers for Label::patch_instructions. // Required platform-specific helpers for Label::patch_instructions.
// They _shadow_ the declarations in AbstractAssembler, which are undefined. // They _shadow_ the declarations in AbstractAssembler, which are undefined.
@ -746,26 +737,20 @@ public:
static int insts_for_set64(jlong value); static int insts_for_set64(jlong value);
// sign-extend 32 to 64 // sign-extend 32 to 64
inline void signx( Register s, Register d ) { sra( s, G0, d); }
inline void signx( Register s, Register d );
inline void signx( Register d )             { sra( d, G0, d); }
inline void signx( Register d );
inline void not1( Register s, Register d )  { xnor( s, G0, d ); }
inline void not1( Register s, Register d );
inline void not1( Register d )              { xnor( d, G0, d ); }
inline void not1( Register d );
inline void neg( Register s, Register d )   { sub( G0, s, d ); }
inline void neg( Register s, Register d );
inline void neg( Register d )               { sub( G0, d, d ); }
inline void neg( Register d );
inline void cas(  Register s1, Register s2, Register d)  { casa( s1, s2, d, ASI_PRIMARY); }
inline void cas(  Register s1, Register s2, Register d);
inline void casx( Register s1, Register s2, Register d)  { casxa(s1, s2, d, ASI_PRIMARY); }
inline void casx( Register s1, Register s2, Register d);
// Functions for isolating 64 bit atomic swaps for LP64
// cas_ptr will perform cas for 32 bit VM's and casx for 64 bit VM's
inline void cas_ptr( Register s1, Register s2, Register d) {
#ifdef _LP64
  casx( s1, s2, d );
#else
  cas( s1, s2, d );
#endif
}
inline void cas_ptr( Register s1, Register s2, Register d);
// Functions for isolating 64 bit shifts for LP64 // Functions for isolating 64 bit shifts for LP64
inline void sll_ptr( Register s1, Register s2, Register d ); inline void sll_ptr( Register s1, Register s2, Register d );
@ -775,14 +760,14 @@ public:
inline void srl_ptr( Register s1, int imm6a, Register d ); inline void srl_ptr( Register s1, int imm6a, Register d );
// little-endian // little-endian
inline void casl(  Register s1, Register s2, Register d) { casa( s1, s2, d, ASI_PRIMARY_LITTLE); }
inline void casl(  Register s1, Register s2, Register d);
inline void casxl( Register s1, Register s2, Register d) { casxa(s1, s2, d, ASI_PRIMARY_LITTLE); }
inline void casxl( Register s1, Register s2, Register d);
inline void inc(   Register d, int const13 = 1 ) { add(   d, const13, d); }
inline void inc(   Register d, int const13 = 1 );
inline void inccc( Register d, int const13 = 1 ) { addcc( d, const13, d); }
inline void inccc( Register d, int const13 = 1 );
inline void dec(   Register d, int const13 = 1 ) { sub(   d, const13, d); }
inline void dec(   Register d, int const13 = 1 );
inline void deccc( Register d, int const13 = 1 ) { subcc( d, const13, d); }
inline void deccc( Register d, int const13 = 1 );
using Assembler::add; using Assembler::add;
inline void add(Register s1, int simm13a, Register d, relocInfo::relocType rtype); inline void add(Register s1, int simm13a, Register d, relocInfo::relocType rtype);
@ -793,19 +778,19 @@ public:
using Assembler::andn; using Assembler::andn;
inline void andn( Register s1, RegisterOrConstant s2, Register d); inline void andn( Register s1, RegisterOrConstant s2, Register d);
inline void btst( Register s1, Register s2 ) { andcc( s1, s2, G0 ); }
inline void btst( Register s1, Register s2 );
inline void btst( int simm13a, Register s )  { andcc( s, simm13a, G0 ); }
inline void btst( int simm13a, Register s );
inline void bset( Register s1, Register s2 ) { or3( s1, s2, s2 ); }
inline void bset( Register s1, Register s2 );
inline void bset( int simm13a, Register s )  { or3( s, simm13a, s ); }
inline void bset( int simm13a, Register s );
inline void bclr( Register s1, Register s2 ) { andn( s1, s2, s2 ); }
inline void bclr( Register s1, Register s2 );
inline void bclr( int simm13a, Register s )  { andn( s, simm13a, s ); }
inline void bclr( int simm13a, Register s );
inline void btog( Register s1, Register s2 ) { xor3( s1, s2, s2 ); }
inline void btog( Register s1, Register s2 );
inline void btog( int simm13a, Register s )  { xor3( s, simm13a, s ); }
inline void btog( int simm13a, Register s );
inline void clr( Register d ) { or3( G0, G0, d ); }
inline void clr( Register d );
inline void clrb( Register s1, Register s2); inline void clrb( Register s1, Register s2);
inline void clrh( Register s1, Register s2); inline void clrh( Register s1, Register s2);
@ -818,9 +803,9 @@ public:
inline void clrx( Register s1, int simm13a); inline void clrx( Register s1, int simm13a);
// copy & clear upper word // copy & clear upper word
inline void clruw( Register s, Register d ) { srl( s, G0, d); }
inline void clruw( Register s, Register d );
// clear upper word
inline void clruwu( Register d ) { srl( d, G0, d); }
inline void clruwu( Register d );
using Assembler::ldsb; using Assembler::ldsb;
using Assembler::ldsh; using Assembler::ldsh;
@ -864,10 +849,10 @@ public:
inline void ldf(FloatRegisterImpl::Width w, const Address& a, FloatRegister d, int offset = 0); inline void ldf(FloatRegisterImpl::Width w, const Address& a, FloatRegister d, int offset = 0);
// little-endian // little-endian
inline void lduwl(Register s1, Register s2, Register d) { lduwa(s1, s2, ASI_PRIMARY_LITTLE, d); }
inline void lduwl(Register s1, Register s2, Register d);
inline void ldswl(Register s1, Register s2, Register d) { ldswa(s1, s2, ASI_PRIMARY_LITTLE, d); }
inline void ldswl(Register s1, Register s2, Register d);
inline void ldxl( Register s1, Register s2, Register d) { ldxa(s1, s2, ASI_PRIMARY_LITTLE, d); }
inline void ldxl( Register s1, Register s2, Register d);
inline void ldfl(FloatRegisterImpl::Width w, Register s1, Register s2, FloatRegister d) { ldfa(w, s1, s2, ASI_PRIMARY_LITTLE, d); }
inline void ldfl(FloatRegisterImpl::Width w, Register s1, Register s2, FloatRegister d);
// membar pseudo instruction. takes into account target memory model.
inline void membar( Assembler::Membar_mask_bits const7a ); inline void membar( Assembler::Membar_mask_bits const7a );
@ -876,17 +861,11 @@ public:
inline bool membar_has_effect( Assembler::Membar_mask_bits const7a ); inline bool membar_has_effect( Assembler::Membar_mask_bits const7a );
// mov pseudo instructions // mov pseudo instructions
inline void mov( Register s, Register d) {
  if ( s != d ) or3( G0, s, d);
  else          assert_not_delayed();  // Put something useful in the delay slot!
}
inline void mov_or_nop( Register s, Register d) {
  if ( s != d ) or3( G0, s, d);
  else          nop();
}
inline void mov( int simm13a, Register d) { or3( G0, simm13a, d); }

inline void mov( Register s, Register d);
inline void mov_or_nop( Register s, Register d);
inline void mov( int simm13a, Register d);
using Assembler::prefetch; using Assembler::prefetch;
inline void prefetch(const Address& a, PrefetchFcn F, int offset = 0); inline void prefetch(const Address& a, PrefetchFcn F, int offset = 0);
@ -961,11 +940,7 @@ public:
// handy macros: // handy macros:
inline void round_to( Register r, int modulus ) {
  assert_not_delayed();
  inc( r, modulus - 1 );
  and3( r, -modulus, r );
}
inline void round_to( Register r, int modulus );
// -------------------------------------------------- // --------------------------------------------------
@ -1033,9 +1008,9 @@ public:
// These are idioms to flag the need for care with accessing bools but on // These are idioms to flag the need for care with accessing bools but on
// this platform we assume byte size // this platform we assume byte size
inline void stbool(Register d, const Address& a) { stb(d, a); }
inline void stbool(Register d, const Address& a);
inline void ldbool(const Address& a, Register d) { ldub(a, d); }
inline void ldbool(const Address& a, Register d);
inline void movbool( bool boolconst, Register d) { mov( (int) boolconst, d); }
inline void movbool( bool boolconst, Register d);
// klass oop manipulations if compressed // klass oop manipulations if compressed
void load_klass(Register src_oop, Register klass); void load_klass(Register src_oop, Register klass);
@ -1106,7 +1081,7 @@ public:
void set_vm_result(Register oop_result); void set_vm_result(Register oop_result);
// Emit the CompiledIC call idiom // Emit the CompiledIC call idiom
void ic_call(address entry, bool emit_delay = true);
void ic_call(address entry, bool emit_delay = true, jint method_index = 0);
// if call_VM_base was called with check_exceptions=false, then call // if call_VM_base was called with check_exceptions=false, then call
// check_and_forward_exception to handle exceptions when it is safe // check_and_forward_exception to handle exceptions when it is safe
@ -1371,12 +1346,7 @@ public:
// Stack overflow checking // Stack overflow checking
// Note: this clobbers G3_scratch // Note: this clobbers G3_scratch
void bang_stack_with_offset(int offset) {
  // stack grows down, caller passes positive offset
  assert(offset > 0, "must bang with negative offset");
  set((-offset)+STACK_BIAS, G3_scratch);
  st(G0, SP, G3_scratch);
}
inline void bang_stack_with_offset(int offset);
// Writes to stack successive pages until offset reached to check for // Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages. Clobbers tsp and scratch registers. // stack overflow + shadow pages. Clobbers tsp and scratch registers.

View file

@ -187,6 +187,33 @@ inline void MacroAssembler::st_long( Register d, const Address& a, int offset )
#endif #endif
} }
inline void MacroAssembler::stbool(Register d, const Address& a) { stb(d, a); }
inline void MacroAssembler::ldbool(const Address& a, Register d) { ldub(a, d); }
inline void MacroAssembler::movbool( bool boolconst, Register d) { mov( (int) boolconst, d); }
inline void MacroAssembler::signx( Register s, Register d ) { sra( s, G0, d); }
inline void MacroAssembler::signx( Register d ) { sra( d, G0, d); }
inline void MacroAssembler::not1( Register s, Register d ) { xnor( s, G0, d ); }
inline void MacroAssembler::not1( Register d ) { xnor( d, G0, d ); }
inline void MacroAssembler::neg( Register s, Register d ) { sub( G0, s, d ); }
inline void MacroAssembler::neg( Register d ) { sub( G0, d, d ); }
inline void MacroAssembler::cas( Register s1, Register s2, Register d) { casa( s1, s2, d, ASI_PRIMARY); }
inline void MacroAssembler::casx( Register s1, Register s2, Register d) { casxa(s1, s2, d, ASI_PRIMARY); }
// Functions for isolating 64 bit atomic swaps for LP64
// cas_ptr will perform cas for 32 bit VM's and casx for 64 bit VM's
inline void MacroAssembler::cas_ptr( Register s1, Register s2, Register d) {
#ifdef _LP64
casx( s1, s2, d );
#else
cas( s1, s2, d );
#endif
}
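// For context, a minimal usage sketch (not part of this change) of building a
// pointer-sized compare-and-swap on top of cas_ptr. The register names Raddr,
// Rexpected and Rnew_value, and the choice of O2 as scratch, are illustrative
// assumptions only; register roles follow the SPARC cas/casx convention
// (s1 = address, s2 = expected value, d = new value in / old value out).
//
//   Label success;
//   __ mov(Rnew_value, O2);            // O2: value to install; receives the old value
//   __ cas_ptr(Raddr, Rexpected, O2);  // atomically: if ([Raddr] == Rexpected) swap [Raddr] and O2
//   __ cmp(Rexpected, O2);             // O2 == Rexpected  <=>  the swap happened
//   __ brx(Assembler::equal, false, Assembler::pt, success);
//   __ delayed()->nop();
//   // CAS failed: O2 holds the value actually found at [Raddr]
//   __ bind(success);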
// Functions for isolating 64 bit shifts for LP64 // Functions for isolating 64 bit shifts for LP64
inline void MacroAssembler::sll_ptr( Register s1, Register s2, Register d ) { inline void MacroAssembler::sll_ptr( Register s1, Register s2, Register d ) {
@ -226,6 +253,15 @@ inline void MacroAssembler::sll_ptr( Register s1, RegisterOrConstant s2, Registe
else sll_ptr(s1, s2.as_constant(), d); else sll_ptr(s1, s2.as_constant(), d);
} }
inline void MacroAssembler::casl( Register s1, Register s2, Register d) { casa( s1, s2, d, ASI_PRIMARY_LITTLE); }
inline void MacroAssembler::casxl( Register s1, Register s2, Register d) { casxa(s1, s2, d, ASI_PRIMARY_LITTLE); }
inline void MacroAssembler::inc( Register d, int const13 ) { add( d, const13, d); }
inline void MacroAssembler::inccc( Register d, int const13 ) { addcc( d, const13, d); }
inline void MacroAssembler::dec( Register d, int const13 ) { sub( d, const13, d); }
inline void MacroAssembler::deccc( Register d, int const13 ) { subcc( d, const13, d); }
// Use the right branch for the platform // Use the right branch for the platform
inline void MacroAssembler::br( Condition c, bool a, Predict p, address d, relocInfo::relocType rt ) { inline void MacroAssembler::br( Condition c, bool a, Predict p, address d, relocInfo::relocType rt ) {
@ -298,6 +334,10 @@ inline bool MacroAssembler::is_far_target(address d) {
// expense of relocation and if we overflow the displacement // expense of relocation and if we overflow the displacement
// of the quick call instruction. // of the quick call instruction.
inline void MacroAssembler::call( address d, relocInfo::relocType rt ) { inline void MacroAssembler::call( address d, relocInfo::relocType rt ) {
MacroAssembler::call(d, Relocation::spec_simple(rt));
}
inline void MacroAssembler::call( address d, RelocationHolder const& rspec ) {
#ifdef _LP64 #ifdef _LP64
intptr_t disp; intptr_t disp;
// NULL is ok because it will be relocated later. // NULL is ok because it will be relocated later.
@ -309,14 +349,14 @@ inline void MacroAssembler::call( address d, relocInfo::relocType rt ) {
// Is this address within range of the call instruction? // Is this address within range of the call instruction?
// If not, use the expensive instruction sequence // If not, use the expensive instruction sequence
if (is_far_target(d)) { if (is_far_target(d)) {
relocate(rt);
relocate(rspec);
AddressLiteral dest(d); AddressLiteral dest(d);
jumpl_to(dest, O7, O7); jumpl_to(dest, O7, O7);
} else { } else {
Assembler::call(d, rt);
Assembler::call(d, rspec);
} }
#else #else
Assembler::call( d, rt );
Assembler::call( d, rspec );
#endif #endif
} }
@ -337,6 +377,24 @@ inline void MacroAssembler::iprefetch( address d, relocInfo::relocType rt ) {
} }
inline void MacroAssembler::iprefetch( Label& L) { iprefetch( target(L) ); } inline void MacroAssembler::iprefetch( Label& L) { iprefetch( target(L) ); }
inline void MacroAssembler::tst( Register s ) { orcc( G0, s, G0 ); }
inline void MacroAssembler::ret( bool trace ) {
if (trace) {
mov(I7, O7); // traceable register
JMP(O7, 2 * BytesPerInstWord);
} else {
jmpl( I7, 2 * BytesPerInstWord, G0 );
}
}
inline void MacroAssembler::retl( bool trace ) {
if (trace) {
JMP(O7, 2 * BytesPerInstWord);
} else {
jmpl( O7, 2 * BytesPerInstWord, G0 );
}
}
// clobbers o7 on V8!! // clobbers o7 on V8!!
// returns delta from gotten pc to addr after // returns delta from gotten pc to addr after
@ -346,6 +404,8 @@ inline int MacroAssembler::get_pc( Register d ) {
return offset() - x; return offset() - x;
} }
inline void MacroAssembler::cmp( Register s1, Register s2 ) { subcc( s1, s2, G0 ); }
inline void MacroAssembler::cmp( Register s1, int simm13a ) { subcc( s1, simm13a, G0 ); }
// Note: All MacroAssembler::set_foo functions are defined out-of-line. // Note: All MacroAssembler::set_foo functions are defined out-of-line.
@ -521,6 +581,12 @@ inline void MacroAssembler::store_long_argument( Register s, Argument& a ) {
} }
#endif #endif
inline void MacroAssembler::round_to( Register r, int modulus ) {
assert_not_delayed();
inc( r, modulus - 1 );
and3( r, -modulus, r );
}
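// Worked example (round_to assumes modulus is a power of two, since and3 with
// -modulus just clears the low bits): rounding r = 13 up to a multiple of 8.
//   inc( r, 8 - 1 )   ->  r = 20   (13 + 7)
//   and3( r, -8, r )  ->  r = 16   (20 & ...11111000 clears the low three bits)
// A value already a multiple of 8 is unchanged: 16 + 7 = 23, and 23 & -8 = 16.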
inline void MacroAssembler::add(Register s1, int simm13a, Register d, relocInfo::relocType rtype) { inline void MacroAssembler::add(Register s1, int simm13a, Register d, relocInfo::relocType rtype) {
relocate(rtype); relocate(rtype);
add(s1, simm13a, d); add(s1, simm13a, d);
@ -547,6 +613,20 @@ inline void MacroAssembler::andn(Register s1, RegisterOrConstant s2, Register d)
else andn(s1, s2.as_constant(), d); else andn(s1, s2.as_constant(), d);
} }
inline void MacroAssembler::btst( Register s1, Register s2 ) { andcc( s1, s2, G0 ); }
inline void MacroAssembler::btst( int simm13a, Register s ) { andcc( s, simm13a, G0 ); }
inline void MacroAssembler::bset( Register s1, Register s2 ) { or3( s1, s2, s2 ); }
inline void MacroAssembler::bset( int simm13a, Register s ) { or3( s, simm13a, s ); }
inline void MacroAssembler::bclr( Register s1, Register s2 ) { andn( s1, s2, s2 ); }
inline void MacroAssembler::bclr( int simm13a, Register s ) { andn( s, simm13a, s ); }
inline void MacroAssembler::btog( Register s1, Register s2 ) { xor3( s1, s2, s2 ); }
inline void MacroAssembler::btog( int simm13a, Register s ) { xor3( s, simm13a, s ); }
inline void MacroAssembler::clr( Register d ) { or3( G0, G0, d ); }
inline void MacroAssembler::clrb( Register s1, Register s2) { stb( G0, s1, s2 ); } inline void MacroAssembler::clrb( Register s1, Register s2) { stb( G0, s1, s2 ); }
inline void MacroAssembler::clrh( Register s1, Register s2) { sth( G0, s1, s2 ); } inline void MacroAssembler::clrh( Register s1, Register s2) { sth( G0, s1, s2 ); }
inline void MacroAssembler::clr( Register s1, Register s2) { stw( G0, s1, s2 ); } inline void MacroAssembler::clr( Register s1, Register s2) { stw( G0, s1, s2 ); }
@ -557,6 +637,9 @@ inline void MacroAssembler::clrh( Register s1, int simm13a) { sth( G0, s1, simm1
inline void MacroAssembler::clr( Register s1, int simm13a) { stw( G0, s1, simm13a); } inline void MacroAssembler::clr( Register s1, int simm13a) { stw( G0, s1, simm13a); }
inline void MacroAssembler::clrx( Register s1, int simm13a) { stx( G0, s1, simm13a); } inline void MacroAssembler::clrx( Register s1, int simm13a) { stx( G0, s1, simm13a); }
inline void MacroAssembler::clruw( Register s, Register d ) { srl( s, G0, d); }
inline void MacroAssembler::clruwu( Register d ) { srl( d, G0, d); }
#ifdef _LP64 #ifdef _LP64
// Make all 32 bit loads signed so 64 bit registers maintain proper sign // Make all 32 bit loads signed so 64 bit registers maintain proper sign
inline void MacroAssembler::ld( Register s1, Register s2, Register d) { ldsw( s1, s2, d); } inline void MacroAssembler::ld( Register s1, Register s2, Register d) { ldsw( s1, s2, d); }
@ -638,6 +721,11 @@ inline void MacroAssembler::ldf(FloatRegisterImpl::Width w, const Address& a, Fl
} }
} }
inline void MacroAssembler::lduwl(Register s1, Register s2, Register d) { lduwa(s1, s2, ASI_PRIMARY_LITTLE, d); }
inline void MacroAssembler::ldswl(Register s1, Register s2, Register d) { ldswa(s1, s2, ASI_PRIMARY_LITTLE, d);}
inline void MacroAssembler::ldxl( Register s1, Register s2, Register d) { ldxa(s1, s2, ASI_PRIMARY_LITTLE, d); }
inline void MacroAssembler::ldfl(FloatRegisterImpl::Width w, Register s1, Register s2, FloatRegister d) { ldfa(w, s1, s2, ASI_PRIMARY_LITTLE, d); }
// returns if membar generates anything, obviously this code should mirror // returns if membar generates anything, obviously this code should mirror
// membar below. // membar below.
inline bool MacroAssembler::membar_has_effect( Membar_mask_bits const7a ) { inline bool MacroAssembler::membar_has_effect( Membar_mask_bits const7a ) {
@ -664,6 +752,24 @@ inline void MacroAssembler::membar( Membar_mask_bits const7a ) {
} }
} }
inline void MacroAssembler::mov(Register s, Register d) {
if (s != d) {
or3(G0, s, d);
} else {
assert_not_delayed(); // Put something useful in the delay slot!
}
}
inline void MacroAssembler::mov_or_nop(Register s, Register d) {
if (s != d) {
or3(G0, s, d);
} else {
nop();
}
}
inline void MacroAssembler::mov( int simm13a, Register d) { or3( G0, simm13a, d); }
inline void MacroAssembler::prefetch(const Address& a, PrefetchFcn f, int offset) { inline void MacroAssembler::prefetch(const Address& a, PrefetchFcn f, int offset) {
relocate(a.rspec(offset)); relocate(a.rspec(offset));
assert(!a.has_index(), ""); assert(!a.has_index(), "");
@ -734,4 +840,11 @@ inline void MacroAssembler::swap(const Address& a, Register d, int offset) {
else { swap(a.base(), a.disp() + offset, d); } else { swap(a.base(), a.disp() + offset, d); }
} }
inline void MacroAssembler::bang_stack_with_offset(int offset) {
// stack grows down, caller passes positive offset
assert(offset > 0, "must bang with negative offset");
set((-offset)+STACK_BIAS, G3_scratch);
st(G0, SP, G3_scratch);
}
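// A hedged sketch (not part of this change) of how successive pages could be
// banged using the helper above; page_size and shadow_pages are stand-in names
// for values the VM would supply, not identifiers defined here.
//
//   static void bang_shadow_pages_sketch(MacroAssembler* masm, int page_size, int shadow_pages) {
//     for (int i = 1; i <= shadow_pages; i++) {
//       // each store lands one page further below SP (plus STACK_BIAS)
//       masm->bang_stack_with_offset(i * page_size);
//     }
//   }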
#endif // CPU_SPARC_VM_MACROASSEMBLER_SPARC_INLINE_HPP #endif // CPU_SPARC_VM_MACROASSEMBLER_SPARC_INLINE_HPP

View file

@ -131,8 +131,9 @@ bool NativeInstruction::is_load_store_with_small_offset(Register reg) {
void NativeCall::verify() { void NativeCall::verify() {
NativeInstruction::verify(); NativeInstruction::verify();
// make sure code pattern is actually a call instruction // make sure code pattern is actually a call instruction
if (!is_op(long_at(0), Assembler::call_op)) {
  fatal("not a call");
int x = long_at(0);
if (!is_op(x, Assembler::call_op)) {
fatal("not a call: 0x%x @ " INTPTR_FORMAT, x, p2i(instruction_address()));
} }
} }

View file

@ -1748,7 +1748,7 @@ static void save_or_restore_arguments(MacroAssembler* masm,
} }
// Check GC_locker::needs_gc and enter the runtime if it's true. This
// Check GCLocker::needs_gc and enter the runtime if it's true. This
// keeps a new JNI critical region from starting until a GC has been // keeps a new JNI critical region from starting until a GC has been
// forced. Save down any oops in registers and describe them in an // forced. Save down any oops in registers and describe them in an
// OopMap. // OopMap.
@ -1759,9 +1759,9 @@ static void check_needs_gc_for_critical_native(MacroAssembler* masm,
OopMapSet* oop_maps, OopMapSet* oop_maps,
VMRegPair* in_regs, VMRegPair* in_regs,
BasicType* in_sig_bt) { BasicType* in_sig_bt) {
__ block_comment("check GC_locker::needs_gc"); __ block_comment("check GCLocker::needs_gc");
Label cont; Label cont;
AddressLiteral sync_state(GC_locker::needs_gc_address());
AddressLiteral sync_state(GCLocker::needs_gc_address());
__ load_bool_contents(sync_state, G3_scratch); __ load_bool_contents(sync_state, G3_scratch);
__ cmp_zero_and_br(Assembler::equal, G3_scratch, cont); __ cmp_zero_and_br(Assembler::equal, G3_scratch, cont);
__ delayed()->nop(); __ delayed()->nop();
@ -1936,14 +1936,14 @@ static void gen_special_dispatch(MacroAssembler* masm,
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions. The wrapper is expected to unpack the arguments before // functions. The wrapper is expected to unpack the arguments before
// passing them to the callee and perform checks before and after the // passing them to the callee and perform checks before and after the
// native call to ensure that they GC_locker
// native call to ensure that they GCLocker
// lock_critical/unlock_critical semantics are followed. Some other // lock_critical/unlock_critical semantics are followed. Some other
// parts of JNI setup are skipped like the tear down of the JNI handle // parts of JNI setup are skipped like the tear down of the JNI handle
// block and the check for pending exceptions it's impossible for them // block and the check for pending exceptions it's impossible for them
// to be thrown. // to be thrown.
// //
// They are roughly structured like this: // They are roughly structured like this:
//   if (GC_locker::needs_gc())
//   if (GCLocker::needs_gc())
// SharedRuntime::block_for_jni_critical(); // SharedRuntime::block_for_jni_critical();
//   transition to thread_in_native
//   unpack array arguments and call native entry point
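// Read as a hedged control-flow sketch (only GCLocker::needs_gc and
// SharedRuntime::block_for_jni_critical are real entry points named above;
// the remaining names are placeholders, not the wrapper's actual code):
//
//   if (GCLocker::needs_gc()) {
//     SharedRuntime::block_for_jni_critical();   // stall until the forced GC has run
//   }
//   // transition to _thread_in_native (full JNI prolog is skipped)
//   result = native_entry(unpacked_array_arg_0, ..., unpacked_array_arg_n);
//   // transition back; pending exceptions cannot occur here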

View file

@ -1001,7 +1001,7 @@ void emit_form3_mem_reg(CodeBuffer &cbuf, PhaseRegAlloc* ra, const MachNode* n,
#endif #endif
} }
void emit_call_reloc(CodeBuffer &cbuf, intptr_t entry_point, relocInfo::relocType rtype, bool preserve_g2 = false) { void emit_call_reloc(CodeBuffer &cbuf, intptr_t entry_point, RelocationHolder const& rspec, bool preserve_g2 = false) {
// The method which records debug information at every safepoint // The method which records debug information at every safepoint
// expects the call to be the first instruction in the snippet as // expects the call to be the first instruction in the snippet as
// it creates a PcDesc structure which tracks the offset of a call // it creates a PcDesc structure which tracks the offset of a call
@ -1023,7 +1023,7 @@ void emit_call_reloc(CodeBuffer &cbuf, intptr_t entry_point, relocInfo::relocTyp
int startpos = __ offset(); int startpos = __ offset();
#endif /* ASSERT */ #endif /* ASSERT */
__ call((address)entry_point, rtype);
__ call((address)entry_point, rspec);
if (preserve_g2) __ delayed()->mov(G2, L7); if (preserve_g2) __ delayed()->mov(G2, L7);
else __ delayed()->nop(); else __ delayed()->nop();
@ -2598,8 +2598,7 @@ encode %{
enc_class Java_To_Runtime (method meth) %{ // CALL Java_To_Runtime enc_class Java_To_Runtime (method meth) %{ // CALL Java_To_Runtime
// CALL directly to the runtime // CALL directly to the runtime
// The user of this is responsible for ensuring that R_L7 is empty (killed). // The user of this is responsible for ensuring that R_L7 is empty (killed).
emit_call_reloc(cbuf, $meth$$method, relocInfo::runtime_call_type,
                /*preserve_g2=*/true);
emit_call_reloc(cbuf, $meth$$method, runtime_call_Relocation::spec(), /*preserve_g2=*/true);
%} %}
enc_class preserve_SP %{ enc_class preserve_SP %{
@ -2616,13 +2615,14 @@ encode %{
// CALL to fixup routine. Fixup routine uses ScopeDesc info to determine // CALL to fixup routine. Fixup routine uses ScopeDesc info to determine
// who we intended to call. // who we intended to call.
if (!_method) {
  emit_call_reloc(cbuf, $meth$$method, relocInfo::runtime_call_type);
} else if (_optimized_virtual) {
  emit_call_reloc(cbuf, $meth$$method, relocInfo::opt_virtual_call_type);
} else {
  emit_call_reloc(cbuf, $meth$$method, relocInfo::static_call_type);
}
if (_method) {  // Emit stub for static call.

if (!_method) {
  emit_call_reloc(cbuf, $meth$$method, runtime_call_Relocation::spec());
} else {
  int method_index = resolved_method_index(cbuf);
  RelocationHolder rspec = _optimized_virtual ? opt_virtual_call_Relocation::spec(method_index)
                                              : static_call_Relocation::spec(method_index);
  emit_call_reloc(cbuf, $meth$$method, rspec);
  // Emit stub for static call.
address stub = CompiledStaticCall::emit_to_interp_stub(cbuf); address stub = CompiledStaticCall::emit_to_interp_stub(cbuf);
// Stub does not fit into scratch buffer if TraceJumps is enabled // Stub does not fit into scratch buffer if TraceJumps is enabled
if (stub == NULL && !(TraceJumps && Compile::current()->in_scratch_emit_size())) { if (stub == NULL && !(TraceJumps && Compile::current()->in_scratch_emit_size())) {
@ -2643,7 +2643,7 @@ encode %{
Register G5_ic_reg = reg_to_register_object(Matcher::inline_cache_reg_encode()); Register G5_ic_reg = reg_to_register_object(Matcher::inline_cache_reg_encode());
assert(G5_ic_reg == G5_inline_cache_reg, "G5_inline_cache_reg used in assemble_ic_buffer_code()"); assert(G5_ic_reg == G5_inline_cache_reg, "G5_inline_cache_reg used in assemble_ic_buffer_code()");
assert(G5_ic_reg == G5_megamorphic_method, "G5_megamorphic_method used in megamorphic call stub"); assert(G5_ic_reg == G5_megamorphic_method, "G5_megamorphic_method used in megamorphic call stub");
__ ic_call((address)$meth$$method);
__ ic_call((address)$meth$$method, /*emit_delay=*/true, resolved_method_index(cbuf));
} else { } else {
assert(!UseInlineCaches, "expect vtable calls only if not using ICs"); assert(!UseInlineCaches, "expect vtable calls only if not using ICs");
// Just go thru the vtable // Just go thru the vtable

View file

@ -1,5 +1,5 @@
/* /*
 * Copyright (c) 2001, 2013, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -30,18 +30,9 @@
// referenced by vmStructs.cpp. // referenced by vmStructs.cpp.
#define VM_STRUCTS_CPU(nonstatic_field, static_field, unchecked_nonstatic_field, volatile_nonstatic_field, nonproduct_nonstatic_field, c2_nonstatic_field, unchecked_c1_static_field, unchecked_c2_static_field) \
  \
  /******************************/ \
  /* JavaCallWrapper            */ \
  /******************************/ \
  /******************************/ \
  /* JavaFrameAnchor            */ \
  /******************************/ \
  volatile_nonstatic_field(JavaFrameAnchor, _flags, int) \
  static_field(VM_Version, _features, int)

  volatile_nonstatic_field(JavaFrameAnchor, _flags, int)

#define VM_TYPES_CPU(declare_type, declare_toplevel_type, declare_oop_type, declare_integer_type, declare_unsigned_integer_type, declare_c1_toplevel_type, declare_c2_type, declare_c2_toplevel_type) \
  declare_toplevel_type(VM_Version)
#define VM_INT_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \ #define VM_INT_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
/******************************/ \ /******************************/ \

View file

@ -30,13 +30,10 @@
#include "runtime/stubCodeGenerator.hpp" #include "runtime/stubCodeGenerator.hpp"
#include "vm_version_sparc.hpp" #include "vm_version_sparc.hpp"
int VM_Version::_features = VM_Version::unknown_m;
const char* VM_Version::_features_str = "";
unsigned int VM_Version::_L2_data_cache_line_size = 0; unsigned int VM_Version::_L2_data_cache_line_size = 0;
void VM_Version::initialize() { void VM_Version::initialize() {
assert(_features != 0, "System pre-initialization is not complete.");
assert(_features != VM_Version::unknown_m, "System pre-initialization is not complete.");
guarantee(VM_Version::has_v9(), "only SPARC v9 is supported"); guarantee(VM_Version::has_v9(), "only SPARC v9 is supported");
PrefetchCopyIntervalInBytes = prefetch_copy_interval_in_bytes(); PrefetchCopyIntervalInBytes = prefetch_copy_interval_in_bytes();
@ -214,7 +211,7 @@ void VM_Version::initialize() {
(!has_hardware_fsmuld() ? ", no-fsmuld" : "")); (!has_hardware_fsmuld() ? ", no-fsmuld" : ""));
// buf is started with ", " or is empty // buf is started with ", " or is empty
_features_str = os::strdup(strlen(buf) > 2 ? buf + 2 : buf);
_features_string = os::strdup(strlen(buf) > 2 ? buf + 2 : buf);
// UseVIS is set to the smallest of what hardware supports and what // UseVIS is set to the smallest of what hardware supports and what
// the command line requires. I.e., you cannot set UseVIS to 3 on // the command line requires. I.e., you cannot set UseVIS to 3 on
@ -263,6 +260,11 @@ void VM_Version::initialize() {
} }
} }
if (UseAESCTRIntrinsics) {
warning("AES/CTR intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
}
// GHASH/GCM intrinsics // GHASH/GCM intrinsics
if (has_vis3() && (UseVIS > 2)) { if (has_vis3() && (UseVIS > 2)) {
if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
@ -357,6 +359,11 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseCRC32Intrinsics, false); FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
} }
if (UseVectorizedMismatchIntrinsic) {
warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
}
if (FLAG_IS_DEFAULT(ContendedPaddingWidth) && if (FLAG_IS_DEFAULT(ContendedPaddingWidth) &&
(cache_line_size > ContendedPaddingWidth)) (cache_line_size > ContendedPaddingWidth))
ContendedPaddingWidth = cache_line_size; ContendedPaddingWidth = cache_line_size;
@ -403,7 +410,7 @@ void VM_Version::initialize() {
} }
void VM_Version::print_features() { void VM_Version::print_features() {
tty->print_cr("Version:%s", cpu_features()); tty->print_cr("Version:%s", _features);
} }
int VM_Version::determine_features() { int VM_Version::determine_features() {
@ -439,7 +446,7 @@ int VM_Version::determine_features() {
return features; return features;
} }
static int saved_features = 0;
static uint64_t saved_features = 0;
void VM_Version::allow_all() { void VM_Version::allow_all() {
saved_features = _features; saved_features = _features;

View file

@ -30,6 +30,8 @@
class VM_Version: public Abstract_VM_Version { class VM_Version: public Abstract_VM_Version {
friend class VMStructs; friend class VMStructs;
friend class JVMCIVMStructs;
protected: protected:
enum Feature_Flag { enum Feature_Flag {
v8_instructions = 0, v8_instructions = 0,
@ -96,9 +98,6 @@ protected:
niagara1_m = generic_v9_m | niagara1_unique_m niagara1_m = generic_v9_m | niagara1_unique_m
}; };
static int _features;
static const char* _features_str;
static unsigned int _L2_data_cache_line_size; static unsigned int _L2_data_cache_line_size;
static unsigned int L2_data_cache_line_size() { return _L2_data_cache_line_size; } static unsigned int L2_data_cache_line_size() { return _L2_data_cache_line_size; }
@ -174,8 +173,6 @@ public:
// On T4 and newer Sparc BIS to the beginning of cache line always zeros it. // On T4 and newer Sparc BIS to the beginning of cache line always zeros it.
static bool has_block_zeroing() { return has_blk_init() && is_T4(); } static bool has_block_zeroing() { return has_blk_init() && is_T4(); }
static const char* cpu_features() { return _features_str; }
// default prefetch block size on sparc // default prefetch block size on sparc
static intx prefetch_data_size() { return L2_data_cache_line_size(); } static intx prefetch_data_size() { return L2_data_cache_line_size(); }

View file

@ -772,6 +772,7 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
case 0x55: // andnps case 0x55: // andnps
case 0x56: // orps case 0x56: // orps
case 0x57: // xorps case 0x57: // xorps
case 0x58: // addpd
case 0x59: // mulpd case 0x59: // mulpd
case 0x6E: // movd case 0x6E: // movd
case 0x7E: // movd case 0x7E: // movd
@ -2152,33 +2153,64 @@ void Assembler::movddup(XMMRegister dst, XMMRegister src) {
emit_int8(0xC0 | encode); emit_int8(0xC0 | encode);
} }
void Assembler::kmovwl(KRegister dst, Register src) { void Assembler::kmovbl(KRegister dst, Register src) {
NOT_LP64(assert(VM_Version::supports_evex(), "")); assert(VM_Version::supports_avx512dq(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x92); emit_int8((unsigned char)0x92);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
void Assembler::kmovbl(Register dst, KRegister src) {
assert(VM_Version::supports_avx512dq(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x93);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::kmovwl(KRegister dst, Register src) {
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x92);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::kmovwl(Register dst, KRegister src) {
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x93);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::kmovdl(KRegister dst, Register src) { void Assembler::kmovdl(KRegister dst, Register src) {
NOT_LP64(assert(VM_Version::supports_evex(), "")); assert(VM_Version::supports_avx512bw(), "");
VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x92); emit_int8((unsigned char)0x92);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
void Assembler::kmovdl(Register dst, KRegister src) {
assert(VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x93);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::kmovql(KRegister dst, KRegister src) { void Assembler::kmovql(KRegister dst, KRegister src) {
NOT_LP64(assert(VM_Version::supports_evex(), "")); assert(VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x90); emit_int8((unsigned char)0x90);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
void Assembler::kmovql(KRegister dst, Address src) { void Assembler::kmovql(KRegister dst, Address src) {
NOT_LP64(assert(VM_Version::supports_evex(), "")); assert(VM_Version::supports_avx512bw(), "");
InstructionMark im(this); InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
vex_prefix(src, 0, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); vex_prefix(src, 0, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
@ -2187,7 +2219,7 @@ void Assembler::kmovql(KRegister dst, Address src) {
} }
void Assembler::kmovql(Address dst, KRegister src) { void Assembler::kmovql(Address dst, KRegister src) {
NOT_LP64(assert(VM_Version::supports_evex(), "")); assert(VM_Version::supports_avx512bw(), "");
InstructionMark im(this); InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
vex_prefix(dst, 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); vex_prefix(dst, 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
@ -2196,46 +2228,53 @@ void Assembler::kmovql(Address dst, KRegister src) {
} }
void Assembler::kmovql(KRegister dst, Register src) { void Assembler::kmovql(KRegister dst, Register src) {
NOT_LP64(assert(VM_Version::supports_evex(), "")); assert(VM_Version::supports_avx512bw(), "");
VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE; InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
InstructionAttr attributes(AVX_128bit, /* rex_w */ !_legacy_mode_bw, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x92); emit_int8((unsigned char)0x92);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
void Assembler::kmovql(Register dst, KRegister src) {
assert(VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x93);
emit_int8((unsigned char)(0xC0 | encode));
}
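The rewritten kmov* emitters above drop the old kreg_prefix_and_encode() path in favor of plain vex_prefix_and_encode(), and pair each width with the ISA subset that defines it: AVX-512DQ for byte masks, AVX-512F (evex) for word masks, AVX-512BW for dword and qword masks. Opcode 0x92 moves a general register into a mask register, 0x93 moves a mask register out, and 0x90 copies mask to mask. A hypothetical MacroAssembler fragment, not part of the commit, assuming the usual "#define __ masm->" shorthand:

__ kmovql(k1, rax);    // 0x92 with an F2 prefix: general register -> opmask register
__ kmovql(rbx, k1);    // 0x93 with an F2 prefix: opmask register -> general register
__ kortestql(k1, k1);  // declared further down in this diff; sets ZF when k1 is all zero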
// This instruction produces ZF or CF flags // This instruction produces ZF or CF flags
void Assembler::kortestbl(KRegister src1, KRegister src2) { void Assembler::kortestbl(KRegister src1, KRegister src2) {
NOT_LP64(assert(VM_Version::supports_avx512dq(), "")); assert(VM_Version::supports_avx512dq(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x98); emit_int8((unsigned char)0x98);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
// This instruction produces ZF or CF flags // This instruction produces ZF or CF flags
void Assembler::kortestwl(KRegister src1, KRegister src2) { void Assembler::kortestwl(KRegister src1, KRegister src2) {
NOT_LP64(assert(VM_Version::supports_evex(), "")); assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x98); emit_int8((unsigned char)0x98);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
// This instruction produces ZF or CF flags // This instruction produces ZF or CF flags
void Assembler::kortestdl(KRegister src1, KRegister src2) { void Assembler::kortestdl(KRegister src1, KRegister src2) {
NOT_LP64(assert(VM_Version::supports_avx512bw(), "")); assert(VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x98); emit_int8((unsigned char)0x98);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
// This instruction produces ZF or CF flags // This instruction produces ZF or CF flags
void Assembler::kortestql(KRegister src1, KRegister src2) { void Assembler::kortestql(KRegister src1, KRegister src2) {
NOT_LP64(assert(VM_Version::supports_avx512bw(), "")); assert(VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x98); emit_int8((unsigned char)0x98);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
@ -2375,7 +2414,7 @@ void Assembler::vmovdqu(Address dst, XMMRegister src) {
// Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64) // Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64)
void Assembler::evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) { void Assembler::evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex(), ""); assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8(0x6F); emit_int8(0x6F);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
@ -2395,7 +2434,7 @@ void Assembler::evmovdqub(Address dst, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex(), ""); assert(VM_Version::supports_evex(), "");
assert(src != xnoreg, "sanity"); assert(src != xnoreg, "sanity");
InstructionMark im(this); InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8(0x7F); emit_int8(0x7F);
@ -2404,7 +2443,7 @@ void Assembler::evmovdqub(Address dst, XMMRegister src, int vector_len) {
void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) { void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex(), ""); assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8(0x6F); emit_int8(0x6F);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
@ -2424,7 +2463,7 @@ void Assembler::evmovdquw(Address dst, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex(), ""); assert(VM_Version::supports_evex(), "");
assert(src != xnoreg, "sanity"); assert(src != xnoreg, "sanity");
InstructionMark im(this); InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8(0x7F); emit_int8(0x7F);
@ -3069,7 +3108,7 @@ void Assembler::packuswb(XMMRegister dst, Address src) {
NOT_LP64(assert(VM_Version::supports_sse2(), "")); NOT_LP64(assert(VM_Version::supports_sse2(), ""));
assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
InstructionMark im(this); InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit); attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x67); emit_int8(0x67);
@ -3078,7 +3117,7 @@ void Assembler::packuswb(XMMRegister dst, Address src) {
void Assembler::packuswb(XMMRegister dst, XMMRegister src) { void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), "")); NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x67); emit_int8(0x67);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
@ -3086,7 +3125,7 @@ void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(UseAVX > 0, "some form of AVX must be enabled"); assert(UseAVX > 0, "some form of AVX must be enabled");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int nds_enc = nds->is_valid() ? nds->encoding() : 0; int nds_enc = nds->is_valid() ? nds->encoding() : 0;
int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x67); emit_int8(0x67);
@ -3128,7 +3167,7 @@ void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
// In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
void Assembler::pcmpeqb(XMMRegister dst, XMMRegister src) { void Assembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert(VM_Version::supports_sse2(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x74); emit_int8(0x74);
@ -3148,16 +3187,28 @@ void Assembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int
// In this context, kdst is written the mask used to process the equal components // In this context, kdst is written the mask used to process the equal components
void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) { void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(VM_Version::supports_avx512bw(), ""); assert(VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int nds_enc = nds->is_valid() ? nds->encoding() : 0; int nds_enc = nds->is_valid() ? nds->encoding() : 0;
int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x74); emit_int8(0x74);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
assert(VM_Version::supports_avx512bw(), "");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
int dst_enc = kdst->encoding();
vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x74);
emit_operand(as_Register(dst_enc), src);
}
// In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
void Assembler::pcmpeqw(XMMRegister dst, XMMRegister src) { void Assembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert(VM_Version::supports_sse2(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x75); emit_int8(0x75);
@ -3177,16 +3228,28 @@ void Assembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int
// In this context, kdst is written the mask used to process the equal components // In this context, kdst is written the mask used to process the equal components
void Assembler::evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) { void Assembler::evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(VM_Version::supports_avx512bw(), ""); assert(VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int nds_enc = nds->is_valid() ? nds->encoding() : 0; int nds_enc = nds->is_valid() ? nds->encoding() : 0;
int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x75); emit_int8(0x75);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
void Assembler::evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
assert(VM_Version::supports_avx512bw(), "");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
int dst_enc = kdst->encoding();
vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x75);
emit_operand(as_Register(dst_enc), src);
}
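The new Address overloads of evpcmpeqb and evpcmpeqw above share one pattern: an InstructionMark, an EVEX_FVM tuple with no explicit input size, a vex_prefix call that passes the mask register's encoding as the destination, and emit_operand with that encoding cast back to a Register. A hypothetical use, not part of the commit, comparing a vector register against memory and collecting the per-element result in a mask register:

// Hypothetical operands; only the evpcmpeqb(KRegister, XMMRegister, Address, int)
// overload itself comes from the hunk above.
__ evpcmpeqb(k2, xmm1, Address(rsi, 0), Assembler::AVX_512bit);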
// In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
void Assembler::pcmpeqd(XMMRegister dst, XMMRegister src) { void Assembler::pcmpeqd(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert(VM_Version::supports_sse2(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x76); emit_int8(0x76);
@ -3213,9 +3276,21 @@ void Assembler::evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
void Assembler::evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
assert(VM_Version::supports_evex(), "");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
int dst_enc = kdst->encoding();
vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x76);
emit_operand(as_Register(dst_enc), src);
}
// In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
void Assembler::pcmpeqq(XMMRegister dst, XMMRegister src) { void Assembler::pcmpeqq(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse4_1(), "")); assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8(0x29); emit_int8(0x29);
@ -3274,21 +3349,41 @@ void Assembler::vpmovmskb(Register dst, XMMRegister src) {
void Assembler::pextrd(Register dst, XMMRegister src, int imm8) { void Assembler::pextrd(Register dst, XMMRegister src, int imm8) {
assert(VM_Version::supports_sse4_1(), ""); assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x16); emit_int8(0x16);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
emit_int8(imm8); emit_int8(imm8);
} }
void Assembler::pextrd(Address dst, XMMRegister src, int imm8) {
assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x16);
emit_operand(src, dst);
emit_int8(imm8);
}
void Assembler::pextrq(Register dst, XMMRegister src, int imm8) { void Assembler::pextrq(Register dst, XMMRegister src, int imm8) {
assert(VM_Version::supports_sse4_1(), ""); assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x16); emit_int8(0x16);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
emit_int8(imm8); emit_int8(imm8);
} }
void Assembler::pextrq(Address dst, XMMRegister src, int imm8) {
assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x16);
emit_operand(src, dst);
emit_int8(imm8);
}
void Assembler::pextrw(Register dst, XMMRegister src, int imm8) { void Assembler::pextrw(Register dst, XMMRegister src, int imm8) {
assert(VM_Version::supports_sse2(), ""); assert(VM_Version::supports_sse2(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
@ -3298,6 +3393,26 @@ void Assembler::pextrw(Register dst, XMMRegister src, int imm8) {
emit_int8(imm8); emit_int8(imm8);
} }
void Assembler::pextrw(Address dst, XMMRegister src, int imm8) {
assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit);
simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8((unsigned char)0x15);
emit_operand(src, dst);
emit_int8(imm8);
}
void Assembler::pextrb(Address dst, XMMRegister src, int imm8) {
assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit);
simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x14);
emit_operand(src, dst);
emit_int8(imm8);
}
void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) { void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
assert(VM_Version::supports_sse4_1(), ""); assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
@ -3307,6 +3422,16 @@ void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
emit_int8(imm8); emit_int8(imm8);
} }
void Assembler::pinsrd(XMMRegister dst, Address src, int imm8) {
assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x22);
emit_operand(dst,src);
emit_int8(imm8);
}
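Together with the register forms that were already present, the new Address overloads of pextrd/pextrq/pextrw/pextrb and pinsrd above add store-to-memory and load-from-memory variants of the element moves; each sets an EVEX_T1S tuple whose input size matches the element width so EVEX displacement scaling stays correct. A hypothetical round trip, not part of the commit:

__ pextrd(Address(rsp, 0), xmm0, 1);   // store 32-bit element 1 of xmm0 to a stack slot
__ pinsrd(xmm0, Address(rsp, 0), 1);   // reload the same dword into element 1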
void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) { void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) {
assert(VM_Version::supports_sse4_1(), ""); assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
@ -3316,6 +3441,16 @@ void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) {
emit_int8(imm8); emit_int8(imm8);
} }
void Assembler::pinsrq(XMMRegister dst, Address src, int imm8) {
assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x22);
emit_operand(dst, src);
emit_int8(imm8);
}
void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) { void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) {
assert(VM_Version::supports_sse2(), ""); assert(VM_Version::supports_sse2(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
@ -3325,10 +3460,30 @@ void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) {
emit_int8(imm8); emit_int8(imm8);
} }
void Assembler::pinsrw(XMMRegister dst, Address src, int imm8) {
assert(VM_Version::supports_sse2(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit);
simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0xC4);
emit_operand(dst, src);
emit_int8(imm8);
}
void Assembler::pinsrb(XMMRegister dst, Address src, int imm8) {
assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit);
simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x20);
emit_operand(dst, src);
emit_int8(imm8);
}
void Assembler::pmovzxbw(XMMRegister dst, Address src) { void Assembler::pmovzxbw(XMMRegister dst, Address src) {
assert(VM_Version::supports_sse4_1(), ""); assert(VM_Version::supports_sse4_1(), "");
InstructionMark im(this); InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
simd_prefix(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); simd_prefix(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8(0x30); emit_int8(0x30);
@ -3337,7 +3492,7 @@ void Assembler::pmovzxbw(XMMRegister dst, Address src) {
void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) { void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
assert(VM_Version::supports_sse4_1(), ""); assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8(0x30); emit_int8(0x30);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
@ -3347,7 +3502,7 @@ void Assembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
assert(VM_Version::supports_avx(), ""); assert(VM_Version::supports_avx(), "");
InstructionMark im(this); InstructionMark im(this);
assert(dst != xnoreg, "sanity"); assert(dst != xnoreg, "sanity");
InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8(0x30); emit_int8(0x30);
@ -3452,7 +3607,7 @@ void Assembler::prefix(Prefix p) {
void Assembler::pshufb(XMMRegister dst, XMMRegister src) { void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
assert(VM_Version::supports_ssse3(), ""); assert(VM_Version::supports_ssse3(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8(0x00); emit_int8(0x00);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
@ -3461,7 +3616,7 @@ void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
void Assembler::pshufb(XMMRegister dst, Address src) { void Assembler::pshufb(XMMRegister dst, Address src) {
assert(VM_Version::supports_ssse3(), ""); assert(VM_Version::supports_ssse3(), "");
InstructionMark im(this); InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8(0x00); emit_int8(0x00);
@ -3495,7 +3650,7 @@ void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) { void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
assert(isByte(mode), "invalid value"); assert(isByte(mode), "invalid value");
NOT_LP64(assert(VM_Version::supports_sse2(), "")); NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8(0x70); emit_int8(0x70);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
@ -3507,7 +3662,7 @@ void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
NOT_LP64(assert(VM_Version::supports_sse2(), "")); NOT_LP64(assert(VM_Version::supports_sse2(), ""));
assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
InstructionMark im(this); InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8(0x70); emit_int8(0x70);
@ -4112,6 +4267,12 @@ void Assembler::xorl(Register dst, Register src) {
emit_arith(0x33, 0xC0, dst, src); emit_arith(0x33, 0xC0, dst, src);
} }
void Assembler::xorb(Register dst, Address src) {
InstructionMark im(this);
prefix(src, dst);
emit_int8(0x32);
emit_operand(dst, src);
}
// AVX 3-operands scalar float-point arithmetic instructions // AVX 3-operands scalar float-point arithmetic instructions
@ -4287,6 +4448,17 @@ void Assembler::addpd(XMMRegister dst, XMMRegister src) {
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
void Assembler::addpd(XMMRegister dst, Address src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8(0x58);
emit_operand(dst, src);
}
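addpd now also accepts a memory source. A hypothetical use, not part of the commit; with the legacy SSE encoding the 128-bit memory operand must be 16-byte aligned:

__ addpd(xmm2, Address(rbx, 0));   // xmm2 += two packed doubles at [rbx]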
void Assembler::addps(XMMRegister dst, XMMRegister src) { void Assembler::addps(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), "")); NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@ -4723,7 +4895,7 @@ void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int v
void Assembler::paddb(XMMRegister dst, XMMRegister src) { void Assembler::paddb(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), "")); NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0xFC); emit_int8((unsigned char)0xFC);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
@ -4731,7 +4903,7 @@ void Assembler::paddb(XMMRegister dst, XMMRegister src) {
void Assembler::paddw(XMMRegister dst, XMMRegister src) { void Assembler::paddw(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), "")); NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0xFD); emit_int8((unsigned char)0xFD);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
@ -4771,7 +4943,7 @@ void Assembler::phaddd(XMMRegister dst, XMMRegister src) {
void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX"); assert(UseAVX > 0, "requires some form of AVX");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int nds_enc = nds->is_valid() ? nds->encoding() : 0; int nds_enc = nds->is_valid() ? nds->encoding() : 0;
int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0xFC); emit_int8((unsigned char)0xFC);
@ -4780,7 +4952,7 @@ void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve
void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX"); assert(UseAVX > 0, "requires some form of AVX");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int nds_enc = nds->is_valid() ? nds->encoding() : 0; int nds_enc = nds->is_valid() ? nds->encoding() : 0;
int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0xFD); emit_int8((unsigned char)0xFD);
@ -4808,7 +4980,7 @@ void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve
void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX"); assert(UseAVX > 0, "requires some form of AVX");
InstructionMark im(this); InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
int nds_enc = nds->is_valid() ? nds->encoding() : 0; int nds_enc = nds->is_valid() ? nds->encoding() : 0;
vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@ -4819,7 +4991,7 @@ void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector
void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX"); assert(UseAVX > 0, "requires some form of AVX");
InstructionMark im(this); InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
int nds_enc = nds->is_valid() ? nds->encoding() : 0; int nds_enc = nds->is_valid() ? nds->encoding() : 0;
vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@ -4851,7 +5023,7 @@ void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector
void Assembler::psubb(XMMRegister dst, XMMRegister src) { void Assembler::psubb(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), "")); NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0xF8); emit_int8((unsigned char)0xF8);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
@ -4859,7 +5031,7 @@ void Assembler::psubb(XMMRegister dst, XMMRegister src) {
void Assembler::psubw(XMMRegister dst, XMMRegister src) { void Assembler::psubw(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), "")); NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0xF9); emit_int8((unsigned char)0xF9);
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
@ -4882,7 +5054,7 @@ void Assembler::psubq(XMMRegister dst, XMMRegister src) {
void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX"); assert(UseAVX > 0, "requires some form of AVX");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int nds_enc = nds->is_valid() ? nds->encoding() : 0; int nds_enc = nds->is_valid() ? nds->encoding() : 0;
int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0xF8); emit_int8((unsigned char)0xF8);
@ -4891,7 +5063,7 @@ void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve
void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX"); assert(UseAVX > 0, "requires some form of AVX");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int nds_enc = nds->is_valid() ? nds->encoding() : 0; int nds_enc = nds->is_valid() ? nds->encoding() : 0;
int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0xF9); emit_int8((unsigned char)0xF9);
@ -4919,7 +5091,7 @@ void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve
void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX"); assert(UseAVX > 0, "requires some form of AVX");
InstructionMark im(this); InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -4930,7 +5102,7 @@ void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector
 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -4962,7 +5134,7 @@ void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector
 void Assembler::pmullw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD5);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4978,7 +5150,7 @@ void Assembler::pmulld(XMMRegister dst, XMMRegister src) {
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD5);
@@ -5006,7 +5178,7 @@ void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int v
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -5039,7 +5211,7 @@ void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vecto
 // Shift packed integers left by specified number of bits.
 void Assembler::psllw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
   int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5069,7 +5241,7 @@ void Assembler::psllq(XMMRegister dst, int shift) {
 void Assembler::psllw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5093,7 +5265,7 @@ void Assembler::psllq(XMMRegister dst, XMMRegister shift) {
 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
   int encode = vex_prefix_and_encode(xmm6->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5124,7 +5296,7 @@ void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_l
 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5149,7 +5321,7 @@ void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int
 // Shift packed integers logically right by specified number of bits.
 void Assembler::psrlw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM2 is for /2 encoding: 66 0F 71 /2 ib
   int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5181,7 +5353,7 @@ void Assembler::psrlq(XMMRegister dst, int shift) {
 void Assembler::psrlw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5205,7 +5377,7 @@ void Assembler::psrlq(XMMRegister dst, XMMRegister shift) {
 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM2 is for /2 encoding: 66 0F 71 /2 ib
   int encode = vex_prefix_and_encode(xmm2->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5235,7 +5407,7 @@ void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_l
 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5260,7 +5432,7 @@ void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int
 // Shift packed integers arithmetically right by specified number of bits.
 void Assembler::psraw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
   int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5280,7 +5452,7 @@ void Assembler::psrad(XMMRegister dst, int shift) {
 void Assembler::psraw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xE1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5296,7 +5468,7 @@ void Assembler::psrad(XMMRegister dst, XMMRegister shift) {
 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
   int encode = vex_prefix_and_encode(xmm4->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5316,7 +5488,7 @@ void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_l
 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xE1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5706,7 +5878,7 @@ void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
 // duplicate 2-bytes integer data from src into 16 locations in dest
 void Assembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
-  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x79);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6573,18 +6745,6 @@ int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegis
   }
 }
-int Assembler::kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src, VexSimdPrefix pre,
-                                      VexOpcode opc, InstructionAttr *attributes) {
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  return vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), pre, opc, attributes);
-}
-int Assembler::kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src, VexSimdPrefix pre,
-                                      VexOpcode opc, InstructionAttr *attributes) {
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  return vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), pre, opc, attributes);
-}
 void Assembler::cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
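
Note on the immediate shift forms above (psllw/psrlw/psraw and their AVX variants): the 66 0F 71 opcode has no separate destination field, so the ModRM reg field carries an opcode extension (/6, /2 or /4) and the emitter passes xmm6, xmm2 or xmm4 as a dummy first operand to place that value in the encoding. A minimal standalone sketch of the bytes this produces, assuming a hypothetical helper name (illustration only, not HotSpot code):

  #include <cstdint>
  #include <vector>

  // Encode "psllw xmmN, imm8" the same way the emitter above does:
  // 0x66 prefix, 0x0F 0x71 opcode, ModRM with mod=11, reg=/6, rm=dst, then imm8.
  std::vector<uint8_t> encode_psllw_imm(int dst_xmm, uint8_t shift) {
    uint8_t modrm = 0xC0 | (6 << 3) | (dst_xmm & 7);  // e.g. dst=xmm1 -> 0xF1
    return { 0x66, 0x0F, 0x71, modrm, shift };        // xmm1,5 -> 66 0F 71 F1 05
  }

The same pattern explains the xmm2 (/2, psrlw) and xmm4 (/4, psraw) dummies passed to simd_prefix_and_encode and vex_prefix_and_encode in the hunks above.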

View file

@@ -655,12 +655,6 @@ private:
   int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre,
                              VexOpcode opc, InstructionAttr *attributes);
-  int kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src, VexSimdPrefix pre,
-                             VexOpcode opc, InstructionAttr *attributes);
-  int kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src, VexSimdPrefix pre,
-                             VexOpcode opc, InstructionAttr *attributes);
   // Helper functions for groups of instructions
   void emit_arith_b(int op1, int op2, Register dst, int imm8);
@@ -1331,12 +1325,17 @@ private:
   void movddup(XMMRegister dst, XMMRegister src);
+  void kmovbl(KRegister dst, Register src);
+  void kmovbl(Register dst, KRegister src);
   void kmovwl(KRegister dst, Register src);
+  void kmovwl(Register dst, KRegister src);
   void kmovdl(KRegister dst, Register src);
+  void kmovdl(Register dst, KRegister src);
   void kmovql(KRegister dst, KRegister src);
-  void kmovql(KRegister dst, Register src);
   void kmovql(Address dst, KRegister src);
   void kmovql(KRegister dst, Address src);
+  void kmovql(KRegister dst, Register src);
+  void kmovql(Register dst, KRegister src);
   void kortestbl(KRegister dst, KRegister src);
   void kortestwl(KRegister dst, KRegister src);
@@ -1521,14 +1520,17 @@ private:
   void pcmpeqb(XMMRegister dst, XMMRegister src);
   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
   void pcmpeqw(XMMRegister dst, XMMRegister src);
   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len);
   void pcmpeqd(XMMRegister dst, XMMRegister src);
   void vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len);
   void pcmpeqq(XMMRegister dst, XMMRegister src);
   void vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@@ -1541,14 +1543,22 @@ private:
   // SSE 4.1 extract
   void pextrd(Register dst, XMMRegister src, int imm8);
   void pextrq(Register dst, XMMRegister src, int imm8);
+  void pextrd(Address dst, XMMRegister src, int imm8);
+  void pextrq(Address dst, XMMRegister src, int imm8);
+  void pextrb(Address dst, XMMRegister src, int imm8);
   // SSE 2 extract
   void pextrw(Register dst, XMMRegister src, int imm8);
+  void pextrw(Address dst, XMMRegister src, int imm8);
   // SSE 4.1 insert
   void pinsrd(XMMRegister dst, Register src, int imm8);
   void pinsrq(XMMRegister dst, Register src, int imm8);
+  void pinsrd(XMMRegister dst, Address src, int imm8);
+  void pinsrq(XMMRegister dst, Address src, int imm8);
+  void pinsrb(XMMRegister dst, Address src, int imm8);
   // SSE 2 insert
   void pinsrw(XMMRegister dst, Register src, int imm8);
+  void pinsrw(XMMRegister dst, Address src, int imm8);
   // SSE4.1 packed move
   void pmovzxbw(XMMRegister dst, XMMRegister src);
@@ -1760,6 +1770,8 @@ private:
   void xorl(Register dst, Address src);
   void xorl(Register dst, Register src);
+  void xorb(Register dst, Address src);
   void xorq(Register dst, Address src);
   void xorq(Register dst, Register src);
@@ -1789,6 +1801,7 @@ private:
   // Add Packed Floating-Point Values
   void addpd(XMMRegister dst, XMMRegister src);
+  void addpd(XMMRegister dst, Address src);
   void addps(XMMRegister dst, XMMRegister src);
   void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
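
Most of the header additions above are memory-operand (Address) overloads of the extract/insert instructions, so a lane can be moved directly to or from memory instead of staging through a general-purpose register. As a rough illustration of what pextrw/pinsrw compute, using ordinary SSE intrinsics rather than the HotSpot Assembler API (the function name is made up for the example):

  #include <immintrin.h>
  #include <cstdint>

  // pextrw dst, xmm, imm8 reads the selected 16-bit lane;
  // pinsrw xmm, src, imm8 replaces the selected 16-bit lane.
  void extract_insert_demo(uint16_t* out) {
    __m128i v = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);  // lane i holds i
    *out = (uint16_t)_mm_extract_epi16(v, 3);           // lane 3 -> 3
    __m128i w = _mm_insert_epi16(v, 42, 3);             // v with lane 3 set to 42
    (void)w;
  }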

View file

@@ -2381,9 +2381,6 @@ void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr unused, L
       // Should consider not saving rbx, if not necessary
       __ trigfunc('t', op->as_Op2()->fpu_stack_size());
       break;
-    case lir_pow :
-      __ pow_with_fallback(op->as_Op2()->fpu_stack_size());
-      break;
     default : ShouldNotReachHere();
   }
 } else {

View file

@@ -736,19 +736,6 @@ void LIRGenerator::do_CompareAndSwap(Intrinsic* x, ValueType* type) {
   obj.load_item();
   offset.load_nonconstant();
-  if (type == objectType) {
-    cmp.load_item_force(FrameMap::rax_oop_opr);
-    val.load_item();
-  } else if (type == intType) {
-    cmp.load_item_force(FrameMap::rax_opr);
-    val.load_item();
-  } else if (type == longType) {
-    cmp.load_item_force(FrameMap::long0_opr);
-    val.load_item_force(FrameMap::long1_opr);
-  } else {
-    ShouldNotReachHere();
-  }
   LIR_Opr addr = new_pointer_register();
   LIR_Address* a;
   if(offset.result()->is_constant()) {
@@ -785,6 +772,19 @@ void LIRGenerator::do_CompareAndSwap(Intrinsic* x, ValueType* type) {
                 true /* do_load */, false /* patch */, NULL);
   }
+  if (type == objectType) {
+    cmp.load_item_force(FrameMap::rax_oop_opr);
+    val.load_item();
+  } else if (type == intType) {
+    cmp.load_item_force(FrameMap::rax_opr);
+    val.load_item();
+  } else if (type == longType) {
+    cmp.load_item_force(FrameMap::long0_opr);
+    val.load_item_force(FrameMap::long1_opr);
+  } else {
+    ShouldNotReachHere();
+  }
   LIR_Opr ill = LIR_OprFact::illegalOpr;  // for convenience
   if (type == objectType)
     __ cas_obj(addr, cmp.result(), val.result(), ill, ill);
@@ -810,7 +810,8 @@ void LIRGenerator::do_CompareAndSwap(Intrinsic* x, ValueType* type) {
 void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
   assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");
-  if (x->id() == vmIntrinsics::_dexp || x->id() == vmIntrinsics::_dlog) {
+  if (x->id() == vmIntrinsics::_dexp || x->id() == vmIntrinsics::_dlog ||
+      x->id() == vmIntrinsics::_dpow) {
     do_LibmIntrinsic(x);
     return;
   }
@@ -824,7 +825,6 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
       case vmIntrinsics::_dcos:
       case vmIntrinsics::_dtan:
       case vmIntrinsics::_dlog10:
-      case vmIntrinsics::_dpow:
         use_fpu = true;
     }
   } else {
@@ -874,7 +874,6 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
     case vmIntrinsics::_dcos:   __ cos  (calc_input, calc_result, tmp1, tmp2); break;
     case vmIntrinsics::_dtan:   __ tan  (calc_input, calc_result, tmp1, tmp2); break;
     case vmIntrinsics::_dlog10: __ log10(calc_input, calc_result, tmp1); break;
-    case vmIntrinsics::_dpow:   __ pow  (calc_input, calc_input2, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break;
     default: ShouldNotReachHere();
   }
@@ -890,11 +889,25 @@ void LIRGenerator::do_LibmIntrinsic(Intrinsic* x) {
   LIR_Opr calc_result = rlock_result(x);
   LIR_Opr result_reg = result_register_for(x->type());
+  CallingConvention* cc = NULL;
+  if (x->id() == vmIntrinsics::_dpow) {
+    LIRItem value1(x->argument_at(1), this);
+    value1.set_destroys_register();
+    BasicTypeList signature(2);
+    signature.append(T_DOUBLE);
+    signature.append(T_DOUBLE);
+    cc = frame_map()->c_calling_convention(&signature);
+    value.load_item_force(cc->at(0));
+    value1.load_item_force(cc->at(1));
+  } else {
   BasicTypeList signature(1);
   signature.append(T_DOUBLE);
-  CallingConvention* cc = frame_map()->c_calling_convention(&signature);
+  cc = frame_map()->c_calling_convention(&signature);
   value.load_item_force(cc->at(0));
+  }
 #ifndef _LP64
   LIR_Opr tmp = FrameMap::fpu0_double_opr;
@@ -915,6 +928,14 @@ void LIRGenerator::do_LibmIntrinsic(Intrinsic* x) {
         __ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dlog), getThreadTemp(), result_reg, cc->args());
       }
       break;
+    case vmIntrinsics::_dpow:
+      if (VM_Version::supports_sse2()) {
+        __ call_runtime_leaf(StubRoutines::dpow(), getThreadTemp(), result_reg, cc->args());
+      }
+      else {
+        __ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), getThreadTemp(), result_reg, cc->args());
+      }
+      break;
     default: ShouldNotReachHere();
   }
 #else
@@ -925,6 +946,9 @@ void LIRGenerator::do_LibmIntrinsic(Intrinsic* x) {
     case vmIntrinsics::_dlog:
       __ call_runtime_leaf(StubRoutines::dlog(), getThreadTemp(), result_reg, cc->args());
       break;
+    case vmIntrinsics::_dpow:
+      __ call_runtime_leaf(StubRoutines::dpow(), getThreadTemp(), result_reg, cc->args());
+      break;
   }
 #endif
   __ move(result_reg, calc_result);
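
Taken together with the lir_pow removal above, these hunks stop C1 from expanding pow on the x87 stack: _dpow is now routed through do_LibmIntrinsic, which loads the two double arguments into the C calling convention and emits a leaf call to either the StubRoutines::dpow() stub or the SharedRuntime::dpow fallback. A simplified sketch of that dispatch, with stand-in names rather than the actual HotSpot plumbing:

  #include <cmath>

  typedef double (*dpow_entry)(double, double);

  // Stand-in for the SharedRuntime::dpow C fallback selected above.
  static double shared_runtime_dpow(double x, double y) {
    return std::pow(x, y);
  }

  // Stand-in for the selection do_LibmIntrinsic performs: prefer the generated
  // stub when one is available (e.g. when SSE2 is supported on 32-bit),
  // otherwise call the shared-runtime routine; both take two doubles.
  double call_pow(dpow_entry stub_routine, double x, double y) {
    dpow_entry target = (stub_routine != nullptr) ? stub_routine : shared_runtime_dpow;
    return target(x, y);
  }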

Some files were not shown because too many files have changed in this diff.