8049737: Contended Locking reorder and cache line bucket

JEP-143/JDK-8046133 - optimization #1 - reorder and cache line bucket.

Co-authored-by: Dave Dice <dave.dice@oracle.com>
Co-authored-by: Karen Kinnear <karen.kinnear@oracle.com>
Reviewed-by: shade, dice, dholmes, dsimms
Daniel D. Daugherty 2014-10-14 10:32:12 -07:00
parent 51866388d1
commit f1ab0fae73
13 changed files with 302 additions and 247 deletions


@@ -24,6 +24,7 @@
#include "precompiled.hpp"
#include "classfile/vmSymbols.hpp"
#include "memory/padded.hpp"
#include "memory/resourceArea.hpp"
#include "oops/markOop.hpp"
#include "oops/oop.inline.hpp"
@@ -110,6 +111,8 @@ int dtrace_waited_probe(ObjectMonitor* monitor, Handle obj, Thread* thr) {
#define NINFLATIONLOCKS 256
static volatile intptr_t InflationLocks[NINFLATIONLOCKS];
// gBlockList is really PaddedEnd<ObjectMonitor> *, but we don't
// want to expose the PaddedEnd template more than necessary.
ObjectMonitor * ObjectSynchronizer::gBlockList = NULL;
ObjectMonitor * volatile ObjectSynchronizer::gFreeList = NULL;
ObjectMonitor * volatile ObjectSynchronizer::gOmInUseList = NULL;
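
The comment above notes that gBlockList really points at PaddedEnd<ObjectMonitor> storage but is declared as plain ObjectMonitor* to keep the template out of the header. Below is a minimal sketch of the element-padding idea, assuming a 64-byte line and C++11 alignas; HotSpot's PaddedEnd in memory/padded.hpp sizes an explicit trailing char[] pad instead, but the effect is the same: sizeof(element) becomes a multiple of the cache-line size, so adjacent monitors never share a line.

#include <cstdint>

struct MonitorSketch {              // stand-in fields, not the real ObjectMonitor
  void* volatile _owner;
  intptr_t       _recursions;
  void*          _object;
};

struct alignas(64) PaddedMonitorSketch : MonitorSketch {};

static_assert(sizeof(PaddedMonitorSketch) % 64 == 0,
              "each element occupies whole cache lines");
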
@@ -410,16 +413,15 @@ void ObjectSynchronizer::notifyall(Handle obj, TRAPS) {
// performed by the CPU(s) or platform.
struct SharedGlobals {
char _pad_prefix[DEFAULT_CACHE_LINE_SIZE];
// These are highly shared mostly-read variables.
// To avoid false-sharing they need to be the sole occupants of a $ line.
double padPrefix[8];
// To avoid false-sharing they need to be the sole occupants of a cache line.
volatile int stwRandom;
volatile int stwCycle;
// Hot RW variables -- Sequester to avoid false-sharing
double padSuffix[16];
DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile int) * 2);
// Hot RW variable -- Sequester to avoid false-sharing
volatile int hcSequence;
double padFinal[8];
DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile int));
};
static SharedGlobals GVars;
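
This hunk swaps the old double padPrefix/padSuffix/padFinal arrays for a leading char pad plus DEFINE_PAD_MINUS_SIZE, which emits a char array of (cache line size minus the bytes already used), so the mostly-read stwRandom/stwCycle pair and the hot read-write hcSequence each own their line. Roughly what that expands to, assuming DEFAULT_CACHE_LINE_SIZE is 64 (field names here are illustrative, not the macro's output):

struct SharedGlobalsSketch {
  char         _pad_prefix[64];                       // keep preceding globals off the first line
  volatile int stwRandom;                             // highly shared, mostly read
  volatile int stwCycle;
  char         _pad1[64 - 2 * sizeof(volatile int)];  // fill out the read-mostly line
  volatile int hcSequence;                            // hot read-write sequence
  char         _pad2[64 - sizeof(volatile int)];      // give it a cache line to itself
};
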
@@ -780,18 +782,18 @@ JavaThread* ObjectSynchronizer::get_lock_owner(Handle h_obj, bool doLock) {
// Visitors ...
void ObjectSynchronizer::monitors_iterate(MonitorClosure* closure) {
ObjectMonitor* block = gBlockList;
PaddedEnd<ObjectMonitor> * block = (PaddedEnd<ObjectMonitor> *)gBlockList;
ObjectMonitor* mid;
while (block) {
assert(block->object() == CHAINMARKER, "must be a block header");
for (int i = _BLOCKSIZE - 1; i > 0; i--) {
mid = block + i;
mid = (ObjectMonitor *)(block + i);
oop object = (oop) mid->object();
if (object != NULL) {
closure->do_monitor(mid);
}
}
block = (ObjectMonitor*) block->FreeNext;
block = (PaddedEnd<ObjectMonitor> *) block->FreeNext;
}
}
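
The casts above are not cosmetic: pointer arithmetic steps by the static element size, so block + i must be computed on the padded type to land on slot i; the same arithmetic on ObjectMonitor* would step short once each element is padded out to a cache-line multiple. A small standalone illustration with stand-in types (64 bytes assumed as the line size):

#include <cstdio>

struct Mon { long header, owner, object; };        // stand-in for ObjectMonitor
struct alignas(64) PaddedMon : Mon {};             // stand-in for PaddedEnd<ObjectMonitor>

int main() {
  PaddedMon blk[4];
  // Arithmetic on the padded type advances by sizeof(PaddedMon); only the
  // resulting address is cast back to the unpadded view for use.
  Mon* slot1 = (Mon*)(blk + 1);
  std::printf("padded stride %zu, unpadded stride %zu, slot1 offset %td\n",
              sizeof(PaddedMon), sizeof(Mon), (char*)slot1 - (char*)blk);
  return 0;
}
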
@@ -806,10 +808,12 @@ static inline ObjectMonitor* next(ObjectMonitor* block) {
void ObjectSynchronizer::oops_do(OopClosure* f) {
assert(SafepointSynchronize::is_at_safepoint(), "must be at safepoint");
for (ObjectMonitor* block = gBlockList; block != NULL; block = next(block)) {
for (PaddedEnd<ObjectMonitor> * block =
(PaddedEnd<ObjectMonitor> *)gBlockList; block != NULL;
block = (PaddedEnd<ObjectMonitor> *)next(block)) {
assert(block->object() == CHAINMARKER, "must be a block header");
for (int i = 1; i < _BLOCKSIZE; i++) {
ObjectMonitor* mid = &block[i];
ObjectMonitor* mid = (ObjectMonitor *)&block[i];
if (mid->object() != NULL) {
f->do_oop((oop*)mid->object_addr());
}
@@ -966,16 +970,29 @@ ObjectMonitor * NOINLINE ObjectSynchronizer::omAlloc(Thread * Self) {
// 3: allocate a block of new ObjectMonitors
// Both the local and global free lists are empty -- resort to malloc().
// In the current implementation objectMonitors are TSM - immortal.
// Ideally, we'd write "new ObjectMonitor[_BLOCKSIZE]", but we want
// each ObjectMonitor to start at the beginning of a cache line,
// so we use align_size_up().
// A better solution would be to use C++ placement-new.
// BEWARE: As it stands currently, we don't run the ctors!
assert(_BLOCKSIZE > 1, "invariant");
ObjectMonitor * temp = new ObjectMonitor[_BLOCKSIZE];
size_t neededsize = sizeof(PaddedEnd<ObjectMonitor>) * _BLOCKSIZE;
PaddedEnd<ObjectMonitor> * temp;
size_t aligned_size = neededsize + (DEFAULT_CACHE_LINE_SIZE - 1);
void* real_malloc_addr = (void *)NEW_C_HEAP_ARRAY(char, aligned_size,
mtInternal);
temp = (PaddedEnd<ObjectMonitor> *)
align_size_up((intptr_t)real_malloc_addr,
DEFAULT_CACHE_LINE_SIZE);
// NOTE: (almost) no way to recover if allocation failed.
// We might be able to induce a STW safepoint and scavenge enough
// objectMonitors to permit progress.
if (temp == NULL) {
vm_exit_out_of_memory(sizeof (ObjectMonitor[_BLOCKSIZE]), OOM_MALLOC_ERROR,
vm_exit_out_of_memory(neededsize, OOM_MALLOC_ERROR,
"Allocate ObjectMonitors");
}
(void)memset((void *) temp, 0, neededsize);
// Format the block.
// initialize the linked list, each monitor points to its next
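
The allocation code above is the heart of the change: omAlloc now asks the C heap for sizeof(PaddedEnd<ObjectMonitor>) * _BLOCKSIZE plus one cache line minus one byte, rounds the raw address up with align_size_up so slot 0 begins on a line boundary, and zeroes the storage because constructors are never run; the raw pointer is never freed, since the comment above notes these monitors are type-stable and immortal. A sketch of that idiom (align_up here and the 64-byte line are assumptions, not the HotSpot helpers):

#include <cstdint>
#include <cstdlib>
#include <cstring>

static inline uintptr_t align_up(uintptr_t p, size_t a) {   // a must be a power of two
  return (p + a - 1) & ~(uintptr_t)(a - 1);
}

// Returns a pointer aligned to 'line' inside a slightly over-sized raw allocation.
static void* alloc_block_cache_aligned(size_t needed, size_t line) {
  void* raw = std::malloc(needed + line - 1);   // the extra bytes guarantee an aligned start exists
  if (raw == NULL) return NULL;                 // omAlloc treats this as fatal (vm_exit_out_of_memory)
  void* block = (void*)align_up((uintptr_t)raw, line);
  std::memset(block, 0, needed);                // ctors never run, so the storage is zeroed instead
  return block;                                 // 'raw' is intentionally not freed: the monitors are immortal
}
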
@@ -986,7 +1003,7 @@ ObjectMonitor * NOINLINE ObjectSynchronizer::omAlloc(Thread * Self) {
// look like: class Block { Block * next; int N; ObjectMonitor Body [N] ; }
for (int i = 1; i < _BLOCKSIZE; i++) {
temp[i].FreeNext = &temp[i+1];
temp[i].FreeNext = (ObjectMonitor *)&temp[i+1];
}
// terminate the last monitor as the end of list
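
The loop above threads slots 1.._BLOCKSIZE-1 of the freshly formatted block into a local chain through FreeNext; slot 0 serves as the block header, whose object field holds CHAINMARKER and whose FreeNext links whole blocks together (cf. gBlockList), matching the "class Block { Block * next; int N; ObjectMonitor Body [N] ; }" picture in the comment. A compact sketch of that layout with illustrative names (the real _BLOCKSIZE and CHAINMARKER live in the HotSpot sources):

#include <cstddef>

struct MonSlot { MonSlot* FreeNext; void* object; };

static const int   BLOCKSIZE   = 128;         // placeholder value
static void* const CHAINMARKER = (void*)1;    // placeholder sentinel

static void format_block(MonSlot* blk, MonSlot* prev_block_header) {
  blk[0].object   = CHAINMARKER;              // slot 0 is the block header, never handed out
  blk[0].FreeNext = prev_block_header;        // headers chain whole blocks together
  for (int i = 1; i < BLOCKSIZE - 1; i++) {
    blk[i].FreeNext = &blk[i + 1];            // thread the usable slots into a local free chain
  }
  blk[BLOCKSIZE - 1].FreeNext = NULL;         // last usable slot terminates the chain
}
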
@@ -1141,10 +1158,6 @@ ObjectMonitor* ObjectSynchronizer::inflate_helper(oop obj) {
}
// Note that we could encounter some performance loss through false-sharing as
// multiple locks occupy the same $ line. Padding might be appropriate.
ObjectMonitor * NOINLINE ObjectSynchronizer::inflate(Thread * Self,
oop object) {
// Inflate mutates the heap ...
@@ -1210,7 +1223,6 @@ ObjectMonitor * NOINLINE ObjectSynchronizer::inflate(Thread * Self,
// in which INFLATING appears in the mark.
m->Recycle();
m->_Responsible = NULL;
m->OwnerIsThread = 0;
m->_recursions = 0;
m->_SpinDuration = ObjectMonitor::Knob_SpinLimit; // Consider: maintain by type/class
@@ -1257,8 +1269,8 @@ ObjectMonitor * NOINLINE ObjectSynchronizer::inflate(Thread * Self,
m->set_header(dmw);
// Optimization: if the mark->locker stack address is associated
// with this thread we could simply set m->_owner = Self and
// m->OwnerIsThread = 1. Note that a thread can inflate an object
// with this thread we could simply set m->_owner = Self.
// Note that a thread can inflate an object
// that it has stack-locked -- as might happen in wait() -- directly
// with CAS. That is, we can avoid the xchg-NULL .... ST idiom.
m->set_owner(mark->locker());
@@ -1302,7 +1314,6 @@ ObjectMonitor * NOINLINE ObjectSynchronizer::inflate(Thread * Self,
m->set_header(mark);
m->set_owner(NULL);
m->set_object(object);
m->OwnerIsThread = 1;
m->_recursions = 0;
m->_Responsible = NULL;
m->_SpinDuration = ObjectMonitor::Knob_SpinLimit; // consider: keep metastats by type/class
@@ -1310,7 +1321,6 @@ ObjectMonitor * NOINLINE ObjectSynchronizer::inflate(Thread * Self,
if (Atomic::cmpxchg_ptr (markOopDesc::encode(m), object->mark_addr(), mark) != mark) {
m->set_object(NULL);
m->set_owner(NULL);
m->OwnerIsThread = 0;
m->Recycle();
omRelease(Self, m, true);
m = NULL;
@@ -1336,9 +1346,6 @@ ObjectMonitor * NOINLINE ObjectSynchronizer::inflate(Thread * Self,
}
}
// Note that we could encounter some performance loss through false-sharing as
// multiple locks occupy the same $ line. Padding might be appropriate.
// Deflate_idle_monitors() is called at all safepoints, immediately
// after all mutators are stopped, but before any objects have moved.
@@ -1491,12 +1498,14 @@ void ObjectSynchronizer::deflate_idle_monitors() {
nInuse += gOmInUseCount;
}
} else for (ObjectMonitor* block = gBlockList; block != NULL; block = next(block)) {
} else for (PaddedEnd<ObjectMonitor> * block =
(PaddedEnd<ObjectMonitor> *)gBlockList; block != NULL;
block = (PaddedEnd<ObjectMonitor> *)next(block)) {
// Iterate over all extant monitors - Scavenge all idle monitors.
assert(block->object() == CHAINMARKER, "must be a block header");
nInCirculation += _BLOCKSIZE;
for (int i = 1; i < _BLOCKSIZE; i++) {
ObjectMonitor* mid = &block[i];
ObjectMonitor* mid = (ObjectMonitor*)&block[i];
oop obj = (oop) mid->object();
if (obj == NULL) {
@@ -1648,18 +1657,18 @@ void ObjectSynchronizer::sanity_checks(const bool verbose,
// Verify all monitors in the monitor cache, the verification is weak.
void ObjectSynchronizer::verify() {
ObjectMonitor* block = gBlockList;
PaddedEnd<ObjectMonitor> * block = (PaddedEnd<ObjectMonitor> *)gBlockList;
ObjectMonitor* mid;
while (block) {
assert(block->object() == CHAINMARKER, "must be a block header");
for (int i = 1; i < _BLOCKSIZE; i++) {
mid = block + i;
mid = (ObjectMonitor *)(block + i);
oop object = (oop) mid->object();
if (object != NULL) {
mid->verify();
}
}
block = (ObjectMonitor*) block->FreeNext;
block = (PaddedEnd<ObjectMonitor> *) block->FreeNext;
}
}
@@ -1668,18 +1677,19 @@ void ObjectSynchronizer::verify() {
// the list of extant blocks without taking a lock.
int ObjectSynchronizer::verify_objmon_isinpool(ObjectMonitor *monitor) {
ObjectMonitor* block = gBlockList;
PaddedEnd<ObjectMonitor> * block = (PaddedEnd<ObjectMonitor> *)gBlockList;
while (block) {
assert(block->object() == CHAINMARKER, "must be a block header");
if (monitor > &block[0] && monitor < &block[_BLOCKSIZE]) {
if (monitor > (ObjectMonitor *)&block[0] &&
monitor < (ObjectMonitor *)&block[_BLOCKSIZE]) {
address mon = (address) monitor;
address blk = (address) block;
size_t diff = mon - blk;
assert((diff % sizeof(ObjectMonitor)) == 0, "check");
assert((diff % sizeof(PaddedEnd<ObjectMonitor>)) == 0, "check");
return 1;
}
block = (ObjectMonitor*) block->FreeNext;
block = (PaddedEnd<ObjectMonitor> *) block->FreeNext;
}
return 0;
}
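
verify_objmon_isinpool above decides membership by checking that the candidate address lies strictly inside a block's body and that its offset from the block base is an exact multiple of the padded element size, which is why the modulus switches from sizeof(ObjectMonitor) to sizeof(PaddedEnd<ObjectMonitor>). A self-contained sketch of that test with stand-in types (64-byte line assumed):

#include <cstddef>

struct M { long a, b; };                        // stand-in for ObjectMonitor
struct alignas(64) PaddedM : M {};              // stand-in for PaddedEnd<ObjectMonitor>

static bool is_in_block(const M* mon, const PaddedM* block, int blocksize) {
  const char* base = (const char*)block;
  const char* p    = (const char*)mon;
  // Strictly past the header slot and strictly before one-past-the-last slot,
  // mirroring 'monitor > &block[0] && monitor < &block[_BLOCKSIZE]'.
  if (p <= base || p >= (const char*)&block[blocksize]) return false;
  // The offset must land on a padded-element boundary.
  return (size_t)(p - base) % sizeof(PaddedM) == 0;
}
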