6988353: refactor contended sync subsystem

Reduce complexity by factoring synchronizer.cpp

Reviewed-by: dholmes, never, coleenp
This commit is contained in:
Karen Kinnear 2010-10-22 15:59:34 -04:00
parent daa052114f
commit 22929fb78f
29 changed files with 4556 additions and 4792 deletions


@@ -2995,8 +2995,8 @@ jint Threads::create_vm(JavaVMInitArgs* args, bool* canTryAgain) {
// crash Linux VM, see notes in os_linux.cpp.
main_thread->create_stack_guard_pages();
// Initialize Java-Leve synchronization subsystem
ObjectSynchronizer::Initialize() ;
// Initialize Java-Level synchronization subsystem
ObjectMonitor::Initialize() ;
// Initialize global modules
jint status = init_globals();
@@ -3965,215 +3965,272 @@ void Threads::print_on_error(outputStream* st, Thread* current, char* buf, int b
}
}
// Internal SpinLock and Mutex
// Based on ParkEvent
// Lifecycle management for TSM ParkEvents.
// ParkEvents are type-stable (TSM).
// In our particular implementation they happen to be immortal.
// Ad-hoc mutual exclusion primitives: SpinLock and Mux
//
// We manage concurrency on the FreeList with a CAS-based
// detach-modify-reattach idiom that avoids the ABA problems
// that would otherwise be present in a simple CAS-based
// push-pop implementation. (push-one and pop-all)
// We employ SpinLocks _only for low-contention, fixed-length
// short-duration critical sections where we're concerned
// about native mutex_t or HotSpot Mutex:: latency.
// The mux construct provides a spin-then-block mutual exclusion
// mechanism.
//
// Caveat: Allocate() and Release() may be called from threads
// other than the thread associated with the Event!
// If we need to call Allocate() when running as the thread in
// question then look for the PD calls to initialize native TLS.
// Native TLS (Win32/Linux/Solaris) can only be initialized or
// accessed by the associated thread.
// See also pd_initialize().
// Testing has shown that contention on the ListLock guarding gFreeList
// is common. If we implement ListLock as a simple SpinLock it's common
// for the JVM to devolve to yielding with little progress. This is true
// despite the fact that the critical sections protected by ListLock are
// extremely short.
//
// Note that we could defer associating a ParkEvent with a thread
// until the 1st time the thread calls park(). unpark() calls to
// an unprovisioned thread would be ignored. The first park() call
// for a thread would allocate and associate a ParkEvent and return
// immediately.
// TODO-FIXME: ListLock should be of type SpinLock.
// We should make this a 1st-class type, integrated into the lock
// hierarchy as leaf-locks. Critically, the SpinLock structure
// should have sufficient padding to avoid false-sharing and excessive
// cache-coherency traffic.
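As a rough illustration of what that 1st-class type might look like (an assumption, not code from this change), a SpinLock word padded out to a cache line keeps unrelated data off the lock's line; the type and field names below are hypothetical:

    // Hypothetical sketch only -- assumes 64-byte cache lines.
    struct PaddedSpinLock {
      volatile int _lock ;                    // 0 = free, 1 = held; usable with Thread::SpinAcquire/SpinRelease
      char         _pad [64 - sizeof(int)] ;  // padding so the lock word owns its cache line
    } ;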
volatile int ParkEvent::ListLock = 0 ;
ParkEvent * volatile ParkEvent::FreeList = NULL ;
ParkEvent * ParkEvent::Allocate (Thread * t) {
// In rare cases -- JVM_RawMonitor* operations -- we can find t == null.
ParkEvent * ev ;
typedef volatile int SpinLockT ;
// Start by trying to recycle an existing but unassociated
// ParkEvent from the global free list.
void Thread::SpinAcquire (volatile int * adr, const char * LockName) {
if (Atomic::cmpxchg (1, adr, 0) == 0) {
return ; // normal fast-path return
}
// Slow-path : We've encountered contention -- Spin/Yield/Block strategy.
TEVENT (SpinAcquire - ctx) ;
int ctr = 0 ;
int Yields = 0 ;
for (;;) {
ev = FreeList ;
if (ev == NULL) break ;
// 1: Detach - sequester or privatize the list
// Tantamount to ev = Swap (&FreeList, NULL)
if (Atomic::cmpxchg_ptr (NULL, &FreeList, ev) != ev) {
continue ;
}
// We've detached the list. The list in-hand is now
// local to this thread. This thread can operate on the
// list without risk of interference from other threads.
// 2: Extract -- pop the 1st element from the list.
ParkEvent * List = ev->FreeNext ;
if (List == NULL) break ;
for (;;) {
// 3: Try to reattach the residual list
guarantee (List != NULL, "invariant") ;
ParkEvent * Arv = (ParkEvent *) Atomic::cmpxchg_ptr (List, &FreeList, NULL) ;
if (Arv == NULL) break ;
// New nodes arrived. Try to detach the recent arrivals.
if (Atomic::cmpxchg_ptr (NULL, &FreeList, Arv) != Arv) {
continue ;
while (*adr != 0) {
++ctr ;
if ((ctr & 0xFFF) == 0 || !os::is_MP()) {
if (Yields > 5) {
// Consider using a simple NakedSleep() instead.
// Then SpinAcquire could be called by non-JVM threads
Thread::current()->_ParkEvent->park(1) ;
} else {
os::NakedYield() ;
++Yields ;
}
} else {
SpinPause() ;
}
guarantee (Arv != NULL, "invariant") ;
// 4: Merge Arv into List
ParkEvent * Tail = List ;
while (Tail->FreeNext != NULL) Tail = Tail->FreeNext ;
Tail->FreeNext = Arv ;
}
break ;
}
if (Atomic::cmpxchg (1, adr, 0) == 0) return ;
}
if (ev != NULL) {
guarantee (ev->AssociatedWith == NULL, "invariant") ;
} else {
// Do this the hard way -- materialize a new ParkEvent.
// In rare cases an allocating thread might detach a long list --
// installing null into FreeList -- and then stall or be obstructed.
// A 2nd thread calling Allocate() would see FreeList == null.
// The list held privately by the 1st thread is unavailable to the 2nd thread.
// In that case the 2nd thread would have to materialize a new ParkEvent,
// even though free ParkEvents existed in the system. In this case we end up
// with more ParkEvents in circulation than we need, but the race is
// rare and the outcome is benign. Ideally, the # of extant ParkEvents
// is equal to the maximum # of threads that existed at any one time.
// Because of the race mentioned above, segments of the freelist
// can be transiently inaccessible. At worst we may end up with the
// # of ParkEvents in circulation slightly above the ideal.
// Note that if we didn't have the TSM/immortal constraint, then
// when reattaching, above, we could trim the list.
ev = new ParkEvent () ;
guarantee ((intptr_t(ev) & 0xFF) == 0, "invariant") ;
}
ev->reset() ; // courtesy to caller
ev->AssociatedWith = t ; // Associate ev with t
ev->FreeNext = NULL ;
return ev ;
}
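The detach-modify-reattach idiom used by Allocate() above boils down to a push-one / pop-all pattern. The stand-alone sketch below (C++11 std::atomic, hypothetical names, not HotSpot code) shows why a pop-all detach sidesteps the ABA hazard that a single-node pop would face:

    #include <atomic>

    struct Node { Node * next ; } ;
    static std::atomic<Node *> head { nullptr } ;

    // Push one node.  Safe against ABA here because the only competing
    // "pop" is a pop-all: a stale head value simply makes the CAS fail.
    void push_one (Node * n) {
      Node * h = head.load (std::memory_order_relaxed) ;
      do {
        n->next = h ;
      } while (!head.compare_exchange_weak (h, n, std::memory_order_release,
                                                  std::memory_order_relaxed)) ;
    }

    // Pop-all (detach): privatize the whole list in one swap, operate on it
    // locally, then reattach any residue with a CAS against nullptr.
    Node * detach_all () {
      return head.exchange (nullptr, std::memory_order_acquire) ;
    }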
void ParkEvent::Release (ParkEvent * ev) {
if (ev == NULL) return ;
guarantee (ev->FreeNext == NULL , "invariant") ;
ev->AssociatedWith = NULL ;
void Thread::SpinRelease (volatile int * adr) {
assert (*adr != 0, "invariant") ;
OrderAccess::fence() ; // guarantee at least release consistency.
// Roach-motel semantics.
// It's safe if subsequent LDs and STs float "up" into the critical section,
// but prior LDs and STs within the critical section can't be allowed
// to reorder or float past the ST that releases the lock.
*adr = 0 ;
}
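A minimal usage sketch (hypothetical names, not part of this change) of the SpinAcquire/SpinRelease pair above, guarding the kind of short, fixed-length critical section the earlier comments describe:

    static volatile int  gListLock = 0 ;       // illustrative lock word
    static ParkEvent *   gSomeList = NULL ;    // illustrative shared list

    void push_under_spinlock (ParkEvent * ev) {
      Thread::SpinAcquire (&gListLock, "SomeList") ;  // spin, then yield, then park(1)
      ev->FreeNext = gSomeList ;                      // keep the critical section tiny
      gSomeList    = ev ;
      Thread::SpinRelease (&gListLock) ;              // fence, then store 0
    }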
// muxAcquire and muxRelease:
//
// * muxAcquire and muxRelease support a single-word lock-word construct.
// The LSB of the word is set IFF the lock is held.
// The remainder of the word points to the head of a singly-linked list
// of threads blocked on the lock.
//
// * The current implementation of muxAcquire-muxRelease uses its own
// dedicated Thread._MuxEvent instance. If we're interested in
// minimizing the peak number of extant ParkEvent instances then
// we could eliminate _MuxEvent and "borrow" _ParkEvent as long
// as certain invariants were satisfied. Specifically, care would need
// to be taken with regards to consuming unpark() "permits".
// A safe rule of thumb is that a thread would never call muxAcquire()
// if it's enqueued (cxq, EntryList, WaitList, etc) and will subsequently
// park(). Otherwise the _ParkEvent park() operation in muxAcquire() could
// consume an unpark() permit intended for monitorenter, for instance.
// One way around this would be to widen the restricted-range semaphore
// implemented in park(). Another alternative would be to provide
// multiple instances of the PlatformEvent() for each thread. One
// instance would be dedicated to muxAcquire-muxRelease, for instance.
//
// * Usage:
// -- Only as leaf locks
// -- for short-term locking only as muxAcquire does not perform
// thread state transitions.
//
// Alternatives:
// * We could implement muxAcquire and muxRelease with MCS or CLH locks
// but with parking or spin-then-park instead of pure spinning.
// * Use Taura-Oyama-Yonezawa locks.
// * It's possible to construct a 1-0 lock if we encode the lockword as
// (List,LockByte). Acquire will CAS the full lockword while Release
// will STB 0 into the LockByte. The 1-0 scheme admits stranding, so
// acquiring threads use timers (ParkTimed) to detect and recover from
// the stranding window. Thread/Node structures must be aligned on 256-byte
// boundaries by using placement-new.
// * Augment MCS with advisory back-link fields maintained with CAS().
// Pictorially: LockWord -> T1 <-> T2 <-> T3 <-> ... <-> Tn <-> Owner.
// The validity of the backlinks must be ratified before we trust the value.
// If the backlinks are invalid the exiting thread must back-track through the
// forward links, which are always trustworthy.
// * Add a successor indication. The LockWord is currently encoded as
// (List, LOCKBIT:1). We could also add a SUCCBIT or an explicit _succ variable
// to provide the usual futile-wakeup optimization.
// See RTStt for details.
// * Consider schedctl.sc_nopreempt to cover the critical section.
//
typedef volatile intptr_t MutexT ; // Mux Lock-word
enum MuxBits { LOCKBIT = 1 } ;
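To make the encoding described above concrete, the helpers below (an illustration, not code from this change) show how a single intptr_t carries both the held bit and the head of the blocked-thread chain; they rely on ParkEvent addresses having a zero low byte, as guaranteed by ParkEvent::operator new:

    static bool        muxIsHeld  (intptr_t w) { return (w & LOCKBIT) != 0 ; }
    static ParkEvent * muxWaiters (intptr_t w) { return (ParkEvent *) (w & ~LOCKBIT) ; }
    static intptr_t    muxPack (ParkEvent * head, bool held) {
      // ParkEvent alignment keeps the low bits of the address clear,
      // leaving bit 0 free to serve as the lock bit.
      return intptr_t(head) | (held ? LOCKBIT : 0) ;
    }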
void Thread::muxAcquire (volatile intptr_t * Lock, const char * LockName) {
intptr_t w = Atomic::cmpxchg_ptr (LOCKBIT, Lock, 0) ;
if (w == 0) return ;
if ((w & LOCKBIT) == 0 && Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
return ;
}
TEVENT (muxAcquire - Contention) ;
ParkEvent * const Self = Thread::current()->_MuxEvent ;
assert ((intptr_t(Self) & LOCKBIT) == 0, "invariant") ;
for (;;) {
// Push ev onto FreeList
// The mechanism is "half" lock-free.
ParkEvent * List = FreeList ;
ev->FreeNext = List ;
if (Atomic::cmpxchg_ptr (ev, &FreeList, List) == List) break ;
}
}
int its = (os::is_MP() ? 100 : 0) + 1 ;
// Override operator new and delete so we can ensure that the
// least significant byte of ParkEvent addresses is 0.
// Beware that excessive address alignment is undesirable
// as it can result in D$ index usage imbalance as
// well as bank access imbalance on Niagara-like platforms,
// although Niagara's hash function should help.
// Optional spin phase: spin-then-park strategy
while (--its >= 0) {
w = *Lock ;
if ((w & LOCKBIT) == 0 && Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
return ;
}
}
void * ParkEvent::operator new (size_t sz) {
return (void *) ((intptr_t (CHeapObj::operator new (sz + 256)) + 256) & -256) ;
}
void ParkEvent::operator delete (void * a) {
// ParkEvents are type-stable and immortal ...
ShouldNotReachHere();
}
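The rounding arithmetic in operator new above reads as "over-allocate by the alignment, then clear the low byte"; a worked example (illustrative only):

    // (raw + 256) & -256 rounds raw up to the next multiple of 256:
    //   raw   = 0x7f30a1
    //   +256  = 0x7f31a1
    //   &-256 (i.e. & ~0xFF) = 0x7f3100   -- low byte zero, still within raw+256
    intptr_t align_up_256 (intptr_t raw) { return (raw + 256) & -256 ; }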
// 6399321 As a temporary measure we copied & modified the ParkEvent::
// allocate() and release() code for use by Parkers. The Parker:: forms
// will eventually be removed as we consolidate and shift over to ParkEvents
// for both builtin synchronization and JSR166 operations.
volatile int Parker::ListLock = 0 ;
Parker * volatile Parker::FreeList = NULL ;
Parker * Parker::Allocate (JavaThread * t) {
guarantee (t != NULL, "invariant") ;
Parker * p ;
// Start by trying to recycle an existing but unassociated
// Parker from the global free list.
for (;;) {
p = FreeList ;
if (p == NULL) break ;
// 1: Detach
// Tantamount to p = Swap (&FreeList, NULL)
if (Atomic::cmpxchg_ptr (NULL, &FreeList, p) != p) {
continue ;
}
// We've detached the list. The list in-hand is now
// local to this thread. This thread can operate on the
// list without risk of interference from other threads.
// 2: Extract -- pop the 1st element from the list.
Parker * List = p->FreeNext ;
if (List == NULL) break ;
for (;;) {
// 3: Try to reattach the residual list
guarantee (List != NULL, "invariant") ;
Parker * Arv = (Parker *) Atomic::cmpxchg_ptr (List, &FreeList, NULL) ;
if (Arv == NULL) break ;
// New nodes arrived. Try to detach the recent arrivals.
if (Atomic::cmpxchg_ptr (NULL, &FreeList, Arv) != Arv) {
continue ;
Self->reset() ;
Self->OnList = intptr_t(Lock) ;
// The following fence() isn't _strictly necessary as the subsequent
// CAS() both serializes execution and ratifies the fetched *Lock value.
OrderAccess::fence();
for (;;) {
w = *Lock ;
if ((w & LOCKBIT) == 0) {
if (Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
Self->OnList = 0 ; // hygiene - allows stronger asserts
return ;
}
continue ; // Interference -- *Lock changed -- Just retry
}
guarantee (Arv != NULL, "invariant") ;
// 4: Merge Arv into List
Parker * Tail = List ;
while (Tail->FreeNext != NULL) Tail = Tail->FreeNext ;
Tail->FreeNext = Arv ;
}
break ;
}
assert (w & LOCKBIT, "invariant") ;
Self->ListNext = (ParkEvent *) (w & ~LOCKBIT );
if (Atomic::cmpxchg_ptr (intptr_t(Self)|LOCKBIT, Lock, w) == w) break ;
}
if (p != NULL) {
guarantee (p->AssociatedWith == NULL, "invariant") ;
} else {
// Do this the hard way -- materialize a new Parker.
// In rare cases an allocating thread might detach
// a long list -- installing null into FreeList -- and
// then stall. Another thread calling Allocate() would see
// FreeList == null and then invoke the ctor. In this case we
// end up with more Parkers in circulation than we need, but
// the race is rare and the outcome is benign.
// Ideally, the # of extant Parkers is equal to the
// maximum # of threads that existed at any one time.
// Because of the race mentioned above, segments of the
// freelist can be transiently inaccessible. At worst
// we may end up with the # of Parkers in circulation
// slightly above the ideal.
p = new Parker() ;
while (Self->OnList != 0) {
Self->park() ;
}
}
p->AssociatedWith = t ; // Associate p with t
p->FreeNext = NULL ;
return p ;
}
void Thread::muxAcquireW (volatile intptr_t * Lock, ParkEvent * ev) {
intptr_t w = Atomic::cmpxchg_ptr (LOCKBIT, Lock, 0) ;
if (w == 0) return ;
if ((w & LOCKBIT) == 0 && Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
return ;
}
void Parker::Release (Parker * p) {
if (p == NULL) return ;
guarantee (p->AssociatedWith != NULL, "invariant") ;
guarantee (p->FreeNext == NULL , "invariant") ;
p->AssociatedWith = NULL ;
TEVENT (muxAcquire - Contention) ;
ParkEvent * ReleaseAfter = NULL ;
if (ev == NULL) {
ev = ReleaseAfter = ParkEvent::Allocate (NULL) ;
}
assert ((intptr_t(ev) & LOCKBIT) == 0, "invariant") ;
for (;;) {
// Push p onto FreeList
Parker * List = FreeList ;
p->FreeNext = List ;
if (Atomic::cmpxchg_ptr (p, &FreeList, List) == List) break ;
guarantee (ev->OnList == 0, "invariant") ;
int its = (os::is_MP() ? 100 : 0) + 1 ;
// Optional spin phase: spin-then-park strategy
while (--its >= 0) {
w = *Lock ;
if ((w & LOCKBIT) == 0 && Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
if (ReleaseAfter != NULL) {
ParkEvent::Release (ReleaseAfter) ;
}
return ;
}
}
ev->reset() ;
ev->OnList = intptr_t(Lock) ;
// The following fence() isn't _strictly necessary as the subsequent
// CAS() both serializes execution and ratifies the fetched *Lock value.
OrderAccess::fence();
for (;;) {
w = *Lock ;
if ((w & LOCKBIT) == 0) {
if (Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
ev->OnList = 0 ;
// We call ::Release while holding the outer lock, thus
// artificially lengthening the critical section.
// Consider deferring the ::Release() until the subsequent unlock(),
// after we've dropped the outer lock.
if (ReleaseAfter != NULL) {
ParkEvent::Release (ReleaseAfter) ;
}
return ;
}
continue ; // Interference -- *Lock changed -- Just retry
}
assert (w & LOCKBIT, "invariant") ;
ev->ListNext = (ParkEvent *) (w & ~LOCKBIT );
if (Atomic::cmpxchg_ptr (intptr_t(ev)|LOCKBIT, Lock, w) == w) break ;
}
while (ev->OnList != 0) {
ev->park() ;
}
}
}
// Release() must extract a successor from the list and then wake that thread.
// It can "pop" the front of the list or use a detach-modify-reattach (DMR) scheme
// similar to that used by ParkEvent::Allocate() and ::Release(). DMR-based
// Release() would :
// (A) CAS() or swap() null to *Lock, releasing the lock and detaching the list.
// (B) Extract a successor from the private list "in-hand"
// (C) attempt to CAS() the residual back into *Lock over null.
// If any threads arrived in the interim, the CAS() would fail.
// In that case Release() would detach the RATs, re-merge the list in-hand
// with the RATs and repeat as needed. Alternately, Release() might
// detach and extract a successor, but then pass the residual list to the wakee.
// The wakee would be responsible for reattaching and remerging before it
// competed for the lock.
//
// Both "pop" and DMR are immune from ABA corruption -- there can be
// multiple concurrent pushers, but only one popper or detacher.
// This implementation pops from the head of the list. This is unfair,
// but tends to provide excellent throughput as hot threads remain hot.
// (We wake recently run threads first).
void Thread::muxRelease (volatile intptr_t * Lock) {
for (;;) {
const intptr_t w = Atomic::cmpxchg_ptr (0, Lock, LOCKBIT) ;
assert (w & LOCKBIT, "invariant") ;
if (w == LOCKBIT) return ;
ParkEvent * List = (ParkEvent *) (w & ~LOCKBIT) ;
assert (List != NULL, "invariant") ;
assert (List->OnList == intptr_t(Lock), "invariant") ;
ParkEvent * nxt = List->ListNext ;
// The following CAS() releases the lock and pops the head element.
if (Atomic::cmpxchg_ptr (intptr_t(nxt), Lock, w) != w) {
continue ;
}
List->OnList = 0 ;
OrderAccess::fence() ;
List->unpark () ;
return ;
}
}
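For completeness, a usage sketch of the muxAcquire/muxRelease pair above (hypothetical names, not part of this change): mux locks are leaf locks for short critical sections with no thread state transitions, so the guarded code must not block or re-enter the lock hierarchy:

    static volatile intptr_t gMuxLock  = 0 ;     // illustrative lock word
    static ParkEvent *       gFreeHead = NULL ;  // illustrative shared list

    void push_under_mux (ParkEvent * ev) {
      Thread::muxAcquire (&gMuxLock, "FreeHead") ;  // brief spin, then enqueue and park
      ev->FreeNext = gFreeHead ;                    // leaf lock: nothing here may block
      gFreeHead    = ev ;
      Thread::muxRelease (&gMuxLock) ;              // pops and unparks one waiter, if any
    }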
void Threads::verify() {
ALL_JAVA_THREADS(p) {
p->verify();