8237143: Eliminate DirtyCardQ_cbl_mon

Replace locked data structures with lock-free data structures. Reviewed-by: tschatzl, sangheki
2025-09-16 00:54:38 +02:00 · 2020-02-06 19:09:07 -05:00 · 2020-02-06 19:09:07 -05:00 · ccbd819a01
commit ccbd819a01
parent e37a6aed88
10 changed files with 569 additions and 198 deletions
--- a/src/hotspot/share/gc/g1/g1BarrierSet.cpp
+++ b/src/hotspot/share/gc/g1/g1BarrierSet.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -36,7 +36,6 @@
 #include "oops/compressedOops.inline.hpp"
 #include "oops/oop.inline.hpp"
 #include "runtime/interfaceSupport.inline.hpp"
 #include "runtime/mutexLocker.hpp"
 #include "runtime/orderAccess.hpp"
 #include "runtime/thread.inline.hpp"
 #include "utilities/macros.hpp"
@ -59,7 +58,7 @@ G1BarrierSet::G1BarrierSet(G1CardTable* card_table) :
  _satb_mark_queue_buffer_allocator("SATB Buffer Allocator", G1SATBBufferSize),
  _dirty_card_queue_buffer_allocator("DC Buffer Allocator", G1UpdateBufferSize),
  _satb_mark_queue_set(&_satb_mark_queue_buffer_allocator),
-  _dirty_card_queue_set(DirtyCardQ_CBL_mon, &_dirty_card_queue_buffer_allocator),
+  _dirty_card_queue_set(&_dirty_card_queue_buffer_allocator),
  _shared_dirty_card_queue(&_dirty_card_queue_set)
 {}
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp
@ -2776,8 +2776,6 @@ size_t G1CollectedHeap::pending_card_num() {
  Threads::threads_do(&count_from_threads);
  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
  dcqs.verify_num_cards();
  return dcqs.num_cards() + count_from_threads._cards;
 }
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp
@ -89,6 +89,11 @@ jint G1ConcurrentRefineThreadControl::initialize(G1ConcurrentRefine* cr, uint nu
      }
    }
  }
  if (num_max_threads > 0) {
    G1BarrierSet::dirty_card_queue_set().set_primary_refinement_thread(_threads[0]);
  }
  return JNI_OK;
 }
@ -108,7 +113,7 @@ void G1ConcurrentRefineThreadControl::maybe_activate_next(uint cur_worker_id) {
    _threads[worker_id] = create_refinement_thread(worker_id, false);
    thread_to_activate = _threads[worker_id];
  }
-  if (thread_to_activate != NULL && !thread_to_activate->is_active()) {
+  if (thread_to_activate != NULL) {
    thread_to_activate->activate();
  }
 }
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -29,9 +29,8 @@
 #include "gc/g1/g1DirtyCardQueue.hpp"
 #include "gc/shared/suspendibleThreadSet.hpp"
 #include "logging/log.hpp"
-#include "memory/resourceArea.hpp"
+#include "runtime/atomic.hpp"
-#include "runtime/handles.inline.hpp"
+#include "runtime/thread.hpp"
 #include "runtime/mutexLocker.hpp"
 G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id) :
  ConcurrentGCThread(),
@ -40,56 +39,53 @@ G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint
  _total_refinement_time(),
  _total_refined_cards(0),
  _worker_id(worker_id),
-  _active(false),
+  _notifier(new Semaphore(0)),
-  _monitor(NULL),
+  _should_notify(true),
  _cr(cr)
 {
  // Each thread has its own monitor. The i-th thread is responsible for signaling
  // to thread i+1 if the number of buffers in the queue exceeds a threshold for this
  // thread. Monitors are also used to wake up the threads during termination.
  // The 0th (primary) worker is notified by mutator threads and has a special monitor.
  if (!is_primary()) {
    _monitor = new Monitor(Mutex::nonleaf, "Refinement monitor", true,
                           Monitor::_safepoint_check_never);
  } else {
    _monitor = DirtyCardQ_CBL_mon;
  }
  // set name
  set_name("G1 Refine#%d", worker_id);
  create_and_start();
 }
 void G1ConcurrentRefineThread::wait_for_completed_buffers() {
-  MonitorLocker ml(_monitor, Mutex::_no_safepoint_check_flag);
+  assert(this == Thread::current(), "precondition");
-  while (!should_terminate() && !is_active()) {
+  while (Atomic::load_acquire(&_should_notify)) {
-    ml.wait();
+    _notifier->wait();
  }
 }
 bool G1ConcurrentRefineThread::is_active() {
  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
  return is_primary() ? dcqs.process_completed_buffers() : _active;
 }
 void G1ConcurrentRefineThread::activate() {
-  MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag);
+  assert(this != Thread::current(), "precondition");
-  if (!is_primary()) {
+  // Notify iff transitioning from needing activation to not.  This helps
-    set_active(true);
+  // keep the semaphore count bounded and minimizes the work done by
-  } else {
+  // activators when the thread is already active.
-    G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
+  if (Atomic::load_acquire(&_should_notify) &&
-    dcqs.set_process_completed_buffers(true);
+      Atomic::cmpxchg(&_should_notify, true, false)) {
    _notifier->signal();
  }
  _monitor->notify();
 }
-void G1ConcurrentRefineThread::deactivate() {
+bool G1ConcurrentRefineThread::maybe_deactivate(bool more_work) {
-  MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag);
+  assert(this == Thread::current(), "precondition");
-  if (!is_primary()) {
+
-    set_active(false);
+  if (more_work) {
    // Suppress unnecessary notifications.
    Atomic::release_store(&_should_notify, false);
    return false;
  } else if (Atomic::load_acquire(&_should_notify)) {
    // Deactivate if no notifications since enabled (see below).
    return true;
  } else {
-    G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
+    // Try for more refinement work with notifications enabled, to close
-    dcqs.set_process_completed_buffers(false);
+    // race; there could be a plethora of suppressed activation attempts
    // after we found no work but before we enable notifications here
    // (so there could be lots of work for this thread to do), followed
    // by a long time without activation after enabling notifications.
    // But first, clear any pending signals to prevent accumulation.
    while (_notifier->trywait()) {}
    Atomic::release_store(&_should_notify, true);
    return false;
  }
 }
@ -119,14 +115,13 @@ void G1ConcurrentRefineThread::run_service() {
        }
        Ticks start_time = Ticks::now();
-        if (!_cr->do_refinement_step(_worker_id, &_total_refined_cards)) {
+        bool more_work = _cr->do_refinement_step(_worker_id, &_total_refined_cards);
          break;                // No cards to process.
        }
        _total_refinement_time += (Ticks::now() - start_time);
        if (maybe_deactivate(more_work)) break;
      }
    }
    deactivate();
    log_debug(gc, refine)("Deactivated worker %d, off threshold: " SIZE_FORMAT
                          ", current: " SIZE_FORMAT ", refined cards: "
                          SIZE_FORMAT ", total refined cards: " SIZE_FORMAT,
@ -146,6 +141,5 @@ void G1ConcurrentRefineThread::run_service() {
 }
 void G1ConcurrentRefineThread::stop_service() {
-  MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag);
+  activate();
  _monitor->notify();
 }
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -45,24 +45,33 @@ class G1ConcurrentRefineThread: public ConcurrentGCThread {
  uint _worker_id;
-  bool _active;
+  // _notifier and _should_notify form a single-reader / multi-writer
-  Monitor* _monitor;
+  // notification mechanism.  The owning concurrent refinement thread is the
  // single reader. The writers are (other) threads that call activate() on
  // the thread.  The i-th concurrent refinement thread is responsible for
  // activating thread i+1 if the number of buffers in the queue exceeds a
  // threshold for that i+1th thread.  The 0th (primary) thread is activated
  // by threads that add cards to the dirty card queue set when the primary
  // thread's threshold is exceeded.  activate() is also used to wake up the
  // threads during termination, so even the non-primary thread case is
  // multi-writer.
  Semaphore* _notifier;
  volatile bool _should_notify;
  // Called when no refinement work found for this thread.
  // Returns true if should deactivate.
  bool maybe_deactivate(bool more_work);
  G1ConcurrentRefine* _cr;
  void wait_for_completed_buffers();
-  void set_active(bool x) { _active = x; }
+  virtual void run_service();
-  // Deactivate this thread.
+  virtual void stop_service();
  void deactivate();
  bool is_primary() { return (_worker_id == 0); }
  void run_service();
  void stop_service();
 public:
  G1ConcurrentRefineThread(G1ConcurrentRefine* cg1r, uint worker_id);
  bool is_active();
  // Activate this thread.
  void activate();
--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -26,6 +26,7 @@
 #include "gc/g1/g1BufferNodeList.hpp"
 #include "gc/g1/g1CardTableEntryClosure.hpp"
 #include "gc/g1/g1CollectedHeap.inline.hpp"
 #include "gc/g1/g1ConcurrentRefineThread.hpp"
 #include "gc/g1/g1DirtyCardQueue.hpp"
 #include "gc/g1/g1FreeIdSet.hpp"
 #include "gc/g1/g1RedirtyCardsQueue.hpp"
@ -33,15 +34,14 @@
 #include "gc/g1/g1ThreadLocalData.hpp"
 #include "gc/g1/heapRegionRemSet.hpp"
 #include "gc/shared/suspendibleThreadSet.hpp"
 #include "gc/shared/workgroup.hpp"
 #include "memory/iterator.hpp"
-#include "runtime/flags/flagSetting.hpp"
+#include "runtime/atomic.hpp"
 #include "runtime/mutexLocker.hpp"
 #include "runtime/orderAccess.hpp"
 #include "runtime/os.hpp"
 #include "runtime/safepoint.hpp"
 #include "runtime/thread.inline.hpp"
 #include "runtime/threadSMR.hpp"
 #include "utilities/globalCounter.inline.hpp"
 #include "utilities/macros.hpp"
 #include "utilities/quickSort.hpp"
 G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) :
@ -68,18 +68,16 @@ void G1DirtyCardQueue::handle_completed_buffer() {
 // Assumed to be zero by concurrent threads.
 static uint par_ids_start() { return 0; }
-G1DirtyCardQueueSet::G1DirtyCardQueueSet(Monitor* cbl_mon,
+G1DirtyCardQueueSet::G1DirtyCardQueueSet(BufferNode::Allocator* allocator) :
                                         BufferNode::Allocator* allocator) :
  PtrQueueSet(allocator),
-  _cbl_mon(cbl_mon),
+  _primary_refinement_thread(NULL),
  _completed_buffers_head(NULL),
  _completed_buffers_tail(NULL),
  _num_cards(0),
  _completed(),
  _paused(),
  _free_ids(par_ids_start(), num_par_ids()),
  _process_cards_threshold(ProcessCardsThresholdNever),
  _process_completed_buffers(false),
  _max_cards(MaxCardsUnlimited),
  _max_cards_padding(0),
  _free_ids(par_ids_start(), num_par_ids()),
  _mutator_refined_cards_counters(NEW_C_HEAP_ARRAY(size_t, num_par_ids(), mtGC))
 {
  ::memset(_mutator_refined_cards_counters, 0, num_par_ids() * sizeof(size_t));
@ -108,75 +106,304 @@ void G1DirtyCardQueueSet::handle_zero_index_for_thread(Thread* t) {
  G1ThreadLocalData::dirty_card_queue(t).handle_zero_index();
 }
-void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) {
+#ifdef ASSERT
-  MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag);
+G1DirtyCardQueueSet::Queue::~Queue() {
-  cbn->set_next(NULL);
+  assert(_head == NULL, "precondition");
-  if (_completed_buffers_tail == NULL) {
+  assert(_tail == NULL, "precondition");
-    assert(_completed_buffers_head == NULL, "Well-formedness");
+}
-    _completed_buffers_head = cbn;
+#endif // ASSERT
    _completed_buffers_tail = cbn;
  } else {
    _completed_buffers_tail->set_next(cbn);
    _completed_buffers_tail = cbn;
  }
  _num_cards += buffer_size() - cbn->index();
-  if (!process_completed_buffers() &&
+BufferNode* G1DirtyCardQueueSet::Queue::top() const {
-      (num_cards() > process_cards_threshold())) {
+  return Atomic::load(&_head);
-    set_process_completed_buffers(true);
+}
-    ml.notify_all();
+
 // An append operation atomically exchanges the new tail with the queue tail.
 // It then sets the "next" value of the old tail to the head of the list being
 // appended; it is an invariant that the old tail's "next" value is NULL.
 // But if the old tail is NULL then the queue was empty.  In this case the
 // head of the list being appended is instead stored in the queue head; it is
 // an invariant that the queue head is NULL in this case.
 //
 // This means there is a period between the exchange and the old tail update
 // where the queue sequence is split into two parts, the list from the queue
 // head to the old tail, and the list being appended.  If there are concurrent
 // push/append operations, each may introduce another such segment.  But they
 // all eventually get resolved by their respective updates of their old tail's
 // "next" value.  This also means that pop operations must handle a buffer
 // with a NULL "next" value specially.
 //
 // A push operation is just a degenerate append, where the buffer being pushed
 // is both the head and the tail of the list being appended.
 // Appends the already-linked list running from first to last to the end of
 // this queue.  last must have a NULL "next" value on entry.
 void G1DirtyCardQueueSet::Queue::append(BufferNode& first, BufferNode& last) {
  assert(last.next() == NULL, "precondition");
  // Install the new tail first.  The exchange returns the previous tail,
  // which this call must then link to the list being appended.
  BufferNode* old_tail = Atomic::xchg(&_tail, &last);
  if (old_tail == NULL) {       // Was empty.
    assert(Atomic::load(&_head) == NULL, "invariant");
    Atomic::store(&_head, &first);
  } else {
    assert(old_tail->next() == NULL, "invariant");
    // Until this store the appended list is not reachable from the old tail.
    old_tail->set_next(&first);
  }
 }
 // pop gets the queue head as the candidate result (returning NULL if the
 // queue head was NULL), and then gets that result node's "next" value.  If
 // that "next" value is NULL and the queue head hasn't changed, then there
 // is only one element in the accessible part of the list (the sequence from
 // head to a node with a NULL "next" value).  We can't return that element,
 // because it may be the old tail of a concurrent push/append that has not
 // yet had its "next" field set to the new tail.  So return NULL in this case.
 // Otherwise, attempt to cmpxchg that "next" value into the queue head,
 // retrying the whole operation if that fails. This is the "usual" lock-free
 // pop from the head of a singly linked list, with the additional restriction
 // on taking the last element.
 // Attempts to remove and return the buffer at the head of the queue.
 // Returns NULL if the queue is empty or if only one buffer is currently
 // accessible from the head.  The returned buffer has a NULL "next" value.
 BufferNode* G1DirtyCardQueueSet::Queue::pop() {
  Thread* current_thread = Thread::current();
  while (true) {
    // Use a critical section per iteration, rather than over the whole
    // operation.  We're not guaranteed to make progress, because of possible
    // contention on the queue head.  Lingering in one CS the whole time could
    // lead to excessive allocation of buffers, because the CS blocks return
    // of released buffers to the free list for reuse.
    GlobalCounter::CriticalSection cs(current_thread);
    BufferNode* result = Atomic::load_acquire(&_head);
    // Check for empty queue.  Only needs to be done on first iteration,
    // since we never take the last element, but it's messy to make use
    // of that and we expect one iteration to be the common case.
    if (result == NULL) return NULL;
    BufferNode* next = Atomic::load_acquire(BufferNode::next_ptr(*result));
    if (next != NULL) {
      // Try to advance the head past result.  "next" is reused to hold the
      // value cmpxchg observed in _head, which is compared with result below.
      next = Atomic::cmpxchg(&_head, result, next);
      if (next == result) {
        // Former head successfully taken; it is not the last.
        assert(Atomic::load(&_tail) != result, "invariant");
        assert(result->next() != NULL, "invariant");
        // Detach the taken buffer from the remainder of the queue.
        result->set_next(NULL);
        return result;
      }
      // cmpxchg failed; try again.
    } else if (result == Atomic::load_acquire(&_head)) {
      // If follower of head is NULL and head hasn't changed, then only
      // the one element is currently accessible.  We don't take the last
      // accessible element, because there may be a concurrent add using it.
      // The check for unchanged head isn't needed for correctness, but the
      // retry on change may sometimes let us get a buffer after all.
      return NULL;
    }
    // Head changed; try again.
  }
 }
 // Empties the queue and returns its former head and tail.  Asserts that
 // the caller is at a safepoint.
 G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::Queue::take_all() {
  assert_at_safepoint();
  BufferNode* const former_head = Atomic::load(&_head);
  BufferNode* const former_tail = Atomic::load(&_tail);
  // Reset to the empty state before handing the contents to the caller.
  Atomic::store(&_head, (BufferNode*)NULL);
  Atomic::store(&_tail, (BufferNode*)NULL);
  return HeadTail(former_head, former_tail);
 }
 // Adds the completed buffer cbn to the queue of completed buffers and
 // accounts for its cards in _num_cards.  If the resulting card count exceeds
 // the processing threshold and a primary refinement thread has been
 // registered, that thread is activated.
 void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) {
  assert(cbn != NULL, "precondition");
  // Increment _num_cards before adding to queue, so queue removal doesn't
  // need to deal with _num_cards possibly going negative.
  size_t new_num_cards = Atomic::add(&_num_cards, buffer_size() - cbn->index());
  _completed.push(*cbn);
  if ((new_num_cards > process_cards_threshold()) &&
      (_primary_refinement_thread != NULL)) {
    _primary_refinement_thread->activate();
  }
  // Deliberately no verify_num_cards() here.  The count is bumped before the
  // push and the queue may be pushed to or popped from concurrently, and
  // _num_cards also includes paused buffers that are not in _completed, so
  // walking the queue and comparing the sum against _num_cards at this point
  // could fail spuriously.
 }
 BufferNode* G1DirtyCardQueueSet::get_completed_buffer(size_t stop_at) {
-  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
+  enqueue_previous_paused_buffers();
-  if (num_cards() <= stop_at) {
+  // Check for insufficient cards to satisfy request.  We only do this once,
  // up front, rather than on each iteration below, since the test is racy
  // regardless of when we do it.
  if (Atomic::load_acquire(&_num_cards) <= stop_at) {
    return NULL;
  }
-  assert(num_cards() > 0, "invariant");
+  BufferNode* result = _completed.pop();
-  assert(_completed_buffers_head != NULL, "invariant");
+  if (result != NULL) {
-  assert(_completed_buffers_tail != NULL, "invariant");
+    Atomic::sub(&_num_cards, buffer_size() - result->index());
  BufferNode* bn = _completed_buffers_head;
  _num_cards -= buffer_size() - bn->index();
  _completed_buffers_head = bn->next();
  if (_completed_buffers_head == NULL) {
    assert(num_cards() == 0, "invariant");
    _completed_buffers_tail = NULL;
    set_process_completed_buffers(false);
  }
-  verify_num_cards();
+  return result;
  bn->set_next(NULL);
  return bn;
 }
 #ifdef ASSERT
 void G1DirtyCardQueueSet::verify_num_cards() const {
  size_t actual = 0;
-  BufferNode* cur = _completed_buffers_head;
+  BufferNode* cur = _completed.top();
-  while (cur != NULL) {
+  for ( ; cur != NULL; cur = cur->next()) {
    actual += buffer_size() - cur->index();
    cur = cur->next();
  }
-  assert(actual == _num_cards,
+  assert(actual == Atomic::load(&_num_cards),
         "Num entries in completed buffers should be " SIZE_FORMAT " but are " SIZE_FORMAT,
-         _num_cards, actual);
+         Atomic::load(&_num_cards), actual);
 }
 #endif // ASSERT
 // Creates an empty list tagged with the safepoint id that is current at
 // construction time.
 G1DirtyCardQueueSet::PausedBuffers::PausedList::PausedList()
  : _head(NULL),
    _tail(NULL),
    _safepoint_id(SafepointSynchronize::safepoint_id()) {
 }
 #ifdef ASSERT
 // Debug-only destructor: checks that the list has been emptied (both head
 // and tail are NULL) before it is destroyed.
 G1DirtyCardQueueSet::PausedBuffers::PausedList::~PausedList() {
  assert(Atomic::load(&_head) == NULL, "precondition");
  assert(_tail == NULL, "precondition");
 }
 #endif // ASSERT
 // Reports whether the safepoint id recorded at construction still matches
 // the current safepoint id.  Must not be called at a safepoint.
 bool G1DirtyCardQueueSet::PausedBuffers::PausedList::is_next() const {
  assert_not_at_safepoint();
  return SafepointSynchronize::safepoint_id() == _safepoint_id;
 }
 // Pushes node onto the front of this list.  Must not be called at a
 // safepoint, and the list must still be the "next" list (see is_next()).
 void G1DirtyCardQueueSet::PausedBuffers::PausedList::add(BufferNode* node) {
  assert_not_at_safepoint();
  assert(is_next(), "precondition");
  // Install node as the new head; the exchange returns the previous head.
  BufferNode* old_head = Atomic::xchg(&_head, node);
  if (old_head == NULL) {
    // First node in the list: it is also the tail.  Later nodes are linked
    // in front of it, so _tail is only written on this path.
    assert(_tail == NULL, "invariant");
    _tail = node;
  } else {
    // Link the new head to the rest of the list.
    node->set_next(old_head);
  }
 }
 // Removes every buffer from this list, leaving it empty, and returns the
 // former head and tail.
 G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::PausedList::take() {
  HeadTail taken(Atomic::load(&_head), _tail);
  Atomic::store(&_head, (BufferNode*)NULL);
  _tail = NULL;
  return taken;
 }
 // Starts out with no paused list installed.
 G1DirtyCardQueueSet::PausedBuffers::PausedBuffers()
  : _plist(NULL) {
 }
 #ifdef ASSERT
 // Debug-only destructor: checks that no paused list is still installed.
 G1DirtyCardQueueSet::PausedBuffers::~PausedBuffers() {
  assert(is_empty(), "invariant");
 }
 #endif // ASSERT
 bool G1DirtyCardQueueSet::PausedBuffers::is_empty() const {
  return Atomic::load(&_plist) == NULL;
 }
 // Records node in the paused list for the next safepoint, creating and
 // installing that list if none is installed yet.  Must not be called at a
 // safepoint.
 void G1DirtyCardQueueSet::PausedBuffers::add(BufferNode* node) {
  assert_not_at_safepoint();
  PausedList* plist = Atomic::load_acquire(&_plist);
  if (plist != NULL) {
    // Already have a next list, so use it.  We know it's a next list because
    // of the precondition that take_previous() has already been called.
    assert(plist->is_next(), "invariant");
  } else {
    // Try to install a new next list.
    plist = new PausedList();
    // cmpxchg yields the value it found in _plist; non-NULL means the
    // install lost a race with another thread.
    PausedList* old_plist = Atomic::cmpxchg(&_plist, (PausedList*)NULL, plist);
    if (old_plist != NULL) {
      // Some other thread installed a new next list. Use it instead.
      delete plist;
      plist = old_plist;
    }
  }
  plist->add(node);
 }
 // Removes and returns the buffers of the installed paused list if that list
 // is not the "next" list (see PausedList::is_next()), deleting the list
 // object.  Returns an empty HeadTail if there is no list, if the list is
 // still the next list, or if another thread claimed it first.  Must not be
 // called at a safepoint.
 G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::take_previous() {
  assert_not_at_safepoint();
  PausedList* previous;
  {
    // Deal with plist in a critical section, to prevent it from being
    // deleted out from under us by a concurrent take_previous().
    GlobalCounter::CriticalSection cs(Thread::current());
    previous = Atomic::load_acquire(&_plist);
    if ((previous == NULL) ||   // Nothing to take.
        previous->is_next() ||  // Not from a previous safepoint.
        // Some other thread stole it.
        (Atomic::cmpxchg(&_plist, previous, (PausedList*)NULL) != previous)) {
      return HeadTail();
    }
  }
  // We now own previous.
  HeadTail result = previous->take();
  // There might be other threads examining previous (in concurrent
  // take_previous()).  Synchronize to wait until any such threads are
  // done with such examination before deleting.
  GlobalCounter::write_synchronize();
  delete previous;
  return result;
 }
 // Removes and returns the buffers of the installed paused list, if any,
 // and deletes the list object.  Asserts that the caller is at a safepoint.
 G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::take_all() {
  assert_at_safepoint();
  PausedList* const installed = Atomic::load(&_plist);
  if (installed == NULL) {
    // Nothing paused; hand back an empty head/tail pair.
    return HeadTail();
  }
  Atomic::store(&_plist, (PausedList*)NULL);
  HeadTail taken = installed->take();
  delete installed;
  return taken;
 }
 // Adds node to the set of paused buffers and accounts for its cards in
 // _num_cards.  node must have a NULL "next" value.  Must not be called at
 // a safepoint.
 void G1DirtyCardQueueSet::record_paused_buffer(BufferNode* node) {
  assert_not_at_safepoint();
  assert(node->next() == NULL, "precondition");
  // Cards for paused buffers are included in count, to contribute to
  // notification checking after the coming safepoint if it doesn't GC.
  // Note that this means the queue's _num_cards differs from the number
  // of cards in the queued buffers when there are paused buffers.
  Atomic::add(&_num_cards, buffer_size() - node->index());
  _paused.add(node);
 }
 // Appends a list of formerly paused buffers to the completed buffer queue.
 // Does nothing when the list is empty.
 void G1DirtyCardQueueSet::enqueue_paused_buffers_aux(const HeadTail& paused) {
  if (paused._head == NULL) {
    return;
  }
  assert(paused._tail != NULL, "invariant");
  // _num_cards is left alone: these cards were counted when the buffers
  // were recorded as paused.
  _completed.append(*paused._head, *paused._tail);
 }
 // Moves any buffers paused for a previous safepoint onto the completed
 // buffer queue.  Must not be called at a safepoint.
 void G1DirtyCardQueueSet::enqueue_previous_paused_buffers() {
  assert_not_at_safepoint();
  // The fast-path still satisfies the precondition for record_paused_buffer
  // and PausedBuffers::add, even with a racy test.  If there are paused
  // buffers from a previous safepoint, is_empty() will return false; there
  // will have been a safepoint between recording and test, so there can't be
  // a false negative (is_empty() returns true) while such buffers are present.
  // If is_empty() is false, there are two cases:
  //
  // (1) There were paused buffers from a previous safepoint.  A concurrent
  // caller may take and enqueue them first, but that's okay; the precondition
  // for a possible later record_paused_buffer by this thread will still hold.
  //
  // (2) There are paused buffers for a requested next safepoint.
  //
  // In each of those cases some effort may be spent detecting and dealing
  // with those circumstances; any wasted effort in such cases is expected to
  // be well compensated by the fast path.
  if (!_paused.is_empty()) {
    // take_previous() returns an empty HeadTail in case (2) or when another
    // thread got there first; the helper ignores an empty list.
    enqueue_paused_buffers_aux(_paused.take_previous());
  }
 }
 void G1DirtyCardQueueSet::enqueue_all_paused_buffers() {
  assert_at_safepoint();
  enqueue_paused_buffers_aux(_paused.take_all());
 }
 #endif
 void G1DirtyCardQueueSet::abandon_completed_buffers() {
-  BufferNode* buffers_to_delete = NULL;
+  enqueue_all_paused_buffers();
-  {
+  verify_num_cards();
-    MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
+  G1BufferNodeList list = take_all_completed_buffers();
-    buffers_to_delete = _completed_buffers_head;
+  BufferNode* buffers_to_delete = list._head;
    _completed_buffers_head = NULL;
    _completed_buffers_tail = NULL;
    _num_cards = 0;
    set_process_completed_buffers(false);
  }
  while (buffers_to_delete != NULL) {
    BufferNode* bn = buffers_to_delete;
    buffers_to_delete = bn->next();
@ -186,46 +413,30 @@ void G1DirtyCardQueueSet::abandon_completed_buffers() {
 }
 void G1DirtyCardQueueSet::notify_if_necessary() {
-  MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag);
+  if ((_primary_refinement_thread != NULL) &&
-  if (num_cards() > process_cards_threshold()) {
+      (num_cards() > process_cards_threshold())) {
-    set_process_completed_buffers(true);
+    _primary_refinement_thread->activate();
    ml.notify_all();
  }
 }
-// Merge lists of buffers. Notify the processing threads.
+// Merge lists of buffers. The source queue set is emptied as a
-// The source queue is emptied as a result. The queues
+// result. The queue sets must share the same allocator.
 // must share the monitor.
 void G1DirtyCardQueueSet::merge_bufferlists(G1RedirtyCardsQueueSet* src) {
  assert(allocator() == src->allocator(), "precondition");
  const G1BufferNodeList from = src->take_all_completed_buffers();
-  if (from._head == NULL) return;
+  if (from._head != NULL) {
-
+    Atomic::add(&_num_cards, from._entry_count);
-  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
+    _completed.append(*from._head, *from._tail);
  if (_completed_buffers_tail == NULL) {
    assert(_completed_buffers_head == NULL, "Well-formedness");
    _completed_buffers_head = from._head;
    _completed_buffers_tail = from._tail;
  } else {
    assert(_completed_buffers_head != NULL, "Well formedness");
    _completed_buffers_tail->set_next(from._head);
    _completed_buffers_tail = from._tail;
  }
  _num_cards += from._entry_count;
  assert(_completed_buffers_head == NULL && _completed_buffers_tail == NULL ||
         _completed_buffers_head != NULL && _completed_buffers_tail != NULL,
         "Sanity");
  verify_num_cards();
 }
 G1BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() {
-  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
+  enqueue_all_paused_buffers();
-  G1BufferNodeList result(_completed_buffers_head, _completed_buffers_tail, _num_cards);
+  verify_num_cards();
-  _completed_buffers_head = NULL;
+  HeadTail buffers = _completed.take_all();
-  _completed_buffers_tail = NULL;
+  size_t num_cards = Atomic::load(&_num_cards);
-  _num_cards = 0;
+  Atomic::store(&_num_cards, size_t(0));
-  return result;
+  return G1BufferNodeList(buffers._head, buffers._tail, num_cards);
 }
 class G1RefineBufferedCards : public StackObj {
@ -368,14 +579,20 @@ bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node,
 bool G1DirtyCardQueueSet::process_or_enqueue_completed_buffer(BufferNode* node) {
  if (Thread::current()->is_Java_thread()) {
    // If the number of buffers exceeds the limit, make this Java
-    // thread do the processing itself.  We don't lock to access
+    // thread do the processing itself.  Calculation is racy but we
-    // buffer count or padding; it is fine to be imprecise here.  The
+    // don't need precision here.  The add of padding could overflow,
-    // add of padding could overflow, which is treated as unlimited.
+    // which is treated as unlimited.
    size_t limit = max_cards() + max_cards_padding();
    if ((num_cards() > limit) && (limit >= max_cards())) {
      if (mut_process_buffer(node)) {
        return true;
      }
      // Buffer was incompletely processed because of a pending safepoint
      // request.  Unlike with refinement thread processing, for mutator
      // processing the buffer did not come from the completed buffer queue,
      // so it is okay to add it to the queue rather than to the paused set.
      // Indeed, it can't be added to the paused set because we didn't pass
      // through enqueue_previous_paused_buffers.
    }
  }
  enqueue_completed_buffer(node);
@ -407,14 +624,15 @@ bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id,
    deallocate_buffer(node);
    return true;
  } else {
-    // Return partially processed buffer to the queue.
+    // Buffer incompletely processed because there is a pending safepoint.
-    enqueue_completed_buffer(node);
+    // Record partially processed buffer, to be finished later.
    record_paused_buffer(node);
    return true;
  }
 }
 void G1DirtyCardQueueSet::abandon_logs() {
-  assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
+  assert_at_safepoint();
  abandon_completed_buffers();
  // Since abandon is done only at safepoints, we can safely manipulate
@ -433,7 +651,7 @@ void G1DirtyCardQueueSet::concatenate_logs() {
  // Iterate over all the threads, if we find a partial log add it to
  // the global list of logs.  Temporarily turn off the limit on the number
  // of outstanding buffers.
-  assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
+  assert_at_safepoint();
  size_t old_limit = max_cards();
  set_max_cards(MaxCardsUnlimited);
@ -448,5 +666,7 @@ void G1DirtyCardQueueSet::concatenate_logs() {
  Threads::threads_do(&closure);
  G1BarrierSet::shared_dirty_card_queue().flush();
  enqueue_all_paused_buffers();
  verify_num_cards();
  set_max_cards(old_limit);
 }
--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -29,11 +29,12 @@
 #include "gc/g1/g1FreeIdSet.hpp"
 #include "gc/shared/ptrQueue.hpp"
 #include "memory/allocation.hpp"
 #include "memory/padded.hpp"
 class G1ConcurrentRefineThread;
 class G1DirtyCardQueueSet;
 class G1RedirtyCardsQueueSet;
 class Thread;
 class Monitor;
 // A ptrQueue whose elements are "oops", pointers to object heads.
 class G1DirtyCardQueue: public PtrQueue {
@ -66,15 +67,178 @@ public:
 };
 class G1DirtyCardQueueSet: public PtrQueueSet {
-  Monitor* _cbl_mon;  // Protects the list and count members.
+  // Head and tail of a list of BufferNodes, linked through their next()
-  BufferNode* _completed_buffers_head;
+  // fields.  Similar to G1BufferNodeList, but without the _entry_count.
-  BufferNode* _completed_buffers_tail;
+  struct HeadTail {
    // First and last nodes of the list; both are NULL in a
    // default-constructed (empty) HeadTail.
    BufferNode* _head;
    BufferNode* _tail;
    // Construct an empty list: no head and no tail.
    HeadTail() : _head(NULL), _tail(NULL) {}
    // Construct a list description from the given end nodes.  The nodes
    // are stored as given; nothing is checked or linked here.
    HeadTail(BufferNode* head, BufferNode* tail) : _head(head), _tail(tail) {}
  };
-  // Number of actual cards in the list of completed buffers.
+  // A lock-free FIFO of BufferNodes, linked through their next() fields.
  // This class has a restriction that pop() cannot return the last buffer
  // in the queue, or what was the last buffer for a concurrent push/append
  // operation.  It is expected that there will be a later push/append that
  // will make that buffer available to a future pop(), or there will
  // eventually be a complete transfer via take_all().
  class Queue {
    // Ends of the queue.  Both are NULL for a newly constructed queue.
    // Each pointer is followed by padding sized as a cache line minus one
    // pointer.
    // NOTE(review): presumably this keeps _head and _tail on separate cache
    // lines to avoid false sharing between pushers and poppers -- confirm.
    BufferNode* volatile _head;
    DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(BufferNode*));
    BufferNode* volatile _tail;
    DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(BufferNode*));
    NONCOPYABLE(Queue);
  public:
    // Construct an empty queue.
    Queue() : _head(NULL), _tail(NULL) {}
    // Destructor is declared only in debug builds.
    // NOTE(review): presumably it checks that the queue is empty; the
    // definition is not visible here -- confirm in the .cpp.
    DEBUG_ONLY(~Queue();)
    // Return the first buffer in the queue.
    // Thread-safe, but the result may change immediately.
    BufferNode* top() const;
    // Thread-safe add the buffer to the end of the queue.
    // Implemented as an append of the single-node range [node, node].
    void push(BufferNode& node) { append(node, node); }
    // Thread-safe add the buffers from first to last to the end of the queue.
    void append(BufferNode& first, BufferNode& last);
    // Thread-safe attempt to remove and return the first buffer in the queue.
    // Returns NULL if the queue is empty, or if only one buffer is found.
    // Uses GlobalCounter critical sections to address the ABA problem; this
    // works with the buffer allocator's use of GlobalCounter synchronization.
    BufferNode* pop();
    // Take all the buffers from the queue, leaving the queue empty.
    // Not thread-safe.
    HeadTail take_all();
  };
  // Concurrent refinement may stop processing in the middle of a buffer if
  // there is a pending safepoint, to avoid long delays to safepoint.  A
  // partially processed buffer needs to be recorded for processing by the
  // safepoint if it's a GC safepoint; otherwise it needs to be recorded for
  // further concurrent refinement work after the safepoint.  But if the
  // buffer was obtained from the completed buffer queue then it can't simply
  // be added back to the queue, as that would introduce a new source of ABA
  // for the queue.
  //
  // The PausedBuffer object is used to record such buffers for the upcoming
  // safepoint, and provides access to the buffers recorded for previous
  // safepoints.  Before obtaining a buffer from the completed buffers queue,
  // we first transfer any buffers from previous safepoints to the queue.
  // This is ABA-safe because threads cannot be in the midst of a queue pop
  // across a safepoint.
  //
  // The paused buffers are conceptually an extension of the completed buffers
  // queue, and operations which need to deal with all of the queued buffers
  // (such as concatenate_logs) also need to deal with any paused buffers.  In
  // general, if a safepoint performs a GC then the paused buffers will be
  // processed as part of it, and there won't be any paused buffers after a
  // GC safepoint.
  class PausedBuffers {
    // A list of paused buffers.  Derives from CHeapObj<mtGC>.
    class PausedList : public CHeapObj<mtGC> {
      // Ends of the list of paused buffers.  Only _head is declared
      // volatile.
      // NOTE(review): presumably concurrent add() calls publish through
      // _head while _tail is written less often -- confirm in the .cpp.
      BufferNode* volatile _head;
      BufferNode* _tail;
      // NOTE(review): presumably identifies the safepoint this list was
      // created for, and is what is_next() tests -- confirm in the .cpp.
      size_t _safepoint_id;
      NONCOPYABLE(PausedList);
    public:
      PausedList();
      // Destructor is declared only in debug builds.
      DEBUG_ONLY(~PausedList();)
      // Return true if this list was created to hold buffers for the
      // next safepoint.
      // precondition: not at safepoint.
      bool is_next() const;
      // Thread-safe add the buffer to the list.
      // precondition: not at safepoint.
      // precondition: is_next().
      void add(BufferNode* node);
      // Take all the buffers from the list.  Not thread-safe.
      HeadTail take();
    };
    // The most recently created list, which might be for either the next or
    // a previous safepoint, or might be NULL if the next list hasn't been
    // created yet.  We only need one list because of the requirement that
    // threads calling add() must first ensure there are no paused buffers
    // from a previous safepoint.  There might be many list instances existing
    // at the same time though; there can be many threads competing to create
    // and install the next list, and meanwhile there can be a thread dealing
    // with the previous list.
    PausedList* volatile _plist;
    // Trailing padding sized as a cache line minus one pointer.
    DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(PausedList*));
    NONCOPYABLE(PausedBuffers);
  public:
    PausedBuffers();
    // Destructor is declared only in debug builds.
    DEBUG_ONLY(~PausedBuffers();)
    // Test whether there are any paused lists.
    // Thread-safe, but the answer may change immediately.
    bool is_empty() const;
    // Thread-safe add the buffer to paused list for next safepoint.
    // precondition: not at safepoint.
    // precondition: does not have paused buffers from a previous safepoint.
    void add(BufferNode* node);
    // Thread-safe take all paused buffers for previous safepoints.
    // precondition: not at safepoint.
    HeadTail take_previous();
    // Take all the paused buffers.
    // precondition: at safepoint.
    HeadTail take_all();
  };
  // The primary refinement thread, for activation when the processing
  // threshold is reached.  NULL if there aren't any refinement threads.
  G1ConcurrentRefineThread* _primary_refinement_thread;
  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(G1ConcurrentRefineThread*));
  // Upper bound on the number of cards in the completed and paused buffers.
  volatile size_t _num_cards;
  DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(size_t));
  // Buffers ready for refinement.
  Queue _completed;           // Has inner padding, including trailer.
  // Buffers for which refinement is temporarily paused.
  PausedBuffers _paused;      // Has inner padding, including trailer.
  G1FreeIdSet _free_ids;
  // Activation threshold for the primary refinement thread.
  size_t _process_cards_threshold;
-  volatile bool _process_completed_buffers;
+
  // If the queue contains more cards than configured here, the
  // mutator must start doing some of the concurrent refinement work.
  size_t _max_cards;
  size_t _max_cards_padding;
  static const size_t MaxCardsUnlimited = SIZE_MAX;
  // Array of cumulative dirty cards refined by mutator threads.
  // Array has an entry per id in _free_ids.
  size_t* _mutator_refined_cards_counters;
  // Verify _num_cards == sum of cards in the completed queue.
  void verify_num_cards() const NOT_DEBUG_RETURN;
  // Thread-safe add a buffer to paused list for next safepoint.
  // precondition: not at safepoint.
  // precondition: does not have paused buffers from a previous safepoint.
  void record_paused_buffer(BufferNode* node);
  void enqueue_paused_buffers_aux(const HeadTail& paused);
  // Thread-safe transfer paused buffers for previous safepoints to the queue.
  // precondition: not at safepoint.
  void enqueue_previous_paused_buffers();
  // Transfer all paused buffers to the queue.
  // precondition: at safepoint.
  void enqueue_all_paused_buffers();
  void abandon_completed_buffers();
@ -90,22 +254,18 @@ class G1DirtyCardQueueSet: public PtrQueueSet {
  bool mut_process_buffer(BufferNode* node);
-  // If the queue contains more cards than configured here, the
+  // If the number of completed buffers is > stop_at, then remove and
-  // mutator must start doing some of the concurrent refinement work.
+  // return a completed buffer from the list.  Otherwise, return NULL.
-  size_t _max_cards;
+  BufferNode* get_completed_buffer(size_t stop_at = 0);
  size_t _max_cards_padding;
  static const size_t MaxCardsUnlimited = SIZE_MAX;
  G1FreeIdSet _free_ids;
  // Array of cumulative dirty cards refined by mutator threads.
  // Array has an entry per id in _free_ids.
  size_t* _mutator_refined_cards_counters;
 public:
-  G1DirtyCardQueueSet(Monitor* cbl_mon, BufferNode::Allocator* allocator);
+  G1DirtyCardQueueSet(BufferNode::Allocator* allocator);
  ~G1DirtyCardQueueSet();
  // Record the primary refinement thread.  This is a plain store of the
  // pointer into _primary_refinement_thread; no synchronization is done
  // here.
  void set_primary_refinement_thread(G1ConcurrentRefineThread* thread) {
    _primary_refinement_thread = thread;
  }
  // The number of parallel ids that can be claimed to allow collector or
  // mutator threads to do card-processing work.
  static uint num_par_ids();
@ -119,20 +279,11 @@ public:
  virtual void enqueue_completed_buffer(BufferNode* node);
-  // If the number of completed buffers is > stop_at, then remove and
+  // Upper bound on the number of cards currently in this queue set.
-  // return a completed buffer from the list.  Otherwise, return NULL.
+  // Read without synchronization.  The value may be high because there
-  BufferNode* get_completed_buffer(size_t stop_at = 0);
+  // is a concurrent modification of the set of buffers.
  // The number of cards in completed buffers. Read without synchronization.
  size_t num_cards() const { return _num_cards; }
  // Verify that _num_cards is equal to the sum of actual cards
  // in the completed buffers.
  void verify_num_cards() const NOT_DEBUG_RETURN;
  bool process_completed_buffers() { return _process_completed_buffers; }
  void set_process_completed_buffers(bool x) { _process_completed_buffers = x; }
  // Get/Set the number of cards that triggers log processing.
  // Log processing should be done when the number of cards exceeds the
  // threshold.
@ -156,8 +307,8 @@ public:
  // false.
  //
  // Stops processing a buffer if SuspendibleThreadSet::should_yield(),
-  // returning the incompletely processed buffer to the completed buffer
+  // recording the incompletely processed buffer for later processing of
-  // list, for later processing of the remainder.
+  // the remainder.
  //
  // Increments *total_refined_cards by the number of cards processed and
  // removed from the buffer.
--- a/src/hotspot/share/gc/shared/ptrQueue.hpp
+++ b/src/hotspot/share/gc/shared/ptrQueue.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -210,8 +210,6 @@ class BufferNode {
    return offset_of(BufferNode, _buffer);
  }
  static BufferNode* volatile* next_ptr(BufferNode& bn) { return &bn._next; }
  // Allocate a new BufferNode with the "buffer" having size elements.
  static BufferNode* allocate(size_t size);
@ -219,6 +217,7 @@ class BufferNode {
  static void deallocate(BufferNode* node);
 public:
  static BufferNode* volatile* next_ptr(BufferNode& bn) { return &bn._next; }
  typedef LockFreeStack<BufferNode, &next_ptr> Stack;
  BufferNode* next() const     { return _next;  }
--- a/src/hotspot/share/runtime/mutexLocker.cpp
+++ b/src/hotspot/share/runtime/mutexLocker.cpp
@ -73,7 +73,6 @@ Monitor* CGC_lock                     = NULL;
 Monitor* STS_lock                     = NULL;
 Monitor* FullGCCount_lock             = NULL;
 Monitor* G1OldGCCount_lock            = NULL;
 Monitor* DirtyCardQ_CBL_mon           = NULL;
 Mutex*   Shared_DirtyCardQ_lock       = NULL;
 Mutex*   MarkStackFreeList_lock       = NULL;
 Mutex*   MarkStackChunkList_lock      = NULL;
@ -211,7 +210,6 @@ void mutex_init() {
  if (UseG1GC) {
    def(G1OldGCCount_lock          , PaddedMonitor, leaf,        true,  _safepoint_check_always);
    def(DirtyCardQ_CBL_mon         , PaddedMonitor, access,      true,  _safepoint_check_never);
    def(Shared_DirtyCardQ_lock     , PaddedMutex  , access + 1,  true,  _safepoint_check_never);
    def(FreeList_lock              , PaddedMutex  , leaf     ,   true,  _safepoint_check_never);
--- a/src/hotspot/share/runtime/mutexLocker.hpp
+++ b/src/hotspot/share/runtime/mutexLocker.hpp
@ -69,8 +69,6 @@ extern Monitor* CGC_lock;                        // used for coordination betwee
 extern Monitor* STS_lock;                        // used for joining/leaving SuspendibleThreadSet.
 extern Monitor* FullGCCount_lock;                // in support of "concurrent" full gc
 extern Monitor* G1OldGCCount_lock;               // in support of "concurrent" full gc
 extern Monitor* DirtyCardQ_CBL_mon;              // Protects dirty card Q
                                                 // completed buffer queue.
 extern Mutex*   Shared_DirtyCardQ_lock;          // Lock protecting dirty card
                                                 // queue shared by
                                                 // non-Java threads.