8237143: Eliminate DirtyCardQ_cbl_mon

Replace locked data structures with lock-free data structures. Reviewed-by: tschatzl, sangheki
2025-09-16 00:54:38 +02:00 · 2020-02-06 19:09:07 -05:00 · 2020-02-06 19:09:07 -05:00 · ccbd819a01
commit ccbd819a01
parent e37a6aed88
10 changed files with 569 additions and 198 deletions
--- a/src/hotspot/share/gc/g1/g1BarrierSet.cpp
+++ b/src/hotspot/share/gc/g1/g1BarrierSet.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -36,7 +36,6 @@
 #include "oops/compressedOops.inline.hpp"
 #include "oops/oop.inline.hpp"
 #include "runtime/interfaceSupport.inline.hpp"
-#include "runtime/mutexLocker.hpp"
 #include "runtime/orderAccess.hpp"
 #include "runtime/thread.inline.hpp"
 #include "utilities/macros.hpp"
@ -59,7 +58,7 @@ G1BarrierSet::G1BarrierSet(G1CardTable* card_table) :
  _satb_mark_queue_buffer_allocator("SATB Buffer Allocator", G1SATBBufferSize),
  _dirty_card_queue_buffer_allocator("DC Buffer Allocator", G1UpdateBufferSize),
  _satb_mark_queue_set(&_satb_mark_queue_buffer_allocator),
-  _dirty_card_queue_set(DirtyCardQ_CBL_mon, &_dirty_card_queue_buffer_allocator),
+  _dirty_card_queue_set(&_dirty_card_queue_buffer_allocator),
  _shared_dirty_card_queue(&_dirty_card_queue_set)
 {}

--- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp
@ -2776,8 +2776,6 @@ size_t G1CollectedHeap::pending_card_num() {
  Threads::threads_do(&count_from_threads);

  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
-  dcqs.verify_num_cards();
-
  return dcqs.num_cards() + count_from_threads._cards;
 }

--- a/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp
@ -89,6 +89,11 @@ jint G1ConcurrentRefineThreadControl::initialize(G1ConcurrentRefine* cr, uint nu
      }
    }
  }
+
+  if (num_max_threads > 0) {
+    G1BarrierSet::dirty_card_queue_set().set_primary_refinement_thread(_threads[0]);
+  }
+
  return JNI_OK;
 }

@ -108,7 +113,7 @@ void G1ConcurrentRefineThreadControl::maybe_activate_next(uint cur_worker_id) {
    _threads[worker_id] = create_refinement_thread(worker_id, false);
    thread_to_activate = _threads[worker_id];
  }
-  if (thread_to_activate != NULL && !thread_to_activate->is_active()) {
+  if (thread_to_activate != NULL) {
    thread_to_activate->activate();
  }
 }
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -29,9 +29,8 @@
 #include "gc/g1/g1DirtyCardQueue.hpp"
 #include "gc/shared/suspendibleThreadSet.hpp"
 #include "logging/log.hpp"
-#include "memory/resourceArea.hpp"
-#include "runtime/handles.inline.hpp"
-#include "runtime/mutexLocker.hpp"
+#include "runtime/atomic.hpp"
+#include "runtime/thread.hpp"

 G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint worker_id) :
  ConcurrentGCThread(),
@ -40,56 +39,53 @@ G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint
  _total_refinement_time(),
  _total_refined_cards(0),
  _worker_id(worker_id),
-  _active(false),
-  _monitor(NULL),
+  _notifier(new Semaphore(0)),
+  _should_notify(true),
  _cr(cr)
 {
-  // Each thread has its own monitor. The i-th thread is responsible for signaling
-  // to thread i+1 if the number of buffers in the queue exceeds a threshold for this
-  // thread. Monitors are also used to wake up the threads during termination.
-  // The 0th (primary) worker is notified by mutator threads and has a special monitor.
-  if (!is_primary()) {
-    _monitor = new Monitor(Mutex::nonleaf, "Refinement monitor", true,
-                           Monitor::_safepoint_check_never);
-  } else {
-    _monitor = DirtyCardQ_CBL_mon;
-  }
-
  // set name
  set_name("G1 Refine#%d", worker_id);
  create_and_start();
 }

 void G1ConcurrentRefineThread::wait_for_completed_buffers() {
-  MonitorLocker ml(_monitor, Mutex::_no_safepoint_check_flag);
-  while (!should_terminate() && !is_active()) {
-    ml.wait();
+  assert(this == Thread::current(), "precondition");
+  while (Atomic::load_acquire(&_should_notify)) {
+    _notifier->wait();
  }
 }

-bool G1ConcurrentRefineThread::is_active() {
-  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
-  return is_primary() ? dcqs.process_completed_buffers() : _active;
-}
-
 void G1ConcurrentRefineThread::activate() {
-  MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag);
-  if (!is_primary()) {
-    set_active(true);
-  } else {
-    G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
-    dcqs.set_process_completed_buffers(true);
+  assert(this != Thread::current(), "precondition");
+  // Notify iff transitioning from needing activation to not.  This helps
+  // keep the semaphore count bounded and minimizes the work done by
+  // activators when the thread is already active.
+  if (Atomic::load_acquire(&_should_notify) &&
+      Atomic::cmpxchg(&_should_notify, true, false)) {
+    _notifier->signal();
  }
-  _monitor->notify();
 }

-void G1ConcurrentRefineThread::deactivate() {
-  MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag);
-  if (!is_primary()) {
-    set_active(false);
+bool G1ConcurrentRefineThread::maybe_deactivate(bool more_work) {
+  assert(this == Thread::current(), "precondition");
+
+  if (more_work) {
+    // Suppress unnecessary notifications.
+    Atomic::release_store(&_should_notify, false);
+    return false;
+  } else if (Atomic::load_acquire(&_should_notify)) {
+    // Deactivate if no notifications since enabled (see below).
+    return true;
  } else {
-    G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
-    dcqs.set_process_completed_buffers(false);
+    // Try for more refinement work with notifications enabled, to close
+    // race; there could be a plethora of suppressed activation attempts
+    // after we found no work but before we enable notifications here
+    // (so there could be lots of work for this thread to do), followed
+    // by a long time without activation after enabling notifications.
+    // But first, clear any pending signals to prevent accumulation.
+    while (_notifier->trywait()) {}
+    Atomic::release_store(&_should_notify, true);
+    return false;
  }
 }

@ -119,14 +115,13 @@ void G1ConcurrentRefineThread::run_service() {
        }

        Ticks start_time = Ticks::now();
-        if (!_cr->do_refinement_step(_worker_id, &_total_refined_cards)) {
-          break;                // No cards to process.
-        }
+        bool more_work = _cr->do_refinement_step(_worker_id, &_total_refined_cards);
        _total_refinement_time += (Ticks::now() - start_time);
+
+        if (maybe_deactivate(more_work)) break;
      }
    }

-    deactivate();
    log_debug(gc, refine)("Deactivated worker %d, off threshold: " SIZE_FORMAT
                          ", current: " SIZE_FORMAT ", refined cards: "
                          SIZE_FORMAT ", total refined cards: " SIZE_FORMAT,
@ -146,6 +141,5 @@ void G1ConcurrentRefineThread::run_service() {
 }

 void G1ConcurrentRefineThread::stop_service() {
-  MutexLocker x(_monitor, Mutex::_no_safepoint_check_flag);
-  _monitor->notify();
+  activate();
 }
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -45,24 +45,33 @@ class G1ConcurrentRefineThread: public ConcurrentGCThread {

  uint _worker_id;

-  bool _active;
-  Monitor* _monitor;
+  // _notifier and _should_notify form a single-reader / multi-writer
+  // notification mechanism.  The owning concurrent refinement thread is the
+  // single reader. The writers are (other) threads that call activate() on
+  // the thread.  The i-th concurrent refinement thread is responsible for
+  // activating thread i+1 if the number of buffers in the queue exceeds a
+  // threshold for that i+1th thread.  The 0th (primary) thread is activated
+  // by threads that add cards to the dirty card queue set when the primary
+  // thread's threshold is exceeded.  activate() is also used to wake up the
+  // threads during termination, so even the non-primary thread case is
+  // multi-writer.
+  Semaphore* _notifier;
+  volatile bool _should_notify;
+
+  // Called when no refinement work found for this thread.
+  // Returns true if should deactivate.
+  bool maybe_deactivate(bool more_work);
+
  G1ConcurrentRefine* _cr;

  void wait_for_completed_buffers();

-  void set_active(bool x) { _active = x; }
-  // Deactivate this thread.
-  void deactivate();
+  virtual void run_service();
+  virtual void stop_service();

-  bool is_primary() { return (_worker_id == 0); }
-
-  void run_service();
-  void stop_service();
 public:
  G1ConcurrentRefineThread(G1ConcurrentRefine* cg1r, uint worker_id);

-  bool is_active();
  // Activate this thread.
  void activate();

--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -26,6 +26,7 @@
 #include "gc/g1/g1BufferNodeList.hpp"
 #include "gc/g1/g1CardTableEntryClosure.hpp"
 #include "gc/g1/g1CollectedHeap.inline.hpp"
+#include "gc/g1/g1ConcurrentRefineThread.hpp"
 #include "gc/g1/g1DirtyCardQueue.hpp"
 #include "gc/g1/g1FreeIdSet.hpp"
 #include "gc/g1/g1RedirtyCardsQueue.hpp"
@ -33,15 +34,14 @@
 #include "gc/g1/g1ThreadLocalData.hpp"
 #include "gc/g1/heapRegionRemSet.hpp"
 #include "gc/shared/suspendibleThreadSet.hpp"
-#include "gc/shared/workgroup.hpp"
 #include "memory/iterator.hpp"
-#include "runtime/flags/flagSetting.hpp"
-#include "runtime/mutexLocker.hpp"
-#include "runtime/orderAccess.hpp"
+#include "runtime/atomic.hpp"
 #include "runtime/os.hpp"
 #include "runtime/safepoint.hpp"
 #include "runtime/thread.inline.hpp"
 #include "runtime/threadSMR.hpp"
+#include "utilities/globalCounter.inline.hpp"
+#include "utilities/macros.hpp"
 #include "utilities/quickSort.hpp"

 G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) :
@ -68,18 +68,16 @@ void G1DirtyCardQueue::handle_completed_buffer() {
 // Assumed to be zero by concurrent threads.
 static uint par_ids_start() { return 0; }

-G1DirtyCardQueueSet::G1DirtyCardQueueSet(Monitor* cbl_mon,
-                                         BufferNode::Allocator* allocator) :
+G1DirtyCardQueueSet::G1DirtyCardQueueSet(BufferNode::Allocator* allocator) :
  PtrQueueSet(allocator),
-  _cbl_mon(cbl_mon),
-  _completed_buffers_head(NULL),
-  _completed_buffers_tail(NULL),
+  _primary_refinement_thread(NULL),
  _num_cards(0),
+  _completed(),
+  _paused(),
+  _free_ids(par_ids_start(), num_par_ids()),
  _process_cards_threshold(ProcessCardsThresholdNever),
-  _process_completed_buffers(false),
  _max_cards(MaxCardsUnlimited),
  _max_cards_padding(0),
-  _free_ids(par_ids_start(), num_par_ids()),
  _mutator_refined_cards_counters(NEW_C_HEAP_ARRAY(size_t, num_par_ids(), mtGC))
 {
  ::memset(_mutator_refined_cards_counters, 0, num_par_ids() * sizeof(size_t));
@ -108,75 +106,304 @@ void G1DirtyCardQueueSet::handle_zero_index_for_thread(Thread* t) {
  G1ThreadLocalData::dirty_card_queue(t).handle_zero_index();
 }

-void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) {
-  MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag);
-  cbn->set_next(NULL);
-  if (_completed_buffers_tail == NULL) {
-    assert(_completed_buffers_head == NULL, "Well-formedness");
-    _completed_buffers_head = cbn;
-    _completed_buffers_tail = cbn;
-  } else {
-    _completed_buffers_tail->set_next(cbn);
-    _completed_buffers_tail = cbn;
+#ifdef ASSERT
+G1DirtyCardQueueSet::Queue::~Queue() {
+  assert(_head == NULL, "precondition");
+  assert(_tail == NULL, "precondition");
 }
-  _num_cards += buffer_size() - cbn->index();
+#endif // ASSERT

-  if (!process_completed_buffers() &&
-      (num_cards() > process_cards_threshold())) {
-    set_process_completed_buffers(true);
-    ml.notify_all();
+BufferNode* G1DirtyCardQueueSet::Queue::top() const {
+  return Atomic::load(&_head);
+}
+
+// An append operation atomically exchanges the new tail with the queue tail.
+// It then sets the "next" value of the old tail to the head of the list being
+// appended; it is an invariant that the old tail's "next" value is NULL.
+// But if the old tail is NULL then the queue was empty.  In this case the
+// head of the list being appended is instead stored in the queue head; it is
+// an invariant that the queue head is NULL in this case.
+//
+// This means there is a period between the exchange and the old tail update
+// where the queue sequence is split into two parts, the list from the queue
+// head to the old tail, and the list being appended.  If there are concurrent
+// push/append operations, each may introduce another such segment.  But they
+// all eventually get resolved by their respective updates of their old tail's
+// "next" value.  This also means that pop operations must handle a buffer
+// with a NULL "next" value specially.
+//
+// A push operation is just a degenerate append, where the buffer being pushed
+// is both the head and the tail of the list being appended.
+void G1DirtyCardQueueSet::Queue::append(BufferNode& first, BufferNode& last) {
+  assert(last.next() == NULL, "precondition");
+  BufferNode* old_tail = Atomic::xchg(&_tail, &last);
+  if (old_tail == NULL) {       // Was empty.
+    assert(Atomic::load(&_head) == NULL, "invariant");
+    Atomic::store(&_head, &first);
+  } else {
+    assert(old_tail->next() == NULL, "invariant");
+    old_tail->set_next(&first);
+  }
+}
+
+// pop gets the queue head as the candidate result (returning NULL if the
+// queue head was NULL), and then gets that result node's "next" value.  If
+// that "next" value is NULL and the queue head hasn't changed, then there
+// is only one element in the accessible part of the list (the sequence from
+// head to a node with a NULL "next" value).  We can't return that element,
+// because it may be the old tail of a concurrent push/append that has not
+// yet had its "next" field set to the new tail.  So return NULL in this case.
+// Otherwise, attempt to cmpxchg that "next" value into the queue head,
+// retrying the whole operation if that fails. This is the "usual" lock-free
+// pop from the head of a singly linked list, with the additional restriction
+// on taking the last element.
+BufferNode* G1DirtyCardQueueSet::Queue::pop() {
+  Thread* current_thread = Thread::current();
+  while (true) {
+    // Use a critical section per iteration, rather than over the whole
+    // operation.  We're not guaranteed to make progress, because of possible
+    // contention on the queue head.  Lingering in one CS the whole time could
+    // lead to excessive allocation of buffers, because the CS blocks return
+    // of released buffers to the free list for reuse.
+    GlobalCounter::CriticalSection cs(current_thread);
+
+    BufferNode* result = Atomic::load_acquire(&_head);
+    // Check for empty queue.  Only needs to be done on first iteration,
+    // since we never take the last element, but it's messy to make use
+    // of that and we expect one iteration to be the common case.
+    if (result == NULL) return NULL;
+
+    BufferNode* next = Atomic::load_acquire(BufferNode::next_ptr(*result));
+    if (next != NULL) {
+      next = Atomic::cmpxchg(&_head, result, next);
+      if (next == result) {
+        // Former head successfully taken; it is not the last.
+        assert(Atomic::load(&_tail) != result, "invariant");
+        assert(result->next() != NULL, "invariant");
+        result->set_next(NULL);
+        return result;
+      }
+      // cmpxchg failed; try again.
+    } else if (result == Atomic::load_acquire(&_head)) {
+      // If follower of head is NULL and head hasn't changed, then only
+      // the one element is currently accessible.  We don't take the last
+      // accessible element, because there may be a concurrent add using it.
+      // The check for unchanged head isn't needed for correctness, but the
+      // retry on change may sometimes let us get a buffer after all.
+      return NULL;
+    }
+    // Head changed; try again.
+  }
+}
+
+G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::Queue::take_all() {
+  assert_at_safepoint();
+  HeadTail result(Atomic::load(&_head), Atomic::load(&_tail));
+  Atomic::store(&_head, (BufferNode*)NULL);
+  Atomic::store(&_tail, (BufferNode*)NULL);
+  return result;
+}
+
+void G1DirtyCardQueueSet::enqueue_completed_buffer(BufferNode* cbn) {
+  assert(cbn != NULL, "precondition");
+  // Increment _num_cards before adding to queue, so queue removal doesn't
+  // need to deal with _num_cards possibly going negative.
+  size_t new_num_cards = Atomic::add(&_num_cards, buffer_size() - cbn->index());
+  _completed.push(*cbn);
+  if ((new_num_cards > process_cards_threshold()) &&
+      (_primary_refinement_thread != NULL)) {
+    _primary_refinement_thread->activate();
  }
-  verify_num_cards();
 }

 BufferNode* G1DirtyCardQueueSet::get_completed_buffer(size_t stop_at) {
-  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
+  enqueue_previous_paused_buffers();

-  if (num_cards() <= stop_at) {
+  // Check for insufficient cards to satisfy request.  We only do this once,
+  // up front, rather than on each iteration below, since the test is racy
+  // regardless of when we do it.
+  if (Atomic::load_acquire(&_num_cards) <= stop_at) {
    return NULL;
  }

-  assert(num_cards() > 0, "invariant");
-  assert(_completed_buffers_head != NULL, "invariant");
-  assert(_completed_buffers_tail != NULL, "invariant");
-
-  BufferNode* bn = _completed_buffers_head;
-  _num_cards -= buffer_size() - bn->index();
-  _completed_buffers_head = bn->next();
-  if (_completed_buffers_head == NULL) {
-    assert(num_cards() == 0, "invariant");
-    _completed_buffers_tail = NULL;
-    set_process_completed_buffers(false);
+  BufferNode* result = _completed.pop();
+  if (result != NULL) {
+    Atomic::sub(&_num_cards, buffer_size() - result->index());
  }
-  verify_num_cards();
-  bn->set_next(NULL);
-  return bn;
+  return result;
 }

 #ifdef ASSERT
 void G1DirtyCardQueueSet::verify_num_cards() const {
  size_t actual = 0;
-  BufferNode* cur = _completed_buffers_head;
-  while (cur != NULL) {
+  BufferNode* cur = _completed.top();
+  for ( ; cur != NULL; cur = cur->next()) {
    actual += buffer_size() - cur->index();
-    cur = cur->next();
  }
-  assert(actual == _num_cards,
+  assert(actual == Atomic::load(&_num_cards),
         "Num entries in completed buffers should be " SIZE_FORMAT " but are " SIZE_FORMAT,
-         _num_cards, actual);
+         Atomic::load(&_num_cards), actual);
+}
+#endif // ASSERT
+
+G1DirtyCardQueueSet::PausedBuffers::PausedList::PausedList() :
+  _head(NULL), _tail(NULL),
+  _safepoint_id(SafepointSynchronize::safepoint_id())
+{}
+
+#ifdef ASSERT
+G1DirtyCardQueueSet::PausedBuffers::PausedList::~PausedList() {
+  assert(Atomic::load(&_head) == NULL, "precondition");
+  assert(_tail == NULL, "precondition");
+}
+#endif // ASSERT
+
+bool G1DirtyCardQueueSet::PausedBuffers::PausedList::is_next() const {
+  assert_not_at_safepoint();
+  return _safepoint_id == SafepointSynchronize::safepoint_id();
+}
+
+void G1DirtyCardQueueSet::PausedBuffers::PausedList::add(BufferNode* node) {
+  assert_not_at_safepoint();
+  assert(is_next(), "precondition");
+  BufferNode* old_head = Atomic::xchg(&_head, node);
+  if (old_head == NULL) {
+    assert(_tail == NULL, "invariant");
+    _tail = node;
+  } else {
+    node->set_next(old_head);
+  }
+}
+
+G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::PausedList::take() {
+  BufferNode* head = Atomic::load(&_head);
+  BufferNode* tail = _tail;
+  Atomic::store(&_head, (BufferNode*)NULL);
+  _tail = NULL;
+  return HeadTail(head, tail);
+}
+
+G1DirtyCardQueueSet::PausedBuffers::PausedBuffers() : _plist(NULL) {}
+
+#ifdef ASSERT
+G1DirtyCardQueueSet::PausedBuffers::~PausedBuffers() {
+  assert(is_empty(), "invariant");
+}
+#endif // ASSERT
+
+bool G1DirtyCardQueueSet::PausedBuffers::is_empty() const {
+  return Atomic::load(&_plist) == NULL;
+}
+
+void G1DirtyCardQueueSet::PausedBuffers::add(BufferNode* node) {
+  assert_not_at_safepoint();
+  PausedList* plist = Atomic::load_acquire(&_plist);
+  if (plist != NULL) {
+    // Already have a next list, so use it.  We know it's a next list because
+    // of the precondition that take_previous() has already been called.
+    assert(plist->is_next(), "invariant");
+  } else {
+    // Try to install a new next list.
+    plist = new PausedList();
+    PausedList* old_plist = Atomic::cmpxchg(&_plist, (PausedList*)NULL, plist);
+    if (old_plist != NULL) {
+      // Some other thread installed a new next list. Use it instead.
+      delete plist;
+      plist = old_plist;
+    }
+  }
+  plist->add(node);
+}
+
+G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::take_previous() {
+  assert_not_at_safepoint();
+  PausedList* previous;
+  {
+    // Deal with plist in a critical section, to prevent it from being
+    // deleted out from under us by a concurrent take_previous().
+    GlobalCounter::CriticalSection cs(Thread::current());
+    previous = Atomic::load_acquire(&_plist);
+    if ((previous == NULL) ||   // Nothing to take.
+        previous->is_next() ||  // Not from a previous safepoint.
+        // Some other thread stole it.
+        (Atomic::cmpxchg(&_plist, previous, (PausedList*)NULL) != previous)) {
+      return HeadTail();
+    }
+  }
+  // We now own previous.
+  HeadTail result = previous->take();
+  // There might be other threads examining previous (in concurrent
+  // take_previous()).  Synchronize to wait until any such threads are
+  // done with such examination before deleting.
+  GlobalCounter::write_synchronize();
+  delete previous;
+  return result;
+}
+
+G1DirtyCardQueueSet::HeadTail G1DirtyCardQueueSet::PausedBuffers::take_all() {
+  assert_at_safepoint();
+  HeadTail result;
+  PausedList* plist = Atomic::load(&_plist);
+  if (plist != NULL) {
+    Atomic::store(&_plist, (PausedList*)NULL);
+    result = plist->take();
+    delete plist;
+  }
+  return result;
+}
+
+void G1DirtyCardQueueSet::record_paused_buffer(BufferNode* node) {
+  assert_not_at_safepoint();
+  assert(node->next() == NULL, "precondition");
+  // Cards for paused buffers are included in count, to contribute to
+  // notification checking after the coming safepoint if it doesn't GC.
+  // Note that this means the queue's _num_cards differs from the number
+  // of cards in the queued buffers when there are paused buffers.
+  Atomic::add(&_num_cards, buffer_size() - node->index());
+  _paused.add(node);
+}
+
+void G1DirtyCardQueueSet::enqueue_paused_buffers_aux(const HeadTail& paused) {
+  if (paused._head != NULL) {
+    assert(paused._tail != NULL, "invariant");
+    // Cards from paused buffers are already recorded in the queue count.
+    _completed.append(*paused._head, *paused._tail);
+  }
+}
+
+void G1DirtyCardQueueSet::enqueue_previous_paused_buffers() {
+  assert_not_at_safepoint();
+  // The fast-path still satisfies the precondition for record_paused_buffer
+  // and PausedBuffers::add, even with a racy test.  If there are paused
+  // buffers from a previous safepoint, is_empty() will return false; there
+  // will have been a safepoint between recording and test, so there can't be
+  // a false negative (is_empty() returns true) while such buffers are present.
+  // If is_empty() is false, there are two cases:
+  //
+  // (1) There were paused buffers from a previous safepoint.  A concurrent
+  // caller may take and enqueue them first, but that's okay; the precondition
+  // for a possible later record_paused_buffer by this thread will still hold.
+  //
+  // (2) There are paused buffers for a requested next safepoint.
+  //
+  // In each of those cases some effort may be spent detecting and dealing
+  // with those circumstances; any wasted effort in such cases is expected to
+  // be well compensated by the fast path.
+  if (!_paused.is_empty()) {
+    enqueue_paused_buffers_aux(_paused.take_previous());
+  }
+}
+
+void G1DirtyCardQueueSet::enqueue_all_paused_buffers() {
+  assert_at_safepoint();
+  enqueue_paused_buffers_aux(_paused.take_all());
 }
-#endif

 void G1DirtyCardQueueSet::abandon_completed_buffers() {
-  BufferNode* buffers_to_delete = NULL;
-  {
-    MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
-    buffers_to_delete = _completed_buffers_head;
-    _completed_buffers_head = NULL;
-    _completed_buffers_tail = NULL;
-    _num_cards = 0;
-    set_process_completed_buffers(false);
-  }
+  enqueue_all_paused_buffers();
+  verify_num_cards();
+  G1BufferNodeList list = take_all_completed_buffers();
+  BufferNode* buffers_to_delete = list._head;
  while (buffers_to_delete != NULL) {
    BufferNode* bn = buffers_to_delete;
    buffers_to_delete = bn->next();
@ -186,46 +413,30 @@ void G1DirtyCardQueueSet::abandon_completed_buffers() {
 }

 void G1DirtyCardQueueSet::notify_if_necessary() {
-  MonitorLocker ml(_cbl_mon, Mutex::_no_safepoint_check_flag);
-  if (num_cards() > process_cards_threshold()) {
-    set_process_completed_buffers(true);
-    ml.notify_all();
+  if ((_primary_refinement_thread != NULL) &&
+      (num_cards() > process_cards_threshold())) {
+    _primary_refinement_thread->activate();
  }
 }

-// Merge lists of buffers. Notify the processing threads.
-// The source queue is emptied as a result. The queues
-// must share the monitor.
+// Merge lists of buffers. The source queue set is emptied as a
+// result. The queue sets must share the same allocator.
 void G1DirtyCardQueueSet::merge_bufferlists(G1RedirtyCardsQueueSet* src) {
  assert(allocator() == src->allocator(), "precondition");
  const G1BufferNodeList from = src->take_all_completed_buffers();
-  if (from._head == NULL) return;
-
-  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
-  if (_completed_buffers_tail == NULL) {
-    assert(_completed_buffers_head == NULL, "Well-formedness");
-    _completed_buffers_head = from._head;
-    _completed_buffers_tail = from._tail;
-  } else {
-    assert(_completed_buffers_head != NULL, "Well formedness");
-    _completed_buffers_tail->set_next(from._head);
-    _completed_buffers_tail = from._tail;
+  if (from._head != NULL) {
+    Atomic::add(&_num_cards, from._entry_count);
+    _completed.append(*from._head, *from._tail);
  }
-  _num_cards += from._entry_count;
-
-  assert(_completed_buffers_head == NULL && _completed_buffers_tail == NULL ||
-         _completed_buffers_head != NULL && _completed_buffers_tail != NULL,
-         "Sanity");
-  verify_num_cards();
 }

 G1BufferNodeList G1DirtyCardQueueSet::take_all_completed_buffers() {
-  MutexLocker x(_cbl_mon, Mutex::_no_safepoint_check_flag);
-  G1BufferNodeList result(_completed_buffers_head, _completed_buffers_tail, _num_cards);
-  _completed_buffers_head = NULL;
-  _completed_buffers_tail = NULL;
-  _num_cards = 0;
-  return result;
+  enqueue_all_paused_buffers();
+  verify_num_cards();
+  HeadTail buffers = _completed.take_all();
+  size_t num_cards = Atomic::load(&_num_cards);
+  Atomic::store(&_num_cards, size_t(0));
+  return G1BufferNodeList(buffers._head, buffers._tail, num_cards);
 }

 class G1RefineBufferedCards : public StackObj {
@ -368,14 +579,20 @@ bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node,
 bool G1DirtyCardQueueSet::process_or_enqueue_completed_buffer(BufferNode* node) {
  if (Thread::current()->is_Java_thread()) {
    // If the number of buffers exceeds the limit, make this Java
-    // thread do the processing itself.  We don't lock to access
-    // buffer count or padding; it is fine to be imprecise here.  The
-    // add of padding could overflow, which is treated as unlimited.
+    // thread do the processing itself.  Calculation is racy but we
+    // don't need precision here.  The add of padding could overflow,
+    // which is treated as unlimited.
    size_t limit = max_cards() + max_cards_padding();
    if ((num_cards() > limit) && (limit >= max_cards())) {
      if (mut_process_buffer(node)) {
        return true;
      }
+      // Buffer was incompletely processed because of a pending safepoint
+      // request.  Unlike with refinement thread processing, for mutator
+      // processing the buffer did not come from the completed buffer queue,
+      // so it is okay to add it to the queue rather than to the paused set.
+      // Indeed, it can't be added to the paused set because we didn't pass
+      // through enqueue_previous_paused_buffers.
    }
  }
  enqueue_completed_buffer(node);
@ -407,14 +624,15 @@ bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id,
    deallocate_buffer(node);
    return true;
  } else {
-    // Return partially processed buffer to the queue.
-    enqueue_completed_buffer(node);
+    // Buffer incompletely processed because there is a pending safepoint.
+    // Record partially processed buffer, to be finished later.
+    record_paused_buffer(node);
    return true;
  }
 }

 void G1DirtyCardQueueSet::abandon_logs() {
-  assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
+  assert_at_safepoint();
  abandon_completed_buffers();

  // Since abandon is done only at safepoints, we can safely manipulate
@ -433,7 +651,7 @@ void G1DirtyCardQueueSet::concatenate_logs() {
  // Iterate over all the threads, if we find a partial log add it to
  // the global list of logs.  Temporarily turn off the limit on the number
  // of outstanding buffers.
-  assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
+  assert_at_safepoint();
  size_t old_limit = max_cards();
  set_max_cards(MaxCardsUnlimited);

@ -448,5 +666,7 @@ void G1DirtyCardQueueSet::concatenate_logs() {
  Threads::threads_do(&closure);

  G1BarrierSet::shared_dirty_card_queue().flush();
+  enqueue_all_paused_buffers();
+  verify_num_cards();
  set_max_cards(old_limit);
 }
--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -29,11 +29,12 @@
 #include "gc/g1/g1FreeIdSet.hpp"
 #include "gc/shared/ptrQueue.hpp"
 #include "memory/allocation.hpp"
+#include "memory/padded.hpp"

+class G1ConcurrentRefineThread;
 class G1DirtyCardQueueSet;
 class G1RedirtyCardsQueueSet;
 class Thread;
-class Monitor;

 // A ptrQueue whose elements are "oops", pointers to object heads.
 class G1DirtyCardQueue: public PtrQueue {
@ -66,15 +67,178 @@ public:
 };

 class G1DirtyCardQueueSet: public PtrQueueSet {
-  Monitor* _cbl_mon;  // Protects the list and count members.
-  BufferNode* _completed_buffers_head;
-  BufferNode* _completed_buffers_tail;
+  // Head and tail of a list of BufferNodes, linked through their next()
+  // fields.  Similar to G1BufferNodeList, but without the _entry_count.
+  struct HeadTail {
+    BufferNode* _head;
+    BufferNode* _tail;
+    HeadTail() : _head(NULL), _tail(NULL) {}
+    HeadTail(BufferNode* head, BufferNode* tail) : _head(head), _tail(tail) {}
+  };

-  // Number of actual cards in the list of completed buffers.
+  // A lock-free FIFO of BufferNodes, linked through their next() fields.
+  // This class has a restriction that pop() cannot return the last buffer
+  // in the queue, or what was the last buffer for a concurrent push/append
+  // operation.  It is expected that there will be a later push/append that
+  // will make that buffer available to a future pop(), or there will
+  // eventually be a complete transfer via take_all().
+  class Queue {
+    BufferNode* volatile _head;
+    DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(BufferNode*));
+    BufferNode* volatile _tail;
+    DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(BufferNode*));
+
+    NONCOPYABLE(Queue);
+
+  public:
+    Queue() : _head(NULL), _tail(NULL) {}
+    DEBUG_ONLY(~Queue();)
+
+    // Return the first buffer in the queue.
+    // Thread-safe, but the result may change immediately.
+    BufferNode* top() const;
+
+    // Thread-safe add the buffer to the end of the queue.
+    void push(BufferNode& node) { append(node, node); }
+
+    // Thread-safe add the buffers from first to last to the end of the queue.
+    void append(BufferNode& first, BufferNode& last);
+
+    // Thread-safe attempt to remove and return the first buffer in the queue.
+    // Returns NULL if the queue is empty, or if only one buffer is found.
+    // Uses GlobalCounter critical sections to address the ABA problem; this
+    // works with the buffer allocator's use of GlobalCounter synchronization.
+    BufferNode* pop();
+
+    // Take all the buffers from the queue, leaving the queue empty.
+    // Not thread-safe.
+    HeadTail take_all();
+  };
+
+  // Concurrent refinement may stop processing in the middle of a buffer if
+  // there is a pending safepoint, to avoid long delays to safepoint.  A
+  // partially processed buffer needs to be recorded for processing by the
+  // safepoint if it's a GC safepoint; otherwise it needs to be recorded for
+  // further concurrent refinement work after the safepoint.  But if the
+  // buffer was obtained from the completed buffer queue then it can't simply
+  // be added back to the queue, as that would introduce a new source of ABA
+  // for the queue.
+  //
+  // The PausedBuffer object is used to record such buffers for the upcoming
+  // safepoint, and provides access to the buffers recorded for previous
+  // safepoints.  Before obtaining a buffer from the completed buffers queue,
+  // we first transfer any buffers from previous safepoints to the queue.
+  // This is ABA-safe because threads cannot be in the midst of a queue pop
+  // across a safepoint.
+  //
+  // The paused buffers are conceptually an extension of the completed buffers
+  // queue, and operations which need to deal with all of the queued buffers
+  // (such as concatenate_logs) also need to deal with any paused buffers.  In
+  // general, if a safepoint performs a GC then the paused buffers will be
+  // processed as part of it, and there won't be any paused buffers after a
+  // GC safepoint.
+  class PausedBuffers {
+    class PausedList : public CHeapObj<mtGC> {
+      BufferNode* volatile _head;
+      BufferNode* _tail;
+      size_t _safepoint_id;
+
+      NONCOPYABLE(PausedList);
+
+    public:
+      PausedList();
+      DEBUG_ONLY(~PausedList();)
+
+      // Return true if this list was created to hold buffers for the
+      // next safepoint.
+      // precondition: not at safepoint.
+      bool is_next() const;
+
+      // Thread-safe add the buffer to the list.
+      // precondition: not at safepoint.
+      // precondition: is_next().
+      void add(BufferNode* node);
+
+      // Take all the buffers from the list.  Not thread-safe.
+      HeadTail take();
+    };
+
+    // The most recently created list, which might be for either the next or
+    // a previous safepoint, or might be NULL if the next list hasn't been
+    // created yet.  We only need one list because of the requirement that
+    // threads calling add() must first ensure there are no paused buffers
+    // from a previous safepoint.  There might be many list instances existing
+    // at the same time though; there can be many threads competing to create
+    // and install the next list, and meanwhile there can be a thread dealing
+    // with the previous list.
+    PausedList* volatile _plist;
+    DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(PausedList*));
+
+    NONCOPYABLE(PausedBuffers);
+
+  public:
+    PausedBuffers();
+    DEBUG_ONLY(~PausedBuffers();)
+
+    // Test whether there are any paused lists.
+    // Thread-safe, but the answer may change immediately.
+    bool is_empty() const;
+
+    // Thread-safe add the buffer to paused list for next safepoint.
+    // precondition: not at safepoint.
+    // precondition: does not have paused buffers from a previous safepoint.
+    void add(BufferNode* node);
+
+    // Thread-safe take all paused buffers for previous safepoints.
+    // precondition: not at safepoint.
+    HeadTail take_previous();
+
+    // Take all the paused buffers.
+    // precondition: at safepoint.
+    HeadTail take_all();
+  };
+
+  // The primary refinement thread, for activation when the processing
+  // threshold is reached.  NULL if there aren't any refinement threads.
+  G1ConcurrentRefineThread* _primary_refinement_thread;
+  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(G1ConcurrentRefineThread*));
+  // Upper bound on the number of cards in the completed and paused buffers.
  volatile size_t _num_cards;
+  DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(size_t));
+  // Buffers ready for refinement.
+  Queue _completed;           // Has inner padding, including trailer.
+  // Buffers for which refinement is temporarily paused.
+  PausedBuffers _paused;      // Has inner padding, including trailer.

+  G1FreeIdSet _free_ids;
+
+  // Activation threshold for the primary refinement thread.
  size_t _process_cards_threshold;
-  volatile bool _process_completed_buffers;
+
+  // If the queue contains more cards than configured here, the
+  // mutator must start doing some of the concurrent refinement work.
+  size_t _max_cards;
+  size_t _max_cards_padding;
+  static const size_t MaxCardsUnlimited = SIZE_MAX;
+
+  // Array of cumulative dirty cards refined by mutator threads.
+  // Array has an entry per id in _free_ids.
+  size_t* _mutator_refined_cards_counters;
+
+  // Verify _num_cards == sum of cards in the completed queue.
+  void verify_num_cards() const NOT_DEBUG_RETURN;
+
+  // Thread-safe add a buffer to paused list for next safepoint.
+  // precondition: not at safepoint.
+  // precondition: does not have paused buffers from a previous safepoint.
+  void record_paused_buffer(BufferNode* node);
+  void enqueue_paused_buffers_aux(const HeadTail& paused);
+  // Thread-safe transfer paused buffers for previous safepoints to the queue.
+  // precondition: not at safepoint.
+  void enqueue_previous_paused_buffers();
+  // Transfer all paused buffers to the queue.
+  // precondition: at safepoint.
+  void enqueue_all_paused_buffers();

  void abandon_completed_buffers();

@ -90,22 +254,18 @@ class G1DirtyCardQueueSet: public PtrQueueSet {

  bool mut_process_buffer(BufferNode* node);

-  // If the queue contains more cards than configured here, the
-  // mutator must start doing some of the concurrent refinement work.
-  size_t _max_cards;
-  size_t _max_cards_padding;
-  static const size_t MaxCardsUnlimited = SIZE_MAX;
-
-  G1FreeIdSet _free_ids;
-
-  // Array of cumulative dirty cards refined by mutator threads.
-  // Array has an entry per id in _free_ids.
-  size_t* _mutator_refined_cards_counters;
+  // If the number of completed buffers is > stop_at, then remove and
+  // return a completed buffer from the list.  Otherwise, return NULL.
+  BufferNode* get_completed_buffer(size_t stop_at = 0);

 public:
-  G1DirtyCardQueueSet(Monitor* cbl_mon, BufferNode::Allocator* allocator);
+  G1DirtyCardQueueSet(BufferNode::Allocator* allocator);
  ~G1DirtyCardQueueSet();

+  void set_primary_refinement_thread(G1ConcurrentRefineThread* thread) {
+    _primary_refinement_thread = thread;
+  }
+
  // The number of parallel ids that can be claimed to allow collector or
  // mutator threads to do card-processing work.
  static uint num_par_ids();
@ -119,20 +279,11 @@ public:

  virtual void enqueue_completed_buffer(BufferNode* node);

-  // If the number of completed buffers is > stop_at, then remove and
-  // return a completed buffer from the list.  Otherwise, return NULL.
-  BufferNode* get_completed_buffer(size_t stop_at = 0);
-
-  // The number of cards in completed buffers. Read without synchronization.
+  // Upper bound on the number of cards currently in in this queue set.
+  // Read without synchronization.  The value may be high because there
+  // is a concurrent modification of the set of buffers.
  size_t num_cards() const { return _num_cards; }

-  // Verify that _num_cards is equal to the sum of actual cards
-  // in the completed buffers.
-  void verify_num_cards() const NOT_DEBUG_RETURN;
-
-  bool process_completed_buffers() { return _process_completed_buffers; }
-  void set_process_completed_buffers(bool x) { _process_completed_buffers = x; }
-
  // Get/Set the number of cards that triggers log processing.
  // Log processing should be done when the number of cards exceeds the
  // threshold.
@ -156,8 +307,8 @@ public:
  // false.
  //
  // Stops processing a buffer if SuspendibleThreadSet::should_yield(),
-  // returning the incompletely processed buffer to the completed buffer
-  // list, for later processing of the remainder.
+  // recording the incompletely processed buffer for later processing of
+  // the remainder.
  //
  // Increments *total_refined_cards by the number of cards processed and
  // removed from the buffer.
--- a/src/hotspot/share/gc/shared/ptrQueue.hpp
+++ b/src/hotspot/share/gc/shared/ptrQueue.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -210,8 +210,6 @@ class BufferNode {
    return offset_of(BufferNode, _buffer);
  }

-  static BufferNode* volatile* next_ptr(BufferNode& bn) { return &bn._next; }
-
  // Allocate a new BufferNode with the "buffer" having size elements.
  static BufferNode* allocate(size_t size);

@ -219,6 +217,7 @@ class BufferNode {
  static void deallocate(BufferNode* node);

 public:
+  static BufferNode* volatile* next_ptr(BufferNode& bn) { return &bn._next; }
  typedef LockFreeStack<BufferNode, &next_ptr> Stack;

  BufferNode* next() const     { return _next;  }
--- a/src/hotspot/share/runtime/mutexLocker.cpp
+++ b/src/hotspot/share/runtime/mutexLocker.cpp
@ -73,7 +73,6 @@ Monitor* CGC_lock                     = NULL;
 Monitor* STS_lock                     = NULL;
 Monitor* FullGCCount_lock             = NULL;
 Monitor* G1OldGCCount_lock            = NULL;
-Monitor* DirtyCardQ_CBL_mon           = NULL;
 Mutex*   Shared_DirtyCardQ_lock       = NULL;
 Mutex*   MarkStackFreeList_lock       = NULL;
 Mutex*   MarkStackChunkList_lock      = NULL;
@ -211,7 +210,6 @@ void mutex_init() {
  if (UseG1GC) {
    def(G1OldGCCount_lock          , PaddedMonitor, leaf,        true,  _safepoint_check_always);

-    def(DirtyCardQ_CBL_mon         , PaddedMonitor, access,      true,  _safepoint_check_never);
    def(Shared_DirtyCardQ_lock     , PaddedMutex  , access + 1,  true,  _safepoint_check_never);

    def(FreeList_lock              , PaddedMutex  , leaf     ,   true,  _safepoint_check_never);
--- a/src/hotspot/share/runtime/mutexLocker.hpp
+++ b/src/hotspot/share/runtime/mutexLocker.hpp
@ -69,8 +69,6 @@ extern Monitor* CGC_lock;                        // used for coordination betwee
 extern Monitor* STS_lock;                        // used for joining/leaving SuspendibleThreadSet.
 extern Monitor* FullGCCount_lock;                // in support of "concurrent" full gc
 extern Monitor* G1OldGCCount_lock;               // in support of "concurrent" full gc
-extern Monitor* DirtyCardQ_CBL_mon;              // Protects dirty card Q
-                                                 // completed buffer queue.
 extern Mutex*   Shared_DirtyCardQ_lock;          // Lock protecting dirty card
                                                 // queue shared by
                                                 // non-Java threads.