8242078: G1: Improve concurrent refinement analytics and logging

Unify data collection and reporting. Reviewed-by: tschatzl, sjohanss
2025-09-17 01:24:33 +02:00 · 2020-04-14 17:31:53 -04:00 · 2020-04-14 17:31:53 -04:00 · 4694da31f8
commit 4694da31f8
parent b98e863f57
25 changed files with 405 additions and 216 deletions
--- a/src/hotspot/share/gc/g1/g1Analytics.cpp
+++ b/src/hotspot/share/gc/g1/g1Analytics.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -79,7 +79,7 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) :
    _prev_collection_pause_end_ms(0.0),
    _rs_length_diff_seq(new TruncatedSeq(TruncatedSeqLength)),
    _concurrent_refine_rate_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
-    _logged_cards_rate_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
+    _dirtied_cards_rate_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
    _young_card_merge_to_scan_ratio_seq(new TruncatedSeq(TruncatedSeqLength)),
    _mixed_card_merge_to_scan_ratio_seq(new TruncatedSeq(TruncatedSeqLength)),
    _young_cost_per_card_scan_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
@ -107,7 +107,7 @@ G1Analytics::G1Analytics(const G1Predictions* predictor) :
  // Start with inverse of maximum STW cost.
  _concurrent_refine_rate_ms_seq->add(1/cost_per_logged_card_ms_defaults[0]);
  // Some applications have very low rates for logging cards.
-  _logged_cards_rate_ms_seq->add(0.0);
+  _dirtied_cards_rate_ms_seq->add(0.0);
  _young_card_merge_to_scan_ratio_seq->add(young_card_merge_to_scan_ratio_defaults[index]);
  _young_cost_per_card_scan_ms_seq->add(young_only_cost_per_card_scan_ms_defaults[index]);
@ -168,8 +168,8 @@ void G1Analytics::report_concurrent_refine_rate_ms(double cards_per_ms) {
  _concurrent_refine_rate_ms_seq->add(cards_per_ms);
 }
-void G1Analytics::report_logged_cards_rate_ms(double cards_per_ms) {
+void G1Analytics::report_dirtied_cards_rate_ms(double cards_per_ms) {
-  _logged_cards_rate_ms_seq->add(cards_per_ms);
+  _dirtied_cards_rate_ms_seq->add(cards_per_ms);
 }
 void G1Analytics::report_cost_per_card_scan_ms(double cost_per_card_ms, bool for_young_gc) {
@ -236,8 +236,8 @@ double G1Analytics::predict_concurrent_refine_rate_ms() const {
  return predict_zero_bounded(_concurrent_refine_rate_ms_seq);
 }
-double G1Analytics::predict_logged_cards_rate_ms() const {
+double G1Analytics::predict_dirtied_cards_rate_ms() const {
-  return predict_zero_bounded(_logged_cards_rate_ms_seq);
+  return predict_zero_bounded(_dirtied_cards_rate_ms_seq);
 }
 double G1Analytics::predict_young_card_merge_to_scan_ratio() const {
--- a/src/hotspot/share/gc/g1/g1Analytics.hpp
+++ b/src/hotspot/share/gc/g1/g1Analytics.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -47,7 +47,7 @@ class G1Analytics: public CHeapObj<mtGC> {
  TruncatedSeq* _rs_length_diff_seq;
  TruncatedSeq* _concurrent_refine_rate_ms_seq;
-  TruncatedSeq* _logged_cards_rate_ms_seq;
+  TruncatedSeq* _dirtied_cards_rate_ms_seq;
  // The ratio between the number of merged cards and actually scanned cards, for
  // young-only and mixed gcs.
  TruncatedSeq* _young_card_merge_to_scan_ratio_seq;
@ -115,7 +115,7 @@ public:
  void report_concurrent_mark_cleanup_times_ms(double ms);
  void report_alloc_rate_ms(double alloc_rate);
  void report_concurrent_refine_rate_ms(double cards_per_ms);
-  void report_logged_cards_rate_ms(double cards_per_ms);
+  void report_dirtied_cards_rate_ms(double cards_per_ms);
  void report_cost_per_card_scan_ms(double cost_per_remset_card_ms, bool for_young_gc);
  void report_cost_per_card_merge_ms(double cost_per_card_ms, bool for_young_gc);
  void report_card_merge_to_scan_ratio(double cards_per_entry_ratio, bool for_young_gc);
@ -131,7 +131,7 @@ public:
  int num_alloc_rate_ms() const;
  double predict_concurrent_refine_rate_ms() const;
-  double predict_logged_cards_rate_ms() const;
+  double predict_dirtied_cards_rate_ms() const;
  double predict_young_card_merge_to_scan_ratio() const;
  double predict_mixed_card_merge_to_scan_ratio() const;
--- a/src/hotspot/share/gc/g1/g1BarrierSet.cpp
+++ b/src/hotspot/share/gc/g1/g1BarrierSet.cpp
@ -157,5 +157,5 @@ void G1BarrierSet::on_thread_detach(Thread* thread) {
  // Flush any deferred card marks.
  CardTableBarrierSet::on_thread_detach(thread);
  G1ThreadLocalData::satb_mark_queue(thread).flush();
-  G1ThreadLocalData::dirty_card_queue(thread).flush();
+  G1ThreadLocalData::dirty_card_queue(thread).on_thread_detach();
 }
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp
@ -1054,10 +1054,11 @@ void G1CollectedHeap::abort_refinement() {
    _hot_card_cache->reset_hot_cache();
  }
-  // Discard all remembered set updates.
+  // Discard all remembered set updates and reset refinement statistics.
  G1BarrierSet::dirty_card_queue_set().abandon_logs();
  assert(G1BarrierSet::dirty_card_queue_set().num_cards() == 0,
         "DCQS should be empty");
  concurrent_refine()->get_and_reset_refinement_stats();
 }
 void G1CollectedHeap::verify_after_full_collection() {
@ -2684,9 +2685,22 @@ void G1CollectedHeap::gc_prologue(bool full) {
  }
  // Fill TLAB's and such
-  double start = os::elapsedTime();
+  {
-  ensure_parsability(true);
+    Ticks start = Ticks::now();
-  phase_times()->record_prepare_tlab_time_ms((os::elapsedTime() - start) * 1000.0);
+    ensure_parsability(true);
    Tickspan dt = Ticks::now() - start;
    phase_times()->record_prepare_tlab_time_ms(dt.seconds() * MILLIUNITS);
  }
  if (!full) {
    // Flush dirty card queues to qset, so later phases don't need to account
    // for partially filled per-thread queues and such.  Not needed for full
    // collections, which ignore those logs.
    Ticks start = Ticks::now();
    G1BarrierSet::dirty_card_queue_set().concatenate_logs();
    Tickspan dt = Ticks::now() - start;
    phase_times()->record_concatenate_dirty_card_logs_time_ms(dt.seconds() * MILLIUNITS);
  }
 }
 void G1CollectedHeap::gc_epilogue(bool full) {
@ -2759,20 +2773,6 @@ void G1CollectedHeap::do_concurrent_mark() {
  }
 }
 // Number of pending dirty cards: the card count reported by the global
 // dirty card queue set plus the sum of size() over the dirty card queue
 // of every thread visited by Threads::threads_do.
 size_t G1CollectedHeap::pending_card_num() {
  // Thread visitor that adds up the sizes of the per-thread queues.
  class PerThreadCardCounter : public ThreadClosure {
    size_t _sum;
   public:
    PerThreadCardCounter() : _sum(0) {}
    size_t sum() const { return _sum; }
    virtual void do_thread(Thread* thread) {
      _sum += G1ThreadLocalData::dirty_card_queue(thread).size();
    }
  };
  PerThreadCardCounter per_thread;
  Threads::threads_do(&per_thread);
  // The queue set is sampled after the threads, as before.
  const size_t cards_in_qset = G1BarrierSet::dirty_card_queue_set().num_cards();
  return cards_in_qset + per_thread.sum();
 }
 bool G1CollectedHeap::is_potential_eager_reclaim_candidate(HeapRegion* r) const {
  // We don't nominate objects with many remembered set entries, on
  // the assumption that such objects are likely still live.
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.hpp
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.hpp
@ -1465,8 +1465,6 @@ public:
  // Used to print information about locations in the hs_err file.
  virtual bool print_location(outputStream* st, void* addr) const;
  size_t pending_card_num();
 };
 class G1ParEvacuateFollowersClosure : public VoidClosure {
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp
@ -408,20 +408,18 @@ void G1ConcurrentRefine::adjust(double logged_cards_scan_time,
  dcqs.notify_if_necessary();
 }
-G1ConcurrentRefine::RefinementStats G1ConcurrentRefine::total_refinement_stats() const {
+G1ConcurrentRefineStats G1ConcurrentRefine::get_and_reset_refinement_stats() {
-  struct CollectData : public ThreadClosure {
+  struct CollectStats : public ThreadClosure {
-    Tickspan _total_time;
+    G1ConcurrentRefineStats _total_stats;
    size_t _total_cards;
    CollectData() : _total_time(), _total_cards(0) {}
    virtual void do_thread(Thread* t) {
      G1ConcurrentRefineThread* crt = static_cast<G1ConcurrentRefineThread*>(t);
-      _total_time += crt->total_refinement_time();
+      G1ConcurrentRefineStats& stats = *crt->refinement_stats();
-      _total_cards += crt->total_refined_cards();
+      _total_stats += stats;
      stats.reset();
    }
  } collector;
-  // Cast away const so we can call non-modifying closure on threads.
+  threads_do(&collector);
-  const_cast<G1ConcurrentRefine*>(this)->threads_do(&collector);
+  return collector._total_stats;
  return RefinementStats(collector._total_time, collector._total_cards);
 }
 size_t G1ConcurrentRefine::activation_threshold(uint worker_id) const {
@ -445,7 +443,7 @@ void G1ConcurrentRefine::maybe_activate_more_threads(uint worker_id, size_t num_
 }
 bool G1ConcurrentRefine::do_refinement_step(uint worker_id,
-                                            size_t* total_refined_cards) {
+                                            G1ConcurrentRefineStats* stats) {
  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
  size_t curr_cards = dcqs.num_cards();
@ -460,5 +458,5 @@ bool G1ConcurrentRefine::do_refinement_step(uint worker_id,
  // Process the next buffer, if there are enough left.
  return dcqs.refine_completed_buffer_concurrently(worker_id + worker_id_offset(),
                                                   deactivation_threshold(worker_id),
-                                                   total_refined_cards);
+                                                   stats);
 }
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -25,6 +25,7 @@
 #ifndef SHARE_GC_G1_G1CONCURRENTREFINE_HPP
 #define SHARE_GC_G1_G1CONCURRENTREFINE_HPP
 #include "gc/g1/g1ConcurrentRefineStats.hpp"
 #include "memory/allocation.hpp"
 #include "utilities/globalDefinitions.hpp"
 #include "utilities/ticks.hpp"
@ -119,13 +120,9 @@ public:
  // Adjust refinement thresholds based on work done during the pause and the goal time.
  void adjust(double logged_cards_scan_time, size_t processed_logged_cards, double goal_ms);
-  struct RefinementStats {
+  // Return total of concurrent refinement stats for the
-    Tickspan _time;
+  // ConcurrentRefineThreads.  Also reset the stats for the threads.
-    size_t _cards;
+  G1ConcurrentRefineStats get_and_reset_refinement_stats();
    RefinementStats(Tickspan time, size_t cards) : _time(time), _cards(cards) {}
  };
  RefinementStats total_refinement_stats() const;
  // Cards in the dirty card queue set.
  size_t activation_threshold(uint worker_id) const;
@ -133,8 +130,8 @@ public:
  // Perform a single refinement step; called by the refinement
  // threads.  Returns true if there was refinement work available.
-  // Increments *total_refined_cards.
+  // Updates stats.
-  bool do_refinement_step(uint worker_id, size_t* total_refined_cards);
+  bool do_refinement_step(uint worker_id, G1ConcurrentRefineStats* stats);
  // Iterate over all concurrent refinement threads applying the given closure.
  void threads_do(ThreadClosure *tc);
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.cpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.cpp
@ -0,0 +1,60 @@
 /*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */
 #include "precompiled.hpp"
 #include "gc/g1/g1ConcurrentRefineStats.hpp"
 // Default constructor: value-initializes the refinement time and starts
 // every card counter at zero.
 G1ConcurrentRefineStats::G1ConcurrentRefineStats()
  : _refinement_time(),
    _refined_cards(0),
    _precleaned_cards(0),
    _dirtied_cards(0)
 { }
 // Adds every statistic of other to the matching statistic of this object
 // and returns a reference to this object.
 G1ConcurrentRefineStats&
 G1ConcurrentRefineStats::operator+=(const G1ConcurrentRefineStats& other) {
  // Card counters.
  _refined_cards = _refined_cards + other._refined_cards;
  _precleaned_cards = _precleaned_cards + other._precleaned_cards;
  _dirtied_cards = _dirtied_cards + other._dirtied_cards;
  // Time spent refining.
  _refinement_time += other._refinement_time;
  return *this;
 }
 // Subtraction that never goes below a value-initialized T: yields T()
 // when x is less than y, and x - y otherwise.
 template<typename T>
 static T clipped_sub(T x, T y) {
  if (x < y) {
    return T();
  }
  return x - y;
 }
 // Subtracts every statistic of other from the matching statistic of this
 // object, using clipped_sub for each field, and returns a reference to
 // this object.
 G1ConcurrentRefineStats&
 G1ConcurrentRefineStats::operator-=(const G1ConcurrentRefineStats& other) {
  // Card counters.
  _dirtied_cards = clipped_sub(_dirtied_cards, other._dirtied_cards);
  _precleaned_cards = clipped_sub(_precleaned_cards, other._precleaned_cards);
  _refined_cards = clipped_sub(_refined_cards, other._refined_cards);
  // Time spent refining.
  _refinement_time = clipped_sub(_refinement_time, other._refinement_time);
  return *this;
 }
 void G1ConcurrentRefineStats::reset() {
  *this = G1ConcurrentRefineStats();
 }
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.hpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineStats.hpp
@ -0,0 +1,78 @@
 /*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */
 #ifndef SHARE_GC_G1_G1CONCURRENTREFINESTATS_HPP
 #define SHARE_GC_G1_G1CONCURRENTREFINESTATS_HPP
 #include "memory/allocation.hpp"
 #include "utilities/globalDefinitions.hpp"
 #include "utilities/ticks.hpp"
 // Collection of statistics for concurrent refinement processing.
 // Used for collecting per-thread statistics and for summaries over a
 // collection of threads.
 class G1ConcurrentRefineStats : public CHeapObj<mtGC> {
  Tickspan _refinement_time;
  size_t _refined_cards;
  size_t _precleaned_cards;
  size_t _dirtied_cards;
 public:
  G1ConcurrentRefineStats();
  // --- Read access ---
  // Accumulated time spent on concurrent refinement.
  Tickspan refinement_time() const {
    return _refinement_time;
  }
  // Count of cards that have been refined.
  size_t refined_cards() const {
    return _refined_cards;
  }
  // Count of cards whose refinement was skipped because another thread
  // had already refined them.
  size_t precleaned_cards() const {
    return _precleaned_cards;
  }
  // Count of cards that were marked dirty and need refinement.
  size_t dirtied_cards() const {
    return _dirtied_cards;
  }
  // --- Incremental updates; each adds its argument to one statistic ---
  void inc_refinement_time(Tickspan t) {
    _refinement_time += t;
  }
  void inc_refined_cards(size_t cards) {
    _refined_cards += cards;
  }
  void inc_precleaned_cards(size_t cards) {
    _precleaned_cards += cards;
  }
  void inc_dirtied_cards(size_t cards) {
    _dirtied_cards += cards;
  }
  // --- Whole-object arithmetic ---
  // Compound forms update this object and return a reference to it.
  G1ConcurrentRefineStats& operator+=(const G1ConcurrentRefineStats& other);
  G1ConcurrentRefineStats& operator-=(const G1ConcurrentRefineStats& other);
  // Binary forms work on a by-value copy of the left operand and return
  // that copy after applying the compound operator.
  friend G1ConcurrentRefineStats operator+(G1ConcurrentRefineStats x,
                                           const G1ConcurrentRefineStats& y) {
    x += y;
    return x;
  }
  friend G1ConcurrentRefineStats operator-(G1ConcurrentRefineStats x,
                                           const G1ConcurrentRefineStats& y) {
    x -= y;
    return x;
  }
  // Return all statistics to their default-constructed values.
  void reset();
 };
 #endif // SHARE_GC_G1_G1CONCURRENTREFINESTATS_HPP
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.cpp
@ -25,6 +25,7 @@
 #include "precompiled.hpp"
 #include "gc/g1/g1BarrierSet.hpp"
 #include "gc/g1/g1ConcurrentRefine.hpp"
 #include "gc/g1/g1ConcurrentRefineStats.hpp"
 #include "gc/g1/g1ConcurrentRefineThread.hpp"
 #include "gc/g1/g1DirtyCardQueue.hpp"
 #include "gc/shared/suspendibleThreadSet.hpp"
@ -36,8 +37,7 @@ G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint
  ConcurrentGCThread(),
  _vtime_start(0.0),
  _vtime_accum(0.0),
-  _total_refinement_time(),
+  _refinement_stats(new G1ConcurrentRefineStats()),
  _total_refined_cards(0),
  _worker_id(worker_id),
  _notifier(new Semaphore(0)),
  _should_notify(true),
@ -48,6 +48,11 @@ G1ConcurrentRefineThread::G1ConcurrentRefineThread(G1ConcurrentRefine* cr, uint
  create_and_start();
 }
 // Frees the two objects the constructor allocates with new: the
 // per-thread refinement statistics and the notifier semaphore.
 G1ConcurrentRefineThread::~G1ConcurrentRefineThread() {
  delete _refinement_stats;
  delete _notifier;
 }
 void G1ConcurrentRefineThread::wait_for_completed_buffers() {
  assert(this == Thread::current(), "precondition");
  while (Atomic::load_acquire(&_should_notify)) {
@ -103,32 +108,35 @@ void G1ConcurrentRefineThread::run_service() {
                          _worker_id, _cr->activation_threshold(_worker_id),
                          G1BarrierSet::dirty_card_queue_set().num_cards());
-    size_t start_total_refined_cards = _total_refined_cards; // For logging.
+    // For logging.
    G1ConcurrentRefineStats start_stats = *_refinement_stats;
    G1ConcurrentRefineStats total_stats; // Accumulate over activation.
    {
      SuspendibleThreadSetJoiner sts_join;
      while (!should_terminate()) {
        if (sts_join.should_yield()) {
          // Accumulate changed stats before possible GC that resets stats.
          total_stats += *_refinement_stats - start_stats;
          sts_join.yield();
          // Reinitialize baseline stats after safepoint.
          start_stats = *_refinement_stats;
          continue;             // Re-check for termination after yield delay.
        }
-        Ticks start_time = Ticks::now();
+        bool more_work = _cr->do_refinement_step(_worker_id, _refinement_stats);
        bool more_work = _cr->do_refinement_step(_worker_id, &_total_refined_cards);
        _total_refinement_time += (Ticks::now() - start_time);
        if (maybe_deactivate(more_work)) break;
      }
    }
    total_stats += *_refinement_stats - start_stats;
    log_debug(gc, refine)("Deactivated worker %d, off threshold: " SIZE_FORMAT
-                          ", current: " SIZE_FORMAT ", refined cards: "
+                          ", current: " SIZE_FORMAT
-                          SIZE_FORMAT ", total refined cards: " SIZE_FORMAT,
+                          ", refined cards: " SIZE_FORMAT,
                          _worker_id, _cr->deactivation_threshold(_worker_id),
                          G1BarrierSet::dirty_card_queue_set().num_cards(),
-                          _total_refined_cards - start_total_refined_cards,
+                          total_stats.refined_cards());
                          _total_refined_cards);
    if (os::supports_vtime()) {
      _vtime_accum = (os::elapsedVTime() - _vtime_start);
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefineThread.hpp
@ -30,6 +30,7 @@
 // Forward Decl.
 class G1ConcurrentRefine;
 class G1ConcurrentRefineStats;
 // One or more G1 Concurrent Refinement Threads may be active if concurrent
 // refinement is in progress.
@ -40,8 +41,7 @@ class G1ConcurrentRefineThread: public ConcurrentGCThread {
  double _vtime_start;  // Initial virtual time.
  double _vtime_accum;  // Accumulated virtual time.
-  Tickspan _total_refinement_time;
+  G1ConcurrentRefineStats* _refinement_stats;
  size_t _total_refined_cards;
  uint _worker_id;
@ -71,12 +71,14 @@ class G1ConcurrentRefineThread: public ConcurrentGCThread {
 public:
  G1ConcurrentRefineThread(G1ConcurrentRefine* cg1r, uint worker_id);
  virtual ~G1ConcurrentRefineThread();
  // Activate this thread.
  void activate();
-  Tickspan total_refinement_time() const { return _total_refinement_time; }
+  G1ConcurrentRefineStats* refinement_stats() const {
-  size_t total_refined_cards() const { return _total_refined_cards; }
+    return _refinement_stats;
  }
  // Total virtual time so far.
  double vtime_accum() { return _vtime_accum; }
--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
@ -26,6 +26,7 @@
 #include "gc/g1/g1BufferNodeList.hpp"
 #include "gc/g1/g1CardTableEntryClosure.hpp"
 #include "gc/g1/g1CollectedHeap.inline.hpp"
 #include "gc/g1/g1ConcurrentRefineStats.hpp"
 #include "gc/g1/g1ConcurrentRefineThread.hpp"
 #include "gc/g1/g1DirtyCardQueue.hpp"
 #include "gc/g1/g1FreeIdSet.hpp"
@ -36,6 +37,8 @@
 #include "gc/shared/suspendibleThreadSet.hpp"
 #include "memory/iterator.hpp"
 #include "runtime/atomic.hpp"
 #include "runtime/mutex.hpp"
 #include "runtime/mutexLocker.hpp"
 #include "runtime/os.hpp"
 #include "runtime/safepoint.hpp"
 #include "runtime/thread.inline.hpp"
@ -43,22 +46,37 @@
 #include "utilities/globalCounter.inline.hpp"
 #include "utilities/macros.hpp"
 #include "utilities/quickSort.hpp"
 #include "utilities/ticks.hpp"
 G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) :
  // Dirty card queues are always active, so we create them with their
  // active field set to true.
-  PtrQueue(qset, true /* active */)
+  PtrQueue(qset, true /* active */),
  _refinement_stats(new G1ConcurrentRefineStats())
 { }
 // Flushes the queue, then frees the refinement statistics allocated by
 // the constructor.
 G1DirtyCardQueue::~G1DirtyCardQueue() {
  // flush() updates _refinement_stats, so it has to run before the delete.
  flush();
  delete _refinement_stats;
 }
 void G1DirtyCardQueue::flush() {
  _refinement_stats->inc_dirtied_cards(size());
  flush_impl();
 }
 // Notification that the thread owning this queue is detaching.  Flushes
 // the queue and hands the queue's refinement stats to the queue set's
 // record of detached-thread stats.
 void G1DirtyCardQueue::on_thread_detach() {
  // Must be invoked on the owning thread itself, for that thread's queue.
  assert(this == &G1ThreadLocalData::dirty_card_queue(Thread::current()), "precondition");
  // Flush first so the stats passed on below include this final flush.
  flush();
  dirty_card_qset()->record_detached_refinement_stats(_refinement_stats);
 }
 void G1DirtyCardQueue::handle_completed_buffer() {
  assert(!is_empty(), "precondition");
  _refinement_stats->inc_dirtied_cards(size());
  BufferNode* node = BufferNode::make_node_from_buffer(_buf, index());
  allocate_buffer();
-  dirty_card_qset()->handle_completed_buffer(node);
+  dirty_card_qset()->handle_completed_buffer(node, _refinement_stats);
 }
 // Assumed to be zero by concurrent threads.
@ -74,15 +92,13 @@ G1DirtyCardQueueSet::G1DirtyCardQueueSet(BufferNode::Allocator* allocator) :
  _process_cards_threshold(ProcessCardsThresholdNever),
  _max_cards(MaxCardsUnlimited),
  _padded_max_cards(MaxCardsUnlimited),
-  _mutator_refined_cards_counters(NEW_C_HEAP_ARRAY(size_t, num_par_ids(), mtGC))
+  _detached_refinement_stats()
 {
  ::memset(_mutator_refined_cards_counters, 0, num_par_ids() * sizeof(size_t));
  _all_active = true;
 }
 G1DirtyCardQueueSet::~G1DirtyCardQueueSet() {
  abandon_completed_buffers();
  FREE_C_HEAP_ARRAY(size_t, _mutator_refined_cards_counters);
 }
 // Determines how many mutator threads can process the buffers in parallel.
@ -90,14 +106,6 @@ uint G1DirtyCardQueueSet::num_par_ids() {
  return (uint)os::initial_active_processor_count();
 }
 // Sum of the entries of _mutator_refined_cards_counters, one entry for
 // each index below num_par_ids().
 size_t G1DirtyCardQueueSet::total_mutator_refined_cards() const {
  size_t total = 0;
  uint id = 0;
  while (id < num_par_ids()) {
    total += _mutator_refined_cards_counters[id];
    ++id;
  }
  return total;
 }
 // Forwards to handle_zero_index() on the dirty card queue that belongs to
 // thread t.
 void G1DirtyCardQueueSet::handle_zero_index_for_thread(Thread* t) {
  G1DirtyCardQueue& thread_queue = G1ThreadLocalData::dirty_card_queue(t);
  thread_queue.handle_zero_index();
 }
@ -422,7 +430,7 @@ class G1RefineBufferedCards : public StackObj {
  CardTable::CardValue** const _node_buffer;
  const size_t _node_buffer_size;
  const uint _worker_id;
-  size_t* _total_refined_cards;
+  G1ConcurrentRefineStats* _stats;
  G1RemSet* const _g1rs;
  static inline int compare_card(const CardTable::CardValue* p1,
@ -472,7 +480,8 @@ class G1RefineBufferedCards : public StackObj {
    const size_t first_clean = dst - _node_buffer;
    assert(first_clean >= start && first_clean <= _node_buffer_size, "invariant");
    // Discarded cards are considered as refined.
-    *_total_refined_cards += first_clean - start;
+    _stats->inc_refined_cards(first_clean - start);
    _stats->inc_precleaned_cards(first_clean - start);
    return first_clean;
  }
@ -488,7 +497,7 @@ class G1RefineBufferedCards : public StackObj {
      _g1rs->refine_card_concurrently(_node_buffer[i], _worker_id);
    }
    _node->set_index(i);
-    *_total_refined_cards += i - start_index;
+    _stats->inc_refined_cards(i - start_index);
    return result;
  }
@ -502,12 +511,12 @@ public:
  G1RefineBufferedCards(BufferNode* node,
                        size_t node_buffer_size,
                        uint worker_id,
-                        size_t* total_refined_cards) :
+                        G1ConcurrentRefineStats* stats) :
    _node(node),
    _node_buffer(reinterpret_cast<CardTable::CardValue**>(BufferNode::make_buffer_from_node(node))),
    _node_buffer_size(node_buffer_size),
    _worker_id(worker_id),
-    _total_refined_cards(total_refined_cards),
+    _stats(stats),
    _g1rs(G1CollectedHeap::heap()->rem_set()) {}
  bool refine() {
@ -532,12 +541,15 @@ public:
 bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node,
                                        uint worker_id,
-                                        size_t* total_refined_cards) {
+                                        G1ConcurrentRefineStats* stats) {
  Ticks start_time = Ticks::now();
  G1RefineBufferedCards buffered_cards(node,
                                       buffer_size(),
                                       worker_id,
-                                       total_refined_cards);
+                                       stats);
-  return buffered_cards.refine();
+  bool result = buffered_cards.refine();
  stats->inc_refinement_time(Ticks::now() - start_time);
  return result;
 }
 void G1DirtyCardQueueSet::handle_refined_buffer(BufferNode* node,
@ -555,7 +567,8 @@ void G1DirtyCardQueueSet::handle_refined_buffer(BufferNode* node,
  }
 }
-void G1DirtyCardQueueSet::handle_completed_buffer(BufferNode* new_node) {
+void G1DirtyCardQueueSet::handle_completed_buffer(BufferNode* new_node,
                                                  G1ConcurrentRefineStats* stats) {
  enqueue_completed_buffer(new_node);
  // No need for mutator refinement if number of cards is below limit.
@ -574,9 +587,7 @@ void G1DirtyCardQueueSet::handle_completed_buffer(BufferNode* new_node) {
  // Refine cards in buffer.
  uint worker_id = _free_ids.claim_par_id(); // temporarily claim an id
-  uint counter_index = worker_id - par_ids_start();
+  bool fully_processed = refine_buffer(node, worker_id, stats);
  size_t* counter = &_mutator_refined_cards_counters[counter_index];
  bool fully_processed = refine_buffer(node, worker_id, counter);
  _free_ids.release_par_id(worker_id); // release the id
  // Deal with buffer after releasing id, to let another thread use id.
@ -585,14 +596,14 @@ void G1DirtyCardQueueSet::handle_completed_buffer(BufferNode* new_node) {
 bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id,
                                                               size_t stop_at,
-                                                               size_t* total_refined_cards) {
+                                                               G1ConcurrentRefineStats* stats) {
  // Not enough cards to trigger processing.
  if (Atomic::load(&_num_cards) <= stop_at) return false;
  BufferNode* node = get_completed_buffer();
  if (node == NULL) return false; // Didn't get a buffer to process.
-  bool fully_processed = refine_buffer(node, worker_id, total_refined_cards);
+  bool fully_processed = refine_buffer(node, worker_id, stats);
  handle_refined_buffer(node, fully_processed);
  return true;
 }
@ -600,12 +611,15 @@ bool G1DirtyCardQueueSet::refine_completed_buffer_concurrently(uint worker_id,
 void G1DirtyCardQueueSet::abandon_logs() {
  assert_at_safepoint();
  abandon_completed_buffers();
  _detached_refinement_stats.reset();
  // Since abandon is done only at safepoints, we can safely manipulate
  // these queues.
  struct AbandonThreadLogClosure : public ThreadClosure {
    virtual void do_thread(Thread* t) {
-      G1ThreadLocalData::dirty_card_queue(t).reset();
+      G1DirtyCardQueue& dcq = G1ThreadLocalData::dirty_card_queue(t);
      dcq.reset();
      dcq.refinement_stats()->reset();
    }
  } closure;
  Threads::threads_do(&closure);
@ -637,6 +651,40 @@ void G1DirtyCardQueueSet::concatenate_logs() {
  set_max_cards(old_limit);
 }
 // Returns the sum of the refinement stats held by every thread's dirty
 // card queue plus the stats recorded for detached threads.  Each source
 // is reset as it is collected.  Asserts that it runs at a safepoint.
 G1ConcurrentRefineStats G1DirtyCardQueueSet::get_and_reset_refinement_stats() {
  assert_at_safepoint();
  // Running at a safepoint rules out races with the recording of detached
  // refinement stats.  Specifically, a thread cannot detach after it has
  // been visited below but before the detached stats are read, so its
  // stats cannot be counted twice.
  // Step 1: attached threads.  Fold each thread's stats into the running
  // sum, then clear them.
  struct SumAndClearStats : public ThreadClosure {
    G1ConcurrentRefineStats _sum;
    virtual void do_thread(Thread* thread) {
      G1ConcurrentRefineStats* thread_stats =
        G1ThreadLocalData::dirty_card_queue(thread).refinement_stats();
      _sum += *thread_stats;
      thread_stats->reset();
    }
  } attached;
  Threads::threads_do(&attached);
  // Step 2: detached threads.  Their accumulated stats are read and
  // cleared while holding G1DetachedRefinementStats_lock.
  MutexLocker ml(G1DetachedRefinementStats_lock, Mutex::_no_safepoint_check_flag);
  attached._sum += _detached_refinement_stats;
  _detached_refinement_stats.reset();
  return attached._sum;
 }
 // Adds *stats to the accumulated stats for detached threads and then
 // resets *stats.  Both steps happen while holding
 // G1DetachedRefinementStats_lock, taken without a safepoint check.
 void G1DirtyCardQueueSet::record_detached_refinement_stats(G1ConcurrentRefineStats* stats) {
  MutexLocker ml(G1DetachedRefinementStats_lock, Mutex::_no_safepoint_check_flag);
  _detached_refinement_stats += *stats;
  // Clear the source so the same values are not added again later.
  stats->reset();
 }
 // Returns the current value of _max_cards.
 size_t G1DirtyCardQueueSet::max_cards() const {
  return _max_cards;
 }
--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.hpp
@ -27,6 +27,7 @@
 #include "gc/g1/g1BufferNodeList.hpp"
 #include "gc/g1/g1FreeIdSet.hpp"
 #include "gc/g1/g1ConcurrentRefineStats.hpp"
 #include "gc/shared/ptrQueue.hpp"
 #include "memory/allocation.hpp"
 #include "memory/padded.hpp"
@ -38,6 +39,8 @@ class Thread;
 // A ptrQueue whose elements are "oops", pointers to object heads.
 class G1DirtyCardQueue: public PtrQueue {
  G1ConcurrentRefineStats* _refinement_stats;
 protected:
  virtual void handle_completed_buffer();
@ -49,10 +52,18 @@ public:
  ~G1DirtyCardQueue();
  // Process queue entries and release resources.
-  void flush() { flush_impl(); }
+  void flush();
  inline G1DirtyCardQueueSet* dirty_card_qset() const;
  G1ConcurrentRefineStats* refinement_stats() const {
    return _refinement_stats;
  }
  // To be called by the barrier set's on_thread_detach, to notify this
  // object of the corresponding state change of its owning thread.
  void on_thread_detach();
  // Compiler support.
  static ByteSize byte_offset_of_index() {
    return PtrQueue::byte_offset_of_index<G1DirtyCardQueue>();
@ -215,9 +226,7 @@ class G1DirtyCardQueueSet: public PtrQueueSet {
  volatile size_t _padded_max_cards;
  static const size_t MaxCardsUnlimited = SIZE_MAX;
-  // Array of cumulative dirty cards refined by mutator threads.
+  G1ConcurrentRefineStats _detached_refinement_stats;
  // Array has an entry per id in _free_ids.
  size_t* _mutator_refined_cards_counters;
  // Verify _num_cards == sum of cards in the completed queue.
  void verify_num_cards() const NOT_DEBUG_RETURN;
@ -241,9 +250,10 @@ class G1DirtyCardQueueSet: public PtrQueueSet {
  // is a pending yield request.  The node's index is updated to exclude
  // the processed elements, e.g. up to the element before processing
  // stopped, or one past the last element if the entire buffer was
-  // processed. Increments *total_refined_cards by the number of cards
+  // processed. Updates stats.
-  // processed and removed from the buffer.
+  bool refine_buffer(BufferNode* node,
-  bool refine_buffer(BufferNode* node, uint worker_id, size_t* total_refined_cards);
+                     uint worker_id,
                     G1ConcurrentRefineStats* stats);
  // Deal with buffer after a call to refine_buffer.  If fully processed,
  // deallocate the buffer.  Otherwise, record it as paused.
@ -296,26 +306,23 @@ public:
  // Enqueue the buffer, and optionally perform refinement by the mutator.
  // Mutator refinement is only done by Java threads, and only if there
  // are more than max_cards (possibly padded) cards in the completed
-  // buffers.
+  // buffers.  Updates stats.
  //
  // Mutator refinement, if performed, stops processing a buffer if
  // SuspendibleThreadSet::should_yield(), recording the incompletely
  // processed buffer for later processing of the remainder.
-  void handle_completed_buffer(BufferNode* node);
+  void handle_completed_buffer(BufferNode* node, G1ConcurrentRefineStats* stats);
  // If there are more than stop_at cards in the completed buffers, pop
  // a buffer, refine its contents, and return true.  Otherwise return
-  // false.
+  // false.  Updates stats.
  //
  // Stops processing a buffer if SuspendibleThreadSet::should_yield(),
  // recording the incompletely processed buffer for later processing of
  // the remainder.
  //
  // Increments *total_refined_cards by the number of cards processed and
  // removed from the buffer.
  bool refine_completed_buffer_concurrently(uint worker_id,
                                            size_t stop_at,
-                                            size_t* total_refined_cards);
+                                            G1ConcurrentRefineStats* stats);
  // If a full collection is happening, reset partial logs, and release
  // completed ones: the full collection will make them all irrelevant.
@ -324,6 +331,14 @@ public:
  // If any threads have partial logs, add them to the global list of logs.
  void concatenate_logs();
  // Return the total of mutator refinement stats for all threads.
  // Also resets the stats for the threads.
  // precondition: at safepoint.
  G1ConcurrentRefineStats get_and_reset_refinement_stats();
  // Accumulate refinement stats from threads that are detaching.
  void record_detached_refinement_stats(G1ConcurrentRefineStats* stats);
  // Threshold for mutator threads to also do refinement when there
  // are concurrent refinement threads.
  size_t max_cards() const;
@ -336,9 +351,6 @@ public:
  // Discard artificial increase of mutator refinement threshold.
  void discard_max_cards_padding();
  // Total dirty cards refined by mutator threads.
  size_t total_mutator_refined_cards() const;
 };
 inline G1DirtyCardQueueSet* G1DirtyCardQueue::dirty_card_qset() const {
--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -157,6 +157,7 @@ void G1GCPhaseTimes::reset() {
  _cur_string_deduplication_time_ms = 0.0;
  _cur_prepare_tlab_time_ms = 0.0;
  _cur_resize_tlab_time_ms = 0.0;
  _cur_concatenate_dirty_card_logs_time_ms = 0.0;
  _cur_derived_pointer_table_update_time_ms = 0.0;
  _cur_clear_ct_time_ms = 0.0;
  _cur_expand_heap_time_ms = 0.0;
@ -377,6 +378,8 @@ void G1GCPhaseTimes::trace_count(const char* name, size_t value) const {
 double G1GCPhaseTimes::print_pre_evacuate_collection_set() const {
  const double sum_ms = _root_region_scan_wait_time_ms +
                        _cur_prepare_tlab_time_ms +
                        _cur_concatenate_dirty_card_logs_time_ms +
                        _recorded_young_cset_choice_time_ms +
                        _recorded_non_young_cset_choice_time_ms +
                        _cur_region_register_time +
@ -389,6 +392,7 @@ double G1GCPhaseTimes::print_pre_evacuate_collection_set() const {
    debug_time("Root Region Scan Waiting", _root_region_scan_wait_time_ms);
  }
  debug_time("Prepare TLABs", _cur_prepare_tlab_time_ms);
  debug_time("Concatenate Dirty Card Logs", _cur_concatenate_dirty_card_logs_time_ms);
  debug_time("Choose Collection Set", (_recorded_young_cset_choice_time_ms + _recorded_non_young_cset_choice_time_ms));
  debug_time("Region Register", _cur_region_register_time);
  if (G1EagerReclaimHumongousObjects) {
--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -143,6 +143,8 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
  double _cur_prepare_tlab_time_ms;
  double _cur_resize_tlab_time_ms;
  double _cur_concatenate_dirty_card_logs_time_ms;
  double _cur_derived_pointer_table_update_time_ms;
  double _cur_clear_ct_time_ms;
@ -252,6 +254,10 @@ class G1GCPhaseTimes : public CHeapObj<mtGC> {
    _cur_resize_tlab_time_ms = ms;
  }
  // Stores ms as the current "Concatenate Dirty Card Logs" time.  The value
  // is overwritten, not accumulated.
  void record_concatenate_dirty_card_logs_time_ms(double ms) {
    _cur_concatenate_dirty_card_logs_time_ms = ms;
  }
  // Stores ms as the current derived pointer table update time.  The value
  // is overwritten, not accumulated.
  void record_derived_pointer_table_update_time(double ms) {
    _cur_derived_pointer_table_update_time_ms = ms;
  }
--- a/src/hotspot/share/gc/g1/g1Policy.cpp
+++ b/src/hotspot/share/gc/g1/g1Policy.cpp
@ -31,6 +31,7 @@
 #include "gc/g1/g1ConcurrentMark.hpp"
 #include "gc/g1/g1ConcurrentMarkThread.inline.hpp"
 #include "gc/g1/g1ConcurrentRefine.hpp"
 #include "gc/g1/g1ConcurrentRefineStats.hpp"
 #include "gc/g1/g1CollectionSetChooser.hpp"
 #include "gc/g1/g1HeterogeneousHeapPolicy.hpp"
 #include "gc/g1/g1HotCardCache.hpp"
@ -43,7 +44,7 @@
 #include "gc/g1/heapRegionRemSet.hpp"
 #include "gc/shared/concurrentGCBreakpoints.hpp"
 #include "gc/shared/gcPolicyCounters.hpp"
-#include "logging/logStream.hpp"
+#include "logging/log.hpp"
 #include "runtime/arguments.hpp"
 #include "runtime/java.hpp"
 #include "runtime/mutexLocker.hpp"
@ -72,10 +73,6 @@ G1Policy::G1Policy(STWGCTimer* gc_timer) :
  _rs_length(0),
  _rs_length_prediction(0),
  _pending_cards_at_gc_start(0),
  _pending_cards_at_prev_gc_end(0),
  _total_mutator_refined_cards(0),
  _total_concurrent_refined_cards(0),
  _total_concurrent_refinement_time(),
  _bytes_allocated_in_old_since_last_gc(0),
  _initial_mark_to_mixed(),
  _collection_set(NULL),
@ -432,7 +429,7 @@ void G1Policy::record_full_collection_start() {
  collector_state()->set_in_young_only_phase(false);
  collector_state()->set_in_full_gc(true);
  _collection_set->clear_candidates();
-  record_concurrent_refinement_data(true /* is_full_collection */);
+  _pending_cards_at_gc_start = 0;
 }
 void G1Policy::record_full_collection_end() {
@ -462,64 +459,62 @@ void G1Policy::record_full_collection_end() {
  _survivor_surv_rate_group->reset();
  update_young_list_max_and_target_length();
  update_rs_length_prediction();
  _pending_cards_at_prev_gc_end = _g1h->pending_card_num();
  _bytes_allocated_in_old_since_last_gc = 0;
  record_pause(FullGC, _full_collection_start_sec, end_sec);
 }
-void G1Policy::record_concurrent_refinement_data(bool is_full_collection) {
+static void log_refinement_stats(const char* kind, const G1ConcurrentRefineStats& stats) {
-  _pending_cards_at_gc_start = _g1h->pending_card_num();
+  log_debug(gc, refine, stats)
           ("%s refinement: %.2fms, refined: " SIZE_FORMAT
            ", precleaned: " SIZE_FORMAT ", dirtied: " SIZE_FORMAT,
            kind,
            stats.refinement_time().seconds() * MILLIUNITS,
            stats.refined_cards(),
            stats.precleaned_cards(),
            stats.dirtied_cards());
 }
-  // Record info about concurrent refinement thread processing.
+void G1Policy::record_concurrent_refinement_stats() {
  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
  _pending_cards_at_gc_start = dcqs.num_cards();
  // Collect per-thread stats, mostly from mutator activity.
  G1ConcurrentRefineStats mut_stats = dcqs.get_and_reset_refinement_stats();
  // Collect specialized concurrent refinement thread stats.
  G1ConcurrentRefine* cr = _g1h->concurrent_refine();
-  G1ConcurrentRefine::RefinementStats cr_stats = cr->total_refinement_stats();
+  G1ConcurrentRefineStats cr_stats = cr->get_and_reset_refinement_stats();
-  Tickspan cr_time = cr_stats._time - _total_concurrent_refinement_time;
+  G1ConcurrentRefineStats total_stats = mut_stats + cr_stats;
  _total_concurrent_refinement_time = cr_stats._time;
-  size_t cr_cards = cr_stats._cards - _total_concurrent_refined_cards;
+  log_refinement_stats("Mutator", mut_stats);
-  _total_concurrent_refined_cards = cr_stats._cards;
+  log_refinement_stats("Concurrent", cr_stats);
  log_refinement_stats("Total", total_stats);
-  // Don't update rate if full collection.  We could be in an implicit full
+  // Record the rate at which cards were refined.
-  // collection after a non-full collection failure, in which case there
+  // Don't update the rate if the current sample is empty or time is zero.
-  // wasn't any mutator/cr-thread activity since last recording.  And if
+  Tickspan refinement_time = total_stats.refinement_time();
-  // we're in an explicit full collection, the time since the last GC can
+  size_t refined_cards = total_stats.refined_cards();
-  // be arbitrarily short, so not a very good sample.  Similarly, don't
+  if ((refined_cards > 0) && (refinement_time > Tickspan())) {
-  // update the rate if the current sample is empty or time is zero.
+    double rate = refined_cards / (refinement_time.seconds() * MILLIUNITS);
  if (!is_full_collection && (cr_cards > 0) && (cr_time > Tickspan())) {
    double rate = cr_cards / (cr_time.seconds() * MILLIUNITS);
    _analytics->report_concurrent_refine_rate_ms(rate);
    log_debug(gc, refine, stats)("Concurrent refinement rate: %.2f cards/ms", rate);
  }
  // Record info about mutator thread processing.
  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
  size_t mut_total_cards = dcqs.total_mutator_refined_cards();
  size_t mut_cards = mut_total_cards - _total_mutator_refined_cards;
  _total_mutator_refined_cards = mut_total_cards;
  // Record mutator's card logging rate.
-  // Don't update if full collection; see above.
+  double mut_start_time = _analytics->prev_collection_pause_end_ms();
-  if (!is_full_collection) {
+  double mut_end_time = phase_times()->cur_collection_start_sec() * MILLIUNITS;
-    size_t total_cards = _pending_cards_at_gc_start + cr_cards + mut_cards;
+  double mut_time = mut_end_time - mut_start_time;
-    assert(_pending_cards_at_prev_gc_end <= total_cards,
+  // Unlike above for conc-refine rate, here we should not require a
-           "untracked cards: last pending: " SIZE_FORMAT
+  // non-empty sample, since an application could go some time with only
-           ", pending: " SIZE_FORMAT ", conc refine: " SIZE_FORMAT
+  // young-gen or filtered out writes.  But we'll ignore unusually short
-           ", mut refine:" SIZE_FORMAT,
+  // sample periods, as they may just pollute the predictions.
-           _pending_cards_at_prev_gc_end, _pending_cards_at_gc_start,
+  if (mut_time > 1.0) {   // Require > 1ms sample time.
-           cr_cards, mut_cards);
+    double dirtied_rate = total_stats.dirtied_cards() / mut_time;
-    size_t logged_cards = total_cards - _pending_cards_at_prev_gc_end;
+    _analytics->report_dirtied_cards_rate_ms(dirtied_rate);
-    double logging_start_time = _analytics->prev_collection_pause_end_ms();
+    log_debug(gc, refine, stats)("Generate dirty cards rate: %.2f cards/ms", dirtied_rate);
    double logging_end_time = Ticks::now().seconds() * MILLIUNITS;
    double logging_time = logging_end_time - logging_start_time;
    // Unlike above for conc-refine rate, here we should not require a
    // non-empty sample, since an application could go some time with only
    // young-gen or filtered out writes.  But we'll ignore unusually short
    // sample periods, as they may just pollute the predictions.
    if (logging_time > 1.0) {   // Require > 1ms sample time.
      _analytics->report_logged_cards_rate_ms(logged_cards / logging_time);
    }
  }
 }
@ -536,7 +531,7 @@ void G1Policy::record_collection_pause_start(double start_time_sec) {
  phase_times()->record_cur_collection_start_sec(start_time_sec);
-  record_concurrent_refinement_data(false /* is_full_collection */);
+  record_concurrent_refinement_stats();
  _collection_set->reset_bytes_used_before();
@ -830,7 +825,6 @@ void G1Policy::record_collection_pause_end(double pause_time_ms) {
    scan_logged_cards_time_goal_ms -= merge_hcc_time_ms;
  }
  _pending_cards_at_prev_gc_end = _g1h->pending_card_num();
  double const logged_cards_time = logged_cards_processing_time();
  log_debug(gc, ergo, refine)("Concurrent refinement times: Logged Cards Scan time goal: %1.2fms Logged Cards Scan time: %1.2fms HCC time: %1.2fms",
--- a/src/hotspot/share/gc/g1/g1Policy.hpp
+++ b/src/hotspot/share/gc/g1/g1Policy.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -101,10 +101,6 @@ class G1Policy: public CHeapObj<mtGC> {
  size_t _rs_length_prediction;
  size_t _pending_cards_at_gc_start;
  size_t _pending_cards_at_prev_gc_end;
  size_t _total_mutator_refined_cards;
  size_t _total_concurrent_refined_cards;
  Tickspan _total_concurrent_refinement_time;
  // The amount of allocated bytes in old gen during the last mutator and the following
  // young GC phase.
@ -287,7 +283,8 @@ private:
  // Indicate that we aborted marking before doing any mixed GCs.
  void abort_time_to_mixed_tracking();
-  void record_concurrent_refinement_data(bool is_full_collection);
+  // Record and log stats before not-full collection.
  void record_concurrent_refinement_stats();
 public:
--- a/src/hotspot/share/gc/g1/g1RemSet.cpp
+++ b/src/hotspot/share/gc/g1/g1RemSet.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -910,9 +910,6 @@ void G1RemSet::prepare_region_for_scan(HeapRegion* region) {
 }
 // Concatenates the logs of the barrier set's dirty card queue set, then
 // prepares _scan_state.
 void G1RemSet::prepare_for_scan_heap_roots() {
  G1BarrierSet::dirty_card_queue_set().concatenate_logs();
  _scan_state->prepare();
 }
--- a/src/hotspot/share/gc/g1/g1RemSetSummary.cpp
+++ b/src/hotspot/share/gc/g1/g1RemSetSummary.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -47,12 +47,10 @@ void G1RemSetSummary::update() {
      G1ConcurrentRefineThread* crt = static_cast<G1ConcurrentRefineThread*>(t);
      _summary->set_rs_thread_vtime(_counter, crt->vtime_accum());
      _counter++;
      _summary->_total_concurrent_refined_cards += crt->total_refined_cards();
    }
  } collector(this);
  G1CollectedHeap* g1h = G1CollectedHeap::heap();
  g1h->concurrent_refine()->threads_do(&collector);
  _total_mutator_refined_cards = G1BarrierSet::dirty_card_queue_set().total_mutator_refined_cards();
  _num_coarsenings = HeapRegionRemSet::n_coarsenings();
  set_sampling_thread_vtime(g1h->sampling_thread()->vtime_accum());
@ -71,8 +69,6 @@ double G1RemSetSummary::rs_thread_vtime(uint thread) const {
 }
 G1RemSetSummary::G1RemSetSummary(bool should_update) :
  _total_mutator_refined_cards(0),
  _total_concurrent_refined_cards(0),
  _num_coarsenings(0),
  _num_vtimes(G1ConcurrentRefine::max_num_threads()),
  _rs_threads_vtimes(NEW_C_HEAP_ARRAY(double, _num_vtimes, mtGC)),
@ -93,9 +89,6 @@ void G1RemSetSummary::set(G1RemSetSummary* other) {
  assert(other != NULL, "just checking");
  assert(_num_vtimes == other->_num_vtimes, "just checking");
  _total_mutator_refined_cards = other->total_mutator_refined_cards();
  _total_concurrent_refined_cards = other->total_concurrent_refined_cards();
  _num_coarsenings = other->num_coarsenings();
  memcpy(_rs_threads_vtimes, other->_rs_threads_vtimes, sizeof(double) * _num_vtimes);
@ -107,9 +100,6 @@ void G1RemSetSummary::subtract_from(G1RemSetSummary* other) {
  assert(other != NULL, "just checking");
  assert(_num_vtimes == other->_num_vtimes, "just checking");
  _total_mutator_refined_cards = other->total_mutator_refined_cards() - _total_mutator_refined_cards;
  _total_concurrent_refined_cards = other->total_concurrent_refined_cards() - _total_concurrent_refined_cards;
  _num_coarsenings = other->num_coarsenings() - _num_coarsenings;
  for (uint i = 0; i < _num_vtimes; i++) {
@ -330,14 +320,6 @@ public:
 };
 void G1RemSetSummary::print_on(outputStream* out) {
  out->print_cr(" Recent concurrent refinement statistics");
  out->print_cr("  Of " SIZE_FORMAT " refined cards:", total_refined_cards());
  out->print_cr("     " SIZE_FORMAT_W(8) " (%5.1f%%) by concurrent refinement threads.",
                total_concurrent_refined_cards(),
                percent_of(total_concurrent_refined_cards(), total_refined_cards()));
  out->print_cr("     " SIZE_FORMAT_W(8) " (%5.1f%%) by mutator threads.",
                total_mutator_refined_cards(),
                percent_of(total_mutator_refined_cards(), total_refined_cards()));
  out->print_cr("  Did " SIZE_FORMAT " coarsenings.", num_coarsenings());
  out->print_cr("  Concurrent refinement threads times (s)");
  out->print("     ");
--- a/src/hotspot/share/gc/g1/g1RemSetSummary.hpp
+++ b/src/hotspot/share/gc/g1/g1RemSetSummary.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -33,10 +33,6 @@ class G1RemSet;
 // A G1RemSetSummary manages statistical information about the G1RemSet
 class G1RemSetSummary {
 private:
  size_t _total_mutator_refined_cards;
  size_t _total_concurrent_refined_cards;
  size_t _num_coarsenings;
  size_t _num_vtimes;
@ -70,18 +66,6 @@ public:
    return _sampling_thread_vtime;
  }
  // Returns the stored count of cards refined by mutator threads.  This is
  // a snapshot after update()/set(), or a difference after subtract_from().
  size_t total_mutator_refined_cards() const {
    return _total_mutator_refined_cards;
  }
  // Returns the stored count of cards refined by the concurrent refinement
  // threads.  This is a snapshot after update()/set(), or a difference
  // after subtract_from().
  size_t total_concurrent_refined_cards() const {
    return _total_concurrent_refined_cards;
  }
  // Returns the sum of the mutator and concurrent refined-card counts held
  // in this summary.
  size_t total_refined_cards() const {
    return total_mutator_refined_cards() + total_concurrent_refined_cards();
  }
  // Returns the stored _num_coarsenings value.
  size_t num_coarsenings() const {
    return _num_coarsenings;
  }
--- a/src/hotspot/share/logging/logPrefix.hpp
+++ b/src/hotspot/share/logging/logPrefix.hpp
@ -82,6 +82,7 @@ DEBUG_ONLY(size_t Test_log_prefix_prefixer(char* buf, size_t len);)
  LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, remset, tracking)) \
  LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, ref)) \
  LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, ref, start)) \
  LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, refine, stats)) \
  LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, reloc)) \
  LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, start)) \
  LOG_PREFIX(GCId::print_prefix, LOG_TAGS(gc, symboltable)) \
--- a/src/hotspot/share/runtime/mutexLocker.cpp
+++ b/src/hotspot/share/runtime/mutexLocker.cpp
@ -74,6 +74,7 @@ Monitor* STS_lock                     = NULL;
 Monitor* FullGCCount_lock             = NULL;
 Monitor* G1OldGCCount_lock            = NULL;
 Mutex*   Shared_DirtyCardQ_lock       = NULL;
 Mutex*   G1DetachedRefinementStats_lock = NULL;
 Mutex*   MarkStackFreeList_lock       = NULL;
 Mutex*   MarkStackChunkList_lock      = NULL;
 Mutex*   MonitoringSupport_lock       = NULL;
@ -212,6 +213,8 @@ void mutex_init() {
    def(Shared_DirtyCardQ_lock     , PaddedMutex  , access + 1,  true,  _safepoint_check_never);
    def(G1DetachedRefinementStats_lock, PaddedMutex, leaf    ,   true, _safepoint_check_never);
    def(FreeList_lock              , PaddedMutex  , leaf     ,   true,  _safepoint_check_never);
    def(OldSets_lock               , PaddedMutex  , leaf     ,   true,  _safepoint_check_never);
    def(RootRegionScan_lock        , PaddedMonitor, leaf     ,   true,  _safepoint_check_never);
@ -269,7 +272,7 @@ void mutex_init() {
  def(PerfDataManager_lock         , PaddedMutex  , leaf,        true,  _safepoint_check_always); // used for synchronized access to PerfDataManager resources
  def(Threads_lock                 , PaddedMonitor, barrier,     true,  _safepoint_check_always);  // Used for safepoint protocol.
-  def(NonJavaThreadsList_lock      , PaddedMutex,   leaf,        true,  _safepoint_check_never);
+  def(NonJavaThreadsList_lock      , PaddedMutex,   barrier,     true,  _safepoint_check_never);
  def(NonJavaThreadsListSync_lock  , PaddedMutex,   leaf,        true,  _safepoint_check_never);
  def(VMOperationQueue_lock        , PaddedMonitor, nonleaf,     true,  _safepoint_check_never);  // VM_thread allowed to block on these
--- a/src/hotspot/share/runtime/mutexLocker.hpp
+++ b/src/hotspot/share/runtime/mutexLocker.hpp
@ -71,6 +71,7 @@ extern Monitor* G1OldGCCount_lock;               // in support of "concurrent" f
 extern Mutex*   Shared_DirtyCardQ_lock;          // Lock protecting dirty card
                                                 // queue shared by
                                                 // non-Java threads.
 extern Mutex*   G1DetachedRefinementStats_lock;  // Lock protecting detached refinement stats
 extern Mutex*   MarkStackFreeList_lock;          // Protects access to the global mark stack free list.
 extern Mutex*   MarkStackChunkList_lock;         // Protects access to the global mark stack chunk list.
 extern Mutex*   MonitoringSupport_lock;          // Protects updates to the serviceability memory pools.
--- a/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java
+++ b/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java
@ -143,6 +143,7 @@ public class TestGCLogMessages {
        new LogMessageWithLevel("Expand Heap After Collection", Level.DEBUG),
        new LogMessageWithLevel("Region Register", Level.DEBUG),
        new LogMessageWithLevel("Prepare Heap Roots", Level.DEBUG),
        new LogMessageWithLevel("Concatenate Dirty Card Logs", Level.DEBUG),
        // Free CSet
        new LogMessageWithLevel("Free Collection Set", Level.DEBUG),
        new LogMessageWithLevel("Serial Free Collection Set", Level.TRACE),
@ -186,6 +187,7 @@ public class TestGCLogMessages {
    public static void main(String[] args) throws Exception {
        new TestGCLogMessages().testNormalLogs();
        new TestGCLogMessages().testConcurrentRefinementLogs();
        new TestGCLogMessages().testWithToSpaceExhaustionLogs();
        new TestGCLogMessages().testWithInitialMark();
        new TestGCLogMessages().testExpandHeap();
@ -221,6 +223,23 @@ public class TestGCLogMessages {
        output.shouldHaveExitValue(0);
    }
    // Refinement statistics messages expected at debug level; checked by
    // testConcurrentRefinementLogs(), which runs with
    // -Xlog:gc+refine+stats=debug.
    LogMessageWithLevel concRefineMessages[] = new LogMessageWithLevel[] {
        new LogMessageWithLevel("Mutator refinement: ", Level.DEBUG),
        new LogMessageWithLevel("Concurrent refinement: ", Level.DEBUG),
        new LogMessageWithLevel("Total refinement: ", Level.DEBUG),
        // "Concurrent refinement rate" optionally printed if any.
        // "Generate dirty cards rate" optionally printed if any.
    };
    // Runs GCTest under G1 with gc+refine+stats debug logging, checks the
    // concRefineMessages entries at debug level, and requires the child JVM
    // to exit with status 0.
    private void testConcurrentRefinementLogs() throws Exception {
        ProcessBuilder pb = ProcessTools.createJavaProcessBuilder("-XX:+UseG1GC",
                                                                  "-Xmx10M",
                                                                  "-Xlog:gc+refine+stats=debug",
                                                                  GCTest.class.getName());
        OutputAnalyzer output = new OutputAnalyzer(pb.start());
        checkMessagesAtLevel(output, concRefineMessages, Level.DEBUG);
        // Without this check, a child JVM that printed the messages and then
        // crashed or failed would still let the test pass.
        output.shouldHaveExitValue(0);
    }
    LogMessageWithLevel exhFailureMessages[] = new LogMessageWithLevel[] {
        new LogMessageWithLevel("Evacuation Failure", Level.DEBUG),
        new LogMessageWithLevel("Recalculate Used", Level.TRACE),
--- a/test/hotspot/jtreg/gc/g1/TestRemsetLoggingTools.java
+++ b/test/hotspot/jtreg/gc/g1/TestRemsetLoggingTools.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -110,7 +110,7 @@ public class TestRemsetLoggingTools {
    }
    public static void expectRSetSummaries(String result, int expectedCumulative, int expectedPeriodic) throws Exception {
-        int actualTotal = result.split("concurrent refinement statistics").length - 1;
+        int actualTotal = result.split("Concurrent refinement threads times").length - 1;
        int actualCumulative = result.split("Cumulative RS summary").length - 1;
        if (expectedCumulative != actualCumulative) {