7034139: G1: assert(Thread::current()->is_ConcurrentGC_thread()) failed: only a conc GC thread can call this

We were calling STS join and leave during a STW pause and we are not suppoesed to. I now only call those during concurrent phase. I also added stress code in the non-product builds to force an overflows (the condition that ws uncovering the bug) to make sure it does not happen again. Reviewed-by: johnc, brutisso
2025-08-26 14:24:46 +02:00 · 2011-04-29 12:40:49 -04:00 · 2011-04-29 12:40:49 -04:00 · 8c04c76193
commit 8c04c76193
parent 5b3550c107
3 changed files with 125 additions and 22 deletions
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp
@ -826,6 +826,14 @@ public:
 void ConcurrentMark::checkpointRootsInitialPost() {
  G1CollectedHeap*   g1h = G1CollectedHeap::heap();

+  // If we force an overflow during remark, the remark operation will
+  // actually abort and we'll restart concurrent marking. If we always
+  // force an oveflow during remark we'll never actually complete the
+  // marking phase. So, we initilize this here, at the start of the
+  // cycle, so that at the remaining overflow number will decrease at
+  // every remark and we'll eventually not need to cause one.
+  force_overflow_stw()->init();
+
  // For each region note start of marking.
  NoteStartOfMarkHRClosure startcl;
  g1h->heap_region_iterate(&startcl);
@ -893,27 +901,37 @@ void ConcurrentMark::checkpointRootsInitial() {
 }

 /*
-   Notice that in the next two methods, we actually leave the STS
-   during the barrier sync and join it immediately afterwards. If we
-   do not do this, this then the following deadlock can occur: one
-   thread could be in the barrier sync code, waiting for the other
-   thread to also sync up, whereas another one could be trying to
-   yield, while also waiting for the other threads to sync up too.
-
-   Because the thread that does the sync barrier has left the STS, it
-   is possible to be suspended for a Full GC or an evacuation pause
-   could occur. This is actually safe, since the entering the sync
-   barrier is one of the last things do_marking_step() does, and it
-   doesn't manipulate any data structures afterwards.
-*/
+ * Notice that in the next two methods, we actually leave the STS
+ * during the barrier sync and join it immediately afterwards. If we
+ * do not do this, the following deadlock can occur: one thread could
+ * be in the barrier sync code, waiting for the other thread to also
+ * sync up, whereas another one could be trying to yield, while also
+ * waiting for the other threads to sync up too.
+ *
+ * Note, however, that this code is also used during remark and in
+ * this case we should not attempt to leave / enter the STS, otherwise
+ * we'll either hit an asseert (debug / fastdebug) or deadlock
+ * (product). So we should only leave / enter the STS if we are
+ * operating concurrently.
+ *
+ * Because the thread that does the sync barrier has left the STS, it
+ * is possible to be suspended for a Full GC or an evacuation pause
+ * could occur. This is actually safe, since the entering the sync
+ * barrier is one of the last things do_marking_step() does, and it
+ * doesn't manipulate any data structures afterwards.
+ */

 void ConcurrentMark::enter_first_sync_barrier(int task_num) {
  if (verbose_low())
    gclog_or_tty->print_cr("[%d] entering first barrier", task_num);

-  ConcurrentGCThread::stsLeave();
+  if (concurrent()) {
+    ConcurrentGCThread::stsLeave();
+  }
  _first_overflow_barrier_sync.enter();
-  ConcurrentGCThread::stsJoin();
+  if (concurrent()) {
+    ConcurrentGCThread::stsJoin();
+  }
  // at this point everyone should have synced up and not be doing any
  // more work

@ -923,7 +941,12 @@ void ConcurrentMark::enter_first_sync_barrier(int task_num) {
  // let task 0 do this
  if (task_num == 0) {
    // task 0 is responsible for clearing the global data structures
-    clear_marking_state();
+    // We should be here because of an overflow. During STW we should
+    // not clear the overflow flag since we rely on it being true when
+    // we exit this method to abort the pause and restart concurent
+    // marking.
+    clear_marking_state(concurrent() /* clear_overflow */);
+    force_overflow()->update();

    if (PrintGC) {
      gclog_or_tty->date_stamp(PrintGCDateStamps);
@ -940,15 +963,45 @@ void ConcurrentMark::enter_second_sync_barrier(int task_num) {
  if (verbose_low())
    gclog_or_tty->print_cr("[%d] entering second barrier", task_num);

-  ConcurrentGCThread::stsLeave();
+  if (concurrent()) {
+    ConcurrentGCThread::stsLeave();
+  }
  _second_overflow_barrier_sync.enter();
-  ConcurrentGCThread::stsJoin();
+  if (concurrent()) {
+    ConcurrentGCThread::stsJoin();
+  }
  // at this point everything should be re-initialised and ready to go

  if (verbose_low())
    gclog_or_tty->print_cr("[%d] leaving second barrier", task_num);
 }

+#ifndef PRODUCT
+void ForceOverflowSettings::init() {
+  _num_remaining = G1ConcMarkForceOverflow;
+  _force = false;
+  update();
+}
+
+void ForceOverflowSettings::update() {
+  if (_num_remaining > 0) {
+    _num_remaining -= 1;
+    _force = true;
+  } else {
+    _force = false;
+  }
+}
+
+bool ForceOverflowSettings::should_force() {
+  if (_force) {
+    _force = false;
+    return true;
+  } else {
+    return false;
+  }
+}
+#endif // !PRODUCT
+
 void ConcurrentMark::grayRoot(oop p) {
  HeapWord* addr = (HeapWord*) p;
  // We can't really check against _heap_start and _heap_end, since it
@ -1117,6 +1170,7 @@ void ConcurrentMark::markFromRoots() {
  _restart_for_overflow = false;

  size_t active_workers = MAX2((size_t) 1, parallel_marking_threads());
+  force_overflow_conc()->init();
  set_phase(active_workers, true /* concurrent */);

  CMConcurrentMarkingTask markingTask(this, cmThread());
@ -2703,12 +2757,16 @@ void ConcurrentMark::oops_do(OopClosure* cl) {

 }

-void ConcurrentMark::clear_marking_state() {
+void ConcurrentMark::clear_marking_state(bool clear_overflow) {
  _markStack.setEmpty();
  _markStack.clear_overflow();
  _regionStack.setEmpty();
  _regionStack.clear_overflow();
-  clear_has_overflown();
+  if (clear_overflow) {
+    clear_has_overflown();
+  } else {
+    assert(has_overflown(), "pre-condition");
+  }
  _finger = _heap_start;

  for (int i = 0; i < (int)_max_task_num; ++i) {
@ -4279,6 +4337,15 @@ void CMTask::do_marking_step(double time_target_ms,
    }
  }

+  // If we are about to wrap up and go into termination, check if we
+  // should raise the overflow flag.
+  if (do_termination && !has_aborted()) {
+    if (_cm->force_overflow()->should_force()) {
+      _cm->set_has_overflown();
+      regular_clock_call();
+    }
+  }
+
  // We still haven't aborted. Now, let's try to get into the
  // termination protocol.
  if (do_termination && !has_aborted()) {