8157952: Parallelize Memory Pretouch

Use multiple threads to pretouch memory when -XX:+AlwaysPreTouch is enabled, in order to use more memory bandwidth.

Reviewed-by: jmasa, sangheki
2025-09-19 18:44:38 +02:00 · 2016-09-16 11:33:47 +02:00 · 2016-09-16 11:33:47 +02:00 · 317f1aa044
commit 317f1aa044
parent b77d0de3d9
12 changed files with 124 additions and 39 deletions
--- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp
+++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp
@ -1479,7 +1479,7 @@ void G1CollectedHeap::resize_if_necessary_after_full_collection() {
                              "Capacity: " SIZE_FORMAT "B occupancy: " SIZE_FORMAT "B min_desired_capacity: " SIZE_FORMAT "B (" UINTX_FORMAT " %%)",
                              capacity_after_gc, used_after_gc, minimum_desired_capacity, MinHeapFreeRatio);

-    expand(expand_bytes);
+    expand(expand_bytes, _workers);

    // No expansion, now see if we want to shrink
  } else if (capacity_after_gc > maximum_desired_capacity) {
@ -1599,7 +1599,7 @@ HeapWord* G1CollectedHeap::expand_and_allocate(size_t word_size, AllocationConte
                            word_size * HeapWordSize);


-  if (expand(expand_bytes)) {
+  if (expand(expand_bytes, _workers)) {
    _hrm.verify_optional();
    _verifier->verify_region_sets_optional();
    return attempt_allocation_at_safepoint(word_size,
@ -1609,7 +1609,7 @@ HeapWord* G1CollectedHeap::expand_and_allocate(size_t word_size, AllocationConte
  return NULL;
 }

-bool G1CollectedHeap::expand(size_t expand_bytes, double* expand_time_ms) {
+bool G1CollectedHeap::expand(size_t expand_bytes, WorkGang* pretouch_workers, double* expand_time_ms) {
  size_t aligned_expand_bytes = ReservedSpace::page_align_size_up(expand_bytes);
  aligned_expand_bytes = align_size_up(aligned_expand_bytes,
                                       HeapRegion::GrainBytes);
@ -1626,7 +1626,7 @@ bool G1CollectedHeap::expand(size_t expand_bytes, double* expand_time_ms) {
  uint regions_to_expand = (uint)(aligned_expand_bytes / HeapRegion::GrainBytes);
  assert(regions_to_expand > 0, "Must expand by at least one region");

-  uint expanded_by = _hrm.expand_by(regions_to_expand);
+  uint expanded_by = _hrm.expand_by(regions_to_expand, pretouch_workers);
  if (expand_time_ms != NULL) {
    *expand_time_ms = (os::elapsedTime() - expand_heap_start_time_sec) * MILLIUNITS;
  }
@ -1927,7 +1927,7 @@ jint G1CollectedHeap::initialize() {
  _cmThread = _cm->cmThread();

  // Now expand into the initial heap size.
-  if (!expand(init_byte_size)) {
+  if (!expand(init_byte_size, _workers)) {
    vm_shutdown_during_initialization("Failed to allocate initial heap.");
    return JNI_ENOMEM;
  }
@ -3240,7 +3240,7 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) {
            // No need for an ergo logging here,
            // expansion_amount() does this when it returns a value > 0.
            double expand_ms;
-            if (!expand(expand_bytes, &expand_ms)) {
+            if (!expand(expand_bytes, _workers, &expand_ms)) {
              // We failed to expand the heap. Cannot do anything about it.
            }
            g1_policy()->phase_times()->record_expand_heap_time(expand_ms);
--- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp
+++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp
@ -557,7 +557,7 @@ public:
  // Returns true if the heap was expanded by the requested amount;
  // false otherwise.
  // (Rounds up to a HeapRegion boundary.)
-  bool expand(size_t expand_bytes, double* expand_time_ms = NULL);
+  bool expand(size_t expand_bytes, WorkGang* pretouch_workers = NULL, double* expand_time_ms = NULL);

  // Returns the PLAB statistics for a given destination.
  inline G1EvacStats* alloc_buffer_stats(InCSetState dest);
--- a/hotspot/src/share/vm/gc/g1/g1PageBasedVirtualSpace.cpp
+++ b/hotspot/src/share/vm/gc/g1/g1PageBasedVirtualSpace.cpp
@ -24,8 +24,10 @@

 #include "precompiled.hpp"
 #include "gc/g1/g1PageBasedVirtualSpace.hpp"
+#include "gc/shared/workgroup.hpp"
 #include "oops/markOop.hpp"
 #include "oops/oop.inline.hpp"
+#include "runtime/atomic.hpp"
 #include "runtime/os.inline.hpp"
 #include "services/memTracker.hpp"
 #include "utilities/bitMap.inline.hpp"
@ -177,7 +179,7 @@ void G1PageBasedVirtualSpace::pretouch_internal(size_t start_page, size_t end_pa
  guarantee(start_page < end_page,
            "Given start page " SIZE_FORMAT " is larger or equal to end page " SIZE_FORMAT, start_page, end_page);

-  os::pretouch_memory(page_start(start_page), bounded_end_addr(end_page));
+  os::pretouch_memory(page_start(start_page), bounded_end_addr(end_page), _page_size);
 }

 bool G1PageBasedVirtualSpace::commit(size_t start_page, size_t size_in_pages) {
@ -198,9 +200,6 @@ bool G1PageBasedVirtualSpace::commit(size_t start_page, size_t size_in_pages) {
  }
  _committed.set_range(start_page, end_page);

-  if (AlwaysPreTouch) {
-    pretouch_internal(start_page, end_page);
-  }
  return zero_filled;
 }

@ -227,6 +226,53 @@ void G1PageBasedVirtualSpace::uncommit(size_t start_page, size_t size_in_pages)
  _committed.clear_range(start_page, end_page);
 }

+class G1PretouchTask : public AbstractGangTask {  // Gang task whose workers claim chunks of [_start_addr, _end_addr) and pretouch them.
+private:
+  char* volatile _cur_addr;  // Start of the next unclaimed chunk; advanced atomically in work().
+  char* const _start_addr;  // First address of the range to pretouch.
+  char* const _end_addr;  // Exclusive end of the range to pretouch.
+  size_t const _page_size;  // Stride handed to os::pretouch_memory(); also the lower bound for the chunk size.
+public:
+  G1PretouchTask(char* start_address, char* end_address, size_t page_size) :
+    AbstractGangTask("G1 PreTouch",
+                     Universe::is_fully_initialized() ? GCId::current_raw() :
+                                                        // During VM initialization there is
+                                                        // no GC cycle that this task can be
+                                                        // associated with.
+                                                        GCId::undefined()),
+    _cur_addr(start_address),
+    _start_addr(start_address),
+    _end_addr(end_address),
+    _page_size(page_size) {
+  }
+
+  virtual void work(uint worker_id) {  // Claims and pretouches chunks until the claimed start falls outside the range; worker_id is unused.
+    size_t const actual_chunk_size = MAX2(chunk_size(), _page_size);  // A chunk is never smaller than one page.
+    while (true) {
+      char* touch_addr = (char*)Atomic::add_ptr((intptr_t)actual_chunk_size, (volatile void*) &_cur_addr) - actual_chunk_size;  // add_ptr result minus the increment is used as this worker's chunk start (presumably the value before the add).
+      if (touch_addr < _start_addr || touch_addr >= _end_addr) {  // Claimed start is at/after the end, or below the start (presumably after address wrap-around): nothing left to do.
+        break;
+      }
+      char* end_addr = touch_addr + MIN2(actual_chunk_size, pointer_delta(_end_addr, touch_addr, sizeof(char)));  // Clamp the final chunk so it does not run past _end_addr.
+      os::pretouch_memory(touch_addr, end_addr, _page_size);
+    }
+  }
+
+  static size_t chunk_size() { return PreTouchParallelChunkSize; }  // Requested per-claim chunk size, taken from the PreTouchParallelChunkSize flag.
+};
+
+// Pretouches the pages [start_page, start_page + size_in_pages) of this space.
+//
+// If pretouch_gang is non-NULL, the range is split into work units of
+// MAX2(PreTouchParallelChunkSize, page size) bytes and the task is run on at
+// most that many of the gang's active workers. If pretouch_gang is NULL (the
+// default for every pretouch_gang/pretouch_workers parameter on the commit
+// path), the calling thread pretouches the whole range itself instead of
+// aborting the VM.
+void G1PageBasedVirtualSpace::pretouch(size_t start_page, size_t size_in_pages, WorkGang* pretouch_gang) {
+  G1PretouchTask cl(page_start(start_page), bounded_end_addr(start_page + size_in_pages), _page_size);
+
+  if (pretouch_gang != NULL) {
+    // Always at least one work unit so that ranges smaller than a chunk still
+    // get a worker.
+    size_t num_chunks = MAX2((size_t)1, size_in_pages * _page_size / MAX2(G1PretouchTask::chunk_size(), _page_size));
+
+    // Take the minimum before narrowing to uint so that a very large chunk
+    // count cannot be truncated to a small (or zero) worker count.
+    uint num_workers = (uint)MIN2(num_chunks, (size_t)pretouch_gang->active_workers());
+    log_debug(gc, heap)("Running %s with %u workers for " SIZE_FORMAT " work units pre-touching " SIZE_FORMAT "B.",
+                        cl.name(), num_workers, num_chunks, size_in_pages * _page_size);
+    pretouch_gang->run_task(&cl, num_workers);
+  } else {
+    // No gang available: do all the work on the current thread.
+    log_debug(gc, heap)("Running %s pre-touching " SIZE_FORMAT "B.",
+                        cl.name(), size_in_pages * _page_size);
+    cl.work(0);
+  }
+}
+
 bool G1PageBasedVirtualSpace::contains(const void* p) const {
  return _low_boundary <= (const char*) p && (const char*) p < _high_boundary;
 }
--- a/hotspot/src/share/vm/gc/g1/g1PageBasedVirtualSpace.hpp
+++ b/hotspot/src/share/vm/gc/g1/g1PageBasedVirtualSpace.hpp
@ -30,6 +30,8 @@
 #include "memory/virtualspace.hpp"
 #include "utilities/bitMap.hpp"

+class WorkGang;
+
 // Virtual space management helper for a virtual space with an OS page allocation
 // granularity.
 // (De-)Allocation requests are always OS page aligned by passing a page index
@ -117,6 +119,8 @@ class G1PageBasedVirtualSpace VALUE_OBJ_CLASS_SPEC {
  // Uncommit the given area of pages starting at start being size_in_pages large.
  void uncommit(size_t start_page, size_t size_in_pages);

+  void pretouch(size_t start_page, size_t size_in_pages, WorkGang* pretouch_gang = NULL);
+
  // Initialize the given reserved space with the given base address and the size
  // actually used.
  // Prefer to commit in page_size chunks.
--- a/hotspot/src/share/vm/gc/g1/g1RegionToSpaceMapper.cpp
+++ b/hotspot/src/share/vm/gc/g1/g1RegionToSpaceMapper.cpp
@ -66,8 +66,12 @@ class G1RegionsLargerThanCommitSizeMapper : public G1RegionToSpaceMapper {
    guarantee(alloc_granularity >= page_size, "allocation granularity smaller than commit granularity");
  }

-  virtual void commit_regions(uint start_idx, size_t num_regions) {
-    bool zero_filled = _storage.commit((size_t)start_idx * _pages_per_region, num_regions * _pages_per_region);
+  virtual void commit_regions(uint start_idx, size_t num_regions, WorkGang* pretouch_gang) {
+    size_t const start_page = (size_t)start_idx * _pages_per_region;
+    bool zero_filled = _storage.commit(start_page, num_regions * _pages_per_region);
+    if (AlwaysPreTouch) {
+      _storage.pretouch(start_page, num_regions * _pages_per_region, pretouch_gang);
+    }
    _commit_map.set_range(start_idx, start_idx + num_regions);
    fire_on_commit(start_idx, num_regions, zero_filled);
  }
@ -110,19 +114,38 @@ class G1RegionsSmallerThanCommitSizeMapper : public G1RegionToSpaceMapper {
    _refcounts.initialize((HeapWord*)rs.base(), (HeapWord*)(rs.base() + align_size_up(rs.size(), page_size)), page_size);
  }

-  virtual void commit_regions(uint start_idx, size_t num_regions) {
+  virtual void commit_regions(uint start_idx, size_t num_regions, WorkGang* pretouch_gang) {  // Commits backing pages for the regions, pretouches newly committed pages if AlwaysPreTouch, then fires one commit notification.
+    size_t const NoPage = ~(size_t)0;  // Sentinel: no page has been committed by this call yet.
+
+    size_t first_committed = NoPage;  // Index of the first page committed by this call.
+    size_t num_committed = 0;  // Number of pages committed by this call.
+
+    bool all_zero_filled = true;  // Remains true only if every region's page was committed here and reported zero-filled.
+
     for (uint i = start_idx; i < start_idx + num_regions; i++) {
       assert(!_commit_map.at(i), "Trying to commit storage at region %u that is already committed", i);
       size_t idx = region_idx_to_page_idx(i);
       uint old_refcount = _refcounts.get_by_index(idx);
+
       bool zero_filled = false;
       if (old_refcount == 0) {
+        if (first_committed == NoPage) {
+          first_committed = idx;
+          num_committed = 1;
+        } else {
+          num_committed++;
+        }
         zero_filled = _storage.commit(idx, 1);
       }
+      all_zero_filled &= zero_filled;  // A region on a page that was already committed (refcount > 0) clears the flag.
+
       _refcounts.set_by_index(idx, old_refcount + 1);
       _commit_map.set_bit(i);
-      fire_on_commit(i, 1, zero_filled);
     }
+    if (AlwaysPreTouch && num_committed > 0) {
+      _storage.pretouch(first_committed, num_committed, pretouch_gang);  // NOTE(review): treats the committed pages as one contiguous run starting at first_committed — confirm this always holds.
+    }
+    fire_on_commit(start_idx, num_regions, all_zero_filled);  // Single notification for the whole range instead of one per region.
   }

  virtual void uncommit_regions(uint start_idx, size_t num_regions) {
--- a/hotspot/src/share/vm/gc/g1/g1RegionToSpaceMapper.hpp
+++ b/hotspot/src/share/vm/gc/g1/g1RegionToSpaceMapper.hpp
@ -29,6 +29,8 @@
 #include "memory/allocation.hpp"
 #include "utilities/debug.hpp"

+class WorkGang;
+
 class G1MappingChangedListener VALUE_OBJ_CLASS_SPEC {
 public:
  // Fired after commit of the memory, i.e. the memory this listener is registered
@ -68,7 +70,7 @@ class G1RegionToSpaceMapper : public CHeapObj<mtGC> {
    return _commit_map.at(idx);
  }

-  virtual void commit_regions(uint start_idx, size_t num_regions = 1) = 0;
+  virtual void commit_regions(uint start_idx, size_t num_regions = 1, WorkGang* pretouch_workers = NULL) = 0;
  virtual void uncommit_regions(uint start_idx, size_t num_regions = 1) = 0;

  // Creates an appropriate G1RegionToSpaceMapper for the given parameters.
--- a/hotspot/src/share/vm/gc/g1/heapRegionManager.cpp
+++ b/hotspot/src/share/vm/gc/g1/heapRegionManager.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -72,22 +72,22 @@ HeapRegion* HeapRegionManager::new_heap_region(uint hrm_index) {
  return g1h->new_heap_region(hrm_index, mr);
 }

-void HeapRegionManager::commit_regions(uint index, size_t num_regions) {
+void HeapRegionManager::commit_regions(uint index, size_t num_regions, WorkGang* pretouch_gang) {
  guarantee(num_regions > 0, "Must commit more than zero regions");
  guarantee(_num_committed + num_regions <= max_length(), "Cannot commit more than the maximum amount of regions");

  _num_committed += (uint)num_regions;

-  _heap_mapper->commit_regions(index, num_regions);
+  _heap_mapper->commit_regions(index, num_regions, pretouch_gang);

  // Also commit auxiliary data
-  _prev_bitmap_mapper->commit_regions(index, num_regions);
-  _next_bitmap_mapper->commit_regions(index, num_regions);
+  _prev_bitmap_mapper->commit_regions(index, num_regions, pretouch_gang);
+  _next_bitmap_mapper->commit_regions(index, num_regions, pretouch_gang);

-  _bot_mapper->commit_regions(index, num_regions);
-  _cardtable_mapper->commit_regions(index, num_regions);
+  _bot_mapper->commit_regions(index, num_regions, pretouch_gang);
+  _cardtable_mapper->commit_regions(index, num_regions, pretouch_gang);

-  _card_counts_mapper->commit_regions(index, num_regions);
+  _card_counts_mapper->commit_regions(index, num_regions, pretouch_gang);
 }

 void HeapRegionManager::uncommit_regions(uint start, size_t num_regions) {
@ -117,9 +117,9 @@ void HeapRegionManager::uncommit_regions(uint start, size_t num_regions) {
  _card_counts_mapper->uncommit_regions(start, num_regions);
 }

-void HeapRegionManager::make_regions_available(uint start, uint num_regions) {
+void HeapRegionManager::make_regions_available(uint start, uint num_regions, WorkGang* pretouch_gang) {
  guarantee(num_regions > 0, "No point in calling this for zero regions");
-  commit_regions(start, num_regions);
+  commit_regions(start, num_regions, pretouch_gang);
  for (uint i = start; i < start + num_regions; i++) {
    if (_regions.get_by_index(i) == NULL) {
      HeapRegion* new_hr = new_heap_region(i);
@ -163,11 +163,11 @@ MemoryUsage HeapRegionManager::get_auxiliary_data_memory_usage() const {
  return MemoryUsage(0, used_sz, committed_sz, committed_sz);
 }

-uint HeapRegionManager::expand_by(uint num_regions) {
-  return expand_at(0, num_regions);
+uint HeapRegionManager::expand_by(uint num_regions, WorkGang* pretouch_workers) {
+  return expand_at(0, num_regions, pretouch_workers);
 }

-uint HeapRegionManager::expand_at(uint start, uint num_regions) {
+uint HeapRegionManager::expand_at(uint start, uint num_regions, WorkGang* pretouch_workers) {
  if (num_regions == 0) {
    return 0;
  }
@ -181,7 +181,7 @@ uint HeapRegionManager::expand_at(uint start, uint num_regions) {
  while (expanded < num_regions &&
         (num_last_found = find_unavailable_from_idx(cur, &idx_last_found)) > 0) {
    uint to_expand = MIN2(num_regions - expanded, num_last_found);
-    make_regions_available(idx_last_found, to_expand);
+    make_regions_available(idx_last_found, to_expand, pretouch_workers);
    expanded += to_expand;
    cur = idx_last_found + num_last_found + 1;
  }
--- a/hotspot/src/share/vm/gc/g1/heapRegionManager.hpp
+++ b/hotspot/src/share/vm/gc/g1/heapRegionManager.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -34,6 +34,7 @@ class HeapRegion;
 class HeapRegionClosure;
 class HeapRegionClaimer;
 class FreeRegionList;
+class WorkGang;

 class G1HeapRegionTable : public G1BiasedMappedArray<HeapRegion*> {
 protected:
@ -94,10 +95,10 @@ class HeapRegionManager: public CHeapObj<mtGC> {
  HeapWord* heap_bottom() const { return _regions.bottom_address_mapped(); }
  HeapWord* heap_end() const {return _regions.end_address_mapped(); }

-  void make_regions_available(uint index, uint num_regions = 1);
+  void make_regions_available(uint index, uint num_regions = 1, WorkGang* pretouch_gang = NULL);

  // Pass down commit calls to the VirtualSpace.
-  void commit_regions(uint index, size_t num_regions = 1);
+  void commit_regions(uint index, size_t num_regions = 1, WorkGang* pretouch_gang = NULL);
  void uncommit_regions(uint index, size_t num_regions = 1);

  // Notify other data structures about change in the heap layout.
@ -209,12 +210,12 @@ public:
  // HeapRegions, or re-use existing ones. Returns the number of regions the
  // sequence was expanded by. If a HeapRegion allocation fails, the resulting
  // number of regions might be smaller than what's desired.
-  uint expand_by(uint num_regions);
+  uint expand_by(uint num_regions, WorkGang* pretouch_workers = NULL);

  // Makes sure that the regions from start to start+num_regions-1 are available
  // for allocation. Returns the number of regions that were committed to achieve
  // this.
-  uint expand_at(uint start, uint num_regions);
+  uint expand_at(uint start, uint num_regions, WorkGang* pretouch_workers = NULL);

  // Find a contiguous set of empty regions of length num. Returns the start index of
  // that set, or G1_NO_HRM_INDEX.
--- a/hotspot/src/share/vm/gc/shared/workgroup.hpp
+++ b/hotspot/src/share/vm/gc/shared/workgroup.hpp
@ -64,6 +64,11 @@ class AbstractGangTask VALUE_OBJ_CLASS_SPEC {
    _gc_id(GCId::current_raw())
  {}

+  AbstractGangTask(const char* name, const uint gc_id) :  // Takes the GC id explicitly rather than reading GCId::current_raw().
+    _name(name),
+    _gc_id(gc_id)
+  {}
+
  // The abstract work method.
  // The argument tells you which member of the gang you are.
  virtual void work(uint worker_id) = 0;
--- a/hotspot/src/share/vm/runtime/globals.hpp
+++ b/hotspot/src/share/vm/runtime/globals.hpp
@ -1596,6 +1596,10 @@ public:
  product(bool, AlwaysPreTouch, false,                                      \
          "Force all freshly committed pages to be pre-touched")            \
                                                                            \
+  product(size_t, PreTouchParallelChunkSize, 1 * G,                         \
+          "Per-thread chunk size for parallel memory pre-touch.")           \
+          range(1, SIZE_MAX / 2)                                            \
+                                                                            \
  product_pd(size_t, CMSYoungGenPerWorker,                                  \
          "The maximum size of young gen chosen by default per GC worker "  \
          "thread available")                                               \
--- a/hotspot/src/share/vm/runtime/os.cpp
+++ b/hotspot/src/share/vm/runtime/os.cpp
@ -1705,8 +1705,8 @@ bool os::release_memory(char* addr, size_t bytes) {
  return res;
 }

-void os::pretouch_memory(void* start, void* end) {
-  for (volatile char *p = (char*)start; p < (char*)end; p += os::vm_page_size()) {
+void os::pretouch_memory(void* start, void* end, size_t page_size) {  // Stores 0 to one byte every page_size bytes in [start, end), overwriting whatever was there.
+  for (volatile char *p = (char*)start; p < (char*)end; p += page_size) {  // NOTE(review): a page_size of 0 would never advance p — confirm callers never pass 0.
     *p = 0;
   }
  }
--- a/hotspot/src/share/vm/runtime/os.hpp
+++ b/hotspot/src/share/vm/runtime/os.hpp
@ -324,7 +324,7 @@ class os: AllStatic {
  // to make the OS back the memory range with actual memory.
  // Current implementation may not touch the last page if unaligned addresses
  // are passed.
-  static void   pretouch_memory(void* start, void* end);
+  static void   pretouch_memory(void* start, void* end, size_t page_size = vm_page_size());

  enum ProtType { MEM_PROT_NONE, MEM_PROT_READ, MEM_PROT_RW, MEM_PROT_RWX };
  static bool   protect_memory(char* addr, size_t bytes, ProtType prot,