6684395: Port NUMA-aware allocator to linux

NUMA-aware allocator port to Linux

Reviewed-by: jmasa, apetrusenko
This commit is contained in:
Igor Veresov 2008-04-29 13:51:26 +04:00
parent f784be24d1
commit a24f915b60
14 changed files with 260 additions and 73 deletions

View file

@ -273,6 +273,8 @@ SUNWprivate_1.1 {
jio_vfprintf;
jio_vsnprintf;
fork1;
numa_warn;
numa_error;
# Needed because there is no JVM interface for this.
sysThreadAvailableStackWithSlack;

View file

@ -268,6 +268,8 @@ SUNWprivate_1.1 {
jio_vfprintf;
jio_vsnprintf;
fork1;
numa_warn;
numa_error;
# Needed because there is no JVM interface for this.
sysThreadAvailableStackWithSlack;

View file

@ -2228,20 +2228,42 @@ bool os::commit_memory(char* addr, size_t size, size_t alignment_hint) {
}
void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) { }
void os::free_memory(char *addr, size_t bytes) { }
// Hands the range [addr, addr + bytes) to os::uncommit_memory().
// The boolean result of that call is not propagated to the caller.
void os::free_memory(char *addr, size_t bytes) {
  os::uncommit_memory(addr, bytes);
}
void os::numa_make_global(char *addr, size_t bytes) { }
void os::numa_make_local(char *addr, size_t bytes) { }
// Forwards the range [addr, addr + bytes) to the numa_tonode_memory() wrapper,
// passing lgrp_hint as the target node. The wrapper's status result is dropped.
void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
  os::Linux::numa_tonode_memory(addr, bytes, lgrp_hint);
}
bool os::numa_topology_changed() { return false; }
size_t os::numa_get_groups_num() { return 1; }
int os::numa_get_group_id() { return 0; }
size_t os::numa_get_leaf_groups(int *ids, size_t size) {
if (size > 0) {
ids[0] = 0;
return 1;
// Number of locality groups: one more than the highest node number reported by
// Linux::numa_max_node(). Any non-positive answer (including the -1 the wrapper
// yields when the entry point was never resolved) is reported as a single group.
size_t os::numa_get_groups_num() {
  const int highest_node = Linux::numa_max_node();
  if (highest_node <= 0) {
    return 1;
  }
  return highest_node + 1;
}
// Returns the node of the cpu the calling thread is currently running on.
// Falls back to group 0 when either the cpu or its node cannot be determined.
int os::numa_get_group_id() {
  const int current_cpu = Linux::sched_getcpu();
  if (current_cpu == -1) {
    return 0;
  }
  const int node = Linux::get_node_by_cpu(current_cpu);
  return (node == -1) ? 0 : node;
}
// Fills ids[0 .. size-1] with the identity sequence 0, 1, ..., size-1 and
// reports size entries as written. The group ids are taken to be the indices
// themselves; no node lookup is performed here.
size_t os::numa_get_leaf_groups(int *ids, size_t size) {
  size_t next = 0;
  while (next < size) {
    ids[next] = (int)next;
    next++;
  }
  return size;
}
bool os::get_page_info(char *start, page_info* info) {
return false;
}
@ -2250,6 +2272,74 @@ char *os::scan_pages(char *start, char* end, page_info* page_expected, page_info
return end;
}
// No-op definitions of numa_warn() and numa_error() with C linkage.
// NOTE(review): presumably these stand in for libnuma's own diagnostic hooks so
// that the library stays silent inside the VM -- confirm against the libnuma
// documentation.
extern "C" void numa_warn(int number, char *where, ...) { }
extern "C" void numa_error(char *where) { }
// Resolves the entry points needed for NUMA support and, when libnuma reports
// that NUMA is available, builds the cpu -> node table.
//
// Nothing is reported to the caller. On failure the affected function pointers
// stay NULL, and the corresponding accessors in os::Linux then return -1.
void os::Linux::libnuma_init() {
  // sched_getcpu() should be in libc.
  set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
                                  dlsym(RTLD_DEFAULT, "sched_getcpu")));
  if (sched_getcpu() == -1) {
    // Either the symbol is missing or the call does not work; without it the
    // current node cannot be determined, so do not bother loading libnuma.
    return;
  }

  // Prefer the versioned runtime name. The unversioned "libnuma.so" link is
  // normally installed only by the development package, so relying on it alone
  // would leave NUMA support off on machines that have just the runtime
  // library. The unversioned name is kept as a fallback.
  void *handle = dlopen("libnuma.so.1", RTLD_LAZY);
  if (handle == NULL) {
    handle = dlopen("libnuma.so", RTLD_LAZY);
  }
  if (handle == NULL) {
    return;
  }

  // The handle is not closed: the pointers stored below refer into the library.
  set_numa_node_to_cpus(CAST_TO_FN_PTR(numa_node_to_cpus_func_t,
                                       dlsym(handle, "numa_node_to_cpus")));
  set_numa_max_node(CAST_TO_FN_PTR(numa_max_node_func_t,
                                   dlsym(handle, "numa_max_node")));
  set_numa_available(CAST_TO_FN_PTR(numa_available_func_t,
                                    dlsym(handle, "numa_available")));
  set_numa_tonode_memory(CAST_TO_FN_PTR(numa_tonode_memory_func_t,
                                        dlsym(handle, "numa_tonode_memory")));

  if (numa_available() != -1) {
    // Create a cpu -> node mapping
    _cpu_to_node = new (ResourceObj::C_HEAP) GrowableArray<int>(0, true);
    rebuild_cpu_to_node_map();
  }
}
// rebuild_cpu_to_node_map() constructs a table mapping cpu id to node id.
// The table is later used in get_node_by_cpu().
//
// The table is sized to os::active_processor_count() entries. For every node
// the cpu mask is fetched with numa_node_to_cpus() and each set bit records
// that node for the corresponding cpu. Nodes whose mask cannot be fetched are
// skipped.
void os::Linux::rebuild_cpu_to_node_map() {
  // Width of one mask word. It is derived from the C type actually used for the
  // mask rather than from the VM constant BitsPerLong, so that "1UL << bit"
  // below can never shift by the full width of the word or more.
  const int bits_per_word = (int)(sizeof(unsigned long) * 8);

  int cpu_num = os::active_processor_count();
  cpu_to_node()->clear();
  cpu_to_node()->at_grow(cpu_num - 1);

  int node_num = (int)numa_get_groups_num();
  int cpu_map_size = (cpu_num + bits_per_word - 1) / bits_per_word;
  unsigned long *cpu_map = NEW_C_HEAP_ARRAY(unsigned long, cpu_map_size);

  for (int node = 0; node < node_num; node++) {
    if (numa_node_to_cpus(node, cpu_map, cpu_map_size * sizeof(unsigned long)) == -1) {
      continue;
    }
    for (int word = 0; word < cpu_map_size; word++) {
      if (cpu_map[word] == 0) {
        continue;
      }
      for (int bit = 0; bit < bits_per_word; bit++) {
        if ((cpu_map[word] & (1UL << bit)) == 0) {
          continue;
        }
        int cpu_id = word * bits_per_word + bit;
        // The mask is rounded up to whole words, so it can carry bits for cpu
        // ids at or beyond cpu_num. The table holds only cpu_num entries;
        // ignore such bits instead of writing past its end.
        if (cpu_id < cpu_num) {
          cpu_to_node()->at_put(cpu_id, node);
        }
      }
    }
  }

  FREE_C_HEAP_ARRAY(unsigned long, cpu_map);
}
int os::Linux::get_node_by_cpu(int cpu_id) {
if (cpu_to_node() != NULL && cpu_id >= 0 && cpu_id < cpu_to_node()->length()) {
return cpu_to_node()->at(cpu_id);
}
return -1;
}
// Storage for the NUMA-related static members of os::Linux.
// _cpu_to_node is assigned in libnuma_init() only when numa_available() does
// not return -1; the function pointers are filled in from dlsym() results and
// may therefore remain NULL, which the accessors in os::Linux check for.
GrowableArray<int>* os::Linux::_cpu_to_node;
os::Linux::sched_getcpu_func_t os::Linux::_sched_getcpu;
os::Linux::numa_node_to_cpus_func_t os::Linux::_numa_node_to_cpus;
os::Linux::numa_max_node_func_t os::Linux::_numa_max_node;
os::Linux::numa_available_func_t os::Linux::_numa_available;
os::Linux::numa_tonode_memory_func_t os::Linux::_numa_tonode_memory;
bool os::uncommit_memory(char* addr, size_t size) {
return ::mmap(addr, size,
PROT_READ|PROT_WRITE|PROT_EXEC,
@ -3552,6 +3642,10 @@ jint os::init_2(void)
Linux::is_floating_stack() ? "floating stack" : "fixed stack");
}
if (UseNUMA) {
Linux::libnuma_init();
}
if (MaxFDLimit) {
// set the number of file descriptors to max. print out error
// if getrlimit/setrlimit fails but continue regardless.

View file

@ -59,6 +59,8 @@ class Linux {
static bool _is_NPTL;
static bool _supports_fast_thread_cpu_time;
static GrowableArray<int>* _cpu_to_node;
protected:
static julong _physical_memory;
@ -79,8 +81,9 @@ class Linux {
static void set_is_LinuxThreads() { _is_NPTL = false; }
static void set_is_floating_stack() { _is_floating_stack = true; }
static void rebuild_cpu_to_node_map();
static GrowableArray<int>* cpu_to_node() { return _cpu_to_node; }
public:
static void init_thread_fpu_state();
static int get_fpu_control_word();
static void set_fpu_control_word(int fpu_control);
@ -143,6 +146,7 @@ class Linux {
static bool is_floating_stack() { return _is_floating_stack; }
static void libpthread_init();
static void libnuma_init();
// Minimum stack size a thread can be created with (allowing
// the VM to completely create the thread and enter user code)
@ -229,6 +233,38 @@ class Linux {
#undef SR_SUSPENDED
};
private:
// Signatures of the entry points that libnuma_init() looks up with dlsym().
typedef int (*sched_getcpu_func_t)(void);
typedef int (*numa_node_to_cpus_func_t)(int node, unsigned long *buffer, int bufferlen);
typedef int (*numa_max_node_func_t)(void);
typedef int (*numa_available_func_t)(void);
typedef int (*numa_tonode_memory_func_t)(void *start, size_t size, int node);
// Resolved entry points; NULL until set, and left NULL when lookup fails.
static sched_getcpu_func_t _sched_getcpu;
static numa_node_to_cpus_func_t _numa_node_to_cpus;
static numa_max_node_func_t _numa_max_node;
static numa_available_func_t _numa_available;
static numa_tonode_memory_func_t _numa_tonode_memory;
// Setters used by libnuma_init() to record the dlsym() results.
static void set_sched_getcpu(sched_getcpu_func_t func) { _sched_getcpu = func; }
static void set_numa_node_to_cpus(numa_node_to_cpus_func_t func) { _numa_node_to_cpus = func; }
static void set_numa_max_node(numa_max_node_func_t func) { _numa_max_node = func; }
static void set_numa_available(numa_available_func_t func) { _numa_available = func; }
static void set_numa_tonode_memory(numa_tonode_memory_func_t func) { _numa_tonode_memory = func; }
public:
// Each wrapper forwards to the matching resolved entry point and yields -1
// when that entry point has not been resolved (its pointer is still NULL).
static int sched_getcpu() {
  if (_sched_getcpu == NULL) return -1;
  return _sched_getcpu();
}
static int numa_node_to_cpus(int node, unsigned long *buffer, int bufferlen) {
  if (_numa_node_to_cpus == NULL) return -1;
  return _numa_node_to_cpus(node, buffer, bufferlen);
}
static int numa_max_node() {
  if (_numa_max_node == NULL) return -1;
  return _numa_max_node();
}
static int numa_available() {
  if (_numa_available == NULL) return -1;
  return _numa_available();
}
static int numa_tonode_memory(void *start, size_t size, int node) {
  if (_numa_tonode_memory == NULL) return -1;
  return _numa_tonode_memory(start, size, node);
}
static int get_node_by_cpu(int cpu_id);
};

View file

@ -120,3 +120,6 @@ inline int os::closedir(DIR *dirp)
RESTARTABLE(_cmd, _result); \
return _result; \
} while(false)
// NUMA capability answers for this platform: static memory binding is
// reported, thread-to-group homing is not.
inline bool os::numa_has_static_binding() { return true; }
inline bool os::numa_has_group_homing() { return false; }

View file

@ -2602,7 +2602,7 @@ void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
}
// Tell the OS to make the range local to the first-touching LWP
void os::numa_make_local(char *addr, size_t bytes) {
void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
assert((intptr_t)addr % os::vm_page_size() == 0, "Address should be page-aligned.");
if (madvise(addr, bytes, MADV_ACCESS_LWP) < 0) {
debug_only(warning("MADV_ACCESS_LWP failed."));

View file

@ -204,3 +204,6 @@ do { \
RESTARTABLE(_cmd, _result); \
return _result; \
} while(false)
// NUMA capability answers for this platform: no static memory binding is
// reported, thread-to-group homing is.
inline bool os::numa_has_static_binding() { return false; }
inline bool os::numa_has_group_homing() { return true; }

View file

@ -2581,7 +2581,7 @@ bool os::unguard_memory(char* addr, size_t bytes) {
void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) { }
void os::free_memory(char *addr, size_t bytes) { }
void os::numa_make_global(char *addr, size_t bytes) { }
void os::numa_make_local(char *addr, size_t bytes) { }
void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) { }
bool os::numa_topology_changed() { return false; }
size_t os::numa_get_groups_num() { return 1; }
int os::numa_get_group_id() { return 0; }

View file

@ -69,3 +69,6 @@ inline void os::bang_stack_shadow_pages() {
*((int *)(sp - (pages * vm_page_size()))) = 0;
}
}
// NUMA capability answers for this platform: static memory binding is
// reported, thread-to-group homing is not.
inline bool os::numa_has_static_binding() { return true; }
inline bool os::numa_has_group_homing() { return false; }

View file

@ -169,8 +169,9 @@ class ParallelScavengeHeap : public CollectedHeap {
size_t large_typearray_limit() { return FastAllocateSizeLimit; }
bool supports_inline_contig_alloc() const { return !UseNUMA; }
HeapWord** top_addr() const { return !UseNUMA ? young_gen()->top_addr() : NULL; }
HeapWord** end_addr() const { return !UseNUMA ? young_gen()->end_addr() : NULL; }
// With UseNUMA set, the young generation's top/end addresses are not handed
// out (supports_inline_contig_alloc() is false in that case); the value
// (HeapWord**)-1 is returned in their place.
HeapWord** top_addr() const { return !UseNUMA ? young_gen()->top_addr() : (HeapWord**)-1; }
HeapWord** end_addr() const { return !UseNUMA ? young_gen()->end_addr() : (HeapWord**)-1; }
void ensure_parsability(bool retire_tlabs);
void accumulate_statistics_all_tlabs();

View file

@ -46,10 +46,12 @@ void MutableNUMASpace::mangle_unused_area() {
for (int i = 0; i < lgrp_spaces()->length(); i++) {
LGRPSpace *ls = lgrp_spaces()->at(i);
MutableSpace *s = ls->space();
if (!os::numa_has_static_binding()) {
HeapWord *top = MAX2((HeapWord*)round_down((intptr_t)s->top(), page_size()), s->bottom());
if (top < s->end()) {
ls->add_invalid_region(MemRegion(top, s->end()));
}
}
s->mangle_unused_area();
}
}
@ -70,6 +72,7 @@ void MutableNUMASpace::ensure_parsability() {
area_touched_words);
}
#endif
if (!os::numa_has_static_binding()) {
MemRegion invalid;
HeapWord *crossing_start = (HeapWord*)round_to((intptr_t)s->top(), os::vm_page_size());
HeapWord *crossing_end = (HeapWord*)round_to((intptr_t)(s->top() + area_touched_words),
@ -84,9 +87,11 @@ void MutableNUMASpace::ensure_parsability() {
}
ls->add_invalid_region(invalid);
}
s->set_top(s->end());
}
} else {
if (!os::numa_has_static_binding()) {
#ifdef ASSERT
MemRegion invalid(s->top(), s->end());
ls->add_invalid_region(invalid);
@ -98,6 +103,7 @@ void MutableNUMASpace::ensure_parsability() {
#endif
}
}
}
}
size_t MutableNUMASpace::used_in_words() const {
@ -194,7 +200,7 @@ bool MutableNUMASpace::update_layout(bool force) {
}
// Bias region towards the first-touching lgrp. Set the right page sizes.
void MutableNUMASpace::bias_region(MemRegion mr) {
void MutableNUMASpace::bias_region(MemRegion mr, int lgrp_id) {
HeapWord *start = (HeapWord*)round_to((intptr_t)mr.start(), page_size());
HeapWord *end = (HeapWord*)round_down((intptr_t)mr.end(), page_size());
if (end > start) {
@ -202,9 +208,13 @@ void MutableNUMASpace::bias_region(MemRegion mr) {
assert((intptr_t)aligned_region.start() % page_size() == 0 &&
(intptr_t)aligned_region.byte_size() % page_size() == 0, "Bad alignment");
assert(region().contains(aligned_region), "Sanity");
os::free_memory((char*)aligned_region.start(), aligned_region.byte_size());
// First we tell the OS which page size we want in the given range. The underlying
// large page can be broken down if we require small pages.
os::realign_memory((char*)aligned_region.start(), aligned_region.byte_size(), page_size());
os::numa_make_local((char*)aligned_region.start(), aligned_region.byte_size());
// Then we uncommit the pages in the range.
os::free_memory((char*)aligned_region.start(), aligned_region.byte_size());
// And make them local/first-touch biased.
os::numa_make_local((char*)aligned_region.start(), aligned_region.byte_size(), lgrp_id);
}
}
@ -233,12 +243,14 @@ void MutableNUMASpace::update() {
initialize(region(), true);
} else {
bool should_initialize = false;
if (!os::numa_has_static_binding()) {
for (int i = 0; i < lgrp_spaces()->length(); i++) {
if (!lgrp_spaces()->at(i)->invalid_region().is_empty()) {
should_initialize = true;
break;
}
}
}
if (should_initialize ||
(UseAdaptiveNUMAChunkSizing && adaptation_cycles() < samples_count())) {
@ -472,8 +484,8 @@ void MutableNUMASpace::initialize(MemRegion mr, bool clear_space) {
intersection = MemRegion(new_region.start(), new_region.start());
}
select_tails(new_region, intersection, &bottom_region, &top_region);
bias_region(bottom_region);
bias_region(top_region);
bias_region(bottom_region, lgrp_spaces()->at(0)->lgrp_id());
bias_region(top_region, lgrp_spaces()->at(lgrp_spaces()->length() - 1)->lgrp_id());
}
// Check if the space layout has changed significantly?
@ -545,22 +557,37 @@ void MutableNUMASpace::initialize(MemRegion mr, bool clear_space) {
intersection = MemRegion(new_region.start(), new_region.start());
}
if (!os::numa_has_static_binding()) {
MemRegion invalid_region = ls->invalid_region().intersection(new_region);
// Invalid region is a range of memory that could've possibly
// been allocated on the other node. That's relevant only on Solaris where
// there is no static memory binding.
if (!invalid_region.is_empty()) {
merge_regions(new_region, &intersection, &invalid_region);
free_region(invalid_region);
ls->set_invalid_region(MemRegion());
}
}
select_tails(new_region, intersection, &bottom_region, &top_region);
if (!os::numa_has_static_binding()) {
// If that's a system with the first-touch policy then it's enough
// to free the pages.
free_region(bottom_region);
free_region(top_region);
} else {
// In a system with static binding we have to change the bias whenever
// we reshape the heap.
bias_region(bottom_region, ls->lgrp_id());
bias_region(top_region, ls->lgrp_id());
}
// If we clear the region, we would mangle it in debug. That would cause page
// allocation in a different place. Hence setting the top directly.
s->initialize(new_region, false);
s->set_top(s->bottom());
ls->set_invalid_region(MemRegion());
set_adaptation_cycles(samples_count());
}
}
@ -575,7 +602,7 @@ void MutableNUMASpace::set_top(HeapWord* value) {
HeapWord *top = MAX2((HeapWord*)round_down((intptr_t)s->top(), page_size()), s->bottom());
if (s->contains(value)) {
if (top < value && top < s->end()) {
if (!os::numa_has_static_binding() && top < value && top < s->end()) {
ls->add_invalid_region(MemRegion(top, value));
}
s->set_top(value);
@ -584,7 +611,7 @@ void MutableNUMASpace::set_top(HeapWord* value) {
if (found_top) {
s->set_top(s->bottom());
} else {
if (top < s->end()) {
if (!os::numa_has_static_binding() && top < s->end()) {
ls->add_invalid_region(MemRegion(top, s->end()));
}
s->set_top(s->end());
@ -601,11 +628,23 @@ void MutableNUMASpace::clear() {
}
}
/*
Linux supports static memory binding, so most of the logic that deals
with possibly invalid page placement is effectively disabled. In addition,
Linux has no notion of a home node: a thread is allowed to migrate freely,
although the scheduler is rather reluctant to move threads between nodes.
We therefore check the current node on every allocation. With high
probability a thread stays on the same node for some time, which allows
local access to recently allocated objects.
*/
HeapWord* MutableNUMASpace::allocate(size_t size) {
int lgrp_id = Thread::current()->lgrp_id();
if (lgrp_id == -1) {
Thread* thr = Thread::current();
int lgrp_id = thr->lgrp_id();
if (lgrp_id == -1 || !os::numa_has_group_homing()) {
lgrp_id = os::numa_get_group_id();
Thread::current()->set_lgrp_id(lgrp_id);
thr->set_lgrp_id(lgrp_id);
}
int i = lgrp_spaces()->find(&lgrp_id, LGRPSpace::equals);
@ -628,22 +667,22 @@ HeapWord* MutableNUMASpace::allocate(size_t size) {
MutableSpace::set_top(s->top());
}
}
// Make the page allocation happen here.
if (p != NULL) {
// Make the page allocation happen here if there is no static binding.
if (p != NULL && !os::numa_has_static_binding()) {
for (HeapWord *i = p; i < p + size; i += os::vm_page_size() >> LogHeapWordSize) {
*(int*)i = 0;
}
}
return p;
}
// This version is lock-free.
HeapWord* MutableNUMASpace::cas_allocate(size_t size) {
int lgrp_id = Thread::current()->lgrp_id();
if (lgrp_id == -1) {
Thread* thr = Thread::current();
int lgrp_id = thr->lgrp_id();
if (lgrp_id == -1 || !os::numa_has_group_homing()) {
lgrp_id = os::numa_get_group_id();
Thread::current()->set_lgrp_id(lgrp_id);
thr->set_lgrp_id(lgrp_id);
}
int i = lgrp_spaces()->find(&lgrp_id, LGRPSpace::equals);
@ -670,8 +709,8 @@ HeapWord* MutableNUMASpace::cas_allocate(size_t size) {
}
}
// Make the page allocation happen here.
if (p != NULL) {
// Make the page allocation happen here if there is no static binding.
if (p != NULL && !os::numa_has_static_binding() ) {
for (HeapWord *i = p; i < p + size; i += os::vm_page_size() >> LogHeapWordSize) {
*(int*)i = 0;
}

View file

@ -139,8 +139,8 @@ class MutableNUMASpace : public MutableSpace {
// Check if the NUMA topology has changed. Add and remove spaces if needed.
// The update can be forced by setting the force parameter equal to true.
bool update_layout(bool force);
// Bias region towards the first-touching lgrp.
void bias_region(MemRegion mr);
// Bias region towards the lgrp.
void bias_region(MemRegion mr, int lgrp_id);
// Free pages in a given region.
void free_region(MemRegion mr);
// Get current chunk size.

View file

@ -3181,6 +3181,7 @@ os_<os_family>.cpp events.hpp
os_<os_family>.cpp extendedPC.hpp
os_<os_family>.cpp filemap.hpp
os_<os_family>.cpp globals.hpp
os_<os_family>.cpp growableArray.hpp
os_<os_family>.cpp hpi.hpp
os_<os_family>.cpp icBuffer.hpp
os_<os_family>.cpp interfaceSupport.hpp

View file

@ -33,6 +33,7 @@ class JavaThread;
class Event;
class DLL;
class FileHandle;
template<class E> class GrowableArray;
// %%%%% Moved ThreadState, START_FN, OSThread to new osThread.hpp. -- Rose
@ -206,7 +207,9 @@ class os: AllStatic {
static void realign_memory(char *addr, size_t bytes, size_t alignment_hint);
// NUMA-specific interface
static void numa_make_local(char *addr, size_t bytes);
// Whether memory can be bound to a chosen locality group ahead of time, as
// opposed to being placed by first touch.
static bool numa_has_static_binding();
// Whether a thread has a home locality group. When false, callers re-query the
// current group because threads may migrate between nodes.
static bool numa_has_group_homing();
// lgrp_hint names the locality group the range should be made local to.
// NOTE(review): implementations with an empty body ignore the hint.
static void numa_make_local(char *addr, size_t bytes, int lgrp_hint);
static void numa_make_global(char *addr, size_t bytes);
static size_t numa_get_groups_num();
static size_t numa_get_leaf_groups(int *ids, size_t size);