mirror of
https://github.com/openjdk/jdk.git
synced 2025-09-16 00:54:38 +02:00
8230305: Cgroups v2: Container awareness
Implement Cgroups v2 container awareness in hotspot Reviewed-by: bobv, dholmes
This commit is contained in:
parent
71340f51fa
commit
d462a6b5c9
10 changed files with 1425 additions and 638 deletions
421
src/hotspot/os/linux/cgroupSubsystem_linux.cpp
Normal file
421
src/hotspot/os/linux/cgroupSubsystem_linux.cpp
Normal file
|
@ -0,0 +1,421 @@
|
|||
/*
|
||||
* Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <errno.h>
|
||||
#include "cgroupSubsystem_linux.hpp"
|
||||
#include "cgroupV1Subsystem_linux.hpp"
|
||||
#include "cgroupV2Subsystem_linux.hpp"
|
||||
#include "logging/log.hpp"
|
||||
#include "memory/allocation.hpp"
|
||||
#include "runtime/globals.hpp"
|
||||
#include "runtime/os.hpp"
|
||||
#include "utilities/globalDefinitions.hpp"
|
||||
|
||||
/* create
 *
 * Probe the running system for cgroup support and build the matching
 * CgroupSubsystem.
 *
 * Algorithm:
 *
 * Read /proc/cgroups to learn the hierarchy ID and enabled state of the
 * memory, cpu, cpuacct and cpuset controllers. If every hierarchy ID is 0
 * the system is treated as cgroups v2 (unified hierarchy), otherwise as
 * cgroups v1 (legacy or hybrid).
 *
 * Read /proc/self/cgroup to learn the cgroup path of this process, per
 * controller for v1 or the single unified path for v2.
 *
 * Read /proc/self/mountinfo to find the cgroup2 mount point (v2) or the
 * root and mount point of each controller (v1).
 *
 * Strings duplicated into cg_infos are released on every failure path.
 *
 * return:
 *    a new CgroupV2Subsystem or CgroupV1Subsystem or
 *    NULL if a /proc file cannot be opened, a required controller is
 *    disabled or missing, or no usable mount point / cgroup path is found
 */
CgroupSubsystem* CgroupSubsystemFactory::create() {
  CgroupV1MemoryController* memory = NULL;
  CgroupV1Controller* cpuset = NULL;
  CgroupV1Controller* cpu = NULL;
  CgroupV1Controller* cpuacct = NULL;
  FILE *mntinfo = NULL;
  FILE *cgroups = NULL;
  FILE *cgroup = NULL;
  char buf[MAXPATHLEN+1];
  char tmproot[MAXPATHLEN+1];
  char tmpmount[MAXPATHLEN+1];
  char *p;
  bool is_cgroupsV2;
  // true iff all controllers, memory, cpu, cpuset, cpuacct are enabled
  // at the kernel level.
  bool all_controllers_enabled;

  CgroupInfo cg_infos[CG_INFO_LENGTH];
  int cpuset_idx = 0;
  int cpu_idx = 1;
  int cpuacct_idx = 2;
  int memory_idx = 3;

  // Put every slot into a known state. A controller that is absent from
  // /proc/cgroups must read as "not enabled", and the cleanup code below
  // must be able to tell which strings were actually allocated.
  for (int i = 0; i < CG_INFO_LENGTH; i++) {
    cg_infos[i]._name = NULL;
    cg_infos[i]._hierarchy_id = -1;
    cg_infos[i]._enabled = false;
    cg_infos[i]._cgroup_path = NULL;
  }

  /*
   * Read /proc/cgroups so as to be able to distinguish cgroups v2 vs cgroups v1.
   *
   * For cgroups v1 unified hierarchy, cpu, cpuacct, cpuset, memory controllers
   * must have non-zero for the hierarchy ID field.
   */
  cgroups = fopen("/proc/cgroups", "r");
  if (cgroups == NULL) {
    log_debug(os, container)("Can't open /proc/cgroups, %s",
                             os::strerror(errno));
    // Nothing has been allocated yet, so a plain return is fine here.
    return NULL;
  }

  while ((p = fgets(buf, MAXPATHLEN, cgroups)) != NULL) {
    char name[MAXPATHLEN+1];
    int hierarchy_id;
    int enabled;

    // Format of /proc/cgroups documented via man 7 cgroups
    if (sscanf(p, "%s %d %*d %d", name, &hierarchy_id, &enabled) != 3) {
      // Header line or anything else that does not match the format.
      continue;
    }
    if (strcmp(name, "memory") == 0) {
      cg_infos[memory_idx]._name = os::strdup(name);
      cg_infos[memory_idx]._hierarchy_id = hierarchy_id;
      cg_infos[memory_idx]._enabled = (enabled == 1);
    } else if (strcmp(name, "cpuset") == 0) {
      cg_infos[cpuset_idx]._name = os::strdup(name);
      cg_infos[cpuset_idx]._hierarchy_id = hierarchy_id;
      cg_infos[cpuset_idx]._enabled = (enabled == 1);
    } else if (strcmp(name, "cpu") == 0) {
      cg_infos[cpu_idx]._name = os::strdup(name);
      cg_infos[cpu_idx]._hierarchy_id = hierarchy_id;
      cg_infos[cpu_idx]._enabled = (enabled == 1);
    } else if (strcmp(name, "cpuacct") == 0) {
      cg_infos[cpuacct_idx]._name = os::strdup(name);
      cg_infos[cpuacct_idx]._hierarchy_id = hierarchy_id;
      cg_infos[cpuacct_idx]._enabled = (enabled == 1);
    }
  }
  fclose(cgroups);

  is_cgroupsV2 = true;
  all_controllers_enabled = true;
  for (int i = 0; i < CG_INFO_LENGTH; i++) {
    is_cgroupsV2 = is_cgroupsV2 && cg_infos[i]._hierarchy_id == 0;
    all_controllers_enabled = all_controllers_enabled && cg_infos[i]._enabled;
  }

  if (!all_controllers_enabled) {
    // one or more controllers disabled, disable container support
    log_debug(os, container)("One or more required controllers disabled at kernel level.");
    goto fail;
  }

  /*
   * Read /proc/self/cgroup and determine:
   *  - the cgroup path for cgroups v2 or
   *  - on a cgroups v1 system, collect info for mapping
   *    the host mount point to the local one via /proc/self/mountinfo below.
   */
  cgroup = fopen("/proc/self/cgroup", "r");
  if (cgroup == NULL) {
    log_debug(os, container)("Can't open /proc/self/cgroup, %s",
                             os::strerror(errno));
    goto fail;
  }

  while ((p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) {
    char *controllers;
    char *token;
    char *hierarchy_id_str;
    int  hierarchy_id;
    char *cgroup_path;

    // Each line has the form "<hierarchy-id>:<controller-list>:<path>".
    hierarchy_id_str = strsep(&p, ":");
    hierarchy_id = atoi(hierarchy_id_str);
    /* Get controllers and base */
    controllers = strsep(&p, ":");
    cgroup_path = strsep(&p, "\n");

    // A line without both ':' separators yields NULL here; skip it rather
    // than handing a NULL path to os::strdup below.
    if (controllers == NULL || cgroup_path == NULL) {
      continue;
    }

    while (!is_cgroupsV2 && (token = strsep(&controllers, ",")) != NULL) {
      if (strcmp(token, "memory") == 0) {
        assert(hierarchy_id == cg_infos[memory_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");
        cg_infos[memory_idx]._cgroup_path = os::strdup(cgroup_path);
      } else if (strcmp(token, "cpuset") == 0) {
        assert(hierarchy_id == cg_infos[cpuset_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");
        cg_infos[cpuset_idx]._cgroup_path = os::strdup(cgroup_path);
      } else if (strcmp(token, "cpu") == 0) {
        assert(hierarchy_id == cg_infos[cpu_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");
        cg_infos[cpu_idx]._cgroup_path = os::strdup(cgroup_path);
      } else if (strcmp(token, "cpuacct") == 0) {
        assert(hierarchy_id == cg_infos[cpuacct_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch");
        cg_infos[cpuacct_idx]._cgroup_path = os::strdup(cgroup_path);
      }
    }
    if (is_cgroupsV2) {
      // All controllers share the unified path. The last matching line
      // wins; release the copy from any earlier line so it is not leaked.
      for (int i = 0; i < CG_INFO_LENGTH; i++) {
        if (cg_infos[i]._cgroup_path != NULL) {
          os::free(cg_infos[i]._cgroup_path);
        }
        cg_infos[i]._cgroup_path = os::strdup(cgroup_path);
      }
    }
  }
  fclose(cgroup);

  if (is_cgroupsV2) {
    // Without a cgroup path there is nothing to build the controller from.
    if (cg_infos[memory_idx]._cgroup_path == NULL) {
      log_debug(os, container)("Cgroup path for cgroupv2 not found in /proc/self/cgroup");
      goto fail;
    }

    // Find the cgroup2 mount point by reading /proc/self/mountinfo
    mntinfo = fopen("/proc/self/mountinfo", "r");
    if (mntinfo == NULL) {
      log_debug(os, container)("Can't open /proc/self/mountinfo, %s",
                               os::strerror(errno));
      goto fail;
    }

    char cgroupv2_mount[MAXPATHLEN+1];
    char fstype[MAXPATHLEN+1];
    bool mount_point_found = false;
    while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {
      char *tmp_mount_point = cgroupv2_mount;
      char *tmp_fs_type = fstype;

      // mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt
      if (sscanf(p, "%*d %*d %*d:%*d %*s %s %*[^-]- %s cgroup2 %*s", tmp_mount_point, tmp_fs_type) == 2) {
        // we likely have an early match return, be sure we have cgroup2 as fstype
        if (strcmp("cgroup2", tmp_fs_type) == 0) {
          mount_point_found = true;
          break;
        }
      }
    }
    fclose(mntinfo);
    if (!mount_point_found) {
      log_trace(os, container)("Mount point for cgroupv2 not found in /proc/self/mountinfo");
      goto fail;
    }
    // Cgroups v2 case, we have all the info we need.
    // Construct the subsystem, free resources and return
    // Note: any index in cg_infos will do as the path is the same for
    //       all controllers.
    CgroupController* unified = new CgroupV2Controller(cgroupv2_mount, cg_infos[memory_idx]._cgroup_path);
    for (int i = 0; i < CG_INFO_LENGTH; i++) {
      os::free(cg_infos[i]._name);
      os::free(cg_infos[i]._cgroup_path);
    }
    log_debug(os, container)("Detected cgroups v2 unified hierarchy");
    return new CgroupV2Subsystem(unified);
  }

  // What follows is cgroups v1
  log_debug(os, container)("Detected cgroups hybrid or legacy hierarchy, using cgroups v1 controllers");

  /*
   * Find the cgroup mount point for memory and cpuset
   * by reading /proc/self/mountinfo
   *
   * Example for docker:
   * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
   *
   * Example for host:
   * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory
   */
  mntinfo = fopen("/proc/self/mountinfo", "r");
  if (mntinfo == NULL) {
    log_debug(os, container)("Can't open /proc/self/mountinfo, %s",
                             os::strerror(errno));
    goto fail;
  }

  while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {
    char tmpcgroups[MAXPATHLEN+1];
    char *cptr = tmpcgroups;
    char *token;

    // mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt
    if (sscanf(p, "%*d %*d %*d:%*d %s %s %*[^-]- cgroup %*s %s", tmproot, tmpmount, tmpcgroups) != 3) {
      continue;
    }
    // The last field lists the controllers attached to this mount.
    while ((token = strsep(&cptr, ",")) != NULL) {
      if (strcmp(token, "memory") == 0) {
        memory = new CgroupV1MemoryController(tmproot, tmpmount);
      } else if (strcmp(token, "cpuset") == 0) {
        cpuset = new CgroupV1Controller(tmproot, tmpmount);
      } else if (strcmp(token, "cpu") == 0) {
        cpu = new CgroupV1Controller(tmproot, tmpmount);
      } else if (strcmp(token, "cpuacct") == 0) {
        cpuacct= new CgroupV1Controller(tmproot, tmpmount);
      }
    }
  }

  fclose(mntinfo);

  if (memory == NULL) {
    log_debug(os, container)("Required cgroup v1 memory subsystem not found");
    goto fail;
  }
  if (cpuset == NULL) {
    log_debug(os, container)("Required cgroup v1 cpuset subsystem not found");
    goto fail;
  }
  if (cpu == NULL) {
    log_debug(os, container)("Required cgroup v1 cpu subsystem not found");
    goto fail;
  }
  if (cpuacct == NULL) {
    log_debug(os, container)("Required cgroup v1 cpuacct subsystem not found");
    goto fail;
  }

  /*
   * Use info gathered previously from /proc/self/cgroup
   * and map host mount point to
   * local one via /proc/self/mountinfo content above
   *
   * Docker example:
   * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044
   *
   * Host example:
   * 5:memory:/user.slice
   *
   * Construct a path to the process specific memory and cpuset
   * cgroup directory.
   *
   * For a container running under Docker from memory example above
   * the paths would be:
   *
   * /sys/fs/cgroup/memory
   *
   * For a Host from memory example above the path would be:
   *
   * /sys/fs/cgroup/memory/user.slice
   *
   */
  for (int i = 0; i < CG_INFO_LENGTH; i++) {
    CgroupInfo info = cg_infos[i];
    if (strcmp(info._name, "memory") == 0) {
      memory->set_subsystem_path(info._cgroup_path);
    } else if (strcmp(info._name, "cpuset") == 0) {
      cpuset->set_subsystem_path(info._cgroup_path);
    } else if (strcmp(info._name, "cpu") == 0) {
      cpu->set_subsystem_path(info._cgroup_path);
    } else if (strcmp(info._name, "cpuacct") == 0) {
      cpuacct->set_subsystem_path(info._cgroup_path);
    }
  }
  // The names were only needed for the dispatch above.
  // NOTE(review): _cgroup_path is deliberately not freed here; whether
  // set_subsystem_path() copies its argument is not visible from this
  // file - confirm before releasing it as well.
  for (int i = 0; i < CG_INFO_LENGTH; i++) {
    os::free(cg_infos[i]._name);
  }
  return new CgroupV1Subsystem(cpuset, cpu, cpuacct, memory);

fail:
  // Common failure exit: release whatever strings were duplicated into
  // cg_infos so far and report "no container support".
  for (int i = 0; i < CG_INFO_LENGTH; i++) {
    if (cg_infos[i]._name != NULL) {
      os::free(cg_infos[i]._name);
    }
    if (cg_infos[i]._cgroup_path != NULL) {
      os::free(cg_infos[i]._cgroup_path);
    }
  }
  return NULL;
}
|
||||
|
||||
/* active_processor_count
|
||||
*
|
||||
* Calculate an appropriate number of active processors for the
|
||||
* VM to use based on these three inputs.
|
||||
*
|
||||
* cpu affinity
|
||||
* cgroup cpu quota & cpu period
|
||||
* cgroup cpu shares
|
||||
*
|
||||
* Algorithm:
|
||||
*
|
||||
* Determine the number of available CPUs from sched_getaffinity
|
||||
*
|
||||
* If user specified a quota (quota != -1), calculate the number of
|
||||
* required CPUs by dividing quota by period.
|
||||
*
|
||||
* If shares are in effect (shares != -1), calculate the number
|
||||
* of CPUs required for the shares by dividing the share value
|
||||
* by PER_CPU_SHARES.
|
||||
*
|
||||
* All results of division are rounded up to the next whole number.
|
||||
*
|
||||
* If neither shares or quotas have been specified, return the
|
||||
* number of active processors in the system.
|
||||
*
|
||||
* If both shares and quotas have been specified, the results are
|
||||
* based on the flag PreferContainerQuotaForCPUCount. If true,
|
||||
* return the quota value. If false return the smallest value
|
||||
* between shares or quotas.
|
||||
*
|
||||
* If shares and/or quotas have been specified, the resulting number
|
||||
* returned will never exceed the number of active processors.
|
||||
*
|
||||
* return:
|
||||
* number of CPUs
|
||||
*/
|
||||
int CgroupSubsystem::active_processor_count() {
|
||||
int quota_count = 0, share_count = 0;
|
||||
int cpu_count, limit_count;
|
||||
int result;
|
||||
|
||||
// We use a cache with a timeout to avoid performing expensive
|
||||
// computations in the event this function is called frequently.
|
||||
// [See 8227006].
|
||||
CachingCgroupController* contrl = cpu_controller();
|
||||
CachedMetric* cpu_limit = contrl->metrics_cache();
|
||||
if (!cpu_limit->should_check_metric()) {
|
||||
int val = (int)cpu_limit->value();
|
||||
log_trace(os, container)("CgroupSubsystem::active_processor_count (cached): %d", val);
|
||||
return val;
|
||||
}
|
||||
|
||||
cpu_count = limit_count = os::Linux::active_processor_count();
|
||||
int quota = cpu_quota();
|
||||
int period = cpu_period();
|
||||
int share = cpu_shares();
|
||||
|
||||
if (quota > -1 && period > 0) {
|
||||
quota_count = ceilf((float)quota / (float)period);
|
||||
log_trace(os, container)("CPU Quota count based on quota/period: %d", quota_count);
|
||||
}
|
||||
if (share > -1) {
|
||||
share_count = ceilf((float)share / (float)PER_CPU_SHARES);
|
||||
log_trace(os, container)("CPU Share count based on shares: %d", share_count);
|
||||
}
|
||||
|
||||
// If both shares and quotas are setup results depend
|
||||
// on flag PreferContainerQuotaForCPUCount.
|
||||
// If true, limit CPU count to quota
|
||||
// If false, use minimum of shares and quotas
|
||||
if (quota_count !=0 && share_count != 0) {
|
||||
if (PreferContainerQuotaForCPUCount) {
|
||||
limit_count = quota_count;
|
||||
} else {
|
||||
limit_count = MIN2(quota_count, share_count);
|
||||
}
|
||||
} else if (quota_count != 0) {
|
||||
limit_count = quota_count;
|
||||
} else if (share_count != 0) {
|
||||
limit_count = share_count;
|
||||
}
|
||||
|
||||
result = MIN2(cpu_count, limit_count);
|
||||
log_trace(os, container)("OSContainer::active_processor_count: %d", result);
|
||||
|
||||
// Update cached metric to avoid re-reading container settings too often
|
||||
cpu_limit->set_value(result, OSCONTAINER_CACHE_TIMEOUT);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/* memory_limit_in_bytes
|
||||
*
|
||||
* Return the limit of available memory for this process.
|
||||
*
|
||||
* return:
|
||||
* memory limit in bytes or
|
||||
* -1 for unlimited
|
||||
* OSCONTAINER_ERROR for not supported
|
||||
*/
|
||||
jlong CgroupSubsystem::memory_limit_in_bytes() {
|
||||
CachingCgroupController* contrl = memory_controller();
|
||||
CachedMetric* memory_limit = contrl->metrics_cache();
|
||||
if (!memory_limit->should_check_metric()) {
|
||||
return memory_limit->value();
|
||||
}
|
||||
jlong mem_limit = read_memory_limit_in_bytes();
|
||||
// Update cached metric to avoid re-reading container settings too often
|
||||
memory_limit->set_value(mem_limit, OSCONTAINER_CACHE_TIMEOUT);
|
||||
return mem_limit;
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue