From 6a9e2fb1bab53b54d02714a2ee3c6612d19629ce Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 6 Jun 2025 11:45:07 +0200 Subject: [PATCH 1/4] nsfs: move root inode number to uapi Userspace relies on the root inode numbers to identify the initial namespaces. That's already a hard dependency. So we cannot change that anymore. Move the initial inode numbers to a public header. Link: https://github.com/systemd/systemd/commit/d293fade24b34ccc2f5716b0ff5513e9533cf0c4 Link: https://lore.kernel.org/20250606-work-nsfs-v1-1-b8749c9a8844@kernel.org Signed-off-by: Christian Brauner --- include/linux/proc_ns.h | 13 +++++++------ include/uapi/linux/nsfs.h | 9 +++++++++ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 5ea470eb4d76..e77a37b23ca7 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -6,6 +6,7 @@ #define _LINUX_PROC_NS_H #include +#include struct pid_namespace; struct nsset; @@ -40,12 +41,12 @@ extern const struct proc_ns_operations timens_for_children_operations; */ enum { PROC_ROOT_INO = 1, - PROC_IPC_INIT_INO = 0xEFFFFFFFU, - PROC_UTS_INIT_INO = 0xEFFFFFFEU, - PROC_USER_INIT_INO = 0xEFFFFFFDU, - PROC_PID_INIT_INO = 0xEFFFFFFCU, - PROC_CGROUP_INIT_INO = 0xEFFFFFFBU, - PROC_TIME_INIT_INO = 0xEFFFFFFAU, + PROC_IPC_INIT_INO = IPC_NS_INIT_INO, + PROC_UTS_INIT_INO = UTS_NS_INIT_INO, + PROC_USER_INIT_INO = USER_NS_INIT_INO, + PROC_PID_INIT_INO = PID_NS_INIT_INO, + PROC_CGROUP_INIT_INO = CGROUP_NS_INIT_INO, + PROC_TIME_INIT_INO = TIME_NS_INIT_INO, }; #ifdef CONFIG_PROC_FS diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 34127653fd00..6683e7ca3996 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -42,4 +42,13 @@ struct mnt_ns_info { /* Get previous namespace. */ #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) +enum init_ns_ino { + IPC_NS_INIT_INO = 0xEFFFFFFFU, + UTS_NS_INIT_INO = 0xEFFFFFFEU, + USER_NS_INIT_INO = 0xEFFFFFFDU, + PID_NS_INIT_INO = 0xEFFFFFFCU, + CGROUP_NS_INIT_INO = 0xEFFFFFFBU, + TIME_NS_INIT_INO = 0xEFFFFFFAU, +}; + #endif /* __LINUX_NSFS_H */ From 9b0240b3ccc325c7a96cf362877180bc9e10d546 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 6 Jun 2025 11:45:08 +0200 Subject: [PATCH 2/4] netns: use stable inode number for initial mount ns Apart from the network and mount namespace all other namespaces expose a stable inode number and userspace has been relying on that for a very long time now. It's very much heavily used API. Align the network namespace and use a stable inode number from the reserved procfs inode number space so this is consistent across all namespaces. Link: https://lore.kernel.org/20250606-work-nsfs-v1-2-b8749c9a8844@kernel.org Reviewed-by: Jakub Kicinski Signed-off-by: Christian Brauner --- include/linux/proc_ns.h | 1 + include/uapi/linux/nsfs.h | 1 + net/core/net_namespace.c | 8 ++++++++ 3 files changed, 10 insertions(+) diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index e77a37b23ca7..3ff0bd381704 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -47,6 +47,7 @@ enum { PROC_PID_INIT_INO = PID_NS_INIT_INO, PROC_CGROUP_INIT_INO = CGROUP_NS_INIT_INO, PROC_TIME_INIT_INO = TIME_NS_INIT_INO, + PROC_NET_INIT_INO = NET_NS_INIT_INO, }; #ifdef CONFIG_PROC_FS diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 6683e7ca3996..393778489d85 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -49,6 +49,7 @@ enum init_ns_ino { PID_NS_INIT_INO = 0xEFFFFFFCU, CGROUP_NS_INIT_INO = 0xEFFFFFFBU, TIME_NS_INIT_INO = 0xEFFFFFFAU, + NET_NS_INIT_INO = 0xEFFFFFF9U, }; #endif /* __LINUX_NSFS_H */ diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ae54f26709ca..03cf87d3b380 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -796,11 +796,19 @@ static __net_init int net_ns_net_init(struct net *net) #ifdef CONFIG_NET_NS net->ns.ops = &netns_operations; #endif + if (net == &init_net) { + net->ns.inum = PROC_NET_INIT_INO; + return 0; + } return ns_alloc_inum(&net->ns); } static __net_exit void net_ns_net_exit(struct net *net) { + /* + * Initial network namespace doesn't exit so we don't need any + * special checks here. + */ ns_free_inum(&net->ns); } From 7f4f229195b73606ded77e56943f463b78adf635 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 6 Jun 2025 11:45:09 +0200 Subject: [PATCH 3/4] mntns: use stable inode number for initial mount ns Apart from the network and mount namespace all other namespaces expose a stable inode number and userspace has been relying on that for a very long time now. It's very much heavily used API. Align the mount namespace and use a stable inode number from the reserved procfs inode number space so this is consistent across all namespaces. Link: https://lore.kernel.org/20250606-work-nsfs-v1-3-b8749c9a8844@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 4 +++- include/linux/proc_ns.h | 1 + include/uapi/linux/nsfs.h | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index e13d9ab4f564..7ca4612c7ae9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6203,9 +6203,11 @@ static void __init init_mount_tree(void) if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = alloc_mnt_ns(&init_user_ns, false); + ns = alloc_mnt_ns(&init_user_ns, true); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); + ns->seq = atomic64_inc_return(&mnt_ns_seq); + ns->ns.inum = PROC_MNT_INIT_INO; m = real_mount(mnt); ns->root = m; ns->nr_mounts = 1; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 3ff0bd381704..6258455e49a4 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -48,6 +48,7 @@ enum { PROC_CGROUP_INIT_INO = CGROUP_NS_INIT_INO, PROC_TIME_INIT_INO = TIME_NS_INIT_INO, PROC_NET_INIT_INO = NET_NS_INIT_INO, + PROC_MNT_INIT_INO = MNT_NS_INIT_INO, }; #ifdef CONFIG_PROC_FS diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 393778489d85..97d8d80d139f 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -50,6 +50,7 @@ enum init_ns_ino { CGROUP_NS_INIT_INO = 0xEFFFFFFBU, TIME_NS_INIT_INO = 0xEFFFFFFAU, NET_NS_INIT_INO = 0xEFFFFFF9U, + MNT_NS_INIT_INO = 0xEFFFFFF8U, }; #endif /* __LINUX_NSFS_H */ From 76fdb7eb4e1c91086ce9c3db6972c2ed48c96afb Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 8 Jul 2025 23:21:51 +1000 Subject: [PATCH 4/4] uapi: export PROCFS_ROOT_INO The root inode of /proc having a fixed inode number has been part of the core kernel ABI since its inception, and recently some userspace programs (mainly container runtimes) have started to explicitly depend on this behaviour. The main reason this is useful to userspace is that by checking that a suspect /proc handle has fstype PROC_SUPER_MAGIC and is PROCFS_ROOT_INO, they can then use openat2(RESOLVE_{NO_{XDEV,MAGICLINK},BENEATH}) to ensure that there isn't a bind-mount that replaces some procfs file with a different one. This kind of attack has lead to security issues in container runtimes in the past (such as CVE-2019-19921) and libraries like libpathrs[1] use this feature of procfs to provide safe procfs handling functions. There was also some trailing whitespace in the "struct proc_dir_entry" initialiser, so fix that up as well. [1]: https://github.com/openSUSE/libpathrs Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250708-uapi-procfs-root-ino-v1-1-6ae61e97c79b@cyphar.com Signed-off-by: Christian Brauner --- fs/proc/root.c | 10 +++++----- include/linux/proc_ns.h | 1 - include/uapi/linux/fs.h | 11 +++++++++++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/proc/root.c b/fs/proc/root.c index 06a297a27ba3..ed86ac710384 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -363,12 +363,12 @@ static const struct inode_operations proc_root_inode_operations = { * This is the root "inode" in the /proc tree.. */ struct proc_dir_entry proc_root = { - .low_ino = PROC_ROOT_INO, - .namelen = 5, - .mode = S_IFDIR | S_IRUGO | S_IXUGO, - .nlink = 2, + .low_ino = PROCFS_ROOT_INO, + .namelen = 5, + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2, .refcnt = REFCOUNT_INIT(1), - .proc_iops = &proc_root_inode_operations, + .proc_iops = &proc_root_inode_operations, .proc_dir_ops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 6258455e49a4..4b20375f3783 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -40,7 +40,6 @@ extern const struct proc_ns_operations timens_for_children_operations; * We always define these enumerators */ enum { - PROC_ROOT_INO = 1, PROC_IPC_INIT_INO = IPC_NS_INIT_INO, PROC_UTS_INIT_INO = UTS_NS_INIT_INO, PROC_USER_INIT_INO = USER_NS_INIT_INO, diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 0098b0ce8ccb..28238a3edbc1 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -60,6 +60,17 @@ #define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ #define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ +/* + * The root inode of procfs is guaranteed to always have the same inode number. + * For programs that make heavy use of procfs, verifying that the root is a + * real procfs root and using openat2(RESOLVE_{NO_{XDEV,MAGICLINKS},BENEATH}) + * will allow you to make sure you are never tricked into operating on the + * wrong procfs file. + */ +enum procfs_ino { + PROCFS_ROOT_INO = 1, +}; + struct file_clone_range { __s64 src_fd; __u64 src_offset;