From 1a1ad73aa1a66787f05f7f10f686b74bab77be72 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:35 +0200 Subject: [PATCH 01/37] pidfs: raise SB_I_NODEV and SB_I_NOEXEC Similar to commit 1ed95281c0c7 ("anon_inode: raise SB_I_NODEV and SB_I_NOEXEC"): it shouldn't be possible to execute pidfds via execveat(fd_anon_inode, "", NULL, NULL, AT_EMPTY_PATH) so raise SB_I_NOEXEC so that no one gets any creative ideas. Also raise SB_I_NODEV as we don't expect or support any devices on pidfs. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-1-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/pidfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/pidfs.c b/fs/pidfs.c index c1f0a067be40..ff2560b34ed1 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -891,6 +891,8 @@ static int pidfs_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; + fc->s_iflags |= SB_I_NOEXEC; + fc->s_iflags |= SB_I_NODEV; ctx->ops = &pidfs_sops; ctx->eops = &pidfs_export_operations; ctx->dops = &pidfs_dentry_operations; From bda3f1608d993419fa247dc11263fc931ceca58a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:36 +0200 Subject: [PATCH 02/37] libfs: massage path_from_stashed() to allow custom stashing behavior * Add a callback to struct stashed_operations so it's possible to implement custom behavior for pidfs and allow for it to return errors. * Teach stashed_dentry_get() to handle error pointers. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-2-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/internal.h | 3 +++ fs/libfs.c | 27 ++++++++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index 393f6c5c24f6..22ba066d1dba 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -322,12 +322,15 @@ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); struct stashed_operations { + struct dentry *(*stash_dentry)(struct dentry **stashed, + struct dentry *dentry); void (*put_data)(void *data); int (*init_inode)(struct inode *inode, void *data); }; int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); +struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry); struct dentry *stashed_dentry_get(struct dentry **stashed); /** * path_mounted - check whether path is mounted diff --git a/fs/libfs.c b/fs/libfs.c index 9ea0ecc325a8..3541e22c87b5 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -2128,6 +2128,8 @@ struct dentry *stashed_dentry_get(struct dentry **stashed) dentry = rcu_dereference(*stashed); if (!dentry) return NULL; + if (IS_ERR(dentry)) + return dentry; if (!lockref_get_not_dead(&dentry->d_lockref)) return NULL; return dentry; @@ -2176,8 +2178,7 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed, return dentry; } -static struct dentry *stash_dentry(struct dentry **stashed, - struct dentry *dentry) +struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry) { guard(rcu)(); for (;;) { @@ -2218,12 +2219,15 @@ static struct dentry *stash_dentry(struct dentry **stashed, int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path) { - struct dentry *dentry; + struct dentry *dentry, *res; const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; /* See if dentry can be reused. */ - path->dentry = stashed_dentry_get(stashed); - if (path->dentry) { + res = stashed_dentry_get(stashed); + if (IS_ERR(res)) + return PTR_ERR(res); + if (res) { + path->dentry = res; sops->put_data(data); goto out_path; } @@ -2234,8 +2238,17 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, return PTR_ERR(dentry); /* Added a new dentry. @data is now owned by the filesystem. */ - path->dentry = stash_dentry(stashed, dentry); - if (path->dentry != dentry) + if (sops->stash_dentry) + res = sops->stash_dentry(stashed, dentry); + else + res = stash_dentry(stashed, dentry); + if (IS_ERR(res)) { + dput(dentry); + return PTR_ERR(res); + } + path->dentry = res; + /* A dentry was reused. */ + if (res != dentry) dput(dentry); out_path: From 23cdee615c4fdad1a8ec6f317b3c294cb37d662d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:37 +0200 Subject: [PATCH 03/37] libfs: massage path_from_stashed() Make it a littler easier to follow. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-3-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/libfs.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 3541e22c87b5..997d3a363ce6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -2227,9 +2227,8 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, if (IS_ERR(res)) return PTR_ERR(res); if (res) { - path->dentry = res; sops->put_data(data); - goto out_path; + goto make_path; } /* Allocate a new dentry. */ @@ -2246,15 +2245,14 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, dput(dentry); return PTR_ERR(res); } - path->dentry = res; - /* A dentry was reused. */ if (res != dentry) dput(dentry); -out_path: - WARN_ON_ONCE(path->dentry->d_fsdata != stashed); - WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); +make_path: + path->dentry = res; path->mnt = mntget(mnt); + VFS_WARN_ON_ONCE(path->dentry->d_fsdata != stashed); + VFS_WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); return 0; } From 75215c972581d3934e76a57690cf838d7ceab399 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:38 +0200 Subject: [PATCH 04/37] pidfs: move to anonymous struct Move the pidfs entries to an anonymous struct. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-4-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- include/linux/pid.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/pid.h b/include/linux/pid.h index 453ae6d8a68d..00646a692dd4 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -52,14 +52,15 @@ struct upid { struct pid_namespace *ns; }; -struct pid -{ +struct pid { refcount_t count; unsigned int level; spinlock_t lock; - struct dentry *stashed; - u64 ino; - struct rb_node pidfs_node; + struct { + u64 ino; + struct rb_node pidfs_node; + struct dentry *stashed; + }; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; From 8ec7c826d97b390879df2a03dfb035c70af86779 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:39 +0200 Subject: [PATCH 05/37] pidfs: persist information Persist exit and coredump information independent of whether anyone currently holds a pidfd for the struct pid. The current scheme allocated pidfs dentries on-demand repeatedly. This scheme is reaching it's limits as it makes it impossible to pin information that needs to be available after the task has exited or coredumped and that should not be lost simply because the pidfd got closed temporarily. The next opener should still see the stashed information. This is also a prerequisite for supporting extended attributes on pidfds to allow attaching meta information to them. If someone opens a pidfd for a struct pid a pidfs dentry is allocated and stashed in pid->stashed. Once the last pidfd for the struct pid is closed the pidfs dentry is released and removed from pid->stashed. So if 10 callers create a pidfs dentry for the same struct pid sequentially, i.e., each closing the pidfd before the other creates a new one then a new pidfs dentry is allocated every time. Because multiple tasks acquiring and releasing a pidfd for the same struct pid can race with each another a task may still find a valid pidfs entry from the previous task in pid->stashed and reuse it. Or it might find a dead dentry in there and fail to reuse it and so stashes a new pidfs dentry. Multiple tasks may race to stash a new pidfs dentry but only one will succeed, the other ones will put their dentry. The current scheme aims to ensure that a pidfs dentry for a struct pid can only be created if the task is still alive or if a pidfs dentry already existed before the task was reaped and so exit information has been was stashed in the pidfs inode. That's great except that it's buggy. If a pidfs dentry is stashed in pid->stashed after pidfs_exit() but before __unhash_process() is called we will return a pidfd for a reaped task without exit information being available. The pidfds_pid_valid() check does not guard against this race as it doens't sync at all with pidfs_exit(). The pid_has_task() check might be successful simply because we're before __unhash_process() but after pidfs_exit(). Introduce a new scheme where the lifetime of information associated with a pidfs entry (coredump and exit information) isn't bound to the lifetime of the pidfs inode but the struct pid itself. The first time a pidfs dentry is allocated for a struct pid a struct pidfs_attr will be allocated which will be used to store exit and coredump information. If all pidfs for the pidfs dentry are closed the dentry and inode can be cleaned up but the struct pidfs_attr will stick until the struct pid itself is freed. This will ensure minimal memory usage while persisting relevant information. The new scheme has various advantages. First, it allows to close the race where we end up handing out a pidfd for a reaped task for which no exit information is available. Second, it minimizes memory usage. Third, it allows to remove complex lifetime tracking via dentries when registering a struct pid with pidfs. There's no need to get or put a reference. Instead, the lifetime of exit and coredump information associated with a struct pid is bound to the lifetime of struct pid itself. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-5-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/pidfs.c | 216 +++++++++++++++++++++++++++++------------- include/linux/pid.h | 3 + include/linux/pidfs.h | 1 + kernel/pid.c | 2 +- 4 files changed, 153 insertions(+), 69 deletions(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index ff2560b34ed1..6a907457b1fe 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -25,7 +25,10 @@ #include "internal.h" #include "mount.h" +#define PIDFS_PID_DEAD ERR_PTR(-ESRCH) + static struct kmem_cache *pidfs_cachep __ro_after_init; +static struct kmem_cache *pidfs_attr_cachep __ro_after_init; /* * Stashes information that userspace needs to access even after the @@ -37,6 +40,11 @@ struct pidfs_exit_info { __u32 coredump_mask; }; +struct pidfs_attr { + struct pidfs_exit_info __pei; + struct pidfs_exit_info *exit_info; +}; + struct pidfs_inode { struct pidfs_exit_info __pei; struct pidfs_exit_info *exit_info; @@ -125,6 +133,7 @@ void pidfs_add_pid(struct pid *pid) pid->ino = pidfs_ino_nr; pid->stashed = NULL; + pid->attr = NULL; pidfs_ino_nr++; write_seqcount_begin(&pidmap_lock_seq); @@ -139,6 +148,18 @@ void pidfs_remove_pid(struct pid *pid) write_seqcount_end(&pidmap_lock_seq); } +void pidfs_free_pid(struct pid *pid) +{ + /* + * Any dentry must've been wiped from the pid by now. + * Otherwise there's a reference count bug. + */ + VFS_WARN_ON_ONCE(pid->stashed); + + if (!IS_ERR(pid->attr)) + kfree(pid->attr); +} + #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd @@ -261,13 +282,13 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags) static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; - struct inode *inode = file_inode(file); struct pid *pid = pidfd_pid(file); size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; struct pidfs_exit_info *exit_info; struct user_namespace *user_ns; struct task_struct *task; + struct pidfs_attr *attr; const struct cred *c; __u64 mask; @@ -286,8 +307,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if (!pid_in_current_pidns(pid)) return -ESRCH; + attr = READ_ONCE(pid->attr); if (mask & PIDFD_INFO_EXIT) { - exit_info = READ_ONCE(pidfs_i(inode)->exit_info); + exit_info = READ_ONCE(attr->exit_info); if (exit_info) { kinfo.mask |= PIDFD_INFO_EXIT; #ifdef CONFIG_CGROUPS @@ -300,7 +322,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if (mask & PIDFD_INFO_COREDUMP) { kinfo.mask |= PIDFD_INFO_COREDUMP; - kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask); + kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask); } task = get_pid_task(pid, PIDTYPE_PID); @@ -552,41 +574,61 @@ struct pid *pidfd_pid(const struct file *file) * task has been reaped which cannot happen until we're out of * release_task(). * - * If this struct pid is referred to by a pidfd then - * stashed_dentry_get() will return the dentry and inode for that struct - * pid. Since we've taken a reference on it there's now an additional - * reference from the exit path on it. Which is fine. We're going to put - * it again in a second and we know that the pid is kept alive anyway. + * If this struct pid has at least once been referred to by a pidfd then + * pid->attr will be allocated. If not we mark the struct pid as dead so + * anyone who is trying to register it with pidfs will fail to do so. + * Otherwise we would hand out pidfs for reaped tasks without having + * exit information available. * - * Worst case is that we've filled in the info and immediately free the - * dentry and inode afterwards since the pidfd has been closed. Since + * Worst case is that we've filled in the info and the pid gets freed + * right away in free_pid() when no one holds a pidfd anymore. Since * pidfs_exit() currently is placed after exit_task_work() we know that - * it cannot be us aka the exiting task holding a pidfd to ourselves. + * it cannot be us aka the exiting task holding a pidfd to itself. */ void pidfs_exit(struct task_struct *tsk) { - struct dentry *dentry; + struct pid *pid = task_pid(tsk); + struct pidfs_attr *attr; + struct pidfs_exit_info *exit_info; +#ifdef CONFIG_CGROUPS + struct cgroup *cgrp; +#endif might_sleep(); - dentry = stashed_dentry_get(&task_pid(tsk)->stashed); - if (dentry) { - struct inode *inode = d_inode(dentry); - struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei; -#ifdef CONFIG_CGROUPS - struct cgroup *cgrp; - - rcu_read_lock(); - cgrp = task_dfl_cgroup(tsk); - exit_info->cgroupid = cgroup_id(cgrp); - rcu_read_unlock(); -#endif - exit_info->exit_code = tsk->exit_code; - - /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ - smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei); - dput(dentry); + guard(spinlock_irq)(&pid->wait_pidfd.lock); + attr = pid->attr; + if (!attr) { + /* + * No one ever held a pidfd for this struct pid. + * Mark it as dead so no one can add a pidfs + * entry anymore. We're about to be reaped and + * so no exit information would be available. + */ + pid->attr = PIDFS_PID_DEAD; + return; } + + /* + * If @pid->attr is set someone might still legitimately hold a + * pidfd to @pid or someone might concurrently still be getting + * a reference to an already stashed dentry from @pid->stashed. + * So defer cleaning @pid->attr until the last reference to @pid + * is put + */ + + exit_info = &attr->__pei; + +#ifdef CONFIG_CGROUPS + rcu_read_lock(); + cgrp = task_dfl_cgroup(tsk); + exit_info->cgroupid = cgroup_id(cgrp); + rcu_read_unlock(); +#endif + exit_info->exit_code = tsk->exit_code; + + /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ + smp_store_release(&attr->exit_info, &attr->__pei); } #ifdef CONFIG_COREDUMP @@ -594,16 +636,15 @@ void pidfs_coredump(const struct coredump_params *cprm) { struct pid *pid = cprm->pid; struct pidfs_exit_info *exit_info; - struct dentry *dentry; - struct inode *inode; + struct pidfs_attr *attr; __u32 coredump_mask = 0; - dentry = pid->stashed; - if (WARN_ON_ONCE(!dentry)) - return; + attr = READ_ONCE(pid->attr); - inode = d_inode(dentry); - exit_info = &pidfs_i(inode)->__pei; + VFS_WARN_ON_ONCE(!attr); + VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD); + + exit_info = &attr->__pei; /* Note how we were coredumped. */ coredump_mask = pidfs_coredump_mask(cprm->mm_flags); /* Note that we actually did coredump. */ @@ -663,7 +704,7 @@ static struct inode *pidfs_alloc_inode(struct super_block *sb) static void pidfs_free_inode(struct inode *inode) { - kmem_cache_free(pidfs_cachep, pidfs_i(inode)); + kfree(pidfs_i(inode)); } static const struct super_operations pidfs_sops = { @@ -831,8 +872,13 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path, * recorded and published can be handled correctly. */ if (unlikely(!pid_has_task(pid, type))) { - struct inode *inode = d_inode(path->dentry); - return !!READ_ONCE(pidfs_i(inode)->exit_info); + struct pidfs_attr *attr; + + attr = READ_ONCE(pid->attr); + if (!attr) + return false; + if (!READ_ONCE(attr->exit_info)) + return false; } return true; @@ -878,9 +924,67 @@ static void pidfs_put_data(void *data) put_pid(pid); } +/** + * pidfs_register_pid - register a struct pid in pidfs + * @pid: pid to pin + * + * Register a struct pid in pidfs. Needs to be paired with + * pidfs_put_pid() to not risk leaking the pidfs dentry and inode. + * + * Return: On success zero, on error a negative error code is returned. + */ +int pidfs_register_pid(struct pid *pid) +{ + struct pidfs_attr *new_attr __free(kfree) = NULL; + struct pidfs_attr *attr; + + might_sleep(); + + if (!pid) + return 0; + + attr = READ_ONCE(pid->attr); + if (unlikely(attr == PIDFS_PID_DEAD)) + return PTR_ERR(PIDFS_PID_DEAD); + if (attr) + return 0; + + new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL); + if (!new_attr) + return -ENOMEM; + + /* Synchronize with pidfs_exit(). */ + guard(spinlock_irq)(&pid->wait_pidfd.lock); + + attr = pid->attr; + if (unlikely(attr == PIDFS_PID_DEAD)) + return PTR_ERR(PIDFS_PID_DEAD); + if (unlikely(attr)) + return 0; + + pid->attr = no_free_ptr(new_attr); + return 0; +} + +static struct dentry *pidfs_stash_dentry(struct dentry **stashed, + struct dentry *dentry) +{ + int ret; + struct pid *pid = d_inode(dentry)->i_private; + + VFS_WARN_ON_ONCE(stashed != &pid->stashed); + + ret = pidfs_register_pid(pid); + if (ret) + return ERR_PTR(ret); + + return stash_dentry(stashed, dentry); +} + static const struct stashed_operations pidfs_stashed_ops = { - .init_inode = pidfs_init_inode, - .put_data = pidfs_put_data, + .stash_dentry = pidfs_stash_dentry, + .init_inode = pidfs_init_inode, + .put_data = pidfs_put_data, }; static int pidfs_init_fs_context(struct fs_context *fc) @@ -936,33 +1040,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) return pidfd_file; } -/** - * pidfs_register_pid - register a struct pid in pidfs - * @pid: pid to pin - * - * Register a struct pid in pidfs. Needs to be paired with - * pidfs_put_pid() to not risk leaking the pidfs dentry and inode. - * - * Return: On success zero, on error a negative error code is returned. - */ -int pidfs_register_pid(struct pid *pid) -{ - struct path path __free(path_put) = {}; - int ret; - - might_sleep(); - - if (!pid) - return 0; - - ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); - if (unlikely(ret)) - return ret; - /* Keep the dentry and only put the reference to the mount. */ - path.dentry = NULL; - return 0; -} - /** * pidfs_get_pid - pin a struct pid through pidfs * @pid: pid to pin @@ -1008,6 +1085,9 @@ void __init pidfs_init(void) (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT | SLAB_PANIC), pidfs_inode_init_once); + pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0, + (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT | SLAB_PANIC), NULL); pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); diff --git a/include/linux/pid.h b/include/linux/pid.h index 00646a692dd4..003a1027d219 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -47,6 +47,8 @@ #define RESERVED_PIDS 300 +struct pidfs_attr; + struct upid { int nr; struct pid_namespace *ns; @@ -60,6 +62,7 @@ struct pid { u64 ino; struct rb_node pidfs_node; struct dentry *stashed; + struct pidfs_attr *attr; }; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 77e7db194914..8f6ed59bb3fb 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -16,5 +16,6 @@ extern const struct dentry_operations pidfs_dentry_operations; int pidfs_register_pid(struct pid *pid); void pidfs_get_pid(struct pid *pid); void pidfs_put_pid(struct pid *pid); +void pidfs_free_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ diff --git a/kernel/pid.c b/kernel/pid.c index 8317bcbc7cf7..07db7d8d066c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -100,7 +100,7 @@ void put_pid(struct pid *pid) ns = pid->numbers[pid->level].ns; if (refcount_dec_and_test(&pid->count)) { - WARN_ON_ONCE(pid->stashed); + pidfs_free_pid(pid); kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } From 5ee83f8d1af4d069475eabd9a5ed551b3d2cf9a8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:40 +0200 Subject: [PATCH 06/37] pidfs: remove unused members from struct pidfs_inode We've moved persistent information to struct pid. So there's no need for these anymore. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-6-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/pidfs.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index 6a907457b1fe..72aac4f7b7d5 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -46,8 +46,6 @@ struct pidfs_attr { }; struct pidfs_inode { - struct pidfs_exit_info __pei; - struct pidfs_exit_info *exit_info; struct inode vfs_inode; }; @@ -696,9 +694,6 @@ static struct inode *pidfs_alloc_inode(struct super_block *sb) if (!pi) return NULL; - memset(&pi->__pei, 0, sizeof(pi->__pei)); - pi->exit_info = NULL; - return &pi->vfs_inode; } From 0f93d71b9d17a8b3fcb38b5e66ac5bd94f56a8de Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:41 +0200 Subject: [PATCH 07/37] pidfs: remove custom inode allocation We don't need it anymore as persistent information is allocated lazily and stashed in struct pid. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-7-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/pidfs.c | 39 --------------------------------------- 1 file changed, 39 deletions(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index 72aac4f7b7d5..c49c53d6ae51 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -27,7 +27,6 @@ #define PIDFS_PID_DEAD ERR_PTR(-ESRCH) -static struct kmem_cache *pidfs_cachep __ro_after_init; static struct kmem_cache *pidfs_attr_cachep __ro_after_init; /* @@ -45,15 +44,6 @@ struct pidfs_attr { struct pidfs_exit_info *exit_info; }; -struct pidfs_inode { - struct inode vfs_inode; -}; - -static inline struct pidfs_inode *pidfs_i(struct inode *inode) -{ - return container_of(inode, struct pidfs_inode, vfs_inode); -} - static struct rb_root pidfs_ino_tree = RB_ROOT; #if BITS_PER_LONG == 32 @@ -686,27 +676,9 @@ static void pidfs_evict_inode(struct inode *inode) put_pid(pid); } -static struct inode *pidfs_alloc_inode(struct super_block *sb) -{ - struct pidfs_inode *pi; - - pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL); - if (!pi) - return NULL; - - return &pi->vfs_inode; -} - -static void pidfs_free_inode(struct inode *inode) -{ - kfree(pidfs_i(inode)); -} - static const struct super_operations pidfs_sops = { - .alloc_inode = pidfs_alloc_inode, .drop_inode = generic_delete_inode, .evict_inode = pidfs_evict_inode, - .free_inode = pidfs_free_inode, .statfs = simple_statfs, }; @@ -1067,19 +1039,8 @@ void pidfs_put_pid(struct pid *pid) dput(pid->stashed); } -static void pidfs_inode_init_once(void *data) -{ - struct pidfs_inode *pi = data; - - inode_init_once(&pi->vfs_inode); -} - void __init pidfs_init(void) { - pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0, - (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_ACCOUNT | SLAB_PANIC), - pidfs_inode_init_once); pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT | SLAB_PANIC), NULL); From 804d6794497e6f3992d156e07d01e22b037ce09e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:42 +0200 Subject: [PATCH 08/37] pidfs: remove pidfs_{get,put}_pid() Now that we stash persistent information in struct pid there's no need to play volatile games with pinning struct pid via dentries in pidfs. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-8-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/coredump.c | 6 ------ fs/pidfs.c | 35 +---------------------------------- include/linux/pidfs.h | 2 -- net/unix/af_unix.c | 5 ----- 4 files changed, 1 insertion(+), 47 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index f217ebf2b3b6..55d6a713a0fb 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -898,12 +898,6 @@ void do_coredump(const kernel_siginfo_t *siginfo) retval = kernel_connect(socket, (struct sockaddr *)(&addr), addr_len, O_NONBLOCK | SOCK_COREDUMP); - /* - * ... Make sure to only put our reference after connect() took - * its own reference keeping the pidfs entry alive ... - */ - pidfs_put_pid(cprm.pid); - if (retval) { if (retval == -EAGAIN) coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path); diff --git a/fs/pidfs.c b/fs/pidfs.c index c49c53d6ae51..bc2342cf4492 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -895,8 +895,7 @@ static void pidfs_put_data(void *data) * pidfs_register_pid - register a struct pid in pidfs * @pid: pid to pin * - * Register a struct pid in pidfs. Needs to be paired with - * pidfs_put_pid() to not risk leaking the pidfs dentry and inode. + * Register a struct pid in pidfs. * * Return: On success zero, on error a negative error code is returned. */ @@ -1007,38 +1006,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) return pidfd_file; } -/** - * pidfs_get_pid - pin a struct pid through pidfs - * @pid: pid to pin - * - * Similar to pidfs_register_pid() but only valid if the caller knows - * there's a reference to the @pid through a dentry already that can't - * go away. - */ -void pidfs_get_pid(struct pid *pid) -{ - if (!pid) - return; - WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed)); -} - -/** - * pidfs_put_pid - drop a pidfs reference - * @pid: pid to drop - * - * Drop a reference to @pid via pidfs. This is only safe if the - * reference has been taken via pidfs_get_pid(). - */ -void pidfs_put_pid(struct pid *pid) -{ - might_sleep(); - - if (!pid) - return; - VFS_WARN_ON_ONCE(!pid->stashed); - dput(pid->stashed); -} - void __init pidfs_init(void) { pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0, diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 8f6ed59bb3fb..3e08c33da2df 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -14,8 +14,6 @@ void pidfs_coredump(const struct coredump_params *cprm); #endif extern const struct dentry_operations pidfs_dentry_operations; int pidfs_register_pid(struct pid *pid); -void pidfs_get_pid(struct pid *pid); -void pidfs_put_pid(struct pid *pid); void pidfs_free_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2e2e9997a68e..129388c309b0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -646,9 +646,6 @@ static void unix_sock_destructor(struct sock *sk) return; } - if (sk->sk_peer_pid) - pidfs_put_pid(sk->sk_peer_pid); - if (u->addr) unix_release_addr(u->addr); @@ -769,7 +766,6 @@ static void drop_peercred(struct unix_peercred *peercred) swap(peercred->peer_pid, pid); swap(peercred->peer_cred, cred); - pidfs_put_pid(pid); put_pid(pid); put_cred(cred); } @@ -802,7 +798,6 @@ static void copy_peercred(struct sock *sk, struct sock *peersk) spin_lock(&sk->sk_peer_lock); sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); - pidfs_get_pid(sk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); } From d718249bbac664c24a34ed472d627381d4505e00 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:43 +0200 Subject: [PATCH 09/37] pidfs: remove pidfs_pid_valid() The validation is now completely handled in path_from_stashed(). Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-9-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/pidfs.c | 53 ----------------------------------------------------- 1 file changed, 53 deletions(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index bc2342cf4492..ec375692a710 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -804,58 +804,8 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } -static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path, - unsigned int flags) -{ - enum pid_type type; - - if (flags & PIDFD_STALE) - return true; - - /* - * Make sure that if a pidfd is created PIDFD_INFO_EXIT - * information will be available. So after an inode for the - * pidfd has been allocated perform another check that the pid - * is still alive. If it is exit information is available even - * if the task gets reaped before the pidfd is returned to - * userspace. The only exception are indicated by PIDFD_STALE: - * - * (1) The kernel is in the middle of task creation and thus no - * task linkage has been established yet. - * (2) The caller knows @pid has been registered in pidfs at a - * time when the task was still alive. - * - * In both cases exit information will have been reported. - */ - if (flags & PIDFD_THREAD) - type = PIDTYPE_PID; - else - type = PIDTYPE_TGID; - - /* - * Since pidfs_exit() is called before struct pid's task linkage - * is removed the case where the task got reaped but a dentry - * was already attached to struct pid and exit information was - * recorded and published can be handled correctly. - */ - if (unlikely(!pid_has_task(pid, type))) { - struct pidfs_attr *attr; - - attr = READ_ONCE(pid->attr); - if (!attr) - return false; - if (!READ_ONCE(attr->exit_info)) - return false; - } - - return true; -} - static struct file *pidfs_export_open(struct path *path, unsigned int oflags) { - if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags)) - return ERR_PTR(-ESRCH); - /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise * O_RDWR as pidfds always are. @@ -993,9 +943,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) if (ret < 0) return ERR_PTR(ret); - if (!pidfs_pid_valid(pid, &path, flags)) - return ERR_PTR(-ESRCH); - flags &= ~PIDFD_STALE; flags |= O_RDWR; pidfd_file = dentry_open(&path, flags, current_cred()); From c007d95221397eda24e7d6b4ac5a5d699ea2f1ca Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:44 +0200 Subject: [PATCH 10/37] libfs: prepare to allow for non-immutable pidfd inodes Allow for S_IMMUTABLE to be stripped so that we can support xattrs. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-10-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/libfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/libfs.c b/fs/libfs.c index 997d3a363ce6..c5b520df9032 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -2162,7 +2162,6 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed, /* Notice when this is changed. */ WARN_ON_ONCE(!S_ISREG(inode->i_mode)); - WARN_ON_ONCE(!IS_IMMUTABLE(inode)); dentry = d_alloc_anon(sb); if (!dentry) { From f769b3db24fa9ef48abcb515c50de1abeeaa0281 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:45 +0200 Subject: [PATCH 11/37] pidfs: make inodes mutable Prepare for allowing extended attributes to be set on pidfd inodes by allowing them to be mutable. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-11-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/pidfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/pidfs.c b/fs/pidfs.c index ec375692a710..df5bc69ea1c0 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -827,6 +827,8 @@ static int pidfs_init_inode(struct inode *inode, void *data) inode->i_private = data; inode->i_flags |= S_PRIVATE | S_ANON_INODE; + /* We allow to set xattrs. */ + inode->i_flags &= ~S_IMMUTABLE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; From 91d837cae3c7856cdca23dc6e8ec8954d887e970 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:46 +0200 Subject: [PATCH 12/37] pidfs: support xattrs on pidfds Now that we have a way to persist information for pidfs dentries we can start supporting extended attributes on pidfds. This will allow userspace to attach meta information to tasks. One natural extension would be to introduce a custom pidfs.* extended attribute space and allow for the inheritance of extended attributes across fork() and exec(). The first simple scheme will allow privileged userspace to set trusted extended attributes on pidfs inodes. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-12-98f3456fd552@kernel.org Signed-off-by: Christian Brauner --- fs/pidfs.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 103 insertions(+), 4 deletions(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index df5bc69ea1c0..bde19614ef8b 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "internal.h" #include "mount.h" @@ -28,6 +29,7 @@ #define PIDFS_PID_DEAD ERR_PTR(-ESRCH) static struct kmem_cache *pidfs_attr_cachep __ro_after_init; +static struct kmem_cache *pidfs_xattr_cachep __ro_after_init; /* * Stashes information that userspace needs to access even after the @@ -40,6 +42,7 @@ struct pidfs_exit_info { }; struct pidfs_attr { + struct simple_xattrs *xattrs; struct pidfs_exit_info __pei; struct pidfs_exit_info *exit_info; }; @@ -138,14 +141,27 @@ void pidfs_remove_pid(struct pid *pid) void pidfs_free_pid(struct pid *pid) { + struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr); + struct simple_xattrs *xattrs __free(kfree) = NULL; + /* * Any dentry must've been wiped from the pid by now. * Otherwise there's a reference count bug. */ VFS_WARN_ON_ONCE(pid->stashed); - if (!IS_ERR(pid->attr)) - kfree(pid->attr); + if (IS_ERR(attr)) + return; + + /* + * Any dentry must've been wiped from the pid by now. Otherwise + * there's a reference count bug. + */ + VFS_WARN_ON_ONCE(pid->stashed); + + xattrs = attr->xattrs; + if (xattrs) + simple_xattrs_free(attr->xattrs, NULL); } #ifdef CONFIG_PROC_FS @@ -663,9 +679,24 @@ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, return anon_inode_getattr(idmap, path, stat, request_mask, query_flags); } +static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct inode *inode = d_inode(dentry); + struct pid *pid = inode->i_private; + struct pidfs_attr *attr = pid->attr; + struct simple_xattrs *xattrs; + + xattrs = READ_ONCE(attr->xattrs); + if (!xattrs) + return 0; + + return simple_xattr_list(inode, xattrs, buf, size); +} + static const struct inode_operations pidfs_inode_operations = { - .getattr = pidfs_getattr, - .setattr = pidfs_setattr, + .getattr = pidfs_getattr, + .setattr = pidfs_setattr, + .listxattr = pidfs_listxattr, }; static void pidfs_evict_inode(struct inode *inode) @@ -905,6 +936,67 @@ static const struct stashed_operations pidfs_stashed_ops = { .put_data = pidfs_put_data, }; +static int pidfs_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *suffix, void *value, size_t size) +{ + struct pid *pid = inode->i_private; + struct pidfs_attr *attr = pid->attr; + const char *name; + struct simple_xattrs *xattrs; + + xattrs = READ_ONCE(attr->xattrs); + if (!xattrs) + return 0; + + name = xattr_full_name(handler, suffix); + return simple_xattr_get(xattrs, name, value, size); +} + +static int pidfs_xattr_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, struct dentry *unused, + struct inode *inode, const char *suffix, + const void *value, size_t size, int flags) +{ + struct pid *pid = inode->i_private; + struct pidfs_attr *attr = pid->attr; + const char *name; + struct simple_xattrs *xattrs; + struct simple_xattr *old_xattr; + + /* Ensure we're the only one to set @attr->xattrs. */ + WARN_ON_ONCE(!inode_is_locked(inode)); + + xattrs = READ_ONCE(attr->xattrs); + if (!xattrs) { + xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL); + if (!xattrs) + return -ENOMEM; + + simple_xattrs_init(xattrs); + smp_store_release(&pid->attr->xattrs, xattrs); + } + + name = xattr_full_name(handler, suffix); + old_xattr = simple_xattr_set(xattrs, name, value, size, flags); + if (IS_ERR(old_xattr)) + return PTR_ERR(old_xattr); + + simple_xattr_free(old_xattr); + return 0; +} + +static const struct xattr_handler pidfs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = pidfs_xattr_get, + .set = pidfs_xattr_set, +}; + +static const struct xattr_handler *const pidfs_xattr_handlers[] = { + &pidfs_trusted_xattr_handler, + NULL +}; + static int pidfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx; @@ -918,6 +1010,7 @@ static int pidfs_init_fs_context(struct fs_context *fc) ctx->ops = &pidfs_sops; ctx->eops = &pidfs_export_operations; ctx->dops = &pidfs_dentry_operations; + ctx->xattr = pidfs_xattr_handlers; fc->s_fs_info = (void *)&pidfs_stashed_ops; return 0; } @@ -960,6 +1053,12 @@ void __init pidfs_init(void) pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT | SLAB_PANIC), NULL); + + pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache", + sizeof(struct simple_xattrs), 0, + (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT | SLAB_PANIC), NULL); + pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); From 49fba3725910c54878212ca08b968b9e1285866c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:47 +0200 Subject: [PATCH 13/37] selftests/pidfd: test extended attribute support Add tests for extended attribute support on pidfds. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-13-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- tools/testing/selftests/pidfd/.gitignore | 1 + tools/testing/selftests/pidfd/Makefile | 3 +- .../selftests/pidfd/pidfd_xattr_test.c | 97 +++++++++++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/pidfd/pidfd_xattr_test.c diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index 0406a065deb4..bc4130506eda 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -10,3 +10,4 @@ pidfd_file_handle_test pidfd_bind_mount pidfd_info_test pidfd_exec_helper +pidfd_xattr_test diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index fcbefc0d77f6..c9fd5023ef15 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -3,7 +3,8 @@ CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \ - pidfd_file_handle_test pidfd_bind_mount pidfd_info_test + pidfd_file_handle_test pidfd_bind_mount pidfd_info_test \ + pidfd_xattr_test TEST_GEN_PROGS_EXTENDED := pidfd_exec_helper diff --git a/tools/testing/selftests/pidfd/pidfd_xattr_test.c b/tools/testing/selftests/pidfd/pidfd_xattr_test.c new file mode 100644 index 000000000000..00d400ac515b --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_xattr_test.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pidfd.h" +#include "../kselftest_harness.h" + +FIXTURE(pidfs_xattr) +{ + pid_t child_pid; + int child_pidfd; +}; + +FIXTURE_SETUP(pidfs_xattr) +{ + self->child_pid = create_child(&self->child_pidfd, CLONE_NEWUSER | CLONE_NEWPID); + EXPECT_GE(self->child_pid, 0); + + if (self->child_pid == 0) + _exit(EXIT_SUCCESS); +} + +FIXTURE_TEARDOWN(pidfs_xattr) +{ + sys_waitid(P_PID, self->child_pid, NULL, WEXITED); +} + +TEST_F(pidfs_xattr, set_get_list_xattr_multiple) +{ + int ret, i; + char xattr_name[32]; + char xattr_value[32]; + char buf[32]; + const int num_xattrs = 10; + char list[PATH_MAX] = {}; + + for (i = 0; i < num_xattrs; i++) { + snprintf(xattr_name, sizeof(xattr_name), "trusted.testattr%d", i); + snprintf(xattr_value, sizeof(xattr_value), "testvalue%d", i); + ret = fsetxattr(self->child_pidfd, xattr_name, xattr_value, strlen(xattr_value), 0); + ASSERT_EQ(ret, 0); + } + + for (i = 0; i < num_xattrs; i++) { + snprintf(xattr_name, sizeof(xattr_name), "trusted.testattr%d", i); + snprintf(xattr_value, sizeof(xattr_value), "testvalue%d", i); + memset(buf, 0, sizeof(buf)); + ret = fgetxattr(self->child_pidfd, xattr_name, buf, sizeof(buf)); + ASSERT_EQ(ret, strlen(xattr_value)); + ASSERT_EQ(strcmp(buf, xattr_value), 0); + } + + ret = flistxattr(self->child_pidfd, list, sizeof(list)); + ASSERT_GT(ret, 0); + for (i = 0; i < num_xattrs; i++) { + snprintf(xattr_name, sizeof(xattr_name), "trusted.testattr%d", i); + bool found = false; + for (char *it = list; it < list + ret; it += strlen(it) + 1) { + if (strcmp(it, xattr_name)) + continue; + found = true; + break; + } + ASSERT_TRUE(found); + } + + for (i = 0; i < num_xattrs; i++) { + snprintf(xattr_name, sizeof(xattr_name), "trusted.testattr%d", i); + ret = fremovexattr(self->child_pidfd, xattr_name); + ASSERT_EQ(ret, 0); + + ret = fgetxattr(self->child_pidfd, xattr_name, buf, sizeof(buf)); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ENODATA); + } +} + +TEST_HARNESS_MAIN From 7442d093dfae0c9482eeeb8bebfd682459244be0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:48 +0200 Subject: [PATCH 14/37] selftests/pidfd: test extended attribute support Test that extended attributes are permanent. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-14-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- .../selftests/pidfd/pidfd_xattr_test.c | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tools/testing/selftests/pidfd/pidfd_xattr_test.c b/tools/testing/selftests/pidfd/pidfd_xattr_test.c index 00d400ac515b..5cf7bb0e4bf2 100644 --- a/tools/testing/selftests/pidfd/pidfd_xattr_test.c +++ b/tools/testing/selftests/pidfd/pidfd_xattr_test.c @@ -94,4 +94,39 @@ TEST_F(pidfs_xattr, set_get_list_xattr_multiple) } } +TEST_F(pidfs_xattr, set_get_list_xattr_persistent) +{ + int ret; + char buf[32]; + char list[PATH_MAX] = {}; + + ret = fsetxattr(self->child_pidfd, "trusted.persistent", "persistent value", strlen("persistent value"), 0); + ASSERT_EQ(ret, 0); + + memset(buf, 0, sizeof(buf)); + ret = fgetxattr(self->child_pidfd, "trusted.persistent", buf, sizeof(buf)); + ASSERT_EQ(ret, strlen("persistent value")); + ASSERT_EQ(strcmp(buf, "persistent value"), 0); + + ret = flistxattr(self->child_pidfd, list, sizeof(list)); + ASSERT_GT(ret, 0); + ASSERT_EQ(strcmp(list, "trusted.persistent"), 0) + + ASSERT_EQ(close(self->child_pidfd), 0); + self->child_pidfd = -EBADF; + sleep(2); + + self->child_pidfd = sys_pidfd_open(self->child_pid, 0); + ASSERT_GE(self->child_pidfd, 0); + + memset(buf, 0, sizeof(buf)); + ret = fgetxattr(self->child_pidfd, "trusted.persistent", buf, sizeof(buf)); + ASSERT_EQ(ret, strlen("persistent value")); + ASSERT_EQ(strcmp(buf, "persistent value"), 0); + + ret = flistxattr(self->child_pidfd, list, sizeof(list)); + ASSERT_GT(ret, 0); + ASSERT_EQ(strcmp(list, "trusted.persistent"), 0); +} + TEST_HARNESS_MAIN From 8c2ab04135682d0f5b1eb1c74ac5f328b65015ea Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:49 +0200 Subject: [PATCH 15/37] selftests/pidfd: test setattr support Verify that ->setattr() on a pidfd doens't work. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-15-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- tools/testing/selftests/pidfd/.gitignore | 1 + tools/testing/selftests/pidfd/Makefile | 2 +- .../selftests/pidfd/pidfd_setattr_test.c | 69 +++++++++++++++++++ 3 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/pidfd/pidfd_setattr_test.c diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index bc4130506eda..144e7ff65d6a 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -11,3 +11,4 @@ pidfd_bind_mount pidfd_info_test pidfd_exec_helper pidfd_xattr_test +pidfd_setattr_test diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index c9fd5023ef15..03a6eede9c9e 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -4,7 +4,7 @@ CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \ pidfd_file_handle_test pidfd_bind_mount pidfd_info_test \ - pidfd_xattr_test + pidfd_xattr_test pidfd_setattr_test TEST_GEN_PROGS_EXTENDED := pidfd_exec_helper diff --git a/tools/testing/selftests/pidfd/pidfd_setattr_test.c b/tools/testing/selftests/pidfd/pidfd_setattr_test.c new file mode 100644 index 000000000000..d7de05edc4b3 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_setattr_test.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pidfd.h" +#include "../kselftest_harness.h" + +FIXTURE(pidfs_setattr) +{ + pid_t child_pid; + int child_pidfd; +}; + +FIXTURE_SETUP(pidfs_setattr) +{ + self->child_pid = create_child(&self->child_pidfd, CLONE_NEWUSER | CLONE_NEWPID); + EXPECT_GE(self->child_pid, 0); + + if (self->child_pid == 0) + _exit(EXIT_SUCCESS); +} + +FIXTURE_TEARDOWN(pidfs_setattr) +{ + sys_waitid(P_PID, self->child_pid, NULL, WEXITED); + EXPECT_EQ(close(self->child_pidfd), 0); +} + +TEST_F(pidfs_setattr, no_chown) +{ + ASSERT_LT(fchown(self->child_pidfd, 1234, 5678), 0); + ASSERT_EQ(errno, EOPNOTSUPP); +} + +TEST_F(pidfs_setattr, no_chmod) +{ + ASSERT_LT(fchmod(self->child_pidfd, 0777), 0); + ASSERT_EQ(errno, EOPNOTSUPP); +} + +TEST_F(pidfs_setattr, no_exec) +{ + char *const argv[] = { NULL }; + char *const envp[] = { NULL }; + + ASSERT_LT(execveat(self->child_pidfd, "", argv, envp, AT_EMPTY_PATH), 0); + ASSERT_EQ(errno, EACCES); +} + +TEST_HARNESS_MAIN From f9fac1f48c20a29f2c39c9a9b96d539ad1636824 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:50 +0200 Subject: [PATCH 16/37] pidfs: add some CONFIG_DEBUG_VFS asserts Allow to catch some obvious bugs. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-16-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/pidfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/pidfs.c b/fs/pidfs.c index bde19614ef8b..ba526fdd4c4d 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -809,6 +809,8 @@ static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, if (ret < 0) return ERR_PTR(ret); + VFS_WARN_ON_ONCE(!pid->attr); + mntput(path.mnt); return path.dentry; } @@ -1038,6 +1040,8 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) if (ret < 0) return ERR_PTR(ret); + VFS_WARN_ON_ONCE(!pid->attr); + flags &= ~PIDFD_STALE; flags |= O_RDWR; pidfd_file = dentry_open(&path, flags, current_cred()); From f077638b5f19080b877fd4cd15fc00558669aa6d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 23 Jun 2025 14:50:30 +0200 Subject: [PATCH 17/37] pidfs: fix pidfs_free_pid() Ensure that we handle the case where task creation fails and pid->attr was never accessed at all. Signed-off-by: Christian Brauner --- fs/pidfs.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index ba526fdd4c4d..47f5f9e0bdff 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -150,18 +150,20 @@ void pidfs_free_pid(struct pid *pid) */ VFS_WARN_ON_ONCE(pid->stashed); + /* + * This if an error occurred during e.g., task creation that + * causes us to never go through the exit path. + */ + if (unlikely(!attr)) + return; + + /* This never had a pidfd created. */ if (IS_ERR(attr)) return; - /* - * Any dentry must've been wiped from the pid by now. Otherwise - * there's a reference count bug. - */ - VFS_WARN_ON_ONCE(pid->stashed); - - xattrs = attr->xattrs; + xattrs = no_free_ptr(attr->xattrs); if (xattrs) - simple_xattrs_free(attr->xattrs, NULL); + simple_xattrs_free(xattrs, NULL); } #ifdef CONFIG_PROC_FS From cc678bf7aa9e2e6c2356fd7f955513c1bd7d4c97 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 10:29:04 +0200 Subject: [PATCH 18/37] fhandle: raise FILEID_IS_DIR in handle_type Currently FILEID_IS_DIR is raised in fh_flags which is wrong. Raise it in handle->handle_type were it's supposed to be. Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-1-d02a04858fe3@kernel.org Fixes: c374196b2b9f ("fs: name_to_handle_at() support for "explicit connectable" file handles") Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Cc: stable@kernel.org Signed-off-by: Christian Brauner --- fs/fhandle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index 3e092ae6d142..66ff60591d17 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -88,7 +88,7 @@ static long do_sys_name_to_handle(const struct path *path, if (fh_flags & EXPORT_FH_CONNECTABLE) { handle->handle_type |= FILEID_IS_CONNECTABLE; if (d_is_dir(path->dentry)) - fh_flags |= FILEID_IS_DIR; + handle->handle_type |= FILEID_IS_DIR; } retval = 0; } From 774adcb55f159167fe0dfd343174fdedba3ae2f4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 10:29:05 +0200 Subject: [PATCH 19/37] fhandle: hoist copy_from_user() above get_path_from_fd() In follow-up patches we need access to @file_handle->handle_type before we start caring about get_path_from_fd(). Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-2-d02a04858fe3@kernel.org Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/fhandle.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index 66ff60591d17..73f56f8e7d5d 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -323,13 +323,24 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, { int retval = 0; struct file_handle f_handle; - struct file_handle *handle = NULL; + struct file_handle *handle __free(kfree) = NULL; struct handle_to_path_ctx ctx = {}; const struct export_operations *eops; + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) + return -EFAULT; + + if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || + (f_handle.handle_bytes == 0)) + return -EINVAL; + + if (f_handle.handle_type < 0 || + FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) + return -EINVAL; + retval = get_path_from_fd(mountdirfd, &ctx.root); if (retval) - goto out_err; + return retval; eops = ctx.root.mnt->mnt_sb->s_export_op; if (eops && eops->permission) @@ -339,21 +350,6 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, if (retval) goto out_path; - if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { - retval = -EFAULT; - goto out_path; - } - if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || - (f_handle.handle_bytes == 0)) { - retval = -EINVAL; - goto out_path; - } - if (f_handle.handle_type < 0 || - FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) { - retval = -EINVAL; - goto out_path; - } - handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) { @@ -366,7 +362,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, &ufh->f_handle, f_handle.handle_bytes)) { retval = -EFAULT; - goto out_handle; + goto out_path; } /* @@ -384,11 +380,8 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, handle->handle_type &= ~FILEID_USER_FLAGS_MASK; retval = do_handle_to_path(handle, path, &ctx); -out_handle: - kfree(handle); out_path: path_put(&ctx.root); -out_err: return retval; } From f7be8a333253cc319f5c6456b5cdab2a57b7351b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 10:29:06 +0200 Subject: [PATCH 20/37] fhandle: rename to get_path_anchor() Rename as we're going to expand the function in the next step. The path just serves as the anchor tying the decoding to the filesystem. Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-3-d02a04858fe3@kernel.org Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/fhandle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index 73f56f8e7d5d..d8d32208c621 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -168,7 +168,7 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, return err; } -static int get_path_from_fd(int fd, struct path *root) +static int get_path_anchor(int fd, struct path *root) { if (fd == AT_FDCWD) { struct fs_struct *fs = current->fs; @@ -338,7 +338,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) return -EINVAL; - retval = get_path_from_fd(mountdirfd, &ctx.root); + retval = get_path_anchor(mountdirfd, &ctx.root); if (retval) return retval; From a0d8051cfd8145eb49dbd0c0c2f174d09da77796 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 10:29:07 +0200 Subject: [PATCH 21/37] pidfs: add pidfs_root_path() helper Allow to return the root of the global pidfs filesystem. Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-4-d02a04858fe3@kernel.org Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/internal.h | 1 + fs/pidfs.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/fs/internal.h b/fs/internal.h index 22ba066d1dba..ad256bccdc85 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -353,3 +353,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, unsigned int query_flags); int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); +void pidfs_get_root(struct path *path); diff --git a/fs/pidfs.c b/fs/pidfs.c index 47f5f9e0bdff..4fc7a7f4a3fe 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -31,6 +31,14 @@ static struct kmem_cache *pidfs_attr_cachep __ro_after_init; static struct kmem_cache *pidfs_xattr_cachep __ro_after_init; +static struct path pidfs_root_path = {}; + +void pidfs_get_root(struct path *path) +{ + *path = pidfs_root_path; + path_get(path); +} + /* * Stashes information that userspace needs to access even after the * process has been reaped. @@ -1068,4 +1076,7 @@ void __init pidfs_init(void) pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); + + pidfs_root_path.mnt = pidfs_mnt; + pidfs_root_path.dentry = pidfs_mnt->mnt_root; } From 1c5484395f9f8b7bf0702f34aa3406353e45d7ec Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 10:29:08 +0200 Subject: [PATCH 22/37] fhandle: reflow get_path_anchor() Switch to a more common coding style. Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-5-d02a04858fe3@kernel.org Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/fhandle.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index d8d32208c621..9ef35f8e8989 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -170,21 +170,25 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, static int get_path_anchor(int fd, struct path *root) { + if (fd >= 0) { + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; + *root = fd_file(f)->f_path; + path_get(root); + return 0; + } + if (fd == AT_FDCWD) { struct fs_struct *fs = current->fs; spin_lock(&fs->lock); *root = fs->pwd; path_get(root); spin_unlock(&fs->lock); - } else { - CLASS(fd, f)(fd); - if (fd_empty(f)) - return -EBADF; - *root = fd_file(f)->f_path; - path_get(root); + return 0; } - return 0; + return -EBADF; } static int vfs_dentry_acceptable(void *context, struct dentry *dentry) From a4c746f06853f91d3759ae8aca514d135b6aa56d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 15:48:42 +0200 Subject: [PATCH 23/37] uapi/fcntl: mark range as reserved Mark the range from -10000 to -40000 as a range reserved for special in-kernel values. Move the PIDFD_SELF_*/PIDFD_THREAD_* sentinels over so all the special values are in one place. Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-6-d02a04858fe3@kernel.org Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 16 ++++++++++++++++ include/uapi/linux/pidfd.h | 15 --------------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index a15ac2fa4b20..ed02d8ae0667 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -90,10 +90,26 @@ #define DN_ATTRIB 0x00000020 /* File changed attibutes */ #define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ +/* Reserved kernel ranges [-100], [-10000, -40000]. */ #define AT_FDCWD -100 /* Special value for dirfd used to indicate openat should use the current working directory. */ +/* + * The concept of process and threads in userland and the kernel is a confusing + * one - within the kernel every thread is a 'task' with its own individual PID, + * however from userland's point of view threads are grouped by a single PID, + * which is that of the 'thread group leader', typically the first thread + * spawned. + * + * To cut the Gideon knot, for internal kernel usage, we refer to + * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel + * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread + * group leader... + */ +#define PIDFD_SELF_THREAD -10000 /* Current thread. */ +#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ + /* Generic flags for the *at(2) family of syscalls. */ diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index c27a4e238e4b..957db425d459 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -42,21 +42,6 @@ #define PIDFD_COREDUMP_USER (1U << 2) /* coredump was done as the user. */ #define PIDFD_COREDUMP_ROOT (1U << 3) /* coredump was done as root. */ -/* - * The concept of process and threads in userland and the kernel is a confusing - * one - within the kernel every thread is a 'task' with its own individual PID, - * however from userland's point of view threads are grouped by a single PID, - * which is that of the 'thread group leader', typically the first thread - * spawned. - * - * To cut the Gideon knot, for internal kernel usage, we refer to - * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel - * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread - * group leader... - */ -#define PIDFD_SELF_THREAD -10000 /* Current thread. */ -#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ - /* * ...and for userland we make life simpler - PIDFD_SELF refers to the current * thread, PIDFD_SELF_PROCESS refers to the process thread group leader. From 67fcec2919e4ed31ab845eb456ad7d6f1e85505c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 15:48:49 +0200 Subject: [PATCH 24/37] fcntl/pidfd: redefine PIDFD_SELF_THREAD_GROUP Don't jump somewhere into the middle of the reserved range. We're still able to change that value it won't be that widely used yet. If not, we can revert. Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 2 +- tools/testing/selftests/pidfd/pidfd.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index ed02d8ae0667..ba4a698d2f33 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -108,7 +108,7 @@ * group leader... */ #define PIDFD_SELF_THREAD -10000 /* Current thread. */ -#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ +#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ /* Generic flags for the *at(2) family of syscalls. */ diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index efd74063126e..5dfeb1bdf399 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -56,7 +56,7 @@ #endif #ifndef PIDFD_SELF_THREAD_GROUP -#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ +#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ #endif #ifndef PIDFD_SELF From cd5d2006327b6d8488612cb8c03ad7304417c8f2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 10:29:10 +0200 Subject: [PATCH 25/37] uapi/fcntl: add FD_INVALID Add a marker for an invalid file descriptor. Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-7-d02a04858fe3@kernel.org Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index ba4a698d2f33..a5bebe7c4400 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -110,6 +110,7 @@ #define PIDFD_SELF_THREAD -10000 /* Current thread. */ #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ +#define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ From 3941e37f62fe2c3c8b8675c12183185f20450539 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 16:57:51 +0200 Subject: [PATCH 26/37] uapi/fcntl: add FD_PIDFS_ROOT Add a special file descriptor indicating the root of the pidfs filesystem. Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index a5bebe7c4400..f291ab4f94eb 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -110,6 +110,7 @@ #define PIDFD_SELF_THREAD -10000 /* Current thread. */ #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ +#define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ #define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ From b95361481b1e5bd3627835b7e4b921d5a09e68a4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 10:29:13 +0200 Subject: [PATCH 27/37] fhandle, pidfs: support open_by_handle_at() purely based on file handle Various filesystems such as pidfs (and likely drm in the future) have a use-case to support opening files purely based on the handle without having to require a file descriptor to another object. That's especially the case for filesystems that don't do any lookup whatsoever and there's zero relationship between the objects. Such filesystems are also singletons that stay around for the lifetime of the system meaning that they can be uniquely identified and accessed purely based on the file handle type. Enable that so that userspace doesn't have to allocate an object needlessly especially if they can't do that for whatever reason. Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-10-d02a04858fe3@kernel.org Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/fhandle.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/fhandle.c b/fs/fhandle.c index 9ef35f8e8989..b1363ead6c5e 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -188,6 +188,11 @@ static int get_path_anchor(int fd, struct path *root) return 0; } + if (fd == FD_PIDFS_ROOT) { + pidfs_get_root(root); + return 0; + } + return -EBADF; } From 914e6b1e85c5715ca2e7ec6293c05c71e9a98e86 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 10:29:14 +0200 Subject: [PATCH 28/37] selftests/pidfd: decode pidfd file handles withou having to specify an fd Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-11-d02a04858fe3@kernel.org Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- tools/testing/selftests/pidfd/Makefile | 2 +- tools/testing/selftests/pidfd/pidfd.h | 4 ++ .../selftests/pidfd/pidfd_file_handle_test.c | 60 +++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index 03a6eede9c9e..764a8f9ecefa 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall +CFLAGS += -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -pthread -Wall TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \ diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index 5dfeb1bdf399..cd244d0860ff 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -19,6 +19,10 @@ #include "../kselftest.h" #include "../clone3/clone3_selftests.h" +#ifndef FD_PIDFS_ROOT +#define FD_PIDFS_ROOT -10002 +#endif + #ifndef P_PIDFD #define P_PIDFD 3 #endif diff --git a/tools/testing/selftests/pidfd/pidfd_file_handle_test.c b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c index 439b9c6c0457..6bd2e9c9565b 100644 --- a/tools/testing/selftests/pidfd/pidfd_file_handle_test.c +++ b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c @@ -500,4 +500,64 @@ TEST_F(file_handle, valid_name_to_handle_at_flags) ASSERT_EQ(close(pidfd), 0); } +/* + * That we decode a file handle without having to pass a pidfd. + */ +TEST_F(file_handle, decode_purely_based_on_file_handle) +{ + int mnt_id; + struct file_handle *fh; + int pidfd = -EBADF; + struct stat st1, st2; + + fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); + ASSERT_NE(fh, NULL); + memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ); + fh->handle_bytes = MAX_HANDLE_SZ; + + ASSERT_EQ(name_to_handle_at(self->child_pidfd1, "", fh, &mnt_id, AT_EMPTY_PATH), 0); + + ASSERT_EQ(fstat(self->child_pidfd1, &st1), 0); + + pidfd = open_by_handle_at(FD_PIDFS_ROOT, fh, 0); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + pidfd = open_by_handle_at(FD_PIDFS_ROOT, fh, O_CLOEXEC); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + pidfd = open_by_handle_at(FD_PIDFS_ROOT, fh, O_NONBLOCK); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, 0); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + pidfd = open_by_handle_at(-EBADF, fh, 0); + ASSERT_LT(pidfd, 0); + + pidfd = open_by_handle_at(AT_FDCWD, fh, 0); + ASSERT_LT(pidfd, 0); + + free(fh); +} + TEST_HARNESS_MAIN From 9bedee7cdf4cb7f9a4928f10567b326eaab8125d Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 4 Jul 2025 00:23:05 +0200 Subject: [PATCH 29/37] af_unix: rework unix_maybe_add_creds() to allow sleep As a preparation for the next patches we need to allow sleeping in unix_maybe_add_creds() and also return err. Currently, we can't do that as unix_maybe_add_creds() is being called under unix_state_lock(). There is no need for this, really. So let's move call sites of this helper a bit and do necessary function signature changes. Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Leon Romanovsky Cc: Arnd Bergmann Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: David Rheinsberg Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/20250703222314.309967-2-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- net/unix/af_unix.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 129388c309b0..fba50ceab42b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1955,21 +1955,30 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen return err; } -/* +/** + * unix_maybe_add_creds() - Adds current task uid/gid and struct pid to skb if needed. + * @skb: skb to attach creds to. + * @sk: Sender sock. + * @other: Receiver sock. + * * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket * asserted SOCK_PASSCRED. + * + * Return: On success zero, on error a negative error code is returned. */ -static void unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, - const struct sock *other) +static int unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, + const struct sock *other) { if (UNIXCB(skb).pid) - return; + return 0; if (unix_may_passcred(sk) || unix_may_passcred(other)) { UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } + + return 0; } static bool unix_skb_scm_eq(struct sk_buff *skb, @@ -2104,6 +2113,10 @@ lookup: goto out_sock_put; } + err = unix_maybe_add_creds(skb, sk, other); + if (err) + goto out_sock_put; + restart: sk_locked = 0; unix_state_lock(other); @@ -2212,7 +2225,6 @@ restart_locked: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); - unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); @@ -2256,6 +2268,10 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, if (err < 0) goto out; + err = unix_maybe_add_creds(skb, sk, other); + if (err) + goto out; + skb_put(skb, 1); err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); @@ -2275,7 +2291,6 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, goto out_unlock; } - unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); spin_lock(&other->sk_receive_queue.lock); @@ -2369,6 +2384,10 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, fds_sent = true; + err = unix_maybe_add_creds(skb, sk, other); + if (err) + goto out_free; + if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb->ip_summed = CHECKSUM_UNNECESSARY; err = skb_splice_from_iter(skb, &msg->msg_iter, size, @@ -2399,7 +2418,6 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out_free; } - unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); From ee47976264cd499426c89328827970ffb6acd406 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 4 Jul 2025 00:23:06 +0200 Subject: [PATCH 30/37] af_unix: introduce unix_skb_to_scm helper Instead of open-coding let's consolidate this logic in a separate helper. This will simplify further changes. Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Leon Romanovsky Cc: Arnd Bergmann Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: David Rheinsberg Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/20250703222314.309967-3-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Kuniyuki Iwashima Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- net/unix/af_unix.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fba50ceab42b..df2174d9904d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1955,6 +1955,12 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen return err; } +static void unix_skb_to_scm(struct sk_buff *skb, struct scm_cookie *scm) +{ + scm_set_cred(scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); + unix_set_secdata(scm, skb); +} + /** * unix_maybe_add_creds() - Adds current task uid/gid and struct pid to skb if needed. * @skb: skb to attach creds to. @@ -2565,8 +2571,7 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, memset(&scm, 0, sizeof(scm)); - scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - unix_set_secdata(&scm, skb); + unix_skb_to_scm(skb, &scm); if (!(flags & MSG_PEEK)) { if (UNIXCB(skb).fp) @@ -2951,8 +2956,7 @@ unlock: break; } else if (unix_may_passcred(sk)) { /* Copy credentials */ - scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - unix_set_secdata(&scm, skb); + unix_skb_to_scm(skb, &scm); check_creds = true; } From 30580dc96a3e6e146006e811ad1d01ef8a82a509 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 4 Jul 2025 00:23:07 +0200 Subject: [PATCH 31/37] af_unix: introduce and use scm_replace_pid() helper Existing logic in __scm_send() related to filling an struct scm_cookie with a proper struct pid reference is already pretty tricky. Let's simplify it a bit by introducing a new helper. This helper will be extended in one of the next patches. Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Willem de Bruijn Cc: Leon Romanovsky Cc: Arnd Bergmann Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: David Rheinsberg Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/20250703222314.309967-4-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- net/core/scm.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/net/core/scm.c b/net/core/scm.c index 0225bd94170f..045ab5bdac7d 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -145,6 +145,16 @@ void __scm_destroy(struct scm_cookie *scm) } EXPORT_SYMBOL(__scm_destroy); +static inline int scm_replace_pid(struct scm_cookie *scm, struct pid *pid) +{ + /* drop all previous references */ + scm_destroy_cred(scm); + + scm->pid = pid; + scm->creds.pid = pid_vnr(pid); + return 0; +} + int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { const struct proto_ops *ops = READ_ONCE(sock->ops); @@ -189,15 +199,21 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) if (err) goto error; - p->creds.pid = creds.pid; if (!p->pid || pid_vnr(p->pid) != creds.pid) { struct pid *pid; err = -ESRCH; pid = find_get_pid(creds.pid); if (!pid) goto error; - put_pid(p->pid); - p->pid = pid; + + /* pass a struct pid reference from + * find_get_pid() to scm_replace_pid(). + */ + err = scm_replace_pid(p, pid); + if (err) { + put_pid(pid); + goto error; + } } err = -EINVAL; From 2b9996417e4ec231c91818f9ea8107ae62ef75ad Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 4 Jul 2025 00:23:08 +0200 Subject: [PATCH 32/37] af_unix/scm: fix whitespace errors Fix whitespace/formatting errors. Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Leon Romanovsky Cc: Arnd Bergmann Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: David Rheinsberg Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/20250703222314.309967-5-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- include/net/scm.h | 4 ++-- net/unix/af_unix.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/scm.h b/include/net/scm.h index 84c4707e78a5..c52519669349 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -69,7 +69,7 @@ static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_co static __inline__ void scm_set_cred(struct scm_cookie *scm, struct pid *pid, kuid_t uid, kgid_t gid) { - scm->pid = get_pid(pid); + scm->pid = get_pid(pid); scm->creds.pid = pid_vnr(pid); scm->creds.uid = uid; scm->creds.gid = gid; @@ -78,7 +78,7 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm, static __inline__ void scm_destroy_cred(struct scm_cookie *scm) { put_pid(scm->pid); - scm->pid = NULL; + scm->pid = NULL; } static __inline__ void scm_destroy(struct scm_cookie *scm) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index df2174d9904d..323e4fc85d4b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1929,7 +1929,7 @@ static void unix_destruct_scm(struct sk_buff *skb) struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; + scm.pid = UNIXCB(skb).pid; if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); From 2775832f71e53a294c93fa4b343a71787a87e5d3 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 4 Jul 2025 00:23:09 +0200 Subject: [PATCH 33/37] af_unix: stash pidfs dentry when needed We need to ensure that pidfs dentry is allocated when we meet any struct pid for the first time. This will allows us to open pidfd even after the task it corresponds to is reaped. Basically, we need to identify all places where we fill skb/scm_cookie with struct pid reference for the first time and call pidfs_register_pid(). Tricky thing here is that we have a few places where this happends depending on what userspace is doing: - [__scm_replace_pid()] explicitly sending an SCM_CREDENTIALS message and specified pid in a numeric format - [unix_maybe_add_creds()] enabled SO_PASSCRED/SO_PASSPIDFD but didn't send SCM_CREDENTIALS explicitly - [scm_send()] force_creds is true. Netlink case, we don't need to touch it. Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Leon Romanovsky Cc: Arnd Bergmann Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: David Rheinsberg Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/20250703222314.309967-6-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- net/core/scm.c | 7 +++++++ net/unix/af_unix.c | 11 ++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/net/core/scm.c b/net/core/scm.c index 045ab5bdac7d..358a4e04d46c 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -147,9 +148,15 @@ EXPORT_SYMBOL(__scm_destroy); static inline int scm_replace_pid(struct scm_cookie *scm, struct pid *pid) { + int err; + /* drop all previous references */ scm_destroy_cred(scm); + err = pidfs_register_pid(pid); + if (unlikely(err)) + return err; + scm->pid = pid; scm->creds.pid = pid_vnr(pid); return 0; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 323e4fc85d4b..d52811321fce 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1971,6 +1971,7 @@ static void unix_skb_to_scm(struct sk_buff *skb, struct scm_cookie *scm) * We include credentials if source or destination socket * asserted SOCK_PASSCRED. * + * Context: May sleep. * Return: On success zero, on error a negative error code is returned. */ static int unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, @@ -1980,7 +1981,15 @@ static int unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, return 0; if (unix_may_passcred(sk) || unix_may_passcred(other)) { - UNIXCB(skb).pid = get_pid(task_tgid(current)); + struct pid *pid; + int err; + + pid = task_tgid(current); + err = pidfs_register_pid(pid); + if (unlikely(err)) + return err; + + UNIXCB(skb).pid = get_pid(pid); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } From c679d17d3f2d895b34e660673141ad250889831f Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 4 Jul 2025 00:23:10 +0200 Subject: [PATCH 34/37] af_unix: enable handing out pidfds for reaped tasks in SCM_PIDFD Now everything is ready to pass PIDFD_STALE to pidfd_prepare(). This will allow opening pidfd for reaped tasks. Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Willem de Bruijn Cc: Leon Romanovsky Cc: Arnd Bergmann Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: David Rheinsberg Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/20250703222314.309967-7-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- net/core/scm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/scm.c b/net/core/scm.c index 358a4e04d46c..072d5742440a 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -482,7 +483,7 @@ static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) if (!scm->pid) return; - pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); + pidfd = pidfd_prepare(scm->pid, PIDFD_STALE, &pidfd_file); if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { if (pidfd_file) { From 861bdc6314a49520d9b3778b763461517c1321c0 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 4 Jul 2025 00:23:11 +0200 Subject: [PATCH 35/37] selftests: net: extend SCM_PIDFD test to cover stale pidfds Extend SCM_PIDFD test scenarios to also cover dead task's pidfd retrieval and reading its exit info. Cc: linux-kselftest@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: Shuah Khan Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: David Rheinsberg Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/20250703222314.309967-8-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- .../testing/selftests/net/af_unix/scm_pidfd.c | 221 ++++++++++++++---- 1 file changed, 175 insertions(+), 46 deletions(-) diff --git a/tools/testing/selftests/net/af_unix/scm_pidfd.c b/tools/testing/selftests/net/af_unix/scm_pidfd.c index 7e534594167e..37e034874034 100644 --- a/tools/testing/selftests/net/af_unix/scm_pidfd.c +++ b/tools/testing/selftests/net/af_unix/scm_pidfd.c @@ -15,6 +15,7 @@ #include #include +#include "../../pidfd/pidfd.h" #include "../../kselftest_harness.h" #define clean_errno() (errno == 0 ? "None" : strerror(errno)) @@ -26,6 +27,8 @@ #define SCM_PIDFD 0x04 #endif +#define CHILD_EXIT_CODE_OK 123 + static void child_die() { exit(1); @@ -126,16 +129,65 @@ out: return result; } +struct cmsg_data { + struct ucred *ucred; + int *pidfd; +}; + +static int parse_cmsg(struct msghdr *msg, struct cmsg_data *res) +{ + struct cmsghdr *cmsg; + int data = 0; + + if (msg->msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + log_err("recvmsg: truncated"); + return 1; + } + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_PIDFD) { + if (cmsg->cmsg_len < sizeof(*res->pidfd)) { + log_err("CMSG parse: SCM_PIDFD wrong len"); + return 1; + } + + res->pidfd = (void *)CMSG_DATA(cmsg); + } + + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_CREDENTIALS) { + if (cmsg->cmsg_len < sizeof(*res->ucred)) { + log_err("CMSG parse: SCM_CREDENTIALS wrong len"); + return 1; + } + + res->ucred = (void *)CMSG_DATA(cmsg); + } + } + + if (!res->pidfd) { + log_err("CMSG parse: SCM_PIDFD not found"); + return 1; + } + + if (!res->ucred) { + log_err("CMSG parse: SCM_CREDENTIALS not found"); + return 1; + } + + return 0; +} + static int cmsg_check(int fd) { struct msghdr msg = { 0 }; - struct cmsghdr *cmsg; + struct cmsg_data res; struct iovec iov; - struct ucred *ucred = NULL; int data = 0; char control[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))] = { 0 }; - int *pidfd = NULL; pid_t parent_pid; int err; @@ -158,53 +210,99 @@ static int cmsg_check(int fd) return 1; } - for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; - cmsg = CMSG_NXTHDR(&msg, cmsg)) { - if (cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SCM_PIDFD) { - if (cmsg->cmsg_len < sizeof(*pidfd)) { - log_err("CMSG parse: SCM_PIDFD wrong len"); - return 1; - } - - pidfd = (void *)CMSG_DATA(cmsg); - } - - if (cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SCM_CREDENTIALS) { - if (cmsg->cmsg_len < sizeof(*ucred)) { - log_err("CMSG parse: SCM_CREDENTIALS wrong len"); - return 1; - } - - ucred = (void *)CMSG_DATA(cmsg); - } - } - /* send(pfd, "x", sizeof(char), 0) */ if (data != 'x') { log_err("recvmsg: data corruption"); return 1; } - if (!pidfd) { - log_err("CMSG parse: SCM_PIDFD not found"); - return 1; - } - - if (!ucred) { - log_err("CMSG parse: SCM_CREDENTIALS not found"); + if (parse_cmsg(&msg, &res)) { + log_err("CMSG parse: parse_cmsg() failed"); return 1; } /* pidfd from SCM_PIDFD should point to the parent process PID */ parent_pid = - get_pid_from_fdinfo_file(*pidfd, "Pid:", sizeof("Pid:") - 1); + get_pid_from_fdinfo_file(*res.pidfd, "Pid:", sizeof("Pid:") - 1); if (parent_pid != getppid()) { log_err("wrong SCM_PIDFD %d != %d", parent_pid, getppid()); + close(*res.pidfd); return 1; } + close(*res.pidfd); + return 0; +} + +static int cmsg_check_dead(int fd, int expected_pid) +{ + int err; + struct msghdr msg = { 0 }; + struct cmsg_data res; + struct iovec iov; + int data = 0; + char control[CMSG_SPACE(sizeof(struct ucred)) + + CMSG_SPACE(sizeof(int))] = { 0 }; + pid_t client_pid; + struct pidfd_info info = { + .mask = PIDFD_INFO_EXIT, + }; + + iov.iov_base = &data; + iov.iov_len = sizeof(data); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + err = recvmsg(fd, &msg, 0); + if (err < 0) { + log_err("recvmsg"); + return 1; + } + + if (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + log_err("recvmsg: truncated"); + return 1; + } + + /* send(cfd, "y", sizeof(char), 0) */ + if (data != 'y') { + log_err("recvmsg: data corruption"); + return 1; + } + + if (parse_cmsg(&msg, &res)) { + log_err("CMSG parse: parse_cmsg() failed"); + return 1; + } + + /* + * pidfd from SCM_PIDFD should point to the client_pid. + * Let's read exit information and check if it's what + * we expect to see. + */ + if (ioctl(*res.pidfd, PIDFD_GET_INFO, &info)) { + log_err("%s: ioctl(PIDFD_GET_INFO) failed", __func__); + close(*res.pidfd); + return 1; + } + + if (!(info.mask & PIDFD_INFO_EXIT)) { + log_err("%s: No exit information from ioctl(PIDFD_GET_INFO)", __func__); + close(*res.pidfd); + return 1; + } + + err = WIFEXITED(info.exit_code) ? WEXITSTATUS(info.exit_code) : 1; + if (err != CHILD_EXIT_CODE_OK) { + log_err("%s: wrong exit_code %d != %d", __func__, err, CHILD_EXIT_CODE_OK); + close(*res.pidfd); + return 1; + } + + close(*res.pidfd); return 0; } @@ -291,6 +389,24 @@ static void fill_sockaddr(struct sock_addr *addr, bool abstract) memcpy(sun_path_buf, addr->sock_name, strlen(addr->sock_name)); } +static int sk_enable_cred_pass(int sk) +{ + int on = 0; + + on = 1; + if (setsockopt(sk, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on))) { + log_err("Failed to set SO_PASSCRED"); + return 1; + } + + if (setsockopt(sk, SOL_SOCKET, SO_PASSPIDFD, &on, sizeof(on))) { + log_err("Failed to set SO_PASSPIDFD"); + return 1; + } + + return 0; +} + static void client(FIXTURE_DATA(scm_pidfd) *self, const FIXTURE_VARIANT(scm_pidfd) *variant) { @@ -299,7 +415,6 @@ static void client(FIXTURE_DATA(scm_pidfd) *self, struct ucred peer_cred; int peer_pidfd; pid_t peer_pid; - int on = 0; cfd = socket(AF_UNIX, variant->type, 0); if (cfd < 0) { @@ -322,14 +437,8 @@ static void client(FIXTURE_DATA(scm_pidfd) *self, child_die(); } - on = 1; - if (setsockopt(cfd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on))) { - log_err("Failed to set SO_PASSCRED"); - child_die(); - } - - if (setsockopt(cfd, SOL_SOCKET, SO_PASSPIDFD, &on, sizeof(on))) { - log_err("Failed to set SO_PASSPIDFD"); + if (sk_enable_cred_pass(cfd)) { + log_err("sk_enable_cred_pass() failed"); child_die(); } @@ -340,6 +449,12 @@ static void client(FIXTURE_DATA(scm_pidfd) *self, child_die(); } + /* send something to the parent so it can receive SCM_PIDFD too and validate it */ + if (send(cfd, "y", sizeof(char), 0) == -1) { + log_err("Failed to send(cfd, \"y\", sizeof(char), 0)"); + child_die(); + } + /* skip further for SOCK_DGRAM as it's not applicable */ if (variant->type == SOCK_DGRAM) return; @@ -398,7 +513,13 @@ TEST_F(scm_pidfd, test) close(self->server); close(self->startup_pipe[0]); client(self, variant); - exit(0); + + /* + * It's a bit unusual, but in case of success we return non-zero + * exit code (CHILD_EXIT_CODE_OK) and then we expect to read it + * from ioctl(PIDFD_GET_INFO) in cmsg_check_dead(). + */ + exit(CHILD_EXIT_CODE_OK); } close(self->startup_pipe[1]); @@ -421,9 +542,17 @@ TEST_F(scm_pidfd, test) ASSERT_NE(-1, err); } - close(pfd); waitpid(self->client_pid, &child_status, 0); - ASSERT_EQ(0, WIFEXITED(child_status) ? WEXITSTATUS(child_status) : 1); + /* see comment before exit(CHILD_EXIT_CODE_OK) */ + ASSERT_EQ(CHILD_EXIT_CODE_OK, WIFEXITED(child_status) ? WEXITSTATUS(child_status) : 1); + + err = sk_enable_cred_pass(pfd); + ASSERT_EQ(0, err); + + err = cmsg_check_dead(pfd, self->client_pid); + ASSERT_EQ(0, err); + + close(pfd); } TEST_HARNESS_MAIN From a683a5b2ba23598ad343e5ec10a4ef4077497fc9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 2 Jul 2025 06:34:37 +0100 Subject: [PATCH 36/37] fold fs_struct->{lock,seq} into a seqlock The combination of spinlock_t lock and seqcount_spinlock_t seq in struct fs_struct is an open-coded seqlock_t (see linux/seqlock_types.h). Combine and switch to equivalent seqlock_t primitives. AFAICS, that does end up with the same sequence of underlying operations in all cases. While we are at it, get_fs_pwd() is open-coded verbatim in get_path_from_fd(); rather than applying conversion to it, replace with the call of get_fs_pwd() there. Not worth splitting the commit for that, IMO... A bit of historical background - conversion of seqlock_t to use of seqcount_spinlock_t happened several months after the same had been done to struct fs_struct; switching fs_struct to seqlock_t could've been done immediately after that, but it looks like nobody had gotten around to that until now. Signed-off-by: Al Viro Link: https://lore.kernel.org/20250702053437.GC1880847@ZenIV Acked-by: Ahmed S. Darwish Acked-by: Peter Zijlstra (Intel) Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/d_path.c | 8 ++++---- fs/exec.c | 4 ++-- fs/fhandle.c | 6 +----- fs/fs_struct.c | 36 ++++++++++++++---------------------- fs/namei.c | 8 ++++---- include/linux/fs_struct.h | 11 +++++------ kernel/fork.c | 10 +++++----- 7 files changed, 35 insertions(+), 48 deletions(-) diff --git a/fs/d_path.c b/fs/d_path.c index 5f4da5c8d5db..bb365511066b 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -241,9 +241,9 @@ static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); *root = fs->root; - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } /** @@ -385,10 +385,10 @@ static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); *root = fs->root; *pwd = fs->pwd; - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } /* diff --git a/fs/exec.c b/fs/exec.c index 1f5fdd2e096e..871078ddb220 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1510,7 +1510,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * state is protected by cred_guard_mutex we hold. */ n_fs = 1; - spin_lock(&p->fs->lock); + read_seqlock_excl(&p->fs->seq); rcu_read_lock(); for_other_threads(p, t) { if (t->fs == p->fs) @@ -1523,7 +1523,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) bprm->unsafe |= LSM_UNSAFE_SHARE; else p->fs->in_exec = 1; - spin_unlock(&p->fs->lock); + read_sequnlock_excl(&p->fs->seq); } static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) diff --git a/fs/fhandle.c b/fs/fhandle.c index b1363ead6c5e..7c236f64cdea 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -180,11 +180,7 @@ static int get_path_anchor(int fd, struct path *root) } if (fd == AT_FDCWD) { - struct fs_struct *fs = current->fs; - spin_lock(&fs->lock); - *root = fs->pwd; - path_get(root); - spin_unlock(&fs->lock); + get_fs_pwd(current->fs, root); return 0; } diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 64c2d0814ed6..28be762ac1c6 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -17,12 +17,10 @@ void set_fs_root(struct fs_struct *fs, const struct path *path) struct path old_root; path_get(path); - spin_lock(&fs->lock); - write_seqcount_begin(&fs->seq); + write_seqlock(&fs->seq); old_root = fs->root; fs->root = *path; - write_seqcount_end(&fs->seq); - spin_unlock(&fs->lock); + write_sequnlock(&fs->seq); if (old_root.dentry) path_put(&old_root); } @@ -36,12 +34,10 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path) struct path old_pwd; path_get(path); - spin_lock(&fs->lock); - write_seqcount_begin(&fs->seq); + write_seqlock(&fs->seq); old_pwd = fs->pwd; fs->pwd = *path; - write_seqcount_end(&fs->seq); - spin_unlock(&fs->lock); + write_sequnlock(&fs->seq); if (old_pwd.dentry) path_put(&old_pwd); @@ -67,16 +63,14 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root) fs = p->fs; if (fs) { int hits = 0; - spin_lock(&fs->lock); - write_seqcount_begin(&fs->seq); + write_seqlock(&fs->seq); hits += replace_path(&fs->root, old_root, new_root); hits += replace_path(&fs->pwd, old_root, new_root); - write_seqcount_end(&fs->seq); while (hits--) { count++; path_get(new_root); } - spin_unlock(&fs->lock); + write_sequnlock(&fs->seq); } task_unlock(p); } @@ -99,10 +93,10 @@ void exit_fs(struct task_struct *tsk) if (fs) { int kill; task_lock(tsk); - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); tsk->fs = NULL; kill = !--fs->users; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); task_unlock(tsk); if (kill) free_fs_struct(fs); @@ -116,16 +110,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) if (fs) { fs->users = 1; fs->in_exec = 0; - spin_lock_init(&fs->lock); - seqcount_spinlock_init(&fs->seq, &fs->lock); + seqlock_init(&fs->seq); fs->umask = old->umask; - spin_lock(&old->lock); + read_seqlock_excl(&old->seq); fs->root = old->root; path_get(&fs->root); fs->pwd = old->pwd; path_get(&fs->pwd); - spin_unlock(&old->lock); + read_sequnlock_excl(&old->seq); } return fs; } @@ -140,10 +133,10 @@ int unshare_fs_struct(void) return -ENOMEM; task_lock(current); - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); kill = !--fs->users; current->fs = new_fs; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); task_unlock(current); if (kill) @@ -162,7 +155,6 @@ EXPORT_SYMBOL(current_umask); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, - .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), - .seq = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock), + .seq = __SEQLOCK_UNLOCKED(init_fs.seq), .umask = 0022, }; diff --git a/fs/namei.c b/fs/namei.c index 4bb889fc980b..f2fcaf84e111 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1012,10 +1012,10 @@ static int set_root(struct nameidata *nd) unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); nd->state |= ND_ROOT_GRABBED; @@ -2580,11 +2580,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags) unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); nd->path = fs->pwd; nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 783b48dedb72..baf200ab5c77 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -8,8 +8,7 @@ struct fs_struct { int users; - spinlock_t lock; - seqcount_spinlock_t seq; + seqlock_t seq; int umask; int in_exec; struct path root, pwd; @@ -26,18 +25,18 @@ extern int unshare_fs_struct(void); static inline void get_fs_root(struct fs_struct *fs, struct path *root) { - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); *root = fs->root; path_get(root); - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); } static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd) { - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); *pwd = fs->pwd; path_get(pwd); - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); } extern bool current_chrooted(void); diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..6318a25a16ba 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1542,14 +1542,14 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); /* "users" and "in_exec" locked for check_unsafe_exec() */ if (fs->in_exec) { - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); return -EAGAIN; } fs->users++; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); return 0; } tsk->fs = copy_fs_struct(fs); @@ -3149,13 +3149,13 @@ int ksys_unshare(unsigned long unshare_flags) if (new_fs) { fs = current->fs; - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); current->fs = new_fs; if (--fs->users) new_fs = NULL; else new_fs = fs; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); } if (new_fd) From 1f531e35c146cca22dc6f4a1bc657098f146f358 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 12 Jul 2025 06:41:57 +0100 Subject: [PATCH 37/37] don't bother with path_get()/path_put() in unix_open_file() Once unix_sock ->path is set, we are guaranteed that its ->path will remain unchanged (and pinned) until the socket is closed. OTOH, dentry_open() does not modify the path passed to it. IOW, there's no need to copy unix_sk(sk)->path in unix_open_file() - we can just pass it to dentry_open() and be done with that. Signed-off-by: Al Viro Link: https://lore.kernel.org/20250712054157.GZ1880847@ZenIV Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- net/unix/af_unix.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d52811321fce..c247fb9ac761 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3201,7 +3201,6 @@ EXPORT_SYMBOL_GPL(unix_outq_len); static int unix_open_file(struct sock *sk) { - struct path path; struct file *f; int fd; @@ -3211,27 +3210,20 @@ static int unix_open_file(struct sock *sk) if (!smp_load_acquire(&unix_sk(sk)->addr)) return -ENOENT; - path = unix_sk(sk)->path; - if (!path.dentry) + if (!unix_sk(sk)->path.dentry) return -ENOENT; - path_get(&path); - fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) - goto out; + return fd; - f = dentry_open(&path, O_PATH, current_cred()); + f = dentry_open(&unix_sk(sk)->path, O_PATH, current_cred()); if (IS_ERR(f)) { put_unused_fd(fd); - fd = PTR_ERR(f); - goto out; + return PTR_ERR(f); } fd_install(fd, f); -out: - path_put(&path); - return fd; }