diff --git a/fs/coredump.c b/fs/coredump.c index fadf9d4be2e1..fedbead956ed 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -710,11 +710,6 @@ static bool coredump_sock_connect(struct core_name *cn, struct coredump_params * retval = kernel_connect(socket, (struct sockaddr *)(&addr), addr_len, O_NONBLOCK | SOCK_COREDUMP); - /* - * ... Make sure to only put our reference after connect() took - * its own reference keeping the pidfs entry alive ... - */ - pidfs_put_pid(cprm->pid); if (retval) { if (retval == -EAGAIN) diff --git a/fs/d_path.c b/fs/d_path.c index 5f4da5c8d5db..bb365511066b 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -241,9 +241,9 @@ static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); *root = fs->root; - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } /** @@ -385,10 +385,10 @@ static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); *root = fs->root; *pwd = fs->pwd; - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } /* diff --git a/fs/exec.c b/fs/exec.c index ba400aafd640..fe895e47f1dd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1515,7 +1515,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * state is protected by cred_guard_mutex we hold. 
*/ n_fs = 1; - spin_lock(&p->fs->lock); + read_seqlock_excl(&p->fs->seq); rcu_read_lock(); for_other_threads(p, t) { if (t->fs == p->fs) @@ -1528,7 +1528,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) bprm->unsafe |= LSM_UNSAFE_SHARE; else p->fs->in_exec = 1; - spin_unlock(&p->fs->lock); + read_sequnlock_excl(&p->fs->seq); } static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) diff --git a/fs/fhandle.c b/fs/fhandle.c index 3e092ae6d142..7c236f64cdea 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -88,7 +88,7 @@ static long do_sys_name_to_handle(const struct path *path, if (fh_flags & EXPORT_FH_CONNECTABLE) { handle->handle_type |= FILEID_IS_CONNECTABLE; if (d_is_dir(path->dentry)) - fh_flags |= FILEID_IS_DIR; + handle->handle_type |= FILEID_IS_DIR; } retval = 0; } @@ -168,23 +168,28 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, return err; } -static int get_path_from_fd(int fd, struct path *root) +static int get_path_anchor(int fd, struct path *root) { - if (fd == AT_FDCWD) { - struct fs_struct *fs = current->fs; - spin_lock(&fs->lock); - *root = fs->pwd; - path_get(root); - spin_unlock(&fs->lock); - } else { + if (fd >= 0) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; *root = fd_file(f)->f_path; path_get(root); + return 0; } - return 0; + if (fd == AT_FDCWD) { + get_fs_pwd(current->fs, root); + return 0; + } + + if (fd == FD_PIDFS_ROOT) { + pidfs_get_root(root); + return 0; + } + + return -EBADF; } static int vfs_dentry_acceptable(void *context, struct dentry *dentry) @@ -323,13 +328,24 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, { int retval = 0; struct file_handle f_handle; - struct file_handle *handle = NULL; + struct file_handle *handle __free(kfree) = NULL; struct handle_to_path_ctx ctx = {}; const struct export_operations *eops; - retval = get_path_from_fd(mountdirfd, &ctx.root); + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) + return 
-EFAULT; + + if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || + (f_handle.handle_bytes == 0)) + return -EINVAL; + + if (f_handle.handle_type < 0 || + FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) + return -EINVAL; + + retval = get_path_anchor(mountdirfd, &ctx.root); if (retval) - goto out_err; + return retval; eops = ctx.root.mnt->mnt_sb->s_export_op; if (eops && eops->permission) @@ -339,21 +355,6 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, if (retval) goto out_path; - if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { - retval = -EFAULT; - goto out_path; - } - if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || - (f_handle.handle_bytes == 0)) { - retval = -EINVAL; - goto out_path; - } - if (f_handle.handle_type < 0 || - FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) { - retval = -EINVAL; - goto out_path; - } - handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) { @@ -366,7 +367,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, &ufh->f_handle, f_handle.handle_bytes)) { retval = -EFAULT; - goto out_handle; + goto out_path; } /* @@ -384,11 +385,8 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, handle->handle_type &= ~FILEID_USER_FLAGS_MASK; retval = do_handle_to_path(handle, path, &ctx); -out_handle: - kfree(handle); out_path: path_put(&ctx.root); -out_err: return retval; } diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 64c2d0814ed6..28be762ac1c6 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -17,12 +17,10 @@ void set_fs_root(struct fs_struct *fs, const struct path *path) struct path old_root; path_get(path); - spin_lock(&fs->lock); - write_seqcount_begin(&fs->seq); + write_seqlock(&fs->seq); old_root = fs->root; fs->root = *path; - write_seqcount_end(&fs->seq); - spin_unlock(&fs->lock); + write_sequnlock(&fs->seq); if (old_root.dentry) path_put(&old_root); } @@ -36,12 
+34,10 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path) struct path old_pwd; path_get(path); - spin_lock(&fs->lock); - write_seqcount_begin(&fs->seq); + write_seqlock(&fs->seq); old_pwd = fs->pwd; fs->pwd = *path; - write_seqcount_end(&fs->seq); - spin_unlock(&fs->lock); + write_sequnlock(&fs->seq); if (old_pwd.dentry) path_put(&old_pwd); @@ -67,16 +63,14 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root) fs = p->fs; if (fs) { int hits = 0; - spin_lock(&fs->lock); - write_seqcount_begin(&fs->seq); + write_seqlock(&fs->seq); hits += replace_path(&fs->root, old_root, new_root); hits += replace_path(&fs->pwd, old_root, new_root); - write_seqcount_end(&fs->seq); while (hits--) { count++; path_get(new_root); } - spin_unlock(&fs->lock); + write_sequnlock(&fs->seq); } task_unlock(p); } @@ -99,10 +93,10 @@ void exit_fs(struct task_struct *tsk) if (fs) { int kill; task_lock(tsk); - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); tsk->fs = NULL; kill = !--fs->users; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); task_unlock(tsk); if (kill) free_fs_struct(fs); @@ -116,16 +110,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) if (fs) { fs->users = 1; fs->in_exec = 0; - spin_lock_init(&fs->lock); - seqcount_spinlock_init(&fs->seq, &fs->lock); + seqlock_init(&fs->seq); fs->umask = old->umask; - spin_lock(&old->lock); + read_seqlock_excl(&old->seq); fs->root = old->root; path_get(&fs->root); fs->pwd = old->pwd; path_get(&fs->pwd); - spin_unlock(&old->lock); + read_sequnlock_excl(&old->seq); } return fs; } @@ -140,10 +133,10 @@ int unshare_fs_struct(void) return -ENOMEM; task_lock(current); - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); kill = !--fs->users; current->fs = new_fs; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); task_unlock(current); if (kill) @@ -162,7 +155,6 @@ EXPORT_SYMBOL(current_umask); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, - 
.lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), - .seq = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock), + .seq = __SEQLOCK_UNLOCKED(init_fs.seq), .umask = 0022, }; diff --git a/fs/internal.h b/fs/internal.h index d733d8bb3d1f..38e8aab27bbd 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -323,12 +323,15 @@ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); struct stashed_operations { + struct dentry *(*stash_dentry)(struct dentry **stashed, + struct dentry *dentry); void (*put_data)(void *data); int (*init_inode)(struct inode *inode, void *data); }; int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); +struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry); struct dentry *stashed_dentry_get(struct dentry **stashed); /** * path_mounted - check whether path is mounted @@ -351,3 +354,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, unsigned int query_flags); int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); +void pidfs_get_root(struct path *path); diff --git a/fs/libfs.c b/fs/libfs.c index 67bd8eea4af1..ce8c496a6940 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -2143,6 +2143,8 @@ struct dentry *stashed_dentry_get(struct dentry **stashed) dentry = rcu_dereference(*stashed); if (!dentry) return NULL; + if (IS_ERR(dentry)) + return dentry; if (!lockref_get_not_dead(&dentry->d_lockref)) return NULL; return dentry; @@ -2175,7 +2177,6 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed, /* Notice when this is changed. 
*/ WARN_ON_ONCE(!S_ISREG(inode->i_mode)); - WARN_ON_ONCE(!IS_IMMUTABLE(inode)); dentry = d_alloc_anon(sb); if (!dentry) { @@ -2191,8 +2192,7 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed, return dentry; } -static struct dentry *stash_dentry(struct dentry **stashed, - struct dentry *dentry) +struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry) { guard(rcu)(); for (;;) { @@ -2233,14 +2233,16 @@ static struct dentry *stash_dentry(struct dentry **stashed, int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path) { - struct dentry *dentry; + struct dentry *dentry, *res; const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; /* See if dentry can be reused. */ - path->dentry = stashed_dentry_get(stashed); - if (path->dentry) { + res = stashed_dentry_get(stashed); + if (IS_ERR(res)) + return PTR_ERR(res); + if (res) { sops->put_data(data); - goto out_path; + goto make_path; } /* Allocate a new dentry. */ @@ -2249,14 +2251,22 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, return PTR_ERR(dentry); /* Added a new dentry. @data is now owned by the filesystem. 
*/ - path->dentry = stash_dentry(stashed, dentry); - if (path->dentry != dentry) + if (sops->stash_dentry) + res = sops->stash_dentry(stashed, dentry); + else + res = stash_dentry(stashed, dentry); + if (IS_ERR(res)) { + dput(dentry); + return PTR_ERR(res); + } + if (res != dentry) dput(dentry); -out_path: - WARN_ON_ONCE(path->dentry->d_fsdata != stashed); - WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); +make_path: + path->dentry = res; path->mnt = mntget(mnt); + VFS_WARN_ON_ONCE(path->dentry->d_fsdata != stashed); + VFS_WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); return 0; } diff --git a/fs/namei.c b/fs/namei.c index a1cd129a2e8b..cd43ff89fbaa 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1012,10 +1012,10 @@ static int set_root(struct nameidata *nd) unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); nd->state |= ND_ROOT_GRABBED; @@ -2571,11 +2571,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags) unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); nd->path = fs->pwd; nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; diff --git a/fs/pidfs.c b/fs/pidfs.c index 4625e097e3a0..edc35522d75c 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -21,11 +21,23 @@ #include #include #include +#include #include "internal.h" #include "mount.h" -static struct kmem_cache *pidfs_cachep __ro_after_init; +#define PIDFS_PID_DEAD ERR_PTR(-ESRCH) + +static struct kmem_cache *pidfs_attr_cachep __ro_after_init; +static struct kmem_cache 
*pidfs_xattr_cachep __ro_after_init; + +static struct path pidfs_root_path = {}; + +void pidfs_get_root(struct path *path) +{ + *path = pidfs_root_path; + path_get(path); +} /* * Stashes information that userspace needs to access even after the @@ -37,17 +49,12 @@ struct pidfs_exit_info { __u32 coredump_mask; }; -struct pidfs_inode { +struct pidfs_attr { + struct simple_xattrs *xattrs; struct pidfs_exit_info __pei; struct pidfs_exit_info *exit_info; - struct inode vfs_inode; }; -static inline struct pidfs_inode *pidfs_i(struct inode *inode) -{ - return container_of(inode, struct pidfs_inode, vfs_inode); -} - static struct rb_root pidfs_ino_tree = RB_ROOT; #if BITS_PER_LONG == 32 @@ -125,6 +132,7 @@ void pidfs_add_pid(struct pid *pid) pid->ino = pidfs_ino_nr; pid->stashed = NULL; + pid->attr = NULL; pidfs_ino_nr++; write_seqcount_begin(&pidmap_lock_seq); @@ -139,6 +147,33 @@ void pidfs_remove_pid(struct pid *pid) write_seqcount_end(&pidmap_lock_seq); } +void pidfs_free_pid(struct pid *pid) +{ + struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr); + struct simple_xattrs *xattrs __free(kfree) = NULL; + + /* + * Any dentry must've been wiped from the pid by now. + * Otherwise there's a reference count bug. + */ + VFS_WARN_ON_ONCE(pid->stashed); + + /* + * This happens if an error occurred during e.g., task creation that + * causes us to never go through the exit path. + */ + if (unlikely(!attr)) + return; + + /* This never had a pidfd created. 
*/ + if (IS_ERR(attr)) + return; + + xattrs = no_free_ptr(attr->xattrs); + if (xattrs) + simple_xattrs_free(xattrs, NULL); +} + #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd @@ -261,13 +296,13 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags) static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; - struct inode *inode = file_inode(file); struct pid *pid = pidfd_pid(file); size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; struct pidfs_exit_info *exit_info; struct user_namespace *user_ns; struct task_struct *task; + struct pidfs_attr *attr; const struct cred *c; __u64 mask; @@ -286,8 +321,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if (!pid_in_current_pidns(pid)) return -ESRCH; + attr = READ_ONCE(pid->attr); if (mask & PIDFD_INFO_EXIT) { - exit_info = READ_ONCE(pidfs_i(inode)->exit_info); + exit_info = READ_ONCE(attr->exit_info); if (exit_info) { kinfo.mask |= PIDFD_INFO_EXIT; #ifdef CONFIG_CGROUPS @@ -300,7 +336,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if (mask & PIDFD_INFO_COREDUMP) { kinfo.mask |= PIDFD_INFO_COREDUMP; - kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask); + kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask); } task = get_pid_task(pid, PIDTYPE_PID); @@ -552,41 +588,61 @@ struct pid *pidfd_pid(const struct file *file) * task has been reaped which cannot happen until we're out of * release_task(). * - * If this struct pid is referred to by a pidfd then - * stashed_dentry_get() will return the dentry and inode for that struct - * pid. Since we've taken a reference on it there's now an additional - * reference from the exit path on it. Which is fine. We're going to put - * it again in a second and we know that the pid is kept alive anyway. 
+ * If this struct pid has at least once been referred to by a pidfd then + * pid->attr will be allocated. If not we mark the struct pid as dead so + * anyone who is trying to register it with pidfs will fail to do so. + * Otherwise we would hand out pidfs for reaped tasks without having + * exit information available. * - * Worst case is that we've filled in the info and immediately free the - * dentry and inode afterwards since the pidfd has been closed. Since + * Worst case is that we've filled in the info and the pid gets freed + * right away in free_pid() when no one holds a pidfd anymore. Since * pidfs_exit() currently is placed after exit_task_work() we know that - * it cannot be us aka the exiting task holding a pidfd to ourselves. + * it cannot be us aka the exiting task holding a pidfd to itself. */ void pidfs_exit(struct task_struct *tsk) { - struct dentry *dentry; + struct pid *pid = task_pid(tsk); + struct pidfs_attr *attr; + struct pidfs_exit_info *exit_info; +#ifdef CONFIG_CGROUPS + struct cgroup *cgrp; +#endif might_sleep(); - dentry = stashed_dentry_get(&task_pid(tsk)->stashed); - if (dentry) { - struct inode *inode = d_inode(dentry); - struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei; -#ifdef CONFIG_CGROUPS - struct cgroup *cgrp; - - rcu_read_lock(); - cgrp = task_dfl_cgroup(tsk); - exit_info->cgroupid = cgroup_id(cgrp); - rcu_read_unlock(); -#endif - exit_info->exit_code = tsk->exit_code; - - /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ - smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei); - dput(dentry); + guard(spinlock_irq)(&pid->wait_pidfd.lock); + attr = pid->attr; + if (!attr) { + /* + * No one ever held a pidfd for this struct pid. + * Mark it as dead so no one can add a pidfs + * entry anymore. We're about to be reaped and + * so no exit information would be available. 
+ */ + pid->attr = PIDFS_PID_DEAD; + return; } + + /* + * If @pid->attr is set someone might still legitimately hold a + * pidfd to @pid or someone might concurrently still be getting + * a reference to an already stashed dentry from @pid->stashed. + * So defer cleaning @pid->attr until the last reference to @pid + * is put + */ + + exit_info = &attr->__pei; + +#ifdef CONFIG_CGROUPS + rcu_read_lock(); + cgrp = task_dfl_cgroup(tsk); + exit_info->cgroupid = cgroup_id(cgrp); + rcu_read_unlock(); +#endif + exit_info->exit_code = tsk->exit_code; + + /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ + smp_store_release(&attr->exit_info, &attr->__pei); } #ifdef CONFIG_COREDUMP @@ -594,16 +650,15 @@ void pidfs_coredump(const struct coredump_params *cprm) { struct pid *pid = cprm->pid; struct pidfs_exit_info *exit_info; - struct dentry *dentry; - struct inode *inode; + struct pidfs_attr *attr; __u32 coredump_mask = 0; - dentry = pid->stashed; - if (WARN_ON_ONCE(!dentry)) - return; + attr = READ_ONCE(pid->attr); - inode = d_inode(dentry); - exit_info = &pidfs_i(inode)->__pei; + VFS_WARN_ON_ONCE(!attr); + VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD); + + exit_info = &attr->__pei; /* Note how we were coredumped. */ coredump_mask = pidfs_coredump_mask(cprm->mm_flags); /* Note that we actually did coredump. 
*/ @@ -634,9 +689,24 @@ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, return anon_inode_getattr(idmap, path, stat, request_mask, query_flags); } +static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct inode *inode = d_inode(dentry); + struct pid *pid = inode->i_private; + struct pidfs_attr *attr = pid->attr; + struct simple_xattrs *xattrs; + + xattrs = READ_ONCE(attr->xattrs); + if (!xattrs) + return 0; + + return simple_xattr_list(inode, xattrs, buf, size); +} + static const struct inode_operations pidfs_inode_operations = { - .getattr = pidfs_getattr, - .setattr = pidfs_setattr, + .getattr = pidfs_getattr, + .setattr = pidfs_setattr, + .listxattr = pidfs_listxattr, }; static void pidfs_evict_inode(struct inode *inode) @@ -647,30 +717,9 @@ static void pidfs_evict_inode(struct inode *inode) put_pid(pid); } -static struct inode *pidfs_alloc_inode(struct super_block *sb) -{ - struct pidfs_inode *pi; - - pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL); - if (!pi) - return NULL; - - memset(&pi->__pei, 0, sizeof(pi->__pei)); - pi->exit_info = NULL; - - return &pi->vfs_inode; -} - -static void pidfs_free_inode(struct inode *inode) -{ - kmem_cache_free(pidfs_cachep, pidfs_i(inode)); -} - static const struct super_operations pidfs_sops = { - .alloc_inode = pidfs_alloc_inode, .drop_inode = generic_delete_inode, .evict_inode = pidfs_evict_inode, - .free_inode = pidfs_free_inode, .statfs = simple_statfs, }; @@ -770,6 +819,8 @@ static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, if (ret < 0) return ERR_PTR(ret); + VFS_WARN_ON_ONCE(!pid->attr); + mntput(path.mnt); return path.dentry; } @@ -796,53 +847,8 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } -static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path, - unsigned int flags) -{ - enum pid_type type; - - if (flags & PIDFD_STALE) - return true; - - /* - * Make sure that if a pidfd is created 
PIDFD_INFO_EXIT - * information will be available. So after an inode for the - * pidfd has been allocated perform another check that the pid - * is still alive. If it is exit information is available even - * if the task gets reaped before the pidfd is returned to - * userspace. The only exception are indicated by PIDFD_STALE: - * - * (1) The kernel is in the middle of task creation and thus no - * task linkage has been established yet. - * (2) The caller knows @pid has been registered in pidfs at a - * time when the task was still alive. - * - * In both cases exit information will have been reported. - */ - if (flags & PIDFD_THREAD) - type = PIDTYPE_PID; - else - type = PIDTYPE_TGID; - - /* - * Since pidfs_exit() is called before struct pid's task linkage - * is removed the case where the task got reaped but a dentry - * was already attached to struct pid and exit information was - * recorded and published can be handled correctly. - */ - if (unlikely(!pid_has_task(pid, type))) { - struct inode *inode = d_inode(path->dentry); - return !!READ_ONCE(pidfs_i(inode)->exit_info); - } - - return true; -} - static struct file *pidfs_export_open(struct path *path, unsigned int oflags) { - if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags)) - return ERR_PTR(-ESRCH); - /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise * O_RDWR as pidfds always are. @@ -864,6 +870,8 @@ static int pidfs_init_inode(struct inode *inode, void *data) inode->i_private = data; inode->i_flags |= S_PRIVATE | S_ANON_INODE; + /* We allow to set xattrs. */ + inode->i_flags &= ~S_IMMUTABLE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; @@ -878,9 +886,127 @@ static void pidfs_put_data(void *data) put_pid(pid); } +/** + * pidfs_register_pid - register a struct pid in pidfs + * @pid: pid to pin + * + * Register a struct pid in pidfs. + * + * Return: On success zero, on error a negative error code is returned. 
+ */ +int pidfs_register_pid(struct pid *pid) +{ + struct pidfs_attr *new_attr __free(kfree) = NULL; + struct pidfs_attr *attr; + + might_sleep(); + + if (!pid) + return 0; + + attr = READ_ONCE(pid->attr); + if (unlikely(attr == PIDFS_PID_DEAD)) + return PTR_ERR(PIDFS_PID_DEAD); + if (attr) + return 0; + + new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL); + if (!new_attr) + return -ENOMEM; + + /* Synchronize with pidfs_exit(). */ + guard(spinlock_irq)(&pid->wait_pidfd.lock); + + attr = pid->attr; + if (unlikely(attr == PIDFS_PID_DEAD)) + return PTR_ERR(PIDFS_PID_DEAD); + if (unlikely(attr)) + return 0; + + pid->attr = no_free_ptr(new_attr); + return 0; +} + +static struct dentry *pidfs_stash_dentry(struct dentry **stashed, + struct dentry *dentry) +{ + int ret; + struct pid *pid = d_inode(dentry)->i_private; + + VFS_WARN_ON_ONCE(stashed != &pid->stashed); + + ret = pidfs_register_pid(pid); + if (ret) + return ERR_PTR(ret); + + return stash_dentry(stashed, dentry); +} + static const struct stashed_operations pidfs_stashed_ops = { - .init_inode = pidfs_init_inode, - .put_data = pidfs_put_data, + .stash_dentry = pidfs_stash_dentry, + .init_inode = pidfs_init_inode, + .put_data = pidfs_put_data, +}; + +static int pidfs_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *suffix, void *value, size_t size) +{ + struct pid *pid = inode->i_private; + struct pidfs_attr *attr = pid->attr; + const char *name; + struct simple_xattrs *xattrs; + + xattrs = READ_ONCE(attr->xattrs); + if (!xattrs) + return 0; + + name = xattr_full_name(handler, suffix); + return simple_xattr_get(xattrs, name, value, size); +} + +static int pidfs_xattr_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, struct dentry *unused, + struct inode *inode, const char *suffix, + const void *value, size_t size, int flags) +{ + struct pid *pid = inode->i_private; + struct pidfs_attr *attr = pid->attr; + const char *name; + struct 
simple_xattrs *xattrs; + struct simple_xattr *old_xattr; + + /* Ensure we're the only one to set @attr->xattrs. */ + WARN_ON_ONCE(!inode_is_locked(inode)); + + xattrs = READ_ONCE(attr->xattrs); + if (!xattrs) { + xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL); + if (!xattrs) + return -ENOMEM; + + simple_xattrs_init(xattrs); + smp_store_release(&pid->attr->xattrs, xattrs); + } + + name = xattr_full_name(handler, suffix); + old_xattr = simple_xattr_set(xattrs, name, value, size, flags); + if (IS_ERR(old_xattr)) + return PTR_ERR(old_xattr); + + simple_xattr_free(old_xattr); + return 0; +} + +static const struct xattr_handler pidfs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = pidfs_xattr_get, + .set = pidfs_xattr_set, +}; + +static const struct xattr_handler *const pidfs_xattr_handlers[] = { + &pidfs_trusted_xattr_handler, + NULL }; static int pidfs_init_fs_context(struct fs_context *fc) @@ -891,9 +1017,12 @@ static int pidfs_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; + fc->s_iflags |= SB_I_NOEXEC; + fc->s_iflags |= SB_I_NODEV; ctx->ops = &pidfs_sops; ctx->eops = &pidfs_export_operations; ctx->dops = &pidfs_dentry_operations; + ctx->xattr = pidfs_xattr_handlers; fc->s_fs_info = (void *)&pidfs_stashed_ops; return 0; } @@ -921,8 +1050,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) if (ret < 0) return ERR_PTR(ret); - if (!pidfs_pid_valid(pid, &path, flags)) - return ERR_PTR(-ESRCH); + VFS_WARN_ON_ONCE(!pid->attr); flags &= ~PIDFD_STALE; flags |= O_RDWR; @@ -934,79 +1062,21 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) return pidfd_file; } -/** - * pidfs_register_pid - register a struct pid in pidfs - * @pid: pid to pin - * - * Register a struct pid in pidfs. Needs to be paired with - * pidfs_put_pid() to not risk leaking the pidfs dentry and inode. - * - * Return: On success zero, on error a negative error code is returned. 
- */ -int pidfs_register_pid(struct pid *pid) -{ - struct path path __free(path_put) = {}; - int ret; - - might_sleep(); - - if (!pid) - return 0; - - ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); - if (unlikely(ret)) - return ret; - /* Keep the dentry and only put the reference to the mount. */ - path.dentry = NULL; - return 0; -} - -/** - * pidfs_get_pid - pin a struct pid through pidfs - * @pid: pid to pin - * - * Similar to pidfs_register_pid() but only valid if the caller knows - * there's a reference to the @pid through a dentry already that can't - * go away. - */ -void pidfs_get_pid(struct pid *pid) -{ - if (!pid) - return; - WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed)); -} - -/** - * pidfs_put_pid - drop a pidfs reference - * @pid: pid to drop - * - * Drop a reference to @pid via pidfs. This is only safe if the - * reference has been taken via pidfs_get_pid(). - */ -void pidfs_put_pid(struct pid *pid) -{ - might_sleep(); - - if (!pid) - return; - VFS_WARN_ON_ONCE(!pid->stashed); - dput(pid->stashed); -} - -static void pidfs_inode_init_once(void *data) -{ - struct pidfs_inode *pi = data; - - inode_init_once(&pi->vfs_inode); -} - void __init pidfs_init(void) { - pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0, + pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_ACCOUNT | SLAB_PANIC), - pidfs_inode_init_once); + SLAB_ACCOUNT | SLAB_PANIC), NULL); + + pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache", + sizeof(struct simple_xattrs), 0, + (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT | SLAB_PANIC), NULL); + pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); + + pidfs_root_path.mnt = pidfs_mnt; + pidfs_root_path.dentry = pidfs_mnt->mnt_root; } diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 
783b48dedb72..baf200ab5c77 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -8,8 +8,7 @@ struct fs_struct { int users; - spinlock_t lock; - seqcount_spinlock_t seq; + seqlock_t seq; int umask; int in_exec; struct path root, pwd; @@ -26,18 +25,18 @@ extern int unshare_fs_struct(void); static inline void get_fs_root(struct fs_struct *fs, struct path *root) { - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); *root = fs->root; path_get(root); - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); } static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd) { - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); *pwd = fs->pwd; path_get(pwd); - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); } extern bool current_chrooted(void); diff --git a/include/linux/pid.h b/include/linux/pid.h index 453ae6d8a68d..003a1027d219 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -47,19 +47,23 @@ #define RESERVED_PIDS 300 +struct pidfs_attr; + struct upid { int nr; struct pid_namespace *ns; }; -struct pid -{ +struct pid { refcount_t count; unsigned int level; spinlock_t lock; - struct dentry *stashed; - u64 ino; - struct rb_node pidfs_node; + struct { + u64 ino; + struct rb_node pidfs_node; + struct dentry *stashed; + struct pidfs_attr *attr; + }; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 77e7db194914..3e08c33da2df 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -14,7 +14,6 @@ void pidfs_coredump(const struct coredump_params *cprm); #endif extern const struct dentry_operations pidfs_dentry_operations; int pidfs_register_pid(struct pid *pid); -void pidfs_get_pid(struct pid *pid); -void pidfs_put_pid(struct pid *pid); +void pidfs_free_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ diff --git a/include/net/scm.h b/include/net/scm.h index 84c4707e78a5..c52519669349 100644 --- 
a/include/net/scm.h +++ b/include/net/scm.h @@ -69,7 +69,7 @@ static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_co static __inline__ void scm_set_cred(struct scm_cookie *scm, struct pid *pid, kuid_t uid, kgid_t gid) { - scm->pid = get_pid(pid); + scm->pid = get_pid(pid); scm->creds.pid = pid_vnr(pid); scm->creds.uid = uid; scm->creds.gid = gid; @@ -78,7 +78,7 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm, static __inline__ void scm_destroy_cred(struct scm_cookie *scm) { put_pid(scm->pid); - scm->pid = NULL; + scm->pid = NULL; } static __inline__ void scm_destroy(struct scm_cookie *scm) diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index a15ac2fa4b20..f291ab4f94eb 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -90,10 +90,28 @@ #define DN_ATTRIB 0x00000020 /* File changed attibutes */ #define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ +/* Reserved kernel ranges [-100], [-10000, -40000]. */ #define AT_FDCWD -100 /* Special value for dirfd used to indicate openat should use the current working directory. */ +/* + * The concept of process and threads in userland and the kernel is a confusing + * one - within the kernel every thread is a 'task' with its own individual PID, + * however from userland's point of view threads are grouped by a single PID, + * which is that of the 'thread group leader', typically the first thread + * spawned. + * + * To cut the Gordian knot, for internal kernel usage, we refer to + * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel + * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread + * group leader... + */ +#define PIDFD_SELF_THREAD -10000 /* Current thread. */ +#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. 
*/ + +#define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ +#define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index c27a4e238e4b..957db425d459 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -42,21 +42,6 @@ #define PIDFD_COREDUMP_USER (1U << 2) /* coredump was done as the user. */ #define PIDFD_COREDUMP_ROOT (1U << 3) /* coredump was done as root. */ -/* - * The concept of process and threads in userland and the kernel is a confusing - * one - within the kernel every thread is a 'task' with its own individual PID, - * however from userland's point of view threads are grouped by a single PID, - * which is that of the 'thread group leader', typically the first thread - * spawned. - * - * To cut the Gideon knot, for internal kernel usage, we refer to - * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel - * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread - * group leader... - */ -#define PIDFD_SELF_THREAD -10000 /* Current thread. */ -#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ - /* * ...and for userland we make life simpler - PIDFD_SELF refers to the current * thread, PIDFD_SELF_PROCESS refers to the process thread group leader. 
diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..6318a25a16ba 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1542,14 +1542,14 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); /* "users" and "in_exec" locked for check_unsafe_exec() */ if (fs->in_exec) { - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); return -EAGAIN; } fs->users++; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); return 0; } tsk->fs = copy_fs_struct(fs); @@ -3149,13 +3149,13 @@ int ksys_unshare(unsigned long unshare_flags) if (new_fs) { fs = current->fs; - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); current->fs = new_fs; if (--fs->users) new_fs = NULL; else new_fs = fs; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); } if (new_fd) diff --git a/kernel/pid.c b/kernel/pid.c index 8317bcbc7cf7..07db7d8d066c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -100,7 +100,7 @@ void put_pid(struct pid *pid) ns = pid->numbers[pid->level].ns; if (refcount_dec_and_test(&pid->count)) { - WARN_ON_ONCE(pid->stashed); + pidfs_free_pid(pid); kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } diff --git a/net/core/scm.c b/net/core/scm.c index 0225bd94170f..072d5742440a 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include #include @@ -145,6 +147,22 @@ void __scm_destroy(struct scm_cookie *scm) } EXPORT_SYMBOL(__scm_destroy); +static inline int scm_replace_pid(struct scm_cookie *scm, struct pid *pid) +{ + int err; + + /* drop all previous references */ + scm_destroy_cred(scm); + + err = pidfs_register_pid(pid); + if (unlikely(err)) + return err; + + scm->pid = pid; + scm->creds.pid = pid_vnr(pid); + return 0; +} + int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { const 
struct proto_ops *ops = READ_ONCE(sock->ops); @@ -189,15 +207,21 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) if (err) goto error; - p->creds.pid = creds.pid; if (!p->pid || pid_vnr(p->pid) != creds.pid) { struct pid *pid; err = -ESRCH; pid = find_get_pid(creds.pid); if (!pid) goto error; - put_pid(p->pid); - p->pid = pid; + + /* pass a struct pid reference from + * find_get_pid() to scm_replace_pid(). + */ + err = scm_replace_pid(p, pid); + if (err) { + put_pid(pid); + goto error; + } } err = -EINVAL; @@ -459,7 +483,7 @@ static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) if (!scm->pid) return; - pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); + pidfd = pidfd_prepare(scm->pid, PIDFD_STALE, &pidfd_file); if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { if (pidfd_file) { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 52b155123985..a8895786e016 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -646,9 +646,6 @@ static void unix_sock_destructor(struct sock *sk) return; } - if (sk->sk_peer_pid) - pidfs_put_pid(sk->sk_peer_pid); - if (u->addr) unix_release_addr(u->addr); @@ -780,7 +777,6 @@ static void drop_peercred(struct unix_peercred *peercred) swap(peercred->peer_pid, pid); swap(peercred->peer_cred, cred); - pidfs_put_pid(pid); put_pid(pid); put_cred(cred); } @@ -813,7 +809,6 @@ static void copy_peercred(struct sock *sk, struct sock *peersk) spin_lock(&sk->sk_peer_lock); sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); - pidfs_get_pid(sk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); } @@ -1945,7 +1940,7 @@ static void unix_destruct_scm(struct sk_buff *skb) struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; + scm.pid = UNIXCB(skb).pid; if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); @@ -1971,22 +1966,46 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen return 
err; } -/* +static void unix_skb_to_scm(struct sk_buff *skb, struct scm_cookie *scm) +{ + scm_set_cred(scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); + unix_set_secdata(scm, skb); +} + +/** + * unix_maybe_add_creds() - Adds current task uid/gid and struct pid to skb if needed. + * @skb: skb to attach creds to. + * @sk: Sender sock. + * @other: Receiver sock. + * * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket * asserted SOCK_PASSCRED. + * + * Context: May sleep. + * Return: On success zero, on error a negative error code is returned. */ -static void unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, - const struct sock *other) +static int unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, + const struct sock *other) { if (UNIXCB(skb).pid) - return; + return 0; if (unix_may_passcred(sk) || unix_may_passcred(other) || !other->sk_socket) { - UNIXCB(skb).pid = get_pid(task_tgid(current)); + struct pid *pid; + int err; + + pid = task_tgid(current); + err = pidfs_register_pid(pid); + if (unlikely(err)) + return err; + + UNIXCB(skb).pid = get_pid(pid); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } + + return 0; } static bool unix_skb_scm_eq(struct sk_buff *skb, @@ -2121,6 +2140,10 @@ lookup: goto out_sock_put; } + err = unix_maybe_add_creds(skb, sk, other); + if (err) + goto out_sock_put; + restart: sk_locked = 0; unix_state_lock(other); @@ -2229,7 +2252,6 @@ restart_locked: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); - unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); @@ -2273,6 +2295,10 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, if (err < 0) goto out; + err = unix_maybe_add_creds(skb, sk, other); + if (err) + goto out; + skb_put(skb, 1); err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); @@ -2292,7 +2318,6 @@ static 
int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, goto out_unlock; } - unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); spin_lock(&other->sk_receive_queue.lock); @@ -2386,6 +2411,10 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, fds_sent = true; + err = unix_maybe_add_creds(skb, sk, other); + if (err) + goto out_free; + if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb->ip_summed = CHECKSUM_UNNECESSARY; err = skb_splice_from_iter(skb, &msg->msg_iter, size, @@ -2416,7 +2445,6 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out_free; } - unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); @@ -2564,8 +2592,7 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, memset(&scm, 0, sizeof(scm)); - scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - unix_set_secdata(&scm, skb); + unix_skb_to_scm(skb, &scm); if (!(flags & MSG_PEEK)) { if (UNIXCB(skb).fp) @@ -2954,8 +2981,7 @@ unlock: break; } else if (unix_may_passcred(sk)) { /* Copy credentials */ - scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - unix_set_secdata(&scm, skb); + unix_skb_to_scm(skb, &scm); check_creds = true; } @@ -3191,7 +3217,6 @@ EXPORT_SYMBOL_GPL(unix_outq_len); static int unix_open_file(struct sock *sk) { - struct path path; struct file *f; int fd; @@ -3201,27 +3226,20 @@ static int unix_open_file(struct sock *sk) if (!smp_load_acquire(&unix_sk(sk)->addr)) return -ENOENT; - path = unix_sk(sk)->path; - if (!path.dentry) + if (!unix_sk(sk)->path.dentry) return -ENOENT; - path_get(&path); - fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) - goto out; + return fd; - f = dentry_open(&path, O_PATH, current_cred()); + f = dentry_open(&unix_sk(sk)->path, O_PATH, current_cred()); if (IS_ERR(f)) { put_unused_fd(fd); - fd = PTR_ERR(f); - goto out; + return 
PTR_ERR(f); } fd_install(fd, f); -out: - path_put(&path); - return fd; } diff --git a/tools/testing/selftests/net/af_unix/scm_pidfd.c b/tools/testing/selftests/net/af_unix/scm_pidfd.c index 7e534594167e..37e034874034 100644 --- a/tools/testing/selftests/net/af_unix/scm_pidfd.c +++ b/tools/testing/selftests/net/af_unix/scm_pidfd.c @@ -15,6 +15,7 @@ #include #include +#include "../../pidfd/pidfd.h" #include "../../kselftest_harness.h" #define clean_errno() (errno == 0 ? "None" : strerror(errno)) @@ -26,6 +27,8 @@ #define SCM_PIDFD 0x04 #endif +#define CHILD_EXIT_CODE_OK 123 + static void child_die() { exit(1); @@ -126,16 +129,65 @@ out: return result; } +struct cmsg_data { + struct ucred *ucred; + int *pidfd; +}; + +static int parse_cmsg(struct msghdr *msg, struct cmsg_data *res) +{ + struct cmsghdr *cmsg; + + *res = (struct cmsg_data){ 0 }; /* callers declare res without an initializer */ + if (msg->msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + log_err("recvmsg: truncated"); + return 1; + } + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_PIDFD) { + if (cmsg->cmsg_len < sizeof(*res->pidfd)) { + log_err("CMSG parse: SCM_PIDFD wrong len"); + return 1; + } + + res->pidfd = (void *)CMSG_DATA(cmsg); + } + + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_CREDENTIALS) { + if (cmsg->cmsg_len < sizeof(*res->ucred)) { + log_err("CMSG parse: SCM_CREDENTIALS wrong len"); + return 1; + } + + res->ucred = (void *)CMSG_DATA(cmsg); + } + } + + if (!res->pidfd) { + log_err("CMSG parse: SCM_PIDFD not found"); + return 1; + } + + if (!res->ucred) { + log_err("CMSG parse: SCM_CREDENTIALS not found"); + return 1; + } + + return 0; +} + static int cmsg_check(int fd) { struct msghdr msg = { 0 }; - struct cmsghdr *cmsg; + struct cmsg_data res; struct iovec iov; - struct ucred *ucred = NULL; int data = 0; char control[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))] = { 0 }; - int *pidfd = NULL; pid_t parent_pid; int err; @@ -158,53
+210,99 @@ static int cmsg_check(int fd) return 1; } - for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; - cmsg = CMSG_NXTHDR(&msg, cmsg)) { - if (cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SCM_PIDFD) { - if (cmsg->cmsg_len < sizeof(*pidfd)) { - log_err("CMSG parse: SCM_PIDFD wrong len"); - return 1; - } - - pidfd = (void *)CMSG_DATA(cmsg); - } - - if (cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SCM_CREDENTIALS) { - if (cmsg->cmsg_len < sizeof(*ucred)) { - log_err("CMSG parse: SCM_CREDENTIALS wrong len"); - return 1; - } - - ucred = (void *)CMSG_DATA(cmsg); - } - } - /* send(pfd, "x", sizeof(char), 0) */ if (data != 'x') { log_err("recvmsg: data corruption"); return 1; } - if (!pidfd) { - log_err("CMSG parse: SCM_PIDFD not found"); - return 1; - } - - if (!ucred) { - log_err("CMSG parse: SCM_CREDENTIALS not found"); + if (parse_cmsg(&msg, &res)) { + log_err("CMSG parse: parse_cmsg() failed"); return 1; } /* pidfd from SCM_PIDFD should point to the parent process PID */ parent_pid = - get_pid_from_fdinfo_file(*pidfd, "Pid:", sizeof("Pid:") - 1); + get_pid_from_fdinfo_file(*res.pidfd, "Pid:", sizeof("Pid:") - 1); if (parent_pid != getppid()) { log_err("wrong SCM_PIDFD %d != %d", parent_pid, getppid()); + close(*res.pidfd); return 1; } + close(*res.pidfd); + return 0; +} + +static int cmsg_check_dead(int fd, int expected_pid) +{ + int err; + struct msghdr msg = { 0 }; + struct cmsg_data res; + struct iovec iov; + int data = 0; + char control[CMSG_SPACE(sizeof(struct ucred)) + + CMSG_SPACE(sizeof(int))] = { 0 }; + pid_t client_pid; + struct pidfd_info info = { + .mask = PIDFD_INFO_EXIT, + }; + + iov.iov_base = &data; + iov.iov_len = sizeof(data); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + err = recvmsg(fd, &msg, 0); + if (err < 0) { + log_err("recvmsg"); + return 1; + } + + if (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + log_err("recvmsg: truncated"); + return 1; + } + + 
/* send(cfd, "y", sizeof(char), 0) */ + if (data != 'y') { + log_err("recvmsg: data corruption"); + return 1; + } + + if (parse_cmsg(&msg, &res)) { + log_err("CMSG parse: parse_cmsg() failed"); + return 1; + } + + /* + * pidfd from SCM_PIDFD should point to the client_pid. + * Let's read exit information and check if it's what + * we expect to see. + */ + if (ioctl(*res.pidfd, PIDFD_GET_INFO, &info)) { + log_err("%s: ioctl(PIDFD_GET_INFO) failed", __func__); + close(*res.pidfd); + return 1; + } + + if (!(info.mask & PIDFD_INFO_EXIT)) { + log_err("%s: No exit information from ioctl(PIDFD_GET_INFO)", __func__); + close(*res.pidfd); + return 1; + } + + err = WIFEXITED(info.exit_code) ? WEXITSTATUS(info.exit_code) : 1; + if (err != CHILD_EXIT_CODE_OK) { + log_err("%s: wrong exit_code %d != %d", __func__, err, CHILD_EXIT_CODE_OK); + close(*res.pidfd); + return 1; + } + + close(*res.pidfd); return 0; } @@ -291,6 +389,24 @@ static void fill_sockaddr(struct sock_addr *addr, bool abstract) memcpy(sun_path_buf, addr->sock_name, strlen(addr->sock_name)); } +static int sk_enable_cred_pass(int sk) +{ + int on = 0; + + on = 1; + if (setsockopt(sk, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on))) { + log_err("Failed to set SO_PASSCRED"); + return 1; + } + + if (setsockopt(sk, SOL_SOCKET, SO_PASSPIDFD, &on, sizeof(on))) { + log_err("Failed to set SO_PASSPIDFD"); + return 1; + } + + return 0; +} + static void client(FIXTURE_DATA(scm_pidfd) *self, const FIXTURE_VARIANT(scm_pidfd) *variant) { @@ -299,7 +415,6 @@ static void client(FIXTURE_DATA(scm_pidfd) *self, struct ucred peer_cred; int peer_pidfd; pid_t peer_pid; - int on = 0; cfd = socket(AF_UNIX, variant->type, 0); if (cfd < 0) { @@ -322,14 +437,8 @@ static void client(FIXTURE_DATA(scm_pidfd) *self, child_die(); } - on = 1; - if (setsockopt(cfd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on))) { - log_err("Failed to set SO_PASSCRED"); - child_die(); - } - - if (setsockopt(cfd, SOL_SOCKET, SO_PASSPIDFD, &on, sizeof(on))) { - log_err("Failed 
to set SO_PASSPIDFD"); + if (sk_enable_cred_pass(cfd)) { + log_err("sk_enable_cred_pass() failed"); child_die(); } @@ -340,6 +449,12 @@ static void client(FIXTURE_DATA(scm_pidfd) *self, child_die(); } + /* send something to the parent so it can receive SCM_PIDFD too and validate it */ + if (send(cfd, "y", sizeof(char), 0) == -1) { + log_err("Failed to send(cfd, \"y\", sizeof(char), 0)"); + child_die(); + } + /* skip further for SOCK_DGRAM as it's not applicable */ if (variant->type == SOCK_DGRAM) return; @@ -398,7 +513,13 @@ TEST_F(scm_pidfd, test) close(self->server); close(self->startup_pipe[0]); client(self, variant); - exit(0); + + /* + * It's a bit unusual, but in case of success we return non-zero + * exit code (CHILD_EXIT_CODE_OK) and then we expect to read it + * from ioctl(PIDFD_GET_INFO) in cmsg_check_dead(). + */ + exit(CHILD_EXIT_CODE_OK); } close(self->startup_pipe[1]); @@ -421,9 +542,17 @@ TEST_F(scm_pidfd, test) ASSERT_NE(-1, err); } - close(pfd); waitpid(self->client_pid, &child_status, 0); - ASSERT_EQ(0, WIFEXITED(child_status) ? WEXITSTATUS(child_status) : 1); + /* see comment before exit(CHILD_EXIT_CODE_OK) */ + ASSERT_EQ(CHILD_EXIT_CODE_OK, WIFEXITED(child_status) ? 
WEXITSTATUS(child_status) : 1); + + err = sk_enable_cred_pass(pfd); + ASSERT_EQ(0, err); + + err = cmsg_check_dead(pfd, self->client_pid); + ASSERT_EQ(0, err); + + close(pfd); } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index 0406a065deb4..144e7ff65d6a 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -10,3 +10,5 @@ pidfd_file_handle_test pidfd_bind_mount pidfd_info_test pidfd_exec_helper +pidfd_xattr_test +pidfd_setattr_test diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index fcbefc0d77f6..764a8f9ecefa 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,9 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall +CFLAGS += -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -pthread -Wall TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \ - pidfd_file_handle_test pidfd_bind_mount pidfd_info_test + pidfd_file_handle_test pidfd_bind_mount pidfd_info_test \ + pidfd_xattr_test pidfd_setattr_test TEST_GEN_PROGS_EXTENDED := pidfd_exec_helper diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index efd74063126e..cd244d0860ff 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -19,6 +19,10 @@ #include "../kselftest.h" #include "../clone3/clone3_selftests.h" +#ifndef FD_PIDFS_ROOT +#define FD_PIDFS_ROOT -10002 +#endif + #ifndef P_PIDFD #define P_PIDFD 3 #endif @@ -56,7 +60,7 @@ #endif #ifndef PIDFD_SELF_THREAD_GROUP -#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ +#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. 
*/ #endif #ifndef PIDFD_SELF diff --git a/tools/testing/selftests/pidfd/pidfd_file_handle_test.c b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c index 439b9c6c0457..6bd2e9c9565b 100644 --- a/tools/testing/selftests/pidfd/pidfd_file_handle_test.c +++ b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c @@ -500,4 +500,64 @@ TEST_F(file_handle, valid_name_to_handle_at_flags) ASSERT_EQ(close(pidfd), 0); } +/* + * That we decode a file handle without having to pass a pidfd. + */ +TEST_F(file_handle, decode_purely_based_on_file_handle) +{ + int mnt_id; + struct file_handle *fh; + int pidfd = -EBADF; + struct stat st1, st2; + + fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ); + ASSERT_NE(fh, NULL); + memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ); + fh->handle_bytes = MAX_HANDLE_SZ; + + ASSERT_EQ(name_to_handle_at(self->child_pidfd1, "", fh, &mnt_id, AT_EMPTY_PATH), 0); + + ASSERT_EQ(fstat(self->child_pidfd1, &st1), 0); + + pidfd = open_by_handle_at(FD_PIDFS_ROOT, fh, 0); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + pidfd = open_by_handle_at(FD_PIDFS_ROOT, fh, O_CLOEXEC); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + pidfd = open_by_handle_at(FD_PIDFS_ROOT, fh, O_NONBLOCK); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + pidfd = open_by_handle_at(self->pidfd, fh, 0); + ASSERT_GE(pidfd, 0); + + ASSERT_EQ(fstat(pidfd, &st2), 0); + ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); + + ASSERT_EQ(close(pidfd), 0); + + pidfd = open_by_handle_at(-EBADF, fh, 0); + ASSERT_LT(pidfd, 0); + + pidfd = open_by_handle_at(AT_FDCWD, fh, 0); + ASSERT_LT(pidfd, 0); + + free(fh); +} + 
TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/pidfd/pidfd_setattr_test.c b/tools/testing/selftests/pidfd/pidfd_setattr_test.c new file mode 100644 index 000000000000..d7de05edc4b3 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_setattr_test.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pidfd.h" +#include "../kselftest_harness.h" + +FIXTURE(pidfs_setattr) +{ + pid_t child_pid; + int child_pidfd; +}; + +FIXTURE_SETUP(pidfs_setattr) +{ + self->child_pid = create_child(&self->child_pidfd, CLONE_NEWUSER | CLONE_NEWPID); + EXPECT_GE(self->child_pid, 0); + + if (self->child_pid == 0) + _exit(EXIT_SUCCESS); +} + +FIXTURE_TEARDOWN(pidfs_setattr) +{ + sys_waitid(P_PID, self->child_pid, NULL, WEXITED); + EXPECT_EQ(close(self->child_pidfd), 0); +} + +TEST_F(pidfs_setattr, no_chown) +{ + ASSERT_LT(fchown(self->child_pidfd, 1234, 5678), 0); + ASSERT_EQ(errno, EOPNOTSUPP); +} + +TEST_F(pidfs_setattr, no_chmod) +{ + ASSERT_LT(fchmod(self->child_pidfd, 0777), 0); + ASSERT_EQ(errno, EOPNOTSUPP); +} + +TEST_F(pidfs_setattr, no_exec) +{ + char *const argv[] = { NULL }; + char *const envp[] = { NULL }; + + ASSERT_LT(execveat(self->child_pidfd, "", argv, envp, AT_EMPTY_PATH), 0); + ASSERT_EQ(errno, EACCES); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/pidfd/pidfd_xattr_test.c b/tools/testing/selftests/pidfd/pidfd_xattr_test.c new file mode 100644 index 000000000000..5cf7bb0e4bf2 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_xattr_test.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include 
"pidfd.h" +#include "../kselftest_harness.h" + +FIXTURE(pidfs_xattr) +{ + pid_t child_pid; + int child_pidfd; +}; + +FIXTURE_SETUP(pidfs_xattr) +{ + self->child_pid = create_child(&self->child_pidfd, CLONE_NEWUSER | CLONE_NEWPID); + EXPECT_GE(self->child_pid, 0); + + if (self->child_pid == 0) + _exit(EXIT_SUCCESS); +} + +FIXTURE_TEARDOWN(pidfs_xattr) +{ + sys_waitid(P_PID, self->child_pid, NULL, WEXITED); +} + +TEST_F(pidfs_xattr, set_get_list_xattr_multiple) +{ + int ret, i; + char xattr_name[32]; + char xattr_value[32]; + char buf[32]; + const int num_xattrs = 10; + char list[PATH_MAX] = {}; + + for (i = 0; i < num_xattrs; i++) { + snprintf(xattr_name, sizeof(xattr_name), "trusted.testattr%d", i); + snprintf(xattr_value, sizeof(xattr_value), "testvalue%d", i); + ret = fsetxattr(self->child_pidfd, xattr_name, xattr_value, strlen(xattr_value), 0); + ASSERT_EQ(ret, 0); + } + + for (i = 0; i < num_xattrs; i++) { + snprintf(xattr_name, sizeof(xattr_name), "trusted.testattr%d", i); + snprintf(xattr_value, sizeof(xattr_value), "testvalue%d", i); + memset(buf, 0, sizeof(buf)); + ret = fgetxattr(self->child_pidfd, xattr_name, buf, sizeof(buf)); + ASSERT_EQ(ret, strlen(xattr_value)); + ASSERT_EQ(strcmp(buf, xattr_value), 0); + } + + ret = flistxattr(self->child_pidfd, list, sizeof(list)); + ASSERT_GT(ret, 0); + for (i = 0; i < num_xattrs; i++) { + snprintf(xattr_name, sizeof(xattr_name), "trusted.testattr%d", i); + bool found = false; + for (char *it = list; it < list + ret; it += strlen(it) + 1) { + if (strcmp(it, xattr_name)) + continue; + found = true; + break; + } + ASSERT_TRUE(found); + } + + for (i = 0; i < num_xattrs; i++) { + snprintf(xattr_name, sizeof(xattr_name), "trusted.testattr%d", i); + ret = fremovexattr(self->child_pidfd, xattr_name); + ASSERT_EQ(ret, 0); + + ret = fgetxattr(self->child_pidfd, xattr_name, buf, sizeof(buf)); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ENODATA); + } +} + +TEST_F(pidfs_xattr, set_get_list_xattr_persistent) +{ + int ret; + char 
buf[32]; + char list[PATH_MAX] = {}; + + ret = fsetxattr(self->child_pidfd, "trusted.persistent", "persistent value", strlen("persistent value"), 0); + ASSERT_EQ(ret, 0); + + memset(buf, 0, sizeof(buf)); + ret = fgetxattr(self->child_pidfd, "trusted.persistent", buf, sizeof(buf)); + ASSERT_EQ(ret, strlen("persistent value")); + ASSERT_EQ(strcmp(buf, "persistent value"), 0); + + ret = flistxattr(self->child_pidfd, list, sizeof(list)); + ASSERT_GT(ret, 0); + ASSERT_EQ(strcmp(list, "trusted.persistent"), 0); + + ASSERT_EQ(close(self->child_pidfd), 0); + self->child_pidfd = -EBADF; + sleep(2); + + self->child_pidfd = sys_pidfd_open(self->child_pid, 0); + ASSERT_GE(self->child_pidfd, 0); + + memset(buf, 0, sizeof(buf)); + ret = fgetxattr(self->child_pidfd, "trusted.persistent", buf, sizeof(buf)); + ASSERT_EQ(ret, strlen("persistent value")); + ASSERT_EQ(strcmp(buf, "persistent value"), 0); + + ret = flistxattr(self->child_pidfd, list, sizeof(list)); + ASSERT_GT(ret, 0); + ASSERT_EQ(strcmp(list, "trusted.persistent"), 0); +} + +TEST_HARNESS_MAIN