linux/fs/backing-file.c
Linus Torvalds 7031769e10 vfs-6.17-rc1.mmap_prepare
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaINCgQAKCRCRxhvAZXjc
 os+nAP9LFHUwWO6EBzHJJGEVjJvvzsbzqeYrRFamYiMc5ulPJwD+KW4RIgJa/MWO
 pcYE40CacaekD8rFWwYUyszpgmv6ewc=
 =wCwp
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.17-rc1.mmap_prepare' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull mmap_prepare updates from Christian Brauner:
 "Last cycle we introduce f_op->mmap_prepare() in c84bf6dd2b ("mm:
  introduce new .mmap_prepare() file callback").

  This is preferred to the existing f_op->mmap() hook as it does require
  a VMA to be established yet, thus allowing the mmap logic to invoke
  this hook far, far earlier, prior to inserting a VMA into the virtual
  address space, or performing any other heavy handed operations.

  This allows for much simpler unwinding on error, and for there to be a
  single attempt at merging a VMA rather than having to possibly
  reattempt a merge based on potentially altered VMA state.

  Far more importantly, it prevents inappropriate manipulation of
  incompletely initialised VMA state, which is something that has been
  the cause of bugs and complexity in the past.

  The intent is to gradually deprecate f_op->mmap, and in that vein this
  series coverts the majority of file systems to using f_op->mmap_prepare.

  Prerequisite steps are taken - firstly ensuring all checks for mmap
  capabilities use the file_has_valid_mmap_hooks() helper rather than
  directly checking for f_op->mmap (which is now not a valid check) and
  secondly updating daxdev_mapping_supported() to not require a VMA
  parameter to allow ext4 and xfs to be converted.

  Commit bb666b7c27 ("mm: add mmap_prepare() compatibility layer for
  nested file systems") handles the nasty edge-case of nested file
  systems like overlayfs, which introduces a compatibility shim to allow
  f_op->mmap_prepare() to be invoked from an f_op->mmap() callback.

  This allows for nested filesystems to continue to function correctly
  with all file systems regardless of which callback is used. Once we
  finally convert all file systems, this shim can be removed.

  As a result, ecryptfs, fuse, and overlayfs remain unaltered so they
  can nest all other file systems.

  We additionally do not update resctl - as this requires an update to
  remap_pfn_range() (or an alternative to it) which we defer to a later
  series, equally we do not update cramfs which needs a mixed mapping
  insertion with the same issue, nor do we update procfs, hugetlbfs,
  syfs or kernfs all of which require VMAs for internal state and hooks.
  We shall return to all of these later"

* tag 'vfs-6.17-rc1.mmap_prepare' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  doc: update porting, vfs documentation to describe mmap_prepare()
  fs: replace mmap hook with .mmap_prepare for simple mappings
  fs: convert most other generic_file_*mmap() users to .mmap_prepare()
  fs: convert simple use of generic_file_*_mmap() to .mmap_prepare()
  mm/filemap: introduce generic_file_*_mmap_prepare() helpers
  fs/xfs: transition from deprecated .mmap hook to .mmap_prepare
  fs/ext4: transition from deprecated .mmap hook to .mmap_prepare
  fs/dax: make it possible to check dev dax support without a VMA
  fs: consistently use can_mmap_file() helper
  mm/nommu: use file_has_valid_mmap_hooks() helper
  mm: rename call_mmap/mmap_prepare to vfs_mmap/mmap_prepare
2025-07-28 13:43:25 -07:00

360 lines
8.4 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Common helpers for stackable filesystems and backing files.
*
* Forked from fs/overlayfs/file.c.
*
* Copyright (C) 2017 Red Hat, Inc.
* Copyright (C) 2023 CTERA Networks.
*/
#include <linux/fs.h>
#include <linux/backing-file.h>
#include <linux/splice.h>
#include <linux/mm.h>
#include "internal.h"
/**
* backing_file_open - open a backing file for kernel internal use
* @user_path: path that the user reuqested to open
* @flags: open flags
* @real_path: path of the backing file
* @cred: credentials for open
*
* Open a backing file for a stackable filesystem (e.g., overlayfs).
* @user_path may be on the stackable filesystem and @real_path on the
* underlying filesystem. In this case, we want to be able to return the
* @user_path of the stackable filesystem. This is done by embedding the
* returned file into a container structure that also stores the stacked
* file's path, which can be retrieved using backing_file_user_path().
*/
struct file *backing_file_open(const struct path *user_path, int flags,
const struct path *real_path,
const struct cred *cred)
{
struct file *f;
int error;
f = alloc_empty_backing_file(flags, cred);
if (IS_ERR(f))
return f;
path_get(user_path);
backing_file_set_user_path(f, user_path);
error = vfs_open(real_path, f);
if (error) {
fput(f);
f = ERR_PTR(error);
}
return f;
}
EXPORT_SYMBOL_GPL(backing_file_open);
struct file *backing_tmpfile_open(const struct path *user_path, int flags,
const struct path *real_parentpath,
umode_t mode, const struct cred *cred)
{
struct mnt_idmap *real_idmap = mnt_idmap(real_parentpath->mnt);
struct file *f;
int error;
f = alloc_empty_backing_file(flags, cred);
if (IS_ERR(f))
return f;
path_get(user_path);
backing_file_set_user_path(f, user_path);
error = vfs_tmpfile(real_idmap, real_parentpath, f, mode);
if (error) {
fput(f);
f = ERR_PTR(error);
}
return f;
}
EXPORT_SYMBOL(backing_tmpfile_open);
struct backing_aio {
struct kiocb iocb;
refcount_t ref;
struct kiocb *orig_iocb;
/* used for aio completion */
void (*end_write)(struct kiocb *iocb, ssize_t);
struct work_struct work;
long res;
};
static struct kmem_cache *backing_aio_cachep;
#define BACKING_IOCB_MASK \
(IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
static rwf_t iocb_to_rw_flags(int flags)
{
return (__force rwf_t)(flags & BACKING_IOCB_MASK);
}
static void backing_aio_put(struct backing_aio *aio)
{
if (refcount_dec_and_test(&aio->ref)) {
fput(aio->iocb.ki_filp);
kmem_cache_free(backing_aio_cachep, aio);
}
}
static void backing_aio_cleanup(struct backing_aio *aio, long res)
{
struct kiocb *iocb = &aio->iocb;
struct kiocb *orig_iocb = aio->orig_iocb;
orig_iocb->ki_pos = iocb->ki_pos;
if (aio->end_write)
aio->end_write(orig_iocb, res);
backing_aio_put(aio);
}
static void backing_aio_rw_complete(struct kiocb *iocb, long res)
{
struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb);
struct kiocb *orig_iocb = aio->orig_iocb;
if (iocb->ki_flags & IOCB_WRITE)
kiocb_end_write(iocb);
backing_aio_cleanup(aio, res);
orig_iocb->ki_complete(orig_iocb, res);
}
static void backing_aio_complete_work(struct work_struct *work)
{
struct backing_aio *aio = container_of(work, struct backing_aio, work);
backing_aio_rw_complete(&aio->iocb, aio->res);
}
static void backing_aio_queue_completion(struct kiocb *iocb, long res)
{
struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb);
/*
* Punt to a work queue to serialize updates of mtime/size.
*/
aio->res = res;
INIT_WORK(&aio->work, backing_aio_complete_work);
queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq,
&aio->work);
}
static int backing_aio_init_wq(struct kiocb *iocb)
{
struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;
if (sb->s_dio_done_wq)
return 0;
return sb_init_dio_done_wq(sb);
}
ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
struct kiocb *iocb, int flags,
struct backing_file_ctx *ctx)
{
struct backing_aio *aio = NULL;
const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
return -EIO;
if (!iov_iter_count(iter))
return 0;
if (iocb->ki_flags & IOCB_DIRECT &&
!(file->f_mode & FMODE_CAN_ODIRECT))
return -EINVAL;
old_cred = override_creds(ctx->cred);
if (is_sync_kiocb(iocb)) {
rwf_t rwf = iocb_to_rw_flags(flags);
ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
} else {
ret = -ENOMEM;
aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
if (!aio)
goto out;
aio->orig_iocb = iocb;
kiocb_clone(&aio->iocb, iocb, get_file(file));
aio->iocb.ki_complete = backing_aio_rw_complete;
refcount_set(&aio->ref, 2);
ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
backing_aio_put(aio);
if (ret != -EIOCBQUEUED)
backing_aio_cleanup(aio, ret);
}
out:
revert_creds(old_cred);
if (ctx->accessed)
ctx->accessed(iocb->ki_filp);
return ret;
}
EXPORT_SYMBOL_GPL(backing_file_read_iter);
ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
struct kiocb *iocb, int flags,
struct backing_file_ctx *ctx)
{
const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
return -EIO;
if (!iov_iter_count(iter))
return 0;
ret = file_remove_privs(iocb->ki_filp);
if (ret)
return ret;
if (iocb->ki_flags & IOCB_DIRECT &&
!(file->f_mode & FMODE_CAN_ODIRECT))
return -EINVAL;
/*
* Stacked filesystems don't support deferred completions, don't copy
* this property in case it is set by the issuer.
*/
flags &= ~IOCB_DIO_CALLER_COMP;
old_cred = override_creds(ctx->cred);
if (is_sync_kiocb(iocb)) {
rwf_t rwf = iocb_to_rw_flags(flags);
ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
if (ctx->end_write)
ctx->end_write(iocb, ret);
} else {
struct backing_aio *aio;
ret = backing_aio_init_wq(iocb);
if (ret)
goto out;
ret = -ENOMEM;
aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
if (!aio)
goto out;
aio->orig_iocb = iocb;
aio->end_write = ctx->end_write;
kiocb_clone(&aio->iocb, iocb, get_file(file));
aio->iocb.ki_flags = flags;
aio->iocb.ki_complete = backing_aio_queue_completion;
refcount_set(&aio->ref, 2);
ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
backing_aio_put(aio);
if (ret != -EIOCBQUEUED)
backing_aio_cleanup(aio, ret);
}
out:
revert_creds(old_cred);
return ret;
}
EXPORT_SYMBOL_GPL(backing_file_write_iter);
ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags,
struct backing_file_ctx *ctx)
{
const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
return -EIO;
old_cred = override_creds(ctx->cred);
ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags);
revert_creds(old_cred);
if (ctx->accessed)
ctx->accessed(iocb->ki_filp);
return ret;
}
EXPORT_SYMBOL_GPL(backing_file_splice_read);
ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
struct file *out, struct kiocb *iocb,
size_t len, unsigned int flags,
struct backing_file_ctx *ctx)
{
const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING)))
return -EIO;
if (!out->f_op->splice_write)
return -EINVAL;
ret = file_remove_privs(iocb->ki_filp);
if (ret)
return ret;
old_cred = override_creds(ctx->cred);
file_start_write(out);
ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags);
file_end_write(out);
revert_creds(old_cred);
if (ctx->end_write)
ctx->end_write(iocb, ret);
return ret;
}
EXPORT_SYMBOL_GPL(backing_file_splice_write);
int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
struct backing_file_ctx *ctx)
{
const struct cred *old_cred;
struct file *user_file = vma->vm_file;
int ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
return -EIO;
if (!can_mmap_file(file))
return -ENODEV;
vma_set_file(vma, file);
old_cred = override_creds(ctx->cred);
ret = vfs_mmap(vma->vm_file, vma);
revert_creds(old_cred);
if (ctx->accessed)
ctx->accessed(user_file);
return ret;
}
EXPORT_SYMBOL_GPL(backing_file_mmap);
static int __init backing_aio_init(void)
{
backing_aio_cachep = KMEM_CACHE(backing_aio, SLAB_HWCACHE_ALIGN);
if (!backing_aio_cachep)
return -ENOMEM;
return 0;
}
fs_initcall(backing_aio_init);