for-6.16/block-20250523

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmgwnGYQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpq9aD/4iqOts77xhWWLrOJWkkhOcV5rREeyppq8X
 MKYul9S4cc4Uin9Xou9a+nab31QBQEk3nsN3kX9o3yAXvkh6yUm36HD8qYNW/46q
 IUkwRQQJ0COyTnexMZQNTbZPQDIYcenXmQxOcrEJ5jC1Jcz0sOKHsgekL+ab3kCy
 fLnuz2ozvjGDMala/NmE8fN5qSlj4qQABHgbamwlwfo4aWu07cwfqn5G/FCYJgDO
 xUvsnTVclom2g4G+7eSSvGQI1QyAxl5QpviPnj/TEgfFBFnhbCSoBTEY6ecqhlfW
 6u59MF/Uw8E+weiuGY4L87kDtBhjQs3UMSLxCuwH7MxXb25ff7qB4AIkcFD0kKFH
 3V5NtwqlU7aQT0xOjGxaHhfPwjLD+FVss4ARmuHS09/Kn8egOW9yROPyetnuH84R
 Oz0Ctnt1IPLFjvGeg3+rt9fjjS9jWOXLITb9Q6nX9gnCt7orCwIYke8YCpmnJyhn
 i+fV4CWYIQBBRKxIT0E/GhJxZOmL0JKpomnbpP2dH8npemnsTCuvtfdrK9gfhH2X
 chBVqCPY8MNU5zKfzdEiavPqcm9392lMzOoOXW2pSC1eAKqnAQ86ZT3r7rLntqE8
 75LxHcvaQIsnpyG+YuJVHvoiJ83TbqZNpyHwNaQTYhDmdYpp2d/wTtTQywX4DuXb
 Y6NDJw5+kQ==
 =1PNK
 -----END PGP SIGNATURE-----

Merge tag 'for-6.16/block-20250523' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

 - ublk updates:
      - Add support for updating the size of a ublk instance
      - Zero-copy improvements
      - Auto-registering of buffers for zero-copy
      - Series simplifying and improving GET_DATA and request lookup
      - Series adding quiesce support
      - Lots of selftests additions
      - Various cleanups

 - NVMe updates via Christoph:
      - add per-node DMA pools and use them for PRP/SGL allocations
        (Caleb Sander Mateos, Keith Busch)
      - nvme-fcloop refcounting fixes (Daniel Wagner)
      - support delayed removal of the multipath node and optionally
        support the multipath node for private namespaces (Nilay Shroff)
      - support shared CQs in the PCI endpoint target code (Wilfred
        Mallawa)
      - support admin-queue only authentication (Hannes Reinecke)
      - use the crc32c library instead of the crypto API (Eric Biggers)
      - misc cleanups (Christoph Hellwig, Marcelo Moreira, Hannes
        Reinecke, Leon Romanovsky, Gustavo A. R. Silva)

 - MD updates via Yu:
      - Fix normal IO being starved by sync IO, found by running mkfs on
        a newly created large raid5, along with some cleanup patches for
        the bdev inflight counters

 - Clean up brd, getting rid of atomic kmaps and bvec poking

 - Add a loop driver specifically for zoned IO testing

 - Use a static key to eliminate blk-rq-qos calls when no rq-qos policy
   is enabled

 - Improve hctx locking for when a plug has IO for multiple queues
   pending

 - Remove block layer bouncing support, which in turn means we can
   remove the per-node bounce stat as well

 - Improve blk-throttle support

 - Improve delay support for blk-throttle

 - Improve brd discard support

 - Unify IO scheduler switching. This should also fix a bunch of lockdep
   warnings we've been seeing, after enabling lockdep support for queue
   freezing/unfreezing

 - Add support for block write streams via FDP (flexible data placement)
   on NVMe

 - Add a bunch of block helpers, facilitating the removal of a bunch of
   duplicated boilerplate code

 - Remove the obsolete BLK_MQ_PCI and BLK_MQ_VIRTIO Kconfig options

 - Add atomic/untorn write support to blktrace

 - Various little cleanups and fixes

* tag 'for-6.16/block-20250523' of git://git.kernel.dk/linux: (186 commits)
  selftests: ublk: add test for UBLK_F_QUIESCE
  ublk: add feature UBLK_F_QUIESCE
  selftests: ublk: add test case for UBLK_U_CMD_UPDATE_SIZE
  traceevent/block: Add REQ_ATOMIC flag to block trace events
  ublk: run auto buf unregisgering in same io_ring_ctx with registering
  io_uring: add helper io_uring_cmd_ctx_handle()
  ublk: remove io argument from ublk_auto_buf_reg_fallback()
  ublk: handle ublk_set_auto_buf_reg() failure correctly in ublk_fetch()
  selftests: ublk: add test for covering UBLK_AUTO_BUF_REG_FALLBACK
  selftests: ublk: support UBLK_F_AUTO_BUF_REG
  ublk: support UBLK_AUTO_BUF_REG_FALLBACK
  ublk: register buffer to local io_uring with provided buf index via UBLK_F_AUTO_BUF_REG
  ublk: prepare for supporting to register request buffer automatically
  ublk: convert to refcount_t
  selftests: ublk: make IO & device removal test more stressful
  nvme: rename nvme_mpath_shutdown_disk to nvme_mpath_remove_disk
  nvme: introduce multipath_always_on module param
  nvme-multipath: introduce delayed removal of the multipath head node
  nvme-pci: derive and better document max segments limits
  nvme-pci: use struct_size for allocation struct nvme_dev
  ...
Commit 6f59de9bc0 by Linus Torvalds, 2025-05-26 11:39:36 -07:00
129 changed files with 5512 additions and 2491 deletions

@ -547,6 +547,21 @@ Description:
[RO] Maximum size in bytes of a single element in a DMA
scatter/gather list.
What: /sys/block/<disk>/queue/max_write_streams
Date: November 2024
Contact: linux-block@vger.kernel.org
Description:
[RO] Maximum number of write streams supported, 0 if not
supported. If supported, valid values are 1 through
max_write_streams, inclusive.
What: /sys/block/<disk>/queue/write_stream_granularity
Date: November 2024
Contact: linux-block@vger.kernel.org
Description:
[RO] Granularity of a write stream in bytes. The granularity
of a write stream is the size that should be discarded or
overwritten together to avoid write amplification in the device.
What: /sys/block/<disk>/queue/max_segments
Date: March 2010

@ -11,6 +11,7 @@ Block Devices
nbd
paride
ramdisk
zoned_loop
zram
drbd/index

@ -0,0 +1,169 @@
.. SPDX-License-Identifier: GPL-2.0
=======================
Zoned Loop Block Device
=======================
.. Contents:
1) Overview
2) Creating a Zoned Device
3) Deleting a Zoned Device
4) Example
1) Overview
-----------
The zoned loop block device driver (zloop) allows a user to create a zoned block
device using one regular file per zone as backing storage. This driver does not
directly control any hardware and uses read, write and truncate operations to
regular files of a file system to emulate a zoned block device.
Using zloop, zoned block devices with a configurable capacity, zone size and
number of conventional zones can be created. The storage for each zone of the
device is implemented using a regular file with a maximum size equal to the zone
size. The size of a file backing a conventional zone is always equal to the zone
size. The size of a file backing a sequential zone indicates the amount of data
sequentially written to the file, that is, the size of the file directly
indicates the position of the write pointer of the zone.
When resetting a sequential zone, its backing file size is truncated to zero.
Conversely, for a zone finish operation, the backing file is truncated to the
zone size. With this, the maximum capacity of a zloop zoned block device can be
configured to be larger than the storage space available on the backing file
system. Of course, for such a configuration, writing more data than the storage
space available on the backing file system will result in write errors.
The zoned loop block device driver implements a complete zone transition state
machine. That is, zones can be empty, implicitly opened, explicitly opened,
closed or full. The current implementation does not support any limits on the
maximum number of open and active zones.
No user tools are necessary to create and delete zloop devices.
2) Creating a Zoned Device
--------------------------
Once the zloop module is loaded (or if zloop is compiled into the kernel), the
character device file /dev/zloop-control can be used to add a zloop device.
This is done by writing an "add" command directly to the /dev/zloop-control
device::
$ modprobe zloop
$ ls -l /dev/zloop*
crw-------. 1 root root 10, 123 Jan 6 19:18 /dev/zloop-control
$ mkdir -p <base directory>/<device ID>
$ echo "add [options]" > /dev/zloop-control
The options available for the add command can be listed by reading the
/dev/zloop-control device::
$ cat /dev/zloop-control
add id=%d,capacity_mb=%u,zone_size_mb=%u,zone_capacity_mb=%u,conv_zones=%u,base_dir=%s,nr_queues=%u,queue_depth=%u,buffered_io
remove id=%d
In more detail, the options that can be used with the "add" command are as
follows.
================ ===========================================================
id               Device number (the X in /dev/zloopX).
                 Default: automatically assigned.
capacity_mb      Device total capacity in MiB. This is always rounded up to
                 the nearest higher multiple of the zone size.
                 Default: 16384 MiB (16 GiB).
zone_size_mb     Device zone size in MiB. Default: 256 MiB.
zone_capacity_mb Device zone capacity (must always be equal to or lower than
                 the zone size). Default: zone size.
conv_zones       Total number of conventional zones starting from sector 0.
                 Default: 8.
base_dir         Path to the base directory in which to create the directory
                 containing the zone files of the device.
                 Default: /var/local/zloop.
                 The device directory containing the zone files is always
                 named with the device ID. E.g. the default zone file
                 directory for /dev/zloop0 is /var/local/zloop/0.
nr_queues        Number of I/O queues of the zoned block device. This value
                 is always capped by the number of online CPUs.
                 Default: 1.
queue_depth      Maximum I/O queue depth per I/O queue. Default: 64.
buffered_io      Do buffered IOs instead of direct IOs (default: false).
================ ===========================================================
3) Deleting a Zoned Device
--------------------------
Deleting an unused zoned loop block device is done by issuing the "remove"
command to /dev/zloop-control, specifying the ID of the device to remove::
$ echo "remove id=X" > /dev/zloop-control
The remove command does not have any option.
A zoned device that was removed can be re-added without any change to the state
of the device zones: the device zones are restored to their last state before
the device was removed. Re-adding a zoned device after it was removed must
always be done using the same configuration as when the device was first added.
If a zone configuration change is detected, an error will be returned and the
zoned device will not be created.
To fully delete a zoned device, after executing the remove operation, the device
base directory containing the backing files of the device zones must be deleted.
4) Example
----------
The following sequence of commands creates a 2 GiB zoned device with zones of
64 MiB and a zone capacity of 63 MiB::
$ modprobe zloop
$ mkdir -p /var/local/zloop/0
$ echo "add capacity_mb=2048,zone_size_mb=64,zone_capacity_mb=63" > /dev/zloop-control
For the device created (/dev/zloop0), the zone backing files are all created
under the default base directory (/var/local/zloop)::
$ ls -l /var/local/zloop/0
total 0
-rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000000
-rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000001
-rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000002
-rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000003
-rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000004
-rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000005
-rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000006
-rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000007
-rw-------. 1 root root 0 Jan 6 22:23 seq-000008
-rw-------. 1 root root 0 Jan 6 22:23 seq-000009
...
The zoned device created (/dev/zloop0) can then be used normally::
$ lsblk -z
NAME ZONED ZONE-SZ ZONE-NR ZONE-AMAX ZONE-OMAX ZONE-APP ZONE-WGRAN
zloop0 host-managed 64M 32 0 0 1M 4K
$ blkzone report /dev/zloop0
start: 0x000000000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
start: 0x000020000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
start: 0x000040000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
start: 0x000060000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
start: 0x000080000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
start: 0x0000a0000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
start: 0x0000c0000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
start: 0x0000e0000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
start: 0x000100000, len 0x020000, cap 0x01f800, wptr 0x000000 reset:0 non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
start: 0x000120000, len 0x020000, cap 0x01f800, wptr 0x000000 reset:0 non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
...
Deleting this device is done using the command::
$ echo "remove id=0" > /dev/zloop-control
The removed device can be re-added using the same "add" command as when the
device was first created. To fully delete a zoned device, its backing files
should also be deleted after executing the remove command::
$ rm -r /var/local/zloop/0

@ -26894,6 +26894,14 @@ L: linux-kernel@vger.kernel.org
S: Maintained
F: arch/x86/kernel/cpu/zhaoxin.c
ZONED LOOP DEVICE
M: Damien Le Moal <dlemoal@kernel.org>
R: Christoph Hellwig <hch@lst.de>
L: linux-block@vger.kernel.org
S: Maintained
F: Documentation/admin-guide/blockdev/zoned_loop.rst
F: drivers/block/zloop.c
ZONEFS FILESYSTEM
M: Damien Le Moal <dlemoal@kernel.org>
M: Naohiro Aota <naohiro.aota@wdc.com>

@ -13,7 +13,6 @@ CONFIG_MIPS_CMDLINE_DTB_EXTEND=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
# CONFIG_BOUNCE is not set
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y

@ -211,14 +211,6 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
source "block/partitions/Kconfig"
config BLK_MQ_PCI
def_bool PCI
config BLK_MQ_VIRTIO
bool
depends on VIRTIO
default y
config BLK_PM
def_bool PM

@ -5,13 +5,12 @@
obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-merge.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \
blk-mq-tag.o blk-mq-dma.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
disk-events.o blk-ia-ranges.o early-lookup.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o

@ -7210,8 +7210,8 @@ static void bfq_exit_queue(struct elevator_queue *e)
#endif
blk_stat_disable_accounting(bfqd->queue);
clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags);
wbt_enable_default(bfqd->queue->disk);
blk_queue_flag_clear(QUEUE_FLAG_DISABLE_WBT_DEF, bfqd->queue);
set_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, &e->flags);
kfree(bfqd);
}
@ -7397,7 +7397,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags);
blk_queue_flag_set(QUEUE_FLAG_DISABLE_WBT_DEF, q);
wbt_disable_default(q->disk);
blk_stat_enable_accounting(q);

@ -127,10 +127,8 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
if (bip->bip_vcnt > 0) {
struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1];
bool same_page = false;
if (bvec_try_merge_hw_page(q, bv, page, len, offset,
&same_page)) {
if (bvec_try_merge_hw_page(q, bv, page, len, offset)) {
bip->bip_iter.bi_size += len;
return len;
}

@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
bio->bi_flags = 0;
bio->bi_ioprio = 0;
bio->bi_write_hint = 0;
bio->bi_write_stream = 0;
bio->bi_status = 0;
bio->bi_iter.bi_sector = 0;
bio->bi_iter.bi_size = 0;
@ -827,6 +828,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
bio_set_flag(bio, BIO_CLONED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter = bio_src->bi_iter;
if (bio->bi_bdev) {
@ -918,7 +920,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
}
static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
unsigned int len, unsigned int off, bool *same_page)
unsigned int len, unsigned int off)
{
size_t bv_end = bv->bv_offset + bv->bv_len;
phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
@ -931,9 +933,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
return false;
*same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) &
PAGE_MASK));
if (!*same_page) {
if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) {
if (IS_ENABLED(CONFIG_KMSAN))
return false;
if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE)
@ -953,8 +953,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
* helpers to split. Hopefully this will go away soon.
*/
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
struct page *page, unsigned len, unsigned offset,
bool *same_page)
struct page *page, unsigned len, unsigned offset)
{
unsigned long mask = queue_segment_boundary(q);
phys_addr_t addr1 = bvec_phys(bv);
@ -964,7 +963,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
return false;
if (len > queue_max_segment_size(q) - bv->bv_len)
return false;
return bvec_try_merge_page(bv, page, len, offset, same_page);
return bvec_try_merge_page(bv, page, len, offset);
}
/**
@ -989,6 +988,22 @@ void __bio_add_page(struct bio *bio, struct page *page,
}
EXPORT_SYMBOL_GPL(__bio_add_page);
/**
* bio_add_virt_nofail - add data in the direct kernel mapping to a bio
* @bio: destination bio
* @vaddr: data to add
* @len: length of the data to add, may cross pages
*
* Add the data at @vaddr to @bio. The caller must have ensured a segment
* is available for the added data. No merging into an existing segment
* will be performed.
*/
void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len)
{
__bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr));
}
EXPORT_SYMBOL_GPL(bio_add_virt_nofail);
/**
* bio_add_page - attempt to add page(s) to bio
* @bio: destination bio
@ -1002,8 +1017,6 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
int bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
bool same_page = false;
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
if (bio->bi_iter.bi_size > UINT_MAX - len)
@ -1011,7 +1024,7 @@ int bio_add_page(struct bio *bio, struct page *page,
if (bio->bi_vcnt > 0 &&
bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
page, len, offset, &same_page)) {
page, len, offset)) {
bio->bi_iter.bi_size += len;
return len;
}
@ -1058,6 +1071,61 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
}
EXPORT_SYMBOL(bio_add_folio);
/**
* bio_add_vmalloc_chunk - add a vmalloc chunk to a bio
* @bio: destination bio
* @vaddr: vmalloc address to add
* @len: total length in bytes of the data to add
*
* Add data starting at @vaddr to @bio and return how many bytes were added.
* This may be less than the amount originally asked. Returns 0 if no data
* could be added to @bio.
*
* This helper calls flush_kernel_vmap_range() for the range added. For reads
* the caller still needs to manually call invalidate_kernel_vmap_range() in
* the completion handler.
*/
unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len)
{
unsigned int offset = offset_in_page(vaddr);
len = min(len, PAGE_SIZE - offset);
if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len)
return 0;
if (op_is_write(bio_op(bio)))
flush_kernel_vmap_range(vaddr, len);
return len;
}
EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk);
/**
* bio_add_vmalloc - add a vmalloc region to a bio
* @bio: destination bio
* @vaddr: vmalloc address to add
* @len: total length in bytes of the data to add
*
* Add data starting at @vaddr to @bio. Return %true on success or %false if
* @bio does not have enough space for the payload.
*
* This helper calls flush_kernel_vmap_range() for the range added. For reads
* the caller still needs to manually call invalidate_kernel_vmap_range() in
* the completion handler.
*/
bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len)
{
do {
unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len);
if (!added)
return false;
vaddr += added;
len -= added;
} while (len);
return true;
}
EXPORT_SYMBOL_GPL(bio_add_vmalloc);
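
As an illustration only (not part of this diff): a minimal sketch of a
hypothetical caller of the new bio_add_vmalloc() helper, sizing the bio with
bio_add_max_vecs() the same way bio_map_kern() does further down in this
series; all example_* names are made up.

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical example: synchronously write a vmalloc'ed buffer to @bdev. */
static int example_write_vmalloc(struct block_device *bdev, sector_t sector,
                                 void *vbuf, unsigned int len)
{
    struct bio *bio;
    int ret;

    /* One bio vector per page the vmalloc region may span. */
    bio = bio_alloc(bdev, bio_add_max_vecs(vbuf, len), REQ_OP_WRITE,
                    GFP_KERNEL);
    if (!bio)
        return -ENOMEM;
    bio->bi_iter.bi_sector = sector;

    /* Adds the region page by page; flushes the vmap range for writes. */
    if (!bio_add_vmalloc(bio, vbuf, len)) {
        bio_put(bio);
        return -EINVAL;
    }

    /*
     * For a REQ_OP_READ bio the completion path would also need to call
     * invalidate_kernel_vmap_range(), as the kernel-doc above notes.
     */
    ret = submit_bio_wait(bio);
    bio_put(bio);
    return ret;
}
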
void __bio_release_pages(struct bio *bio, bool mark_dirty)
{
struct folio_iter fi;
@ -1088,27 +1156,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}
static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len,
size_t offset)
{
bool same_page = false;
if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len))
return -EIO;
if (bio->bi_vcnt > 0 &&
bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
folio_page(folio, 0), len, offset,
&same_page)) {
bio->bi_iter.bi_size += len;
if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
unpin_user_folio(folio, 1);
return 0;
}
bio_add_folio_nofail(bio, folio, len, offset);
return 0;
}
static unsigned int get_contig_folio_len(unsigned int *num_pages,
struct page **pages, unsigned int i,
struct folio *folio, size_t left,
@ -1203,6 +1250,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
struct page *page = pages[i];
struct folio *folio = page_folio(page);
unsigned int old_vcnt = bio->bi_vcnt;
folio_offset = ((size_t)folio_page_idx(folio, page) <<
PAGE_SHIFT) + offset;
@ -1215,7 +1263,23 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
len = get_contig_folio_len(&num_pages, pages, i,
folio, left, offset);
bio_iov_add_folio(bio, folio, len, folio_offset);
if (!bio_add_folio(bio, folio, len, folio_offset)) {
WARN_ON_ONCE(1);
ret = -EINVAL;
goto out;
}
if (bio_flagged(bio, BIO_PAGE_PINNED)) {
/*
* We're adding another fragment of a page that already
* was part of the last segment. Undo our pin as the
* page was pinned when an earlier fragment of it was
* added to the bio and __bio_release_pages expects a
* single pin per page.
*/
if (offset && bio->bi_vcnt == old_vcnt)
unpin_user_folio(folio, 1);
}
offset = 0;
}
@ -1301,6 +1365,36 @@ int submit_bio_wait(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio_wait);
/**
* bdev_rw_virt - synchronously read into / write from kernel mapping
* @bdev: block device to access
* @sector: sector to access
* @data: data to read/write
* @len: length in bytes to read/write
* @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE)
*
* Performs synchronous I/O to @bdev for @data/@len. @data must be in
* the kernel direct mapping and not a vmalloc address.
*/
int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
size_t len, enum req_op op)
{
struct bio_vec bv;
struct bio bio;
int error;
if (WARN_ON_ONCE(is_vmalloc_addr(data)))
return -EIO;
bio_init(&bio, bdev, &bv, 1, op);
bio.bi_iter.bi_sector = sector;
bio_add_virt_nofail(&bio, data, len);
error = submit_bio_wait(&bio);
bio_uninit(&bio);
return error;
}
EXPORT_SYMBOL_GPL(bdev_rw_virt);
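
As an illustration only (not part of this diff): a minimal sketch of a
hypothetical caller of bdev_rw_virt(), assuming a kmalloc'ed (direct-mapped)
buffer and an arbitrary 4 KiB read from sector 0; the example_* name is made
up.

#include <linux/blkdev.h>
#include <linux/sizes.h>
#include <linux/slab.h>

/* Hypothetical example: synchronously read 4 KiB from the start of @bdev. */
static int example_read_header(struct block_device *bdev)
{
    void *buf;
    int ret;

    buf = kmalloc(SZ_4K, GFP_KERNEL);   /* direct mapping, not vmalloc */
    if (!buf)
        return -ENOMEM;

    ret = bdev_rw_virt(bdev, 0, buf, SZ_4K, REQ_OP_READ);
    if (ret)
        pr_err("example: header read failed: %d\n", ret);

    kfree(buf);
    return ret;
}
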
static void bio_wait_end_io(struct bio *bio)
{
complete(bio->bi_private);

@ -1018,7 +1018,7 @@ again:
stamp = READ_ONCE(part->bd_stamp);
if (unlikely(time_after(now, stamp)) &&
likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
(end || part_in_flight(part)))
(end || bdev_count_inflight(part)))
__part_stat_add(part, io_ticks, now - stamp);
if (bdev_is_partition(part)) {

@ -173,6 +173,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;

@ -317,64 +317,26 @@ static void bio_map_kern_endio(struct bio *bio)
kfree(bio);
}
/**
* bio_map_kern - map kernel address into bio
* @q: the struct request_queue for the bio
* @data: pointer to buffer to map
* @len: length in bytes
* @gfp_mask: allocation flags for bio allocation
*
* Map the kernel address into a bio suitable for io to a block
* device. Returns an error pointer in case of error.
*/
static struct bio *bio_map_kern(struct request_queue *q, void *data,
unsigned int len, gfp_t gfp_mask)
static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op,
gfp_t gfp_mask)
{
unsigned long kaddr = (unsigned long)data;
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long start = kaddr >> PAGE_SHIFT;
const int nr_pages = end - start;
bool is_vmalloc = is_vmalloc_addr(data);
struct page *page;
int offset, i;
unsigned int nr_vecs = bio_add_max_vecs(data, len);
struct bio *bio;
bio = bio_kmalloc(nr_pages, gfp_mask);
bio = bio_kmalloc(nr_vecs, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0);
if (is_vmalloc) {
flush_kernel_vmap_range(data, len);
bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op);
if (is_vmalloc_addr(data)) {
bio->bi_private = data;
}
offset = offset_in_page(kaddr);
for (i = 0; i < nr_pages; i++) {
unsigned int bytes = PAGE_SIZE - offset;
if (len <= 0)
break;
if (bytes > len)
bytes = len;
if (!is_vmalloc)
page = virt_to_page(data);
else
page = vmalloc_to_page(data);
if (bio_add_page(bio, page, bytes, offset) < bytes) {
/* we don't support partial mappings */
if (!bio_add_vmalloc(bio, data, len)) {
bio_uninit(bio);
kfree(bio);
return ERR_PTR(-EINVAL);
}
data += bytes;
len -= bytes;
offset = 0;
} else {
bio_add_virt_nofail(bio, data, len);
}
bio->bi_end_io = bio_map_kern_endio;
return bio;
}
@ -402,17 +364,16 @@ static void bio_copy_kern_endio_read(struct bio *bio)
/**
* bio_copy_kern - copy kernel address into bio
* @q: the struct request_queue for the bio
* @data: pointer to buffer to copy
* @len: length in bytes
* @op: bio/request operation
* @gfp_mask: allocation flags for bio and page allocation
* @reading: data direction is READ
*
* copy the kernel address into a bio suitable for io to a block
* device. Returns an error pointer in case of error.
*/
static struct bio *bio_copy_kern(struct request_queue *q, void *data,
unsigned int len, gfp_t gfp_mask, int reading)
static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op,
gfp_t gfp_mask)
{
unsigned long kaddr = (unsigned long)data;
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
@ -431,7 +392,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
bio = bio_kmalloc(nr_pages, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0);
bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op);
while (len) {
struct page *page;
@ -444,7 +405,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
if (!page)
goto cleanup;
if (!reading)
if (op_is_write(op))
memcpy(page_address(page), p, bytes);
if (bio_add_page(bio, page, bytes, 0) < bytes)
@ -454,11 +415,11 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
p += bytes;
}
if (reading) {
if (op_is_write(op)) {
bio->bi_end_io = bio_copy_kern_endio;
} else {
bio->bi_end_io = bio_copy_kern_endio_read;
bio->bi_private = data;
} else {
bio->bi_end_io = bio_copy_kern_endio;
}
return bio;
@ -556,8 +517,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
if (map_data)
copy = true;
else if (blk_queue_may_bounce(q))
copy = true;
else if (iov_iter_alignment(iter) & align)
copy = true;
else if (iov_iter_is_bvec(iter))
@ -689,7 +648,6 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
/**
* blk_rq_map_kern - map kernel data to a request, for passthrough requests
* @q: request queue where request should be inserted
* @rq: request to fill
* @kbuf: the kernel buffer
* @len: length of user data
@ -700,31 +658,26 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
* buffer is used. Can be called multiple times to append multiple
* buffers.
*/
int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
unsigned int len, gfp_t gfp_mask)
int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len,
gfp_t gfp_mask)
{
int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
struct bio *bio;
int ret;
if (len > (queue_max_hw_sectors(q) << 9))
if (len > (queue_max_hw_sectors(rq->q) << SECTOR_SHIFT))
return -EINVAL;
if (!len || !kbuf)
return -EINVAL;
if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) ||
blk_queue_may_bounce(q))
bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
if (!blk_rq_aligned(rq->q, addr, len) || object_is_on_stack(kbuf))
bio = bio_copy_kern(kbuf, len, req_op(rq), gfp_mask);
else
bio = bio_map_kern(q, kbuf, len, gfp_mask);
bio = bio_map_kern(kbuf, len, req_op(rq), gfp_mask);
if (IS_ERR(bio))
return PTR_ERR(bio);
bio->bi_opf &= ~REQ_OP_MASK;
bio->bi_opf |= req_op(rq);
ret = blk_rq_append_bio(rq, bio);
if (unlikely(ret)) {
bio_uninit(bio);

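
As an illustration only (not part of this diff): a minimal sketch of a
passthrough-style caller using the new blk_rq_map_kern() signature, which drops
the request_queue argument and derives the data direction from the request
operation; the example_* name is made up, and real callers would also set up
command payloads and timeouts.

#include <linux/blk-mq.h>
#include <linux/blkdev.h>

/* Hypothetical example: read @len bytes of driver data into @buf. */
static int example_driver_read(struct request_queue *q, void *buf,
                               unsigned int len)
{
    struct request *rq;
    int ret;

    rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
    if (IS_ERR(rq))
        return PTR_ERR(rq);

    ret = blk_rq_map_kern(rq, buf, len, GFP_KERNEL);
    if (!ret)
        ret = blk_status_to_errno(blk_execute_rq(rq, false));

    blk_mq_free_request(rq);
    return ret;
}
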
@ -7,7 +7,6 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/scatterlist.h>
#include <linux/part_stat.h>
#include <linux/blk-cgroup.h>
@ -225,27 +224,6 @@ static inline unsigned get_max_io_size(struct bio *bio,
return max_sectors & ~(lbs - 1);
}
/**
* get_max_segment_size() - maximum number of bytes to add as a single segment
* @lim: Request queue limits.
* @paddr: address of the range to add
* @len: maximum length available to add at @paddr
*
* Returns the maximum number of bytes of the range starting at @paddr that can
* be added to a single segment.
*/
static inline unsigned get_max_segment_size(const struct queue_limits *lim,
phys_addr_t paddr, unsigned int len)
{
/*
* Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
* after having calculated the minimum.
*/
return min_t(unsigned long, len,
min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr),
(unsigned long)lim->max_segment_size - 1) + 1);
}
/**
* bvec_split_segs - verify whether or not a bvec should be split in the middle
* @lim: [in] queue limits to split based on
@ -473,117 +451,6 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
return nr_phys_segs;
}
struct phys_vec {
phys_addr_t paddr;
u32 len;
};
static bool blk_map_iter_next(struct request *req,
struct req_iterator *iter, struct phys_vec *vec)
{
unsigned int max_size;
struct bio_vec bv;
if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
if (!iter->bio)
return false;
vec->paddr = bvec_phys(&req->special_vec);
vec->len = req->special_vec.bv_len;
iter->bio = NULL;
return true;
}
if (!iter->iter.bi_size)
return false;
bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
vec->paddr = bvec_phys(&bv);
max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
bv.bv_len = min(bv.bv_len, max_size);
bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
/*
* If we are entirely done with this bi_io_vec entry, check if the next
* one could be merged into it. This typically happens when moving to
* the next bio, but some callers also don't pack bvecs tight.
*/
while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
struct bio_vec next;
if (!iter->iter.bi_size) {
if (!iter->bio->bi_next)
break;
iter->bio = iter->bio->bi_next;
iter->iter = iter->bio->bi_iter;
}
next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
if (bv.bv_len + next.bv_len > max_size ||
!biovec_phys_mergeable(req->q, &bv, &next))
break;
bv.bv_len += next.bv_len;
bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
}
vec->len = bv.bv_len;
return true;
}
static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
struct scatterlist *sglist)
{
if (!*sg)
return sglist;
/*
* If the driver previously mapped a shorter list, we could see a
* termination bit prematurely unless it fully inits the sg table
* on each mapping. We KNOW that there must be more entries here
* or the driver would be buggy, so force clear the termination bit
* to avoid doing a full sg_init_table() in drivers for each command.
*/
sg_unmark_end(*sg);
return sg_next(*sg);
}
/*
* Map a request to scatterlist, return number of sg entries setup. Caller
* must make sure sg can hold rq->nr_phys_segments entries.
*/
int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
struct scatterlist **last_sg)
{
struct req_iterator iter = {
.bio = rq->bio,
};
struct phys_vec vec;
int nsegs = 0;
/* the internal flush request may not have bio attached */
if (iter.bio)
iter.iter = iter.bio->bi_iter;
while (blk_map_iter_next(rq, &iter, &vec)) {
*last_sg = blk_next_sg(last_sg, sglist);
sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
offset_in_page(vec.paddr));
nsegs++;
}
if (*last_sg)
sg_mark_end(*last_sg);
/*
* Something must have been wrong if the figured number of
* segment is bigger than number of req's physical segments
*/
WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
sector_t offset)
{
@ -832,6 +699,8 @@ static struct request *attempt_merge(struct request_queue *q,
if (req->bio->bi_write_hint != next->bio->bi_write_hint)
return NULL;
if (req->bio->bi_write_stream != next->bio->bi_write_stream)
return NULL;
if (req->bio->bi_ioprio != next->bio->bi_ioprio)
return NULL;
if (!blk_atomic_write_mergeable_rqs(req, next))
@ -953,6 +822,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
return false;
if (rq->bio->bi_write_hint != bio->bi_write_hint)
return false;
if (rq->bio->bi_write_stream != bio->bi_write_stream)
return false;
if (rq->bio->bi_ioprio != bio->bi_ioprio)
return false;
if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)

@ -93,6 +93,8 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
QUEUE_FLAG_NAME(SQ_SCHED),
QUEUE_FLAG_NAME(DISABLE_WBT_DEF),
QUEUE_FLAG_NAME(NO_ELV_SWITCH),
};
#undef QUEUE_FLAG_NAME
@ -624,20 +626,9 @@ void blk_mq_debugfs_register(struct request_queue *q)
debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
/*
* blk_mq_init_sched() attempted to do this already, but q->debugfs_dir
* didn't exist yet (because we don't know what to name the directory
* until the queue is registered to a gendisk).
*/
if (q->elevator && !q->sched_debugfs_dir)
blk_mq_debugfs_register_sched(q);
/* Similarly, blk_mq_init_hctx() couldn't do this previously. */
queue_for_each_hw_ctx(q, hctx, i) {
if (!hctx->debugfs_dir)
blk_mq_debugfs_register_hctx(q, hctx);
if (q->elevator && !hctx->sched_debugfs_dir)
blk_mq_debugfs_register_sched_hctx(q, hctx);
}
if (q->rq_qos) {

block/blk-mq-dma.c (new file, 116 lines)

@ -0,0 +1,116 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2025 Christoph Hellwig
*/
#include "blk.h"
struct phys_vec {
phys_addr_t paddr;
u32 len;
};
static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
struct phys_vec *vec)
{
unsigned int max_size;
struct bio_vec bv;
if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
if (!iter->bio)
return false;
vec->paddr = bvec_phys(&req->special_vec);
vec->len = req->special_vec.bv_len;
iter->bio = NULL;
return true;
}
if (!iter->iter.bi_size)
return false;
bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
vec->paddr = bvec_phys(&bv);
max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
bv.bv_len = min(bv.bv_len, max_size);
bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
/*
* If we are entirely done with this bi_io_vec entry, check if the next
* one could be merged into it. This typically happens when moving to
* the next bio, but some callers also don't pack bvecs tight.
*/
while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
struct bio_vec next;
if (!iter->iter.bi_size) {
if (!iter->bio->bi_next)
break;
iter->bio = iter->bio->bi_next;
iter->iter = iter->bio->bi_iter;
}
next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
if (bv.bv_len + next.bv_len > max_size ||
!biovec_phys_mergeable(req->q, &bv, &next))
break;
bv.bv_len += next.bv_len;
bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
}
vec->len = bv.bv_len;
return true;
}
static inline struct scatterlist *
blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
{
if (!*sg)
return sglist;
/*
* If the driver previously mapped a shorter list, we could see a
* termination bit prematurely unless it fully inits the sg table
* on each mapping. We KNOW that there must be more entries here
* or the driver would be buggy, so force clear the termination bit
* to avoid doing a full sg_init_table() in drivers for each command.
*/
sg_unmark_end(*sg);
return sg_next(*sg);
}
/*
* Map a request to scatterlist, return number of sg entries setup. Caller
* must make sure sg can hold rq->nr_phys_segments entries.
*/
int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
struct scatterlist **last_sg)
{
struct req_iterator iter = {
.bio = rq->bio,
};
struct phys_vec vec;
int nsegs = 0;
/* the internal flush request may not have bio attached */
if (iter.bio)
iter.iter = iter.bio->bi_iter;
while (blk_map_iter_next(rq, &iter, &vec)) {
*last_sg = blk_next_sg(last_sg, sglist);
sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
offset_in_page(vec.paddr));
nsegs++;
}
if (*last_sg)
sg_mark_end(*last_sg);
/*
* Something must have been wrong if the figured number of
* segment is bigger than number of req's physical segments
*/
WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);
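
As an illustration only (not part of this diff): a hypothetical driver-side
caller of __blk_rq_map_sg(), assuming the driver owns a scatterlist table large
enough for the request, as the comment above requires; the example_* name is
made up.

#include <linux/blk-mq.h>
#include <linux/scatterlist.h>

/* Hypothetical example: map a request onto a driver-provided scatterlist. */
static int example_map_request(struct request *rq, struct scatterlist *sgl,
                               unsigned int max_ents)
{
    struct scatterlist *last = NULL;

    if (blk_rq_nr_phys_segments(rq) > max_ents)
        return -EINVAL;

    sg_init_table(sgl, max_ents);
    return __blk_rq_map_sg(rq, sgl, &last);
}
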

@ -59,19 +59,17 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
struct request *rq;
LIST_HEAD(hctx_list);
unsigned int count = 0;
list_for_each_entry(rq, rq_list, queuelist) {
if (rq->mq_hctx != hctx) {
list_cut_before(&hctx_list, rq_list, &rq->queuelist);
goto dispatch;
}
count++;
}
list_splice_tail_init(rq_list, &hctx_list);
dispatch:
return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
return blk_mq_dispatch_rq_list(hctx, &hctx_list, false);
}
#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
@ -167,7 +165,7 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
} while (!list_empty(&rq_list));
} else {
dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, false);
}
if (busy)
@ -261,7 +259,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
/* round robin for fair dispatch */
ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));
} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, false));
WRITE_ONCE(hctx->dispatch_from, ctx);
return ret;
@ -298,7 +296,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0))
if (!blk_mq_dispatch_rq_list(hctx, &rq_list, true))
return 0;
need_dispatch = true;
} else {
@ -312,7 +310,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
if (need_dispatch)
return blk_mq_do_dispatch_ctx(hctx);
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
blk_mq_dispatch_rq_list(hctx, &rq_list, true);
return 0;
}
@ -436,6 +434,30 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
return 0;
}
void blk_mq_sched_reg_debugfs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned long i;
mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_register_sched(q);
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_debugfs_register_sched_hctx(q, hctx);
mutex_unlock(&q->debugfs_mutex);
}
void blk_mq_sched_unreg_debugfs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned long i;
mutex_lock(&q->debugfs_mutex);
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_debugfs_unregister_sched_hctx(hctx);
blk_mq_debugfs_unregister_sched(q);
mutex_unlock(&q->debugfs_mutex);
}
/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
@ -469,10 +491,6 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
if (ret)
goto err_free_map_and_rqs;
mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_register_sched(q);
mutex_unlock(&q->debugfs_mutex);
queue_for_each_hw_ctx(q, hctx, i) {
if (e->ops.init_hctx) {
ret = e->ops.init_hctx(hctx, i);
@ -484,11 +502,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
return ret;
}
}
mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_register_sched_hctx(q, hctx);
mutex_unlock(&q->debugfs_mutex);
}
return 0;
err_free_map_and_rqs:
@ -527,10 +541,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
unsigned int flags = 0;
queue_for_each_hw_ctx(q, hctx, i) {
mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_sched_hctx(hctx);
mutex_unlock(&q->debugfs_mutex);
if (e->type->ops.exit_hctx && hctx->sched_data) {
e->type->ops.exit_hctx(hctx, i);
hctx->sched_data = NULL;
@ -538,12 +548,9 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
flags = hctx->flags;
}
mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_sched(q);
mutex_unlock(&q->debugfs_mutex);
if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e);
blk_mq_sched_tags_teardown(q, flags);
set_bit(ELEVATOR_FLAG_DYING, &q->elevator->flags);
q->elevator = NULL;
}

@ -89,7 +89,7 @@ struct mq_inflight {
unsigned int inflight[2];
};
static bool blk_mq_check_inflight(struct request *rq, void *priv)
static bool blk_mq_check_in_driver(struct request *rq, void *priv)
{
struct mq_inflight *mi = priv;
@ -101,24 +101,14 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv)
return true;
}
unsigned int blk_mq_in_flight(struct request_queue *q,
struct block_device *part)
void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2])
{
struct mq_inflight mi = { .part = part };
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
return mi.inflight[0] + mi.inflight[1];
}
void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
unsigned int inflight[2])
{
struct mq_inflight mi = { .part = part };
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
inflight[0] = mi.inflight[0];
inflight[1] = mi.inflight[1];
blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver,
&mi);
inflight[READ] = mi.inflight[READ];
inflight[WRITE] = mi.inflight[WRITE];
}
#ifdef CONFIG_LOCKDEP
@ -584,9 +574,13 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.shallow_depth = 0,
.cmd_flags = opf,
.rq_flags = 0,
.nr_tags = plug->nr_ios,
.cached_rqs = &plug->cached_rqs,
.ctx = NULL,
.hctx = NULL
};
struct request *rq;
@ -646,8 +640,13 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.shallow_depth = 0,
.cmd_flags = opf,
.rq_flags = 0,
.nr_tags = 1,
.cached_rqs = NULL,
.ctx = NULL,
.hctx = NULL
};
int ret;
@ -675,8 +674,13 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.shallow_depth = 0,
.cmd_flags = opf,
.rq_flags = 0,
.nr_tags = 1,
.cached_rqs = NULL,
.ctx = NULL,
.hctx = NULL
};
u64 alloc_time_ns = 0;
struct request *rq;
@ -2080,7 +2084,7 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
* Returns true if we did some work AND can potentially do more.
*/
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
unsigned int nr_budgets)
bool get_budget)
{
enum prep_dispatch prep;
struct request_queue *q = hctx->queue;
@ -2102,7 +2106,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
rq = list_first_entry(list, struct request, queuelist);
WARN_ON_ONCE(hctx != rq->mq_hctx);
prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
prep = blk_mq_prep_dispatch_rq(rq, get_budget);
if (prep != PREP_DISPATCH_OK)
break;
@ -2111,12 +2115,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
bd.rq = rq;
bd.last = list_empty(list);
/*
* once the request is queued to lld, no need to cover the
* budget any more
*/
if (nr_budgets)
nr_budgets--;
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
@ -2150,7 +2148,11 @@ out:
((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
blk_mq_is_shared_tags(hctx->flags));
if (nr_budgets)
/*
* If the caller allocated budgets, free the budgets of the
* requests that have not yet been passed to the block driver.
*/
if (!get_budget)
blk_mq_release_budgets(q, list);
spin_lock(&hctx->lock);
@ -2778,15 +2780,15 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
return __blk_mq_issue_directly(hctx, rq, last);
}
static void blk_mq_plug_issue_direct(struct blk_plug *plug)
static void blk_mq_issue_direct(struct rq_list *rqs)
{
struct blk_mq_hw_ctx *hctx = NULL;
struct request *rq;
int queued = 0;
blk_status_t ret = BLK_STS_OK;
while ((rq = rq_list_pop(&plug->mq_list))) {
bool last = rq_list_empty(&plug->mq_list);
while ((rq = rq_list_pop(rqs))) {
bool last = rq_list_empty(rqs);
if (hctx != rq->mq_hctx) {
if (hctx) {
@ -2817,15 +2819,64 @@ out:
blk_mq_commit_rqs(hctx, queued, false);
}
static void __blk_mq_flush_plug_list(struct request_queue *q,
struct blk_plug *plug)
static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs)
{
if (blk_queue_quiesced(q))
return;
q->mq_ops->queue_rqs(&plug->mq_list);
q->mq_ops->queue_rqs(rqs);
}
static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs,
struct rq_list *queue_rqs)
{
struct request *rq = rq_list_pop(rqs);
struct request_queue *this_q = rq->q;
struct request **prev = &rqs->head;
struct rq_list matched_rqs = {};
struct request *last = NULL;
unsigned depth = 1;
rq_list_add_tail(&matched_rqs, rq);
while ((rq = *prev)) {
if (rq->q == this_q) {
/* move rq from rqs to matched_rqs */
*prev = rq->rq_next;
rq_list_add_tail(&matched_rqs, rq);
depth++;
} else {
/* leave rq in rqs */
prev = &rq->rq_next;
last = rq;
}
}
rqs->tail = last;
*queue_rqs = matched_rqs;
return depth;
}
static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth)
{
struct request_queue *q = rq_list_peek(rqs)->q;
trace_block_unplug(q, depth, true);
/*
* Peek first request and see if we have a ->queue_rqs() hook.
* If we do, we can dispatch the whole list in one go.
* We already know at this point that all requests belong to the
* same queue, caller must ensure that's the case.
*/
if (q->mq_ops->queue_rqs) {
blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs));
if (rq_list_empty(rqs))
return;
}
blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs));
}
static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched)
{
struct blk_mq_hw_ctx *this_hctx = NULL;
struct blk_mq_ctx *this_ctx = NULL;
@ -2835,7 +2886,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
LIST_HEAD(list);
do {
struct request *rq = rq_list_pop(&plug->mq_list);
struct request *rq = rq_list_pop(rqs);
if (!this_hctx) {
this_hctx = rq->mq_hctx;
@ -2848,9 +2899,9 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
}
list_add_tail(&rq->queuelist, &list);
depth++;
} while (!rq_list_empty(&plug->mq_list));
} while (!rq_list_empty(rqs));
plug->mq_list = requeue_list;
*rqs = requeue_list;
trace_block_unplug(this_hctx->queue, depth, !from_sched);
percpu_ref_get(&this_hctx->queue->q_usage_counter);
@ -2870,9 +2921,21 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
percpu_ref_put(&this_hctx->queue->q_usage_counter);
}
static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs)
{
do {
struct rq_list queue_rqs;
unsigned depth;
depth = blk_mq_extract_queue_requests(rqs, &queue_rqs);
blk_mq_dispatch_queue_requests(&queue_rqs, depth);
while (!rq_list_empty(&queue_rqs))
blk_mq_dispatch_list(&queue_rqs, false);
} while (!rq_list_empty(rqs));
}
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
struct request *rq;
unsigned int depth;
/*
@ -2887,34 +2950,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
depth = plug->rq_count;
plug->rq_count = 0;
if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
struct request_queue *q;
rq = rq_list_peek(&plug->mq_list);
q = rq->q;
trace_block_unplug(q, depth, true);
/*
* Peek first request and see if we have a ->queue_rqs() hook.
* If we do, we can dispatch the whole plug list in one go. We
* already know at this point that all requests belong to the
* same queue, caller must ensure that's the case.
*/
if (q->mq_ops->queue_rqs) {
blk_mq_run_dispatch_ops(q,
__blk_mq_flush_plug_list(q, plug));
if (rq_list_empty(&plug->mq_list))
if (!plug->has_elevator && !from_schedule) {
if (plug->multiple_queues) {
blk_mq_dispatch_multiple_queue_requests(&plug->mq_list);
return;
}
blk_mq_run_dispatch_ops(q,
blk_mq_plug_issue_direct(plug));
blk_mq_dispatch_queue_requests(&plug->mq_list, depth);
if (rq_list_empty(&plug->mq_list))
return;
}
do {
blk_mq_dispatch_plug_list(plug, from_schedule);
blk_mq_dispatch_list(&plug->mq_list, from_schedule);
} while (!rq_list_empty(&plug->mq_list));
}
@ -2969,8 +3017,14 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
{
struct blk_mq_alloc_data data = {
.q = q,
.nr_tags = 1,
.flags = 0,
.shallow_depth = 0,
.cmd_flags = bio->bi_opf,
.rq_flags = 0,
.nr_tags = 1,
.cached_rqs = NULL,
.ctx = NULL,
.hctx = NULL
};
struct request *rq;
@ -3080,8 +3134,6 @@ void blk_mq_submit_bio(struct bio *bio)
goto new_request;
}
bio = blk_queue_bounce(bio, q);
/*
* The cached request already holds a q_usage_counter reference and we
* don't have to acquire a new one if we use it.
@ -4094,8 +4146,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
mutex_lock(&q->elevator_lock);
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
@ -4200,8 +4250,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
mutex_unlock(&q->elevator_lock);
}
/*
@ -4505,16 +4553,9 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
}
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct request_queue *q, bool lock)
struct request_queue *q)
{
if (lock) {
/* protect against switching io scheduler */
mutex_lock(&q->elevator_lock);
__blk_mq_realloc_hw_ctxs(set, q);
mutex_unlock(&q->elevator_lock);
} else {
__blk_mq_realloc_hw_ctxs(set, q);
}
/* unregister cpuhp callbacks for exited hctxs */
blk_mq_remove_hw_queues_cpuhp(q);
@ -4546,7 +4587,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
xa_init(&q->hctx_table);
blk_mq_realloc_hw_ctxs(set, q, false);
blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
goto err_hctxs;
@ -4563,8 +4604,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->nr_requests = set->queue_depth;
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
blk_mq_add_queue_tag_set(set, q);
return 0;
err_hctxs:
@ -4784,6 +4825,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
goto out_free_srcu;
}
init_rwsem(&set->update_nr_hwq_lock);
ret = -ENOMEM;
set->tags = kcalloc_node(set->nr_hw_queues,
sizeof(struct blk_mq_tags *), GFP_KERNEL,
@ -4923,88 +4966,10 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
return ret;
}
/*
* request_queue and elevator_type pair.
* It is just used by __blk_mq_update_nr_hw_queues to cache
* the elevator_type associated with a request_queue.
*/
struct blk_mq_qe_pair {
struct list_head node;
struct request_queue *q;
struct elevator_type *type;
};
/*
* Cache the elevator_type in qe pair list and switch the
* io scheduler to 'none'
*/
static bool blk_mq_elv_switch_none(struct list_head *head,
struct request_queue *q)
{
struct blk_mq_qe_pair *qe;
qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
if (!qe)
return false;
/* Accessing q->elevator needs protection from ->elevator_lock. */
mutex_lock(&q->elevator_lock);
if (!q->elevator) {
kfree(qe);
goto unlock;
}
INIT_LIST_HEAD(&qe->node);
qe->q = q;
qe->type = q->elevator->type;
/* keep a reference to the elevator module as we'll switch back */
__elevator_get(qe->type);
list_add(&qe->node, head);
elevator_disable(q);
unlock:
mutex_unlock(&q->elevator_lock);
return true;
}
static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
struct request_queue *q)
{
struct blk_mq_qe_pair *qe;
list_for_each_entry(qe, head, node)
if (qe->q == q)
return qe;
return NULL;
}
static void blk_mq_elv_switch_back(struct list_head *head,
struct request_queue *q)
{
struct blk_mq_qe_pair *qe;
struct elevator_type *t;
qe = blk_lookup_qe_pair(head, q);
if (!qe)
return;
t = qe->type;
list_del(&qe->node);
kfree(qe);
mutex_lock(&q->elevator_lock);
elevator_switch(q, t);
/* drop the reference acquired in blk_mq_elv_switch_none */
elevator_put(t);
mutex_unlock(&q->elevator_lock);
}
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
struct request_queue *q;
LIST_HEAD(head);
int prev_nr_hw_queues = set->nr_hw_queues;
unsigned int memflags;
int i;
@ -5019,30 +4984,24 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
return;
memflags = memalloc_noio_save();
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue_nomemsave(q);
/*
* Switch IO scheduler to 'none', cleaning up the data associated
* with the previous scheduler. We will switch back once we are done
* updating the new sw to hw queue mappings.
*/
list_for_each_entry(q, &set->tag_list, tag_set_list)
if (!blk_mq_elv_switch_none(&head, q))
goto switch_back;
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
blk_mq_sysfs_unregister_hctxs(q);
}
if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue_nomemsave(q);
if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_unfreeze_queue_nomemrestore(q);
goto reregister;
}
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_realloc_hw_ctxs(set, q, true);
__blk_mq_realloc_hw_ctxs(set, q);
if (q->nr_hw_queues != set->nr_hw_queues) {
int i = prev_nr_hw_queues;
@ -5058,18 +5017,18 @@ fallback:
blk_mq_map_swqueue(q);
}
/* elv_update_nr_hw_queues() unfreeze queue for us */
list_for_each_entry(q, &set->tag_list, tag_set_list)
elv_update_nr_hw_queues(q);
reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register_hctxs(q);
blk_mq_debugfs_register_hctxs(q);
blk_mq_remove_hw_queues_cpuhp(q);
blk_mq_add_hw_queues_cpuhp(q);
}
switch_back:
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_elv_switch_back(&head, q);
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_unfreeze_queue_nomemrestore(q);
memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */
@ -5079,9 +5038,11 @@ switch_back:
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
down_write(&set->update_nr_hwq_lock);
mutex_lock(&set->tag_list_lock);
__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
mutex_unlock(&set->tag_list_lock);
up_write(&set->update_nr_hwq_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);

@ -48,7 +48,7 @@ void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
unsigned int);
bool);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
@ -246,10 +246,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
return hctx->nr_ctx && hctx->tags;
}
unsigned int blk_mq_in_flight(struct request_queue *q,
struct block_device *part);
void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
unsigned int inflight[2]);
void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]);
static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
int budget_token)


@ -2,6 +2,8 @@
#include "blk-rq-qos.h"
__read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos);
/*
* Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
* false if 'v' + 1 would be bigger than 'below'.
@ -317,6 +319,7 @@ void rq_qos_exit(struct request_queue *q)
struct rq_qos *rqos = q->rq_qos;
q->rq_qos = rqos->next;
rqos->ops->exit(rqos);
static_branch_dec(&block_rq_qos);
}
mutex_unlock(&q->rq_qos_mutex);
}
@ -343,6 +346,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
goto ebusy;
rqos->next = q->rq_qos;
q->rq_qos = rqos;
static_branch_inc(&block_rq_qos);
blk_mq_unfreeze_queue(q, memflags);


@ -12,6 +12,7 @@
#include "blk-mq-debugfs.h"
struct blk_mq_debugfs_attr;
extern struct static_key_false block_rq_qos;
enum rq_qos_id {
RQ_QOS_WBT,
@ -112,31 +113,33 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos);
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
{
if (q->rq_qos)
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_cleanup(q->rq_qos, bio);
}
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
{
if (q->rq_qos && !blk_rq_is_passthrough(rq))
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos &&
!blk_rq_is_passthrough(rq))
__rq_qos_done(q->rq_qos, rq);
}
static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
{
if (q->rq_qos)
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_issue(q->rq_qos, rq);
}
static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
{
if (q->rq_qos)
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_requeue(q->rq_qos, rq);
}
static inline void rq_qos_done_bio(struct bio *bio)
{
if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
if (static_branch_unlikely(&block_rq_qos) &&
bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
bio_flagged(bio, BIO_QOS_MERGED))) {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
if (q->rq_qos)
@ -146,7 +149,7 @@ static inline void rq_qos_done_bio(struct bio *bio)
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
if (q->rq_qos) {
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) {
bio_set_flag(bio, BIO_QOS_THROTTLED);
__rq_qos_throttle(q->rq_qos, bio);
}
@ -155,14 +158,14 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
struct bio *bio)
{
if (q->rq_qos)
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_track(q->rq_qos, rq, bio);
}
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
if (q->rq_qos) {
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) {
bio_set_flag(bio, BIO_QOS_MERGED);
__rq_qos_merge(q->rq_qos, rq, bio);
}
@ -170,7 +173,7 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
{
if (q->rq_qos)
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_queue_depth_changed(q->rq_qos);
}
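
The hunks above put a global static key in front of every rq_qos hook, so queues with no QoS policy attached never even load the per-queue pointer on the hot path. Below is a minimal userspace sketch of the pattern, with an atomic counter standing in for static_branch_inc()/dec() and stub structs in place of the real rq_qos machinery; it only illustrates the ordering of the checks and the inc/dec pairing on add/exit.

    /* sketch: approximating the block_rq_qos static key with an atomic counter */
    #include <stdatomic.h>
    #include <stdio.h>

    struct rq_qos { const char *name; struct rq_qos *next; };
    struct request_queue { struct rq_qos *rq_qos; };

    /* stands in for DEFINE_STATIC_KEY_FALSE(block_rq_qos) */
    static atomic_int block_rq_qos_enabled;

    static void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
    {
        rqos->next = q->rq_qos;
        q->rq_qos = rqos;
        atomic_fetch_add(&block_rq_qos_enabled, 1);    /* static_branch_inc() */
    }

    static void rq_qos_exit(struct request_queue *q)
    {
        while (q->rq_qos) {
            q->rq_qos = q->rq_qos->next;
            atomic_fetch_sub(&block_rq_qos_enabled, 1); /* static_branch_dec() */
        }
    }

    /* hot-path hook: cheap global check first, per-queue pointer second */
    static void rq_qos_issue(struct request_queue *q)
    {
        if (atomic_load(&block_rq_qos_enabled) && q->rq_qos)
            printf("QoS hook runs for policy %s\n", q->rq_qos->name);
        else
            printf("QoS hook skipped\n");
    }

    int main(void)
    {
        struct request_queue q = { 0 };
        struct rq_qos wbt = { .name = "wbt" };

        rq_qos_issue(&q);       /* skipped: counter zero, no policy */
        rq_qos_add(&q, &wbt);
        rq_qos_issue(&q);       /* runs */
        rq_qos_exit(&q);
        rq_qos_issue(&q);       /* skipped again */
        return 0;
    }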


@ -124,11 +124,6 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
return 0;
}
if (lim->features & BLK_FEAT_BOUNCE_HIGH) {
pr_warn("no bounce buffer support for integrity metadata\n");
return -EINVAL;
}
if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) {
pr_warn("integrity support disabled.\n");
return -EINVAL;


@ -134,6 +134,8 @@ QUEUE_SYSFS_LIMIT_SHOW(max_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_segment_size)
QUEUE_SYSFS_LIMIT_SHOW(max_write_streams)
QUEUE_SYSFS_LIMIT_SHOW(write_stream_granularity)
QUEUE_SYSFS_LIMIT_SHOW(logical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(physical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors)
@ -488,6 +490,8 @@ QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments");
QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size");
QUEUE_LIM_RO_ENTRY(queue_max_write_streams, "max_write_streams");
QUEUE_LIM_RO_ENTRY(queue_write_stream_granularity, "write_stream_granularity");
QUEUE_RW_ENTRY(elv_iosched, "scheduler");
QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size");
@ -560,7 +564,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
ssize_t ret;
struct request_queue *q = disk->queue;
mutex_lock(&q->elevator_lock);
mutex_lock(&disk->rqos_state_mutex);
if (!wbt_rq_qos(q)) {
ret = -EINVAL;
goto out;
@ -573,7 +577,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
out:
mutex_unlock(&q->elevator_lock);
mutex_unlock(&disk->rqos_state_mutex);
return ret;
}
@ -593,7 +597,6 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
return -EINVAL;
memflags = blk_mq_freeze_queue(q);
mutex_lock(&q->elevator_lock);
rqos = wbt_rq_qos(q);
if (!rqos) {
@ -618,11 +621,12 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
*/
blk_mq_quiesce_queue(q);
mutex_lock(&disk->rqos_state_mutex);
wbt_set_min_lat(q, val);
mutex_unlock(&disk->rqos_state_mutex);
blk_mq_unquiesce_queue(q);
out:
mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
return ret;
@ -642,6 +646,8 @@ static struct attribute *queue_attrs[] = {
&queue_max_discard_segments_entry.attr,
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
&queue_max_write_streams_entry.attr,
&queue_write_stream_granularity_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
&queue_physical_block_size_entry.attr,
@ -869,16 +875,9 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
goto out_unregister_ia_ranges;
mutex_lock(&q->elevator_lock);
if (q->elevator) {
ret = elv_register_queue(q, false);
if (ret) {
mutex_unlock(&q->elevator_lock);
goto out_crypto_sysfs_unregister;
}
}
if (queue_is_mq(q))
elevator_set_default(q);
wbt_enable_default(disk);
mutex_unlock(&q->elevator_lock);
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
@ -902,8 +901,6 @@ int blk_register_queue(struct gendisk *disk)
return ret;
out_crypto_sysfs_unregister:
blk_crypto_sysfs_unregister(disk);
out_unregister_ia_ranges:
disk_unregister_independent_access_ranges(disk);
out_debugfs_remove:
@ -951,10 +948,6 @@ void blk_unregister_queue(struct gendisk *disk)
blk_mq_sysfs_unregister(disk);
blk_crypto_sysfs_unregister(disk);
mutex_lock(&q->elevator_lock);
elv_unregister_queue(q);
mutex_unlock(&q->elevator_lock);
mutex_lock(&q->sysfs_lock);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
@ -963,5 +956,8 @@ void blk_unregister_queue(struct gendisk *disk)
kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE);
kobject_del(&disk->queue_kobj);
if (queue_is_mq(q))
elevator_set_none(q);
blk_debugfs_remove(disk);
}


@ -143,7 +143,8 @@ static inline unsigned int throtl_bio_data_size(struct bio *bio)
static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
INIT_LIST_HEAD(&qn->node);
bio_list_init(&qn->bios);
bio_list_init(&qn->bios_bps);
bio_list_init(&qn->bios_iops);
qn->tg = tg;
}
@ -151,18 +152,32 @@ static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
* throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
* @bio: bio being added
* @qn: qnode to add bio to
* @queued: the service_queue->queued[] list @qn belongs to
* @sq: the service_queue @qn belongs to
*
* Add @bio to @qn and put @qn on @queued if it's not already on.
* Add @bio to @qn and put @qn on @sq->queued if it's not already on.
* @qn->tg's reference count is bumped when @qn is activated. See the
* comment on top of throtl_qnode definition for details.
*/
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
struct list_head *queued)
struct throtl_service_queue *sq)
{
bio_list_add(&qn->bios, bio);
bool rw = bio_data_dir(bio);
/*
* Split bios have already been throttled by bps, so they are
* directly queued into the iops path.
*/
if (bio_flagged(bio, BIO_TG_BPS_THROTTLED) ||
bio_flagged(bio, BIO_BPS_THROTTLED)) {
bio_list_add(&qn->bios_iops, bio);
sq->nr_queued_iops[rw]++;
} else {
bio_list_add(&qn->bios_bps, bio);
sq->nr_queued_bps[rw]++;
}
if (list_empty(&qn->node)) {
list_add_tail(&qn->node, queued);
list_add_tail(&qn->node, &sq->queued[rw]);
blkg_get(tg_to_blkg(qn->tg));
}
}
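
A qnode now carries two bio lists, and the add/peek/pop helpers treat them asymmetrically: bios that already paid their bps debt (split or BPS_THROTTLED bios) go straight onto the iops list, and dispatch always drains the iops list before the bps list. A standalone sketch of that ordering, with a stub bio and hand-rolled singly linked queues in place of bio_list:

    /* sketch: two-queue qnode - the iops list is always drained first */
    #include <stdbool.h>
    #include <stdio.h>

    struct bio { int id; bool bps_throttled; struct bio *next; };

    struct qnode {
        struct bio *bps_head, *bps_tail;     /* still waiting for bps budget */
        struct bio *iops_head, *iops_tail;   /* bps already charged          */
    };

    static void enqueue(struct bio **head, struct bio **tail, struct bio *b)
    {
        b->next = NULL;
        if (*tail)
            (*tail)->next = b;
        else
            *head = b;
        *tail = b;
    }

    static struct bio *dequeue(struct bio **head, struct bio **tail)
    {
        struct bio *b = *head;

        if (b) {
            *head = b->next;
            if (!*head)
                *tail = NULL;
        }
        return b;
    }

    /* mirrors throtl_qnode_add_bio(): bps-throttled bios skip the bps queue */
    static void qnode_add(struct qnode *qn, struct bio *b)
    {
        if (b->bps_throttled)
            enqueue(&qn->iops_head, &qn->iops_tail, b);
        else
            enqueue(&qn->bps_head, &qn->bps_tail, b);
    }

    /* mirrors throtl_pop_queued(): take from the iops queue before the bps one */
    static struct bio *qnode_pop(struct qnode *qn)
    {
        struct bio *b = dequeue(&qn->iops_head, &qn->iops_tail);

        if (!b)
            b = dequeue(&qn->bps_head, &qn->bps_tail);
        return b;
    }

    int main(void)
    {
        struct qnode qn = { 0 };
        struct bio bios[] = {
            { .id = 1, .bps_throttled = false },
            { .id = 2, .bps_throttled = true },   /* e.g. a split bio */
            { .id = 3, .bps_throttled = false },
            { .id = 4, .bps_throttled = true },
        };
        struct bio *b;

        for (int i = 0; i < 4; i++)
            qnode_add(&qn, &bios[i]);

        /* prints 2 and 4 (iops queue) before 1 and 3 (bps queue) */
        while ((b = qnode_pop(&qn)))
            printf("dispatch bio %d\n", b->id);
        return 0;
    }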
@ -170,6 +185,10 @@ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
/**
* throtl_peek_queued - peek the first bio on a qnode list
* @queued: the qnode list to peek
*
* Always take a bio from the head of the iops queue first. If the queue is
* empty, we then take it from the bps queue to maintain the overall idea of
* fetching bios from the head.
*/
static struct bio *throtl_peek_queued(struct list_head *queued)
{
@ -180,28 +199,33 @@ static struct bio *throtl_peek_queued(struct list_head *queued)
return NULL;
qn = list_first_entry(queued, struct throtl_qnode, node);
bio = bio_list_peek(&qn->bios);
bio = bio_list_peek(&qn->bios_iops);
if (!bio)
bio = bio_list_peek(&qn->bios_bps);
WARN_ON_ONCE(!bio);
return bio;
}
/**
* throtl_pop_queued - pop the first bio form a qnode list
* @queued: the qnode list to pop a bio from
* @sq: the service_queue to pop a bio from
* @tg_to_put: optional out argument for throtl_grp to put
* @rw: read/write
*
* Pop the first bio from the qnode list @queued. After popping, the first
* qnode is removed from @queued if empty or moved to the end of @queued so
* that the popping order is round-robin.
* Pop the first bio from the qnode list @sq->queued. Note that we firstly
* focus on the iops list because bios are ultimately dispatched from it.
* After popping, the first qnode is removed from @sq->queued if empty or moved
* to the end of @sq->queued so that the popping order is round-robin.
*
* When the first qnode is removed, its associated throtl_grp should be put
* too. If @tg_to_put is NULL, this function automatically puts it;
* otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
* responsible for putting it.
*/
static struct bio *throtl_pop_queued(struct list_head *queued,
struct throtl_grp **tg_to_put)
static struct bio *throtl_pop_queued(struct throtl_service_queue *sq,
struct throtl_grp **tg_to_put, bool rw)
{
struct list_head *queued = &sq->queued[rw];
struct throtl_qnode *qn;
struct bio *bio;
@ -209,10 +233,17 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
return NULL;
qn = list_first_entry(queued, struct throtl_qnode, node);
bio = bio_list_pop(&qn->bios);
bio = bio_list_pop(&qn->bios_iops);
if (bio) {
sq->nr_queued_iops[rw]--;
} else {
bio = bio_list_pop(&qn->bios_bps);
if (bio)
sq->nr_queued_bps[rw]--;
}
WARN_ON_ONCE(!bio);
if (bio_list_empty(&qn->bios)) {
if (bio_list_empty(&qn->bios_bps) && bio_list_empty(&qn->bios_iops)) {
list_del_init(&qn->node);
if (tg_to_put)
*tg_to_put = qn->tg;
@ -520,6 +551,9 @@ static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
{
if (!time_before(tg->slice_end[rw], jiffy_end))
return;
throtl_set_slice_end(tg, rw, jiffy_end);
throtl_log(&tg->service_queue,
"[%c] extend slice start=%lu end=%lu jiffies=%lu",
@ -536,6 +570,11 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
return true;
}
static unsigned int sq_queued(struct throtl_service_queue *sq, int type)
{
return sq->nr_queued_bps[type] + sq->nr_queued_iops[type];
}
static unsigned int calculate_io_allowed(u32 iops_limit,
unsigned long jiffy_elapsed)
{
@ -571,6 +610,48 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
}
static long long throtl_trim_bps(struct throtl_grp *tg, bool rw,
unsigned long time_elapsed)
{
u64 bps_limit = tg_bps_limit(tg, rw);
long long bytes_trim;
if (bps_limit == U64_MAX)
return 0;
/* Need to consider the case of bytes_allowed overflow. */
bytes_trim = calculate_bytes_allowed(bps_limit, time_elapsed);
if (bytes_trim <= 0 || tg->bytes_disp[rw] < bytes_trim) {
bytes_trim = tg->bytes_disp[rw];
tg->bytes_disp[rw] = 0;
} else {
tg->bytes_disp[rw] -= bytes_trim;
}
return bytes_trim;
}
static int throtl_trim_iops(struct throtl_grp *tg, bool rw,
unsigned long time_elapsed)
{
u32 iops_limit = tg_iops_limit(tg, rw);
int io_trim;
if (iops_limit == UINT_MAX)
return 0;
/* Need to consider the case of io_allowed overflow. */
io_trim = calculate_io_allowed(iops_limit, time_elapsed);
if (io_trim <= 0 || tg->io_disp[rw] < io_trim) {
io_trim = tg->io_disp[rw];
tg->io_disp[rw] = 0;
} else {
tg->io_disp[rw] -= io_trim;
}
return io_trim;
}
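
The new trim helpers subtract the budget that the elapsed slice time would have allowed, clamping when the computed allowance is non-positive (the overflow case) or larger than what was actually charged. A standalone arithmetic check of the bps variant, with milliseconds standing in for jiffies and 1000 for HZ purely for illustration:

    /* sketch: trimming charged bytes by the allowance of the elapsed time */
    #include <inttypes.h>
    #include <stdio.h>

    #define MS_PER_SEC 1000ULL   /* stand-in for HZ */

    static uint64_t bytes_allowed(uint64_t bps_limit, uint64_t elapsed_ms)
    {
        /* calculate_bytes_allowed(): limit * elapsed / HZ */
        return bps_limit * elapsed_ms / MS_PER_SEC;
    }

    /* mirrors throtl_trim_bps(): returns how much budget was trimmed */
    static int64_t trim_bps(int64_t *bytes_disp, uint64_t bps_limit,
                            uint64_t elapsed_ms)
    {
        int64_t trim = (int64_t)bytes_allowed(bps_limit, elapsed_ms);

        if (trim <= 0 || *bytes_disp < trim) {
            trim = *bytes_disp;     /* can't trim more than was charged */
            *bytes_disp = 0;
        } else {
            *bytes_disp -= trim;
        }
        return trim;
    }

    int main(void)
    {
        int64_t disp = 3 << 20;                     /* 3 MiB charged so far */
        int64_t t = trim_bps(&disp, 1 << 20, 1500); /* 1 MiB/s, 1.5 s elapsed */

        printf("trimmed %" PRId64 " bytes, %" PRId64 " still charged\n",
               t, disp);                            /* trims 1.5 MiB, 1.5 MiB left */
        return 0;
    }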
/* Trim the used slices and adjust slice start accordingly */
static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
@ -612,22 +693,11 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
* one extra slice is preserved for deviation.
*/
time_elapsed -= tg->td->throtl_slice;
bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw),
time_elapsed);
io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed);
if (bytes_trim <= 0 && io_trim <= 0)
bytes_trim = throtl_trim_bps(tg, rw, time_elapsed);
io_trim = throtl_trim_iops(tg, rw, time_elapsed);
if (!bytes_trim && !io_trim)
return;
if ((long long)tg->bytes_disp[rw] >= bytes_trim)
tg->bytes_disp[rw] -= bytes_trim;
else
tg->bytes_disp[rw] = 0;
if ((int)tg->io_disp[rw] >= io_trim)
tg->io_disp[rw] -= io_trim;
else
tg->io_disp[rw] = 0;
tg->slice_start[rw] += time_elapsed;
throtl_log(&tg->service_queue,
@ -643,21 +713,41 @@ static void __tg_update_carryover(struct throtl_grp *tg, bool rw,
unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
u64 bps_limit = tg_bps_limit(tg, rw);
u32 iops_limit = tg_iops_limit(tg, rw);
long long bytes_allowed;
int io_allowed;
/*
* If the queue is empty, carryover handling is not needed. In such cases,
* tg->[bytes/io]_disp should be reset to 0 to avoid impacting the dispatch
* of subsequent bios. The same handling applies when the previous BPS/IOPS
* limit was set to max.
*/
if (sq_queued(&tg->service_queue, rw) == 0) {
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
return;
}
/*
* If config is updated while bios are still throttled, calculate and
* accumulate how many bytes/ios are waited across changes. And
* carryover_bytes/ios will be used to calculate new wait time under new
* configuration.
* accumulate how many bytes/ios are waited across changes. And use the
* calculated carryover (@bytes/@ios) to update [bytes/io]_disp, which
* will be used to calculate new wait time under new configuration.
* And we need to consider the case of bytes/io_allowed overflow.
*/
if (bps_limit != U64_MAX)
*bytes = calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
tg->bytes_disp[rw];
if (iops_limit != UINT_MAX)
*ios = calculate_io_allowed(iops_limit, jiffy_elapsed) -
tg->io_disp[rw];
tg->bytes_disp[rw] -= *bytes;
tg->io_disp[rw] -= *ios;
if (bps_limit != U64_MAX) {
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed);
if (bytes_allowed > 0)
*bytes = bytes_allowed - tg->bytes_disp[rw];
}
if (iops_limit != UINT_MAX) {
io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed);
if (io_allowed > 0)
*ios = io_allowed - tg->io_disp[rw];
}
tg->bytes_disp[rw] = -*bytes;
tg->io_disp[rw] = -*ios;
}
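
With this rework the carryover is no longer kept in separate carryover_bytes/ios fields; it is folded into [bytes/io]_disp as a negative value. A worked example under assumed numbers (1 MiB/s old limit, 2 s elapsed in the slice, 4 MiB already charged when the limit changes), using the same HZ=1000 simplification as above:

    /* sketch: carryover folded into bytes_disp as a negated value */
    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t old_bps_limit = 1 << 20;      /* 1 MiB/s before the change */
        uint64_t elapsed_ms = 2000;            /* 2 s into the slice        */
        int64_t bytes_disp = 4 << 20;          /* 4 MiB already charged     */

        /* __tg_update_carryover(): allowance under the old config ...      */
        int64_t allowed = (int64_t)(old_bps_limit * elapsed_ms / 1000);
        /* ... minus what was dispatched gives the carryover (may be < 0)   */
        int64_t carryover = allowed - bytes_disp;

        /* disp becomes the negated carryover: a debt if we overshot        */
        bytes_disp = -carryover;

        printf("carryover %" PRId64 ", new bytes_disp %" PRId64 "\n",
               carryover, bytes_disp);  /* carryover -2 MiB, disp +2 MiB */
        return 0;
    }

A positive bytes_disp after the update is debt carried into the new configuration; a negative one is unused allowance credited to it.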
static void tg_update_carryover(struct throtl_grp *tg)
@ -665,12 +755,10 @@ static void tg_update_carryover(struct throtl_grp *tg)
long long bytes[2] = {0};
int ios[2] = {0};
if (tg->service_queue.nr_queued[READ])
__tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]);
if (tg->service_queue.nr_queued[WRITE])
__tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]);
/* see comments in struct throtl_grp for meaning of these fields. */
/* see comments in struct throtl_grp for meaning of carryover. */
throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__,
bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]);
}
@ -682,10 +770,6 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio
int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
if (iops_limit == UINT_MAX) {
return 0;
}
jiffy_elapsed = jiffies - tg->slice_start[rw];
/* Round up to the next throttle slice, wait time must be nonzero */
@ -711,11 +795,6 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio);
/* no need to throttle if this bio's bytes have been accounted */
if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
return 0;
}
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
/* Slice has just started. Consider one slice interval */
@ -724,7 +803,9 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd);
if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
/* Need to consider the case of bytes_allowed overflow. */
if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
|| bytes_allowed < 0)
return 0;
/* Calc approx time to dispatch */
@ -742,17 +823,82 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
return jiffy_wait;
}
static void throtl_charge_bps_bio(struct throtl_grp *tg, struct bio *bio)
{
unsigned int bio_size = throtl_bio_data_size(bio);
/* Charge the bio to the group */
if (!bio_flagged(bio, BIO_BPS_THROTTLED) &&
!bio_flagged(bio, BIO_TG_BPS_THROTTLED)) {
bio_set_flag(bio, BIO_TG_BPS_THROTTLED);
tg->bytes_disp[bio_data_dir(bio)] += bio_size;
}
}
static void throtl_charge_iops_bio(struct throtl_grp *tg, struct bio *bio)
{
bio_clear_flag(bio, BIO_TG_BPS_THROTTLED);
tg->io_disp[bio_data_dir(bio)]++;
}
/*
* Returns whether one can dispatch a bio or not. Also returns approx number
* of jiffies to wait before this bio is with-in IO rate and can be dispatched
* If previous slice expired, start a new one otherwise renew/extend existing
* slice to make sure it is at least throtl_slice interval long since now. New
* slice is started only for empty throttle group. If there is queued bio, that
* means there should be an active slice and it should be extended instead.
*/
static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
unsigned long *wait)
static void tg_update_slice(struct throtl_grp *tg, bool rw)
{
if (throtl_slice_used(tg, rw) &&
sq_queued(&tg->service_queue, rw) == 0)
throtl_start_new_slice(tg, rw, true);
else
throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice);
}
static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio)
{
bool rw = bio_data_dir(bio);
unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
u64 bps_limit = tg_bps_limit(tg, rw);
unsigned long bps_wait;
/* no need to throttle if this bio's bytes have been accounted */
if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING ||
bio_flagged(bio, BIO_BPS_THROTTLED) ||
bio_flagged(bio, BIO_TG_BPS_THROTTLED))
return 0;
tg_update_slice(tg, rw);
bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
throtl_extend_slice(tg, rw, jiffies + bps_wait);
return bps_wait;
}
static unsigned long tg_dispatch_iops_time(struct throtl_grp *tg, struct bio *bio)
{
bool rw = bio_data_dir(bio);
u32 iops_limit = tg_iops_limit(tg, rw);
unsigned long iops_wait;
if (iops_limit == UINT_MAX || tg->flags & THROTL_TG_CANCELING)
return 0;
tg_update_slice(tg, rw);
iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
throtl_extend_slice(tg, rw, jiffies + iops_wait);
return iops_wait;
}
/*
* Returns approx number of jiffies to wait before this bio is within the IO rate
* and can be moved to another queue or dispatched.
*/
static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio)
{
bool rw = bio_data_dir(bio);
unsigned long wait;
/*
* Currently whole state machine of group depends on first bio
@ -760,62 +906,20 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
* this function with a different bio if there are other bios
* queued.
*/
BUG_ON(tg->service_queue.nr_queued[rw] &&
BUG_ON(sq_queued(&tg->service_queue, rw) &&
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */
if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
tg->flags & THROTL_TG_CANCELING) {
if (wait)
*wait = 0;
return true;
}
wait = tg_dispatch_bps_time(tg, bio);
if (wait != 0)
return wait;
/*
* If previous slice expired, start a new one otherwise renew/extend
* existing slice to make sure it is at least throtl_slice interval
* long since now. New slice is started only for empty throttle group.
* If there is queued bio, that means there should be an active
* slice and it should be extended instead.
* Charge bps here because @bio will be directly placed into the
* iops queue afterward.
*/
if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
throtl_start_new_slice(tg, rw, true);
else {
if (time_before(tg->slice_end[rw],
jiffies + tg->td->throtl_slice))
throtl_extend_slice(tg, rw,
jiffies + tg->td->throtl_slice);
}
throtl_charge_bps_bio(tg, bio);
bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
if (bps_wait + iops_wait == 0) {
if (wait)
*wait = 0;
return true;
}
max_wait = max(bps_wait, iops_wait);
if (wait)
*wait = max_wait;
if (time_before(tg->slice_end[rw], jiffies + max_wait))
throtl_extend_slice(tg, rw, jiffies + max_wait);
return false;
}
static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
bool rw = bio_data_dir(bio);
unsigned int bio_size = throtl_bio_data_size(bio);
/* Charge the bio to the group */
if (!bio_flagged(bio, BIO_BPS_THROTTLED))
tg->bytes_disp[rw] += bio_size;
tg->io_disp[rw]++;
return tg_dispatch_iops_time(tg, bio);
}
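
tg_dispatch_time() now evaluates the two limits strictly in sequence: a remaining bps wait keeps the bio on the bps queue; otherwise bps is charged on the spot and only the iops wait is left, since the bio will sit on the iops queue from then on. A compact control-flow sketch with made-up wait values in place of the real slice math:

    /* sketch: bps gate first, then charge bps and fall through to the iops gate */
    #include <stdbool.h>
    #include <stdio.h>

    struct grp { long bps_wait_ms; long iops_wait_ms; bool bps_charged; };

    static long dispatch_bps_time(struct grp *g, bool already_bps_throttled)
    {
        if (already_bps_throttled)      /* split bios skip the bps stage */
            return 0;
        return g->bps_wait_ms;
    }

    static long dispatch_iops_time(struct grp *g)
    {
        return g->iops_wait_ms;
    }

    static long dispatch_time(struct grp *g, bool already_bps_throttled)
    {
        long wait = dispatch_bps_time(g, already_bps_throttled);

        if (wait)                       /* still over bps: stay on the bps queue */
            return wait;

        g->bps_charged = true;          /* throtl_charge_bps_bio()               */
        return dispatch_iops_time(g);   /* remaining wait is iops-only           */
    }

    int main(void)
    {
        struct grp g = { .bps_wait_ms = 0, .iops_wait_ms = 40 };

        printf("wait %ld ms, bps charged: %d\n",
               dispatch_time(&g, false), g.bps_charged);  /* 40 ms, charged */
        return 0;
    }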
/**
@ -842,28 +946,36 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
* dispatched. Mark that @tg was empty. This is automatically
* cleared on the next tg_update_disptime().
*/
if (!sq->nr_queued[rw])
if (sq_queued(sq, rw) == 0)
tg->flags |= THROTL_TG_WAS_EMPTY;
throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
throtl_qnode_add_bio(bio, qn, sq);
/*
* Since we have split the queues, when the iops queue was
* previously empty and a new @bio is added into the first @qn,
* we also need to update the @tg->disptime.
*/
if (bio_flagged(bio, BIO_BPS_THROTTLED) &&
bio == throtl_peek_queued(&sq->queued[rw]))
tg->flags |= THROTL_TG_IOPS_WAS_EMPTY;
sq->nr_queued[rw]++;
throtl_enqueue_tg(tg);
}
static void tg_update_disptime(struct throtl_grp *tg)
{
struct throtl_service_queue *sq = &tg->service_queue;
unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
unsigned long read_wait = -1, write_wait = -1, min_wait, disptime;
struct bio *bio;
bio = throtl_peek_queued(&sq->queued[READ]);
if (bio)
tg_may_dispatch(tg, bio, &read_wait);
read_wait = tg_dispatch_time(tg, bio);
bio = throtl_peek_queued(&sq->queued[WRITE]);
if (bio)
tg_may_dispatch(tg, bio, &write_wait);
write_wait = tg_dispatch_time(tg, bio);
min_wait = min(read_wait, write_wait);
disptime = jiffies + min_wait;
@ -875,6 +987,7 @@ static void tg_update_disptime(struct throtl_grp *tg)
/* see throtl_add_bio_tg() */
tg->flags &= ~THROTL_TG_WAS_EMPTY;
tg->flags &= ~THROTL_TG_IOPS_WAS_EMPTY;
}
static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
@ -901,10 +1014,9 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
* getting released prematurely. Remember the tg to put and put it
* after @bio is transferred to @parent_sq.
*/
bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
sq->nr_queued[rw]--;
bio = throtl_pop_queued(sq, &tg_to_put, rw);
throtl_charge_bio(tg, bio);
throtl_charge_iops_bio(tg, bio);
/*
* If our parent is another tg, we just need to transfer @bio to
@ -919,7 +1031,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
} else {
bio_set_flag(bio, BIO_BPS_THROTTLED);
throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
&parent_sq->queued[rw]);
parent_sq);
BUG_ON(tg->td->nr_queued[rw] <= 0);
tg->td->nr_queued[rw]--;
}
@ -941,7 +1053,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
/* Try to dispatch 75% READS and 25% WRITES */
while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
tg_may_dispatch(tg, bio, NULL)) {
tg_dispatch_time(tg, bio) == 0) {
tg_dispatch_one_bio(tg, READ);
nr_reads++;
@ -951,7 +1063,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
}
while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
tg_may_dispatch(tg, bio, NULL)) {
tg_dispatch_time(tg, bio) == 0) {
tg_dispatch_one_bio(tg, WRITE);
nr_writes++;
@ -984,7 +1096,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
nr_disp += throtl_dispatch_tg(tg);
sq = &tg->service_queue;
if (sq->nr_queued[READ] || sq->nr_queued[WRITE])
if (sq_queued(sq, READ) || sq_queued(sq, WRITE))
tg_update_disptime(tg);
else
throtl_dequeue_tg(tg);
@ -1037,9 +1149,11 @@ again:
dispatched = false;
while (true) {
unsigned int __maybe_unused bio_cnt_r = sq_queued(sq, READ);
unsigned int __maybe_unused bio_cnt_w = sq_queued(sq, WRITE);
throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
sq->nr_queued[READ] + sq->nr_queued[WRITE],
sq->nr_queued[READ], sq->nr_queued[WRITE]);
bio_cnt_r + bio_cnt_w, bio_cnt_r, bio_cnt_w);
ret = throtl_select_dispatch(sq);
if (ret) {
@ -1061,7 +1175,8 @@ again:
if (parent_sq) {
/* @parent_sq is another throl_grp, propagate dispatch */
if (tg->flags & THROTL_TG_WAS_EMPTY) {
if (tg->flags & THROTL_TG_WAS_EMPTY ||
tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
tg_update_disptime(tg);
if (!throtl_schedule_next_dispatch(parent_sq, false)) {
/* window is already open, repeat dispatching */
@ -1101,7 +1216,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
spin_lock_irq(&q->queue_lock);
for (rw = READ; rw <= WRITE; rw++)
while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
while ((bio = throtl_pop_queued(td_sq, NULL, rw)))
bio_list_add(&bio_list_on_stack, bio);
spin_unlock_irq(&q->queue_lock);
@ -1606,11 +1721,30 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
{
/* throtl is FIFO - if bios are already queued, should queue */
if (tg->service_queue.nr_queued[rw])
return false;
struct throtl_service_queue *sq = &tg->service_queue;
return tg_may_dispatch(tg, bio, NULL);
/*
* For a split bio, we need to specifically distinguish whether the
* iops queue is empty.
*/
if (bio_flagged(bio, BIO_BPS_THROTTLED))
return sq->nr_queued_iops[rw] == 0 &&
tg_dispatch_iops_time(tg, bio) == 0;
/*
* Throtl is FIFO - if bios are already queued, should queue.
* If the bps queue is empty and @bio is within the bps limit, charge
* bps here for direct placement into the iops queue.
*/
if (sq_queued(&tg->service_queue, rw)) {
if (sq->nr_queued_bps[rw] == 0 &&
tg_dispatch_bps_time(tg, bio) == 0)
throtl_charge_bps_bio(tg, bio);
return false;
}
return tg_dispatch_time(tg, bio) == 0;
}
bool __blk_throtl_bio(struct bio *bio)
@ -1631,7 +1765,7 @@ bool __blk_throtl_bio(struct bio *bio)
while (true) {
if (tg_within_limit(tg, bio, rw)) {
/* within limits, let's charge and dispatch directly */
throtl_charge_bio(tg, bio);
throtl_charge_iops_bio(tg, bio);
/*
* We need to trim slice even when bios are not being
@ -1654,7 +1788,8 @@ bool __blk_throtl_bio(struct bio *bio)
* control algorithm is adaptive, and extra IO bytes
* will be throttled for paying the debt
*/
throtl_charge_bio(tg, bio);
throtl_charge_bps_bio(tg, bio);
throtl_charge_iops_bio(tg, bio);
} else {
/* if above limits, break to queue */
break;
@ -1680,7 +1815,7 @@ bool __blk_throtl_bio(struct bio *bio)
tg->bytes_disp[rw], bio->bi_iter.bi_size,
tg_bps_limit(tg, rw),
tg->io_disp[rw], tg_iops_limit(tg, rw),
sq->nr_queued[READ], sq->nr_queued[WRITE]);
sq_queued(sq, READ), sq_queued(sq, WRITE));
td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
@ -1688,11 +1823,13 @@ bool __blk_throtl_bio(struct bio *bio)
/*
* Update @tg's dispatch time and force schedule dispatch if @tg
* was empty before @bio. The forced scheduling isn't likely to
* cause undue delay as @bio is likely to be dispatched directly if
* its @tg's disptime is not in the future.
* was empty before @bio, or the iops queue is empty and @bio will
* be added to it. The forced scheduling isn't likely to cause undue
* delay as @bio is likely to be dispatched directly if its @tg's
* disptime is not in the future.
*/
if (tg->flags & THROTL_TG_WAS_EMPTY) {
if (tg->flags & THROTL_TG_WAS_EMPTY ||
tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
tg_update_disptime(tg);
throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
}


@ -29,7 +29,8 @@
*/
struct throtl_qnode {
struct list_head node; /* service_queue->queued[] */
struct bio_list bios; /* queued bios */
struct bio_list bios_bps; /* queued bios for bps limit */
struct bio_list bios_iops; /* queued bios for iops limit */
struct throtl_grp *tg; /* tg this qnode belongs to */
};
@ -41,7 +42,8 @@ struct throtl_service_queue {
* children throtl_grp's.
*/
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
unsigned int nr_queued[2]; /* number of queued bios */
unsigned int nr_queued_bps[2]; /* number of queued bps bios */
unsigned int nr_queued_iops[2]; /* number of queued iops bios */
/*
* RB tree of active children throtl_grp's, which are sorted by
@ -56,7 +58,12 @@ struct throtl_service_queue {
enum tg_state_flags {
THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */
/*
* The sq's iops queue is empty, and a bio is about to be enqueued
* to the first qnode's bios_iops list.
*/
THROTL_TG_IOPS_WAS_EMPTY = 1 << 2,
THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */
};
struct throtl_grp {
@ -102,19 +109,16 @@ struct throtl_grp {
/* IOPS limits */
unsigned int iops[2];
/* Number of bytes dispatched in current slice */
int64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */
int io_disp[2];
/*
* The following two fields are updated when new configuration is
* submitted while some bios are still throttled, they record how many
* bytes/ios are waited already in previous configuration, and they will
* be used to calculate wait time under new configuration.
* Number of bytes/bio's dispatched in current slice.
* When new configuration is submitted while some bios are still throttled,
* first calculate the carryover: the amount of bytes/IOs already waited
* under the previous configuration. Then, [bytes/io]_disp are represented
* as the negative of the carryover, and they will be used to calculate the
* wait time under the new configuration.
*/
long long carryover_bytes[2];
int carryover_ios[2];
int64_t bytes_disp[2];
int io_disp[2];
unsigned long last_check_time;


@ -704,8 +704,9 @@ void wbt_enable_default(struct gendisk *disk)
struct rq_qos *rqos;
bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);
if (q->elevator &&
test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags))
mutex_lock(&disk->rqos_state_mutex);
if (blk_queue_disable_wbt(q))
enable = false;
/* Throttling already enabled? */
@ -713,8 +714,10 @@ void wbt_enable_default(struct gendisk *disk)
if (rqos) {
if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
mutex_unlock(&disk->rqos_state_mutex);
return;
}
mutex_unlock(&disk->rqos_state_mutex);
/* Queue not registered? Maybe shutting down... */
if (!blk_queue_registered(q))
@ -774,11 +777,13 @@ void wbt_disable_default(struct gendisk *disk)
struct rq_wb *rwb;
if (!rqos)
return;
mutex_lock(&disk->rqos_state_mutex);
rwb = RQWB(rqos);
if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
blk_stat_deactivate(rwb->cb);
rwb->enable_state = WBT_STATE_OFF_DEFAULT;
}
mutex_unlock(&disk->rqos_state_mutex);
}
EXPORT_SYMBOL_GPL(wbt_disable_default);


@ -103,8 +103,7 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
struct page *page, unsigned len, unsigned offset,
bool *same_page);
struct page *page, unsigned len, unsigned offset);
static inline bool biovec_phys_mergeable(struct request_queue *q,
struct bio_vec *vec1, struct bio_vec *vec2)
@ -322,11 +321,9 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
bool blk_insert_flush(struct request *rq);
int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
void elevator_disable(struct request_queue *q);
void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);
void elv_update_nr_hw_queues(struct request_queue *q);
void elevator_set_default(struct request_queue *q);
void elevator_set_none(struct request_queue *q);
ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
char *buf);
@ -407,6 +404,27 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio,
}
}
/**
* get_max_segment_size() - maximum number of bytes to add as a single segment
* @lim: Request queue limits.
* @paddr: address of the range to add
* @len: maximum length available to add at @paddr
*
* Returns the maximum number of bytes of the range starting at @paddr that can
* be added to a single segment.
*/
static inline unsigned get_max_segment_size(const struct queue_limits *lim,
phys_addr_t paddr, unsigned int len)
{
/*
* Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
* after having calculated the minimum.
*/
return min_t(unsigned long, len,
min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr),
(unsigned long)lim->max_segment_size - 1) + 1);
}
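
get_max_segment_size() clamps a physical range against both the segment boundary and max_segment_size, and the "min of the two minus-one values, then add one" ordering is what keeps a ULONG_MAX boundary mask at offset 0 from overflowing. A standalone check of the formula with made-up limits:

    /* sketch: segment clamping with the overflow-safe min-then-add-one trick */
    #include <stdio.h>

    static unsigned long max_seg(unsigned long boundary_mask,
                                 unsigned long max_segment_size,
                                 unsigned long paddr, unsigned long len)
    {
        unsigned long to_boundary = boundary_mask - (paddr & boundary_mask);
        unsigned long cap = to_boundary < max_segment_size - 1 ?
                                to_boundary : max_segment_size - 1;

        /* min() of the two "minus one" values, then +1, so a ULONG_MAX mask
         * at offset 0 never overflows */
        return len < cap + 1 ? len : cap + 1;
    }

    int main(void)
    {
        /* 4 KiB boundary (mask 0xfff), 64 KiB max segment */
        printf("%lu\n", max_seg(0xfff, 64 << 10, 0x1000, 1 << 20)); /* 4096 */
        printf("%lu\n", max_seg(0xfff, 64 << 10, 0x1100, 1 << 20)); /* 3840 */
        /* no boundary at all: only max_segment_size and len apply */
        printf("%lu\n", max_seg(~0UL, 64 << 10, 0, 4096));          /* 4096 */
        return 0;
    }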
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@ -421,7 +439,6 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi,
int blk_dev_init(void);
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
unsigned int part_in_flight(struct block_device *part);
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
@ -443,23 +460,6 @@ static inline void ioc_clear_queue(struct request_queue *q)
}
#endif /* CONFIG_BLK_ICQ */
struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
static inline bool blk_queue_may_bounce(struct request_queue *q)
{
return IS_ENABLED(CONFIG_BOUNCE) &&
(q->limits.features & BLK_FEAT_BOUNCE_HIGH) &&
max_low_pfn >= max_pfn;
}
static inline struct bio *blk_queue_bounce(struct bio *bio,
struct request_queue *q)
{
if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio)))
return __blk_queue_bounce(bio, q);
return bio;
}
#ifdef CONFIG_BLK_DEV_ZONED
void disk_init_zone_resources(struct gendisk *disk);
void disk_free_zone_resources(struct gendisk *disk);


@ -1,267 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/* bounce buffer handling for block devices
*
* - Split from highmem.c
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/gfp.h>
#include <linux/bio-integrity.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <trace/events/block.h>
#include "blk.h"
#include "blk-cgroup.h"
#define POOL_SIZE 64
#define ISA_POOL_SIZE 16
static struct bio_set bounce_bio_set, bounce_bio_split;
static mempool_t page_pool;
static void init_bounce_bioset(void)
{
static bool bounce_bs_setup;
int ret;
if (bounce_bs_setup)
return;
ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
BUG_ON(ret);
ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
BUG_ON(ret);
bounce_bs_setup = true;
}
static __init int init_emergency_pool(void)
{
int ret;
#ifndef CONFIG_MEMORY_HOTPLUG
if (max_pfn <= max_low_pfn)
return 0;
#endif
ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0);
BUG_ON(ret);
pr_info("pool size: %d pages\n", POOL_SIZE);
init_bounce_bioset();
return 0;
}
__initcall(init_emergency_pool);
/*
* Simple bounce buffer support for highmem pages. Depending on the
* queue gfp mask set, *to may or may not be a highmem page. kmap it
* always, it will do the Right Thing
*/
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
struct bio_vec tovec, fromvec;
struct bvec_iter iter;
/*
* The bio of @from is created by bounce, so we can iterate
* its bvec from start to end, but the @from->bi_iter can't be
* trusted because it might be changed by splitting.
*/
struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
bio_for_each_segment(tovec, to, iter) {
fromvec = bio_iter_iovec(from, from_iter);
if (tovec.bv_page != fromvec.bv_page) {
/*
* fromvec->bv_offset and fromvec->bv_len might have
* been modified by the block layer, so use the original
* copy, bounce_copy_vec already uses tovec->bv_len
*/
memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) +
tovec.bv_offset);
}
bio_advance_iter(from, &from_iter, tovec.bv_len);
}
}
static void bounce_end_io(struct bio *bio)
{
struct bio *bio_orig = bio->bi_private;
struct bio_vec *bvec, orig_vec;
struct bvec_iter orig_iter = bio_orig->bi_iter;
struct bvec_iter_all iter_all;
/*
* free up bounce indirect pages used
*/
bio_for_each_segment_all(bvec, bio, iter_all) {
orig_vec = bio_iter_iovec(bio_orig, orig_iter);
if (bvec->bv_page != orig_vec.bv_page) {
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
mempool_free(bvec->bv_page, &page_pool);
}
bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
}
bio_orig->bi_status = bio->bi_status;
bio_endio(bio_orig);
bio_put(bio);
}
static void bounce_end_io_write(struct bio *bio)
{
bounce_end_io(bio);
}
static void bounce_end_io_read(struct bio *bio)
{
struct bio *bio_orig = bio->bi_private;
if (!bio->bi_status)
copy_to_high_bio_irq(bio_orig, bio);
bounce_end_io(bio);
}
static struct bio *bounce_clone_bio(struct bio *bio_src)
{
struct bvec_iter iter;
struct bio_vec bv;
struct bio *bio;
/*
* Pre immutable biovecs, __bio_clone() used to just do a memcpy from
* bio_src->bi_io_vec to bio->bi_io_vec.
*
* We can't do that anymore, because:
*
* - The point of cloning the biovec is to produce a bio with a biovec
* the caller can modify: bi_idx and bi_bvec_done should be 0.
*
* - The original bio could've had more than BIO_MAX_VECS biovecs; if
* we tried to clone the whole thing bio_alloc_bioset() would fail.
* But the clone should succeed as long as the number of biovecs we
* actually need to allocate is fewer than BIO_MAX_VECS.
*
* - Lastly, bi_vcnt should not be looked at or relied upon by code
* that does not own the bio - reason being drivers don't use it for
* iterating over the biovec anymore, so expecting it to be kept up
* to date (i.e. for clones that share the parent biovec) is just
* asking for trouble and would force extra work.
*/
bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src),
bio_src->bi_opf, GFP_NOIO, &bounce_bio_set);
if (bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
break;
default:
bio_for_each_segment(bv, bio_src, iter)
bio->bi_io_vec[bio->bi_vcnt++] = bv;
break;
}
if (bio_crypt_clone(bio, bio_src, GFP_NOIO) < 0)
goto err_put;
if (bio_integrity(bio_src) &&
bio_integrity_clone(bio, bio_src, GFP_NOIO) < 0)
goto err_put;
bio_clone_blkg_association(bio, bio_src);
return bio;
err_put:
bio_put(bio);
return NULL;
}
struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q)
{
struct bio *bio;
int rw = bio_data_dir(bio_orig);
struct bio_vec *to, from;
struct bvec_iter iter;
unsigned i = 0, bytes = 0;
bool bounce = false;
int sectors;
bio_for_each_segment(from, bio_orig, iter) {
if (i++ < BIO_MAX_VECS)
bytes += from.bv_len;
if (PageHighMem(from.bv_page))
bounce = true;
}
if (!bounce)
return bio_orig;
/*
* Individual bvecs might not be logical block aligned. Round down
* the split size so that each bio is properly block size aligned,
* even if we do not use the full hardware limits.
*/
sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
SECTOR_SHIFT;
if (sectors < bio_sectors(bio_orig)) {
bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
bio_chain(bio, bio_orig);
submit_bio_noacct(bio_orig);
bio_orig = bio;
}
bio = bounce_clone_bio(bio_orig);
/*
* Bvec table can't be updated by bio_for_each_segment_all(),
* so retrieve bvec from the table directly. This way is safe
* because the 'bio' is single-page bvec.
*/
for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
struct page *bounce_page;
if (!PageHighMem(to->bv_page))
continue;
bounce_page = mempool_alloc(&page_pool, GFP_NOIO);
inc_zone_page_state(bounce_page, NR_BOUNCE);
if (rw == WRITE) {
flush_dcache_page(to->bv_page);
memcpy_from_bvec(page_address(bounce_page), to);
}
to->bv_page = bounce_page;
}
trace_block_bio_bounce(bio_orig);
bio->bi_flags |= (1 << BIO_BOUNCED);
if (rw == READ)
bio->bi_end_io = bounce_end_io_read;
else
bio->bi_end_io = bounce_end_io_write;
bio->bi_private = bio_orig;
return bio;
}


@ -45,6 +45,17 @@
#include "blk-wbt.h"
#include "blk-cgroup.h"
/* Holding context data for changing elevator */
struct elv_change_ctx {
const char *name;
bool no_uevent;
/* for unregistering old elevator */
struct elevator_queue *old;
/* for registering new elevator */
struct elevator_queue *new;
};
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@ -148,18 +159,18 @@ static void elevator_release(struct kobject *kobj)
kfree(e);
}
void elevator_exit(struct request_queue *q)
static void elevator_exit(struct request_queue *q)
{
struct elevator_queue *e = q->elevator;
lockdep_assert_held(&q->elevator_lock);
ioc_clear_queue(q);
blk_mq_sched_free_rqs(q);
mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e);
mutex_unlock(&e->sysfs_lock);
kobject_put(&e->kobj);
}
static inline void __elv_rqhash_del(struct request *rq)
@ -412,14 +423,15 @@ elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
const struct elv_fs_entry *entry = to_elv(attr);
struct elevator_queue *e;
ssize_t error;
ssize_t error = -ENODEV;
if (!entry->show)
return -EIO;
e = container_of(kobj, struct elevator_queue, kobj);
mutex_lock(&e->sysfs_lock);
error = e->type ? entry->show(e, page) : -ENOENT;
if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags))
error = entry->show(e, page);
mutex_unlock(&e->sysfs_lock);
return error;
}
@ -430,14 +442,15 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr,
{
const struct elv_fs_entry *entry = to_elv(attr);
struct elevator_queue *e;
ssize_t error;
ssize_t error = -ENODEV;
if (!entry->store)
return -EIO;
e = container_of(kobj, struct elevator_queue, kobj);
mutex_lock(&e->sysfs_lock);
error = e->type ? entry->store(e, page, length) : -ENOENT;
if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags))
error = entry->store(e, page, length);
mutex_unlock(&e->sysfs_lock);
return error;
}
@ -452,13 +465,12 @@ static const struct kobj_type elv_ktype = {
.release = elevator_release,
};
int elv_register_queue(struct request_queue *q, bool uevent)
static int elv_register_queue(struct request_queue *q,
struct elevator_queue *e,
bool uevent)
{
struct elevator_queue *e = q->elevator;
int error;
lockdep_assert_held(&q->elevator_lock);
error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
if (!error) {
const struct elv_fs_entry *attr = e->type->elevator_attrs;
@ -472,20 +484,25 @@ int elv_register_queue(struct request_queue *q, bool uevent)
if (uevent)
kobject_uevent(&e->kobj, KOBJ_ADD);
/*
* Sched is initialized, it is ready to export it via
* debugfs
*/
blk_mq_sched_reg_debugfs(q);
set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags);
}
return error;
}
void elv_unregister_queue(struct request_queue *q)
static void elv_unregister_queue(struct request_queue *q,
struct elevator_queue *e)
{
struct elevator_queue *e = q->elevator;
lockdep_assert_held(&q->elevator_lock);
if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
/* unexport via debugfs before exiting sched */
blk_mq_sched_unreg_debugfs(q);
}
}
@ -548,42 +565,107 @@ void elv_unregister(struct elevator_type *e)
EXPORT_SYMBOL_GPL(elv_unregister);
/*
* For single queue devices, default to using mq-deadline. If we have multiple
* queues or mq-deadline is not available, default to "none".
* Switch to new_e io scheduler.
*
* If switching fails, we are most likely running out of memory and not able
* to restore the old io scheduler, so leaving the io scheduler being none.
*/
static struct elevator_type *elevator_get_default(struct request_queue *q)
static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
{
if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
return NULL;
struct elevator_type *new_e = NULL;
int ret = 0;
if (q->nr_hw_queues != 1 &&
!blk_mq_is_shared_tags(q->tag_set->flags))
return NULL;
WARN_ON_ONCE(q->mq_freeze_depth == 0);
lockdep_assert_held(&q->elevator_lock);
return elevator_find_get("mq-deadline");
if (strncmp(ctx->name, "none", 4)) {
new_e = elevator_find_get(ctx->name);
if (!new_e)
return -EINVAL;
}
blk_mq_quiesce_queue(q);
if (q->elevator) {
ctx->old = q->elevator;
elevator_exit(q);
}
if (new_e) {
ret = blk_mq_init_sched(q, new_e);
if (ret)
goto out_unfreeze;
ctx->new = q->elevator;
} else {
blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
q->elevator = NULL;
q->nr_requests = q->tag_set->queue_depth;
}
blk_add_trace_msg(q, "elv switch: %s", ctx->name);
out_unfreeze:
blk_mq_unquiesce_queue(q);
if (ret) {
pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
new_e->elevator_name);
}
if (new_e)
elevator_put(new_e);
return ret;
}
static void elv_exit_and_release(struct request_queue *q)
{
struct elevator_queue *e;
unsigned memflags;
memflags = blk_mq_freeze_queue(q);
mutex_lock(&q->elevator_lock);
e = q->elevator;
elevator_exit(q);
mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
if (e)
kobject_put(&e->kobj);
}
static int elevator_change_done(struct request_queue *q,
struct elv_change_ctx *ctx)
{
int ret = 0;
if (ctx->old) {
bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT,
&ctx->old->flags);
elv_unregister_queue(q, ctx->old);
kobject_put(&ctx->old->kobj);
if (enable_wbt)
wbt_enable_default(q->disk);
}
if (ctx->new) {
ret = elv_register_queue(q, ctx->new, !ctx->no_uevent);
if (ret)
elv_exit_and_release(q);
}
return ret;
}
/*
* Use the default elevator settings. If the chosen elevator initialization
* fails, fall back to the "none" elevator (no elevator).
* Switch this queue to the given IO scheduler.
*/
void elevator_init_mq(struct request_queue *q)
static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
{
struct elevator_type *e;
unsigned int memflags;
int err;
int ret = 0;
WARN_ON_ONCE(blk_queue_registered(q));
if (unlikely(q->elevator))
return;
e = elevator_get_default(q);
if (!e)
return;
lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);
memflags = blk_mq_freeze_queue(q);
/*
* We are called before adding disk, when there isn't any FS I/O,
* May be called before adding disk, when there isn't any FS I/O,
* so freezing queue plus canceling dispatch work is enough to
* drain any dispatch activities originated from passthrough
* requests, then no need to quiesce queue which may add long boot
@ -591,116 +673,86 @@ void elevator_init_mq(struct request_queue *q)
*
* Disk isn't added yet, so verifying queue lock only manually.
*/
memflags = blk_mq_freeze_queue(q);
blk_mq_cancel_work_sync(q);
err = blk_mq_init_sched(q, e);
mutex_lock(&q->elevator_lock);
if (!(q->elevator && elevator_match(q->elevator->type, ctx->name)))
ret = elevator_switch(q, ctx);
mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
if (err) {
pr_warn("\"%s\" elevator initialization failed, "
"falling back to \"none\"\n", e->elevator_name);
}
elevator_put(e);
}
/*
* Switch to new_e io scheduler.
*
* If switching fails, we are most likely running out of memory and not able
* to restore the old io scheduler, so leaving the io scheduler being none.
*/
int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
unsigned int memflags;
int ret;
lockdep_assert_held(&q->elevator_lock);
memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
if (q->elevator) {
elv_unregister_queue(q);
elevator_exit(q);
}
ret = blk_mq_init_sched(q, new_e);
if (ret)
goto out_unfreeze;
ret = elv_register_queue(q, true);
if (ret) {
elevator_exit(q);
goto out_unfreeze;
}
blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
out_unfreeze:
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q, memflags);
if (ret) {
pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
new_e->elevator_name);
}
if (!ret)
ret = elevator_change_done(q, ctx);
return ret;
}
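
The elevator change is now split into two phases: elevator_switch() runs with the queue frozen and elevator_lock held and merely records the outgoing and incoming elevator queues in the ctx, while elevator_change_done() performs the sysfs (un)registration afterwards, outside that lock. A userspace sketch of the same two-phase shape, using a pthread mutex and printf in place of the real locking and kobject work:

    /* sketch: two-phase switch - swap under the lock, do the sysfs work after */
    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    struct sched { const char *name; };

    struct change_ctx {              /* mirrors elv_change_ctx */
        const char *name;
        struct sched *old;           /* unregistered after the lock is dropped */
        struct sched *new;           /* registered after the lock is dropped   */
    };

    static struct sched available[] = { { "mq-deadline" }, { "kyber" } };
    static pthread_mutex_t elevator_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct sched *current_sched;

    static struct sched *find_sched(const char *name)
    {
        for (size_t i = 0; i < sizeof(available) / sizeof(available[0]); i++)
            if (!strcmp(available[i].name, name))
                return &available[i];
        return NULL;                 /* "none" or unknown */
    }

    /* phase 1: swap under elevator_lock; no blocking sysfs work in here */
    static void do_switch(struct change_ctx *ctx)
    {
        pthread_mutex_lock(&elevator_lock);
        ctx->old = current_sched;
        ctx->new = find_sched(ctx->name);
        current_sched = ctx->new;
        pthread_mutex_unlock(&elevator_lock);
    }

    /* phase 2: registration/unregistration happens outside the lock */
    static void change_done(const struct change_ctx *ctx)
    {
        if (ctx->old)
            printf("unregister sysfs dir of %s\n", ctx->old->name);
        if (ctx->new)
            printf("register sysfs dir of %s\n", ctx->new->name);
    }

    int main(void)
    {
        struct change_ctx to_deadline = { .name = "mq-deadline" };
        struct change_ctx to_none = { .name = "none" };

        do_switch(&to_deadline);
        change_done(&to_deadline);
        do_switch(&to_none);
        change_done(&to_none);
        return 0;
    }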
void elevator_disable(struct request_queue *q)
/*
* The I/O scheduler depends on the number of hardware queues, this forces a
* reattachment when nr_hw_queues changes.
*/
void elv_update_nr_hw_queues(struct request_queue *q)
{
unsigned int memflags;
struct elv_change_ctx ctx = {};
int ret = -ENODEV;
lockdep_assert_held(&q->elevator_lock);
WARN_ON_ONCE(q->mq_freeze_depth == 0);
memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
mutex_lock(&q->elevator_lock);
if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) {
ctx.name = q->elevator->type->elevator_name;
elv_unregister_queue(q);
elevator_exit(q);
blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
q->elevator = NULL;
q->nr_requests = q->tag_set->queue_depth;
blk_add_trace_msg(q, "elv switch: none");
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q, memflags);
/* force to reattach elevator after nr_hw_queue is updated */
ret = elevator_switch(q, &ctx);
}
mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue_nomemrestore(q);
if (!ret)
WARN_ON_ONCE(elevator_change_done(q, &ctx));
}
/*
* Switch this queue to the given IO scheduler.
* Use the default elevator settings. If the chosen elevator initialization
* fails, fall back to the "none" elevator (no elevator).
*/
static int elevator_change(struct request_queue *q, const char *elevator_name)
void elevator_set_default(struct request_queue *q)
{
struct elevator_type *e;
int ret;
struct elv_change_ctx ctx = {
.name = "mq-deadline",
.no_uevent = true,
};
int err = 0;
/* Make sure queue is not in the middle of being removed */
if (!blk_queue_registered(q))
return -ENOENT;
/* now we allow to switch elevator */
blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q);
if (!strncmp(elevator_name, "none", 4)) {
if (q->elevator)
elevator_disable(q);
return 0;
if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
return;
/*
* For single queue devices, default to using mq-deadline. If we
* have multiple queues or mq-deadline is not available, default
* to "none".
*/
if (elevator_find_get(ctx.name) && (q->nr_hw_queues == 1 ||
blk_mq_is_shared_tags(q->tag_set->flags)))
err = elevator_change(q, &ctx);
if (err < 0)
pr_warn("\"%s\" elevator initialization, failed %d, "
"falling back to \"none\"\n", ctx.name, err);
}
if (q->elevator && elevator_match(q->elevator->type, elevator_name))
return 0;
void elevator_set_none(struct request_queue *q)
{
struct elv_change_ctx ctx = {
.name = "none",
};
int err;
e = elevator_find_get(elevator_name);
if (!e)
return -EINVAL;
ret = elevator_switch(q, e);
elevator_put(e);
return ret;
err = elevator_change(q, &ctx);
if (err < 0)
pr_warn("%s: set none elevator failed %d\n", __func__, err);
}
static void elv_iosched_load_module(char *elevator_name)
static void elv_iosched_load_module(const char *elevator_name)
{
struct elevator_type *found;
@ -716,10 +768,14 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
size_t count)
{
char elevator_name[ELV_NAME_MAX];
char *name;
struct elv_change_ctx ctx = {};
int ret;
unsigned int memflags;
struct request_queue *q = disk->queue;
struct blk_mq_tag_set *set = q->tag_set;
/* Make sure queue is not in the middle of being removed */
if (!blk_queue_registered(q))
return -ENOENT;
/*
* If the attribute needs to load a module, do it before freezing the
@ -727,24 +783,25 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
* queue is the one for the device storing the module file.
*/
strscpy(elevator_name, buf, sizeof(elevator_name));
name = strstrip(elevator_name);
ctx.name = strstrip(elevator_name);
elv_iosched_load_module(name);
elv_iosched_load_module(ctx.name);
memflags = blk_mq_freeze_queue(q);
mutex_lock(&q->elevator_lock);
ret = elevator_change(q, name);
down_read(&set->update_nr_hwq_lock);
if (!blk_queue_no_elv_switch(q)) {
ret = elevator_change(q, &ctx);
if (!ret)
ret = count;
mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
} else {
ret = -ENOENT;
}
up_read(&set->update_nr_hwq_lock);
return ret;
}
ssize_t elv_iosched_show(struct gendisk *disk, char *name)
{
struct request_queue *q = disk->queue;
struct elevator_queue *eq = q->elevator;
struct elevator_type *cur = NULL, *e;
int len = 0;
@ -753,7 +810,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
len += sprintf(name+len, "[none] ");
} else {
len += sprintf(name+len, "none ");
cur = eq->type;
cur = q->elevator->type;
}
spin_lock(&elv_list_lock);


@ -121,7 +121,8 @@ struct elevator_queue
};
#define ELEVATOR_FLAG_REGISTERED 0
#define ELEVATOR_FLAG_DISABLE_WBT 1
#define ELEVATOR_FLAG_DYING 1
#define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2
/*
* block elevator interface
@ -182,4 +183,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
#define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist)
void blk_mq_sched_reg_debugfs(struct request_queue *q);
void blk_mq_sched_unreg_debugfs(struct request_queue *q);
#endif /* _ELEVATOR_H */


@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
}
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
bio.bi_write_stream = iocb->ki_write_stream;
bio.bi_ioprio = iocb->ki_ioprio;
if (iocb->ki_flags & IOCB_ATOMIC)
bio.bi_opf |= REQ_ATOMIC;
@ -206,6 +207,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
for (;;) {
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
@ -333,6 +335,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
dio->iocb = iocb;
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
@ -398,6 +401,26 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (blkdev_dio_invalid(bdev, iocb, iter))
return -EINVAL;
if (iov_iter_rw(iter) == WRITE) {
u16 max_write_streams = bdev_max_write_streams(bdev);
if (iocb->ki_write_stream) {
if (iocb->ki_write_stream > max_write_streams)
return -EINVAL;
} else if (max_write_streams) {
enum rw_hint write_hint =
file_inode(iocb->ki_filp)->i_write_hint;
/*
* Just use the write hint as write stream for block
* device writes. This assumes no file system is
* mounted that would use the streams differently.
*/
if (write_hint <= max_write_streams)
iocb->ki_write_stream = write_hint;
}
}
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
@ -451,12 +474,13 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock,
static int blkdev_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct folio *folio = NULL;
struct blk_plug plug;
int err;
blk_start_plug(&plug);
err = write_cache_pages(mapping, wbc, block_write_full_folio,
blkdev_get_block);
while ((folio = writeback_iter(mapping, wbc, folio, &err)))
err = block_write_full_folio(folio, wbc, blkdev_get_block);
blk_finish_plug(&plug);
return err;
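The new WRITE branch in blkdev_direct_IO() earlier in this file either validates an explicit ki_write_stream against bdev_max_write_streams() or, when no stream was requested, reuses a small enough inode write hint as the stream index. A standalone sketch of that selection logic, with hypothetical names and no kernel headers:

#include <stdint.h>
#include <stdio.h>

/* pick_write_stream(): mirror the hint-to-stream mapping above. An explicit
 * stream wins but must be in range; otherwise a write hint that fits is
 * reused as the stream index, and 0 means "no stream". */
static int pick_write_stream(uint16_t requested, uint16_t write_hint,
                             uint16_t max_streams, uint16_t *stream)
{
    if (requested) {
        if (requested > max_streams)
            return -1;              /* -EINVAL in the kernel */
        *stream = requested;
        return 0;
    }
    if (max_streams && write_hint <= max_streams)
        *stream = write_hint;
    else
        *stream = 0;
    return 0;
}

int main(void)
{
    uint16_t s;

    pick_write_stream(0, 3, 8, &s);
    printf("hint 3 with 8 streams -> stream %u\n", s);
    pick_write_stream(0, 12, 8, &s);
    printf("hint 12 with 8 streams -> stream %u\n", s);
    return 0;
}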


@ -125,38 +125,47 @@ static void part_stat_read_all(struct block_device *part,
}
}
unsigned int part_in_flight(struct block_device *part)
{
unsigned int inflight = 0;
int cpu;
for_each_possible_cpu(cpu) {
inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
part_stat_local_read_cpu(part, in_flight[1], cpu);
}
if ((int)inflight < 0)
inflight = 0;
return inflight;
}
static void part_in_flight_rw(struct block_device *part,
unsigned int inflight[2])
static void bdev_count_inflight_rw(struct block_device *part,
unsigned int inflight[2], bool mq_driver)
{
int cpu;
inflight[0] = 0;
inflight[1] = 0;
if (mq_driver) {
blk_mq_in_driver_rw(part, inflight);
} else {
for_each_possible_cpu(cpu) {
inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
inflight[READ] += part_stat_local_read_cpu(
part, in_flight[READ], cpu);
inflight[WRITE] += part_stat_local_read_cpu(
part, in_flight[WRITE], cpu);
}
if ((int)inflight[0] < 0)
inflight[0] = 0;
if ((int)inflight[1] < 0)
inflight[1] = 0;
}
if (WARN_ON_ONCE((int)inflight[READ] < 0))
inflight[READ] = 0;
if (WARN_ON_ONCE((int)inflight[WRITE] < 0))
inflight[WRITE] = 0;
}
/**
* bdev_count_inflight - get the number of inflight IOs for a block device.
*
* @part: the block device.
*
* Inflight here means started IO accounting, from bdev_start_io_acct() for
* bio-based block device, and from blk_account_io_start() for rq-based block
* device.
*/
unsigned int bdev_count_inflight(struct block_device *part)
{
unsigned int inflight[2] = {0};
bdev_count_inflight_rw(part, inflight, false);
return inflight[READ] + inflight[WRITE];
}
EXPORT_SYMBOL_GPL(bdev_count_inflight);
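bdev_count_inflight_rw() sums per-CPU in_flight counters and clamps a transiently negative total to zero, since starts and completions of the same IO may be accounted on different CPUs. A toy, single-threaded illustration of that clamped summation, with plain arrays standing in for per-CPU counters:

#include <stdio.h>

#define NR_CPUS 4

static unsigned int count_inflight(const int reads[NR_CPUS],
                                   const int writes[NR_CPUS])
{
    int r = 0, w = 0;

    for (int cpu = 0; cpu < NR_CPUS; cpu++) {
        r += reads[cpu];
        w += writes[cpu];
    }
    /* Starts and completions land on different CPUs, so a transient
     * negative sum is possible; clamp it as the kernel does. */
    if (r < 0)
        r = 0;
    if (w < 0)
        w = 0;
    return (unsigned int)(r + w);
}

int main(void)
{
    int reads[NR_CPUS]  = { 3, -1, 0, 0 };  /* started on CPU0, completed on CPU1 */
    int writes[NR_CPUS] = { 0,  2, -2, 1 };

    printf("inflight: %u\n", count_inflight(reads, writes));
    return 0;
}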
/*
* Can be deleted altogether. Later.
*
@ -389,17 +398,33 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
return ret;
}
/**
* add_disk_fwnode - add disk information to kernel list with fwnode
* @parent: parent device for the disk
* @disk: per-device partitioning information
* @groups: Additional per-device sysfs groups
* @fwnode: attached disk fwnode
*
* This function registers the partitioning information in @disk
* with the kernel. Also attach a fwnode to the disk device.
static void add_disk_final(struct gendisk *disk)
{
struct device *ddev = disk_to_dev(disk);
if (!(disk->flags & GENHD_FL_HIDDEN)) {
/* Make sure the first partition scan will proceed */
if (get_capacity(disk) && disk_has_partscan(disk))
set_bit(GD_NEED_PART_SCAN, &disk->state);
bdev_add(disk->part0, ddev->devt);
if (get_capacity(disk))
disk_scan_partitions(disk, BLK_OPEN_READ);
/*
* Announce the disk and partitions after all partitions are
* created. (for hidden disks uevents remain suppressed forever)
*/
int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
dev_set_uevent_suppress(ddev, 0);
disk_uevent(disk, KOBJ_ADD);
}
blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
disk_add_events(disk);
set_bit(GD_ADDED, &disk->state);
}
static int __add_disk(struct device *parent, struct gendisk *disk,
const struct attribute_group **groups,
struct fwnode_handle *fwnode)
@ -416,12 +441,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
*/
if (disk->fops->submit_bio || disk->fops->poll_bio)
return -EINVAL;
/*
* Initialize the I/O scheduler code and pick a default one if
* needed.
*/
elevator_init_mq(disk->queue);
} else {
if (!disk->fops->submit_bio)
return -EINVAL;
@ -438,7 +457,7 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
ret = -EINVAL;
if (disk->major) {
if (WARN_ON(!disk->minors))
goto out_exit_elevator;
goto out;
if (disk->minors > DISK_MAX_PARTS) {
pr_err("block: can't allocate more than %d partitions\n",
@ -448,14 +467,14 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
if (disk->first_minor > MINORMASK ||
disk->minors > MINORMASK + 1 ||
disk->first_minor + disk->minors > MINORMASK + 1)
goto out_exit_elevator;
goto out;
} else {
if (WARN_ON(disk->minors))
goto out_exit_elevator;
goto out;
ret = blk_alloc_ext_minor();
if (ret < 0)
goto out_exit_elevator;
goto out;
disk->major = BLOCK_EXT_MAJOR;
disk->first_minor = ret;
}
@ -516,21 +535,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
&disk->bdi->dev->kobj, "bdi");
if (ret)
goto out_unregister_bdi;
/* Make sure the first partition scan will proceed */
if (get_capacity(disk) && disk_has_partscan(disk))
set_bit(GD_NEED_PART_SCAN, &disk->state);
bdev_add(disk->part0, ddev->devt);
if (get_capacity(disk))
disk_scan_partitions(disk, BLK_OPEN_READ);
/*
* Announce the disk and partitions after all partitions are
* created. (for hidden disks uevents remain suppressed forever)
*/
dev_set_uevent_suppress(ddev, 0);
disk_uevent(disk, KOBJ_ADD);
} else {
/*
* Even if the block_device for a hidden gendisk is not
@ -539,10 +543,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
*/
disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor);
}
blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
disk_add_events(disk);
set_bit(GD_ADDED, &disk->state);
return 0;
out_unregister_bdi:
@ -564,12 +564,46 @@ out_device_del:
out_free_ext_minor:
if (disk->major == BLOCK_EXT_MAJOR)
blk_free_ext_minor(disk->first_minor);
out_exit_elevator:
if (disk->queue->elevator) {
mutex_lock(&disk->queue->elevator_lock);
elevator_exit(disk->queue);
mutex_unlock(&disk->queue->elevator_lock);
out:
return ret;
}
/**
* add_disk_fwnode - add disk information to kernel list with fwnode
* @parent: parent device for the disk
* @disk: per-device partitioning information
* @groups: Additional per-device sysfs groups
* @fwnode: attached disk fwnode
*
* This function registers the partitioning information in @disk
* with the kernel. Also attach a fwnode to the disk device.
*/
int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
const struct attribute_group **groups,
struct fwnode_handle *fwnode)
{
struct blk_mq_tag_set *set;
unsigned int memflags;
int ret;
if (queue_is_mq(disk->queue)) {
set = disk->queue->tag_set;
memflags = memalloc_noio_save();
down_read(&set->update_nr_hwq_lock);
ret = __add_disk(parent, disk, groups, fwnode);
up_read(&set->update_nr_hwq_lock);
memalloc_noio_restore(memflags);
} else {
ret = __add_disk(parent, disk, groups, fwnode);
}
/*
* add_disk_final() doesn't need to read `nr_hw_queues`, so move it out
* of the read lock `set->update_nr_hwq_lock` to avoid an unnecessary
* lock dependency on `disk->open_mutex` from partition scanning.
*/
if (!ret)
add_disk_final(disk);
return ret;
}
EXPORT_SYMBOL_GPL(add_disk_fwnode);
@ -652,26 +686,7 @@ void blk_mark_disk_dead(struct gendisk *disk)
}
EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
/**
* del_gendisk - remove the gendisk
* @disk: the struct gendisk to remove
*
* Removes the gendisk and all its associated resources. This deletes the
* partitions associated with the gendisk, and unregisters the associated
* request_queue.
*
* This is the counter to the respective __device_add_disk() call.
*
* The final removal of the struct gendisk happens when its refcount reaches 0
* with put_disk(), which should be called after del_gendisk(), if
* __device_add_disk() was used.
*
* Drivers exist which depend on the release of the gendisk to be synchronous,
* it should not be deferred.
*
* Context: can sleep
*/
void del_gendisk(struct gendisk *disk)
static void __del_gendisk(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct block_device *part;
@ -743,14 +758,7 @@ void del_gendisk(struct gendisk *disk)
if (queue_is_mq(q))
blk_mq_cancel_work_sync(q);
blk_mq_quiesce_queue(q);
if (q->elevator) {
mutex_lock(&q->elevator_lock);
elevator_exit(q);
mutex_unlock(&q->elevator_lock);
}
rq_qos_exit(q);
blk_mq_unquiesce_queue(q);
/*
* If the disk does not own the queue, allow using passthrough requests
@ -764,6 +772,55 @@ void del_gendisk(struct gendisk *disk)
if (start_drain)
blk_unfreeze_release_lock(q);
}
static void disable_elv_switch(struct request_queue *q)
{
struct blk_mq_tag_set *set = q->tag_set;
WARN_ON_ONCE(!queue_is_mq(q));
down_write(&set->update_nr_hwq_lock);
blk_queue_flag_set(QUEUE_FLAG_NO_ELV_SWITCH, q);
up_write(&set->update_nr_hwq_lock);
}
/**
* del_gendisk - remove the gendisk
* @disk: the struct gendisk to remove
*
* Removes the gendisk and all its associated resources. This deletes the
* partitions associated with the gendisk, and unregisters the associated
* request_queue.
*
* This is the counter to the respective __device_add_disk() call.
*
* The final removal of the struct gendisk happens when its refcount reaches 0
* with put_disk(), which should be called after del_gendisk(), if
* __device_add_disk() was used.
*
* Drivers exist which depend on the release of the gendisk to be synchronous,
* it should not be deferred.
*
* Context: can sleep
*/
void del_gendisk(struct gendisk *disk)
{
struct blk_mq_tag_set *set;
unsigned int memflags;
if (!queue_is_mq(disk->queue)) {
__del_gendisk(disk);
} else {
set = disk->queue->tag_set;
disable_elv_switch(disk->queue);
memflags = memalloc_noio_save();
down_read(&set->update_nr_hwq_lock);
__del_gendisk(disk);
up_read(&set->update_nr_hwq_lock);
memalloc_noio_restore(memflags);
}
}
EXPORT_SYMBOL(del_gendisk);
/**
@ -1005,7 +1062,7 @@ ssize_t part_stat_show(struct device *dev,
struct disk_stats stat;
unsigned int inflight;
inflight = part_in_flight(bdev);
inflight = bdev_count_inflight(bdev);
if (inflight) {
part_stat_lock();
update_io_ticks(bdev, jiffies, true);
@ -1042,19 +1099,21 @@ ssize_t part_stat_show(struct device *dev,
(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
}
/*
* Show the number of IOs issued to the driver.
* For bio-based devices, counting starts from bdev_start_io_acct();
* for rq-based devices, counting starts from blk_mq_start_request().
*/
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev_get_queue(bdev);
unsigned int inflight[2];
unsigned int inflight[2] = {0};
if (queue_is_mq(q))
blk_mq_in_flight_rw(q, bdev, inflight);
else
part_in_flight_rw(bdev, inflight);
bdev_count_inflight_rw(bdev, inflight, queue_is_mq(q));
return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]);
return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]);
}
static ssize_t disk_capability_show(struct device *dev,
@ -1307,7 +1366,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
continue;
inflight = part_in_flight(hd);
inflight = bdev_count_inflight(hd);
if (inflight) {
part_stat_lock();
update_io_ticks(hd, jiffies, true);
@ -1422,6 +1481,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
mutex_init(&disk->rqos_state_mutex);
return disk;
out_erase_part0:


@ -715,7 +715,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
}
/*
* Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list().
* Called from blk_mq_insert_request() or blk_mq_dispatch_list().
*/
static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
struct list_head *list,


@ -468,7 +468,7 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(node_page_state(pgdat, NR_PAGETABLE)),
nid, K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
nid, 0UL,
nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
nid, 0UL,
nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
nid, K(sreclaimable +
node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)),


@ -407,4 +407,23 @@ config BLKDEV_UBLK_LEGACY_OPCODES
source "drivers/block/rnbd/Kconfig"
config BLK_DEV_ZONED_LOOP
tristate "Zoned loopback device support"
depends on BLK_DEV_ZONED
help
Saying Y here will allow you to create a zoned block device using
regular files for zones (one file per zone). This is useful to test
file systems, device mapper and applications that support zoned block
devices. To create a zoned loop device, no user utility is needed, a
zoned loop device can be created (or re-started) using a command
like:
echo "add id=0,zone_size_mb=256,capacity_mb=16384,conv_zones=11" > \
/dev/zloop-control
See Documentation/admin-guide/blockdev/zoned_loop.rst for usage
details.
If unsure, say N.
endif # BLK_DEV


@ -41,5 +41,6 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
obj-$(CONFIG_BLK_DEV_ZONED_LOOP) += zloop.o
swim_mod-y := swim.o swim_asm.o


@ -54,32 +54,33 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
/*
* Insert a new page for a given sector, if one does not already exist.
*/
static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
blk_opf_t opf)
__releases(rcu)
__acquires(rcu)
{
pgoff_t idx = sector >> PAGE_SECTORS_SHIFT;
struct page *page;
int ret = 0;
page = brd_lookup_page(brd, sector);
if (page)
return 0;
gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
struct page *page, *ret;
rcu_read_unlock();
page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
rcu_read_lock();
if (!page)
return -ENOMEM;
return ERR_PTR(-ENOMEM);
xa_lock(&brd->brd_pages);
ret = __xa_insert(&brd->brd_pages, idx, page, gfp);
if (!ret)
ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
page, gfp);
if (ret) {
xa_unlock(&brd->brd_pages);
__free_page(page);
if (xa_is_err(ret))
return ERR_PTR(xa_err(ret));
return ret;
}
brd->brd_nr_pages++;
xa_unlock(&brd->brd_pages);
if (ret < 0) {
__free_page(page);
if (ret == -EBUSY)
ret = 0;
}
return ret;
return page;
}
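brd_insert_page() now drops the RCU read lock, allocates the page, and then publishes it with __xa_cmpxchg() so that a concurrent inserter wins cleanly and the loser frees its fresh page. A simplified userspace sketch of that "allocate outside the lock, install only if still empty" pattern, using a mutex-protected array in place of the XArray; purely illustrative:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_SLOTS 16

static pthread_mutex_t pages_lock = PTHREAD_MUTEX_INITIALIZER;
static void *pages[NR_SLOTS];

/* Return the backing "page" for idx, creating it if needed. Allocate
 * without the lock held, install only if the slot is still empty,
 * otherwise free the fresh allocation and use whatever raced in. */
static void *insert_page(unsigned int idx)
{
    void *fresh, *cur;

    fresh = calloc(1, 4096);
    if (!fresh)
        return NULL;            /* -ENOMEM in the kernel */

    pthread_mutex_lock(&pages_lock);
    cur = pages[idx];
    if (!cur) {
        pages[idx] = fresh;     /* like a successful __xa_cmpxchg(NULL, page) */
        cur = fresh;
        fresh = NULL;
    }
    pthread_mutex_unlock(&pages_lock);

    free(fresh);                /* lost the race: no-op when fresh is NULL */
    return cur;
}

int main(void)
{
    void *a = insert_page(3);
    void *b = insert_page(3);

    printf("same backing page: %s\n", a == b ? "yes" : "no");
    return 0;
}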
/*
@ -100,143 +101,77 @@ static void brd_free_pages(struct brd_device *brd)
}
/*
* copy_to_brd_setup must be called before copy_to_brd. It may sleep.
* Process a single segment. The segment is capped to not cross page boundaries
* in both the bio and the brd backing memory.
*/
static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n,
gfp_t gfp)
{
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
size_t copy;
int ret;
copy = min_t(size_t, n, PAGE_SIZE - offset);
ret = brd_insert_page(brd, sector, gfp);
if (ret)
return ret;
if (copy < n) {
sector += copy >> SECTOR_SHIFT;
ret = brd_insert_page(brd, sector, gfp);
}
return ret;
}
/*
* Copy n bytes from src to the brd starting at sector. Does not sleep.
*/
static void copy_to_brd(struct brd_device *brd, const void *src,
sector_t sector, size_t n)
static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
{
struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
sector_t sector = bio->bi_iter.bi_sector;
u32 offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
blk_opf_t opf = bio->bi_opf;
struct page *page;
void *dst;
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
size_t copy;
void *kaddr;
copy = min_t(size_t, n, PAGE_SIZE - offset);
bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
rcu_read_lock();
page = brd_lookup_page(brd, sector);
BUG_ON(!page);
dst = kmap_atomic(page);
memcpy(dst + offset, src, copy);
kunmap_atomic(dst);
if (copy < n) {
src += copy;
sector += copy >> SECTOR_SHIFT;
copy = n - copy;
page = brd_lookup_page(brd, sector);
BUG_ON(!page);
dst = kmap_atomic(page);
memcpy(dst, src, copy);
kunmap_atomic(dst);
}
if (!page && op_is_write(opf)) {
page = brd_insert_page(brd, sector, opf);
if (IS_ERR(page))
goto out_error;
}
/*
* Copy n bytes to dst from the brd starting at sector. Does not sleep.
*/
static void copy_from_brd(void *dst, struct brd_device *brd,
sector_t sector, size_t n)
{
struct page *page;
void *src;
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
size_t copy;
copy = min_t(size_t, n, PAGE_SIZE - offset);
page = brd_lookup_page(brd, sector);
if (page) {
src = kmap_atomic(page);
memcpy(dst, src + offset, copy);
kunmap_atomic(src);
} else
memset(dst, 0, copy);
if (copy < n) {
dst += copy;
sector += copy >> SECTOR_SHIFT;
copy = n - copy;
page = brd_lookup_page(brd, sector);
if (page) {
src = kmap_atomic(page);
memcpy(dst, src, copy);
kunmap_atomic(src);
} else
memset(dst, 0, copy);
}
}
/*
* Process a single bvec of a bio.
*/
static int brd_do_bvec(struct brd_device *brd, struct page *page,
unsigned int len, unsigned int off, blk_opf_t opf,
sector_t sector)
{
void *mem;
int err = 0;
kaddr = bvec_kmap_local(&bv);
if (op_is_write(opf)) {
/*
* Must use NOIO because we don't want to recurse back into the
* block or filesystem layers from page reclaim.
*/
gfp_t gfp = opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO;
err = copy_to_brd_setup(brd, sector, len, gfp);
if (err)
goto out;
}
mem = kmap_atomic(page);
if (!op_is_write(opf)) {
copy_from_brd(mem + off, brd, sector, len);
flush_dcache_page(page);
memcpy_to_page(page, offset, kaddr, bv.bv_len);
} else {
flush_dcache_page(page);
copy_to_brd(brd, mem + off, sector, len);
if (page)
memcpy_from_page(kaddr, page, offset, bv.bv_len);
else
memset(kaddr, 0, bv.bv_len);
}
kunmap_atomic(mem);
kunmap_local(kaddr);
rcu_read_unlock();
out:
return err;
bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
return true;
out_error:
rcu_read_unlock();
if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
bio_wouldblock_error(bio);
else
bio_io_error(bio);
return false;
}
static void brd_free_one_page(struct rcu_head *head)
{
struct page *page = container_of(head, struct page, rcu_head);
__free_page(page);
}
static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
{
sector_t aligned_sector = (sector + PAGE_SECTORS) & ~PAGE_SECTORS;
sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
sector_t aligned_end = round_down(
sector + (size >> SECTOR_SHIFT), PAGE_SECTORS);
struct page *page;
size -= (aligned_sector - sector) * SECTOR_SIZE;
if (aligned_end <= aligned_sector)
return;
xa_lock(&brd->brd_pages);
while (size >= PAGE_SIZE && aligned_sector < rd_size * 2) {
while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
if (page) {
__free_page(page);
call_rcu(&page->rcu_head, brd_free_one_page);
brd->brd_nr_pages--;
}
aligned_sector += PAGE_SECTORS;
size -= PAGE_SIZE;
}
xa_unlock(&brd->brd_pages);
}
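brd_do_discard() now rounds the start sector up and the end sector down to page boundaries so that only fully covered pages are dropped, and defers the frees via RCU. The trimming arithmetic can be checked with a small standalone program; PAGE_SECTORS is assumed to be 8 (4 KiB pages, 512-byte sectors):

#include <stdio.h>

#define SECTOR_SHIFT 9
#define PAGE_SECTORS 8ULL

/* Print which whole pages a discard of `size` bytes at `sector` would free,
 * using the same round_up/round_down trimming as brd_do_discard(). */
static void discard(unsigned long long sector, unsigned int size)
{
    unsigned long long start = (sector + PAGE_SECTORS - 1) & ~(PAGE_SECTORS - 1);
    unsigned long long end = (sector + (size >> SECTOR_SHIFT)) & ~(PAGE_SECTORS - 1);

    if (end <= start) {
        printf("discard [%llu, +%u bytes): no whole page covered\n", sector, size);
        return;
    }
    for (unsigned long long s = start; s < end; s += PAGE_SECTORS)
        printf("free page at sectors [%llu, %llu)\n", s, s + PAGE_SECTORS);
}

int main(void)
{
    discard(5, 3 * 4096);   /* unaligned head: the first partial page survives */
    discard(8, 2048);       /* less than a page: nothing is freed */
    return 0;
}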
@ -244,36 +179,18 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
static void brd_submit_bio(struct bio *bio)
{
struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
sector_t sector = bio->bi_iter.bi_sector;
struct bio_vec bvec;
struct bvec_iter iter;
if (unlikely(op_is_discard(bio->bi_opf))) {
brd_do_discard(brd, sector, bio->bi_iter.bi_size);
brd_do_discard(brd, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size);
bio_endio(bio);
return;
}
bio_for_each_segment(bvec, bio, iter) {
unsigned int len = bvec.bv_len;
int err;
/* Don't support un-aligned buffer */
WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
(len & (SECTOR_SIZE - 1)));
err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
bio->bi_opf, sector);
if (err) {
if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio);
do {
if (!brd_rw_bvec(brd, bio))
return;
}
bio_io_error(bio);
return;
}
sector += len >> SECTOR_SHIFT;
}
} while (bio->bi_iter.bi_size);
bio_endio(bio);
}


@ -725,7 +725,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
scmd = blk_mq_rq_to_pdu(rq);
if (cgc->buflen) {
ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
ret = blk_rq_map_kern(rq, cgc->buffer, cgc->buflen,
GFP_NOIO);
if (ret)
goto out;


@ -147,12 +147,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1,
rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
if (bio_add_page(bio, virt_to_page(data), datalen,
offset_in_page(data)) != datalen) {
rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n");
err = -EINVAL;
goto bio_put;
}
bio_add_virt_nofail(bio, data, datalen);
bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
if (bio_has_data(bio) &&


@ -50,6 +50,8 @@
/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
@ -64,7 +66,10 @@
| UBLK_F_CMD_IOCTL_ENCODE \
| UBLK_F_USER_COPY \
| UBLK_F_ZONED \
| UBLK_F_USER_RECOVERY_FAIL_IO)
| UBLK_F_USER_RECOVERY_FAIL_IO \
| UBLK_F_UPDATE_SIZE \
| UBLK_F_AUTO_BUF_REG \
| UBLK_F_QUIESCE)
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE \
@ -77,7 +82,11 @@
UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
struct ublk_rq_data {
struct kref ref;
refcount_t ref;
/* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
u16 buf_index;
void *buf_ctx_handle;
};
struct ublk_uring_cmd_pdu {
@ -99,6 +108,9 @@ struct ublk_uring_cmd_pdu {
* setup in ublk uring_cmd handler
*/
struct ublk_queue *ubq;
struct ublk_auto_buf_reg buf;
u16 tag;
};
@ -131,6 +143,14 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
/*
* request buffer is registered automatically, so we have to unregister it
* before completing this request.
*
* io_uring will unregister the buffer automatically for us on exit.
*/
#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
/* atomic RW with ubq->cancel_lock */
#define UBLK_IO_FLAG_CANCELED 0x80000000
@ -140,7 +160,12 @@ struct ublk_io {
unsigned int flags;
int res;
union {
/* valid if UBLK_IO_FLAG_ACTIVE is set */
struct io_uring_cmd *cmd;
/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
struct request *req;
};
};
struct ublk_queue {
@ -198,13 +223,19 @@ struct ublk_params_header {
__u32 types;
};
static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
const struct ublk_queue *ubq, int tag, size_t offset);
static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
int tag);
static inline struct ublksrv_io_desc *
ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
{
return &ubq->io_cmd_buf[tag];
}
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_ZONED;
@ -356,8 +387,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
if (ret)
goto free_req;
ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
GFP_KERNEL);
ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
if (ret)
goto erase_desc;
@ -477,7 +507,6 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
#endif
static inline void __ublk_complete_rq(struct request *req);
static void ublk_complete_rq(struct kref *ref);
static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
@ -609,6 +638,11 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
}
static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_AUTO_BUF_REG;
}
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_USER_COPY;
@ -616,7 +650,8 @@ static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
{
return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq);
return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
!ublk_support_auto_buf_reg(ubq);
}
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
@ -627,8 +662,13 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
*
* for zero copy, the request buffer needs to be registered with the
* io_uring buffer table, so a reference is needed
*
* For auto buffer register, the ublk server may still issue
* UBLK_IO_COMMIT_AND_FETCH_REQ before a registered buffer is used up,
* so a reference is required too.
*/
return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq);
return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
ublk_support_auto_buf_reg(ubq);
}
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
@ -637,7 +677,7 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
kref_init(&data->ref);
refcount_set(&data->ref, 1);
}
}
@ -647,7 +687,7 @@ static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
return kref_get_unless_zero(&data->ref);
return refcount_inc_not_zero(&data->ref);
}
return true;
@ -659,7 +699,8 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
kref_put(&data->ref, ublk_complete_rq);
if (refcount_dec_and_test(&data->ref))
__ublk_complete_rq(req);
} else {
__ublk_complete_rq(req);
}
@ -695,12 +736,6 @@ static inline bool ublk_rq_has_data(const struct request *rq)
return bio_has_data(rq->bio);
}
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
int tag)
{
return &ubq->io_cmd_buf[tag];
}
static inline struct ublksrv_io_desc *
ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
{
@ -1117,18 +1152,12 @@ exit:
blk_mq_end_request(req, res);
}
static void ublk_complete_rq(struct kref *ref)
static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
int res, unsigned issue_flags)
{
struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
ref);
struct request *req = blk_mq_rq_from_pdu(data);
/* read cmd first because req will overwrite it */
struct io_uring_cmd *cmd = io->cmd;
__ublk_complete_rq(req);
}
static void ubq_complete_io_cmd(struct ublk_io *io, int res,
unsigned issue_flags)
{
/* mark this cmd owned by ublksrv */
io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
@ -1138,8 +1167,10 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res,
*/
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
io->req = req;
/* tell ublksrv one io request is coming */
io_uring_cmd_done(io->cmd, res, 0, issue_flags);
io_uring_cmd_done(cmd, res, 0, issue_flags);
}
#define UBLK_REQUEUE_DELAY_MS 3
@ -1154,16 +1185,91 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
blk_mq_end_request(rq, BLK_STS_IOERR);
}
static void ublk_auto_buf_reg_fallback(struct request *req)
{
const struct ublk_queue *ubq = req->mq_hctx->driver_data;
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
refcount_set(&data->ref, 1);
}
static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
unsigned int issue_flags)
{
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(io->cmd);
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
int ret;
ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
pdu->buf.index, issue_flags);
if (ret) {
if (pdu->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
ublk_auto_buf_reg_fallback(req);
return true;
}
blk_mq_end_request(req, BLK_STS_IOERR);
return false;
}
/* one extra reference is dropped by ublk_io_release */
refcount_set(&data->ref, 2);
data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
/* store buffer index in request payload */
data->buf_index = pdu->buf.index;
io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
return true;
}
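With UBLK_F_AUTO_BUF_REG the request starts with a refcount of 2: one reference belongs to the registered io_uring buffer (dropped from ublk_io_release()) and one to the normal commit path, so the request only completes once both are gone. A reference-counting sketch in plain C11 with userspace atomics and hypothetical names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_req {
    atomic_int ref;
    bool completed;
};

static void complete_req(struct fake_req *req)
{
    req->completed = true;
    printf("request completed\n");
}

/* Drop one reference; the request completes only when the last one goes. */
static void put_req(struct fake_req *req)
{
    if (atomic_fetch_sub(&req->ref, 1) == 1)
        complete_req(req);
}

int main(void)
{
    struct fake_req req = { .ref = 2 };   /* like refcount_set(&data->ref, 2) */

    put_req(&req);    /* buffer unregistered, i.e. ublk_io_release() */
    printf("after buffer release: completed=%d\n", req.completed);
    put_req(&req);    /* COMMIT_AND_FETCH drops the IO reference */
    printf("after commit: completed=%d\n", req.completed);
    return 0;
}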
static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
struct request *req, struct ublk_io *io,
unsigned int issue_flags)
{
if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
return ublk_auto_buf_reg(req, io, issue_flags);
ublk_init_req_ref(ubq, req);
return true;
}
static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
struct ublk_io *io)
{
unsigned mapped_bytes = ublk_map_io(ubq, req, io);
/* partially mapped, update io descriptor */
if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
/*
* Nothing mapped, retry until we succeed.
*
* We may never succeed in mapping any bytes here because
* of OOM. TODO: reserve one buffer with single page pinned
* for providing forward progress guarantee.
*/
if (unlikely(!mapped_bytes)) {
blk_mq_requeue_request(req, false);
blk_mq_delay_kick_requeue_list(req->q,
UBLK_REQUEUE_DELAY_MS);
return false;
}
ublk_get_iod(ubq, req->tag)->nr_sectors =
mapped_bytes >> 9;
}
return true;
}
static void ublk_dispatch_req(struct ublk_queue *ubq,
struct request *req,
unsigned int issue_flags)
{
int tag = req->tag;
struct ublk_io *io = &ubq->ios[tag];
unsigned int mapped_bytes;
pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
__func__, ubq->q_id, req->tag, io->flags,
ublk_get_iod(ubq, req->tag)->addr);
/*
@ -1183,54 +1289,22 @@ static void ublk_dispatch_req(struct ublk_queue *ubq,
if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
/*
* We have not handled UBLK_IO_NEED_GET_DATA command yet,
* so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
* so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
* and notify it.
*/
if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
__func__, io->cmd->cmd_op, ubq->q_id,
req->tag, io->flags);
ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
return;
}
/*
* We have handled UBLK_IO_NEED_GET_DATA command,
* so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
* do the copy work.
*/
io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
/* update iod->addr because ublksrv may have passed a new io buffer */
ublk_get_iod(ubq, req->tag)->addr = io->addr;
pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
ublk_get_iod(ubq, req->tag)->addr);
}
mapped_bytes = ublk_map_io(ubq, req, io);
/* partially mapped, update io descriptor */
if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
/*
* Nothing mapped, retry until we succeed.
*
* We may never succeed in mapping any bytes here because
* of OOM. TODO: reserve one buffer with single page pinned
* for providing forward progress guarantee.
*/
if (unlikely(!mapped_bytes)) {
blk_mq_requeue_request(req, false);
blk_mq_delay_kick_requeue_list(req->q,
UBLK_REQUEUE_DELAY_MS);
pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
__func__, ubq->q_id, req->tag, io->flags);
ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
issue_flags);
return;
}
ublk_get_iod(ubq, req->tag)->nr_sectors =
mapped_bytes >> 9;
}
if (!ublk_start_io(ubq, req, io))
return;
ublk_init_req_ref(ubq, req);
ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
}
static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
@ -1590,30 +1664,6 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
static void ublk_commit_completion(struct ublk_device *ub,
const struct ublksrv_io_cmd *ub_cmd)
{
u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
struct ublk_queue *ubq = ublk_get_queue(ub, qid);
struct ublk_io *io = &ubq->ios[tag];
struct request *req;
/* now this cmd slot is owned by nbd driver */
io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
io->res = ub_cmd->result;
/* find the io request and complete */
req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
if (WARN_ON_ONCE(unlikely(!req)))
return;
if (req_op(req) == REQ_OP_ZONE_APPEND)
req->__sector = ub_cmd->zone_append_lba;
if (likely(!blk_should_fake_timeout(req->q)))
ublk_put_req_ref(ubq, req);
}
static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
struct request *req)
{
@ -1642,17 +1692,8 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
struct request *rq;
/*
* Either we fail the request or ublk_rq_task_work_cb
* will do it
*/
rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
if (rq && blk_mq_request_started(rq))
__ublk_fail_req(ubq, io, rq);
}
if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
__ublk_fail_req(ubq, io, io->req);
}
}
@ -1940,6 +1981,20 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
io_uring_cmd_mark_cancelable(cmd, issue_flags);
}
static inline int ublk_set_auto_buf_reg(struct io_uring_cmd *cmd)
{
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
pdu->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
if (pdu->buf.reserved0 || pdu->buf.reserved1)
return -EINVAL;
if (pdu->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
return -EINVAL;
return 0;
}
static void ublk_io_release(void *priv)
{
struct request *rq = priv;
@ -1953,16 +2008,12 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
unsigned int index, unsigned int issue_flags)
{
struct ublk_device *ub = cmd->file->private_data;
const struct ublk_io *io = &ubq->ios[tag];
struct request *req;
int ret;
if (!ublk_support_zero_copy(ubq))
return -EINVAL;
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
return -EINVAL;
req = __ublk_check_and_get_req(ub, ubq, tag, 0);
if (!req)
return -EINVAL;
@ -1978,17 +2029,12 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
}
static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
const struct ublk_queue *ubq, unsigned int tag,
const struct ublk_queue *ubq,
unsigned int index, unsigned int issue_flags)
{
const struct ublk_io *io = &ubq->ios[tag];
if (!ublk_support_zero_copy(ubq))
return -EINVAL;
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
return -EINVAL;
return io_buffer_unregister_bvec(cmd, index, issue_flags);
}
@ -2031,6 +2077,12 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
goto out;
}
if (ublk_support_auto_buf_reg(ubq)) {
ret = ublk_set_auto_buf_reg(cmd);
if (ret)
goto out;
}
ublk_fill_io_cmd(io, cmd, buf_addr);
ublk_mark_io_ready(ub, ubq);
out:
@ -2038,6 +2090,90 @@ out:
return ret;
}
static int ublk_commit_and_fetch(const struct ublk_queue *ubq,
struct ublk_io *io, struct io_uring_cmd *cmd,
const struct ublksrv_io_cmd *ub_cmd,
unsigned int issue_flags)
{
struct request *req = io->req;
if (ublk_need_map_io(ubq)) {
/*
* COMMIT_AND_FETCH_REQ has to provide IO buffer if
* NEED GET DATA is not enabled or it is Read IO.
*/
if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
req_op(req) == REQ_OP_READ))
return -EINVAL;
} else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
/*
* User copy requires addr to be unset when command is
* not zone append
*/
return -EINVAL;
}
if (ublk_support_auto_buf_reg(ubq)) {
int ret;
/*
* `UBLK_F_AUTO_BUF_REG` only works if `UBLK_IO_FETCH_REQ`
* and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
* `io_ring_ctx`.
*
* If this uring_cmd's io_ring_ctx isn't the same as the one used
* for registering the buffer, it is the ublk server's
* responsibility to unregister the buffer, otherwise this ublk
* request gets stuck.
*/
if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
if (data->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
io_buffer_unregister_bvec(cmd, data->buf_index,
issue_flags);
io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
}
ret = ublk_set_auto_buf_reg(cmd);
if (ret)
return ret;
}
ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
/* now this cmd slot is owned by ublk driver */
io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
io->res = ub_cmd->result;
if (req_op(req) == REQ_OP_ZONE_APPEND)
req->__sector = ub_cmd->zone_append_lba;
if (likely(!blk_should_fake_timeout(req->q)))
ublk_put_req_ref(ubq, req);
return 0;
}
static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io)
{
struct request *req = io->req;
/*
* We have handled UBLK_IO_NEED_GET_DATA command,
* so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
* do the copy work.
*/
io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
/* update iod->addr because ublksrv may have passed a new io buffer */
ublk_get_iod(ubq, req->tag)->addr = io->addr;
pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
__func__, ubq->q_id, req->tag, io->flags,
ublk_get_iod(ubq, req->tag)->addr);
return ublk_start_io(ubq, req, io);
}
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@ -2048,7 +2184,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
u32 cmd_op = cmd->cmd_op;
unsigned tag = ub_cmd->tag;
int ret = -EINVAL;
struct request *req;
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
__func__, cmd->cmd_op, ub_cmd->q_id, tag,
@ -2058,9 +2193,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
goto out;
ubq = ublk_get_queue(ub, ub_cmd->q_id);
if (!ubq || ub_cmd->q_id != ubq->q_id)
goto out;
if (ubq->ubq_daemon && ubq->ubq_daemon != current)
goto out;
@ -2075,6 +2207,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
goto out;
}
/* only UBLK_IO_FETCH_REQ is allowed if io is not OWNED_BY_SRV */
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) &&
_IOC_NR(cmd_op) != UBLK_IO_FETCH_REQ)
goto out;
/*
* ensure that the user issues UBLK_IO_NEED_GET_DATA
* iff the driver has set the UBLK_IO_FLAG_NEED_GET_DATA.
@ -2092,45 +2229,23 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_IO_REGISTER_IO_BUF:
return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
case UBLK_IO_UNREGISTER_IO_BUF:
return ublk_unregister_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
return ublk_unregister_io_buf(cmd, ubq, ub_cmd->addr, issue_flags);
case UBLK_IO_FETCH_REQ:
ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
if (ret)
goto out;
break;
case UBLK_IO_COMMIT_AND_FETCH_REQ:
req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
ret = ublk_commit_and_fetch(ubq, io, cmd, ub_cmd, issue_flags);
if (ret)
goto out;
if (ublk_need_map_io(ubq)) {
/*
* COMMIT_AND_FETCH_REQ has to provide IO buffer if
* NEED GET DATA is not enabled or it is Read IO.
*/
if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
req_op(req) == REQ_OP_READ))
goto out;
} else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
/*
* User copy requires addr to be unset when command is
* not zone append
*/
ret = -EINVAL;
goto out;
}
ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_commit_completion(ub, ub_cmd);
break;
case UBLK_IO_NEED_GET_DATA:
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
goto out;
ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
ublk_dispatch_req(ubq, req, issue_flags);
io->addr = ub_cmd->addr;
if (!ublk_get_data(ubq, io))
return -EIOCBQUEUED;
return UBLK_IO_RES_OK;
default:
goto out;
}
@ -2728,6 +2843,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
return -EINVAL;
}
if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
return -EINVAL;
}
/*
* unprivileged device can't be trusted, but RECOVERY and
* RECOVERY_REISSUE still may hang error handling, so can't
@ -2744,8 +2864,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
* For USER_COPY, we depend on userspace to fill the request
* buffer by pwrite() to the ublk char device, which can't be
* used for unprivileged devices
*
* Same with zero copy or auto buffer register.
*/
if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
UBLK_F_AUTO_BUF_REG))
return -EINVAL;
}
@ -2803,7 +2926,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
UBLK_F_URING_CMD_COMP_IN_TASK;
/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
UBLK_F_AUTO_BUF_REG))
ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
/*
@ -3106,6 +3230,127 @@ static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
return 0;
}
static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
{
struct ublk_param_basic *p = &ub->params.basic;
u64 new_size = header->data[0];
mutex_lock(&ub->mutex);
p->dev_sectors = new_size;
set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
mutex_unlock(&ub->mutex);
}
struct count_busy {
const struct ublk_queue *ubq;
unsigned int nr_busy;
};
static bool ublk_count_busy_req(struct request *rq, void *data)
{
struct count_busy *idle = data;
if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
idle->nr_busy += 1;
return true;
}
/* uring_cmd is guaranteed to be active if the associated request is idle */
static bool ubq_has_idle_io(const struct ublk_queue *ubq)
{
struct count_busy data = {
.ubq = ubq,
};
blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
return data.nr_busy < ubq->q_depth;
}
/* Wait until each hw queue has at least one idle IO */
static int ublk_wait_for_idle_io(struct ublk_device *ub,
unsigned int timeout_ms)
{
unsigned int elapsed = 0;
int ret;
while (elapsed < timeout_ms && !signal_pending(current)) {
unsigned int queues_cancelable = 0;
int i;
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
struct ublk_queue *ubq = ublk_get_queue(ub, i);
queues_cancelable += !!ubq_has_idle_io(ubq);
}
/*
* Each queue needs at least one active command for
* notifying the ublk server
*/
if (queues_cancelable == ub->dev_info.nr_hw_queues)
break;
msleep(UBLK_REQUEUE_DELAY_MS);
elapsed += UBLK_REQUEUE_DELAY_MS;
}
if (signal_pending(current))
ret = -EINTR;
else if (elapsed >= timeout_ms)
ret = -EBUSY;
else
ret = 0;
return ret;
}
static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
const struct ublksrv_ctrl_cmd *header)
{
/* zero means wait forever */
u64 timeout_ms = header->data[0];
struct gendisk *disk;
int i, ret = -ENODEV;
if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
return -EOPNOTSUPP;
mutex_lock(&ub->mutex);
disk = ublk_get_disk(ub);
if (!disk)
goto unlock;
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
goto put_disk;
ret = 0;
/* already in expected state */
if (ub->dev_info.state != UBLK_S_DEV_LIVE)
goto put_disk;
/* Mark all queues as canceling */
blk_mq_quiesce_queue(disk->queue);
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
struct ublk_queue *ubq = ublk_get_queue(ub, i);
ubq->canceling = true;
}
blk_mq_unquiesce_queue(disk->queue);
if (!timeout_ms)
timeout_ms = UINT_MAX;
ret = ublk_wait_for_idle_io(ub, timeout_ms);
put_disk:
ublk_put_disk(disk);
unlock:
mutex_unlock(&ub->mutex);
/* Cancel pending uring_cmd */
if (!ret)
ublk_cancel_dev(ub);
return ret;
}
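ublk_ctrl_quiesce_dev() marks every queue as canceling and then polls until each hardware queue has at least one idle IO, giving up with -EINTR on a signal or with -EBUSY when the (optionally infinite) timeout expires. A generic sketch of that bounded polling loop, with stubbed-out condition and signal checks in place of the tagset iteration:

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define POLL_DELAY_MS 3   /* like UBLK_REQUEUE_DELAY_MS */

/* Stub for "every queue has an idle IO"; flips to true after a while. */
static bool all_queues_cancelable(unsigned int elapsed_ms)
{
    return elapsed_ms >= 30;
}

static bool signal_pending_stub(void)
{
    return false;   /* the kernel checks signal_pending(current) here */
}

/* Returns 0 on success, -16 (-EBUSY) on timeout, -4 (-EINTR) on signal. */
static int wait_for_idle_io(unsigned int timeout_ms)
{
    unsigned int elapsed = 0;

    while (elapsed < timeout_ms && !signal_pending_stub()) {
        if (all_queues_cancelable(elapsed))
            return 0;
        usleep(POLL_DELAY_MS * 1000);
        elapsed += POLL_DELAY_MS;
    }
    return signal_pending_stub() ? -4 : -16;
}

int main(void)
{
    printf("wait result: %d\n", wait_for_idle_io(1000));
    return 0;
}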
/*
* All control commands are sent via /dev/ublk-control, so we have to check
* the destination device's permission
@ -3191,6 +3436,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
case UBLK_CMD_SET_PARAMS:
case UBLK_CMD_START_USER_RECOVERY:
case UBLK_CMD_END_USER_RECOVERY:
case UBLK_CMD_UPDATE_SIZE:
case UBLK_CMD_QUIESCE_DEV:
mask = MAY_READ | MAY_WRITE;
break;
default:
@ -3282,6 +3529,13 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_CMD_END_USER_RECOVERY:
ret = ublk_ctrl_end_recovery(ub, header);
break;
case UBLK_CMD_UPDATE_SIZE:
ublk_ctrl_set_size(ub, header);
ret = 0;
break;
case UBLK_CMD_QUIESCE_DEV:
ret = ublk_ctrl_quiesce_dev(ub, header);
break;
default:
ret = -EOPNOTSUPP;
break;
@ -3315,6 +3569,7 @@ static int __init ublk_init(void)
BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
init_waitqueue_head(&ublk_idr_wq);


@ -571,7 +571,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk,
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT);
vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector);
err = blk_rq_map_kern(q, req, report_buf, report_len, GFP_KERNEL);
err = blk_rq_map_kern(req, report_buf, report_len, GFP_KERNEL);
if (err)
goto out;
@ -817,7 +817,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
vbr->out_hdr.sector = 0;
err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
err = blk_rq_map_kern(req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
if (err)
goto out;

drivers/block/zloop.c: new file, 1385 lines (diff too large, suppressed)


@ -3677,7 +3677,6 @@ static void cdrom_sysctl_register(void)
static void cdrom_sysctl_unregister(void)
{
if (cdrom_sysctl_header)
unregister_sysctl_table(cdrom_sysctl_header);
}


@ -293,8 +293,7 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
bio->bi_iter.bi_sector = SB_SECTOR;
__bio_add_page(bio, virt_to_page(out), SB_SIZE,
offset_in_page(out));
bio_add_virt_nofail(bio, out, SB_SIZE);
out->offset = cpu_to_le64(sb->offset);


@ -1364,7 +1364,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
ptr = (char *)b->data + offset;
len = n_sectors << SECTOR_SHIFT;
__bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
bio_add_virt_nofail(bio, ptr, len);
submit_bio(bio);
}


@ -2557,14 +2557,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
char *mem;
outgoing_bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recheck_bios);
r = bio_add_page(outgoing_bio, virt_to_page(outgoing_data), ic->sectors_per_block << SECTOR_SHIFT, 0);
if (unlikely(r != (ic->sectors_per_block << SECTOR_SHIFT))) {
bio_put(outgoing_bio);
bio->bi_status = BLK_STS_RESOURCE;
bio_endio(bio);
return;
}
bio_add_virt_nofail(outgoing_bio, outgoing_data,
ic->sectors_per_block << SECTOR_SHIFT);
bip = bio_integrity_alloc(outgoing_bio, GFP_NOIO, 1);
if (IS_ERR(bip)) {
@ -3211,7 +3205,8 @@ next_chunk:
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recalc_bios);
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
__bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
bio_add_virt_nofail(bio, recalc_buffer,
range.n_sectors << SECTOR_SHIFT);
r = submit_bio_wait(bio);
bio_put(bio);
if (unlikely(r)) {
@ -3228,7 +3223,8 @@ next_chunk:
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_WRITE, GFP_NOIO, &ic->recalc_bios);
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
__bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
bio_add_virt_nofail(bio, recalc_buffer,
range.n_sectors << SECTOR_SHIFT);
bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
if (unlikely(IS_ERR(bip))) {


@ -14,6 +14,7 @@
#include "raid5.h"
#include "raid10.h"
#include "md-bitmap.h"
#include "dm-core.h"
#include <linux/device-mapper.h>
@ -3308,6 +3309,7 @@ size_check:
/* Disable/enable discard support on raid set. */
configure_discard_support(rs);
rs->md.dm_gendisk = ti->table->md->disk;
mddev_unlock(&rs->md);
return 0;
@ -3327,6 +3329,7 @@ static void raid_dtr(struct dm_target *ti)
mddev_lock_nointr(&rs->md);
md_stop(&rs->md);
rs->md.dm_gendisk = NULL;
mddev_unlock(&rs->md);
if (work_pending(&rs->md.event_work))


@ -111,32 +111,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
* Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
* is 1000 KB/sec, so the extra system load does not show up that much.
* Increase it if you want to have more _guaranteed_ speed. Note that
* the RAID driver will use the maximum available bandwidth if the IO
* subsystem is idle. There is also an 'absolute maximum' reconstruction
* speed limit - in case reconstruction slows down your system despite
* idle IO detection.
* Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
* is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
* does not show up that much. Increase it if you want to have more guaranteed
* speed. Note that the RAID driver will use the maximum bandwidth
* sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
*
* you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
* or /sys/block/mdX/md/sync_speed_{min,max}
* Background sync IO speed control:
*
* - below speed min:
* no limit;
* - above speed min and below speed max:
* a) if mddev is idle, then no limit;
* b) if mddev is busy handling normal IO, then limit inflight sync IO
* to sync_io_depth;
* - above speed max:
* sync IO can't be issued;
*
* The following settings can be changed via /proc/sys/dev/raid/ for the system
* or /sys/block/mdX/md/ for one array.
*/
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
static int sysctl_sync_io_depth = 32;
static int speed_min(struct mddev *mddev)
{
return mddev->sync_speed_min ?
mddev->sync_speed_min : sysctl_speed_limit_min;
}
static inline int speed_max(struct mddev *mddev)
static int speed_max(struct mddev *mddev)
{
return mddev->sync_speed_max ?
mddev->sync_speed_max : sysctl_speed_limit_max;
}
static int sync_io_depth(struct mddev *mddev)
{
return mddev->sync_io_depth ?
mddev->sync_io_depth : sysctl_sync_io_depth;
}
static void rdev_uninit_serial(struct md_rdev *rdev)
{
if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
@ -293,14 +309,21 @@ static const struct ctl_table raid_table[] = {
.procname = "speed_limit_min",
.data = &sysctl_speed_limit_min,
.maxlen = sizeof(int),
.mode = S_IRUGO|S_IWUSR,
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "speed_limit_max",
.data = &sysctl_speed_limit_max,
.maxlen = sizeof(int),
.mode = S_IRUGO|S_IWUSR,
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sync_io_depth",
.data = &sysctl_sync_io_depth,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
};
@ -5145,6 +5168,35 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
static struct md_sysfs_entry md_sync_max =
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
static ssize_t
sync_io_depth_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
mddev->sync_io_depth ? "local" : "system");
}
static ssize_t
sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
{
unsigned int max;
int rv;
if (strncmp(buf, "system", 6) == 0) {
max = 0;
} else {
rv = kstrtouint(buf, 10, &max);
if (rv < 0)
return rv;
if (max == 0)
return -EINVAL;
}
mddev->sync_io_depth = max;
return len;
}
static struct md_sysfs_entry md_sync_io_depth =
__ATTR_RW(sync_io_depth);
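The new sync_io_depth attribute accepts either the literal string "system" (fall back to the sysctl, stored internally as 0) or a non-zero unsigned integer. A small userspace parser with the same acceptance rules; this is a hypothetical helper, not the sysfs handler itself:

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse a sync_io_depth value: "system" -> 0, otherwise a non-zero uint. */
static int parse_sync_io_depth(const char *buf, unsigned int *out)
{
    char *end;
    unsigned long val;

    if (strncmp(buf, "system", 6) == 0) {
        *out = 0;
        return 0;
    }

    errno = 0;
    val = strtoul(buf, &end, 10);
    if (errno || end == buf || val == 0 || val > UINT_MAX)
        return -EINVAL;
    *out = (unsigned int)val;
    return 0;
}

int main(void)
{
    unsigned int depth;
    const char *inputs[] = { "system", "64", "0", "abc" };

    for (int i = 0; i < 4; i++) {
        int rc = parse_sync_io_depth(inputs[i], &depth);
        printf("%-6s -> rc=%d depth=%u\n", inputs[i], rc, rc ? 0 : depth);
    }
    return 0;
}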
static ssize_t
degraded_show(struct mddev *mddev, char *page)
{
@ -5671,6 +5723,7 @@ static struct attribute *md_redundancy_attrs[] = {
&md_mismatches.attr,
&md_sync_min.attr,
&md_sync_max.attr,
&md_sync_io_depth.attr,
&md_sync_speed.attr,
&md_sync_force_parallel.attr,
&md_sync_completed.attr,
@ -8572,50 +8625,55 @@ void md_cluster_stop(struct mddev *mddev)
put_cluster_ops(mddev);
}
static int is_mddev_idle(struct mddev *mddev, int init)
static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
{
struct md_rdev *rdev;
int idle;
int curr_events;
unsigned long last_events = rdev->last_events;
idle = 1;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_disk;
if (!bdev_is_partition(rdev->bdev))
return true;
if (!init && !blk_queue_io_stat(disk->queue))
continue;
curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
atomic_read(&disk->sync_io);
/* sync IO will cause sync_io to increase before the disk_stats
* as sync_io is counted when a request starts, and
* disk_stats is counted when it completes.
* So resync activity will cause curr_events to be smaller than
* when there was no such activity.
* non-sync IO will cause disk_stat to increase without
* increasing sync_io so curr_events will (eventually)
* be larger than it was before. Once it becomes
* substantially larger, the test below will cause
* the array to appear non-idle, and resync will slow
* down.
* If there is a lot of outstanding resync activity when
* we set last_event to curr_events, then all that activity
* completing might cause the array to appear non-idle
* and resync will be slowed down even though there might
* not have been non-resync activity. This will only
* happen once though. 'last_events' will soon reflect
* the state where there is little or no outstanding
* resync requests, and further resync activity will
* always make curr_events less than last_events.
*
/*
* If rdev is a partition and the user doesn't issue IO to the array, the
* array is still not idle if the user issues IO to other partitions.
*/
if (init || curr_events - rdev->last_events > 64) {
rdev->last_events = curr_events;
idle = 0;
}
rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
sectors) -
part_stat_read_accum(rdev->bdev, sectors);
return init || rdev->last_events <= last_events;
}
/*
* mddev is idle if the following conditions have held since the last check:
* 1) mddev has no newly completed normal IO;
* 2) mddev has no inflight normal IO;
* 3) if any member disk is a partition, the other partitions have no newly
* completed IO;
*
* Note that this check relies on IO accounting being enabled.
*/
static bool is_mddev_idle(struct mddev *mddev, int init)
{
unsigned long last_events = mddev->normal_io_events;
struct gendisk *disk;
struct md_rdev *rdev;
bool idle = true;
disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
if (!disk)
return true;
mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors);
if (!init && (mddev->normal_io_events > last_events ||
bdev_count_inflight(disk->part0)))
idle = false;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (!is_rdev_holder_idle(rdev, init))
idle = false;
rcu_read_unlock();
return idle;
}
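The rewritten is_mddev_idle() resamples the accumulated completed-sector counter, compares it with the value saved at the previous check, and also looks at the in-flight count, so the array counts as busy if new normal IO completed or some is still outstanding. A condensed model of that check (the partition-holder case handled by is_rdev_holder_idle() is omitted; all names are stand-ins):

#include <stdbool.h>
#include <stdio.h>

struct fake_mddev {
    unsigned long last_events;   /* like mddev->normal_io_events */
};

/* completed: accumulated sectors completed so far (monotonic);
 * inflight: current number of in-flight normal IOs. */
static bool mddev_idle(struct fake_mddev *m, unsigned long completed,
                       unsigned int inflight, bool init)
{
    unsigned long last = m->last_events;

    m->last_events = completed;   /* always resample, as the kernel does */
    if (init)
        return true;
    return completed <= last && inflight == 0;
}

int main(void)
{
    struct fake_mddev m = { 0 };

    mddev_idle(&m, 1000, 0, true);                          /* prime the counter */
    printf("idle? %d\n", mddev_idle(&m, 1000, 0, false));   /* no new IO -> 1 */
    printf("idle? %d\n", mddev_idle(&m, 1024, 0, false));   /* new IO -> 0 */
    return 0;
}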
@ -8927,6 +8985,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
}
}
static bool sync_io_within_limit(struct mddev *mddev)
{
int io_sectors;
/*
* For raid456, sync IO is stripe(4k) per IO, for other levels, it's
* RESYNC_PAGES(64k) per IO.
*/
if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
io_sectors = 8;
else
io_sectors = 128;
return atomic_read(&mddev->recovery_active) <
io_sectors * sync_io_depth(mddev);
}
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
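md_do_sync() now lets sync IO proceed while it stays within sync_io_depth and only falls back to the idle check once that depth is exceeded, on top of the existing speed_limit_min/speed_limit_max clamping. A simplified decision function capturing that policy; the struct fields and units are assumptions for illustration, not the mddev layout:

#include <stdbool.h>
#include <stdio.h>

struct sync_state {
    unsigned int speed_kbps;       /* current resync speed */
    unsigned int speed_min;        /* like speed_min(mddev), KB/s */
    unsigned int speed_max;        /* like speed_max(mddev), KB/s */
    unsigned int inflight_sectors; /* like mddev->recovery_active */
    unsigned int io_sectors;       /* 8 for raid4/5/6, 128 otherwise */
    unsigned int sync_io_depth;    /* like sync_io_depth(mddev) */
    bool array_idle;               /* result of is_mddev_idle() */
};

static bool may_issue_sync_io(const struct sync_state *s)
{
    if (s->speed_kbps <= s->speed_min)
        return true;                    /* guaranteed minimum speed */
    if (s->speed_kbps > s->speed_max)
        return false;                   /* hard cap, back off */
    if (s->inflight_sectors < s->io_sectors * s->sync_io_depth)
        return true;                    /* still within sync_io_depth */
    return s->array_idle;               /* over depth: yield unless idle */
}

int main(void)
{
    struct sync_state s = {
        .speed_kbps = 50000, .speed_min = 1000, .speed_max = 200000,
        .inflight_sectors = 8 * 40, .io_sectors = 8, .sync_io_depth = 32,
        .array_idle = false,
    };

    printf("issue more sync IO? %s\n", may_issue_sync_io(&s) ? "yes" : "no");
    return 0;
}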
@ -9195,7 +9270,8 @@ void md_do_sync(struct md_thread *thread)
msleep(500);
goto repeat;
}
if (!is_mddev_idle(mddev, 0)) {
if (!sync_io_within_limit(mddev) &&
!is_mddev_idle(mddev, 0)) {
/*
* Give other IO more of a chance.
* The faster the devices, the less we wait.


@ -132,7 +132,7 @@ struct md_rdev {
sector_t sectors; /* Device size (in 512bytes sectors) */
struct mddev *mddev; /* RAID array if running */
int last_events; /* IO event timestamp */
unsigned long last_events; /* IO event timestamp */
/*
* If meta_bdev is non-NULL, it means that a separate device is
@ -404,7 +404,8 @@ struct mddev {
* are happening, so run/
* takeover/stop are not safe
*/
struct gendisk *gendisk;
struct gendisk *gendisk; /* mdraid gendisk */
struct gendisk *dm_gendisk; /* dm-raid gendisk */
struct kobject kobj;
int hold_active;
@ -483,6 +484,7 @@ struct mddev {
/* if zero, use the system-wide default */
int sync_speed_min;
int sync_speed_max;
int sync_io_depth;
/* resync even though the same disks are shared among md-devices */
int parallel_resync;
@ -518,6 +520,7 @@ struct mddev {
* adding a spare
*/
unsigned long normal_io_events; /* IO event timestamp */
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
sector_t recovery_cp;
@ -714,17 +717,6 @@ static inline int mddev_trylock(struct mddev *mddev)
}
extern void mddev_unlock(struct mddev *mddev);
static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
{
if (blk_queue_io_stat(bdev->bd_disk->queue))
atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
}
static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
{
md_sync_acct(bio->bi_bdev, nr_sectors);
}
struct md_personality
{
struct md_submodule_head head;


@ -2382,7 +2382,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
wbio->bi_end_io = end_sync_write;
atomic_inc(&r1_bio->remaining);
md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
submit_bio_noacct(wbio);
}
@ -3055,7 +3054,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
read_targets--;
md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
submit_bio_noacct(bio);
@ -3064,7 +3062,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
} else {
atomic_set(&r1_bio->remaining, 1);
bio = r1_bio->bios[r1_bio->read_disk];
md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
submit_bio_noacct(bio);


@ -2426,7 +2426,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
atomic_inc(&r10_bio->remaining);
md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
tbio->bi_opf |= MD_FAILFAST;
@ -2448,8 +2447,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
bio_copy_data(tbio, fbio);
d = r10_bio->devs[i].devnum;
atomic_inc(&r10_bio->remaining);
md_sync_acct(conf->mirrors[d].replacement->bdev,
bio_sectors(tbio));
submit_bio_noacct(tbio);
}
@ -2583,13 +2580,10 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
d = r10_bio->devs[1].devnum;
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
submit_bio_noacct(wbio);
}
if (wbio2) {
atomic_inc(&conf->mirrors[d].replacement->nr_pending);
md_sync_acct(conf->mirrors[d].replacement->bdev,
bio_sectors(wbio2));
submit_bio_noacct(wbio2);
}
}
@ -3757,7 +3751,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->sectors = nr_sectors;
if (bio->bi_end_io == end_sync_read) {
md_sync_acct_bio(bio, nr_sectors);
bio->bi_status = 0;
submit_bio_noacct(bio);
}
@ -4880,7 +4873,6 @@ read_more:
r10_bio->sectors = nr_sectors;
/* Now submit the read */
md_sync_acct_bio(read_bio, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
read_bio->bi_next = NULL;
submit_bio_noacct(read_bio);
@ -4940,7 +4932,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
continue;
atomic_inc(&rdev->nr_pending);
md_sync_acct_bio(b, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
b->bi_next = NULL;
submit_bio_noacct(b);


@ -1240,10 +1240,6 @@ again:
}
if (rdev) {
if (s->syncing || s->expanding || s->expanded
|| s->replacing)
md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags);
@ -1300,10 +1296,6 @@ again:
submit_bio_noacct(bi);
}
if (rrdev) {
if (s->syncing || s->expanding || s->expanded
|| s->replacing)
md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags);


@ -242,7 +242,7 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
{
const char *hmac_name;
struct crypto_shash *key_tfm;
struct shash_desc *shash;
SHASH_DESC_ON_STACK(shash, key_tfm);
struct nvme_dhchap_key *transformed_key;
int ret, key_len;
@ -267,19 +267,11 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
if (IS_ERR(key_tfm))
return ERR_CAST(key_tfm);
shash = kmalloc(sizeof(struct shash_desc) +
crypto_shash_descsize(key_tfm),
GFP_KERNEL);
if (!shash) {
ret = -ENOMEM;
goto out_free_key;
}
key_len = crypto_shash_digestsize(key_tfm);
transformed_key = nvme_auth_alloc_key(key_len, key->hash);
if (!transformed_key) {
ret = -ENOMEM;
goto out_free_shash;
goto out_free_key;
}
shash->tfm = key_tfm;
@ -299,15 +291,12 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
if (ret < 0)
goto out_free_transformed_key;
kfree(shash);
crypto_free_shash(key_tfm);
return transformed_key;
out_free_transformed_key:
nvme_auth_free_key(transformed_key);
out_free_shash:
kfree(shash);
out_free_key:
crypto_free_shash(key_tfm);


@ -31,6 +31,7 @@ struct nvme_dhchap_queue_context {
u32 s1;
u32 s2;
bool bi_directional;
bool authenticated;
u16 transaction;
u8 status;
u8 dhgroup_id;
@ -682,6 +683,7 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap)
static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap)
{
nvme_auth_reset_dhchap(chap);
chap->authenticated = false;
if (chap->shash_tfm)
crypto_free_shash(chap->shash_tfm);
if (chap->dh_tfm)
@ -930,12 +932,14 @@ static void nvme_queue_auth_work(struct work_struct *work)
}
if (!ret) {
chap->error = 0;
chap->authenticated = true;
if (ctrl->opts->concat &&
(ret = nvme_auth_secure_concat(ctrl, chap))) {
dev_warn(ctrl->device,
"%s: qid %d failed to enable secure concatenation\n",
__func__, chap->qid);
chap->error = ret;
chap->authenticated = false;
}
return;
}
@ -1023,13 +1027,16 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
return;
for (q = 1; q < ctrl->queue_count; q++) {
ret = nvme_auth_negotiate(ctrl, q);
if (ret) {
dev_warn(ctrl->device,
"qid %d: error %d setting up authentication\n",
q, ret);
break;
}
struct nvme_dhchap_queue_context *chap =
&ctrl->dhchap_ctxs[q];
/*
* Skip re-authentication if the queue had
* not been authenticated initially.
*/
if (!chap->authenticated)
continue;
cancel_work_sync(&chap->auth_work);
queue_work(nvme_auth_wq, &chap->auth_work);
}
/*
@ -1037,7 +1044,13 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
* the controller terminates the connection.
*/
for (q = 1; q < ctrl->queue_count; q++) {
ret = nvme_auth_wait(ctrl, q);
struct nvme_dhchap_queue_context *chap =
&ctrl->dhchap_ctxs[q];
if (!chap->authenticated)
continue;
flush_work(&chap->auth_work);
ret = chap->error;
nvme_auth_reset_dhchap(chap);
if (ret)
dev_warn(ctrl->device,
"qid %d: authentication failed\n", q);
@ -1076,6 +1089,7 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
chap = &ctrl->dhchap_ctxs[i];
chap->qid = i;
chap->ctrl = ctrl;
chap->authenticated = false;
INIT_WORK(&chap->auth_work, nvme_queue_auth_work);
}


@ -38,6 +38,8 @@ struct nvme_ns_info {
u32 nsid;
__le32 anagrpid;
u8 pi_offset;
u16 endgid;
u64 runs;
bool is_shared;
bool is_readonly;
bool is_ready;
@ -150,6 +152,8 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid);
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
struct nvme_command *cmd);
static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
@ -664,10 +668,11 @@ static void nvme_free_ns_head(struct kref *ref)
struct nvme_ns_head *head =
container_of(ref, struct nvme_ns_head, ref);
nvme_mpath_remove_disk(head);
nvme_mpath_put_disk(head);
ida_free(&head->subsys->ns_ida, head->instance);
cleanup_srcu_struct(&head->srcu);
nvme_put_subsystem(head->subsys);
kfree(head->plids);
kfree(head);
}
@ -991,6 +996,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
if (op == nvme_cmd_write && ns->head->nr_plids) {
u16 write_stream = req->bio->bi_write_stream;
if (WARN_ON_ONCE(write_stream > ns->head->nr_plids))
return BLK_STS_INVAL;
if (write_stream) {
dsmgmt |= ns->head->plids[write_stream - 1] << 16;
control |= NVME_RW_DTYPE_DPLCMT;
}
}
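/*
* Editor's note (illustration, not part of this change): with, say,
* nr_plids == 4 and bi_write_stream == 3, plids[2] is shifted into the
* upper 16 bits of dsmgmt (the Directive Specific field of CDW13) and
* NVME_RW_DTYPE_DPLCMT in control selects the Data Placement directive
* type, steering the write to that reclaim unit handle.
*/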
if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
return BLK_STS_INVAL;
@ -1157,7 +1174,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
req->cmd_flags &= ~REQ_FAILFAST_DRIVER;
if (buffer && bufflen) {
ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
ret = blk_rq_map_kern(req, buffer, bufflen, GFP_KERNEL);
if (ret)
goto out;
}
@ -1609,6 +1626,7 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
info->is_ready = true;
info->endgid = le16_to_cpu(id->endgid);
if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
dev_info(ctrl->device,
"Ignoring bogus Namespace Identifiers\n");
@ -1649,6 +1667,7 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
info->is_ready = id->nstat & NVME_NSTAT_NRDY;
info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
info->endgid = le16_to_cpu(id->endgid);
}
kfree(id);
return ret;
@ -1674,7 +1693,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
u32 *result)
void *result)
{
return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
buflen, result);
@ -1683,7 +1702,7 @@ EXPORT_SYMBOL_GPL(nvme_set_features);
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
u32 *result)
void *result)
{
return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
buflen, result);
@ -2167,6 +2186,148 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
return ret;
}
static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl,
struct nvme_ns_info *info, u8 fdp_idx)
{
struct nvme_fdp_config_log hdr, *h;
struct nvme_fdp_config_desc *desc;
size_t size = sizeof(hdr);
void *log, *end;
int i, n, ret;
ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
NVME_CSI_NVM, &hdr, size, 0, info->endgid);
if (ret) {
dev_warn(ctrl->device,
"FDP configs log header status:0x%x endgid:%d\n", ret,
info->endgid);
return ret;
}
size = le32_to_cpu(hdr.sze);
if (size > PAGE_SIZE * MAX_ORDER_NR_PAGES) {
dev_warn(ctrl->device, "FDP config size too large:%zu\n",
size);
return 0;
}
h = kvmalloc(size, GFP_KERNEL);
if (!h)
return -ENOMEM;
ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
NVME_CSI_NVM, h, size, 0, info->endgid);
if (ret) {
dev_warn(ctrl->device,
"FDP configs log status:0x%x endgid:%d\n", ret,
info->endgid);
goto out;
}
n = le16_to_cpu(h->numfdpc) + 1;
if (fdp_idx > n) {
dev_warn(ctrl->device, "FDP index:%d out of range:%d\n",
fdp_idx, n);
/* Proceed without registering FDP streams */
ret = 0;
goto out;
}
log = h + 1;
desc = log;
end = log + size - sizeof(*h);
for (i = 0; i < fdp_idx; i++) {
log += le16_to_cpu(desc->dsze);
desc = log;
if (log >= end) {
dev_warn(ctrl->device,
"FDP invalid config descriptor list\n");
ret = 0;
goto out;
}
}
if (le32_to_cpu(desc->nrg) > 1) {
dev_warn(ctrl->device, "FDP NRG > 1 not supported\n");
ret = 0;
goto out;
}
info->runs = le64_to_cpu(desc->runs);
out:
kvfree(h);
return ret;
}
static int nvme_query_fdp_info(struct nvme_ns *ns, struct nvme_ns_info *info)
{
struct nvme_ns_head *head = ns->head;
struct nvme_ctrl *ctrl = ns->ctrl;
struct nvme_fdp_ruh_status *ruhs;
struct nvme_fdp_config fdp;
struct nvme_command c = {};
size_t size;
int i, ret;
/*
* The FDP configuration is static for the lifetime of the namespace,
* so return immediately if we've already registered this namespace's
* streams.
*/
if (head->nr_plids)
return 0;
ret = nvme_get_features(ctrl, NVME_FEAT_FDP, info->endgid, NULL, 0,
&fdp);
if (ret) {
dev_warn(ctrl->device, "FDP get feature status:0x%x\n", ret);
return ret;
}
if (!(fdp.flags & FDPCFG_FDPE))
return 0;
ret = nvme_query_fdp_granularity(ctrl, info, fdp.fdpcidx);
if (!info->runs)
return ret;
size = struct_size(ruhs, ruhsd, S8_MAX - 1);
ruhs = kzalloc(size, GFP_KERNEL);
if (!ruhs)
return -ENOMEM;
c.imr.opcode = nvme_cmd_io_mgmt_recv;
c.imr.nsid = cpu_to_le32(head->ns_id);
c.imr.mo = NVME_IO_MGMT_RECV_MO_RUHS;
c.imr.numd = cpu_to_le32(nvme_bytes_to_numd(size));
ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
if (ret) {
dev_warn(ctrl->device, "FDP io-mgmt status:0x%x\n", ret);
goto free;
}
head->nr_plids = le16_to_cpu(ruhs->nruhsd);
if (!head->nr_plids)
goto free;
head->plids = kcalloc(head->nr_plids, sizeof(*head->plids),
GFP_KERNEL);
if (!head->plids) {
dev_warn(ctrl->device,
"failed to allocate %u FDP placement IDs\n",
head->nr_plids);
head->nr_plids = 0;
ret = -ENOMEM;
goto free;
}
for (i = 0; i < head->nr_plids; i++)
head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid);
free:
kfree(ruhs);
return ret;
}
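/*
* Editor's note (summary of the flow above, not part of this change): the
* placement IDs discovered here feed the new write-stream queue limits set
* in nvme_update_ns_info_block() below: nr_plids becomes max_write_streams
* and the reclaim unit nominal size (info->runs) becomes
* write_stream_granularity.
*/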
static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
@ -2204,6 +2365,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
goto out;
}
if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
ret = nvme_query_fdp_info(ns, info);
if (ret < 0)
goto out;
}
lim = queue_limits_start_update(ns->disk->queue);
memflags = blk_mq_freeze_queue(ns->disk->queue);
@ -2248,6 +2415,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (!nvme_init_integrity(ns->head, &lim, info))
capacity = 0;
lim.max_write_streams = ns->head->nr_plids;
if (lim.max_write_streams)
lim.write_stream_granularity = min(info->runs, U32_MAX);
else
lim.write_stream_granularity = 0;
ret = queue_limits_commit_update(ns->disk->queue, &lim);
if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue, memflags);
@ -2351,6 +2524,8 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
ns->head->disk->flags |= GENHD_FL_HIDDEN;
else
nvme_init_integrity(ns->head, &lim, info);
lim.max_write_streams = ns_lim->max_write_streams;
lim.write_stream_granularity = ns_lim->write_stream_granularity;
ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
@ -3108,8 +3283,8 @@ out_unlock:
return ret;
}
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
void *log, size_t size, u64 offset)
static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi)
{
struct nvme_command c = { };
u32 dwlen = nvme_bytes_to_numd(size);
@ -3123,10 +3298,18 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
c.get_log_page.csi = csi;
c.get_log_page.lsi = cpu_to_le16(lsi);
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
void *log, size_t size, u64 offset)
{
return nvme_get_log_lsi(ctrl, nsid, log_page, lsp, csi, log, size,
offset, 0);
}
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
struct nvme_effects_log **log)
{
@ -3584,7 +3767,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
*/
if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
continue;
if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
if (nvme_tryget_ns_head(h))
return h;
}
@ -3828,7 +4011,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
}
} else {
ret = -EINVAL;
if (!info->is_shared || !head->shared) {
if ((!info->is_shared || !head->shared) &&
!list_empty(&head->list)) {
dev_err(ctrl->device,
"Duplicate unshared namespace %d\n",
info->nsid);
@ -4032,6 +4216,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
mutex_lock(&ns->ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
if (list_empty(&ns->head->list)) {
if (!nvme_mpath_queue_if_no_path(ns->head))
list_del_init(&ns->head->entry);
last_path = true;
}
@ -4053,7 +4238,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
synchronize_srcu(&ns->ctrl->srcu);
if (last_path)
nvme_mpath_shutdown_disk(ns->head);
nvme_mpath_remove_disk(ns->head);
nvme_put_ns(ns);
}


@ -1410,9 +1410,8 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
}
static void
nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
nvme_fc_xmt_ls_rsp_free(struct nvmefc_ls_rcv_op *lsop)
{
struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private;
struct nvme_fc_rport *rport = lsop->rport;
struct nvme_fc_lport *lport = rport->lport;
unsigned long flags;
@ -1433,6 +1432,14 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
nvme_fc_rport_put(rport);
}
static void
nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
{
struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private;
nvme_fc_xmt_ls_rsp_free(lsop);
}
static void
nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop)
{
@ -1450,7 +1457,7 @@ nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop)
dev_warn(lport->dev,
"LLDD rejected LS RSP xmt: LS %d status %d\n",
w0->ls_cmd, ret);
nvme_fc_xmt_ls_rsp_done(lsop->lsrsp);
nvme_fc_xmt_ls_rsp_free(lsop);
return;
}
}


@ -10,10 +10,61 @@
#include "nvme.h"
bool multipath = true;
module_param(multipath, bool, 0444);
static bool multipath_always_on;
static int multipath_param_set(const char *val, const struct kernel_param *kp)
{
int ret;
bool *arg = kp->arg;
ret = param_set_bool(val, kp);
if (ret)
return ret;
if (multipath_always_on && !*arg) {
pr_err("Can't disable multipath when multipath_always_on is configured.\n");
*arg = true;
return -EINVAL;
}
return 0;
}
static const struct kernel_param_ops multipath_param_ops = {
.set = multipath_param_set,
.get = param_get_bool,
};
module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
MODULE_PARM_DESC(multipath,
"turn on native support for multiple controllers per subsystem");
static int multipath_always_on_set(const char *val,
const struct kernel_param *kp)
{
int ret;
bool *arg = kp->arg;
ret = param_set_bool(val, kp);
if (ret < 0)
return ret;
if (*arg)
multipath = true;
return 0;
}
static const struct kernel_param_ops multipath_always_on_ops = {
.set = multipath_always_on_set,
.get = param_get_bool,
};
module_param_cb(multipath_always_on, &multipath_always_on_ops,
&multipath_always_on, 0444);
MODULE_PARM_DESC(multipath_always_on,
"create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");
static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
@ -442,7 +493,17 @@ static bool nvme_available_path(struct nvme_ns_head *head)
break;
}
}
return false;
/*
* If "head->delayed_removal_secs" is configured (i.e., non-zero), do
* not immediately fail I/O. Instead, requeue the I/O for the configured
* duration, anticipating that if there's a transient link failure then
* it may recover within this time window. This parameter is exported to
* userspace via sysfs, and its default value is zero. It is internally
* mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
* non-zero, this flag is set to true. When zero, the flag is cleared.
*/
return nvme_mpath_queue_if_no_path(head);
}
static void nvme_ns_head_submit_bio(struct bio *bio)
@ -617,6 +678,40 @@ static void nvme_requeue_work(struct work_struct *work)
}
}
static void nvme_remove_head(struct nvme_ns_head *head)
{
if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
/*
* requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
* to allow multipath to fail all I/O.
*/
kblockd_schedule_work(&head->requeue_work);
nvme_cdev_del(&head->cdev, &head->cdev_device);
synchronize_srcu(&head->srcu);
del_gendisk(head->disk);
nvme_put_ns_head(head);
}
}
static void nvme_remove_head_work(struct work_struct *work)
{
struct nvme_ns_head *head = container_of(to_delayed_work(work),
struct nvme_ns_head, remove_work);
bool remove = false;
mutex_lock(&head->subsys->lock);
if (list_empty(&head->list)) {
list_del_init(&head->entry);
remove = true;
}
mutex_unlock(&head->subsys->lock);
if (remove)
nvme_remove_head(head);
module_put(THIS_MODULE);
}
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
struct queue_limits lim;
@ -626,14 +721,25 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
spin_lock_init(&head->requeue_lock);
INIT_WORK(&head->requeue_work, nvme_requeue_work);
INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
head->delayed_removal_secs = 0;
/*
* Add a multipath node if the subsystem supports multiple controllers.
* We also do this for private namespaces as the namespace sharing flag
* could change after a rescan.
* If "multipath_always_on" is enabled, a multipath node is added
* regardless of whether the disk is single/multi ported, and whether
* the namespace is shared or private. If "multipath_always_on" is not
* enabled, a multipath node is added only if the subsystem supports
* multiple controllers and the "multipath" option is configured. In
* either case, for private namespaces, we ensure that the NSID is
* unique.
*/
if (!multipath_always_on) {
if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
!nvme_is_unique_nsid(ctrl, head) || !multipath)
!multipath)
return 0;
}
if (!nvme_is_unique_nsid(ctrl, head))
return 0;
blk_set_stacking_limits(&lim);
@ -660,6 +766,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
sprintf(head->disk->disk_name, "nvme%dn%d",
ctrl->subsys->instance, head->instance);
nvme_tryget_ns_head(head);
return 0;
}
@ -1016,6 +1123,49 @@ static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr
}
DEVICE_ATTR_RO(numa_nodes);
static ssize_t delayed_removal_secs_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
struct nvme_ns_head *head = disk->private_data;
int ret;
mutex_lock(&head->subsys->lock);
ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
mutex_unlock(&head->subsys->lock);
return ret;
}
static ssize_t delayed_removal_secs_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct gendisk *disk = dev_to_disk(dev);
struct nvme_ns_head *head = disk->private_data;
unsigned int sec;
int ret;
ret = kstrtouint(buf, 0, &sec);
if (ret < 0)
return ret;
mutex_lock(&head->subsys->lock);
head->delayed_removal_secs = sec;
if (sec)
set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
else
clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
mutex_unlock(&head->subsys->lock);
/*
* Ensure that the update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
* by its readers.
*/
synchronize_srcu(&head->srcu);
return count;
}
DEVICE_ATTR_RW(delayed_removal_secs);
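/*
* Editor's note (usage sketch, sysfs path assumed): the attribute is only
* visible on the multipath (ns head) node, e.g.
* /sys/block/nvmeXnY/delayed_removal_secs. Writing a non-zero number of
* seconds sets NVME_NSHEAD_QUEUE_IF_NO_PATH so that, once the last path
* drops, I/O is requeued for that window instead of failing immediately;
* writing 0 restores the immediate-failure behaviour.
*/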
static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
struct nvme_ana_group_desc *desc, void *data)
{
@ -1137,23 +1287,43 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
#endif
}
void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
if (!head->disk)
return;
if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
nvme_cdev_del(&head->cdev, &head->cdev_device);
bool remove = false;
mutex_lock(&head->subsys->lock);
/*
* requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
* to allow multipath to fail all I/O.
* We are called when all paths have been removed, and at that point
* head->list is expected to be empty. However, nvme_remove_ns() and
* nvme_init_ns_head() can run concurrently and so if head->delayed_removal_secs
* is configured, it is possible that by the time we reach
* this point, head->list may no longer be empty. Therefore, we recheck
* head->list here. If it is no longer empty then we skip enqueuing the
* delayed head removal work.
*/
synchronize_srcu(&head->srcu);
kblockd_schedule_work(&head->requeue_work);
del_gendisk(head->disk);
if (!list_empty(&head->list))
goto out;
if (head->delayed_removal_secs) {
/*
* Ensure that no one can remove this module while the head
* remove work is pending.
*/
if (!try_module_get(THIS_MODULE))
goto out;
queue_delayed_work(nvme_wq, &head->remove_work,
head->delayed_removal_secs * HZ);
} else {
list_del_init(&head->entry);
remove = true;
}
out:
mutex_unlock(&head->subsys->lock);
if (remove)
nvme_remove_head(head);
}
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
if (!head->disk)
return;


@ -497,6 +497,9 @@ struct nvme_ns_head {
struct device cdev_device;
struct gendisk *disk;
u16 nr_plids;
u16 *plids;
#ifdef CONFIG_NVME_MULTIPATH
struct bio_list requeue_list;
spinlock_t requeue_lock;
@ -504,7 +507,10 @@ struct nvme_ns_head {
struct work_struct partition_scan_work;
struct mutex lock;
unsigned long flags;
struct delayed_work remove_work;
unsigned int delayed_removal_secs;
#define NVME_NSHEAD_DISK_LIVE 0
#define NVME_NSHEAD_QUEUE_IF_NO_PATH 1
struct nvme_ns __rcu *current_path[];
#endif
};
@ -897,10 +903,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
int qid, nvme_submit_flags_t flags);
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
u32 *result);
void *result);
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
u32 *result);
void *result);
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
@ -961,7 +967,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns);
void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns);
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
void nvme_mpath_put_disk(struct nvme_ns_head *head);
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
void nvme_mpath_update(struct nvme_ctrl *ctrl);
@ -970,7 +976,7 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl);
bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
void nvme_mpath_revalidate_paths(struct nvme_ns *ns);
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
void nvme_mpath_shutdown_disk(struct nvme_ns_head *head);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
void nvme_mpath_start_request(struct request *rq);
void nvme_mpath_end_request(struct request *rq);
@ -987,12 +993,19 @@ extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute dev_attr_queue_depth;
extern struct device_attribute dev_attr_numa_nodes;
extern struct device_attribute dev_attr_delayed_removal_secs;
extern struct device_attribute subsys_attr_iopolicy;
static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
{
return disk->fops == &nvme_ns_head_ops;
}
static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
{
if (test_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags))
return true;
return false;
}
#else
#define multipath false
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
@ -1013,7 +1026,7 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
}
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
static inline void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
}
static inline void nvme_mpath_add_sysfs_link(struct nvme_ns *ns)
@ -1032,7 +1045,7 @@ static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
}
static inline void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
}
static inline void nvme_trace_bio_complete(struct request *req)
@ -1080,6 +1093,10 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
{
return false;
}
static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
{
return false;
}
#endif /* CONFIG_NVME_MULTIPATH */
int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],


@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/nodemask.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/suspend.h>
@ -34,16 +35,31 @@
#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
#define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc))
/* Optimisation for I/Os between 4k and 128k */
#define NVME_SMALL_POOL_SIZE 256
/*
* These can be higher, but we need to ensure that any command doesn't
* require an sg allocation that needs more than a page of data.
*/
#define NVME_MAX_KB_SZ 8192
#define NVME_MAX_SEGS 128
#define NVME_MAX_META_SEGS 15
#define NVME_MAX_NR_ALLOCATIONS 5
#define NVME_MAX_NR_DESCRIPTORS 5
/*
* For data SGLs we support a single descriptor's worth of SGL entries, but for
* now we also limit it to avoid an allocation larger than PAGE_SIZE for the
* scatterlist.
*/
#define NVME_MAX_SEGS \
min(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc), \
(PAGE_SIZE / sizeof(struct scatterlist)))
/*
* For metadata SGLs, only the small descriptor is supported, and the first
* entry is the segment descriptor, which for the data pointer sits in the SQE.
*/
#define NVME_MAX_META_SEGS \
((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1)
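/*
* Editor's note (worked arithmetic, struct sizes assumed): an NVMe SGL
* descriptor is 16 bytes and NVME_CTRL_PAGE_SIZE is 4096, so
* NVME_MAX_META_SEGS evaluates to 256 / 16 - 1 = 15 and the first operand
* of NVME_MAX_SEGS to 4096 / 16 = 256; with 4K pages and a roughly 32-byte
* struct scatterlist the min() caps NVME_MAX_SEGS at 128, matching the
* previous hard-coded limits of 15 and 128.
*/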
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0444);
@ -112,6 +128,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static void nvme_delete_io_queues(struct nvme_dev *dev);
static void nvme_update_attrs(struct nvme_dev *dev);
struct nvme_descriptor_pools {
struct dma_pool *large;
struct dma_pool *small;
};
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
@ -121,8 +142,6 @@ struct nvme_dev {
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
struct device *dev;
struct dma_pool *prp_page_pool;
struct dma_pool *prp_small_pool;
unsigned online_queues;
unsigned max_qid;
unsigned io_queues[HCTX_MAX_TYPES];
@ -162,6 +181,7 @@ struct nvme_dev {
unsigned int nr_allocated_queues;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
struct nvme_descriptor_pools descriptor_pools[];
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@ -191,6 +211,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
*/
struct nvme_queue {
struct nvme_dev *dev;
struct nvme_descriptor_pools descriptor_pools;
spinlock_t sq_lock;
void *sq_cmds;
/* only used for poll queues: */
@ -219,30 +240,30 @@ struct nvme_queue {
struct completion delete_done;
};
union nvme_descriptor {
struct nvme_sgl_desc *sg_list;
__le64 *prp_list;
/* bits for iod->flags */
enum nvme_iod_flags {
/* this command has been aborted by the timeout handler */
IOD_ABORTED = 1U << 0,
/* uses the small descriptor pool */
IOD_SMALL_DESCRIPTOR = 1U << 1,
};
/*
* The nvme_iod describes the data in an I/O.
*
* The sg pointer contains the list of PRP/SGL chunk allocations in addition
* to the actual struct scatterlist.
*/
struct nvme_iod {
struct nvme_request req;
struct nvme_command cmd;
bool aborted;
s8 nr_allocations; /* PRP list pool allocations. 0 means small
pool in use */
u8 flags;
u8 nr_descriptors;
unsigned int dma_len; /* length of single DMA segment mapping */
dma_addr_t first_dma;
dma_addr_t meta_dma;
struct sg_table sgt;
struct sg_table meta_sgt;
union nvme_descriptor meta_list;
union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
struct nvme_sgl_desc *meta_descriptor;
void *descriptors[NVME_MAX_NR_DESCRIPTORS];
};
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
@ -397,28 +418,76 @@ static __always_inline int nvme_pci_npages_prp(void)
return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
}
static struct nvme_descriptor_pools *
nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node)
{
struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node];
size_t small_align = NVME_SMALL_POOL_SIZE;
if (pools->small)
return pools; /* already initialized */
pools->large = dma_pool_create_node("nvme descriptor page", dev->dev,
NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0, numa_node);
if (!pools->large)
return ERR_PTR(-ENOMEM);
if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
small_align = 512;
pools->small = dma_pool_create_node("nvme descriptor small", dev->dev,
NVME_SMALL_POOL_SIZE, small_align, 0, numa_node);
if (!pools->small) {
dma_pool_destroy(pools->large);
pools->large = NULL;
return ERR_PTR(-ENOMEM);
}
return pools;
}
static void nvme_release_descriptor_pools(struct nvme_dev *dev)
{
unsigned i;
for (i = 0; i < nr_node_ids; i++) {
struct nvme_descriptor_pools *pools = &dev->descriptor_pools[i];
dma_pool_destroy(pools->large);
dma_pool_destroy(pools->small);
}
}
static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data,
unsigned qid)
{
struct nvme_dev *dev = to_nvme_dev(data);
struct nvme_queue *nvmeq = &dev->queues[qid];
struct nvme_descriptor_pools *pools;
struct blk_mq_tags *tags;
tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0];
WARN_ON(tags != hctx->tags);
pools = nvme_setup_descriptor_pools(dev, hctx->numa_node);
if (IS_ERR(pools))
return PTR_ERR(pools);
nvmeq->descriptor_pools = *pools;
hctx->driver_data = nvmeq;
return 0;
}
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = to_nvme_dev(data);
struct nvme_queue *nvmeq = &dev->queues[0];
WARN_ON(hctx_idx != 0);
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
hctx->driver_data = nvmeq;
return 0;
return nvme_init_hctx_common(hctx, data, 0);
}
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = to_nvme_dev(data);
struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
hctx->driver_data = nvmeq;
return 0;
return nvme_init_hctx_common(hctx, data, hctx_idx + 1);
}
static int nvme_pci_init_request(struct blk_mq_tag_set *set,
@ -537,23 +606,39 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
return true;
}
static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
struct nvme_iod *iod)
{
if (iod->flags & IOD_SMALL_DESCRIPTOR)
return nvmeq->descriptor_pools.small;
return nvmeq->descriptor_pools.large;
}
static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req)
{
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
dma_addr_t dma_addr = iod->first_dma;
int i;
for (i = 0; i < iod->nr_allocations; i++) {
__le64 *prp_list = iod->list[i].prp_list;
if (iod->nr_descriptors == 1) {
dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0],
dma_addr);
return;
}
for (i = 0; i < iod->nr_descriptors; i++) {
__le64 *prp_list = iod->descriptors[i];
dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
dma_pool_free(nvmeq->descriptor_pools.large, prp_list,
dma_addr);
dma_addr = next_dma_addr;
}
}
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq,
struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@ -566,15 +651,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
WARN_ON_ONCE(!iod->sgt.nents);
dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
if (iod->nr_allocations == 0)
dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
iod->first_dma);
else if (iod->nr_allocations == 1)
dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
iod->first_dma);
else
nvme_free_prps(dev, req);
nvme_free_descriptors(nvmeq, req);
mempool_free(iod->sgt.sgl, dev->iod_mempool);
}
@ -592,11 +669,10 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents)
}
}
static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq,
struct request *req, struct nvme_rw_command *cmnd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct dma_pool *pool;
int length = blk_rq_payload_bytes(req);
struct scatterlist *sg = iod->sgt.sgl;
int dma_len = sg_dma_len(sg);
@ -604,7 +680,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
__le64 *prp_list;
dma_addr_t prp_dma;
int nprps, i;
int i;
length -= (NVME_CTRL_PAGE_SIZE - offset);
if (length <= 0) {
@ -626,30 +702,26 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
goto done;
}
nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
if (nprps <= (256 / 8)) {
pool = dev->prp_small_pool;
iod->nr_allocations = 0;
} else {
pool = dev->prp_page_pool;
iod->nr_allocations = 1;
}
if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
NVME_SMALL_POOL_SIZE / sizeof(__le64))
iod->flags |= IOD_SMALL_DESCRIPTOR;
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
if (!prp_list) {
iod->nr_allocations = -1;
prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
&prp_dma);
if (!prp_list)
return BLK_STS_RESOURCE;
}
iod->list[0].prp_list = prp_list;
iod->descriptors[iod->nr_descriptors++] = prp_list;
iod->first_dma = prp_dma;
i = 0;
for (;;) {
if (i == NVME_CTRL_PAGE_SIZE >> 3) {
__le64 *old_prp_list = prp_list;
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
GFP_ATOMIC, &prp_dma);
if (!prp_list)
goto free_prps;
iod->list[iod->nr_allocations++].prp_list = prp_list;
iod->descriptors[iod->nr_descriptors++] = prp_list;
prp_list[0] = old_prp_list[i - 1];
old_prp_list[i - 1] = cpu_to_le64(prp_dma);
i = 1;
@ -673,7 +745,7 @@ done:
cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
return BLK_STS_OK;
free_prps:
nvme_free_prps(dev, req);
nvme_free_descriptors(nvmeq, req);
return BLK_STS_RESOURCE;
bad_sgl:
WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
@ -698,11 +770,10 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}
static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq,
struct request *req, struct nvme_rw_command *cmd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct dma_pool *pool;
struct nvme_sgl_desc *sg_list;
struct scatterlist *sg = iod->sgt.sgl;
unsigned int entries = iod->sgt.nents;
@ -717,21 +788,14 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
return BLK_STS_OK;
}
if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
pool = dev->prp_small_pool;
iod->nr_allocations = 0;
} else {
pool = dev->prp_page_pool;
iod->nr_allocations = 1;
}
if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list))
iod->flags |= IOD_SMALL_DESCRIPTOR;
sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
if (!sg_list) {
iod->nr_allocations = -1;
sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
&sgl_dma);
if (!sg_list)
return BLK_STS_RESOURCE;
}
iod->list[0].sg_list = sg_list;
iod->descriptors[iod->nr_descriptors++] = sg_list;
iod->first_dma = sgl_dma;
nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
@ -785,12 +849,12 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret = BLK_STS_RESOURCE;
int rc;
if (blk_rq_nr_phys_segments(req) == 1) {
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
@ -825,9 +889,9 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
}
if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
ret = nvme_pci_setup_sgls(nvmeq, req, &cmnd->rw);
else
ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
ret = nvme_pci_setup_prps(nvmeq, req, &cmnd->rw);
if (ret != BLK_STS_OK)
goto out_unmap_sg;
return BLK_STS_OK;
@ -842,6 +906,7 @@ out_free_sg:
static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
struct request *req)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_rw_command *cmnd = &iod->cmd.rw;
struct nvme_sgl_desc *sg_list;
@ -865,12 +930,13 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
if (rc)
goto out_free_sg;
sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC,
&sgl_dma);
if (!sg_list)
goto out_unmap_sg;
entries = iod->meta_sgt.nents;
iod->meta_list.sg_list = sg_list;
iod->meta_descriptor = sg_list;
iod->meta_dma = sgl_dma;
cmnd->flags = NVME_CMD_SGL_METASEG;
@ -912,7 +978,10 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
{
if (nvme_pci_metadata_use_sgls(dev, req))
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
nvme_pci_metadata_use_sgls(dev, req))
return nvme_pci_setup_meta_sgls(dev, req);
return nvme_pci_setup_meta_mptr(dev, req);
}
@ -922,8 +991,8 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret;
iod->aborted = false;
iod->nr_allocations = -1;
iod->flags = 0;
iod->nr_descriptors = 0;
iod->sgt.nents = 0;
iod->meta_sgt.nents = 0;
@ -947,7 +1016,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
return BLK_STS_OK;
out_unmap_data:
if (blk_rq_nr_phys_segments(req))
nvme_unmap_data(dev, req);
nvme_unmap_data(dev, req->mq_hctx->driver_data, req);
out_free_cmd:
nvme_cleanup_cmd(req);
return ret;
@ -1037,6 +1106,7 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
}
static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
struct nvme_queue *nvmeq,
struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@ -1048,7 +1118,7 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
return;
}
dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list,
dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor,
iod->meta_dma);
dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
@ -1060,10 +1130,10 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req)
struct nvme_dev *dev = nvmeq->dev;
if (blk_integrity_rq(req))
nvme_unmap_metadata(dev, req);
nvme_unmap_metadata(dev, nvmeq, req);
if (blk_rq_nr_phys_segments(req))
nvme_unmap_data(dev, req);
nvme_unmap_data(dev, nvmeq, req);
}
static void nvme_pci_complete_rq(struct request *req)
@ -1490,7 +1560,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
* returned to the driver, or if this is the admin queue.
*/
opcode = nvme_req(req)->cmd->common.opcode;
if (!nvmeq->qid || iod->aborted) {
if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) {
dev_warn(dev->ctrl.device,
"I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
req->tag, nvme_cid(req), opcode,
@ -1503,7 +1573,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
atomic_inc(&dev->ctrl.abort_limit);
return BLK_EH_RESET_TIMER;
}
iod->aborted = true;
iod->flags |= IOD_ABORTED;
cmd.abort.opcode = nvme_admin_abort_cmd;
cmd.abort.cid = nvme_cid(req);
@ -2842,35 +2912,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
return 0;
}
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
size_t small_align = 256;
dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
NVME_CTRL_PAGE_SIZE,
NVME_CTRL_PAGE_SIZE, 0);
if (!dev->prp_page_pool)
return -ENOMEM;
if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
small_align = 512;
/* Optimisation for I/Os between 4k and 128k */
dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
256, small_align, 0);
if (!dev->prp_small_pool) {
dma_pool_destroy(dev->prp_page_pool);
return -ENOMEM;
}
return 0;
}
static void nvme_release_prp_pools(struct nvme_dev *dev)
{
dma_pool_destroy(dev->prp_page_pool);
dma_pool_destroy(dev->prp_small_pool);
}
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
@ -3185,7 +3226,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
struct nvme_dev *dev;
int ret = -ENOMEM;
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
dev = kzalloc_node(struct_size(dev, descriptor_pools, nr_node_ids),
GFP_KERNEL, node);
if (!dev)
return ERR_PTR(-ENOMEM);
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
@ -3260,13 +3302,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result)
goto out_uninit_ctrl;
result = nvme_setup_prp_pools(dev);
if (result)
goto out_dev_unmap;
result = nvme_pci_alloc_iod_mempool(dev);
if (result)
goto out_release_prp_pools;
goto out_dev_unmap;
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
@ -3342,8 +3380,6 @@ out_disable:
out_release_iod_mempool:
mempool_destroy(dev->iod_mempool);
mempool_destroy(dev->iod_meta_mempool);
out_release_prp_pools:
nvme_release_prp_pools(dev);
out_dev_unmap:
nvme_dev_unmap(dev);
out_uninit_ctrl:
@ -3408,7 +3444,7 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_free_queues(dev, 0);
mempool_destroy(dev->iod_mempool);
mempool_destroy(dev->iod_meta_mempool);
nvme_release_prp_pools(dev);
nvme_release_descriptor_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
}
@ -3809,9 +3845,7 @@ static int __init nvme_init(void)
BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE);
BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE);
BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS);
BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_DESCRIPTORS);
return pci_register_driver(&nvme_driver);
}


@ -260,6 +260,7 @@ static struct attribute *nvme_ns_attrs[] = {
&dev_attr_ana_state.attr,
&dev_attr_queue_depth.attr,
&dev_attr_numa_nodes.attr,
&dev_attr_delayed_removal_secs.attr,
#endif
&dev_attr_io_passthru_err_log_enabled.attr,
NULL,
@ -296,6 +297,12 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
if (nvme_disk_is_ns_head(dev_to_disk(dev)))
return 0;
}
if (a == &dev_attr_delayed_removal_secs.attr) {
struct gendisk *disk = dev_to_disk(dev);
if (!nvme_disk_is_ns_head(disk))
return 0;
}
#endif
return a->mode;
}


@ -403,7 +403,7 @@ static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
}
static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
bool sync, bool last)
bool last)
{
struct nvme_tcp_queue *queue = req->queue;
bool empty;
@ -417,7 +417,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
* are on the same cpu, so we don't introduce contention.
*/
if (queue->io_cpu == raw_smp_processor_id() &&
sync && empty && mutex_trylock(&queue->send_mutex)) {
empty && mutex_trylock(&queue->send_mutex)) {
nvme_tcp_send_all(queue);
mutex_unlock(&queue->send_mutex);
}
@ -770,7 +770,9 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
req->ttag = pdu->ttag;
nvme_tcp_setup_h2c_data_pdu(req);
nvme_tcp_queue_request(req, false, true);
llist_add(&req->lentry, &queue->req_list);
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
return 0;
}
@ -2385,7 +2387,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
if (ret)
return ret;
if (ctrl->opts && ctrl->opts->concat && !ctrl->tls_pskid) {
if (ctrl->opts->concat && !ctrl->tls_pskid) {
/* See comments for nvme_tcp_key_revoke_needed() */
dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n");
nvme_stop_keep_alive(ctrl);
@ -2637,7 +2639,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
nvme_tcp_queue_request(&ctrl->async_req, true, true);
nvme_tcp_queue_request(&ctrl->async_req, true);
}
static void nvme_tcp_complete_timed_out(struct request *rq)
@ -2789,7 +2791,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
nvme_start_request(rq);
nvme_tcp_queue_request(req, true, bd->last);
nvme_tcp_queue_request(req, bd->last);
return BLK_STS_OK;
}


@ -63,14 +63,9 @@ static void nvmet_execute_create_sq(struct nvmet_req *req)
if (status != NVME_SC_SUCCESS)
goto complete;
/*
* Note: The NVMe specification allows multiple SQs to use the same CQ.
* However, the target code does not really support that. So for now,
* prevent this and fail the command if sqid and cqid are different.
*/
if (!cqid || cqid != sqid) {
pr_err("SQ %u: Unsupported CQID %u\n", sqid, cqid);
status = NVME_SC_CQ_INVALID | NVME_STATUS_DNR;
status = nvmet_check_io_cqid(ctrl, cqid, false);
if (status != NVME_SC_SUCCESS) {
pr_err("SQ %u: Invalid CQID %u\n", sqid, cqid);
goto complete;
}
@ -79,7 +74,7 @@ static void nvmet_execute_create_sq(struct nvmet_req *req)
goto complete;
}
status = ctrl->ops->create_sq(ctrl, sqid, sq_flags, qsize, prp1);
status = ctrl->ops->create_sq(ctrl, sqid, cqid, sq_flags, qsize, prp1);
complete:
nvmet_req_complete(req, status);
@ -96,15 +91,16 @@ static void nvmet_execute_delete_cq(struct nvmet_req *req)
goto complete;
}
if (!cqid) {
status = nvmet_check_io_cqid(ctrl, cqid, false);
if (status != NVME_SC_SUCCESS)
goto complete;
if (!ctrl->cqs[cqid] || nvmet_cq_in_use(ctrl->cqs[cqid])) {
/* Some SQs are still using this CQ */
status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
goto complete;
}
status = nvmet_check_cqid(ctrl, cqid);
if (status != NVME_SC_SUCCESS)
goto complete;
status = ctrl->ops->delete_cq(ctrl, cqid);
complete:
@ -127,12 +123,7 @@ static void nvmet_execute_create_cq(struct nvmet_req *req)
goto complete;
}
if (!cqid) {
status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
goto complete;
}
status = nvmet_check_cqid(ctrl, cqid);
status = nvmet_check_io_cqid(ctrl, cqid, true);
if (status != NVME_SC_SUCCESS)
goto complete;


@ -280,9 +280,12 @@ void nvmet_destroy_auth(struct nvmet_ctrl *ctrl)
bool nvmet_check_auth_status(struct nvmet_req *req)
{
if (req->sq->ctrl->host_key &&
!req->sq->authenticated)
if (req->sq->ctrl->host_key) {
if (req->sq->qid > 0)
return true;
if (!req->sq->authenticated)
return false;
}
return true;
}
@ -290,7 +293,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
unsigned int shash_len)
{
struct crypto_shash *shash_tfm;
struct shash_desc *shash;
SHASH_DESC_ON_STACK(shash, shash_tfm);
struct nvmet_ctrl *ctrl = req->sq->ctrl;
const char *hash_name;
u8 *challenge = req->sq->dhchap_c1;
@ -342,19 +345,13 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
req->sq->dhchap_c1,
challenge, shash_len);
if (ret)
goto out_free_challenge;
goto out;
}
pr_debug("ctrl %d qid %d host response seq %u transaction %d\n",
ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
req->sq->dhchap_tid);
shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
GFP_KERNEL);
if (!shash) {
ret = -ENOMEM;
goto out_free_challenge;
}
shash->tfm = shash_tfm;
ret = crypto_shash_init(shash);
if (ret)
@ -389,8 +386,6 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
goto out;
ret = crypto_shash_final(shash, response);
out:
kfree(shash);
out_free_challenge:
if (challenge != req->sq->dhchap_c1)
kfree(challenge);
out_free_response:


@ -813,11 +813,43 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status)
}
EXPORT_SYMBOL_GPL(nvmet_req_complete);
void nvmet_cq_init(struct nvmet_cq *cq)
{
refcount_set(&cq->ref, 1);
}
EXPORT_SYMBOL_GPL(nvmet_cq_init);
bool nvmet_cq_get(struct nvmet_cq *cq)
{
return refcount_inc_not_zero(&cq->ref);
}
EXPORT_SYMBOL_GPL(nvmet_cq_get);
void nvmet_cq_put(struct nvmet_cq *cq)
{
if (refcount_dec_and_test(&cq->ref))
nvmet_cq_destroy(cq);
}
EXPORT_SYMBOL_GPL(nvmet_cq_put);
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
u16 qid, u16 size)
{
cq->qid = qid;
cq->size = size;
ctrl->cqs[qid] = cq;
}
void nvmet_cq_destroy(struct nvmet_cq *cq)
{
struct nvmet_ctrl *ctrl = cq->ctrl;
if (ctrl) {
ctrl->cqs[cq->qid] = NULL;
nvmet_ctrl_put(cq->ctrl);
cq->ctrl = NULL;
}
}
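/*
* Editor's note (summary of the new lifecycle, not part of this change):
* nvmet_cq_init() starts the CQ refcount at 1, each SQ takes a reference in
* nvmet_sq_init() and drops it in nvmet_sq_destroy(), so nvmet_cq_in_use()
* reads as true while any SQ is still attached, which is what delete-CQ uses
* to refuse tearing down a CQ that still has SQs bound to it.
*/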
void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
@ -837,37 +869,47 @@ static void nvmet_confirm_sq(struct percpu_ref *ref)
complete(&sq->confirm_done);
}
u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid)
u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create)
{
if (!ctrl->sqs)
if (!ctrl->cqs)
return NVME_SC_INTERNAL | NVME_STATUS_DNR;
if (cqid > ctrl->subsys->max_qid)
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
/*
* Note: For PCI controllers, the NVMe specification allows multiple
* SQs to share a single CQ. However, we do not support this yet, so
* check that there is no SQ defined for a CQ. If one exists, then the
* CQ ID is invalid for creation as well as when the CQ is being
* deleted (as that would mean that the SQ was not deleted before the
* CQ).
*/
if (ctrl->sqs[cqid])
if ((create && ctrl->cqs[cqid]) || (!create && !ctrl->cqs[cqid]))
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
return NVME_SC_SUCCESS;
}
u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create)
{
if (!cqid)
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
return nvmet_check_cqid(ctrl, cqid, create);
}
bool nvmet_cq_in_use(struct nvmet_cq *cq)
{
return refcount_read(&cq->ref) > 1;
}
EXPORT_SYMBOL_GPL(nvmet_cq_in_use);
u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
u16 qid, u16 size)
{
u16 status;
status = nvmet_check_cqid(ctrl, qid);
status = nvmet_check_cqid(ctrl, qid, true);
if (status != NVME_SC_SUCCESS)
return status;
if (!kref_get_unless_zero(&ctrl->ref))
return NVME_SC_INTERNAL | NVME_STATUS_DNR;
cq->ctrl = ctrl;
nvmet_cq_init(cq);
nvmet_cq_setup(ctrl, cq, qid, size);
return NVME_SC_SUCCESS;
@ -891,7 +933,7 @@ u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid,
}
u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
u16 sqid, u16 size)
struct nvmet_cq *cq, u16 sqid, u16 size)
{
u16 status;
int ret;
@ -903,7 +945,7 @@ u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
if (status != NVME_SC_SUCCESS)
return status;
ret = nvmet_sq_init(sq);
ret = nvmet_sq_init(sq, cq);
if (ret) {
status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
goto ctrl_put;
@ -935,6 +977,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq)
wait_for_completion(&sq->free_done);
percpu_ref_exit(&sq->ref);
nvmet_auth_sq_free(sq);
nvmet_cq_put(sq->cq);
/*
* we must reference the ctrl again after waiting for inflight IO
@ -967,18 +1010,23 @@ static void nvmet_sq_free(struct percpu_ref *ref)
complete(&sq->free_done);
}
int nvmet_sq_init(struct nvmet_sq *sq)
int nvmet_sq_init(struct nvmet_sq *sq, struct nvmet_cq *cq)
{
int ret;
if (!nvmet_cq_get(cq))
return -EINVAL;
ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
if (ret) {
pr_err("percpu_ref init failed!\n");
nvmet_cq_put(cq);
return ret;
}
init_completion(&sq->free_done);
init_completion(&sq->confirm_done);
nvmet_auth_sq_init(sq);
sq->cq = cq;
return 0;
}
@ -1108,13 +1156,13 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
return ret;
}
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_sq *sq,
const struct nvmet_fabrics_ops *ops)
{
u8 flags = req->cmd->common.flags;
u16 status;
req->cq = cq;
req->cq = sq->cq;
req->sq = sq;
req->ops = ops;
req->sg = NULL;
@ -1612,12 +1660,17 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
if (!ctrl->sqs)
goto out_free_changed_ns_list;
ctrl->cqs = kcalloc(subsys->max_qid + 1, sizeof(struct nvmet_cq *),
GFP_KERNEL);
if (!ctrl->cqs)
goto out_free_sqs;
ret = ida_alloc_range(&cntlid_ida,
subsys->cntlid_min, subsys->cntlid_max,
GFP_KERNEL);
if (ret < 0) {
args->status = NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
goto out_free_sqs;
goto out_free_cqs;
}
ctrl->cntlid = ret;
@ -1676,6 +1729,8 @@ init_pr_fail:
mutex_unlock(&subsys->lock);
nvmet_stop_keep_alive_timer(ctrl);
ida_free(&cntlid_ida, ctrl->cntlid);
out_free_cqs:
kfree(ctrl->cqs);
out_free_sqs:
kfree(ctrl->sqs);
out_free_changed_ns_list:
@ -1712,6 +1767,7 @@ static void nvmet_ctrl_free(struct kref *ref)
nvmet_async_events_free(ctrl);
kfree(ctrl->sqs);
kfree(ctrl->cqs);
kfree(ctrl->changed_ns_list);
kfree(ctrl);


@ -119,7 +119,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr,
memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE);
memcpy(e->traddr, traddr, NVMF_TRADDR_SIZE);
memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE);
strncpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
strscpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
}
/*


@ -208,6 +208,14 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
return NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
}
kref_get(&ctrl->ref);
old = cmpxchg(&req->cq->ctrl, NULL, ctrl);
if (old) {
pr_warn("queue already connected!\n");
req->error_loc = offsetof(struct nvmf_connect_command, opcode);
return NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
}
/* note: convert queue size from 0's-based value to 1's-based value */
nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1);
nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1);
@ -239,8 +247,8 @@ static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq)
bool needs_auth = nvmet_has_auth(ctrl, sq);
key_serial_t keyid = nvmet_queue_tls_keyid(sq);
/* Do not authenticate I/O queues for secure concatenation */
if (ctrl->concat && sq->qid)
/* Do not authenticate I/O queues */
if (sq->qid)
needs_auth = false;
if (keyid)


@ -816,7 +816,8 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
nvmet_fc_prep_fcp_iodlist(assoc->tgtport, queue);
ret = nvmet_sq_init(&queue->nvme_sq);
nvmet_cq_init(&queue->nvme_cq);
ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
if (ret)
goto out_fail_iodlist;
@ -826,6 +827,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
return queue;
out_fail_iodlist:
nvmet_cq_put(&queue->nvme_cq);
nvmet_fc_destroy_fcp_iodlist(assoc->tgtport, queue);
destroy_workqueue(queue->work_q);
out_free_queue:
@ -934,6 +936,7 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
flush_workqueue(queue->work_q);
nvmet_sq_destroy(&queue->nvme_sq);
nvmet_cq_put(&queue->nvme_cq);
nvmet_fc_tgt_q_put(queue);
}
@ -1254,6 +1257,7 @@ nvmet_fc_portentry_bind(struct nvmet_fc_tgtport *tgtport,
{
lockdep_assert_held(&nvmet_fc_tgtlock);
nvmet_fc_tgtport_get(tgtport);
pe->tgtport = tgtport;
tgtport->pe = pe;
@ -1273,8 +1277,10 @@ nvmet_fc_portentry_unbind(struct nvmet_fc_port_entry *pe)
unsigned long flags;
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
if (pe->tgtport)
if (pe->tgtport) {
nvmet_fc_tgtport_put(pe->tgtport);
pe->tgtport->pe = NULL;
}
list_del(&pe->pe_list);
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
}
@ -1292,8 +1298,10 @@ nvmet_fc_portentry_unbind_tgt(struct nvmet_fc_tgtport *tgtport)
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
pe = tgtport->pe;
if (pe)
if (pe) {
nvmet_fc_tgtport_put(pe->tgtport);
pe->tgtport = NULL;
}
tgtport->pe = NULL;
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
}
@ -1316,6 +1324,9 @@ nvmet_fc_portentry_rebind_tgt(struct nvmet_fc_tgtport *tgtport)
list_for_each_entry(pe, &nvmet_fc_portentry_list, pe_list) {
if (tgtport->fc_target_port.node_name == pe->node_name &&
tgtport->fc_target_port.port_name == pe->port_name) {
if (!nvmet_fc_tgtport_get(tgtport))
continue;
WARN_ON(pe->tgtport);
tgtport->pe = pe;
pe->tgtport = tgtport;
@ -1580,6 +1591,39 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
}
static void
nvmet_fc_free_pending_reqs(struct nvmet_fc_tgtport *tgtport)
{
struct nvmet_fc_ls_req_op *lsop;
struct nvmefc_ls_req *lsreq;
struct nvmet_fc_ls_iod *iod;
int i;
iod = tgtport->iod;
for (i = 0; i < NVMET_LS_CTX_COUNT; iod++, i++)
cancel_work(&iod->work);
/*
* After this point the connection is lost and thus any pending
* request can't be processed by the normal completion path. This
* is likely a request from nvmet_fc_send_ls_req_async.
*/
while ((lsop = list_first_entry_or_null(&tgtport->ls_req_list,
struct nvmet_fc_ls_req_op, lsreq_list))) {
list_del(&lsop->lsreq_list);
if (!lsop->req_queued)
continue;
lsreq = &lsop->ls_req;
fc_dma_unmap_single(tgtport->dev, lsreq->rqstdma,
(lsreq->rqstlen + lsreq->rsplen),
DMA_BIDIRECTIONAL);
nvmet_fc_tgtport_put(tgtport);
kfree(lsop);
}
}
/**
* nvmet_fc_unregister_targetport - transport entry point called by an
* LLDD to deregister/remove a previously
@ -1608,13 +1652,7 @@ nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *target_port)
flush_workqueue(nvmet_wq);
/*
* should terminate LS's as well. However, LS's will be generated
* at the tail end of association termination, so they likely don't
* exist yet. And even if they did, it's worthwhile to just let
* them finish and targetport ref counting will clean things up.
*/
nvmet_fc_free_pending_reqs(tgtport);
nvmet_fc_tgtport_put(tgtport);
return 0;
@ -2531,9 +2569,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
fod->data_sg = NULL;
fod->data_sg_cnt = 0;
ret = nvmet_req_init(&fod->req,
&fod->queue->nvme_cq,
&fod->queue->nvme_sq,
ret = nvmet_req_init(&fod->req, &fod->queue->nvme_sq,
&nvmet_fc_tgt_fcp_ops);
if (!ret) {
/* bad SQE content or invalid ctrl state */
@ -2860,12 +2896,17 @@ nvmet_fc_add_port(struct nvmet_port *port)
list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) {
if ((tgtport->fc_target_port.node_name == traddr.nn) &&
(tgtport->fc_target_port.port_name == traddr.pn)) {
if (!nvmet_fc_tgtport_get(tgtport))
continue;
/* a FC port can only be 1 nvmet port id */
if (!tgtport->pe) {
nvmet_fc_portentry_bind(tgtport, pe, port);
ret = 0;
} else
ret = -EALREADY;
nvmet_fc_tgtport_put(tgtport);
break;
}
}
@ -2881,11 +2922,21 @@ static void
nvmet_fc_remove_port(struct nvmet_port *port)
{
struct nvmet_fc_port_entry *pe = port->priv;
struct nvmet_fc_tgtport *tgtport = NULL;
unsigned long flags;
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
if (pe->tgtport && nvmet_fc_tgtport_get(pe->tgtport))
tgtport = pe->tgtport;
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
nvmet_fc_portentry_unbind(pe);
if (tgtport) {
/* terminate any outstanding associations */
__nvmet_fc_free_assocs(pe->tgtport);
__nvmet_fc_free_assocs(tgtport);
nvmet_fc_tgtport_put(tgtport);
}
kfree(pe);
}
@ -2894,10 +2945,21 @@ static void
nvmet_fc_discovery_chg(struct nvmet_port *port)
{
struct nvmet_fc_port_entry *pe = port->priv;
struct nvmet_fc_tgtport *tgtport = pe->tgtport;
struct nvmet_fc_tgtport *tgtport = NULL;
unsigned long flags;
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
if (pe->tgtport && nvmet_fc_tgtport_get(pe->tgtport))
tgtport = pe->tgtport;
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
if (!tgtport)
return;
if (tgtport && tgtport->ops->discovery_event)
tgtport->ops->discovery_event(&tgtport->fc_target_port);
nvmet_fc_tgtport_put(tgtport);
}
static ssize_t


@ -207,7 +207,6 @@ static LIST_HEAD(fcloop_nports);
struct fcloop_lport {
struct nvme_fc_local_port *localport;
struct list_head lport_list;
struct completion unreg_done;
refcount_t ref;
};
@ -215,6 +214,9 @@ struct fcloop_lport_priv {
struct fcloop_lport *lport;
};
/* The port is already being removed, avoid double free */
#define PORT_DELETED 0
struct fcloop_rport {
struct nvme_fc_remote_port *remoteport;
struct nvmet_fc_target_port *targetport;
@ -223,6 +225,7 @@ struct fcloop_rport {
spinlock_t lock;
struct list_head ls_list;
struct work_struct ls_work;
unsigned long flags;
};
struct fcloop_tport {
@ -233,6 +236,7 @@ struct fcloop_tport {
spinlock_t lock;
struct list_head ls_list;
struct work_struct ls_work;
unsigned long flags;
};
struct fcloop_nport {
@ -288,6 +292,9 @@ struct fcloop_ini_fcpreq {
spinlock_t inilock;
};
/* SLAB cache for fcloop_lsreq structures */
static struct kmem_cache *lsreq_cache;
static inline struct fcloop_lsreq *
ls_rsp_to_lsreq(struct nvmefc_ls_rsp *lsrsp)
{
@ -338,6 +345,7 @@ fcloop_rport_lsrqst_work(struct work_struct *work)
* callee may free memory containing tls_req.
* do not reference lsreq after this.
*/
kmem_cache_free(lsreq_cache, tls_req);
spin_lock(&rport->lock);
}
@ -349,10 +357,13 @@ fcloop_h2t_ls_req(struct nvme_fc_local_port *localport,
struct nvme_fc_remote_port *remoteport,
struct nvmefc_ls_req *lsreq)
{
struct fcloop_lsreq *tls_req = lsreq->private;
struct fcloop_rport *rport = remoteport->private;
struct fcloop_lsreq *tls_req;
int ret = 0;
tls_req = kmem_cache_alloc(lsreq_cache, GFP_KERNEL);
if (!tls_req)
return -ENOMEM;
tls_req->lsreq = lsreq;
INIT_LIST_HEAD(&tls_req->ls_list);
@ -389,13 +400,16 @@ fcloop_h2t_xmt_ls_rsp(struct nvmet_fc_target_port *targetport,
lsrsp->done(lsrsp);
if (remoteport) {
if (!remoteport) {
kmem_cache_free(lsreq_cache, tls_req);
return 0;
}
rport = remoteport->private;
spin_lock(&rport->lock);
list_add_tail(&tls_req->ls_list, &rport->ls_list);
spin_unlock(&rport->lock);
queue_work(nvmet_wq, &rport->ls_work);
}
return 0;
}
@ -422,6 +436,7 @@ fcloop_tport_lsrqst_work(struct work_struct *work)
* callee may free memory containing tls_req.
* do not reference lsreq after this.
*/
kmem_cache_free(lsreq_cache, tls_req);
spin_lock(&tport->lock);
}
@ -432,8 +447,8 @@ static int
fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle,
struct nvmefc_ls_req *lsreq)
{
struct fcloop_lsreq *tls_req = lsreq->private;
struct fcloop_tport *tport = targetport->private;
struct fcloop_lsreq *tls_req;
int ret = 0;
/*
@ -441,6 +456,10 @@ fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle,
* hosthandle ignored as fcloop currently is
* 1:1 tgtport vs remoteport
*/
tls_req = kmem_cache_alloc(lsreq_cache, GFP_KERNEL);
if (!tls_req)
return -ENOMEM;
tls_req->lsreq = lsreq;
INIT_LIST_HEAD(&tls_req->ls_list);
@ -457,6 +476,9 @@ fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle,
ret = nvme_fc_rcv_ls_req(tport->remoteport, &tls_req->ls_rsp,
lsreq->rqstaddr, lsreq->rqstlen);
if (ret)
kmem_cache_free(lsreq_cache, tls_req);
return ret;
}
@ -471,18 +493,30 @@ fcloop_t2h_xmt_ls_rsp(struct nvme_fc_local_port *localport,
struct nvmet_fc_target_port *targetport = rport->targetport;
struct fcloop_tport *tport;
if (!targetport) {
/*
* The target port is gone. The target doesn't expect any
* response anymore and the ->done call is not valid
* because the resources have been freed by
* nvmet_fc_free_pending_reqs.
*
* We end up here from delete association exchange:
* nvmet_fc_xmt_disconnect_assoc sends an async request.
*/
kmem_cache_free(lsreq_cache, tls_req);
return 0;
}
memcpy(lsreq->rspaddr, lsrsp->rspbuf,
((lsreq->rsplen < lsrsp->rsplen) ?
lsreq->rsplen : lsrsp->rsplen));
lsrsp->done(lsrsp);
if (targetport) {
tport = targetport->private;
spin_lock(&tport->lock);
list_add_tail(&tls_req->ls_list, &tport->ls_list);
spin_unlock(&tport->lock);
queue_work(nvmet_wq, &tport->ls_work);
}
return 0;
}
@ -566,6 +600,7 @@ fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
}
/* release original io reference on tgt struct */
if (tfcp_req)
fcloop_tfcp_req_put(tfcp_req);
}
@ -618,12 +653,13 @@ fcloop_fcp_recv_work(struct work_struct *work)
{
struct fcloop_fcpreq *tfcp_req =
container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
struct nvmefc_fcp_req *fcpreq;
unsigned long flags;
int ret = 0;
bool aborted = false;
spin_lock_irqsave(&tfcp_req->reqlock, flags);
fcpreq = tfcp_req->fcpreq;
switch (tfcp_req->inistate) {
case INI_IO_START:
tfcp_req->inistate = INI_IO_ACTIVE;
@ -638,16 +674,19 @@ fcloop_fcp_recv_work(struct work_struct *work)
}
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
if (unlikely(aborted))
ret = -ECANCELED;
else {
if (likely(!check_for_drop(tfcp_req)))
if (unlikely(aborted)) {
/* the abort handler will call fcloop_call_host_done */
return;
}
if (unlikely(check_for_drop(tfcp_req))) {
pr_info("%s: dropped command ********\n", __func__);
return;
}
ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
&tfcp_req->tgt_fcp_req,
fcpreq->cmdaddr, fcpreq->cmdlen);
else
pr_info("%s: dropped command ********\n", __func__);
}
if (ret)
fcloop_call_host_done(fcpreq, tfcp_req, ret);
}
@ -662,15 +701,17 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
unsigned long flags;
spin_lock_irqsave(&tfcp_req->reqlock, flags);
fcpreq = tfcp_req->fcpreq;
switch (tfcp_req->inistate) {
case INI_IO_ABORTED:
fcpreq = tfcp_req->fcpreq;
tfcp_req->fcpreq = NULL;
break;
case INI_IO_COMPLETED:
completed = true;
break;
default:
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
fcloop_tfcp_req_put(tfcp_req);
WARN_ON(1);
return;
}
@ -686,10 +727,6 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
&tfcp_req->tgt_fcp_req);
spin_lock_irqsave(&tfcp_req->reqlock, flags);
tfcp_req->fcpreq = NULL;
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
/* call_host_done releases reference for abort downcall */
}
@ -958,13 +995,16 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
spin_lock(&inireq->inilock);
tfcp_req = inireq->tfcp_req;
if (tfcp_req)
fcloop_tfcp_req_get(tfcp_req);
if (tfcp_req) {
if (!fcloop_tfcp_req_get(tfcp_req))
tfcp_req = NULL;
}
spin_unlock(&inireq->inilock);
if (!tfcp_req)
if (!tfcp_req) {
/* abort has already been called */
return;
goto out_host_done;
}
/* break initiator/target relationship for io */
spin_lock_irqsave(&tfcp_req->reqlock, flags);
@ -979,7 +1019,7 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
default:
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
WARN_ON(1);
return;
goto out_host_done;
}
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
@ -993,6 +1033,11 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
*/
fcloop_tfcp_req_put(tfcp_req);
}
return;
out_host_done:
fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
}
static void
@ -1019,9 +1064,18 @@ fcloop_lport_get(struct fcloop_lport *lport)
static void
fcloop_nport_put(struct fcloop_nport *nport)
{
unsigned long flags;
if (!refcount_dec_and_test(&nport->ref))
return;
spin_lock_irqsave(&fcloop_lock, flags);
list_del(&nport->nport_list);
spin_unlock_irqrestore(&fcloop_lock, flags);
if (nport->lport)
fcloop_lport_put(nport->lport);
kfree(nport);
}
@ -1037,9 +1091,6 @@ fcloop_localport_delete(struct nvme_fc_local_port *localport)
struct fcloop_lport_priv *lport_priv = localport->private;
struct fcloop_lport *lport = lport_priv->lport;
/* release any threads waiting for the unreg to complete */
complete(&lport->unreg_done);
fcloop_lport_put(lport);
}
@ -1047,8 +1098,18 @@ static void
fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport)
{
struct fcloop_rport *rport = remoteport->private;
bool put_port = false;
unsigned long flags;
flush_work(&rport->ls_work);
spin_lock_irqsave(&fcloop_lock, flags);
if (!test_and_set_bit(PORT_DELETED, &rport->flags))
put_port = true;
rport->nport->rport = NULL;
spin_unlock_irqrestore(&fcloop_lock, flags);
if (put_port)
fcloop_nport_put(rport->nport);
}
@ -1056,8 +1117,18 @@ static void
fcloop_targetport_delete(struct nvmet_fc_target_port *targetport)
{
struct fcloop_tport *tport = targetport->private;
bool put_port = false;
unsigned long flags;
flush_work(&tport->ls_work);
spin_lock_irqsave(&fcloop_lock, flags);
if (!test_and_set_bit(PORT_DELETED, &tport->flags))
put_port = true;
tport->nport->tport = NULL;
spin_unlock_irqrestore(&fcloop_lock, flags);
if (put_port)
fcloop_nport_put(tport->nport);
}
@ -1082,7 +1153,6 @@ static struct nvme_fc_port_template fctemplate = {
/* sizes of additional private data for data structures */
.local_priv_sz = sizeof(struct fcloop_lport_priv),
.remote_priv_sz = sizeof(struct fcloop_rport),
.lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
.fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq),
};
@ -1105,7 +1175,6 @@ static struct nvmet_fc_target_template tgttemplate = {
.target_features = 0,
/* sizes of additional private data for data structures */
.target_priv_sz = sizeof(struct fcloop_tport),
.lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
};
static ssize_t
@ -1170,51 +1239,92 @@ out_free_lport:
}
static int
__wait_localport_unreg(struct fcloop_lport *lport)
__localport_unreg(struct fcloop_lport *lport)
{
int ret;
init_completion(&lport->unreg_done);
ret = nvme_fc_unregister_localport(lport->localport);
if (!ret)
wait_for_completion(&lport->unreg_done);
return ret;
return nvme_fc_unregister_localport(lport->localport);
}
static struct fcloop_nport *
__fcloop_nport_lookup(u64 node_name, u64 port_name)
{
struct fcloop_nport *nport;
list_for_each_entry(nport, &fcloop_nports, nport_list) {
if (nport->node_name != node_name ||
nport->port_name != port_name)
continue;
if (fcloop_nport_get(nport))
return nport;
break;
}
return NULL;
}
static struct fcloop_nport *
fcloop_nport_lookup(u64 node_name, u64 port_name)
{
struct fcloop_nport *nport;
unsigned long flags;
spin_lock_irqsave(&fcloop_lock, flags);
nport = __fcloop_nport_lookup(node_name, port_name);
spin_unlock_irqrestore(&fcloop_lock, flags);
return nport;
}
static struct fcloop_lport *
__fcloop_lport_lookup(u64 node_name, u64 port_name)
{
struct fcloop_lport *lport;
list_for_each_entry(lport, &fcloop_lports, lport_list) {
if (lport->localport->node_name != node_name ||
lport->localport->port_name != port_name)
continue;
if (fcloop_lport_get(lport))
return lport;
break;
}
return NULL;
}
static struct fcloop_lport *
fcloop_lport_lookup(u64 node_name, u64 port_name)
{
struct fcloop_lport *lport;
unsigned long flags;
spin_lock_irqsave(&fcloop_lock, flags);
lport = __fcloop_lport_lookup(node_name, port_name);
spin_unlock_irqrestore(&fcloop_lock, flags);
return lport;
}
static ssize_t
fcloop_delete_local_port(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct fcloop_lport *tlport, *lport = NULL;
struct fcloop_lport *lport;
u64 nodename, portname;
unsigned long flags;
int ret;
ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf);
if (ret)
return ret;
spin_lock_irqsave(&fcloop_lock, flags);
list_for_each_entry(tlport, &fcloop_lports, lport_list) {
if (tlport->localport->node_name == nodename &&
tlport->localport->port_name == portname) {
if (!fcloop_lport_get(tlport))
break;
lport = tlport;
break;
}
}
spin_unlock_irqrestore(&fcloop_lock, flags);
lport = fcloop_lport_lookup(nodename, portname);
if (!lport)
return -ENOENT;
ret = __wait_localport_unreg(lport);
ret = __localport_unreg(lport);
fcloop_lport_put(lport);
return ret ? ret : count;
@ -1223,8 +1333,8 @@ fcloop_delete_local_port(struct device *dev, struct device_attribute *attr,
static struct fcloop_nport *
fcloop_alloc_nport(const char *buf, size_t count, bool remoteport)
{
struct fcloop_nport *newnport, *nport = NULL;
struct fcloop_lport *tmplport, *lport = NULL;
struct fcloop_nport *newnport, *nport;
struct fcloop_lport *lport;
struct fcloop_ctrl_options *opts;
unsigned long flags;
u32 opts_mask = (remoteport) ? RPORT_OPTS : TGTPORT_OPTS;
@ -1239,10 +1349,8 @@ fcloop_alloc_nport(const char *buf, size_t count, bool remoteport)
goto out_free_opts;
/* everything there ? */
if ((opts->mask & opts_mask) != opts_mask) {
ret = -EINVAL;
if ((opts->mask & opts_mask) != opts_mask)
goto out_free_opts;
}
newnport = kzalloc(sizeof(*newnport), GFP_KERNEL);
if (!newnport)
@ -1258,60 +1366,61 @@ fcloop_alloc_nport(const char *buf, size_t count, bool remoteport)
refcount_set(&newnport->ref, 1);
spin_lock_irqsave(&fcloop_lock, flags);
list_for_each_entry(tmplport, &fcloop_lports, lport_list) {
if (tmplport->localport->node_name == opts->wwnn &&
tmplport->localport->port_name == opts->wwpn)
goto out_invalid_opts;
if (tmplport->localport->node_name == opts->lpwwnn &&
tmplport->localport->port_name == opts->lpwwpn)
lport = tmplport;
lport = __fcloop_lport_lookup(opts->wwnn, opts->wwpn);
if (lport) {
/* invalid configuration */
fcloop_lport_put(lport);
goto out_free_newnport;
}
if (remoteport) {
if (!lport)
goto out_invalid_opts;
newnport->lport = lport;
}
list_for_each_entry(nport, &fcloop_nports, nport_list) {
if (nport->node_name == opts->wwnn &&
nport->port_name == opts->wwpn) {
if ((remoteport && nport->rport) ||
(!remoteport && nport->tport)) {
nport = NULL;
goto out_invalid_opts;
}
fcloop_nport_get(nport);
spin_unlock_irqrestore(&fcloop_lock, flags);
if (remoteport)
nport->lport = lport;
if (opts->mask & NVMF_OPT_ROLES)
nport->port_role = opts->roles;
if (opts->mask & NVMF_OPT_FCADDR)
nport->port_id = opts->fcaddr;
lport = __fcloop_lport_lookup(opts->lpwwnn, opts->lpwwpn);
if (!lport) {
/* invalid configuration */
goto out_free_newnport;
}
}
list_add_tail(&newnport->nport_list, &fcloop_nports);
nport = __fcloop_nport_lookup(opts->wwnn, opts->wwpn);
if (nport) {
if ((remoteport && nport->rport) ||
(!remoteport && nport->tport)) {
/* invalid configuration */
goto out_put_nport;
}
/* found existing nport, discard the new nport */
kfree(newnport);
} else {
list_add_tail(&newnport->nport_list, &fcloop_nports);
nport = newnport;
}
if (opts->mask & NVMF_OPT_ROLES)
nport->port_role = opts->roles;
if (opts->mask & NVMF_OPT_FCADDR)
nport->port_id = opts->fcaddr;
if (lport) {
if (!nport->lport)
nport->lport = lport;
else
fcloop_lport_put(lport);
}
spin_unlock_irqrestore(&fcloop_lock, flags);
kfree(opts);
return newnport;
return nport;
out_invalid_opts:
spin_unlock_irqrestore(&fcloop_lock, flags);
out_put_nport:
if (lport)
fcloop_lport_put(lport);
fcloop_nport_put(nport);
out_free_newnport:
spin_unlock_irqrestore(&fcloop_lock, flags);
kfree(newnport);
out_free_opts:
kfree(opts);
return nport;
return NULL;
}
static ssize_t
@ -1352,6 +1461,7 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr,
rport->nport = nport;
rport->lport = nport->lport;
nport->rport = rport;
rport->flags = 0;
spin_lock_init(&rport->lock);
INIT_WORK(&rport->ls_work, fcloop_rport_lsrqst_work);
INIT_LIST_HEAD(&rport->ls_list);
@ -1365,21 +1475,18 @@ __unlink_remote_port(struct fcloop_nport *nport)
{
struct fcloop_rport *rport = nport->rport;
lockdep_assert_held(&fcloop_lock);
if (rport && nport->tport)
nport->tport->remoteport = NULL;
nport->rport = NULL;
list_del(&nport->nport_list);
return rport;
}
static int
__remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport)
{
if (!rport)
return -EALREADY;
return nvme_fc_unregister_remoteport(rport->remoteport);
}
@ -1387,8 +1494,8 @@ static ssize_t
fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct fcloop_nport *nport = NULL, *tmpport;
static struct fcloop_rport *rport;
struct fcloop_nport *nport;
struct fcloop_rport *rport;
u64 nodename, portname;
unsigned long flags;
int ret;
@ -1397,24 +1504,24 @@ fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr,
if (ret)
return ret;
spin_lock_irqsave(&fcloop_lock, flags);
list_for_each_entry(tmpport, &fcloop_nports, nport_list) {
if (tmpport->node_name == nodename &&
tmpport->port_name == portname && tmpport->rport) {
nport = tmpport;
rport = __unlink_remote_port(nport);
break;
}
}
spin_unlock_irqrestore(&fcloop_lock, flags);
nport = fcloop_nport_lookup(nodename, portname);
if (!nport)
return -ENOENT;
spin_lock_irqsave(&fcloop_lock, flags);
rport = __unlink_remote_port(nport);
spin_unlock_irqrestore(&fcloop_lock, flags);
if (!rport) {
ret = -ENOENT;
goto out_nport_put;
}
ret = __remoteport_unreg(nport, rport);
out_nport_put:
fcloop_nport_put(nport);
return ret ? ret : count;
}
@ -1452,6 +1559,7 @@ fcloop_create_target_port(struct device *dev, struct device_attribute *attr,
tport->nport = nport;
tport->lport = nport->lport;
nport->tport = tport;
tport->flags = 0;
spin_lock_init(&tport->lock);
INIT_WORK(&tport->ls_work, fcloop_tport_lsrqst_work);
INIT_LIST_HEAD(&tport->ls_list);
@ -1465,6 +1573,8 @@ __unlink_target_port(struct fcloop_nport *nport)
{
struct fcloop_tport *tport = nport->tport;
lockdep_assert_held(&fcloop_lock);
if (tport && nport->rport)
nport->rport->targetport = NULL;
nport->tport = NULL;
@ -1475,9 +1585,6 @@ __unlink_target_port(struct fcloop_nport *nport)
static int
__targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport)
{
if (!tport)
return -EALREADY;
return nvmet_fc_unregister_targetport(tport->targetport);
}
@ -1485,8 +1592,8 @@ static ssize_t
fcloop_delete_target_port(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct fcloop_nport *nport = NULL, *tmpport;
struct fcloop_tport *tport = NULL;
struct fcloop_nport *nport;
struct fcloop_tport *tport;
u64 nodename, portname;
unsigned long flags;
int ret;
@ -1495,24 +1602,24 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr,
if (ret)
return ret;
spin_lock_irqsave(&fcloop_lock, flags);
list_for_each_entry(tmpport, &fcloop_nports, nport_list) {
if (tmpport->node_name == nodename &&
tmpport->port_name == portname && tmpport->tport) {
nport = tmpport;
tport = __unlink_target_port(nport);
break;
}
}
spin_unlock_irqrestore(&fcloop_lock, flags);
nport = fcloop_nport_lookup(nodename, portname);
if (!nport)
return -ENOENT;
spin_lock_irqsave(&fcloop_lock, flags);
tport = __unlink_target_port(nport);
spin_unlock_irqrestore(&fcloop_lock, flags);
if (!tport) {
ret = -ENOENT;
goto out_nport_put;
}
ret = __targetport_unreg(nport, tport);
out_nport_put:
fcloop_nport_put(nport);
return ret ? ret : count;
}
@ -1578,15 +1685,20 @@ static const struct class fcloop_class = {
};
static struct device *fcloop_device;
static int __init fcloop_init(void)
{
int ret;
lsreq_cache = kmem_cache_create("lsreq_cache",
sizeof(struct fcloop_lsreq), 0,
0, NULL);
if (!lsreq_cache)
return -ENOMEM;
ret = class_register(&fcloop_class);
if (ret) {
pr_err("couldn't register class fcloop\n");
return ret;
goto out_destroy_cache;
}
fcloop_device = device_create_with_groups(
@ -1604,13 +1716,15 @@ static int __init fcloop_init(void)
out_destroy_class:
class_unregister(&fcloop_class);
out_destroy_cache:
kmem_cache_destroy(lsreq_cache);
return ret;
}
static void __exit fcloop_exit(void)
{
struct fcloop_lport *lport = NULL;
struct fcloop_nport *nport = NULL;
struct fcloop_lport *lport;
struct fcloop_nport *nport;
struct fcloop_tport *tport;
struct fcloop_rport *rport;
unsigned long flags;
@ -1621,7 +1735,7 @@ static void __exit fcloop_exit(void)
for (;;) {
nport = list_first_entry_or_null(&fcloop_nports,
typeof(*nport), nport_list);
if (!nport)
if (!nport || !fcloop_nport_get(nport))
break;
tport = __unlink_target_port(nport);
@ -1629,13 +1743,21 @@ static void __exit fcloop_exit(void)
spin_unlock_irqrestore(&fcloop_lock, flags);
if (tport) {
ret = __targetport_unreg(nport, tport);
if (ret)
pr_warn("%s: Failed deleting target port\n", __func__);
pr_warn("%s: Failed deleting target port\n",
__func__);
}
if (rport) {
ret = __remoteport_unreg(nport, rport);
if (ret)
pr_warn("%s: Failed deleting remote port\n", __func__);
pr_warn("%s: Failed deleting remote port\n",
__func__);
}
fcloop_nport_put(nport);
spin_lock_irqsave(&fcloop_lock, flags);
}
@ -1648,7 +1770,7 @@ static void __exit fcloop_exit(void)
spin_unlock_irqrestore(&fcloop_lock, flags);
ret = __wait_localport_unreg(lport);
ret = __localport_unreg(lport);
if (ret)
pr_warn("%s: Failed deleting local port\n", __func__);
@ -1663,6 +1785,7 @@ static void __exit fcloop_exit(void)
device_destroy(&fcloop_class, MKDEV(0, 0));
class_unregister(&fcloop_class);
kmem_cache_destroy(lsreq_cache);
}
module_init(fcloop_init);


@ -33,10 +33,12 @@ struct nvme_loop_ctrl {
struct list_head list;
struct blk_mq_tag_set tag_set;
struct nvme_loop_iod async_event_iod;
struct nvme_ctrl ctrl;
struct nvmet_port *port;
/* Must be last --ends in a flexible-array member. */
struct nvme_loop_iod async_event_iod;
};
static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl)
@ -148,8 +150,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
nvme_start_request(req);
iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
iod->req.port = queue->ctrl->port;
if (!nvmet_req_init(&iod->req, &queue->nvme_cq,
&queue->nvme_sq, &nvme_loop_ops))
if (!nvmet_req_init(&iod->req, &queue->nvme_sq, &nvme_loop_ops))
return BLK_STS_OK;
if (blk_rq_nr_phys_segments(req)) {
@ -181,8 +182,7 @@ static void nvme_loop_submit_async_event(struct nvme_ctrl *arg)
iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq,
&nvme_loop_ops)) {
if (!nvmet_req_init(&iod->req, &queue->nvme_sq, &nvme_loop_ops)) {
dev_err(ctrl->ctrl.device, "failed async event work\n");
return;
}
@ -273,6 +273,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
nvme_unquiesce_admin_queue(&ctrl->ctrl);
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
nvmet_cq_put(&ctrl->queues[0].nvme_cq);
nvme_remove_admin_tag_set(&ctrl->ctrl);
}
@ -302,6 +303,7 @@ static void nvme_loop_destroy_io_queues(struct nvme_loop_ctrl *ctrl)
for (i = 1; i < ctrl->ctrl.queue_count; i++) {
clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags);
nvmet_sq_destroy(&ctrl->queues[i].nvme_sq);
nvmet_cq_put(&ctrl->queues[i].nvme_cq);
}
ctrl->ctrl.queue_count = 1;
/*
@ -327,9 +329,13 @@ static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl)
for (i = 1; i <= nr_io_queues; i++) {
ctrl->queues[i].ctrl = ctrl;
ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq);
if (ret)
nvmet_cq_init(&ctrl->queues[i].nvme_cq);
ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq,
&ctrl->queues[i].nvme_cq);
if (ret) {
nvmet_cq_put(&ctrl->queues[i].nvme_cq);
goto out_destroy_queues;
}
ctrl->ctrl.queue_count++;
}
@ -360,9 +366,13 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
int error;
ctrl->queues[0].ctrl = ctrl;
error = nvmet_sq_init(&ctrl->queues[0].nvme_sq);
if (error)
nvmet_cq_init(&ctrl->queues[0].nvme_cq);
error = nvmet_sq_init(&ctrl->queues[0].nvme_sq,
&ctrl->queues[0].nvme_cq);
if (error) {
nvmet_cq_put(&ctrl->queues[0].nvme_cq);
return error;
}
ctrl->ctrl.queue_count = 1;
error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
@ -401,6 +411,7 @@ out_cleanup_tagset:
nvme_remove_admin_tag_set(&ctrl->ctrl);
out_free_sq:
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
nvmet_cq_put(&ctrl->queues[0].nvme_cq);
return error;
}


@ -141,13 +141,16 @@ static inline struct device *nvmet_ns_dev(struct nvmet_ns *ns)
}
struct nvmet_cq {
struct nvmet_ctrl *ctrl;
u16 qid;
u16 size;
refcount_t ref;
};
struct nvmet_sq {
struct nvmet_ctrl *ctrl;
struct percpu_ref ref;
struct nvmet_cq *cq;
u16 qid;
u16 size;
u32 sqhd;
@ -247,6 +250,7 @@ struct nvmet_pr_log_mgr {
struct nvmet_ctrl {
struct nvmet_subsys *subsys;
struct nvmet_sq **sqs;
struct nvmet_cq **cqs;
void *drvdata;
@ -424,7 +428,7 @@ struct nvmet_fabrics_ops {
u16 (*get_max_queue_size)(const struct nvmet_ctrl *ctrl);
/* Operations mandatory for PCI target controllers */
u16 (*create_sq)(struct nvmet_ctrl *ctrl, u16 sqid, u16 flags,
u16 (*create_sq)(struct nvmet_ctrl *ctrl, u16 sqid, u16 cqid, u16 flags,
u16 qsize, u64 prp1);
u16 (*delete_sq)(struct nvmet_ctrl *ctrl, u16 sqid);
u16 (*create_cq)(struct nvmet_ctrl *ctrl, u16 cqid, u16 flags,
@ -557,8 +561,8 @@ u32 nvmet_fabrics_admin_cmd_data_len(struct nvmet_req *req);
u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req);
u32 nvmet_fabrics_io_cmd_data_len(struct nvmet_req *req);
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops);
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_sq *sq,
const struct nvmet_fabrics_ops *ops);
void nvmet_req_uninit(struct nvmet_req *req);
size_t nvmet_req_transfer_len(struct nvmet_req *req);
bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len);
@ -571,18 +575,24 @@ void nvmet_execute_set_features(struct nvmet_req *req);
void nvmet_execute_get_features(struct nvmet_req *req);
void nvmet_execute_keep_alive(struct nvmet_req *req);
u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid);
u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create);
u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create);
void nvmet_cq_init(struct nvmet_cq *cq);
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
u16 size);
u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
u16 size);
void nvmet_cq_destroy(struct nvmet_cq *cq);
bool nvmet_cq_get(struct nvmet_cq *cq);
void nvmet_cq_put(struct nvmet_cq *cq);
bool nvmet_cq_in_use(struct nvmet_cq *cq);
u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid, bool create);
void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid,
u16 size);
u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid,
u16 size);
u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
struct nvmet_cq *cq, u16 qid, u16 size);
void nvmet_sq_destroy(struct nvmet_sq *sq);
int nvmet_sq_init(struct nvmet_sq *sq);
int nvmet_sq_init(struct nvmet_sq *sq, struct nvmet_cq *cq);
void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl);
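
For transport authors, the upshot of the reworked prototypes above is that a completion queue is now reference counted and must be explicitly paired with each submission queue. Below is a minimal sketch of the expected init/teardown order, modeled on the rdma, tcp and loop conversions in this series; "my_queue" and its helpers are hypothetical and not part of the patch.

#include "nvmet.h"

/* Hypothetical per-queue structure of a fabrics transport. */
struct my_queue {
	struct nvmet_cq		nvme_cq;
	struct nvmet_sq		nvme_sq;
};

static int my_queue_init(struct my_queue *queue)
{
	int ret;

	/* The transport takes the initial CQ reference... */
	nvmet_cq_init(&queue->nvme_cq);

	/*
	 * ...and nvmet_sq_init() takes its own reference and stores the CQ
	 * in sq->cq, which is why nvmet_req_init() no longer needs the CQ
	 * passed in explicitly.
	 */
	ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
	if (ret) {
		/* Drop the initial reference on failure. */
		nvmet_cq_put(&queue->nvme_cq);
		return ret;
	}

	return 0;
}

static void my_queue_destroy(struct my_queue *queue)
{
	/* nvmet_sq_destroy() drops the SQ's reference on its CQ... */
	nvmet_sq_destroy(&queue->nvme_sq);
	/* ...and the transport drops the initial one. */
	nvmet_cq_put(&queue->nvme_cq);
}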


@ -1354,15 +1354,17 @@ static u16 nvmet_pci_epf_delete_cq(struct nvmet_ctrl *tctrl, u16 cqid)
if (test_and_clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags))
nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector);
nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &cq->pci_map);
nvmet_cq_put(&cq->nvme_cq);
return NVME_SC_SUCCESS;
}
static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
u16 sqid, u16 flags, u16 qsize, u64 pci_addr)
u16 sqid, u16 cqid, u16 flags, u16 qsize, u64 pci_addr)
{
struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid];
struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid];
u16 status;
if (test_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
@ -1385,7 +1387,8 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
sq->qes = ctrl->io_sqes;
sq->pci_size = sq->qes * sq->depth;
status = nvmet_sq_create(tctrl, &sq->nvme_sq, sqid, sq->depth);
status = nvmet_sq_create(tctrl, &sq->nvme_sq, &cq->nvme_cq, sqid,
sq->depth);
if (status != NVME_SC_SUCCESS)
return status;
@ -1601,8 +1604,7 @@ static void nvmet_pci_epf_exec_iod_work(struct work_struct *work)
goto complete;
}
if (!nvmet_req_init(req, &iod->cq->nvme_cq, &iod->sq->nvme_sq,
&nvmet_pci_epf_fabrics_ops))
if (!nvmet_req_init(req, &iod->sq->nvme_sq, &nvmet_pci_epf_fabrics_ops))
goto complete;
iod->data_len = nvmet_req_transfer_len(req);
@ -1879,8 +1881,8 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
qsize = aqa & 0x00000fff;
pci_addr = asq & GENMASK_ULL(63, 12);
status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, NVME_QUEUE_PHYS_CONTIG,
qsize, pci_addr);
status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, 0,
NVME_QUEUE_PHYS_CONTIG, qsize, pci_addr);
if (status != NVME_SC_SUCCESS) {
dev_err(ctrl->dev, "Failed to create admin submission queue\n");
nvmet_pci_epf_delete_cq(ctrl->tctrl, 0);


@ -976,8 +976,7 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
cmd->send_sge.addr, cmd->send_sge.length,
DMA_TO_DEVICE);
if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
&queue->nvme_sq, &nvmet_rdma_ops))
if (!nvmet_req_init(&cmd->req, &queue->nvme_sq, &nvmet_rdma_ops))
return;
status = nvmet_rdma_map_sgl(cmd);
@ -1353,6 +1352,7 @@ static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
pr_debug("freeing queue %d\n", queue->idx);
nvmet_sq_destroy(&queue->nvme_sq);
nvmet_cq_put(&queue->nvme_cq);
nvmet_rdma_destroy_queue_ib(queue);
if (!queue->nsrq) {
@ -1436,7 +1436,8 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
goto out_reject;
}
ret = nvmet_sq_init(&queue->nvme_sq);
nvmet_cq_init(&queue->nvme_cq);
ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
if (ret) {
ret = NVME_RDMA_CM_NO_RSC;
goto out_free_queue;
@ -1517,6 +1518,7 @@ out_ida_remove:
out_destroy_sq:
nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
nvmet_cq_put(&queue->nvme_cq);
kfree(queue);
out_reject:
nvmet_rdma_cm_reject(cm_id, ret);


@ -7,6 +7,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/err.h>
#include <linux/nvme-tcp.h>
#include <linux/nvme-keyring.h>
@ -17,7 +18,6 @@
#include <net/handshake.h>
#include <linux/inet.h>
#include <linux/llist.h>
#include <crypto/hash.h>
#include <trace/events/sock.h>
#include "nvmet.h"
@ -172,8 +172,6 @@ struct nvmet_tcp_queue {
/* digest state */
bool hdr_digest;
bool data_digest;
struct ahash_request *snd_hash;
struct ahash_request *rcv_hash;
/* TLS state */
key_serial_t tls_pskid;
@ -294,14 +292,9 @@ static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}
static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
void *pdu, size_t len)
static inline void nvmet_tcp_hdgst(void *pdu, size_t len)
{
struct scatterlist sg;
sg_init_one(&sg, pdu, len);
ahash_request_set_crypt(hash, &sg, pdu + len, len);
crypto_ahash_digest(hash);
put_unaligned_le32(~crc32c(~0, pdu, len), pdu + len);
}
static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
@ -318,7 +311,7 @@ static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
}
recv_digest = *(__le32 *)(pdu + hdr->hlen);
nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
nvmet_tcp_hdgst(pdu, len);
exp_digest = *(__le32 *)(pdu + hdr->hlen);
if (recv_digest != exp_digest) {
pr_err("queue %d: header digest error: recv %#x expected %#x\n",
@ -441,12 +434,24 @@ err:
return NVME_SC_INTERNAL;
}
static void nvmet_tcp_calc_ddgst(struct ahash_request *hash,
struct nvmet_tcp_cmd *cmd)
static void nvmet_tcp_calc_ddgst(struct nvmet_tcp_cmd *cmd)
{
ahash_request_set_crypt(hash, cmd->req.sg,
(void *)&cmd->exp_ddgst, cmd->req.transfer_len);
crypto_ahash_digest(hash);
size_t total_len = cmd->req.transfer_len;
struct scatterlist *sg = cmd->req.sg;
u32 crc = ~0;
while (total_len) {
size_t len = min_t(size_t, total_len, sg->length);
/*
* Note that the scatterlist does not contain any highmem pages,
* as it was allocated by sgl_alloc() with GFP_KERNEL.
*/
crc = crc32c(crc, sg_virt(sg), len);
total_len -= len;
sg = sg_next(sg);
}
cmd->exp_ddgst = cpu_to_le32(~crc);
}
static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
@ -473,19 +478,18 @@ static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
if (queue->data_digest) {
pdu->hdr.flags |= NVME_TCP_F_DDGST;
nvmet_tcp_calc_ddgst(queue->snd_hash, cmd);
nvmet_tcp_calc_ddgst(cmd);
}
if (cmd->queue->hdr_digest) {
pdu->hdr.flags |= NVME_TCP_F_HDGST;
nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
nvmet_tcp_hdgst(pdu, sizeof(*pdu));
}
}
static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
{
struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
struct nvmet_tcp_queue *queue = cmd->queue;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
cmd->offset = 0;
@ -503,14 +507,13 @@ static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
if (cmd->queue->hdr_digest) {
pdu->hdr.flags |= NVME_TCP_F_HDGST;
nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
nvmet_tcp_hdgst(pdu, sizeof(*pdu));
}
}
static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
{
struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
struct nvmet_tcp_queue *queue = cmd->queue;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
cmd->offset = 0;
@ -523,7 +526,7 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
if (cmd->queue->hdr_digest) {
pdu->hdr.flags |= NVME_TCP_F_HDGST;
nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
nvmet_tcp_hdgst(pdu, sizeof(*pdu));
}
}
@ -857,42 +860,6 @@ static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
smp_store_release(&queue->rcv_state, NVMET_TCP_RECV_PDU);
}
static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
{
struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
ahash_request_free(queue->rcv_hash);
ahash_request_free(queue->snd_hash);
crypto_free_ahash(tfm);
}
static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
{
struct crypto_ahash *tfm;
tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm))
return PTR_ERR(tfm);
queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
if (!queue->snd_hash)
goto free_tfm;
ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
if (!queue->rcv_hash)
goto free_snd_hash;
ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
return 0;
free_snd_hash:
ahash_request_free(queue->snd_hash);
free_tfm:
crypto_free_ahash(tfm);
return -ENOMEM;
}
static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
{
struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
@ -921,11 +888,6 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
if (queue->hdr_digest || queue->data_digest) {
ret = nvmet_tcp_alloc_crypto(queue);
if (ret)
return ret;
}
memset(icresp, 0, sizeof(*icresp));
icresp->hdr.type = nvme_tcp_icresp;
@ -1077,8 +1039,7 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
req = &queue->cmd->req;
memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
&queue->nvme_sq, &nvmet_tcp_ops))) {
if (unlikely(!nvmet_req_init(req, &queue->nvme_sq, &nvmet_tcp_ops))) {
pr_err("failed cmd %p id %d opcode %d, data_len: %d, status: %04x\n",
req->cmd, req->cmd->common.command_id,
req->cmd->common.opcode,
@ -1247,7 +1208,7 @@ static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
{
struct nvmet_tcp_queue *queue = cmd->queue;
nvmet_tcp_calc_ddgst(queue->rcv_hash, cmd);
nvmet_tcp_calc_ddgst(cmd);
queue->offset = 0;
queue->left = NVME_TCP_DIGEST_LENGTH;
queue->rcv_state = NVMET_TCP_RECV_DDGST;
@ -1615,13 +1576,12 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
nvmet_sq_put_tls_key(&queue->nvme_sq);
nvmet_tcp_uninit_data_in_cmds(queue);
nvmet_sq_destroy(&queue->nvme_sq);
nvmet_cq_put(&queue->nvme_cq);
cancel_work_sync(&queue->io_work);
nvmet_tcp_free_cmd_data_in_buffers(queue);
/* ->sock will be released by fput() */
fput(queue->sock->file);
nvmet_tcp_free_cmds(queue);
if (queue->hdr_digest || queue->data_digest)
nvmet_tcp_free_crypto(queue);
ida_free(&nvmet_tcp_queue_ida, queue->idx);
page_frag_cache_drain(&queue->pf_cache);
kfree(queue);
@ -1950,7 +1910,8 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
if (ret)
goto out_ida_remove;
ret = nvmet_sq_init(&queue->nvme_sq);
nvmet_cq_init(&queue->nvme_cq);
ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
if (ret)
goto out_free_connect;
@ -1993,6 +1954,7 @@ out_destroy_sq:
mutex_unlock(&nvmet_tcp_queue_mutex);
nvmet_sq_destroy(&queue->nvme_sq);
out_free_connect:
nvmet_cq_put(&queue->nvme_cq);
nvmet_tcp_free_cmd(&queue->connect);
out_ida_remove:
ida_free(&nvmet_tcp_queue_ida, queue->idx);


@ -403,6 +403,7 @@ config SCSI_ACARD
config SCSI_AHA152X
tristate "Adaptec AHA152X/2825 support"
depends on ISA && SCSI
depends on !HIGHMEM
select SCSI_SPI_ATTRS
select CHECK_SIGNATURE
help
@ -795,6 +796,7 @@ config SCSI_PPA
tristate "IOMEGA parallel port (ppa - older drives)"
depends on SCSI && PARPORT_PC
depends on HAS_IOPORT
depends on !HIGHMEM
help
This driver supports older versions of IOMEGA's parallel port ZIP
drive (a 100 MB removable media device).
@ -822,6 +824,7 @@ config SCSI_PPA
config SCSI_IMM
tristate "IOMEGA parallel port (imm - newer drives)"
depends on SCSI && PARPORT_PC
depends on !HIGHMEM
help
This driver supports newer versions of IOMEGA's parallel port ZIP
drive (a 100 MB removable media device).


@ -746,7 +746,6 @@ struct Scsi_Host *aha152x_probe_one(struct aha152x_setup *setup)
/* need to have host registered before triggering any interrupt */
list_add_tail(&HOSTDATA(shpnt)->host_list, &aha152x_host_list);
shpnt->no_highmem = true;
shpnt->io_port = setup->io_port;
shpnt->n_io_port = IO_RANGE;
shpnt->irq = setup->irq;


@ -1224,7 +1224,6 @@ static int __imm_attach(struct parport *pb)
host = scsi_host_alloc(&imm_template, sizeof(imm_struct *));
if (!host)
goto out1;
host->no_highmem = true;
host->io_port = pb->base;
host->n_io_port = ports;
host->dma_channel = -1;


@ -1104,7 +1104,6 @@ static int __ppa_attach(struct parport *pb)
host = scsi_host_alloc(&ppa_template, sizeof(ppa_struct *));
if (!host)
goto out1;
host->no_highmem = true;
host->io_port = pb->base;
host->n_io_port = ports;
host->dma_channel = -1;


@ -601,7 +601,7 @@ static int sg_scsi_ioctl(struct request_queue *q, bool open_for_write,
}
if (bytes) {
err = blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO);
err = blk_rq_map_kern(rq, buffer, bytes, GFP_NOIO);
if (err)
goto error;
}


@ -313,8 +313,7 @@ retry:
return PTR_ERR(req);
if (bufflen) {
ret = blk_rq_map_kern(sdev->request_queue, req,
buffer, bufflen, GFP_NOIO);
ret = blk_rq_map_kern(req, buffer, bufflen, GFP_NOIO);
if (ret)
goto out;
}
@ -2004,9 +2003,6 @@ void scsi_init_limits(struct Scsi_Host *shost, struct queue_limits *lim)
lim->dma_alignment = max_t(unsigned int,
shost->dma_alignment, dma_get_cache_alignment() - 1);
if (shost->no_highmem)
lim->features |= BLK_FEAT_BOUNCE_HIGH;
/*
* Propagate the DMA formation properties to the dma-mapping layer as
* a courtesy service to the LLDDs. This needs to check that the buses


@ -1056,13 +1056,20 @@ int usb_stor_probe1(struct us_data **pus,
goto BadDevice;
/*
* Some USB host controllers can't do DMA; they have to use PIO.
* For such controllers we need to make sure the block layer sets
* up bounce buffers in addressable memory.
* Some USB host controllers can't do DMA: They have to use PIO, or they
* have to use a small dedicated local memory area, or they have other
* restrictions on addressable memory.
*
* We can't support these controllers on highmem systems as we don't
* kmap or bounce buffer.
*/
if (!hcd_uses_dma(bus_to_hcd(us->pusb_dev->bus)) ||
bus_to_hcd(us->pusb_dev->bus)->localmem_pool)
host->no_highmem = true;
if (IS_ENABLED(CONFIG_HIGHMEM) &&
(!hcd_uses_dma(bus_to_hcd(us->pusb_dev->bus)) ||
bus_to_hcd(us->pusb_dev->bus)->localmem_pool)) {
dev_warn(&intf->dev, "USB Mass Storage not supported on this host controller\n");
result = -EINVAL;
goto release;
}
/* Get the unusual_devs entries and the descriptors */
result = get_device_info(us, id, unusual_dev);
@ -1081,6 +1088,7 @@ int usb_stor_probe1(struct us_data **pus,
BadDevice:
usb_stor_dbg(us, "storage_probe() failed\n");
release:
release_everything(us);
return result;
}


@ -1511,6 +1511,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type)
{
int ret;
req->ki_write_stream = 0;
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;


@ -2770,17 +2770,11 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
struct page *page, u64 physical, u64 generation)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct bio_vec bvec;
struct bio bio;
struct btrfs_super_block *sb = page_address(page);
int ret;
bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
__bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
ret = submit_bio_wait(&bio);
bio_uninit(&bio);
ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb,
BTRFS_SUPER_INFO_SIZE, REQ_OP_READ);
if (ret < 0)
return ret;
ret = btrfs_check_super_csum(fs_info, sb);


@ -226,28 +226,22 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const struct gfs2_sb *str)
static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
{
struct super_block *sb = sdp->sd_vfs;
struct page *page;
struct bio_vec bvec;
struct bio bio;
struct gfs2_sb *sb;
int err;
page = alloc_page(GFP_KERNEL);
if (unlikely(!page))
sb = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (unlikely(!sb))
return -ENOMEM;
bio_init(&bio, sb->s_bdev, &bvec, 1, REQ_OP_READ | REQ_META);
bio.bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
__bio_add_page(&bio, page, PAGE_SIZE, 0);
err = submit_bio_wait(&bio);
err = bdev_rw_virt(sdp->sd_vfs->s_bdev,
sector * (sdp->sd_vfs->s_blocksize >> 9), sb, PAGE_SIZE,
REQ_OP_READ | REQ_META);
if (err) {
pr_warn("error %d reading superblock\n", err);
__free_page(page);
kfree(sb);
return err;
}
gfs2_sb_in(sdp, page_address(page));
__free_page(page);
gfs2_sb_in(sdp, sb);
kfree(sb);
return gfs2_check_sb(sdp, silent);
}


@ -48,47 +48,19 @@ struct hfsplus_wd {
int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
void *buf, void **data, blk_opf_t opf)
{
const enum req_op op = opf & REQ_OP_MASK;
struct bio *bio;
int ret = 0;
u64 io_size;
loff_t start;
int offset;
u64 io_size = hfsplus_min_io_size(sb);
loff_t start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT;
int offset = start & (io_size - 1);
/*
* Align sector to hardware sector size and find offset. We
* assume that io_size is a power of two, which _should_
* be true.
*/
io_size = hfsplus_min_io_size(sb);
start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT;
offset = start & (io_size - 1);
sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
bio = bio_alloc(sb->s_bdev, 1, opf, GFP_NOIO);
bio->bi_iter.bi_sector = sector;
if (op != REQ_OP_WRITE && data)
if ((opf & REQ_OP_MASK) != REQ_OP_WRITE && data)
*data = (u8 *)buf + offset;
while (io_size > 0) {
unsigned int page_offset = offset_in_page(buf);
unsigned int len = min_t(unsigned int, PAGE_SIZE - page_offset,
io_size);
ret = bio_add_page(bio, virt_to_page(buf), len, page_offset);
if (ret != len) {
ret = -EIO;
goto out;
}
io_size -= len;
buf = (u8 *)buf + len;
}
ret = submit_bio_wait(bio);
out:
bio_put(bio);
return ret < 0 ? ret : 0;
/*
* Align sector to hardware sector size and find offset. We assume that
* io_size is a power of two, which _should_ be true.
*/
sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
return bdev_rw_virt(sb->s_bdev, sector, buf, io_size, opf);
}
static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)


@ -120,8 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
global_node_page_state(NR_SECONDARY_PAGETABLE));
show_val_kb(m, "NFS_Unstable: ", 0);
show_val_kb(m, "Bounce: ",
global_zone_page_state(NR_BOUNCE));
show_val_kb(m, "Bounce: ", 0);
show_val_kb(m, "WritebackTmp: ",
global_node_page_state(NR_WRITEBACK_TEMP));
show_val_kb(m, "CommitLimit: ", vm_commit_limit());


@ -18,42 +18,36 @@ xfs_rw_bdev(
enum req_op op)
{
unsigned int is_vmalloc = is_vmalloc_addr(data);
unsigned int left = count;
unsigned int done = 0, added;
int error;
struct bio *bio;
if (is_vmalloc && op == REQ_OP_WRITE)
flush_kernel_vmap_range(data, count);
op |= REQ_META | REQ_SYNC;
if (!is_vmalloc_addr(data))
return bdev_rw_virt(bdev, sector, data, count, op);
bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC,
GFP_KERNEL);
bio = bio_alloc(bdev, bio_max_vecs(count), op, GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
do {
struct page *page = kmem_to_page(data);
unsigned int off = offset_in_page(data);
unsigned int len = min_t(unsigned, left, PAGE_SIZE - off);
while (bio_add_page(bio, page, len, off) != len) {
added = bio_add_vmalloc_chunk(bio, data + done, count - done);
if (!added) {
struct bio *prev = bio;
bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left),
bio = bio_alloc(prev->bi_bdev,
bio_max_vecs(count - done),
prev->bi_opf, GFP_KERNEL);
bio->bi_iter.bi_sector = bio_end_sector(prev);
bio_chain(prev, bio);
submit_bio(prev);
}
data += len;
left -= len;
} while (left > 0);
done += added;
} while (done < count);
error = submit_bio_wait(bio);
bio_put(bio);
if (is_vmalloc && op == REQ_OP_READ)
if (op == REQ_OP_READ)
invalidate_kernel_vmap_range(data, count);
return error;
}


@ -1333,45 +1333,18 @@ static void
xfs_buf_submit_bio(
struct xfs_buf *bp)
{
unsigned int len = BBTOB(bp->b_length);
unsigned int nr_vecs = bio_add_max_vecs(bp->b_addr, len);
unsigned int map = 0;
struct blk_plug plug;
struct bio *bio;
if (is_vmalloc_addr(bp->b_addr)) {
unsigned int size = BBTOB(bp->b_length);
unsigned int alloc_size = roundup(size, PAGE_SIZE);
void *data = bp->b_addr;
bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
xfs_buf_bio_op(bp), GFP_NOIO);
do {
unsigned int len = min(size, PAGE_SIZE);
ASSERT(offset_in_page(data) == 0);
__bio_add_page(bio, vmalloc_to_page(data), len, 0);
data += len;
size -= len;
} while (size);
flush_kernel_vmap_range(bp->b_addr, alloc_size);
} else {
/*
* Single folio or slab allocation. Must be contiguous and thus
* only a single bvec is needed.
*
* This uses the page based bio add helper for now as that is
* the lowest common denominator between folios and slab
* allocations. To be replaced with a better block layer
* helper soon (hopefully).
*/
bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
bio = bio_alloc(bp->b_target->bt_bdev, nr_vecs, xfs_buf_bio_op(bp),
GFP_NOIO);
__bio_add_page(bio, virt_to_page(bp->b_addr),
BBTOB(bp->b_length),
offset_in_page(bp->b_addr));
}
if (is_vmalloc_addr(bp->b_addr))
bio_add_vmalloc(bio, bp->b_addr, len);
else
bio_add_virt_nofail(bio, bp->b_addr, len);
bio->bi_private = bp;
bio->bi_end_io = xfs_buf_bio_end_io;


@ -1607,27 +1607,6 @@ xlog_bio_end_io(
&iclog->ic_end_io_work);
}
static int
xlog_map_iclog_data(
struct bio *bio,
void *data,
size_t count)
{
do {
struct page *page = kmem_to_page(data);
unsigned int off = offset_in_page(data);
size_t len = min_t(size_t, count, PAGE_SIZE - off);
if (bio_add_page(bio, page, len, off) != len)
return -EIO;
data += len;
count -= len;
} while (count);
return 0;
}
STATIC void
xlog_write_iclog(
struct xlog *log,
@ -1693,11 +1672,12 @@ xlog_write_iclog(
iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count))
if (is_vmalloc_addr(iclog->ic_data)) {
if (!bio_add_vmalloc(&iclog->ic_bio, iclog->ic_data, count))
goto shutdown;
if (is_vmalloc_addr(iclog->ic_data))
flush_kernel_vmap_range(iclog->ic_data, count);
} else {
bio_add_virt_nofail(&iclog->ic_bio, iclog->ic_data, count);
}
/*
* If this log buffer would straddle the end of the log we will have


@ -1111,28 +1111,19 @@ static int zonefs_read_super(struct super_block *sb)
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
struct zonefs_super *super;
u32 crc, stored_crc;
struct page *page;
struct bio_vec bio_vec;
struct bio bio;
int ret;
page = alloc_page(GFP_KERNEL);
if (!page)
super = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!super)
return -ENOMEM;
bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
bio.bi_iter.bi_sector = 0;
__bio_add_page(&bio, page, PAGE_SIZE, 0);
ret = submit_bio_wait(&bio);
ret = bdev_rw_virt(sb->s_bdev, 0, super, PAGE_SIZE, REQ_OP_READ);
if (ret)
goto free_page;
super = page_address(page);
goto free_super;
ret = -EINVAL;
if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
goto free_page;
goto free_super;
stored_crc = le32_to_cpu(super->s_crc);
super->s_crc = 0;
@ -1140,14 +1131,14 @@ static int zonefs_read_super(struct super_block *sb)
if (crc != stored_crc) {
zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)",
crc, stored_crc);
goto free_page;
goto free_super;
}
sbi->s_features = le64_to_cpu(super->s_features);
if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) {
zonefs_err(sb, "Unknown features set 0x%llx\n",
sbi->s_features);
goto free_page;
goto free_super;
}
if (sbi->s_features & ZONEFS_F_UID) {
@ -1155,7 +1146,7 @@ static int zonefs_read_super(struct super_block *sb)
le32_to_cpu(super->s_uid));
if (!uid_valid(sbi->s_uid)) {
zonefs_err(sb, "Invalid UID feature\n");
goto free_page;
goto free_super;
}
}
@ -1164,7 +1155,7 @@ static int zonefs_read_super(struct super_block *sb)
le32_to_cpu(super->s_gid));
if (!gid_valid(sbi->s_gid)) {
zonefs_err(sb, "Invalid GID feature\n");
goto free_page;
goto free_super;
}
}
@ -1173,15 +1164,14 @@ static int zonefs_read_super(struct super_block *sb)
if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) {
zonefs_err(sb, "Reserved area is being used\n");
goto free_page;
goto free_super;
}
import_uuid(&sbi->s_uuid, super->s_uuid);
ret = 0;
free_page:
__free_page(page);
free_super:
kfree(super);
return ret;
}


@@ -403,7 +403,6 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
 struct request_queue;
-extern int submit_bio_wait(struct bio *bio);
 void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
 	      unsigned short max_vecs, blk_opf_t opf);
 extern void bio_uninit(struct bio *);
@@ -418,6 +417,30 @@ void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off);
 void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
 		size_t off);
+void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len);
+
+/**
+ * bio_add_max_vecs - number of bio_vecs needed to add data to a bio
+ * @kaddr: kernel virtual address to add
+ * @len: length in bytes to add
+ *
+ * Calculate how many bio_vecs need to be allocated to add the kernel virtual
+ * address range in [@kaddr:@len] in the worse case.
+ */
+static inline unsigned int bio_add_max_vecs(void *kaddr, unsigned int len)
+{
+	if (is_vmalloc_addr(kaddr))
+		return DIV_ROUND_UP(offset_in_page(kaddr) + len, PAGE_SIZE);
+	return 1;
+}
+
+unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len);
+bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len);
+
+int submit_bio_wait(struct bio *bio);
+int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
+		size_t len, enum req_op op);
+
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
 void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter);
 void __bio_release_pages(struct bio *bio, bool mark_dirty);
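
For callers that cannot fit a whole range into one bio, bio_add_vmalloc_chunk() appears to return the number of bytes it managed to add (0 once the bio is full). A rough sketch under that assumption; this is not taken from an in-tree user:

#include <linux/bio.h>

/*
 * Illustrative only, assuming bio_add_vmalloc_chunk() returns the byte count
 * it added: fill @bio from a vmalloc buffer and report how far we got so the
 * caller can chain further bios for the remainder.
 */
static unsigned int example_fill_bio_from_vmalloc(struct bio *bio, void *buf,
		unsigned int len)
{
	unsigned int done = 0;

	while (done < len) {
		unsigned int added = bio_add_vmalloc_chunk(bio, buf + done,
				len - done);

		if (!added)
			break;
		done += added;
	}
	return done;
}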


@@ -9,6 +9,7 @@
 #include <linux/prefetch.h>
 #include <linux/srcu.h>
 #include <linux/rw_hint.h>
+#include <linux/rwsem.h>
 struct blk_mq_tags;
 struct blk_flush_queue;
@@ -506,6 +507,9 @@ enum hctx_type {
  *	      request_queue.tag_set_list.
  * @srcu: Use as lock when type of the request queue is blocking
  *	  (BLK_MQ_F_BLOCKING).
+ * @update_nr_hwq_lock:
+ *	Synchronize updating nr_hw_queues with add/del disk &
+ *	switching elevator.
  */
 struct blk_mq_tag_set {
 	const struct blk_mq_ops *ops;
@@ -527,6 +531,8 @@ struct blk_mq_tag_set {
 	struct mutex tag_list_lock;
 	struct list_head tag_list;
 	struct srcu_struct *srcu;
+	struct rw_semaphore update_nr_hwq_lock;
 };
 /**
@@ -1031,8 +1037,8 @@ int blk_rq_map_user_io(struct request *, struct rq_map_data *,
 int blk_rq_map_user_iov(struct request_queue *, struct request *,
 		struct rq_map_data *, const struct iov_iter *, gfp_t);
 int blk_rq_unmap_user(struct bio *);
-int blk_rq_map_kern(struct request_queue *, struct request *, void *,
-		unsigned int, gfp_t);
+int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len,
+		gfp_t gfp);
 int blk_rq_append_bio(struct request *rq, struct bio *bio);
 void blk_execute_rq_nowait(struct request *rq, bool at_head);
 blk_status_t blk_execute_rq(struct request *rq, bool at_head);
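
blk_rq_map_kern() drops the request_queue argument, since the queue can be derived from the request. A hedged sketch of a passthrough-style caller on the new prototype; the surrounding driver setup is omitted and the names are illustrative:

#include <linux/blk-mq.h>
#include <linux/err.h>

/* Illustrative only: send a kernel buffer as a driver-private command. */
static int example_exec_kern_buf(struct request_queue *q, void *buf,
		unsigned int len)
{
	struct request *rq;
	int error;

	rq = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* The queue is now implied by @rq rather than passed separately. */
	error = blk_rq_map_kern(rq, buf, len, GFP_KERNEL);
	if (error)
		goto out_free;

	error = blk_status_to_errno(blk_execute_rq(rq, false));
out_free:
	blk_mq_free_request(rq);
	return error;
}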


@@ -220,6 +220,7 @@ struct bio {
 	unsigned short bi_flags;	/* BIO_* below */
 	unsigned short bi_ioprio;
 	enum rw_hint bi_write_hint;
+	u8 bi_write_stream;
 	blk_status_t bi_status;
 	atomic_t __bi_remaining;
@@ -286,7 +287,6 @@ struct bio {
 enum {
 	BIO_PAGE_PINNED,	/* Unpin pages in bio_release_pages() */
 	BIO_CLONED,		/* doesn't own data */
-	BIO_BOUNCED,		/* bio is a bounce bio */
 	BIO_QUIET,		/* Make BIO Quiet */
 	BIO_CHAIN,		/* chained bio, ->bi_remaining in effect */
 	BIO_REFFED,		/* bio has elevated ->bi_cnt */
@@ -296,6 +296,14 @@
 				 * of this bio. */
 	BIO_CGROUP_ACCT,	/* has been accounted to a cgroup */
 	BIO_QOS_THROTTLED,	/* bio went through rq_qos throttle path */
+	/*
+	 * This bio has completed bps throttling at the single tg granularity,
+	 * which is different from BIO_BPS_THROTTLED. When the bio is enqueued
+	 * into the sq->queued of the upper tg, or is about to be dispatched,
+	 * this flag needs to be cleared. Since blk-throttle and rq_qos are not
+	 * on the same hierarchical level, reuse the value.
+	 */
+	BIO_TG_BPS_THROTTLED = BIO_QOS_THROTTLED,
 	BIO_QOS_MERGED,		/* but went through rq_qos merge path */
 	BIO_REMAPPED,
 	BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */


@@ -182,7 +182,6 @@ struct gendisk {
 	struct list_head slave_bdevs;
 #endif
 	struct timer_rand_state *random;
-	atomic_t sync_io;		/* RAID */
 	struct disk_events *ev;
 #ifdef CONFIG_BLK_DEV_ZONED
@@ -218,6 +217,8 @@ struct gendisk {
 	 * devices that do not have multiple independent access ranges.
 	 */
 	struct blk_independent_access_ranges *ia_ranges;
+
+	struct mutex rqos_state_mutex;	/* rqos state change mutex */
 };
 /**
@@ -331,9 +332,6 @@ typedef unsigned int __bitwise blk_features_t;
 /* skip this queue in blk_mq_(un)quiesce_tagset */
 #define BLK_FEAT_SKIP_TAGSET_QUIESCE	((__force blk_features_t)(1u << 13))
-/* bounce all highmem pages */
-#define BLK_FEAT_BOUNCE_HIGH		((__force blk_features_t)(1u << 14))
 /* undocumented magic for bcache */
 #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \
 	((__force blk_features_t)(1u << 15))
@@ -347,7 +345,7 @@ typedef unsigned int __bitwise blk_features_t;
  */
 #define BLK_FEAT_INHERIT_MASK \
 	(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \
-	 BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH | \
+	 BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | \
 	 BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE)
 /* internal flags in queue_limits.flags */
@@ -405,6 +403,9 @@ struct queue_limits {
 	unsigned short max_integrity_segments;
 	unsigned short max_discard_segments;
+	unsigned short max_write_streams;
+	unsigned int write_stream_granularity;
+
 	unsigned int max_open_zones;
 	unsigned int max_active_zones;
@@ -644,6 +645,8 @@ enum {
 	QUEUE_FLAG_RQ_ALLOC_TIME,	/* record rq->alloc_time_ns */
 	QUEUE_FLAG_HCTX_ACTIVE,		/* at least one blk-mq hctx is active */
 	QUEUE_FLAG_SQ_SCHED,		/* single queue style io dispatch */
+	QUEUE_FLAG_DISABLE_WBT_DEF,	/* for sched to disable/enable wbt */
+	QUEUE_FLAG_NO_ELV_SWITCH,	/* can't switch elevator any more */
 	QUEUE_FLAG_MAX
 };
@@ -679,6 +682,10 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
 #define blk_queue_sq_sched(q)	test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
 #define blk_queue_skip_tagset_quiesce(q) \
 	((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE)
+#define blk_queue_disable_wbt(q) \
+	test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags)
+#define blk_queue_no_elv_switch(q) \
+	test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags)
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
@@ -1288,6 +1295,13 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev)
 	return queue_max_segments(bdev_get_queue(bdev));
 }
+static inline unsigned short bdev_max_write_streams(struct block_device *bdev)
+{
+	if (bdev_is_partition(bdev))
+		return 0;
+	return bdev_limits(bdev)->max_write_streams;
+}
+
 static inline unsigned queue_logical_block_size(const struct request_queue *q)
 {
 	return q->limits.logical_block_size;
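
A hedged sketch of how an upper layer might pick a write stream using the new limit helper, assuming stream 0 means "untagged" and valid streams are 1..max_write_streams; the selection policy is made up for illustration:

#include <linux/blkdev.h>

/*
 * Illustrative only: derive a write stream for some object @hint, falling
 * back to 0 (no stream) when the device exposes none.  Clamping to the 255
 * streams a bio can express in its u8 field is left out for brevity.
 */
static u8 example_pick_write_stream(struct block_device *bdev, u32 hint)
{
	unsigned short streams = bdev_max_write_streams(bdev);

	if (!streams)
		return 0;
	return (hint % streams) + 1;
}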


@@ -11,6 +11,7 @@
 #ifndef LINUX_DMAPOOL_H
 #define LINUX_DMAPOOL_H
+#include <linux/nodemask_types.h>
 #include <linux/scatterlist.h>
 #include <asm/io.h>
@@ -18,8 +19,8 @@ struct device;
 #ifdef CONFIG_HAS_DMA
-struct dma_pool *dma_pool_create(const char *name, struct device *dev,
-		size_t size, size_t align, size_t allocation);
+struct dma_pool *dma_pool_create_node(const char *name, struct device *dev,
+		size_t size, size_t align, size_t boundary, int node);
 void dma_pool_destroy(struct dma_pool *pool);
@@ -35,9 +36,12 @@ struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
 void dmam_pool_destroy(struct dma_pool *pool);
 #else /* !CONFIG_HAS_DMA */
-static inline struct dma_pool *dma_pool_create(const char *name,
-		struct device *dev, size_t size, size_t align, size_t allocation)
-{ return NULL; }
+static inline struct dma_pool *dma_pool_create_node(const char *name,
+		struct device *dev, size_t size, size_t align, size_t boundary,
+		int node)
+{
+	return NULL;
+}
 static inline void dma_pool_destroy(struct dma_pool *pool) { }
 static inline void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 		dma_addr_t *handle) { return NULL; }
@@ -49,6 +53,13 @@ static inline struct dma_pool *dmam_pool_create(const char *name,
 static inline void dmam_pool_destroy(struct dma_pool *pool) { }
 #endif /* !CONFIG_HAS_DMA */
+static inline struct dma_pool *dma_pool_create(const char *name,
+		struct device *dev, size_t size, size_t align, size_t boundary)
+{
+	return dma_pool_create_node(name, dev, size, align, boundary,
+			NUMA_NO_NODE);
+}
+
 static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
 		dma_addr_t *handle)
 {
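
dma_pool_create() is now a NUMA_NO_NODE wrapper around dma_pool_create_node(). A sketch of a driver creating one descriptor pool per node it submits from, loosely modeled on what the per-node NVMe PRP/SGL pools do; the pool name and sizes are illustrative:

#include <linux/dmapool.h>
#include <linux/device.h>

/* Illustrative only: a small per-node pool of 256-byte DMA descriptors. */
static struct dma_pool *example_create_desc_pool(struct device *dev, int node)
{
	/*
	 * Same size/align/boundary arguments as dma_pool_create(), plus the
	 * NUMA node the pool's pages should come from.
	 */
	return dma_pool_create_node("example-desc", dev, 256, 256, 0, node);
}

A caller wanting a device-local pool would typically pass dev_to_node(dev) as the node argument.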


@@ -408,6 +408,7 @@ struct kiocb {
 	void *private;
 	int ki_flags;
 	u16 ki_ioprio;	/* See linux/ioprio.h */
+	u8 ki_write_stream;
 	union {
 		/*
 		 * Only used for async buffered reads, where it denotes the
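
The new kiocb field carries a per-I/O write stream from the submitter down to the block layer. A hedged sketch of the propagation a direct-I/O path might do; this is not a real ->write_iter implementation:

#include <linux/fs.h>
#include <linux/bio.h>

/*
 * Illustrative only: copy the stream chosen at submission time (kiocb) into
 * the bio so the driver can translate it into a placement hint (e.g. NVMe
 * FDP).  A value of 0 means "no stream".
 */
static void example_dio_set_stream(struct kiocb *iocb, struct bio *bio)
{
	bio->bi_write_stream = iocb->ki_write_stream;
}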


@@ -140,6 +140,15 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur
 	return cmd_to_io_kiocb(cmd)->async_data;
 }
+/*
+ * Return uring_cmd's context reference as its context handle for driver to
+ * track per-context resource, such as registered kernel IO buffer
+ */
+static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd)
+{
+	return cmd_to_io_kiocb(cmd)->ctx;
+}
+
 int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
 			    void (*release)(void *), unsigned int index,
 			    unsigned int issue_flags);
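
A hedged sketch of how a driver might use the context handle to key per-io_uring-context state; only io_uring_cmd_ctx_handle() comes from the header above, the lookup structure is invented for illustration:

#include <linux/io_uring/cmd.h>
#include <linux/xarray.h>
#include <linux/refcount.h>

/* Illustrative only: per-context driver state, keyed by the ctx handle. */
struct example_ctx_state {
	refcount_t refs;
	/* e.g. registered kernel IO buffers for this io_uring context */
};

static DEFINE_XARRAY(example_ctx_states);

static struct example_ctx_state *example_get_ctx_state(struct io_uring_cmd *cmd)
{
	/*
	 * The handle is only used as an opaque key; the driver never
	 * dereferences the io_ring_ctx it refers to.
	 */
	unsigned long key = (unsigned long)io_uring_cmd_ctx_handle(cmd);

	return xa_load(&example_ctx_states, key);
}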


@@ -148,7 +148,6 @@ enum zone_stat_item {
 	NR_ZONE_WRITE_PENDING,	/* Count of dirty, writeback and unstable pages */
 	NR_MLOCK,		/* mlock()ed pages found and moved off LRU */
 	/* Second 128 byte cacheline */
-	NR_BOUNCE,
 #if IS_ENABLED(CONFIG_ZSMALLOC)
 	NR_ZSPAGES,		/* allocated in zsmalloc */
 #endif


@@ -303,6 +303,7 @@ enum nvme_ctrl_attr {
 	NVME_CTRL_ATTR_TBKAS		= (1 << 6),
 	NVME_CTRL_ATTR_ELBAS		= (1 << 15),
 	NVME_CTRL_ATTR_RHII		= (1 << 18),
+	NVME_CTRL_ATTR_FDPS		= (1 << 19),
 };
 struct nvme_id_ctrl {
@@ -689,6 +690,44 @@ struct nvme_rotational_media_log {
 	__u8 rsvd24[488];
 };
+struct nvme_fdp_config {
+	__u8 flags;
+#define FDPCFG_FDPE	(1U << 0)
+	__u8 fdpcidx;
+	__le16 reserved;
+};
+
+struct nvme_fdp_ruh_desc {
+	__u8 ruht;
+	__u8 reserved[3];
+};
+
+struct nvme_fdp_config_desc {
+	__le16 dsze;
+	__u8 fdpa;
+	__u8 vss;
+	__le32 nrg;
+	__le16 nruh;
+	__le16 maxpids;
+	__le32 nns;
+	__le64 runs;
+	__le32 erutl;
+	__u8 rsvd28[36];
+	struct nvme_fdp_ruh_desc ruhs[];
+};
+
+struct nvme_fdp_config_log {
+	__le16 numfdpc;
+	__u8 ver;
+	__u8 rsvd3;
+	__le32 sze;
+	__u8 rsvd8[8];
+	/*
+	 * This is followed by variable number of nvme_fdp_config_desc
+	 * structures, but sparse doesn't like nested variable sized arrays.
+	 */
+};
+
 struct nvme_smart_log {
 	__u8 critical_warning;
 	__u8 temperature[2];
@@ -915,6 +954,7 @@ enum nvme_opcode {
 	nvme_cmd_resv_register	= 0x0d,
 	nvme_cmd_resv_report	= 0x0e,
 	nvme_cmd_resv_acquire	= 0x11,
+	nvme_cmd_io_mgmt_recv	= 0x12,
 	nvme_cmd_resv_release	= 0x15,
 	nvme_cmd_zone_mgmt_send	= 0x79,
 	nvme_cmd_zone_mgmt_recv	= 0x7a,
@@ -936,6 +976,7 @@ enum nvme_opcode {
 		nvme_opcode_name(nvme_cmd_resv_register),	\
 		nvme_opcode_name(nvme_cmd_resv_report),		\
 		nvme_opcode_name(nvme_cmd_resv_acquire),	\
+		nvme_opcode_name(nvme_cmd_io_mgmt_recv),	\
 		nvme_opcode_name(nvme_cmd_resv_release),	\
 		nvme_opcode_name(nvme_cmd_zone_mgmt_send),	\
 		nvme_opcode_name(nvme_cmd_zone_mgmt_recv),	\
@@ -1087,6 +1128,7 @@ enum {
 	NVME_RW_PRINFO_PRCHK_GUARD	= 1 << 12,
 	NVME_RW_PRINFO_PRACT		= 1 << 13,
 	NVME_RW_DTYPE_STREAMS		= 1 << 4,
+	NVME_RW_DTYPE_DPLCMT		= 2 << 4,
 	NVME_WZ_DEAC			= 1 << 9,
 };
@@ -1174,6 +1216,38 @@ struct nvme_zone_mgmt_recv_cmd {
 	__le32 cdw14[2];
 };
+struct nvme_io_mgmt_recv_cmd {
+	__u8 opcode;
+	__u8 flags;
+	__u16 command_id;
+	__le32 nsid;
+	__le64 rsvd2[2];
+	union nvme_data_ptr dptr;
+	__u8 mo;
+	__u8 rsvd11;
+	__u16 mos;
+	__le32 numd;
+	__le32 cdw12[4];
+};
+
+enum {
+	NVME_IO_MGMT_RECV_MO_RUHS	= 1,
+};
+
+struct nvme_fdp_ruh_status_desc {
+	__le16 pid;
+	__le16 ruhid;
+	__le32 earutr;
+	__le64 ruamw;
+	__u8 reserved[16];
+};
+
+struct nvme_fdp_ruh_status {
+	__u8 rsvd0[14];
+	__le16 nruhsd;
+	struct nvme_fdp_ruh_status_desc ruhsd[];
+};
+
 enum {
 	NVME_ZRA_ZONE_REPORT		= 0,
 	NVME_ZRASF_ZONE_REPORT_ALL	= 0,
@@ -1309,6 +1383,7 @@ enum {
 	NVME_FEAT_PLM_WINDOW	= 0x14,
 	NVME_FEAT_HOST_BEHAVIOR	= 0x16,
 	NVME_FEAT_SANITIZE	= 0x17,
+	NVME_FEAT_FDP		= 0x1d,
 	NVME_FEAT_SW_PROGRESS	= 0x80,
 	NVME_FEAT_HOST_ID	= 0x81,
 	NVME_FEAT_RESV_MASK	= 0x82,
@@ -1329,6 +1404,7 @@ enum {
 	NVME_LOG_ANA		= 0x0c,
 	NVME_LOG_FEATURES	= 0x12,
 	NVME_LOG_RMI		= 0x16,
+	NVME_LOG_FDP_CONFIGS	= 0x20,
 	NVME_LOG_DISC		= 0x70,
 	NVME_LOG_RESERVATION	= 0x80,
 	NVME_FWACT_REPL		= (0 << 3),
@@ -1923,6 +1999,7 @@ struct nvme_command {
 		struct nvmf_auth_receive_command auth_receive;
 		struct nvme_dbbuf dbbuf;
 		struct nvme_directive_cmd directive;
+		struct nvme_io_mgmt_recv_cmd imr;
 	};
 };
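
A hedged sketch of walking the reclaim unit handle status that the new I/O management receive structures describe; how the log buffer is fetched from the device is omitted and the helper name is illustrative:

#include <linux/nvme.h>
#include <linux/printk.h>

/*
 * Illustrative only: count how many reclaim unit handles (placement IDs) the
 * namespace reports, e.g. to size the number of usable write streams.
 */
static u16 example_count_fdp_handles(struct nvme_fdp_ruh_status *status)
{
	u16 nr = le16_to_cpu(status->nruhsd);
	u16 i;

	for (i = 0; i < nr; i++) {
		struct nvme_fdp_ruh_status_desc *desc = &status->ruhsd[i];

		pr_debug("pid %u ruhid %u ruamw %llu\n",
			 le16_to_cpu(desc->pid), le16_to_cpu(desc->ruhid),
			 le64_to_cpu(desc->ruamw));
	}
	return nr;
}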


@@ -79,4 +79,6 @@ static inline void part_stat_set_all(struct block_device *part, int value)
 #define part_stat_local_read_cpu(part, field, cpu)	\
 	local_read(&(part_stat_get_cpu(part, field, cpu)))
+unsigned int bdev_count_inflight(struct block_device *part);
+
 #endif /* _LINUX_PART_STAT_H */
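
bdev_count_inflight() reports the number of requests currently in flight on a block device. A hedged sketch of the kind of idle-wait a caller might build on top of it; the polling loop is illustrative and not how md uses the helper:

#include <linux/part_stat.h>
#include <linux/delay.h>

/* Illustrative only: politely poll until @bdev has no I/O in flight. */
static void example_wait_for_idle(struct block_device *bdev)
{
	while (bdev_count_inflight(bdev))
		msleep(10);
}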


@@ -670,8 +670,6 @@ struct Scsi_Host {
 	/* The transport requires the LUN bits NOT to be stored in CDB[1] */
 	unsigned no_scsi2_lun_in_cdb:1;
-	unsigned no_highmem:1;
 	/*
 	 * Optional work queue to be utilized by the transport
 	 */

Some files were not shown because too many files have changed in this diff.