for-6.16/block-20250523

Merge tag 'for-6.16/block-20250523' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

 - ublk updates:
     - Add support for updating the size of a ublk instance
     - Zero-copy improvements
     - Auto-registering of buffers for zero-copy
     - Series simplifying and improving GET_DATA and request lookup
     - Series adding quiesce support
     - Lots of selftests additions
     - Various cleanups

 - NVMe updates via Christoph:
     - add per-node DMA pools and use them for PRP/SGL allocations
       (Caleb Sander Mateos, Keith Busch)
     - nvme-fcloop refcounting fixes (Daniel Wagner)
     - support delayed removal of the multipath node and optionally
       support the multipath node for private namespaces (Nilay Shroff)
     - support shared CQs in the PCI endpoint target code (Wilfred Mallawa)
     - support admin-queue only authentication (Hannes Reinecke)
     - use the crc32c library instead of the crypto API (Eric Biggers)
     - misc cleanups (Christoph Hellwig, Marcelo Moreira, Hannes Reinecke,
       Leon Romanovsky, Gustavo A. R. Silva)

 - MD updates via Yu:
     - Fix that normal IO can be starved by sync IO, found by mkfs on
       newly created large raid5, with some clean up patches for bdev
       inflight counters

 - Clean up brd, getting rid of atomic kmaps and bvec poking

 - Add loop driver specifically for zoned IO testing

 - Eliminate blk-rq-qos calls with a static key, if not enabled

 - Improve hctx locking for when a plug has IO for multiple queues pending

 - Remove block layer bouncing support, which in turn means we can remove
   the per-node bounce stat as well

 - Improve blk-throttle support

 - Improve delay support for blk-throttle

 - Improve brd discard support

 - Unify IO scheduler switching. This should also fix a bunch of lockdep
   warnings we've been seeing, after enabling lockdep support for queue
   freezing/unfreezing

 - Add support for block write streams via FDP (flexible data placement)
   on NVMe

 - Add a bunch of block helpers, facilitating the removal of a bunch of
   duplicated boilerplate code

 - Remove obsolete BLK_MQ pci and virtio Kconfig options

 - Add atomic/untorn write support to blktrace

 - Various little cleanups and fixes

* tag 'for-6.16/block-20250523' of git://git.kernel.dk/linux: (186 commits)
  selftests: ublk: add test for UBLK_F_QUIESCE
  ublk: add feature UBLK_F_QUIESCE
  selftests: ublk: add test case for UBLK_U_CMD_UPDATE_SIZE
  traceevent/block: Add REQ_ATOMIC flag to block trace events
  ublk: run auto buf unregisgering in same io_ring_ctx with registering
  io_uring: add helper io_uring_cmd_ctx_handle()
  ublk: remove io argument from ublk_auto_buf_reg_fallback()
  ublk: handle ublk_set_auto_buf_reg() failure correctly in ublk_fetch()
  selftests: ublk: add test for covering UBLK_AUTO_BUF_REG_FALLBACK
  selftests: ublk: support UBLK_F_AUTO_BUF_REG
  ublk: support UBLK_AUTO_BUF_REG_FALLBACK
  ublk: register buffer to local io_uring with provided buf index via UBLK_F_AUTO_BUF_REG
  ublk: prepare for supporting to register request buffer automatically
  ublk: convert to refcount_t
  selftests: ublk: make IO & device removal test more stressful
  nvme: rename nvme_mpath_shutdown_disk to nvme_mpath_remove_disk
  nvme: introduce multipath_always_on module param
  nvme-multipath: introduce delayed removal of the multipath head node
  nvme-pci: derive and better document max segments limits
  nvme-pci: use struct_size for allocation struct nvme_dev
  ...
commit 6f59de9bc0

129 changed files with 5512 additions and 2491 deletions
Documentation/ABI/stable/sysfs-block
@@ -547,6 +547,21 @@ Description:
 		[RO] Maximum size in bytes of a single element in a DMA
 		scatter/gather list.
 
+What:		/sys/block/<disk>/queue/max_write_streams
+Date:		November 2024
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] Maximum number of write streams supported, 0 if not
+		supported. If supported, valid values are 1 through
+		max_write_streams, inclusive.
+
+What:		/sys/block/<disk>/queue/write_stream_granularity
+Date:		November 2024
+Contact:	linux-block@vger.kernel.org
+Description:
+		[RO] Granularity of a write stream in bytes. The granularity
+		of a write stream is the size that should be discarded or
+		overwritten together to avoid write amplification in the device.
 
 What:		/sys/block/<disk>/queue/max_segments
 Date:		March 2010
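The new write stream attributes pair with the bi_write_stream field added to struct bio in this series (see the block/bio.c changes below). As a rough, hypothetical kernel-side sketch of how a submitter might tag a WRITE bio with a stream (the helper name is illustrative only; a device honours the tag only when max_write_streams is non-zero):

#include <linux/bio.h>

/* Illustrative only: write one page, tagged for write stream 1. */
static int write_page_to_stream(struct block_device *bdev, struct page *page,
				sector_t sector)
{
	struct bio *bio = bio_alloc(bdev, 1, REQ_OP_WRITE, GFP_NOIO);
	int ret;

	bio->bi_iter.bi_sector = sector;
	bio->bi_write_stream = 1;	/* valid streams are 1..max_write_streams */
	__bio_add_page(bio, page, PAGE_SIZE, 0);

	ret = submit_bio_wait(bio);	/* synchronous for simplicity */
	bio_put(bio);
	return ret;
}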
Documentation/admin-guide/blockdev/index.rst
@@ -11,6 +11,7 @@ Block Devices
    nbd
    paride
    ramdisk
+   zoned_loop
    zram
 
    drbd/index
Documentation/admin-guide/blockdev/zoned_loop.rst (new file)
@@ -0,0 +1,169 @@
.. SPDX-License-Identifier: GPL-2.0

=======================
Zoned Loop Block Device
=======================

.. Contents:

	1) Overview
	2) Creating a Zoned Device
	3) Deleting a Zoned Device
	4) Example


1) Overview
-----------

The zoned loop block device driver (zloop) allows a user to create a zoned block
device using one regular file per zone as backing storage. This driver does not
directly control any hardware and uses read, write and truncate operations to
regular files of a file system to emulate a zoned block device.

Using zloop, zoned block devices with a configurable capacity, zone size and
number of conventional zones can be created. The storage for each zone of the
device is implemented using a regular file with a maximum size equal to the zone
size. The size of a file backing a conventional zone is always equal to the zone
size. The size of a file backing a sequential zone indicates the amount of data
sequentially written to the file, that is, the size of the file directly
indicates the position of the write pointer of the zone.

When resetting a sequential zone, its backing file size is truncated to zero.
Conversely, for a zone finish operation, the backing file is truncated to the
zone size. With this, the maximum capacity of a zloop zoned block device can be
configured to be larger than the storage space available on the backing file
system. Of course, for such a configuration, writing more data than the storage
space available on the backing file system will result in write errors.

The zoned loop block device driver implements a complete zone transition state
machine. That is, zones can be empty, implicitly opened, explicitly opened,
closed or full. The current implementation does not support any limits on the
maximum number of open and active zones.

No user tools are necessary to create and delete zloop devices.

2) Creating a Zoned Device
--------------------------

Once the zloop module is loaded (or if zloop is compiled in the kernel), the
character device file /dev/zloop-control can be used to add a zloop device.
This is done by writing an "add" command directly to the /dev/zloop-control
device::

	$ modprobe zloop
	$ ls -l /dev/zloop*
	crw-------. 1 root root 10, 123 Jan  6 19:18 /dev/zloop-control

	$ mkdir -p <base directory>/<device ID>
	$ echo "add [options]" > /dev/zloop-control

The options available for the add command can be listed by reading the
/dev/zloop-control device::

	$ cat /dev/zloop-control
	add id=%d,capacity_mb=%u,zone_size_mb=%u,zone_capacity_mb=%u,conv_zones=%u,base_dir=%s,nr_queues=%u,queue_depth=%u,buffered_io
	remove id=%d

In more detail, the options that can be used with the "add" command are as
follows.

================ ===========================================================
id               Device number (the X in /dev/zloopX).
                 Default: automatically assigned.
capacity_mb      Device total capacity in MiB. This is always rounded up to
                 the nearest higher multiple of the zone size.
                 Default: 16384 MiB (16 GiB).
zone_size_mb     Device zone size in MiB. Default: 256 MiB.
zone_capacity_mb Device zone capacity (must always be equal to or lower than
                 the zone size). Default: zone size.
conv_zones       Total number of conventional zones starting from sector 0.
                 Default: 8.
base_dir         Path to the base directory where to create the directory
                 containing the zone files of the device.
                 Default: /var/local/zloop.
                 The device directory containing the zone files is always
                 named with the device ID. E.g. the default zone file
                 directory for /dev/zloop0 is /var/local/zloop/0.
nr_queues        Number of I/O queues of the zoned block device. This value
                 is always capped by the number of online CPUs.
                 Default: 1
queue_depth      Maximum I/O queue depth per I/O queue.
                 Default: 64
buffered_io      Do buffered IOs instead of direct IOs (default: false)
================ ===========================================================

3) Deleting a Zoned Device
--------------------------

Deleting an unused zoned loop block device is done by issuing the "remove"
command to /dev/zloop-control, specifying the ID of the device to remove::

	$ echo "remove id=X" > /dev/zloop-control

The remove command does not have any option.

A zoned device that was removed can be re-added again without any change to the
state of the device zones: the device zones are restored to their last state
before the device was removed. Re-adding a zoned device after it was removed
must always be done using the same configuration as when the device was first
added. If a zone configuration change is detected, an error will be returned and
the zoned device will not be created.

To fully delete a zoned device, after executing the remove operation, the device
base directory containing the backing files of the device zones must be deleted.

4) Example
----------

The following sequence of commands creates a 2 GB zoned device with zones of
64 MB and a zone capacity of 63 MB::

	$ modprobe zloop
	$ mkdir -p /var/local/zloop/0
	$ echo "add capacity_mb=2048,zone_size_mb=64,zone_capacity_mb=63" > /dev/zloop-control

For the device created (/dev/zloop0), the zone backing files are all created
under the default base directory (/var/local/zloop)::

	$ ls -l /var/local/zloop/0
	total 0
	-rw-------. 1 root root 67108864 Jan  6 22:23 cnv-000000
	-rw-------. 1 root root 67108864 Jan  6 22:23 cnv-000001
	-rw-------. 1 root root 67108864 Jan  6 22:23 cnv-000002
	-rw-------. 1 root root 67108864 Jan  6 22:23 cnv-000003
	-rw-------. 1 root root 67108864 Jan  6 22:23 cnv-000004
	-rw-------. 1 root root 67108864 Jan  6 22:23 cnv-000005
	-rw-------. 1 root root 67108864 Jan  6 22:23 cnv-000006
	-rw-------. 1 root root 67108864 Jan  6 22:23 cnv-000007
	-rw-------. 1 root root        0 Jan  6 22:23 seq-000008
	-rw-------. 1 root root        0 Jan  6 22:23 seq-000009
	...

The zoned device created (/dev/zloop0) can then be used normally::

	$ lsblk -z
	NAME   ZONED        ZONE-SZ ZONE-NR ZONE-AMAX ZONE-OMAX ZONE-APP ZONE-WGRAN
	zloop0 host-managed     64M      32         0         0       1M         4K
	$ blkzone report /dev/zloop0
	  start: 0x000000000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
	  start: 0x000020000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
	  start: 0x000040000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
	  start: 0x000060000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
	  start: 0x000080000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
	  start: 0x0000a0000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
	  start: 0x0000c0000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
	  start: 0x0000e0000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
	  start: 0x000100000, len 0x020000, cap 0x01f800, wptr 0x000000 reset:0 non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
	  start: 0x000120000, len 0x020000, cap 0x01f800, wptr 0x000000 reset:0 non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
	  ...

Deleting this device is done using the command::

	$ echo "remove id=0" > /dev/zloop-control

The removed device can be re-added again using the same "add" command as when
the device was first created. To fully delete a zoned device, its backing files
should also be deleted after executing the remove command::

	$ rm -r /var/local/zloop/0
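For scripting or test harnesses, the same add/remove commands can be driven from a small program instead of the shell. A hypothetical userspace sketch, assuming the control interface behaves exactly as documented above (the device base directory, e.g. /var/local/zloop/0, must already exist):

/* Hypothetical test helper: drive /dev/zloop-control from C. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int zloop_ctl(const char *cmd)
{
	int fd = open("/dev/zloop-control", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, cmd, strlen(cmd)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* same configuration as the documented example */
	if (zloop_ctl("add id=0,capacity_mb=2048,zone_size_mb=64,zone_capacity_mb=63"))
		perror("add");

	/* ... run I/O against /dev/zloop0 here ... */

	if (zloop_ctl("remove id=0"))
		perror("remove");
	return 0;
}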
MAINTAINERS
@@ -26894,6 +26894,14 @@ L:	linux-kernel@vger.kernel.org
 S:	Maintained
 F:	arch/x86/kernel/cpu/zhaoxin.c
 
+ZONED LOOP DEVICE
+M:	Damien Le Moal <dlemoal@kernel.org>
+R:	Christoph Hellwig <hch@lst.de>
+L:	linux-block@vger.kernel.org
+S:	Maintained
+F:	Documentation/admin-guide/blockdev/zoned_loop.rst
+F:	drivers/block/zloop.c
+
 ZONEFS FILESYSTEM
 M:	Damien Le Moal <dlemoal@kernel.org>
 M:	Naohiro Aota <naohiro.aota@wdc.com>
@@ -13,7 +13,6 @@ CONFIG_MIPS_CMDLINE_DTB_EXTEND=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
-# CONFIG_BOUNCE is not set
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
block/Kconfig
@@ -211,14 +211,6 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
 
 source "block/partitions/Kconfig"
 
-config BLK_MQ_PCI
-	def_bool PCI
-
-config BLK_MQ_VIRTIO
-	bool
-	depends on VIRTIO
-	default y
-
 config BLK_PM
 	def_bool PM
 
block/Makefile
@@ -5,13 +5,12 @@
 obj-y		:= bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
-			blk-merge.o blk-timeout.o \
-			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
+			blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \
+			blk-mq-tag.o blk-mq-dma.o blk-stat.o \
 			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
 			disk-events.o blk-ia-ranges.o early-lookup.o
 
-obj-$(CONFIG_BOUNCE)		+= bounce.o
 obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
|
|
@ -7210,8 +7210,8 @@ static void bfq_exit_queue(struct elevator_queue *e)
|
|||
#endif
|
||||
|
||||
blk_stat_disable_accounting(bfqd->queue);
|
||||
clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags);
|
||||
wbt_enable_default(bfqd->queue->disk);
|
||||
blk_queue_flag_clear(QUEUE_FLAG_DISABLE_WBT_DEF, bfqd->queue);
|
||||
set_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, &e->flags);
|
||||
|
||||
kfree(bfqd);
|
||||
}
|
||||
|
@ -7397,7 +7397,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
|
|||
/* We dispatch from request queue wide instead of hw queue */
|
||||
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
|
||||
|
||||
set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags);
|
||||
blk_queue_flag_set(QUEUE_FLAG_DISABLE_WBT_DEF, q);
|
||||
wbt_disable_default(q->disk);
|
||||
blk_stat_enable_accounting(q);
|
||||
|
||||
|
|
|
@ -127,10 +127,8 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
|
|||
|
||||
if (bip->bip_vcnt > 0) {
|
||||
struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1];
|
||||
bool same_page = false;
|
||||
|
||||
if (bvec_try_merge_hw_page(q, bv, page, len, offset,
|
||||
&same_page)) {
|
||||
if (bvec_try_merge_hw_page(q, bv, page, len, offset)) {
|
||||
bip->bip_iter.bi_size += len;
|
||||
return len;
|
||||
}
|
||||
|
|
block/bio.c
|
@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
|
|||
bio->bi_flags = 0;
|
||||
bio->bi_ioprio = 0;
|
||||
bio->bi_write_hint = 0;
|
||||
bio->bi_write_stream = 0;
|
||||
bio->bi_status = 0;
|
||||
bio->bi_iter.bi_sector = 0;
|
||||
bio->bi_iter.bi_size = 0;
|
||||
|
@ -827,6 +828,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
|
|||
bio_set_flag(bio, BIO_CLONED);
|
||||
bio->bi_ioprio = bio_src->bi_ioprio;
|
||||
bio->bi_write_hint = bio_src->bi_write_hint;
|
||||
bio->bi_write_stream = bio_src->bi_write_stream;
|
||||
bio->bi_iter = bio_src->bi_iter;
|
||||
|
||||
if (bio->bi_bdev) {
|
||||
|
@ -918,7 +920,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
|
|||
}
|
||||
|
||||
static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
|
||||
unsigned int len, unsigned int off, bool *same_page)
|
||||
unsigned int len, unsigned int off)
|
||||
{
|
||||
size_t bv_end = bv->bv_offset + bv->bv_len;
|
||||
phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
|
||||
|
@ -931,9 +933,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
|
|||
if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
|
||||
return false;
|
||||
|
||||
*same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) &
|
||||
PAGE_MASK));
|
||||
if (!*same_page) {
|
||||
if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) {
|
||||
if (IS_ENABLED(CONFIG_KMSAN))
|
||||
return false;
|
||||
if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE)
|
||||
|
@ -953,8 +953,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
|
|||
* helpers to split. Hopefully this will go away soon.
|
||||
*/
|
||||
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
|
||||
struct page *page, unsigned len, unsigned offset,
|
||||
bool *same_page)
|
||||
struct page *page, unsigned len, unsigned offset)
|
||||
{
|
||||
unsigned long mask = queue_segment_boundary(q);
|
||||
phys_addr_t addr1 = bvec_phys(bv);
|
||||
|
@ -964,7 +963,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
|
|||
return false;
|
||||
if (len > queue_max_segment_size(q) - bv->bv_len)
|
||||
return false;
|
||||
return bvec_try_merge_page(bv, page, len, offset, same_page);
|
||||
return bvec_try_merge_page(bv, page, len, offset);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -989,6 +988,22 @@ void __bio_add_page(struct bio *bio, struct page *page,
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(__bio_add_page);
 
/**
 * bio_add_virt_nofail - add data in the direct kernel mapping to a bio
 * @bio: destination bio
 * @vaddr: data to add
 * @len: length of the data to add, may cross pages
 *
 * Add the data at @vaddr to @bio. The caller must have ensured that a
 * segment is available for the added data. No merging into an existing
 * segment will be performed.
 */
void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len)
{
	__bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr));
}
EXPORT_SYMBOL_GPL(bio_add_virt_nofail);
|
||||
|
||||
/**
|
||||
* bio_add_page - attempt to add page(s) to bio
|
||||
* @bio: destination bio
|
||||
|
@ -1002,8 +1017,6 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
|
|||
int bio_add_page(struct bio *bio, struct page *page,
|
||||
unsigned int len, unsigned int offset)
|
||||
{
|
||||
bool same_page = false;
|
||||
|
||||
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
|
||||
return 0;
|
||||
if (bio->bi_iter.bi_size > UINT_MAX - len)
|
||||
|
@ -1011,7 +1024,7 @@ int bio_add_page(struct bio *bio, struct page *page,
|
|||
|
||||
if (bio->bi_vcnt > 0 &&
|
||||
bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
|
||||
page, len, offset, &same_page)) {
|
||||
page, len, offset)) {
|
||||
bio->bi_iter.bi_size += len;
|
||||
return len;
|
||||
}
|
||||
|
@ -1058,6 +1071,61 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
|
|||
}
|
||||
EXPORT_SYMBOL(bio_add_folio);
 
/**
 * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio
 * @bio: destination bio
 * @vaddr: vmalloc address to add
 * @len: total length in bytes of the data to add
 *
 * Add data starting at @vaddr to @bio and return how many bytes were added.
 * This may be less than the amount originally requested. Returns 0 if no
 * data could be added to @bio.
 *
 * This helper calls flush_kernel_vmap_range() for the range added. For reads
 * the caller still needs to manually call invalidate_kernel_vmap_range() in
 * the completion handler.
 */
unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len)
{
	unsigned int offset = offset_in_page(vaddr);

	len = min(len, PAGE_SIZE - offset);
	if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len)
		return 0;
	if (op_is_write(bio_op(bio)))
		flush_kernel_vmap_range(vaddr, len);
	return len;
}
EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk);

/**
 * bio_add_vmalloc - add a vmalloc region to a bio
 * @bio: destination bio
 * @vaddr: vmalloc address to add
 * @len: total length in bytes of the data to add
 *
 * Add data starting at @vaddr to @bio. Return %true on success or %false if
 * @bio does not have enough space for the payload.
 *
 * This helper calls flush_kernel_vmap_range() for the range added. For reads
 * the caller still needs to manually call invalidate_kernel_vmap_range() in
 * the completion handler.
 */
bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len)
{
	do {
		unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len);

		if (!added)
			return false;
		vaddr += added;
		len -= added;
	} while (len);

	return true;
}
EXPORT_SYMBOL_GPL(bio_add_vmalloc);
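A hypothetical caller sketch for the helper above, assuming the bio was allocated with enough vectors for the buffer; bio_add_max_vecs(), referenced by the blk-map.c change further below, is assumed to be the sizing helper:

#include <linux/bio.h>
#include <linux/vmalloc.h>

/* Illustrative only: synchronously write a vmalloc()ed buffer to @bdev. */
static int write_vmalloc_buf(struct block_device *bdev, sector_t sector,
			     void *buf, unsigned int len)
{
	struct bio *bio;
	int ret;

	/* size the bio for the buffer; assumes len fits in one bio */
	bio = bio_alloc(bdev, bio_add_max_vecs(buf, len), REQ_OP_WRITE,
			GFP_KERNEL);
	if (!bio)
		return -ENOMEM;
	bio->bi_iter.bi_sector = sector;

	if (!bio_add_vmalloc(bio, buf, len)) {
		bio_put(bio);
		return -EINVAL;
	}

	ret = submit_bio_wait(bio);
	/* for a READ, invalidate_kernel_vmap_range(buf, len) would go here */
	bio_put(bio);
	return ret;
}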
|
||||
|
||||
void __bio_release_pages(struct bio *bio, bool mark_dirty)
|
||||
{
|
||||
struct folio_iter fi;
|
||||
|
@ -1088,27 +1156,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
|
|||
bio_set_flag(bio, BIO_CLONED);
|
||||
}
|
||||
|
||||
static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len,
|
||||
size_t offset)
|
||||
{
|
||||
bool same_page = false;
|
||||
|
||||
if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len))
|
||||
return -EIO;
|
||||
|
||||
if (bio->bi_vcnt > 0 &&
|
||||
bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
|
||||
folio_page(folio, 0), len, offset,
|
||||
&same_page)) {
|
||||
bio->bi_iter.bi_size += len;
|
||||
if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
|
||||
unpin_user_folio(folio, 1);
|
||||
return 0;
|
||||
}
|
||||
bio_add_folio_nofail(bio, folio, len, offset);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static unsigned int get_contig_folio_len(unsigned int *num_pages,
|
||||
struct page **pages, unsigned int i,
|
||||
struct folio *folio, size_t left,
|
||||
|
@ -1203,6 +1250,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
|
|||
for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
|
||||
struct page *page = pages[i];
|
||||
struct folio *folio = page_folio(page);
|
||||
unsigned int old_vcnt = bio->bi_vcnt;
|
||||
|
||||
folio_offset = ((size_t)folio_page_idx(folio, page) <<
|
||||
PAGE_SHIFT) + offset;
|
||||
|
@ -1215,7 +1263,23 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
|
|||
len = get_contig_folio_len(&num_pages, pages, i,
|
||||
folio, left, offset);
|
||||
|
||||
bio_iov_add_folio(bio, folio, len, folio_offset);
|
||||
if (!bio_add_folio(bio, folio, len, folio_offset)) {
|
||||
WARN_ON_ONCE(1);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (bio_flagged(bio, BIO_PAGE_PINNED)) {
|
||||
/*
|
||||
* We're adding another fragment of a page that already
|
||||
* was part of the last segment. Undo our pin as the
|
||||
* page was pinned when an earlier fragment of it was
|
||||
* added to the bio and __bio_release_pages expects a
|
||||
* single pin per page.
|
||||
*/
|
||||
if (offset && bio->bi_vcnt == old_vcnt)
|
||||
unpin_user_folio(folio, 1);
|
||||
}
|
||||
offset = 0;
|
||||
}
|
||||
|
||||
|
@ -1301,6 +1365,36 @@ int submit_bio_wait(struct bio *bio)
|
|||
}
|
||||
EXPORT_SYMBOL(submit_bio_wait);
 
/**
 * bdev_rw_virt - synchronously read into / write from kernel mapping
 * @bdev: block device to access
 * @sector: sector to access
 * @data: data to read/write
 * @len: length in bytes to read/write
 * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE)
 *
 * Performs synchronous I/O to @bdev for @data/@len. @data must be in
 * the kernel direct mapping and not a vmalloc address.
 */
int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
		size_t len, enum req_op op)
{
	struct bio_vec bv;
	struct bio bio;
	int error;

	if (WARN_ON_ONCE(is_vmalloc_addr(data)))
		return -EIO;

	bio_init(&bio, bdev, &bv, 1, op);
	bio.bi_iter.bi_sector = sector;
	bio_add_virt_nofail(&bio, data, len);
	error = submit_bio_wait(&bio);
	bio_uninit(&bio);
	return error;
}
EXPORT_SYMBOL_GPL(bdev_rw_virt);
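A hypothetical usage sketch for bdev_rw_virt(), reading a single block into a kmalloc()ed buffer (the function and structure names around it are illustrative only; the buffer length is assumed to be a multiple of the logical block size):

#include <linux/blkdev.h>
#include <linux/slab.h>

/* Illustrative only: synchronously read one 4 KiB block from @bdev. */
static int read_meta_block(struct block_device *bdev, sector_t sector)
{
	void *buf = kmalloc(4096, GFP_KERNEL);	/* direct mapping, as required */
	int error;

	if (!buf)
		return -ENOMEM;

	error = bdev_rw_virt(bdev, sector, buf, 4096, REQ_OP_READ);
	if (!error) {
		/* parse the on-disk structure in buf here */
	}
	kfree(buf);
	return error;
}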
|
||||
|
||||
static void bio_wait_end_io(struct bio *bio)
|
||||
{
|
||||
complete(bio->bi_private);
|
||||
|
|
|
@ -1018,7 +1018,7 @@ again:
|
|||
stamp = READ_ONCE(part->bd_stamp);
|
||||
if (unlikely(time_after(now, stamp)) &&
|
||||
likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
|
||||
(end || part_in_flight(part)))
|
||||
(end || bdev_count_inflight(part)))
|
||||
__part_stat_add(part, io_ticks, now - stamp);
|
||||
|
||||
if (bdev_is_partition(part)) {
|
||||
|
|
|
@ -173,6 +173,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
|
|||
bio_set_flag(bio, BIO_REMAPPED);
|
||||
bio->bi_ioprio = bio_src->bi_ioprio;
|
||||
bio->bi_write_hint = bio_src->bi_write_hint;
|
||||
bio->bi_write_stream = bio_src->bi_write_stream;
|
||||
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
|
||||
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
|
||||
|
||||
|
|
|
@ -317,64 +317,26 @@ static void bio_map_kern_endio(struct bio *bio)
|
|||
kfree(bio);
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_map_kern - map kernel address into bio
|
||||
* @q: the struct request_queue for the bio
|
||||
* @data: pointer to buffer to map
|
||||
* @len: length in bytes
|
||||
* @gfp_mask: allocation flags for bio allocation
|
||||
*
|
||||
* Map the kernel address into a bio suitable for io to a block
|
||||
* device. Returns an error pointer in case of error.
|
||||
*/
|
||||
static struct bio *bio_map_kern(struct request_queue *q, void *data,
|
||||
unsigned int len, gfp_t gfp_mask)
|
||||
static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op,
|
||||
gfp_t gfp_mask)
|
||||
{
|
||||
unsigned long kaddr = (unsigned long)data;
|
||||
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
unsigned long start = kaddr >> PAGE_SHIFT;
|
||||
const int nr_pages = end - start;
|
||||
bool is_vmalloc = is_vmalloc_addr(data);
|
||||
struct page *page;
|
||||
int offset, i;
|
||||
unsigned int nr_vecs = bio_add_max_vecs(data, len);
|
||||
struct bio *bio;
|
||||
|
||||
bio = bio_kmalloc(nr_pages, gfp_mask);
|
||||
bio = bio_kmalloc(nr_vecs, gfp_mask);
|
||||
if (!bio)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0);
|
||||
|
||||
if (is_vmalloc) {
|
||||
flush_kernel_vmap_range(data, len);
|
||||
bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op);
|
||||
if (is_vmalloc_addr(data)) {
|
||||
bio->bi_private = data;
|
||||
}
|
||||
|
||||
offset = offset_in_page(kaddr);
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
unsigned int bytes = PAGE_SIZE - offset;
|
||||
|
||||
if (len <= 0)
|
||||
break;
|
||||
|
||||
if (bytes > len)
|
||||
bytes = len;
|
||||
|
||||
if (!is_vmalloc)
|
||||
page = virt_to_page(data);
|
||||
else
|
||||
page = vmalloc_to_page(data);
|
||||
if (bio_add_page(bio, page, bytes, offset) < bytes) {
|
||||
/* we don't support partial mappings */
|
||||
if (!bio_add_vmalloc(bio, data, len)) {
|
||||
bio_uninit(bio);
|
||||
kfree(bio);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
data += bytes;
|
||||
len -= bytes;
|
||||
offset = 0;
|
||||
} else {
|
||||
bio_add_virt_nofail(bio, data, len);
|
||||
}
|
||||
|
||||
bio->bi_end_io = bio_map_kern_endio;
|
||||
return bio;
|
||||
}
|
||||
|
@ -402,17 +364,16 @@ static void bio_copy_kern_endio_read(struct bio *bio)
|
|||
|
||||
/**
|
||||
* bio_copy_kern - copy kernel address into bio
|
||||
* @q: the struct request_queue for the bio
|
||||
* @data: pointer to buffer to copy
|
||||
* @len: length in bytes
|
||||
* @op: bio/request operation
|
||||
* @gfp_mask: allocation flags for bio and page allocation
|
||||
* @reading: data direction is READ
|
||||
*
|
||||
* copy the kernel address into a bio suitable for io to a block
|
||||
* device. Returns an error pointer in case of error.
|
||||
*/
|
||||
static struct bio *bio_copy_kern(struct request_queue *q, void *data,
|
||||
unsigned int len, gfp_t gfp_mask, int reading)
|
||||
static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op,
|
||||
gfp_t gfp_mask)
|
||||
{
|
||||
unsigned long kaddr = (unsigned long)data;
|
||||
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
|
@ -431,7 +392,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
|
|||
bio = bio_kmalloc(nr_pages, gfp_mask);
|
||||
if (!bio)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0);
|
||||
bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op);
|
||||
|
||||
while (len) {
|
||||
struct page *page;
|
||||
|
@ -444,7 +405,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
|
|||
if (!page)
|
||||
goto cleanup;
|
||||
|
||||
if (!reading)
|
||||
if (op_is_write(op))
|
||||
memcpy(page_address(page), p, bytes);
|
||||
|
||||
if (bio_add_page(bio, page, bytes, 0) < bytes)
|
||||
|
@ -454,11 +415,11 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
|
|||
p += bytes;
|
||||
}
|
||||
|
||||
if (reading) {
|
||||
if (op_is_write(op)) {
|
||||
bio->bi_end_io = bio_copy_kern_endio;
|
||||
} else {
|
||||
bio->bi_end_io = bio_copy_kern_endio_read;
|
||||
bio->bi_private = data;
|
||||
} else {
|
||||
bio->bi_end_io = bio_copy_kern_endio;
|
||||
}
|
||||
|
||||
return bio;
|
||||
|
@ -556,8 +517,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
|
|||
|
||||
if (map_data)
|
||||
copy = true;
|
||||
else if (blk_queue_may_bounce(q))
|
||||
copy = true;
|
||||
else if (iov_iter_alignment(iter) & align)
|
||||
copy = true;
|
||||
else if (iov_iter_is_bvec(iter))
|
||||
|
@ -689,7 +648,6 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
|
|||
|
||||
/**
|
||||
* blk_rq_map_kern - map kernel data to a request, for passthrough requests
|
||||
* @q: request queue where request should be inserted
|
||||
* @rq: request to fill
|
||||
* @kbuf: the kernel buffer
|
||||
* @len: length of user data
|
||||
|
@ -700,31 +658,26 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
|
|||
* buffer is used. Can be called multiple times to append multiple
|
||||
* buffers.
|
||||
*/
|
||||
int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
|
||||
unsigned int len, gfp_t gfp_mask)
|
||||
int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len,
|
||||
gfp_t gfp_mask)
|
||||
{
|
||||
int reading = rq_data_dir(rq) == READ;
|
||||
unsigned long addr = (unsigned long) kbuf;
|
||||
struct bio *bio;
|
||||
int ret;
|
||||
|
||||
if (len > (queue_max_hw_sectors(q) << 9))
|
||||
if (len > (queue_max_hw_sectors(rq->q) << SECTOR_SHIFT))
|
||||
return -EINVAL;
|
||||
if (!len || !kbuf)
|
||||
return -EINVAL;
|
||||
|
||||
if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) ||
|
||||
blk_queue_may_bounce(q))
|
||||
bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
|
||||
if (!blk_rq_aligned(rq->q, addr, len) || object_is_on_stack(kbuf))
|
||||
bio = bio_copy_kern(kbuf, len, req_op(rq), gfp_mask);
|
||||
else
|
||||
bio = bio_map_kern(q, kbuf, len, gfp_mask);
|
||||
bio = bio_map_kern(kbuf, len, req_op(rq), gfp_mask);
|
||||
|
||||
if (IS_ERR(bio))
|
||||
return PTR_ERR(bio);
|
||||
|
||||
bio->bi_opf &= ~REQ_OP_MASK;
|
||||
bio->bi_opf |= req_op(rq);
|
||||
|
||||
ret = blk_rq_append_bio(rq, bio);
|
||||
if (unlikely(ret)) {
|
||||
bio_uninit(bio);
|
||||
|
|
|
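With the request_queue argument gone from blk_rq_map_kern(), a passthrough submitter now passes only the request. A hypothetical sketch of the updated calling convention, with request allocation and execution shown only for context (REQ_OP_DRV_IN and the surrounding helpers are the usual passthrough API, not part of this diff):

#include <linux/blk-mq.h>
#include <linux/blkdev.h>

/* Illustrative only: send a kernel buffer down as a driver-private read. */
static int issue_kern_passthrough(struct request_queue *q, void *buf,
				  unsigned int len)
{
	struct request *rq;
	int ret;

	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	ret = blk_rq_map_kern(rq, buf, len, GFP_KERNEL);  /* no queue argument anymore */
	if (!ret)
		ret = blk_status_to_errno(blk_execute_rq(rq, false));

	blk_mq_free_request(rq);
	return ret;
}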
@ -7,7 +7,6 @@
|
|||
#include <linux/bio.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-integrity.h>
|
||||
#include <linux/scatterlist.h>
|
||||
#include <linux/part_stat.h>
|
||||
#include <linux/blk-cgroup.h>
|
||||
|
||||
|
@ -225,27 +224,6 @@ static inline unsigned get_max_io_size(struct bio *bio,
|
|||
return max_sectors & ~(lbs - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* get_max_segment_size() - maximum number of bytes to add as a single segment
|
||||
* @lim: Request queue limits.
|
||||
* @paddr: address of the range to add
|
||||
* @len: maximum length available to add at @paddr
|
||||
*
|
||||
* Returns the maximum number of bytes of the range starting at @paddr that can
|
||||
* be added to a single segment.
|
||||
*/
|
||||
static inline unsigned get_max_segment_size(const struct queue_limits *lim,
|
||||
phys_addr_t paddr, unsigned int len)
|
||||
{
|
||||
/*
|
||||
* Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
|
||||
* after having calculated the minimum.
|
||||
*/
|
||||
return min_t(unsigned long, len,
|
||||
min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr),
|
||||
(unsigned long)lim->max_segment_size - 1) + 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* bvec_split_segs - verify whether or not a bvec should be split in the middle
|
||||
* @lim: [in] queue limits to split based on
|
||||
|
@ -473,117 +451,6 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
|
|||
return nr_phys_segs;
|
||||
}
|
||||
|
||||
struct phys_vec {
|
||||
phys_addr_t paddr;
|
||||
u32 len;
|
||||
};
|
||||
|
||||
static bool blk_map_iter_next(struct request *req,
|
||||
struct req_iterator *iter, struct phys_vec *vec)
|
||||
{
|
||||
unsigned int max_size;
|
||||
struct bio_vec bv;
|
||||
|
||||
if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
|
||||
if (!iter->bio)
|
||||
return false;
|
||||
vec->paddr = bvec_phys(&req->special_vec);
|
||||
vec->len = req->special_vec.bv_len;
|
||||
iter->bio = NULL;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!iter->iter.bi_size)
|
||||
return false;
|
||||
|
||||
bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
|
||||
vec->paddr = bvec_phys(&bv);
|
||||
max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
|
||||
bv.bv_len = min(bv.bv_len, max_size);
|
||||
bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
|
||||
|
||||
/*
|
||||
* If we are entirely done with this bi_io_vec entry, check if the next
|
||||
* one could be merged into it. This typically happens when moving to
|
||||
* the next bio, but some callers also don't pack bvecs tight.
|
||||
*/
|
||||
while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
|
||||
struct bio_vec next;
|
||||
|
||||
if (!iter->iter.bi_size) {
|
||||
if (!iter->bio->bi_next)
|
||||
break;
|
||||
iter->bio = iter->bio->bi_next;
|
||||
iter->iter = iter->bio->bi_iter;
|
||||
}
|
||||
|
||||
next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
|
||||
if (bv.bv_len + next.bv_len > max_size ||
|
||||
!biovec_phys_mergeable(req->q, &bv, &next))
|
||||
break;
|
||||
|
||||
bv.bv_len += next.bv_len;
|
||||
bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
|
||||
}
|
||||
|
||||
vec->len = bv.bv_len;
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
|
||||
struct scatterlist *sglist)
|
||||
{
|
||||
if (!*sg)
|
||||
return sglist;
|
||||
|
||||
/*
|
||||
* If the driver previously mapped a shorter list, we could see a
|
||||
* termination bit prematurely unless it fully inits the sg table
|
||||
* on each mapping. We KNOW that there must be more entries here
|
||||
* or the driver would be buggy, so force clear the termination bit
|
||||
* to avoid doing a full sg_init_table() in drivers for each command.
|
||||
*/
|
||||
sg_unmark_end(*sg);
|
||||
return sg_next(*sg);
|
||||
}
|
||||
|
||||
/*
|
||||
* Map a request to scatterlist, return number of sg entries setup. Caller
|
||||
* must make sure sg can hold rq->nr_phys_segments entries.
|
||||
*/
|
||||
int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
|
||||
struct scatterlist **last_sg)
|
||||
{
|
||||
struct req_iterator iter = {
|
||||
.bio = rq->bio,
|
||||
};
|
||||
struct phys_vec vec;
|
||||
int nsegs = 0;
|
||||
|
||||
/* the internal flush request may not have bio attached */
|
||||
if (iter.bio)
|
||||
iter.iter = iter.bio->bi_iter;
|
||||
|
||||
while (blk_map_iter_next(rq, &iter, &vec)) {
|
||||
*last_sg = blk_next_sg(last_sg, sglist);
|
||||
sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
|
||||
offset_in_page(vec.paddr));
|
||||
nsegs++;
|
||||
}
|
||||
|
||||
if (*last_sg)
|
||||
sg_mark_end(*last_sg);
|
||||
|
||||
/*
|
||||
* Something must have been wrong if the figured number of
|
||||
* segment is bigger than number of req's physical segments
|
||||
*/
|
||||
WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
|
||||
|
||||
return nsegs;
|
||||
}
|
||||
EXPORT_SYMBOL(__blk_rq_map_sg);
|
||||
|
||||
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
|
||||
sector_t offset)
|
||||
{
|
||||
|
@ -832,6 +699,8 @@ static struct request *attempt_merge(struct request_queue *q,
|
|||
|
||||
if (req->bio->bi_write_hint != next->bio->bi_write_hint)
|
||||
return NULL;
|
||||
if (req->bio->bi_write_stream != next->bio->bi_write_stream)
|
||||
return NULL;
|
||||
if (req->bio->bi_ioprio != next->bio->bi_ioprio)
|
||||
return NULL;
|
||||
if (!blk_atomic_write_mergeable_rqs(req, next))
|
||||
|
@ -953,6 +822,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
|
|||
return false;
|
||||
if (rq->bio->bi_write_hint != bio->bi_write_hint)
|
||||
return false;
|
||||
if (rq->bio->bi_write_stream != bio->bi_write_stream)
|
||||
return false;
|
||||
if (rq->bio->bi_ioprio != bio->bi_ioprio)
|
||||
return false;
|
||||
if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
|
||||
|
|
|
@ -93,6 +93,8 @@ static const char *const blk_queue_flag_name[] = {
|
|||
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
|
||||
QUEUE_FLAG_NAME(HCTX_ACTIVE),
|
||||
QUEUE_FLAG_NAME(SQ_SCHED),
|
||||
QUEUE_FLAG_NAME(DISABLE_WBT_DEF),
|
||||
QUEUE_FLAG_NAME(NO_ELV_SWITCH),
|
||||
};
|
||||
#undef QUEUE_FLAG_NAME
|
||||
|
||||
|
@ -624,20 +626,9 @@ void blk_mq_debugfs_register(struct request_queue *q)
|
|||
|
||||
debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
|
||||
|
||||
/*
|
||||
* blk_mq_init_sched() attempted to do this already, but q->debugfs_dir
|
||||
* didn't exist yet (because we don't know what to name the directory
|
||||
* until the queue is registered to a gendisk).
|
||||
*/
|
||||
if (q->elevator && !q->sched_debugfs_dir)
|
||||
blk_mq_debugfs_register_sched(q);
|
||||
|
||||
/* Similarly, blk_mq_init_hctx() couldn't do this previously. */
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
if (!hctx->debugfs_dir)
|
||||
blk_mq_debugfs_register_hctx(q, hctx);
|
||||
if (q->elevator && !hctx->sched_debugfs_dir)
|
||||
blk_mq_debugfs_register_sched_hctx(q, hctx);
|
||||
}
|
||||
|
||||
if (q->rq_qos) {
|
||||
|
|
block/blk-mq-dma.c (new file)
|
@ -0,0 +1,116 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Copyright (C) 2025 Christoph Hellwig
|
||||
*/
|
||||
#include "blk.h"
|
||||
|
||||
struct phys_vec {
|
||||
phys_addr_t paddr;
|
||||
u32 len;
|
||||
};
|
||||
|
||||
static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
|
||||
struct phys_vec *vec)
|
||||
{
|
||||
unsigned int max_size;
|
||||
struct bio_vec bv;
|
||||
|
||||
if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
|
||||
if (!iter->bio)
|
||||
return false;
|
||||
vec->paddr = bvec_phys(&req->special_vec);
|
||||
vec->len = req->special_vec.bv_len;
|
||||
iter->bio = NULL;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!iter->iter.bi_size)
|
||||
return false;
|
||||
|
||||
bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
|
||||
vec->paddr = bvec_phys(&bv);
|
||||
max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
|
||||
bv.bv_len = min(bv.bv_len, max_size);
|
||||
bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
|
||||
|
||||
/*
|
||||
* If we are entirely done with this bi_io_vec entry, check if the next
|
||||
* one could be merged into it. This typically happens when moving to
|
||||
* the next bio, but some callers also don't pack bvecs tight.
|
||||
*/
|
||||
while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
|
||||
struct bio_vec next;
|
||||
|
||||
if (!iter->iter.bi_size) {
|
||||
if (!iter->bio->bi_next)
|
||||
break;
|
||||
iter->bio = iter->bio->bi_next;
|
||||
iter->iter = iter->bio->bi_iter;
|
||||
}
|
||||
|
||||
next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
|
||||
if (bv.bv_len + next.bv_len > max_size ||
|
||||
!biovec_phys_mergeable(req->q, &bv, &next))
|
||||
break;
|
||||
|
||||
bv.bv_len += next.bv_len;
|
||||
bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
|
||||
}
|
||||
|
||||
vec->len = bv.bv_len;
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline struct scatterlist *
|
||||
blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
|
||||
{
|
||||
if (!*sg)
|
||||
return sglist;
|
||||
|
||||
/*
|
||||
* If the driver previously mapped a shorter list, we could see a
|
||||
* termination bit prematurely unless it fully inits the sg table
|
||||
* on each mapping. We KNOW that there must be more entries here
|
||||
* or the driver would be buggy, so force clear the termination bit
|
||||
* to avoid doing a full sg_init_table() in drivers for each command.
|
||||
*/
|
||||
sg_unmark_end(*sg);
|
||||
return sg_next(*sg);
|
||||
}
|
||||
|
||||
/*
|
||||
* Map a request to scatterlist, return number of sg entries setup. Caller
|
||||
* must make sure sg can hold rq->nr_phys_segments entries.
|
||||
*/
|
||||
int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
|
||||
struct scatterlist **last_sg)
|
||||
{
|
||||
struct req_iterator iter = {
|
||||
.bio = rq->bio,
|
||||
};
|
||||
struct phys_vec vec;
|
||||
int nsegs = 0;
|
||||
|
||||
/* the internal flush request may not have bio attached */
|
||||
if (iter.bio)
|
||||
iter.iter = iter.bio->bi_iter;
|
||||
|
||||
while (blk_map_iter_next(rq, &iter, &vec)) {
|
||||
*last_sg = blk_next_sg(last_sg, sglist);
|
||||
sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
|
||||
offset_in_page(vec.paddr));
|
||||
nsegs++;
|
||||
}
|
||||
|
||||
if (*last_sg)
|
||||
sg_mark_end(*last_sg);
|
||||
|
||||
/*
|
||||
* Something must have been wrong if the figured number of
|
||||
* segment is bigger than number of req's physical segments
|
||||
*/
|
||||
WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
|
||||
|
||||
return nsegs;
|
||||
}
|
||||
EXPORT_SYMBOL(__blk_rq_map_sg);
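A hypothetical driver-side sketch of how __blk_rq_map_sg() is typically used from ->queue_rq(); the mydrv_* names and the pdu layout are illustrative only, and the queue's max_segments is assumed to have been capped to the sgl array size at init time:

#include <linux/blk-mq.h>
#include <linux/scatterlist.h>

struct mydrv_cmd {				/* hypothetical per-request pdu */
	struct scatterlist sgl[128];
};

static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
				   const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);
	struct scatterlist *last_sg = NULL;
	int nsegs;

	blk_mq_start_request(rq);

	/* sgl must hold at least blk_rq_nr_phys_segments(rq) entries */
	sg_init_table(cmd->sgl, blk_rq_nr_phys_segments(rq));
	nsegs = __blk_rq_map_sg(rq, cmd->sgl, &last_sg);
	if (WARN_ON_ONCE(nsegs > (int)ARRAY_SIZE(cmd->sgl)))
		return BLK_STS_IOERR;

	/* dma_map_sg() the nsegs entries and program the hardware here */
	return BLK_STS_OK;
}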
|
|
@ -59,19 +59,17 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
|
|||
list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
|
||||
struct request *rq;
|
||||
LIST_HEAD(hctx_list);
|
||||
unsigned int count = 0;
|
||||
|
||||
list_for_each_entry(rq, rq_list, queuelist) {
|
||||
if (rq->mq_hctx != hctx) {
|
||||
list_cut_before(&hctx_list, rq_list, &rq->queuelist);
|
||||
goto dispatch;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
list_splice_tail_init(rq_list, &hctx_list);
|
||||
|
||||
dispatch:
|
||||
return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
|
||||
return blk_mq_dispatch_rq_list(hctx, &hctx_list, false);
|
||||
}
|
||||
|
||||
#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
|
||||
|
@ -167,7 +165,7 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
|
|||
dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
|
||||
} while (!list_empty(&rq_list));
|
||||
} else {
|
||||
dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
|
||||
dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, false);
|
||||
}
|
||||
|
||||
if (busy)
|
||||
|
@ -261,7 +259,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
|
|||
/* round robin for fair dispatch */
|
||||
ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
|
||||
|
||||
} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));
|
||||
} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, false));
|
||||
|
||||
WRITE_ONCE(hctx->dispatch_from, ctx);
|
||||
return ret;
|
||||
|
@ -298,7 +296,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
|
|||
*/
|
||||
if (!list_empty(&rq_list)) {
|
||||
blk_mq_sched_mark_restart_hctx(hctx);
|
||||
if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0))
|
||||
if (!blk_mq_dispatch_rq_list(hctx, &rq_list, true))
|
||||
return 0;
|
||||
need_dispatch = true;
|
||||
} else {
|
||||
|
@ -312,7 +310,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
|
|||
if (need_dispatch)
|
||||
return blk_mq_do_dispatch_ctx(hctx);
|
||||
blk_mq_flush_busy_ctxs(hctx, &rq_list);
|
||||
blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
|
||||
blk_mq_dispatch_rq_list(hctx, &rq_list, true);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -436,6 +434,30 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
|
|||
return 0;
|
||||
}
|
||||
|
||||
void blk_mq_sched_reg_debugfs(struct request_queue *q)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
unsigned long i;
|
||||
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
blk_mq_debugfs_register_sched(q);
|
||||
queue_for_each_hw_ctx(q, hctx, i)
|
||||
blk_mq_debugfs_register_sched_hctx(q, hctx);
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
}
|
||||
|
||||
void blk_mq_sched_unreg_debugfs(struct request_queue *q)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
unsigned long i;
|
||||
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
queue_for_each_hw_ctx(q, hctx, i)
|
||||
blk_mq_debugfs_unregister_sched_hctx(hctx);
|
||||
blk_mq_debugfs_unregister_sched(q);
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
}
|
||||
|
||||
/* caller must have a reference to @e, will grab another one if successful */
|
||||
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
|
||||
{
|
||||
|
@ -469,10 +491,6 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
|
|||
if (ret)
|
||||
goto err_free_map_and_rqs;
|
||||
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
blk_mq_debugfs_register_sched(q);
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
if (e->ops.init_hctx) {
|
||||
ret = e->ops.init_hctx(hctx, i);
|
||||
|
@ -484,11 +502,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
|
|||
return ret;
|
||||
}
|
||||
}
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
blk_mq_debugfs_register_sched_hctx(q, hctx);
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
err_free_map_and_rqs:
|
||||
|
@ -527,10 +541,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
|
|||
unsigned int flags = 0;
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
blk_mq_debugfs_unregister_sched_hctx(hctx);
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
|
||||
if (e->type->ops.exit_hctx && hctx->sched_data) {
|
||||
e->type->ops.exit_hctx(hctx, i);
|
||||
hctx->sched_data = NULL;
|
||||
|
@ -538,12 +548,9 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
|
|||
flags = hctx->flags;
|
||||
}
|
||||
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
blk_mq_debugfs_unregister_sched(q);
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
|
||||
if (e->type->ops.exit_sched)
|
||||
e->type->ops.exit_sched(e);
|
||||
blk_mq_sched_tags_teardown(q, flags);
|
||||
set_bit(ELEVATOR_FLAG_DYING, &q->elevator->flags);
|
||||
q->elevator = NULL;
|
||||
}
|
||||
|
|
block/blk-mq.c
|
@ -89,7 +89,7 @@ struct mq_inflight {
|
|||
unsigned int inflight[2];
|
||||
};
|
||||
|
||||
static bool blk_mq_check_inflight(struct request *rq, void *priv)
|
||||
static bool blk_mq_check_in_driver(struct request *rq, void *priv)
|
||||
{
|
||||
struct mq_inflight *mi = priv;
|
||||
|
||||
|
@ -101,24 +101,14 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv)
|
|||
return true;
|
||||
}
|
||||
|
||||
unsigned int blk_mq_in_flight(struct request_queue *q,
|
||||
struct block_device *part)
|
||||
void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2])
|
||||
{
|
||||
struct mq_inflight mi = { .part = part };
|
||||
|
||||
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
|
||||
|
||||
return mi.inflight[0] + mi.inflight[1];
|
||||
}
|
||||
|
||||
void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
|
||||
unsigned int inflight[2])
|
||||
{
|
||||
struct mq_inflight mi = { .part = part };
|
||||
|
||||
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
|
||||
inflight[0] = mi.inflight[0];
|
||||
inflight[1] = mi.inflight[1];
|
||||
blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver,
|
||||
&mi);
|
||||
inflight[READ] = mi.inflight[READ];
|
||||
inflight[WRITE] = mi.inflight[WRITE];
|
||||
}
|
||||
|
||||
#ifdef CONFIG_LOCKDEP
|
||||
|
@ -584,9 +574,13 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
|
|||
struct blk_mq_alloc_data data = {
|
||||
.q = q,
|
||||
.flags = flags,
|
||||
.shallow_depth = 0,
|
||||
.cmd_flags = opf,
|
||||
.rq_flags = 0,
|
||||
.nr_tags = plug->nr_ios,
|
||||
.cached_rqs = &plug->cached_rqs,
|
||||
.ctx = NULL,
|
||||
.hctx = NULL
|
||||
};
|
||||
struct request *rq;
|
||||
|
||||
|
@ -646,8 +640,13 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
|
|||
struct blk_mq_alloc_data data = {
|
||||
.q = q,
|
||||
.flags = flags,
|
||||
.shallow_depth = 0,
|
||||
.cmd_flags = opf,
|
||||
.rq_flags = 0,
|
||||
.nr_tags = 1,
|
||||
.cached_rqs = NULL,
|
||||
.ctx = NULL,
|
||||
.hctx = NULL
|
||||
};
|
||||
int ret;
|
||||
|
||||
|
@ -675,8 +674,13 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
|
|||
struct blk_mq_alloc_data data = {
|
||||
.q = q,
|
||||
.flags = flags,
|
||||
.shallow_depth = 0,
|
||||
.cmd_flags = opf,
|
||||
.rq_flags = 0,
|
||||
.nr_tags = 1,
|
||||
.cached_rqs = NULL,
|
||||
.ctx = NULL,
|
||||
.hctx = NULL
|
||||
};
|
||||
u64 alloc_time_ns = 0;
|
||||
struct request *rq;
|
||||
|
@ -2080,7 +2084,7 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
|
|||
* Returns true if we did some work AND can potentially do more.
|
||||
*/
|
||||
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
|
||||
unsigned int nr_budgets)
|
||||
bool get_budget)
|
||||
{
|
||||
enum prep_dispatch prep;
|
||||
struct request_queue *q = hctx->queue;
|
||||
|
@ -2102,7 +2106,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
|
|||
rq = list_first_entry(list, struct request, queuelist);
|
||||
|
||||
WARN_ON_ONCE(hctx != rq->mq_hctx);
|
||||
prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
|
||||
prep = blk_mq_prep_dispatch_rq(rq, get_budget);
|
||||
if (prep != PREP_DISPATCH_OK)
|
||||
break;
|
||||
|
||||
|
@ -2111,12 +2115,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
|
|||
bd.rq = rq;
|
||||
bd.last = list_empty(list);
|
||||
|
||||
/*
|
||||
* once the request is queued to lld, no need to cover the
|
||||
* budget any more
|
||||
*/
|
||||
if (nr_budgets)
|
||||
nr_budgets--;
|
||||
ret = q->mq_ops->queue_rq(hctx, &bd);
|
||||
switch (ret) {
|
||||
case BLK_STS_OK:
|
||||
|
@ -2150,7 +2148,11 @@ out:
|
|||
((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
|
||||
blk_mq_is_shared_tags(hctx->flags));
|
||||
|
||||
if (nr_budgets)
|
||||
/*
|
||||
* If the caller allocated budgets, free the budgets of the
|
||||
* requests that have not yet been passed to the block driver.
|
||||
*/
|
||||
if (!get_budget)
|
||||
blk_mq_release_budgets(q, list);
|
||||
|
||||
spin_lock(&hctx->lock);
|
||||
|
@ -2778,15 +2780,15 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
|
|||
return __blk_mq_issue_directly(hctx, rq, last);
|
||||
}
|
||||
|
||||
static void blk_mq_plug_issue_direct(struct blk_plug *plug)
|
||||
static void blk_mq_issue_direct(struct rq_list *rqs)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = NULL;
|
||||
struct request *rq;
|
||||
int queued = 0;
|
||||
blk_status_t ret = BLK_STS_OK;
|
||||
|
||||
while ((rq = rq_list_pop(&plug->mq_list))) {
|
||||
bool last = rq_list_empty(&plug->mq_list);
|
||||
while ((rq = rq_list_pop(rqs))) {
|
||||
bool last = rq_list_empty(rqs);
|
||||
|
||||
if (hctx != rq->mq_hctx) {
|
||||
if (hctx) {
|
||||
|
@ -2817,15 +2819,64 @@ out:
|
|||
blk_mq_commit_rqs(hctx, queued, false);
|
||||
}
|
||||
|
||||
static void __blk_mq_flush_plug_list(struct request_queue *q,
|
||||
struct blk_plug *plug)
|
||||
static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs)
|
||||
{
|
||||
if (blk_queue_quiesced(q))
|
||||
return;
|
||||
q->mq_ops->queue_rqs(&plug->mq_list);
|
||||
q->mq_ops->queue_rqs(rqs);
|
||||
}
|
||||
|
||||
static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
|
||||
static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs,
|
||||
struct rq_list *queue_rqs)
|
||||
{
|
||||
struct request *rq = rq_list_pop(rqs);
|
||||
struct request_queue *this_q = rq->q;
|
||||
struct request **prev = &rqs->head;
|
||||
struct rq_list matched_rqs = {};
|
||||
struct request *last = NULL;
|
||||
unsigned depth = 1;
|
||||
|
||||
rq_list_add_tail(&matched_rqs, rq);
|
||||
while ((rq = *prev)) {
|
||||
if (rq->q == this_q) {
|
||||
/* move rq from rqs to matched_rqs */
|
||||
*prev = rq->rq_next;
|
||||
rq_list_add_tail(&matched_rqs, rq);
|
||||
depth++;
|
||||
} else {
|
||||
/* leave rq in rqs */
|
||||
prev = &rq->rq_next;
|
||||
last = rq;
|
||||
}
|
||||
}
|
||||
|
||||
rqs->tail = last;
|
||||
*queue_rqs = matched_rqs;
|
||||
return depth;
|
||||
}

static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth)
{
        struct request_queue *q = rq_list_peek(rqs)->q;

        trace_block_unplug(q, depth, true);

        /*
         * Peek first request and see if we have a ->queue_rqs() hook.
         * If we do, we can dispatch the whole list in one go.
         * We already know at this point that all requests belong to the
         * same queue, caller must ensure that's the case.
         */
        if (q->mq_ops->queue_rqs) {
                blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs));
                if (rq_list_empty(rqs))
                        return;
        }

        blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs));
}

static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched)
{
        struct blk_mq_hw_ctx *this_hctx = NULL;
        struct blk_mq_ctx *this_ctx = NULL;
@@ -2835,7 +2886,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
        LIST_HEAD(list);

        do {
                struct request *rq = rq_list_pop(&plug->mq_list);
                struct request *rq = rq_list_pop(rqs);

                if (!this_hctx) {
                        this_hctx = rq->mq_hctx;
@@ -2848,9 +2899,9 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
                }
                list_add_tail(&rq->queuelist, &list);
                depth++;
        } while (!rq_list_empty(&plug->mq_list));
        } while (!rq_list_empty(rqs));

        plug->mq_list = requeue_list;
        *rqs = requeue_list;
        trace_block_unplug(this_hctx->queue, depth, !from_sched);

        percpu_ref_get(&this_hctx->queue->q_usage_counter);
@@ -2870,9 +2921,21 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
        percpu_ref_put(&this_hctx->queue->q_usage_counter);
}

static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs)
{
        do {
                struct rq_list queue_rqs;
                unsigned depth;

                depth = blk_mq_extract_queue_requests(rqs, &queue_rqs);
                blk_mq_dispatch_queue_requests(&queue_rqs, depth);
                while (!rq_list_empty(&queue_rqs))
                        blk_mq_dispatch_list(&queue_rqs, false);
        } while (!rq_list_empty(rqs));
}

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
        struct request *rq;
        unsigned int depth;

        /*
@@ -2887,34 +2950,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
        depth = plug->rq_count;
        plug->rq_count = 0;

        if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
                struct request_queue *q;

                rq = rq_list_peek(&plug->mq_list);
                q = rq->q;
                trace_block_unplug(q, depth, true);

                /*
                 * Peek first request and see if we have a ->queue_rqs() hook.
                 * If we do, we can dispatch the whole plug list in one go. We
                 * already know at this point that all requests belong to the
                 * same queue, caller must ensure that's the case.
                 */
                if (q->mq_ops->queue_rqs) {
                        blk_mq_run_dispatch_ops(q,
                                        __blk_mq_flush_plug_list(q, plug));
                        if (rq_list_empty(&plug->mq_list))
        if (!plug->has_elevator && !from_schedule) {
                if (plug->multiple_queues) {
                        blk_mq_dispatch_multiple_queue_requests(&plug->mq_list);
                        return;
                }

                blk_mq_run_dispatch_ops(q,
                                blk_mq_plug_issue_direct(plug));
                blk_mq_dispatch_queue_requests(&plug->mq_list, depth);
                if (rq_list_empty(&plug->mq_list))
                        return;
        }

        do {
                blk_mq_dispatch_plug_list(plug, from_schedule);
                blk_mq_dispatch_list(&plug->mq_list, from_schedule);
        } while (!rq_list_empty(&plug->mq_list));
}
|
||||
|
@@ -2969,8 +3017,14 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
{
        struct blk_mq_alloc_data data = {
                .q = q,
                .nr_tags = 1,
                .flags = 0,
                .shallow_depth = 0,
                .cmd_flags = bio->bi_opf,
                .rq_flags = 0,
                .nr_tags = 1,
                .cached_rqs = NULL,
                .ctx = NULL,
                .hctx = NULL
        };
        struct request *rq;

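The expanded struct blk_mq_alloc_data initializer above only names members that C would zero anyway: in a braced initializer with designators, every member that is not mentioned is implicitly initialized to zero/NULL, so the explicit .flags = 0, .ctx = NULL and friends spell out the existing behaviour rather than change it. A tiny standalone illustration (the struct here is made up, not a kernel type):

#include <assert.h>

struct cfg {
        int flags;              /* stands in for data.flags       */
        void *ctx;              /* stands in for data.ctx         */
        int nr_tags;            /* the only member set explicitly */
};

int main(void)
{
        struct cfg a = { .nr_tags = 1 };                        /* rest implicitly zeroed */
        struct cfg b = { .flags = 0, .ctx = 0, .nr_tags = 1 };  /* everything spelled out */

        assert(a.flags == b.flags && a.ctx == b.ctx && a.nr_tags == b.nr_tags);
        return 0;
}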
@ -3080,8 +3134,6 @@ void blk_mq_submit_bio(struct bio *bio)
|
|||
goto new_request;
|
||||
}
|
||||
|
||||
bio = blk_queue_bounce(bio, q);
|
||||
|
||||
/*
|
||||
* The cached request already holds a q_usage_counter reference and we
|
||||
* don't have to acquire a new one if we use it.
|
||||
|
@ -4094,8 +4146,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
|
|||
struct blk_mq_ctx *ctx;
|
||||
struct blk_mq_tag_set *set = q->tag_set;
|
||||
|
||||
mutex_lock(&q->elevator_lock);
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
cpumask_clear(hctx->cpumask);
|
||||
hctx->nr_ctx = 0;
|
||||
|
@ -4200,8 +4250,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
|
|||
hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
|
||||
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
|
||||
}
|
||||
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -4505,16 +4553,9 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
|
|||
}
|
||||
|
||||
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
|
||||
struct request_queue *q, bool lock)
|
||||
struct request_queue *q)
|
||||
{
|
||||
if (lock) {
|
||||
/* protect against switching io scheduler */
|
||||
mutex_lock(&q->elevator_lock);
|
||||
__blk_mq_realloc_hw_ctxs(set, q);
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
} else {
|
||||
__blk_mq_realloc_hw_ctxs(set, q);
|
||||
}
|
||||
|
||||
/* unregister cpuhp callbacks for exited hctxs */
|
||||
blk_mq_remove_hw_queues_cpuhp(q);
|
||||
|
@ -4546,7 +4587,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
|
|||
|
||||
xa_init(&q->hctx_table);
|
||||
|
||||
blk_mq_realloc_hw_ctxs(set, q, false);
|
||||
blk_mq_realloc_hw_ctxs(set, q);
|
||||
if (!q->nr_hw_queues)
|
||||
goto err_hctxs;
|
||||
|
||||
|
@ -4563,8 +4604,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
|
|||
q->nr_requests = set->queue_depth;
|
||||
|
||||
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
|
||||
blk_mq_add_queue_tag_set(set, q);
|
||||
blk_mq_map_swqueue(q);
|
||||
blk_mq_add_queue_tag_set(set, q);
|
||||
return 0;
|
||||
|
||||
err_hctxs:
|
||||
|
@ -4784,6 +4825,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
|
|||
goto out_free_srcu;
|
||||
}
|
||||
|
||||
init_rwsem(&set->update_nr_hwq_lock);
|
||||
|
||||
ret = -ENOMEM;
|
||||
set->tags = kcalloc_node(set->nr_hw_queues,
|
||||
sizeof(struct blk_mq_tags *), GFP_KERNEL,
|
||||
|
@ -4923,88 +4966,10 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
|
|||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* request_queue and elevator_type pair.
|
||||
* It is just used by __blk_mq_update_nr_hw_queues to cache
|
||||
* the elevator_type associated with a request_queue.
|
||||
*/
|
||||
struct blk_mq_qe_pair {
|
||||
struct list_head node;
|
||||
struct request_queue *q;
|
||||
struct elevator_type *type;
|
||||
};
|
||||
|
||||
/*
|
||||
* Cache the elevator_type in qe pair list and switch the
|
||||
* io scheduler to 'none'
|
||||
*/
|
||||
static bool blk_mq_elv_switch_none(struct list_head *head,
|
||||
struct request_queue *q)
|
||||
{
|
||||
struct blk_mq_qe_pair *qe;
|
||||
|
||||
qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
|
||||
if (!qe)
|
||||
return false;
|
||||
|
||||
/* Accessing q->elevator needs protection from ->elevator_lock. */
|
||||
mutex_lock(&q->elevator_lock);
|
||||
|
||||
if (!q->elevator) {
|
||||
kfree(qe);
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&qe->node);
|
||||
qe->q = q;
|
||||
qe->type = q->elevator->type;
|
||||
/* keep a reference to the elevator module as we'll switch back */
|
||||
__elevator_get(qe->type);
|
||||
list_add(&qe->node, head);
|
||||
elevator_disable(q);
|
||||
unlock:
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
|
||||
struct request_queue *q)
|
||||
{
|
||||
struct blk_mq_qe_pair *qe;
|
||||
|
||||
list_for_each_entry(qe, head, node)
|
||||
if (qe->q == q)
|
||||
return qe;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void blk_mq_elv_switch_back(struct list_head *head,
|
||||
struct request_queue *q)
|
||||
{
|
||||
struct blk_mq_qe_pair *qe;
|
||||
struct elevator_type *t;
|
||||
|
||||
qe = blk_lookup_qe_pair(head, q);
|
||||
if (!qe)
|
||||
return;
|
||||
t = qe->type;
|
||||
list_del(&qe->node);
|
||||
kfree(qe);
|
||||
|
||||
mutex_lock(&q->elevator_lock);
|
||||
elevator_switch(q, t);
|
||||
/* drop the reference acquired in blk_mq_elv_switch_none */
|
||||
elevator_put(t);
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
}
|
||||
|
||||
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
|
||||
int nr_hw_queues)
|
||||
{
|
||||
struct request_queue *q;
|
||||
LIST_HEAD(head);
|
||||
int prev_nr_hw_queues = set->nr_hw_queues;
|
||||
unsigned int memflags;
|
||||
int i;
|
||||
|
@ -5019,30 +4984,24 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
|
|||
return;
|
||||
|
||||
memflags = memalloc_noio_save();
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list)
|
||||
blk_mq_freeze_queue_nomemsave(q);
|
||||
|
||||
/*
|
||||
* Switch IO scheduler to 'none', cleaning up the data associated
|
||||
* with the previous scheduler. We will switch back once we are done
|
||||
* updating the new sw to hw queue mappings.
|
||||
*/
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list)
|
||||
if (!blk_mq_elv_switch_none(&head, q))
|
||||
goto switch_back;
|
||||
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
blk_mq_debugfs_unregister_hctxs(q);
|
||||
blk_mq_sysfs_unregister_hctxs(q);
|
||||
}
|
||||
|
||||
if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list)
|
||||
blk_mq_freeze_queue_nomemsave(q);
|
||||
|
||||
if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list)
|
||||
blk_mq_unfreeze_queue_nomemrestore(q);
|
||||
goto reregister;
|
||||
}
|
||||
|
||||
fallback:
|
||||
blk_mq_update_queue_map(set);
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
blk_mq_realloc_hw_ctxs(set, q, true);
|
||||
__blk_mq_realloc_hw_ctxs(set, q);
|
||||
|
||||
if (q->nr_hw_queues != set->nr_hw_queues) {
|
||||
int i = prev_nr_hw_queues;
|
||||
|
@ -5058,18 +5017,18 @@ fallback:
|
|||
blk_mq_map_swqueue(q);
|
||||
}
|
||||
|
||||
/* elv_update_nr_hw_queues() unfreeze queue for us */
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list)
|
||||
elv_update_nr_hw_queues(q);
|
||||
|
||||
reregister:
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
blk_mq_sysfs_register_hctxs(q);
|
||||
blk_mq_debugfs_register_hctxs(q);
|
||||
|
||||
blk_mq_remove_hw_queues_cpuhp(q);
|
||||
blk_mq_add_hw_queues_cpuhp(q);
|
||||
}
|
||||
|
||||
switch_back:
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list)
|
||||
blk_mq_elv_switch_back(&head, q);
|
||||
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list)
|
||||
blk_mq_unfreeze_queue_nomemrestore(q);
|
||||
memalloc_noio_restore(memflags);
|
||||
|
||||
/* Free the excess tags when nr_hw_queues shrink. */
|
||||
|
@ -5079,9 +5038,11 @@ switch_back:
|
|||
|
||||
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
|
||||
{
|
||||
down_write(&set->update_nr_hwq_lock);
|
||||
mutex_lock(&set->tag_list_lock);
|
||||
__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
|
||||
mutex_unlock(&set->tag_list_lock);
|
||||
up_write(&set->update_nr_hwq_lock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
|
||||
|
||||
|
|
|
@ -48,7 +48,7 @@ void blk_mq_exit_queue(struct request_queue *q);
|
|||
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
|
||||
void blk_mq_wake_waiters(struct request_queue *q);
|
||||
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
|
||||
unsigned int);
|
||||
bool);
|
||||
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
|
||||
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
|
||||
struct blk_mq_ctx *start);
|
||||
|
@ -246,10 +246,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
|
|||
return hctx->nr_ctx && hctx->tags;
|
||||
}
|
||||
|
||||
unsigned int blk_mq_in_flight(struct request_queue *q,
|
||||
struct block_device *part);
|
||||
void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
|
||||
unsigned int inflight[2]);
|
||||
void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]);
|
||||
|
||||
static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
|
||||
int budget_token)
|
||||
|
|
|
@@ -2,6 +2,8 @@

#include "blk-rq-qos.h"

__read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos);

/*
 * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
 * false if 'v' + 1 would be bigger than 'below'.
@@ -317,6 +319,7 @@ void rq_qos_exit(struct request_queue *q)
                struct rq_qos *rqos = q->rq_qos;
                q->rq_qos = rqos->next;
                rqos->ops->exit(rqos);
                static_branch_dec(&block_rq_qos);
        }
        mutex_unlock(&q->rq_qos_mutex);
}
@@ -343,6 +346,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
                goto ebusy;
        rqos->next = q->rq_qos;
        q->rq_qos = rqos;
        static_branch_inc(&block_rq_qos);

        blk_mq_unfreeze_queue(q, memflags);
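The new block_rq_qos static key above keeps every rq_qos hook off the request fast path until a policy (wbt, iolatency, iocost, ...) is actually attached, with rq_qos_add() and rq_qos_exit() incrementing and decrementing the key as policies come and go. A generic kernel-style sketch of that counted-key pattern, using made-up names and not taken from this series:

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(feature_key);

static void do_feature_work(void)
{
        /* hypothetical slow path, only reached while the key is enabled */
}

void feature_attach(void)               /* e.g. a policy is added */
{
        static_branch_inc(&feature_key);
}

void feature_detach(void)               /* key flips back off with the last user */
{
        static_branch_dec(&feature_key);
}

void hot_path(void)
{
        /* compiles to a patched-out branch while the key is disabled */
        if (static_branch_unlikely(&feature_key))
                do_feature_work();
}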
|
@ -12,6 +12,7 @@
|
|||
#include "blk-mq-debugfs.h"
|
||||
|
||||
struct blk_mq_debugfs_attr;
|
||||
extern struct static_key_false block_rq_qos;
|
||||
|
||||
enum rq_qos_id {
|
||||
RQ_QOS_WBT,
|
||||
|
@ -112,31 +113,33 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos);
|
|||
|
||||
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
|
||||
{
|
||||
if (q->rq_qos)
|
||||
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
|
||||
__rq_qos_cleanup(q->rq_qos, bio);
|
||||
}
|
||||
|
||||
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
|
||||
{
|
||||
if (q->rq_qos && !blk_rq_is_passthrough(rq))
|
||||
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos &&
|
||||
!blk_rq_is_passthrough(rq))
|
||||
__rq_qos_done(q->rq_qos, rq);
|
||||
}
|
||||
|
||||
static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
|
||||
{
|
||||
if (q->rq_qos)
|
||||
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
|
||||
__rq_qos_issue(q->rq_qos, rq);
|
||||
}
|
||||
|
||||
static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
|
||||
{
|
||||
if (q->rq_qos)
|
||||
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
|
||||
__rq_qos_requeue(q->rq_qos, rq);
|
||||
}
|
||||
|
||||
static inline void rq_qos_done_bio(struct bio *bio)
|
||||
{
|
||||
if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
|
||||
if (static_branch_unlikely(&block_rq_qos) &&
|
||||
bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
|
||||
bio_flagged(bio, BIO_QOS_MERGED))) {
|
||||
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
|
||||
if (q->rq_qos)
|
||||
|
@ -146,7 +149,7 @@ static inline void rq_qos_done_bio(struct bio *bio)
|
|||
|
||||
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
|
||||
{
|
||||
if (q->rq_qos) {
|
||||
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) {
|
||||
bio_set_flag(bio, BIO_QOS_THROTTLED);
|
||||
__rq_qos_throttle(q->rq_qos, bio);
|
||||
}
|
||||
|
@ -155,14 +158,14 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
|
|||
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
|
||||
struct bio *bio)
|
||||
{
|
||||
if (q->rq_qos)
|
||||
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
|
||||
__rq_qos_track(q->rq_qos, rq, bio);
|
||||
}
|
||||
|
||||
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
|
||||
struct bio *bio)
|
||||
{
|
||||
if (q->rq_qos) {
|
||||
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) {
|
||||
bio_set_flag(bio, BIO_QOS_MERGED);
|
||||
__rq_qos_merge(q->rq_qos, rq, bio);
|
||||
}
|
||||
|
@ -170,7 +173,7 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
|
|||
|
||||
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
|
||||
{
|
||||
if (q->rq_qos)
|
||||
if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
|
||||
__rq_qos_queue_depth_changed(q->rq_qos);
|
||||
}
|
||||
|
||||
|
|
|
@ -124,11 +124,6 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
|
|||
return 0;
|
||||
}
|
||||
|
||||
if (lim->features & BLK_FEAT_BOUNCE_HIGH) {
|
||||
pr_warn("no bounce buffer support for integrity metadata\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) {
|
||||
pr_warn("integrity support disabled.\n");
|
||||
return -EINVAL;
|
||||
|
|
|
@ -134,6 +134,8 @@ QUEUE_SYSFS_LIMIT_SHOW(max_segments)
|
|||
QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments)
|
||||
QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments)
|
||||
QUEUE_SYSFS_LIMIT_SHOW(max_segment_size)
|
||||
QUEUE_SYSFS_LIMIT_SHOW(max_write_streams)
|
||||
QUEUE_SYSFS_LIMIT_SHOW(write_stream_granularity)
|
||||
QUEUE_SYSFS_LIMIT_SHOW(logical_block_size)
|
||||
QUEUE_SYSFS_LIMIT_SHOW(physical_block_size)
|
||||
QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors)
|
||||
|
@ -488,6 +490,8 @@ QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
|
|||
QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments");
|
||||
QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
|
||||
QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size");
|
||||
QUEUE_LIM_RO_ENTRY(queue_max_write_streams, "max_write_streams");
|
||||
QUEUE_LIM_RO_ENTRY(queue_write_stream_granularity, "write_stream_granularity");
|
||||
QUEUE_RW_ENTRY(elv_iosched, "scheduler");
|
||||
|
||||
QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size");
|
||||
|
@ -560,7 +564,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
|
|||
ssize_t ret;
|
||||
struct request_queue *q = disk->queue;
|
||||
|
||||
mutex_lock(&q->elevator_lock);
|
||||
mutex_lock(&disk->rqos_state_mutex);
|
||||
if (!wbt_rq_qos(q)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
|
@ -573,7 +577,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
|
|||
|
||||
ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
|
||||
out:
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
mutex_unlock(&disk->rqos_state_mutex);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -593,7 +597,6 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
|
|||
return -EINVAL;
|
||||
|
||||
memflags = blk_mq_freeze_queue(q);
|
||||
mutex_lock(&q->elevator_lock);
|
||||
|
||||
rqos = wbt_rq_qos(q);
|
||||
if (!rqos) {
|
||||
|
@ -618,11 +621,12 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
|
|||
*/
|
||||
blk_mq_quiesce_queue(q);
|
||||
|
||||
mutex_lock(&disk->rqos_state_mutex);
|
||||
wbt_set_min_lat(q, val);
|
||||
mutex_unlock(&disk->rqos_state_mutex);
|
||||
|
||||
blk_mq_unquiesce_queue(q);
|
||||
out:
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
blk_mq_unfreeze_queue(q, memflags);
|
||||
|
||||
return ret;
|
||||
|
@ -642,6 +646,8 @@ static struct attribute *queue_attrs[] = {
|
|||
&queue_max_discard_segments_entry.attr,
|
||||
&queue_max_integrity_segments_entry.attr,
|
||||
&queue_max_segment_size_entry.attr,
|
||||
&queue_max_write_streams_entry.attr,
|
||||
&queue_write_stream_granularity_entry.attr,
|
||||
&queue_hw_sector_size_entry.attr,
|
||||
&queue_logical_block_size_entry.attr,
|
||||
&queue_physical_block_size_entry.attr,
|
||||
|
@ -869,16 +875,9 @@ int blk_register_queue(struct gendisk *disk)
|
|||
if (ret)
|
||||
goto out_unregister_ia_ranges;
|
||||
|
||||
mutex_lock(&q->elevator_lock);
|
||||
if (q->elevator) {
|
||||
ret = elv_register_queue(q, false);
|
||||
if (ret) {
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
goto out_crypto_sysfs_unregister;
|
||||
}
|
||||
}
|
||||
if (queue_is_mq(q))
|
||||
elevator_set_default(q);
|
||||
wbt_enable_default(disk);
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
|
||||
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
|
||||
|
||||
|
@ -902,8 +901,6 @@ int blk_register_queue(struct gendisk *disk)
|
|||
|
||||
return ret;
|
||||
|
||||
out_crypto_sysfs_unregister:
|
||||
blk_crypto_sysfs_unregister(disk);
|
||||
out_unregister_ia_ranges:
|
||||
disk_unregister_independent_access_ranges(disk);
|
||||
out_debugfs_remove:
|
||||
|
@ -951,10 +948,6 @@ void blk_unregister_queue(struct gendisk *disk)
|
|||
blk_mq_sysfs_unregister(disk);
|
||||
blk_crypto_sysfs_unregister(disk);
|
||||
|
||||
mutex_lock(&q->elevator_lock);
|
||||
elv_unregister_queue(q);
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
disk_unregister_independent_access_ranges(disk);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
@ -963,5 +956,8 @@ void blk_unregister_queue(struct gendisk *disk)
|
|||
kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE);
|
||||
kobject_del(&disk->queue_kobj);
|
||||
|
||||
if (queue_is_mq(q))
|
||||
elevator_set_none(q);
|
||||
|
||||
blk_debugfs_remove(disk);
|
||||
}
|
||||
|
|
|
@@ -143,7 +143,8 @@ static inline unsigned int throtl_bio_data_size(struct bio *bio)
static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
        INIT_LIST_HEAD(&qn->node);
        bio_list_init(&qn->bios);
        bio_list_init(&qn->bios_bps);
        bio_list_init(&qn->bios_iops);
        qn->tg = tg;
}

@@ -151,18 +152,32 @@ static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
 * @bio: bio being added
 * @qn: qnode to add bio to
 * @queued: the service_queue->queued[] list @qn belongs to
 * @sq: the service_queue @qn belongs to
 *
 * Add @bio to @qn and put @qn on @queued if it's not already on.
 * Add @bio to @qn and put @qn on @sq->queued if it's not already on.
 * @qn->tg's reference count is bumped when @qn is activated. See the
 * comment on top of throtl_qnode definition for details.
 */
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
                                 struct list_head *queued)
                                 struct throtl_service_queue *sq)
{
        bio_list_add(&qn->bios, bio);
        bool rw = bio_data_dir(bio);

        /*
         * Split bios have already been throttled by bps, so they are
         * directly queued into the iops path.
         */
        if (bio_flagged(bio, BIO_TG_BPS_THROTTLED) ||
            bio_flagged(bio, BIO_BPS_THROTTLED)) {
                bio_list_add(&qn->bios_iops, bio);
                sq->nr_queued_iops[rw]++;
        } else {
                bio_list_add(&qn->bios_bps, bio);
                sq->nr_queued_bps[rw]++;
        }

        if (list_empty(&qn->node)) {
                list_add_tail(&qn->node, queued);
                list_add_tail(&qn->node, &sq->queued[rw]);
                blkg_get(tg_to_blkg(qn->tg));
        }
}

@@ -170,6 +185,10 @@ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
/**
 * throtl_peek_queued - peek the first bio on a qnode list
 * @queued: the qnode list to peek
 *
 * Always take a bio from the head of the iops queue first. If the queue is
 * empty, we then take it from the bps queue to maintain the overall idea of
 * fetching bios from the head.
 */
static struct bio *throtl_peek_queued(struct list_head *queued)
{
@@ -180,28 +199,33 @@ static struct bio *throtl_peek_queued(struct list_head *queued)
                return NULL;

        qn = list_first_entry(queued, struct throtl_qnode, node);
        bio = bio_list_peek(&qn->bios);
        bio = bio_list_peek(&qn->bios_iops);
        if (!bio)
                bio = bio_list_peek(&qn->bios_bps);
        WARN_ON_ONCE(!bio);
        return bio;
}
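Taken together, the add and peek paths above form a two-stage FIFO per qnode: bios that still owe bps budget queue on bios_bps, bios that have already paid it (split bios carrying BIO_BPS_THROTTLED or BIO_TG_BPS_THROTTLED) go straight to bios_iops, and consumers always drain the iops list first. A compact userspace model of that ordering, with illustrative types only:

#include <stdbool.h>
#include <stdio.h>

#define QSZ 16

struct fifo { int buf[QSZ]; int head, tail; };

static void push(struct fifo *f, int v) { f->buf[f->tail++ % QSZ] = v; }
static bool empty(const struct fifo *f) { return f->head == f->tail; }
static int  pop(struct fifo *f)         { return f->buf[f->head++ % QSZ]; }

struct qnode { struct fifo bps, iops; };

/* enqueue: bios that already passed the bps stage go straight to iops */
static void qnode_add(struct qnode *qn, int bio, bool bps_throttled)
{
        if (bps_throttled)
                push(&qn->iops, bio);
        else
                push(&qn->bps, bio);
}

/* dequeue: always drain the iops queue first, then fall back to bps */
static int qnode_pop(struct qnode *qn)
{
        if (!empty(&qn->iops))
                return pop(&qn->iops);
        return pop(&qn->bps);
}

int main(void)
{
        struct qnode qn = {0};

        qnode_add(&qn, 1, false);       /* still owes bps budget       */
        qnode_add(&qn, 2, true);        /* split bio, bps already paid */

        int first = qnode_pop(&qn);
        int second = qnode_pop(&qn);
        printf("%d %d\n", first, second);       /* prints "2 1" */
        return 0;
}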
/**
 * throtl_pop_queued - pop the first bio from a qnode list
 * @queued: the qnode list to pop a bio from
 * @sq: the service_queue to pop a bio from
 * @tg_to_put: optional out argument for throtl_grp to put
 * @rw: read/write
 *
 * Pop the first bio from the qnode list @queued. After popping, the first
 * qnode is removed from @queued if empty or moved to the end of @queued so
 * that the popping order is round-robin.
 * Pop the first bio from the qnode list @sq->queued. Note that we look at
 * the iops list first, because bios are ultimately dispatched from it.
 * After popping, the first qnode is removed from @sq->queued if empty or moved
 * to the end of @sq->queued so that the popping order is round-robin.
 *
 * When the first qnode is removed, its associated throtl_grp should be put
 * too. If @tg_to_put is NULL, this function automatically puts it;
 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
 * responsible for putting it.
 */
|
||||
static struct bio *throtl_pop_queued(struct list_head *queued,
|
||||
struct throtl_grp **tg_to_put)
|
||||
static struct bio *throtl_pop_queued(struct throtl_service_queue *sq,
|
||||
struct throtl_grp **tg_to_put, bool rw)
|
||||
{
|
||||
struct list_head *queued = &sq->queued[rw];
|
||||
struct throtl_qnode *qn;
|
||||
struct bio *bio;
|
||||
|
||||
|
@ -209,10 +233,17 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
|
|||
return NULL;
|
||||
|
||||
qn = list_first_entry(queued, struct throtl_qnode, node);
|
||||
bio = bio_list_pop(&qn->bios);
|
||||
bio = bio_list_pop(&qn->bios_iops);
|
||||
if (bio) {
|
||||
sq->nr_queued_iops[rw]--;
|
||||
} else {
|
||||
bio = bio_list_pop(&qn->bios_bps);
|
||||
if (bio)
|
||||
sq->nr_queued_bps[rw]--;
|
||||
}
|
||||
WARN_ON_ONCE(!bio);
|
||||
|
||||
if (bio_list_empty(&qn->bios)) {
|
||||
if (bio_list_empty(&qn->bios_bps) && bio_list_empty(&qn->bios_iops)) {
|
||||
list_del_init(&qn->node);
|
||||
if (tg_to_put)
|
||||
*tg_to_put = qn->tg;
|
||||
|
@ -520,6 +551,9 @@ static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
|
|||
static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
|
||||
unsigned long jiffy_end)
|
||||
{
|
||||
if (!time_before(tg->slice_end[rw], jiffy_end))
|
||||
return;
|
||||
|
||||
throtl_set_slice_end(tg, rw, jiffy_end);
|
||||
throtl_log(&tg->service_queue,
|
||||
"[%c] extend slice start=%lu end=%lu jiffies=%lu",
|
||||
|
@ -536,6 +570,11 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
|
|||
return true;
|
||||
}
|
||||
|
||||
static unsigned int sq_queued(struct throtl_service_queue *sq, int type)
|
||||
{
|
||||
return sq->nr_queued_bps[type] + sq->nr_queued_iops[type];
|
||||
}
|
||||
|
||||
static unsigned int calculate_io_allowed(u32 iops_limit,
|
||||
unsigned long jiffy_elapsed)
|
||||
{
|
||||
|
@ -571,6 +610,48 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
|
|||
return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
|
||||
}
|
||||
|
||||
static long long throtl_trim_bps(struct throtl_grp *tg, bool rw,
|
||||
unsigned long time_elapsed)
|
||||
{
|
||||
u64 bps_limit = tg_bps_limit(tg, rw);
|
||||
long long bytes_trim;
|
||||
|
||||
if (bps_limit == U64_MAX)
|
||||
return 0;
|
||||
|
||||
/* Need to consider the case of bytes_allowed overflow. */
|
||||
bytes_trim = calculate_bytes_allowed(bps_limit, time_elapsed);
|
||||
if (bytes_trim <= 0 || tg->bytes_disp[rw] < bytes_trim) {
|
||||
bytes_trim = tg->bytes_disp[rw];
|
||||
tg->bytes_disp[rw] = 0;
|
||||
} else {
|
||||
tg->bytes_disp[rw] -= bytes_trim;
|
||||
}
|
||||
|
||||
return bytes_trim;
|
||||
}
|
||||
|
||||
static int throtl_trim_iops(struct throtl_grp *tg, bool rw,
|
||||
unsigned long time_elapsed)
|
||||
{
|
||||
u32 iops_limit = tg_iops_limit(tg, rw);
|
||||
int io_trim;
|
||||
|
||||
if (iops_limit == UINT_MAX)
|
||||
return 0;
|
||||
|
||||
/* Need to consider the case of io_allowed overflow. */
|
||||
io_trim = calculate_io_allowed(iops_limit, time_elapsed);
|
||||
if (io_trim <= 0 || tg->io_disp[rw] < io_trim) {
|
||||
io_trim = tg->io_disp[rw];
|
||||
tg->io_disp[rw] = 0;
|
||||
} else {
|
||||
tg->io_disp[rw] -= io_trim;
|
||||
}
|
||||
|
||||
return io_trim;
|
||||
}
|
||||
|
||||
/* Trim the used slices and adjust slice start accordingly */
|
||||
static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
|
||||
{
|
||||
|
@ -612,22 +693,11 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
|
|||
* one extra slice is preserved for deviation.
|
||||
*/
|
||||
time_elapsed -= tg->td->throtl_slice;
|
||||
bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw),
|
||||
time_elapsed);
|
||||
io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed);
|
||||
if (bytes_trim <= 0 && io_trim <= 0)
|
||||
bytes_trim = throtl_trim_bps(tg, rw, time_elapsed);
|
||||
io_trim = throtl_trim_iops(tg, rw, time_elapsed);
|
||||
if (!bytes_trim && !io_trim)
|
||||
return;
|
||||
|
||||
if ((long long)tg->bytes_disp[rw] >= bytes_trim)
|
||||
tg->bytes_disp[rw] -= bytes_trim;
|
||||
else
|
||||
tg->bytes_disp[rw] = 0;
|
||||
|
||||
if ((int)tg->io_disp[rw] >= io_trim)
|
||||
tg->io_disp[rw] -= io_trim;
|
||||
else
|
||||
tg->io_disp[rw] = 0;
|
||||
|
||||
tg->slice_start[rw] += time_elapsed;
|
||||
|
||||
throtl_log(&tg->service_queue,
|
||||
|
@ -643,21 +713,41 @@ static void __tg_update_carryover(struct throtl_grp *tg, bool rw,
|
|||
unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
|
||||
u64 bps_limit = tg_bps_limit(tg, rw);
|
||||
u32 iops_limit = tg_iops_limit(tg, rw);
|
||||
long long bytes_allowed;
|
||||
int io_allowed;
|
||||
|
||||
/*
|
||||
* If the queue is empty, carryover handling is not needed. In such cases,
|
||||
* tg->[bytes/io]_disp should be reset to 0 to avoid impacting the dispatch
|
||||
* of subsequent bios. The same handling applies when the previous BPS/IOPS
|
||||
* limit was set to max.
|
||||
*/
|
||||
if (sq_queued(&tg->service_queue, rw) == 0) {
|
||||
tg->bytes_disp[rw] = 0;
|
||||
tg->io_disp[rw] = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* If config is updated while bios are still throttled, calculate and
|
||||
* accumulate how many bytes/ios are waited across changes. And
|
||||
* carryover_bytes/ios will be used to calculate new wait time under new
|
||||
* configuration.
|
||||
* accumulate how many bytes/ios are waited across changes. And use the
|
||||
* calculated carryover (@bytes/@ios) to update [bytes/io]_disp, which
|
||||
* will be used to calculate new wait time under new configuration.
|
||||
* And we need to consider the case of bytes/io_allowed overflow.
|
||||
*/
|
||||
if (bps_limit != U64_MAX)
|
||||
*bytes = calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
|
||||
tg->bytes_disp[rw];
|
||||
if (iops_limit != UINT_MAX)
|
||||
*ios = calculate_io_allowed(iops_limit, jiffy_elapsed) -
|
||||
tg->io_disp[rw];
|
||||
tg->bytes_disp[rw] -= *bytes;
|
||||
tg->io_disp[rw] -= *ios;
|
||||
if (bps_limit != U64_MAX) {
|
||||
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed);
|
||||
if (bytes_allowed > 0)
|
||||
*bytes = bytes_allowed - tg->bytes_disp[rw];
|
||||
}
|
||||
if (iops_limit != UINT_MAX) {
|
||||
io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed);
|
||||
if (io_allowed > 0)
|
||||
*ios = io_allowed - tg->io_disp[rw];
|
||||
}
|
||||
|
||||
tg->bytes_disp[rw] = -*bytes;
|
||||
tg->io_disp[rw] = -*ios;
|
||||
}
|
||||
|
||||
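To make the carryover arithmetic above concrete (the numbers are illustrative): with a 1 MiB/s bps limit and 3 s elapsed in the current slice, calculate_bytes_allowed() yields 3 MiB. If the group has so far dispatched 2 MiB, the carryover is 3 MiB - 2 MiB = 1 MiB and bytes_disp is set to -1 MiB, i.e. one MiB of unused credit carried into the new configuration; if it had instead over-dispatched 4 MiB, the carryover is -1 MiB and bytes_disp becomes +1 MiB of debt that the new limit has to work off first. The IO-count side (io_disp) follows the same calculation with IOs instead of bytes.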
static void tg_update_carryover(struct throtl_grp *tg)
|
||||
|
@ -665,12 +755,10 @@ static void tg_update_carryover(struct throtl_grp *tg)
|
|||
long long bytes[2] = {0};
|
||||
int ios[2] = {0};
|
||||
|
||||
if (tg->service_queue.nr_queued[READ])
|
||||
__tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]);
|
||||
if (tg->service_queue.nr_queued[WRITE])
|
||||
__tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]);
|
||||
|
||||
/* see comments in struct throtl_grp for meaning of these fields. */
|
||||
/* see comments in struct throtl_grp for meaning of carryover. */
|
||||
throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__,
|
||||
bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]);
|
||||
}
|
||||
|
@ -682,10 +770,6 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio
|
|||
int io_allowed;
|
||||
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
|
||||
|
||||
if (iops_limit == UINT_MAX) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
jiffy_elapsed = jiffies - tg->slice_start[rw];
|
||||
|
||||
/* Round up to the next throttle slice, wait time must be nonzero */
|
||||
|
@ -711,11 +795,6 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
|
|||
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
|
||||
unsigned int bio_size = throtl_bio_data_size(bio);
|
||||
|
||||
/* no need to throttle if this bio's bytes have been accounted */
|
||||
if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
|
||||
|
||||
/* Slice has just started. Consider one slice interval */
|
||||
|
@ -724,7 +803,9 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
|
|||
|
||||
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
|
||||
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd);
|
||||
if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
|
||||
/* Need to consider the case of bytes_allowed overflow. */
|
||||
if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
|
||||
|| bytes_allowed < 0)
|
||||
return 0;
|
||||
|
||||
/* Calc approx time to dispatch */
|
||||
|
@ -742,17 +823,82 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
|
|||
return jiffy_wait;
|
||||
}
|
||||
|
||||
static void throtl_charge_bps_bio(struct throtl_grp *tg, struct bio *bio)
|
||||
{
|
||||
unsigned int bio_size = throtl_bio_data_size(bio);
|
||||
|
||||
/* Charge the bio to the group */
|
||||
if (!bio_flagged(bio, BIO_BPS_THROTTLED) &&
|
||||
!bio_flagged(bio, BIO_TG_BPS_THROTTLED)) {
|
||||
bio_set_flag(bio, BIO_TG_BPS_THROTTLED);
|
||||
tg->bytes_disp[bio_data_dir(bio)] += bio_size;
|
||||
}
|
||||
}
|
||||
|
||||
static void throtl_charge_iops_bio(struct throtl_grp *tg, struct bio *bio)
|
||||
{
|
||||
bio_clear_flag(bio, BIO_TG_BPS_THROTTLED);
|
||||
tg->io_disp[bio_data_dir(bio)]++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns whether one can dispatch a bio or not. Also returns approx number
|
||||
* of jiffies to wait before this bio is with-in IO rate and can be dispatched
|
||||
* If previous slice expired, start a new one otherwise renew/extend existing
|
||||
* slice to make sure it is at least throtl_slice interval long since now. New
|
||||
* slice is started only for empty throttle group. If there is queued bio, that
|
||||
* means there should be an active slice and it should be extended instead.
|
||||
*/
|
||||
static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
|
||||
unsigned long *wait)
|
||||
static void tg_update_slice(struct throtl_grp *tg, bool rw)
|
||||
{
|
||||
if (throtl_slice_used(tg, rw) &&
|
||||
sq_queued(&tg->service_queue, rw) == 0)
|
||||
throtl_start_new_slice(tg, rw, true);
|
||||
else
|
||||
throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice);
|
||||
}
|
||||
|
||||
static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio)
|
||||
{
|
||||
bool rw = bio_data_dir(bio);
|
||||
unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
|
||||
u64 bps_limit = tg_bps_limit(tg, rw);
|
||||
unsigned long bps_wait;
|
||||
|
||||
/* no need to throttle if this bio's bytes have been accounted */
|
||||
if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING ||
|
||||
bio_flagged(bio, BIO_BPS_THROTTLED) ||
|
||||
bio_flagged(bio, BIO_TG_BPS_THROTTLED))
|
||||
return 0;
|
||||
|
||||
tg_update_slice(tg, rw);
|
||||
bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
|
||||
throtl_extend_slice(tg, rw, jiffies + bps_wait);
|
||||
|
||||
return bps_wait;
|
||||
}
|
||||
|
||||
static unsigned long tg_dispatch_iops_time(struct throtl_grp *tg, struct bio *bio)
|
||||
{
|
||||
bool rw = bio_data_dir(bio);
|
||||
u32 iops_limit = tg_iops_limit(tg, rw);
|
||||
unsigned long iops_wait;
|
||||
|
||||
if (iops_limit == UINT_MAX || tg->flags & THROTL_TG_CANCELING)
|
||||
return 0;
|
||||
|
||||
tg_update_slice(tg, rw);
|
||||
iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
|
||||
throtl_extend_slice(tg, rw, jiffies + iops_wait);
|
||||
|
||||
return iops_wait;
|
||||
}
|
||||
|
||||
/*
 * Returns approx number of jiffies to wait before this bio is within the IO
 * rate limits and can be moved to another queue or dispatched.
 */
static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio)
|
||||
{
|
||||
bool rw = bio_data_dir(bio);
|
||||
unsigned long wait;
|
||||
|
||||
/*
|
||||
* Currently whole state machine of group depends on first bio
|
||||
|
@ -760,62 +906,20 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
|
|||
* this function with a different bio if there are other bios
|
||||
* queued.
|
||||
*/
|
||||
BUG_ON(tg->service_queue.nr_queued[rw] &&
|
||||
BUG_ON(sq_queued(&tg->service_queue, rw) &&
|
||||
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
|
||||
|
||||
/* If tg->bps = -1, then BW is unlimited */
|
||||
if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
|
||||
tg->flags & THROTL_TG_CANCELING) {
|
||||
if (wait)
|
||||
*wait = 0;
|
||||
return true;
|
||||
}
|
||||
wait = tg_dispatch_bps_time(tg, bio);
|
||||
if (wait != 0)
|
||||
return wait;
|
||||
|
||||
/*
|
||||
* If previous slice expired, start a new one otherwise renew/extend
|
||||
* existing slice to make sure it is at least throtl_slice interval
|
||||
* long since now. New slice is started only for empty throttle group.
|
||||
* If there is queued bio, that means there should be an active
|
||||
* slice and it should be extended instead.
|
||||
* Charge bps here because @bio will be directly placed into the
|
||||
* iops queue afterward.
|
||||
*/
|
||||
if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
|
||||
throtl_start_new_slice(tg, rw, true);
|
||||
else {
|
||||
if (time_before(tg->slice_end[rw],
|
||||
jiffies + tg->td->throtl_slice))
|
||||
throtl_extend_slice(tg, rw,
|
||||
jiffies + tg->td->throtl_slice);
|
||||
}
|
||||
throtl_charge_bps_bio(tg, bio);
|
||||
|
||||
bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
|
||||
iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
|
||||
if (bps_wait + iops_wait == 0) {
|
||||
if (wait)
|
||||
*wait = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
max_wait = max(bps_wait, iops_wait);
|
||||
|
||||
if (wait)
|
||||
*wait = max_wait;
|
||||
|
||||
if (time_before(tg->slice_end[rw], jiffies + max_wait))
|
||||
throtl_extend_slice(tg, rw, jiffies + max_wait);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
|
||||
{
|
||||
bool rw = bio_data_dir(bio);
|
||||
unsigned int bio_size = throtl_bio_data_size(bio);
|
||||
|
||||
/* Charge the bio to the group */
|
||||
if (!bio_flagged(bio, BIO_BPS_THROTTLED))
|
||||
tg->bytes_disp[rw] += bio_size;
|
||||
|
||||
tg->io_disp[rw]++;
|
||||
return tg_dispatch_iops_time(tg, bio);
|
||||
}
|
||||
|
||||
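A rough worked example of the two-stage wait computed above (illustrative numbers): a 1 MiB bio in a group limited to 512 KiB/s and 100 IOPS first gets roughly a 2 s wait from tg_dispatch_bps_time() and sits on the bps queue for it; once that wait is over the bio is charged via throtl_charge_bps_bio(), flagged as bps-throttled, and only then does tg_dispatch_iops_time() add a further wait if the slice's IO allowance is already used up. A split bio that arrives with BIO_BPS_THROTTLED set skips the bps stage entirely and is queued for iops straight away.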
/**
|
||||
|
@ -842,28 +946,36 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
|
|||
* dispatched. Mark that @tg was empty. This is automatically
|
||||
* cleared on the next tg_update_disptime().
|
||||
*/
|
||||
if (!sq->nr_queued[rw])
|
||||
if (sq_queued(sq, rw) == 0)
|
||||
tg->flags |= THROTL_TG_WAS_EMPTY;
|
||||
|
||||
throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
|
||||
throtl_qnode_add_bio(bio, qn, sq);
|
||||
|
||||
/*
|
||||
* Since we have split the queues, when the iops queue is
|
||||
* previously empty and a new @bio is added into the first @qn,
|
||||
* we also need to update the @tg->disptime.
|
||||
*/
|
||||
if (bio_flagged(bio, BIO_BPS_THROTTLED) &&
|
||||
bio == throtl_peek_queued(&sq->queued[rw]))
|
||||
tg->flags |= THROTL_TG_IOPS_WAS_EMPTY;
|
||||
|
||||
sq->nr_queued[rw]++;
|
||||
throtl_enqueue_tg(tg);
|
||||
}
|
||||
|
||||
static void tg_update_disptime(struct throtl_grp *tg)
|
||||
{
|
||||
struct throtl_service_queue *sq = &tg->service_queue;
|
||||
unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
|
||||
unsigned long read_wait = -1, write_wait = -1, min_wait, disptime;
|
||||
struct bio *bio;
|
||||
|
||||
bio = throtl_peek_queued(&sq->queued[READ]);
|
||||
if (bio)
|
||||
tg_may_dispatch(tg, bio, &read_wait);
|
||||
read_wait = tg_dispatch_time(tg, bio);
|
||||
|
||||
bio = throtl_peek_queued(&sq->queued[WRITE]);
|
||||
if (bio)
|
||||
tg_may_dispatch(tg, bio, &write_wait);
|
||||
write_wait = tg_dispatch_time(tg, bio);
|
||||
|
||||
min_wait = min(read_wait, write_wait);
|
||||
disptime = jiffies + min_wait;
|
||||
|
@ -875,6 +987,7 @@ static void tg_update_disptime(struct throtl_grp *tg)
|
|||
|
||||
/* see throtl_add_bio_tg() */
|
||||
tg->flags &= ~THROTL_TG_WAS_EMPTY;
|
||||
tg->flags &= ~THROTL_TG_IOPS_WAS_EMPTY;
|
||||
}
|
||||
|
||||
static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
|
||||
|
@ -901,10 +1014,9 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
|
|||
* getting released prematurely. Remember the tg to put and put it
|
||||
* after @bio is transferred to @parent_sq.
|
||||
*/
|
||||
bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
|
||||
sq->nr_queued[rw]--;
|
||||
bio = throtl_pop_queued(sq, &tg_to_put, rw);
|
||||
|
||||
throtl_charge_bio(tg, bio);
|
||||
throtl_charge_iops_bio(tg, bio);
|
||||
|
||||
/*
|
||||
* If our parent is another tg, we just need to transfer @bio to
|
||||
|
@ -919,7 +1031,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
|
|||
} else {
|
||||
bio_set_flag(bio, BIO_BPS_THROTTLED);
|
||||
throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
|
||||
&parent_sq->queued[rw]);
|
||||
parent_sq);
|
||||
BUG_ON(tg->td->nr_queued[rw] <= 0);
|
||||
tg->td->nr_queued[rw]--;
|
||||
}
|
||||
|
@ -941,7 +1053,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
|
|||
/* Try to dispatch 75% READS and 25% WRITES */
|
||||
|
||||
while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
|
||||
tg_may_dispatch(tg, bio, NULL)) {
|
||||
tg_dispatch_time(tg, bio) == 0) {
|
||||
|
||||
tg_dispatch_one_bio(tg, READ);
|
||||
nr_reads++;
|
||||
|
@ -951,7 +1063,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
|
|||
}
|
||||
|
||||
while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
|
||||
tg_may_dispatch(tg, bio, NULL)) {
|
||||
tg_dispatch_time(tg, bio) == 0) {
|
||||
|
||||
tg_dispatch_one_bio(tg, WRITE);
|
||||
nr_writes++;
|
||||
|
@ -984,7 +1096,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
|
|||
nr_disp += throtl_dispatch_tg(tg);
|
||||
|
||||
sq = &tg->service_queue;
|
||||
if (sq->nr_queued[READ] || sq->nr_queued[WRITE])
|
||||
if (sq_queued(sq, READ) || sq_queued(sq, WRITE))
|
||||
tg_update_disptime(tg);
|
||||
else
|
||||
throtl_dequeue_tg(tg);
|
||||
|
@ -1037,9 +1149,11 @@ again:
|
|||
dispatched = false;
|
||||
|
||||
while (true) {
|
||||
unsigned int __maybe_unused bio_cnt_r = sq_queued(sq, READ);
|
||||
unsigned int __maybe_unused bio_cnt_w = sq_queued(sq, WRITE);
|
||||
|
||||
throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
|
||||
sq->nr_queued[READ] + sq->nr_queued[WRITE],
|
||||
sq->nr_queued[READ], sq->nr_queued[WRITE]);
|
||||
bio_cnt_r + bio_cnt_w, bio_cnt_r, bio_cnt_w);
|
||||
|
||||
ret = throtl_select_dispatch(sq);
|
||||
if (ret) {
|
||||
|
@ -1061,7 +1175,8 @@ again:
|
|||
|
||||
        if (parent_sq) {
                /* @parent_sq is another throtl_grp, propagate dispatch */
|
||||
if (tg->flags & THROTL_TG_WAS_EMPTY) {
|
||||
if (tg->flags & THROTL_TG_WAS_EMPTY ||
|
||||
tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
|
||||
tg_update_disptime(tg);
|
||||
if (!throtl_schedule_next_dispatch(parent_sq, false)) {
|
||||
/* window is already open, repeat dispatching */
|
||||
|
@ -1101,7 +1216,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
|
|||
|
||||
spin_lock_irq(&q->queue_lock);
|
||||
for (rw = READ; rw <= WRITE; rw++)
|
||||
while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
|
||||
while ((bio = throtl_pop_queued(td_sq, NULL, rw)))
|
||||
bio_list_add(&bio_list_on_stack, bio);
|
||||
spin_unlock_irq(&q->queue_lock);
|
||||
|
||||
|
@ -1606,11 +1721,30 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
|
|||
|
||||
static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
|
||||
{
|
||||
/* throtl is FIFO - if bios are already queued, should queue */
|
||||
if (tg->service_queue.nr_queued[rw])
|
||||
return false;
|
||||
struct throtl_service_queue *sq = &tg->service_queue;
|
||||
|
||||
return tg_may_dispatch(tg, bio, NULL);
|
||||
/*
|
||||
* For a split bio, we need to specifically distinguish whether the
|
||||
* iops queue is empty.
|
||||
*/
|
||||
if (bio_flagged(bio, BIO_BPS_THROTTLED))
|
||||
return sq->nr_queued_iops[rw] == 0 &&
|
||||
tg_dispatch_iops_time(tg, bio) == 0;
|
||||
|
||||
/*
|
||||
* Throtl is FIFO - if bios are already queued, should queue.
|
||||
* If the bps queue is empty and @bio is within the bps limit, charge
|
||||
* bps here for direct placement into the iops queue.
|
||||
*/
|
||||
if (sq_queued(&tg->service_queue, rw)) {
|
||||
if (sq->nr_queued_bps[rw] == 0 &&
|
||||
tg_dispatch_bps_time(tg, bio) == 0)
|
||||
throtl_charge_bps_bio(tg, bio);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
return tg_dispatch_time(tg, bio) == 0;
|
||||
}
|
||||
|
||||
bool __blk_throtl_bio(struct bio *bio)
|
||||
|
@ -1631,7 +1765,7 @@ bool __blk_throtl_bio(struct bio *bio)
|
|||
while (true) {
|
||||
if (tg_within_limit(tg, bio, rw)) {
|
||||
/* within limits, let's charge and dispatch directly */
|
||||
throtl_charge_bio(tg, bio);
|
||||
throtl_charge_iops_bio(tg, bio);
|
||||
|
||||
/*
|
||||
* We need to trim slice even when bios are not being
|
||||
|
@ -1654,7 +1788,8 @@ bool __blk_throtl_bio(struct bio *bio)
|
|||
* control algorithm is adaptive, and extra IO bytes
|
||||
* will be throttled for paying the debt
|
||||
*/
|
||||
throtl_charge_bio(tg, bio);
|
||||
throtl_charge_bps_bio(tg, bio);
|
||||
throtl_charge_iops_bio(tg, bio);
|
||||
} else {
|
||||
/* if above limits, break to queue */
|
||||
break;
|
||||
|
@ -1680,7 +1815,7 @@ bool __blk_throtl_bio(struct bio *bio)
|
|||
tg->bytes_disp[rw], bio->bi_iter.bi_size,
|
||||
tg_bps_limit(tg, rw),
|
||||
tg->io_disp[rw], tg_iops_limit(tg, rw),
|
||||
sq->nr_queued[READ], sq->nr_queued[WRITE]);
|
||||
sq_queued(sq, READ), sq_queued(sq, WRITE));
|
||||
|
||||
td->nr_queued[rw]++;
|
||||
throtl_add_bio_tg(bio, qn, tg);
|
||||
|
@ -1688,11 +1823,13 @@ bool __blk_throtl_bio(struct bio *bio)
|
|||
|
||||
/*
|
||||
* Update @tg's dispatch time and force schedule dispatch if @tg
|
||||
* was empty before @bio. The forced scheduling isn't likely to
|
||||
* cause undue delay as @bio is likely to be dispatched directly if
|
||||
* its @tg's disptime is not in the future.
|
||||
* was empty before @bio, or the iops queue is empty and @bio will
|
||||
* add to. The forced scheduling isn't likely to cause undue
|
||||
* delay as @bio is likely to be dispatched directly if its @tg's
|
||||
* disptime is not in the future.
|
||||
*/
|
||||
if (tg->flags & THROTL_TG_WAS_EMPTY) {
|
||||
if (tg->flags & THROTL_TG_WAS_EMPTY ||
|
||||
tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
|
||||
tg_update_disptime(tg);
|
||||
throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
|
||||
}
|
||||
|
|
|
@ -29,7 +29,8 @@
|
|||
*/
|
||||
struct throtl_qnode {
|
||||
struct list_head node; /* service_queue->queued[] */
|
||||
struct bio_list bios; /* queued bios */
|
||||
struct bio_list bios_bps; /* queued bios for bps limit */
|
||||
struct bio_list bios_iops; /* queued bios for iops limit */
|
||||
struct throtl_grp *tg; /* tg this qnode belongs to */
|
||||
};
|
||||
|
||||
|
@ -41,7 +42,8 @@ struct throtl_service_queue {
|
|||
* children throtl_grp's.
|
||||
*/
|
||||
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
|
||||
unsigned int nr_queued[2]; /* number of queued bios */
|
||||
unsigned int nr_queued_bps[2]; /* number of queued bps bios */
|
||||
unsigned int nr_queued_iops[2]; /* number of queued iops bios */
|
||||
|
||||
/*
|
||||
* RB tree of active children throtl_grp's, which are sorted by
|
||||
|
@ -56,7 +58,12 @@ struct throtl_service_queue {
|
|||
enum tg_state_flags {
|
||||
THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
|
||||
THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
|
||||
THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */
|
||||
/*
|
||||
* The sq's iops queue is empty, and a bio is about to be enqueued
|
||||
* to the first qnode's bios_iops list.
|
||||
*/
|
||||
THROTL_TG_IOPS_WAS_EMPTY = 1 << 2,
|
||||
THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */
|
||||
};
|
||||
|
||||
struct throtl_grp {
|
||||
|
@ -102,19 +109,16 @@ struct throtl_grp {
|
|||
/* IOPS limits */
|
||||
unsigned int iops[2];
|
||||
|
||||
/* Number of bytes dispatched in current slice */
|
||||
int64_t bytes_disp[2];
|
||||
/* Number of bio's dispatched in current slice */
|
||||
int io_disp[2];
|
||||
|
||||
/*
|
||||
* The following two fields are updated when new configuration is
|
||||
* submitted while some bios are still throttled, they record how many
|
||||
* bytes/ios are waited already in previous configuration, and they will
|
||||
* be used to calculate wait time under new configuration.
|
||||
* Number of bytes/bio's dispatched in current slice.
|
||||
* When new configuration is submitted while some bios are still throttled,
|
||||
* first calculate the carryover: the amount of bytes/IOs already waited
|
||||
* under the previous configuration. Then, [bytes/io]_disp are represented
|
||||
* as the negative of the carryover, and they will be used to calculate the
|
||||
* wait time under the new configuration.
|
||||
*/
|
||||
long long carryover_bytes[2];
|
||||
int carryover_ios[2];
|
||||
int64_t bytes_disp[2];
|
||||
int io_disp[2];
|
||||
|
||||
unsigned long last_check_time;
|
||||
|
||||
|
|
|
@ -704,8 +704,9 @@ void wbt_enable_default(struct gendisk *disk)
|
|||
struct rq_qos *rqos;
|
||||
bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);
|
||||
|
||||
if (q->elevator &&
|
||||
test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags))
|
||||
mutex_lock(&disk->rqos_state_mutex);
|
||||
|
||||
if (blk_queue_disable_wbt(q))
|
||||
enable = false;
|
||||
|
||||
/* Throttling already enabled? */
|
||||
|
@ -713,8 +714,10 @@ void wbt_enable_default(struct gendisk *disk)
|
|||
if (rqos) {
|
||||
if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
|
||||
RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
|
||||
mutex_unlock(&disk->rqos_state_mutex);
|
||||
return;
|
||||
}
|
||||
mutex_unlock(&disk->rqos_state_mutex);
|
||||
|
||||
/* Queue not registered? Maybe shutting down... */
|
||||
if (!blk_queue_registered(q))
|
||||
|
@ -774,11 +777,13 @@ void wbt_disable_default(struct gendisk *disk)
|
|||
struct rq_wb *rwb;
|
||||
if (!rqos)
|
||||
return;
|
||||
mutex_lock(&disk->rqos_state_mutex);
|
||||
rwb = RQWB(rqos);
|
||||
if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
|
||||
blk_stat_deactivate(rwb->cb);
|
||||
rwb->enable_state = WBT_STATE_OFF_DEFAULT;
|
||||
}
|
||||
mutex_unlock(&disk->rqos_state_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(wbt_disable_default);
|
||||
|
||||
|
|
block/blk.h
|
@ -103,8 +103,7 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
|
|||
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
|
||||
|
||||
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
|
||||
struct page *page, unsigned len, unsigned offset,
|
||||
bool *same_page);
|
||||
struct page *page, unsigned len, unsigned offset);
|
||||
|
||||
static inline bool biovec_phys_mergeable(struct request_queue *q,
|
||||
struct bio_vec *vec1, struct bio_vec *vec2)
|
||||
|
@ -322,11 +321,9 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
|
|||
|
||||
bool blk_insert_flush(struct request *rq);
|
||||
|
||||
int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
|
||||
void elevator_disable(struct request_queue *q);
|
||||
void elevator_exit(struct request_queue *q);
|
||||
int elv_register_queue(struct request_queue *q, bool uevent);
|
||||
void elv_unregister_queue(struct request_queue *q);
|
||||
void elv_update_nr_hw_queues(struct request_queue *q);
|
||||
void elevator_set_default(struct request_queue *q);
|
||||
void elevator_set_none(struct request_queue *q);
|
||||
|
||||
ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
|
||||
char *buf);
|
||||
|
@ -407,6 +404,27 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio,
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * get_max_segment_size() - maximum number of bytes to add as a single segment
 * @lim: Request queue limits.
 * @paddr: address of the range to add
 * @len: maximum length available to add at @paddr
 *
 * Returns the maximum number of bytes of the range starting at @paddr that can
 * be added to a single segment.
 */
static inline unsigned get_max_segment_size(const struct queue_limits *lim,
                phys_addr_t paddr, unsigned int len)
{
        /*
         * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
         * after having calculated the minimum.
         */
        return min_t(unsigned long, len,
                min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr),
                    (unsigned long)lim->max_segment_size - 1) + 1);
}
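As a quick worked example of the helper above (illustrative values): with lim->seg_boundary_mask = 0xffff (a 64 KiB boundary), lim->max_segment_size = 64 KiB and paddr = 0x1f000, the distance to the next boundary is 0xffff - 0xf000 = 0x0fff, the inner min() picks that over 0xffff, and adding 1 gives 0x1000, so at most 4 KiB (further capped by @len) can be added before the segment would straddle the boundary. Doing the +1 only after the min() is what keeps the ULONG_MAX boundary-mask case from overflowing.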
|
||||
int ll_back_merge_fn(struct request *req, struct bio *bio,
|
||||
unsigned int nr_segs);
|
||||
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
|
||||
|
@ -421,7 +439,6 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi,
|
|||
int blk_dev_init(void);
|
||||
|
||||
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
|
||||
unsigned int part_in_flight(struct block_device *part);
|
||||
|
||||
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
|
||||
{
|
||||
|
@ -443,23 +460,6 @@ static inline void ioc_clear_queue(struct request_queue *q)
|
|||
}
|
||||
#endif /* CONFIG_BLK_ICQ */
|
||||
|
||||
struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
|
||||
|
||||
static inline bool blk_queue_may_bounce(struct request_queue *q)
|
||||
{
|
||||
return IS_ENABLED(CONFIG_BOUNCE) &&
|
||||
(q->limits.features & BLK_FEAT_BOUNCE_HIGH) &&
|
||||
max_low_pfn >= max_pfn;
|
||||
}
|
||||
|
||||
static inline struct bio *blk_queue_bounce(struct bio *bio,
|
||||
struct request_queue *q)
|
||||
{
|
||||
if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio)))
|
||||
return __blk_queue_bounce(bio, q);
|
||||
return bio;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
void disk_init_zone_resources(struct gendisk *disk);
|
||||
void disk_free_zone_resources(struct gendisk *disk);
|
||||
|
|
block/bounce.c
|
@ -1,267 +0,0 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
/* bounce buffer handling for block devices
 *
 * - Split from highmem.c
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/gfp.h>
#include <linux/bio-integrity.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/printk.h>
#include <asm/tlbflush.h>

#include <trace/events/block.h>
#include "blk.h"
#include "blk-cgroup.h"

#define POOL_SIZE	64
#define ISA_POOL_SIZE	16

static struct bio_set bounce_bio_set, bounce_bio_split;
static mempool_t page_pool;

static void init_bounce_bioset(void)
{
	static bool bounce_bs_setup;
	int ret;

	if (bounce_bs_setup)
		return;

	ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
	BUG_ON(ret);

	ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
	BUG_ON(ret);
	bounce_bs_setup = true;
}

static __init int init_emergency_pool(void)
{
	int ret;

#ifndef CONFIG_MEMORY_HOTPLUG
	if (max_pfn <= max_low_pfn)
		return 0;
#endif

	ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0);
	BUG_ON(ret);
	pr_info("pool size: %d pages\n", POOL_SIZE);

	init_bounce_bioset();
	return 0;
}

__initcall(init_emergency_pool);
|
||||
|
||||
/*
|
||||
* Simple bounce buffer support for highmem pages. Depending on the
|
||||
* queue gfp mask set, *to may or may not be a highmem page. kmap it
|
||||
* always, it will do the Right Thing
|
||||
*/
|
||||
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
|
||||
{
|
||||
struct bio_vec tovec, fromvec;
|
||||
struct bvec_iter iter;
|
||||
/*
|
||||
* The bio of @from is created by bounce, so we can iterate
|
||||
* its bvec from start to end, but the @from->bi_iter can't be
|
||||
* trusted because it might be changed by splitting.
|
||||
*/
|
||||
struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
|
||||
|
||||
bio_for_each_segment(tovec, to, iter) {
|
||||
fromvec = bio_iter_iovec(from, from_iter);
|
||||
if (tovec.bv_page != fromvec.bv_page) {
|
||||
/*
|
||||
* fromvec->bv_offset and fromvec->bv_len might have
|
||||
* been modified by the block layer, so use the original
|
||||
* copy, bounce_copy_vec already uses tovec->bv_len
|
||||
*/
|
||||
memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) +
|
||||
tovec.bv_offset);
|
||||
}
|
||||
bio_advance_iter(from, &from_iter, tovec.bv_len);
|
||||
}
|
||||
}
|
||||
|
||||
static void bounce_end_io(struct bio *bio)
|
||||
{
|
||||
struct bio *bio_orig = bio->bi_private;
|
||||
struct bio_vec *bvec, orig_vec;
|
||||
struct bvec_iter orig_iter = bio_orig->bi_iter;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
/*
|
||||
* free up bounce indirect pages used
|
||||
*/
|
||||
bio_for_each_segment_all(bvec, bio, iter_all) {
|
||||
orig_vec = bio_iter_iovec(bio_orig, orig_iter);
|
||||
if (bvec->bv_page != orig_vec.bv_page) {
|
||||
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
|
||||
mempool_free(bvec->bv_page, &page_pool);
|
||||
}
|
||||
bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
|
||||
}
|
||||
|
||||
bio_orig->bi_status = bio->bi_status;
|
||||
bio_endio(bio_orig);
|
||||
bio_put(bio);
|
||||
}
|
||||
|
||||
static void bounce_end_io_write(struct bio *bio)
|
||||
{
|
||||
bounce_end_io(bio);
|
||||
}
|
||||
|
||||
static void bounce_end_io_read(struct bio *bio)
|
||||
{
|
||||
struct bio *bio_orig = bio->bi_private;
|
||||
|
||||
if (!bio->bi_status)
|
||||
copy_to_high_bio_irq(bio_orig, bio);
|
||||
|
||||
bounce_end_io(bio);
|
||||
}
|
||||
|
||||
static struct bio *bounce_clone_bio(struct bio *bio_src)
|
||||
{
|
||||
struct bvec_iter iter;
|
||||
struct bio_vec bv;
|
||||
struct bio *bio;
|
||||
|
||||
/*
|
||||
* Pre immutable biovecs, __bio_clone() used to just do a memcpy from
|
||||
* bio_src->bi_io_vec to bio->bi_io_vec.
|
||||
*
|
||||
* We can't do that anymore, because:
|
||||
*
|
||||
* - The point of cloning the biovec is to produce a bio with a biovec
|
||||
* the caller can modify: bi_idx and bi_bvec_done should be 0.
|
||||
*
|
||||
* - The original bio could've had more than BIO_MAX_VECS biovecs; if
|
||||
* we tried to clone the whole thing bio_alloc_bioset() would fail.
|
||||
* But the clone should succeed as long as the number of biovecs we
|
||||
* actually need to allocate is fewer than BIO_MAX_VECS.
|
||||
*
|
||||
* - Lastly, bi_vcnt should not be looked at or relied upon by code
|
||||
* that does not own the bio - reason being drivers don't use it for
|
||||
* iterating over the biovec anymore, so expecting it to be kept up
|
||||
* to date (i.e. for clones that share the parent biovec) is just
|
||||
* asking for trouble and would force extra work.
|
||||
*/
|
||||
bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src),
|
||||
bio_src->bi_opf, GFP_NOIO, &bounce_bio_set);
|
||||
if (bio_flagged(bio_src, BIO_REMAPPED))
|
||||
bio_set_flag(bio, BIO_REMAPPED);
|
||||
bio->bi_ioprio = bio_src->bi_ioprio;
|
||||
bio->bi_write_hint = bio_src->bi_write_hint;
|
||||
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
|
||||
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
|
||||
|
||||
switch (bio_op(bio)) {
|
||||
case REQ_OP_DISCARD:
|
||||
case REQ_OP_SECURE_ERASE:
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
break;
|
||||
default:
|
||||
bio_for_each_segment(bv, bio_src, iter)
|
||||
bio->bi_io_vec[bio->bi_vcnt++] = bv;
|
||||
break;
|
||||
}
|
||||
|
||||
if (bio_crypt_clone(bio, bio_src, GFP_NOIO) < 0)
|
||||
goto err_put;
|
||||
|
||||
if (bio_integrity(bio_src) &&
|
||||
bio_integrity_clone(bio, bio_src, GFP_NOIO) < 0)
|
||||
goto err_put;
|
||||
|
||||
bio_clone_blkg_association(bio, bio_src);
|
||||
|
||||
return bio;
|
||||
|
||||
err_put:
|
||||
bio_put(bio);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q)
|
||||
{
|
||||
struct bio *bio;
|
||||
int rw = bio_data_dir(bio_orig);
|
||||
struct bio_vec *to, from;
|
||||
struct bvec_iter iter;
|
||||
unsigned i = 0, bytes = 0;
|
||||
bool bounce = false;
|
||||
int sectors;
|
||||
|
||||
bio_for_each_segment(from, bio_orig, iter) {
|
||||
if (i++ < BIO_MAX_VECS)
|
||||
bytes += from.bv_len;
|
||||
if (PageHighMem(from.bv_page))
|
||||
bounce = true;
|
||||
}
|
||||
if (!bounce)
|
||||
return bio_orig;
|
||||
|
||||
/*
|
||||
* Individual bvecs might not be logical block aligned. Round down
|
||||
* the split size so that each bio is properly block size aligned,
|
||||
* even if we do not use the full hardware limits.
|
||||
*/
|
||||
sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
|
||||
SECTOR_SHIFT;
|
||||
if (sectors < bio_sectors(bio_orig)) {
|
||||
bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
|
||||
bio_chain(bio, bio_orig);
|
||||
submit_bio_noacct(bio_orig);
|
||||
bio_orig = bio;
|
||||
}
|
||||
bio = bounce_clone_bio(bio_orig);
|
||||
|
||||
/*
|
||||
* Bvec table can't be updated by bio_for_each_segment_all(),
|
||||
* so retrieve bvec from the table directly. This way is safe
|
||||
* because the 'bio' is single-page bvec.
|
||||
*/
|
||||
for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
|
||||
struct page *bounce_page;
|
||||
|
||||
if (!PageHighMem(to->bv_page))
|
||||
continue;
|
||||
|
||||
bounce_page = mempool_alloc(&page_pool, GFP_NOIO);
|
||||
inc_zone_page_state(bounce_page, NR_BOUNCE);
|
||||
|
||||
if (rw == WRITE) {
|
||||
flush_dcache_page(to->bv_page);
|
||||
memcpy_from_bvec(page_address(bounce_page), to);
|
||||
}
|
||||
to->bv_page = bounce_page;
|
||||
}
|
||||
|
||||
trace_block_bio_bounce(bio_orig);
|
||||
|
||||
bio->bi_flags |= (1 << BIO_BOUNCED);
|
||||
|
||||
if (rw == READ)
|
||||
bio->bi_end_io = bounce_end_io_read;
|
||||
else
|
||||
bio->bi_end_io = bounce_end_io_write;
|
||||
|
||||
bio->bi_private = bio_orig;
|
||||
return bio;
|
||||
}
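
The ALIGN_DOWN() above is the subtle part of the bounce split: the byte count gathered from the first BIO_MAX_VECS bvecs is rounded down to the logical block size before being converted to sectors, so the front bio stays block aligned even when it does not use the full limit. A stand-alone sketch of that arithmetic, with made-up numbers rather than real queue limits:

#include <stdio.h>

#define SECTOR_SHIFT		9
#define ALIGN_DOWN(x, a)	((x) / (a) * (a))	/* (a) is a power of two here */

int main(void)
{
	unsigned int bytes = 130560;	/* bytes covered by the first BIO_MAX_VECS bvecs */
	unsigned int lbs = 4096;	/* queue_logical_block_size(q) */
	unsigned int sectors = ALIGN_DOWN(bytes, lbs) >> SECTOR_SHIFT;

	/* 130560 = 31 * 4096 + 3584, so the front bio is split at 31 * 8 = 248 sectors */
	printf("%u\n", sectors);
	return 0;
}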

block/elevator.c
@@ -45,6 +45,17 @@
#include "blk-wbt.h"
#include "blk-cgroup.h"

/* Holding context data for changing elevator */
struct elv_change_ctx {
	const char *name;
	bool no_uevent;

	/* for unregistering old elevator */
	struct elevator_queue *old;
	/* for registering new elevator */
	struct elevator_queue *new;
};
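
Every switch path below funnels through one of these contexts. As a hedged illustration of the calling convention (it mirrors what elevator_set_none() further down already does; the function name here is made up):

/*
 * Sketch only: how a caller inside block/elevator.c would drive a switch
 * through struct elv_change_ctx, following the elevator_set_none() pattern.
 */
static void example_switch_to_none(struct request_queue *q)
{
	struct elv_change_ctx ctx = {
		.name = "none",		/* scheduler name; "none" drops the elevator */
	};

	if (elevator_change(q, &ctx) < 0)
		pr_warn("example: switching to none failed\n");
}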

static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);

@ -148,18 +159,18 @@ static void elevator_release(struct kobject *kobj)
|
|||
kfree(e);
|
||||
}
|
||||
|
||||
void elevator_exit(struct request_queue *q)
|
||||
static void elevator_exit(struct request_queue *q)
|
||||
{
|
||||
struct elevator_queue *e = q->elevator;
|
||||
|
||||
lockdep_assert_held(&q->elevator_lock);
|
||||
|
||||
ioc_clear_queue(q);
|
||||
blk_mq_sched_free_rqs(q);
|
||||
|
||||
mutex_lock(&e->sysfs_lock);
|
||||
blk_mq_exit_sched(q, e);
|
||||
mutex_unlock(&e->sysfs_lock);
|
||||
|
||||
kobject_put(&e->kobj);
|
||||
}
|
||||
|
||||
static inline void __elv_rqhash_del(struct request *rq)
|
||||
|
@ -412,14 +423,15 @@ elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
|
|||
{
|
||||
const struct elv_fs_entry *entry = to_elv(attr);
|
||||
struct elevator_queue *e;
|
||||
ssize_t error;
|
||||
ssize_t error = -ENODEV;
|
||||
|
||||
if (!entry->show)
|
||||
return -EIO;
|
||||
|
||||
e = container_of(kobj, struct elevator_queue, kobj);
|
||||
mutex_lock(&e->sysfs_lock);
|
||||
error = e->type ? entry->show(e, page) : -ENOENT;
|
||||
if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags))
|
||||
error = entry->show(e, page);
|
||||
mutex_unlock(&e->sysfs_lock);
|
||||
return error;
|
||||
}
|
||||
|
@ -430,14 +442,15 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr,
|
|||
{
|
||||
const struct elv_fs_entry *entry = to_elv(attr);
|
||||
struct elevator_queue *e;
|
||||
ssize_t error;
|
||||
ssize_t error = -ENODEV;
|
||||
|
||||
if (!entry->store)
|
||||
return -EIO;
|
||||
|
||||
e = container_of(kobj, struct elevator_queue, kobj);
|
||||
mutex_lock(&e->sysfs_lock);
|
||||
error = e->type ? entry->store(e, page, length) : -ENOENT;
|
||||
if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags))
|
||||
error = entry->store(e, page, length);
|
||||
mutex_unlock(&e->sysfs_lock);
|
||||
return error;
|
||||
}
|
||||
|
@ -452,13 +465,12 @@ static const struct kobj_type elv_ktype = {
|
|||
.release = elevator_release,
|
||||
};
|
||||
|
||||
int elv_register_queue(struct request_queue *q, bool uevent)
|
||||
static int elv_register_queue(struct request_queue *q,
|
||||
struct elevator_queue *e,
|
||||
bool uevent)
|
||||
{
|
||||
struct elevator_queue *e = q->elevator;
|
||||
int error;
|
||||
|
||||
lockdep_assert_held(&q->elevator_lock);
|
||||
|
||||
error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
|
||||
if (!error) {
|
||||
const struct elv_fs_entry *attr = e->type->elevator_attrs;
|
||||
|
@ -472,20 +484,25 @@ int elv_register_queue(struct request_queue *q, bool uevent)
|
|||
if (uevent)
|
||||
kobject_uevent(&e->kobj, KOBJ_ADD);
|
||||
|
||||
/*
|
||||
* Sched is initialized, it is ready to export it via
|
||||
* debugfs
|
||||
*/
|
||||
blk_mq_sched_reg_debugfs(q);
|
||||
set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags);
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
void elv_unregister_queue(struct request_queue *q)
|
||||
static void elv_unregister_queue(struct request_queue *q,
|
||||
struct elevator_queue *e)
|
||||
{
|
||||
struct elevator_queue *e = q->elevator;
|
||||
|
||||
lockdep_assert_held(&q->elevator_lock);
|
||||
|
||||
if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
|
||||
kobject_uevent(&e->kobj, KOBJ_REMOVE);
|
||||
kobject_del(&e->kobj);
|
||||
|
||||
/* unexport via debugfs before exiting sched */
|
||||
blk_mq_sched_unreg_debugfs(q);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -548,42 +565,107 @@ void elv_unregister(struct elevator_type *e)
|
|||
EXPORT_SYMBOL_GPL(elv_unregister);
|
||||
|
||||
/*
|
||||
* For single queue devices, default to using mq-deadline. If we have multiple
|
||||
* queues or mq-deadline is not available, default to "none".
|
||||
* Switch to new_e io scheduler.
|
||||
*
|
||||
* If switching fails, we are most likely running out of memory and not able
|
||||
* to restore the old io scheduler, so leaving the io scheduler being none.
|
||||
*/
|
||||
static struct elevator_type *elevator_get_default(struct request_queue *q)
|
||||
static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
|
||||
{
|
||||
if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
|
||||
return NULL;
|
||||
struct elevator_type *new_e = NULL;
|
||||
int ret = 0;
|
||||
|
||||
if (q->nr_hw_queues != 1 &&
|
||||
!blk_mq_is_shared_tags(q->tag_set->flags))
|
||||
return NULL;
|
||||
WARN_ON_ONCE(q->mq_freeze_depth == 0);
|
||||
lockdep_assert_held(&q->elevator_lock);
|
||||
|
||||
return elevator_find_get("mq-deadline");
|
||||
if (strncmp(ctx->name, "none", 4)) {
|
||||
new_e = elevator_find_get(ctx->name);
|
||||
if (!new_e)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
blk_mq_quiesce_queue(q);
|
||||
|
||||
if (q->elevator) {
|
||||
ctx->old = q->elevator;
|
||||
elevator_exit(q);
|
||||
}
|
||||
|
||||
if (new_e) {
|
||||
ret = blk_mq_init_sched(q, new_e);
|
||||
if (ret)
|
||||
goto out_unfreeze;
|
||||
ctx->new = q->elevator;
|
||||
} else {
|
||||
blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
|
||||
q->elevator = NULL;
|
||||
q->nr_requests = q->tag_set->queue_depth;
|
||||
}
|
||||
blk_add_trace_msg(q, "elv switch: %s", ctx->name);
|
||||
|
||||
out_unfreeze:
|
||||
blk_mq_unquiesce_queue(q);
|
||||
|
||||
if (ret) {
|
||||
pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
|
||||
new_e->elevator_name);
|
||||
}
|
||||
|
||||
if (new_e)
|
||||
elevator_put(new_e);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void elv_exit_and_release(struct request_queue *q)
|
||||
{
|
||||
struct elevator_queue *e;
|
||||
unsigned memflags;
|
||||
|
||||
memflags = blk_mq_freeze_queue(q);
|
||||
mutex_lock(&q->elevator_lock);
|
||||
e = q->elevator;
|
||||
elevator_exit(q);
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
blk_mq_unfreeze_queue(q, memflags);
|
||||
if (e)
|
||||
kobject_put(&e->kobj);
|
||||
}
|
||||
|
||||
static int elevator_change_done(struct request_queue *q,
|
||||
struct elv_change_ctx *ctx)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
if (ctx->old) {
|
||||
bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT,
|
||||
&ctx->old->flags);
|
||||
|
||||
elv_unregister_queue(q, ctx->old);
|
||||
kobject_put(&ctx->old->kobj);
|
||||
if (enable_wbt)
|
||||
wbt_enable_default(q->disk);
|
||||
}
|
||||
if (ctx->new) {
|
||||
ret = elv_register_queue(q, ctx->new, !ctx->no_uevent);
|
||||
if (ret)
|
||||
elv_exit_and_release(q);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Use the default elevator settings. If the chosen elevator initialization
|
||||
* fails, fall back to the "none" elevator (no elevator).
|
||||
* Switch this queue to the given IO scheduler.
|
||||
*/
|
||||
void elevator_init_mq(struct request_queue *q)
|
||||
static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
|
||||
{
|
||||
struct elevator_type *e;
|
||||
unsigned int memflags;
|
||||
int err;
|
||||
int ret = 0;
|
||||
|
||||
WARN_ON_ONCE(blk_queue_registered(q));
|
||||
|
||||
if (unlikely(q->elevator))
|
||||
return;
|
||||
|
||||
e = elevator_get_default(q);
|
||||
if (!e)
|
||||
return;
|
||||
lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);
|
||||
|
||||
memflags = blk_mq_freeze_queue(q);
|
||||
/*
|
||||
* We are called before adding disk, when there isn't any FS I/O,
|
||||
* May be called before adding disk, when there isn't any FS I/O,
|
||||
* so freezing queue plus canceling dispatch work is enough to
|
||||
* drain any dispatch activities originated from passthrough
|
||||
* requests, then no need to quiesce queue which may add long boot
|
||||
|
@ -591,116 +673,86 @@ void elevator_init_mq(struct request_queue *q)
|
|||
*
|
||||
* Disk isn't added yet, so verifying queue lock only manually.
|
||||
*/
|
||||
memflags = blk_mq_freeze_queue(q);
|
||||
|
||||
blk_mq_cancel_work_sync(q);
|
||||
|
||||
err = blk_mq_init_sched(q, e);
|
||||
|
||||
mutex_lock(&q->elevator_lock);
|
||||
if (!(q->elevator && elevator_match(q->elevator->type, ctx->name)))
|
||||
ret = elevator_switch(q, ctx);
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
blk_mq_unfreeze_queue(q, memflags);
|
||||
|
||||
if (err) {
|
||||
pr_warn("\"%s\" elevator initialization failed, "
|
||||
"falling back to \"none\"\n", e->elevator_name);
|
||||
}
|
||||
|
||||
elevator_put(e);
|
||||
}
|
||||
|
||||
/*
|
||||
* Switch to new_e io scheduler.
|
||||
*
|
||||
* If switching fails, we are most likely running out of memory and not able
|
||||
* to restore the old io scheduler, so leaving the io scheduler being none.
|
||||
*/
|
||||
int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
|
||||
{
|
||||
unsigned int memflags;
|
||||
int ret;
|
||||
|
||||
lockdep_assert_held(&q->elevator_lock);
|
||||
|
||||
memflags = blk_mq_freeze_queue(q);
|
||||
blk_mq_quiesce_queue(q);
|
||||
|
||||
if (q->elevator) {
|
||||
elv_unregister_queue(q);
|
||||
elevator_exit(q);
|
||||
}
|
||||
|
||||
ret = blk_mq_init_sched(q, new_e);
|
||||
if (ret)
|
||||
goto out_unfreeze;
|
||||
|
||||
ret = elv_register_queue(q, true);
|
||||
if (ret) {
|
||||
elevator_exit(q);
|
||||
goto out_unfreeze;
|
||||
}
|
||||
blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
|
||||
|
||||
out_unfreeze:
|
||||
blk_mq_unquiesce_queue(q);
|
||||
blk_mq_unfreeze_queue(q, memflags);
|
||||
|
||||
if (ret) {
|
||||
pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
|
||||
new_e->elevator_name);
|
||||
}
|
||||
if (!ret)
|
||||
ret = elevator_change_done(q, ctx);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void elevator_disable(struct request_queue *q)
|
||||
/*
|
||||
* The I/O scheduler depends on the number of hardware queues, this forces a
|
||||
* reattachment when nr_hw_queues changes.
|
||||
*/
|
||||
void elv_update_nr_hw_queues(struct request_queue *q)
|
||||
{
|
||||
unsigned int memflags;
|
||||
struct elv_change_ctx ctx = {};
|
||||
int ret = -ENODEV;
|
||||
|
||||
lockdep_assert_held(&q->elevator_lock);
|
||||
WARN_ON_ONCE(q->mq_freeze_depth == 0);
|
||||
|
||||
memflags = blk_mq_freeze_queue(q);
|
||||
blk_mq_quiesce_queue(q);
|
||||
mutex_lock(&q->elevator_lock);
|
||||
if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) {
|
||||
ctx.name = q->elevator->type->elevator_name;
|
||||
|
||||
elv_unregister_queue(q);
|
||||
elevator_exit(q);
|
||||
blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
|
||||
q->elevator = NULL;
|
||||
q->nr_requests = q->tag_set->queue_depth;
|
||||
blk_add_trace_msg(q, "elv switch: none");
|
||||
|
||||
blk_mq_unquiesce_queue(q);
|
||||
blk_mq_unfreeze_queue(q, memflags);
|
||||
/* force to reattach elevator after nr_hw_queue is updated */
|
||||
ret = elevator_switch(q, &ctx);
|
||||
}
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
blk_mq_unfreeze_queue_nomemrestore(q);
|
||||
if (!ret)
|
||||
WARN_ON_ONCE(elevator_change_done(q, &ctx));
|
||||
}
|
||||
|
||||
/*
|
||||
* Switch this queue to the given IO scheduler.
|
||||
* Use the default elevator settings. If the chosen elevator initialization
|
||||
* fails, fall back to the "none" elevator (no elevator).
|
||||
*/
|
||||
static int elevator_change(struct request_queue *q, const char *elevator_name)
|
||||
void elevator_set_default(struct request_queue *q)
|
||||
{
|
||||
struct elevator_type *e;
|
||||
int ret;
|
||||
struct elv_change_ctx ctx = {
|
||||
.name = "mq-deadline",
|
||||
.no_uevent = true,
|
||||
};
|
||||
int err = 0;
|
||||
|
||||
/* Make sure queue is not in the middle of being removed */
|
||||
if (!blk_queue_registered(q))
|
||||
return -ENOENT;
|
||||
/* now we allow to switch elevator */
|
||||
blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q);
|
||||
|
||||
if (!strncmp(elevator_name, "none", 4)) {
|
||||
if (q->elevator)
|
||||
elevator_disable(q);
|
||||
return 0;
|
||||
if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
|
||||
return;
|
||||
|
||||
/*
|
||||
* For single queue devices, default to using mq-deadline. If we
|
||||
* have multiple queues or mq-deadline is not available, default
|
||||
* to "none".
|
||||
*/
|
||||
if (elevator_find_get(ctx.name) && (q->nr_hw_queues == 1 ||
|
||||
blk_mq_is_shared_tags(q->tag_set->flags)))
|
||||
err = elevator_change(q, &ctx);
|
||||
if (err < 0)
|
||||
pr_warn("\"%s\" elevator initialization, failed %d, "
|
||||
"falling back to \"none\"\n", ctx.name, err);
|
||||
}
|
||||
|
||||
if (q->elevator && elevator_match(q->elevator->type, elevator_name))
|
||||
return 0;
|
||||
void elevator_set_none(struct request_queue *q)
|
||||
{
|
||||
struct elv_change_ctx ctx = {
|
||||
.name = "none",
|
||||
};
|
||||
int err;
|
||||
|
||||
e = elevator_find_get(elevator_name);
|
||||
if (!e)
|
||||
return -EINVAL;
|
||||
ret = elevator_switch(q, e);
|
||||
elevator_put(e);
|
||||
return ret;
|
||||
err = elevator_change(q, &ctx);
|
||||
if (err < 0)
|
||||
pr_warn("%s: set none elevator failed %d\n", __func__, err);
|
||||
}
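
Both of the helpers above end up on the same path a sysfs write takes through elv_iosched_store(); as a hedged user-space illustration (the device name is a placeholder), a scheduler switch can be driven like this:

/*
 * User-space sketch, not kernel code: write a scheduler name to the queue's
 * sysfs attribute. "sda" is a placeholder device; adjust the path.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/sda/queue/scheduler", "w");

	if (!f) {
		perror("open scheduler attribute");
		return 1;
	}
	fprintf(f, "mq-deadline\n");	/* or "none" to drop the elevator */
	fclose(f);
	return 0;
}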
|
||||
|
||||
static void elv_iosched_load_module(char *elevator_name)
|
||||
static void elv_iosched_load_module(const char *elevator_name)
|
||||
{
|
||||
struct elevator_type *found;
|
||||
|
||||
|
@ -716,10 +768,14 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
|
|||
size_t count)
|
||||
{
|
||||
char elevator_name[ELV_NAME_MAX];
|
||||
char *name;
|
||||
struct elv_change_ctx ctx = {};
|
||||
int ret;
|
||||
unsigned int memflags;
|
||||
struct request_queue *q = disk->queue;
|
||||
struct blk_mq_tag_set *set = q->tag_set;
|
||||
|
||||
/* Make sure queue is not in the middle of being removed */
|
||||
if (!blk_queue_registered(q))
|
||||
return -ENOENT;
|
||||
|
||||
/*
|
||||
* If the attribute needs to load a module, do it before freezing the
|
||||
|
@ -727,24 +783,25 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
|
|||
* queue is the one for the device storing the module file.
|
||||
*/
|
||||
strscpy(elevator_name, buf, sizeof(elevator_name));
|
||||
name = strstrip(elevator_name);
|
||||
ctx.name = strstrip(elevator_name);
|
||||
|
||||
elv_iosched_load_module(name);
|
||||
elv_iosched_load_module(ctx.name);
|
||||
|
||||
memflags = blk_mq_freeze_queue(q);
|
||||
mutex_lock(&q->elevator_lock);
|
||||
ret = elevator_change(q, name);
|
||||
down_read(&set->update_nr_hwq_lock);
|
||||
if (!blk_queue_no_elv_switch(q)) {
|
||||
ret = elevator_change(q, &ctx);
|
||||
if (!ret)
|
||||
ret = count;
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
blk_mq_unfreeze_queue(q, memflags);
|
||||
} else {
|
||||
ret = -ENOENT;
|
||||
}
|
||||
up_read(&set->update_nr_hwq_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ssize_t elv_iosched_show(struct gendisk *disk, char *name)
|
||||
{
|
||||
struct request_queue *q = disk->queue;
|
||||
struct elevator_queue *eq = q->elevator;
|
||||
struct elevator_type *cur = NULL, *e;
|
||||
int len = 0;
|
||||
|
||||
|
@ -753,7 +810,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
|
|||
len += sprintf(name+len, "[none] ");
|
||||
} else {
|
||||
len += sprintf(name+len, "none ");
|
||||
cur = eq->type;
|
||||
cur = q->elevator->type;
|
||||
}
|
||||
|
||||
spin_lock(&elv_list_lock);
|
||||
|
|
|
@ -121,7 +121,8 @@ struct elevator_queue
|
|||
};
|
||||
|
||||
#define ELEVATOR_FLAG_REGISTERED 0
|
||||
#define ELEVATOR_FLAG_DISABLE_WBT 1
|
||||
#define ELEVATOR_FLAG_DYING 1
|
||||
#define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2
|
||||
|
||||
/*
|
||||
* block elevator interface
|
||||
|
@ -182,4 +183,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
|
|||
#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
|
||||
#define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist)
|
||||
|
||||
void blk_mq_sched_reg_debugfs(struct request_queue *q);
|
||||
void blk_mq_sched_unreg_debugfs(struct request_queue *q);
|
||||
|
||||
#endif /* _ELEVATOR_H */
|

block/fops.c
@@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
	}
	bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
	bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
	bio.bi_write_stream = iocb->ki_write_stream;
	bio.bi_ioprio = iocb->ki_ioprio;
	if (iocb->ki_flags & IOCB_ATOMIC)
		bio.bi_opf |= REQ_ATOMIC;

@@ -206,6 +207,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
	for (;;) {
		bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
		bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
		bio->bi_write_stream = iocb->ki_write_stream;
		bio->bi_private = dio;
		bio->bi_end_io = blkdev_bio_end_io;
		bio->bi_ioprio = iocb->ki_ioprio;

@@ -333,6 +335,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
	dio->iocb = iocb;
	bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
	bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
	bio->bi_write_stream = iocb->ki_write_stream;
	bio->bi_end_io = blkdev_bio_end_io_async;
	bio->bi_ioprio = iocb->ki_ioprio;

@@ -398,6 +401,26 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
	if (blkdev_dio_invalid(bdev, iocb, iter))
		return -EINVAL;

	if (iov_iter_rw(iter) == WRITE) {
		u16 max_write_streams = bdev_max_write_streams(bdev);

		if (iocb->ki_write_stream) {
			if (iocb->ki_write_stream > max_write_streams)
				return -EINVAL;
		} else if (max_write_streams) {
			enum rw_hint write_hint =
				file_inode(iocb->ki_filp)->i_write_hint;

			/*
			 * Just use the write hint as write stream for block
			 * device writes. This assumes no file system is
			 * mounted that would use the streams differently.
			 */
			if (write_hint <= max_write_streams)
				iocb->ki_write_stream = write_hint;
		}
	}
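
Per the comment above, a raw block device write reuses the inode's write-lifetime hint as the FDP write stream. A hedged user-space sketch of attaching such a hint before issuing O_DIRECT writes; the device path is a placeholder and the fallback defines are only for older libc headers:

/* User-space sketch: attach a write-lifetime hint to a raw block device fd. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#ifndef F_SET_RW_HINT
#define F_SET_RW_HINT		1036	/* F_LINUX_SPECIFIC_BASE + 12 */
#endif
#ifndef RWH_WRITE_LIFE_SHORT
#define RWH_WRITE_LIFE_SHORT	2
#endif

int main(void)
{
	uint64_t hint = RWH_WRITE_LIFE_SHORT;	/* small hint, small stream index */
	int fd = open("/dev/nvme0n1", O_WRONLY | O_DIRECT);

	if (fd < 0 || fcntl(fd, F_SET_RW_HINT, &hint) < 0) {
		perror("set rw hint");
		return 1;
	}
	/* O_DIRECT writes on fd can now be steered into a write stream */
	close(fd);
	return 0;
}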

	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
	if (likely(nr_pages <= BIO_MAX_VECS)) {
		if (is_sync_kiocb(iocb))

@@ -451,12 +474,13 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock,
static int blkdev_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	struct folio *folio = NULL;
	struct blk_plug plug;
	int err;

	blk_start_plug(&plug);
	err = write_cache_pages(mapping, wbc, block_write_full_folio,
			blkdev_get_block);
	while ((folio = writeback_iter(mapping, wbc, folio, &err)))
		err = block_write_full_folio(folio, wbc, blkdev_get_block);
	blk_finish_plug(&plug);

	return err;

block/genhd.c
|
|||
}
|
||||
}
|
||||
|
||||
unsigned int part_in_flight(struct block_device *part)
|
||||
{
|
||||
unsigned int inflight = 0;
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
|
||||
part_stat_local_read_cpu(part, in_flight[1], cpu);
|
||||
}
|
||||
if ((int)inflight < 0)
|
||||
inflight = 0;
|
||||
|
||||
return inflight;
|
||||
}
|
||||
|
||||
static void part_in_flight_rw(struct block_device *part,
|
||||
unsigned int inflight[2])
|
||||
static void bdev_count_inflight_rw(struct block_device *part,
|
||||
unsigned int inflight[2], bool mq_driver)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
inflight[0] = 0;
|
||||
inflight[1] = 0;
|
||||
if (mq_driver) {
|
||||
blk_mq_in_driver_rw(part, inflight);
|
||||
} else {
|
||||
for_each_possible_cpu(cpu) {
|
||||
inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
|
||||
inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
|
||||
inflight[READ] += part_stat_local_read_cpu(
|
||||
part, in_flight[READ], cpu);
|
||||
inflight[WRITE] += part_stat_local_read_cpu(
|
||||
part, in_flight[WRITE], cpu);
|
||||
}
|
||||
if ((int)inflight[0] < 0)
|
||||
inflight[0] = 0;
|
||||
if ((int)inflight[1] < 0)
|
||||
inflight[1] = 0;
|
||||
}
|
||||
|
||||
if (WARN_ON_ONCE((int)inflight[READ] < 0))
|
||||
inflight[READ] = 0;
|
||||
if (WARN_ON_ONCE((int)inflight[WRITE] < 0))
|
||||
inflight[WRITE] = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* bdev_count_inflight - get the number of inflight IOs for a block device.
|
||||
*
|
||||
* @part: the block device.
|
||||
*
|
||||
* Inflight here means started IO accounting, from bdev_start_io_acct() for
|
||||
* bio-based block device, and from blk_account_io_start() for rq-based block
|
||||
* device.
|
||||
*/
|
||||
unsigned int bdev_count_inflight(struct block_device *part)
|
||||
{
|
||||
unsigned int inflight[2] = {0};
|
||||
|
||||
bdev_count_inflight_rw(part, inflight, false);
|
||||
|
||||
return inflight[READ] + inflight[WRITE];
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bdev_count_inflight);
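
bdev_count_inflight() is exported for use outside genhd.c (this pull also reworks the bdev inflight counters on the MD side). A hedged sketch of a caller polling it; the function name and msleep-based loop are illustrative, and the declaration is assumed to live with the part_stat helpers:

/* Hedged sketch of a stacking-driver style idle check; not copied from md. */
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/part_stat.h>

static void example_wait_for_idle(struct block_device *member)
{
	/* started IO accounting that has not completed yet, reads + writes */
	while (bdev_count_inflight(member))
		msleep(10);
}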
|
||||
|
||||
/*
|
||||
* Can be deleted altogether. Later.
|
||||
*
|
||||
|
@ -389,17 +398,33 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
|
|||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* add_disk_fwnode - add disk information to kernel list with fwnode
|
||||
* @parent: parent device for the disk
|
||||
* @disk: per-device partitioning information
|
||||
* @groups: Additional per-device sysfs groups
|
||||
* @fwnode: attached disk fwnode
|
||||
*
|
||||
* This function registers the partitioning information in @disk
|
||||
* with the kernel. Also attach a fwnode to the disk device.
|
||||
static void add_disk_final(struct gendisk *disk)
|
||||
{
|
||||
struct device *ddev = disk_to_dev(disk);
|
||||
|
||||
if (!(disk->flags & GENHD_FL_HIDDEN)) {
|
||||
/* Make sure the first partition scan will proceed */
|
||||
if (get_capacity(disk) && disk_has_partscan(disk))
|
||||
set_bit(GD_NEED_PART_SCAN, &disk->state);
|
||||
|
||||
bdev_add(disk->part0, ddev->devt);
|
||||
if (get_capacity(disk))
|
||||
disk_scan_partitions(disk, BLK_OPEN_READ);
|
||||
|
||||
/*
|
||||
* Announce the disk and partitions after all partitions are
|
||||
* created. (for hidden disks uevents remain suppressed forever)
|
||||
*/
|
||||
int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
|
||||
dev_set_uevent_suppress(ddev, 0);
|
||||
disk_uevent(disk, KOBJ_ADD);
|
||||
}
|
||||
|
||||
blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
|
||||
disk_add_events(disk);
|
||||
set_bit(GD_ADDED, &disk->state);
|
||||
}
|
||||
|
||||
static int __add_disk(struct device *parent, struct gendisk *disk,
|
||||
const struct attribute_group **groups,
|
||||
struct fwnode_handle *fwnode)
|
||||
|
||||
|
@ -416,12 +441,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
|
|||
*/
|
||||
if (disk->fops->submit_bio || disk->fops->poll_bio)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Initialize the I/O scheduler code and pick a default one if
|
||||
* needed.
|
||||
*/
|
||||
elevator_init_mq(disk->queue);
|
||||
} else {
|
||||
if (!disk->fops->submit_bio)
|
||||
return -EINVAL;
|
||||
|
@ -438,7 +457,7 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
|
|||
ret = -EINVAL;
|
||||
if (disk->major) {
|
||||
if (WARN_ON(!disk->minors))
|
||||
goto out_exit_elevator;
|
||||
goto out;
|
||||
|
||||
if (disk->minors > DISK_MAX_PARTS) {
|
||||
pr_err("block: can't allocate more than %d partitions\n",
|
||||
|
@ -448,14 +467,14 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
|
|||
if (disk->first_minor > MINORMASK ||
|
||||
disk->minors > MINORMASK + 1 ||
|
||||
disk->first_minor + disk->minors > MINORMASK + 1)
|
||||
goto out_exit_elevator;
|
||||
goto out;
|
||||
} else {
|
||||
if (WARN_ON(disk->minors))
|
||||
goto out_exit_elevator;
|
||||
goto out;
|
||||
|
||||
ret = blk_alloc_ext_minor();
|
||||
if (ret < 0)
|
||||
goto out_exit_elevator;
|
||||
goto out;
|
||||
disk->major = BLOCK_EXT_MAJOR;
|
||||
disk->first_minor = ret;
|
||||
}
|
||||
|
@ -516,21 +535,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
|
|||
&disk->bdi->dev->kobj, "bdi");
|
||||
if (ret)
|
||||
goto out_unregister_bdi;
|
||||
|
||||
/* Make sure the first partition scan will proceed */
|
||||
if (get_capacity(disk) && disk_has_partscan(disk))
|
||||
set_bit(GD_NEED_PART_SCAN, &disk->state);
|
||||
|
||||
bdev_add(disk->part0, ddev->devt);
|
||||
if (get_capacity(disk))
|
||||
disk_scan_partitions(disk, BLK_OPEN_READ);
|
||||
|
||||
/*
|
||||
* Announce the disk and partitions after all partitions are
|
||||
* created. (for hidden disks uevents remain suppressed forever)
|
||||
*/
|
||||
dev_set_uevent_suppress(ddev, 0);
|
||||
disk_uevent(disk, KOBJ_ADD);
|
||||
} else {
|
||||
/*
|
||||
* Even if the block_device for a hidden gendisk is not
|
||||
|
@ -539,10 +543,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
|
|||
*/
|
||||
disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor);
|
||||
}
|
||||
|
||||
blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
|
||||
disk_add_events(disk);
|
||||
set_bit(GD_ADDED, &disk->state);
|
||||
return 0;
|
||||
|
||||
out_unregister_bdi:
|
||||
|
@ -564,12 +564,46 @@ out_device_del:
|
|||
out_free_ext_minor:
|
||||
if (disk->major == BLOCK_EXT_MAJOR)
|
||||
blk_free_ext_minor(disk->first_minor);
|
||||
out_exit_elevator:
|
||||
if (disk->queue->elevator) {
|
||||
mutex_lock(&disk->queue->elevator_lock);
|
||||
elevator_exit(disk->queue);
|
||||
mutex_unlock(&disk->queue->elevator_lock);
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* add_disk_fwnode - add disk information to kernel list with fwnode
|
||||
* @parent: parent device for the disk
|
||||
* @disk: per-device partitioning information
|
||||
* @groups: Additional per-device sysfs groups
|
||||
* @fwnode: attached disk fwnode
|
||||
*
|
||||
* This function registers the partitioning information in @disk
|
||||
* with the kernel. Also attach a fwnode to the disk device.
|
||||
*/
|
||||
int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
|
||||
const struct attribute_group **groups,
|
||||
struct fwnode_handle *fwnode)
|
||||
{
|
||||
struct blk_mq_tag_set *set;
|
||||
unsigned int memflags;
|
||||
int ret;
|
||||
|
||||
if (queue_is_mq(disk->queue)) {
|
||||
set = disk->queue->tag_set;
|
||||
memflags = memalloc_noio_save();
|
||||
down_read(&set->update_nr_hwq_lock);
|
||||
ret = __add_disk(parent, disk, groups, fwnode);
|
||||
up_read(&set->update_nr_hwq_lock);
|
||||
memalloc_noio_restore(memflags);
|
||||
} else {
|
||||
ret = __add_disk(parent, disk, groups, fwnode);
|
||||
}
|
||||
|
||||
/*
|
||||
* add_disk_final() needn't read `nr_hw_queues`, so move it out
|
||||
* of read lock `set->update_nr_hwq_lock` for avoiding unnecessary
|
||||
* lock dependency on `disk->open_mutex` from scanning partition.
|
||||
*/
|
||||
if (!ret)
|
||||
add_disk_final(disk);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(add_disk_fwnode);
|
||||
|
@ -652,26 +686,7 @@ void blk_mark_disk_dead(struct gendisk *disk)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
|
||||
|
||||
/**
|
||||
* del_gendisk - remove the gendisk
|
||||
* @disk: the struct gendisk to remove
|
||||
*
|
||||
* Removes the gendisk and all its associated resources. This deletes the
|
||||
* partitions associated with the gendisk, and unregisters the associated
|
||||
* request_queue.
|
||||
*
|
||||
* This is the counter to the respective __device_add_disk() call.
|
||||
*
|
||||
* The final removal of the struct gendisk happens when its refcount reaches 0
|
||||
* with put_disk(), which should be called after del_gendisk(), if
|
||||
* __device_add_disk() was used.
|
||||
*
|
||||
* Drivers exist which depend on the release of the gendisk to be synchronous,
|
||||
* it should not be deferred.
|
||||
*
|
||||
* Context: can sleep
|
||||
*/
|
||||
void del_gendisk(struct gendisk *disk)
|
||||
static void __del_gendisk(struct gendisk *disk)
|
||||
{
|
||||
struct request_queue *q = disk->queue;
|
||||
struct block_device *part;
|
||||
|
@ -743,14 +758,7 @@ void del_gendisk(struct gendisk *disk)
|
|||
if (queue_is_mq(q))
|
||||
blk_mq_cancel_work_sync(q);
|
||||
|
||||
blk_mq_quiesce_queue(q);
|
||||
if (q->elevator) {
|
||||
mutex_lock(&q->elevator_lock);
|
||||
elevator_exit(q);
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
}
|
||||
rq_qos_exit(q);
|
||||
blk_mq_unquiesce_queue(q);
|
||||
|
||||
/*
|
||||
* If the disk does not own the queue, allow using passthrough requests
|
||||
|
@ -764,6 +772,55 @@ void del_gendisk(struct gendisk *disk)
|
|||
if (start_drain)
|
||||
blk_unfreeze_release_lock(q);
|
||||
}
|
||||
|
||||
static void disable_elv_switch(struct request_queue *q)
|
||||
{
|
||||
struct blk_mq_tag_set *set = q->tag_set;
|
||||
WARN_ON_ONCE(!queue_is_mq(q));
|
||||
|
||||
down_write(&set->update_nr_hwq_lock);
|
||||
blk_queue_flag_set(QUEUE_FLAG_NO_ELV_SWITCH, q);
|
||||
up_write(&set->update_nr_hwq_lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* del_gendisk - remove the gendisk
|
||||
* @disk: the struct gendisk to remove
|
||||
*
|
||||
* Removes the gendisk and all its associated resources. This deletes the
|
||||
* partitions associated with the gendisk, and unregisters the associated
|
||||
* request_queue.
|
||||
*
|
||||
* This is the counter to the respective __device_add_disk() call.
|
||||
*
|
||||
* The final removal of the struct gendisk happens when its refcount reaches 0
|
||||
* with put_disk(), which should be called after del_gendisk(), if
|
||||
* __device_add_disk() was used.
|
||||
*
|
||||
* Drivers exist which depend on the release of the gendisk to be synchronous,
|
||||
* it should not be deferred.
|
||||
*
|
||||
* Context: can sleep
|
||||
*/
|
||||
void del_gendisk(struct gendisk *disk)
|
||||
{
|
||||
struct blk_mq_tag_set *set;
|
||||
unsigned int memflags;
|
||||
|
||||
if (!queue_is_mq(disk->queue)) {
|
||||
__del_gendisk(disk);
|
||||
} else {
|
||||
set = disk->queue->tag_set;
|
||||
|
||||
disable_elv_switch(disk->queue);
|
||||
|
||||
memflags = memalloc_noio_save();
|
||||
down_read(&set->update_nr_hwq_lock);
|
||||
__del_gendisk(disk);
|
||||
up_read(&set->update_nr_hwq_lock);
|
||||
memalloc_noio_restore(memflags);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(del_gendisk);
|
||||
|
||||
/**
|
||||
|
@ -1005,7 +1062,7 @@ ssize_t part_stat_show(struct device *dev,
|
|||
struct disk_stats stat;
|
||||
unsigned int inflight;
|
||||
|
||||
inflight = part_in_flight(bdev);
|
||||
inflight = bdev_count_inflight(bdev);
|
||||
if (inflight) {
|
||||
part_stat_lock();
|
||||
update_io_ticks(bdev, jiffies, true);
|
||||
|
@ -1042,19 +1099,21 @@ ssize_t part_stat_show(struct device *dev,
|
|||
(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
|
||||
}
|
||||
|
||||
/*
|
||||
* Show the number of IOs issued to driver.
|
||||
* For bio-based device, started from bdev_start_io_acct();
|
||||
* For rq-based device, started from blk_mq_start_request();
|
||||
*/
|
||||
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct block_device *bdev = dev_to_bdev(dev);
|
||||
struct request_queue *q = bdev_get_queue(bdev);
|
||||
unsigned int inflight[2];
|
||||
unsigned int inflight[2] = {0};
|
||||
|
||||
if (queue_is_mq(q))
|
||||
blk_mq_in_flight_rw(q, bdev, inflight);
|
||||
else
|
||||
part_in_flight_rw(bdev, inflight);
|
||||
bdev_count_inflight_rw(bdev, inflight, queue_is_mq(q));
|
||||
|
||||
return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]);
|
||||
return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]);
|
||||
}
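
The attribute above is what userspace sees as /sys/block/<dev>/inflight: two counters, reads then writes. A small user-space sketch of reading it (the device name is a placeholder):

/* User-space sketch: parse the two counters exposed by part_inflight_show(). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/sda/inflight", "r");
	unsigned int reads, writes;

	if (!f) {
		perror("open inflight");
		return 1;
	}
	if (fscanf(f, "%u %u", &reads, &writes) == 2)
		printf("inflight: %u reads, %u writes\n", reads, writes);
	fclose(f);
	return 0;
}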
|
||||
|
||||
static ssize_t disk_capability_show(struct device *dev,
|
||||
|
@ -1307,7 +1366,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
|
|||
if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
|
||||
continue;
|
||||
|
||||
inflight = part_in_flight(hd);
|
||||
inflight = bdev_count_inflight(hd);
|
||||
if (inflight) {
|
||||
part_stat_lock();
|
||||
update_io_ticks(hd, jiffies, true);
|
||||
|
@ -1422,6 +1481,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
|
|||
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
|
||||
INIT_LIST_HEAD(&disk->slave_bdevs);
|
||||
#endif
|
||||
mutex_init(&disk->rqos_state_mutex);
|
||||
return disk;
|
||||
|
||||
out_erase_part0:
|
||||
|
|
|
@ -715,7 +715,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
|
|||
}
|
||||
|
||||
/*
|
||||
* Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list().
|
||||
* Called from blk_mq_insert_request() or blk_mq_dispatch_list().
|
||||
*/
|
||||
static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
|
||||
struct list_head *list,
|
||||
|
|
|
@ -468,7 +468,7 @@ static ssize_t node_read_meminfo(struct device *dev,
|
|||
nid, K(node_page_state(pgdat, NR_PAGETABLE)),
|
||||
nid, K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
|
||||
nid, 0UL,
|
||||
nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
|
||||
nid, 0UL,
|
||||
nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
|
||||
nid, K(sreclaimable +
|
||||
node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)),
|
||||
|
|
|
@ -407,4 +407,23 @@ config BLKDEV_UBLK_LEGACY_OPCODES
|
|||
|
||||
source "drivers/block/rnbd/Kconfig"
|
||||
|
||||
config BLK_DEV_ZONED_LOOP
|
||||
tristate "Zoned loopback device support"
|
||||
depends on BLK_DEV_ZONED
|
||||
help
|
||||
Saying Y here will allow you to create a zoned block device using
regular files for zones (one file per zone). This is useful to test
|
||||
file systems, device mapper and applications that support zoned block
|
||||
devices. To create a zoned loop device, no user utility is needed, a
|
||||
zoned loop device can be created (or re-started) using a command
|
||||
like:
|
||||
|
||||
echo "add id=0,zone_size_mb=256,capacity_mb=16384,conv_zones=11" > \
|
||||
/dev/zloop-control
|
||||
|
||||
See Documentation/admin-guide/blockdev/zoned_loop.rst for usage
|
||||
details.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
endif # BLK_DEV
|
||||
|
|
|
@ -41,5 +41,6 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
|
|||
obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
|
||||
|
||||
obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
|
||||
obj-$(CONFIG_BLK_DEV_ZONED_LOOP) += zloop.o
|
||||
|
||||
swim_mod-y := swim.o swim_asm.o
|
||||
|
|
|
@ -54,32 +54,33 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
|
|||
/*
|
||||
* Insert a new page for a given sector, if one does not already exist.
|
||||
*/
|
||||
static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
|
||||
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
|
||||
blk_opf_t opf)
|
||||
__releases(rcu)
|
||||
__acquires(rcu)
|
||||
{
|
||||
pgoff_t idx = sector >> PAGE_SECTORS_SHIFT;
|
||||
struct page *page;
|
||||
int ret = 0;
|
||||
|
||||
page = brd_lookup_page(brd, sector);
|
||||
if (page)
|
||||
return 0;
|
||||
gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
|
||||
struct page *page, *ret;
|
||||
|
||||
rcu_read_unlock();
|
||||
page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
|
||||
rcu_read_lock();
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
xa_lock(&brd->brd_pages);
|
||||
ret = __xa_insert(&brd->brd_pages, idx, page, gfp);
|
||||
if (!ret)
|
||||
ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
|
||||
page, gfp);
|
||||
if (ret) {
|
||||
xa_unlock(&brd->brd_pages);
|
||||
__free_page(page);
|
||||
if (xa_is_err(ret))
|
||||
return ERR_PTR(xa_err(ret));
|
||||
return ret;
|
||||
}
|
||||
brd->brd_nr_pages++;
|
||||
xa_unlock(&brd->brd_pages);
|
||||
|
||||
if (ret < 0) {
|
||||
__free_page(page);
|
||||
if (ret == -EBUSY)
|
||||
ret = 0;
|
||||
}
|
||||
return ret;
|
||||
return page;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -100,143 +101,77 @@ static void brd_free_pages(struct brd_device *brd)
|
|||
}
|
||||
|
||||
/*
|
||||
* copy_to_brd_setup must be called before copy_to_brd. It may sleep.
|
||||
* Process a single segment. The segment is capped to not cross page boundaries
|
||||
* in both the bio and the brd backing memory.
|
||||
*/
|
||||
static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n,
|
||||
gfp_t gfp)
|
||||
{
|
||||
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
|
||||
size_t copy;
|
||||
int ret;
|
||||
|
||||
copy = min_t(size_t, n, PAGE_SIZE - offset);
|
||||
ret = brd_insert_page(brd, sector, gfp);
|
||||
if (ret)
|
||||
return ret;
|
||||
if (copy < n) {
|
||||
sector += copy >> SECTOR_SHIFT;
|
||||
ret = brd_insert_page(brd, sector, gfp);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy n bytes from src to the brd starting at sector. Does not sleep.
|
||||
*/
|
||||
static void copy_to_brd(struct brd_device *brd, const void *src,
|
||||
sector_t sector, size_t n)
|
||||
static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
|
||||
{
|
||||
struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
|
||||
sector_t sector = bio->bi_iter.bi_sector;
|
||||
u32 offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
|
||||
blk_opf_t opf = bio->bi_opf;
|
||||
struct page *page;
|
||||
void *dst;
|
||||
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
|
||||
size_t copy;
|
||||
void *kaddr;
|
||||
|
||||
copy = min_t(size_t, n, PAGE_SIZE - offset);
|
||||
bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
|
||||
|
||||
rcu_read_lock();
|
||||
page = brd_lookup_page(brd, sector);
|
||||
BUG_ON(!page);
|
||||
|
||||
dst = kmap_atomic(page);
|
||||
memcpy(dst + offset, src, copy);
|
||||
kunmap_atomic(dst);
|
||||
|
||||
if (copy < n) {
|
||||
src += copy;
|
||||
sector += copy >> SECTOR_SHIFT;
|
||||
copy = n - copy;
|
||||
page = brd_lookup_page(brd, sector);
|
||||
BUG_ON(!page);
|
||||
|
||||
dst = kmap_atomic(page);
|
||||
memcpy(dst, src, copy);
|
||||
kunmap_atomic(dst);
|
||||
}
|
||||
if (!page && op_is_write(opf)) {
|
||||
page = brd_insert_page(brd, sector, opf);
|
||||
if (IS_ERR(page))
|
||||
goto out_error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy n bytes to dst from the brd starting at sector. Does not sleep.
|
||||
*/
|
||||
static void copy_from_brd(void *dst, struct brd_device *brd,
|
||||
sector_t sector, size_t n)
|
||||
{
|
||||
struct page *page;
|
||||
void *src;
|
||||
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
|
||||
size_t copy;
|
||||
|
||||
copy = min_t(size_t, n, PAGE_SIZE - offset);
|
||||
page = brd_lookup_page(brd, sector);
|
||||
if (page) {
|
||||
src = kmap_atomic(page);
|
||||
memcpy(dst, src + offset, copy);
|
||||
kunmap_atomic(src);
|
||||
} else
|
||||
memset(dst, 0, copy);
|
||||
|
||||
if (copy < n) {
|
||||
dst += copy;
|
||||
sector += copy >> SECTOR_SHIFT;
|
||||
copy = n - copy;
|
||||
page = brd_lookup_page(brd, sector);
|
||||
if (page) {
|
||||
src = kmap_atomic(page);
|
||||
memcpy(dst, src, copy);
|
||||
kunmap_atomic(src);
|
||||
} else
|
||||
memset(dst, 0, copy);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Process a single bvec of a bio.
|
||||
*/
|
||||
static int brd_do_bvec(struct brd_device *brd, struct page *page,
|
||||
unsigned int len, unsigned int off, blk_opf_t opf,
|
||||
sector_t sector)
|
||||
{
|
||||
void *mem;
|
||||
int err = 0;
|
||||
|
||||
kaddr = bvec_kmap_local(&bv);
|
||||
if (op_is_write(opf)) {
|
||||
/*
|
||||
* Must use NOIO because we don't want to recurse back into the
|
||||
* block or filesystem layers from page reclaim.
|
||||
*/
|
||||
gfp_t gfp = opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO;
|
||||
|
||||
err = copy_to_brd_setup(brd, sector, len, gfp);
|
||||
if (err)
|
||||
goto out;
|
||||
}
|
||||
|
||||
mem = kmap_atomic(page);
|
||||
if (!op_is_write(opf)) {
|
||||
copy_from_brd(mem + off, brd, sector, len);
|
||||
flush_dcache_page(page);
|
||||
memcpy_to_page(page, offset, kaddr, bv.bv_len);
|
||||
} else {
|
||||
flush_dcache_page(page);
|
||||
copy_to_brd(brd, mem + off, sector, len);
|
||||
if (page)
|
||||
memcpy_from_page(kaddr, page, offset, bv.bv_len);
|
||||
else
|
||||
memset(kaddr, 0, bv.bv_len);
|
||||
}
|
||||
kunmap_atomic(mem);
|
||||
kunmap_local(kaddr);
|
||||
rcu_read_unlock();
|
||||
|
||||
out:
|
||||
return err;
|
||||
bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
|
||||
return true;
|
||||
|
||||
out_error:
|
||||
rcu_read_unlock();
|
||||
if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
|
||||
bio_wouldblock_error(bio);
|
||||
else
|
||||
bio_io_error(bio);
|
||||
return false;
|
||||
}
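
The clamp of bv.bv_len to PAGE_SIZE - offset above is what lets brd_rw_bvec() touch at most one backing page per pass; brd_submit_bio() then just loops until bi_size drains. A stand-alone sketch of the clamp with made-up values:

#include <stdio.h>

#define PAGE_SIZE	4096u
#define PAGE_SECTORS	8u
#define SECTOR_SHIFT	9

int main(void)
{
	unsigned int sector = 7;	/* bio->bi_iter.bi_sector */
	unsigned int bv_len = 3072;	/* current bvec length in bytes */
	unsigned int offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
	unsigned int len = bv_len < PAGE_SIZE - offset ? bv_len : PAGE_SIZE - offset;

	/* offset 3584 into the backing page leaves 512 bytes; the rest is a new pass */
	printf("offset=%u, this pass copies %u of %u bytes\n", offset, len, bv_len);
	return 0;
}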
|
||||
|
||||
static void brd_free_one_page(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);

	__free_page(page);
}

static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
{
	sector_t aligned_sector = (sector + PAGE_SECTORS) & ~PAGE_SECTORS;
	sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
	sector_t aligned_end = round_down(
			sector + (size >> SECTOR_SHIFT), PAGE_SECTORS);
	struct page *page;

	size -= (aligned_sector - sector) * SECTOR_SIZE;
	if (aligned_end <= aligned_sector)
		return;

	xa_lock(&brd->brd_pages);
	while (size >= PAGE_SIZE && aligned_sector < rd_size * 2) {
	while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
		page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
		if (page) {
			__free_page(page);
			call_rcu(&page->rcu_head, brd_free_one_page);
			brd->brd_nr_pages--;
		}
		aligned_sector += PAGE_SECTORS;
		size -= PAGE_SIZE;
	}
	xa_unlock(&brd->brd_pages);
}
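
brd_do_discard() only frees pages that the discard covers completely: the start sector is rounded up and the end rounded down to PAGE_SECTORS. A worked example of that alignment with made-up values (4 KiB pages and 512-byte sectors assumed):

#include <stdio.h>

#define PAGE_SECTORS		8ULL
#define SECTOR_SHIFT		9
#define round_up(x, a)		((((x) + (a) - 1) / (a)) * (a))
#define round_down(x, a)	(((x) / (a)) * (a))

int main(void)
{
	unsigned long long sector = 13, size = 64 * 1024;	/* 64 KiB discard at sector 13 */
	unsigned long long start = round_up(sector, PAGE_SECTORS);
	unsigned long long end = round_down(sector + (size >> SECTOR_SHIFT), PAGE_SECTORS);

	/* a 128-sector discard at sector 13 frees the pages backing sectors [16, 136) */
	printf("free backing pages for sectors [%llu, %llu)\n", start, end);
	return 0;
}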

@@ -244,36 +179,18 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
static void brd_submit_bio(struct bio *bio)
{
	struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
	sector_t sector = bio->bi_iter.bi_sector;
	struct bio_vec bvec;
	struct bvec_iter iter;

	if (unlikely(op_is_discard(bio->bi_opf))) {
		brd_do_discard(brd, sector, bio->bi_iter.bi_size);
		brd_do_discard(brd, bio->bi_iter.bi_sector,
				bio->bi_iter.bi_size);
		bio_endio(bio);
		return;
	}

	bio_for_each_segment(bvec, bio, iter) {
		unsigned int len = bvec.bv_len;
		int err;

		/* Don't support un-aligned buffer */
		WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
				(len & (SECTOR_SIZE - 1)));

		err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
				  bio->bi_opf, sector);
		if (err) {
			if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) {
				bio_wouldblock_error(bio);
	do {
		if (!brd_rw_bvec(brd, bio))
			return;
			}
			bio_io_error(bio);
			return;
		}
		sector += len >> SECTOR_SHIFT;
	}
	} while (bio->bi_iter.bi_size);

	bio_endio(bio);
}
|
|
|
@ -725,7 +725,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
|
|||
scmd = blk_mq_rq_to_pdu(rq);
|
||||
|
||||
if (cgc->buflen) {
|
||||
ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
|
||||
ret = blk_rq_map_kern(rq, cgc->buffer, cgc->buflen,
|
||||
GFP_NOIO);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
|
|
@ -147,12 +147,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
|
|||
|
||||
bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1,
|
||||
rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
|
||||
if (bio_add_page(bio, virt_to_page(data), datalen,
|
||||
offset_in_page(data)) != datalen) {
|
||||
rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n");
|
||||
err = -EINVAL;
|
||||
goto bio_put;
|
||||
}
|
||||
bio_add_virt_nofail(bio, data, datalen);
|
||||
|
||||
bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
|
||||
if (bio_has_data(bio) &&
|
||||
|
|
|
@ -50,6 +50,8 @@
|
|||
|
||||
/* private ioctl command mirror */
|
||||
#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
|
||||
#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
|
||||
#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
|
||||
|
||||
#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
|
||||
#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
|
||||
|
@ -64,7 +66,10 @@
|
|||
| UBLK_F_CMD_IOCTL_ENCODE \
|
||||
| UBLK_F_USER_COPY \
|
||||
| UBLK_F_ZONED \
|
||||
| UBLK_F_USER_RECOVERY_FAIL_IO)
|
||||
| UBLK_F_USER_RECOVERY_FAIL_IO \
|
||||
| UBLK_F_UPDATE_SIZE \
|
||||
| UBLK_F_AUTO_BUF_REG \
|
||||
| UBLK_F_QUIESCE)
|
||||
|
||||
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
|
||||
| UBLK_F_USER_RECOVERY_REISSUE \
|
||||
|
@ -77,7 +82,11 @@
|
|||
UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
|
||||
|
||||
struct ublk_rq_data {
|
||||
struct kref ref;
|
||||
refcount_t ref;
|
||||
|
||||
/* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
|
||||
u16 buf_index;
|
||||
void *buf_ctx_handle;
|
||||
};
|
||||
|
||||
struct ublk_uring_cmd_pdu {
|
||||
|
@ -99,6 +108,9 @@ struct ublk_uring_cmd_pdu {
|
|||
* setup in ublk uring_cmd handler
|
||||
*/
|
||||
struct ublk_queue *ubq;
|
||||
|
||||
struct ublk_auto_buf_reg buf;
|
||||
|
||||
u16 tag;
|
||||
};
|
||||
|
||||
|
@ -131,6 +143,14 @@ struct ublk_uring_cmd_pdu {
|
|||
*/
|
||||
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
|
||||
|
||||
/*
|
||||
* request buffer is registered automatically, so we have to unregister it
|
||||
* before completing this request.
|
||||
*
|
||||
* io_uring will unregister buffer automatically for us during exiting.
|
||||
*/
|
||||
#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
|
||||
|
||||
/* atomic RW with ubq->cancel_lock */
|
||||
#define UBLK_IO_FLAG_CANCELED 0x80000000
|
||||
|
||||
|
@ -140,7 +160,12 @@ struct ublk_io {
|
|||
unsigned int flags;
|
||||
int res;
|
||||
|
||||
union {
|
||||
/* valid if UBLK_IO_FLAG_ACTIVE is set */
|
||||
struct io_uring_cmd *cmd;
|
||||
/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
|
||||
struct request *req;
|
||||
};
|
||||
};
|
||||
|
||||
struct ublk_queue {
|
||||
|
@ -198,13 +223,19 @@ struct ublk_params_header {
|
|||
__u32 types;
|
||||
};
|
||||
|
||||
static void ublk_io_release(void *priv);
|
||||
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
|
||||
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
|
||||
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
|
||||
const struct ublk_queue *ubq, int tag, size_t offset);
|
||||
static inline unsigned int ublk_req_build_flags(struct request *req);
|
||||
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
|
||||
int tag);
|
||||
|
||||
static inline struct ublksrv_io_desc *
|
||||
ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
|
||||
{
|
||||
return &ubq->io_cmd_buf[tag];
|
||||
}
|
||||
|
||||
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
|
||||
{
|
||||
return ub->dev_info.flags & UBLK_F_ZONED;
|
||||
|
@ -356,8 +387,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
|
|||
if (ret)
|
||||
goto free_req;
|
||||
|
||||
ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
|
||||
GFP_KERNEL);
|
||||
ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
|
||||
if (ret)
|
||||
goto erase_desc;
|
||||
|
||||
|
@ -477,7 +507,6 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
|
|||
#endif
|
||||
|
||||
static inline void __ublk_complete_rq(struct request *req);
|
||||
static void ublk_complete_rq(struct kref *ref);
|
||||
|
||||
static dev_t ublk_chr_devt;
|
||||
static const struct class ublk_chr_class = {
|
||||
|
@ -609,6 +638,11 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
|
|||
return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
|
||||
}
|
||||
|
||||
static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
|
||||
{
|
||||
return ubq->flags & UBLK_F_AUTO_BUF_REG;
|
||||
}
|
||||
|
||||
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
|
||||
{
|
||||
return ubq->flags & UBLK_F_USER_COPY;
|
||||
|
@@ -616,7 +650,8 @@ static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)

static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
{
	return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq);
	return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
	       !ublk_support_auto_buf_reg(ubq);
}

static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
@@ -627,8 +662,13 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
	 *
	 * for zero copy, request buffer need to be registered to io_uring
	 * buffer table, so reference is needed
	 *
	 * For auto buffer register, ublk server still may issue
	 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
	 * so reference is required too.
	 */
	return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq);
	return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
	       ublk_support_auto_buf_reg(ubq);
}
|
||||
|
||||
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
|
||||
|
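The two-owner lifetime implied by this comment (one reference for the in-flight request, one for the auto-registered buffer, with the request completing only once both are dropped, see refcount_set(&data->ref, 2) in ublk_auto_buf_reg() above) can be illustrated with a tiny userspace sketch. Plain C11 atomics stand in for refcount_t here; this is an illustration of the pattern, not driver code:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int ref;

static void complete_request(void)
{
	puts("request completed");
}

static void put_ref(void)
{
	if (atomic_fetch_sub(&ref, 1) == 1)	/* last reference dropped */
		complete_request();
}

int main(void)
{
	atomic_store(&ref, 2);	/* request owner + registered-buffer owner */
	put_ref();		/* COMMIT_AND_FETCH from the ublk server */
	put_ref();		/* buffer released by io_uring (ublk_io_release) */
	return 0;
}
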
@ -637,7 +677,7 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
|
|||
if (ublk_need_req_ref(ubq)) {
|
||||
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
|
||||
|
||||
kref_init(&data->ref);
|
||||
refcount_set(&data->ref, 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -647,7 +687,7 @@ static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
|
|||
if (ublk_need_req_ref(ubq)) {
|
||||
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
|
||||
|
||||
return kref_get_unless_zero(&data->ref);
|
||||
return refcount_inc_not_zero(&data->ref);
|
||||
}
|
||||
|
||||
return true;
|
||||
|
@ -659,7 +699,8 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
|
|||
if (ublk_need_req_ref(ubq)) {
|
||||
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
|
||||
|
||||
kref_put(&data->ref, ublk_complete_rq);
|
||||
if (refcount_dec_and_test(&data->ref))
|
||||
__ublk_complete_rq(req);
|
||||
} else {
|
||||
__ublk_complete_rq(req);
|
||||
}
|
||||
|
@ -695,12 +736,6 @@ static inline bool ublk_rq_has_data(const struct request *rq)
|
|||
return bio_has_data(rq->bio);
|
||||
}
|
||||
|
||||
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
|
||||
int tag)
|
||||
{
|
||||
return &ubq->io_cmd_buf[tag];
|
||||
}
|
||||
|
||||
static inline struct ublksrv_io_desc *
|
||||
ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
|
||||
{
|
||||
|
@ -1117,18 +1152,12 @@ exit:
|
|||
blk_mq_end_request(req, res);
|
||||
}
|
||||
|
||||
static void ublk_complete_rq(struct kref *ref)
|
||||
static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
|
||||
int res, unsigned issue_flags)
|
||||
{
|
||||
struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
|
||||
ref);
|
||||
struct request *req = blk_mq_rq_from_pdu(data);
|
||||
/* read cmd first because req will overwrite it */
|
||||
struct io_uring_cmd *cmd = io->cmd;
|
||||
|
||||
__ublk_complete_rq(req);
|
||||
}
|
||||
|
||||
static void ubq_complete_io_cmd(struct ublk_io *io, int res,
|
||||
unsigned issue_flags)
|
||||
{
|
||||
/* mark this cmd owned by ublksrv */
|
||||
io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
|
||||
|
||||
|
@ -1138,8 +1167,10 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res,
|
|||
*/
|
||||
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
|
||||
|
||||
io->req = req;
|
||||
|
||||
/* tell ublksrv one io request is coming */
|
||||
io_uring_cmd_done(io->cmd, res, 0, issue_flags);
|
||||
io_uring_cmd_done(cmd, res, 0, issue_flags);
|
||||
}
|
||||
|
||||
#define UBLK_REQUEUE_DELAY_MS 3
|
||||
|
@ -1154,16 +1185,91 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
|
|||
blk_mq_end_request(rq, BLK_STS_IOERR);
|
||||
}
|
||||
|
||||
static void ublk_auto_buf_reg_fallback(struct request *req)
|
||||
{
|
||||
const struct ublk_queue *ubq = req->mq_hctx->driver_data;
|
||||
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
|
||||
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
|
||||
|
||||
iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
|
||||
refcount_set(&data->ref, 1);
|
||||
}
|
||||
|
||||
static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(io->cmd);
|
||||
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
|
||||
int ret;
|
||||
|
||||
ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
|
||||
pdu->buf.index, issue_flags);
|
||||
if (ret) {
|
||||
if (pdu->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
|
||||
ublk_auto_buf_reg_fallback(req);
|
||||
return true;
|
||||
}
|
||||
blk_mq_end_request(req, BLK_STS_IOERR);
|
||||
return false;
|
||||
}
|
||||
/* one extra reference is dropped by ublk_io_release */
|
||||
refcount_set(&data->ref, 2);
|
||||
|
||||
data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
|
||||
/* store buffer index in request payload */
|
||||
data->buf_index = pdu->buf.index;
|
||||
io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
|
||||
struct request *req, struct ublk_io *io,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
|
||||
return ublk_auto_buf_reg(req, io, issue_flags);
|
||||
|
||||
ublk_init_req_ref(ubq, req);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
|
||||
struct ublk_io *io)
|
||||
{
|
||||
unsigned mapped_bytes = ublk_map_io(ubq, req, io);
|
||||
|
||||
/* partially mapped, update io descriptor */
|
||||
if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
|
||||
/*
|
||||
* Nothing mapped, retry until we succeed.
|
||||
*
|
||||
* We may never succeed in mapping any bytes here because
|
||||
* of OOM. TODO: reserve one buffer with single page pinned
|
||||
* for providing forward progress guarantee.
|
||||
*/
|
||||
if (unlikely(!mapped_bytes)) {
|
||||
blk_mq_requeue_request(req, false);
|
||||
blk_mq_delay_kick_requeue_list(req->q,
|
||||
UBLK_REQUEUE_DELAY_MS);
|
||||
return false;
|
||||
}
|
||||
|
||||
ublk_get_iod(ubq, req->tag)->nr_sectors =
|
||||
mapped_bytes >> 9;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void ublk_dispatch_req(struct ublk_queue *ubq,
|
||||
struct request *req,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
int tag = req->tag;
|
||||
struct ublk_io *io = &ubq->ios[tag];
|
||||
unsigned int mapped_bytes;
|
||||
|
||||
pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
|
||||
__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
|
||||
pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
|
||||
__func__, ubq->q_id, req->tag, io->flags,
|
||||
ublk_get_iod(ubq, req->tag)->addr);
|
||||
|
||||
/*
|
||||
|
@ -1183,54 +1289,22 @@ static void ublk_dispatch_req(struct ublk_queue *ubq,
|
|||
if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
|
||||
/*
|
||||
* We have not handled UBLK_IO_NEED_GET_DATA command yet,
|
||||
* so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
|
||||
* so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
|
||||
* and notify it.
|
||||
*/
|
||||
if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
|
||||
io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
|
||||
pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
|
||||
__func__, io->cmd->cmd_op, ubq->q_id,
|
||||
req->tag, io->flags);
|
||||
ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
|
||||
return;
|
||||
}
|
||||
/*
|
||||
* We have handled UBLK_IO_NEED_GET_DATA command,
|
||||
* so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
|
||||
* do the copy work.
|
||||
*/
|
||||
io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
|
||||
/* update iod->addr because ublksrv may have passed a new io buffer */
|
||||
ublk_get_iod(ubq, req->tag)->addr = io->addr;
|
||||
pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
|
||||
__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
|
||||
ublk_get_iod(ubq, req->tag)->addr);
|
||||
}
|
||||
|
||||
mapped_bytes = ublk_map_io(ubq, req, io);
|
||||
|
||||
/* partially mapped, update io descriptor */
|
||||
if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
|
||||
/*
|
||||
* Nothing mapped, retry until we succeed.
|
||||
*
|
||||
* We may never succeed in mapping any bytes here because
|
||||
* of OOM. TODO: reserve one buffer with single page pinned
|
||||
* for providing forward progress guarantee.
|
||||
*/
|
||||
if (unlikely(!mapped_bytes)) {
|
||||
blk_mq_requeue_request(req, false);
|
||||
blk_mq_delay_kick_requeue_list(req->q,
|
||||
UBLK_REQUEUE_DELAY_MS);
|
||||
pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
|
||||
__func__, ubq->q_id, req->tag, io->flags);
|
||||
ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
|
||||
issue_flags);
|
||||
return;
|
||||
}
|
||||
|
||||
ublk_get_iod(ubq, req->tag)->nr_sectors =
|
||||
mapped_bytes >> 9;
|
||||
}
|
||||
if (!ublk_start_io(ubq, req, io))
|
||||
return;
|
||||
|
||||
ublk_init_req_ref(ubq, req);
|
||||
ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
|
||||
if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
|
||||
ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
|
||||
}
|
||||
|
||||
static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
|
||||
|
@ -1590,30 +1664,6 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
|
|||
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
|
||||
}
|
||||
|
||||
static void ublk_commit_completion(struct ublk_device *ub,
|
||||
const struct ublksrv_io_cmd *ub_cmd)
|
||||
{
|
||||
u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
|
||||
struct ublk_queue *ubq = ublk_get_queue(ub, qid);
|
||||
struct ublk_io *io = &ubq->ios[tag];
|
||||
struct request *req;
|
||||
|
||||
/* now this cmd slot is owned by nbd driver */
|
||||
io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
|
||||
io->res = ub_cmd->result;
|
||||
|
||||
/* find the io request and complete */
|
||||
req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
|
||||
if (WARN_ON_ONCE(unlikely(!req)))
|
||||
return;
|
||||
|
||||
if (req_op(req) == REQ_OP_ZONE_APPEND)
|
||||
req->__sector = ub_cmd->zone_append_lba;
|
||||
|
||||
if (likely(!blk_should_fake_timeout(req->q)))
|
||||
ublk_put_req_ref(ubq, req);
|
||||
}
|
||||
|
||||
static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
|
||||
struct request *req)
|
||||
{
|
||||
|
@ -1642,17 +1692,8 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
|
|||
for (i = 0; i < ubq->q_depth; i++) {
|
||||
struct ublk_io *io = &ubq->ios[i];
|
||||
|
||||
if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
|
||||
struct request *rq;
|
||||
|
||||
/*
|
||||
* Either we fail the request or ublk_rq_task_work_cb
|
||||
* will do it
|
||||
*/
|
||||
rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
|
||||
if (rq && blk_mq_request_started(rq))
|
||||
__ublk_fail_req(ubq, io, rq);
|
||||
}
|
||||
if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
|
||||
__ublk_fail_req(ubq, io, io->req);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1940,6 +1981,20 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
|
|||
io_uring_cmd_mark_cancelable(cmd, issue_flags);
|
||||
}
|
||||
|
||||
static inline int ublk_set_auto_buf_reg(struct io_uring_cmd *cmd)
|
||||
{
|
||||
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
|
||||
|
||||
pdu->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
|
||||
|
||||
if (pdu->buf.reserved0 || pdu->buf.reserved1)
|
||||
return -EINVAL;
|
||||
|
||||
if (pdu->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ublk_io_release(void *priv)
|
||||
{
|
||||
struct request *rq = priv;
|
||||
|
@ -1953,16 +2008,12 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
|
|||
unsigned int index, unsigned int issue_flags)
|
||||
{
|
||||
struct ublk_device *ub = cmd->file->private_data;
|
||||
const struct ublk_io *io = &ubq->ios[tag];
|
||||
struct request *req;
|
||||
int ret;
|
||||
|
||||
if (!ublk_support_zero_copy(ubq))
|
||||
return -EINVAL;
|
||||
|
||||
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
|
||||
return -EINVAL;
|
||||
|
||||
req = __ublk_check_and_get_req(ub, ubq, tag, 0);
|
||||
if (!req)
|
||||
return -EINVAL;
|
||||
|
@ -1978,17 +2029,12 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
|
|||
}
|
||||
|
||||
static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
|
||||
const struct ublk_queue *ubq, unsigned int tag,
|
||||
const struct ublk_queue *ubq,
|
||||
unsigned int index, unsigned int issue_flags)
|
||||
{
|
||||
const struct ublk_io *io = &ubq->ios[tag];
|
||||
|
||||
if (!ublk_support_zero_copy(ubq))
|
||||
return -EINVAL;
|
||||
|
||||
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
|
||||
return -EINVAL;
|
||||
|
||||
return io_buffer_unregister_bvec(cmd, index, issue_flags);
|
||||
}
|
||||
|
||||
|
@ -2031,6 +2077,12 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
|
|||
goto out;
|
||||
}
|
||||
|
||||
if (ublk_support_auto_buf_reg(ubq)) {
|
||||
ret = ublk_set_auto_buf_reg(cmd);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
ublk_fill_io_cmd(io, cmd, buf_addr);
|
||||
ublk_mark_io_ready(ub, ubq);
|
||||
out:
|
||||
|
@ -2038,6 +2090,90 @@ out:
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int ublk_commit_and_fetch(const struct ublk_queue *ubq,
|
||||
struct ublk_io *io, struct io_uring_cmd *cmd,
|
||||
const struct ublksrv_io_cmd *ub_cmd,
|
||||
unsigned int issue_flags)
|
||||
{
|
||||
struct request *req = io->req;
|
||||
|
||||
if (ublk_need_map_io(ubq)) {
|
||||
/*
|
||||
* COMMIT_AND_FETCH_REQ has to provide IO buffer if
|
||||
* NEED GET DATA is not enabled or it is Read IO.
|
||||
*/
|
||||
if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
|
||||
req_op(req) == REQ_OP_READ))
|
||||
return -EINVAL;
|
||||
} else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
|
||||
/*
|
||||
* User copy requires addr to be unset when command is
|
||||
* not zone append
|
||||
*/
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
	if (ublk_support_auto_buf_reg(ubq)) {
		int ret;

		/*
		 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
		 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same
		 * `io_ring_ctx`.
		 *
		 * If this uring_cmd's io_ring_ctx isn't same with the
		 * one for registering the buffer, it is ublk server's
		 * responsibility for unregistering the buffer, otherwise
		 * this ublk request gets stuck.
		 */
		if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
			struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);

			if (data->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
				io_buffer_unregister_bvec(cmd, data->buf_index,
						issue_flags);
			io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
		}

		ret = ublk_set_auto_buf_reg(cmd);
		if (ret)
			return ret;
	}

ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
|
||||
|
||||
/* now this cmd slot is owned by ublk driver */
|
||||
io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
|
||||
io->res = ub_cmd->result;
|
||||
|
||||
if (req_op(req) == REQ_OP_ZONE_APPEND)
|
||||
req->__sector = ub_cmd->zone_append_lba;
|
||||
|
||||
if (likely(!blk_should_fake_timeout(req->q)))
|
||||
ublk_put_req_ref(ubq, req);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io)
|
||||
{
|
||||
struct request *req = io->req;
|
||||
|
||||
/*
|
||||
* We have handled UBLK_IO_NEED_GET_DATA command,
|
||||
* so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
|
||||
* do the copy work.
|
||||
*/
|
||||
io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
|
||||
/* update iod->addr because ublksrv may have passed a new io buffer */
|
||||
ublk_get_iod(ubq, req->tag)->addr = io->addr;
|
||||
pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
|
||||
__func__, ubq->q_id, req->tag, io->flags,
|
||||
ublk_get_iod(ubq, req->tag)->addr);
|
||||
|
||||
return ublk_start_io(ubq, req, io);
|
||||
}
|
||||
|
||||
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
|
||||
unsigned int issue_flags,
|
||||
const struct ublksrv_io_cmd *ub_cmd)
|
||||
|
@ -2048,7 +2184,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
|
|||
u32 cmd_op = cmd->cmd_op;
|
||||
unsigned tag = ub_cmd->tag;
|
||||
int ret = -EINVAL;
|
||||
struct request *req;
|
||||
|
||||
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
|
||||
__func__, cmd->cmd_op, ub_cmd->q_id, tag,
|
||||
|
@ -2058,9 +2193,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
|
|||
goto out;
|
||||
|
||||
ubq = ublk_get_queue(ub, ub_cmd->q_id);
|
||||
if (!ubq || ub_cmd->q_id != ubq->q_id)
|
||||
goto out;
|
||||
|
||||
if (ubq->ubq_daemon && ubq->ubq_daemon != current)
|
||||
goto out;
|
||||
|
||||
|
@ -2075,6 +2207,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
|
|||
goto out;
|
||||
}
|
||||
|
||||
/* only UBLK_IO_FETCH_REQ is allowed if io is not OWNED_BY_SRV */
|
||||
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) &&
|
||||
_IOC_NR(cmd_op) != UBLK_IO_FETCH_REQ)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* ensure that the user issues UBLK_IO_NEED_GET_DATA
|
||||
* iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
|
||||
|
@ -2092,45 +2229,23 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
|
|||
case UBLK_IO_REGISTER_IO_BUF:
|
||||
return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
|
||||
case UBLK_IO_UNREGISTER_IO_BUF:
|
||||
return ublk_unregister_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
|
||||
return ublk_unregister_io_buf(cmd, ubq, ub_cmd->addr, issue_flags);
|
||||
case UBLK_IO_FETCH_REQ:
|
||||
ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
|
||||
if (ret)
|
||||
goto out;
|
||||
break;
|
||||
case UBLK_IO_COMMIT_AND_FETCH_REQ:
|
||||
req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
|
||||
|
||||
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
|
||||
ret = ublk_commit_and_fetch(ubq, io, cmd, ub_cmd, issue_flags);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (ublk_need_map_io(ubq)) {
|
||||
/*
|
||||
* COMMIT_AND_FETCH_REQ has to provide IO buffer if
|
||||
* NEED GET DATA is not enabled or it is Read IO.
|
||||
*/
|
||||
if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
|
||||
req_op(req) == REQ_OP_READ))
|
||||
goto out;
|
||||
} else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
|
||||
/*
|
||||
* User copy requires addr to be unset when command is
|
||||
* not zone append
|
||||
*/
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
|
||||
ublk_commit_completion(ub, ub_cmd);
|
||||
break;
|
||||
case UBLK_IO_NEED_GET_DATA:
|
||||
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
|
||||
goto out;
|
||||
ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
|
||||
req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
|
||||
ublk_dispatch_req(ubq, req, issue_flags);
|
||||
io->addr = ub_cmd->addr;
|
||||
if (!ublk_get_data(ubq, io))
|
||||
return -EIOCBQUEUED;
|
||||
|
||||
return UBLK_IO_RES_OK;
|
||||
default:
|
||||
goto out;
|
||||
}
|
||||
|
@ -2728,6 +2843,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
|
||||
pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* unprivileged device can't be trusted, but RECOVERY and
|
||||
* RECOVERY_REISSUE still may hang error handling, so can't
|
||||
|
@ -2744,8 +2864,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
|
|||
* For USER_COPY, we depends on userspace to fill request
|
||||
* buffer by pwrite() to ublk char device, which can't be
|
||||
* used for unprivileged device
|
||||
*
|
||||
* Same with zero copy or auto buffer register.
|
||||
*/
|
||||
if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
|
||||
if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
|
||||
UBLK_F_AUTO_BUF_REG))
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
@ -2803,7 +2926,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
|
|||
UBLK_F_URING_CMD_COMP_IN_TASK;
|
||||
|
||||
/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
|
||||
if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
|
||||
if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
|
||||
UBLK_F_AUTO_BUF_REG))
|
||||
ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
|
||||
|
||||
/*
|
||||
|
@ -3106,6 +3230,127 @@ static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
|
||||
{
|
||||
struct ublk_param_basic *p = &ub->params.basic;
|
||||
u64 new_size = header->data[0];
|
||||
|
||||
mutex_lock(&ub->mutex);
|
||||
p->dev_sectors = new_size;
|
||||
set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
|
||||
mutex_unlock(&ub->mutex);
|
||||
}
|
||||
|
||||
struct count_busy {
|
||||
const struct ublk_queue *ubq;
|
||||
unsigned int nr_busy;
|
||||
};
|
||||
|
||||
static bool ublk_count_busy_req(struct request *rq, void *data)
|
||||
{
|
||||
struct count_busy *idle = data;
|
||||
|
||||
if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
|
||||
idle->nr_busy += 1;
|
||||
return true;
|
||||
}
|
||||
|
||||
/* uring_cmd is guaranteed to be active if the associated request is idle */
|
||||
static bool ubq_has_idle_io(const struct ublk_queue *ubq)
|
||||
{
|
||||
struct count_busy data = {
|
||||
.ubq = ubq,
|
||||
};
|
||||
|
||||
blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
|
||||
return data.nr_busy < ubq->q_depth;
|
||||
}
|
||||
|
||||
/* Wait until each hw queue has at least one idle IO */
|
||||
static int ublk_wait_for_idle_io(struct ublk_device *ub,
|
||||
unsigned int timeout_ms)
|
||||
{
|
||||
unsigned int elapsed = 0;
|
||||
int ret;
|
||||
|
||||
while (elapsed < timeout_ms && !signal_pending(current)) {
|
||||
unsigned int queues_cancelable = 0;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
|
||||
struct ublk_queue *ubq = ublk_get_queue(ub, i);
|
||||
|
||||
queues_cancelable += !!ubq_has_idle_io(ubq);
|
||||
}
|
||||
|
||||
/*
|
||||
* Each queue needs at least one active command for
|
||||
* notifying ublk server
|
||||
*/
|
||||
if (queues_cancelable == ub->dev_info.nr_hw_queues)
|
||||
break;
|
||||
|
||||
msleep(UBLK_REQUEUE_DELAY_MS);
|
||||
elapsed += UBLK_REQUEUE_DELAY_MS;
|
||||
}
|
||||
|
||||
if (signal_pending(current))
|
||||
ret = -EINTR;
|
||||
else if (elapsed >= timeout_ms)
|
||||
ret = -EBUSY;
|
||||
else
|
||||
ret = 0;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
|
||||
const struct ublksrv_ctrl_cmd *header)
|
||||
{
|
||||
/* zero means wait forever */
|
||||
u64 timeout_ms = header->data[0];
|
||||
struct gendisk *disk;
|
||||
int i, ret = -ENODEV;
|
||||
|
||||
if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
mutex_lock(&ub->mutex);
|
||||
disk = ublk_get_disk(ub);
|
||||
if (!disk)
|
||||
goto unlock;
|
||||
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
|
||||
goto put_disk;
|
||||
|
||||
ret = 0;
|
||||
/* already in expected state */
|
||||
if (ub->dev_info.state != UBLK_S_DEV_LIVE)
|
||||
goto put_disk;
|
||||
|
||||
/* Mark all queues as canceling */
|
||||
blk_mq_quiesce_queue(disk->queue);
|
||||
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
|
||||
struct ublk_queue *ubq = ublk_get_queue(ub, i);
|
||||
|
||||
ubq->canceling = true;
|
||||
}
|
||||
blk_mq_unquiesce_queue(disk->queue);
|
||||
|
||||
if (!timeout_ms)
|
||||
timeout_ms = UINT_MAX;
|
||||
ret = ublk_wait_for_idle_io(ub, timeout_ms);
|
||||
|
||||
put_disk:
|
||||
ublk_put_disk(disk);
|
||||
unlock:
|
||||
mutex_unlock(&ub->mutex);
|
||||
|
||||
/* Cancel pending uring_cmd */
|
||||
if (!ret)
|
||||
ublk_cancel_dev(ub);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* All control commands are sent via /dev/ublk-control, so we have to check
|
||||
* the destination device's permission
|
||||
|
@ -3191,6 +3436,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
|
|||
case UBLK_CMD_SET_PARAMS:
|
||||
case UBLK_CMD_START_USER_RECOVERY:
|
||||
case UBLK_CMD_END_USER_RECOVERY:
|
||||
case UBLK_CMD_UPDATE_SIZE:
|
||||
case UBLK_CMD_QUIESCE_DEV:
|
||||
mask = MAY_READ | MAY_WRITE;
|
||||
break;
|
||||
default:
|
||||
|
@ -3282,6 +3529,13 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
|
|||
case UBLK_CMD_END_USER_RECOVERY:
|
||||
ret = ublk_ctrl_end_recovery(ub, header);
|
||||
break;
|
||||
case UBLK_CMD_UPDATE_SIZE:
|
||||
ublk_ctrl_set_size(ub, header);
|
||||
ret = 0;
|
||||
break;
|
||||
case UBLK_CMD_QUIESCE_DEV:
|
||||
ret = ublk_ctrl_quiesce_dev(ub, header);
|
||||
break;
|
||||
default:
|
||||
ret = -EOPNOTSUPP;
|
||||
break;
|
||||
|
@ -3315,6 +3569,7 @@ static int __init ublk_init(void)
|
|||
|
||||
BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
|
||||
UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
|
||||
BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
|
||||
|
||||
init_waitqueue_head(&ublk_idr_wq);
|
||||
|
||||
|
|
|
@ -571,7 +571,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk,
|
|||
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT);
|
||||
vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector);
|
||||
|
||||
err = blk_rq_map_kern(q, req, report_buf, report_len, GFP_KERNEL);
|
||||
err = blk_rq_map_kern(req, report_buf, report_len, GFP_KERNEL);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
|
@ -817,7 +817,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
|
|||
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
|
||||
vbr->out_hdr.sector = 0;
|
||||
|
||||
err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
|
||||
err = blk_rq_map_kern(req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
|
|
drivers/block/zloop.c: new file, 1385 lines (diff suppressed because it is too large)
|
@ -3677,7 +3677,6 @@ static void cdrom_sysctl_register(void)
|
|||
|
||||
static void cdrom_sysctl_unregister(void)
|
||||
{
|
||||
if (cdrom_sysctl_header)
|
||||
unregister_sysctl_table(cdrom_sysctl_header);
|
||||
}
|
||||
|
||||
|
|
|
@ -293,8 +293,7 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
|
|||
|
||||
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
|
||||
bio->bi_iter.bi_sector = SB_SECTOR;
|
||||
__bio_add_page(bio, virt_to_page(out), SB_SIZE,
|
||||
offset_in_page(out));
|
||||
bio_add_virt_nofail(bio, out, SB_SIZE);
|
||||
|
||||
out->offset = cpu_to_le64(sb->offset);
|
||||
|
||||
|
|
|
@ -1364,7 +1364,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
|
|||
ptr = (char *)b->data + offset;
|
||||
len = n_sectors << SECTOR_SHIFT;
|
||||
|
||||
__bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
|
||||
bio_add_virt_nofail(bio, ptr, len);
|
||||
|
||||
submit_bio(bio);
|
||||
}
|
||||
|
|
|
@ -2557,14 +2557,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
|
|||
char *mem;
|
||||
|
||||
outgoing_bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recheck_bios);
|
||||
|
||||
r = bio_add_page(outgoing_bio, virt_to_page(outgoing_data), ic->sectors_per_block << SECTOR_SHIFT, 0);
|
||||
if (unlikely(r != (ic->sectors_per_block << SECTOR_SHIFT))) {
|
||||
bio_put(outgoing_bio);
|
||||
bio->bi_status = BLK_STS_RESOURCE;
|
||||
bio_endio(bio);
|
||||
return;
|
||||
}
|
||||
bio_add_virt_nofail(outgoing_bio, outgoing_data,
|
||||
ic->sectors_per_block << SECTOR_SHIFT);
|
||||
|
||||
bip = bio_integrity_alloc(outgoing_bio, GFP_NOIO, 1);
|
||||
if (IS_ERR(bip)) {
|
||||
|
@ -3211,7 +3205,8 @@ next_chunk:
|
|||
|
||||
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recalc_bios);
|
||||
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
|
||||
__bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
|
||||
bio_add_virt_nofail(bio, recalc_buffer,
|
||||
range.n_sectors << SECTOR_SHIFT);
|
||||
r = submit_bio_wait(bio);
|
||||
bio_put(bio);
|
||||
if (unlikely(r)) {
|
||||
|
@ -3228,7 +3223,8 @@ next_chunk:
|
|||
|
||||
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_WRITE, GFP_NOIO, &ic->recalc_bios);
|
||||
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
|
||||
__bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
|
||||
bio_add_virt_nofail(bio, recalc_buffer,
|
||||
range.n_sectors << SECTOR_SHIFT);
|
||||
|
||||
bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
|
||||
if (unlikely(IS_ERR(bip))) {
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#include "raid5.h"
|
||||
#include "raid10.h"
|
||||
#include "md-bitmap.h"
|
||||
#include "dm-core.h"
|
||||
|
||||
#include <linux/device-mapper.h>
|
||||
|
||||
|
@ -3308,6 +3309,7 @@ size_check:
|
|||
|
||||
/* Disable/enable discard support on raid set. */
|
||||
configure_discard_support(rs);
|
||||
rs->md.dm_gendisk = ti->table->md->disk;
|
||||
|
||||
mddev_unlock(&rs->md);
|
||||
return 0;
|
||||
|
@ -3327,6 +3329,7 @@ static void raid_dtr(struct dm_target *ti)
|
|||
|
||||
mddev_lock_nointr(&rs->md);
|
||||
md_stop(&rs->md);
|
||||
rs->md.dm_gendisk = NULL;
|
||||
mddev_unlock(&rs->md);
|
||||
|
||||
if (work_pending(&rs->md.event_work))
|
||||
|
|
drivers/md/md.c (182 lines changed)
|
@@ -111,32 +111,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
 * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
 * does not show up that much. Increase it if you want to have more guaranteed
 * speed. Note that the RAID driver will use the maximum bandwidth
 * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 * Background sync IO speed control:
 *
 * - below speed min:
 *   no limit;
 * - above speed min and below speed max:
 *   a) if mddev is idle, then no limit;
 *   b) if mddev is busy handling normal IO, then limit inflight sync IO
 *      to sync_io_depth;
 * - above speed max:
 *   sync IO can't be issued;
 *
 * Following configurations can be changed via /proc/sys/dev/raid/ for system
 * or /sys/block/mdX/md/ for one array.
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
static int sysctl_sync_io_depth = 32;

static int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
static int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static int sync_io_depth(struct mddev *mddev)
{
	return mddev->sync_io_depth ?
		mddev->sync_io_depth : sysctl_sync_io_depth;
}

static void rdev_uninit_serial(struct md_rdev *rdev)
{
	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))

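For readers skimming the diff, the three bands described in the comment above can be summarized in a small standalone sketch. This is an illustration only, not the md.c code: the kernel's checks live in md_do_sync() (see the hunk at -9195 further below) and use the speed_min()/speed_max()/sync_io_depth() helpers together with is_mddev_idle() and sync_io_within_limit().

#include <stdbool.h>

/*
 * Illustrative sketch of how the new sync_io_depth limit slots into the
 * existing speed_limit_min/speed_limit_max bands. All inputs are plain
 * values here; in md.c they come from per-array overrides that fall
 * back to the sysctls.
 */
static bool throttle_sync_io(int curr_speed_kbps, int speed_min_kbps,
			     int speed_max_kbps, bool array_busy,
			     int inflight_sync_ios, int sync_io_depth)
{
	if (curr_speed_kbps <= speed_min_kbps)
		return false;			/* below min: no limit */
	if (curr_speed_kbps > speed_max_kbps)
		return true;			/* above max: sync IO held back */
	/* between min and max: throttle only while normal IO is active and
	 * the inflight sync IO has already reached sync_io_depth */
	return array_busy && inflight_sync_ios >= sync_io_depth;
}
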
@ -293,14 +309,21 @@ static const struct ctl_table raid_table[] = {
|
|||
.procname = "speed_limit_min",
|
||||
.data = &sysctl_speed_limit_min,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = S_IRUGO|S_IWUSR,
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "speed_limit_max",
|
||||
.data = &sysctl_speed_limit_max,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = S_IRUGO|S_IWUSR,
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "sync_io_depth",
|
||||
.data = &sysctl_sync_io_depth,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
};
|
||||
|
@ -5145,6 +5168,35 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
|
|||
static struct md_sysfs_entry md_sync_max =
|
||||
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
|
||||
|
||||
static ssize_t
|
||||
sync_io_depth_show(struct mddev *mddev, char *page)
|
||||
{
|
||||
return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
|
||||
mddev->sync_io_depth ? "local" : "system");
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
{
|
||||
unsigned int max;
|
||||
int rv;
|
||||
|
||||
if (strncmp(buf, "system", 6) == 0) {
|
||||
max = 0;
|
||||
} else {
|
||||
rv = kstrtouint(buf, 10, &max);
|
||||
if (rv < 0)
|
||||
return rv;
|
||||
if (max == 0)
|
||||
return -EINVAL;
|
||||
}
|
||||
mddev->sync_io_depth = max;
|
||||
return len;
|
||||
}
|
||||
|
||||
static struct md_sysfs_entry md_sync_io_depth =
|
||||
__ATTR_RW(sync_io_depth);
|
||||
|
||||
static ssize_t
|
||||
degraded_show(struct mddev *mddev, char *page)
|
||||
{
|
||||
|
@ -5671,6 +5723,7 @@ static struct attribute *md_redundancy_attrs[] = {
|
|||
&md_mismatches.attr,
|
||||
&md_sync_min.attr,
|
||||
&md_sync_max.attr,
|
||||
&md_sync_io_depth.attr,
|
||||
&md_sync_speed.attr,
|
||||
&md_sync_force_parallel.attr,
|
||||
&md_sync_completed.attr,
|
||||
|
@ -8572,50 +8625,55 @@ void md_cluster_stop(struct mddev *mddev)
|
|||
put_cluster_ops(mddev);
|
||||
}
|
||||
|
||||
static int is_mddev_idle(struct mddev *mddev, int init)
|
||||
static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
|
||||
{
|
||||
struct md_rdev *rdev;
|
||||
int idle;
|
||||
int curr_events;
|
||||
unsigned long last_events = rdev->last_events;
|
||||
|
||||
idle = 1;
|
||||
rcu_read_lock();
|
||||
rdev_for_each_rcu(rdev, mddev) {
|
||||
struct gendisk *disk = rdev->bdev->bd_disk;
|
||||
if (!bdev_is_partition(rdev->bdev))
|
||||
return true;
|
||||
|
||||
if (!init && !blk_queue_io_stat(disk->queue))
|
||||
continue;
|
||||
|
||||
curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
|
||||
atomic_read(&disk->sync_io);
|
||||
/* sync IO will cause sync_io to increase before the disk_stats
|
||||
* as sync_io is counted when a request starts, and
|
||||
* disk_stats is counted when it completes.
|
||||
* So resync activity will cause curr_events to be smaller than
|
||||
* when there was no such activity.
|
||||
* non-sync IO will cause disk_stat to increase without
|
||||
* increasing sync_io so curr_events will (eventually)
|
||||
* be larger than it was before. Once it becomes
|
||||
* substantially larger, the test below will cause
|
||||
* the array to appear non-idle, and resync will slow
|
||||
* down.
|
||||
* If there is a lot of outstanding resync activity when
|
||||
* we set last_event to curr_events, then all that activity
|
||||
* completing might cause the array to appear non-idle
|
||||
* and resync will be slowed down even though there might
|
||||
* not have been non-resync activity. This will only
|
||||
* happen once though. 'last_events' will soon reflect
|
||||
* the state where there is little or no outstanding
|
||||
* resync requests, and further resync activity will
|
||||
* always make curr_events less than last_events.
|
||||
*
|
||||
/*
|
||||
* If rdev is partition, and user doesn't issue IO to the array, the
|
||||
* array is still not idle if user issues IO to other partitions.
|
||||
*/
|
||||
if (init || curr_events - rdev->last_events > 64) {
|
||||
rdev->last_events = curr_events;
|
||||
idle = 0;
|
||||
}
|
||||
rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
|
||||
sectors) -
|
||||
part_stat_read_accum(rdev->bdev, sectors);
|
||||
|
||||
return init || rdev->last_events <= last_events;
|
||||
}
|
||||
|
||||
/*
|
||||
* mddev is idle if following conditions are matched since last check:
|
||||
* 1) mddev doesn't have normal IO completed;
|
||||
* 2) mddev doesn't have inflight normal IO;
|
||||
* 3) if any member disk is partition, and other partitions don't have IO
|
||||
* completed;
|
||||
*
|
||||
* Noted this checking rely on IO accounting is enabled.
|
||||
*/
|
||||
static bool is_mddev_idle(struct mddev *mddev, int init)
|
||||
{
|
||||
unsigned long last_events = mddev->normal_io_events;
|
||||
struct gendisk *disk;
|
||||
struct md_rdev *rdev;
|
||||
bool idle = true;
|
||||
|
||||
disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
|
||||
if (!disk)
|
||||
return true;
|
||||
|
||||
mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors);
|
||||
if (!init && (mddev->normal_io_events > last_events ||
|
||||
bdev_count_inflight(disk->part0)))
|
||||
idle = false;
|
||||
|
||||
rcu_read_lock();
|
||||
rdev_for_each_rcu(rdev, mddev)
|
||||
if (!is_rdev_holder_idle(rdev, init))
|
||||
idle = false;
|
||||
rcu_read_unlock();
|
||||
|
||||
return idle;
|
||||
}
|
||||
|
||||
|
@@ -8927,6 +8985,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
	}
}

static bool sync_io_within_limit(struct mddev *mddev)
{
	int io_sectors;

	/*
	 * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
	 * RESYNC_PAGES(64k) per IO.
	 */
	if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
		io_sectors = 8;
	else
		io_sectors = 128;

	return atomic_read(&mddev->recovery_active) <
		io_sectors * sync_io_depth(mddev);
}

#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
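With the defaults added here (sysctl_sync_io_depth = 32), the cap works out to 32 x 8 sectors = 256 sectors (128 KiB) of inflight sync IO for raid4/5/6 and 32 x 128 sectors = 4096 sectors (2 MiB) for the other levels; raising /proc/sys/dev/raid/sync_io_depth system-wide, or /sys/block/mdX/md/sync_io_depth per array, scales that cap linearly.
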
@@ -9195,7 +9270,8 @@ void md_do_sync(struct md_thread *thread)
			msleep(500);
			goto repeat;
		}
		if (!is_mddev_idle(mddev, 0)) {
		if (!sync_io_within_limit(mddev) &&
		    !is_mddev_idle(mddev, 0)) {
			/*
			 * Give other IO more of a chance.
			 * The faster the devices, the less we wait.
|
|
|
@ -132,7 +132,7 @@ struct md_rdev {
|
|||
|
||||
sector_t sectors; /* Device size (in 512bytes sectors) */
|
||||
struct mddev *mddev; /* RAID array if running */
|
||||
int last_events; /* IO event timestamp */
|
||||
unsigned long last_events; /* IO event timestamp */
|
||||
|
||||
/*
|
||||
* If meta_bdev is non-NULL, it means that a separate device is
|
||||
|
@ -404,7 +404,8 @@ struct mddev {
|
|||
* are happening, so run/
|
||||
* takeover/stop are not safe
|
||||
*/
|
||||
struct gendisk *gendisk;
|
||||
struct gendisk *gendisk; /* mdraid gendisk */
|
||||
struct gendisk *dm_gendisk; /* dm-raid gendisk */
|
||||
|
||||
struct kobject kobj;
|
||||
int hold_active;
|
||||
|
@ -483,6 +484,7 @@ struct mddev {
|
|||
/* if zero, use the system-wide default */
|
||||
int sync_speed_min;
|
||||
int sync_speed_max;
|
||||
int sync_io_depth;
|
||||
|
||||
/* resync even though the same disks are shared among md-devices */
|
||||
int parallel_resync;
|
||||
|
@ -518,6 +520,7 @@ struct mddev {
|
|||
* adding a spare
|
||||
*/
|
||||
|
||||
unsigned long normal_io_events; /* IO event timestamp */
|
||||
atomic_t recovery_active; /* blocks scheduled, but not written */
|
||||
wait_queue_head_t recovery_wait;
|
||||
sector_t recovery_cp;
|
||||
|
@ -714,17 +717,6 @@ static inline int mddev_trylock(struct mddev *mddev)
|
|||
}
|
||||
extern void mddev_unlock(struct mddev *mddev);
|
||||
|
||||
static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
|
||||
{
|
||||
if (blk_queue_io_stat(bdev->bd_disk->queue))
|
||||
atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
|
||||
}
|
||||
|
||||
static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
|
||||
{
|
||||
md_sync_acct(bio->bi_bdev, nr_sectors);
|
||||
}
|
||||
|
||||
struct md_personality
|
||||
{
|
||||
struct md_submodule_head head;
|
||||
|
|
|
@ -2382,7 +2382,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
|
|||
|
||||
wbio->bi_end_io = end_sync_write;
|
||||
atomic_inc(&r1_bio->remaining);
|
||||
md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
|
||||
|
||||
submit_bio_noacct(wbio);
|
||||
}
|
||||
|
@ -3055,7 +3054,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
bio = r1_bio->bios[i];
|
||||
if (bio->bi_end_io == end_sync_read) {
|
||||
read_targets--;
|
||||
md_sync_acct_bio(bio, nr_sectors);
|
||||
if (read_targets == 1)
|
||||
bio->bi_opf &= ~MD_FAILFAST;
|
||||
submit_bio_noacct(bio);
|
||||
|
@ -3064,7 +3062,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
} else {
|
||||
atomic_set(&r1_bio->remaining, 1);
|
||||
bio = r1_bio->bios[r1_bio->read_disk];
|
||||
md_sync_acct_bio(bio, nr_sectors);
|
||||
if (read_targets == 1)
|
||||
bio->bi_opf &= ~MD_FAILFAST;
|
||||
submit_bio_noacct(bio);
|
||||
|
|
|
@ -2426,7 +2426,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
|
|||
|
||||
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
|
||||
atomic_inc(&r10_bio->remaining);
|
||||
md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
|
||||
|
||||
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
|
||||
tbio->bi_opf |= MD_FAILFAST;
|
||||
|
@ -2448,8 +2447,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
|
|||
bio_copy_data(tbio, fbio);
|
||||
d = r10_bio->devs[i].devnum;
|
||||
atomic_inc(&r10_bio->remaining);
|
||||
md_sync_acct(conf->mirrors[d].replacement->bdev,
|
||||
bio_sectors(tbio));
|
||||
submit_bio_noacct(tbio);
|
||||
}
|
||||
|
||||
|
@ -2583,13 +2580,10 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
|
|||
d = r10_bio->devs[1].devnum;
|
||||
if (wbio->bi_end_io) {
|
||||
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
|
||||
md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
|
||||
submit_bio_noacct(wbio);
|
||||
}
|
||||
if (wbio2) {
|
||||
atomic_inc(&conf->mirrors[d].replacement->nr_pending);
|
||||
md_sync_acct(conf->mirrors[d].replacement->bdev,
|
||||
bio_sectors(wbio2));
|
||||
submit_bio_noacct(wbio2);
|
||||
}
|
||||
}
|
||||
|
@ -3757,7 +3751,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
r10_bio->sectors = nr_sectors;
|
||||
|
||||
if (bio->bi_end_io == end_sync_read) {
|
||||
md_sync_acct_bio(bio, nr_sectors);
|
||||
bio->bi_status = 0;
|
||||
submit_bio_noacct(bio);
|
||||
}
|
||||
|
@ -4880,7 +4873,6 @@ read_more:
|
|||
r10_bio->sectors = nr_sectors;
|
||||
|
||||
/* Now submit the read */
|
||||
md_sync_acct_bio(read_bio, r10_bio->sectors);
|
||||
atomic_inc(&r10_bio->remaining);
|
||||
read_bio->bi_next = NULL;
|
||||
submit_bio_noacct(read_bio);
|
||||
|
@ -4940,7 +4932,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
|
|||
continue;
|
||||
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
md_sync_acct_bio(b, r10_bio->sectors);
|
||||
atomic_inc(&r10_bio->remaining);
|
||||
b->bi_next = NULL;
|
||||
submit_bio_noacct(b);
|
||||
|
|
|
@ -1240,10 +1240,6 @@ again:
|
|||
}
|
||||
|
||||
if (rdev) {
|
||||
if (s->syncing || s->expanding || s->expanded
|
||||
|| s->replacing)
|
||||
md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
|
||||
|
||||
set_bit(STRIPE_IO_STARTED, &sh->state);
|
||||
|
||||
bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags);
|
||||
|
@ -1300,10 +1296,6 @@ again:
|
|||
submit_bio_noacct(bi);
|
||||
}
|
||||
if (rrdev) {
|
||||
if (s->syncing || s->expanding || s->expanded
|
||||
|| s->replacing)
|
||||
md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
|
||||
|
||||
set_bit(STRIPE_IO_STARTED, &sh->state);
|
||||
|
||||
bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags);
|
||||
|
|
|
@ -242,7 +242,7 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
|
|||
{
|
||||
const char *hmac_name;
|
||||
struct crypto_shash *key_tfm;
|
||||
struct shash_desc *shash;
|
||||
SHASH_DESC_ON_STACK(shash, key_tfm);
|
||||
struct nvme_dhchap_key *transformed_key;
|
||||
int ret, key_len;
|
||||
|
||||
|
@ -267,19 +267,11 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
|
|||
if (IS_ERR(key_tfm))
|
||||
return ERR_CAST(key_tfm);
|
||||
|
||||
shash = kmalloc(sizeof(struct shash_desc) +
|
||||
crypto_shash_descsize(key_tfm),
|
||||
GFP_KERNEL);
|
||||
if (!shash) {
|
||||
ret = -ENOMEM;
|
||||
goto out_free_key;
|
||||
}
|
||||
|
||||
key_len = crypto_shash_digestsize(key_tfm);
|
||||
transformed_key = nvme_auth_alloc_key(key_len, key->hash);
|
||||
if (!transformed_key) {
|
||||
ret = -ENOMEM;
|
||||
goto out_free_shash;
|
||||
goto out_free_key;
|
||||
}
|
||||
|
||||
shash->tfm = key_tfm;
|
||||
|
@ -299,15 +291,12 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
|
|||
if (ret < 0)
|
||||
goto out_free_transformed_key;
|
||||
|
||||
kfree(shash);
|
||||
crypto_free_shash(key_tfm);
|
||||
|
||||
return transformed_key;
|
||||
|
||||
out_free_transformed_key:
|
||||
nvme_auth_free_key(transformed_key);
|
||||
out_free_shash:
|
||||
kfree(shash);
|
||||
out_free_key:
|
||||
crypto_free_shash(key_tfm);
|
||||
|
||||
|
|
|
@ -31,6 +31,7 @@ struct nvme_dhchap_queue_context {
|
|||
u32 s1;
|
||||
u32 s2;
|
||||
bool bi_directional;
|
||||
bool authenticated;
|
||||
u16 transaction;
|
||||
u8 status;
|
||||
u8 dhgroup_id;
|
||||
|
@ -682,6 +683,7 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap)
|
|||
static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap)
|
||||
{
|
||||
nvme_auth_reset_dhchap(chap);
|
||||
chap->authenticated = false;
|
||||
if (chap->shash_tfm)
|
||||
crypto_free_shash(chap->shash_tfm);
|
||||
if (chap->dh_tfm)
|
||||
|
@ -930,12 +932,14 @@ static void nvme_queue_auth_work(struct work_struct *work)
|
|||
}
|
||||
if (!ret) {
|
||||
chap->error = 0;
|
||||
chap->authenticated = true;
|
||||
if (ctrl->opts->concat &&
|
||||
(ret = nvme_auth_secure_concat(ctrl, chap))) {
|
||||
dev_warn(ctrl->device,
|
||||
"%s: qid %d failed to enable secure concatenation\n",
|
||||
__func__, chap->qid);
|
||||
chap->error = ret;
|
||||
chap->authenticated = false;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -1023,13 +1027,16 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
|
|||
return;
|
||||
|
||||
for (q = 1; q < ctrl->queue_count; q++) {
|
||||
ret = nvme_auth_negotiate(ctrl, q);
|
||||
if (ret) {
|
||||
dev_warn(ctrl->device,
|
||||
"qid %d: error %d setting up authentication\n",
|
||||
q, ret);
|
||||
break;
|
||||
}
|
||||
struct nvme_dhchap_queue_context *chap =
|
||||
&ctrl->dhchap_ctxs[q];
|
||||
/*
|
||||
* Skip re-authentication if the queue had
|
||||
* not been authenticated initially.
|
||||
*/
|
||||
if (!chap->authenticated)
|
||||
continue;
|
||||
cancel_work_sync(&chap->auth_work);
|
||||
queue_work(nvme_auth_wq, &chap->auth_work);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1037,7 +1044,13 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
|
|||
* the controller terminates the connection.
|
||||
*/
|
||||
for (q = 1; q < ctrl->queue_count; q++) {
|
||||
ret = nvme_auth_wait(ctrl, q);
|
||||
struct nvme_dhchap_queue_context *chap =
|
||||
&ctrl->dhchap_ctxs[q];
|
||||
if (!chap->authenticated)
|
||||
continue;
|
||||
flush_work(&chap->auth_work);
|
||||
ret = chap->error;
|
||||
nvme_auth_reset_dhchap(chap);
|
||||
if (ret)
|
||||
dev_warn(ctrl->device,
|
||||
"qid %d: authentication failed\n", q);
|
||||
|
@ -1076,6 +1089,7 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
|
|||
chap = &ctrl->dhchap_ctxs[i];
|
||||
chap->qid = i;
|
||||
chap->ctrl = ctrl;
|
||||
chap->authenticated = false;
|
||||
INIT_WORK(&chap->auth_work, nvme_queue_auth_work);
|
||||
}
|
||||
|
||||
|
|
|
@ -38,6 +38,8 @@ struct nvme_ns_info {
|
|||
u32 nsid;
|
||||
__le32 anagrpid;
|
||||
u8 pi_offset;
|
||||
u16 endgid;
|
||||
u64 runs;
|
||||
bool is_shared;
|
||||
bool is_readonly;
|
||||
bool is_ready;
|
||||
|
@ -150,6 +152,8 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
|
|||
unsigned nsid);
|
||||
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
|
||||
struct nvme_command *cmd);
|
||||
static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
|
||||
u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);
|
||||
|
||||
void nvme_queue_scan(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
|
@ -664,10 +668,11 @@ static void nvme_free_ns_head(struct kref *ref)
|
|||
struct nvme_ns_head *head =
|
||||
container_of(ref, struct nvme_ns_head, ref);
|
||||
|
||||
nvme_mpath_remove_disk(head);
|
||||
nvme_mpath_put_disk(head);
|
||||
ida_free(&head->subsys->ns_ida, head->instance);
|
||||
cleanup_srcu_struct(&head->srcu);
|
||||
nvme_put_subsystem(head->subsys);
|
||||
kfree(head->plids);
|
||||
kfree(head);
|
||||
}
|
||||
|
||||
|
@ -991,6 +996,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
|
|||
if (req->cmd_flags & REQ_RAHEAD)
|
||||
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
|
||||
|
||||
if (op == nvme_cmd_write && ns->head->nr_plids) {
|
||||
u16 write_stream = req->bio->bi_write_stream;
|
||||
|
||||
if (WARN_ON_ONCE(write_stream > ns->head->nr_plids))
|
||||
return BLK_STS_INVAL;
|
||||
|
||||
if (write_stream) {
|
||||
dsmgmt |= ns->head->plids[write_stream - 1] << 16;
|
||||
control |= NVME_RW_DTYPE_DPLCMT;
|
||||
}
|
||||
}
|
||||
|
||||
if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
|
||||
return BLK_STS_INVAL;
|
||||
|
||||
|
@ -1157,7 +1174,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
|
|||
req->cmd_flags &= ~REQ_FAILFAST_DRIVER;
|
||||
|
||||
if (buffer && bufflen) {
|
||||
ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
|
||||
ret = blk_rq_map_kern(req, buffer, bufflen, GFP_KERNEL);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
@ -1609,6 +1626,7 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
|
|||
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
|
||||
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
|
||||
info->is_ready = true;
|
||||
info->endgid = le16_to_cpu(id->endgid);
|
||||
if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
|
||||
dev_info(ctrl->device,
|
||||
"Ignoring bogus Namespace Identifiers\n");
|
||||
|
@ -1649,6 +1667,7 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
|
|||
info->is_ready = id->nstat & NVME_NSTAT_NRDY;
|
||||
info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
|
||||
info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
|
||||
info->endgid = le16_to_cpu(id->endgid);
|
||||
}
|
||||
kfree(id);
|
||||
return ret;
|
||||
|
@ -1674,7 +1693,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
|
|||
|
||||
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
|
||||
unsigned int dword11, void *buffer, size_t buflen,
|
||||
u32 *result)
|
||||
void *result)
|
||||
{
|
||||
return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
|
||||
buflen, result);
|
||||
|
@ -1683,7 +1702,7 @@ EXPORT_SYMBOL_GPL(nvme_set_features);
|
|||
|
||||
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
|
||||
unsigned int dword11, void *buffer, size_t buflen,
|
||||
u32 *result)
|
||||
void *result)
|
||||
{
|
||||
return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
|
||||
buflen, result);
|
||||
|
@ -2167,6 +2186,148 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl,
|
||||
struct nvme_ns_info *info, u8 fdp_idx)
|
||||
{
|
||||
struct nvme_fdp_config_log hdr, *h;
|
||||
struct nvme_fdp_config_desc *desc;
|
||||
size_t size = sizeof(hdr);
|
||||
void *log, *end;
|
||||
int i, n, ret;
|
||||
|
||||
ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
|
||||
NVME_CSI_NVM, &hdr, size, 0, info->endgid);
|
||||
if (ret) {
|
||||
dev_warn(ctrl->device,
|
||||
"FDP configs log header status:0x%x endgid:%d\n", ret,
|
||||
info->endgid);
|
||||
return ret;
|
||||
}
|
||||
|
||||
size = le32_to_cpu(hdr.sze);
|
||||
if (size > PAGE_SIZE * MAX_ORDER_NR_PAGES) {
|
||||
dev_warn(ctrl->device, "FDP config size too large:%zu\n",
|
||||
size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
h = kvmalloc(size, GFP_KERNEL);
|
||||
if (!h)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
|
||||
NVME_CSI_NVM, h, size, 0, info->endgid);
|
||||
if (ret) {
|
||||
dev_warn(ctrl->device,
|
||||
"FDP configs log status:0x%x endgid:%d\n", ret,
|
||||
info->endgid);
|
||||
goto out;
|
||||
}
|
||||
|
||||
n = le16_to_cpu(h->numfdpc) + 1;
|
||||
if (fdp_idx > n) {
|
||||
dev_warn(ctrl->device, "FDP index:%d out of range:%d\n",
|
||||
fdp_idx, n);
|
||||
/* Proceed without registering FDP streams */
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
log = h + 1;
|
||||
desc = log;
|
||||
end = log + size - sizeof(*h);
|
||||
for (i = 0; i < fdp_idx; i++) {
|
||||
log += le16_to_cpu(desc->dsze);
|
||||
desc = log;
|
||||
if (log >= end) {
|
||||
dev_warn(ctrl->device,
|
||||
"FDP invalid config descriptor list\n");
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
if (le32_to_cpu(desc->nrg) > 1) {
|
||||
dev_warn(ctrl->device, "FDP NRG > 1 not supported\n");
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
info->runs = le64_to_cpu(desc->runs);
|
||||
out:
|
||||
kvfree(h);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int nvme_query_fdp_info(struct nvme_ns *ns, struct nvme_ns_info *info)
|
||||
{
|
||||
struct nvme_ns_head *head = ns->head;
|
||||
struct nvme_ctrl *ctrl = ns->ctrl;
|
||||
struct nvme_fdp_ruh_status *ruhs;
|
||||
struct nvme_fdp_config fdp;
|
||||
struct nvme_command c = {};
|
||||
size_t size;
|
||||
int i, ret;
|
||||
|
||||
/*
|
||||
* The FDP configuration is static for the lifetime of the namespace,
|
||||
* so return immediately if we've already registered this namespace's
|
||||
* streams.
|
||||
*/
|
||||
if (head->nr_plids)
|
||||
return 0;
|
||||
|
||||
ret = nvme_get_features(ctrl, NVME_FEAT_FDP, info->endgid, NULL, 0,
|
||||
&fdp);
|
||||
if (ret) {
|
||||
dev_warn(ctrl->device, "FDP get feature status:0x%x\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (!(fdp.flags & FDPCFG_FDPE))
|
||||
return 0;
|
||||
|
||||
ret = nvme_query_fdp_granularity(ctrl, info, fdp.fdpcidx);
|
||||
if (!info->runs)
|
||||
return ret;
|
||||
|
||||
size = struct_size(ruhs, ruhsd, S8_MAX - 1);
|
||||
ruhs = kzalloc(size, GFP_KERNEL);
|
||||
if (!ruhs)
|
||||
return -ENOMEM;
|
||||
|
||||
c.imr.opcode = nvme_cmd_io_mgmt_recv;
|
||||
c.imr.nsid = cpu_to_le32(head->ns_id);
|
||||
c.imr.mo = NVME_IO_MGMT_RECV_MO_RUHS;
|
||||
c.imr.numd = cpu_to_le32(nvme_bytes_to_numd(size));
|
||||
ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
|
||||
if (ret) {
|
||||
dev_warn(ctrl->device, "FDP io-mgmt status:0x%x\n", ret);
|
||||
goto free;
|
||||
}
|
||||
|
||||
head->nr_plids = le16_to_cpu(ruhs->nruhsd);
|
||||
if (!head->nr_plids)
|
||||
goto free;
|
||||
|
||||
head->plids = kcalloc(head->nr_plids, sizeof(*head->plids),
|
||||
GFP_KERNEL);
|
||||
if (!head->plids) {
|
||||
dev_warn(ctrl->device,
|
||||
"failed to allocate %u FDP placement IDs\n",
|
||||
head->nr_plids);
|
||||
head->nr_plids = 0;
|
||||
ret = -ENOMEM;
|
||||
goto free;
|
||||
}
|
||||
|
||||
for (i = 0; i < head->nr_plids; i++)
|
||||
head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid);
|
||||
free:
|
||||
kfree(ruhs);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int nvme_update_ns_info_block(struct nvme_ns *ns,
		struct nvme_ns_info *info)
{

@@ -2204,6 +2365,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
		goto out;
	}

	if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
		ret = nvme_query_fdp_info(ns, info);
		if (ret < 0)
			goto out;
	}

	lim = queue_limits_start_update(ns->disk->queue);

	memflags = blk_mq_freeze_queue(ns->disk->queue);

@@ -2248,6 +2415,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
	if (!nvme_init_integrity(ns->head, &lim, info))
		capacity = 0;

	lim.max_write_streams = ns->head->nr_plids;
	if (lim.max_write_streams)
		lim.write_stream_granularity = min(info->runs, U32_MAX);
	else
		lim.write_stream_granularity = 0;

	ret = queue_limits_commit_update(ns->disk->queue, &lim);
	if (ret) {
		blk_mq_unfreeze_queue(ns->disk->queue, memflags);

@@ -2351,6 +2524,8 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
		ns->head->disk->flags |= GENHD_FL_HIDDEN;
	else
		nvme_init_integrity(ns->head, &lim, info);
	lim.max_write_streams = ns_lim->max_write_streams;
	lim.write_stream_granularity = ns_lim->write_stream_granularity;
	ret = queue_limits_commit_update(ns->head->disk->queue, &lim);

	set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));

@@ -3108,8 +3283,8 @@ out_unlock:
	return ret;
}

int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
		void *log, size_t size, u64 offset)
static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
		u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi)
{
	struct nvme_command c = { };
	u32 dwlen = nvme_bytes_to_numd(size);

@@ -3123,10 +3298,18 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
	c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
	c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
	c.get_log_page.csi = csi;
	c.get_log_page.lsi = cpu_to_le16(lsi);

	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}

int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
		void *log, size_t size, u64 offset)
{
	return nvme_get_log_lsi(ctrl, nsid, log_page, lsp, csi, log, size,
			offset, 0);
}

static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
		struct nvme_effects_log **log)
{

@@ -3584,7 +3767,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
		 */
		if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
			continue;
		if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
		if (nvme_tryget_ns_head(h))
			return h;
	}

@@ -3828,7 +4011,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
		}
	} else {
		ret = -EINVAL;
		if (!info->is_shared || !head->shared) {
		if ((!info->is_shared || !head->shared) &&
		    !list_empty(&head->list)) {
			dev_err(ctrl->device,
				"Duplicate unshared namespace %d\n",
				info->nsid);

@@ -4032,6 +4216,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
	mutex_lock(&ns->ctrl->subsys->lock);
	list_del_rcu(&ns->siblings);
	if (list_empty(&ns->head->list)) {
		if (!nvme_mpath_queue_if_no_path(ns->head))
			list_del_init(&ns->head->entry);
		last_path = true;
	}

@@ -4053,7 +4238,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
	synchronize_srcu(&ns->ctrl->srcu);

	if (last_path)
		nvme_mpath_shutdown_disk(ns->head);
		nvme_mpath_remove_disk(ns->head);
	nvme_put_ns(ns);
}

@@ -1410,9 +1410,8 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
}

static void
nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
nvme_fc_xmt_ls_rsp_free(struct nvmefc_ls_rcv_op *lsop)
{
	struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private;
	struct nvme_fc_rport *rport = lsop->rport;
	struct nvme_fc_lport *lport = rport->lport;
	unsigned long flags;

@@ -1433,6 +1432,14 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
	nvme_fc_rport_put(rport);
}

static void
nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
{
	struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private;

	nvme_fc_xmt_ls_rsp_free(lsop);
}

static void
nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop)
{

@@ -1450,7 +1457,7 @@ nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop)
		dev_warn(lport->dev,
			"LLDD rejected LS RSP xmt: LS %d status %d\n",
			w0->ls_cmd, ret);
		nvme_fc_xmt_ls_rsp_done(lsop->lsrsp);
		nvme_fc_xmt_ls_rsp_free(lsop);
		return;
	}
}

@@ -10,10 +10,61 @@
#include "nvme.h"

bool multipath = true;
module_param(multipath, bool, 0444);
static bool multipath_always_on;

static int multipath_param_set(const char *val, const struct kernel_param *kp)
{
	int ret;
	bool *arg = kp->arg;

	ret = param_set_bool(val, kp);
	if (ret)
		return ret;

	if (multipath_always_on && !*arg) {
		pr_err("Can't disable multipath when multipath_always_on is configured.\n");
		*arg = true;
		return -EINVAL;
	}

	return 0;
}

static const struct kernel_param_ops multipath_param_ops = {
	.set = multipath_param_set,
	.get = param_get_bool,
};

module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

static int multipath_always_on_set(const char *val,
		const struct kernel_param *kp)
{
	int ret;
	bool *arg = kp->arg;

	ret = param_set_bool(val, kp);
	if (ret < 0)
		return ret;

	if (*arg)
		multipath = true;

	return 0;
}

static const struct kernel_param_ops multipath_always_on_ops = {
	.set = multipath_always_on_set,
	.get = param_get_bool,
};

module_param_cb(multipath_always_on, &multipath_always_on_ops,
		&multipath_always_on, 0444);
MODULE_PARM_DESC(multipath_always_on,
	"create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");

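Not part of the patch: a small userspace sketch for checking how the two parameters interact on a running system. The /sys/module/nvme_core/parameters/ location is an assumption (the parameters live in the nvme-core module in typical builds); adjust the module name if your build differs.

#include <stdio.h>

static void show(const char *name)
{
	char path[128], val[8] = "";
	FILE *f;

	/* assumed location of the nvme-core module parameters */
	snprintf(path, sizeof(path), "/sys/module/nvme_core/parameters/%s", name);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return;
	}
	if (fgets(val, sizeof(val), f))
		printf("%s = %s", name, val);
	fclose(f);
}

int main(void)
{
	/* per multipath_always_on_set() above, enabling it also forces multipath on */
	show("multipath");
	show("multipath_always_on");
	return 0;
}

Per the callbacks above, booting with multipath_always_on set is expected to flip multipath on as well, and a later attempt to clear multipath should be rejected by multipath_param_set() with -EINVAL.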
static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",

@@ -442,7 +493,17 @@ static bool nvme_available_path(struct nvme_ns_head *head)
			break;
		}
	}
	return false;

	/*
	 * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
	 * not immediately fail I/O. Instead, requeue the I/O for the configured
	 * duration, anticipating that if there's a transient link failure then
	 * it may recover within this time window. This parameter is exported to
	 * userspace via sysfs, and its default value is zero. It is internally
	 * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
	 * non-zero, this flag is set to true. When zero, the flag is cleared.
	 */
	return nvme_mpath_queue_if_no_path(head);
}

static void nvme_ns_head_submit_bio(struct bio *bio)

@@ -617,6 +678,40 @@ static void nvme_requeue_work(struct work_struct *work)
	}
}

static void nvme_remove_head(struct nvme_ns_head *head)
{
	if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		/*
		 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
		 * to allow multipath to fail all I/O.
		 */
		kblockd_schedule_work(&head->requeue_work);

		nvme_cdev_del(&head->cdev, &head->cdev_device);
		synchronize_srcu(&head->srcu);
		del_gendisk(head->disk);
		nvme_put_ns_head(head);
	}
}

static void nvme_remove_head_work(struct work_struct *work)
{
	struct nvme_ns_head *head = container_of(to_delayed_work(work),
			struct nvme_ns_head, remove_work);
	bool remove = false;

	mutex_lock(&head->subsys->lock);
	if (list_empty(&head->list)) {
		list_del_init(&head->entry);
		remove = true;
	}
	mutex_unlock(&head->subsys->lock);
	if (remove)
		nvme_remove_head(head);

	module_put(THIS_MODULE);
}

int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	struct queue_limits lim;

@@ -626,14 +721,25 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);
	INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
	INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
	head->delayed_removal_secs = 0;

	/*
	 * Add a multipath node if the subsystems supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing flag
	 * could change after a rescan.
	 * If "multipath_always_on" is enabled, a multipath node is added
	 * regardless of whether the disk is single/multi ported, and whether
	 * the namespace is shared or private. If "multipath_always_on" is not
	 * enabled, a multipath node is added only if the subsystem supports
	 * multiple controllers and the "multipath" option is configured. In
	 * either case, for private namespaces, we ensure that the NSID is
	 * unique.
	 */
	if (!multipath_always_on) {
		if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
		    !nvme_is_unique_nsid(ctrl, head) || !multipath)
		    !multipath)
			return 0;
	}

	if (!nvme_is_unique_nsid(ctrl, head))
		return 0;

	blk_set_stacking_limits(&lim);

@@ -660,6 +766,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
	set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);
	nvme_tryget_ns_head(head);
	return 0;
}

@@ -1016,6 +1123,49 @@ static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr
}
DEVICE_ATTR_RO(numa_nodes);

static ssize_t delayed_removal_secs_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nvme_ns_head *head = disk->private_data;
	int ret;

	mutex_lock(&head->subsys->lock);
	ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
	mutex_unlock(&head->subsys->lock);
	return ret;
}

static ssize_t delayed_removal_secs_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nvme_ns_head *head = disk->private_data;
	unsigned int sec;
	int ret;

	ret = kstrtouint(buf, 0, &sec);
	if (ret < 0)
		return ret;

	mutex_lock(&head->subsys->lock);
	head->delayed_removal_secs = sec;
	if (sec)
		set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
	else
		clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
	mutex_unlock(&head->subsys->lock);
	/*
	 * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
	 * by its reader.
	 */
	synchronize_srcu(&head->srcu);

	return count;
}

DEVICE_ATTR_RW(delayed_removal_secs);

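Not part of the patch: a minimal userspace sketch of driving the new attribute. It assumes the multipath head node is nvme0n1; the attribute hangs off the head node's gendisk, so the exact path below is an assumption to adjust per system.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* assumed head-node path; substitute the real nvmeXnY device */
	const char *path = "/sys/block/nvme0n1/delayed_removal_secs";
	const char *val = "60\n";	/* keep the head node alive for 60s after the last path drops */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}

Per the store and nvme_available_path() code above, a non-zero value sets NVME_NSHEAD_QUEUE_IF_NO_PATH so I/O on the head node is requeued rather than failed for that window after the last path disappears; writing 0 clears the flag and restores the old immediate-failure behaviour.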
static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{

@@ -1137,23 +1287,43 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
#endif
}

void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		nvme_cdev_del(&head->cdev, &head->cdev_device);
	bool remove = false;

	mutex_lock(&head->subsys->lock);
	/*
	 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
	 * to allow multipath to fail all I/O.
	 * We are called when all paths have been removed, and at that point
	 * head->list is expected to be empty. However, nvme_remove_ns() and
	 * nvme_init_ns_head() can run concurrently and so if head->delayed_
	 * removal_secs is configured, it is possible that by the time we reach
	 * this point, head->list may no longer be empty. Therefore, we recheck
	 * head->list here. If it is no longer empty then we skip enqueuing the
	 * delayed head removal work.
	 */
	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
	del_gendisk(head->disk);
	if (!list_empty(&head->list))
		goto out;

	if (head->delayed_removal_secs) {
		/*
		 * Ensure that no one could remove this module while the head
		 * remove work is pending.
		 */
		if (!try_module_get(THIS_MODULE))
			goto out;
		queue_delayed_work(nvme_wq, &head->remove_work,
				head->delayed_removal_secs * HZ);
	} else {
		list_del_init(&head->entry);
		remove = true;
	}
out:
	mutex_unlock(&head->subsys->lock);
	if (remove)
		nvme_remove_head(head);
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;

@@ -497,6 +497,9 @@ struct nvme_ns_head {
	struct device		cdev_device;

	struct gendisk		*disk;

	u16			nr_plids;
	u16			*plids;
#ifdef CONFIG_NVME_MULTIPATH
	struct bio_list		requeue_list;
	spinlock_t		requeue_lock;

@@ -504,7 +507,10 @@ struct nvme_ns_head {
	struct work_struct	partition_scan_work;
	struct mutex		lock;
	unsigned long		flags;
	struct delayed_work	remove_work;
	unsigned int		delayed_removal_secs;
#define NVME_NSHEAD_DISK_LIVE		0
#define NVME_NSHEAD_QUEUE_IF_NO_PATH	1
	struct nvme_ns __rcu	*current_path[];
#endif
};

@@ -897,10 +903,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		int qid, nvme_submit_flags_t flags);
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
		unsigned int dword11, void *buffer, size_t buflen,
		u32 *result);
		void *result);
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
		unsigned int dword11, void *buffer, size_t buflen,
		u32 *result);
		void *result);
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);

@@ -961,7 +967,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns);
void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns);
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
void nvme_mpath_put_disk(struct nvme_ns_head *head);
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
void nvme_mpath_update(struct nvme_ctrl *ctrl);

@@ -970,7 +976,7 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl);
bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
void nvme_mpath_revalidate_paths(struct nvme_ns *ns);
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
void nvme_mpath_shutdown_disk(struct nvme_ns_head *head);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
void nvme_mpath_start_request(struct request *rq);
void nvme_mpath_end_request(struct request *rq);

@@ -987,12 +993,19 @@ extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute dev_attr_queue_depth;
extern struct device_attribute dev_attr_numa_nodes;
extern struct device_attribute dev_attr_delayed_removal_secs;
extern struct device_attribute subsys_attr_iopolicy;

static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
{
	return disk->fops == &nvme_ns_head_ops;
}
static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
{
	if (test_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags))
		return true;
	return false;
}
#else
#define multipath false
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)

@@ -1013,7 +1026,7 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
}
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
static inline void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
}
static inline void nvme_mpath_add_sysfs_link(struct nvme_ns *ns)

@@ -1032,7 +1045,7 @@ static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
}
static inline void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
}
static inline void nvme_trace_bio_complete(struct request *req)

@@ -1080,6 +1093,10 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
{
	return false;
}
static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
{
	return false;
}
#endif /* CONFIG_NVME_MULTIPATH */

int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],

@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/nodemask.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/suspend.h>

@@ -34,16 +35,31 @@
#define SQ_SIZE(q)	((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q)	((q)->q_depth * sizeof(struct nvme_completion))

#define SGES_PER_PAGE	(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc))
/* Optimisation for I/Os between 4k and 128k */
#define NVME_SMALL_POOL_SIZE	256

/*
 * These can be higher, but we need to ensure that any command doesn't
 * require an sg allocation that needs more than a page of data.
 */
#define NVME_MAX_KB_SZ	8192
#define NVME_MAX_SEGS	128
#define NVME_MAX_META_SEGS 15
#define NVME_MAX_NR_ALLOCATIONS	5
#define NVME_MAX_NR_DESCRIPTORS	5

/*
 * For data SGLs we support a single descriptors worth of SGL entries, but for
 * now we also limit it to avoid an allocation larger than PAGE_SIZE for the
 * scatterlist.
 */
#define NVME_MAX_SEGS \
	min(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc), \
	    (PAGE_SIZE / sizeof(struct scatterlist)))

/*
 * For metadata SGLs, only the small descriptor is supported, and the first
 * entry is the segment descriptor, which for the data pointer sits in the SQE.
 */
#define NVME_MAX_META_SEGS \
	((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1)

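Not part of the patch: a quick back-of-the-envelope check that the new expression-based limits match the hard-coded constants they replace. The structure sizes are assumptions for a typical x86-64 build (4 KiB pages, 16-byte struct nvme_sgl_desc, 32-byte struct scatterlist).

#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096;		/* PAGE_SIZE, assumed */
	const unsigned long ctrl_page_size = 4096;	/* NVME_CTRL_PAGE_SIZE */
	const unsigned long sgl_desc_size = 16;		/* sizeof(struct nvme_sgl_desc), assumed */
	const unsigned long scatterlist_size = 32;	/* sizeof(struct scatterlist), assumed */
	const unsigned long small_pool_size = 256;	/* NVME_SMALL_POOL_SIZE */

	unsigned long max_segs = ctrl_page_size / sgl_desc_size;

	if (page_size / scatterlist_size < max_segs)
		max_segs = page_size / scatterlist_size;

	/* 128 and 15: the same values as the old NVME_MAX_SEGS / NVME_MAX_META_SEGS */
	printf("NVME_MAX_SEGS      = %lu\n", max_segs);
	printf("NVME_MAX_META_SEGS = %lu\n", small_pool_size / sgl_desc_size - 1);
	return 0;
}

Under these assumptions both expressions reproduce the removed constants, and the BUILD_BUG_ON() updates near the end of this file's diff check the same relationships at compile time.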
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0444);

@@ -112,6 +128,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static void nvme_delete_io_queues(struct nvme_dev *dev);
static void nvme_update_attrs(struct nvme_dev *dev);

struct nvme_descriptor_pools {
	struct dma_pool *large;
	struct dma_pool *small;
};

/*
 * Represents an NVM Express device. Each nvme_dev is a PCI function.
 */

@@ -121,8 +142,6 @@ struct nvme_dev {
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned online_queues;
	unsigned max_qid;
	unsigned io_queues[HCTX_MAX_TYPES];

@@ -162,6 +181,7 @@ struct nvme_dev {
	unsigned int nr_allocated_queues;
	unsigned int nr_write_queues;
	unsigned int nr_poll_queues;
	struct nvme_descriptor_pools descriptor_pools[];
};

static int io_queue_depth_set(const char *val, const struct kernel_param *kp)

@@ -191,6 +211,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
 */
struct nvme_queue {
	struct nvme_dev *dev;
	struct nvme_descriptor_pools descriptor_pools;
	spinlock_t sq_lock;
	void *sq_cmds;
	/* only used for poll queues: */

@@ -219,30 +240,30 @@ struct nvme_queue {
	struct completion delete_done;
};

union nvme_descriptor {
	struct nvme_sgl_desc	*sg_list;
	__le64			*prp_list;
/* bits for iod->flags */
enum nvme_iod_flags {
	/* this command has been aborted by the timeout handler */
	IOD_ABORTED		= 1U << 0,

	/* uses the small descriptor pool */
	IOD_SMALL_DESCRIPTOR	= 1U << 1,
};

/*
 * The nvme_iod describes the data in an I/O.
 *
 * The sg pointer contains the list of PRP/SGL chunk allocations in addition
 * to the actual struct scatterlist.
 */
struct nvme_iod {
	struct nvme_request req;
	struct nvme_command cmd;
	bool aborted;
	s8 nr_allocations;	/* PRP list pool allocations. 0 means small
				   pool in use */
	u8 flags;
	u8 nr_descriptors;
	unsigned int dma_len;	/* length of single DMA segment mapping */
	dma_addr_t first_dma;
	dma_addr_t meta_dma;
	struct sg_table sgt;
	struct sg_table meta_sgt;
	union nvme_descriptor meta_list;
	union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
	struct nvme_sgl_desc *meta_descriptor;
	void *descriptors[NVME_MAX_NR_DESCRIPTORS];
};

static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)

@ -397,28 +418,76 @@ static __always_inline int nvme_pci_npages_prp(void)
|
|||
return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
|
||||
}
|
||||
|
||||
static struct nvme_descriptor_pools *
|
||||
nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node)
|
||||
{
|
||||
struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node];
|
||||
size_t small_align = NVME_SMALL_POOL_SIZE;
|
||||
|
||||
if (pools->small)
|
||||
return pools; /* already initialized */
|
||||
|
||||
pools->large = dma_pool_create_node("nvme descriptor page", dev->dev,
|
||||
NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0, numa_node);
|
||||
if (!pools->large)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
|
||||
small_align = 512;
|
||||
|
||||
pools->small = dma_pool_create_node("nvme descriptor small", dev->dev,
|
||||
NVME_SMALL_POOL_SIZE, small_align, 0, numa_node);
|
||||
if (!pools->small) {
|
||||
dma_pool_destroy(pools->large);
|
||||
pools->large = NULL;
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
return pools;
|
||||
}
|
||||
|
||||
static void nvme_release_descriptor_pools(struct nvme_dev *dev)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < nr_node_ids; i++) {
|
||||
struct nvme_descriptor_pools *pools = &dev->descriptor_pools[i];
|
||||
|
||||
dma_pool_destroy(pools->large);
|
||||
dma_pool_destroy(pools->small);
|
||||
}
|
||||
}
|
||||
|
||||
static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data,
|
||||
unsigned qid)
|
||||
{
|
||||
struct nvme_dev *dev = to_nvme_dev(data);
|
||||
struct nvme_queue *nvmeq = &dev->queues[qid];
|
||||
struct nvme_descriptor_pools *pools;
|
||||
struct blk_mq_tags *tags;
|
||||
|
||||
tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0];
|
||||
WARN_ON(tags != hctx->tags);
|
||||
pools = nvme_setup_descriptor_pools(dev, hctx->numa_node);
|
||||
if (IS_ERR(pools))
|
||||
return PTR_ERR(pools);
|
||||
|
||||
nvmeq->descriptor_pools = *pools;
|
||||
hctx->driver_data = nvmeq;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
|
||||
unsigned int hctx_idx)
|
||||
{
|
||||
struct nvme_dev *dev = to_nvme_dev(data);
|
||||
struct nvme_queue *nvmeq = &dev->queues[0];
|
||||
|
||||
WARN_ON(hctx_idx != 0);
|
||||
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
|
||||
|
||||
hctx->driver_data = nvmeq;
|
||||
return 0;
|
||||
return nvme_init_hctx_common(hctx, data, 0);
|
||||
}
|
||||
|
||||
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
|
||||
unsigned int hctx_idx)
|
||||
{
|
||||
struct nvme_dev *dev = to_nvme_dev(data);
|
||||
struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
|
||||
|
||||
WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
|
||||
hctx->driver_data = nvmeq;
|
||||
return 0;
|
||||
return nvme_init_hctx_common(hctx, data, hctx_idx + 1);
|
||||
}
|
||||
|
||||
static int nvme_pci_init_request(struct blk_mq_tag_set *set,
|
||||
|
@ -537,23 +606,39 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
|
|||
return true;
|
||||
}
|
||||
|
||||
static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
|
||||
static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
|
||||
struct nvme_iod *iod)
|
||||
{
|
||||
if (iod->flags & IOD_SMALL_DESCRIPTOR)
|
||||
return nvmeq->descriptor_pools.small;
|
||||
return nvmeq->descriptor_pools.large;
|
||||
}
|
||||
|
||||
static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req)
|
||||
{
|
||||
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
|
||||
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
|
||||
dma_addr_t dma_addr = iod->first_dma;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < iod->nr_allocations; i++) {
|
||||
__le64 *prp_list = iod->list[i].prp_list;
|
||||
if (iod->nr_descriptors == 1) {
|
||||
dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0],
|
||||
dma_addr);
|
||||
return;
|
||||
}
|
||||
|
||||
for (i = 0; i < iod->nr_descriptors; i++) {
|
||||
__le64 *prp_list = iod->descriptors[i];
|
||||
dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
|
||||
|
||||
dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
|
||||
dma_pool_free(nvmeq->descriptor_pools.large, prp_list,
|
||||
dma_addr);
|
||||
dma_addr = next_dma_addr;
|
||||
}
|
||||
}
|
||||
|
||||
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
|
||||
static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq,
|
||||
struct request *req)
|
||||
{
|
||||
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
|
||||
|
||||
|
@ -566,15 +651,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
|
|||
WARN_ON_ONCE(!iod->sgt.nents);
|
||||
|
||||
dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
|
||||
|
||||
if (iod->nr_allocations == 0)
|
||||
dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
|
||||
iod->first_dma);
|
||||
else if (iod->nr_allocations == 1)
|
||||
dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
|
||||
iod->first_dma);
|
||||
else
|
||||
nvme_free_prps(dev, req);
|
||||
nvme_free_descriptors(nvmeq, req);
|
||||
mempool_free(iod->sgt.sgl, dev->iod_mempool);
|
||||
}
|
||||
|
||||
|
@ -592,11 +669,10 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents)
|
|||
}
|
||||
}
|
||||
|
||||
static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
|
||||
static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq,
|
||||
struct request *req, struct nvme_rw_command *cmnd)
|
||||
{
|
||||
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
|
||||
struct dma_pool *pool;
|
||||
int length = blk_rq_payload_bytes(req);
|
||||
struct scatterlist *sg = iod->sgt.sgl;
|
||||
int dma_len = sg_dma_len(sg);
|
||||
|
@ -604,7 +680,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
|
|||
int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
|
||||
__le64 *prp_list;
|
||||
dma_addr_t prp_dma;
|
||||
int nprps, i;
|
||||
int i;
|
||||
|
||||
length -= (NVME_CTRL_PAGE_SIZE - offset);
|
||||
if (length <= 0) {
|
||||
|
@ -626,30 +702,26 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
|
|||
goto done;
|
||||
}
|
||||
|
||||
nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
|
||||
if (nprps <= (256 / 8)) {
|
||||
pool = dev->prp_small_pool;
|
||||
iod->nr_allocations = 0;
|
||||
} else {
|
||||
pool = dev->prp_page_pool;
|
||||
iod->nr_allocations = 1;
|
||||
}
|
||||
if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
|
||||
NVME_SMALL_POOL_SIZE / sizeof(__le64))
|
||||
iod->flags |= IOD_SMALL_DESCRIPTOR;
|
||||
|
||||
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
|
||||
if (!prp_list) {
|
||||
iod->nr_allocations = -1;
|
||||
prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
|
||||
&prp_dma);
|
||||
if (!prp_list)
|
||||
return BLK_STS_RESOURCE;
|
||||
}
|
||||
iod->list[0].prp_list = prp_list;
|
||||
iod->descriptors[iod->nr_descriptors++] = prp_list;
|
||||
iod->first_dma = prp_dma;
|
||||
i = 0;
|
||||
for (;;) {
|
||||
if (i == NVME_CTRL_PAGE_SIZE >> 3) {
|
||||
__le64 *old_prp_list = prp_list;
|
||||
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
|
||||
|
||||
prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
|
||||
GFP_ATOMIC, &prp_dma);
|
||||
if (!prp_list)
|
||||
goto free_prps;
|
||||
iod->list[iod->nr_allocations++].prp_list = prp_list;
|
||||
iod->descriptors[iod->nr_descriptors++] = prp_list;
|
||||
prp_list[0] = old_prp_list[i - 1];
|
||||
old_prp_list[i - 1] = cpu_to_le64(prp_dma);
|
||||
i = 1;
|
||||
|
@ -673,7 +745,7 @@ done:
|
|||
cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
|
||||
return BLK_STS_OK;
|
||||
free_prps:
|
||||
nvme_free_prps(dev, req);
|
||||
nvme_free_descriptors(nvmeq, req);
|
||||
return BLK_STS_RESOURCE;
|
||||
bad_sgl:
|
||||
WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
|
||||
|
@ -698,11 +770,10 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
|
|||
sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
|
||||
}
|
||||
|
||||
static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
|
||||
static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq,
|
||||
struct request *req, struct nvme_rw_command *cmd)
|
||||
{
|
||||
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
|
||||
struct dma_pool *pool;
|
||||
struct nvme_sgl_desc *sg_list;
|
||||
struct scatterlist *sg = iod->sgt.sgl;
|
||||
unsigned int entries = iod->sgt.nents;
|
||||
|
@ -717,21 +788,14 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
|
|||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
|
||||
pool = dev->prp_small_pool;
|
||||
iod->nr_allocations = 0;
|
||||
} else {
|
||||
pool = dev->prp_page_pool;
|
||||
iod->nr_allocations = 1;
|
||||
}
|
||||
if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list))
|
||||
iod->flags |= IOD_SMALL_DESCRIPTOR;
|
||||
|
||||
sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
|
||||
if (!sg_list) {
|
||||
iod->nr_allocations = -1;
|
||||
sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
|
||||
&sgl_dma);
|
||||
if (!sg_list)
|
||||
return BLK_STS_RESOURCE;
|
||||
}
|
||||
|
||||
iod->list[0].sg_list = sg_list;
|
||||
iod->descriptors[iod->nr_descriptors++] = sg_list;
|
||||
iod->first_dma = sgl_dma;
|
||||
|
||||
nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
|
||||
|
@ -785,12 +849,12 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
|
|||
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
|
||||
struct nvme_command *cmnd)
|
||||
{
|
||||
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
|
||||
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
|
||||
blk_status_t ret = BLK_STS_RESOURCE;
|
||||
int rc;
|
||||
|
||||
if (blk_rq_nr_phys_segments(req) == 1) {
|
||||
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
|
||||
struct bio_vec bv = req_bvec(req);
|
||||
|
||||
if (!is_pci_p2pdma_page(bv.bv_page)) {
|
||||
|
@ -825,9 +889,9 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
|
|||
}
|
||||
|
||||
if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
|
||||
ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
|
||||
ret = nvme_pci_setup_sgls(nvmeq, req, &cmnd->rw);
|
||||
else
|
||||
ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
|
||||
ret = nvme_pci_setup_prps(nvmeq, req, &cmnd->rw);
|
||||
if (ret != BLK_STS_OK)
|
||||
goto out_unmap_sg;
|
||||
return BLK_STS_OK;
|
||||
|
@ -842,6 +906,7 @@ out_free_sg:
|
|||
static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
|
||||
struct request *req)
|
||||
{
|
||||
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
|
||||
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
|
||||
struct nvme_rw_command *cmnd = &iod->cmd.rw;
|
||||
struct nvme_sgl_desc *sg_list;
|
||||
|
@ -865,12 +930,13 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
|
|||
if (rc)
|
||||
goto out_free_sg;
|
||||
|
||||
sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
|
||||
sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC,
|
||||
&sgl_dma);
|
||||
if (!sg_list)
|
||||
goto out_unmap_sg;
|
||||
|
||||
entries = iod->meta_sgt.nents;
|
||||
iod->meta_list.sg_list = sg_list;
|
||||
iod->meta_descriptor = sg_list;
|
||||
iod->meta_dma = sgl_dma;
|
||||
|
||||
cmnd->flags = NVME_CMD_SGL_METASEG;
|
||||
|
@ -912,7 +978,10 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
|
|||
|
||||
static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
|
||||
{
|
||||
if (nvme_pci_metadata_use_sgls(dev, req))
|
||||
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
|
||||
|
||||
if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
|
||||
nvme_pci_metadata_use_sgls(dev, req))
|
||||
return nvme_pci_setup_meta_sgls(dev, req);
|
||||
return nvme_pci_setup_meta_mptr(dev, req);
|
||||
}
|
||||
|
@ -922,8 +991,8 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
|
|||
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
|
||||
blk_status_t ret;
|
||||
|
||||
iod->aborted = false;
|
||||
iod->nr_allocations = -1;
|
||||
iod->flags = 0;
|
||||
iod->nr_descriptors = 0;
|
||||
iod->sgt.nents = 0;
|
||||
iod->meta_sgt.nents = 0;
|
||||
|
||||
|
@ -947,7 +1016,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
|
|||
return BLK_STS_OK;
|
||||
out_unmap_data:
|
||||
if (blk_rq_nr_phys_segments(req))
|
||||
nvme_unmap_data(dev, req);
|
||||
nvme_unmap_data(dev, req->mq_hctx->driver_data, req);
|
||||
out_free_cmd:
|
||||
nvme_cleanup_cmd(req);
|
||||
return ret;
|
||||
|
@ -1037,6 +1106,7 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
|
|||
}
|
||||
|
||||
static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
|
||||
struct nvme_queue *nvmeq,
|
||||
struct request *req)
|
||||
{
|
||||
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
|
||||
|
@ -1048,7 +1118,7 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
|
|||
return;
|
||||
}
|
||||
|
||||
dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list,
|
||||
dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor,
|
||||
iod->meta_dma);
|
||||
dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
|
||||
mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
|
||||
|
@ -1060,10 +1130,10 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req)
|
|||
struct nvme_dev *dev = nvmeq->dev;
|
||||
|
||||
if (blk_integrity_rq(req))
|
||||
nvme_unmap_metadata(dev, req);
|
||||
nvme_unmap_metadata(dev, nvmeq, req);
|
||||
|
||||
if (blk_rq_nr_phys_segments(req))
|
||||
nvme_unmap_data(dev, req);
|
||||
nvme_unmap_data(dev, nvmeq, req);
|
||||
}
|
||||
|
||||
static void nvme_pci_complete_rq(struct request *req)
|
||||
|
@ -1490,7 +1560,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
|
|||
* returned to the driver, or if this is the admin queue.
|
||||
*/
|
||||
opcode = nvme_req(req)->cmd->common.opcode;
|
||||
if (!nvmeq->qid || iod->aborted) {
|
||||
if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) {
|
||||
dev_warn(dev->ctrl.device,
|
||||
"I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
|
||||
req->tag, nvme_cid(req), opcode,
|
||||
|
@ -1503,7 +1573,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
|
|||
atomic_inc(&dev->ctrl.abort_limit);
|
||||
return BLK_EH_RESET_TIMER;
|
||||
}
|
||||
iod->aborted = true;
|
||||
iod->flags |= IOD_ABORTED;
|
||||
|
||||
cmd.abort.opcode = nvme_admin_abort_cmd;
|
||||
cmd.abort.cid = nvme_cid(req);
|
||||
|
@ -2842,35 +2912,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int nvme_setup_prp_pools(struct nvme_dev *dev)
|
||||
{
|
||||
size_t small_align = 256;
|
||||
|
||||
dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
|
||||
NVME_CTRL_PAGE_SIZE,
|
||||
NVME_CTRL_PAGE_SIZE, 0);
|
||||
if (!dev->prp_page_pool)
|
||||
return -ENOMEM;
|
||||
|
||||
if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
|
||||
small_align = 512;
|
||||
|
||||
/* Optimisation for I/Os between 4k and 128k */
|
||||
dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
|
||||
256, small_align, 0);
|
||||
if (!dev->prp_small_pool) {
|
||||
dma_pool_destroy(dev->prp_page_pool);
|
||||
return -ENOMEM;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void nvme_release_prp_pools(struct nvme_dev *dev)
|
||||
{
|
||||
dma_pool_destroy(dev->prp_page_pool);
|
||||
dma_pool_destroy(dev->prp_small_pool);
|
||||
}
|
||||
|
||||
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
|
||||
{
|
||||
size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
|
||||
|
@ -3185,7 +3226,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
|
|||
struct nvme_dev *dev;
|
||||
int ret = -ENOMEM;
|
||||
|
||||
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
|
||||
dev = kzalloc_node(struct_size(dev, descriptor_pools, nr_node_ids),
|
||||
GFP_KERNEL, node);
|
||||
if (!dev)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
|
||||
|
@ -3260,13 +3302,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
|
|||
if (result)
|
||||
goto out_uninit_ctrl;
|
||||
|
||||
result = nvme_setup_prp_pools(dev);
|
||||
if (result)
|
||||
goto out_dev_unmap;
|
||||
|
||||
result = nvme_pci_alloc_iod_mempool(dev);
|
||||
if (result)
|
||||
goto out_release_prp_pools;
|
||||
goto out_dev_unmap;
|
||||
|
||||
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
|
||||
|
||||
|
@ -3342,8 +3380,6 @@ out_disable:
|
|||
out_release_iod_mempool:
|
||||
mempool_destroy(dev->iod_mempool);
|
||||
mempool_destroy(dev->iod_meta_mempool);
|
||||
out_release_prp_pools:
|
||||
nvme_release_prp_pools(dev);
|
||||
out_dev_unmap:
|
||||
nvme_dev_unmap(dev);
|
||||
out_uninit_ctrl:
|
||||
|
@ -3408,7 +3444,7 @@ static void nvme_remove(struct pci_dev *pdev)
|
|||
nvme_free_queues(dev, 0);
|
||||
mempool_destroy(dev->iod_mempool);
|
||||
mempool_destroy(dev->iod_meta_mempool);
|
||||
nvme_release_prp_pools(dev);
|
||||
nvme_release_descriptor_pools(dev);
|
||||
nvme_dev_unmap(dev);
|
||||
nvme_uninit_ctrl(&dev->ctrl);
|
||||
}
|
||||
|
@ -3809,9 +3845,7 @@ static int __init nvme_init(void)
|
|||
BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
|
||||
BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
|
||||
BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE);
|
||||
BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE);
|
||||
BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS);
|
||||
BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_DESCRIPTORS);
|
||||
|
||||
return pci_register_driver(&nvme_driver);
|
||||
}
|
||||
|
|
|
@ -260,6 +260,7 @@ static struct attribute *nvme_ns_attrs[] = {
|
|||
&dev_attr_ana_state.attr,
|
||||
&dev_attr_queue_depth.attr,
|
||||
&dev_attr_numa_nodes.attr,
|
||||
&dev_attr_delayed_removal_secs.attr,
|
||||
#endif
|
||||
&dev_attr_io_passthru_err_log_enabled.attr,
|
||||
NULL,
|
||||
|
@ -296,6 +297,12 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
|
|||
if (nvme_disk_is_ns_head(dev_to_disk(dev)))
|
||||
return 0;
|
||||
}
|
||||
if (a == &dev_attr_delayed_removal_secs.attr) {
|
||||
struct gendisk *disk = dev_to_disk(dev);
|
||||
|
||||
if (!nvme_disk_is_ns_head(disk))
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
return a->mode;
|
||||
}
|
||||
|
|
|
@ -403,7 +403,7 @@ static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
|
|||
}
|
||||
|
||||
static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
|
||||
bool sync, bool last)
|
||||
bool last)
|
||||
{
|
||||
struct nvme_tcp_queue *queue = req->queue;
|
||||
bool empty;
|
||||
|
@ -417,7 +417,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
|
|||
* are on the same cpu, so we don't introduce contention.
|
||||
*/
|
||||
if (queue->io_cpu == raw_smp_processor_id() &&
|
||||
sync && empty && mutex_trylock(&queue->send_mutex)) {
|
||||
empty && mutex_trylock(&queue->send_mutex)) {
|
||||
nvme_tcp_send_all(queue);
|
||||
mutex_unlock(&queue->send_mutex);
|
||||
}
|
||||
|
@ -770,7 +770,9 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
|
|||
req->ttag = pdu->ttag;
|
||||
|
||||
nvme_tcp_setup_h2c_data_pdu(req);
|
||||
nvme_tcp_queue_request(req, false, true);
|
||||
|
||||
llist_add(&req->lentry, &queue->req_list);
|
||||
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -2385,7 +2387,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (ctrl->opts && ctrl->opts->concat && !ctrl->tls_pskid) {
|
||||
if (ctrl->opts->concat && !ctrl->tls_pskid) {
|
||||
/* See comments for nvme_tcp_key_revoke_needed() */
|
||||
dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n");
|
||||
nvme_stop_keep_alive(ctrl);
|
||||
|
@ -2637,7 +2639,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
|
|||
ctrl->async_req.curr_bio = NULL;
|
||||
ctrl->async_req.data_len = 0;
|
||||
|
||||
nvme_tcp_queue_request(&ctrl->async_req, true, true);
|
||||
nvme_tcp_queue_request(&ctrl->async_req, true);
|
||||
}
|
||||
|
||||
static void nvme_tcp_complete_timed_out(struct request *rq)
|
||||
|
@ -2789,7 +2791,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
|
|||
|
||||
nvme_start_request(rq);
|
||||
|
||||
nvme_tcp_queue_request(req, true, bd->last);
|
||||
nvme_tcp_queue_request(req, bd->last);
|
||||
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
|
|
@ -63,14 +63,9 @@ static void nvmet_execute_create_sq(struct nvmet_req *req)
|
|||
if (status != NVME_SC_SUCCESS)
|
||||
goto complete;
|
||||
|
||||
/*
|
||||
* Note: The NVMe specification allows multiple SQs to use the same CQ.
|
||||
* However, the target code does not really support that. So for now,
|
||||
* prevent this and fail the command if sqid and cqid are different.
|
||||
*/
|
||||
if (!cqid || cqid != sqid) {
|
||||
pr_err("SQ %u: Unsupported CQID %u\n", sqid, cqid);
|
||||
status = NVME_SC_CQ_INVALID | NVME_STATUS_DNR;
|
||||
status = nvmet_check_io_cqid(ctrl, cqid, false);
|
||||
if (status != NVME_SC_SUCCESS) {
|
||||
pr_err("SQ %u: Invalid CQID %u\n", sqid, cqid);
|
||||
goto complete;
|
||||
}
|
||||
|
||||
|
@ -79,7 +74,7 @@ static void nvmet_execute_create_sq(struct nvmet_req *req)
|
|||
goto complete;
|
||||
}
|
||||
|
||||
status = ctrl->ops->create_sq(ctrl, sqid, sq_flags, qsize, prp1);
|
||||
status = ctrl->ops->create_sq(ctrl, sqid, cqid, sq_flags, qsize, prp1);
|
||||
|
||||
complete:
|
||||
nvmet_req_complete(req, status);
|
||||
|
@ -96,15 +91,16 @@ static void nvmet_execute_delete_cq(struct nvmet_req *req)
|
|||
goto complete;
|
||||
}
|
||||
|
||||
if (!cqid) {
|
||||
status = nvmet_check_io_cqid(ctrl, cqid, false);
|
||||
if (status != NVME_SC_SUCCESS)
|
||||
goto complete;
|
||||
|
||||
if (!ctrl->cqs[cqid] || nvmet_cq_in_use(ctrl->cqs[cqid])) {
|
||||
/* Some SQs are still using this CQ */
|
||||
status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
|
||||
goto complete;
|
||||
}
|
||||
|
||||
status = nvmet_check_cqid(ctrl, cqid);
|
||||
if (status != NVME_SC_SUCCESS)
|
||||
goto complete;
|
||||
|
||||
status = ctrl->ops->delete_cq(ctrl, cqid);
|
||||
|
||||
complete:
|
||||
|
@ -127,12 +123,7 @@ static void nvmet_execute_create_cq(struct nvmet_req *req)
|
|||
goto complete;
|
||||
}
|
||||
|
||||
if (!cqid) {
|
||||
status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
|
||||
goto complete;
|
||||
}
|
||||
|
||||
status = nvmet_check_cqid(ctrl, cqid);
|
||||
status = nvmet_check_io_cqid(ctrl, cqid, true);
|
||||
if (status != NVME_SC_SUCCESS)
|
||||
goto complete;
|
||||
|
||||
|
|
|
@ -280,9 +280,12 @@ void nvmet_destroy_auth(struct nvmet_ctrl *ctrl)
|
|||
|
||||
bool nvmet_check_auth_status(struct nvmet_req *req)
|
||||
{
|
||||
if (req->sq->ctrl->host_key &&
|
||||
!req->sq->authenticated)
|
||||
if (req->sq->ctrl->host_key) {
|
||||
if (req->sq->qid > 0)
|
||||
return true;
|
||||
if (!req->sq->authenticated)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -290,7 +293,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
|
|||
unsigned int shash_len)
|
||||
{
|
||||
struct crypto_shash *shash_tfm;
|
||||
struct shash_desc *shash;
|
||||
SHASH_DESC_ON_STACK(shash, shash_tfm);
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
const char *hash_name;
|
||||
u8 *challenge = req->sq->dhchap_c1;
|
||||
|
@ -342,19 +345,13 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
|
|||
req->sq->dhchap_c1,
|
||||
challenge, shash_len);
|
||||
if (ret)
|
||||
goto out_free_challenge;
|
||||
goto out;
|
||||
}
|
||||
|
||||
pr_debug("ctrl %d qid %d host response seq %u transaction %d\n",
|
||||
ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
|
||||
req->sq->dhchap_tid);
|
||||
|
||||
shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
|
||||
GFP_KERNEL);
|
||||
if (!shash) {
|
||||
ret = -ENOMEM;
|
||||
goto out_free_challenge;
|
||||
}
|
||||
shash->tfm = shash_tfm;
|
||||
ret = crypto_shash_init(shash);
|
||||
if (ret)
|
||||
|
@ -389,8 +386,6 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
|
|||
goto out;
|
||||
ret = crypto_shash_final(shash, response);
|
||||
out:
|
||||
kfree(shash);
|
||||
out_free_challenge:
|
||||
if (challenge != req->sq->dhchap_c1)
|
||||
kfree(challenge);
|
||||
out_free_response:
|
||||
|
|
|
@ -813,11 +813,43 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(nvmet_req_complete);
|
||||
|
||||
void nvmet_cq_init(struct nvmet_cq *cq)
|
||||
{
|
||||
refcount_set(&cq->ref, 1);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvmet_cq_init);
|
||||
|
||||
bool nvmet_cq_get(struct nvmet_cq *cq)
|
||||
{
|
||||
return refcount_inc_not_zero(&cq->ref);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvmet_cq_get);
|
||||
|
||||
void nvmet_cq_put(struct nvmet_cq *cq)
|
||||
{
|
||||
if (refcount_dec_and_test(&cq->ref))
|
||||
nvmet_cq_destroy(cq);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvmet_cq_put);
|
||||
|
||||
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
|
||||
u16 qid, u16 size)
|
||||
{
|
||||
cq->qid = qid;
|
||||
cq->size = size;
|
||||
|
||||
ctrl->cqs[qid] = cq;
|
||||
}
|
||||
|
||||
void nvmet_cq_destroy(struct nvmet_cq *cq)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = cq->ctrl;
|
||||
|
||||
if (ctrl) {
|
||||
ctrl->cqs[cq->qid] = NULL;
|
||||
nvmet_ctrl_put(cq->ctrl);
|
||||
cq->ctrl = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
|
||||
|
@ -837,37 +869,47 @@ static void nvmet_confirm_sq(struct percpu_ref *ref)
|
|||
complete(&sq->confirm_done);
|
||||
}
|
||||
|
||||
u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid)
|
||||
u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create)
|
||||
{
|
||||
if (!ctrl->sqs)
|
||||
if (!ctrl->cqs)
|
||||
return NVME_SC_INTERNAL | NVME_STATUS_DNR;
|
||||
|
||||
if (cqid > ctrl->subsys->max_qid)
|
||||
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
|
||||
|
||||
/*
|
||||
* Note: For PCI controllers, the NVMe specifications allows multiple
|
||||
* SQs to share a single CQ. However, we do not support this yet, so
|
||||
* check that there is no SQ defined for a CQ. If one exist, then the
|
||||
* CQ ID is invalid for creation as well as when the CQ is being
|
||||
* deleted (as that would mean that the SQ was not deleted before the
|
||||
* CQ).
|
||||
*/
|
||||
if (ctrl->sqs[cqid])
|
||||
if ((create && ctrl->cqs[cqid]) || (!create && !ctrl->cqs[cqid]))
|
||||
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
|
||||
|
||||
return NVME_SC_SUCCESS;
|
||||
}
|
||||
|
||||
u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create)
|
||||
{
|
||||
if (!cqid)
|
||||
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
|
||||
return nvmet_check_cqid(ctrl, cqid, create);
|
||||
}
|
||||
|
||||
bool nvmet_cq_in_use(struct nvmet_cq *cq)
|
||||
{
|
||||
return refcount_read(&cq->ref) > 1;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nvmet_cq_in_use);
|
||||
|
||||
u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
|
||||
u16 qid, u16 size)
|
||||
{
|
||||
u16 status;
|
||||
|
||||
status = nvmet_check_cqid(ctrl, qid);
|
||||
status = nvmet_check_cqid(ctrl, qid, true);
|
||||
if (status != NVME_SC_SUCCESS)
|
||||
return status;
|
||||
|
||||
if (!kref_get_unless_zero(&ctrl->ref))
|
||||
return NVME_SC_INTERNAL | NVME_STATUS_DNR;
|
||||
cq->ctrl = ctrl;
|
||||
|
||||
nvmet_cq_init(cq);
|
||||
nvmet_cq_setup(ctrl, cq, qid, size);
|
||||
|
||||
return NVME_SC_SUCCESS;
|
||||
|
@ -891,7 +933,7 @@ u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid,
|
|||
}
|
||||
|
||||
u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
|
||||
u16 sqid, u16 size)
|
||||
struct nvmet_cq *cq, u16 sqid, u16 size)
|
||||
{
|
||||
u16 status;
|
||||
int ret;
|
||||
|
@ -903,7 +945,7 @@ u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
|
|||
if (status != NVME_SC_SUCCESS)
|
||||
return status;
|
||||
|
||||
ret = nvmet_sq_init(sq);
|
||||
ret = nvmet_sq_init(sq, cq);
|
||||
if (ret) {
|
||||
status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
|
||||
goto ctrl_put;
|
||||
|
@ -935,6 +977,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq)
|
|||
wait_for_completion(&sq->free_done);
|
||||
percpu_ref_exit(&sq->ref);
|
||||
nvmet_auth_sq_free(sq);
|
||||
nvmet_cq_put(sq->cq);
|
||||
|
||||
/*
|
||||
* we must reference the ctrl again after waiting for inflight IO
|
||||
|
@ -967,18 +1010,23 @@ static void nvmet_sq_free(struct percpu_ref *ref)
|
|||
complete(&sq->free_done);
|
||||
}
|
||||
|
||||
int nvmet_sq_init(struct nvmet_sq *sq)
|
||||
int nvmet_sq_init(struct nvmet_sq *sq, struct nvmet_cq *cq)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (!nvmet_cq_get(cq))
|
||||
return -EINVAL;
|
||||
|
||||
ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
|
||||
if (ret) {
|
||||
pr_err("percpu_ref init failed!\n");
|
||||
nvmet_cq_put(cq);
|
||||
return ret;
|
||||
}
|
||||
init_completion(&sq->free_done);
|
||||
init_completion(&sq->confirm_done);
|
||||
nvmet_auth_sq_init(sq);
|
||||
sq->cq = cq;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1108,13 +1156,13 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
|
|||
return ret;
|
||||
}
|
||||
|
||||
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
|
||||
struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
|
||||
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_sq *sq,
|
||||
const struct nvmet_fabrics_ops *ops)
|
||||
{
|
||||
u8 flags = req->cmd->common.flags;
|
||||
u16 status;
|
||||
|
||||
req->cq = cq;
|
||||
req->cq = sq->cq;
|
||||
req->sq = sq;
|
||||
req->ops = ops;
|
||||
req->sg = NULL;
|
||||
|
@ -1612,12 +1660,17 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
|
|||
if (!ctrl->sqs)
|
||||
goto out_free_changed_ns_list;
|
||||
|
||||
ctrl->cqs = kcalloc(subsys->max_qid + 1, sizeof(struct nvmet_cq *),
|
||||
GFP_KERNEL);
|
||||
if (!ctrl->cqs)
|
||||
goto out_free_sqs;
|
||||
|
||||
ret = ida_alloc_range(&cntlid_ida,
|
||||
subsys->cntlid_min, subsys->cntlid_max,
|
||||
GFP_KERNEL);
|
||||
if (ret < 0) {
|
||||
args->status = NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
|
||||
goto out_free_sqs;
|
||||
goto out_free_cqs;
|
||||
}
|
||||
ctrl->cntlid = ret;
|
||||
|
||||
|
@ -1676,6 +1729,8 @@ init_pr_fail:
|
|||
mutex_unlock(&subsys->lock);
|
||||
nvmet_stop_keep_alive_timer(ctrl);
|
||||
ida_free(&cntlid_ida, ctrl->cntlid);
|
||||
out_free_cqs:
|
||||
kfree(ctrl->cqs);
|
||||
out_free_sqs:
|
||||
kfree(ctrl->sqs);
|
||||
out_free_changed_ns_list:
|
||||
|
@ -1712,6 +1767,7 @@ static void nvmet_ctrl_free(struct kref *ref)
|
|||
|
||||
nvmet_async_events_free(ctrl);
|
||||
kfree(ctrl->sqs);
|
||||
kfree(ctrl->cqs);
|
||||
kfree(ctrl->changed_ns_list);
|
||||
kfree(ctrl);
|
||||
|
||||
|
|
|
@ -119,7 +119,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr,
|
|||
memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE);
|
||||
memcpy(e->traddr, traddr, NVMF_TRADDR_SIZE);
|
||||
memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE);
|
||||
strncpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
|
||||
strscpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -208,6 +208,14 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
|
|||
return NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
|
||||
}
|
||||
|
||||
kref_get(&ctrl->ref);
|
||||
old = cmpxchg(&req->cq->ctrl, NULL, ctrl);
|
||||
if (old) {
|
||||
pr_warn("queue already connected!\n");
|
||||
req->error_loc = offsetof(struct nvmf_connect_command, opcode);
|
||||
return NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
|
||||
}
|
||||
|
||||
/* note: convert queue size from 0's-based value to 1's-based value */
|
||||
nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1);
|
||||
nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1);
|
||||
|
@ -239,8 +247,8 @@ static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq)
|
|||
bool needs_auth = nvmet_has_auth(ctrl, sq);
|
||||
key_serial_t keyid = nvmet_queue_tls_keyid(sq);
|
||||
|
||||
/* Do not authenticate I/O queues for secure concatenation */
|
||||
if (ctrl->concat && sq->qid)
|
||||
/* Do not authenticate I/O queues */
|
||||
if (sq->qid)
|
||||
needs_auth = false;
|
||||
|
||||
if (keyid)
|
||||
|
|
|
@ -816,7 +816,8 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
|
|||
|
||||
nvmet_fc_prep_fcp_iodlist(assoc->tgtport, queue);
|
||||
|
||||
ret = nvmet_sq_init(&queue->nvme_sq);
|
||||
nvmet_cq_init(&queue->nvme_cq);
|
||||
ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
|
||||
if (ret)
|
||||
goto out_fail_iodlist;
|
||||
|
||||
|
@ -826,6 +827,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
|
|||
return queue;
|
||||
|
||||
out_fail_iodlist:
|
||||
nvmet_cq_put(&queue->nvme_cq);
|
||||
nvmet_fc_destroy_fcp_iodlist(assoc->tgtport, queue);
|
||||
destroy_workqueue(queue->work_q);
|
||||
out_free_queue:
|
||||
|
@ -934,6 +936,7 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
|
|||
flush_workqueue(queue->work_q);
|
||||
|
||||
nvmet_sq_destroy(&queue->nvme_sq);
|
||||
nvmet_cq_put(&queue->nvme_cq);
|
||||
|
||||
nvmet_fc_tgt_q_put(queue);
|
||||
}
|
||||
|
@ -1254,6 +1257,7 @@ nvmet_fc_portentry_bind(struct nvmet_fc_tgtport *tgtport,
|
|||
{
|
||||
lockdep_assert_held(&nvmet_fc_tgtlock);
|
||||
|
||||
nvmet_fc_tgtport_get(tgtport);
|
||||
pe->tgtport = tgtport;
|
||||
tgtport->pe = pe;
|
||||
|
||||
|
@ -1273,8 +1277,10 @@ nvmet_fc_portentry_unbind(struct nvmet_fc_port_entry *pe)
|
|||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
|
||||
if (pe->tgtport)
|
||||
if (pe->tgtport) {
|
||||
nvmet_fc_tgtport_put(pe->tgtport);
|
||||
pe->tgtport->pe = NULL;
|
||||
}
|
||||
list_del(&pe->pe_list);
|
||||
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
|
||||
}
|
||||
|
@ -1292,8 +1298,10 @@ nvmet_fc_portentry_unbind_tgt(struct nvmet_fc_tgtport *tgtport)
|
|||
|
||||
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
|
||||
pe = tgtport->pe;
|
||||
if (pe)
|
||||
if (pe) {
|
||||
nvmet_fc_tgtport_put(pe->tgtport);
|
||||
pe->tgtport = NULL;
|
||||
}
|
||||
tgtport->pe = NULL;
|
||||
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
|
||||
}
|
||||
|
@ -1316,6 +1324,9 @@ nvmet_fc_portentry_rebind_tgt(struct nvmet_fc_tgtport *tgtport)
|
|||
list_for_each_entry(pe, &nvmet_fc_portentry_list, pe_list) {
|
||||
if (tgtport->fc_target_port.node_name == pe->node_name &&
|
||||
tgtport->fc_target_port.port_name == pe->port_name) {
|
||||
if (!nvmet_fc_tgtport_get(tgtport))
|
||||
continue;
|
||||
|
||||
WARN_ON(pe->tgtport);
|
||||
tgtport->pe = pe;
|
||||
pe->tgtport = tgtport;
|
||||
|
@ -1580,6 +1591,39 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
|
|||
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
|
||||
}

static void
nvmet_fc_free_pending_reqs(struct nvmet_fc_tgtport *tgtport)
{
	struct nvmet_fc_ls_req_op *lsop;
	struct nvmefc_ls_req *lsreq;
	struct nvmet_fc_ls_iod *iod;
	int i;

	iod = tgtport->iod;
	for (i = 0; i < NVMET_LS_CTX_COUNT; iod++, i++)
		cancel_work(&iod->work);

	/*
	 * After this point the connection is lost and thus any pending
	 * request can't be processed by the normal completion path. This
	 * is likely a request from nvmet_fc_send_ls_req_async.
	 */
	while ((lsop = list_first_entry_or_null(&tgtport->ls_req_list,
				struct nvmet_fc_ls_req_op, lsreq_list))) {
		list_del(&lsop->lsreq_list);

		if (!lsop->req_queued)
			continue;

		lsreq = &lsop->ls_req;
		fc_dma_unmap_single(tgtport->dev, lsreq->rqstdma,
				    (lsreq->rqstlen + lsreq->rsplen),
				    DMA_BIDIRECTIONAL);
		nvmet_fc_tgtport_put(tgtport);
		kfree(lsop);
	}
}

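The new nvmet_fc_free_pending_reqs() above drains the target port's pending LS request list head-first, releasing each entry's resources as it goes, because no completion will ever arrive once the connection is gone. The following is an illustrative sketch only, not kernel code: the same drain-and-free pattern modeled with a plain singly-linked list in userspace, with all names invented for the example.

#include <stdio.h>
#include <stdlib.h>

struct pending_req {
	int id;
	struct pending_req *next;
};

/* Detach and free every queued entry; nothing may touch the list afterwards. */
static void drain_pending(struct pending_req **head)
{
	struct pending_req *req;

	while ((req = *head)) {
		*head = req->next;	/* analogous to list_del() */
		printf("cancelling pending request %d\n", req->id);
		free(req);		/* analogous to kfree(lsop) */
	}
}

int main(void)
{
	struct pending_req *head = NULL;

	for (int i = 0; i < 3; i++) {
		struct pending_req *req = malloc(sizeof(*req));

		if (!req)
			return 1;
		req->id = i;
		req->next = head;
		head = req;
	}
	drain_pending(&head);
	return 0;
}
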
/**
|
||||
* nvmet_fc_unregister_targetport - transport entry point called by an
|
||||
* LLDD to deregister/remove a previously
|
||||
|
@ -1608,13 +1652,7 @@ nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *target_port)
|
|||
|
||||
flush_workqueue(nvmet_wq);
|
||||
|
||||
/*
|
||||
* should terminate LS's as well. However, LS's will be generated
|
||||
* at the tail end of association termination, so they likely don't
|
||||
* exist yet. And even if they did, it's worthwhile to just let
|
||||
* them finish and targetport ref counting will clean things up.
|
||||
*/
|
||||
|
||||
nvmet_fc_free_pending_reqs(tgtport);
|
||||
nvmet_fc_tgtport_put(tgtport);
|
||||
|
||||
return 0;
|
||||
|
@ -2531,9 +2569,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
|
|||
fod->data_sg = NULL;
|
||||
fod->data_sg_cnt = 0;
|
||||
|
||||
ret = nvmet_req_init(&fod->req,
|
||||
&fod->queue->nvme_cq,
|
||||
&fod->queue->nvme_sq,
|
||||
ret = nvmet_req_init(&fod->req, &fod->queue->nvme_sq,
|
||||
&nvmet_fc_tgt_fcp_ops);
|
||||
if (!ret) {
|
||||
/* bad SQE content or invalid ctrl state */
|
||||
|
@ -2860,12 +2896,17 @@ nvmet_fc_add_port(struct nvmet_port *port)
|
|||
list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) {
|
||||
if ((tgtport->fc_target_port.node_name == traddr.nn) &&
|
||||
(tgtport->fc_target_port.port_name == traddr.pn)) {
|
||||
if (!nvmet_fc_tgtport_get(tgtport))
|
||||
continue;
|
||||
|
||||
/* a FC port can only be 1 nvmet port id */
|
||||
if (!tgtport->pe) {
|
||||
nvmet_fc_portentry_bind(tgtport, pe, port);
|
||||
ret = 0;
|
||||
} else
|
||||
ret = -EALREADY;
|
||||
|
||||
nvmet_fc_tgtport_put(tgtport);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -2881,11 +2922,21 @@ static void
|
|||
nvmet_fc_remove_port(struct nvmet_port *port)
|
||||
{
|
||||
struct nvmet_fc_port_entry *pe = port->priv;
|
||||
struct nvmet_fc_tgtport *tgtport = NULL;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
|
||||
if (pe->tgtport && nvmet_fc_tgtport_get(pe->tgtport))
|
||||
tgtport = pe->tgtport;
|
||||
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
|
||||
|
||||
nvmet_fc_portentry_unbind(pe);
|
||||
|
||||
if (tgtport) {
|
||||
/* terminate any outstanding associations */
|
||||
__nvmet_fc_free_assocs(pe->tgtport);
|
||||
__nvmet_fc_free_assocs(tgtport);
|
||||
nvmet_fc_tgtport_put(tgtport);
|
||||
}
|
||||
|
||||
kfree(pe);
|
||||
}
|
||||
|
@ -2894,10 +2945,21 @@ static void
|
|||
nvmet_fc_discovery_chg(struct nvmet_port *port)
|
||||
{
|
||||
struct nvmet_fc_port_entry *pe = port->priv;
|
||||
struct nvmet_fc_tgtport *tgtport = pe->tgtport;
|
||||
struct nvmet_fc_tgtport *tgtport = NULL;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
|
||||
if (pe->tgtport && nvmet_fc_tgtport_get(pe->tgtport))
|
||||
tgtport = pe->tgtport;
|
||||
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
|
||||
|
||||
if (!tgtport)
|
||||
return;
|
||||
|
||||
if (tgtport && tgtport->ops->discovery_event)
|
||||
tgtport->ops->discovery_event(&tgtport->fc_target_port);
|
||||
|
||||
nvmet_fc_tgtport_put(tgtport);
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
|
|
|
@ -207,7 +207,6 @@ static LIST_HEAD(fcloop_nports);
|
|||
struct fcloop_lport {
|
||||
struct nvme_fc_local_port *localport;
|
||||
struct list_head lport_list;
|
||||
struct completion unreg_done;
|
||||
refcount_t ref;
|
||||
};
|
||||
|
||||
|
@ -215,6 +214,9 @@ struct fcloop_lport_priv {
|
|||
struct fcloop_lport *lport;
|
||||
};
|
||||
|
||||
/* The port is already being removed, avoid double free */
|
||||
#define PORT_DELETED 0
|
||||
|
||||
struct fcloop_rport {
|
||||
struct nvme_fc_remote_port *remoteport;
|
||||
struct nvmet_fc_target_port *targetport;
|
||||
|
@ -223,6 +225,7 @@ struct fcloop_rport {
|
|||
spinlock_t lock;
|
||||
struct list_head ls_list;
|
||||
struct work_struct ls_work;
|
||||
unsigned long flags;
|
||||
};
|
||||
|
||||
struct fcloop_tport {
|
||||
|
@ -233,6 +236,7 @@ struct fcloop_tport {
|
|||
spinlock_t lock;
|
||||
struct list_head ls_list;
|
||||
struct work_struct ls_work;
|
||||
unsigned long flags;
|
||||
};
|
||||
|
||||
struct fcloop_nport {
|
||||
|
@ -288,6 +292,9 @@ struct fcloop_ini_fcpreq {
|
|||
spinlock_t inilock;
|
||||
};
|
||||
|
||||
/* SLAB cache for fcloop_lsreq structures */
|
||||
static struct kmem_cache *lsreq_cache;
|
||||
|
||||
static inline struct fcloop_lsreq *
|
||||
ls_rsp_to_lsreq(struct nvmefc_ls_rsp *lsrsp)
|
||||
{
|
||||
|
@ -338,6 +345,7 @@ fcloop_rport_lsrqst_work(struct work_struct *work)
|
|||
* callee may free memory containing tls_req.
|
||||
* do not reference lsreq after this.
|
||||
*/
|
||||
kmem_cache_free(lsreq_cache, tls_req);
|
||||
|
||||
spin_lock(&rport->lock);
|
||||
}
|
||||
|
@ -349,10 +357,13 @@ fcloop_h2t_ls_req(struct nvme_fc_local_port *localport,
|
|||
struct nvme_fc_remote_port *remoteport,
|
||||
struct nvmefc_ls_req *lsreq)
|
||||
{
|
||||
struct fcloop_lsreq *tls_req = lsreq->private;
|
||||
struct fcloop_rport *rport = remoteport->private;
|
||||
struct fcloop_lsreq *tls_req;
|
||||
int ret = 0;
|
||||
|
||||
tls_req = kmem_cache_alloc(lsreq_cache, GFP_KERNEL);
|
||||
if (!tls_req)
|
||||
return -ENOMEM;
|
||||
tls_req->lsreq = lsreq;
|
||||
INIT_LIST_HEAD(&tls_req->ls_list);
|
||||
|
||||
|
@ -389,13 +400,16 @@ fcloop_h2t_xmt_ls_rsp(struct nvmet_fc_target_port *targetport,
|
|||
|
||||
lsrsp->done(lsrsp);
|
||||
|
||||
if (remoteport) {
|
||||
if (!remoteport) {
|
||||
kmem_cache_free(lsreq_cache, tls_req);
|
||||
return 0;
|
||||
}
|
||||
|
||||
rport = remoteport->private;
|
||||
spin_lock(&rport->lock);
|
||||
list_add_tail(&tls_req->ls_list, &rport->ls_list);
|
||||
spin_unlock(&rport->lock);
|
||||
queue_work(nvmet_wq, &rport->ls_work);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -422,6 +436,7 @@ fcloop_tport_lsrqst_work(struct work_struct *work)
|
|||
* callee may free memory containing tls_req.
|
||||
* do not reference lsreq after this.
|
||||
*/
|
||||
kmem_cache_free(lsreq_cache, tls_req);
|
||||
|
||||
spin_lock(&tport->lock);
|
||||
}
|
||||
|
@ -432,8 +447,8 @@ static int
|
|||
fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle,
|
||||
struct nvmefc_ls_req *lsreq)
|
||||
{
|
||||
struct fcloop_lsreq *tls_req = lsreq->private;
|
||||
struct fcloop_tport *tport = targetport->private;
|
||||
struct fcloop_lsreq *tls_req;
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
|
@ -441,6 +456,10 @@ fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle,
|
|||
* hosthandle ignored as fcloop currently is
|
||||
* 1:1 tgtport vs remoteport
|
||||
*/
|
||||
|
||||
tls_req = kmem_cache_alloc(lsreq_cache, GFP_KERNEL);
|
||||
if (!tls_req)
|
||||
return -ENOMEM;
|
||||
tls_req->lsreq = lsreq;
|
||||
INIT_LIST_HEAD(&tls_req->ls_list);
|
||||
|
||||
|
@ -457,6 +476,9 @@ fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle,
|
|||
ret = nvme_fc_rcv_ls_req(tport->remoteport, &tls_req->ls_rsp,
|
||||
lsreq->rqstaddr, lsreq->rqstlen);
|
||||
|
||||
if (ret)
|
||||
kmem_cache_free(lsreq_cache, tls_req);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -471,18 +493,30 @@ fcloop_t2h_xmt_ls_rsp(struct nvme_fc_local_port *localport,
|
|||
struct nvmet_fc_target_port *targetport = rport->targetport;
|
||||
struct fcloop_tport *tport;
|
||||
|
||||
if (!targetport) {
|
||||
/*
|
||||
* The target port is gone. The target doesn't expect any
|
||||
* response anymore and the ->done call is not valid
|
||||
* because the resources have been freed by
|
||||
* nvmet_fc_free_pending_reqs.
|
||||
*
|
||||
* We end up here from delete association exchange:
|
||||
* nvmet_fc_xmt_disconnect_assoc sends an async request.
|
||||
*/
|
||||
kmem_cache_free(lsreq_cache, tls_req);
|
||||
return 0;
|
||||
}
|
||||
|
||||
memcpy(lsreq->rspaddr, lsrsp->rspbuf,
|
||||
((lsreq->rsplen < lsrsp->rsplen) ?
|
||||
lsreq->rsplen : lsrsp->rsplen));
|
||||
lsrsp->done(lsrsp);
|
||||
|
||||
if (targetport) {
|
||||
tport = targetport->private;
|
||||
spin_lock(&tport->lock);
|
||||
list_add_tail(&tls_req->ls_list, &tport->ls_list);
|
||||
spin_unlock(&tport->lock);
|
||||
queue_work(nvmet_wq, &tport->ls_work);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -566,6 +600,7 @@ fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
|
|||
}
|
||||
|
||||
/* release original io reference on tgt struct */
|
||||
if (tfcp_req)
|
||||
fcloop_tfcp_req_put(tfcp_req);
|
||||
}
|
||||
|
||||
|
@ -618,12 +653,13 @@ fcloop_fcp_recv_work(struct work_struct *work)
|
|||
{
|
||||
struct fcloop_fcpreq *tfcp_req =
|
||||
container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
|
||||
struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
|
||||
struct nvmefc_fcp_req *fcpreq;
|
||||
unsigned long flags;
|
||||
int ret = 0;
|
||||
bool aborted = false;
|
||||
|
||||
spin_lock_irqsave(&tfcp_req->reqlock, flags);
|
||||
fcpreq = tfcp_req->fcpreq;
|
||||
switch (tfcp_req->inistate) {
|
||||
case INI_IO_START:
|
||||
tfcp_req->inistate = INI_IO_ACTIVE;
|
||||
|
@ -638,16 +674,19 @@ fcloop_fcp_recv_work(struct work_struct *work)
|
|||
}
|
||||
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
|
||||
|
||||
if (unlikely(aborted))
|
||||
ret = -ECANCELED;
|
||||
else {
|
||||
if (likely(!check_for_drop(tfcp_req)))
|
||||
if (unlikely(aborted)) {
|
||||
/* the abort handler will call fcloop_call_host_done */
|
||||
return;
|
||||
}
|
||||
|
||||
if (unlikely(check_for_drop(tfcp_req))) {
|
||||
pr_info("%s: dropped command ********\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
|
||||
&tfcp_req->tgt_fcp_req,
|
||||
fcpreq->cmdaddr, fcpreq->cmdlen);
|
||||
else
|
||||
pr_info("%s: dropped command ********\n", __func__);
|
||||
}
|
||||
if (ret)
|
||||
fcloop_call_host_done(fcpreq, tfcp_req, ret);
|
||||
}
|
||||
|
@ -662,15 +701,17 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
|
|||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&tfcp_req->reqlock, flags);
|
||||
fcpreq = tfcp_req->fcpreq;
|
||||
switch (tfcp_req->inistate) {
|
||||
case INI_IO_ABORTED:
|
||||
fcpreq = tfcp_req->fcpreq;
|
||||
tfcp_req->fcpreq = NULL;
|
||||
break;
|
||||
case INI_IO_COMPLETED:
|
||||
completed = true;
|
||||
break;
|
||||
default:
|
||||
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
|
||||
fcloop_tfcp_req_put(tfcp_req);
|
||||
WARN_ON(1);
|
||||
return;
|
||||
}
|
||||
|
@ -686,10 +727,6 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
|
|||
nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
|
||||
&tfcp_req->tgt_fcp_req);
|
||||
|
||||
spin_lock_irqsave(&tfcp_req->reqlock, flags);
|
||||
tfcp_req->fcpreq = NULL;
|
||||
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
|
||||
|
||||
fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
|
||||
/* call_host_done releases reference for abort downcall */
|
||||
}
|
||||
|
@ -958,13 +995,16 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
|
|||
|
||||
spin_lock(&inireq->inilock);
|
||||
tfcp_req = inireq->tfcp_req;
|
||||
if (tfcp_req)
|
||||
fcloop_tfcp_req_get(tfcp_req);
|
||||
if (tfcp_req) {
|
||||
if (!fcloop_tfcp_req_get(tfcp_req))
|
||||
tfcp_req = NULL;
|
||||
}
|
||||
spin_unlock(&inireq->inilock);
|
||||
|
||||
if (!tfcp_req)
|
||||
if (!tfcp_req) {
|
||||
/* abort has already been called */
|
||||
return;
|
||||
goto out_host_done;
|
||||
}
|
||||
|
||||
/* break initiator/target relationship for io */
|
||||
spin_lock_irqsave(&tfcp_req->reqlock, flags);
|
||||
|
@ -979,7 +1019,7 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
|
|||
default:
|
||||
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
|
||||
WARN_ON(1);
|
||||
return;
|
||||
goto out_host_done;
|
||||
}
|
||||
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
|
||||
|
||||
|
@ -993,6 +1033,11 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
|
|||
*/
|
||||
fcloop_tfcp_req_put(tfcp_req);
|
||||
}
|
||||
|
||||
return;
|
||||
|
||||
out_host_done:
|
||||
fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -1019,9 +1064,18 @@ fcloop_lport_get(struct fcloop_lport *lport)
|
|||
static void
|
||||
fcloop_nport_put(struct fcloop_nport *nport)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
if (!refcount_dec_and_test(&nport->ref))
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(&fcloop_lock, flags);
|
||||
list_del(&nport->nport_list);
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
if (nport->lport)
|
||||
fcloop_lport_put(nport->lport);
|
||||
|
||||
kfree(nport);
|
||||
}
|
||||
|
||||
|
@ -1037,9 +1091,6 @@ fcloop_localport_delete(struct nvme_fc_local_port *localport)
|
|||
struct fcloop_lport_priv *lport_priv = localport->private;
|
||||
struct fcloop_lport *lport = lport_priv->lport;
|
||||
|
||||
/* release any threads waiting for the unreg to complete */
|
||||
complete(&lport->unreg_done);
|
||||
|
||||
fcloop_lport_put(lport);
|
||||
}
|
||||
|
||||
|
@ -1047,8 +1098,18 @@ static void
|
|||
fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport)
|
||||
{
|
||||
struct fcloop_rport *rport = remoteport->private;
|
||||
bool put_port = false;
|
||||
unsigned long flags;
|
||||
|
||||
flush_work(&rport->ls_work);
|
||||
|
||||
spin_lock_irqsave(&fcloop_lock, flags);
|
||||
if (!test_and_set_bit(PORT_DELETED, &rport->flags))
|
||||
put_port = true;
|
||||
rport->nport->rport = NULL;
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
if (put_port)
|
||||
fcloop_nport_put(rport->nport);
|
||||
}
|
||||
|
||||
|
@ -1056,8 +1117,18 @@ static void
|
|||
fcloop_targetport_delete(struct nvmet_fc_target_port *targetport)
|
||||
{
|
||||
struct fcloop_tport *tport = targetport->private;
|
||||
bool put_port = false;
|
||||
unsigned long flags;
|
||||
|
||||
flush_work(&tport->ls_work);
|
||||
|
||||
spin_lock_irqsave(&fcloop_lock, flags);
|
||||
if (!test_and_set_bit(PORT_DELETED, &tport->flags))
|
||||
put_port = true;
|
||||
tport->nport->tport = NULL;
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
if (put_port)
|
||||
fcloop_nport_put(tport->nport);
|
||||
}
|
||||
|
||||
|
@ -1082,7 +1153,6 @@ static struct nvme_fc_port_template fctemplate = {
|
|||
/* sizes of additional private data for data structures */
|
||||
.local_priv_sz = sizeof(struct fcloop_lport_priv),
|
||||
.remote_priv_sz = sizeof(struct fcloop_rport),
|
||||
.lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
|
||||
.fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq),
|
||||
};
|
||||
|
||||
|
@ -1105,7 +1175,6 @@ static struct nvmet_fc_target_template tgttemplate = {
|
|||
.target_features = 0,
|
||||
/* sizes of additional private data for data structures */
|
||||
.target_priv_sz = sizeof(struct fcloop_tport),
|
||||
.lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
|
||||
};
|
||||
|
||||
static ssize_t
|
||||
|
@ -1170,51 +1239,92 @@ out_free_lport:
|
|||
}
|
||||
|
||||
static int
|
||||
__wait_localport_unreg(struct fcloop_lport *lport)
|
||||
__localport_unreg(struct fcloop_lport *lport)
|
||||
{
|
||||
int ret;
|
||||
|
||||
init_completion(&lport->unreg_done);
|
||||
|
||||
ret = nvme_fc_unregister_localport(lport->localport);
|
||||
|
||||
if (!ret)
|
||||
wait_for_completion(&lport->unreg_done);
|
||||
|
||||
return ret;
|
||||
return nvme_fc_unregister_localport(lport->localport);
|
||||
}
|
||||
|
||||
static struct fcloop_nport *
|
||||
__fcloop_nport_lookup(u64 node_name, u64 port_name)
|
||||
{
|
||||
struct fcloop_nport *nport;
|
||||
|
||||
list_for_each_entry(nport, &fcloop_nports, nport_list) {
|
||||
if (nport->node_name != node_name ||
|
||||
nport->port_name != port_name)
|
||||
continue;
|
||||
|
||||
if (fcloop_nport_get(nport))
|
||||
return nport;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct fcloop_nport *
|
||||
fcloop_nport_lookup(u64 node_name, u64 port_name)
|
||||
{
|
||||
struct fcloop_nport *nport;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&fcloop_lock, flags);
|
||||
nport = __fcloop_nport_lookup(node_name, port_name);
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
return nport;
|
||||
}
|
||||
|
||||
static struct fcloop_lport *
|
||||
__fcloop_lport_lookup(u64 node_name, u64 port_name)
|
||||
{
|
||||
struct fcloop_lport *lport;
|
||||
|
||||
list_for_each_entry(lport, &fcloop_lports, lport_list) {
|
||||
if (lport->localport->node_name != node_name ||
|
||||
lport->localport->port_name != port_name)
|
||||
continue;
|
||||
|
||||
if (fcloop_lport_get(lport))
|
||||
return lport;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct fcloop_lport *
|
||||
fcloop_lport_lookup(u64 node_name, u64 port_name)
|
||||
{
|
||||
struct fcloop_lport *lport;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&fcloop_lock, flags);
|
||||
lport = __fcloop_lport_lookup(node_name, port_name);
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
return lport;
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
fcloop_delete_local_port(struct device *dev, struct device_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
struct fcloop_lport *tlport, *lport = NULL;
|
||||
struct fcloop_lport *lport;
|
||||
u64 nodename, portname;
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
||||
ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
spin_lock_irqsave(&fcloop_lock, flags);
|
||||
|
||||
list_for_each_entry(tlport, &fcloop_lports, lport_list) {
|
||||
if (tlport->localport->node_name == nodename &&
|
||||
tlport->localport->port_name == portname) {
|
||||
if (!fcloop_lport_get(tlport))
|
||||
break;
|
||||
lport = tlport;
|
||||
break;
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
lport = fcloop_lport_lookup(nodename, portname);
|
||||
if (!lport)
|
||||
return -ENOENT;
|
||||
|
||||
ret = __wait_localport_unreg(lport);
|
||||
ret = __localport_unreg(lport);
|
||||
fcloop_lport_put(lport);
|
||||
|
||||
return ret ? ret : count;
|
||||
|
@ -1223,8 +1333,8 @@ fcloop_delete_local_port(struct device *dev, struct device_attribute *attr,
|
|||
static struct fcloop_nport *
|
||||
fcloop_alloc_nport(const char *buf, size_t count, bool remoteport)
|
||||
{
|
||||
struct fcloop_nport *newnport, *nport = NULL;
|
||||
struct fcloop_lport *tmplport, *lport = NULL;
|
||||
struct fcloop_nport *newnport, *nport;
|
||||
struct fcloop_lport *lport;
|
||||
struct fcloop_ctrl_options *opts;
|
||||
unsigned long flags;
|
||||
u32 opts_mask = (remoteport) ? RPORT_OPTS : TGTPORT_OPTS;
|
||||
|
@ -1239,10 +1349,8 @@ fcloop_alloc_nport(const char *buf, size_t count, bool remoteport)
|
|||
goto out_free_opts;
|
||||
|
||||
/* everything there ? */
|
||||
if ((opts->mask & opts_mask) != opts_mask) {
|
||||
ret = -EINVAL;
|
||||
if ((opts->mask & opts_mask) != opts_mask)
|
||||
goto out_free_opts;
|
||||
}
|
||||
|
||||
newnport = kzalloc(sizeof(*newnport), GFP_KERNEL);
|
||||
if (!newnport)
|
||||
|
@ -1258,60 +1366,61 @@ fcloop_alloc_nport(const char *buf, size_t count, bool remoteport)
|
|||
refcount_set(&newnport->ref, 1);
|
||||
|
||||
spin_lock_irqsave(&fcloop_lock, flags);
|
||||
|
||||
list_for_each_entry(tmplport, &fcloop_lports, lport_list) {
|
||||
if (tmplport->localport->node_name == opts->wwnn &&
|
||||
tmplport->localport->port_name == opts->wwpn)
|
||||
goto out_invalid_opts;
|
||||
|
||||
if (tmplport->localport->node_name == opts->lpwwnn &&
|
||||
tmplport->localport->port_name == opts->lpwwpn)
|
||||
lport = tmplport;
|
||||
lport = __fcloop_lport_lookup(opts->wwnn, opts->wwpn);
|
||||
if (lport) {
|
||||
/* invalid configuration */
|
||||
fcloop_lport_put(lport);
|
||||
goto out_free_newnport;
|
||||
}
|
||||
|
||||
if (remoteport) {
|
||||
if (!lport)
|
||||
goto out_invalid_opts;
|
||||
newnport->lport = lport;
|
||||
}
|
||||
|
||||
list_for_each_entry(nport, &fcloop_nports, nport_list) {
|
||||
if (nport->node_name == opts->wwnn &&
|
||||
nport->port_name == opts->wwpn) {
|
||||
if ((remoteport && nport->rport) ||
|
||||
(!remoteport && nport->tport)) {
|
||||
nport = NULL;
|
||||
goto out_invalid_opts;
|
||||
}
|
||||
|
||||
fcloop_nport_get(nport);
|
||||
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
if (remoteport)
|
||||
nport->lport = lport;
|
||||
if (opts->mask & NVMF_OPT_ROLES)
|
||||
nport->port_role = opts->roles;
|
||||
if (opts->mask & NVMF_OPT_FCADDR)
|
||||
nport->port_id = opts->fcaddr;
|
||||
lport = __fcloop_lport_lookup(opts->lpwwnn, opts->lpwwpn);
|
||||
if (!lport) {
|
||||
/* invalid configuration */
|
||||
goto out_free_newnport;
|
||||
}
|
||||
}
|
||||
|
||||
list_add_tail(&newnport->nport_list, &fcloop_nports);
|
||||
nport = __fcloop_nport_lookup(opts->wwnn, opts->wwpn);
|
||||
if (nport) {
|
||||
if ((remoteport && nport->rport) ||
|
||||
(!remoteport && nport->tport)) {
|
||||
/* invalid configuration */
|
||||
goto out_put_nport;
|
||||
}
|
||||
|
||||
/* found existing nport, discard the new nport */
|
||||
kfree(newnport);
|
||||
} else {
|
||||
list_add_tail(&newnport->nport_list, &fcloop_nports);
|
||||
nport = newnport;
|
||||
}
|
||||
|
||||
if (opts->mask & NVMF_OPT_ROLES)
|
||||
nport->port_role = opts->roles;
|
||||
if (opts->mask & NVMF_OPT_FCADDR)
|
||||
nport->port_id = opts->fcaddr;
|
||||
if (lport) {
|
||||
if (!nport->lport)
|
||||
nport->lport = lport;
|
||||
else
|
||||
fcloop_lport_put(lport);
|
||||
}
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
kfree(opts);
|
||||
return newnport;
|
||||
return nport;
|
||||
|
||||
out_invalid_opts:
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
out_put_nport:
|
||||
if (lport)
|
||||
fcloop_lport_put(lport);
|
||||
fcloop_nport_put(nport);
|
||||
out_free_newnport:
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
kfree(newnport);
|
||||
out_free_opts:
|
||||
kfree(opts);
|
||||
return nport;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
|
@ -1352,6 +1461,7 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr,
|
|||
rport->nport = nport;
|
||||
rport->lport = nport->lport;
|
||||
nport->rport = rport;
|
||||
rport->flags = 0;
|
||||
spin_lock_init(&rport->lock);
|
||||
INIT_WORK(&rport->ls_work, fcloop_rport_lsrqst_work);
|
||||
INIT_LIST_HEAD(&rport->ls_list);
|
||||
|
@ -1365,21 +1475,18 @@ __unlink_remote_port(struct fcloop_nport *nport)
|
|||
{
|
||||
struct fcloop_rport *rport = nport->rport;
|
||||
|
||||
lockdep_assert_held(&fcloop_lock);
|
||||
|
||||
if (rport && nport->tport)
|
||||
nport->tport->remoteport = NULL;
|
||||
nport->rport = NULL;
|
||||
|
||||
list_del(&nport->nport_list);
|
||||
|
||||
return rport;
|
||||
}
|
||||
|
||||
static int
|
||||
__remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport)
|
||||
{
|
||||
if (!rport)
|
||||
return -EALREADY;
|
||||
|
||||
return nvme_fc_unregister_remoteport(rport->remoteport);
|
||||
}
|
||||
|
||||
|
@ -1387,8 +1494,8 @@ static ssize_t
|
|||
fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
struct fcloop_nport *nport = NULL, *tmpport;
|
||||
static struct fcloop_rport *rport;
|
||||
struct fcloop_nport *nport;
|
||||
struct fcloop_rport *rport;
|
||||
u64 nodename, portname;
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
@ -1397,24 +1504,24 @@ fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr,
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
spin_lock_irqsave(&fcloop_lock, flags);
|
||||
|
||||
list_for_each_entry(tmpport, &fcloop_nports, nport_list) {
|
||||
if (tmpport->node_name == nodename &&
|
||||
tmpport->port_name == portname && tmpport->rport) {
|
||||
nport = tmpport;
|
||||
rport = __unlink_remote_port(nport);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
nport = fcloop_nport_lookup(nodename, portname);
|
||||
if (!nport)
|
||||
return -ENOENT;
|
||||
|
||||
spin_lock_irqsave(&fcloop_lock, flags);
|
||||
rport = __unlink_remote_port(nport);
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
if (!rport) {
|
||||
ret = -ENOENT;
|
||||
goto out_nport_put;
|
||||
}
|
||||
|
||||
ret = __remoteport_unreg(nport, rport);
|
||||
|
||||
out_nport_put:
|
||||
fcloop_nport_put(nport);
|
||||
|
||||
return ret ? ret : count;
|
||||
}
|
||||
|
||||
|
@ -1452,6 +1559,7 @@ fcloop_create_target_port(struct device *dev, struct device_attribute *attr,
|
|||
tport->nport = nport;
|
||||
tport->lport = nport->lport;
|
||||
nport->tport = tport;
|
||||
tport->flags = 0;
|
||||
spin_lock_init(&tport->lock);
|
||||
INIT_WORK(&tport->ls_work, fcloop_tport_lsrqst_work);
|
||||
INIT_LIST_HEAD(&tport->ls_list);
|
||||
|
@ -1465,6 +1573,8 @@ __unlink_target_port(struct fcloop_nport *nport)
|
|||
{
|
||||
struct fcloop_tport *tport = nport->tport;
|
||||
|
||||
lockdep_assert_held(&fcloop_lock);
|
||||
|
||||
if (tport && nport->rport)
|
||||
nport->rport->targetport = NULL;
|
||||
nport->tport = NULL;
|
||||
|
@ -1475,9 +1585,6 @@ __unlink_target_port(struct fcloop_nport *nport)
|
|||
static int
|
||||
__targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport)
|
||||
{
|
||||
if (!tport)
|
||||
return -EALREADY;
|
||||
|
||||
return nvmet_fc_unregister_targetport(tport->targetport);
|
||||
}
|
||||
|
||||
|
@ -1485,8 +1592,8 @@ static ssize_t
|
|||
fcloop_delete_target_port(struct device *dev, struct device_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
struct fcloop_nport *nport = NULL, *tmpport;
|
||||
struct fcloop_tport *tport = NULL;
|
||||
struct fcloop_nport *nport;
|
||||
struct fcloop_tport *tport;
|
||||
u64 nodename, portname;
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
@ -1495,24 +1602,24 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr,
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
spin_lock_irqsave(&fcloop_lock, flags);
|
||||
|
||||
list_for_each_entry(tmpport, &fcloop_nports, nport_list) {
|
||||
if (tmpport->node_name == nodename &&
|
||||
tmpport->port_name == portname && tmpport->tport) {
|
||||
nport = tmpport;
|
||||
tport = __unlink_target_port(nport);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
nport = fcloop_nport_lookup(nodename, portname);
|
||||
if (!nport)
|
||||
return -ENOENT;
|
||||
|
||||
spin_lock_irqsave(&fcloop_lock, flags);
|
||||
tport = __unlink_target_port(nport);
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
if (!tport) {
|
||||
ret = -ENOENT;
|
||||
goto out_nport_put;
|
||||
}
|
||||
|
||||
ret = __targetport_unreg(nport, tport);
|
||||
|
||||
out_nport_put:
|
||||
fcloop_nport_put(nport);
|
||||
|
||||
return ret ? ret : count;
|
||||
}
|
||||
|
||||
|
@ -1578,15 +1685,20 @@ static const struct class fcloop_class = {
|
|||
};
|
||||
static struct device *fcloop_device;
|
||||
|
||||
|
||||
static int __init fcloop_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
lsreq_cache = kmem_cache_create("lsreq_cache",
|
||||
sizeof(struct fcloop_lsreq), 0,
|
||||
0, NULL);
|
||||
if (!lsreq_cache)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = class_register(&fcloop_class);
|
||||
if (ret) {
|
||||
pr_err("couldn't register class fcloop\n");
|
||||
return ret;
|
||||
goto out_destroy_cache;
|
||||
}
|
||||
|
||||
fcloop_device = device_create_with_groups(
|
||||
|
@ -1604,13 +1716,15 @@ static int __init fcloop_init(void)
|
|||
|
||||
out_destroy_class:
|
||||
class_unregister(&fcloop_class);
|
||||
out_destroy_cache:
|
||||
kmem_cache_destroy(lsreq_cache);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __exit fcloop_exit(void)
|
||||
{
|
||||
struct fcloop_lport *lport = NULL;
|
||||
struct fcloop_nport *nport = NULL;
|
||||
struct fcloop_lport *lport;
|
||||
struct fcloop_nport *nport;
|
||||
struct fcloop_tport *tport;
|
||||
struct fcloop_rport *rport;
|
||||
unsigned long flags;
|
||||
|
@ -1621,7 +1735,7 @@ static void __exit fcloop_exit(void)
|
|||
for (;;) {
|
||||
nport = list_first_entry_or_null(&fcloop_nports,
|
||||
typeof(*nport), nport_list);
|
||||
if (!nport)
|
||||
if (!nport || !fcloop_nport_get(nport))
|
||||
break;
|
||||
|
||||
tport = __unlink_target_port(nport);
|
||||
|
@ -1629,13 +1743,21 @@ static void __exit fcloop_exit(void)
|
|||
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
if (tport) {
|
||||
ret = __targetport_unreg(nport, tport);
|
||||
if (ret)
|
||||
pr_warn("%s: Failed deleting target port\n", __func__);
|
||||
pr_warn("%s: Failed deleting target port\n",
|
||||
__func__);
|
||||
}
|
||||
|
||||
if (rport) {
|
||||
ret = __remoteport_unreg(nport, rport);
|
||||
if (ret)
|
||||
pr_warn("%s: Failed deleting remote port\n", __func__);
|
||||
pr_warn("%s: Failed deleting remote port\n",
|
||||
__func__);
|
||||
}
|
||||
|
||||
fcloop_nport_put(nport);
|
||||
|
||||
spin_lock_irqsave(&fcloop_lock, flags);
|
||||
}
|
||||
|
@ -1648,7 +1770,7 @@ static void __exit fcloop_exit(void)
|
|||
|
||||
spin_unlock_irqrestore(&fcloop_lock, flags);
|
||||
|
||||
ret = __wait_localport_unreg(lport);
|
||||
ret = __localport_unreg(lport);
|
||||
if (ret)
|
||||
pr_warn("%s: Failed deleting local port\n", __func__);
|
||||
|
||||
|
@ -1663,6 +1785,7 @@ static void __exit fcloop_exit(void)
|
|||
|
||||
device_destroy(&fcloop_class, MKDEV(0, 0));
|
||||
class_unregister(&fcloop_class);
|
||||
kmem_cache_destroy(lsreq_cache);
|
||||
}
|
||||
|
||||
module_init(fcloop_init);
|
||||
|
|
|
@@ -33,10 +33,12 @@ struct nvme_loop_ctrl {

	struct list_head	list;
	struct blk_mq_tag_set	tag_set;
	struct nvme_loop_iod	async_event_iod;
	struct nvme_ctrl	ctrl;

	struct nvmet_port	*port;

	/* Must be last --ends in a flexible-array member. */
	struct nvme_loop_iod	async_event_iod;
};

static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl)

@ -148,8 +150,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
|
|||
nvme_start_request(req);
|
||||
iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
|
||||
iod->req.port = queue->ctrl->port;
|
||||
if (!nvmet_req_init(&iod->req, &queue->nvme_cq,
|
||||
&queue->nvme_sq, &nvme_loop_ops))
|
||||
if (!nvmet_req_init(&iod->req, &queue->nvme_sq, &nvme_loop_ops))
|
||||
return BLK_STS_OK;
|
||||
|
||||
if (blk_rq_nr_phys_segments(req)) {
|
||||
|
@ -181,8 +182,7 @@ static void nvme_loop_submit_async_event(struct nvme_ctrl *arg)
|
|||
iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
|
||||
iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
|
||||
|
||||
if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq,
|
||||
&nvme_loop_ops)) {
|
||||
if (!nvmet_req_init(&iod->req, &queue->nvme_sq, &nvme_loop_ops)) {
|
||||
dev_err(ctrl->ctrl.device, "failed async event work\n");
|
||||
return;
|
||||
}
|
||||
|
@ -273,6 +273,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
|
|||
nvme_unquiesce_admin_queue(&ctrl->ctrl);
|
||||
|
||||
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
|
||||
nvmet_cq_put(&ctrl->queues[0].nvme_cq);
|
||||
nvme_remove_admin_tag_set(&ctrl->ctrl);
|
||||
}
|
||||
|
||||
|
@ -302,6 +303,7 @@ static void nvme_loop_destroy_io_queues(struct nvme_loop_ctrl *ctrl)
|
|||
for (i = 1; i < ctrl->ctrl.queue_count; i++) {
|
||||
clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags);
|
||||
nvmet_sq_destroy(&ctrl->queues[i].nvme_sq);
|
||||
nvmet_cq_put(&ctrl->queues[i].nvme_cq);
|
||||
}
|
||||
ctrl->ctrl.queue_count = 1;
|
||||
/*
|
||||
|
@ -327,9 +329,13 @@ static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl)
|
|||
|
||||
for (i = 1; i <= nr_io_queues; i++) {
|
||||
ctrl->queues[i].ctrl = ctrl;
|
||||
ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq);
|
||||
if (ret)
|
||||
nvmet_cq_init(&ctrl->queues[i].nvme_cq);
|
||||
ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq,
|
||||
&ctrl->queues[i].nvme_cq);
|
||||
if (ret) {
|
||||
nvmet_cq_put(&ctrl->queues[i].nvme_cq);
|
||||
goto out_destroy_queues;
|
||||
}
|
||||
|
||||
ctrl->ctrl.queue_count++;
|
||||
}
|
||||
|
@ -360,9 +366,13 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
|
|||
int error;
|
||||
|
||||
ctrl->queues[0].ctrl = ctrl;
|
||||
error = nvmet_sq_init(&ctrl->queues[0].nvme_sq);
|
||||
if (error)
|
||||
nvmet_cq_init(&ctrl->queues[0].nvme_cq);
|
||||
error = nvmet_sq_init(&ctrl->queues[0].nvme_sq,
|
||||
&ctrl->queues[0].nvme_cq);
|
||||
if (error) {
|
||||
nvmet_cq_put(&ctrl->queues[0].nvme_cq);
|
||||
return error;
|
||||
}
|
||||
ctrl->ctrl.queue_count = 1;
|
||||
|
||||
error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
|
||||
|
@ -401,6 +411,7 @@ out_cleanup_tagset:
|
|||
nvme_remove_admin_tag_set(&ctrl->ctrl);
|
||||
out_free_sq:
|
||||
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
|
||||
nvmet_cq_put(&ctrl->queues[0].nvme_cq);
|
||||
return error;
|
||||
}
|
||||
|
||||
|
|
|
@@ -141,13 +141,16 @@ static inline struct device *nvmet_ns_dev(struct nvmet_ns *ns)
}

struct nvmet_cq {
	struct nvmet_ctrl	*ctrl;
	u16			qid;
	u16			size;
	refcount_t		ref;
};

struct nvmet_sq {
	struct nvmet_ctrl	*ctrl;
	struct percpu_ref	ref;
	struct nvmet_cq		*cq;
	u16			qid;
	u16			size;
	u32			sqhd;

@@ -247,6 +250,7 @@ struct nvmet_pr_log_mgr {
struct nvmet_ctrl {
	struct nvmet_subsys	*subsys;
	struct nvmet_sq		**sqs;
	struct nvmet_cq		**cqs;

	void			*drvdata;

@ -424,7 +428,7 @@ struct nvmet_fabrics_ops {
|
|||
u16 (*get_max_queue_size)(const struct nvmet_ctrl *ctrl);
|
||||
|
||||
/* Operations mandatory for PCI target controllers */
|
||||
u16 (*create_sq)(struct nvmet_ctrl *ctrl, u16 sqid, u16 flags,
|
||||
u16 (*create_sq)(struct nvmet_ctrl *ctrl, u16 sqid, u16 cqid, u16 flags,
|
||||
u16 qsize, u64 prp1);
|
||||
u16 (*delete_sq)(struct nvmet_ctrl *ctrl, u16 sqid);
|
||||
u16 (*create_cq)(struct nvmet_ctrl *ctrl, u16 cqid, u16 flags,
|
||||
|
@ -557,8 +561,8 @@ u32 nvmet_fabrics_admin_cmd_data_len(struct nvmet_req *req);
|
|||
u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req);
|
||||
u32 nvmet_fabrics_io_cmd_data_len(struct nvmet_req *req);
|
||||
|
||||
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
|
||||
struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops);
|
||||
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_sq *sq,
|
||||
const struct nvmet_fabrics_ops *ops);
|
||||
void nvmet_req_uninit(struct nvmet_req *req);
|
||||
size_t nvmet_req_transfer_len(struct nvmet_req *req);
|
||||
bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len);
|
||||
|
@ -571,18 +575,24 @@ void nvmet_execute_set_features(struct nvmet_req *req);
|
|||
void nvmet_execute_get_features(struct nvmet_req *req);
|
||||
void nvmet_execute_keep_alive(struct nvmet_req *req);
|
||||
|
||||
u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid);
|
||||
u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create);
|
||||
u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create);
|
||||
void nvmet_cq_init(struct nvmet_cq *cq);
|
||||
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
|
||||
u16 size);
|
||||
u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
|
||||
u16 size);
|
||||
void nvmet_cq_destroy(struct nvmet_cq *cq);
|
||||
bool nvmet_cq_get(struct nvmet_cq *cq);
|
||||
void nvmet_cq_put(struct nvmet_cq *cq);
|
||||
bool nvmet_cq_in_use(struct nvmet_cq *cq);
|
||||
u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid, bool create);
|
||||
void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid,
|
||||
u16 size);
|
||||
u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid,
|
||||
u16 size);
|
||||
u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
|
||||
struct nvmet_cq *cq, u16 qid, u16 size);
|
||||
void nvmet_sq_destroy(struct nvmet_sq *sq);
|
||||
int nvmet_sq_init(struct nvmet_sq *sq);
|
||||
int nvmet_sq_init(struct nvmet_sq *sq, struct nvmet_cq *cq);
|
||||
|
||||
void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl);
|
||||
|
||||
|
|
|
@ -1354,15 +1354,17 @@ static u16 nvmet_pci_epf_delete_cq(struct nvmet_ctrl *tctrl, u16 cqid)
|
|||
if (test_and_clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags))
|
||||
nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector);
|
||||
nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &cq->pci_map);
|
||||
nvmet_cq_put(&cq->nvme_cq);
|
||||
|
||||
return NVME_SC_SUCCESS;
|
||||
}
|
||||
|
||||
static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
|
||||
u16 sqid, u16 flags, u16 qsize, u64 pci_addr)
|
||||
u16 sqid, u16 cqid, u16 flags, u16 qsize, u64 pci_addr)
|
||||
{
|
||||
struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
|
||||
struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid];
|
||||
struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid];
|
||||
u16 status;
|
||||
|
||||
if (test_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
|
||||
|
@ -1385,7 +1387,8 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
|
|||
sq->qes = ctrl->io_sqes;
|
||||
sq->pci_size = sq->qes * sq->depth;
|
||||
|
||||
status = nvmet_sq_create(tctrl, &sq->nvme_sq, sqid, sq->depth);
|
||||
status = nvmet_sq_create(tctrl, &sq->nvme_sq, &cq->nvme_cq, sqid,
|
||||
sq->depth);
|
||||
if (status != NVME_SC_SUCCESS)
|
||||
return status;
|
||||
|
||||
|
@ -1601,8 +1604,7 @@ static void nvmet_pci_epf_exec_iod_work(struct work_struct *work)
|
|||
goto complete;
|
||||
}
|
||||
|
||||
if (!nvmet_req_init(req, &iod->cq->nvme_cq, &iod->sq->nvme_sq,
|
||||
&nvmet_pci_epf_fabrics_ops))
|
||||
if (!nvmet_req_init(req, &iod->sq->nvme_sq, &nvmet_pci_epf_fabrics_ops))
|
||||
goto complete;
|
||||
|
||||
iod->data_len = nvmet_req_transfer_len(req);
|
||||
|
@ -1879,8 +1881,8 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
|
|||
|
||||
qsize = aqa & 0x00000fff;
|
||||
pci_addr = asq & GENMASK_ULL(63, 12);
|
||||
status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, NVME_QUEUE_PHYS_CONTIG,
|
||||
qsize, pci_addr);
|
||||
status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, 0,
|
||||
NVME_QUEUE_PHYS_CONTIG, qsize, pci_addr);
|
||||
if (status != NVME_SC_SUCCESS) {
|
||||
dev_err(ctrl->dev, "Failed to create admin submission queue\n");
|
||||
nvmet_pci_epf_delete_cq(ctrl->tctrl, 0);
|
||||
|
|
|
@ -976,8 +976,7 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
|
|||
cmd->send_sge.addr, cmd->send_sge.length,
|
||||
DMA_TO_DEVICE);
|
||||
|
||||
if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
|
||||
&queue->nvme_sq, &nvmet_rdma_ops))
|
||||
if (!nvmet_req_init(&cmd->req, &queue->nvme_sq, &nvmet_rdma_ops))
|
||||
return;
|
||||
|
||||
status = nvmet_rdma_map_sgl(cmd);
|
||||
|
@ -1353,6 +1352,7 @@ static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
|
|||
pr_debug("freeing queue %d\n", queue->idx);
|
||||
|
||||
nvmet_sq_destroy(&queue->nvme_sq);
|
||||
nvmet_cq_put(&queue->nvme_cq);
|
||||
|
||||
nvmet_rdma_destroy_queue_ib(queue);
|
||||
if (!queue->nsrq) {
|
||||
|
@ -1436,7 +1436,8 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
|
|||
goto out_reject;
|
||||
}
|
||||
|
||||
ret = nvmet_sq_init(&queue->nvme_sq);
|
||||
nvmet_cq_init(&queue->nvme_cq);
|
||||
ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
|
||||
if (ret) {
|
||||
ret = NVME_RDMA_CM_NO_RSC;
|
||||
goto out_free_queue;
|
||||
|
@ -1517,6 +1518,7 @@ out_ida_remove:
|
|||
out_destroy_sq:
|
||||
nvmet_sq_destroy(&queue->nvme_sq);
|
||||
out_free_queue:
|
||||
nvmet_cq_put(&queue->nvme_cq);
|
||||
kfree(queue);
|
||||
out_reject:
|
||||
nvmet_rdma_cm_reject(cm_id, ret);
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/crc32c.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/nvme-tcp.h>
|
||||
#include <linux/nvme-keyring.h>
|
||||
|
@ -17,7 +18,6 @@
|
|||
#include <net/handshake.h>
|
||||
#include <linux/inet.h>
|
||||
#include <linux/llist.h>
|
||||
#include <crypto/hash.h>
|
||||
#include <trace/events/sock.h>
|
||||
|
||||
#include "nvmet.h"
|
||||
|
@ -172,8 +172,6 @@ struct nvmet_tcp_queue {
|
|||
/* digest state */
|
||||
bool hdr_digest;
|
||||
bool data_digest;
|
||||
struct ahash_request *snd_hash;
|
||||
struct ahash_request *rcv_hash;
|
||||
|
||||
/* TLS state */
|
||||
key_serial_t tls_pskid;
|
||||
|
@@ -294,14 +292,9 @@ static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}

static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
		void *pdu, size_t len)
static inline void nvmet_tcp_hdgst(void *pdu, size_t len)
{
	struct scatterlist sg;

	sg_init_one(&sg, pdu, len);
	ahash_request_set_crypt(hash, &sg, pdu + len, len);
	crypto_ahash_digest(hash);
	put_unaligned_le32(~crc32c(~0, pdu, len), pdu + len);
}

static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,

@@ -318,7 +311,7 @@ static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
	}

	recv_digest = *(__le32 *)(pdu + hdr->hlen);
	nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
	nvmet_tcp_hdgst(pdu, len);
	exp_digest = *(__le32 *)(pdu + hdr->hlen);
	if (recv_digest != exp_digest) {
		pr_err("queue %d: header digest error: recv %#x expected %#x\n",

@@ -441,12 +434,24 @@ err:
	return NVME_SC_INTERNAL;
}

static void nvmet_tcp_calc_ddgst(struct ahash_request *hash,
		struct nvmet_tcp_cmd *cmd)
static void nvmet_tcp_calc_ddgst(struct nvmet_tcp_cmd *cmd)
{
	ahash_request_set_crypt(hash, cmd->req.sg,
		(void *)&cmd->exp_ddgst, cmd->req.transfer_len);
	crypto_ahash_digest(hash);
	size_t total_len = cmd->req.transfer_len;
	struct scatterlist *sg = cmd->req.sg;
	u32 crc = ~0;

	while (total_len) {
		size_t len = min_t(size_t, total_len, sg->length);

		/*
		 * Note that the scatterlist does not contain any highmem pages,
		 * as it was allocated by sgl_alloc() with GFP_KERNEL.
		 */
		crc = crc32c(crc, sg_virt(sg), len);
		total_len -= len;
		sg = sg_next(sg);
	}
	cmd->exp_ddgst = cpu_to_le32(~crc);
}

static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)

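The hunks above switch nvmet-tcp's header and data digests from the crypto_ahash API to the plain crc32c() library helper: the caller seeds with ~0, feeds the buffers, and inverts the final value. The following is an illustrative sketch only, not kernel code: a minimal userspace model of that convention, where the bitwise loop is an invented stand-in for the kernel's crc32c library (reflected CRC-32C polynomial 0x82F63B78).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Plain CRC-32C update, no pre/post inversion (same contract as crc32c()). */
static uint32_t crc32c_update(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78U & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	const char *pdu = "123456789";

	/* Mirrors ~crc32c(~0, pdu, len); "123456789" gives the CRC-32C check value 0xE3069283. */
	uint32_t digest = ~crc32c_update(~0U, pdu, strlen(pdu));

	printf("crc32c = 0x%08X\n", digest);
	return 0;
}
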
@ -473,19 +478,18 @@ static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
|
|||
|
||||
if (queue->data_digest) {
|
||||
pdu->hdr.flags |= NVME_TCP_F_DDGST;
|
||||
nvmet_tcp_calc_ddgst(queue->snd_hash, cmd);
|
||||
nvmet_tcp_calc_ddgst(cmd);
|
||||
}
|
||||
|
||||
if (cmd->queue->hdr_digest) {
|
||||
pdu->hdr.flags |= NVME_TCP_F_HDGST;
|
||||
nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
|
||||
nvmet_tcp_hdgst(pdu, sizeof(*pdu));
|
||||
}
|
||||
}
|
||||
|
||||
static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
|
||||
{
|
||||
struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
|
||||
struct nvmet_tcp_queue *queue = cmd->queue;
|
||||
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
|
||||
|
||||
cmd->offset = 0;
|
||||
|
@ -503,14 +507,13 @@ static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
|
|||
pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
|
||||
if (cmd->queue->hdr_digest) {
|
||||
pdu->hdr.flags |= NVME_TCP_F_HDGST;
|
||||
nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
|
||||
nvmet_tcp_hdgst(pdu, sizeof(*pdu));
|
||||
}
|
||||
}
|
||||
|
||||
static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
|
||||
{
|
||||
struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
|
||||
struct nvmet_tcp_queue *queue = cmd->queue;
|
||||
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
|
||||
|
||||
cmd->offset = 0;
|
||||
|
@ -523,7 +526,7 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
|
|||
pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
|
||||
if (cmd->queue->hdr_digest) {
|
||||
pdu->hdr.flags |= NVME_TCP_F_HDGST;
|
||||
nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
|
||||
nvmet_tcp_hdgst(pdu, sizeof(*pdu));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -857,42 +860,6 @@ static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
|
|||
smp_store_release(&queue->rcv_state, NVMET_TCP_RECV_PDU);
|
||||
}
|
||||
|
||||
static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
|
||||
{
|
||||
struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
|
||||
|
||||
ahash_request_free(queue->rcv_hash);
|
||||
ahash_request_free(queue->snd_hash);
|
||||
crypto_free_ahash(tfm);
|
||||
}
|
||||
|
||||
static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
|
||||
{
|
||||
struct crypto_ahash *tfm;
|
||||
|
||||
tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
|
||||
if (IS_ERR(tfm))
|
||||
return PTR_ERR(tfm);
|
||||
|
||||
queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
|
||||
if (!queue->snd_hash)
|
||||
goto free_tfm;
|
||||
ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
|
||||
|
||||
queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
|
||||
if (!queue->rcv_hash)
|
||||
goto free_snd_hash;
|
||||
ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
|
||||
|
||||
return 0;
|
||||
free_snd_hash:
|
||||
ahash_request_free(queue->snd_hash);
|
||||
free_tfm:
|
||||
crypto_free_ahash(tfm);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
|
||||
static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
|
||||
{
|
||||
struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
|
||||
|
@ -921,11 +888,6 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
|
|||
|
||||
queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
|
||||
queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
|
||||
if (queue->hdr_digest || queue->data_digest) {
|
||||
ret = nvmet_tcp_alloc_crypto(queue);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
memset(icresp, 0, sizeof(*icresp));
|
||||
icresp->hdr.type = nvme_tcp_icresp;
|
||||
|
@ -1077,8 +1039,7 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
|
|||
req = &queue->cmd->req;
|
||||
memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
|
||||
|
||||
if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
|
||||
&queue->nvme_sq, &nvmet_tcp_ops))) {
|
||||
if (unlikely(!nvmet_req_init(req, &queue->nvme_sq, &nvmet_tcp_ops))) {
|
||||
pr_err("failed cmd %p id %d opcode %d, data_len: %d, status: %04x\n",
|
||||
req->cmd, req->cmd->common.command_id,
|
||||
req->cmd->common.opcode,
|
||||
|
@ -1247,7 +1208,7 @@ static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
|
|||
{
|
||||
struct nvmet_tcp_queue *queue = cmd->queue;
|
||||
|
||||
nvmet_tcp_calc_ddgst(queue->rcv_hash, cmd);
|
||||
nvmet_tcp_calc_ddgst(cmd);
|
||||
queue->offset = 0;
|
||||
queue->left = NVME_TCP_DIGEST_LENGTH;
|
||||
queue->rcv_state = NVMET_TCP_RECV_DDGST;
|
||||
|
@ -1615,13 +1576,12 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
|
|||
nvmet_sq_put_tls_key(&queue->nvme_sq);
|
||||
nvmet_tcp_uninit_data_in_cmds(queue);
|
||||
nvmet_sq_destroy(&queue->nvme_sq);
|
||||
nvmet_cq_put(&queue->nvme_cq);
|
||||
cancel_work_sync(&queue->io_work);
|
||||
nvmet_tcp_free_cmd_data_in_buffers(queue);
|
||||
/* ->sock will be released by fput() */
|
||||
fput(queue->sock->file);
|
||||
nvmet_tcp_free_cmds(queue);
|
||||
if (queue->hdr_digest || queue->data_digest)
|
||||
nvmet_tcp_free_crypto(queue);
|
||||
ida_free(&nvmet_tcp_queue_ida, queue->idx);
|
||||
page_frag_cache_drain(&queue->pf_cache);
|
||||
kfree(queue);
|
||||
|
@ -1950,7 +1910,8 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
|
|||
if (ret)
|
||||
goto out_ida_remove;
|
||||
|
||||
ret = nvmet_sq_init(&queue->nvme_sq);
|
||||
nvmet_cq_init(&queue->nvme_cq);
|
||||
ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
|
||||
if (ret)
|
||||
goto out_free_connect;
|
||||
|
||||
|
@ -1993,6 +1954,7 @@ out_destroy_sq:
|
|||
mutex_unlock(&nvmet_tcp_queue_mutex);
|
||||
nvmet_sq_destroy(&queue->nvme_sq);
|
||||
out_free_connect:
|
||||
nvmet_cq_put(&queue->nvme_cq);
|
||||
nvmet_tcp_free_cmd(&queue->connect);
|
||||
out_ida_remove:
|
||||
ida_free(&nvmet_tcp_queue_ida, queue->idx);
|
||||
|
|
|
@ -403,6 +403,7 @@ config SCSI_ACARD
|
|||
config SCSI_AHA152X
|
||||
tristate "Adaptec AHA152X/2825 support"
|
||||
depends on ISA && SCSI
|
||||
depends on !HIGHMEM
|
||||
select SCSI_SPI_ATTRS
|
||||
select CHECK_SIGNATURE
|
||||
help
|
||||
|
@ -795,6 +796,7 @@ config SCSI_PPA
|
|||
tristate "IOMEGA parallel port (ppa - older drives)"
|
||||
depends on SCSI && PARPORT_PC
|
||||
depends on HAS_IOPORT
|
||||
depends on !HIGHMEM
|
||||
help
|
||||
This driver supports older versions of IOMEGA's parallel port ZIP
|
||||
drive (a 100 MB removable media device).
|
||||
|
@ -822,6 +824,7 @@ config SCSI_PPA
|
|||
config SCSI_IMM
|
||||
tristate "IOMEGA parallel port (imm - newer drives)"
|
||||
depends on SCSI && PARPORT_PC
|
||||
depends on !HIGHMEM
|
||||
help
|
||||
This driver supports newer versions of IOMEGA's parallel port ZIP
|
||||
drive (a 100 MB removable media device).
|
||||
|
|
|
@ -746,7 +746,6 @@ struct Scsi_Host *aha152x_probe_one(struct aha152x_setup *setup)
|
|||
/* need to have host registered before triggering any interrupt */
|
||||
list_add_tail(&HOSTDATA(shpnt)->host_list, &aha152x_host_list);
|
||||
|
||||
shpnt->no_highmem = true;
|
||||
shpnt->io_port = setup->io_port;
|
||||
shpnt->n_io_port = IO_RANGE;
|
||||
shpnt->irq = setup->irq;
|
||||
|
|
|
@ -1224,7 +1224,6 @@ static int __imm_attach(struct parport *pb)
|
|||
host = scsi_host_alloc(&imm_template, sizeof(imm_struct *));
|
||||
if (!host)
|
||||
goto out1;
|
||||
host->no_highmem = true;
|
||||
host->io_port = pb->base;
|
||||
host->n_io_port = ports;
|
||||
host->dma_channel = -1;
|
||||
|
|
|
@ -1104,7 +1104,6 @@ static int __ppa_attach(struct parport *pb)
|
|||
host = scsi_host_alloc(&ppa_template, sizeof(ppa_struct *));
|
||||
if (!host)
|
||||
goto out1;
|
||||
host->no_highmem = true;
|
||||
host->io_port = pb->base;
|
||||
host->n_io_port = ports;
|
||||
host->dma_channel = -1;
|
||||
|
|
|
@ -601,7 +601,7 @@ static int sg_scsi_ioctl(struct request_queue *q, bool open_for_write,
|
|||
}
|
||||
|
||||
if (bytes) {
|
||||
err = blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO);
|
||||
err = blk_rq_map_kern(rq, buffer, bytes, GFP_NOIO);
|
||||
if (err)
|
||||
goto error;
|
||||
}
|
||||
|
|
|
@ -313,8 +313,7 @@ retry:
|
|||
return PTR_ERR(req);
|
||||
|
||||
if (bufflen) {
|
||||
ret = blk_rq_map_kern(sdev->request_queue, req,
|
||||
buffer, bufflen, GFP_NOIO);
|
||||
ret = blk_rq_map_kern(req, buffer, bufflen, GFP_NOIO);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
@ -2004,9 +2003,6 @@ void scsi_init_limits(struct Scsi_Host *shost, struct queue_limits *lim)
|
|||
lim->dma_alignment = max_t(unsigned int,
|
||||
shost->dma_alignment, dma_get_cache_alignment() - 1);
|
||||
|
||||
if (shost->no_highmem)
|
||||
lim->features |= BLK_FEAT_BOUNCE_HIGH;
|
||||
|
||||
/*
|
||||
* Propagate the DMA formation properties to the dma-mapping layer as
|
||||
* a courtesy service to the LLDDs. This needs to check that the buses
|
||||
|
|
|
@@ -1056,13 +1056,20 @@ int usb_stor_probe1(struct us_data **pus,
		goto BadDevice;

	/*
	 * Some USB host controllers can't do DMA; they have to use PIO.
	 * For such controllers we need to make sure the block layer sets
	 * up bounce buffers in addressable memory.
	 * Some USB host controllers can't do DMA: They have to use PIO, or they
	 * have to use a small dedicated local memory area, or they have other
	 * restrictions on addressable memory.
	 *
	 * We can't support these controllers on highmem systems as we don't
	 * kmap or bounce buffer.
	 */
	if (!hcd_uses_dma(bus_to_hcd(us->pusb_dev->bus)) ||
	    bus_to_hcd(us->pusb_dev->bus)->localmem_pool)
		host->no_highmem = true;
	if (IS_ENABLED(CONFIG_HIGHMEM) &&
	    (!hcd_uses_dma(bus_to_hcd(us->pusb_dev->bus)) ||
	     bus_to_hcd(us->pusb_dev->bus)->localmem_pool)) {
		dev_warn(&intf->dev, "USB Mass Storage not supported on this host controller\n");
		result = -EINVAL;
		goto release;
	}

	/* Get the unusual_devs entries and the descriptors */
	result = get_device_info(us, id, unusual_dev);

@@ -1081,6 +1088,7 @@ int usb_stor_probe1(struct us_data **pus,

BadDevice:
	usb_stor_dbg(us, "storage_probe() failed\n");
release:
	release_everything(us);
	return result;
}

fs/aio.c

@@ -1511,6 +1511,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type)
{
	int ret;

	req->ki_write_stream = 0;
	req->ki_complete = aio_complete_rw;
	req->private = NULL;
	req->ki_pos = iocb->aio_offset;

|
|
@ -2770,17 +2770,11 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
|
|||
struct page *page, u64 physical, u64 generation)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = sctx->fs_info;
|
||||
struct bio_vec bvec;
|
||||
struct bio bio;
|
||||
struct btrfs_super_block *sb = page_address(page);
|
||||
int ret;
|
||||
|
||||
bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
|
||||
bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
|
||||
__bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
|
||||
ret = submit_bio_wait(&bio);
|
||||
bio_uninit(&bio);
|
||||
|
||||
ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb,
|
||||
BTRFS_SUPER_INFO_SIZE, REQ_OP_READ);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
ret = btrfs_check_super_csum(fs_info, sb);
|
||||
|
|
|
@@ -226,28 +226,22 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const struct gfs2_sb *str)
 
 static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
 {
-	struct super_block *sb = sdp->sd_vfs;
-	struct page *page;
-	struct bio_vec bvec;
-	struct bio bio;
+	struct gfs2_sb *sb;
 	int err;
 
-	page = alloc_page(GFP_KERNEL);
-	if (unlikely(!page))
+	sb = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (unlikely(!sb))
 		return -ENOMEM;
 
-	bio_init(&bio, sb->s_bdev, &bvec, 1, REQ_OP_READ | REQ_META);
-	bio.bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
-	__bio_add_page(&bio, page, PAGE_SIZE, 0);
-
-	err = submit_bio_wait(&bio);
+	err = bdev_rw_virt(sdp->sd_vfs->s_bdev,
+			sector * (sdp->sd_vfs->s_blocksize >> 9), sb, PAGE_SIZE,
+			REQ_OP_READ | REQ_META);
 	if (err) {
 		pr_warn("error %d reading superblock\n", err);
-		__free_page(page);
+		kfree(sb);
 		return err;
 	}
-	gfs2_sb_in(sdp, page_address(page));
-	__free_page(page);
+	gfs2_sb_in(sdp, sb);
+	kfree(sb);
 	return gfs2_check_sb(sdp, silent);
 }
 
@@ -48,47 +48,19 @@ struct hfsplus_wd {
 int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
 		void *buf, void **data, blk_opf_t opf)
 {
-	const enum req_op op = opf & REQ_OP_MASK;
-	struct bio *bio;
-	int ret = 0;
-	u64 io_size;
-	loff_t start;
-	int offset;
+	u64 io_size = hfsplus_min_io_size(sb);
+	loff_t start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT;
+	int offset = start & (io_size - 1);
 
-	/*
-	 * Align sector to hardware sector size and find offset. We
-	 * assume that io_size is a power of two, which _should_
-	 * be true.
-	 */
-	io_size = hfsplus_min_io_size(sb);
-	start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT;
-	offset = start & (io_size - 1);
-	sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
-
-	bio = bio_alloc(sb->s_bdev, 1, opf, GFP_NOIO);
-	bio->bi_iter.bi_sector = sector;
-
-	if (op != REQ_OP_WRITE && data)
+	if ((opf & REQ_OP_MASK) != REQ_OP_WRITE && data)
 		*data = (u8 *)buf + offset;
 
-	while (io_size > 0) {
-		unsigned int page_offset = offset_in_page(buf);
-		unsigned int len = min_t(unsigned int, PAGE_SIZE - page_offset,
-					 io_size);
-
-		ret = bio_add_page(bio, virt_to_page(buf), len, page_offset);
-		if (ret != len) {
-			ret = -EIO;
-			goto out;
-		}
-		io_size -= len;
-		buf = (u8 *)buf + len;
-	}
-
-	ret = submit_bio_wait(bio);
-out:
-	bio_put(bio);
-	return ret < 0 ? ret : 0;
+	/*
+	 * Align sector to hardware sector size and find offset. We assume that
+	 * io_size is a power of two, which _should_ be true.
+	 */
+	sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
+	return bdev_rw_virt(sb->s_bdev, sector, buf, io_size, opf);
 }
 
 static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
@@ -120,8 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		   global_node_page_state(NR_SECONDARY_PAGETABLE));
 
 	show_val_kb(m, "NFS_Unstable: ", 0);
-	show_val_kb(m, "Bounce: ",
-		    global_zone_page_state(NR_BOUNCE));
+	show_val_kb(m, "Bounce: ", 0);
 	show_val_kb(m, "WritebackTmp: ",
 		    global_node_page_state(NR_WRITEBACK_TEMP));
 	show_val_kb(m, "CommitLimit: ", vm_commit_limit());
@@ -18,42 +18,36 @@ xfs_rw_bdev(
 	enum req_op		op)
 
 {
-	unsigned int		is_vmalloc = is_vmalloc_addr(data);
-	unsigned int		left = count;
+	unsigned int		done = 0, added;
 	int			error;
 	struct bio		*bio;
 
-	if (is_vmalloc && op == REQ_OP_WRITE)
-		flush_kernel_vmap_range(data, count);
+	op |= REQ_META | REQ_SYNC;
+	if (!is_vmalloc_addr(data))
+		return bdev_rw_virt(bdev, sector, data, count, op);
 
-	bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC,
-			GFP_KERNEL);
+	bio = bio_alloc(bdev, bio_max_vecs(count), op, GFP_KERNEL);
 	bio->bi_iter.bi_sector = sector;
 
 	do {
-		struct page	*page = kmem_to_page(data);
-		unsigned int	off = offset_in_page(data);
-		unsigned int	len = min_t(unsigned, left, PAGE_SIZE - off);
-
-		while (bio_add_page(bio, page, len, off) != len) {
+		added = bio_add_vmalloc_chunk(bio, data + done, count - done);
+		if (!added) {
 			struct bio	*prev = bio;
 
-			bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left),
+			bio = bio_alloc(prev->bi_bdev,
+					bio_max_vecs(count - done),
 					prev->bi_opf, GFP_KERNEL);
 			bio->bi_iter.bi_sector = bio_end_sector(prev);
 			bio_chain(prev, bio);
-
 			submit_bio(prev);
 		}
-
-		data += len;
-		left -= len;
-	} while (left > 0);
+		done += added;
+	} while (done < count);
 
 	error = submit_bio_wait(bio);
 	bio_put(bio);
 
-	if (is_vmalloc && op == REQ_OP_READ)
+	if (op == REQ_OP_READ)
 		invalidate_kernel_vmap_range(data, count);
 	return error;
 }
@@ -1333,45 +1333,18 @@ static void
 xfs_buf_submit_bio(
 	struct xfs_buf		*bp)
 {
+	unsigned int		len = BBTOB(bp->b_length);
+	unsigned int		nr_vecs = bio_add_max_vecs(bp->b_addr, len);
 	unsigned int		map = 0;
 	struct blk_plug		plug;
 	struct bio		*bio;
 
-	if (is_vmalloc_addr(bp->b_addr)) {
-		unsigned int	size = BBTOB(bp->b_length);
-		unsigned int	alloc_size = roundup(size, PAGE_SIZE);
-		void		*data = bp->b_addr;
-
-		bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
-				xfs_buf_bio_op(bp), GFP_NOIO);
-
-		do {
-			unsigned int	len = min(size, PAGE_SIZE);
-
-			ASSERT(offset_in_page(data) == 0);
-			__bio_add_page(bio, vmalloc_to_page(data), len, 0);
-			data += len;
-			size -= len;
-		} while (size);
-
-		flush_kernel_vmap_range(bp->b_addr, alloc_size);
-	} else {
-		/*
-		 * Single folio or slab allocation. Must be contiguous and thus
-		 * only a single bvec is needed.
-		 *
-		 * This uses the page based bio add helper for now as that is
-		 * the lowest common denominator between folios and slab
-		 * allocations. To be replaced with a better block layer
-		 * helper soon (hopefully).
-		 */
-		bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
+	bio = bio_alloc(bp->b_target->bt_bdev, nr_vecs, xfs_buf_bio_op(bp),
 			GFP_NOIO);
-		__bio_add_page(bio, virt_to_page(bp->b_addr),
-				BBTOB(bp->b_length),
-				offset_in_page(bp->b_addr));
-	}
-
+	if (is_vmalloc_addr(bp->b_addr))
+		bio_add_vmalloc(bio, bp->b_addr, len);
+	else
+		bio_add_virt_nofail(bio, bp->b_addr, len);
 	bio->bi_private = bp;
 	bio->bi_end_io = xfs_buf_bio_end_io;
 
@@ -1607,27 +1607,6 @@ xlog_bio_end_io(
 		&iclog->ic_end_io_work);
 }
 
-static int
-xlog_map_iclog_data(
-	struct bio		*bio,
-	void			*data,
-	size_t			count)
-{
-	do {
-		struct page	*page = kmem_to_page(data);
-		unsigned int	off = offset_in_page(data);
-		size_t		len = min_t(size_t, count, PAGE_SIZE - off);
-
-		if (bio_add_page(bio, page, len, off) != len)
-			return -EIO;
-
-		data += len;
-		count -= len;
-	} while (count);
-
-	return 0;
-}
-
 STATIC void
 xlog_write_iclog(
 	struct xlog		*log,

@@ -1693,11 +1672,12 @@ xlog_write_iclog(
 
 	iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
 
-	if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count))
+	if (is_vmalloc_addr(iclog->ic_data)) {
+		if (!bio_add_vmalloc(&iclog->ic_bio, iclog->ic_data, count))
 			goto shutdown;
-
-	if (is_vmalloc_addr(iclog->ic_data))
 		flush_kernel_vmap_range(iclog->ic_data, count);
-
+	} else {
+		bio_add_virt_nofail(&iclog->ic_bio, iclog->ic_data, count);
+	}
 	/*
 	 * If this log buffer would straddle the end of the log we will have
@@ -1111,28 +1111,19 @@ static int zonefs_read_super(struct super_block *sb)
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 	struct zonefs_super *super;
 	u32 crc, stored_crc;
-	struct page *page;
-	struct bio_vec bio_vec;
-	struct bio bio;
 	int ret;
 
-	page = alloc_page(GFP_KERNEL);
-	if (!page)
+	super = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!super)
 		return -ENOMEM;
 
-	bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
-	bio.bi_iter.bi_sector = 0;
-	__bio_add_page(&bio, page, PAGE_SIZE, 0);
-
-	ret = submit_bio_wait(&bio);
+	ret = bdev_rw_virt(sb->s_bdev, 0, super, PAGE_SIZE, REQ_OP_READ);
 	if (ret)
-		goto free_page;
-
-	super = page_address(page);
+		goto free_super;
 
 	ret = -EINVAL;
 	if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
-		goto free_page;
+		goto free_super;
 
 	stored_crc = le32_to_cpu(super->s_crc);
 	super->s_crc = 0;

@@ -1140,14 +1131,14 @@ static int zonefs_read_super(struct super_block *sb)
 	if (crc != stored_crc) {
 		zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)",
 			   crc, stored_crc);
-		goto free_page;
+		goto free_super;
 	}
 
 	sbi->s_features = le64_to_cpu(super->s_features);
 	if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) {
 		zonefs_err(sb, "Unknown features set 0x%llx\n",
 			   sbi->s_features);
-		goto free_page;
+		goto free_super;
 	}
 
 	if (sbi->s_features & ZONEFS_F_UID) {

@@ -1155,7 +1146,7 @@ static int zonefs_read_super(struct super_block *sb)
 			le32_to_cpu(super->s_uid));
 		if (!uid_valid(sbi->s_uid)) {
 			zonefs_err(sb, "Invalid UID feature\n");
-			goto free_page;
+			goto free_super;
 		}
 	}
 

@@ -1164,7 +1155,7 @@ static int zonefs_read_super(struct super_block *sb)
 			le32_to_cpu(super->s_gid));
 		if (!gid_valid(sbi->s_gid)) {
 			zonefs_err(sb, "Invalid GID feature\n");
-			goto free_page;
+			goto free_super;
 		}
 	}
 

@@ -1173,15 +1164,14 @@ static int zonefs_read_super(struct super_block *sb)
 
 	if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) {
 		zonefs_err(sb, "Reserved area is being used\n");
-		goto free_page;
+		goto free_super;
 	}
 
 	import_uuid(&sbi->s_uuid, super->s_uuid);
 	ret = 0;
 
-free_page:
-	__free_page(page);
-
+free_super:
+	kfree(super);
 	return ret;
 }
 
@@ -403,7 +403,6 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
 
 struct request_queue;
 
-extern int submit_bio_wait(struct bio *bio);
 void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
 	      unsigned short max_vecs, blk_opf_t opf);
 extern void bio_uninit(struct bio *);

@@ -418,6 +417,30 @@ void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off);
 void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
 		size_t off);
+void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len);
+
+/**
+ * bio_add_max_vecs - number of bio_vecs needed to add data to a bio
+ * @kaddr: kernel virtual address to add
+ * @len: length in bytes to add
+ *
+ * Calculate how many bio_vecs need to be allocated to add the kernel virtual
+ * address range in [@kaddr:@len] in the worse case.
+ */
+static inline unsigned int bio_add_max_vecs(void *kaddr, unsigned int len)
+{
+	if (is_vmalloc_addr(kaddr))
+		return DIV_ROUND_UP(offset_in_page(kaddr) + len, PAGE_SIZE);
+	return 1;
+}
+
+unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len);
+bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len);
+
+int submit_bio_wait(struct bio *bio);
+int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
+		size_t len, enum req_op op);
+
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
 void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter);
 void __bio_release_pages(struct bio *bio, bool mark_dirty);
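The filesystem conversions earlier in this diff (btrfs, gfs2, hfsplus, zonefs) all follow the same pattern, so a minimal sketch of a caller may help. Everything named example_* below is hypothetical; only bdev_rw_virt(), kmalloc()/kfree() and REQ_OP_READ come from the code above, and the usual blkdev/slab includes are assumed.

	/* Sketch: read one block of on-disk metadata into a kernel buffer. */
	static int example_read_header(struct block_device *bdev, sector_t sector)
	{
		void *buf;
		int ret;

		buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;

		/* One call replaces the bio_init()/__bio_add_page()/submit_bio_wait() dance. */
		ret = bdev_rw_virt(bdev, sector, buf, PAGE_SIZE, REQ_OP_READ);

		/* ... parse the buffer here before freeing it ... */
		kfree(buf);
		return ret;
	}

Note that the callers above only use bdev_rw_virt() for directly mapped kernel memory; vmalloc buffers go through bio_add_vmalloc()/bio_add_vmalloc_chunk() instead, as the xfs hunks show.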
@@ -9,6 +9,7 @@
 #include <linux/prefetch.h>
 #include <linux/srcu.h>
 #include <linux/rw_hint.h>
+#include <linux/rwsem.h>
 
 struct blk_mq_tags;
 struct blk_flush_queue;

@@ -506,6 +507,9 @@ enum hctx_type {
  *	      request_queue.tag_set_list.
  * @srcu: Use as lock when type of the request queue is blocking
  *	  (BLK_MQ_F_BLOCKING).
+ * @update_nr_hwq_lock:
+ *		Synchronize updating nr_hw_queues with add/del disk &
+ *		switching elevator.
  */
 struct blk_mq_tag_set {
 	const struct blk_mq_ops	*ops;

@@ -527,6 +531,8 @@ struct blk_mq_tag_set {
 	struct mutex		tag_list_lock;
 	struct list_head	tag_list;
 	struct srcu_struct	*srcu;
+
+	struct rw_semaphore	update_nr_hwq_lock;
 };
 
 /**

@@ -1031,8 +1037,8 @@ int blk_rq_map_user_io(struct request *, struct rq_map_data *,
 int blk_rq_map_user_iov(struct request_queue *, struct request *,
 		struct rq_map_data *, const struct iov_iter *, gfp_t);
 int blk_rq_unmap_user(struct bio *);
-int blk_rq_map_kern(struct request_queue *, struct request *, void *,
-		unsigned int, gfp_t);
+int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len,
+		gfp_t gfp);
 int blk_rq_append_bio(struct request *rq, struct bio *bio);
 void blk_execute_rq_nowait(struct request *rq, bool at_head);
 blk_status_t blk_execute_rq(struct request *rq, bool at_head);
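The scsi fragment at the very top of this diff shows the call-site side of the blk_rq_map_kern() prototype change. As a hedged illustration only: a caller that used to pass the request_queue now passes just the request, and the surrounding allocation/execution below uses the long-standing blk_mq_alloc_request()/blk_execute_rq() pattern rather than anything introduced by this series; the function and buffer names are placeholders.

	/* Sketch: issue a driver-internal command carrying a kernel buffer. */
	static int example_send_kbuf(struct request_queue *q, void *buf,
				     unsigned int len)
	{
		struct request *rq;
		int ret;

		rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
		if (IS_ERR(rq))
			return PTR_ERR(rq);

		/* New prototype: no request_queue argument, the request already has one. */
		ret = blk_rq_map_kern(rq, buf, len, GFP_NOIO);
		if (!ret)
			ret = blk_status_to_errno(blk_execute_rq(rq, false));

		blk_mq_free_request(rq);
		return ret;
	}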
@@ -220,6 +220,7 @@ struct bio {
 	unsigned short		bi_flags;	/* BIO_* below */
 	unsigned short		bi_ioprio;
 	enum rw_hint		bi_write_hint;
+	u8			bi_write_stream;
 	blk_status_t		bi_status;
 	atomic_t		__bi_remaining;
 

@@ -286,7 +287,6 @@ struct bio {
 enum {
 	BIO_PAGE_PINNED,	/* Unpin pages in bio_release_pages() */
 	BIO_CLONED,		/* doesn't own data */
-	BIO_BOUNCED,		/* bio is a bounce bio */
 	BIO_QUIET,		/* Make BIO Quiet */
 	BIO_CHAIN,		/* chained bio, ->bi_remaining in effect */
 	BIO_REFFED,		/* bio has elevated ->bi_cnt */

@@ -296,6 +296,14 @@
 			 * of this bio. */
 	BIO_CGROUP_ACCT,	/* has been accounted to a cgroup */
 	BIO_QOS_THROTTLED,	/* bio went through rq_qos throttle path */
+	/*
+	 * This bio has completed bps throttling at the single tg granularity,
+	 * which is different from BIO_BPS_THROTTLED. When the bio is enqueued
+	 * into the sq->queued of the upper tg, or is about to be dispatched,
+	 * this flag needs to be cleared. Since blk-throttle and rq_qos are not
+	 * on the same hierarchical level, reuse the value.
+	 */
+	BIO_TG_BPS_THROTTLED = BIO_QOS_THROTTLED,
 	BIO_QOS_MERGED,		/* but went through rq_qos merge path */
 	BIO_REMAPPED,
 	BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
@@ -182,7 +182,6 @@ struct gendisk {
 	struct list_head slave_bdevs;
 #endif
 	struct timer_rand_state *random;
-	atomic_t sync_io;		/* RAID */
 	struct disk_events *ev;
 
 #ifdef CONFIG_BLK_DEV_ZONED

@@ -218,6 +217,8 @@ struct gendisk {
 	 * devices that do not have multiple independent access ranges.
 	 */
 	struct blk_independent_access_ranges *ia_ranges;
+
+	struct mutex rqos_state_mutex;	/* rqos state change mutex */
 };
 
 /**

@@ -331,9 +332,6 @@ typedef unsigned int __bitwise blk_features_t;
 /* skip this queue in blk_mq_(un)quiesce_tagset */
 #define BLK_FEAT_SKIP_TAGSET_QUIESCE	((__force blk_features_t)(1u << 13))
 
-/* bounce all highmem pages */
-#define BLK_FEAT_BOUNCE_HIGH		((__force blk_features_t)(1u << 14))
-
 /* undocumented magic for bcache */
 #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \
 	((__force blk_features_t)(1u << 15))

@@ -347,7 +345,7 @@ typedef unsigned int __bitwise blk_features_t;
  */
 #define BLK_FEAT_INHERIT_MASK \
 	(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \
-	 BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH | \
+	 BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | \
 	 BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE)
 
 /* internal flags in queue_limits.flags */

@@ -405,6 +403,9 @@ struct queue_limits {
 	unsigned short		max_integrity_segments;
 	unsigned short		max_discard_segments;
 
+	unsigned short		max_write_streams;
+	unsigned int		write_stream_granularity;
+
 	unsigned int		max_open_zones;
 	unsigned int		max_active_zones;
 

@@ -644,6 +645,8 @@ enum {
 	QUEUE_FLAG_RQ_ALLOC_TIME,	/* record rq->alloc_time_ns */
 	QUEUE_FLAG_HCTX_ACTIVE,		/* at least one blk-mq hctx is active */
 	QUEUE_FLAG_SQ_SCHED,		/* single queue style io dispatch */
+	QUEUE_FLAG_DISABLE_WBT_DEF,	/* for sched to disable/enable wbt */
+	QUEUE_FLAG_NO_ELV_SWITCH,	/* can't switch elevator any more */
 	QUEUE_FLAG_MAX
 };
 

@@ -679,6 +682,10 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
 #define blk_queue_sq_sched(q)	test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
 #define blk_queue_skip_tagset_quiesce(q) \
 	((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE)
+#define blk_queue_disable_wbt(q) \
+	test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags)
+#define blk_queue_no_elv_switch(q) \
+	test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);

@@ -1288,6 +1295,13 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev)
 	return queue_max_segments(bdev_get_queue(bdev));
 }
 
+static inline unsigned short bdev_max_write_streams(struct block_device *bdev)
+{
+	if (bdev_is_partition(bdev))
+		return 0;
+	return bdev_limits(bdev)->max_write_streams;
+}
+
 static inline unsigned queue_logical_block_size(const struct request_queue *q)
 {
 	return q->limits.logical_block_size;
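blkdev.h now exposes the per-device write-stream limits added for FDP. A speculative sketch of how a submitter might consult them before tagging a bio follows; the helper itself is made up, and only bdev_max_write_streams() and the bi_write_stream field come from this diff.

	/* Hypothetical: tag a bio with a write stream only if the device has one. */
	static void example_set_write_stream(struct bio *bio, u8 stream)
	{
		if (stream && stream <= bdev_max_write_streams(bio->bi_bdev))
			bio->bi_write_stream = stream;
	}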
@@ -11,6 +11,7 @@
 #ifndef LINUX_DMAPOOL_H
 #define	LINUX_DMAPOOL_H
 
+#include <linux/nodemask_types.h>
 #include <linux/scatterlist.h>
 #include <asm/io.h>
 

@@ -18,8 +19,8 @@ struct device;
 
 #ifdef CONFIG_HAS_DMA
 
-struct dma_pool *dma_pool_create(const char *name, struct device *dev,
-			size_t size, size_t align, size_t allocation);
+struct dma_pool *dma_pool_create_node(const char *name, struct device *dev,
+		size_t size, size_t align, size_t boundary, int node);
 
 void dma_pool_destroy(struct dma_pool *pool);
 

@@ -35,9 +36,12 @@ struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
 void dmam_pool_destroy(struct dma_pool *pool);
 
 #else /* !CONFIG_HAS_DMA */
-static inline struct dma_pool *dma_pool_create(const char *name,
-	struct device *dev, size_t size, size_t align, size_t allocation)
-{ return NULL; }
+static inline struct dma_pool *dma_pool_create_node(const char *name,
+		struct device *dev, size_t size, size_t align, size_t boundary,
+		int node)
+{
+	return NULL;
+}
 static inline void dma_pool_destroy(struct dma_pool *pool) { }
 static inline void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 				   dma_addr_t *handle) { return NULL; }

@@ -49,6 +53,13 @@ static inline struct dma_pool *dmam_pool_create(const char *name,
 static inline void dmam_pool_destroy(struct dma_pool *pool) { }
 #endif /* !CONFIG_HAS_DMA */
 
+static inline struct dma_pool *dma_pool_create(const char *name,
+		struct device *dev, size_t size, size_t align, size_t boundary)
+{
+	return dma_pool_create_node(name, dev, size, align, boundary,
+				    NUMA_NO_NODE);
+}
+
 static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
 				    dma_addr_t *handle)
 {
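The new dma_pool_create_node() is what the per-node NVMe PRP/SGL pools mentioned in the merge description build on, while dma_pool_create() survives as the NUMA_NO_NODE wrapper shown above. A brief sketch, with the pool name and sizes chosen arbitrarily and dev_to_node() assumed as the usual way to pick the node:

	/* Sketch: create a DMA pool backed by memory close to the device. */
	static struct dma_pool *example_create_pool(struct device *dev)
	{
		return dma_pool_create_node("example-pool", dev, 256, 8,
					    0 /* no boundary */, dev_to_node(dev));
	}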
@@ -408,6 +408,7 @@ struct kiocb {
 	void			*private;
 	int			ki_flags;
 	u16			ki_ioprio; /* See linux/ioprio.h */
+	u8			ki_write_stream;
 	union {
 		/*
 		 * Only used for async buffered reads, where it denotes the
@@ -140,6 +140,15 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur
 	return cmd_to_io_kiocb(cmd)->async_data;
 }
 
+/*
+ * Return uring_cmd's context reference as its context handle for driver to
+ * track per-context resource, such as registered kernel IO buffer
+ */
+static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd)
+{
+	return cmd_to_io_kiocb(cmd)->ctx;
+}
+
 int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
 			    void (*release)(void *), unsigned int index,
 			    unsigned int issue_flags);
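The comment above only promises an opaque per-ring key, so the sketch below sticks to that idea: remember the handle when a buffer is registered and later check that the unregister request comes from the same ring (the use case ublk has for this helper). The structure and both functions here are hypothetical.

	/* Hypothetical driver state that ties a buffer registration to one ring. */
	struct example_buf_reg {
		void		*ring_handle;	/* from io_uring_cmd_ctx_handle() */
		unsigned int	index;
	};

	static void example_remember_ring(struct example_buf_reg *reg,
					  struct io_uring_cmd *cmd)
	{
		reg->ring_handle = io_uring_cmd_ctx_handle(cmd);
	}

	static bool example_same_ring(struct example_buf_reg *reg,
				      struct io_uring_cmd *cmd)
	{
		return reg->ring_handle == io_uring_cmd_ctx_handle(cmd);
	}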
@@ -148,7 +148,6 @@ enum zone_stat_item {
 	NR_ZONE_WRITE_PENDING,	/* Count of dirty, writeback and unstable pages */
 	NR_MLOCK,		/* mlock()ed pages found and moved off LRU */
 	/* Second 128 byte cacheline */
-	NR_BOUNCE,
 #if IS_ENABLED(CONFIG_ZSMALLOC)
 	NR_ZSPAGES,		/* allocated in zsmalloc */
 #endif
@@ -303,6 +303,7 @@ enum nvme_ctrl_attr {
 	NVME_CTRL_ATTR_TBKAS	= (1 << 6),
 	NVME_CTRL_ATTR_ELBAS	= (1 << 15),
 	NVME_CTRL_ATTR_RHII	= (1 << 18),
+	NVME_CTRL_ATTR_FDPS	= (1 << 19),
 };
 
 struct nvme_id_ctrl {

@@ -689,6 +690,44 @@ struct nvme_rotational_media_log {
 	__u8	rsvd24[488];
 };
 
+struct nvme_fdp_config {
+	__u8			flags;
+#define FDPCFG_FDPE	(1U << 0)
+	__u8			fdpcidx;
+	__le16			reserved;
+};
+
+struct nvme_fdp_ruh_desc {
+	__u8			ruht;
+	__u8			reserved[3];
+};
+
+struct nvme_fdp_config_desc {
+	__le16			dsze;
+	__u8			fdpa;
+	__u8			vss;
+	__le32			nrg;
+	__le16			nruh;
+	__le16			maxpids;
+	__le32			nns;
+	__le64			runs;
+	__le32			erutl;
+	__u8			rsvd28[36];
+	struct nvme_fdp_ruh_desc ruhs[];
+};
+
+struct nvme_fdp_config_log {
+	__le16			numfdpc;
+	__u8			ver;
+	__u8			rsvd3;
+	__le32			sze;
+	__u8			rsvd8[8];
+	/*
+	 * This is followed by variable number of nvme_fdp_config_desc
+	 * structures, but sparse doesn't like nested variable sized arrays.
+	 */
+};
+
 struct nvme_smart_log {
 	__u8			critical_warning;
 	__u8			temperature[2];

@@ -915,6 +954,7 @@ enum nvme_opcode {
 	nvme_cmd_resv_register	= 0x0d,
 	nvme_cmd_resv_report	= 0x0e,
 	nvme_cmd_resv_acquire	= 0x11,
+	nvme_cmd_io_mgmt_recv	= 0x12,
 	nvme_cmd_resv_release	= 0x15,
 	nvme_cmd_zone_mgmt_send	= 0x79,
 	nvme_cmd_zone_mgmt_recv	= 0x7a,

@@ -936,6 +976,7 @@ enum nvme_opcode {
 		nvme_opcode_name(nvme_cmd_resv_register),	\
 		nvme_opcode_name(nvme_cmd_resv_report),		\
 		nvme_opcode_name(nvme_cmd_resv_acquire),	\
+		nvme_opcode_name(nvme_cmd_io_mgmt_recv),	\
 		nvme_opcode_name(nvme_cmd_resv_release),	\
 		nvme_opcode_name(nvme_cmd_zone_mgmt_send),	\
 		nvme_opcode_name(nvme_cmd_zone_mgmt_recv),	\

@@ -1087,6 +1128,7 @@ enum {
 	NVME_RW_PRINFO_PRCHK_GUARD	= 1 << 12,
 	NVME_RW_PRINFO_PRACT		= 1 << 13,
 	NVME_RW_DTYPE_STREAMS		= 1 << 4,
+	NVME_RW_DTYPE_DPLCMT		= 2 << 4,
 	NVME_WZ_DEAC			= 1 << 9,
 };
 

@@ -1174,6 +1216,38 @@ struct nvme_zone_mgmt_recv_cmd {
 	__le32			cdw14[2];
 };
 
+struct nvme_io_mgmt_recv_cmd {
+	__u8			opcode;
+	__u8			flags;
+	__u16			command_id;
+	__le32			nsid;
+	__le64			rsvd2[2];
+	union nvme_data_ptr	dptr;
+	__u8			mo;
+	__u8			rsvd11;
+	__u16			mos;
+	__le32			numd;
+	__le32			cdw12[4];
+};
+
+enum {
+	NVME_IO_MGMT_RECV_MO_RUHS = 1,
+};
+
+struct nvme_fdp_ruh_status_desc {
+	__le16			pid;
+	__le16			ruhid;
+	__le32			earutr;
+	__le64			ruamw;
+	__u8			reserved[16];
+};
+
+struct nvme_fdp_ruh_status {
+	__u8			rsvd0[14];
+	__le16			nruhsd;
+	struct nvme_fdp_ruh_status_desc ruhsd[];
+};
+
 enum {
 	NVME_ZRA_ZONE_REPORT		= 0,
 	NVME_ZRASF_ZONE_REPORT_ALL	= 0,

@@ -1309,6 +1383,7 @@ enum {
 	NVME_FEAT_PLM_WINDOW	= 0x14,
 	NVME_FEAT_HOST_BEHAVIOR	= 0x16,
 	NVME_FEAT_SANITIZE	= 0x17,
+	NVME_FEAT_FDP		= 0x1d,
 	NVME_FEAT_SW_PROGRESS	= 0x80,
 	NVME_FEAT_HOST_ID	= 0x81,
 	NVME_FEAT_RESV_MASK	= 0x82,

@@ -1329,6 +1404,7 @@ enum {
 	NVME_LOG_ANA		= 0x0c,
 	NVME_LOG_FEATURES	= 0x12,
 	NVME_LOG_RMI		= 0x16,
+	NVME_LOG_FDP_CONFIGS	= 0x20,
 	NVME_LOG_DISC		= 0x70,
 	NVME_LOG_RESERVATION	= 0x80,
 	NVME_FWACT_REPL		= (0 << 3),

@@ -1923,6 +1999,7 @@ struct nvme_command {
 		struct nvmf_auth_receive_command auth_receive;
 		struct nvme_dbbuf dbbuf;
 		struct nvme_directive_cmd directive;
+		struct nvme_io_mgmt_recv_cmd imr;
 	};
 };
 
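Several of the new FDP structures end in flexible arrays, so one hedged example of consuming them may be useful: the buffer is assumed to have already been filled by an I/O management receive command, and nothing here beyond the structure layout and le16_to_cpu() comes from the diff.

	/* Sketch: pick the placement ID of the first reported reclaim unit handle. */
	static int example_first_placement_id(struct nvme_fdp_ruh_status *ruhs,
					      u16 *pid)
	{
		if (!le16_to_cpu(ruhs->nruhsd))
			return -ENOENT;
		*pid = le16_to_cpu(ruhs->ruhsd[0].pid);
		return 0;
	}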
@@ -79,4 +79,6 @@ static inline void part_stat_set_all(struct block_device *part, int value)
 #define part_stat_local_read_cpu(part, field, cpu)			\
 	local_read(&(part_stat_get_cpu(part, field, cpu)))
 
+unsigned int bdev_count_inflight(struct block_device *part);
+
 #endif /* _LINUX_PART_STAT_H */
@@ -670,8 +670,6 @@ struct Scsi_Host {
 	/* The transport requires the LUN bits NOT to be stored in CDB[1] */
 	unsigned no_scsi2_lun_in_cdb:1;
 
-	unsigned no_highmem:1;
-
 	/*
 	 * Optional work queue to be utilized by the transport
 	 */
Some files were not shown because too many files have changed in this diff.