linux/block/blk-mq-sched.h
Nilay Shroff 04225d13ae block: fix potential deadlock while running nr_hw_queue update
Move scheduler tags (sched_tags) allocation and deallocation outside
both the ->elevator_lock and ->freeze_lock when updating nr_hw_queues.
This change breaks the dependency chain from the percpu allocator lock
to the elevator lock, helping to prevent potential deadlocks, as
observed in the reported lockdep splat[1].

This commit introduces batch allocation and deallocation helpers for
sched_tags, which are now used from within __blk_mq_update_nr_hw_queues
routine while iterating through the tagset.

With this change, all sched_tags memory management is handled entirely
outside the ->elevator_lock and the ->freeze_lock context, thereby
eliminating the lock dependency that could otherwise manifest during
nr_hw_queues updates.

[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/

Reported-by: Stefan Haberland <sth@linux.ibm.com>
Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Link: https://lore.kernel.org/r/20250730074614.2537382-4-nilay@linux.ibm.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2025-07-30 06:20:51 -06:00

95 lines
2.7 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_SCHED_H
#define BLK_MQ_SCHED_H
#include "elevator.h"
#include "blk-mq.h"
#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **merged_request);
bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
struct list_head *free);
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
struct elevator_tags *et);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_rqs(struct request_queue *q);
struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
unsigned int nr_hw_queues);
int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
void blk_mq_free_sched_tags(struct elevator_tags *et,
struct blk_mq_tag_set *set);
void blk_mq_free_sched_tags_batch(struct xarray *et_table,
struct blk_mq_tag_set *set);
static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
__blk_mq_sched_restart(hctx);
}
static inline bool bio_mergeable(struct bio *bio)
{
return !(bio->bi_opf & REQ_NOMERGE_FLAGS);
}
static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
if (rq->rq_flags & RQF_USE_SCHED) {
struct elevator_queue *e = q->elevator;
if (e->type->ops.allow_merge)
return e->type->ops.allow_merge(q, rq, bio);
}
return true;
}
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{
if (rq->rq_flags & RQF_USE_SCHED) {
struct elevator_queue *e = rq->q->elevator;
if (e->type->ops.completed_request)
e->type->ops.completed_request(rq, now);
}
}
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
if (rq->rq_flags & RQF_USE_SCHED) {
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
if (e->type->ops.requeue_request)
e->type->ops.requeue_request(rq);
}
}
static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
{
struct elevator_queue *e = hctx->queue->elevator;
if (e && e->type->ops.has_work)
return e->type->ops.has_work(hctx);
return false;
}
static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
{
return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
#endif