block-6.17-20250808

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmiWLjoQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpvveD/9vbvp3XaF0LagRJLH0fcdhcxL7Z+IHD+7U
 v5vICMeoeBhhhOtPJ0y+h/9LMLQWFYDFl6drkY0atSSxp/CK6CB25qFhIDsoA6Qk
 RBM/qZ64z4Uxvlc+VQmCqI2EMc/ZrYtrcr7jsornwORoTSEKXVHdyO5k7Q9002Sw
 XNWc0bZKIibFlgOk12Wnd8ZS5RWHw1uViUcreojcGVZAVR+BuHNGGoa3xq0bLiHU
 ERbQXfjaN28R+eo4E1euCtdf++7tW2kFjClrDmLcszdb27E2+MWMA6AKMiSTBE2k
 2e2TvJUcGZs1s8atqSIIjBtmwQW3rKws33zODLMONzOP8CIErcaniHxyDSaxJIJr
 kjsdKnwlziL3xVnwQcpgnVOPvvDSKZ4OKEqx8rAuYTqiknpz3uhbt/7EqumuPLHr
 e7Rz0MnFolrVN7KZOHQ5CPJIezkEAOAEpItLdfc5cfLS06pbeTN3j+dJZp+tUohi
 WP/K3l2N3C5pkXA0ilAzshRF20Rwv/09M85BoqWocTLBJY7WqyIKXywCNdX81wkv
 tpbQvp2MpPkJXUIbAh5484BOfCfx9vkYVm2cam2UxXJhR6VfrQCjYfXIjfpqF4jp
 q7xxNesUezrOqB2Q/cKxw8dKOaRtO1XzVnmwutBrcKgqqLezMwUTDDjQYe8l6p1Z
 40E74tsJwQ==
 =EQ7g
 -----END PGP SIGNATURE-----

Merge tag 'block-6.17-20250808' of git://git.kernel.dk/linux

Pull more block updates from Jens Axboe:

 - MD pull request via Yu:
      - mddev null-ptr-dereference fix, by Erkun
      - md-cluster regression fix for failing to remove the faulty disk,
        by Heming
      - minor cleanup, by Li Nan and Jinchao
      - mdadm lifetime regression fix reported by syzkaller, by Yu Kuai

 - NVMe pull request via Christoph:
      - add support for getting the FDP feature in fabrics passthru path
        (Nitesh Shetty)
      - add capability to connect to an administrative controller
        (Kamaljit Singh)
      - fix a leak on sgl setup error (Keith Busch)
      - initialize discovery subsys after debugfs is initialized
        (Mohamed Khalfella)
      - fix various comment typos (Bjorn Helgaas)
      - remove unneeded semicolons (Jiapeng Chong)

 - nvmet debugfs ordering issue fix

 - Fix UAF in the tag_set in zloop

 - Ensure sbitmap shallow depth covers entire set

 - Reduce lock roundtrips in io context lookup

 - Move scheduler tags alloc/free out of the elevator and freeze locks, to
   fix some issues found by lockdep

 - Improve robustness of queue limits checking

 - Fix a regression with IO priorities when no io context exists

* tag 'block-6.17-20250808' of git://git.kernel.dk/linux: (26 commits)
  lib/sbitmap: make sbitmap_get_shallow() internal
  lib/sbitmap: convert shallow_depth from one word to the whole sbitmap
  nvmet: exit debugfs after discovery subsystem exits
  block, bfq: Reorder struct bfq_iocq_bfqq_data
  md: make rdev_addable usable for rcu mode
  md/raid1: remove struct pool_info and related code
  md/raid1: change r1conf->r1bio_pool to a pointer type
  block: ensure discard_granularity is zero when discard is not supported
  zloop: fix KASAN use-after-free of tag set
  block: Fix default IO priority if there is no IO context
  nvme: fix various comment typos
  nvme-auth: remove unneeded semicolon
  nvme-pci: fix leak on sgl setup error
  nvmet: initialize discovery subsys after debugfs is initialized
  nvme: add capability to connect to an administrative controller
  nvmet: add support for FDP in fabrics passthru path
  md: rename recovery_cp to resync_offset
  md/md-cluster: handle REMOVE message earlier
  md: fix create on open mddev lifetime regression
  block: fix potential deadlock while running nr_hw_queue update
  ...
Linus Torvalds 2025-08-09 08:47:28 +03:00
commit 2988dfed8a
38 changed files with 521 additions and 444 deletions


@ -454,17 +454,10 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
*/ */
static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
{ {
struct bfq_io_cq *icq;
unsigned long flags;
if (!current->io_context) if (!current->io_context)
return NULL; return NULL;
spin_lock_irqsave(&q->queue_lock, flags); return icq_to_bic(ioc_lookup_icq(q));
icq = icq_to_bic(ioc_lookup_icq(q));
spin_unlock_irqrestore(&q->queue_lock, flags);
return icq;
} }
/* /*
@ -701,17 +694,13 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{ {
struct bfq_data *bfqd = data->q->elevator->elevator_data; struct bfq_data *bfqd = data->q->elevator->elevator_data;
struct bfq_io_cq *bic = bfq_bic_lookup(data->q); struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
int depth; unsigned int limit, act_idx;
unsigned limit = data->q->nr_requests;
unsigned int act_idx;
/* Sync reads have full depth available */ /* Sync reads have full depth available */
if (op_is_sync(opf) && !op_is_write(opf)) { if (op_is_sync(opf) && !op_is_write(opf))
depth = 0; limit = data->q->nr_requests;
} else { else
depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)]; limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
limit = (limit * depth) >> bfqd->full_depth_shift;
}
for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) {
/* Fast path to check if bfqq is already allocated. */ /* Fast path to check if bfqq is already allocated. */
@ -725,14 +714,16 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
* available requests and thus starve other entities. * available requests and thus starve other entities.
*/ */
if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) { if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) {
depth = 1; limit = 1;
break; break;
} }
} }
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
__func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); __func__, bfqd->wr_busy_queues, op_is_sync(opf), limit);
if (depth)
data->shallow_depth = depth; if (limit < data->q->nr_requests)
data->shallow_depth = limit;
} }
static struct bfq_queue * static struct bfq_queue *
@ -2457,15 +2448,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs) unsigned int nr_segs)
{ {
struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_data *bfqd = q->elevator->elevator_data;
struct request *free = NULL;
/*
* bfq_bic_lookup grabs the queue_lock: invoke it now and
* store its return value for later use, to avoid nesting
* queue_lock inside the bfqd->lock. We assume that the bic
* returned by bfq_bic_lookup does not go away before
* bfqd->lock is taken.
*/
struct bfq_io_cq *bic = bfq_bic_lookup(q); struct bfq_io_cq *bic = bfq_bic_lookup(q);
struct request *free = NULL;
bool ret; bool ret;
spin_lock_irq(&bfqd->lock); spin_lock_irq(&bfqd->lock);
@ -7128,9 +7112,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
*/ */
static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
{ {
unsigned int depth = 1U << bt->sb.shift; unsigned int nr_requests = bfqd->queue->nr_requests;
bfqd->full_depth_shift = bt->sb.shift;
/* /*
* In-word depths if no bfq_queue is being weight-raised: * In-word depths if no bfq_queue is being weight-raised:
* leaving 25% of tags only for sync reads. * leaving 25% of tags only for sync reads.
@ -7142,13 +7125,13 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
* limit 'something'. * limit 'something'.
*/ */
/* no more than 50% of tags for async I/O */ /* no more than 50% of tags for async I/O */
bfqd->word_depths[0][0] = max(depth >> 1, 1U); bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U);
/* /*
* no more than 75% of tags for sync writes (25% extra tags * no more than 75% of tags for sync writes (25% extra tags
* w.r.t. async I/O, to prevent async I/O from starving sync * w.r.t. async I/O, to prevent async I/O from starving sync
* writes) * writes)
*/ */
bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U); bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U);
/* /*
* In-word depths in case some bfq_queue is being weight- * In-word depths in case some bfq_queue is being weight-
@ -7158,9 +7141,9 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
* shortage. * shortage.
*/ */
/* no more than ~18% of tags for async I/O */ /* no more than ~18% of tags for async I/O */
bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U); bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U);
/* no more than ~37% of tags for sync writes (~20% extra tags) */ /* no more than ~37% of tags for sync writes (~20% extra tags) */
bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U); bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U);
} }
static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
@ -7232,22 +7215,16 @@ static void bfq_init_root_group(struct bfq_group *root_group,
root_group->sched_data.bfq_class_idle_last_service = jiffies; root_group->sched_data.bfq_class_idle_last_service = jiffies;
} }
static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
{ {
struct bfq_data *bfqd; struct bfq_data *bfqd;
struct elevator_queue *eq;
unsigned int i; unsigned int i;
struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges; struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges;
eq = elevator_alloc(q, e); bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
if (!eq) if (!bfqd)
return -ENOMEM; return -ENOMEM;
bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
if (!bfqd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = bfqd; eq->elevator_data = bfqd;
spin_lock_irq(&q->queue_lock); spin_lock_irq(&q->queue_lock);
@ -7405,7 +7382,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
out_free: out_free:
kfree(bfqd); kfree(bfqd);
kobject_put(&eq->kobj);
return -ENOMEM; return -ENOMEM;
} }
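
The bfq_update_depths() hunk above now derives the four depth limits directly from the scheduler's request count instead of from the sbitmap word size. A minimal standalone sketch of that arithmetic (not kernel code; the formulas are taken from the hunk above, and nr_requests = 64 is an arbitrary example value):

#include <stdio.h>

static unsigned int max_u(unsigned int a, unsigned int b)
{
        return a > b ? a : b;
}

int main(void)
{
        /* arbitrary example depth; in the kernel this is q->nr_requests */
        unsigned int nr_requests = 64;
        unsigned int async_depths[2][2];

        /* no bfq_queue is weight-raised: at most 50% of tags for async I/O,
         * at most 75% for sync writes */
        async_depths[0][0] = max_u(nr_requests >> 1, 1U);
        async_depths[0][1] = max_u((nr_requests * 3) >> 2, 1U);

        /* some bfq_queue is weight-raised: ~18% and ~37% respectively */
        async_depths[1][0] = max_u((nr_requests * 3) >> 4, 1U);
        async_depths[1][1] = max_u((nr_requests * 6) >> 4, 1U);

        printf("no wr:   async %u, sync writes %u\n",
               async_depths[0][0], async_depths[0][1]);
        printf("wr busy: async %u, sync writes %u\n",
               async_depths[1][0], async_depths[1][1]);
        return 0;
}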


@ -427,9 +427,6 @@ struct bfq_iocq_bfqq_data {
*/ */
bool saved_IO_bound; bool saved_IO_bound;
u64 saved_io_start_time;
u64 saved_tot_idle_time;
/* /*
* Same purpose as the previous fields for the values of the * Same purpose as the previous fields for the values of the
* field keeping the queue's belonging to a large burst * field keeping the queue's belonging to a large burst
@ -450,6 +447,9 @@ struct bfq_iocq_bfqq_data {
*/ */
unsigned int saved_weight; unsigned int saved_weight;
u64 saved_io_start_time;
u64 saved_tot_idle_time;
/* /*
* Similar to previous fields: save wr information. * Similar to previous fields: save wr information.
*/ */
@ -457,13 +457,13 @@ struct bfq_iocq_bfqq_data {
unsigned long saved_last_wr_start_finish; unsigned long saved_last_wr_start_finish;
unsigned long saved_service_from_wr; unsigned long saved_service_from_wr;
unsigned long saved_wr_start_at_switch_to_srt; unsigned long saved_wr_start_at_switch_to_srt;
unsigned int saved_wr_cur_max_time;
struct bfq_ttime saved_ttime; struct bfq_ttime saved_ttime;
unsigned int saved_wr_cur_max_time;
/* Save also injection state */ /* Save also injection state */
u64 saved_last_serv_time_ns;
unsigned int saved_inject_limit; unsigned int saved_inject_limit;
unsigned long saved_decrease_time_jif; unsigned long saved_decrease_time_jif;
u64 saved_last_serv_time_ns;
/* candidate queue for a stable merge (due to close creation time) */ /* candidate queue for a stable merge (due to close creation time) */
struct bfq_queue *stable_merge_bfqq; struct bfq_queue *stable_merge_bfqq;
@ -813,8 +813,7 @@ struct bfq_data {
* Depth limits used in bfq_limit_depth (see comments on the * Depth limits used in bfq_limit_depth (see comments on the
* function) * function)
*/ */
unsigned int word_depths[2][2]; unsigned int async_depths[2][2];
unsigned int full_depth_shift;
/* /*
* Number of independent actuators. This is equal to 1 in * Number of independent actuators. This is equal to 1 in


@ -308,24 +308,23 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk)
#ifdef CONFIG_BLK_ICQ #ifdef CONFIG_BLK_ICQ
/** /**
* ioc_lookup_icq - lookup io_cq from ioc * ioc_lookup_icq - lookup io_cq from ioc in io issue path
* @q: the associated request_queue * @q: the associated request_queue
* *
* Look up io_cq associated with @ioc - @q pair from @ioc. Must be called * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
* with @q->queue_lock held. * from io issue path, either return NULL if current issue io to @q for the
* first time, or return a valid icq.
*/ */
struct io_cq *ioc_lookup_icq(struct request_queue *q) struct io_cq *ioc_lookup_icq(struct request_queue *q)
{ {
struct io_context *ioc = current->io_context; struct io_context *ioc = current->io_context;
struct io_cq *icq; struct io_cq *icq;
lockdep_assert_held(&q->queue_lock);
/* /*
* icq's are indexed from @ioc using radix tree and hint pointer, * icq's are indexed from @ioc using radix tree and hint pointer,
* both of which are protected with RCU. All removals are done * both of which are protected with RCU, io issue path ensures that
* holding both q and ioc locks, and we're holding q lock - if we * both request_queue and current task are valid, the found icq
* find a icq which points to us, it's guaranteed to be valid. * is guaranteed to be valid until the io is done.
*/ */
rcu_read_lock(); rcu_read_lock();
icq = rcu_dereference(ioc->icq_hint); icq = rcu_dereference(ioc->icq_hint);
@ -419,10 +418,7 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q)
task_unlock(current); task_unlock(current);
} else { } else {
get_io_context(ioc); get_io_context(ioc);
spin_lock_irq(&q->queue_lock);
icq = ioc_lookup_icq(q); icq = ioc_lookup_icq(q);
spin_unlock_irq(&q->queue_lock);
} }
if (!icq) { if (!icq) {
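
The rewritten ioc_lookup_icq() comment above relies on the icq being published and looked up under RCU, with the issue path guaranteeing it stays valid until the I/O is done, rather than taking q->queue_lock around the lookup. A rough userspace model of that publish/lookup pattern, using C11 acquire/release atomics in place of RCU (the names and the single hint pointer are simplifications, and icq reclamation is not modeled):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct icq {
        int q_id;       /* which request_queue this icq belongs to */
        int prio;       /* some per-(task, queue) state */
};

/* stands in for ioc->icq_hint, the last-used icq cached in the io_context */
static _Atomic(struct icq *) icq_hint;

static void publish_icq(int q_id, int prio)
{
        struct icq *icq = malloc(sizeof(*icq));

        icq->q_id = q_id;
        icq->prio = prio;
        /* release: the fields above are visible before the pointer is */
        atomic_store_explicit(&icq_hint, icq, memory_order_release);
}

static struct icq *lookup_icq(int q_id)
{
        /* acquire pairs with the release store; no queue lock is taken */
        struct icq *icq = atomic_load_explicit(&icq_hint, memory_order_acquire);

        return icq && icq->q_id == q_id ? icq : NULL;
}

int main(void)
{
        struct icq *icq;

        publish_icq(1, 4);
        icq = lookup_icq(1);
        printf("lookup: %s (prio %d)\n", icq ? "hit" : "miss",
               icq ? icq->prio : -1);
        free(icq);
        return 0;
}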


@ -374,64 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
} }
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx)
{
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
hctx->sched_tags = q->sched_shared_tags;
return 0;
}
hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
q->nr_requests);
if (!hctx->sched_tags)
return -ENOMEM;
return 0;
}
static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
{
blk_mq_free_rq_map(queue->sched_shared_tags);
queue->sched_shared_tags = NULL;
}
/* called in queue's release handler, tagset has gone away */ /* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{ {
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
unsigned long i; unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) { queue_for_each_hw_ctx(q, hctx, i)
if (hctx->sched_tags) { hctx->sched_tags = NULL;
if (!blk_mq_is_shared_tags(flags))
blk_mq_free_rq_map(hctx->sched_tags);
hctx->sched_tags = NULL;
}
}
if (blk_mq_is_shared_tags(flags)) if (blk_mq_is_shared_tags(flags))
blk_mq_exit_sched_shared_tags(q); q->sched_shared_tags = NULL;
}
static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
{
struct blk_mq_tag_set *set = queue->tag_set;
/*
* Set initial depth at max so that we don't need to reallocate for
* updating nr_requests.
*/
queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
BLK_MQ_NO_HCTX_IDX,
MAX_SCHED_RQ);
if (!queue->sched_shared_tags)
return -ENOMEM;
blk_mq_tag_update_sched_shared_tags(queue);
return 0;
} }
void blk_mq_sched_reg_debugfs(struct request_queue *q) void blk_mq_sched_reg_debugfs(struct request_queue *q)
@ -458,8 +411,140 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
mutex_unlock(&q->debugfs_mutex); mutex_unlock(&q->debugfs_mutex);
} }
void blk_mq_free_sched_tags(struct elevator_tags *et,
struct blk_mq_tag_set *set)
{
unsigned long i;
/* Shared tags are stored at index 0 in @tags. */
if (blk_mq_is_shared_tags(set->flags))
blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX);
else {
for (i = 0; i < et->nr_hw_queues; i++)
blk_mq_free_map_and_rqs(set, et->tags[i], i);
}
kfree(et);
}
void blk_mq_free_sched_tags_batch(struct xarray *et_table,
struct blk_mq_tag_set *set)
{
struct request_queue *q;
struct elevator_tags *et;
lockdep_assert_held_write(&set->update_nr_hwq_lock);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
/*
* Accessing q->elevator without holding q->elevator_lock is
* safe because we're holding here set->update_nr_hwq_lock in
* the writer context. So, scheduler update/switch code (which
* acquires the same lock but in the reader context) can't run
* concurrently.
*/
if (q->elevator) {
et = xa_load(et_table, q->id);
if (unlikely(!et))
WARN_ON_ONCE(1);
else
blk_mq_free_sched_tags(et, set);
}
}
}
struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
unsigned int nr_hw_queues)
{
unsigned int nr_tags;
int i;
struct elevator_tags *et;
gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
if (blk_mq_is_shared_tags(set->flags))
nr_tags = 1;
else
nr_tags = nr_hw_queues;
et = kmalloc(sizeof(struct elevator_tags) +
nr_tags * sizeof(struct blk_mq_tags *), gfp);
if (!et)
return NULL;
/*
* Default to double of smaller one between hw queue_depth and
* 128, since we don't split into sync/async like the old code
* did. Additionally, this is a per-hw queue depth.
*/
et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
BLKDEV_DEFAULT_RQ);
et->nr_hw_queues = nr_hw_queues;
if (blk_mq_is_shared_tags(set->flags)) {
/* Shared tags are stored at index 0 in @tags. */
et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
MAX_SCHED_RQ);
if (!et->tags[0])
goto out;
} else {
for (i = 0; i < et->nr_hw_queues; i++) {
et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
et->nr_requests);
if (!et->tags[i])
goto out_unwind;
}
}
return et;
out_unwind:
while (--i >= 0)
blk_mq_free_map_and_rqs(set, et->tags[i], i);
out:
kfree(et);
return NULL;
}
int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
{
struct request_queue *q;
struct elevator_tags *et;
gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
lockdep_assert_held_write(&set->update_nr_hwq_lock);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
/*
* Accessing q->elevator without holding q->elevator_lock is
* safe because we're holding here set->update_nr_hwq_lock in
* the writer context. So, scheduler update/switch code (which
* acquires the same lock but in the reader context) can't run
* concurrently.
*/
if (q->elevator) {
et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
if (!et)
goto out_unwind;
if (xa_insert(et_table, q->id, et, gfp))
goto out_free_tags;
}
}
return 0;
out_free_tags:
blk_mq_free_sched_tags(et, set);
out_unwind:
list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
if (q->elevator) {
et = xa_load(et_table, q->id);
if (et)
blk_mq_free_sched_tags(et, set);
}
}
return -ENOMEM;
}
/* caller must have a reference to @e, will grab another one if successful */ /* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
struct elevator_tags *et)
{ {
unsigned int flags = q->tag_set->flags; unsigned int flags = q->tag_set->flags;
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
@ -467,36 +552,33 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
unsigned long i; unsigned long i;
int ret; int ret;
/* eq = elevator_alloc(q, e, et);
* Default to double of smaller one between hw queue_depth and 128, if (!eq)
* since we don't split into sync/async like the old code did. return -ENOMEM;
* Additionally, this is a per-hw queue depth.
*/ q->nr_requests = et->nr_requests;
q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
BLKDEV_DEFAULT_RQ);
if (blk_mq_is_shared_tags(flags)) { if (blk_mq_is_shared_tags(flags)) {
ret = blk_mq_init_sched_shared_tags(q); /* Shared tags are stored at index 0 in @et->tags. */
if (ret) q->sched_shared_tags = et->tags[0];
return ret; blk_mq_tag_update_sched_shared_tags(q);
} }
queue_for_each_hw_ctx(q, hctx, i) { queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i); if (blk_mq_is_shared_tags(flags))
if (ret) hctx->sched_tags = q->sched_shared_tags;
goto err_free_map_and_rqs; else
hctx->sched_tags = et->tags[i];
} }
ret = e->ops.init_sched(q, e); ret = e->ops.init_sched(q, eq);
if (ret) if (ret)
goto err_free_map_and_rqs; goto out;
queue_for_each_hw_ctx(q, hctx, i) { queue_for_each_hw_ctx(q, hctx, i) {
if (e->ops.init_hctx) { if (e->ops.init_hctx) {
ret = e->ops.init_hctx(hctx, i); ret = e->ops.init_hctx(hctx, i);
if (ret) { if (ret) {
eq = q->elevator;
blk_mq_sched_free_rqs(q);
blk_mq_exit_sched(q, eq); blk_mq_exit_sched(q, eq);
kobject_put(&eq->kobj); kobject_put(&eq->kobj);
return ret; return ret;
@ -505,10 +587,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
} }
return 0; return 0;
err_free_map_and_rqs: out:
blk_mq_sched_free_rqs(q);
blk_mq_sched_tags_teardown(q, flags); blk_mq_sched_tags_teardown(q, flags);
kobject_put(&eq->kobj);
q->elevator = NULL; q->elevator = NULL;
return ret; return ret;
} }
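
blk_mq_alloc_sched_tags() above sizes a struct elevator_tags with a flexible tags[] array and picks the default scheduler depth as double the smaller of the hardware queue depth and 128. A small standalone sketch of both calculations (simplified stand-in types; BLKDEV_DEFAULT_RQ is assumed to be 128 here, matching the comment in the hunk):

#include <stdio.h>
#include <stdlib.h>

#define BLKDEV_DEFAULT_RQ 128   /* assumed value, per the comment above */

struct blk_mq_tags;             /* opaque stand-in */

struct elevator_tags {
        unsigned int nr_hw_queues;      /* queues tags were allocated for */
        unsigned int nr_requests;       /* depth used while allocating */
        struct blk_mq_tags *tags[];     /* shared tags live at index 0 */
};

static struct elevator_tags *alloc_sched_tags(unsigned int queue_depth,
                                              unsigned int nr_hw_queues,
                                              int shared_tags)
{
        unsigned int nr_tags = shared_tags ? 1 : nr_hw_queues;
        struct elevator_tags *et;

        et = calloc(1, sizeof(*et) + nr_tags * sizeof(struct blk_mq_tags *));
        if (!et)
                return NULL;

        /* double of the smaller of hw queue depth and 128 */
        et->nr_requests = 2 * (queue_depth < BLKDEV_DEFAULT_RQ ?
                               queue_depth : BLKDEV_DEFAULT_RQ);
        et->nr_hw_queues = nr_hw_queues;
        /* the kernel would now fill tags[] via blk_mq_alloc_map_and_rqs() */
        return et;
}

int main(void)
{
        struct elevator_tags *et = alloc_sched_tags(64, 4, 0);

        if (et)
                printf("nr_hw_queues %u, nr_requests %u\n",
                       et->nr_hw_queues, et->nr_requests);
        free(et);
        return 0;
}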


@ -18,10 +18,20 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
struct elevator_tags *et);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_rqs(struct request_queue *q); void blk_mq_sched_free_rqs(struct request_queue *q);
struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
unsigned int nr_hw_queues);
int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
void blk_mq_free_sched_tags(struct elevator_tags *et,
struct blk_mq_tag_set *set);
void blk_mq_free_sched_tags_batch(struct xarray *et_table,
struct blk_mq_tag_set *set);
static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{ {
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))


@ -4974,12 +4974,13 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
* Switch back to the elevator type stored in the xarray. * Switch back to the elevator type stored in the xarray.
*/ */
static void blk_mq_elv_switch_back(struct request_queue *q, static void blk_mq_elv_switch_back(struct request_queue *q,
struct xarray *elv_tbl) struct xarray *elv_tbl, struct xarray *et_tbl)
{ {
struct elevator_type *e = xa_load(elv_tbl, q->id); struct elevator_type *e = xa_load(elv_tbl, q->id);
struct elevator_tags *t = xa_load(et_tbl, q->id);
/* The elv_update_nr_hw_queues unfreezes the queue. */ /* The elv_update_nr_hw_queues unfreezes the queue. */
elv_update_nr_hw_queues(q, e); elv_update_nr_hw_queues(q, e, t);
/* Drop the reference acquired in blk_mq_elv_switch_none. */ /* Drop the reference acquired in blk_mq_elv_switch_none. */
if (e) if (e)
@ -5031,7 +5032,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int prev_nr_hw_queues = set->nr_hw_queues; int prev_nr_hw_queues = set->nr_hw_queues;
unsigned int memflags; unsigned int memflags;
int i; int i;
struct xarray elv_tbl; struct xarray elv_tbl, et_tbl;
lockdep_assert_held(&set->tag_list_lock); lockdep_assert_held(&set->tag_list_lock);
@ -5044,6 +5045,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
memflags = memalloc_noio_save(); memflags = memalloc_noio_save();
xa_init(&et_tbl);
if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0)
goto out_memalloc_restore;
xa_init(&elv_tbl); xa_init(&elv_tbl);
list_for_each_entry(q, &set->tag_list, tag_set_list) { list_for_each_entry(q, &set->tag_list, tag_set_list) {
@ -5087,7 +5092,7 @@ fallback:
switch_back: switch_back:
/* The blk_mq_elv_switch_back unfreezes queue for us. */ /* The blk_mq_elv_switch_back unfreezes queue for us. */
list_for_each_entry(q, &set->tag_list, tag_set_list) list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_elv_switch_back(q, &elv_tbl); blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);
list_for_each_entry(q, &set->tag_list, tag_set_list) { list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register_hctxs(q); blk_mq_sysfs_register_hctxs(q);
@ -5098,7 +5103,8 @@ switch_back:
} }
xa_destroy(&elv_tbl); xa_destroy(&elv_tbl);
xa_destroy(&et_tbl);
out_memalloc_restore:
memalloc_noio_restore(memflags); memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */ /* Free the excess tags when nr_hw_queues shrink. */


@ -62,16 +62,24 @@ EXPORT_SYMBOL(blk_set_stacking_limits);
void blk_apply_bdi_limits(struct backing_dev_info *bdi, void blk_apply_bdi_limits(struct backing_dev_info *bdi,
struct queue_limits *lim) struct queue_limits *lim)
{ {
u64 io_opt = lim->io_opt;
/* /*
* For read-ahead of large files to be effective, we need to read ahead * For read-ahead of large files to be effective, we need to read ahead
* at least twice the optimal I/O size. * at least twice the optimal I/O size. For rotational devices that do
* not report an optimal I/O size (e.g. ATA HDDs), use the maximum I/O
* size to avoid falling back to the (rather inefficient) small default
* read-ahead size.
* *
* There is no hardware limitation for the read-ahead size and the user * There is no hardware limitation for the read-ahead size and the user
* might have increased the read-ahead size through sysfs, so don't ever * might have increased the read-ahead size through sysfs, so don't ever
* decrease it. * decrease it.
*/ */
if (!io_opt && (lim->features & BLK_FEAT_ROTATIONAL))
io_opt = (u64)lim->max_sectors << SECTOR_SHIFT;
bdi->ra_pages = max3(bdi->ra_pages, bdi->ra_pages = max3(bdi->ra_pages,
lim->io_opt * 2 / PAGE_SIZE, io_opt * 2 >> PAGE_SHIFT,
VM_READAHEAD_PAGES); VM_READAHEAD_PAGES);
bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT;
} }
@ -312,8 +320,12 @@ int blk_validate_limits(struct queue_limits *lim)
pr_warn("Invalid logical block size (%d)\n", lim->logical_block_size); pr_warn("Invalid logical block size (%d)\n", lim->logical_block_size);
return -EINVAL; return -EINVAL;
} }
if (lim->physical_block_size < lim->logical_block_size) if (lim->physical_block_size < lim->logical_block_size) {
lim->physical_block_size = lim->logical_block_size; lim->physical_block_size = lim->logical_block_size;
} else if (!is_power_of_2(lim->physical_block_size)) {
pr_warn("Invalid physical block size (%d)\n", lim->physical_block_size);
return -EINVAL;
}
/* /*
* The minimum I/O size defaults to the physical block size unless * The minimum I/O size defaults to the physical block size unless
@ -388,12 +400,19 @@ int blk_validate_limits(struct queue_limits *lim)
lim->max_discard_sectors = lim->max_discard_sectors =
min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors); min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
/*
* When discard is not supported, discard_granularity should be reported
* as 0 to userspace.
*/
if (lim->max_discard_sectors)
lim->discard_granularity =
max(lim->discard_granularity, lim->physical_block_size);
else
lim->discard_granularity = 0;
if (!lim->max_discard_segments) if (!lim->max_discard_segments)
lim->max_discard_segments = 1; lim->max_discard_segments = 1;
if (lim->discard_granularity < lim->physical_block_size)
lim->discard_granularity = lim->physical_block_size;
/* /*
* By default there is no limit on the segment boundary alignment, * By default there is no limit on the segment boundary alignment,
* but if there is one it can't be smaller than the page size as * but if there is one it can't be smaller than the page size as
@ -849,7 +868,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
} }
/* chunk_sectors a multiple of the physical block size? */ /* chunk_sectors a multiple of the physical block size? */
if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { if (t->chunk_sectors % (t->physical_block_size >> SECTOR_SHIFT)) {
t->chunk_sectors = 0; t->chunk_sectors = 0;
t->flags |= BLK_FLAG_MISALIGNED; t->flags |= BLK_FLAG_MISALIGNED;
ret = -1; ret = -1;
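
The blk_apply_bdi_limits() change above falls back to max_sectors as the optimal I/O size for rotational devices that do not report one, so their read-ahead window is not left at the small default. A standalone sketch of that computation (SECTOR_SHIFT = 9, PAGE_SHIFT = 12 and a 128 KiB VM_READAHEAD_PAGES default are assumptions for the example; the device numbers are arbitrary):

#include <stdio.h>

#define SECTOR_SHIFT            9
#define PAGE_SHIFT              12
#define VM_READAHEAD_PAGES      (128 * 1024 >> PAGE_SHIFT)

static unsigned long max3_ul(unsigned long a, unsigned long b, unsigned long c)
{
        unsigned long m = a > b ? a : b;

        return m > c ? m : c;
}

int main(void)
{
        /* example: ATA HDD reporting no io_opt, max_sectors of 1280 (640 KiB) */
        unsigned long long io_opt = 0;
        unsigned int max_sectors = 1280;
        int rotational = 1;
        unsigned long ra_pages = VM_READAHEAD_PAGES;

        if (!io_opt && rotational)
                io_opt = (unsigned long long)max_sectors << SECTOR_SHIFT;

        ra_pages = max3_ul(ra_pages, io_opt * 2 >> PAGE_SHIFT,
                           VM_READAHEAD_PAGES);
        printf("read-ahead: %lu pages (%lu KiB)\n",
               ra_pages, ra_pages << (PAGE_SHIFT - 10));
        return 0;
}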


@ -12,6 +12,7 @@
#include "blk-crypto-internal.h" #include "blk-crypto-internal.h"
struct elevator_type; struct elevator_type;
struct elevator_tags;
/* /*
* Default upper limit for the software max_sectors limit used for regular I/Os. * Default upper limit for the software max_sectors limit used for regular I/Os.
@ -330,7 +331,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
bool blk_insert_flush(struct request *rq); bool blk_insert_flush(struct request *rq);
void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e); void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
struct elevator_tags *t);
void elevator_set_default(struct request_queue *q); void elevator_set_default(struct request_queue *q);
void elevator_set_none(struct request_queue *q); void elevator_set_none(struct request_queue *q);

View file

@ -54,6 +54,8 @@ struct elv_change_ctx {
struct elevator_queue *old; struct elevator_queue *old;
/* for registering new elevator */ /* for registering new elevator */
struct elevator_queue *new; struct elevator_queue *new;
/* holds sched tags data */
struct elevator_tags *et;
}; };
static DEFINE_SPINLOCK(elv_list_lock); static DEFINE_SPINLOCK(elv_list_lock);
@ -132,7 +134,7 @@ static struct elevator_type *elevator_find_get(const char *name)
static const struct kobj_type elv_ktype; static const struct kobj_type elv_ktype;
struct elevator_queue *elevator_alloc(struct request_queue *q, struct elevator_queue *elevator_alloc(struct request_queue *q,
struct elevator_type *e) struct elevator_type *e, struct elevator_tags *et)
{ {
struct elevator_queue *eq; struct elevator_queue *eq;
@ -145,10 +147,10 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
kobject_init(&eq->kobj, &elv_ktype); kobject_init(&eq->kobj, &elv_ktype);
mutex_init(&eq->sysfs_lock); mutex_init(&eq->sysfs_lock);
hash_init(eq->hash); hash_init(eq->hash);
eq->et = et;
return eq; return eq;
} }
EXPORT_SYMBOL(elevator_alloc);
static void elevator_release(struct kobject *kobj) static void elevator_release(struct kobject *kobj)
{ {
@ -166,7 +168,6 @@ static void elevator_exit(struct request_queue *q)
lockdep_assert_held(&q->elevator_lock); lockdep_assert_held(&q->elevator_lock);
ioc_clear_queue(q); ioc_clear_queue(q);
blk_mq_sched_free_rqs(q);
mutex_lock(&e->sysfs_lock); mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e); blk_mq_exit_sched(q, e);
@ -592,7 +593,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
} }
if (new_e) { if (new_e) {
ret = blk_mq_init_sched(q, new_e); ret = blk_mq_init_sched(q, new_e, ctx->et);
if (ret) if (ret)
goto out_unfreeze; goto out_unfreeze;
ctx->new = q->elevator; ctx->new = q->elevator;
@ -627,8 +628,10 @@ static void elv_exit_and_release(struct request_queue *q)
elevator_exit(q); elevator_exit(q);
mutex_unlock(&q->elevator_lock); mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags); blk_mq_unfreeze_queue(q, memflags);
if (e) if (e) {
blk_mq_free_sched_tags(e->et, q->tag_set);
kobject_put(&e->kobj); kobject_put(&e->kobj);
}
} }
static int elevator_change_done(struct request_queue *q, static int elevator_change_done(struct request_queue *q,
@ -641,6 +644,7 @@ static int elevator_change_done(struct request_queue *q,
&ctx->old->flags); &ctx->old->flags);
elv_unregister_queue(q, ctx->old); elv_unregister_queue(q, ctx->old);
blk_mq_free_sched_tags(ctx->old->et, q->tag_set);
kobject_put(&ctx->old->kobj); kobject_put(&ctx->old->kobj);
if (enable_wbt) if (enable_wbt)
wbt_enable_default(q->disk); wbt_enable_default(q->disk);
@ -659,9 +663,16 @@ static int elevator_change_done(struct request_queue *q,
static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
{ {
unsigned int memflags; unsigned int memflags;
struct blk_mq_tag_set *set = q->tag_set;
int ret = 0; int ret = 0;
lockdep_assert_held(&q->tag_set->update_nr_hwq_lock); lockdep_assert_held(&set->update_nr_hwq_lock);
if (strncmp(ctx->name, "none", 4)) {
ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
if (!ctx->et)
return -ENOMEM;
}
memflags = blk_mq_freeze_queue(q); memflags = blk_mq_freeze_queue(q);
/* /*
@ -681,6 +692,11 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
blk_mq_unfreeze_queue(q, memflags); blk_mq_unfreeze_queue(q, memflags);
if (!ret) if (!ret)
ret = elevator_change_done(q, ctx); ret = elevator_change_done(q, ctx);
/*
* Free sched tags if it's allocated but we couldn't switch elevator.
*/
if (ctx->et && !ctx->new)
blk_mq_free_sched_tags(ctx->et, set);
return ret; return ret;
} }
@ -689,8 +705,10 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
* The I/O scheduler depends on the number of hardware queues, this forces a * The I/O scheduler depends on the number of hardware queues, this forces a
* reattachment when nr_hw_queues changes. * reattachment when nr_hw_queues changes.
*/ */
void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e) void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
struct elevator_tags *t)
{ {
struct blk_mq_tag_set *set = q->tag_set;
struct elv_change_ctx ctx = {}; struct elv_change_ctx ctx = {};
int ret = -ENODEV; int ret = -ENODEV;
@ -698,6 +716,7 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
if (e && !blk_queue_dying(q) && blk_queue_registered(q)) { if (e && !blk_queue_dying(q) && blk_queue_registered(q)) {
ctx.name = e->elevator_name; ctx.name = e->elevator_name;
ctx.et = t;
mutex_lock(&q->elevator_lock); mutex_lock(&q->elevator_lock);
/* force to reattach elevator after nr_hw_queue is updated */ /* force to reattach elevator after nr_hw_queue is updated */
@ -707,6 +726,11 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
blk_mq_unfreeze_queue_nomemrestore(q); blk_mq_unfreeze_queue_nomemrestore(q);
if (!ret) if (!ret)
WARN_ON_ONCE(elevator_change_done(q, &ctx)); WARN_ON_ONCE(elevator_change_done(q, &ctx));
/*
* Free sched tags if it's allocated but we couldn't switch elevator.
*/
if (t && !ctx.new)
blk_mq_free_sched_tags(t, set);
} }
/* /*


@ -23,8 +23,17 @@ enum elv_merge {
struct blk_mq_alloc_data; struct blk_mq_alloc_data;
struct blk_mq_hw_ctx; struct blk_mq_hw_ctx;
struct elevator_tags {
/* num. of hardware queues for which tags are allocated */
unsigned int nr_hw_queues;
/* depth used while allocating tags */
unsigned int nr_requests;
/* shared tag is stored at index 0 */
struct blk_mq_tags *tags[];
};
struct elevator_mq_ops { struct elevator_mq_ops {
int (*init_sched)(struct request_queue *, struct elevator_type *); int (*init_sched)(struct request_queue *, struct elevator_queue *);
void (*exit_sched)(struct elevator_queue *); void (*exit_sched)(struct elevator_queue *);
int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
@ -113,6 +122,7 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
struct elevator_queue struct elevator_queue
{ {
struct elevator_type *type; struct elevator_type *type;
struct elevator_tags *et;
void *elevator_data; void *elevator_data;
struct kobject kobj; struct kobject kobj;
struct mutex sysfs_lock; struct mutex sysfs_lock;
@ -152,8 +162,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *page);
ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count);
extern bool elv_bio_merge_ok(struct request *, struct bio *); extern bool elv_bio_merge_ok(struct request *, struct bio *);
extern struct elevator_queue *elevator_alloc(struct request_queue *, struct elevator_queue *elevator_alloc(struct request_queue *,
struct elevator_type *); struct elevator_type *, struct elevator_tags *);
/* /*
* Helper functions. * Helper functions.


@ -157,10 +157,7 @@ struct kyber_queue_data {
*/ */
struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
/* /* Number of allowed async requests. */
* Async request percentage, converted to per-word depth for
* sbitmap_get_shallow().
*/
unsigned int async_depth; unsigned int async_depth;
struct kyber_cpu_latency __percpu *cpu_latency; struct kyber_cpu_latency __percpu *cpu_latency;
@ -402,20 +399,13 @@ err:
return ERR_PTR(ret); return ERR_PTR(ret);
} }
static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq)
{ {
struct kyber_queue_data *kqd; struct kyber_queue_data *kqd;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
kqd = kyber_queue_data_alloc(q); kqd = kyber_queue_data_alloc(q);
if (IS_ERR(kqd)) { if (IS_ERR(kqd))
kobject_put(&eq->kobj);
return PTR_ERR(kqd); return PTR_ERR(kqd);
}
blk_stat_enable_accounting(q); blk_stat_enable_accounting(q);
@ -454,10 +444,8 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
{ {
struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags; struct blk_mq_tags *tags = hctx->sched_tags;
unsigned int shift = tags->bitmap_tags.sb.shift;
kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U;
sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
} }
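
kyber_depth_updated() above now computes the async depth from the queue's request count rather than from the sbitmap word size. The arithmetic, shown standalone (KYBER_ASYNC_PERCENT is assumed to be 75 here; nr_requests = 256 is an arbitrary example):

#include <stdio.h>

#define KYBER_ASYNC_PERCENT 75  /* assumed value of the kernel constant */

int main(void)
{
        unsigned int nr_requests = 256;
        unsigned int async_depth = nr_requests * KYBER_ASYNC_PERCENT / 100U;

        /* old code budgeted per sbitmap word: (1U << sb.shift) * 75 / 100;
         * new code budgets against the whole request pool */
        printf("async_depth = %u of %u requests\n", async_depth, nr_requests);
        return 0;
}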


@ -487,20 +487,6 @@ unlock:
return rq; return rq;
} }
/*
* 'depth' is a number in the range 1..INT_MAX representing a number of
* requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since
* 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow().
* Values larger than q->nr_requests have the same effect as q->nr_requests.
*/
static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth)
{
struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags;
const unsigned int nrr = hctx->queue->nr_requests;
return ((qdepth << bt->sb.shift) + nrr - 1) / nrr;
}
/* /*
* Called by __blk_mq_alloc_request(). The shallow_depth value set by this * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
* function is used by __blk_mq_get_tag(). * function is used by __blk_mq_get_tag().
@ -517,7 +503,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
* Throttle asynchronous requests and writes such that these requests * Throttle asynchronous requests and writes such that these requests
* do not block the allocation of synchronous requests. * do not block the allocation of synchronous requests.
*/ */
data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth); data->shallow_depth = dd->async_depth;
} }
/* Called by blk_mq_update_nr_requests(). */ /* Called by blk_mq_update_nr_requests(). */
@ -568,20 +554,14 @@ static void dd_exit_sched(struct elevator_queue *e)
/* /*
* initialize elevator private data (deadline_data). * initialize elevator private data (deadline_data).
*/ */
static int dd_init_sched(struct request_queue *q, struct elevator_type *e) static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq)
{ {
struct deadline_data *dd; struct deadline_data *dd;
struct elevator_queue *eq;
enum dd_prio prio; enum dd_prio prio;
int ret = -ENOMEM;
eq = elevator_alloc(q, e);
if (!eq)
return ret;
dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
if (!dd) if (!dd)
goto put_eq; return -ENOMEM;
eq->elevator_data = dd; eq->elevator_data = dd;
@ -608,10 +588,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
q->elevator = eq; q->elevator = eq;
return 0; return 0;
put_eq:
kobject_put(&eq->kobj);
return ret;
} }
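
The removed dd_to_word_depth() above scaled a request count into the per-word range that sbitmap_get_shallow() used to expect; with the sbitmap change in this series the raw request count is passed through instead. A standalone comparison of the two values (numbers are an arbitrary example):

#include <stdio.h>

int main(void)
{
        /* example: 256 scheduler tags, 64-bit sbitmap words (shift = 6),
         * async_depth throttled to 192 requests */
        unsigned int nr_requests = 256;
        unsigned int sb_shift = 6;
        unsigned int async_depth = 192;

        /* old mapping, as in the deleted helper */
        unsigned int word_depth =
                ((async_depth << sb_shift) + nr_requests - 1) / nr_requests;

        printf("old: %u of %u bits usable per sbitmap word\n",
               word_depth, 1U << sb_shift);
        printf("new: shallow depth of %u out of %u requests\n",
               async_depth, nr_requests);
        return 0;
}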
/* /*

View file

@ -700,6 +700,8 @@ static void zloop_free_disk(struct gendisk *disk)
struct zloop_device *zlo = disk->private_data; struct zloop_device *zlo = disk->private_data;
unsigned int i; unsigned int i;
blk_mq_free_tag_set(&zlo->tag_set);
for (i = 0; i < zlo->nr_zones; i++) { for (i = 0; i < zlo->nr_zones; i++) {
struct zloop_zone *zone = &zlo->zones[i]; struct zloop_zone *zone = &zlo->zones[i];
@ -1080,7 +1082,6 @@ static int zloop_ctl_remove(struct zloop_options *opts)
del_gendisk(zlo->disk); del_gendisk(zlo->disk);
put_disk(zlo->disk); put_disk(zlo->disk);
blk_mq_free_tag_set(&zlo->tag_set);
pr_info("Removed device %d\n", opts->id); pr_info("Removed device %d\n", opts->id);


@ -438,7 +438,7 @@ static bool rs_is_reshapable(struct raid_set *rs)
/* Return true, if raid set in @rs is recovering */ /* Return true, if raid set in @rs is recovering */
static bool rs_is_recovering(struct raid_set *rs) static bool rs_is_recovering(struct raid_set *rs)
{ {
return rs->md.recovery_cp < rs->md.dev_sectors; return rs->md.resync_offset < rs->md.dev_sectors;
} }
/* Return true, if raid set in @rs is reshaping */ /* Return true, if raid set in @rs is reshaping */
@ -768,7 +768,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
rs->md.layout = raid_type->algorithm; rs->md.layout = raid_type->algorithm;
rs->md.new_layout = rs->md.layout; rs->md.new_layout = rs->md.layout;
rs->md.delta_disks = 0; rs->md.delta_disks = 0;
rs->md.recovery_cp = MaxSector; rs->md.resync_offset = MaxSector;
for (i = 0; i < raid_devs; i++) for (i = 0; i < raid_devs; i++)
md_rdev_init(&rs->dev[i].rdev); md_rdev_init(&rs->dev[i].rdev);
@ -912,7 +912,7 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
rs->md.external = 0; rs->md.external = 0;
rs->md.persistent = 1; rs->md.persistent = 1;
rs->md.major_version = 2; rs->md.major_version = 2;
} else if (rebuild && !rs->md.recovery_cp) { } else if (rebuild && !rs->md.resync_offset) {
/* /*
* Without metadata, we will not be able to tell if the array * Without metadata, we will not be able to tell if the array
* is in-sync or not - we must assume it is not. Therefore, * is in-sync or not - we must assume it is not. Therefore,
@ -1695,20 +1695,20 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
{ {
/* raid0 does not recover */ /* raid0 does not recover */
if (rs_is_raid0(rs)) if (rs_is_raid0(rs))
rs->md.recovery_cp = MaxSector; rs->md.resync_offset = MaxSector;
/* /*
* A raid6 set has to be recovered either * A raid6 set has to be recovered either
* completely or for the grown part to * completely or for the grown part to
* ensure proper parity and Q-Syndrome * ensure proper parity and Q-Syndrome
*/ */
else if (rs_is_raid6(rs)) else if (rs_is_raid6(rs))
rs->md.recovery_cp = dev_sectors; rs->md.resync_offset = dev_sectors;
/* /*
* Other raid set types may skip recovery * Other raid set types may skip recovery
* depending on the 'nosync' flag. * depending on the 'nosync' flag.
*/ */
else else
rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) rs->md.resync_offset = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)
? MaxSector : dev_sectors; ? MaxSector : dev_sectors;
} }
@ -2143,7 +2143,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->events = cpu_to_le64(mddev->events); sb->events = cpu_to_le64(mddev->events);
sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); sb->array_resync_offset = cpu_to_le64(mddev->resync_offset);
sb->level = cpu_to_le32(mddev->level); sb->level = cpu_to_le32(mddev->level);
sb->layout = cpu_to_le32(mddev->layout); sb->layout = cpu_to_le32(mddev->layout);
@ -2334,18 +2334,18 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
} }
if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); mddev->resync_offset = le64_to_cpu(sb->array_resync_offset);
/* /*
* During load, we set FirstUse if a new superblock was written. * During load, we set FirstUse if a new superblock was written.
* There are two reasons we might not have a superblock: * There are two reasons we might not have a superblock:
* 1) The raid set is brand new - in which case, all of the * 1) The raid set is brand new - in which case, all of the
* devices must have their In_sync bit set. Also, * devices must have their In_sync bit set. Also,
* recovery_cp must be 0, unless forced. * resync_offset must be 0, unless forced.
* 2) This is a new device being added to an old raid set * 2) This is a new device being added to an old raid set
* and the new device needs to be rebuilt - in which * and the new device needs to be rebuilt - in which
* case the In_sync bit will /not/ be set and * case the In_sync bit will /not/ be set and
* recovery_cp must be MaxSector. * resync_offset must be MaxSector.
* 3) This is/are a new device(s) being added to an old * 3) This is/are a new device(s) being added to an old
* raid set during takeover to a higher raid level * raid set during takeover to a higher raid level
* to provide capacity for redundancy or during reshape * to provide capacity for redundancy or during reshape
@ -2390,8 +2390,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
new_devs > 1 ? "s" : ""); new_devs > 1 ? "s" : "");
return -EINVAL; return -EINVAL;
} else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) { } else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) {
DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", DMERR("'rebuild' specified while raid set is not in-sync (resync_offset=%llu)",
(unsigned long long) mddev->recovery_cp); (unsigned long long) mddev->resync_offset);
return -EINVAL; return -EINVAL;
} else if (rs_is_reshaping(rs)) { } else if (rs_is_reshaping(rs)) {
DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)", DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)",
@ -2700,11 +2700,11 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
} }
out: out:
/* /*
* Raise recovery_cp in case data_offset != 0 to * Raise resync_offset in case data_offset != 0 to
* avoid false recovery positives in the constructor. * avoid false recovery positives in the constructor.
*/ */
if (rs->md.recovery_cp < rs->md.dev_sectors) if (rs->md.resync_offset < rs->md.dev_sectors)
rs->md.recovery_cp += rs->dev[0].rdev.data_offset; rs->md.resync_offset += rs->dev[0].rdev.data_offset;
/* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
rdev_for_each(rdev, &rs->md) { rdev_for_each(rdev, &rs->md) {
@ -2759,7 +2759,7 @@ static int rs_setup_takeover(struct raid_set *rs)
} }
clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags); clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
mddev->recovery_cp = MaxSector; mddev->resync_offset = MaxSector;
while (d--) { while (d--) {
rdev = &rs->dev[d].rdev; rdev = &rs->dev[d].rdev;
@ -2767,7 +2767,7 @@ static int rs_setup_takeover(struct raid_set *rs)
if (test_bit(d, (void *) rs->rebuild_disks)) { if (test_bit(d, (void *) rs->rebuild_disks)) {
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
clear_bit(Faulty, &rdev->flags); clear_bit(Faulty, &rdev->flags);
mddev->recovery_cp = rdev->recovery_offset = 0; mddev->resync_offset = rdev->recovery_offset = 0;
/* Bitmap has to be created when we do an "up" takeover */ /* Bitmap has to be created when we do an "up" takeover */
set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
} }
@ -3225,7 +3225,7 @@ size_check:
if (r) if (r)
goto bad; goto bad;
rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors); rs_setup_recovery(rs, rs->md.resync_offset < rs->md.dev_sectors ? rs->md.resync_offset : rs->md.dev_sectors);
} else { } else {
/* This is no size change or it is shrinking, update size and record in superblocks */ /* This is no size change or it is shrinking, update size and record in superblocks */
r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false);
@ -3449,7 +3449,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
} else { } else {
if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery)) if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery))
r = mddev->recovery_cp; r = mddev->resync_offset;
else else
r = mddev->curr_resync_completed; r = mddev->curr_resync_completed;
@ -4077,9 +4077,9 @@ static int raid_preresume(struct dm_target *ti)
} }
/* Check for any resize/reshape on @rs and adjust/initiate */ /* Check for any resize/reshape on @rs and adjust/initiate */
if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { if (mddev->resync_offset && mddev->resync_offset < MaxSector) {
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
mddev->resync_min = mddev->recovery_cp; mddev->resync_min = mddev->resync_offset;
if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags))
mddev->resync_max_sectors = mddev->dev_sectors; mddev->resync_max_sectors = mddev->dev_sectors;
} }


@ -1987,12 +1987,12 @@ static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s,
md_bitmap_set_memory_bits(bitmap, sec, 1); md_bitmap_set_memory_bits(bitmap, sec, 1);
md_bitmap_file_set_bit(bitmap, sec); md_bitmap_file_set_bit(bitmap, sec);
if (sec < bitmap->mddev->recovery_cp) if (sec < bitmap->mddev->resync_offset)
/* We are asserting that the array is dirty, /* We are asserting that the array is dirty,
* so move the recovery_cp address back so * so move the resync_offset address back so
* that it is obvious that it is dirty * that it is obvious that it is dirty
*/ */
bitmap->mddev->recovery_cp = sec; bitmap->mddev->resync_offset = sec;
} }
} }
@ -2258,7 +2258,7 @@ static int bitmap_load(struct mddev *mddev)
|| bitmap->events_cleared == mddev->events) || bitmap->events_cleared == mddev->events)
/* no need to keep dirty bits to optimise a /* no need to keep dirty bits to optimise a
* re-add of a missing device */ * re-add of a missing device */
start = mddev->recovery_cp; start = mddev->resync_offset;
mutex_lock(&mddev->bitmap_info.mutex); mutex_lock(&mddev->bitmap_info.mutex);
err = md_bitmap_init_from_disk(bitmap, start); err = md_bitmap_init_from_disk(bitmap, start);


@ -337,11 +337,11 @@ static void recover_bitmaps(struct md_thread *thread)
md_wakeup_thread(mddev->sync_thread); md_wakeup_thread(mddev->sync_thread);
if (hi > 0) { if (hi > 0) {
if (lo < mddev->recovery_cp) if (lo < mddev->resync_offset)
mddev->recovery_cp = lo; mddev->resync_offset = lo;
/* wake up thread to continue resync in case resync /* wake up thread to continue resync in case resync
* is not finished */ * is not finished */
if (mddev->recovery_cp != MaxSector) { if (mddev->resync_offset != MaxSector) {
/* /*
* clear the REMOTE flag since we will launch * clear the REMOTE flag since we will launch
* resync thread in current node. * resync thread in current node.
@ -863,9 +863,9 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
lockres_free(bm_lockres); lockres_free(bm_lockres);
continue; continue;
} }
if ((hi > 0) && (lo < mddev->recovery_cp)) { if ((hi > 0) && (lo < mddev->resync_offset)) {
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
mddev->recovery_cp = lo; mddev->resync_offset = lo;
md_check_recovery(mddev); md_check_recovery(mddev);
} }
@ -1027,7 +1027,7 @@ static int leave(struct mddev *mddev)
* Also, we should send BITMAP_NEEDS_SYNC message in * Also, we should send BITMAP_NEEDS_SYNC message in
* case reshaping is interrupted. * case reshaping is interrupted.
*/ */
if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) || if ((cinfo->slot_number > 0 && mddev->resync_offset != MaxSector) ||
(mddev->reshape_position != MaxSector && (mddev->reshape_position != MaxSector &&
test_bit(MD_CLOSING, &mddev->flags))) test_bit(MD_CLOSING, &mddev->flags)))
resync_bitmap(mddev); resync_bitmap(mddev);
@ -1605,8 +1605,8 @@ static int gather_bitmaps(struct md_rdev *rdev)
pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
goto out; goto out;
} }
if ((hi > 0) && (lo < mddev->recovery_cp)) if ((hi > 0) && (lo < mddev->resync_offset))
mddev->recovery_cp = lo; mddev->resync_offset = lo;
} }
out: out:
return err; return err;


@ -636,6 +636,12 @@ static void __mddev_put(struct mddev *mddev)
mddev->ctime || mddev->hold_active) mddev->ctime || mddev->hold_active)
return; return;
/*
* If array is freed by stopping array, MD_DELETED is set by
* do_md_stop(), MD_DELETED is still set here in case mddev is freed
* directly by closing a mddev that is created by create_on_open.
*/
set_bit(MD_DELETED, &mddev->flags);
/* /*
* Call queue_work inside the spinlock so that flush_workqueue() after * Call queue_work inside the spinlock so that flush_workqueue() after
* mddev_find will succeed in waiting for the work to be done. * mddev_find will succeed in waiting for the work to be done.
@ -1409,13 +1415,13 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru
mddev->layout = -1; mddev->layout = -1;
if (sb->state & (1<<MD_SB_CLEAN)) if (sb->state & (1<<MD_SB_CLEAN))
mddev->recovery_cp = MaxSector; mddev->resync_offset = MaxSector;
else { else {
if (sb->events_hi == sb->cp_events_hi && if (sb->events_hi == sb->cp_events_hi &&
sb->events_lo == sb->cp_events_lo) { sb->events_lo == sb->cp_events_lo) {
mddev->recovery_cp = sb->recovery_cp; mddev->resync_offset = sb->resync_offset;
} else } else
mddev->recovery_cp = 0; mddev->resync_offset = 0;
} }
memcpy(mddev->uuid+0, &sb->set_uuid0, 4); memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
@ -1541,13 +1547,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
mddev->minor_version = sb->minor_version; mddev->minor_version = sb->minor_version;
if (mddev->in_sync) if (mddev->in_sync)
{ {
sb->recovery_cp = mddev->recovery_cp; sb->resync_offset = mddev->resync_offset;
sb->cp_events_hi = (mddev->events>>32); sb->cp_events_hi = (mddev->events>>32);
sb->cp_events_lo = (u32)mddev->events; sb->cp_events_lo = (u32)mddev->events;
if (mddev->recovery_cp == MaxSector) if (mddev->resync_offset == MaxSector)
sb->state = (1<< MD_SB_CLEAN); sb->state = (1<< MD_SB_CLEAN);
} else } else
sb->recovery_cp = 0; sb->resync_offset = 0;
sb->layout = mddev->layout; sb->layout = mddev->layout;
sb->chunk_size = mddev->chunk_sectors << 9; sb->chunk_size = mddev->chunk_sectors << 9;
@ -1895,7 +1901,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
mddev->bitmap_info.default_space = (4096-1024) >> 9; mddev->bitmap_info.default_space = (4096-1024) >> 9;
mddev->reshape_backwards = 0; mddev->reshape_backwards = 0;
mddev->recovery_cp = le64_to_cpu(sb->resync_offset); mddev->resync_offset = le64_to_cpu(sb->resync_offset);
memcpy(mddev->uuid, sb->set_uuid, 16); memcpy(mddev->uuid, sb->set_uuid, 16);
mddev->max_disks = (4096-256)/2; mddev->max_disks = (4096-256)/2;
@ -2081,7 +2087,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->utime = cpu_to_le64((__u64)mddev->utime); sb->utime = cpu_to_le64((__u64)mddev->utime);
sb->events = cpu_to_le64(mddev->events); sb->events = cpu_to_le64(mddev->events);
if (mddev->in_sync) if (mddev->in_sync)
sb->resync_offset = cpu_to_le64(mddev->recovery_cp); sb->resync_offset = cpu_to_le64(mddev->resync_offset);
else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
sb->resync_offset = cpu_to_le64(MaxSector); sb->resync_offset = cpu_to_le64(MaxSector);
else else
@ -2761,7 +2767,7 @@ repeat:
/* If this is just a dirty<->clean transition, and the array is clean /* If this is just a dirty<->clean transition, and the array is clean
* and 'events' is odd, we can roll back to the previous clean state */ * and 'events' is odd, we can roll back to the previous clean state */
if (nospares if (nospares
&& (mddev->in_sync && mddev->recovery_cp == MaxSector) && (mddev->in_sync && mddev->resync_offset == MaxSector)
&& mddev->can_decrease_events && mddev->can_decrease_events
&& mddev->events != 1) { && mddev->events != 1) {
mddev->events--; mddev->events--;
@ -4297,9 +4303,9 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
static ssize_t static ssize_t
resync_start_show(struct mddev *mddev, char *page) resync_start_show(struct mddev *mddev, char *page)
{ {
if (mddev->recovery_cp == MaxSector) if (mddev->resync_offset == MaxSector)
return sprintf(page, "none\n"); return sprintf(page, "none\n");
return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset);
} }
static ssize_t static ssize_t
@ -4325,7 +4331,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len)
err = -EBUSY; err = -EBUSY;
if (!err) { if (!err) {
mddev->recovery_cp = n; mddev->resync_offset = n;
if (mddev->pers) if (mddev->pers)
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
} }
@ -6417,7 +6423,7 @@ static void md_clean(struct mddev *mddev)
mddev->external_size = 0; mddev->external_size = 0;
mddev->dev_sectors = 0; mddev->dev_sectors = 0;
mddev->raid_disks = 0; mddev->raid_disks = 0;
mddev->recovery_cp = 0; mddev->resync_offset = 0;
mddev->resync_min = 0; mddev->resync_min = 0;
mddev->resync_max = MaxSector; mddev->resync_max = MaxSector;
mddev->reshape_position = MaxSector; mddev->reshape_position = MaxSector;
@ -7362,9 +7368,9 @@ int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
* openned * openned
*/ */
if (info->state & (1<<MD_SB_CLEAN)) if (info->state & (1<<MD_SB_CLEAN))
mddev->recovery_cp = MaxSector; mddev->resync_offset = MaxSector;
else else
mddev->recovery_cp = 0; mddev->resync_offset = 0;
mddev->persistent = ! info->not_persistent; mddev->persistent = ! info->not_persistent;
mddev->external = 0; mddev->external = 0;
@ -8303,7 +8309,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, "\tresync=REMOTE"); seq_printf(seq, "\tresync=REMOTE");
return 1; return 1;
} }
if (mddev->recovery_cp < MaxSector) { if (mddev->resync_offset < MaxSector) {
seq_printf(seq, "\tresync=PENDING"); seq_printf(seq, "\tresync=PENDING");
return 1; return 1;
} }
@ -8946,7 +8952,7 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
return mddev->resync_min; return mddev->resync_min;
case ACTION_RESYNC: case ACTION_RESYNC:
if (!mddev->bitmap) if (!mddev->bitmap)
return mddev->recovery_cp; return mddev->resync_offset;
return 0; return 0;
case ACTION_RESHAPE: case ACTION_RESHAPE:
/* /*
@ -9184,8 +9190,8 @@ void md_do_sync(struct md_thread *thread)
atomic_read(&mddev->recovery_active) == 0); atomic_read(&mddev->recovery_active) == 0);
mddev->curr_resync_completed = j; mddev->curr_resync_completed = j;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
j > mddev->recovery_cp) j > mddev->resync_offset)
mddev->recovery_cp = j; mddev->resync_offset = j;
update_time = jiffies; update_time = jiffies;
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
sysfs_notify_dirent_safe(mddev->sysfs_completed); sysfs_notify_dirent_safe(mddev->sysfs_completed);
@ -9305,19 +9311,19 @@ void md_do_sync(struct md_thread *thread)
mddev->curr_resync > MD_RESYNC_ACTIVE) { mddev->curr_resync > MD_RESYNC_ACTIVE) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
if (mddev->curr_resync >= mddev->recovery_cp) { if (mddev->curr_resync >= mddev->resync_offset) {
pr_debug("md: checkpointing %s of %s.\n", pr_debug("md: checkpointing %s of %s.\n",
desc, mdname(mddev)); desc, mdname(mddev));
if (test_bit(MD_RECOVERY_ERROR, if (test_bit(MD_RECOVERY_ERROR,
&mddev->recovery)) &mddev->recovery))
mddev->recovery_cp = mddev->resync_offset =
mddev->curr_resync_completed; mddev->curr_resync_completed;
else else
mddev->recovery_cp = mddev->resync_offset =
mddev->curr_resync; mddev->curr_resync;
} }
} else } else
mddev->recovery_cp = MaxSector; mddev->resync_offset = MaxSector;
} else { } else {
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
mddev->curr_resync = MaxSector; mddev->curr_resync = MaxSector;
@ -9421,6 +9427,12 @@ static bool rdev_is_spare(struct md_rdev *rdev)
static bool rdev_addable(struct md_rdev *rdev) static bool rdev_addable(struct md_rdev *rdev)
{ {
struct mddev *mddev;
mddev = READ_ONCE(rdev->mddev);
if (!mddev)
return false;
/* rdev is already used, don't add it again. */ /* rdev is already used, don't add it again. */
if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
test_bit(Faulty, &rdev->flags)) test_bit(Faulty, &rdev->flags))
@ -9431,7 +9443,7 @@ static bool rdev_addable(struct md_rdev *rdev)
return true; return true;
/* Allow to add if array is read-write. */ /* Allow to add if array is read-write. */
if (md_is_rdwr(rdev->mddev)) if (md_is_rdwr(mddev))
return true; return true;
/* /*
@ -9533,7 +9545,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
} }
/* Check if resync is in progress. */ /* Check if resync is in progress. */
if (mddev->recovery_cp < MaxSector) { if (mddev->resync_offset < MaxSector) {
remove_spares(mddev, NULL); remove_spares(mddev, NULL);
set_bit(MD_RECOVERY_SYNC, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
@ -9714,7 +9726,7 @@ void md_check_recovery(struct mddev *mddev)
test_bit(MD_RECOVERY_DONE, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
(mddev->external == 0 && mddev->safemode == 1) || (mddev->external == 0 && mddev->safemode == 1) ||
(mddev->safemode == 2 (mddev->safemode == 2
&& !mddev->in_sync && mddev->recovery_cp == MaxSector) && !mddev->in_sync && mddev->resync_offset == MaxSector)
)) ))
return; return;
@ -9771,8 +9783,8 @@ void md_check_recovery(struct mddev *mddev)
* remove disk. * remove disk.
*/ */
rdev_for_each_safe(rdev, tmp, mddev) { rdev_for_each_safe(rdev, tmp, mddev) {
if (test_and_clear_bit(ClusterRemove, &rdev->flags) && if (rdev->raid_disk < 0 &&
rdev->raid_disk < 0) test_and_clear_bit(ClusterRemove, &rdev->flags))
md_kick_rdev_from_array(rdev); md_kick_rdev_from_array(rdev);
} }
} }
@ -10078,8 +10090,11 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
/* Check for change of roles in the active devices */ /* Check for change of roles in the active devices */
rdev_for_each_safe(rdev2, tmp, mddev) { rdev_for_each_safe(rdev2, tmp, mddev) {
if (test_bit(Faulty, &rdev2->flags)) if (test_bit(Faulty, &rdev2->flags)) {
if (test_bit(ClusterRemove, &rdev2->flags))
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
continue; continue;
}
/* Check if the roles changed */ /* Check if the roles changed */
role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);


@ -523,7 +523,7 @@ struct mddev {
unsigned long normal_io_events; /* IO event timestamp */ unsigned long normal_io_events; /* IO event timestamp */
atomic_t recovery_active; /* blocks scheduled, but not written */ atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait; wait_queue_head_t recovery_wait;
sector_t recovery_cp; sector_t resync_offset;
sector_t resync_min; /* user requested sync sector_t resync_min; /* user requested sync
* starts here */ * starts here */
sector_t resync_max; /* resync should pause sector_t resync_max; /* resync should pause


@ -674,7 +674,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
mddev->raid_disks--; mddev->raid_disks--;
mddev->delta_disks = -1; mddev->delta_disks = -1;
/* make sure it will be not marked as dirty */ /* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector; mddev->resync_offset = MaxSector;
mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
create_strip_zones(mddev, &priv_conf); create_strip_zones(mddev, &priv_conf);
@ -717,7 +717,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
mddev->raid_disks += mddev->delta_disks; mddev->raid_disks += mddev->delta_disks;
mddev->degraded = 0; mddev->degraded = 0;
/* make sure it will be not marked as dirty */ /* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector; mddev->resync_offset = MaxSector;
mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
create_strip_zones(mddev, &priv_conf); create_strip_zones(mddev, &priv_conf);
@ -760,7 +760,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
mddev->delta_disks = 1 - mddev->raid_disks; mddev->delta_disks = 1 - mddev->raid_disks;
mddev->raid_disks = 1; mddev->raid_disks = 1;
/* make sure it will be not marked as dirty */ /* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector; mddev->resync_offset = MaxSector;
mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
create_strip_zones(mddev, &priv_conf); create_strip_zones(mddev, &priv_conf);


@ -283,7 +283,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev,
static inline bool raid1_should_read_first(struct mddev *mddev, static inline bool raid1_should_read_first(struct mddev *mddev,
sector_t this_sector, int len) sector_t this_sector, int len)
{ {
if ((mddev->recovery_cp < this_sector + len)) if ((mddev->resync_offset < this_sector + len))
return true; return true;
if (mddev_is_clustered(mddev) && if (mddev_is_clustered(mddev) &&


@ -127,10 +127,9 @@ static inline struct r1bio *get_resync_r1bio(struct bio *bio)
return get_resync_pages(bio)->raid_bio; return get_resync_pages(bio)->raid_bio;
} }
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) static void *r1bio_pool_alloc(gfp_t gfp_flags, struct r1conf *conf)
{ {
struct pool_info *pi = data; int size = offsetof(struct r1bio, bios[conf->raid_disks * 2]);
int size = offsetof(struct r1bio, bios[pi->raid_disks]);
/* allocate a r1bio with room for raid_disks entries in the bios array */ /* allocate a r1bio with room for raid_disks entries in the bios array */
return kzalloc(size, gfp_flags); return kzalloc(size, gfp_flags);
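
The rewritten allocator above sizes each r1bio directly from conf->raid_disks via offsetof() over the trailing bios[] array, instead of going through the old pool_info bookkeeping. A minimal stand-alone sketch of that sizing pattern (the demo_* names are hypothetical, not the raid1 structures; GCC and Clang accept the non-constant index inside offsetof(), which the kernel code relies on):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_bio {
	int dummy;
};

/* Trailing array sized per allocation, like r1bio's bios[] above. */
struct demo_r1bio {
	unsigned long sector;
	struct demo_bio *bios[];
};

static struct demo_r1bio *demo_r1bio_alloc(int raid_disks)
{
	/* One slot per disk plus one per replacement: 2 * raid_disks. */
	size_t size = offsetof(struct demo_r1bio, bios[raid_disks * 2]);

	printf("allocating %zu bytes for %d bio slots\n", size, raid_disks * 2);
	return calloc(1, size);	/* calloc() stands in for kzalloc() */
}

int main(void)
{
	struct demo_r1bio *r1bio = demo_r1bio_alloc(4);

	free(r1bio);
	return 0;
}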
@ -145,18 +144,18 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
{ {
struct pool_info *pi = data; struct r1conf *conf = data;
struct r1bio *r1_bio; struct r1bio *r1_bio;
struct bio *bio; struct bio *bio;
int need_pages; int need_pages;
int j; int j;
struct resync_pages *rps; struct resync_pages *rps;
r1_bio = r1bio_pool_alloc(gfp_flags, pi); r1_bio = r1bio_pool_alloc(gfp_flags, conf);
if (!r1_bio) if (!r1_bio)
return NULL; return NULL;
rps = kmalloc_array(pi->raid_disks, sizeof(struct resync_pages), rps = kmalloc_array(conf->raid_disks * 2, sizeof(struct resync_pages),
gfp_flags); gfp_flags);
if (!rps) if (!rps)
goto out_free_r1bio; goto out_free_r1bio;
@ -164,7 +163,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
/* /*
* Allocate bios : 1 for reading, n-1 for writing * Allocate bios : 1 for reading, n-1 for writing
*/ */
for (j = pi->raid_disks ; j-- ; ) { for (j = conf->raid_disks * 2; j-- ; ) {
bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
if (!bio) if (!bio)
goto out_free_bio; goto out_free_bio;
@ -177,11 +176,11 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
* If this is a user-requested check/repair, allocate * If this is a user-requested check/repair, allocate
* RESYNC_PAGES for each bio. * RESYNC_PAGES for each bio.
*/ */
if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) if (test_bit(MD_RECOVERY_REQUESTED, &conf->mddev->recovery))
need_pages = pi->raid_disks; need_pages = conf->raid_disks * 2;
else else
need_pages = 1; need_pages = 1;
for (j = 0; j < pi->raid_disks; j++) { for (j = 0; j < conf->raid_disks * 2; j++) {
struct resync_pages *rp = &rps[j]; struct resync_pages *rp = &rps[j];
bio = r1_bio->bios[j]; bio = r1_bio->bios[j];
@ -207,7 +206,7 @@ out_free_pages:
resync_free_pages(&rps[j]); resync_free_pages(&rps[j]);
out_free_bio: out_free_bio:
while (++j < pi->raid_disks) { while (++j < conf->raid_disks * 2) {
bio_uninit(r1_bio->bios[j]); bio_uninit(r1_bio->bios[j]);
kfree(r1_bio->bios[j]); kfree(r1_bio->bios[j]);
} }
@ -220,12 +219,12 @@ out_free_r1bio:
static void r1buf_pool_free(void *__r1_bio, void *data) static void r1buf_pool_free(void *__r1_bio, void *data)
{ {
struct pool_info *pi = data; struct r1conf *conf = data;
int i; int i;
struct r1bio *r1bio = __r1_bio; struct r1bio *r1bio = __r1_bio;
struct resync_pages *rp = NULL; struct resync_pages *rp = NULL;
for (i = pi->raid_disks; i--; ) { for (i = conf->raid_disks * 2; i--; ) {
rp = get_resync_pages(r1bio->bios[i]); rp = get_resync_pages(r1bio->bios[i]);
resync_free_pages(rp); resync_free_pages(rp);
bio_uninit(r1bio->bios[i]); bio_uninit(r1bio->bios[i]);
@ -255,7 +254,7 @@ static void free_r1bio(struct r1bio *r1_bio)
struct r1conf *conf = r1_bio->mddev->private; struct r1conf *conf = r1_bio->mddev->private;
put_all_bios(conf, r1_bio); put_all_bios(conf, r1_bio);
mempool_free(r1_bio, &conf->r1bio_pool); mempool_free(r1_bio, conf->r1bio_pool);
} }
static void put_buf(struct r1bio *r1_bio) static void put_buf(struct r1bio *r1_bio)
@ -1305,9 +1304,8 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio)
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
struct r1bio *r1_bio; struct r1bio *r1_bio;
r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO); r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
/* Ensure no bio records IO_BLOCKED */ memset(r1_bio, 0, offsetof(struct r1bio, bios[conf->raid_disks * 2]));
memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0]));
init_r1bio(r1_bio, mddev, bio); init_r1bio(r1_bio, mddev, bio);
return r1_bio; return r1_bio;
} }
@ -2747,7 +2745,7 @@ static int init_resync(struct r1conf *conf)
BUG_ON(mempool_initialized(&conf->r1buf_pool)); BUG_ON(mempool_initialized(&conf->r1buf_pool));
return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc, return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc,
r1buf_pool_free, conf->poolinfo); r1buf_pool_free, conf);
} }
static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
@ -2757,7 +2755,7 @@ static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
struct bio *bio; struct bio *bio;
int i; int i;
for (i = conf->poolinfo->raid_disks; i--; ) { for (i = conf->raid_disks * 2; i--; ) {
bio = r1bio->bios[i]; bio = r1bio->bios[i];
rps = bio->bi_private; rps = bio->bi_private;
bio_reset(bio, NULL, 0); bio_reset(bio, NULL, 0);
@ -2822,7 +2820,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
} }
if (mddev->bitmap == NULL && if (mddev->bitmap == NULL &&
mddev->recovery_cp == MaxSector && mddev->resync_offset == MaxSector &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
conf->fullsync == 0) { conf->fullsync == 0) {
*skipped = 1; *skipped = 1;
@ -3085,6 +3083,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
int i; int i;
struct raid1_info *disk; struct raid1_info *disk;
struct md_rdev *rdev; struct md_rdev *rdev;
size_t r1bio_size;
int err = -ENOMEM; int err = -ENOMEM;
conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL); conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
@ -3121,21 +3120,15 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!conf->tmppage) if (!conf->tmppage)
goto abort; goto abort;
conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); r1bio_size = offsetof(struct r1bio, bios[mddev->raid_disks * 2]);
if (!conf->poolinfo) conf->r1bio_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS, r1bio_size);
goto abort; if (!conf->r1bio_pool)
conf->poolinfo->raid_disks = mddev->raid_disks * 2;
err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc,
rbio_pool_free, conf->poolinfo);
if (err)
goto abort; goto abort;
err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
if (err) if (err)
goto abort; goto abort;
conf->poolinfo->mddev = mddev;
err = -EINVAL; err = -EINVAL;
spin_lock_init(&conf->device_lock); spin_lock_init(&conf->device_lock);
conf->raid_disks = mddev->raid_disks; conf->raid_disks = mddev->raid_disks;
@ -3198,10 +3191,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
abort: abort:
if (conf) { if (conf) {
mempool_exit(&conf->r1bio_pool); mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors); kfree(conf->mirrors);
safe_put_page(conf->tmppage); safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
kfree(conf->nr_pending); kfree(conf->nr_pending);
kfree(conf->nr_waiting); kfree(conf->nr_waiting);
kfree(conf->nr_queued); kfree(conf->nr_queued);
@ -3282,9 +3274,9 @@ static int raid1_run(struct mddev *mddev)
} }
if (conf->raid_disks - mddev->degraded == 1) if (conf->raid_disks - mddev->degraded == 1)
mddev->recovery_cp = MaxSector; mddev->resync_offset = MaxSector;
if (mddev->recovery_cp != MaxSector) if (mddev->resync_offset != MaxSector)
pr_info("md/raid1:%s: not clean -- starting background reconstruction\n", pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
mdname(mddev)); mdname(mddev));
pr_info("md/raid1:%s: active with %d out of %d mirrors\n", pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
@ -3311,10 +3303,9 @@ static void raid1_free(struct mddev *mddev, void *priv)
{ {
struct r1conf *conf = priv; struct r1conf *conf = priv;
mempool_exit(&conf->r1bio_pool); mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors); kfree(conf->mirrors);
safe_put_page(conf->tmppage); safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
kfree(conf->nr_pending); kfree(conf->nr_pending);
kfree(conf->nr_waiting); kfree(conf->nr_waiting);
kfree(conf->nr_queued); kfree(conf->nr_queued);
@ -3345,8 +3336,8 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
md_set_array_sectors(mddev, newsize); md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors && if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) { mddev->resync_offset > mddev->dev_sectors) {
mddev->recovery_cp = mddev->dev_sectors; mddev->resync_offset = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
} }
mddev->dev_sectors = sectors; mddev->dev_sectors = sectors;
@ -3367,17 +3358,13 @@ static int raid1_reshape(struct mddev *mddev)
* At the same time, we "pack" the devices so that all the missing * At the same time, we "pack" the devices so that all the missing
* devices have the higher raid_disk numbers. * devices have the higher raid_disk numbers.
*/ */
mempool_t newpool, oldpool; mempool_t *newpool, *oldpool;
struct pool_info *newpoolinfo; size_t new_r1bio_size;
struct raid1_info *newmirrors; struct raid1_info *newmirrors;
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
int cnt, raid_disks; int cnt, raid_disks;
unsigned long flags; unsigned long flags;
int d, d2; int d, d2;
int ret;
memset(&newpool, 0, sizeof(newpool));
memset(&oldpool, 0, sizeof(oldpool));
/* Cannot change chunk_size, layout, or level */ /* Cannot change chunk_size, layout, or level */
if (mddev->chunk_sectors != mddev->new_chunk_sectors || if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@ -3403,24 +3390,16 @@ static int raid1_reshape(struct mddev *mddev)
return -EBUSY; return -EBUSY;
} }
newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); new_r1bio_size = offsetof(struct r1bio, bios[raid_disks * 2]);
if (!newpoolinfo) newpool = mempool_create_kmalloc_pool(NR_RAID_BIOS, new_r1bio_size);
if (!newpool) {
return -ENOMEM; return -ENOMEM;
newpoolinfo->mddev = mddev;
newpoolinfo->raid_disks = raid_disks * 2;
ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc,
rbio_pool_free, newpoolinfo);
if (ret) {
kfree(newpoolinfo);
return ret;
} }
newmirrors = kzalloc(array3_size(sizeof(struct raid1_info), newmirrors = kzalloc(array3_size(sizeof(struct raid1_info),
raid_disks, 2), raid_disks, 2),
GFP_KERNEL); GFP_KERNEL);
if (!newmirrors) { if (!newmirrors) {
kfree(newpoolinfo); mempool_destroy(newpool);
mempool_exit(&newpool);
return -ENOMEM; return -ENOMEM;
} }
@ -3429,7 +3408,6 @@ static int raid1_reshape(struct mddev *mddev)
/* ok, everything is stopped */ /* ok, everything is stopped */
oldpool = conf->r1bio_pool; oldpool = conf->r1bio_pool;
conf->r1bio_pool = newpool; conf->r1bio_pool = newpool;
init_waitqueue_head(&conf->r1bio_pool.wait);
for (d = d2 = 0; d < conf->raid_disks; d++) { for (d = d2 = 0; d < conf->raid_disks; d++) {
struct md_rdev *rdev = conf->mirrors[d].rdev; struct md_rdev *rdev = conf->mirrors[d].rdev;
@ -3446,8 +3424,6 @@ static int raid1_reshape(struct mddev *mddev)
} }
kfree(conf->mirrors); kfree(conf->mirrors);
conf->mirrors = newmirrors; conf->mirrors = newmirrors;
kfree(conf->poolinfo);
conf->poolinfo = newpoolinfo;
spin_lock_irqsave(&conf->device_lock, flags); spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded += (raid_disks - conf->raid_disks); mddev->degraded += (raid_disks - conf->raid_disks);
@ -3461,7 +3437,7 @@ static int raid1_reshape(struct mddev *mddev)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
mempool_exit(&oldpool); mempool_destroy(oldpool);
return 0; return 0;
} }


@ -49,22 +49,6 @@ struct raid1_info {
sector_t seq_start; sector_t seq_start;
}; };
/*
* memory pools need a pointer to the mddev, so they can force an unplug
* when memory is tight, and a count of the number of drives that the
* pool was allocated for, so they know how much to allocate and free.
* mddev->raid_disks cannot be used, as it can change while a pool is active
* These two datums are stored in a kmalloced struct.
* The 'raid_disks' here is twice the raid_disks in r1conf.
* This allows space for each 'real' device can have a replacement in the
* second half of the array.
*/
struct pool_info {
struct mddev *mddev;
int raid_disks;
};
struct r1conf { struct r1conf {
struct mddev *mddev; struct mddev *mddev;
struct raid1_info *mirrors; /* twice 'raid_disks' to struct raid1_info *mirrors; /* twice 'raid_disks' to
@ -114,11 +98,7 @@ struct r1conf {
*/ */
int recovery_disabled; int recovery_disabled;
/* poolinfo contains information about the content of the mempool_t *r1bio_pool;
* mempools - it changes when the array grows or shrinks
*/
struct pool_info *poolinfo;
mempool_t r1bio_pool;
mempool_t r1buf_pool; mempool_t r1buf_pool;
struct bio_set bio_split; struct bio_set bio_split;


@ -2117,7 +2117,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int last = conf->geo.raid_disks - 1; int last = conf->geo.raid_disks - 1;
struct raid10_info *p; struct raid10_info *p;
if (mddev->recovery_cp < MaxSector) if (mddev->resync_offset < MaxSector)
/* only hot-add to in-sync arrays, as recovery is /* only hot-add to in-sync arrays, as recovery is
* very different from resync * very different from resync
*/ */
@ -3185,7 +3185,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* of a clean array, like RAID1 does. * of a clean array, like RAID1 does.
*/ */
if (mddev->bitmap == NULL && if (mddev->bitmap == NULL &&
mddev->recovery_cp == MaxSector && mddev->resync_offset == MaxSector &&
mddev->reshape_position == MaxSector && mddev->reshape_position == MaxSector &&
!test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
@ -4145,7 +4145,7 @@ static int raid10_run(struct mddev *mddev)
disk->recovery_disabled = mddev->recovery_disabled - 1; disk->recovery_disabled = mddev->recovery_disabled - 1;
} }
if (mddev->recovery_cp != MaxSector) if (mddev->resync_offset != MaxSector)
pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
mdname(mddev)); mdname(mddev));
pr_info("md/raid10:%s: active with %d out of %d devices\n", pr_info("md/raid10:%s: active with %d out of %d devices\n",
@ -4245,8 +4245,8 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
md_set_array_sectors(mddev, size); md_set_array_sectors(mddev, size);
if (sectors > mddev->dev_sectors && if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > oldsize) { mddev->resync_offset > oldsize) {
mddev->recovery_cp = oldsize; mddev->resync_offset = oldsize;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
} }
calc_sectors(conf, sectors); calc_sectors(conf, sectors);
@ -4275,7 +4275,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
mddev->delta_disks = mddev->raid_disks; mddev->delta_disks = mddev->raid_disks;
mddev->raid_disks *= 2; mddev->raid_disks *= 2;
/* make sure it will be not marked as dirty */ /* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector; mddev->resync_offset = MaxSector;
mddev->dev_sectors = size; mddev->dev_sectors = size;
conf = setup_conf(mddev); conf = setup_conf(mddev);
@ -5087,8 +5087,8 @@ static void raid10_finish_reshape(struct mddev *mddev)
return; return;
if (mddev->delta_disks > 0) { if (mddev->delta_disks > 0) {
if (mddev->recovery_cp > mddev->resync_max_sectors) { if (mddev->resync_offset > mddev->resync_max_sectors) {
mddev->recovery_cp = mddev->resync_max_sectors; mddev->resync_offset = mddev->resync_max_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
} }
mddev->resync_max_sectors = mddev->array_sectors; mddev->resync_max_sectors = mddev->array_sectors;


@ -1163,7 +1163,7 @@ static int ppl_load_distributed(struct ppl_log *log)
le64_to_cpu(pplhdr->generation)); le64_to_cpu(pplhdr->generation));
/* attempt to recover from log if we are starting a dirty array */ /* attempt to recover from log if we are starting a dirty array */
if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector) if (pplhdr && !mddev->pers && mddev->resync_offset != MaxSector)
ret = ppl_recover(log, pplhdr, pplhdr_offset); ret = ppl_recover(log, pplhdr, pplhdr_offset);
/* write empty header if we are starting the array */ /* write empty header if we are starting the array */
@ -1422,14 +1422,14 @@ int ppl_init_log(struct r5conf *conf)
if (ret) { if (ret) {
goto err; goto err;
} else if (!mddev->pers && mddev->recovery_cp == 0 && } else if (!mddev->pers && mddev->resync_offset == 0 &&
ppl_conf->recovered_entries > 0 && ppl_conf->recovered_entries > 0 &&
ppl_conf->mismatch_count == 0) { ppl_conf->mismatch_count == 0) {
/* /*
* If we are starting a dirty array and the recovery succeeds * If we are starting a dirty array and the recovery succeeds
* without any issues, set the array as clean. * without any issues, set the array as clean.
*/ */
mddev->recovery_cp = MaxSector; mddev->resync_offset = MaxSector;
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
} else if (mddev->pers && ppl_conf->mismatch_count > 0) { } else if (mddev->pers && ppl_conf->mismatch_count > 0) {
/* no mismatch allowed when enabling PPL for a running array */ /* no mismatch allowed when enabling PPL for a running array */


@ -3740,7 +3740,7 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
&& !test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags)
&& (rdev->recovery_offset <= sh->sector && (rdev->recovery_offset <= sh->sector
|| rdev->mddev->recovery_cp <= sh->sector)) || rdev->mddev->resync_offset <= sh->sector))
rv = 1; rv = 1;
return rv; return rv;
} }
@ -3832,7 +3832,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
* is missing/faulty, then we need to read everything we can. * is missing/faulty, then we need to read everything we can.
*/ */
if (!force_rcw && if (!force_rcw &&
sh->sector < sh->raid_conf->mddev->recovery_cp) sh->sector < sh->raid_conf->mddev->resync_offset)
/* reconstruct-write isn't being forced */ /* reconstruct-write isn't being forced */
return 0; return 0;
for (i = 0; i < s->failed && i < 2; i++) { for (i = 0; i < s->failed && i < 2; i++) {
@ -4097,7 +4097,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
int disks) int disks)
{ {
int rmw = 0, rcw = 0, i; int rmw = 0, rcw = 0, i;
sector_t recovery_cp = conf->mddev->recovery_cp; sector_t resync_offset = conf->mddev->resync_offset;
/* Check whether resync is now happening or should start. /* Check whether resync is now happening or should start.
* If yes, then the array is dirty (after unclean shutdown or * If yes, then the array is dirty (after unclean shutdown or
@ -4107,14 +4107,14 @@ static int handle_stripe_dirtying(struct r5conf *conf,
* generate correct data from the parity. * generate correct data from the parity.
*/ */
if (conf->rmw_level == PARITY_DISABLE_RMW || if (conf->rmw_level == PARITY_DISABLE_RMW ||
(recovery_cp < MaxSector && sh->sector >= recovery_cp && (resync_offset < MaxSector && sh->sector >= resync_offset &&
s->failed == 0)) { s->failed == 0)) {
/* Calculate the real rcw later - for now make it /* Calculate the real rcw later - for now make it
* look like rcw is cheaper * look like rcw is cheaper
*/ */
rcw = 1; rmw = 2; rcw = 1; rmw = 2;
pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n",
conf->rmw_level, (unsigned long long)recovery_cp, conf->rmw_level, (unsigned long long)resync_offset,
(unsigned long long)sh->sector); (unsigned long long)sh->sector);
} else for (i = disks; i--; ) { } else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */ /* would I have to read this buffer for read_modify_write */
@ -4770,14 +4770,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (test_bit(STRIPE_SYNCING, &sh->state)) { if (test_bit(STRIPE_SYNCING, &sh->state)) {
/* If there is a failed device being replaced, /* If there is a failed device being replaced,
* we must be recovering. * we must be recovering.
* else if we are after recovery_cp, we must be syncing * else if we are after resync_offset, we must be syncing
* else if MD_RECOVERY_REQUESTED is set, we also are syncing. * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
* else we can only be replacing * else we can only be replacing
* sync and recovery both need to read all devices, and so * sync and recovery both need to read all devices, and so
* use the same flag. * use the same flag.
*/ */
if (do_recovery || if (do_recovery ||
sh->sector >= conf->mddev->recovery_cp || sh->sector >= conf->mddev->resync_offset ||
test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
s->syncing = 1; s->syncing = 1;
else else
@ -7780,7 +7780,7 @@ static int raid5_run(struct mddev *mddev)
int first = 1; int first = 1;
int ret = -EIO; int ret = -EIO;
if (mddev->recovery_cp != MaxSector) if (mddev->resync_offset != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
mdname(mddev)); mdname(mddev));
@ -7921,7 +7921,7 @@ static int raid5_run(struct mddev *mddev)
mdname(mddev)); mdname(mddev));
mddev->ro = 1; mddev->ro = 1;
set_disk_ro(mddev->gendisk, 1); set_disk_ro(mddev->gendisk, 1);
} else if (mddev->recovery_cp == MaxSector) } else if (mddev->resync_offset == MaxSector)
set_bit(MD_JOURNAL_CLEAN, &mddev->flags); set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
} }
@ -7988,7 +7988,7 @@ static int raid5_run(struct mddev *mddev)
mddev->resync_max_sectors = mddev->dev_sectors; mddev->resync_max_sectors = mddev->dev_sectors;
if (mddev->degraded > dirty_parity_disks && if (mddev->degraded > dirty_parity_disks &&
mddev->recovery_cp != MaxSector) { mddev->resync_offset != MaxSector) {
if (test_bit(MD_HAS_PPL, &mddev->flags)) if (test_bit(MD_HAS_PPL, &mddev->flags))
pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
mdname(mddev)); mdname(mddev));
@ -8328,8 +8328,8 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
md_set_array_sectors(mddev, newsize); md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors && if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) { mddev->resync_offset > mddev->dev_sectors) {
mddev->recovery_cp = mddev->dev_sectors; mddev->resync_offset = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
} }
mddev->dev_sectors = sectors; mddev->dev_sectors = sectors;
@ -8423,7 +8423,7 @@ static int raid5_start_reshape(struct mddev *mddev)
return -EINVAL; return -EINVAL;
/* raid5 can't handle concurrent reshape and recovery */ /* raid5 can't handle concurrent reshape and recovery */
if (mddev->recovery_cp < MaxSector) if (mddev->resync_offset < MaxSector)
return -EBUSY; return -EBUSY;
for (i = 0; i < conf->raid_disks; i++) for (i = 0; i < conf->raid_disks; i++)
if (conf->disks[i].replacement) if (conf->disks[i].replacement)
@ -8648,7 +8648,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
mddev->raid_disks += 1; mddev->raid_disks += 1;
mddev->delta_disks = 1; mddev->delta_disks = 1;
/* make sure it will be not marked as dirty */ /* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector; mddev->resync_offset = MaxSector;
return setup_conf(mddev); return setup_conf(mddev);
} }


@ -742,7 +742,7 @@ static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl,
"%s: qid %d failed to generate digest, error %d\n", "%s: qid %d failed to generate digest, error %d\n",
__func__, chap->qid, ret); __func__, chap->qid, ret);
goto out_free_psk; goto out_free_psk;
}; }
dev_dbg(ctrl->device, "%s: generated digest %s\n", dev_dbg(ctrl->device, "%s: generated digest %s\n",
__func__, digest); __func__, digest);
ret = nvme_auth_derive_tls_psk(chap->hash_id, psk, psk_len, ret = nvme_auth_derive_tls_psk(chap->hash_id, psk, psk_len,
@ -752,7 +752,7 @@ static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl,
"%s: qid %d failed to derive TLS psk, error %d\n", "%s: qid %d failed to derive TLS psk, error %d\n",
__func__, chap->qid, ret); __func__, chap->qid, ret);
goto out_free_digest; goto out_free_digest;
}; }
tls_key = nvme_tls_psk_refresh(ctrl->opts->keyring, tls_key = nvme_tls_psk_refresh(ctrl->opts->keyring,
ctrl->opts->host->nqn, ctrl->opts->host->nqn,


@ -3158,6 +3158,11 @@ static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
return ctrl->opts && ctrl->opts->discovery_nqn; return ctrl->opts && ctrl->opts->discovery_nqn;
} }
static inline bool nvme_admin_ctrl(struct nvme_ctrl *ctrl)
{
return ctrl->cntrltype == NVME_CTRL_ADMIN;
}
static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{ {
@ -3670,6 +3675,17 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
if (ret) if (ret)
return ret; return ret;
if (nvme_admin_ctrl(ctrl)) {
/*
* An admin controller has one admin queue, but no I/O queues.
* Override queue_count so it only creates an admin queue.
*/
dev_dbg(ctrl->device,
"Subsystem %s is an administrative controller",
ctrl->subsys->subnqn);
ctrl->queue_count = 1;
}
ret = nvme_configure_apst(ctrl); ret = nvme_configure_apst(ctrl);
if (ret < 0) if (ret < 0)
return ret; return ret;


@ -1363,7 +1363,7 @@ nvme_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status)
* down, and the related FC-NVME Association ID and Connection IDs * down, and the related FC-NVME Association ID and Connection IDs
* become invalid. * become invalid.
* *
* The behavior of the fc-nvme initiator is such that it's * The behavior of the fc-nvme initiator is such that its
* understanding of the association and connections will implicitly * understanding of the association and connections will implicitly
* be torn down. The action is implicit as it may be due to a loss of * be torn down. The action is implicit as it may be due to a loss of
* connectivity with the fc-nvme target, so you may never get a * connectivity with the fc-nvme target, so you may never get a
@ -2777,7 +2777,7 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
* as WRITE ZEROES will return a non-zero rq payload_bytes yet * as WRITE ZEROES will return a non-zero rq payload_bytes yet
* there is no actual payload to be transferred. * there is no actual payload to be transferred.
* To get it right, key data transmission on there being 1 or * To get it right, key data transmission on there being 1 or
* more physical segments in the sg list. If there is no * more physical segments in the sg list. If there are no
* physical segments, there is no payload. * physical segments, there is no payload.
*/ */
if (blk_rq_nr_phys_segments(rq)) { if (blk_rq_nr_phys_segments(rq)) {


@ -935,7 +935,7 @@ static blk_status_t nvme_pci_setup_data_sgl(struct request *req,
nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped); nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped);
if (unlikely(iter->status)) if (unlikely(iter->status))
nvme_free_sgls(req); nvme_unmap_data(req);
return iter->status; return iter->status;
} }


@ -2179,7 +2179,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
/* /*
* Only start IO queues for which we have allocated the tagset * Only start IO queues for which we have allocated the tagset
* and limitted it to the available queues. On reconnects, the * and limited it to the available queues. On reconnects, the
* queue number might have changed. * queue number might have changed.
*/ */
nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count); nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);


@ -1960,24 +1960,24 @@ static int __init nvmet_init(void)
if (!nvmet_wq) if (!nvmet_wq)
goto out_free_buffered_work_queue; goto out_free_buffered_work_queue;
error = nvmet_init_discovery(); error = nvmet_init_debugfs();
if (error) if (error)
goto out_free_nvmet_work_queue; goto out_free_nvmet_work_queue;
error = nvmet_init_debugfs(); error = nvmet_init_discovery();
if (error)
goto out_exit_discovery;
error = nvmet_init_configfs();
if (error) if (error)
goto out_exit_debugfs; goto out_exit_debugfs;
error = nvmet_init_configfs();
if (error)
goto out_exit_discovery;
return 0; return 0;
out_exit_debugfs:
nvmet_exit_debugfs();
out_exit_discovery: out_exit_discovery:
nvmet_exit_discovery(); nvmet_exit_discovery();
out_exit_debugfs:
nvmet_exit_debugfs();
out_free_nvmet_work_queue: out_free_nvmet_work_queue:
destroy_workqueue(nvmet_wq); destroy_workqueue(nvmet_wq);
out_free_buffered_work_queue: out_free_buffered_work_queue:
@ -1992,8 +1992,8 @@ out_destroy_bvec_cache:
static void __exit nvmet_exit(void) static void __exit nvmet_exit(void)
{ {
nvmet_exit_configfs(); nvmet_exit_configfs();
nvmet_exit_debugfs();
nvmet_exit_discovery(); nvmet_exit_discovery();
nvmet_exit_debugfs();
ida_destroy(&cntlid_ida); ida_destroy(&cntlid_ida);
destroy_workqueue(nvmet_wq); destroy_workqueue(nvmet_wq);
destroy_workqueue(buffered_io_wq); destroy_workqueue(buffered_io_wq);
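
With the hunks above, debugfs is brought up before the discovery subsystem and torn down after it, and the error labels unwind in exact reverse order of initialization. A stand-alone sketch of that init/unwind pattern (hypothetical init_a()/init_b()/init_c() steps, not the nvmet functions):

#include <stdio.h>

static int init_a(void) { puts("init a"); return 0; }
static void exit_a(void) { puts("exit a"); }
static int init_b(void) { puts("init b"); return 0; }
static void exit_b(void) { puts("exit b"); }
static int init_c(void) { puts("init c"); return -1; }	/* simulate failure */
static void exit_c(void) { puts("exit c"); }

/* Bring up a, then b, then c; on failure undo only what succeeded, in reverse. */
static int subsystem_init(void)
{
	int err;

	err = init_a();
	if (err)
		return err;
	err = init_b();
	if (err)
		goto out_exit_a;
	err = init_c();
	if (err)
		goto out_exit_b;
	return 0;

out_exit_b:
	exit_b();
out_exit_a:
	exit_a();
	return err;
}

/* Full teardown, mirroring the init order; runs only after a successful init. */
static void subsystem_exit(void)
{
	exit_c();
	exit_b();
	exit_a();
}

int main(void)
{
	if (subsystem_init())
		return 1;
	subsystem_exit();
	return 0;
}

Only the steps that already succeeded are undone on failure, which is why swapping the init order also meant swapping the goto labels and the nvmet_exit() calls.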


@ -459,7 +459,7 @@ nvmet_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status)
* down, and the related FC-NVME Association ID and Connection IDs * down, and the related FC-NVME Association ID and Connection IDs
* become invalid. * become invalid.
* *
* The behavior of the fc-nvme target is such that it's * The behavior of the fc-nvme target is such that its
* understanding of the association and connections will implicitly * understanding of the association and connections will implicitly
* be torn down. The action is implicit as it may be due to a loss of * be torn down. The action is implicit as it may be due to a loss of
* connectivity with the fc-nvme host, so the target may never get a * connectivity with the fc-nvme host, so the target may never get a
@ -2313,7 +2313,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq); ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq);
if (ret) { if (ret) {
/* /*
* should be ok to set w/o lock as its in the thread of * should be ok to set w/o lock as it's in the thread of
* execution (not an async timer routine) and doesn't * execution (not an async timer routine) and doesn't
* contend with any clearing action * contend with any clearing action
*/ */
@ -2629,7 +2629,7 @@ transport_error:
* and the api of the FC LLDD which may issue a hw command to send the * and the api of the FC LLDD which may issue a hw command to send the
* response, but the LLDD may not get the hw completion for that command * response, but the LLDD may not get the hw completion for that command
* and upcall the nvmet_fc layer before a new command may be * and upcall the nvmet_fc layer before a new command may be
* asynchronously received - its possible for a command to be received * asynchronously received - it's possible for a command to be received
* before the LLDD and nvmet_fc have recycled the job structure. It gives * before the LLDD and nvmet_fc have recycled the job structure. It gives
* the appearance of more commands received than fits in the sq. * the appearance of more commands received than fits in the sq.
* To alleviate this scenario, a temporary queue is maintained in the * To alleviate this scenario, a temporary queue is maintained in the


@ -533,6 +533,8 @@ u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
case NVME_FEAT_HOST_ID: case NVME_FEAT_HOST_ID:
req->execute = nvmet_execute_get_features; req->execute = nvmet_execute_get_features;
return NVME_SC_SUCCESS; return NVME_SC_SUCCESS;
case NVME_FEAT_FDP:
return nvmet_setup_passthru_command(req);
default: default:
return nvmet_passthru_get_set_features(req); return nvmet_passthru_get_set_features(req);
} }


@ -1731,7 +1731,7 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
* We registered an ib_client to handle device removal for queues, * We registered an ib_client to handle device removal for queues,
* so we only need to handle the listening port cm_ids. In this case * so we only need to handle the listening port cm_ids. In this case
* we nullify the priv to prevent double cm_id destruction and destroying * we nullify the priv to prevent double cm_id destruction and destroying
* the cm_id implicitely by returning a non-zero rc to the callout. * the cm_id implicitly by returning a non-zero rc to the callout.
*/ */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
struct nvmet_rdma_queue *queue) struct nvmet_rdma_queue *queue)
@ -1742,7 +1742,7 @@ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
/* /*
* This is a queue cm_id. we have registered * This is a queue cm_id. we have registered
* an ib_client to handle queues removal * an ib_client to handle queues removal
* so don't interfear and just return. * so don't interfere and just return.
*/ */
return 0; return 0;
} }
@ -1760,7 +1760,7 @@ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
/* /*
* We need to return 1 so that the core will destroy * We need to return 1 so that the core will destroy
* it's own ID. What a great API design.. * its own ID. What a great API design..
*/ */
return 1; return 1;
} }


@ -60,7 +60,8 @@ static inline int __get_task_ioprio(struct task_struct *p)
int prio; int prio;
if (!ioc) if (!ioc)
return IOPRIO_DEFAULT; return IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
task_nice_ioprio(p));
if (p != current) if (p != current)
lockdep_assert_held(&p->alloc_lock); lockdep_assert_held(&p->alloc_lock);


@ -209,23 +209,6 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth);
*/ */
int sbitmap_get(struct sbitmap *sb); int sbitmap_get(struct sbitmap *sb);
/**
* sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
* limiting the depth used from each word.
* @sb: Bitmap to allocate from.
* @shallow_depth: The maximum number of bits to allocate from a single word.
*
* This rather specific operation allows for having multiple users with
* different allocation limits. E.g., there can be a high-priority class that
* uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
* with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority
* class can only allocate half of the total bits in the bitmap, preventing it
* from starving out the high-priority class.
*
* Return: Non-negative allocated bit number if successful, -1 otherwise.
*/
int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth);
/** /**
* sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
* @sb: Bitmap to check. * @sb: Bitmap to check.
@ -478,7 +461,7 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
* sbitmap_queue, limiting the depth used from each word, with preemption * sbitmap_queue, limiting the depth used from each word, with preemption
* already disabled. * already disabled.
* @sbq: Bitmap queue to allocate from. * @sbq: Bitmap queue to allocate from.
* @shallow_depth: The maximum number of bits to allocate from a single word. * @shallow_depth: The maximum number of bits to allocate from the queue.
* See sbitmap_get_shallow(). * See sbitmap_get_shallow().
* *
* If you call this, make sure to call sbitmap_queue_min_shallow_depth() after * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after


@ -173,7 +173,7 @@ typedef struct mdp_superblock_s {
#else #else
#error unspecified endianness #error unspecified endianness
#endif #endif
__u32 recovery_cp; /* 11 recovery checkpoint sector count */ __u32 resync_offset; /* 11 resync checkpoint sector count */
/* There are only valid for minor_version > 90 */ /* There are only valid for minor_version > 90 */
__u64 reshape_position; /* 12,13 next address in array-space for reshape */ __u64 reshape_position; /* 12,13 next address in array-space for reshape */
__u32 new_level; /* 14 new level we are reshaping to */ __u32 new_level; /* 14 new level we are reshaping to */


@ -208,8 +208,28 @@ static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
return nr; return nr;
} }
static unsigned int __map_depth_with_shallow(const struct sbitmap *sb,
int index,
unsigned int shallow_depth)
{
u64 shallow_word_depth;
unsigned int word_depth, reminder;
word_depth = __map_depth(sb, index);
if (shallow_depth >= sb->depth)
return word_depth;
shallow_word_depth = word_depth * shallow_depth;
reminder = do_div(shallow_word_depth, sb->depth);
if (reminder >= (index + 1) * word_depth)
shallow_word_depth++;
return (unsigned int)shallow_word_depth;
}
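
The helper above scales each word's depth by shallow_depth / sb->depth and spreads the division remainder so the per-word limits add up to the requested total. A stand-alone sketch of the same calculation with a made-up geometry (48 bits split into a 32-bit and a 16-bit word; DEPTH, BITS_PER_WORD and the map_* names are illustrative, not the kernel API):

#include <stdio.h>

#define DEPTH		48u
#define BITS_PER_WORD	32u
#define MAP_NR		((DEPTH + BITS_PER_WORD - 1) / BITS_PER_WORD)

static unsigned int map_depth(unsigned int index)
{
	unsigned int remaining = DEPTH - index * BITS_PER_WORD;

	return remaining < BITS_PER_WORD ? remaining : BITS_PER_WORD;
}

/* Same proportional split as __map_depth_with_shallow() above. */
static unsigned int map_depth_with_shallow(unsigned int index,
					   unsigned int shallow_depth)
{
	unsigned long long scaled;
	unsigned int word_depth = map_depth(index), rem;

	if (shallow_depth >= DEPTH)
		return word_depth;

	scaled = (unsigned long long)word_depth * shallow_depth;
	rem = scaled % DEPTH;
	scaled /= DEPTH;
	/* Round up where the accumulated remainder calls for it, as do_div() users do above. */
	if (rem >= (index + 1) * word_depth)
		scaled++;
	return (unsigned int)scaled;
}

int main(void)
{
	unsigned int shallow_depth = 25, total = 0, i;

	for (i = 0; i < MAP_NR; i++) {
		unsigned int limit = map_depth_with_shallow(i, shallow_depth);

		printf("word %u: %u of %u bits usable\n", i, limit, map_depth(i));
		total += limit;
	}
	printf("total %u (requested shallow_depth %u)\n", total, shallow_depth);
	return 0;
}

Running it with shallow_depth = 25 prints per-word limits of 17 and 8 bits, i.e. the two words together expose exactly the 25 requested bits.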
static int sbitmap_find_bit(struct sbitmap *sb, static int sbitmap_find_bit(struct sbitmap *sb,
unsigned int depth, unsigned int shallow_depth,
unsigned int index, unsigned int index,
unsigned int alloc_hint, unsigned int alloc_hint,
bool wrap) bool wrap)
@ -218,12 +238,12 @@ static int sbitmap_find_bit(struct sbitmap *sb,
int nr = -1; int nr = -1;
for (i = 0; i < sb->map_nr; i++) { for (i = 0; i < sb->map_nr; i++) {
nr = sbitmap_find_bit_in_word(&sb->map[index], unsigned int depth = __map_depth_with_shallow(sb, index,
min_t(unsigned int, shallow_depth);
__map_depth(sb, index),
depth),
alloc_hint, wrap);
if (depth)
nr = sbitmap_find_bit_in_word(&sb->map[index], depth,
alloc_hint, wrap);
if (nr != -1) { if (nr != -1) {
nr += index << sb->shift; nr += index << sb->shift;
break; break;
@ -287,7 +307,22 @@ static int __sbitmap_get_shallow(struct sbitmap *sb,
return sbitmap_find_bit(sb, shallow_depth, index, alloc_hint, true); return sbitmap_find_bit(sb, shallow_depth, index, alloc_hint, true);
} }
int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth) /**
* sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
* limiting the depth used from each word.
* @sb: Bitmap to allocate from.
* @shallow_depth: The maximum number of bits to allocate from the bitmap.
*
* This rather specific operation allows for having multiple users with
* different allocation limits. E.g., there can be a high-priority class that
* uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
* with a @shallow_depth of (sb->depth >> 1). Then, the low-priority
* class can only allocate half of the total bits in the bitmap, preventing it
* from starving out the high-priority class.
*
* Return: Non-negative allocated bit number if successful, -1 otherwise.
*/
static int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
{ {
int nr; int nr;
unsigned int hint, depth; unsigned int hint, depth;
@ -302,7 +337,6 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
return nr; return nr;
} }
EXPORT_SYMBOL_GPL(sbitmap_get_shallow);
bool sbitmap_any_bit_set(const struct sbitmap *sb) bool sbitmap_any_bit_set(const struct sbitmap *sb)
{ {
@ -406,27 +440,9 @@ EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);
static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq, static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq,
unsigned int depth) unsigned int depth)
{ {
unsigned int wake_batch; return clamp_t(unsigned int,
unsigned int shallow_depth; min(depth, sbq->min_shallow_depth) / SBQ_WAIT_QUEUES,
1, SBQ_WAKE_BATCH);
/*
* Each full word of the bitmap has bits_per_word bits, and there might
* be a partial word. There are depth / bits_per_word full words and
* depth % bits_per_word bits left over. In bitwise arithmetic:
*
* bits_per_word = 1 << shift
* depth / bits_per_word = depth >> shift
* depth % bits_per_word = depth & ((1 << shift) - 1)
*
* Each word can be limited to sbq->min_shallow_depth bits.
*/
shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth);
depth = ((depth >> sbq->sb.shift) * shallow_depth +
min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth));
wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1,
SBQ_WAKE_BATCH);
return wake_batch;
} }
int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
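
As a quick check of the simplified sbq_calc_wake_batch() above, assuming the usual SBQ_WAIT_QUEUES of 8 and SBQ_WAKE_BATCH of 8: a queue with depth 128 and min_shallow_depth 64 gets min(128, 64) / 8 = 8, which the clamp leaves as is, so eight tag completions are batched per wakeup; a small queue of depth 16 with no shallow limit gets 16 / 8 = 2.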