Mirror of https://github.com/torvalds/linux.git, synced 2025-08-15 06:01:56 +02:00
block-6.17-20250808
Merge tag 'block-6.17-20250808' of git://git.kernel.dk/linux

Pull more block updates from Jens Axboe:

 - MD pull request via Yu:
     - mddev null-ptr-dereference fix, by Erkun
     - md-cluster fail to remove the faulty disk regression fix, by Heming
     - minor cleanup, by Li Nan and Jinchao
     - mdadm lifetime regression fix reported by syzkaller, by Yu Kuai

 - NVMe pull request via Christoph:
     - add support for getting the FDP feature in fabrics passthru path
       (Nitesh Shetty)
     - add capability to connect to an administrative controller
       (Kamaljit Singh)
     - fix a leak on sgl setup error (Keith Busch)
     - initialize discovery subsys after debugfs is initialized
       (Mohamed Khalfella)
     - fix various comment typos (Bjorn Helgaas)
     - remove unneeded semicolons (Jiapeng Chong)

 - nvmet debugfs ordering issue fix

 - Fix UAF in the tag_set in zloop

 - Ensure sbitmap shallow depth covers entire set

 - Reduce lock roundtrips in io context lookup

 - Move scheduler tags alloc/free out of elevator and freeze lock, to
   fix some lockdep found issues

 - Improve robustness of queue limits checking

 - Fix a regression with IO priorities, if no io context exists

* tag 'block-6.17-20250808' of git://git.kernel.dk/linux: (26 commits)
  lib/sbitmap: make sbitmap_get_shallow() internal
  lib/sbitmap: convert shallow_depth from one word to the whole sbitmap
  nvmet: exit debugfs after discovery subsystem exits
  block, bfq: Reorder struct bfq_iocq_bfqq_data
  md: make rdev_addable usable for rcu mode
  md/raid1: remove struct pool_info and related code
  md/raid1: change r1conf->r1bio_pool to a pointer type
  block: ensure discard_granularity is zero when discard is not supported
  zloop: fix KASAN use-after-free of tag set
  block: Fix default IO priority if there is no IO context
  nvme: fix various comment typos
  nvme-auth: remove unneeded semicolon
  nvme-pci: fix leak on sgl setup error
  nvmet: initialize discovery subsys after debugfs is initialized
  nvme: add capability to connect to an administrative controller
  nvmet: add support for FDP in fabrics passthru path
  md: rename recovery_cp to resync_offset
  md/md-cluster: handle REMOVE message earlier
  md: fix create on open mddev lifetime regression
  block: fix potential deadlock while running nr_hw_queue update
  ...
commit 2988dfed8a

38 changed files with 521 additions and 444 deletions
@ -454,17 +454,10 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
|
|||
*/
|
||||
static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
|
||||
{
|
||||
struct bfq_io_cq *icq;
|
||||
unsigned long flags;
|
||||
|
||||
if (!current->io_context)
|
||||
return NULL;
|
||||
|
||||
spin_lock_irqsave(&q->queue_lock, flags);
|
||||
icq = icq_to_bic(ioc_lookup_icq(q));
|
||||
spin_unlock_irqrestore(&q->queue_lock, flags);
|
||||
|
||||
return icq;
|
||||
return icq_to_bic(ioc_lookup_icq(q));
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -701,17 +694,13 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
|
|||
{
|
||||
struct bfq_data *bfqd = data->q->elevator->elevator_data;
|
||||
struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
|
||||
int depth;
|
||||
unsigned limit = data->q->nr_requests;
|
||||
unsigned int act_idx;
|
||||
unsigned int limit, act_idx;
|
||||
|
||||
/* Sync reads have full depth available */
|
||||
if (op_is_sync(opf) && !op_is_write(opf)) {
|
||||
depth = 0;
|
||||
} else {
|
||||
depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
|
||||
limit = (limit * depth) >> bfqd->full_depth_shift;
|
||||
}
|
||||
if (op_is_sync(opf) && !op_is_write(opf))
|
||||
limit = data->q->nr_requests;
|
||||
else
|
||||
limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
|
||||
|
||||
for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) {
|
||||
/* Fast path to check if bfqq is already allocated. */
|
||||
|
@ -725,14 +714,16 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
|
|||
* available requests and thus starve other entities.
|
||||
*/
|
||||
if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) {
|
||||
depth = 1;
|
||||
limit = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
|
||||
__func__, bfqd->wr_busy_queues, op_is_sync(opf), depth);
|
||||
if (depth)
|
||||
data->shallow_depth = depth;
|
||||
__func__, bfqd->wr_busy_queues, op_is_sync(opf), limit);
|
||||
|
||||
if (limit < data->q->nr_requests)
|
||||
data->shallow_depth = limit;
|
||||
}
|
||||
|
||||
static struct bfq_queue *
|
||||
|
@ -2457,15 +2448,8 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
|
|||
unsigned int nr_segs)
|
||||
{
|
||||
struct bfq_data *bfqd = q->elevator->elevator_data;
|
||||
struct request *free = NULL;
|
||||
/*
|
||||
* bfq_bic_lookup grabs the queue_lock: invoke it now and
|
||||
* store its return value for later use, to avoid nesting
|
||||
* queue_lock inside the bfqd->lock. We assume that the bic
|
||||
* returned by bfq_bic_lookup does not go away before
|
||||
* bfqd->lock is taken.
|
||||
*/
|
||||
struct bfq_io_cq *bic = bfq_bic_lookup(q);
|
||||
struct request *free = NULL;
|
||||
bool ret;
|
||||
|
||||
spin_lock_irq(&bfqd->lock);
|
||||
|
@ -7128,9 +7112,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
|
|||
*/
|
||||
static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
|
||||
{
|
||||
unsigned int depth = 1U << bt->sb.shift;
|
||||
unsigned int nr_requests = bfqd->queue->nr_requests;
|
||||
|
||||
bfqd->full_depth_shift = bt->sb.shift;
|
||||
/*
|
||||
* In-word depths if no bfq_queue is being weight-raised:
|
||||
* leaving 25% of tags only for sync reads.
|
||||
|
@ -7142,13 +7125,13 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
|
|||
* limit 'something'.
|
||||
*/
|
||||
/* no more than 50% of tags for async I/O */
|
||||
bfqd->word_depths[0][0] = max(depth >> 1, 1U);
|
||||
bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U);
|
||||
/*
|
||||
* no more than 75% of tags for sync writes (25% extra tags
|
||||
* w.r.t. async I/O, to prevent async I/O from starving sync
|
||||
* writes)
|
||||
*/
|
||||
bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
|
||||
bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U);
|
||||
|
||||
/*
|
||||
* In-word depths in case some bfq_queue is being weight-
|
||||
|
@ -7158,9 +7141,9 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
|
|||
* shortage.
|
||||
*/
|
||||
/* no more than ~18% of tags for async I/O */
|
||||
bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
|
||||
bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U);
|
||||
/* no more than ~37% of tags for sync writes (~20% extra tags) */
|
||||
bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
|
||||
bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U);
|
||||
}
|
||||
|
||||
static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
|
||||
|
@ -7232,22 +7215,16 @@ static void bfq_init_root_group(struct bfq_group *root_group,
|
|||
root_group->sched_data.bfq_class_idle_last_service = jiffies;
|
||||
}
|
||||
|
||||
static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
|
||||
static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
|
||||
{
|
||||
struct bfq_data *bfqd;
|
||||
struct elevator_queue *eq;
|
||||
unsigned int i;
|
||||
struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges;
|
||||
|
||||
eq = elevator_alloc(q, e);
|
||||
if (!eq)
|
||||
bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
|
||||
if (!bfqd)
|
||||
return -ENOMEM;
|
||||
|
||||
bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
|
||||
if (!bfqd) {
|
||||
kobject_put(&eq->kobj);
|
||||
return -ENOMEM;
|
||||
}
|
||||
eq->elevator_data = bfqd;
|
||||
|
||||
spin_lock_irq(&q->queue_lock);
|
||||
|
@ -7405,7 +7382,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
|
|||
|
||||
out_free:
|
||||
kfree(bfqd);
|
||||
kobject_put(&eq->kobj);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
|
|
|
@ -427,9 +427,6 @@ struct bfq_iocq_bfqq_data {
|
|||
*/
|
||||
bool saved_IO_bound;
|
||||
|
||||
u64 saved_io_start_time;
|
||||
u64 saved_tot_idle_time;
|
||||
|
||||
/*
|
||||
* Same purpose as the previous fields for the values of the
|
||||
* field keeping the queue's belonging to a large burst
|
||||
|
@ -450,6 +447,9 @@ struct bfq_iocq_bfqq_data {
|
|||
*/
|
||||
unsigned int saved_weight;
|
||||
|
||||
u64 saved_io_start_time;
|
||||
u64 saved_tot_idle_time;
|
||||
|
||||
/*
|
||||
* Similar to previous fields: save wr information.
|
||||
*/
|
||||
|
@ -457,13 +457,13 @@ struct bfq_iocq_bfqq_data {
|
|||
unsigned long saved_last_wr_start_finish;
|
||||
unsigned long saved_service_from_wr;
|
||||
unsigned long saved_wr_start_at_switch_to_srt;
|
||||
unsigned int saved_wr_cur_max_time;
|
||||
struct bfq_ttime saved_ttime;
|
||||
unsigned int saved_wr_cur_max_time;
|
||||
|
||||
/* Save also injection state */
|
||||
u64 saved_last_serv_time_ns;
|
||||
unsigned int saved_inject_limit;
|
||||
unsigned long saved_decrease_time_jif;
|
||||
u64 saved_last_serv_time_ns;
|
||||
|
||||
/* candidate queue for a stable merge (due to close creation time) */
|
||||
struct bfq_queue *stable_merge_bfqq;
|
||||
|
@ -813,8 +813,7 @@ struct bfq_data {
|
|||
* Depth limits used in bfq_limit_depth (see comments on the
|
||||
* function)
|
||||
*/
|
||||
unsigned int word_depths[2][2];
|
||||
unsigned int full_depth_shift;
|
||||
unsigned int async_depths[2][2];
|
||||
|
||||
/*
|
||||
* Number of independent actuators. This is equal to 1 in
|
||||
|
|
|
@ -308,24 +308,23 @@ int __copy_io(unsigned long clone_flags, struct task_struct *tsk)
|
|||
|
||||
#ifdef CONFIG_BLK_ICQ
|
||||
/**
|
||||
* ioc_lookup_icq - lookup io_cq from ioc
|
||||
* ioc_lookup_icq - lookup io_cq from ioc in io issue path
|
||||
* @q: the associated request_queue
|
||||
*
|
||||
* Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
|
||||
* with @q->queue_lock held.
|
||||
* from io issue path, either return NULL if current issue io to @q for the
|
||||
* first time, or return a valid icq.
|
||||
*/
|
||||
struct io_cq *ioc_lookup_icq(struct request_queue *q)
|
||||
{
|
||||
struct io_context *ioc = current->io_context;
|
||||
struct io_cq *icq;
|
||||
|
||||
lockdep_assert_held(&q->queue_lock);
|
||||
|
||||
/*
|
||||
* icq's are indexed from @ioc using radix tree and hint pointer,
|
||||
* both of which are protected with RCU. All removals are done
|
||||
* holding both q and ioc locks, and we're holding q lock - if we
|
||||
* find a icq which points to us, it's guaranteed to be valid.
|
||||
* both of which are protected with RCU, io issue path ensures that
|
||||
* both request_queue and current task are valid, the found icq
|
||||
* is guaranteed to be valid until the io is done.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
icq = rcu_dereference(ioc->icq_hint);
|
||||
|
@ -419,10 +418,7 @@ struct io_cq *ioc_find_get_icq(struct request_queue *q)
|
|||
task_unlock(current);
|
||||
} else {
|
||||
get_io_context(ioc);
|
||||
|
||||
spin_lock_irq(&q->queue_lock);
|
||||
icq = ioc_lookup_icq(q);
|
||||
spin_unlock_irq(&q->queue_lock);
|
||||
}
|
||||
|
||||
if (!icq) {
|
||||
|
|
|
@ -374,64 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
|
||||
|
||||
static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
|
||||
struct blk_mq_hw_ctx *hctx,
|
||||
unsigned int hctx_idx)
|
||||
{
|
||||
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
|
||||
hctx->sched_tags = q->sched_shared_tags;
|
||||
return 0;
|
||||
}
|
||||
|
||||
hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
|
||||
q->nr_requests);
|
||||
|
||||
if (!hctx->sched_tags)
|
||||
return -ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
|
||||
{
|
||||
blk_mq_free_rq_map(queue->sched_shared_tags);
|
||||
queue->sched_shared_tags = NULL;
|
||||
}
|
||||
|
||||
/* called in queue's release handler, tagset has gone away */
|
||||
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
unsigned long i;
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
if (hctx->sched_tags) {
|
||||
if (!blk_mq_is_shared_tags(flags))
|
||||
blk_mq_free_rq_map(hctx->sched_tags);
|
||||
hctx->sched_tags = NULL;
|
||||
}
|
||||
}
|
||||
queue_for_each_hw_ctx(q, hctx, i)
|
||||
hctx->sched_tags = NULL;
|
||||
|
||||
if (blk_mq_is_shared_tags(flags))
|
||||
blk_mq_exit_sched_shared_tags(q);
|
||||
}
|
||||
|
||||
static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
|
||||
{
|
||||
struct blk_mq_tag_set *set = queue->tag_set;
|
||||
|
||||
/*
|
||||
* Set initial depth at max so that we don't need to reallocate for
|
||||
* updating nr_requests.
|
||||
*/
|
||||
queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
|
||||
BLK_MQ_NO_HCTX_IDX,
|
||||
MAX_SCHED_RQ);
|
||||
if (!queue->sched_shared_tags)
|
||||
return -ENOMEM;
|
||||
|
||||
blk_mq_tag_update_sched_shared_tags(queue);
|
||||
|
||||
return 0;
|
||||
q->sched_shared_tags = NULL;
|
||||
}
|
||||
|
||||
void blk_mq_sched_reg_debugfs(struct request_queue *q)
|
||||
|
@ -458,8 +411,140 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
|
|||
mutex_unlock(&q->debugfs_mutex);
|
||||
}
|
||||
|
||||
void blk_mq_free_sched_tags(struct elevator_tags *et,
|
||||
struct blk_mq_tag_set *set)
|
||||
{
|
||||
unsigned long i;
|
||||
|
||||
/* Shared tags are stored at index 0 in @tags. */
|
||||
if (blk_mq_is_shared_tags(set->flags))
|
||||
blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX);
|
||||
else {
|
||||
for (i = 0; i < et->nr_hw_queues; i++)
|
||||
blk_mq_free_map_and_rqs(set, et->tags[i], i);
|
||||
}
|
||||
|
||||
kfree(et);
|
||||
}
|
||||
|
||||
void blk_mq_free_sched_tags_batch(struct xarray *et_table,
|
||||
struct blk_mq_tag_set *set)
|
||||
{
|
||||
struct request_queue *q;
|
||||
struct elevator_tags *et;
|
||||
|
||||
lockdep_assert_held_write(&set->update_nr_hwq_lock);
|
||||
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
/*
|
||||
* Accessing q->elevator without holding q->elevator_lock is
|
||||
* safe because we're holding here set->update_nr_hwq_lock in
|
||||
* the writer context. So, scheduler update/switch code (which
|
||||
* acquires the same lock but in the reader context) can't run
|
||||
* concurrently.
|
||||
*/
|
||||
if (q->elevator) {
|
||||
et = xa_load(et_table, q->id);
|
||||
if (unlikely(!et))
|
||||
WARN_ON_ONCE(1);
|
||||
else
|
||||
blk_mq_free_sched_tags(et, set);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
|
||||
unsigned int nr_hw_queues)
|
||||
{
|
||||
unsigned int nr_tags;
|
||||
int i;
|
||||
struct elevator_tags *et;
|
||||
gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
|
||||
|
||||
if (blk_mq_is_shared_tags(set->flags))
|
||||
nr_tags = 1;
|
||||
else
|
||||
nr_tags = nr_hw_queues;
|
||||
|
||||
et = kmalloc(sizeof(struct elevator_tags) +
|
||||
nr_tags * sizeof(struct blk_mq_tags *), gfp);
|
||||
if (!et)
|
||||
return NULL;
|
||||
/*
|
||||
* Default to double of smaller one between hw queue_depth and
|
||||
* 128, since we don't split into sync/async like the old code
|
||||
* did. Additionally, this is a per-hw queue depth.
|
||||
*/
|
||||
et->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
|
||||
BLKDEV_DEFAULT_RQ);
|
||||
et->nr_hw_queues = nr_hw_queues;
|
||||
|
||||
if (blk_mq_is_shared_tags(set->flags)) {
|
||||
/* Shared tags are stored at index 0 in @tags. */
|
||||
et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX,
|
||||
MAX_SCHED_RQ);
|
||||
if (!et->tags[0])
|
||||
goto out;
|
||||
} else {
|
||||
for (i = 0; i < et->nr_hw_queues; i++) {
|
||||
et->tags[i] = blk_mq_alloc_map_and_rqs(set, i,
|
||||
et->nr_requests);
|
||||
if (!et->tags[i])
|
||||
goto out_unwind;
|
||||
}
|
||||
}
|
||||
|
||||
return et;
|
||||
out_unwind:
|
||||
while (--i >= 0)
|
||||
blk_mq_free_map_and_rqs(set, et->tags[i], i);
|
||||
out:
|
||||
kfree(et);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
|
||||
struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
|
||||
{
|
||||
struct request_queue *q;
|
||||
struct elevator_tags *et;
|
||||
gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
|
||||
|
||||
lockdep_assert_held_write(&set->update_nr_hwq_lock);
|
||||
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
/*
|
||||
* Accessing q->elevator without holding q->elevator_lock is
|
||||
* safe because we're holding here set->update_nr_hwq_lock in
|
||||
* the writer context. So, scheduler update/switch code (which
|
||||
* acquires the same lock but in the reader context) can't run
|
||||
* concurrently.
|
||||
*/
|
||||
if (q->elevator) {
|
||||
et = blk_mq_alloc_sched_tags(set, nr_hw_queues);
|
||||
if (!et)
|
||||
goto out_unwind;
|
||||
if (xa_insert(et_table, q->id, et, gfp))
|
||||
goto out_free_tags;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
out_free_tags:
|
||||
blk_mq_free_sched_tags(et, set);
|
||||
out_unwind:
|
||||
list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
|
||||
if (q->elevator) {
|
||||
et = xa_load(et_table, q->id);
|
||||
if (et)
|
||||
blk_mq_free_sched_tags(et, set);
|
||||
}
|
||||
}
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/* caller must have a reference to @e, will grab another one if successful */
|
||||
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
|
||||
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
|
||||
struct elevator_tags *et)
|
||||
{
|
||||
unsigned int flags = q->tag_set->flags;
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
|
@ -467,36 +552,33 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
|
|||
unsigned long i;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Default to double of smaller one between hw queue_depth and 128,
|
||||
* since we don't split into sync/async like the old code did.
|
||||
* Additionally, this is a per-hw queue depth.
|
||||
*/
|
||||
q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
|
||||
BLKDEV_DEFAULT_RQ);
|
||||
eq = elevator_alloc(q, e, et);
|
||||
if (!eq)
|
||||
return -ENOMEM;
|
||||
|
||||
q->nr_requests = et->nr_requests;
|
||||
|
||||
if (blk_mq_is_shared_tags(flags)) {
|
||||
ret = blk_mq_init_sched_shared_tags(q);
|
||||
if (ret)
|
||||
return ret;
|
||||
/* Shared tags are stored at index 0 in @et->tags. */
|
||||
q->sched_shared_tags = et->tags[0];
|
||||
blk_mq_tag_update_sched_shared_tags(q);
|
||||
}
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
|
||||
if (ret)
|
||||
goto err_free_map_and_rqs;
|
||||
if (blk_mq_is_shared_tags(flags))
|
||||
hctx->sched_tags = q->sched_shared_tags;
|
||||
else
|
||||
hctx->sched_tags = et->tags[i];
|
||||
}
|
||||
|
||||
ret = e->ops.init_sched(q, e);
|
||||
ret = e->ops.init_sched(q, eq);
|
||||
if (ret)
|
||||
goto err_free_map_and_rqs;
|
||||
goto out;
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
if (e->ops.init_hctx) {
|
||||
ret = e->ops.init_hctx(hctx, i);
|
||||
if (ret) {
|
||||
eq = q->elevator;
|
||||
blk_mq_sched_free_rqs(q);
|
||||
blk_mq_exit_sched(q, eq);
|
||||
kobject_put(&eq->kobj);
|
||||
return ret;
|
||||
|
@ -505,10 +587,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
|
|||
}
|
||||
return 0;
|
||||
|
||||
err_free_map_and_rqs:
|
||||
blk_mq_sched_free_rqs(q);
|
||||
out:
|
||||
blk_mq_sched_tags_teardown(q, flags);
|
||||
|
||||
kobject_put(&eq->kobj);
|
||||
q->elevator = NULL;
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -18,10 +18,20 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
|
|||
|
||||
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
|
||||
|
||||
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
|
||||
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
|
||||
struct elevator_tags *et);
|
||||
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
|
||||
void blk_mq_sched_free_rqs(struct request_queue *q);
|
||||
|
||||
struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
|
||||
unsigned int nr_hw_queues);
|
||||
int blk_mq_alloc_sched_tags_batch(struct xarray *et_table,
|
||||
struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
|
||||
void blk_mq_free_sched_tags(struct elevator_tags *et,
|
||||
struct blk_mq_tag_set *set);
|
||||
void blk_mq_free_sched_tags_batch(struct xarray *et_table,
|
||||
struct blk_mq_tag_set *set);
|
||||
|
||||
static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
|
||||
|
|
|
@ -4974,12 +4974,13 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
|
|||
* Switch back to the elevator type stored in the xarray.
|
||||
*/
|
||||
static void blk_mq_elv_switch_back(struct request_queue *q,
|
||||
struct xarray *elv_tbl)
|
||||
struct xarray *elv_tbl, struct xarray *et_tbl)
|
||||
{
|
||||
struct elevator_type *e = xa_load(elv_tbl, q->id);
|
||||
struct elevator_tags *t = xa_load(et_tbl, q->id);
|
||||
|
||||
/* The elv_update_nr_hw_queues unfreezes the queue. */
|
||||
elv_update_nr_hw_queues(q, e);
|
||||
elv_update_nr_hw_queues(q, e, t);
|
||||
|
||||
/* Drop the reference acquired in blk_mq_elv_switch_none. */
|
||||
if (e)
|
||||
|
@ -5031,7 +5032,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
|
|||
int prev_nr_hw_queues = set->nr_hw_queues;
|
||||
unsigned int memflags;
|
||||
int i;
|
||||
struct xarray elv_tbl;
|
||||
struct xarray elv_tbl, et_tbl;
|
||||
|
||||
lockdep_assert_held(&set->tag_list_lock);
|
||||
|
||||
|
@ -5044,6 +5045,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
|
|||
|
||||
memflags = memalloc_noio_save();
|
||||
|
||||
xa_init(&et_tbl);
|
||||
if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0)
|
||||
goto out_memalloc_restore;
|
||||
|
||||
xa_init(&elv_tbl);
|
||||
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
|
@ -5087,7 +5092,7 @@ fallback:
|
|||
switch_back:
|
||||
/* The blk_mq_elv_switch_back unfreezes queue for us. */
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list)
|
||||
blk_mq_elv_switch_back(q, &elv_tbl);
|
||||
blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);
|
||||
|
||||
list_for_each_entry(q, &set->tag_list, tag_set_list) {
|
||||
blk_mq_sysfs_register_hctxs(q);
|
||||
|
@ -5098,7 +5103,8 @@ switch_back:
|
|||
}
|
||||
|
||||
xa_destroy(&elv_tbl);
|
||||
|
||||
xa_destroy(&et_tbl);
|
||||
out_memalloc_restore:
|
||||
memalloc_noio_restore(memflags);
|
||||
|
||||
/* Free the excess tags when nr_hw_queues shrink. */
|
||||
|
|
|
@ -62,16 +62,24 @@ EXPORT_SYMBOL(blk_set_stacking_limits);
|
|||
void blk_apply_bdi_limits(struct backing_dev_info *bdi,
|
||||
struct queue_limits *lim)
|
||||
{
|
||||
u64 io_opt = lim->io_opt;
|
||||
|
||||
/*
|
||||
* For read-ahead of large files to be effective, we need to read ahead
|
||||
* at least twice the optimal I/O size.
|
||||
* at least twice the optimal I/O size. For rotational devices that do
|
||||
* not report an optimal I/O size (e.g. ATA HDDs), use the maximum I/O
|
||||
* size to avoid falling back to the (rather inefficient) small default
|
||||
* read-ahead size.
|
||||
*
|
||||
* There is no hardware limitation for the read-ahead size and the user
|
||||
* might have increased the read-ahead size through sysfs, so don't ever
|
||||
* decrease it.
|
||||
*/
|
||||
if (!io_opt && (lim->features & BLK_FEAT_ROTATIONAL))
|
||||
io_opt = (u64)lim->max_sectors << SECTOR_SHIFT;
|
||||
|
||||
bdi->ra_pages = max3(bdi->ra_pages,
|
||||
lim->io_opt * 2 / PAGE_SIZE,
|
||||
io_opt * 2 >> PAGE_SHIFT,
|
||||
VM_READAHEAD_PAGES);
|
||||
bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT;
|
||||
}
|
||||
|
@ -312,8 +320,12 @@ int blk_validate_limits(struct queue_limits *lim)
|
|||
pr_warn("Invalid logical block size (%d)\n", lim->logical_block_size);
|
||||
return -EINVAL;
|
||||
}
|
||||
if (lim->physical_block_size < lim->logical_block_size)
|
||||
if (lim->physical_block_size < lim->logical_block_size) {
|
||||
lim->physical_block_size = lim->logical_block_size;
|
||||
} else if (!is_power_of_2(lim->physical_block_size)) {
|
||||
pr_warn("Invalid physical block size (%d)\n", lim->physical_block_size);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* The minimum I/O size defaults to the physical block size unless
|
||||
|
@ -388,12 +400,19 @@ int blk_validate_limits(struct queue_limits *lim)
|
|||
lim->max_discard_sectors =
|
||||
min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
|
||||
|
||||
/*
|
||||
* When discard is not supported, discard_granularity should be reported
|
||||
* as 0 to userspace.
|
||||
*/
|
||||
if (lim->max_discard_sectors)
|
||||
lim->discard_granularity =
|
||||
max(lim->discard_granularity, lim->physical_block_size);
|
||||
else
|
||||
lim->discard_granularity = 0;
|
||||
|
||||
if (!lim->max_discard_segments)
|
||||
lim->max_discard_segments = 1;
|
||||
|
||||
if (lim->discard_granularity < lim->physical_block_size)
|
||||
lim->discard_granularity = lim->physical_block_size;
|
||||
|
||||
/*
|
||||
* By default there is no limit on the segment boundary alignment,
|
||||
* but if there is one it can't be smaller than the page size as
|
||||
|
@ -849,7 +868,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
|
|||
}
|
||||
|
||||
/* chunk_sectors a multiple of the physical block size? */
|
||||
if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) {
|
||||
if (t->chunk_sectors % (t->physical_block_size >> SECTOR_SHIFT)) {
|
||||
t->chunk_sectors = 0;
|
||||
t->flags |= BLK_FLAG_MISALIGNED;
|
||||
ret = -1;
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include "blk-crypto-internal.h"
|
||||
|
||||
struct elevator_type;
|
||||
struct elevator_tags;
|
||||
|
||||
/*
|
||||
* Default upper limit for the software max_sectors limit used for regular I/Os.
|
||||
|
@ -330,7 +331,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
|
|||
|
||||
bool blk_insert_flush(struct request *rq);
|
||||
|
||||
void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e);
|
||||
void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
|
||||
struct elevator_tags *t);
|
||||
void elevator_set_default(struct request_queue *q);
|
||||
void elevator_set_none(struct request_queue *q);
|
||||
|
||||
|
|
|
@ -54,6 +54,8 @@ struct elv_change_ctx {
|
|||
struct elevator_queue *old;
|
||||
/* for registering new elevator */
|
||||
struct elevator_queue *new;
|
||||
/* holds sched tags data */
|
||||
struct elevator_tags *et;
|
||||
};
|
||||
|
||||
static DEFINE_SPINLOCK(elv_list_lock);
|
||||
|
@ -132,7 +134,7 @@ static struct elevator_type *elevator_find_get(const char *name)
|
|||
static const struct kobj_type elv_ktype;
|
||||
|
||||
struct elevator_queue *elevator_alloc(struct request_queue *q,
|
||||
struct elevator_type *e)
|
||||
struct elevator_type *e, struct elevator_tags *et)
|
||||
{
|
||||
struct elevator_queue *eq;
|
||||
|
||||
|
@ -145,10 +147,10 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
|
|||
kobject_init(&eq->kobj, &elv_ktype);
|
||||
mutex_init(&eq->sysfs_lock);
|
||||
hash_init(eq->hash);
|
||||
eq->et = et;
|
||||
|
||||
return eq;
|
||||
}
|
||||
EXPORT_SYMBOL(elevator_alloc);
|
||||
|
||||
static void elevator_release(struct kobject *kobj)
|
||||
{
|
||||
|
@ -166,7 +168,6 @@ static void elevator_exit(struct request_queue *q)
|
|||
lockdep_assert_held(&q->elevator_lock);
|
||||
|
||||
ioc_clear_queue(q);
|
||||
blk_mq_sched_free_rqs(q);
|
||||
|
||||
mutex_lock(&e->sysfs_lock);
|
||||
blk_mq_exit_sched(q, e);
|
||||
|
@ -592,7 +593,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
|
|||
}
|
||||
|
||||
if (new_e) {
|
||||
ret = blk_mq_init_sched(q, new_e);
|
||||
ret = blk_mq_init_sched(q, new_e, ctx->et);
|
||||
if (ret)
|
||||
goto out_unfreeze;
|
||||
ctx->new = q->elevator;
|
||||
|
@ -627,8 +628,10 @@ static void elv_exit_and_release(struct request_queue *q)
|
|||
elevator_exit(q);
|
||||
mutex_unlock(&q->elevator_lock);
|
||||
blk_mq_unfreeze_queue(q, memflags);
|
||||
if (e)
|
||||
if (e) {
|
||||
blk_mq_free_sched_tags(e->et, q->tag_set);
|
||||
kobject_put(&e->kobj);
|
||||
}
|
||||
}
|
||||
|
||||
static int elevator_change_done(struct request_queue *q,
|
||||
|
@ -641,6 +644,7 @@ static int elevator_change_done(struct request_queue *q,
|
|||
&ctx->old->flags);
|
||||
|
||||
elv_unregister_queue(q, ctx->old);
|
||||
blk_mq_free_sched_tags(ctx->old->et, q->tag_set);
|
||||
kobject_put(&ctx->old->kobj);
|
||||
if (enable_wbt)
|
||||
wbt_enable_default(q->disk);
|
||||
|
@ -659,9 +663,16 @@ static int elevator_change_done(struct request_queue *q,
|
|||
static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
|
||||
{
|
||||
unsigned int memflags;
|
||||
struct blk_mq_tag_set *set = q->tag_set;
|
||||
int ret = 0;
|
||||
|
||||
lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);
|
||||
lockdep_assert_held(&set->update_nr_hwq_lock);
|
||||
|
||||
if (strncmp(ctx->name, "none", 4)) {
|
||||
ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues);
|
||||
if (!ctx->et)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
memflags = blk_mq_freeze_queue(q);
|
||||
/*
|
||||
|
@ -681,6 +692,11 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
|
|||
blk_mq_unfreeze_queue(q, memflags);
|
||||
if (!ret)
|
||||
ret = elevator_change_done(q, ctx);
|
||||
/*
|
||||
* Free sched tags if it's allocated but we couldn't switch elevator.
|
||||
*/
|
||||
if (ctx->et && !ctx->new)
|
||||
blk_mq_free_sched_tags(ctx->et, set);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -689,8 +705,10 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
|
|||
* The I/O scheduler depends on the number of hardware queues, this forces a
|
||||
* reattachment when nr_hw_queues changes.
|
||||
*/
|
||||
void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
|
||||
void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e,
|
||||
struct elevator_tags *t)
|
||||
{
|
||||
struct blk_mq_tag_set *set = q->tag_set;
|
||||
struct elv_change_ctx ctx = {};
|
||||
int ret = -ENODEV;
|
||||
|
||||
|
@ -698,6 +716,7 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
|
|||
|
||||
if (e && !blk_queue_dying(q) && blk_queue_registered(q)) {
|
||||
ctx.name = e->elevator_name;
|
||||
ctx.et = t;
|
||||
|
||||
mutex_lock(&q->elevator_lock);
|
||||
/* force to reattach elevator after nr_hw_queue is updated */
|
||||
|
@ -707,6 +726,11 @@ void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e)
|
|||
blk_mq_unfreeze_queue_nomemrestore(q);
|
||||
if (!ret)
|
||||
WARN_ON_ONCE(elevator_change_done(q, &ctx));
|
||||
/*
|
||||
* Free sched tags if it's allocated but we couldn't switch elevator.
|
||||
*/
|
||||
if (t && !ctx.new)
|
||||
blk_mq_free_sched_tags(t, set);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -23,8 +23,17 @@ enum elv_merge {
|
|||
struct blk_mq_alloc_data;
|
||||
struct blk_mq_hw_ctx;
|
||||
|
||||
struct elevator_tags {
|
||||
/* num. of hardware queues for which tags are allocated */
|
||||
unsigned int nr_hw_queues;
|
||||
/* depth used while allocating tags */
|
||||
unsigned int nr_requests;
|
||||
/* shared tag is stored at index 0 */
|
||||
struct blk_mq_tags *tags[];
|
||||
};
|
||||
|
||||
struct elevator_mq_ops {
|
||||
int (*init_sched)(struct request_queue *, struct elevator_type *);
|
||||
int (*init_sched)(struct request_queue *, struct elevator_queue *);
|
||||
void (*exit_sched)(struct elevator_queue *);
|
||||
int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
|
||||
void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
|
||||
|
@ -113,6 +122,7 @@ struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
|
|||
struct elevator_queue
|
||||
{
|
||||
struct elevator_type *type;
|
||||
struct elevator_tags *et;
|
||||
void *elevator_data;
|
||||
struct kobject kobj;
|
||||
struct mutex sysfs_lock;
|
||||
|
@ -152,8 +162,8 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *page);
|
|||
ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count);
|
||||
|
||||
extern bool elv_bio_merge_ok(struct request *, struct bio *);
|
||||
extern struct elevator_queue *elevator_alloc(struct request_queue *,
|
||||
struct elevator_type *);
|
||||
struct elevator_queue *elevator_alloc(struct request_queue *,
|
||||
struct elevator_type *, struct elevator_tags *);
|
||||
|
||||
/*
|
||||
* Helper functions.
|
||||
|
|
|
@ -157,10 +157,7 @@ struct kyber_queue_data {
|
|||
*/
|
||||
struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
|
||||
|
||||
/*
|
||||
* Async request percentage, converted to per-word depth for
|
||||
* sbitmap_get_shallow().
|
||||
*/
|
||||
/* Number of allowed async requests. */
|
||||
unsigned int async_depth;
|
||||
|
||||
struct kyber_cpu_latency __percpu *cpu_latency;
|
||||
|
@ -402,20 +399,13 @@ err:
|
|||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
|
||||
static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq)
|
||||
{
|
||||
struct kyber_queue_data *kqd;
|
||||
struct elevator_queue *eq;
|
||||
|
||||
eq = elevator_alloc(q, e);
|
||||
if (!eq)
|
||||
return -ENOMEM;
|
||||
|
||||
kqd = kyber_queue_data_alloc(q);
|
||||
if (IS_ERR(kqd)) {
|
||||
kobject_put(&eq->kobj);
|
||||
if (IS_ERR(kqd))
|
||||
return PTR_ERR(kqd);
|
||||
}
|
||||
|
||||
blk_stat_enable_accounting(q);
|
||||
|
||||
|
@ -454,10 +444,8 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
|
|||
{
|
||||
struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
|
||||
struct blk_mq_tags *tags = hctx->sched_tags;
|
||||
unsigned int shift = tags->bitmap_tags.sb.shift;
|
||||
|
||||
kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
|
||||
|
||||
kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U;
|
||||
sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
|
||||
}
|
||||
|
||||
|
|
|
@ -487,20 +487,6 @@ unlock:
|
|||
return rq;
|
||||
}
|
||||
|
||||
/*
|
||||
* 'depth' is a number in the range 1..INT_MAX representing a number of
|
||||
* requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since
|
||||
* 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow().
|
||||
* Values larger than q->nr_requests have the same effect as q->nr_requests.
|
||||
*/
|
||||
static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth)
|
||||
{
|
||||
struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags;
|
||||
const unsigned int nrr = hctx->queue->nr_requests;
|
||||
|
||||
return ((qdepth << bt->sb.shift) + nrr - 1) / nrr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Called by __blk_mq_alloc_request(). The shallow_depth value set by this
|
||||
* function is used by __blk_mq_get_tag().
|
||||
|
@ -517,7 +503,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
|
|||
* Throttle asynchronous requests and writes such that these requests
|
||||
* do not block the allocation of synchronous requests.
|
||||
*/
|
||||
data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth);
|
||||
data->shallow_depth = dd->async_depth;
|
||||
}
|
||||
|
||||
/* Called by blk_mq_update_nr_requests(). */
|
||||
|
@ -568,20 +554,14 @@ static void dd_exit_sched(struct elevator_queue *e)
|
|||
/*
|
||||
* initialize elevator private data (deadline_data).
|
||||
*/
|
||||
static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
|
||||
static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq)
|
||||
{
|
||||
struct deadline_data *dd;
|
||||
struct elevator_queue *eq;
|
||||
enum dd_prio prio;
|
||||
int ret = -ENOMEM;
|
||||
|
||||
eq = elevator_alloc(q, e);
|
||||
if (!eq)
|
||||
return ret;
|
||||
|
||||
dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
|
||||
if (!dd)
|
||||
goto put_eq;
|
||||
return -ENOMEM;
|
||||
|
||||
eq->elevator_data = dd;
|
||||
|
||||
|
@ -608,10 +588,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
|
|||
|
||||
q->elevator = eq;
|
||||
return 0;
|
||||
|
||||
put_eq:
|
||||
kobject_put(&eq->kobj);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -700,6 +700,8 @@ static void zloop_free_disk(struct gendisk *disk)
|
|||
struct zloop_device *zlo = disk->private_data;
|
||||
unsigned int i;
|
||||
|
||||
blk_mq_free_tag_set(&zlo->tag_set);
|
||||
|
||||
for (i = 0; i < zlo->nr_zones; i++) {
|
||||
struct zloop_zone *zone = &zlo->zones[i];
|
||||
|
||||
|
@ -1080,7 +1082,6 @@ static int zloop_ctl_remove(struct zloop_options *opts)
|
|||
|
||||
del_gendisk(zlo->disk);
|
||||
put_disk(zlo->disk);
|
||||
blk_mq_free_tag_set(&zlo->tag_set);
|
||||
|
||||
pr_info("Removed device %d\n", opts->id);
|
||||
|
||||
|
|
|
@ -438,7 +438,7 @@ static bool rs_is_reshapable(struct raid_set *rs)
|
|||
/* Return true, if raid set in @rs is recovering */
|
||||
static bool rs_is_recovering(struct raid_set *rs)
|
||||
{
|
||||
return rs->md.recovery_cp < rs->md.dev_sectors;
|
||||
return rs->md.resync_offset < rs->md.dev_sectors;
|
||||
}
|
||||
|
||||
/* Return true, if raid set in @rs is reshaping */
|
||||
|
@ -768,7 +768,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
|
|||
rs->md.layout = raid_type->algorithm;
|
||||
rs->md.new_layout = rs->md.layout;
|
||||
rs->md.delta_disks = 0;
|
||||
rs->md.recovery_cp = MaxSector;
|
||||
rs->md.resync_offset = MaxSector;
|
||||
|
||||
for (i = 0; i < raid_devs; i++)
|
||||
md_rdev_init(&rs->dev[i].rdev);
|
||||
|
@ -912,7 +912,7 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
|
|||
rs->md.external = 0;
|
||||
rs->md.persistent = 1;
|
||||
rs->md.major_version = 2;
|
||||
} else if (rebuild && !rs->md.recovery_cp) {
|
||||
} else if (rebuild && !rs->md.resync_offset) {
|
||||
/*
|
||||
* Without metadata, we will not be able to tell if the array
|
||||
* is in-sync or not - we must assume it is not. Therefore,
|
||||
|
@ -1695,20 +1695,20 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
|
|||
{
|
||||
/* raid0 does not recover */
|
||||
if (rs_is_raid0(rs))
|
||||
rs->md.recovery_cp = MaxSector;
|
||||
rs->md.resync_offset = MaxSector;
|
||||
/*
|
||||
* A raid6 set has to be recovered either
|
||||
* completely or for the grown part to
|
||||
* ensure proper parity and Q-Syndrome
|
||||
*/
|
||||
else if (rs_is_raid6(rs))
|
||||
rs->md.recovery_cp = dev_sectors;
|
||||
rs->md.resync_offset = dev_sectors;
|
||||
/*
|
||||
* Other raid set types may skip recovery
|
||||
* depending on the 'nosync' flag.
|
||||
*/
|
||||
else
|
||||
rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)
|
||||
rs->md.resync_offset = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)
|
||||
? MaxSector : dev_sectors;
|
||||
}
|
||||
|
||||
|
@ -2143,7 +2143,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
|
|||
sb->events = cpu_to_le64(mddev->events);
|
||||
|
||||
sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
|
||||
sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
|
||||
sb->array_resync_offset = cpu_to_le64(mddev->resync_offset);
|
||||
|
||||
sb->level = cpu_to_le32(mddev->level);
|
||||
sb->layout = cpu_to_le32(mddev->layout);
|
||||
|
@ -2334,18 +2334,18 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
|
|||
}
|
||||
|
||||
if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
|
||||
mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
|
||||
mddev->resync_offset = le64_to_cpu(sb->array_resync_offset);
|
||||
|
||||
/*
|
||||
* During load, we set FirstUse if a new superblock was written.
|
||||
* There are two reasons we might not have a superblock:
|
||||
* 1) The raid set is brand new - in which case, all of the
|
||||
* devices must have their In_sync bit set. Also,
|
||||
* recovery_cp must be 0, unless forced.
|
||||
* resync_offset must be 0, unless forced.
|
||||
* 2) This is a new device being added to an old raid set
|
||||
* and the new device needs to be rebuilt - in which
|
||||
* case the In_sync bit will /not/ be set and
|
||||
* recovery_cp must be MaxSector.
|
||||
* resync_offset must be MaxSector.
|
||||
* 3) This is/are a new device(s) being added to an old
|
||||
* raid set during takeover to a higher raid level
|
||||
* to provide capacity for redundancy or during reshape
|
||||
|
@ -2390,8 +2390,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
|
|||
new_devs > 1 ? "s" : "");
|
||||
return -EINVAL;
|
||||
} else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) {
|
||||
DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)",
|
||||
(unsigned long long) mddev->recovery_cp);
|
||||
DMERR("'rebuild' specified while raid set is not in-sync (resync_offset=%llu)",
|
||||
(unsigned long long) mddev->resync_offset);
|
||||
return -EINVAL;
|
||||
} else if (rs_is_reshaping(rs)) {
|
||||
DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)",
|
||||
|
@ -2700,11 +2700,11 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
|
|||
}
|
||||
out:
|
||||
/*
|
||||
* Raise recovery_cp in case data_offset != 0 to
|
||||
* Raise resync_offset in case data_offset != 0 to
|
||||
* avoid false recovery positives in the constructor.
|
||||
*/
|
||||
if (rs->md.recovery_cp < rs->md.dev_sectors)
|
||||
rs->md.recovery_cp += rs->dev[0].rdev.data_offset;
|
||||
if (rs->md.resync_offset < rs->md.dev_sectors)
|
||||
rs->md.resync_offset += rs->dev[0].rdev.data_offset;
|
||||
|
||||
/* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
|
||||
rdev_for_each(rdev, &rs->md) {
|
||||
|
@ -2759,7 +2759,7 @@ static int rs_setup_takeover(struct raid_set *rs)
|
|||
}
|
||||
|
||||
clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
|
||||
mddev->recovery_cp = MaxSector;
|
||||
mddev->resync_offset = MaxSector;
|
||||
|
||||
while (d--) {
|
||||
rdev = &rs->dev[d].rdev;
|
||||
|
@ -2767,7 +2767,7 @@ static int rs_setup_takeover(struct raid_set *rs)
|
|||
if (test_bit(d, (void *) rs->rebuild_disks)) {
|
||||
clear_bit(In_sync, &rdev->flags);
|
||||
clear_bit(Faulty, &rdev->flags);
|
||||
mddev->recovery_cp = rdev->recovery_offset = 0;
|
||||
mddev->resync_offset = rdev->recovery_offset = 0;
|
||||
/* Bitmap has to be created when we do an "up" takeover */
|
||||
set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
|
||||
}
|
||||
|
@ -3225,7 +3225,7 @@ size_check:
|
|||
if (r)
|
||||
goto bad;
|
||||
|
||||
rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors);
|
||||
rs_setup_recovery(rs, rs->md.resync_offset < rs->md.dev_sectors ? rs->md.resync_offset : rs->md.dev_sectors);
|
||||
} else {
|
||||
/* This is no size change or it is shrinking, update size and record in superblocks */
|
||||
r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false);
|
||||
|
@ -3449,7 +3449,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
|
|||
|
||||
} else {
|
||||
if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery))
|
||||
r = mddev->recovery_cp;
|
||||
r = mddev->resync_offset;
|
||||
else
|
||||
r = mddev->curr_resync_completed;
|
||||
|
||||
|
@ -4077,9 +4077,9 @@ static int raid_preresume(struct dm_target *ti)
|
|||
}
|
||||
|
||||
/* Check for any resize/reshape on @rs and adjust/initiate */
|
||||
if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
|
||||
if (mddev->resync_offset && mddev->resync_offset < MaxSector) {
|
||||
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
|
||||
mddev->resync_min = mddev->recovery_cp;
|
||||
mddev->resync_min = mddev->resync_offset;
|
||||
if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags))
|
||||
mddev->resync_max_sectors = mddev->dev_sectors;
|
||||
}
|
||||
|
|
|
@ -1987,12 +1987,12 @@ static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s,
|
|||
|
||||
md_bitmap_set_memory_bits(bitmap, sec, 1);
|
||||
md_bitmap_file_set_bit(bitmap, sec);
|
||||
if (sec < bitmap->mddev->recovery_cp)
|
||||
if (sec < bitmap->mddev->resync_offset)
|
||||
/* We are asserting that the array is dirty,
|
||||
* so move the recovery_cp address back so
|
||||
* so move the resync_offset address back so
|
||||
* that it is obvious that it is dirty
|
||||
*/
|
||||
bitmap->mddev->recovery_cp = sec;
|
||||
bitmap->mddev->resync_offset = sec;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2258,7 +2258,7 @@ static int bitmap_load(struct mddev *mddev)
|
|||
|| bitmap->events_cleared == mddev->events)
|
||||
/* no need to keep dirty bits to optimise a
|
||||
* re-add of a missing device */
|
||||
start = mddev->recovery_cp;
|
||||
start = mddev->resync_offset;
|
||||
|
||||
mutex_lock(&mddev->bitmap_info.mutex);
|
||||
err = md_bitmap_init_from_disk(bitmap, start);
|
||||
|
|
|
@ -337,11 +337,11 @@ static void recover_bitmaps(struct md_thread *thread)
|
|||
md_wakeup_thread(mddev->sync_thread);
|
||||
|
||||
if (hi > 0) {
|
||||
if (lo < mddev->recovery_cp)
|
||||
mddev->recovery_cp = lo;
|
||||
if (lo < mddev->resync_offset)
|
||||
mddev->resync_offset = lo;
|
||||
/* wake up thread to continue resync in case resync
|
||||
* is not finished */
|
||||
if (mddev->recovery_cp != MaxSector) {
|
||||
if (mddev->resync_offset != MaxSector) {
|
||||
/*
|
||||
* clear the REMOTE flag since we will launch
|
||||
* resync thread in current node.
|
||||
|
@ -863,9 +863,9 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
|
|||
lockres_free(bm_lockres);
|
||||
continue;
|
||||
}
|
||||
if ((hi > 0) && (lo < mddev->recovery_cp)) {
|
||||
if ((hi > 0) && (lo < mddev->resync_offset)) {
|
||||
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||
mddev->recovery_cp = lo;
|
||||
mddev->resync_offset = lo;
|
||||
md_check_recovery(mddev);
|
||||
}
|
||||
|
||||
|
@ -1027,7 +1027,7 @@ static int leave(struct mddev *mddev)
|
|||
* Also, we should send BITMAP_NEEDS_SYNC message in
|
||||
* case reshaping is interrupted.
|
||||
*/
|
||||
if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) ||
|
||||
if ((cinfo->slot_number > 0 && mddev->resync_offset != MaxSector) ||
|
||||
(mddev->reshape_position != MaxSector &&
|
||||
test_bit(MD_CLOSING, &mddev->flags)))
|
||||
resync_bitmap(mddev);
|
||||
|
@ -1605,8 +1605,8 @@ static int gather_bitmaps(struct md_rdev *rdev)
|
|||
pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
|
||||
goto out;
|
||||
}
|
||||
if ((hi > 0) && (lo < mddev->recovery_cp))
|
||||
mddev->recovery_cp = lo;
|
||||
if ((hi > 0) && (lo < mddev->resync_offset))
|
||||
mddev->resync_offset = lo;
|
||||
}
|
||||
out:
|
||||
return err;
|
||||
|
|
|
@ -636,6 +636,12 @@ static void __mddev_put(struct mddev *mddev)
|
|||
mddev->ctime || mddev->hold_active)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If array is freed by stopping array, MD_DELETED is set by
|
||||
* do_md_stop(), MD_DELETED is still set here in case mddev is freed
|
||||
* directly by closing a mddev that is created by create_on_open.
|
||||
*/
|
||||
set_bit(MD_DELETED, &mddev->flags);
|
||||
/*
|
||||
* Call queue_work inside the spinlock so that flush_workqueue() after
|
||||
* mddev_find will succeed in waiting for the work to be done.
|
||||
|
@ -1409,13 +1415,13 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru
|
|||
mddev->layout = -1;
|
||||
|
||||
if (sb->state & (1<<MD_SB_CLEAN))
|
||||
mddev->recovery_cp = MaxSector;
|
||||
mddev->resync_offset = MaxSector;
|
||||
else {
|
||||
if (sb->events_hi == sb->cp_events_hi &&
|
||||
sb->events_lo == sb->cp_events_lo) {
|
||||
mddev->recovery_cp = sb->recovery_cp;
|
||||
mddev->resync_offset = sb->resync_offset;
|
||||
} else
|
||||
mddev->recovery_cp = 0;
|
||||
mddev->resync_offset = 0;
|
||||
}
|
||||
|
||||
memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
|
||||
|
@ -1541,13 +1547,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
|
|||
mddev->minor_version = sb->minor_version;
|
||||
if (mddev->in_sync)
|
||||
{
|
||||
sb->recovery_cp = mddev->recovery_cp;
|
||||
sb->resync_offset = mddev->resync_offset;
|
||||
sb->cp_events_hi = (mddev->events>>32);
|
||||
sb->cp_events_lo = (u32)mddev->events;
|
||||
if (mddev->recovery_cp == MaxSector)
|
||||
if (mddev->resync_offset == MaxSector)
|
||||
sb->state = (1<< MD_SB_CLEAN);
|
||||
} else
|
||||
sb->recovery_cp = 0;
|
||||
sb->resync_offset = 0;
|
||||
|
||||
sb->layout = mddev->layout;
|
||||
sb->chunk_size = mddev->chunk_sectors << 9;
|
||||
|
@ -1895,7 +1901,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
|
|||
mddev->bitmap_info.default_space = (4096-1024) >> 9;
|
||||
mddev->reshape_backwards = 0;
|
||||
|
||||
mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
|
||||
mddev->resync_offset = le64_to_cpu(sb->resync_offset);
|
||||
memcpy(mddev->uuid, sb->set_uuid, 16);
|
||||
|
||||
mddev->max_disks = (4096-256)/2;
|
||||
|
@ -2081,7 +2087,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
|
|||
sb->utime = cpu_to_le64((__u64)mddev->utime);
|
||||
sb->events = cpu_to_le64(mddev->events);
|
||||
if (mddev->in_sync)
|
||||
sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
|
||||
sb->resync_offset = cpu_to_le64(mddev->resync_offset);
|
||||
else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
|
||||
sb->resync_offset = cpu_to_le64(MaxSector);
|
||||
else
|
||||
|
@ -2761,7 +2767,7 @@ repeat:
|
|||
/* If this is just a dirty<->clean transition, and the array is clean
|
||||
* and 'events' is odd, we can roll back to the previous clean state */
|
||||
if (nospares
|
||||
&& (mddev->in_sync && mddev->recovery_cp == MaxSector)
|
||||
&& (mddev->in_sync && mddev->resync_offset == MaxSector)
|
||||
&& mddev->can_decrease_events
|
||||
&& mddev->events != 1) {
|
||||
mddev->events--;
|
||||
|
@ -4297,9 +4303,9 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
|
|||
static ssize_t
|
||||
resync_start_show(struct mddev *mddev, char *page)
|
||||
{
|
||||
if (mddev->recovery_cp == MaxSector)
|
||||
if (mddev->resync_offset == MaxSector)
|
||||
return sprintf(page, "none\n");
|
||||
return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
|
||||
return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset);
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
|
@ -4325,7 +4331,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len)
|
|||
err = -EBUSY;
|
||||
|
||||
if (!err) {
|
||||
mddev->recovery_cp = n;
|
||||
mddev->resync_offset = n;
|
||||
if (mddev->pers)
|
||||
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
|
||||
}
|
||||
|
@ -6417,7 +6423,7 @@ static void md_clean(struct mddev *mddev)
|
|||
mddev->external_size = 0;
|
||||
mddev->dev_sectors = 0;
|
||||
mddev->raid_disks = 0;
|
||||
mddev->recovery_cp = 0;
|
||||
mddev->resync_offset = 0;
|
||||
mddev->resync_min = 0;
|
||||
mddev->resync_max = MaxSector;
|
||||
mddev->reshape_position = MaxSector;
|
||||
|
@ -7362,9 +7368,9 @@ int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
|
|||
* openned
|
||||
*/
|
||||
if (info->state & (1<<MD_SB_CLEAN))
|
||||
mddev->recovery_cp = MaxSector;
|
||||
mddev->resync_offset = MaxSector;
|
||||
else
|
||||
mddev->recovery_cp = 0;
|
||||
mddev->resync_offset = 0;
|
||||
mddev->persistent = ! info->not_persistent;
|
||||
mddev->external = 0;
|
||||
|
||||
|
@@ -8303,7 +8309,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
 seq_printf(seq, "\tresync=REMOTE");
 return 1;
 }
-if (mddev->recovery_cp < MaxSector) {
+if (mddev->resync_offset < MaxSector) {
 seq_printf(seq, "\tresync=PENDING");
 return 1;
 }
@@ -8946,7 +8952,7 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
 return mddev->resync_min;
 case ACTION_RESYNC:
 if (!mddev->bitmap)
-return mddev->recovery_cp;
+return mddev->resync_offset;
 return 0;
 case ACTION_RESHAPE:
 /*
@@ -9184,8 +9190,8 @@ void md_do_sync(struct md_thread *thread)
 atomic_read(&mddev->recovery_active) == 0);
 mddev->curr_resync_completed = j;
 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
-j > mddev->recovery_cp)
-mddev->recovery_cp = j;
+j > mddev->resync_offset)
+mddev->resync_offset = j;
 update_time = jiffies;
 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 sysfs_notify_dirent_safe(mddev->sysfs_completed);
@@ -9305,19 +9311,19 @@ void md_do_sync(struct md_thread *thread)
 mddev->curr_resync > MD_RESYNC_ACTIVE) {
 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
-if (mddev->curr_resync >= mddev->recovery_cp) {
+if (mddev->curr_resync >= mddev->resync_offset) {
 pr_debug("md: checkpointing %s of %s.\n",
 desc, mdname(mddev));
 if (test_bit(MD_RECOVERY_ERROR,
 &mddev->recovery))
-mddev->recovery_cp =
+mddev->resync_offset =
 mddev->curr_resync_completed;
 else
-mddev->recovery_cp =
+mddev->resync_offset =
 mddev->curr_resync;
 }
 } else
-mddev->recovery_cp = MaxSector;
+mddev->resync_offset = MaxSector;
 } else {
 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 mddev->curr_resync = MaxSector;
@@ -9421,6 +9427,12 @@ static bool rdev_is_spare(struct md_rdev *rdev)
 
 static bool rdev_addable(struct md_rdev *rdev)
 {
+struct mddev *mddev;
+
+mddev = READ_ONCE(rdev->mddev);
+if (!mddev)
+return false;
+
 /* rdev is already used, don't add it again. */
 if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
 test_bit(Faulty, &rdev->flags))
@@ -9431,7 +9443,7 @@ static bool rdev_addable(struct md_rdev *rdev)
 return true;
 
 /* Allow to add if array is read-write. */
-if (md_is_rdwr(rdev->mddev))
+if (md_is_rdwr(mddev))
 return true;
 
 /*
@@ -9533,7 +9545,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
 }
 
 /* Check if resync is in progress. */
-if (mddev->recovery_cp < MaxSector) {
+if (mddev->resync_offset < MaxSector) {
 remove_spares(mddev, NULL);
 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
@@ -9714,7 +9726,7 @@ void md_check_recovery(struct mddev *mddev)
 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
 (mddev->external == 0 && mddev->safemode == 1) ||
 (mddev->safemode == 2
-&& !mddev->in_sync && mddev->recovery_cp == MaxSector)
+&& !mddev->in_sync && mddev->resync_offset == MaxSector)
 ))
 return;
 
@@ -9771,8 +9783,8 @@ void md_check_recovery(struct mddev *mddev)
 * remove disk.
 */
 rdev_for_each_safe(rdev, tmp, mddev) {
-if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
-rdev->raid_disk < 0)
+if (rdev->raid_disk < 0 &&
+test_and_clear_bit(ClusterRemove, &rdev->flags))
 md_kick_rdev_from_array(rdev);
 }
 }
@@ -10078,8 +10090,11 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
 
 /* Check for change of roles in the active devices */
 rdev_for_each_safe(rdev2, tmp, mddev) {
-if (test_bit(Faulty, &rdev2->flags))
+if (test_bit(Faulty, &rdev2->flags)) {
+if (test_bit(ClusterRemove, &rdev2->flags))
+set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 continue;
+}
 
 /* Check if the roles changed */
 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

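The rdev_addable() hunks above snapshot rdev->mddev once with READ_ONCE() and then test and dereference only that local copy, which is what makes the helper usable from RCU context. Below is a small standalone C sketch of that read-once pattern; the stub types and the sketch function are invented for illustration and are not the kernel structures.

#include <stdio.h>

struct mddev_stub { int rw; };
struct rdev_stub { struct mddev_stub *mddev; };

/* Load the possibly-concurrently-cleared pointer once, then use the copy. */
static int rdev_addable_sketch(struct rdev_stub *rdev)
{
	struct mddev_stub *mddev = rdev->mddev;	/* the kernel uses READ_ONCE() here */

	if (!mddev)
		return 0;		/* device already detached from any array */
	return mddev->rw;		/* every later access goes through the local copy */
}

int main(void)
{
	struct mddev_stub m = { 1 };
	struct rdev_stub attached = { &m }, detached = { NULL };

	printf("attached: %d, detached: %d\n",
	       rdev_addable_sketch(&attached), rdev_addable_sketch(&detached));
	return 0;
}
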
@@ -523,7 +523,7 @@ struct mddev {
 unsigned long normal_io_events; /* IO event timestamp */
 atomic_t recovery_active; /* blocks scheduled, but not written */
 wait_queue_head_t recovery_wait;
-sector_t recovery_cp;
+sector_t resync_offset;
 sector_t resync_min; /* user requested sync
 * starts here */
 sector_t resync_max; /* resync should pause

@@ -674,7 +674,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
 mddev->raid_disks--;
 mddev->delta_disks = -1;
 /* make sure it will be not marked as dirty */
-mddev->recovery_cp = MaxSector;
+mddev->resync_offset = MaxSector;
 mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
 
 create_strip_zones(mddev, &priv_conf);
@@ -717,7 +717,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
 mddev->raid_disks += mddev->delta_disks;
 mddev->degraded = 0;
 /* make sure it will be not marked as dirty */
-mddev->recovery_cp = MaxSector;
+mddev->resync_offset = MaxSector;
 mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
 
 create_strip_zones(mddev, &priv_conf);
@@ -760,7 +760,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
 mddev->delta_disks = 1 - mddev->raid_disks;
 mddev->raid_disks = 1;
 /* make sure it will be not marked as dirty */
-mddev->recovery_cp = MaxSector;
+mddev->resync_offset = MaxSector;
 mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
 
 create_strip_zones(mddev, &priv_conf);

@@ -283,7 +283,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev,
 static inline bool raid1_should_read_first(struct mddev *mddev,
 sector_t this_sector, int len)
 {
-if ((mddev->recovery_cp < this_sector + len))
+if ((mddev->resync_offset < this_sector + len))
 return true;
 
 if (mddev_is_clustered(mddev) &&

@@ -127,10 +127,9 @@ static inline struct r1bio *get_resync_r1bio(struct bio *bio)
 return get_resync_pages(bio)->raid_bio;
 }
 
-static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
+static void *r1bio_pool_alloc(gfp_t gfp_flags, struct r1conf *conf)
 {
-struct pool_info *pi = data;
-int size = offsetof(struct r1bio, bios[pi->raid_disks]);
+int size = offsetof(struct r1bio, bios[conf->raid_disks * 2]);
 
 /* allocate a r1bio with room for raid_disks entries in the bios array */
 return kzalloc(size, gfp_flags);
@@ -145,18 +144,18 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
-struct pool_info *pi = data;
+struct r1conf *conf = data;
 struct r1bio *r1_bio;
 struct bio *bio;
 int need_pages;
 int j;
 struct resync_pages *rps;
 
-r1_bio = r1bio_pool_alloc(gfp_flags, pi);
+r1_bio = r1bio_pool_alloc(gfp_flags, conf);
 if (!r1_bio)
 return NULL;
 
-rps = kmalloc_array(pi->raid_disks, sizeof(struct resync_pages),
+rps = kmalloc_array(conf->raid_disks * 2, sizeof(struct resync_pages),
 gfp_flags);
 if (!rps)
 goto out_free_r1bio;
@@ -164,7 +163,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 /*
 * Allocate bios : 1 for reading, n-1 for writing
 */
-for (j = pi->raid_disks ; j-- ; ) {
+for (j = conf->raid_disks * 2; j-- ; ) {
 bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
 if (!bio)
 goto out_free_bio;
@@ -177,11 +176,11 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 * If this is a user-requested check/repair, allocate
 * RESYNC_PAGES for each bio.
 */
-if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
-need_pages = pi->raid_disks;
+if (test_bit(MD_RECOVERY_REQUESTED, &conf->mddev->recovery))
+need_pages = conf->raid_disks * 2;
 else
 need_pages = 1;
-for (j = 0; j < pi->raid_disks; j++) {
+for (j = 0; j < conf->raid_disks * 2; j++) {
 struct resync_pages *rp = &rps[j];
 
 bio = r1_bio->bios[j];
@@ -207,7 +206,7 @@ out_free_pages:
 resync_free_pages(&rps[j]);
 
 out_free_bio:
-while (++j < pi->raid_disks) {
+while (++j < conf->raid_disks * 2) {
 bio_uninit(r1_bio->bios[j]);
 kfree(r1_bio->bios[j]);
 }
@@ -220,12 +219,12 @@ out_free_r1bio:
 
 static void r1buf_pool_free(void *__r1_bio, void *data)
 {
-struct pool_info *pi = data;
+struct r1conf *conf = data;
 int i;
 struct r1bio *r1bio = __r1_bio;
 struct resync_pages *rp = NULL;
 
-for (i = pi->raid_disks; i--; ) {
+for (i = conf->raid_disks * 2; i--; ) {
 rp = get_resync_pages(r1bio->bios[i]);
 resync_free_pages(rp);
 bio_uninit(r1bio->bios[i]);
@@ -255,7 +254,7 @@ static void free_r1bio(struct r1bio *r1_bio)
 struct r1conf *conf = r1_bio->mddev->private;
 
 put_all_bios(conf, r1_bio);
-mempool_free(r1_bio, &conf->r1bio_pool);
+mempool_free(r1_bio, conf->r1bio_pool);
 }
 
 static void put_buf(struct r1bio *r1_bio)
@@ -1305,9 +1304,8 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio)
 struct r1conf *conf = mddev->private;
 struct r1bio *r1_bio;
 
-r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO);
-/* Ensure no bio records IO_BLOCKED */
-memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0]));
+r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+memset(r1_bio, 0, offsetof(struct r1bio, bios[conf->raid_disks * 2]));
 init_r1bio(r1_bio, mddev, bio);
 return r1_bio;
 }
@@ -2747,7 +2745,7 @@ static int init_resync(struct r1conf *conf)
 BUG_ON(mempool_initialized(&conf->r1buf_pool));
 
 return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc,
-r1buf_pool_free, conf->poolinfo);
+r1buf_pool_free, conf);
 }
 
 static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
@@ -2757,7 +2755,7 @@ static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
 struct bio *bio;
 int i;
 
-for (i = conf->poolinfo->raid_disks; i--; ) {
+for (i = conf->raid_disks * 2; i--; ) {
 bio = r1bio->bios[i];
 rps = bio->bi_private;
 bio_reset(bio, NULL, 0);
@@ -2822,7 +2820,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 }
 
 if (mddev->bitmap == NULL &&
-mddev->recovery_cp == MaxSector &&
+mddev->resync_offset == MaxSector &&
 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 conf->fullsync == 0) {
 *skipped = 1;
@@ -3085,6 +3083,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 int i;
 struct raid1_info *disk;
 struct md_rdev *rdev;
+size_t r1bio_size;
 int err = -ENOMEM;
 
 conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
@@ -3121,21 +3120,15 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 if (!conf->tmppage)
 goto abort;
 
-conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
-if (!conf->poolinfo)
-goto abort;
-conf->poolinfo->raid_disks = mddev->raid_disks * 2;
-err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc,
-rbio_pool_free, conf->poolinfo);
-if (err)
+r1bio_size = offsetof(struct r1bio, bios[mddev->raid_disks * 2]);
+conf->r1bio_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS, r1bio_size);
+if (!conf->r1bio_pool)
 goto abort;
 
 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
 if (err)
 goto abort;
 
-conf->poolinfo->mddev = mddev;
-
 err = -EINVAL;
 spin_lock_init(&conf->device_lock);
 conf->raid_disks = mddev->raid_disks;
@@ -3198,10 +3191,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 
 abort:
 if (conf) {
-mempool_exit(&conf->r1bio_pool);
+mempool_destroy(conf->r1bio_pool);
 kfree(conf->mirrors);
 safe_put_page(conf->tmppage);
-kfree(conf->poolinfo);
 kfree(conf->nr_pending);
 kfree(conf->nr_waiting);
 kfree(conf->nr_queued);
@@ -3282,9 +3274,9 @@ static int raid1_run(struct mddev *mddev)
 }
 
 if (conf->raid_disks - mddev->degraded == 1)
-mddev->recovery_cp = MaxSector;
+mddev->resync_offset = MaxSector;
 
-if (mddev->recovery_cp != MaxSector)
+if (mddev->resync_offset != MaxSector)
 pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
 mdname(mddev));
 pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
@@ -3311,10 +3303,9 @@ static void raid1_free(struct mddev *mddev, void *priv)
 {
 struct r1conf *conf = priv;
 
-mempool_exit(&conf->r1bio_pool);
+mempool_destroy(conf->r1bio_pool);
 kfree(conf->mirrors);
 safe_put_page(conf->tmppage);
-kfree(conf->poolinfo);
 kfree(conf->nr_pending);
 kfree(conf->nr_waiting);
 kfree(conf->nr_queued);
@@ -3345,8 +3336,8 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
 
 md_set_array_sectors(mddev, newsize);
 if (sectors > mddev->dev_sectors &&
-mddev->recovery_cp > mddev->dev_sectors) {
-mddev->recovery_cp = mddev->dev_sectors;
+mddev->resync_offset > mddev->dev_sectors) {
+mddev->resync_offset = mddev->dev_sectors;
 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 }
 mddev->dev_sectors = sectors;
@@ -3367,17 +3358,13 @@ static int raid1_reshape(struct mddev *mddev)
 * At the same time, we "pack" the devices so that all the missing
 * devices have the higher raid_disk numbers.
 */
-mempool_t newpool, oldpool;
-struct pool_info *newpoolinfo;
+mempool_t *newpool, *oldpool;
+size_t new_r1bio_size;
 struct raid1_info *newmirrors;
 struct r1conf *conf = mddev->private;
 int cnt, raid_disks;
 unsigned long flags;
 int d, d2;
-int ret;
 
-memset(&newpool, 0, sizeof(newpool));
-memset(&oldpool, 0, sizeof(oldpool));
-
 /* Cannot change chunk_size, layout, or level */
 if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@@ -3403,24 +3390,16 @@ static int raid1_reshape(struct mddev *mddev)
 return -EBUSY;
 }
 
-newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
-if (!newpoolinfo)
+new_r1bio_size = offsetof(struct r1bio, bios[raid_disks * 2]);
+newpool = mempool_create_kmalloc_pool(NR_RAID_BIOS, new_r1bio_size);
+if (!newpool) {
 return -ENOMEM;
-newpoolinfo->mddev = mddev;
-newpoolinfo->raid_disks = raid_disks * 2;
-
-ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc,
-rbio_pool_free, newpoolinfo);
-if (ret) {
-kfree(newpoolinfo);
-return ret;
 }
 newmirrors = kzalloc(array3_size(sizeof(struct raid1_info),
 raid_disks, 2),
 GFP_KERNEL);
 if (!newmirrors) {
-kfree(newpoolinfo);
-mempool_exit(&newpool);
+mempool_destroy(newpool);
 return -ENOMEM;
 }
 
@@ -3429,7 +3408,6 @@ static int raid1_reshape(struct mddev *mddev)
 /* ok, everything is stopped */
 oldpool = conf->r1bio_pool;
 conf->r1bio_pool = newpool;
-init_waitqueue_head(&conf->r1bio_pool.wait);
 
 for (d = d2 = 0; d < conf->raid_disks; d++) {
 struct md_rdev *rdev = conf->mirrors[d].rdev;
@@ -3446,8 +3424,6 @@ static int raid1_reshape(struct mddev *mddev)
 }
 kfree(conf->mirrors);
 conf->mirrors = newmirrors;
-kfree(conf->poolinfo);
-conf->poolinfo = newpoolinfo;
 
 spin_lock_irqsave(&conf->device_lock, flags);
 mddev->degraded += (raid_disks - conf->raid_disks);
@@ -3461,7 +3437,7 @@ static int raid1_reshape(struct mddev *mddev)
 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 md_wakeup_thread(mddev->thread);
 
-mempool_exit(&oldpool);
+mempool_destroy(oldpool);
 return 0;
 }
 
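The raid1 conversion above removes struct pool_info: each r1bio is now sized straight from the mirror count with offsetof(struct r1bio, bios[raid_disks * 2]) on the trailing bios[] array, and r1bio_pool becomes a kmalloc-backed mempool pointer created with mempool_create_kmalloc_pool(). The following userspace sketch shows only the sizing idea with invented stand-in types (not the kernel definitions); like the kernel code it relies on the GCC/Clang offsetof builtin accepting a runtime array index.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct fake_bio;

struct fake_r1bio {
	unsigned long sector;
	int disks;
	struct fake_bio *bios[];	/* flexible array: one slot per disk and per replacement */
};

int main(void)
{
	int raid_disks = 4;
	/* twice raid_disks: the second half of bios[] is reserved for replacements */
	size_t r1bio_size = offsetof(struct fake_r1bio, bios[raid_disks * 2]);
	struct fake_r1bio *r1bio = calloc(1, r1bio_size);

	if (!r1bio)
		return 1;
	printf("r1bio with %d bio slots needs %zu bytes\n", raid_disks * 2, r1bio_size);
	free(r1bio);
	return 0;
}
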
@@ -49,22 +49,6 @@ struct raid1_info {
 sector_t seq_start;
 };
 
-/*
-* memory pools need a pointer to the mddev, so they can force an unplug
-* when memory is tight, and a count of the number of drives that the
-* pool was allocated for, so they know how much to allocate and free.
-* mddev->raid_disks cannot be used, as it can change while a pool is active
-* These two datums are stored in a kmalloced struct.
-* The 'raid_disks' here is twice the raid_disks in r1conf.
-* This allows space for each 'real' device can have a replacement in the
-* second half of the array.
-*/
-
-struct pool_info {
-struct mddev *mddev;
-int raid_disks;
-};
-
 struct r1conf {
 struct mddev *mddev;
 struct raid1_info *mirrors; /* twice 'raid_disks' to
@@ -114,11 +98,7 @@ struct r1conf {
 */
 int recovery_disabled;
 
-/* poolinfo contains information about the content of the
-* mempools - it changes when the array grows or shrinks
-*/
-struct pool_info *poolinfo;
-mempool_t r1bio_pool;
+mempool_t *r1bio_pool;
 mempool_t r1buf_pool;
 
 struct bio_set bio_split;

@@ -2117,7 +2117,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 int last = conf->geo.raid_disks - 1;
 struct raid10_info *p;
 
-if (mddev->recovery_cp < MaxSector)
+if (mddev->resync_offset < MaxSector)
 /* only hot-add to in-sync arrays, as recovery is
 * very different from resync
 */
@@ -3185,7 +3185,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 * of a clean array, like RAID1 does.
 */
 if (mddev->bitmap == NULL &&
-mddev->recovery_cp == MaxSector &&
+mddev->resync_offset == MaxSector &&
 mddev->reshape_position == MaxSector &&
 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
@@ -4145,7 +4145,7 @@ static int raid10_run(struct mddev *mddev)
 disk->recovery_disabled = mddev->recovery_disabled - 1;
 }
 
-if (mddev->recovery_cp != MaxSector)
+if (mddev->resync_offset != MaxSector)
 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
 mdname(mddev));
 pr_info("md/raid10:%s: active with %d out of %d devices\n",
@@ -4245,8 +4245,8 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
 
 md_set_array_sectors(mddev, size);
 if (sectors > mddev->dev_sectors &&
-mddev->recovery_cp > oldsize) {
-mddev->recovery_cp = oldsize;
+mddev->resync_offset > oldsize) {
+mddev->resync_offset = oldsize;
 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 }
 calc_sectors(conf, sectors);
@@ -4275,7 +4275,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
 mddev->delta_disks = mddev->raid_disks;
 mddev->raid_disks *= 2;
 /* make sure it will be not marked as dirty */
-mddev->recovery_cp = MaxSector;
+mddev->resync_offset = MaxSector;
 mddev->dev_sectors = size;
 
 conf = setup_conf(mddev);
@@ -5087,8 +5087,8 @@ static void raid10_finish_reshape(struct mddev *mddev)
 return;
 
 if (mddev->delta_disks > 0) {
-if (mddev->recovery_cp > mddev->resync_max_sectors) {
-mddev->recovery_cp = mddev->resync_max_sectors;
+if (mddev->resync_offset > mddev->resync_max_sectors) {
+mddev->resync_offset = mddev->resync_max_sectors;
 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 }
 mddev->resync_max_sectors = mddev->array_sectors;

@@ -1163,7 +1163,7 @@ static int ppl_load_distributed(struct ppl_log *log)
 le64_to_cpu(pplhdr->generation));
 
 /* attempt to recover from log if we are starting a dirty array */
-if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector)
+if (pplhdr && !mddev->pers && mddev->resync_offset != MaxSector)
 ret = ppl_recover(log, pplhdr, pplhdr_offset);
 
 /* write empty header if we are starting the array */
@@ -1422,14 +1422,14 @@ int ppl_init_log(struct r5conf *conf)
 
 if (ret) {
 goto err;
-} else if (!mddev->pers && mddev->recovery_cp == 0 &&
+} else if (!mddev->pers && mddev->resync_offset == 0 &&
 ppl_conf->recovered_entries > 0 &&
 ppl_conf->mismatch_count == 0) {
 /*
 * If we are starting a dirty array and the recovery succeeds
 * without any issues, set the array as clean.
 */
-mddev->recovery_cp = MaxSector;
+mddev->resync_offset = MaxSector;
 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 } else if (mddev->pers && ppl_conf->mismatch_count > 0) {
 /* no mismatch allowed when enabling PPL for a running array */

@@ -3740,7 +3740,7 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
 && !test_bit(Faulty, &rdev->flags)
 && !test_bit(In_sync, &rdev->flags)
 && (rdev->recovery_offset <= sh->sector
-|| rdev->mddev->recovery_cp <= sh->sector))
+|| rdev->mddev->resync_offset <= sh->sector))
 rv = 1;
 return rv;
 }
@@ -3832,7 +3832,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 * is missing/faulty, then we need to read everything we can.
 */
 if (!force_rcw &&
-sh->sector < sh->raid_conf->mddev->recovery_cp)
+sh->sector < sh->raid_conf->mddev->resync_offset)
 /* reconstruct-write isn't being forced */
 return 0;
 for (i = 0; i < s->failed && i < 2; i++) {
@@ -4097,7 +4097,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 int disks)
 {
 int rmw = 0, rcw = 0, i;
-sector_t recovery_cp = conf->mddev->recovery_cp;
+sector_t resync_offset = conf->mddev->resync_offset;
 
 /* Check whether resync is now happening or should start.
 * If yes, then the array is dirty (after unclean shutdown or
@@ -4107,14 +4107,14 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 * generate correct data from the parity.
 */
 if (conf->rmw_level == PARITY_DISABLE_RMW ||
-(recovery_cp < MaxSector && sh->sector >= recovery_cp &&
+(resync_offset < MaxSector && sh->sector >= resync_offset &&
 s->failed == 0)) {
 /* Calculate the real rcw later - for now make it
 * look like rcw is cheaper
 */
 rcw = 1; rmw = 2;
-pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
-conf->rmw_level, (unsigned long long)recovery_cp,
+pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n",
+conf->rmw_level, (unsigned long long)resync_offset,
 (unsigned long long)sh->sector);
 } else for (i = disks; i--; ) {
 /* would I have to read this buffer for read_modify_write */
@@ -4770,14 +4770,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 if (test_bit(STRIPE_SYNCING, &sh->state)) {
 /* If there is a failed device being replaced,
 * we must be recovering.
-* else if we are after recovery_cp, we must be syncing
+* else if we are after resync_offset, we must be syncing
 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
 * else we can only be replacing
 * sync and recovery both need to read all devices, and so
 * use the same flag.
 */
 if (do_recovery ||
-sh->sector >= conf->mddev->recovery_cp ||
+sh->sector >= conf->mddev->resync_offset ||
 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
 s->syncing = 1;
 else
@@ -7780,7 +7780,7 @@ static int raid5_run(struct mddev *mddev)
 int first = 1;
 int ret = -EIO;
 
-if (mddev->recovery_cp != MaxSector)
+if (mddev->resync_offset != MaxSector)
 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
 mdname(mddev));
 
@@ -7921,7 +7921,7 @@ static int raid5_run(struct mddev *mddev)
 mdname(mddev));
 mddev->ro = 1;
 set_disk_ro(mddev->gendisk, 1);
-} else if (mddev->recovery_cp == MaxSector)
+} else if (mddev->resync_offset == MaxSector)
 set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
 }
 
@@ -7988,7 +7988,7 @@ static int raid5_run(struct mddev *mddev)
 mddev->resync_max_sectors = mddev->dev_sectors;
 
 if (mddev->degraded > dirty_parity_disks &&
-mddev->recovery_cp != MaxSector) {
+mddev->resync_offset != MaxSector) {
 if (test_bit(MD_HAS_PPL, &mddev->flags))
 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
 mdname(mddev));
@@ -8328,8 +8328,8 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
 
 md_set_array_sectors(mddev, newsize);
 if (sectors > mddev->dev_sectors &&
-mddev->recovery_cp > mddev->dev_sectors) {
-mddev->recovery_cp = mddev->dev_sectors;
+mddev->resync_offset > mddev->dev_sectors) {
+mddev->resync_offset = mddev->dev_sectors;
 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 }
 mddev->dev_sectors = sectors;
@@ -8423,7 +8423,7 @@ static int raid5_start_reshape(struct mddev *mddev)
 return -EINVAL;
 
 /* raid5 can't handle concurrent reshape and recovery */
-if (mddev->recovery_cp < MaxSector)
+if (mddev->resync_offset < MaxSector)
 return -EBUSY;
 for (i = 0; i < conf->raid_disks; i++)
 if (conf->disks[i].replacement)
@@ -8648,7 +8648,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
 mddev->raid_disks += 1;
 mddev->delta_disks = 1;
 /* make sure it will be not marked as dirty */
-mddev->recovery_cp = MaxSector;
+mddev->resync_offset = MaxSector;
 
 return setup_conf(mddev);
 }

@@ -742,7 +742,7 @@ static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl,
 "%s: qid %d failed to generate digest, error %d\n",
 __func__, chap->qid, ret);
 goto out_free_psk;
-};
+}
 dev_dbg(ctrl->device, "%s: generated digest %s\n",
 __func__, digest);
 ret = nvme_auth_derive_tls_psk(chap->hash_id, psk, psk_len,
@@ -752,7 +752,7 @@ static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl,
 "%s: qid %d failed to derive TLS psk, error %d\n",
 __func__, chap->qid, ret);
 goto out_free_digest;
-};
+}
 
 tls_key = nvme_tls_psk_refresh(ctrl->opts->keyring,
 ctrl->opts->host->nqn,

@@ -3158,6 +3158,11 @@ static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
 return ctrl->opts && ctrl->opts->discovery_nqn;
 }
 
+static inline bool nvme_admin_ctrl(struct nvme_ctrl *ctrl)
+{
+return ctrl->cntrltype == NVME_CTRL_ADMIN;
+}
+
 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 {
@@ -3670,6 +3675,17 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
 if (ret)
 return ret;
 
+if (nvme_admin_ctrl(ctrl)) {
+/*
+* An admin controller has one admin queue, but no I/O queues.
+* Override queue_count so it only creates an admin queue.
+*/
+dev_dbg(ctrl->device,
+"Subsystem %s is an administrative controller",
+ctrl->subsys->subnqn);
+ctrl->queue_count = 1;
+}
+
 ret = nvme_configure_apst(ctrl);
 if (ret < 0)
 return ret;

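The core.c hunks above add nvme_admin_ctrl() and clamp queue_count to 1 when a controller identifies itself as an administrative controller, since such a controller exposes no I/O queues. A rough standalone sketch of that gate follows; the struct and enum are simplified stand-ins (the values mirror the NVMe CNTRLTYPE field as I understand it), not the driver's types.

#include <stdio.h>

enum ctrl_type { CTRL_IO = 1, CTRL_DISCOVERY = 2, CTRL_ADMIN = 3 };

struct fake_ctrl {
	enum ctrl_type cntrltype;
	unsigned int queue_count;	/* admin queue + I/O queues */
};

static void finish_init(struct fake_ctrl *ctrl)
{
	if (ctrl->cntrltype == CTRL_ADMIN)
		ctrl->queue_count = 1;	/* admin queue only, no I/O queues */
}

int main(void)
{
	struct fake_ctrl io = { CTRL_IO, 9 }, admin = { CTRL_ADMIN, 9 };

	finish_init(&io);
	finish_init(&admin);
	printf("I/O controller: %u queues, admin controller: %u queue\n",
	       io.queue_count, admin.queue_count);
	return 0;
}
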
@@ -1363,7 +1363,7 @@ nvme_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status)
 * down, and the related FC-NVME Association ID and Connection IDs
 * become invalid.
 *
-* The behavior of the fc-nvme initiator is such that it's
+* The behavior of the fc-nvme initiator is such that its
 * understanding of the association and connections will implicitly
 * be torn down. The action is implicit as it may be due to a loss of
 * connectivity with the fc-nvme target, so you may never get a
@@ -2777,7 +2777,7 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
 * as WRITE ZEROES will return a non-zero rq payload_bytes yet
 * there is no actual payload to be transferred.
 * To get it right, key data transmission on there being 1 or
-* more physical segments in the sg list. If there is no
+* more physical segments in the sg list. If there are no
 * physical segments, there is no payload.
 */
 if (blk_rq_nr_phys_segments(rq)) {

@@ -935,7 +935,7 @@ static blk_status_t nvme_pci_setup_data_sgl(struct request *req,
 
 nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped);
 if (unlikely(iter->status))
-nvme_free_sgls(req);
+nvme_unmap_data(req);
 return iter->status;
 }
 
@@ -2179,7 +2179,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
 
 /*
 * Only start IO queues for which we have allocated the tagset
-* and limitted it to the available queues. On reconnects, the
+* and limited it to the available queues. On reconnects, the
 * queue number might have changed.
 */
 nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);

@@ -1960,24 +1960,24 @@ static int __init nvmet_init(void)
 if (!nvmet_wq)
 goto out_free_buffered_work_queue;
 
-error = nvmet_init_discovery();
+error = nvmet_init_debugfs();
 if (error)
 goto out_free_nvmet_work_queue;
 
-error = nvmet_init_debugfs();
-if (error)
-goto out_exit_discovery;
-
-error = nvmet_init_configfs();
+error = nvmet_init_discovery();
 if (error)
 goto out_exit_debugfs;
+
+error = nvmet_init_configfs();
+if (error)
+goto out_exit_discovery;
 
 return 0;
 
-out_exit_debugfs:
-nvmet_exit_debugfs();
 out_exit_discovery:
 nvmet_exit_discovery();
+out_exit_debugfs:
+nvmet_exit_debugfs();
 out_free_nvmet_work_queue:
 destroy_workqueue(nvmet_wq);
 out_free_buffered_work_queue:
@@ -1992,8 +1992,8 @@ out_destroy_bvec_cache:
 static void __exit nvmet_exit(void)
 {
 nvmet_exit_configfs();
-nvmet_exit_debugfs();
 nvmet_exit_discovery();
+nvmet_exit_debugfs();
 ida_destroy(&cntlid_ida);
 destroy_workqueue(nvmet_wq);
 destroy_workqueue(buffered_io_wq);

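The nvmet_init()/nvmet_exit() hunks above bring debugfs up before the discovery subsystem and tear the two down in the opposite order, so the error-unwind labels swap as well. A self-contained C sketch of that general pattern, with placeholder unit names rather than the nvmet symbols: initialize in dependency order, unwind strictly in reverse on failure.

#include <stdio.h>

static int init_a(void) { puts("init A"); return 0; }
static void exit_a(void) { puts("exit A"); }
static int init_b(void) { puts("init B (may use A)"); return 0; }
static void exit_b(void) { puts("exit B"); }
static int init_c(void) { puts("init C (may use A and B)"); return -1; /* simulate failure */ }

static int module_init_sketch(void)
{
	int err;

	err = init_a();
	if (err)
		goto out;
	err = init_b();
	if (err)
		goto out_exit_a;
	err = init_c();
	if (err)
		goto out_exit_b;
	return 0;

out_exit_b:
	exit_b();		/* last thing brought up is torn down first */
out_exit_a:
	exit_a();
out:
	return err;
}

int main(void)
{
	return module_init_sketch() ? 1 : 0;
}

Keeping the unwind order the exact mirror of the init order is what guarantees nothing is torn down while a later-initialized unit might still reference it.
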
@@ -459,7 +459,7 @@ nvmet_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status)
 * down, and the related FC-NVME Association ID and Connection IDs
 * become invalid.
 *
-* The behavior of the fc-nvme target is such that it's
+* The behavior of the fc-nvme target is such that its
 * understanding of the association and connections will implicitly
 * be torn down. The action is implicit as it may be due to a loss of
 * connectivity with the fc-nvme host, so the target may never get a
@@ -2313,7 +2313,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
 ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq);
 if (ret) {
 /*
-* should be ok to set w/o lock as its in the thread of
+* should be ok to set w/o lock as it's in the thread of
 * execution (not an async timer routine) and doesn't
 * contend with any clearing action
 */
@@ -2629,7 +2629,7 @@ transport_error:
 * and the api of the FC LLDD which may issue a hw command to send the
 * response, but the LLDD may not get the hw completion for that command
 * and upcall the nvmet_fc layer before a new command may be
-* asynchronously received - its possible for a command to be received
+* asynchronously received - it's possible for a command to be received
 * before the LLDD and nvmet_fc have recycled the job structure. It gives
 * the appearance of more commands received than fits in the sq.
 * To alleviate this scenario, a temporary queue is maintained in the

@@ -533,6 +533,8 @@ u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
 case NVME_FEAT_HOST_ID:
 req->execute = nvmet_execute_get_features;
 return NVME_SC_SUCCESS;
+case NVME_FEAT_FDP:
+return nvmet_setup_passthru_command(req);
 default:
 return nvmet_passthru_get_set_features(req);
 }

@@ -1731,7 +1731,7 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids. In this case
 * we nullify the priv to prevent double cm_id destruction and destroying
-* the cm_id implicitely by returning a non-zero rc to the callout.
+* the cm_id implicitly by returning a non-zero rc to the callout.
 */
 static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
 struct nvmet_rdma_queue *queue)
@@ -1742,7 +1742,7 @@ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
 /*
 * This is a queue cm_id. we have registered
 * an ib_client to handle queues removal
-* so don't interfear and just return.
+* so don't interfere and just return.
 */
 return 0;
 }
@@ -1760,7 +1760,7 @@ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
 
 /*
 * We need to return 1 so that the core will destroy
-* it's own ID. What a great API design..
+* its own ID. What a great API design..
 */
 return 1;
 }

@@ -60,7 +60,8 @@ static inline int __get_task_ioprio(struct task_struct *p)
 int prio;
 
 if (!ioc)
-return IOPRIO_DEFAULT;
+return IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
+task_nice_ioprio(p));
 
 if (p != current)
 lockdep_assert_held(&p->alloc_lock);

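The __get_task_ioprio() hunk above stops handing back a flat IOPRIO_DEFAULT when a task has no io_context and instead derives the effective priority from the task's scheduling class and nice value. The userspace sketch below only illustrates the encoding involved: the shift and class constants mirror the kernel's ioprio layout, and nice_to_ioprio_level() is a simplified stand-in for task_nice_ioprio(), not the kernel helper itself.

#include <stdio.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_BE		2
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))

/* assumed mapping for the sketch: nice -20..19 onto ioprio levels 0..7 */
static int nice_to_ioprio_level(int nice)
{
	return (nice + 20) / 5;
}

int main(void)
{
	for (int nice = -20; nice <= 19; nice += 13) {
		int prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, nice_to_ioprio_level(nice));

		printf("nice %3d -> ioprio class %d level %d (0x%04x)\n",
		       nice, prio >> IOPRIO_CLASS_SHIFT,
		       prio & ((1 << IOPRIO_CLASS_SHIFT) - 1), prio);
	}
	return 0;
}
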
@@ -209,23 +209,6 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth);
 */
 int sbitmap_get(struct sbitmap *sb);
 
-/**
-* sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
-* limiting the depth used from each word.
-* @sb: Bitmap to allocate from.
-* @shallow_depth: The maximum number of bits to allocate from a single word.
-*
-* This rather specific operation allows for having multiple users with
-* different allocation limits. E.g., there can be a high-priority class that
-* uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
-* with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority
-* class can only allocate half of the total bits in the bitmap, preventing it
-* from starving out the high-priority class.
-*
-* Return: Non-negative allocated bit number if successful, -1 otherwise.
-*/
-int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth);
-
 /**
 * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
 * @sb: Bitmap to check.
@@ -478,7 +461,7 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
 * sbitmap_queue, limiting the depth used from each word, with preemption
 * already disabled.
 * @sbq: Bitmap queue to allocate from.
-* @shallow_depth: The maximum number of bits to allocate from a single word.
+* @shallow_depth: The maximum number of bits to allocate from the queue.
 * See sbitmap_get_shallow().
 *
 * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after

@@ -173,7 +173,7 @@ typedef struct mdp_superblock_s {
 #else
 #error unspecified endianness
 #endif
-__u32 recovery_cp; /* 11 recovery checkpoint sector count */
+__u32 resync_offset; /* 11 resync checkpoint sector count */
 /* There are only valid for minor_version > 90 */
 __u64 reshape_position; /* 12,13 next address in array-space for reshape */
 __u32 new_level; /* 14 new level we are reshaping to */

@@ -208,8 +208,28 @@ static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
 return nr;
 }
 
+static unsigned int __map_depth_with_shallow(const struct sbitmap *sb,
+int index,
+unsigned int shallow_depth)
+{
+u64 shallow_word_depth;
+unsigned int word_depth, reminder;
+
+word_depth = __map_depth(sb, index);
+if (shallow_depth >= sb->depth)
+return word_depth;
+
+shallow_word_depth = word_depth * shallow_depth;
+reminder = do_div(shallow_word_depth, sb->depth);
+
+if (reminder >= (index + 1) * word_depth)
+shallow_word_depth++;
+
+return (unsigned int)shallow_word_depth;
+}
+
 static int sbitmap_find_bit(struct sbitmap *sb,
-unsigned int depth,
+unsigned int shallow_depth,
 unsigned int index,
 unsigned int alloc_hint,
 bool wrap)
@@ -218,12 +238,12 @@ static int sbitmap_find_bit(struct sbitmap *sb,
 int nr = -1;
 
 for (i = 0; i < sb->map_nr; i++) {
-nr = sbitmap_find_bit_in_word(&sb->map[index],
-min_t(unsigned int,
-__map_depth(sb, index),
-depth),
-alloc_hint, wrap);
+unsigned int depth = __map_depth_with_shallow(sb, index,
+shallow_depth);
+
+if (depth)
+nr = sbitmap_find_bit_in_word(&sb->map[index], depth,
+alloc_hint, wrap);
 if (nr != -1) {
 nr += index << sb->shift;
 break;
@@ -287,7 +307,22 @@ static int __sbitmap_get_shallow(struct sbitmap *sb,
 return sbitmap_find_bit(sb, shallow_depth, index, alloc_hint, true);
 }
 
-int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
+/**
+* sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
+* limiting the depth used from each word.
+* @sb: Bitmap to allocate from.
+* @shallow_depth: The maximum number of bits to allocate from the bitmap.
+*
+* This rather specific operation allows for having multiple users with
+* different allocation limits. E.g., there can be a high-priority class that
+* uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
+* with a @shallow_depth of (sb->depth >> 1). Then, the low-priority
+* class can only allocate half of the total bits in the bitmap, preventing it
+* from starving out the high-priority class.
+*
+* Return: Non-negative allocated bit number if successful, -1 otherwise.
+*/
+static int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
 {
 int nr;
 unsigned int hint, depth;
@@ -302,7 +337,6 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
 
 return nr;
 }
-EXPORT_SYMBOL_GPL(sbitmap_get_shallow);
 
 bool sbitmap_any_bit_set(const struct sbitmap *sb)
 {
@@ -406,27 +440,9 @@ EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);
 static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq,
 unsigned int depth)
 {
-unsigned int wake_batch;
-unsigned int shallow_depth;
-
-/*
-* Each full word of the bitmap has bits_per_word bits, and there might
-* be a partial word. There are depth / bits_per_word full words and
-* depth % bits_per_word bits left over. In bitwise arithmetic:
-*
-* bits_per_word = 1 << shift
-* depth / bits_per_word = depth >> shift
-* depth % bits_per_word = depth & ((1 << shift) - 1)
-*
-* Each word can be limited to sbq->min_shallow_depth bits.
-*/
-shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth);
-depth = ((depth >> sbq->sb.shift) * shallow_depth +
-min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth));
-wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1,
-SBQ_WAKE_BATCH);
-
-return wake_batch;
+return clamp_t(unsigned int,
+min(depth, sbq->min_shallow_depth) / SBQ_WAIT_QUEUES,
+1, SBQ_WAKE_BATCH);
 }
 
 int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
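The sbitmap hunks above redefine shallow_depth as a budget for the whole bitmap rather than for each word; __map_depth_with_shallow() then splits that budget across the words proportionally and hands the rounding remainder to the lower-indexed words. Below is a standalone C sketch of the same arithmetic, using plain division instead of the kernel's do_div(); the function name and parameters are invented for the example.

#include <stdio.h>
#include <stdint.h>

/* Per-word share of a whole-bitmap shallow_depth budget (illustrative only). */
static unsigned int map_depth_with_shallow(unsigned int word_depth,
					   unsigned int index,
					   unsigned int total_depth,
					   unsigned int shallow_depth)
{
	uint64_t shallow_word_depth;
	unsigned int remainder;

	if (shallow_depth >= total_depth)
		return word_depth;

	shallow_word_depth = (uint64_t)word_depth * shallow_depth;
	remainder = (unsigned int)(shallow_word_depth % total_depth);
	shallow_word_depth /= total_depth;

	/* give the rounding remainder to the lower-indexed words */
	if (remainder >= (index + 1) * word_depth)
		shallow_word_depth++;

	return (unsigned int)shallow_word_depth;
}

int main(void)
{
	unsigned int total = 0, depth = 256, shallow = 100, words = 256 / 64;

	for (unsigned int i = 0; i < words; i++)
		total += map_depth_with_shallow(64, i, depth, shallow);
	printf("whole-bitmap budget %u distributed as %u bits\n", shallow, total);
	return 0;
}

For example, a budget of 100 over four full 64-bit words splits evenly as 25 per word, while a budget of 101 comes out as 26 + 25 + 25 + 25, so the per-word limits add back up to the requested whole-bitmap budget.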