Skip to content

Commit 6218863

Browse files
shroffni authored and Christoph Hellwig committed
nvme-multipath: introduce delayed removal of the multipath head node
Currently, the multipath head node of an NVMe disk is removed immediately as soon as all paths of the disk are removed. However, this can cause issues in scenarios such as:
- Disk hot-removal followed by re-addition.
- Transient PCIe link failures that trigger re-enumeration, temporarily removing and then restoring the disk.

In these cases, removing the head node prematurely may lead to a head disk node name change upon re-addition, requiring applications to reopen their handles if they were performing I/O during the failure.

To address this, introduce a delayed removal mechanism for the head disk node. During a transient failure, instead of removing the head disk node immediately, the system waits for a configurable timeout, allowing the disk to recover.

During a transient disk failure, if an application sends any I/O, it is queued instead of being failed immediately. If the disk comes back online within the timeout, the queued I/Os are resubmitted to the disk, ensuring seamless operation. If the disk cannot recover from the failure, the queued I/Os are failed and the application receives the error.

This way, if the disk comes back online within the configured period, the head node remains unchanged, ensuring uninterrupted workloads without requiring applications to reopen device handles.

A new sysfs attribute named "delayed_removal_secs" is added under the head disk blkdev for users who wish to configure the time for the delayed removal of the head disk node. The default value of this attribute is zero seconds, ensuring no behavior change unless explicitly configured.
Link: https://lore.kernel.org/linux-nvme/[email protected]/
Link: https://lore.kernel.org/linux-nvme/[email protected]/
Suggested-by: Keith Busch <[email protected]>
Suggested-by: Christoph Hellwig <[email protected]>
[nilay: reworked based on the original idea/POC from Christoph and Keith]
Reviewed-by: Hannes Reinecke <[email protected]>
Signed-off-by: Nilay Shroff <[email protected]>
Signed-off-by: Christoph Hellwig <[email protected]>
1 parent 414a4c9 commit 6218863

File tree

4 files changed

+147
-14
lines changed

4 files changed

+147
-14
lines changed

drivers/nvme/host/core.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3743,7 +3743,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
37433743
*/
37443744
if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
37453745
continue;
3746-
if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
3746+
if (nvme_tryget_ns_head(h))
37473747
return h;
37483748
}
37493749

@@ -3987,7 +3987,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
39873987
}
39883988
} else {
39893989
ret = -EINVAL;
3990-
if (!info->is_shared || !head->shared) {
3990+
if ((!info->is_shared || !head->shared) &&
3991+
!list_empty(&head->list)) {
39913992
dev_err(ctrl->device,
39923993
"Duplicate unshared namespace %d\n",
39933994
info->nsid);
@@ -4191,7 +4192,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
41914192
mutex_lock(&ns->ctrl->subsys->lock);
41924193
list_del_rcu(&ns->siblings);
41934194
if (list_empty(&ns->head->list)) {
4194-
list_del_init(&ns->head->entry);
4195+
if (!nvme_mpath_queue_if_no_path(ns->head))
4196+
list_del_init(&ns->head->entry);
41954197
last_path = true;
41964198
}
41974199
mutex_unlock(&ns->ctrl->subsys->lock);

drivers/nvme/host/multipath.c

Lines changed: 120 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -442,7 +442,17 @@ static bool nvme_available_path(struct nvme_ns_head *head)
442442
break;
443443
}
444444
}
445-
return false;
445+
446+
/*
447+
* If "head->delayed_removal_secs" is configured (i.e., non-zero), do
448+
* not immediately fail I/O. Instead, requeue the I/O for the configured
449+
* duration, anticipating that if there's a transient link failure then
450+
* it may recover within this time window. This parameter is exported to
451+
* userspace via sysfs, and its default value is zero. It is internally
452+
* mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
453+
* non-zero, this flag is set to true. When zero, the flag is cleared.
454+
*/
455+
return nvme_mpath_queue_if_no_path(head);
446456
}
447457

448458
static void nvme_ns_head_submit_bio(struct bio *bio)
@@ -617,6 +627,40 @@ static void nvme_requeue_work(struct work_struct *work)
617627
}
618628
}
619629

630+
static void nvme_remove_head(struct nvme_ns_head *head)
631+
{
632+
if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
633+
/*
634+
* requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
635+
* to allow multipath to fail all I/O.
636+
*/
637+
kblockd_schedule_work(&head->requeue_work);
638+
639+
nvme_cdev_del(&head->cdev, &head->cdev_device);
640+
synchronize_srcu(&head->srcu);
641+
del_gendisk(head->disk);
642+
nvme_put_ns_head(head);
643+
}
644+
}
645+
646+
static void nvme_remove_head_work(struct work_struct *work)
647+
{
648+
struct nvme_ns_head *head = container_of(to_delayed_work(work),
649+
struct nvme_ns_head, remove_work);
650+
bool shutdown = false;
651+
652+
mutex_lock(&head->subsys->lock);
653+
if (list_empty(&head->list)) {
654+
list_del_init(&head->entry);
655+
shutdown = true;
656+
}
657+
mutex_unlock(&head->subsys->lock);
658+
if (shutdown)
659+
nvme_remove_head(head);
660+
661+
module_put(THIS_MODULE);
662+
}
663+
620664
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
621665
{
622666
struct queue_limits lim;
@@ -626,6 +670,8 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
626670
spin_lock_init(&head->requeue_lock);
627671
INIT_WORK(&head->requeue_work, nvme_requeue_work);
628672
INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
673+
INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
674+
head->delayed_removal_secs = 0;
629675

630676
/*
631677
* Add a multipath node if the subsystems supports multiple controllers.
@@ -659,6 +705,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
659705
set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
660706
sprintf(head->disk->disk_name, "nvme%dn%d",
661707
ctrl->subsys->instance, head->instance);
708+
nvme_tryget_ns_head(head);
662709
return 0;
663710
}
664711

@@ -1015,6 +1062,49 @@ static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr
10151062
}
10161063
DEVICE_ATTR_RO(numa_nodes);
10171064

1065+
static ssize_t delayed_removal_secs_show(struct device *dev,
1066+
struct device_attribute *attr, char *buf)
1067+
{
1068+
struct gendisk *disk = dev_to_disk(dev);
1069+
struct nvme_ns_head *head = disk->private_data;
1070+
int ret;
1071+
1072+
mutex_lock(&head->subsys->lock);
1073+
ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
1074+
mutex_unlock(&head->subsys->lock);
1075+
return ret;
1076+
}
1077+
1078+
static ssize_t delayed_removal_secs_store(struct device *dev,
1079+
struct device_attribute *attr, const char *buf, size_t count)
1080+
{
1081+
struct gendisk *disk = dev_to_disk(dev);
1082+
struct nvme_ns_head *head = disk->private_data;
1083+
unsigned int sec;
1084+
int ret;
1085+
1086+
ret = kstrtouint(buf, 0, &sec);
1087+
if (ret < 0)
1088+
return ret;
1089+
1090+
mutex_lock(&head->subsys->lock);
1091+
head->delayed_removal_secs = sec;
1092+
if (sec)
1093+
set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
1094+
else
1095+
clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
1096+
mutex_unlock(&head->subsys->lock);
1097+
/*
1098+
* Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
1099+
* by its reader.
1100+
*/
1101+
synchronize_srcu(&head->srcu);
1102+
1103+
return count;
1104+
}
1105+
1106+
DEVICE_ATTR_RW(delayed_removal_secs);
1107+
10181108
static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
10191109
struct nvme_ana_group_desc *desc, void *data)
10201110
{
@@ -1138,18 +1228,38 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
11381228

11391229
void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
11401230
{
1141-
if (!head->disk)
1142-
return;
1143-
if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
1144-
nvme_cdev_del(&head->cdev, &head->cdev_device);
1231+
bool shutdown = false;
1232+
1233+
mutex_lock(&head->subsys->lock);
1234+
/*
1235+
* We are called when all paths have been removed, and at that point
1236+
* head->list is expected to be empty. However, nvme_remove_ns() and
1237+
* nvme_init_ns_head() can run concurrently and so if head->delayed_
1238+
* removal_secs is configured, it is possible that by the time we reach
1239+
* this point, head->list may no longer be empty. Therefore, we recheck
1240+
* head->list here. If it is no longer empty then we skip enqueuing the
1241+
* delayed head removal work.
1242+
*/
1243+
if (!list_empty(&head->list))
1244+
goto out;
1245+
1246+
if (head->delayed_removal_secs) {
11451247
/*
1146-
* requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
1147-
* to allow multipath to fail all I/O.
1248+
* Ensure that no one could remove this module while the head
1249+
* remove work is pending.
11481250
*/
1149-
synchronize_srcu(&head->srcu);
1150-
kblockd_schedule_work(&head->requeue_work);
1151-
del_gendisk(head->disk);
1251+
if (!try_module_get(THIS_MODULE))
1252+
goto out;
1253+
queue_delayed_work(nvme_wq, &head->remove_work,
1254+
head->delayed_removal_secs * HZ);
1255+
} else {
1256+
list_del_init(&head->entry);
1257+
shutdown = true;
11521258
}
1259+
out:
1260+
mutex_unlock(&head->subsys->lock);
1261+
if (shutdown)
1262+
nvme_remove_head(head);
11531263
}
11541264

11551265
void nvme_mpath_remove_disk(struct nvme_ns_head *head)

drivers/nvme/host/nvme.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,10 @@ struct nvme_ns_head {
506506
struct work_struct partition_scan_work;
507507
struct mutex lock;
508508
unsigned long flags;
509-
#define NVME_NSHEAD_DISK_LIVE 0
509+
struct delayed_work remove_work;
510+
unsigned int delayed_removal_secs;
511+
#define NVME_NSHEAD_DISK_LIVE 0
512+
#define NVME_NSHEAD_QUEUE_IF_NO_PATH 1
510513
struct nvme_ns __rcu *current_path[];
511514
#endif
512515
};
@@ -989,12 +992,19 @@ extern struct device_attribute dev_attr_ana_grpid;
989992
extern struct device_attribute dev_attr_ana_state;
990993
extern struct device_attribute dev_attr_queue_depth;
991994
extern struct device_attribute dev_attr_numa_nodes;
995+
extern struct device_attribute dev_attr_delayed_removal_secs;
992996
extern struct device_attribute subsys_attr_iopolicy;
993997

994998
static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
995999
{
9961000
return disk->fops == &nvme_ns_head_ops;
9971001
}
1002+
static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
1003+
{
1004+
if (test_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags))
1005+
return true;
1006+
return false;
1007+
}
9981008
#else
9991009
#define multipath false
10001010
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
@@ -1082,6 +1092,10 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
10821092
{
10831093
return false;
10841094
}
1095+
static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
1096+
{
1097+
return false;
1098+
}
10851099
#endif /* CONFIG_NVME_MULTIPATH */
10861100

10871101
int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],

drivers/nvme/host/sysfs.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ static struct attribute *nvme_ns_attrs[] = {
260260
&dev_attr_ana_state.attr,
261261
&dev_attr_queue_depth.attr,
262262
&dev_attr_numa_nodes.attr,
263+
&dev_attr_delayed_removal_secs.attr,
263264
#endif
264265
&dev_attr_io_passthru_err_log_enabled.attr,
265266
NULL,
@@ -296,6 +297,12 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
296297
if (nvme_disk_is_ns_head(dev_to_disk(dev)))
297298
return 0;
298299
}
300+
if (a == &dev_attr_delayed_removal_secs.attr) {
301+
struct gendisk *disk = dev_to_disk(dev);
302+
303+
if (!nvme_disk_is_ns_head(disk))
304+
return 0;
305+
}
299306
#endif
300307
return a->mode;
301308
}

0 commit comments

Comments
 (0)