Skip to content

Commit 92baa6a

Browse files
nvme-tcp: fix premature queue removal and I/O failover
JIRA: https://issues.redhat.com/browse/RHEL-95382 This patch addresses a data corruption issue observed in nvme-tcp during testing. In an NVMe native multipath setup, when an I/O timeout occurs, all inflight I/Os are canceled almost immediately after the kernel socket is shut down. These canceled I/Os are reported as host path errors, triggering a failover that succeeds on a different path. However, at this point, the original I/O may still be outstanding in the host's network transmission path (e.g., the NIC’s TX queue). From the user-space app's perspective, the buffer associated with the I/O is considered completed, since the I/O has been acked on the different path, and the buffer may be reused for new I/O requests. Because nvme-tcp enables zero-copy by default in the transmission path, this can lead to corrupted data being sent to the original target, ultimately causing data corruption. We can reproduce this data corruption by injecting a delay on one path and triggering an I/O timeout. To prevent this issue, this change ensures that all inflight transmissions are fully completed from the host's perspective before returning from queue stop. To handle concurrent I/O timeouts from multiple namespaces under the same controller, always wait in queue stop regardless of the queue's state. This aligns with the behavior of queue stopping in other NVMe fabric transports. Fixes: 3f2304f ("nvme-tcp: add NVMe over TCP host driver") Signed-off-by: Michael Liang <[email protected]> Reviewed-by: Mohamed Khalfella <[email protected]> Reviewed-by: Randy Jennings <[email protected]> Reviewed-by: Sagi Grimberg <[email protected]> Signed-off-by: Christoph Hellwig <[email protected]> (cherry picked from commit 77e40bb) Signed-off-by: Maurizio Lombardi <[email protected]>
1 parent ab7be5c commit 92baa6a

File tree

1 file changed

+29
-2
lines changed

1 file changed

+29
-2
lines changed

drivers/nvme/host/tcp.c

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1944,7 +1944,7 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
19441944
cancel_work_sync(&queue->io_work);
19451945
}
19461946

1947-
static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1947+
static void nvme_tcp_stop_queue_nowait(struct nvme_ctrl *nctrl, int qid)
19481948
{
19491949
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
19501950
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
@@ -1963,6 +1963,31 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
19631963
mutex_unlock(&queue->queue_lock);
19641964
}
19651965

1966+
/*
 * Poll, for a bounded time, until queue @qid of @nctrl either no longer has
 * NVME_TCP_Q_ALLOCATED set or its socket reports no outstanding write memory
 * (sk_wmem_alloc_get() returns 0). If neither condition is met before the
 * budget runs out, a warning is logged and the function returns anyway.
 *
 * NOTE(review): per the commit description, the intent is to make sure
 * in-flight transmissions have completed from the host's point of view before
 * queue stop returns - confirm against sk_wmem_alloc_get() semantics.
 */
static void nvme_tcp_wait_queue(struct nvme_ctrl *nctrl, int qid)
1967+
{
1968+
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1969+
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1970+
/* Wait budget: reduced by 2 after every msleep(2), i.e. at most 50 polls. */
int timeout = 100;
1971+
1972+
while (timeout > 0) {
1973+
/*
 * The flag test comes first, so queue->sock is only dereferenced
 * when NVME_TCP_Q_ALLOCATED is set.
 */
if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags) ||
1974+
!sk_wmem_alloc_get(queue->sock->sk))
1975+
return;
1976+
msleep(2);
1977+
timeout -= 2;
1978+
}
1979+
/* Budget exhausted without the socket draining: report it and give up. */
dev_warn(nctrl->device,
1980+
"qid %d: timeout draining sock wmem allocation expired\n",
1981+
qid);
1982+
}
1983+
1984+
/*
 * Stop queue @qid of @nctrl and then wait for it: first calls
 * nvme_tcp_stop_queue_nowait(), then nvme_tcp_wait_queue() on the same queue.
 * The wait step is called unconditionally after the stop step.
 */
static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1985+
{
1986+
nvme_tcp_stop_queue_nowait(nctrl, qid);
1987+
nvme_tcp_wait_queue(nctrl, qid);
1988+
}
1989+
1990+
19661991
static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
19671992
{
19681993
write_lock_bh(&queue->sock->sk->sk_callback_lock);
@@ -2030,7 +2055,9 @@ static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
20302055
int i;
20312056

20322057
for (i = 1; i < ctrl->queue_count; i++)
2033-
nvme_tcp_stop_queue(ctrl, i);
2058+
nvme_tcp_stop_queue_nowait(ctrl, i);
2059+
for (i = 1; i < ctrl->queue_count; i++)
2060+
nvme_tcp_wait_queue(ctrl, i);
20342061
}
20352062

20362063
static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl,

0 commit comments

Comments
 (0)