Skip to content

Commit f195fc0

Browse files
Ranjan Kumar and martinkpetersen
authored and committed
scsi: mpi3mr: Synchronous access b/w reset and tm thread for reply queue
If the task management thread processes the reply queues while the reset thread is resetting them, the task management thread reads the invalid queue ID (0xFFFF) that the reset thread has set. That ID refers to unallocated memory, so the access causes a crash. Add a flag, 'io_admin_reset_sync', to synchronize access between the reset, I/O, and admin threads. Before a reset, the reset handler sets this flag to block the I/O and admin processing threads. If a thread gets past the initial flag check and is still processing replies, the reset thread waits up to 10 seconds for that processing to finish. If the wait exceeds 10 seconds, the controller is marked as unrecoverable. Signed-off-by: Sumit Saxena <[email protected]> Signed-off-by: Ranjan Kumar <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Martin K. Petersen <[email protected]>
1 parent 339a7b3 commit f195fc0

File tree

2 files changed

+66
-3
lines changed

2 files changed

+66
-3
lines changed

drivers/scsi/mpi3mr/mpi3mr.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1100,6 +1100,7 @@ struct scmd_priv {
11001100
* @ts_update_interval: Timestamp update interval
11011101
* @reset_in_progress: Reset in progress flag
11021102
* @unrecoverable: Controller unrecoverable flag
1103+
* @io_admin_reset_sync: Set while a reset is in progress to stop I/O and admin reply queue processing
11031104
* @prev_reset_result: Result of previous reset
11041105
* @reset_mutex: Controller reset mutex
11051106
* @reset_waitq: Controller reset wait queue
@@ -1292,6 +1293,7 @@ struct mpi3mr_ioc {
12921293
u16 ts_update_interval;
12931294
u8 reset_in_progress;
12941295
u8 unrecoverable;
1296+
u8 io_admin_reset_sync;
12951297
int prev_reset_result;
12961298
struct mutex reset_mutex;
12971299
wait_queue_head_t reset_waitq;

drivers/scsi/mpi3mr/mpi3mr_fw.c

Lines changed: 64 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ static void mpi3mr_process_factsdata(struct mpi3mr_ioc *mrioc,
1717
struct mpi3_ioc_facts_data *facts_data);
1818
static void mpi3mr_pel_wait_complete(struct mpi3mr_ioc *mrioc,
1919
struct mpi3mr_drv_cmd *drv_cmd);
20-
20+
static int mpi3mr_check_op_admin_proc(struct mpi3mr_ioc *mrioc);
2121
static int poll_queues;
2222
module_param(poll_queues, int, 0444);
2323
MODULE_PARM_DESC(poll_queues, "Number of queues for io_uring poll mode. (Range 1 - 126)");
@@ -459,7 +459,7 @@ int mpi3mr_process_admin_reply_q(struct mpi3mr_ioc *mrioc)
459459
}
460460

461461
do {
462-
if (mrioc->unrecoverable)
462+
if (mrioc->unrecoverable || mrioc->io_admin_reset_sync)
463463
break;
464464

465465
mrioc->admin_req_ci = le16_to_cpu(reply_desc->request_queue_ci);
@@ -554,7 +554,7 @@ int mpi3mr_process_op_reply_q(struct mpi3mr_ioc *mrioc,
554554
}
555555

556556
do {
557-
if (mrioc->unrecoverable)
557+
if (mrioc->unrecoverable || mrioc->io_admin_reset_sync)
558558
break;
559559

560560
req_q_idx = le16_to_cpu(reply_desc->request_queue_id) - 1;
@@ -4411,6 +4411,7 @@ int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume)
44114411
goto out_failed_noretry;
44124412
}
44134413

4414+
mrioc->io_admin_reset_sync = 0;
44144415
if (is_resume || mrioc->block_on_pci_err) {
44154416
dprint_reset(mrioc, "setting up single ISR\n");
44164417
retval = mpi3mr_setup_isr(mrioc, 1);
@@ -5289,6 +5290,55 @@ void mpi3mr_pel_get_seqnum_complete(struct mpi3mr_ioc *mrioc,
52895290
drv_cmd->retry_count = 0;
52905291
}
52915292

5293+
/**
5294+
* mpi3mr_check_op_admin_proc -
5295+
* @mrioc: Adapter instance reference
5296+
*
5297+
* Check if any of the operation reply queues
5298+
* or the admin reply queue are currently in use.
5299+
* If any queue is in use, this function waits for
5300+
* a maximum of 10 seconds for them to become available.
5301+
*
5302+
* Return: 0 on success, non-zero on failure.
5303+
*/
5304+
static int mpi3mr_check_op_admin_proc(struct mpi3mr_ioc *mrioc)
5305+
{
5306+
5307+
u16 timeout = 10 * 10;
5308+
u16 elapsed_time = 0;
5309+
bool op_admin_in_use = false;
5310+
5311+
do {
5312+
op_admin_in_use = false;
5313+
5314+
/* Check admin_reply queue first to exit early */
5315+
if (atomic_read(&mrioc->admin_reply_q_in_use) == 1)
5316+
op_admin_in_use = true;
5317+
else {
5318+
/* Check op_reply queues */
5319+
int i;
5320+
5321+
for (i = 0; i < mrioc->num_queues; i++) {
5322+
if (atomic_read(&mrioc->op_reply_qinfo[i].in_use) == 1) {
5323+
op_admin_in_use = true;
5324+
break;
5325+
}
5326+
}
5327+
}
5328+
5329+
if (!op_admin_in_use)
5330+
break;
5331+
5332+
msleep(100);
5333+
5334+
} while (++elapsed_time < timeout);
5335+
5336+
if (op_admin_in_use)
5337+
return 1;
5338+
5339+
return 0;
5340+
}
5341+
52925342
/**
52935343
* mpi3mr_soft_reset_handler - Reset the controller
52945344
* @mrioc: Adapter instance reference
@@ -5369,6 +5419,7 @@ int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc,
53695419
mpi3mr_wait_for_host_io(mrioc, MPI3MR_RESET_HOST_IOWAIT_TIMEOUT);
53705420

53715421
mpi3mr_ioc_disable_intr(mrioc);
5422+
mrioc->io_admin_reset_sync = 1;
53725423

53735424
if (snapdump) {
53745425
mpi3mr_set_diagsave(mrioc);
@@ -5396,6 +5447,16 @@ int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc,
53965447
ioc_err(mrioc, "Failed to issue soft reset to the ioc\n");
53975448
goto out;
53985449
}
5450+
5451+
retval = mpi3mr_check_op_admin_proc(mrioc);
5452+
if (retval) {
5453+
ioc_err(mrioc, "Soft reset failed due to an Admin or I/O queue polling\n"
5454+
"thread still processing replies even after a 10 second\n"
5455+
"timeout. Marking the controller as unrecoverable!\n");
5456+
5457+
goto out;
5458+
}
5459+
53995460
if (mrioc->num_io_throttle_group !=
54005461
mrioc->facts.max_io_throttle_group) {
54015462
ioc_err(mrioc,

0 commit comments

Comments
 (0)