Skip to content

Commit 083c53a

Browse files
ksinyuk authored and KobyElbaz committed
accel/habanalabs: disable device access after CPLD_SHUTDOWN
After a CPLD shutdown event the device becomes unusable. Prevent further device access once this event is received. Signed-off-by: Konstantin Sinyuk <[email protected]> Reviewed-by: Koby Elbaz <[email protected]> Signed-off-by: Koby Elbaz <[email protected]>
1 parent cade027 commit 083c53a

File tree

2 files changed

+28
-0
lines changed

2 files changed

+28
-0
lines changed

drivers/accel/habanalabs/common/device.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1630,6 +1630,11 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
16301630
from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR);
16311631
reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release;
16321632

1633+
if (hdev->cpld_shutdown) {
1634+
dev_err(hdev->dev, "Cannot reset device, cpld is shutdown! Device is NOT usable\n");
1635+
return -EIO;
1636+
}
1637+
16331638
if (!hard_reset && (hl_device_status(hdev) == HL_DEVICE_STATUS_MALFUNCTION)) {
16341639
dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n");
16351640
return 0;
@@ -2576,6 +2581,14 @@ void hl_device_fini(struct hl_device *hdev)
25762581
if (rc)
25772582
dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc);
25782583

2584+
/* Reset the H/W (if it is accessible). It will be in idle state after this returns */
2585+
if (!hdev->cpld_shutdown) {
2586+
rc = hdev->asic_funcs->hw_fini(hdev, true, false);
2587+
if (rc)
2588+
dev_err(hdev->dev,
2589+
"hw_fini failed in device fini while removing device %d\n", rc);
2590+
}
2591+
25792592
hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
25802593

25812594
/* Release kernel context */
@@ -2943,3 +2956,15 @@ void hl_handle_clk_change_event(struct hl_device *hdev, u16 event_type, u64 *eve
29432956

29442957
mutex_unlock(&clk_throttle->lock);
29452958
}
2959+
2960+
/**
 * hl_eq_cpld_shutdown_event_handle - handle a CPLD shutdown event.
 * @hdev: pointer to the habanalabs device structure.
 * @event_id: id of the received event, forwarded to hl_handle_critical_hw_err().
 * @event_mask: in/out notifier event mask; HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE
 *              is OR'ed into it.
 *
 * Reports the event as a critical H/W error, then marks the device as both
 * disabled and CPLD-shutdown. Once cpld_shutdown is set, hl_device_reset()
 * returns -EIO instead of resetting, and hl_device_fini() skips the hw_fini()
 * call it guards with this flag.
 */
void hl_eq_cpld_shutdown_event_handle(struct hl_device *hdev, u16 event_id, u64 *event_mask)
2961+
{
2962+
hl_handle_critical_hw_err(hdev, event_id, event_mask);
2963+
*event_mask |= HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
2964+
2965+
/* Avoid any new accesses to the H/W */
2966+
hdev->disabled = true;
2967+
hdev->cpld_shutdown = true;
2968+
/*
 * NOTE(review): the hl_cn_*() helpers are not visible here; presumably they
 * quiesce the CN component the same way a hard reset would - confirm.
 */
hl_cn_hard_reset_prepare(hdev);
2969+
hl_cn_stop(hdev);
2970+
}

drivers/accel/habanalabs/common/habanalabs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3386,6 +3386,7 @@ struct eq_heartbeat_debug_info {
33863386
* addresses.
33873387
* @is_in_dram_scrub: true if dram scrub operation is on going.
33883388
* @disabled: is device disabled.
3389+
* @cpld_shutdown: true if a CPLD shutdown event was received; the device is no longer usable.
33893390
* @late_init_done: is late init stage was done during initialization.
33903391
* @hwmon_initialized: is H/W monitor sensors was initialized.
33913392
* @reset_on_lockup: true if a reset should be done in case of stuck CS, false
@@ -3562,6 +3563,7 @@ struct hl_device {
35623563
u16 cpu_pci_msb_addr;
35633564
u8 is_in_dram_scrub;
35643565
u8 disabled;
3566+
u8 cpld_shutdown;
35653567
u8 late_init_done;
35663568
u8 hwmon_initialized;
35673569
u8 reset_on_lockup;
@@ -4119,6 +4121,7 @@ void hl_init_cpu_for_irq(struct hl_device *hdev);
41194121
void hl_set_irq_affinity(struct hl_device *hdev, int irq);
41204122
void hl_eq_heartbeat_event_handle(struct hl_device *hdev);
41214123
void hl_handle_clk_change_event(struct hl_device *hdev, u16 event_type, u64 *event_mask);
4124+
void hl_eq_cpld_shutdown_event_handle(struct hl_device *hdev, u16 event_id, u64 *event_mask);
41224125

41234126
#ifdef CONFIG_DEBUG_FS
41244127

0 commit comments

Comments
 (0)