Skip to content

Commit 8cc95ce

Browse files
Edwin Peerdavem330
authored andcommitted
bnxt_en: improve fw diagnose devlink health messages
Add firmware event counters as well as health state severity. In the unhealthy state, recommend a remedy and inform the user as to its impact. Readability of the devlink tool's output is negatively impacted by adding these fields to the diagnosis. The single line of text, as rendered by devlink health diagnose, benefits from more terse descriptions, which can be substituted without loss of clarity, even in pretty printed JSON mode. Signed-off-by: Edwin Peer <[email protected]> Signed-off-by: Michael Chan <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 2bb21b8 commit 8cc95ce

File tree

3 files changed

+148
-27
lines changed

3 files changed

+148
-27
lines changed

drivers/net/ethernet/broadcom/bnxt/bnxt.c

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2138,10 +2138,12 @@ static int bnxt_async_event_process(struct bnxt *bp,
21382138
set_bit(BNXT_STATE_FW_ACTIVATE_RESET, &bp->state);
21392139
} else if (EVENT_DATA1_RESET_NOTIFY_FATAL(data1)) {
21402140
type_str = "Fatal";
2141+
bp->fw_health->fatalities++;
21412142
set_bit(BNXT_STATE_FW_FATAL_COND, &bp->state);
21422143
} else if (data2 && BNXT_FW_STATUS_HEALTHY !=
21432144
EVENT_DATA2_RESET_NOTIFY_FW_STATUS_CODE(data2)) {
21442145
type_str = "Non-fatal";
2146+
bp->fw_health->survivals++;
21452147
set_bit(BNXT_STATE_FW_NON_FATAL_COND, &bp->state);
21462148
}
21472149
netif_warn(bp, hw, bp->dev,
@@ -7604,6 +7606,7 @@ static int __bnxt_alloc_fw_health(struct bnxt *bp)
76047606
if (!bp->fw_health)
76057607
return -ENOMEM;
76067608

7609+
mutex_init(&bp->fw_health->lock);
76077610
return 0;
76087611
}
76097612

@@ -7650,12 +7653,16 @@ static void bnxt_inv_fw_health_reg(struct bnxt *bp)
76507653
struct bnxt_fw_health *fw_health = bp->fw_health;
76517654
u32 reg_type;
76527655

7653-
if (!fw_health || !fw_health->status_reliable)
7656+
if (!fw_health)
76547657
return;
76557658

76567659
reg_type = BNXT_FW_HEALTH_REG_TYPE(fw_health->regs[BNXT_FW_HEALTH_REG]);
76577660
if (reg_type == BNXT_FW_HEALTH_REG_TYPE_GRC)
76587661
fw_health->status_reliable = false;
7662+
7663+
reg_type = BNXT_FW_HEALTH_REG_TYPE(fw_health->regs[BNXT_FW_RESET_CNT_REG]);
7664+
if (reg_type == BNXT_FW_HEALTH_REG_TYPE_GRC)
7665+
fw_health->resets_reliable = false;
76597666
}
76607667

76617668
static void bnxt_try_map_fw_health_reg(struct bnxt *bp)
@@ -7712,6 +7719,7 @@ static int bnxt_map_fw_health_regs(struct bnxt *bp)
77127719
int i;
77137720

77147721
bp->fw_health->status_reliable = false;
7722+
bp->fw_health->resets_reliable = false;
77157723
/* Only pre-map the monitoring GRC registers using window 3 */
77167724
for (i = 0; i < 4; i++) {
77177725
u32 reg = fw_health->regs[i];
@@ -7725,6 +7733,7 @@ static int bnxt_map_fw_health_regs(struct bnxt *bp)
77257733
fw_health->mapped_regs[i] = BNXT_FW_HEALTH_WIN_OFF(reg);
77267734
}
77277735
bp->fw_health->status_reliable = true;
7736+
bp->fw_health->resets_reliable = true;
77287737
if (reg_base == 0xffffffff)
77297738
return 0;
77307739

@@ -11264,14 +11273,18 @@ static void bnxt_fw_health_check(struct bnxt *bp)
1126411273
}
1126511274

1126611275
val = bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG);
11267-
if (val == fw_health->last_fw_heartbeat)
11276+
if (val == fw_health->last_fw_heartbeat) {
11277+
fw_health->arrests++;
1126811278
goto fw_reset;
11279+
}
1126911280

1127011281
fw_health->last_fw_heartbeat = val;
1127111282

1127211283
val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
11273-
if (val != fw_health->last_fw_reset_cnt)
11284+
if (val != fw_health->last_fw_reset_cnt) {
11285+
fw_health->discoveries++;
1127411286
goto fw_reset;
11287+
}
1127511288

1127611289
fw_health->tmr_counter = fw_health->tmr_multiplier;
1127711290
return;

drivers/net/ethernet/broadcom/bnxt/bnxt.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1523,6 +1523,21 @@ struct bnxt_ctx_mem_info {
15231523
struct bnxt_mem_init mem_init[BNXT_CTX_MEM_INIT_MAX];
15241524
};
15251525

1526+
enum bnxt_health_severity {
1527+
SEVERITY_NORMAL = 0,
1528+
SEVERITY_WARNING,
1529+
SEVERITY_RECOVERABLE,
1530+
SEVERITY_FATAL,
1531+
};
1532+
1533+
enum bnxt_health_remedy {
1534+
REMEDY_DEVLINK_RECOVER,
1535+
REMEDY_POWER_CYCLE_DEVICE,
1536+
REMEDY_POWER_CYCLE_HOST,
1537+
REMEDY_FW_UPDATE,
1538+
REMEDY_HW_REPLACE,
1539+
};
1540+
15261541
struct bnxt_fw_health {
15271542
u32 flags;
15281543
u32 polling_dsecs;
@@ -1542,6 +1557,7 @@ struct bnxt_fw_health {
15421557
u8 enabled:1;
15431558
u8 primary:1;
15441559
u8 status_reliable:1;
1560+
u8 resets_reliable:1;
15451561
u8 tmr_multiplier;
15461562
u8 tmr_counter;
15471563
u8 fw_reset_seq_cnt;
@@ -1551,6 +1567,15 @@ struct bnxt_fw_health {
15511567
u32 echo_req_data1;
15521568
u32 echo_req_data2;
15531569
struct devlink_health_reporter *fw_reporter;
1570+
/* Protects severity and remedy */
1571+
struct mutex lock;
1572+
enum bnxt_health_severity severity;
1573+
enum bnxt_health_remedy remedy;
1574+
u32 arrests;
1575+
u32 discoveries;
1576+
u32 survivals;
1577+
u32 fatalities;
1578+
u32 diagnoses;
15541579
};
15551580

15561581
#define BNXT_FW_HEALTH_REG_TYPE_MASK 3

drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c

Lines changed: 107 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -71,43 +71,110 @@ static int bnxt_hwrm_remote_dev_reset_set(struct bnxt *bp, bool remote_reset)
7171
return hwrm_req_send(bp, req);
7272
}
7373

74+
static char *bnxt_health_severity_str(enum bnxt_health_severity severity)
75+
{
76+
switch (severity) {
77+
case SEVERITY_NORMAL: return "normal";
78+
case SEVERITY_WARNING: return "warning";
79+
case SEVERITY_RECOVERABLE: return "recoverable";
80+
case SEVERITY_FATAL: return "fatal";
81+
default: return "unknown";
82+
}
83+
}
84+
85+
static char *bnxt_health_remedy_str(enum bnxt_health_remedy remedy)
86+
{
87+
switch (remedy) {
88+
case REMEDY_DEVLINK_RECOVER: return "devlink recover";
89+
case REMEDY_POWER_CYCLE_DEVICE: return "device power cycle";
90+
case REMEDY_POWER_CYCLE_HOST: return "host power cycle";
91+
case REMEDY_FW_UPDATE: return "update firmware";
92+
case REMEDY_HW_REPLACE: return "replace hardware";
93+
default: return "unknown";
94+
}
95+
}
96+
7497
static int bnxt_fw_diagnose(struct devlink_health_reporter *reporter,
7598
struct devlink_fmsg *fmsg,
7699
struct netlink_ext_ack *extack)
77100
{
78101
struct bnxt *bp = devlink_health_reporter_priv(reporter);
79-
u32 val;
102+
struct bnxt_fw_health *h = bp->fw_health;
103+
u32 fw_status, fw_resets;
80104
int rc;
81105

82106
if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state))
83-
return 0;
107+
return devlink_fmsg_string_pair_put(fmsg, "Status", "recovering");
84108

85-
val = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG);
109+
if (!h->status_reliable)
110+
return devlink_fmsg_string_pair_put(fmsg, "Status", "unknown");
86111

87-
if (BNXT_FW_IS_BOOTING(val)) {
88-
rc = devlink_fmsg_string_pair_put(fmsg, "Description",
89-
"Not yet completed initialization");
112+
mutex_lock(&h->lock);
113+
fw_status = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG);
114+
if (BNXT_FW_IS_BOOTING(fw_status)) {
115+
rc = devlink_fmsg_string_pair_put(fmsg, "Status", "initializing");
90116
if (rc)
91-
return rc;
92-
} else if (BNXT_FW_IS_ERR(val)) {
93-
rc = devlink_fmsg_string_pair_put(fmsg, "Description",
94-
"Encountered fatal error and cannot recover");
117+
goto unlock;
118+
} else if (h->severity || fw_status != BNXT_FW_STATUS_HEALTHY) {
119+
if (!h->severity) {
120+
h->severity = SEVERITY_FATAL;
121+
h->remedy = REMEDY_POWER_CYCLE_DEVICE;
122+
h->diagnoses++;
123+
devlink_health_report(h->fw_reporter,
124+
"FW error diagnosed", h);
125+
}
126+
rc = devlink_fmsg_string_pair_put(fmsg, "Status", "error");
95127
if (rc)
96-
return rc;
128+
goto unlock;
129+
rc = devlink_fmsg_u32_pair_put(fmsg, "Syndrome", fw_status);
130+
if (rc)
131+
goto unlock;
132+
} else {
133+
rc = devlink_fmsg_string_pair_put(fmsg, "Status", "healthy");
134+
if (rc)
135+
goto unlock;
97136
}
98137

99-
if (val >> 16) {
100-
rc = devlink_fmsg_u32_pair_put(fmsg, "Error code", val >> 16);
138+
rc = devlink_fmsg_string_pair_put(fmsg, "Severity",
139+
bnxt_health_severity_str(h->severity));
140+
if (rc)
141+
goto unlock;
142+
143+
if (h->severity) {
144+
rc = devlink_fmsg_string_pair_put(fmsg, "Remedy",
145+
bnxt_health_remedy_str(h->remedy));
101146
if (rc)
102-
return rc;
147+
goto unlock;
148+
if (h->remedy == REMEDY_DEVLINK_RECOVER) {
149+
rc = devlink_fmsg_string_pair_put(fmsg, "Impact",
150+
"traffic+ntuple_cfg");
151+
if (rc)
152+
goto unlock;
153+
}
103154
}
104155

105-
val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
106-
rc = devlink_fmsg_u32_pair_put(fmsg, "Reset count", val);
107-
if (rc)
156+
unlock:
157+
mutex_unlock(&h->lock);
158+
if (rc || !h->resets_reliable)
108159
return rc;
109160

110-
return 0;
161+
fw_resets = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
162+
rc = devlink_fmsg_u32_pair_put(fmsg, "Resets", fw_resets);
163+
if (rc)
164+
return rc;
165+
rc = devlink_fmsg_u32_pair_put(fmsg, "Arrests", h->arrests);
166+
if (rc)
167+
return rc;
168+
rc = devlink_fmsg_u32_pair_put(fmsg, "Survivals", h->survivals);
169+
if (rc)
170+
return rc;
171+
rc = devlink_fmsg_u32_pair_put(fmsg, "Discoveries", h->discoveries);
172+
if (rc)
173+
return rc;
174+
rc = devlink_fmsg_u32_pair_put(fmsg, "Fatalities", h->fatalities);
175+
if (rc)
176+
return rc;
177+
return devlink_fmsg_u32_pair_put(fmsg, "Diagnoses", h->diagnoses);
111178
}
112179

113180
static int bnxt_fw_recover(struct devlink_health_reporter *reporter,
@@ -116,6 +183,9 @@ static int bnxt_fw_recover(struct devlink_health_reporter *reporter,
116183
{
117184
struct bnxt *bp = devlink_health_reporter_priv(reporter);
118185

186+
if (bp->fw_health->severity == SEVERITY_FATAL)
187+
return -ENODEV;
188+
119189
set_bit(BNXT_STATE_RECOVER, &bp->state);
120190
__bnxt_fw_recover(bp);
121191

@@ -165,6 +235,7 @@ void bnxt_dl_fw_reporters_destroy(struct bnxt *bp, bool all)
165235
void bnxt_devlink_health_fw_report(struct bnxt *bp)
166236
{
167237
struct bnxt_fw_health *fw_health = bp->fw_health;
238+
int rc;
168239

169240
if (!fw_health)
170241
return;
@@ -174,20 +245,32 @@ void bnxt_devlink_health_fw_report(struct bnxt *bp)
174245
return;
175246
}
176247

177-
devlink_health_report(fw_health->fw_reporter, "FW error reported", NULL);
248+
mutex_lock(&fw_health->lock);
249+
fw_health->severity = SEVERITY_RECOVERABLE;
250+
fw_health->remedy = REMEDY_DEVLINK_RECOVER;
251+
mutex_unlock(&fw_health->lock);
252+
rc = devlink_health_report(fw_health->fw_reporter, "FW error reported",
253+
fw_health);
254+
if (rc == -ECANCELED)
255+
__bnxt_fw_recover(bp);
178256
}
179257

180258
void bnxt_dl_health_fw_status_update(struct bnxt *bp, bool healthy)
181259
{
182-
struct bnxt_fw_health *health = bp->fw_health;
260+
struct bnxt_fw_health *fw_health = bp->fw_health;
183261
u8 state;
184262

185-
if (healthy)
263+
mutex_lock(&fw_health->lock);
264+
if (healthy) {
265+
fw_health->severity = SEVERITY_NORMAL;
186266
state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
187-
else
267+
} else {
268+
fw_health->severity = SEVERITY_FATAL;
269+
fw_health->remedy = REMEDY_POWER_CYCLE_DEVICE;
188270
state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
189-
190-
devlink_health_reporter_state_update(health->fw_reporter, state);
271+
}
272+
mutex_unlock(&fw_health->lock);
273+
devlink_health_reporter_state_update(fw_health->fw_reporter, state);
191274
}
192275

193276
void bnxt_dl_health_fw_recovery_done(struct bnxt *bp)

0 commit comments

Comments
 (0)