@@ -337,6 +337,30 @@ static const struct hclge_hw_error hclge_ssu_port_based_pf_int[] = {
337337 { /* sentinel */ }
338338};
339339
340+ static const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int [] = {
341+ { .int_msk = 0 , .msg = "rocee qmm ovf: sgid invalid err" },
342+ { .int_msk = 0x4 , .msg = "rocee qmm ovf: sgid ovf err" },
343+ { .int_msk = 0x8 , .msg = "rocee qmm ovf: smac invalid err" },
344+ { .int_msk = 0xC , .msg = "rocee qmm ovf: smac ovf err" },
345+ { .int_msk = 0x10 , .msg = "rocee qmm ovf: cqc invalid err" },
346+ { .int_msk = 0x11 , .msg = "rocee qmm ovf: cqc ovf err" },
347+ { .int_msk = 0x12 , .msg = "rocee qmm ovf: cqc hopnum err" },
348+ { .int_msk = 0x13 , .msg = "rocee qmm ovf: cqc ba0 err" },
349+ { .int_msk = 0x14 , .msg = "rocee qmm ovf: srqc invalid err" },
350+ { .int_msk = 0x15 , .msg = "rocee qmm ovf: srqc ovf err" },
351+ { .int_msk = 0x16 , .msg = "rocee qmm ovf: srqc hopnum err" },
352+ { .int_msk = 0x17 , .msg = "rocee qmm ovf: srqc ba0 err" },
353+ { .int_msk = 0x18 , .msg = "rocee qmm ovf: mpt invalid err" },
354+ { .int_msk = 0x19 , .msg = "rocee qmm ovf: mpt ovf err" },
355+ { .int_msk = 0x1A , .msg = "rocee qmm ovf: mpt hopnum err" },
356+ { .int_msk = 0x1B , .msg = "rocee qmm ovf: mpt ba0 err" },
357+ { .int_msk = 0x1C , .msg = "rocee qmm ovf: qpc invalid err" },
358+ { .int_msk = 0x1D , .msg = "rocee qmm ovf: qpc ovf err" },
359+ { .int_msk = 0x1E , .msg = "rocee qmm ovf: qpc hopnum err" },
360+ { .int_msk = 0x1F , .msg = "rocee qmm ovf: qpc ba0 err" },
361+ { /* sentinel */ }
362+ };
363+
340364static void hclge_log_error (struct device * dev , char * reg ,
341365 const struct hclge_hw_error * err ,
342366 u32 err_sts )
@@ -1023,6 +1047,148 @@ static int hclge_handle_all_ras_errors(struct hclge_dev *hdev)
10231047 return ret ;
10241048}
10251049
1050+ static int hclge_log_rocee_ovf_error (struct hclge_dev * hdev )
1051+ {
1052+ struct device * dev = & hdev -> pdev -> dev ;
1053+ struct hclge_desc desc [2 ];
1054+ int ret ;
1055+
1056+ /* read overflow error status */
1057+ ret = hclge_cmd_query_error (hdev , & desc [0 ],
1058+ HCLGE_ROCEE_PF_RAS_INT_CMD ,
1059+ 0 , 0 , 0 );
1060+ if (ret ) {
1061+ dev_err (dev , "failed(%d) to query ROCEE OVF error sts\n" , ret );
1062+ return ret ;
1063+ }
1064+
1065+ /* log overflow error */
1066+ if (le32_to_cpu (desc [0 ].data [0 ]) & HCLGE_ROCEE_OVF_ERR_INT_MASK ) {
1067+ const struct hclge_hw_error * err ;
1068+ u32 err_sts ;
1069+
1070+ err = & hclge_rocee_qmm_ovf_err_int [0 ];
1071+ err_sts = HCLGE_ROCEE_OVF_ERR_TYPE_MASK &
1072+ le32_to_cpu (desc [0 ].data [0 ]);
1073+ while (err -> msg ) {
1074+ if (err -> int_msk == err_sts ) {
1075+ dev_warn (dev , "%s [error status=0x%x] found\n" ,
1076+ err -> msg ,
1077+ le32_to_cpu (desc [0 ].data [0 ]));
1078+ break ;
1079+ }
1080+ err ++ ;
1081+ }
1082+ }
1083+
1084+ if (le32_to_cpu (desc [0 ].data [1 ]) & HCLGE_ROCEE_OVF_ERR_INT_MASK ) {
1085+ dev_warn (dev , "ROCEE TSP OVF [error status=0x%x] found\n" ,
1086+ le32_to_cpu (desc [0 ].data [1 ]));
1087+ }
1088+
1089+ if (le32_to_cpu (desc [0 ].data [2 ]) & HCLGE_ROCEE_OVF_ERR_INT_MASK ) {
1090+ dev_warn (dev , "ROCEE SCC OVF [error status=0x%x] found\n" ,
1091+ le32_to_cpu (desc [0 ].data [2 ]));
1092+ }
1093+
1094+ return 0 ;
1095+ }
1096+
1097+ static int hclge_log_and_clear_rocee_ras_error (struct hclge_dev * hdev )
1098+ {
1099+ enum hnae3_reset_type reset_type = HNAE3_FUNC_RESET ;
1100+ struct hnae3_ae_dev * ae_dev = hdev -> ae_dev ;
1101+ struct device * dev = & hdev -> pdev -> dev ;
1102+ struct hclge_desc desc [2 ];
1103+ unsigned int status ;
1104+ int ret ;
1105+
1106+ /* read RAS error interrupt status */
1107+ ret = hclge_cmd_query_error (hdev , & desc [0 ],
1108+ HCLGE_QUERY_CLEAR_ROCEE_RAS_INT ,
1109+ 0 , 0 , 0 );
1110+ if (ret ) {
1111+ dev_err (dev , "failed(%d) to query ROCEE RAS INT SRC\n" , ret );
1112+ /* reset everything for now */
1113+ HCLGE_SET_DEFAULT_RESET_REQUEST (HNAE3_GLOBAL_RESET );
1114+ return ret ;
1115+ }
1116+
1117+ status = le32_to_cpu (desc [0 ].data [0 ]);
1118+
1119+ if (status & HCLGE_ROCEE_RERR_INT_MASK )
1120+ dev_warn (dev , "ROCEE RAS AXI rresp error\n" );
1121+
1122+ if (status & HCLGE_ROCEE_BERR_INT_MASK )
1123+ dev_warn (dev , "ROCEE RAS AXI bresp error\n" );
1124+
1125+ if (status & HCLGE_ROCEE_ECC_INT_MASK ) {
1126+ dev_warn (dev , "ROCEE RAS 2bit ECC error\n" );
1127+ reset_type = HNAE3_GLOBAL_RESET ;
1128+ }
1129+
1130+ if (status & HCLGE_ROCEE_OVF_INT_MASK ) {
1131+ ret = hclge_log_rocee_ovf_error (hdev );
1132+ if (ret ) {
1133+ dev_err (dev , "failed(%d) to process ovf error\n" , ret );
1134+ /* reset everything for now */
1135+ HCLGE_SET_DEFAULT_RESET_REQUEST (HNAE3_GLOBAL_RESET );
1136+ return ret ;
1137+ }
1138+ }
1139+
1140+ /* clear error status */
1141+ hclge_cmd_reuse_desc (& desc [0 ], false);
1142+ ret = hclge_cmd_send (& hdev -> hw , & desc [0 ], 1 );
1143+ if (ret ) {
1144+ dev_err (dev , "failed(%d) to clear ROCEE RAS error\n" , ret );
1145+ /* reset everything for now */
1146+ reset_type = HNAE3_GLOBAL_RESET ;
1147+ }
1148+
1149+ HCLGE_SET_DEFAULT_RESET_REQUEST (reset_type );
1150+
1151+ return ret ;
1152+ }
1153+
1154+ static int hclge_config_rocee_ras_interrupt (struct hclge_dev * hdev , bool en )
1155+ {
1156+ struct device * dev = & hdev -> pdev -> dev ;
1157+ struct hclge_desc desc ;
1158+ int ret ;
1159+
1160+ if (hdev -> pdev -> revision < 0x21 || !hnae3_dev_roce_supported (hdev ))
1161+ return 0 ;
1162+
1163+ hclge_cmd_setup_basic_desc (& desc , HCLGE_CONFIG_ROCEE_RAS_INT_EN , false);
1164+ if (en ) {
1165+ /* enable ROCEE hw error interrupts */
1166+ desc .data [0 ] = cpu_to_le32 (HCLGE_ROCEE_RAS_NFE_INT_EN );
1167+ desc .data [1 ] = cpu_to_le32 (HCLGE_ROCEE_RAS_CE_INT_EN );
1168+
1169+ hclge_log_and_clear_rocee_ras_error (hdev );
1170+ }
1171+ desc .data [2 ] = cpu_to_le32 (HCLGE_ROCEE_RAS_NFE_INT_EN_MASK );
1172+ desc .data [3 ] = cpu_to_le32 (HCLGE_ROCEE_RAS_CE_INT_EN_MASK );
1173+
1174+ ret = hclge_cmd_send (& hdev -> hw , & desc , 1 );
1175+ if (ret )
1176+ dev_err (dev , "failed(%d) to config ROCEE RAS interrupt\n" , ret );
1177+
1178+ return ret ;
1179+ }
1180+
1181+ static int hclge_handle_rocee_ras_error (struct hnae3_ae_dev * ae_dev )
1182+ {
1183+ struct hclge_dev * hdev = ae_dev -> priv ;
1184+
1185+ if (test_bit (HCLGE_STATE_RST_HANDLING , & hdev -> state ) ||
1186+ hdev -> pdev -> revision < 0x21 )
1187+ return HNAE3_NONE_RESET ;
1188+
1189+ return hclge_log_and_clear_rocee_ras_error (hdev );
1190+ }
1191+
10261192static const struct hclge_hw_blk hw_blk [] = {
10271193 {
10281194 .msk = BIT (0 ), .name = "IGU_EGU" ,
@@ -1058,6 +1224,7 @@ static const struct hclge_hw_blk hw_blk[] = {
10581224int hclge_hw_error_set_state (struct hclge_dev * hdev , bool state )
10591225{
10601226 const struct hclge_hw_blk * module = hw_blk ;
1227+ struct device * dev = & hdev -> pdev -> dev ;
10611228 int ret = 0 ;
10621229
10631230 while (module -> name ) {
@@ -1069,6 +1236,10 @@ int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state)
10691236 module ++ ;
10701237 }
10711238
1239+ ret = hclge_config_rocee_ras_interrupt (hdev , state );
1240+ if (ret )
1241+ dev_err (dev , "fail(%d) to configure ROCEE err int\n" , ret );
1242+
10721243 return ret ;
10731244}
10741245
@@ -1086,9 +1257,21 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
10861257 "HNS Non-Fatal RAS error(status=0x%x) identified\n" ,
10871258 status );
10881259 hclge_handle_all_ras_errors (hdev );
1089- return PCI_ERS_RESULT_NEED_RESET ;
1260+ } else {
1261+ if (test_bit (HCLGE_STATE_RST_HANDLING , & hdev -> state ) ||
1262+ hdev -> pdev -> revision < 0x21 )
1263+ return PCI_ERS_RESULT_RECOVERED ;
1264+ }
1265+
1266+ if (status & HCLGE_RAS_REG_ROCEE_ERR_MASK ) {
1267+ dev_warn (dev , "ROCEE uncorrected RAS error identified\n" );
1268+ hclge_handle_rocee_ras_error (ae_dev );
10901269 }
10911270
1271+ if (status & HCLGE_RAS_REG_NFE_MASK ||
1272+ status & HCLGE_RAS_REG_ROCEE_ERR_MASK )
1273+ return PCI_ERS_RESULT_NEED_RESET ;
1274+
10921275 return PCI_ERS_RESULT_RECOVERED ;
10931276}
10941277
0 commit comments