Skip to content

Commit 6a7ee37

Browse files
authored
Merge pull request #12395 from wenduwan/v4.1.x_backport_pr12388
[v4.1.x] mtl/ofi: bail gracefully if completion error context is null
2 parents e39f156 + 2c8db51 commit 6a7ee37

File tree

1 file changed

+18
-10
lines changed

1 file changed

+18
-10
lines changed

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 18 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -125,8 +125,7 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
125125
opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n"
126126
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
127127
__FILE__, __LINE__, ret);
128-
fflush(stderr);
129-
exit(1);
128+
goto bail;
130129
}
131130
}
132131
}
@@ -148,25 +147,31 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
148147
* thread fetches the entry while others get -FI_EAGAIN
149148
* indicating an empty queue, which is not erroneous.
150149
*/
151-
if (ret == -FI_EAGAIN)
150+
if (ret == -FI_EAGAIN) {
152151
return count;
152+
}
153153
opal_output(0, "%s:%d: Error returned from fi_cq_readerr: %s(%zd).\n"
154154
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
155155
__FILE__, __LINE__, fi_strerror(-ret), ret);
156-
fflush(stderr);
157-
exit(1);
156+
goto bail;
157+
}
158+
159+
if (!error.op_context) {
160+
opal_output(0, "%s:%d: Error returned from fi_cq_readerr with null context. "
161+
"Completion flags: %016lx\n"
162+
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
163+
__FILE__, __LINE__, error.flags);
164+
goto bail;
158165
}
159166

160-
assert(error.op_context);
161167
ofi_req = TO_OFI_REQ(error.op_context);
162168
assert(ofi_req);
163169
ret = ofi_req->error_callback(&error, ofi_req);
164170
if (OMPI_SUCCESS != ret) {
165171
opal_output(0, "%s:%d: Error returned by request error callback: %zd.\n"
166172
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
167173
__FILE__, __LINE__, ret);
168-
fflush(stderr);
169-
exit(1);
174+
goto bail;
170175
}
171176
} else {
172177
if (ret == -FI_EAGAIN || ret == -EINTR) {
@@ -175,13 +180,16 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
175180
opal_output(0, "%s:%d: Error returned from fi_cq_read: %s(%zd).\n"
176181
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
177182
__FILE__, __LINE__, fi_strerror(-ret), ret);
178-
fflush(stderr);
179-
exit(1);
183+
goto bail;
180184
}
181185
}
182186
}
183187

184188
return count;
189+
190+
bail:
191+
fflush(stderr);
192+
exit(1);
185193
}
186194

187195
__opal_attribute_always_inline__ static inline int

0 commit comments

Comments
 (0)