Skip to content

Commit 630817a

Browse files
authored
Fix Dr.CI flaky FP when GH fails to dispatch the workflow (#4998)
Fixes #4987 Dr.CI logic to detect `isInfraFlakyJob` and `isLogClassifierFailed` has a FP where it misclassifies the GH failure to dispatch the whole workflow as flaky, for example pytorch/pytorch#121317. These logic should only be applicable to workflow job, not workflow run. The way to separate them is to check the `workflowId` field where it is set to `null` whenever it is a workflow run. ### Testing Unit test + local curl command will mark them as legit failures: ``` curl --request POST \ --url "http://localhost:3000/api/drci/drci?prNumber=121317" \ --header "Authorization: TOKEN" \ --data 'repo=pytorch' ```
1 parent 199a5b6 commit 630817a

File tree

2 files changed

+29
-5
lines changed

2 files changed

+29
-5
lines changed

torchci/lib/drciUtils.ts

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,9 +368,15 @@ export async function hasSimilarFailures(
368368
export function isInfraFlakyJob(job: RecentWorkflowsData): boolean {
369369
// An infra flaky job is a failed job without any failure line and runner. It shows
370370
// up as an empty job without any logs on GitHub. The failure can only be seen via
371-
// the workflow summary tab
371+
// the workflow summary tab.
372+
//
373+
// Also having a workflow ID means that this is a workflow job, not a workflow run.
374+
// This is to prevent the case where GitHub failed to run the whole workflow, but
375+
// was allowed to go through as flaky
372376
return (
373377
job.conclusion === "failure" &&
378+
job.workflowId !== null &&
379+
job.workflowId !== undefined &&
374380
(job.failure_lines === null ||
375381
job.failure_lines === undefined ||
376382
job.failure_lines.join("") === "") &&
@@ -383,6 +389,12 @@ export function isInfraFlakyJob(job: RecentWorkflowsData): boolean {
383389
export async function isLogClassifierFailed(
384390
job: RecentWorkflowsData
385391
): Promise<boolean> {
392+
// Having no workflow ID means that this is a workflow run, not a workflow job.
393+
// We don't want to apply the log classifier check for a workflow run
394+
if (job.workflowId === null || job.workflowId === undefined) {
395+
return false;
396+
}
397+
386398
// This covers the case when there is no log on S3 or log classifier fails to triggered
387399
const hasFailureLines =
388400
job.failure_lines !== null &&

torchci/test/drciUtils.test.ts

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ describe("Test various utils used by Dr.CI", () => {
334334
});
335335

336336
test("test isInfraFlakyJob", () => {
337+
// Not a workflow job
337338
const notInfraFlakyFailure: RecentWorkflowsData = {
338339
id: "A",
339340
name: "A",
@@ -348,8 +349,13 @@ describe("Test various utils used by Dr.CI", () => {
348349
};
349350
expect(isInfraFlakyJob(notInfraFlakyFailure)).toEqual(false);
350351

352+
// Set the workflow ID to mark this as a workflow job
353+
notInfraFlakyFailure.workflowId = "A";
354+
expect(isInfraFlakyJob(notInfraFlakyFailure)).toEqual(false);
355+
351356
const notInfraFlakyFailureAgain: RecentWorkflowsData = {
352357
id: "A",
358+
workflowId: "A",
353359
name: "A",
354360
html_url: "A",
355361
head_sha: "A",
@@ -364,6 +370,7 @@ describe("Test various utils used by Dr.CI", () => {
364370

365371
const isInfraFlakyFailure: RecentWorkflowsData = {
366372
id: "A",
373+
workflowId: "A",
367374
name: "A",
368375
html_url: "A",
369376
head_sha: "A",
@@ -381,8 +388,8 @@ describe("Test various utils used by Dr.CI", () => {
381388
const mockJobUtils = jest.spyOn(jobUtils, "hasS3Log");
382389
mockJobUtils.mockImplementation(() => Promise.resolve(true));
383390

384-
// Has log and failure lines
385-
const validFailure: RecentWorkflowsData = {
391+
// Not a workflow job
392+
const mockFailure: RecentWorkflowsData = {
386393
id: "A",
387394
name: "A",
388395
html_url: "A",
@@ -394,11 +401,16 @@ describe("Test various utils used by Dr.CI", () => {
394401
head_branch: "whatever",
395402
runnerName: "dummy",
396403
};
397-
expect(await isLogClassifierFailed(validFailure)).toEqual(false);
404+
expect(await isLogClassifierFailed(mockFailure)).toEqual(false);
405+
406+
// Has log and failure lines and is a workflow job
407+
mockFailure.workflowId = "A";
408+
expect(await isLogClassifierFailed(mockFailure)).toEqual(false);
398409

399410
// Has log but not failure lines (log classifier not triggered)
400411
const logClassifierNotTriggered: RecentWorkflowsData = {
401412
id: "A",
413+
workflowId: "A",
402414
name: "A",
403415
html_url: "A",
404416
head_sha: "A",
@@ -415,7 +427,7 @@ describe("Test various utils used by Dr.CI", () => {
415427

416428
// No S3 log
417429
mockJobUtils.mockImplementation(() => Promise.resolve(false));
418-
expect(await isLogClassifierFailed(validFailure)).toEqual(true);
430+
expect(await isLogClassifierFailed(mockFailure)).toEqual(true);
419431
});
420432

421433
test("test isExcludedFromFlakiness", () => {

0 commit comments

Comments
 (0)