Fix Dr.CI flaky FP when GH fails to dispatch the workflow (#4998)

huydhn · web-flow · commit 630817a1d519 · 2024-03-12T16:22:52.000-07:00
Fixes #4987 Dr.CI logic to detect `isInfraFlakyJob` and `isLogClassifierFailed` has a FP where it misclassifies the GH failure to dispatch the whole workflow as flaky, for example pytorch/pytorch#121317. These logic should only be applicable to workflow job, not workflow run. The way to separate them is to check the `workflowId` field where it is set to `null` whenever it is a workflow run. ### Testing Unit test + local curl command will mark them as legit failures: ``` curl --request POST \ --url "http://localhost:3000/api/drci/drci?prNumber=121317" \ --header "Authorization: TOKEN" \ --data 'repo=pytorch' ```
diff --git a/torchci/lib/drciUtils.ts b/torchci/lib/drciUtils.ts
@@ -368,9 +368,15 @@ export async function hasSimilarFailures(
 export function isInfraFlakyJob(job: RecentWorkflowsData): boolean {
   // An infra flaky job is a failed job without any failure line and runner. It shows
   // up as an empty job without any logs on GitHub. The failure can only be seen via
-  // the workflow summary tab
+  // the workflow summary tab.
+  //
+  // Also having a workflow ID means that this is a workflow job, not a workflow run.
+  // This is to prevent the case where GitHub failed to run the whole workflow, but
+  // was allowed to go through as flaky
   return (
     job.conclusion === "failure" &&
+    job.workflowId !== null &&
+    job.workflowId !== undefined &&
     (job.failure_lines === null ||
       job.failure_lines === undefined ||
       job.failure_lines.join("") === "") &&
@@ -383,6 +389,12 @@ export function isInfraFlakyJob(job: RecentWorkflowsData): boolean {
 export async function isLogClassifierFailed(
   job: RecentWorkflowsData
 ): Promise<boolean> {
+  // Having no workflow ID means that this is a workflow run, not a workflow job.
+  // We don't want to apply the log classifier check for a workflow run
+  if (job.workflowId === null || job.workflowId === undefined) {
+    return false;
+  }
+
   // This covers the case when there is no log on S3 or log classifier fails to triggered
   const hasFailureLines =
     job.failure_lines !== null &&
diff --git a/torchci/test/drciUtils.test.ts b/torchci/test/drciUtils.test.ts
@@ -334,6 +334,7 @@ describe("Test various utils used by Dr.CI", () => {
   });
 
   test("test isInfraFlakyJob", () => {
+    // Not a workflow job
     const notInfraFlakyFailure: RecentWorkflowsData = {
       id: "A",
       name: "A",
@@ -348,8 +349,13 @@ describe("Test various utils used by Dr.CI", () => {
     };
     expect(isInfraFlakyJob(notInfraFlakyFailure)).toEqual(false);
 
+    // Set the workflow ID to mark this as a workflow job
+    notInfraFlakyFailure.workflowId = "A";
+    expect(isInfraFlakyJob(notInfraFlakyFailure)).toEqual(false);
+
     const notInfraFlakyFailureAgain: RecentWorkflowsData = {
       id: "A",
+      workflowId: "A",
       name: "A",
       html_url: "A",
       head_sha: "A",
@@ -364,6 +370,7 @@ describe("Test various utils used by Dr.CI", () => {
 
     const isInfraFlakyFailure: RecentWorkflowsData = {
       id: "A",
+      workflowId: "A",
       name: "A",
       html_url: "A",
       head_sha: "A",
@@ -381,8 +388,8 @@ describe("Test various utils used by Dr.CI", () => {
     const mockJobUtils = jest.spyOn(jobUtils, "hasS3Log");
     mockJobUtils.mockImplementation(() => Promise.resolve(true));
 
-    // Has log and failure lines
-    const validFailure: RecentWorkflowsData = {
+    // Not a workflow job
+    const mockFailure: RecentWorkflowsData = {
       id: "A",
       name: "A",
       html_url: "A",
@@ -394,11 +401,16 @@ describe("Test various utils used by Dr.CI", () => {
       head_branch: "whatever",
       runnerName: "dummy",
     };
-    expect(await isLogClassifierFailed(validFailure)).toEqual(false);
+    expect(await isLogClassifierFailed(mockFailure)).toEqual(false);
+
+    // Has log and failure lines and is a workflow job
+    mockFailure.workflowId = "A";
+    expect(await isLogClassifierFailed(mockFailure)).toEqual(false);
 
     // Has log but not failure lines (log classifier not triggered)
     const logClassifierNotTriggered: RecentWorkflowsData = {
       id: "A",
+      workflowId: "A",
       name: "A",
       html_url: "A",
       head_sha: "A",
@@ -415,7 +427,7 @@ describe("Test various utils used by Dr.CI", () => {
 
     // No S3 log
     mockJobUtils.mockImplementation(() => Promise.resolve(false));
-    expect(await isLogClassifierFailed(validFailure)).toEqual(true);
+    expect(await isLogClassifierFailed(mockFailure)).toEqual(true);
   });
 
   test("test isExcludedFromFlakiness", () => {