diff --git a/docs/environment-reference.md b/docs/environment-reference.md
index 3b3f7ba..c37fbea 100644
--- a/docs/environment-reference.md
+++ b/docs/environment-reference.md
@@ -179,3 +179,8 @@ Defaults to ` run build`.
 
 Command used to start a local dev server as a part of the evaluation.
 Defaults to ` run start --port 0`.
+
+### `testCommand`
+
+Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 4 minutes.
+
diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html
index d7c7603..08369b9 100644
--- a/report-app/src/app/pages/report-viewer/report-viewer.html
+++ b/report-app/src/app/pages/report-viewer/report-viewer.html
@@ -73,6 +73,20 @@

+ @if (overview.stats.tests) { +
+

+ quiz + Tests +

+
+ +
+
+ } @if (overview.stats.runtime) {

@@ -276,9 +290,19 @@

Generated applications

Initial build failed } - @if (hasBuildFailureDuringA11yRepair(result)) { + @if (hasBuildFailureDuringTestRepair(result)) { Build failed after test repair } + + @if (finalAttempt.testResult) { + @if (finalAttempt.testResult.passed) { + @if ((result.testRepairAttempts || 0) > 0) { + Tests passed after repair + } + } @else { + Tests failed + } + }
@@ -350,12 +374,36 @@
+ @if (result.testResult) { +
+

Test Results

+
+ @if (result.testResult.passed) { + ✔ Tests passed + @if ((result.testRepairAttempts || 0) > 0) { + after {{ result.testRepairAttempts }} repair attempt(s) + } + } @else { + ✘ Tests failed + } +
+ + @if (result.testResult.output && !result.testResult.passed) { +
+ See Test Output +
{{ result.testResult.output }}
+
+ } +
+ } +

Additional info

@for (attempt of result.attemptDetails; track attempt) { @let isBuilt = attempt.buildResult.status === 'success'; @let axeViolations = attempt.serveTestingResult?.axeViolations; @let hasAxeViolations = axeViolations && axeViolations.length > 0; + @let testsFailed = attempt.testResult?.passed === false; @@ -380,6 +428,15 @@

Additional info

>A11y } + + @if (attempt.testResult) { + Tests + }
@if (expansionPanel.opened()) { @@ -416,6 +473,11 @@

A11y Violations

} + @if (testsFailed) { +

Failed Tests

+
{{ attempt.testResult?.output }}
+ } +

Generated Code

@for (file of attempt.outputFiles; track file) { diff --git a/report-app/src/app/pages/report-viewer/report-viewer.ts b/report-app/src/app/pages/report-viewer/report-viewer.ts index 08e7109..32c5cc0 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.ts +++ b/report-app/src/app/pages/report-viewer/report-viewer.ts @@ -21,6 +21,7 @@ import { LlmResponseFile, RunInfo, RunSummaryBuilds, + RunSummaryTests, RuntimeStats, ScoreBucket, SkippedIndividualAssessment, @@ -265,6 +266,31 @@ export class ReportViewer { ]; } + protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData { + return [ + { + label: 'Passed', + color: ScoreCssVariable.excellent, + value: tests.successfulInitialTests, + }, + { + label: 'Passed after repair', + color: ScoreCssVariable.great, + value: tests.successfulTestsAfterRepair, + }, + { + label: 'Failed', + color: ScoreCssVariable.poor, + value: tests.failedTests, + }, + { + label: 'No tests run', + color: ScoreCssVariable.neutral, + value: tests.noTestsRun, + }, + ]; + } + protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData { return buckets.map(b => ({ label: b.nameWithLabels, @@ -400,7 +426,7 @@ export class ReportViewer { return `wcs run --prompt=${result.promptDef.name} --env=`; } - protected hasBuildFailureDuringA11yRepair(result: AssessmentResult): boolean { - return result.attemptDetails.some(attempt => attempt.buildFailedDuringA11yRepair); + protected hasBuildFailureDuringTestRepair(result: AssessmentResult): boolean { + return result.attemptDetails.some(attempt => attempt.buildFailedDuringTestRepair); } } diff --git a/runner/configuration/constants.ts b/runner/configuration/constants.ts index 3151ec1..422a2dd 100644 --- a/runner/configuration/constants.ts +++ b/runner/configuration/constants.ts @@ -25,7 +25,13 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output'); * providing the build output and the code that causes the problem. */ // Note: When updating, also adjust the default description in `README.md`. -export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1; +export const DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS = 1; + +/** + * Number of times we'll try to ask LLM to repair test failures + * E.g. Axe violations, or test command failures + */ +export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1; /** Name of the folder where we store all generated reports */ export const REPORTS_ROOT_DIR = join(rootDir, 'reports'); diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts index a959738..085a342 100644 --- a/runner/configuration/environment-config.ts +++ b/runner/configuration/environment-config.ts @@ -73,11 +73,6 @@ export const environmentConfigSchema = z.object({ export type EnvironmentConfig = z.infer & Partial; -/** Package managers that are currently supported. */ -export function getPossiblePackageManagers() { - return ['npm', 'pnpm', 'yarn'] as const; -} - /** Asserts that the specified data is a valid environment config. */ export function assertIsEnvironmentConfig(value: unknown): asserts value is EnvironmentConfig { const validationResult = environmentConfigSchema diff --git a/runner/configuration/package-managers.ts b/runner/configuration/package-managers.ts new file mode 100644 index 0000000..6929cd2 --- /dev/null +++ b/runner/configuration/package-managers.ts @@ -0,0 +1,4 @@ +/** Package managers that are currently supported. 
*/ +export function getPossiblePackageManagers() { + return ['npm', 'pnpm', 'yarn'] as const; +} diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts index 39077ca..210fe26 100644 --- a/runner/eval-cli.ts +++ b/runner/eval-cli.ts @@ -3,7 +3,8 @@ import chalk from 'chalk'; import { BUILT_IN_ENVIRONMENTS, DEFAULT_AUTORATER_MODEL_NAME, - DEFAULT_MAX_REPAIR_ATTEMPTS, + DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS, + DEFAULT_MAX_TEST_REPAIR_ATTEMPTS, DEFAULT_MODEL_NAME, } from './configuration/constants.js'; import {generateCodeAndAssess} from './orchestration/generate.js'; @@ -37,9 +38,9 @@ interface Options { enableUserJourneyTesting?: boolean; enableAutoCsp?: boolean; autoraterModel?: string; - a11yRepairAttempts?: number; logging?: 'text-only' | 'dynamic'; skipLighthouse?: boolean; + maxTestRepairAttempts?: number; maxBuildRepairAttempts?: number; } @@ -151,11 +152,6 @@ function builder(argv: Argv): Argv { default: DEFAULT_AUTORATER_MODEL_NAME, description: 'Model to use when automatically rating generated code', }) - .option('a11y-repair-attempts', { - type: 'number', - default: 0, - description: 'Number of repair attempts for discovered a11y violations', - }) .option('skip-lighthouse', { type: 'boolean', default: false, @@ -163,9 +159,15 @@ function builder(argv: Argv): Argv { }) .option('max-build-repair-attempts', { type: 'number', - default: DEFAULT_MAX_REPAIR_ATTEMPTS, + default: DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS, description: 'Number of repair attempts when build errors are discovered', }) + .option('max-test-repair-attempts', { + type: 'number', + default: DEFAULT_MAX_TEST_REPAIR_ATTEMPTS, + description: + 'Number of repair attempts for discovered test failures (including a11y violations and ones from testCommand)', + }) .strict() .version(false) .help() @@ -209,9 +211,9 @@ async function handler(cliArgs: Arguments): Promise { logging: cliArgs.logging, autoraterModel: cliArgs.autoraterModel, skipAiSummary: cliArgs.skipAiSummary, - a11yRepairAttempts: cliArgs.a11yRepairAttempts, skipLighthouse: cliArgs.skipLighthouse, maxBuildRepairAttempts: cliArgs.maxBuildRepairAttempts, + maxTestRepairAttempts: cliArgs.maxTestRepairAttempts, }); logReportToConsole(runInfo); diff --git a/runner/orchestration/build-serve-loop.ts b/runner/orchestration/build-serve-loop.ts index f543add..957f58c 100644 --- a/runner/orchestration/build-serve-loop.ts +++ b/runner/orchestration/build-serve-loop.ts @@ -10,15 +10,21 @@ import { } from '../shared-interfaces.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {runBuild} from './build-worker.js'; -import {repairAndBuild} from './build-repair.js'; import {EvalID} from './executors/executor.js'; import {serveAndTestApp} from './serve-testing-worker.js'; +import {runTest} from './test-worker.js'; import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js'; -import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js'; +import { + DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS, + DEFAULT_MAX_TEST_REPAIR_ATTEMPTS, +} from '../configuration/constants.js'; +import {repairAndBuild} from './repair.js'; /** - * Attempts to build the code that an LLM generated. If the build fails, attempts - * to fix the breakage and build again. + * Attempts to build and test the code that an LLM generated. + * + * * If the build fails, attempts to fix the breakage and build again. + * * If tests fail (like Axe or project tests), we may repair and retry. * * @param config Assessment config. * @param evalID ID of the eval being attempted for build. 
@@ -34,7 +40,7 @@ import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js'; * @param abortSignal Signal to fire when the build should be aborted. * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls). */ -export async function attemptBuild( +export async function attemptBuildAndTest( config: AssessmentConfig, evalID: EvalID, env: Environment, @@ -59,8 +65,9 @@ export async function attemptBuild( ); let repairAttempts = 0; const maxRepairAttempts = (await env.executor.shouldRepairFailedBuilds(evalID)) - ? (config.maxBuildRepairAttempts ?? DEFAULT_MAX_REPAIR_ATTEMPTS) + ? (config.maxBuildRepairAttempts ?? DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS) : 0; + const maxTestRepairAttempts = config.maxTestRepairAttempts ?? DEFAULT_MAX_TEST_REPAIR_ATTEMPTS; const initialAttempt = { outputFiles: initialResponse.files, @@ -94,13 +101,18 @@ export async function attemptBuild( rootPromptDef, directory, lastAttempt.outputFiles, - lastAttempt.buildResult.message, - 'There are the following build errors:', + [ + { + errorContext: 'There are the following build errors:', + errorMessage: lastAttempt.buildResult.message, + }, + ], contextFiles, abortSignal, workerConcurrencyQueue, repairAttempts, progress, + 'build', ); attemptDetails.push(attempt); @@ -121,31 +133,69 @@ export async function attemptBuild( progress, userJourneyAgentTaskInput, ); + const testResult = await runTest( + env, + evalID, + directory, + rootPromptDef, + abortSignal, + workerConcurrencyQueue, + progress, + ); + + if (testResult !== null) { + lastAttempt.testResult = testResult; + } } - // Attempt to repair axe testing. This only runs when the last build - // passed and serving did run. Note: By default, we don't run axe repair + // Attempt to repair testing. This only runs when the last build + // passed and serving did run. Note: By default, we don't run repair // attempts as it's not commonly done by LLMs in the ecosystem. let axeRepairAttempts = 0; - while ( - lastAttempt.serveTestingResult && - (lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 && - axeRepairAttempts < (config.a11yRepairAttempts ?? 
0) - ) { - axeRepairAttempts++; - progress.log( - rootPromptDef, - 'build', - `Trying to repair axe accessibility violations (attempt #${axeRepairAttempts + 1})...`, - ); + let testRepairAttempts = 0; + for (let testRepairAttempt = 0; testRepairAttempt < maxTestRepairAttempts; testRepairAttempt++) { + const hasAxeFailure = + lastAttempt.serveTestingResult && lastAttempt.serveTestingResult.axeViolations?.length; + const hasTestFailure = lastAttempt.testResult && !lastAttempt.testResult.passed; + if (!hasAxeFailure && !hasTestFailure) { + break; + } - const axeViolationsError = JSON.stringify( - lastAttempt.serveTestingResult.axeViolations, - null, - 2, - ); + const attemptId = testRepairAttempt + repairAttempts + 1; - progress.log(rootPromptDef, 'error', 'Found Axe accessibility violations'); + const errors: Array<{errorContext: string; errorMessage: string}> = []; + if (hasAxeFailure) { + axeRepairAttempts++; + progress.log( + rootPromptDef, + 'build', + `Trying to repair axe accessibility violations (attempt #${attemptId})...`, + ); + const axeViolationsError = JSON.stringify( + lastAttempt.serveTestingResult!.axeViolations, + null, + 2, + ); + progress.log(rootPromptDef, 'error', 'Found Axe accessibility violations'); + errors.push({ + errorContext: + 'There are the following accessibility errors from axe accessibility violations:', + errorMessage: axeViolationsError, + }); + } + if (hasTestFailure) { + testRepairAttempts++; + progress.log( + rootPromptDef, + 'test', + `Trying to repair test failures (attempt #${attemptId})...`, + ); + + errors.push({ + errorContext: 'Application tests failed. Attempt to fix them. Test output was:', + errorMessage: lastAttempt.testResult!.output, + }); + } const attempt = await repairAndBuild( evalID, @@ -154,28 +204,28 @@ export async function attemptBuild( rootPromptDef, directory, lastAttempt.outputFiles, - axeViolationsError, - 'There are the following accessibility errors from axe accessibility violations:', + errors, contextFiles, abortSignal, workerConcurrencyQueue, - axeRepairAttempts + repairAttempts, + attemptId, progress, + 'test', ); let hasBuildFailure = attempt.buildResult.status !== BuildResultStatus.SUCCESS; - attempt.buildFailedDuringA11yRepair = hasBuildFailure; + attempt.buildFailedDuringTestRepair = hasBuildFailure; attemptDetails.push(attempt); lastAttempt = attempt; + // If we somehow introduced build errors via the repair loop, we abort + // further repairs and capture the failed build. This is useful insight + // as LLMs seem to regress when asked to repair violations. + if (hasBuildFailure) { + break; + } - // If we somehow introduced build errors via the Axe repair loop, we abort - // further a11y repairs and capture the failed build. This is useful insight - // as LLMs seem to regress when asked to repair a11y violations. - if (hasBuildFailure) break; - - // Re-run serving & tests after Axe repair. - // This allows us to check if we fixed the violations. - attempt.serveTestingResult = await serveAndTestApp( + // Re-run serving & tests after repair. 
+ lastAttempt.serveTestingResult = await serveAndTestApp( config, evalID, directory, @@ -186,10 +236,26 @@ export async function attemptBuild( progress, userJourneyAgentTaskInput, ); + const testResult = await runTest( + env, + evalID, + directory, + rootPromptDef, + abortSignal, + workerConcurrencyQueue, + progress, + ); + + if (testResult !== null) { + lastAttempt.testResult = testResult; + } - if (attempt.serveTestingResult.axeViolations?.length === 0) { + if (hasAxeFailure && lastAttempt.serveTestingResult.axeViolations?.length === 0) { progress.log(rootPromptDef, 'success', `Successfully fixed all Axe accessibility violations`); } + if (hasTestFailure && lastAttempt.testResult?.passed) { + progress.log(rootPromptDef, 'success', `Successfully fixed all test failures`); + } } return { @@ -197,6 +263,8 @@ export async function attemptBuild( serveTestingResult: lastAttempt.serveTestingResult, outputFiles: lastAttempt.outputFiles, repairAttempts, - axeRepairAttempts, + axeRepairAttempts: axeRepairAttempts, + testResult: lastAttempt.testResult, + testRepairAttempts: testRepairAttempts, }; } diff --git a/runner/orchestration/codegen.ts b/runner/orchestration/codegen.ts index bacf398..47f6cbe 100644 --- a/runner/orchestration/codegen.ts +++ b/runner/orchestration/codegen.ts @@ -9,10 +9,8 @@ import { } from '../shared-interfaces.js'; import {LlmRunner, LocalLlmGenerateFilesContext, PromptDataMessage} from '../codegen/llm-runner.js'; import {Environment} from '../configuration/environment.js'; -import {getPossiblePackageManagers} from '../configuration/environment-config.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {EvalID} from './executors/executor.js'; -import {LocalExecutor} from './executors/local-executor.js'; /** * Generates code using the configured AI model based on the provided prompt. 
@@ -94,18 +92,17 @@ export async function repairCodeWithAI( promptDef: RootPromptDefinition, directory: string, appFiles: LlmResponseFile[], - errorMessage: string, - errorContext: string, + errors: Array<{errorContext: string; errorMessage: string}>, contextFiles: LlmContextFile[], abortSignal: AbortSignal, progress: ProgressLogger, + repairType: 'build' | 'test', ): Promise { const repairSystemInstructions = env.systemPromptRepair(); const repairPrompt = [ - errorContext, - '```', - errorMessage, - '```', + ...errors.map(({errorContext, errorMessage}) => + [errorContext, '```', errorMessage, '```'].join('\n'), + ), '', 'In the following source code:', ...appFiles.map(file => `${file.filePath}:\n\`\`\`\n${file.code}\`\`\`\n\n`), @@ -118,13 +115,13 @@ export async function repairCodeWithAI( combinedPrompt: `${repairSystemInstructions}\n${repairPrompt}`, }; - progress.log(promptDef, 'codegen', 'Repairing code with AI'); + progress.log(promptDef, 'codegen', `Repairing ${repairType} failures with AI`); const response = await env.executor.generateRepairFiles( evalID, context, model, - errorMessage, + errors.map(ec => ec.errorMessage).join('\n'), appFiles, contextFiles, abortSignal, diff --git a/runner/orchestration/executors/executor.ts b/runner/orchestration/executors/executor.ts index d6a37e1..297cd89 100644 --- a/runner/orchestration/executors/executor.ts +++ b/runner/orchestration/executors/executor.ts @@ -6,6 +6,7 @@ import { LlmResponse, LlmResponseFile, RootPromptDefinition, + TestExecutionResult, } from '../../shared-interfaces.js'; import {BuildResult} from '../../workers/builder/builder-types.js'; import z from 'zod'; @@ -72,6 +73,19 @@ export const executorSchema = z.object({ ]), z.promise(z.custom()), ), + executeProjectTests: z.function( + z.tuple([ + z.custom().describe('ID of the eval'), + z.string().describe('Path to the application directory'), + z.custom().describe('Root prompt definition'), + z + .custom() + .describe('Worker concurrency queue. Use this for limiting local workers.'), + z.custom().describe('Abort Signal to fire when tests should be canceled.'), + z.custom().describe('Progress logger'), + ]), + z.promise(z.custom().nullable()), + ), finalizeEval: z.function( z.tuple([z.custom().describe('ID of the eval')]), z.promise(z.void()), diff --git a/runner/orchestration/executors/local-executor-config.ts b/runner/orchestration/executors/local-executor-config.ts index d90cfbb..ae6df7c 100644 --- a/runner/orchestration/executors/local-executor-config.ts +++ b/runner/orchestration/executors/local-executor-config.ts @@ -1,6 +1,6 @@ import z from 'zod'; import {mcpServerOptionsSchema} from '../../codegen/llm-runner.js'; -import {getPossiblePackageManagers} from '../../configuration/environment-config.js'; +import {getPossiblePackageManagers} from '../../configuration/package-managers.js'; export const localExecutorConfigSchema = z.strictObject({ /** MCP servers that can be started for this environment. */ @@ -24,6 +24,10 @@ export const localExecutorConfigSchema = z.strictObject({ * Defaults to ` run start --port 0`. */ serveCommand: z.string().optional(), + /** + * Optional command for executing project tests. + */ + testCommand: z.string().optional(), /** * Whether to skip installing dependencies when running evals in the environment. * Useful if you're managing dependencies yourself. 
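For illustration, a minimal sketch of how the new `testCommand` option might be used alongside the existing `buildCommand`/`serveCommand` options in a local executor environment config. Only the option names come from `localExecutorConfigSchema` above; the surrounding structure and the concrete npm scripts are assumptions:

```ts
// Hypothetical environment config snippet — option names are from
// localExecutorConfigSchema; the scripts themselves are placeholders.
export const executorConfig = {
  buildCommand: 'npm run build',
  serveCommand: 'npm run start --port 0',
  // Must exit with 0 on success and non-zero on failure. stdout/stderr are
  // captured and fed back to the LLM for repair attempts (4-minute timeout).
  testCommand: 'npm run test -- --watch=false',
};
```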
diff --git a/runner/orchestration/executors/local-executor.ts b/runner/orchestration/executors/local-executor.ts index 7c3dcf8..afbcccf 100644 --- a/runner/orchestration/executors/local-executor.ts +++ b/runner/orchestration/executors/local-executor.ts @@ -10,6 +10,7 @@ import { LlmResponse, LlmResponseFile, RootPromptDefinition, + TestExecutionResult, } from '../../shared-interfaces.js'; import {killChildProcessGracefully} from '../../utils/kill-gracefully.js'; import { @@ -21,7 +22,10 @@ import {serveApp} from '../../workers/serve-testing/serve-app.js'; import {generateCodeWithAI} from '../codegen.js'; import {EvalID, Executor} from './executor.js'; import {LocalExecutorConfig} from './local-executor-config.js'; -import {getPossiblePackageManagers} from '../../configuration/environment-config.js'; +import {getPossiblePackageManagers} from '../../configuration/package-managers.js'; +import {callWithTimeout} from '../../utils/timeout.js'; +import {executeCommand} from '../../utils/exec.js'; +import {cleanupBuildMessage} from '../../workers/builder/worker.js'; let uniqueIDs = 0; @@ -117,6 +121,48 @@ export class LocalExecutor implements Executor { ); } + async executeProjectTests( + _id: EvalID, + appDirectoryPath: string, + rootPromptDef: RootPromptDefinition, + workerConcurrencyQueue: PQueue, + abortSignal: AbortSignal, + progress: ProgressLogger, + ): Promise { + if (!this.config.testCommand) { + return Promise.resolve(null); + } + const testCommand = this.config.testCommand; + + let output: string; + let passed: boolean; + + try { + // Run the test command inside the temporary project directory + // Also add to the worker concurrency queue to not overload local systems. + const stdout = await workerConcurrencyQueue.add(() => + callWithTimeout( + `Testing ${rootPromptDef.name}`, + timeoutAbort => + executeCommand(testCommand, appDirectoryPath, undefined, { + abortSignal: AbortSignal.any([abortSignal, timeoutAbort]), + }), + 4, // 4min. This is a safety boundary. Lots of parallelism can slow-down. + ), + ); + output = stdout; + passed = true; + } catch (error: any) { + output = error.message; + passed = false; + } + + return { + passed, + output: cleanupBuildMessage(output), + } satisfies TestExecutionResult; + } + async serveWebApplication( _id: EvalID, appDirectoryPath: string, diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts index 9edf72b..dd076b0 100644 --- a/runner/orchestration/generate.ts +++ b/runner/orchestration/generate.ts @@ -31,7 +31,6 @@ import { } from '../shared-interfaces.js'; import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js'; import {callWithTimeout} from '../utils/timeout.js'; -import {attemptBuild} from './build-serve-loop.js'; import {createLlmResponseTokenUsageMessage} from './codegen.js'; import {generateUserJourneysForApp} from './user-journeys.js'; import {resolveContextFiles, setupProjectStructure, writeResponseFiles} from './file-system.js'; @@ -48,6 +47,7 @@ import {getRunnerByName} from '../codegen/runner-creation.js'; import {summarizeReportWithAI} from '../reporting/report-ai-summary.js'; import {LocalExecutor} from './executors/local-executor.js'; import {EvalID} from './executors/executor.js'; +import {attemptBuildAndTest} from './build-serve-loop.js'; /** * Orchestrates the entire assessment process for each prompt defined in the `prompts` array. @@ -56,7 +56,8 @@ import {EvalID} from './executors/executor.js'; * 1. Makes a request to Gemini to generate code. * 2. 
Attempts to build it in a template Angular project. * 3. If the build fails, it makes a number of "fix it" Gemini requests. - * 4. Runs other validations and computes a score for generated output. + * 4. If configured, runs unit tests and attempts to repair test failures. + * 5. Runs other validations and computes a score for generated output. * * @returns A Promise that resolves to an array of AssessmentResult objects, * each containing the prompt, generated code, and final validation status. @@ -345,7 +346,7 @@ async function startEvaluationTask( // Try to build the files in the root prompt directory. // This will also attempt to fix issues with the generated code. - const attempt = await attemptBuild( + const attempt = await attemptBuildAndTest( config, evalID, env, @@ -378,6 +379,8 @@ async function startEvaluationTask( abortSignal, progress, config.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME, + attempt.testResult ?? null, + attempt.testRepairAttempts, ); results.push({ @@ -395,6 +398,8 @@ async function startEvaluationTask( userJourneys: userJourneys, axeRepairAttempts: attempt.axeRepairAttempts, toolLogs, + testResult: attempt.testResult ?? null, + testRepairAttempts: attempt.testRepairAttempts, } satisfies AssessmentResult); } diff --git a/runner/orchestration/build-repair.ts b/runner/orchestration/repair.ts similarity index 94% rename from runner/orchestration/build-repair.ts rename to runner/orchestration/repair.ts index 5e6b9e8..c7b52ac 100644 --- a/runner/orchestration/build-repair.ts +++ b/runner/orchestration/repair.ts @@ -1,3 +1,4 @@ +import {Environment} from '../configuration/environment.js'; import PQueue from 'p-queue'; import { AttemptDetails, @@ -6,12 +7,11 @@ import { LlmResponseFile, RootPromptDefinition, } from '../shared-interfaces.js'; -import {Environment} from '../configuration/environment.js'; -import {repairCodeWithAI} from './codegen.js'; -import {writeResponseFiles} from './file-system.js'; import {runBuild} from './build-worker.js'; import {ProgressLogger} from '../progress/progress-logger.js'; -import {EvalID, Executor} from './executors/executor.js'; +import {EvalID} from './executors/executor.js'; +import {repairCodeWithAI} from './codegen.js'; +import {writeResponseFiles} from './file-system.js'; /** * Calls the LLM to repair code, handles the response, and attempts to build the project again. @@ -22,12 +22,11 @@ import {EvalID, Executor} from './executors/executor.js'; * @param directory The working directory. * @param finalOutputFiles The list of output files to be modified. * @param errorMessage The error message from the failed build. - * @param errorContext Additional context for the error. + * @param errors Additional context for the error. * @param contextFiles A list of context files for the LLM. * @param abortSignal An AbortSignal to cancel the operation. * @param workerConcurrencyQueue The queue for managing worker concurrency. * @param attempts The current attempt number. - * @param repairType The type of repair being performed. * @returns A promise that resolves to the new BuildResult. 
*/ export async function repairAndBuild( @@ -37,13 +36,13 @@ export async function repairAndBuild( rootPromptDef: RootPromptDefinition, directory: string, previousAttemptFiles: LlmResponseFile[], - errorMessage: string, - errorContext: string, + errors: Array<{errorContext: string; errorMessage: string}>, contextFiles: LlmContextFile[], abortSignal: AbortSignal, workerConcurrencyQueue: PQueue, attempts: number, progress: ProgressLogger, + repairType: 'build' | 'test', ): Promise { const repairResponse = await repairCodeWithAI( evalID, @@ -52,11 +51,11 @@ export async function repairAndBuild( rootPromptDef, directory, previousAttemptFiles, - errorMessage, - errorContext, + errors, contextFiles, abortSignal, progress, + repairType, ); return await handleRepairResponse( @@ -73,6 +72,27 @@ export async function repairAndBuild( ); } +/** + * Merges a set of new or updated files from a repair attempt into the + * current set of files. + * @param repairOutputFiles The array of new or updated files to merge. + * @param finalFiles The array of files to be updated. + */ +function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) { + // Merge the repair response into the original files. Otherwise we may end up dropping + // files that were valid in the initial response and the LLM decided not to touch, because + // they're still valid. + for (const file of repairOutputFiles) { + const existingFile = finalFiles.find(f => f.filePath === file.filePath); + + if (existingFile) { + existingFile.code = file.code; + } else { + finalFiles.push(file); + } + } +} + /** * Processes an LLM repair response by merging the suggested file changes, * writing them to disk, rebuilding the application, and logging the outcome. @@ -88,7 +108,7 @@ async function handleRepairResponse( abortSignal: AbortSignal, attempts: number, progress: ProgressLogger, -) { +): Promise { if (!repairResponse.success) { progress.log( rootPromptDef, @@ -99,7 +119,6 @@ async function handleRepairResponse( // Stop trying to repair if AI can't suggest a fix (API request fails) throw new Error(`Repair request failed: ${repairResponse.errors.join('\n')}`); } - // Clone the previous files because `mergeRepairFiles` mutates the attempt files. // We don't want to change files of a previous attempt. const newAttemptFiles = previousAttemptFiles.map(f => ({...f})); @@ -126,24 +145,3 @@ async function handleRepairResponse( attempt: attempts, }; } - -/** - * Merges a set of new or updated files from a repair attempt into the - * current set of files. - * @param repairOutputFiles The array of new or updated files to merge. - * @param finalFiles The array of files to be updated. - */ -function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) { - // Merge the repair response into the original files. Otherwise we may end up dropping - // files that were valid in the initial response and the LLM decided not to touch, because - // they're still valid. 
- for (const file of repairOutputFiles) { - const existingFile = finalFiles.find(f => f.filePath === file.filePath); - - if (existingFile) { - existingFile.code = file.code; - } else { - finalFiles.push(file); - } - } -} diff --git a/runner/orchestration/test-worker.ts b/runner/orchestration/test-worker.ts new file mode 100644 index 0000000..df08d0a --- /dev/null +++ b/runner/orchestration/test-worker.ts @@ -0,0 +1,42 @@ +import PQueue from 'p-queue'; +import {RootPromptDefinition, TestExecutionResult} from '../shared-interfaces.js'; +import {ProgressLogger} from '../progress/progress-logger.js'; +import {Environment} from '../configuration/environment.js'; +import {EvalID} from './executors/executor.js'; + +export async function runTest( + env: Environment, + evalID: EvalID, + appDirectoryPath: string, + rootPromptDef: RootPromptDefinition, + abortSignal: AbortSignal, + workerConcurrencyQueue: PQueue, + progress: ProgressLogger, +): Promise { + progress.log(rootPromptDef, 'test', `Running tests`); + + try { + const result = await env.executor.executeProjectTests( + evalID, + appDirectoryPath, + rootPromptDef, + workerConcurrencyQueue, + abortSignal, + progress, + ); + if (result === null) { + return result; + } + + if (result.passed) { + progress.log(rootPromptDef, 'success', 'Tests have passed'); + } else { + progress.log(rootPromptDef, 'error', 'Tests have failed'); + } + + return result; + } catch (err) { + progress.log(rootPromptDef, 'error', `Error when executing tests`, err + ''); + throw err; + } +} diff --git a/runner/progress/dynamic-progress-logger.ts b/runner/progress/dynamic-progress-logger.ts index 949cf96..0e68632 100644 --- a/runner/progress/dynamic-progress-logger.ts +++ b/runner/progress/dynamic-progress-logger.ts @@ -148,6 +148,7 @@ export class DynamicProgressLogger implements ProgressLogger { switch (type) { case 'success': case 'serve-testing': + case 'test': case 'build': return chalk.green; case 'error': diff --git a/runner/progress/progress-logger.ts b/runner/progress/progress-logger.ts index c888aba..b029aa6 100644 --- a/runner/progress/progress-logger.ts +++ b/runner/progress/progress-logger.ts @@ -2,7 +2,14 @@ import {greenCheckmark, redX} from '../reporting/format.js'; import {AssessmentResult, RootPromptDefinition} from '../shared-interfaces.js'; /** Possible progress event types. */ -export type ProgressType = 'codegen' | 'build' | 'serve-testing' | 'success' | 'error' | 'eval'; +export type ProgressType = + | 'codegen' + | 'build' + | 'test' + | 'serve-testing' + | 'success' + | 'error' + | 'eval'; /** Maps a ProgressType to an icon that can represent it. */ export function progressTypeToIcon(type: ProgressType): string { @@ -12,6 +19,8 @@ export function progressTypeToIcon(type: ProgressType): string { return '๐Ÿค–'; case 'build': return '๐Ÿ”จ'; + case 'test': + return '๐Ÿงช'; case 'serve-testing': return '๐ŸŒŠ'; case 'success': diff --git a/runner/ratings/built-in-ratings/successful-tests-rating.ts b/runner/ratings/built-in-ratings/successful-tests-rating.ts new file mode 100644 index 0000000..2941fd3 --- /dev/null +++ b/runner/ratings/built-in-ratings/successful-tests-rating.ts @@ -0,0 +1,28 @@ +import {PerBuildRating, RatingKind, RatingCategory, RatingState} from '../rating-types.js'; + +/** Rating which verifies that unit tests pass successfully. 
*/ +export const successfulTestsRating: PerBuildRating = { + name: 'Tests pass successfully', + description: 'Ensures tests run and pass without errors.', + id: 'common-successful-tests', + kind: RatingKind.PER_BUILD, + category: RatingCategory.MEDIUM_IMPACT, + scoreReduction: '30%', + // Reduce the amount of points in case we've had test repair attempts. + rate: ({testResult, testRepairAttempts}) => { + // If no test results are available, skip this rating + if (!testResult) { + return { + state: RatingState.SKIPPED, + message: 'Unit testing not configured.', + }; + } + + return { + state: RatingState.EXECUTED, + coefficient: testResult.passed + ? 1 / ((testRepairAttempts || 0) + 1) // Reduce score based on repair attempts + : 0, // No points if tests failed + }; + }, +}; diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts index 99d0874..c0500ec 100644 --- a/runner/ratings/rate-code.ts +++ b/runner/ratings/rate-code.ts @@ -8,6 +8,7 @@ import { IndividualAssessmentState, PromptDefinition, AssessmentCategory, + TestExecutionResult, } from '../shared-interfaces.js'; import { RatingState, @@ -56,6 +57,8 @@ export async function rateGeneratedCode( abortSignal: AbortSignal, progress: ProgressLogger, autoraterModel: string, + testResult: TestExecutionResult | null, + testRepairAttempts: number, ): Promise { let categorizedFiles: CategorizedFiles | null = null; let totalPoints = 0; @@ -93,6 +96,8 @@ export async function rateGeneratedCode( buildResult, serveTestingResult, repairAttempts, + testResult, + testRepairAttempts, outputFiles.length, axeRepairAttempts, ratingsResult, @@ -173,6 +178,8 @@ function runPerBuildRating( buildResult: BuildResult, serveResult: ServeTestingResult | null, repairAttempts: number, + testResult: TestExecutionResult | null, + testRepairAttempts: number, generatedFileCount: number, axeRepairAttempts: number, ratingsResult: RatingsResult, @@ -184,6 +191,8 @@ function runPerBuildRating( generatedFileCount, axeRepairAttempts, ratingsResult, + testResult, + testRepairAttempts, }); // If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment. 
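To make the `successfulTestsRating` coefficient concrete, a small illustrative sketch (not part of the patch) of how the `1 / (testRepairAttempts + 1)` scaling behaves:

```ts
// Illustrative only: the scaling used by successfulTestsRating above.
const testCoefficient = (passed: boolean, testRepairAttempts: number): number =>
  passed ? 1 / ((testRepairAttempts || 0) + 1) : 0;

testCoefficient(true, 0);  // 1    — tests passed on the first run
testCoefficient(true, 1);  // 0.5  — passed, but only after one repair attempt
testCoefficient(true, 2);  // ~0.33
testCoefficient(false, 1); // 0    — tests still failing yields no credit
```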
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts index fceb104..6dcbf1c 100644 --- a/runner/ratings/rating-types.ts +++ b/runner/ratings/rating-types.ts @@ -5,6 +5,7 @@ import type { LlmResponseFile, PromptDefinition, SkippedIndividualAssessment, + TestExecutionResult, Usage, } from '../shared-interfaces.js'; import {Environment} from '../configuration/environment.js'; @@ -64,6 +65,8 @@ const perBuildRatingSchema = z buildResult: z.custom(), serveResult: z.custom(), repairAttempts: z.number(), + testResult: z.custom(), + testRepairAttempts: z.number(), axeRepairAttempts: z.number(), generatedFileCount: z.number(), ratingsResult: z.record(z.custom()), diff --git a/runner/ratings/stats.ts b/runner/ratings/stats.ts index 7d94753..a97e927 100644 --- a/runner/ratings/stats.ts +++ b/runner/ratings/stats.ts @@ -25,6 +25,10 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag let successfulInitialBuilds = 0; let successfulBuildsAfterRepair = 0; let failedBuilds = 0; + let successfulInitialTests = 0; + let successfulTestsAfterRepair = 0; + let failedTests = 0; + let noTestsRun = 0; let runtimeStats: RuntimeStats | undefined; let accessibilityStats: | { @@ -59,6 +63,20 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag } } + // Calculate test statistics + if (result.testResult) { + if (result.testResult.passed) { + if ((result.testRepairAttempts || 0) === 0) { + successfulInitialTests++; + } else { + successfulTestsAfterRepair++; + } + } else { + failedTests++; + } + } else { + noTestsRun++; + } if (result.finalAttempt.serveTestingResult?.runtimeErrors != undefined) { runtimeStats ??= {appsWithErrors: 0, appsWithoutErrors: 0}; if (result.finalAttempt.serveTestingResult.runtimeErrors.trim() != '') { @@ -124,6 +142,12 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag failedBuilds, errorDistribution: Object.keys(errorDistribution).length > 0 ? errorDistribution : undefined, }, + tests: { + successfulInitialTests, + successfulTestsAfterRepair, + failedTests, + noTestsRun, + }, buckets, runtime: runtimeStats ? { diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts index e28c4b8..586cb32 100644 --- a/runner/shared-interfaces.ts +++ b/runner/shared-interfaces.ts @@ -27,8 +27,8 @@ export interface AssessmentConfig { enableAutoCsp?: boolean; logging?: 'text-only' | 'dynamic'; autoraterModel?: string; - a11yRepairAttempts?: number; skipLighthouse?: boolean; + maxTestRepairAttempts?: number; maxBuildRepairAttempts?: number; } @@ -248,8 +248,12 @@ export interface AttemptDetails { // Note: May not be set in older reports. reasoning?: string; - /** Whether the build failed during an accessibility repair attempt. */ - buildFailedDuringA11yRepair?: boolean; + /** Whether the build failed during an test repair attempt (a11y or unit). */ + buildFailedDuringTestRepair?: boolean; + /** Result of running tests for this attempt. */ + testResult?: TestExecutionResult; + /** The number of repair attempts made for tests in this attempt. */ + testRepairAttempts?: number; } /** Statistics related to the build process of the generated applications. */ @@ -264,6 +268,18 @@ export interface RunSummaryBuilds { errorDistribution?: Partial>; } +/** Statistics related to the test process of the generated applications. */ +export interface RunSummaryTests { + /** The number of applications that had tests run and all tests passed on the first attempt. 
*/ + successfulInitialTests: number; + /** The number of applications that had tests run and all tests passed after repair attempts. */ + successfulTestsAfterRepair: number; + /** The number of applications that had tests run but tests failed even after repair attempts. */ + failedTests: number; + /** The number of applications that did not have tests run (no test command configured). */ + noTestsRun: number; +} + /** Buckets into which scores can be categorized. */ export interface ScoreBucket { /** Plain name of the bucket, e.g. "Good" */ @@ -298,6 +314,8 @@ export interface AggregatedRunStats { buckets: ScoreBucket[]; /** Runtime stats. Not present for reports that didn't request runtime error collection. */ runtime?: RuntimeStats; + /** Test stats. Not present for reports that didn't run tests or older reports. */ + tests?: RunSummaryTests; accessibility?: { appsWithErrors: number; @@ -476,6 +494,10 @@ export interface AssessmentResult { axeRepairAttempts: number; /** Tool requests logs (e.g. MCP requests and responses). */ toolLogs?: ToolLogEntry[]; + /** Result of running unit tests. */ + testResult: TestExecutionResult | null; + /** Number of repair attempts for tests. */ + testRepairAttempts?: number; } /** @@ -565,3 +587,9 @@ export interface LlmGenerateFilesRequest { /** Directory in which the generation will occur. */ directory: string; } + +/** Result of running tests. */ +export interface TestExecutionResult { + passed: boolean; + output: string; +}
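As a rough sketch of how these pieces connect: a failing `TestExecutionResult` is turned into the error entry that `repairAndBuild`/`repairCodeWithAI` receive. The `errorContext` string is the one used in `build-serve-loop.ts`; the test output value below is invented:

```ts
import {TestExecutionResult} from '../shared-interfaces.js';

// Illustrative values only — the output string is made up.
const testResult: TestExecutionResult = {
  passed: false,
  output: 'FAIL src/app/app.spec.ts — expected 3 todo items, received 2',
};

// Shape handed to repairAndBuild()/repairCodeWithAI() when tests fail,
// mirroring the wiring shown in build-serve-loop.ts above.
const errors = [
  {
    errorContext: 'Application tests failed. Attempt to fix them. Test output was:',
    errorMessage: testResult.output,
  },
];
```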