diff --git a/docs/environment-reference.md b/docs/environment-reference.md
index 3b3f7ba..c37fbea 100644
--- a/docs/environment-reference.md
+++ b/docs/environment-reference.md
@@ -179,3 +179,8 @@ Defaults to ` run build`.
 
 Command used to start a local dev server as a part of the evaluation.
 Defaults to ` run start --port 0`.
+
+### `testCommand`
+
+Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 4 minutes.
+
diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html
index d7c7603..08369b9 100644
--- a/report-app/src/app/pages/report-viewer/report-viewer.html
+++ b/report-app/src/app/pages/report-viewer/report-viewer.html
@@ -73,6 +73,20 @@

+ @if (overview.stats.tests) { +
+

+ quiz + Tests +

+
+ +
+
+ } @if (overview.stats.runtime) {

@@ -276,9 +290,19 @@

Generated applications

Initial build failed } - @if (hasBuildFailureDuringA11yRepair(result)) { + @if (hasBuildFailureDuringTestRepair(result)) { Build failed after test repair } + + @if (finalAttempt.testResult) { + @if (finalAttempt.testResult.passed) { + @if ((result.testRepairAttempts || 0) > 0) { + Tests passed after repair + } + } @else { + Tests failed + } + }
@@ -350,12 +374,36 @@
+ @if (result.testResult) { +
+

Test Results

+
+ @if (result.testResult.passed) { + ✔ Tests passed + @if ((result.testRepairAttempts || 0) > 0) { + after {{ result.testRepairAttempts }} repair attempt(s) + } + } @else { + ✘ Tests failed + } +
+ + @if (result.testResult.output && !result.testResult.passed) { +
+ See Test Output +
{{ result.testResult.output }}
+
+ } +
+ } +

Additional info

@for (attempt of result.attemptDetails; track attempt) { @let isBuilt = attempt.buildResult.status === 'success'; @let axeViolations = attempt.serveTestingResult?.axeViolations; @let hasAxeViolations = axeViolations && axeViolations.length > 0; + @let testsFailed = attempt.testResult?.passed === false; @@ -380,6 +428,15 @@

Additional info

>A11y } + + @if (attempt.testResult) { + Tests + }
@if (expansionPanel.opened()) { @@ -416,6 +473,11 @@

A11y Violations

} + @if (testsFailed) { +

Failed Tests

+
{{ attempt.testResult?.output }}
+ } +

Generated Code

@for (file of attempt.outputFiles; track file) { diff --git a/report-app/src/app/pages/report-viewer/report-viewer.ts b/report-app/src/app/pages/report-viewer/report-viewer.ts index 08e7109..32c5cc0 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.ts +++ b/report-app/src/app/pages/report-viewer/report-viewer.ts @@ -21,6 +21,7 @@ import { LlmResponseFile, RunInfo, RunSummaryBuilds, + RunSummaryTests, RuntimeStats, ScoreBucket, SkippedIndividualAssessment, @@ -265,6 +266,31 @@ export class ReportViewer { ]; } + protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData { + return [ + { + label: 'Passed', + color: ScoreCssVariable.excellent, + value: tests.successfulInitialTests, + }, + { + label: 'Passed after repair', + color: ScoreCssVariable.great, + value: tests.successfulTestsAfterRepair, + }, + { + label: 'Failed', + color: ScoreCssVariable.poor, + value: tests.failedTests, + }, + { + label: 'No tests run', + color: ScoreCssVariable.neutral, + value: tests.noTestsRun, + }, + ]; + } + protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData { return buckets.map(b => ({ label: b.nameWithLabels, @@ -400,7 +426,7 @@ export class ReportViewer { return `wcs run --prompt=${result.promptDef.name} --env=`; } - protected hasBuildFailureDuringA11yRepair(result: AssessmentResult): boolean { - return result.attemptDetails.some(attempt => attempt.buildFailedDuringA11yRepair); + protected hasBuildFailureDuringTestRepair(result: AssessmentResult): boolean { + return result.attemptDetails.some(attempt => attempt.buildFailedDuringTestRepair); } } diff --git a/runner/configuration/constants.ts b/runner/configuration/constants.ts index 3151ec1..422a2dd 100644 --- a/runner/configuration/constants.ts +++ b/runner/configuration/constants.ts @@ -25,7 +25,13 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output'); * providing the build output and the code that causes the problem. */ // Note: When updating, also adjust the default description in `README.md`. -export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1; +export const DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS = 1; + +/** + * Number of times we'll try to ask LLM to repair test failures + * E.g. Axe violations, or test command failures + */ +export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1; /** Name of the folder where we store all generated reports */ export const REPORTS_ROOT_DIR = join(rootDir, 'reports'); diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts index a959738..085a342 100644 --- a/runner/configuration/environment-config.ts +++ b/runner/configuration/environment-config.ts @@ -73,11 +73,6 @@ export const environmentConfigSchema = z.object({ export type EnvironmentConfig = z.infer & Partial; -/** Package managers that are currently supported. */ -export function getPossiblePackageManagers() { - return ['npm', 'pnpm', 'yarn'] as const; -} - /** Asserts that the specified data is a valid environment config. */ export function assertIsEnvironmentConfig(value: unknown): asserts value is EnvironmentConfig { const validationResult = environmentConfigSchema diff --git a/runner/configuration/package-managers.ts b/runner/configuration/package-managers.ts new file mode 100644 index 0000000..6929cd2 --- /dev/null +++ b/runner/configuration/package-managers.ts @@ -0,0 +1,4 @@ +/** Package managers that are currently supported. 
*/ +export function getPossiblePackageManagers() { + return ['npm', 'pnpm', 'yarn'] as const; +} diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts index 39077ca..210fe26 100644 --- a/runner/eval-cli.ts +++ b/runner/eval-cli.ts @@ -3,7 +3,8 @@ import chalk from 'chalk'; import { BUILT_IN_ENVIRONMENTS, DEFAULT_AUTORATER_MODEL_NAME, - DEFAULT_MAX_REPAIR_ATTEMPTS, + DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS, + DEFAULT_MAX_TEST_REPAIR_ATTEMPTS, DEFAULT_MODEL_NAME, } from './configuration/constants.js'; import {generateCodeAndAssess} from './orchestration/generate.js'; @@ -37,9 +38,9 @@ interface Options { enableUserJourneyTesting?: boolean; enableAutoCsp?: boolean; autoraterModel?: string; - a11yRepairAttempts?: number; logging?: 'text-only' | 'dynamic'; skipLighthouse?: boolean; + maxTestRepairAttempts?: number; maxBuildRepairAttempts?: number; } @@ -151,11 +152,6 @@ function builder(argv: Argv): Argv { default: DEFAULT_AUTORATER_MODEL_NAME, description: 'Model to use when automatically rating generated code', }) - .option('a11y-repair-attempts', { - type: 'number', - default: 0, - description: 'Number of repair attempts for discovered a11y violations', - }) .option('skip-lighthouse', { type: 'boolean', default: false, @@ -163,9 +159,15 @@ function builder(argv: Argv): Argv { }) .option('max-build-repair-attempts', { type: 'number', - default: DEFAULT_MAX_REPAIR_ATTEMPTS, + default: DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS, description: 'Number of repair attempts when build errors are discovered', }) + .option('max-test-repair-attempts', { + type: 'number', + default: DEFAULT_MAX_TEST_REPAIR_ATTEMPTS, + description: + 'Number of repair attempts for discovered test failures (including a11y violations and ones from testCommand)', + }) .strict() .version(false) .help() @@ -209,9 +211,9 @@ async function handler(cliArgs: Arguments): Promise { logging: cliArgs.logging, autoraterModel: cliArgs.autoraterModel, skipAiSummary: cliArgs.skipAiSummary, - a11yRepairAttempts: cliArgs.a11yRepairAttempts, skipLighthouse: cliArgs.skipLighthouse, maxBuildRepairAttempts: cliArgs.maxBuildRepairAttempts, + maxTestRepairAttempts: cliArgs.maxTestRepairAttempts, }); logReportToConsole(runInfo); diff --git a/runner/orchestration/build-serve-loop.ts b/runner/orchestration/build-serve-loop.ts index f543add..957f58c 100644 --- a/runner/orchestration/build-serve-loop.ts +++ b/runner/orchestration/build-serve-loop.ts @@ -10,15 +10,21 @@ import { } from '../shared-interfaces.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {runBuild} from './build-worker.js'; -import {repairAndBuild} from './build-repair.js'; import {EvalID} from './executors/executor.js'; import {serveAndTestApp} from './serve-testing-worker.js'; +import {runTest} from './test-worker.js'; import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js'; -import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js'; +import { + DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS, + DEFAULT_MAX_TEST_REPAIR_ATTEMPTS, +} from '../configuration/constants.js'; +import {repairAndBuild} from './repair.js'; /** - * Attempts to build the code that an LLM generated. If the build fails, attempts - * to fix the breakage and build again. + * Attempts to build and test the code that an LLM generated. + * + * * If the build fails, attempts to fix the breakage and build again. + * * If tests fail (like Axe or project tests), we may repair and retry. * * @param config Assessment config. * @param evalID ID of the eval being attempted for build. 
@@ -34,7 +40,7 @@ import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js'; * @param abortSignal Signal to fire when the build should be aborted. * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls). */ -export async function attemptBuild( +export async function attemptBuildAndTest( config: AssessmentConfig, evalID: EvalID, env: Environment, @@ -59,8 +65,9 @@ export async function attemptBuild( ); let repairAttempts = 0; const maxRepairAttempts = (await env.executor.shouldRepairFailedBuilds(evalID)) - ? (config.maxBuildRepairAttempts ?? DEFAULT_MAX_REPAIR_ATTEMPTS) + ? (config.maxBuildRepairAttempts ?? DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS) : 0; + const maxTestRepairAttempts = config.maxTestRepairAttempts ?? DEFAULT_MAX_TEST_REPAIR_ATTEMPTS; const initialAttempt = { outputFiles: initialResponse.files, @@ -94,13 +101,18 @@ export async function attemptBuild( rootPromptDef, directory, lastAttempt.outputFiles, - lastAttempt.buildResult.message, - 'There are the following build errors:', + [ + { + errorContext: 'There are the following build errors:', + errorMessage: lastAttempt.buildResult.message, + }, + ], contextFiles, abortSignal, workerConcurrencyQueue, repairAttempts, progress, + 'build', ); attemptDetails.push(attempt); @@ -121,31 +133,69 @@ export async function attemptBuild( progress, userJourneyAgentTaskInput, ); + const testResult = await runTest( + env, + evalID, + directory, + rootPromptDef, + abortSignal, + workerConcurrencyQueue, + progress, + ); + + if (testResult !== null) { + lastAttempt.testResult = testResult; + } } - // Attempt to repair axe testing. This only runs when the last build - // passed and serving did run. Note: By default, we don't run axe repair + // Attempt to repair testing. This only runs when the last build + // passed and serving did run. Note: By default, we don't run repair // attempts as it's not commonly done by LLMs in the ecosystem. let axeRepairAttempts = 0; - while ( - lastAttempt.serveTestingResult && - (lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 && - axeRepairAttempts < (config.a11yRepairAttempts ?? 
0) - ) { - axeRepairAttempts++; - progress.log( - rootPromptDef, - 'build', - `Trying to repair axe accessibility violations (attempt #${axeRepairAttempts + 1})...`, - ); + let testRepairAttempts = 0; + for (let testRepairAttempt = 0; testRepairAttempt < maxTestRepairAttempts; testRepairAttempt++) { + const hasAxeFailure = + lastAttempt.serveTestingResult && lastAttempt.serveTestingResult.axeViolations?.length; + const hasTestFailure = lastAttempt.testResult && !lastAttempt.testResult.passed; + if (!hasAxeFailure && !hasTestFailure) { + break; + } - const axeViolationsError = JSON.stringify( - lastAttempt.serveTestingResult.axeViolations, - null, - 2, - ); + const attemptId = testRepairAttempt + repairAttempts + 1; - progress.log(rootPromptDef, 'error', 'Found Axe accessibility violations'); + const errors: Array<{errorContext: string; errorMessage: string}> = []; + if (hasAxeFailure) { + axeRepairAttempts++; + progress.log( + rootPromptDef, + 'build', + `Trying to repair axe accessibility violations (attempt #${attemptId})...`, + ); + const axeViolationsError = JSON.stringify( + lastAttempt.serveTestingResult!.axeViolations, + null, + 2, + ); + progress.log(rootPromptDef, 'error', 'Found Axe accessibility violations'); + errors.push({ + errorContext: + 'There are the following accessibility errors from axe accessibility violations:', + errorMessage: axeViolationsError, + }); + } + if (hasTestFailure) { + testRepairAttempts++; + progress.log( + rootPromptDef, + 'test', + `Trying to repair test failures (attempt #${attemptId})...`, + ); + + errors.push({ + errorContext: 'Application tests failed. Attempt to fix them. Test output was:', + errorMessage: lastAttempt.testResult!.output, + }); + } const attempt = await repairAndBuild( evalID, @@ -154,28 +204,28 @@ export async function attemptBuild( rootPromptDef, directory, lastAttempt.outputFiles, - axeViolationsError, - 'There are the following accessibility errors from axe accessibility violations:', + errors, contextFiles, abortSignal, workerConcurrencyQueue, - axeRepairAttempts + repairAttempts, + attemptId, progress, + 'test', ); let hasBuildFailure = attempt.buildResult.status !== BuildResultStatus.SUCCESS; - attempt.buildFailedDuringA11yRepair = hasBuildFailure; + attempt.buildFailedDuringTestRepair = hasBuildFailure; attemptDetails.push(attempt); lastAttempt = attempt; + // If we somehow introduced build errors via the repair loop, we abort + // further repairs and capture the failed build. This is useful insight + // as LLMs seem to regress when asked to repair violations. + if (hasBuildFailure) { + break; + } - // If we somehow introduced build errors via the Axe repair loop, we abort - // further a11y repairs and capture the failed build. This is useful insight - // as LLMs seem to regress when asked to repair a11y violations. - if (hasBuildFailure) break; - - // Re-run serving & tests after Axe repair. - // This allows us to check if we fixed the violations. - attempt.serveTestingResult = await serveAndTestApp( + // Re-run serving & tests after repair. 
+ lastAttempt.serveTestingResult = await serveAndTestApp( config, evalID, directory, @@ -186,10 +236,26 @@ export async function attemptBuild( progress, userJourneyAgentTaskInput, ); + const testResult = await runTest( + env, + evalID, + directory, + rootPromptDef, + abortSignal, + workerConcurrencyQueue, + progress, + ); + + if (testResult !== null) { + lastAttempt.testResult = testResult; + } - if (attempt.serveTestingResult.axeViolations?.length === 0) { + if (hasAxeFailure && lastAttempt.serveTestingResult.axeViolations?.length === 0) { progress.log(rootPromptDef, 'success', `Successfully fixed all Axe accessibility violations`); } + if (hasTestFailure && lastAttempt.testResult?.passed) { + progress.log(rootPromptDef, 'success', `Successfully fixed all test failures`); + } } return { @@ -197,6 +263,8 @@ export async function attemptBuild( serveTestingResult: lastAttempt.serveTestingResult, outputFiles: lastAttempt.outputFiles, repairAttempts, - axeRepairAttempts, + axeRepairAttempts: axeRepairAttempts, + testResult: lastAttempt.testResult, + testRepairAttempts: testRepairAttempts, }; } diff --git a/runner/orchestration/codegen.ts b/runner/orchestration/codegen.ts index bacf398..47f6cbe 100644 --- a/runner/orchestration/codegen.ts +++ b/runner/orchestration/codegen.ts @@ -9,10 +9,8 @@ import { } from '../shared-interfaces.js'; import {LlmRunner, LocalLlmGenerateFilesContext, PromptDataMessage} from '../codegen/llm-runner.js'; import {Environment} from '../configuration/environment.js'; -import {getPossiblePackageManagers} from '../configuration/environment-config.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {EvalID} from './executors/executor.js'; -import {LocalExecutor} from './executors/local-executor.js'; /** * Generates code using the configured AI model based on the provided prompt. 
@@ -94,18 +92,17 @@ export async function repairCodeWithAI( promptDef: RootPromptDefinition, directory: string, appFiles: LlmResponseFile[], - errorMessage: string, - errorContext: string, + errors: Array<{errorContext: string; errorMessage: string}>, contextFiles: LlmContextFile[], abortSignal: AbortSignal, progress: ProgressLogger, + repairType: 'build' | 'test', ): Promise { const repairSystemInstructions = env.systemPromptRepair(); const repairPrompt = [ - errorContext, - '```', - errorMessage, - '```', + ...errors.map(({errorContext, errorMessage}) => + [errorContext, '```', errorMessage, '```'].join('\n'), + ), '', 'In the following source code:', ...appFiles.map(file => `${file.filePath}:\n\`\`\`\n${file.code}\`\`\`\n\n`), @@ -118,13 +115,13 @@ export async function repairCodeWithAI( combinedPrompt: `${repairSystemInstructions}\n${repairPrompt}`, }; - progress.log(promptDef, 'codegen', 'Repairing code with AI'); + progress.log(promptDef, 'codegen', `Repairing ${repairType} failures with AI`); const response = await env.executor.generateRepairFiles( evalID, context, model, - errorMessage, + errors.map(ec => ec.errorMessage).join('\n'), appFiles, contextFiles, abortSignal, diff --git a/runner/orchestration/executors/executor.ts b/runner/orchestration/executors/executor.ts index d6a37e1..297cd89 100644 --- a/runner/orchestration/executors/executor.ts +++ b/runner/orchestration/executors/executor.ts @@ -6,6 +6,7 @@ import { LlmResponse, LlmResponseFile, RootPromptDefinition, + TestExecutionResult, } from '../../shared-interfaces.js'; import {BuildResult} from '../../workers/builder/builder-types.js'; import z from 'zod'; @@ -72,6 +73,19 @@ export const executorSchema = z.object({ ]), z.promise(z.custom()), ), + executeProjectTests: z.function( + z.tuple([ + z.custom().describe('ID of the eval'), + z.string().describe('Path to the application directory'), + z.custom().describe('Root prompt definition'), + z + .custom() + .describe('Worker concurrency queue. Use this for limiting local workers.'), + z.custom().describe('Abort Signal to fire when tests should be canceled.'), + z.custom().describe('Progress logger'), + ]), + z.promise(z.custom().nullable()), + ), finalizeEval: z.function( z.tuple([z.custom().describe('ID of the eval')]), z.promise(z.void()), diff --git a/runner/orchestration/executors/local-executor-config.ts b/runner/orchestration/executors/local-executor-config.ts index d90cfbb..ae6df7c 100644 --- a/runner/orchestration/executors/local-executor-config.ts +++ b/runner/orchestration/executors/local-executor-config.ts @@ -1,6 +1,6 @@ import z from 'zod'; import {mcpServerOptionsSchema} from '../../codegen/llm-runner.js'; -import {getPossiblePackageManagers} from '../../configuration/environment-config.js'; +import {getPossiblePackageManagers} from '../../configuration/package-managers.js'; export const localExecutorConfigSchema = z.strictObject({ /** MCP servers that can be started for this environment. */ @@ -24,6 +24,10 @@ export const localExecutorConfigSchema = z.strictObject({ * Defaults to ` run start --port 0`. */ serveCommand: z.string().optional(), + /** + * Optional command for executing project tests. + */ + testCommand: z.string().optional(), /** * Whether to skip installing dependencies when running evals in the environment. * Useful if you're managing dependencies yourself. 
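For illustration, a minimal sketch of how the new `testCommand` option might be used alongside the existing `buildCommand`/`serveCommand` options in a local executor environment config. Only the option names come from `localExecutorConfigSchema` above; the surrounding structure and the concrete npm scripts are assumptions:

```ts
// Hypothetical environment config snippet — option names are from
// localExecutorConfigSchema; the scripts themselves are placeholders.
export const executorConfig = {
  buildCommand: 'npm run build',
  serveCommand: 'npm run start --port 0',
  // Must exit with 0 on success and non-zero on failure. stdout/stderr are
  // captured and fed back to the LLM for repair attempts (4-minute timeout).
  testCommand: 'npm run test -- --watch=false',
};
```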
diff --git a/runner/orchestration/executors/local-executor.ts b/runner/orchestration/executors/local-executor.ts index 7c3dcf8..afbcccf 100644 --- a/runner/orchestration/executors/local-executor.ts +++ b/runner/orchestration/executors/local-executor.ts @@ -10,6 +10,7 @@ import { LlmResponse, LlmResponseFile, RootPromptDefinition, + TestExecutionResult, } from '../../shared-interfaces.js'; import {killChildProcessGracefully} from '../../utils/kill-gracefully.js'; import { @@ -21,7 +22,10 @@ import {serveApp} from '../../workers/serve-testing/serve-app.js'; import {generateCodeWithAI} from '../codegen.js'; import {EvalID, Executor} from './executor.js'; import {LocalExecutorConfig} from './local-executor-config.js'; -import {getPossiblePackageManagers} from '../../configuration/environment-config.js'; +import {getPossiblePackageManagers} from '../../configuration/package-managers.js'; +import {callWithTimeout} from '../../utils/timeout.js'; +import {executeCommand} from '../../utils/exec.js'; +import {cleanupBuildMessage} from '../../workers/builder/worker.js'; let uniqueIDs = 0; @@ -117,6 +121,48 @@ export class LocalExecutor implements Executor { ); } + async executeProjectTests( + _id: EvalID, + appDirectoryPath: string, + rootPromptDef: RootPromptDefinition, + workerConcurrencyQueue: PQueue, + abortSignal: AbortSignal, + progress: ProgressLogger, + ): Promise { + if (!this.config.testCommand) { + return Promise.resolve(null); + } + const testCommand = this.config.testCommand; + + let output: string; + let passed: boolean; + + try { + // Run the test command inside the temporary project directory + // Also add to the worker concurrency queue to not overload local systems. + const stdout = await workerConcurrencyQueue.add(() => + callWithTimeout( + `Testing ${rootPromptDef.name}`, + timeoutAbort => + executeCommand(testCommand, appDirectoryPath, undefined, { + abortSignal: AbortSignal.any([abortSignal, timeoutAbort]), + }), + 4, // 4min. This is a safety boundary. Lots of parallelism can slow-down. + ), + ); + output = stdout; + passed = true; + } catch (error: any) { + output = error.message; + passed = false; + } + + return { + passed, + output: cleanupBuildMessage(output), + } satisfies TestExecutionResult; + } + async serveWebApplication( _id: EvalID, appDirectoryPath: string, diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts index 9edf72b..dd076b0 100644 --- a/runner/orchestration/generate.ts +++ b/runner/orchestration/generate.ts @@ -31,7 +31,6 @@ import { } from '../shared-interfaces.js'; import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js'; import {callWithTimeout} from '../utils/timeout.js'; -import {attemptBuild} from './build-serve-loop.js'; import {createLlmResponseTokenUsageMessage} from './codegen.js'; import {generateUserJourneysForApp} from './user-journeys.js'; import {resolveContextFiles, setupProjectStructure, writeResponseFiles} from './file-system.js'; @@ -48,6 +47,7 @@ import {getRunnerByName} from '../codegen/runner-creation.js'; import {summarizeReportWithAI} from '../reporting/report-ai-summary.js'; import {LocalExecutor} from './executors/local-executor.js'; import {EvalID} from './executors/executor.js'; +import {attemptBuildAndTest} from './build-serve-loop.js'; /** * Orchestrates the entire assessment process for each prompt defined in the `prompts` array. @@ -56,7 +56,8 @@ import {EvalID} from './executors/executor.js'; * 1. Makes a request to Gemini to generate code. * 2. 
Attempts to build it in a template Angular project. * 3. If the build fails, it makes a number of "fix it" Gemini requests. - * 4. Runs other validations and computes a score for generated output. + * 4. If configured, runs unit tests and attempts to repair test failures. + * 5. Runs other validations and computes a score for generated output. * * @returns A Promise that resolves to an array of AssessmentResult objects, * each containing the prompt, generated code, and final validation status. @@ -345,7 +346,7 @@ async function startEvaluationTask( // Try to build the files in the root prompt directory. // This will also attempt to fix issues with the generated code. - const attempt = await attemptBuild( + const attempt = await attemptBuildAndTest( config, evalID, env, @@ -378,6 +379,8 @@ async function startEvaluationTask( abortSignal, progress, config.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME, + attempt.testResult ?? null, + attempt.testRepairAttempts, ); results.push({ @@ -395,6 +398,8 @@ async function startEvaluationTask( userJourneys: userJourneys, axeRepairAttempts: attempt.axeRepairAttempts, toolLogs, + testResult: attempt.testResult ?? null, + testRepairAttempts: attempt.testRepairAttempts, } satisfies AssessmentResult); } diff --git a/runner/orchestration/build-repair.ts b/runner/orchestration/repair.ts similarity index 94% rename from runner/orchestration/build-repair.ts rename to runner/orchestration/repair.ts index 5e6b9e8..c7b52ac 100644 --- a/runner/orchestration/build-repair.ts +++ b/runner/orchestration/repair.ts @@ -1,3 +1,4 @@ +import {Environment} from '../configuration/environment.js'; import PQueue from 'p-queue'; import { AttemptDetails, @@ -6,12 +7,11 @@ import { LlmResponseFile, RootPromptDefinition, } from '../shared-interfaces.js'; -import {Environment} from '../configuration/environment.js'; -import {repairCodeWithAI} from './codegen.js'; -import {writeResponseFiles} from './file-system.js'; import {runBuild} from './build-worker.js'; import {ProgressLogger} from '../progress/progress-logger.js'; -import {EvalID, Executor} from './executors/executor.js'; +import {EvalID} from './executors/executor.js'; +import {repairCodeWithAI} from './codegen.js'; +import {writeResponseFiles} from './file-system.js'; /** * Calls the LLM to repair code, handles the response, and attempts to build the project again. @@ -22,12 +22,11 @@ import {EvalID, Executor} from './executors/executor.js'; * @param directory The working directory. * @param finalOutputFiles The list of output files to be modified. * @param errorMessage The error message from the failed build. - * @param errorContext Additional context for the error. + * @param errors Additional context for the error. * @param contextFiles A list of context files for the LLM. * @param abortSignal An AbortSignal to cancel the operation. * @param workerConcurrencyQueue The queue for managing worker concurrency. * @param attempts The current attempt number. - * @param repairType The type of repair being performed. * @returns A promise that resolves to the new BuildResult. 
*/ export async function repairAndBuild( @@ -37,13 +36,13 @@ export async function repairAndBuild( rootPromptDef: RootPromptDefinition, directory: string, previousAttemptFiles: LlmResponseFile[], - errorMessage: string, - errorContext: string, + errors: Array<{errorContext: string; errorMessage: string}>, contextFiles: LlmContextFile[], abortSignal: AbortSignal, workerConcurrencyQueue: PQueue, attempts: number, progress: ProgressLogger, + repairType: 'build' | 'test', ): Promise { const repairResponse = await repairCodeWithAI( evalID, @@ -52,11 +51,11 @@ export async function repairAndBuild( rootPromptDef, directory, previousAttemptFiles, - errorMessage, - errorContext, + errors, contextFiles, abortSignal, progress, + repairType, ); return await handleRepairResponse( @@ -73,6 +72,27 @@ export async function repairAndBuild( ); } +/** + * Merges a set of new or updated files from a repair attempt into the + * current set of files. + * @param repairOutputFiles The array of new or updated files to merge. + * @param finalFiles The array of files to be updated. + */ +function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) { + // Merge the repair response into the original files. Otherwise we may end up dropping + // files that were valid in the initial response and the LLM decided not to touch, because + // they're still valid. + for (const file of repairOutputFiles) { + const existingFile = finalFiles.find(f => f.filePath === file.filePath); + + if (existingFile) { + existingFile.code = file.code; + } else { + finalFiles.push(file); + } + } +} + /** * Processes an LLM repair response by merging the suggested file changes, * writing them to disk, rebuilding the application, and logging the outcome. @@ -88,7 +108,7 @@ async function handleRepairResponse( abortSignal: AbortSignal, attempts: number, progress: ProgressLogger, -) { +): Promise { if (!repairResponse.success) { progress.log( rootPromptDef, @@ -99,7 +119,6 @@ async function handleRepairResponse( // Stop trying to repair if AI can't suggest a fix (API request fails) throw new Error(`Repair request failed: ${repairResponse.errors.join('\n')}`); } - // Clone the previous files because `mergeRepairFiles` mutates the attempt files. // We don't want to change files of a previous attempt. const newAttemptFiles = previousAttemptFiles.map(f => ({...f})); @@ -126,24 +145,3 @@ async function handleRepairResponse( attempt: attempts, }; } - -/** - * Merges a set of new or updated files from a repair attempt into the - * current set of files. - * @param repairOutputFiles The array of new or updated files to merge. - * @param finalFiles The array of files to be updated. - */ -function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) { - // Merge the repair response into the original files. Otherwise we may end up dropping - // files that were valid in the initial response and the LLM decided not to touch, because - // they're still valid. 
- for (const file of repairOutputFiles) { - const existingFile = finalFiles.find(f => f.filePath === file.filePath); - - if (existingFile) { - existingFile.code = file.code; - } else { - finalFiles.push(file); - } - } -} diff --git a/runner/orchestration/test-worker.ts b/runner/orchestration/test-worker.ts new file mode 100644 index 0000000..df08d0a --- /dev/null +++ b/runner/orchestration/test-worker.ts @@ -0,0 +1,42 @@ +import PQueue from 'p-queue'; +import {RootPromptDefinition, TestExecutionResult} from '../shared-interfaces.js'; +import {ProgressLogger} from '../progress/progress-logger.js'; +import {Environment} from '../configuration/environment.js'; +import {EvalID} from './executors/executor.js'; + +export async function runTest( + env: Environment, + evalID: EvalID, + appDirectoryPath: string, + rootPromptDef: RootPromptDefinition, + abortSignal: AbortSignal, + workerConcurrencyQueue: PQueue, + progress: ProgressLogger, +): Promise { + progress.log(rootPromptDef, 'test', `Running tests`); + + try { + const result = await env.executor.executeProjectTests( + evalID, + appDirectoryPath, + rootPromptDef, + workerConcurrencyQueue, + abortSignal, + progress, + ); + if (result === null) { + return result; + } + + if (result.passed) { + progress.log(rootPromptDef, 'success', 'Tests have passed'); + } else { + progress.log(rootPromptDef, 'error', 'Tests have failed'); + } + + return result; + } catch (err) { + progress.log(rootPromptDef, 'error', `Error when executing tests`, err + ''); + throw err; + } +} diff --git a/runner/progress/dynamic-progress-logger.ts b/runner/progress/dynamic-progress-logger.ts index 949cf96..0e68632 100644 --- a/runner/progress/dynamic-progress-logger.ts +++ b/runner/progress/dynamic-progress-logger.ts @@ -148,6 +148,7 @@ export class DynamicProgressLogger implements ProgressLogger { switch (type) { case 'success': case 'serve-testing': + case 'test': case 'build': return chalk.green; case 'error': diff --git a/runner/progress/progress-logger.ts b/runner/progress/progress-logger.ts index c888aba..b029aa6 100644 --- a/runner/progress/progress-logger.ts +++ b/runner/progress/progress-logger.ts @@ -2,7 +2,14 @@ import {greenCheckmark, redX} from '../reporting/format.js'; import {AssessmentResult, RootPromptDefinition} from '../shared-interfaces.js'; /** Possible progress event types. */ -export type ProgressType = 'codegen' | 'build' | 'serve-testing' | 'success' | 'error' | 'eval'; +export type ProgressType = + | 'codegen' + | 'build' + | 'test' + | 'serve-testing' + | 'success' + | 'error' + | 'eval'; /** Maps a ProgressType to an icon that can represent it. */ export function progressTypeToIcon(type: ProgressType): string { @@ -12,6 +19,8 @@ export function progressTypeToIcon(type: ProgressType): string { return '๐Ÿค–'; case 'build': return '๐Ÿ”จ'; + case 'test': + return '๐Ÿงช'; case 'serve-testing': return '๐ŸŒŠ'; case 'success': diff --git a/runner/ratings/built-in-ratings/successful-tests-rating.ts b/runner/ratings/built-in-ratings/successful-tests-rating.ts new file mode 100644 index 0000000..2941fd3 --- /dev/null +++ b/runner/ratings/built-in-ratings/successful-tests-rating.ts @@ -0,0 +1,28 @@ +import {PerBuildRating, RatingKind, RatingCategory, RatingState} from '../rating-types.js'; + +/** Rating which verifies that unit tests pass successfully. 
*/ +export const successfulTestsRating: PerBuildRating = { + name: 'Tests pass successfully', + description: 'Ensures tests run and pass without errors.', + id: 'common-successful-tests', + kind: RatingKind.PER_BUILD, + category: RatingCategory.MEDIUM_IMPACT, + scoreReduction: '30%', + // Reduce the amount of points in case we've had test repair attempts. + rate: ({testResult, testRepairAttempts}) => { + // If no test results are available, skip this rating + if (!testResult) { + return { + state: RatingState.SKIPPED, + message: 'Unit testing not configured.', + }; + } + + return { + state: RatingState.EXECUTED, + coefficient: testResult.passed + ? 1 / ((testRepairAttempts || 0) + 1) // Reduce score based on repair attempts + : 0, // No points if tests failed + }; + }, +}; diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts index 99d0874..c0500ec 100644 --- a/runner/ratings/rate-code.ts +++ b/runner/ratings/rate-code.ts @@ -8,6 +8,7 @@ import { IndividualAssessmentState, PromptDefinition, AssessmentCategory, + TestExecutionResult, } from '../shared-interfaces.js'; import { RatingState, @@ -56,6 +57,8 @@ export async function rateGeneratedCode( abortSignal: AbortSignal, progress: ProgressLogger, autoraterModel: string, + testResult: TestExecutionResult | null, + testRepairAttempts: number, ): Promise { let categorizedFiles: CategorizedFiles | null = null; let totalPoints = 0; @@ -93,6 +96,8 @@ export async function rateGeneratedCode( buildResult, serveTestingResult, repairAttempts, + testResult, + testRepairAttempts, outputFiles.length, axeRepairAttempts, ratingsResult, @@ -173,6 +178,8 @@ function runPerBuildRating( buildResult: BuildResult, serveResult: ServeTestingResult | null, repairAttempts: number, + testResult: TestExecutionResult | null, + testRepairAttempts: number, generatedFileCount: number, axeRepairAttempts: number, ratingsResult: RatingsResult, @@ -184,6 +191,8 @@ function runPerBuildRating( generatedFileCount, axeRepairAttempts, ratingsResult, + testResult, + testRepairAttempts, }); // If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment. 
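To make the `successfulTestsRating` coefficient concrete, a small illustrative sketch (not part of the patch) of how the `1 / (testRepairAttempts + 1)` scaling behaves:

```ts
// Illustrative only: the scaling used by successfulTestsRating above.
const testCoefficient = (passed: boolean, testRepairAttempts: number): number =>
  passed ? 1 / ((testRepairAttempts || 0) + 1) : 0;

testCoefficient(true, 0);  // 1    — tests passed on the first run
testCoefficient(true, 1);  // 0.5  — passed, but only after one repair attempt
testCoefficient(true, 2);  // ~0.33
testCoefficient(false, 1); // 0    — tests still failing yields no credit
```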
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts index fceb104..6dcbf1c 100644 --- a/runner/ratings/rating-types.ts +++ b/runner/ratings/rating-types.ts @@ -5,6 +5,7 @@ import type { LlmResponseFile, PromptDefinition, SkippedIndividualAssessment, + TestExecutionResult, Usage, } from '../shared-interfaces.js'; import {Environment} from '../configuration/environment.js'; @@ -64,6 +65,8 @@ const perBuildRatingSchema = z buildResult: z.custom(), serveResult: z.custom(), repairAttempts: z.number(), + testResult: z.custom(), + testRepairAttempts: z.number(), axeRepairAttempts: z.number(), generatedFileCount: z.number(), ratingsResult: z.record(z.custom()), diff --git a/runner/ratings/stats.ts b/runner/ratings/stats.ts index 7d94753..a97e927 100644 --- a/runner/ratings/stats.ts +++ b/runner/ratings/stats.ts @@ -25,6 +25,10 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag let successfulInitialBuilds = 0; let successfulBuildsAfterRepair = 0; let failedBuilds = 0; + let successfulInitialTests = 0; + let successfulTestsAfterRepair = 0; + let failedTests = 0; + let noTestsRun = 0; let runtimeStats: RuntimeStats | undefined; let accessibilityStats: | { @@ -59,6 +63,20 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag } } + // Calculate test statistics + if (result.testResult) { + if (result.testResult.passed) { + if ((result.testRepairAttempts || 0) === 0) { + successfulInitialTests++; + } else { + successfulTestsAfterRepair++; + } + } else { + failedTests++; + } + } else { + noTestsRun++; + } if (result.finalAttempt.serveTestingResult?.runtimeErrors != undefined) { runtimeStats ??= {appsWithErrors: 0, appsWithoutErrors: 0}; if (result.finalAttempt.serveTestingResult.runtimeErrors.trim() != '') { @@ -124,6 +142,12 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag failedBuilds, errorDistribution: Object.keys(errorDistribution).length > 0 ? errorDistribution : undefined, }, + tests: { + successfulInitialTests, + successfulTestsAfterRepair, + failedTests, + noTestsRun, + }, buckets, runtime: runtimeStats ? { diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts index e28c4b8..586cb32 100644 --- a/runner/shared-interfaces.ts +++ b/runner/shared-interfaces.ts @@ -27,8 +27,8 @@ export interface AssessmentConfig { enableAutoCsp?: boolean; logging?: 'text-only' | 'dynamic'; autoraterModel?: string; - a11yRepairAttempts?: number; skipLighthouse?: boolean; + maxTestRepairAttempts?: number; maxBuildRepairAttempts?: number; } @@ -248,8 +248,12 @@ export interface AttemptDetails { // Note: May not be set in older reports. reasoning?: string; - /** Whether the build failed during an accessibility repair attempt. */ - buildFailedDuringA11yRepair?: boolean; + /** Whether the build failed during an test repair attempt (a11y or unit). */ + buildFailedDuringTestRepair?: boolean; + /** Result of running tests for this attempt. */ + testResult?: TestExecutionResult; + /** The number of repair attempts made for tests in this attempt. */ + testRepairAttempts?: number; } /** Statistics related to the build process of the generated applications. */ @@ -264,6 +268,18 @@ export interface RunSummaryBuilds { errorDistribution?: Partial>; } +/** Statistics related to the test process of the generated applications. */ +export interface RunSummaryTests { + /** The number of applications that had tests run and all tests passed on the first attempt. 
*/ + successfulInitialTests: number; + /** The number of applications that had tests run and all tests passed after repair attempts. */ + successfulTestsAfterRepair: number; + /** The number of applications that had tests run but tests failed even after repair attempts. */ + failedTests: number; + /** The number of applications that did not have tests run (no test command configured). */ + noTestsRun: number; +} + /** Buckets into which scores can be categorized. */ export interface ScoreBucket { /** Plain name of the bucket, e.g. "Good" */ @@ -298,6 +314,8 @@ export interface AggregatedRunStats { buckets: ScoreBucket[]; /** Runtime stats. Not present for reports that didn't request runtime error collection. */ runtime?: RuntimeStats; + /** Test stats. Not present for reports that didn't run tests or older reports. */ + tests?: RunSummaryTests; accessibility?: { appsWithErrors: number; @@ -476,6 +494,10 @@ export interface AssessmentResult { axeRepairAttempts: number; /** Tool requests logs (e.g. MCP requests and responses). */ toolLogs?: ToolLogEntry[]; + /** Result of running unit tests. */ + testResult: TestExecutionResult | null; + /** Number of repair attempts for tests. */ + testRepairAttempts?: number; } /** @@ -565,3 +587,9 @@ export interface LlmGenerateFilesRequest { /** Directory in which the generation will occur. */ directory: string; } + +/** Result of running tests. */ +export interface TestExecutionResult { + passed: boolean; + output: string; +}
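As a rough sketch of how these pieces connect: a failing `TestExecutionResult` is turned into the error entry that `repairAndBuild`/`repairCodeWithAI` receive. The `errorContext` string is the one used in `build-serve-loop.ts`; the test output value below is invented:

```ts
import {TestExecutionResult} from '../shared-interfaces.js';

// Illustrative values only — the output string is made up.
const testResult: TestExecutionResult = {
  passed: false,
  output: 'FAIL src/app/app.spec.ts — expected 3 todo items, received 2',
};

// Shape handed to repairAndBuild()/repairCodeWithAI() when tests fail,
// mirroring the wiring shown in build-serve-loop.ts above.
const errors = [
  {
    errorContext: 'Application tests failed. Attempt to fix them. Test output was:',
    errorMessage: testResult.output,
  },
];
```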