Skip to content

Commit bc065f5

Browse files
committed
feat: add auto rating methods to executor
Adds the ability to run the visual and code automated ratings through the `Executor`.
1 parent 54c1cc4 commit bc065f5

File tree

5 files changed

+198
-88
lines changed

5 files changed

+198
-88
lines changed

runner/orchestration/executors/executor.ts

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,21 @@
11
import PQueue from 'p-queue';
2-
import {ProgressLogger} from '../../progress/progress-logger.js';
3-
import {
2+
import z from 'zod';
3+
import type {ProgressLogger} from '../../progress/progress-logger.js';
4+
import type {
45
LlmContextFile,
56
LlmGenerateFilesRequest,
67
LlmResponse,
78
LlmResponseFile,
89
RootPromptDefinition,
910
TestExecutionResult,
1011
} from '../../shared-interfaces.js';
11-
import {BuildResult} from '../../workers/builder/builder-types.js';
12-
import z from 'zod';
13-
import {ServeTestingResult} from '../../workers/serve-testing/worker-types.js';
12+
import type {BuildResult} from '../../workers/builder/builder-types.js';
13+
import type {ServeTestingResult} from '../../workers/serve-testing/worker-types.js';
14+
import type {
15+
ExecutorAutoRateResponse,
16+
ExecutorCodeAutoRateRequest,
17+
ExecutorVisualAutoRateRequest,
18+
} from '../../ratings/autoraters/auto-rate-shared.js';
1419

1520
export type EvalID = string & {__evalID: true};
1621

@@ -124,6 +129,28 @@ export const executorSchema = z.object({
124129
}),
125130
),
126131
),
132+
autoRateCode: z
133+
.function(
134+
z.tuple([
135+
z.custom<ExecutorCodeAutoRateRequest>().describe('Context for the automated code rating'),
136+
z
137+
.custom<AbortSignal>()
138+
.describe('Abort Signal to fire when the request should be canceled.'),
139+
]),
140+
z.promise(z.custom<ExecutorAutoRateResponse>()),
141+
)
142+
.optional(),
143+
autoRateVisuals: z
144+
.function(
145+
z.tuple([
146+
z.custom<ExecutorVisualAutoRateRequest>().describe('Context for the automated code rating'),
147+
z
148+
.custom<AbortSignal>()
149+
.describe('Abort Signal to fire when the request should be canceled.'),
150+
]),
151+
z.promise(z.custom<ExecutorAutoRateResponse>()),
152+
)
153+
.optional(),
127154
});
128155

129156
export type Executor = z.infer<typeof executorSchema>;

runner/orchestration/executors/local-executor.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import {ChildProcess, fork} from 'node:child_process';
2-
import path, {join} from 'node:path';
2+
import path from 'node:path';
33
import PQueue from 'p-queue';
44
import {LlmRunner, McpServerDetails} from '../../codegen/llm-runner.js';
55
import {getRunnerByName, RunnerName} from '../../codegen/runner-creation.js';

runner/ratings/autoraters/auto-rate-shared.ts

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
import {Usage} from '../../shared-interfaces.js';
1+
import type {LlmContextFile, Usage} from '../../shared-interfaces.js';
2+
3+
/** Minimum rating that the LLM can assign. */
4+
export const MIN_RATING = 1;
25

36
/** Maximum rating that the LLM can assign. */
47
export const MAX_RATING = 10;
@@ -13,8 +16,45 @@ export interface AutoRateResult {
1316
};
1417
}
1518

16-
export function getCoefficient(rating: number): number {
17-
const percent = rating / MAX_RATING;
19+
/** Request for executor to auto-rate generated code. */
20+
export interface ExecutorCodeAutoRateRequest {
21+
/** Prompt used for the rating. */
22+
ratingPrompt: string;
23+
/** Files that should be rated. */
24+
files: LlmContextFile[];
25+
/** Minimum score. */
26+
minRating: number;
27+
/** Maximum score. */
28+
maxRating: number;
29+
}
30+
31+
export interface ExecutorVisualAutoRateRequest {
32+
/** Prompt used for the rating. */
33+
ratingPrompt: string;
34+
/** URL to the image to be rated. */
35+
imageUrl: string;
36+
/** base64 representation of the image. */
37+
base64Image: string;
38+
/** Minimum score. */
39+
minRating: number;
40+
/** Maxmum score. */
41+
maxRating: number;
42+
}
43+
44+
/** Response from the executor to an automated rating request. */
45+
export interface ExecutorAutoRateResponse {
46+
/** Score of the rating. */
47+
rating: number;
48+
/** Text summary of the result. */
49+
summary: string;
50+
/** Categories of the rating and related descriptions. */
51+
categories: {name: string; message: string}[];
52+
/** Usage information about the auto rate request. */
53+
usage?: Usage;
54+
}
55+
56+
export function getCoefficient(rating: number, maxRating: number): number {
57+
const percent = rating / maxRating;
1858

1959
// More than 80% is a perfect score.
2060
if (percent >= 0.8) {

runner/ratings/autoraters/code-rater.ts

Lines changed: 55 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@ import {readFileSync} from 'node:fs';
22
import {z} from 'zod';
33
import {prepareContextFilesMessage} from '../../orchestration/codegen.js';
44
import {Environment} from '../../configuration/environment.js';
5+
import {IndividualAssessmentState, LlmResponseFile, Usage} from '../../shared-interfaces.js';
56
import {
6-
IndividualAssessment,
7-
IndividualAssessmentState,
8-
LlmResponseFile,
9-
SkippedIndividualAssessment,
10-
} from '../../shared-interfaces.js';
11-
import {AutoRateResult, getCoefficient, MAX_RATING} from './auto-rate-shared.js';
7+
AutoRateResult,
8+
ExecutorAutoRateResponse,
9+
getCoefficient,
10+
MAX_RATING,
11+
MIN_RATING,
12+
} from './auto-rate-shared.js';
1213
import {GenkitRunner} from '../../codegen/genkit/genkit-runner.js';
1314
import defaultCodeRaterPrompt from './code-rating-prompt.js';
1415
import {RatingsResult} from '../rating-types.js';
@@ -46,13 +47,7 @@ export async function autoRateCode(
4647
appPrompt: string,
4748
ratingsResult: RatingsResult,
4849
): Promise<AutoRateResult> {
49-
const contextMessage = prepareContextFilesMessage(
50-
files.map(o => ({
51-
relativePath: o.filePath,
52-
content: o.code,
53-
})),
54-
);
55-
50+
const contextFiles = files.map(o => ({relativePath: o.filePath, content: o.code}));
5651
let promptText: string;
5752

5853
if (environment.codeRatingPromptPath) {
@@ -80,32 +75,56 @@ export async function autoRateCode(
8075
SAFETY_WEB_RESULTS_JSON: safetyWebResultsJson,
8176
}).result;
8277

83-
const result = await llm.generateConstrained({
84-
abortSignal,
85-
messages: contextMessage ? [contextMessage] : [],
86-
model,
87-
prompt,
88-
skipMcp: true,
89-
schema: z.object({
90-
rating: z.number().describe(`Rating from 1-${MAX_RATING}. Best is ${MAX_RATING}.`),
91-
summary: z.string().describe('Summary of the overall code quality.'),
92-
categories: z.array(
93-
z.object({
94-
name: z.string().describe('Category name'),
95-
message: z.string().describe('Short description of the problem.'),
96-
}),
97-
),
98-
}),
99-
});
78+
let output: ExecutorAutoRateResponse;
79+
let usage: Usage | null;
80+
81+
if (environment.executor.autoRateCode) {
82+
output = await environment.executor.autoRateCode(
83+
{
84+
ratingPrompt: prompt,
85+
files: contextFiles,
86+
minRating: MIN_RATING,
87+
maxRating: MAX_RATING,
88+
},
89+
abortSignal,
90+
);
91+
usage = output.usage || null;
92+
} else {
93+
// TODO(crisbeto): move this into the local executor once
94+
// `Executor.autoRateCode` becomes a required method.
95+
const contextMessage = prepareContextFilesMessage(contextFiles);
96+
const result = await llm.generateConstrained({
97+
abortSignal,
98+
messages: contextMessage ? [contextMessage] : [],
99+
model,
100+
prompt,
101+
skipMcp: true,
102+
schema: z.object({
103+
rating: z
104+
.number()
105+
.describe(`Rating from ${MIN_RATING}-${MAX_RATING}. Best is ${MAX_RATING}.`),
106+
summary: z.string().describe('Summary of the overall code quality.'),
107+
categories: z.array(
108+
z.object({
109+
name: z.string().describe('Category name'),
110+
message: z.string().describe('Short description of the problem.'),
111+
}),
112+
),
113+
}),
114+
});
115+
116+
output = result.output!;
117+
usage = result.usage || null;
118+
}
100119

101120
return {
102-
coefficient: getCoefficient(result.output!.rating),
121+
coefficient: getCoefficient(output.rating, MAX_RATING),
103122
usage: {
104-
inputTokens: result.usage?.inputTokens ?? 0,
105-
outputTokens: result.usage?.outputTokens ?? 0,
106-
totalTokens: result.usage?.totalTokens ?? 0,
107-
thinkingTokens: result.usage?.thinkingTokens ?? 0,
123+
inputTokens: usage?.inputTokens ?? 0,
124+
outputTokens: usage?.outputTokens ?? 0,
125+
totalTokens: usage?.totalTokens ?? 0,
126+
thinkingTokens: usage?.thinkingTokens ?? 0,
108127
},
109-
details: result.output!,
128+
details: output,
110129
};
111130
}

runner/ratings/autoraters/visuals-rater.ts

Lines changed: 67 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
11
import {z} from 'zod';
22
import {PromptDataMessage} from '../../codegen/llm-runner.js';
3-
import {AutoRateResult, getCoefficient, MAX_RATING} from './auto-rate-shared.js';
3+
import {
4+
AutoRateResult,
5+
ExecutorAutoRateResponse,
6+
getCoefficient,
7+
MAX_RATING,
8+
MIN_RATING,
9+
} from './auto-rate-shared.js';
410
import {GenkitRunner} from '../../codegen/genkit/genkit-runner.js';
511
import defaultVisualRaterPrompt from './visual-rating-prompt.js';
612
import {Environment} from '../../configuration/environment.js';
713
import {screenshotUrlToPngBuffer} from '../../utils/screenshots.js';
14+
import {Usage} from '../../shared-interfaces.js';
815

916
/**
1017
* Automatically rate the appearance of a screenshot using an LLM.
@@ -29,53 +36,70 @@ export async function autoRateAppearance(
2936
APP_PROMPT: appPrompt,
3037
}).result;
3138

32-
const messages: PromptDataMessage[] = [
33-
{
34-
role: 'user',
35-
content: [
36-
{
37-
media: {
38-
base64PngImage: (await screenshotUrlToPngBuffer(screenshotPngUrl)).toString('base64'),
39-
url: screenshotPngUrl,
40-
},
41-
},
42-
],
43-
},
44-
];
39+
const base64Image = (await screenshotUrlToPngBuffer(screenshotPngUrl)).toString('base64');
4540

46-
const result = await llm.generateConstrained({
47-
abortSignal,
48-
messages,
49-
prompt,
50-
model,
51-
skipMcp: true,
52-
timeout: {
53-
description: `Rating screenshot of ${label} using ${model}`,
54-
durationInMins: 2.5,
55-
},
56-
schema: z.object({
57-
rating: z.number().describe(`Rating from 1-${MAX_RATING}. Best is ${MAX_RATING}.`),
58-
summary: z
59-
.string()
60-
.describe('Summary of the overall app, talking about concrete features, super concise.'),
61-
categories: z.array(
62-
z.object({
63-
name: z.string().describe('Category name'),
64-
message: z.string().describe('Short description of what is missing.'),
65-
}),
66-
),
67-
}),
68-
});
41+
let output: ExecutorAutoRateResponse;
42+
let usage: Usage | null;
43+
44+
if (environment.executor.autoRateVisuals) {
45+
output = await environment.executor.autoRateVisuals(
46+
{
47+
ratingPrompt: prompt,
48+
imageUrl: screenshotPngUrl,
49+
base64Image,
50+
minRating: MIN_RATING,
51+
maxRating: MAX_RATING,
52+
},
53+
abortSignal,
54+
);
55+
usage = output.usage || null;
56+
} else {
57+
// TODO(crisbeto): move this into the local executor once
58+
// `Executor.autoRateVisuals` becomes a required method.
59+
const messages: PromptDataMessage[] = [
60+
{
61+
role: 'user',
62+
content: [{media: {base64PngImage: base64Image, url: screenshotPngUrl}}],
63+
},
64+
];
65+
66+
const result = await llm.generateConstrained({
67+
abortSignal,
68+
messages,
69+
prompt,
70+
model,
71+
skipMcp: true,
72+
timeout: {
73+
description: `Rating screenshot of ${label} using ${model}`,
74+
durationInMins: 2.5,
75+
},
76+
schema: z.object({
77+
rating: z
78+
.number()
79+
.describe(`Rating from ${MIN_RATING}-${MAX_RATING}. Best is ${MAX_RATING}.`),
80+
summary: z
81+
.string()
82+
.describe('Summary of the overall app, talking about concrete features, super concise.'),
83+
categories: z.array(
84+
z.object({
85+
name: z.string().describe('Category name'),
86+
message: z.string().describe('Short description of what is missing.'),
87+
}),
88+
),
89+
}),
90+
});
6991

70-
const output = result.output!;
92+
output = result.output!;
93+
usage = result.usage || null;
94+
}
7195

7296
return {
73-
coefficient: getCoefficient(output.rating),
97+
coefficient: getCoefficient(output.rating, MAX_RATING),
7498
usage: {
75-
inputTokens: result.usage?.inputTokens ?? 0,
76-
outputTokens: result.usage?.outputTokens ?? 0,
77-
totalTokens: result.usage?.totalTokens ?? 0,
78-
thinkingTokens: result.usage?.thinkingTokens ?? 0,
99+
inputTokens: usage?.inputTokens ?? 0,
100+
outputTokens: usage?.outputTokens ?? 0,
101+
totalTokens: usage?.totalTokens ?? 0,
102+
thinkingTokens: usage?.thinkingTokens ?? 0,
79103
},
80104
details: output,
81105
};

0 commit comments

Comments
 (0)