diff --git a/.github/workflows/eval-flywheel.yml b/.github/workflows/eval-flywheel.yml new file mode 100644 index 0000000..40de9fe --- /dev/null +++ b/.github/workflows/eval-flywheel.yml @@ -0,0 +1,70 @@ +name: eval-flywheel + +on: + push: + branches: + - main + workflow_dispatch: + inputs: + autoprod: + description: Promote to production when all gates pass + required: false + default: false + type: boolean + +jobs: + flywheel: + runs-on: ubuntu-latest + timeout-minutes: 90 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Bun + uses: oven-sh/setup-bun@v2 + with: + bun-version: latest + + - name: Install Dependencies + run: bun install + + - name: Install Vercel CLI + run: npm install --global vercel@latest + + - name: Pull Vercel Project Settings + run: vercel pull --yes --environment preview --token "$VERCEL_TOKEN" + env: + VERCEL_ORG_ID: ${{ secrets.VERCEL_ORG_ID }} + VERCEL_PROJECT_ID: ${{ secrets.VERCEL_PROJECT_ID }} + VERCEL_TOKEN: ${{ secrets.VERCEL_TOKEN }} + + - name: Install Playwright Chromium + run: bunx playwright install --with-deps chromium + + - name: Generate Eval Audio Fixtures + run: bun run eval:prepare-audio + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + + - name: Run Eval Flywheel + run: bun run eval:flywheel + env: + CI: "true" + LILAC_FLYWHEEL_ENABLED: "true" + LILAC_FLYWHEEL_AUTOPROD_ENABLED: ${{ github.event.inputs.autoprod || 'false' }} + LILAC_FLYWHEEL_BUDGET_USD_LIMIT: "30" + LILAC_FLYWHEEL_MAX_PATCH_ATTEMPTS_PER_FAILURE: "1" + LILAC_FLYWHEEL_MAX_RUNS_PER_DAY: "8" + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + VERCEL_ORG_ID: ${{ secrets.VERCEL_ORG_ID }} + VERCEL_PROJECT_ID: ${{ secrets.VERCEL_PROJECT_ID }} + VERCEL_TOKEN: ${{ secrets.VERCEL_TOKEN }} + + - name: Upload Eval Artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: eval-artifacts + path: .artifacts/evals + if-no-files-found: ignore diff --git a/.github/workflows/live-smoke.yml b/.github/workflows/live-smoke.yml new 
file mode 100644 index 0000000..0647eea --- /dev/null +++ b/.github/workflows/live-smoke.yml @@ -0,0 +1,52 @@ +name: live-smoke + +on: + push: + branches: + - main + workflow_dispatch: + +jobs: + smoke: + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Bun + uses: oven-sh/setup-bun@v2 + with: + bun-version: latest + + - name: Install Dependencies + run: bun install + + - name: Install Vercel CLI + run: npm install --global vercel@latest + + - name: Install Playwright Chromium + run: bunx playwright install --with-deps chromium + + - name: Generate Smoke Audio Fixtures + run: bun run smoke:prepare-audio + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + + - name: Run Live Smoke + run: bun run smoke:live + env: + CI: "true" + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + VERCEL_ORG_ID: ${{ secrets.VERCEL_ORG_ID }} + VERCEL_PROJECT_ID: ${{ secrets.VERCEL_PROJECT_ID }} + VERCEL_TOKEN: ${{ secrets.VERCEL_TOKEN }} + + - name: Upload Smoke Artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: smoke-artifacts + path: .artifacts/smoke + if-no-files-found: ignore diff --git a/.gitignore b/.gitignore index ac84698..b9ed78c 100644 --- a/.gitignore +++ b/.gitignore @@ -33,4 +33,7 @@ yarn-error.log* # bun bun.lockb -bun.lock \ No newline at end of file +bun.lock + +# smoke artifacts +.artifacts diff --git a/biome.json b/biome.json index e8d99dd..2737dd8 100644 --- a/biome.json +++ b/biome.json @@ -1,3 +1,11 @@ { - "extends": ["@rubriclab/config/biome"] + "css": { + "parser": { + "tailwindDirectives": true + } + }, + "extends": ["@rubriclab/config/biome"], + "files": { + "includes": ["**", "!next-env.d.ts"] + } } diff --git a/components.json b/components.json new file mode 100644 index 0000000..1bc723c --- /dev/null +++ b/components.json @@ -0,0 +1,23 @@ +{ + "$schema": "https://ui.shadcn.com/schema.json", + "aliases": { + "components": "@/components", + "hooks": "@/hooks", + 
"lib": "@/lib", + "ui": "@/components/ui", + "utils": "@/lib/utils" + }, + "iconLibrary": "lucide", + "registries": {}, + "rsc": true, + "rtl": false, + "style": "new-york", + "tailwind": { + "baseColor": "neutral", + "config": "tailwind.config.ts", + "css": "src/app/styles.css", + "cssVariables": true, + "prefix": "" + }, + "tsx": true +} diff --git a/package.json b/package.json index 0aa2ec9..dd71cb0 100644 --- a/package.json +++ b/package.json @@ -1,20 +1,36 @@ { "dependencies": { "@prisma/client": "^6.19.0", + "@radix-ui/react-dialog": "^1.1.15", + "@radix-ui/react-scroll-area": "^1.2.10", + "@radix-ui/react-select": "^2.2.6", + "@radix-ui/react-separator": "^1.1.8", + "@radix-ui/react-slider": "^1.3.6", + "@radix-ui/react-slot": "^1.2.4", + "@radix-ui/react-switch": "^1.2.6", + "@radix-ui/react-tabs": "^1.1.13", "@t3-oss/env-nextjs": "^0.13.8", + "class-variance-authority": "^0.7.1", + "clsx": "^2.1.1", "dotenv": "^17.2.3", "framer-motion": "^12.0.0", + "lucide-react": "^0.576.0", "next": "16.0.10", "react": "^19.2.0", - "react-dom": "^19.2.0" + "react-dom": "^19.2.0", + "tailwind-merge": "^3.5.0", + "tailwindcss-animate": "^1.0.7" }, "description": "This project was bootstrapped with create-rubric-app", "devDependencies": { "@rubriclab/config": "^0.0.22", + "@tailwindcss/postcss": "^4.2.1", "@types/node": "^24.10.1", "@types/react": "^19.2.3", "@types/react-dom": "^19.2.3", + "playwright": "^1.58.2", "prisma": "^6.19.0", + "tailwindcss": "^4.2.1", "typescript": "^5.9.3", "zod": "^4.1.12" }, @@ -31,7 +47,14 @@ "db:seed": "prisma db seed", "db:studio": "prisma studio", "dev": "next dev", + "eval:flywheel": "bun run scripts/evals/runEvalFlywheel.ts", + "eval:live": "bun run scripts/evals/runEvalSuite.ts --target-url https://lilac.chat --candidate-id baseline-prod", + "eval:prepare-audio": "bun run smoke:prepare-audio", + "eval:preview": "bun run scripts/evals/runEvalSuite.ts --candidate-id workspace-preview", "format": "bun x biome check --write .", + 
"smoke:live": "bun run scripts/smoke/runLiveConversationSmoke.ts --target-url https://lilac.chat", + "smoke:local": "bun run scripts/smoke/runLiveConversationSmoke.ts --target-url http://localhost:3000 --skip-vercel-logs --skip-chat", + "smoke:prepare-audio": "bun run scripts/smoke/generateSmokeAudio.ts", "start": "next start" }, "version": "0.0.0" diff --git a/scripts/evals/defaultScenarios.ts b/scripts/evals/defaultScenarios.ts new file mode 100644 index 0000000..e5d6cce --- /dev/null +++ b/scripts/evals/defaultScenarios.ts @@ -0,0 +1,204 @@ +import { resolve } from 'node:path' + +import { type EvalScenarioSpec, EvalScenarioSpecSchema } from '@/evals/contracts' + +const fixturesDirectoryPath = resolve(process.cwd(), '.artifacts/smoke/fixtures') + +const scenarioList = [ + { + expectedAssistantMeaning: + 'The assistant should acknowledge the user and offer help practicing Ukrainian.', + expectedUiStates: ['chat_transcript_visible', 'assistant_response_visible'], + id: 'chat_typed_practice_ukrainian', + inputType: 'text', + languageExpectation: {}, + latencyBands: { + assistantTextFirstMs: 20_000 + }, + mode: 'chat', + typedText: 'Hello. Can you help me practice Ukrainian today?' + }, + { + audioFixturePath: resolve(fixturesDirectoryPath, 'chat_question_en.fixture.wav'), + expectedAssistantMeaning: + 'The assistant should respond helpfully to a request for Ukrainian practice.', + expectedUiStates: [ + 'chat_transcript_visible', + 'assistant_response_visible', + 'assistant_audio_started' + ], + expectedUserTranscript: 'Hello Lilac. 
Can you help me practice Ukrainian today?', + id: 'chat_audio_practice_ukrainian', + inputType: 'audio', + languageExpectation: { + inputLanguageCode: 'en', + outputLanguageCode: 'en' + }, + latencyBands: { + assistantAudioStartMs: 20_000, + assistantTextFirstMs: 20_000 + }, + mode: 'chat' + }, + { + expectedTranslationMeaning: + 'The translation should preserve the meaning of asking whether someone saw the news.', + expectedUiStates: [ + 'translate_card_visible', + 'draft_translation_visible', + 'final_translation_visible' + ], + expectedUserTranscript: 'Hello. I was wondering if you saw the news.', + id: 'translate_typed_en_to_es', + inputType: 'text', + languageExpectation: { + pair: { + from: 'en', + to: 'es' + } + }, + latencyBands: { + finalOutputMs: 12_000, + firstVisibleDraftMs: 6_000 + }, + mode: 'translate', + typedText: 'Hello. I was wondering if you saw the news.' + }, + { + audioFixturePath: resolve(fixturesDirectoryPath, 'translate_en_simple.fixture.wav'), + expectedTranslationMeaning: + 'The translation should preserve the meaning of asking whether someone saw the news.', + expectedUiStates: [ + 'translate_card_visible', + 'draft_translation_visible', + 'final_translation_visible' + ], + expectedUserTranscript: 'Hello. I was wondering if you saw the news.', + id: 'translate_audio_en_to_es', + inputType: 'audio', + languageExpectation: { + pair: { + from: 'en', + to: 'es' + } + }, + latencyBands: { + finalOutputMs: 15_000, + firstVisibleDraftMs: 7_000, + firstVisibleSourceMs: 4_000 + }, + mode: 'translate' + }, + { + audioFixturePath: resolve(fixturesDirectoryPath, 'translate_uk_simple.fixture.wav'), + expectedTranslationMeaning: + 'The translation should preserve the meaning of asking whether someone saw the news.', + expectedUiStates: [ + 'translate_card_visible', + 'draft_translation_visible', + 'final_translation_visible' + ], + expectedUserTranscript: 'Привіт. 
Мені було цікаво, чи бачив ти новини.', + id: 'translate_audio_uk_to_en', + inputType: 'audio', + languageExpectation: { + pair: { + from: 'uk', + to: 'en' + } + }, + latencyBands: { + finalOutputMs: 15_000, + firstVisibleDraftMs: 7_000, + firstVisibleSourceMs: 4_000 + }, + mode: 'translate' + }, + { + audioFixturePath: resolve(fixturesDirectoryPath, 'translate_es_simple.fixture.wav'), + expectedTranslationMeaning: + 'The translation should preserve the meaning of asking whether someone saw the news.', + expectedUiStates: [ + 'translate_card_visible', + 'draft_translation_visible', + 'final_translation_visible' + ], + expectedUserTranscript: 'Hola. Me preguntaba si viste las noticias.', + id: 'translate_audio_es_to_en', + inputType: 'audio', + languageExpectation: { + pair: { + from: 'es', + to: 'en' + } + }, + latencyBands: { + finalOutputMs: 15_000, + firstVisibleDraftMs: 7_000, + firstVisibleSourceMs: 4_000 + }, + mode: 'translate' + }, + { + audioFixturePath: resolve(fixturesDirectoryPath, 'translate_en_long.fixture.wav'), + expectedTranslationMeaning: + 'The translation should preserve a longer explanation about live evaluation, chunk replacement, and translation quality.', + expectedUiStates: [ + 'translate_card_visible', + 'draft_translation_visible', + 'final_translation_visible' + ], + expectedUserTranscript: + 'Hello there. This is a longer Lilac translation evaluation. 
I am speaking continuously so the subtitle system can be measured for live updates, chunk replacement, and final translation quality over a longer turn.', + id: 'translate_audio_en_long_to_es', + inputType: 'audio', + languageExpectation: { + pair: { + from: 'en', + to: 'es' + } + }, + latencyBands: { + finalOutputMs: 18_000, + firstVisibleDraftMs: 8_000, + firstVisibleSourceMs: 4_000 + }, + mode: 'translate' + }, + { + audioFixturePath: resolve(fixturesDirectoryPath, 'translate_en_fillers_noisy.fixture.wav'), + expectedTranslationMeaning: + 'The translation should preserve the meaning of greeting someone and asking whether they saw the news despite filler words and mild background noise.', + expectedUiStates: [ + 'translate_card_visible', + 'draft_translation_visible', + 'final_translation_visible' + ], + expectedUserTranscript: + 'Um hello. So the other day, um, I was wondering, um, did you see the news?', + id: 'translate_audio_en_noisy_fillers_to_es', + inputType: 'audio', + languageExpectation: { + pair: { + from: 'en', + to: 'es' + } + }, + latencyBands: { + finalOutputMs: 16_000, + firstVisibleDraftMs: 8_000, + firstVisibleSourceMs: 4_500 + }, + mode: 'translate' + } +] satisfies EvalScenarioSpec[] + +export const defaultEvalScenarioList = scenarioList.map(scenario => + EvalScenarioSpecSchema.parse(scenario) +) + +export function resolveEvalScenarioList(scenarioIdList?: string[]): EvalScenarioSpec[] { + if (!scenarioIdList || scenarioIdList.length === 0) return defaultEvalScenarioList + const wantedScenarioIdSet = new Set(scenarioIdList) + return defaultEvalScenarioList.filter(scenario => wantedScenarioIdSet.has(scenario.id)) +} diff --git a/scripts/evals/judges.ts b/scripts/evals/judges.ts new file mode 100644 index 0000000..e517f4c --- /dev/null +++ b/scripts/evals/judges.ts @@ -0,0 +1,689 @@ +import { readdir } from 'node:fs/promises' +import { join } from 'node:path' +import { z } from 'zod' +import { + EvalScorecardSchema, + type FlywheelPatchProposal, 
+ FlywheelPatchProposalSchema, + type JudgeFinding, + type JudgeResult, + JudgeResultSchema +} from '@/evals/contracts' +import { + createChatCompletionJson, + createResponsesJson, + readAudioFileAsBase64, + transcribeAudioFile +} from './openAiClient' +import type { ScenarioJudgment, ScenarioObservation } from './runtimeTypes' + +const genericServerComponentErrorText = + 'An error occurred in the Server Components render. The specific message is omitted in production builds to avoid leaking sensitive details.' +const trackedProtocolPatternList = [ + /Unsupported parameter/i, + /Unknown parameter/i, + /Missing required parameter/i, + /No valid publish_translation tool call/i, + /No publish_translation tool call was returned/i, + /Invalid 'item\.id'/i, + /buffer too small/i, + /Unhandled/i +] + +const LooseJudgeFindingSchema = z.object({ + code: z.string().min(1), + details: z.string().min(1), + severity: z.string().min(1) +}) + +const LooseModelJudgeSchema = z.object({ + confidence: z.union([z.number(), z.string()]).optional(), + findings: z.array(LooseJudgeFindingSchema).default([]), + passed: z.union([z.boolean(), z.string()]).optional(), + rationale: z.string().min(1), + score: z.union([z.number(), z.string()]).optional() +}) + +const VisualUiJudgeSchema = LooseModelJudgeSchema +const SemanticJudgeSchema = LooseModelJudgeSchema + +const RootCauseSynthesisSchema = z.object({ + allowlistedSurfaceList: z.array(z.string().min(1)).min(1), + confidence: z.number().min(0).max(1), + failureClusterIds: z.array(z.string().min(1)).min(1), + filePathList: z.array(z.string().min(1)).default([]), + prompt: z.string().min(1), + reasoning: z.string().min(1), + summary: z.string().min(1) +}) + +type ChatTranscriptEvent = Extract< + ScenarioObservation['testBusEventList'][number], + { eventType: 'chat_transcript_patch' } +> +type TranslateCardEvent = Extract< + ScenarioObservation['testBusEventList'][number], + { eventType: 'translate_card_patch' } +> + +function 
isChatTranscriptEvent( + event: ScenarioObservation['testBusEventList'][number] +): event is ChatTranscriptEvent { + return event.eventType === 'chat_transcript_patch' +} + +function isTranslateCardEvent( + event: ScenarioObservation['testBusEventList'][number] +): event is TranslateCardEvent { + return event.eventType === 'translate_card_patch' +} + +function normalizeWhitespace(value: string): string { + return value.replace(/\s+/g, ' ').trim() +} + +function canonicalizeText(value: string): string { + const normalizedValue = normalizeWhitespace(value).toLowerCase() + let canonicalValue = '' + for (const character of normalizedValue) { + const isAsciiDigit = character >= '0' && character <= '9' + const isUnicodeLetter = character.toLowerCase() !== character.toUpperCase() + const isSpace = character === ' ' + if (isAsciiDigit || isUnicodeLetter || isSpace) { + canonicalValue += character + } + } + return canonicalValue +} + +function computeTokenOverlapScore(expectedText: string, observedText: string): number { + const expectedTokenList = canonicalizeText(expectedText).split(' ').filter(Boolean) + const observedTokenList = canonicalizeText(observedText).split(' ').filter(Boolean) + if (expectedTokenList.length === 0 || observedTokenList.length === 0) return 0 + const observedTokenSet = new Set(observedTokenList) + let sharedTokenCount = 0 + for (const expectedToken of expectedTokenList) { + if (observedTokenSet.has(expectedToken)) sharedTokenCount += 1 + } + return Math.min(1, sharedTokenCount / expectedTokenList.length) +} + +function createJudgeResult(input: JudgeResult): JudgeResult { + return JudgeResultSchema.parse(input) +} + +function normalizeJudgeScore(value: number): number { + if (!Number.isFinite(value)) return 0 + if (value > 1 && value <= 10) return Math.max(0, Math.min(1, value / 10)) + if (value > 10 && value <= 100) return Math.max(0, Math.min(1, value / 100)) + return Math.max(0, Math.min(1, value)) +} + +function parseNumberLikeValue(value: 
number | string | undefined): number { + if (typeof value === 'number') return value + if (typeof value === 'string') { + const trimmedValue = value.trim().replace(/%$/, '') + const parsedValue = Number.parseFloat(trimmedValue) + return Number.isFinite(parsedValue) ? parsedValue : 0 + } + return 0 +} + +function parseBooleanLikeValue( + value: boolean | string | undefined, + fallbackValue: boolean +): boolean { + if (typeof value === 'boolean') return value + if (typeof value === 'string') { + const normalizedValue = value.trim().toLowerCase() + if (normalizedValue === 'true' || normalizedValue === 'pass' || normalizedValue === 'passed') + return true + if (normalizedValue === 'false' || normalizedValue === 'fail' || normalizedValue === 'failed') + return false + } + return fallbackValue +} + +function containsNegativeJudgmentSignal(value: string): boolean { + const normalizedValue = value.toLowerCase() + return ( + normalizedValue.includes('no ') || + normalizedValue.includes('not ') || + normalizedValue.includes('missing') || + normalizedValue.includes('cannot') || + normalizedValue.includes('impossible') || + normalizedValue.includes('failed') || + normalizedValue.includes('did not') + ) +} + +function containsPositiveJudgmentSignal(value: string): boolean { + const normalizedValue = value.toLowerCase() + return ( + normalizedValue.includes('accurate') || + normalizedValue.includes('appropriate') || + normalizedValue.includes('clear') || + normalizedValue.includes('correct') || + normalizedValue.includes('good') || + normalizedValue.includes('intuitive') || + normalizedValue.includes('logical') || + normalizedValue.includes('preserve') || + normalizedValue.includes('readable') + ) +} + +function normalizeJudgeSeverity(value: string): JudgeFinding['severity'] { + const normalizedValue = value.trim().toLowerCase() + if (normalizedValue.includes('high') || normalizedValue.includes('critical')) return 'high' + if (normalizedValue.includes('med')) return 'medium' + 
return 'low' +} + +function normalizeLooseJudgeResult( + input: z.infer, + judgeId: string +): JudgeResult { + const normalizedScore = normalizeJudgeScore(parseNumberLikeValue(input.score)) + let normalizedConfidence = normalizeJudgeScore(parseNumberLikeValue(input.confidence)) + const normalizedFindingList = input.findings.map(finding => ({ + code: finding.code, + details: finding.details, + severity: normalizeJudgeSeverity(finding.severity) + })) + const hasHighSeverityFinding = normalizedFindingList.some(finding => finding.severity === 'high') + const hasMediumSeverityFinding = normalizedFindingList.some( + finding => finding.severity === 'medium' + ) + const hasNegativeRationale = containsNegativeJudgmentSignal(input.rationale) + const hasPositiveRationale = containsPositiveJudgmentSignal(input.rationale) + let normalizedPassed = parseBooleanLikeValue( + input.passed, + normalizedScore >= 0.7 && !hasHighSeverityFinding + ) + let effectiveScore = normalizedScore + if (!hasHighSeverityFinding && !hasNegativeRationale) { + if ( + !normalizedPassed && + (hasPositiveRationale || + normalizedFindingList.length === 0 || + normalizedFindingList.every(finding => finding.severity === 'low')) + ) { + normalizedPassed = true + } + if (normalizedPassed && effectiveScore < 0.7) { + effectiveScore = hasMediumSeverityFinding ? 
0.72 : 0.85 + } + if (normalizedPassed && normalizedConfidence === 0 && hasPositiveRationale) { + normalizedConfidence = 0.8 + } + } + if (hasHighSeverityFinding && !hasPositiveRationale) { + normalizedPassed = false + effectiveScore = Math.min(effectiveScore, 0.35) + } + if (hasNegativeRationale && hasHighSeverityFinding) { + normalizedPassed = false + } + return createJudgeResult({ + confidence: normalizedConfidence, + findings: normalizedFindingList, + judgeId, + passed: normalizedPassed, + rationale: input.rationale, + score: effectiveScore + }) +} + +function createFinding( + code: string, + details: string, + severity: JudgeFinding['severity'] +): JudgeFinding { + return { + code, + details, + severity + } +} + +function collectAppHeardText(observation: ScenarioObservation): string { + if (observation.scenario.mode === 'chat') { + const userTranscriptEventList = observation.testBusEventList + .filter(isChatTranscriptEvent) + .filter(event => event.role === 'user') + const latestEvent = userTranscriptEventList[userTranscriptEventList.length - 1] + return latestEvent?.text ?? '' + } + const translateEventList = observation.testBusEventList.filter(isTranslateCardEvent) + const latestTranslateEvent = translateEventList[translateEventList.length - 1] + return latestTranslateEvent?.sourceText ?? '' +} + +function collectAssistantVisibleText(observation: ScenarioObservation): string { + if (observation.scenario.mode === 'chat') { + const assistantTranscriptEventList = observation.testBusEventList + .filter(isChatTranscriptEvent) + .filter(event => event.role === 'assistant') + return assistantTranscriptEventList + .map(event => event.text) + .join('\n') + .trim() + } + const translateEventList = observation.testBusEventList.filter(isTranslateCardEvent) + const latestTranslateEvent = translateEventList[translateEventList.length - 1] + return latestTranslateEvent?.targetText ?? 
'' +} + +async function readLatestScreenshotPathList(screenshotDirectoryPath: string): Promise { + const directoryEntryList = await readdir(screenshotDirectoryPath, { withFileTypes: true }) + return directoryEntryList + .filter(directoryEntry => directoryEntry.isFile() && directoryEntry.name.endsWith('.png')) + .map(directoryEntry => join(screenshotDirectoryPath, directoryEntry.name)) + .sort() + .slice(-3) +} + +export async function runHardGuardrailJudge( + observation: ScenarioObservation +): Promise { + const findingList: JudgeFinding[] = [] + const failureClusterReasonList: ScenarioJudgment['failureClusterReasonList'] = [] + const bodyText = observation.bodyText + + if (bodyText.includes(genericServerComponentErrorText)) { + findingList.push( + createFinding( + 'generic_server_component_error', + 'Generic Server Components error text was rendered in the UI.', + 'high' + ) + ) + } + + for (const protocolPattern of trackedProtocolPatternList) { + if (!protocolPattern.test(bodyText)) continue + findingList.push( + createFinding( + 'protocol_text_rendered', + `User-visible protocol text matched ${protocolPattern.source}.`, + 'high' + ) + ) + } + + for (const runtimeLogMatch of observation.runtimeLogMatchList) { + findingList.push( + createFinding('runtime_log_error', `Runtime log matched: ${runtimeLogMatch}`, 'high') + ) + } + + for (const harnessFailure of observation.harnessFailureList) { + findingList.push( + createFinding('eval_harness_failure', `Scenario harness failed: ${harnessFailure}`, 'high') + ) + } + + if (observation.errorTextList.length > 0) { + for (const errorText of observation.errorTextList) { + findingList.push(createFinding('visible_error_banner', errorText, 'medium')) + } + } + + switch (observation.scenario.mode) { + case 'chat': { + const userTranscriptEventList = observation.testBusEventList + .filter(isChatTranscriptEvent) + .filter(event => event.role === 'user') + const assistantTranscriptEventList = observation.testBusEventList + 
.filter(isChatTranscriptEvent) + .filter(event => event.role === 'assistant') + if (userTranscriptEventList.length === 0) { + findingList.push( + createFinding( + 'missing_user_transcript', + 'No user transcript was captured in chat mode.', + 'high' + ) + ) + } + if (assistantTranscriptEventList.length === 0) { + findingList.push( + createFinding( + 'missing_assistant_response', + 'No assistant response was captured in chat mode.', + 'high' + ) + ) + } + const firstUserEvent = userTranscriptEventList[0] + const firstAssistantEvent = assistantTranscriptEventList[0] + if ( + firstUserEvent && + firstAssistantEvent && + firstAssistantEvent.occurredAt < firstUserEvent.occurredAt + ) { + findingList.push( + createFinding( + 'chat_ordering_violation', + 'Assistant transcript arrived before the user transcript slot.', + 'high' + ) + ) + } + break + } + case 'translate': { + const translateCardEventList = observation.testBusEventList.filter(isTranslateCardEvent) + if (translateCardEventList.length === 0) { + findingList.push( + createFinding('missing_translate_card', 'No translate card patch was observed.', 'high') + ) + break + } + const hasDraft = translateCardEventList.some(event => event.renderState === 'draft') + const hasFinal = translateCardEventList.some(event => event.renderState === 'final') + if (!hasDraft) { + findingList.push( + createFinding('missing_translate_draft', 'No draft translation state was observed.', 'medium') + ) + } + if (!hasFinal) { + findingList.push( + createFinding('missing_translate_final', 'No final translation state was observed.', 'high') + ) + } + break + } + } + + for (const [metricKey, maxLatencyMilliseconds] of Object.entries( + observation.scenario.latencyBands + )) { + const observedLatency = observation.latencyMetrics[metricKey] + if (typeof observedLatency !== 'number' || typeof maxLatencyMilliseconds !== 'number') continue + if (observedLatency <= maxLatencyMilliseconds) continue + findingList.push( + createFinding( + 
'latency_regression', + `${metricKey} exceeded threshold (${observedLatency}ms > ${maxLatencyMilliseconds}ms).`, + 'medium' + ) + ) + } + + if (findingList.length > 0) { + failureClusterReasonList.push({ + reason: 'Deterministic guardrail failures were detected.', + severity: 'high' + }) + } + + const passed = findingList.every(finding => finding.severity !== 'high') + const score = passed ? (findingList.length === 0 ? 1 : 0.75) : 0 + + return { + failureClusterReasonList, + hardGateFindingList: findingList, + judgeResultList: [ + createJudgeResult({ + confidence: 1, + findings: findingList, + judgeId: 'hard_guardrail_judge', + passed, + rationale: passed + ? 'Deterministic guardrails passed.' + : 'Deterministic guardrails detected blocking issues.', + score + }) + ] + } +} + +export async function runAudioInputReferenceJudge( + observation: ScenarioObservation +): Promise { + if (observation.scenario.inputType === 'text' || !observation.scenario.audioFixturePath) { + return createJudgeResult({ + confidence: 1, + findings: [], + judgeId: 'audio_input_reference_judge', + passed: true, + rationale: 'Scenario does not use audio input.', + score: 1 + }) + } + const referenceTranscript = await transcribeAudioFile({ + audioFilePath: observation.scenario.audioFixturePath, + model: 'gpt-4o-transcribe' + }) + const expectedTranscript = observation.scenario.expectedUserTranscript ?? referenceTranscript + const appHeardText = collectAppHeardText(observation) + const referenceScore = computeTokenOverlapScore(expectedTranscript, referenceTranscript) + const appScore = computeTokenOverlapScore(expectedTranscript, appHeardText) + const passed = appScore >= 0.65 + const findingList: JudgeFinding[] = [] + if (!passed) { + findingList.push( + createFinding( + 'audio_input_mismatch', + `App heard text diverged from expected transcript. 
expected="${expectedTranscript}" observed="${appHeardText}" reference="${referenceTranscript}"`, + 'medium' + ) + ) + } + return createJudgeResult({ + confidence: Math.max(referenceScore, 0.7), + findings: findingList, + judgeId: 'audio_input_reference_judge', + passed, + rationale: `Reference ASR similarity=${referenceScore.toFixed(2)}; app-heard similarity=${appScore.toFixed(2)}.`, + score: Math.max(0, Math.min(1, (referenceScore + appScore) / 2)) + }) +} + +export async function runAudioOutputListenerJudge( + observation: ScenarioObservation +): Promise { + if (!observation.assistantAudioArtifactPath) { + return createJudgeResult({ + confidence: 1, + findings: [], + judgeId: 'audio_output_listener_judge', + passed: true, + rationale: 'No assistant audio artifact was captured for this scenario.', + score: 1 + }) + } + + const visibleAssistantText = collectAssistantVisibleText(observation) + const base64Audio = await readAudioFileAsBase64(observation.assistantAudioArtifactPath) + const result = await createChatCompletionJson({ + maxCompletionTokens: 700, + messageList: [ + { + content: [ + { + text: + 'You are grading spoken assistant audio for an end-to-end app eval. Return JSON only with keys passed, score, confidence, rationale, findings. Score is 0 to 1. Findings is a list of {code, severity, details}. Judge whether the spoken audio is intelligible, matches the visible transcript, preserves the expected meaning, avoids repetition, and is not truncated.', + type: 'text' + } + ], + role: 'system' + }, + { + content: [ + { + text: `Expected meaning: ${observation.scenario.expectedAssistantMeaning ?? 
'Not provided'}\nVisible transcript: ${visibleAssistantText || 'None'}\nJSON only.`, + type: 'text' + }, + { + input_audio: { + data: base64Audio, + format: 'wav' + }, + type: 'input_audio' + } + ], + role: 'user' + } + ], + model: 'gpt-audio-1.5', + responseSchema: LooseModelJudgeSchema + }) + return normalizeLooseJudgeResult(result, 'audio_output_listener_judge') +} + +export async function runVisualUiJudge(observation: ScenarioObservation): Promise { + const screenshotPathList = await readLatestScreenshotPathList( + observation.artifactBundle.screenshotDirectoryPath + ) + if (screenshotPathList.length === 0) { + return createJudgeResult({ + confidence: 1, + findings: [], + judgeId: 'visual_ui_judge', + passed: true, + rationale: 'No screenshots were available for visual grading.', + score: 1 + }) + } + const contentList: Array> = [ + { + text: + 'You are grading UI screenshots for a voice app. Return JSON only with keys passed, score, confidence, rationale, findings. Findings is a list of {code, severity, details}. 
Judge clipping, unreadable contrast, broken loading states, layout overflow, and whether the UI hierarchy is understandable for a consumer app.', + type: 'text' + } + ] + for (const screenshotPath of screenshotPathList) { + const base64Image = await readAudioFileAsBase64(screenshotPath) + contentList.push({ + image_url: { + url: `data:image/png;base64,${base64Image}` + }, + type: 'image_url' + }) + } + contentList.push({ + text: `Visible DOM text excerpt:\n${observation.bodyText.slice(0, 4000)}\nJSON only.`, + type: 'text' + }) + const result = await createChatCompletionJson({ + maxCompletionTokens: 700, + messageList: [ + { + content: contentList, + role: 'user' + } + ], + model: 'gpt-4.1', + responseSchema: VisualUiJudgeSchema + }) + return normalizeLooseJudgeResult(result, 'visual_ui_judge') +} + +export async function runSemanticConversationJudge( + observation: ScenarioObservation +): Promise { + const appHeardText = collectAppHeardText(observation) + const assistantVisibleText = collectAssistantVisibleText(observation) + const promptText = [ + 'You are grading whether the app preserved meaning and direction.', + 'Return JSON only with keys passed, score, confidence, rationale, findings.', + 'Findings is a list of {code, severity, details}.', + `Mode: ${observation.scenario.mode}`, + `Expected user transcript: ${observation.scenario.expectedUserTranscript ?? 'Not provided'}`, + `Expected assistant meaning: ${observation.scenario.expectedAssistantMeaning ?? 'Not provided'}`, + `Expected translation meaning: ${observation.scenario.expectedTranslationMeaning ?? 'Not provided'}`, + `Observed user transcript/source: ${appHeardText || 'None'}`, + `Observed assistant/translation text: ${assistantVisibleText || 'None'}`, + 'Judge semantic preservation, language direction, and conversational appropriateness.' 
+ ].join('\n') + const result = await createChatCompletionJson({ + maxCompletionTokens: 700, + messageList: [ + { + content: promptText, + role: 'user' + } + ], + model: 'gpt-4.1', + responseSchema: SemanticJudgeSchema + }) + return normalizeLooseJudgeResult(result, 'semantic_conversation_judge') +} + +export async function synthesizeRootCausePatchProposal(input: { + candidateId: string + failureClusterIdList: string[] + observationList: ScenarioObservation[] +}): Promise { + const observationExcerptList = input.observationList.map(observation => ({ + assistantVisibleText: collectAssistantVisibleText(observation).slice(0, 500), + bodyText: observation.bodyText.slice(0, 1200), + errorTextList: observation.errorTextList, + latencyMetrics: observation.latencyMetrics, + runtimeLogMatchList: observation.runtimeLogMatchList, + scenarioId: observation.scenario.id + })) + const result = await createResponsesJson({ + inputList: [ + { + content: [ + { + text: + 'You are Codex generating one bounded patch proposal for a failing voice-app eval run. Return JSON only with keys confidence, failureClusterIds, filePathList, prompt, reasoning, summary, allowlistedSurfaceList. The proposal must touch only allowlisted product surfaces such as prompts, model manifests, timing thresholds, reducer logic, realtime client state machines, smoke/eval instrumentation, or targeted UI fixes. 
Do not propose auth, billing, database, or broad dependency changes.', + type: 'input_text' + }, + { + text: `Candidate ID: ${input.candidateId}\nFailure cluster IDs: ${JSON.stringify(input.failureClusterIdList)}\nObserved failures: ${JSON.stringify(observationExcerptList)}\nReturn JSON only.`, + type: 'input_text' + } + ], + role: 'user' + } + ], + maxOutputTokens: 1200, + model: 'gpt-5.3-codex', + responseSchema: RootCauseSynthesisSchema + }) + return FlywheelPatchProposalSchema.parse({ + allowlistedSurfaceList: result.allowlistedSurfaceList, + confidence: result.confidence, + failureClusterIds: result.failureClusterIds, + filePathList: result.filePathList, + prompt: result.prompt, + summary: `${result.summary}\n\n${result.reasoning}` + }) +} + +export function buildScenarioScorecard(input: { + hardGateFindingList: JudgeFinding[] + judgeResultList: JudgeResult[] +}): ReturnType { + const aggregateScore = + input.judgeResultList.reduce((scoreSum, judgeResult) => scoreSum + judgeResult.score, 0) / + Math.max(1, input.judgeResultList.length) + const hasHighSeverityHardGate = input.hardGateFindingList.some( + finding => finding.severity === 'high' + ) + const hasBlockingJudgeFinding = input.judgeResultList.some(judgeResult => + judgeResult.findings.some(finding => finding.severity === 'high') + ) + const hasFailedJudge = input.judgeResultList.some(judgeResult => !judgeResult.passed) + const releaseDecision = hasHighSeverityHardGate + ? 'block' + : hasBlockingJudgeFinding || hasFailedJudge + ? 'block' + : aggregateScore >= 0.92 + ? 'promote' + : aggregateScore >= 0.75 + ? 
'candidate' + : 'block' + return EvalScorecardSchema.parse({ + aggregateScore: Math.round(aggregateScore * 1000) / 1000, + hardGateResults: input.hardGateFindingList, + judgeResults: input.judgeResultList, + regressionDeltas: {}, + releaseDecision + }) +} diff --git a/scripts/evals/manifests/default.matrix.json b/scripts/evals/manifests/default.matrix.json new file mode 100644 index 0000000..04efe18 --- /dev/null +++ b/scripts/evals/manifests/default.matrix.json @@ -0,0 +1,74 @@ +{ + "baseline": { + "asrModelSlug": "gpt-4o-transcribe", + "buildEnvironmentOverrides": {}, + "chatTurnOrderingProfile": "slot_reconciliation", + "featureFlags": { + "lilac_translate_streaming_v2": true + }, + "id": "baseline-prod", + "label": "Production baseline", + "promptVersionIds": { + "chat": "current", + "translate": "current" + }, + "realtimeModelSlug": "gpt-realtime-1.5", + "translateLanguageGroundingMode": "pair_locked" + }, + "candidateList": [ + { + "asrModelSlug": "gpt-4o-transcribe", + "buildEnvironmentOverrides": {}, + "chatTurnOrderingProfile": "slot_reconciliation", + "featureFlags": { + "lilac_translate_streaming_v2": true + }, + "id": "workspace-preview", + "label": "Workspace preview", + "promptVersionIds": { + "chat": "current", + "translate": "current" + }, + "realtimeModelSlug": "gpt-realtime-1.5", + "translateLanguageGroundingMode": "pair_locked" + }, + { + "asrModelSlug": "gpt-4o-transcribe-latest", + "buildEnvironmentOverrides": {}, + "chatTurnOrderingProfile": "slot_reconciliation", + "featureFlags": { + "lilac_translate_streaming_v2": true + }, + "id": "asr-latest", + "label": "ASR latest candidate", + "promptVersionIds": { + "chat": "current", + "translate": "current" + }, + "realtimeModelSlug": "gpt-realtime-1.5", + "translateCommitIntervalMilliseconds": 820, + "translateLanguageGroundingMode": "pair_locked", + "translateMinimumCommitAudioMilliseconds": 300 + }, + { + "asrModelSlug": "gpt-4o-mini-transcribe", + "buildEnvironmentOverrides": {}, + 
"chatTurnOrderingProfile": "slot_reconciliation", + "featureFlags": { + "lilac_translate_streaming_v2": true + }, + "id": "asr-mini-fast", + "label": "ASR mini candidate", + "promptVersionIds": { + "chat": "current", + "translate": "balanced" + }, + "realtimeModelSlug": "gpt-realtime-1.5", + "translateCommitIntervalMilliseconds": 700, + "translateLanguageGroundingMode": "pair_locked", + "translateMinimumCommitAudioMilliseconds": 260 + } + ], + "id": "lilac-default", + "version": 1 +} diff --git a/scripts/evals/openAiClient.ts b/scripts/evals/openAiClient.ts new file mode 100644 index 0000000..a52b1d5 --- /dev/null +++ b/scripts/evals/openAiClient.ts @@ -0,0 +1,311 @@ +import { readFile } from 'node:fs/promises' +import { basename } from 'node:path' + +import { z } from 'zod' + +type OpenAiJsonResponse = Record + +type CreateChatCompletionJsonInput = { + maxCompletionTokens?: number + messageList: unknown[] + model: string + responseSchema: Schema +} + +type CreateResponsesJsonInput = { + inputList: unknown[] + maxOutputTokens?: number + model: string + responseSchema: Schema +} + +type AudioTranscriptionInput = { + audioFilePath: string + model: string + prompt?: string +} + +const openAiApiBaseUrl = 'https://api.openai.com/v1' +const openAiRequestTimeoutMilliseconds = 60_000 +const openAiMaximumAttempts = 3 + +function requireOpenAiApiKey(): string { + const openAiApiKey = process.env.OPENAI_API_KEY?.trim() + if (!openAiApiKey) { + throw new Error('OPENAI_API_KEY is required for eval runner.') + } + return openAiApiKey +} + +async function parseJsonResponse(response: Response): Promise { + const responseText = await response.text() + if (!responseText.trim()) return {} + try { + return JSON.parse(responseText) as unknown + } catch { + throw new Error(responseText) + } +} + +function isRetryableStatus(status: number): boolean { + switch (status) { + case 408: + case 409: + case 429: + case 500: + case 502: + case 503: + case 504: + return true + default: + return 
false + } +} + +function isRetryableFetchError(error: unknown): boolean { + if (!(error instanceof Error)) return false + return ( + error.name === 'AbortError' || + /aborted|fetch failed|gateway|timeout|timed out|temporarily unavailable/i.test(error.message) + ) +} + +async function waitForRetryDelay(attemptNumber: number): Promise { + const delayMilliseconds = 300 * 2 ** Math.max(0, attemptNumber - 1) + await new Promise(resolve => setTimeout(resolve, delayMilliseconds)) +} + +async function postOpenAiJson(path: string, body: unknown): Promise { + for (let attemptNumber = 1; attemptNumber <= openAiMaximumAttempts; attemptNumber += 1) { + try { + const response = await fetch(`${openAiApiBaseUrl}${path}`, { + body: JSON.stringify(body), + headers: { + Authorization: `Bearer ${requireOpenAiApiKey()}`, + 'Content-Type': 'application/json' + }, + method: 'POST', + signal: AbortSignal.timeout(openAiRequestTimeoutMilliseconds) + }) + const payload = await parseJsonResponse(response) + if (!response.ok) { + const errorMessage = + typeof payload === 'object' && + payload !== null && + 'error' in payload && + typeof payload.error === 'object' && + payload.error !== null && + 'message' in payload.error && + typeof payload.error.message === 'string' + ? 
payload.error.message + : `OpenAI request failed (${response.status})` + if (isRetryableStatus(response.status) && attemptNumber < openAiMaximumAttempts) { + await waitForRetryDelay(attemptNumber) + continue + } + throw new Error(errorMessage) + } + if (!payload || typeof payload !== 'object' || Array.isArray(payload)) { + throw new Error('Expected OpenAI JSON object response.') + } + return payload as OpenAiJsonResponse + } catch (error) { + if (attemptNumber >= openAiMaximumAttempts || !isRetryableFetchError(error)) { + throw error + } + await waitForRetryDelay(attemptNumber) + } + } + throw new Error('OpenAI JSON request exhausted retry attempts.') +} + +function extractChatCompletionText(payload: OpenAiJsonResponse): string { + const choiceList = Array.isArray(payload.choices) ? payload.choices : [] + const firstChoice = choiceList[0] + if (!firstChoice || typeof firstChoice !== 'object') { + throw new Error('Chat Completions response did not include choices.') + } + const message = + 'message' in firstChoice && firstChoice.message && typeof firstChoice.message === 'object' + ? (firstChoice.message as Record) + : null + if (!message) { + throw new Error('Chat Completions response did not include a message.') + } + if (typeof message.content === 'string') return message.content + if (Array.isArray(message.content)) { + const textPartList = message.content + .map(contentPart => { + if (!contentPart || typeof contentPart !== 'object') return null + const contentPartRecord = contentPart as Record + if (typeof contentPartRecord.text === 'string') return contentPartRecord.text + return null + }) + .filter((value): value is string => typeof value === 'string' && value.length > 0) + if (textPartList.length > 0) return textPartList.join('\n').trim() + } + throw new Error('Chat Completions response did not include text content.') +} + +function extractResponsesText(payload: OpenAiJsonResponse): string { + const directOutputText = typeof payload.output_text === 'string' ? 
payload.output_text.trim() : '' + if (directOutputText) return directOutputText + + const outputItemList = Array.isArray(payload.output) ? payload.output : [] + const textPartList: string[] = [] + for (const outputItem of outputItemList) { + if (!outputItem || typeof outputItem !== 'object') continue + const outputItemRecord = outputItem as Record + const contentPartList = Array.isArray(outputItemRecord.content) ? outputItemRecord.content : [] + for (const contentPart of contentPartList) { + if (!contentPart || typeof contentPart !== 'object') continue + const contentPartRecord = contentPart as Record + const textValue = + typeof contentPartRecord.text === 'string' + ? contentPartRecord.text + : typeof contentPartRecord.output_text === 'string' + ? contentPartRecord.output_text + : null + if (!textValue) continue + textPartList.push(textValue) + } + } + if (textPartList.length === 0) { + throw new Error('Responses API output did not include text content.') + } + return textPartList.join('\n').trim() +} + +function parseJsonText( + jsonText: string, + responseSchema: Schema, + contextLabel: string +): z.infer { + let parsedJson: unknown + try { + parsedJson = JSON.parse(jsonText) + } catch (error) { + const fencedJsonMatch = + jsonText.match(/```json\s*([\s\S]+?)```/i) ?? jsonText.match(/```\s*([\s\S]+?)```/i) + if (fencedJsonMatch?.[1]) { + try { + parsedJson = JSON.parse(fencedJsonMatch[1].trim()) + } catch { + throw new Error( + `${contextLabel} did not return valid JSON: ${error instanceof Error ? error.message : 'unknown'}` + ) + } + } else { + const firstBraceIndex = jsonText.indexOf('{') + const lastBraceIndex = jsonText.lastIndexOf('}') + if (firstBraceIndex >= 0 && lastBraceIndex > firstBraceIndex) { + try { + parsedJson = JSON.parse(jsonText.slice(firstBraceIndex, lastBraceIndex + 1)) + } catch { + throw new Error( + `${contextLabel} did not return valid JSON: ${error instanceof Error ? 
error.message : 'unknown'}` + ) + } + } else { + throw new Error( + `${contextLabel} did not return valid JSON: ${error instanceof Error ? error.message : 'unknown'}` + ) + } + } + } + return responseSchema.parse(parsedJson) +} + +export async function createChatCompletionJson( + input: CreateChatCompletionJsonInput +): Promise> { + const payload = await postOpenAiJson('/chat/completions', { + max_completion_tokens: input.maxCompletionTokens, + messages: input.messageList, + model: input.model + }) + return parseJsonText( + extractChatCompletionText(payload), + input.responseSchema, + `Chat completion (${input.model})` + ) +} + +export async function createResponsesJson( + input: CreateResponsesJsonInput +): Promise> { + const payload = await postOpenAiJson('/responses', { + input: input.inputList, + max_output_tokens: input.maxOutputTokens, + model: input.model, + text: { + format: { + type: 'json_object' + } + } + }) + return parseJsonText( + extractResponsesText(payload), + input.responseSchema, + `Responses request (${input.model})` + ) +} + +export async function transcribeAudioFile(input: AudioTranscriptionInput): Promise { + const fileBytes = await readFile(input.audioFilePath) + const formData = new FormData() + formData.append( + 'file', + new File([fileBytes], basename(input.audioFilePath), { type: 'audio/wav' }) + ) + formData.append('model', input.model) + if (input.prompt) formData.append('prompt', input.prompt) + + for (let attemptNumber = 1; attemptNumber <= openAiMaximumAttempts; attemptNumber += 1) { + try { + const response = await fetch(`${openAiApiBaseUrl}/audio/transcriptions`, { + body: formData, + headers: { + Authorization: `Bearer ${requireOpenAiApiKey()}` + }, + method: 'POST', + signal: AbortSignal.timeout(openAiRequestTimeoutMilliseconds) + }) + const payload = await parseJsonResponse(response) + if (!response.ok) { + const errorMessage = + typeof payload === 'object' && + payload !== null && + 'error' in payload && + typeof 
payload.error === 'object' && + payload.error !== null && + 'message' in payload.error && + typeof payload.error.message === 'string' + ? payload.error.message + : `Audio transcription failed (${response.status})` + if (isRetryableStatus(response.status) && attemptNumber < openAiMaximumAttempts) { + await waitForRetryDelay(attemptNumber) + continue + } + throw new Error(errorMessage) + } + const parsedPayload = z + .object({ + text: z.string().min(1) + }) + .parse(payload) + return parsedPayload.text.trim() + } catch (error) { + if (attemptNumber >= openAiMaximumAttempts || !isRetryableFetchError(error)) { + throw error + } + await waitForRetryDelay(attemptNumber) + } + } + throw new Error('Audio transcription request exhausted retry attempts.') +} + +export async function readAudioFileAsBase64(audioFilePath: string): Promise { + const fileBuffer = await readFile(audioFilePath) + return Buffer.from(fileBuffer).toString('base64') +} diff --git a/scripts/evals/runEvalFlywheel.ts b/scripts/evals/runEvalFlywheel.ts new file mode 100644 index 0000000..452b559 --- /dev/null +++ b/scripts/evals/runEvalFlywheel.ts @@ -0,0 +1,691 @@ +import { spawnSync } from 'node:child_process' +import { mkdir, readFile, writeFile } from 'node:fs/promises' +import { dirname, join, resolve } from 'node:path' +import { z } from 'zod' +import type { + EvalCandidateConfig, + EvalRunSummary, + FlywheelDecision, + FlywheelPatchProposal +} from '@/evals/contracts' +import { FlywheelDecisionSchema } from '@/evals/contracts' +import { synthesizeRootCausePatchProposal } from './judges' +import { + type EvalRunExecution, + executeEvalSuite, + loadEvalMatrix, + selectCandidateConfig +} from './runEvalSuite' + +type ParsedFlywheelArguments = { + baselineUrl: string + candidateId: string + candidateUrl?: string + captureVercelLogs: boolean + evaluationId?: string + matrixPath: string + scenarioIdList?: string[] +} + +type CandidateComparison = { + candidateBeatsBaseline: boolean + 
hasLowConfidenceBlockingJudge: boolean + protectedRegressionList: string[] + regressionDeltas: Record + reasonList: string[] +} + +type FlywheelState = z.infer + +type VercelCommandResult = { + outputText: string + url?: string + urlList: string[] +} + +const highSignalCanaryScenarioIdList = [ + 'chat_audio_practice_ukrainian', + 'translate_audio_en_to_es', + 'translate_audio_en_long_to_es', + 'translate_audio_en_noisy_fillers_to_es' +] as const +const flywheelStatePath = resolve(process.cwd(), '.artifacts/evals/flywheel-state.json') +const flywheelPatchPromptPath = resolve(process.cwd(), '.artifacts/evals/latest-patch-proposal.md') +const flywheelPatchProposalPath = resolve( + process.cwd(), + '.artifacts/evals/latest-patch-proposal.json' +) + +const FlywheelStateSchema = z.object({ + dateKey: z.string().min(1), + estimatedCostUsd: z.number().min(0), + failureAttemptCountByClusterKey: z.record(z.string(), z.number().int().nonnegative()), + lastGoodProductionDeploymentUrl: z.string().url().optional(), + runCount: z.number().int().nonnegative() +}) + +function parseArguments(argumentList: string[]): ParsedFlywheelArguments { + const parsedArguments: ParsedFlywheelArguments = { + baselineUrl: 'https://lilac.chat', + candidateId: 'workspace-preview', + captureVercelLogs: true, + matrixPath: resolve(process.cwd(), 'scripts/evals/manifests/default.matrix.json') + } + + for (let argumentIndex = 0; argumentIndex < argumentList.length; argumentIndex += 1) { + const argument = argumentList[argumentIndex] + switch (argument) { + case '--baseline-url': { + parsedArguments.baselineUrl = argumentList[argumentIndex + 1] ?? parsedArguments.baselineUrl + argumentIndex += 1 + break + } + case '--candidate-id': { + parsedArguments.candidateId = argumentList[argumentIndex + 1] ?? 
parsedArguments.candidateId + argumentIndex += 1 + break + } + case '--candidate-url': { + const candidateUrl = argumentList[argumentIndex + 1] + if (candidateUrl) { + parsedArguments.candidateUrl = candidateUrl + } + argumentIndex += 1 + break + } + case '--evaluation-id': { + const evaluationId = argumentList[argumentIndex + 1] + if (evaluationId) { + parsedArguments.evaluationId = evaluationId + } + argumentIndex += 1 + break + } + case '--matrix-path': { + parsedArguments.matrixPath = resolve( + argumentList[argumentIndex + 1] ?? parsedArguments.matrixPath + ) + argumentIndex += 1 + break + } + case '--scenario-ids': { + parsedArguments.scenarioIdList = (argumentList[argumentIndex + 1] ?? '') + .split(',') + .map(value => value.trim()) + .filter(Boolean) + argumentIndex += 1 + break + } + case '--skip-vercel-logs': { + parsedArguments.captureVercelLogs = false + break + } + default: + break + } + } + + return parsedArguments +} + +function createTimestampLabel(): string { + return new Date().toISOString().replaceAll(':', '-').replaceAll('.', '-') +} + +function createEvaluationId(prefix: string): string { + return `${prefix}-${createTimestampLabel()}` +} + +function resolveBooleanEnvironmentFlag(name: string, fallbackValue: boolean): boolean { + const rawValue = process.env[name]?.trim().toLowerCase() + if (rawValue === 'true') return true + if (rawValue === 'false') return false + return fallbackValue +} + +function resolvePositiveIntegerEnvironmentValue(name: string, fallbackValue: number): number { + const rawValue = process.env[name]?.trim() + if (!rawValue) return fallbackValue + const parsedValue = Number.parseInt(rawValue, 10) + return Number.isFinite(parsedValue) && parsedValue > 0 ? 
parsedValue : fallbackValue +} + +function resolvePositiveNumberEnvironmentValue(name: string, fallbackValue: number): number { + const rawValue = process.env[name]?.trim() + if (!rawValue) return fallbackValue + const parsedValue = Number.parseFloat(rawValue) + return Number.isFinite(parsedValue) && parsedValue > 0 ? parsedValue : fallbackValue +} + +function toEnvironmentVariableName(flagId: string): string { + return `NEXT_PUBLIC_${flagId.replaceAll(/[^a-zA-Z0-9]+/g, '_').toUpperCase()}` +} + +function buildCandidateEnvironmentOverrides( + candidateConfig: EvalCandidateConfig +): Record { + const environmentOverrides: Record = { + ...candidateConfig.buildEnvironmentOverrides, + NEXT_PUBLIC_LILAC_CHAT_TURN_ORDERING_PROFILE: candidateConfig.chatTurnOrderingProfile, + NEXT_PUBLIC_LILAC_REALTIME_MODEL: candidateConfig.realtimeModelSlug, + NEXT_PUBLIC_LILAC_TRANSCRIPTION_MODEL: candidateConfig.asrModelSlug, + NEXT_PUBLIC_LILAC_TRANSLATE_LANGUAGE_GROUNDING_MODE: + candidateConfig.translateLanguageGroundingMode, + NEXT_PUBLIC_LILAC_TRANSLATE_PROMPT_VERSION: + candidateConfig.promptVersionIds.translate ?? 
'current' + } + if (candidateConfig.asrModelSlug === 'gpt-4o-mini-transcribe') { + environmentOverrides.NEXT_PUBLIC_LILAC_TRANSCRIBE_ASR_PROFILE = 'fast' + } else { + environmentOverrides.NEXT_PUBLIC_LILAC_TRANSCRIBE_ASR_PROFILE = 'accurate' + } + if (typeof candidateConfig.draftDebounceMilliseconds === 'number') { + environmentOverrides.NEXT_PUBLIC_LILAC_TRANSLATE_DRAFT_DEBOUNCE_MS = String( + candidateConfig.draftDebounceMilliseconds + ) + } + if (typeof candidateConfig.translateCommitIntervalMilliseconds === 'number') { + environmentOverrides.NEXT_PUBLIC_LILAC_TRANSLATE_COMMIT_INTERVAL_MS = String( + candidateConfig.translateCommitIntervalMilliseconds + ) + } + if (typeof candidateConfig.translateMinimumCommitAudioMilliseconds === 'number') { + environmentOverrides.NEXT_PUBLIC_LILAC_TRANSLATE_MIN_COMMIT_AUDIO_MS = String( + candidateConfig.translateMinimumCommitAudioMilliseconds + ) + } + for (const [featureFlagId, featureFlagValue] of Object.entries(candidateConfig.featureFlags)) { + environmentOverrides[toEnvironmentVariableName(featureFlagId)] = String(featureFlagValue) + } + return environmentOverrides +} + +function extractUrlList(outputText: string): string[] { + return Array.from(outputText.matchAll(/https:\/\/[\w.-]+\.vercel\.app/g)).map(match => match[0]) +} + +function executeVercelCommand(argumentList: string[]): VercelCommandResult { + const result = spawnSync('vercel', argumentList, { + cwd: process.cwd(), + encoding: 'utf8', + env: process.env + }) + const outputText = [result.stdout, result.stderr].filter(Boolean).join('\n') + if (result.status !== 0) { + throw new Error(outputText || `Vercel command failed: vercel ${argumentList.join(' ')}`) + } + const urlList = extractUrlList(outputText) + return { + outputText, + ...(urlList.length > 0 ? 
{ url: urlList[urlList.length - 1] } : {}), + urlList + } +} + +function buildVercelEnvironmentArgumentList( + environmentOverrides: Record +): string[] { + const argumentList: string[] = [] + for (const [key, value] of Object.entries(environmentOverrides)) { + argumentList.push('--build-env', `${key}=${value}`, '--env', `${key}=${value}`) + } + return argumentList +} + +function slugifyAliasSegment(value: string): string { + return value + .trim() + .toLowerCase() + .replace(/[^a-z0-9-]+/g, '-') + .replace(/^-+|-+$/g, '') + .slice(0, 48) +} + +function resolvePublicPreviewAliasHost(candidateConfig: EvalCandidateConfig): string | null { + const explicitUrl = process.env.LILAC_EVAL_PREVIEW_URL?.trim() + if (explicitUrl) { + try { + return new URL(explicitUrl).host + } catch { + return explicitUrl.replace(/^https?:\/\//, '') + } + } + const aliasDomain = process.env.LILAC_EVAL_PREVIEW_ALIAS_DOMAIN?.trim() + if (!aliasDomain) return null + const aliasPrefix = slugifyAliasSegment(candidateConfig.id || candidateConfig.label) + if (!aliasPrefix) return null + return `${aliasPrefix}.${aliasDomain}` +} + +function deployPreview(candidateConfig: EvalCandidateConfig): string { + const environmentOverrides = buildCandidateEnvironmentOverrides(candidateConfig) + const deploymentResult = executeVercelCommand([ + 'deploy', + '--yes', + '--force', + '--public', + '--target', + 'preview', + '--meta', + `lilac_eval_candidate_id=${candidateConfig.id}`, + '--meta', + `lilac_eval_candidate_label=${candidateConfig.label}`, + ...buildVercelEnvironmentArgumentList(environmentOverrides) + ]) + if (!deploymentResult.url) { + throw new Error('Preview deployment completed without a deployment URL.') + } + const publicPreviewAliasHost = resolvePublicPreviewAliasHost(candidateConfig) + if (!publicPreviewAliasHost) return deploymentResult.url + executeVercelCommand(['alias', 'set', deploymentResult.url, publicPreviewAliasHost]) + return `https://${publicPreviewAliasHost}` +} + +function 
promoteDeployment(deploymentUrl: string): void { + executeVercelCommand(['promote', deploymentUrl, '--yes']) +} + +function rollbackDeployment(deploymentUrl: string): void { + executeVercelCommand(['rollback', deploymentUrl, '--yes']) +} + +function getCurrentProductionDeploymentUrl(): string | undefined { + const deploymentResult = executeVercelCommand(['list', '--environment', 'production', 'lilac']) + return deploymentResult.url +} + +async function loadFlywheelState(): Promise { + try { + const stateContent = await readFile(flywheelStatePath, 'utf8') + const parsedState = FlywheelStateSchema.parse(JSON.parse(stateContent)) + const currentDateKey = new Date().toISOString().slice(0, 10) + if (parsedState.dateKey === currentDateKey) return parsedState + } catch {} + return FlywheelStateSchema.parse({ + dateKey: new Date().toISOString().slice(0, 10), + estimatedCostUsd: 0, + failureAttemptCountByClusterKey: {}, + runCount: 0 + }) +} + +async function writeFlywheelState(state: FlywheelState): Promise { + await mkdir(dirname(flywheelStatePath), { recursive: true }) + await writeFile(flywheelStatePath, JSON.stringify(state, null, 2), 'utf8') +} + +function estimateEvalRunCostUsd(execution: EvalRunExecution): number { + const scenarioCount = execution.runSummary.scenarioRunSummaryList.length + const audioScenarioCount = execution.runSummary.scenarioRunSummaryList.filter(summary => { + return summary.scenario.inputType !== 'text' + }).length + const screenshotWeight = execution.runSummary.scenarioRunSummaryList.length * 0.03 + return ( + Math.round((scenarioCount * 0.18 + audioScenarioCount * 0.12 + screenshotWeight) * 100) / 100 + ) +} + +function compareRuns( + baselineSummary: EvalRunSummary, + candidateSummary: EvalRunSummary +): CandidateComparison { + const regressionDeltas: Record = { + aggregateScore: + Math.round( + (candidateSummary.scorecard.aggregateScore - baselineSummary.scorecard.aggregateScore) * 1000 + ) / 1000 + } + const protectedRegressionList: 
string[] = [] + const reasonList: string[] = [] + + const baselineScenarioSummaryById = new Map( + baselineSummary.scenarioRunSummaryList.map(scenarioSummary => [ + scenarioSummary.scenario.id, + scenarioSummary + ]) + ) + for (const candidateScenarioSummary of candidateSummary.scenarioRunSummaryList) { + const baselineScenarioSummary = baselineScenarioSummaryById.get( + candidateScenarioSummary.scenario.id + ) + if (!baselineScenarioSummary) continue + const scoreDelta = + candidateScenarioSummary.scorecard.aggregateScore - + baselineScenarioSummary.scorecard.aggregateScore + regressionDeltas[candidateScenarioSummary.scenario.id] = Math.round(scoreDelta * 1000) / 1000 + if (scoreDelta < -0.05) { + protectedRegressionList.push( + `Scenario ${candidateScenarioSummary.scenario.id} regressed by ${Math.abs(scoreDelta).toFixed(3)}.` + ) + } + if ( + candidateScenarioSummary.scorecard.releaseDecision === 'block' && + baselineScenarioSummary.scorecard.releaseDecision !== 'block' + ) { + protectedRegressionList.push( + `Scenario ${candidateScenarioSummary.scenario.id} became blocking in the candidate run.` + ) + } + } + + const baselineHighSeverityCount = baselineSummary.scorecard.hardGateResults.filter( + finding => finding.severity === 'high' + ).length + const candidateHighSeverityCount = candidateSummary.scorecard.hardGateResults.filter( + finding => finding.severity === 'high' + ).length + regressionDeltas.hardGateHighSeverity = candidateHighSeverityCount - baselineHighSeverityCount + if (candidateHighSeverityCount > baselineHighSeverityCount) { + protectedRegressionList.push('Candidate introduced additional high-severity hard-gate findings.') + } + + const hasLowConfidenceBlockingJudge = + candidateSummary.scorecard.releaseDecision === 'block' && + (candidateSummary.scorecard.judgeResults.some(judgeResult => { + return !judgeResult.passed && judgeResult.confidence < 0.6 + }) || + false) + if (hasLowConfidenceBlockingJudge) { + protectedRegressionList.push('Blocking 
candidate failures include low-confidence judge outputs.') + } + + reasonList.push( + `Baseline aggregate score=${baselineSummary.scorecard.aggregateScore.toFixed(3)}.`, + `Candidate aggregate score=${candidateSummary.scorecard.aggregateScore.toFixed(3)}.` + ) + + const candidateBeatsBaseline = + candidateSummary.scorecard.releaseDecision !== 'block' && + candidateSummary.scorecard.aggregateScore > baselineSummary.scorecard.aggregateScore && + protectedRegressionList.length === 0 + if (!candidateBeatsBaseline) { + reasonList.push('Candidate did not strictly beat the baseline under protected metrics.') + } + + return { + candidateBeatsBaseline, + hasLowConfidenceBlockingJudge, + protectedRegressionList, + reasonList, + regressionDeltas + } +} + +function buildFailureClusterKey(summary: EvalRunSummary): string { + const clusterIdList = summary.failureClusterList.map(cluster => cluster.clusterId).sort() + return clusterIdList.join('|') || 'no-failure-clusters' +} + +function createFlywheelDecision(input: FlywheelDecision): FlywheelDecision { + return FlywheelDecisionSchema.parse(input) +} + +async function maybeRunPatchCommand(input: { + patchProposal: FlywheelPatchProposal + state: FlywheelState + summary: EvalRunSummary +}): Promise<{ commandRan: boolean; commandSucceeded: boolean }> { + const patchCommand = process.env.LILAC_FLYWHEEL_PATCH_COMMAND?.trim() + if (!patchCommand) return { commandRan: false, commandSucceeded: false } + + await mkdir(dirname(flywheelPatchPromptPath), { recursive: true }) + await writeFile(flywheelPatchProposalPath, JSON.stringify(input.patchProposal, null, 2), 'utf8') + await writeFile( + flywheelPatchPromptPath, + [ + `Candidate: ${input.summary.candidateConfig.id}`, + `Evaluation: ${input.summary.evaluationId}`, + `Failure clusters: ${input.patchProposal.failureClusterIds.join(', ')}`, + '', + input.patchProposal.summary, + '', + input.patchProposal.prompt + ].join('\n'), + 'utf8' + ) + + const commandResult = spawnSync('zsh', ['-lc', 
patchCommand], {
  cwd: process.cwd(),
  encoding: 'utf8',
  env: {
    ...process.env,
    // Pass the evaluation context to the patch command via environment variables.
    LILAC_FLYWHEEL_EVALUATION_ID: input.summary.evaluationId,
    LILAC_FLYWHEEL_PATCH_PROMPT_PATH: flywheelPatchPromptPath,
    LILAC_FLYWHEEL_PATCH_PROPOSAL_PATH: flywheelPatchProposalPath
  },
  stdio: 'inherit'
})
return {
  commandRan: true,
  // spawnSync reports the child's exit code in `status`; 0 means success.
  commandSucceeded: commandResult.status === 0
}
}

/**
 * Runs the eval suite against an already-deployed preview URL for the given
 * candidate, tagging the run with an evaluation ID derived from the prefix.
 * NOTE(review): generic type arguments (e.g. `Promise<...>`) appear to have
 * been stripped from this text by extraction — restore before compiling.
 */
async function evaluateCandidateAgainstPreview(input: {
  candidateConfig: EvalCandidateConfig
  captureVercelLogs: boolean
  deploymentUrl: string
  evaluationIdPrefix: string
  scenarioIdList?: string[]
}): Promise {
  return executeEvalSuite({
    candidateConfig: input.candidateConfig,
    captureVercelLogs: input.captureVercelLogs,
    evaluationId: createEvaluationId(input.evaluationIdPrefix),
    targetUrl: input.deploymentUrl,
    // Only forward the scenario filter when one was actually provided.
    ...(input.scenarioIdList ? { scenarioIdList: input.scenarioIdList } : {})
  })
}

/**
 * Flywheel orchestrator. High-level flow:
 *   1. Parse CLI args, load the eval matrix, pick baseline + candidate.
 *   2. Enforce the daily run cap, then run the baseline suite and check budget.
 *   3. Resolve/deploy a preview URL and run the candidate suite against it.
 *   4. If the candidate loses: synthesize a patch proposal and (attempt-capped)
 *      run the patch command. If it wins: run a second verification pass, then
 *      optionally promote to production with a canary run and rollback-on-regress.
 *   5. Persist a summary + state, and exit non-zero on block/rollback decisions.
 */
async function main(): Promise {
  const parsedArguments = parseArguments(process.argv.slice(2))
  const evalMatrix = await loadEvalMatrix(parsedArguments.matrixPath)
  const baselineConfig = evalMatrix.baseline
  // The baseline config itself is a legal candidate choice (self-comparison).
  const candidateConfig = selectCandidateConfig(
    [evalMatrix.baseline, ...evalMatrix.candidateList],
    parsedArguments.candidateId
  )
  // Safety rails, all overridable via environment variables.
  const flywheelEnabled = resolveBooleanEnvironmentFlag('LILAC_FLYWHEEL_ENABLED', true)
  const autoprodEnabled = resolveBooleanEnvironmentFlag('LILAC_FLYWHEEL_AUTOPROD_ENABLED', false)
  const maxRunsPerDay = resolvePositiveIntegerEnvironmentValue('LILAC_FLYWHEEL_MAX_RUNS_PER_DAY', 8)
  const maxPatchAttemptsPerFailure = resolvePositiveIntegerEnvironmentValue(
    'LILAC_FLYWHEEL_MAX_PATCH_ATTEMPTS_PER_FAILURE',
    1
  )
  // NOTE(review): default budget here is 25 USD while the CI workflow sets 30 —
  // confirm which value is intended to be authoritative.
  const budgetLimitUsd = resolvePositiveNumberEnvironmentValue('LILAC_FLYWHEEL_BUDGET_USD_LIMIT', 25)
  const flywheelState = await loadFlywheelState()
  if (flywheelState.runCount >= maxRunsPerDay) {
    throw new Error(`Flywheel max runs per day exceeded (${maxRunsPerDay}).`)
  }

  const flywheelArtifactDirectoryPath = resolve(
    process.cwd(),
    '.artifacts/evals/flywheel',
    parsedArguments.evaluationId ?? createEvaluationId('flywheel')
  )
  await mkdir(flywheelArtifactDirectoryPath, { recursive: true })

  // Baseline pass: establishes the comparison reference for every later run.
  const baselineExecution = await executeEvalSuite({
    candidateConfig: baselineConfig,
    captureVercelLogs: parsedArguments.captureVercelLogs,
    evaluationId: createEvaluationId('baseline'),
    targetUrl: parsedArguments.baselineUrl,
    ...(parsedArguments.scenarioIdList ? { scenarioIdList: parsedArguments.scenarioIdList } : {})
  })
  const baselineCostEstimate = estimateEvalRunCostUsd(baselineExecution)
  flywheelState.runCount += 1
  flywheelState.estimatedCostUsd += baselineCostEstimate

  // Budget gate. NOTE(review): this check only runs after the baseline pass;
  // later candidate/verification/canary costs are accumulated but never
  // re-checked against the limit within this run — confirm that is intended.
  if (flywheelState.estimatedCostUsd > budgetLimitUsd) {
    await writeFlywheelState(flywheelState)
    throw new Error(
      `Flywheel estimated budget exceeded (${flywheelState.estimatedCostUsd.toFixed(2)} > ${budgetLimitUsd}).`
    )
  }

  // Preview URL resolution order: explicit --candidate-url, then baseline URL
  // for a self-comparison, then a fresh preview deploy (only when enabled).
  // NOTE(review): if deployPreview is async this assigns a Promise, not a
  // string — confirm deployPreview is synchronous.
  const previewDeploymentUrl = parsedArguments.candidateUrl
    ? parsedArguments.candidateUrl
    : candidateConfig.id === baselineConfig.id
      ? parsedArguments.baselineUrl
      : flywheelEnabled
        ? deployPreview(candidateConfig)
        : undefined
  if (!previewDeploymentUrl) {
    await writeFlywheelState(flywheelState)
    throw new Error('Candidate URL is required when flywheel deployment is disabled.')
  }

  // First candidate pass against the preview deployment.
  const candidateExecution = await evaluateCandidateAgainstPreview({
    candidateConfig,
    captureVercelLogs: parsedArguments.captureVercelLogs,
    deploymentUrl: previewDeploymentUrl,
    evaluationIdPrefix: 'candidate-01',
    ...(parsedArguments.scenarioIdList ? { scenarioIdList: parsedArguments.scenarioIdList } : {})
  })
  flywheelState.runCount += 1
  flywheelState.estimatedCostUsd += estimateEvalRunCostUsd(candidateExecution)
  const candidateComparison = compareRuns(
    baselineExecution.runSummary,
    candidateExecution.runSummary
  )
  // Patch attempts are rate-limited per distinct failure cluster signature.
  const failureClusterKey = buildFailureClusterKey(candidateExecution.runSummary)
  const failureAttemptCount = flywheelState.failureAttemptCountByClusterKey[failureClusterKey] ?? 0

  const decisionReasonList = [...candidateComparison.reasonList]
  // Default decision if no branch below overrides it.
  let finalDecision = createFlywheelDecision({
    action: 'report_only',
    candidateId: candidateConfig.id,
    reasons: ['Flywheel evaluation completed.']
  })
  let verificationExecution: EvalRunExecution | null = null
  let canaryExecution: EvalRunExecution | null = null
  let patchProposal: FlywheelPatchProposal | null = null
  let previousProductionDeploymentUrl: string | undefined
  let patchCommandResult = { commandRan: false, commandSucceeded: false }

  if (!candidateComparison.candidateBeatsBaseline) {
    // Candidate lost: record regressions, synthesize a root-cause patch
    // proposal, and (attempt-capped) invoke the patch command.
    decisionReasonList.push(...candidateComparison.protectedRegressionList)
    patchProposal = await synthesizeRootCausePatchProposal({
      candidateId: candidateConfig.id,
      failureClusterIdList: candidateExecution.runSummary.failureClusterList.map(
        cluster => cluster.clusterId
      ),
      observationList: candidateExecution.observationList
    })
    finalDecision = createFlywheelDecision({
      action: 'propose_patch',
      candidateId: candidateConfig.id,
      patchProposal,
      reasons:
        decisionReasonList.length > 0 ? decisionReasonList : ['Candidate did not beat baseline.']
    })
    if (failureAttemptCount < maxPatchAttemptsPerFailure) {
      patchCommandResult = await maybeRunPatchCommand({
        patchProposal,
        state: flywheelState,
        summary: candidateExecution.runSummary
      })
      flywheelState.failureAttemptCountByClusterKey[failureClusterKey] = failureAttemptCount + 1
    }
  } else {
    // Candidate won once: require a second verification pass before any
    // promotion decision.
    verificationExecution = await evaluateCandidateAgainstPreview({
      candidateConfig,
      captureVercelLogs: parsedArguments.captureVercelLogs,
      deploymentUrl: previewDeploymentUrl,
      evaluationIdPrefix: 'candidate-02',
      ...(parsedArguments.scenarioIdList ? { scenarioIdList: parsedArguments.scenarioIdList } : {})
    })
    flywheelState.runCount += 1
    flywheelState.estimatedCostUsd += estimateEvalRunCostUsd(verificationExecution)
    const verificationComparison = compareRuns(
      baselineExecution.runSummary,
      verificationExecution.runSummary
    )
    if (!verificationComparison.candidateBeatsBaseline) {
      // Flaky win: block rather than promote.
      decisionReasonList.push(
        'Candidate passed once but failed the required second verification run.',
        ...verificationComparison.protectedRegressionList
      )
      finalDecision = createFlywheelDecision({
        action: 'block',
        candidateId: candidateConfig.id,
        reasons: decisionReasonList
      })
    } else if (!autoprodEnabled || !flywheelEnabled) {
      // Double pass but autoprod disabled: stop at preview status.
      decisionReasonList.push(
        'Preview passed twice. Autoprod is disabled, so the run stops at candidate status.'
      )
      finalDecision = createFlywheelDecision({
        action: 'deploy_preview',
        candidateId: candidateConfig.id,
        reasons: decisionReasonList
      })
    } else {
      // Autoprod path: promote, then canary the (high-signal) scenario subset.
      previousProductionDeploymentUrl = getCurrentProductionDeploymentUrl()
      promoteDeployment(previewDeploymentUrl)
      canaryExecution = await executeEvalSuite({
        candidateConfig,
        captureVercelLogs: parsedArguments.captureVercelLogs,
        evaluationId: createEvaluationId('prod-canary'),
        scenarioIdList: [...highSignalCanaryScenarioIdList],
        // Canary is run against the baseline URL — presumably the production
        // domain, which now serves the promoted preview; confirm.
        targetUrl: parsedArguments.baselineUrl
      })
      flywheelState.runCount += 1
      flywheelState.estimatedCostUsd += estimateEvalRunCostUsd(canaryExecution)
      const canaryComparison = compareRuns(baselineExecution.runSummary, canaryExecution.runSummary)
      if (!canaryComparison.candidateBeatsBaseline) {
        // NOTE(review): rollback is invoked with the just-promoted *preview*
        // URL rather than previousProductionDeploymentUrl — verify
        // rollbackDeployment's contract (it may take the bad deployment to
        // roll back *from*); otherwise this looks like a wrong-argument bug.
        rollbackDeployment(previewDeploymentUrl)
        decisionReasonList.push(
          'Production canary regressed after promotion. Rollback was triggered.',
          ...canaryComparison.protectedRegressionList
        )
        finalDecision = createFlywheelDecision({
          action: 'rollback',
          candidateId: candidateConfig.id,
          reasons: decisionReasonList
        })
      } else {
        flywheelState.lastGoodProductionDeploymentUrl = previewDeploymentUrl
        decisionReasonList.push('Preview passed twice and production canary passed.')
        finalDecision = createFlywheelDecision({
          action: 'promote_to_production',
          candidateId: candidateConfig.id,
          reasons: decisionReasonList
        })
      }
    }
  }

  // Machine-readable run summary; optional sections are spread in only when
  // the corresponding phase actually ran.
  const summary = {
    autoprodEnabled,
    baseline: baselineExecution.runSummary,
    budgetLimitUsd,
    candidate: candidateExecution.runSummary,
    candidateComparison,
    candidateDeploymentUrl: previewDeploymentUrl,
    decision: finalDecision,
    flywheelEnabled,
    patchCommandResult,
    ...(patchProposal ? { patchProposal } : {}),
    ...(previousProductionDeploymentUrl ? { previousProductionDeploymentUrl } : {}),
    state: flywheelState,
    ...(verificationExecution ? { verification: verificationExecution.runSummary } : {}),
    ...(canaryExecution ? { productionCanary: canaryExecution.runSummary } : {})
  }
  await writeFile(
    join(flywheelArtifactDirectoryPath, 'flywheel-summary.json'),
    JSON.stringify(summary, null, 2),
    'utf8'
  )
  await writeFlywheelState(flywheelState)
  console.log(JSON.stringify(summary, null, 2))

  // Surface block/rollback as a process failure so CI marks the run red.
  if (finalDecision.action === 'block' || finalDecision.action === 'rollback') {
    throw new Error(finalDecision.reasons.join(' '))
  }
}

// Entrypoint: log a terse message and exit non-zero on any failure.
void main().catch(error => {
  console.error(error instanceof Error ? error.message : 'Eval flywheel failed.')
  process.exit(1)
})
diff --git a/scripts/evals/runEvalSuite.ts b/scripts/evals/runEvalSuite.ts
new file mode 100644
index 0000000..d4de23b
--- /dev/null
+++ b/scripts/evals/runEvalSuite.ts
@@ -0,0 +1,1087 @@
import { spawnSync } from 'node:child_process'
import { access, mkdir, readFile, writeFile } from 'node:fs/promises'
import { dirname, join, resolve } from 'node:path'
import {
  type Browser,
  type BrowserContext,
  type ConsoleMessage,
  chromium,
  type Page
} from 'playwright'
import { z } from 'zod'

import {
  type EvalCandidateConfig,
  EvalMatrixSchema,
  type EvalRunSummary,
  EvalRunSummarySchema,
  EvalScenarioRunSummarySchema,
  type EvalScenarioSpec,
  LilacTestBusEventSchema
} from '@/evals/contracts'
import {
  collectRuntimeErrorMatches,
  type RuntimeLogCapture,
  startVercelRuntimeLogCapture
} from '../smoke/captureVercelRuntimeLogs'
import { resolveEvalScenarioList } from './defaultScenarios'
import {
  buildScenarioScorecard,
  runAudioInputReferenceJudge,
  runAudioOutputListenerJudge,
  runHardGuardrailJudge,
  runSemanticConversationJudge,
  runVisualUiJudge
} from './judges'
import type { ScenarioObservation } from './runtimeTypes'

// Options accepted by executeEvalSuite.
type EvalRunOptions = {
  candidateConfig: EvalCandidateConfig
  captureVercelLogs: boolean
  evaluationId?: string
  scenarioIdList?: string[]
  targetUrl: string
}

// Parsed CLI arguments for this script (continues below).
type ParsedArgumentList = {
  candidateId: string
  captureVercelLogs: boolean
  evaluationId?: string
  matrixPath: string
  scenarioIdList?: string[]
  targetUrl: string
}

// Handle returned by startScenarioArtifactCollectors: accumulated console
// entries plus a stop() that detaches listeners and timers.
// NOTE(review): generic type arguments (e.g. `Promise<void>`) appear stripped
// by extraction throughout this file — restore before compiling.
type ScenarioArtifactCollectors = {
  consoleEntryList: string[]
  stop: () => Promise
}

// Result of one full suite execution: per-scenario observations + summary.
export type EvalRunExecution = {
  observationList: ScenarioObservation[]
  runSummary: EvalRunSummary
}

// Event union inferred from the zod schema, plus narrowed per-event aliases.
type ParsedLilacTestBusEvent = z.infer
type AssistantAudioStateEvent = Extract<
  ParsedLilacTestBusEvent,
  { eventType: 'assistant_audio_state_changed' }
>
type ChatTranscriptEvent = Extract
type SubtitleStateEvent = Extract
type TranslateCardEvent = Extract
type VisibleErrorEvent = Extract

// Audio fixtures shared with the smoke suite; regenerated on demand by
// ensureAudioFixturesReady.
const defaultSilentAudioFixturePath = resolve(
  process.cwd(),
  '.artifacts/smoke/fixtures/silence_5s.wav'
)
const defaultFixtureManifestPath = resolve(
  process.cwd(),
  '.artifacts/smoke/fixtures/fixtures.manifest.json'
)
// Desktop default viewport; mobile list drives runMobileLayoutChecks.
const defaultViewport = {
  height: 900,
  width: 1365
}
const mobileViewportList = [
  { height: 812, width: 375 },
  { height: 844, width: 390 }
] as const

/**
 * Parses the CLI flags for this script. Unknown flags are ignored; every flag
 * that takes a value falls back to its default when the value is missing.
 */
function parseArguments(argumentList: string[]): ParsedArgumentList {
  const parsedArguments: ParsedArgumentList = {
    candidateId: 'workspace-preview',
    captureVercelLogs: true,
    matrixPath: resolve(process.cwd(), 'scripts/evals/manifests/default.matrix.json'),
    targetUrl: 'https://lilac.chat'
  }

  for (let argumentIndex = 0; argumentIndex < argumentList.length; argumentIndex += 1) {
    const argument = argumentList[argumentIndex]
    switch (argument) {
      case '--candidate-id': {
        parsedArguments.candidateId = argumentList[argumentIndex + 1] ?? parsedArguments.candidateId
        // Consume the flag's value token as well.
        argumentIndex += 1
        break
      }
      case '--evaluation-id': {
        const evaluationId = argumentList[argumentIndex + 1]
        if (evaluationId) {
          parsedArguments.evaluationId = evaluationId
        }
        argumentIndex += 1
        break
      }
      case '--matrix-path': {
        parsedArguments.matrixPath = resolve(
          argumentList[argumentIndex + 1] ?? parsedArguments.matrixPath
        )
        argumentIndex += 1
        break
      }
      case '--scenario-ids': {
        // Comma-separated list; blanks are trimmed and dropped.
        const rawScenarioIdList = argumentList[argumentIndex + 1] ?? ''
        parsedArguments.scenarioIdList = rawScenarioIdList
          .split(',')
          .map(value => value.trim())
          .filter(Boolean)
        argumentIndex += 1
        break
      }
      case '--skip-vercel-logs': {
        parsedArguments.captureVercelLogs = false
        break
      }
      case '--target-url': {
        parsedArguments.targetUrl = argumentList[argumentIndex + 1] ?? parsedArguments.targetUrl
        argumentIndex += 1
        break
      }
      default:
        break
    }
  }

  return parsedArguments
}

// Filesystem-safe ISO timestamp (':' and '.' replaced with '-').
function createTimestampLabel(): string {
  return new Date().toISOString().replaceAll(':', '-').replaceAll('.', '-')
}

// NOTE(review): unlike the flywheel script's createEvaluationId(prefix),
// this one takes no prefix — IDs are always `eval-<timestamp>`.
function createEvaluationId(): string {
  return `eval-${createTimestampLabel()}`
}

// Current HEAD commit SHA, or 'unknown' when git is unavailable/fails.
function getGitCommitSha(): string {
  const result = spawnSync('git', ['rev-parse', 'HEAD'], {
    cwd: process.cwd(),
    encoding: 'utf8'
  })
  return result.status === 0 ? result.stdout.trim() : 'unknown'
}

// fs.access-based existence probe; any access error is treated as "missing".
async function pathExists(path: string): Promise {
  try {
    await access(path)
    return true
  } catch {
    return false
  }
}

/**
 * Verifies that every audio fixture needed by the scenario list exists,
 * regenerating them via `bun run smoke:prepare-audio` when any are missing,
 * and throws if fixtures are still absent after regeneration.
 */
async function ensureAudioFixturesReady(scenarioList: EvalScenarioSpec[]): Promise {
  const requiredPathList = [
    defaultSilentAudioFixturePath,
    defaultFixtureManifestPath,
    ...scenarioList
      .map(scenario => scenario.audioFixturePath)
      .filter((value): value is string => typeof value === 'string' && value.length > 0)
  ]
  const missingPathList: string[] = []
  for (const requiredPath of requiredPathList) {
    if (!(await pathExists(requiredPath))) {
      missingPathList.push(requiredPath)
    }
  }
  if (missingPathList.length === 0) return

  // Regenerate all fixtures in one shot via the smoke-suite generator.
  const generationResult = spawnSync('bun', ['run', 'smoke:prepare-audio'], {
    cwd: process.cwd(),
    encoding: 'utf8',
    env: process.env,
    stdio: 'inherit'
  })
  if (generationResult.status !== 0) {
    throw new Error('Audio fixture generation failed while preparing eval scenarios.')
  }
  // Re-check after generation; fail loudly if anything is still missing.
  const unresolvedPathList: string[] = []
  for (const requiredPath of requiredPathList) {
    if (!(await pathExists(requiredPath))) {
      unresolvedPathList.push(requiredPath)
    }
  }
  if (unresolvedPathList.length > 0) {
    throw new Error(`Missing required eval audio fixtures: ${unresolvedPathList.join(', ')}`)
  }
}

/**
 * Polls `condition` every 350 ms until it returns true or the timeout elapses,
 * then throws `errorMessage`. The elapsed-time check happens before each poll,
 * so a slow condition call may overshoot the nominal timeout slightly.
 */
async function waitForCondition(
  condition: () => Promise,
  timeoutMilliseconds: number,
  errorMessage: string
): Promise {
  const startTime = Date.now()
  while (Date.now() - startTime <= timeoutMilliseconds) {
    const passed = await condition()
    if (passed) return
    await new Promise(resolveTimeout => setTimeout(resolveTimeout, 350))
  }
  throw new Error(errorMessage)
}

/**
 * Clicks the chat/translate mode tab and waits for its `data-state` attribute
 * to become 'active', retrying up to `maxAttempts` times. Escape is pressed
 * first to dismiss any overlay that could swallow the click.
 */
async function switchToModeWithRetry(
  page: Page,
  targetMode: 'chat' | 'translate',
  maxAttempts = 3
): Promise {
  const triggerTestId = targetMode === 'chat' ? 'mode-tab-chat' : 'mode-tab-translate'
  for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
    await page.keyboard.press('Escape').catch(() => {})
    await page.getByTestId(triggerTestId).click({ timeout: 15_000 })
    try {
      await waitForCondition(
        async function hasTargetModeTabState(): Promise {
          const stateAttributeValue = await page.getByTestId(triggerTestId).getAttribute('data-state')
          return stateAttributeValue === 'active'
        },
        8_000,
        `Mode switch to ${targetMode} did not complete.`
      )
      return
    } catch {
      // Only surface the failure once all attempts are exhausted.
      if (attempt === maxAttempts) {
        throw new Error(`Mode switch to ${targetMode} did not complete.`)
      }
    }
  }
}

/**
 * Enables the microphone toggle when it is currently off.
 * NOTE(review): assumes the app's aria-label contains the phrase
 * 'enable microphone' while disabled — confirm against the UI component.
 */
async function ensureVoiceInputEnabled(page: Page): Promise {
  const voiceToggle = page.getByTestId('header-voice-input-toggle')
  await voiceToggle.waitFor({ state: 'visible', timeout: 10_000 })
  const ariaLabel = (await voiceToggle.getAttribute('aria-label')) ?? ''
  if (ariaLabel.toLowerCase().includes('enable microphone')) {
    await voiceToggle.click()
    // Brief settle delay after toggling.
    await page.waitForTimeout(500)
  }
}

// Opens the language picker via the desktop trigger when visible, otherwise
// falls back to the mobile trigger.
async function openTranslateLanguagePicker(page: Page): Promise {
  const desktopButton = page.getByTestId('translate-language-picker-open-desktop')
  if (await desktopButton.isVisible().catch(() => false)) {
    await desktopButton.click()
    return
  }
  await page.getByTestId('translate-language-picker-open-mobile').click()
}

/**
 * Selects the from/to languages in the translate picker. Option test IDs are
 * derived by lower-casing the language names, then the picker is dismissed
 * with Escape (best-effort).
 */
async function setTranslateLanguagePair(
  page: Page,
  languagePair: { from: string; to: string }
): Promise {
  await openTranslateLanguagePicker(page)

  const primarySearchInput = page.getByTestId('translate-primary-language-search')
  await primarySearchInput.fill(languagePair.from)
  await page
    .getByTestId(`translate-primary-language-option-${languagePair.from.toLowerCase()}`)
    .click()

  const secondarySearchInput = page.getByTestId('translate-secondary-language-search')
  await secondarySearchInput.fill(languagePair.to)
  await page
    .getByTestId(`translate-secondary-language-option-${languagePair.to.toLowerCase()}`)
    .click()
  await page.keyboard.press('Escape').catch(() => {})
}

/**
 * Installs an init script exposing `window.__lilacEvalCapture`, a pair of
 * in-page hooks that record the assistant's remote audio stream with
 * MediaRecorder so the harness can extract it later. Runs before page scripts.
 */
async function installEvalCaptureHooks(page: Page): Promise {
  await page.addInitScript(() => {
    const globalWindow = window as typeof window & {
      __lilacTestBus?: {
        assistantAudioElement: HTMLAudioElement | null
        eventLog: unknown[]
      }
      __lilacEvalCapture?: {
        startAssistantAudioRecorder: () => Promise
        stopAssistantAudioRecorder: () => Promise
      }
    }
    globalWindow.__lilacEvalCapture = {
      // Returns true once a recorder is attached (or already running);
      // false when the assistant audio element/stream is not yet available.
      async startAssistantAudioRecorder(): Promise {
        const lilacTestBus = globalWindow.__lilacTestBus
        const audioElement = lilacTestBus?.assistantAudioElement
        if (!(audioElement instanceof HTMLAudioElement)) return false
        const audioStream = audioElement.srcObject
        if (!(audioStream instanceof MediaStream)) return false
        const audioTrackList = audioStream.getAudioTracks()
        if (audioTrackList.length === 0) return false
        const captureWindow = globalWindow as typeof globalWindow & {
          __lilacEvalActiveRecorder?: MediaRecorder | null
          __lilacEvalAudioChunkList?: Blob[]
          __lilacEvalRecorderMimeType?: string
        }
        // Idempotent: a recorder already running counts as success.
        if (captureWindow.__lilacEvalActiveRecorder) return true
        // Prefer opus-in-webm when the browser supports it.
        const preferredMimeType = MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
          ? 'audio/webm;codecs=opus'
          : 'audio/webm'
        captureWindow.__lilacEvalAudioChunkList = []
        captureWindow.__lilacEvalRecorderMimeType = preferredMimeType
        const mediaRecorder = new MediaRecorder(audioStream, { mimeType: preferredMimeType })
        mediaRecorder.addEventListener('dataavailable', event => {
          if (!event.data || event.data.size === 0) return
          captureWindow.__lilacEvalAudioChunkList?.push(event.data)
        })
        // Emit a chunk every 250 ms so partial audio survives a crash.
        mediaRecorder.start(250)
        captureWindow.__lilacEvalActiveRecorder = mediaRecorder
        return true
      },
      // Stops the recorder and returns the captured audio as base64 + MIME
      // type, or null when no recorder was active.
      async stopAssistantAudioRecorder(): Promise {
        const captureWindow = globalWindow as typeof globalWindow & {
          __lilacEvalActiveRecorder?: MediaRecorder | null
          __lilacEvalAudioChunkList?: Blob[]
          __lilacEvalRecorderMimeType?: string
        }
        const mediaRecorder = captureWindow.__lilacEvalActiveRecorder
        if (!mediaRecorder) return null
        // Wait for the 'stop' event before assembling chunks into one Blob.
        const blob = await new Promise(resolve => {
          mediaRecorder.addEventListener(
            'stop',
            () => {
              resolve(
                new Blob(captureWindow.__lilacEvalAudioChunkList ?? [], {
                  type: captureWindow.__lilacEvalRecorderMimeType ?? 'audio/webm'
                })
              )
            },
            { once: true }
          )
          mediaRecorder.stop()
        })
        captureWindow.__lilacEvalActiveRecorder = null
        captureWindow.__lilacEvalAudioChunkList = []
        // Manual byte-by-byte base64 encoding (btoa needs a binary string).
        const arrayBuffer = await blob.arrayBuffer()
        const byteArray = new Uint8Array(arrayBuffer)
        let binaryString = ''
        for (let byteIndex = 0; byteIndex < byteArray.length; byteIndex += 1) {
          binaryString += String.fromCharCode(byteArray[byteIndex] ?? 0)
        }
        return {
          base64: btoa(binaryString),
          mimeType: blob.type || captureWindow.__lilacEvalRecorderMimeType || 'audio/webm'
        }
      }
    }
  })
}

// Formats a Playwright console message as "[type] text" for the log file.
function convertConsoleMessageToString(consoleMessage: ConsoleMessage): string {
  return `[${consoleMessage.type()}] ${consoleMessage.text()}`
}

/**
 * Attaches console/pageerror listeners and a 1.2 s interval that snapshots a
 * full-page screenshot + DOM HTML whenever the test-bus event count changes.
 * Returns the accumulated console entries and a stop() that detaches all of it.
 * NOTE(review): the interval callback is async and not awaited between ticks —
 * overlapping captures are possible if a screenshot takes >1.2 s.
 */
async function startScenarioArtifactCollectors(
  page: Page,
  scenarioArtifactDirectoryPath: string
): Promise {
  const screenshotDirectoryPath = join(scenarioArtifactDirectoryPath, 'screenshots')
  const domSnapshotDirectoryPath = join(scenarioArtifactDirectoryPath, 'dom-snapshots')
  await mkdir(screenshotDirectoryPath, { recursive: true })
  await mkdir(domSnapshotDirectoryPath, { recursive: true })

  const consoleEntryList: string[] = []
  const handleConsoleMessage = (consoleMessage: ConsoleMessage): void => {
    consoleEntryList.push(convertConsoleMessageToString(consoleMessage))
  }
  const handlePageError = (error: Error): void => {
    consoleEntryList.push(`[pageerror] ${error.message}`)
  }
  page.on('console', handleConsoleMessage)
  page.on('pageerror', handlePageError)

  let screenshotSequence = 0
  let isStopped = false
  let previousEventCount = -1
  const intervalId = setInterval(async () => {
    if (isStopped) return
    try {
      const eventCount = await page.evaluate(() => window.__lilacTestBus?.eventLog.length ?? 0)
      // Skip when nothing new happened (after the first capture).
      if (eventCount === previousEventCount && screenshotSequence > 0) return
      previousEventCount = eventCount
      screenshotSequence += 1
      const screenshotPath = join(
        screenshotDirectoryPath,
        `${String(screenshotSequence).padStart(3, '0')}-events-${eventCount}.png`
      )
      const htmlPath = join(
        domSnapshotDirectoryPath,
        `${String(screenshotSequence).padStart(3, '0')}-events-${eventCount}.html`
      )
      await page.screenshot({ fullPage: true, path: screenshotPath })
      await writeFile(htmlPath, await page.content(), 'utf8')
    } catch (error) {
      // Capture failures are logged, never fatal to the scenario.
      consoleEntryList.push(
        `[collector] ${error instanceof Error ? error.message : 'collector capture failed'}`
      )
    }
  }, 1200)

  return {
    consoleEntryList,
    async stop(): Promise {
      isStopped = true
      clearInterval(intervalId)
      page.off('console', handleConsoleMessage)
      page.off('pageerror', handlePageError)
    }
  }
}

// Reads the in-page test-bus event log and validates it against the schema.
async function getTestBusEventList(page: Page) {
  const rawEventList = await page.evaluate(() => window.__lilacTestBus?.eventLog ?? [])
  return z.array(LilacTestBusEventSchema).parse(rawEventList)
}

// ---- Type guards over the test-bus event union, keyed on eventType. ----

function isAssistantAudioStateEvent(
  event: ParsedLilacTestBusEvent
): event is AssistantAudioStateEvent {
  return event.eventType === 'assistant_audio_state_changed'
}

function isChatTranscriptEvent(event: ParsedLilacTestBusEvent): event is ChatTranscriptEvent {
  return event.eventType === 'chat_transcript_patch'
}

function isSubtitleStateEvent(event: ParsedLilacTestBusEvent): event is SubtitleStateEvent {
  return event.eventType === 'subtitle_state_changed'
}

function isTranslateCardEvent(event: ParsedLilacTestBusEvent): event is TranslateCardEvent {
  return event.eventType === 'translate_card_patch'
}

function isVisibleErrorEvent(event: ParsedLilacTestBusEvent): event is VisibleErrorEvent {
  return event.eventType === 'visible_error_changed'
}

// Invokes the in-page start hook; false when the hook or stream is absent.
async function maybeStartAssistantAudioRecorder(page: Page): Promise {
  return page.evaluate(async () => {
    const globalWindow = window as typeof window & {
      __lilacEvalCapture?: {
        startAssistantAudioRecorder: () => Promise
      }
    }
    return (await globalWindow.__lilacEvalCapture?.startAssistantAudioRecorder()) ?? false
  })
}

/**
 * Stops the in-page recorder, writes the captured audio as <scenarioId>.webm,
 * and transcodes it to 24 kHz mono WAV via ffmpeg. Returns the WAV path, the
 * WEBM path when ffmpeg fails, or null when nothing was recorded.
 */
async function stopAssistantAudioRecorder(
  page: Page,
  audioArtifactDirectoryPath: string,
  scenarioId: string
): Promise {
  const recordingResult = await page.evaluate(async () => {
    const globalWindow = window as typeof window & {
      __lilacEvalCapture?: {
        stopAssistantAudioRecorder: () => Promise
      }
    }
    return (await globalWindow.__lilacEvalCapture?.stopAssistantAudioRecorder()) ?? null
  })
  if (!recordingResult) return null
  await mkdir(audioArtifactDirectoryPath, { recursive: true })
  const webmArtifactPath = join(audioArtifactDirectoryPath, `${scenarioId}.webm`)
  const wavArtifactPath = join(audioArtifactDirectoryPath, `${scenarioId}.wav`)
  await writeFile(webmArtifactPath, Buffer.from(recordingResult.base64, 'base64'))
  // Transcode to 24 kHz mono WAV; judges presumably expect this format.
  const ffmpegResult = spawnSync(
    'ffmpeg',
    ['-y', '-i', webmArtifactPath, '-ar', '24000', '-ac', '1', wavArtifactPath],
    { encoding: 'utf8' }
  )
  if (ffmpegResult.status !== 0) {
    // ffmpeg unavailable/failed: fall back to the raw webm artifact.
    return webmArtifactPath
  }
  return wavArtifactPath
}

/**
 * Puts the page into the scenario's mode with voice input enabled, and for
 * translate scenarios also selects the expected language pair (when given).
 */
async function configureScenario(page: Page, scenario: EvalScenarioSpec): Promise {
  await switchToModeWithRetry(page, scenario.mode)
  await ensureVoiceInputEnabled(page)
  if (scenario.mode !== 'translate') return
  const languagePair = scenario.languageExpectation.pair
  if (!languagePair) return
  await setTranslateLanguagePair(page, languagePair)
}

/**
 * Submits the scenario's typed input (text scenarios only — audio input comes
 * from the fake-audio-capture browser flag instead) and returns the submission
 * start time in epoch ms, used as the zero point for latency metrics.
 */
async function submitScenarioInput(page: Page, scenario: EvalScenarioSpec): Promise {
  const startedAt = Date.now()
  switch (scenario.mode) {
    case 'chat': {
      if (scenario.inputType === 'text') {
        const typedText = scenario.typedText ?? scenario.promptText ?? ''
        await page.getByTestId('chat-text-input').fill(typedText)
        await page.getByTestId('chat-text-send').click()
      }
      return startedAt
    }
    case 'translate': {
      if (scenario.inputType === 'text') {
        const typedText = scenario.typedText ?? scenario.promptText ?? ''
        await page.getByTestId('translate-text-input').fill(typedText)
        await page.getByTestId('translate-text-send').click()
      }
      return startedAt
    }
    default:
      return startedAt
  }
}

// Non-empty error messages from visible_error_changed events.
function collectVisibleErrorTextList(testBusEventList: ParsedLilacTestBusEvent[]): string[] {
  return testBusEventList
    .filter(isVisibleErrorEvent)
    .map(event => event.errorMessage)
    .filter((value): value is string => typeof value === 'string' && value.length > 0)
}

// Offset (ms) from startedAt to the first event matching the predicate,
// or undefined when no event matches.
function findFirstEventTimestamp(
  testBusEventList: ParsedLilacTestBusEvent[],
  predicate: (event: ParsedLilacTestBusEvent) => boolean,
  startedAt: number
): number | undefined {
  const matchedEvent = testBusEventList.find(predicate)
  if (!matchedEvent) return undefined
  return matchedEvent.occurredAt - startedAt
}

/**
 * Derives per-scenario latency metrics (ms since input submission) from the
 * test-bus event log. Chat: first assistant text / first audio playback.
 * Translate: first visible source text, first draft, and final output.
 * Metrics whose triggering event never occurred are simply omitted.
 */
function buildLatencyMetrics(
  testBusEventList: ParsedLilacTestBusEvent[],
  scenario: EvalScenarioSpec,
  startedAt: number
): Record {
  const latencyMetrics: Record = {}
  if (scenario.mode === 'chat') {
    const assistantTextFirstMs = findFirstEventTimestamp(
      testBusEventList,
      event => isChatTranscriptEvent(event) && event.role === 'assistant',
      startedAt
    )
    if (typeof assistantTextFirstMs === 'number') {
      latencyMetrics.assistantTextFirstMs = assistantTextFirstMs
    }
    const assistantAudioStartMs = findFirstEventTimestamp(
      testBusEventList,
      event => isAssistantAudioStateEvent(event) && event.isPlaying,
      startedAt
    )
    if (typeof assistantAudioStartMs === 'number') {
      latencyMetrics.assistantAudioStartMs = assistantAudioStartMs
    }
    return latencyMetrics
  }
  // Translate mode from here on.
  const firstVisibleSourceMs = findFirstEventTimestamp(
    testBusEventList,
    event => isTranslateCardEvent(event) && event.sourceText.trim().length > 0,
    startedAt
  )
  if (typeof firstVisibleSourceMs === 'number') {
    latencyMetrics.firstVisibleSourceMs = firstVisibleSourceMs
  }
  const firstVisibleDraftMs = findFirstEventTimestamp(
    testBusEventList,
    event =>
      isTranslateCardEvent(event) &&
      event.renderState === 'draft' &&
      event.targetText.trim().length > 0,
    startedAt
  )
  if (typeof firstVisibleDraftMs === 'number') {
    latencyMetrics.firstVisibleDraftMs = firstVisibleDraftMs
  }
  const finalOutputMs = findFirstEventTimestamp(
    testBusEventList,
    event =>
      isTranslateCardEvent(event) &&
      event.renderState === 'final' &&
      event.targetText.trim().length > 0,
    startedAt
  )
  if (typeof finalOutputMs === 'number') {
    latencyMetrics.finalOutputMs = finalOutputMs
  }
  return latencyMetrics
}

/**
 * Polls (up to 35 s) until the scenario's completion criterion is met:
 * chat needs both a user and an assistant transcript event; translate needs
 * both a draft and a final translate-card event. Throws on timeout.
 */
async function waitForScenarioCompletion(page: Page, scenario: EvalScenarioSpec): Promise {
  switch (scenario.mode) {
    case 'chat': {
      await waitForCondition(
        async () => {
          const eventList = await getTestBusEventList(page)
          const hasUserTranscript = eventList.some(
            event => isChatTranscriptEvent(event) && event.role === 'user'
          )
          const hasAssistantTranscript = eventList.some(
            event => isChatTranscriptEvent(event) && event.role === 'assistant'
          )
          return hasUserTranscript && hasAssistantTranscript
        },
        35_000,
        'Chat scenario did not produce user and assistant transcript output.'
      )
      return
    }
    case 'translate': {
      await waitForCondition(
        async () => {
          const eventList = await getTestBusEventList(page)
          const hasDraft = eventList.some(
            event => isTranslateCardEvent(event) && event.renderState === 'draft'
          )
          const hasFinal = eventList.some(
            event => isTranslateCardEvent(event) && event.renderState === 'final'
          )
          return hasDraft && hasFinal
        },
        35_000,
        'Translate scenario did not produce draft and final translation output.'
      )
      return
    }
    default:
      return
  }
}

// Serializes entries as JSONL; an empty list produces an empty file
// (no trailing newline), otherwise the file ends with exactly one newline.
async function writeJsonLines(filePath: string, entryList: unknown[]): Promise {
  const fileContent = entryList.map(entry => JSON.stringify(entry)).join('\n')
  await writeFile(filePath, fileContent ? `${fileContent}\n` : '', 'utf8')
}

/**
 * Executes one scenario end-to-end in a fresh headless Chromium:
 * launches with fake-media flags (feeding the scenario's audio fixture as the
 * microphone), navigates, configures mode/languages, submits input, waits for
 * completion, and collects artifacts (screenshots, DOM, console log, test-bus
 * timelines, optional assistant audio). Harness errors are recorded in
 * harnessFailureList rather than thrown, so artifacts are always produced.
 * NOTE(review): the return generic was stripped by extraction (`Promise>`).
 */
async function runScenario(input: {
  scenario: EvalScenarioSpec
  suiteArtifactDirectoryPath: string
  targetUrl: string
}): Promise> {
  const scenarioArtifactDirectoryPath = join(input.suiteArtifactDirectoryPath, input.scenario.id)
  const screenshotDirectoryPath = join(scenarioArtifactDirectoryPath, 'screenshots')
  const domSnapshotDirectoryPath = join(scenarioArtifactDirectoryPath, 'dom-snapshots')
  const audioArtifactDirectoryPath = join(scenarioArtifactDirectoryPath, 'audio-output')
  const bodyTextPath = join(scenarioArtifactDirectoryPath, 'body.txt')
  const pageHtmlPath = join(scenarioArtifactDirectoryPath, 'page.html')
  const rawTestBusPath = join(scenarioArtifactDirectoryPath, 'test-bus.jsonl')
  await mkdir(scenarioArtifactDirectoryPath, { recursive: true })

  // Text scenarios still need *some* mic stream — feed silence.
  const fakeAudioFixturePath =
    input.scenario.audioFixturePath && input.scenario.inputType !== 'text'
      ? input.scenario.audioFixturePath
      : defaultSilentAudioFixturePath

  const browser = await chromium.launch({
    args: [
      '--use-fake-ui-for-media-stream',
      '--use-fake-device-for-media-stream',
      `--use-file-for-fake-audio-capture=${fakeAudioFixturePath}`
    ],
    headless: true
  })

  let context: BrowserContext | null = null
  let collector: ScenarioArtifactCollectors | null = null
  let page: Page | null = null
  const harnessFailureList: string[] = []
  try {
    context = await browser.newContext({
      permissions: ['microphone'],
      viewport: input.scenario.targetViewport ?? defaultViewport
    })
    page = await context.newPage()
    await installEvalCaptureHooks(page)
    collector = await startScenarioArtifactCollectors(page, scenarioArtifactDirectoryPath)
    let startedAt = Date.now()
    try {
      await page.goto(input.targetUrl, { timeout: 120_000, waitUntil: 'domcontentloaded' })
      await configureScenario(page, input.scenario)
      startedAt = await submitScenarioInput(page, input.scenario)
      if (input.scenario.mode === 'chat') {
        // Best-effort: audio recording failing must not fail the scenario.
        await waitForCondition(
          async () => await maybeStartAssistantAudioRecorder(page as Page),
          20_000,
          'Assistant audio recorder could not attach to remote audio stream.'
        ).catch(() => {})
      }
      await waitForScenarioCompletion(page, input.scenario)
      // Settle delay so trailing events/frames are captured.
      await page.waitForTimeout(1200)
    } catch (error) {
      harnessFailureList.push(error instanceof Error ? error.message : 'Scenario execution failed.')
    }
    // Post-run collection is all best-effort (.catch fallbacks) so partial
    // data still reaches the judges.
    const assistantAudioArtifactPath =
      input.scenario.mode === 'chat' && page
        ? await stopAssistantAudioRecorder(page, audioArtifactDirectoryPath, input.scenario.id)
        : null
    const testBusEventList = page ? await getTestBusEventList(page).catch(() => []) : []
    const bodyText = page
      ? await page
          .locator('body')
          .innerText()
          .catch(() => '')
      : ''
    const domHtml = page ? await page.content().catch(() => '') : ''
    const latencyMetrics = buildLatencyMetrics(testBusEventList, input.scenario, startedAt)
    const consoleLogPath = join(scenarioArtifactDirectoryPath, 'browser-console.log')
    const clientTimelinePath = join(scenarioArtifactDirectoryPath, 'client-timeline.jsonl')
    const transcriptTimelinePath = join(scenarioArtifactDirectoryPath, 'transcript-timeline.jsonl')
    const translationTimelinePath = join(scenarioArtifactDirectoryPath, 'translation-timeline.jsonl')
    const consoleEntryList = collector?.consoleEntryList ?? []
    await writeFile(bodyTextPath, bodyText, 'utf8')
    await writeFile(pageHtmlPath, domHtml, 'utf8')
    await writeFile(consoleLogPath, `${consoleEntryList.join('\n')}\n`, 'utf8')
    await writeJsonLines(clientTimelinePath, testBusEventList)
    await writeJsonLines(rawTestBusPath, testBusEventList)
    await writeJsonLines(
      transcriptTimelinePath,
      testBusEventList.filter(event => isChatTranscriptEvent(event) || isSubtitleStateEvent(event))
    )
    await writeJsonLines(translationTimelinePath, testBusEventList.filter(isTranslateCardEvent))
    return {
      artifactBundle: {
        artifactDirectoryPath: scenarioArtifactDirectoryPath,
        ...(assistantAudioArtifactPath ? { assistantAudioArtifactPath } : {}),
        bodyTextPath,
        browserConsoleLogPath: consoleLogPath,
        clientTimelinePath,
        domSnapshotDirectoryPath,
        mode: input.scenario.mode,
        pageHtmlPath,
        rawTestBusPath,
        scenarioId: input.scenario.id,
        screenshotDirectoryPath,
        targetUrl: input.targetUrl,
        transcriptTimelinePath,
        translationTimelinePath
      },
      bodyText,
      browserConsoleEntryList: consoleEntryList,
      domHtml,
      errorTextList: collectVisibleErrorTextList(testBusEventList),
      harnessFailureList,
      latencyMetrics,
      scenario: input.scenario,
      testBusEventList,
      ...(assistantAudioArtifactPath ? { assistantAudioArtifactPath } : {})
    }
  } finally {
    // Always tear down listeners, context, and browser.
    if (collector) await collector.stop()
    if (context) await context.close()
    await browser.close()
  }
}

/**
 * Layout regression sweep: for light/dark x each mobile viewport, loads the
 * app in translate mode, screenshots it, and reports any horizontal overflow
 * or vertical document growth beyond the viewport (+1 px tolerance).
 * Returns a human-readable finding list (empty = all clear).
 */
async function runMobileLayoutChecks(input: {
  suiteArtifactDirectoryPath: string
  targetUrl: string
}): Promise {
  const findingList: string[] = []
  const artifactDirectoryPath = join(input.suiteArtifactDirectoryPath, 'mobile-layout')
  await mkdir(artifactDirectoryPath, { recursive: true })
  for (const colorScheme of ['light', 'dark'] as const) {
    for (const viewport of mobileViewportList) {
      // Fresh browser per combination for isolation (slower but deterministic).
      const browser: Browser = await chromium.launch({ headless: true })
      const context = await browser.newContext({
        colorScheme,
        permissions: ['microphone'],
        viewport
      })
      const page = await context.newPage()
      try {
        await page.goto(input.targetUrl, { timeout: 120_000, waitUntil: 'domcontentloaded' })
        await switchToModeWithRetry(page, 'translate')
        const hasHorizontalOverflow = await page.evaluate(() => {
          return document.documentElement.scrollWidth > window.innerWidth + 1
        })
        const hasVerticalGrowth = await page.evaluate(() => {
          return document.documentElement.scrollHeight > window.innerHeight + 1
        })
        const screenshotPath = join(
          artifactDirectoryPath,
          `${colorScheme}-${viewport.width}x${viewport.height}.png`
        )
        await page.screenshot({ fullPage: true, path: screenshotPath })
        if (hasHorizontalOverflow) {
          findingList.push(`horizontal overflow at ${colorScheme} ${viewport.width}x${viewport.height}`)
        }
        if (hasVerticalGrowth) {
          findingList.push(
            `vertical document growth at ${colorScheme} ${viewport.width}x${viewport.height}`
          )
        }
      } finally {
        await context.close()
        await browser.close()
      }
    }
  }
  return findingList
}

/**
 * Finds the candidate config with the given ID in the list; throws on an
 * unknown ID so typos fail fast instead of silently falling back.
 */
export function selectCandidateConfig(
  candidateConfigList: EvalCandidateConfig[],
  candidateId: string
): EvalCandidateConfig {
  const selectedCandidateConfig = candidateConfigList.find(
candidateId
  )
  if (!selectedCandidateConfig) {
    throw new Error(`Unknown candidate ID: ${candidateId}`)
  }
  return selectedCandidateConfig
}

/**
 * Maps each scenario judgment to its own single-scenario cluster
 * (cluster-01, cluster-02, …); no grouping by reason is performed.
 */
function buildFailureClusterList(
  scenarioJudgmentList: Array<{
    scenarioId: string
    severity: 'high' | 'low' | 'medium'
    reason: string
  }>
) {
  return scenarioJudgmentList.map((scenarioJudgment, index) => ({
    clusterId: `cluster-${String(index + 1).padStart(2, '0')}`,
    reason: scenarioJudgment.reason,
    scenarioIds: [scenarioJudgment.scenarioId],
    severity: scenarioJudgment.severity
  }))
}

/** Reads and schema-validates the eval matrix JSON file at matrixPath. */
export async function loadEvalMatrix(matrixPath: string) {
  const matrixContent = await readFile(matrixPath, 'utf8')
  return EvalMatrixSchema.parse(JSON.parse(matrixContent))
}

/**
 * Runs the full eval suite: executes each scenario, runs all judges, aggregates a
 * scorecard with a promote/candidate/block decision, and writes summary/leaderboard/
 * observation artifacts under .artifacts/evals/<timestamp>/<evaluationId>.
 * NOTE(review): several generic parameters below (`Promise {`, `z.infer[]`) appear
 * stripped by diff mangling — restore when repairing this patch.
 */
export async function executeEvalSuite(input: EvalRunOptions): Promise {
  const evaluationId = input.evaluationId ?? createEvaluationId()
  const suiteArtifactDirectoryPath = resolve(
    process.cwd(),
    '.artifacts/evals',
    createTimestampLabel(),
    evaluationId
  )
  await mkdir(suiteArtifactDirectoryPath, { recursive: true })
  await mkdir(dirname(defaultSilentAudioFixturePath), { recursive: true })

  let runtimeLogCapture: RuntimeLogCapture | null = null
  if (input.captureVercelLogs) {
    runtimeLogCapture = await startVercelRuntimeLogCapture({
      artifactDirectoryPath: suiteArtifactDirectoryPath,
      deploymentDomain: new URL(input.targetUrl).host,
      // Only forward the token when present so the key stays absent otherwise.
      ...(process.env.VERCEL_TOKEN ? { vercelToken: process.env.VERCEL_TOKEN } : {})
    })
  }

  const scenarioList = resolveEvalScenarioList(input.scenarioIdList)
  await ensureAudioFixturesReady(scenarioList)
  const observationList: ScenarioObservation[] = []
  try {
    // Scenarios run sequentially; each observation starts with empty runtime-log matches
    // which are backfilled after capture stops.
    for (const scenario of scenarioList) {
      const observation = await runScenario({
        scenario,
        suiteArtifactDirectoryPath,
        targetUrl: input.targetUrl
      })
      observationList.push({
        ...observation,
        runtimeLogMatchList: []
      })
    }
  } finally {
    // Stop log capture even if a scenario throws, so the log files are flushed.
    if (runtimeLogCapture) await runtimeLogCapture.stop()
  }

  const runtimeLogMatchList = runtimeLogCapture
    ? await collectRuntimeErrorMatches(runtimeLogCapture.stdoutPath)
    : []
  // The same suite-wide match list is attached to every observation (logs are not
  // attributed per-scenario).
  for (const observation of observationList) {
    observation.runtimeLogMatchList = runtimeLogMatchList
    observation.artifactBundle = {
      ...observation.artifactBundle,
      ...(runtimeLogCapture ? { runtimeLogPath: runtimeLogCapture.stdoutPath } : {}),
      ...(runtimeLogCapture ? { runtimeStderrPath: runtimeLogCapture.stderrPath } : {})
    }
  }
  const layoutFindingList = await runMobileLayoutChecks({
    suiteArtifactDirectoryPath,
    targetUrl: input.targetUrl
  })

  const scenarioRunSummaryList = [] as z.infer[]
  const suiteJudgeResultList = [] as Array<
    z.infer['scorecard']['judgeResults'][number]
  >
  const suiteHardGateFindingList = [] as z.infer<
    typeof EvalRunSummarySchema
  >['scorecard']['hardGateResults']
  const failureClusterSeedList: Array<{
    scenarioId: string
    severity: 'high' | 'low' | 'medium'
    reason: string
  }> = []

  for (const observation of observationList) {
    // NOTE(review): runtimeLogMatchList was already assigned in the loop above;
    // this re-assignment is redundant but harmless.
    observation.runtimeLogMatchList = runtimeLogMatchList
    // Judges run sequentially per observation; each returns a scored JudgeResult.
    const hardGuardrailJudgment = await runHardGuardrailJudge(observation)
    const audioInputJudgeResult = await runAudioInputReferenceJudge(observation)
    const audioOutputJudgeResult = await runAudioOutputListenerJudge(observation)
    const visualJudgeResult = await runVisualUiJudge(observation)
    const semanticJudgeResult = await runSemanticConversationJudge(observation)
    const judgeResultList = [
      ...hardGuardrailJudgment.judgeResultList,
      audioInputJudgeResult,
      audioOutputJudgeResult,
      visualJudgeResult,
      semanticJudgeResult
    ]
    const scorecard = buildScenarioScorecard({
      hardGateFindingList: hardGuardrailJudgment.hardGateFindingList,
      judgeResultList
    })
    // Schema-parse validates each per-scenario summary before aggregation.
    scenarioRunSummaryList.push(
      EvalScenarioRunSummarySchema.parse({
        artifactBundle: observation.artifactBundle,
        latencyMetrics: observation.latencyMetrics,
        scenario: observation.scenario,
        scorecard
      })
    )
    suiteJudgeResultList.push(...judgeResultList)
    suiteHardGateFindingList.push(...hardGuardrailJudgment.hardGateFindingList)
    for (const failureClusterReason of hardGuardrailJudgment.failureClusterReasonList) {
      failureClusterSeedList.push({
        reason: failureClusterReason.reason,
        scenarioId: observation.scenario.id,
        severity: failureClusterReason.severity
      })
    }
  }

  // Mobile layout findings become medium-severity hard-gate entries under the
  // synthetic scenario id 'mobile-layout'.
  for (const layoutFinding of layoutFindingList) {
    suiteHardGateFindingList.push({
      code: 'mobile_layout_regression',
      details: layoutFinding,
      severity: 'medium'
    })
    failureClusterSeedList.push({
      reason: layoutFinding,
      scenarioId: 'mobile-layout',
      severity: 'medium'
    })
  }

  // Mean judge score; Math.max(1, …) guards division by zero when no judges ran.
  const suiteAggregateScore =
    suiteJudgeResultList.reduce((scoreSum, judgeResult) => scoreSum + judgeResult.score, 0) /
    Math.max(1, suiteJudgeResultList.length)
  // Decision: any high-severity gate finding, failed judge, or high-severity judge
  // finding blocks; otherwise score >= 0.92 promotes, >= 0.75 is candidate, else block.
  const suiteReleaseDecision =
    suiteHardGateFindingList.some(finding => finding.severity === 'high') ||
    suiteJudgeResultList.some(judgeResult => !judgeResult.passed) ||
    suiteJudgeResultList.some(judgeResult =>
      judgeResult.findings.some(finding => finding.severity === 'high')
    )
      ? 'block'
      : suiteAggregateScore >= 0.92
        ? 'promote'
        : suiteAggregateScore >= 0.75
          ? 'candidate'
          : 'block'
  const failureClusterList = buildFailureClusterList(failureClusterSeedList)
  const runSummary = EvalRunSummarySchema.parse({
    appCommitSha: getGitCommitSha(),
    artifactDirectoryPath: suiteArtifactDirectoryPath,
    candidateConfig: input.candidateConfig,
    deployedUrl: input.targetUrl,
    evaluationId,
    failureClusterList,
    scenarioRunSummaryList,
    scorecard: {
      // Rounded to three decimal places.
      aggregateScore: Math.round(suiteAggregateScore * 1000) / 1000,
      hardGateResults: suiteHardGateFindingList,
      judgeResults: suiteJudgeResultList,
      regressionDeltas: {},
      releaseDecision: suiteReleaseDecision
    },
    // NOTE(review): startedAtIso is stamped at summary-build time, not at suite start.
    startedAtIso: new Date().toISOString()
  })

  const summaryPath = join(suiteArtifactDirectoryPath, 'summary.json')
  await writeFile(summaryPath, JSON.stringify(runSummary, null, 2), 'utf8')
  // Leaderboard is append-only JSON; a missing/corrupt file resets to empty.
  const leaderboardPath = resolve(process.cwd(), '.artifacts/evals/leaderboard.json')
  let previousLeaderboard: unknown[] = []
  try {
    previousLeaderboard = JSON.parse(await readFile(leaderboardPath, 'utf8')) as unknown[]
  } catch {
    previousLeaderboard = []
  }
  await mkdir(dirname(leaderboardPath), { recursive: true })
  await writeFile(
    leaderboardPath,
    JSON.stringify(
      [
        ...previousLeaderboard,
        {
          aggregateScore: runSummary.scorecard.aggregateScore,
          artifactDirectoryPath: runSummary.artifactDirectoryPath,
          candidateId: runSummary.candidateConfig.id,
          deployedUrl: runSummary.deployedUrl,
          evaluationId: runSummary.evaluationId,
          releaseDecision: runSummary.scorecard.releaseDecision,
          startedAtIso: runSummary.startedAtIso
        }
      ],
      null,
      2
    ),
    'utf8'
  )
  // observations.json is a trimmed view: heavy fields (domHtml, event lists) omitted.
  await writeFile(
    join(suiteArtifactDirectoryPath, 'observations.json'),
    JSON.stringify(
      observationList.map(observation => ({
        artifactBundle: observation.artifactBundle,
        bodyText: observation.bodyText,
        errorTextList: observation.errorTextList,
        latencyMetrics: observation.latencyMetrics,
        runtimeLogMatchList: observation.runtimeLogMatchList,
        scenarioId: observation.scenario.id
      })),
      null,
      2
    ),
    'utf8'
  )

  return {
    observationList,
    runSummary
  }
}

/** Thin wrapper over executeEvalSuite that returns only the run summary. */
export async function runEvalSuite(input: EvalRunOptions): Promise {
  const execution = await executeEvalSuite(input)
  return execution.runSummary
}

/**
 * CLI entry point: parses args, loads the matrix, resolves the candidate
 * (baseline is always selectable), runs the suite, prints the summary JSON,
 * and throws on a 'block' decision so the process exits non-zero.
 */
async function main(): Promise {
  const parsedArguments = parseArguments(process.argv.slice(2))
  const evalMatrix = await loadEvalMatrix(parsedArguments.matrixPath)
  const candidateConfig = selectCandidateConfig(
    [evalMatrix.baseline, ...evalMatrix.candidateList],
    parsedArguments.candidateId
  )
  const runSummary = await runEvalSuite({
    candidateConfig,
    captureVercelLogs: parsedArguments.captureVercelLogs,
    targetUrl: parsedArguments.targetUrl,
    // Optional keys included only when supplied on the command line.
    ...(parsedArguments.evaluationId ? { evaluationId: parsedArguments.evaluationId } : {}),
    ...(parsedArguments.scenarioIdList ? { scenarioIdList: parsedArguments.scenarioIdList } : {})
  })
  console.log(JSON.stringify(runSummary, null, 2))
  if (runSummary.scorecard.releaseDecision === 'block') {
    throw new Error('Eval suite failed.')
  }
}

// Bun-style entry guard: only run when executed directly, not when imported.
if (import.meta.main) {
  void main().catch(error => {
    console.error(error instanceof Error ?
error.message : 'Eval suite failed.')
    process.exit(1)
  })
}
diff --git a/scripts/evals/runtimeTypes.ts b/scripts/evals/runtimeTypes.ts
new file mode 100644
index 0000000..8c3dc1a
--- /dev/null
+++ b/scripts/evals/runtimeTypes.ts
@@ -0,0 +1,30 @@
import type {
  EvalArtifactBundle,
  EvalScenarioSpec,
  JudgeResult,
  LilacTestBusEvent
} from '@/evals/contracts'

// Everything collected for a single scenario run: artifacts on disk plus the
// in-memory page state and events the judges score.
// NOTE(review): `Record` below lost its generic parameters to diff mangling —
// restore (likely Record<string, number> or similar) when repairing this patch.
export type ScenarioObservation = {
  artifactBundle: EvalArtifactBundle
  assistantAudioArtifactPath?: string
  bodyText: string
  browserConsoleEntryList: string[]
  domHtml: string
  errorTextList: string[]
  harnessFailureList: string[]
  latencyMetrics: Record
  runtimeLogMatchList: string[]
  scenario: EvalScenarioSpec
  testBusEventList: LilacTestBusEvent[]
}

// Input handed to a judge: currently just the observation.
export type ScenarioJudgeContext = {
  observation: ScenarioObservation
}

// A judge's output: per-scenario judge results, hard-gate findings, and
// seeds for failure clustering.
export type ScenarioJudgment = {
  failureClusterReasonList: Array<{ reason: string; severity: 'high' | 'low' | 'medium' }>
  judgeResultList: JudgeResult[]
  hardGateFindingList: Array<{ code: string; details: string; severity: 'high' | 'low' | 'medium' }>
}
diff --git a/scripts/smoke/captureVercelRuntimeLogs.ts b/scripts/smoke/captureVercelRuntimeLogs.ts
new file mode 100644
index 0000000..9b341eb
--- /dev/null
+++ b/scripts/smoke/captureVercelRuntimeLogs.ts
@@ -0,0 +1,87 @@
import { spawn } from 'node:child_process'
import { createWriteStream } from 'node:fs'
import { mkdir, readFile } from 'node:fs/promises'
import { join } from 'node:path'

// Handle returned by startVercelRuntimeLogCapture: output file paths plus a stop().
// NOTE(review): `stop: () => Promise` lost its generic (likely Promise<void>) to
// diff mangling.
export type RuntimeLogCapture = {
  stderrPath: string
  stdoutPath: string
  stop: () => Promise
}

export type StartRuntimeLogCaptureInput = {
  artifactDirectoryPath: string
  deploymentDomain: string
  vercelToken?: string
}

/**
 * Spawns `vercel logs <domain> --json` and streams its stdout/stderr into files
 * under <artifactDirectoryPath>/runtime-logs. The returned stop() sends SIGINT
 * and waits up to 3s for the process to close before ending the file writers.
 */
export async function startVercelRuntimeLogCapture(
  input: StartRuntimeLogCaptureInput
): Promise {
  const runtimeDirectoryPath = join(input.artifactDirectoryPath, 'runtime-logs')
  await mkdir(runtimeDirectoryPath, { recursive: true })

  const stdoutPath = join(runtimeDirectoryPath, 'vercel-runtime.jsonl')
  const stderrPath = join(runtimeDirectoryPath, 'vercel-runtime.stderr.log')
  const stdoutWriter = createWriteStream(stdoutPath, { flags: 'w' })
  const stderrWriter = createWriteStream(stderrPath, { flags: 'w' })

  // Inherit the parent env; inject the token only when explicitly supplied.
  const captureEnvironment = {
    ...process.env
  }
  if (input.vercelToken) captureEnvironment.VERCEL_TOKEN = input.vercelToken

  const vercelProcess = spawn('vercel', ['logs', input.deploymentDomain, '--json'], {
    env: captureEnvironment,
    stdio: ['ignore', 'pipe', 'pipe']
  })

  vercelProcess.stdout.pipe(stdoutWriter)
  vercelProcess.stderr.pipe(stderrWriter)

  async function stop(): Promise {
    if (!vercelProcess.killed) {
      vercelProcess.kill('SIGINT')
    }
    // Resolve on 'close' or after 3s, whichever comes first, so stop() never hangs.
    // NOTE(review): the 3s setTimeout is never cleared on early close — harmless
    // but keeps a pending timer alive briefly.
    await new Promise(resolve => {
      vercelProcess.once('close', () => resolve())
      setTimeout(() => resolve(), 3000)
    })
    stdoutWriter.end()
    stderrWriter.end()
  }

  return {
    stderrPath,
    stdoutPath,
    stop
  }
}

/**
 * Reads the captured runtime log and returns each non-empty trimmed line that
 * matches any of the known error patterns (OpenAI protocol errors, RSC render
 * errors, unhandled errors, etc.).
 * NOTE(review): `/\bdigest\b/i` is broad and may match benign lines containing
 * the word "digest" — confirm against real log output.
 */
export async function collectRuntimeErrorMatches(runtimeLogPath: string): Promise {
  const runtimeLogContent = await readFile(runtimeLogPath, 'utf8')
  const runtimeLogLines = runtimeLogContent
    .split('\n')
    .map(line => line.trim())
    .filter(Boolean)

  const matchPatterns = [
    /Unsupported parameter/i,
    /Unsupported value/i,
    /Unknown parameter/i,
    /Invalid 'item\.id'/i,
    /buffer too small/i,
    /Draft translation timed out/i,
    /Unhandled/i,
    /An error occurred in the Server Components render/i,
    /\bdigest\b/i
  ]

  const matches: string[] = []
  for (const runtimeLine of runtimeLogLines) {
    if (matchPatterns.some(pattern => pattern.test(runtimeLine))) {
      matches.push(runtimeLine)
    }
  }
  return matches
}
diff --git a/scripts/smoke/generateSmokeAudio.ts b/scripts/smoke/generateSmokeAudio.ts
new file mode 100644
index 0000000..d9920e6
--- /dev/null
+++ b/scripts/smoke/generateSmokeAudio.ts
@@ -0,0 +1,278 @@
import { spawnSync } from 'node:child_process'
import { mkdir, writeFile } from
'node:fs/promises'
import { join, resolve } from 'node:path'

import { z } from 'zod'

// One TTS prompt: the text spoken, the language, the output base file name, and
// the transcript the downstream judge should expect back.
type SmokeAudioPrompt = {
  expectedTranscript: string
  fileName: string
  languageCode: 'en' | 'es' | 'uk'
  text: string
}

const smokeAudioPromptList: SmokeAudioPrompt[] = [
  {
    expectedTranscript:
      'Hello everyone. This is a Lilac translation smoke test. We are validating live production conversation flow.',
    fileName: 'en_smoke',
    languageCode: 'en',
    text:
      'Hello everyone. This is a Lilac translation smoke test. We are validating live production conversation flow.'
  },
  {
    expectedTranscript:
      'Hola a todos. Esta es una prueba de humo de traduccion de Lilac. Estamos validando el flujo de conversacion en produccion.',
    fileName: 'es_smoke',
    languageCode: 'es',
    text:
      'Hola a todos. Esta es una prueba de humo de traduccion de Lilac. Estamos validando el flujo de conversacion en produccion.'
  },
  {
    expectedTranscript: 'Hello Lilac. Can you help me practice Ukrainian today?',
    fileName: 'chat_question_en',
    languageCode: 'en',
    text: 'Hello Lilac. Can you help me practice Ukrainian today?'
  },
  {
    expectedTranscript: 'Hello. I was wondering if you saw the news.',
    fileName: 'translate_en_simple',
    languageCode: 'en',
    text: 'Hello. I was wondering if you saw the news.'
  },
  {
    expectedTranscript: 'Hola. Me preguntaba si viste las noticias.',
    fileName: 'translate_es_simple',
    languageCode: 'es',
    text: 'Hola. Me preguntaba si viste las noticias.'
  },
  {
    expectedTranscript: 'Um hello. So the other day, um, I was wondering, um, did you see the news?',
    fileName: 'translate_en_fillers',
    languageCode: 'en',
    text: 'Um hello. So the other day, um, I was wondering, um, did you see the news?'
  },
  {
    expectedTranscript: 'Привіт. Мені було цікаво, чи бачив ти новини.',
    fileName: 'translate_uk_simple',
    languageCode: 'uk',
    text: 'Привіт. Мені було цікаво, чи бачив ти новини.'
  },
  {
    expectedTranscript:
      'Hello there. This is a longer Lilac translation evaluation. I am speaking continuously so the subtitle system can be measured for live updates, chunk replacement, and final translation quality over a longer turn.',
    fileName: 'translate_en_long',
    languageCode: 'en',
    text:
      'Hello there. This is a longer Lilac translation evaluation. I am speaking continuously so the subtitle system can be measured for live updates, chunk replacement, and final translation quality over a longer turn.'
  }
]

const fixturesDirectoryPath = resolve(process.cwd(), '.artifacts/smoke/fixtures')
const fixtureManifestPath = join(fixturesDirectoryPath, 'fixtures.manifest.json')

// Shape of fixtures.manifest.json; parsing before write guarantees a valid manifest.
const FixtureManifestSchema = z.object({
  fixtureList: z.array(
    z.object({
      expectedTranscript: z.string().min(1),
      fileName: z.string().min(1),
      languageCode: z.enum(['en', 'es', 'uk']),
      wavPath: z.string().min(1)
    })
  )
})

/** Returns the trimmed OPENAI_API_KEY or throws when it is unset/blank. */
function requireOpenAiApiKey(): string {
  const openAiApiKey = process.env.OPENAI_API_KEY?.trim()
  if (!openAiApiKey) {
    throw new Error('OPENAI_API_KEY is required for smoke audio generation.')
  }
  return openAiApiKey
}

/**
 * Mixes low-amplitude white noise into <fileName>.fixture.wav, writing
 * <fileName>_noisy.fixture.wav (48kHz mono s16). Speech weight 1 vs noise 0.18.
 */
function buildNoisyFixture(fileName: string): void {
  const sourceWavPath = join(fixturesDirectoryPath, `${fileName}.fixture.wav`)
  const noisyOutputPath = join(fixturesDirectoryPath, `${fileName}_noisy.fixture.wav`)
  runFfmpeg([
    '-y',
    '-i',
    sourceWavPath,
    '-f',
    'lavfi',
    '-i',
    'anoisesrc=color=white:amplitude=0.02',
    '-filter_complex',
    '[0:a][1:a]amix=inputs=2:weights=1 0.18:duration=first:dropout_transition=0',
    '-ar',
    '48000',
    '-ac',
    '1',
    '-sample_fmt',
    's16',
    noisyOutputPath
  ])
}

/** Runs ffmpeg synchronously with the given args; throws with ffmpeg output on failure. */
function runFfmpeg(argumentList: string[]): void {
  const ffmpegResult = spawnSync('ffmpeg', argumentList, { encoding: 'utf8' })
  if (ffmpegResult.status !== 0) {
    throw new Error(`ffmpeg failed: ${ffmpegResult.stderr || ffmpegResult.stdout}`)
  }
}

/**
 * Generates TTS audio for one prompt via the OpenAI /v1/audio/speech endpoint
 * (model gpt-4o-mini-tts, voice alloy), converts the mp3 to 48kHz mono s16 wav,
 * then pads it with 1.5s of silence on both ends as <fileName>.fixture.wav.
 * NOTE(review): the return type generic after `): ` was stripped by diff mangling
 * (likely Promise<void>).
 */
async function generatePromptAudio(prompt: SmokeAudioPrompt, openAiApiKey: string): Promise {
  const mp3OutputPath = join(fixturesDirectoryPath, `${prompt.fileName}.mp3`)
  const wavOutputPath = join(fixturesDirectoryPath, `${prompt.fileName}.wav`)
  const paddedOutputPath = join(fixturesDirectoryPath, `${prompt.fileName}.fixture.wav`)

  const ttsResponse = await fetch('https://api.openai.com/v1/audio/speech', {
    body: JSON.stringify({
      input: prompt.text,
      model: 'gpt-4o-mini-tts',
      voice: 'alloy'
    }),
    headers: {
      Authorization: `Bearer ${openAiApiKey}`,
      'Content-Type': 'application/json'
    },
    method: 'POST'
  })

  if (!ttsResponse.ok) {
    const errorText = await ttsResponse.text()
    throw new Error(`TTS generation failed for ${prompt.fileName}: ${errorText}`)
  }

  const audioBuffer = Buffer.from(await ttsResponse.arrayBuffer())
  await writeFile(mp3OutputPath, audioBuffer)

  // mp3 -> canonical wav (48kHz, mono, signed 16-bit).
  runFfmpeg([
    '-y',
    '-i',
    mp3OutputPath,
    '-ar',
    '48000',
    '-ac',
    '1',
    '-sample_fmt',
    's16',
    wavOutputPath
  ])

  // silence(1.5s) + speech + silence(1.5s) concatenated into the final fixture.
  runFfmpeg([
    '-y',
    '-f',
    'lavfi',
    '-i',
    'anullsrc=r=48000:cl=mono:d=1.5',
    '-i',
    wavOutputPath,
    '-f',
    'lavfi',
    '-i',
    'anullsrc=r=48000:cl=mono:d=1.5',
    '-filter_complex',
    '[0:a][1:a][2:a]concat=n=3:v=0:a=1',
    '-ar',
    '48000',
    '-ac',
    '1',
    '-sample_fmt',
    's16',
    paddedOutputPath
  ])
}

/**
 * Builds conversation_smoke.wav: alternating English/Spanish utterances
 * (en, es, en, es) interleaved with 4s/3s/3s/3s silence gaps, concatenated
 * into one 48kHz mono s16 file.
 */
function buildConversationFixture(): void {
  const conversationOutputPath = join(fixturesDirectoryPath, 'conversation_smoke.wav')
  const englishWavPath = join(fixturesDirectoryPath, 'en_smoke.wav')
  const spanishWavPath = join(fixturesDirectoryPath, 'es_smoke.wav')

  runFfmpeg([
    '-y',
    '-f',
    'lavfi',
    '-i',
    'anullsrc=r=48000:cl=mono:d=4',
    '-i',
    englishWavPath,
    '-f',
    'lavfi',
    '-i',
    'anullsrc=r=48000:cl=mono:d=3',
    '-i',
    spanishWavPath,
    '-f',
    'lavfi',
    '-i',
    'anullsrc=r=48000:cl=mono:d=3',
    '-i',
    englishWavPath,
    '-f',
    'lavfi',
    '-i',
    'anullsrc=r=48000:cl=mono:d=3',
    '-i',
    spanishWavPath,
    '-filter_complex',
    '[0:a][1:a][2:a][3:a][4:a][5:a][6:a]concat=n=7:v=0:a=1',
    '-ar',
    '48000',
    '-ac',
    '1',
    '-sample_fmt',
    's16',
    conversationOutputPath
  ])
}

/** Builds silence_5s.wav: five seconds of 48kHz mono s16 silence. */
function buildSilentFixture(): void {
  const silentOutputPath = join(fixturesDirectoryPath, 'silence_5s.wav')
  runFfmpeg([
    '-y',
    '-f',
    'lavfi',
    '-i',
    'anullsrc=r=48000:cl=mono:d=5',
    '-ar',
    '48000',
    '-ac',
    '1',
    '-sample_fmt',
    's16',
    silentOutputPath
  ])
}

/**
 * Writes fixtures.manifest.json describing every prompt fixture (transcript,
 * language, and absolute wav path), schema-validated before writing.
 * Note: only the per-prompt fixtures are listed — the noisy/conversation/silent
 * derivatives are not in the manifest.
 */
async function writeFixtureManifest(): Promise {
  const manifest = FixtureManifestSchema.parse({
    fixtureList: smokeAudioPromptList.map(prompt => ({
      expectedTranscript: prompt.expectedTranscript,
      fileName: prompt.fileName,
      languageCode: prompt.languageCode,
      wavPath: join(fixturesDirectoryPath, `${prompt.fileName}.fixture.wav`)
    }))
  })
  await writeFile(fixtureManifestPath, JSON.stringify(manifest, null, 2), 'utf8')
}

/**
 * Entry point: generates TTS audio for every prompt (sequentially), then builds
 * the noisy, conversation, and silent derivatives and writes the manifest.
 */
async function main(): Promise {
  const openAiApiKey = requireOpenAiApiKey()
  await mkdir(fixturesDirectoryPath, { recursive: true })

  for (const prompt of smokeAudioPromptList) {
    await generatePromptAudio(prompt, openAiApiKey)
  }

  buildNoisyFixture('translate_en_fillers')
  buildConversationFixture()
  buildSilentFixture()
  await writeFixtureManifest()

  console.log(`Smoke fixtures generated in ${fixturesDirectoryPath}`)
}

void main().catch(error => {
  console.error(error instanceof Error ?
error.message : 'Smoke audio generation failed.')
  process.exit(1)
})
diff --git a/scripts/smoke/runLiveConversationSmoke.ts b/scripts/smoke/runLiveConversationSmoke.ts
new file mode 100644
index 0000000..05f4ff5
--- /dev/null
+++ b/scripts/smoke/runLiveConversationSmoke.ts
@@ -0,0 +1,612 @@
import { mkdir, readFile, writeFile } from 'node:fs/promises'
import { dirname, join, resolve } from 'node:path'
import { type Browser, type BrowserContext, chromium, type Page } from 'playwright'

import {
  collectRuntimeErrorMatches,
  type RuntimeLogCapture,
  startVercelRuntimeLogCapture
} from './captureVercelRuntimeLogs'

type SmokeMode = 'chat' | 'translate'

// CLI-configurable options for the smoke run.
type SmokeRunOptions = {
  audioPath: string
  captureVercelLogs: boolean
  runChatScenario: boolean
  targetUrl: string
}

// Per-scenario outcome: failure messages plus optional latency metrics.
// NOTE(review): `metrics?: Record` lost its generic parameters to diff mangling.
type ScenarioResult = {
  failures: string[]
  metrics?: Record
  mode: SmokeMode
}

// Error strings scanned for in rendered page text. These are exact runtime
// strings the app may surface; do not edit them.
const genericServerComponentErrorText =
  'An error occurred in the Server Components render. The specific message is omitted in production builds to avoid leaking sensitive details.'
const missingSessionTypeErrorText = "Missing required parameter: 'session.type'."
const unknownSessionTypeErrorText = "Unknown parameter: 'session.type'."
const invalidItemIdErrorText = "Invalid 'item.id':"
const missingToolCallErrorTextList = [
  'No valid publish_translation tool call was returned.',
  'No publish_translation tool call was returned for the completed turn.'
]
const trackedProtocolErrorTextList = [
  missingSessionTypeErrorText,
  unknownSessionTypeErrorText,
  invalidItemIdErrorText,
  ...missingToolCallErrorTextList
]
const maxTranslateFinalizationLatencyMilliseconds = 30_000
// Matches individual translate cards while excluding the list container.
const translateCardSelector =
  '[data-testid^="translate-card-"]:not([data-testid="translate-card-list"])'

const mobileViewports = [
  { height: 812, width: 375 },
  { height: 844, width: 390 }
] as const

/**
 * Parses CLI flags (--target-url, --audio-path, --skip-vercel-logs, --skip-chat)
 * into SmokeRunOptions, falling back to production defaults. Unknown flags are
 * silently ignored.
 */
function parseArguments(argumentList: string[]): SmokeRunOptions {
  const parsedOptions: SmokeRunOptions = {
    audioPath: resolve(process.cwd(), '.artifacts/smoke/fixtures/conversation_smoke.wav'),
    captureVercelLogs: true,
    runChatScenario: true,
    targetUrl: 'https://lilac.chat'
  }

  for (let argumentIndex = 0; argumentIndex < argumentList.length; argumentIndex += 1) {
    const argument = argumentList[argumentIndex]
    switch (argument) {
      case '--target-url': {
        // Value flags consume the next argument; missing value keeps the default.
        parsedOptions.targetUrl = argumentList[argumentIndex + 1] ?? parsedOptions.targetUrl
        argumentIndex += 1
        continue
      }
      case '--audio-path': {
        parsedOptions.audioPath = resolve(argumentList[argumentIndex + 1] ?? parsedOptions.audioPath)
        argumentIndex += 1
        continue
      }
      case '--skip-vercel-logs': {
        parsedOptions.captureVercelLogs = false
        continue
      }
      case '--skip-chat': {
        parsedOptions.runChatScenario = false
        continue
      }
      default:
        continue
    }
  }

  return parsedOptions
}

/** ISO timestamp with ':' and '.' replaced by '-' for filesystem-safe names. */
function createTimestampLabel(): string {
  return new Date().toISOString().replaceAll(':', '-').replaceAll('.', '-')
}

/**
 * Polls `condition` every 350ms until it returns truthy or the timeout elapses,
 * then throws `errorMessage`.
 * NOTE(review): `condition: () => Promise` lost its generic (likely
 * Promise<boolean>) to diff mangling.
 */
async function waitForCondition(
  condition: () => Promise,
  timeoutMilliseconds: number,
  errorMessage: string
): Promise {
  const startTime = Date.now()
  while (Date.now() - startTime <= timeoutMilliseconds) {
    const passed = await condition()
    if (passed) return
    await new Promise(resolveTimeout => setTimeout(resolveTimeout, 350))
  }
  throw new Error(errorMessage)
}

// Fixed 1.8s settle wait; despite the name, no connection state is actually checked.
async function waitForConnectionLive(page: Page): Promise {
  await page.waitForTimeout(1800)
}

/**
 * Clicks the chat/translate mode tab and waits for its data-state to become
 * 'active', retrying up to maxAttempts times (Escape is pressed first to clear
 * any overlay that might swallow the click). Throws after the final attempt.
 */
async function switchToModeWithRetry(
  page: Page,
  targetMode: 'chat' | 'translate',
  maxAttempts = 3
): Promise {
  const triggerTestId = targetMode === 'chat' ?
'mode-tab-chat' : 'mode-tab-translate'

  for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
    await page.keyboard.press('Escape').catch(() => {})
    await page.getByTestId(triggerTestId).click({ timeout: 15_000 })
    try {
      await waitForCondition(
        async function hasTargetModeTabState(): Promise {
          const stateAttributeValue = await page.getByTestId(triggerTestId).getAttribute('data-state')
          return stateAttributeValue === 'active'
        },
        8_000,
        `Mode switch to ${targetMode} did not complete.`
      )
      return
    } catch {
      if (attempt === maxAttempts) {
        throw new Error(`Mode switch to ${targetMode} did not complete.`)
      }
    }
  }
}

/** Opens global settings via the desktop button when visible, else the mobile one. */
async function openGlobalSettings(page: Page): Promise {
  const desktopButton = page.getByTestId('global-settings-open-desktop')
  if (await desktopButton.isVisible().catch(() => false)) {
    await desktopButton.click()
    return
  }
  await page.getByTestId('global-settings-open-mobile').click()
}

/**
 * Presses Escape up to three times to dismiss any open aria-hidden overlay,
 * returning as soon as none remain. Gives up silently after three attempts.
 */
async function closeOverlayIfPresent(page: Page): Promise {
  for (let attempt = 0; attempt < 3; attempt += 1) {
    const overlayCount = await page.locator('[data-state="open"][aria-hidden="true"]').count()
    if (overlayCount === 0) return
    await page.keyboard.press('Escape').catch(() => {})
    await page.waitForTimeout(120)
  }
}

/**
 * Clicks the header voice-input toggle when its aria-label indicates the
 * microphone is currently disabled ("enable microphone"), then waits 500ms.
 */
async function ensureVoiceInputEnabled(page: Page): Promise {
  const voiceToggle = page.getByTestId('header-voice-input-toggle')
  await voiceToggle.waitFor({ state: 'visible', timeout: 10_000 })
  const ariaLabel = (await voiceToggle.getAttribute('aria-label')) ?? ''
  if (ariaLabel.toLowerCase().includes('enable microphone')) {
    await voiceToggle.click()
    await page.waitForTimeout(500)
  }
}

/**
 * Captures per-mode artifacts (full-page screenshot, body text, page HTML) into
 * <artifactDirectoryPath>/<mode>/ and returns the body text for error scanning.
 */
async function writeScenarioArtifacts(
  page: Page,
  artifactDirectoryPath: string,
  mode: SmokeMode
): Promise {
  const modeDirectoryPath = join(artifactDirectoryPath, mode)
  await mkdir(modeDirectoryPath, { recursive: true })
  const screenshotPath = join(modeDirectoryPath, 'screenshot.png')
  const bodyTextPath = join(modeDirectoryPath, 'body.txt')
  const htmlPath = join(modeDirectoryPath, 'page.html')

  await page.screenshot({ fullPage: true, path: screenshotPath })
  const bodyText = await page.locator('body').innerText()
  await writeFile(bodyTextPath, bodyText, 'utf8')
  await writeFile(htmlPath, await page.content(), 'utf8')

  return bodyText
}

/** Returns the tracked protocol error strings that appear in the given body text. */
function collectProtocolErrorText(bodyText: string): string[] {
  return trackedProtocolErrorTextList.filter(errorText => bodyText.includes(errorText))
}

/**
 * Clicks a slider (by test id) at a horizontal position given as a 0..1 ratio
 * of its width (clamped), vertically centered. Throws if the slider has no
 * bounding box.
 */
async function clickSliderByRatio(page: Page, testId: string, ratio: number): Promise {
  const sliderLocator = page.getByTestId(testId)
  await sliderLocator.waitFor({ state: 'visible', timeout: 15_000 })
  const sliderBox = await sliderLocator.boundingBox()
  if (!sliderBox) throw new Error(`Unable to locate slider bounds for ${testId}.`)

  const normalizedRatio = Math.max(0, Math.min(1, ratio))
  const clickX = sliderBox.x + sliderBox.width * normalizedRatio
  const clickY = sliderBox.y + sliderBox.height / 2
  await page.mouse.click(clickX, clickY)
}

/**
 * Chat-mode smoke: switches to chat, checks the status banner, exercises the
 * turn-delay slider, sends a typed message, and verifies an assistant reply
 * appears after the user's turn in DOM order. All failures are accumulated
 * (not thrown) so artifacts and protocol-error scans always run.
 * NOTE(review): the string literal below is split mid-token by the diff
 * mangling at the original patch's line boundary; the intended text is
 * 'Offline. Waiting for network…'.
 */
async function runChatScenario(page: Page, artifactDirectoryPath: string): Promise {
  const failures: string[] = []
  const typedMessage = `typed smoke ${Date.now()}`
  try {
    await switchToModeWithRetry(page, 'chat')
    await waitForConnectionLive(page)
    const initialStatusMessageText = await page
      .getByTestId('mode-status-message')
      .innerText()
      .catch(() => '')
    if (
      initialStatusMessageText.includes('Offline. 
Waiting for network…') ||
      initialStatusMessageText.includes('Reconnecting…')
    ) {
      throw new Error('Chat mode remained in offline/reconnecting banner state after connection.')
    }

    // Exercise the turn-delay slider and verify no session.type protocol error surfaces.
    await openGlobalSettings(page)
    await clickSliderByRatio(page, 'chat-turn-delay-slider', 0.68)
    await page.waitForTimeout(800)

    const bodyTextAfterSliderUpdate = await page.locator('body').innerText()
    if (bodyTextAfterSliderUpdate.includes(missingSessionTypeErrorText)) {
      throw new Error('Chat slider interaction triggered session.type error.')
    }

    await closeOverlayIfPresent(page)
    // Baseline assistant row count, taken before sending, to detect a new reply.
    const assistantRowCountBeforeTypedSend = await page
      .locator('[data-testid^="chat-row-"][data-chat-role="assistant"]')
      .count()
    await page.getByTestId('chat-text-input').fill(typedMessage)
    await page.getByTestId('chat-text-send').click()

    await waitForCondition(
      async function hasVisibleTypedMessage(): Promise {
        const visibleCount = await page
          .locator('[data-testid^="chat-message-text-"]')
          .filter({ hasText: typedMessage })
          .count()
        return visibleCount > 0
      },
      20_000,
      'Typed chat message was not appended to visible transcript.'
    )

    await waitForCondition(
      async function hasNewAssistantMessage(): Promise {
        const assistantRowCountAfterTypedSend = await page
          .locator('[data-testid^="chat-row-"][data-chat-role="assistant"]')
          .count()
        return assistantRowCountAfterTypedSend > assistantRowCountBeforeTypedSend
      },
      35_000,
      'Chat mode did not produce a follow-up assistant transcript.'
    )

    // Verify DOM order: the typed user row must precede an assistant row.
    // NOTE(review): querySelectorAll's generic parameter appears stripped by
    // diff mangling here.
    const orderingIsValid = await page.evaluate((typedMessageValue: string) => {
      const rowNodeList = Array.from(
        document.querySelectorAll('[data-testid^="chat-row-"]')
      )
      if (rowNodeList.length === 0) return false
      const orderedRows = rowNodeList.map(rowNode => {
        const role = rowNode.dataset.chatRole
        const textNode = rowNode.querySelector('[data-testid^="chat-message-text-"]')
        return {
          role,
          text: textNode?.innerText.trim() ?? ''
        }
      })
      const typedMessageIndex = orderedRows.findIndex(
        row => row.role === 'user' && row.text.includes(typedMessageValue)
      )
      if (typedMessageIndex === -1) return false
      const assistantAfterTypedIndex = orderedRows.findIndex(
        (row, index) => index > typedMessageIndex && row.role === 'assistant'
      )
      return assistantAfterTypedIndex > typedMessageIndex
    }, typedMessage)
    if (!orderingIsValid) {
      throw new Error('Chat timeline order was invalid (assistant rendered before user turn).')
    }
  } catch (error) {
    failures.push(error instanceof Error ? error.message : 'Chat mode validation failed.')
  }

  // Artifact capture and page-text error scans run even when the scenario failed.
  const bodyText = await writeScenarioArtifacts(page, artifactDirectoryPath, 'chat')
  if (bodyText.includes(genericServerComponentErrorText)) {
    failures.push('Chat mode rendered generic Server Components error text.')
  }
  const protocolErrors = collectProtocolErrorText(bodyText)
  for (const protocolError of protocolErrors) {
    failures.push(`Chat mode rendered protocol error text: ${protocolError}`)
  }

  return {
    failures,
    mode: 'chat'
  }
}

/**
 * Translate-mode smoke: switches chat→translate (forcing a real mode change),
 * enables voice input, changes the secondary language to French, sends a typed
 * input, and measures card-shell / draft / final latencies, asserting drafts
 * stream before the final output and finalization stays under the 30s cap.
 * NOTE(review): stripped generics in this function (`Promise {`, `Record`,
 * `Promise` on the inner predicates) are diff-mangling artifacts.
 */
async function runTranslateScenario(
  page: Page,
  artifactDirectoryPath: string
): Promise {
  const failures: string[] = []
  const metrics: Record = {}
  // Connection-settle failure is recorded separately and does not abort the scenario.
  let connectionFailureMessage: null | string = null
  try {
    await closeOverlayIfPresent(page)
    await switchToModeWithRetry(page, 'chat')
    await switchToModeWithRetry(page, 'translate')
    await ensureVoiceInputEnabled(page)
    try {
      await waitForConnectionLive(page)
    } catch (error) {
      connectionFailureMessage =
        error instanceof Error ? error.message : 'Translate mode connection did not become live.'
    }

    // Change secondary language to French and check no protocol/internal-state
    // text leaks into the UI.
    await page.getByTestId('translate-language-picker-open-desktop').click()
    await page.getByTestId('translate-secondary-language-search').fill('French')
    await page.getByTestId('translate-secondary-language-option-fr').click()
    await page.keyboard.press('Escape')
    await page.waitForTimeout(1000)
    const bodyTextAfterLanguageChange = await page.locator('body').innerText()
    if (bodyTextAfterLanguageChange.includes(missingSessionTypeErrorText)) {
      throw new Error('Translate language change triggered session.type error.')
    }
    if (
      bodyTextAfterLanguageChange.includes('AUDIO') ||
      bodyTextAfterLanguageChange.includes('FINAL')
    ) {
      throw new Error('Translate mode rendered internal status chips in user-facing UI.')
    }
    if (
      bodyTextAfterLanguageChange.includes('Offline. Waiting for network…') ||
      bodyTextAfterLanguageChange.includes('Reconnecting…')
    ) {
      throw new Error('Translate mode remained in offline/reconnecting banner state after connection.')
    }

    const typedTranslateInput = `translate smoke ${Date.now()}`
    await page.getByTestId('translate-text-input').fill(typedTranslateInput)
    await page.getByTestId('translate-text-send').click()

    const translateStartTime = Date.now()
    await waitForCondition(
      async function hasTranslateCardShell(): Promise {
        const cardCount = await page.locator(translateCardSelector).count()
        return cardCount > 0
      },
      12_000,
      'Translate mode did not create a subtitle card.'
    )
    metrics.firstCardLatencyMs = Date.now() - translateStartTime

    await waitForCondition(
      async function hasDraftRenderState(): Promise {
        const draftCardCount = await page
          .locator(`${translateCardSelector}[data-render-state="draft"]`)
          .count()
        return draftCardCount > 0
      },
      12_000,
      'Translate mode did not stream draft translation state before final output.'
    )
    metrics.firstDraftLatencyMs = Date.now() - translateStartTime

    await waitForCondition(
      async function hasTranslateCard(): Promise {
        const cardCount = await page.locator(translateCardSelector).count()
        if (cardCount === 0) return false
        const hasFinalCard =
          (await page.locator(`${translateCardSelector}[data-render-state="final"]`).count()) > 0
        if (!hasFinalCard) return false
        // A final card must carry real target text, not a placeholder.
        const targetTextList = await page
          .locator('[data-testid^="translate-card-target-"]')
          .allInnerTexts()
        return targetTextList.some(targetText => {
          const normalizedText = targetText.trim()
          if (!normalizedText) return false
          if (normalizedText === 'Translating…') return false
          if (normalizedText === 'Translating...') return false
          if (normalizedText === 'Listening…') return false
          return true
        })
      },
      20_000,
      'Translate mode did not produce finalized translation output.'
    )
    const translateLatencyMilliseconds = Date.now() - translateStartTime
    metrics.finalTranslationLatencyMs = translateLatencyMilliseconds
    if (translateLatencyMilliseconds > maxTranslateFinalizationLatencyMilliseconds) {
      throw new Error(
        `Translate mode finalized too slowly (${translateLatencyMilliseconds}ms > ${maxTranslateFinalizationLatencyMilliseconds}ms).`
      )
    }
    // Drafts must strictly precede the final output.
    const draftLatencyMilliseconds =
      typeof metrics.firstDraftLatencyMs === 'number' ? metrics.firstDraftLatencyMs : null
    if (
      typeof draftLatencyMilliseconds === 'number' &&
      draftLatencyMilliseconds >= translateLatencyMilliseconds
    ) {
      throw new Error('Translate mode did not emit draft output before final output.')
    }
  } catch (error) {
    failures.push(error instanceof Error ?
error.message : 'Translate mode validation failed.') + } + if (failures.length === 0 && connectionFailureMessage) { + console.warn(connectionFailureMessage) + } + + const bodyText = await writeScenarioArtifacts(page, artifactDirectoryPath, 'translate') + if (bodyText.includes(genericServerComponentErrorText)) { + failures.push('Translate mode rendered generic Server Components error text.') + } + const protocolErrors = collectProtocolErrorText(bodyText) + for (const protocolError of protocolErrors) { + failures.push(`Translate mode rendered protocol error text: ${protocolError}`) + } + + return { + failures, + metrics, + mode: 'translate' + } +} + +async function runMobileThemeChecks( + browser: Browser, + artifactDirectoryPath: string, + targetUrl: string +): Promise { + const failures: string[] = [] + const layoutDirectoryPath = join(artifactDirectoryPath, 'layout') + await mkdir(layoutDirectoryPath, { recursive: true }) + + for (const colorScheme of ['light', 'dark'] as const) { + for (const viewport of mobileViewports) { + let context: BrowserContext | null = null + try { + context = await browser.newContext({ + colorScheme, + permissions: ['microphone'], + viewport + }) + const page = await context.newPage() + await page.goto(targetUrl, { timeout: 120_000, waitUntil: 'domcontentloaded' }) + + await switchToModeWithRetry(page, 'translate') + + const hasHorizontalOverflow = await page.evaluate(function detectOverflow(): boolean { + return document.documentElement.scrollWidth > window.innerWidth + 1 + }) + const hasDocumentVerticalGrowth = await page.evaluate(function detectVerticalGrowth(): boolean { + return document.documentElement.scrollHeight > window.innerHeight + 1 + }) + const bodyText = await page.locator('body').innerText() + + const screenshotPath = join( + layoutDirectoryPath, + `${colorScheme}-${viewport.width}x${viewport.height}.png` + ) + await page.screenshot({ fullPage: true, path: screenshotPath }) + + if (hasHorizontalOverflow) { + 
failures.push( + `layout-${colorScheme}-${viewport.width}x${viewport.height}: horizontal overflow detected` + ) + } + if (hasDocumentVerticalGrowth) { + failures.push( + `layout-${colorScheme}-${viewport.width}x${viewport.height}: document vertical growth detected` + ) + } + if (bodyText.includes(genericServerComponentErrorText)) { + failures.push( + `layout-${colorScheme}-${viewport.width}x${viewport.height}: generic Server Components error text rendered` + ) + } + for (const protocolError of collectProtocolErrorText(bodyText)) { + failures.push( + `layout-${colorScheme}-${viewport.width}x${viewport.height}: protocol error text rendered: ${protocolError}` + ) + } + } catch (error) { + failures.push( + `layout-${colorScheme}-${viewport.width}x${viewport.height}: ${ + error instanceof Error ? error.message : 'layout check failed' + }` + ) + } finally { + if (context) await context.close() + } + } + } + + return failures +} + +async function resolveRuntimeMatches( + logCapture: null | RuntimeLogCapture +): Promise<{ matches: string[]; stderrTail: string }> { + if (!logCapture) return { matches: [], stderrTail: '' } + + const matches = await collectRuntimeErrorMatches(logCapture.stdoutPath) + const stderrContent = await readFile(logCapture.stderrPath, 'utf8') + const stderrLines = stderrContent + .split('\n') + .map(line => line.trim()) + .filter(Boolean) + .slice(-20) + + return { + matches, + stderrTail: stderrLines.join('\n') + } +} + +async function main(): Promise { + const options = parseArguments(process.argv.slice(2)) + const artifactDirectoryPath = resolve(process.cwd(), '.artifacts/smoke', createTimestampLabel()) + await mkdir(artifactDirectoryPath, { recursive: true }) + await mkdir(dirname(options.audioPath), { recursive: true }) + + let logCapture: null | RuntimeLogCapture = null + if (options.captureVercelLogs) { + const runtimeCaptureInput = { + artifactDirectoryPath, + deploymentDomain: new URL(options.targetUrl).host, + ...(process.env.VERCEL_TOKEN ? 
{ vercelToken: process.env.VERCEL_TOKEN } : {}) + } + logCapture = await startVercelRuntimeLogCapture({ + ...runtimeCaptureInput + }) + } + + const browser = await chromium.launch({ + args: [ + '--use-fake-ui-for-media-stream', + '--use-fake-device-for-media-stream', + `--use-file-for-fake-audio-capture=${options.audioPath}` + ], + headless: true + }) + + const failureMessages: string[] = [] + const scenarioResults: ScenarioResult[] = [] + + try { + const pageContext = await browser.newContext({ + permissions: ['microphone'] + }) + const page = await pageContext.newPage() + await page.goto(options.targetUrl, { timeout: 120_000, waitUntil: 'domcontentloaded' }) + + if (options.runChatScenario) { + scenarioResults.push(await runChatScenario(page, artifactDirectoryPath)) + } + scenarioResults.push(await runTranslateScenario(page, artifactDirectoryPath)) + await pageContext.close() + + const layoutFailures = await runMobileThemeChecks( + browser, + artifactDirectoryPath, + options.targetUrl + ) + failureMessages.push(...layoutFailures) + } finally { + await browser.close() + if (logCapture) await logCapture.stop() + } + + for (const scenarioResult of scenarioResults) { + for (const failureMessage of scenarioResult.failures) { + failureMessages.push(`${scenarioResult.mode}: ${failureMessage}`) + } + } + + const runtimeSummary = await resolveRuntimeMatches(logCapture) + for (const runtimeMatch of runtimeSummary.matches) { + failureMessages.push(`runtime-log: ${runtimeMatch}`) + } + + const summary = { + artifactDirectoryPath, + failures: failureMessages, + runtimeLog: logCapture + ? 
{ + stderrPath: logCapture.stderrPath, + stderrTail: runtimeSummary.stderrTail, + stdoutPath: logCapture.stdoutPath + } + : null, + scenarios: scenarioResults + } + + await writeFile( + join(artifactDirectoryPath, 'summary.json'), + JSON.stringify(summary, null, 2), + 'utf8' + ) + console.log(JSON.stringify(summary, null, 2)) + + if (failureMessages.length > 0) { + throw new Error(`Smoke run failed with ${failureMessages.length} failure(s).`) + } +} + +void main().catch(error => { + console.error(error instanceof Error ? error.message : 'Smoke runner failed.') + process.exit(1) +}) diff --git a/src/app/(app)/ChatMode.tsx b/src/app/(app)/ChatMode.tsx new file mode 100644 index 0000000..15bcf1d --- /dev/null +++ b/src/app/(app)/ChatMode.tsx @@ -0,0 +1,188 @@ +'use client' + +import { useEffect, useRef, useState } from 'react' + +import { Button } from '@/components/ui/button' +import { ScrollArea } from '@/components/ui/scroll-area' +import { Textarea } from '@/components/ui/textarea' +import { emitLilacTestBusEvent, setLilacTestAssistantAudioElement } from '@/evals/testBus' +import { useLilacModeRuntime } from '@/realtime/modeRuntimeStore' + +export default function ChatMode() { + const { chatSpeechOutputEnabled, chatTranscripts, remoteAudioStream, submitChatTextInput } = + useLilacModeRuntime() + + const [draftMessage, setDraftMessage] = useState('') + const stayPinnedToBottomRef = useRef(true) + const transcriptScrollAreaRef = useRef(null) + const transcriptViewportRef = useRef(null) + const playbackAudioElementRef = useRef(null) + const chatTranscriptCount = chatTranscripts.length + + useEffect(() => { + const rootElement = transcriptScrollAreaRef.current + if (!rootElement) return + const foundViewportElement = rootElement.querySelector( + '[data-radix-scroll-area-viewport]' + ) as HTMLDivElement | null + if (!foundViewportElement) return + const viewportElement = foundViewportElement + + transcriptViewportRef.current = viewportElement + + function 
handleScroll(): void { + const distanceFromBottom = + viewportElement.scrollHeight - viewportElement.scrollTop - viewportElement.clientHeight + stayPinnedToBottomRef.current = distanceFromBottom < 140 + } + + viewportElement.addEventListener('scroll', handleScroll) + handleScroll() + return () => viewportElement.removeEventListener('scroll', handleScroll) + }, []) + + useEffect(() => { + if (chatTranscriptCount === 0) return + if (!stayPinnedToBottomRef.current) return + const viewportElement = transcriptViewportRef.current + if (!viewportElement) return + viewportElement.scrollTo({ behavior: 'auto', top: viewportElement.scrollHeight }) + }, [chatTranscriptCount]) + + useEffect(() => { + if (!playbackAudioElementRef.current) { + playbackAudioElementRef.current = new Audio() + playbackAudioElementRef.current.autoplay = true + } + const playbackAudioElement = playbackAudioElementRef.current + if (!playbackAudioElement) return + setLilacTestAssistantAudioElement(playbackAudioElement) + + function handlePlaying(): void { + emitLilacTestBusEvent({ + eventType: 'assistant_audio_state_changed', + isPlaying: true + }) + } + + function handleStopped(): void { + emitLilacTestBusEvent({ + eventType: 'assistant_audio_state_changed', + isPlaying: false + }) + } + + playbackAudioElement.addEventListener('ended', handleStopped) + playbackAudioElement.addEventListener('pause', handleStopped) + playbackAudioElement.addEventListener('playing', handlePlaying) + playbackAudioElement.srcObject = chatSpeechOutputEnabled ? 
remoteAudioStream : null + if (chatSpeechOutputEnabled && remoteAudioStream) { + void playbackAudioElement.play().catch(() => {}) + } else { + handleStopped() + } + return () => { + playbackAudioElement.removeEventListener('ended', handleStopped) + playbackAudioElement.removeEventListener('pause', handleStopped) + playbackAudioElement.removeEventListener('playing', handlePlaying) + playbackAudioElement.pause() + playbackAudioElement.srcObject = null + setLilacTestAssistantAudioElement(null) + handleStopped() + } + }, [chatSpeechOutputEnabled, remoteAudioStream]) + + function submitMessage(): void { + const normalizedMessage = draftMessage.trim() + if (!normalizedMessage) return + submitChatTextInput(normalizedMessage) + setDraftMessage('') + } + + return ( +
+ + {chatTranscripts.length ? ( +
+ {chatTranscripts.map(message => { + const isUser = message.role === 'user' + const bubbleBaseClass = + 'max-w-[92%] whitespace-pre-wrap break-words rounded-2xl px-3 py-2 text-sm leading-relaxed' + const bubbleClass = isUser + ? `${bubbleBaseClass} self-end bg-[var(--lilac-brand-primary)] text-[var(--lilac-brand-primary-foreground)]` + : `${bubbleBaseClass} self-start border border-[var(--lilac-border)] bg-[var(--lilac-card-muted)] text-[var(--lilac-ink)]` + + return ( +
+
+ {isUser ? 'You' : 'Lilac'} + {message.status === 'streaming' ? : null} +
+
+ {message.text.trim() || '…'} +
+
+ ) + })} +
+ ) : ( +
+ Start with voice or type below. +
+ )} +
+ +
{ + event.preventDefault() + submitMessage() + }} + > +
+