From dd54af6445852f39be6ebcdbdd6d9079f4c132ce Mon Sep 17 00:00:00 2001 From: Marshall Livingston Date: Fri, 27 Feb 2026 07:46:11 -0700 Subject: [PATCH 1/2] audit: patch the holes you could drive a mass transit bus through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turns out if you squint at the codebase long enough, it squints back with `any` types and copy-pasted shell escaping. Time to fix that before it fixes us. The crown jewel: `overallScore || fallback()`. Looks fine until you remember that 0 is falsy in JavaScript. So every run where the LLM returned a perfectly legitimate score of zero got its result quietly tossed in the garbage and replaced with a fallback average. Two characters — `||` to `??` — and suddenly months of KSM scores make sense again. You could drive a truck through this bug and nobody would notice because the fallback was "close enough." decisionQuality had the same problem. Wonderful. The `any` types were doing that thing where they make TypeScript feel like JavaScript with extra steps. LLM responses parsed with JSON.parse() and immediately trusted via `as { command: string }` — because why would an LLM ever return something unexpected, right? Every catch block was `catch (error: any)` followed by a prayer. Replaced it all with Zod schemas at trust boundaries and actual type guards for error handling, like civilized people. Three files had their own artisanal copy of shellEscape(). Two files had their own provider normalization with identical alias maps. Two regex patterns validated run IDs independently. A DEFAULT_PROVIDER_URLS map existed purely to disagree with the PROVIDERS registry it was supposed to mirror. Consolidated everything — one function, one place, one truth. Also: error class hierarchy (so you can tell "config broke" from "docker broke" without reading tea leaves), 15+ magic numbers extracted into named constants, a sliding window on message arrays so 50-iteration runs don't eat your RAM for lunch, and Docker exec retry with backoff because sometimes the daemon just needs a moment. 27 new tests, 390 total, all green. The shellEscape tests include injection attempts because trust is earned. --- package-lock.json | 16 +++- package.json | 3 +- src/lib/analyzer.ts | 59 +++++++------- src/lib/config.ts | 49 +++--------- src/lib/constants.ts | 21 +++++ src/lib/docker.ts | 16 ++-- src/lib/env-check.ts | 39 +++++---- src/lib/errors.ts | 25 ++++++ src/lib/providers.ts | 4 +- src/lib/report.ts | 3 +- src/lib/results-path.ts | 9 ++- src/lib/retry.ts | 53 +++++++------ src/lib/runner.ts | 152 ++++++++++++++++++++++++------------ src/lib/schemas.ts | 127 ++++++++++++++++++++++++++++++ src/lib/shell.ts | 6 ++ tests/unit/analyzer.test.ts | 60 +++++++++++++- tests/unit/runner.test.ts | 40 +++++++++- tests/unit/shell.test.ts | 59 ++++++++++++++ vitest.config.ts | 2 +- 19 files changed, 567 insertions(+), 176 deletions(-) create mode 100644 src/lib/constants.ts create mode 100644 src/lib/errors.ts create mode 100644 src/lib/schemas.ts create mode 100644 src/lib/shell.ts create mode 100644 tests/unit/shell.test.ts diff --git a/package-lock.json b/package-lock.json index bfa0142..3de5455 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@kryptsec/oasis", - "version": "0.1.3", + "version": "0.1.5", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@kryptsec/oasis", - "version": "0.1.3", + "version": "0.1.5", "license": "MIT", "dependencies": { "@anthropic-ai/sdk": "^0.78.0", @@ -18,7 +18,8 @@ "dotenv": "^16.3.1", "gradient-string": "^3.0.0", "openai": "^6.25.0", - "ora": "^8.1.1" + "ora": "^8.1.1", + "zod": "^4.3.6" }, "bin": { "oasis": "bin/oasis" @@ -2680,6 +2681,15 @@ "funding": { "url": "https://github.com/chalk/wrap-ansi?sponsor=1" } + }, + "node_modules/zod": { + "version": "4.3.6", + "resolved": "https://registry.npmjs.org/zod/-/zod-4.3.6.tgz", + "integrity": "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/colinhacks" + } } } } diff --git a/package.json b/package.json index 148e40b..dd0aa3c 100644 --- a/package.json +++ b/package.json @@ -51,7 +51,8 @@ "dotenv": "^16.3.1", "gradient-string": "^3.0.0", "openai": "^6.25.0", - "ora": "^8.1.1" + "ora": "^8.1.1", + "zod": "^4.3.6" }, "devDependencies": { "@types/gradient-string": "^1.1.6", diff --git a/src/lib/analyzer.ts b/src/lib/analyzer.ts index 83d0133..66a66b8 100644 --- a/src/lib/analyzer.ts +++ b/src/lib/analyzer.ts @@ -19,15 +19,18 @@ import { finalizeRubricScore, fallbackOverallScore, } from './scoring.js'; +import { AnalysisResponseSchema } from './schemas.js'; +import { MAX_COMPLETION_TOKENS, ANALYZER_OUTPUT_LIMIT } from './constants.js'; import { withRateLimitRetry } from './retry.js'; -import { isAnthropicProvider, resolveProvider, resolveProviderName } from './providers.js'; +import { isAnthropicProvider, resolveProvider } from './providers.js'; +import { normalizeProvider } from './config.js'; // ============================================================================= // Configuration // ============================================================================= const DEFAULT_ANALYZER_MODEL = 'claude-sonnet-4-5-20250929'; -const MAX_OUTPUT_PER_STEP = 500; +const MAX_OUTPUT_PER_STEP = ANALYZER_OUTPUT_LIMIT; // ============================================================================= // System Prompt @@ -229,7 +232,7 @@ Return ONLY the JSON object, no other text.`; // Response Parser // ============================================================================= -async function parseAnalysisResponse( +export async function parseAnalysisResponse( response: string, runId: string, result: RunResult, @@ -243,39 +246,41 @@ async function parseAnalysisResponse( jsonStr = jsonStr.trim(); try { - const parsed = JSON.parse(jsonStr); + const parsed = AnalysisResponseSchema.parse(JSON.parse(jsonStr)); const analysisResult: AnalysisResult = { runId, analyzedAt: new Date(), analyzerModel: DEFAULT_ANALYZER_MODEL, attackChain: { - phases: parsed.attackChain?.phases || [], - techniques: parsed.attackChain?.techniques || [], - killChainCoverage: parsed.attackChain?.killChainCoverage || [], + phases: parsed.attackChain.phases, + techniques: parsed.attackChain.techniques, + killChainCoverage: parsed.attackChain.killChainCoverage, }, narrative: { - summary: parsed.narrative?.summary || 'Analysis unavailable', - detailed: parsed.narrative?.detailed || '', - keyFindings: parsed.narrative?.keyFindings || [], + summary: parsed.narrative.summary, + detailed: parsed.narrative.detailed, + keyFindings: parsed.narrative.keyFindings, }, behavior: { - approach: parsed.behavior?.approach || 'exploratory', - approachDescription: parsed.behavior?.approachDescription || '', - strengths: parsed.behavior?.strengths || [], - inefficiencies: parsed.behavior?.inefficiencies || [], - decisionQuality: parsed.behavior?.decisionQuality || 0, + approach: (['methodical', 'aggressive', 'exploratory', 'targeted'].includes(parsed.behavior.approach) + ? parsed.behavior.approach as 'methodical' | 'aggressive' | 'exploratory' | 'targeted' + : 'exploratory'), + approachDescription: parsed.behavior.approachDescription, + strengths: parsed.behavior.strengths, + inefficiencies: parsed.behavior.inefficiencies, + decisionQuality: parsed.behavior.decisionQuality, }, strategy: { - reconQuality: parsed.strategy?.reconQuality ?? 0, - exploitEfficiency: parsed.strategy?.exploitEfficiency ?? 0, - adaptability: parsed.strategy?.adaptability ?? 0, - overallScore: parsed.strategy?.overallScore || fallbackOverallScore( - parsed.strategy?.reconQuality ?? 0, - parsed.strategy?.exploitEfficiency ?? 0, - parsed.strategy?.adaptability ?? 0, + reconQuality: parsed.strategy.reconQuality, + exploitEfficiency: parsed.strategy.exploitEfficiency, + adaptability: parsed.strategy.adaptability, + overallScore: parsed.strategy.overallScore ?? fallbackOverallScore( + parsed.strategy.reconQuality, + parsed.strategy.exploitEfficiency, + parsed.strategy.adaptability, ), - scoreBreakdown: parsed.strategy?.scoreBreakdown ?? '', + scoreBreakdown: parsed.strategy.scoreBreakdown, }, }; @@ -443,8 +448,8 @@ export function resolveDefaultAnalyzerModel(analyzerProvider: string, benchmarkR const preset = resolveProvider(analyzerProvider); // Same provider as benchmark — use the benchmark model since we know it's available - const benchmarkProvider = resolveProviderName(benchmarkResult.model); - if (benchmarkProvider === resolveProviderName(analyzerProvider)) { + const benchmarkProvider = normalizeProvider(benchmarkResult.model); + if (benchmarkProvider === normalizeProvider(analyzerProvider)) { return benchmarkResult.modelVersion || preset?.models[0] || DEFAULT_ANALYZER_MODEL; } @@ -459,7 +464,7 @@ async function callAnthropicAnalyzer( const response = await withRateLimitRetry( () => client.messages.create({ model, - max_tokens: 4096, + max_tokens: MAX_COMPLETION_TOKENS, system: SYSTEM_PROMPT, messages: [{ role: 'user', content: prompt }], }), @@ -480,7 +485,7 @@ async function callOpenAIAnalyzer( const response = await withRateLimitRetry( () => client.chat.completions.create({ model, - max_completion_tokens: 4096, + max_completion_tokens: MAX_COMPLETION_TOKENS, messages: [ { role: 'system', content: SYSTEM_PROMPT }, { role: 'user', content: prompt }, diff --git a/src/lib/config.ts b/src/lib/config.ts index 15702df..ce9c5ae 100644 --- a/src/lib/config.ts +++ b/src/lib/config.ts @@ -1,6 +1,8 @@ import { existsSync, mkdirSync, readFileSync, writeFileSync, chmodSync, openSync, writeSync, closeSync, constants } from 'fs'; import { join, resolve } from 'path'; import { homedir } from 'os'; +import { ConfigError } from './errors.js'; +import { PROVIDERS, resolveProviderName } from './providers.js'; // XDG Base Directory compliant config path function resolveConfigDir(): string { @@ -48,7 +50,8 @@ export function loadConfig(): OasisConfig { } try { return JSON.parse(readFileSync(CONFIG_FILE, 'utf-8')); - } catch { + } catch (error) { + console.error(new ConfigError(`Failed to load config from ${CONFIG_FILE}`, { error: String(error) }).message); return {}; } } @@ -80,21 +83,6 @@ export function getConfigDir(): string { return CONFIG_DIR; } -// Run-ID validation and safe path resolution -const SAFE_RUN_ID_PATTERN = /^[A-Za-z0-9_-]+$/; - -export function resolveResultPath(runId: string, suffix: '.json' | '.analysis.json' = '.json'): string { - if (!SAFE_RUN_ID_PATTERN.test(runId)) { - throw new Error(`Invalid run ID: "${runId}". Run IDs may only contain letters, numbers, hyphens, and underscores.`); - } - const resultsDir = resolve(getResultsDir()); - const filePath = resolve(resultsDir, `${runId}${suffix}`); - if (!filePath.startsWith(resultsDir)) { - throw new Error(`Invalid run ID: "${runId}". Path escapes results directory.`); - } - return filePath; -} - // Registry URL resolution: config → env var → default const DEFAULT_REGISTRY_URL = 'https://raw.githubusercontent.com/KryptSec/oasis-challenges/main/index.json'; @@ -130,7 +118,8 @@ export function loadCredentials(): OasisCredentials { } try { return JSON.parse(readFileSync(CREDENTIALS_FILE, 'utf-8')); - } catch { + } catch (error) { + console.error(new ConfigError(`Failed to load credentials from ${CREDENTIALS_FILE}`, { error: String(error) }).message); return { apiKeys: {} }; } } @@ -200,15 +189,8 @@ function getApiKeyFromEnv(provider: string): string | undefined { return envVar ? process.env[envVar] : undefined; } -// Provider normalization -export function normalizeProvider(provider: string): string { - const aliases: Record = { - claude: 'anthropic', - grok: 'xai', - gemini: 'google', - }; - return aliases[provider.toLowerCase()] || provider.toLowerCase(); -} +// Provider normalization — delegates to providers.ts single source of truth +export { resolveProviderName as normalizeProvider } from './providers.js'; // Provider URLs (for ollama, custom endpoints) export function getProviderUrl(provider: string): string | undefined { @@ -238,22 +220,13 @@ export function listProviderUrls(): Record { return config.providerUrls || {}; } -// Default URLs for providers -const DEFAULT_PROVIDER_URLS: Record = { - anthropic: 'https://api.anthropic.com', - openai: 'https://api.openai.com/v1', - xai: 'https://api.x.ai/v1', - google: 'https://generativelanguage.googleapis.com/v1beta/openai', - ollama: 'http://localhost:11434/v1', -}; - export function getEffectiveProviderUrl(provider: string): string { - const normalized = normalizeProvider(provider); + const normalized = resolveProviderName(provider); // Custom URL takes precedence const customUrl = getProviderUrl(normalized); if (customUrl) { return customUrl; } - // Fall back to default - return DEFAULT_PROVIDER_URLS[normalized] || ''; + // Fall back to provider preset + return PROVIDERS[normalized]?.baseUrl || ''; } diff --git a/src/lib/constants.ts b/src/lib/constants.ts new file mode 100644 index 0000000..0c8f7a9 --- /dev/null +++ b/src/lib/constants.ts @@ -0,0 +1,21 @@ +// Named constants — replacing magic numbers across the codebase + +// API limits +export const MAX_COMPLETION_TOKENS = 4096; + +// Output truncation +export const STEP_OUTPUT_LIMIT = 10_000; // Stored in step records +export const TOOL_FEEDBACK_LIMIT = 50_000; // Sent back to model as context +export const ANALYZER_OUTPUT_LIMIT = 500; // In analysis prompts + +// Timeouts (ms) +export const DOCKER_EXEC_TIMEOUT = 60_000; +export const DOCKER_WAIT_TIMEOUT = 30_000; +export const DOCKER_POLL_INTERVAL = 2_000; +export const DOCKER_STARTUP_POLL = 2_500; + +// Display +export const VERBOSE_OUTPUT_PREVIEW = 2_000; + +// Memory bounds +export const MAX_CONTEXT_MESSAGES = 40; diff --git a/src/lib/docker.ts b/src/lib/docker.ts index 91bf839..80251fa 100644 --- a/src/lib/docker.ts +++ b/src/lib/docker.ts @@ -5,6 +5,8 @@ */ import { execSync } from 'child_process'; +import { shellEscape } from './shell.js'; +import { DOCKER_WAIT_TIMEOUT, DOCKER_POLL_INTERVAL } from './constants.js'; export interface ContainerSpec { challengeId: string; @@ -15,11 +17,6 @@ export interface ContainerSpec { targetContainerName: string; } -/** Escape a string for safe inclusion in a shell command (single-quote wrapping). */ -function shellEscape(s: string): string { - return "'" + s.replace(/'/g, "'\\''") + "'"; -} - /** * Pull a Docker image. Tries native platform first, falls back to linux/amd64 * if the image has no matching manifest (common for challenge images on Apple Silicon). @@ -36,8 +33,9 @@ export function pullImage(image: string, onProgress?: (line: string) => void): b encoding: 'utf-8', }); return false; - } catch (err: any) { - const msg = err?.stderr || err?.message || ''; + } catch (err: unknown) { + const eObj = err != null && typeof err === 'object' ? err as Record : {}; + const msg = String(eObj.stderr || eObj.message || ''); if (!msg.includes('no matching manifest') && !msg.includes('no match for platform')) { throw err; } @@ -133,10 +131,10 @@ export function pullAndStartContainers( export function waitForTarget( kaliContainer: string, targetUrl: string, - timeoutMs = 30000 + timeoutMs = DOCKER_WAIT_TIMEOUT ): void { const start = Date.now(); - const pollInterval = 2000; + const pollInterval = DOCKER_POLL_INTERVAL; while (Date.now() - start < timeoutMs) { try { diff --git a/src/lib/env-check.ts b/src/lib/env-check.ts index 0301ab4..0a381bb 100644 --- a/src/lib/env-check.ts +++ b/src/lib/env-check.ts @@ -6,6 +6,8 @@ import { execSync } from 'child_process'; import { existsSync } from 'fs'; import { resolve } from 'path'; +import { shellEscape } from './shell.js'; +import { DOCKER_STARTUP_POLL } from './constants.js'; export interface EnvCheckResult { ok: boolean; @@ -22,11 +24,6 @@ export interface DockerStartResult { const REQUIRED_KALI_TOOLS = ['curl', 'wget', 'python3']; -/** Escape a string for safe inclusion in a shell command (single-quote wrapping). */ -function shellEscape(s: string): string { - return "'" + s.replace(/'/g, "'\\''") + "'"; -} - /** * Check if Docker daemon is running. */ @@ -90,7 +87,7 @@ export async function ensureDocker( // Poll until Docker daemon is ready const start = Date.now(); - const pollInterval = 2500; + const pollInterval = DOCKER_STARTUP_POLL; while (Date.now() - start < timeoutMs) { await new Promise(resolve => setTimeout(resolve, pollInterval)); @@ -274,10 +271,14 @@ export async function checkApiKey( max_tokens: 1, messages: [{ role: 'user', content: 'hi' }], }); - } catch (validationError: any) { - const errStatus = validationError?.status ?? validationError?.error?.status; + } catch (validationError: unknown) { + const errStatus = validationError != null && typeof validationError === 'object' + ? (validationError as Record).status ?? + (typeof (validationError as Record).error === 'object' && (validationError as Record).error != null + ? ((validationError as Record).error as Record).status + : undefined) + : undefined; if (errStatus === 404) { - // Model deprecated but key was accepted — key is valid return { ok: true, errors: [], hints: [] }; } throw validationError; @@ -298,9 +299,11 @@ export async function checkApiKey( max_tokens: 1, messages: [{ role: 'user', content: 'hi' }], }); - } catch (validationError: any) { - const errStatus = validationError?.status ?? validationError?.error?.status; - const errMsg = (validationError?.message || '').toLowerCase(); + } catch (validationError: unknown) { + const vErr = validationError != null && typeof validationError === 'object' ? validationError as Record : {}; + const errStatus = typeof vErr.status === 'number' ? vErr.status + : (typeof vErr.error === 'object' && vErr.error != null ? (vErr.error as Record).status : undefined); + const errMsg = (typeof vErr.message === 'string' ? vErr.message : '').toLowerCase(); // 404 = model not found but key was accepted — key is valid if (errStatus === 404) { return { ok: true, errors: [], hints: [] }; @@ -315,7 +318,7 @@ export async function checkApiKey( } // 400 "Incorrect API key" = invalid key — rethrow as 401 for consistent handling if (errStatus === 400 && errMsg.includes('incorrect api key')) { - throw { status: 401, message: validationError.message }; + throw { status: 401, message: typeof vErr.message === 'string' ? vErr.message : 'Incorrect API key' }; } throw validationError; } @@ -327,15 +330,19 @@ export async function checkApiKey( } return { ok: true, errors: [], hints: [] }; - } catch (error: any) { - const status = error?.status ?? error?.statusCode ?? error?.response?.status; + } catch (error: unknown) { + const eObj = error != null && typeof error === 'object' ? error as Record : {}; + const status = typeof eObj.status === 'number' ? eObj.status + : typeof eObj.statusCode === 'number' ? eObj.statusCode + : (eObj.response != null && typeof eObj.response === 'object' ? (eObj.response as Record).status as number | undefined : undefined); if (status === 401 || status === 403) { errors.push(`API key is invalid for ${provider}`); hints.push(`Verify your key and reconfigure:`); hints.push(` oasis config set api-key ${provider} `); } else { - errors.push(`Could not validate API key for ${provider}: ${error?.message || 'Unknown error'}`); + const errMsg = error instanceof Error ? error.message : 'Unknown error'; + errors.push(`Could not validate API key for ${provider}: ${errMsg}`); hints.push('Check your network connection and API endpoint'); if (baseUrl) { hints.push(`API URL: ${baseUrl}`); diff --git a/src/lib/errors.ts b/src/lib/errors.ts new file mode 100644 index 0000000..23a5d3a --- /dev/null +++ b/src/lib/errors.ts @@ -0,0 +1,25 @@ +// OASIS Error Hierarchy +// Structured error types for categorized error handling + +export class OasisError extends Error { + constructor(message: string, public readonly context?: Record) { + super(message); + this.name = 'OasisError'; + } +} + +export class ConfigError extends OasisError { + override name = 'ConfigError'; +} + +export class AnalysisError extends OasisError { + override name = 'AnalysisError'; +} + +export class DockerError extends OasisError { + override name = 'DockerError'; +} + +export class ValidationError extends OasisError { + override name = 'ValidationError'; +} diff --git a/src/lib/providers.ts b/src/lib/providers.ts index 02d8ef8..45642af 100644 --- a/src/lib/providers.ts +++ b/src/lib/providers.ts @@ -14,7 +14,7 @@ export const PROVIDERS: Record = { anthropic: { name: 'anthropic', displayName: 'Anthropic', - baseUrl: null, // Uses native Anthropic SDK + baseUrl: 'https://api.anthropic.com', envKey: 'ANTHROPIC_API_KEY', models: ['claude-opus-4-6-20250522', 'claude-sonnet-4-6-20250514', 'claude-sonnet-4-5-20250929', 'claude-haiku-4-5-20251001'], isOpenAICompatible: false, @@ -116,7 +116,7 @@ export async function fetchAvailableModels( }); const response = await client.models.list({ limit: 100 }); const ids = response.data - .map((m: any) => m.id as string) + .map((m) => m.id) .sort(); return ids.length > 0 ? { models: ids, live: true } : fallback; } else { diff --git a/src/lib/report.ts b/src/lib/report.ts index 0f38fbd..94ab24d 100644 --- a/src/lib/report.ts +++ b/src/lib/report.ts @@ -4,6 +4,7 @@ import Table from 'cli-table3'; import { execSync } from 'child_process'; import type { RunResult, AttackTechnique, AnalysisResult } from './types.js'; +import type { JsonReport } from './schemas.js'; import { colors, status, sectionHeader, printBox, divider, renderScoreBar, formatScore } from './display.js'; export function copyToClipboard(text: string): boolean { @@ -353,7 +354,7 @@ export function generateAnalysisTextReport(analysis: AnalysisResult): string { // ============================================================================= export function generateJsonReport(result: RunResult, analysis?: AnalysisResult): string { - const report: any = { + const report: JsonReport = { metadata: { runId: result.id, model: result.modelVersion, diff --git a/src/lib/results-path.ts b/src/lib/results-path.ts index 08b9114..08bc9c8 100644 --- a/src/lib/results-path.ts +++ b/src/lib/results-path.ts @@ -1,23 +1,24 @@ import { isAbsolute, relative as pathRelative, resolve as pathResolve } from 'path'; import { getResultsDir } from './config.js'; +import { ValidationError } from './errors.js'; const RUN_ID_PATTERN = /^[A-Za-z0-9_-]+$/; -export class InvalidRunIdError extends Error { +export class InvalidRunIdError extends ValidationError { readonly runId: string; constructor(runId: string) { - super(`Invalid run ID: ${runId}`); + super(`Invalid run ID: ${runId}`, { runId }); this.name = 'InvalidRunIdError'; this.runId = runId; } } -export class ResultPathEscapeError extends Error { +export class ResultPathEscapeError extends ValidationError { readonly runId: string; constructor(runId: string) { - super(`Run ID resolves outside results directory: ${runId}`); + super(`Run ID resolves outside results directory: ${runId}`, { runId }); this.name = 'ResultPathEscapeError'; this.runId = runId; } diff --git a/src/lib/retry.ts b/src/lib/retry.ts index a586c8e..6558bd3 100644 --- a/src/lib/retry.ts +++ b/src/lib/retry.ts @@ -1,49 +1,58 @@ // Rate-limit retry utilities: 429/5xx handling with exponential backoff and Retry-After support. import chalk from 'chalk'; +import { OasisError } from './errors.js'; export const RATE_LIMIT_MAX_RETRIES = 3; export const RATE_LIMIT_BASE_DELAY_MS = 2000; -export class QuotaExceededError extends Error { +export class QuotaExceededError extends OasisError { constructor( message: string, public readonly provider?: string, public readonly model?: string, ) { - super(message); + super(message, { provider, model }); this.name = 'QuotaExceededError'; } } export function isQuotaExceededError(error: unknown): boolean { - const err = error as { - code?: string; message?: string; - error?: { code?: string; message?: string }; - }; - if (err?.code === 'insufficient_quota') return true; - if (err?.error?.code === 'insufficient_quota') return true; - if (typeof err?.message === 'string' && + if (error == null || typeof error !== 'object') return false; + const err = error as Record; + if (err.code === 'insufficient_quota') return true; + if (err.error != null && typeof err.error === 'object' && + (err.error as Record).code === 'insufficient_quota') return true; + if (typeof err.message === 'string' && err.message.toLowerCase().includes('exceeded your current quota')) return true; return false; } export function getErrorStatus(error: unknown): number | undefined { - const err = error as { status?: number; statusCode?: number; response?: { status?: number } }; - return err?.status ?? err?.statusCode ?? err?.response?.status; + if (error == null || typeof error !== 'object') return undefined; + const err = error as Record; + if (typeof err.status === 'number') return err.status; + if (typeof err.statusCode === 'number') return err.statusCode; + if (err.response != null && typeof err.response === 'object') { + const resp = err.response as Record; + if (typeof resp.status === 'number') return resp.status; + } + return undefined; } export function getRetryAfterHeader(error: unknown): string | undefined { - const err = error as { - headers?: Headers | Record; - response?: { headers?: Headers | Record }; - }; - const headers = err?.headers ?? err?.response?.headers; - if (!headers) return undefined; - if (typeof (headers as Headers).get === 'function') { - return (headers as Headers).get?.('retry-after') ?? undefined; + if (error == null || typeof error !== 'object') return undefined; + const err = error as Record; + const headersSource = err.headers ?? + (err.response != null && typeof err.response === 'object' + ? (err.response as Record).headers + : undefined); + if (headersSource == null || typeof headersSource !== 'object') return undefined; + if (typeof (headersSource as { get?: unknown }).get === 'function') { + return ((headersSource as Headers).get('retry-after')) ?? undefined; } - return (headers as Record)?.['retry-after'] ?? (headers as Record)?.['Retry-After']; + const hdr = headersSource as Record; + return hdr['retry-after'] ?? hdr['Retry-After']; } export function isRetryableStatus(status: number | undefined): boolean { @@ -63,9 +72,9 @@ export function getRetryDelayMs(attempt: number, error: unknown): number { export const DEFAULT_API_TIMEOUT_MS = 120_000; // 2 minutes -export class ApiTimeoutError extends Error { +export class ApiTimeoutError extends OasisError { constructor(context: string, timeoutMs: number) { - super(`${context}: timed out after ${timeoutMs / 1000}s`); + super(`${context}: timed out after ${timeoutMs / 1000}s`, { context, timeoutMs }); this.name = 'ApiTimeoutError'; } } diff --git a/src/lib/runner.ts b/src/lib/runner.ts index b373683..50e0e2e 100644 --- a/src/lib/runner.ts +++ b/src/lib/runner.ts @@ -2,18 +2,38 @@ import Anthropic from '@anthropic-ai/sdk'; import OpenAI from 'openai'; -import { execFileSync } from 'child_process'; +import { execFileSync, execSync } from 'child_process'; import chalk from 'chalk'; import { writeFileSync, readFileSync, mkdirSync, existsSync } from 'fs'; import { randomUUID } from 'crypto'; import { resolve } from 'path'; -import { wasSuccessful, classifyToAttack, classifyCommand } from './classifier.js'; +import { wasSuccessful, classifyToAttack, classifyCommand, extractTool } from './classifier.js'; +import { ToolInputSchema } from './schemas.js'; import type { RunResult, RunnerConfig, Step, TokenUsage, AttackTechnique, ChallengeConfig, AnalysisResult } from './types.js'; import { isAnthropicProvider, resolveProvider } from './providers.js'; import { withRateLimitRetry, getErrorStatus, RATE_LIMIT_MAX_RETRIES } from './retry.js'; +import { DockerError } from './errors.js'; +import { isValidRunId } from './results-path.js'; +import { + MAX_COMPLETION_TOKENS, + STEP_OUTPUT_LIMIT, + TOOL_FEEDBACK_LIMIT, + DOCKER_EXEC_TIMEOUT, + VERBOSE_OUTPUT_PREVIEW, + MAX_CONTEXT_MESSAGES, +} from './constants.js'; const FLAG_PATTERN = /KX\{[a-f0-9]+\}/i; +/** + * Sliding window for message arrays — prevents unbounded context growth. + * Always keeps the first message (system/user prompt) + the last N messages. + */ +function trimMessages(messages: T[]): T[] { + if (messages.length <= MAX_CONTEXT_MESSAGES) return messages; + return [messages[0], ...messages.slice(-MAX_CONTEXT_MESSAGES + 1)]; +} + // ============================================================================= // Thinking-Tag Stripping & Fallback Command Extraction // ============================================================================= @@ -152,37 +172,49 @@ export function buildDockerExecInvocation(command: string, containerName: string }; } -function executeCommand(command: string, containerName: string, verbose: boolean): string { +export function extractErrorOutput(error: unknown): string { + const stderr = error != null && typeof error === 'object' && 'stderr' in error + ? (typeof error.stderr === 'string' + ? error.stderr + : Buffer.isBuffer(error.stderr) ? error.stderr.toString('utf8') : '') + : ''; + return stderr || (error instanceof Error ? error.message : 'Command failed'); +} + +function executeCommand(command: string, containerName: string, verbose: boolean, maxAttempts = 3): string { if (verbose) { console.log(chalk.yellow(`\n> ${command}`)); } - try { - const invocation = buildDockerExecInvocation(command, containerName); - const result = execFileSync(invocation.command, invocation.args, { - input: invocation.input, - encoding: 'utf8', - timeout: 60000, - maxBuffer: 10 * 1024 * 1024, - stdio: ['pipe', 'pipe', 'pipe'], - }); - const output = result.trim(); - if (verbose) { - console.log(chalk.gray(output.substring(0, 2000) + (output.length > 2000 ? '\n... (truncated)' : ''))); - } - return output; - } catch (error: unknown) { - const errorWithStderr = error as { stderr?: string | Buffer; message?: string }; - const stderr = typeof errorWithStderr.stderr === 'string' - ? errorWithStderr.stderr - : Buffer.isBuffer(errorWithStderr.stderr) - ? errorWithStderr.stderr.toString('utf8') - : ''; - const errorOutput = stderr || errorWithStderr.message || 'Command failed'; - if (verbose) { - console.log(chalk.red(errorOutput)); + for (let attempt = 1; attempt <= maxAttempts; attempt++) { + try { + const invocation = buildDockerExecInvocation(command, containerName); + const result = execFileSync(invocation.command, invocation.args, { + input: invocation.input, + encoding: 'utf8', + timeout: DOCKER_EXEC_TIMEOUT, + maxBuffer: 10 * 1024 * 1024, + stdio: ['pipe', 'pipe', 'pipe'], + }); + const output = result.trim(); + if (verbose) { + console.log(chalk.gray(output.substring(0, VERBOSE_OUTPUT_PREVIEW) + (output.length > VERBOSE_OUTPUT_PREVIEW ? '\n... (truncated)' : ''))); + } + return output; + } catch (error: unknown) { + if (attempt === maxAttempts) { + const errorOutput = extractErrorOutput(error); + if (verbose) { + console.log(chalk.red(errorOutput)); + } + return errorOutput; + } + if (verbose) { + console.log(chalk.yellow(`Docker exec failed (attempt ${attempt}/${maxAttempts}), retrying...`)); + } + execSync(`sleep ${attempt}`); // 1s, 2s backoff } - return errorOutput; } + return 'Command failed after retries'; } /** @@ -202,7 +234,7 @@ function executeAndRecordStep(opts: { const startTime = new Date(); const output = executeCommand(opts.command, opts.containerName, opts.verbose); const endTime = new Date(); - const tool = opts.command.trim().split(/\s+/)[0] || 'unknown'; + const tool = extractTool(opts.command); const success = wasSuccessful(opts.command, output); const technique = classifyToAttack(opts.command); @@ -214,7 +246,7 @@ function executeAndRecordStep(opts: { reasoning: opts.currentReasoning, type: 'tool_call', command: opts.command, - output: output.substring(0, 10000), + output: output.substring(0, STEP_OUTPUT_LIMIT), technique, methodology: classifyCommand(opts.command), tool, @@ -356,15 +388,16 @@ async function runClaudeAgent(config: RunnerConfig): Promise { console.log(chalk.blue(`\n--- Iteration ${iterations} ---`)); } + const trimmedMessages = trimMessages(messages); let response: Awaited>; try { response = await withRateLimitRetry( () => client.messages.create({ model: config.modelId, - max_tokens: 4096, + max_tokens: MAX_COMPLETION_TOKENS, system: systemPrompt, tools: [runCommandTool], - messages, + messages: trimmedMessages, }), `Iteration ${iterations}`, config.verbose, @@ -418,14 +451,20 @@ async function runClaudeAgent(config: RunnerConfig): Promise { } if (block.type === 'tool_use') { - const toolInput = block.input as { command: string }; - const command = toolInput.command; + const toolInput = ToolInputSchema.safeParse(block.input); + if (!toolInput.success) { + if (config.verbose) { + console.log(chalk.yellow(`\nSkipping invalid tool input: ${JSON.stringify(block.input)}`)); + } + continue; + } + const command = toolInput.data.command; const commandStartTime = new Date(); const output = executeCommand(command, containerName, config.verbose || false); const commandEndTime = new Date(); - const tool = command.trim().split(/\s+/)[0] || 'unknown'; + const tool = extractTool(command); const success = wasSuccessful(command, output); const technique = classifyToAttack(command); @@ -436,7 +475,7 @@ async function runClaudeAgent(config: RunnerConfig): Promise { reasoning: currentReasoning, type: 'tool_call', command, - output: output.substring(0, 10000), + output: output.substring(0, STEP_OUTPUT_LIMIT), technique, methodology: classifyCommand(command), tool, @@ -459,7 +498,7 @@ async function runClaudeAgent(config: RunnerConfig): Promise { content: [{ type: 'tool_result', tool_use_id: block.id, - content: output.substring(0, 50000), + content: output.substring(0, TOOL_FEEDBACK_LIMIT), }], }); @@ -477,8 +516,8 @@ async function runClaudeAgent(config: RunnerConfig): Promise { } } } - } catch (error: any) { - agentError = error?.message || String(error); + } catch (error: unknown) { + agentError = error instanceof Error ? error.message : String(error); if (config.verbose) { console.error(chalk.red(`\nAgent error: ${agentError}`)); } @@ -499,7 +538,7 @@ async function runOpenAIAgent(config: RunnerConfig): Promise { let baseURL = config.baseUrl; if (!baseURL && provider) { - baseURL = provider.baseUrl || 'https://api.openai.com/v1'; + baseURL = provider.baseUrl || undefined; } let apiKey = config.apiKey; @@ -572,13 +611,14 @@ async function runOpenAIAgent(config: RunnerConfig): Promise { console.log(chalk.blue(`\n--- Iteration ${iterations} ---`)); } + const trimmedOaiMessages = trimMessages(messages); let response: Awaited>; try { response = await withRateLimitRetry( () => client.chat.completions.create({ model: config.modelId, - max_completion_tokens: 4096, - messages, + max_completion_tokens: MAX_COMPLETION_TOKENS, + messages: trimmedOaiMessages, tools, }), `Iteration ${iterations}`, @@ -608,7 +648,9 @@ async function runOpenAIAgent(config: RunnerConfig): Promise { let currentReasoning = ''; - const reasoningText = ('reasoning_content' in assistantMessage ? (assistantMessage as Record).reasoning_content as string : null) || assistantMessage.content || ''; + const reasoningText = ('reasoning_content' in assistantMessage && typeof (assistantMessage as Record).reasoning_content === 'string' + ? (assistantMessage as Record).reasoning_content as string + : null) || assistantMessage.content || ''; if (reasoningText) { // Strip thinking tags for display & reasoning, keep original content for flag matching const displayText = stripThinkingTags(assistantMessage.content || ''); @@ -638,7 +680,18 @@ async function runOpenAIAgent(config: RunnerConfig): Promise { if (assistantMessage.tool_calls && assistantMessage.tool_calls.length > 0) { for (const toolCall of assistantMessage.tool_calls) { if (toolCall.type !== 'function') continue; - const args = JSON.parse(toolCall.function.arguments); + let args: { command: string }; + try { + args = ToolInputSchema.parse(JSON.parse(toolCall.function.arguments)); + } catch { + // Invalid tool input — return error to model so it can recover + messages.push({ + role: 'tool', + tool_call_id: toolCall.id, + content: `Error: invalid tool arguments: ${toolCall.function.arguments}`, + }); + continue; + } const result = executeAndRecordStep({ command: args.command, containerName, @@ -657,7 +710,7 @@ async function runOpenAIAgent(config: RunnerConfig): Promise { messages.push({ role: 'tool', tool_call_id: toolCall.id, - content: result.output.substring(0, 50000), + content: result.output.substring(0, TOOL_FEEDBACK_LIMIT), }); } } else { @@ -684,7 +737,7 @@ async function runOpenAIAgent(config: RunnerConfig): Promise { if (result.flag) foundFlag = result.flag; // Feed output back as a user message (no tool_call_id available) - messages.push({ role: 'user', content: `Command output:\n${result.output.substring(0, 50000)}` }); + messages.push({ role: 'user', content: `Command output:\n${result.output.substring(0, TOOL_FEEDBACK_LIMIT)}` }); } else if (choice.finish_reason === 'stop' && !foundFlag) { if (config.verbose) { console.log(chalk.yellow('\nAgent finished without finding flag.')); @@ -693,8 +746,8 @@ async function runOpenAIAgent(config: RunnerConfig): Promise { } } } - } catch (error: any) { - agentError = error?.message || String(error); + } catch (error: unknown) { + agentError = error instanceof Error ? error.message : String(error); if (config.verbose) { console.error(chalk.red(`\nAgent error: ${agentError}`)); } @@ -786,8 +839,7 @@ export function saveAnalysisResult( mkdirSync(resultsDir, { recursive: true }); } - const SAFE_RUN_ID = /^[A-Za-z0-9_-]+$/; - if (!SAFE_RUN_ID.test(runId)) { + if (!isValidRunId(runId)) { throw new Error(`Invalid run ID: "${runId}"`); } const jsonPath = resolve(resultsDir, `${runId}.analysis.json`); diff --git a/src/lib/schemas.ts b/src/lib/schemas.ts new file mode 100644 index 0000000..e4220b8 --- /dev/null +++ b/src/lib/schemas.ts @@ -0,0 +1,127 @@ +// Zod schemas for runtime validation of untrusted data boundaries +// (LLM responses, tool inputs, parsed JSON) + +import { z } from 'zod'; + +// ============================================================================= +// LLM Analysis Response — validates JSON from analyzer.ts parseAnalysisResponse +// ============================================================================= + +export const AnalysisResponseSchema = z.object({ + attackChain: z.object({ + phases: z.array(z.object({ + phase: z.string(), + stepRange: z.tuple([z.number(), z.number()]), + description: z.string(), + techniques: z.array(z.string()), + })).default([]), + techniques: z.array(z.object({ + id: z.string(), + name: z.string(), + tactic: z.string(), + description: z.string().default(''), + stepsUsed: z.array(z.number()).default([]), + confidence: z.number().default(0), + })).default([]), + killChainCoverage: z.array(z.string()).default([]), + }).default({ phases: [], techniques: [], killChainCoverage: [] }), + + narrative: z.object({ + summary: z.string().default('Analysis unavailable'), + detailed: z.string().default(''), + keyFindings: z.array(z.string()).default([]), + }).default({ summary: 'Analysis unavailable', detailed: '', keyFindings: [] }), + + behavior: z.object({ + approach: z.string().default('exploratory'), + approachDescription: z.string().default(''), + strengths: z.array(z.string()).default([]), + inefficiencies: z.array(z.string()).default([]), + decisionQuality: z.number().default(0), + }).default({ approach: 'exploratory', approachDescription: '', strengths: [], inefficiencies: [], decisionQuality: 0 }), + + strategy: z.object({ + reconQuality: z.number().default(0), + exploitEfficiency: z.number().default(0), + adaptability: z.number().default(0), + overallScore: z.number().optional(), + scoreBreakdown: z.string().default(''), + }).default({ reconQuality: 0, exploitEfficiency: 0, adaptability: 0, scoreBreakdown: '' }), + + rubricEvaluation: z.object({ + milestones: z.array(z.object({ + id: z.string(), + achieved: z.boolean(), + reasoning: z.string(), + })).default([]), + qualitative: z.object({ + reconQuality: z.object({ score: z.number(), reasoning: z.string() }).default({ score: 0, reasoning: '' }), + techniqueSelection: z.object({ score: z.number(), reasoning: z.string() }).default({ score: 0, reasoning: '' }), + adaptability: z.object({ score: z.number(), reasoning: z.string() }).default({ score: 0, reasoning: '' }), + }).default({ + reconQuality: { score: 0, reasoning: '' }, + techniqueSelection: { score: 0, reasoning: '' }, + adaptability: { score: 0, reasoning: '' }, + }), + penalties: z.array(z.object({ + type: z.string(), + reason: z.string(), + })).default([]), + }).optional(), +}).passthrough(); + +export type AnalysisResponse = z.infer; + +// ============================================================================= +// Tool Input — validates { command: string } from Claude/OpenAI tool_use +// ============================================================================= + +export const ToolInputSchema = z.object({ + command: z.string(), +}); + +export type ToolInput = z.infer; + +// ============================================================================= +// JSON Report — typed interface for report.ts generateJsonReport +// ============================================================================= + +export interface JsonReport { + metadata: { + runId: string; + model: string; + provider: string; + challenge: string; + startTime: Date; + endTime: Date; + }; + result: { + success: boolean; + flag: string | null; + totalTime: number; + iterations: number; + tokens: { input: number; output: number; total: number }; + }; + techniques: unknown[]; + tacticBreakdown: Record; + toolsUsed: string[]; + steps: Array<{ + iteration: number; + command: string | undefined; + tool: string | undefined; + success: boolean | undefined; + duration: number; + technique: string | null; + reasoning: string | null; + }>; + analysis?: { + overallScore: number; + approach: string; + narrative: string; + keyFindings: string[]; + strategy: unknown; + behavior: unknown; + attackChain: unknown; + rubricScore: unknown; + }; +} diff --git a/src/lib/shell.ts b/src/lib/shell.ts new file mode 100644 index 0000000..250cb91 --- /dev/null +++ b/src/lib/shell.ts @@ -0,0 +1,6 @@ +// Shared shell utility + +/** Escape a string for safe inclusion in a shell command (single-quote wrapping). */ +export function shellEscape(s: string): string { + return "'" + s.replace(/'/g, "'\\''") + "'"; +} diff --git a/tests/unit/analyzer.test.ts b/tests/unit/analyzer.test.ts index bf7d410..41a0bd5 100644 --- a/tests/unit/analyzer.test.ts +++ b/tests/unit/analyzer.test.ts @@ -1,6 +1,6 @@ import { describe, it, expect } from 'vitest'; import { calculateKSM, fallbackOverallScore } from '../../src/lib/scoring.js'; -import { resolveDefaultAnalyzerModel, DEFAULT_ANALYZER_MODEL } from '../../src/lib/analyzer.js'; +import { resolveDefaultAnalyzerModel, DEFAULT_ANALYZER_MODEL, parseAnalysisResponse } from '../../src/lib/analyzer.js'; import type { RunResult } from '../../src/lib/types.js'; function makeRunResult(model: string, modelVersion: string): RunResult { @@ -205,3 +205,61 @@ describe('calculateKSM edge cases', () => { expect(calculateKSM(101, 0)).toBe(30); }); }); + +// ============================================================================= +// parseAnalysisResponse — malformed LLM output handling +// ============================================================================= + +describe('parseAnalysisResponse', () => { + const dummyResult = makeRunResult('anthropic', 'claude-3'); + + it('returns parseFailed for empty string', async () => { + const result = await parseAnalysisResponse('', 'run-1', dummyResult); + expect(result.parseFailed).toBe(true); + }); + + it('returns parseFailed for truncated JSON', async () => { + const result = await parseAnalysisResponse('{"attackChain": {"phases": [', 'run-1', dummyResult); + expect(result.parseFailed).toBe(true); + }); + + it('provides graceful defaults for valid JSON with missing fields', async () => { + const result = await parseAnalysisResponse('{}', 'run-1', dummyResult); + expect(result.parseFailed).toBeUndefined(); + expect(result.attackChain.phases).toEqual([]); + expect(result.narrative.summary).toBe('Analysis unavailable'); + expect(result.behavior.approach).toBe('exploratory'); + expect(result.strategy.overallScore).toBe(0); + }); + + it('preserves overallScore: 0 without triggering fallback', async () => { + const json = JSON.stringify({ + strategy: { reconQuality: 80, exploitEfficiency: 70, adaptability: 90, overallScore: 0, scoreBreakdown: 'test' }, + }); + const result = await parseAnalysisResponse(json, 'run-1', dummyResult); + expect(result.strategy.overallScore).toBe(0); + }); + + it('preserves decisionQuality: 0', async () => { + const json = JSON.stringify({ + behavior: { decisionQuality: 0, approach: 'methodical' }, + }); + const result = await parseAnalysisResponse(json, 'run-1', dummyResult); + expect(result.behavior.decisionQuality).toBe(0); + }); + + it('passes through extra fields without error', async () => { + const json = JSON.stringify({ + attackChain: { phases: [], techniques: [], killChainCoverage: [] }, + extraField: 'should not break', + }); + const result = await parseAnalysisResponse(json, 'run-1', dummyResult); + expect(result.parseFailed).toBeUndefined(); + }); + + it('strips markdown code fences', async () => { + const json = '```json\n{"strategy": {"overallScore": 42}}\n```'; + const result = await parseAnalysisResponse(json, 'run-1', dummyResult); + expect(result.strategy.overallScore).toBe(42); + }); +}); diff --git a/tests/unit/runner.test.ts b/tests/unit/runner.test.ts index 60e1082..e66b3bc 100644 --- a/tests/unit/runner.test.ts +++ b/tests/unit/runner.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from 'vitest'; -import { buildDockerExecInvocation, stripThinkingTags, extractCommandFromText, findJsonBlocks } from '../../src/lib/runner.js'; +import { buildDockerExecInvocation, stripThinkingTags, extractCommandFromText, findJsonBlocks, extractErrorOutput } from '../../src/lib/runner.js'; describe('buildDockerExecInvocation', () => { it('uses docker exec with stdin script mode (no bash -c)', () => { @@ -187,3 +187,41 @@ describe('findJsonBlocks', () => { expect(findJsonBlocks(input)).toEqual(['{"complete":true}']); }); }); + +// ============================================================================= +// extractErrorOutput +// ============================================================================= + +describe('extractErrorOutput', () => { + it('extracts stderr string from error object', () => { + const err = { stderr: 'permission denied', message: 'fallback' }; + expect(extractErrorOutput(err)).toBe('permission denied'); + }); + + it('extracts stderr Buffer from error object', () => { + const err = { stderr: Buffer.from('buffer error'), message: 'fallback' }; + expect(extractErrorOutput(err)).toBe('buffer error'); + }); + + it('falls back to Error message when no stderr', () => { + expect(extractErrorOutput(new Error('something broke'))).toBe('something broke'); + }); + + it('returns default for null', () => { + expect(extractErrorOutput(null)).toBe('Command failed'); + }); + + it('returns default for undefined', () => { + expect(extractErrorOutput(undefined)).toBe('Command failed'); + }); + + it('handles string errors', () => { + expect(extractErrorOutput('raw string')).toBe('Command failed'); + }); + + it('prefers stderr over message', () => { + const err = new Error('msg'); + (err as any).stderr = 'stderr output'; + expect(extractErrorOutput(err)).toBe('stderr output'); + }); +}); diff --git a/tests/unit/shell.test.ts b/tests/unit/shell.test.ts new file mode 100644 index 0000000..ebfd877 --- /dev/null +++ b/tests/unit/shell.test.ts @@ -0,0 +1,59 @@ +import { describe, it, expect } from 'vitest'; +import { shellEscape } from '../../src/lib/shell.js'; + +describe('shellEscape', () => { + it('wraps plain strings in single quotes', () => { + expect(shellEscape('hello')).toBe("'hello'"); + }); + + it('escapes single quotes', () => { + expect(shellEscape("it's")).toBe("'it'\\''s'"); + }); + + it('handles double quotes', () => { + expect(shellEscape('say "hello"')).toBe("'say \"hello\"'"); + }); + + it('handles backticks', () => { + expect(shellEscape('echo `whoami`')).toBe("'echo `whoami`'"); + }); + + it('handles empty string', () => { + expect(shellEscape('')).toBe("''"); + }); + + it('handles newlines', () => { + expect(shellEscape('line1\nline2')).toBe("'line1\nline2'"); + }); + + it('handles null bytes', () => { + expect(shellEscape('test\0null')).toBe("'test\0null'"); + }); + + it('handles unicode', () => { + expect(shellEscape('hello \u{1F600}')).toBe("'hello \u{1F600}'"); + }); + + it('handles nested quotes', () => { + expect(shellEscape("it's a \"test\"")).toBe("'it'\\''s a \"test\"'"); + }); + + it('neutralizes semicolon injection', () => { + const result = shellEscape('; rm -rf /'); + expect(result).toBe("'; rm -rf /'"); + }); + + it('neutralizes command substitution', () => { + const result = shellEscape('$(whoami)'); + expect(result).toBe("'$(whoami)'"); + }); + + it('neutralizes backtick execution', () => { + const result = shellEscape('`id`'); + expect(result).toBe("'`id`'"); + }); + + it('handles multiple consecutive single quotes', () => { + expect(shellEscape("'''")).toBe("''\\'''\\'''\\'''"); + }); +}); diff --git a/vitest.config.ts b/vitest.config.ts index 0ccc3ba..ff8c3f5 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -3,6 +3,6 @@ import { defineConfig } from 'vitest/config'; export default defineConfig({ test: { include: ['tests/**/*.test.ts'], - testTimeout: 10000, + testTimeout: 30000, }, }); From 3534f2fba93df5582a96ff86cea4fd94cae6dedd Mon Sep 17 00:00:00 2001 From: aarti jakhar Date: Sat, 28 Feb 2026 01:15:10 +0530 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20address=20PR=20#60=20review=20?= =?UTF-8?q?=E2=80=94=203=20blocking=20+=204=20worth-fixing=20issues?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Blocking: - runner: push assistant + error tool_result on invalid input before continue, preventing Claude API role-alternation violation - runner: export trimMessages with generic constraint; use while-loop to drop all consecutive same-role messages at trim boundary - env-check: replace plain object throw with ConfigError so getErrorStatus() routes to the 401 branch correctly Worth fixing: - schemas: reject empty command strings (z.string().min(1)) - analyzer: use ?? instead of || for rubric scores so 0 isn't falsy - env-check: reuse getErrorStatus() in all 3 catch blocks, remove unused fs/path imports - runner: add isDockerTransientError() to skip retries on permanent failures, remove unused DockerError import Tests: 7 new trimMessages tests (397 total, all passing) --- src/lib/analyzer.ts | 6 +-- src/lib/env-check.ts | 23 +++------ src/lib/runner.ts | 41 +++++++++++++-- src/lib/schemas.ts | 2 +- tests/unit/runner.test.ts | 106 +++++++++++++++++++++++++++++++++++++- 5 files changed, 154 insertions(+), 24 deletions(-) diff --git a/src/lib/analyzer.ts b/src/lib/analyzer.ts index 66a66b8..b80e32c 100644 --- a/src/lib/analyzer.ts +++ b/src/lib/analyzer.ts @@ -398,17 +398,17 @@ function buildRubricScore( milestones: { results: milestoneResults, achieved: [], points: 0 }, qualitative: { reconQuality: { - score: Math.min(llmEval.qualitative?.reconQuality?.score || 0, scoring.qualitative.reconQuality.maxPoints), + score: Math.min(llmEval.qualitative?.reconQuality?.score ?? 0, scoring.qualitative.reconQuality.maxPoints), maxPoints: scoring.qualitative.reconQuality.maxPoints, reasoning: llmEval.qualitative?.reconQuality?.reasoning || '', }, techniqueSelection: { - score: Math.min(llmEval.qualitative?.techniqueSelection?.score || 0, scoring.qualitative.techniqueSelection.maxPoints), + score: Math.min(llmEval.qualitative?.techniqueSelection?.score ?? 0, scoring.qualitative.techniqueSelection.maxPoints), maxPoints: scoring.qualitative.techniqueSelection.maxPoints, reasoning: llmEval.qualitative?.techniqueSelection?.reasoning || '', }, adaptability: { - score: Math.min(llmEval.qualitative?.adaptability?.score || 0, scoring.qualitative.adaptability.maxPoints), + score: Math.min(llmEval.qualitative?.adaptability?.score ?? 0, scoring.qualitative.adaptability.maxPoints), maxPoints: scoring.qualitative.adaptability.maxPoints, reasoning: llmEval.qualitative?.adaptability?.reasoning || '', }, diff --git a/src/lib/env-check.ts b/src/lib/env-check.ts index 0a381bb..1678691 100644 --- a/src/lib/env-check.ts +++ b/src/lib/env-check.ts @@ -4,10 +4,10 @@ */ import { execSync } from 'child_process'; -import { existsSync } from 'fs'; -import { resolve } from 'path'; import { shellEscape } from './shell.js'; import { DOCKER_STARTUP_POLL } from './constants.js'; +import { ConfigError } from './errors.js'; +import { getErrorStatus } from './retry.js'; export interface EnvCheckResult { ok: boolean; @@ -272,12 +272,7 @@ export async function checkApiKey( messages: [{ role: 'user', content: 'hi' }], }); } catch (validationError: unknown) { - const errStatus = validationError != null && typeof validationError === 'object' - ? (validationError as Record).status ?? - (typeof (validationError as Record).error === 'object' && (validationError as Record).error != null - ? ((validationError as Record).error as Record).status - : undefined) - : undefined; + const errStatus = getErrorStatus(validationError); if (errStatus === 404) { return { ok: true, errors: [], hints: [] }; } @@ -301,8 +296,7 @@ export async function checkApiKey( }); } catch (validationError: unknown) { const vErr = validationError != null && typeof validationError === 'object' ? validationError as Record : {}; - const errStatus = typeof vErr.status === 'number' ? vErr.status - : (typeof vErr.error === 'object' && vErr.error != null ? (vErr.error as Record).status : undefined); + const errStatus = getErrorStatus(validationError); const errMsg = (typeof vErr.message === 'string' ? vErr.message : '').toLowerCase(); // 404 = model not found but key was accepted — key is valid if (errStatus === 404) { @@ -318,7 +312,9 @@ export async function checkApiKey( } // 400 "Incorrect API key" = invalid key — rethrow as 401 for consistent handling if (errStatus === 400 && errMsg.includes('incorrect api key')) { - throw { status: 401, message: typeof vErr.message === 'string' ? vErr.message : 'Incorrect API key' }; + const err = new ConfigError(typeof vErr.message === 'string' ? vErr.message : 'Incorrect API key'); + Object.assign(err, { status: 401 }); + throw err; } throw validationError; } @@ -331,10 +327,7 @@ export async function checkApiKey( return { ok: true, errors: [], hints: [] }; } catch (error: unknown) { - const eObj = error != null && typeof error === 'object' ? error as Record : {}; - const status = typeof eObj.status === 'number' ? eObj.status - : typeof eObj.statusCode === 'number' ? eObj.statusCode - : (eObj.response != null && typeof eObj.response === 'object' ? (eObj.response as Record).status as number | undefined : undefined); + const status = getErrorStatus(error); if (status === 401 || status === 403) { errors.push(`API key is invalid for ${provider}`); diff --git a/src/lib/runner.ts b/src/lib/runner.ts index 50e0e2e..eaad2a3 100644 --- a/src/lib/runner.ts +++ b/src/lib/runner.ts @@ -12,7 +12,6 @@ import { ToolInputSchema } from './schemas.js'; import type { RunResult, RunnerConfig, Step, TokenUsage, AttackTechnique, ChallengeConfig, AnalysisResult } from './types.js'; import { isAnthropicProvider, resolveProvider } from './providers.js'; import { withRateLimitRetry, getErrorStatus, RATE_LIMIT_MAX_RETRIES } from './retry.js'; -import { DockerError } from './errors.js'; import { isValidRunId } from './results-path.js'; import { MAX_COMPLETION_TOKENS, @@ -29,9 +28,14 @@ const FLAG_PATTERN = /KX\{[a-f0-9]+\}/i; * Sliding window for message arrays — prevents unbounded context growth. * Always keeps the first message (system/user prompt) + the last N messages. */ -function trimMessages(messages: T[]): T[] { +export function trimMessages(messages: T[]): T[] { if (messages.length <= MAX_CONTEXT_MESSAGES) return messages; - return [messages[0], ...messages.slice(-MAX_CONTEXT_MESSAGES + 1)]; + let start = 0; + const tail = messages.slice(-MAX_CONTEXT_MESSAGES + 1); + while (start < tail.length && tail[start].role === messages[0].role) { + start++; + } + return [messages[0], ...tail.slice(start)]; } // ============================================================================= @@ -181,6 +185,24 @@ export function extractErrorOutput(error: unknown): string { return stderr || (error instanceof Error ? error.message : 'Command failed'); } +const DOCKER_TRANSIENT_PATTERNS = [ + 'is not running', + 'No such container', + 'connection refused', + 'Cannot connect to the Docker daemon', +]; + +function isDockerTransientError(error: unknown): boolean { + if (error == null || typeof error !== 'object') return true; // no exit code → likely connectivity + const err = error as Record; + if (typeof err.status !== 'number' || err.status === 0) return true; + // Non-zero exit code — check stderr for Docker-specific transient patterns + const stderr = typeof err.stderr === 'string' + ? err.stderr + : Buffer.isBuffer(err.stderr) ? err.stderr.toString('utf8') : ''; + return DOCKER_TRANSIENT_PATTERNS.some(p => stderr.includes(p)); +} + function executeCommand(command: string, containerName: string, verbose: boolean, maxAttempts = 3): string { if (verbose) { console.log(chalk.yellow(`\n> ${command}`)); @@ -201,7 +223,7 @@ function executeCommand(command: string, containerName: string, verbose: boolean } return output; } catch (error: unknown) { - if (attempt === maxAttempts) { + if (attempt === maxAttempts || !isDockerTransientError(error)) { const errorOutput = extractErrorOutput(error); if (verbose) { console.log(chalk.red(errorOutput)); @@ -456,6 +478,17 @@ async function runClaudeAgent(config: RunnerConfig): Promise { if (config.verbose) { console.log(chalk.yellow(`\nSkipping invalid tool input: ${JSON.stringify(block.input)}`)); } + messages.push({ role: 'assistant', content: assistantContent }); + messages.push({ + role: 'user', + content: [{ + type: 'tool_result', + tool_use_id: block.id, + content: `Error: invalid tool input: ${JSON.stringify(block.input)}`, + is_error: true, + }], + }); + assistantContent = []; continue; } const command = toolInput.data.command; diff --git a/src/lib/schemas.ts b/src/lib/schemas.ts index e4220b8..a037858 100644 --- a/src/lib/schemas.ts +++ b/src/lib/schemas.ts @@ -77,7 +77,7 @@ export type AnalysisResponse = z.infer; // ============================================================================= export const ToolInputSchema = z.object({ - command: z.string(), + command: z.string().min(1), }); export type ToolInput = z.infer; diff --git a/tests/unit/runner.test.ts b/tests/unit/runner.test.ts index e66b3bc..0d779f9 100644 --- a/tests/unit/runner.test.ts +++ b/tests/unit/runner.test.ts @@ -1,5 +1,5 @@ import { describe, it, expect } from 'vitest'; -import { buildDockerExecInvocation, stripThinkingTags, extractCommandFromText, findJsonBlocks, extractErrorOutput } from '../../src/lib/runner.js'; +import { buildDockerExecInvocation, stripThinkingTags, extractCommandFromText, findJsonBlocks, extractErrorOutput, trimMessages } from '../../src/lib/runner.js'; describe('buildDockerExecInvocation', () => { it('uses docker exec with stdin script mode (no bash -c)', () => { @@ -225,3 +225,107 @@ describe('extractErrorOutput', () => { expect(extractErrorOutput(err)).toBe('stderr output'); }); }); + +// ============================================================================= +// trimMessages +// ============================================================================= + +describe('trimMessages', () => { + const MAX_CONTEXT_MESSAGES = 40; // mirrors constants.ts + + function makeMessages(count: number, startRole: 'user' | 'assistant' = 'user') { + const roles = ['user', 'assistant'] as const; + const offset = startRole === 'user' ? 0 : 1; + return Array.from({ length: count }, (_, i) => ({ + role: roles[(i + offset) % 2], + content: `msg-${i}`, + })); + } + + it('returns messages unchanged when under limit', () => { + const msgs = makeMessages(10); + expect(trimMessages(msgs)).toEqual(msgs); + }); + + it('returns messages unchanged when at limit', () => { + const msgs = makeMessages(MAX_CONTEXT_MESSAGES); + expect(trimMessages(msgs)).toEqual(msgs); + }); + + it('trims messages over limit preserving anchor', () => { + const msgs = makeMessages(MAX_CONTEXT_MESSAGES + 10); + const result = trimMessages(msgs); + // First message preserved + expect(result[0]).toBe(msgs[0]); + // Last message preserved + expect(result[result.length - 1]).toBe(msgs[msgs.length - 1]); + // Length is at most MAX_CONTEXT_MESSAGES + expect(result.length).toBeLessThanOrEqual(MAX_CONTEXT_MESSAGES); + }); + + it('preserves role alternation after trim', () => { + const msgs = makeMessages(MAX_CONTEXT_MESSAGES + 10); + const result = trimMessages(msgs); + for (let i = 1; i < result.length; i++) { + expect(result[i].role).not.toBe(result[i - 1].role); + } + }); + + it('drops adjacent same-role when anchor matches tail[0]', () => { + // Force anchor and tail[0] to share a role by using an even-offset count + // anchor is user (index 0), and we need tail[0] to also be user + // With alternating roles, tail[0] role depends on the slice offset + // Build a custom array where this collision happens + const msgs = makeMessages(MAX_CONTEXT_MESSAGES + 1); + // msgs[0].role = 'user', tail = msgs.slice(-39) + // msgs.slice(-39)[0] = msgs[MAX_CONTEXT_MESSAGES + 1 - 39] = msgs[2] + // msgs[2].role = 'user' — collision! tail[0] should be dropped + const result = trimMessages(msgs); + expect(result[0].role).toBe('user'); + expect(result[1].role).not.toBe('user'); + // Verify no adjacent same-role + for (let i = 1; i < result.length; i++) { + expect(result[i].role).not.toBe(result[i - 1].role); + } + }); + + it('drops multiple consecutive same-role messages at trim boundary', () => { + // Build array where the trim boundary lands on multiple same-role messages + // that collide with the anchor (messages[0]). + // Anchor = user. We need tail[0], tail[1], ... to also be 'user'. + const msgs: { role: string; content: string }[] = [ + { role: 'user', content: 'anchor' }, + ]; + // Fill with alternating roles up to the trim point + for (let i = 1; i <= 5; i++) { + msgs.push({ role: i % 2 === 0 ? 'user' : 'assistant', content: `early-${i}` }); + } + // Now add a block of consecutive 'user' messages (simulating tool results) + // followed by normal alternation to fill past the limit + msgs.push({ role: 'user', content: 'tool-1' }); + msgs.push({ role: 'user', content: 'tool-2' }); + msgs.push({ role: 'user', content: 'tool-3' }); + // Fill remaining with alternating to go past limit + let nextRole: 'assistant' | 'user' = 'assistant'; + while (msgs.length <= MAX_CONTEXT_MESSAGES + 5) { + msgs.push({ role: nextRole, content: `fill-${msgs.length}` }); + nextRole = nextRole === 'assistant' ? 'user' : 'assistant'; + } + + const result = trimMessages(msgs); + // Anchor preserved + expect(result[0].role).toBe('user'); + // result[1] must not be 'user' (anchor collision resolved) + expect(result[1].role).not.toBe('user'); + }); + + it('works with OpenAI-style system role anchor', () => { + const msgs: { role: string; content: string }[] = [ + { role: 'system', content: 'system prompt' }, + ...makeMessages(MAX_CONTEXT_MESSAGES + 10).slice(1), + ]; + const result = trimMessages(msgs); + expect(result[0].role).toBe('system'); + expect(result.length).toBeLessThanOrEqual(MAX_CONTEXT_MESSAGES); + }); +});