From 577f3c6b7396dbdef33976dac542f2c8cd1ee29a Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Thu, 28 May 2026 10:35:24 -0600 Subject: [PATCH 1/9] feat: add A/B testing support for plugin automations Extend the plugin preset endpoint to support experiment variants. Users can now provide 'variants' (mutually exclusive with 'plugins') to create A/B tests that randomly select a plugin configuration per run and tag the resulting conversation with experiment metadata. Changes: - preset_router.py: ExperimentVariant model, variants/experiment_id fields on CreatePluginAutomationRequest with mutual exclusivity validation, experiment_config.json tarball generation - sdk_main.py: experiment config detection, weighted variant selection via random.choices, experiment tags on Conversation() - tests: 12 new tests covering validation edge cases and tarball generation for both experiment and standard paths No database migrations. No SDK changes. No new endpoints. Co-authored-by: openhands --- openhands/automation/preset_router.py | 170 +++++++++---- .../automation/presets/plugin/sdk_main.py | 35 ++- tests/test_preset_router.py | 223 ++++++++++++++++++ 3 files changed, 381 insertions(+), 47 deletions(-) diff --git a/openhands/automation/preset_router.py b/openhands/automation/preset_router.py index d63ab10..dbd3914 100644 --- a/openhands/automation/preset_router.py +++ b/openhands/automation/preset_router.py @@ -316,6 +316,21 @@ async def create_automation_from_prompt( # --- Plugin Preset --- +MAX_VARIANTS = 10 + + +class ExperimentVariant(BaseModel): + """A single variant in an A/B test experiment.""" + + model_config = ConfigDict(extra="forbid") + + name: str = Field(..., min_length=1, max_length=100) + weight: int = Field(..., gt=0, description="Relative selection weight (must be > 0)") + plugins: list[PluginSource] = Field( + ..., + description="Plugin(s) for this variant.", + ) + class CreatePluginAutomationRequest(BaseModel): """Request to create an automation using plugins.""" @@ -323,11 +338,22 @@ class CreatePluginAutomationRequest(BaseModel): model_config = ConfigDict(extra="forbid") name: str = Field(..., min_length=1, max_length=500) - plugins: list[PluginSource] = Field( - ..., - description="Plugin(s) to load. Can be a single plugin or a list of plugins. " - "Each plugin specifies a source (github:owner/repo, git URL, or local path), " - "optional ref (branch/tag/commit), and optional repo_path for monorepos.", + plugins: list[PluginSource] | None = Field( + default=None, + description="Plugin(s) to load. Mutually exclusive with 'variants'.", + ) + variants: list[ExperimentVariant] | None = Field( + default=None, + description=( + "A/B test variants. Each variant specifies its own plugin set and a " + "relative weight. Mutually exclusive with 'plugins'." + ), + ) + experiment_id: str | None = Field( + default=None, + min_length=1, + max_length=200, + description="Required when using variants. A human-readable experiment name.", ) prompt: str = Field( ..., @@ -375,7 +401,7 @@ def normalize_plugins_and_repos(cls, data: dict) -> dict: # type: ignore[type-a """Normalize plugins and repos to always be lists.""" if isinstance(data, dict): # Normalize plugins - if "plugins" in data: + if "plugins" in data and data["plugins"] is not None: plugins = data["plugins"] if isinstance(plugins, dict): data["plugins"] = [plugins] @@ -388,46 +414,87 @@ def normalize_plugins_and_repos(cls, data: dict) -> dict: # type: ignore[type-a data["repos"] = [repos] return data + @model_validator(mode="after") + def validate_plugins_or_variants(self) -> "CreatePluginAutomationRequest": + """Enforce mutual exclusivity between plugins and variants.""" + has_plugins = self.plugins is not None + has_variants = self.variants is not None -def _generate_plugin_tarball( - plugins: list[PluginSource], prompt: str, repos: list[RepoSource] | None = None -) -> bytes: - """Generate a tarball containing SDK code, plugin config, and prompt. + if has_plugins == has_variants: + raise ValueError( + "Exactly one of 'plugins' or 'variants' must be provided." + ) - The tarball contains: - - main.py: SDK boilerplate that loads plugins and runs conversation - - plugins_config.json: List of plugin sources (serialized PluginSource models) - - prompt.txt: The prompt to send - - setup.sh: Script to install the SDK - - repos_config.json: (optional) Repository configuration for cloning + if has_variants: + if self.experiment_id is None: + raise ValueError( + "'experiment_id' is required when using 'variants'." + ) + if len(self.variants) < 2: + raise ValueError("At least two variants are required for an A/B test.") + if len(self.variants) > MAX_VARIANTS: + raise ValueError( + f"At most {MAX_VARIANTS} variants are allowed." + ) + names = [v.name for v in self.variants] + if len(names) != len(set(names)): + raise ValueError("Variant names must be unique.") + else: + if self.experiment_id is not None: + raise ValueError( + "'experiment_id' can only be used with 'variants', not 'plugins'." + ) + + return self - Note: Clone and skill loading functionality is now provided by the SDK's - OpenHandsCloudWorkspace.clone_repos() and load_skills_from_agent_server() - methods, so separate scripts are no longer needed. - Args: - plugins: List of plugins to load - prompt: The user's prompt text - repos: Optional list of repositories to clone +def _generate_plugin_tarball( + plugins: list[PluginSource] | None, + prompt: str, + repos: list[RepoSource] | None = None, + *, + experiment_id: str | None = None, + variants: list[ExperimentVariant] | None = None, +) -> bytes: + """Generate a tarball containing SDK code, plugin config, and prompt. - Returns: - bytes: The tarball content as bytes + When *variants* is provided the tarball contains ``experiment_config.json`` + instead of ``plugins_config.json``. The two are mutually exclusive. """ preset_files = _load_plugin_preset_files() - # Serialize plugins using Pydantic (exclude None values for cleaner JSON) - plugins_config = [p.model_dump(exclude_none=True) for p in plugins] - plugins_config_json = json.dumps(plugins_config, indent=2) - tarball_buffer = io.BytesIO() with tarfile.open(fileobj=tarball_buffer, mode="w:gz") as tar: _add_file_to_tar(tar, "main.py", preset_files["main.py"]) - _add_file_to_tar(tar, "plugins_config.json", plugins_config_json) _add_file_to_tar(tar, "prompt.txt", prompt) _add_file_to_tar(tar, "setup.sh", preset_files["setup.sh"], mode=0o755) - # Add repos config if repos specified (SDK workspace handles cloning) + if variants is not None: + experiment_config = { + "experiment_id": experiment_id, + "variants": [ + { + "name": v.name, + "weight": v.weight, + "plugins": [ + p.model_dump(exclude_none=True) for p in v.plugins + ], + } + for v in variants + ], + } + _add_file_to_tar( + tar, + "experiment_config.json", + json.dumps(experiment_config, indent=2), + ) + else: + plugins_config = [p.model_dump(exclude_none=True) for p in plugins] # type: ignore[union-attr] + _add_file_to_tar( + tar, "plugins_config.json", json.dumps(plugins_config, indent=2) + ) + if repos: repos_config = [r.model_dump(exclude_none=True) for r in repos] _add_file_to_tar( @@ -472,10 +539,15 @@ async def create_automation_from_plugin( - With repo_path: subdirectory for monorepos """ model = resolve_model_profile_for_user(body.model, user) + is_experiment = body.variants is not None - # 1. Generate tarball with SDK code, plugin config, prompt, and repos config + # 1. Generate tarball with SDK code, plugin/experiment config, and prompt tarball_content = _generate_plugin_tarball( - body.plugins, body.prompt, repos=body.repos + body.plugins, + body.prompt, + repos=body.repos, + experiment_id=body.experiment_id, + variants=body.variants, ) # 2. Upload tarball to storage @@ -483,14 +555,19 @@ async def create_automation_from_plugin( storage_path = _build_storage_path(user.org_id, user.user_id, upload_id) # Create upload record - plugin_sources_str = _format_plugin_sources_for_description(body.plugins) - truncated_sources = _safe_truncate(plugin_sources_str, 100) + if is_experiment: + variant_names = ", ".join(v.name for v in body.variants) # type: ignore[union-attr] + description = f"A/B experiment {body.experiment_id}: {variant_names}" + else: + plugin_sources_str = _format_plugin_sources_for_description(body.plugins) # type: ignore[arg-type] + description = f"Auto-generated with plugins: {_safe_truncate(plugin_sources_str, 100)}" + upload = TarballUpload( id=upload_id, user_id=user.user_id, org_id=user.org_id, name=f"plugin-automation-{_safe_truncate(body.name, 50)}", - description=f"Auto-generated with plugins: {truncated_sources}", + description=description, status=UploadStatus.UPLOADING, storage_path=storage_path, ) @@ -548,14 +625,17 @@ async def create_automation_from_plugin( detail=f"Failed to create automation: {e!s}", ) - logger.info( - "Created automation from plugin", - extra={ - "automation_id": str(automation.id), - "upload_id": str(upload_id), - "plugin_count": len(body.plugins), - "prompt_length": len(body.prompt), - }, - ) + log_extra: dict[str, Any] = { + "automation_id": str(automation.id), + "upload_id": str(upload_id), + "prompt_length": len(body.prompt), + } + if is_experiment: + log_extra["experiment_id"] = body.experiment_id + log_extra["variant_count"] = len(body.variants) # type: ignore[arg-type] + else: + log_extra["plugin_count"] = len(body.plugins) # type: ignore[arg-type] + + logger.info("Created automation from plugin", extra=log_extra) return AutomationResponse.model_validate(automation) diff --git a/openhands/automation/presets/plugin/sdk_main.py b/openhands/automation/presets/plugin/sdk_main.py index 69c30de..54ea7c4 100644 --- a/openhands/automation/presets/plugin/sdk_main.py +++ b/openhands/automation/presets/plugin/sdk_main.py @@ -64,6 +64,7 @@ import json import os +import random import sys import time @@ -215,11 +216,32 @@ repos_context = workspace.get_repos_context(clone_result.repo_mappings) # Load configuration files + EXPERIMENT_CONFIG_FILE = os.path.join(SCRIPT_DIR, "experiment_config.json") PLUGINS_CONFIG_FILE = os.path.join(SCRIPT_DIR, "plugins_config.json") PROMPT_FILE = os.path.join(SCRIPT_DIR, "prompt.txt") - with open(PLUGINS_CONFIG_FILE) as f: - plugins_config = json.load(f) + # Experiment-aware variant selection + experiment_id: str | None = None + selected_variant: str | None = None + + if os.path.exists(EXPERIMENT_CONFIG_FILE): + with open(EXPERIMENT_CONFIG_FILE) as f: + experiment_config = json.load(f) + + experiment_id = experiment_config["experiment_id"] + variants = experiment_config["variants"] + weights = [v["weight"] for v in variants] + selected = random.choices(variants, weights=weights, k=1)[0] + + selected_variant = selected["name"] + plugins_config = selected["plugins"] + print(f"\n=== EXPERIMENT ===") + print(f" id: {experiment_id}") + print(f" variant: {selected_variant}") + print(f" weights: {dict(zip([v['name'] for v in variants], weights))}") + else: + with open(PLUGINS_CONFIG_FILE) as f: + plugins_config = json.load(f) with open(PROMPT_FILE) as f: USER_PROMPT = f.read() @@ -330,16 +352,25 @@ def event_callback(event) -> None: received_events.append(event) last_event_time["ts"] = time.time() + # Build experiment tags (if running an A/B test) + experiment_tags: dict[str, str] = {} + if experiment_id: + experiment_tags["experiment_id"] = experiment_id + experiment_tags["variant"] = selected_variant # type: ignore[assignment] + conversation = Conversation( agent=agent, workspace=workspace, plugins=plugin_sources, # All plugins loaded here callbacks=[event_callback], delete_on_close=False, # Keep conversation history after completion + tags=experiment_tags or None, ) assert isinstance(conversation, RemoteConversation) print(f" conversation created: {type(conversation).__name__}") print(f" plugins loaded: {len(plugin_sources)}") + if experiment_tags: + print(f" experiment tags: {experiment_tags}") # Inject secrets into the conversation (auto-exported as env vars in bash) if secrets: diff --git a/tests/test_preset_router.py b/tests/test_preset_router.py index 4ccf750..0ba2122 100644 --- a/tests/test_preset_router.py +++ b/tests/test_preset_router.py @@ -854,6 +854,229 @@ def test_generate_plugin_tarball_with_repos(self): assert repos_config[1]["ref"] == "develop" +class TestExperimentVariantValidation: + """Tests for A/B test variant validation on CreatePluginAutomationRequest.""" + + def test_variants_accepted(self): + """Valid variants request is accepted.""" + from openhands.automation.preset_router import CreatePluginAutomationRequest + + request = CreatePluginAutomationRequest.model_validate( + { + "name": "AB Test", + "experiment_id": "my-experiment", + "variants": [ + {"name": "control", "weight": 50, "plugins": [{"source": "github:owner/repo", "ref": "v1"}]}, + {"name": "treatment", "weight": 50, "plugins": [{"source": "github:owner/repo", "ref": "v2"}]}, + ], + "prompt": "Test prompt", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + } + ) + assert request.variants is not None + assert len(request.variants) == 2 + assert request.plugins is None + assert request.experiment_id == "my-experiment" + + def test_plugins_and_variants_mutually_exclusive(self): + """Providing both plugins and variants raises error.""" + from openhands.automation.preset_router import CreatePluginAutomationRequest + + with pytest.raises(ValueError, match="Exactly one of"): + CreatePluginAutomationRequest.model_validate( + { + "name": "Bad", + "plugins": [{"source": "github:owner/repo"}], + "variants": [ + {"name": "a", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, + {"name": "b", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, + ], + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + } + ) + + def test_neither_plugins_nor_variants_rejected(self): + """Providing neither plugins nor variants raises error.""" + from openhands.automation.preset_router import CreatePluginAutomationRequest + + with pytest.raises(ValueError, match="Exactly one of"): + CreatePluginAutomationRequest.model_validate( + { + "name": "Bad", + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + } + ) + + def test_experiment_id_required_with_variants(self): + """experiment_id is required when variants is used.""" + from openhands.automation.preset_router import CreatePluginAutomationRequest + + with pytest.raises(ValueError, match="experiment_id.*required"): + CreatePluginAutomationRequest.model_validate( + { + "name": "Test", + "variants": [ + {"name": "a", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, + {"name": "b", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, + ], + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + } + ) + + def test_experiment_id_rejected_with_plugins(self): + """experiment_id cannot be used with plugins (only variants).""" + from openhands.automation.preset_router import CreatePluginAutomationRequest + + with pytest.raises(ValueError, match="experiment_id.*can only be used with"): + CreatePluginAutomationRequest.model_validate( + { + "name": "Test", + "plugins": [{"source": "github:owner/repo"}], + "experiment_id": "oops", + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + } + ) + + def test_single_variant_rejected(self): + """At least two variants are required.""" + from openhands.automation.preset_router import CreatePluginAutomationRequest + + with pytest.raises(ValueError, match="At least two variants"): + CreatePluginAutomationRequest.model_validate( + { + "name": "Test", + "experiment_id": "test", + "variants": [ + {"name": "only", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, + ], + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + } + ) + + def test_duplicate_variant_names_rejected(self): + """Variant names must be unique.""" + from openhands.automation.preset_router import CreatePluginAutomationRequest + + with pytest.raises(ValueError, match="unique"): + CreatePluginAutomationRequest.model_validate( + { + "name": "Test", + "experiment_id": "test", + "variants": [ + {"name": "same", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, + {"name": "same", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, + ], + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + } + ) + + def test_zero_weight_rejected(self): + """Variant weight must be positive.""" + from openhands.automation.preset_router import CreatePluginAutomationRequest + + with pytest.raises(ValueError): + CreatePluginAutomationRequest.model_validate( + { + "name": "Test", + "experiment_id": "test", + "variants": [ + {"name": "a", "weight": 0, "plugins": [{"source": "github:owner/repo"}]}, + {"name": "b", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, + ], + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + } + ) + + def test_too_many_variants_rejected(self): + """More than MAX_VARIANTS is rejected.""" + from openhands.automation.preset_router import ( + MAX_VARIANTS, + CreatePluginAutomationRequest, + ) + + variants = [ + {"name": f"v{i}", "weight": 1, "plugins": [{"source": "github:owner/repo"}]} + for i in range(MAX_VARIANTS + 1) + ] + with pytest.raises(ValueError, match="At most"): + CreatePluginAutomationRequest.model_validate( + { + "name": "Test", + "experiment_id": "test", + "variants": variants, + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + } + ) + + +class TestExperimentTarball: + """Tests for experiment (A/B) tarball generation.""" + + def test_experiment_tarball_contains_experiment_config(self): + """Experiment tarball has experiment_config.json, not plugins_config.json.""" + from openhands.automation.preset_router import ExperimentVariant + + variants = [ + ExperimentVariant(name="control", weight=50, plugins=[PluginSource(source="github:owner/repo", ref="v1")]), + ExperimentVariant(name="treatment", weight=50, plugins=[PluginSource(source="github:owner/repo", ref="v2")]), + ] + tarball_bytes = _generate_plugin_tarball( + None, "Test prompt", experiment_id="test-exp", variants=variants, + ) + + with tarfile.open(fileobj=io.BytesIO(tarball_bytes), mode="r:gz") as tar: + names = tar.getnames() + assert "experiment_config.json" in names + assert "plugins_config.json" not in names + assert "main.py" in names + assert "prompt.txt" in names + assert "setup.sh" in names + + def test_experiment_config_content(self): + """experiment_config.json has correct structure.""" + from openhands.automation.preset_router import ExperimentVariant + + variants = [ + ExperimentVariant(name="control", weight=70, plugins=[PluginSource(source="github:owner/repo", ref="v1")]), + ExperimentVariant(name="treatment", weight=30, plugins=[PluginSource(source="github:owner/repo", ref="v2")]), + ] + tarball_bytes = _generate_plugin_tarball( + None, "Test prompt", experiment_id="my-exp", variants=variants, + ) + + with tarfile.open(fileobj=io.BytesIO(tarball_bytes), mode="r:gz") as tar: + config_file = tar.extractfile("experiment_config.json") + assert config_file is not None + config = json.loads(config_file.read().decode("utf-8")) + + assert config["experiment_id"] == "my-exp" + assert len(config["variants"]) == 2 + assert config["variants"][0]["name"] == "control" + assert config["variants"][0]["weight"] == 70 + assert config["variants"][0]["plugins"][0]["source"] == "github:owner/repo" + assert config["variants"][0]["plugins"][0]["ref"] == "v1" + assert config["variants"][1]["name"] == "treatment" + assert config["variants"][1]["weight"] == 30 + + def test_standard_tarball_unchanged(self): + """Non-experiment tarball still produces plugins_config.json.""" + plugins = [PluginSource(source="github:owner/repo", ref="main")] + tarball_bytes = _generate_plugin_tarball(plugins, "Test prompt") + + with tarfile.open(fileobj=io.BytesIO(tarball_bytes), mode="r:gz") as tar: + names = tar.getnames() + assert "plugins_config.json" in names + assert "experiment_config.json" not in names + + @requires_docker class TestCreateAutomationFromPlugin: """Tests for POST /v1/preset/plugin endpoint.""" From 4cfb866a2939b3adeec72959ca01493885fc3fd2 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Thu, 28 May 2026 19:40:07 -0600 Subject: [PATCH 2/9] test: add SQLite-based integration tests for A/B testing End-to-end tests that run without Docker by using SQLite + mock file store. Covers: - API creation flow (201 response, tarball contents, experiment_config.json) - Runtime variant selection logic (weight distribution, determinism) - Backward compatibility (standard plugin automations unchanged) - Validation errors via API (422 responses for invalid requests) 15 tests, all passing. Co-authored-by: openhands --- tests/test_ab_testing_integration.py | 432 +++++++++++++++++++++++++++ 1 file changed, 432 insertions(+) create mode 100644 tests/test_ab_testing_integration.py diff --git a/tests/test_ab_testing_integration.py b/tests/test_ab_testing_integration.py new file mode 100644 index 0000000..ba48fa3 --- /dev/null +++ b/tests/test_ab_testing_integration.py @@ -0,0 +1,432 @@ +"""Integration tests for A/B testing plugin variants. + +Uses SQLite (no Docker needed) to test the full flow: + 1. Create an A/B test automation via POST /v1/preset/plugin + 2. Verify the API response + 3. Inspect the generated tarball (experiment_config.json) + 4. Simulate the sdk_main.py variant selection logic + 5. Verify backward compatibility with standard plugin automations +""" + +import io +import json +import os +import random +import tarfile +import uuid +from collections.abc import AsyncGenerator, AsyncIterator +from unittest.mock import AsyncMock, MagicMock + +import pytest +from httpx import ASGITransport, AsyncClient +from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine + +os.environ["LOG_JSON"] = "0" + +from openhands.automation.app import app # noqa: E402 +from openhands.automation.auth import ( # noqa: E402 + AuthenticatedUser, + AuthMethod, + authenticate_request, + create_http_client, +) +from openhands.automation.db import get_session # noqa: E402 +from openhands.automation.models import Base # noqa: E402 +from openhands.automation.storage import get_file_store # noqa: E402 + + +# --- Fixtures (SQLite-based, no Docker) --- + +TEST_USER_ID = uuid.UUID("12345678-1234-5678-1234-567812345678") +TEST_ORG_ID = uuid.UUID("87654321-4321-8765-4321-876543218765") + + +@pytest.fixture +async def async_engine(): + """Create an in-memory SQLite engine.""" + engine = create_async_engine( + "sqlite+aiosqlite://", + connect_args={"check_same_thread": False}, + ) + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + yield engine + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.drop_all) + await engine.dispose() + + +@pytest.fixture +async def async_session_factory(async_engine): + return async_sessionmaker( + async_engine, class_=AsyncSession, expire_on_commit=False, + ) + + +@pytest.fixture +async def async_session(async_session_factory) -> AsyncGenerator[AsyncSession, None]: + async with async_session_factory() as session: + yield session + + +@pytest.fixture +def mock_authenticated_user(): + return AuthenticatedUser( + user_id=TEST_USER_ID, + org_id=TEST_ORG_ID, + email="test@example.com", + role="owner", + permissions=["view_org_settings", "manage_automations"], + auth_method=AuthMethod.API_KEY, + api_key="test-api-key", + ) + + +@pytest.fixture +def mock_file_store(): + """Mock file store that captures uploaded content.""" + store = MagicMock() + store._captured_content = None + + async def mock_write_stream( + path: str, + stream: AsyncIterator[bytes], + max_size: int | None = None, + content_type: str = "application/octet-stream", + ) -> int: + content = b"" + async for chunk in stream: + content += chunk + store._captured_content = content + return len(content) + + store.write_stream = AsyncMock(side_effect=mock_write_stream) + store.delete = MagicMock() + return store + + +@pytest.fixture +async def client( + async_engine, async_session_factory, async_session, + mock_authenticated_user, mock_file_store, +) -> AsyncGenerator[AsyncClient, None]: + """Async test client with SQLite DB, mock auth, and mock file store.""" + async def override_get_session(): + yield async_session + + app.dependency_overrides[get_session] = override_get_session + app.dependency_overrides[authenticate_request] = lambda: mock_authenticated_user + app.dependency_overrides[get_file_store] = lambda: mock_file_store + + app.state.engine = async_engine + app.state.session_factory = async_session_factory + app.state.http_client = create_http_client() + + async with AsyncClient( + transport=ASGITransport(app=app), base_url="http://test", + ) as c: + yield c + + await app.state.http_client.aclose() + app.dependency_overrides.clear() + + +# --- Helpers --- + + +def _extract_tarball(mock_store) -> dict[str, bytes]: + """Extract all files from the captured tarball.""" + assert mock_store._captured_content is not None + files = {} + with tarfile.open( + fileobj=io.BytesIO(mock_store._captured_content), mode="r:gz" + ) as tar: + for member in tar.getmembers(): + f = tar.extractfile(member) + if f: + files[member.name] = f.read() + return files + + +def _simulate_variant_selection(experiment_config: dict, seed: int = 42) -> str: + """Run the same selection logic as sdk_main.py and return the variant name.""" + rng = random.Random(seed) + variants = experiment_config["variants"] + weights = [v["weight"] for v in variants] + selected = rng.choices(variants, weights=weights, k=1)[0] + return selected["name"] + + +# --- Tests --- + + +class TestABTestAutomationCreation: + """End-to-end: create an A/B test automation via the API.""" + + AB_PAYLOAD = { + "name": "PR Review A/B Test", + "experiment_id": "pr-review-v2-test", + "variants": [ + { + "name": "control", + "weight": 70, + "plugins": [ + {"source": "github:OpenHands/extensions", + "repo_path": "plugins/pr-review", "ref": "v1.0.0"}, + ], + }, + { + "name": "treatment", + "weight": 30, + "plugins": [ + {"source": "github:OpenHands/extensions", + "repo_path": "plugins/pr-review", "ref": "v2.0.0"}, + ], + }, + ], + "prompt": "Review this PR for code quality and potential bugs.", + "trigger": { + "type": "event", + "source": "github", + "on": "pull_request.opened", + }, + } + + async def test_create_ab_automation_returns_201(self, client, mock_file_store): + """POST with variants returns 201 and correct metadata.""" + resp = await client.post( + "/api/automation/v1/preset/plugin", json=self.AB_PAYLOAD, + ) + assert resp.status_code == 201, resp.text + data = resp.json() + + assert data["name"] == "PR Review A/B Test" + assert data["prompt"] == self.AB_PAYLOAD["prompt"] + assert data["trigger"]["type"] == "event" + assert data["tarball_path"].startswith("oh-internal://uploads/") + assert data["enabled"] is True + + async def test_tarball_contains_experiment_config(self, client, mock_file_store): + """Generated tarball has experiment_config.json, not plugins_config.json.""" + resp = await client.post( + "/api/automation/v1/preset/plugin", json=self.AB_PAYLOAD, + ) + assert resp.status_code == 201 + + files = _extract_tarball(mock_file_store) + assert "experiment_config.json" in files + assert "plugins_config.json" not in files + assert "main.py" in files + assert "prompt.txt" in files + assert "setup.sh" in files + + async def test_experiment_config_matches_request(self, client, mock_file_store): + """experiment_config.json faithfully represents the request.""" + resp = await client.post( + "/api/automation/v1/preset/plugin", json=self.AB_PAYLOAD, + ) + assert resp.status_code == 201 + + files = _extract_tarball(mock_file_store) + config = json.loads(files["experiment_config.json"]) + + assert config["experiment_id"] == "pr-review-v2-test" + assert len(config["variants"]) == 2 + + control = config["variants"][0] + assert control["name"] == "control" + assert control["weight"] == 70 + assert control["plugins"][0]["ref"] == "v1.0.0" + + treatment = config["variants"][1] + assert treatment["name"] == "treatment" + assert treatment["weight"] == 30 + assert treatment["plugins"][0]["ref"] == "v2.0.0" + + async def test_main_py_has_experiment_support(self, client, mock_file_store): + """main.py template includes experiment detection and tagging code.""" + resp = await client.post( + "/api/automation/v1/preset/plugin", json=self.AB_PAYLOAD, + ) + assert resp.status_code == 201 + + files = _extract_tarball(mock_file_store) + main_py = files["main.py"].decode("utf-8") + + assert "experiment_config.json" in main_py + assert "experiment_id" in main_py + assert "selected_variant" in main_py + assert "random.choices" in main_py + assert "experiment_tags" in main_py + + +class TestVariantSelectionLogic: + """Test the runtime variant selection as it would run in sdk_main.py.""" + + EXPERIMENT_CONFIG = { + "experiment_id": "test-experiment", + "variants": [ + {"name": "control", "weight": 80, "plugins": [{"source": "github:o/r", "ref": "v1"}]}, + {"name": "treatment", "weight": 20, "plugins": [{"source": "github:o/r", "ref": "v2"}]}, + ], + } + + def test_selection_respects_weights_distribution(self): + """Over many runs, variant selection roughly follows weight ratios.""" + counts = {"control": 0, "treatment": 0} + for seed in range(1000): + name = _simulate_variant_selection(self.EXPERIMENT_CONFIG, seed=seed) + counts[name] += 1 + + # 80/20 weights → expect ~800/200 with some variance + assert counts["control"] > 600, f"control selected only {counts['control']}/1000 times" + assert counts["treatment"] > 100, f"treatment selected only {counts['treatment']}/1000 times" + + def test_deterministic_with_same_seed(self): + """Same seed always picks the same variant.""" + v1 = _simulate_variant_selection(self.EXPERIMENT_CONFIG, seed=12345) + v2 = _simulate_variant_selection(self.EXPERIMENT_CONFIG, seed=12345) + assert v1 == v2 + + def test_selected_variant_has_plugins(self): + """The selected variant carries its plugin config.""" + variants = self.EXPERIMENT_CONFIG["variants"] + weights = [v["weight"] for v in variants] + rng = random.Random(42) + selected = rng.choices(variants, weights=weights, k=1)[0] + + assert "plugins" in selected + assert len(selected["plugins"]) == 1 + assert "source" in selected["plugins"][0] + + def test_equal_weights_both_variants_appear(self): + """With 50/50 weights, both variants should appear over many runs.""" + config = { + "experiment_id": "equal", + "variants": [ + {"name": "a", "weight": 50, "plugins": []}, + {"name": "b", "weight": 50, "plugins": []}, + ], + } + seen = {_simulate_variant_selection(config, seed=s) for s in range(100)} + assert seen == {"a", "b"} + + +class TestBackwardCompatibility: + """Standard plugin automations still work exactly as before.""" + + STANDARD_PAYLOAD = { + "name": "Standard Plugin Automation", + "plugins": [ + {"source": "github:owner/plugin", "ref": "v1.0.0"}, + ], + "prompt": "Do something with the plugin.", + "trigger": {"type": "cron", "schedule": "0 9 * * *"}, + } + + async def test_standard_plugin_automation_returns_201(self, client, mock_file_store): + """Standard (non-experiment) request still works.""" + resp = await client.post( + "/api/automation/v1/preset/plugin", json=self.STANDARD_PAYLOAD, + ) + assert resp.status_code == 201, resp.text + data = resp.json() + assert data["name"] == "Standard Plugin Automation" + + async def test_standard_tarball_has_plugins_config(self, client, mock_file_store): + """Standard tarball uses plugins_config.json, no experiment_config.json.""" + resp = await client.post( + "/api/automation/v1/preset/plugin", json=self.STANDARD_PAYLOAD, + ) + assert resp.status_code == 201 + + files = _extract_tarball(mock_file_store) + assert "plugins_config.json" in files + assert "experiment_config.json" not in files + + config = json.loads(files["plugins_config.json"]) + assert len(config) == 1 + assert config[0]["source"] == "github:owner/plugin" + assert config[0]["ref"] == "v1.0.0" + + +class TestABTestValidationViaAPI: + """Validation errors return proper 422 responses through the API.""" + + async def test_both_plugins_and_variants_rejected(self, client): + resp = await client.post( + "/api/automation/v1/preset/plugin", + json={ + "name": "Bad", + "plugins": [{"source": "github:o/r"}], + "variants": [ + {"name": "a", "weight": 1, "plugins": [{"source": "github:o/r"}]}, + {"name": "b", "weight": 1, "plugins": [{"source": "github:o/r"}]}, + ], + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + }, + ) + assert resp.status_code == 422 + + async def test_missing_experiment_id_rejected(self, client): + resp = await client.post( + "/api/automation/v1/preset/plugin", + json={ + "name": "Bad", + "variants": [ + {"name": "a", "weight": 1, "plugins": [{"source": "github:o/r"}]}, + {"name": "b", "weight": 1, "plugins": [{"source": "github:o/r"}]}, + ], + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + }, + ) + assert resp.status_code == 422 + + async def test_single_variant_rejected(self, client): + resp = await client.post( + "/api/automation/v1/preset/plugin", + json={ + "name": "Bad", + "experiment_id": "test", + "variants": [ + {"name": "only", "weight": 1, "plugins": [{"source": "github:o/r"}]}, + ], + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + }, + ) + assert resp.status_code == 422 + + async def test_duplicate_names_rejected(self, client): + resp = await client.post( + "/api/automation/v1/preset/plugin", + json={ + "name": "Bad", + "experiment_id": "test", + "variants": [ + {"name": "same", "weight": 1, "plugins": [{"source": "github:o/r"}]}, + {"name": "same", "weight": 1, "plugins": [{"source": "github:o/r"}]}, + ], + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + }, + ) + assert resp.status_code == 422 + + async def test_zero_weight_rejected(self, client): + resp = await client.post( + "/api/automation/v1/preset/plugin", + json={ + "name": "Bad", + "experiment_id": "test", + "variants": [ + {"name": "a", "weight": 0, "plugins": [{"source": "github:o/r"}]}, + {"name": "b", "weight": 1, "plugins": [{"source": "github:o/r"}]}, + ], + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + }, + ) + assert resp.status_code == 422 From a16161678fb3aab3491bf99e187e4a90984d3833 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Fri, 29 May 2026 08:49:37 -0600 Subject: [PATCH 3/9] style: fix ruff format + RUF012 lint errors - Break long lines in preset_router.py (E501) - Add ClassVar annotations to test class attributes (RUF012) - Apply ruff format to both test files Co-authored-by: openhands --- openhands/automation/preset_router.py | 25 +++---- tests/test_ab_testing_integration.py | 97 ++++++++++++++++++------- tests/test_preset_router.py | 100 +++++++++++++++++++++----- 3 files changed, 165 insertions(+), 57 deletions(-) diff --git a/openhands/automation/preset_router.py b/openhands/automation/preset_router.py index dbd3914..21cdf67 100644 --- a/openhands/automation/preset_router.py +++ b/openhands/automation/preset_router.py @@ -325,7 +325,7 @@ class ExperimentVariant(BaseModel): model_config = ConfigDict(extra="forbid") name: str = Field(..., min_length=1, max_length=100) - weight: int = Field(..., gt=0, description="Relative selection weight (must be > 0)") + weight: int = Field(..., gt=0, description="Relative selection weight (> 0)") plugins: list[PluginSource] = Field( ..., description="Plugin(s) for this variant.", @@ -421,21 +421,15 @@ def validate_plugins_or_variants(self) -> "CreatePluginAutomationRequest": has_variants = self.variants is not None if has_plugins == has_variants: - raise ValueError( - "Exactly one of 'plugins' or 'variants' must be provided." - ) + raise ValueError("Exactly one of 'plugins' or 'variants' must be provided.") if has_variants: if self.experiment_id is None: - raise ValueError( - "'experiment_id' is required when using 'variants'." - ) + raise ValueError("'experiment_id' is required when using 'variants'.") if len(self.variants) < 2: raise ValueError("At least two variants are required for an A/B test.") if len(self.variants) > MAX_VARIANTS: - raise ValueError( - f"At most {MAX_VARIANTS} variants are allowed." - ) + raise ValueError(f"At most {MAX_VARIANTS} variants are allowed.") names = [v.name for v in self.variants] if len(names) != len(set(names)): raise ValueError("Variant names must be unique.") @@ -477,9 +471,7 @@ def _generate_plugin_tarball( { "name": v.name, "weight": v.weight, - "plugins": [ - p.model_dump(exclude_none=True) for p in v.plugins - ], + "plugins": [p.model_dump(exclude_none=True) for p in v.plugins], } for v in variants ], @@ -559,8 +551,11 @@ async def create_automation_from_plugin( variant_names = ", ".join(v.name for v in body.variants) # type: ignore[union-attr] description = f"A/B experiment {body.experiment_id}: {variant_names}" else: - plugin_sources_str = _format_plugin_sources_for_description(body.plugins) # type: ignore[arg-type] - description = f"Auto-generated with plugins: {_safe_truncate(plugin_sources_str, 100)}" + plugin_sources_str = _format_plugin_sources_for_description( + body.plugins # type: ignore[arg-type] + ) + truncated = _safe_truncate(plugin_sources_str, 100) + description = f"Auto-generated with plugins: {truncated}" upload = TarballUpload( id=upload_id, diff --git a/tests/test_ab_testing_integration.py b/tests/test_ab_testing_integration.py index ba48fa3..989c612 100644 --- a/tests/test_ab_testing_integration.py +++ b/tests/test_ab_testing_integration.py @@ -15,12 +15,14 @@ import tarfile import uuid from collections.abc import AsyncGenerator, AsyncIterator +from typing import ClassVar from unittest.mock import AsyncMock, MagicMock import pytest from httpx import ASGITransport, AsyncClient from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine + os.environ["LOG_JSON"] = "0" from openhands.automation.app import app # noqa: E402 @@ -59,7 +61,9 @@ async def async_engine(): @pytest.fixture async def async_session_factory(async_engine): return async_sessionmaker( - async_engine, class_=AsyncSession, expire_on_commit=False, + async_engine, + class_=AsyncSession, + expire_on_commit=False, ) @@ -107,10 +111,14 @@ async def mock_write_stream( @pytest.fixture async def client( - async_engine, async_session_factory, async_session, - mock_authenticated_user, mock_file_store, + async_engine, + async_session_factory, + async_session, + mock_authenticated_user, + mock_file_store, ) -> AsyncGenerator[AsyncClient, None]: """Async test client with SQLite DB, mock auth, and mock file store.""" + async def override_get_session(): yield async_session @@ -123,7 +131,8 @@ async def override_get_session(): app.state.http_client = create_http_client() async with AsyncClient( - transport=ASGITransport(app=app), base_url="http://test", + transport=ASGITransport(app=app), + base_url="http://test", ) as c: yield c @@ -163,7 +172,7 @@ def _simulate_variant_selection(experiment_config: dict, seed: int = 42) -> str: class TestABTestAutomationCreation: """End-to-end: create an A/B test automation via the API.""" - AB_PAYLOAD = { + AB_PAYLOAD: ClassVar[dict] = { "name": "PR Review A/B Test", "experiment_id": "pr-review-v2-test", "variants": [ @@ -171,16 +180,22 @@ class TestABTestAutomationCreation: "name": "control", "weight": 70, "plugins": [ - {"source": "github:OpenHands/extensions", - "repo_path": "plugins/pr-review", "ref": "v1.0.0"}, + { + "source": "github:OpenHands/extensions", + "repo_path": "plugins/pr-review", + "ref": "v1.0.0", + }, ], }, { "name": "treatment", "weight": 30, "plugins": [ - {"source": "github:OpenHands/extensions", - "repo_path": "plugins/pr-review", "ref": "v2.0.0"}, + { + "source": "github:OpenHands/extensions", + "repo_path": "plugins/pr-review", + "ref": "v2.0.0", + }, ], }, ], @@ -195,7 +210,8 @@ class TestABTestAutomationCreation: async def test_create_ab_automation_returns_201(self, client, mock_file_store): """POST with variants returns 201 and correct metadata.""" resp = await client.post( - "/api/automation/v1/preset/plugin", json=self.AB_PAYLOAD, + "/api/automation/v1/preset/plugin", + json=self.AB_PAYLOAD, ) assert resp.status_code == 201, resp.text data = resp.json() @@ -209,7 +225,8 @@ async def test_create_ab_automation_returns_201(self, client, mock_file_store): async def test_tarball_contains_experiment_config(self, client, mock_file_store): """Generated tarball has experiment_config.json, not plugins_config.json.""" resp = await client.post( - "/api/automation/v1/preset/plugin", json=self.AB_PAYLOAD, + "/api/automation/v1/preset/plugin", + json=self.AB_PAYLOAD, ) assert resp.status_code == 201 @@ -223,7 +240,8 @@ async def test_tarball_contains_experiment_config(self, client, mock_file_store) async def test_experiment_config_matches_request(self, client, mock_file_store): """experiment_config.json faithfully represents the request.""" resp = await client.post( - "/api/automation/v1/preset/plugin", json=self.AB_PAYLOAD, + "/api/automation/v1/preset/plugin", + json=self.AB_PAYLOAD, ) assert resp.status_code == 201 @@ -246,7 +264,8 @@ async def test_experiment_config_matches_request(self, client, mock_file_store): async def test_main_py_has_experiment_support(self, client, mock_file_store): """main.py template includes experiment detection and tagging code.""" resp = await client.post( - "/api/automation/v1/preset/plugin", json=self.AB_PAYLOAD, + "/api/automation/v1/preset/plugin", + json=self.AB_PAYLOAD, ) assert resp.status_code == 201 @@ -263,11 +282,19 @@ async def test_main_py_has_experiment_support(self, client, mock_file_store): class TestVariantSelectionLogic: """Test the runtime variant selection as it would run in sdk_main.py.""" - EXPERIMENT_CONFIG = { + EXPERIMENT_CONFIG: ClassVar[dict] = { "experiment_id": "test-experiment", "variants": [ - {"name": "control", "weight": 80, "plugins": [{"source": "github:o/r", "ref": "v1"}]}, - {"name": "treatment", "weight": 20, "plugins": [{"source": "github:o/r", "ref": "v2"}]}, + { + "name": "control", + "weight": 80, + "plugins": [{"source": "github:o/r", "ref": "v1"}], + }, + { + "name": "treatment", + "weight": 20, + "plugins": [{"source": "github:o/r", "ref": "v2"}], + }, ], } @@ -279,8 +306,12 @@ def test_selection_respects_weights_distribution(self): counts[name] += 1 # 80/20 weights → expect ~800/200 with some variance - assert counts["control"] > 600, f"control selected only {counts['control']}/1000 times" - assert counts["treatment"] > 100, f"treatment selected only {counts['treatment']}/1000 times" + assert counts["control"] > 600, ( + f"control selected only {counts['control']}/1000 times" + ) + assert counts["treatment"] > 100, ( + f"treatment selected only {counts['treatment']}/1000 times" + ) def test_deterministic_with_same_seed(self): """Same seed always picks the same variant.""" @@ -315,7 +346,7 @@ def test_equal_weights_both_variants_appear(self): class TestBackwardCompatibility: """Standard plugin automations still work exactly as before.""" - STANDARD_PAYLOAD = { + STANDARD_PAYLOAD: ClassVar[dict] = { "name": "Standard Plugin Automation", "plugins": [ {"source": "github:owner/plugin", "ref": "v1.0.0"}, @@ -324,10 +355,13 @@ class TestBackwardCompatibility: "trigger": {"type": "cron", "schedule": "0 9 * * *"}, } - async def test_standard_plugin_automation_returns_201(self, client, mock_file_store): + async def test_standard_plugin_automation_returns_201( + self, client, mock_file_store + ): """Standard (non-experiment) request still works.""" resp = await client.post( - "/api/automation/v1/preset/plugin", json=self.STANDARD_PAYLOAD, + "/api/automation/v1/preset/plugin", + json=self.STANDARD_PAYLOAD, ) assert resp.status_code == 201, resp.text data = resp.json() @@ -336,7 +370,8 @@ async def test_standard_plugin_automation_returns_201(self, client, mock_file_st async def test_standard_tarball_has_plugins_config(self, client, mock_file_store): """Standard tarball uses plugins_config.json, no experiment_config.json.""" resp = await client.post( - "/api/automation/v1/preset/plugin", json=self.STANDARD_PAYLOAD, + "/api/automation/v1/preset/plugin", + json=self.STANDARD_PAYLOAD, ) assert resp.status_code == 201 @@ -391,7 +426,11 @@ async def test_single_variant_rejected(self, client): "name": "Bad", "experiment_id": "test", "variants": [ - {"name": "only", "weight": 1, "plugins": [{"source": "github:o/r"}]}, + { + "name": "only", + "weight": 1, + "plugins": [{"source": "github:o/r"}], + }, ], "prompt": "Test", "trigger": {"type": "cron", "schedule": "0 0 * * *"}, @@ -406,8 +445,16 @@ async def test_duplicate_names_rejected(self, client): "name": "Bad", "experiment_id": "test", "variants": [ - {"name": "same", "weight": 1, "plugins": [{"source": "github:o/r"}]}, - {"name": "same", "weight": 1, "plugins": [{"source": "github:o/r"}]}, + { + "name": "same", + "weight": 1, + "plugins": [{"source": "github:o/r"}], + }, + { + "name": "same", + "weight": 1, + "plugins": [{"source": "github:o/r"}], + }, ], "prompt": "Test", "trigger": {"type": "cron", "schedule": "0 0 * * *"}, diff --git a/tests/test_preset_router.py b/tests/test_preset_router.py index 0ba2122..6f7322b 100644 --- a/tests/test_preset_router.py +++ b/tests/test_preset_router.py @@ -866,8 +866,16 @@ def test_variants_accepted(self): "name": "AB Test", "experiment_id": "my-experiment", "variants": [ - {"name": "control", "weight": 50, "plugins": [{"source": "github:owner/repo", "ref": "v1"}]}, - {"name": "treatment", "weight": 50, "plugins": [{"source": "github:owner/repo", "ref": "v2"}]}, + { + "name": "control", + "weight": 50, + "plugins": [{"source": "github:owner/repo", "ref": "v1"}], + }, + { + "name": "treatment", + "weight": 50, + "plugins": [{"source": "github:owner/repo", "ref": "v2"}], + }, ], "prompt": "Test prompt", "trigger": {"type": "cron", "schedule": "0 0 * * *"}, @@ -888,8 +896,16 @@ def test_plugins_and_variants_mutually_exclusive(self): "name": "Bad", "plugins": [{"source": "github:owner/repo"}], "variants": [ - {"name": "a", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, - {"name": "b", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, + { + "name": "a", + "weight": 1, + "plugins": [{"source": "github:owner/repo"}], + }, + { + "name": "b", + "weight": 1, + "plugins": [{"source": "github:owner/repo"}], + }, ], "prompt": "Test", "trigger": {"type": "cron", "schedule": "0 0 * * *"}, @@ -918,8 +934,16 @@ def test_experiment_id_required_with_variants(self): { "name": "Test", "variants": [ - {"name": "a", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, - {"name": "b", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, + { + "name": "a", + "weight": 1, + "plugins": [{"source": "github:owner/repo"}], + }, + { + "name": "b", + "weight": 1, + "plugins": [{"source": "github:owner/repo"}], + }, ], "prompt": "Test", "trigger": {"type": "cron", "schedule": "0 0 * * *"}, @@ -951,7 +975,11 @@ def test_single_variant_rejected(self): "name": "Test", "experiment_id": "test", "variants": [ - {"name": "only", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, + { + "name": "only", + "weight": 1, + "plugins": [{"source": "github:owner/repo"}], + }, ], "prompt": "Test", "trigger": {"type": "cron", "schedule": "0 0 * * *"}, @@ -968,8 +996,16 @@ def test_duplicate_variant_names_rejected(self): "name": "Test", "experiment_id": "test", "variants": [ - {"name": "same", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, - {"name": "same", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, + { + "name": "same", + "weight": 1, + "plugins": [{"source": "github:owner/repo"}], + }, + { + "name": "same", + "weight": 1, + "plugins": [{"source": "github:owner/repo"}], + }, ], "prompt": "Test", "trigger": {"type": "cron", "schedule": "0 0 * * *"}, @@ -986,8 +1022,16 @@ def test_zero_weight_rejected(self): "name": "Test", "experiment_id": "test", "variants": [ - {"name": "a", "weight": 0, "plugins": [{"source": "github:owner/repo"}]}, - {"name": "b", "weight": 1, "plugins": [{"source": "github:owner/repo"}]}, + { + "name": "a", + "weight": 0, + "plugins": [{"source": "github:owner/repo"}], + }, + { + "name": "b", + "weight": 1, + "plugins": [{"source": "github:owner/repo"}], + }, ], "prompt": "Test", "trigger": {"type": "cron", "schedule": "0 0 * * *"}, @@ -1025,11 +1069,22 @@ def test_experiment_tarball_contains_experiment_config(self): from openhands.automation.preset_router import ExperimentVariant variants = [ - ExperimentVariant(name="control", weight=50, plugins=[PluginSource(source="github:owner/repo", ref="v1")]), - ExperimentVariant(name="treatment", weight=50, plugins=[PluginSource(source="github:owner/repo", ref="v2")]), + ExperimentVariant( + name="control", + weight=50, + plugins=[PluginSource(source="github:owner/repo", ref="v1")], + ), + ExperimentVariant( + name="treatment", + weight=50, + plugins=[PluginSource(source="github:owner/repo", ref="v2")], + ), ] tarball_bytes = _generate_plugin_tarball( - None, "Test prompt", experiment_id="test-exp", variants=variants, + None, + "Test prompt", + experiment_id="test-exp", + variants=variants, ) with tarfile.open(fileobj=io.BytesIO(tarball_bytes), mode="r:gz") as tar: @@ -1045,11 +1100,22 @@ def test_experiment_config_content(self): from openhands.automation.preset_router import ExperimentVariant variants = [ - ExperimentVariant(name="control", weight=70, plugins=[PluginSource(source="github:owner/repo", ref="v1")]), - ExperimentVariant(name="treatment", weight=30, plugins=[PluginSource(source="github:owner/repo", ref="v2")]), + ExperimentVariant( + name="control", + weight=70, + plugins=[PluginSource(source="github:owner/repo", ref="v1")], + ), + ExperimentVariant( + name="treatment", + weight=30, + plugins=[PluginSource(source="github:owner/repo", ref="v2")], + ), ] tarball_bytes = _generate_plugin_tarball( - None, "Test prompt", experiment_id="my-exp", variants=variants, + None, + "Test prompt", + experiment_id="my-exp", + variants=variants, ) with tarfile.open(fileobj=io.BytesIO(tarball_bytes), mode="r:gz") as tar: From 50d8a264ff850305a7ffd8d7dd88410101673f71 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Fri, 29 May 2026 09:01:29 -0600 Subject: [PATCH 4/9] fix: resolve pyright type-narrowing errors for Optional variants/plugins Replace type: ignore comments with proper None-checks so pyright can narrow list[ExperimentVariant] | None to list[ExperimentVariant]. Co-authored-by: openhands --- openhands/automation/preset_router.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/openhands/automation/preset_router.py b/openhands/automation/preset_router.py index 21cdf67..b17f946 100644 --- a/openhands/automation/preset_router.py +++ b/openhands/automation/preset_router.py @@ -423,7 +423,7 @@ def validate_plugins_or_variants(self) -> "CreatePluginAutomationRequest": if has_plugins == has_variants: raise ValueError("Exactly one of 'plugins' or 'variants' must be provided.") - if has_variants: + if has_variants and self.variants is not None: if self.experiment_id is None: raise ValueError("'experiment_id' is required when using 'variants'.") if len(self.variants) < 2: @@ -547,13 +547,12 @@ async def create_automation_from_plugin( storage_path = _build_storage_path(user.org_id, user.user_id, upload_id) # Create upload record - if is_experiment: - variant_names = ", ".join(v.name for v in body.variants) # type: ignore[union-attr] + if is_experiment and body.variants is not None: + variant_names = ", ".join(v.name for v in body.variants) description = f"A/B experiment {body.experiment_id}: {variant_names}" else: - plugin_sources_str = _format_plugin_sources_for_description( - body.plugins # type: ignore[arg-type] - ) + assert body.plugins is not None + plugin_sources_str = _format_plugin_sources_for_description(body.plugins) truncated = _safe_truncate(plugin_sources_str, 100) description = f"Auto-generated with plugins: {truncated}" @@ -625,11 +624,11 @@ async def create_automation_from_plugin( "upload_id": str(upload_id), "prompt_length": len(body.prompt), } - if is_experiment: + if is_experiment and body.variants is not None: log_extra["experiment_id"] = body.experiment_id - log_extra["variant_count"] = len(body.variants) # type: ignore[arg-type] - else: - log_extra["plugin_count"] = len(body.plugins) # type: ignore[arg-type] + log_extra["variant_count"] = len(body.variants) + elif body.plugins is not None: + log_extra["plugin_count"] = len(body.plugins) logger.info("Created automation from plugin", extra=log_extra) From d6fb711fe99024abbcaea0dc573c1afa11ead0e8 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Fri, 29 May 2026 10:14:12 -0600 Subject: [PATCH 5/9] =?UTF-8?q?refactor:=20address=20review=20feedback=20?= =?UTF-8?q?=E2=80=94=20tighten=20type=20safety,=20fix=20docstring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Drop intermediate bool vars (has_variants/is_experiment), use direct 'is not None' checks so pyright narrows without redundant conditions - Replace 'type: ignore' comments with explicit asserts in _generate_plugin_tarball and sdk_main.py experiment_tags block - Fix misleading docstring on _simulate_variant_selection test helper (seeded RNG ≠ 'same logic' as production) Co-authored-by: openhands --- openhands/automation/preset_router.py | 17 +++++++---------- openhands/automation/presets/plugin/sdk_main.py | 3 ++- tests/test_ab_testing_integration.py | 6 +++++- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/openhands/automation/preset_router.py b/openhands/automation/preset_router.py index 660c89e..bd3dc78 100644 --- a/openhands/automation/preset_router.py +++ b/openhands/automation/preset_router.py @@ -566,13 +566,10 @@ def normalize_plugins_and_repos(cls, data: dict) -> dict: # type: ignore[type-a @model_validator(mode="after") def validate_plugins_or_variants(self) -> "CreatePluginAutomationRequest": """Enforce mutual exclusivity between plugins and variants.""" - has_plugins = self.plugins is not None - has_variants = self.variants is not None - - if has_plugins == has_variants: + if (self.plugins is None) == (self.variants is None): raise ValueError("Exactly one of 'plugins' or 'variants' must be provided.") - if has_variants and self.variants is not None: + if self.variants is not None: if self.experiment_id is None: raise ValueError("'experiment_id' is required when using 'variants'.") if len(self.variants) < 2: @@ -631,7 +628,8 @@ def _generate_plugin_tarball( json.dumps(experiment_config, indent=2), ) else: - plugins_config = [p.model_dump(exclude_none=True) for p in plugins] # type: ignore[union-attr] + assert plugins is not None # guaranteed by caller + plugins_config = [p.model_dump(exclude_none=True) for p in plugins] _add_file_to_tar( tar, "plugins_config.json", json.dumps(plugins_config, indent=2) ) @@ -680,7 +678,6 @@ async def create_automation_from_plugin( - With repo_path: subdirectory for monorepos """ model = resolve_model_profile_for_user(body.model, user) - is_experiment = body.variants is not None # 1. Generate tarball with SDK code, plugin/experiment config, and prompt tarball_content = _generate_plugin_tarball( @@ -696,11 +693,11 @@ async def create_automation_from_plugin( storage_path = _build_storage_path(user.org_id, user.user_id, upload_id) # Create upload record - if is_experiment and body.variants is not None: + if body.variants is not None: variant_names = ", ".join(v.name for v in body.variants) description = f"A/B experiment {body.experiment_id}: {variant_names}" else: - assert body.plugins is not None + assert body.plugins is not None # guaranteed by validator plugin_sources_str = _format_plugin_sources_for_description(body.plugins) truncated = _safe_truncate(plugin_sources_str, 100) description = f"Auto-generated with plugins: {truncated}" @@ -773,7 +770,7 @@ async def create_automation_from_plugin( "upload_id": str(upload_id), "prompt_length": len(body.prompt), } - if is_experiment and body.variants is not None: + if body.variants is not None: log_extra["experiment_id"] = body.experiment_id log_extra["variant_count"] = len(body.variants) elif body.plugins is not None: diff --git a/openhands/automation/presets/plugin/sdk_main.py b/openhands/automation/presets/plugin/sdk_main.py index 54ea7c4..97055e1 100644 --- a/openhands/automation/presets/plugin/sdk_main.py +++ b/openhands/automation/presets/plugin/sdk_main.py @@ -356,7 +356,8 @@ def event_callback(event) -> None: experiment_tags: dict[str, str] = {} if experiment_id: experiment_tags["experiment_id"] = experiment_id - experiment_tags["variant"] = selected_variant # type: ignore[assignment] + assert selected_variant is not None # set when experiment_id is set + experiment_tags["variant"] = selected_variant conversation = Conversation( agent=agent, diff --git a/tests/test_ab_testing_integration.py b/tests/test_ab_testing_integration.py index 989c612..ec52e8d 100644 --- a/tests/test_ab_testing_integration.py +++ b/tests/test_ab_testing_integration.py @@ -158,7 +158,11 @@ def _extract_tarball(mock_store) -> dict[str, bytes]: def _simulate_variant_selection(experiment_config: dict, seed: int = 42) -> str: - """Run the same selection logic as sdk_main.py and return the variant name.""" + """Select a variant using the same weighted-random logic as sdk_main.py. + + Uses a seeded RNG for deterministic test assertions; production code + uses the global (unseeded) random module. + """ rng = random.Random(seed) variants = experiment_config["variants"] weights = [v["weight"] for v in variants] From 7bceaaa7adb392d87cf5b210e0720c3239aebcac Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Fri, 29 May 2026 10:29:27 -0600 Subject: [PATCH 6/9] =?UTF-8?q?fix:=20address=20round-2=20review=20?= =?UTF-8?q?=E2=80=94=20empty=20plugins=20bug,=20f-prefix=20nit,=20assert?= =?UTF-8?q?=20safety?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add min_length=1 to ExperimentVariant.plugins to reject empty lists - Remove extraneous f-prefix on non-interpolated string in sdk_main.py - Replace assert with explicit RuntimeError guard in sdk_main.py since generated code runs in sandboxes where python -O could strip asserts Co-authored-by: openhands --- openhands/automation/preset_router.py | 1 + openhands/automation/presets/plugin/sdk_main.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/openhands/automation/preset_router.py b/openhands/automation/preset_router.py index bd3dc78..ad9ca91 100644 --- a/openhands/automation/preset_router.py +++ b/openhands/automation/preset_router.py @@ -477,6 +477,7 @@ class ExperimentVariant(BaseModel): weight: int = Field(..., gt=0, description="Relative selection weight (> 0)") plugins: list[PluginSource] = Field( ..., + min_length=1, description="Plugin(s) for this variant.", ) diff --git a/openhands/automation/presets/plugin/sdk_main.py b/openhands/automation/presets/plugin/sdk_main.py index 97055e1..068238f 100644 --- a/openhands/automation/presets/plugin/sdk_main.py +++ b/openhands/automation/presets/plugin/sdk_main.py @@ -235,7 +235,7 @@ selected_variant = selected["name"] plugins_config = selected["plugins"] - print(f"\n=== EXPERIMENT ===") + print("\n=== EXPERIMENT ===") print(f" id: {experiment_id}") print(f" variant: {selected_variant}") print(f" weights: {dict(zip([v['name'] for v in variants], weights))}") @@ -356,7 +356,11 @@ def event_callback(event) -> None: experiment_tags: dict[str, str] = {} if experiment_id: experiment_tags["experiment_id"] = experiment_id - assert selected_variant is not None # set when experiment_id is set + if selected_variant is None: + raise RuntimeError( + "BUG: experiment_id is set but selected_variant is None — " + "experiment config may be malformed." + ) experiment_tags["variant"] = selected_variant conversation = Conversation( From 2baf544cff74db194111e1d92de9614fba9e9554 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Fri, 29 May 2026 10:43:52 -0600 Subject: [PATCH 7/9] fix: truncate experiment description, add MAX_VARIANTS boundary test - Wrap experiment description in _safe_truncate(..., 200) for consistency with the plugin description path - Add test_max_variants_accepted to close off-by-one gap Co-authored-by: openhands --- openhands/automation/preset_router.py | 4 +++- tests/test_preset_router.py | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/openhands/automation/preset_router.py b/openhands/automation/preset_router.py index ad9ca91..815541d 100644 --- a/openhands/automation/preset_router.py +++ b/openhands/automation/preset_router.py @@ -696,7 +696,9 @@ async def create_automation_from_plugin( # Create upload record if body.variants is not None: variant_names = ", ".join(v.name for v in body.variants) - description = f"A/B experiment {body.experiment_id}: {variant_names}" + description = _safe_truncate( + f"A/B experiment {body.experiment_id}: {variant_names}", 200 + ) else: assert body.plugins is not None # guaranteed by validator plugin_sources_str = _format_plugin_sources_for_description(body.plugins) diff --git a/tests/test_preset_router.py b/tests/test_preset_router.py index 7b770f9..e35dad2 100644 --- a/tests/test_preset_router.py +++ b/tests/test_preset_router.py @@ -1091,6 +1091,28 @@ def test_zero_weight_rejected(self): } ) + def test_max_variants_accepted(self): + """Exactly MAX_VARIANTS variants is accepted.""" + from openhands.automation.preset_router import ( + MAX_VARIANTS, + CreatePluginAutomationRequest, + ) + + variants = [ + {"name": f"v{i}", "weight": 1, "plugins": [{"source": "github:owner/repo"}]} + for i in range(MAX_VARIANTS) + ] + req = CreatePluginAutomationRequest.model_validate( + { + "name": "Test", + "experiment_id": "boundary-test", + "variants": variants, + "prompt": "Test", + "trigger": {"type": "cron", "schedule": "0 0 * * *"}, + } + ) + assert len(req.variants) == MAX_VARIANTS + def test_too_many_variants_rejected(self): """More than MAX_VARIANTS is rejected.""" from openhands.automation.preset_router import ( From b044db59c42f96b202e18b1ea68ae4134432ed99 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Fri, 29 May 2026 10:50:12 -0600 Subject: [PATCH 8/9] fix: narrow Optional type in boundary test for pyright Co-authored-by: openhands --- tests/test_preset_router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_preset_router.py b/tests/test_preset_router.py index e35dad2..3bc88fc 100644 --- a/tests/test_preset_router.py +++ b/tests/test_preset_router.py @@ -1111,7 +1111,7 @@ def test_max_variants_accepted(self): "trigger": {"type": "cron", "schedule": "0 0 * * *"}, } ) - assert len(req.variants) == MAX_VARIANTS + assert req.variants is not None and len(req.variants) == MAX_VARIANTS def test_too_many_variants_rejected(self): """More than MAX_VARIANTS is rejected.""" From d44c383edd58ebbae6a74531a6904eea282dc3e3 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Fri, 29 May 2026 11:01:02 -0600 Subject: [PATCH 9/9] fix: use valid plugin entries in test fixtures for min_length=1 consistency Co-authored-by: openhands --- tests/test_ab_testing_integration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_ab_testing_integration.py b/tests/test_ab_testing_integration.py index ec52e8d..c86bc69 100644 --- a/tests/test_ab_testing_integration.py +++ b/tests/test_ab_testing_integration.py @@ -339,8 +339,8 @@ def test_equal_weights_both_variants_appear(self): config = { "experiment_id": "equal", "variants": [ - {"name": "a", "weight": 50, "plugins": []}, - {"name": "b", "weight": 50, "plugins": []}, + {"name": "a", "weight": 50, "plugins": [{"source": "github:o/r"}]}, + {"name": "b", "weight": 50, "plugins": [{"source": "github:o/r"}]}, ], } seen = {_simulate_variant_selection(config, seed=s) for s in range(100)}