OpenHands · csmith49 · May 29, 2026 · May 28, 2026 · May 29, 2026 · May 29, 2026
diff --git a/openhands/automation/preset_router.py b/openhands/automation/preset_router.py
@@ -465,18 +465,45 @@ async def create_automation_from_prompt(
 
 # --- Plugin Preset ---
 
+MAX_VARIANTS = 10  # Upper bound on variants accepted per A/B experiment request.
+
+
+class ExperimentVariant(BaseModel):
+    """One A/B test arm: a named, non-empty plugin set with a selection weight."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    name: str = Field(..., min_length=1, max_length=100)
+    weight: int = Field(..., gt=0, description="Relative selection weight (> 0)")
+    plugins: list[PluginSource] = Field(
+        ...,
+        min_length=1,
+        description="Plugin(s) for this variant.",
+    )
+
 
 class CreatePluginAutomationRequest(BaseModel):
     """Request to create an automation using plugins."""
 
     model_config = ConfigDict(extra="forbid")
 
     name: str = Field(..., min_length=1, max_length=500)
-    plugins: list[PluginSource] = Field(
-        ...,
-        description="Plugin(s) to load. Can be a single plugin or a list of plugins. "
-        "Each plugin specifies a source (github:owner/repo, git URL, or local path), "
-        "optional ref (branch/tag/commit), and optional repo_path for monorepos.",
+    plugins: list[PluginSource] | None = Field(
+        default=None,
+        description="Plugin(s) to load. Mutually exclusive with 'variants'.",
+    )
+    variants: list[ExperimentVariant] | None = Field(
+        default=None,
+        description=(
+            "A/B test variants. Each variant specifies its own plugin set and a "
+            "relative weight. Mutually exclusive with 'plugins'."
+        ),
+    )
+    experiment_id: str | None = Field(
+        default=None,
+        min_length=1,
+        max_length=200,
+        description="Required when using variants. A human-readable experiment name.",
     )
     prompt: str = Field(
         ...,
@@ -524,7 +551,7 @@ def normalize_plugins_and_repos(cls, data: dict) -> dict:  # type: ignore[type-a
         """Normalize plugins and repos to always be lists."""
         if isinstance(data, dict):
             # Normalize plugins
-            if "plugins" in data:
+            if "plugins" in data and data["plugins"] is not None:
                 plugins = data["plugins"]
                 if isinstance(plugins, dict):
                     data["plugins"] = [plugins]
@@ -537,46 +564,77 @@ def normalize_plugins_and_repos(cls, data: dict) -> dict:  # type: ignore[type-a
                     data["repos"] = [repos]
         return data
 
+    @model_validator(mode="after")
+    def validate_plugins_or_variants(self) -> "CreatePluginAutomationRequest":
+        """Require exactly one of plugins/variants; validate experiment fields."""
+        if (self.plugins is None) == (self.variants is None):
+            raise ValueError("Exactly one of 'plugins' or 'variants' must be provided.")
+
+        if self.variants is not None:
+            if self.experiment_id is None:
+                raise ValueError("'experiment_id' is required when using 'variants'.")
+            if len(self.variants) < 2:
+                raise ValueError("At least two variants are required for an A/B test.")
+            if len(self.variants) > MAX_VARIANTS:
+                raise ValueError(f"At most {MAX_VARIANTS} variants are allowed.")
+            names = [v.name for v in self.variants]
+            if len(names) != len(set(names)):
+                raise ValueError("Variant names must be unique.")
+        else:
+            if self.experiment_id is not None:
+                raise ValueError(
+                    "'experiment_id' can only be used with 'variants', not 'plugins'."
+                )
+
+        return self
+
 
 def _generate_plugin_tarball(
-    plugins: list[PluginSource], prompt: str, repos: list[RepoSource] | None = None
+    plugins: list[PluginSource] | None,
+    prompt: str,
+    repos: list[RepoSource] | None = None,
+    *,
+    experiment_id: str | None = None,
+    variants: list[ExperimentVariant] | None = None,
 ) -> bytes:
     """Generate a tarball containing SDK code, plugin config, and prompt.
 
-    The tarball contains:
-    - main.py: SDK boilerplate that loads plugins and runs conversation
-    - plugins_config.json: List of plugin sources (serialized PluginSource models)
-    - prompt.txt: The prompt to send
-    - setup.sh: Script to install the SDK
-    - repos_config.json: (optional) Repository configuration for cloning
-
-    Note: Clone and skill loading functionality is now provided by the SDK's
-    OpenHandsCloudWorkspace.clone_repos() and load_skills_from_agent_server()
-    methods, so separate scripts are no longer needed.
-
-    Args:
-        plugins: List of plugins to load
-        prompt: The user's prompt text
-        repos: Optional list of repositories to clone
-
-    Returns:
-        bytes: The tarball content as bytes
+    When *variants* is provided the tarball contains ``experiment_config.json``
+    instead of ``plugins_config.json``.  The two are mutually exclusive.
     """
     preset_files = _load_plugin_preset_files()
 
-    # Serialize plugins using Pydantic (exclude None values for cleaner JSON)
-    plugins_config = [p.model_dump(exclude_none=True) for p in plugins]
-    plugins_config_json = json.dumps(plugins_config, indent=2)
-
     tarball_buffer = io.BytesIO()
 
     with tarfile.open(fileobj=tarball_buffer, mode="w:gz") as tar:
         _add_file_to_tar(tar, "main.py", preset_files["main.py"])
-        _add_file_to_tar(tar, "plugins_config.json", plugins_config_json)
         _add_file_to_tar(tar, "prompt.txt", prompt)
         _add_file_to_tar(tar, "setup.sh", preset_files["setup.sh"], mode=0o755)
 
-        # Add repos config if repos specified (SDK workspace handles cloning)
+        if variants is not None:
+            experiment_config = {
+                "experiment_id": experiment_id,
+                "variants": [
+                    {
+                        "name": v.name,
+                        "weight": v.weight,
+                        "plugins": [p.model_dump(exclude_none=True) for p in v.plugins],
+                    }
+                    for v in variants
+                ],
+            }
+            _add_file_to_tar(
+                tar,
+                "experiment_config.json",
+                json.dumps(experiment_config, indent=2),
+            )
+        else:
+            assert plugins is not None  # guaranteed by caller
+            plugins_config = [p.model_dump(exclude_none=True) for p in plugins]
+            _add_file_to_tar(
+                tar, "plugins_config.json", json.dumps(plugins_config, indent=2)
+            )
+
         if repos:
             repos_config = [r.model_dump(exclude_none=True) for r in repos]
             _add_file_to_tar(
@@ -622,24 +680,37 @@ async def create_automation_from_plugin(
     """
     model = resolve_model_profile_for_user(body.model, user)
 
-    # 1. Generate tarball with SDK code, plugin config, prompt, and repos config
+    # 1. Generate tarball: SDK code, plugin/experiment config, prompt, repos config
     tarball_content = _generate_plugin_tarball(
-        body.plugins, body.prompt, repos=body.repos
+        body.plugins,
+        body.prompt,
+        repos=body.repos,
+        experiment_id=body.experiment_id,
+        variants=body.variants,
     )
 
     # 2. Upload tarball to storage
     upload_id = uuid.uuid4()
     storage_path = _build_storage_path(user.org_id, user.user_id, upload_id)
 
     # Create upload record
-    plugin_sources_str = _format_plugin_sources_for_description(body.plugins)
-    truncated_sources = _safe_truncate(plugin_sources_str, 100)
+    if body.variants is not None:
+        variant_names = ", ".join(v.name for v in body.variants)
+        description = _safe_truncate(
+            f"A/B experiment {body.experiment_id}: {variant_names}", 200
+        )
+    else:
+        assert body.plugins is not None  # guaranteed by validator
+        plugin_sources_str = _format_plugin_sources_for_description(body.plugins)
+        truncated = _safe_truncate(plugin_sources_str, 100)
+        description = f"Auto-generated with plugins: {truncated}"
+
     upload = TarballUpload(
         id=upload_id,
         user_id=user.user_id,
         org_id=user.org_id,
         name=f"plugin-automation-{_safe_truncate(body.name, 50)}",
-        description=f"Auto-generated with plugins: {truncated_sources}",
+        description=description,
         status=UploadStatus.UPLOADING,
         storage_path=storage_path,
     )
@@ -697,14 +768,17 @@ async def create_automation_from_plugin(
             detail=f"Failed to create automation: {e!s}",
         )
 
-    logger.info(
-        "Created automation from plugin",
-        extra={
-            "automation_id": str(automation.id),
-            "upload_id": str(upload_id),
-            "plugin_count": len(body.plugins),
-            "prompt_length": len(body.prompt),
-        },
-    )
+    log_extra: dict[str, Any] = {
+        "automation_id": str(automation.id),
+        "upload_id": str(upload_id),
+        "prompt_length": len(body.prompt),
+    }
+    if body.variants is not None:
+        log_extra["experiment_id"] = body.experiment_id
+        log_extra["variant_count"] = len(body.variants)
+    elif body.plugins is not None:
+        log_extra["plugin_count"] = len(body.plugins)
+
+    logger.info("Created automation from plugin", extra=log_extra)
 
     return AutomationResponse.model_validate(automation)
diff --git a/openhands/automation/presets/plugin/sdk_main.py b/openhands/automation/presets/plugin/sdk_main.py
@@ -64,6 +64,7 @@
 
 import json
 import os
+import random
 import sys
 import time
 
@@ -215,11 +216,32 @@
         repos_context = workspace.get_repos_context(clone_result.repo_mappings)
 
     # Load configuration files
+    EXPERIMENT_CONFIG_FILE = os.path.join(SCRIPT_DIR, "experiment_config.json")
     PLUGINS_CONFIG_FILE = os.path.join(SCRIPT_DIR, "plugins_config.json")
     PROMPT_FILE = os.path.join(SCRIPT_DIR, "prompt.txt")
 
-    with open(PLUGINS_CONFIG_FILE) as f:
-        plugins_config = json.load(f)
+    # Experiment-aware variant selection
+    experiment_id: str | None = None
+    selected_variant: str | None = None
+
+    if os.path.exists(EXPERIMENT_CONFIG_FILE):
+        with open(EXPERIMENT_CONFIG_FILE) as f:
+            experiment_config = json.load(f)
+
+        experiment_id = experiment_config["experiment_id"]
+        variants = experiment_config["variants"]
+        weights = [v["weight"] for v in variants]
+        selected = random.choices(variants, weights=weights, k=1)[0]
+
+        selected_variant = selected["name"]
+        plugins_config = selected["plugins"]
+        print("\n=== EXPERIMENT ===")
+        print(f"  id: {experiment_id}")
+        print(f"  variant: {selected_variant}")
+        print(f"  weights: {dict(zip([v['name'] for v in variants], weights))}")
+    else:
+        with open(PLUGINS_CONFIG_FILE) as f:
+            plugins_config = json.load(f)
 
     with open(PROMPT_FILE) as f:
         USER_PROMPT = f.read()
@@ -330,16 +352,30 @@ def event_callback(event) -> None:
         received_events.append(event)
         last_event_time["ts"] = time.time()
 
+    # Build experiment tags (if running an A/B test)
+    experiment_tags: dict[str, str] = {}
+    if experiment_id:
+        experiment_tags["experiment_id"] = experiment_id
+        if selected_variant is None:
+            raise RuntimeError(
+                "BUG: experiment_id is set but selected_variant is None — "
+                "experiment config may be malformed."
+            )
+        experiment_tags["variant"] = selected_variant
+
     conversation = Conversation(
         agent=agent,
         workspace=workspace,
         plugins=plugin_sources,  # All plugins loaded here
         callbacks=[event_callback],
         delete_on_close=False,  # Keep conversation history after completion
+        tags=experiment_tags or None,
     )
     assert isinstance(conversation, RemoteConversation)
     print(f"  conversation created: {type(conversation).__name__}")
     print(f"  plugins loaded: {len(plugin_sources)}")
+    if experiment_tags:
+        print(f"  experiment tags: {experiment_tags}")
 
     # Inject secrets into the conversation (auto-exported as env vars in bash)
     if secrets: