From da893df76a9d2fb9a0c3721e660b5b2b366acb45 Mon Sep 17 00:00:00 2001
From: David Mihalcik <dmihalcik@virtru.com>
Date: Thu, 12 Feb 2026 20:39:56 -0500
Subject: [PATCH 1/5] feat(xtest): add audit cancel event tests

Add test coverage for deferred/cancel audit event guarantees
with 340-line test file covering cancel audit event scenarios.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: David Mihalcik <dmihalcik@virtru.com>
---
 xtest/test_audit_cancel.py | 340 +++++++++++++++++++++++++++++++++++++
 1 file changed, 340 insertions(+)
 create mode 100644 xtest/test_audit_cancel.py

diff --git a/xtest/test_audit_cancel.py b/xtest/test_audit_cancel.py
new file mode 100644
index 000000000..76b1866da
--- /dev/null
+++ b/xtest/test_audit_cancel.py
@@ -0,0 +1,340 @@
+"""Integration tests for deferred audit event guarantees.
+
+These tests verify that the deferred audit pattern guarantees event logging
+even when operations are interrupted by client disconnection. The deferred
+pattern uses Go's `defer` to ensure audit events are always logged, regardless
+of how the request handler exits (success, failure, or cancellation).
+
+Strategy: Rather than guessing a fixed kill time, we launch K concurrent
+decrypt processes and kill them at staggered intervals. The concurrent load
+increases server processing time, widening the window for cancellation
+"sniping". By spreading kill times and observing which events appear,
+we adaptively find the right timing.
+
+Run with:
+    cd tests/xtest
+    uv run pytest test_audit_cancel.py --sdks go -v
+
+Note: These tests require audit log collection to be enabled. They will be
+skipped when running with --no-audit-logs.
+"""
+
+import filecmp
+import logging
+import os
+import signal
+import subprocess
+import time
+from pathlib import Path
+
+import pytest
+
+import tdfs
+from abac import Attribute
+from audit_logs import AuditLogAsserter, ParsedAuditEvent
+
+logger = logging.getLogger("xtest")
+
+# Number of concurrent decrypt processes to launch per wave
+CONCURRENT_DECRYPTS = 6
+
+# Staggered kill delays in seconds, spread across the likely processing window.
+# The first few are early (CLI startup), the middle ones target the gRPC call,
+# and the last ones catch slow processing under load.
+KILL_DELAYS = [0.05, 0.1, 0.2, 0.4, 0.8, 1.5]
+
+
+@pytest.fixture(autouse=True)
+def skip_if_audit_disabled(audit_logs: AuditLogAsserter):
+    """Skip all tests in this module if audit log collection is disabled."""
+    if not audit_logs.is_enabled:
+        pytest.skip("Audit log collection is disabled (--no-audit-logs)")
+
+
+def _build_decrypt_command(sdk: tdfs.SDK, ct_file: Path, rt_file: Path) -> list[str]:
+    """Build the decrypt command for a given SDK, suitable for subprocess.Popen."""
+    return [
+        sdk.path,
+        "decrypt",
+        str(ct_file),
+        str(rt_file),
+        "ztdf",
+    ]
+
+
+def _launch_and_kill_staggered(
+    cmd: list[str],
+    env: dict[str, str],
+    count: int,
+    kill_delays: list[float],
+    tmp_dir: Path,
+    prefix: str,
+) -> list[dict]:
+    """Launch `count` decrypt processes and kill them at staggered times.
+
+    Returns a list of dicts with timing and exit info for each process.
+    """
+    assert count == len(kill_delays), "count must match number of kill delays"
+
+    procs = []
+    for i in range(count):
+        # Each process needs its own output file
+        # cmd layout: [sdk_path, "decrypt", ct_file, rt_file, "ztdf"]
+        rt_file = tmp_dir / f"{prefix}-{i}.untdf"
+        proc_cmd = cmd[:3] + [str(rt_file)] + cmd[4:]  # Replace rt_file (index 3)
+        proc = subprocess.Popen(
+            proc_cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        procs.append(
+            {
+                "proc": proc,
+                "kill_delay": kill_delays[i],
+                "launch_time": time.monotonic(),
+                "index": i,
+            }
+        )
+
+    # Kill each process at its scheduled time
+    start = time.monotonic()
+    for info in sorted(procs, key=lambda x: x["kill_delay"]):
+        delay = info["kill_delay"]
+        elapsed = time.monotonic() - start
+        remaining = delay - elapsed
+        if remaining > 0:
+            time.sleep(remaining)
+        proc = info["proc"]
+        if proc.poll() is None:
+            proc.send_signal(signal.SIGTERM)
+            info["killed"] = True
+        else:
+            info["killed"] = False
+        info["kill_time"] = time.monotonic() - start
+
+    # Wait for all to exit
+    for info in procs:
+        try:
+            info["proc"].wait(timeout=5)
+        except subprocess.TimeoutExpired:
+            info["proc"].kill()
+            info["proc"].wait(timeout=2)
+        info["exit_code"] = info["proc"].returncode
+
+    return procs
+
+
+def _collect_rewrap_events(
+    audit_logs: AuditLogAsserter,
+    since_mark: str,
+    min_count: int = 1,
+    timeout: float = 15.0,
+) -> list[ParsedAuditEvent]:
+    """Collect rewrap audit events, retrying until we have at least min_count."""
+    deadline = time.monotonic() + timeout
+    best: list[ParsedAuditEvent] = []
+    while time.monotonic() < deadline:
+        events = audit_logs.get_parsed_audit_logs(
+            since_mark=since_mark, timeout=min(2.0, deadline - time.monotonic())
+        )
+        rewrap_events = [e for e in events if e.action_type == "rewrap"]
+        if len(rewrap_events) > len(best):
+            best = rewrap_events
+        if len(best) >= min_count:
+            return best
+        time.sleep(0.5)
+    return best
+
+
+class TestDeferredAuditGuarantees:
+    """Tests that the deferred audit pattern guarantees event logging.
+
+    The deferred pattern ensures audit events are logged even when:
+    - Operations succeed normally (baseline)
+    - Client disconnects (context cancellation)
+
+    These tests verify the core guarantee: an audit event is ALWAYS produced.
+    """
+
+    def test_rewrap_always_audited_on_success(
+        self,
+        encrypt_sdk: tdfs.SDK,
+        decrypt_sdk: tdfs.SDK,
+        pt_file: Path,
+        tmp_dir: Path,
+        audit_logs: AuditLogAsserter,
+        in_focus: set[tdfs.SDK],
+        attribute_default_rsa: Attribute,
+    ):
+        """Baseline: normal decrypt produces a rewrap audit event via deferred pattern."""
+        if not in_focus & {encrypt_sdk, decrypt_sdk}:
+            pytest.skip("Not in focus")
+        pfs = tdfs.PlatformFeatureSet()
+        tdfs.skip_connectrpc_skew(encrypt_sdk, decrypt_sdk, pfs)
+        tdfs.skip_hexless_skew(encrypt_sdk, decrypt_sdk)
+
+        ct_file = tmp_dir / f"deferred-success-{encrypt_sdk}.tdf"
+        encrypt_sdk.encrypt(
+            pt_file,
+            ct_file,
+            container="ztdf",
+            attr_values=attribute_default_rsa.value_fqns,
+        )
+
+        mark = audit_logs.mark("before_success_decrypt")
+        rt_file = tmp_dir / f"deferred-success-{encrypt_sdk}-{decrypt_sdk}.untdf"
+        decrypt_sdk.decrypt(ct_file, rt_file, "ztdf")
+        assert filecmp.cmp(pt_file, rt_file)
+
+        # The deferred pattern should produce a success event
+        events = audit_logs.assert_rewrap_success(min_count=1, since_mark=mark)
+        assert len(events) >= 1
+        event = events[0]
+        assert event.action_result == "success"
+        assert event.action_type == "rewrap"
+        assert event.object_type == "key_object"
+
+    def test_rewrap_always_audited_on_client_disconnect(
+        self,
+        encrypt_sdk: tdfs.SDK,
+        decrypt_sdk: tdfs.SDK,
+        pt_file: Path,
+        tmp_dir: Path,
+        audit_logs: AuditLogAsserter,
+        in_focus: set[tdfs.SDK],
+        attribute_default_rsa: Attribute,
+    ):
+        """Staggered client kills during decrypt always produce audit events.
+
+        Launches CONCURRENT_DECRYPTS processes and kills them at staggered
+        intervals. The concurrent load increases server processing time,
+        widening the cancellation window. We assert that every process that
+        reached the server produced an audit event (success, failure, or
+        cancel) -- proving the deferred Log() always executes.
+        """
+        if not in_focus & {encrypt_sdk, decrypt_sdk}:
+            pytest.skip("Not in focus")
+        pfs = tdfs.PlatformFeatureSet()
+        tdfs.skip_connectrpc_skew(encrypt_sdk, decrypt_sdk, pfs)
+        tdfs.skip_hexless_skew(encrypt_sdk, decrypt_sdk)
+
+        ct_file = tmp_dir / f"deferred-cancel-{encrypt_sdk}.tdf"
+        encrypt_sdk.encrypt(
+            pt_file,
+            ct_file,
+            container="ztdf",
+            attr_values=attribute_default_rsa.value_fqns,
+        )
+
+        mark = audit_logs.mark("before_cancel_barrage")
+
+        # Build base command (rt_file will be replaced per-process)
+        base_cmd = _build_decrypt_command(decrypt_sdk, ct_file, tmp_dir / "placeholder")
+        env = dict(os.environ)
+
+        proc_results = _launch_and_kill_staggered(
+            cmd=base_cmd,
+            env=env,
+            count=CONCURRENT_DECRYPTS,
+            kill_delays=KILL_DELAYS,
+            tmp_dir=tmp_dir,
+            prefix=f"cancel-{encrypt_sdk}-{decrypt_sdk}",
+        )
+
+        # Log what happened for debugging
+        killed_count = sum(1 for p in proc_results if p["killed"])
+        completed_count = sum(1 for p in proc_results if not p["killed"])
+        logger.info(
+            f"Cancel barrage: {killed_count} killed, {completed_count} "
+            f"completed before kill. Kill times: "
+            f"{[f'{p["kill_time"]:.3f}s' for p in proc_results]}"
+        )
+
+        # Collect rewrap events. We expect at least one event for every
+        # process that reached the server (those killed too early may not
+        # have sent the gRPC request yet).
+        events = _collect_rewrap_events(
+            audit_logs, since_mark=mark, min_count=1, timeout=15.0
+        )
+
+        # Core guarantee: at least 1 rewrap event was produced
+        assert len(events) >= 1, (
+            f"Deferred pattern guarantee violated: {CONCURRENT_DECRYPTS} "
+            f"decrypt processes launched but got 0 rewrap audit events"
+        )
+
+        # Categorize events by result
+        by_result: dict[str | None, list[ParsedAuditEvent]] = {}
+        for e in events:
+            by_result.setdefault(e.action_result, []).append(e)
+        logger.info(
+            f"Audit results: {', '.join(f'{k}={len(v)}' for k, v in by_result.items())}, "
+            f"total={len(events)}"
+        )
+
+        # Every event should have valid rewrap structure
+        for event in events:
+            assert event.action_type == "rewrap"
+            assert event.object_type == "key_object"
+            assert event.client_platform == "kas"
+            assert event.action_result in ("success", "failure", "error", "cancel")
+
+    def test_rewrap_cancel_has_initial_metadata(
+        self,
+        encrypt_sdk: tdfs.SDK,
+        decrypt_sdk: tdfs.SDK,
+        pt_file: Path,
+        tmp_dir: Path,
+        audit_logs: AuditLogAsserter,
+        in_focus: set[tdfs.SDK],
+        attribute_default_rsa: Attribute,
+    ):
+        """All deferred events include metadata populated at event creation time.
+
+        The deferred pattern pre-creates events with TDF format, algorithm,
+        key ID, and policy binding before processing starts. Even cancelled
+        events should have at least these initial fields.
+
+        Uses the same staggered-kill approach to generate events across
+        different outcomes (success, cancel, failure).
+        """
+        if not in_focus & {encrypt_sdk, decrypt_sdk}:
+            pytest.skip("Not in focus")
+        pfs = tdfs.PlatformFeatureSet()
+        tdfs.skip_connectrpc_skew(encrypt_sdk, decrypt_sdk, pfs)
+        tdfs.skip_hexless_skew(encrypt_sdk, decrypt_sdk)
+
+        ct_file = tmp_dir / f"deferred-meta-{encrypt_sdk}.tdf"
+        encrypt_sdk.encrypt(
+            pt_file,
+            ct_file,
+            container="ztdf",
+            attr_values=attribute_default_rsa.value_fqns,
+        )
+
+        mark = audit_logs.mark("before_metadata_barrage")
+
+        base_cmd = _build_decrypt_command(decrypt_sdk, ct_file, tmp_dir / "placeholder")
+        env = dict(os.environ)
+
+        _launch_and_kill_staggered(
+            cmd=base_cmd,
+            env=env,
+            count=CONCURRENT_DECRYPTS,
+            kill_delays=KILL_DELAYS,
+            tmp_dir=tmp_dir,
+            prefix=f"meta-{encrypt_sdk}-{decrypt_sdk}",
+        )
+
+        events = _collect_rewrap_events(
+            audit_logs, since_mark=mark, min_count=1, timeout=15.0
+        )
+        assert len(events) >= 1, "Expected at least 1 rewrap event"
+
+        # Every event (success, failure, or cancel) should have tdf_format
+        # since it's populated at event creation time in the deferred pattern
+        for event in events:
+            assert event.tdf_format is not None, (
+                f"Deferred event missing tdf_format: result={event.action_result}"
+            )
+            assert event.tdf_format == "tdf3", (
+                f"Expected tdf_format='tdf3', got '{event.tdf_format}'"
+            )

From 9286c331feae79a9fe6fcba9a08a38d92fe92841 Mon Sep 17 00:00:00 2001
From: David Mihalcik <dmihalcik@virtru.com>
Date: Sat, 21 Feb 2026 10:32:42 -0500
Subject: [PATCH 2/5] fix(otdf-local): fall back to git describe for platform
 version detection

The platform API doesn't always expose a version field in its
.well-known/opentdf-configuration response. Without PLATFORM_VERSION,
xtest defaults to 0.9.0 which disables audit_logging feature detection,
causing audit cancel tests to be skipped.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 AGENTS.md                        |  6 +++---
 otdf-local/src/otdf_local/cli.py | 26 +++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index b48dd7ddc..f8d1ffc19 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -25,8 +25,8 @@ otdf-sdk-mgr install tip go    # Build from source
 ### Running Tests
 
 ```bash
-# Configure environment
-cd xtest && set -a && source test.env && set +a
+# Generate local environment from otdf-local and configure
+cd xtest && uv run ../otdf-local env > local.env && set -a && source local.env && set +a
 
 # Run with specific SDK
 uv run pytest --sdks go -v
@@ -247,7 +247,7 @@ yq e '.services.kas.root_key' platform/opentdf-dev.yaml
 ### Preferred Workflow
 
 1. **Build SDK CLIs**: `cd xtest/sdk && make`
-2. **Configure environment**: `cd xtest && set -a && source test.env && set +a`
+2. **Configure environment**: `cd xtest && uv run ../otdf-local env > local.env && set -a && source local.env && set +a`
 3. **Run tests**: `uv run pytest --sdks go -v`
 4. **Restart after config changes**: Restart the affected platform/KAS services
 
diff --git a/otdf-local/src/otdf_local/cli.py b/otdf-local/src/otdf_local/cli.py
index d8e3597ff..73d46097a 100644
--- a/otdf-local/src/otdf_local/cli.py
+++ b/otdf-local/src/otdf_local/cli.py
@@ -2,6 +2,7 @@
 
 import json
 import shutil
+import subprocess
 import sys
 import time
 from typing import Annotated
@@ -607,7 +608,30 @@ def env(
                 if "version" in config:
                     env_vars["PLATFORM_VERSION"] = config["version"]
     except Exception as e:
-        print_warning(f"Could not get platform version: {e}")
+        print_warning(f"Could not get platform version from API: {e}")
+
+    # Fall back to git describe if API didn't provide version
+    if "PLATFORM_VERSION" not in env_vars:
+        try:
+            result = subprocess.run(
+                ["git", "describe", "--tags", "--match", "service/v*"],
+                capture_output=True,
+                text=True,
+                cwd=settings.platform_dir,
+                timeout=5,
+            )
+            if result.returncode == 0:
+                # Parse "service/v0.13.0" or "service/v0.13.0-1-gabcdef"
+                tag = result.stdout.strip()
+                if tag.startswith("service/v"):
+                    version = tag[len("service/v") :]
+                    # Strip git describe suffix (e.g. "-1-gabcdef")
+                    parts = version.split("-")
+                    if len(parts) >= 3 and parts[-1].startswith("g"):
+                        version = "-".join(parts[:-2])
+                    env_vars["PLATFORM_VERSION"] = version
+        except Exception as e:
+            print_warning(f"Could not get platform version from git: {e}")
 
     # Output in requested format
     if format == "json":

From 584b7d80968fc17fb9d3dde1c9cbee91ee37befb Mon Sep 17 00:00:00 2001
From: David Mihalcik <dmihalcik@virtru.com>
Date: Sat, 21 Feb 2026 10:33:30 -0500
Subject: [PATCH 3/5] chore: gitignore xtest/local.env

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index ecb9a979f..2daa43eb3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ vulnerability/tilt_modules/
 /xtest/node_modules/
 /xtest/tilt_modules/
 /xtest/tmp/
+/xtest/local.env
 /xtest/sdk/js/web/dist/
 /xtest/.helm
 

From c403a2d9ff1968ad9ec2d50eabf03d3e9521ee0c Mon Sep 17 00:00:00 2001
From: David Mihalcik <dmihalcik@virtru.com>
Date: Sat, 21 Feb 2026 10:35:29 -0500
Subject: [PATCH 4/5] docs: use local.env file instead of eval in otdf-local
 guide

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 otdf-local/AGENTS.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/otdf-local/AGENTS.md b/otdf-local/AGENTS.md
index 452038956..37d6e79d4 100644
--- a/otdf-local/AGENTS.md
+++ b/otdf-local/AGENTS.md
@@ -6,9 +6,9 @@ This guide covers operational procedures for managing the test environment with
 
 ```bash
 cd otdf-local
-eval $(uv run otdf-local env)         # Sets PLATFORM_LOG_FILE, KAS_*_LOG_FILE, etc.
-uv run otdf-local env --format json   # Output as JSON
+uv run otdf-local env > ../xtest/local.env   # Generate local.env
 cd ../xtest
+set -a && source local.env && set +a         # Source the environment
 uv run pytest --sdks go -v
 ```
 

From 57ee69c8ef8812ebc4d42d9a792c6d84f6b6aeb6 Mon Sep 17 00:00:00 2001
From: David Mihalcik <dmihalcik@virtru.com>
Date: Sat, 21 Feb 2026 10:37:31 -0500
Subject: [PATCH 5/5] ci: add test_audit_cancel.py to xtest workflow

Runs audit cancel tests after ABAC tests with the same multi-KAS
and audit log environment. Runs without parallel execution to avoid
timing flakiness from staggered process kill assertions.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/xtest.yml | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/.github/workflows/xtest.yml b/.github/workflows/xtest.yml
index 773448892..42dd0f10c 100644
--- a/.github/workflows/xtest.yml
+++ b/.github/workflows/xtest.yml
@@ -633,6 +633,30 @@ jobs:
           KAS_KM1_LOG_FILE: "../../${{ steps.kas-km1.outputs.log-file }}"
           KAS_KM2_LOG_FILE: "../../${{ steps.kas-km2.outputs.log-file }}"
 
+      - name: Run audit cancel tests
+        if: ${{ steps.multikas.outputs.supported == 'true' }}
+        run: >-
+          uv run pytest
+          -ra
+          -v
+          --html test-results/audit-cancel-${FOCUS_SDK}-${PLATFORM_TAG}.html
+          --self-contained-html
+          --audit-log-dir test-results/audit-logs
+          --sdks-encrypt "${ENCRYPT_SDK}"
+          --focus "$FOCUS_SDK"
+          test_audit_cancel.py
+        working-directory: otdftests/xtest
+        env:
+          PLATFORM_DIR: "../../${{ steps.run-platform.outputs.platform-working-dir }}"
+          PLATFORM_TAG: ${{ matrix.platform-tag }}
+          PLATFORM_LOG_FILE: "../../${{ steps.run-platform.outputs.platform-log-file }}"
+          KAS_ALPHA_LOG_FILE: "../../${{ steps.kas-alpha.outputs.log-file }}"
+          KAS_BETA_LOG_FILE: "../../${{ steps.kas-beta.outputs.log-file }}"
+          KAS_GAMMA_LOG_FILE: "../../${{ steps.kas-gamma.outputs.log-file }}"
+          KAS_DELTA_LOG_FILE: "../../${{ steps.kas-delta.outputs.log-file }}"
+          KAS_KM1_LOG_FILE: "../../${{ steps.kas-km1.outputs.log-file }}"
+          KAS_KM2_LOG_FILE: "../../${{ steps.kas-km2.outputs.log-file }}"
+
       - name: Upload artifact
         uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
         id: upload-artifact