From 6cce8b66e1050c6f1c8813079e555b6ba927b365 Mon Sep 17 00:00:00 2001 From: Andres Contreras Date: Thu, 25 Jun 2026 20:44:21 +0200 Subject: [PATCH] feat: persistent audit trail of GenAI gate decisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docs claim 'every GenAI decision is logged and auditable' — until now that was only in-memory (EngineeringResult). Add an AuditLogPort + JsonlAuditLog adapter; GenAIFeatureEngineer records every decision (accepted OR rejected, with feature name, code, score, baseline, metric, reason) durably, one JSON line per proposal. Opt-in (audit_log=...); no-op when unwired. TDD on real data; docs + 'Auditing the decisions' section added. --- docs/genai-features.md | 19 ++++++ .../features/audit.py | 38 +++++++++++ .../features/genai.py | 40 +++++++++-- tests/features/test_audit_trail.py | 66 +++++++++++++++++++ 4 files changed, 159 insertions(+), 4 deletions(-) create mode 100644 src/fireflyframework_datascience/features/audit.py create mode 100644 tests/features/test_audit_trail.py diff --git a/docs/genai-features.md b/docs/genai-features.md index f2c5889..1511ffc 100644 --- a/docs/genai-features.md +++ b/docs/genai-features.md @@ -242,6 +242,25 @@ gate = CostBenefitGate(min_gain=0.005) engineer = GenAIFeatureEngineer(proposer, gate=gate) ``` +## Auditing the decisions + +The accepted/rejected trail lives on the returned `EngineeringResult`, but for governance you usually +want it **persisted**. Wire an `AuditLogPort` and every gate decision — accepted *or* rejected, with the +feature name, the code, the score, the baseline and the reason — is written durably, one record per +proposal: + +```python +from fireflyframework_datascience.features.audit import JsonlAuditLog +from fireflyframework_datascience.features.genai import GenAIFeatureEngineer + +engineer = GenAIFeatureEngineer(proposer, audit_log=JsonlAuditLog("audit/genai-decisions.jsonl")) +engineer.engineer(dataset) +# audit/genai-decisions.jsonl now holds one JSON line per decision — greppable and append-only. +``` + +This is what makes "every GenAI decision is logged and auditable" literally true: a risk or compliance +reviewer can reconstruct *why* each feature was kept or dropped, long after the run. + ## See also - [Datasets](datasets.md) — the `Dataset` the engineer consumes and `with_features` returns. diff --git a/src/fireflyframework_datascience/features/audit.py b/src/fireflyframework_datascience/features/audit.py new file mode 100644 index 0000000..2316baf --- /dev/null +++ b/src/fireflyframework_datascience/features/audit.py @@ -0,0 +1,38 @@ +# Copyright 2026 Firefly Software Foundation. +"""A persistent, auditable trail of GenAI gate decisions. + +The framework's thesis is that *every* GenAI decision is logged and auditable. The in-memory +:class:`EngineeringResult` carries the trail for one run; an :class:`AuditLogPort` persists each +decision durably so a risk / compliance / audit function can reconstruct *why* a feature was kept or +dropped, long after the run. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Protocol, runtime_checkable + + +@runtime_checkable +class AuditLogPort(Protocol): + """Durably records one GenAI gate decision per call.""" + + def record(self, event: dict[str, Any]) -> None: ... + + +class JsonlAuditLog: + """Appends each decision as a JSON line — simple, greppable, append-only.""" + + name = "jsonl" + + def __init__(self, path: str | Path) -> None: + self._path = Path(path) + + def record(self, event: dict[str, Any]) -> None: + self._path.parent.mkdir(parents=True, exist_ok=True) + with self._path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(event, default=str) + "\n") + + +__all__ = ["AuditLogPort", "JsonlAuditLog"] diff --git a/src/fireflyframework_datascience/features/genai.py b/src/fireflyframework_datascience/features/genai.py index e99c1e8..e2a4e02 100644 --- a/src/fireflyframework_datascience/features/genai.py +++ b/src/fireflyframework_datascience/features/genai.py @@ -23,6 +23,7 @@ FeatureProposer, RejectedFeature, ) +from fireflyframework_datascience.features.audit import AuditLogPort from fireflyframework_datascience.features.executor import FeatureCodeExecutor, FeatureExecutionError _CLASSIFICATION = {TaskType.BINARY, TaskType.MULTICLASS, TaskType.CLASSIFICATION} @@ -47,6 +48,7 @@ def __init__( evaluator: MetricsEvaluatorPort | None = None, executor: FeatureCodeExecutor | None = None, gate: CostBenefitGate | None = None, + audit_log: AuditLogPort | None = None, scorer_estimator: Callable[[TaskType], Any] | None = None, cv: int = 5, max_features: int = 5, @@ -56,6 +58,7 @@ def __init__( self._evaluator = evaluator or _default_evaluator() self._executor = executor or FeatureCodeExecutor() self._gate = gate or CostBenefitGate() + self._audit_log = audit_log self._scorer_estimator_factory = scorer_estimator self._cv = cv self._max_features = max_features @@ -78,16 +81,19 @@ def engineer(self, dataset: Dataset, *, max_features: int | None = None) -> Engi candidate = self._executor.execute(proposal.code, working) except FeatureExecutionError as exc: rejected.append(RejectedFeature(proposal, str(exc))) + self._audit(dataset, proposal, "rejected", float("nan"), baseline, metric, str(exc)) continue candidate_score = self._cv_score(candidate, dataset.y, task, scoring) if self._gate.accepts(current, candidate_score): working = candidate - accepted.append(AcceptedFeature(proposal, candidate_score, candidate_score - current)) + gain = candidate_score - current + accepted.append(AcceptedFeature(proposal, candidate_score, gain)) + self._audit(dataset, proposal, "accepted", candidate_score, baseline, metric, f"gain {gain:+.4f}") current = candidate_score else: - rejected.append( - RejectedFeature(proposal, f"no lift ({candidate_score:.4f} <= {current:.4f})", candidate_score) - ) + reason = f"no lift ({candidate_score:.4f} <= {current:.4f})" + rejected.append(RejectedFeature(proposal, reason, candidate_score)) + self._audit(dataset, proposal, "rejected", candidate_score, baseline, metric, reason) return EngineeringResult( dataset=dataset.with_features(working), @@ -98,6 +104,32 @@ def engineer(self, dataset: Dataset, *, max_features: int | None = None) -> Engi metric=metric, ) + def _audit( + self, + dataset: Dataset, + proposal: FeatureProposal, + decision: str, + score: float, + baseline: float, + metric: str, + detail: str, + ) -> None: + """Persist one gate decision to the audit log (no-op if none is wired).""" + if self._audit_log is None: + return + self._audit_log.record( + { + "dataset": dataset.name, + "feature": proposal.name, + "code": proposal.code, + "decision": decision, + "score": score, + "baseline": baseline, + "metric": metric, + "detail": detail, + } + ) + def _cv_score(self, X: Any, y: Any, task: TaskType, scoring: str) -> float: from sklearn.model_selection import cross_val_score diff --git a/tests/features/test_audit_trail.py b/tests/features/test_audit_trail.py new file mode 100644 index 0000000..21b886a --- /dev/null +++ b/tests/features/test_audit_trail.py @@ -0,0 +1,66 @@ +# Copyright 2026 Firefly Software Foundation. +"""The GenAI gate must persist an auditable trail of every decision. + +The docs claim "every decision is logged and auditable" — until now that was only in-memory +(`EngineeringResult`). This verifies decisions are written durably (JSONL), one record per proposal. +Real data, LLM-free (StaticFeatureProposer). +""" + +from __future__ import annotations + +import json + +import numpy as np +import pandas as pd + + +def test_genai_engineer_writes_persistent_audit_trail(tmp_path) -> None: + from fireflyframework_datascience.core.types import TaskType + from fireflyframework_datascience.datasets import Dataset + from fireflyframework_datascience.features import FeatureProposal, StaticFeatureProposer + from fireflyframework_datascience.features.audit import JsonlAuditLog + from fireflyframework_datascience.features.genai import GenAIFeatureEngineer + + rng = np.random.default_rng(0) + n = 200 + a = rng.normal(size=n) + b = rng.normal(size=n) + X = pd.DataFrame({"a": a, "b": b}) + y = pd.Series((a + b > 0).astype(int)) + ds = Dataset(name="t", X=X, y=y, task=TaskType.BINARY, target_name="y", feature_names=["a", "b"]) + + proposer = StaticFeatureProposer( + [ + FeatureProposal("sum_ab", "df['sum_ab'] = df['a'] + df['b']", "useful"), + FeatureProposal("noise", "df['noise'] = 0.0", "useless constant"), + ] + ) + audit_path = tmp_path / "audit.jsonl" + engineer = GenAIFeatureEngineer(proposer, audit_log=JsonlAuditLog(audit_path), cv=3) + engineer.engineer(ds) + + records = [json.loads(line) for line in audit_path.read_text().splitlines() if line.strip()] + # every proposal produces exactly one durable decision record + assert len(records) == 2 + by_feature = {r["feature"]: r for r in records} + assert set(by_feature) == {"sum_ab", "noise"} + for record in records: + assert record["decision"] in {"accepted", "rejected"} + assert "score" in record and "baseline" in record and "code" in record + assert record["dataset"] == "t" + + +def test_audit_log_is_optional() -> None: + # No audit log wired -> engineer still works (in-memory trail only), nothing written. + from fireflyframework_datascience.core.types import TaskType + from fireflyframework_datascience.datasets import Dataset + from fireflyframework_datascience.features import FeatureProposal, StaticFeatureProposer + from fireflyframework_datascience.features.genai import GenAIFeatureEngineer + + X = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}) + y = pd.Series([0, 0, 1, 1]) + ds = Dataset(name="t", X=X, y=y, task=TaskType.BINARY, target_name="y", feature_names=["a", "b"]) + proposer = StaticFeatureProposer([FeatureProposal("c", "df['c'] = df['a'] + df['b']", "")]) + + result = GenAIFeatureEngineer(proposer, cv=2).engineer(ds) + assert len(result.accepted) + len(result.rejected) == 1