Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions docs/genai-features.md
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,25 @@ gate = CostBenefitGate(min_gain=0.005)
engineer = GenAIFeatureEngineer(proposer, gate=gate)
```

## Auditing the decisions

The accepted/rejected trail lives on the returned `EngineeringResult`, but for governance you usually
want it **persisted**. Wire an `AuditLogPort` and every gate decision — accepted *or* rejected, with the
dataset name, the feature name, the code, the score, the baseline, the metric and the reason — is written
durably, one record per proposal:

```python
from fireflyframework_datascience.features.audit import JsonlAuditLog
from fireflyframework_datascience.features.genai import GenAIFeatureEngineer

engineer = GenAIFeatureEngineer(proposer, audit_log=JsonlAuditLog("audit/genai-decisions.jsonl"))
engineer.engineer(dataset)
# audit/genai-decisions.jsonl now holds one JSON line per decision — greppable and append-only.
```

This is what makes "every GenAI decision is logged and auditable" literally true: a risk or compliance
reviewer can reconstruct *why* each feature was kept or dropped, long after the run.

## See also

- [Datasets](datasets.md) — the `Dataset` the engineer consumes and `with_features` returns.
Expand Down
38 changes: 38 additions & 0 deletions src/fireflyframework_datascience/features/audit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright 2026 Firefly Software Foundation.
"""A persistent, auditable trail of GenAI gate decisions.

The framework's thesis is that *every* GenAI decision is logged and auditable. The in-memory
:class:`EngineeringResult` carries the trail for one run; an :class:`AuditLogPort` persists each
decision durably so a risk / compliance / audit function can reconstruct *why* a feature was kept or
dropped, long after the run.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Protocol, runtime_checkable


@runtime_checkable
class AuditLogPort(Protocol):
    """Structural interface for a sink that durably records GenAI gate decisions.

    Any object exposing a compatible ``record`` method satisfies this port; no
    inheritance is required. The protocol is runtime-checkable, so ``isinstance``
    checks only verify that a ``record`` attribute exists, not its signature.
    """

    def record(self, event: dict[str, Any]) -> None:
        """Record exactly one gate decision, described by ``event``."""
        ...


class JsonlAuditLog:
    """Append-only audit log that stores each decision as one JSON line.

    Every call to :meth:`record` appends a single line to the target file, creating
    the parent directory on demand. Values the ``json`` module cannot encode natively
    are stringified with ``str``. Non-finite floats (``nan``, ``inf``, ``-inf``) are
    written as ``null``: the default ``json.dumps`` output for them (``NaN`` /
    ``Infinity``) is not valid JSON and breaks strict consumers of the log.
    """

    name = "jsonl"

    def __init__(self, path: str | Path) -> None:
        """Remember the log file location; nothing is created until the first record."""
        self._path = Path(path)

    def record(self, event: dict[str, Any]) -> None:
        """Append ``event`` to the log as a single, strictly valid JSON line.

        Args:
            event: The decision record. Nested dicts, lists and tuples are supported;
                non-finite floats anywhere inside them are serialised as ``null``.
        """
        # Serialise first so a failure to encode never touches the file system.
        line = json.dumps(self._json_safe(event), default=str)
        self._path.parent.mkdir(parents=True, exist_ok=True)
        with self._path.open("a", encoding="utf-8") as handle:
            handle.write(line + "\n")

    @staticmethod
    def _json_safe(value: Any) -> Any:
        """Return a copy of ``value`` with non-finite floats replaced by ``None``."""
        import math  # local import: keeps this block independent of the module's import list

        if isinstance(value, float) and not math.isfinite(value):
            return None
        if isinstance(value, dict):
            return {key: JsonlAuditLog._json_safe(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            # json encodes tuples as arrays anyway, so a list is an equivalent stand-in.
            return [JsonlAuditLog._json_safe(item) for item in value]
        return value


__all__ = ["AuditLogPort", "JsonlAuditLog"]
40 changes: 36 additions & 4 deletions src/fireflyframework_datascience/features/genai.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
FeatureProposer,
RejectedFeature,
)
from fireflyframework_datascience.features.audit import AuditLogPort
from fireflyframework_datascience.features.executor import FeatureCodeExecutor, FeatureExecutionError

_CLASSIFICATION = {TaskType.BINARY, TaskType.MULTICLASS, TaskType.CLASSIFICATION}
Expand All @@ -47,6 +48,7 @@ def __init__(
evaluator: MetricsEvaluatorPort | None = None,
executor: FeatureCodeExecutor | None = None,
gate: CostBenefitGate | None = None,
audit_log: AuditLogPort | None = None,
scorer_estimator: Callable[[TaskType], Any] | None = None,
cv: int = 5,
max_features: int = 5,
Expand All @@ -56,6 +58,7 @@ def __init__(
self._evaluator = evaluator or _default_evaluator()
self._executor = executor or FeatureCodeExecutor()
self._gate = gate or CostBenefitGate()
self._audit_log = audit_log
self._scorer_estimator_factory = scorer_estimator
self._cv = cv
self._max_features = max_features
Expand All @@ -78,16 +81,19 @@ def engineer(self, dataset: Dataset, *, max_features: int | None = None) -> Engi
candidate = self._executor.execute(proposal.code, working)
except FeatureExecutionError as exc:
rejected.append(RejectedFeature(proposal, str(exc)))
self._audit(dataset, proposal, "rejected", float("nan"), baseline, metric, str(exc))
continue
candidate_score = self._cv_score(candidate, dataset.y, task, scoring)
if self._gate.accepts(current, candidate_score):
working = candidate
accepted.append(AcceptedFeature(proposal, candidate_score, candidate_score - current))
gain = candidate_score - current
accepted.append(AcceptedFeature(proposal, candidate_score, gain))
self._audit(dataset, proposal, "accepted", candidate_score, baseline, metric, f"gain {gain:+.4f}")
current = candidate_score
else:
rejected.append(
RejectedFeature(proposal, f"no lift ({candidate_score:.4f} <= {current:.4f})", candidate_score)
)
reason = f"no lift ({candidate_score:.4f} <= {current:.4f})"
rejected.append(RejectedFeature(proposal, reason, candidate_score))
self._audit(dataset, proposal, "rejected", candidate_score, baseline, metric, reason)

return EngineeringResult(
dataset=dataset.with_features(working),
Expand All @@ -98,6 +104,32 @@ def engineer(self, dataset: Dataset, *, max_features: int | None = None) -> Engi
metric=metric,
)

def _audit(
    self,
    dataset: Dataset,
    proposal: FeatureProposal,
    decision: str,
    score: float,
    baseline: float,
    metric: str,
    detail: str,
) -> None:
    """Forward a single gate decision to the configured audit log, if there is one.

    Args:
        dataset: Dataset being engineered; its ``name`` is written to the record.
        proposal: Proposal the decision is about; its ``name`` and ``code`` are recorded.
        decision: Outcome label for the proposal.
        score: Score associated with the proposal.
        baseline: Baseline score recorded alongside ``score``.
        metric: Name of the metric the scores refer to.
        detail: Free-text explanation of the decision.
    """
    sink = self._audit_log
    if sink is None:
        # No audit log wired: the in-memory trail is the only record.
        return

    event = {
        "dataset": dataset.name,
        "feature": proposal.name,
        "code": proposal.code,
        "decision": decision,
        "score": score,
        "baseline": baseline,
        "metric": metric,
        "detail": detail,
    }
    sink.record(event)

def _cv_score(self, X: Any, y: Any, task: TaskType, scoring: str) -> float:
from sklearn.model_selection import cross_val_score

Expand Down
66 changes: 66 additions & 0 deletions tests/features/test_audit_trail.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright 2026 Firefly Software Foundation.
"""The GenAI gate must persist an auditable trail of every decision.

The docs claim "every decision is logged and auditable" — until now that was only in-memory
(`EngineeringResult`). This verifies decisions are written durably (JSONL), one record per proposal.
Real data, LLM-free (StaticFeatureProposer).
"""

from __future__ import annotations

import json

import numpy as np
import pandas as pd


def test_genai_engineer_writes_persistent_audit_trail(tmp_path) -> None:
    """Each proposal handed to the engineer must yield one durable JSONL record."""
    from fireflyframework_datascience.core.types import TaskType
    from fireflyframework_datascience.datasets import Dataset
    from fireflyframework_datascience.features import FeatureProposal, StaticFeatureProposer
    from fireflyframework_datascience.features.audit import JsonlAuditLog
    from fireflyframework_datascience.features.genai import GenAIFeatureEngineer

    # Seeded synthetic binary problem: the label is 1 exactly when a + b > 0.
    generator = np.random.default_rng(0)
    sample_count = 200
    col_a = generator.normal(size=sample_count)
    col_b = generator.normal(size=sample_count)
    frame = pd.DataFrame({"a": col_a, "b": col_b})
    labels = pd.Series((col_a + col_b > 0).astype(int))
    dataset = Dataset(
        name="t",
        X=frame,
        y=labels,
        task=TaskType.BINARY,
        target_name="y",
        feature_names=["a", "b"],
    )

    proposals = [
        FeatureProposal("sum_ab", "df['sum_ab'] = df['a'] + df['b']", "useful"),
        FeatureProposal("noise", "df['noise'] = 0.0", "useless constant"),
    ]
    log_file = tmp_path / "audit.jsonl"
    audited_engineer = GenAIFeatureEngineer(
        StaticFeatureProposer(proposals),
        audit_log=JsonlAuditLog(log_file),
        cv=3,
    )
    audited_engineer.engineer(dataset)

    entries = [json.loads(raw) for raw in log_file.read_text().splitlines() if raw.strip()]

    # Exactly one persisted record per proposal, covering both feature names.
    assert len(entries) == 2
    assert {entry["feature"] for entry in entries} == {"sum_ab", "noise"}
    for entry in entries:
        assert entry["decision"] in {"accepted", "rejected"}
        for required_key in ("score", "baseline", "code"):
            assert required_key in entry
        assert entry["dataset"] == "t"


def test_audit_log_is_optional() -> None:
    """Without an audit log the engineer must still run and report its single decision."""
    from fireflyframework_datascience.core.types import TaskType
    from fireflyframework_datascience.datasets import Dataset
    from fireflyframework_datascience.features import FeatureProposal, StaticFeatureProposer
    from fireflyframework_datascience.features.genai import GenAIFeatureEngineer

    frame = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]})
    labels = pd.Series([0, 0, 1, 1])
    dataset = Dataset(
        name="t",
        X=frame,
        y=labels,
        task=TaskType.BINARY,
        target_name="y",
        feature_names=["a", "b"],
    )
    single_proposal = FeatureProposal("c", "df['c'] = df['a'] + df['b']", "")

    # No audit_log argument: only the in-memory trail on the result is available.
    engineer = GenAIFeatureEngineer(StaticFeatureProposer([single_proposal]), cv=2)
    outcome = engineer.engineer(dataset)

    decision_count = len(outcome.accepted) + len(outcome.rejected)
    assert decision_count == 1
Loading