Move design-term uniqueness check before the regression (fail-fast)

igerber · claude · igerber · commit 7ecffd21460c · 2026-06-01T10:02:57.000-04:00
validate_design_term_names ran just before coef_dict construction — i.e. AFTER
the OLS fit — in DifferenceInDifferences and MultiPeriodDiD, so a duplicate
assembled term name (e.g. an MPD fixed-effect dummy colliding with a structural
period_{p} key) drove a wasted rank-deficient fit and emitted a misleading
multicollinearity warning before the intended ValueError (local review P3).
Move the check to immediately after var_names is fully assembled and before the
regression call in both estimators (TwoWayFixedEffects already checked pre-fit).
var_names is not mutated between assembly and coef_dict, so this is behavior-
preserving except that the ValueError now fires fast and warning-free.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py
@@ -510,6 +510,12 @@ def fit(
                     X = np.column_stack([X, dummies[col].values.astype(float)])
                     var_names.append(col)
 
+        # Reject any duplicate in the FINAL term list (e.g. a fixed-effect dummy
+        # colliding with a structural term) BEFORE the regression — so the fit is
+        # not wasted and no misleading multicollinearity warning is emitted ahead
+        # of the intended ValueError.
+        validate_design_term_names(var_names, estimator="DifferenceInDifferences")
+
         # Extract ATT index (coefficient on interaction term)
         att_idx = 3  # Index of interaction term
         att_var_name = f"{treatment}:{time}"
@@ -686,7 +692,6 @@ def _refit_did_absorb(w_r):
         n_control = n_control_raw
 
         # Create coefficient dictionary
-        validate_design_term_names(var_names, estimator="DifferenceInDifferences")
         coef_dict = {name: coef for name, coef in zip(var_names, coefficients)}
 
         # Determine inference method and bootstrap info
@@ -1680,6 +1685,12 @@ def fit(  # type: ignore[override]
                     X = np.column_stack([X, dummies[col].values.astype(float)])
                     var_names.append(col)
 
+        # Reject any duplicate in the FINAL term list (e.g. a fixed-effect dummy
+        # colliding with a structural period_{p} key) BEFORE the regression — so
+        # the fit is not wasted and no misleading multicollinearity warning is
+        # emitted ahead of the intended ValueError.
+        validate_design_term_names(var_names, estimator="MultiPeriodDiD")
+
         # Fit OLS using unified backend
         # Pass cluster_ids to solve_ols for proper vcov computation
         # This handles rank-deficient matrices by returning NaN for dropped columns
@@ -2038,11 +2049,8 @@ def _refit_mp_absorb(w_r):
         n_treated = n_treated_raw
         n_control = n_control_raw
 
-        # Backstop: reject any duplicate in the FINAL term list (e.g. a
-        # fixed-effect dummy colliding with a structural `period_{p}` key)
-        # before it silently overwrites a coefficient in the dict below.
-        validate_design_term_names(var_names, estimator="MultiPeriodDiD")
-        # Create coefficient dictionary
+        # Create coefficient dictionary (var_names uniqueness already enforced
+        # before the fit above).
         coef_dict = {name: coef for name, coef in zip(var_names, coefficients)}
 
         # Store results