Merge pull request #505 from igerber/fix/ci-pure-python-runtime

igerber · web-flow · commit d59039e0e5c4 · 2026-05-31T09:50:41.000-04:00
Right-size pure-Python CI test runtime (SyntheticControl regression + SDID safe trims)
diff --git a/tests/test_methodology_sdid.py b/tests/test_methodology_sdid.py
@@ -3076,13 +3076,17 @@ def test_baseline_parity_small_scale(self, variance_method):
         assert len(r.placebo_effects) == n0
 
     @pytest.mark.parametrize("variance_method", ["placebo", "bootstrap", "jackknife"])
-    def test_scale_equivariance(self, variance_method):
+    def test_scale_equivariance(self, variance_method, ci_params):
         """τ/a, SE/|a|, p-value, and n_successful must be invariant under
         (Y → a*Y + b) across ~15 orders of magnitude."""
+        # Pure invariance check: the baseline r0 is fitted at runtime (not compared against
+        # _BASELINE), so the absolute n_bootstrap is irrelevant. r0 and every scaled refit
+        # share the same draw count `nb` (ci_params-scaled in pure-Python, 200 under Rust).
+        nb = ci_params.bootstrap(200)
         data = _make_panel(seed=42)
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", UserWarning)
-            r0 = self._fit(data, variance_method)
+            r0 = self._fit(data, variance_method, n_bootstrap=nb)
         att0, se0, p0 = r0.att, r0.se, r0.p_value
         n0 = len(r0.placebo_effects)
         noise0 = r0.noise_level
@@ -3092,7 +3096,7 @@ def test_scale_equivariance(self, variance_method):
             scaled = self._rescale(data, a, b)
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore", UserWarning)
-                r = self._fit(scaled, variance_method)
+                r = self._fit(scaled, variance_method, n_bootstrap=nb)
             # Variance-method success count must be identical; divergence
             # would shift the empirical p-value floor 1/(n+1).
             assert len(r.placebo_effects) == n0, (
@@ -3172,13 +3176,15 @@ class TestPValueSemantics:
     null draws either and also use the analytical p-value.
     """
 
-    def test_bootstrap_p_value_matches_analytical(self):
+    def test_bootstrap_p_value_matches_analytical(self, ci_params):
         """Bootstrap p-value must equal safe_inference(att, se)[1]."""
+        # Self-consistency check (reported p vs the analytical formula on the reported se) —
+        # independent of the bootstrap draw count, so ci_params scaling is safe.
         df = _make_panel(seed=42)
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", UserWarning)
             r = SyntheticDiD(
-                variance_method="bootstrap", n_bootstrap=200, seed=1
+                variance_method="bootstrap", n_bootstrap=ci_params.bootstrap(200), seed=1
             ).fit(
                 df, outcome="outcome", treatment="treated",
                 unit="unit", time="period",
@@ -3189,13 +3195,15 @@ def test_bootstrap_p_value_matches_analytical(self):
             f"bootstrap p_value={r.p_value} != analytical {expected_p}"
         )
 
-    def test_placebo_p_value_uses_empirical_formula(self):
+    def test_placebo_p_value_uses_empirical_formula(self, ci_params):
         """Placebo p-value must equal max(mean(|draws| >= |att|), 1/(r+1))."""
+        # Self-consistency check (reported p vs the empirical formula on the reported
+        # placebo_effects) — independent of the draw count, so ci_params scaling is safe.
         df = _make_panel(seed=42)
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", UserWarning)
             r = SyntheticDiD(
-                variance_method="placebo", n_bootstrap=200, seed=1
+                variance_method="placebo", n_bootstrap=ci_params.bootstrap(200), seed=1
             ).fit(
                 df, outcome="outcome", treatment="treated",
                 unit="unit", time="period",
diff --git a/tests/test_methodology_synthetic_control.py b/tests/test_methodology_synthetic_control.py
@@ -47,6 +47,31 @@
 ]
 
 
+# ---------------------------------------------------------------------------
+# Cheap optimizer settings for behavior tests (pure-Python CI speed)
+# ---------------------------------------------------------------------------
+# Behavior tests only need a VALID, cleanly-converged fit, not data-driven V quality.
+# The production nested defaults (n_starts=4, inner_max_iter=10000, inner_min_decrease=1e-5)
+# cost 30-150s per *pure-Python* fit because the inner Frank-Wolfe solve grinds its slow
+# sublinear tail to hit the tight tolerance on every objective evaluation. Loosening the
+# inner tolerance + a single start + a small outer cap gives a clean ~0.1s fit without
+# changing what these tests assert. Pure-Python coverage of the production-default nested
+# path (n_starts=4 with the _v_starts heuristic candidates + the tight inner_min_decrease=1e-5)
+# is kept by the dedicated non-slow ``test_nested_production_defaults_smoke`` (a 2-donor panel
+# whose inner FW simplex is ~1-D, so defaults stay <0.1s). The @slow Tier-2 Basque test
+# additionally covers the defaults in the Rust matrix, and the Rust<->numpy Frank-Wolfe kernel
+# equivalence is locked by tests/test_rust_backend.py::test_sc_weight_fw_matches_numpy.
+#
+# NB: inner_max_iter is deliberately LEFT AT DEFAULT here — the speedup comes from the
+# looser tolerance letting FW terminate on *convergence* (not on an iteration cap), so the
+# solve stays clean (no non-convergence warning). Do NOT fold inner_max_iter into _FAST or
+# the inner-non-convergence warning starts firing spuriously.
+_FAST = dict(n_starts=1, optimizer_options={"maxiter": 50}, inner_min_decrease=1e-3)
+# Churn tests deliberately force inner non-convergence (inner_max_iter=1); KEEP that and only cap
+# the outer optimizer at 5 iterations so it stops early on the flat penalty landscape.
+_FAST_CHURN = dict(n_starts=1, optimizer_options={"maxiter": 5})
+
+
 # ---------------------------------------------------------------------------
 # Synthetic panel builders (fast; no R needed)
 # ---------------------------------------------------------------------------
@@ -197,8 +222,12 @@ def test_post_periods_canonicalized_and_gap_order_independent():
     df, years, T0 = _make_panel()
     ordered = years[T0:]
     scrambled = list(reversed(ordered)) + [ordered[-1]]  # unsorted + duplicate
-    r1 = synthetic_control(df, "y", "treated", "unit", "year", post_periods=ordered, seed=0)
-    r2 = synthetic_control(df, "y", "treated", "unit", "year", post_periods=scrambled, seed=0)
+    r1 = synthetic_control(
+        df, "y", "treated", "unit", "year", post_periods=ordered, seed=0, **_FAST
+    )
+    r2 = synthetic_control(
+        df, "y", "treated", "unit", "year", post_periods=scrambled, seed=0, **_FAST
+    )
     assert r1.post_periods == r2.post_periods == ordered
     assert abs(r1.att - r2.att) < 1e-12
     gdf = r2.get_gap_df()
@@ -214,7 +243,9 @@ def test_post_periods_canonicalized_and_gap_order_independent():
 
 def test_donor_pool_restricts_donors():
     df, years, T0 = _make_panel(n_donors=4)
-    res = synthetic_control(df, "y", "treated", "unit", "year", donor_pool=["d0", "d1"], seed=0)
+    res = synthetic_control(
+        df, "y", "treated", "unit", "year", donor_pool=["d0", "d1"], seed=0, **_FAST
+    )
     assert res.n_donors == 2
     assert set(res.get_weights_df()["unit"]) <= {"d0", "d1"}
 
@@ -309,8 +340,19 @@ def test_outer_v_nonconvergence_warning():
     # Outer V-search non-convergence must not be silent (optimizer capped at 1 iter).
     df, _, _ = _make_panel()
     with pytest.warns(UserWarning, match="Outer V-search"):
+        # maxiter=1 forces the OUTER non-convergence; n_starts=1 + a loose inner tolerance
+        # keep the (still-real) inner solves cheap. Loosening inner_min_decrease does not
+        # affect whether the outer optimizer hits its 1-iteration cap.
         synthetic_control(
-            df, "y", "treated", "unit", "year", seed=0, optimizer_options={"maxiter": 1}
+            df,
+            "y",
+            "treated",
+            "unit",
+            "year",
+            seed=0,
+            n_starts=1,
+            optimizer_options={"maxiter": 1},
+            inner_min_decrease=1e-3,
         )
 
 
@@ -319,7 +361,9 @@ def test_inner_v_search_nonconvergence_warning():
     # inner_max_iter=1 makes them truncate, and the estimator emits an aggregated warning.
     df, _, _ = _make_panel()
     with pytest.warns(UserWarning, match="during nested V selection"):
-        synthetic_control(df, "y", "treated", "unit", "year", seed=0, inner_max_iter=1)
+        synthetic_control(
+            df, "y", "treated", "unit", "year", seed=0, inner_max_iter=1, **_FAST_CHURN
+        )
 
 
 def test_single_inner_nonconvergence_excluded_from_v_ranking(monkeypatch):
@@ -348,7 +392,7 @@ def patched(X1s, X0s, v, max_iter, min_decrease):
 
     monkeypatch.setattr(sc, "_inner_solve_W", patched)
     with pytest.warns(UserWarning, match="during nested V selection"):
-        res = synthetic_control(df, "y", "treated", "unit", "year", seed=0)
+        res = synthetic_control(df, "y", "treated", "unit", "year", seed=0, **_FAST)
 
     assert state["failed"]  # the patch actually fired on an objective evaluation
     assert np.isfinite(res.att)
@@ -361,11 +405,37 @@ def test_n_starts_one_runs():
     # n_starts=1 uses only the uniform start (short-circuits the heuristic candidates)
     # and still produces a valid nested fit.
     df, _, _ = _make_panel()
-    res = synthetic_control(df, "y", "treated", "unit", "year", seed=0, n_starts=1)
+    res = synthetic_control(
+        df,
+        "y",
+        "treated",
+        "unit",
+        "year",
+        seed=0,
+        n_starts=1,
+        optimizer_options={"maxiter": 50},
+        inner_min_decrease=1e-3,
+    )
     assert np.isfinite(res.att)
     assert abs(sum(res.donor_weights.values()) - 1.0) < 1e-6
 
 
+def test_nested_production_defaults_smoke():
+    # Coverage anchor: exercise the FULL production-default nested path end-to-end in
+    # pure-Python — n_starts=4 (so the _v_starts heuristic candidates — inverse-variance,
+    # univariate-fit and Dirichlet starts — are generated, which the n_starts=1 _FAST tests
+    # skip) and the tight inner_min_decrease=1e-5. A 2-donor panel keeps the inner
+    # Frank-Wolfe simplex effectively 1-D, so the default settings still run in <0.1s and
+    # this stays non-slow. The @slow Tier-2 Basque test covers the defaults only in the Rust
+    # matrix; this is the pure-Python complement.
+    df, _, _ = _make_panel(n_donors=2)
+    res = synthetic_control(df, "y", "treated", "unit", "year", seed=0)  # production defaults
+    assert np.isfinite(res.att)
+    assert abs(sum(res.donor_weights.values()) - 1.0) < 1e-6
+    assert res.n_donors == 2
+    assert res.mspe_v is not None  # nested V was selected by minimizing pre-period MSPE
+
+
 def test_non_finite_outcome_rejected():
     df, years, T0 = _make_panel()
     df = df.copy()
@@ -378,7 +448,7 @@ def test_distinct_special_period_sets_not_duplicate():
     # Same var/op, same endpoints + length, different intermediate period -> distinct
     # predictors, must NOT be rejected as duplicates.
     df, years, T0 = _make_panel(T=8, T0=6)
-    res = SyntheticControl(seed=0).fit(
+    res = SyntheticControl(seed=0, **_FAST).fit(
         df,
         "y",
         "treated",
@@ -423,6 +493,7 @@ def test_duplicate_predictor_window_periods_deduped():
         predictors=["y"],
         predictor_window=[years[0], years[0], years[1]],
         seed=0,
+        **_FAST,
     )
     r_uniq = synthetic_control(
         df,
@@ -433,6 +504,7 @@ def test_duplicate_predictor_window_periods_deduped():
         predictors=["y"],
         predictor_window=[years[0], years[1]],
         seed=0,
+        **_FAST,
     )
     assert abs(r_dup.att - r_uniq.att) < 1e-9
 
@@ -465,7 +537,7 @@ def test_poor_fit_warning():
         rows.append({"unit": "treated", "year": yr, "y": 50 + 2.0 * t, "treated": int(t >= T0)})
     df = pd.DataFrame(rows)
     with pytest.warns(UserWarning, match="Pre-treatment fit is poor"):
-        synthetic_control(df, "y", "treated", "unit", "year", seed=0)
+        synthetic_control(df, "y", "treated", "unit", "year", seed=0, **_FAST)
 
 
 def test_poor_fit_warning_flat_treated_pre_path():
@@ -484,7 +556,7 @@ def test_poor_fit_warning_flat_treated_pre_path():
         )
     df = pd.DataFrame(rows)
     with pytest.warns(UserWarning, match="Pre-treatment fit is poor"):
-        synthetic_control(df, "y", "treated", "unit", "year", seed=0)
+        synthetic_control(df, "y", "treated", "unit", "year", seed=0, **_FAST)
 
 
 # ---------------------------------------------------------------------------
@@ -520,7 +592,7 @@ def test_duplicate_regular_predictor_rejected():
 def test_inner_nonconvergence_warning():
     df, _, _ = _make_panel(n_donors=4)
     with pytest.warns(UserWarning, match="did not converge"):
-        SyntheticControl(seed=0, v_method="nested", inner_max_iter=1).fit(
+        SyntheticControl(seed=0, v_method="nested", inner_max_iter=1, **_FAST_CHURN).fit(
             df, "y", "treated", "unit", "year"
         )
 
@@ -532,7 +604,7 @@ def test_inner_nonconvergence_warning():
 
 def test_standardize_none_runs():
     df, _, _ = _make_panel()
-    res = synthetic_control(df, "y", "treated", "unit", "year", standardize="none", seed=0)
+    res = synthetic_control(df, "y", "treated", "unit", "year", standardize="none", seed=0, **_FAST)
     assert res.standardize == "none"
     assert np.isfinite(res.att)
 
@@ -652,7 +724,7 @@ def test_set_params_rolls_back_on_invalid():
 
 def test_nan_inference_contract():
     df, _, _ = _make_panel()
-    res = synthetic_control(df, "y", "treated", "unit", "year", seed=0)
+    res = synthetic_control(df, "y", "treated", "unit", "year", seed=0, **_FAST)
     assert_nan_inference(
         {"se": res.se, "t_stat": res.t_stat, "p_value": res.p_value, "conf_int": res.conf_int}
     )
@@ -661,7 +733,7 @@ def test_nan_inference_contract():
 
 def test_result_accessors_render():
     df, _, _ = _make_panel()
-    res = synthetic_control(df, "y", "treated", "unit", "year", seed=0)
+    res = synthetic_control(df, "y", "treated", "unit", "year", seed=0, **_FAST)
     assert isinstance(res, SyntheticControlResults)
     assert isinstance(res.summary(), str) and "Synthetic Control" in res.summary()
     assert "att" in res.to_dict()
@@ -676,8 +748,10 @@ def test_result_accessors_render():
 
 def test_inferred_post_matches_explicit():
     df, years, T0 = _make_panel()
-    r_inf = synthetic_control(df, "y", "treated", "unit", "year", seed=0)
-    r_exp = synthetic_control(df, "y", "treated", "unit", "year", post_periods=years[T0:], seed=0)
+    r_inf = synthetic_control(df, "y", "treated", "unit", "year", seed=0, **_FAST)
+    r_exp = synthetic_control(
+        df, "y", "treated", "unit", "year", post_periods=years[T0:], seed=0, **_FAST
+    )
     assert r_inf.post_periods == r_exp.post_periods == years[T0:]
     assert abs(r_inf.att - r_exp.att) < 1e-12