Merge pull request #44 from igerber/claude/plan-v1-1-1-release-8iyGQ

igerber · web-flow · commit 026a031e0dec · 2026-01-06T08:59:00.000-05:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,21 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.1.1] - 2026-01-06
+
+### Fixed
+- **SyntheticDiD bootstrap error handling**: Bootstrap now raises a clear `ValueError` when all iterations fail, instead of silently returning SE=0.0. Added warnings for edge cases (single successful iteration, high failure rate).
+
+- **Diagnostics module error handling**: Improved error messages in `permutation_test()` and `leave_one_out_test()` with actionable guidance. Added warnings when iterations fail (more than 10% of permutations, or any leave-one-out estimate). Enhanced `run_all_placebo_tests()` to return structured error info, including the error type.
+
+### Changed
+- **Code deduplication**: Extracted wild bootstrap inference logic to shared `_run_wild_bootstrap_inference()` method in `DifferenceInDifferences` base class, used by both `DifferenceInDifferences` and `TwoWayFixedEffects`.
+
+- **Type hints**: Added missing type hints to nested functions:
+  - `compute_trend()` in `utils.py`
+  - `neg_log_likelihood()` and `gradient()` in `staggered.py`
+  - `format_label()` in `prep.py`
+
 ## [1.1.0] - 2026-01-05
 
 ### Added
diff --git a/TODO.md b/TODO.md
@@ -14,8 +14,6 @@ Current limitations that may affect users:
 |-------|----------|----------|-------|
 | MultiPeriodDiD wild bootstrap not supported | `estimators.py:1068-1074` | Low | Edge case |
 | `predict()` raises NotImplementedError | `estimators.py:532-554` | Low | Rarely needed |
-| SyntheticDiD bootstrap can fail silently | `estimators.py:1580-1654` | Medium | Needs better error handling |
-| Diagnostics module error handling | `diagnostics.py:782-885` | Medium | Improve robustness |
 
 ---
 
@@ -27,7 +25,6 @@ Consolidation opportunities for cleaner maintenance:
 
 | Duplicate Code | Locations | Notes |
 |---------------|-----------|-------|
-| Wild bootstrap inference block | `estimators.py:278-296`, `estimators.py:725-748` | Extract to shared method |
 | Within-transformation logic | `estimators.py:217-232`, `estimators.py:787-833`, `bacon.py:567-642` | Extract to utils.py |
 | Linear regression helper | `staggered.py:205-240`, `estimators.py:366-408` | Consider consolidation |
 
@@ -117,8 +114,4 @@ No major performance issues identified. Potential future optimizations:
 
 ## Type Hints
 
-Missing type hints in internal functions:
-
-- `utils.py:593` - `compute_trend()` nested function
-- `staggered.py:173, 180` - Nested functions in `_logistic_regression()`
-- `prep.py:604` - `format_label()` nested function
+All previously identified missing type hints have been addressed in v1.1.1.
diff --git a/diff_diff/__init__.py b/diff_diff/__init__.py
@@ -90,7 +90,7 @@
     plot_sensitivity,
 )
 
-__version__ = "1.1.0"
+__version__ = "1.1.1"
 __all__ = [
     # Estimators
     "DifferenceInDifferences",
diff --git a/diff_diff/diagnostics.py b/diff_diff/diagnostics.py
@@ -625,11 +625,30 @@ def permutation_test(
             # Handle edge cases where fitting fails
             permuted_effects[i] = np.nan
 
-    # Remove any NaN values
+    # Remove any NaN values and track failure rate
     valid_effects = permuted_effects[~np.isnan(permuted_effects)]
+    n_failed = n_permutations - len(valid_effects)
 
     if len(valid_effects) == 0:
-        raise RuntimeError("All permutations failed - check your data")
+        raise RuntimeError(
+            f"All {n_permutations} permutations failed. This typically occurs when:\n"
+            f"  - Treatment/control groups are too small for valid permutation\n"
+            f"  - Data contains collinearity or singular matrices after permutation\n"
+            f"  - There are too few observations per time period\n"
+            f"Consider checking data quality with validate_did_data() from diff_diff.prep."
+        )
+
+    # Warn if significant number of permutations failed
+    if n_failed > 0:
+        failure_rate = n_failed / n_permutations
+        if failure_rate > 0.1:
+            import warnings
+            warnings.warn(
+                f"{n_failed}/{n_permutations} permutations failed ({failure_rate:.1%}). "
+                f"Results based on {len(valid_effects)} successful permutations.",
+                UserWarning,
+                stacklevel=2
+            )
 
     # Compute p-value: proportion of |permuted| >= |original|
     p_value = np.mean(np.abs(valid_effects) >= np.abs(original_att))
@@ -736,11 +755,30 @@ def leave_one_out_test(
             # Skip units that cause fitting issues
             loo_effects[u] = np.nan
 
-    # Remove NaN values for statistics
+    # Remove NaN values for statistics and track failures
     valid_effects = [v for v in loo_effects.values() if not np.isnan(v)]
+    n_total = len(loo_effects)
+    n_failed = n_total - len(valid_effects)
 
     if len(valid_effects) == 0:
-        raise RuntimeError("All leave-one-out estimates failed")
+        raise RuntimeError(
+            f"All {n_total} leave-one-out estimates failed. This typically occurs when:\n"
+            f"  - Removing any single treated unit causes model fitting to fail\n"
+            f"  - Very few treated units (need at least 2 for LOO)\n"
+            f"  - Data has collinearity issues that manifest when units are removed\n"
+            f"Consider checking data quality and ensuring sufficient treated units."
+        )
+
+    # Warn if any LOO iterations failed, listing the affected units
+    if n_failed > 0:
+        import warnings
+        failed_units = [u for u, v in loo_effects.items() if np.isnan(v)]
+        warnings.warn(
+            f"{n_failed}/{n_total} leave-one-out estimates failed for units: {failed_units}. "
+            f"Results based on {len(valid_effects)} successful iterations.",
+            UserWarning,
+            stacklevel=2
+        )
 
     # Statistics of LOO distribution
     mean_effect = np.mean(valid_effects)
@@ -838,8 +876,13 @@ def run_all_placebo_tests(
             )
             results[f"fake_timing_{period}"] = test_result
         except Exception as e:
-            # Store error info
-            results[f"fake_timing_{period}"] = {"error": str(e)}
+            # Store structured error info for debugging
+            results[f"fake_timing_{period}"] = {
+                "error": str(e),
+                "error_type": type(e).__name__,
+                "test_type": "fake_timing",
+                "period": period
+            }
 
     # Permutation test
     try:
@@ -856,7 +899,11 @@ def run_all_placebo_tests(
         )
         results["permutation"] = perm_result
     except Exception as e:
-        results["permutation"] = {"error": str(e)}
+        results["permutation"] = {
+            "error": str(e),
+            "error_type": type(e).__name__,
+            "test_type": "permutation"
+        }
 
     # Leave-one-out test
     try:
@@ -871,6 +918,10 @@ def run_all_placebo_tests(
         )
         results["leave_one_out"] = loo_result
     except Exception as e:
-        results["leave_one_out"] = {"error": str(e)}
+        results["leave_one_out"] = {
+            "error": str(e),
+            "error_type": type(e).__name__,
+            "test_type": "leave_one_out"
+        }
 
     return results
diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py
@@ -19,6 +19,7 @@
 
 from diff_diff.results import DiDResults, MultiPeriodDiDResults, PeriodEffect
 from diff_diff.utils import (
+    WildBootstrapResults,
     compute_confidence_interval,
     compute_p_value,
     compute_robust_se,
@@ -279,22 +280,9 @@ def fit(
         if self.inference == "wild_bootstrap" and self.cluster is not None:
             # Wild cluster bootstrap for few-cluster inference
             cluster_ids = data[self.cluster].values
-            bootstrap_results = wild_bootstrap_se(
-                X, y, residuals, cluster_ids,
-                coefficient_index=att_idx,
-                n_bootstrap=self.n_bootstrap,
-                weight_type=self.bootstrap_weights,
-                alpha=self.alpha,
-                seed=self.seed,
-                return_distribution=False
+            se, p_value, conf_int, t_stat, vcov, _ = self._run_wild_bootstrap_inference(
+                X, y, residuals, cluster_ids, att_idx
             )
-            self._bootstrap_results = bootstrap_results
-            se = bootstrap_results.se
-            p_value = bootstrap_results.p_value
-            conf_int = (bootstrap_results.ci_lower, bootstrap_results.ci_upper)
-            t_stat = bootstrap_results.t_stat_original
-            # Also compute vcov for storage (using cluster-robust for consistency)
-            vcov = compute_robust_se(X, residuals, cluster_ids)
         elif self.cluster is not None:
             cluster_ids = data[self.cluster].values
             vcov = compute_robust_se(X, residuals, cluster_ids)
@@ -408,6 +396,56 @@ def _fit_ols(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray
 
         return coefficients, residuals, fitted, r_squared
 
+    def _run_wild_bootstrap_inference(
+        self,
+        X: np.ndarray,
+        y: np.ndarray,
+        residuals: np.ndarray,
+        cluster_ids: np.ndarray,
+        coefficient_index: int,
+    ) -> Tuple[float, float, Tuple[float, float], float, np.ndarray, WildBootstrapResults]:
+        """
+        Run wild cluster bootstrap inference.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            Design matrix.
+        y : np.ndarray
+            Outcome vector.
+        residuals : np.ndarray
+            OLS residuals.
+        cluster_ids : np.ndarray
+            Cluster identifiers for each observation.
+        coefficient_index : int
+            Index of the coefficient to compute inference for.
+
+        Returns
+        -------
+        tuple
+            (se, p_value, conf_int, t_stat, vcov, bootstrap_results)
+        """
+        bootstrap_results = wild_bootstrap_se(
+            X, y, residuals, cluster_ids,
+            coefficient_index=coefficient_index,
+            n_bootstrap=self.n_bootstrap,
+            weight_type=self.bootstrap_weights,
+            alpha=self.alpha,
+            seed=self.seed,
+            return_distribution=False
+        )
+        self._bootstrap_results = bootstrap_results
+
+        se = bootstrap_results.se
+        p_value = bootstrap_results.p_value
+        conf_int = (bootstrap_results.ci_lower, bootstrap_results.ci_upper)
+        t_stat = bootstrap_results.t_stat_original
+
+        # Also compute vcov for storage (using cluster-robust for consistency)
+        vcov = compute_robust_se(X, residuals, cluster_ids)
+
+        return se, p_value, conf_int, t_stat, vcov, bootstrap_results
+
     def _parse_formula(
         self, formula: str, data: pd.DataFrame
     ) -> Tuple[str, str, str, Optional[List[str]]]:
diff --git a/diff_diff/prep.py b/diff_diff/prep.py
@@ -601,7 +601,7 @@ def summarize_did_data(
     if len(time_vals) == 2:
         pre_val, post_val = time_vals[0], time_vals[1]
 
-        def format_label(x):
+        def format_label(x: tuple) -> str:
             treatment_label = 'Treated' if x[0] == 1 else 'Control'
             time_label = 'Post' if x[1] == post_val else 'Pre'
             return f"{treatment_label} - {time_label}"
diff --git a/diff_diff/staggered.py b/diff_diff/staggered.py
@@ -169,14 +169,14 @@ def _logistic_regression(
     # Add intercept
     X_with_intercept = np.column_stack([np.ones(n), X])
 
-    def neg_log_likelihood(beta):
+    def neg_log_likelihood(beta: np.ndarray) -> float:
         z = X_with_intercept @ beta
         # Clip to prevent overflow
         z = np.clip(z, -500, 500)
         log_lik = np.sum(y * z - np.log(1 + np.exp(z)))
         return -log_lik
 
-    def gradient(beta):
+    def gradient(beta: np.ndarray) -> np.ndarray:
         z = X_with_intercept @ beta
         z = np.clip(z, -500, 500)
         probs = 1 / (1 + np.exp(-z))
diff --git a/diff_diff/synthetic_did.py b/diff_diff/synthetic_did.py
@@ -503,10 +503,33 @@ def _bootstrap_se(
 
         bootstrap_estimates = np.array(bootstrap_estimates)
 
-        # Warn if too many bootstrap iterations failed
+        # Check bootstrap success rate and handle failures appropriately
         n_successful = len(bootstrap_estimates)
         failure_rate = 1 - (n_successful / self.n_bootstrap)
-        if failure_rate > 0.05:
+
+        if n_successful == 0:
+            raise ValueError(
+                f"All {self.n_bootstrap} bootstrap iterations failed. "
+                f"This typically occurs when:\n"
+                f"  - Sample size is too small for reliable resampling\n"
+                f"  - Weight matrices are singular or near-singular\n"
+                f"  - Insufficient pre-treatment periods for weight estimation\n"
+                f"  - Too few control units relative to treated units\n"
+                f"Consider using n_bootstrap=0 to disable bootstrap inference "
+                f"and rely on placebo-based standard errors, or increase "
+                f"the regularization parameters (lambda_reg, zeta)."
+            )
+        elif n_successful == 1:
+            warnings.warn(
+                f"Only 1/{self.n_bootstrap} bootstrap iteration succeeded. "
+                f"Standard error cannot be computed reliably (requires at least 2). "
+                f"Returning SE=0.0. Consider the suggestions above for improving "
+                f"bootstrap convergence.",
+                UserWarning,
+                stacklevel=2,
+            )
+            se = 0.0
+        elif failure_rate > 0.05:
             warnings.warn(
                 f"Only {n_successful}/{self.n_bootstrap} bootstrap iterations succeeded "
                 f"({failure_rate:.1%} failure rate). Standard errors may be unreliable. "
@@ -515,8 +538,9 @@ def _bootstrap_se(
                 UserWarning,
                 stacklevel=2,
             )
-
-        se = np.std(bootstrap_estimates, ddof=1) if len(bootstrap_estimates) > 1 else 0.0
+            se = np.std(bootstrap_estimates, ddof=1)
+        else:
+            se = np.std(bootstrap_estimates, ddof=1)
 
         return se, bootstrap_estimates
 
diff --git a/diff_diff/twfe.py b/diff_diff/twfe.py
@@ -17,7 +17,6 @@
     compute_confidence_interval,
     compute_p_value,
     compute_robust_se,
-    wild_bootstrap_se,
 )
 
 
@@ -132,21 +131,9 @@ def fit(  # type: ignore[override]
         cluster_ids = data[cluster_var].values
         if self.inference == "wild_bootstrap":
             # Wild cluster bootstrap for few-cluster inference
-            bootstrap_results = wild_bootstrap_se(
-                X, y, residuals, cluster_ids,
-                coefficient_index=att_idx,
-                n_bootstrap=self.n_bootstrap,
-                weight_type=self.bootstrap_weights,
-                alpha=self.alpha,
-                seed=self.seed,
-                return_distribution=False
+            se, p_value, conf_int, t_stat, vcov, _ = self._run_wild_bootstrap_inference(
+                X, y, residuals, cluster_ids, att_idx
             )
-            self._bootstrap_results = bootstrap_results
-            se = bootstrap_results.se
-            p_value = bootstrap_results.p_value
-            conf_int = (bootstrap_results.ci_lower, bootstrap_results.ci_upper)
-            t_stat = bootstrap_results.t_stat_original
-            vcov = compute_robust_se(X, residuals, cluster_ids)
         else:
             # Standard cluster-robust SE
             vcov = compute_robust_se(X, residuals, cluster_ids)
diff --git a/diff_diff/utils.py b/diff_diff/utils.py
@@ -590,7 +590,7 @@ def check_parallel_trends(
     control_data = pre_data[pre_data[treatment_group] == 0]
 
     # Simple linear regression for trends
-    def compute_trend(group_data):
+    def compute_trend(group_data: pd.DataFrame) -> Tuple[float, float]:
         time_values = group_data[time].values
         outcome_values = group_data[outcome].values
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "diff-diff"
-version = "1.1.0"
+version = "1.1.1"
 description = "A library for Difference-in-Differences causal inference analysis"
 readme = "README.md"
 license = "MIT"

Original file line number	Diff line number	Diff line change
`@@ -90,7 +90,7 @@`
`90`	`90`	`plot_sensitivity,`
`91`	`91`	`)`
`92`	`92`
`93`		`-__version__ = "1.1.0"`
	`93`	`+__version__ = "1.1.1"`
`94`	`94`	`__all__ = [`
`95`	`95`	`# Estimators`
`96`	`96`	`"DifferenceInDifferences",`