Add robust parallel trends testing with Wasserstein distance

claude · claude · commit 2889a84e9b74 · 2026-01-01T22:00:10.000Z
- Add check_parallel_trends_robust() using Wasserstein (Earth Mover's) distance
  for distributional comparison of pre-treatment outcome changes
- Include permutation-based p-value for statistical inference
- Add Kolmogorov-Smirnov test as complementary distributional test
- Add equivalence_test_trends() using TOST procedure
- Compute normalized Wasserstein and variance ratio diagnostics
- Add 9 new tests for robust parallel trends functionality
- Update README with usage examples for all three approaches

The Wasserstein distance is more robust than simple slope comparisons
because it captures differences in the full distribution shape, not
just means, making it better suited for heterogeneous effects.
diff --git a/README.md b/README.md
@@ -235,6 +235,8 @@ results.r_squared
 
 ### Parallel Trends
 
+**Simple slope-based test:**
+
 ```python
 from diff_diff.utils import check_parallel_trends
 
@@ -248,7 +250,51 @@ trends = check_parallel_trends(
 print(f"Treated trend: {trends['treated_trend']:.4f}")
 print(f"Control trend: {trends['control_trend']:.4f}")
 print(f"Difference p-value: {trends['p_value']:.4f}")
-print(f"Parallel trends plausible: {trends['parallel_trends_plausible']}")
+```
+
+**Robust distributional test (Wasserstein distance):**
+
+```python
+from diff_diff.utils import check_parallel_trends_robust
+
+results = check_parallel_trends_robust(
+    data,
+    outcome='outcome',
+    time='period',
+    treatment_group='treated',
+    unit='firm_id',              # Unit identifier for panel data
+    pre_periods=[2018, 2019],    # Pre-treatment periods
+    n_permutations=1000          # Permutations for p-value
+)
+
+print(f"Wasserstein distance: {results['wasserstein_distance']:.4f}")
+print(f"Wasserstein p-value: {results['wasserstein_p_value']:.4f}")
+print(f"KS test p-value: {results['ks_p_value']:.4f}")
+print(f"Parallel trends plausible: {results['parallel_trends_plausible']}")
+```
+
+The Wasserstein (Earth Mover's) distance compares the full distribution of outcome changes, not just means. This is more robust to:
+- Non-normal distributions
+- Heterogeneous effects across units
+- Outliers
+
+**Equivalence testing (TOST):**
+
+```python
+from diff_diff.utils import equivalence_test_trends
+
+results = equivalence_test_trends(
+    data,
+    outcome='outcome',
+    time='period',
+    treatment_group='treated',
+    unit='firm_id',
+    equivalence_margin=0.5       # Define "practically equivalent"
+)
+
+print(f"Mean difference: {results['mean_difference']:.4f}")
+print(f"TOST p-value: {results['tost_p_value']:.4f}")
+print(f"Trends equivalent: {results['equivalent']}")
 ```
 
 ## API Reference
diff --git a/diff_diff/utils.py b/diff_diff/utils.py
@@ -248,3 +248,334 @@ def compute_trend(group_data):
         "p_value": p_value,
         "parallel_trends_plausible": p_value > 0.05 if not np.isnan(p_value) else None,
     }
+
+
+def check_parallel_trends_robust(
+    data: pd.DataFrame,
+    outcome: str,
+    time: str,
+    treatment_group: str,
+    unit: str = None,
+    pre_periods: list = None,
+    n_permutations: int = 1000,
+    seed: int = None
+) -> dict:
+    """
+    Perform robust parallel trends testing using distributional comparisons.
+
+    Uses the Wasserstein (Earth Mover's) distance to compare the full
+    distribution of outcome changes between treated and control groups,
+    with permutation-based inference.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Panel data with repeated observations over time.
+    outcome : str
+        Name of outcome variable column.
+    time : str
+        Name of time period column.
+    treatment_group : str
+        Name of treatment group indicator column (0/1).
+    unit : str, optional
+        Name of unit identifier column. If provided, computes unit-level
+        changes. Otherwise uses observation-level data.
+    pre_periods : list, optional
+        List of pre-treatment time periods. If None, uses first half of periods.
+    n_permutations : int, default=1000
+        Number of permutations for computing p-value.
+    seed : int, optional
+        Random seed for reproducibility.
+
+    Returns
+    -------
+    dict
+        Dictionary containing:
+        - wasserstein_distance: Wasserstein distance between group distributions
+        - wasserstein_p_value: Permutation-based p-value
+        - ks_statistic: Kolmogorov-Smirnov test statistic
+        - ks_p_value: KS test p-value
+        - mean_difference: Difference in mean changes
+        - variance_ratio: Ratio of variances in changes
+        - treated_changes: Array of outcome changes for treated
+        - control_changes: Array of outcome changes for control
+        - parallel_trends_plausible: Boolean assessment
+
+    Examples
+    --------
+    >>> results = check_parallel_trends_robust(
+    ...     data, outcome='sales', time='year',
+    ...     treatment_group='treated', unit='firm_id'
+    ... )
+    >>> print(f"Wasserstein distance: {results['wasserstein_distance']:.4f}")
+    >>> print(f"P-value: {results['wasserstein_p_value']:.4f}")
+
+    Notes
+    -----
+    The Wasserstein distance (Earth Mover's Distance) measures the minimum
+    "cost" of transforming one distribution into another. Unlike simple
+    mean comparisons, it captures differences in the entire distribution
+    shape, making it more robust to non-normal data and heterogeneous effects.
+
+    A small Wasserstein distance and high p-value suggest the distributions
+    of pre-treatment changes are similar, supporting the parallel trends
+    assumption.
+    """
+    if seed is not None:
+        np.random.seed(seed)
+
+    # Identify pre-treatment periods
+    if pre_periods is None:
+        all_periods = sorted(data[time].unique())
+        mid_point = len(all_periods) // 2
+        pre_periods = all_periods[:mid_point]
+
+    pre_data = data[data[time].isin(pre_periods)].copy()
+
+    # Compute outcome changes
+    treated_changes, control_changes = _compute_outcome_changes(
+        pre_data, outcome, time, treatment_group, unit
+    )
+
+    if len(treated_changes) < 2 or len(control_changes) < 2:
+        return {
+            "wasserstein_distance": np.nan,
+            "wasserstein_p_value": np.nan,
+            "ks_statistic": np.nan,
+            "ks_p_value": np.nan,
+            "mean_difference": np.nan,
+            "variance_ratio": np.nan,
+            "treated_changes": treated_changes,
+            "control_changes": control_changes,
+            "parallel_trends_plausible": None,
+            "error": "Insufficient data for comparison",
+        }
+
+    # Compute Wasserstein distance
+    wasserstein_dist = stats.wasserstein_distance(treated_changes, control_changes)
+
+    # Permutation test for Wasserstein distance
+    all_changes = np.concatenate([treated_changes, control_changes])
+    n_treated = len(treated_changes)
+    n_total = len(all_changes)
+
+    permuted_distances = np.zeros(n_permutations)
+    for i in range(n_permutations):
+        perm_idx = np.random.permutation(n_total)
+        perm_treated = all_changes[perm_idx[:n_treated]]
+        perm_control = all_changes[perm_idx[n_treated:]]
+        permuted_distances[i] = stats.wasserstein_distance(perm_treated, perm_control)
+
+    # P-value: proportion of permuted distances >= observed
+    wasserstein_p = np.mean(permuted_distances >= wasserstein_dist)
+
+    # Kolmogorov-Smirnov test
+    ks_stat, ks_p = stats.ks_2samp(treated_changes, control_changes)
+
+    # Additional summary statistics
+    mean_diff = np.mean(treated_changes) - np.mean(control_changes)
+    var_treated = np.var(treated_changes, ddof=1)
+    var_control = np.var(control_changes, ddof=1)
+    var_ratio = var_treated / var_control if var_control > 0 else np.nan
+
+    # Normalized Wasserstein (relative to pooled std)
+    pooled_std = np.std(all_changes, ddof=1)
+    wasserstein_normalized = wasserstein_dist / pooled_std if pooled_std > 0 else np.nan
+
+    # Assessment: parallel trends plausible if p-value > 0.05
+    # and normalized Wasserstein is small (< 0.2 as rule of thumb)
+    plausible = bool(
+        wasserstein_p > 0.05 and
+        (wasserstein_normalized < 0.2 if not np.isnan(wasserstein_normalized) else True)
+    )
+
+    return {
+        "wasserstein_distance": wasserstein_dist,
+        "wasserstein_normalized": wasserstein_normalized,
+        "wasserstein_p_value": wasserstein_p,
+        "ks_statistic": ks_stat,
+        "ks_p_value": ks_p,
+        "mean_difference": mean_diff,
+        "variance_ratio": var_ratio,
+        "n_treated": len(treated_changes),
+        "n_control": len(control_changes),
+        "treated_changes": treated_changes,
+        "control_changes": control_changes,
+        "parallel_trends_plausible": plausible,
+    }
+
+
+def _compute_outcome_changes(
+    data: pd.DataFrame,
+    outcome: str,
+    time: str,
+    treatment_group: str,
+    unit: str = None
+) -> tuple:
+    """
+    Compute period-to-period outcome changes for treated and control groups.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Panel data.
+    outcome : str
+        Outcome variable column.
+    time : str
+        Time period column.
+    treatment_group : str
+        Treatment group indicator column.
+    unit : str, optional
+        Unit identifier column.
+
+    Returns
+    -------
+    tuple
+        (treated_changes, control_changes) as numpy arrays.
+    """
+    if unit is not None:
+        # Unit-level changes: compute change for each unit across periods
+        data_sorted = data.sort_values([unit, time])
+        data_sorted["_outcome_change"] = data_sorted.groupby(unit)[outcome].diff()
+
+        # Remove NaN from first period of each unit
+        changes_data = data_sorted.dropna(subset=["_outcome_change"])
+
+        treated_changes = changes_data[
+            changes_data[treatment_group] == 1
+        ]["_outcome_change"].values
+
+        control_changes = changes_data[
+            changes_data[treatment_group] == 0
+        ]["_outcome_change"].values
+    else:
+        # Aggregate changes: compute mean change per period per group
+        periods = sorted(data[time].unique())
+
+        treated_data = data[data[treatment_group] == 1]
+        control_data = data[data[treatment_group] == 0]
+
+        # Compute period means
+        treated_means = treated_data.groupby(time)[outcome].mean()
+        control_means = control_data.groupby(time)[outcome].mean()
+
+        # Compute changes between consecutive periods
+        treated_changes = np.diff(treated_means.values)
+        control_changes = np.diff(control_means.values)
+
+    return treated_changes.astype(float), control_changes.astype(float)
+
+
+def equivalence_test_trends(
+    data: pd.DataFrame,
+    outcome: str,
+    time: str,
+    treatment_group: str,
+    unit: str = None,
+    pre_periods: list = None,
+    equivalence_margin: float = None
+) -> dict:
+    """
+    Perform equivalence testing (TOST) for parallel trends.
+
+    Tests whether the difference in trends is practically equivalent to zero
+    using Two One-Sided Tests (TOST) procedure.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Panel data.
+    outcome : str
+        Name of outcome variable column.
+    time : str
+        Name of time period column.
+    treatment_group : str
+        Name of treatment group indicator column.
+    unit : str, optional
+        Name of unit identifier column.
+    pre_periods : list, optional
+        List of pre-treatment time periods.
+    equivalence_margin : float, optional
+        The margin for equivalence (delta). If None, uses 0.5 * pooled SD
+        of outcome changes as a default.
+
+    Returns
+    -------
+    dict
+        Dictionary containing:
+        - mean_difference: Difference in mean changes
+        - equivalence_margin: The margin used
+        - lower_p_value: P-value for lower bound test
+        - upper_p_value: P-value for upper bound test
+        - tost_p_value: Maximum of the two p-values
+        - equivalent: Boolean indicating equivalence at alpha=0.05
+    """
+    # Get pre-treatment periods
+    if pre_periods is None:
+        all_periods = sorted(data[time].unique())
+        mid_point = len(all_periods) // 2
+        pre_periods = all_periods[:mid_point]
+
+    pre_data = data[data[time].isin(pre_periods)].copy()
+
+    # Compute outcome changes
+    treated_changes, control_changes = _compute_outcome_changes(
+        pre_data, outcome, time, treatment_group, unit
+    )
+
+    if len(treated_changes) < 2 or len(control_changes) < 2:
+        return {
+            "mean_difference": np.nan,
+            "equivalence_margin": np.nan,
+            "lower_p_value": np.nan,
+            "upper_p_value": np.nan,
+            "tost_p_value": np.nan,
+            "equivalent": None,
+            "error": "Insufficient data",
+        }
+
+    # Compute statistics
+    mean_diff = np.mean(treated_changes) - np.mean(control_changes)
+    se_diff = np.sqrt(
+        np.var(treated_changes, ddof=1) / len(treated_changes) +
+        np.var(control_changes, ddof=1) / len(control_changes)
+    )
+
+    # Set equivalence margin if not provided
+    if equivalence_margin is None:
+        pooled_changes = np.concatenate([treated_changes, control_changes])
+        equivalence_margin = 0.5 * np.std(pooled_changes, ddof=1)
+
+    # Degrees of freedom (Welch-Satterthwaite approximation)
+    var_t = np.var(treated_changes, ddof=1)
+    var_c = np.var(control_changes, ddof=1)
+    n_t = len(treated_changes)
+    n_c = len(control_changes)
+
+    df = ((var_t/n_t + var_c/n_c)**2 /
+          ((var_t/n_t)**2/(n_t-1) + (var_c/n_c)**2/(n_c-1)))
+
+    # TOST: Two one-sided tests
+    # Test 1: H0: diff <= -margin vs H1: diff > -margin
+    t_lower = (mean_diff - (-equivalence_margin)) / se_diff
+    p_lower = stats.t.sf(t_lower, df)
+
+    # Test 2: H0: diff >= margin vs H1: diff < margin
+    t_upper = (mean_diff - equivalence_margin) / se_diff
+    p_upper = stats.t.cdf(t_upper, df)
+
+    # TOST p-value is the maximum of the two
+    tost_p = max(p_lower, p_upper)
+
+    return {
+        "mean_difference": mean_diff,
+        "se_difference": se_diff,
+        "equivalence_margin": equivalence_margin,
+        "lower_t_stat": t_lower,
+        "upper_t_stat": t_upper,
+        "lower_p_value": p_lower,
+        "upper_p_value": p_upper,
+        "tost_p_value": tost_p,
+        "degrees_of_freedom": df,
+        "equivalent": bool(tost_p < 0.05),
+    }
diff --git a/tests/test_estimators.py b/tests/test_estimators.py