Merge pull request #2 from igerber/claude/init-did-library-pvNmf

igerber · web-flow · commit 860f8c87e909 · 2026-01-01T16:55:23.000-05:00
Add fixed effects and absorb parameters to DifferenceInDifferences
diff --git a/README.md b/README.md
@@ -117,6 +117,51 @@ results = did.fit(
 )
 ```
 
+### Fixed Effects
+
+Use `fixed_effects` for low-dimensional categorical controls (creates dummy variables, dropping the first level of each column):
+
+```python
+# State and industry fixed effects
+results = did.fit(
+    data,
+    outcome='sales',
+    treatment='treated',
+    time='post',
+    fixed_effects=['state', 'industry']
+)
+
+# Access fixed effect coefficients
+state_coefs = {k: v for k, v in results.coefficients.items() if k.startswith('state_')}
+```
+
+Use `absorb` for high-dimensional fixed effects (more efficient: applies a within-transformation instead of creating dummy columns):
+
+```python
+# Absorb firm-level fixed effects (efficient for many firms)
+results = did.fit(
+    data,
+    outcome='sales',
+    treatment='treated',
+    time='post',
+    absorb=['firm_id']
+)
+```
+
+Combine covariates with fixed effects:
+
+```python
+results = did.fit(
+    data,
+    outcome='sales',
+    treatment='treated',
+    time='post',
+    covariates=['size', 'age'],           # Linear controls
+    fixed_effects=['industry'],            # Low-dimensional FE (dummies)
+    absorb=['firm_id']                     # High-dimensional FE (absorbed)
+)
+```
+
 ### Cluster-Robust Standard Errors
 
 ```python
@@ -222,12 +267,25 @@ DifferenceInDifferences(
 
 | Method | Description |
 |--------|-------------|
-| `fit(data, outcome, treatment, time, formula, covariates)` | Fit the DiD model |
+| `fit(data, outcome, treatment, time, ...)` | Fit the DiD model |
 | `summary()` | Get formatted summary string |
 | `print_summary()` | Print summary to stdout |
 | `get_params()` | Get estimator parameters (sklearn-compatible) |
 | `set_params(**params)` | Set estimator parameters (sklearn-compatible) |
 
+**fit() Parameters:**
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `data` | DataFrame | Input data |
+| `outcome` | str | Outcome variable column name |
+| `treatment` | str | Treatment indicator column (0/1) |
+| `time` | str | Post-treatment indicator column (0/1) |
+| `formula` | str | R-style formula (alternative to column names) |
+| `covariates` | list | Linear control variables |
+| `fixed_effects` | list | Categorical FE columns (creates dummies) |
+| `absorb` | list | High-dimensional FE (within-transformation) |
+
 ### DiDResults
 
 **Attributes:**
diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py
@@ -105,7 +105,9 @@ def fit(
         treatment: str = None,
         time: str = None,
         formula: str = None,
-        covariates: list = None
+        covariates: list = None,
+        fixed_effects: list = None,
+        absorb: list = None
     ) -> DiDResults:
         """
         Fit the Difference-in-Differences model.
@@ -124,7 +126,15 @@ def fit(
             R-style formula (e.g., "outcome ~ treated * post").
             If provided, overrides outcome, treatment, and time parameters.
         covariates : list, optional
-            List of covariate column names to include in the regression.
+            List of covariate column names to include as linear controls.
+        fixed_effects : list, optional
+            List of categorical column names to include as fixed effects.
+            Creates dummy variables for each category (drops first level).
+            Use for low-dimensional fixed effects (e.g., industry, region).
+        absorb : list, optional
+            List of categorical column names for high-dimensional fixed effects.
+            Demeans the outcome and covariates within each group instead of adding dummy variables.
+            More efficient for large numbers of categories (e.g., firm, individual).
 
         Returns
         -------
@@ -135,6 +145,18 @@ def fit(
         ------
         ValueError
             If required parameters are missing or data validation fails.
+
+        Examples
+        --------
+        Using fixed effects (dummy variables):
+
+        >>> did.fit(data, outcome='sales', treatment='treated', time='post',
+        ...         fixed_effects=['state', 'industry'])
+
+        Using absorbed fixed effects (within-transformation):
+
+        >>> did.fit(data, outcome='sales', treatment='treated', time='post',
+        ...         absorb=['firm_id'])
         """
         # Parse formula if provided
         if formula is not None:
@@ -147,10 +169,35 @@ def fit(
         # Validate inputs
         self._validate_data(data, outcome, treatment, time, covariates)
 
+        # Validate fixed effects and absorb columns
+        if fixed_effects:
+            for fe in fixed_effects:
+                if fe not in data.columns:
+                    raise ValueError(f"Fixed effect column '{fe}' not found in data")
+        if absorb:
+            for ab in absorb:
+                if ab not in data.columns:
+                    raise ValueError(f"Absorb column '{ab}' not found in data")
+
+        # Handle absorbed fixed effects (within-transformation)
+        working_data = data.copy()
+        absorbed_vars = []
+        n_absorbed_effects = 0
+
+        if absorb:
+            # Demean the outcome and covariates by each absorb column in turn
+            vars_to_demean = [outcome] + (covariates or [])
+            for ab_var in absorb:
+                n_absorbed_effects += working_data[ab_var].nunique() - 1
+                for var in vars_to_demean:
+                    group_means = working_data.groupby(ab_var)[var].transform("mean")
+                    working_data[var] = working_data[var] - group_means
+                absorbed_vars.append(ab_var)
+
         # Extract variables
-        y = data[outcome].values.astype(float)
-        d = data[treatment].values.astype(float)
-        t = data[time].values.astype(float)
+        y = working_data[outcome].values.astype(float)
+        d = working_data[treatment].values.astype(float)
+        t = working_data[time].values.astype(float)
 
         # Validate binary variables
         validate_binary(d, "treatment")
@@ -166,9 +213,18 @@ def fit(
         # Add covariates if provided
         if covariates:
             for cov in covariates:
-                X = np.column_stack([X, data[cov].values.astype(float)])
+                X = np.column_stack([X, working_data[cov].values.astype(float)])
                 var_names.append(cov)
 
+        # Add fixed effects as dummy variables
+        if fixed_effects:
+            for fe in fixed_effects:
+                # Create dummies, drop first category to avoid multicollinearity
+                dummies = pd.get_dummies(data[fe], prefix=fe, drop_first=True)
+                for col in dummies.columns:
+                    X = np.column_stack([X, dummies[col].values.astype(float)])
+                    var_names.append(col)
+
         # Fit OLS
         coefficients, residuals, fitted, r_squared = self._fit_ols(X, y)
 
@@ -190,8 +246,8 @@ def fit(
         att = coefficients[att_idx]
         se = np.sqrt(vcov[att_idx, att_idx])
 
-        # Compute test statistics
-        df = len(y) - X.shape[1]
+        # Compute test statistics (adjust df for absorbed fixed effects)
+        df = len(y) - X.shape[1] - n_absorbed_effects
         t_stat = att / se
         p_value = compute_p_value(t_stat, df=df)
         conf_int = compute_confidence_interval(att, se, self.alpha, df=df)
diff --git a/tests/test_estimators.py b/tests/test_estimators.py
@@ -276,3 +276,179 @@ def test_is_significant_property(self, simple_did_data):
         assert isinstance(results.is_significant, bool)
         # With true effect, should be significant
         assert results.is_significant
+
+
+class TestFixedEffects:
+    """Tests for fixed effects functionality."""
+
+    @pytest.fixture
+    def panel_data_with_fe(self):
+        """Create panel data with fixed effects."""
+        np.random.seed(42)
+        n_units = 50
+        n_periods = 4
+        n_states = 5
+
+        data = []
+        for unit in range(n_units):
+            state = unit % n_states
+            is_treated = unit < n_units // 2
+            # State-level effect
+            state_effect = state * 2.0
+
+            for period in range(n_periods):
+                post = 1 if period >= 2 else 0
+
+                y = 10.0 + state_effect + period * 0.5
+                if is_treated and post:
+                    y += 3.0  # True ATT
+
+                y += np.random.normal(0, 0.5)
+
+                data.append({
+                    "unit": unit,
+                    "state": f"state_{state}",
+                    "period": period,
+                    "treated": int(is_treated),
+                    "post": post,
+                    "outcome": y,
+                })
+
+        return pd.DataFrame(data)
+
+    def test_fixed_effects_dummy(self, panel_data_with_fe):
+        """Test fixed effects using dummy variables."""
+        did = DifferenceInDifferences()
+        results = did.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post",
+            fixed_effects=["state"]
+        )
+
+        assert results is not None
+        assert did.is_fitted_
+        # ATT should still be close to 3.0
+        assert abs(results.att - 3.0) < 1.0
+
+    def test_fixed_effects_coefficients_include_dummies(self, panel_data_with_fe):
+        """Test that dummy coefficients are included in results."""
+        did = DifferenceInDifferences()
+        results = did.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post",
+            fixed_effects=["state"]
+        )
+
+        # Should have state dummy coefficients
+        state_coefs = [k for k in results.coefficients.keys() if k.startswith("state_")]
+        assert len(state_coefs) == 4  # 5 states - 1 (dropped first)
+
+    def test_absorb_fixed_effects(self, panel_data_with_fe):
+        """Test absorbed (within-transformed) fixed effects."""
+        did = DifferenceInDifferences()
+        results = did.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post",
+            absorb=["unit"]
+        )
+
+        assert results is not None
+        assert did.is_fitted_
+        # ATT should still be close to 3.0
+        assert abs(results.att - 3.0) < 1.0
+
+    def test_fixed_effects_vs_no_fe(self, panel_data_with_fe):
+        """Test that adding FE keeps the ATT positive and does not lower R-squared."""
+        did_no_fe = DifferenceInDifferences()
+        did_with_fe = DifferenceInDifferences()
+
+        results_no_fe = did_no_fe.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post"
+        )
+
+        results_with_fe = did_with_fe.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post",
+            fixed_effects=["state"]
+        )
+
+        # Both should estimate positive ATT
+        assert results_no_fe.att > 0
+        assert results_with_fe.att > 0
+
+        # FE model's R-squared should be at least as high as the no-FE model's
+        assert results_with_fe.r_squared >= results_no_fe.r_squared
+
+    def test_invalid_fixed_effects_column(self, panel_data_with_fe):
+        """Test error when fixed effects column doesn't exist."""
+        did = DifferenceInDifferences()
+        with pytest.raises(ValueError, match="not found"):
+            did.fit(
+                panel_data_with_fe,
+                outcome="outcome",
+                treatment="treated",
+                time="post",
+                fixed_effects=["nonexistent_column"]
+            )
+
+    def test_invalid_absorb_column(self, panel_data_with_fe):
+        """Test error when absorb column doesn't exist."""
+        did = DifferenceInDifferences()
+        with pytest.raises(ValueError, match="not found"):
+            did.fit(
+                panel_data_with_fe,
+                outcome="outcome",
+                treatment="treated",
+                time="post",
+                absorb=["nonexistent_column"]
+            )
+
+    def test_multiple_fixed_effects(self, panel_data_with_fe):
+        """Test multiple fixed effects."""
+        # Add another categorical variable
+        panel_data_with_fe["industry"] = panel_data_with_fe["unit"] % 3
+
+        did = DifferenceInDifferences()
+        results = did.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post",
+            fixed_effects=["state", "industry"]
+        )
+
+        assert results is not None
+        # Should have both state and industry dummies
+        state_coefs = [k for k in results.coefficients.keys() if k.startswith("state_")]
+        industry_coefs = [k for k in results.coefficients.keys() if k.startswith("industry_")]
+        assert len(state_coefs) > 0
+        assert len(industry_coefs) > 0
+
+    def test_covariates_with_fixed_effects(self, panel_data_with_fe):
+        """Test combining covariates with fixed effects."""
+        # Add a continuous covariate
+        panel_data_with_fe["size"] = np.random.normal(100, 10, len(panel_data_with_fe))
+
+        did = DifferenceInDifferences()
+        results = did.fit(
+            panel_data_with_fe,
+            outcome="outcome",
+            treatment="treated",
+            time="post",
+            covariates=["size"],
+            fixed_effects=["state"]
+        )
+
+        assert results is not None
+        assert "size" in results.coefficients