Skip to content

Commit e81d95b

Browse files
committed
Add fixed effects and absorb parameters to DifferenceInDifferences
- Add fixed_effects parameter for low-dimensional categorical FE (dummy variables)
- Add absorb parameter for high-dimensional FE (within-transformation)
- Properly adjust degrees of freedom for absorbed fixed effects
- Add comprehensive test suite for fixed effects functionality (8 new tests)
- Update README with fixed effects usage examples and API documentation
1 parent fbc0ec9 commit e81d95b

3 files changed

Lines changed: 299 additions & 9 deletions

File tree

README.md

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,51 @@ results = did.fit(
117117
)
118118
```
119119

120+
### Fixed Effects
121+
122+
Use `fixed_effects` for low-dimensional categorical controls (creates dummy variables):
123+
124+
```python
125+
# State and industry fixed effects
126+
results = did.fit(
127+
data,
128+
outcome='sales',
129+
treatment='treated',
130+
time='post',
131+
fixed_effects=['state', 'industry']
132+
)
133+
134+
# Access fixed effect coefficients
135+
state_coefs = {k: v for k, v in results.coefficients.items() if k.startswith('state_')}
136+
```
137+
138+
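Use `absorb` for high-dimensional fixed effects. The effects are removed by a within-transformation (group demeaning) instead of dummy variables, which is more efficient but means no coefficients are reported for the absorbed categories: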
139+
140+
```python
141+
# Absorb firm-level fixed effects (efficient for many firms)
142+
results = did.fit(
143+
data,
144+
outcome='sales',
145+
treatment='treated',
146+
time='post',
147+
absorb=['firm_id']
148+
)
149+
```
150+
151+
Combine covariates with fixed effects:
152+
153+
```python
154+
results = did.fit(
155+
data,
156+
outcome='sales',
157+
treatment='treated',
158+
time='post',
159+
covariates=['size', 'age'], # Linear controls
160+
fixed_effects=['industry'], # Low-dimensional FE (dummies)
161+
absorb=['firm_id'] # High-dimensional FE (absorbed)
162+
)
163+
```
164+
120165
### Cluster-Robust Standard Errors
121166

122167
```python
@@ -222,12 +267,25 @@ DifferenceInDifferences(
222267

223268
| Method | Description |
224269
|--------|-------------|
225-
| `fit(data, outcome, treatment, time, formula, covariates)` | Fit the DiD model |
270+
| `fit(data, outcome, treatment, time, ...)` | Fit the DiD model |
226271
| `summary()` | Get formatted summary string |
227272
| `print_summary()` | Print summary to stdout |
228273
| `get_params()` | Get estimator parameters (sklearn-compatible) |
229274
| `set_params(**params)` | Set estimator parameters (sklearn-compatible) |
230275

276+
**fit() Parameters:**
277+
278+
| Parameter | Type | Description |
279+
|-----------|------|-------------|
280+
| `data` | DataFrame | Input data |
281+
| `outcome` | str | Outcome variable column name |
282+
| `treatment` | str | Treatment indicator column (0/1) |
283+
| `time` | str | Post-treatment indicator column (0/1) |
284+
| `formula` | str | R-style formula (alternative to column names) |
285+
| `covariates` | list | Linear control variables |
286+
| `fixed_effects` | list | Categorical FE columns (creates dummies) |
287+
| `absorb` | list | High-dimensional FE (within-transformation) |
288+
231289
### DiDResults
232290

233291
**Attributes:**

diff_diff/estimators.py

Lines changed: 64 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,9 @@ def fit(
105105
treatment: str = None,
106106
time: str = None,
107107
formula: str = None,
108-
covariates: list = None
108+
covariates: list = None,
109+
fixed_effects: list = None,
110+
absorb: list = None
109111
) -> DiDResults:
110112
"""
111113
Fit the Difference-in-Differences model.
@@ -124,7 +126,15 @@ def fit(
124126
R-style formula (e.g., "outcome ~ treated * post").
125127
If provided, overrides outcome, treatment, and time parameters.
126128
covariates : list, optional
127-
List of covariate column names to include in the regression.
129+
List of covariate column names to include as linear controls.
130+
fixed_effects : list, optional
131+
List of categorical column names to include as fixed effects.
132+
Creates dummy variables for each category (drops first level).
133+
Use for low-dimensional fixed effects (e.g., industry, region).
134+
absorb : list, optional
135+
List of categorical column names for high-dimensional fixed effects.
136+
Uses within-transformation (demeaning) instead of dummy variables.
137+
More efficient for large numbers of categories (e.g., firm, individual).
128138
129139
Returns
130140
-------
@@ -135,6 +145,18 @@ def fit(
135145
------
136146
ValueError
137147
If required parameters are missing or data validation fails.
148+
149+
Examples
150+
--------
151+
Using fixed effects (dummy variables):
152+
153+
>>> did.fit(data, outcome='sales', treatment='treated', time='post',
154+
... fixed_effects=['state', 'industry'])
155+
156+
Using absorbed fixed effects (within-transformation):
157+
158+
>>> did.fit(data, outcome='sales', treatment='treated', time='post',
159+
... absorb=['firm_id'])
138160
"""
139161
# Parse formula if provided
140162
if formula is not None:
@@ -147,10 +169,35 @@ def fit(
147169
# Validate inputs
148170
self._validate_data(data, outcome, treatment, time, covariates)
149171

172+
# Validate fixed effects and absorb columns
173+
if fixed_effects:
174+
for fe in fixed_effects:
175+
if fe not in data.columns:
176+
raise ValueError(f"Fixed effect column '{fe}' not found in data")
177+
if absorb:
178+
for ab in absorb:
179+
if ab not in data.columns:
180+
raise ValueError(f"Absorb column '{ab}' not found in data")
181+
182+
# Handle absorbed fixed effects (within-transformation)
183+
working_data = data.copy()
184+
absorbed_vars = []
185+
n_absorbed_effects = 0
186+
187+
if absorb:
188+
# Apply within-transformation for each absorbed variable
189+
vars_to_demean = [outcome] + (covariates or [])
190+
for ab_var in absorb:
191+
n_absorbed_effects += working_data[ab_var].nunique() - 1
192+
for var in vars_to_demean:
193+
group_means = working_data.groupby(ab_var)[var].transform("mean")
194+
working_data[var] = working_data[var] - group_means
195+
absorbed_vars.append(ab_var)
196+
150197
# Extract variables
151-
y = data[outcome].values.astype(float)
152-
d = data[treatment].values.astype(float)
153-
t = data[time].values.astype(float)
198+
y = working_data[outcome].values.astype(float)
199+
d = working_data[treatment].values.astype(float)
200+
t = working_data[time].values.astype(float)
154201

155202
# Validate binary variables
156203
validate_binary(d, "treatment")
@@ -166,9 +213,18 @@ def fit(
166213
# Add covariates if provided
167214
if covariates:
168215
for cov in covariates:
169-
X = np.column_stack([X, data[cov].values.astype(float)])
216+
X = np.column_stack([X, working_data[cov].values.astype(float)])
170217
var_names.append(cov)
171218

219+
# Add fixed effects as dummy variables
220+
if fixed_effects:
221+
for fe in fixed_effects:
222+
# Create dummies, drop first category to avoid multicollinearity
223+
dummies = pd.get_dummies(data[fe], prefix=fe, drop_first=True)
224+
for col in dummies.columns:
225+
X = np.column_stack([X, dummies[col].values.astype(float)])
226+
var_names.append(col)
227+
172228
# Fit OLS
173229
coefficients, residuals, fitted, r_squared = self._fit_ols(X, y)
174230

@@ -190,8 +246,8 @@ def fit(
190246
att = coefficients[att_idx]
191247
se = np.sqrt(vcov[att_idx, att_idx])
192248

193-
# Compute test statistics
194-
df = len(y) - X.shape[1]
249+
# Compute test statistics (adjust df for absorbed fixed effects)
250+
df = len(y) - X.shape[1] - n_absorbed_effects
195251
t_stat = att / se
196252
p_value = compute_p_value(t_stat, df=df)
197253
conf_int = compute_confidence_interval(att, se, self.alpha, df=df)

tests/test_estimators.py

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,3 +276,179 @@ def test_is_significant_property(self, simple_did_data):
276276
assert isinstance(results.is_significant, bool)
277277
# With true effect, should be significant
278278
assert results.is_significant
279+
280+
281+
class TestFixedEffects:
    """Exercise the ``fixed_effects`` and ``absorb`` options of ``fit``."""

    @pytest.fixture
    def panel_data_with_fe(self):
        """Return a balanced synthetic panel with a state-level shift.

        The panel has 50 units observed over 4 periods, spread across 5
        states. The first half of the units is treated, periods 2 and 3 are
        post-treatment, and treated post-period outcomes are raised by 3.0
        (the true ATT). Each outcome gets N(0, 0.5) noise drawn from the
        global NumPy RNG, which is seeded with 42.
        """
        np.random.seed(42)
        unit_count, period_count, state_count = 50, 4, 5

        rows = []
        for unit_id in range(unit_count):
            state_idx = unit_id % state_count
            treated_flag = unit_id < unit_count // 2
            # Level shift shared by every unit in the same state.
            baseline = 10.0 + state_idx * 2.0

            for period_idx in range(period_count):
                post_flag = int(period_idx >= 2)

                value = baseline + period_idx * 0.5
                if treated_flag and post_flag:
                    value += 3.0  # True ATT
                # One scalar draw per row keeps the random stream order fixed.
                value += np.random.normal(0, 0.5)

                rows.append(
                    {
                        "unit": unit_id,
                        "state": f"state_{state_idx}",
                        "period": period_idx,
                        "treated": int(treated_flag),
                        "post": post_flag,
                        "outcome": value,
                    }
                )

        return pd.DataFrame(rows)

    @staticmethod
    def _fit_did(frame, **extra):
        """Fit a fresh estimator on ``frame`` and return ``(estimator, results)``.

        The outcome, treatment and time columns are always ``"outcome"``,
        ``"treated"`` and ``"post"``; ``extra`` is forwarded to ``fit``
        unchanged (e.g. ``fixed_effects=[...]`` or ``absorb=[...]``).
        """
        estimator = DifferenceInDifferences()
        fitted = estimator.fit(
            frame,
            outcome="outcome",
            treatment="treated",
            time="post",
            **extra,
        )
        return estimator, fitted

    def test_fixed_effects_dummy(self, panel_data_with_fe):
        """Dummy-variable state effects still recover an ATT near 3.0."""
        estimator, fitted = self._fit_did(
            panel_data_with_fe, fixed_effects=["state"]
        )

        assert fitted is not None
        assert estimator.is_fitted_
        # ATT should still be close to 3.0
        assert abs(fitted.att - 3.0) < 1.0

    def test_fixed_effects_coefficients_include_dummies(self, panel_data_with_fe):
        """The results expose one coefficient per non-reference state."""
        _, fitted = self._fit_did(panel_data_with_fe, fixed_effects=["state"])

        # Should have state dummy coefficients
        state_names = [
            name for name in fitted.coefficients.keys()
            if name.startswith("state_")
        ]
        assert len(state_names) == 4  # 5 states - 1 (dropped first)

    def test_absorb_fixed_effects(self, panel_data_with_fe):
        """Absorbing unit effects still recovers an ATT near 3.0."""
        estimator, fitted = self._fit_did(panel_data_with_fe, absorb=["unit"])

        assert fitted is not None
        assert estimator.is_fitted_
        # ATT should still be close to 3.0
        assert abs(fitted.att - 3.0) < 1.0

    def test_fixed_effects_vs_no_fe(self, panel_data_with_fe):
        """Both fits give a positive ATT; adding state FE does not lower R^2."""
        _, plain = self._fit_did(panel_data_with_fe)
        _, with_state = self._fit_did(
            panel_data_with_fe, fixed_effects=["state"]
        )

        # Both should estimate positive ATT
        assert plain.att > 0
        assert with_state.att > 0

        # FE model should have higher R-squared (explains more variance)
        assert with_state.r_squared >= plain.r_squared

    def test_invalid_fixed_effects_column(self, panel_data_with_fe):
        """An unknown ``fixed_effects`` column raises ``ValueError``."""
        with pytest.raises(ValueError, match="not found"):
            self._fit_did(
                panel_data_with_fe, fixed_effects=["nonexistent_column"]
            )

    def test_invalid_absorb_column(self, panel_data_with_fe):
        """An unknown ``absorb`` column raises ``ValueError``."""
        with pytest.raises(ValueError, match="not found"):
            self._fit_did(panel_data_with_fe, absorb=["nonexistent_column"])

    def test_multiple_fixed_effects(self, panel_data_with_fe):
        """Two fixed-effect columns each contribute dummy coefficients."""
        # Add another categorical variable
        panel_data_with_fe["industry"] = panel_data_with_fe["unit"] % 3

        _, fitted = self._fit_did(
            panel_data_with_fe, fixed_effects=["state", "industry"]
        )

        assert fitted is not None
        # Should have both state and industry dummies
        coef_names = list(fitted.coefficients.keys())
        state_names = [n for n in coef_names if n.startswith("state_")]
        industry_names = [n for n in coef_names if n.startswith("industry_")]
        assert len(state_names) > 0
        assert len(industry_names) > 0

    def test_covariates_with_fixed_effects(self, panel_data_with_fe):
        """A linear covariate is reported alongside fixed-effect dummies."""
        # Add a continuous covariate
        panel_data_with_fe["size"] = np.random.normal(
            100, 10, len(panel_data_with_fe)
        )

        _, fitted = self._fit_did(
            panel_data_with_fe,
            covariates=["size"],
            fixed_effects=["state"],
        )

        assert fitted is not None
        assert "size" in fitted.coefficients

0 commit comments

Comments
 (0)