Skip to content

Commit 9f3368a

Browse files
igerberclaude
andcommitted
two-stage-did: count vcov n_clusters via np.unique like the variance (codex P2)
n_clusters used Series.nunique() (drops NaN), but the GMM sandwich counts np.unique(cluster_ids) (keeps a single NaN group). A non-survey cluster= column with missing IDs would make the reported G undercount the SE's actual cluster count. Count clusters the same way the variance does — np.unique(df[cluster_var]) — which also consolidates the two non-survey branches and still excludes always-treated-dropped units (df, not data). Adds a NaN-cluster regression test. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 64119a0 commit 9f3368a

2 files changed

Lines changed: 41 additions & 17 deletions

File tree

diff_diff/two_stage.py

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2067,27 +2067,25 @@ def _refit_ts(w_r):
20672067
# Resolve cluster_name / n_clusters for Results metadata. Suppress under
20682068
# ANY survey design (the summary survey block already reports the
20692069
# design's PSU/strata/replicate metadata; replicate-weight variance
2070-
# ignores cluster entirely). Otherwise count clusters on the POST-DROP
2071-
# fit sample `df` (always-treated units were removed above at the
2072-
# `df = df[~df[unit].isin(always_treated_units)]` step), NOT the full
2073-
# input `data`: the GMM sandwich computes variance over
2074-
# `cluster_ids=df[cluster_var].values` (see the `_compute_gmm_variance`
2075-
# call sites), so the reported G must match that effective count — using
2076-
# `data` would overstate the clusters the SE is actually based on when an
2077-
# always-treated unit/cluster is excluded. Branches:
2078-
# bare cluster= -> the user-named cluster column (df[self.cluster])
2079-
# cluster=None -> the Gardner GMM sandwich clusters at `unit` by
2080-
# default (cluster_var = unit above), so the summary
2081-
# label reports unit-cluster CR1, not HC1.
2070+
# ignores cluster entirely). Otherwise count clusters EXACTLY the way the
2071+
# GMM sandwich does — `np.unique(df[cluster_var].values)` — so the
2072+
# reported G can never disagree with the SE:
2073+
# - `df` (not the full input `data`) excludes always-treated units
2074+
# dropped above at `df = df[~df[unit].isin(always_treated_units)]`,
2075+
# matching the post-drop `cluster_ids=df[cluster_var].values` fed to
2076+
# `_compute_gmm_variance`;
2077+
# - `np.unique` (not `Series.nunique()`, which drops NaN) keeps the
2078+
# same single NaN group the variance forms, so missing cluster IDs
2079+
# cannot make the metadata undercount.
2080+
# `cluster_var` is `self.cluster`, or the `unit` column when
2081+
# `cluster=None` (the Gardner sandwich always clusters at unit by
2082+
# default), so the summary renders the unit-cluster CR1 label, not HC1.
20822083
if resolved_survey is not None:
20832084
_cluster_name_for_results: Optional[str] = None
20842085
_n_clusters_for_results: Optional[int] = None
2085-
elif self.cluster is not None:
2086-
_cluster_name_for_results = self.cluster
2087-
_n_clusters_for_results = int(df[self.cluster].nunique())
20882086
else:
2089-
_cluster_name_for_results = unit
2090-
_n_clusters_for_results = int(df[unit].nunique())
2087+
_cluster_name_for_results = self.cluster if self.cluster is not None else unit
2088+
_n_clusters_for_results = int(np.unique(df[cluster_var].values).size)
20912089

20922090
# Construct results
20932091
self.results_ = TwoStageDiDResults(

tests/test_two_stage.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2252,3 +2252,29 @@ def test_n_clusters_reflects_post_drop_fit_sample(self):
22522252
assert r.n_clusters < full_units # would equal full_units under the bug
22532253
# to_dict mirrors the corrected count
22542254
assert r.to_dict()["n_clusters"] == expected_g
2255+
2256+
def test_n_clusters_counts_nan_cluster_like_the_variance(self):
2257+
"""The GMM sandwich counts clusters via np.unique(cluster_ids), which
2258+
keeps a single NaN group; Series.nunique() would drop NaN. n_clusters
2259+
metadata must match the variance so a `cluster=` column with missing IDs
2260+
cannot make the reported G undercount the SE's actual cluster count.
2261+
Regression for the codex round-3 P2.
2262+
"""
2263+
data = generate_test_data(n_units=80, seed=21)
2264+
data["cl"] = (data["unit"] % 6).astype(float)
2265+
data.loc[data["unit"].isin([0, 1, 2]), "cl"] = np.nan
2266+
# No always-treated drop here (cohorts start at t=3, min_time=0), so
2267+
# df == data; count clusters the way the variance does.
2268+
expected_g = int(np.unique(data["cl"].values).size)
2269+
n_valid = int(data.loc[data["cl"].notna(), "cl"].nunique())
2270+
with warnings.catch_warnings():
2271+
warnings.simplefilter("ignore")
2272+
r = TwoStageDiD(cluster="cl").fit(
2273+
data, outcome="outcome", unit="unit", time="time", first_treat="first_treat"
2274+
)
2275+
assert r.cluster_name == "cl"
2276+
assert r.n_clusters == expected_g
2277+
# NaN is counted as a cluster (Series.nunique() would have dropped it),
2278+
# so G strictly exceeds the distinct non-NaN cluster count.
2279+
assert r.n_clusters > n_valid
2280+
assert r.to_dict()["n_clusters"] == expected_g

0 commit comments

Comments
 (0)