two-stage-did: count vcov n_clusters via np.unique like the variance (codex P2)

igerber · claude · igerber · commit 9f3368a7d96a · 2026-05-30T07:23:34.000-04:00
n_clusters used Series.nunique() (drops NaN), but the GMM sandwich counts
np.unique(cluster_ids) (keeps a single NaN group). A non-survey cluster= column
with missing IDs would make the reported G undercount the SE's actual cluster
count. Count clusters the same way the variance does — np.unique(df[cluster_var])
— which also consolidates the two non-survey branches and still excludes the
always-treated units already dropped from the fit sample (it reads df, not
data). Adds a NaN-cluster regression test.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/two_stage.py b/diff_diff/two_stage.py
@@ -2067,27 +2067,25 @@ def _refit_ts(w_r):
         # Resolve cluster_name / n_clusters for Results metadata. Suppress under
         # ANY survey design (the summary survey block already reports the
         # design's PSU/strata/replicate metadata; replicate-weight variance
-        # ignores cluster entirely). Otherwise count clusters on the POST-DROP
-        # fit sample `df` (always-treated units were removed above at the
-        # `df = df[~df[unit].isin(always_treated_units)]` step), NOT the full
-        # input `data`: the GMM sandwich computes variance over
-        # `cluster_ids=df[cluster_var].values` (see the `_compute_gmm_variance`
-        # call sites), so the reported G must match that effective count — using
-        # `data` would overstate the clusters the SE is actually based on when an
-        # always-treated unit/cluster is excluded. Branches:
-        #   bare cluster= -> the user-named cluster column (df[self.cluster])
-        #   cluster=None  -> the Gardner GMM sandwich clusters at `unit` by
-        #                    default (cluster_var = unit above), so the summary
-        #                    label reports unit-cluster CR1, not HC1.
+        # ignores cluster entirely). Otherwise count clusters EXACTLY the way the
+        # GMM sandwich does — `np.unique(df[cluster_var].values)` — so the
+        # reported G can never disagree with the SE:
+        #   - `df` (not the full input `data`) excludes always-treated units
+        #     dropped above at `df = df[~df[unit].isin(always_treated_units)]`,
+        #     matching the post-drop `cluster_ids=df[cluster_var].values` fed to
+        #     `_compute_gmm_variance`;
+        #   - `np.unique` (not `Series.nunique()`, which drops NaN) keeps the
+        #     same single NaN group the variance forms, so missing cluster IDs
+        #     cannot make the metadata undercount.
+        # `cluster_var` is `self.cluster`, or the `unit` column when
+        # `cluster=None` (the Gardner sandwich always clusters at unit by
+        # default), so the summary renders the unit-cluster CR1 label, not HC1.
         if resolved_survey is not None:
             _cluster_name_for_results: Optional[str] = None
             _n_clusters_for_results: Optional[int] = None
-        elif self.cluster is not None:
-            _cluster_name_for_results = self.cluster
-            _n_clusters_for_results = int(df[self.cluster].nunique())
         else:
-            _cluster_name_for_results = unit
-            _n_clusters_for_results = int(df[unit].nunique())
+            _cluster_name_for_results = self.cluster if self.cluster is not None else unit
+            _n_clusters_for_results = int(np.unique(df[cluster_var].values).size)
 
         # Construct results
         self.results_ = TwoStageDiDResults(
diff --git a/tests/test_two_stage.py b/tests/test_two_stage.py
@@ -2252,3 +2252,29 @@ def test_n_clusters_reflects_post_drop_fit_sample(self):
         assert r.n_clusters < full_units  # would equal full_units under the bug
         # to_dict mirrors the corrected count
         assert r.to_dict()["n_clusters"] == expected_g
+
+    def test_n_clusters_counts_nan_cluster_like_the_variance(self):
+        """The GMM sandwich counts clusters via np.unique(cluster_ids), which
+        keeps a single NaN group; Series.nunique() would drop NaN. n_clusters
+        metadata must match the variance so a `cluster=` column with missing IDs
+        cannot make the reported G undercount the SE's actual cluster count.
+        Regression for the codex round-3 P2.
+        """
+        data = generate_test_data(n_units=80, seed=21)
+        data["cl"] = (data["unit"] % 6).astype(float)
+        data.loc[data["unit"].isin([0, 1, 2]), "cl"] = np.nan
+        # No always-treated drop here (cohorts start at t=3, min_time=0), so
+        # df == data; count clusters the way the variance does.
+        expected_g = int(np.unique(data["cl"].values).size)
+        n_valid = int(data.loc[data["cl"].notna(), "cl"].nunique())
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            r = TwoStageDiD(cluster="cl").fit(
+                data, outcome="outcome", unit="unit", time="time", first_treat="first_treat"
+            )
+        assert r.cluster_name == "cl"
+        assert r.n_clusters == expected_g
+        # NaN is counted as a cluster (Series.nunique() would have dropped it),
+        # so G strictly exceeds the distinct non-NaN cluster count.
+        assert r.n_clusters > n_valid
+        assert r.to_dict()["n_clusters"] == expected_g