|
| 1 | +#!/usr/bin/env Rscript |
| 2 | +# Generate the Basque Country (Abadie & Gardeazabal 2003) R `Synth` golden fixture |
| 3 | +# for the SyntheticControl estimator's two-tier R-parity test. |
| 4 | +# |
| 5 | +# Run from the repo root: |
| 6 | +# Rscript benchmarks/R/generate_synth_basque_golden.R |
| 7 | +# |
| 8 | +# Writes (into tests/data/ so the deterministic Tier-1 parity test runs in |
| 9 | +# isolated-install CI without R): |
| 10 | +# tests/data/synth_basque_panel.csv verbatim Synth::basque, regions != 1 |
| 11 | +# (Spain aggregate dropped), long format, |
| 12 | +# plus an absorbing `treated` indicator. |
| 13 | +# tests/data/synth_basque_golden.json R Synth solution.v / solution.w, losses, |
| 14 | +# the standardization divisor, X1/X0, and |
| 15 | +# the treated/synthetic/gap paths. |
| 16 | +# |
| 17 | +# Provenance: the panel is a verbatim export of R `Synth::basque`; the V-selection |
| 18 | +# numerics (standardization divisor, optimizer) are pinned from the `Synth` source, |
| 19 | +# not from Abadie-Diamond-Hainmueller (2010) — see docs/methodology/REGISTRY.md. |
| 20 | + |
| 21 | +suppressMessages({ |
| 22 | + library(Synth) |
| 23 | + library(jsonlite) |
| 24 | +}) |
| 25 | + |
| 26 | +data(basque) |
| 27 | + |
| 28 | +predictors <- c( |
| 29 | + "school.illit", "school.prim", "school.med", |
| 30 | + "school.high", "school.post.high", "invest" |
| 31 | +) |
| 32 | +special <- list( |
| 33 | + list("gdpcap", 1960:1969, "mean"), |
| 34 | + list("sec.agriculture", seq(1961, 1969, 2), "mean"), |
| 35 | + list("sec.energy", seq(1961, 1969, 2), "mean"), |
| 36 | + list("sec.industry", seq(1961, 1969, 2), "mean"), |
| 37 | + list("sec.construction", seq(1961, 1969, 2), "mean"), |
| 38 | + list("sec.services.venta", seq(1961, 1969, 2), "mean"), |
| 39 | + list("sec.services.nonventa", seq(1961, 1969, 2), "mean"), |
| 40 | + list("popdens", 1969, "mean") |
| 41 | +) |
| 42 | +controls <- c(2:16, 18) |
| 43 | + |
| 44 | +invisible(capture.output({ |
| 45 | + dp <- dataprep( |
| 46 | + foo = basque, |
| 47 | + predictors = predictors, |
| 48 | + predictors.op = "mean", |
| 49 | + time.predictors.prior = 1964:1969, |
| 50 | + special.predictors = special, |
| 51 | + dependent = "gdpcap", |
| 52 | + unit.variable = "regionno", |
| 53 | + unit.names.variable = "regionname", |
| 54 | + time.variable = "year", |
| 55 | + treatment.identifier = 17, |
| 56 | + controls.identifier = controls, |
| 57 | + time.optimize.ssr = 1960:1969, |
| 58 | + time.plot = 1955:1997 |
| 59 | + ) |
| 60 | + so <- synth(dp) |
| 61 | +})) |
| 62 | + |
| 63 | +# Standardization divisor exactly as computed inside synth(): |
| 64 | +# divisor <- sqrt(apply(cbind(X0, X1), 1, var)) |
| 65 | +big <- cbind(dp$X0, dp$X1) |
| 66 | +divisor <- sqrt(apply(big, 1, var)) |
| 67 | + |
| 68 | +pred_names <- rownames(dp$X1) |
| 69 | +v <- as.numeric(so$solution.v) |
| 70 | +w <- as.numeric(so$solution.w) |
| 71 | + |
| 72 | +# X0 as predictor -> {control -> value} so Python can verify matrix construction. |
| 73 | +X0_list <- setNames( |
| 74 | + lapply(seq_len(nrow(dp$X0)), function(i) as.list(setNames(dp$X0[i, ], colnames(dp$X0)))), |
| 75 | + pred_names |
| 76 | +) |
| 77 | + |
| 78 | +synthetic_path <- as.numeric(dp$Y0plot %*% so$solution.w) |
| 79 | +treated_path <- as.numeric(dp$Y1plot) |
| 80 | +years <- as.integer(rownames(dp$Y1plot)) |
| 81 | + |
| 82 | +golden <- list( |
| 83 | + config = list( |
| 84 | + treated_regionno = 17, |
| 85 | + controls = controls, |
| 86 | + treatment_year = 1970, |
| 87 | + predictors = predictors, |
| 88 | + predictors_op = "mean", |
| 89 | + predictor_window = 1964:1969, |
| 90 | + special = lapply(special, function(s) { |
| 91 | + list(var = s[[1]], periods = s[[2]], op = s[[3]]) |
| 92 | + }), |
| 93 | + time_optimize_ssr = 1960:1969, |
| 94 | + time_plot = c(1955, 1997) |
| 95 | + ), |
| 96 | + predictor_names = pred_names, |
| 97 | + solution_v = setNames(v, pred_names), |
| 98 | + solution_w = as.list(setNames(w, colnames(dp$X0))), |
| 99 | + loss_v = as.numeric(so$loss.v), |
| 100 | + loss_w = as.numeric(so$loss.w), |
| 101 | + divisor = setNames(as.numeric(divisor), pred_names), |
| 102 | + X1 = setNames(as.numeric(dp$X1), pred_names), |
| 103 | + X0 = X0_list, |
| 104 | + years = years, |
| 105 | + treated_path = treated_path, |
| 106 | + synthetic_path = synthetic_path, |
| 107 | + gap = treated_path - synthetic_path |
| 108 | +) |
| 109 | + |
| 110 | +dir.create("tests/data", showWarnings = FALSE, recursive = TRUE) |
| 111 | +write_json( |
| 112 | + golden, "tests/data/synth_basque_golden.json", |
| 113 | + auto_unbox = TRUE, digits = 12, pretty = TRUE |
| 114 | +) |
| 115 | + |
| 116 | +# Panel CSV: drop region 1 (Spain aggregate); long format + absorbing treated. |
| 117 | +panel <- basque[basque$regionno != 1, ] |
| 118 | +panel$treated <- as.integer(panel$regionno == 17 & panel$year >= 1970) |
| 119 | +stopifnot(!any(is.na(panel$gdpcap))) # outcome must be complete (balanced panel) |
| 120 | +write.csv(panel, "tests/data/synth_basque_panel.csv", row.names = FALSE) |
| 121 | + |
| 122 | +cat("Wrote tests/data/synth_basque_golden.json and synth_basque_panel.csv\n") |
| 123 | +cat("nvarsV:", length(v), " n_controls:", length(w), "\n") |
| 124 | +cat("loss.v:", format(so$loss.v, digits = 6), " loss.w:", format(so$loss.w, digits = 6), "\n") |
| 125 | +nz <- setNames(round(w, 4), colnames(dp$X0)) |
| 126 | +cat("solution.w (nonzero):\n") |
| 127 | +print(nz[nz > 1e-4]) |
0 commit comments