--- title: "Stata Parity: Replicating Stata fuzzydid in R" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Stata Parity: Replicating Stata fuzzydid in R} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r parity-helpers, include = FALSE} # Prefer local package sources when rendering inside the repository so # parity checks target the code under development. find_local_pkg_dir <- function() { roots <- c(".", "..", "../..") for (root in roots) { pkg_desc <- file.path(root, "pkg", "DESCRIPTION") root_desc <- file.path(root, "DESCRIPTION") if (file.exists(pkg_desc)) { return(file.path(root, "pkg")) } if (file.exists(root_desc)) { desc <- tryCatch(read.dcf(root_desc), error = function(e) NULL) if (!is.null(desc) && "Package" %in% colnames(desc) && identical(desc[1, "Package"], "Rfuzzydid")) { return(root) } } } NULL } local_pkg_dir <- find_local_pkg_dir() if (!is.null(local_pkg_dir) && requireNamespace("pkgload", quietly = TRUE)) { pkgload::load_all(local_pkg_dir, helpers = FALSE, export_all = FALSE, quiet = TRUE) } else { library(Rfuzzydid) } stata_partial <- c(TC_inf = 0.01220596000000, TC_sup = 3.08177130000000) stata_eqtest <- data.frame( contrast = c("W_DID_W_TC", "W_DID_W_CIC", "W_TC_W_CIC"), estimate = c(-0.0112300674614629, -0.145514320262384, -0.134284252800921), stringsAsFactors = FALSE ) stata_lqte <- data.frame( quantile = seq(0.05, 0.95, by = 0.05), estimate = c( 1.67274034023285, 1.79422724246979, 1.81602716445923, 1.94611406326294, 1.95544064044952, 1.94727826118469, 1.97404694557190, 2.09413313865662, 2.13104295730591, 2.03220224380493, 2.10763168334961, -0.360978126525879, -0.325689792633057, -0.228986263275146, -0.172835350036621, -0.146114349365234, -0.098971366882324, -0.079638004302979, -0.034532070159912 ) ) native_cluster <- c(n_reps = 20L, n_misreps = 3L, share_failures = 0.15) ``` # Introduction This vignette is a parity check between `Rfuzzydid` and Stata's `fuzzydid`. Each section runs an R example and, where possible, compares against frozen Stata outputs used in the test suite. The Stata snippets assume the Stata `fuzzydid` command is already installed locally; `Rfuzzydid` no longer bundles the `.ado` sources. The shared estimators come from de Chaisemartin and D'Haultfoeuille (2018): local average treatment effects (LATE) and local quantile treatment effects under fuzzy DID assumptions. # Simulated Data Example We start with a simple two-period, two-group design: - A control group (G = 0) observed in periods T = 0 and T = 1 - A treatment group (G = 1) observed in periods T = 0 and T = 1 - Treatment uptake D that increases more in the treatment group between periods - Outcome Y that responds to treatment ```{r sim-data} set.seed(50321) n_cell <- 120 # observations per cell # Helper function to create each (G, T) cell make_cell <- function(g, t, prob_d) { d <- rbinom(n_cell, size = 1, prob = prob_d) # Outcome: base + group effect + time effect + treatment effect + noise y <- 1 + 0.5 * g + 0.4 * t + 2.0 * d + sin(seq_len(n_cell) / 7) data.frame(y = y, g = g, t = t, d = d) } # Build the four cells df <- rbind( make_cell(g = 0, t = 0, prob_d = 0.20), # Control group, baseline make_cell(g = 0, t = 1, prob_d = 0.40), # Control group, follow-up make_cell(g = 1, t = 0, prob_d = 0.30), # Treatment group, baseline make_cell(g = 1, t = 1, prob_d = 0.80) # Treatment group, follow-up ) # Verify cell sizes and treatment rates aggregate(cbind(n = d) ~ g + t, data = df, FUN = function(x) c(n = length(x), mean = mean(x))) # Save fixture used by most examples for direct Stata comparison if (requireNamespace("haven", quietly = TRUE)) { haven::write_dta(df, "stata-parity-sim-fixture.dta") } ``` The treatment rate jumps from 30% to 80% in the treated group and from 20% to 40% in controls. That gap in treatment uptake changes is the first stage that identifies fuzzy DID effects. # Basic Estimation ## Wald-DID, Wald-TC, and Wald-CIC We request the three main LATE estimators: ```{r basic-est} fit <- fuzzydid( data = df, formula = y ~ d, group = "g", time = "t", did = TRUE, tc = TRUE, cic = TRUE, breps = 50 ) summary(fit) stata_basic <- c( W_DID = 2.00000000000000, W_TC = 1.97320415000280, W_CIC = 2.22209836585117 ) r_basic <- setNames(fit$late$estimate, fit$late$estimator)[names(stata_basic)] basic_cmp <- data.frame( estimator = names(stata_basic), R = unname(r_basic), Stata = unname(stata_basic), abs_diff = abs(unname(r_basic) - unname(stata_basic)) ) knitr::kable(basic_cmp, digits = 8) ``` The table reports point estimates, bootstrap standard errors, and percentile confidence intervals. Equivalent Stata code: ```stata fuzzydid y g t d, did tc cic breps(50) matrix list e(b_LATE) matrix list e(se_LATE) matrix list e(ci_LATE) ``` ## Extracting Results Programmatically `modelsummary` gives a compact table that is easy to pass into reports: ```{r tidy-output, eval = requireNamespace("modelsummary", quietly = TRUE)} library(modelsummary) modelsummary::modelsummary(fit) ``` Equivalent Stata code: ```stata * Tidy-like access in Stata comes from stored matrices in e() matrix list e(b_LATE) matrix list e(se_LATE) matrix list e(ci_LATE) ``` ## Core Stata Parity Check This chunk reproduces the exact fixture used by the parity tests and checks that R and Stata point estimates agree to tolerance: ```{r parity-check} n_cell <- 120L make_parity_cell <- function(g, t, ones) { data.frame( g = rep.int(g, n_cell), t = rep.int(t, n_cell), d = c(rep.int(1L, ones), rep.int(0L, n_cell - ones)) ) } out <- rbind( make_parity_cell(0L, 0L, 24L), make_parity_cell(0L, 1L, 48L), make_parity_cell(1L, 0L, 36L), make_parity_cell(1L, 1L, 96L) ) out$id <- seq_len(nrow(out)) out$y <- 1 + 0.5 * out$g + 0.4 * out$t + 2 * out$d + sin(out$id / 7) parity_fixture <- out[, c("y", "g", "t", "d")] fit_parity <- fuzzydid( data = parity_fixture, formula = y ~ d, group = "g", time = "t", did = TRUE, tc = TRUE, cic = TRUE, breps = 5, seed = 1 ) stata_core <- c( W_DID = 2.17430877504668, W_TC = 2.18553887285118, W_CIC = 2.31982319056988 ) r_core <- setNames(fit_parity$late$estimate, fit_parity$late$estimator)[c("W_DID", "W_TC", "W_CIC")] core_cmp <- data.frame( estimator = names(stata_core), R = unname(r_core), Stata = unname(stata_core), abs_diff = abs(unname(r_core) - unname(stata_core)) ) knitr::kable(core_cmp, digits = 8) # Save exact parity fixture for Stata users if (requireNamespace("haven", quietly = TRUE)) { haven::write_dta(parity_fixture, "stata-parity-core-fixture.dta") } ``` Point estimates are strict parity targets here. Standard errors and intervals depend on bootstrap simulation, so tiny platform-level differences are normal. Equivalent Stata code: ```stata * Load the same dataset exported in the parity check chunk use "stata-parity-core-fixture.dta", clear * Generate the group variable (already in dataset as 'g') * The Stata command expects G to identify treatment groups * Run fuzzydid fuzzydid y g t d, did tc cic breps(50) * Results are stored in e() matrix list e(b_LATE) matrix list e(se_LATE) matrix list e(ci_LATE) ``` ## Key Parity Points The next sections cover behavior that should match Stata and often causes confusion in practice. ### 1. User-Controlled Bootstrap Seed Bootstrap inference is reproducible when you pass the same `seed` value. If you change `seed`, the resamples change too: ```{r seed-parity} # These produce identical SE/CI because the same bootstrap seed is supplied fit1 <- fuzzydid(df, y ~ d, group = "g", time = "t", did = TRUE, breps = 50, seed = 1) fit2 <- fuzzydid(df, y ~ d, group = "g", time = "t", did = TRUE, breps = 50, seed = 1) all.equal(fit1$late$std.error, fit2$late$std.error) stata_seed_se <- 0.66310988000000 seed_cmp <- data.frame( estimator = "W_DID", R_seed_1_run_1 = fit1$late$std.error[fit1$late$estimator == "W_DID"], R_seed_1_run_2 = fit2$late$std.error[fit2$late$estimator == "W_DID"], Stata = stata_seed_se, stringsAsFactors = FALSE ) seed_cmp$abs_diff_run_1_vs_Stata <- abs(seed_cmp$R_seed_1_run_1 - seed_cmp$Stata) seed_cmp$abs_diff_run_2_vs_Stata <- abs(seed_cmp$R_seed_1_run_2 - seed_cmp$Stata) knitr::kable(seed_cmp, digits = 8) ``` Equivalent Stata code: ```stata use "stata-parity-sim-fixture.dta", clear set seed 1 fuzzydid y g t d, did breps(50) matrix se1 = e(se_LATE) set seed 1 fuzzydid y g t d, did breps(50) matrix se2 = e(se_LATE) matrix list se1 matrix list se2 ``` ### 2. Design Restrictions `Rfuzzydid` enforces the same design restrictions as Stata: ```{r restrictions, error = TRUE} df_restrict <- transform(df, x = rnorm(nrow(df))) # CIC requires no covariates try({ fuzzydid(df_restrict, y ~ d + x, group = "g", time = "t", cic = TRUE) }) # LQTE requires binary G, T, and D, plus no covariates try({ fuzzydid(df_restrict, y ~ d + x, group = "g", time = "t", lqte = TRUE) }) ``` Equivalent Stata code: ```stata use "stata-parity-sim-fixture.dta", clear gen x = rnormal() * CIC with covariates should error capture noisily fuzzydid y g t d x, cic * LQTE with covariates should error capture noisily fuzzydid y g t d x, lqte ``` ### 3. Partial Identification Bounds When Wald-TC point identification is unavailable, both implementations can report partial-identification bounds: ```{r partial} fit_partial <- fuzzydid( data = df, formula = y ~ d, group = "g", time = "t", tc = TRUE, partial = TRUE, breps = 50 ) knitr::kable(fit_partial$late) r_partial <- setNames(fit_partial$late$estimate, fit_partial$late$estimator)[names(stata_partial)] partial_cmp <- data.frame( estimator = names(stata_partial), R = unname(r_partial), Stata = unname(stata_partial), abs_diff = abs(unname(r_partial) - unname(stata_partial)) ) knitr::kable(partial_cmp, digits = 8) ``` Equivalent Stata code: ```stata use "stata-parity-sim-fixture.dta", clear fuzzydid y g t d, tc partial breps(50) matrix list e(b_LATE) ``` ### 4. Equality Tests When multiple estimators are requested, you can also test pairwise equality: ```{r eqtest} fit_eq <- fuzzydid( data = parity_fixture, formula = y ~ d, group = "g", time = "t", did = TRUE, tc = TRUE, cic = TRUE, eqtest = TRUE, breps = 5, seed = 1 ) eq_cmp <- data.frame( contrast = stata_eqtest$contrast, R = fit_eq$eqtest$estimate, Stata = stata_eqtest$estimate, abs_diff = abs(fit_eq$eqtest$estimate - stata_eqtest$estimate) ) knitr::kable(eq_cmp, digits = 8) ``` Equivalent Stata code: ```stata use "stata-parity-core-fixture.dta", clear fuzzydid y g t d, did tc cic eqtest breps(5) matrix list e(b_LATE_eqtest) ``` # Covariate Adjustment ## Parametric Adjustment with modelx With covariates, `modelx` controls how conditional expectations are modeled (for example OLS, logit, or probit): ```{r covariates} # Add covariates set.seed(123) df$x1 <- rnorm(nrow(df)) df$x2 <- factor(ifelse(runif(nrow(df)) > 0.5, "a", "b")) fit_modelx <- fuzzydid( data = df, formula = y ~ d + x1 + x2, group = "g", time = "t", did = TRUE, tc = TRUE, modelx = c("ols", "ols"), nose = TRUE ) knitr::kable(fit_modelx$late) stata_modelx <- c( W_DID = 2.03222780000000, W_TC = 1.93062450000000 ) r_modelx <- setNames(fit_modelx$late$estimate, fit_modelx$late$estimator)[names(stata_modelx)] modelx_cmp <- data.frame( estimator = names(stata_modelx), R = unname(r_modelx), Stata = unname(stata_modelx), abs_diff = abs(unname(r_modelx) - unname(stata_modelx)) ) knitr::kable(modelx_cmp, digits = 8) if (requireNamespace("haven", quietly = TRUE)) { haven::write_dta(df[, c("y", "g", "t", "d", "x1", "x2")], "stata-parity-covariates-fixture.dta") } ``` For binary treatment, `modelx` takes two entries: 1. Method for E(Y_{gt}|X) and E(Y_{dgt}|X) 2. Method for E(D_{gt}|X) For multi-valued treatment, a third entry specifies the model for `P(D_{gt}=d | X)`. Equivalent Stata code: ```stata use "stata-parity-covariates-fixture.dta", clear fuzzydid y g t d, did tc continuous(x1) qualitative(x2) modelx(ols ols) matrix list e(b_LATE) ``` ## Nonparametric Adjustment with Sieves Continuous controls can also be handled nonparametrically: ```{r sieves} fit_sieves <- fuzzydid( data = df, formula = y ~ d + x1, group = "g", time = "t", did = TRUE, tc = TRUE, sieves = TRUE, nose = TRUE ) knitr::kable(fit_sieves$late) stata_sieves <- c( W_DID = 2.00081280000000, W_TC = 1.93678270000000 ) r_sieves <- setNames(fit_sieves$late$estimate, fit_sieves$late$estimator)[names(stata_sieves)] sieves_cmp <- data.frame( estimator = names(stata_sieves), R = unname(r_sieves), Stata = unname(stata_sieves), abs_diff = abs(unname(r_sieves) - unname(stata_sieves)) ) knitr::kable(sieves_cmp, digits = 8) ``` When `sieves = TRUE`, `sieveorder = NULL` triggers deterministic 5-fold CV (the Stata-parity default). You can set `sieveorder` manually if you want a fixed basis order. Equivalent Stata code: ```stata use "stata-parity-covariates-fixture.dta", clear fuzzydid y g t d, did tc continuous(x1) sieves matrix list e(b_LATE) ``` # LQTE Estimation LQTE estimates are available from the 5th through 95th percentiles: ```{r lqte} fit_lqte <- fuzzydid( data = parity_fixture, formula = y ~ d, group = "g", time = "t", lqte = TRUE, nose = TRUE, seed = 1 ) lqte_cmp <- data.frame( quantile = stata_lqte$quantile, R = fit_lqte$lqte$estimate, Stata = stata_lqte$estimate, abs_diff = abs(fit_lqte$lqte$estimate - stata_lqte$estimate) ) knitr::kable(head(lqte_cmp, 10), digits = 8) ``` The native LQTE path follows the same rearranged counterfactual-CDF construction as the upstream Stata command. Small differences can occur because the original Stata implementation stores intermediate empirical CDFs as Stata floats. LQTE requires binary `G`, `T`, and `D`, with no covariates. Equivalent Stata code: ```stata use "stata-parity-core-fixture.dta", clear fuzzydid y g t d, lqte matrix list e(b_LQTE) ``` # Multi-Period Designs For more than two periods, use `group_forward`: ```{r multiperiod} set.seed(7) n_mp <- 70 make_mp_cell <- function(gb, gf, t, prob, shift = 0) { d <- rbinom(n_mp, 1, prob) y <- 1 + 0.35 * t + 1.6 * d + shift + rnorm(n_mp, sd = 0.25) data.frame(y = y, d = d, gb = gb, gf = gf, t = t) } df_mp <- rbind( make_mp_cell(gb = 0, gf = 0, t = 0, prob = 0.20), make_mp_cell(gb = 0, gf = 0, t = 1, prob = 0.30), make_mp_cell(gb = 0, gf = 0, t = 2, prob = 0.35), make_mp_cell(gb = 0, gf = 1, t = 0, prob = 0.30, shift = 0.2), make_mp_cell(gb = 1, gf = 1, t = 1, prob = 0.60, shift = 0.8), make_mp_cell(gb = 1, gf = 0, t = 2, prob = 0.75, shift = 1.1) ) fit_mp <- fuzzydid( data = df_mp, formula = y ~ d, group = "gb", group_forward = "gf", time = "t", did = TRUE, tc = TRUE, nose = TRUE ) knitr::kable(fit_mp$late) if (requireNamespace("haven", quietly = TRUE)) { haven::write_dta(df_mp, "stata-parity-multiperiod-fixture.dta") } ``` Equivalent Stata code: ```stata use "stata-parity-multiperiod-fixture.dta", clear fuzzydid y gb t d, groupf(gf) did tc matrix list e(b_LATE) ``` # Bootstrap Diagnostics The object also tracks bootstrap failures (degenerate resamples): ```{r bootstrap-diag} df_diag <- parity_fixture df_diag$cl <- rep(seq_len(24L), each = 20L) fit_diag <- fuzzydid( data = df_diag, formula = y ~ d, group = "g", time = "t", did = TRUE, tc = TRUE, cic = TRUE, cluster = "cl", breps = 20, seed = 1 ) diag_cmp <- data.frame( metric = c("N.reps", "N.misreps", "Share.failures"), R = c(fit_diag$n_reps, fit_diag$n_misreps, fit_diag$share_failures), Reference = c( native_cluster[["n_reps"]], native_cluster[["n_misreps"]], native_cluster[["share_failures"]] ), stringsAsFactors = FALSE ) share_from_counts <- if (diag_cmp$R[1] > 0) diag_cmp$R[2] / diag_cmp$R[1] else NA_real_ # These diagnostics are deterministic in R when the same bootstrap seed is used. diag_cmp$Delta <- diag_cmp$R - diag_cmp$Reference knitr::kable(diag_cmp, digits = 8) if (requireNamespace("haven", quietly = TRUE)) { haven::write_dta(df_diag, "stata-parity-cluster-fixture.dta") } ``` Stata can run the same clustered bootstrap command: ```stata use "stata-parity-cluster-fixture.dta", clear fuzzydid y g t d, did tc cic cluster(cl) breps(20) ``` The Stata command does not expose `N_misreps` or `share_failures` directly, so the table above is tracked as a deterministic native regression check rather than a stored-scalar parity assertion. # References de Chaisemartin, C. and D'Haultfoeuille, X. (2018). *Fuzzy Differences-in-Differences*. *Review of Economic Studies*, 85(2): 999-1028. doi:10.1093/restud/rdx049.