Introduction

This vignette is a parity check between Rfuzzydid and Stata’s fuzzydid. Each section runs an R example and, where possible, compares against frozen Stata outputs used in the test suite.

The Stata snippets assume the Stata fuzzydid command is already installed locally; Rfuzzydid no longer bundles the .ado sources.

The shared estimators come from de Chaisemartin and D’Haultfoeuille (2018): local average treatment effects (LATE) and local quantile treatment effects under fuzzy DID assumptions.

Simulated Data Example

We start with a simple two-period, two-group design:

A control group (G = 0) observed in periods T = 0 and T = 1
A treatment group (G = 1) observed in periods T = 0 and T = 1
Treatment uptake D that increases more in the treatment group between periods
Outcome Y that responds to treatment

set.seed(50321)

n_cell <- 120  # observations per cell

# Helper function to create each (G, T) cell
make_cell <- function(g, t, prob_d) {
  d <- rbinom(n_cell, size = 1, prob = prob_d)
  # Outcome: base + group effect + time effect + treatment effect + noise
  y <- 1 + 0.5 * g + 0.4 * t + 2.0 * d + sin(seq_len(n_cell) / 7)
  data.frame(y = y, g = g, t = t, d = d)
}

# Build the four cells
df <- rbind(
  make_cell(g = 0, t = 0, prob_d = 0.20),  # Control group, baseline
  make_cell(g = 0, t = 1, prob_d = 0.40),  # Control group, follow-up  
  make_cell(g = 1, t = 0, prob_d = 0.30),  # Treatment group, baseline
  make_cell(g = 1, t = 1, prob_d = 0.80)   # Treatment group, follow-up
)

# Verify cell sizes and treatment rates
aggregate(cbind(n = d) ~ g + t, data = df, FUN = function(x) c(n = length(x), mean = mean(x)))
#>   g t         n.n      n.mean
#> 1 0 0 120.0000000   0.1583333
#> 2 1 0 120.0000000   0.2916667
#> 3 0 1 120.0000000   0.4333333
#> 4 1 1 120.0000000   0.8083333

# Save fixture used by most examples for direct Stata comparison
if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(df, "stata-parity-sim-fixture.dta")
}

The treatment rate jumps from 30% to 80% in the treated group and from 20% to 40% in controls. That gap in treatment uptake changes is the first stage that identifies fuzzy DID effects.

Basic Estimation

Wald-DID, Wald-TC, and Wald-CIC

We request the three main LATE estimators:

fit <- fuzzydid(
  data = df,
  formula = y ~ d,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  breps = 50
)

summary(fit)
#> LATE estimators
#> 
#> 
#> |estimator | estimate| std.error| conf.low| conf.high|
#> |:---------|--------:|---------:|--------:|---------:|
#> |W_DID     | 2.000000| 0.4679188| 1.088768|  2.960728|
#> |W_TC      | 1.973204| 0.2333661| 1.526637|  2.309330|
#> |W_CIC     | 2.222098| 0.2028711| 1.820405|  2.545051|

stata_basic <- c(
  W_DID = 2.00000000000000,
  W_TC = 1.97320415000280,
  W_CIC = 2.22209836585117
)
r_basic <- setNames(fit$late$estimate, fit$late$estimator)[names(stata_basic)]
basic_cmp <- data.frame(
  estimator = names(stata_basic),
  R = unname(r_basic),
  Stata = unname(stata_basic),
  abs_diff = abs(unname(r_basic) - unname(stata_basic))
)
knitr::kable(basic_cmp, digits = 8)

estimator	R	Stata	abs_diff
W_DID	2.000000	2.000000	0e+00
W_TC	1.973204	1.973204	8e-08
W_CIC	2.222098	2.222098	5e-08

The table reports point estimates, bootstrap standard errors, and percentile confidence intervals.

Equivalent Stata code:

fuzzydid y g t d, did tc cic breps(50)
matrix list e(b_LATE)
matrix list e(se_LATE)
matrix list e(ci_LATE)

Extracting Results Programmatically

modelsummary gives a compact table that is easy to pass into reports:

library(modelsummary)

modelsummary::modelsummary(fit)
#> Found litedown! Enabling r-universe template

	(1)
W_DID	2.000
	(0.468)
W_TC	1.973
	(0.233)
W_CIC	2.222
	(0.203)
Num.Obs.	480
backend	native
N.11	120
N.10	120
N.01	120
N.00	120
N.reps	50
N.misreps	0
Share.failures	0

Equivalent Stata code:

* Tidy-like access in Stata comes from stored matrices in e()
matrix list e(b_LATE)
matrix list e(se_LATE)
matrix list e(ci_LATE)

Core Stata Parity Check

This chunk reproduces the exact fixture used by the parity tests and checks that R and Stata point estimates agree to tolerance:

n_cell <- 120L

make_parity_cell <- function(g, t, ones) {
  data.frame(
    g = rep.int(g, n_cell),
    t = rep.int(t, n_cell),
    d = c(rep.int(1L, ones), rep.int(0L, n_cell - ones))
  )
}

out <- rbind(
  make_parity_cell(0L, 0L, 24L),
  make_parity_cell(0L, 1L, 48L),
  make_parity_cell(1L, 0L, 36L),
  make_parity_cell(1L, 1L, 96L)
)
out$id <- seq_len(nrow(out))
out$y <- 1 + 0.5 * out$g + 0.4 * out$t + 2 * out$d + sin(out$id / 7)
parity_fixture <- out[, c("y", "g", "t", "d")]

fit_parity <- fuzzydid(
  data = parity_fixture,
  formula = y ~ d,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  breps = 5,
  seed = 1
)

stata_core <- c(
  W_DID = 2.17430877504668,
  W_TC = 2.18553887285118,
  W_CIC = 2.31982319056988
)

r_core <- setNames(fit_parity$late$estimate, fit_parity$late$estimator)[c("W_DID", "W_TC", "W_CIC")]

core_cmp <- data.frame(
  estimator = names(stata_core),
  R = unname(r_core),
  Stata = unname(stata_core),
  abs_diff = abs(unname(r_core) - unname(stata_core))
)

knitr::kable(core_cmp, digits = 8)

estimator	R	Stata	abs_diff
W_DID	2.174309	2.174309	7e-08
W_TC	2.185539	2.185539	4e-08
W_CIC	2.319823	2.319823	3e-08


# Save exact parity fixture for Stata users
if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(parity_fixture, "stata-parity-core-fixture.dta")
}

Point estimates are strict parity targets here. Standard errors and intervals depend on bootstrap simulation, so tiny platform-level differences are normal.

Equivalent Stata code:

* Load the same dataset exported in the parity check chunk
use "stata-parity-core-fixture.dta", clear

* Generate the group variable (already in dataset as 'g')
* The Stata command expects G to identify treatment groups

* Run fuzzydid
fuzzydid y g t d, did tc cic breps(50)

* Results are stored in e()
matrix list e(b_LATE)
matrix list e(se_LATE)
matrix list e(ci_LATE)

Key Parity Points

The next sections cover behavior that should match Stata and often causes confusion in practice.

1. User-Controlled Bootstrap Seed

Bootstrap inference is reproducible when you pass the same seed value. If you change seed, the resamples change too:

# These produce identical SE/CI because the same bootstrap seed is supplied
fit1 <- fuzzydid(df, y ~ d, group = "g", time = "t", did = TRUE, breps = 50, seed = 1)
fit2 <- fuzzydid(df, y ~ d, group = "g", time = "t", did = TRUE, breps = 50, seed = 1)

all.equal(fit1$late$std.error, fit2$late$std.error)
#> [1] TRUE

stata_seed_se <- 0.66310988000000
seed_cmp <- data.frame(
  estimator = "W_DID",
  R_seed_1_run_1 = fit1$late$std.error[fit1$late$estimator == "W_DID"],
  R_seed_1_run_2 = fit2$late$std.error[fit2$late$estimator == "W_DID"],
  Stata = stata_seed_se,
  stringsAsFactors = FALSE
)
seed_cmp$abs_diff_run_1_vs_Stata <- abs(seed_cmp$R_seed_1_run_1 - seed_cmp$Stata)
seed_cmp$abs_diff_run_2_vs_Stata <- abs(seed_cmp$R_seed_1_run_2 - seed_cmp$Stata)
knitr::kable(seed_cmp, digits = 8)

estimator	R_seed_1_run_1	R_seed_1_run_2	Stata	abs_diff_run_1_vs_Stata	abs_diff_run_2_vs_Stata
W_DID	0.7859547	0.7859547	0.6631099	0.1228449	0.1228449

Equivalent Stata code:

use "stata-parity-sim-fixture.dta", clear
set seed 1
fuzzydid y g t d, did breps(50)
matrix se1 = e(se_LATE)

set seed 1
fuzzydid y g t d, did breps(50)
matrix se2 = e(se_LATE)

matrix list se1
matrix list se2

2. Design Restrictions

Rfuzzydid enforces the same design restrictions as Stata:

df_restrict <- transform(df, x = rnorm(nrow(df)))

# CIC requires no covariates
try({
  fuzzydid(df_restrict, y ~ d + x, group = "g", time = "t", cic = TRUE)
})
#> Error : continuous and qualitative cannot be used in combination with cic or lqte.

# LQTE requires binary G, T, and D, plus no covariates
try({
  fuzzydid(df_restrict, y ~ d + x, group = "g", time = "t", lqte = TRUE)
})
#> Error : continuous and qualitative cannot be used in combination with cic or lqte.

Equivalent Stata code:

use "stata-parity-sim-fixture.dta", clear
gen x = rnormal()

* CIC with covariates should error
capture noisily fuzzydid y g t d x, cic

* LQTE with covariates should error
capture noisily fuzzydid y g t d x, lqte

3. Partial Identification Bounds

When Wald-TC point identification is unavailable, both implementations can report partial-identification bounds:

fit_partial <- fuzzydid(
  data = df,
  formula = y ~ d,
  group = "g",
  time = "t",
  tc = TRUE,
  partial = TRUE,
  breps = 50
)

knitr::kable(fit_partial$late)

estimator	estimate	std.error	conf.low	conf.high
TC_inf	0.012206	0.4554927	-0.8004909	0.9406186
TC_sup	3.081771	0.3345019	2.4384801	3.6728553


r_partial <- setNames(fit_partial$late$estimate, fit_partial$late$estimator)[names(stata_partial)]
partial_cmp <- data.frame(
  estimator = names(stata_partial),
  R = unname(r_partial),
  Stata = unname(stata_partial),
  abs_diff = abs(unname(r_partial) - unname(stata_partial))
)
knitr::kable(partial_cmp, digits = 8)

estimator	R	Stata	abs_diff
TC_inf	0.01220596	0.01220596	0e+00
TC_sup	3.08177133	3.08177130	3e-08

Equivalent Stata code:

use "stata-parity-sim-fixture.dta", clear
fuzzydid y g t d, tc partial breps(50)
matrix list e(b_LATE)

4. Equality Tests

When multiple estimators are requested, you can also test pairwise equality:

fit_eq <- fuzzydid(
  data = parity_fixture,
  formula = y ~ d,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  eqtest = TRUE,
  breps = 5,
  seed = 1
)

eq_cmp <- data.frame(
  contrast = stata_eqtest$contrast,
  R = fit_eq$eqtest$estimate,
  Stata = stata_eqtest$estimate,
  abs_diff = abs(fit_eq$eqtest$estimate - stata_eqtest$estimate)
)
knitr::kable(eq_cmp, digits = 8)

contrast	R	Stata
W_DID_W_TC	-0.01123007	-0.01123007
W_DID_W_CIC	-0.14551432	-0.14551432
W_TC_W_CIC	-0.13428425	-0.13428425

Equivalent Stata code:

use "stata-parity-core-fixture.dta", clear
fuzzydid y g t d, did tc cic eqtest breps(5)
matrix list e(b_LATE_eqtest)

Covariate Adjustment

Parametric Adjustment with modelx

With covariates, modelx controls how conditional expectations are modeled (for example OLS, logit, or probit):

# Add covariates
set.seed(123)
df$x1 <- rnorm(nrow(df))
df$x2 <- factor(ifelse(runif(nrow(df)) > 0.5, "a", "b"))

fit_modelx <- fuzzydid(
  data = df,
  formula = y ~ d + x1 + x2,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  modelx = c("ols", "ols"),
  nose = TRUE
)

knitr::kable(fit_modelx$late)

estimator	estimate	std.error	conf.low	conf.high
W_DID	2.032228	NA	NA	NA
W_TC	1.930624	NA	NA	NA


stata_modelx <- c(
  W_DID = 2.03222780000000,
  W_TC = 1.93062450000000
)
r_modelx <- setNames(fit_modelx$late$estimate, fit_modelx$late$estimator)[names(stata_modelx)]
modelx_cmp <- data.frame(
  estimator = names(stata_modelx),
  R = unname(r_modelx),
  Stata = unname(stata_modelx),
  abs_diff = abs(unname(r_modelx) - unname(stata_modelx))
)
knitr::kable(modelx_cmp, digits = 8)

estimator	R	Stata	abs_diff
W_DID	2.032228	2.032228	1e-08
W_TC	1.930625	1.930624	1e-08


if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(df[, c("y", "g", "t", "d", "x1", "x2")], "stata-parity-covariates-fixture.dta")
}

For binary treatment, modelx takes two entries:

Method for E(Y_{gt}|X) and E(Y_{dgt}|X)
Method for E(D_{gt}|X)

For multi-valued treatment, a third entry specifies the model for P(D_{gt}=d | X).

Equivalent Stata code:

use "stata-parity-covariates-fixture.dta", clear
fuzzydid y g t d, did tc continuous(x1) qualitative(x2) modelx(ols ols)
matrix list e(b_LATE)

Nonparametric Adjustment with Sieves

Continuous controls can also be handled nonparametrically:

fit_sieves <- fuzzydid(
  data = df,
  formula = y ~ d + x1,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  sieves = TRUE,
  nose = TRUE
)

knitr::kable(fit_sieves$late)

estimator	estimate	std.error	conf.low	conf.high
W_DID	2.013885	NA	NA	NA
W_TC	1.894181	NA	NA	NA


stata_sieves <- c(
  W_DID = 2.00081280000000,
  W_TC = 1.93678270000000
)
r_sieves <- setNames(fit_sieves$late$estimate, fit_sieves$late$estimator)[names(stata_sieves)]
sieves_cmp <- data.frame(
  estimator = names(stata_sieves),
  R = unname(r_sieves),
  Stata = unname(stata_sieves),
  abs_diff = abs(unname(r_sieves) - unname(stata_sieves))
)
knitr::kable(sieves_cmp, digits = 8)

estimator	R	Stata	abs_diff
W_DID	2.013885	2.000813	0.01307262
W_TC	1.894181	1.936783	0.04260123

When sieves = TRUE, sieveorder = NULL triggers deterministic 5-fold CV (the Stata-parity default). You can set sieveorder manually if you want a fixed basis order.

Equivalent Stata code:

use "stata-parity-covariates-fixture.dta", clear
fuzzydid y g t d, did tc continuous(x1) sieves
matrix list e(b_LATE)

LQTE Estimation

LQTE estimates are available from the 5th through 95th percentiles:

fit_lqte <- fuzzydid(
  data = parity_fixture,
  formula = y ~ d,
  group = "g",
  time = "t",
  lqte = TRUE,
  nose = TRUE,
  seed = 1
)

lqte_cmp <- data.frame(
  quantile = stata_lqte$quantile,
  R = fit_lqte$lqte$estimate,
  Stata = stata_lqte$estimate,
  abs_diff = abs(fit_lqte$lqte$estimate - stata_lqte$estimate)
)
knitr::kable(head(lqte_cmp, 10), digits = 8)

quantile	R	Stata	abs_diff
0.05	1.670645	1.672740	0.00209542
0.10	1.745951	1.794227	0.04827636
0.15	1.816027	1.816027	0.00000008
0.20	1.946114	1.946114	0.00000003
0.25	1.955441	1.955441	0.00000004
0.30	1.947278	1.947278	0.00000007
0.35	1.974047	1.974047	0.00000005
0.40	2.002776	2.094133	0.09135670
0.45	2.043456	2.131043	0.08758671
0.50	2.032202	2.032202	0.00000023

The native LQTE path follows the same rearranged counterfactual-CDF construction as the upstream Stata command. Small differences can occur because the original Stata implementation stores intermediate empirical CDFs as Stata floats.

LQTE requires binary G, T, and D, with no covariates.

Equivalent Stata code:

use "stata-parity-core-fixture.dta", clear
fuzzydid y g t d, lqte
matrix list e(b_LQTE)

Multi-Period Designs

For more than two periods, use group_forward:

set.seed(7)
n_mp <- 70

make_mp_cell <- function(gb, gf, t, prob, shift = 0) {
  d <- rbinom(n_mp, 1, prob)
  y <- 1 + 0.35 * t + 1.6 * d + shift + rnorm(n_mp, sd = 0.25)
  data.frame(y = y, d = d, gb = gb, gf = gf, t = t)
}

df_mp <- rbind(
  make_mp_cell(gb = 0, gf = 0, t = 0, prob = 0.20),
  make_mp_cell(gb = 0, gf = 0, t = 1, prob = 0.30),
  make_mp_cell(gb = 0, gf = 0, t = 2, prob = 0.35),
  make_mp_cell(gb = 0, gf = 1, t = 0, prob = 0.30, shift = 0.2),
  make_mp_cell(gb = 1, gf = 1, t = 1, prob = 0.60, shift = 0.8),
  make_mp_cell(gb = 1, gf = 0, t = 2, prob = 0.75, shift = 1.1)
)

fit_mp <- fuzzydid(
  data = df_mp,
  formula = y ~ d,
  group = "gb",
  group_forward = "gf",
  time = "t",
  did = TRUE,
  tc = TRUE,
  nose = TRUE
)

knitr::kable(fit_mp$late)

estimator	estimate	std.error	conf.low	conf.high
W_DID	4.769522	NA	NA	NA
W_TC	4.294036	NA	NA	NA


if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(df_mp, "stata-parity-multiperiod-fixture.dta")
}

Equivalent Stata code:

use "stata-parity-multiperiod-fixture.dta", clear
fuzzydid y gb t d, groupf(gf) did tc
matrix list e(b_LATE)

Bootstrap Diagnostics

The object also tracks bootstrap failures (degenerate resamples):

df_diag <- parity_fixture
df_diag$cl <- rep(seq_len(24L), each = 20L)

fit_diag <- fuzzydid(
  data = df_diag,
  formula = y ~ d,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  cluster = "cl",
  breps = 20,
  seed = 1
)

diag_cmp <- data.frame(
  metric = c("N.reps", "N.misreps", "Share.failures"),
  R = c(fit_diag$n_reps, fit_diag$n_misreps, fit_diag$share_failures),
  Reference = c(
    native_cluster[["n_reps"]],
    native_cluster[["n_misreps"]],
    native_cluster[["share_failures"]]
  ),
  stringsAsFactors = FALSE
)

share_from_counts <- if (diag_cmp$R[1] > 0) diag_cmp$R[2] / diag_cmp$R[1] else NA_real_

# These diagnostics are deterministic in R when the same bootstrap seed is used.
diag_cmp$Delta <- diag_cmp$R - diag_cmp$Reference
knitr::kable(diag_cmp, digits = 8)

metric	R	Reference	Delta
N.reps	20.0	20.00	0.00
N.misreps	4.0	3.00	1.00
Share.failures	0.2	0.15	0.05


if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(df_diag, "stata-parity-cluster-fixture.dta")
}

Stata can run the same clustered bootstrap command:

use "stata-parity-cluster-fixture.dta", clear
fuzzydid y g t d, did tc cic cluster(cl) breps(20)

The Stata command does not expose N_misreps or share_failures directly, so the table above is tracked as a deterministic native regression check rather than a stored-scalar parity assertion.

References

de Chaisemartin, C. and D’Haultfoeuille, X. (2018). Fuzzy Differences-in-Differences. Review of Economic Studies, 85(2): 999-1028. doi:10.1093/restud/rdx049.

Stata Parity: Replicating Stata fuzzydid in R

Introduction

Simulated Data Example

Basic Estimation

Wald-DID, Wald-TC, and Wald-CIC

Extracting Results Programmatically

Core Stata Parity Check

Key Parity Points

1. User-Controlled Bootstrap Seed

2. Design Restrictions

3. Partial Identification Bounds

4. Equality Tests

Covariate Adjustment

Parametric Adjustment with modelx

Nonparametric Adjustment with Sieves

LQTE Estimation

Multi-Period Designs

Bootstrap Diagnostics

References