--- title: "Replicating Duflo (2001) with Rfuzzydid" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Replicating Duflo (2001) with Rfuzzydid} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = requireNamespace("foreign", quietly = TRUE) ) ``` # Introduction This vignette walks through a concrete replication exercise: re-estimating the Duflo (2001) INPRES specification with `Rfuzzydid`. The key idea is to recover the wage return to schooling from cohort-by-district variation in treatment intensity, using the fuzzy DID estimators in de Chaisemartin and D'Haultfoeuille (2018). The dataset is the bundled `inpresdata.dta` file in `inst/extdata`. The file originates from David Roodman's replication materials ([droodman/Duflo-2001](https://github.com/droodman/Duflo-2001)). In the companion archive README ([droodman/Duflo-2001](https://github.com/droodman/Duflo-2001)), he notes that the main 1995 SUPAS file is "not contained in this repository" and links to that downloadable location. # Data Preparation ## Loading the Data ```{r load-data} library(Rfuzzydid) # Resolve package root when rendering inside source tree find_local_pkg_dir <- function() { roots <- c(".", "..", "../..") for (root in roots) { pkg_desc <- file.path(root, "pkg", "DESCRIPTION") root_desc <- file.path(root, "DESCRIPTION") if (file.exists(pkg_desc)) { return(file.path(root, "pkg")) } if (file.exists(root_desc)) { desc <- tryCatch(read.dcf(root_desc), error = function(e) NULL) if (!is.null(desc) && "Package" %in% colnames(desc) && identical(desc[1, "Package"], "Rfuzzydid")) { return(root) } } } NULL } # First try installed package location, then source-tree fallback data_path <- system.file("extdata", "inpresdata.dta", package = "Rfuzzydid") if (identical(data_path, "")) { local_pkg_dir <- find_local_pkg_dir() local_path <- if (!is.null(local_pkg_dir)) { file.path(local_pkg_dir, "inst", "extdata", "inpresdata.dta") } else { "" } if (file.exists(local_path)) { data_path <- local_path } else { stop("Bundled dataset 'inpresdata.dta' was not found in installed package or source-tree extdata.") } } raw <- foreign::read.dta(data_path) ``` ## Constructing the Analysis Sample Following the original design, we keep two cohort windows: - 1957-1962 (earlier cohort) - 1968-1972 (later cohort) The `t` indicator equals 1 for the later cohort. ```{r sample-construction} # Define cohort groups cohort0 <- raw$p504thn >= 57 & raw$p504thn <= 62 cohort1 <- raw$p504thn >= 68 & raw$p504thn <= 72 # Keep observations with non-missing outcome and treatment keep <- !is.na(raw$lhwage) & !is.na(raw$yeduc) & (cohort0 | cohort1) dat <- raw[keep, c("birthpl", "lhwage", "yeduc", "p504thn")] # Time indicator: t = 1 for later cohort dat$t <- as.integer(dat$p504thn >= 68 & dat$p504thn <= 72) ``` ## Constructing Group Variables The original Stata workflow builds "super-groups" based on how schooling changes within district between the two cohort windows. We follow the same logic with a chi-square screen and the sign of the change in mean schooling: ```{r group-construction} pval_threshold <- 0.5 districts <- sort(unique(dat$birthpl)) gstar_map <- setNames(integer(length(districts)), as.character(districts)) for (g in districts) { i <- dat$birthpl == g sub <- dat[i, ] tab <- table(sub$yeduc, sub$t) pval <- suppressWarnings(chisq.test(tab)$p.value) evol <- mean(sub$yeduc[sub$t == 1]) - mean(sub$yeduc[sub$t == 0]) if (!is.na(pval) && pval < pval_threshold) { gstar_map[[as.character(g)]] <- if (evol > 0) 1L else if (evol < 0) -1L else 0L } else { gstar_map[[as.character(g)]] <- 0L } } dat$gstar <- unname(gstar_map[as.character(dat$birthpl)]) ``` This produces three groups: - `gstar = 1`: Districts where education increased (treatment intensity rose) - `gstar = -1`: Districts where education decreased - `gstar = 0`: Districts with no significant change # Two-Arm Estimation As in the paper's setup, we estimate two arms separately and combine them after: ## Arm 1: Districts with Increasing Education (g* = 1 vs g* = 0) ```{r arm-inc} arm_inc_data <- dat[dat$gstar >= 0, ] arm_inc_data$G <- as.integer(arm_inc_data$gstar == 1) fit_inc <- fuzzydid( data = arm_inc_data, formula = lhwage ~ yeduc, group = "G", time = "t", did = TRUE, tc = TRUE, cic = TRUE, newcateg = c(5, 8, 11, 14, 1000), nose = TRUE, backend = "native" ) knitr::kable(fit_inc$late) ``` ## Arm 2: Districts with Decreasing Education (g* = -1 vs g* = 0) ```{r arm-dec} arm_dec_data <- dat[dat$gstar <= 0, ] arm_dec_data$G <- as.integer(arm_dec_data$gstar == -1) fit_dec <- fuzzydid( data = arm_dec_data, formula = lhwage ~ yeduc, group = "G", time = "t", did = TRUE, tc = TRUE, cic = TRUE, newcateg = c(5, 8, 11, 14, 1000), nose = TRUE, backend = "native" ) knitr::kable(fit_dec$late) ``` # Aggregating Results To aggregate, we weight each arm by: - its sample share (`p_inc`, `p_dec`) - its first-stage strength (the DID in schooling, `dD_inc`, `dD_dec`) The decreasing arm uses `-dD_dec` so both weights are on a comparable "increase in treatment" scale. ```{r aggregate} # Treatment intensity weights dD_inc <- mean(arm_inc_data$yeduc[arm_inc_data$G == 1 & arm_inc_data$t == 1]) - mean(arm_inc_data$yeduc[arm_inc_data$G == 1 & arm_inc_data$t == 0]) - mean(arm_inc_data$yeduc[arm_inc_data$G == 0 & arm_inc_data$t == 1]) + mean(arm_inc_data$yeduc[arm_inc_data$G == 0 & arm_inc_data$t == 0]) dD_dec <- mean(arm_dec_data$yeduc[arm_dec_data$G == 1 & arm_dec_data$t == 1]) - mean(arm_dec_data$yeduc[arm_dec_data$G == 1 & arm_dec_data$t == 0]) - mean(arm_dec_data$yeduc[arm_dec_data$G == 0 & arm_dec_data$t == 1]) + mean(arm_dec_data$yeduc[arm_dec_data$G == 0 & arm_dec_data$t == 0]) p_inc <- mean(dat$gstar == 1) p_dec <- mean(dat$gstar == -1) w_inc <- p_inc * dD_inc w_dec <- p_dec * (-dD_dec) # Extract point estimates did_inc <- fit_inc$late$estimate[fit_inc$late$estimator == "W_DID"] tc_inc <- fit_inc$late$estimate[fit_inc$late$estimator == "W_TC"] cic_inc <- fit_inc$late$estimate[fit_inc$late$estimator == "W_CIC"] did_dec <- fit_dec$late$estimate[fit_dec$late$estimator == "W_DID"] tc_dec <- fit_dec$late$estimate[fit_dec$late$estimator == "W_TC"] cic_dec <- fit_dec$late$estimate[fit_dec$late$estimator == "W_CIC"] # Weighted aggregation did_agg <- (did_inc * w_inc + did_dec * w_dec) / (w_inc + w_dec) tc_agg <- (tc_inc * w_inc + tc_dec * w_dec) / (w_inc + w_dec) cic_agg <- (cic_inc * w_inc + cic_dec * w_dec) / (w_inc + w_dec) cat("Aggregate Estimates:\n") cat(sprintf("DID = %.6f\n", did_agg)) cat(sprintf("TC = %.6f\n", tc_agg)) cat(sprintf("CIC = %.6f\n", cic_agg)) ``` # Comparison with Stata The following one-shot Stata code starts from `inpresdata.dta` and reproduces the arm-level and aggregate point estimates: The Stata snippet assumes the Stata `fuzzydid` command is already installed locally; `Rfuzzydid` no longer ships the `.ado` sources. ```stata clear all set more off local pval_threshold 0.5 use inpresdata.dta, clear keep if lhwage < . & yeduc < . & (inrange(p504thn,57,62) | inrange(p504thn,68,72)) gen byte t = inrange(p504thn,68,72) gen gstar = . levelsof birthpl, local(districts) local pval_threshold = 0.05 quietly foreach g of local districts { tab yeduc t if birthpl == `g', chi2 scalar p = r(p) summarize yeduc if t == 1 & birthpl == `g', meanonly scalar m1 = r(mean) summarize yeduc if t == 0 & birthpl == `g', meanonly scalar m0 = r(mean) scalar evol = m1 - m0 replace gstar = (scalar(evol) > 0 & scalar(p) < `pval_threshold') - (scalar(evol) < 0 & scalar(p) < `pval_threshold') if birthpl == `g' } gen byte inc = (gstar == 1) gen byte dec = (gstar == -1) summarize inc, meanonly scalar p_inc = r(mean) summarize dec, meanonly scalar p_dec = r(mean) * Arm g*=1 vs g*=0 preserve keep if gstar >= 0 gen byte G = (gstar == 1) fuzzydid lhwage G t yeduc, did tc cic newcateg(5 8 11 14 1000) nose matrix M = e(b_LATE) scalar did_inc = M[1,1] scalar tc_inc = M[2,1] scalar cic_inc = M[3,1] summarize yeduc if G == 1 & t == 1, meanonly scalar D11 = r(mean) summarize yeduc if G == 1 & t == 0, meanonly scalar D10 = r(mean) summarize yeduc if G == 0 & t == 1, meanonly scalar D01 = r(mean) summarize yeduc if G == 0 & t == 0, meanonly scalar D00 = r(mean) scalar dD_inc = (D11 - D10) - (D01 - D00) restore * Arm g*=-1 vs g*=0 preserve keep if gstar <= 0 gen byte G = (gstar == -1) fuzzydid lhwage G t yeduc, did tc cic newcateg(5 8 11 14 1000) nose matrix M = e(b_LATE) scalar did_dec = M[1,1] scalar tc_dec = M[2,1] scalar cic_dec = M[3,1] summarize yeduc if G == 1 & t == 1, meanonly scalar D11m = r(mean) summarize yeduc if G == 1 & t == 0, meanonly scalar D10m = r(mean) summarize yeduc if G == 0 & t == 1, meanonly scalar D01m = r(mean) summarize yeduc if G == 0 & t == 0, meanonly scalar D00m = r(mean) scalar dD_dec_pos = -((D11m - D10m) - (D01m - D00m)) restore scalar w_inc = p_inc * dD_inc scalar w_dec = p_dec * dD_dec_pos scalar did_agg = (did_inc * w_inc + did_dec * w_dec) / (w_inc + w_dec) scalar tc_agg = (tc_inc * w_inc + tc_dec * w_dec) / (w_inc + w_dec) scalar cic_agg = (cic_inc * w_inc + cic_dec * w_dec) / (w_inc + w_dec) display "Arm g*=1: DID = " %9.6f did_inc " ; TC = " %9.6f tc_inc " ; CIC = " %9.6f cic_inc display "Arm g*=-1: DID = " %9.6f did_dec " ; TC = " %9.6f tc_dec " ; CIC = " %9.6f cic_dec display "AGG DID = " %9.6f did_agg display "AGG TC = " %9.6f tc_agg display "AGG CIC = " %9.6f cic_agg ``` Running that code in Stata reports: - Arm `g*=1`: DID = `0.123679191`, TC = `0.079868219`, CIC = `0.074316049` - Arm `g*=-1`: DID = `0.117024113`, TC = `0.109567707`, CIC = `0.114301855` - Aggregate: DID = `0.122244473`, TC = `0.086270906`, CIC = `0.082936285` The next chunk checks parity directly in R: ```{r stata-parity-check} stata_vals <- c( did_inc = 0.123679191, tc_inc = 0.079868219, cic_inc = 0.074316049, did_dec = 0.117024113, tc_dec = 0.109567707, cic_dec = 0.114301855, did_agg = 0.122244473, tc_agg = 0.086270906, cic_agg = 0.082936285 ) r_vals <- c( did_inc = did_inc, tc_inc = tc_inc, cic_inc = cic_inc, did_dec = did_dec, tc_dec = tc_dec, cic_dec = cic_dec, did_agg = did_agg, tc_agg = tc_agg, cic_agg = cic_agg ) cmp <- data.frame( estimate = names(stata_vals), R = unname(r_vals), Stata = unname(stata_vals), abs_diff = abs(unname(r_vals) - unname(stata_vals)) ) knitr::kable(cmp, digits = 6) ``` Point estimates match to numerical tolerance. Confidence intervals can still differ when bootstrap is enabled, because draws and RNG streams differ across software. # References Duflo, E. (2001). Schooling and Labor Market Consequences of School Construction in Indonesia. *American Economic Review*, 91(4): 795-813. de Chaisemartin, C. and D'Haultfoeuille, X. (2018). *Fuzzy Differences-in-Differences*. *Review of Economic Studies*, 85(2): 999-1028. doi:10.1093/restud/rdx049. de Chaisemartin, C., D'Haultfoeuille, X., and Guyonvarch, Y. (2018). *Fuzzy Differences-in-Differences with Stata*. *Stata Journal*. doi:10.1177/1536867X19854019.