In this vignette we will explain how some functions of the StratifiedSampling package can be used to estimate a contingency table. We will work on the eusilc dataset of the laeken package. All the functions presented in the following are explained in the manuscript by Raphaël Jauslin and Yves Tillé (doi:10.1016/j.jspi.2022.12.003). We will estimate the contingency table obtained when the factor variable representing the economic status, pl030, is crossed with a discretized version of the equivalized household income, eqIncome. In order to discretize the equivalized income, we compute its percentiles (0.15, 0.30, 0.45, 0.60, 0.75, 0.90) and define the categories as the intervals between these values.
library(laeken)
library(sampling)
library(StratifiedSampling)
#> Loading required package: Matrix
data("eusilc")
eusilc <- na.omit(eusilc)
N <- nrow(eusilc)
# Xm contains the matching variables and id the identifiers of the units
Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")]
Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive))
Xm <- cbind(Xmcat,Xm[,-c(2,4,5)])
id <- eusilc$rb030
# categorical income, split at the percentiles
c_income <- eusilc$eqIncome
q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15))
c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]"
c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]"
c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]"
c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]"
c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]"
c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]"
c_income[which( eusilc$eqIncome > q[7] )] <- "(90,100]"
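# the same categories could also be obtained in a single call to cut(), using the
# percentiles computed above as breaks; this is only an equivalent sketch and the
# name c_income_cut is purely illustrative
c_income_cut <- cut(eusilc$eqIncome, breaks = c(-Inf, q[2:7], Inf),
                    labels = c("(0,15]","(15,30]","(30,45]","(45,60]",
                               "(60,75]","(75,90]","(90,100]"))
# all(as.character(c_income_cut) == c_income) should then return TRUE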
# variables of interest
Y <- data.frame(ecostat = eusilc$pl030)
Z <- data.frame(c_income = c_income)
# put same rownames
rownames(Xm) <- rownames(Y) <- rownames(Z) <- id
YZ <- table(cbind(Y,Z))
addmargins(YZ)
#> c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 409 616 722 807 935 1025 648 5162
#> 2 189 181 205 184 165 154 82 1160
#> 3 137 90 72 75 59 52 33 518
#> 4 210 159 103 95 74 49 46 736
#> 5 470 462 492 477 459 435 351 3146
#> 6 57 25 28 30 17 11 10 178
#> 7 344 283 194 149 106 91 40 1207
#> Sum 1816 1816 1816 1817 1815 1817 1210 12107
Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The samples are selected with simple random sampling without replacement, and the weights are equal to the inverse of the inclusion probabilities.
# size of sample
n1 <- 1000
n2 <- 500
# samples
s1 <- srswor(n1,N)
s2 <- srswor(n2,N)
# extract matching units
X1 <- Xm[s1 == 1,]
X2 <- Xm[s2 == 1,]
# extract variable of interest
Y1 <- data.frame(Y[s1 == 1,])
colnames(Y1) <- colnames(Y)
Z2 <- as.data.frame(Z[s2 == 1,])
colnames(Z2) <- colnames(Z)
# extract correct identities
id1 <- id[s1 == 1]
id2 <- id[s2 == 1]
# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2
# here weights are inverse of inclusion probabilities
d1 <- rep(N/n1,n1)
d2 <- rep(N/n2,n2)
# disjunctive form
Y_dis <- sampling::disjunctive(as.matrix(Y))
Z_dis <- sampling::disjunctive(as.matrix(Z))
Y1_dis <- Y_dis[s1 ==1,]
Z2_dis <- Z_dis[s2 ==1,]
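Since both samples are drawn by simple random sampling without replacement, each inclusion probability equals n/N, so the design weights d1 and d2 sum to the population size. A quick sanity check of this (only a sketch; inclusionprobabilities comes from the sampling package):
# under SRSWOR the design weights sum to the population size N
sum(d1)
sum(d2)
# the same weights can be recovered from the inclusion probabilities
# all.equal(d1, 1/inclusionprobabilities(rep(1,N), n1)[s1 == 1])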
Then the harmonization step must be performed. The harmonize function returns the harmonized weights. If the true population totals happen to be known, they can be used instead of the estimates computed within the function.
re <- harmonize(X1,d1,id1,X2,d2,id2)
# if we want to use the population totals to harmonize we can use
re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm)))
w1 <- re$w1
w2 <- re$w2
colSums(Xm)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w1*X1)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w2*X2)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
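For comparison, the totals estimated with the original design weights d1 and d2 generally differ from the population totals, whereas the harmonized weights reproduce them exactly; a quick sketch of this comparison:
# Horvitz-Thompson totals computed with the design weights, before harmonization;
# these generally do not match colSums(Xm)
colSums(d1*X1)
colSums(d2*X2)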
The statistical matching is done by using the otmatch function. The contingency table is then estimated by extracting the id1 units (respectively the id2 units) and by using the function tapply with the correct weights.
# Optimal transport matching
object <- otmatch(X1,id1,X2,id2,w1,w2)
head(object[,1:3])
#> id1 id2 weight
#> 202 202 2801 3.816626
#> 202.1 202 551001 7.458110
#> 1401 1401 133802 9.384434
#> 1604 1604 167801 7.016546
#> 1604.1 1604 258203 5.097148
#> 1903 1903 303202 7.785325
Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),])
Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),])
YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum)
# transform NA into 0
YZ_ot[is.na(YZ_ot)] <- 0
# result
round(addmargins(YZ_ot),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 707.389 863.781 646.338 831.064 812.230 836.188 602.869 5299.858
#> 2 188.524 189.014 134.221 151.914 227.806 150.646 89.866 1131.990
#> 3 35.317 99.326 76.017 48.439 72.604 86.125 74.055 491.883
#> 4 67.534 77.334 150.105 73.640 45.420 75.103 38.897 528.033
#> 5 364.334 339.338 554.635 466.888 338.280 678.597 345.293 3087.365
#> 6 25.031 22.548 35.370 58.257 7.818 15.778 17.966 182.768
#> 7 221.405 199.874 224.169 115.657 245.251 265.628 113.118 1385.102
#> Sum 1609.535 1791.214 1820.854 1745.859 1749.410 2108.065 1282.064 12107.000
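Since the matching weights of a given unit of sample 1 sum to its harmonized weight, the total of the estimated table equals the calibrated population size, and the row margins should coincide with the totals of Y estimated from sample 1 alone. A sketch of this check:
# total weight equals the calibrated population size (the Sum cell above)
sum(object$weight)
# row margins of the estimated table versus the Y totals estimated from sample 1
rowSums(YZ_ot)
tapply(w1, Y1$ecostat, sum)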
As can be seen from the previous section, the optimal transport generally does not produce a one-to-one matching: a unit in sample 1 can be matched to more than one unit of sample 2 with a nonzero weight. The bsmatch function creates a one-to-one matching by selecting a stratified balanced sample, so that in the resulting data.frame each unit of sample 1 has exactly one imputed unit from sample 2.
# Balanced Sampling
BS <- bsmatch(object)
head(BS$object[,1:3])
#> id1 id2 weight
#> 202.1 202 551001 7.458110
#> 1401 1401 133802 9.384434
#> 1604 1604 167801 7.016546
#> 1903 1903 303202 7.785325
#> 2003 2003 470603 10.291664
#> 2502 2502 323402 12.098266
Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 684.406 833.248 659.891 816.898 863.414 856.922 585.080 5299.858
#> 2 173.283 188.159 123.456 158.824 246.468 147.742 94.058 1131.990
#> 3 40.172 87.287 84.030 49.171 72.110 87.781 71.331 491.883
#> 4 49.878 79.118 157.640 70.620 45.900 89.409 35.468 528.033
#> 5 379.469 324.008 570.219 466.501 316.299 654.691 376.178 3087.365
#> 6 26.532 22.569 43.714 52.430 12.324 10.895 14.304 182.768
#> 7 242.655 189.056 237.988 94.647 208.435 320.367 91.954 1385.102
#> Sum 1596.397 1723.444 1876.938 1709.091 1764.950 2167.807 1268.373 12107.000
# With Z2 as auxiliary information for stratified balanced sampling.
BS <- bsmatch(object,Z2)
Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 731.233 899.316 648.513 849.054 811.372 810.830 549.540 5299.858
#> 2 175.333 192.336 141.168 176.120 205.356 156.722 84.954 1131.990
#> 3 28.059 89.968 69.062 75.780 72.110 73.460 83.444 491.883
#> 4 74.943 67.023 156.310 69.983 47.878 64.389 47.507 528.033
#> 5 327.032 347.086 521.409 431.167 367.510 715.015 378.147 3087.365
#> 6 26.532 11.260 43.714 52.430 12.324 22.204 14.304 182.768
#> 7 242.241 175.852 241.964 97.050 235.899 268.121 123.974 1385.102
#> Sum 1605.374 1782.841 1822.139 1751.585 1752.450 2110.740 1281.872 12107.000
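Finally, the matching weights can also be used at the unit level. For each unit of sample 1, the weights of its matched units in sample 2 are normalized to sum to 1 and multiplied by the disjunctive form of the matched values of Z2. Each row of the resulting matrix Z_pred is then an estimated probability distribution over the income categories for one unit of sample 1.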
# split the weights by id1
q_l <- split(object$weight,f = object$id1)
# normalize within each id1 so that the weights sum to 1
q_l <- lapply(q_l, function(x){x/sum(x)})
q <- as.numeric(do.call(c,q_l))
# predicted probability, for each unit of sample 1, of belonging to each income category
Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),])
colnames(Z_pred) <- levels(factor(Z2$c_income))
head(Z_pred)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]
#> [1,] 0.0000000 0 0.0000000 0.0000000 0.6614887 0 0.3385113
#> [2,] 0.0000000 0 1.0000000 0.0000000 0.0000000 0 0.0000000
#> [3,] 0.5792243 0 0.0000000 0.0000000 0.0000000 0 0.4207757
#> [4,] 0.0000000 0 0.3527935 0.6472065 0.0000000 0 0.0000000
#> [5,] 0.0000000 0 0.0000000 1.0000000 0.0000000 0 0.0000000
#> [6,] 0.0000000 0 1.0000000 0.0000000 0.0000000 0 0.0000000
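If a single imputed category per unit is needed, one possibility is to keep, for each row of Z_pred, the category with the largest predicted probability; a minimal sketch (the name Z_hat is purely illustrative, and max.col breaks ties at random):
# impute, for each unit of sample 1, the most probable income category
Z_hat <- colnames(Z_pred)[max.col(Z_pred)]
head(Z_hat)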