In this vignette we will explain how some functions of the package
are used in order to estimate a contingency table. We will work on the
eusilc dataset of the laeken package. All the
functions presented in the following are explained in the proposed
manuscript by Raphaël Jauslin and Yves Tillé (2021) doi:10.1016/j.jspi.2022.12.003.
We will estimate the contingency table when the factor variable which
represents the economic status pl030 is crossed with a
discretized version of the equivalized household income
eqIncome. In order to discretize the equivalized income, we
calculate percentiles (0.15,0.30,0.45,0.60,0.75,0.90) of the variable
and define the category as intervals between the values.
library(laeken)
library(sampling)
library(StratifiedSampling)
#> Loading required package: Matrix
data("eusilc")
eusilc <- na.omit(eusilc)
N <- nrow(eusilc)
# Xm are the matching variables and id are identity of the units
Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")]
Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive))
Xm <- cbind(Xmcat,Xm[,-c(2,4,5)])
id <- eusilc$rb030
# categorial income splitted by the percentile
c_income <- eusilc$eqIncome
q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15))
c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]"
c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]"
c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]"
c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]"
c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]"
c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]"
c_income[which( eusilc$eqIncome > q[7] )] <- "(90,100]"
# variable of interests
Y <- data.frame(ecostat = eusilc$pl030)
Z <- data.frame(c_income = c_income)
# put same rownames
rownames(Xm) <- rownames(Y) <- rownames(Z)<- id
YZ <- table(cbind(Y,Z))
addmargins(YZ)
#> c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 409 616 722 807 935 1025 648 5162
#> 2 189 181 205 184 165 154 82 1160
#> 3 137 90 72 75 59 52 33 518
#> 4 210 159 103 95 74 49 46 736
#> 5 470 462 492 477 459 435 351 3146
#> 6 57 25 28 30 17 11 10 178
#> 7 344 283 194 149 106 91 40 1207
#> Sum 1816 1816 1816 1817 1815 1817 1210 12107Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The sample are selected with simple random sampling without replacement and the weights are equal to the inverse of the inclusion probabilities.
# size of sample
n1 <- 1000
n2 <- 500
# samples
s1 <- srswor(n1,N)
s2 <- srswor(n2,N)
# extract matching units
X1 <- Xm[s1 == 1,]
X2 <- Xm[s2 == 1,]
# extract variable of interest
Y1 <- data.frame(Y[s1 == 1,])
colnames(Y1) <- colnames(Y)
Z2 <- as.data.frame(Z[s2 == 1,])
colnames(Z2) <- colnames(Z)
# extract correct identities
id1 <- id[s1 == 1]
id2 <- id[s2 == 1]
# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2
# here weights are inverse of inclusion probabilities
d1 <- rep(N/n1,n1)
d2 <- rep(N/n2,n2)
# disjunctive form
Y_dis <- sampling::disjunctive(as.matrix(Y))
Z_dis <- sampling::disjunctive(as.matrix(Z))
Y1_dis <- Y_dis[s1 ==1,]
Z2_dis <- Z_dis[s2 ==1,]Then the harmonization step must be performed. The
harmonize function returns the harmonized weights. If by
chance the true population totals are known, it is possible to use these
instead of the estimate made within the function.
re <- harmonize(X1,d1,id1,X2,d2,id2)
# if we want to use the population totals to harmonize we can use
re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm)))
w1 <- re$w1
w2 <- re$w2
colSums(Xm)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w1*X1)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w2*X2)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915The statistical matching is done by using the otmatch
function. The estimation of the contingency table is calculated by
extracting the id1 units (respectively id2
units) and by using the function tapply with the correct
weights.
# Optimal transport matching
object <- otmatch(X1,id1,X2,id2,w1,w2)
head(object[,1:3])
#> id1 id2 weight
#> 101 101 316702 4.262105
#> 101.1 101 383502 7.195460
#> 201 201 589802 11.375502
#> 1101 1101 588401 14.340399
#> 1501 1501 81401 5.097244
#> 1501.1 1501 578202 6.636816
Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),])
Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),])
YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum)
# transform NA into 0
YZ_ot[is.na(YZ_ot)] <- 0
# result
round(addmargins(YZ_ot),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 601.402 829.989 670.670 737.112 1011.755 738.773 540.461 5130.161
#> 2 197.416 211.934 164.826 107.705 169.762 215.407 134.521 1201.571
#> 3 47.993 78.154 105.370 80.508 64.328 71.814 50.925 499.091
#> 4 123.172 182.237 100.541 80.155 142.687 56.733 61.813 747.338
#> 5 423.816 418.535 596.715 465.259 617.813 447.014 274.175 3243.328
#> 6 25.096 20.949 5.934 0.000 19.099 6.474 6.399 83.952
#> 7 98.470 224.440 202.318 120.790 224.133 199.125 132.283 1201.558
#> Sum 1517.365 1966.240 1846.374 1591.528 2249.577 1735.339 1200.578 12107.000As you can see from the previous section, the optimal transport
results generally do not have a one-to-one match, meaning that for every
unit in sample 1, we have more than one unit with weights not equal to 0
in sample 2. The bsmatch function creates a one-to-one
match by selecting a balanced stratified sampling to obtain a data.frame
where each unit in sample 1 has only one imputed unit from sample 2.
# Balanced Sampling
BS <- bsmatch(object)
head(BS$object[,1:3])
#> id1 id2 weight
#> 101 101 316702 4.262105
#> 201 201 589802 11.375502
#> 1101 1101 588401 14.340399
#> 1501.1 1501 578202 6.636816
#> 1 1801 1801 11.651611
#> 2002.1 2002 42702 7.631196
Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 627.833 904.635 677.325 707.843 970.451 755.175 486.900 5130.161
#> 2 192.815 211.977 204.107 72.056 179.475 216.542 124.599 1201.571
#> 3 46.710 73.230 105.559 88.577 61.878 76.282 46.855 499.091
#> 4 136.003 178.569 115.200 64.568 157.469 48.578 46.951 747.338
#> 5 407.800 418.776 617.913 490.041 601.555 459.341 247.902 3243.328
#> 6 25.096 22.942 0.000 0.000 25.034 0.000 10.881 83.952
#> 7 96.887 218.956 218.078 125.684 241.680 165.598 134.676 1201.558
#> Sum 1533.143 2029.085 1938.183 1548.769 2237.542 1721.515 1098.763 12107.000
# With Z2 as auxiliary information for stratified balanced sampling.
BS <- bsmatch(object,Z2)
Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 590.145 827.742 670.065 728.406 980.477 762.761 570.565 5130.161
#> 2 215.133 201.011 143.845 107.613 166.470 255.370 112.130 1201.571
#> 3 35.247 86.071 105.631 66.137 76.223 61.937 67.845 499.091
#> 4 103.437 217.668 101.162 85.702 136.330 59.247 43.791 747.338
#> 5 445.777 408.842 590.066 486.482 643.501 407.328 261.332 3243.328
#> 6 25.096 22.942 13.360 0.000 11.673 0.000 10.881 83.952
#> 7 97.370 206.150 214.965 126.903 240.961 188.874 126.334 1201.558
#> Sum 1512.206 1970.426 1839.094 1601.244 2255.635 1735.518 1192.877 12107.000
# split the weight by id1
q_l <- split(object$weight,f = object$id1)
# normalize in each id1
q_l <- lapply(q_l, function(x){x/sum(x)})
q <- as.numeric(do.call(c,q_l))
Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),])
colnames(Z_pred) <- levels(factor(Z2$c_income))
head(Z_pred)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]
#> [1,] 0 0.3719905 0 0 0.0000000 0 0.6280095
#> [2,] 0 0.0000000 0 0 1.0000000 0 0.0000000
#> [3,] 0 0.0000000 0 1 0.0000000 0 0.0000000
#> [4,] 0 0.0000000 0 0 0.5656027 0 0.4343973
#> [5,] 0 0.0000000 1 0 0.0000000 0 0.0000000
#> [6,] 0 0.0000000 0 0 0.2913712 0 0.7086288