Statistical Matching using Optimal Transport

Introduction

In this vignette we will explain how some functions of the package are used in order to estimate a contingency table. We will work on the eusilc dataset of the laeken package. All the functions presented in the following are explained in the proposed manuscript by Raphaël Jauslin and Yves Tillé (2021) doi:10.1016/j.jspi.2022.12.003.

Contingency table

We will estimate the contingency table when the factor variable which represents the economic status pl030 is crossed with a discretized version of the equivalized household income eqIncome. In order to discretize the equivalized income, we calculate percentiles (0.15,0.30,0.45,0.60,0.75,0.90) of the variable and define the category as intervals between the values.

library(laeken)
library(sampling)
library(StratifiedSampling)
#> Loading required package: Matrix

data("eusilc")
eusilc <- na.omit(eusilc)
N <- nrow(eusilc)


# Xm are the matching variables and id are identity of the units
Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")]
Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive))
Xm <- cbind(Xmcat,Xm[,-c(2,4,5)])
id <- eusilc$rb030


# categorial income splitted by the percentile
c_income  <- eusilc$eqIncome
q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15))
c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]"
c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]"
c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]"
c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]"
c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]"
c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]"
c_income[which(  eusilc$eqIncome > q[7] )] <- "(90,100]"

# variable of interests
Y <- data.frame(ecostat = eusilc$pl030)
Z <- data.frame(c_income = c_income)

# put same rownames
rownames(Xm) <- rownames(Y) <- rownames(Z)<- id

YZ <- table(cbind(Y,Z))
addmargins(YZ)
#>        c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]   Sum
#>     1      409     616     722     807     935    1025      648  5162
#>     2      189     181     205     184     165     154       82  1160
#>     3      137      90      72      75      59      52       33   518
#>     4      210     159     103      95      74      49       46   736
#>     5      470     462     492     477     459     435      351  3146
#>     6       57      25      28      30      17      11       10   178
#>     7      344     283     194     149     106      91       40  1207
#>     Sum   1816    1816    1816    1817    1815    1817     1210 12107

Sampling schemes

Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The sample are selected with simple random sampling without replacement and the weights are equal to the inverse of the inclusion probabilities.


# size of sample
n1 <- 1000
n2 <- 500

# samples
s1 <- srswor(n1,N)
s2 <- srswor(n2,N)
  
# extract matching units
X1 <- Xm[s1 == 1,]
X2 <- Xm[s2 == 1,]
  
# extract variable of interest
Y1 <- data.frame(Y[s1 == 1,])
colnames(Y1) <- colnames(Y)
Z2 <- as.data.frame(Z[s2 == 1,])
colnames(Z2) <- colnames(Z)
  
# extract correct identities
id1 <- id[s1 == 1]
id2 <- id[s2 == 1]
  
# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2
  
# here weights are inverse of inclusion probabilities
d1 <- rep(N/n1,n1)
d2 <- rep(N/n2,n2)
  
# disjunctive form
Y_dis <- sampling::disjunctive(as.matrix(Y))
Z_dis <- sampling::disjunctive(as.matrix(Z))
  
Y1_dis <- Y_dis[s1 ==1,]
Z2_dis <- Z_dis[s2 ==1,]

Harmonization

Then the harmonization step must be performed. The harmonize function returns the harmonized weights. If by chance the true population totals are known, it is possible to use these instead of the estimate made within the function.



re <- harmonize(X1,d1,id1,X2,d2,id2)  

# if we want to use the population totals to harmonize we can use 
re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm)))

w1 <- re$w1
w2 <- re$w2

colSums(Xm)
#>      1      2      3      4      5      6      7      8      9     10     11 
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844 
#>     12     13     14  hsize    age 
#>  11073    283    751  36380 559915
colSums(w1*X1)
#>      1      2      3      4      5      6      7      8      9     10     11 
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844 
#>     12     13     14  hsize    age 
#>  11073    283    751  36380 559915
colSums(w2*X2)
#>      1      2      3      4      5      6      7      8      9     10     11 
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844 
#>     12     13     14  hsize    age 
#>  11073    283    751  36380 559915

Optimal transport matching

The statistical matching is done by using the otmatch function. The estimation of the contingency table is calculated by extracting the id1 units (respectively id2 units) and by using the function tapply with the correct weights.


# Optimal transport matching
object <- otmatch(X1,id1,X2,id2,w1,w2)
head(object[,1:3])
#>       id1    id2    weight
#> 401   401 254002 4.6897016
#> 401.1 401 338203 1.2225508
#> 401.2 401 520204 8.3685001
#> 404   404 128203 7.4585822
#> 404.1 404 129304 1.4732214
#> 404.2 404 345204 0.3303463

Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),])
Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),])
YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum)

# transform NA into 0
YZ_ot[is.na(YZ_ot)] <- 0

# result
round(addmargins(YZ_ot),3)
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    664.752  739.638 1016.183  763.107  721.479  971.998  471.848  5349.005
#> 2    173.976  155.584  110.832  153.651  141.377  204.326  101.979  1041.726
#> 3    105.741   89.723   60.746   97.024   63.562   92.840   24.214   533.849
#> 4    158.217  110.751  199.985  112.196   95.874   43.539   16.814   737.375
#> 5    501.775  354.176  597.302  495.710  377.382  265.385  371.108  2962.837
#> 6      0.000   20.910   30.913   26.154   25.471   39.062   12.287   154.797
#> 7    200.786  186.903  170.232  221.771  156.036  293.921   97.763  1327.412
#> Sum 1805.247 1657.686 2186.193 1869.612 1581.180 1911.071 1096.012 12107.000

Balanced sampling

As you can see from the previous section, the optimal transport results generally do not have a one-to-one match, meaning that for every unit in sample 1, we have more than one unit with weights not equal to 0 in sample 2. The bsmatch function creates a one-to-one match by selecting a balanced stratified sampling to obtain a data.frame where each unit in sample 1 has only one imputed unit from sample 2.


# Balanced Sampling 
BS <- bsmatch(object)
head(BS$object[,1:3])
#>         id1    id2    weight
#> 401.2   401 520204  8.368500
#> 404     404 128203  7.458582
#> 1604.1 1604 346803  5.162819
#> 1      2404   2404 12.494322
#> 2502.1 2502 254301  6.673364
#> 2505   2505 517303 12.065209


Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    702.807  757.839  988.954  742.316  747.296  901.918  507.874  5349.005
#> 2    146.707  149.430  128.279  143.933  163.601  210.191   99.585  1041.726
#> 3    107.077   86.191   62.112   96.616   72.315   85.654   23.885   533.849
#> 4    160.604  122.115  184.879  123.801   85.967   46.142   13.868   737.375
#> 5    492.239  364.711  598.014  534.895  369.306  239.095  364.576  2962.837
#> 6      0.000   36.935   34.838   12.382   23.839   37.354    9.450   154.797
#> 7    178.657  209.746  173.591  212.002  142.691  316.860   93.865  1327.412
#> Sum 1788.091 1726.966 2170.666 1865.944 1605.016 1837.214 1113.103 12107.000

# With Z2 as auxiliary information for stratified balanced sampling.
BS <- bsmatch(object,Z2)

Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    657.740  731.212 1019.177  733.526  730.351  978.206  498.793  5349.005
#> 2    197.392  173.803  106.704  154.185  127.148  195.200   87.293  1041.726
#> 3    117.703   87.281   62.356   91.397   60.404   89.065   25.644   533.849
#> 4    133.951  108.105  202.113  121.487   98.109   59.742   13.868   737.375
#> 5    488.592  345.554  629.447  505.639  363.751  257.436  372.419  2962.837
#> 6      0.000   24.473   34.838   26.333   23.839   35.864    9.450   154.797
#> 7    215.996  182.906  136.943  236.239  160.274  305.875   89.178  1327.412
#> Sum 1811.373 1653.334 2191.577 1868.806 1563.875 1921.388 1096.646 12107.000

Prediction


# split the weight by id1
q_l <- split(object$weight,f = object$id1)
# normalize in each id1
q_l <- lapply(q_l, function(x){x/sum(x)})
q <- as.numeric(do.call(c,q_l))
  
Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),])
colnames(Z_pred) <- levels(factor(Z2$c_income))
head(Z_pred)
#>      (0,15]   (15,30]   (30,45]    (45,60]   (60,75]    (75,90] (90,100]
#> [1,]      0 0.0000000 0.3283932 0.58599854 0.0000000 0.08560829        0
#> [2,]      0 0.5351022 0.3355042 0.02370009 0.1056935 0.00000000        0
#> [3,]      0 0.5665677 0.0000000 0.00000000 0.4334323 0.00000000        0
#> [4,]      0 0.0000000 0.0000000 0.00000000 0.0000000 1.00000000        0
#> [5,]      0 0.5382098 0.0000000 0.46179017 0.0000000 0.00000000        0
#> [6,]      0 1.0000000 0.0000000 0.00000000 0.0000000 0.00000000        0