Assuming all software dependencies and ROKET (Little et al., 2023) installation are installed, we can begin.
# Load libraries
req_packs = c("devtools","smarter","ggplot2","reshape2",
"survival","ggdendro","MiRKAT","ROKET")
for(pack in req_packs){
library(package = pack,character.only = TRUE)
}
#> Loading required package: usethis
#> Registered S3 method overwritten by 'httr':
#> method from
#> print.response rmutil
# List package's exported functions
ls("package:ROKET")
#> [1] "kernTEST" "kOT_sim_AGG" "kOT_sim_make" "kOT_sim_OT" "kOT_sim_REG"
#> [6] "run_myOT" "run_myOTs"
# Fix seed
set.seed(2)For \(i = 1,\ldots,N\) and \(g = 1,\ldots,G\), let
We would like to calculate the distance between the \(i\)th and \(j\)th samples in terms of mutated genes \(\mathbf{Z}_i\) and \(\mathbf{Z}_j\), denoted by \(d\left(\mathbf{Z}_i,\mathbf{Z}_j\right)\).
Several optimal transport (OT) references to get familiarized with the framework, objective functions, updating equations, entropic regularizations, types of OT are provided:
For the \(i\)th and \(j\)th samples, let
If the mass of the two vectors are equal or normalized such that \(\sum_g Z_{ig} = \sum_g Z_{jg}\), we could use balanced optimal transport.
If the mass of the two vectors are not equal, \(\sum_g Z_{ig} \neq \sum_g Z_{jg}\), we could use unbalanced optimal transport with penalty parameters.
The code below will simulate samples and mutated genes. We welcome the reader to manipulate the following input arguments:
NN for sample size,PP for number of pathways,GG for number of genes, andbnd_same the lower bound gene similarity for genes
sharing the same pathway and 1 - bnd_same, an upper bound
on gene similarity for genes not sharing the same pathwayIdeally, PP should be much less than GG to
allow multiple genes to be allocated to each pathway.
# number of samples
NN = 30
NN_nms = sprintf("S%s",seq(NN))
# number of pathways
PP = 4
PP_nms = sprintf("P%s",seq(PP))
# number of genes
GG = 30
GG_nms = sprintf("G%s",seq(GG))
# bound for gene similarity of two genes on same or different pathway
bnd_same = 0.75
# Gene and pathway relationship
GP = smart_df(PATH = sample(seq(PP),GG,replace = TRUE),
GENE = seq(GG))
table(GP$PATH)
#>
#> 1 2 3 4
#> 9 6 7 8
# gene-gene similarity matrix
GS = matrix(NA,GG,GG)
dimnames(GS) = list(GG_nms,GG_nms)
diag(GS) = 1
tmp_mat = t(combn(seq(GG),2))
for(ii in seq(nrow(tmp_mat))){
G1 = tmp_mat[ii,1]
G2 = tmp_mat[ii,2]
same = GP$PATH[GP$GENE == G1] == GP$PATH[GP$GENE == G2]
if( same )
GS[G1,G2] = runif(1,bnd_same,1)
else
GS[G1,G2] = runif(1,0,1 - bnd_same)
}
GS[lower.tri(GS)] = t(GS)[lower.tri(GS)]
# round(GS,3)Let’s take a look at the gene similarity matrix.
#> Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
#> ℹ Please use the `linewidth` argument instead.
#> This warning is displayed once per session.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
The function show_tile() is inherent to this vignette
and not part of the ROKET package.
# Mutated gene statuses
prob_mut = 0.2
prob_muts = c(1 - prob_mut,prob_mut)
while(TRUE){
ZZ = matrix(sample(c(0,1),NN*GG,replace = TRUE,prob = prob_muts),NN,GG)
# Ensure each sample has at least one mutated gene
if( min(rowSums(ZZ)) > 0 ) break
}
dimnames(ZZ) = list(NN_nms,GG_nms)
show_tile(MAT = ZZ,
LABEL = "Mutation Status: Gene by Sample",
TYPE = "MUT",DIGITS = 0)
# Store all distances
DD = array(data = NA,dim = c(NN,NN,5))
dimnames(DD)[1:2] = list(NN_nms,NN_nms)
dimnames(DD)[[3]] = c("EUC","OT_Balanced",sprintf("OT_LAM%s",c(0.5,1.0,5.0)))We can look at the distribution of gene mutation frequencies.
freq = colSums(ZZ); # freq
dat = smart_df(GENE = names(freq),FREQ = as.integer(freq))
dat$GENE = factor(dat$GENE,levels = names(sort(freq,decreasing = TRUE)))
# dat
ggplot(data = dat,mapping = aes(x = GENE,y = FREQ)) +
geom_bar(stat = "identity") +
xlab("Gene") + ylab("Mutation Frequency") +
scale_x_discrete(guide = guide_axis(n.dodge = 2))We can calculate the Euclidean distance, which does not incorporate relationships between pairs of genes.
DD[,,"EUC"] = as.matrix(dist(ZZ,diag = TRUE,upper = TRUE))
hout = hclust(as.dist(DD[,,"EUC"]))
ord = hout$labels[hout$order]
show_tile(MAT = DD[,,"EUC"][ord,ord],
LABEL = "Euclidean Pairwise Distances",
TYPE = "DIST",DIGITS = 2)To demonstrate ROKET’s functionality, the code below
will run balanced OT (pre-normalizing input vectors) between two
samples. Regardless of the values specified by LAMBDA1 and
LAMBDA2 arguments, we need to set
balance = TRUE. The OT cost matrix argument corresponds to
1 - GS, one minus the gene similarity matrix.
# Pick two samples
ii = 1
jj = 2
ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0]
#> G6 G8 G9 G17 G21 G25 G26 G27 G29
#> S1 0 1 0 1 0 1 1 1 1
#> S2 1 1 1 0 1 0 0 0 0
outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,],
COST = 1 - GS,EPS = 1e-3,LAMBDA1 = 1,
LAMBDA2 = 1,balance = TRUE,verbose = FALSE)
# str(outOT)
# Optimal transport matrix
tmpOT = outOT$OT
tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0]
show_tile(MAT = tmpOT,LABEL = "Balanced OT (showing only genes mutated in each sample)",
TYPE = "DIST",LABx = sprintf("Sample %s",ii),
LABy = sprintf("Sample %s",jj),
DIGITS = 2)Let’s try again but with unbalanced OT and \(\lambda = 0.5\). We need to set
balance = FALSE and specify LAMBDA1 = 0.5 and
LAMBDA2 = 0.5.
ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0]
#> G6 G8 G9 G17 G21 G25 G26 G27 G29
#> S1 0 1 0 1 0 1 1 1 1
#> S2 1 1 1 0 1 0 0 0 0
LAM = 0.5
outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,],
COST = 1 - GS,EPS = 1e-3,LAMBDA1 = LAM,
LAMBDA2 = LAM,balance = FALSE,verbose = FALSE)
# str(outOT)
# Optimal transport matrix
tmpOT = outOT$OT
tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0]
show_tile(MAT = tmpOT,
LABEL = "Unbalanced OT (showing only genes mutated in each sample)",TYPE = "DIST",
LABx = sprintf("Sample %s",ii),
LABy = sprintf("Sample %s",jj),
DIGITS = 2)ROKET can run optimal transport calculations across all \(N\) choose 2 samples. Below is code to run balanced OT.
outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS,
EPS = 1e-3,balance = TRUE,LAMBDA1 = 1,
LAMBDA2 = 1,conv = 1e-5,max_iter = 3e3,
ncores = 1,verbose = FALSE)
hout = hclust(as.dist(outOTs))
ord = hout$labels[hout$order]
show_tile(MAT = outOTs[ord,ord],
LABEL = "Balanced OT Distances",
TYPE = "DIST",DIGITS = 1)We can run calculations for various \(\lambda\) values. For shorthand, \(\lambda\) = Inf corresponds to balanced OT.
LAMs = c(0,0.5,1.0,5.0)
for(LAM in LAMs){
# LAM = LAMs[2]
BAL = ifelse(LAM == 0,TRUE,FALSE)
LAM2 = ifelse(BAL,1,LAM)
outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS,
EPS = 1e-3,balance = BAL,LAMBDA1 = LAM2,
LAMBDA2 = LAM2,conv = 1e-5,max_iter = 3e3,
ncores = 1,verbose = FALSE)
hout = hclust(as.dist(outOTs))
ord = hout$labels[hout$order]
LABEL = ifelse(BAL,"OT Distances (Balanced)",
sprintf("OT Distances (Lambda = %s)",LAM))
LABEL2 = ifelse(BAL,"OT_Balanced",sprintf("OT_LAM%s",LAM))
gg = show_tile(MAT = outOTs[ord,ord],
LABEL = LABEL,TYPE = "DIST",DIGITS = 2)
print(gg)
DD[,,LABEL2] = outOTs
rm(outOTs)
}We can see that Euclidean distance calculations on gene mutation statuses alone does not lead to strong evidence of sample clusters. However optimal transport-based distance calculations with integrated gene-gene similarities provide stronger evidence of sample clusters.
nms = dimnames(DD)[[3]]; nms
#> [1] "EUC" "OT_Balanced" "OT_LAM0.5" "OT_LAM1" "OT_LAM5"
for(nm in nms){
# nm = nms[2]
hout = hclust(as.dist(DD[,,nm]))
hdend = as.dendrogram(hout)
dend_data = dendro_data(hdend,type = "rectangle")
gg = ggplot(dend_data$segments) +
geom_segment(aes(x = x,y = y,xend = xend,yend = yend)) +
ggtitle(nm) + xlab("") + ylab("") +
geom_text(data = dend_data$labels,
mapping = aes(x,y,label = label),vjust = 0.5,hjust = 1) +
theme_dendro() + coord_flip() +
theme(plot.title = element_text(hjust = 0.5))
print(gg)
rm(hout,hdend,dend_data,gg)
}The next step is to transform distance matrices into centered kernel matrices to perform hypothesis testing on our constructed kernels.
For a binary or continuous outcome, \(Y_i\), fitted with a generalized linear model, we have
\[ g\left(E\left[Y_i \middle| \mathbf{X}_i,\mathbf{Z}_i\right]\right) = \mathbf{X}_i^\text{T}\mathbf{\beta} + f(\mathbf{Z}_i) = \mathbf{X}_i^\text{T}\mathbf{\beta} + \mathbf{K}_i^\text{T}\mathbf{\alpha} \]
and for Cox proportional hazards for survival outcomes, we have
\[ h(t;\mathbf{X}_i,\mathbf{Z}_i) = h_0(t) \exp\left\{\mathbf{X}_i^\text{T}\mathbf{\beta} + f(\mathbf{Z}_i)\right\} = h_0(t) \exp\left\{\mathbf{X}_i^\text{T}\mathbf{\beta} + \mathbf{K}_i^\text{T}\mathbf{\alpha}\right\} \]
where
The matrix \(\mathbf{K}\) is constructed from the distance matrix (Euclidean or optimal transport), \(\mathbf{D}\), by double centering the rows and columns of \(\mathbf{D}\) with the formula
\[ \tilde{\mathbf{K}} = -\frac{1}{2} \left(\mathbf{I}_N - \mathbf{J}_N \mathbf{J}_N^\text{T}\right) \mathbf{D}^2 \left(\mathbf{I}_N - \mathbf{J}_N \mathbf{J}_N^\text{T}\right) \]
where
Since \(\tilde{\mathbf{K}}\) is not guaranteed to be positive semi-definite, we perform spectral decomposition and replace negative eigenvalues with zero and re-calculate the kernel to arrive at \(\mathbf{K}\).
For a single candidate distance matrix, transform the distance matrix
to a kernel matrix and convert the object into a list, denoted by the
object KK, code is provided below from the R package, MiRKAT
(Plantinga et al., 2017; Zhao et al.,
2015).
If there are multiple distance matrices, in our case, contained in an
array DD, store them into a list of kernel matrices
KK. Example code is below.
The user needs to pre-specify and fit the null model,
\[ H_0: \mathbf{\alpha} = 0, \]
of baseline covariates, \(\mathbf{X}_i\), to one of the three outcome
models to obtain continuous, logistic, or martingale residuals
(RESI). Some example code is provided below.
# Continuous
out_LM = lm(Y ~ .,data = data.frame(X))
RESI = residuals(out_LM)
# Logistic
out_LOG = glm(Y ~ .,data = data.frame(X),family = "binomial")
RESI = residuals(out_LOG)
# Survival
out_CX = coxph(Surv(TIME,CENS) ~ .,data = data.frame(X))
RESI = residuals(out_CX)With one or multiple candidate kernels, ROKET will take
aKK) (defined
below),RESI), andnPERMS)to calculate individual kernel p-values as well as omnibus tests. An
omnibus test assesses the significance of the minimum p-value kernel
among kernels considered. The function kernTEST requires
names(RESI) and dimension names per object as well as
row/column names per kernel within aKK are named for sample
order consistency. Sample code is provided below. Setting
verbose = TRUE allows the user to track the permutation’s
progress, especially when requesting hundreds of thousands of
permutations.
nPERMS = 1e5
nKK = length(KK)
# Array of kernels
aKK = array(data = NA,dim = dim(DD),
dimnames = dimnames(DD))
for(nm in dimnames(DD)[[3]]){
aKK[,,nm] = KK[[nm]]
}
# Create OMNI matrix
OMNI = matrix(0,nrow = 2,ncol = dim(aKK)[3],
dimnames = list(c("EUC","OT"),dimnames(aKK)[[3]]))
OMNI["EUC","EUC"] = 1
OMNI["OT",grepl("^OT",colnames(OMNI))] = 1
OMNI
# Hypothesis Testing
ROKET::kernTEST(RESI = RESI,
KK = aKK,
OMNI = OMNI,
nPERMS = nPERMS,
ncores = 1)The final output contains individual kernel p-values and the omnibus p-value.
Have fun with utilizing kernel regression and optimal transport frameworks with ROKET!
sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] ROKET_1.0.0 MiRKAT_1.2.3 ggdendro_0.2.0 survival_3.8-6 reshape2_1.4.5
#> [6] ggplot2_4.0.3 smarter_1.0.1 devtools_2.5.2 usethis_3.2.1
#>
#> loaded via a namespace (and not attached):
#> [1] Rdpack_2.6.6 bitops_1.0-9 inline_0.3.21
#> [4] permute_0.9-10 rlang_1.2.0 magrittr_2.0.5
#> [7] clue_0.3-68 otel_0.2.0 matrixStats_1.5.0
#> [10] compiler_4.6.0 mgcv_1.9-4 vctrs_0.7.3
#> [13] quantreg_6.1 rmutil_1.1.10 stringr_1.6.0
#> [16] pkgconfig_2.0.3 fastmap_1.2.0 ellipsis_0.3.3
#> [19] labeling_0.4.3 caTools_1.18.3 modeest_2.4.0
#> [22] rmarkdown_2.31 sessioninfo_1.2.3 nloptr_2.2.1
#> [25] MatrixModels_0.5-4 purrr_1.2.2 xfun_0.57
#> [28] cachem_1.1.0 jsonlite_2.0.0 parallel_4.6.0
#> [31] cluster_2.1.8.2 R6_2.6.1 bslib_0.11.0
#> [34] stringi_1.8.7 RColorBrewer_1.1-3 boot_1.3-32
#> [37] pkgload_1.5.2 rpart_4.1.27 jquerylib_0.1.4
#> [40] Rcpp_1.1.1-1.1 iterators_1.0.14 knitr_1.51
#> [43] mixtools_2.0.0.1 Matrix_1.7-5 splines_4.6.0
#> [46] tidyselect_1.2.1 yaml_2.3.12 vegan_2.7-3
#> [49] timeDate_4052.112 gplots_3.3.0 codetools_0.2-20
#> [52] pkgbuild_1.4.8 lattice_0.22-9 tibble_3.3.1
#> [55] plyr_1.8.9 withr_3.0.2 S7_0.2.2
#> [58] evaluate_1.0.5 stable_1.1.7 CompQuadForm_1.4.4
#> [61] kernlab_0.9-33 pillar_1.11.1 KernSmooth_2.23-26
#> [64] foreach_1.5.2 reformulas_0.4.4 plotly_4.12.0
#> [67] generics_0.1.4 RCurl_1.98-1.18 scales_1.4.0
#> [70] minqa_1.2.8 timeSeries_4052.112 gtools_3.9.5
#> [73] glue_1.8.1 statip_0.2.3 lazyeval_0.2.3
#> [76] maketools_1.3.2 tools_4.6.0 data.table_1.18.4
#> [79] sys_3.4.3 lme4_2.0-1 spatial_7.3-18
#> [82] SparseM_1.84-2 fBasics_4052.98 buildtools_1.0.0
#> [85] fs_2.1.0 grid_4.6.0 tidyr_1.3.2
#> [88] ape_5.8-1 rbibutils_2.4.1 nlme_3.1-169
#> [91] cli_3.6.6 segmented_2.2-1 viridisLite_0.4.3
#> [94] dplyr_1.2.1 gtable_0.3.6 stabledist_0.7-2
#> [97] sass_0.4.10 digest_0.6.39 ggrepel_0.9.8
#> [100] htmlwidgets_1.6.4 farver_2.1.2 memoise_2.0.1
#> [103] htmltools_0.5.9 lifecycle_1.0.5 httr_1.4.8
#> [106] PearsonDS_1.3.2 statmod_1.5.2 GUniFrac_1.9
#> [109] MASS_7.3-65