Assuming all software dependencies and ROKET (Little et al., 2023) installation are installed, we can begin.
# Load libraries
req_packs = c("devtools","smarter","ggplot2","reshape2",
"survival","ggdendro","MiRKAT","ROKET")
for(pack in req_packs){
library(package = pack,character.only = TRUE)
}
#> Loading required package: usethis
#> Registered S3 method overwritten by 'httr':
#> method from
#> print.response rmutil
# List package's exported functions
ls("package:ROKET")
#> [1] "kOT_sim_AGG" "kOT_sim_OT" "kOT_sim_REG" "kOT_sim_make" "kernTEST"
#> [6] "run_myOT" "run_myOTs"
# Fix seed
set.seed(2)
For i = 1, …, N and g = 1, …, G, let
We would like to calculate the distance between the ith and jth samples in terms of mutated genes Zi and Zj, denoted by d(Zi, Zj).
Several optimal transport (OT) references to get familiarized with the framework, objective functions, updating equations, entropic regularizations, types of OT are provided:
For the ith and jth samples, let
If the mass of the two vectors are equal or normalized such that ∑gZig = ∑gZjg, we could use balanced optimal transport.
If the mass of the two vectors are not equal, ∑gZig ≠ ∑gZjg, we could use unbalanced optimal transport with penalty parameters.
The code below will simulate samples and mutated genes. We welcome the reader to manipulate the following input arguments:
NN
for sample size,PP
for number of pathways,GG
for number of genes, andbnd_same
the lower bound gene similarity for genes
sharing the same pathway and 1 - bnd_same
, an upper bound
on gene similarity for genes not sharing the same pathwayIdeally, PP
should be much less than GG
to
allow multiple genes to be allocated to each pathway.
# number of samples
NN = 30
NN_nms = sprintf("S%s",seq(NN))
# number of pathways
PP = 4
PP_nms = sprintf("P%s",seq(PP))
# number of genes
GG = 30
GG_nms = sprintf("G%s",seq(GG))
# bound for gene similarity of two genes on same or different pathway
bnd_same = 0.75
# Gene and pathway relationship
GP = smart_df(PATH = sample(seq(PP),GG,replace = TRUE),
GENE = seq(GG))
table(GP$PATH)
#>
#> 1 2 3 4
#> 9 6 7 8
# gene-gene similarity matrix
GS = matrix(NA,GG,GG)
dimnames(GS) = list(GG_nms,GG_nms)
diag(GS) = 1
tmp_mat = t(combn(seq(GG),2))
for(ii in seq(nrow(tmp_mat))){
G1 = tmp_mat[ii,1]
G2 = tmp_mat[ii,2]
same = GP$PATH[GP$GENE == G1] == GP$PATH[GP$GENE == G2]
if( same )
GS[G1,G2] = runif(1,bnd_same,1)
else
GS[G1,G2] = runif(1,0,1 - bnd_same)
}
GS[lower.tri(GS)] = t(GS)[lower.tri(GS)]
# round(GS,3)
Let’s take a look at the gene similarity matrix.
#> Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
#> ℹ Please use the `linewidth` argument instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
The function show_tile()
is inherent to this vignette
and not part of the ROKET package.
# Mutated gene statuses
prob_mut = 0.2
prob_muts = c(1 - prob_mut,prob_mut)
while(TRUE){
ZZ = matrix(sample(c(0,1),NN*GG,replace = TRUE,prob = prob_muts),NN,GG)
# Ensure each sample has at least one mutated gene
if( min(rowSums(ZZ)) > 0 ) break
}
dimnames(ZZ) = list(NN_nms,GG_nms)
show_tile(MAT = ZZ,
LABEL = "Mutation Status: Gene by Sample",
TYPE = "MUT",DIGITS = 0)
# Store all distances
DD = array(data = NA,dim = c(NN,NN,5))
dimnames(DD)[1:2] = list(NN_nms,NN_nms)
dimnames(DD)[[3]] = c("EUC","OT_Balanced",sprintf("OT_LAM%s",c(0.5,1.0,5.0)))
We can look at the distribution of gene mutation frequencies.
freq = colSums(ZZ); # freq
dat = smart_df(GENE = names(freq),FREQ = as.integer(freq))
dat$GENE = factor(dat$GENE,levels = names(sort(freq,decreasing = TRUE)))
# dat
ggplot(data = dat,mapping = aes(x = GENE,y = FREQ)) +
geom_bar(stat = "identity") +
xlab("Gene") + ylab("Mutation Frequency") +
scale_x_discrete(guide = guide_axis(n.dodge = 2))
We can calculate the Euclidean distance, which does not incorporate relationships between pairs of genes.
DD[,,"EUC"] = as.matrix(dist(ZZ,diag = TRUE,upper = TRUE))
hout = hclust(as.dist(DD[,,"EUC"]))
ord = hout$labels[hout$order]
show_tile(MAT = DD[,,"EUC"][ord,ord],
LABEL = "Euclidean Pairwise Distances",
TYPE = "DIST",DIGITS = 2)
To demonstrate ROKET’s functionality, the code below
will run balanced OT (pre-normalizing input vectors) between two
samples. Regardless of the values specified by LAMBDA1
and
LAMBDA2
arguments, we need to set
balance = TRUE
. The OT cost matrix argument corresponds to
1 - GS
, one minus the gene similarity matrix.
# Pick two samples
ii = 1
jj = 2
ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0]
#> G6 G8 G9 G17 G21 G25 G26 G27 G29
#> S1 0 1 0 1 0 1 1 1 1
#> S2 1 1 1 0 1 0 0 0 0
outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,],
COST = 1 - GS,EPS = 1e-3,LAMBDA1 = 1,
LAMBDA2 = 1,balance = TRUE,verbose = FALSE)
# str(outOT)
# Optimal transport matrix
tmpOT = outOT$OT
tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0]
show_tile(MAT = tmpOT,LABEL = "Balanced OT (showing only genes mutated in each sample)",
TYPE = "DIST",LABx = sprintf("Sample %s",ii),
LABy = sprintf("Sample %s",jj),
DIGITS = 2)
Let’s try again but with unbalanced OT and λ = 0.5. We need to set
balance = FALSE
and specify LAMBDA1 = 0.5
and
LAMBDA2 = 0.5
.
ZZ[c(ii,jj),colSums(ZZ[c(ii,jj),]) > 0]
#> G6 G8 G9 G17 G21 G25 G26 G27 G29
#> S1 0 1 0 1 0 1 1 1 1
#> S2 1 1 1 0 1 0 0 0 0
LAM = 0.5
outOT = run_myOT(XX = ZZ[ii,],YY = ZZ[jj,],
COST = 1 - GS,EPS = 1e-3,LAMBDA1 = LAM,
LAMBDA2 = LAM,balance = FALSE,verbose = FALSE)
# str(outOT)
# Optimal transport matrix
tmpOT = outOT$OT
tmpOT = tmpOT[rowSums(tmpOT) > 0,colSums(tmpOT) > 0]
show_tile(MAT = tmpOT,
LABEL = "Unbalanced OT (showing only genes mutated in each sample)",TYPE = "DIST",
LABx = sprintf("Sample %s",ii),
LABy = sprintf("Sample %s",jj),
DIGITS = 2)
ROKET can run optimal transport calculations across all N choose 2 samples. Below is code to run balanced OT.
outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS,
EPS = 1e-3,balance = TRUE,LAMBDA1 = 1,
LAMBDA2 = 1,conv = 1e-5,max_iter = 3e3,
ncores = 1,verbose = FALSE)
hout = hclust(as.dist(outOTs))
ord = hout$labels[hout$order]
show_tile(MAT = outOTs[ord,ord],
LABEL = "Balanced OT Distances",
TYPE = "DIST",DIGITS = 1)
We can run calculations for various λ values. For shorthand, λ = Inf corresponds to balanced OT.
LAMs = c(0,0.5,1.0,5.0)
for(LAM in LAMs){
# LAM = LAMs[2]
BAL = ifelse(LAM == 0,TRUE,FALSE)
LAM2 = ifelse(BAL,1,LAM)
outOTs = run_myOTs(ZZ = t(ZZ),COST = 1 - GS,
EPS = 1e-3,balance = BAL,LAMBDA1 = LAM2,
LAMBDA2 = LAM2,conv = 1e-5,max_iter = 3e3,
ncores = 1,verbose = FALSE)
hout = hclust(as.dist(outOTs))
ord = hout$labels[hout$order]
LABEL = ifelse(BAL,"OT Distances (Balanced)",
sprintf("OT Distances (Lambda = %s)",LAM))
LABEL2 = ifelse(BAL,"OT_Balanced",sprintf("OT_LAM%s",LAM))
gg = show_tile(MAT = outOTs[ord,ord],
LABEL = LABEL,TYPE = "DIST",DIGITS = 2)
print(gg)
DD[,,LABEL2] = outOTs
rm(outOTs)
}
We can see that Euclidean distance calculations on gene mutation statuses alone does not lead to strong evidence of sample clusters. However optimal transport-based distance calculations with integrated gene-gene similarities provide stronger evidence of sample clusters.
nms = dimnames(DD)[[3]]; nms
#> [1] "EUC" "OT_Balanced" "OT_LAM0.5" "OT_LAM1" "OT_LAM5"
for(nm in nms){
# nm = nms[2]
hout = hclust(as.dist(DD[,,nm]))
hdend = as.dendrogram(hout)
dend_data = dendro_data(hdend,type = "rectangle")
gg = ggplot(dend_data$segments) +
geom_segment(aes(x = x,y = y,xend = xend,yend = yend)) +
ggtitle(nm) + xlab("") + ylab("") +
geom_text(data = dend_data$labels,
mapping = aes(x,y,label = label),vjust = 0.5,hjust = 1) +
theme_dendro() + coord_flip() +
theme(plot.title = element_text(hjust = 0.5))
print(gg)
rm(hout,hdend,dend_data,gg)
}
The next step is to transform distance matrices into centered kernel matrices to perform hypothesis testing on our constructed kernels.
For a binary or continuous outcome, Yi, fitted with a generalized linear model, we have
g(E[Yi|Xi, Zi]) = XiTβ + f(Zi) = XiTβ + KiTα
and for Cox proportional hazards for survival outcomes, we have
h(t; Xi, Zi) = h0(t)exp {XiTβ + f(Zi)} = h0(t)exp {XiTβ + KiTα}
where
The matrix K is constructed from the distance matrix (Euclidean or optimal transport), D, by double centering the rows and columns of D with the formula
$$ \tilde{\mathbf{K}} = -\frac{1}{2} \left(\mathbf{I}_N - \mathbf{J}_N \mathbf{J}_N^\text{T}\right) \mathbf{D}^2 \left(\mathbf{I}_N - \mathbf{J}_N \mathbf{J}_N^\text{T}\right) $$
where
Since $\tilde{\mathbf{K}}$ is not guaranteed to be positive semi-definite, we perform spectral decomposition and replace negative eigenvalues with zero and re-calculate the kernel to arrive at K.
For a single candidate distance matrix, transform the distance matrix
to a kernel matrix and convert the object into a list, denoted by the
object KK
, code is provided below from the R package, MiRKAT
(Plantinga et al., 2017; Zhao et al.,
2015).
If there are multiple distance matrices, in our case, contained in an
array DD
, store them into a list of kernel matrices
KK
. Example code is below.
The user needs to pre-specify and fit the null model,
H0 : α = 0,
of baseline covariates, Xi, to
one of the three outcome models to obtain continuous, logistic, or
martingale residuals (RESI
). Some example code is provided
below.
# Continuous
out_LM = lm(Y ~ .,data = data.frame(X))
RESI = residuals(out_LM)
# Logistic
out_LOG = glm(Y ~ .,data = data.frame(X),family = "binomial")
RESI = residuals(out_LOG)
# Survival
out_CX = coxph(Surv(TIME,CENS) ~ .,data = data.frame(X))
RESI = residuals(out_CX)
With one or multiple candidate kernels, ROKET will take
aKK
) (defined
below),RESI
), andnPERMS
)to calculate individual kernel p-values as well as omnibus tests. An
omnibus test assesses the significance of the minimum p-value kernel
among kernels considered. The function kernTEST
requires
names(RESI)
and dimension names per object as well as
row/column names per kernel within aKK
are named for sample
order consistency. Sample code is provided below. Setting
verbose = TRUE
allows the user to track the permutation’s
progress, especially when requesting hundreds of thousands of
permutations.
nPERMS = 1e5
nKK = length(KK)
# Array of kernels
aKK = array(data = NA,dim = dim(DD),
dimnames = dimnames(DD))
for(nm in dimnames(DD)[[3]]){
aKK[,,nm] = KK[[nm]]
}
# Create OMNI matrix
OMNI = matrix(0,nrow = 2,ncol = dim(aKK)[3],
dimnames = list(c("EUC","OT"),dimnames(aKK)[[3]]))
OMNI["EUC","EUC"] = 1
OMNI["OT",grepl("^OT",colnames(OMNI))] = 1
OMNI
# Hypothesis Testing
ROKET::kernTEST(RESI = RESI,
KK = aKK,
OMNI = OMNI,
nPERMS = nPERMS,
ncores = 1)
The final output contains individual kernel p-values and the omnibus p-value.
Have fun with utilizing kernel regression and optimal transport frameworks with ROKET!
sessionInfo()
#> R version 4.4.2 (2024-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.2 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] ROKET_1.0.0 MiRKAT_1.2.3 ggdendro_0.2.0 survival_3.8-3 reshape2_1.4.4
#> [6] ggplot2_3.5.1 smarter_1.0.1 devtools_2.4.5 usethis_3.1.0
#>
#> loaded via a namespace (and not attached):
#> [1] Rdpack_2.6.2 bitops_1.0-9 remotes_2.5.0
#> [4] inline_0.3.21 permute_0.9-7 rlang_1.1.5
#> [7] magrittr_2.0.3 clue_0.3-66 matrixStats_1.5.0
#> [10] compiler_4.4.2 mgcv_1.9-1 vctrs_0.6.5
#> [13] quantreg_6.00 rmutil_1.1.10 stringr_1.5.1
#> [16] profvis_0.4.0 crayon_1.5.3 pkgconfig_2.0.3
#> [19] fastmap_1.2.0 ellipsis_0.3.2 labeling_0.4.3
#> [22] caTools_1.18.3 modeest_2.4.0 promises_1.3.2
#> [25] rmarkdown_2.29 sessioninfo_1.2.3 nloptr_2.1.1
#> [28] MatrixModels_0.5-3 purrr_1.0.4 xfun_0.51
#> [31] cachem_1.1.0 jsonlite_1.9.1 later_1.4.1
#> [34] parallel_4.4.2 cluster_2.1.8 R6_2.6.1
#> [37] bslib_0.9.0 stringi_1.8.4 boot_1.3-31
#> [40] pkgload_1.4.0 rpart_4.1.24 jquerylib_0.1.4
#> [43] Rcpp_1.0.14 iterators_1.0.14 knitr_1.49
#> [46] mixtools_2.0.0 httpuv_1.6.15 Matrix_1.7-2
#> [49] splines_4.4.2 tidyselect_1.2.1 yaml_2.3.10
#> [52] vegan_2.6-10 timeDate_4041.110 gplots_3.2.0
#> [55] codetools_0.2-20 miniUI_0.1.1.1 pkgbuild_1.4.6
#> [58] lattice_0.22-6 tibble_3.2.1 plyr_1.8.9
#> [61] shiny_1.10.0 withr_3.0.2 evaluate_1.0.3
#> [64] stable_1.1.6 CompQuadForm_1.4.3 urlchecker_1.0.1
#> [67] kernlab_0.9-33 pillar_1.10.1 KernSmooth_2.23-26
#> [70] foreach_1.5.2 reformulas_0.4.0 plotly_4.10.4
#> [73] generics_0.1.3 RCurl_1.98-1.16 munsell_0.5.1
#> [76] scales_1.3.0 minqa_1.2.8 timeSeries_4041.111
#> [79] gtools_3.9.5 xtable_1.8-4 glue_1.8.0
#> [82] statip_0.2.3 lazyeval_0.2.2 maketools_1.3.2
#> [85] tools_4.4.2 data.table_1.17.0 sys_3.4.3
#> [88] lme4_1.1-36 spatial_7.3-18 SparseM_1.84-2
#> [91] fBasics_4041.97 buildtools_1.0.0 fs_1.6.5
#> [94] grid_4.4.2 tidyr_1.3.1 ape_5.8-1
#> [97] rbibutils_2.3 colorspace_2.1-1 nlme_3.1-167
#> [100] cli_3.6.4 segmented_2.1-4 viridisLite_0.4.2
#> [103] dplyr_1.1.4 gtable_0.3.6 stabledist_0.7-2
#> [106] sass_0.4.9 digest_0.6.37 ggrepel_0.9.6
#> [109] farver_2.1.2 htmlwidgets_1.6.4 memoise_2.0.1
#> [112] htmltools_0.5.8.1 lifecycle_1.0.4 httr_1.4.7
#> [115] PearsonDS_1.3.1 statmod_1.5.0 mime_0.12
#> [118] GUniFrac_1.8 MASS_7.3-65