--- title: "Negative Binomial factor regression with application to microbiome data analysis" output: rmarkdown::pdf_document author: "Aditya Mishra, Christian L. Müller" date: "`r Sys.Date()`" vignette: > %\VignetteIndexEntry{Negative Binomial factor regression with application to microbiome data analysis} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup} library(nbfar) ``` # Simulation examples We showcase the usage of **nbfar** and **nbrrr** on simulated data. We simulate response and covariates matrix for multivariate negative binomial regression with a low-rank and sparse coefficient matrix. The coefficient matrix **C** is expressed in terms of **U** (left singular vector), **D** (singular values) and **V** (right singular vector). ### Data simulation ```{r} ## ## -----------------------Simulation settings -------------------- ## Simulation setting: ## n: sample size ## p: number of predictors ## pz: number of control variable other than intercept parameter ## q: number of outcome variables ## nrank: true rank of the model ## snr: signal to noise ratio to be used in generating gaussian outcome variables ## nlam: number of lambda values to be fitted ## rank.est: maximum estimated rank to be specified to the model ## s: multiplicative factor of singular values ## nthread: number of parallel thread can be used in parallel for cross validation ## The simulation was replicated 100 times under each setting as detailed in the paper. # ## Model specification: p <- 50; example_seed <- 123 xp = 30 set.seed(example_seed) n <- 200 nrank <- 3 # true rank rank.est <- 5 # estimated rank nlam <- 40 # number of tuning parameter s = 0.5; q <- 30 sp = xp/p nthread = 1 ## -------------- Generate low-rank and sparse coefficient matrix --- ## D: singular values ## U: sparse left singular vectors ## V: sparse right singular vectors D <- rep(0, nrank) V <- matrix(0, ncol = nrank, nrow = q) U <- matrix(0, ncol = nrank, nrow = p) # U[, 1] <- c(sample(c(1, -1), 8, replace = TRUE), rep(0, p - 8)) U[, 2] <- c(rep(0, 5), sample(c(1, -1), 9, replace = TRUE), rep(0, p - 14)) U[, 3] <- c(rep(0, 11), sample(c(1, -1), 9, replace = TRUE), rep(0, p - 20)) # # for similar type response type setting V[, 1] <- c(rep(0, 8), sample(c(1, -1), 8, replace = TRUE) * runif(8, 0.3, 1), rep(0, q - 16)) V[, 2] <- c(rep(0, 20), sample(c(1, -1), 8, replace = TRUE) * runif(8, 0.3, 1), rep(0, q - 28)) V[, 3] <- c( sample(c(1, -1), 5, replace = TRUE) * runif(5, 0.3, 1), rep(0, 23), sample(c(1, -1), 2, replace = TRUE) * runif(2, 0.3, 1), rep(0, q - 30)) U[, 1:3] <- apply(U[, 1:3], 2, function(x) x / sqrt(sum(x^2))) V[, 1:3] <- apply(V[, 1:3], 2, function(x) x / sqrt(sum(x^2))) # D <- s * c(4, 6, 5) # signal strength varries as per the value of s or <- order(D, decreasing = T) U <- U[, or] V <- V[, or] D <- D[or] C <- U %*% (D * t(V)) # simulated coefficient matrix intercept <- rep(0.5, q) # specifying intercept to the model: C0 <- rbind(intercept, C) ## ----- Simulate data ----- Xsigma <- 0.5^abs(outer(1:p, 1:p, FUN = "-")) sim.sample <- nbfar_sim(U, D, V, n, Xsigma, C0,disp = 0.75, depth = 10) # Simulated sample X <- sim.sample$X[1:n, ] # simulated predictors (training) Y <- sim.sample$Y[1:n, ] # simulated responses (training) # 1000 test sample data sim.sample <- nbfar_sim(U, D, V, 1000, Xsigma, C0, disp = 0.75, depth = 10) Xt <- sim.sample$X # simulated predictors (test) Yt <- sim.sample$Y # simulated predictors (test) X0 <- cbind(1, X) # 1st column accounting for intercept # Simulate data with 20% missing entries miss <- 0.10 # Proportion of entries missing t.ind <- sample.int(n * q, size = miss * n * q) y <- as.vector(Y) y[t.ind] <- NA Ym <- matrix(y, n, q) # 20% of entries are missing at random ``` ### Negative Binomial reduced rank regression: **nbrrr** In the range of 1 to maxrank, the estimation procedure selects the rank r of the coefficient matrix using a cross-validation approach. For the selected rank, a rank r coefficient matrix is estimated that best fits the observations. ```{r} # Model fit: (full data) set.seed(example_seed) control_r3 <- nbfar_control(initmaxit = 10000, initepsilon = 1e-5, objI = 1) # nbrrr_test <- nbrrr(Y, X, maxrank = 5, control = control_r3, nfold = 5, trace = F) # Model fit: (missing data) set.seed(example_seed) control_r3 <- nbfar_control(initmaxit = 10000, initepsilon = 1e-5, objI = 1) # nbrrr_testm <- nbrrr(Ym, X, maxrank = 5, control = control_r3, nfold = 5,trace = F) ``` ### Negative Binomial co-sparse factor regression: **nbfar** To estimate a low-rank and sparse coefficient matrix in large/high dimensional setting, the approach extracts unit-rank components of required matrix in sequential order. The algorithm automatically stops after extracting sufficient unit rank components. ```{r} # Model fit: (full data) RcppParallel::setThreadOptions(numThreads = nthread) set.seed(example_seed) control_nbfar <- nbfar_control(gamma0 = 1, spU = sp, spV = 20/q, maxit = 2000, lamMaxFac = 1e-2, lamMinFac = 1e-7, epsilon = 1e-4, objI = 0, initmaxit = 10000, initepsilon = 1e-7) # nbfar_test <- nbfar(Y, X, maxrank = rank.est, nlambda = nlam, # cIndex = NULL, # ofset = NULL, control = control_nbfar, nfold = 5, # PATH = FALSE, nthread = nthread,trace = F) # Model fit: (missing data) RcppParallel::setThreadOptions(numThreads = nthread) set.seed(example_seed) control_nbfar <- nbfar_control(gamma0 = 1, spU = sp, spV = 20/q, maxit = 2000, lamMaxFac = 1e-2, lamMinFac = 1e-7, epsilon = 1e-4, objI = 0, initmaxit = 10000, initepsilon = 1e-7) # nbfar_testm <- nbfar(Ym, X, maxrank = rank.est, nlambda = nlam, # cIndex = NULL, # ofset = NULL, control = control_nbfar, nfold = 5, # PATH = FALSE, nthread = nthread,trace = F) ```