--- title: "Introduction to CSeQTL" author: "Paul Little" date: "Last Refresh: `r Sys.Date()`" header-includes: - \usepackage{amsmath} - \usepackage{amssymb} - \usepackage{bm} output: html_document: theme: journal highlight: tango toc: true toc_depth: 3 toc_float: collapsed: true smooth_scroll: false fig_width: 5 vignette: > %\VignetteIndexEntry{test} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} bibliography: references.bib csl: apa.csl --- ```{r include = FALSE} knitr::opts_chunk$set( collapse = TRUE,comment = "#>", echo = TRUE,cache = TRUE, dev = "png") ``` \def\T{\text{T}} \newcommand{\bf}[1]{\mathbf{#1}} \newcommand{\bigPar}[1]{\left(#1\right)} \newcommand{\bigCur}[1]{\left\{#1\right\}} \newcommand{\bcSqu}[2]{\left[#1 \middle| #2\right]} \newcommand{\cE}[2]{E\bcSqu{#1}{#2}} \newcommand{\nexp}[1]{\exp\bigCur{#1}} \newcommand{\ind}[1]{1\bigCur{#1}} \newcommand{\w}[1]{\widehat{#1}} # Overview Assuming all software dependencies and **CSeQTL** [@little2023computational] installation are installed, we can begin. ```{r setup,warning = FALSE} # Load libraries req_packs = c("ggplot2","smarter","CSeQTL") for(pack in req_packs){ library(package = pack,character.only = TRUE) } # List package's exported functions ls("package:CSeQTL") # Fix seed set.seed(2) ``` # Simulation: No eQTL We will simulate a dataset with three cell types, reference allele differential expression, and no cell type-specific eQTLs. ```{r fig.dim = c(8,6)} # sample size NN = 3e2 # fold-change between q-th and 1st cell type true_KAPPA = c(1,3,1) # eQTL effect size per cell type, # fold change between B and A allele true_ETA = c(1,1,1) # cis/trans effect size true_ALPHA = c(1,1,1) # count number of cell types QQ = length(true_KAPPA) # TReC model overdispersion true_PHI = 0.1 # ASReC model overdispersion true_PSI = 0.05 # Simulate cell type proportions tRHO = gen_true_RHO(wRHO = 1,NN = NN,QQ = QQ) plot_RHO(RHO = tRHO) boxplot(tRHO,xlab = "Cell Type",ylab = "Proportion") # Simulate a data object for a gene and SNP sim = CSeQTL_dataGen(NN = NN,MAF = 0.3,true_BETA0 = log(1000), true_KAPPA = true_KAPPA,true_ETA = true_ETA,true_PHI = true_PHI, true_PSI = true_PSI,prob_phased = 0.05,true_ALPHA = true_ALPHA, RHO = tRHO,cnfSNP = TRUE,show = FALSE) names(sim) # TReC, ASReC, haplotype 2 counts sim$dat[1:3,] # Batch covariates including the intercept sim$XX[1:3,] sim$dat$SNP = sim$true_SNP sim$dat$SNP = factor(sim$dat$SNP, levels = sort(unique(sim$dat$SNP)), labels = c("AA","AB","BA","BB")) # TReC vs SNP ggplot(data = sim$dat, mapping = aes(x = SNP,y = log10(total + 1))) + geom_violin(aes(fill = SNP)) + geom_jitter(width = 0.25) + geom_boxplot(width = 0.1) + xlab("Phased Genotype") + ylab("log10(TReC + 1)") + theme(legend.position = "right", axis.title = element_text(size = 15), axis.text = element_text(size = 12)) # TReC vs ASReC ggplot(data = sim$dat, mapping = aes(x = total,y = total_phased,color = SNP)) + geom_point() + xlab("Total Read Count (TReC)") + ylab("Total Allele-specific Read Count (ASReC)") + theme(legend.position = "right", axis.title = element_text(size = 15), axis.text = element_text(size = 12)) ``` ## CSeQTL ### Bulk Test Based on the above TReC boxplot, the phased SNP genotype appears to be an eQTL. However, this has yet to account for batch effects. Let us test for a bulk eQTL accounting for batch covariates `sim$XX`. Make sure to center continuous covariates. Notice that CSeQTL, treating bulk TReC as sourced from a single cell type, corresponds to the **TReCASE** [@sun2012statistical] method. ```{r R.options = list(width = 200)} summary(sim$XX) mRHO = matrix(1,NN,1) colnames(mRHO) = "Bulk" cistrans_thres = 0.01 res = c() for(trec_only in c(TRUE,FALSE)){ for(neg_binom in c(TRUE,FALSE)){ for(beta_binom in c(TRUE,FALSE)){ cat(sprintf("%s: trec_only = %s, neg_binom = %s, beta_binom = %s ...\n", date(),trec_only,neg_binom,beta_binom)) PHASE = sim$dat$phased * ifelse(trec_only,0,1) upPHI = ifelse(neg_binom,1,0) upPSI = ifelse(beta_binom,1,0) sout = CSeQTL_smart(TREC = sim$dat$total,hap2 = sim$dat$hap2, ASREC = sim$dat$total_phased,PHASE = PHASE, SNP = sim$true_SNP,RHO = mRHO,XX = sim$XX,upPHI = upPHI, upKAPPA = 1,upETA = 1,upPSI = upPSI,upALPHA = 1, iFullModel = FALSE,trim = FALSE,thres_TRIM = 10, hypotest = TRUE,swap = FALSE,numAS = 5,numASn = 5, numAS_het = 5,cistrans_thres = cistrans_thres) # sout$HYPO res = rbind(res,smart_df(Model = ifelse(trec_only,"TReC-only","TReCASE"), TReC_Dist = ifelse(neg_binom,"Negative Binomial","Poisson"), ASReC_Dist = ifelse(beta_binom,"Beta-Binomial","Binomial"), sout$res)) rm(sout) }}} num_vars = which(unlist(lapply(res,function(xx) class(xx))) == "numeric") res[,num_vars] = apply(res[,num_vars],2,function(xx) smart_SN(x = xx,digits = 2)) res$Interpret = apply(res[,c("cistrans","PVAL_eQTL")],1,function(xx){ ct = xx[1] pval = as.numeric(xx[2]) ifelse(pval < 0.05,sprintf("%s eQTL",ct),"no eQTL") }) res$Correct_Model = apply(res[,c("TReC_Dist","ASReC_Dist")],1,function(xx){ chk_trec = (( true_PHI > 0 & xx[1] == "Negative Binomial" ) | ( true_PHI == 0 & xx[1] == "Poisson" )) chk_trec chk_asrec = (( true_PSI > 0 & xx[2] == "Beta-Binomial" ) | ( true_PSI == 0 & xx[2] == "Binomial" )) chk_trec = ifelse(chk_trec,"TReC-Yes","TReC-No") chk_asrec = ifelse(chk_asrec,"ASReC-Yes","ASReC-No") sprintf("%s;%s",chk_trec,chk_asrec) }) # Simplified output smart_rmcols(res,c("LRT_trec","LRT_trecase","LRT_cistrans","cistrans")) ``` Thus a bulk approach to eQTL testing without accounting for cell type variation leads to a false positive association. Model misspecification can also lead to inflated type 1 error. ### Cell type-specific Testing The code below adjusts for cell type proportions and tests for cell type-specific eQTLs. ```{r R.options = list(width = 200)} cistrans_thres = 0.01 res = c() for(trec_only in c(TRUE,FALSE)){ for(neg_binom in c(TRUE,FALSE)){ for(beta_binom in c(TRUE,FALSE)){ cat(sprintf("%s: trec_only = %s, neg_binom = %s, beta_binom = %s ...\n", date(),trec_only,neg_binom,beta_binom)) PHASE = sim$dat$phased * ifelse(trec_only,0,1) upPHI = ifelse(neg_binom,1,0) upPSI = ifelse(beta_binom,1,0) upKAPPA = rep(1,QQ) upETA = upKAPPA upALPHA = upETA sout = CSeQTL_smart(TREC = sim$dat$total,hap2 = sim$dat$hap2, ASREC = sim$dat$total_phased,PHASE = PHASE, SNP = sim$true_SNP,RHO = sim$true_RHO,XX = sim$XX,upPHI = upPHI, upKAPPA = upKAPPA,upETA = upETA,upPSI = upPSI,upALPHA = upALPHA, iFullModel = FALSE,trim = FALSE,thres_TRIM = 10, hypotest = TRUE,swap = FALSE,numAS = 5,numASn = 5, numAS_het = 5,cistrans_thres = cistrans_thres) # sout$HYPO res = rbind(res,smart_df(Model = ifelse(trec_only,"TReC-only","TReCASE"), TReC_Dist = ifelse(neg_binom,"Negative Binomial","Poisson"), ASReC_Dist = ifelse(beta_binom,"Beta-Binomial","Binomial"), sout$res)) rm(sout) }}} num_vars = which(unlist(lapply(res,function(xx) class(xx))) == "numeric") res[,num_vars] = apply(res[,num_vars],2,function(xx) smart_SN(x = xx,digits = 2)) res$Interpret = apply(res[,c("cistrans","PVAL_eQTL")],1,function(xx){ ct = xx[1] pval = as.numeric(xx[2]) ifelse(pval < 0.05,sprintf("%s eQTL",ct),"no eQTL") }) res$Correct_Model = apply(res[,c("TReC_Dist","ASReC_Dist")],1,function(xx){ chk_trec = (( true_PHI > 0 & xx[1] == "Negative Binomial" ) | ( true_PHI == 0 & xx[1] == "Poisson" )) chk_trec chk_asrec = (( true_PSI > 0 & xx[2] == "Beta-Binomial" ) | ( true_PSI == 0 & xx[2] == "Binomial" )) chk_trec = ifelse(chk_trec,"TReC-Yes","TReC-No") chk_asrec = ifelse(chk_asrec,"ASReC-Yes","ASReC-No") sprintf("%s;%s",chk_trec,chk_asrec) }) # Simplified output smart_rmcols(res,c("LRT_trec","LRT_trecase","LRT_cistrans","cistrans")) ``` From above, we see that modeling cell types through proportion and reference allele expression and correctly specifying the distribution of TReC and ASReC leads to no association or no cell type specific eQTLs. ## OLS We benchmark CSeQTL against OLS, an ordinary least squares model. This approach corresponds to **Matrix eQTL** [@shabalin2012matrix]. ### Bulk eQTL For a bulk eQTL, the model adjusts for genotype and confounders. The code below fits and tests for a bulk eQTL using the simulated dataset. ```{r ols_bulk} ols_out = CSeQTL_linearTest(input = sim$dat,XX = sim$XX, RHO = sim$true_RHO,SNP = sim$true_SNP,MARG = TRUE) names(ols_out) # Model estimates summary(ols_out$lm_out) # Effect sizes and hypothesis test ols_out$out_df ``` Similar to CSeQTL's bulk testing, we have a false positive bulk eQTL. ### Cell type-specific eQTLs The code below will test for cell type-specific eQTLs with OLS. ```{r ols_cts,fig.dim = c(6,7)} ols_out = CSeQTL_linearTest(input = sim$dat,XX = sim$XX, RHO = sim$true_RHO,SNP = sim$true_SNP,MARG = FALSE) names(ols_out) # Model estimates summary(ols_out$lm_out) # Effect sizes and hypothesis tests ols_out$out_df ``` # Future Sections to create * Trimming * Sample data for eQTL mapping per gene and multiple SNPs # Session Info ```{r} sessionInfo() ``` # References