---
title: "Introduction to CSeQTL"
author: "Paul Little"
date: "Last Refresh: `r Sys.Date()`"
header-includes:
  - \usepackage{amsmath}
  - \usepackage{amssymb}
  - \usepackage{bm}
output:
  html_document:
    theme: journal
    highlight: tango
    toc: true
    toc_depth: 3
    toc_float:
      collapsed: true
      smooth_scroll: false
    fig_width: 5
vignette: >
  %\VignetteIndexEntry{test}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
bibliography: references.bib
csl: apa.csl
---

```{r include = FALSE}
knitr::opts_chunk$set(
	collapse = TRUE,comment = "#>",
	echo = TRUE,cache = TRUE,
	dev = "png")
```

\def\T{\text{T}}
\newcommand{\bf}[1]{\mathbf{#1}}
\newcommand{\bigPar}[1]{\left(#1\right)}
\newcommand{\bigCur}[1]{\left\{#1\right\}}
\newcommand{\bcSqu}[2]{\left[#1 \middle| #2\right]}
\newcommand{\cE}[2]{E\bcSqu{#1}{#2}}
\newcommand{\nexp}[1]{\exp\bigCur{#1}}
\newcommand{\ind}[1]{1\bigCur{#1}}
\newcommand{\w}[1]{\widehat{#1}}

# Overview

Assuming all software dependencies and **CSeQTL** [@little2023computational] installation are installed, we can begin.

```{r setup,warning = FALSE}
# Load libraries
req_packs = c("ggplot2","smarter","CSeQTL")
for(pack in req_packs){
  library(package = pack,character.only = TRUE)
}

# List package's exported functions
ls("package:CSeQTL")

# Fix seed
set.seed(2)
```

# Simulation: No eQTL

We will simulate a dataset with three cell types, reference allele differential expression, and no cell type-specific eQTLs.

```{r fig.dim = c(8,6)}
# sample size
NN = 3e2

# fold-change between q-th and 1st cell type
true_KAPPA  = c(1,3,1)

# eQTL effect size per cell type, 
#   fold change between B and A allele
true_ETA    = c(1,1,1)

# cis/trans effect size
true_ALPHA  = c(1,1,1)

# count number of cell types
QQ = length(true_KAPPA)

# TReC model overdispersion
true_PHI = 0.1

# ASReC model overdispersion
true_PSI = 0.05

# Simulate cell type proportions
tRHO = gen_true_RHO(wRHO = 1,NN = NN,QQ = QQ)
plot_RHO(RHO = tRHO)
boxplot(tRHO,xlab = "Cell Type",ylab = "Proportion")

# Simulate a data object for a gene and SNP
sim = CSeQTL_dataGen(NN = NN,MAF = 0.3,true_BETA0 = log(1000),
  true_KAPPA = true_KAPPA,true_ETA = true_ETA,true_PHI = true_PHI,
  true_PSI = true_PSI,prob_phased = 0.05,true_ALPHA = true_ALPHA,
  RHO = tRHO,cnfSNP = TRUE,show = FALSE)

names(sim)

# TReC, ASReC, haplotype 2 counts
sim$dat[1:3,]

# Batch covariates including the intercept
sim$XX[1:3,]

sim$dat$SNP = sim$true_SNP
sim$dat$SNP = factor(sim$dat$SNP,
  levels = sort(unique(sim$dat$SNP)),
  labels = c("AA","AB","BA","BB"))

# TReC vs SNP
ggplot(data = sim$dat,
  mapping = aes(x = SNP,y = log10(total + 1))) +
  geom_violin(aes(fill = SNP)) + geom_jitter(width = 0.25) +
  geom_boxplot(width = 0.1) +
  xlab("Phased Genotype") + ylab("log10(TReC + 1)") +
  theme(legend.position = "right",
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 12))

# TReC vs ASReC
ggplot(data = sim$dat,
  mapping = aes(x = total,y = total_phased,color = SNP)) +
  geom_point() + xlab("Total Read Count (TReC)") + 
  ylab("Total Allele-specific Read Count (ASReC)") +
  theme(legend.position = "right",
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 12))
```

## CSeQTL

### Bulk Test

Based on the above TReC boxplot, the phased SNP genotype appears to be an eQTL. However, this has yet to account for batch effects. Let us test for a bulk eQTL accounting for batch covariates `sim$XX`. Make sure to center continuous covariates. Notice that CSeQTL, treating bulk TReC as sourced from a single cell type, corresponds to the **TReCASE** [@sun2012statistical] method.

```{r R.options = list(width = 200)}
summary(sim$XX)

mRHO = matrix(1,NN,1)
colnames(mRHO) = "Bulk"
cistrans_thres = 0.01
res = c()

for(trec_only in c(TRUE,FALSE)){
for(neg_binom in c(TRUE,FALSE)){
for(beta_binom in c(TRUE,FALSE)){
  
  cat(sprintf("%s: trec_only = %s, neg_binom = %s, beta_binom = %s ...\n",
    date(),trec_only,neg_binom,beta_binom))
  
  PHASE = sim$dat$phased * ifelse(trec_only,0,1)
  upPHI = ifelse(neg_binom,1,0)
  upPSI = ifelse(beta_binom,1,0)
  
  sout = CSeQTL_smart(TREC = sim$dat$total,hap2 = sim$dat$hap2,
    ASREC = sim$dat$total_phased,PHASE = PHASE,
    SNP = sim$true_SNP,RHO = mRHO,XX = sim$XX,upPHI = upPHI,
    upKAPPA = 1,upETA = 1,upPSI = upPSI,upALPHA = 1,
    iFullModel = FALSE,trim = FALSE,thres_TRIM = 10,
    hypotest = TRUE,swap = FALSE,numAS = 5,numASn = 5,
    numAS_het = 5,cistrans_thres = cistrans_thres)
  # sout$HYPO
  
  res = rbind(res,smart_df(Model = ifelse(trec_only,"TReC-only","TReCASE"),
    TReC_Dist = ifelse(neg_binom,"Negative Binomial","Poisson"),
    ASReC_Dist = ifelse(beta_binom,"Beta-Binomial","Binomial"),
    sout$res))
  rm(sout)
  
}}}

num_vars = which(unlist(lapply(res,function(xx) class(xx))) == "numeric")
res[,num_vars] = apply(res[,num_vars],2,function(xx) smart_SN(x = xx,digits = 2))
res$Interpret = apply(res[,c("cistrans","PVAL_eQTL")],1,function(xx){
  ct = xx[1]
  pval = as.numeric(xx[2])
  ifelse(pval < 0.05,sprintf("%s eQTL",ct),"no eQTL")
})
res$Correct_Model = apply(res[,c("TReC_Dist","ASReC_Dist")],1,function(xx){
  chk_trec = (( true_PHI > 0 & xx[1] == "Negative Binomial" )
    | ( true_PHI == 0 & xx[1] == "Poisson" ))
  chk_trec
  
  chk_asrec = (( true_PSI > 0 & xx[2] == "Beta-Binomial" )
    | ( true_PSI == 0 & xx[2] == "Binomial" ))
  
  chk_trec = ifelse(chk_trec,"TReC-Yes","TReC-No")
  chk_asrec = ifelse(chk_asrec,"ASReC-Yes","ASReC-No")
  sprintf("%s;%s",chk_trec,chk_asrec)
})

# Simplified output
smart_rmcols(res,c("LRT_trec","LRT_trecase","LRT_cistrans","cistrans"))
```

Thus a bulk approach to eQTL testing without accounting for cell type variation leads to a false positive association. Model misspecification can also lead to inflated type 1 error.

### Cell type-specific Testing

The code below adjusts for cell type proportions and tests for cell type-specific eQTLs.

```{r R.options = list(width = 200)}
cistrans_thres = 0.01
res = c()

for(trec_only in c(TRUE,FALSE)){
for(neg_binom in c(TRUE,FALSE)){
for(beta_binom in c(TRUE,FALSE)){
  
  cat(sprintf("%s: trec_only = %s, neg_binom = %s, beta_binom = %s ...\n",
    date(),trec_only,neg_binom,beta_binom))
  
  PHASE = sim$dat$phased * ifelse(trec_only,0,1)
  upPHI = ifelse(neg_binom,1,0)
  upPSI = ifelse(beta_binom,1,0)
  upKAPPA = rep(1,QQ)
  upETA = upKAPPA
  upALPHA = upETA
  
  sout = CSeQTL_smart(TREC = sim$dat$total,hap2 = sim$dat$hap2,
    ASREC = sim$dat$total_phased,PHASE = PHASE,
    SNP = sim$true_SNP,RHO = sim$true_RHO,XX = sim$XX,upPHI = upPHI,
    upKAPPA = upKAPPA,upETA = upETA,upPSI = upPSI,upALPHA = upALPHA,
    iFullModel = FALSE,trim = FALSE,thres_TRIM = 10,
    hypotest = TRUE,swap = FALSE,numAS = 5,numASn = 5,
    numAS_het = 5,cistrans_thres = cistrans_thres)
  # sout$HYPO
  
  res = rbind(res,smart_df(Model = ifelse(trec_only,"TReC-only","TReCASE"),
    TReC_Dist = ifelse(neg_binom,"Negative Binomial","Poisson"),
    ASReC_Dist = ifelse(beta_binom,"Beta-Binomial","Binomial"),
    sout$res))
  rm(sout)
  
}}}

num_vars = which(unlist(lapply(res,function(xx) class(xx))) == "numeric")
res[,num_vars] = apply(res[,num_vars],2,function(xx) smart_SN(x = xx,digits = 2))
res$Interpret = apply(res[,c("cistrans","PVAL_eQTL")],1,function(xx){
  ct = xx[1]
  pval = as.numeric(xx[2])
  ifelse(pval < 0.05,sprintf("%s eQTL",ct),"no eQTL")
})
res$Correct_Model = apply(res[,c("TReC_Dist","ASReC_Dist")],1,function(xx){
  chk_trec = (( true_PHI > 0 & xx[1] == "Negative Binomial" )
    | ( true_PHI == 0 & xx[1] == "Poisson" ))
  chk_trec
  
  chk_asrec = (( true_PSI > 0 & xx[2] == "Beta-Binomial" )
    | ( true_PSI == 0 & xx[2] == "Binomial" ))
  
  chk_trec = ifelse(chk_trec,"TReC-Yes","TReC-No")
  chk_asrec = ifelse(chk_asrec,"ASReC-Yes","ASReC-No")
  sprintf("%s;%s",chk_trec,chk_asrec)
})

# Simplified output
smart_rmcols(res,c("LRT_trec","LRT_trecase","LRT_cistrans","cistrans"))

```

From above, we see that modeling cell types through proportion and reference allele expression and correctly specifying the distribution of TReC and ASReC leads to no association or no cell type specific eQTLs.


## OLS

We benchmark CSeQTL against OLS, an ordinary least squares model. This approach corresponds to **Matrix eQTL** [@shabalin2012matrix].

### Bulk eQTL

For a bulk eQTL, the model adjusts for genotype and confounders. The code below fits and tests for a bulk eQTL using the simulated dataset.

```{r ols_bulk}
ols_out = CSeQTL_linearTest(input = sim$dat,XX = sim$XX,
  RHO = sim$true_RHO,SNP = sim$true_SNP,MARG = TRUE)
names(ols_out)

# Model estimates
summary(ols_out$lm_out)

# Effect sizes and hypothesis test
ols_out$out_df
```

Similar to CSeQTL's bulk testing, we have a false positive bulk eQTL. 

### Cell type-specific eQTLs

The code below will test for cell type-specific eQTLs with OLS.

```{r ols_cts,fig.dim = c(6,7)}
ols_out = CSeQTL_linearTest(input = sim$dat,XX = sim$XX,
  RHO = sim$true_RHO,SNP = sim$true_SNP,MARG = FALSE)
names(ols_out)

# Model estimates
summary(ols_out$lm_out)

# Effect sizes and hypothesis tests
ols_out$out_df
```




# Future Sections to create

* Trimming
* Sample data for eQTL mapping per gene and multiple SNPs

# Session Info

```{r}
sessionInfo()
```

# References