---
title: "glds"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{glds}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Chunk defaults for the whole vignette: printed output lines are prefixed
# with "#>" and each chunk's source and output are collapsed into one block.
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

# Resolve an example data file to the first location where it exists.
#
# `...` holds the path components, combined with file.path(). Locations are
# tried in this order: the path as given, vignettes/, inst/extdata/ relative to
# the working directory, inst/extdata/ under $PWD, and finally the extdata/ and
# doc/ directories of the installed oncoPredict package.
#
# Returns the first existing path; raises an error when no location has it.
vignette_file <- function(...) {
  search_paths <- c(
    file.path(...),
    file.path("vignettes", ...),
    file.path("inst", "extdata", ...),
    file.path(Sys.getenv("PWD"), "inst", "extdata", ...),
    system.file("extdata", ..., package = "oncoPredict"),
    system.file("doc", ..., package = "oncoPredict")
  )
  # system.file() yields "" when nothing is found, so rule out empty strings
  # as well as paths that do not exist on disk.
  found <- nzchar(search_paths) & file.exists(search_paths)
  first_hit <- match(TRUE, found)
  if (is.na(first_hit)) {
    stop("Could not find vignette file: ", file.path(...), call. = FALSE)
  }
  search_paths[[first_hit]]
}
```

```{r setup}
library(oncoPredict)

#This vignette demonstrates how to control for general levels of drug sensitivity
#(GLDS) in pre-clinical biomarker discovery. The example applies glds() to GDSC2
#data to obtain p-values and beta values for drug-marker associations.

#Set parameters of completeMatrix().
#_____________________________________________________________________
#nPerms=50

#trainingPtype = readRDS(file = "GDSC2_Res.rds")
#There are some NA values, which will cause prcomp() to fail when applying GLDS.
#senMat=trainingPtype

#Apply completeMatrix()
#_____________________________________________________________________
#This function returns the completed matrix. Set folder=TRUE to also write complete_matrix_output.txt.

#completeMatrix(trainingPtype)

#Apply the glds() function.
#_____________________________________________________________________

#Set parameters...

#drugMat is a matrix of drug sensitivity data. rownames() are pre-clinical samples, and colnames() are drug names.
#The sensitivity data used here is GDSCv2.

#Read GDSC's updated cell line information file (used later).
#cellLineDetails<-read_excel('Cell_Lines_Details.xlsx')
# Cell line details table; used below to translate COSMIC ids into names.
cellLineDetails <- read.csv(file = vignette_file("Cell_Lines_Details.csv"))

# The response data were already processed with completeMatrix(), so no NA
# values remain; NA values in the response matrix cause prcomp() to fail.
cm <- read.table(
  file = vignette_file("complete_matrix_output_GDSCv2.txt"),
  header = TRUE,
  row.names = 1
)

#COSMIC identifiers are used for cell names in this dataset and are converted
#to cell-line names before matching with the marker matrix.

#Replace the rownames of cm with cell line names. Right now, they are COSMIC ids.
#This requires GDSC's cell line details file (which maps COSMIC ids to cell line names).
# Keep everything from the 8th character of each row name onward, i.e. drop a
# 7-character prefix (presumably "COSMIC_" -- the offset is hard-coded) so only
# the numeric id is left.
newRows <- substring(rownames(cm), 8)
# Look each id up in column 2 of the cell line details file; column 1 of the
# matching row supplies the cell-line name.
indices <- match(as.numeric(newRows), as.vector(unlist(cellLineDetails[, 2])))
# Ids that are absent from the details file give NA here. make.names() would
# turn those into the row name "NA." (or trigger a duplicate row.names error
# when there are several), so drop the affected rows before renaming.
unmatched <- is.na(indices)
if (any(unmatched)) {
  warning(
    sum(unmatched),
    " sample(s) in cm have no entry in cellLineDetails and were dropped.",
    call. = FALSE
  )
  cm <- cm[!unmatched, , drop = FALSE]
  newRows <- newRows[!unmatched]
  indices <- indices[!unmatched]
}
# Cell-line names corresponding to the remaining rows of cm.
newNames <- as.vector(unlist(cellLineDetails[, 1]))[indices]
# Match the sanitized cell-line names used in the example marker matrix.
rownames(cm) <- make.names(newNames)

#Update the drug names in cm by removing extra identifiers appended to the names.
#gdscv2_drugs.xlsx contains the colnames of cm in the correct order with those identifiers removed.
#fix<-read_excel('gdscv2_drugs.xlsx')
#fix<-as.vector(unlist(fix[,2]))
# Drug names for the columns of cm, flattened into a plain vector.
fix <- as.vector(unlist(
  read.table(vignette_file("gdscv2_drugs.txt"), header = TRUE)
))
# fix is already a plain vector, so it is assigned as-is.
colnames(cm) <- fix
# Finally, set this object as the drugMat parameter.
drugMat <- as.matrix(cm)
#dim(drugMat) #100 samples vs. 198 drugs in this reduced example file.

#markerMat contains the data to test for association with drug sensitivity (e.g. a matrix of somatic mutation data). rownames() are
#marker names (e.g. gene names), and colnames() are samples.
#The dataset used here is GDSCv2's updated pan-cancer mutation data. It includes both CNVs and coding variants.
#mutationMat<-read.csv('GDSC2_Pan_Both.csv')
#mutationMat<-mutationMat[,c(1,6,7)] #Index to these 3 columns of interest.
#colnames(mutationMat) #"cell_line_name"  "genetic_feature" "is_mutated" 
#Some entries are duplicated cell line name - genetic feature combos...remove them to avoid problems with pivot_wider().
#vec<-c()
#for (i in 1:nrow(mutationMat)){
#  vec[i]<-paste(mutationMat[i,1],mutationMat[i,2], sep=' ')
#}
#nonDupIndices<-match(unique(vec), vec)
#mutationMat2<-mutationMat[nonDupIndices,]

#Some gene mutation entries are blank...remove them to avoid problems with pivot_wider().
#library(tidyverse)
#good<-(mutationMat2[,2]) != ""
#mutationMat3<-mutationMat2[good,]
#mutationMat4<-mutationMat3 %>%
#  pivot_wider(names_from=genetic_feature,
#              values_from=is_mutated)
#rownames(mutationMat4)<-as.vector(unlist(mutationMat4[,1])) #Use cell lines as rownames before transposing.
#cols<-rownames(mutationMat4)
#mutationMat4<-as.matrix(t(mutationMat4[,-1]))
#Make sure the matrix is numeric.
#mutationMat<-mutationMat4
#mutationMat4<-apply(mutationMat4, 2, as.numeric)
#rownames(mutationMat4)<-rownames(mutationMat)
#markerMat<-mutationMat4
# replace all non-finite values with 0
#markerMat[!is.finite(markerMat)] <- 0
#colnames(markerMat)<-cols
#write.table(markerMat, file='markerMat.txt')
#The included example markerMat is reduced to the top 200 markers among the samples used here.
markerMat <- as.matrix(
  read.table(
    vignette_file("markerMat.txt"),
    header = TRUE,
    row.names = 1,
    check.names = FALSE
  )
)
#dim(markerMat) #200 markers vs. 40 samples in this reduced example file.

# Samples are the columns of markerMat and the rows of drugMat; stop early
# when the two share no sample at all.
if (!any(colnames(markerMat) %in% rownames(drugMat))) {
  stop("No overlapping samples were found between markerMat and drugMat.")
}

#drugRelatedness contains drug names and the corresponding target pathways.
#This file is GDSC's updated drug relatedness file (obtained from bulk data download/all compounds screened/compounds-annotation).
#Some drug names in this file were adjusted so they match colnames of cm.
#Ex: replace - with . (small modifications like that).
# Load the compound annotation file and keep only columns 3 and 6.
drugRelatedness <- read.csv(
  vignette_file("screened_compunds_rel_8.2.csv")
)[, c(3, 6)]
#colnames(drugRelatedness) #"DRUG_NAME"      "TARGET_PATHWAY"

# Run glds() on the inputs prepared above and keep the result in glds_results.
glds_results <- glds(
  drugMat, drugRelatedness, markerMat,
  minMuts = 5,
  additionalCovariateMatrix = NULL,
  threshold = 0.7
)
```