--- title: "calcPhenotype" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{calcPhenotype} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) vignette_file <- function(...) { candidates <- c( file.path(...), file.path("vignettes", ...), file.path("inst", "extdata", ...), file.path(Sys.getenv("PWD"), "inst", "extdata", ...), system.file("extdata", ..., package = "oncoPredict"), system.file("doc", ..., package = "oncoPredict") ) candidates <- candidates[nzchar(candidates) & file.exists(candidates)] if (!length(candidates)) { stop("Could not find vignette file: ", file.path(...), call. = FALSE) } candidates[[1]] } ``` ```{r setup} library(oncoPredict) #This vignette demonstrates how to use calcPhenotype() for drug response prediction and how to set commonly used optional parameters. #Set the seed for reproducibility. set.seed(12345) #Determine parameters for calcPhenotype() function. #Read training data for GDSC (expression and response). #_______________________________________________________ #GDSC1 Data #Read GDSC training expression data. rownames() are genes and colnames() are samples (cell lines/cosmic ids). #trainingExprData=(readRDS("GDSC1_Expr.rds")) #dim(trainingExprData) #17419 958 #Read GDSC1 response data. rownames() are samples (cell lines, cosmic ids), colnames() are drugs. #trainingPtype = (readRDS("GDSC1_Res.rds")) #dim(trainingPtype) #958 367 For GDSC1 #trainingPtype<-trainingPtype[,1:2] #Just 2 drugs for the vignette. #GDSC2 Data #Read GDSC training expression data. rownames() are genes and colnames() are samples. #trainingExprData=readRDS(file='GDSC2_Expr.rds') #dim(trainingExprData) #17419 805 #Read GDSC2 response data. rownames() are samples, colnames() are drugs. trainingPtype = readRDS(file = vignette_file("GDSC2_Res.rds")) #dim(trainingPtype) #805 198 #GDSC2 expression data for this vignette. This is a reduced example dataset. trainingExprData=readRDS(file = vignette_file("GDSC2_Expr_short.rds")) #dim(trainingExprData) #500 200 #The GDSC IC50 values are already log-transformed. Convert them back before using #the default power transformation in calcPhenotype(). trainingPtype<-exp(trainingPtype) #Or read training data for CTRP (expression and response) #_______________________________________________________ #Read CTRP training expression data. rownames() are genes and colnames() are samples (cell lines/cosmic ids). #trainingExprData = readRDS(file = "CTRP2_Expr.rds") #dim(trainingExprData) #51847 829 #Read CTRP training response data. rownames() are samples (cell lines, cosmic ids), colnames() are drugs. #trainingPtype = readRDS(file = "CTRP2_Res.rds") #dim(trainingPtype) #829 545 #Test data. #_______________________________________________________ #Read testing data as a matrix with rownames() as genes and colnames() as samples. testExprData=as.matrix(read.table(vignette_file("prostate_test_data.txt"), header=TRUE, row.names=1)) #dim(testExprData) #1000 20 #Additional parameters. #_______________________________________________________ #batchCorrect options: "eb" for ComBat, "qn" for quantile normalization, "standardize" for z-score standardization, "rank", "rank_then_eb", or "none" #"eb" is often used when the training and testing datasets are both microarray data. #"standardize" can be useful when training with microarray data and predicting in RNA-seq data. batchCorrect<-"eb" #Determine whether or not to power transform the phenotype data. #Default is TRUE. powerTransformPhenotype<-TRUE #Determine percentage of low varying genes to remove. #Default is 0.2. This filter reduces the influence of low-variance genes and can #also reduce runtime for large RNA-seq matrices. removeLowVaryingGenes<-0.2 #Determine method to remove low varying genes. #Options are 'homogenizeData' and 'rawData' #Use 'homogenizeData' to filter after the training and test matrices have been #aligned and batch-corrected. removeLowVaringGenesFrom<-"homogenizeData" #Determine the minimum number of training samples required to train on. #This example uses reduced data, so the threshold is set low enough for the #vignette. Larger analyses should use enough samples to fit a reliable model. minNumSamples=10 #Determine how you would like to deal with duplicate gene IDs. #Depending on preprocessing, duplicate gene identifiers may or may not be present. #Options are -1 for ask user, 1 for summarize by mean, and 2 for disregard duplicates selection<- 1 #Determine if you'd like to print outputs. Set to FALSE here to keep the vignette output concise. printOutput=FALSE #Indicate whether or not you'd like to use principal component regression for feature/gene reduction. Options are 'TRUE' and 'FALSE'. #Note: If you indicate 'report_pc=TRUE' you need to also indicate 'pcr=TRUE' pcr=FALSE #Indicate whether you want to output the principal components. Options are 'TRUE' and 'FALSE'. report_pc=FALSE #Indicate if you want correlation coefficients for biomarker discovery. These are the correlations between a given gene of interest across all samples vs. a given drug response across samples. #These correlations can be ranked to obtain a ranked correlation to determine highly correlated drug-gene associations. cc=FALSE #Indicate whether or not you want to output the R^2 values for the data you train on from true and predicted values. #These values represent the percentage in which the optimal model accounts for the variance in the training data. #Options are 'TRUE' and 'FALSE'. rsq=FALSE #Indicate percent variability (of the training data) you'd like principal components to reflect if pcr=TRUE. Default is 80. percent=80 #Run calcPhenotype() using the parameters specified above. #__________________________________________________________________________________________________________________________________ drug_predictions <- calcPhenotype(trainingExprData=trainingExprData, trainingPtype=trainingPtype, testExprData=testExprData, batchCorrect=batchCorrect, powerTransformPhenotype=powerTransformPhenotype, removeLowVaryingGenes=removeLowVaryingGenes, minNumSamples=minNumSamples, selection=selection, printOutput=printOutput, pcr=pcr, removeLowVaringGenesFrom=removeLowVaringGenesFrom, report_pc=report_pc, cc=cc, percent=percent, rsq=rsq) #The returned matrix contains predicted drug response values for each test sample. #dim(drug_predictions) #Visualize the predicted drug response matrix. drug_predictions_scaled <- t(scale(drug_predictions)) drug_predictions_scaled[!is.finite(drug_predictions_scaled)] <- 0 heatmap(drug_predictions_scaled, Rowv=NA, Colv=NA, scale="none", labRow=NA, cexCol=0.7, margins=c(6, 2), xlab="Test samples", ylab="Drugs", main="Predicted drug response") ```