We have implemented a highly optimized version of the iq pipeline (Pham et al., Bioinformatics 2020). To run the following examples, download DIA-report-long-format.zip and unzip the file to a local working directory.
The unzipped file DIA-report-long-format.txt
is a
tab-deliminated text file exported from a Spectronaut search using this
export schema iq.rs.
The user might want to import this schema to their Spectronaut
installation for the ease of using the iq pipeline.
First, we apply the standard pipeline implemented in pure R. Read and filter the data
library("iq") # if not already installed, run install.packages("iq")
raw <- read.delim("DIA-Report-long-format.txt")
selected <- raw$F.ExcludedFromQuantification == "False" &
!is.na(raw$PG.Qvalue) & (raw$PG.Qvalue < 0.01) &
!is.na(raw$EG.Qvalue) & (raw$EG.Qvalue < 0.01)
raw <- raw[selected,]
Normalize the data, create a protein list, and perform the MaxLFQ algorithm
sample_id <- "R.FileName"
secondary_id <- c("EG.Library", "FG.Id", "FG.Charge", "F.FrgIon", "F.Charge", "F.FrgLossType")
norm_data <- iq::preprocess(raw,
sample_id = sample_id,
secondary_id = secondary_id)
protein_list <- iq::create_protein_list(norm_data)
result <- iq::create_protein_table(protein_list)
Extract annotation columns and write the result to an output file
annotation_columns <- c("PG.Genes", "PG.ProteinNames")
extra_names <- iq::extract_annotation(rownames(result$estimate),
raw,
annotation_columns = annotation_columns)
write.table(cbind(Protein = rownames(result$estimate),
extra_names[, annotation_columns],
MaxLFQ_annotation = result$annotation,
result$estimate),
"iq-MaxLFQ.txt", sep = "\t", row.names = FALSE)
The resulting file iq-MaxLFQ.txt
is the protein level
quantification report in a tab-deliminated text format.
The function iq::fast_MaxLFQ
implemented in C++ combines
the functionalities of iq::create_protein_list
and
iq::create_protein_table
.
#--------------------- Replacing ---------------------
# protein_list <- iq::create_protein_list(norm_data) #
# result <- iq::create_protein_table(protein_list) #
#-----------------------------------------------------
result_faster <- iq::fast_MaxLFQ(norm_data)
The results of the R implementation result
and C++
implementation result_faster
should be equal up to the
floating-point precision of the underlying numerical libraries.
cat("Max difference =", max(abs(result_faster$estimate - result$estimate), na.rm = TRUE), "\n")
cat("Identical NAs =", identical(is.na(result_faster$estimate), is.na(result$estimate)), "\n")
cat("Equal annotation =", identical(result_faster$annotation, result$annotation), "\n")
We can check the improvement in execution time. The following result is obtained on a computer with 12 CPU cores.
We have implemented a fast data loading algorithm and an efficient data structure. The memory usage is highly optimized to enable the processing of very large datasets.
sample_id <- "R.FileName"
secondary_id <- c("EG.Library", "FG.Id", "FG.Charge", "F.FrgIon", "F.Charge", "F.FrgLossType")
annotation_columns <- c("PG.Genes", "PG.ProteinNames")
iq_dat <- iq::fast_read("DIA-report-long-format.txt",
sample_id = sample_id,
secondary_id = secondary_id,
filter_string_equal = c("F.ExcludedFromQuantification" = "False"),
annotation_col = annotation_columns)
iq_norm_data <- iq::fast_preprocess(iq_dat$quant_table)
result_fastest <- iq::fast_MaxLFQ(iq_norm_data,
row_names = iq_dat$protein[, 1],
col_names = iq_dat$sample)
The result of the optimized pipeline result_fastest
should be the same as that of the standard pipeline
result
.
cat("Max difference =", max(abs(result_fastest$estimate - result$estimate), na.rm = TRUE), "\n")
cat("Identical NAs =", identical(is.na(result_fastest$estimate), is.na(result$estimate)), "\n")
cat("Equal annotation =", identical(result_fastest$annotation, result$annotation), "\n")
The annotation columns are stored in the protein
component of the input data structure iq_dat
. We can
extract the annotation columns and write the result to an output text
file.
iq_extra_names <- iq::extract_annotation(rownames(result_fastest$estimate),
iq_dat$protein,
annotation_columns = annotation_columns)
write.table(cbind(Protein = rownames(result_fastest$estimate),
iq_extra_names[, annotation_columns],
MaxLFQ_annotation = result_fastest$annotation,
result_fastest$estimate),
"iq-MaxLFQ-fast.txt", sep = "\t", row.names = FALSE)
sample_id <- "R.FileName"
secondary_id <- c("EG.Library", "FG.Id", "FG.Charge", "F.FrgIon", "F.Charge", "F.FrgLossType")
annotation_columns <- c("PG.Genes", "PG.ProteinNames")
system.time({
# reading data
raw <- read.delim("DIA-report-long-format.txt")
# filtering
selected <- raw$F.ExcludedFromQuantification == "False" &
!is.na(raw$PG.Qvalue) & raw$PG.Qvalue < 0.01 &
!is.na(raw$EG.Qvalue) & raw$EG.Qvalue < 0.01
raw <- raw[selected,]
## process
norm_data <- iq::preprocess(raw,
sample_id = sample_id,
secondary_id = secondary_id)
protein_list <- iq::create_protein_list(norm_data)
result <- iq::create_protein_table(protein_list)
})
system.time({
iq_dat <- iq::fast_read("DIA-report-long-format.txt",
sample_id = sample_id,
secondary_id = secondary_id,
filter_string_equal = c("F.ExcludedFromQuantification" = "False"),
annotation_col = annotation_columns)
iq_norm_data <- iq::fast_preprocess(iq_dat$quant_table)
result_fastest <- iq::fast_MaxLFQ(iq_norm_data,
row_names = iq_dat$protein[, 1],
col_names = iq_dat$sample)
})