Title: | Efficiently Read Sequence Data (VCF Format, BCF Format, METAL Format and BGEN Format) into R |
---|---|
Description: | Integrate sequencing data (Variant call format, e.g. VCF or BCF) or meta-analysis results in R. This package can help you (1) read VCF/BCF/BGEN files by chromosomal ranges (e.g. 1:100-200); (2) read RareMETAL summary statistics files; (3) read tables from a tabix-indexed files; (4) annotate VCF/BCF files; (5) create customized workflow based on Makefile. |
Authors: | Xiaowei Zhan [aut, cre], Dajiang Liu [aut], Attractive Chaos [cph] (We have used the following software and made minimal necessary changes: Tabix, Heng Li <[email protected]> (MIT license). We removed standard IO related functions, e.g. printf, fprintf ; also changed its un-safe pointer arithmetics.), Broad Institute / Massachusetts Institute of Technology [cph], Genome Research Ltd (GRL) [cph], Facebook, Inc [cph], D. Richard Hipp [cph] |
Maintainer: | Xiaowei Zhan <[email protected]> |
License: | GPL | file LICENSE |
Version: | 9.7 |
Built: | 2024-11-02 06:43:03 UTC |
Source: | CRAN |
Read from binary PLINK file and return a genotype matrix
## S3 method for class 'PlinkFile' plinkFileObject[sampleIndex, markerIndex]
## S3 method for class 'PlinkFile' plinkFileObject[sampleIndex, markerIndex]
plinkFileObject |
a PlinkFileObject obtained by openPlink() |
sampleIndex |
integer, 1-basd, index of samples to be extracted |
markerIndex |
integer, 1-basd, index of markers to be extracted |
genotype matrix, marker by sample
http://zhanxw.com/seqminer/ for online manual and examples
## these indice are nonsynonymous markers for 1:196621007-196716634", ## refer to the readVCFToMatrixByRange() fileName = system.file("plink/all.anno.filtered.extract.bed", package = "seqminer") filePrefix = sub(fileName, pattern = ".bed", replacement = "") plinkObj = openPlink(filePrefix) sampleIndex = seq(3) markerIndex =c(14, 36) cfh <- plinkObj[sampleIndex, markerIndex]
## these indice are nonsynonymous markers for 1:196621007-196716634", ## refer to the readVCFToMatrixByRange() fileName = system.file("plink/all.anno.filtered.extract.bed", package = "seqminer") filePrefix = sub(fileName, pattern = ".bed", replacement = "") plinkObj = openPlink(filePrefix) sampleIndex = seq(3) markerIndex =c(14, 36) cfh <- plinkObj[sampleIndex, markerIndex]
Add a job to a workflow
addJob(wf, job)
addJob(wf, job)
wf |
a variable of workflow class |
job |
a variable of job class |
j1 <- newJob('id1', 'cmd out1', 'out1') j2 <- newJob('id2', 'cmd out2', 'out2', depend = 'out1') w <- newWorkflow("wf") w <- addJob(w, j1) w <- addJob(w, j2) outFile <- file.path(tempdir(), "Makefile") writeWorkflow(w, outFile) cat('Outputted Makefile file are in the temp directory:', outFile, '\n')
j1 <- newJob('id1', 'cmd out1', 'out1') j2 <- newJob('id2', 'cmd out2', 'out2', depend = 'out1') w <- newWorkflow("wf") w <- addJob(w, j1) w <- addJob(w, j2) outFile <- file.path(tempdir(), "Makefile") writeWorkflow(w, outFile) cat('Outputted Makefile file are in the temp directory:', outFile, '\n')
Annotate a test variant
annotateGene(param, chrom, position, ref, alt)
annotateGene(param, chrom, position, ref, alt)
param |
a list of annotation configuaration (e.g. reference file, gene definition) |
chrom |
a vector of chromosome names |
position |
a vector of chromosome positions |
ref |
a vector of reference alleles |
alt |
a vector of alternative alleles |
annotated results in a data frame structure
makeAnnotationParameter
if (.Platform$endian == "little") { param <- list(reference = system.file("tabanno/test.fa", package = "seqminer"), geneFile = system.file("tabanno/test.gene.txt", package = "seqminer")) param <- makeAnnotationParameter(param) print(param) annotateGene(param, c("1", "1"), c(3, 5) , c("A", "C"), c("G", "C")) } else { message("Tabix does not work well for big endian for now") }
if (.Platform$endian == "little") { param <- list(reference = system.file("tabanno/test.fa", package = "seqminer"), geneFile = system.file("tabanno/test.gene.txt", package = "seqminer")) param <- makeAnnotationParameter(param) print(param) annotateGene(param, c("1", "1"), c(3, 5) , c("A", "C"), c("G", "C")) } else { message("Tabix does not work well for big endian for now") }
Annotate a plain text file
annotatePlain(inFile, outFile, params)
annotatePlain(inFile, outFile, params)
inFile |
input file name |
outFile |
output file name |
params |
parameters |
0 if succeed
param <- list(reference = system.file("tabanno/test.fa", package = "seqminer"), geneFile = system.file("tabanno/test.gene.txt", package = "seqminer"), inputFormat = "plain") param <- makeAnnotationParameter(param) inFile <- system.file("tabanno/input.test.plain.txt", package = "seqminer") outFile <- file.path(tempdir(), "out.annotated.txt") annotatePlain(inFile, outFile, param) cat('Outputted annotation results are in the temp directory:', outFile, '\n')
param <- list(reference = system.file("tabanno/test.fa", package = "seqminer"), geneFile = system.file("tabanno/test.gene.txt", package = "seqminer"), inputFormat = "plain") param <- makeAnnotationParameter(param) inFile <- system.file("tabanno/input.test.plain.txt", package = "seqminer") outFile <- file.path(tempdir(), "out.annotated.txt") annotatePlain(inFile, outFile, param) cat('Outputted annotation results are in the temp directory:', outFile, '\n')
Annotate a VCF file
annotateVcf(inVcf, outVcf, params)
annotateVcf(inVcf, outVcf, params)
inVcf |
input VCF file name |
outVcf |
output VCF file name |
params |
parameters |
0 if succeed
param <- list(reference = system.file("tabanno/test.fa", package = "seqminer"), geneFile = system.file("tabanno/test.gene.txt", package = "seqminer")) param <- makeAnnotationParameter(param) inVcf <- system.file("tabanno/input.test.vcf", package = "seqminer") outVcf <- file.path(tempdir(), "/", "out.vcf") annotateVcf (inVcf, outVcf, param) cat('Annotated VCF files are in the temp directory:', outVcf, '\n')
param <- list(reference = system.file("tabanno/test.fa", package = "seqminer"), geneFile = system.file("tabanno/test.gene.txt", package = "seqminer")) param <- makeAnnotationParameter(param) inVcf <- system.file("tabanno/input.test.vcf", package = "seqminer") outVcf <- file.path(tempdir(), "/", "out.vcf") annotateVcf (inVcf, outVcf, param) cat('Annotated VCF files are in the temp directory:', outVcf, '\n')
Create a single chromosome index
createSingleChromosomeBCFIndex(fileName, indexFileName = NULL)
createSingleChromosomeBCFIndex(fileName, indexFileName = NULL)
fileName |
character, represents an input BCF file (Bgzipped, with Tabix index) |
indexFileName |
character, by default, create 'fileName'.scIdx |
indexFileName if success, or NULL is failed
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("vcf/all.anno.filtered.extract.headerFixed.bcf.gz", package = "seqminer") cfh <- createSingleChromosomeBCFIndex(fileName)
fileName = system.file("vcf/all.anno.filtered.extract.headerFixed.bcf.gz", package = "seqminer") cfh <- createSingleChromosomeBCFIndex(fileName)
Create a single chromosome index
createSingleChromosomeVCFIndex(fileName, indexFileName = NULL)
createSingleChromosomeVCFIndex(fileName, indexFileName = NULL)
fileName |
character, represents an input VCF file (Bgzipped, with Tabix index) |
indexFileName |
character, by default, create 'fileName'.scIdx |
indexFileName if success, or NULL is failed
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") cfh <- createSingleChromosomeVCFIndex(fileName)
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") cfh <- createSingleChromosomeVCFIndex(fileName)
Download annotation resources to a directory
download.annotation.resource(outputDirectory)
download.annotation.resource(outputDirectory)
outputDirectory |
the directory to store annotation resources |
will not return anything
## Not run: download.annotation.resource("/tmp") ## End(Not run)
## Not run: download.annotation.resource("/tmp") ## End(Not run)
Extract pair of positions by ranges
getCovPair(covData, rangeList1, rangeList2)
getCovPair(covData, rangeList1, rangeList2)
covData |
a covariance matrix with positions as dimnames |
rangeList1 |
character specify a range, 1-based index |
rangeList2 |
character specify a range, 1-based index |
a covariance matrix covFileName = system.file("rvtests/rvtest.MetaCov.assoc.gz", package = "seqminer") cfh <- rvmeta.readCovByRange(covFileName, "1:196621007-196716634") rangeList1 <- "1:196621007-196700000" rangeList2 <- "1:196700000-196716634" getCovPair(cfh, rangeList1, rangeList2)
Annotate a test variant
getRefBase(reference, chrom, position, len = NULL)
getRefBase(reference, chrom, position, len = NULL)
reference |
path to the reference genome file (.fa file) |
chrom |
a vector of chromosome names |
position |
a vector of chromosome positions |
len |
a vector of length |
based extracted from the reference genome
Test whether directory is writable
isDirWritable(outDir)
isDirWritable(outDir)
outDir |
the name of the directory |
TRUE if the file is writable isDirWritable("~")
Test whether a vector of positions are inside given ranges
isInRange(positions, rangeList)
isInRange(positions, rangeList)
positions |
characters, positions. e.g. c("1:2-3", "1:4") |
rangeList |
character, ranges, e.g. "1:1-3,1:2-4", 1-based index |
logical vector, TRUE/FALSE/NA
positions <- c("1:2-3", "1:4", "XX") ranges <- "1:1-3,1:2-4,1:5-10" isInRange(positions, ranges)
positions <- c("1:2-3", "1:4", "XX") ranges <- "1:1-3,1:2-4,1:5-10" isInRange(positions, ranges)
Check if the inputs are valid tabix range such as chr1:2-300
isTabixRange(range)
isTabixRange(range)
range |
characer vector |
valid <- isTabixRange(c("chr1:1-200", "X:1", "1:100-100", "chr1", "1:1-20,1:30-40")) stopifnot(all(valid)) invalid <- isTabixRange(c(":1", "chr1::", ":-")) stopifnot(all(!invalid))
valid <- isTabixRange(c("chr1:1-200", "X:1", "1:100-100", "chr1", "1:1-20,1:30-40")) stopifnot(all(valid)) invalid <- isTabixRange(c(":1", "chr1::", ":-")) stopifnot(all(!invalid))
Construct a usable set of annotation parameters
makeAnnotationParameter(param = NULL)
makeAnnotationParameter(param = NULL)
param |
a list of annotation elements |
list, a complete list of supported parameters
Create a new job
newJob(id, cmd, outFile, depend = NULL)
newJob(id, cmd, outFile, depend = NULL)
id |
character, job ids. |
cmd |
character, commands to run |
outFile |
character, the output file names after command are run successfully |
depend |
character vector, specify the prerequisite files (e.g. outFile from other jobs) |
j1 <- newJob('id1', 'cmd out1', 'out1') j2 <- newJob('id2', 'cmd out2', 'out2', depend = 'out1')
j1 <- newJob('id1', 'cmd out1', 'out1') j2 <- newJob('id2', 'cmd out2', 'out2', depend = 'out1')
Create a new workflow
newWorkflow(name)
newWorkflow(name)
name |
character, specify the name of the workflow |
w <- newWorkflow("wf")
w <- newWorkflow("wf")
Open binary PLINK files
openPlink(fileName)
openPlink(fileName)
fileName |
character, represents the prefix of PLINK input file |
an PLINK file object with class name ("PlinkFile")
fileName = system.file("plink/all.anno.filtered.extract.bed", package = "seqminer") fileName = sub(fileName, pattern = ".bed", replacement = "") plinkObj <- openPlink(fileName) str(plinkObj)
fileName = system.file("plink/all.anno.filtered.extract.bed", package = "seqminer") fileName = sub(fileName, pattern = ".bed", replacement = "") plinkObj <- openPlink(fileName) str(plinkObj)
Read information from BGEN file in a given range and return a list
readBGENToListByGene(fileName, geneFile, geneName)
readBGENToListByGene(fileName, geneFile, geneName)
fileName |
character, represents an input BGEN file (Bgzipped, with Tabix index) |
geneFile |
character, a text file listing all genes in refFlat format |
geneName |
character vector, which gene(s) to be extracted |
a list of chrom, pos, varid, rsid, alleles, isPhased, probability, sampleId
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("bgen/all.anno.filtered.extract.bgen", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- readBGENToListByGene(fileName, geneFile, "CFH")
fileName = system.file("bgen/all.anno.filtered.extract.bgen", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- readBGENToListByGene(fileName, geneFile, "CFH")
Read information from BGEN file in a given range and return a list
readBGENToListByRange(fileName, range)
readBGENToListByRange(fileName, range)
fileName |
character, represents an input BGEN file (Bgzipped, with Tabix index) |
range |
character, a text indicating which range in the BGEN file to extract. e.g. 1:100-200, 1-based index |
a list of chrom, pos, varid, rsid, alleles, isPhased, probability, sampleId
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("bgen/all.anno.filtered.extract.bgen", package = "seqminer") cfh <- readBGENToListByRange(fileName, "1:196621007-196716634")
fileName = system.file("bgen/all.anno.filtered.extract.bgen", package = "seqminer") cfh <- readBGENToListByRange(fileName, "1:196621007-196716634")
Read a gene from BGEN file and return a genotype matrix
readBGENToMatrixByGene(fileName, geneFile, geneName)
readBGENToMatrixByGene(fileName, geneFile, geneName)
fileName |
character, represents an input BGEN file (Bgzipped, with Tabix index) |
geneFile |
character, a text file listing all genes in refFlat format |
geneName |
character vector, which gene(s) to be extracted |
genotype matrix
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("bgen/all.anno.filtered.extract.bgen", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- readBGENToMatrixByGene(fileName, geneFile, "CFH")
fileName = system.file("bgen/all.anno.filtered.extract.bgen", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- readBGENToMatrixByGene(fileName, geneFile, "CFH")
Read a gene from BGEN file and return a genotype matrix
readBGENToMatrixByRange(fileName, range)
readBGENToMatrixByRange(fileName, range)
fileName |
character, represents an input BGEN file (Bgzipped, with Tabix index) |
range |
character, a text indicating which range in the BGEN file to extract. e.g. 1:100-200, 1-based index |
genotype matrix
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("bgen/all.anno.filtered.extract.bgen", package = "seqminer") cfh <- readBGENToMatrixByRange(fileName, "1:196621007-196716634")
fileName = system.file("bgen/all.anno.filtered.extract.bgen", package = "seqminer") cfh <- readBGENToMatrixByRange(fileName, "1:196621007-196716634")
Read from binary PLINK file and return a genotype matrix
readPlinkToMatrixByIndex(plinkFilePrefix, sampleIndex, markerIndex)
readPlinkToMatrixByIndex(plinkFilePrefix, sampleIndex, markerIndex)
plinkFilePrefix |
a PlinkFileObject obtained by openPlink() |
sampleIndex |
integer, 1-basd, index of samples to be extracted |
markerIndex |
integer, 1-basd, index of markers to be extracted |
genotype matrix, marker by sample
http://zhanxw.com/seqminer/ for online manual and examples
## these indice are nonsynonymous markers for 1:196621007-196716634", ## refer to the readVCFToMatrixByRange() fileName = system.file("plink/all.anno.filtered.extract.bed", package = "seqminer") fileName = sub(fileName, pattern = ".bed", replacement = "") sampleIndex = seq(3) markerIndex =c(14, 36) cfh <- readPlinkToMatrixByIndex(fileName, sampleIndex, markerIndex)
## these indice are nonsynonymous markers for 1:196621007-196716634", ## refer to the readVCFToMatrixByRange() fileName = system.file("plink/all.anno.filtered.extract.bed", package = "seqminer") fileName = sub(fileName, pattern = ".bed", replacement = "") sampleIndex = seq(3) markerIndex =c(14, 36) cfh <- readPlinkToMatrixByIndex(fileName, sampleIndex, markerIndex)
Read a range from BCF file and return a genotype matrix
readSingleChromosomeBCFToMatrixByRange(fileName, range, indexFileName = NULL)
readSingleChromosomeBCFToMatrixByRange(fileName, range, indexFileName = NULL)
fileName |
character, represents an input BCF file (Bgzipped, with Tabix index) |
range |
character, a text indicating which range in the BCF file to extract. e.g. 1:100-200, 1-based index |
indexFileName |
character, index file, by default, it s 'fileName'.scIdx |
genotype matrix
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("vcf/all.anno.filtered.extract.headerFixed.bcf.gz", package = "seqminer") cfh <- readSingleChromosomeBCFToMatrixByRange(fileName, "1:196621007-196716634")
fileName = system.file("vcf/all.anno.filtered.extract.headerFixed.bcf.gz", package = "seqminer") cfh <- readSingleChromosomeBCFToMatrixByRange(fileName, "1:196621007-196716634")
Read a range from VCF file and return a genotype matrix
readSingleChromosomeVCFToMatrixByRange(fileName, range, indexFileName = NULL)
readSingleChromosomeVCFToMatrixByRange(fileName, range, indexFileName = NULL)
fileName |
character, represents an input VCF file (Bgzipped, with Tabix index) |
range |
character, a text indicating which range in the VCF file to extract. e.g. 1:100-200, 1-based index |
indexFileName |
character, index file, by default, it s 'fileName'.scIdx |
genotype matrix
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") cfh <- readSingleChromosomeVCFToMatrixByRange(fileName, "1:196621007-196716634")
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") cfh <- readSingleChromosomeVCFToMatrixByRange(fileName, "1:196621007-196716634")
Read information from VCF file in a given range and return a list
readVCFToListByGene( fileName, geneFile, geneName, annoType, vcfColumn, vcfInfo, vcfIndv )
readVCFToListByGene( fileName, geneFile, geneName, annoType, vcfColumn, vcfInfo, vcfIndv )
fileName |
character, represents an input VCF file (Bgzipped, with Tabix index) |
geneFile |
character, a text file listing all genes in refFlat format |
geneName |
character vector, which gene(s) to be extracted |
annoType |
character, annotated types you would like to extract, such as "Nonsynonymous", "Synonymous". This can be left empty. |
vcfColumn |
character vector, which vcf columns to extract. It can be chosen from CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT and etc. |
vcfInfo |
character vector, which should be tags in the INFO columns to extarct. Common choices include: DP, AC, AF, NS |
vcfIndv |
character vector, which values to extract at individual level. Common choices are: GT, GQ, GD |
a list of genes, and each elements has specified vcfColumn, vcfinfo, vcfIndv
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- readVCFToListByGene(fileName, geneFile, "CFH", "Synonymous", c("CHROM", "POS"), c("AF", "AC"), c("GT") )
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- readVCFToListByGene(fileName, geneFile, "CFH", "Synonymous", c("CHROM", "POS"), c("AF", "AC"), c("GT") )
Read information from VCF file in a given range and return a list
readVCFToListByRange(fileName, range, annoType, vcfColumn, vcfInfo, vcfIndv)
readVCFToListByRange(fileName, range, annoType, vcfColumn, vcfInfo, vcfIndv)
fileName |
character, represents an input VCF file (Bgzipped, with Tabix index) |
range |
character, a text indicating which range in the VCF file to extract. e.g. 1:100-200, 1-based index |
annoType |
character, annotated types you would like to extract, such as "Nonsynonymous", "Synonymous". This can be left empty. |
vcfColumn |
character vector, which vcf columns to extract. It can be chosen from CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT and etc. |
vcfInfo |
character vector, which should be tags in the INFO columns to extarct. Common choices include: DP, AC, AF, NS |
vcfIndv |
character vector, which values to extract at individual level. Common choices are: GT, GQ, GD |
a list of genes, and each elements has specified vcfColumn, vcfinfo, vcfIndv
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") cfh <- readVCFToListByRange(fileName, "1:196621007-196716634", "Nonsynonymous", c("CHROM", "POS"), c("AF", "AC"), c("GT") )
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") cfh <- readVCFToListByRange(fileName, "1:196621007-196716634", "Nonsynonymous", c("CHROM", "POS"), c("AF", "AC"), c("GT") )
Read a gene from VCF file and return a genotype matrix
readVCFToMatrixByGene(fileName, geneFile, geneName, annoType)
readVCFToMatrixByGene(fileName, geneFile, geneName, annoType)
fileName |
character, represents an input VCF file (Bgzipped, with Tabix index) |
geneFile |
character, a text file listing all genes in refFlat format |
geneName |
character vector, which gene(s) to be extracted |
annoType |
character, annotated types you would like to extract, such as "Nonsynonymous", "Synonymous". This can be left empty. |
genotype matrix
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- readVCFToMatrixByGene(fileName, geneFile, "CFH", "Synonymous")
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- readVCFToMatrixByGene(fileName, geneFile, "CFH", "Synonymous")
Read a gene from VCF file and return a genotype matrix
readVCFToMatrixByRange(fileName, range, annoType)
readVCFToMatrixByRange(fileName, range, annoType)
fileName |
character, represents an input VCF file (Bgzipped, with Tabix index) |
range |
character, a text indicating which range in the VCF file to extract. e.g. 1:100-200, 1-based index |
annoType |
character, annotated types you would like to extract, such as "Nonsynonymous", "Synonymous". This can be left empty. |
genotype matrix
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") cfh <- readVCFToMatrixByRange(fileName, "1:196621007-196716634", "Nonsynonymous")
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") cfh <- readVCFToMatrixByRange(fileName, "1:196621007-196716634", "Nonsynonymous")
Read covariance by range from METAL-format files.
rvmeta.readCovByRange(covFile, tabixRange)
rvmeta.readCovByRange(covFile, tabixRange)
covFile |
character, a covariance file (rvtests outputs using –meta cov) |
tabixRange |
character, a text indicating which range in the VCF file to extract. e.g. 1:100-200 |
a matrix of covariance within given range
http://zhanxw.com/seqminer/ for online manual and examples
covFileName = system.file("rvtests/rvtest.MetaCov.assoc.gz", package = "seqminer") cfh <- rvmeta.readCovByRange(covFileName, "1:196621007-196716634")
covFileName = system.file("rvtests/rvtest.MetaCov.assoc.gz", package = "seqminer") cfh <- rvmeta.readCovByRange(covFileName, "1:196621007-196716634")
Read association statistics by gene from METAL-format files. Both score statistics and covariance statistics will be extracted.
rvmeta.readDataByGene( scoreTestFiles, covFiles, geneFile, geneName, multiAllelic = FALSE )
rvmeta.readDataByGene( scoreTestFiles, covFiles, geneFile, geneName, multiAllelic = FALSE )
scoreTestFiles |
character vector, score test output files (rvtests outputs using –meta score) |
covFiles |
character vector, covaraite files (rvtests outputs using –meta cov) |
geneFile |
character, a text file listing all genes in refFlat format |
geneName |
character vector, which gene(s) to be extracted |
multiAllelic |
boolean, whether to read multi-allelic sites as multiple variants or not |
a list of statistics including chromosome, position, allele frequency, score statistics, covariance and annotation(if input files are annotated).
http://zhanxw.com/seqminer/ for online manual and examples
scoreFileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer") covFileName = system.file("rvtests/rvtest.MetaCov.assoc.gz", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- rvmeta.readDataByGene(scoreFileName, covFileName, geneFile, "CFH")
scoreFileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer") covFileName = system.file("rvtests/rvtest.MetaCov.assoc.gz", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- rvmeta.readDataByGene(scoreFileName, covFileName, geneFile, "CFH")
Read association statistics by range from METAL-format files. Both score statistics and covariance statistics will be extracted.
rvmeta.readDataByRange(scoreTestFiles, covFiles, ranges, multiAllelic = FALSE)
rvmeta.readDataByRange(scoreTestFiles, covFiles, ranges, multiAllelic = FALSE)
scoreTestFiles |
character vector, score test output files (rvtests outputs using –meta score) |
covFiles |
character vector, covaraite files (rvtests outputs using –meta cov) |
ranges |
character, a text indicating which range in the VCF file to extract. e.g. 1:100-200, 1-based index |
multiAllelic |
boolean, whether to read multi-allelic sites as multiple variants or not |
a list of statistics including chromosome, position, allele frequency, score statistics, covariance and annotation(if input files are annotated).
http://zhanxw.com/seqminer/ for online manual and examples
scoreFileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer") covFileName = system.file("rvtests/rvtest.MetaCov.assoc.gz", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- rvmeta.readDataByRange(scoreFileName, covFileName, "1:196621007-196716634")
scoreFileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer") covFileName = system.file("rvtests/rvtest.MetaCov.assoc.gz", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- rvmeta.readDataByRange(scoreFileName, covFileName, "1:196621007-196716634")
Read null model statistics
rvmeta.readNullModel(scoreTestFiles)
rvmeta.readNullModel(scoreTestFiles)
scoreTestFiles |
character vector, score test output files (rvtests outputs using –meta score) |
a list of statistics fitted under the null mode (without genetic effects)
http://zhanxw.com/seqminer/ for online manual and examples
scoreFileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer")
scoreFileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer")
Read score test statistics by range from METAL-format files.
rvmeta.readScoreByRange(scoreTestFiles, tabixRange)
rvmeta.readScoreByRange(scoreTestFiles, tabixRange)
scoreTestFiles |
character vector, score test output files (rvtests outputs using –meta score) |
tabixRange |
character, a text indicating which range in the VCF file to extract. e.g. 1:100-200 |
score test statistics within given range
http://zhanxw.com/seqminer/ for online manual and examples
scoreFileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer") cfh <- rvmeta.readScoreByRange(scoreFileName, "1:196621007-196716634")
scoreFileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer") cfh <- rvmeta.readScoreByRange(scoreFileName, "1:196621007-196716634")
Read skew by range from METAL-format files.
rvmeta.readSkewByRange(skewFile, tabixRange)
rvmeta.readSkewByRange(skewFile, tabixRange)
skewFile |
character, a skew file (rvtests outputs using –meta skew) |
tabixRange |
character, a text indicating which range in the VCF file to extract. e.g. 1:100-200 |
an 3-dimensional array of skewness within given range
http://zhanxw.com/seqminer/ for online manual and examples
skewFileName = system.file("rvtests/rvtest.MetaSkew.assoc.gz", package = "seqminer") cfh <- rvmeta.readSkewByRange(skewFileName, "1:196621007-196716634")
skewFileName = system.file("rvtests/rvtest.MetaSkew.assoc.gz", package = "seqminer") cfh <- rvmeta.readSkewByRange(skewFileName, "1:196621007-196716634")
Write covariance association statistics files.
rvmeta.writeCovData(rvmetaData, outName)
rvmeta.writeCovData(rvmetaData, outName)
rvmetaData |
a list vector. It's usually read by rvmeta.readDataByRange or rvmeta.readDataByGene function |
outName |
character, a text indicating output file prefix |
TRUE only if succeed
http://zhanxw.com/seqminer/ for online manual and examples
scoreFileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer") covFileName = system.file("rvtests/rvtest.MetaCov.assoc.gz", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- rvmeta.readDataByRange(scoreFileName, covFileName, "1:196621007-196716634") outFile <- file.path(tempdir(), "cfh.MetaCov.assoc.gz") rvmeta.writeCovData(cfh, outFile) cat('Outputted MetaCov file are in the temp directory:', outFile, '\n')
scoreFileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer") covFileName = system.file("rvtests/rvtest.MetaCov.assoc.gz", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- rvmeta.readDataByRange(scoreFileName, covFileName, "1:196621007-196716634") outFile <- file.path(tempdir(), "cfh.MetaCov.assoc.gz") rvmeta.writeCovData(cfh, outFile) cat('Outputted MetaCov file are in the temp directory:', outFile, '\n')
Write score-based association statistics files.
rvmeta.writeScoreData(rvmetaData, outName, createIndex = FALSE)
rvmeta.writeScoreData(rvmetaData, outName, createIndex = FALSE)
rvmetaData |
a list vector. It's usually read by rvmeta.readDataByRange or rvmeta.readDataByGene function |
outName |
character, a text indicating output file prefix |
createIndex |
boolean, (default FALSE), whether or not to create the index |
TRUE only if succeed
http://zhanxw.com/seqminer/ for online manual and examples
scoreFileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer") covFileName = system.file("rvtests/rvtest.MetaCov.assoc.gz", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- rvmeta.readDataByRange(scoreFileName, covFileName, "1:196621007-196716634") outFile <- file.path(tempdir(), "cfh.MetaScore.assoc") rvmeta.writeScoreData(cfh, outFile) cat('Outputted MetaScore file are in the temp directory:', outFile, '\n')
scoreFileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer") covFileName = system.file("rvtests/rvtest.MetaCov.assoc.gz", package = "seqminer") geneFile = system.file("vcf/refFlat_hg19_6col.txt.gz", package = "seqminer") cfh <- rvmeta.readDataByRange(scoreFileName, covFileName, "1:196621007-196716634") outFile <- file.path(tempdir(), "cfh.MetaScore.assoc") rvmeta.writeScoreData(cfh, outFile) cat('Outputted MetaScore file are in the temp directory:', outFile, '\n')
SeqMiner provides functions to easily load Variant Call Format (VCF) or METAL format into R
The aim of this package is to save your time parsing large text file. That means data processing time can be saved for other researches. This packages requires Bgzip compressed and Tabix indexed files as input. If input files contaians annotation by TabAnno (), it is possible to extract information at the unit of genes.
Maintainer: Xiaowei Zhan [email protected]
Authors:
Dajiang Liu [email protected]
Other contributors:
Attractive Chaos [email protected] (We have used the following software and made minimal necessary changes: Tabix, Heng Li <[email protected]> (MIT license). We removed standard IO related functions, e.g. printf, fprintf ; also changed its un-safe pointer arithmetics.) [copyright holder]
Broad Institute / Massachusetts Institute of Technology [copyright holder]
Genome Research Ltd (GRL) [copyright holder]
Facebook, Inc [copyright holder]
D. Richard Hipp [copyright holder]
Useful links:
Create tabix index file, similar to running tabix in command line.
tabix.createIndex( bgzipFile, sequenceColumn = 1, startColumn = 4, endColumn = 5, metaChar = "#", skipLines = 0 )
tabix.createIndex( bgzipFile, sequenceColumn = 1, startColumn = 4, endColumn = 5, metaChar = "#", skipLines = 0 )
bgzipFile |
character, an tabix indexed file |
sequenceColumn |
integer, sequence name column |
startColumn |
integer, start column |
endColumn |
integer, end column |
metaChar |
character, symbol for comment/meta lines |
skipLines |
integer, first this number of lines will be skipped |
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") tabix.createIndex(fileName, 1, 2, 0, '#', 0)
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") tabix.createIndex(fileName, 1, 2, 0, '#', 0)
Create tabix index for bgzipped MetaScore/MetaCov file
tabix.createIndex.meta(bgzipFile)
tabix.createIndex.meta(bgzipFile)
bgzipFile |
character, input vcf file |
http://zhanxw.com/seqminer/ for online manual and examples
http://zhanxw.github.io/rvtests/ for rvtests
fileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer") tabix.createIndex.meta(fileName)
fileName = system.file("rvtests/rvtest.MetaScore.assoc.anno.gz", package = "seqminer") tabix.createIndex.meta(fileName)
Create tabix index for bgzipped VCF file
tabix.createIndex.vcf(bgzipVcfFile)
tabix.createIndex.vcf(bgzipVcfFile)
bgzipVcfFile |
character, input vcf file |
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") tabix.createIndex.vcf(fileName)
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") tabix.createIndex.vcf(fileName)
Read tabix file, similar to running tabix in command line.
tabix.read(tabixFile, tabixRange)
tabix.read(tabixFile, tabixRange)
tabixFile |
character, an tabix indexed file |
tabixRange |
character, a text indicating which range in the VCF file to extract. e.g. 1:100-200 |
character vector, each elements is an individual line
http://zhanxw.com/seqminer/ for online manual and examples
if (.Platform$endian == "little") { fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") snp <- tabix.read(fileName, "1:196623337-196632470") } else { message("Tabix does not work well for big endian for now") }
if (.Platform$endian == "little") { fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") snp <- tabix.read(fileName, "1:196623337-196632470") } else { message("Tabix does not work well for big endian for now") }
Read tabix file, similar to running tabix in command line.
tabix.read.header(tabixFile, skippedLine = FALSE)
tabix.read.header(tabixFile, skippedLine = FALSE)
tabixFile |
character, an tabix indexed file |
skippedLine |
logical, whether to read tabix skipped lines (when used 'tabix -S NUM') |
a list
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") snp <- tabix.read.header(fileName)
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") snp <- tabix.read.header(fileName)
Read tabix file, similar to running tabix in command line.
tabix.read.table( tabixFile, tabixRange, col.names = TRUE, stringsAsFactors = FALSE )
tabix.read.table( tabixFile, tabixRange, col.names = TRUE, stringsAsFactors = FALSE )
tabixFile |
character, an tabix indexed file |
tabixRange |
character, a text indicating which range in the VCF file to extract. e.g. 1:100-200 |
col.names |
logical, use tabix file header as result headers (default: TRUE) |
stringsAsFactors |
logical, store loaded data as factors (default: FALSE) |
data frame, each elements is an individual line
http://zhanxw.com/seqminer/ for online manual and examples
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") snp <- tabix.read.table(fileName, "1:196623337-196632470")
fileName = system.file("vcf/all.anno.filtered.extract.vcf.gz", package = "seqminer") snp <- tabix.read.table(fileName, "1:196623337-196632470")
Validate annotate parameter is valid
validateAnnotationParameter(param, debug = FALSE)
validateAnnotationParameter(param, debug = FALSE)
param |
a list of annotation elements |
debug |
show extra debug information or not |
list, first element is TRUE/FALSE if parameter is valid/invalid;
validate the inVcf can be created, and outVcf can be write to. will stop if any error occurs
verifyFilename(inVcf, outVcf)
verifyFilename(inVcf, outVcf)
inVcf |
input file |
outVcf |
output file |
Export workflow to Makefile
writeWorkflow(wf, outFile)
writeWorkflow(wf, outFile)
wf |
a variable workflow class |
outFile |
character, typically named "Makefile" |
j1 <- newJob('id1', 'cmd out1', 'out1') j2 <- newJob('id2', 'cmd out2', 'out2', depend = 'out1') w <- newWorkflow("wf") w <- addJob(w, j1) w <- addJob(w, j2) outFile <- file.path(tempdir(), "Makefile") writeWorkflow(w, outFile) cat('Outputted Makefile file are in the temp directory:', outFile, '\n')
j1 <- newJob('id1', 'cmd out1', 'out1') j2 <- newJob('id2', 'cmd out2', 'out2', depend = 'out1') w <- newWorkflow("wf") w <- addJob(w, j1) w <- addJob(w, j2) outFile <- file.path(tempdir(), "Makefile") writeWorkflow(w, outFile) cat('Outputted Makefile file are in the temp directory:', outFile, '\n')