Typically you can install sim1000G from the CRAN repository:
The genetic map in use by the package is from the Hapmap 2010 release, lifted over to GrCH37 coordinates. It was downloaded from:
ftp://ftp.ncbi.nlm.nih.gov/hapmap/recombination/2011-01_phaseII_B37/
Before starting the simulator, a VCF file of the region of interest is needed. The VCF file is used to provide the haplotypes that will be used for the simulator.
For this example we use an unfiltered region from 1000 genomes Phase III sequencing data VCF, chromosome 4, CEU samples.
We also need to initialize all the simulator internal structures with the command startSimulation.
The following parameters can be set:
Simulationg of related individuals in a pedigrees requires modelling recombination events.
Below we show an example on how to simulate 100 families with 2 offspring each.
In addition we write the output to a PED/MAP file in plink format, for further analysis.
# Simulate one family with 2 offspring
fam = newFamilyWithOffspring("fam1",2)
print(fam)
# Simulate 100 families
SIM$reset()
## For testing the IBD, we set the cM so that the regions spans to 4000cm
## Remove for normal use
SIM$cm = seq( 0,4000, length = length(SIM$cm) )
time10families = function() {
fam = lapply(1:10, function(x) newFamilyWithOffspring(x,2) )
fam = do.call(rbind, fam)
fam
}
fam <- time10families()
# Uncomment the following line to write a plink compatible ped/map file
# writePED(vcf, fam, "out")
The simulator tracks the locations of all the ancestral alleles in 2 seperate arrays. These can be used to compute the IBD1,2 matrices, in arbitrary pedigrees.
Unfortunately, tracking the ancestral alleles makes the simulator a lot slower, so if we don’t need this functionality, we can remove it later.
n = SIM$individuals_generated
IBD1matrix =
sapply(1:n, function(y) {
z = sapply(1:n, function(x) computePairIBD12(x,y) [1])
names(z) = 1:n
z
})
IBD2matrix =
sapply(1:n, function(y) {
z = sapply(1:n, function(x) computePairIBD12(x,y) [2])
names(z) = 1:n
z
})
colnames(IBD1matrix) = 1:nrow(IBD1matrix)
rownames(IBD1matrix) = 1:nrow(IBD1matrix)
colnames(IBD2matrix) = 1:nrow(IBD2matrix)
rownames(IBD2matrix) = 1:nrow(IBD2matrix)
knitr::kable(IBD1matrix[1:8,1:8] )
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
---|---|---|---|---|---|---|---|
0 | 0 | 1.00 | 1.00 | 0 | 0 | 0.00 | 0.00 |
0 | 0 | 1.00 | 1.00 | 0 | 0 | 0.00 | 0.00 |
1 | 1 | 0.00 | 0.48 | 0 | 0 | 0.00 | 0.00 |
1 | 1 | 0.48 | 0.00 | 0 | 0 | 0.00 | 0.00 |
0 | 0 | 0.00 | 0.00 | 0 | 0 | 1.00 | 1.00 |
0 | 0 | 0.00 | 0.00 | 0 | 0 | 1.00 | 1.00 |
0 | 0 | 0.00 | 0.00 | 1 | 1 | 0.00 | 0.47 |
0 | 0 | 0.00 | 0.00 | 1 | 1 | 0.47 | 0.00 |
colnames(IBD1matrix) = 1:nrow(IBD1matrix)
rownames(IBD1matrix) = 1:nrow(IBD1matrix)
colnames(IBD2matrix) = 1:nrow(IBD2matrix)
rownames(IBD2matrix) = 1:nrow(IBD2matrix)
knitr::kable(IBD2matrix[1:8,1:8] )
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
---|---|---|---|---|---|---|---|
1 | 0 | 0.00 | 0.00 | 0 | 0 | 0.00 | 0.00 |
0 | 1 | 0.00 | 0.00 | 0 | 0 | 0.00 | 0.00 |
0 | 0 | 1.00 | 0.31 | 0 | 0 | 0.00 | 0.00 |
0 | 0 | 0.31 | 1.00 | 0 | 0 | 0.00 | 0.00 |
0 | 0 | 0.00 | 0.00 | 1 | 0 | 0.00 | 0.00 |
0 | 0 | 0.00 | 0.00 | 0 | 1 | 0.00 | 0.00 |
0 | 0 | 0.00 | 0.00 | 0 | 0 | 1.00 | 0.29 |
0 | 0 | 0.00 | 0.00 | 0 | 0 | 0.29 | 1.00 |
The function to generate family data can be extended to simulate arbitraty pedigrees, it is shown below:
newFamilyWithOffspring = function(familyid, noffspring = 2) {
fam = data.frame(fid = familyid ,
id = c(1:2) ,
father = c(0,0),
mother = c(0,0),
sex = c(1,2)
)
j1 = SIM$addUnrelatedIndividual()
j2 = SIM$addUnrelatedIndividual()
fam$gtindex = c(j1,j2) # Holds the genotype position in the arrays SIM$gt1 and SIM$gt2
for(i in 1:noffspring) {
j3 = SIM$mate(j1,j2)
newFamilyMember = c(familyid, i+10, 1,2, 1 , j3)
fam = rbind(fam, newFamilyMember)
}
return (fam)
}
In this example, we generate a pedigree with 6 individuals, across 3 generations. After that, we compute the IBD matrices of the family.
# Reset simulation
SIM$reset()
# Set the region size in cM (0-4000cm, for testing the correctness of the function)
SIM$cm = seq(0,4000,l=length(SIM$cm))
A = SIM$addUnrelatedIndividual()
B = SIM$addUnrelatedIndividual()
C = SIM$mate(A,B)
D = SIM$mate(A,B)
G = SIM$addUnrelatedIndividual()
E = SIM$mate(G,C)
computePairIBD12(C,D)
computePairIBD12(E,A)
n = SIM$individuals_generated
IBD1matrix =
sapply(1:n, function(y) {
z = sapply(1:n, function(x) computePairIBD12(x,y) [1])
names(z) = 1:n
z
})
IBD2matrix =
sapply(1:n, function(y) {
z = sapply(1:n, function(x) computePairIBD12(x,y) [2])
names(z) = 1:n
z
})
printMatrix(IBD1matrix)
## user system elapsed
## 0.130 0.210 0.086
## IBD1 IBD2
## 0.49 0.28
## IBD1 IBD2
## 0.52 0.00
## [ 1] [ 2] [ 3] [ 4] [ 5] [ 6]
## [ 1] 0.000 0.000 1.000 1.000 0.000 0.520
## [ 2] 0.000 0.000 1.000 1.000 0.000 0.480
## [ 3] 1.000 1.000 0.000 0.490 0.000 1.000
## [ 4] 1.000 1.000 0.490 0.000 0.000 0.480
## [ 5] 0.000 0.000 0.000 0.000 0.000 1.000
## [ 6] 0.520 0.480 1.000 0.480 1.000 0.000