This script compares genotypes predicted from DNAm data to ensure paired samples are from the same individual.
Load packages
library(tidyverse)
library(GenomicRanges)
library(SummarizedExperiment)
library(omicsPrint)
We use the Zhou manifest, which was pulled recently (2022) from Zhou’s github: https://zwdzwd.github.io/InfiniumAnnotation
snp_cpgs <- read_tsv(
"/exports/molepi/users/ljsinke/LLS/Shared_Data/Manifests/EPIC.hg19.manifest.pop.tsv.gz")
We are interested in the:
cpg
- ID of the probe for the CpGchr
- Chromosome where the CpG is locatedstart
- Start position of the CpGend
- End position of the CpGsnp_cpgs <- snp_cpgs %>%
dplyr::select(
cpg = probeID,
chr = CpG_chrm,
start = CpG_beg,
end = CpG_end,
MASK_snp5_EUR
) %>%
mutate(
chr = substr(chr,4,5)
)
snp_cpgs <- (snp_cpgs %>%
dplyr::filter(MASK_snp5_EUR == TRUE))$cpg
print(paste0("There are ", length(snp_cpgs),
" CpGs containing common European SNPs"))
## [1] "There are 28152 CpGs containing common European SNPs"
Load in unfiltered methylation data
load("../GOTO_Data/Processing/GOTO_methData-unfiltered.Rdata")
methData_unfiltered
## class: SummarizedExperiment
## dim: 865859 548
## metadata(0):
## assays(1): beta
## rownames(865859): cg18478105 cg09835024 ... cg10633746 cg12623625
## rowData names(7): cpg chr ... gene_HGNC MASK_general
## colnames(548): 203527980082_R01C01 203527980082_R02C01 ...
## 203550300093_R07C01 203550300093_R08C01
## colData names(19): DNA_labnr IOP2_ID ... Basename smoke
Subset methData_unfiltered
to keep only those probes containing common SNPs
methData_snps <- methData_unfiltered[
names(methData_unfiltered) %in% snp_cpgs,
]
Save the phenotype information
targets <- as.data.frame(colData(methData_snps))
Check that targets contains information on the right samples
table(rownames(targets) == colnames(methData_snps))
##
## TRUE
## 548
First, we save the methylation data from the blood samples
methData_blood <- methData_snps[ ,
methData_snps$tissue == "fasted blood"]
methData_blood
## class: SummarizedExperiment
## dim: 28152 200
## metadata(0):
## assays(1): beta
## rownames(28152): cg19453472 cg07277756 ... cg15974150 cg16040564
## rowData names(7): cpg chr ... gene_HGNC MASK_general
## colnames(200): 203527980080_R01C01 203527980080_R02C01 ...
## 203550300093_R07C01 203550300093_R08C01
## colData names(19): DNA_labnr IOP2_ID ... Basename smoke
Save the phenotype information
targets_blood <- as.data.frame(colData(methData_blood))
dim(targets_blood)
## [1] 200 19
Save the beta values
betas_blood <- assay(methData_blood)
dim(betas_blood)
## [1] 28152 200
Check the different data matches
table(rownames(targets_blood) == colnames(betas_blood))
##
## TRUE
## 200
Create a data frame of all samples against each other e.g.:
idx | idy | sample_name_x | sample_name_y |
---|---|---|---|
001 | 001 | A | A |
001 | 002 | A | B |
002 | 001 | B | A |
002 | 002 | B | B |
There are 2 samples from all individuals, with the same IOP2_ID
but different timepoint
relations <- expand.grid(
idx = colnames(methData_blood),
idy = colnames(methData_blood))
head(relations)
## idx idy
## 1 203527980080_R01C01 203527980080_R01C01
## 2 203527980080_R02C01 203527980080_R01C01
## 3 203527980080_R07C01 203527980080_R01C01
## 4 203527980080_R08C01 203527980080_R01C01
## 5 203527980024_R07C01 203527980080_R01C01
## 6 203527980024_R08C01 203527980080_R01C01
Save the sample name, so we know which samples should match
relations$sample_name_x <- targets_blood[relations$idx, "IOP2_ID"]
relations$sample_name_y <- targets_blood[relations$idy, "IOP2_ID"]
Create a relation_type variable and set it as Identical if the samples come from the same individual
relations$relation_type <- "unrelated"
relations$relation_type[relations$sample_name_x == relations$sample_name_y] <- "identical"
Show results
table(relations$relation_type)
##
## identical unrelated
## 400 39600
Calculate genotype using omicsPrint
genotype <- beta2genotype(betas_blood,
na.rm=TRUE,
minSep = 0.25,
minSize = 5,
centers = c(0.2, 0.5, 0.8),
assayName=NULL)
str(genotype)
## int [1:1230, 1:200] 2 3 1 2 2 1 1 3 2 2 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:1230] "cg11418303" "cg16398051" "cg16775095" "cg23489384" ...
## ..$ : chr [1:200] "203527980080_R01C01" "203527980080_R02C01" "203527980080_R07C01" "203527980080_R08C01" ...
Save output
out <- alleleSharing(genotype, relations = relations)
## Hash relations
## Pruning 1230 SNPs ...
## 0 SNPs removed because of low call rate!
## 0 samples removed because too few SNPs called!
## Using 1230 polymorphic SNPs to determine allele sharing.
## Running `square` IBS algorithm!
## 201 of 20100 (1%) ...
## 15051 of 20100 (74.88%) ...
## 20101 of 20100 (100%) ...
Plot
mismatches <- inferRelations(out, verbose=TRUE)
## Assumed relation
## Predicted relation identical unrelated
## identical 300 .
## unrelated . 19800
str(mismatches)
## 'data.frame': 0 obs. of 6 variables:
## $ mean : num
## $ var : num
## $ colnames.x: chr
## $ colnames.y: chr
## $ relation : Factor w/ 2 levels "identical","unrelated":
## $ predicted : Factor w/ 2 levels "identical","unrelated":
Table of mismatches
table(mismatches$relation, mismatches$predicted)
##
## identical unrelated
## identical 0 0
## unrelated 0 0
Save output
save(genotype, relations, out, mismatches, file = "../GOTO_Data/Processing/omicsPrint/GOTO_omicsPrint-blood.Rdata")
First, we save the methylation data from the fat samples
methData_fat <- methData_snps[ ,
methData_snps$tissue == "fat"]
methData_fat
## class: SummarizedExperiment
## dim: 28152 184
## metadata(0):
## assays(1): beta
## rownames(28152): cg19453472 cg07277756 ... cg15974150 cg16040564
## rowData names(7): cpg chr ... gene_HGNC MASK_general
## colnames(184): 203527980082_R07C01 203527980082_R08C01 ...
## 203550300093_R05C01 203550300093_R06C01
## colData names(19): DNA_labnr IOP2_ID ... Basename smoke
Save the phenotype information
targets_fat <- as.data.frame(colData(methData_fat))
dim(targets_fat)
## [1] 184 19
Save the beta values
betas_fat <- assay(methData_fat)
dim(betas_fat)
## [1] 28152 184
Check the different data matches
table(rownames(targets_fat) == colnames(betas_fat))
##
## TRUE
## 184
Create a data frame of all samples against each other e.g.:
idx | idy | sample_name_x | sample_name_y |
---|---|---|---|
001 | 001 | A | A |
001 | 002 | A | B |
002 | 001 | B | A |
002 | 002 | B | B |
There are 2 samples from all individuals, with the same IOP2_ID
but different timepoint
relations <- expand.grid(
idx = colnames(methData_fat),
idy = colnames(methData_fat))
Save the sample name, so we know which samples should match
relations$sample_name_x <- targets_fat[relations$idx, "IOP2_ID"]
relations$sample_name_y <- targets_fat[relations$idy, "IOP2_ID"]
Create a relation_type variable and set it as Identical if the samples come from the same individual
relations$relation_type <- "unrelated"
relations$relation_type[relations$sample_name_x == relations$sample_name_y] <- "identical"
Show results
table(relations$relation_type)
##
## identical unrelated
## 368 33488
Calculate genotype using omicsPrint
genotype <- beta2genotype(betas_fat,
na.rm=TRUE,
minSep = 0.25,
minSize = 5,
centers = c(0.2, 0.5, 0.8),
assayName=NULL)
str(genotype)
## int [1:1059, 1:184] 3 3 3 1 3 2 2 3 2 1 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:1059] "cg16398051" "cg09727210" "cg06813297" "cg23489384" ...
## ..$ : chr [1:184] "203527980082_R07C01" "203527980082_R08C01" "203527980080_R03C01" "203527980080_R04C01" ...
Save output
out <- alleleSharing(genotype, relations = relations)
## Hash relations
## Pruning 1059 SNPs ...
## 0 SNPs removed because of low call rate!
## 0 samples removed because too few SNPs called!
## Using 1059 polymorphic SNPs to determine allele sharing.
## Running `square` IBS algorithm!
## 185 of 17020 (1.09%) ...
## 13451 of 17020 (79.03%) ...
Plot
mismatches <- inferRelations(out, verbose=TRUE)
## Assumed relation
## Predicted relation identical unrelated
## identical 275 .
## unrelated 1 16744
str(mismatches)
## 'data.frame': 1 obs. of 6 variables:
## $ mean : num 1.31
## $ var : num 0.435
## $ colnames.x: chr "203548970042_R06C01"
## $ colnames.y: chr "203548970042_R05C01"
## $ relation : Factor w/ 2 levels "identical","unrelated": 1
## $ predicted : Factor w/ 2 levels "identical","unrelated": 2
There is one sample pair that should come from the same individual but does not appear to.
table(mismatches$relation, mismatches$predicted)
##
## identical unrelated
## identical 0 1
## unrelated 0 0
Save output
save(genotype, relations, out, mismatches, file = "../GOTO_Data/Processing/omicsPrint/GOTO_omicsPrint-fat.Rdata")
First, we save the methylation data from the muscle samples
methData_muscle <- methData_snps[ ,
methData_snps$tissue == "muscle"]
methData_muscle
## class: SummarizedExperiment
## dim: 28152 164
## metadata(0):
## assays(1): beta
## rownames(28152): cg19453472 cg07277756 ... cg15974150 cg16040564
## rowData names(7): cpg chr ... gene_HGNC MASK_general
## colnames(164): 203527980082_R01C01 203527980082_R02C01 ...
## 203550300091_R01C01 203550300091_R02C01
## colData names(19): DNA_labnr IOP2_ID ... Basename smoke
Save the phenotype information
targets_muscle <- as.data.frame(colData(methData_muscle))
dim(targets_muscle)
## [1] 164 19
Save the beta values
betas_muscle <- assay(methData_muscle)
dim(betas_muscle)
## [1] 28152 164
Check the different data matches
table(rownames(targets_muscle) == colnames(betas_muscle))
##
## TRUE
## 164
Create a data frame of all samples against each other e.g.:
idx | idy | sample_name_x | sample_name_y |
---|---|---|---|
001 | 001 | A | A |
001 | 002 | A | B |
002 | 001 | B | A |
002 | 002 | B | B |
There are 2 samples from all individuals, with the same IOP2_ID
but different timepoint
relations <- expand.grid(
idx = colnames(methData_muscle),
idy = colnames(methData_muscle))
Save the sample name, so we know which samples should match
relations$sample_name_x <- targets_muscle[relations$idx, "IOP2_ID"]
relations$sample_name_y <- targets_muscle[relations$idy, "IOP2_ID"]
Create a relation_type variable and set it as Identical if the samples come from the same individual
relations$relation_type <- "unrelated"
relations$relation_type[relations$sample_name_x == relations$sample_name_y] <- "identical"
Show results
table(relations$relation_type)
##
## identical unrelated
## 328 26568
Calculate genotype using omicsPrint
genotype <- beta2genotype(betas_muscle,
na.rm=TRUE,
minSep = 0.25,
minSize = 5,
centers = c(0.2, 0.5, 0.8),
assayName=NULL)
str(genotype)
## int [1:1010, 1:164] 1 2 3 3 1 3 3 3 2 3 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:1010] "cg11418303" "cg16398051" "cg09727210" "cg23489384" ...
## ..$ : chr [1:164] "203527980082_R01C01" "203527980082_R02C01" "203527980082_R03C01" "203527980082_R04C01" ...
Save output
out <- alleleSharing(genotype, relations = relations)
## Hash relations
## Pruning 1010 SNPs ...
## 0 SNPs removed because of low call rate!
## 0 samples removed because too few SNPs called!
## Using 1010 polymorphic SNPs to determine allele sharing.
## Running `square` IBS algorithm!
## 165 of 13530 (1.22%) ...
## 11451 of 13530 (84.63%) ...
Plot
mismatches <- inferRelations(out, verbose=TRUE)
## Assumed relation
## Predicted relation identical unrelated
## identical 244 2
## unrelated 2 13282
str(mismatches)
## 'data.frame': 4 obs. of 6 variables:
## $ mean : num 1.26 2 2 1.26
## $ var : num 0.44452 0.00296 0.00493 0.44356
## $ colnames.x: chr "203548970088_R08C01" "203548980011_R03C01" "203548980011_R04C01" "203548980011_R04C01"
## $ colnames.y: chr "203548970088_R07C01" "203548970088_R07C01" "203548970088_R08C01" "203548980011_R03C01"
## $ relation : Factor w/ 2 levels "identical","unrelated": 1 2 2 1
## $ predicted : Factor w/ 2 levels "identical","unrelated": 2 1 1 2
There are two sample pairs that should match but don’t, perhaps representing a sample swap.
table(mismatches$relation, mismatches$predicted)
##
## identical unrelated
## identical 0 2
## unrelated 2 0
Save output
save(genotype, relations, out, mismatches, file = "../GOTO_Data/Processing/omicsPrint/GOTO_omicsPrint-muscle.Rdata")
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggrepel_0.9.1
## [2] ggfortify_0.4.14
## [3] irlba_2.3.5.1
## [4] Matrix_1.5-4.1
## [5] omicsPrint_1.14.0
## [6] MASS_7.3-60
## [7] DNAmArray_2.0.0
## [8] pls_2.8-2
## [9] FDb.InfiniumMethylation.hg19_2.2.0
## [10] org.Hs.eg.db_3.14.0
## [11] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [12] GenomicFeatures_1.46.5
## [13] AnnotationDbi_1.56.2
## [14] IlluminaHumanMethylationEPICmanifest_0.3.0
## [15] minfi_1.40.0
## [16] bumphunter_1.36.0
## [17] locfit_1.5-9.8
## [18] iterators_1.0.14
## [19] foreach_1.5.2
## [20] Biostrings_2.62.0
## [21] XVector_0.34.0
## [22] SummarizedExperiment_1.24.0
## [23] Biobase_2.58.0
## [24] MatrixGenerics_1.10.0
## [25] matrixStats_1.0.0
## [26] GenomicRanges_1.46.1
## [27] GenomeInfoDb_1.34.9
## [28] IRanges_2.32.0
## [29] S4Vectors_0.36.2
## [30] BiocGenerics_0.44.0
## [31] BiocParallel_1.32.6
## [32] MethylAid_1.28.0
## [33] forcats_0.5.2
## [34] stringr_1.5.0
## [35] dplyr_1.1.3
## [36] purrr_0.3.4
## [37] readr_2.1.2
## [38] tidyr_1.2.1
## [39] tibble_3.2.1
## [40] ggplot2_3.4.3
## [41] tidyverse_1.3.2
## [42] rmarkdown_2.16
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 tidyselect_1.2.0
## [3] RSQLite_2.2.17 grid_4.2.2
## [5] munsell_0.5.0 codetools_0.2-19
## [7] preprocessCore_1.60.2 withr_2.5.0
## [9] colorspace_2.1-0 filelock_1.0.2
## [11] highr_0.10 knitr_1.43
## [13] rstudioapi_0.14 labeling_0.4.2
## [15] GenomeInfoDbData_1.2.9 farver_2.1.1
## [17] bit64_4.0.5 rhdf5_2.42.1
## [19] vctrs_0.6.3 generics_0.1.3
## [21] xfun_0.39 timechange_0.2.0
## [23] BiocFileCache_2.2.1 R6_2.5.1
## [25] illuminaio_0.40.0 bitops_1.0-7
## [27] rhdf5filters_1.10.1 cachem_1.0.8
## [29] reshape_0.8.9 DelayedArray_0.24.0
## [31] assertthat_0.2.1 vroom_1.5.7
## [33] promises_1.2.0.1 BiocIO_1.8.0
## [35] scales_1.2.1 googlesheets4_1.0.1
## [37] gtable_0.3.3 rlang_1.1.1
## [39] genefilter_1.76.0 splines_4.2.2
## [41] rtracklayer_1.54.0 gargle_1.5.0
## [43] GEOquery_2.62.2 htm2txt_2.2.2
## [45] hexbin_1.28.3 broom_1.0.1
## [47] yaml_2.3.7 reshape2_1.4.4
## [49] RaggedExperiment_1.18.0 modelr_0.1.9
## [51] backports_1.4.1 httpuv_1.6.11
## [53] tools_4.2.2 gridBase_0.4-7
## [55] nor1mix_1.3-0 ellipsis_0.3.2
## [57] jquerylib_0.1.4 RColorBrewer_1.1-3
## [59] siggenes_1.68.0 MultiAssayExperiment_1.20.0
## [61] Rcpp_1.0.10 plyr_1.8.8
## [63] sparseMatrixStats_1.10.0 progress_1.2.2
## [65] zlibbioc_1.44.0 RCurl_1.98-1.12
## [67] prettyunits_1.1.1 openssl_2.0.6
## [69] haven_2.5.1 fs_1.6.2
## [71] magrittr_2.0.3 data.table_1.14.8
## [73] reprex_2.0.2 googledrive_2.0.0
## [75] hms_1.1.2 mime_0.12
## [77] evaluate_0.21 xtable_1.8-4
## [79] XML_3.99-0.14 mclust_6.0.0
## [81] readxl_1.4.1 gridExtra_2.3
## [83] compiler_4.2.2 biomaRt_2.50.3
## [85] crayon_1.5.2 htmltools_0.5.5
## [87] later_1.3.1 tzdb_0.4.0
## [89] lubridate_1.9.2 DBI_1.1.3
## [91] dbplyr_2.2.1 rappdirs_0.3.3
## [93] cli_3.6.1 quadprog_1.5-8
## [95] pkgconfig_2.0.3 GenomicAlignments_1.30.0
## [97] xml2_1.3.4 annotate_1.72.0
## [99] bslib_0.5.0 rngtools_1.5.2
## [101] multtest_2.50.0 beanplot_1.3.1
## [103] rvest_1.0.3 doRNG_1.8.6
## [105] scrime_1.3.5 digest_0.6.31
## [107] base64_2.0.1 cellranger_1.1.0
## [109] DelayedMatrixStats_1.16.0 restfulr_0.0.15
## [111] curl_5.0.1 shiny_1.7.2
## [113] Rsamtools_2.10.0 rjson_0.2.21
## [115] lifecycle_1.0.3 nlme_3.1-162
## [117] jsonlite_1.8.5 Rhdf5lib_1.20.0
## [119] askpass_1.1 limma_3.54.2
## [121] fansi_1.0.4 pillar_1.9.0
## [123] lattice_0.21-8 KEGGREST_1.34.0
## [125] fastmap_1.1.1 httr_1.4.6
## [127] survival_3.5-5 glue_1.6.2
## [129] png_0.1-8 bit_4.0.5
## [131] stringi_1.7.12 sass_0.4.6
## [133] HDF5Array_1.22.1 blob_1.2.4
## [135] memoise_2.0.1
Clear
rm(list=ls())