This script compares genotypes predicted from DNAm data to ensure paired samples are from the same individual.

Setup

Load packages

library(tidyverse)
library(GenomicRanges)
library(SummarizedExperiment)
library(omicsPrint)

Annotation

We use the Zhou manifest, which was pulled recently (2022) from Zhou’s github: https://zwdzwd.github.io/InfiniumAnnotation

snp_cpgs <- read_tsv(
  "/exports/molepi/users/ljsinke/LLS/Shared_Data/Manifests/EPIC.hg19.manifest.pop.tsv.gz")

We are interested in the:

cpg - ID of the probe for the CpG
chr - Chromosome where the CpG is located
start - Start position of the CpG
end - End position of the CpG

snp_cpgs <- snp_cpgs %>% 
  dplyr::select(
    cpg = probeID,
    chr = CpG_chrm,
    start = CpG_beg,
    end = CpG_end,
    MASK_snp5_EUR
  ) %>% 
  mutate(
    chr = substr(chr,4,5)
  )

snp_cpgs <- (snp_cpgs %>% 
               dplyr::filter(MASK_snp5_EUR == TRUE))$cpg

print(paste0("There are ", length(snp_cpgs),
             " CpGs containing common European SNPs"))

## [1] "There are 28152 CpGs containing common European SNPs"

Common SNP data

Load in unfiltered methylation data

load("../GOTO_Data/Processing/GOTO_methData-unfiltered.Rdata")
methData_unfiltered

## class: SummarizedExperiment 
## dim: 865859 548 
## metadata(0):
## assays(1): beta
## rownames(865859): cg18478105 cg09835024 ... cg10633746 cg12623625
## rowData names(7): cpg chr ... gene_HGNC MASK_general
## colnames(548): 203527980082_R01C01 203527980082_R02C01 ...
##   203550300093_R07C01 203550300093_R08C01
## colData names(19): DNA_labnr IOP2_ID ... Basename smoke

Subset methData_unfiltered to keep only those probes containing common SNPs

methData_snps <- methData_unfiltered[
  names(methData_unfiltered) %in% snp_cpgs,
]

Save the phenotype information

targets <- as.data.frame(colData(methData_snps))

Check that targets contains information on the right samples

table(rownames(targets) == colnames(methData_snps))

## 
## TRUE 
##  548

Fasted Blood

Subset Data

First, we save the methylation data from the blood samples

methData_blood <- methData_snps[ , 
                      methData_snps$tissue == "fasted blood"]
methData_blood

## class: SummarizedExperiment 
## dim: 28152 200 
## metadata(0):
## assays(1): beta
## rownames(28152): cg19453472 cg07277756 ... cg15974150 cg16040564
## rowData names(7): cpg chr ... gene_HGNC MASK_general
## colnames(200): 203527980080_R01C01 203527980080_R02C01 ...
##   203550300093_R07C01 203550300093_R08C01
## colData names(19): DNA_labnr IOP2_ID ... Basename smoke

Save the phenotype information

targets_blood <- as.data.frame(colData(methData_blood))
dim(targets_blood)

## [1] 200  19

Save the beta values

betas_blood <- assay(methData_blood)
dim(betas_blood)

## [1] 28152   200

Check the different data matches

table(rownames(targets_blood) == colnames(betas_blood))

## 
## TRUE 
##  200

Assumed relations

Create a data frame of all samples against each other e.g.:

idx	idy	sample_name_x	sample_name_y
001	001	A	A
001	002	A	B
002	001	B	A
002	002	B	B

There are 2 samples from all individuals, with the same IOP2_ID but different timepoint

relations <- expand.grid(
  idx = colnames(methData_blood), 
  idy = colnames(methData_blood))
head(relations)

##                   idx                 idy
## 1 203527980080_R01C01 203527980080_R01C01
## 2 203527980080_R02C01 203527980080_R01C01
## 3 203527980080_R07C01 203527980080_R01C01
## 4 203527980080_R08C01 203527980080_R01C01
## 5 203527980024_R07C01 203527980080_R01C01
## 6 203527980024_R08C01 203527980080_R01C01

Save the sample name, so we know which samples should match

relations$sample_name_x <- targets_blood[relations$idx, "IOP2_ID"] 
relations$sample_name_y <- targets_blood[relations$idy, "IOP2_ID"]

Create a relation_type variable and set it as Identical if the samples come from the same individual

relations$relation_type <- "unrelated"
relations$relation_type[relations$sample_name_x == relations$sample_name_y] <- "identical"

Show results

table(relations$relation_type)

## 
## identical unrelated 
##       400     39600

Genotyping

Calculate genotype using omicsPrint

genotype <- beta2genotype(betas_blood, 
                          na.rm=TRUE,
                          minSep = 0.25,
                          minSize = 5,
                          centers = c(0.2, 0.5, 0.8),
                          assayName=NULL)
str(genotype)

##  int [1:1230, 1:200] 2 3 1 2 2 1 1 3 2 2 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:1230] "cg11418303" "cg16398051" "cg16775095" "cg23489384" ...
##   ..$ : chr [1:200] "203527980080_R01C01" "203527980080_R02C01" "203527980080_R07C01" "203527980080_R08C01" ...

Save output

out <- alleleSharing(genotype, relations = relations)

## Hash relations

## Pruning 1230 SNPs ...

## 0 SNPs removed because of low call rate!

## 0 samples removed because too few SNPs called!

## Using 1230 polymorphic SNPs to determine allele sharing.

## Running `square` IBS algorithm!

## 201 of 20100 (1%) ...

## 15051 of 20100 (74.88%) ...

## 20101 of 20100 (100%) ...

Identify Mismatches

Plot

mismatches <- inferRelations(out, verbose=TRUE)

##                   Assumed relation
## Predicted relation identical unrelated
##          identical       300         .
##          unrelated         .     19800

str(mismatches)

## 'data.frame':    0 obs. of  6 variables:
##  $ mean      : num 
##  $ var       : num 
##  $ colnames.x: chr 
##  $ colnames.y: chr 
##  $ relation  : Factor w/ 2 levels "identical","unrelated": 
##  $ predicted : Factor w/ 2 levels "identical","unrelated":

Table of mismatches

table(mismatches$relation, mismatches$predicted)

##            
##             identical unrelated
##   identical         0         0
##   unrelated         0         0

Save output

save(genotype, relations, out, mismatches, file = "../GOTO_Data/Processing/omicsPrint/GOTO_omicsPrint-blood.Rdata")

Subcutaneous Fat

Subset Data

First, we save the methylation data from the fat samples

methData_fat <- methData_snps[ , 
                      methData_snps$tissue == "fat"]
methData_fat

## class: SummarizedExperiment 
## dim: 28152 184 
## metadata(0):
## assays(1): beta
## rownames(28152): cg19453472 cg07277756 ... cg15974150 cg16040564
## rowData names(7): cpg chr ... gene_HGNC MASK_general
## colnames(184): 203527980082_R07C01 203527980082_R08C01 ...
##   203550300093_R05C01 203550300093_R06C01
## colData names(19): DNA_labnr IOP2_ID ... Basename smoke

Save the phenotype information

targets_fat <- as.data.frame(colData(methData_fat))
dim(targets_fat)

## [1] 184  19

Save the beta values

betas_fat <- assay(methData_fat)
dim(betas_fat)

## [1] 28152   184

Check the different data matches

table(rownames(targets_fat) == colnames(betas_fat))

## 
## TRUE 
##  184

Assumed relations

Create a data frame of all samples against each other e.g.:

idx	idy	sample_name_x	sample_name_y
001	001	A	A
001	002	A	B
002	001	B	A
002	002	B	B

There are 2 samples from all individuals, with the same IOP2_ID but different timepoint

relations <- expand.grid(
  idx = colnames(methData_fat), 
  idy = colnames(methData_fat))

Save the sample name, so we know which samples should match

relations$sample_name_x <- targets_fat[relations$idx, "IOP2_ID"] 
relations$sample_name_y <- targets_fat[relations$idy, "IOP2_ID"]

Create a relation_type variable and set it as Identical if the samples come from the same individual

relations$relation_type <- "unrelated"
relations$relation_type[relations$sample_name_x == relations$sample_name_y] <- "identical"

Show results

table(relations$relation_type)

## 
## identical unrelated 
##       368     33488

Genotyping

Calculate genotype using omicsPrint

genotype <- beta2genotype(betas_fat, 
                          na.rm=TRUE,
                          minSep = 0.25,
                          minSize = 5,
                          centers = c(0.2, 0.5, 0.8),
                          assayName=NULL)
str(genotype)

##  int [1:1059, 1:184] 3 3 3 1 3 2 2 3 2 1 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:1059] "cg16398051" "cg09727210" "cg06813297" "cg23489384" ...
##   ..$ : chr [1:184] "203527980082_R07C01" "203527980082_R08C01" "203527980080_R03C01" "203527980080_R04C01" ...

Save output

out <- alleleSharing(genotype, relations = relations)

## Hash relations

## Pruning 1059 SNPs ...

## 0 SNPs removed because of low call rate!

## 0 samples removed because too few SNPs called!

## Using 1059 polymorphic SNPs to determine allele sharing.

## Running `square` IBS algorithm!

## 185 of 17020 (1.09%) ...

## 13451 of 17020 (79.03%) ...

Identify Mismatches

Plot

mismatches <- inferRelations(out, verbose=TRUE)

##                   Assumed relation
## Predicted relation identical unrelated
##          identical       275         .
##          unrelated         1     16744

str(mismatches)

## 'data.frame':    1 obs. of  6 variables:
##  $ mean      : num 1.31
##  $ var       : num 0.435
##  $ colnames.x: chr "203548970042_R06C01"
##  $ colnames.y: chr "203548970042_R05C01"
##  $ relation  : Factor w/ 2 levels "identical","unrelated": 1
##  $ predicted : Factor w/ 2 levels "identical","unrelated": 2

There is one sample pair that should come from the same individual but does not appear to.

table(mismatches$relation, mismatches$predicted)

##            
##             identical unrelated
##   identical         0         1
##   unrelated         0         0

Save output

save(genotype, relations, out, mismatches, file = "../GOTO_Data/Processing/omicsPrint/GOTO_omicsPrint-fat.Rdata")

Skeletal Muscle

Subset Data

First, we save the methylation data from the muscle samples

methData_muscle <- methData_snps[ , 
                      methData_snps$tissue == "muscle"]
methData_muscle

## class: SummarizedExperiment 
## dim: 28152 164 
## metadata(0):
## assays(1): beta
## rownames(28152): cg19453472 cg07277756 ... cg15974150 cg16040564
## rowData names(7): cpg chr ... gene_HGNC MASK_general
## colnames(164): 203527980082_R01C01 203527980082_R02C01 ...
##   203550300091_R01C01 203550300091_R02C01
## colData names(19): DNA_labnr IOP2_ID ... Basename smoke

Save the phenotype information

targets_muscle <- as.data.frame(colData(methData_muscle))
dim(targets_muscle)

## [1] 164  19

Save the beta values

betas_muscle <- assay(methData_muscle)
dim(betas_muscle)

## [1] 28152   164

Check the different data matches

table(rownames(targets_muscle) == colnames(betas_muscle))

## 
## TRUE 
##  164

Assumed relations

Create a data frame of all samples against each other e.g.:

idx	idy	sample_name_x	sample_name_y
001	001	A	A
001	002	A	B
002	001	B	A
002	002	B	B

There are 2 samples from all individuals, with the same IOP2_ID but different timepoint

relations <- expand.grid(
  idx = colnames(methData_muscle), 
  idy = colnames(methData_muscle))

Save the sample name, so we know which samples should match

relations$sample_name_x <- targets_muscle[relations$idx, "IOP2_ID"] 
relations$sample_name_y <- targets_muscle[relations$idy, "IOP2_ID"]

Create a relation_type variable and set it as Identical if the samples come from the same individual

relations$relation_type <- "unrelated"
relations$relation_type[relations$sample_name_x == relations$sample_name_y] <- "identical"

Show results

table(relations$relation_type)

## 
## identical unrelated 
##       328     26568

Genotyping

Calculate genotype using omicsPrint

genotype <- beta2genotype(betas_muscle, 
                          na.rm=TRUE,
                          minSep = 0.25,
                          minSize = 5,
                          centers = c(0.2, 0.5, 0.8),
                          assayName=NULL)
str(genotype)

##  int [1:1010, 1:164] 1 2 3 3 1 3 3 3 2 3 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:1010] "cg11418303" "cg16398051" "cg09727210" "cg23489384" ...
##   ..$ : chr [1:164] "203527980082_R01C01" "203527980082_R02C01" "203527980082_R03C01" "203527980082_R04C01" ...

Save output

out <- alleleSharing(genotype, relations = relations)

## Hash relations

## Pruning 1010 SNPs ...

## 0 SNPs removed because of low call rate!

## 0 samples removed because too few SNPs called!

## Using 1010 polymorphic SNPs to determine allele sharing.

## Running `square` IBS algorithm!

## 165 of 13530 (1.22%) ...

## 11451 of 13530 (84.63%) ...

Identify Mismatches

Plot

mismatches <- inferRelations(out, verbose=TRUE)

##                   Assumed relation
## Predicted relation identical unrelated
##          identical       244         2
##          unrelated         2     13282

str(mismatches)

## 'data.frame':    4 obs. of  6 variables:
##  $ mean      : num  1.26 2 2 1.26
##  $ var       : num  0.44452 0.00296 0.00493 0.44356
##  $ colnames.x: chr  "203548970088_R08C01" "203548980011_R03C01" "203548980011_R04C01" "203548980011_R04C01"
##  $ colnames.y: chr  "203548970088_R07C01" "203548970088_R07C01" "203548970088_R08C01" "203548980011_R03C01"
##  $ relation  : Factor w/ 2 levels "identical","unrelated": 1 2 2 1
##  $ predicted : Factor w/ 2 levels "identical","unrelated": 2 1 1 2

There are two sample pairs that should match but don’t, perhaps representing a sample swap.

table(mismatches$relation, mismatches$predicted)

##            
##             identical unrelated
##   identical         0         2
##   unrelated         2         0

Save output

save(genotype, relations, out, mismatches, file = "../GOTO_Data/Processing/omicsPrint/GOTO_omicsPrint-muscle.Rdata")

Session info

sessionInfo()

## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggrepel_0.9.1                             
##  [2] ggfortify_0.4.14                          
##  [3] irlba_2.3.5.1                             
##  [4] Matrix_1.5-4.1                            
##  [5] omicsPrint_1.14.0                         
##  [6] MASS_7.3-60                               
##  [7] DNAmArray_2.0.0                           
##  [8] pls_2.8-2                                 
##  [9] FDb.InfiniumMethylation.hg19_2.2.0        
## [10] org.Hs.eg.db_3.14.0                       
## [11] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2   
## [12] GenomicFeatures_1.46.5                    
## [13] AnnotationDbi_1.56.2                      
## [14] IlluminaHumanMethylationEPICmanifest_0.3.0
## [15] minfi_1.40.0                              
## [16] bumphunter_1.36.0                         
## [17] locfit_1.5-9.8                            
## [18] iterators_1.0.14                          
## [19] foreach_1.5.2                             
## [20] Biostrings_2.62.0                         
## [21] XVector_0.34.0                            
## [22] SummarizedExperiment_1.24.0               
## [23] Biobase_2.58.0                            
## [24] MatrixGenerics_1.10.0                     
## [25] matrixStats_1.0.0                         
## [26] GenomicRanges_1.46.1                      
## [27] GenomeInfoDb_1.34.9                       
## [28] IRanges_2.32.0                            
## [29] S4Vectors_0.36.2                          
## [30] BiocGenerics_0.44.0                       
## [31] BiocParallel_1.32.6                       
## [32] MethylAid_1.28.0                          
## [33] forcats_0.5.2                             
## [34] stringr_1.5.0                             
## [35] dplyr_1.1.3                               
## [36] purrr_0.3.4                               
## [37] readr_2.1.2                               
## [38] tidyr_1.2.1                               
## [39] tibble_3.2.1                              
## [40] ggplot2_3.4.3                             
## [41] tidyverse_1.3.2                           
## [42] rmarkdown_2.16                            
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.3                  tidyselect_1.2.0           
##   [3] RSQLite_2.2.17              grid_4.2.2                 
##   [5] munsell_0.5.0               codetools_0.2-19           
##   [7] preprocessCore_1.60.2       withr_2.5.0                
##   [9] colorspace_2.1-0            filelock_1.0.2             
##  [11] highr_0.10                  knitr_1.43                 
##  [13] rstudioapi_0.14             labeling_0.4.2             
##  [15] GenomeInfoDbData_1.2.9      farver_2.1.1               
##  [17] bit64_4.0.5                 rhdf5_2.42.1               
##  [19] vctrs_0.6.3                 generics_0.1.3             
##  [21] xfun_0.39                   timechange_0.2.0           
##  [23] BiocFileCache_2.2.1         R6_2.5.1                   
##  [25] illuminaio_0.40.0           bitops_1.0-7               
##  [27] rhdf5filters_1.10.1         cachem_1.0.8               
##  [29] reshape_0.8.9               DelayedArray_0.24.0        
##  [31] assertthat_0.2.1            vroom_1.5.7                
##  [33] promises_1.2.0.1            BiocIO_1.8.0               
##  [35] scales_1.2.1                googlesheets4_1.0.1        
##  [37] gtable_0.3.3                rlang_1.1.1                
##  [39] genefilter_1.76.0           splines_4.2.2              
##  [41] rtracklayer_1.54.0          gargle_1.5.0               
##  [43] GEOquery_2.62.2             htm2txt_2.2.2              
##  [45] hexbin_1.28.3               broom_1.0.1                
##  [47] yaml_2.3.7                  reshape2_1.4.4             
##  [49] RaggedExperiment_1.18.0     modelr_0.1.9               
##  [51] backports_1.4.1             httpuv_1.6.11              
##  [53] tools_4.2.2                 gridBase_0.4-7             
##  [55] nor1mix_1.3-0               ellipsis_0.3.2             
##  [57] jquerylib_0.1.4             RColorBrewer_1.1-3         
##  [59] siggenes_1.68.0             MultiAssayExperiment_1.20.0
##  [61] Rcpp_1.0.10                 plyr_1.8.8                 
##  [63] sparseMatrixStats_1.10.0    progress_1.2.2             
##  [65] zlibbioc_1.44.0             RCurl_1.98-1.12            
##  [67] prettyunits_1.1.1           openssl_2.0.6              
##  [69] haven_2.5.1                 fs_1.6.2                   
##  [71] magrittr_2.0.3              data.table_1.14.8          
##  [73] reprex_2.0.2                googledrive_2.0.0          
##  [75] hms_1.1.2                   mime_0.12                  
##  [77] evaluate_0.21               xtable_1.8-4               
##  [79] XML_3.99-0.14               mclust_6.0.0               
##  [81] readxl_1.4.1                gridExtra_2.3              
##  [83] compiler_4.2.2              biomaRt_2.50.3             
##  [85] crayon_1.5.2                htmltools_0.5.5            
##  [87] later_1.3.1                 tzdb_0.4.0                 
##  [89] lubridate_1.9.2             DBI_1.1.3                  
##  [91] dbplyr_2.2.1                rappdirs_0.3.3             
##  [93] cli_3.6.1                   quadprog_1.5-8             
##  [95] pkgconfig_2.0.3             GenomicAlignments_1.30.0   
##  [97] xml2_1.3.4                  annotate_1.72.0            
##  [99] bslib_0.5.0                 rngtools_1.5.2             
## [101] multtest_2.50.0             beanplot_1.3.1             
## [103] rvest_1.0.3                 doRNG_1.8.6                
## [105] scrime_1.3.5                digest_0.6.31              
## [107] base64_2.0.1                cellranger_1.1.0           
## [109] DelayedMatrixStats_1.16.0   restfulr_0.0.15            
## [111] curl_5.0.1                  shiny_1.7.2                
## [113] Rsamtools_2.10.0            rjson_0.2.21               
## [115] lifecycle_1.0.3             nlme_3.1-162               
## [117] jsonlite_1.8.5              Rhdf5lib_1.20.0            
## [119] askpass_1.1                 limma_3.54.2               
## [121] fansi_1.0.4                 pillar_1.9.0               
## [123] lattice_0.21-8              KEGGREST_1.34.0            
## [125] fastmap_1.1.1               httr_1.4.6                 
## [127] survival_3.5-5              glue_1.6.2                 
## [129] png_0.1-8                   bit_4.0.5                  
## [131] stringi_1.7.12              sass_0.4.6                 
## [133] HDF5Array_1.22.1            blob_1.2.4                 
## [135] memoise_2.0.1

Clear

rm(list=ls())

Script 4: Identify sample mismatches

Compiled: June 25, 2024

Setup

Annotation

Common SNP data

Fasted Blood

Subset Data

Assumed relations

Genotyping

Identify Mismatches

Subcutaneous Fat

Subset Data

Assumed relations

Genotyping

Identify Mismatches

Skeletal Muscle

Subset Data

Assumed relations

Genotyping

Identify Mismatches

Session info