This script removes outliers identified by MethylAid and single (unpaired) samples, reruns MethylAid to confirm sample quality, inspects beta value distributions, and performs PCA.

Setup

Load packages

library(tidyverse)
library(DNAmArray)
library(minfi)
library(omicsPrint)
library(irlba)
library(BiocParallel)
library(FDb.InfiniumMethylation.hg19)
library(ggfortify)
library(GenomicRanges)
library(SummarizedExperiment)
library(MethylAid)
library(ggrepel)

Annotation

We use DNAmArray internal files, which were pulled recently (2022) from Zhou’s github: https://zwdzwd.github.io/InfiniumAnnotation

anno <- read_tsv(
  "../../../../LLS/Shared_Data/Manifests/EPIC.hg19.manifest.tsv.gz")

We are interested in the:

cpg - ID of the probe for the CpG
chr - Chromosome where the CpG is located
start - Start position of the CpG
end - End position of the CpG
strand - Strand of the CpG
gene_HGNC - Annotated gene

anno <- anno %>% 
  dplyr::select(
    cpg = probeID,
    chr = CpG_chrm,
    start = CpG_beg,
    end = CpG_end,
    strand = probe_strand,
    gene_HGNC,
    MASK_general
  ) %>% 
  mutate(
    chr = substr(chr,4,5)
  )

Remove outliers

Load in targets file from wave 1

load("../Processing/GOTO_targets-unfiltered.Rdata")

Load in MethylAid outliers

outliers <- read_csv("../Processing/MethylAid/GOTO_MethylAid-Outliers.csv")

## New names:
## Rows: 25 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): ...1, ID lgl (5): MU, OP, BS, HC, DP
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Remove outliers and their pairs from the data

outlier_pair <- (targets_goto %>% 
  filter(Basename %in% outliers$ID) %>% 
  mutate(
    pair = paste0(IOP2_ID, tissue)
  ))$pair

targets_wave1 <- targets_goto %>% 
  mutate(
    pair = paste0(IOP2_ID, tissue)
  ) %>% 
  filter(
    !pair %in% outlier_pair) %>% 
  dplyr::select(-pair)

print(paste0("Data on ", nrow(targets_wave1), " samples remains..."))

## [1] "Data on 534 samples remains..."

Add wave 2 targets

load("../Sample_Sheets/GOTO_wave2-targets.Rdata")

targets_wave2 <- targets %>% 
  mutate(
    DNA_labnr = as.factor(DNA_labnr),
    IOP2_ID = as.factor(IOP2_ID),
    HMU_ID = as.factor(HMU_ID),
    tissue = as.factor(tissue),
    timepoint = factor(timepoint, 
                       levels = c("before", "after")),
    sex = factor(sex, 
                 levels=c("male", "female")),
    op_status = factor(op_status, 
                       levels = c("partner", "offspring")),
    plate = factor(plate),
    well = as.factor(paste0(rep(LETTERS[1:8], 3), rep(c("09", "10", "11"), each=8))),
    Basename = paste0(chip_n, "_", chip_position),
    array_n = factor(chip_n),
    array_row = as.numeric(substr(chip_position,3,3))
  ) %>% 
  dplyr::select(
    DNA_labnr, IOP2_ID, HMU_ID, 
    tissue, timepoint, sex, 
    age, bmi, op_status,
    plate, well, isolationdate, 
    conc_ngul, A260280, volume, 
    array_n, array_row, Basename)

Combine waves

targets <- rbind(targets_wave1, targets_wave2)

Remove unpaired

Make pair ID

targets <- targets %>% 
  mutate(
    pair = paste0(IOP2_ID, tissue)
  )

Remove non-paired samples and one sample where pair was resent in wave 2 (fasted blood_2302)

unpaired <- targets %>% 
  group_by(pair) %>% 
  dplyr::summarize(n=n()) %>% 
  ungroup()

unpaired <- (unpaired %>% 
  filter(n<2))$pair

targets <- targets %>% filter(
  (!pair %in% unpaired)) %>% 
  dplyr::select(
    -pair
  )

print(paste0("Data on ", nrow(targets), " samples remains..."))

## [1] "Data on 556 samples remains..."

MethylAid

Set BPPARAM

BPPARAM <- MulticoreParam(8)

Summarize IDAT files for MethylAid

sData_goto <- MethylAid::summarize(
  targets, 
  batchSize=50, 
  BPPARAM = BPPARAM,
  force=TRUE)

Save

save(
  sData_goto, 
  file="../Processing/MethylAid/GOTO_sData-all.Rdata")

This data was visualized using MethylAid and no outliers were identified

Inspect betas

Read IDATs into R as an RGSet object

register(MulticoreParam(6))

RGset <- read.metharray.exp(
  targets, 
  base = ".",
  extended=TRUE)
RGset

## class: RGChannelSetExtended 
## dim: 1051815 556 
## metadata(0):
## assays(5): Green Red GreenSD RedSD NBeads
## rownames(1051815): 1600101 1600111 ... 99810990 99810992
## rowData names(0):
## colnames(556): 203527980082_R01C01 203527980082_R02C01 ...
##   203550300093_R07C01 203550300093_R08C01
## colData names(19): DNA_labnr IOP2_ID ... Basename filenames
## Annotation
##   array: IlluminaHumanMethylationEPIC
##   annotation: ilm10b4.hg19

Calculate beta values from the red and green intensities

betas <- getBeta(
  RGset, 
  type="Illumina")

dim(betas)

## [1] 866091    556

Look at the beta value distribution

densityPlot(
  RGset, 
  main="Beta density plot", 
  xlab="Beta values")

PCA

Calculate PCs

pc <- prcomp_irlba(
  t(betas), 
  n=6)

summary(pc)

## Importance of components:
##                           PC1     PC2      PC3     PC4    PC5     PC6
## Standard deviation     66.284 33.4194 14.31651 13.6111 8.9570 7.65194
## Proportion of Variance  0.619  0.1573  0.02888  0.0261 0.0113 0.00825
## Cumulative Proportion   0.619  0.7763  0.80519  0.8313 0.8426 0.85085

Add PCs to targets

targets <- cbind(
  targets, 
  pc$x)

The PCA analysis can result in two different symmetrical clusters. This code handles those two eventualities and saves IDs to remove.

if(nrow(targets %>% 
        dplyr::filter(PC1 < -30 & tissue != "fasted blood")) == 2) {
  pc_blood <- "right"
} else {
  pc_blood <- "left"
}

Plot the first two PCs against each other to ensure the primary source of variability is tissue.

if(pc_blood == "right"){
  pca_plot <- targets %>% 
  ggplot(aes(
    x=PC1,
    y=PC2,
    label = ifelse(
        (PC1 < -30 & tissue != "fasted blood") |
        (PC1 > -60 & tissue == "fasted blood"), 
        as.character(IOP2_ID), NA)
  )) 
} else {
  pca_plot <- targets %>% 
  ggplot(aes(
    x=PC1,
    y=PC2,
    label = ifelse(
        (PC1 > 30 & tissue != "fasted blood") |
        (PC1 < 60 & tissue == "fasted blood"), 
        as.character(IOP2_ID), NA)
  )) 
}

pca_plot +
  geom_point(
    aes(
      colour = tissue)) +
  geom_label_repel(
    aes(color = tissue),
    fill = "white"
  ) +
  xlab("PC1") +
  ylab("PC2") +
  ggtitle("PCA plot coloured by tissue") +
  theme(
    axis.text = element_text(
       size=9, 
       color="#1B2021"),
    axis.title = element_text(
      size=12, 
      hjust=0.5, 
      color="#1B2021"),
    plot.title = element_text(
      size=16, 
      hjust=0.5,
      face="bold", 
      color = "#548687"),
    panel.background = element_rect(
      fill="white", color="#1B2021"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(
      fill="white"))

There are a few outliers than should be removed:

One fat sample and its pair, which clusters closely to the blood samples. It is likely this was a very bloody biopsy.
One muscle sample and its pair, which similarly clusters close to the blood samples
Two blood samples and their pairs, which cluster close to the fat samples. Potentially, these individual had higher levels of lipids in their blood.

Save the outliers

outliers <- targets %>% 
  dplyr::filter(
      (IOP2_ID == "61479" & tissue == "fasted blood") |
      (IOP2_ID == "60871" & tissue == "fasted blood") |
      (IOP2_ID == "61093" & tissue == "fat") |
      (IOP2_ID == "60822" & tissue == "muscle"))

Remove them from targets

targets <- targets[!(targets$Basename %in% outliers$Basename),]

print(paste0("Data on ", nrow(targets), " samples remains..."))

## [1] "Data on 548 samples remains..."

Add smoking

Import data

smoke_df <- read_csv('../Pheno/GOTO_SmokingData_20201117_v2.csv')

## Rows: 164 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): IOP2_ID, CurrentSmoking, EverSmoked
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Create smoking var and inspect

smoke_df <- smoke_df %>% 
  mutate(smoke = case_when(
    (CurrentSmoking == 1) ~ 2,
    (CurrentSmoking == 0) & (EverSmoked == 1) ~ 1,
    (CurrentSmoking == 0) & (EverSmoked == 0) ~ 0
  ))
xtabs(~smoke_df$smoke + smoke_df$CurrentSmoking)

##               smoke_df$CurrentSmoking
## smoke_df$smoke  0  1
##              0 52  0
##              1 99  0
##              2  0 12

xtabs(~smoke_df$smoke + smoke_df$EverSmoked)

##               smoke_df$EverSmoked
## smoke_df$smoke  0  1
##              0 52  0
##              1  0 99
##              2  0 12

Remove original vars

smoke_df <- smoke_df %>% dplyr::select(IOP2_ID, smoke) %>% mutate(IOP2_ID = as.factor(IOP2_ID))

Add to targets

targets <- left_join(targets, smoke_df, by='IOP2_ID')
xtabs(~targets$smoke)

## targets$smoke
##   0   1   2 
## 186 336  26

Save Unfiltered Data

Attach targets rownames and remove PCs

rownames(targets) <- targets$Basename
targets <- targets %>% 
  dplyr::select(-PC1, -PC2, -PC3, -PC4, -PC5, -PC6)

Subset betas to include only target samples and check order

betas <- betas[,colnames(betas) %in% rownames(targets)]

xtabs(~colnames(betas) == rownames(targets))

## colnames(betas) == rownames(targets)
## TRUE 
##  548

Subset annotation to include only probes in betas

anno <- anno %>% filter(cpg %in% rownames(betas))
str(anno)

## tibble [865,859 × 7] (S3: tbl_df/tbl/data.frame)
##  $ cpg         : chr [1:865859] "cg14817997" "cg26928153" "cg16269199" "cg13869341" ...
##  $ chr         : chr [1:865859] "1" "1" "1" "1" ...
##  $ start       : num [1:865859] 10524 10847 10849 15864 18826 ...
##  $ end         : num [1:865859] 10526 10849 10851 15866 18828 ...
##  $ strand      : chr [1:865859] "-" "+" "+" "-" ...
##  $ gene_HGNC   : chr [1:865859] NA "DDX11L1" "DDX11L1" "WASH7P" ...
##  $ MASK_general: logi [1:865859] TRUE TRUE TRUE TRUE TRUE TRUE ...

betas <- betas[rownames(betas) %in% anno$cpg,]

Save unfiltered values as a SummarizedExperiment object before probe removal

methData_unfiltered <- SummarizedExperiment(
  assays=SimpleList(beta=betas), 
  colData=targets,
  rowData=anno)

methData_unfiltered

## class: SummarizedExperiment 
## dim: 865859 548 
## metadata(0):
## assays(1): beta
## rownames(865859): cg18478105 cg09835024 ... cg10633746 cg12623625
## rowData names(7): cpg chr ... gene_HGNC MASK_general
## colnames(548): 203527980082_R01C01 203527980082_R02C01 ...
##   203550300093_R07C01 203550300093_R08C01
## colData names(19): DNA_labnr IOP2_ID ... Basename smoke

save(
  methData_unfiltered,
  file="../Processing/GOTO_methData-unfiltered.Rdata")

save(
  targets, 
  file="../Processing/GOTO_targets-unfiltered.Rdata")

Subset RGset and check column order

RGset <- RGset[,colnames(RGset) %in% rownames(targets)]
RGset

## class: RGChannelSetExtended 
## dim: 1051815 548 
## metadata(0):
## assays(5): Green Red GreenSD RedSD NBeads
## rownames(1051815): 1600101 1600111 ... 99810990 99810992
## rowData names(0):
## colnames(548): 203527980082_R01C01 203527980082_R02C01 ...
##   203550300093_R07C01 203550300093_R08C01
## colData names(19): DNA_labnr IOP2_ID ... Basename filenames
## Annotation
##   array: IlluminaHumanMethylationEPIC
##   annotation: ilm10b4.hg19

xtabs(~colnames(RGset) == rownames(targets))

## colnames(RGset) == rownames(targets)
## TRUE 
##  548

Save RGset

RGset <- RGset[ , colnames(RGset) %in% colnames(betas)]

save(
  RGset, 
  file = "../Processing/GOTO_RGset-unfiltered.Rdata")

Session Info

sessionInfo()

## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggrepel_0.9.1                             
##  [2] ggfortify_0.4.14                          
##  [3] irlba_2.3.5.1                             
##  [4] Matrix_1.5-4.1                            
##  [5] omicsPrint_1.14.0                         
##  [6] MASS_7.3-60                               
##  [7] DNAmArray_2.0.0                           
##  [8] pls_2.8-2                                 
##  [9] FDb.InfiniumMethylation.hg19_2.2.0        
## [10] org.Hs.eg.db_3.14.0                       
## [11] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2   
## [12] GenomicFeatures_1.46.5                    
## [13] AnnotationDbi_1.56.2                      
## [14] IlluminaHumanMethylationEPICmanifest_0.3.0
## [15] minfi_1.40.0                              
## [16] bumphunter_1.36.0                         
## [17] locfit_1.5-9.8                            
## [18] iterators_1.0.14                          
## [19] foreach_1.5.2                             
## [20] Biostrings_2.62.0                         
## [21] XVector_0.34.0                            
## [22] SummarizedExperiment_1.24.0               
## [23] Biobase_2.58.0                            
## [24] MatrixGenerics_1.10.0                     
## [25] matrixStats_1.0.0                         
## [26] GenomicRanges_1.46.1                      
## [27] GenomeInfoDb_1.34.9                       
## [28] IRanges_2.32.0                            
## [29] S4Vectors_0.36.2                          
## [30] BiocGenerics_0.44.0                       
## [31] BiocParallel_1.32.6                       
## [32] MethylAid_1.28.0                          
## [33] forcats_0.5.2                             
## [34] stringr_1.5.0                             
## [35] dplyr_1.1.3                               
## [36] purrr_0.3.4                               
## [37] readr_2.1.2                               
## [38] tidyr_1.2.1                               
## [39] tibble_3.2.1                              
## [40] ggplot2_3.4.3                             
## [41] tidyverse_1.3.2                           
## [42] rmarkdown_2.16                            
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.3                  tidyselect_1.2.0           
##   [3] RSQLite_2.2.17              grid_4.2.2                 
##   [5] munsell_0.5.0               codetools_0.2-19           
##   [7] preprocessCore_1.60.2       withr_2.5.0                
##   [9] colorspace_2.1-0            filelock_1.0.2             
##  [11] highr_0.10                  knitr_1.43                 
##  [13] rstudioapi_0.14             labeling_0.4.2             
##  [15] GenomeInfoDbData_1.2.9      farver_2.1.1               
##  [17] bit64_4.0.5                 rhdf5_2.42.1               
##  [19] vctrs_0.6.3                 generics_0.1.3             
##  [21] xfun_0.39                   timechange_0.2.0           
##  [23] BiocFileCache_2.2.1         R6_2.5.1                   
##  [25] illuminaio_0.40.0           bitops_1.0-7               
##  [27] rhdf5filters_1.10.1         cachem_1.0.8               
##  [29] reshape_0.8.9               DelayedArray_0.24.0        
##  [31] assertthat_0.2.1            vroom_1.5.7                
##  [33] promises_1.2.0.1            BiocIO_1.8.0               
##  [35] scales_1.2.1                googlesheets4_1.0.1        
##  [37] gtable_0.3.3                rlang_1.1.1                
##  [39] genefilter_1.76.0           splines_4.2.2              
##  [41] rtracklayer_1.54.0          gargle_1.5.0               
##  [43] GEOquery_2.62.2             htm2txt_2.2.2              
##  [45] hexbin_1.28.3               broom_1.0.1                
##  [47] yaml_2.3.7                  reshape2_1.4.4             
##  [49] RaggedExperiment_1.18.0     modelr_0.1.9               
##  [51] backports_1.4.1             httpuv_1.6.11              
##  [53] tools_4.2.2                 gridBase_0.4-7             
##  [55] nor1mix_1.3-0               ellipsis_0.3.2             
##  [57] jquerylib_0.1.4             RColorBrewer_1.1-3         
##  [59] siggenes_1.68.0             MultiAssayExperiment_1.20.0
##  [61] Rcpp_1.0.10                 plyr_1.8.8                 
##  [63] sparseMatrixStats_1.10.0    progress_1.2.2             
##  [65] zlibbioc_1.44.0             RCurl_1.98-1.12            
##  [67] prettyunits_1.1.1           openssl_2.0.6              
##  [69] haven_2.5.1                 fs_1.6.2                   
##  [71] magrittr_2.0.3              data.table_1.14.8          
##  [73] reprex_2.0.2                googledrive_2.0.0          
##  [75] hms_1.1.2                   mime_0.12                  
##  [77] evaluate_0.21               xtable_1.8-4               
##  [79] XML_3.99-0.14               mclust_6.0.0               
##  [81] readxl_1.4.1                gridExtra_2.3              
##  [83] compiler_4.2.2              biomaRt_2.50.3             
##  [85] crayon_1.5.2                htmltools_0.5.5            
##  [87] later_1.3.1                 tzdb_0.4.0                 
##  [89] lubridate_1.9.2             DBI_1.1.3                  
##  [91] dbplyr_2.2.1                rappdirs_0.3.3             
##  [93] cli_3.6.1                   quadprog_1.5-8             
##  [95] pkgconfig_2.0.3             GenomicAlignments_1.30.0   
##  [97] xml2_1.3.4                  annotate_1.72.0            
##  [99] bslib_0.5.0                 rngtools_1.5.2             
## [101] multtest_2.50.0             beanplot_1.3.1             
## [103] rvest_1.0.3                 doRNG_1.8.6                
## [105] scrime_1.3.5                digest_0.6.31              
## [107] base64_2.0.1                cellranger_1.1.0           
## [109] DelayedMatrixStats_1.16.0   restfulr_0.0.15            
## [111] curl_5.0.1                  shiny_1.7.2                
## [113] Rsamtools_2.10.0            rjson_0.2.21               
## [115] lifecycle_1.0.3             nlme_3.1-162               
## [117] jsonlite_1.8.5              Rhdf5lib_1.20.0            
## [119] askpass_1.1                 limma_3.54.2               
## [121] fansi_1.0.4                 pillar_1.9.0               
## [123] lattice_0.21-8              KEGGREST_1.34.0            
## [125] fastmap_1.1.1               httr_1.4.6                 
## [127] survival_3.5-5              glue_1.6.2                 
## [129] png_0.1-8                   bit_4.0.5                  
## [131] stringi_1.7.12              sass_0.4.6                 
## [133] HDF5Array_1.22.1            blob_1.2.4                 
## [135] memoise_2.0.1

Clear

rm(list=ls())

Script 3: MethylAid and PCA

Compiled: June 25, 2024