This script removes outliers identified by MethylAid and single (unpaired) samples, reruns MethylAid to confirm sample quality, inspects beta value distributions, and performs PCA.
Load packages
library(tidyverse)
library(DNAmArray)
library(minfi)
library(omicsPrint)
library(irlba)
library(BiocParallel)
library(FDb.InfiniumMethylation.hg19)
library(ggfortify)
library(GenomicRanges)
library(SummarizedExperiment)
library(MethylAid)
library(ggrepel)
We use DNAmArray
internal files, which were pulled recently (2022) from Zhou’s github: https://zwdzwd.github.io/InfiniumAnnotation
anno <- read_tsv(
"../../../../LLS/Shared_Data/Manifests/EPIC.hg19.manifest.tsv.gz")
We are interested in the:
cpg
- ID of the probe for the CpGchr
- Chromosome where the CpG is locatedstart
- Start position of the CpGend
- End position of the CpGstrand
- Strand of the CpGgene_HGNC
- Annotated geneanno <- anno %>%
dplyr::select(
cpg = probeID,
chr = CpG_chrm,
start = CpG_beg,
end = CpG_end,
strand = probe_strand,
gene_HGNC,
MASK_general
) %>%
mutate(
chr = substr(chr,4,5)
)
Load in targets file from wave 1
load("../Processing/GOTO_targets-unfiltered.Rdata")
Load in MethylAid outliers
outliers <- read_csv("../Processing/MethylAid/GOTO_MethylAid-Outliers.csv")
## New names:
## Rows: 25 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): ...1, ID lgl (5): MU, OP, BS, HC, DP
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Remove outliers and their pairs from the data
outlier_pair <- (targets_goto %>%
filter(Basename %in% outliers$ID) %>%
mutate(
pair = paste0(IOP2_ID, tissue)
))$pair
targets_wave1 <- targets_goto %>%
mutate(
pair = paste0(IOP2_ID, tissue)
) %>%
filter(
!pair %in% outlier_pair) %>%
dplyr::select(-pair)
print(paste0("Data on ", nrow(targets_wave1), " samples remains..."))
## [1] "Data on 534 samples remains..."
Add wave 2 targets
load("../Sample_Sheets/GOTO_wave2-targets.Rdata")
targets_wave2 <- targets %>%
mutate(
DNA_labnr = as.factor(DNA_labnr),
IOP2_ID = as.factor(IOP2_ID),
HMU_ID = as.factor(HMU_ID),
tissue = as.factor(tissue),
timepoint = factor(timepoint,
levels = c("before", "after")),
sex = factor(sex,
levels=c("male", "female")),
op_status = factor(op_status,
levels = c("partner", "offspring")),
plate = factor(plate),
well = as.factor(paste0(rep(LETTERS[1:8], 3), rep(c("09", "10", "11"), each=8))),
Basename = paste0(chip_n, "_", chip_position),
array_n = factor(chip_n),
array_row = as.numeric(substr(chip_position,3,3))
) %>%
dplyr::select(
DNA_labnr, IOP2_ID, HMU_ID,
tissue, timepoint, sex,
age, bmi, op_status,
plate, well, isolationdate,
conc_ngul, A260280, volume,
array_n, array_row, Basename)
Combine waves
targets <- rbind(targets_wave1, targets_wave2)
Make pair ID
targets <- targets %>%
mutate(
pair = paste0(IOP2_ID, tissue)
)
Remove non-paired samples and one sample where pair was resent in wave 2 (fasted blood_2302)
unpaired <- targets %>%
group_by(pair) %>%
dplyr::summarize(n=n()) %>%
ungroup()
unpaired <- (unpaired %>%
filter(n<2))$pair
targets <- targets %>% filter(
(!pair %in% unpaired)) %>%
dplyr::select(
-pair
)
print(paste0("Data on ", nrow(targets), " samples remains..."))
## [1] "Data on 556 samples remains..."
Set BPPARAM
BPPARAM <- MulticoreParam(8)
Summarize IDAT files for MethylAid
sData_goto <- MethylAid::summarize(
targets,
batchSize=50,
BPPARAM = BPPARAM,
force=TRUE)
Save
save(
sData_goto,
file="../Processing/MethylAid/GOTO_sData-all.Rdata")
This data was visualized using MethylAid and no outliers were identified
Read IDATs into R as an RGSet
object
register(MulticoreParam(6))
RGset <- read.metharray.exp(
targets,
base = ".",
extended=TRUE)
RGset
## class: RGChannelSetExtended
## dim: 1051815 556
## metadata(0):
## assays(5): Green Red GreenSD RedSD NBeads
## rownames(1051815): 1600101 1600111 ... 99810990 99810992
## rowData names(0):
## colnames(556): 203527980082_R01C01 203527980082_R02C01 ...
## 203550300093_R07C01 203550300093_R08C01
## colData names(19): DNA_labnr IOP2_ID ... Basename filenames
## Annotation
## array: IlluminaHumanMethylationEPIC
## annotation: ilm10b4.hg19
Calculate beta values from the red and green intensities
betas <- getBeta(
RGset,
type="Illumina")
dim(betas)
## [1] 866091 556
Look at the beta value distribution
densityPlot(
RGset,
main="Beta density plot",
xlab="Beta values")
Calculate PCs
pc <- prcomp_irlba(
t(betas),
n=6)
summary(pc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 66.284 33.4194 14.31651 13.6111 8.9570 7.65194
## Proportion of Variance 0.619 0.1573 0.02888 0.0261 0.0113 0.00825
## Cumulative Proportion 0.619 0.7763 0.80519 0.8313 0.8426 0.85085
Add PCs to targets
targets <- cbind(
targets,
pc$x)
The PCA analysis can result in two different symmetrical clusters. This code handles those two eventualities and saves IDs to remove.
if(nrow(targets %>%
dplyr::filter(PC1 < -30 & tissue != "fasted blood")) == 2) {
pc_blood <- "right"
} else {
pc_blood <- "left"
}
Plot the first two PCs against each other to ensure the primary source of variability is tissue.
if(pc_blood == "right"){
pca_plot <- targets %>%
ggplot(aes(
x=PC1,
y=PC2,
label = ifelse(
(PC1 < -30 & tissue != "fasted blood") |
(PC1 > -60 & tissue == "fasted blood"),
as.character(IOP2_ID), NA)
))
} else {
pca_plot <- targets %>%
ggplot(aes(
x=PC1,
y=PC2,
label = ifelse(
(PC1 > 30 & tissue != "fasted blood") |
(PC1 < 60 & tissue == "fasted blood"),
as.character(IOP2_ID), NA)
))
}
pca_plot +
geom_point(
aes(
colour = tissue)) +
geom_label_repel(
aes(color = tissue),
fill = "white"
) +
xlab("PC1") +
ylab("PC2") +
ggtitle("PCA plot coloured by tissue") +
theme(
axis.text = element_text(
size=9,
color="#1B2021"),
axis.title = element_text(
size=12,
hjust=0.5,
color="#1B2021"),
plot.title = element_text(
size=16,
hjust=0.5,
face="bold",
color = "#548687"),
panel.background = element_rect(
fill="white", color="#1B2021"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.background = element_rect(
fill="white"))
There are a few outliers than should be removed:
Save the outliers
outliers <- targets %>%
dplyr::filter(
(IOP2_ID == "61479" & tissue == "fasted blood") |
(IOP2_ID == "60871" & tissue == "fasted blood") |
(IOP2_ID == "61093" & tissue == "fat") |
(IOP2_ID == "60822" & tissue == "muscle"))
Remove them from targets
targets <- targets[!(targets$Basename %in% outliers$Basename),]
print(paste0("Data on ", nrow(targets), " samples remains..."))
## [1] "Data on 548 samples remains..."
Import data
smoke_df <- read_csv('../Pheno/GOTO_SmokingData_20201117_v2.csv')
## Rows: 164 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): IOP2_ID, CurrentSmoking, EverSmoked
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Create smoking var and inspect
smoke_df <- smoke_df %>%
mutate(smoke = case_when(
(CurrentSmoking == 1) ~ 2,
(CurrentSmoking == 0) & (EverSmoked == 1) ~ 1,
(CurrentSmoking == 0) & (EverSmoked == 0) ~ 0
))
xtabs(~smoke_df$smoke + smoke_df$CurrentSmoking)
## smoke_df$CurrentSmoking
## smoke_df$smoke 0 1
## 0 52 0
## 1 99 0
## 2 0 12
xtabs(~smoke_df$smoke + smoke_df$EverSmoked)
## smoke_df$EverSmoked
## smoke_df$smoke 0 1
## 0 52 0
## 1 0 99
## 2 0 12
Remove original vars
smoke_df <- smoke_df %>% dplyr::select(IOP2_ID, smoke) %>% mutate(IOP2_ID = as.factor(IOP2_ID))
Add to targets
targets <- left_join(targets, smoke_df, by='IOP2_ID')
xtabs(~targets$smoke)
## targets$smoke
## 0 1 2
## 186 336 26
Attach targets
rownames and remove PCs
rownames(targets) <- targets$Basename
targets <- targets %>%
dplyr::select(-PC1, -PC2, -PC3, -PC4, -PC5, -PC6)
Subset betas
to include only target samples and check order
betas <- betas[,colnames(betas) %in% rownames(targets)]
xtabs(~colnames(betas) == rownames(targets))
## colnames(betas) == rownames(targets)
## TRUE
## 548
Subset annotation to include only probes in betas
anno <- anno %>% filter(cpg %in% rownames(betas))
str(anno)
## tibble [865,859 × 7] (S3: tbl_df/tbl/data.frame)
## $ cpg : chr [1:865859] "cg14817997" "cg26928153" "cg16269199" "cg13869341" ...
## $ chr : chr [1:865859] "1" "1" "1" "1" ...
## $ start : num [1:865859] 10524 10847 10849 15864 18826 ...
## $ end : num [1:865859] 10526 10849 10851 15866 18828 ...
## $ strand : chr [1:865859] "-" "+" "+" "-" ...
## $ gene_HGNC : chr [1:865859] NA "DDX11L1" "DDX11L1" "WASH7P" ...
## $ MASK_general: logi [1:865859] TRUE TRUE TRUE TRUE TRUE TRUE ...
betas <- betas[rownames(betas) %in% anno$cpg,]
Save unfiltered values as a SummarizedExperiment
object before probe removal
methData_unfiltered <- SummarizedExperiment(
assays=SimpleList(beta=betas),
colData=targets,
rowData=anno)
methData_unfiltered
## class: SummarizedExperiment
## dim: 865859 548
## metadata(0):
## assays(1): beta
## rownames(865859): cg18478105 cg09835024 ... cg10633746 cg12623625
## rowData names(7): cpg chr ... gene_HGNC MASK_general
## colnames(548): 203527980082_R01C01 203527980082_R02C01 ...
## 203550300093_R07C01 203550300093_R08C01
## colData names(19): DNA_labnr IOP2_ID ... Basename smoke
save(
methData_unfiltered,
file="../Processing/GOTO_methData-unfiltered.Rdata")
save(
targets,
file="../Processing/GOTO_targets-unfiltered.Rdata")
Subset RGset
and check column order
RGset <- RGset[,colnames(RGset) %in% rownames(targets)]
RGset
## class: RGChannelSetExtended
## dim: 1051815 548
## metadata(0):
## assays(5): Green Red GreenSD RedSD NBeads
## rownames(1051815): 1600101 1600111 ... 99810990 99810992
## rowData names(0):
## colnames(548): 203527980082_R01C01 203527980082_R02C01 ...
## 203550300093_R07C01 203550300093_R08C01
## colData names(19): DNA_labnr IOP2_ID ... Basename filenames
## Annotation
## array: IlluminaHumanMethylationEPIC
## annotation: ilm10b4.hg19
xtabs(~colnames(RGset) == rownames(targets))
## colnames(RGset) == rownames(targets)
## TRUE
## 548
Save RGset
RGset <- RGset[ , colnames(RGset) %in% colnames(betas)]
save(
RGset,
file = "../Processing/GOTO_RGset-unfiltered.Rdata")
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggrepel_0.9.1
## [2] ggfortify_0.4.14
## [3] irlba_2.3.5.1
## [4] Matrix_1.5-4.1
## [5] omicsPrint_1.14.0
## [6] MASS_7.3-60
## [7] DNAmArray_2.0.0
## [8] pls_2.8-2
## [9] FDb.InfiniumMethylation.hg19_2.2.0
## [10] org.Hs.eg.db_3.14.0
## [11] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [12] GenomicFeatures_1.46.5
## [13] AnnotationDbi_1.56.2
## [14] IlluminaHumanMethylationEPICmanifest_0.3.0
## [15] minfi_1.40.0
## [16] bumphunter_1.36.0
## [17] locfit_1.5-9.8
## [18] iterators_1.0.14
## [19] foreach_1.5.2
## [20] Biostrings_2.62.0
## [21] XVector_0.34.0
## [22] SummarizedExperiment_1.24.0
## [23] Biobase_2.58.0
## [24] MatrixGenerics_1.10.0
## [25] matrixStats_1.0.0
## [26] GenomicRanges_1.46.1
## [27] GenomeInfoDb_1.34.9
## [28] IRanges_2.32.0
## [29] S4Vectors_0.36.2
## [30] BiocGenerics_0.44.0
## [31] BiocParallel_1.32.6
## [32] MethylAid_1.28.0
## [33] forcats_0.5.2
## [34] stringr_1.5.0
## [35] dplyr_1.1.3
## [36] purrr_0.3.4
## [37] readr_2.1.2
## [38] tidyr_1.2.1
## [39] tibble_3.2.1
## [40] ggplot2_3.4.3
## [41] tidyverse_1.3.2
## [42] rmarkdown_2.16
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 tidyselect_1.2.0
## [3] RSQLite_2.2.17 grid_4.2.2
## [5] munsell_0.5.0 codetools_0.2-19
## [7] preprocessCore_1.60.2 withr_2.5.0
## [9] colorspace_2.1-0 filelock_1.0.2
## [11] highr_0.10 knitr_1.43
## [13] rstudioapi_0.14 labeling_0.4.2
## [15] GenomeInfoDbData_1.2.9 farver_2.1.1
## [17] bit64_4.0.5 rhdf5_2.42.1
## [19] vctrs_0.6.3 generics_0.1.3
## [21] xfun_0.39 timechange_0.2.0
## [23] BiocFileCache_2.2.1 R6_2.5.1
## [25] illuminaio_0.40.0 bitops_1.0-7
## [27] rhdf5filters_1.10.1 cachem_1.0.8
## [29] reshape_0.8.9 DelayedArray_0.24.0
## [31] assertthat_0.2.1 vroom_1.5.7
## [33] promises_1.2.0.1 BiocIO_1.8.0
## [35] scales_1.2.1 googlesheets4_1.0.1
## [37] gtable_0.3.3 rlang_1.1.1
## [39] genefilter_1.76.0 splines_4.2.2
## [41] rtracklayer_1.54.0 gargle_1.5.0
## [43] GEOquery_2.62.2 htm2txt_2.2.2
## [45] hexbin_1.28.3 broom_1.0.1
## [47] yaml_2.3.7 reshape2_1.4.4
## [49] RaggedExperiment_1.18.0 modelr_0.1.9
## [51] backports_1.4.1 httpuv_1.6.11
## [53] tools_4.2.2 gridBase_0.4-7
## [55] nor1mix_1.3-0 ellipsis_0.3.2
## [57] jquerylib_0.1.4 RColorBrewer_1.1-3
## [59] siggenes_1.68.0 MultiAssayExperiment_1.20.0
## [61] Rcpp_1.0.10 plyr_1.8.8
## [63] sparseMatrixStats_1.10.0 progress_1.2.2
## [65] zlibbioc_1.44.0 RCurl_1.98-1.12
## [67] prettyunits_1.1.1 openssl_2.0.6
## [69] haven_2.5.1 fs_1.6.2
## [71] magrittr_2.0.3 data.table_1.14.8
## [73] reprex_2.0.2 googledrive_2.0.0
## [75] hms_1.1.2 mime_0.12
## [77] evaluate_0.21 xtable_1.8-4
## [79] XML_3.99-0.14 mclust_6.0.0
## [81] readxl_1.4.1 gridExtra_2.3
## [83] compiler_4.2.2 biomaRt_2.50.3
## [85] crayon_1.5.2 htmltools_0.5.5
## [87] later_1.3.1 tzdb_0.4.0
## [89] lubridate_1.9.2 DBI_1.1.3
## [91] dbplyr_2.2.1 rappdirs_0.3.3
## [93] cli_3.6.1 quadprog_1.5-8
## [95] pkgconfig_2.0.3 GenomicAlignments_1.30.0
## [97] xml2_1.3.4 annotate_1.72.0
## [99] bslib_0.5.0 rngtools_1.5.2
## [101] multtest_2.50.0 beanplot_1.3.1
## [103] rvest_1.0.3 doRNG_1.8.6
## [105] scrime_1.3.5 digest_0.6.31
## [107] base64_2.0.1 cellranger_1.1.0
## [109] DelayedMatrixStats_1.16.0 restfulr_0.0.15
## [111] curl_5.0.1 shiny_1.7.2
## [113] Rsamtools_2.10.0 rjson_0.2.21
## [115] lifecycle_1.0.3 nlme_3.1-162
## [117] jsonlite_1.8.5 Rhdf5lib_1.20.0
## [119] askpass_1.1 limma_3.54.2
## [121] fansi_1.0.4 pillar_1.9.0
## [123] lattice_0.21-8 KEGGREST_1.34.0
## [125] fastmap_1.1.1 httr_1.4.6
## [127] survival_3.5-5 glue_1.6.2
## [129] png_0.1-8 bit_4.0.5
## [131] stringi_1.7.12 sass_0.4.6
## [133] HDF5Array_1.22.1 blob_1.2.4
## [135] memoise_2.0.1
Clear
rm(list=ls())