This script loads in differential expression results and tests if genes within 100kb of blood CpGs are differentially expressed in blood.
Load packages
library(edgeR)
library(cinaR)
library(limma)
library(tidyverse)
library(GenomicFeatures)
library(AnnotationHub)
library(SummarizedExperiment)
library(lme4)
library(lmerTest)
Load df
which maps CpGs to genes within 100kb
load('../GOTO_Data/DEGs/Blood-100kb.Rdata')
head(df)
## cpg gene_ens gene
## 1 cg25967669 ENSG00000130413 STK33
## 2 cg24899571 ENSG00000151503 NCAPD3
## 3 cg24899571 ENSG00000166086 JAM3
## 4 cg25828097 ENSG00000084734 GCKR
## 5 cg25828097 ENSG00000115207 GTF3C2
## 6 cg25828097 ENSG00000115211 EIF2B4
Load results of DEG analysis
load('../GOTO_Data/DEGs/Blood-DEG.Rdata')
Merge
deg$gene_ens <- rownames(deg)
deg <- deg %>%
dplyr::select(gene_ens,
logFC, deg_p = pvalue,
deg_padj = padj)
df <- right_join(df, deg, by='gene_ens')
head(df)
## cpg gene_ens gene logFC deg_p deg_padj
## 1 cg20567361 ENSG00000106049 HIBADH -0.10676116 0.0007794506 0.02828947
## 2 cg00230271 ENSG00000145348 TBCK -0.09904288 0.0026457355 0.04509699
## 3 cg00230271 ENSG00000164022 AIMP1 -0.15377210 0.0002962625 0.01661763
## 4 cg08943940 ENSG00000213654 GPSM3 0.07802996 0.0030245519 0.04665371
## 5 cg27422762 ENSG00000187866 FAM122A -0.05978154 0.0010766863 0.03163407
## 6 cg08775556 ENSG00000054611 TBC1D22A 0.07848708 0.0030112452 0.04665371
Load top CpG results and methData
load('../GOTO_Data/GOTO_methData-filtered.Rdata')
Save only blood samples
methData <- methData[ , methData$tissue == 'fasted blood']
Convert betas to data frame
beta_rows <- methData$Basename
betas <- as.data.frame(t(assay(methData)))
rownames(betas) <- beta_rows
Save only CpGs to be tested
betas <- betas %>% dplyr::select(any_of(df$cpg))
dim(betas)
## [1] 196 70
Load data
load("../GOTO_Data/GOTO_targets-filtered.Rdata")
Filter blood
targets <- targets %>%
filter(tissue == 'fasted blood')
Save covariates
targets <- targets %>%
dplyr::select(IOP2_ID, timepoint, age, sex, smoke,
plate, array_row, Basename) %>%
mutate(ID = paste0(IOP2_ID, '_', timepoint))
Create ID list
ID_list <- targets %>% dplyr::select(ID)
Load functions
source('../GOTO_Data/RNAseq/goto.rnaseq.functions.R')
Load data
pathIN_dat <- "../GOTO_Data/RNAseq/merge.gene.counts_biopet_13052016.RData"
pathIN_cov <- "../GOTO_Data/RNAseq/datasheet_RNAseq_blood_V2.csv"
filt.samp <- "tissue_blood|qc_sexswitch|qc_multdim2|qc_rep1|complete_pairs"
dat1 <- read.gotornaseq(pathIN_dat = pathIN_dat, pathIN_cov, filt.samp = filt.samp, type = 'voom-export', quiet = FALSE)
## ||| PREPARING GOTO RNASEQ DATA
## || READING DATA
## | Loading RNASEQ .. OK!
## [555 samples x 56520 features]
## | Reading COVARIATES .. OK!
## [maintaining 379 samples x 84 features]
## | Merging data .. OK!
## [555 samples x 56604 features]
## || SUBSETTING SAMPLES
## | Subsetting SAMPLES on ['tissue_blood']; PASS: 379 out of 555
## | Subsetting SAMPLES on ['qc_sexswitch']; PASS: 379 out of 379
## | Subsetting SAMPLES on ['qc_multdim2']; PASS: 379 out of 379
## | Subsetting SAMPLES on ['qc_rep1']; PASS: 376 out of 379
## | Subsetting SAMPLES on ['complete_pairs']; PASS: 376 out of 376
## || TRANSFORMING DATA
## | VOOM .. OK!
## | DONE!
goto_exp <- dat1[["dat"]]
goto_exp <- goto_exp %>%
dplyr::filter(nutridrink == 0) %>%
dplyr::select(IOP2_ID = sampID2, timepoint = intervention, age, sex, flowcell, starts_with('ENS'))
Formatting
goto_exp <- goto_exp %>%
dplyr::select(IOP2_ID, timepoint, flowcell,
starts_with('ENS')) %>%
mutate(
ID = str_c(IOP2_ID, '_', timepoint)
)
exp_df <- left_join(ID_list, goto_exp, by = 'ID')
dim(exp_df)
## [1] 196 56519
Gene names
ens2gene <- cinaR::grch37
Check data availability
check <- df$gene_ens %in% colnames(goto_exp)
xtabs(~check)
## check
## TRUE
## 88
Save genes we will check
av_genes <- df$gene_ens
head(av_genes)
## [1] "ENSG00000106049" "ENSG00000145348" "ENSG00000164022" "ENSG00000213654"
## [5] "ENSG00000187866" "ENSG00000054611"
Filter covariates and genes
goto_exp <- goto_exp %>%
dplyr::select(IOP2_ID,
timepoint,
flowcell,
all_of(av_genes)) %>%
mutate(
ID = str_c(IOP2_ID, '_', timepoint))
Merge with samples we have DNAm data for
ID_list <- targets %>%
mutate(
ID = paste0(IOP2_ID, '_', timepoint)) %>%
dplyr::select(ID, Basename)
exp_df <- inner_join(ID_list, goto_exp, by = 'ID')
dim(exp_df)
## [1] 162 88
Ensure DNAm data for all samples
targets <- targets %>% filter(Basename %in% rownames(betas))
print(paste0('We have complete covariate and DNAm data for ',
nrow(targets), ' samples'))
## [1] "We have complete covariate and DNAm data for 196 samples"
Ensure expression data for all samples
targets <- targets %>% filter(Basename %in% exp_df$Basename)
print(paste0('We have complete data for everything for ',
nrow(targets), ' samples'))
## [1] "We have complete data for everything for 162 samples"
Order uuid alphabetically
targets <- targets[order(targets$Basename),]
rownames(targets) <- targets$Basename
dim(targets)
## [1] 162 9
Order DNAm data
betas <- betas[rownames(betas) %in% targets$Basename, ]
betas <- betas[order(rownames(betas)), ]
dim(betas)
## [1] 162 70
Order expression data frame
exp_df <- exp_df[exp_df$Basename %in% targets$Basename, ]
exp_df <- exp_df[order(exp_df$Basename), ]
dim(exp_df)
## [1] 162 88
Save
save(betas, exp_df, targets, df,
file='../GOTO_Data/eQTM/all_eQTMobjects-blood.Rdata')
Bind data frames
lm_df <- cbind(targets,
betas,
exp_df)
Run models
for(i in 1:nrow(df)){
cpg <- df$cpg[i]
gene <- df$gene[i]
gene_ens <- df$gene_ens[i]
fit <- lmer(substitute(cpg ~ gene_ens + age + sex + smoke +
plate + array_row + flowcell + (1|IOP2_ID),
list(cpg = as.name(cpg),
gene_ens = as.name(gene_ens))),
data=lm_df)
if(i == 1){
out <- data.frame(
cpg = cpg,
gene = gene,
gene_ens = gene_ens,
es = coef(summary(fit))[2,1],
se = coef(summary(fit))[2,2],
p = coef(summary(fit))[2,5]
)
} else {
out <- rbind(out,
data.frame(
cpg = cpg,
gene = gene,
gene_ens = gene_ens,
es = coef(summary(fit))[2,1],
se = coef(summary(fit))[2,2],
p = coef(summary(fit))[2,5]))
}
}
Adjust p-values
out$padj <- p.adjust(out$p, method='fdr')
Inspect top
out %>% arrange(padj) %>% head()
## cpg gene gene_ens es se p
## 1 cg18032191 LTBR ENSG00000111321 -0.019222108 0.0036034556 3.707907e-07
## 2 cg18032191 TNFRSF1A ENSG00000067182 -0.017629456 0.0035327616 1.715039e-06
## 3 cg21095811 ATG2B ENSG00000066739 -0.003572713 0.0009643526 3.016543e-04
## 4 cg26554902 TPP2 ENSG00000134900 0.055677386 0.0184736006 3.228242e-03
## 5 cg23343408 TLR5 ENSG00000187554 0.005186785 0.0021365309 1.645720e-02
## 6 cg17707984 SAPCD1 ENSG00000228727 -0.001408493 0.0005887660 1.805402e-02
## padj
## 1 3.262958e-05
## 2 7.546173e-05
## 3 8.848525e-03
## 4 7.102133e-02
## 5 2.647923e-01
## 6 2.647923e-01
eqtm <- out %>% filter(padj <= 0.05) %>%
arrange(padj)
Add DEG data
eqtm <- eqtm %>%
dplyr::select(cpg, gene,
eqtm_es = es, eqtm_se = se, eqtm_p = p,
eqtm_padj = padj)
eqtm <- inner_join(eqtm, df, by=c('cpg', 'gene'))
eqtm <- eqtm %>% arrange(eqtm_padj)
eqtm
## cpg gene eqtm_es eqtm_se eqtm_p eqtm_padj
## 1 cg18032191 LTBR -0.019222108 0.0036034556 3.707907e-07 3.262958e-05
## 2 cg18032191 TNFRSF1A -0.017629456 0.0035327616 1.715039e-06 7.546173e-05
## 3 cg21095811 ATG2B -0.003572713 0.0009643526 3.016543e-04 8.848525e-03
## gene_ens logFC deg_p deg_padj
## 1 ENSG00000111321 0.15518002 6.433843e-06 0.003969681
## 2 ENSG00000067182 0.12251647 2.069650e-04 0.016477654
## 3 ENSG00000066739 -0.08609413 1.697930e-03 0.038095373
Save
save(out, eqtm, file='../GOTO_Data/eQTM/GOTO-blood_eQTM.Rdata')
write_csv(eqtm, file='../GOTO_Data/Tables/ST20.csv')
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] lmerTest_3.1-3
## [2] lme4_1.1-30
## [3] Matrix_1.5-4.1
## [4] AnnotationHub_3.2.2
## [5] BiocFileCache_2.2.1
## [6] dbplyr_2.2.1
## [7] cinaR_0.2.3
## [8] edgeR_3.40.2
## [9] limma_3.54.2
## [10] MASS_7.3-60
## [11] DNAmArray_2.0.0
## [12] pls_2.8-2
## [13] FDb.InfiniumMethylation.hg19_2.2.0
## [14] org.Hs.eg.db_3.14.0
## [15] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [16] GenomicFeatures_1.46.5
## [17] AnnotationDbi_1.56.2
## [18] minfi_1.40.0
## [19] bumphunter_1.36.0
## [20] locfit_1.5-9.8
## [21] iterators_1.0.14
## [22] foreach_1.5.2
## [23] Biostrings_2.62.0
## [24] XVector_0.34.0
## [25] SummarizedExperiment_1.24.0
## [26] Biobase_2.58.0
## [27] MatrixGenerics_1.10.0
## [28] matrixStats_1.0.0
## [29] ggpubr_0.4.0
## [30] GenomicRanges_1.46.1
## [31] GenomeInfoDb_1.34.9
## [32] IRanges_2.32.0
## [33] S4Vectors_0.36.2
## [34] BiocGenerics_0.44.0
## [35] ggrepel_0.9.1
## [36] forcats_0.5.2
## [37] stringr_1.5.0
## [38] dplyr_1.1.3
## [39] purrr_0.3.4
## [40] readr_2.1.2
## [41] tidyr_1.2.1
## [42] tibble_3.2.1
## [43] ggplot2_3.4.3
## [44] tidyverse_1.3.2
## [45] rmarkdown_2.16
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 tidyselect_1.2.0
## [3] RSQLite_2.2.17 grid_4.2.2
## [5] BiocParallel_1.32.6 munsell_0.5.0
## [7] codetools_0.2-19 preprocessCore_1.60.2
## [9] withr_2.5.0 colorspace_2.1-0
## [11] filelock_1.0.2 knitr_1.43
## [13] rstudioapi_0.14 ggsignif_0.6.3
## [15] GenomeInfoDbData_1.2.9 bit64_4.0.5
## [17] rhdf5_2.42.1 vctrs_0.6.3
## [19] generics_0.1.3 xfun_0.39
## [21] timechange_0.2.0 R6_2.5.1
## [23] illuminaio_0.40.0 bitops_1.0-7
## [25] rhdf5filters_1.10.1 cachem_1.0.8
## [27] reshape_0.8.9 DelayedArray_0.24.0
## [29] assertthat_0.2.1 vroom_1.5.7
## [31] promises_1.2.0.1 BiocIO_1.8.0
## [33] scales_1.2.1 googlesheets4_1.0.1
## [35] gtable_0.3.3 rlang_1.1.1
## [37] genefilter_1.76.0 splines_4.2.2
## [39] rtracklayer_1.54.0 rstatix_0.7.0
## [41] gargle_1.5.0 GEOquery_2.62.2
## [43] htm2txt_2.2.2 broom_1.0.1
## [45] BiocManager_1.30.21 yaml_2.3.7
## [47] reshape2_1.4.4 abind_1.4-5
## [49] modelr_0.1.9 backports_1.4.1
## [51] httpuv_1.6.11 tools_4.2.2
## [53] nor1mix_1.3-0 ellipsis_0.3.2
## [55] jquerylib_0.1.4 RColorBrewer_1.1-3
## [57] siggenes_1.68.0 Rcpp_1.0.10
## [59] plyr_1.8.8 sparseMatrixStats_1.10.0
## [61] progress_1.2.2 zlibbioc_1.44.0
## [63] RCurl_1.98-1.12 prettyunits_1.1.1
## [65] openssl_2.0.6 haven_2.5.1
## [67] fs_1.6.2 magrittr_2.0.3
## [69] data.table_1.14.8 reprex_2.0.2
## [71] googledrive_2.0.0 mime_0.12
## [73] hms_1.1.2 evaluate_0.21
## [75] xtable_1.8-4 XML_3.99-0.14
## [77] mclust_6.0.0 readxl_1.4.1
## [79] compiler_4.2.2 biomaRt_2.50.3
## [81] crayon_1.5.2 minqa_1.2.5
## [83] htmltools_0.5.5 later_1.3.1
## [85] tzdb_0.4.0 lubridate_1.9.2
## [87] DBI_1.1.3 rappdirs_0.3.3
## [89] boot_1.3-28.1 car_3.1-0
## [91] cli_3.6.1 quadprog_1.5-8
## [93] pkgconfig_2.0.3 GenomicAlignments_1.30.0
## [95] numDeriv_2016.8-1.1 xml2_1.3.4
## [97] annotate_1.72.0 bslib_0.5.0
## [99] rngtools_1.5.2 multtest_2.50.0
## [101] beanplot_1.3.1 rvest_1.0.3
## [103] doRNG_1.8.6 scrime_1.3.5
## [105] digest_0.6.31 base64_2.0.1
## [107] cellranger_1.1.0 DelayedMatrixStats_1.16.0
## [109] restfulr_0.0.15 curl_5.0.1
## [111] shiny_1.7.2 Rsamtools_2.10.0
## [113] nloptr_2.0.3 rjson_0.2.21
## [115] lifecycle_1.0.3 nlme_3.1-162
## [117] jsonlite_1.8.5 Rhdf5lib_1.20.0
## [119] carData_3.0-5 askpass_1.1
## [121] fansi_1.0.4 pillar_1.9.0
## [123] lattice_0.21-8 KEGGREST_1.34.0
## [125] fastmap_1.1.1 httr_1.4.6
## [127] survival_3.5-5 interactiveDisplayBase_1.32.0
## [129] glue_1.6.2 png_0.1-8
## [131] BiocVersion_3.16.0 bit_4.0.5
## [133] stringi_1.7.12 sass_0.4.6
## [135] HDF5Array_1.22.1 blob_1.2.4
## [137] memoise_2.0.1
Clear
rm(list=ls())