This script loads in differential expression results and tests if genes within 100kb of blood CpGs are differentially expressed in blood.


Setup

Load packages

library(edgeR)
library(cinaR)
library(limma)
library(tidyverse)
library(GenomicFeatures)
library(AnnotationHub)
library(SummarizedExperiment)
library(lme4)
library(lmerTest)

Load df which maps CpGs to genes within 100kb

load('../GOTO_Data/DEGs/Blood-100kb.Rdata')
head(df)
##          cpg        gene_ens   gene
## 1 cg25967669 ENSG00000130413  STK33
## 2 cg24899571 ENSG00000151503 NCAPD3
## 3 cg24899571 ENSG00000166086   JAM3
## 4 cg25828097 ENSG00000084734   GCKR
## 5 cg25828097 ENSG00000115207 GTF3C2
## 6 cg25828097 ENSG00000115211 EIF2B4

Load results of DEG analysis

load('../GOTO_Data/DEGs/Blood-DEG.Rdata')

Merge

deg$gene_ens <- rownames(deg)
deg <- deg %>% 
  dplyr::select(gene_ens, 
                logFC, deg_p = pvalue, 
                deg_padj = padj)

df <- right_join(df, deg, by='gene_ens')
head(df)
##          cpg        gene_ens     gene       logFC        deg_p   deg_padj
## 1 cg20567361 ENSG00000106049   HIBADH -0.10676116 0.0007794506 0.02828947
## 2 cg00230271 ENSG00000145348     TBCK -0.09904288 0.0026457355 0.04509699
## 3 cg00230271 ENSG00000164022    AIMP1 -0.15377210 0.0002962625 0.01661763
## 4 cg08943940 ENSG00000213654    GPSM3  0.07802996 0.0030245519 0.04665371
## 5 cg27422762 ENSG00000187866  FAM122A -0.05978154 0.0010766863 0.03163407
## 6 cg08775556 ENSG00000054611 TBC1D22A  0.07848708 0.0030112452 0.04665371

DNAm data

Load top CpG results and methData

load('../GOTO_Data/GOTO_methData-filtered.Rdata')

Save only blood samples

methData <- methData[ , methData$tissue == 'fasted blood']

Convert betas to data frame

beta_rows <- methData$Basename
betas <- as.data.frame(t(assay(methData)))
rownames(betas) <- beta_rows

Save only CpGs to be tested

betas <- betas %>% dplyr::select(any_of(df$cpg))
dim(betas)
## [1] 196  70

Phenotype data

Load data

load("../GOTO_Data/GOTO_targets-filtered.Rdata")

Filter blood

targets <- targets %>% 
  filter(tissue == 'fasted blood')

Save covariates

targets <- targets %>% 
  dplyr::select(IOP2_ID, timepoint, age, sex, smoke,
                plate, array_row, Basename) %>% 
  mutate(ID = paste0(IOP2_ID, '_', timepoint))

RNAseq data

Create ID list

ID_list <- targets %>% dplyr::select(ID)

Load functions

source('../GOTO_Data/RNAseq/goto.rnaseq.functions.R')

Load data

pathIN_dat <- "../GOTO_Data/RNAseq/merge.gene.counts_biopet_13052016.RData"
pathIN_cov <- "../GOTO_Data/RNAseq/datasheet_RNAseq_blood_V2.csv"

filt.samp <- "tissue_blood|qc_sexswitch|qc_multdim2|qc_rep1|complete_pairs"

dat1 <- read.gotornaseq(pathIN_dat = pathIN_dat, pathIN_cov, filt.samp = filt.samp, type = 'voom-export', quiet = FALSE)
## ||| PREPARING GOTO RNASEQ DATA 
## || READING DATA 
## | Loading RNASEQ .. OK! 
##    [555 samples x 56520 features] 
## | Reading COVARIATES .. OK! 
##    [maintaining 379 samples x 84 features] 
## | Merging data .. OK! 
##    [555 samples x 56604 features] 
## || SUBSETTING SAMPLES 
## | Subsetting SAMPLES on ['tissue_blood']; PASS: 379 out of 555
## | Subsetting SAMPLES on ['qc_sexswitch']; PASS: 379 out of 379
## | Subsetting SAMPLES on ['qc_multdim2']; PASS: 379 out of 379
## | Subsetting SAMPLES on ['qc_rep1']; PASS: 376 out of 379
## | Subsetting SAMPLES on ['complete_pairs']; PASS: 376 out of 376
## || TRANSFORMING DATA 
## | VOOM .. OK!
## | DONE!
goto_exp <- dat1[["dat"]]

goto_exp <- goto_exp %>% 
  dplyr::filter(nutridrink == 0) %>% 
  dplyr::select(IOP2_ID = sampID2, timepoint = intervention, age, sex, flowcell, starts_with('ENS'))

Formatting

goto_exp <- goto_exp %>%  
  dplyr::select(IOP2_ID, timepoint, flowcell,
                       starts_with('ENS')) %>% 
  mutate(
  ID = str_c(IOP2_ID, '_', timepoint)
)

exp_df <- left_join(ID_list, goto_exp, by = 'ID')

dim(exp_df)
## [1]   196 56519

Gene names

ens2gene <- cinaR::grch37

Check data availability

check <- df$gene_ens %in% colnames(goto_exp)
xtabs(~check)
## check
## TRUE 
##   88

Save genes we will check

av_genes <- df$gene_ens
head(av_genes)
## [1] "ENSG00000106049" "ENSG00000145348" "ENSG00000164022" "ENSG00000213654"
## [5] "ENSG00000187866" "ENSG00000054611"

Filter covariates and genes

goto_exp <- goto_exp %>%  
  dplyr::select(IOP2_ID, 
                timepoint, 
                flowcell,
                all_of(av_genes)) %>% 
  mutate(
  ID = str_c(IOP2_ID, '_', timepoint))

Merge with samples we have DNAm data for

ID_list <- targets %>% 
  mutate(
    ID = paste0(IOP2_ID, '_', timepoint)) %>% 
  dplyr::select(ID, Basename)

exp_df <- inner_join(ID_list, goto_exp, by = 'ID')
dim(exp_df)
## [1] 162  88

Ordering and filtering

Ensure DNAm data for all samples

targets <- targets %>% filter(Basename %in% rownames(betas))

print(paste0('We have complete covariate and DNAm data for ',
             nrow(targets), ' samples'))
## [1] "We have complete covariate and DNAm data for 196 samples"

Ensure expression data for all samples

targets <- targets %>% filter(Basename %in% exp_df$Basename)

print(paste0('We have complete data for everything for ',
             nrow(targets), ' samples'))
## [1] "We have complete data for everything for 162 samples"

Order uuid alphabetically

targets <- targets[order(targets$Basename),]
rownames(targets) <- targets$Basename
dim(targets)
## [1] 162   9

Order DNAm data

betas <- betas[rownames(betas) %in% targets$Basename, ]
betas <- betas[order(rownames(betas)), ]
dim(betas)
## [1] 162  70

Order expression data frame

exp_df <- exp_df[exp_df$Basename %in% targets$Basename, ]
exp_df <- exp_df[order(exp_df$Basename), ]
dim(exp_df)
## [1] 162  88

Save

save(betas, exp_df, targets, df,
     file='../GOTO_Data/eQTM/all_eQTMobjects-blood.Rdata')

Analysis

Bind data frames

lm_df <- cbind(targets,
               betas,
               exp_df)

Run models

for(i in 1:nrow(df)){
  cpg <- df$cpg[i]
  gene <- df$gene[i]
  gene_ens <- df$gene_ens[i]
  
  fit <- lmer(substitute(cpg ~ gene_ens + age + sex + smoke +
                           plate + array_row + flowcell + (1|IOP2_ID),
                         list(cpg = as.name(cpg),
                              gene_ens = as.name(gene_ens))),
              data=lm_df)
  
  if(i == 1){
    out <- data.frame(
      cpg = cpg,
      gene = gene,
      gene_ens = gene_ens,
      es = coef(summary(fit))[2,1],
      se = coef(summary(fit))[2,2],
      p = coef(summary(fit))[2,5]
    )
  } else {
    out <- rbind(out,
                 data.frame(
                   cpg = cpg,
                   gene = gene,
                   gene_ens = gene_ens,
                   es = coef(summary(fit))[2,1],
                   se = coef(summary(fit))[2,2],
                   p = coef(summary(fit))[2,5]))
  }
}

Adjust p-values

out$padj <- p.adjust(out$p, method='fdr')

Inspect top

out %>% arrange(padj) %>% head()
##          cpg     gene        gene_ens           es           se            p
## 1 cg18032191     LTBR ENSG00000111321 -0.019222108 0.0036034556 3.707907e-07
## 2 cg18032191 TNFRSF1A ENSG00000067182 -0.017629456 0.0035327616 1.715039e-06
## 3 cg21095811    ATG2B ENSG00000066739 -0.003572713 0.0009643526 3.016543e-04
## 4 cg26554902     TPP2 ENSG00000134900  0.055677386 0.0184736006 3.228242e-03
## 5 cg23343408     TLR5 ENSG00000187554  0.005186785 0.0021365309 1.645720e-02
## 6 cg17707984   SAPCD1 ENSG00000228727 -0.001408493 0.0005887660 1.805402e-02
##           padj
## 1 3.262958e-05
## 2 7.546173e-05
## 3 8.848525e-03
## 4 7.102133e-02
## 5 2.647923e-01
## 6 2.647923e-01
eqtm <- out %>% filter(padj <= 0.05) %>% 
  arrange(padj)

Save

Add DEG data

eqtm <- eqtm %>% 
  dplyr::select(cpg, gene,
                eqtm_es = es, eqtm_se = se, eqtm_p = p, 
                eqtm_padj = padj)

eqtm <- inner_join(eqtm, df, by=c('cpg', 'gene'))
eqtm <- eqtm %>% arrange(eqtm_padj)
eqtm
##          cpg     gene      eqtm_es      eqtm_se       eqtm_p    eqtm_padj
## 1 cg18032191     LTBR -0.019222108 0.0036034556 3.707907e-07 3.262958e-05
## 2 cg18032191 TNFRSF1A -0.017629456 0.0035327616 1.715039e-06 7.546173e-05
## 3 cg21095811    ATG2B -0.003572713 0.0009643526 3.016543e-04 8.848525e-03
##          gene_ens       logFC        deg_p    deg_padj
## 1 ENSG00000111321  0.15518002 6.433843e-06 0.003969681
## 2 ENSG00000067182  0.12251647 2.069650e-04 0.016477654
## 3 ENSG00000066739 -0.08609413 1.697930e-03 0.038095373

Save

save(out, eqtm, file='../GOTO_Data/eQTM/GOTO-blood_eQTM.Rdata')
write_csv(eqtm, file='../GOTO_Data/Tables/ST20.csv')

Session Info

sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] lmerTest_3.1-3                         
##  [2] lme4_1.1-30                            
##  [3] Matrix_1.5-4.1                         
##  [4] AnnotationHub_3.2.2                    
##  [5] BiocFileCache_2.2.1                    
##  [6] dbplyr_2.2.1                           
##  [7] cinaR_0.2.3                            
##  [8] edgeR_3.40.2                           
##  [9] limma_3.54.2                           
## [10] MASS_7.3-60                            
## [11] DNAmArray_2.0.0                        
## [12] pls_2.8-2                              
## [13] FDb.InfiniumMethylation.hg19_2.2.0     
## [14] org.Hs.eg.db_3.14.0                    
## [15] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [16] GenomicFeatures_1.46.5                 
## [17] AnnotationDbi_1.56.2                   
## [18] minfi_1.40.0                           
## [19] bumphunter_1.36.0                      
## [20] locfit_1.5-9.8                         
## [21] iterators_1.0.14                       
## [22] foreach_1.5.2                          
## [23] Biostrings_2.62.0                      
## [24] XVector_0.34.0                         
## [25] SummarizedExperiment_1.24.0            
## [26] Biobase_2.58.0                         
## [27] MatrixGenerics_1.10.0                  
## [28] matrixStats_1.0.0                      
## [29] ggpubr_0.4.0                           
## [30] GenomicRanges_1.46.1                   
## [31] GenomeInfoDb_1.34.9                    
## [32] IRanges_2.32.0                         
## [33] S4Vectors_0.36.2                       
## [34] BiocGenerics_0.44.0                    
## [35] ggrepel_0.9.1                          
## [36] forcats_0.5.2                          
## [37] stringr_1.5.0                          
## [38] dplyr_1.1.3                            
## [39] purrr_0.3.4                            
## [40] readr_2.1.2                            
## [41] tidyr_1.2.1                            
## [42] tibble_3.2.1                           
## [43] ggplot2_3.4.3                          
## [44] tidyverse_1.3.2                        
## [45] rmarkdown_2.16                         
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.3                    tidyselect_1.2.0             
##   [3] RSQLite_2.2.17                grid_4.2.2                   
##   [5] BiocParallel_1.32.6           munsell_0.5.0                
##   [7] codetools_0.2-19              preprocessCore_1.60.2        
##   [9] withr_2.5.0                   colorspace_2.1-0             
##  [11] filelock_1.0.2                knitr_1.43                   
##  [13] rstudioapi_0.14               ggsignif_0.6.3               
##  [15] GenomeInfoDbData_1.2.9        bit64_4.0.5                  
##  [17] rhdf5_2.42.1                  vctrs_0.6.3                  
##  [19] generics_0.1.3                xfun_0.39                    
##  [21] timechange_0.2.0              R6_2.5.1                     
##  [23] illuminaio_0.40.0             bitops_1.0-7                 
##  [25] rhdf5filters_1.10.1           cachem_1.0.8                 
##  [27] reshape_0.8.9                 DelayedArray_0.24.0          
##  [29] assertthat_0.2.1              vroom_1.5.7                  
##  [31] promises_1.2.0.1              BiocIO_1.8.0                 
##  [33] scales_1.2.1                  googlesheets4_1.0.1          
##  [35] gtable_0.3.3                  rlang_1.1.1                  
##  [37] genefilter_1.76.0             splines_4.2.2                
##  [39] rtracklayer_1.54.0            rstatix_0.7.0                
##  [41] gargle_1.5.0                  GEOquery_2.62.2              
##  [43] htm2txt_2.2.2                 broom_1.0.1                  
##  [45] BiocManager_1.30.21           yaml_2.3.7                   
##  [47] reshape2_1.4.4                abind_1.4-5                  
##  [49] modelr_0.1.9                  backports_1.4.1              
##  [51] httpuv_1.6.11                 tools_4.2.2                  
##  [53] nor1mix_1.3-0                 ellipsis_0.3.2               
##  [55] jquerylib_0.1.4               RColorBrewer_1.1-3           
##  [57] siggenes_1.68.0               Rcpp_1.0.10                  
##  [59] plyr_1.8.8                    sparseMatrixStats_1.10.0     
##  [61] progress_1.2.2                zlibbioc_1.44.0              
##  [63] RCurl_1.98-1.12               prettyunits_1.1.1            
##  [65] openssl_2.0.6                 haven_2.5.1                  
##  [67] fs_1.6.2                      magrittr_2.0.3               
##  [69] data.table_1.14.8             reprex_2.0.2                 
##  [71] googledrive_2.0.0             mime_0.12                    
##  [73] hms_1.1.2                     evaluate_0.21                
##  [75] xtable_1.8-4                  XML_3.99-0.14                
##  [77] mclust_6.0.0                  readxl_1.4.1                 
##  [79] compiler_4.2.2                biomaRt_2.50.3               
##  [81] crayon_1.5.2                  minqa_1.2.5                  
##  [83] htmltools_0.5.5               later_1.3.1                  
##  [85] tzdb_0.4.0                    lubridate_1.9.2              
##  [87] DBI_1.1.3                     rappdirs_0.3.3               
##  [89] boot_1.3-28.1                 car_3.1-0                    
##  [91] cli_3.6.1                     quadprog_1.5-8               
##  [93] pkgconfig_2.0.3               GenomicAlignments_1.30.0     
##  [95] numDeriv_2016.8-1.1           xml2_1.3.4                   
##  [97] annotate_1.72.0               bslib_0.5.0                  
##  [99] rngtools_1.5.2                multtest_2.50.0              
## [101] beanplot_1.3.1                rvest_1.0.3                  
## [103] doRNG_1.8.6                   scrime_1.3.5                 
## [105] digest_0.6.31                 base64_2.0.1                 
## [107] cellranger_1.1.0              DelayedMatrixStats_1.16.0    
## [109] restfulr_0.0.15               curl_5.0.1                   
## [111] shiny_1.7.2                   Rsamtools_2.10.0             
## [113] nloptr_2.0.3                  rjson_0.2.21                 
## [115] lifecycle_1.0.3               nlme_3.1-162                 
## [117] jsonlite_1.8.5                Rhdf5lib_1.20.0              
## [119] carData_3.0-5                 askpass_1.1                  
## [121] fansi_1.0.4                   pillar_1.9.0                 
## [123] lattice_0.21-8                KEGGREST_1.34.0              
## [125] fastmap_1.1.1                 httr_1.4.6                   
## [127] survival_3.5-5                interactiveDisplayBase_1.32.0
## [129] glue_1.6.2                    png_0.1-8                    
## [131] BiocVersion_3.16.0            bit_4.0.5                    
## [133] stringi_1.7.12                sass_0.4.6                   
## [135] HDF5Array_1.22.1              blob_1.2.4                   
## [137] memoise_2.0.1

Clear

rm(list=ls())