This script loads in differential expression results and tests if genes within 100kb of blood CpGs are differentially expressed in blood.


Setup

Load packages

library(tidyverse)
library(ggrepel)
library(GenomicRanges)
library(ggpubr)
library(DNAmArray)
library(MASS)

Load data

load("../GOTO_Data/GOTO_results-top-blood.Rdata")

Genes within 100kb

Save significant CpGs

print(paste0("There are ", 
             nrow(top_cpgs), 
             " CpGs significant at 5% FDR level"))
## [1] "There are 441 CpGs significant at 5% FDR level"
cpg_top <- top_cpgs$cpg

Save gene locations (saved previously)

txdb <- makeTxDbFromEnsembl(organism = "Homo sapiens",
                            release = 75,
                            circ_seqs = NULL,
                            server = "ensembldb.ensembl.org",
                            username = "anonymous", password = NULL, port = 0L,
                            tx_attrib = NULL)

gene_range <- unlist(cdsBy(txdb, by = "gene"))
gene_range <- unique(gene_range)

save(gene_range, file='/exports/molepi/users/ljsinke/LLS/Shared_Data/allGeneRanges.Rdata)

Load gene_range object

load('/exports/molepi/users/ljsinke/LLS/Shared_Data/allGeneRanges.Rdata')
gene_range
## GRanges object with 302587 ranges and 2 metadata columns:
##                   seqnames            ranges strand |    cds_id        cds_name
##                      <Rle>         <IRanges>  <Rle> | <integer>     <character>
##   ENSG00000000003        X 99885795-99885863      - |    717267 ENSP00000362111
##   ENSG00000000003        X 99887482-99887565      - |    717268 ENSP00000362111
##   ENSG00000000003        X 99888402-99888536      - |    717269 ENSP00000362111
##   ENSG00000000003        X 99888928-99889026      - |    717270 ENSP00000362111
##   ENSG00000000003        X 99890175-99890249      - |    717271 ENSP00000362111
##               ...      ...               ...    ... .       ...             ...
##            LRG_97   LRG_97       17272-17334      + |    799400        LRG_97p1
##            LRG_97   LRG_97       17876-18035      + |    799401        LRG_97p1
##            LRG_97   LRG_97       22463-22593      + |    799402        LRG_97p1
##            LRG_98   LRG_98       10293-13424      + |    799403        LRG_98p1
##            LRG_99   LRG_99        9069-10652      + |    799404        LRG_99p1
##   -------
##   seqinfo: 722 sequences (1 circular) from an unspecified genome

Make a GenomicRanges object for the top CpGs

cpg_range <- GRanges(seqnames = top_cpgs$cpg_chr,
        IRanges(
          start = top_cpgs$cpg_start,
          end = top_cpgs$cpg_end,
          names = top_cpgs$cpg
        ))

cpg_range
## GRanges object with 441 ranges and 0 metadata columns:
##              seqnames              ranges strand
##                 <Rle>           <IRanges>  <Rle>
##   cg25967669       11     8430495-8430497      *
##   cg24899571       11 133938809-133938811      *
##   cg25828097        2   27656864-27656866      *
##   cg27508416        1   42922518-42922520      *
##   cg23812665        1 117061544-117061546      *
##          ...      ...                 ...    ...
##   cg25040733        5 141704733-141704735      *
##   cg13479371       18   72530930-72530932      *
##   cg26201957       11   82601012-82601014      *
##   cg16209776       12   63177893-63177895      *
##   cg23961638        4 153700210-153700212      *
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths

Initialize a distance matrix

distance.matrix <- matrix(
  NaN, 
  nrow = length(gene_range), 
  ncol = length(cpg_range), 
  dimnames = list(names(gene_range), 
                  names(cpg_range)))

Loop through CpGs

for(i in 1:nrow(top_cpgs)){
    cpg <- cpg_range[i]
    calc.distance <- distance(
      cpg, 
      gene_range, 
      ignore.strand = T)
    distance.matrix[,i] <- calc.distance
}

dim(distance.matrix)
## [1] 302587    441
distance.matrix <- as.data.frame(distance.matrix)
colnames(distance.matrix) <- names(cpg_range)
distance.matrix$gene <- names(gene_range)

Make a list of genes within 100kb of each cpg

gene_list <- lapply(names(cpg_range), function(x){
  df <- (distance.matrix %>% dplyr::select(gene, all_of(x)))
  colnames(df) <- c('gene', 'cpg')
  df <- df %>% filter(cpg <= 100000)
  if (nrow(df) > 0) {
    return(data.frame(cpg = x,
                      gene_ens = unique(df$gene)))
  }
})
gene_list[[1]]
##          cpg        gene_ens
## 1 cg25967669 ENSG00000130413

Save gene symbols

ens2gene <- cinaR::grch37

Bind

df <- data.frame()

for(i in 1:length(gene_list)){
  gene_df <- gene_list[[i]]
  
  cpg <- gene_df$cpg[1]
  genes <- unique(gene_df$gene_ens)
  
  df <- rbind(df,
              data.frame(cpg = cpg,
                         gene_ens = genes))
}

sym <- ens2gene$symbol[match(df$gene_ens,
                             ens2gene$ensgene)]

df$gene <- sym

head(df)
##          cpg        gene_ens   gene
## 1 cg25967669 ENSG00000130413  STK33
## 2 cg24899571 ENSG00000151503 NCAPD3
## 3 cg24899571 ENSG00000166086   JAM3
## 4 cg25828097 ENSG00000084734   GCKR
## 5 cg25828097 ENSG00000115207 GTF3C2
## 6 cg25828097 ENSG00000115211 EIF2B4
dim(df)
## [1] 1760    3

Save

save(df, file='../GOTO_Data/DEGs/Blood-100kb.Rdata')

DEGs

print(paste0("There are ", length(unique(df$gene_ens)), " unique genes within 100kb of ",
             length(unique(df$cpg)), " CpGs."))
## [1] "There are 1588 unique genes within 100kb of 403 CpGs."

Read in DEG results

deg <- read.table('../GOTO_Data/DEGs/DEGs_blood_postprandial_combined_new.txt')

Filter for those within 100kb

deg <- deg %>% 
  filter(GENE.SYMBOL %in% df$gene) %>% 
  dplyr::select(logFC, AveExpr, pvalue, gene = GENE.SYMBOL)

Adjust for multiple testing

deg$padj <- p.adjust(deg$pvalue, method='fdr')

Inspect top genes

deg <- deg %>% filter(padj <= 0.05)
head(deg)
##                       logFC  AveExpr       pvalue     gene        padj
## ENSG00000103056  0.20733606 5.162585 1.140226e-06    SMPD3 0.001407039
## ENSG00000111321  0.15518002 6.660071 6.433843e-06     LTBR 0.003969681
## ENSG00000118454 -0.21060787 3.027824 2.482431e-05 ANKRD13C 0.008397916
## ENSG00000054116  0.06261219 5.856389 2.970455e-05  TRAPPC3 0.008397916
## ENSG00000105438  0.09700830 6.254904 3.402721e-05   KDELR1 0.008397916
## ENSG00000158793  0.07877899 5.187106 6.174264e-05     NIT1 0.012698403
unique(deg$gene)
##  [1] "SMPD3"    "LTBR"     "ANKRD13C" "TRAPPC3"  "KDELR1"   "NIT1"    
##  [7] "TAPBP"    "ZNF672"   "KIAA1143" "METTL15"  "TUBA1B"   "TFEB"    
## [13] "TPP2"     "TNFRSF1A" "CLASP2"   "FAM133B"  "GTF2H3"   "CSNK1G3" 
## [19] "JUNB"     "MNAT1"    "TRAF7"    "AIMP1"    "CNGB1"    "ATF5"    
## [25] "TUBE1"    "GADD45B"  "PSEN2"    "USP21"    "ZFPL1"    "PPP1R11" 
## [31] "POLDIP3"  "CEP120"   "LMAN2"    "HIBADH"   "BCL3"     "C11orf68"
## [37] "USB1"     "UBE2L3"   "C6orf106" "TTC17"    "ANKRD46"  "FAM122A" 
## [43] "PRCC"     "DEDD"     "B3GALT4"  "CD74"     "MUM1"     "MAST1"   
## [49] "GMPPA"    "SAPCD1"   "DNASE1L2" "CTSA"     "CSNK2B"   "POLA2"   
## [55] "ATG2B"    "RHOA"     "MPV17L2"  "CYB5R3"   "ZBTB7B"   "ARHGAP5" 
## [61] "WDR5"     "NFKBIB"   "SNX14"    "SMC2"     "C10orf54" "NELFB"   
## [67] "STT3B"    "ACOT8"    "TRIP13"   "PURB"     "TMEM175"  "TBCK"    
## [73] "NEU1"     "RNASEH2A" "C2"       "ZNF350"   "TLR5"     "ICAM1"   
## [79] "TBC1D22A" "GPSM3"    "GBA2"     "ZNF649"   "VRK3"

Save

save(deg, file='../GOTO_Data/DEGs/Blood-DEG.Rdata')

Session Info

sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] MASS_7.3-60                            
##  [2] DNAmArray_2.0.0                        
##  [3] pls_2.8-2                              
##  [4] FDb.InfiniumMethylation.hg19_2.2.0     
##  [5] org.Hs.eg.db_3.14.0                    
##  [6] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
##  [7] GenomicFeatures_1.46.5                 
##  [8] AnnotationDbi_1.56.2                   
##  [9] minfi_1.40.0                           
## [10] bumphunter_1.36.0                      
## [11] locfit_1.5-9.8                         
## [12] iterators_1.0.14                       
## [13] foreach_1.5.2                          
## [14] Biostrings_2.62.0                      
## [15] XVector_0.34.0                         
## [16] SummarizedExperiment_1.24.0            
## [17] Biobase_2.58.0                         
## [18] MatrixGenerics_1.10.0                  
## [19] matrixStats_1.0.0                      
## [20] ggpubr_0.4.0                           
## [21] GenomicRanges_1.46.1                   
## [22] GenomeInfoDb_1.34.9                    
## [23] IRanges_2.32.0                         
## [24] S4Vectors_0.36.2                       
## [25] BiocGenerics_0.44.0                    
## [26] ggrepel_0.9.1                          
## [27] forcats_0.5.2                          
## [28] stringr_1.5.0                          
## [29] dplyr_1.1.3                            
## [30] purrr_0.3.4                            
## [31] readr_2.1.2                            
## [32] tidyr_1.2.1                            
## [33] tibble_3.2.1                           
## [34] ggplot2_3.4.3                          
## [35] tidyverse_1.3.2                        
## [36] rmarkdown_2.16                         
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.4.1              backports_1.4.1          
##   [3] BiocFileCache_2.2.1       plyr_1.8.8               
##   [5] splines_4.2.2             BiocParallel_1.32.6      
##   [7] digest_0.6.31             htmltools_0.5.5          
##   [9] fansi_1.0.4               magrittr_2.0.3           
##  [11] memoise_2.0.1             googlesheets4_1.0.1      
##  [13] limma_3.54.2              tzdb_0.4.0               
##  [15] annotate_1.72.0           cinaR_0.2.3              
##  [17] modelr_0.1.9              askpass_1.1              
##  [19] timechange_0.2.0          siggenes_1.68.0          
##  [21] prettyunits_1.1.1         colorspace_2.1-0         
##  [23] blob_1.2.4                rvest_1.0.3              
##  [25] rappdirs_0.3.3            haven_2.5.1              
##  [27] xfun_0.39                 crayon_1.5.2             
##  [29] RCurl_1.98-1.12           jsonlite_1.8.5           
##  [31] genefilter_1.76.0         GEOquery_2.62.2          
##  [33] survival_3.5-5            glue_1.6.2               
##  [35] gtable_0.3.3              gargle_1.5.0             
##  [37] zlibbioc_1.44.0           htm2txt_2.2.2            
##  [39] DelayedArray_0.24.0       car_3.1-0                
##  [41] Rhdf5lib_1.20.0           HDF5Array_1.22.1         
##  [43] abind_1.4-5               scales_1.2.1             
##  [45] edgeR_3.40.2              DBI_1.1.3                
##  [47] rngtools_1.5.2            rstatix_0.7.0            
##  [49] Rcpp_1.0.10               xtable_1.8-4             
##  [51] progress_1.2.2            bit_4.0.5                
##  [53] mclust_6.0.0              preprocessCore_1.60.2    
##  [55] httr_1.4.6                RColorBrewer_1.1-3       
##  [57] ellipsis_0.3.2            pkgconfig_2.0.3          
##  [59] reshape_0.8.9             XML_3.99-0.14            
##  [61] sass_0.4.6                dbplyr_2.2.1             
##  [63] utf8_1.2.3                reshape2_1.4.4           
##  [65] tidyselect_1.2.0          rlang_1.1.1              
##  [67] munsell_0.5.0             cellranger_1.1.0         
##  [69] tools_4.2.2               cachem_1.0.8             
##  [71] cli_3.6.1                 generics_0.1.3           
##  [73] RSQLite_2.2.17            broom_1.0.1              
##  [75] evaluate_0.21             fastmap_1.1.1            
##  [77] yaml_2.3.7                knitr_1.43               
##  [79] bit64_4.0.5               fs_1.6.2                 
##  [81] beanplot_1.3.1            scrime_1.3.5             
##  [83] KEGGREST_1.34.0           nlme_3.1-162             
##  [85] doRNG_1.8.6               sparseMatrixStats_1.10.0 
##  [87] nor1mix_1.3-0             xml2_1.3.4               
##  [89] biomaRt_2.50.3            compiler_4.2.2           
##  [91] rstudioapi_0.14           filelock_1.0.2           
##  [93] curl_5.0.1                png_0.1-8                
##  [95] ggsignif_0.6.3            reprex_2.0.2             
##  [97] bslib_0.5.0               stringi_1.7.12           
##  [99] lattice_0.21-8            Matrix_1.5-4.1           
## [101] multtest_2.50.0           vctrs_0.6.3              
## [103] pillar_1.9.0              lifecycle_1.0.3          
## [105] rhdf5filters_1.10.1       jquerylib_0.1.4          
## [107] data.table_1.14.8         bitops_1.0-7             
## [109] rtracklayer_1.54.0        R6_2.5.1                 
## [111] BiocIO_1.8.0              codetools_0.2-19         
## [113] assertthat_0.2.1          rhdf5_2.42.1             
## [115] openssl_2.0.6             rjson_0.2.21             
## [117] withr_2.5.0               GenomicAlignments_1.30.0 
## [119] Rsamtools_2.10.0          GenomeInfoDbData_1.2.9   
## [121] hms_1.1.2                 quadprog_1.5-8           
## [123] grid_4.2.2                base64_2.0.1             
## [125] DelayedMatrixStats_1.16.0 illuminaio_0.40.0        
## [127] carData_3.0-5             googledrive_2.0.0        
## [129] lubridate_1.9.2           restfulr_0.0.15

Clear

rm(list=ls())