This script saves top CpGs for HOMER TFBS enrichment analysis, their nearest genes, and reports any DMRs.


Setup

Load packages

library(ggrepel)
library(GenomicRanges)
library(ggpubr)
library(DNAmArray)
library(tidyverse)

Load data

load("../GOTO_Data/GOTO_results-top-blood.Rdata")
head(top_cpgs)
##                   cpg         beta           SE            p     padj_fdr
## cg16209776 cg16209776 -0.011702520 0.0023037182 2.512802e-09 0.0009495591
## cg23961638 cg23961638 -0.002938909 0.0005754124 2.061351e-09 0.0009495591
## cg25040733 cg25040733  0.012057441 0.0024292987 8.118415e-09 0.0012271422
## cg13479371 cg13479371 -0.005003661 0.0010128031 6.718311e-09 0.0012271422
## cg26201957 cg26201957 -0.027834573 0.0056150131 5.977380e-09 0.0012271422
## cg08769073 cg08769073  0.008630337 0.0017640579 1.323098e-08 0.0016666121
##                    t   N
## cg16209776 -5.960622 196
## cg23961638 -5.992897 194
## cg25040733  5.765981 194
## cg13479371 -5.797818 196
## cg26201957 -5.817389 195
## cg08769073  5.683055 196
dim(top_cpgs)
## [1] 441   7

Annotation

manifest_hg19 (fetched in 2023 from https://zwdzwd.github.io/InfiniumAnnotation)

  • probeID as cpg - CpG ID
  • CpG_chrm as cpg_chr_hg19 - chromosome (hg19)
  • CpG_beg as cpg_start_hg19 - CpG start position (hg19)
  • CpG_end as cpg_end_hg19 - CpG end position (hg19)
  • probe_strand as cpg_strand - strand
  • gene_HGNC
manifest_hg19 <- read_tsv(
  "/exports/molepi/users/ljsinke/LLS/Shared_Data/Manifests/EPIC.hg19.manifest.tsv.gz")
## Rows: 865918 Columns: 57
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (21): CpG_chrm, probe_strand, probeID, channel, designType, nextBase, ne...
## dbl (24): CpG_beg, CpG_end, address_A, address_B, probeCpGcnt, context35, pr...
## lgl (12): posMatch, MASK_mapping, MASK_typeINextBaseSwitch, MASK_rmsk15, MAS...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
anno <- manifest_hg19 %>% 
  dplyr::select(
    cpg = probeID,
    cpg_chr = CpG_chrm,
    cpg_start = CpG_beg,
    cpg_end = CpG_end,
    cpg_strand = probe_strand,
    gene_HGNC
  ) %>% 
  mutate(
    cpg_chr = substr(cpg_chr,4,5)
  )

anno <- anno %>% 
  dplyr::filter(cpg %in% top_cpgs$cpg)
manifest_chrom <- read_tsv(
  "/exports/molepi/users/ljsinke/LLS/Shared_Data/Manifests/EPIC.hg19.REMC.chromHMM.tsv.gz"
)
## Rows: 865918 Columns: 131
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (129): CpG_chrm, probeID, E001, E002, E003, E004, E005, E006, E007, E008...
## dbl   (2): CpG_beg, CpG_end
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
manifest_chrom <- manifest_chrom %>% 
  dplyr::select(
    cpg = probeID,
    E062)

anno <- left_join(
  anno, manifest_chrom,
  by="cpg"
)

top_cpgs <- left_join(top_cpgs, anno, by="cpg")
head(top_cpgs)
##          cpg         beta           SE            p     padj_fdr         t   N
## 1 cg16209776 -0.011702520 0.0023037182 2.512802e-09 0.0009495591 -5.960622 196
## 2 cg23961638 -0.002938909 0.0005754124 2.061351e-09 0.0009495591 -5.992897 194
## 3 cg25040733  0.012057441 0.0024292987 8.118415e-09 0.0012271422  5.765981 194
## 4 cg13479371 -0.005003661 0.0010128031 6.718311e-09 0.0012271422 -5.797818 196
## 5 cg26201957 -0.027834573 0.0056150131 5.977380e-09 0.0012271422 -5.817389 195
## 6 cg08769073  0.008630337 0.0017640579 1.323098e-08 0.0016666121  5.683055 196
##   cpg_chr cpg_start   cpg_end cpg_strand        gene_HGNC       E062
## 1      12  63177893  63177895          +            PPM1H   15_Quies
## 2       4 153700210 153700212          +     ARFIP1;TIGD4 2_TssAFlnk
## 3       5 141704733 141704735          - AC005592.2;SPRY4 11_BivFlnk
## 4      18  72530930  72530932          -           ZNF407   15_Quies
## 5      11  82601012  82601014          -             PRCP 2_TssAFlnk
## 6      11  12214803  12214805          -           MICAL2     5_TxWk

Homer

CpGs for HOMER

homer <- top_cpgs %>% 
  mutate(
    cpg_chr = paste0('chr', cpg_chr)) %>% 
  dplyr::select(
    cpg,
    chr = cpg_chr,
    start = cpg_start,
    end = cpg_end,
    strand = cpg_strand
  ) 

write_tsv(homer, file="../GOTO_Data/Homer/GOTO_Homer-Blood.tsv")

Nearest Gene

Save significant CpGs

print(paste0("There are ", 
             nrow(top_cpgs), 
             " CpGs significant at 5% FDR level"))
## [1] "There are 441 CpGs significant at 5% FDR level"
cpg_top <- top_cpgs$cpg

Save gene locations (saved previously)

txdb <- makeTxDbFromEnsembl(organism = "Homo sapiens",
                            release = 75,
                            circ_seqs = NULL,
                            server = "ensembldb.ensembl.org",
                            username = "anonymous", password = NULL, port = 0L,
                            tx_attrib = NULL)

gene_range <- unlist(cdsBy(txdb, by = "gene"))
gene_range <- unique(gene_range)

save(gene_range, file='/exports/molepi/users/ljsinke/LLS/Shared_Data/allGeneRanges.Rdata)

Load gene_range object

load('/exports/molepi/users/ljsinke/LLS/Shared_Data/allGeneRanges.Rdata')
gene_range
## GRanges object with 302587 ranges and 2 metadata columns:
##                   seqnames            ranges strand |    cds_id        cds_name
##                      <Rle>         <IRanges>  <Rle> | <integer>     <character>
##   ENSG00000000003        X 99885795-99885863      - |    717267 ENSP00000362111
##   ENSG00000000003        X 99887482-99887565      - |    717268 ENSP00000362111
##   ENSG00000000003        X 99888402-99888536      - |    717269 ENSP00000362111
##   ENSG00000000003        X 99888928-99889026      - |    717270 ENSP00000362111
##   ENSG00000000003        X 99890175-99890249      - |    717271 ENSP00000362111
##               ...      ...               ...    ... .       ...             ...
##            LRG_97   LRG_97       17272-17334      + |    799400        LRG_97p1
##            LRG_97   LRG_97       17876-18035      + |    799401        LRG_97p1
##            LRG_97   LRG_97       22463-22593      + |    799402        LRG_97p1
##            LRG_98   LRG_98       10293-13424      + |    799403        LRG_98p1
##            LRG_99   LRG_99        9069-10652      + |    799404        LRG_99p1
##   -------
##   seqinfo: 722 sequences (1 circular) from an unspecified genome

Make a GenomicRanges object for the top CpGs

cpg_range <- GRanges(seqnames = top_cpgs$cpg_chr,
        IRanges(
          start = top_cpgs$cpg_start,
          end = top_cpgs$cpg_end,
          names = top_cpgs$cpg
        ))

cpg_range
## GRanges object with 441 ranges and 0 metadata columns:
##              seqnames              ranges strand
##                 <Rle>           <IRanges>  <Rle>
##   cg16209776       12   63177893-63177895      *
##   cg23961638        4 153700210-153700212      *
##   cg25040733        5 141704733-141704735      *
##   cg13479371       18   72530930-72530932      *
##   cg26201957       11   82601012-82601014      *
##          ...      ...                 ...    ...
##   cg24562746       19   44598111-44598113      *
##   cg15400818        5   89767373-89767375      *
##   cg24899571       11 133938809-133938811      *
##   cg25828097        2   27656864-27656866      *
##   cg25967669       11     8430495-8430497      *
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths

Initialize a distance matrix

distance.matrix <- matrix(
  NaN, 
  nrow = length(gene_range), 
  ncol = length(cpg_range), 
  dimnames = list(names(gene_range), 
                  names(cpg_range)))

Loop through CpGs

for(i in 1:nrow(top_cpgs)){
    cpg <- cpg_range[i]
    calc.distance <- distance(
      cpg, 
      gene_range, 
      ignore.strand = T)
    distance.matrix[,i] <- calc.distance
}

dim(distance.matrix)
## [1] 302587    441
distance.matrix <- as.data.frame(distance.matrix)
colnames(distance.matrix) <- names(cpg_range)
distance.matrix$gene <- names(gene_range)

Make a list of nearest genes

gene_list <- lapply(names(cpg_range), function(x){
  df <- (distance.matrix %>% dplyr::select(gene, all_of(x)))
  df <- df[rowSums(is.na(df)) == 0,]
  return(unique(df[df[,2] == min(df[,2]), ]))
})

gene_list[[1]]
##                              gene cg16209776
## ENSG00000111110.6 ENSG00000111110       4110

Save gene symbols

ens2gene <- cinaR::grch37

Bind

df <- data.frame()

for(i in 1:length(gene_list)){
  df <- rbind(df, 
              data.frame(gene_nearest_ens = gene_list[[i]]$gene,
                     cpg = colnames(gene_list[[i]])[2],
                     gene_nearest_dist = gene_list[[i]][1,2])
                         )
}

sym <- ens2gene$symbol[match(df$gene_nearest_ens,
                             ens2gene$ensgene)]

df$gene_nearest_name <- sym

df <- df[match(cpg_top, df$cpg),]
top_cpgs <- top_cpgs[match(cpg_top, top_cpgs$cpg),]

top_cpgs <- left_join(top_cpgs, df, by = "cpg")
head(top_cpgs)
##          cpg         beta           SE            p     padj_fdr         t   N
## 1 cg16209776 -0.011702520 0.0023037182 2.512802e-09 0.0009495591 -5.960622 196
## 2 cg23961638 -0.002938909 0.0005754124 2.061351e-09 0.0009495591 -5.992897 194
## 3 cg25040733  0.012057441 0.0024292987 8.118415e-09 0.0012271422  5.765981 194
## 4 cg13479371 -0.005003661 0.0010128031 6.718311e-09 0.0012271422 -5.797818 196
## 5 cg26201957 -0.027834573 0.0056150131 5.977380e-09 0.0012271422 -5.817389 195
## 6 cg08769073  0.008630337 0.0017640579 1.323098e-08 0.0016666121  5.683055 196
##   cpg_chr cpg_start   cpg_end cpg_strand        gene_HGNC       E062
## 1      12  63177893  63177895          +            PPM1H   15_Quies
## 2       4 153700210 153700212          +     ARFIP1;TIGD4 2_TssAFlnk
## 3       5 141704733 141704735          - AC005592.2;SPRY4 11_BivFlnk
## 4      18  72530930  72530932          -           ZNF407   15_Quies
## 5      11  82601012  82601014          -             PRCP 2_TssAFlnk
## 6      11  12214803  12214805          -           MICAL2     5_TxWk
##   gene_nearest_ens gene_nearest_dist gene_nearest_name
## 1  ENSG00000111110              4110             PPM1H
## 2  ENSG00000169989              8053             TIGD4
## 3  ENSG00000187678              5402             SPRY4
## 4  ENSG00000215421             14904            ZNF407
## 5  ENSG00000137509              5084              PRCP
## 6  ENSG00000133816             10991            MICAL2

DMR finder

Make DMR data frame

dmr_cpgs <- top_cpgs %>% 
  dplyr::select(
    chromosome = cpg_chr,
    start = cpg_start,
    padj = padj_fdr)

Create significance indicator

dmr_cpgs <- dmr_cpgs %>% mutate(
    crit = ifelse(padj <= 0.05, 1, 0))

Arrange by genomic position

dmr_cpgs <- dmr_cpgs %>% 
  arrange(chromosome, start) %>% 
  dplyr::select(chromosome, start, crit)

Initialize variables

chromosome=1:22
MAXIMUM_REGION_LENGTH = 1000
mismatches = 3

Run DMRfinder

for(x in chromosome){

tryCatch({chr1 = dmr_cpgs[dmr_cpgs[,1]==x,]
if(nrow(chr1) >= 1){
chr1 <- chr1 %>% arrange(start)
chr.final = data.frame(
  coord = chr1$start,
  crit = chr1$crit
)

last_coordinate = length( chr.final$crit )
next_coordinate = 0

for (i in 1:(last_coordinate-1)) {
      if ( i>=next_coordinate ) {
        if (chr.final$crit[ i ]==1) {
          start_location = chr.final$coord[ i ]
          last_visited_crit_loc = start_location
          sum_of_ones = 1
          number_of_items = 1
          
          # start crawling loop
          for (j in (i+1):last_coordinate ) {
            if (chr.final$coord[ j ] > (last_visited_crit_loc + MAXIMUM_REGION_LENGTH)) { break }
            if((number_of_items-sum_of_ones)>mismatches) { break }   #Number of mismatches
            number_of_items = number_of_items + 1
            if (chr.final$crit[j]==1) { 
              last_visited_crit_loc = chr.final$coord[ j ]
              sum_of_ones = sum_of_ones + 1 
            }
          }
          
          # now check if the result is good enough
          if (sum_of_ones>=3) {
            last_one=i+number_of_items-1
            for (k in (i+number_of_items-1):1) {
              if ( chr.final$crit[k] == 0 ) {
                last_one = last_one - 1
                number_of_items = number_of_items - 1
              }
              else {
                break
              }
            }
            cat(x, ';',start_location,";",chr.final$coord[last_one],";",sum_of_ones/number_of_items,"\n")
            next_coordinate = last_one + 1
          }
        }
      }
}
}}, error=function(e){})
}

Save

top_cpgs <- top_cpgs %>% arrange(desc(padj_fdr))
save(top_cpgs, file='../GOTO_Data/GOTO_results-top-blood.Rdata')
write_csv(top_cpgs, file='../GOTO_Data/Tables/ST17.csv')

Session Info

sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.4.0                                       
##  [2] ggrepel_0.9.1                                      
##  [3] lmerTest_3.1-3                                     
##  [4] lme4_1.1-30                                        
##  [5] Matrix_1.5-4.1                                     
##  [6] IlluminaHumanMethylation450kmanifest_0.4.0         
##  [7] IlluminaHumanMethylationEPICmanifest_0.3.0         
##  [8] FlowSorted.BloodExtended.EPIC_1.1.1                
##  [9] FlowSorted.Blood.EPIC_1.12.1                       
## [10] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
## [11] nlme_3.1-162                                       
## [12] quadprog_1.5-8                                     
## [13] genefilter_1.76.0                                  
## [14] ExperimentHub_2.2.1                                
## [15] AnnotationHub_3.2.2                                
## [16] BiocFileCache_2.2.1                                
## [17] dbplyr_2.2.1                                       
## [18] MuSiC_0.2.0                                        
## [19] nnls_1.4                                           
## [20] lubridate_1.9.2                                    
## [21] DNAmArray_2.0.0                                    
## [22] pls_2.8-2                                          
## [23] FDb.InfiniumMethylation.hg19_2.2.0                 
## [24] org.Hs.eg.db_3.14.0                                
## [25] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2            
## [26] GenomicFeatures_1.46.5                             
## [27] AnnotationDbi_1.56.2                               
## [28] minfi_1.40.0                                       
## [29] bumphunter_1.36.0                                  
## [30] locfit_1.5-9.8                                     
## [31] iterators_1.0.14                                   
## [32] foreach_1.5.2                                      
## [33] Biostrings_2.62.0                                  
## [34] XVector_0.34.0                                     
## [35] SummarizedExperiment_1.24.0                        
## [36] Biobase_2.58.0                                     
## [37] MatrixGenerics_1.10.0                              
## [38] matrixStats_1.0.0                                  
## [39] GenomicRanges_1.46.1                               
## [40] GenomeInfoDb_1.34.9                                
## [41] IRanges_2.32.0                                     
## [42] S4Vectors_0.36.2                                   
## [43] BiocGenerics_0.44.0                                
## [44] forcats_0.5.2                                      
## [45] stringr_1.5.0                                      
## [46] dplyr_1.1.3                                        
## [47] purrr_0.3.4                                        
## [48] readr_2.1.2                                        
## [49] tidyr_1.2.1                                        
## [50] tibble_3.2.1                                       
## [51] ggplot2_3.4.3                                      
## [52] tidyverse_1.3.2                                    
## [53] cli_3.6.1                                          
## [54] htmltools_0.5.5                                    
## [55] rlang_1.1.1                                        
## [56] rmarkdown_2.16                                     
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.3                    tidyselect_1.2.0             
##   [3] RSQLite_2.2.17                grid_4.2.2                   
##   [5] BiocParallel_1.32.6           cinaR_0.2.3                  
##   [7] munsell_0.5.0                 codetools_0.2-19             
##   [9] preprocessCore_1.60.2         withr_2.5.0                  
##  [11] colorspace_2.1-0              filelock_1.0.2               
##  [13] highr_0.10                    knitr_1.43                   
##  [15] rstudioapi_0.14               ggsignif_0.6.3               
##  [17] labeling_0.4.2                GenomeInfoDbData_1.2.9       
##  [19] MCMCpack_1.6-3                farver_2.1.1                 
##  [21] bit64_4.0.5                   rhdf5_2.42.1                 
##  [23] coda_0.19-4                   vctrs_0.6.3                  
##  [25] generics_0.1.3                xfun_0.39                    
##  [27] timechange_0.2.0              R6_2.5.1                     
##  [29] illuminaio_0.40.0             bitops_1.0-7                 
##  [31] rhdf5filters_1.10.1           cachem_1.0.8                 
##  [33] reshape_0.8.9                 DelayedArray_0.24.0          
##  [35] assertthat_0.2.1              vroom_1.5.7                  
##  [37] promises_1.2.0.1              BiocIO_1.8.0                 
##  [39] scales_1.2.1                  googlesheets4_1.0.1          
##  [41] gtable_0.3.3                  mcmc_0.9-7                   
##  [43] MatrixModels_0.5-1            splines_4.2.2                
##  [45] rstatix_0.7.0                 rtracklayer_1.54.0           
##  [47] gargle_1.5.0                  GEOquery_2.62.2              
##  [49] htm2txt_2.2.2                 broom_1.0.1                  
##  [51] abind_1.4-5                   BiocManager_1.30.21          
##  [53] yaml_2.3.7                    reshape2_1.4.4               
##  [55] modelr_0.1.9                  backports_1.4.1              
##  [57] httpuv_1.6.11                 tools_4.2.2                  
##  [59] nor1mix_1.3-0                 ellipsis_0.3.2               
##  [61] jquerylib_0.1.4               RColorBrewer_1.1-3           
##  [63] siggenes_1.68.0               Rcpp_1.0.10                  
##  [65] plyr_1.8.8                    sparseMatrixStats_1.10.0     
##  [67] progress_1.2.2                zlibbioc_1.44.0              
##  [69] RCurl_1.98-1.12               prettyunits_1.1.1            
##  [71] openssl_2.0.6                 haven_2.5.1                  
##  [73] fs_1.6.2                      magrittr_2.0.3               
##  [75] data.table_1.14.8             SparseM_1.81                 
##  [77] reprex_2.0.2                  googledrive_2.0.0            
##  [79] mime_0.12                     hms_1.1.2                    
##  [81] evaluate_0.21                 xtable_1.8-4                 
##  [83] XML_3.99-0.14                 mclust_6.0.0                 
##  [85] readxl_1.4.1                  compiler_4.2.2               
##  [87] biomaRt_2.50.3                crayon_1.5.2                 
##  [89] minqa_1.2.5                   later_1.3.1                  
##  [91] tzdb_0.4.0                    DBI_1.1.3                    
##  [93] MASS_7.3-60                   rappdirs_0.3.3               
##  [95] boot_1.3-28.1                 car_3.1-0                    
##  [97] pkgconfig_2.0.3               GenomicAlignments_1.30.0     
##  [99] numDeriv_2016.8-1.1           xml2_1.3.4                   
## [101] annotate_1.72.0               bslib_0.5.0                  
## [103] rngtools_1.5.2                multtest_2.50.0              
## [105] beanplot_1.3.1                rvest_1.0.3                  
## [107] doRNG_1.8.6                   scrime_1.3.5                 
## [109] digest_0.6.31                 base64_2.0.1                 
## [111] cellranger_1.1.0              edgeR_3.40.2                 
## [113] DelayedMatrixStats_1.16.0     restfulr_0.0.15              
## [115] curl_5.0.1                    shiny_1.7.2                  
## [117] Rsamtools_2.10.0              quantreg_5.94                
## [119] nloptr_2.0.3                  rjson_0.2.21                 
## [121] lifecycle_1.0.3               jsonlite_1.8.5               
## [123] Rhdf5lib_1.20.0               carData_3.0-5                
## [125] askpass_1.1                   limma_3.54.2                 
## [127] fansi_1.0.4                   pillar_1.9.0                 
## [129] lattice_0.21-8                KEGGREST_1.34.0              
## [131] fastmap_1.1.1                 httr_1.4.6                   
## [133] survival_3.5-5                interactiveDisplayBase_1.32.0
## [135] glue_1.6.2                    png_0.1-8                    
## [137] BiocVersion_3.16.0            bit_4.0.5                    
## [139] stringi_1.7.12                sass_0.4.6                   
## [141] HDF5Array_1.22.1              blob_1.2.4                   
## [143] memoise_2.0.1

Clear

rm(list=ls())