This script saves top CpGs for HOMER TFBS enrichment analysis, their nearest genes, and reports any DMRs.
Load packages
library(ggrepel)
library(GenomicRanges)
library(ggpubr)
library(DNAmArray)
library(tidyverse)
Load data
load("../GOTO_Data/GOTO_results-top-blood.Rdata")
head(top_cpgs)
## cpg beta SE p padj_fdr
## cg16209776 cg16209776 -0.011702520 0.0023037182 2.512802e-09 0.0009495591
## cg23961638 cg23961638 -0.002938909 0.0005754124 2.061351e-09 0.0009495591
## cg25040733 cg25040733 0.012057441 0.0024292987 8.118415e-09 0.0012271422
## cg13479371 cg13479371 -0.005003661 0.0010128031 6.718311e-09 0.0012271422
## cg26201957 cg26201957 -0.027834573 0.0056150131 5.977380e-09 0.0012271422
## cg08769073 cg08769073 0.008630337 0.0017640579 1.323098e-08 0.0016666121
## t N
## cg16209776 -5.960622 196
## cg23961638 -5.992897 194
## cg25040733 5.765981 194
## cg13479371 -5.797818 196
## cg26201957 -5.817389 195
## cg08769073 5.683055 196
dim(top_cpgs)
## [1] 441 7
manifest_hg19
(fetched in 2023 from https://zwdzwd.github.io/InfiniumAnnotation)
probeID
as cpg
- CpG IDCpG_chrm
as cpg_chr_hg19
- chromosome (hg19)CpG_beg
as cpg_start_hg19
- CpG start position (hg19)CpG_end
as cpg_end_hg19
- CpG end position (hg19)probe_strand
as cpg_strand
- strandgene_HGNC
manifest_hg19 <- read_tsv(
"/exports/molepi/users/ljsinke/LLS/Shared_Data/Manifests/EPIC.hg19.manifest.tsv.gz")
## Rows: 865918 Columns: 57
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (21): CpG_chrm, probe_strand, probeID, channel, designType, nextBase, ne...
## dbl (24): CpG_beg, CpG_end, address_A, address_B, probeCpGcnt, context35, pr...
## lgl (12): posMatch, MASK_mapping, MASK_typeINextBaseSwitch, MASK_rmsk15, MAS...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
anno <- manifest_hg19 %>%
dplyr::select(
cpg = probeID,
cpg_chr = CpG_chrm,
cpg_start = CpG_beg,
cpg_end = CpG_end,
cpg_strand = probe_strand,
gene_HGNC
) %>%
mutate(
cpg_chr = substr(cpg_chr,4,5)
)
anno <- anno %>%
dplyr::filter(cpg %in% top_cpgs$cpg)
manifest_chrom <- read_tsv(
"/exports/molepi/users/ljsinke/LLS/Shared_Data/Manifests/EPIC.hg19.REMC.chromHMM.tsv.gz"
)
## Rows: 865918 Columns: 131
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (129): CpG_chrm, probeID, E001, E002, E003, E004, E005, E006, E007, E008...
## dbl (2): CpG_beg, CpG_end
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
manifest_chrom <- manifest_chrom %>%
dplyr::select(
cpg = probeID,
E062)
anno <- left_join(
anno, manifest_chrom,
by="cpg"
)
top_cpgs <- left_join(top_cpgs, anno, by="cpg")
head(top_cpgs)
## cpg beta SE p padj_fdr t N
## 1 cg16209776 -0.011702520 0.0023037182 2.512802e-09 0.0009495591 -5.960622 196
## 2 cg23961638 -0.002938909 0.0005754124 2.061351e-09 0.0009495591 -5.992897 194
## 3 cg25040733 0.012057441 0.0024292987 8.118415e-09 0.0012271422 5.765981 194
## 4 cg13479371 -0.005003661 0.0010128031 6.718311e-09 0.0012271422 -5.797818 196
## 5 cg26201957 -0.027834573 0.0056150131 5.977380e-09 0.0012271422 -5.817389 195
## 6 cg08769073 0.008630337 0.0017640579 1.323098e-08 0.0016666121 5.683055 196
## cpg_chr cpg_start cpg_end cpg_strand gene_HGNC E062
## 1 12 63177893 63177895 + PPM1H 15_Quies
## 2 4 153700210 153700212 + ARFIP1;TIGD4 2_TssAFlnk
## 3 5 141704733 141704735 - AC005592.2;SPRY4 11_BivFlnk
## 4 18 72530930 72530932 - ZNF407 15_Quies
## 5 11 82601012 82601014 - PRCP 2_TssAFlnk
## 6 11 12214803 12214805 - MICAL2 5_TxWk
CpGs for HOMER
homer <- top_cpgs %>%
mutate(
cpg_chr = paste0('chr', cpg_chr)) %>%
dplyr::select(
cpg,
chr = cpg_chr,
start = cpg_start,
end = cpg_end,
strand = cpg_strand
)
write_tsv(homer, file="../GOTO_Data/Homer/GOTO_Homer-Blood.tsv")
Save significant CpGs
print(paste0("There are ",
nrow(top_cpgs),
" CpGs significant at 5% FDR level"))
## [1] "There are 441 CpGs significant at 5% FDR level"
cpg_top <- top_cpgs$cpg
Save gene locations (saved previously)
txdb <- makeTxDbFromEnsembl(organism = "Homo sapiens",
release = 75,
circ_seqs = NULL,
server = "ensembldb.ensembl.org",
username = "anonymous", password = NULL, port = 0L,
tx_attrib = NULL)
gene_range <- unlist(cdsBy(txdb, by = "gene"))
gene_range <- unique(gene_range)
save(gene_range, file='/exports/molepi/users/ljsinke/LLS/Shared_Data/allGeneRanges.Rdata)
Load gene_range
object
load('/exports/molepi/users/ljsinke/LLS/Shared_Data/allGeneRanges.Rdata')
gene_range
## GRanges object with 302587 ranges and 2 metadata columns:
## seqnames ranges strand | cds_id cds_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## ENSG00000000003 X 99885795-99885863 - | 717267 ENSP00000362111
## ENSG00000000003 X 99887482-99887565 - | 717268 ENSP00000362111
## ENSG00000000003 X 99888402-99888536 - | 717269 ENSP00000362111
## ENSG00000000003 X 99888928-99889026 - | 717270 ENSP00000362111
## ENSG00000000003 X 99890175-99890249 - | 717271 ENSP00000362111
## ... ... ... ... . ... ...
## LRG_97 LRG_97 17272-17334 + | 799400 LRG_97p1
## LRG_97 LRG_97 17876-18035 + | 799401 LRG_97p1
## LRG_97 LRG_97 22463-22593 + | 799402 LRG_97p1
## LRG_98 LRG_98 10293-13424 + | 799403 LRG_98p1
## LRG_99 LRG_99 9069-10652 + | 799404 LRG_99p1
## -------
## seqinfo: 722 sequences (1 circular) from an unspecified genome
Make a GenomicRanges
object for the top CpGs
cpg_range <- GRanges(seqnames = top_cpgs$cpg_chr,
IRanges(
start = top_cpgs$cpg_start,
end = top_cpgs$cpg_end,
names = top_cpgs$cpg
))
cpg_range
## GRanges object with 441 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## cg16209776 12 63177893-63177895 *
## cg23961638 4 153700210-153700212 *
## cg25040733 5 141704733-141704735 *
## cg13479371 18 72530930-72530932 *
## cg26201957 11 82601012-82601014 *
## ... ... ... ...
## cg24562746 19 44598111-44598113 *
## cg15400818 5 89767373-89767375 *
## cg24899571 11 133938809-133938811 *
## cg25828097 2 27656864-27656866 *
## cg25967669 11 8430495-8430497 *
## -------
## seqinfo: 23 sequences from an unspecified genome; no seqlengths
Initialize a distance matrix
distance.matrix <- matrix(
NaN,
nrow = length(gene_range),
ncol = length(cpg_range),
dimnames = list(names(gene_range),
names(cpg_range)))
Loop through CpGs
for(i in 1:nrow(top_cpgs)){
cpg <- cpg_range[i]
calc.distance <- distance(
cpg,
gene_range,
ignore.strand = T)
distance.matrix[,i] <- calc.distance
}
dim(distance.matrix)
## [1] 302587 441
distance.matrix <- as.data.frame(distance.matrix)
colnames(distance.matrix) <- names(cpg_range)
distance.matrix$gene <- names(gene_range)
Make a list of nearest genes
gene_list <- lapply(names(cpg_range), function(x){
df <- (distance.matrix %>% dplyr::select(gene, all_of(x)))
df <- df[rowSums(is.na(df)) == 0,]
return(unique(df[df[,2] == min(df[,2]), ]))
})
gene_list[[1]]
## gene cg16209776
## ENSG00000111110.6 ENSG00000111110 4110
Save gene symbols
ens2gene <- cinaR::grch37
Bind
df <- data.frame()
for(i in 1:length(gene_list)){
df <- rbind(df,
data.frame(gene_nearest_ens = gene_list[[i]]$gene,
cpg = colnames(gene_list[[i]])[2],
gene_nearest_dist = gene_list[[i]][1,2])
)
}
sym <- ens2gene$symbol[match(df$gene_nearest_ens,
ens2gene$ensgene)]
df$gene_nearest_name <- sym
df <- df[match(cpg_top, df$cpg),]
top_cpgs <- top_cpgs[match(cpg_top, top_cpgs$cpg),]
top_cpgs <- left_join(top_cpgs, df, by = "cpg")
head(top_cpgs)
## cpg beta SE p padj_fdr t N
## 1 cg16209776 -0.011702520 0.0023037182 2.512802e-09 0.0009495591 -5.960622 196
## 2 cg23961638 -0.002938909 0.0005754124 2.061351e-09 0.0009495591 -5.992897 194
## 3 cg25040733 0.012057441 0.0024292987 8.118415e-09 0.0012271422 5.765981 194
## 4 cg13479371 -0.005003661 0.0010128031 6.718311e-09 0.0012271422 -5.797818 196
## 5 cg26201957 -0.027834573 0.0056150131 5.977380e-09 0.0012271422 -5.817389 195
## 6 cg08769073 0.008630337 0.0017640579 1.323098e-08 0.0016666121 5.683055 196
## cpg_chr cpg_start cpg_end cpg_strand gene_HGNC E062
## 1 12 63177893 63177895 + PPM1H 15_Quies
## 2 4 153700210 153700212 + ARFIP1;TIGD4 2_TssAFlnk
## 3 5 141704733 141704735 - AC005592.2;SPRY4 11_BivFlnk
## 4 18 72530930 72530932 - ZNF407 15_Quies
## 5 11 82601012 82601014 - PRCP 2_TssAFlnk
## 6 11 12214803 12214805 - MICAL2 5_TxWk
## gene_nearest_ens gene_nearest_dist gene_nearest_name
## 1 ENSG00000111110 4110 PPM1H
## 2 ENSG00000169989 8053 TIGD4
## 3 ENSG00000187678 5402 SPRY4
## 4 ENSG00000215421 14904 ZNF407
## 5 ENSG00000137509 5084 PRCP
## 6 ENSG00000133816 10991 MICAL2
Make DMR data frame
dmr_cpgs <- top_cpgs %>%
dplyr::select(
chromosome = cpg_chr,
start = cpg_start,
padj = padj_fdr)
Create significance indicator
dmr_cpgs <- dmr_cpgs %>% mutate(
crit = ifelse(padj <= 0.05, 1, 0))
Arrange by genomic position
dmr_cpgs <- dmr_cpgs %>%
arrange(chromosome, start) %>%
dplyr::select(chromosome, start, crit)
Initialize variables
chromosome=1:22
MAXIMUM_REGION_LENGTH = 1000
mismatches = 3
Run DMRfinder
for(x in chromosome){
tryCatch({chr1 = dmr_cpgs[dmr_cpgs[,1]==x,]
if(nrow(chr1) >= 1){
chr1 <- chr1 %>% arrange(start)
chr.final = data.frame(
coord = chr1$start,
crit = chr1$crit
)
last_coordinate = length( chr.final$crit )
next_coordinate = 0
for (i in 1:(last_coordinate-1)) {
if ( i>=next_coordinate ) {
if (chr.final$crit[ i ]==1) {
start_location = chr.final$coord[ i ]
last_visited_crit_loc = start_location
sum_of_ones = 1
number_of_items = 1
# start crawling loop
for (j in (i+1):last_coordinate ) {
if (chr.final$coord[ j ] > (last_visited_crit_loc + MAXIMUM_REGION_LENGTH)) { break }
if((number_of_items-sum_of_ones)>mismatches) { break } #Number of mismatches
number_of_items = number_of_items + 1
if (chr.final$crit[j]==1) {
last_visited_crit_loc = chr.final$coord[ j ]
sum_of_ones = sum_of_ones + 1
}
}
# now check if the result is good enough
if (sum_of_ones>=3) {
last_one=i+number_of_items-1
for (k in (i+number_of_items-1):1) {
if ( chr.final$crit[k] == 0 ) {
last_one = last_one - 1
number_of_items = number_of_items - 1
}
else {
break
}
}
cat(x, ';',start_location,";",chr.final$coord[last_one],";",sum_of_ones/number_of_items,"\n")
next_coordinate = last_one + 1
}
}
}
}
}}, error=function(e){})
}
top_cpgs <- top_cpgs %>% arrange(desc(padj_fdr))
save(top_cpgs, file='../GOTO_Data/GOTO_results-top-blood.Rdata')
write_csv(top_cpgs, file='../GOTO_Data/Tables/ST17.csv')
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggpubr_0.4.0
## [2] ggrepel_0.9.1
## [3] lmerTest_3.1-3
## [4] lme4_1.1-30
## [5] Matrix_1.5-4.1
## [6] IlluminaHumanMethylation450kmanifest_0.4.0
## [7] IlluminaHumanMethylationEPICmanifest_0.3.0
## [8] FlowSorted.BloodExtended.EPIC_1.1.1
## [9] FlowSorted.Blood.EPIC_1.12.1
## [10] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
## [11] nlme_3.1-162
## [12] quadprog_1.5-8
## [13] genefilter_1.76.0
## [14] ExperimentHub_2.2.1
## [15] AnnotationHub_3.2.2
## [16] BiocFileCache_2.2.1
## [17] dbplyr_2.2.1
## [18] MuSiC_0.2.0
## [19] nnls_1.4
## [20] lubridate_1.9.2
## [21] DNAmArray_2.0.0
## [22] pls_2.8-2
## [23] FDb.InfiniumMethylation.hg19_2.2.0
## [24] org.Hs.eg.db_3.14.0
## [25] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [26] GenomicFeatures_1.46.5
## [27] AnnotationDbi_1.56.2
## [28] minfi_1.40.0
## [29] bumphunter_1.36.0
## [30] locfit_1.5-9.8
## [31] iterators_1.0.14
## [32] foreach_1.5.2
## [33] Biostrings_2.62.0
## [34] XVector_0.34.0
## [35] SummarizedExperiment_1.24.0
## [36] Biobase_2.58.0
## [37] MatrixGenerics_1.10.0
## [38] matrixStats_1.0.0
## [39] GenomicRanges_1.46.1
## [40] GenomeInfoDb_1.34.9
## [41] IRanges_2.32.0
## [42] S4Vectors_0.36.2
## [43] BiocGenerics_0.44.0
## [44] forcats_0.5.2
## [45] stringr_1.5.0
## [46] dplyr_1.1.3
## [47] purrr_0.3.4
## [48] readr_2.1.2
## [49] tidyr_1.2.1
## [50] tibble_3.2.1
## [51] ggplot2_3.4.3
## [52] tidyverse_1.3.2
## [53] cli_3.6.1
## [54] htmltools_0.5.5
## [55] rlang_1.1.1
## [56] rmarkdown_2.16
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 tidyselect_1.2.0
## [3] RSQLite_2.2.17 grid_4.2.2
## [5] BiocParallel_1.32.6 cinaR_0.2.3
## [7] munsell_0.5.0 codetools_0.2-19
## [9] preprocessCore_1.60.2 withr_2.5.0
## [11] colorspace_2.1-0 filelock_1.0.2
## [13] highr_0.10 knitr_1.43
## [15] rstudioapi_0.14 ggsignif_0.6.3
## [17] labeling_0.4.2 GenomeInfoDbData_1.2.9
## [19] MCMCpack_1.6-3 farver_2.1.1
## [21] bit64_4.0.5 rhdf5_2.42.1
## [23] coda_0.19-4 vctrs_0.6.3
## [25] generics_0.1.3 xfun_0.39
## [27] timechange_0.2.0 R6_2.5.1
## [29] illuminaio_0.40.0 bitops_1.0-7
## [31] rhdf5filters_1.10.1 cachem_1.0.8
## [33] reshape_0.8.9 DelayedArray_0.24.0
## [35] assertthat_0.2.1 vroom_1.5.7
## [37] promises_1.2.0.1 BiocIO_1.8.0
## [39] scales_1.2.1 googlesheets4_1.0.1
## [41] gtable_0.3.3 mcmc_0.9-7
## [43] MatrixModels_0.5-1 splines_4.2.2
## [45] rstatix_0.7.0 rtracklayer_1.54.0
## [47] gargle_1.5.0 GEOquery_2.62.2
## [49] htm2txt_2.2.2 broom_1.0.1
## [51] abind_1.4-5 BiocManager_1.30.21
## [53] yaml_2.3.7 reshape2_1.4.4
## [55] modelr_0.1.9 backports_1.4.1
## [57] httpuv_1.6.11 tools_4.2.2
## [59] nor1mix_1.3-0 ellipsis_0.3.2
## [61] jquerylib_0.1.4 RColorBrewer_1.1-3
## [63] siggenes_1.68.0 Rcpp_1.0.10
## [65] plyr_1.8.8 sparseMatrixStats_1.10.0
## [67] progress_1.2.2 zlibbioc_1.44.0
## [69] RCurl_1.98-1.12 prettyunits_1.1.1
## [71] openssl_2.0.6 haven_2.5.1
## [73] fs_1.6.2 magrittr_2.0.3
## [75] data.table_1.14.8 SparseM_1.81
## [77] reprex_2.0.2 googledrive_2.0.0
## [79] mime_0.12 hms_1.1.2
## [81] evaluate_0.21 xtable_1.8-4
## [83] XML_3.99-0.14 mclust_6.0.0
## [85] readxl_1.4.1 compiler_4.2.2
## [87] biomaRt_2.50.3 crayon_1.5.2
## [89] minqa_1.2.5 later_1.3.1
## [91] tzdb_0.4.0 DBI_1.1.3
## [93] MASS_7.3-60 rappdirs_0.3.3
## [95] boot_1.3-28.1 car_3.1-0
## [97] pkgconfig_2.0.3 GenomicAlignments_1.30.0
## [99] numDeriv_2016.8-1.1 xml2_1.3.4
## [101] annotate_1.72.0 bslib_0.5.0
## [103] rngtools_1.5.2 multtest_2.50.0
## [105] beanplot_1.3.1 rvest_1.0.3
## [107] doRNG_1.8.6 scrime_1.3.5
## [109] digest_0.6.31 base64_2.0.1
## [111] cellranger_1.1.0 edgeR_3.40.2
## [113] DelayedMatrixStats_1.16.0 restfulr_0.0.15
## [115] curl_5.0.1 shiny_1.7.2
## [117] Rsamtools_2.10.0 quantreg_5.94
## [119] nloptr_2.0.3 rjson_0.2.21
## [121] lifecycle_1.0.3 jsonlite_1.8.5
## [123] Rhdf5lib_1.20.0 carData_3.0-5
## [125] askpass_1.1 limma_3.54.2
## [127] fansi_1.0.4 pillar_1.9.0
## [129] lattice_0.21-8 KEGGREST_1.34.0
## [131] fastmap_1.1.1 httr_1.4.6
## [133] survival_3.5-5 interactiveDisplayBase_1.32.0
## [135] glue_1.6.2 png_0.1-8
## [137] BiocVersion_3.16.0 bit_4.0.5
## [139] stringi_1.7.12 sass_0.4.6
## [141] HDF5Array_1.22.1 blob_1.2.4
## [143] memoise_2.0.1
Clear
rm(list=ls())