This script loads in differential expression results and tests if genes within 100kb of blood CpGs are differentially expressed in blood.
Load packages
library(tidyverse)
library(ggrepel)
library(GenomicRanges)
library(ggpubr)
library(DNAmArray)
library(MASS)
Load data
load("../GOTO_Data/GOTO_results-top-blood.Rdata")
Save significant CpGs
print(paste0("There are ",
nrow(top_cpgs),
" CpGs significant at 5% FDR level"))
## [1] "There are 441 CpGs significant at 5% FDR level"
cpg_top <- top_cpgs$cpg
Save gene locations (saved previously)
txdb <- makeTxDbFromEnsembl(organism = "Homo sapiens",
release = 75,
circ_seqs = NULL,
server = "ensembldb.ensembl.org",
username = "anonymous", password = NULL, port = 0L,
tx_attrib = NULL)
gene_range <- unlist(cdsBy(txdb, by = "gene"))
gene_range <- unique(gene_range)
save(gene_range, file='/exports/molepi/users/ljsinke/LLS/Shared_Data/allGeneRanges.Rdata)
Load gene_range
object
load('/exports/molepi/users/ljsinke/LLS/Shared_Data/allGeneRanges.Rdata')
gene_range
## GRanges object with 302587 ranges and 2 metadata columns:
## seqnames ranges strand | cds_id cds_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## ENSG00000000003 X 99885795-99885863 - | 717267 ENSP00000362111
## ENSG00000000003 X 99887482-99887565 - | 717268 ENSP00000362111
## ENSG00000000003 X 99888402-99888536 - | 717269 ENSP00000362111
## ENSG00000000003 X 99888928-99889026 - | 717270 ENSP00000362111
## ENSG00000000003 X 99890175-99890249 - | 717271 ENSP00000362111
## ... ... ... ... . ... ...
## LRG_97 LRG_97 17272-17334 + | 799400 LRG_97p1
## LRG_97 LRG_97 17876-18035 + | 799401 LRG_97p1
## LRG_97 LRG_97 22463-22593 + | 799402 LRG_97p1
## LRG_98 LRG_98 10293-13424 + | 799403 LRG_98p1
## LRG_99 LRG_99 9069-10652 + | 799404 LRG_99p1
## -------
## seqinfo: 722 sequences (1 circular) from an unspecified genome
Make a GenomicRanges
object for the top CpGs
cpg_range <- GRanges(seqnames = top_cpgs$cpg_chr,
IRanges(
start = top_cpgs$cpg_start,
end = top_cpgs$cpg_end,
names = top_cpgs$cpg
))
cpg_range
## GRanges object with 441 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## cg25967669 11 8430495-8430497 *
## cg24899571 11 133938809-133938811 *
## cg25828097 2 27656864-27656866 *
## cg27508416 1 42922518-42922520 *
## cg23812665 1 117061544-117061546 *
## ... ... ... ...
## cg25040733 5 141704733-141704735 *
## cg13479371 18 72530930-72530932 *
## cg26201957 11 82601012-82601014 *
## cg16209776 12 63177893-63177895 *
## cg23961638 4 153700210-153700212 *
## -------
## seqinfo: 23 sequences from an unspecified genome; no seqlengths
Initialize a distance matrix
distance.matrix <- matrix(
NaN,
nrow = length(gene_range),
ncol = length(cpg_range),
dimnames = list(names(gene_range),
names(cpg_range)))
Loop through CpGs
for(i in 1:nrow(top_cpgs)){
cpg <- cpg_range[i]
calc.distance <- distance(
cpg,
gene_range,
ignore.strand = T)
distance.matrix[,i] <- calc.distance
}
dim(distance.matrix)
## [1] 302587 441
distance.matrix <- as.data.frame(distance.matrix)
colnames(distance.matrix) <- names(cpg_range)
distance.matrix$gene <- names(gene_range)
Make a list of genes within 100kb of each cpg
gene_list <- lapply(names(cpg_range), function(x){
df <- (distance.matrix %>% dplyr::select(gene, all_of(x)))
colnames(df) <- c('gene', 'cpg')
df <- df %>% filter(cpg <= 100000)
if (nrow(df) > 0) {
return(data.frame(cpg = x,
gene_ens = unique(df$gene)))
}
})
gene_list[[1]]
## cpg gene_ens
## 1 cg25967669 ENSG00000130413
Save gene symbols
ens2gene <- cinaR::grch37
Bind
df <- data.frame()
for(i in 1:length(gene_list)){
gene_df <- gene_list[[i]]
cpg <- gene_df$cpg[1]
genes <- unique(gene_df$gene_ens)
df <- rbind(df,
data.frame(cpg = cpg,
gene_ens = genes))
}
sym <- ens2gene$symbol[match(df$gene_ens,
ens2gene$ensgene)]
df$gene <- sym
head(df)
## cpg gene_ens gene
## 1 cg25967669 ENSG00000130413 STK33
## 2 cg24899571 ENSG00000151503 NCAPD3
## 3 cg24899571 ENSG00000166086 JAM3
## 4 cg25828097 ENSG00000084734 GCKR
## 5 cg25828097 ENSG00000115207 GTF3C2
## 6 cg25828097 ENSG00000115211 EIF2B4
dim(df)
## [1] 1760 3
Save
save(df, file='../GOTO_Data/DEGs/Blood-100kb.Rdata')
print(paste0("There are ", length(unique(df$gene_ens)), " unique genes within 100kb of ",
length(unique(df$cpg)), " CpGs."))
## [1] "There are 1588 unique genes within 100kb of 403 CpGs."
Read in DEG results
deg <- read.table('../GOTO_Data/DEGs/DEGs_blood_postprandial_combined_new.txt')
Filter for those within 100kb
deg <- deg %>%
filter(GENE.SYMBOL %in% df$gene) %>%
dplyr::select(logFC, AveExpr, pvalue, gene = GENE.SYMBOL)
Adjust for multiple testing
deg$padj <- p.adjust(deg$pvalue, method='fdr')
Inspect top genes
deg <- deg %>% filter(padj <= 0.05)
head(deg)
## logFC AveExpr pvalue gene padj
## ENSG00000103056 0.20733606 5.162585 1.140226e-06 SMPD3 0.001407039
## ENSG00000111321 0.15518002 6.660071 6.433843e-06 LTBR 0.003969681
## ENSG00000118454 -0.21060787 3.027824 2.482431e-05 ANKRD13C 0.008397916
## ENSG00000054116 0.06261219 5.856389 2.970455e-05 TRAPPC3 0.008397916
## ENSG00000105438 0.09700830 6.254904 3.402721e-05 KDELR1 0.008397916
## ENSG00000158793 0.07877899 5.187106 6.174264e-05 NIT1 0.012698403
unique(deg$gene)
## [1] "SMPD3" "LTBR" "ANKRD13C" "TRAPPC3" "KDELR1" "NIT1"
## [7] "TAPBP" "ZNF672" "KIAA1143" "METTL15" "TUBA1B" "TFEB"
## [13] "TPP2" "TNFRSF1A" "CLASP2" "FAM133B" "GTF2H3" "CSNK1G3"
## [19] "JUNB" "MNAT1" "TRAF7" "AIMP1" "CNGB1" "ATF5"
## [25] "TUBE1" "GADD45B" "PSEN2" "USP21" "ZFPL1" "PPP1R11"
## [31] "POLDIP3" "CEP120" "LMAN2" "HIBADH" "BCL3" "C11orf68"
## [37] "USB1" "UBE2L3" "C6orf106" "TTC17" "ANKRD46" "FAM122A"
## [43] "PRCC" "DEDD" "B3GALT4" "CD74" "MUM1" "MAST1"
## [49] "GMPPA" "SAPCD1" "DNASE1L2" "CTSA" "CSNK2B" "POLA2"
## [55] "ATG2B" "RHOA" "MPV17L2" "CYB5R3" "ZBTB7B" "ARHGAP5"
## [61] "WDR5" "NFKBIB" "SNX14" "SMC2" "C10orf54" "NELFB"
## [67] "STT3B" "ACOT8" "TRIP13" "PURB" "TMEM175" "TBCK"
## [73] "NEU1" "RNASEH2A" "C2" "ZNF350" "TLR5" "ICAM1"
## [79] "TBC1D22A" "GPSM3" "GBA2" "ZNF649" "VRK3"
save(deg, file='../GOTO_Data/DEGs/Blood-DEG.Rdata')
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MASS_7.3-60
## [2] DNAmArray_2.0.0
## [3] pls_2.8-2
## [4] FDb.InfiniumMethylation.hg19_2.2.0
## [5] org.Hs.eg.db_3.14.0
## [6] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [7] GenomicFeatures_1.46.5
## [8] AnnotationDbi_1.56.2
## [9] minfi_1.40.0
## [10] bumphunter_1.36.0
## [11] locfit_1.5-9.8
## [12] iterators_1.0.14
## [13] foreach_1.5.2
## [14] Biostrings_2.62.0
## [15] XVector_0.34.0
## [16] SummarizedExperiment_1.24.0
## [17] Biobase_2.58.0
## [18] MatrixGenerics_1.10.0
## [19] matrixStats_1.0.0
## [20] ggpubr_0.4.0
## [21] GenomicRanges_1.46.1
## [22] GenomeInfoDb_1.34.9
## [23] IRanges_2.32.0
## [24] S4Vectors_0.36.2
## [25] BiocGenerics_0.44.0
## [26] ggrepel_0.9.1
## [27] forcats_0.5.2
## [28] stringr_1.5.0
## [29] dplyr_1.1.3
## [30] purrr_0.3.4
## [31] readr_2.1.2
## [32] tidyr_1.2.1
## [33] tibble_3.2.1
## [34] ggplot2_3.4.3
## [35] tidyverse_1.3.2
## [36] rmarkdown_2.16
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.1 backports_1.4.1
## [3] BiocFileCache_2.2.1 plyr_1.8.8
## [5] splines_4.2.2 BiocParallel_1.32.6
## [7] digest_0.6.31 htmltools_0.5.5
## [9] fansi_1.0.4 magrittr_2.0.3
## [11] memoise_2.0.1 googlesheets4_1.0.1
## [13] limma_3.54.2 tzdb_0.4.0
## [15] annotate_1.72.0 cinaR_0.2.3
## [17] modelr_0.1.9 askpass_1.1
## [19] timechange_0.2.0 siggenes_1.68.0
## [21] prettyunits_1.1.1 colorspace_2.1-0
## [23] blob_1.2.4 rvest_1.0.3
## [25] rappdirs_0.3.3 haven_2.5.1
## [27] xfun_0.39 crayon_1.5.2
## [29] RCurl_1.98-1.12 jsonlite_1.8.5
## [31] genefilter_1.76.0 GEOquery_2.62.2
## [33] survival_3.5-5 glue_1.6.2
## [35] gtable_0.3.3 gargle_1.5.0
## [37] zlibbioc_1.44.0 htm2txt_2.2.2
## [39] DelayedArray_0.24.0 car_3.1-0
## [41] Rhdf5lib_1.20.0 HDF5Array_1.22.1
## [43] abind_1.4-5 scales_1.2.1
## [45] edgeR_3.40.2 DBI_1.1.3
## [47] rngtools_1.5.2 rstatix_0.7.0
## [49] Rcpp_1.0.10 xtable_1.8-4
## [51] progress_1.2.2 bit_4.0.5
## [53] mclust_6.0.0 preprocessCore_1.60.2
## [55] httr_1.4.6 RColorBrewer_1.1-3
## [57] ellipsis_0.3.2 pkgconfig_2.0.3
## [59] reshape_0.8.9 XML_3.99-0.14
## [61] sass_0.4.6 dbplyr_2.2.1
## [63] utf8_1.2.3 reshape2_1.4.4
## [65] tidyselect_1.2.0 rlang_1.1.1
## [67] munsell_0.5.0 cellranger_1.1.0
## [69] tools_4.2.2 cachem_1.0.8
## [71] cli_3.6.1 generics_0.1.3
## [73] RSQLite_2.2.17 broom_1.0.1
## [75] evaluate_0.21 fastmap_1.1.1
## [77] yaml_2.3.7 knitr_1.43
## [79] bit64_4.0.5 fs_1.6.2
## [81] beanplot_1.3.1 scrime_1.3.5
## [83] KEGGREST_1.34.0 nlme_3.1-162
## [85] doRNG_1.8.6 sparseMatrixStats_1.10.0
## [87] nor1mix_1.3-0 xml2_1.3.4
## [89] biomaRt_2.50.3 compiler_4.2.2
## [91] rstudioapi_0.14 filelock_1.0.2
## [93] curl_5.0.1 png_0.1-8
## [95] ggsignif_0.6.3 reprex_2.0.2
## [97] bslib_0.5.0 stringi_1.7.12
## [99] lattice_0.21-8 Matrix_1.5-4.1
## [101] multtest_2.50.0 vctrs_0.6.3
## [103] pillar_1.9.0 lifecycle_1.0.3
## [105] rhdf5filters_1.10.1 jquerylib_0.1.4
## [107] data.table_1.14.8 bitops_1.0-7
## [109] rtracklayer_1.54.0 R6_2.5.1
## [111] BiocIO_1.8.0 codetools_0.2-19
## [113] assertthat_0.2.1 rhdf5_2.42.1
## [115] openssl_2.0.6 rjson_0.2.21
## [117] withr_2.5.0 GenomicAlignments_1.30.0
## [119] Rsamtools_2.10.0 GenomeInfoDbData_1.2.9
## [121] hms_1.1.2 quadprog_1.5-8
## [123] grid_4.2.2 base64_2.0.1
## [125] DelayedMatrixStats_1.16.0 illuminaio_0.40.0
## [127] carData_3.0-5 googledrive_2.0.0
## [129] lubridate_1.9.2 restfulr_0.0.15
Clear
rm(list=ls())