This script is derived from Bernabeu’s github and calculates their bAge estimation
Load packages
library(tidyverse)
library(survival)
Load DNAm data
load('../GOTO_Data/GOTO_methData-filtered.Rdata')
methData
## class: SummarizedExperiment
## dim: 755777 534
## metadata(0):
## assays(1): beta
## rownames(755777): cg18478105 cg09835024 ... cg10633746 cg12623625
## rowData names(57): cpg chr ... MASK_extBase MASK_general
## colnames(534): 203527980082_R01C01 203527980082_R02C01 ...
## 203550300093_R07C01 203550300093_R08C01
## colData names(102): DNA_labnr IOP2_ID ... cc_blood_ext_NK
## cc_blood_ext_Treg
Methylation table:
betas <- as.data.frame(assay(methData))
betas <- betas %>% rownames_to_column(var='cpg')
colnames(betas) <- c('cpg', methData$Basename)
write_csv(betas,
file='../GOTO_Data/Clocks/Clock_Data/methylationTable.csv')
methylationTable <- "../GOTO_Data/Clocks/Clock_Data/methylationTable.csv"
methylationTable_format <- "csv"
Path to phenotype table:
targets <- as.data.frame(colData(methData))
targets <- targets %>% select(sample_ID = Basename,
age,
sex) %>%
mutate(sex = ifelse(sex=='female', 1, 0),
tte = NA,
death = 0)
write_csv(targets,
file="../GOTO_Data/Clocks/Clock_Data/phenotypeTable.csv")
phenotypeTable <- "../GOTO_Data/Clocks/Clock_Data/phenotypeTable.csv"
phenotypeTable_format <- "csv"
Path to GrimAge results table:
grimageTable <- "../GOTO_Data/Clocks/Clock_Out/Grimage_output_GOTO.csv"
grimageTable_format <- "csv"
Insert column names for each of the variables corresponding to the names in phenotypeTable.
ageColname <- "age"
sexColname <- "sex"
tteColname <- "tte"
deathColname <- "death"
message("1. Loading data")
## 1. Loading data
message("1.1 Loading methylation data - rows to be CpGs and columns to be individuals")
## 1.1 Loading methylation data - rows to be CpGs and columns to be individuals
Loading in model coefficients.
(Make sure these files are present in the current working directory or change the paths to the correct directory)
coefficients <- read.delim("../GOTO_Data/Clocks/Clock_Data/bage_coefficients.tsv")
Loading in CpG coefficients for episcore projection
cpgs <- read.delim("../GOTO_Data/Clocks/Clock_Data/cpg_episcore_weights.tsv")
Loading methylation data
if (tolower(methylationTable_format) == "rds") {
data <- readRDS(methylationTable)
} else if (tolower(methylationTable_format) == "tsv") {
data <- read.delim(methylationTable, sep = "\t", row.names = 1)
} else if (tolower(methylationTable_format) == "csv") {
data <- read.csv(methylationTable, sep = ",", row.names = 1)
} else {
message("Unrecognized methylation data format. Accepted formats: rds, tsv, and csv.")
}
Loading phenotype data
if (tolower(phenotypeTable_format) == "rds") {
pheno <- readRDS(phenotypeTable)
} else if (tolower(phenotypeTable_format) == "tsv") {
pheno <- read.delim(phenotypeTable, sep = "\t", row.names = 1)
} else if (tolower(phenotypeTable_format) == "csv") {
pheno <- read.csv(phenotypeTable, sep = ",", row.names = 1)
} else {
message("Unrecognized phenotype data format. Accepted formats: rds, tsv, and csv.")
}
Loading GrimAge data
if (tolower(grimageTable_format) == "rds") {
grim <- readRDS(grimageTable)
} else if (tolower(grimageTable_format) == "tsv") {
grim <- read.delim(grimageTable, sep = "\t", row.names = 1)
} else if (tolower(grimageTable_format) == "csv") {
grim <- read.csv(grimageTable, sep = ",", row.names = 1)
} else {
message("Unrecognized GrimAge data format. Accepted formats: rds, tsv, and csv.")
}
row.names(grim) <- grim$SampleID
Set colnames
colnames(data) <- gsub('X', '', colnames(data))
Check if Data needs to be Transposed
message("2. Quality Control and data preparation")
## 2. Quality Control and data preparation
message("2.1 Checking if row names are CpG sites")
## 2.1 Checking if row names are CpG sites
if(ncol(data) > nrow(data)){
message("It seems that individuals are rows - data will be transposed!")
data <- t(data)
}
Keep individuals in pheno table
message("2.3 Subsetting samples to those in phenotype table")
## 2.3 Subsetting samples to those in phenotype table
data <- data[,which(colnames(data) %in% rownames(pheno))]
grim <- grim[which(rownames(grim) %in% rownames(pheno)),]
data <- data[,rownames(pheno)]
grim <- grim[rownames(pheno),]
Subset CpG sites to those present on list for predictors
message("2.4 Subsetting CpG sites to those required for predictor calculation")
## 2.4 Subsetting CpG sites to those required for predictor calculation
coef <- data[intersect(rownames(data), cpgs$CpG_Site),]
Check if Beta or M Values
message("2.5 Checking of Beta or M-values are present")
## 2.5 Checking of Beta or M-values are present
m_to_beta <- function(val) {
beta <- 2^val/(2^val + 1)
return(beta)
}
coef <- if((range(coef, na.rm = T) > 1)[[2]] == "TRUE") {
message("Suspect that M Values are present. Converting to Beta Values")
m_to_beta(coef)
} else {
message("Suspect that Beta Values are present");
coef
}
## Suspect that Beta Values are present
Scale data if needed
message("2.6 Scaling data (if needed)")
## 2.6 Scaling data (if needed)
ids <- colnames(coef)
scaled <- apply(coef, 1, function(x) sd(x, na.rm = T))
coef <- if(range(scaled)[1] == 1 & range(scaled)[2] == 1) {
coef
} else {
coef_scale <- apply(coef, 1, scale)
coef_scale <- t(coef_scale)
coef_scale <- as.data.frame(coef_scale)
colnames(coef_scale) <- ids
coef_scale
}
Identify CpGs missing from input dataframe, include them and provide values as mean methylation value at that site
message("2.7 Find CpGs not present in uploaded file, add these with mean Beta Value for CpG site from training sample")
## 2.7 Find CpGs not present in uploaded file, add these with mean Beta Value for CpG site from training sample
coef <- if (nrow(coef)==length(unique(cpgs$CpG_Site))) {
message("All sites present")
coef
} else if (nrow(coef)==0) {
message("There Are No Necessary CpGs in The dataset - All Individuals Would Have Same Values For Predictors. Analysis Is Not Informative!")
} else {
missing_cpgs = cpgs[-which(cpgs$CpG_Site %in% rownames(coef)), c("CpG_Site", "Mean_Beta_Value")]
message(paste(length(unique(missing_cpgs$CpG_Site)), "unique sites are missing - adding to dataset with mean Beta Value from training sample", sep = " "))
mat = matrix(nrow=length(unique(missing_cpgs$CpG_Site)), ncol = ncol(coef))
row.names(mat) <- unique(missing_cpgs$CpG_Site)
colnames(mat) <- colnames(coef)
mat[is.na(mat)] <- 1
missing_cpgs1 <- if (length(which(duplicated(missing_cpgs$CpG_Site))) > 1) {
missing_cpgs[-which(duplicated(missing_cpgs$CpG_Site)),]
} else {
missing_cpgs
}
ids = unique(row.names(mat))
missing_cpgs1 = missing_cpgs1[match(ids,missing_cpgs1$CpG_Site),]
mat = mat*missing_cpgs1$Mean_Beta_Value
coef = rbind(coef,mat)
}
## 754 unique sites are missing - adding to dataset with mean Beta Value from training sample
Impute missing probes
message("2.8 Convert NA Values to mean for each probe")
## 2.8 Convert NA Values to mean for each probe
na_to_mean <-function(methyl) {
methyl[is.na(methyl)] <- mean(methyl, na.rm=T)
return(methyl)
}
coef <- t(apply(coef,1,function(x) na_to_mean(x)))
message("3. Calculating Episcores")
## 3. Calculating Episcores
loop <- unique(cpgs$Predictor)
out <- data.frame()
for(i in loop){
tmp=coef[intersect(row.names(coef),cpgs[cpgs$Predictor %in% i,"CpG_Site"]),]
tmp_coef = cpgs[cpgs$Predictor %in% i, ]
if(nrow(tmp_coef) > 1) {
tmp_coef = tmp_coef[match(row.names(tmp),tmp_coef$CpG_Site),]
out[colnames(coef),i]=colSums(tmp_coef$Coefficient*tmp)
} else {
tmp2 = as.matrix(tmp)*tmp_coef$Coefficient
out[colnames(coef),i] = tmp2[,1]
}
}
Save file
message("3.1. Exporting Episcores")
## 3.1. Exporting Episcores
write.table(out, "../GOTO_Data/Clocks/Clock_Out/episcore_projections.tsv", sep = "\t", quote = FALSE)
message("4. Prepping GrimAge components")
## 4. Prepping GrimAge components
samples <- rownames(out)
grim_pred <- grim[samples, c("DNAmGrimAge"), drop = FALSE]
grim <- grim[samples, c("DNAmadm", "DNAmB2M", "DNAmCystatin_C", "DNAmGDF_15", "DNAmleptin", "DNAmPACKYRS", "DNAmpai_1", "DNAmTIMP_1")]
message("4.1. Scale GrimAge components (if needed)")
## 4.1. Scale GrimAge components (if needed)
scaled_grim <- apply(grim, 2, function(x) sd(x, na.rm = T))
ids <- colnames(grim)
head(scaled_grim)
## DNAmadm DNAmB2M DNAmCystatin_C DNAmGDF_15 DNAmleptin
## 40.575832 97163.617401 51192.152724 213.249610 5349.977914
## DNAmPACKYRS
## 6.976968
message("5. Obtaining bAge predictions")
## 5. Obtaining bAge predictions
names(pheno)[names(pheno) == ageColname] <- "Age"
names(pheno)[names(pheno) == tteColname] <- "TTE"
names(pheno)[names(pheno) == deathColname] <- "Dead"
names(pheno)[names(pheno) == sexColname] <- "Sex"
Fuse all Episcores and GrimAge components, along with other stuff, for each sample
scores <- cbind(pheno[samples, c("TTE", "Dead", "Age", "Sex")], grim, out)
Filter to elements in predictor
scores <- scores[, coefficients$Variable]
Calculate bAge
message("5.1. Calculating bAge")
## 5.1. Calculating bAge
scores <- t(scores)
pred <- scores * coefficients[,"Coefficient"]
pred_pp <- colSums(pred)
Scale to same scale as age in testing
message("5.2. Scaling bAge")
## 5.2. Scaling bAge
scale_pred <- function(x, mean_pred, sd_pred, mean_test, sd_test) {
scaled <- mean_test + (x - mean_pred)*(sd_test/sd_pred)
return(scaled)
}
Scale to same Z scale
scale_Z <- function(x, mean_pred, sd_pred) {
scaled <- (x - mean_pred)/sd_pred
return(scaled)
}
mean_pred <- mean(pred_pp)
mean_test <- mean(pheno$Age) # Mean age in testing data
sd_pred <- sd(pred_pp)
sd_test <- sd(pheno$Age) # SD age in testing data
pred_pp_Z <- scale_Z(pred_pp, mean_pred, sd_pred)
pred_pp_scaled <- scale_pred(pred_pp, mean_pred, sd_pred, mean_test, sd_test)
Make df with everything
pred_df <- data.frame(pred_pp_Z, pred_pp_scaled, grim_pred, pheno[samples, c("Age", "Sex", "TTE", "Dead")])
names(pred_df) <- c("bAge", "bAge_Years", "GrimAge", "Age", "Sex", "TTE", "Dead")
message("5.4. Exporting predictions")
## 5.4. Exporting predictions
write.table(data.frame("Sample" = rownames(pred_df), pred_df), file = paste0("../GOTO_Data/Clocks/Clock_Out/bage_predictions.tsv"),
quote = FALSE, sep = "\t", row.names = FALSE)
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] survival_3.5-5 RPMM_1.25
## [3] cluster_2.1.4 impute_1.72.3
## [5] SummarizedExperiment_1.24.0 Biobase_2.58.0
## [7] GenomicRanges_1.46.1 GenomeInfoDb_1.34.9
## [9] IRanges_2.32.0 S4Vectors_0.36.2
## [11] BiocGenerics_0.44.0 MatrixGenerics_1.10.0
## [13] matrixStats_1.0.0 dnaMethyAge_0.1.0
## [15] forcats_0.5.2 stringr_1.5.0
## [17] dplyr_1.1.3 purrr_0.3.4
## [19] readr_2.1.2 tidyr_1.2.1
## [21] tibble_3.2.1 ggplot2_3.4.3
## [23] tidyverse_1.3.2 rmarkdown_2.16
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 fs_1.6.2 bit64_4.0.5
## [4] lubridate_1.9.2 httr_1.4.6 tools_4.2.2
## [7] backports_1.4.1 bslib_0.5.0 utf8_1.2.3
## [10] R6_2.5.1 DBI_1.1.3 colorspace_2.1-0
## [13] withr_2.5.0 tidyselect_1.2.0 bit_4.0.5
## [16] compiler_4.2.2 cli_3.6.1 rvest_1.0.3
## [19] xml2_1.3.4 DelayedArray_0.24.0 sass_0.4.6
## [22] scales_1.2.1 digest_0.6.31 XVector_0.34.0
## [25] pkgconfig_2.0.3 htmltools_0.5.5 dbplyr_2.2.1
## [28] fastmap_1.1.1 highr_0.10 rlang_1.1.1
## [31] readxl_1.4.1 rstudioapi_0.14 jquerylib_0.1.4
## [34] generics_0.1.3 jsonlite_1.8.5 vroom_1.5.7
## [37] googlesheets4_1.0.1 RCurl_1.98-1.12 magrittr_2.0.3
## [40] GenomeInfoDbData_1.2.9 Matrix_1.5-4.1 munsell_0.5.0
## [43] fansi_1.0.4 lifecycle_1.0.3 stringi_1.7.12
## [46] yaml_2.3.7 zlibbioc_1.44.0 grid_4.2.2
## [49] parallel_4.2.2 crayon_1.5.2 lattice_0.21-8
## [52] haven_2.5.1 splines_4.2.2 hms_1.1.2
## [55] knitr_1.43 pillar_1.9.0 reprex_2.0.2
## [58] glue_1.6.2 evaluate_0.21 modelr_0.1.9
## [61] vctrs_0.6.3 tzdb_0.4.0 cellranger_1.1.0
## [64] gtable_0.3.3 assertthat_0.2.1 cachem_1.0.8
## [67] xfun_0.39 broom_1.0.1 googledrive_2.0.0
## [70] gargle_1.5.0 timechange_0.2.0 ellipsis_0.3.2
Clear
rm(list=ls())