This script is derived from Bernabeu’s github and calculates their bAge estimation


Setup

Load packages

library(tidyverse)
library(survival)

Load DNAm data

load('../GOTO_Data/GOTO_methData-filtered.Rdata')
methData
## class: SummarizedExperiment 
## dim: 755777 534 
## metadata(0):
## assays(1): beta
## rownames(755777): cg18478105 cg09835024 ... cg10633746 cg12623625
## rowData names(57): cpg chr ... MASK_extBase MASK_general
## colnames(534): 203527980082_R01C01 203527980082_R02C01 ...
##   203550300093_R07C01 203550300093_R08C01
## colData names(102): DNA_labnr IOP2_ID ... cc_blood_ext_NK
##   cc_blood_ext_Treg

Methylation table:

  • Each column corresponds to an individual
  • Each row corresponds to CpG
  • First column is the CpG name
  • (If RDS, assumed first column is rownames)
betas <- as.data.frame(assay(methData))
betas <- betas %>% rownames_to_column(var='cpg')
colnames(betas) <- c('cpg', methData$Basename)

write_csv(betas, 
      file='../GOTO_Data/Clocks/Clock_Data/methylationTable.csv')

methylationTable <- "../GOTO_Data/Clocks/Clock_Data/methylationTable.csv"
methylationTable_format <- "csv"

Path to phenotype table:

  • Each column corresponds to a variable
  • Each row corresponds to an individual/sample
  • First column is sample name
  • (If RDS, assumed first column is rownames)
targets <- as.data.frame(colData(methData))
targets <- targets %>% select(sample_ID = Basename, 
                              age,
                              sex) %>% 
  mutate(sex = ifelse(sex=='female', 1, 0),
         tte = NA,
         death = 0)

write_csv(targets,
          file="../GOTO_Data/Clocks/Clock_Data/phenotypeTable.csv")

phenotypeTable <- "../GOTO_Data/Clocks/Clock_Data/phenotypeTable.csv"
phenotypeTable_format <- "csv"

Path to GrimAge results table:

  • Each column represents a GrimAge component
  • Each row corresponds to an individual/sample
  • First column is sample name.
grimageTable <- "../GOTO_Data/Clocks/Clock_Out/Grimage_output_GOTO.csv"
grimageTable_format <- "csv"

Insert column names for each of the variables corresponding to the names in phenotypeTable.

ageColname <- "age"
sexColname <- "sex"
tteColname <- "tte"
deathColname <- "death"

Data Loading

message("1. Loading data") 
## 1. Loading data
message("1.1 Loading methylation data - rows to be CpGs and columns to be individuals") 
## 1.1 Loading methylation data - rows to be CpGs and columns to be individuals

Loading in model coefficients.

(Make sure these files are present in the current working directory or change the paths to the correct directory)

coefficients <- read.delim("../GOTO_Data/Clocks/Clock_Data/bage_coefficients.tsv")

Loading in CpG coefficients for episcore projection

cpgs <- read.delim("../GOTO_Data/Clocks/Clock_Data/cpg_episcore_weights.tsv")

Loading methylation data

if (tolower(methylationTable_format) == "rds") {
  data <- readRDS(methylationTable)
} else if (tolower(methylationTable_format) == "tsv") {
  data <- read.delim(methylationTable, sep = "\t", row.names = 1)
} else if (tolower(methylationTable_format) == "csv") {
  data <- read.csv(methylationTable, sep = ",", row.names = 1)
} else {
  message("Unrecognized methylation data format. Accepted formats: rds, tsv, and csv.")
}

Loading phenotype data

if (tolower(phenotypeTable_format) == "rds") {
  pheno <- readRDS(phenotypeTable)
} else if (tolower(phenotypeTable_format) == "tsv") {
  pheno <- read.delim(phenotypeTable, sep = "\t", row.names = 1)
} else if (tolower(phenotypeTable_format) == "csv") {
  pheno <- read.csv(phenotypeTable, sep = ",", row.names = 1)
} else {
  message("Unrecognized phenotype data format. Accepted formats: rds, tsv, and csv.")
}

Loading GrimAge data

if (tolower(grimageTable_format) == "rds") {
  grim <- readRDS(grimageTable)
} else if (tolower(grimageTable_format) == "tsv") {
  grim <- read.delim(grimageTable, sep = "\t", row.names = 1)
} else if (tolower(grimageTable_format) == "csv") {
  grim <- read.csv(grimageTable, sep = ",", row.names = 1)
} else {
  message("Unrecognized GrimAge data format. Accepted formats: rds, tsv, and csv.")
}

row.names(grim) <- grim$SampleID

Set colnames

colnames(data) <- gsub('X', '', colnames(data))

QC and data prep

Check if Data needs to be Transposed

message("2. Quality Control and data preparation") 
## 2. Quality Control and data preparation
message("2.1 Checking if row names are CpG sites") 
## 2.1 Checking if row names are CpG sites
if(ncol(data) > nrow(data)){
  message("It seems that individuals are rows - data will be transposed!")
  data <- t(data) 
}

Keep individuals in pheno table

message("2.3 Subsetting samples to those in phenotype table") 
## 2.3 Subsetting samples to those in phenotype table
data <- data[,which(colnames(data) %in% rownames(pheno))]
grim <- grim[which(rownames(grim) %in% rownames(pheno)),]
data <- data[,rownames(pheno)]
grim <- grim[rownames(pheno),]

Subset CpG sites to those present on list for predictors

message("2.4 Subsetting CpG sites to those required for predictor calculation") 
## 2.4 Subsetting CpG sites to those required for predictor calculation
coef <- data[intersect(rownames(data), cpgs$CpG_Site),]

Check if Beta or M Values

message("2.5 Checking of Beta or M-values are present") 
## 2.5 Checking of Beta or M-values are present
m_to_beta <- function(val) {
  beta <- 2^val/(2^val + 1)
  return(beta)
}

coef <- if((range(coef, na.rm = T) > 1)[[2]] == "TRUE") {
    message("Suspect that M Values are present. Converting to Beta Values")
    m_to_beta(coef)
  } else {
    message("Suspect that Beta Values are present");
    coef
  }
## Suspect that Beta Values are present

Scale data if needed

message("2.6 Scaling data (if needed)") 
## 2.6 Scaling data (if needed)
ids <- colnames(coef)
scaled <- apply(coef, 1, function(x) sd(x, na.rm = T)) 

coef <-  if(range(scaled)[1] == 1 & range(scaled)[2] == 1) { 
    coef
  } else { 
    coef_scale <- apply(coef, 1, scale)
    coef_scale <- t(coef_scale)
    coef_scale <- as.data.frame(coef_scale)
    colnames(coef_scale) <- ids
    coef_scale
  }

Identify CpGs missing from input dataframe, include them and provide values as mean methylation value at that site

message("2.7 Find CpGs not present in uploaded file, add these with mean Beta Value for CpG site from training sample") 
## 2.7 Find CpGs not present in uploaded file, add these with mean Beta Value for CpG site from training sample
coef <- if (nrow(coef)==length(unique(cpgs$CpG_Site))) { 
  message("All sites present")
  coef
  } else if (nrow(coef)==0) { 
  message("There Are No Necessary CpGs in The dataset - All Individuals Would Have Same Values For Predictors. Analysis Is Not Informative!")
  } else { 
    missing_cpgs = cpgs[-which(cpgs$CpG_Site %in% rownames(coef)), c("CpG_Site", "Mean_Beta_Value")]
    message(paste(length(unique(missing_cpgs$CpG_Site)), "unique sites are missing - adding to dataset with mean Beta Value from training sample", sep = " "))
    mat = matrix(nrow=length(unique(missing_cpgs$CpG_Site)), ncol = ncol(coef))
    row.names(mat) <- unique(missing_cpgs$CpG_Site)
    colnames(mat) <- colnames(coef) 
    mat[is.na(mat)] <- 1
    missing_cpgs1 <- if (length(which(duplicated(missing_cpgs$CpG_Site))) > 1) { 
      missing_cpgs[-which(duplicated(missing_cpgs$CpG_Site)),]
    } else {
      missing_cpgs
    }  
  ids = unique(row.names(mat))
  missing_cpgs1 = missing_cpgs1[match(ids,missing_cpgs1$CpG_Site),]
  mat = mat*missing_cpgs1$Mean_Beta_Value
  coef = rbind(coef,mat)
} 
## 754 unique sites are missing - adding to dataset with mean Beta Value from training sample

Impute missing probes

message("2.8 Convert NA Values to mean for each probe") 
## 2.8 Convert NA Values to mean for each probe
na_to_mean <-function(methyl) {
  methyl[is.na(methyl)] <- mean(methyl, na.rm=T)
  return(methyl)
}
coef <- t(apply(coef,1,function(x) na_to_mean(x)))

Calculate Episcores

message("3. Calculating Episcores") 
## 3. Calculating Episcores
loop <- unique(cpgs$Predictor)
out <- data.frame()
for(i in loop){ 
  tmp=coef[intersect(row.names(coef),cpgs[cpgs$Predictor %in% i,"CpG_Site"]),]
  tmp_coef = cpgs[cpgs$Predictor %in% i, ]
  if(nrow(tmp_coef) > 1) { 
    tmp_coef = tmp_coef[match(row.names(tmp),tmp_coef$CpG_Site),]
    out[colnames(coef),i]=colSums(tmp_coef$Coefficient*tmp)
  } else {
    tmp2 = as.matrix(tmp)*tmp_coef$Coefficient 
    out[colnames(coef),i] = tmp2[,1]
  }
} 

Save file

message("3.1. Exporting Episcores")  
## 3.1. Exporting Episcores
write.table(out, "../GOTO_Data/Clocks/Clock_Out/episcore_projections.tsv", sep = "\t", quote = FALSE)

Scale GrimAge components

message("4. Prepping GrimAge components") 
## 4. Prepping GrimAge components
samples <- rownames(out)
grim_pred <- grim[samples, c("DNAmGrimAge"), drop = FALSE]
grim <- grim[samples, c("DNAmadm", "DNAmB2M", "DNAmCystatin_C", "DNAmGDF_15", "DNAmleptin", "DNAmPACKYRS", "DNAmpai_1", "DNAmTIMP_1")]

message("4.1. Scale GrimAge components (if needed)") 
## 4.1. Scale GrimAge components (if needed)
scaled_grim <- apply(grim, 2, function(x) sd(x, na.rm = T)) 
ids <- colnames(grim)

head(scaled_grim)
##        DNAmadm        DNAmB2M DNAmCystatin_C     DNAmGDF_15     DNAmleptin 
##      40.575832   97163.617401   51192.152724     213.249610    5349.977914 
##    DNAmPACKYRS 
##       6.976968

bAge prediction

message("5. Obtaining bAge predictions") 
## 5. Obtaining bAge predictions
names(pheno)[names(pheno) == ageColname] <- "Age"
names(pheno)[names(pheno) == tteColname] <- "TTE"
names(pheno)[names(pheno) == deathColname] <- "Dead"
names(pheno)[names(pheno) == sexColname] <- "Sex"

Fuse all Episcores and GrimAge components, along with other stuff, for each sample

scores <- cbind(pheno[samples, c("TTE", "Dead", "Age", "Sex")], grim, out)

Filter to elements in predictor

scores <- scores[, coefficients$Variable]

Calculate bAge

message("5.1. Calculating bAge") 
## 5.1. Calculating bAge
scores <- t(scores)
pred <- scores * coefficients[,"Coefficient"]
pred_pp <- colSums(pred)

Scale to same scale as age in testing

message("5.2. Scaling bAge") 
## 5.2. Scaling bAge
scale_pred <- function(x, mean_pred, sd_pred, mean_test, sd_test) { 
  scaled <- mean_test + (x - mean_pred)*(sd_test/sd_pred)
  return(scaled)
}

Scale to same Z scale

scale_Z <- function(x, mean_pred, sd_pred) { 
  scaled <- (x - mean_pred)/sd_pred
  return(scaled)
}

mean_pred <- mean(pred_pp)
mean_test <- mean(pheno$Age) # Mean age in testing data
sd_pred <- sd(pred_pp)
sd_test <- sd(pheno$Age) # SD age in testing data
pred_pp_Z <- scale_Z(pred_pp, mean_pred, sd_pred)
pred_pp_scaled <- scale_pred(pred_pp, mean_pred, sd_pred, mean_test, sd_test)

Make df with everything

pred_df <- data.frame(pred_pp_Z, pred_pp_scaled, grim_pred, pheno[samples, c("Age", "Sex", "TTE", "Dead")])
names(pred_df) <- c("bAge", "bAge_Years", "GrimAge", "Age", "Sex", "TTE", "Dead")

Export

message("5.4. Exporting predictions") 
## 5.4. Exporting predictions
write.table(data.frame("Sample" = rownames(pred_df), pred_df), file = paste0("../GOTO_Data/Clocks/Clock_Out/bage_predictions.tsv"), 
            quote = FALSE, sep = "\t", row.names = FALSE)

Session Info

sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] survival_3.5-5              RPMM_1.25                  
##  [3] cluster_2.1.4               impute_1.72.3              
##  [5] SummarizedExperiment_1.24.0 Biobase_2.58.0             
##  [7] GenomicRanges_1.46.1        GenomeInfoDb_1.34.9        
##  [9] IRanges_2.32.0              S4Vectors_0.36.2           
## [11] BiocGenerics_0.44.0         MatrixGenerics_1.10.0      
## [13] matrixStats_1.0.0           dnaMethyAge_0.1.0          
## [15] forcats_0.5.2               stringr_1.5.0              
## [17] dplyr_1.1.3                 purrr_0.3.4                
## [19] readr_2.1.2                 tidyr_1.2.1                
## [21] tibble_3.2.1                ggplot2_3.4.3              
## [23] tidyverse_1.3.2             rmarkdown_2.16             
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7           fs_1.6.2               bit64_4.0.5           
##  [4] lubridate_1.9.2        httr_1.4.6             tools_4.2.2           
##  [7] backports_1.4.1        bslib_0.5.0            utf8_1.2.3            
## [10] R6_2.5.1               DBI_1.1.3              colorspace_2.1-0      
## [13] withr_2.5.0            tidyselect_1.2.0       bit_4.0.5             
## [16] compiler_4.2.2         cli_3.6.1              rvest_1.0.3           
## [19] xml2_1.3.4             DelayedArray_0.24.0    sass_0.4.6            
## [22] scales_1.2.1           digest_0.6.31          XVector_0.34.0        
## [25] pkgconfig_2.0.3        htmltools_0.5.5        dbplyr_2.2.1          
## [28] fastmap_1.1.1          highr_0.10             rlang_1.1.1           
## [31] readxl_1.4.1           rstudioapi_0.14        jquerylib_0.1.4       
## [34] generics_0.1.3         jsonlite_1.8.5         vroom_1.5.7           
## [37] googlesheets4_1.0.1    RCurl_1.98-1.12        magrittr_2.0.3        
## [40] GenomeInfoDbData_1.2.9 Matrix_1.5-4.1         munsell_0.5.0         
## [43] fansi_1.0.4            lifecycle_1.0.3        stringi_1.7.12        
## [46] yaml_2.3.7             zlibbioc_1.44.0        grid_4.2.2            
## [49] parallel_4.2.2         crayon_1.5.2           lattice_0.21-8        
## [52] haven_2.5.1            splines_4.2.2          hms_1.1.2             
## [55] knitr_1.43             pillar_1.9.0           reprex_2.0.2          
## [58] glue_1.6.2             evaluate_0.21          modelr_0.1.9          
## [61] vctrs_0.6.3            tzdb_0.4.0             cellranger_1.1.0      
## [64] gtable_0.3.3           assertthat_0.2.1       cachem_1.0.8          
## [67] xfun_0.39              broom_1.0.1            googledrive_2.0.0     
## [70] gargle_1.5.0           timechange_0.2.0       ellipsis_0.3.2

Clear

rm(list=ls())