This script uses the MuSiC algorithm to predict muscle nuclei type proportions from bulk RNAseq and a single nuclei reference.
Load required packages
library(tidyverse)
library(Biobase)
library(MuSiC)
library(SummarizedExperiment)
library(GEOquery)
Load data
load('../GOTO_Data/Cell_Counts/Muscle/snCounts.Rdata')
counts[is.na(counts)] <- 0
Save genes
gene_names <- rownames(counts)
Load functions
source("../GOTO_Data/RNAseq/goto.rnaseq.functions.R")
# Paths
pathIN_dat <- "../GOTO_Data/RNAseq/merge.gene.counts_biopet_13052016.RData"
pathIN_cov <- "../GOTO_Data/RNAseq/muscle_QC_covariates_filesv2.csv"
# Muscle complete pairs
filt.samp <- "tissue_muscle|qc_sexswitch|qc_multdim2|qc_rep1|complete_pairs"
# Read in
dat1 <- read.gotornaseq(pathIN_dat = pathIN_dat,
pathIN_cov,
filt.samp = filt.samp,
type="counts",
quiet = FALSE)
## ||| PREPARING GOTO RNASEQ DATA
## || READING DATA
## | Loading RNASEQ .. OK!
## [555 samples x 56520 features]
## | Reading COVARIATES .. OK!
## [maintaining 174 samples x 117 features]
## | Merging data .. OK!
## [555 samples x 56637 features]
## || SUBSETTING SAMPLES
## | Subsetting SAMPLES on ['tissue_muscle']; PASS: 174 out of 555
## | Subsetting SAMPLES on ['qc_sexswitch']; PASS: 172 out of 174
## | Subsetting SAMPLES on ['qc_multdim2']; PASS: 168 out of 172
## | Subsetting SAMPLES on ['qc_rep1']; PASS: 168 out of 168
## | Subsetting SAMPLES on ['complete_pairs']; PASS: 162 out of 168
## | DONE!
# Format
goto_counts <- dat1[[1]][,grep('ENS',colnames(dat1[[1]]))]
goto_counts <- as.data.frame(t(goto_counts))
colnames(goto_counts) <- str_c(dat1[[1]]$sampID2,
"_",
dat1[[1]]$visitnr)
# Map ensembl ID to name
ens2gene <- cinaR::grch37
m <- match(rownames(goto_counts), ens2gene$ensgene)
mapped.genes <- ens2gene$symbol[m]
Remove genes
removed.genes <- duplicated(mapped.genes) | is.na(mapped.genes) | grepl("^MT", mapped.genes)
goto_counts <- goto_counts[!removed.genes,]
rownames(goto_counts) <- mapped.genes[!removed.genes]
Subset snRNA genes
goto_counts <- goto_counts[gene_names,]
goto_counts <- goto_counts[!rowSums(is.na(goto_counts)) == 162,]
Save goto genes
goto_genes <- rownames(goto_counts)
Save overlapping genes
bin <- gene_names %in% goto_genes
xtabs(~bin)
## bin
## FALSE TRUE
## 4858 16310
new_names <- gene_names[bin]
counts <- counts[bin,]
goto_counts <- goto_counts[new_names,]
Load clusters
load('../GOTO_Data/Cell_Counts/Muscle/snClusters.Rdata')
Annotate clusters
cluster_key <- data.frame(
seurat_clusters = c(0,1,2,3,4,5,6,7),
cell_type = c('Fast Skeletal Muscle',
'Slow Skeletal Muscle',
'FAPs',
'Endothelial cell',
'T/NK/Macrophages',
'SATs',
'Smooth Muscle Cell',
'Slow Skeletal Muscle')
)
nuclei_clusters <- nuclei_clusters %>%
mutate(
cell_type = case_when(
seurat_clusters == 0 ~ "Fast Skeletal Muscle",
seurat_clusters == 1 ~ "Slow Skeletal Muscle",
seurat_clusters == 2 ~ "FAPs",
seurat_clusters == 3 ~ "Endothelial Cells",
seurat_clusters == 4 ~ "T/NK/Macrophages",
seurat_clusters == 5 ~ "SATs",
seurat_clusters == 6 ~ "Smooth Muscle",
seurat_clusters == 7 ~ 'Slow Skeletal Muscle'
)
)
nuclei_clusters <- nuclei_clusters %>%
dplyr::select(cell_type)
head(nuclei_clusters)
## cell_type
## HM1_AAACCTGAGACTACAA Slow Skeletal Muscle
## HM1_AAACCTGAGCATGGCA FAPs
## HM1_AAACCTGAGCTAACTC Slow Skeletal Muscle
## HM1_AAACCTGAGGGATCTG Fast Skeletal Muscle
## HM1_AAACCTGCAACTTGAC Fast Skeletal Muscle
## HM1_AAACCTGCACAAGACG Slow Skeletal Muscle
cell_types <- nuclei_clusters[colnames(counts),]
cell_types[1:5]
## [1] "Slow Skeletal Muscle" "FAPs" "Slow Skeletal Muscle"
## [4] "Fast Skeletal Muscle" "Fast Skeletal Muscle"
Transform and label samples
meta <- as.data.frame(cell_types)
colnames(meta) <-'Cell'
meta$Sample <- colnames(counts)
rownames(meta) <- colnames(counts)
Remove non-zero genes
nz.gene = rownames(counts)[(rowSums(counts) != 0)]
counts <- counts[nz.gene,]
dim(counts)
## [1] 16310 140000
Convert counts to matrix
dGC_to_matrix <- function(x,ncol_break = 49999){
if(length(colnames(x))>(ncol_break+1)){
total_cols = length(colnames(x))
the_seq <- c(seq(1,total_cols,ncol_break), total_cols)
the_seq <- unique(the_seq)
}
matrix_list <- list()
total_parts <- length(the_seq)-1
for(i in 1:total_parts){
start_no = ifelse(i==1,1,the_seq[i]+1)
print(paste0(i, " is i"))
print(paste0("start_no is ", start_no))
end_no = the_seq[i+1]
print(paste0("part_number: ", i, ";cols-",start_no,":",end_no))
matrix_list[[i]] <- as.matrix(x[,start_no:end_no,drop = F])
}
return(do.call(cbind, matrix_list))
}
full_mtx <- dGC_to_matrix(counts, 50000)
## [1] "1 is i"
## [1] "start_no is 1"
## [1] "part_number: 1;cols-1:50001"
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 6.1 GiB
## [1] "2 is i"
## [1] "start_no is 50002"
## [1] "part_number: 2;cols-50002:100001"
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 6.1 GiB
## [1] "3 is i"
## [1] "start_no is 100002"
## [1] "part_number: 3;cols-100002:140000"
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 4.9 GiB
C.eset <- Biobase::ExpressionSet(
assayData = full_mtx,
phenoData = Biobase::AnnotatedDataFrame(meta))
C.eset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 16310 features, 140000 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: HM1_AAACCTGAGACTACAA HM1_AAACCTGAGCATGGCA ...
## HM23_TCTCTAATCATGTCTT (140000 total)
## varLabels: Cell Sample
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
T.eset <- Biobase::ExpressionSet(assayData = as.matrix(goto_counts))
T.eset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 16310 features, 162 samples
## element names: exprs
## protocolData: none
## phenoData: none
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
deconv <- music_prop(
bulk.eset = T.eset,
sc.eset = C.eset,
clusters = 'Cell',
markers = NULL,
normalize = FALSE,
samples = 'Sample',
verbose = F)$Est.prop.weighted
Summarize
names <- colnames(deconv)
summary(deconv)
## Slow Skeletal Muscle FAPs Fast Skeletal Muscle
## Min. :0.0000 Min. :0.0000000 Min. :0.001605
## 1st Qu.:0.1848 1st Qu.:0.0000000 1st Qu.:0.105158
## Median :0.3735 Median :0.0000000 Median :0.219360
## Mean :0.3353 Mean :0.0007975 Mean :0.278944
## 3rd Qu.:0.4749 3rd Qu.:0.0000000 3rd Qu.:0.436309
## Max. :0.6957 Max. :0.0208963 Max. :0.786071
## SATs T/NK/Macrophages Endothelial Cells Smooth Muscle
## Min. :0.0000000 Min. :0.0000000 Min. :0.2093 Min. :0.0000000
## 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.3262 1st Qu.:0.0000000
## Median :0.0000000 Median :0.0007591 Median :0.3754 Median :0.0000000
## Mean :0.0003633 Mean :0.0071134 Mean :0.3774 Mean :0.0001438
## 3rd Qu.:0.0000000 3rd Qu.:0.0088130 3rd Qu.:0.4291 3rd Qu.:0.0000000
## Max. :0.0216730 Max. :0.0909209 Max. :0.5930 Max. :0.0090539
heatmap(deconv, margins=c(12,8))
save(deconv, file='../GOTO_Data/Cell_Counts/Muscle/Music_PredictedMuscleCells.Rdata')
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] GEOquery_2.62.2
## [2] MuSiC_0.2.0
## [3] nnls_1.4
## [4] gplots_3.1.3
## [5] plotly_4.10.1
## [6] SeuratObject_4.1.3
## [7] Seurat_4.3.0
## [8] gridExtra_2.3
## [9] lattice_0.21-8
## [10] bacon_1.22.0
## [11] ellipse_0.4.5
## [12] methylGSA_1.12.0
## [13] sva_3.42.0
## [14] genefilter_1.76.0
## [15] mgcv_1.8-42
## [16] nlme_3.1-162
## [17] limma_3.54.2
## [18] lmerTest_3.1-3
## [19] lme4_1.1-30
## [20] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
## [21] snpStats_1.44.0
## [22] survival_3.5-5
## [23] ggrepel_0.9.1
## [24] ggfortify_0.4.14
## [25] irlba_2.3.5.1
## [26] Matrix_1.5-4.1
## [27] omicsPrint_1.14.0
## [28] MASS_7.3-60
## [29] DNAmArray_2.0.0
## [30] pls_2.8-2
## [31] FDb.InfiniumMethylation.hg19_2.2.0
## [32] org.Hs.eg.db_3.14.0
## [33] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [34] GenomicFeatures_1.46.5
## [35] AnnotationDbi_1.56.2
## [36] IlluminaHumanMethylationEPICmanifest_0.3.0
## [37] minfi_1.40.0
## [38] bumphunter_1.36.0
## [39] locfit_1.5-9.8
## [40] iterators_1.0.14
## [41] foreach_1.5.2
## [42] Biostrings_2.62.0
## [43] XVector_0.34.0
## [44] SummarizedExperiment_1.24.0
## [45] Biobase_2.58.0
## [46] MatrixGenerics_1.10.0
## [47] matrixStats_1.0.0
## [48] GenomicRanges_1.46.1
## [49] GenomeInfoDb_1.34.9
## [50] IRanges_2.32.0
## [51] S4Vectors_0.36.2
## [52] BiocGenerics_0.44.0
## [53] BiocParallel_1.32.6
## [54] MethylAid_1.28.0
## [55] forcats_0.5.2
## [56] stringr_1.5.0
## [57] dplyr_1.1.3
## [58] purrr_0.3.4
## [59] readr_2.1.2
## [60] tidyr_1.2.1
## [61] tibble_3.2.1
## [62] ggplot2_3.4.3
## [63] tidyverse_1.3.2
## [64] rmarkdown_2.16
##
## loaded via a namespace (and not attached):
## [1] ica_1.0-3
## [2] Rsamtools_2.10.0
## [3] cinaR_0.2.3
## [4] lmtest_0.9-40
## [5] crayon_1.5.2
## [6] rhdf5filters_1.10.1
## [7] backports_1.4.1
## [8] reprex_2.0.2
## [9] GOSemSim_2.20.0
## [10] rlang_1.1.1
## [11] ROCR_1.0-11
## [12] readxl_1.4.1
## [13] SparseM_1.81
## [14] nloptr_2.0.3
## [15] filelock_1.0.2
## [16] rjson_0.2.21
## [17] bit64_4.0.5
## [18] glue_1.6.2
## [19] sctransform_0.3.5
## [20] rngtools_1.5.2
## [21] spatstat.sparse_3.0-1
## [22] mcmc_0.9-7
## [23] spatstat.geom_3.2-1
## [24] DOSE_3.20.1
## [25] haven_2.5.1
## [26] tidyselect_1.2.0
## [27] fitdistrplus_1.1-11
## [28] XML_3.99-0.14
## [29] zoo_1.8-12
## [30] GenomicAlignments_1.30.0
## [31] MatrixModels_0.5-1
## [32] xtable_1.8-4
## [33] magrittr_2.0.3
## [34] evaluate_0.21
## [35] cli_3.6.1
## [36] zlibbioc_1.44.0
## [37] miniUI_0.1.1.1
## [38] rstudioapi_0.14
## [39] doRNG_1.8.6
## [40] sp_1.6-1
## [41] MultiAssayExperiment_1.20.0
## [42] bslib_0.5.0
## [43] fastmatch_1.1-3
## [44] treeio_1.18.1
## [45] shiny_1.7.2
## [46] xfun_0.39
## [47] askpass_1.1
## [48] multtest_2.50.0
## [49] cluster_2.1.4
## [50] caTools_1.18.2
## [51] tidygraph_1.2.2
## [52] KEGGREST_1.34.0
## [53] quantreg_5.94
## [54] base64_2.0.1
## [55] ape_5.7-1
## [56] scrime_1.3.5
## [57] listenv_0.9.0
## [58] png_0.1-8
## [59] reshape_0.8.9
## [60] future_1.32.0
## [61] withr_2.5.0
## [62] bitops_1.0-7
## [63] ggforce_0.3.4
## [64] plyr_1.8.8
## [65] cellranger_1.1.0
## [66] coda_0.19-4
## [67] pillar_1.9.0
## [68] cachem_1.0.8
## [69] fs_1.6.2
## [70] clusterProfiler_4.2.2
## [71] DelayedMatrixStats_1.16.0
## [72] vctrs_0.6.3
## [73] ellipsis_0.3.2
## [74] generics_0.1.3
## [75] tools_4.2.2
## [76] munsell_0.5.0
## [77] tweenr_2.0.2
## [78] fgsea_1.20.0
## [79] DelayedArray_0.24.0
## [80] abind_1.4-5
## [81] fastmap_1.1.1
## [82] compiler_4.2.2
## [83] httpuv_1.6.11
## [84] rtracklayer_1.54.0
## [85] beanplot_1.3.1
## [86] MCMCpack_1.6-3
## [87] GenomeInfoDbData_1.2.9
## [88] edgeR_3.40.2
## [89] deldir_1.0-9
## [90] utf8_1.2.3
## [91] later_1.3.1
## [92] RobustRankAggreg_1.2.1
## [93] BiocFileCache_2.2.1
## [94] jsonlite_1.8.5
## [95] scales_1.2.1
## [96] pbapply_1.7-0
## [97] tidytree_0.4.0
## [98] sparseMatrixStats_1.10.0
## [99] lazyeval_0.2.2
## [100] promises_1.2.0.1
## [101] goftest_1.2-3
## [102] spatstat.utils_3.0-3
## [103] reticulate_1.30
## [104] htm2txt_2.2.2
## [105] nor1mix_1.3-0
## [106] cowplot_1.1.1
## [107] statmod_1.5.0
## [108] siggenes_1.68.0
## [109] Rtsne_0.16
## [110] downloader_0.4
## [111] uwot_0.1.14
## [112] igraph_1.4.3
## [113] HDF5Array_1.22.1
## [114] numDeriv_2016.8-1.1
## [115] yaml_2.3.7
## [116] htmltools_0.5.5
## [117] memoise_2.0.1
## [118] BiocIO_1.8.0
## [119] graphlayouts_0.8.1
## [120] quadprog_1.5-8
## [121] viridisLite_0.4.2
## [122] digest_0.6.31
## [123] assertthat_0.2.1
## [124] mime_0.12
## [125] rappdirs_0.3.3
## [126] RSQLite_2.2.17
## [127] yulab.utils_0.0.6
## [128] future.apply_1.11.0
## [129] data.table_1.14.8
## [130] blob_1.2.4
## [131] preprocessCore_1.60.2
## [132] splines_4.2.2
## [133] labeling_0.4.2
## [134] Rhdf5lib_1.20.0
## [135] illuminaio_0.40.0
## [136] googledrive_2.0.0
## [137] RaggedExperiment_1.18.0
## [138] RCurl_1.98-1.12
## [139] broom_1.0.1
## [140] hms_1.1.2
## [141] modelr_0.1.9
## [142] rhdf5_2.42.1
## [143] colorspace_2.1-0
## [144] aplot_0.1.7
## [145] sass_0.4.6
## [146] Rcpp_1.0.10
## [147] mclust_6.0.0
## [148] RANN_2.6.1
## [149] enrichplot_1.14.2
## [150] fansi_1.0.4
## [151] tzdb_0.4.0
## [152] parallelly_1.36.0
## [153] R6_2.5.1
## [154] grid_4.2.2
## [155] ggridges_0.5.4
## [156] lifecycle_1.0.3
## [157] curl_5.0.1
## [158] googlesheets4_1.0.1
## [159] minqa_1.2.5
## [160] leiden_0.4.3
## [161] jquerylib_0.1.4
## [162] DO.db_2.9
## [163] qvalue_2.26.0
## [164] RcppAnnoy_0.0.20
## [165] RColorBrewer_1.1-3
## [166] spatstat.explore_3.1-0
## [167] htmlwidgets_1.5.4
## [168] polyclip_1.10-4
## [169] biomaRt_2.50.3
## [170] missMethyl_1.28.0
## [171] shadowtext_0.1.2
## [172] timechange_0.2.0
## [173] gridGraphics_0.5-1
## [174] reactome.db_1.77.0
## [175] rvest_1.0.3
## [176] globals_0.16.2
## [177] openssl_2.0.6
## [178] spatstat.random_3.1-5
## [179] patchwork_1.1.2
## [180] progressr_0.13.0
## [181] codetools_0.2-19
## [182] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.0
## [183] lubridate_1.9.2
## [184] GO.db_3.14.0
## [185] gtools_3.9.4
## [186] prettyunits_1.1.1
## [187] dbplyr_2.2.1
## [188] gridBase_0.4-7
## [189] gtable_0.3.3
## [190] DBI_1.1.3
## [191] tensor_1.5
## [192] ggfun_0.0.7
## [193] httr_1.4.6
## [194] highr_0.10
## [195] KernSmooth_2.23-21
## [196] stringi_1.7.12
## [197] vroom_1.5.7
## [198] progress_1.2.2
## [199] reshape2_1.4.4
## [200] farver_2.1.1
## [201] annotate_1.72.0
## [202] viridis_0.6.2
## [203] hexbin_1.28.3
## [204] ggtree_3.2.1
## [205] xml2_1.3.4
## [206] boot_1.3-28.1
## [207] restfulr_0.0.15
## [208] scattermore_0.8
## [209] ggplotify_0.1.0
## [210] bit_4.0.5
## [211] spatstat.data_3.0-1
## [212] scatterpie_0.1.8
## [213] ggraph_2.0.6
## [214] pkgconfig_2.0.3
## [215] gargle_1.5.0
## [216] knitr_1.43
Clear
rm(list=ls())