适用背景
单细胞转录组分析中如果涉及到发育或具有时间序列的细胞谱系,往往需要进行拟时序分析,而最最最常用的拟时序分析软件就是Monocle,如今它已经出到了第三个版本monocle3。关于monocle3与monocle2之间的区别,可以参考这篇博客,有兴趣可以查看monocle3的官网教程。
安装
建议使用conda 安装
conda install -c conda-forge r-seurat
conda install -c conda-forge r-seuratobject
conda install -c conda-forge r-terra==1.5.12
conda install -c conda-forge r-units
conda install -c conda-forge r-raster
conda install -c conda-forge r-spdata
conda install -c conda-forge r-sf
conda install -c conda-forge r-spdep
conda install -c conda-forge r-ragg
conda install -c conda-forge r-ggrastr
conda install -c conda-forge libgit2
conda install -c conda-forge r-devtools
install.packages("BiocManager")
conda install -c anaconda cmake
在R里面进行最后的安装
BiocManager::install(c("nloptr", "lme4"))
if(!require(monocle3))devtools::install_github('cole-trapnell-lab/monocle3')
需要加载的R包
library(stringr)
library(ggplot2)
library(Seurat)
library(future)
library(rhdf5)
library(dplyr)
library(data.table)
library(Matrix)
library(rjson)
library(monocle3)
library(patchwork)
library(ComplexHeatmap)
library(RColorBrewer)
library(circlize)
运行Monocle3的主函数
obj是Seurat的对象,注意需要有pca和umap降维信息,assay选择使用的assay,ex.umap默认为F,使用Monocle3软件运行得到的umap,如果为T/TRUE,则使用Seurat对象的umap,group.by是细胞分组,label是输出文件的前缀,默认为’out’。
run_monocle3 <- function(obj,assay='RNA',ex.umap=F,group.by='seurat_clusters',label='out') {
all <- obj
all$ingroup <- all@meta.data[,group.by]
expression_matrix <- all@assays[assay][[1]]@counts
cell_metadata <- all@meta.data
gene_annotation <- data.frame(rownames(all), rownames(all))
rownames(gene_annotation) <- rownames(all)
colnames(gene_annotation) <- c("GeneSymbol", "gene_short_name")
NucSeq_cds <- new_cell_data_set(expression_matrix,
cell_metadata = cell_metadata,
gene_metadata = gene_annotation)
NucSeq_cds@int_colData$reducedDims$PCA<- all@reductions$pca@cell.embeddings
NucSeq_cds <- reduce_dimension(NucSeq_cds, reduction_method = 'UMAP', preprocess_method = "PCA")
NucSeq_cds$celltype <- NucSeq_cds$ingroup
jpeg('cds.umap.jpg')
p1 <- plot_cells(NucSeq_cds, reduction_method="UMAP", color_cells_by="celltype",show_trajectory_graph=F) + ggtitle('cds.umap')
print(p1)
dev.off()
NucSeq.coembed <- all
##import seurat umap
if(ex.umap){
cds.embed <- NucSeq_cds@int_colData$reducedDims$UMAP
int.embed <- Embeddings(NucSeq.coembed, reduction = "umap")
int.embed <- int.embed[rownames(cds.embed),]
NucSeq_cds@int_colData$reducedDims$UMAP <- int.embed
jpeg('int.umap.jpg')
p1 <- plot_cells(NucSeq_cds, reduction_method="UMAP", color_cells_by="celltype", ,show_trajectory_graph=F) + ggtitle('int.umap')
print(p1)
dev.off()
}
NucSeq_cds <- cluster_cells(NucSeq_cds, reduction_method='UMAP')
print(length(unique(partitions(NucSeq_cds))))
jpeg('partition.umap.jpg')
p1 <- plot_cells(NucSeq_cds, show_trajectory_graph = FALSE,group_label_size = 5,color_cells_by = "partition")
print(p1)
dev.off()
NucSeq_cds <- learn_graph(NucSeq_cds)
jpeg('celltype.umap.jpg')
p1 <- plot_cells(NucSeq_cds, color_cells_by = "celltype",
label_groups_by_cluster = FALSE, label_leaves = TRUE,
label_branch_points = TRUE,group_label_size = 5)
print(p1)
dev.off()
saveRDS(NucSeq_cds,paste0(label,".rds"))
meta <- data.frame(NucSeq_cds@colData@listData)
write.table(meta,paste0(label,"_metadata.xls"),sep='\t',quote=F)
}
选择轨迹起点的函数
首先获取细胞名列表,以下是两个例子:
cell_ids <- which(NucSeq_cds@clusters@listData[["UMAP"]][["clusters"]] == 5)
cell_ids <- which(cds$Celltypes == "OPCs")
NucSeq_cds是monocle3运行完上面run_monocle3主函数得到的monocle3对象,cell_ids是轨迹起点细胞名向量,label是输出文件的前缀,label_cell_groups、label_leaves、label_branch_points、graph_label_size和show_trajectory_graph是作图设置的参数,可自调。
select_root <- function(NucSeq_cds,cell_ids,label='out', label_cell_groups=FALSE,label_leaves=FALSE,label_branch_points=FALSE,
graph_label_size=1.5,show_trajectory_graph = F) {
closest_vertex <- NucSeq_cds@principal_graph_aux[["UMAP"]]$pr_graph_cell_proj_closest_vertex
closest_vertex <- as.matrix(closest_vertex[colnames(NucSeq_cds), ])
root_pr_nodes <- igraph::V(principal_graph(NucSeq_cds)[["UMAP"]])$name[as.numeric(names(which.max(table(closest_vertex[cell_ids,]))))]
NucSeq_cds <- order_cells(NucSeq_cds,root_pr_nodes=root_pr_nodes)
jpeg(paste0(label,'_select_root_umap.jpg'))
p1 <- plot_cells(NucSeq_cds,color_cells_by = "pseudotime",
label_cell_groups=label_cell_groups,label_leaves=label_leaves,label_branch_points=label_branch_points,
graph_label_size=graph_label_size, show_trajectory_graph = show_trajectory_graph)
print(p1)
dev.off()
}
基因的拟时序作图函数
NucSeq_cds是monocle3运行完上面run_monocle3主函数得到的monocle3对象,ingene是需要画的基因向量。
select_gene_plot <- function(NucSeq_cds,ingene=NULL) {
jpeg(paste0('select_gene_umap.jpg'))
p1 <- plot_cells(NucSeq_cds,color_cells_by = "pseudotime",label_cell_groups=F,label_leaves=T,label_branch_points=T,graph_label_size=1.5)
print(p1)
dev.off()
for (g in ingene) {
jpeg(paste0('gene_',g,'_select_gene_umap.jpg'))
p1 <- plot_cells(NucSeq_cds,
genes=g,
label_cell_groups=FALSE,
show_trajectory_graph=FALSE)
print(p1)
dev.off()
}
}
计算拟时序轨迹的相关基因及热图可视化
运行graph_test时会出现‘Error rBind“报错
#去除不需要的细胞亚群
NucSeq_cds <- NucSeq_cds[,names(NucSeq_cds@clusters@listData[["UMAP"]][["clusters"]])[NucSeq_cds@clusters@listData[["UMAP"]][["clusters"]] != 10]]
#这里有个bug,需要将calculateLW中的Matrix::rBind改为rbind才能运行graph_test
trace('calculateLW', edit = T, where = asNamespace("monocle3"))
sig_gene <- graph_test(NucSeq_cds, neighbor_graph="principal_graph", cores=8)
如果不想每次手动改,可以以下步骤
1、运行trace('calculateLW', edit = T, where = asNamespace("monocle3"))后会生成一个临时的R脚本,将其复制到一个新的R脚本文件
2、新的R脚本文件中修改tmp <- Matrix::rBind(tmp, cur_tmp) 为 tmp <- rbind(tmp, cur_tmp)(https://github.com/cole-trapnell-lab/monocle3/issues/512#issuecomment-1004224006)
3、新的R脚本文件中将jaccard_coeff全部替换为monocle3:::jaccard_coeff
4、运行trace('graph_test', edit = T, where = asNamespace("monocle3"))后复制代码到新的R脚本文件中并把名字改为graph_test_new。
5、加载这个新脚本后,运行graph_test_new函数。
6、calculateLW函数还需要将my.moran.test改为monocle3:::my.moran.test,my.geary.test改为monocle3:::my.geary.test。
函数obtain_sigene中,NucSeq_cds是monocle3运行完上面run_monocle3主函数得到的monocle3对象,obj之前输入的Seurat对象,neighbor_graph选择使用的graph,一般使用principal_graph就好,q_value.cutoff是q_value阈值,用于筛选基因,一般设为0.01,group.by是热图的细胞注释。热图可根据这个[网址](https://jokergoo.github.io/ComplexHeatmap-reference/book/heatmap-annotations.html)进行修改。
obtain_sigene <- function(NucSeq_cds, obj, neighbor_graph="principal_graph", cores=8,q_value.cutoff=0.01,group.by=NULL) {
NucSeq.coembed <- obj
source('~/graph_test_new.R')
sig_gene <- graph_test_new(NucSeq_cds, neighbor_graph=neighbor_graph, cores=cores)
write.table(sig_gene,'sig_gene.xls',sep='\t',quote=F)
sig_gene <- subset(sig_gene, q_value < q_value.cutoff)
genes <- row.names(subset(sig_gene, morans_I >= quantile(sig_gene$morans_I)[4]))
pt.matrix <- GetAssayData(NucSeq.coembed,slot='data', assay='RNA')[match(genes,rownames(NucSeq.coembed)),order(pseudotime(NucSeq_cds))]
pt.matrix.raw <- pt.matrix
pt.matrix <- t(apply(pt.matrix,1,function(x){smooth.spline(x,df=3)$y}))
pt.matrix = pt.matrix[!apply(pt.matrix, 1, sd) == 0, ]
pt.matrix = Matrix::t(scale(Matrix::t(pt.matrix), center = TRUE))
pt.matrix = pt.matrix[is.na(row.names(pt.matrix)) == FALSE, ]
# pt.matrix[is.nan(pt.matrix)] = 0
pt.matrix[pt.matrix > 3] = 3
pt.matrix[pt.matrix < -3] = -3
row_dist <- as.dist((1 - cor(Matrix::t(pt.matrix)))/2)
# row_dist[is.na(row_dist)] <- 1
library(ComplexHeatmap)
library(RColorBrewer)
library(circlize)
lab2=HeatmapAnnotation(foo = NucSeq.coembed@meta.data[,group.by])
hthc <- Heatmap(
pt.matrix,
name = "z-score",
col = colorRamp2(seq(from=-3,to=3,length=11),rev(brewer.pal(11, "RdYlGn"))),
show_row_names = FALSE,
show_column_names = FALSE,
#row_names_gp = gpar(fontsize = 6),
clustering_distance_rows = row_dist,
clustering_method_rows = "ward.D2",
clustering_method_columns = "ward.D2",
row_title_rot = 0,
cluster_rows = TRUE,
cluster_row_slices = FALSE,
cluster_columns = FALSE,
right_annotation = NULL,
top_annotation = lab2)
jpeg('heatmap_sig_genes.jpg')
print(hthc)
dev.off()
}
graph_test_new.R的代码
calculateLW <- function (cds, k, neighbor_graph, reduction_method, verbose = FALSE)
{
if (verbose) {
message("retrieve the matrices for Moran's I test...")
}
knn_res <- NULL
principal_g <- NULL
cell_coords <- reducedDims(cds)[[reduction_method]]
if (neighbor_graph == "knn") {
knn_res <- RANN::nn2(cell_coords, cell_coords, min(k +
1, nrow(cell_coords)), searchtype = "standard")[[1]]
}
else if (neighbor_graph == "principal_graph") {
pr_graph_node_coords <- cds@principal_graph_aux[[reduction_method]]$dp_mst
principal_g <- igraph::get.adjacency(cds@principal_graph[[reduction_method]])[colnames(pr_graph_node_coords),
colnames(pr_graph_node_coords)]
}
exprs_mat <- exprs(cds)
if (neighbor_graph == "knn") {
if (is.null(knn_res)) {
knn_res <- RANN::nn2(cell_coords, cell_coords, min(k +
1, nrow(cell_coords)), searchtype = "standard")[[1]]
}
links <- monocle3:::jaccard_coeff(knn_res[, -1], F)
links <- links[links[, 1] > 0, ]
relations <- as.data.frame(links)
colnames(relations) <- c("from", "to", "weight")
knn_res_graph <- igraph::graph.data.frame(relations,
directed = T)
knn_list <- lapply(1:nrow(knn_res), function(x) knn_res[x,
-1])
region_id_names <- colnames(cds)
id_map <- 1:ncol(cds)
names(id_map) <- id_map
points_selected <- 1:nrow(knn_res)
knn_list <- lapply(points_selected, function(x) id_map[as.character(knn_res[x,
-1])])
}
else if (neighbor_graph == "principal_graph") {
cell2pp_map <- cds@principal_graph_aux[[reduction_method]]$pr_graph_cell_proj_closest_vertex
if (is.null(cell2pp_map)) {
stop(paste("Error: projection matrix for each cell to principal",
"points doesn't exist, you may need to rerun learn_graph"))
}
cell2pp_map <- cell2pp_map[row.names(cell2pp_map) %in%
row.names(colData(cds)), , drop = FALSE]
cell2pp_map <- cell2pp_map[colnames(cds), ]
if (verbose) {
message("Identify connecting principal point pairs ...")
}
knn_res <- RANN::nn2(cell_coords, cell_coords, min(k +
1, nrow(cell_coords)), searchtype = "standard")[[1]]
principal_g_tmp <- principal_g
diag(principal_g_tmp) <- 1
cell_membership <- as.factor(cell2pp_map)
uniq_member <- sort(unique(cell_membership))
membership_matrix <- Matrix::sparse.model.matrix(~cell_membership +
0)
colnames(membership_matrix) <- levels(uniq_member)
feasible_space <- membership_matrix %*% Matrix::tcrossprod(principal_g_tmp[as.numeric(levels(uniq_member)),
as.numeric(levels(uniq_member))], membership_matrix)
links <- monocle3:::jaccard_coeff(knn_res[, -1], F)
links <- links[links[, 1] > 0, ]
relations <- as.data.frame(links)
colnames(relations) <- c("from", "to", "weight")
knn_res_graph <- igraph::graph.data.frame(relations,
directed = T)
tmp_a <- igraph::get.adjacency(knn_res_graph)
block_size <- 10000
num_blocks = ceiling(nrow(tmp_a)/block_size)
if (verbose) {
message("start calculating valid kNN graph ...")
}
tmp <- NULL
for (j in 1:num_blocks) {
if (j < num_blocks) {
block_a <- tmp_a[((((j - 1) * block_size) + 1):(j *
block_size)), ]
block_b <- feasible_space[((((j - 1) * block_size) +
1):(j * block_size)), ]
}
else {
block_a <- tmp_a[((((j - 1) * block_size) + 1):(nrow(tmp_a))),
]
block_b <- feasible_space[((((j - 1) * block_size) +
1):(nrow(tmp_a))), ]
}
cur_tmp <- block_a * block_b
if (is.null(tmp)) {
tmp <- cur_tmp
}
else {
tmp <- rbind(tmp, cur_tmp)
}
}
if (verbose) {
message("Calculating valid kNN graph, done ...")
}
region_id_names <- colnames(cds)
id_map <- 1:ncol(cds)
names(id_map) <- id_map
knn_list <- slam::rowapply_simple_triplet_matrix(slam::as.simple_triplet_matrix(tmp),
function(x) {
res <- which(as.numeric(x) > 0)
if (length(res) == 0)
res <- 0L
res
})
}
else {
stop("Error: unrecognized neighbor_graph option")
}
names(knn_list) <- id_map[names(knn_list)]
class(knn_list) <- "nb"
attr(knn_list, "region.id") <- region_id_names
attr(knn_list, "call") <- match.call()
lw <- spdep::nb2listw(knn_list, zero.policy = TRUE)
lw
}
graph_test_new <- function (cds, neighbor_graph = c("knn", "principal_graph"),
reduction_method = "UMAP", k = 25, method = c("Moran_I"),
alternative = "greater", expression_family = "quasipoisson",
cores = 1, verbose = FALSE)
{
neighbor_graph <- match.arg(neighbor_graph)
lw <- calculateLW(cds, k = k, verbose = verbose, neighbor_graph = neighbor_graph,
reduction_method = reduction_method)
if (verbose) {
message("Performing Moran's I test: ...")
}
exprs_mat <- SingleCellExperiment::counts(cds)[, attr(lw,
"region.id"), drop = FALSE]
sz <- size_factors(cds)[attr(lw, "region.id")]
wc <- spdep::spweights.constants(lw, zero.policy = TRUE,
adjust.n = TRUE)
test_res <- pbmcapply::pbmclapply(row.names(exprs_mat), FUN = function(x,
sz, alternative, method, expression_family) {
exprs_val <- exprs_mat[x, ]
if (expression_family %in% c("uninormal", "binomialff")) {
exprs_val <- exprs_val
}
else {
exprs_val <- log10(exprs_val/sz + 0.1)
}
test_res <- tryCatch({
if (method == "Moran_I") {
mt <- suppressWarnings(monocle3:::my.moran.test(exprs_val,
lw, wc, alternative = alternative))
data.frame(status = "OK", p_value = mt$p.value,
morans_test_statistic = mt$statistic, morans_I = mt$estimate[["Moran I statistic"]])
}
else if (method == "Geary_C") {
gt <- suppressWarnings(monocle3:::my.geary.test(exprs_val,
lw, wc, alternative = alternative))
data.frame(status = "OK", p_value = gt$p.value,
geary_test_statistic = gt$statistic, geary_C = gt$estimate[["Geary C statistic"]])
}
}, error = function(e) {
data.frame(status = "FAIL", p_value = NA, morans_test_statistic = NA,
morans_I = NA)
})
}, sz = sz, alternative = alternative, method = method, expression_family = expression_family,
mc.cores = cores, ignore.interactive = TRUE)
if (verbose) {
message("returning results: ...")
}
test_res <- do.call(rbind.data.frame, test_res)
row.names(test_res) <- row.names(cds)
test_res <- merge(test_res, rowData(cds), by = "row.names")
row.names(test_res) <- test_res[, 1]
test_res[, 1] <- NULL
test_res$q_value <- 1
test_res$q_value[which(test_res$status == "OK")] <- stats::p.adjust(subset(test_res,
status == "OK")[, "p_value"], method = "BH")
test_res$status = as.character(test_res$status)
test_res[row.names(cds), ]
}