背景:单细胞转录组数据的质量控制,质量控制去除(1)低质量的细胞(2)双胞(3)RNA污染的细胞
- 1.去除低质量的细胞
什么样的细胞是低质量的细胞呢?通常是检测到的基因数目少、计数深度低或者线粒体等基因的比例比较高的细胞。
去除低质量细胞的阈值对象:
(1)每个barcode(细胞)的计数数量(计数深度、测序深度)
(2)每个barcode的基因数量
(3)每个barcode的线粒体基因(核糖体、血红蛋白基因)比例
library(Seurat)
library(ggplot2)
library(ggpubr)
sc<-readRDS("./sc.merge.rds")
sc<-CreateSeuratObject(sc$RNA@counts,meta.data=sc@meta.data,min.cells=3) #'筛选基因,保留在大于3个细胞中表达的基因
sc[["percent.mt"]] <- PercentageFeatureSet(sc, pattern="^MT-") #'人类中线粒体基因以MT开头,因此计算MT开头的基因比例
sc[["percent.Ribo"]] <- PercentageFeatureSet(sc, pattern="^RP[SL]") #'计算以RPS或者RPL开头的基因比例(核糖体基因)
HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ")
HB.genes <- CaseMatch(HB.genes, rownames(sc))
sc[["percent.HB"]] <- PercentageFeatureSet(sc, features=HB.genes) #'血红蛋白基因比例
qc_before<- VlnPlot(sc, group.by="orig.ident",
features=c("nFeature_RNA","nCount_RNA","percent.mt","percent.Ribo","percent.HB"),
pt.size=0,
ncol=2)
ggsave(qc_before,file="qc_before.png", width=12, height=6)
#'质控前的相关阈值情况
Feature_ber1 <- FeatureScatter(sc, feature1="nCount_RNA",
feature2="nFeature_RNA",
group.by="orig.ident")
Feature_ber2 <- FeatureScatter(sc, feature1 = 'nCount_RNA',
feature2="percent.mt",
group.by="orig.ident")
Feature_ber3 <- FeatureScatter(sc, feature1 = 'nCount_RNA',
feature2="percent.Ribo",
group.by="orig.ident")
Feature_ber4 <- FeatureScatter(sc, feature1 = 'nCount_RNA',
feature2="percent.HB",
group.by="orig.ident")
Feature_ber1 <- Feature_ber1 +theme(legend.position="none")
Feature_ber2 <- Feature_ber2 +theme(legend.position="none")
Feature_ber3 <- Feature_ber3 +theme(legend.position="none")
Feature_ber4 <- Feature_ber4 +theme(legend.position="none")
Feature_ber <- ggarrange(Feature_ber1, Feature_ber2, Feature_ber3,Feature_ber4, ncol=4, nrow=1, widths=c(1,1,1,1))
ggsave(Feature_ber, file="Feature_ber_before.png", width=12, height=6)
开始质控:
sc.sub=subset(sc,nFeature_RNA > 200 & nFeature_RNA < 2500 & nCount_RNA > 500 & nCount_RNA < 10000 & percent.mt < 5 & percent.HB<5 & percent.Ribo<10) #'筛选细胞,规则是细胞的基因数目在200-2500之间,测序深度(基因表达总量)在500-10000,线粒体比例低于5%,血红蛋白基因低>于5%,核糖体基因低于10%
qc_after<- VlnPlot(sc.sub, group.by="orig.ident",
features=c("nFeature_RNA","nCount_RNA","percent.mt","percent.Ribo","percent.HB"),
pt.size=0,
ncol=2)
ggsave(qc_after,file="qc_after.png", width=12, height=6)
Feature_ber1 <- FeatureScatter(sc.sub, feature1="nCount_RNA",
feature2="nFeature_RNA",
group.by="orig.ident")
Feature_ber2 <- FeatureScatter(sc.sub, feature1 = 'nCount_RNA',
feature2="percent.mt",
group.by="orig.ident")
Feature_ber3 <- FeatureScatter(sc.sub, feature1 = 'nCount_RNA',
feature2="percent.Ribo",
group.by="orig.ident")
Feature_ber4 <- FeatureScatter(sc.sub, feature1 = 'nCount_RNA',
feature2="percent.HB",
group.by="orig.ident")
Feature_ber1 <- Feature_ber1 +theme(legend.position="none")
Feature_ber2 <- Feature_ber2 +theme(legend.position="none")
Feature_ber3 <- Feature_ber3 +theme(legend.position="none")
Feature_ber4 <- Feature_ber4 +theme(legend.position="none")
Feature_ber <- ggarrange(Feature_ber1, Feature_ber2, Feature_ber3,Feature_ber4, ncol=4, nrow=1, widths=c(1,1,1,1))
ggsave(Feature_ber, file="Feature_ber_after.png", width=12, height=6)
可以看到留下来的细胞的基因数目都在200-2500之间,基因表达总量在300-10000之间,线粒体比例低于5%,核糖体基因比例低于10%。
- 2.去除双胞
“双细胞被定义为在相同的细胞条形码(barcodes)下进行测序的两个细胞,例如,两个细胞被捕获在同一个液滴(droplet)中。这也是为什么我们一直使用barcodes而不是cells的原因。”除了使用算法进行双细胞的识别外,一些别的指标也能进行提示:例如,同一个聚类类群中的marker基因来自不同的细胞类型、某个类群中的基因数目或者基因表达量异常高......
这里测试的是DoubletFinder:
#'obj the seurat obj
#'本过程不会对原始obj作任何数据预处理
myDoublet<-function(obj,assay="RNA",outpath="./",group="seurat_clusters",dims=30,resolution=0.5,replace=F){
library(Seurat)
library(DoubletFinder)
library(dplyr)
ls("package:DoubletFinder")
sc<-obj
#sc$group=sc@meta.data[,group]
if(assay=="RNA"){
DefaultAssay(sc)<-"RNA"
data <- sc %>%NormalizeData() %>% FindVariableFeatures(nfeatures=2000) %>% ScaleData() %>% RunPCA(assay="RNA",verbose=FALSE)%>%FindNeighbors(reduction = "pca", dims = 1:dims)%>%FindClusters(verbose = FALSE,resolution=resolution)%>%RunTSNE(dim.use=1:dims)
}else{
DefaultAssay(sc)<-"Spatial"
data <- sc %>%SCTransform(sc, assay = "Spatial",do.scale=TRUE, do.center=TRUE,verbose = FALSE) %>% RunPCA()%>%RunTSNE(dim.use=1:dims)
}
sweep.res.list <- paramSweep(data, PCs = 1:30, sct = F,num.cores=6)
sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
bcmvn <- find.pK(sweep.stats) #'可以看到最佳参数的点
pK_bcmvn <- bcmvn$pK[which.max(bcmvn$BCmetric)] %>% as.character() %>% as.numeric() #'提取最佳pk值
DoubletRate = ncol(data)*8*1e-6 #'每增加1000个细胞,双胞概率增加0.008
homotypic.prop <- modelHomotypic(data@meta.data[,group]) #'默认seurat_clusters
nExp_poi <- round(DoubletRate*ncol(data))
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))
data <- doubletFinder(data, PCs = 1:30, pN = 0.25, pK = pK_bcmvn,
nExp = nExp_poi, reuse.pANN = F, sct = F)
data <- doubletFinder(data, PCs = 1:30, pN = 0.25, pK = pK_bcmvn,
nExp = nExp_poi.adj, reuse.pANN = F, sct = F)
data@meta.data[,"DF_hi.lo"] <- data@meta.data[,length(colnames(data@meta.data))]
data@meta.data$DF_hi.lo[which(data@meta.data$DF_hi.lo == "Doublet" & data@meta.data[,length(colnames(data@meta.data))-3] == "Singlet")] <- "Doublet-Low Confidience"
data@meta.data$DF_hi.lo[which(data@meta.data$DF_hi.lo == "Doublet")] <- "Doublet-High Confidience"
table(data@meta.data$DF_hi.lo) #'查看细胞类型(双胞或者单胞)分布
#'结果可视化
png(paste0(outpath,"01.",assay,"_",group,"_tsne.png"),2500,1800,res=300)
pp<-DimPlot(data, reduction = "tsne", group.by =group)
print(pp)
dev.off()
png(paste0(outpath,"01.",assay,"_",group,"_doubletFinder.png"),2500,1800,res=300)
pp<-DimPlot(data, reduction = "tsne", group.by ="DF_hi.lo",cols =c("black","red","gold"))
print(pp)
dev.off()
saveRDS(data,paste0(outpath,'01.sc_rmdoublet.rds'))#'保存数据的rds文件
if(replace==T){
sc@meta.data$DF_hi.lo<-data@meta.data[match(rownames(sc@meta.data),rownames(data@meta.data)),"DF_hi.lo"]
return(sc)
}
注意:理论上去除双胞应该在上一步细胞的基因数目等阈值质控之前进行。
- 3.去除周期相关的基因
#'去除周期基因的影响
#'对于assay="RNA"
library(Seurat)
library(dplyr)
sc<-readRDS("./sc.merge.rds")
sc<-sc%>%NormalizeData()%>%ScaleData(features=rownames(sc))%>%FindVariableFeatures() %>% RunPCA() #'数据预处理
#'首先不管是sct还是log标准化,随便选一个方法跑到RunPCA
#'然后使用seurat中内置的资源 周期基因cc.genes
cc.genes<-cc.genes.updated.2019
g2m_genes <- cc.genes$g2m.genes
g2m_genes <- CaseMatch(search=g2m_genes, match=rownames(sc))
s_genes <- cc.genes$s.genes
s_genes <- CaseMatch(search=s_genes, match=rownames(sc))
#'这里cc.genes默认是人类的大写基因
#'但是小鼠的即使没有转换大小写也能自动匹配
sc <- CellCycleScoring(object=sc, g2m.features=g2m_genes, s.features=s_genes,set.ident=TRUE) #'计算细胞周期
#'meta.data中多了几列S.Score G2M.Score Phase old.ident
#'可视化一下周期基因pca
cycleplot <- DimPlot(sc, reduction="pca", group.by="Phase")
ggsave(cycleplot, file="cycleplot_before_1.png", width=12, height=6)
sc <- RunPCA(sc, features=c(s_genes,g2m_genes))
cycleplot <- DimPlot(sc, reduction="pca", group.by="Phase")
ggsave(cycleplot, file="cycleplot_before_2.png", width=12, height=6)
#'对周期基因进行回归处理 关键函数是scale
sc<-sc%>%NormalizeData()%>%ScaleData(features=rownames(sc),vars.to.regress = c("S.Score", "G2M.Score"))%>%FindVariableFeatures()
sc<-RunPCA(sc,assay="RNA",verbose=FALSE)
#'这里需要注意一下,如果是空转的数据,应该走sct
#'sc<-SCTransform(sc, assay = "Spatial", vars.to.regress = c("S.Score", "G2M.Score"),do.scale=TRUE, do.center=TRUE,verbose = FALSE)
if(T){ #'周期基因可视化
cyclesce<-sc
cyclesce<- RunPCA(cyclesce, assay="RNA",features = VariableFeatures(cyclesce), nfeatures.print = 10)
cycleplot<-DimPlot(cyclesce,cols = pal_npg("nrc", alpha = 0.7)(3))
ggsave(cycleplot, file="cycleplot_after_1.png", width=130, height=100,units="mm",limitsize=FALSE)
cyclesce<- RunPCA(cyclesce, assay=assay,features = c(s.genes, g2m.genes))
cycleplot<-DimPlot(cyclesce,cols = pal_npg("nrc", alpha = 0.7)(3))
ggsave(cycleplot, file="cycleplot_after_2.png", width=130, height=100,units="mm",limitsize=FALSE)
}
注意:谨慎使用。
- 去除RNA污染
方法一:DecontX
myDecontX<-function(obj,assay="RNA",outpath="./",my.off=0.2,dims=30,resolution=0.5,group="seurat_clusters"){
library(Seurat)
library(decontX)
library(ggplot2)
library(dplyr)
sc<-obj
if(assay=="RNA"){
my.counts<-sc$RNA@counts
}else{
my.counts<-sc$Spatial@counts
}
sc<-sc%>%SCTransform(verbose=F,do.scale=T)%>% RunPCA(assay="SCT",verbose=FALSE)%>%FindNeighbors(reduction = "pca", dims = 1:dims)%>%FindClusters(verbose = FALSE,resolution=resolution)%>%RunUMAP(dims=1:dims)
decontX_results <- decontX(my.counts)
str(decontX_results)
sc$contamination=decontX_results$contamination
head(sc@meta.data)
png(paste0(outpath,"01.",assay,"_decontx_contamination_before.png"),2500,1800,res=300)
pp<-FeaturePlot(sc,
features = 'contamination',
raster=FALSE # 细胞过多时候需要加这个参数
) +
scale_color_viridis_c()+
theme_bw()+
theme(panel.grid = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank())
print(pp)
dev.off()
png(paste0(outpath,"01.",assay,"_decontx_umap_before.png"),2500,1800,res=300)
pp<-DimPlot(sc, reduction = "umap", group.by =group)
print(pp)
dev.off()
sc = sc[,sc$contamination < my.off]
png(paste0(outpath,"01.",assay,"_decontx_contamination_afetr.png"),2500,1800,res=300)
pp<-FeaturePlot(sc,
features = 'contamination',
raster=FALSE # 细胞过多时候需要加这个参数
) +
scale_color_viridis_c()+
theme_bw()+
theme(panel.grid = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank())
print(pp)
dev.off()
png(paste0(outpath,"01.",assay,"_decontx_umap_afetr.png"),2500,1800,res=300)
pp<-DimPlot(sc, reduction = "umap", group.by =group)
print(pp)
dev.off()
if(assay=="Spatial"){
print(paste0("the filtered-spot-gene counts's dim is gene:",nrow(sc$Spatial$counts),";spot:",ncol(sc$Spatial$counts)))
nf.before=nrow(obj$Spatial$counts)
nf.after=nrow(sc$Spatial$counts)
ns.before=ncol(obj$Spatial$counts)
ns.after=ncol(sc$Spatial$counts)
spot.s=paste0(as.character(round(ns.after/ns.before, 3)*100),"%")
feature.s=paste0(as.character(round(nf.after/nf.before, 3)*100),"%")
qc.table=data.frame(type_number=c("nSpot","nFeature"),filter_before=c(as.character(ns.before),as.character(nf.before)),filter_after=c(as.character(ns.after),as.character(nf.after)),percent=c(spot.s,feature.s))
}else{
print(paste0("the filtered-cell-gene counts's dim is gene:",nrow(sc$RNA$counts),";cell:",ncol(sc$RNA$counts)))
nf.before=nrow(obj$RNA$counts)
nf.after=nrow(sc$RNA$counts)
ns.before=ncol(obj$RNA$counts)
ns.after=ncol(sc$RNA$counts)
spot.s=paste0(as.character(round(ns.after/ns.before, 3)*100),"%")
feature.s=paste0(as.character(round(nf.after/nf.before, 3)*100),"%")
qc.table=data.frame(type_number=c("nCell","nFeature"),filter_before=c(as.character(ns.before),as.character(nf.before)),filter_after=c(as.character(ns.after),as.character(nf.after)),percent=c(spot.s,feature.s))
}
write.csv(qc.table,paste0(outpath,"01.",assay,"_decontx_qc.csv"),row.names=F)
print(head(qc.table))
return(sc)
}
和soupx不同之处在于,不会修改原始矩阵表达,只是根据阈值对细胞进行过滤,因此会减少细胞的数目。
方法二:soupx
#'raw.path the 10x raw matrix
#'fit.path the 10x filtered matrix
mySoupX<-function(raw.path,fit.path,outpath="./",my.off=0.2,assay="RNA",group="seurat_clusters",dims=30,resolution=0.5){
library(SoupX)
library(Seurat)
library(DropletUtils)
library(ggplot2)
library(DoubletFinder)
library(knitr)
library(dplyr)
filt.matrix <- Read10X(fit.path)
raw.matrix <- Read10X(raw.path)
raw.matrix<-raw.matrix[rownames(filt.matrix),]
srat <- CreateSeuratObject(counts = filt.matrix)
soup.channel <- SoupChannel(raw.matrix, filt.matrix)
srat <- SCTransform(srat, verbose = F,do.scale=T)
srat <- RunPCA(srat,assay="SCT", verbose = F)
srat <- RunUMAP(srat, dims = 1:dims, verbose = F)
srat <- FindNeighbors(srat,reduction="pca", dims = 1:dims, verbose = F)
srat <- FindClusters(srat, verbose = F,resolution=resolution)
meta <- srat@meta.data
umap <- srat@reductions$umap@cell.embeddings
soup.channel <- setClusters(soup.channel, setNames(meta$seurat_clusters, rownames(meta)))
soup.channel <- setDR(soup.channel, umap)
head(meta)
soup.channel <- autoEstCont(soup.channel)
soup.channel<-setContaminationFraction(soup.channel, my.off) #'阈值my.off
adj.matrix <- adjustCounts(soup.channel, roundToInt = T) #'校正矩阵
png(paste0(outpath,"01.",assay,"_soupx_before.png"),2500,1800,res=300)
pp<-DimPlot(srat, reduction = "umap", group.by =group)
print(pp)
dev.off()
srat_1<-CreateSeuratObject(counts = adj.matrix)
srat_1<-srat_1%>%SCTransform(verbose=F,do.scale=T)%>% RunPCA(assay="SCT",verbose=FALSE)%>%FindNeighbors(reduction = "pca", dims = 1:dims)%>%FindClusters(verbose = FALSE,resolution=resolution)%>%RunUMAP(dims=1:dims)
png(paste0(outpath,"01.",assay,"_soupx_after.png"),2500,1800,res=300)
pp<-DimPlot(srat_1, reduction = "umap", group.by =group)
print(pp)
dev.off()
DropletUtils:::write10xCounts(paste0(outpath,"01.soupX_pbmc10k_filt"), adj.matrix) #'保存矩阵
}
相同的参数下,可视化矩阵校正前后:
因为group.by用的是seurat_clusters,且没有相关的背景知识,所以不太能看出来优化之处。
注意:修改了矩阵,所以应该谨慎使用。