数据质控-单细胞转录组分析的数据质控scRNA-seq03

背景:单细胞转录组数据的质量控制,质量控制去除(1)低质量的细胞(2)双胞(3)RNA污染的细胞

  • 1.去除低质量的细胞
    什么样的细胞是低质量的细胞呢?通常是检测到的基因数目少、计数深度低或者线粒体等基因的比例比较高的细胞。
    去除低质量细胞的阈值对象:
    (1)每个barcode(细胞)的计数数量(计数深度、测序深度)
    (2)每个barcode的基因数量
    (3)每个barcode的线粒体基因(核糖体、血红蛋白基因)比例
library(Seurat)
library(ggplot2)
library(ggpubr)
sc<-readRDS("./sc.merge.rds")
sc<-CreateSeuratObject(sc$RNA@counts,meta.data=sc@meta.data,min.cells=3) #'筛选基因,保留在大于3个细胞中表达的基因
sc[["percent.mt"]] <- PercentageFeatureSet(sc, pattern="^MT-") #'人类中线粒体基因以MT开头,因此计算MT开头的基因比例
sc[["percent.Ribo"]] <- PercentageFeatureSet(sc, pattern="^RP[SL]") #'计算以RPS或者RPL开头的基因比例(核糖体基因)
HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ")
HB.genes <- CaseMatch(HB.genes, rownames(sc))
sc[["percent.HB"]] <- PercentageFeatureSet(sc, features=HB.genes) #'血红蛋白基因比例

qc_before<- VlnPlot(sc, group.by="orig.ident",
                              features=c("nFeature_RNA","nCount_RNA","percent.mt","percent.Ribo","percent.HB"),
                              pt.size=0,
                              ncol=2)
ggsave(qc_before,file="qc_before.png", width=12, height=6)
#'质控前的相关阈值情况
Feature_ber1 <- FeatureScatter(sc, feature1="nCount_RNA",
                                   feature2="nFeature_RNA",
                                   group.by="orig.ident")
Feature_ber2 <- FeatureScatter(sc, feature1 = 'nCount_RNA',
                                   feature2="percent.mt",
                                   group.by="orig.ident")
Feature_ber3 <- FeatureScatter(sc, feature1 = 'nCount_RNA',
                                   feature2="percent.Ribo",
                                   group.by="orig.ident")
Feature_ber4 <- FeatureScatter(sc, feature1 = 'nCount_RNA',
                                   feature2="percent.HB",
                                   group.by="orig.ident")
Feature_ber1 <- Feature_ber1 +theme(legend.position="none")
Feature_ber2 <- Feature_ber2 +theme(legend.position="none")
Feature_ber3 <- Feature_ber3 +theme(legend.position="none")
Feature_ber4 <- Feature_ber4 +theme(legend.position="none")
Feature_ber <- ggarrange(Feature_ber1, Feature_ber2, Feature_ber3,Feature_ber4, ncol=4, nrow=1, widths=c(1,1,1,1))
ggsave(Feature_ber, file="Feature_ber_before.png", width=12, height=6)
qc_before.png

Feature_ber_before.png

开始质控:

sc.sub=subset(sc,nFeature_RNA > 200 & nFeature_RNA < 2500 & nCount_RNA > 500 & nCount_RNA < 10000 & percent.mt < 5 &  percent.HB<5 & percent.Ribo<10) #'筛选细胞,规则是细胞的基因数目在200-2500之间,测序深度(基因表达总量)在500-10000,线粒体比例低于5%,血红蛋白基因低>于5%,核糖体基因低于10%
qc_after<- VlnPlot(sc.sub, group.by="orig.ident",
                              features=c("nFeature_RNA","nCount_RNA","percent.mt","percent.Ribo","percent.HB"),
                              pt.size=0,
                              ncol=2)
ggsave(qc_after,file="qc_after.png", width=12, height=6)
Feature_ber1 <- FeatureScatter(sc.sub, feature1="nCount_RNA",
                                   feature2="nFeature_RNA",
                                   group.by="orig.ident")
Feature_ber2 <- FeatureScatter(sc.sub, feature1 = 'nCount_RNA',
                                   feature2="percent.mt",
                                   group.by="orig.ident")
Feature_ber3 <- FeatureScatter(sc.sub, feature1 = 'nCount_RNA',
                                   feature2="percent.Ribo",
                                   group.by="orig.ident")
Feature_ber4 <- FeatureScatter(sc.sub, feature1 = 'nCount_RNA',
                                   feature2="percent.HB",
                                   group.by="orig.ident")
Feature_ber1 <- Feature_ber1 +theme(legend.position="none")
Feature_ber2 <- Feature_ber2 +theme(legend.position="none")
Feature_ber3 <- Feature_ber3 +theme(legend.position="none")
Feature_ber4 <- Feature_ber4 +theme(legend.position="none")
Feature_ber <- ggarrange(Feature_ber1, Feature_ber2, Feature_ber3,Feature_ber4, ncol=4, nrow=1, widths=c(1,1,1,1))
ggsave(Feature_ber, file="Feature_ber_after.png", width=12, height=6)
qc_after.png

可以看到留下来的细胞的基因数目都在200-2500之间,基因表达总量在300-10000之间,线粒体比例低于5%,核糖体基因比例低于10%。


Feature_ber_after.png
  • 2.去除双胞
    “双细胞被定义为在相同的细胞条形码(barcodes)下进行测序的两个细胞,例如,两个细胞被捕获在同一个液滴(droplet)中。这也是为什么我们一直使用barcodes而不是cells的原因。”除了使用算法进行双细胞的识别外,一些别的指标也能进行提示:例如,同一个聚类类群中的marker基因来自不同的细胞类型、某个类群中的基因数目或者基因表达量异常高......
    这里测试的是DoubletFinder:
#'obj the seurat obj
#'本过程不会对原始obj作任何数据预处理
myDoublet<-function(obj,assay="RNA",outpath="./",group="seurat_clusters",dims=30,resolution=0.5,replace=F){
library(Seurat)
library(DoubletFinder)
library(dplyr)
ls("package:DoubletFinder")
sc<-obj
#sc$group=sc@meta.data[,group]
if(assay=="RNA"){
DefaultAssay(sc)<-"RNA"
data <- sc %>%NormalizeData() %>% FindVariableFeatures(nfeatures=2000) %>% ScaleData() %>% RunPCA(assay="RNA",verbose=FALSE)%>%FindNeighbors(reduction = "pca", dims = 1:dims)%>%FindClusters(verbose = FALSE,resolution=resolution)%>%RunTSNE(dim.use=1:dims)
}else{
DefaultAssay(sc)<-"Spatial"
data <- sc %>%SCTransform(sc, assay = "Spatial",do.scale=TRUE, do.center=TRUE,verbose = FALSE) %>% RunPCA()%>%RunTSNE(dim.use=1:dims)
}
sweep.res.list <- paramSweep(data, PCs = 1:30, sct = F,num.cores=6)
sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
bcmvn <- find.pK(sweep.stats) #'可以看到最佳参数的点
pK_bcmvn <- bcmvn$pK[which.max(bcmvn$BCmetric)] %>% as.character() %>% as.numeric() #'提取最佳pk值

DoubletRate = ncol(data)*8*1e-6 #'每增加1000个细胞,双胞概率增加0.008
homotypic.prop <- modelHomotypic(data@meta.data[,group]) #'默认seurat_clusters
nExp_poi <- round(DoubletRate*ncol(data))
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))
data <- doubletFinder(data, PCs = 1:30, pN = 0.25, pK = pK_bcmvn,
                         nExp = nExp_poi, reuse.pANN = F, sct = F)
data <- doubletFinder(data, PCs = 1:30, pN = 0.25, pK = pK_bcmvn,
                         nExp = nExp_poi.adj, reuse.pANN = F, sct = F)

data@meta.data[,"DF_hi.lo"] <- data@meta.data[,length(colnames(data@meta.data))]
data@meta.data$DF_hi.lo[which(data@meta.data$DF_hi.lo == "Doublet" & data@meta.data[,length(colnames(data@meta.data))-3] == "Singlet")] <- "Doublet-Low Confidience"
data@meta.data$DF_hi.lo[which(data@meta.data$DF_hi.lo == "Doublet")] <- "Doublet-High Confidience"
table(data@meta.data$DF_hi.lo) #'查看细胞类型(双胞或者单胞)分布
#'结果可视化
png(paste0(outpath,"01.",assay,"_",group,"_tsne.png"),2500,1800,res=300)
pp<-DimPlot(data, reduction = "tsne", group.by =group)
print(pp)
dev.off()
png(paste0(outpath,"01.",assay,"_",group,"_doubletFinder.png"),2500,1800,res=300)
pp<-DimPlot(data, reduction = "tsne", group.by ="DF_hi.lo",cols =c("black","red","gold"))
print(pp)
dev.off()
saveRDS(data,paste0(outpath,'01.sc_rmdoublet.rds'))#'保存数据的rds文件
if(replace==T){
sc@meta.data$DF_hi.lo<-data@meta.data[match(rownames(sc@meta.data),rownames(data@meta.data)),"DF_hi.lo"]
return(sc)
}

01.doubletFinder.png

01.RNA_seurat_clusters_tsne.png

注意:理论上去除双胞应该在上一步细胞的基因数目等阈值质控之前进行。

  • 3.去除周期相关的基因
#'去除周期基因的影响
#'对于assay="RNA"
library(Seurat)
library(dplyr)
sc<-readRDS("./sc.merge.rds")
sc<-sc%>%NormalizeData()%>%ScaleData(features=rownames(sc))%>%FindVariableFeatures() %>% RunPCA() #'数据预处理
#'首先不管是sct还是log标准化,随便选一个方法跑到RunPCA
#'然后使用seurat中内置的资源 周期基因cc.genes
cc.genes<-cc.genes.updated.2019
g2m_genes <- cc.genes$g2m.genes
g2m_genes <- CaseMatch(search=g2m_genes, match=rownames(sc))
s_genes <- cc.genes$s.genes
s_genes <- CaseMatch(search=s_genes, match=rownames(sc))
#'这里cc.genes默认是人类的大写基因
#'但是小鼠的即使没有转换大小写也能自动匹配
sc <- CellCycleScoring(object=sc, g2m.features=g2m_genes, s.features=s_genes,set.ident=TRUE) #'计算细胞周期
#'meta.data中多了几列S.Score G2M.Score Phase old.ident
#'可视化一下周期基因pca
cycleplot <- DimPlot(sc, reduction="pca", group.by="Phase")
ggsave(cycleplot, file="cycleplot_before_1.png", width=12, height=6)
sc <- RunPCA(sc, features=c(s_genes,g2m_genes))
cycleplot <- DimPlot(sc, reduction="pca", group.by="Phase")
ggsave(cycleplot, file="cycleplot_before_2.png", width=12, height=6)
#'对周期基因进行回归处理 关键函数是scale
sc<-sc%>%NormalizeData()%>%ScaleData(features=rownames(sc),vars.to.regress = c("S.Score", "G2M.Score"))%>%FindVariableFeatures()
sc<-RunPCA(sc,assay="RNA",verbose=FALSE)
#'这里需要注意一下,如果是空转的数据,应该走sct
#'sc<-SCTransform(sc, assay = "Spatial", vars.to.regress = c("S.Score", "G2M.Score"),do.scale=TRUE, do.center=TRUE,verbose = FALSE)
if(T){ #'周期基因可视化
cyclesce<-sc
cyclesce<- RunPCA(cyclesce, assay="RNA",features = VariableFeatures(cyclesce), nfeatures.print = 10)
cycleplot<-DimPlot(cyclesce,cols = pal_npg("nrc", alpha = 0.7)(3))
ggsave(cycleplot, file="cycleplot_after_1.png", width=130, height=100,units="mm",limitsize=FALSE)
cyclesce<- RunPCA(cyclesce, assay=assay,features = c(s.genes, g2m.genes))
cycleplot<-DimPlot(cyclesce,cols = pal_npg("nrc", alpha = 0.7)(3))
ggsave(cycleplot, file="cycleplot_after_2.png", width=130, height=100,units="mm",limitsize=FALSE)
}
01.RNA_cycleplot_before_1.png

01.RNA_cycleplot_after_1.png

注意:谨慎使用。

  • 去除RNA污染
    方法一:DecontX
myDecontX<-function(obj,assay="RNA",outpath="./",my.off=0.2,dims=30,resolution=0.5,group="seurat_clusters"){
library(Seurat)
library(decontX)
library(ggplot2)
library(dplyr)
sc<-obj
if(assay=="RNA"){
my.counts<-sc$RNA@counts
}else{
my.counts<-sc$Spatial@counts
}
sc<-sc%>%SCTransform(verbose=F,do.scale=T)%>% RunPCA(assay="SCT",verbose=FALSE)%>%FindNeighbors(reduction = "pca", dims = 1:dims)%>%FindClusters(verbose = FALSE,resolution=resolution)%>%RunUMAP(dims=1:dims)
decontX_results <- decontX(my.counts)
str(decontX_results)
sc$contamination=decontX_results$contamination
head(sc@meta.data)
png(paste0(outpath,"01.",assay,"_decontx_contamination_before.png"),2500,1800,res=300)
pp<-FeaturePlot(sc,
            features = 'contamination',
            raster=FALSE       # 细胞过多时候需要加这个参数
           ) +
  scale_color_viridis_c()+
  theme_bw()+
  theme(panel.grid = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank())
print(pp)
dev.off()
png(paste0(outpath,"01.",assay,"_decontx_umap_before.png"),2500,1800,res=300)
pp<-DimPlot(sc, reduction = "umap", group.by =group)
print(pp)
dev.off()
sc = sc[,sc$contamination < my.off]
png(paste0(outpath,"01.",assay,"_decontx_contamination_afetr.png"),2500,1800,res=300)
pp<-FeaturePlot(sc,
            features = 'contamination',
            raster=FALSE       # 细胞过多时候需要加这个参数
           ) +
  scale_color_viridis_c()+
  theme_bw()+
  theme(panel.grid = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank())
print(pp)
dev.off()
png(paste0(outpath,"01.",assay,"_decontx_umap_afetr.png"),2500,1800,res=300)
pp<-DimPlot(sc, reduction = "umap", group.by =group)
print(pp)
dev.off()
if(assay=="Spatial"){
print(paste0("the filtered-spot-gene counts's dim is gene:",nrow(sc$Spatial$counts),";spot:",ncol(sc$Spatial$counts)))
nf.before=nrow(obj$Spatial$counts)
nf.after=nrow(sc$Spatial$counts)
ns.before=ncol(obj$Spatial$counts)
ns.after=ncol(sc$Spatial$counts)
spot.s=paste0(as.character(round(ns.after/ns.before, 3)*100),"%")
feature.s=paste0(as.character(round(nf.after/nf.before, 3)*100),"%")
qc.table=data.frame(type_number=c("nSpot","nFeature"),filter_before=c(as.character(ns.before),as.character(nf.before)),filter_after=c(as.character(ns.after),as.character(nf.after)),percent=c(spot.s,feature.s))
}else{
print(paste0("the filtered-cell-gene counts's dim is gene:",nrow(sc$RNA$counts),";cell:",ncol(sc$RNA$counts)))
nf.before=nrow(obj$RNA$counts)
nf.after=nrow(sc$RNA$counts)
ns.before=ncol(obj$RNA$counts)
ns.after=ncol(sc$RNA$counts)
spot.s=paste0(as.character(round(ns.after/ns.before, 3)*100),"%")
feature.s=paste0(as.character(round(nf.after/nf.before, 3)*100),"%")
qc.table=data.frame(type_number=c("nCell","nFeature"),filter_before=c(as.character(ns.before),as.character(nf.before)),filter_after=c(as.character(ns.after),as.character(nf.after)),percent=c(spot.s,feature.s))
}
write.csv(qc.table,paste0(outpath,"01.",assay,"_decontx_qc.csv"),row.names=F)
print(head(qc.table))
return(sc)
}

和soupx不同之处在于,不会修改原始矩阵表达,只是根据阈值对细胞进行过滤,因此会减少细胞的数目。
方法二:soupx

#'raw.path the 10x raw matrix
#'fit.path the 10x filtered matrix
mySoupX<-function(raw.path,fit.path,outpath="./",my.off=0.2,assay="RNA",group="seurat_clusters",dims=30,resolution=0.5){
library(SoupX)
library(Seurat)
library(DropletUtils)
library(ggplot2)
library(DoubletFinder)
library(knitr)
library(dplyr)
filt.matrix <- Read10X(fit.path)
raw.matrix  <- Read10X(raw.path)
raw.matrix<-raw.matrix[rownames(filt.matrix),]
srat  <- CreateSeuratObject(counts = filt.matrix)
soup.channel  <- SoupChannel(raw.matrix, filt.matrix)
srat <- SCTransform(srat, verbose = F,do.scale=T)
srat <- RunPCA(srat,assay="SCT", verbose = F)
srat <- RunUMAP(srat, dims = 1:dims, verbose = F)
srat <- FindNeighbors(srat,reduction="pca", dims = 1:dims, verbose = F)
srat <- FindClusters(srat, verbose = F,resolution=resolution)
meta <- srat@meta.data
umap <- srat@reductions$umap@cell.embeddings
soup.channel <- setClusters(soup.channel, setNames(meta$seurat_clusters, rownames(meta)))
soup.channel <- setDR(soup.channel, umap)
head(meta)
soup.channel <- autoEstCont(soup.channel)
soup.channel<-setContaminationFraction(soup.channel, my.off) #'阈值my.off
adj.matrix  <- adjustCounts(soup.channel, roundToInt = T) #'校正矩阵
png(paste0(outpath,"01.",assay,"_soupx_before.png"),2500,1800,res=300)
pp<-DimPlot(srat, reduction = "umap", group.by =group)
print(pp)
dev.off()
srat_1<-CreateSeuratObject(counts = adj.matrix)
srat_1<-srat_1%>%SCTransform(verbose=F,do.scale=T)%>% RunPCA(assay="SCT",verbose=FALSE)%>%FindNeighbors(reduction = "pca", dims = 1:dims)%>%FindClusters(verbose = FALSE,resolution=resolution)%>%RunUMAP(dims=1:dims)
png(paste0(outpath,"01.",assay,"_soupx_after.png"),2500,1800,res=300)
pp<-DimPlot(srat_1, reduction = "umap", group.by =group)
print(pp)
dev.off()
DropletUtils:::write10xCounts(paste0(outpath,"01.soupX_pbmc10k_filt"), adj.matrix) #'保存矩阵
}

相同的参数下,可视化矩阵校正前后:


01.RNA_soupx_before.png

01.RNA_soupx_after.png

因为group.by用的是seurat_clusters,且没有相关的背景知识,所以不太能看出来优化之处。
注意:修改了矩阵,所以应该谨慎使用。

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 202,529评论 5 475
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 85,015评论 2 379
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 149,409评论 0 335
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 54,385评论 1 273
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 63,387评论 5 364
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 48,466评论 1 281
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 37,880评论 3 395
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 36,528评论 0 256
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 40,727评论 1 295
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 35,528评论 2 319
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 37,602评论 1 329
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 33,302评论 4 318
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 38,873评论 3 306
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 29,890评论 0 19
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 31,132评论 1 259
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 42,777评论 2 349
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 42,310评论 2 342

推荐阅读更多精彩内容