加载数据集
pbmc.data <- Read10X(data.dir = "../data/pbmc3k/filtered_gene_bc_matrices/hg19/") # 读取10X数据(一个包含三个文件的文件夹)
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200) # 创建Seurat对象
质量控制
# 找出所有线粒体基因保存在pbmc对象中
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# 可视化
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
# 过滤基因数数超过2500或少于200的细胞和线粒体计数> 5%的细胞
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000) # 规范化数据
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000) # 识别高度可变基因
LabelPoints(plot = VariableFeaturePlot(pbmc), points = head(VariableFeatures(pbmc), 10), repel = TRUE)
pbmc <- ScaleData(pbmc, features = rownames(pbmc)) # 缩放数据:使所有细胞的平均表达为0,方差为1,如果不加feature参数,默认只选择高度可变基因进行scale
降维
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) # 执行PCA
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5) # 展示前5个对主成分影响最大的基因集
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca") # 可视化
DimPlot(pbmc, reduction = "pca")
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)
# 确定主成分的个数
# 法一:重采样:随机置换数据的一部分(默认为1%),然后重新运行PCA,重复此过程。
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:15)
# 法二:基于每个主成分所解释的方差百分比对主成分进行排序
ElbowPlot(pbmc)
细胞聚类
pbmc <- FindNeighbors(pbmc, dims = 1:10) # 首先基于PCA空间中的欧式距离构造一个KNN图,然后基于其局部邻域中的共享重叠来细化任意两个细胞之间边缘的权重(Jaccard相似性)
pbmc <- FindClusters(pbmc, resolution = 0.5) # 社区发现。resolution参数不同,发现的社区数量也不同
pbmc <- RunUMAP(pbmc, dims = 1:10) # 运行非线性降维(UMAP / tSNE)
DimPlot(pbmc, reduction = "umap") # UMAP可视化
寻找差异表达基因
cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25) # 找出cluster 1 的所有marker基因
head(cluster1.markers, n = 5)
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25) # 找出区分cluster5与社区cluster0和社区cluster3的所有marker
head(cluster5.markers, n = 5)
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) # 找出每一个簇的marker,并与所有剩余的细胞进行比较,只报告阳性的
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
cluster1.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE) # 使用test.use参数设置多个差异表达测试,例如,ROC测试返回任何单个标记的“分类能力”,范围从0(随机)到1(完美)
VlnPlot(pbmc, features = c("MS4A1", "CD79A")) # 火山图可视化marker基因
FeaturePlot(pbmc, features = c("MS4A1", "CD79A")) # Featureplot图可视化marker基因
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
DoHeatmap(pbmc, features = top10$gene) + NoLegend() # DoHeatmap生成给定单元和特征的表达式热图。在这种情况下,我们将为每个聚类绘制前10个标记
new.cluster.ids <- c("Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono", "NK", "DC", "Platelet") # 将细胞类型标识分给每个簇
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
参考
https://satijalab.org/seurat/v3.0/pbmc3k_tutorial.html
https://www.jianshu.com/p/03b94b2034d5?from=singlemessage&isappinstalled=0
https://wenku.baidu.com/view/c310a283152ded630b1c59eef8c75fbfc67d946e.html