gzh:BBio,欢迎关注
scanpy软件由Theis Lab实验室开发,和Seurat相同都是常用的单细胞数据分析工具。scanpy以anndata数据结构存储的单细胞基因表达数据,包括预处理、可视化、聚类、轨迹推断和差异基因鉴定等功能。基于python实现可以有效处理超过100万个细胞的数据集的强大功能。
以10X官网的3k pbmc数据为例,学习一下scanpy。
参考:
https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html
文章:
SCANPY: large-scale single-cell gene expression data analysis
//使用流程
import numpy as np
import pandas as pd
import scanpy as sc
import re
from matplotlib import pyplot as plt
import matplotlib
matplotlib.use("Agg") #使用非交互式的后端生成图像文件
//读取10X结果,创建AnnData对象
#下载数据
#wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz
#sc.read_10x_mtx(path, var_names, make_unique=True, cache=False)
adata = sc.read_10x_mtx('data/filtered_gene_bc_matrices/hg19/')
print(adata)
#根据AnnData的数据结果,可以理解为每一行都是一个细胞的数据,每一列都是一个基因的数据
#AnnData object with n_obs × n_vars = 2700 × 32738
# var: 'gene_ids'
#对于有重复的var_names,添加1、2进行区分
adata.var_names_make_unique()
print(adata.var.head())
//定义绘图函数:sc.pl/scanpy.plotting
def pic(pdf):
searchObj = re.search( r'(.*).pdf', pdf)
png = f"{searchObj.group(1)}.png"
plt.savefig(pdf, bbox_inches="tight")
plt.savefig(png, bbox_inches="tight", dpi=100)
plt.close()
//数据预处理:sc.pp/scanpy.preprocessing
#查看高表达基因
sc.pl.highest_expr_genes(adata, n_top=5)
pic("highest_expr_genes.pdf")
#过滤低质量细胞
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
#计算线粒体比例
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
print(adata)
#AnnData object with n_obs × n_vars = 2700 × 13714
# obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', #'pct_counts_mt'
# var: 'gene_ids', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', #'pct_dropout_by_counts', 'total_counts'
print(adata.to_df().head())
# AL627309.1 AP006222.2 RP11-206L10.2 RP11-206L10.9 LINC00115 NOC2L # KLHL17 ... MT-ND6 MT-CYB AC145212.1 AL592183.1 AL354822.1 PNRC2-1 SRSF10-1
#AAACATACAACCAC-1 0.0 0.0 0.0 0.0 0.0 0.0 # 0.0 ... 0.0 4.0 0.0 0.0 0.0 0.0 0.0
#AAACATTGAGCTAC-1 0.0 0.0 0.0 0.0 0.0 0.0 # 0.0 ... 0.0 8.0 0.0 1.0 0.0 0.0 0.0
#绘图观察数据
#sc.pl.violin(adata, keys, groupby, log=False, use_raw=None, stripplot=True, jitter=True, size=1, order=None, multi_panel=None, xlabel, ylabel, rotation)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
jitter=0.4, multi_panel=True)
pic("qc_count.pdf")
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
pic("qc_total_counts_mt.pdf")
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
pic("qc_total_counts_genes.pdf")
#过滤细胞
adata = adata[adata.obs.n_genes_by_counts < 2500, :]
adata = adata[adata.obs.pct_counts_mt < 5, :]
//标准化数据
#方法和Seurat中相同
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
print(adata)
//鉴定高变基因
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)
pic("highly_variable_genes.pdf")
print(adata)
#AnnData object with n_obs × n_vars = 2700 × 13714
# obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', #'pct_counts_mt'
# var: 'gene_ids', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', #'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', #'dispersions_norm'
# uns: 'log1p', 'hvg'
//scale
adata.raw = adata
adata = adata[:, adata.var.highly_variable]
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(adata, max_value=10)
//降维聚类:sc.tl/scanpy.tools
#pca降维
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata, color='CST3')
pic("pca_CST3.pdf")
sc.pl.pca_variance_ratio(adata, log=True)
pic("pca_variance.pdf")
adata.obsm.to_df()[['X_pca1', 'X_pca2']].to_csv('pca.xls')
#UMAP降维
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)
sc.tl.tsne(adata, n_pcs=40)
adata.obsm.to_df()[['X_umap1', 'X_umap2']].to_csv('umap.xls')
#help(sc.tl.umap)
#sc.tl.umap(adata, color, use_raw=None, projection='2d', groups=None, legend_loc='on data', legend_fontweight='bold', legend_fontoutline=True, size=None, color_map=None, palette=None, frameon=True, vmin='p1', vmax='p99', add_outline=False, ncols=4, hspace=0.25)
sc.pl.umap(adata, color=['CST3', 'NKG7', 'PPBP'])
pic("umap_feature_raw.pdf")
sc.pl.umap(adata, color=['CST3', 'NKG7', 'PPBP'], use_raw=False)
pic("umap_feature.pdf")
#聚类
sc.tl.leiden(adata)
print(adata)
sc.pl.umap(adata, color=['leiden', 'CST3', 'NKG7'], ncols=2, legend_loc='on data', legend_fontoutline=True, size=2, color_map='PuBuGn', palette=['red', 'black', '#006400', '#FFFF00', '#FFC1C1', '#FF00FF', '#FFA500', '#C1CDC1'], frameon=True, vmin='p1', vmax='p99', add_outline=True)
pic("umap_cluster_feature.pdf")
adata.write('test.h5ad')
//差异基因鉴定
#help(sc.tl.rank_genes_groups)
#sc.tl.rank_genes_groups(adata, groupby=None, groups='all', reference='rest', n_genes=None, method=['t-test', 'wilcoxon', 'logreg'], corr_method=['benjamini-hochberg', 'bonferroni'], rankby_abs=False, pts=False)
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
pic("diff_feature.pdf")
df = sc.get.rank_genes_groups_df(adata, group="0")
df = df.sort_values(by="scores", ascending=False)
df.to_csv('diff_cluster0.xls', index=0, sep='\t')
#names scores logfoldchanges pvals pvals_adj
#RPS12 29.401457 0.879537 4.1169846518652114e-159 1.1292065503135902e-155
#RPS27 28.020739 0.85751283 1.1950869901189692e-146 2.0486778728114427e-143
#RPS6 26.885033 0.7554096 5.911086827796277e-135 8.106464475639814e-132
#RPS25 25.821676 0.9273434 6.483621820047835e-127 7.409699136678001e-124
#RPS3 25.549692 0.75146854 6.262897557106957e-123 6.134955507011772e-120
sc.tl.rank_genes_groups(adata, 'leiden', groups=['0'], reference='1', method='wilcoxon')
sc.pl.rank_genes_groups(adata, groups=['0'], n_genes=20)
pic("diff_feature_0vs1.pdf")
sc.pl.rank_genes_groups_violin(adata, groups='0', n_genes=8)
pic("diff_feature_n_genes.pdf")
sc.pl.violin(adata, ['CST3', 'NKG7', 'PPBP'], groupby='leiden')
pic("diff_feature_n_genes.pdf")
//细胞类型注释
marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14',
'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1',
'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP']
sc.pl.dotplot(adata, marker_genes, groupby='leiden')
pic("leiden_marker_dotplot.pdf")
adata = sc.read('test.h5ad')
new_cluster_names = [
'CD4 T', 'CD4 T', 'CD14 Monocytes',
'B', 'CD8 T',
'FCGR3A Monocytes', 'NK',
'Dendritic', 'Megakaryocytes']
adata.rename_categories('leiden', new_cluster_names)
#ValueError: Categorical categories must be unique
#不支持多个cluster是同一种细胞类型的格式,换一种方式添加细胞类型
new_cluster_names_unique=list(set(new_cluster_names))
new_cluster_names_unique.sort(key=new_cluster_names.index)
celltype = [new_cluster_names[int(i)] for i in adata.obs.leiden.values.tolist()]
adata.obs.leiden = celltype
adata.obs.leiden = adata.obs.leiden.astype('category')
adata.obs.leiden.cat.set_categories(new_cluster_names_unique, inplace=True)
with plt.rc_context({'figure.figsize':(10, 10)}):
sc.pl.umap(adata, color='leiden', legend_loc='on data', title='', frameon=True, legend_fontsize='x-small')
pic("celltype_umap.pdf")
sc.pl.dotplot(adata, marker_genes, groupby='leiden')
pic("celltype_marker_dotplot.pdf")
sc.pl.stacked_violin(adata, marker_genes, groupby='leiden', rotation=90)
pic("celltype_marker_violin.pdf")