一致性聚类(Consensus Clustering)基于重采样方法评估聚类结果的稳定性,非常适合用于分子分型类文章。下面一起来学习。
数据下载参考:R|TCGA|m6AlncRNA
# Consensus clustering of TCGA-PRAD tumour samples restricted to a
# user-supplied gene list, using k-means inside ConsensusClusterPlus.
#
# The former `rm(list = ls())` was removed on purpose: it silently wipes the
# caller's workspace without giving a clean session. Restart R instead.
library(tidyverse)
library(limma)
library(ConsensusClusterPlus)

# Expression table (FPKM, judging by the file name) keyed by Ensembl ID, and
# the probe map that translates Ensembl IDs into gene symbols.
df <- data.table::fread(
  "F:/TCGA-Xena/expression/TCGA-PRAD.htseq_fpkm.tsv.gz",
  data.table = FALSE
)
df_ann <- data.table::fread(
  "F:/TCGA-anno/gencode.v22.annotation.gene.probeMap",
  data.table = FALSE
)

# Genes of interest: first column of a plain-text file, one symbol per row.
gene <- read.table("F:/Ongoing/Inflammation-PRAD/感兴趣的基因.txt")$V1

# Attach gene symbols to the expression rows and drop the Ensembl ID column,
# leaving `gene` as the first column followed by one column per sample.
df1 <- df_ann %>%
  select(id, gene) %>%
  inner_join(df, by = c("id" = "Ensembl_ID")) %>%
  select(-id)

# Collapse rows that share a gene symbol by averaging them; the result has
# gene symbols as row names.
df1 <- avereps(df1[, -1], ID = df1$gene) %>%
  as.data.frame()

# Keep only the genes of interest.
df1 <- df1[rownames(df1) %in% gene, , drop = FALSE]
if (nrow(df1) == 0) {
  stop("None of the genes of interest are present in the expression matrix.",
       call. = FALSE)
}

# Keep tumour samples: characters 14-15 of a TCGA barcode hold the sample-type
# code, and codes below 10 denote tumour tissue. The code is converted to an
# integer explicitly so the comparison is numeric rather than a
# locale-dependent string comparison; unparsable barcodes are dropped.
sample_type <- suppressWarnings(as.integer(str_sub(colnames(df1), 14, 15)))
is_tumour <- !is.na(sample_type) & sample_type < 10
df1 <- df1[, is_tumour, drop = FALSE]

# Output directory for the PNG diagnostics and the largest k to evaluate.
tmp <- "F:/Ongoing/Inflammation-PRAD/results/"
maxk <- 9

results <- ConsensusClusterPlus(
  as.matrix(df1),
  maxK = maxk,
  reps = 500,         # number of resampling iterations
  pItem = 0.8,        # fraction of samples drawn per iteration
  pFeature = 1,       # use every gene in each iteration
  title = tmp,
  clusterAlg = "km",
  distance = "euclidean",
  seed = 123,
  plot = "png"
)

# Cluster assignment of every sample for k = 2.
results[[2]][["consensusClass"]]
输出结果如下:
彩蛋:
其实数据预处理很重要,可以像这样预处理:
# Use the ALL example dataset shipped with the ALL package.
library(ALL)
data(ALL)
# Pull the expression matrix out of the ALL object.
d <- exprs(ALL)
# Peek at the top-left 5 x 5 corner.
d[1:5, 1:5]
01005 01010 03002 04006 04007
1000_at 7.597323 7.479445 7.567593 7.384684 7.905312
1001_at 5.046194 4.932537 4.799294 4.922627 4.844565
1002_f_at 3.900466 4.208155 3.886169 4.206798 3.416923
1003_s_at 5.903856 6.169024 5.860459 6.116890 5.687997
1004_at 5.925260 5.912780 5.893209 6.170245 5.615210
# Keep the (up to) 5000 most variable rows, ranked by median absolute
# deviation (MAD) across samples. Note that this is MAD, not the standard
# deviation. `head()` caps the selection at the number of available rows, so a
# matrix with fewer than 5000 rows no longer produces NA indices.
mads <- apply(d, 1, mad)
top_rows <- head(rev(order(mads)), 5000)
d <- d[top_rows, , drop = FALSE]

# Centre each row on its own median (sweep() subtracts by default).
d <- sweep(d, 1, apply(d, 1, median, na.rm = TRUE))

# Run consensus clustering in a single call; PNG output goes to a temporary
# directory.
library(ConsensusClusterPlus)
title <- tempdir()
results <- ConsensusClusterPlus(
  d,
  maxK = 6,
  reps = 50,
  pItem = 0.8,
  pFeature = 1,
  title = title,
  clusterAlg = "hc",
  distance = "pearson",
  seed = 1262118388.71279,
  plot = "png"
)
参考链接:
一致性聚类ConsensusClusterPlus
R-一致性聚类(公众号:医学僧的科研日记)