参考代码在这:
http://www.bio-info-trainee.com/bioconductor_China/software/limma.html
https://github.com/jmzeng1314/5years/blob/master/learn-R/tasks/3-r-20-codes.R
https://mp.weixin.qq.com/s?src=11×tamp=1659854016&ver=3967&signature=q9z*I2fKVPmCQzPa8m6LYyAv5W2FnPeNefVygehuJEXply44mZjPWSxKlTIkwwW98AMgCyueKh3*7L9D4QKszqMJuWshgThqGBVK3ojztSDSRQyoEAP6tKzPi5k*WAWm&new=1
1 差异分析
需要的数据:表达矩阵、分组矩阵、差异比较矩阵
exprSet##表达矩阵
group_list##分组信息
##查看数据
> dim(exprSet)
[1] 18857 6
library(limma)
##构建分组矩阵design
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
design
##
> design
case control
X.GSM1052615. 0 1
X.GSM1052616. 0 1
X.GSM1052617. 0 1
X.GSM1052618. 1 0
X.GSM1052619. 1 0
X.GSM1052620. 1 0
##构建差异比较矩阵
contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),levels = design)
contrast.matrix ##这个矩阵声明,我们要把progres.组跟stable进行差异分析比较
##
contrast.matrix<- makeContrasts(case-control, levels = design)##原始代码有误,这条是修改
##错误的矩阵
> contrast.matrix
Contrasts
Levels control-case
case -1
control 1
##修改后正确的
> contrast.matrix
Contrasts
Levels case - control
case 1
control -1
##使用limma进行差异分析
##step1
fit <- lmFit(exprSet,design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix) ##这一步很重要
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
##step3
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
#write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
head(nrDEG)
##
> head(nrDEG)
logFC AveExpr t P.Value adj.P.Val B
CD36 -5.780170 7.370282 -79.80805 1.269462e-16 2.393825e-12 26.71919
DUSP6 4.212683 9.106625 62.52233 1.877247e-15 1.394464e-11 24.97560
DCT -5.633027 8.763220 -61.58268 2.218482e-15 1.394464e-11 24.85640
SPRY2 3.801663 9.726468 54.01706 9.411377e-15 4.436758e-11 23.77470
MOXD1 -3.263063 10.171635 -47.13927 4.218104e-14 1.362775e-10 22.56334
ETV4 3.843247 9.667077 47.02123 4.336136e-14 1.362775e-10 22.54028
至此获得了差异分析矩阵nrDEG
2 绘图
2.1 热图
##进一步看一下热图
## heatmap
library(pheatmap)
choose_gene=names(tail(sort(apply(exprSet,1,mad)),50))
choose_matrix=exprSet[choose_gene,]
choose_matrix=t(scale(t(choose_matrix)))
pheatmap(choose_matrix)
可以查看到高表达和低表达的基因。
2.2 火山图
##粗制火山图
plot(nrDEG$logFC, -log10(nrDEG$P.Value))
##好看一点的
library(ggplot2)
## volcano plot
DEG=nrDEG
logFC_cutoff <- with(DEG,mean(abs( logFC)) + 2*sd(abs( logFC)) )
DEG$change = as.factor(ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
this_tile
head(DEG)
g = ggplot(data=DEG, aes(x=logFC, y=-log10(P.Value), color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)
3 富集分析
基因注释数据库:GO terms、Pathway(KEGG, BIOCARTA)等
超几何分布检验:
##取前1000个基因进行分析
gene<-head(rownames(nrDEG),1000)
library(clusterProfiler)
## get the universal genes and sDEG 转化ID,从SYMBOL到ENTREZID
gene.df <- bitr(gene, fromType = "SYMBOL",
toType = c("ENSEMBL", "ENTREZID"),
OrgDb = org.Hs.eg.db)
head(gene.df)
1% of input gene IDs are fail to map... ##百分之一的ID没有匹配到
> head(gene.df)
SYMBOL ENSEMBL ENTREZID
1 CD36 ENSG00000135218 948
2 DUSP6 ENSG00000139318 1848
3 DCT ENSG00000080166 1638
4 SPRY2 ENSG00000136158 10253
5 MOXD1 ENSG00000079931 26002
6 ETV4 ENSG00000175832 2118
## KEGG pathway analysis
kk <- enrichKEGG(gene = gene.df$ENTREZID,
organism = 'hsa',
pvalueCutoff = 0.05)
head(kk)[,1:6]
kk2 <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 120,
pvalueCutoff = 0.05,
verbose = FALSE)
head(kk2)[,1:6]
gseaplot(kk2, geneSetID = "hsa04145")
##查看结果,了解基因功能(hsa03030在基因复制通路影响很明显)
> head(kk)[,1:6]
ID Description GeneRatio BgRatio
hsa03030 hsa03030 DNA replication 18/452 36/8164
hsa04110 hsa04110 Cell cycle 30/452 126/8164
hsa05322 hsa05322 Systemic lupus erythematosus 27/452 136/8164
hsa04613 hsa04613 Neutrophil extracellular trap formation 29/452 190/8164
hsa05034 hsa05034 Alcoholism 28/452 187/8164
hsa05203 hsa05203 Viral carcinogenesis 29/452 204/8164
pvalue p.adjust
hsa03030 6.173721e-14 1.845943e-11
hsa04110 4.947023e-12 7.395800e-10
hsa05322 4.540320e-09 4.525186e-07
hsa04613 5.203368e-07 3.889518e-05
hsa05034 1.219656e-06 7.293544e-05
hsa05203 2.326912e-06 1.159578e-04
##查看函数说明书
vignette('package name')
KEGG:
## get the universal genes and sDEG
## 获取相对应的genelist,用boxplot()函数查看一下该数据的分布,发现在0上下分布(logFC数据来的)
data(geneList, package="DOSE")
tmp<-data.frame(SYMBOL=names(geneList),
logFC=as.numeric(geneList))
tmp<-merge(tmp,gene.df,bu='SYMBOL')
##修改名称
geneList<-tmp$logFC
names(geneList)<-gene.df$ENTREZID
##排序
geneList<-sort(geneList,decreasing = T)
##KDGG分析
kk2 <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 120,
pvalueCutoff = 0.1,
verbose = FALSE)
head(kk2)[,1:6]
[1] ID Description setSize enrichmentScore
[5] NES pvalue
gseaplot(kk2, geneSetID = "hsa04145")
要点就是把data.frame要修改成对应的数据格式,对应的ID。
有可视化需求可以使用GSEA软件。
有关生存分析可以看这里:
https://github.com/jmzeng1314/GEO/tree/master/GSE11121_survival
那么GEO数据库挖掘就到这里,这里是佳奥,我们下一篇章再见!