读取和汇总gistic输出文件
maftools : Summarize, Analyze and Visualize MAF Files
rm(list = ls())
library(tidyverse)
# 建议下载github最新版本的maftools包
library(maftools)
#读入GISTIC输出的文件
laml.gistic = readGistic(gisticAllLesionsFile = './Gistic 2.0 results/all_lesions.conf_99.txt',
gisticAmpGenesFile = './Gistic 2.0 results/amp_genes.conf_99.txt',
gisticDelGenesFile = './Gistic 2.0 results/del_genes.conf_99.txt',
gisticScoresFile = './Gistic 2.0 results/scores.gistic',
isTCGA = T)
## -Processing Gistic files..
## --Processing amp_genes.conf_99.txt
## --Processing del_genes.conf_99.txt
## --Processing scores.gistic
## --Summarizing by samples
laml.gistic
## An object of class GISTIC
## ID summary
## 1: Samples 500
## 2: nGenes 8606
## 3: cytoBands 37
## 4: Amp 39
## 5: Del 59539
## 6: total 59578
可视化重要的结果
##绘图
#genome plot 部分突出了明显的扩增和删失区域
gisticChromPlot(gistic = laml.gistic, ref.build = 'hg38') ## 参考基因组选hg38
#Bubble plot 绘制显著改变的cytobands,以及改变的样本数和它所包含的基因数的函数。每一个气泡的大小是根据-log10转化q值。
gisticBubblePlot(gistic = laml.gistic)
#oncoplot
gisticOncoPlot(gistic = laml.gistic,
sortByAnnotation = TRUE, top =20)
挑选q<0.25的基因
(sig_cytoband <- laml.gistic@cytoband.summary %>% filter(qvalues<0.25) %>% .$Unique_Name)
## [1] "DP_13:8q24.22" "DP_17:10q23.1" "DP_33:22q12.3" "DP_16:10q21.2"
## [5] "DP_11:8p23.2" "AP_1:4q31.21" "AP_3:22q11.21" "DP_18:10q23.31"
## [9] "DP_25:16q23.2" "DP_27:17p13.3" "DP_34:22q13.32" "DP_3:2q37.1"
## [13] "DP_31:20p12.1" "DP_26:16q23.3" "DP_10:7q34" "DP_12:8q24.11"
## [17] "DP_14:9q22.32" "DP_15:10q11.21" "DP_19:12p13.2" "DP_1:2p23.1"
## [21] "DP_20:13q14.11" "DP_21:13q22.1" "DP_22:14q24.2" "DP_24:16p13.2"
## [25] "DP_28:18q21.33" "DP_29:19p13.3" "DP_2:2p11.2" "DP_30:19p13.12"
## [29] "DP_32:22q12.3" "DP_4:3p25.3" "DP_6:5p14.1" "DP_9:7q22.3"
## [33] "DP_23:15q25.3" "DP_8:6q27" "DP_5:4q31.21" "AP_2:7q34"
## [37] "DP_7:6q26"
table(grepl(pattern = "AP",sig_cytoband))
##
## FALSE TRUE
## 34 3
## 挑选q值小于0.25的Amplification cytoband
sig_AP_cytoband <- laml.gistic@cytoband.summary %>%
filter(qvalues<0.25) %>%
.$Unique_Name %>%
.[grepl(pattern = "AP",.)]
length(sig_AP_cytoband)
## [1] 3
## 将Amplification 的基因挑选出来
sig_AP_gene <- laml.gistic@data %>% filter(Cytoband %in% sig_AP_cytoband) %>% .$Hugo_Symbol
unique(sig_AP_gene)
## [1] "CECR2" "LOC105377458" "[MTRNR2L6]"
## 挑选q值小于0.15的Deletion cytoband
sig_DP_cytoband <- laml.gistic@cytoband.summary %>%
filter(qvalues<0.15) %>%
.$Unique_Name %>%
.[grepl(pattern = "DP",.)]
length(sig_DP_cytoband)
## [1] 10
## 将Deletion cytoband 的基因挑选出来
sig_DP_gene <- laml.gistic@data %>% filter(Cytoband %in% sig_DP_cytoband) %>% .$Hugo_Symbol
head(unique(sig_DP_gene))
## [1] "APOL1" "APOL2" "APOL4" "APOL3" "MIR6819" "FAM19A5"
挑一个基因出来看看
## 看一下经典的抑癌基因在那个Cytoband
"CECR2"%in% sig_AP_gene
## [1] TRUE
laml.gistic@data %>%
filter(Hugo_Symbol %in% "CECR2") %>%
.$Cytoband %>%
unique(.)
## [1] "AP_3:22q11.21"
## 一层一层的剥桔子,让我找到你了,然后再返回去看看Amplification GISTIC plot。真香,对上了。单基因的分析就结束了。
找到了基因之后就可以“为所欲为了….”
# 拷贝数缺失区域的基因那么多,来个GO、kegg、GSEA这些分析是不是来一套了。我就简单copy下代码做做GO分析
# 用Y叔的包玩玩
length(unique(sig_DP_gene))## 因为选择q<0.25的拷贝数区域缺失的基因个数太多了(8000多),所以我修改了下q值,按需修改,没有好的阈值,只有合不合适的结果。
## [1] 842
library(clusterProfiler)
library(org.Hs.eg.db)
library(topGO)
sig_DP_entrezId = mapIds(x = org.Hs.eg.db,
keys = unique(sig_DP_gene),
keytype = "SYMBOL",
column = "ENTREZID")
table(is.na(sig_DP_entrezId))
##
## FALSE TRUE
## 791 51
sig_DP_entrezId <- na.omit(sig_DP_entrezId)
go_ALL <- enrichGO(gene = sig_DP_entrezId,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "ALL",
pvalueCutoff = 0.5,
qvalueCutoff = 0.5)
##分析完成后,作图
dotplot(go_ALL, split="ONTOLOGY")+ facet_grid(ONTOLOGY~.,scale="free")
### 数目太多,我只要各个条目的前6个
BP_df<- go_ALL@result %>% arrange(desc(Count)) %>% filter(ONTOLOGY=="BP") %>% head()
CC_df<- go_ALL@result %>% arrange(desc(Count)) %>% filter(ONTOLOGY=="CC") %>% head()
MF_df<- go_ALL@result %>% arrange(desc(Count)) %>% filter(ONTOLOGY=="MF") %>% head()
go_df <- rbind(BP_df,CC_df,MF_df)
go_df$go_term_order=factor(x = c(1:nrow(go_df)),labels = go_df$Description)
library(ggsci)
ggplot(data=go_df, aes(x=go_term_order,y=Count, fill=ONTOLOGY)) +
geom_bar(stat="identity", width=0.8) +
scale_fill_jama() +
theme_classic() +
xlab("GO term") + ylab("Number of Genes") + labs(title = "The Most Enriched GO Terms")+
theme(axis.text.x=element_text(face = "bold", color="gray50",angle = 60,vjust = 1, hjust = 1 ))
## 顺利完成,里面的细节稍微调调图,按需美化,又可以放文章里面了。
## 拷贝数变化的分析就到暂时到这了,让我们拭目以待下一种类型的数据。
## 以上分析根据个人理解进行分析,如有错误,请批评指正。