1. 准备
安装和导入TCGAbiolinks包
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("TCGAbiolinks")
library(TCGAbiolinks)
library(dplyr)
library(DT)
查看各个癌种的项目id,总共有65个ID值
getGDCprojects()$project_id
# [1] "GENIE-MSK" "TCGA-UCEC" "TCGA-ACC"
# [4] "TCGA-LGG" "TCGA-SARC" "TCGA-PAAD"
# [7] "TCGA-ESCA" "TCGA-LUAD" "TCGA-PRAD"
# [10] "GENIE-VICC" "TCGA-LAML" "TCGA-KIRC"
# [13] "TCGA-COAD" "TCGA-PCPG" "TCGA-HNSC"
# [16] "BEATAML1.0-COHORT" "BEATAML1.0-CRENOLANIB" "GENIE-JHU"
# [19] "MMRF-COMMPASS" "TCGA-LUSC" "TCGA-OV"
# [22] "TCGA-GBM" "TCGA-UCS" "WCDT-MCRPC"
# [25] "TCGA-MESO" "TCGA-TGCT" "TCGA-KICH"
# [28] "TCGA-READ" "TCGA-UVM" "TCGA-STAD"
# [31] "TCGA-THCA" "OHSU-CNL" "GENIE-DFCI"
# [34] "GENIE-NKI" "ORGANOID-PANCREATIC" "VAREPOP-APOLLO"
# [37] "GENIE-GRCC" "FM-AD" "TARGET-ALL-P1"
# [40] "TARGET-ALL-P2" "TARGET-ALL-P3" "TARGET-RT"
# [43] "CTSP-DLBCL1" "GENIE-UHN" "GENIE-MDA"
# [46] "TARGET-AML" "TARGET-NBL" "TARGET-WT"
# [49] "NCICCR-DLBCL" "TCGA-LIHC" "TCGA-THYM"
# [52] "TCGA-CHOL" "TCGA-SKCM" "TARGET-CCSK"
# [55] "TARGET-OS" "TCGA-DLBC" "TCGA-CESC"
# [58] "TCGA-BRCA" "TCGA-KIRP" "TCGA-BLCA"
# [61] "CGCI-HTMCP-CC" "HCMI-CMDC" "CGCI-BLGSP"
# [64] "CPTAC-2" "CPTAC-3"
查看project中有哪些数据类型,如查询"TCGA-LUAD",有7种数据类型,case_count为病人数,file_count为对应的文件数,file_size为总文件大小,若要下载表达谱,可以设置参数data.category="Transcriptome Profiling"
TCGAbiolinks:::getProjectSummary("TCGA-LUAD")
# $case_count
# [1] 585
#
# $file_count
# [1] 18162
#
# $file_size
# [1] 2.304064e+13
#
# $data_categories
# case_count file_count data_category
# 1 519 2916 Transcriptome Profiling
# 2 569 5368 Simple Nucleotide Variation
# 3 585 2731 Biospecimen
# 4 585 623 Clinical
# 5 579 657 DNA Methylation
# 6 518 3383 Copy Number Variation
# 7 582 2484 Sequencing Reads
2. 下载LUAD肺腺癌的转录组数据
建立查询
query <- GDCquery(project = "TCGA-LUAD",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts")
查看所有样本编号
# 594个barcode
samplesDown <- getResults(query,cols=c("cases"))
从结果中筛选出肿瘤样本barcode:TP(primary solid tumor)
# 533个barcode
dataSmTP <- TCGAquery_SampleTypes(barcode = samplesDown,
typesample = "TP")
从结果中筛选出NT(正常组织)样本的barcode:NT()
# 59个barcode
dataSmNT <- TCGAquery_SampleTypes(barcode = samplesDown,
typesample = "NT")
还有2个复发的样本barcode:NR(Recurrent Solid Tumor)
# 2个barcode
dataSmTR <- TCGAquery_SampleTypes(barcode = samplesDown,
typesample = "TR")
重新按照样本分组建立查询
queryDown <- GDCquery(project = "TCGA-LUAD",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts",
barcode = c(dataSmTP, dataSmNT)
)
下载查询到的数据,默认存放位置为当前工作目录下的GDCdata文件夹中
GDCdownload(query = queryDown, files.per.chunk=6, method ="api",
directory ="lung_cancer")
3. 数据读入与预处理
读取下载的数据并将其准备到R对象中,在工作目录生成luad_case.rda文件,同时还产生Human_genes__GRCh38_p12_.rda文件(project文件)
dataPrep1 <- GDCprepare(query = queryDown, save = TRUE,
save.filename = "luad_cases.rda",
directory ="lung_cancer")
# Starting to add information to samples
# => Add clinical information to samples
# => Adding TCGA molecular information from marker papers
# => Information will have prefix 'paper_'
# luad subtype information from:doi:10.1038/nature13385
# Accessing www.ensembl.org to get gene information
# Downloading genome information (try:0) Using: Human genes (GRCh38.p13)
# From the 60483 genes we couldn't map 3990(不能mapping则自动删除该基因)
# 去除dataPrep1中的异常值,dataPrep数据中含有肿瘤组织和正常组织的数据
# save(dataPrep1, file="dataPrep1_LUAD_TP_TN.RData")
rm(list=ls())
load("dataPrep1_LUAD_TP_TN.RData")
TCGAanalyze_Preprocessing执行不同样本表达矩阵之间相关性(Array Array Intensity correlation,AAIC)分析。根据样本之间Spearman相关系数的平方对称矩阵和箱线图,找到具有低相关性的样本,这些样本可以被识别为可能的异常值。使用spearman相关系数去除数据中的异常值,一般设置阈值为0.6。在该次分析中无异常样本
# 无样本被删除
dataPrep <- TCGAanalyze_Preprocessing(object = dataPrep1,
cor.cut = 0.6,
datatype = "HTSeq - Counts",
filename = "AAIC.png")
将数据进行标准化使得不同样本之间具有可比性,使用包含EDASeq包功能的TCGAanalyze_Normalization函数,将mRNA标准化,此功能实现泳道内归一化程序,以针对GC含量效应(或其他基因水平的效应):全局缩放和分位数归一化。
dataNorm.luad <- TCGAanalyze_Normalization(tabDF = dataPrep,
geneInfo = geneInfo,
method = "gcContent")
使用TCGAanalyze_Filtering对低表达量的mRNA进行过滤,method = "quantile"时为筛选出所有样本中均值小于所有样本中rowMeans qnt.cut = 0.25的分位数(默认为25%的分位数)基因,即只保留表达量为前75%的基因。参考:The filtering of low-expression genes of RNA-seq data
dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm.luad,
method = "quantile",
qnt.cut = 0.25)
save(dataFilt, file="dataFilt.RData")
load("dataFilt.RData")
4. 分组及差异分析
选择NT组的前10和TP组的前10个样本进行差异分析
# selection of normal samples "NT"
samplesNT <- TCGAquery_SampleTypes(barcode = colnames(dataFilt),
typesample = c("NT"))
# selection of tumor samples "TP"
samplesTP <- TCGAquery_SampleTypes(barcode = colnames(dataFilt),
typesample = c("TP"))
# Diff.expr.analysis (DEA)
dataDEGs <- TCGAanalyze_DEA(mat1 = dataFilt[,samplesNT[1:10]],
mat2 = dataFilt[,samplesTP[1:10]],
Cond1type = "Normal",
Cond2type = "Tumor",
# fdr.cut = 0.01 ,
# logFC.cut = 1,
method = "glmLRT")
# Batch correction skipped since no factors provided
# ----------------------- DEA -------------------------------
# there are Cond1 type Normal in 10 samples
# there are Cond2 type Tumor in 10 samples
# there are 12980 features as miRNA or genes
# I Need about 8.7 seconds for this DEA. [Processing 30k elements /s]
# ----------------------- END DEA -------------------------------
DEG.LUAD.edgeR <- TCGAanalyze_DEA(mat1 = dataFilt[,samplesNT[1:10]],
mat2 = dataFilt[,samplesTP[1:10]],
pipeline="edgeR",
batch.factors = c("TSS"),
Cond1type = "Normal",
Cond2type = "Tumor",
voom =FALSE,
method = "glmLRT",
# fdr.cut = 0.01, #保留FDR<0.01的基因
# logFC.cut = 1 #保留logFC>1的基因
)
# ----------------------- DEA -------------------------------
# there are Cond1 type Normal in 10 samples
# there are Cond2 type Tumor in 10 samples
# there are 12980 features as miRNA or genes
# I Need about 8.7 seconds for this DEA. [Processing 30k elements /s]
# ----------------------- END DEA -------------------------------
5. 绘制火山图和热图
火山图绘制
valcano_data <- data.frame(genes=rownames(DEG.LUAD.edgeR),
logFC=DEG.LUAD.edgeR$logFC,
FDR=DEG.LUAD.edgeR$FDR,
group=rep("NotSignificant",
nrow(DEG.LUAD.edgeR)),
stringsAsFactors = F)
valcano_data[which(valcano_data['FDR'] < 0.05 &
valcano_data['logFC'] > 1.5),"group"] <- "Increased"
valcano_data[which(valcano_data['FDR'] < 0.05 &
valcano_data['logFC'] < -1.5),"group"] <- "Decreased"
cols = c("darkgrey","#00B2FF","orange")
names(cols) = c("NotSignificant","Increased","Decreased")
library(ggplot2)
ggplot(valcano_data, aes(x = logFC, y = -log10(FDR), color = group))+
scale_colour_manual(values = cols) +
ggtitle(label = "Volcano Plot", subtitle = "LUAD 10 samples volcano plot") +
geom_point(size = 2.5, alpha = 1, na.rm = T) +
theme_bw(base_size = 14) +
theme(legend.position = "right") +
xlab(expression(log[2]("logFC"))) +
ylab(expression(-log[10]("FDR"))) +
geom_hline(yintercept = 1.30102, colour="#990000", linetype="dashed") +
geom_vline(xintercept = 1.5849, colour="#990000", linetype="dashed") +
geom_vline(xintercept = -1.5849, colour="#990000", linetype="dashed")+
scale_y_continuous(trans = "log1p")
在火山图中标注特定基因如CALCA和MUC5B基因为红色
valcano_data_specific_genes = valcano_data[which(valcano_data$genes %in%
c("CALCA", "MUC5B")), ]
cols = c("darkgrey","#00B2FF","orange", "red")
names(cols) = c("NotSignificant","Increased","Decreased", "Specific")
library(ggplot2)
ggplot(valcano_data, aes(x = logFC, y = -log10(FDR), color = group))+
scale_colour_manual(values = cols) +
ggtitle(label = "Volcano Plot", subtitle = "LUAD 10 samples volcano plot") +
geom_point(size = 2.5, alpha = 1, na.rm = T) +
theme_bw(base_size = 14) +
theme(legend.position = "right") +
xlab(expression(log[2]("logFC"))) +
ylab(expression(-log[10]("FDR"))) +
geom_hline(yintercept = 1.30102, colour="#990000", linetype="dashed") +
geom_vline(xintercept = 1.5849, colour="#990000", linetype="dashed") +
geom_vline(xintercept = -1.5849, colour="#990000", linetype="dashed")+
scale_y_continuous(trans = "log1p") +
geom_point(data=valcano_data_specific_genes, col="red", size=2)
合并平均表达量与差异基因表格(方便查询)
#获取肿瘤组织对应的表达矩阵
dataTP <- dataFilt[,samplesTP[1:10]]
#获取正常组织对应的表达矩阵
dataTN <- dataFilt[,samplesNT[1:10]]
# 合并表格,增加2组每个基因的平均表达量
dataDEGsFiltLevel <- TCGAanalyze_LevelTab(FC_FDR_table_mRNA=dataDEGsFilt,
typeCond1="Normal",
typeCond2="Tumor",
TableCond1=dataTN,
TableCond2=dataTP)
#查看结果
head(dataDEGsFiltLevel,2)
# mRNA logFC FDR Normal Tumor Delta
# SFTPC SFTPC -9.656391 0.001493513 850460.0 76090.8 8212374.1
# SFTPA2 SFTPA2 -1.755048 0.277785397 521998.4 154404.7 916132.2
# CALCA CALCA 5.803164 0.2678382 22.8 9511.5 132.3121
# MUC5B MUC5B 5.2703226 1.827029e-03 2722.7 49155.1 14349.507227
热图绘制
library(pheatmap)
t <- dataFilt[valcano_data$genes[which(valcano_data['FDR'] < 0.05 &
abs(valcano_data['logFC']) >1.5)],
c(samplesTP[1:10], samplesNT[1:10])]
t <- na.omit(t)
t <- log2(t+1)
col <- data.frame(Type=c(rep("Tumor", 10), rep("Normal", 10)))
col$Type <- factor(col$Type, levels = c("Normal", "Tumor"))
rownames(col) <- colnames(t)
pheatmap(t,
annotation_col = col,
annotation_colors = list(Type=c(Tumor="red", Normal="#00B2FF")),
annotation_legend = T,
border_color = "black",
cluster_rows = T,
cluster_cols = T,
show_colnames = F,
show_rownames = F,
color = colorRampPalette(c("green", "black", "red"))(50))