注:本代码来源于B站基因课视频教程
准备参考基因组数据
#创建工作区间
mkdir workspace
cd workspace
mkdir RNAseq
mkdir 1.reference
mkdir 2.software
#进入工作目录进行文件下载
cd 1.reference
wget http://citrus.hzau.edu.cn/data/Genome_info/SWO.v3.0/SWO.v3.0.genome.fa
wget http://citrus.hzau.edu.cn/data/Genome_info/SWO.v3.0/SWO.v3.0.protein.fa
wget http://citrus.hzau.edu.cn/data/Genome_info/SWO.v3.0/SWO.v3.0.gene.model.gff3
seqtk seq -l 60 SWO.v3.0.genome.fa >genome.fa
#将SWO.v3.0.genome.fa转换为标准的genome.fa格式并为其建立索引;
seqkit faidx genome.fa
#把gff3格式的文件转换为ensembl格式的gtf文件便于后续分析;
gffread ‑T ‑o temp.gtf ./SWO.v3.0.gene.model.gff
gtftk convert_ensembl ‑i temp.gtf >genes.gtf
#提取基因名称以及最长转录本的ID
gtftk short_long -l -i genes.gtf | gtftk select_by_key -k feature -v transcript | gtftk tabulate -k transcript_id,gene_id >longest_mapid.txt
#筛选出转录本ID和对应的CDS序列
sed '1d' longest_mapid.txt|cut ‑f 1 | seqtk subseq raw/SWO.v3.0.protein.fa ‑ > longest_transcript.proteins.fa
#将转录本ID转化为基因ID,得到基因所对应的最长转录本
seqkit replace ‑p '(.)$' ‑r '{kv}' ‑k longest_mapid.txt longest_transcript.proteins.fa > proteins.fa
#用hisat2为参考基因组建立索引
hisat2_extract_splice_sites.py genes.gtf >genome.ss
hisat2_extract_exons.py ../12.ref/genes.gtf >genome.exon
hisat2‑build ‑‑exon genome.exon ‑‑ss genome.ss genome.fa genome
下载测序数据及处理
#下载使用的方法是用kingfisher加上SRA编号下载,在此演示一个文件的下载过程
kingfisher get -r SRR6451531 -m aws-http aws-cp ena-ftp prefetch
#将sra文件转换为fastq文件
fastq-dump --split-files SRR6451531.sra
#对比对的文件进行质控
fastp -i SRR6451531_1.fastq -I SRR6451531_2.fastq -o Cs31_1.fq.gz -O Cs31_2.fq.gz
#将质控之后的read文件比对到参考基因组上
hisat2 --new-summary -p 4 -x genome -1 Cs"$i"_1.fq.gz -2 Cs"$i"_2.fq.gz -S Cs"$i".sam --rna-strandness RF
#Sam文件转为bam文件便于稍后处理
samtools view -@ 4 -b -o Cs"$i".bam Cs"$i".sam
#将bam文件进行排序,便于可视化
samtools sort -O bam -@ 4 -o Cs31.sort.bam -T tmp_samtools Cs31.bam
#samtools 软件为文件建立索引
samtools index Cs31.sort.bam
#用IGV软件进行可视化
#将bam文件的read数与基因进行定量
Rscript ../2.software/RunFeatureCounts/run-featurecounts.R -b Cs31.bam -g genes.gtf -f exon -a gene_id -o Cs31"
#按照上述方法处理总计15个样本文件,之后准备gene.quant_files.txt文件
#将15个conut文件的内容组成数据矩阵
perl ../1.software/RunFeatureCounts/abundance_estimates_to_matrix.pl --est_method featureCounts --out_prefix gene --quant_files gene.quant_files.txt
#准备差异表达分析的文件:sample_info.txt文件、contrast.txt文件
#进行样本之间的差异表达分析
perl /pub/software/trinityrnaseq-v2.11.0/Analysis/DifferentialExpression/run_DE_analysis.pl --matrix gene.counts.matrix --method DESeq2 --samples_file sample_info.txt --contrasts contrast.txt
#得到一个DE.*某某的目录,计入之后可以看见Cs1与Cs2等之间的表达差异分析文件
#将各个样本进行合成总的数据
head -1 DESeq2.7796.dir/gene.counts.matrix.Cs1_vs_Cs2.DESeq2.DE_results >merge.DE_results
cat DESeq2.7796.dir/gene.counts.matrix.*.DE_results|grep -v "^sampleA" >>merge.DE_results
#汇总:
for i in 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
do
kingfisher get -r SRR64515"$i" -m aws-http aws-cp ena-ftp prefetch
fastq-dump --split-files SRR64515"$i".sra
fastp -i SRR64515"$i"_1.fastq -I SRR64515"$i"_2.fastq -o Cs"$i"_1.fq.gz -O Cs"$i"_2.fq.gz
hisat2 --new-summary -p 4 -x genome -1 Cs"$i"_1.fq.gz -2 Cs"$i"_2.fq.gz -S Cs"$i".sam --rna-strandness RF
samtools view -@ 4 -b -o Cs"$i".bam Cs"$i".sam
Rscript ../RunFeatureCounts/run-featurecounts.R -b Cs"$i".bam -g genes.gtf -f exon -a gene_id -o Cs"$i"
done
R分析
基础分析
#一、样本关系探索
gene_exp = read.table(file = "gene.TMM.TPM.matrix") # 导入表达矩阵
library(tidyverse)
read.table(file = "./sample_info.txt") %>%
rename(group = "V1",sample = "V2") %>%
column_to_rownames(var = "sample") -> sample_info # 导入样本信息表
sample_cor = round(cor(gene_exp), digits = 2) # 计算样本之间的相关系数
#绘制热图
library(pheatmap)
pheatmap(sample_cor,
cluster_rows = F, cluster_cols =F, # 不聚类
cellwidth = 15, cellheight = 15, # cell大小
border_color = "white", # 边框颜色
fontsize = 8, # 字体大小
angle_col = 45, # 列倾斜
display_numbers = T, # 显示数值
fontsize_number = 5) # 数值字体大小
#样本聚类
dist(t(gene_exp))
plot(hclust(dist(t(gene_exp))))
library(corrgram)
stars(sample_cor,full = TRUE, draw.segments = TRUE,main = "15个样本星状图")
Library(corrgram)
corrgram(gene_exp, order = TRUE, upper.panel = panel.ellipse, lower.panel = panel.cor,
main = '柑橘样品之间的相关性')
#主成分分析
library(PCAtools)
p1 = pca(gene_exp, metadata = sample_info, removeVar = 0.3)
screeplot(p1)
biplot(p1,
x = 'PC1',
y = 'PC2',
colby = "group",
hline = 0, vline = 0,
encircle = T, encircleFil = T,
#ellipse = T
)
#二、差异基因表达探索
library(readr)
de_result <- read.delim("merge.DE_results",
delim = "\t",escape_double = FALSE,
trim_ws = TRUE) # 导入数据
library(tidyverse)
select(de_result, gene_id,sampleA, sampleB, log2FoldChange, padj) %>%
mutate(direction = if_else(abs(log2FoldChaneg) < 1 | padj > 0.05, 'ns',if_else(log2FoldChange >=1,'up', 'down'))) ->de_result # 数据处理
filter(de_result, direction != 'ns') %>%
group_by(sampleA, sampleB, direction) %>%
summarise(count = n()) # 统计上下调基因的数目
library(VennDiagram)
de_list = list(
Cs2 = filter(de_result, sampleA == "Cs1", sampleB == "Cs2", direction != 'ns') %>% pull(gene_id),
Cs3 = filter(de_result, sampleA == "Cs1", sampleB == "Cs3", direction != 'ns') %>% pull(gene_id),
Cs4 = filter(de_result, sampleA == "Cs1", sampleB == "Cs4", direction != 'ns') %>% pull(gene_id),
Cs5 = filter(de_result, sampleA == "Cs1", sampleB == "Cs5", direction != 'ns') %>% pull(gene_id),
)
library(RColorBrewer)
RColorBrewer::brewer.pal(4, name = "Set1")
venn.diagram(
x = de_list,
filename = "de_venn.tiff",
fill = RColorBrewer::brewer.pal(4, name = "Set1")
) # 画Venn图
my_de_result = filter(de_result, sampleA == 'Cs1', sampleB == 'Cs2') %>%
arrange(desc(abs(log2FoldChaneg)))
library(EnhancedVolcano)
EnhancedVolcano(my_de_result,
x = 'log2FoldChange',
y = 'padj',
lab = my_de_result$gene_id,
FCcutoff = 2,
pCutoff = 0.01) # 画Cs1组和Cs5组火山图
top_de = filter(de_result, sampleA == 'Cs1', sampleB == 'Cs5') %>%
arrange(desc(abs(log2FoldChange))) %>%
slice(1:20) %>%
pull(gene_id)
top_de_exp = gene_exp[top_de,]
pheatmap(log2(top_de_exp + 1)) # 挑选20个基因做热图
q() # 退出R环境
富集分析
#构建功能注释库,选择之前下载处理好的proteins.fa上传到这个网站在邮箱点击管理链接进入启动注释工作,之后到下方链接进行下载基因功能注释的数据库,进行网页下载构建好的文件
wget http://eggnog-mapper.embl.de/MM_fkcyqapg/out.emapper.annotations
#下载处理注释结果的文件和脚本,再用脚本将处理文件转为数据库
git clone http://git.genek.cn:3333/zhxd2/emcp.git
Rscript ../2.software/emcp/emapperx.R out.emapper.annotations ./proteins.fa
#进入R环境分析
#Go富集
library(org.My.eg.db, lib.loc = "./R_Library/") # 导入基因的功能注释数据库OrgDB
my_deg_result <- filter(de_result,
sampleA == "Cs1",
sampleB == "Cs5") %>%
arrange(desc(abs(log2FoldChaneg))) # 导入数据
gene <- filter(my_deg_result,
direction != "NS") %>% pull(gene_id) # 筛选目标基因列表
geneList <- my_deg_result$log2FoldChange
names(geneList) <- my_deg_result$gene$id
geneList <- sort(geneList, decreasing = T) # 准备基因差异列表和差异基因列表
library(clusterProfiler)
library(org.My.eg.db, lib.loc = "/R_Library/")
ggo <- groupGo(gene = gene,
OrgDb = org.My.eg.db,
keyType = "GID",
ont = "BP",
level = 3,
readable = FALSE) # 准备数据,GO分类
head(ggo) # 查看结果
ego <- enrichGO(gene = gene,
universe = names(geneList),
OrgDb = org.My.eg.db,
keyType = "GID",
ont = "BP",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = FALSE)
ego_df <- as.data.frame(ego) # Go over-representation 富集
ego <- dropGO(ego, level = 1:3) %>%
filter(!str_detect(Description, "drug")) # 去除部分term,去除 level 1,2,3的水平
ego <- enrichplot::pairwise_termsim(ego)
ego <- clusterProfiler::simplify(ego, cutoff=0.7,
by="p.adjust",
select_fun=min) # 优化富集结果
goplot(ego)
barplot(ego, showCategory = 15)
dotpolt(ego, showCategory = 15)
cnetpolt(ego, foldChange = geneList) # 可视化
# GO的 GSEA富集及可视化
ego2 <- gseGO(geneList = geneList,
OrgDb = org.My.eg.db,
keyType = "GID",
ont = "BP",
minGSSize = 100,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = FLASE)
head(ego2)
goplot(ego2)
gseaplot2(ego2, geneSetID = 1:3,
pvalue_table = TRUE) # 可视化
#KEGG Pathway 的富集及可视化
library(magrittr)
get_path2name <- function(){
keggpathid2name.df <- clusterProfiler::kegg_list("pathway")
keggpathid2name.df[,1] %<>% gsub("path:map", "", .)
colnames(keggpathid2name.df) <- c("path_id", "path_naem")
return(keggpathid2name.df)
} # 数据库构建代码
pathway2name <- get_path2name() # 准备TERM2NAME
emapper <- read.delim("/out.emapper.annotations",
"\t", escape_double = FALSE, col_names = FALSE,
comment = "#", trim_ws = TRUE) %>%
dplyr::select(GID = X1,
COG = X7,
Gnen_Name = X8,
Gene_Symbol = X9,
GO = X10,
KO = X12,
Pathway = X13)
pathway2gene <- dplyr::select(emapper, Pathway, GID) %>%
separate_rows(Pathway, sep = ',', convert = F) %>%
filter(str_detect(Pathway, 'ko')) %>%
mutate(Pathway = str_remove(Pathway, 'ko')) # 准备 TERM2GENE
#富集分析
ekp <- enricher(gene,
TERM2GENE = pathway2gene,
TERM2NAME = payhway2name,
pvalueCutoff = 0.05,
qvalueCutoff = 0.05)
head(ekp)
#KEGG Pathway 的 GSEA富集及可视化
ekp2 = GSEA(
geneList,
TERM2GENE = pathway2gene,
TERM2NAME = pathway2name,
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 0.05
)
#得到富集结果之后可以用其他的画图方法展现富集结果
#Keeg富集结果图
goplot(ego)
barplot(ekp, showCategory = 15)
dotplot(ekp, showCategory = 15)
centplot(ekp, foldChange = geneList)
cnetplot(ekp, foldChange = geneList,showCategory = 15)