ChIP-seq-analysis
In-depth-NGS-Data-Analysis-Course
以下使用 DESeq2 和 DiffBind 两个 R 包来分析差异 peak 区域。两者思路类似,但 DESeq2 流程无法考虑 input 对照的影响;DiffBind 的 samplesheet 创建比较耗时,填写时要细心!
1.DESeq2
data_list
41-Con-T1 41-Con-T2 41-TF-T1 41-TF-T2 41-TF broad
count.sh
#!/bin/bash
# count.sh (ov-chip)
#
# For every line of ./data_list: concatenate the MACS2 peak files of two
# control and two treatment samples, merge overlapping peaks with bedtools,
# convert the merged intervals to a GTF, count reads per merged peak with
# featureCounts, and run DESeq2 through run_deseq2.r.
#
# data_list columns (whitespace separated):
#   Con_1 Con_2 Treat_1 Treat_2 name type
# type "narrow" selects the narrowPeak files; any other value selects the
# broadPeak files.
bam_PATH=/public/home/xdyu/kcao/ChIP-seq-BWA-over/3.align
narrow_PATH=/public/home/xdyu/kcao/ChIP-seq-BWA-over/6.bed_macs2/02.narrow
broad_PATH=/public/home/xdyu/kcao/ChIP-seq-BWA-over/6.bed_macs2/03.broad
merge_peak=/public/home/xdyu/kcao/ChIP-seq-BWA-over/8.DESeq2/01.merge_peak
gtf_file=/public/home/xdyu/kcao/ChIP-seq-BWA-over/8.DESeq2/02.gtf
featureCounts_file=/public/home/xdyu/kcao/ChIP-seq-BWA-over/8.DESeq2/03.featurecount

# Run the peak-merge -> GTF -> featureCounts -> DESeq2 chain for one comparison.
# Arguments: Con_1 Con_2 Treat_1 Treat_2 name type
# Returns non-zero as soon as a step fails, so later steps never run on
# missing or truncated intermediate files and no false "ok" message is printed.
process_comparison() {
    local Con_1=$1 Con_2=$2 Treat_1=$3 Treat_2=$4 name=$5 type=$6
    local peak_dir peak_kind sample
    local -a peak_files=() bam_files=()

    # The narrow and broad workflows differ only in where the peak files live
    # and in their file-name suffix.
    if [ "$type" = "narrow" ]; then
        peak_dir=$narrow_PATH
        peak_kind=narrow
    else
        peak_dir=$broad_PATH
        peak_kind=broad
    fi

    for sample in "$Con_1" "$Con_2" "$Treat_1" "$Treat_2"; do
        peak_files+=("${peak_dir}/${sample}_${peak_kind}_peaks.${peak_kind}Peak")
        bam_files+=("${bam_PATH}/${sample}.uniq.bam")
    done

    # Pool the peaks of all four samples.
    cat "${peak_files[@]}" > "${merge_peak}/${name}.merged.peak" || return 1
    echo "$name cat step is ok!!"

    # bedtools merge needs position-sorted input.
    sort -k1,1 -k2,2n "${merge_peak}/${name}.merged.peak" \
        > "${merge_peak}/${name}.merged.peak.sorted" || return 1
    bedtools merge -i "${merge_peak}/${name}.merged.peak.sorted" \
        > "${merge_peak}/${name}.merge.bed" || return 1
    echo "$name bedtools merge step is ok!"

    # Describe each merged peak as an "exon" feature so featureCounts can use it.
    # BED starts are 0-based while GTF starts are 1-based, hence ($2+1).
    awk -v var="$name" '{print $1"\t"var"\texon\t"($2+1)"\t"$3"\t.\t+\t.\tgene_id \"merge_peak_"NR"\"; transcript_id \"merge_peak_"NR"\";"}' \
        "${merge_peak}/${name}.merge.bed" > "${gtf_file}/${name}.gtf" || return 1
    echo "$name gtf is ok "

    featureCounts -T 12 -p -t exon -g gene_id \
        -a "${gtf_file}/${name}.gtf" \
        -o "${featureCounts_file}/${name}.count.txt" \
        "${bam_files[@]}" || return 1
    echo "$name count is ok!"

    # Convert to the DESeq2 count format: drop the first (comment) line, strip
    # the BAM directory from the column headers, keep the id column and the
    # four sample columns.
    sed -e 1d -e "s|${bam_PATH}/||g" "${featureCounts_file}/${name}.count.txt" \
        | cut -f 1,7-10 > "${featureCounts_file}/${name}.count.clean.txt" || return 1

    # run_deseq2.r resolves both file names against its own fixed directories.
    Rscript run_deseq2.r "${name}.count.clean.txt" "${name}.deseq2.output.csv"
}

# Loading the module once is enough; it does not depend on the comparison.
module load R/3.5.1

# "|| [ -n ... ]" keeps a final line that lacks a trailing newline.
while read -r Con_1 Con_2 Treat_1 Treat_2 name type _ || [ -n "$Con_1" ]; do
    # Skip blank lines.
    [ -z "$Con_1" ] && continue
    if [ -z "$name" ]; then
        echo "data_list: skipping line with fewer than 5 fields (starts with '$Con_1')" >&2
        continue
    fi
    # </dev/null stops the tools from consuming the remaining data_list lines.
    process_comparison "$Con_1" "$Con_2" "$Treat_1" "$Treat_2" "$name" "$type" </dev/null \
        || echo "$name failed; continuing with the next comparison" >&2
done < data_list
run_deseq2.r
#!/usr/bin/env Rscript
## Usage: Rscript run_deseq2.r ${name}.count.clean.txt ${name}.deseq2.output.csv
##
## Reads a count table (first column = feature id, followed by four sample
## columns), runs DESeq2 with a two-level `condition` design, and writes the
## results merged with the normalized counts, ordered by adjusted p-value.
## Both arguments are plain file names (command-line arguments arrive as
## strings) and are resolved against the fixed directories below.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
  stop("usage: Rscript run_deseq2.r <count.clean.txt> <output.csv>",
       call. = FALSE)
}
library(DESeq2)

count_dir <- "/public/home/xdyu/kcao/ChIP-seq-BWA-over/8.DESeq2/03.featurecount/"
output_dir <- "/public/home/xdyu/kcao/ChIP-seq-BWA-over/8.DESeq2/04.deseq2/"

countdata <- read.table(paste0(count_dir, args[1]), header = TRUE,
                        row.names = 1, check.names = FALSE)
#colnames(countdata)<-c("ov_41_con_T1","ov_41_con_T2","ov_41_EAF1_T1","ov_41_EAF1_T2")

# Columns 1-2 are labelled condition "1" and columns 3-4 condition "2".
# NOTE(review): presumably control, control, treatment, treatment -- confirm
# against the script that produces the count table.
# An explicit factor fixes "1" as the reference level instead of relying on
# DESeq2 converting a character column.
condition <- factor(c("1", "1", "2", "2"), levels = c("1", "2"))
if (ncol(countdata) != length(condition)) {
  stop("expected ", length(condition), " sample columns in ", args[1],
       ", found ", ncol(countdata), call. = FALSE)
}
coldata <- data.frame(row.names = colnames(countdata), condition)

dds <- DESeqDataSetFromMatrix(countData = countdata, colData = coldata,
                              design = ~condition)
print(dds)
print("dds is ok")

# Drop features with fewer than 10 reads in total across all samples.
keep <- rowSums(counts(dds)) >= 10
filter_dds <- dds[keep, ]

dds2 <- DESeq(filter_dds)
res <- results(dds2)

# Put the test statistics and the normalized counts of each feature side by
# side (joined on the feature id), then order by adjusted p-value.
res_2 <- merge(as.data.frame(res),
               as.data.frame(counts(dds2, normalized = TRUE)),
               by = "row.names", sort = FALSE)
res_2 <- res_2[order(res_2$padj), ]
print("get res_2!")

write.csv(res_2, file = paste0(output_dir, args[2]))
2.DiffBind
#!/usr/bin/env Rscript
## Workspace: /public/home/xdyu/kcao/ChIP-seq-BWA-over/9.DiffBind
## Usage: Rscript run_DiffBind.r input_directory output_directory >run_DiffBind.log
##
## For every sample sheet (*.csv) in input_directory, run the DiffBind workflow
## and write PCA / heatmap / Venn PDFs, the full DESeq2 report and a BED-like
## file of the FDR < 0.05 regions into output_directory/<sheet name>/.
library(DiffBind)
library(stringr)

args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
  stop("usage: Rscript run_DiffBind.r <input_directory> <output_directory>",
       call. = FALSE)
}
input_directory <- args[1]  # e.g. /public/home/xdyu/kcao/ChIP-seq-BWA-over/9.DiffBind/01.samplesheet_file
output_directory <- args[2] # e.g. /public/home/xdyu/kcao/ChIP-seq-BWA-over/9.DiffBind/02.diffbind_output

# Draw one plot into a PDF file; the device is closed even if plotting fails.
save_pdf <- function(path, draw) {
  pdf(path)
  on.exit(dev.off(), add = TRUE)
  draw()
  invisible(path)
}

# Only names that really end in ".csv"; an empty result simply skips the loop.
sheets <- list.files(input_directory, pattern = "\\.csv$")

for (sheet in sheets) {
  name <- str_sub(sheet, end = -5)  # strip the ".csv" suffix
  # Values are not auto-printed inside a for loop, so everything meant for the
  # log is written explicitly.
  cat(name, "is working!\n")

  # Create the per-sheet output directory (no warning on re-runs).
  sheet_dir <- file.path(output_directory, name)
  dir.create(sheet_dir, showWarnings = FALSE, recursive = TRUE)
  out_prefix <- file.path(sheet_dir, name)

  ### e.g. sheet <- "over_41_TF.csv"
  tmp <- dba(sampleSheet = file.path(input_directory, sheet))
  print(tmp)
  tmp <- dba.count(tmp)

  ### PCA of the counted data
  save_pdf(paste0(out_prefix, ".PCA.pdf"), function() {
    dba.plotPCA(tmp, attributes = DBA_TREATMENT, label = DBA_ID)
  })

  ### Heatmap
  save_pdf(paste0(out_prefix, ".heatmap.pdf"), function() {
    plot(tmp)
  })

  ### Differential analysis (the part that parallels the DESeq2 workflow)
  # Contrast on the Treatment column; minMembers is the number of replicates.
  tmp <- dba.contrast(tmp, categories = DBA_TREATMENT, minMembers = 2)
  tmp <- dba.analyze(tmp, method = DBA_ALL_METHODS)
  print(dba.show(tmp, bContrasts = TRUE))

  ### PCA on the regions identified by DESeq2
  save_pdf(paste0(out_prefix, ".DESeq2.PCA.pdf"), function() {
    dba.plotPCA(tmp, contrast = 1, method = DBA_DESEQ2,
                attributes = DBA_TREATMENT, label = DBA_ID)
  })

  ### Venn plot of the DESeq2 and edgeR overlap
  save_pdf(paste0(out_prefix, ".Venn.pdf"), function() {
    dba.plotVenn(tmp, contrast = 1, method = DBA_ALL_METHODS)
  })

  ### Extract the DESeq2 result; th = 1 reports every region, not only the
  ### significant ones.
  res_deseq <- dba.report(tmp, method = DBA_DESEQ2, contrast = 1, th = 1)
  out <- as.data.frame(res_deseq)
  write.table(out, file = paste0(out_prefix, ".diffbind.csv"),
              sep = ",", quote = FALSE, row.names = FALSE)

  ### Regions with FDR < 0.05, tab separated without a header
  # NOTE(review): the column order (seqnames, start, end, strand, Fold) is not
  # the standard BED layout -- confirm what downstream tools expect.
  tmp_deseq2.bed <- out[which(out$FDR < 0.05),
                        c("seqnames", "start", "end", "strand", "Fold")]
  write.table(tmp_deseq2.bed, file = paste0(out_prefix, ".enriched.bed"),
              sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
}