Bulk RNAseq上游比对1:大致流程与conda环境 - 简书 (jianshu.com)
Bulk RNAseq上游比对2:下载数据、质控 - 简书 (jianshu.com)
Bulk RNAseq上游比对3:比对mapping - 简书 (jianshu.com)
Step1:下载数据
1.1 下载公共数据框的测序数据
- 示例数据:GSE158623 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE158623
- SRR:https://www.ncbi.nlm.nih.gov/Traces/study/?acc=PRJNA665998,可以下载.sra文件[不推荐]
- EBI:https://www.ebi.ac.uk/ena/browser/view/SRR12720999,提供fastq.gz的aspera(推荐)、ftp下载链接
cat > SraAccList.txt
SRR12720999
SRR12721000
SRR12721001
SRR12721002
SRR12721003
SRR12721004
conda activate download
方式1:aspera
# 批量生成下载链接
# era-fasp@fasp.sra.ebi.ac.uk:/vol1/fastq/SRR166/009/SRR1663609/SRR1663609_1.fastq.gz
touch ascp.link
cat SraAccList.txt | while read id
do
echo "era-fasp@fasp.sra.ebi.ac.uk:/vol1/fastq/${id:0:6}/0${id:0-2}/${id}/${id}_1.fastq.gz" >> ascp.link
echo "era-fasp@fasp.sra.ebi.ac.uk:/vol1/fastq/${id:0:6}/0${id:0-2}/${id}/${id}_2.fastq.gz" >> ascp.link
done
#ascp高速下载
cat ascp.link |while read sample
do
ascp -QT -l 300m -P33001 \
-i ~/miniconda3/envs/download/etc/asperaweb_id_dsa.openssh \
$sample .
done
方式2:wget ftp
# 批量生成下载链接
# ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR127/099/SRR12720999/SRR12720999_1.fastq.gz
touch ftp.link
cat SraAccList.txt | while read id
do
echo "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/${id:0:6}/0${id:0-2}/${id}/${id}_1.fastq.gz" >> ftp.link
echo "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/${id:0:6}/0${id:0-2}/${id}/${id}_2.fastq.gz" >> ftp.link
done
cat ftp.link | while read id
do
wget -c $id
done
方式3:prefetch下载sra文件,再转为fastq.gz
#单独下载
prefetch SRR12720999
#批量下载 SRR list
prefetch --option-file SraAccList.txt
# sra2fastq
cat SraAccList.txt | while read id
do
echo ${id}
fasterq-dump --split-files $id
done
#fastq2fastq.gz
gzip *fastq
- 然后查看下载得到的fastq.gz质量如何
#### fastqc
fastqc $(ls ${pare_dir}/raw/ebi/*gz) -o ${pare_dir}/raw/ebi/ -t 10
####QC merged report
multiqc ./ -n rawfq_multiqc_report.html
1.2 下载参考基因组及相关数据
refgenie init -c ~/refgenie/genome_config.yaml
refgenie listr -c ~/refgenie/genome_config.yaml
refgenie listr -g hg38 -c ~/refgenie/genome_config.yaml
#参考基因组
refgenie pull hg38/fasta -c ~/refgenie/genome_config.yaml
refgenie pull hg38_cdna/fasta -c ~/refgenie/genome_config.yaml
#参考注释信息
refgenie pull hg38/gencode_gtf -c ~/refgenie/genome_config.yaml
#比对软件的索引文件
refgenie pull hg38/bowtie2_index -c ~/refgenie/genome_config.yaml
refgenie pull hg38/bwa_index -c ~/refgenie/genome_config.yaml
refgenie pull hg38/star_index -c ~/refgenie/genome_config.yaml
refgenie pull hg38/hisat2_index -c ~/refgenie/genome_config.yaml
refgenie pull hg38_cdna/salmon_index -c ~/refgenie/genome_config.yaml
#列出本地已经下载的数据
refgenie list -c ~/refgenie/genome_config.yaml
Step2:质控
- trim_glaore
#以其中一个作为示例
pare_dir=/home/data/****/mapping
fq1=${pare_dir}/raw/ebi/SRR12720999_1.fastq.gz
fq2=${pare_dir}/raw/ebi/SRR12720999_2.fastq.gz
trim_galore -j 8 -q 25 --phred33 --length 36 \
-paired -o ${pare_dir}/trim \
$fq1 $fq2
#批量
cat ${pare_dir}/SraAccList.txt | while read id
do
echo $id
trim_galore -j 8 -q 25 --phred33 --length 36 \
-paired -o ${pare_dir}/trim \
${pare_dir}/raw/ebi/${id}_1.fastq.gz \
${pare_dir}/raw/ebi/${id}_2.fastq.gz
done
- 查看质控之后的fatsq.gz质量
#### fastqc
fastqc $(ls ${pare_dir}/trim/*gz) -o ${pare_dir}/trim/ -t 10
####QC report
multiqc ./ -n trim_multiqc_report.html