1. 环境准备
python
gff3文件:https://www.gencodegenes.org/human/release_19.html
python包:
pip3 install pandas, pysam, numpy
2. 输入文件
#Chr Begin End Size(Mb) Nb_variants Percentage_homozygosity
chr1 37356469 39797055 2.44 57 89.47
chr1 89731959 91742055 2.01 31 90.32
chr1 92327126 94544234 2.22 59 91.53
chr1 172100831 176102969 4 78 91.03
chr2 11718591 15732002 4.01 26 100
chr2 39262068 41384390 2.12 26 92.31
chr2 59614572 62047433 2.43 33 93.94
chr2 80809027 85143406 4.33 27 92.59
chr2 143787161 159672168 15.89 163 93.87
3. 输出文件
样式1
chr start end Size(Mb) Nb_variants Percentage_homozygosity genes
chr1 37356469 39797055 2.44 57 89.47 GRIK3(5.87);ZC3H12A(0.4);MEAF6(0.91);SNIP1(0.81);DNALI1(0.41);GNL2(1.19);RSPO1(0.97);C1orf109(0.44);CDCA8(0.71);EPHA10(2.1);MANEAL(0.3);YRDC(0.21);C1orf122(0.1);MTF1(2.05);INPP5B(3.54);SF3A3(1.39);FHL3(0.36);UTP11L(0.64);POU3F1(0.12);RRAGC(0.89);MYCBP(0.76);GJA9(0.7);RHBDL2(2.29);AKIRIN1(0.61);NDUFS5(0.34);MACF1(10.25)
chr1 89731959 91742055 2.01 31 90.32 GBP5(0.33);GBP6(1.11);LRRC8B(3.63);LRRC8C(6.81);LRRC8D(5.75);ZNF326(2.01);BARHL2(0.28);ZNF644(5.32);HFM1(0.78)
chr1 92327126 94544234 2.22 59 91.53 TGFBR3(2.02);BRDT(2.93);EPHX4(1.51);BTBD8(3.05);KIAA1107(0.8);C1orf146(1.26);GLMN(2.37);RPAP2(4.65);GFI1(0.55);EVI5(12.8);RPL5(0.45);FAM69A(5.38);MTF2(2.7);TMED5(1.4);CCDC18(4.46);DR1(1.06);FNBP1L(4.8);BCAR3(12.87);DNTTIP2(0.55);GCLM(1.09);ABCA4(3.87)
chr1 172100831 176102969 4 78 91.03 DNM3(7.17);PIGC(1.85);C1orf105(1.2);SUCO(1.99);FASLG(0.2);TNFSF18(0.27);TNFSF4(0.59);PRDX6(0.29);SLC9C2(2.56);ANKRD45(1.51);KLHL20(1.79);CENPL(0.63);DARS2(0.85);ZBTB37(0.89);SERPINC1(0.34);RC3H1(2.28);RABGAP1L(20.89);GPR52(0.04);CACYBP(0.31);MRPS14(0.32);TNN(2.0);KIAA0040(0.9);TNR(10.71);RFWD2(4.72)
chr2 11718591 15732002 4.01 26 100 GREB1(1.6);NTSR2(0.3);LPIN1(3.73);TRIB2(0.64);FAM84A(0.45);AC011897.1(0.39);NBAS(9.83);DDX1(0.02)
chr2 39262068 41384390 2.12 26 92.31 SOS1(4.21);CDKL4(2.54);MAP4K3(8.86);TMEM178A(2.5);THUMPD2(2.04);SLC8A1(24.21)
chr2 59614572 62047433 2.43 33 93.94 BCL11A(4.21);PAPOLG(1.88);REL(2.06);PUS10(3.21);PEX13(1.43);KIAA1841(4.07);C2orf74(0.81);AHSA2(0.57);USP34(11.64);XPO1(2.5)
chr2 80809027 85143406 4.33 27 92.59 CTNNA2(1.54);SUCLG1(0.84);DNAH6(6.99);TRABD2A(1.97);TMSB10(0.02)
chr2 143787161 159672168 15.89 163 93.87 KYNU(0.08);ARHGAP15(4.26);GTDC1(2.48);ZEB2(0.88);ACVR2A(0.54);ORC4(0.57);MBD5(3.13);EPC2(0.9);KIF5C(1.58);LYPD6B(1.12);LYPD6(0.91);MMADHC(0.11);RND3(0.45);RBM43(0.09);NMI(0.12);TNFAIP6(0.14);RIF1(0.62);NEB(1.57);ARL5A(0.25);CACNB4(1.68);STAM2(0.37);FMNL2(1.98);PRPF40A(0.42);ARL6IP6(0.27);RPRM(0.01);GALNT13(3.66);KCNJ3(1.01);NR4A2(0.11);GPD2(1.12);GALNT5(0.36);ERMN(0.06);CYTIP(0.47);ACVR1C(0.64);ACVR1(0.88);UPP2(1.63);CCDC148(1.8);PKP4(1.42);DAPL1(0.13)
输出样式2
chr start end Size(Mb) Nb_variants Percentage_homozygosity gene_name gene_coverage
chr1 37356469 39797055 2.44 57 89.47 GRIK3 5.87
chr1 37356469 39797055 2.44 57 89.47 ZC3H12A 0.4
chr1 37356469 39797055 2.44 57 89.47 MEAF6 0.91
chr1 37356469 39797055 2.44 57 89.47 SNIP1 0.81
chr1 37356469 39797055 2.44 57 89.47 DNALI1 0.41
chr1 37356469 39797055 2.44 57 89.47 GNL2 1.19
chr1 37356469 39797055 2.44 57 89.47 RSPO1 0.97
chr1 37356469 39797055 2.44 57 89.47 C1orf109 0.44
chr1 37356469 39797055 2.44 57 89.47 CDCA8 0.71
chr1 37356469 39797055 2.44 57 89.47 EPHA10 2.1
chr1 37356469 39797055 2.44 57 89.47 MANEAL 0.3
chr1 37356469 39797055 2.44 57 89.47 YRDC 0.21
3. 提取gff3文件的python代码
# 导入包
import pandas as pd
import numpy as np
import pysam
# 读取gff3文件
gencode = pd.read_table("/mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3", comment="#",
sep = "\t", names = ['seqname', 'source', 'feature', 'start' , 'end', 'score', 'strand', 'frame', 'attribute'])
gencode.head()
提取gff3文件中所需要的信息
gencode_genes = gencode[(gencode.feature == "gene")][['seqname', 'start', 'end', 'attribute']].copy().reset_index().drop('index', axis=1) # Extract genes
gencode_genes.head()
提取基因名,基因类型,基因状态,基因水平(1verified loci,2manually annotated loci,3automatically annotated loci)
def gene_info(x):
# Extract gene names
g_name = list(filter(lambda x: 'gene_name' in x, x.split(";")))[0].split("=")[1]
g_type = list(filter(lambda x: 'gene_type' in x, x.split(";")))[0].split("=")[1]
g_status = list(filter(lambda x: 'gene_status' in x, x.split(";")))[0].split("=")[1]
g_leve = int(list(filter(lambda x: 'level' in x, x.split(";")))[0].split("=")[1])
return (g_name, g_type, g_status, g_leve)
gencode_genes["gene_name"], gencode_genes["gene_type"], gencode_genes["gene_status"], gencode_genes["gene_level"] = zip(*gencode_genes.attribute.apply(lambda x: gene_info(x)))
gencode_genes.head()
查看基因类型分布
gencode_genes['gene_type'].drop_duplicates()
提取所有已知的蛋白编码基因
gencode_genes = gencode_genes[gencode_genes['gene_status'] == 'KNOWN'].reset_index().drop('index', axis=1)
gencode_genes = gencode_genes[gencode_genes['gene_type'] == 'protein_coding'].reset_index().drop('index', axis=1)
gencode_genes.info()
# 移除重复的记录并保存
## Sort gene_leve in each chromosome (ascending oder) and remove duplicates
gencode_genes = gencode_genes.sort_values(['gene_level', 'seqname'], ascending=True).drop_duplicates('gene_name', keep='first').reset_index().drop('index', axis=1)
gencode_genes.to_csv('/mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt', index=False, header = False, sep="\t")
gencode_genes.info()
压缩
%%bash -s /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt
cut -f 1,2,3,5 $1 | sortBed -i > /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed
bgzip /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed
tabix -p bed /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed.gz
ls -l
4. 注释染色体位置包含的基因名
定义重叠位置计算函数
def overlap(q_st, q_end, res_st, res_end):
o = min(q_end, res_end)-max(q_st, res_st)
return o
定义输入与输入文件目录
import os
input_path = "/mnt/f/projects/gene/patient1/"
input_file_name = "patient1.HomRegions.tsv"
input_file = os.path.join(input_path,input_file_name)
# 读取输入文件
### Read the sample file in to a pandas dataframe
df = pd.read_table(input_file,
names=['chr', 'start', 'end', 'Size(Mb)', 'Nb_variants', 'Percentage_homozygosity'], skiprows=1)
df.head()
计算重叠基因及重叠部位占染色体位置的百分比
def gencode_all_known_genes(a, tb):
genes = []
try:
for region in tb.fetch(a['chr'], int(a['start']), int(a['end'])):
if region:
r = region.split('\t')
overlap_len = overlap(int(a['start']), int(a['end']), int(r[1]), int(r[2]))
ret_val = '{}({})'.format(r[3], np.round(overlap_len/float(int(a['end'])-int(a['start']))*100, 2)) ### Percentage of the input interval that overlap with the gene
genes.append(ret_val)
if len(genes)>0:
return ";".join(genes)
else:
return "NA(0)"
except ValueError:
return "NA(0)"
import pysam
gencode_v19 = pysam.TabixFile('/mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed.gz')
df['genes'] = df.apply(lambda x: gencode_all_known_genes(x[['chr', 'start', 'end']], gencode_v19), axis=1)
df.head()
输出样式1
df.to_csv(os.path.join(input_path,'out_df.tsv'), index=False, header = True, sep="\t")
转为一个基因一行的格式
# 移除无基因的染色体位置
## Remove all the intervals that do not overlap with genes
df2 = df[df['genes'] != "NA(0)"].reset_index(drop=True)
# 每个基因一行
new_rows = []
for i,r in df2.iterrows():
g_list = r['genes'].split(";")
for g in g_list:
g = g.replace(" ","")
new_rows.append(np.append(r[['chr', 'start', 'end', 'Size(Mb)', 'Nb_variants', 'Percentage_homozygosity', 'genes']].values, g))
df_perGene = pd.DataFrame()
df_perGene = df_perGene.append(pd.DataFrame(new_rows, columns=['chr', 'start', 'end', 'Size(Mb)', 'Nb_variants', 'Percentage_homozygosity', 'genes', 'gene_ID'])).reset_index().drop('index', axis=1)
df_perGene['gene_name'] = df_perGene['gene_ID'].apply(lambda x: x.split("(")[0])
df_perGene['gene_coverage'] = df_perGene['gene_ID'].apply(lambda x: x.split("(")[1].replace(")", ""))
## 移除之前的重复列 drop the genes column
df_perGene = df_perGene.drop(["genes", "gene_ID"], axis=1)
df_perGene.head()
## 输出
df_perGene.to_csv(os.path.join(input_path,'out_df_perGene.tsv'), index=False, header = True, sep="\t")