Homo_sapiens.GRCh38.99.gtf 是人类参考基因组的注释文件,从 Ensembl 下载。
# Tabulate how many annotation records in the GTF file carry each
# transcript_biotype value.
library(tidyr)
library(dplyr)
library(stringr)

options(stringsAsFactors = FALSE)

# The first 5 lines of the file are skipped; the remaining rows are read as
# tab-separated fields without a header, so the columns are named V1, V2, ...
qe <- data.table::fread(
  "Homo_sapiens.GRCh38.99.gtf",
  sep = "\t",
  header = FALSE,
  fill = TRUE,
  skip = 5
)

# Column 9 holds the ";"-separated attribute string of each record.
x <- qe$V9

# Quick look at how many ";" separators the first records contain.
# head() avoids the NA padding that x[1:100] produces on short input.
table(str_count(head(x, 100), ";"))

# Extract the quoted value that follows `transcript_biotype` directly with a
# capture group, instead of splitting every attribute string into a dense
# character matrix first. Records without the attribute give NA and are
# dropped, matching the original str_subset() filtering.
x3 <- str_match(x, "transcript_biotype \"([^\"]*)\"")[, 2]
x3 <- x3[!is.na(x3)]
head(x3)

# Number of records per transcript biotype.
table(x3)
IG_C_gene
266
IG_C_pseudogene
24
IG_D_gene
115
IG_J_gene
58
IG_J_pseudogene
6
IG_pseudogene
2
IG_V_gene
985
IG_V_pseudogene
476
lncRNA
220139
miRNA
3758
misc_RNA
4470
Mt_rRNA
4
Mt_tRNA
44
nonsense_mediated_decay
366812
non_stop_decay
1464
polymorphic_pseudogene
1207
processed_pseudogene
21963
processed_transcript
156564
protein_coding
1881424
pseudogene
144
retained_intron
153856
ribozyme
16
rRNA
116
rRNA_pseudogene
998
scaRNA
98
scRNA
2
snoRNA
1908
snRNA
3820
sRNA
10
TEC
2317
transcribed_processed_pseudogene
1162
transcribed_unitary_pseudogene
1406
transcribed_unprocessed_pseudogene
6659
translated_processed_pseudogene
4
translated_unprocessed_pseudogene
8
TR_C_gene
64
TR_D_gene
12
TR_J_gene
237
TR_J_pseudogene
8
TR_V_gene
717
TR_V_pseudogene
90
unitary_pseudogene
376
unprocessed_pseudogene
10567
vaultRNA
2