不同的方法从gtf中提取相关基因组信息in R

Method One:

library(GenomicRanges)

library(GenomicFeatures)
library(annotatr)
makeTxDbFromGFF
txdb <- annotatr::makeTxDbFromGFF(gff_file, format="gtf")

GRanges(txdb)

ebg <- transcriptsBy(txdb, by="seqlevels")

anno_info <- transcriptsBy(txdb, by=c("gene", "exon", "cds",'three_prime_utr','five_prime_utr'), use.names=FALSE)
####abstracting information about mouse gene names

library(org.Mm.eg.db)

x = get(sprintf('org.Mm.egSYMBOL', 'Mm10'))
mapped_genes = mappedkeys(x)
eg2symbol = as.data.frame(x[mapped_genes])
eg2symbol$ensemble <- mapIds(org.Mm.eg.db,keys = eg2symbol$gene_id,keytype = "ENTREZID",column = "ENSEMBL")

print(eg2symbol)

Method Two:

source("https://bioconductor.org/biocLite.R")

biocLite('GenomicFeatures')

library('GenomicFeatures')

biocLite("GenomicRanges")

library('GenomicRanges')

library(rtracklayer)

library(GenomicAlignments)

library("AnnotationDbi")

library("GenomicFeatures")

test <- import(gtfFile, format = "gtf")

#Abstract the first rank exon

exon_1=subset(mcols(test),mcols(test)$exon_number=="1")

#Abstract reads from bam files

reads <- readGAlignments('./test.bam')

reads <- as(reads, "GRanges") #change the class from dataframe to GRanges

txdb = makeTxDbFromGFF('./Homo_sapiens.GRCh38.87.gtf',format = "gtf",circ_seqs = character())

#Abstract sites information

genes <- genes(txdb)

TS=transcripts(txdb)

EX=exons(txdb)

EX_hits<- findOverlaps(reads, EX)

hits <- findOverlaps(reads, genes)

Method Three：

library(plyr)

pfgtf <- file.path("./Homo_sapiens.GRCh38.87.gtf")

gtf <- read.table(pfgtf, head = F, sep = " ", stringsAsFactors = F)

test <- data.frame(do.call(rbind, strsplit(gtf$V9, split = "[; ]")), stringsAsFactors=F)

valueFind <- function(x, type = "gene_id"){

for(i in 1:length(x)){

if(sum(x == type) == 0){

return(NA)

}else{

return(x[which(x == type)[1]+1])

}

gtf_infor <- data.frame(gene_id = as.vector(apply(test,1,valueFind)),

transcript_id = as.vector(apply(test,1,function(x){valueFind(x,type = "transcript_id")})),

gene_name = as.vector(apply(test,1,function(x){valueFind(x,type = "gene_name")})),

gene_biotype = as.vector(apply(test,1,function(x){valueFind(x,type = "gene_biotype")})),

exon_id = as.vector(apply(test,1,function(x){valueFind(x,type = "exon_id")})),

exon_number = as.vector(apply(test,1,function(x){valueFind(x,type = "exon_number")})), stringsAsFactors=F)

gtf_info <- cbind(gtf[,c(1,3,4,5,7)], gtf_infor)

gtf_exon_info <- gtf_info[gtf_info$V3 == "exon", ]

colnames(gtf_exon_info)[1:5] <- c("chr", "type", "start", "end", "strand")

gtf_exon_info$exon_number <- as.numeric(gtf_exon_info$exon_number)

gtf_exon_info$exon_loci <- paste(gtf_exon_info$chr,":", gtf_exon_info$start, "-", gtf_exon_info$end, "_", gtf_exon_info$strand, sep = "")

data_split <- dlply(gtf_exon_info, .variables = "gene_id")

first_exon <- lapply(data_split, function(x){

x[x$exon_number == 1, ]

})

biocLit('IRanges')

library('IRanges')

####Compile first exons into IRanges object

IRanges_first_exon = lapply(first_exon,function(x) IRanges(x[,3],x[,4]))

first_exon_overlap = lapply(IRanges_first_exon, function(x) findOverlaps(IRanges_reads, x))

####统计每个基因上第一个外显子的位置可以与我找出来reads，相互overlap到的数目

overlap_summary = lapply(first_exon_overlap,function(x) summary(x)[1])

###找出有map上reads的外显子个数，以及上面的reads数

exist_overlap_reads=unlist(overlap_summary)[unlist(overlap_summary)!=‘0’]

length(exist_overlap_reads)

every_gene_map_reads=as.numeric(exist_overlap_reads)

Method Four:

rtracklayer::readGFF(gtfFile)

importGTF(file , skip = 5 , nrow = -1, use.data.table = TRUE , level = 'gene' , features = NULL , print.feature = FALSE)