R/annotatER.R
annotatERs.Rd
Finds the overlap between junctions and ERs, then adds gene info
and junction info as metadata columns. Then, uses a gtf
file
or a Txdb
passed in to generate a genomic state used to label
each ER as to whether they are exonic, intronic, intergenic or none.
annotatERs(opt_ers, junc_data, genom_state, gtf, txdb)
opt_ers | optimally defined ERs (the product of the ODER function) |
---|---|
junc_data | junction data that should match the ERs passed into opt_ers |
genom_state | a genomic state object |
gtf | gtf in a GRanges object, pre-imported using
|
txdb | TxDb-class (txdb object) to create genomic state. This is used to annotate the expressed regions as exonic, intronic or intergenic. |
annotated ERs
gtf_url <- paste0(
"http://ftp.ensembl.org/pub/release-103/gtf/",
"homo_sapiens/Homo_sapiens.GRCh38.103.chr.gtf.gz"
)
# file_cache is an internal function to download a bigwig file from a link
# if the file has been downloaded recently, it will be retrieved from a cache
gtf_path <- file_cache(gtf_url)
gtf_gr <- rtracklayer::import(gtf_path)
ex_opt_ers <- GenomicRanges::GRanges(
seqnames = S4Vectors::Rle(c("chr21"), c(5)),
ranges = IRanges::IRanges(
start = c(5032176, 5033408, 5034717, 5035188, 5036577),
end = c(5032217, 5033425, 5034756, 5035189, 5036581)
)
)
junctions <- SummarizedExperiment::rowRanges(dasper::junctions_example)
chrs_to_keep <- c("21", "22")
#### preparing the txdb and genomstate object(s)
hg38_chrominfo <- GenomeInfoDb::getChromInfoFromUCSC("hg38")
new_info <- hg38_chrominfo$size[match(
chrs_to_keep,
GenomeInfoDb::mapSeqlevels(hg38_chrominfo$chrom, "Ensembl")
)]
names(new_info) <- chrs_to_keep
gtf_gr_tx <- GenomeInfoDb::keepSeqlevels(gtf_gr,
chrs_to_keep,
pruning.mode = "tidy"
)
GenomeInfoDb::seqlengths(gtf_gr_tx) <- new_info
GenomeInfoDb::seqlevelsStyle(gtf_gr_tx) <- "UCSC"
rtracklayer::genome(gtf_gr_tx) <- "hg38"
ucsc_txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf_gr_tx)
#> Warning: The "phase" metadata column contains non-NA values for features of type
#> stop_codon. This information was ignored.
genom_state <- derfinder::makeGenomicState(txdb = ucsc_txdb)
#> extendedMapSeqlevels: sequence names mapped from NCBI to UCSC for species homo_sapiens
#> 'select()' returned 1:1 mapping between keys and columns
ens_txdb <- ucsc_txdb
GenomeInfoDb::seqlevelsStyle(ens_txdb) <- "Ensembl"
annot_ers1 <- annotatERs(
opt_ers = ex_opt_ers, junc_data = junctions,
gtf = gtf_gr, txdb = ens_txdb, genom_state = genom_state
)
#> [1] "2021-10-08 16:10:02 - Obtaining co-ordinates of annotated exons and junctions..."
#> [1] "2021-10-08 16:10:03 - Getting junction annotation using overlapping exons..."
#> [1] "2021-10-08 16:10:04 - Tidying junction annotation..."
#> [1] "2021-10-08 16:10:05 - Deriving junction categories..."
#> [1] "2021-10-08 16:10:10 - done!"
#> 2021-10-08 16:10:10 - Finding junctions overlapping ers...
#> 2021-10-08 16:10:10 - Annotating the Expressed regions...
#> 2021-10-08 16:10:10 annotateRegions: counting
#> 2021-10-08 16:10:11 annotateRegions: annotating
#> 2021-10-08 16:10:12 - done!
annot_ers1
#> GRanges object with 5 ranges and 5 metadata columns:
#> seqnames ranges strand | grl genes
#> <Rle> <IRanges> <Rle> | <GRangesList> <character>
#> [1] chr21 5032176-5032217 * | chr21:5026423-5323718:+ ENSG00000277117
#> [2] chr21 5033408-5033425 * | chr21:5026423-5323718:+ ENSG00000277117
#> [3] chr21 5034717-5034756 * | chr21:5026423-5323718:+ ENSG00000277117
#> [4] chr21 5035188-5035189 * | chr21:5026423-5323718:+ ENSG00000277117
#> [5] chr21 5036577-5036581 * | chr21:5026423-5323718:+ ENSG00000277117
#> annotation og_index gene_source
#> <character> <integer> <character>
#> [1] exon 1 nearest gtf genes
#> [2] exon 2 nearest gtf genes
#> [3] exon 3 nearest gtf genes
#> [4] exon 4 nearest gtf genes
#> [5] exon 5 nearest gtf genes
#> -------
#> seqinfo: 1 sequence from an unspecified genome; no seqlengths