Downloads a well-defined set of exons to be used in obtaining the optimum set of expressed regions (ERs). These exons are used in calculating the exon deltas.
Calculates the median exon delta and the number of ERs with an exon delta of 0 by comparing each combination of MCC and MRG with the optimum exons from the Ensembl database.
Uses a delta-calculating function and a well-defined set of exons to find which combination of MCC and MRG gives the best definition of the expressed regions (ERs).
get_exons(gtf, ucsc_chr, ignore.strand = TRUE, biotype = "Non-overlapping")
get_ers_delta(ers, opt_exons, delta_fun = NULL)
get_opt_ers(ers, ers_delta)
gtf | Either a string containing the path to a .gtf file or a pre-imported
gtf using `rtracklayer::import()`. |
---|---|
ucsc_chr | logical scalar, determining whether to add "chr" prefix to the seqnames of non-overlapping exons and change "chrMT" -> "chrM". Note, if set to TRUE and seqnames already have "chr", it will not add another. |
ignore.strand | logical value for input into
`GenomicRanges::findOverlaps()`; default is TRUE. |
biotype | Filters the GTF file passed in to what would be considered the "Gold Standard" exons. The default is "Non-overlapping" but the options are: "Non-overlapping" (exons that don't intersect each other), "Three Prime" (3' UTR), "Five Prime" (5' UTR), "Internal" (Internal coding), "lncRNA" (Long Non-Coding RNA), "ncRNA" (Non-Coding RNA) and "Pseudogene" |
ers | Sets of ERs across various MCCs/MRGs - output of
`get_ers()`. |
opt_exons | GRanges object that contains the regions that, ideally, you
want the ER definitions to match - output of `get_exons()`. |
delta_fun | Function that calculates the delta between ERs and
`opt_exons`. Default is NULL. |
ers_delta | tibble/dataframe containing summarised delta values. One row per set of ERs. |
GRanges object containing non-overlapping exons.
tibble/dataframe containing summarised delta values. One row per set of ERs.
list containing the optimised ERs (`opt_ers`), the optimal pair of MCC/MRG (`opt_mcc_mrg`) and
the tibble of summarised delta values (`deltas`)
get_exons
: Filter for the exons to calculate the deltas
against
get_ers_delta
: Method to get ers delta to help determine the
optimum ers
gtf_url <- paste0(
"http://ftp.ensembl.org/pub/release-103/gtf/",
"homo_sapiens/Homo_sapiens.GRCh38.103.chr.gtf.gz"
)
gtf_path <- file_cache(gtf_url)
gtf_gr <- rtracklayer::import(gtf_path)
eg_opt_exons <- get_exons(
gtf = gtf_gr,
ucsc_chr = TRUE,
ignore.strand = TRUE
)
#> 2021-10-08 16:11:07 - Obtaining non-overlapping exons
eg_opt_exons
#> GRanges object with 160103 ranges and 22 metadata columns:
#> seqnames ranges strand | source type score phase
#> <Rle> <IRanges> <Rle> | <factor> <factor> <numeric> <integer>
#> [1] chr1 12975-13052 + | havana exon NA <NA>
#> [2] chr1 24738-24891 - | havana exon NA <NA>
#> [3] chr1 18268-18366 - | havana exon NA <NA>
#> [4] chr1 17915-18061 - | havana exon NA <NA>
#> [5] chr1 17606-17742 - | havana exon NA <NA>
#> ... ... ... ... . ... ... ... ...
#> [160099] chrM 14149-14673 - | insdc exon NA <NA>
#> [160100] chrM 14674-14742 - | insdc exon NA <NA>
#> [160101] chrM 14747-15887 + | insdc exon NA <NA>
#> [160102] chrM 15888-15953 + | insdc exon NA <NA>
#> [160103] chrM 15956-16023 - | insdc exon NA <NA>
#> gene_id gene_version gene_name gene_source
#> <character> <character> <character> <character>
#> [1] ENSG00000223972 5 DDX11L1 havana
#> [2] ENSG00000227232 5 WASH7P havana
#> [3] ENSG00000227232 5 WASH7P havana
#> [4] ENSG00000227232 5 WASH7P havana
#> [5] ENSG00000227232 5 WASH7P havana
#> ... ... ... ... ...
#> [160099] ENSG00000198695 2 MT-ND6 insdc
#> [160100] ENSG00000210194 1 MT-TE insdc
#> [160101] ENSG00000198727 2 MT-CYB insdc
#> [160102] ENSG00000210195 2 MT-TT insdc
#> [160103] ENSG00000210196 2 MT-TP insdc
#> gene_biotype transcript_id transcript_version
#> <character> <character> <character>
#> [1] transcribed_unproces.. ENST00000450305 2
#> [2] unprocessed_pseudogene ENST00000488147 1
#> [3] unprocessed_pseudogene ENST00000488147 1
#> [4] unprocessed_pseudogene ENST00000488147 1
#> [5] unprocessed_pseudogene ENST00000488147 1
#> ... ... ... ...
#> [160099] protein_coding ENST00000361681 2
#> [160100] Mt_tRNA ENST00000387459 1
#> [160101] protein_coding ENST00000361789 2
#> [160102] Mt_tRNA ENST00000387460 2
#> [160103] Mt_tRNA ENST00000387461 2
#> transcript_name transcript_source transcript_biotype tag
#> <character> <character> <character> <character>
#> [1] DDX11L1-201 havana transcribed_unproces.. basic
#> [2] WASH7P-201 havana unprocessed_pseudogene basic
#> [3] WASH7P-201 havana unprocessed_pseudogene basic
#> [4] WASH7P-201 havana unprocessed_pseudogene basic
#> [5] WASH7P-201 havana unprocessed_pseudogene basic
#> ... ... ... ... ...
#> [160099] MT-ND6-201 insdc protein_coding basic
#> [160100] MT-TE-201 insdc Mt_tRNA basic
#> [160101] MT-CYB-201 insdc protein_coding basic
#> [160102] MT-TT-201 insdc Mt_tRNA basic
#> [160103] MT-TP-201 insdc Mt_tRNA basic
#> transcript_support_level exon_number exon_id exon_version
#> <character> <character> <character> <character>
#> [1] NA 4 ENSE00001799933 2
#> [2] NA 2 ENSE00003507205 1
#> [3] NA 3 ENSE00003477500 1
#> [4] NA 4 ENSE00003565697 1
#> [5] NA 5 ENSE00003475637 1
#> ... ... ... ... ...
#> [160099] NA 1 ENSE00001434974 2
#> [160100] NA 1 ENSE00001544476 1
#> [160101] NA 1 ENSE00001436074 2
#> [160102] NA 1 ENSE00001544475 2
#> [160103] NA 1 ENSE00001544473 2
#> protein_id protein_version ccds_id
#> <character> <character> <character>
#> [1] <NA> <NA> <NA>
#> [2] <NA> <NA> <NA>
#> [3] <NA> <NA> <NA>
#> [4] <NA> <NA> <NA>
#> [5] <NA> <NA> <NA>
#> ... ... ... ...
#> [160099] <NA> <NA> <NA>
#> [160100] <NA> <NA> <NA>
#> [160101] <NA> <NA> <NA>
#> [160102] <NA> <NA> <NA>
#> [160103] <NA> <NA> <NA>
#> -------
#> seqinfo: 25 sequences from an unspecified genome; no seqlengths
data(gtex_SRP012682_SRX222703_lung_ers_1, package = "ODER")
eg_ers_delta <- get_ers_delta(
ers = gtex_SRP012682_SRX222703_lung_ers_1,
opt_exons = eg_opt_exons
)
#> 2021-10-08 16:11:09 - Calculating delta for ERs...
eg_ers_delta
#> # A tibble: 4 × 7
#> mcc mrg sum mean median n_eq_0 propor_eq_0
#> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl>
#> 1 5 10 2187300 964. 150. 322 0.142
#> 2 5 20 1892386 898. 141 335 0.159
#> 3 10 10 1771837 1009. 139 316 0.180
#> 4 10 20 1457726 911. 118 323 0.202
data(gtex_SRP012682_SRX222703_lung_ers_1, package = "ODER")
opt_ers <- get_opt_ers(
ers = gtex_SRP012682_SRX222703_lung_ers_1,
ers_delta = eg_ers_delta
)
#> 2021-10-08 16:11:10 - Obtaining optimal set of ERs...
opt_ers
#> $opt_ers
#> GRanges object with 15977 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> [1] chr21 5032176-5032217 *
#> [2] chr21 5033408-5033425 *
#> [3] chr21 5034717-5034756 *
#> [4] chr21 5035188-5035189 *
#> [5] chr21 5036577-5036581 *
#> ... ... ... ...
#> [15973] chr22 50798996-50799149 *
#> [15974] chr22 50799209-50799284 *
#> [15975] chr22 50799669-50799688 *
#> [15976] chr22 50799717-50799744 *
#> [15977] chr22 50800460-50800587 *
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
#> $opt_mcc_mrg
#> [1] "mcc_10" "mrg_20"
#>
#> $deltas
#> # A tibble: 4 × 7
#> mcc mrg sum mean median n_eq_0 propor_eq_0
#> <dbl> <dbl> <int> <dbl> <dbl> <int> <dbl>
#> 1 5 10 2187300 964. 150. 322 0.142
#> 2 5 20 1892386 898. 141 335 0.159
#> 3 10 10 1771837 1009. 139 316 0.180
#> 4 10 20 1457726 911. 118 323 0.202
#>