Obtain set of non-overlapping exons

Downloads a well-defined set of exons to be used in obtaining the optimum set of expressed regions (ERs). These exons are used in calculating the exon deltas.

Calculates the median exon delta and the number of ERs with an exon delta of 0 by comparing each combination of MCC and MRG with the optimum exons from the Ensembl database.

Uses a delta-calculating function and a well-defined set of exons to find which combination of MCC and MRG gives the best definition of the expressed regions.

get_exons(gtf, ucsc_chr, ignore.strand = TRUE, biotype = "Non-overlapping")

get_ers_delta(ers, opt_exons, delta_fun = NULL)

get_opt_ers(ers, ers_delta)

Arguments

gtf	Either a string containing the path to a .gtf file or a pre-imported gtf using `rtracklayer::import`.
ucsc_chr	logical scalar, determining whether to add "chr" prefix to the seqnames of non-overlapping exons and change "chrMT" -> "chrM". Note, if set to TRUE and seqnames already have "chr", it will not add another.
ignore.strand	logical value for input into `findOverlaps`, default is `TRUE`.
biotype	Filters the GTF file passed in to what would be considered the "Gold Standard" exons. The default is "Non-overlapping" but the options are: "Non-overlapping" (exons that don't intersect each other), "Three Prime" (3' UTR), "Five Prime" (5' UTR), "Internal" (Internal coding), "lncRNA" (Long Non-Coding RNA), "ncRNA" (Non-Coding RNA) and "Pseudogene".
ers	Sets of ERs across various MCCs/MRGs - output of `get_ers`.
opt_exons	GRanges object that contains the regions that, ideally, you want the ER definitions to match - output of `get_exons`.
delta_fun	Function that calculates the delta between ERs and `opt_exons`. Takes as input a set of ERs from `ers` and `opt_exons`. Then outputs a tibble/dataframe containing the summarised delta scores for that one set of ERs.
ers_delta	tibble/dataframe containing summarised delta values. One row per set of ERs.

Value

GRanges object containing non-overlapping exons.

tibble/dataframe containing summarised delta values. One row per set of ERs.

list containing the optimised ERs (`opt_ers`), the optimal pair of MCC/MRG (`opt_mcc_mrg`) and the summarised delta values (`deltas`).

Functions

get_exons: Filter for the exons to calculate the deltas against
get_ers_delta: Method to get the delta for each set of ERs, to help determine the optimum ERs

Examples


# URL of the Ensembl release 103 GTF annotation for human (GRCh38).
gtf_url <- paste0(
    "http://ftp.ensembl.org/pub/release-103/gtf/",
    "homo_sapiens/Homo_sapiens.GRCh38.103.chr.gtf.gz"
)
# NOTE(review): file_cache() presumably downloads/caches the file and returns
# a local path, since its result is passed to rtracklayer::import() - confirm.
gtf_path <- file_cache(gtf_url)

# Pre-import the GTF so that it can be passed to get_exons() directly.
gtf_gr <- rtracklayer::import(gtf_path)

# Obtain the exons to compare ERs against. `biotype` is left at its default
# ("Non-overlapping"); `ucsc_chr = TRUE` adds the "chr" prefix to the seqnames.
eg_opt_exons <- get_exons(
    gtf = gtf_gr,
    ucsc_chr = TRUE,
    ignore.strand = TRUE
)
#> 2021-10-08 16:11:07 - Obtaining non-overlapping exons


# Print the resulting GRanges object.
eg_opt_exons
#> GRanges object with 160103 ranges and 22 metadata columns:
#>            seqnames      ranges strand |   source     type     score     phase
#>               <Rle>   <IRanges>  <Rle> | <factor> <factor> <numeric> <integer>
#>        [1]     chr1 12975-13052      + |   havana     exon        NA      <NA>
#>        [2]     chr1 24738-24891      - |   havana     exon        NA      <NA>
#>        [3]     chr1 18268-18366      - |   havana     exon        NA      <NA>
#>        [4]     chr1 17915-18061      - |   havana     exon        NA      <NA>
#>        [5]     chr1 17606-17742      - |   havana     exon        NA      <NA>
#>        ...      ...         ...    ... .      ...      ...       ...       ...
#>   [160099]     chrM 14149-14673      - |    insdc     exon        NA      <NA>
#>   [160100]     chrM 14674-14742      - |    insdc     exon        NA      <NA>
#>   [160101]     chrM 14747-15887      + |    insdc     exon        NA      <NA>
#>   [160102]     chrM 15888-15953      + |    insdc     exon        NA      <NA>
#>   [160103]     chrM 15956-16023      - |    insdc     exon        NA      <NA>
#>                    gene_id gene_version   gene_name gene_source
#>                <character>  <character> <character> <character>
#>        [1] ENSG00000223972            5     DDX11L1      havana
#>        [2] ENSG00000227232            5      WASH7P      havana
#>        [3] ENSG00000227232            5      WASH7P      havana
#>        [4] ENSG00000227232            5      WASH7P      havana
#>        [5] ENSG00000227232            5      WASH7P      havana
#>        ...             ...          ...         ...         ...
#>   [160099] ENSG00000198695            2      MT-ND6       insdc
#>   [160100] ENSG00000210194            1       MT-TE       insdc
#>   [160101] ENSG00000198727            2      MT-CYB       insdc
#>   [160102] ENSG00000210195            2       MT-TT       insdc
#>   [160103] ENSG00000210196            2       MT-TP       insdc
#>                      gene_biotype   transcript_id transcript_version
#>                       <character>     <character>        <character>
#>        [1] transcribed_unproces.. ENST00000450305                  2
#>        [2] unprocessed_pseudogene ENST00000488147                  1
#>        [3] unprocessed_pseudogene ENST00000488147                  1
#>        [4] unprocessed_pseudogene ENST00000488147                  1
#>        [5] unprocessed_pseudogene ENST00000488147                  1
#>        ...                    ...             ...                ...
#>   [160099]         protein_coding ENST00000361681                  2
#>   [160100]                Mt_tRNA ENST00000387459                  1
#>   [160101]         protein_coding ENST00000361789                  2
#>   [160102]                Mt_tRNA ENST00000387460                  2
#>   [160103]                Mt_tRNA ENST00000387461                  2
#>            transcript_name transcript_source     transcript_biotype         tag
#>                <character>       <character>            <character> <character>
#>        [1]     DDX11L1-201            havana transcribed_unproces..       basic
#>        [2]      WASH7P-201            havana unprocessed_pseudogene       basic
#>        [3]      WASH7P-201            havana unprocessed_pseudogene       basic
#>        [4]      WASH7P-201            havana unprocessed_pseudogene       basic
#>        [5]      WASH7P-201            havana unprocessed_pseudogene       basic
#>        ...             ...               ...                    ...         ...
#>   [160099]      MT-ND6-201             insdc         protein_coding       basic
#>   [160100]       MT-TE-201             insdc                Mt_tRNA       basic
#>   [160101]      MT-CYB-201             insdc         protein_coding       basic
#>   [160102]       MT-TT-201             insdc                Mt_tRNA       basic
#>   [160103]       MT-TP-201             insdc                Mt_tRNA       basic
#>            transcript_support_level exon_number         exon_id exon_version
#>                         <character> <character>     <character>  <character>
#>        [1]                       NA           4 ENSE00001799933            2
#>        [2]                       NA           2 ENSE00003507205            1
#>        [3]                       NA           3 ENSE00003477500            1
#>        [4]                       NA           4 ENSE00003565697            1
#>        [5]                       NA           5 ENSE00003475637            1
#>        ...                      ...         ...             ...          ...
#>   [160099]                       NA           1 ENSE00001434974            2
#>   [160100]                       NA           1 ENSE00001544476            1
#>   [160101]                       NA           1 ENSE00001436074            2
#>   [160102]                       NA           1 ENSE00001544475            2
#>   [160103]                       NA           1 ENSE00001544473            2
#>             protein_id protein_version     ccds_id
#>            <character>     <character> <character>
#>        [1]        <NA>            <NA>        <NA>
#>        [2]        <NA>            <NA>        <NA>
#>        [3]        <NA>            <NA>        <NA>
#>        [4]        <NA>            <NA>        <NA>
#>        [5]        <NA>            <NA>        <NA>
#>        ...         ...             ...         ...
#>   [160099]        <NA>            <NA>        <NA>
#>   [160100]        <NA>            <NA>        <NA>
#>   [160101]        <NA>            <NA>        <NA>
#>   [160102]        <NA>            <NA>        <NA>
#>   [160103]        <NA>            <NA>        <NA>
#>   -------
#>   seqinfo: 25 sequences from an unspecified genome; no seqlengths
# Load the example ERs shipped with the ODER package; these are passed as
# `ers`, i.e. sets of ERs across various MCCs/MRGs.
data(gtex_SRP012682_SRX222703_lung_ers_1, package = "ODER")

# Summarise the delta between each set of ERs and the exons obtained above.
# `delta_fun` is left at its default (NULL).
eg_ers_delta <- get_ers_delta(
    ers = gtex_SRP012682_SRX222703_lung_ers_1,
    opt_exons = eg_opt_exons
)
#> 2021-10-08 16:11:09 - Calculating delta for ERs...

# Print the summary: one row per set of ERs (MCC/MRG combination).
eg_ers_delta
#> # A tibble: 4 × 7
#>     mcc   mrg     sum  mean median n_eq_0 propor_eq_0
#>   <dbl> <dbl>   <int> <dbl>  <dbl>  <int>       <dbl>
#> 1     5    10 2187300  964.   150.    322       0.142
#> 2     5    20 1892386  898.   141     335       0.159
#> 3    10    10 1771837 1009.   139     316       0.180
#> 4    10    20 1457726  911.   118     323       0.202
# Load the example ERs shipped with the ODER package (same dataset as used
# in the get_ers_delta() example).
data(gtex_SRP012682_SRX222703_lung_ers_1, package = "ODER")
# Select the optimal set of ERs using the summarised deltas computed by
# get_ers_delta() (`eg_ers_delta`).
opt_ers <- get_opt_ers(
    ers = gtex_SRP012682_SRX222703_lung_ers_1,
    ers_delta = eg_ers_delta
)
#> 2021-10-08 16:11:10 - Obtaining optimal set of ERs...
# Print the result: a list with elements `opt_ers`, `opt_mcc_mrg` and `deltas`.
opt_ers
#> $opt_ers
#> GRanges object with 15977 ranges and 0 metadata columns:
#>           seqnames            ranges strand
#>              <Rle>         <IRanges>  <Rle>
#>       [1]    chr21   5032176-5032217      *
#>       [2]    chr21   5033408-5033425      *
#>       [3]    chr21   5034717-5034756      *
#>       [4]    chr21   5035188-5035189      *
#>       [5]    chr21   5036577-5036581      *
#>       ...      ...               ...    ...
#>   [15973]    chr22 50798996-50799149      *
#>   [15974]    chr22 50799209-50799284      *
#>   [15975]    chr22 50799669-50799688      *
#>   [15976]    chr22 50799717-50799744      *
#>   [15977]    chr22 50800460-50800587      *
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $opt_mcc_mrg
#> [1] "mcc_10" "mrg_20"
#> 
#> $deltas
#> # A tibble: 4 × 7
#>     mcc   mrg     sum  mean median n_eq_0 propor_eq_0
#>   <dbl> <dbl>   <int> <dbl>  <dbl>  <int>       <dbl>
#> 1     5    10 2187300  964.   150.    322       0.142
#> 2     5    20 1892386  898.   141     335       0.159
#> 3    10    10 1771837 1009.   139     316       0.180
#> 4    10    20 1457726  911.   118     323       0.202
#>