Chapter 5 Worked examples with before/after tables

This chapter shows what the data look like, what the IDs look like, and how the table changes once mapped.

library(knitr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tibble)

kable_head <- function(x, n = 5, caption = NULL) {
  knitr::kable(utils::head(x, n), caption = caption)
}

strip_ensembl_version <- function(x) sub("\\..*$", "", x)

5.1 GSE233947

5.1.1 Raw data preview

source("./fetch_geo_supp.R")
fetch_geo_supp(gse = params$gse)
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
## Using locally cached version of supplementary file(s) GSE233947 found here:
## data/GSE233947/GSE233947_FeatureCounts_V31genes_RawCounts_ENSG.tsv.gz
## Using locally cached version of supplementary file(s) GSE233947 found here:
## data/GSE233947/GSE233947_modulize_3CTG_20CTG_junctions.tsv.gz
## Using locally cached version of supplementary file(s) GSE233947 found here:
## data/GSE233947/GSE233947_modulize_NT_20CTG_junctions.tsv.gz
## Using locally cached version of supplementary file(s) GSE233947 found here:
## data/GSE233947/GSE233947_modulize_NT_3CTG_junctions.tsv.gz
path <- file.path("data", params$gse)
files <- list.files(path, pattern = params$file_grep, 
                    full.names = TRUE, recursive = TRUE)

kable_head(tibble(file = basename(files)), min(10, length(files)), paste(params$gse,": extracted file list (first 10)"))
(#tab:get_data)GSE233947 : extracted file list (first 10)
file
GSE233947_FeatureCounts_V31genes_RawCounts_ENSG.tsv.gz
GSE233947_modulize_3CTG_20CTG_junctions.tsv.gz
GSE233947_modulize_NT_20CTG_junctions.tsv.gz
GSE233947_modulize_NT_3CTG_junctions.tsv.gz

Raw table preview

library(readr)

safe_read <- function(file) {
  # First attempt: read as TSV
  df <- tryCatch(
    readr::read_tsv(file, show_col_types = FALSE),
    error = function(e) NULL   # catch fatal errors
  )
  
  # If read_tsv failed entirely:
  if (is.null(df)) {
    message("TSV read failed — reading as space-delimited file instead.")
    return(readr::read_table(file, show_col_types = FALSE))
  }
  
  # If read_tsv returned but with parsing issues:
  probs <- problems(df)
  if (nrow(probs) > 0) {
    message("Parsing issues detected in TSV — reading as space-delimited file instead.")
    return(readr::read_table(file, show_col_types = FALSE))
  }
  
  # If everything was fine:
  return(df)
}

x <- safe_read(files[1])


kable_head(x[, 1:min(6, ncol(x))], 5, paste(params$gse,": raw table preview"))
(#tab:preview_raw)GSE233947 : raw table preview
Gene T8657_900CTG_NT T8658_1150CTG_NT T8659_1450CTG_NT T8660_900CTG_20CTG T8661_1150CTG_20CTG
ENSG00000108821 456397 486088 608151 2012962 379186
ENSG00000265150 170681 299425 286295 747000 210962
ENSG00000164692 169781 190854 263391 869194 180006
ENSG00000265735 78113 121697 113379 532435 112977
ENSG00000259001 55081 78811 73032 353442 100823

ID preview

id_col <- names(x)[1]
ids <- x[[1]] |> as.character()
kable_head(tibble(raw_id = head(ids, 10), 
                  stripped = strip_ensembl_version(head(ids, 10))), 
           10, paste(params$gse,": ID preview"))
(#tab:id_preview)GSE233947 : ID preview
raw_id stripped
ENSG00000108821 ENSG00000108821
ENSG00000265150 ENSG00000265150
ENSG00000164692 ENSG00000164692
ENSG00000265735 ENSG00000265735
ENSG00000259001 ENSG00000259001
ENSG00000163359 ENSG00000163359
ENSG00000115414 ENSG00000115414
ENSG00000168542 ENSG00000168542
ENSG00000251562 ENSG00000251562
ENSG00000155657 ENSG00000155657

5.1.2 After mapping (Ensembl → HGNC)

if (any(grepl("^ENSG", strip_ensembl_version(ids)))) {
  library(biomaRt)
  ensembl_ids <- unique(strip_ensembl_version(ids))
  ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

  map <- getBM(
    attributes = c("ensembl_gene_id", "hgnc_symbol"),
    filters = "ensembl_gene_id",
    values = ensembl_ids,
    mart = ensembl
  )

  kable_head(map, 10, paste(params$gse,": mapping preview"))

  expr_mapped <- x %>%
    mutate(ensembl_gene_id = strip_ensembl_version(.data[[id_col]])) %>%
    left_join(map, by = "ensembl_gene_id") %>%
  dplyr::select( ensembl_gene_id,hgnc_symbol, everything())


  kable_head(expr_mapped[, 1:min(8, ncol(expr_mapped))], 5, paste(params$gse,": mapped table preview"))
}
(#tab:after_map)GSE233947 : mapped table preview
ensembl_gene_id hgnc_symbol Gene T8657_900CTG_NT T8658_1150CTG_NT T8659_1450CTG_NT T8660_900CTG_20CTG T8661_1150CTG_20CTG
ENSG00000108821 COL1A1 ENSG00000108821 456397 486088 608151 2012962 379186
ENSG00000265150 NA ENSG00000265150 170681 299425 286295 747000 210962
ENSG00000164692 COL1A2 ENSG00000164692 169781 190854 263391 869194 180006
ENSG00000265735 RN7SL5P ENSG00000265735 78113 121697 113379 532435 112977
ENSG00000259001 ENSG00000259001 55081 78811 73032 353442 100823

Summarize the mapping

n_ensembl_total <- expr_mapped %>%
  distinct(ensembl_gene_id) %>%
  nrow()

n_mapped <- expr_mapped %>%
  filter(!is.na(hgnc_symbol), hgnc_symbol != "") %>%
  distinct(ensembl_gene_id) %>%
  nrow()

n_unmapped <- n_ensembl_total - n_mapped

mapping_summary <- tibble::tibble(
  category = c("Total Ensembl IDs", "Mapped to HGNC", "Unmapped"),
  n = c(n_ensembl_total, n_mapped, n_unmapped)
)

mapping_summary
## # A tibble: 3 × 2
##   category              n
##   <chr>             <int>
## 1 Total Ensembl IDs 62248
## 2 Mapped to HGNC    40176
## 3 Unmapped          22072
unmapped_ids <- expr_mapped %>%
  distinct(ensembl_gene_id, hgnc_symbol) %>%
  filter(is.na(hgnc_symbol) | hgnc_symbol == "") %>%
  pull(ensembl_gene_id) %>%
  unique()

length(unmapped_ids)
## [1] 22072
library(stringr)

unmapped_classified <- tibble::tibble(id = unmapped_ids) %>%
  mutate(type = case_when(
    str_detect(id, "^ENSG\\d+$") ~ "Ensembl gene ID (ENSG)",
    str_detect(id, "^ENSG\\d+\\.\\d+$") ~ "Ensembl gene ID with version (ENSG.x) — needs stripping",
    str_detect(id, "^ENST\\d+") ~ "Ensembl transcript ID (ENST) — wrong target for gene mapping",
    str_detect(id, "^ENSP\\d+") ~ "Ensembl protein ID (ENSP) — wrong target",
    str_detect(id, "^ENS.*G\\d+") ~ "Non-human Ensembl gene (e.g., ENSMUSG...) — wrong organism",
    str_detect(id, "^ERCC-") ~ "ERCC spike-in control — not a gene",
    str_detect(id, "^MT-|^mt-") ~ "Mitochondrial gene symbol (not Ensembl ID) — different ID system",
    str_detect(id, "^\\d+$") ~ "Entrez Gene ID — different ID system",
    TRUE ~ "Other / unknown format"
  ))

unmapped_classified %>% count(type) %>% arrange(desc(n))
## # A tibble: 1 × 2
##   type                       n
##   <chr>                  <int>
## 1 Ensembl gene ID (ENSG) 22072
# Use the same mart/dataset as your mapping step:
# ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

ensg_details <- getBM(
  attributes = c(
    "ensembl_gene_id",
    "gene_biotype",
    "external_gene_name"
  ),
  filters = "ensembl_gene_id",
  values  = unmapped_ids,
  mart    = ensembl
)

head(ensg_details)
##   ensembl_gene_id                       gene_biotype external_gene_name
## 1 ENSG00000093100                             lncRNA                   
## 2 ENSG00000124593                     protein_coding                   
## 3 ENSG00000124835                             lncRNA                   
## 4 ENSG00000141979                     protein_coding                   
## 5 ENSG00000149656 transcribed_unprocessed_pseudogene                   
## 6 ENSG00000151303                             lncRNA

Output the distribution of biotypes for the subset of ensembl ids that have no HGNC ID.

library(ggplot2)
x <- ensg_details$gene_biotype
x[is.na(x)] <- "Unknown"

# Base R counts table -> data.frame
tab <- table(x)
df_counts <- data.frame(
  biotype = names(tab),
  n       = as.integer(tab),
  stringsAsFactors = FALSE
)

# Order by counts (ascending) for a nice horizontal bar chart
df_counts <- df_counts[order(df_counts$n,decreasing = TRUE), ]
df_counts$biotype <- factor(df_counts$biotype, levels = df_counts$biotype)

#only include the top 20 biotypes
df_counts <- df_counts[1:min(10, nrow(df_counts)), ]

ggplot(df_counts, aes(x = biotype, y = n, fill = biotype)) +
  geom_col(width = 0.8, color = "grey20") +
  coord_flip() +
  scale_fill_brewer(palette = "Set3", guide = "none") +
  labs(
    x = NULL, y = "Number of genes",
    title = "Distribution of Gene Biotypes - \ntop 10 types not mapped"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title.position = "plot",
    panel.grid.major.y = element_blank()
  )

Would the mapping of these identifiers have been different if we used a different version of Ensembl?