Chapter 2 Get the data

2.1 Download the GEO supplementary file(s)

This chapter downloads it automatically using GEOquery.

GSE_ID <- "GSE233947"

# Download supplementary files into ./data/GSE233947/
# (If already downloaded, GEOquery will reuse existing files.)

dir.create("data", showWarnings = FALSE)

supp <- GEOquery::getGEOSuppFiles(
  GSE_ID,
  makeDirectory = TRUE,
  baseDir = "data"
)

2.2 Read the FeatureCounts matrix

counts_file <- file.path("data", GSE_ID, "GSE233947_FeatureCounts_V31genes_RawCounts_ENSG.tsv.gz")
stopifnot(file.exists(counts_file))

fc <- readr::read_tsv(counts_file, show_col_types = FALSE)

# Peek at the columns
names(fc)[1:min(12, ncol(fc))]
##  [1] "Gene"                "T8657_900CTG_NT"     "T8658_1150CTG_NT"   
##  [4] "T8659_1450CTG_NT"    "T8660_900CTG_20CTG"  "T8661_1150CTG_20CTG"
##  [7] "T8662_1450CTG_20CTG" "T8663_900CTG_3CTG"   "T8664_1150CTG_3CTG" 
## [10] "T8665_1450CTG_3CTG"

2.2.1 Build a counts matrix (genes × samples)

sample_cols <- colnames(fc)

counts <- fc %>%
  dplyr::select( all_of(sample_cols)) %>%
  as.data.frame()

rownames(counts) <- counts$Gene
counts$Gene <- NULL

counts <- as.matrix(counts)
storage.mode(counts) <- "integer"

dim(counts)
## [1] 62248     9
counts[1:3, 1:3]
##                 T8657_900CTG_NT T8658_1150CTG_NT T8659_1450CTG_NT
## ENSG00000108821          456397           486088           608151
## ENSG00000265150          170681           299425           286295
## ENSG00000164692          169781           190854           263391

2.3 Create metadata from sample names

Sample names in this series follow patterns like:

  • 900CTG_NT
  • 900CTG_3'CTG
  • 900CTG_20CTG

We will parse:

  • cellline: 900CTG, 1150CTG, 1450CTG
  • treatment: NT, 3CTG, 20CTG
#design matrix - 
samples <- colnames(counts)[1:ncol(counts)]
cellline  <- unlist(lapply(samples,FUN =
                             function(x){unlist(strsplit(x,split = "_"))[2]}))       
treatment <- unlist(lapply(samples,FUN = 
                             function(x){unlist(strsplit(x,split = "_"))[3]}))     
meta <- data.frame(samples, cellline, treatment)

2.4 Identifier mapping

strip_ensembl_version <- function(x) sub("\\..*$", "", x)

kable_head <- function(x, n = 5, caption = NULL) {
  knitr::kable(utils::head(x, n), caption = caption)
}

mapped_counts <- data.frame(ensembl_gene_id = rownames(counts),counts)
ids<- mapped_counts$ensembl_gene_id

if (any(grepl("^ENSG", strip_ensembl_version(ids)))) {
  library(biomaRt)
  ensembl_ids <- unique(strip_ensembl_version(ids))
  ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

  map <- getBM(
    attributes = c("ensembl_gene_id", "hgnc_symbol"),
    filters = "ensembl_gene_id",
    values = ensembl_ids,
    mart = ensembl
  )

  kable_head(map, 10, paste(": mapping preview"))

counts_mapped <- mapped_counts %>%
    left_join(map, by = "ensembl_gene_id") %>%
  dplyr::select( ensembl_gene_id,hgnc_symbol, everything())


  kable_head(counts_mapped[, 1:min(8, ncol(counts_mapped))], 5, paste(": mapped table preview"))
}
Table 2.1: : mapped table preview
ensembl_gene_id hgnc_symbol T8657_900CTG_NT T8658_1150CTG_NT T8659_1450CTG_NT T8660_900CTG_20CTG T8661_1150CTG_20CTG T8662_1450CTG_20CTG
ENSG00000108821 COL1A1 456397 486088 608151 2012962 379186 474716
ENSG00000265150 NA 170681 299425 286295 747000 210962 148756
ENSG00000164692 COL1A2 169781 190854 263391 869194 180006 217321
ENSG00000265735 RN7SL5P 78113 121697 113379 532435 112977 75960
ENSG00000259001 55081 78811 73032 353442 100823 118293

2.5 Save intermediate objects

saveRDS(list(counts = counts, 
             meta = meta, 
             mappings = counts_mapped[,1:2]), 
        file = "data/d1_counts_and_meta.rds")