Chapter 7 Get the data - Dataset 2

7.1 Download the GEO supplementary file(s)

This chapter downloads it automatically using GEOquery.

GSE_ID <- "GSE240829"

# Download supplementary files into ./data/GSE240829/
# (If already downloaded, GEOquery will reuse existing files.)

dir.create("data", showWarnings = FALSE)

supp <- GEOquery::getGEOSuppFiles(
  GSE_ID,
  makeDirectory = TRUE,
  baseDir = "data"
)

7.2 Read the FeatureCounts matrix

counts_file <- file.path("data", GSE_ID, "GSE240829_POCROC_good.abundance.salmon.genes_8.14.23.txt.gz")
stopifnot(file.exists(counts_file))

fc <- readr::read_tsv(counts_file, show_col_types = FALSE)

# Peek at the columns
names(fc)[1:min(12, ncol(fc))]
##  [1] "...1"               "21020_0_ovary"      "21020_2_lymph_node" "24487_0_l_ovary"   
##  [5] "24487_1_lymph_node" "25258_0_l_ovary"    "25258_1_diaphragm"  "29764_0_l_ovary"   
##  [9] "30961_0_r_ovary"    "30961_3_rectum"     "32761_0_l_ovary"    "32761_2_spleen"

7.2.1 Build a counts matrix (genes × samples)

sample_cols <- colnames(fc)

counts <- fc %>%
  dplyr::select( all_of(sample_cols)) %>%
  as.data.frame()

rownames(counts) <- counts[,1]
counts[,1] <- NULL

counts <- as.matrix(counts)
#storage.mode(counts) <- "integer"

dim(counts)
## [1] 33969    43
counts[1:3, 1:3]
##                   21020_0_ovary 21020_2_lymph_node 24487_0_l_ovary
## ENSG00000230092.7     10.718211           4.244121        2.959815
## ENSG00000177757.2      0.018135           0.025687        0.070439
## ENSG00000228794.8      2.335282           2.654898        0.616134

7.3 Create metadata from sample names

Sample names in this series follow patterns like:

21020_0_ovary 21020_2_lymph_node 24487_0_l_ovary … Where the first number is the patient id, second number is whether it is primary or recurrent and the third is location. It was separated really badly because there are underscores in the location name as well. (l_ovary = left ovary). –> just grab the last token for the location

We will parse:

  • ‘patient’ - patient id
  • ‘status’ - primary, recurrent1, recurrent2, recurrent3
  • ‘location’ - ovary,
#design matrix - 
samples <- colnames(counts)[1:ncol(counts)]
patient  <- unlist(lapply(samples,FUN =
                             function(x){unlist(strsplit(x,split = "_"))[1]}))       
status <- unlist(lapply(samples,FUN = 
                             function(x){unlist(strsplit(x,split = "_"))[2]}))     
location <- unlist(lapply(samples,FUN = 
                             function(x){
                               if(length(unlist(strsplit(x, "_"))) > 3){
                                 y <- paste(unlist(strsplit(x, "_"))[3:length(unlist(strsplit(x, "_")))],collapse ="_")
                               } else{
                                 y <- unlist(strsplit(x, "_"))[3]
                               }
                               return(y)
                               }))  
meta <- data.frame(samples, patient,status, location)

7.4 Identifier mapping

strip_ensembl_version <- function(x) sub("\\..*$", "", x)

kable_head <- function(x, n = 5, caption = NULL) {
  knitr::kable(utils::head(x, n), caption = caption)
}

mapped_counts <- data.frame(ensembl_gene_id = strip_ensembl_version(rownames(counts)),counts)
ids<- mapped_counts$ensembl_gene_id

if (any(grepl("^ENSG", strip_ensembl_version(ids)))) {
  library(biomaRt)
  ensembl_ids <- unique(strip_ensembl_version(ids))
  ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

  map <- getBM(
    attributes = c("ensembl_gene_id", "hgnc_symbol"),
    filters = "ensembl_gene_id",
    values = ensembl_ids,
    mart = ensembl
  )

  kable_head(map, 10, paste(": mapping preview"))

counts_mapped <- mapped_counts %>%
    left_join(map, by = "ensembl_gene_id") %>%
  dplyr::select( ensembl_gene_id,hgnc_symbol, everything())


  kable_head(counts_mapped[, 1:min(8, ncol(counts_mapped))], 5, paste(": mapped table preview"))
}
Table 7.1: : mapped table preview
ensembl_gene_id hgnc_symbol X21020_0_ovary X21020_2_lymph_node X24487_0_l_ovary X24487_1_lymph_node X25258_0_l_ovary X25258_1_diaphragm
ENSG00000230092 10.718211 4.244121 2.959815 1.920198 1.591502 0.695518
ENSG00000177757 FAM87B 0.018135 0.025687 0.070439 0.031852 0.005534 0.000000
ENSG00000228794 LINC01128 2.335282 2.654898 0.616134 0.635164 0.477670 0.851483
ENSG00000225880 LINC00115 0.425905 0.396866 0.313534 0.296781 0.125387 0.249801
ENSG00000230368 FAM41C 0.230066 0.409599 0.420536 0.126110 0.165970 0.029084

7.5 Save intermediate objects

saveRDS(list(counts = counts, 
             meta = meta, 
             mappings = counts_mapped[,1:2]), file = "data/d2_counts_and_meta.rds")