Chapter 7 Get the data - Dataset 2

7.1 Download the GEO supplementary file(s)

This chapter downloads it automatically using GEOquery.

GSE_ID <- "GSE240829"

# Download supplementary files into ./data/GSE240829/
# (If already downloaded, GEOquery will reuse existing files.)

dir.create("data", showWarnings = FALSE)

supp <- GEOquery::getGEOSuppFiles(
  GSE_ID,
  makeDirectory = TRUE,
  baseDir = "data"
)

7.2 Read the FeatureCounts matrix

counts_file <- file.path("data", GSE_ID, "GSE240829_POCROC_good.abundance.salmon.genes_8.14.23.txt.gz")
stopifnot(file.exists(counts_file))

fc <- readr::read_tsv(counts_file, show_col_types = FALSE)

# Peek at the columns
names(fc)[1:min(12, ncol(fc))]

##  [1] "...1"               "21020_0_ovary"      "21020_2_lymph_node" "24487_0_l_ovary"   
##  [5] "24487_1_lymph_node" "25258_0_l_ovary"    "25258_1_diaphragm"  "29764_0_l_ovary"   
##  [9] "30961_0_r_ovary"    "30961_3_rectum"     "32761_0_l_ovary"    "32761_2_spleen"

7.2.1 Build a counts matrix (genes × samples)

sample_cols <- colnames(fc)

counts <- fc %>%
  dplyr::select( all_of(sample_cols)) %>%
  as.data.frame()

rownames(counts) <- counts[,1]
counts[,1] <- NULL

counts <- as.matrix(counts)
#storage.mode(counts) <- "integer"

dim(counts)

## [1] 33969    43

counts[1:3, 1:3]

##                   21020_0_ovary 21020_2_lymph_node 24487_0_l_ovary
## ENSG00000230092.7     10.718211           4.244121        2.959815
## ENSG00000177757.2      0.018135           0.025687        0.070439
## ENSG00000228794.8      2.335282           2.654898        0.616134

7.3 Create metadata from sample names

Sample names in this series follow patterns like:

21020_0_ovary 21020_2_lymph_node 24487_0_l_ovary … Where the first number is the patient id, second number is whether it is primary or recurrent and the third is location. It was separated really badly because there are underscores in the location name as well. (l_ovary = left ovary). –> just grab the last token for the location

We will parse:

‘patient’ - patient id
‘status’ - primary, recurrent1, recurrent2, recurrent3
‘location’ - ovary,

#design matrix - 
samples <- colnames(counts)[1:ncol(counts)]
patient  <- unlist(lapply(samples,FUN =
                             function(x){unlist(strsplit(x,split = "_"))[1]}))       
status <- unlist(lapply(samples,FUN = 
                             function(x){unlist(strsplit(x,split = "_"))[2]}))     
location <- unlist(lapply(samples,FUN = 
                             function(x){
                               if(length(unlist(strsplit(x, "_"))) > 3){
                                 y <- paste(unlist(strsplit(x, "_"))[3:length(unlist(strsplit(x, "_")))],collapse ="_")
                               } else{
                                 y <- unlist(strsplit(x, "_"))[3]
                               }
                               return(y)
                               }))  
meta <- data.frame(samples, patient,status, location)

7.4 Identifier mapping

strip_ensembl_version <- function(x) sub("\\..*$", "", x)

kable_head <- function(x, n = 5, caption = NULL) {
  knitr::kable(utils::head(x, n), caption = caption)
}

mapped_counts <- data.frame(ensembl_gene_id = strip_ensembl_version(rownames(counts)),counts)
ids<- mapped_counts$ensembl_gene_id

if (any(grepl("^ENSG", strip_ensembl_version(ids)))) {
  library(biomaRt)
  ensembl_ids <- unique(strip_ensembl_version(ids))
  ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

  map <- getBM(
    attributes = c("ensembl_gene_id", "hgnc_symbol"),
    filters = "ensembl_gene_id",
    values = ensembl_ids,
    mart = ensembl
  )

  kable_head(map, 10, paste(": mapping preview"))

counts_mapped <- mapped_counts %>%
    left_join(map, by = "ensembl_gene_id") %>%
  dplyr::select( ensembl_gene_id,hgnc_symbol, everything())


  kable_head(counts_mapped[, 1:min(8, ncol(counts_mapped))], 5, paste(": mapped table preview"))
}

Table 7.1: : mapped table preview
ensembl_gene_id	hgnc_symbol	X21020_0_ovary	X21020_2_lymph_node	X24487_0_l_ovary	X24487_1_lymph_node	X25258_0_l_ovary	X25258_1_diaphragm
ENSG00000230092		10.718211	4.244121	2.959815	1.920198	1.591502	0.695518
ENSG00000177757	FAM87B	0.018135	0.025687	0.070439	0.031852	0.005534	0.000000
ENSG00000228794	LINC01128	2.335282	2.654898	0.616134	0.635164	0.477670	0.851483
ENSG00000225880	LINC00115	0.425905	0.396866	0.313534	0.296781	0.125387	0.249801
ENSG00000230368	FAM41C	0.230066	0.409599	0.420536	0.126110	0.165970	0.029084

7.5 Save intermediate objects

saveRDS(list(counts = counts, 
             meta = meta, 
             mappings = counts_mapped[,1:2]), file = "data/d2_counts_and_meta.rds")