Chapter 5 Worked normalization examples

We will demonstrate exploratory normalization steps on the expression datasets.

5.1 Setup: download supplementary files

5.2 load Some data

Raw table preview

library(readr)

safe_read <- function(file) {
  # First attempt: read as TSV
  df <- tryCatch(
    readr::read_tsv(file, show_col_types = FALSE),
    error = function(e) NULL   # catch fatal errors
  )
  
  # If read_tsv failed entirely:
  if (is.null(df)) {
    message("TSV read failed — reading as space-delimited file instead.")
    return(readr::read_table(file, show_col_types = FALSE))
  }
  
  # If read_tsv returned but with parsing issues:
  probs <- problems(df)
  if (nrow(probs) > 0) {
    message("Parsing issues detected in TSV — reading as space-delimited file instead.")
    return(readr::read_table(file, show_col_types = FALSE))
  }
  
  # If everything was fine:
  return(df)
}

x <- safe_read(files[1])


kable_head(x[, 1:min(6, ncol(x))], 5, paste(gse,": raw table preview"))

(#tab:preview_raw)GSE233947 : raw table preview
Gene	T8657_900CTG_NT	T8658_1150CTG_NT	T8659_1450CTG_NT	T8660_900CTG_20CTG	T8661_1150CTG_20CTG
ENSG00000108821	456397	486088	608151	2012962	379186
ENSG00000265150	170681	299425	286295	747000	210962
ENSG00000164692	169781	190854	263391	869194	180006
ENSG00000265735	78113	121697	113379	532435	112977
ENSG00000259001	55081	78811	73032	353442	100823

5.3 Example 1: GSE119732

Before doing anything to this data. What does it look like?

plot_box(x[,2:ncol(x)])

plot_density(x[,2:ncol(x)])

5.4 Convert the raw counts to Counts per million (CPM)

library(edgeR)

x_cpm <- cpm(y = x[,2:ncol(x)])

plot_box(x_cpm,main = "CPM - all genes")

Visualize as a density plot as well

plot_density(x_cpm,main = "CPM - all genes, NO design")

There are a lot of genes that have zero expression. That is the majority of them. So we need to get rid of them.

5.5 filter out lowly expressed genes

to_remove <- edgeR::filterByExpr(x_cpm,min.count = 3)
x_cpm_filtered <- x_cpm[to_remove,]

plot_box(x_cpm_filtered,main = "CPM filtered out lowly expressed - \nNO design matrix")

visualize this as density plot as well.

plot_density(x_cpm_filtered,main = "CPM filtered out lowly expressed - \n - NO design matrix")

5.6 Incorporate a design matrix - description of the samples.

The above was usingn all of the samples the same but this dataset has varying sample types. I might be hard to figure it out just from column names as they are not so informative but let’s just guess

colnames(x)

##  [1] "Gene"                "T8657_900CTG_NT"     "T8658_1150CTG_NT"   
##  [4] "T8659_1450CTG_NT"    "T8660_900CTG_20CTG"  "T8661_1150CTG_20CTG"
##  [7] "T8662_1450CTG_20CTG" "T8663_900CTG_3CTG"   "T8664_1150CTG_3CTG" 
## [10] "T8665_1450CTG_3CTG"

#design matrix - 
samples <- colnames(x)[2:ncol(x)]
patient  <- unlist(lapply(samples,FUN = function(x){unlist(strsplit(x,split = "_"))[2]}))       
celltype <- unlist(lapply(samples,FUN = function(x){unlist(strsplit(x,split = "_"))[3]}))     
sample_data <- data.frame(samples, patient, celltype)


design <- model.matrix(~ 0 + celltype,data = sample_data)
rownames(design) <- sample_data$samples
colnames(design) <- paste0("celltype", levels(factor(celltype)))
design

##                     celltype20CTG celltype3CTG celltypeNT
## T8657_900CTG_NT                 0            0          1
## T8658_1150CTG_NT                0            0          1
## T8659_1450CTG_NT                0            0          1
## T8660_900CTG_20CTG              1            0          0
## T8661_1150CTG_20CTG             1            0          0
## T8662_1450CTG_20CTG             1            0          0
## T8663_900CTG_3CTG               0            1          0
## T8664_1150CTG_3CTG              0            1          0
## T8665_1450CTG_3CTG              0            1          0
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$celltype
## [1] "contr.treatment"

Filter use design information

to_remove_withdesign <- edgeR::filterByExpr(x_cpm,
                                            min.count = 3,
                                            design = design)
x_cpm_filtered_withdesign <- x_cpm[to_remove_withdesign,]

colnames(x_cpm_filtered_withdesign ) <- paste(sample_data$celltype,
                                              1:nrow(sample_data),sep = "_")

plot_box(x_cpm_filtered_withdesign,
         main = "CPM filtered out lowly expressed - \n - with design matrix")

plot_density(x_cpm_filtered_withdesign,main = "CPM filtered out lowly expressed - \n - with design matrix")

5.7 Normalize Dataset using TMM

library(edgeR)

dge <- DGEList(counts = x[,2:ncol(x)])
dge_filtered <- dge[filterByExpr(dge),]
dge_filtered  <- calcNormFactors(dge_filtered , method = "TMM")
norm_cpm <- cpm(dge_filtered , log = FALSE, prior.count = 1)

plot_box(norm_cpm)

plot_density(norm_cpm)

5.8 Look at the distribution of our samples in 2D space

y <- dge_filtered

plotMDS(y, top = 500, labels = colnames(y), 
       col = as.integer(y$samples$group))
legend("topright", legend = levels(y$samples$group),
       col = seq_along(levels(y$samples$group)), pch = 16, bty = "n")

Now incorporate the design into the process

dge <- DGEList(counts = x[,2:ncol(x)],group = sample_data$celltype)
dge_filtered <- dge[filterByExpr(dge),]
dge_filtered  <- calcNormFactors(dge_filtered , method = "TMM")
norm_cpm <- cpm(dge_filtered , log = FALSE, prior.count = 1)

plot_box(norm_cpm)

plot_density(norm_cpm)

y <- dge_filtered

plotMDS(y, top = 500, labels = sample_data$celltype,#colnames(y), 
       col = as.integer(y$samples$group))
legend("topright", legend = levels(y$samples$group),
       col = seq_along(levels(y$samples$group)), pch = 16, bty = "n")

Now incorporate the design into the process

dge <- DGEList(counts = x[,2:ncol(x)],group = sample_data$patient)
dge_filtered <- dge[filterByExpr(dge),]
dge_filtered  <- calcNormFactors(dge_filtered , method = "TMM")
norm_cpm <- cpm(dge_filtered , log = FALSE, prior.count = 1)

plot_box(norm_cpm)

plot_density(norm_cpm)

y <- dge_filtered

plotMDS(y, top = 500, labels = sample_data$patient,#  colnames(y), 
       col = as.integer(y$samples$group))
legend("topright", legend = levels(y$samples$group),
       col = seq_along(levels(y$samples$group)), pch = 16, bty = "n")

5.9 Normalize the Dataset using RLE

library(DESeq2)

counts <- x[,2:ncol(x)]

keep <- rowSums(counts >= 10) >= 2
counts_filtered <- counts[keep, ]

dds <- DESeqDataSetFromMatrix(countData = counts_filtered, 
                              colData = sample_data,
                              design = design)
dds <- estimateSizeFactors(dds)

norm_counts <- counts(dds, normalized = TRUE)

plot_box(norm_counts)

plot_density(norm_counts)