Chapter 5 Differential expression with edgeR (QLF)

In this chapter we run edgeR using the recommended quasi-likelihood pipeline:

  1. Start from raw counts in a DGEList
  2. Estimate dispersions
  3. Fit a QL GLM
  4. Perform QL F-tests for contrasts of interest

We use the same additive design:

\[ \text{counts} \sim \text{cellline} + \text{treatment} \]

obj <- readRDS("data/d1_qc_objects.rds")
dge_f <- obj$dge_f
meta <- obj$meta
mappings <- obj$mappings

design <- obj$design

# Contrasts
contr <- makeContrasts(
  `3CTG_vs_NT` = treatment3CTG - treatmentNT,
  `20CTG_vs_NT` = treatment20CTG - treatmentNT,
  `20CTG_vs_3CTG` = treatment20CTG - treatment3CTG,
  levels = design
)

# Estimate dispersion and fit QL model
edge <- edgeR::estimateDisp(dge_f, design)
fit <- edgeR::glmQLFit(edge, design)

5.1 Plot Biological coefficient of variance plot

plotBCV(edge)

5.2 Test: 3CTG vs NT and 20CTG vs NT

With this design, the coefficients treatment3CTG and treatment20CTG correspond to contrasts vs NT.

qlf_3_vs_nt <- edgeR::glmQLFTest(fit, contrast = contr[, "3CTG_vs_NT"])
qlf_20_vs_nt <- edgeR::glmQLFTest(fit, contrast = contr[, "20CTG_vs_NT"])

edgeR::topTags(qlf_20_vs_nt)
## Coefficient:  1*treatment20CTG -1*treatmentNT 
##                    logFC   logCPM         F       PValue          FDR
## ENSG00000134321 7.394417 4.870993 154.68256 1.329498e-07 0.0008950585
## ENSG00000028277 5.415078 2.672833 115.06349 1.342554e-07 0.0008950585
## ENSG00000130303 4.851522 4.759055 126.22056 2.336858e-07 0.0008950585
## ENSG00000142627 3.506374 3.654145 136.69159 2.434871e-07 0.0008950585
## ENSG00000138646 5.134338 3.601337 118.40507 3.291525e-07 0.0009679716
## ENSG00000106785 2.887412 5.773032 116.76336 5.126668e-07 0.0012563753
## ENSG00000173535 3.773864 2.895942 107.75445 6.975318e-07 0.0013657287
## ENSG00000187608 4.541247 8.814995 108.12725 7.430515e-07 0.0013657287
## ENSG00000135114 5.347384 2.941023  80.44935 9.846400e-07 0.0015619173
## ENSG00000184979 3.086410 4.887186 100.50212 1.062240e-06 0.0015619173

5.3 Test: 20CTG vs 3CTG

qlf_20_vs_3 <- edgeR::glmQLFTest(fit, contrast = contr[, "20CTG_vs_3CTG"])

edgeR::topTags(qlf_20_vs_3)
## Coefficient:  1*treatment20CTG -1*treatment3CTG 
##                     logFC      logCPM        F       PValue       FDR
## ENSG00000223883 -3.221550 -0.29165168 23.13003 0.0007347709 0.9209883
## ENSG00000280800  3.972084  3.13680001 14.61103 0.0021195502 0.9209883
## ENSG00000187672 -1.503998  1.40149268 15.71926 0.0024475763 0.9209883
## ENSG00000115266  1.654827  0.81019376 15.52729 0.0024905149 0.9209883
## ENSG00000147586  1.290415  1.60657763 15.29760 0.0026685337 0.9209883
## ENSG00000241781  3.766968  0.09488949 13.92006 0.0029691485 0.9209883
## ENSG00000102003  1.980855  0.22390723 14.43907 0.0030659698 0.9209883
## ENSG00000105357  1.886779  1.69377897 13.10103 0.0032984753 0.9209883
## ENSG00000184524  1.631481  1.49875360 13.57196 0.0036287558 0.9209883
## ENSG00000260630  0.986704  3.53676750 13.59760 0.0038688768 0.9209883

5.4 Extract full results

tab_3_vs_nt <- edgeR::topTags(qlf_3_vs_nt, n = Inf)$table %>% rownames_to_column("ensembl")
tab_20_vs_nt <- edgeR::topTags(qlf_20_vs_nt, n = Inf)$table %>% rownames_to_column("ensembl")
tab_20_vs_3 <- edgeR::topTags(qlf_20_vs_3, n = Inf)$table %>% rownames_to_column("ensembl")

head(tab_20_vs_nt)
##           ensembl    logFC   logCPM        F       PValue          FDR
## 1 ENSG00000134321 7.394417 4.870993 154.6826 1.329498e-07 0.0008950585
## 2 ENSG00000028277 5.415078 2.672833 115.0635 1.342554e-07 0.0008950585
## 3 ENSG00000130303 4.851522 4.759055 126.2206 2.336858e-07 0.0008950585
## 4 ENSG00000142627 3.506374 3.654145 136.6916 2.434871e-07 0.0008950585
## 5 ENSG00000138646 5.134338 3.601337 118.4051 3.291525e-07 0.0009679716
## 6 ENSG00000106785 2.887412 5.773032 116.7634 5.126668e-07 0.0012563753

5.5 Volcano plot of EdgeR results

volcano_df <- tab_20_vs_nt %>%
  mutate(sig = FDR < 0.05)

ggplot(volcano_df, aes(x = logFC, y = -log10(PValue), color = sig)) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  labs(
    title = "EdgeR: 20CTG vs NT",
    x = "log2 fold-change",
    y = "-log10(p-value)"
  )

tab <- topTags(qlf_20_vs_nt, n = Inf)$table %>%
  rownames_to_column("gene") %>%
  mutate(
    neglog10P = -log10(PValue),
    direction = case_when(
      FDR < 0.05 & logFC > 0 ~ "Up",
      FDR < 0.05 & logFC < 0 ~ "Down",
      TRUE ~ "Not sig"
    )
  )

ggplot(tab, aes(x = logFC, y = neglog10P)) +
  geom_point(aes(color = direction), alpha = 0.7, size = 1.2) +
  scale_color_manual(values = c(
    "Up" = "red",
    "Down" = "blue",
    "Not sig" = "grey70"
  )) +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "steelblue") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "steelblue") +
  theme_minimal() +
  labs(
    title = "edgeR volcano: 20CTG vs NT",
    x = "log2 fold change (logFC)",
    y = "-log10(p-value)",
    color = NULL
  )

5.6 Visualize the top hits as a heatmap

first map the ensembl ids to gene ids

# Keep only the columns we need and ensure they are character
map_df <- mappings %>%
  transmute(
    ensembl = as.character(ensembl_gene_id),
    hgnc_symbol = as.character(hgnc_symbol)
  ) %>%
  distinct(ensembl, .keep_all = TRUE)

# Join mapping onto the edgeR results table (tab_20_vs_nt)
tab_20_vs_nt_annot <- tab_20_vs_nt %>%
  mutate(ensembl = as.character(ensembl)) %>%
  left_join(map_df, by = "ensembl") %>%
  mutate(
    # Use HGNC when available, otherwise fall back to Ensembl
    display_name = ifelse(!is.na(hgnc_symbol) & hgnc_symbol != "",
                          hgnc_symbol,
                          ensembl)
  )

# Quick check: how many mapped?
mean(!is.na(tab_20_vs_nt_annot$hgnc_symbol))
## [1] 0.9876904
head(tab_20_vs_nt_annot[, c("ensembl","hgnc_symbol","display_name","logFC","FDR")])
##           ensembl hgnc_symbol display_name    logFC          FDR
## 1 ENSG00000134321       RSAD2        RSAD2 7.394417 0.0008950585
## 2 ENSG00000028277      POU2F2       POU2F2 5.415078 0.0008950585
## 3 ENSG00000130303        BST2         BST2 4.851522 0.0008950585
## 4 ENSG00000142627       EPHA2        EPHA2 3.506374 0.0008950585
## 5 ENSG00000138646       HERC5        HERC5 5.134338 0.0009679716
## 6 ENSG00000106785      TRIM14       TRIM14 2.887412 0.0012563753

5.7 Select top hits from tab_20_vs_nt_annot

Pick either top N by FDR (good default for teaching) or all genes with FDR < 0.05.

5.7.1 top N genes by FDR

top_n <- 50  # change to 25/100 as desired

top_hits_tbl <- tab_20_vs_nt_annot %>%
  arrange(FDR, PValue) %>%
  slice_head(n = top_n)

# Ensembl IDs for subsetting the matrix
top_hits_ensembl <- top_hits_tbl$ensembl

# Row labels we want to show on the heatmap
top_hits_labels <- top_hits_tbl$display_name

head(top_hits_tbl[, c("ensembl","display_name","logFC","FDR")])
##           ensembl display_name    logFC          FDR
## 1 ENSG00000134321        RSAD2 7.394417 0.0008950585
## 2 ENSG00000028277       POU2F2 5.415078 0.0008950585
## 3 ENSG00000130303         BST2 4.851522 0.0008950585
## 4 ENSG00000142627        EPHA2 3.506374 0.0008950585
## 5 ENSG00000138646        HERC5 5.134338 0.0009679716
## 6 ENSG00000106785       TRIM14 2.887412 0.0012563753

5.8 Make the expression matrix for the heatmap (logCPM + row z-score)

We’ll compute logCPM from counts (recommended for visualization), then z-score per gene.

# logCPM from counts for visualization
# logCPM for visualization
logcpm <- edgeR::cpm(counts, log = TRUE, prior.count = 1)

# Keep only genes present in the expression matrix
keep <- top_hits_ensembl %in% rownames(logcpm)

hm_mat <- logcpm[top_hits_ensembl[keep], , drop = FALSE]

# Create labels aligned to hm_mat rows
hm_labels <- top_hits_labels[keep]

# IMPORTANT: avoid duplicate rownames (HGNC symbols can repeat)
# Make labels unique while preserving readability
rownames(hm_mat) <- make.unique(hm_labels)

# Z-score per gene across samples for pattern visibility
hm_z <- t(scale(t(hm_mat)))

dim(hm_z)
## [1] 50  9

5.9 Sample annotation (treatment + cellline) using Set3

meta$cellline <- as.factor(meta$cellline)
meta$treatment <- as.factor(meta$treatment)

# Set3 palettes (qualitative)
treat_levels <- levels(meta$treatment)
cell_levels  <- levels(meta$cellline)

treat_cols <- setNames(
  brewer.pal(max(3, length(treat_levels)), "Set3")[seq_along(treat_levels)],
  treat_levels
)

cell_cols <- setNames(
  brewer.pal(max(3, length(cell_levels)), "Set2")[seq_along(cell_levels)],
  cell_levels
)

ha <- HeatmapAnnotation(
  treatment = meta$treatment,
  cellline  = meta$cellline,
  col = list(
    treatment = treat_cols,
    cellline  = cell_cols
  ),
  annotation_legend_param = list(
    treatment = list(title = "Treatment"),
    cellline  = list(title = "Cell line")
  )
)

5.10 5) Draw the ComplexHeatmap heatmap

This produces the actual figure. Since we z-scored the rows, use a diverging palette centered at 0.

col_fun <- circlize::colorRamp2(c(-2, 0, 2), c("blue", "white", "red"))

Heatmap(
  hm_z,
  col = col_fun,
  top_annotation = ha,
  show_row_names = FALSE,
  show_column_names = FALSE,
  cluster_rows = TRUE,
  cluster_columns = TRUE,
  row_title = paste0("Top ", nrow(hm_z), " genes (edgeR: 20CTG vs NT)"),
  column_title = "Samples"
)

Add labels of the gene names - only include 25 of the top hit for easier visibility.

Is this the correct way to add the top 25?

Heatmap(
  hm_z[1:25,],
  col = col_fun,
  top_annotation = ha,
  show_row_names = TRUE,         # <-- now shows HGNC symbols
  row_names_gp = grid::gpar(fontsize = 8),
  show_column_names = FALSE,
  cluster_rows = TRUE,
  cluster_columns = TRUE,
  row_title = paste0("Top ", nrow(hm_z), " genes (edgeR: 20CTG vs NT)"),
  column_title = "Samples"
)

5.11 Save edgeR results

saveRDS(
  list(
    meta = meta,
    counts_mapped_filtered = counts_mapped_filtered,
    fit = fit,
    qlf_3_vs_nt = qlf_3_vs_nt,
    qlf_20_vs_nt = qlf_20_vs_nt,
    qlf_20_vs_3 = qlf_20_vs_3,
    tab_3_vs_nt = tab_3_vs_nt,
    tab_20_vs_nt = tab_20_vs_nt,
    tab_20_vs_3 = tab_20_vs_3
  ),
  file = "data/d1_edger_results.rds"
)