Chapter 2 Get the data
2.2 Read the FeatureCounts matrix
counts_file <- file.path("data", GSE_ID, "GSE233947_FeatureCounts_V31genes_RawCounts_ENSG.tsv.gz")
stopifnot(file.exists(counts_file))
fc <- readr::read_tsv(counts_file, show_col_types = FALSE)
# Peek at the columns
names(fc)[1:min(12, ncol(fc))]## [1] "Gene" "T8657_900CTG_NT" "T8658_1150CTG_NT"
## [4] "T8659_1450CTG_NT" "T8660_900CTG_20CTG" "T8661_1150CTG_20CTG"
## [7] "T8662_1450CTG_20CTG" "T8663_900CTG_3CTG" "T8664_1150CTG_3CTG"
## [10] "T8665_1450CTG_3CTG"
2.2.1 Build a counts matrix (genes × samples)
sample_cols <- colnames(fc)
counts <- fc %>%
dplyr::select( all_of(sample_cols)) %>%
as.data.frame()
rownames(counts) <- counts$Gene
counts$Gene <- NULL
counts <- as.matrix(counts)
storage.mode(counts) <- "integer"
dim(counts)## [1] 62248 9
## T8657_900CTG_NT T8658_1150CTG_NT T8659_1450CTG_NT
## ENSG00000108821 456397 486088 608151
## ENSG00000265150 170681 299425 286295
## ENSG00000164692 169781 190854 263391
2.3 Create metadata from sample names
Sample names in this series follow patterns like:
900CTG_NT900CTG_3'CTG900CTG_20CTG
We will parse:
cellline:900CTG,1150CTG,1450CTGtreatment:NT,3CTG,20CTG
2.4 Identifier mapping
strip_ensembl_version <- function(x) sub("\\..*$", "", x)
kable_head <- function(x, n = 5, caption = NULL) {
knitr::kable(utils::head(x, n), caption = caption)
}
mapped_counts <- data.frame(ensembl_gene_id = rownames(counts),counts)
ids<- mapped_counts$ensembl_gene_id
if (any(grepl("^ENSG", strip_ensembl_version(ids)))) {
library(biomaRt)
ensembl_ids <- unique(strip_ensembl_version(ids))
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
map <- getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol"),
filters = "ensembl_gene_id",
values = ensembl_ids,
mart = ensembl
)
kable_head(map, 10, paste(": mapping preview"))
counts_mapped <- mapped_counts %>%
left_join(map, by = "ensembl_gene_id") %>%
dplyr::select( ensembl_gene_id,hgnc_symbol, everything())
kable_head(counts_mapped[, 1:min(8, ncol(counts_mapped))], 5, paste(": mapped table preview"))
}| ensembl_gene_id | hgnc_symbol | T8657_900CTG_NT | T8658_1150CTG_NT | T8659_1450CTG_NT | T8660_900CTG_20CTG | T8661_1150CTG_20CTG | T8662_1450CTG_20CTG |
|---|---|---|---|---|---|---|---|
| ENSG00000108821 | COL1A1 | 456397 | 486088 | 608151 | 2012962 | 379186 | 474716 |
| ENSG00000265150 | NA | 170681 | 299425 | 286295 | 747000 | 210962 | 148756 |
| ENSG00000164692 | COL1A2 | 169781 | 190854 | 263391 | 869194 | 180006 | 217321 |
| ENSG00000265735 | RN7SL5P | 78113 | 121697 | 113379 | 532435 | 112977 | 75960 |
| ENSG00000259001 | 55081 | 78811 | 73032 | 353442 | 100823 | 118293 |