Chapter 7 Get the data - Dataset 2
7.2 Read the FeatureCounts matrix
counts_file <- file.path("data", GSE_ID, "GSE240829_POCROC_good.abundance.salmon.genes_8.14.23.txt.gz")
stopifnot(file.exists(counts_file))
fc <- readr::read_tsv(counts_file, show_col_types = FALSE)
# Peek at the columns
names(fc)[1:min(12, ncol(fc))]## [1] "...1" "21020_0_ovary" "21020_2_lymph_node" "24487_0_l_ovary"
## [5] "24487_1_lymph_node" "25258_0_l_ovary" "25258_1_diaphragm" "29764_0_l_ovary"
## [9] "30961_0_r_ovary" "30961_3_rectum" "32761_0_l_ovary" "32761_2_spleen"
7.2.1 Build a counts matrix (genes × samples)
sample_cols <- colnames(fc)
counts <- fc %>%
dplyr::select( all_of(sample_cols)) %>%
as.data.frame()
rownames(counts) <- counts[,1]
counts[,1] <- NULL
counts <- as.matrix(counts)
#storage.mode(counts) <- "integer"
dim(counts)## [1] 33969 43
## 21020_0_ovary 21020_2_lymph_node 24487_0_l_ovary
## ENSG00000230092.7 10.718211 4.244121 2.959815
## ENSG00000177757.2 0.018135 0.025687 0.070439
## ENSG00000228794.8 2.335282 2.654898 0.616134
7.3 Create metadata from sample names
Sample names in this series follow patterns like:
21020_0_ovary 21020_2_lymph_node 24487_0_l_ovary … Where the first number is the patient id, second number is whether it is primary or recurrent and the third is location. It was separated really badly because there are underscores in the location name as well. (l_ovary = left ovary). –> just grab the last token for the location
We will parse:
- ‘patient’ - patient id
- ‘status’ - primary, recurrent1, recurrent2, recurrent3
- ‘location’ - ovary,
#design matrix -
samples <- colnames(counts)[1:ncol(counts)]
patient <- unlist(lapply(samples,FUN =
function(x){unlist(strsplit(x,split = "_"))[1]}))
status <- unlist(lapply(samples,FUN =
function(x){unlist(strsplit(x,split = "_"))[2]}))
location <- unlist(lapply(samples,FUN =
function(x){
if(length(unlist(strsplit(x, "_"))) > 3){
y <- paste(unlist(strsplit(x, "_"))[3:length(unlist(strsplit(x, "_")))],collapse ="_")
} else{
y <- unlist(strsplit(x, "_"))[3]
}
return(y)
}))
meta <- data.frame(samples, patient,status, location)7.4 Identifier mapping
strip_ensembl_version <- function(x) sub("\\..*$", "", x)
kable_head <- function(x, n = 5, caption = NULL) {
knitr::kable(utils::head(x, n), caption = caption)
}
mapped_counts <- data.frame(ensembl_gene_id = strip_ensembl_version(rownames(counts)),counts)
ids<- mapped_counts$ensembl_gene_id
if (any(grepl("^ENSG", strip_ensembl_version(ids)))) {
library(biomaRt)
ensembl_ids <- unique(strip_ensembl_version(ids))
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
map <- getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol"),
filters = "ensembl_gene_id",
values = ensembl_ids,
mart = ensembl
)
kable_head(map, 10, paste(": mapping preview"))
counts_mapped <- mapped_counts %>%
left_join(map, by = "ensembl_gene_id") %>%
dplyr::select( ensembl_gene_id,hgnc_symbol, everything())
kable_head(counts_mapped[, 1:min(8, ncol(counts_mapped))], 5, paste(": mapped table preview"))
}| ensembl_gene_id | hgnc_symbol | X21020_0_ovary | X21020_2_lymph_node | X24487_0_l_ovary | X24487_1_lymph_node | X25258_0_l_ovary | X25258_1_diaphragm |
|---|---|---|---|---|---|---|---|
| ENSG00000230092 | 10.718211 | 4.244121 | 2.959815 | 1.920198 | 1.591502 | 0.695518 | |
| ENSG00000177757 | FAM87B | 0.018135 | 0.025687 | 0.070439 | 0.031852 | 0.005534 | 0.000000 |
| ENSG00000228794 | LINC01128 | 2.335282 | 2.654898 | 0.616134 | 0.635164 | 0.477670 | 0.851483 |
| ENSG00000225880 | LINC00115 | 0.425905 | 0.396866 | 0.313534 | 0.296781 | 0.125387 | 0.249801 |
| ENSG00000230368 | FAM41C | 0.230066 | 0.409599 | 0.420536 | 0.126110 | 0.165970 | 0.029084 |