Chapter 5 Finding Datasets of Interest
Goal: RNA-seq, human, recent (last ~3 years), and with sufficient samples.
5.1 RNA-seq series by platform technology
sql <- paste(
"SELECT DISTINCT gse.title, gse.gse, gpl.title",
"FROM gse JOIN gse_gpl ON gse_gpl.gse = gse.gse",
"JOIN gpl ON gse_gpl.gpl = gpl.gpl",
"WHERE gpl.technology = 'high-throughput sequencing'"
)
rs <- dbGetQuery(con, sql)
dim(rs)## [1] 61489 3
5.2 Add organism and recency filters
sql <- paste(
"SELECT DISTINCT gse.title, gse.gse, gpl.title, gse.submission_date",
"FROM gse JOIN gse_gpl ON gse_gpl.gse = gse.gse",
"JOIN gpl ON gse_gpl.gpl = gpl.gpl",
"WHERE gse.submission_date > '2022-01-01' AND",
"gpl.organism LIKE '%Homo sapiens%' AND",
"gpl.technology LIKE '%high-throughput seq%'"
)
rs <- dbGetQuery(con, sql)
dim(rs)## [1] 3202 4
5.3 Add a topic keyword (e.g., ovarian)
sql <- paste(
"SELECT DISTINCT gse.title, gse.gse, gpl.title, gse.submission_date",
"FROM gse JOIN gse_gpl ON gse_gpl.gse = gse.gse",
"JOIN gpl ON gse_gpl.gpl = gpl.gpl",
"WHERE gse.submission_date > '2022-01-01' AND",
"gse.title LIKE '%ovarian%' AND",
"gpl.organism LIKE '%Homo sapiens%' AND",
"gpl.technology LIKE '%high-throughput seq%'"
)
rs <- dbGetQuery(con, sql)
dim(rs)## [1] 27 4
## gse
## 1 GSE192898
## 2 GSE197320
## 3 GSE200852
## 4 GSE201223
## 5 GSE201337
## 6 GSE201525
## title
## 1 Single cell 5' RNA sequencing of 9 Korean ovarian cancer patients
## 2 Therapeutic Targeting of Ovarian Cancer Stem Cells Using Estrogen Receptor Beta Agonist
## 3 The microRNA profiles of ovarian clear cell carcinoma
## 4 Zinc increases hormone receptor expression in human ovarian granulosa cells and gene profiling
## 5 Investigation of the anti-tumour properties of interferon epsilon in high grade serous ovarian cancer in vivo.
## 6 Investigation of the anti-tumour properties of interferon epsilon in high grade serous ovarian cancer
5.4 Heuristic: require a counts-like supplementary file
sql <- paste(
"SELECT DISTINCT gse.title, gse.gse, gpl.title, gse.submission_date, gse.supplementary_file",
"FROM gse JOIN gse_gpl ON gse_gpl.gse = gse.gse",
"JOIN gpl ON gse_gpl.gpl = gpl.gpl",
"WHERE gse.submission_date > '2022-01-01' AND",
"gse.title LIKE '%ovarian%' AND",
"gpl.organism LIKE '%Homo sapiens%' AND",
"gpl.technology LIKE '%high-throughput sequencing%'",
"ORDER BY gse.submission_date DESC"
)
rs <- dbGetQuery(con, sql)
counts_files <- rs$supplementary_file[grep("count|cnt", rs$supplementary_file, ignore.case = TRUE)]
series_of_interest <- rs$gse[grep("count|cnt", rs$supplementary_file, ignore.case = TRUE)]
# Preview filenames
shortened <- unlist(lapply(counts_files, function(x){
x <- unlist(strsplit(x, ";"))
x <- x[grep("count|cnt|txt|csv|xlsx", x, ignore.case = TRUE)]
tail(unlist(strsplit(x, "/")), n = 1)
}))
head(shortened, 10)## [1] "GSE252379_samples_attr.txt.gz"
## [2] "GSE235755_raw_counts_batch_adjusted.txt.gz"
## [3] "GSE231560_OV90_TPM1.counts.txt.gz"
## [4] "GSE227276_MTF_KD_KURAMOCHI_FT246.sorted.htseq-count.TPM.txt.gz"
## [5] "GSE223011_Nijhuis_KURAMOCHI_VC_INDISULAM_humanCounts.tsv.xlsx"
## [6] "GSE222982_supplementary_file_estimated_raw_counts_for_all_samples.txt.gz"
## [7] "GSE212702_RNA_seq_count_table.xlsx"
## [8] "GSE207094_raw_counts.txt.gz"
## [9] "GSE202601_human_ovary_snRNA-seq_metadata.txt.gz"
## [10] "GSE201337_PDX_IFN_UMI_counts.csv.gz"
5.5 Require a minimum number of samples per series
num_series <- dbGetQuery(con, paste(
"select * from gsm where series_id in ('",
paste(series_of_interest, collapse = "','"), "')", sep = ""))
gse.count <- as.data.frame(table(num_series$series_id))
subset(gse.count, Freq > 6)## Var1 Freq
## 1 GSE202601 16
## 2 GSE207094 45
## 3 GSE212702 84
## 4 GSE222982 10
## 6 GSE227276 30
## 7 GSE231560 15
## 8 GSE235755 11