Chapter 5 Finding Datasets of Interest

Goal: RNA-seq, human, recent (last ~3 years), and with sufficient samples.

5.1 RNA-seq series by platform technology

sql <- paste(
  "SELECT DISTINCT gse.title, gse.gse, gpl.title",
  "FROM gse JOIN gse_gpl ON gse_gpl.gse = gse.gse",
  "JOIN gpl ON gse_gpl.gpl = gpl.gpl",
  "WHERE gpl.technology = 'high-throughput sequencing'"
)
rs <- dbGetQuery(con, sql)
dim(rs)
## [1] 61489     3

5.2 Add organism and recency filters

sql <- paste(
  "SELECT DISTINCT gse.title, gse.gse, gpl.title, gse.submission_date",
  "FROM gse JOIN gse_gpl ON gse_gpl.gse = gse.gse",
  "JOIN gpl ON gse_gpl.gpl = gpl.gpl",
  "WHERE gse.submission_date > '2022-01-01' AND",
  "gpl.organism LIKE '%Homo sapiens%' AND",
  "gpl.technology LIKE '%high-throughput seq%'"
)
rs <- dbGetQuery(con, sql)
dim(rs)
## [1] 3202    4

5.3 Add a topic keyword (e.g., ovarian)

sql <- paste(
  "SELECT DISTINCT gse.title, gse.gse, gpl.title, gse.submission_date",
  "FROM gse JOIN gse_gpl ON gse_gpl.gse = gse.gse",
  "JOIN gpl ON gse_gpl.gpl = gpl.gpl",
  "WHERE gse.submission_date > '2022-01-01' AND",
  "gse.title LIKE '%ovarian%' AND",
  "gpl.organism LIKE '%Homo sapiens%' AND",
  "gpl.technology LIKE '%high-throughput seq%'"
)
rs <- dbGetQuery(con, sql)
dim(rs)
## [1] 27  4
head(rs[, c('gse','title')])
##         gse
## 1 GSE192898
## 2 GSE197320
## 3 GSE200852
## 4 GSE201223
## 5 GSE201337
## 6 GSE201525
##                                                                                                            title
## 1                                              Single cell 5' RNA sequencing of 9 Korean ovarian cancer patients
## 2                        Therapeutic Targeting of Ovarian Cancer Stem Cells Using Estrogen Receptor Beta Agonist
## 3                                                          The microRNA profiles of ovarian clear cell carcinoma
## 4                 Zinc increases hormone receptor expression in human ovarian granulosa cells and gene profiling
## 5 Investigation of the anti-tumour properties of interferon epsilon in high grade serous ovarian cancer in vivo.
## 6          Investigation of the anti-tumour properties of interferon epsilon in high grade serous ovarian cancer

5.4 Heuristic: require a counts-like supplementary file

sql <- paste(
  "SELECT DISTINCT gse.title, gse.gse, gpl.title, gse.submission_date, gse.supplementary_file",
  "FROM gse JOIN gse_gpl ON gse_gpl.gse = gse.gse",
  "JOIN gpl ON gse_gpl.gpl = gpl.gpl",
  "WHERE gse.submission_date > '2022-01-01' AND",
  "gse.title LIKE '%ovarian%' AND",
  "gpl.organism LIKE '%Homo sapiens%' AND",
  "gpl.technology LIKE '%high-throughput sequencing%'",
  "ORDER BY gse.submission_date DESC"
)
rs <- dbGetQuery(con, sql)
counts_files <- rs$supplementary_file[grep("count|cnt", rs$supplementary_file, ignore.case = TRUE)]
series_of_interest <- rs$gse[grep("count|cnt", rs$supplementary_file, ignore.case = TRUE)]
# Preview filenames
shortened <- unlist(lapply(counts_files, function(x){
  x <- unlist(strsplit(x, ";"))
  x <- x[grep("count|cnt|txt|csv|xlsx", x, ignore.case = TRUE)]
  tail(unlist(strsplit(x, "/")), n = 1)
}))
head(shortened, 10)
##  [1] "GSE252379_samples_attr.txt.gz"                                           
##  [2] "GSE235755_raw_counts_batch_adjusted.txt.gz"                              
##  [3] "GSE231560_OV90_TPM1.counts.txt.gz"                                       
##  [4] "GSE227276_MTF_KD_KURAMOCHI_FT246.sorted.htseq-count.TPM.txt.gz"          
##  [5] "GSE223011_Nijhuis_KURAMOCHI_VC_INDISULAM_humanCounts.tsv.xlsx"           
##  [6] "GSE222982_supplementary_file_estimated_raw_counts_for_all_samples.txt.gz"
##  [7] "GSE212702_RNA_seq_count_table.xlsx"                                      
##  [8] "GSE207094_raw_counts.txt.gz"                                             
##  [9] "GSE202601_human_ovary_snRNA-seq_metadata.txt.gz"                         
## [10] "GSE201337_PDX_IFN_UMI_counts.csv.gz"

5.5 Require a minimum number of samples per series

num_series <- dbGetQuery(con, paste(
  "select * from gsm where series_id in ('",
  paste(series_of_interest, collapse = "','"), "')", sep = ""))

gse.count <- as.data.frame(table(num_series$series_id))
subset(gse.count, Freq > 6)
##        Var1 Freq
## 1 GSE202601   16
## 2 GSE207094   45
## 3 GSE212702   84
## 4 GSE222982   10
## 6 GSE227276   30
## 7 GSE231560   15
## 8 GSE235755   11