title | author | date | output |
---|---|---|---|
Interferome 2.0 web scraping |
Slim Fourati |
06 May, 2019 |
github_documents |
Load packages
suppressPackageStartupMessages(library(package = "knitr"))
suppressPackageStartupMessages(library(package = "httr"))
suppressPackageStartupMessages(library(package = "rvest"))
suppressPackageStartupMessages(library(package = "parallel"))
suppressPackageStartupMessages(library(package = "biomaRt"))
suppressPackageStartupMessages(library(package = "tidyverse"))
Set default options/variables
workDir <- dirname(getwd())
opts_chunk$set(tidy = FALSE, fig.path = "../figure/")
options(stringsAsFactors = FALSE)
Identify all experiment ids in interferome 2.0
expIds <- NULL
for (i in 1:125) {
url <- paste0("http://www.interferome.org/interferome/pubdata/",
"viewExperiment.jspx?experiment.id=",
i)
expPage <- GET(url = url)
# does page contains a experiment (0 if empty)
flag <- html_nodes(content(expPage),
xpath=".//div[contains(@class, 'data_field_title')]") %>%
length()
if (flag > 1) {
# print(i)
expIds <- c(expIds, i)
Sys.sleep(time = 1)
}
}
# number of experiments in Interferome 2.0
print(length(expIds))
## [1] 101
Identify datasets and interferon subtypes of each experiment
subtypeDF <- NULL
for (expId in expIds) {
# print(expId)
url <- paste0("http://interferome.its.monash.edu.au/",
"interferome/pubdata/listDatasets.jspx?experiment.id=",
expId)
expPage <- GET(url = url)
# get dataset ids
datasetLS <- html_nodes(content(expPage),
xpath=".//div[contains(@class, 'ds_link')]") %>% # get dataset div
html_nodes(css = "a") %>% # get hyperlink
html_attr(name = "href") %>% # get website url
gsub(pattern = ".+dataset.id=", replacement = "")
if (length(datasetLS) > 0) {
tab <- content(expPage) %>%
html_nodes(css = "table") %>%
html_table() %>%
as.data.frame() %>%
filter(X1 != "Dataset Name") %>% # remove header
mutate(InterferonType = gsub(pattern = "Interferon Type (I+).+",
replacement = "\\1",
X2),
InterferonSybtype = gsub(pattern = ".+Interferon SubType ([a-zA-Z]+).+",
replacement = "\\1",
X2),
Species = gsub(pattern = ".+Species ([a-zA-Z]+).+",
replacement = "\\1",
X2),
DatasetID = datasetLS,
ExperimentID = expId) %>%
select(ExperimentID, DatasetID, Species, InterferonType, InterferonSybtype)
} else {
tab <- data.frame(ExperimentID = expId,
DatasetID = NA,
Species = NA,
InterferonType = NA,
InterferonSybtype = NA)
}
subtypeDF <- rbind(subtypeDF, tab)
Sys.sleep(time = 2)
}
# correct typo for experiment 10
subtypeDF <- subtypeDF %>%
mutate(Species = ifelse(test = ExperimentID %in% 100,
yes = "Homo",
no = Species))
subtypeDF %>%
filter(!is.na(DatasetID)) %>%
group_by(Species, InterferonType, InterferonSybtype) %>%
summarize(nb.ds = n()) %>%
print()
## # A tibble: 7 x 4
## # Groups: Species, InterferonType [5]
## Species InterferonType InterferonSybtype nb.ds
## <chr> <chr> <chr> <int>
## 1 Homo I IFNalpha 69
## 2 Homo I IFNbeta 37
## 3 Homo II IFNgamma 68
## 4 Homo III IFNlambda 22
## 5 Mus I IFNalpha 54
## 6 Mus I IFNbeta 30
## 7 Mus II IFNgamma 42
Fetch genes for each experiment/dataset
subtypeTemp <- filter(subtypeDF, !is.na(DatasetID))
interferomeLS <- mclapply(1:nrow(subtypeTemp), FUN = function(i) {
expId <- subtypeTemp$ExperimentID[i]
dsId <- subtypeTemp$DatasetID[i]
url <- paste0("http://interferome.its.monash.edu.au/interferome/",
"pubdata/viewDataset.jspx?experiment.id=",
expId,
"&dataset.id=",
dsId,
"&pageSize=200")
# identify total number of pages
expPage <- GET(url = url)
totalPgs <- html_nodes(content(expPage),
xpath=".//span[contains(@class, 'total')]") %>%
html_text() %>%
gsub(pattern = "Total | Pages", replacement = "") %>%
as.numeric()
Sys.sleep(time = 2)
# get gene table for each experiment/dataset
geneDF <- NULL
for (currentPgs in 1:totalPgs) {
# print(paste0("exp=", expId,
# " ds=", dsId,
# " page=", currentPgs, "/", totalPgs))
pageUrl <- paste0(url, "&pageNo=", currentPgs)
pageHTML <- GET(url = pageUrl)
tab <- content(expPage) %>%
html_nodes(xpath=".//table[contains(@class, 'dataset_result_tab')]") %>%
html_table() %>%
as.data.frame() %>%
filter(X1 != "id") %>%# remove header
mutate(ExperimentID = expId,
DatasetID = dsId)
geneDF <- rbind(geneDF, tab)
Sys.sleep(time = 2)
}
return(value = geneDF)
}, mc.cores = detectCores() - 1)
# save interferome db
save(interferomeLS, file = file.path(workDir, "output/interferome.RData"))
# print number of genes in each dataset
do.call(what = rbind, args = interferomeLS) %>%
mutate(X2 = as.numeric(X2),
signlog2FC = ifelse(test = sign(X2) %in% 1,
yes = "UP",
no = "DN")) %>%
group_by(ExperimentID, DatasetID,signlog2FC) %>%
summarize(n = n()) %>%
spread(signlog2FC, n) %>%
print()
## # A tibble: 322 x 4
## # Groups: ExperimentID, DatasetID [322]
## ExperimentID DatasetID DN UP
## <int> <chr> <int> <int>
## 1 8 63 3360 640
## 2 8 64 2125 1275
## 3 8 8 480 520
## 4 11 10 264 336
## 5 11 9 60 84
## 6 12 11 309 291
## 7 13 12 1390 610
## 8 13 13 594 606
## 9 13 14 550 450
## 10 13 15 520 480
## # … with 312 more rows
Create interferome GMT
gmtDF <- do.call(what = rbind, args = interferomeLS) %>%
merge(y = select(subtypeDF, ExperimentID, DatasetID, Species, InterferonSybtype),
by = c("ExperimentID", "DatasetID"))
# convert mouse genes to human genes
human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")
mouse2human <- getLDS(attributes = "ensembl_gene_id",
filters = "ensembl_gene_id",
values = gmtDF$X8,
mart = mouse,
attributesL = c("ensembl_gene_id", "hgnc_symbol"),
martL = human,
uniqueRows = TRUE)
id2gene <- gather(mouse2human, cname, id, -HGNC.symbol) %>%
select(-cname)
gmtDF <- merge(x = gmtDF, y = id2gene, by.x = "X8", by.y = "id")
gmtLS <- gmtDF %>%
mutate(X2 = as.numeric(X2)) %>%
filter(abs(X2) >= 2) %>%
mutate(InterferonSybtype = ifelse(test = X2 > 0,
yes = paste0("INTERFEROME_", InterferonSybtype, "_UP"),
no = paste0("INTERFEROME_", InterferonSybtype, "_DN")),
InterferonSybtype = toupper(InterferonSybtype)) %>%
select(HGNC.symbol, InterferonSybtype) %>%
distinct() %>%
unstack()
# write gmt
gmtFile <- file.path(workDir, "utils/interferome.gmt")
if (file.exists(gmtFile)) {
file.remove(gmtFile)
}
## [1] TRUE
for (i in 1:length(gmtLS)) {
write(paste(names(gmtLS)[i],
NA,
paste(gmtLS[[i]], collapse = "\t"),
sep = "\t"),
file = gmtFile,
append = TRUE)
}
# print number of genes passing abs(logFC) above 2 per interferon subtype
stack(gmtLS) %>%
rename(gsname = ind) %>%
group_by(gsname) %>%
summarize(n = n()) %>%
print()
## # A tibble: 8 x 2
## gsname n
## <fct> <int>
## 1 INTERFEROME_IFNALPHA_DN 375
## 2 INTERFEROME_IFNALPHA_UP 476
## 3 INTERFEROME_IFNBETA_DN 295
## 4 INTERFEROME_IFNBETA_UP 322
## 5 INTERFEROME_IFNGAMMA_DN 415
## 6 INTERFEROME_IFNGAMMA_UP 488
## 7 INTERFEROME_IFNLAMBDA_DN 6
## 8 INTERFEROME_IFNLAMBDA_UP 10
Print session info
sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin18.2.0 (64-bit)
## Running under: macOS Mojave 10.14.4
##
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.5/lib/libopenblasp-r0.3.5.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.0.1 purrr_0.3.2
## [5] readr_1.3.1 tidyr_0.8.3 tibble_2.1.1 ggplot2_3.1.1
## [9] tidyverse_1.2.1 biomaRt_2.38.0 rvest_0.3.3 xml2_1.2.0
## [13] httr_1.4.0 knitr_1.22
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 lubridate_1.7.4 lattice_0.20-38
## [4] prettyunits_1.0.2 utf8_1.1.4 assertthat_0.2.1
## [7] digest_0.6.18 R6_2.4.0 cellranger_1.1.0
## [10] plyr_1.8.4 backports_1.1.4 stats4_3.5.3
## [13] RSQLite_2.1.1 evaluate_0.13 pillar_1.3.1
## [16] rlang_0.3.4 progress_1.2.0 lazyeval_0.2.2
## [19] curl_3.3 readxl_1.3.1 rstudioapi_0.10
## [22] blob_1.1.1 S4Vectors_0.20.1 selectr_0.4-1
## [25] RCurl_1.95-4.12 bit_1.1-14 munsell_0.5.0
## [28] broom_0.5.2 compiler_3.5.3 modelr_0.1.4
## [31] xfun_0.6 pkgconfig_2.0.2 BiocGenerics_0.28.0
## [34] tidyselect_0.2.5 IRanges_2.16.0 XML_3.98-1.19
## [37] fansi_0.4.0 crayon_1.3.4 withr_2.1.2
## [40] bitops_1.0-6 grid_3.5.3 nlme_3.1-139
## [43] jsonlite_1.6 gtable_0.3.0 DBI_1.0.0
## [46] magrittr_1.5 scales_1.0.0 cli_1.1.0
## [49] stringi_1.4.3 generics_0.0.2 tools_3.5.3
## [52] bit64_0.9-7 Biobase_2.42.0 glue_1.3.1
## [55] hms_0.4.2 AnnotationDbi_1.44.0 colorspace_1.4-1
## [58] memoise_1.1.0 haven_2.1.0