Skip to content

Latest commit

 

History

History
327 lines (295 loc) · 10.1 KB

20190426_Fluomics.interferome.md

File metadata and controls

327 lines (295 loc) · 10.1 KB
title author date output
Interferome 2.0 web scraping
Slim Fourati
06 May, 2019
github_documents

Load packages

suppressPackageStartupMessages(library(package = "knitr"))
suppressPackageStartupMessages(library(package = "httr"))
suppressPackageStartupMessages(library(package = "rvest"))
suppressPackageStartupMessages(library(package = "parallel"))
suppressPackageStartupMessages(library(package = "biomaRt"))
suppressPackageStartupMessages(library(package = "tidyverse"))

Set default options/variables

workDir <- dirname(getwd())
opts_chunk$set(tidy = FALSE, fig.path = "../figure/")
options(stringsAsFactors = FALSE)

Identify all experiment ids in interferome 2.0

expIds <- NULL
for (i in 1:125) {
  url <- paste0("http://www.interferome.org/interferome/pubdata/",
		"viewExperiment.jspx?experiment.id=",
		i)
  expPage <- GET(url = url)
  # does page contains a experiment (0 if empty) 
  flag <- html_nodes(content(expPage),
		     xpath=".//div[contains(@class, 'data_field_title')]") %>%
    length()
  if (flag > 1) {
    # print(i)
    expIds <- c(expIds, i)
    Sys.sleep(time = 1)
  }
}
# number of experiments in Interferome 2.0 
print(length(expIds))
## [1] 101

Identify datasets and interferon subtypes of each experiment

subtypeDF <- NULL
for (expId in expIds) {
  # print(expId)
  url <- paste0("http://interferome.its.monash.edu.au/",
		"interferome/pubdata/listDatasets.jspx?experiment.id=",
		expId)
  expPage <- GET(url = url)
  # get dataset ids
  datasetLS <- html_nodes(content(expPage),
		     xpath=".//div[contains(@class, 'ds_link')]") %>% # get dataset div
    html_nodes(css = "a") %>% # get hyperlink
    html_attr(name = "href") %>% # get website url
    gsub(pattern = ".+dataset.id=", replacement = "")
  if (length(datasetLS) > 0) {
    tab <- content(expPage) %>%
      html_nodes(css = "table") %>%
      html_table() %>%
      as.data.frame() %>%
      filter(X1 != "Dataset Name") %>% # remove header
    mutate(InterferonType    = gsub(pattern     = "Interferon Type (I+).+",
				    replacement = "\\1",
				    X2),
	   InterferonSybtype = gsub(pattern     = ".+Interferon SubType ([a-zA-Z]+).+",
				    replacement = "\\1",
				    X2),
	   Species           = gsub(pattern     = ".+Species ([a-zA-Z]+).+",
				    replacement = "\\1",
				    X2),
	   DatasetID         = datasetLS,
	   ExperimentID      = expId) %>%
    select(ExperimentID, DatasetID, Species, InterferonType, InterferonSybtype)
  } else {
    tab <- data.frame(ExperimentID      = expId,
		      DatasetID         = NA,
		      Species           = NA,
		      InterferonType    = NA,
		      InterferonSybtype = NA)
  }
  subtypeDF <- rbind(subtypeDF, tab)
  Sys.sleep(time = 2)
}
# correct typo for experiment 10
subtypeDF <- subtypeDF %>%
  mutate(Species = ifelse(test = ExperimentID %in% 100,
			  yes  = "Homo",
			  no   = Species))
subtypeDF %>%
  filter(!is.na(DatasetID)) %>%
  group_by(Species, InterferonType, InterferonSybtype) %>%
  summarize(nb.ds = n()) %>%
  print()
## # A tibble: 7 x 4
## # Groups:   Species, InterferonType [5]
##   Species InterferonType InterferonSybtype nb.ds
##   <chr>   <chr>          <chr>             <int>
## 1 Homo    I              IFNalpha             69
## 2 Homo    I              IFNbeta              37
## 3 Homo    II             IFNgamma             68
## 4 Homo    III            IFNlambda            22
## 5 Mus     I              IFNalpha             54
## 6 Mus     I              IFNbeta              30
## 7 Mus     II             IFNgamma             42

Fetch genes for each experiment/dataset

subtypeTemp <- filter(subtypeDF, !is.na(DatasetID))

interferomeLS <- mclapply(1:nrow(subtypeTemp), FUN = function(i) {
  expId <- subtypeTemp$ExperimentID[i]
  dsId <- subtypeTemp$DatasetID[i]
  url <- paste0("http://interferome.its.monash.edu.au/interferome/",
		"pubdata/viewDataset.jspx?experiment.id=",
		expId,
		"&dataset.id=",
		dsId,
		"&pageSize=200")
  # identify total number of pages
  expPage <- GET(url = url)
  totalPgs <- html_nodes(content(expPage),
			 xpath=".//span[contains(@class, 'total')]") %>%
    html_text() %>%
    gsub(pattern = "Total | Pages", replacement = "") %>%
    as.numeric()
  Sys.sleep(time = 2)

  # get gene table for each experiment/dataset
  geneDF <- NULL
  for (currentPgs in 1:totalPgs) {
    # print(paste0("exp=", expId,
    #              " ds=", dsId,
    #  		   " page=", currentPgs, "/", totalPgs))
    pageUrl <- paste0(url, "&pageNo=", currentPgs)
    pageHTML <- GET(url = pageUrl)
    tab <- content(expPage) %>%
      html_nodes(xpath=".//table[contains(@class, 'dataset_result_tab')]") %>%
      html_table() %>%
      as.data.frame() %>%
      filter(X1 != "id") %>%# remove header
      mutate(ExperimentID = expId,
	     DatasetID    = dsId)
    geneDF <- rbind(geneDF, tab)
    Sys.sleep(time = 2)
  }
  return(value = geneDF)
}, mc.cores = detectCores() - 1)
# save interferome db
save(interferomeLS, file = file.path(workDir, "output/interferome.RData"))
# print number of genes in each dataset
do.call(what = rbind, args = interferomeLS) %>%
  mutate(X2 = as.numeric(X2),
	 signlog2FC = ifelse(test = sign(X2) %in% 1,
			     yes  = "UP",
			     no   = "DN")) %>%
  group_by(ExperimentID, DatasetID,signlog2FC) %>%
  summarize(n = n()) %>%
  spread(signlog2FC, n) %>%
  print()
## # A tibble: 322 x 4
## # Groups:   ExperimentID, DatasetID [322]
##    ExperimentID DatasetID    DN    UP
##           <int> <chr>     <int> <int>
##  1            8 63         3360   640
##  2            8 64         2125  1275
##  3            8 8           480   520
##  4           11 10          264   336
##  5           11 9            60    84
##  6           12 11          309   291
##  7           13 12         1390   610
##  8           13 13          594   606
##  9           13 14          550   450
## 10           13 15          520   480
## # … with 312 more rows

Create interferome GMT

gmtDF <- do.call(what = rbind, args = interferomeLS) %>%
  merge(y = select(subtypeDF, ExperimentID, DatasetID, Species, InterferonSybtype),
	by = c("ExperimentID", "DatasetID"))

# convert mouse genes to human genes
human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")

mouse2human <- getLDS(attributes  = "ensembl_gene_id",
		      filters     = "ensembl_gene_id",
		      values      = gmtDF$X8,
		      mart        = mouse,
		      attributesL = c("ensembl_gene_id", "hgnc_symbol"),
		      martL       = human,
		      uniqueRows  = TRUE)
id2gene <- gather(mouse2human, cname, id, -HGNC.symbol) %>%
  select(-cname)

gmtDF <- merge(x = gmtDF, y = id2gene, by.x = "X8", by.y = "id")

gmtLS <- gmtDF %>%
  mutate(X2 = as.numeric(X2)) %>%
  filter(abs(X2) >= 2) %>%
  mutate(InterferonSybtype = ifelse(test = X2 > 0,
				    yes  = paste0("INTERFEROME_", InterferonSybtype, "_UP"),
				    no   = paste0("INTERFEROME_", InterferonSybtype, "_DN")),
	 InterferonSybtype = toupper(InterferonSybtype)) %>%
  select(HGNC.symbol, InterferonSybtype) %>%
  distinct() %>%
  unstack()

# write gmt
gmtFile <- file.path(workDir, "utils/interferome.gmt")
if (file.exists(gmtFile)) {
  file.remove(gmtFile)
}
## [1] TRUE
for (i in 1:length(gmtLS)) {
  write(paste(names(gmtLS)[i],
	      NA,
	      paste(gmtLS[[i]], collapse = "\t"),
	      sep = "\t"),
	file   = gmtFile,
	append = TRUE)
}

# print number of genes passing abs(logFC) above 2 per interferon subtype
stack(gmtLS) %>%
  rename(gsname = ind) %>%
  group_by(gsname) %>%
  summarize(n = n()) %>%
  print()
## # A tibble: 8 x 2
##   gsname                       n
##   <fct>                    <int>
## 1 INTERFEROME_IFNALPHA_DN    375
## 2 INTERFEROME_IFNALPHA_UP    476
## 3 INTERFEROME_IFNBETA_DN     295
## 4 INTERFEROME_IFNBETA_UP     322
## 5 INTERFEROME_IFNGAMMA_DN    415
## 6 INTERFEROME_IFNGAMMA_UP    488
## 7 INTERFEROME_IFNLAMBDA_DN     6
## 8 INTERFEROME_IFNLAMBDA_UP    10

Print session info

sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin18.2.0 (64-bit)
## Running under: macOS Mojave 10.14.4
## 
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.5/lib/libopenblasp-r0.3.5.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] forcats_0.4.0   stringr_1.4.0   dplyr_0.8.0.1   purrr_0.3.2    
##  [5] readr_1.3.1     tidyr_0.8.3     tibble_2.1.1    ggplot2_3.1.1  
##  [9] tidyverse_1.2.1 biomaRt_2.38.0  rvest_0.3.3     xml2_1.2.0     
## [13] httr_1.4.0      knitr_1.22     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.1           lubridate_1.7.4      lattice_0.20-38     
##  [4] prettyunits_1.0.2    utf8_1.1.4           assertthat_0.2.1    
##  [7] digest_0.6.18        R6_2.4.0             cellranger_1.1.0    
## [10] plyr_1.8.4           backports_1.1.4      stats4_3.5.3        
## [13] RSQLite_2.1.1        evaluate_0.13        pillar_1.3.1        
## [16] rlang_0.3.4          progress_1.2.0       lazyeval_0.2.2      
## [19] curl_3.3             readxl_1.3.1         rstudioapi_0.10     
## [22] blob_1.1.1           S4Vectors_0.20.1     selectr_0.4-1       
## [25] RCurl_1.95-4.12      bit_1.1-14           munsell_0.5.0       
## [28] broom_0.5.2          compiler_3.5.3       modelr_0.1.4        
## [31] xfun_0.6             pkgconfig_2.0.2      BiocGenerics_0.28.0 
## [34] tidyselect_0.2.5     IRanges_2.16.0       XML_3.98-1.19       
## [37] fansi_0.4.0          crayon_1.3.4         withr_2.1.2         
## [40] bitops_1.0-6         grid_3.5.3           nlme_3.1-139        
## [43] jsonlite_1.6         gtable_0.3.0         DBI_1.0.0           
## [46] magrittr_1.5         scales_1.0.0         cli_1.1.0           
## [49] stringi_1.4.3        generics_0.0.2       tools_3.5.3         
## [52] bit64_0.9-7          Biobase_2.42.0       glue_1.3.1          
## [55] hms_0.4.2            AnnotationDbi_1.44.0 colorspace_1.4-1    
## [58] memoise_1.1.0        haven_2.1.0