update to MSigDB 7.0

igordot · Sep 4, 2019 · bdb19c8 · bdb19c8
1 parent 933e0bd
commit bdb19c8
Show file tree

Hide file tree

Showing 16 changed files with 155 additions and 74 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -4,3 +4,5 @@
 ^cran-comments\.md$
 ^LICENSE\.md$
 ^\.travis\.yml$
+^revdep$
+^CRAN-RELEASE$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,31 +1,31 @@
 Package: msigdbr
 Type: Package
 Title: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format
-Version: 6.2.1
+Version: 7.0.1
 Authors@R: person("Igor", "Dolgalev", email = "igor.dolgalev@nyumc.org", role = c("aut", "cre"))
 Description: Provides the 'Molecular Signatures Database' (MSigDB) gene sets
     typically used with the 'Gene Set Enrichment Analysis' (GSEA) software
     (Subramanian et al. 2005 <doi:10.1073/pnas.0506580102>, Liberzon et al. 2015
     <doi:10.1016/j.cels.2015.12.004>) in a standard R data frame with key-value
-    pairs. Included are the original human gene symbols and Entrez IDs as well
-    as the equivalents for various frequently studied model organisms such as
-    mouse, rat, pig, fly, and yeast.
+    pairs. The package includes the original human gene symbols and NCBI/Entrez
+    IDs as well as the equivalents for frequently studied model organisms such
+    as mouse, rat, pig, fly, and yeast.
 License: MIT + file LICENSE
 Encoding: UTF-8
 URL: https://github.com/igordot/msigdbr
 BugReports: https://github.com/igordot/msigdbr/issues
 LazyData: true
 Depends:
-    R (>= 3.2.0),
-    dplyr (>= 0.7.0),
-    tibble
+    R (>= 3.2.0)
 Imports:
     magrittr,
-    rlang
+    rlang,
+    dplyr (>= 0.7.0),
+    tibble
 Suggests:
     testthat,
     knitr,
     rmarkdown
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 6.1.0
+RoxygenNote: 6.1.1
 VignetteBuilder: knitr
diff --git a/LICENSE b/LICENSE
@@ -1,2 +1,2 @@
-YEAR: 2018
+YEAR: 2018-2019
 COPYRIGHT HOLDER: Igor Dolgalev
diff --git a/LICENSE.md b/LICENSE.md
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2018
+Copyright (c) 2018-2019
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,7 +3,11 @@
 export("%>%")
 export(msigdbr)
 export(msigdbr_show_species)
-import(dplyr)
 import(tibble)
+importFrom(dplyr,arrange)
+importFrom(dplyr,filter)
+importFrom(dplyr,inner_join)
+importFrom(dplyr,pull)
+importFrom(dplyr,select)
 importFrom(magrittr,"%>%")
 importFrom(rlang,.data)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# msigdbr 7.0.1
+
+* Based on MSigDB v7.0 release.
+* Fixed output when selecting multiple collections.
+
 # msigdbr 6.2.1
 
 * Based on MSigDB v6.2 release.

diff --git a/R/functions.R b/R/functions.R
@@ -2,7 +2,7 @@
 #' List the species available in the msigdbr package
 #'
 #' @return a vector of possible species
-#' @import dplyr
+#' @importFrom dplyr pull
 #' @export
 msigdbr_show_species <- function() {
 
@@ -18,7 +18,7 @@ msigdbr_show_species <- function() {
 #'
 #' @return a data frame of gene sets with one gene per row
 #' @import tibble
-#' @import dplyr
+#' @importFrom dplyr filter inner_join arrange select
 #' @export
 #'
 #' @examples
@@ -32,6 +32,10 @@ msigdbr <- function(species = "Homo sapiens", category = NULL, subcategory = NUL
   # filter by species
   orthologs_subset = filter(msigdbr_orthologs, .data$species_name == species)
 
+  if (length(species) > 1) {
+    stop("please specify only one species at a time")
+  }
+
   # confirm that the species exists in the database
   if (nrow(orthologs_subset) == 0) {
     stop("species does not exist in the database: ", species)
@@ -41,11 +45,17 @@ msigdbr <- function(species = "Homo sapiens", category = NULL, subcategory = NUL
 
   # filter by category
   if (is.character(category)) {
+    if (length(category) > 1) {
+      stop("please specify only one category at a time")
+    }
     genesets_subset = filter(genesets_subset, .data$gs_cat == category)
   }
 
   # filter by sub-category
   if (is.character(subcategory)) {
+    if (length(subcategory) > 1) {
+      stop("please specify only one subcategory at a time")
+    }
     genesets_subset = filter(genesets_subset, .data$gs_subcat == subcategory)
   }
 

diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/data-raw/msigdbr-prepare.R b/data-raw/msigdbr-prepare.R
@@ -10,14 +10,14 @@ library(usethis)
 # Import MSigDB gene sets -------------------------------------------------
 
 # Download the MSigDB XML file
-msigdb_version = "6.2"
+msigdb_version = "7.0"
 msigdb_xml = glue("msigdb_v{msigdb_version}.xml")
 msigdb_url_base = "http://software.broadinstitute.org/gsea/msigdb/download_file.jsp?filePath=/resources/msigdb"
 msigdb_xml_url = glue("{msigdb_url_base}/{msigdb_version}/msigdb_v{msigdb_version}.xml")
 download.file(
   url = msigdb_xml_url, destfile = msigdb_xml, quiet = TRUE,
   method = "wget",
-  extra = "-c --header='Cookie: JSESSIONID=DF8191E9C5083E0F32D4F826F46F4889'"
+  extra = "-c --header='Cookie: JSESSIONID=94C597E7CE4BCEF65ADE10782AD067AD'"
 )
 
 # Check MSigDB XML file size in bytes
@@ -38,50 +38,54 @@ file.remove(msigdb_xml)
 # * MEMBERS: list of gene set members as they originally appeared in the source
 # * MEMBERS_SYMBOLIZED: list of gene set members in the form of human gene symbols
 # * MEMBERS_EZID: list of gene set members in the form of human Entrez Gene IDs
+# * MEMBERS_MAPPING: pipe-separated list of entries, each a comma-separated triple in the form of: MEMBERS, MEMBERS_SYMBOLIZED, MEMBERS_EZID
 geneset_records = xml_find_all(msigdb_doc, xpath = ".//GENESET")
 msigdbr_genesets =
   tibble(
     gs_name             = xml_attr(geneset_records, attr = "STANDARD_NAME"),
     gs_id               = xml_attr(geneset_records, attr = "SYSTEMATIC_NAME"),
     gs_cat              = xml_attr(geneset_records, attr = "CATEGORY_CODE"),
     gs_subcat           = xml_attr(geneset_records, attr = "SUB_CATEGORY_CODE"),
-    gs_members_symbols  = xml_attr(geneset_records, attr = "MEMBERS_SYMBOLIZED"),
-    gs_members_ezids    = xml_attr(geneset_records, attr = "MEMBERS_EZID")
+    gs_members          = xml_attr(geneset_records, attr = "MEMBERS_MAPPING")
   ) %>%
   filter(gs_cat != "ARCHIVED")
 
 # Check the number of gene sets per category
 msigdbr_genesets %>% count(gs_cat) %>% arrange(gs_cat)
 
-# Create a table of human genes based on MSigDB gene mappings (for original human genesets)
-human_tbl =
+# Convert to "long" format (one gene per row)
+msigdbr_genesets =
   msigdbr_genesets %>%
-  mutate(
-    species_id = 9606,
-    species_name = "Homo sapiens",
-    human_entrez_gene = strsplit(gs_members_ezids, ","),
-    human_gene_symbol = strsplit(gs_members_symbols, ",")
+  mutate(members = strsplit(gs_members, "\\|")) %>%
+  unnest(members) %>%
+  separate(
+    col = members,
+    into = c("source_gene", "human_gene_symbol", "human_entrez_gene"),
+    sep = ","
   ) %>%
-  unnest(human_entrez_gene, human_gene_symbol) %>%
   mutate(human_entrez_gene = as.integer(human_entrez_gene)) %>%
-  select(species_id, species_name, human_entrez_gene, human_gene_symbol) %>%
+  filter(human_entrez_gene > 0)
+
+# Create a table of human genes based on MSigDB gene mappings
+human_tbl =
+  msigdbr_genesets %>%
+  select(human_entrez_gene, human_gene_symbol) %>%
   distinct() %>%
   mutate(
+    species_name = "Homo sapiens",
     entrez_gene = human_entrez_gene,
     gene_symbol = human_gene_symbol
   )
 
-# Get a list of all MSigDB genes (Entrez IDs)
-msigdb_entrez_genes = human_tbl %>% pull(entrez_gene) %>% sort() %>% unique()
-
-# Convert to "long" format (one gene per row)
+# Clean up
 msigdbr_genesets =
   msigdbr_genesets %>%
-  mutate(human_entrez_gene = strsplit(gs_members_ezids, ",")) %>%
-  unnest(human_entrez_gene) %>%
-  mutate(human_entrez_gene = as.integer(human_entrez_gene)) %>%
-  select(-gs_members_symbols, -gs_members_ezids) %>%
-  arrange(gs_name, human_entrez_gene)
+  select(gs_name, gs_id, gs_cat, gs_subcat, human_entrez_gene) %>%
+  arrange(gs_name, gs_id, human_entrez_gene) %>%
+  distinct()
+
+# Get a list of all MSigDB genes (Entrez IDs)
+msigdb_entrez_genes = msigdbr_genesets %>% pull(human_entrez_gene) %>% sort() %>% unique()
 
 # Import HCOP orthologs ---------------------------------------------------
 
@@ -94,27 +98,26 @@ msigdbr_orthologs =
   hcop %>%
   select(
     human_entrez_gene,
-    human_symbol,
-    ortholog_species,
-    ortholog_species_entrez_gene,
-    ortholog_species_symbol,
-    support
-  ) %>%
-  rename(
     human_gene_symbol = human_symbol,
     species_id = ortholog_species,
     entrez_gene = ortholog_species_entrez_gene,
     gene_symbol = ortholog_species_symbol,
     sources = support
   ) %>%
-  filter(human_entrez_gene != "-") %>%
-  filter(entrez_gene != "-") %>%
-  mutate(human_entrez_gene = as.integer(human_entrez_gene)) %>%
-  mutate(entrez_gene = as.integer(entrez_gene)) %>%
-  filter(gene_symbol != "-") %>%
-  filter(human_entrez_gene %in% msigdb_entrez_genes) %>%
-  mutate(num_sources = str_count(sources, ",") + 1) %>%
-  filter(num_sources > 1)
+  filter(
+    human_entrez_gene != "-",
+    entrez_gene != "-",
+    gene_symbol != "-"
+  ) %>%
+  mutate(
+    human_entrez_gene = as.integer(human_entrez_gene),
+    entrez_gene = as.integer(entrez_gene),
+    num_sources = str_count(sources, ",") + 1
+  ) %>%
+  filter(
+    human_entrez_gene %in% msigdb_entrez_genes,
+    num_sources > 1
+  )
 
 # Names and IDs of common species
 species_tbl =
@@ -134,13 +137,12 @@ species_tbl =
 msigdbr_orthologs %>% pull(species_id) %>% unique() %>% sort()
 
 # Add species names
-msigdbr_orthologs =
-  inner_join(species_tbl, msigdbr_orthologs, by = "species_id") %>%
-  select(starts_with("human"), everything())
+msigdbr_orthologs = inner_join(species_tbl, msigdbr_orthologs, by = "species_id")
 
-# For each gene, only keep the best ortholog (one found in the most databases)
+# For each gene, only keep the best ortholog (found in the most databases)
 msigdbr_orthologs =
   msigdbr_orthologs %>%
+  select(-species_id) %>%
   group_by(human_entrez_gene, species_name) %>%
   top_n(1, num_sources) %>%
   ungroup()
@@ -149,8 +151,12 @@ msigdbr_orthologs =
 msigdbr_orthologs =
   msigdbr_orthologs %>%
   bind_rows(human_tbl) %>%
+  select(
+    human_entrez_gene, human_gene_symbol,
+    species_name, entrez_gene, gene_symbol, sources, num_sources
+  ) %>%
   arrange(species_name, human_gene_symbol) %>%
-  select(-species_id)
+  distinct()
 
 # Prepare package ---------------------------------------------------------
 

diff --git a/revdep/.gitignore b/revdep/.gitignore
@@ -0,0 +1,6 @@
+checks
+library
+checks.noindex
+library.noindex
+data.sqlite
+*.html
diff --git a/revdep/README.md b/revdep/README.md
@@ -0,0 +1,22 @@
+# Platform
+
+|field    |value                        |
+|:--------|:----------------------------|
+|version  |R version 3.6.1 (2019-07-05) |
+|os       |macOS Mojave 10.14.6         |
+|system   |x86_64, darwin15.6.0         |
+|ui       |RStudio                      |
+|language |(EN)                         |
+|collate  |en_US.UTF-8                  |
+|ctype    |en_US.UTF-8                  |
+|tz       |America/New_York             |
+|date     |2019-09-03                   |
+
+# Dependencies
+
+|package |old   |new   |Δ  |
+|:-------|:-----|:-----|:--|
+|msigdbr |6.2.1 |7.0.1 |*  |
+
+# Revdeps
+
diff --git a/revdep/email.yml b/revdep/email.yml
@@ -0,0 +1,5 @@
+release_date: ???
+rel_release_date: ???
+my_news_url: ???
+release_version: ???
+release_details: ???
diff --git a/revdep/failures.md b/revdep/failures.md
@@ -0,0 +1 @@
+*Wow, no problems at all. :)*
diff --git a/revdep/problems.md b/revdep/problems.md
@@ -0,0 +1 @@
+*Wow, no problems at all. :)*
diff --git a/tests/testthat/test-functions.R b/tests/testthat/test-functions.R
@@ -16,16 +16,35 @@ msigdbr_mm = msigdbr(species = "Mus musculus")
 expect_s3_class(msigdbr_mm, "tbl_df")
 expect_gt(nrow(msigdbr_mm), 100000)
 
-msigdbr_h = msigdbr(species = "Homo sapiens", category = "H")
-expect_s3_class(msigdbr_h, "tbl_df")
-expect_gt(nrow(msigdbr_h), 1000)
+msigdbr_hs_h = msigdbr(species = "Homo sapiens", category = "H")
+expect_s3_class(msigdbr_hs_h, "tbl_df")
+expect_gt(nrow(msigdbr_hs_h), 5000)
+expect_equal(length(unique(msigdbr_hs_h$gs_id)), 50)
 
-msigdbr_cp = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP")
-expect_s3_class(msigdbr_cp, "tbl_df")
-expect_gt(nrow(msigdbr_cp), 1000)
+msigdbr_mm_h = msigdbr(species = "Mus musculus", category = "H")
+expect_s3_class(msigdbr_mm_h, "tbl_df")
+expect_gt(nrow(msigdbr_mm_h), 5000)
+expect_equal(length(unique(msigdbr_mm_h$gs_id)), 50)
 
-msigdbr_bp = msigdbr(species = "Homo sapiens", category = "C5", subcategory = "BP")
-expect_s3_class(msigdbr_bp, "tbl_df")
-expect_gt(nrow(msigdbr_bp), 1000)
+msigdbr_hs_cgp = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CGP")
+expect_s3_class(msigdbr_hs_cgp, "tbl_df")
+expect_gt(nrow(msigdbr_hs_cgp), 100000)
+expect_gt(length(unique(msigdbr_hs_cgp$gs_id)), 3000)
+expect_lt(length(unique(msigdbr_hs_cgp$gs_id)), 5000)
 
-expect_error(msigdbr(species = "x"))
+msigdbr_hs_bp = msigdbr(species = "Homo sapiens", category = "C5", subcategory = "BP")
+expect_s3_class(msigdbr_hs_bp, "tbl_df")
+expect_gt(nrow(msigdbr_hs_bp), 100000)
+expect_gt(length(unique(msigdbr_hs_bp$gs_id)), 5000)
+expect_lt(length(unique(msigdbr_hs_bp$gs_id)), 10000)
+
+msigdbr_rn_bp = msigdbr(species = "Rattus norvegicus", category = "C5", subcategory = "BP")
+expect_s3_class(msigdbr_rn_bp, "tbl_df")
+expect_gt(nrow(msigdbr_rn_bp), 100000)
+expect_gt(length(unique(msigdbr_rn_bp$gs_id)), 5000)
+expect_lt(length(unique(msigdbr_rn_bp$gs_id)), 10000)
+
+expect_error(msigdbr(species = "test"))
+expect_error(msigdbr(species = c("Homo sapiens", "Mus musculus")))
+expect_error(msigdbr(species = "Homo sapiens", category = c("C1", "C2")))
+expect_error(msigdbr(species = "Homo sapiens", category = "C2", subcategory = c("CGP", "CP")))