Skip to content

Commit

Permalink
update to MSigDB 7.0
Browse files Browse the repository at this point in the history
  • Loading branch information
igordot committed Sep 4, 2019
1 parent 933e0bd commit bdb19c8
Show file tree
Hide file tree
Showing 16 changed files with 155 additions and 74 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@
^cran-comments\.md$
^LICENSE\.md$
^\.travis\.yml$
^revdep$
^CRAN-RELEASE$
18 changes: 9 additions & 9 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,31 +1,31 @@
Package: msigdbr
Type: Package
Title: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format
Version: 6.2.1
Version: 7.0.1
Authors@R: person("Igor", "Dolgalev", email = "igor.dolgalev@nyumc.org", role = c("aut", "cre"))
Description: Provides the 'Molecular Signatures Database' (MSigDB) gene sets
typically used with the 'Gene Set Enrichment Analysis' (GSEA) software
(Subramanian et al. 2005 <doi:10.1073/pnas.0506580102>, Liberzon et al. 2015
<doi:10.1016/j.cels.2015.12.004>) in a standard R data frame with key-value
pairs. Included are the original human gene symbols and Entrez IDs as well
as the equivalents for various frequently studied model organisms such as
mouse, rat, pig, fly, and yeast.
pairs. The package includes the original human gene symbols and NCBI/Entrez
IDs as well as the equivalents for frequently studied model organisms such
as mouse, rat, pig, fly, and yeast.
License: MIT + file LICENSE
Encoding: UTF-8
URL: https://github.com/igordot/msigdbr
BugReports: https://github.com/igordot/msigdbr/issues
LazyData: true
Depends:
R (>= 3.2.0),
dplyr (>= 0.7.0),
tibble
R (>= 3.2.0)
Imports:
magrittr,
rlang
rlang,
dplyr (>= 0.7.0),
tibble
Suggests:
testthat,
knitr,
rmarkdown
Roxygen: list(markdown = TRUE)
RoxygenNote: 6.1.0
RoxygenNote: 6.1.1
VignetteBuilder: knitr
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
YEAR: 2018
YEAR: 2018-2019
COPYRIGHT HOLDER: Igor Dolgalev
2 changes: 1 addition & 1 deletion LICENSE.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2018
Copyright (c) 2018-2019

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
6 changes: 5 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
export("%>%")
export(msigdbr)
export(msigdbr_show_species)
import(dplyr)
import(tibble)
importFrom(dplyr,arrange)
importFrom(dplyr,filter)
importFrom(dplyr,inner_join)
importFrom(dplyr,pull)
importFrom(dplyr,select)
importFrom(magrittr,"%>%")
importFrom(rlang,.data)
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# msigdbr 7.0.1

* Based on MSigDB v7.0 release.
* Fixed output when selecting multiple collections.

# msigdbr 6.2.1

* Based on MSigDB v6.2 release.
Expand Down
14 changes: 12 additions & 2 deletions R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#' List the species available in the msigdbr package
#'
#' @return a vector of possible species
#' @import dplyr
#' @importFrom dplyr pull
#' @export
msigdbr_show_species <- function() {

Expand All @@ -18,7 +18,7 @@ msigdbr_show_species <- function() {
#'
#' @return a data frame of gene sets with one gene per row
#' @import tibble
#' @import dplyr
#' @importFrom dplyr filter inner_join arrange select
#' @export
#'
#' @examples
Expand All @@ -32,6 +32,10 @@ msigdbr <- function(species = "Homo sapiens", category = NULL, subcategory = NUL
# filter by species
orthologs_subset = filter(msigdbr_orthologs, .data$species_name == species)

if (length(species) > 1) {
stop("please specify only one species at a time")
}

# confirm that the species exists in the database
if (nrow(orthologs_subset) == 0) {
stop("species does not exist in the database: ", species)
Expand All @@ -41,11 +45,17 @@ msigdbr <- function(species = "Homo sapiens", category = NULL, subcategory = NUL

# filter by category
if (is.character(category)) {
if (length(category) > 1) {
stop("please specify only one category at a time")
}
genesets_subset = filter(genesets_subset, .data$gs_cat == category)
}

# filter by sub-category
if (is.character(subcategory)) {
if (length(subcategory) > 1) {
stop("please specify only one subcategory at a time")
}
genesets_subset = filter(genesets_subset, .data$gs_subcat == subcategory)
}

Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
90 changes: 48 additions & 42 deletions data-raw/msigdbr-prepare.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ library(usethis)
# Import MSigDB gene sets -------------------------------------------------

# Download the MSigDB XML file
msigdb_version = "6.2"
msigdb_version = "7.0"
msigdb_xml = glue("msigdb_v{msigdb_version}.xml")
msigdb_url_base = "http://software.broadinstitute.org/gsea/msigdb/download_file.jsp?filePath=/resources/msigdb"
msigdb_xml_url = glue("{msigdb_url_base}/{msigdb_version}/msigdb_v{msigdb_version}.xml")
download.file(
url = msigdb_xml_url, destfile = msigdb_xml, quiet = TRUE,
method = "wget",
extra = "-c --header='Cookie: JSESSIONID=DF8191E9C5083E0F32D4F826F46F4889'"
extra = "-c --header='Cookie: JSESSIONID=94C597E7CE4BCEF65ADE10782AD067AD'"
)

# Check MSigDB XML file size in bytes
Expand All @@ -38,50 +38,54 @@ file.remove(msigdb_xml)
# * MEMBERS: list of gene set members as they originally appeared in the source
# * MEMBERS_SYMBOLIZED: list of gene set members in the form of human gene symbols
# * MEMBERS_EZID: list of gene set members in the form of human Entrez Gene IDs
# * MEMBERS_MAPPING: pipe-separated list of gene set members, each in the comma-separated form: MEMBERS, MEMBERS_SYMBOLIZED, MEMBERS_EZID
geneset_records = xml_find_all(msigdb_doc, xpath = ".//GENESET")
msigdbr_genesets =
tibble(
gs_name = xml_attr(geneset_records, attr = "STANDARD_NAME"),
gs_id = xml_attr(geneset_records, attr = "SYSTEMATIC_NAME"),
gs_cat = xml_attr(geneset_records, attr = "CATEGORY_CODE"),
gs_subcat = xml_attr(geneset_records, attr = "SUB_CATEGORY_CODE"),
gs_members_symbols = xml_attr(geneset_records, attr = "MEMBERS_SYMBOLIZED"),
gs_members_ezids = xml_attr(geneset_records, attr = "MEMBERS_EZID")
gs_members = xml_attr(geneset_records, attr = "MEMBERS_MAPPING")
) %>%
filter(gs_cat != "ARCHIVED")

# Check the number of gene sets per category
msigdbr_genesets %>% count(gs_cat) %>% arrange(gs_cat)

# Create a table of human genes based on MSigDB gene mappings (for original human genesets)
human_tbl =
# Convert to "long" format (one gene per row)
msigdbr_genesets =
msigdbr_genesets %>%
mutate(
species_id = 9606,
species_name = "Homo sapiens",
human_entrez_gene = strsplit(gs_members_ezids, ","),
human_gene_symbol = strsplit(gs_members_symbols, ",")
mutate(members = strsplit(gs_members, "\\|")) %>%
unnest(members) %>%
separate(
col = members,
into = c("source_gene", "human_gene_symbol", "human_entrez_gene"),
sep = ","
) %>%
unnest(human_entrez_gene, human_gene_symbol) %>%
mutate(human_entrez_gene = as.integer(human_entrez_gene)) %>%
select(species_id, species_name, human_entrez_gene, human_gene_symbol) %>%
filter(human_entrez_gene > 0)

# Create a table of human genes based on MSigDB gene mappings
human_tbl =
msigdbr_genesets %>%
select(human_entrez_gene, human_gene_symbol) %>%
distinct() %>%
mutate(
species_name = "Homo sapiens",
entrez_gene = human_entrez_gene,
gene_symbol = human_gene_symbol
)

# Get a list of all MSigDB genes (Entrez IDs)
msigdb_entrez_genes = human_tbl %>% pull(entrez_gene) %>% sort() %>% unique()

# Convert to "long" format (one gene per row)
# Clean up
msigdbr_genesets =
msigdbr_genesets %>%
mutate(human_entrez_gene = strsplit(gs_members_ezids, ",")) %>%
unnest(human_entrez_gene) %>%
mutate(human_entrez_gene = as.integer(human_entrez_gene)) %>%
select(-gs_members_symbols, -gs_members_ezids) %>%
arrange(gs_name, human_entrez_gene)
select(gs_name, gs_id, gs_cat, gs_subcat, human_entrez_gene) %>%
arrange(gs_name, gs_id, human_entrez_gene) %>%
distinct()

# Get a list of all MSigDB genes (Entrez IDs)
msigdb_entrez_genes = msigdbr_genesets %>% pull(human_entrez_gene) %>% sort() %>% unique()

# Import HCOP orthologs ---------------------------------------------------

Expand All @@ -94,27 +98,26 @@ msigdbr_orthologs =
hcop %>%
select(
human_entrez_gene,
human_symbol,
ortholog_species,
ortholog_species_entrez_gene,
ortholog_species_symbol,
support
) %>%
rename(
human_gene_symbol = human_symbol,
species_id = ortholog_species,
entrez_gene = ortholog_species_entrez_gene,
gene_symbol = ortholog_species_symbol,
sources = support
) %>%
filter(human_entrez_gene != "-") %>%
filter(entrez_gene != "-") %>%
mutate(human_entrez_gene = as.integer(human_entrez_gene)) %>%
mutate(entrez_gene = as.integer(entrez_gene)) %>%
filter(gene_symbol != "-") %>%
filter(human_entrez_gene %in% msigdb_entrez_genes) %>%
mutate(num_sources = str_count(sources, ",") + 1) %>%
filter(num_sources > 1)
filter(
human_entrez_gene != "-",
entrez_gene != "-",
gene_symbol != "-"
) %>%
mutate(
human_entrez_gene = as.integer(human_entrez_gene),
entrez_gene = as.integer(entrez_gene),
num_sources = str_count(sources, ",") + 1
) %>%
filter(
human_entrez_gene %in% msigdb_entrez_genes,
num_sources > 1
)

# Names and IDs of common species
species_tbl =
Expand All @@ -134,13 +137,12 @@ species_tbl =
msigdbr_orthologs %>% pull(species_id) %>% unique() %>% sort()

# Add species names
msigdbr_orthologs =
inner_join(species_tbl, msigdbr_orthologs, by = "species_id") %>%
select(starts_with("human"), everything())
msigdbr_orthologs = inner_join(species_tbl, msigdbr_orthologs, by = "species_id")

# For each gene, only keep the best ortholog (one found in the most databases)
# For each gene, only keep the best ortholog (found in the most databases)
msigdbr_orthologs =
msigdbr_orthologs %>%
select(-species_id) %>%
group_by(human_entrez_gene, species_name) %>%
top_n(1, num_sources) %>%
ungroup()
Expand All @@ -149,8 +151,12 @@ msigdbr_orthologs =
msigdbr_orthologs =
msigdbr_orthologs %>%
bind_rows(human_tbl) %>%
select(
human_entrez_gene, human_gene_symbol,
species_name, entrez_gene, gene_symbol, sources, num_sources
) %>%
arrange(species_name, human_gene_symbol) %>%
select(-species_id)
distinct()

# Prepare package ---------------------------------------------------------

Expand Down
6 changes: 6 additions & 0 deletions revdep/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
checks
library
checks.noindex
library.noindex
data.sqlite
*.html
22 changes: 22 additions & 0 deletions revdep/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Platform

|field |value |
|:--------|:----------------------------|
|version |R version 3.6.1 (2019-07-05) |
|os |macOS Mojave 10.14.6 |
|system |x86_64, darwin15.6.0 |
|ui |RStudio |
|language |(EN) |
|collate |en_US.UTF-8 |
|ctype |en_US.UTF-8 |
|tz |America/New_York |
|date |2019-09-03 |

# Dependencies

|package |old |new |Δ |
|:-------|:-----|:-----|:--|
|msigdbr |6.2.1 |7.0.1 |* |

# Revdeps

5 changes: 5 additions & 0 deletions revdep/email.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
release_date: ???
rel_release_date: ???
my_news_url: ???
release_version: ???
release_details: ???
1 change: 1 addition & 0 deletions revdep/failures.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*Wow, no problems at all. :)*
1 change: 1 addition & 0 deletions revdep/problems.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*Wow, no problems at all. :)*
39 changes: 29 additions & 10 deletions tests/testthat/test-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,35 @@ msigdbr_mm = msigdbr(species = "Mus musculus")
expect_s3_class(msigdbr_mm, "tbl_df")
expect_gt(nrow(msigdbr_mm), 100000)

msigdbr_h = msigdbr(species = "Homo sapiens", category = "H")
expect_s3_class(msigdbr_h, "tbl_df")
expect_gt(nrow(msigdbr_h), 1000)
msigdbr_hs_h = msigdbr(species = "Homo sapiens", category = "H")
expect_s3_class(msigdbr_hs_h, "tbl_df")
expect_gt(nrow(msigdbr_hs_h), 5000)
expect_equal(length(unique(msigdbr_hs_h$gs_id)), 50)

msigdbr_cp = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP")
expect_s3_class(msigdbr_cp, "tbl_df")
expect_gt(nrow(msigdbr_cp), 1000)
msigdbr_mm_h = msigdbr(species = "Mus musculus", category = "H")
expect_s3_class(msigdbr_mm_h, "tbl_df")
expect_gt(nrow(msigdbr_mm_h), 5000)
expect_equal(length(unique(msigdbr_mm_h$gs_id)), 50)

msigdbr_bp = msigdbr(species = "Homo sapiens", category = "C5", subcategory = "BP")
expect_s3_class(msigdbr_bp, "tbl_df")
expect_gt(nrow(msigdbr_bp), 1000)
msigdbr_hs_cgp = msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CGP")
expect_s3_class(msigdbr_hs_cgp, "tbl_df")
expect_gt(nrow(msigdbr_hs_cgp), 100000)
expect_gt(length(unique(msigdbr_hs_cgp$gs_id)), 3000)
expect_lt(length(unique(msigdbr_hs_cgp$gs_id)), 5000)

expect_error(msigdbr(species = "x"))
msigdbr_hs_bp = msigdbr(species = "Homo sapiens", category = "C5", subcategory = "BP")
expect_s3_class(msigdbr_hs_bp, "tbl_df")
expect_gt(nrow(msigdbr_hs_bp), 100000)
expect_gt(length(unique(msigdbr_hs_bp$gs_id)), 5000)
expect_lt(length(unique(msigdbr_hs_bp$gs_id)), 10000)

msigdbr_rn_bp = msigdbr(species = "Rattus norvegicus", category = "C5", subcategory = "BP")
expect_s3_class(msigdbr_rn_bp, "tbl_df")
expect_gt(nrow(msigdbr_rn_bp), 100000)
expect_gt(length(unique(msigdbr_rn_bp$gs_id)), 5000)
expect_lt(length(unique(msigdbr_rn_bp$gs_id)), 10000)

expect_error(msigdbr(species = "test"))
expect_error(msigdbr(species = c("Homo sapiens", "Mus musculus")))
expect_error(msigdbr(species = "Homo sapiens", category = c("C1", "C2")))
expect_error(msigdbr(species = "Homo sapiens", category = "C2", subcategory = c("CGP", "CP")))
Loading

0 comments on commit bdb19c8

Please sign in to comment.