From b80654415834cf519b34ccbc7ee65030b92aebd3 Mon Sep 17 00:00:00 2001
From: Mauro Masiero <mauromiguelm@hotmail.com>
Date: Tue, 17 Oct 2023 15:40:36 +0200
Subject: [PATCH 1/9] deprecate GSET_SPARSEG_XL

---
 R/compute2-genesets.R | 2 +-
 R/pgx-signature.R     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/compute2-genesets.R b/R/compute2-genesets.R
index 343b3499..b94363d5 100644
--- a/R/compute2-genesets.R
+++ b/R/compute2-genesets.R
@@ -59,7 +59,7 @@ compute_testGenesets <- function(pgx,
   ## Load huge geneset matrix
   ## -----------------------------------------------------------
 
-  G <- playdata::GSET_SPARSEG_XL
+  G <- playdata::GSETxGENE
 
   ## -----------------------------------------------------------
   ## Filter genes
diff --git a/R/pgx-signature.R b/R/pgx-signature.R
index 9593f158..108e3dfa 100644
--- a/R/pgx-signature.R
+++ b/R/pgx-signature.R
@@ -569,7 +569,7 @@ pgx.addEnrichmentSignaturesH5 <- function(h5.file, X = NULL, mc.cores = 0,
 
   ## ---------------------- ONLY HALLMARK FOR NOW -----------------------
 
-  G <- playdata::GSET_SPARSEG_XL
+  G <- playdata::GSETxGENE
   sel <- grep("HALLMARK|C[1-9]|^GO", rownames(G))
   sel <- grep("HALLMARK", rownames(G))
   genes <- intersect(colnames(G), rownames(X))

From d0bfb5d65f854151fa55cd199dc274079fb63b33 Mon Sep 17 00:00:00 2001
From: Mauro Masiero <mauromiguelm@hotmail.com>
Date: Tue, 17 Oct 2023 16:11:33 +0200
Subject: [PATCH 2/9] simplify collections

---
 R/pgx-functions.R | 44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

diff --git a/R/pgx-functions.R b/R/pgx-functions.R
index 3f343b7a..9af9db91 100644
--- a/R/pgx-functions.R
+++ b/R/pgx-functions.R
@@ -1656,41 +1656,37 @@ pgx.getGeneFamilies <- function(genes, min.size = 10, max.size = 500) {
 #' @return A list containing the extracted gene set collections.
 #'
 #' @export
-pgx.getGeneSetCollections <- function(gsets, min.size = 10, max.size = 500) {
+pgx.getGeneSetCollections <- function(gsets = rownames(playdata::GSETxGENE)) {
   ## -----------------------------------------------------------------------------
   ## Gene set collections
   ## -----------------------------------------------------------------------------
 
-  collections <- list(
-    "Hallmark collection" = gsets[grep("HALLMARK", gsets)],
-    "Pathway related" = gsets[grep("pathway", gsets, ignore.case = TRUE)],
-    "Metabolism related" = gsets[grep("metaboli", gsets, ignore.case = TRUE)],
-    "Signalling related" = gsets[grep("signal", gsets, ignore.case = TRUE)],
-    "T-cell related" = gsets[grep("tcell|t-cell|t[ ]cell", gsets, ignore.case = TRUE)],
-    "B-cell related" = gsets[grep("bcell]b-cell|b[ ]cell", gsets, ignore.case = TRUE)],
-    "Response related" = gsets[grep("response", gsets, ignore.case = TRUE)],
-    "Cancer related" = gsets[grep("cancer", gsets, ignore.case = TRUE)],
-    "Immune related" = gsets[grep("immune", gsets, ignore.case = TRUE)],
-    "Cell differentiation" = gsets[grep("differentiation", gsets, ignore.case = TRUE)],
-    "Checkpoint related" = gsets[grep("checkpoint", gsets, ignore.case = TRUE)],
-    "IL gene sets" = gsets[grep("IL[1-9]{1,2}", gsets, ignore.case = TRUE)],
-    "Aging" = gsets[grep("aging", gsets, ignore.case = TRUE)],
-    "Disease" = gsets[grep("jensen|disease|covid|diabetes", gsets, ignore.case = TRUE)],
-  )
+  collections <- list()
+
+  # collections <- append(
+  #   collections,
+  #   "Hallmark collection" = gsets[grep("HALLMARK", gsets)],
+  #   "Pathway related" = gsets[grep("pathway", gsets, ignore.case = TRUE)],
+  #   "Metabolism related" = gsets[grep("metaboli", gsets, ignore.case = TRUE)],
+  #   "Signalling related" = gsets[grep("signal", gsets, ignore.case = TRUE)],
+  #   "T-cell related" = gsets[grep("tcell|t-cell|t[ ]cell", gsets, ignore.case = TRUE)],
+  #   "B-cell related" = gsets[grep("bcell]b-cell|b[ ]cell", gsets, ignore.case = TRUE)],
+  #   "Response related" = gsets[grep("response", gsets, ignore.case = TRUE)],
+  #   "Cancer related" = gsets[grep("cancer", gsets, ignore.case = TRUE)],
+  #   "Immune related" = gsets[grep("immune", gsets, ignore.case = TRUE)],
+  #   "Cell differentiation" = gsets[grep("differentiation", gsets, ignore.case = TRUE)],
+  #   "Checkpoint related" = gsets[grep("checkpoint", gsets, ignore.case = TRUE)],
+  #   "IL gene sets" = gsets[grep("IL[1-9]{1,2}", gsets, ignore.case = TRUE)],
+  #   "Aging" = gsets[grep("aging", gsets, ignore.case = TRUE)],
+  #   "Disease" = gsets[grep("jensen|disease|covid|diabetes", gsets, ignore.case = TRUE)],
+  # )
 
   collections[["<all>"]] <- gsets ## X is sorted
-  collections <- collections[which(sapply(collections, length) >= 10)]
-  collections <- collections[order(names(collections))]
 
   ## ----------- add main collections from gene set prefixes
   gsets.db <- sub(":.*", "", gsets)
   gsets.groups <- tapply(gsets, gsets.db, list)
   collections <- c(collections, gsets.groups)
-
-  ## ----------- filter on size
-  nsize <- sapply(collections, length)
-  sel <- which(nsize >= min.size & nsize < max.size)
-  collections <- collections[sel]
   return(collections)
 }
 

From 5a3ebd05f26cb5218aabe9d19fd807c6f4bbd46b Mon Sep 17 00:00:00 2001
From: Mauro Masiero <mauromiguelm@hotmail.com>
Date: Wed, 18 Oct 2023 09:55:05 +0200
Subject: [PATCH 3/9] fix issue with <all> collections

---
 R/pgx-functions.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pgx-functions.R b/R/pgx-functions.R
index 9af9db91..56d73a6b 100644
--- a/R/pgx-functions.R
+++ b/R/pgx-functions.R
@@ -1681,12 +1681,12 @@ pgx.getGeneSetCollections <- function(gsets = rownames(playdata::GSETxGENE)) {
   #   "Disease" = gsets[grep("jensen|disease|covid|diabetes", gsets, ignore.case = TRUE)],
   # )
 
-  collections[["<all>"]] <- gsets ## X is sorted
-
+  
   ## ----------- add main collections from gene set prefixes
   gsets.db <- sub(":.*", "", gsets)
   gsets.groups <- tapply(gsets, gsets.db, list)
   collections <- c(collections, gsets.groups)
+  collections[["<all>"]] <- gsets
   return(collections)
 }
 

From 365db1d65a420c982865d540b93cf44047ea0159 Mon Sep 17 00:00:00 2001
From: Mauro Masiero <mauromiguelm@hotmail.com>
Date: Thu, 19 Oct 2023 13:24:03 +0200
Subject: [PATCH 4/9] update getgenesetcollections

---
 R/pgx-functions.R | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/R/pgx-functions.R b/R/pgx-functions.R
index 56d73a6b..cf04aa99 100644
--- a/R/pgx-functions.R
+++ b/R/pgx-functions.R
@@ -1662,28 +1662,9 @@ pgx.getGeneSetCollections <- function(gsets = rownames(playdata::GSETxGENE)) {
   ## -----------------------------------------------------------------------------
 
   collections <- list()
-
-  # collections <- append(
-  #   collections,
-  #   "Hallmark collection" = gsets[grep("HALLMARK", gsets)],
-  #   "Pathway related" = gsets[grep("pathway", gsets, ignore.case = TRUE)],
-  #   "Metabolism related" = gsets[grep("metaboli", gsets, ignore.case = TRUE)],
-  #   "Signalling related" = gsets[grep("signal", gsets, ignore.case = TRUE)],
-  #   "T-cell related" = gsets[grep("tcell|t-cell|t[ ]cell", gsets, ignore.case = TRUE)],
-  #   "B-cell related" = gsets[grep("bcell]b-cell|b[ ]cell", gsets, ignore.case = TRUE)],
-  #   "Response related" = gsets[grep("response", gsets, ignore.case = TRUE)],
-  #   "Cancer related" = gsets[grep("cancer", gsets, ignore.case = TRUE)],
-  #   "Immune related" = gsets[grep("immune", gsets, ignore.case = TRUE)],
-  #   "Cell differentiation" = gsets[grep("differentiation", gsets, ignore.case = TRUE)],
-  #   "Checkpoint related" = gsets[grep("checkpoint", gsets, ignore.case = TRUE)],
-  #   "IL gene sets" = gsets[grep("IL[1-9]{1,2}", gsets, ignore.case = TRUE)],
-  #   "Aging" = gsets[grep("aging", gsets, ignore.case = TRUE)],
-  #   "Disease" = gsets[grep("jensen|disease|covid|diabetes", gsets, ignore.case = TRUE)],
-  # )
-
   
   ## ----------- add main collections from gene set prefixes
-  gsets.db <- sub(":.*", "", gsets)
+  gsets.db <- sub("_.*", "", gsets)
   gsets.groups <- tapply(gsets, gsets.db, list)
   collections <- c(collections, gsets.groups)
   collections[["<all>"]] <- gsets

From 2c0875316b8ebec1d0e73ba3e3b67e82c276d616 Mon Sep 17 00:00:00 2001
From: mauromiguelm <mauromiguelm@users.noreply.github.com>
Date: Tue, 24 Oct 2023 09:14:24 +0000
Subject: [PATCH 5/9] Style code (GHA)

---
 R/pgx-functions.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pgx-functions.R b/R/pgx-functions.R
index cf04aa99..49811d09 100644
--- a/R/pgx-functions.R
+++ b/R/pgx-functions.R
@@ -1662,7 +1662,7 @@ pgx.getGeneSetCollections <- function(gsets = rownames(playdata::GSETxGENE)) {
   ## -----------------------------------------------------------------------------
 
   collections <- list()
-  
+
   ## ----------- add main collections from gene set prefixes
   gsets.db <- sub("_.*", "", gsets)
   gsets.groups <- tapply(gsets, gsets.db, list)

From 6aa2975a55b0886cacf1b53a64a81f84924bb28d Mon Sep 17 00:00:00 2001
From: Mauro Masiero <mauromiguelm@hotmail.com>
Date: Tue, 24 Oct 2023 11:46:05 +0200
Subject: [PATCH 6/9] accommodate old pgx

---
 R/pgx-functions.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/pgx-functions.R b/R/pgx-functions.R
index 49811d09..7ec01513 100644
--- a/R/pgx-functions.R
+++ b/R/pgx-functions.R
@@ -1665,6 +1665,7 @@ pgx.getGeneSetCollections <- function(gsets = rownames(playdata::GSETxGENE)) {
 
   ## ----------- add main collections from gene set prefixes
   gsets.db <- sub("_.*", "", gsets)
+  gsets.db <- sub(":.*", "", gsets.db)
   gsets.groups <- tapply(gsets, gsets.db, list)
   collections <- c(collections, gsets.groups)
   collections[["<all>"]] <- gsets

From 14357bf381b9a8de36cedc03f549830d69b0c6ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Escrib=C3=A0=20Montagut?=
 <xavier.escriba.montagut@gmail.com>
Date: Tue, 24 Oct 2023 14:31:45 +0200
Subject: [PATCH 7/9] feat: skip row check for matrix read

---
 R/pgx-functions.R | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/R/pgx-functions.R b/R/pgx-functions.R
index 7ec01513..985c149c 100644
--- a/R/pgx-functions.R
+++ b/R/pgx-functions.R
@@ -733,6 +733,8 @@ read.as_matrix.SAVE <- function(file) {
 #' Read data file as matrix
 #'
 #' @param file Path to input data file
+#' @param skip_row_check (default `FALSE`) Flag to skip the removal
+#' of empty rows
 #'
 #' @return Matrix object containing data from file
 #'
@@ -749,7 +751,7 @@ read.as_matrix.SAVE <- function(file) {
 #' mymatrix <- read.as_matrix(mydata.csv)
 #' }
 #' @export
-read.as_matrix <- function(file) {
+read.as_matrix <- function(file, skip_row_check = FALSE) {
   ## read delimited table automatically determine separator. allow
   ## duplicated rownames. This implements with faster fread.
   x0 <- data.table::fread(
@@ -771,9 +773,11 @@ read.as_matrix <- function(file) {
     rownames(x) <- x0[[1]][sel]
   }
   ## drop any rows with 100% missing value (sometimes added by not-so-Excel...)
-  zero.row <- which(rowSums(is.na(x)) == ncol(x))
-  if (length(zero.row)) {
-    x <- x[-zero.row, , drop = FALSE]
+  if(!skip_row_check) { # Flag to bypass (used on contrast.csv ingest), as it can contain full NA rows
+    zero.row <- which(rowSums(is.na(x)) == ncol(x))
+    if (length(zero.row)) {
+      x <- x[-zero.row, , drop = FALSE]
+    }
   }
   ## drop any 100% missing columns (sometimes added by not-so-Excel...)
   zero.col <- which(colSums(is.na(x)) == nrow(x))

From 4fbf19ef3ae81de87f0446073713269ebf218416 Mon Sep 17 00:00:00 2001
From: mauromiguelm <mauromiguelm@users.noreply.github.com>
Date: Tue, 24 Oct 2023 13:32:51 +0000
Subject: [PATCH 8/9] Style code (GHA)

---
 R/pgx-functions.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pgx-functions.R b/R/pgx-functions.R
index 985c149c..0f3a6aa8 100644
--- a/R/pgx-functions.R
+++ b/R/pgx-functions.R
@@ -773,7 +773,7 @@ read.as_matrix <- function(file, skip_row_check = FALSE) {
     rownames(x) <- x0[[1]][sel]
   }
   ## drop any rows with 100% missing value (sometimes added by not-so-Excel...)
-  if(!skip_row_check) { # Flag to bypass (used on contrast.csv ingest), as it can contain full NA rows
+  if (!skip_row_check) { # Flag to bypass (used on contrast.csv ingest), as it can contain full NA rows
     zero.row <- which(rowSums(is.na(x)) == ncol(x))
     if (length(zero.row)) {
       x <- x[-zero.row, , drop = FALSE]

From 514a7dd9abcd5d182f09a4aac185ecb063cc3472 Mon Sep 17 00:00:00 2001
From: Mauro Masiero <mauromiguelm@hotmail.com>
Date: Tue, 24 Oct 2023 18:06:55 +0200
Subject: [PATCH 9/9] document

---
 man/calc.edge.similarity.Rd                  |  4 +-
 man/check_duplicate_cols.Rd                  |  6 +-
 man/compute_cellcycle_gender.Rd              |  4 +-
 man/compute_deconvolution.Rd                 |  4 +-
 man/compute_drugActivityEnrichment.Rd        |  4 +-
 man/compute_drugSensitivityEnrichment.Rd     |  4 +-
 man/compute_extra.Rd                         |  9 +--
 man/compute_omicsGraphs.Rd                   |  4 +-
 man/createSparseGenesetMatrix.Rd             |  3 +-
 man/dot-RGimputation.Rd                      |  2 +-
 man/expandAnnotationMatrix.Rd                |  2 +-
 man/expandPhenoMatrix.Rd                     | 12 +---
 man/fit.weibull2.Rd                          |  2 +-
 man/getGOgraph.Rd                            |  6 +-
 man/graph.cut_crossings.Rd                   |  2 +-
 man/hclustGraph.Rd                           |  6 +-
 man/itercluster_louvain.Rd                   |  2 +-
 man/length_encode.Rd                         |  2 +-
 man/mat2gmt.Rd                               | 23 +++++++
 man/mixHivePlot.Rd                           |  2 +-
 man/pgx.SeuratBatchIntegrate.Rd              |  4 +-
 man/pgx._addSourceSink.Rd                    |  2 +-
 man/pgx.addEnrichmentSignaturesH5.Rd         |  4 +-
 man/pgx.computeConnectivityScores.Rd         | 12 +---
 man/pgx.computeCoreGOgraph.Rd                | 10 +--
 man/pgx.computePGX.Rd                        |  6 +-
 man/pgx.computePathscores.Rd                 |  4 +-
 man/pgx.correlateSignatureH5.inmemory.Rd     | 71 --------------------
 man/pgx.createCreedsSigDB.Rd                 | 32 ---------
 man/pgx.createOmicsGraph.Rd                  |  2 +-
 man/pgx.createSignatureDatabaseH5.Rd         |  2 +-
 man/pgx.getGeneSetCollections.Rd             |  2 +-
 man/pgx.getSigGO.Rd                          |  2 +-
 man/pgx.makeAutoContrasts.Rd                 |  1 +
 man/pgx.poolCells.Rd                         |  6 +-
 man/pgx.readOptions.Rd                       |  6 +-
 man/pgx.saveMatrixH5.Rd                      |  2 +-
 man/pgx.survivalVariableImportance.Rd        |  2 +-
 man/pgx.testPhenoCorrelation.Rd              |  2 +-
 man/pgx.testTCGAsurvival.Rd                  |  6 +-
 man/{pgx.updateInfoPGX.Rd => pgxinfo.add.Rd} | 10 +--
 man/pgxinfo.delete.Rd                        | 21 ++++++
 man/pgxinfo.needUpdate.Rd                    |  2 +-
 man/pgxinfo.read.Rd                          |  2 +-
 man/pgxinfo.updateDatasetFolder.Rd           |  2 +-
 man/plot_ggscatter.Rd                        |  8 ++-
 man/prot.nmfImpute.Rd                        |  2 +-
 man/read.as_matrix.Rd                        |  5 +-
 man/seurat2pgx.Rd                            |  2 +-
 man/sigdb.getConnectivityMatrix.Rd           |  8 +--
 man/tagged.hamming.Rd                        |  8 +--
 man/tcosine_similarity.Rd                    |  2 +-
 man/uscale.Rd                                |  2 +-
 53 files changed, 147 insertions(+), 208 deletions(-)
 create mode 100644 man/mat2gmt.Rd
 delete mode 100644 man/pgx.correlateSignatureH5.inmemory.Rd
 delete mode 100644 man/pgx.createCreedsSigDB.Rd
 rename man/{pgx.updateInfoPGX.Rd => pgxinfo.add.Rd} (84%)
 create mode 100644 man/pgxinfo.delete.Rd

diff --git a/man/calc.edge.similarity.Rd b/man/calc.edge.similarity.Rd
index d4ee6d64..23ee67fe 100644
--- a/man/calc.edge.similarity.Rd
+++ b/man/calc.edge.similarity.Rd
@@ -46,8 +46,8 @@ Parallel processing improves performance for large graphs.
 }}
 \examples{
 \dontrun{
-ee <- matrix(c(1,2, 2,3), ncol=2) # example edge matrix
-X <- matrix(rnorm(100*50), 100, 50) # expression matrix 
+ee <- matrix(c(1, 2, 2, 3), ncol = 2) # example edge matrix
+X <- matrix(rnorm(100 * 50), 100, 50) # expression matrix
 w <- calc.edge.similarity(ee, X)
 }
 }
diff --git a/man/check_duplicate_cols.Rd b/man/check_duplicate_cols.Rd
index b84328c5..0efb598d 100644
--- a/man/check_duplicate_cols.Rd
+++ b/man/check_duplicate_cols.Rd
@@ -25,7 +25,7 @@ Logical indicating output of the checks.
 Checks if a data frame or matrix contains duplicate column names.
 }
 \details{
-This function takes a data frame or matrix \code{data} as input and checks if it contains any duplicate column names. 
+This function takes a data frame or matrix \code{data} as input and checks if it contains any duplicate column names.
 It compares the column names against each other to look for duplicates.
 
 The output is a logical value indicating whether any duplicate names were found.
@@ -44,12 +44,12 @@ This can be used to validate data before further analysis, to ensure no columns
 }}
 \examples{
 \dontrun{
-data <- data.frame(A = 1:3, B = 4:6, A = 7:9) 
+data <- data.frame(A = 1:3, B = 4:6, A = 7:9)
 check_duplicate_cols(data)
 # Returns TRUE
 
 data <- data.frame(A = 1:3, B = 4:6, C = 7:9)
-check_duplicate_cols(data) 
+check_duplicate_cols(data)
 # Returns FALSE
 }
 }
diff --git a/man/compute_cellcycle_gender.Rd b/man/compute_cellcycle_gender.Rd
index ac6e9ccb..3897d3f7 100644
--- a/man/compute_cellcycle_gender.Rd
+++ b/man/compute_cellcycle_gender.Rd
@@ -4,10 +4,10 @@
 \alias{compute_cellcycle_gender}
 \title{Compute Cell Cycle and Gender Inference}
 \usage{
-compute_cellcycle_gender(ngs, rna.counts = ngs$counts)
+compute_cellcycle_gender(pgx, rna.counts = pgx$counts)
 }
 \arguments{
-\item{ngs}{An object containing the input data for analysis.}
+\item{pgx}{An object containing the input data for analysis.}
 
 \item{rna.counts}{A matrix or data frame of RNA expression counts.
 Defaults to the counts in the input object.}
diff --git a/man/compute_deconvolution.Rd b/man/compute_deconvolution.Rd
index a1dbd96b..62133876 100644
--- a/man/compute_deconvolution.Rd
+++ b/man/compute_deconvolution.Rd
@@ -4,10 +4,10 @@
 \alias{compute_deconvolution}
 \title{Compute Deconvolution}
 \usage{
-compute_deconvolution(ngs, rna.counts = ngs$counts, full = FALSE)
+compute_deconvolution(pgx, rna.counts = pgx$counts, full = FALSE)
 }
 \arguments{
-\item{ngs}{An object containing the input data for analysis.}
+\item{pgx}{An object containing the input data for analysis.}
 
 \item{rna.counts}{A matrix or data frame of RNA expression counts. Defaults to the counts in the input object.}
 
diff --git a/man/compute_drugActivityEnrichment.Rd b/man/compute_drugActivityEnrichment.Rd
index bbb9ccc3..bd0c4258 100644
--- a/man/compute_drugActivityEnrichment.Rd
+++ b/man/compute_drugActivityEnrichment.Rd
@@ -4,10 +4,10 @@
 \alias{compute_drugActivityEnrichment}
 \title{Compute Drug Activity Enrichment}
 \usage{
-compute_drugActivityEnrichment(ngs, libx.dir = NULL)
+compute_drugActivityEnrichment(pgx, libx.dir = NULL)
 }
 \arguments{
-\item{ngs}{An object containing the input data for analysis.}
+\item{pgx}{An object containing the input data for analysis.}
 
 \item{libx.dir}{The directory path where the drug activity databases are located.
 This is required if calling the function compute_full_drugActivityEnrichment.}
diff --git a/man/compute_drugSensitivityEnrichment.Rd b/man/compute_drugSensitivityEnrichment.Rd
index fa2d3e5d..d774a835 100644
--- a/man/compute_drugSensitivityEnrichment.Rd
+++ b/man/compute_drugSensitivityEnrichment.Rd
@@ -4,10 +4,10 @@
 \alias{compute_drugSensitivityEnrichment}
 \title{Compute Drug Sensitivity Enrichment}
 \usage{
-compute_drugSensitivityEnrichment(ngs, libx.dir = NULL)
+compute_drugSensitivityEnrichment(pgx, libx.dir = NULL)
 }
 \arguments{
-\item{ngs}{An object containing the input data for analysis.}
+\item{pgx}{An object containing the input data for analysis.}
 
 \item{libx.dir}{The directory path where the drug sensitivity databases are located.}
 }
diff --git a/man/compute_extra.Rd b/man/compute_extra.Rd
index a95fb3dc..aa28511b 100644
--- a/man/compute_extra.Rd
+++ b/man/compute_extra.Rd
@@ -5,14 +5,15 @@
 \title{Compute Extra Analysis}
 \usage{
 compute_extra(
-  ngs,
-  extra = c("meta.go", "deconv", "infer", "drugs", "connectivity", "wordcloud", "wgcna"),
+  pgx,
+  extra = c("meta.go", "infer", "deconv", "drugs", "connectivity", "wordcloud", "wgcna"),
   sigdb = NULL,
-  libx.dir = NULL
+  libx.dir = NULL,
+  pgx.dir = NULL
 )
 }
 \arguments{
-\item{ngs}{An object containing the input data for analysis.}
+\item{pgx}{An object containing the input data for analysis.}
 
 \item{extra}{A character vector specifying which additional analyses to perform.}
 
diff --git a/man/compute_omicsGraphs.Rd b/man/compute_omicsGraphs.Rd
index c9fa8fbc..3f558f9d 100644
--- a/man/compute_omicsGraphs.Rd
+++ b/man/compute_omicsGraphs.Rd
@@ -4,10 +4,10 @@
 \alias{compute_omicsGraphs}
 \title{Compute Omics Graphs}
 \usage{
-compute_omicsGraphs(ngs)
+compute_omicsGraphs(pgx)
 }
 \arguments{
-\item{ngs}{An object containing the input data for analysis.}
+\item{pgx}{An object containing the input data for analysis.}
 }
 \value{
 An updated object with omics graphs and path scores.
diff --git a/man/createSparseGenesetMatrix.Rd b/man/createSparseGenesetMatrix.Rd
index d2339fa7..27542a66 100644
--- a/man/createSparseGenesetMatrix.Rd
+++ b/man/createSparseGenesetMatrix.Rd
@@ -8,7 +8,8 @@ createSparseGenesetMatrix(
   gmt.all,
   min.geneset.size = 15,
   max.geneset.size = 500,
-  min_gene_frequency = 10
+  min_gene_frequency = 10,
+  filter_genes = TRUE
 )
 }
 \arguments{
diff --git a/man/dot-RGimputation.Rd b/man/dot-RGimputation.Rd
index 85fc1ab5..aafd6120 100644
--- a/man/dot-RGimputation.Rd
+++ b/man/dot-RGimputation.Rd
@@ -30,7 +30,7 @@ If \code{bycolumn} is FALSE, the parameters are calculated across the entire mat
 }
 \examples{
 \dontrun{
-mat <- matrix(rnorm(100), ncol = 10)  
+mat <- matrix(rnorm(100), ncol = 10)
 mat[sample(100, 20)] <- NA
 imputed <- .RGimputation(mat)
 }
diff --git a/man/expandAnnotationMatrix.Rd b/man/expandAnnotationMatrix.Rd
index b1d2c7a9..5821d821 100644
--- a/man/expandAnnotationMatrix.Rd
+++ b/man/expandAnnotationMatrix.Rd
@@ -19,6 +19,6 @@ Expands a phenotype annotation matrix into dummy variables.
 This function takes an annotation data frame and expands any categorical variables into
 dummy variables using model.matrix.
 
-For each column, it determines if the variable is numeric or categorical. 
+For each column, it determines if the variable is numeric or categorical.
 Numeric variables are ranked. Categorical variables are expanded into dummy variables.
 }
diff --git a/man/expandPhenoMatrix.Rd b/man/expandPhenoMatrix.Rd
index 17485451..7a565be6 100644
--- a/man/expandPhenoMatrix.Rd
+++ b/man/expandPhenoMatrix.Rd
@@ -4,13 +4,11 @@
 \alias{expandPhenoMatrix}
 \title{Expand phenotype matrix}
 \usage{
-expandPhenoMatrix(pheno, collapse = TRUE, drop.ref = TRUE)
+expandPhenoMatrix(pheno, drop.ref = TRUE)
 }
 \arguments{
 \item{pheno}{Data frame containing the phenotype variables.}
 
-\item{collapse}{Logical indicating whether to collapse factor levels below a frequency threshold.}
-
 \item{drop.ref}{Logical indicating whether to drop the reference level for each factor.}
 }
 \value{
@@ -22,12 +20,8 @@ dropping the reference level.
 }
 \details{
 This function takes a phenotype data matrix and expands any categorical variables into
-dummy variables, while optionally collapsing rare factor levels and dropping the reference level.
+dummy variables.
 
 For each column, it determines if the variable is numeric or categorical. Numeric variables are
-ranked. Categorical variables are expanded into dummy variables using \code{model.matrix}.
-
-If \code{collapse = TRUE}, it will collapse together factor levels that occur below a frequency
-threshold. If \code{drop.ref = TRUE}, it will drop the reference level for each factor when
-creating the dummy variables.
+dichotomized. Categorical variables are expanded into dummy variables using \code{model.matrix}.
 }
diff --git a/man/fit.weibull2.Rd b/man/fit.weibull2.Rd
index 3abd7385..d63d3ca9 100644
--- a/man/fit.weibull2.Rd
+++ b/man/fit.weibull2.Rd
@@ -22,7 +22,7 @@ Fit a Weibull distribution to bivariate data
 
 This function takes two numeric vectors \code{x} and \code{y} and fits a bivariate Weibull distribution.
 The Weibull distribution has shape and scale parameters for each variable.
- 
+
 Maximum likelihood estimation is used to estimate the 4 Weibull parameters from the input data.
 The estimated parameters are returned as a named list.
 }
diff --git a/man/getGOgraph.Rd b/man/getGOgraph.Rd
index ef9cfde2..13146605 100644
--- a/man/getGOgraph.Rd
+++ b/man/getGOgraph.Rd
@@ -16,8 +16,8 @@ This function creates a graph representation of the Gene Ontology (GO) hierarchy
 The function first extracts the GO terms and their associated information from the GOTERM table in the GO.db package.
 Then, it removes any duplicate terms and sets the row names of the resulting data frame to the GO IDs.
 
-Next, the function extracts the parent-child relationships for each of the three GO 
-domains (Biological Process, Molecular Function, and Cellular Component) from the 
-corresponding tables in the GO.db package. It combines these relationships into a single data.frame 
+Next, the function extracts the parent-child relationships for each of the three GO
+domains (Biological Process, Molecular Function, and Cellular Component) from the
+corresponding tables in the GO.db package. It combines these relationships into a single data.frame
 and creates a graph object using the igraph package.
 }
diff --git a/man/graph.cut_crossings.Rd b/man/graph.cut_crossings.Rd
index f068d349..21440c25 100644
--- a/man/graph.cut_crossings.Rd
+++ b/man/graph.cut_crossings.Rd
@@ -22,7 +22,7 @@ Cuts inter-cluster edges in a graph based on node membership
 \details{
 Cut graph crossings
 
-This function takes an igraph graph object \code{g} and a vector of node membership indices \code{idx}. 
+This function takes an igraph graph object \code{g} and a vector of node membership indices \code{idx}.
 It identifies edges that connect nodes of different clusters based on \code{idx}.
 Any edges with weight less than \code{max.wt} that link across clusters are removed from the graph.
 The resulting pruned graph is returned.
diff --git a/man/hclustGraph.Rd b/man/hclustGraph.Rd
index a9333cbf..d7b5f430 100644
--- a/man/hclustGraph.Rd
+++ b/man/hclustGraph.Rd
@@ -23,13 +23,13 @@ Performs hierarchical clustering of nodes in a graph using the Louvain algorithm
 Hierarchical Clustering of Graph
 
 
-This function takes an igraph graph object \code{g} and performs hierarchical clustering 
+This function takes an igraph graph object \code{g} and performs hierarchical clustering
 of the nodes using the Louvain community detection algorithm.
 
 It iteratively clusters the nodes, splitting communities into sub-communities to find
-the optimal number of clusters. The number of clusters \code{k} can be specified, 
+the optimal number of clusters. The number of clusters \code{k} can be specified,
 otherwise the optimal number is found automatically.
 
-The clustering is done in parallel using \code{mc.cores} cores if available. 
+The clustering is done in parallel using \code{mc.cores} cores if available.
 The output is a matrix with the hierarchical clustering membership at each level.
 }
diff --git a/man/itercluster_louvain.Rd b/man/itercluster_louvain.Rd
index d639116b..e18d2ec0 100644
--- a/man/itercluster_louvain.Rd
+++ b/man/itercluster_louvain.Rd
@@ -24,7 +24,7 @@ This function implements the Louvain algorithm for community detection on a grap
 It takes an igraph graph object \code{g} and performs \code{n} iterations of Louvain clustering.
 In each iteration it groups nodes into communities that maximize modularity.
 
-The algorithm optimizes modularity in a greedy fashion by first assigning each node to its own community. 
+The algorithm optimizes modularity in a greedy fashion by first assigning each node to its own community.
 It then goes through nodes repeatedly to evaluate moving them to neighboring communities. If a move increases modularity it is accepted.
 This local optimization is applied iteratively to hierarchically build communities.
 
diff --git a/man/length_encode.Rd b/man/length_encode.Rd
index c5292dc2..5d2cc7a4 100644
--- a/man/length_encode.Rd
+++ b/man/length_encode.Rd
@@ -31,6 +31,6 @@ The output is a sparse matrix with rows corresponding to the input \code{x} and
 \examples{
 \dontrun{
 x <- rpois(100)
-len <- length_encode(x)  
+len <- length_encode(x)
 }
 }
diff --git a/man/mat2gmt.Rd b/man/mat2gmt.Rd
new file mode 100644
index 00000000..ab2406b6
--- /dev/null
+++ b/man/mat2gmt.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gset-gsea.r
+\name{mat2gmt}
+\alias{mat2gmt}
+\title{Convert binary matrix to GMT list}
+\usage{
+mat2gmt(mat)
+}
+\arguments{
+\item{mat}{A matrix with non-zero entries representing genes in
+each gene set where rows represent genes and columns represent
+gene sets.}
+}
+\value{
+A list of vectors, one per gene set. Each list
+  element corresponds to a gene set and is a vector of genes
+}
+\description{
+This function converts a binary matrix to a GMT (Gene Matrix
+Transposed) list. The binary matrix indicates the presence or
+absence of genes in each gene set, where rows represent genes and
+columns represent gene sets.
+}
diff --git a/man/mixHivePlot.Rd b/man/mixHivePlot.Rd
index f9c9595a..49baaa6d 100644
--- a/man/mixHivePlot.Rd
+++ b/man/mixHivePlot.Rd
@@ -38,7 +38,7 @@ The number of node labels can be reduced by setting \code{numlab} to avoid overp
 }
 \section{Functions}{
 \itemize{
-\item \code{mixPlotLoadings()}: function generates a hive plot visualization of variable loadings 
+\item \code{mixPlotLoadings()}: function generates a hive plot visualization of variable loadings
 from a lmer model result object.
 
 }}
diff --git a/man/pgx.SeuratBatchIntegrate.Rd b/man/pgx.SeuratBatchIntegrate.Rd
index edff827d..f905b92a 100644
--- a/man/pgx.SeuratBatchIntegrate.Rd
+++ b/man/pgx.SeuratBatchIntegrate.Rd
@@ -30,10 +30,10 @@ Seurat object containing integrated data
 Integrate single-cell RNA-seq data from multiple batches using canonical correlation analysis via the Seurat package.
 }
 \details{
-This function takes a single-cell count matrix \code{counts} and a \code{batch} vector as input. 
+This function takes a single-cell count matrix \code{counts} and a \code{batch} vector as input.
 It sets up a Seurat object for each batch and integrates them using FindIntegrationAnchors/IntegrateData functions.
 
-Cells can be filtered by QC metrics like mitochondrial content if \code{qc.filter=TRUE}. 
+Cells can be filtered by QC metrics like mitochondrial content if \code{qc.filter=TRUE}.
 The \code{nanchors} parameter controls the number of anchor points used for integration.
 Normalization and scaling can be done using SCTransform if \code{sct=TRUE}.
 
diff --git a/man/pgx._addSourceSink.Rd b/man/pgx._addSourceSink.Rd
index 321d163c..84af65ad 100644
--- a/man/pgx._addSourceSink.Rd
+++ b/man/pgx._addSourceSink.Rd
@@ -19,7 +19,7 @@ Adds source and sink nodes to a pathway graph to enable path scoring.
 Add source and sink nodes to pathway graph
 
 
-This function takes an igraph pathway graph object \code{gr} and adds 
+This function takes an igraph pathway graph object \code{gr} and adds
 source and sink nodes to it.
 
 The source and sink nodes are named "SOURCE" and "SINK" respectively.
diff --git a/man/pgx.addEnrichmentSignaturesH5.Rd b/man/pgx.addEnrichmentSignaturesH5.Rd
index 77a69e4e..a03d7efa 100644
--- a/man/pgx.addEnrichmentSignaturesH5.Rd
+++ b/man/pgx.addEnrichmentSignaturesH5.Rd
@@ -27,8 +27,8 @@ No value is returned. Enrichment signatures are written to the HDF5 file.
 Adds gene set enrichment results as signatures to an HDF5 file.
 }
 \details{
-This function runs gene set enrichment analysis on a gene expression matrix X, 
-using the specified methods. It writes the enrichment results as signatures to 
+This function runs gene set enrichment analysis on a gene expression matrix X,
+using the specified methods. It writes the enrichment results as signatures to
 the specified HDF5 file under the "enrichment" group.
 
 Parallel processing can be enabled by setting mc.cores to use multiple CPU cores.
diff --git a/man/pgx.computeConnectivityScores.Rd b/man/pgx.computeConnectivityScores.Rd
index b756e079..ed24384f 100644
--- a/man/pgx.computeConnectivityScores.Rd
+++ b/man/pgx.computeConnectivityScores.Rd
@@ -7,10 +7,9 @@
 pgx.computeConnectivityScores(
   pgx,
   sigdb,
-  ntop = 1000,
+  ntop = 200,
   contrasts = NULL,
-  remove.le = FALSE,
-  inmemory = FALSE
+  remove.le = FALSE
 )
 }
 \arguments{
@@ -23,8 +22,6 @@ pgx.computeConnectivityScores(
 \item{contrasts}{Contrasts to compute signature for. Default is all contrasts.}
 
 \item{remove.le}{Remove leading edge genes before scoring. Default FALSE.}
-
-\item{inmemory}{Compute correlations in memory instead of on disk. Default FALSE.}
 }
 \value{
 List of data frames with connectivity scores for each contrast.
@@ -40,9 +37,6 @@ against all datasets in the database to produce a ranked list of connectivity sc
 The top \code{ntop} correlated genes are used for computing the connectivity score.
 Scores are computed for each contrast unless a subset is specified with \code{contrasts}.
 
-Setting \code{remove.le = TRUE} removes the leading edge genes from the signature before 
+Setting \code{remove.le = TRUE} removes the leading edge genes from the signature before
 computing connectivity. This reduces bias from highly weighted genes.
-
-By default, correlations are computed on disk using chunks. Set \code{inmemory = TRUE}
-to compute in memory, which may be faster for small datasets.
 }
diff --git a/man/pgx.computeCoreGOgraph.Rd b/man/pgx.computeCoreGOgraph.Rd
index 0eb034cb..85d4a43a 100644
--- a/man/pgx.computeCoreGOgraph.Rd
+++ b/man/pgx.computeCoreGOgraph.Rd
@@ -20,10 +20,10 @@ Computes a core GO graph from significant GO terms
 \details{
 Compute Core GO Graph
 
-This function takes a pgx object containing GO enrichment results 
-and extracts significant GO terms below the specified FDR threshold. It constructs 
-a graph where nodes are GO terms, connected based on their semantic relationships 
-from the GO ontology. Terms are colored by their enrichment scores and sized by the 
-number of input genes they contain. The resulting graph highlights the key GO 
+This function takes a pgx object containing GO enrichment results
+and extracts significant GO terms below the specified FDR threshold. It constructs
+a graph where nodes are GO terms, connected based on their semantic relationships
+from the GO ontology. Terms are colored by their enrichment scores and sized by the
+number of input genes they contain. The resulting graph highlights the key GO
 terms and biological processes enriched in the analysis.
 }
diff --git a/man/pgx.computePGX.Rd b/man/pgx.computePGX.Rd
index f039e323..a167b33e 100644
--- a/man/pgx.computePGX.Rd
+++ b/man/pgx.computePGX.Rd
@@ -8,13 +8,15 @@ pgx.computePGX(
   pgx,
   max.genes = 19999,
   max.genesets = 5000,
-  gx.methods = c("ttest.welch", "trend.limma", "edger.qlf"),
+  gx.methods = c("trend.limma", "edger.qlf", "deseq2.wald"),
   gset.methods = c("fisher", "gsva", "fgsea"),
   custom.geneset = c(gmt = NULL, info = NULL),
   do.cluster = TRUE,
   use.design = TRUE,
   prune.samples = FALSE,
-  extra.methods = c("meta.go", "infer", "deconv", "drugs", "wordcloud", "wgcna")[c(1, 2)],
+  extra.methods = c("meta.go", "infer", "deconv", "drugs", "connectivity", "wordcloud",
+    "wgcna")[c(1, 2)],
+  pgx.dir = NULL,
   libx.dir = NULL,
   progress = NULL
 )
diff --git a/man/pgx.computePathscores.Rd b/man/pgx.computePathscores.Rd
index ef38098f..8a12d4c0 100644
--- a/man/pgx.computePathscores.Rd
+++ b/man/pgx.computePathscores.Rd
@@ -22,9 +22,9 @@ Compute path scores for a graph
 
 This function takes an igraph pathway graph object and calculates path scores for each node.
 
-It first adds source and sink nodes if they do not exist. 
+It first adds source and sink nodes if they do not exist.
 It then calculates edge weights based on the fold changes, enforcing strictly positive values if strict.pos=TRUE.
 
-Path scores are computed as the path integral from source to sink going through each node. 
+Path scores are computed as the path integral from source to sink going through each node.
 The path score indicates how well connected a node is to the observed fold changes.
 }
diff --git a/man/pgx.correlateSignatureH5.inmemory.Rd b/man/pgx.correlateSignatureH5.inmemory.Rd
deleted file mode 100644
index eb27f295..00000000
--- a/man/pgx.correlateSignatureH5.inmemory.Rd
+++ /dev/null
@@ -1,71 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/pgx-signature.R
-\name{pgx.correlateSignatureH5.inmemory}
-\alias{pgx.correlateSignatureH5.inmemory}
-\alias{pgx.correlateSignatureH5}
-\title{Correlate signature with datasets in HDF5 file}
-\usage{
-pgx.correlateSignatureH5.inmemory(
-  F,
-  h5.file,
-  nsig = 100,
-  ntop = 1000,
-  nperm = 1000
-)
-
-pgx.correlateSignatureH5(
-  fc,
-  h5.file,
-  nsig = 100,
-  ntop = 1000,
-  nperm = 10000,
-  h5.data = "data/matrix",
-  h5.rn = "data/rownames",
-  h5.cn = "data/colnames"
-)
-}
-\arguments{
-\item{F}{Numeric vector of signature fold changes}
-
-\item{h5.file}{Path to HDF5 file containing dataset expression matrices}
-
-\item{nsig}{Number of signature genes to use for correlation. Default 100.}
-
-\item{ntop}{Number of top correlated genes to use for GSEA. Default 1000.}
-
-\item{nperm}{Number of permutations for GSEA. Default 1000.}
-
-\item{h5.data:}{HDF5 location of dataset expression matrices}
-
-\item{h5.rn:}{HDF5 location of dataset rownames}
-
-\item{h5.cn:}{HDF5 location of dataset colnames}
-}
-\value{
-Data frame with correlation results and GSEA enrichment scores.
-
-Data frame with the correlation coefficient, p-value, and normalized enrichment score (NES) for each dataset.
-}
-\description{
-Correlates a signature (fold change vector) with datasets in an HDF5 file using in-memory computations instead of on-disk.
-It is also possible to run the operation on-disk via \code{pgx.correlateSignatureH5}.
-}
-\details{
-This function computes the correlation and gene set enrichment between a 
-signature fold change vector \code{F} and dataset expression matrices stored in an 
-HDF5 file at \code{h5.file}.
-
-It first reads the top \code{nsig} signature genes into memory. 
-Then it computes pairwise Pearson correlation between the signature and each dataset.
-It also runs pre-ranked GSEA using the \code{fgsea} package to compute enrichment scores.
-
-The main difference from \code{\link{pgx.correlateSignatureH5}} is that correlations 
-and GSEA are computed in memory instead of using on-disk chunks. This is faster for 
-small datasets but requires loading the full expression matrix into memory.
-}
-\section{Functions}{
-\itemize{
-\item \code{pgx.correlateSignatureH5()}: computes correlation and gene set enrichment between a 
-signature and datasets in an HDF5 file using on-disk chunked computations
-
-}}
diff --git a/man/pgx.createCreedsSigDB.Rd b/man/pgx.createCreedsSigDB.Rd
deleted file mode 100644
index e817f493..00000000
--- a/man/pgx.createCreedsSigDB.Rd
+++ /dev/null
@@ -1,32 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/pgx-signature.R
-\name{pgx.createCreedsSigDB}
-\alias{pgx.createCreedsSigDB}
-\title{Create CREEDS signature database}
-\usage{
-pgx.createCreedsSigDB(gmt.files, h5.file, update.only = FALSE)
-}
-\arguments{
-\item{gmt.files}{Character vector of paths to CREEDS GMT files containing gene signatures.}
-
-\item{h5.file}{Path to HDF5 file to write signature database to.}
-
-\item{update.only}{Logical indicating whether to only update existing signatures. Default is FALSE.}
-}
-\value{
-NULL. The CREEDS signature database is written to the specified HDF5 file.
-}
-\description{
-Reads gene signatures from CREEDS GMT files and writes to an HDF5 database.
-}
-\details{
-This function takes a set of CREEDS GMT files containing up/down regulated gene signatures.
-It extracts the signatures and writes them to an HDF5 file in a standard format that can be used for 
-gene set enrichment analysis.
-
-Each GMT file contains multiple up and down regulated signatures for a perturbation experiment.
-The signatures are named using the perturbation and direction, e.g. "DrugX-up".
-
-If update.only=FALSE, any existing signatures in the HDF5 file will be overwritten. 
-If TRUE, only signatures not already present will be added.
-}
diff --git a/man/pgx.createOmicsGraph.Rd b/man/pgx.createOmicsGraph.Rd
index 95340d70..7e3009da 100644
--- a/man/pgx.createOmicsGraph.Rd
+++ b/man/pgx.createOmicsGraph.Rd
@@ -34,7 +34,7 @@ Constructs an omics graph by connecting genes to gene sets and merging nodes.
 \details{
 Create an omics graph from an NGS analysis
 
-This function takes an NGS analysis object and constructs a graph where genes are connected to the gene sets they belong to. 
+This function takes an NGS analysis object and constructs a graph where genes are connected to the gene sets they belong to.
 It first creates a bipartite graph connecting genes and gene sets. The graph is then simplified by merging genes into clusters.
 
 If do.intersect is TRUE, gene sets are intersected when more than one gene set contains the same genes. This reduces redundancy.
diff --git a/man/pgx.createSignatureDatabaseH5.Rd b/man/pgx.createSignatureDatabaseH5.Rd
index c31738cb..d763d22a 100644
--- a/man/pgx.createSignatureDatabaseH5.Rd
+++ b/man/pgx.createSignatureDatabaseH5.Rd
@@ -22,7 +22,7 @@ Reads fold change matrices from multiple PGX files and combines them into a sing
 \details{
 Create a signature database from PGX files
 
-This function reads fold change matrices from multiple PGX experiment result files (.pgx). 
+This function reads fold change matrices from multiple PGX experiment result files (.pgx).
 It extracts the fold change values, normalizes the gene names, and combines the matrices into a single signature database.
 This combined matrix is then stored in an H5 file for later use.
 }
diff --git a/man/pgx.getGeneSetCollections.Rd b/man/pgx.getGeneSetCollections.Rd
index a9f4afe7..6cda0701 100644
--- a/man/pgx.getGeneSetCollections.Rd
+++ b/man/pgx.getGeneSetCollections.Rd
@@ -7,7 +7,7 @@
 \usage{
 pgx.getGeneFamilies(genes, min.size = 10, max.size = 500)
 
-pgx.getGeneSetCollections(gsets, min.size = 10, max.size = 500)
+pgx.getGeneSetCollections(gsets = rownames(playdata::GSETxGENE))
 }
 \arguments{
 \item{min.size}{The minimum gene set size to include. Default 10.}
diff --git a/man/pgx.getSigGO.Rd b/man/pgx.getSigGO.Rd
index 76efaee8..89a14cc2 100644
--- a/man/pgx.getSigGO.Rd
+++ b/man/pgx.getSigGO.Rd
@@ -28,7 +28,7 @@ pgx.getSigGO(
 }
 \value{
 A list containing the GO graph \code{graph}, various GO term statistics
-matrices (\code{pathscore}, \code{foldchange}, \code{qvalue}), and \code{match} mapping 
+matrices (\code{pathscore}, \code{foldchange}, \code{qvalue}), and \code{match} mapping
 GO terms to gene sets.
 }
 \description{
diff --git a/man/pgx.makeAutoContrasts.Rd b/man/pgx.makeAutoContrasts.Rd
index b7f1e9ee..a4b5beaa 100644
--- a/man/pgx.makeAutoContrasts.Rd
+++ b/man/pgx.makeAutoContrasts.Rd
@@ -9,6 +9,7 @@ pgx.makeAutoContrasts(
   mingrp = 3,
   slen = 20,
   ref = NULL,
+  max.level = 10,
   fix.degenerate = FALSE,
   skip.hidden = TRUE
 )
diff --git a/man/pgx.poolCells.Rd b/man/pgx.poolCells.Rd
index 79592339..d96659e6 100644
--- a/man/pgx.poolCells.Rd
+++ b/man/pgx.poolCells.Rd
@@ -39,13 +39,13 @@ pgx.poolCells(
 Matrix of pooled counts for each group
 }
 \description{
-This function pools single cell RNA-seq count matrices into 
-pseudo-bulk groups. Cells are assigned to groups based on metadata or 
+This function pools single cell RNA-seq count matrices into
+pseudo-bulk groups. Cells are assigned to groups based on metadata or
 clustering. Counts within each group are aggregated using summation or other statistics.
 }
 \details{
 This function takes a single cell count matrix and pools the counts into pseudo-bulk groups.
-If groups are provided, cells are assigned to these groups. Otherwise clustering is performed using umap/pca 
+If groups are provided, cells are assigned to these groups. Otherwise clustering is performed using umap/pca
 on the log-expression matrix X to infer groups. Within each group, cell counts are aggregated into a pseudo-bulk profile
 using summation or other statistics. A per-group prior can be used to normalize pooling.
 The output is a pooled count matrix with groups as columns.
diff --git a/man/pgx.readOptions.Rd b/man/pgx.readOptions.Rd
index c0b451fc..b29d9d8d 100644
--- a/man/pgx.readOptions.Rd
+++ b/man/pgx.readOptions.Rd
@@ -16,16 +16,16 @@ A named list containing the PGX options.
 Reads PGX analysis options from a file.
 }
 \details{
-This function reads a simple text file containing PGX analysis options, 
+This function reads a simple text file containing PGX analysis options,
 one option per line in the format:
 
 \code{option=value}
 
 Options include parameters like:
 
-\code{fdr=0.05} - FDR threshold 
+\code{fdr=0.05} - FDR threshold
 \code{logfc=1} - Log fold-change threshold
 
-The options file allows saving a set of parameters for easily re-running 
+The options file allows saving a set of parameters for easily re-running
 an analysis with the same settings.
 }
diff --git a/man/pgx.saveMatrixH5.Rd b/man/pgx.saveMatrixH5.Rd
index ed55c695..992845c6 100644
--- a/man/pgx.saveMatrixH5.Rd
+++ b/man/pgx.saveMatrixH5.Rd
@@ -20,7 +20,7 @@ NULL. The matrix is saved to the HDF5 file.
 Saves a matrix to an HDF5 file for efficient storage and retrieval.
 }
 \details{
-This function saves a matrix \code{X} to an HDF5 file at \code{h5.file}. 
+This function saves a matrix \code{X} to an HDF5 file at \code{h5.file}.
 It first deletes any existing file at that path, then creates a new HDF5 file.
 
 The matrix is saved under the "data/matrix" group. Chunked storage can be used
diff --git a/man/pgx.survivalVariableImportance.Rd b/man/pgx.survivalVariableImportance.Rd
index 350a0352..e759110a 100644
--- a/man/pgx.survivalVariableImportance.Rd
+++ b/man/pgx.survivalVariableImportance.Rd
@@ -55,7 +55,7 @@ The following methods can be selected via the \code{methods} parameter:
 \item pls: Absolute coefficients from partial least squares Cox model
 }
 
-Variable importance scores are returned for each method in a named list. 
+Variable importance scores are returned for each method in a named list.
 These scores can be used to select important predictors for survival modeling.
 }
 \section{Functions}{
diff --git a/man/pgx.testPhenoCorrelation.Rd b/man/pgx.testPhenoCorrelation.Rd
index b75c352d..ae03d6aa 100644
--- a/man/pgx.testPhenoCorrelation.Rd
+++ b/man/pgx.testPhenoCorrelation.Rd
@@ -4,7 +4,7 @@
 \alias{pgx.testPhenoCorrelation}
 \title{Test correlation of phenotype with expression data}
 \usage{
-pgx.testPhenoCorrelation(df, plot = TRUE, cex = 1)
+pgx.testPhenoCorrelation(df, plot = TRUE, cex = 1, compute.pv = TRUE)
 }
 \arguments{
 \item{df}{Data frame containing phenotype data}
diff --git a/man/pgx.testTCGAsurvival.Rd b/man/pgx.testTCGAsurvival.Rd
index 44650512..5ca21a76 100644
--- a/man/pgx.testTCGAsurvival.Rd
+++ b/man/pgx.testTCGAsurvival.Rd
@@ -40,8 +40,8 @@ Test the association between a gene signature and survival using TCGA expression
 }
 \details{
 This function tests the association between a gene signature (sig) and survival using TCGA expression data.
-It extracts the top differentially expressed genes from the signature, then reads their expression from a TCGA HDF5 matrix file. 
-Survival data is obtained from the RTCGA_SURVIVAL dataset. The signature scores are tested for association with overall survival 
-using a Cox proportional hazards model. Results include the hazard ratio, p-value, and number of cases for each gene. A survival 
+It extracts the top differentially expressed genes from the signature, then reads their expression from a TCGA HDF5 matrix file.
+Survival data is obtained from the RTCGA_SURVIVAL dataset. The signature scores are tested for association with overall survival
+using a Cox proportional hazards model. Results include the hazard ratio, p-value, and number of cases for each gene. A survival
 plot can also be generated.
 }
diff --git a/man/pgx.updateInfoPGX.Rd b/man/pgxinfo.add.Rd
similarity index 84%
rename from man/pgx.updateInfoPGX.Rd
rename to man/pgxinfo.add.Rd
index 80b9d0d2..fcf68282 100644
--- a/man/pgx.updateInfoPGX.Rd
+++ b/man/pgxinfo.add.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/pgx-files.R
-\name{pgx.updateInfoPGX}
-\alias{pgx.updateInfoPGX}
-\title{Update PGX-table with new pgx object}
+\name{pgxinfo.add}
+\alias{pgxinfo.add}
+\title{Add new pgx object to PGX-table}
 \usage{
-pgx.updateInfoPGX(pgxinfo, pgx, remove.old = TRUE)
+pgxinfo.add(pgxinfo, pgx, remove.old = TRUE)
 }
 \arguments{
 \item{pgxinfo}{The existing pgxinfo data frame containing dataset metadata}
@@ -23,7 +23,7 @@ Updates the pgxinfo dataset metadata table with information from a new pgx objec
 This function takes an existing pgxinfo data frame and a pgx object as input.
 It extracts the metadata stored in pgx$info and appends it as a new row to the pgxinfo table.
 
-If remove.old is TRUE, it will first remove any existing rows for the same dataset 
+If remove.old is TRUE, it will first remove any existing rows for the same dataset
 before appending the new row. This avoids duplicating information for the same dataset.
 
 The updated pgxinfo data frame containing all dataset metadata is returned.
diff --git a/man/pgxinfo.delete.Rd b/man/pgxinfo.delete.Rd
new file mode 100644
index 00000000..6909956a
--- /dev/null
+++ b/man/pgxinfo.delete.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pgx-files.R
+\name{pgxinfo.delete}
+\alias{pgxinfo.delete}
+\title{Delete pgx entries in pgx info objects}
+\usage{
+pgxinfo.delete(pgx.dir, pgxname, purge.h5 = FALSE)
+}
+\arguments{
+\item{pgx.dir}{The folder containing pgxinfo metadata}
+
+\item{pgxname}{The name of the pgx object whose metadata should be deleted}
+
+\item{purge.h5}{Logical indicating whether to remove the entry from the big H5 file}
+}
+\description{
+Removes the entries for a given pgxname from the pgxinfo metadata files
+}
+\details{
+This function takes a pgxname as input and removes all of its entries from the pgx metadata files.
+}
diff --git a/man/pgxinfo.needUpdate.Rd b/man/pgxinfo.needUpdate.Rd
index 21c85033..cdd173a9 100644
--- a/man/pgxinfo.needUpdate.Rd
+++ b/man/pgxinfo.needUpdate.Rd
@@ -23,7 +23,7 @@ All pgx files in the datafolder should be included.
 \details{
 This function checks if the main metadata files in a PGX directory need to be updated:
 
-- datasets-info.csv 
+- datasets-info.csv
 - datasets-allFC.csv
 - datasets-sigdb.h5
 
diff --git a/man/pgxinfo.read.Rd b/man/pgxinfo.read.Rd
index 3808cbdd..61c92028 100644
--- a/man/pgxinfo.read.Rd
+++ b/man/pgxinfo.read.Rd
@@ -22,7 +22,7 @@ Reads the dataset information CSV file for a directory of PGX files.
 \details{
 This function reads the dataset info CSV located in a PGX directory. It contains metadata like datatype, organism, sample sizes, etc.
 
-The \code{pgx.dir} argument specifies the path to the PGX directory containing .pgx files. 
+The \code{pgx.dir} argument specifies the path to the PGX directory containing .pgx files.
 
 The \code{file} argument specifies the filename for the dataset info file. By default this is "datasets-info.csv".
 
diff --git a/man/pgxinfo.updateDatasetFolder.Rd b/man/pgxinfo.updateDatasetFolder.Rd
index 23e0698e..53bddea5 100644
--- a/man/pgxinfo.updateDatasetFolder.Rd
+++ b/man/pgxinfo.updateDatasetFolder.Rd
@@ -38,7 +38,7 @@ based on new or changed .pgx files. It compares the .pgx files to the existing m
 
 The metadata files updated are:
 - datasets-info.csv
-- datasets-allFC.csv 
+- datasets-allFC.csv
 - datasets-sigdb.h5
 
 If force=TRUE, it will update regardless of whether changes are detected.
diff --git a/man/plot_ggscatter.Rd b/man/plot_ggscatter.Rd
index 84bb704c..d2894516 100644
--- a/man/plot_ggscatter.Rd
+++ b/man/plot_ggscatter.Rd
@@ -72,8 +72,10 @@ x <- rnorm(100)
 y <- rnorm(100)
 plot_ggscatter(x, y)
 
-mat <- cbind(x, y) 
-plot_ggscatter(mat, legend.pos="bottom", 
-               shape=c(1,2), col.scale=1:100)
+mat <- cbind(x, y)
+plot_ggscatter(mat,
+  legend.pos = "bottom",
+  shape = c(1, 2), col.scale = 1:100
+)
 }
 }
diff --git a/man/prot.nmfImpute.Rd b/man/prot.nmfImpute.Rd
index f61a0125..af4e27fa 100644
--- a/man/prot.nmfImpute.Rd
+++ b/man/prot.nmfImpute.Rd
@@ -23,7 +23,7 @@ Imputes missing values in a proteomics matrix using non-negative matrix factoriz
 }
 \details{
 This function takes a numeric matrix \code{X} containing proteomics data, with samples in columns.
-It uses \code{groups} to define sample groups. 
+It uses \code{groups} to define sample groups.
 
 It first replaces missing values (0s) with NAs.
 For each sample group, any rows with more than \code{r} proportion of NAs are set to 0.
diff --git a/man/read.as_matrix.Rd b/man/read.as_matrix.Rd
index 20a37b6f..7014bd20 100644
--- a/man/read.as_matrix.Rd
+++ b/man/read.as_matrix.Rd
@@ -7,10 +7,13 @@
 \usage{
 read.as_matrix.SAVE(file)
 
-read.as_matrix(file)
+read.as_matrix(file, skip_row_check = FALSE)
 }
 \arguments{
 \item{file}{Path to input data file}
+
+\item{skip_row_check}{(default \code{FALSE}) Flag to skip the removal
+of empty rows}
 }
 \value{
 Matrix object containing data from file
diff --git a/man/seurat2pgx.Rd b/man/seurat2pgx.Rd
index 6acbbf88..462c5df9 100644
--- a/man/seurat2pgx.Rd
+++ b/man/seurat2pgx.Rd
@@ -22,6 +22,6 @@ This function takes a Seurat object containing single-cell RNA-seq data and conv
 The count matrix, normalized expression matrix, and sample metadata are extracted from the Seurat object.
 Gene annotations are added using the gene symbols.
 
-If do.cluster=TRUE, dimensionality reduction and clustering of samples is performed. 
+If do.cluster=TRUE, dimensionality reduction and clustering of samples is performed.
 Any existing tsne/umap embeddings and cluster assignments are copied over from the Seurat object.
 }
diff --git a/man/sigdb.getConnectivityMatrix.Rd b/man/sigdb.getConnectivityMatrix.Rd
index a7ad8e68..6d8bd41e 100644
--- a/man/sigdb.getConnectivityMatrix.Rd
+++ b/man/sigdb.getConnectivityMatrix.Rd
@@ -38,11 +38,11 @@ Matrix of connectivity scores
 Get the connectivity matrix from a signature database HDF5 file.
 }
 \details{
-This function extracts the connectivity matrix from a signature database 
+This function extracts the connectivity matrix from a signature database
 stored in HDF5 format. The HDF5 file should contain the matrix under /data/matrix.
 
-You can filter the returned matrix by specifying a subset of contrasts to return 
-with the \code{select} parameter. Use the \code{genes} parameter to return only a 
+You can filter the returned matrix by specifying a subset of contrasts to return
+with the \code{select} parameter. Use the \code{genes} parameter to return only a
 subset of genes.
 
 If the sigdb file is not in the working directory, set the \code{path} parameter.
@@ -50,7 +50,7 @@ If the sigdb file is not in the working directory, set the \code{path} parameter
 \section{Functions}{
 \itemize{
 \item \code{sigdb.getEnrichmentMatrix()}: This function reads the enrichment data stored in a signature database in HDF5 format.
-It extracts either the gene set enrichment analysis (GSEA) results or the rank 
+It extracts either the gene set enrichment analysis (GSEA) results or the rank
 correlation analysis results.
 
 \item \code{sigdb.getSignatureMatrix()}: reads the up/down regulated gene signatures from a signature database HDF5 file.
diff --git a/man/tagged.hamming.Rd b/man/tagged.hamming.Rd
index e890c418..dbd7e1a7 100644
--- a/man/tagged.hamming.Rd
+++ b/man/tagged.hamming.Rd
@@ -20,9 +20,9 @@ Named numeric vector of Hamming distances for each matched tag.
 Calculate the Hamming distance between two sequences, taking into account tags.
 }
 \details{
-This function calculates the Hamming distance between two character vector 
-sequences \code{aa} and \code{bb}. It first parses any tags in the sequences using \code{parse.tags}. 
-It then calculates the Hamming distance between the sequences indicated by any matching tags (e.g. 
-\code{cdr1}, \code{cdr2}). For tagged sequences, it will align the sequences before calculating Hamming 
+This function calculates the Hamming distance between two character vector
+sequences \code{aa} and \code{bb}. It first parses any tags in the sequences using \code{parse.tags}.
+It then calculates the Hamming distance between the sequences indicated by any matching tags (e.g.
+\code{cdr1}, \code{cdr2}). For tagged sequences, it will align the sequences before calculating Hamming
 distance if \code{align=TRUE}. For non-tagged sequences, it calculates simple Hamming distance.
 }
diff --git a/man/tcosine_similarity.Rd b/man/tcosine_similarity.Rd
index 5ff4b358..44271110 100644
--- a/man/tcosine_similarity.Rd
+++ b/man/tcosine_similarity.Rd
@@ -46,7 +46,7 @@ Calculates the cosine similarity between two matrices.
 This function takes two numeric matrices \code{X} and \code{Y} and calculates the cosine similarity between them.
 It can handle sparse matrices and missing values.
 
-Cosine similarity is defined as the cosine of the angle between two vectors. 
+Cosine similarity is defined as the cosine of the angle between two vectors.
 It is calculated as the dot product of the vectors divided by the L2 norms of the vectors.
 
 If only \code{X} is provided, the cosine similarity between columns of \code{X} is calculated.
diff --git a/man/uscale.Rd b/man/uscale.Rd
index 4cb45aec..75fc2d04 100644
--- a/man/uscale.Rd
+++ b/man/uscale.Rd
@@ -24,7 +24,7 @@ This function takes a numeric vector \code{x} and rescales the values to lie bet
 It subtracts the minimum value and divides by the range.
 This transforms the values to a 0-1 range while maintaining relative differences.
 
-If \code{symm=TRUE}, the rescaled values are further transformed to be symmetrized around 0 
+If \code{symm=TRUE}, the rescaled values are further transformed to be symmetrized around 0
 by subtracting 0.5 and multiplying by 2.
 
 The rescaled vector is returned.