From 4307d75db3a14ba6bfc6b6178490f180ccc8dca6 Mon Sep 17 00:00:00 2001
From: Euxhen Hasanaj <ehasanaj@cs.cmu.edu>
Date: Tue, 28 Nov 2023 22:21:18 -0500
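Subject: [PATCH] Adding compression, better logfc (#111)

- cond_filter: in _take_k_functional, after warning that more items were
  requested than the array holds, clamp k to the array length before
  calling np.argpartition.
- pipeline: add a `compression` config field (str | int | None, default
  None) and pass it to DataSplitter.write_h5ad when writing the data.
- splitter: DataSplitter.write_h5ad now accepts **kwargs and forwards them
  to the underlying write_h5ad calls for every split that is written.
- stats: for non-logged input, compute log2fc as
  log2(mean1 + 1) - log2(mean2 + 1) instead of
  log2((mean1 + 1) / (mean2 + 1)).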
---
 src/grinch/cond_filter.py         | 1 +
 src/grinch/pipeline.py            | 4 +++-
 src/grinch/processors/splitter.py | 6 +++---
 src/grinch/utils/stats.py         | 2 +-
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/grinch/cond_filter.py b/src/grinch/cond_filter.py
index 114c7bc..c1def90 100644
--- a/src/grinch/cond_filter.py
+++ b/src/grinch/cond_filter.py
@@ -144,6 +144,7 @@ def _take_k_functional(arr, k: NonNegativeInt, as_mask: bool, top: bool):
         """
         if k > (n := len(arr)):
             logger.warning(f"Requested {k} items but array has size {n}.")
+            k = n
 
         idx = np.argpartition(arr, -k if top else k)  # linear time
         idx = idx[-k:] if top else idx[:k]
diff --git a/src/grinch/pipeline.py b/src/grinch/pipeline.py
index a1c9660..aa38b02 100644
--- a/src/grinch/pipeline.py
+++ b/src/grinch/pipeline.py
@@ -101,6 +101,7 @@ class Config(BaseConfigurable.Config):
         processors: List[BaseConfigurable.Config]
         verbose: bool = Field(True, exclude=True)
         write_key: str = "pipeline"
+        compression: str | int | None = None
         # It may be desirable to write only the columns of adata without
         # the data matrix so save memory. In that case, set no_data_write
         # to True. This will replace the data matrix with a sparse matrix
@@ -165,7 +166,8 @@ def __call__(self, adata: AnnData | None = None, **kwargs) -> DataSplitter:
         if self.cfg.data_writepath is not None:
             logger.info(f"Writting AnnData at '{self.cfg.data_writepath}'...")
             ds.write_h5ad(str(self.cfg.data_writepath),
-                          no_data_write=self.cfg.no_data_write)
+                          no_data_write=self.cfg.no_data_write,
+                          compression=self.cfg.compression)
         return ds
 
     def _apply(self, ds: DataSplitter, processor: BaseConfigurable) -> None:
diff --git a/src/grinch/processors/splitter.py b/src/grinch/processors/splitter.py
index b7f45c5..a93129e 100644
--- a/src/grinch/processors/splitter.py
+++ b/src/grinch/processors/splitter.py
@@ -27,14 +27,14 @@ class DataSplitter:
     def is_split(self) -> bool:
         return any_not_None(self.VAL_SPLIT, self.TEST_SPLIT)
 
-    def write_h5ad(self, path: str, no_data_write: bool = False) -> None:
+    def write_h5ad(self, path: str, no_data_write: bool = False, **kwargs) -> None:
         """Writes anndata to path. If any of VAL or TEST splits are not
         None, will instead write both to a folder with the name specified
         in path.
         """
         if not any_not_None(self.VAL_SPLIT, self.TEST_SPLIT):
             to_write = as_empty(self.TRAIN_SPLIT) if no_data_write else self.TRAIN_SPLIT
-            to_write.write_h5ad(path)
+            to_write.write_h5ad(path, **kwargs)
             return
 
         if path.endswith('.h5ad'):
@@ -47,7 +47,7 @@ def write_h5ad(self, path: str, no_data_write: bool = False) -> None:
                 if os.path.exists(path_to_write):
                     logger.warning(f"Object {path_to_write} exists. This will be overwritten.")
                 to_write = as_empty(sp) if no_data_write else sp
-                to_write.write_h5ad(path_to_write)
+                to_write.write_h5ad(path_to_write, **kwargs)
 
 
 class Splitter(BaseConfigurable, StorageMixin):
diff --git a/src/grinch/utils/stats.py b/src/grinch/utils/stats.py
index 0caaf23..66f81ea 100644
--- a/src/grinch/utils/stats.py
+++ b/src/grinch/utils/stats.py
@@ -241,7 +241,7 @@ def _compute_log2fc(mean1, mean2, base='e', is_logged=False):
             base = np.e if base == 'e' else float(base)
             log2fc *= np.log2(base)
     else:
-        log2fc = np.log2((mean1 + 1) / (mean2 + 1))
+        log2fc = np.log2(mean1 + 1) - np.log2(mean2 + 1)
     return log2fc