From 4307d75db3a14ba6bfc6b6178490f180ccc8dca6 Mon Sep 17 00:00:00 2001 From: Euxhen Hasanaj Date: Tue, 28 Nov 2023 22:21:18 -0500 Subject: [PATCH] Adding compression, better logfc (#111) --- src/grinch/cond_filter.py | 1 + src/grinch/pipeline.py | 4 +++- src/grinch/processors/splitter.py | 6 +++--- src/grinch/utils/stats.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/grinch/cond_filter.py b/src/grinch/cond_filter.py index 114c7bc..c1def90 100644 --- a/src/grinch/cond_filter.py +++ b/src/grinch/cond_filter.py @@ -144,6 +144,7 @@ def _take_k_functional(arr, k: NonNegativeInt, as_mask: bool, top: bool): """ if k > (n := len(arr)): logger.warning(f"Requested {k} items but array has size {n}.") + k = n idx = np.argpartition(arr, -k if top else k) # linear time idx = idx[-k:] if top else idx[:k] diff --git a/src/grinch/pipeline.py b/src/grinch/pipeline.py index a1c9660..aa38b02 100644 --- a/src/grinch/pipeline.py +++ b/src/grinch/pipeline.py @@ -101,6 +101,7 @@ class Config(BaseConfigurable.Config): processors: List[BaseConfigurable.Config] verbose: bool = Field(True, exclude=True) write_key: str = "pipeline" + compression: str | int | None = None # It may be desirable to write only the columns of adata without # the data matrix so save memory. In that case, set no_data_write # to True. This will replace the data matrix with a sparse matrix @@ -165,7 +166,8 @@ def __call__(self, adata: AnnData | None = None, **kwargs) -> DataSplitter: if self.cfg.data_writepath is not None: logger.info(f"Writting AnnData at '{self.cfg.data_writepath}'...") ds.write_h5ad(str(self.cfg.data_writepath), - no_data_write=self.cfg.no_data_write) + no_data_write=self.cfg.no_data_write, + compression=self.cfg.compression) return ds def _apply(self, ds: DataSplitter, processor: BaseConfigurable) -> None: diff --git a/src/grinch/processors/splitter.py b/src/grinch/processors/splitter.py index b7f45c5..a93129e 100644 --- a/src/grinch/processors/splitter.py +++ b/src/grinch/processors/splitter.py @@ -27,14 +27,14 @@ class DataSplitter: def is_split(self) -> bool: return any_not_None(self.VAL_SPLIT, self.TEST_SPLIT) - def write_h5ad(self, path: str, no_data_write: bool = False) -> None: + def write_h5ad(self, path: str, no_data_write: bool = False, **kwargs) -> None: """Writes anndata to path. If any of VAL or TEST splits are not None, will instead write both to a folder with the name specified in path. """ if not any_not_None(self.VAL_SPLIT, self.TEST_SPLIT): to_write = as_empty(self.TRAIN_SPLIT) if no_data_write else self.TRAIN_SPLIT - to_write.write_h5ad(path) + to_write.write_h5ad(path, **kwargs) return if path.endswith('.h5ad'): @@ -47,7 +47,7 @@ def write_h5ad(self, path: str, no_data_write: bool = False) -> None: if os.path.exists(path_to_write): logger.warning(f"Object {path_to_write} exists. This will be overwritten.") to_write = as_empty(sp) if no_data_write else sp - to_write.write_h5ad(path_to_write) + to_write.write_h5ad(path_to_write, **kwargs) class Splitter(BaseConfigurable, StorageMixin): diff --git a/src/grinch/utils/stats.py b/src/grinch/utils/stats.py index 0caaf23..66f81ea 100644 --- a/src/grinch/utils/stats.py +++ b/src/grinch/utils/stats.py @@ -241,7 +241,7 @@ def _compute_log2fc(mean1, mean2, base='e', is_logged=False): base = np.e if base == 'e' else float(base) log2fc *= np.log2(base) else: - log2fc = np.log2((mean1 + 1) / (mean2 + 1)) + log2fc = np.log2(mean1 + 1) - np.log2(mean2 + 1) return log2fc