Fix preprocessing issue with library-size normalization (LSN)

Emad-COMBINE-lab · Feb 7, 2024 · 4740ccf · 4740ccf
1 parent 8fee09a
commit 4740ccf
Showing 1 changed file with 8 additions and 4 deletions.
diff --git a/src/preprocessing/preprocess.py b/src/preprocessing/preprocess.py
@@ -55,9 +55,11 @@ def preprocess(cfg: ConfigParser) -> None:
     anndata.uns["cells_no"] = anndata.shape[0]
     anndata.uns["genes_no"] = anndata.shape[1]
 
+    canndata = anndata.copy()
+
     # library-size normalization
     sc.pp.normalize_per_cell(
-        anndata, counts_per_cell_after=int(cfg.get("Preprocessing", "library size"))
+        canndata, counts_per_cell_after=int(cfg.get("Preprocessing", "library size"))
     )
 
     if cfg.get("Preprocessing", "annotations") is not None:
@@ -71,8 +73,6 @@ def preprocess(cfg: ConfigParser) -> None:
         anndata.obs["barcodes"] = anndata.obs.index
         anndata.obs["celltype"] = anndata.obs["barcodes"].map(annotation_dict)
 
-    canndata = anndata.copy()
-
     # identify highly variable genes
     sc.pp.log1p(canndata)  # logarithmize the data
     sc.pp.highly_variable_genes(
@@ -88,6 +88,10 @@ def preprocess(cfg: ConfigParser) -> None:
         :, canndata.var["highly_variable"]
     ]  # only keep highly variable genes
 
+    sc.pp.normalize_per_cell(
+        anndata, counts_per_cell_after=int(cfg.get("Preprocessing", "library size"))
+    )
+
     # sort genes by name (not needed)
     sorted_genes = np.sort(anndata.var_names)
     anndata = anndata[:, sorted_genes]
@@ -99,4 +103,4 @@ def preprocess(cfg: ConfigParser) -> None:
     anndata[val_size : test_size + val_size].write_h5ad(cfg.get("Data", "test"))
     anndata[test_size + val_size :].write_h5ad(cfg.get("Data", "train"))
 
-    print("Successfully preprocessed and and saved dataset")
+    print("Successfully preprocessed and and saved dataset")