Skip to content

Commit

Permalink
Fix preprocessing issue with LSN
Browse files (browse the repository at this point in the history)
  • Loading branch information
Yazdan Zinati committed Feb 7, 2024
1 parent 8fee09a commit 4740ccf
Showing 1 changed file with 8 additions and 4 deletions.
12 changes: 8 additions & 4 deletions src/preprocessing/preprocess.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -55,9 +55,11 @@ def preprocess(cfg: ConfigParser) -> None:
anndata.uns["cells_no"] = anndata.shape[0]
anndata.uns["genes_no"] = anndata.shape[1]

canndata = anndata.copy()

# library-size normalization
sc.pp.normalize_per_cell(
anndata, counts_per_cell_after=int(cfg.get("Preprocessing", "library size"))
canndata, counts_per_cell_after=int(cfg.get("Preprocessing", "library size"))
)

if cfg.get("Preprocessing", "annotations") is not None:
Expand All @@ -71,8 +73,6 @@ def preprocess(cfg: ConfigParser) -> None:
anndata.obs["barcodes"] = anndata.obs.index
anndata.obs["celltype"] = anndata.obs["barcodes"].map(annotation_dict)

canndata = anndata.copy()

# identify highly variable genes
sc.pp.log1p(canndata) # logarithmize the data
sc.pp.highly_variable_genes(
Expand All @@ -88,6 +88,10 @@ def preprocess(cfg: ConfigParser) -> None:
:, canndata.var["highly_variable"]
] # only keep highly variable genes

sc.pp.normalize_per_cell(
anndata, counts_per_cell_after=int(cfg.get("Preprocessing", "library size"))
)

# sort genes by name (not needed)
sorted_genes = np.sort(anndata.var_names)
anndata = anndata[:, sorted_genes]
Expand All @@ -99,4 +103,4 @@ def preprocess(cfg: ConfigParser) -> None:
anndata[val_size : test_size + val_size].write_h5ad(cfg.get("Data", "test"))
anndata[test_size + val_size :].write_h5ad(cfg.get("Data", "train"))

print("Successfully preprocessed and and saved dataset")
print("Successfully preprocessed and and saved dataset")

0 comments on commit 4740ccf

Please sign in to comment.