From ffe31f304613f141782a63d98053fb87d15ee03a Mon Sep 17 00:00:00 2001
From: gautam8387
Date: Wed, 18 Dec 2024 04:20:04 +0530
Subject: [PATCH] Added explanation and comments

---
 scarf/merge.py | 55 +++++++++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/scarf/merge.py b/scarf/merge.py
index b18846d..c5d75fe 100644
--- a/scarf/merge.py
+++ b/scarf/merge.py
@@ -178,27 +178,26 @@ def perform_randomization_rows(
         Returns:
         """
         rng = np.random.default_rng(seed=seed)
-        # np.random.seed(seed)
         chunkSize = np.array([x.rawData.chunksize[0] for x in self.assays])
         nCells = np.array([x.rawData.shape[0] for x in self.assays])
         permutations = {
             i: permute_into_chunks(nCells[i], chunkSize[i])
             for i in range(len(self.assays))
-        }
+        }  # Randomize the rows within each chunk
+        # Create a dictionary of dictionaries. This holds the same data as `permutations`, but with the arrays keyed by chunk number.
+        # Example:
+        # permutations = {0: [array([2, 0, 1]), array([3, 4, 5]), array([8, 7, 6]), array([9])], 1: [array([2, 0, 1]), array([3, 4, 5]), array([8, 7, 6]), array([9])]}
+        # permutations_rows = {0: {0: array([2, 0, 1]), 1: array([3, 4, 5]), 2: array([8, 7, 6]), 3: array([9])}, 1: {0: array([2, 0, 1]), 1: array([3, 4, 5]), 2: array([8, 7, 6]), 3: array([9])}}
         permutations_rows = {}
         for key, arrays in permutations.items():
             in_dict = {i: x for i, x in enumerate(arrays)}
             permutations_rows[key] = in_dict
 
+        # Set the offset for each chunk. The offset is the cumulative number of cells in the preceding assays; it is needed later when the cell metadata tables are merged.
+        # Example:
+        # {0: {0: array([2, 0, 1]), 1: array([3, 4, 5]), 2: array([8, 7, 6]), 3: array([9])}, 1: {0: array([12, 10, 11]), 1: array([13, 14, 15]), 2: array([18, 17, 16]), 3: array([19])}}
         permutations_rows_offset = {}
-        # for i in range(len(permutations)):
-        #     in__dict: dict[int, np.ndarray] = {}
-        #     last_key = i - 1 if i > 0 else 0
-        #     offset = nCells[last_key] + offset if i > 0 else 0  # noqa: F821
-        #     for j, arr in enumerate(permutations[i]):
-        #         in__dict[j] = arr + offset
-        #     permutations_rows_offset[i] = in__dict
 
         offset = 0
         for key, val_dict in permutations_rows.items():
             in__dict: dict[int, np.ndarray] = {}
@@ -207,6 +206,10 @@ def perform_randomization_rows(
             permutations_rows_offset[key] = in__dict
             offset += nCells[key]
 
+        # Set the random order in which the chunks will be merged. The last chunk of each assay is appended at the end of the list to account for potentially incomplete chunks.
+        # Example:
+        # coordinates_permutations = [[0, 0], [0, 1], [1, 2], [0, 2], [1, 1], [1, 0], [0, 3], [1, 3]]
+        # Here [0, 0] is the first chunk of the first assay, [0, 1] the second chunk of the first assay, [1, 2] the third chunk of the second assay, and so on; this is the order in which the rows will be merged.
         coordinates = []
         extra = []
         for i in range(len(self.assays)):
@@ -215,9 +218,9 @@ def perform_randomization_rows(
                     extra.append([i, j])
                     continue
                 coordinates.append([i, j])
-
-        # coordinates_permutations = np.random.permutation(coordinates)
-        coordinates_permutations = rng.permutation(coordinates)
+        coordinates_permutations = rng.permutation(
+            coordinates
+        )  # Randomize the order of the chunk coordinates
         if len(coordinates_permutations) > 0:
             coordinates_permutations = np.concatenate(
                 [coordinates_permutations, extra], axis=0
@@ -251,22 +254,26 @@ def perform_randomization_rows(
         return permutations_rows, permutations_rows_offset, coordinates_permutations
 
     def _ref_order_cell_idx(self) -> Dict[int, Dict[int, np.ndarray]]:
+        """
+        Calculate the order of the cells in the merged assay.
+        """
+        # We derive the order of the cells in the merged assay from permutations_rows and coordinates_permutations. This is the one-to-one mapping of cells in the individual assays to cells in the merged assay.
+        # Example:
+        # cellOrder = {0: {0: array([0, 1, 2]), 1: array([3, 4, 5]), 2: array([ 9, 10, 11]), 3: array([18])}, 1: {0: array([15, 16, 17]), 1: array([12, 13, 14]), 2: array([6, 7, 8]), 3: array([19])}}
+        # Here the cells [2, 0, 1] from the first chunk of the first assay map to positions [0, 1, 2] in the merged assay; similarly, the cells [2, 0, 1] from the first chunk of the second assay map to positions [15, 16, 17].
         new_cells = {}
         for i in range(len(self.assays)):
             in_dict: dict[int, np.ndarray] = {}
             for j in range(len(self.permutations_rows[i])):
                 in_dict[j] = np.array([])
             new_cells[i] = in_dict
-
         offset = 0
         for i, (x, y) in enumerate(self.coordinates_permutations):
-            # arr = self.permutations_rows_offset[x][y]
             arr = self.permutations_rows[x][y]
             arr = np.array(range(len(arr)))
             arr = arr + offset
             new_cells[x][y] = arr
             offset = arr.max() + 1
-
         return new_cells
 
     def _merge_cell_table(
@@ -304,22 +311,15 @@ def _merge_cell_table(
             )
             ret_val.append(a.to_pandas())
 
-        # ret_val_df = pl.concat(
-        #     ret_val,
-        #     how="diagonal",  # Finds a union between the column schemas and fills missing column values with null
-        # )
+        # Merge the cell metadata tables of the individual samples: simply concatenate them and reset the index.
         ret_val_df = pd.concat(ret_val, axis=0).reset_index(drop=True)
-
-        # Randomize the rows in chunks
+        # Use the offsets stored in permutations_rows_offset together with coordinates_permutations to reorder the cells so that the metadata rows match the row order of the merged assay.
         compiled_idx = [
             self.permutations_rows_offset[i][j]
             for i, j in self.coordinates_permutations
         ]
-
         compiled_idx = np.concatenate(compiled_idx)
-        # ret_val_df = ret_val_df[
-        #     compiled_idx
-        # ]  # Polars does not support iloc so we have to use this method
+        # Index the merged cell metadata table with compiled_idx to obtain the final randomized merged cell metadata table.
         ret_val_df = ret_val_df.iloc[compiled_idx]
         if sum([x.cells.N for x in self.assays]) != ret_val_df.shape[0]:
             raise AssertionError(
@@ -638,17 +638,16 @@ def dump(self, nthreads=4):
                 total=assay.rawData.numblocks[0],
                 desc=f"Writing data from assay {i+1}/{len(self.assays)} to merged file",
             ):
+                # Apply the intra-chunk permutation of the rows
                 perm_order = self.permutations_rows[i][j]
                 perm_order = perm_order - perm_order.min()
-                # bring a to same order
                 block = block[perm_order, :]
                 a = self._dask_to_coo(block, feat_order, feat_order_map, nthreads)
+                # Use the one-to-one mapping of assay chunks to merged-assay chunks to write the data at its final position.
                 row_idx = self.cellOrder[i][j]
-                # bring a to same order
                 self.assayGroup.set_coordinate_selection(
                     (a.row + row_idx.min(), a.col), a.data.astype(self.assayGroup.dtype)
                 )
-                # self.assayGroup[row_idx, :] = a
                 counter += a.shape[0]
         try:
             assert counter == self.nCells
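
Notes (not part of the patch): the bookkeeping that the new comments describe can be reproduced end to end on toy data. The sketch below uses a simplified, hypothetical stand-in for scarf's permute_into_chunks (same contract assumed: shuffle row indices inside each chunk-sized window); the rest mirrors the logic in perform_randomization_rows, _ref_order_cell_idx, and _merge_cell_table.

import numpy as np
import pandas as pd

rng = np.random.default_rng(seed=42)

def permute_into_chunks(n_cells, chunk_size):
    # Hypothetical stand-in for scarf's helper: shuffle indices within each
    # chunk-sized window, keeping chunk boundaries intact.
    return [
        rng.permutation(np.arange(i, min(i + chunk_size, n_cells)))
        for i in range(0, n_cells, chunk_size)
    ]

n_cells = [10, 10]   # cells per assay (toy values)
chunk_size = [3, 3]  # chunk size per assay (toy values)

# Step 1: intra-chunk permutations, re-keyed by chunk number.
permutations_rows = {
    i: dict(enumerate(permute_into_chunks(n_cells[i], chunk_size[i])))
    for i in range(len(n_cells))
}

# Step 2: add a per-assay offset (cumulative cell count of preceding assays).
permutations_rows_offset, offset = {}, 0
for key, chunks in permutations_rows.items():
    permutations_rows_offset[key] = {j: arr + offset for j, arr in chunks.items()}
    offset += n_cells[key]

# Step 3: shuffle the full chunks; the (possibly incomplete) last chunks go to the end.
coordinates, extra = [], []
for i in range(len(n_cells)):
    for j, arr in permutations_rows[i].items():
        (extra if len(arr) != chunk_size[i] else coordinates).append([i, j])
coordinates_permutations = np.concatenate([rng.permutation(coordinates), extra], axis=0)

# Step 4: map each source chunk to its destination rows in the merged assay (cellOrder).
cell_order, offset = {i: {} for i in permutations_rows}, 0
for x, y in coordinates_permutations:
    n = len(permutations_rows[x][y])
    cell_order[x][y] = np.arange(offset, offset + n)
    offset += n

# Step 5: reorder the concatenated cell metadata the same way.
metadata = pd.concat(
    [pd.DataFrame({"assay": i, "cell": np.arange(n_cells[i])}) for i in range(len(n_cells))],
    axis=0,
).reset_index(drop=True)
compiled_idx = np.concatenate(
    [permutations_rows_offset[i][j] for i, j in coordinates_permutations]
)
metadata = metadata.iloc[compiled_idx].reset_index(drop=True)
print(metadata.head())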
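
The dump() hunk then writes each permuted chunk at its destination rows. Below is a minimal sketch of that write path, with a plain NumPy array standing in for the zarr assayGroup (the fancy-indexed assignment plays the role of set_coordinate_selection) and a SciPy COO block standing in for the output of _dask_to_coo; all shapes and index values are illustrative only.

import numpy as np
from scipy.sparse import coo_matrix

rng = np.random.default_rng(seed=0)

merged = np.zeros((20, 5))               # stand-in for the zarr assayGroup
block = rng.integers(0, 3, size=(3, 5))  # one dense chunk from one assay

# Intra-chunk permutation, shifted to chunk-local indices (as in the patch).
perm_order = np.array([12, 10, 11])        # e.g. permutations_rows[i][j]
perm_order = perm_order - perm_order.min() # -> [2, 0, 1]
block = block[perm_order, :]

a = coo_matrix(block)         # stand-in for _dask_to_coo
row_idx = np.arange(3, 6)     # e.g. cellOrder[i][j]: destination rows

# Equivalent of assayGroup.set_coordinate_selection((a.row + row_idx.min(), a.col), a.data):
# COO row indices are chunk-local, so adding row_idx.min() places the chunk
# at its final rows in the merged matrix.
merged[a.row + row_idx.min(), a.col] = a.data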