Mass proficient extraction for NAS storage (#342)

* Initial commit of mass proficient extraction * Add mass proficient MPH extraction * Ensure bin_extraction also handles proficient metrics extraction * Handle mass proficient and deficient bin file extraction separately (in case missing FOVs different for both) * Add mass proficient MPH normalization generation to 4b notebook * OCD comment change * Ensure mass proficient panel passed to proficient extraction in 3b notebook * Typo fix * Ensure mass proficient directory actually gets created prior to extraction in notebook 3b * Make comment about mass proficient extraction more clear * Ensure pulse height generation doesn't attempt to read in proficient metric data * Remove duplicate intermediate callback * Typo in FOV callbacks * Add flag to control proficient extraction or not * Nuke proficient MPH extraction * Timing tests for normalization * Update pyproject.toml dependencies to be compatible with mibi-bin-tools and alpineer * Standardize the deficient and proficient ranges based on agreed upon range values * Delete timeit print statements for MPH * Begin reducing test cases * Document the start and stop offset params * Update poetry lock file to ensure this actually installs * Clarify comment about extraction ranges * Fix another extraction window comment * Make sure extract_prof is included as a param to watcher tests * Add extract_prof param to inter_callback test * Revert back to old deficient and proficient extraction ranges * Pulse heights are not extracted for deficient and proficient data anymore * Clarify comment
angelolab · Aug 22, 2023 · 49d26ab · 49d26ab
1 parent e1fa4d8
commit 49d26ab
Show file tree

Hide file tree

Showing 14 changed files with 942 additions and 417 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -38,7 +38,7 @@ metadata = false
 [tool.poetry.dependencies]
 python = ">=3.9,<3.12"
 alpineer = ">=0.1.9"
-mibi-bin-tools = "0.2.9"
+mibi-bin-tools = "0.2.10"
 ipywidgets = "^8"
 numpy = "1.*"
 natsort = "^8"

diff --git a/src/toffy/normalize.py b/src/toffy/normalize.py
@@ -47,7 +47,9 @@ def write_counts_per_mass(base_dir, output_dir, fov, masses, start_offset=0.5, s
     out_df.to_csv(os.path.join(output_dir, fov + "_channel_counts.csv"), index=False)
 
 
-def write_mph_per_mass(base_dir, output_dir, fov, masses, start_offset=0.5, stop_offset=0.5):
+def write_mph_per_mass(
+    base_dir, output_dir, fov, masses, start_offset=0.5, stop_offset=0.5, proficient=False
+):
     """Records the median pulse height (MPH) per mass for the specified FOV
 
     Args:
@@ -57,6 +59,7 @@ def write_mph_per_mass(base_dir, output_dir, fov, masses, start_offset=0.5, stop
         masses (list): the list of masses to extract MPH from
         start_offset (float): beginning value for calculating mph values
         stop_offset (float): ending value for calculating mph values
+        proficient (bool): whether proficient MPH data is written or not
     """
     # hold computed values
     mph_vals = []
@@ -72,7 +75,10 @@ def write_mph_per_mass(base_dir, output_dir, fov, masses, start_offset=0.5, stop
     # create df to hold output
     fovs = np.repeat(fov, len(masses))
     out_df = pd.DataFrame({"mass": masses, "fov": fovs, "pulse_height": mph_vals})
-    out_df.to_csv(os.path.join(output_dir, fov + "_pulse_heights.csv"), index=False)
+    pulse_heights_file = (
+        fov + "_pulse_heights_proficient.csv" if proficient else fov + "_pulse_heights.csv"
+    )
+    out_df.to_csv(os.path.join(output_dir, pulse_heights_file), index=False)
 
 
 def create_objective_function(obj_func):
@@ -222,7 +228,9 @@ def combine_run_metrics(run_dir, substring, warn_overwrite=True):
         substring(str): the substring contained within the files to be combined
         warn_overwrite (bool): whether to warn if existing `_combined.csv` file found"""
 
+    # retrieve all pulse height files, but ignore anything mass proficient
     files = io_utils.list_files(run_dir, substring)
+    files = [file for file in files if "_proficient" not in file]
 
     # validate inputs
     if len(files) == 0:

diff --git a/src/toffy/panel_utils.py b/src/toffy/panel_utils.py
@@ -105,6 +105,41 @@
 )
 
 
+def modify_panel_ranges(panel: pd.DataFrame, start_offset: float = 0, stop_offset: float = 0):
+    """Adjust the offsets of a given panel.
+
+    Only rows whose 'Stop' - 'Start' range equals 0.3 (after rounding) are modified.
+
+    Args:
+        panel (pd.DataFrame): panel dataframe with columns Mass, Target, Start, and Stop.
+        start_offset (float): the value to add to the `'Start'` column.
+        stop_offset (float): the value to add to the `'Stop'` column.
+
+    Returns:
+        pd.DataFrame:
+            Copy of `panel` with `start_offset` added to the `'Start'` column and
+            `stop_offset` added to the `'Stop'` column of the matching rows.
+    """
+    panel_new = panel.copy()
+
+    # select only rows where 'Start' - 'Stop' = -0.3, round to account for floating point error
+    panel_rows_modify = panel_new[
+        (panel_new["Start"] - panel_new["Stop"]).round(1) == -0.3
+    ].index.values
+
+    # add start_offset to 'Start' column
+    panel_new.loc[panel_rows_modify, "Start"] = (
+        panel_new.loc[panel_rows_modify, "Start"].copy() + start_offset
+    )
+
+    # add stop_offset to 'Stop' column
+    panel_new.loc[panel_rows_modify, "Stop"] = (
+        panel_new.loc[panel_rows_modify, "Stop"].copy() + stop_offset
+    )
+
+    return panel_new
+
+
 def merge_duplicate_masses(panel):
     """Check a panel df for duplicate mass values and return a unique mass panel with the
         target names combined

diff --git a/src/toffy/watcher_callbacks.py b/src/toffy/watcher_callbacks.py
@@ -18,6 +18,7 @@
 from toffy.image_stitching import stitch_images
 from toffy.mph_comp import combine_mph_metrics, compute_mph_metrics, visualize_mph
 from toffy.normalize import write_mph_per_mass
+from toffy.panel_utils import modify_panel_ranges
 from toffy.qc_comp import combine_qc_metrics, compute_qc_metrics_direct
 from toffy.qc_metrics_plots import visualize_qc_metrics
 from toffy.settings import QC_COLUMNS
@@ -124,11 +125,14 @@ class FovCallbacks:
     point_name: str
     overwrite: bool
     __panel: pd.DataFrame = field(default=None, init=False)
+    __panel_prof: pd.DataFrame = field(default=None, init=False)
     __fov_data: xr.DataArray = field(default=None, init=False)
+    __fov_data_prof: xr.DataArray = field(default=None, init=False)
 
     def _generate_fov_data(
         self,
         panel: pd.DataFrame,
+        extract_prof: bool,
         intensities=["Au", "chan_39"],
         replace=True,
         time_res=0.0005,
@@ -138,9 +142,13 @@ def _generate_fov_data(
 
         The data and the panel are then cached members of the FovCallbacks object
 
+        Deficient data/panel are always cached; proficient ones only if `extract_prof` is set
+
         Args:
             panel (pd.DataFrame):
                 Panel used for extraction
+            extract_prof (bool):
+                If set, extract proficient data
             intensities (bool | List[str]):
                 Intensities argument for `mibi_bin_tools.bin_files.extract_bin_files`
             replace (bool):
@@ -159,17 +167,37 @@ def _generate_fov_data(
             replace=replace,
             time_res=time_res,
         )
-
         self.__panel = panel
 
-    def extract_tiffs(self, tiff_out_dir: str, panel: pd.DataFrame, **kwargs):
+        if extract_prof:
+            # adds an offset of 0.3 to 'Start' and 'Stop' columns, modifying extraction range
+            # from (-0.3, 0) to (0, 0.3) for proficient extraction
+            panel_prof = modify_panel_ranges(panel, start_offset=0.3, stop_offset=0.3)
+            self.__fov_data_prof = extract_bin_files(
+                data_dir=self.run_folder,
+                out_dir=None,
+                include_fovs=[self.point_name],
+                panel=panel_prof,
+                intensities=intensities,
+                replace=replace,
+                time_res=time_res,
+            )
+            self.__panel_prof = panel_prof
+
+    def extract_tiffs(
+        self, tiff_out_dir: str, panel: pd.DataFrame, extract_prof: bool = True, **kwargs
+    ):
         """Extract tiffs into provided directory, using given panel
 
+        Done for the deficient data, and for the proficient data if `extract_prof` is set
+
         Args:
             tiff_out_dir (str):
                 Path where tiffs are written
             panel (pd.DataFrame):
                 Target mass integration ranges
+            extract_prof (bool):
+                If set, extract mass proficient data
             **kwargs (dict):
                 Additional arguments for `mibi_bin_tools.bin_files.extract_bin_files`.
                 Accepted kwargs are
@@ -199,8 +227,8 @@ def extract_tiffs(self, tiff_out_dir: str, panel: pd.DataFrame, **kwargs):
             unextracted_chans = io_utils.remove_file_extensions(unextracted_chan_tiffs)
             panel = panel[panel["Target"].isin(unextracted_chans)]
 
-        if self.__fov_data is None:
-            self._generate_fov_data(panel, **kwargs)
+        if self.__fov_data is None or self.__fov_data_prof is None:
+            self._generate_fov_data(panel, extract_prof, **kwargs)
 
         intensities = kwargs.get("intensities", ["Au", "chan_39"])
         if any_true(intensities) and type(intensities) is not list:
@@ -214,14 +242,27 @@ def extract_tiffs(self, tiff_out_dir: str, panel: pd.DataFrame, **kwargs):
             intensities=intensities,
         )
 
-    def generate_qc(self, qc_out_dir: str, panel: pd.DataFrame = None, **kwargs):
+        if extract_prof:
+            _write_out(
+                img_data=self.__fov_data_prof[0, :, :, :, :].values,
+                out_dir=tiff_out_dir + "_proficient",
+                fov_name=self.point_name,
+                targets=list(self.__fov_data.channel.values),
+                intensities=intensities,
+            )
+
+    def generate_qc(
+        self, qc_out_dir: str, panel: pd.DataFrame = None, extract_prof: bool = True, **kwargs
+    ):
         """Generates qc metrics from given panel, and saves output to provided directory
 
         Args:
             qc_out_dir (str):
                 Path where qc_metrics are written
             panel (pd.DataFrame):
                 Target mass integration ranges
+            extract_prof (bool):
+                If set, extract mass proficient data
             **kwargs (dict):
                 Additional arguments for `toffy.qc_comp.compute_qc_metrics`. Accepted kwargs are:
 
@@ -234,7 +275,7 @@ def generate_qc(self, qc_out_dir: str, panel: pd.DataFrame = None, **kwargs):
         if self.__fov_data is None:
             if panel is None:
                 raise ValueError("Must provide panel if fov data is not already generated...")
-            self._generate_fov_data(panel, **kwargs)
+            self._generate_fov_data(panel, extract_prof, **kwargs)
 
         qc_metric_paths = [
             os.path.join(qc_out_dir, f"{self.point_name}_nonzero_mean_stats.csv"),
@@ -267,7 +308,6 @@ def generate_mph(self, mph_out_dir, **kwargs):
              - mass_start
              - mass_stop
         """
-
         if not os.path.exists(mph_out_dir):
             os.makedirs(mph_out_dir)
 
@@ -297,7 +337,6 @@ def generate_pulse_heights(self, pulse_out_dir: str, panel: pd.DataFrame = None,
              - start_offset
              - stop_offset
         """
-
         if not os.path.exists(pulse_out_dir):
             os.makedirs(pulse_out_dir)
 
@@ -422,7 +461,6 @@ def run_callback(run_folder: str):
                 # unreachable...
                 raise ValueError(f"Could not locate attribute {run_cb} in RunCallbacks object")
 
-    intermediate_callback = None
     intermediate_callback = None
     if intermediate_callbacks:
 

diff --git a/templates/3a_monitor_MIBI_run.ipynb b/templates/3a_monitor_MIBI_run.ipynb
@@ -30,7 +30,8 @@
     "## Required variables\n",
     "\n",
     " - `run_name` should contain the exact name of the MIBI run that will be monitored\n",
-    " - `panel_path` should point to a panel csv specifying the targets on your panel. You can download your panel online from the Ionpath MibiTracker and then copy the file to the `C:\\\\Users\\\\Customer.ION\\\\Documents\\panel_files` directory (see [panel format](https://github.com/angelolab/toffy#panel-format) for more information)"
+    " - `panel_path` should point to a panel csv specifying the targets on your panel. You can download your panel online from the Ionpath MibiTracker and then copy the file to the `C:\\\\Users\\\\Customer.ION\\\\Documents\\panel_files` directory (see [panel format](https://github.com/angelolab/toffy#panel-format) for more information)\n",
+    " - `extract_prof` indicates whether you want to include mass proficient extraction on top of the default (mass deficient) extraction"
    ]
   },
   {
@@ -43,7 +44,10 @@
     "run_name = 'YYYY-MM-DD_run_name'\n",
     "\n",
     "# path to user panel\n",
-    "panel_path = 'C:\\\\Users\\\\Customer.ION\\\\Documents\\\\panel_files\\\\my_cool_panel.csv'"
+    "panel_path = 'C:\\\\Users\\\\Customer.ION\\\\Documents\\\\panel_files\\\\my_cool_panel.csv'\n",
+    "\n",
+    "# whether to include proficient extraction or not\n",
+    "extract_prof = True"
    ]
   },
   {
@@ -106,6 +110,7 @@
     "    run_callbacks = ['image_stitching'],\n",
     "    intermediate_callbacks = ['plot_qc_metrics', 'plot_mph_metrics'],\n",
     "    fov_callbacks = ['extract_tiffs', 'generate_pulse_heights'],\n",
+    "    extract_prof=extract_prof,\n",
     "    tiff_out_dir=extraction_dir,\n",
     "    qc_out_dir=metrics_data_dir,\n",
     "    mph_out_dir=metrics_data_dir,\n",

diff --git a/templates/3b_extract_images_from_bin.ipynb b/templates/3b_extract_images_from_bin.ipynb
@@ -23,7 +23,7 @@
    "source": [
     "import os\n",
     "\n",
-    "from toffy.panel_utils import load_panel\n",
+    "from toffy.panel_utils import modify_panel_ranges, load_panel\n",
     "from toffy.bin_extraction import extract_missing_fovs"
    ]
   },
@@ -35,7 +35,8 @@
     "## Required variables\n",
     "You will need to define the following two arguments for this notebook.\n",
     " - `run_name` should contain the exact name of the MIBI run to extract from\n",
-    " - `panel_path` should point to a panel csv specifying the targets on your panel. You can download your panel online from the Ionpath MibiTracker and then copy the file to the `C:\\\\Users\\\\Customer.ION\\\\Documents\\panel_files` directory (see [panel format](https://github.com/angelolab/toffy#panel-format) for more information)"
+    " - `panel_path` should point to a panel csv specifying the targets on your panel. You can download your panel online from the Ionpath MibiTracker and then copy the file to the `C:\\\\Users\\\\Customer.ION\\\\Documents\\panel_files` directory (see [panel format](https://github.com/angelolab/toffy#panel-format) for more information)\n",
+    " - `extract_prof` indicates whether you want to include mass proficient extraction on top of the default (mass deficient) extraction"
    ]
   },
   {
@@ -49,7 +50,8 @@
    "source": [
     "# set up args for current run\n",
     "run_name = 'YYYY-MM-DD_run_name'\n",
-    "panel_path = 'C:\\\\Users\\\\Customer.ION\\\\Documents\\\\panel_files\\\\my_cool_panel.csv'"
+    "panel_path = 'C:\\\\Users\\\\Customer.ION\\\\Documents\\\\panel_files\\\\my_cool_panel.csv'\n",
+    "extract_prof = True"
    ]
   },
   {
@@ -78,7 +80,13 @@
     "# path to directory containing extracted files\n",
     "extraction_dir = os.path.join('D:\\\\Extracted_Images', run_name)    \n",
     "if not os.path.exists(extraction_dir):\n",
-    "    os.makedirs(extraction_dir)"
+    "    os.makedirs(extraction_dir)\n",
+    "\n",
+    "if extract_prof:\n",
+    "    # path to directory containing mass-proficient extracted files (for long-term storage)\n",
+    "    extraction_prof_dir = os.path.join('D:\\\\Extracted_Images', run_name + '_proficient')\n",
+    "    if not os.path.exists(extraction_prof_dir):\n",
+    "        os.makedirs(extraction_prof_dir)"
    ]
   },
   {
@@ -97,7 +105,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "extract_missing_fovs(base_dir, extraction_dir, panel)"
+    "# base deficient extraction\n",
+    "extract_missing_fovs(base_dir, extraction_dir, panel)\n",
+    "\n",
+    "# mass proficient extraction (for long-term storage)\n",
+    "if extract_prof:\n",
+    "    extract_missing_fovs(base_dir, extraction_prof_dir, modify_panel_ranges(panel, start_offset=0.3, stop_offset=0.3))"
    ]
   }
  ],
@@ -117,7 +130,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.16"
+   "version": "3.9.16"
   }
  },
  "nbformat": 4,

diff --git a/templates/4b_normalize_image_data.ipynb b/templates/4b_normalize_image_data.ipynb
@@ -137,6 +137,8 @@
     "for fov in fovs:\n",
     "    # generate mph values\n",
     "    mph_file_path = os.path.join(mph_run_dir, fov + '_pulse_heights.csv')\n",
+    "    \n",
+    "    # base MPH normalization\n",
     "    if not os.path.exists(mph_file_path):\n",
     "        normalize.write_mph_per_mass(base_dir=os.path.join(bin_base_dir, run_name), output_dir=mph_run_dir, \n",
     "                                     fov=fov, masses=panel['Mass'].values, start_offset=0.3, stop_offset=0)"

diff --git a/tests/fov_watcher_test.py b/tests/fov_watcher_test.py
@@ -242,6 +242,7 @@ def test_watcher(
     add_blank,
     temp_bin,
 ):
+    print("The watcher start lag is: %d" % watcher_start_lag)
     try:
         with tempfile.TemporaryDirectory() as tmpdir:
             tiff_out_dir = os.path.join(tmpdir, "cb_0", RUN_DIR_NAME)
@@ -378,6 +379,14 @@ def test_watcher(
 
             # extract tiffs check
             validators[0](os.path.join(tmpdir, "cb_0", RUN_DIR_NAME), fovs, bad_fovs)
+            if kwargs["extract_prof"]:
+                validators[0](
+                    os.path.join(tmpdir, "cb_0", RUN_DIR_NAME + "_proficient"), fovs, bad_fovs
+                )
+            else:
+                assert not os.path.exists(
+                    os.path.join(tmpdir, "cb_0", RUN_DIR_NAME) + "_proficient"
+                )
 
             # qc check
             validators[1](os.path.join(tmpdir, "cb_1", RUN_DIR_NAME), fovs, bad_fovs)