Enable Image_PathName_ columns to be used for building image file references (#61)

* add test cases * gather image path data * add data_image_paths to link image files * update example * [pre-commit.ci lite] apply automatic fixes --------- Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
WayScience · Jan 6, 2025 · 1a5593e · 1a5593e
1 parent 7dcfc27
commit 1a5593e
Show file tree

Hide file tree

Showing 6 changed files with 376 additions and 47 deletions.
diff --git a/docs/src/examples/cytodataframe_at_a_glance.ipynb b/docs/src/examples/cytodataframe_at_a_glance.ipynb
diff --git a/docs/src/examples/cytodataframe_at_a_glance.py b/docs/src/examples/cytodataframe_at_a_glance.py
@@ -5,7 +5,7 @@
 #       extension: .py
 #       format_name: light
 #       format_version: '1.5'
-#       jupytext_version: 1.16.4
+#       jupytext_version: 1.16.6
 #   kernelspec:
 #     display_name: Python 3 (ipykernel)
 #     language: python
@@ -22,6 +22,10 @@
 # visual information which can be viewed directly in a Jupyter notebook.
 
 # +
+import pathlib
+
+import pandas as pd
+
 from cytodataframe.frame import CytoDataFrame
 
 # create paths for use with CytoDataFrames below
@@ -95,6 +99,35 @@
     ]
 ][:3]
 
+# +
+# %%time
+# add active paths on the local system to show how CytoDataFrame
+# may be used without specifying a context directory for images.
+# Note: normally these paths are local to the system where the
+# profile data was generated, which often is not the same as the
+# system which will be used to analyze the data.
+parquet_path = f"{nf1_cellpainting_path}/Plate_2_with_image_data_shrunken.parquet"
+nf1_dataset_with_modified_image_paths = pd.read_parquet(path=parquet_path)
+nf1_dataset_with_modified_image_paths.loc[
+    :, ["Image_PathName_DAPI", "Image_PathName_GFP", "Image_PathName_RFP"]
+] = f"{pathlib.Path(parquet_path).parent}/Plate_2_images"
+
+# view NF1 Cell Painting data with images and overlaid outlines from masks
+CytoDataFrame(
+    # note: we can read directly from an existing Pandas DataFrame
+    data=nf1_dataset_with_modified_image_paths,
+    data_mask_context_dir=f"{nf1_cellpainting_path}/Plate_2_masks",
+)[
+    [
+        "Metadata_ImageNumber",
+        "Metadata_Cells_Number_Object_Number",
+        "Image_FileName_GFP",
+        "Image_FileName_RFP",
+        "Image_FileName_DAPI",
+    ]
+][:3]
+# -
+
 # %%time
 # view nuclear speckles data with images and overlaid outlines from masks
 CytoDataFrame(

diff --git a/media/coverage-badge.svg b/media/coverage-badge.svg
diff --git a/src/cytodataframe/frame.py b/src/cytodataframe/frame.py
@@ -67,6 +67,7 @@ def __init__(  # noqa: PLR0913
         self: CytoDataFrame_type,
         data: Union[CytoDataFrame_type, pd.DataFrame, str, pathlib.Path],
         data_context_dir: Optional[str] = None,
+        data_image_paths: Optional[pd.DataFrame] = None,
         data_bounding_box: Optional[pd.DataFrame] = None,
         data_mask_context_dir: Optional[str] = None,
         data_outline_context_dir: Optional[str] = None,
@@ -82,6 +83,8 @@ def __init__(  # noqa: PLR0913
                 The data source, either a pandas DataFrame or a file path.
             data_context_dir (Optional[str]):
                 Directory context for the image data within the DataFrame.
+            data_image_paths (Optional[pd.DataFrame]):
+                Image path data for the image files.
             data_bounding_box (Optional[pd.DataFrame]):
                 Bounding box data for the DataFrame images.
             data_mask_context_dir: Optional[str]:
@@ -108,6 +111,7 @@ def __init__(  # noqa: PLR0913
             "data_context_dir": (
                 data_context_dir if data_context_dir is not None else None
             ),
+            "data_image_paths": None,
             "data_bounding_box": None,
             "data_mask_context_dir": (
                 data_mask_context_dir if data_mask_context_dir is not None else None
@@ -168,11 +172,17 @@ def __init__(  # noqa: PLR0913
         else:
             super().__init__(data)
 
-        if data_bounding_box is None:
-            self._custom_attrs["data_bounding_box"] = self.get_bounding_box_from_data()
+        self._custom_attrs["data_bounding_box"] = (
+            self.get_bounding_box_from_data()
+            if data_bounding_box is None
+            else data_bounding_box
+        )
 
-        else:
-            self._custom_attrs["data_bounding_box"] = data_bounding_box
+        self._custom_attrs["data_image_paths"] = (
+            self.get_image_paths_from_data(image_cols=self.find_image_columns())
+            if data_image_paths is None
+            else data_image_paths
+        )
 
     def __getitem__(self: CytoDataFrame_type, key: Union[int, str]) -> Any:  # noqa: ANN401
         """
@@ -196,6 +206,7 @@ def __getitem__(self: CytoDataFrame_type, key: Union[int, str]) -> Any:  # noqa:
             return CytoDataFrame(
                 super().__getitem__(key),
                 data_context_dir=self._custom_attrs["data_context_dir"],
+                data_image_paths=self._custom_attrs["data_image_paths"],
                 data_bounding_box=self._custom_attrs["data_bounding_box"],
                 data_mask_context_dir=self._custom_attrs["data_mask_context_dir"],
                 data_outline_context_dir=self._custom_attrs["data_outline_context_dir"],
@@ -233,6 +244,7 @@ def _wrap_method(
             result = CytoDataFrame(
                 result,
                 data_context_dir=self._custom_attrs["data_context_dir"],
+                data_image_paths=self._custom_attrs["data_image_paths"],
                 data_bounding_box=self._custom_attrs["data_bounding_box"],
                 data_mask_context_dir=self._custom_attrs["data_mask_context_dir"],
                 data_outline_context_dir=self._custom_attrs["data_outline_context_dir"],
@@ -381,8 +393,25 @@ def is_notebook_or_lab() -> bool:
         except NameError:
             return False
 
-    def find_image_columns(self: CytoDataFrame_type) -> bool:
+    def find_image_columns(self: CytoDataFrame_type) -> List[str]:
+        """
+        Find columns containing image file names.
+
+        This method searches for columns in the DataFrame
+        that contain image file names with extensions .tif
+        or .tiff (case insensitive).
+
+        Returns:
+            List[str]:
+                A list of column names that contain
+                image file names.
+
+        """
+        # build a pattern to match image file names
         pattern = r".*\.(tif|tiff)$"
+
+        # search for columns containing image file names
+        # based on pattern above.
         return [
             column
             for column in self.columns
@@ -394,6 +423,64 @@ def find_image_columns(self: CytoDataFrame_type) -> bool:
             .any()
         ]
 
+    def get_image_paths_from_data(
+        self: CytoDataFrame_type, image_cols: List[str]
+    ) -> Dict[str, str]:
+        """
+        Gather data containing image path names
+        (the directory storing the images but not the file
+        names). We do this by seeking the pattern:
+        Image_FileName_X --> Image_PathName_X.
+
+        Args:
+            image_cols: List[str]:
+                A list of column names that contain
+                image file names.
+
+        Returns:
+            Optional[pd.DataFrame]:
+                The Image_PathName_X columns found in the data,
+                or None when no such columns exist.
+
+        """
+
+        image_path_columns = [
+            col.replace("FileName", "PathName")
+            for col in image_cols
+            if col.replace("FileName", "PathName") in self.columns
+        ]
+
+        return self.filter(items=image_path_columns) if image_path_columns else None
+
+    def find_image_path_columns(
+        self: CytoDataFrame_type, image_cols: List[str], all_cols: List[str]
+    ) -> Dict[str, str]:
+        """
+        Find columns containing image path names
+        (the directory storing the images but not the file
+        names). We do this by seeking the pattern:
+        Image_FileName_X --> Image_PathName_X.
+
+        Args:
+            image_cols: List[str]:
+                A list of column names that contain
+                image file names.
+            all_cols: List[str]:
+                A list of all column names.
+
+        Returns:
+            Dict[str, str]:
+                A mapping of each image file name column to
+                its corresponding image path name column.
+
+        """
+
+        return {
+            col: col.replace("FileName", "PathName")
+            for col in image_cols
+            if col.replace("FileName", "PathName") in all_cols
+        }
+
     def search_for_mask_or_outline(  # noqa: PLR0913, PLR0911
         self: CytoDataFrame_type,
         data_value: str,
@@ -471,6 +558,7 @@ def process_image_data_as_html_display(
         self: CytoDataFrame_type,
         data_value: Any,  # noqa: ANN401
         bounding_box: Tuple[int, int, int, int],
+        image_path: Optional[str] = None,
     ) -> str:
         """
         Process the image data based on the provided data value
@@ -489,38 +577,55 @@ def process_image_data_as_html_display(
                 The HTML image display string, or the unmodified data
                 value if the image cannot be processed.
         """
+
         candidate_path = None
         # Get the pattern map for segmentation file regex
         pattern_map = self._custom_attrs.get("segmentation_file_regex")
 
         # Step 1: Find the candidate file if the data value is not already a file
         if not pathlib.Path(data_value).is_file():
+            # determine if we have a file from the path (dir) + filename
+            if (
+                self._custom_attrs["data_context_dir"] is None
+                and image_path is not None
+                and (
+                    existing_image_from_path := pathlib.Path(
+                        f"{image_path}/{data_value}"
+                    )
+                ).is_file()
+            ):
+                candidate_path = existing_image_from_path
+
             # Search for the data value in the data context directory
-            if candidate_paths := list(
-                pathlib.Path(self._custom_attrs["data_context_dir"]).rglob(data_value)
+            elif self._custom_attrs["data_context_dir"] is not None and (
+                candidate_paths := list(
+                    pathlib.Path(self._custom_attrs["data_context_dir"]).rglob(
+                        data_value
+                    )
+                )
             ):
                 # If a candidate file is found, use the first one
                 candidate_path = candidate_paths[0]
-                orig_image_array = skimage.io.imread(candidate_path)
-
-                # Adjust the image with image adjustment callable
-                # or adaptive histogram equalization
-                if self._custom_attrs["image_adjustment"] is not None:
-                    orig_image_array = self._custom_attrs["image_adjustment"](
-                        orig_image_array
-                    )
-                else:
-                    orig_image_array = adjust_with_adaptive_histogram_equalization(
-                        orig_image_array
-                    )
-
-                # Normalize to 0-255 for image saving
-                orig_image_array = img_as_ubyte(orig_image_array)
 
             else:
                 # If no candidate file is found, return the original data value
                 return data_value
 
+        # read the image as an array
+        orig_image_array = skimage.io.imread(candidate_path)
+
+        # Adjust the image with image adjustment callable
+        # or adaptive histogram equalization
+        if self._custom_attrs["image_adjustment"] is not None:
+            orig_image_array = self._custom_attrs["image_adjustment"](orig_image_array)
+        else:
+            orig_image_array = adjust_with_adaptive_histogram_equalization(
+                orig_image_array
+            )
+
+        # Normalize to 0-255 for image saving
+        orig_image_array = img_as_ubyte(orig_image_array)
+
         prepared_image = None
         # Step 2: Search for a mask
         prepared_image = self.search_for_mask_or_outline(
@@ -632,8 +737,6 @@ def _repr_html_(
             max_cols = get_option("display.max_columns")
             show_dimensions = get_option("display.show_dimensions")
 
-            # determine if we have image_cols to display
-        if image_cols := self.find_image_columns():
             # re-add bounding box cols if they are no longer available as in cases
             # of masking or accessing various pandas attr's
             bounding_box_externally_joined = False
@@ -647,6 +750,25 @@ def _repr_html_(
             else:
                 data = self.copy()
 
+            # re-add image path (dirs for images) cols if they are no
+            # longer available as in cases of masking or accessing
+            # various pandas attr's
+            image_paths_externally_joined = False
+
+            if self._custom_attrs["data_image_paths"] is not None and not all(
+                col in self.columns.tolist()
+                for col in self._custom_attrs["data_image_paths"].columns.tolist()
+            ):
+                data = data.join(other=self._custom_attrs["data_image_paths"])
+                image_paths_externally_joined = True
+
+                # determine if we have image_cols to display
+            if image_cols := self.find_image_columns():
+                # attempt to find the image path columns
+                image_path_cols = self.find_image_path_columns(
+                    image_cols=image_cols, all_cols=data.columns
+                )
+
             # gather indices which will be displayed based on pandas configuration
             display_indices = self.get_displayed_rows()
 
@@ -691,6 +813,12 @@ def _repr_html_(
                                 )
                             ],
                         ),
+                        # set the image path based on the image_path cols.
+                        image_path=(
+                            row[image_path_cols[image_col]]
+                            if image_path_cols is not None and image_path_cols != {}
+                            else None
+                        ),
                     ),
                     axis=1,
                 )
@@ -700,6 +828,11 @@ def _repr_html_(
                     self._custom_attrs["data_bounding_box"].columns.tolist(), axis=1
                 )
 
+            if image_paths_externally_joined:
+                data = data.drop(
+                    self._custom_attrs["data_image_paths"].columns.tolist(), axis=1
+                )
+
             formatter = fmt.DataFrameFormatter(
                 data,
                 columns=None,

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -42,7 +42,7 @@ def fixture_cytotable_nuclear_speckle_data_parquet():
     return "tests/data/cytotable/nuclear_speckles/test_slide1_converted.parquet"
 
 
-@pytest.fixture(name="cytotable_pediatric_cancer_atlas_parquet_parquet")
+@pytest.fixture(name="cytotable_pediatric_cancer_atlas_parquet")
 def fixture_pediatric_cancer_atlas_data_parquet():
     """
     Return df to test CytoTable pediatric cancer atlas data through