Skip to content

Commit

Permalink
Enable Image_PathName_ columns to be used for building image file r…
Browse files Browse the repository at this point in the history
…eferences (#61)

* add test cases

* gather image path data

* add data_image_paths to link image files

* update example

* [pre-commit.ci lite] apply automatic fixes

---------

Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
  • Loading branch information
d33bs and pre-commit-ci-lite[bot] authored Jan 6, 2025
1 parent 7dcfc27 commit 1a5593e
Show file tree
Hide file tree
Showing 6 changed files with 376 additions and 47 deletions.
138 changes: 122 additions & 16 deletions docs/src/examples/cytodataframe_at_a_glance.ipynb

Large diffs are not rendered by default.

35 changes: 34 additions & 1 deletion docs/src/examples/cytodataframe_at_a_glance.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.16.4
# jupytext_version: 1.16.6
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
Expand All @@ -22,6 +22,10 @@
# visual information which can be viewed directly in a Jupyter notebook.

# +
import pathlib

import pandas as pd

from cytodataframe.frame import CytoDataFrame

# create paths for use with CytoDataFrames below
Expand Down Expand Up @@ -95,6 +99,35 @@
]
][:3]

# +
# %%time
# add active paths on the local system to show how CytoDataFrame
# may be used without specifying a context directory for images.
# Note: normally these paths are local to the system where the
# profile data was generated, which often is not the same as the
# system which will be used to analyze the data.
parquet_path = f"{nf1_cellpainting_path}/Plate_2_with_image_data_shrunken.parquet"
nf1_dataset_with_modified_image_paths = pd.read_parquet(path=parquet_path)
# overwrite every row of the three Image_PathName_ columns with a single
# directory: the Plate_2_images folder located beside the parquet file.
nf1_dataset_with_modified_image_paths.loc[
:, ["Image_PathName_DAPI", "Image_PathName_GFP", "Image_PathName_RFP"]
] = f"{pathlib.Path(parquet_path).parent}/Plate_2_images"

# view NF1 Cell Painting data with images and overlaid outlines from masks
# (data_context_dir is intentionally not passed in this example; only the
# mask directory is provided alongside the data itself)
CytoDataFrame(
# note: we can read directly from an existing Pandas DataFrame
data=nf1_dataset_with_modified_image_paths,
data_mask_context_dir=f"{nf1_cellpainting_path}/Plate_2_masks",
)[
[
"Metadata_ImageNumber",
"Metadata_Cells_Number_Object_Number",
"Image_FileName_GFP",
"Image_FileName_RFP",
"Image_FileName_DAPI",
]
][:3]
# -

# %%time
# view nuclear speckles data with images and overlaid outlines from masks
CytoDataFrame(
Expand Down
2 changes: 1 addition & 1 deletion media/coverage-badge.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
181 changes: 157 additions & 24 deletions src/cytodataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def __init__( # noqa: PLR0913
self: CytoDataFrame_type,
data: Union[CytoDataFrame_type, pd.DataFrame, str, pathlib.Path],
data_context_dir: Optional[str] = None,
data_image_paths: Optional[pd.DataFrame] = None,
data_bounding_box: Optional[pd.DataFrame] = None,
data_mask_context_dir: Optional[str] = None,
data_outline_context_dir: Optional[str] = None,
Expand All @@ -82,6 +83,8 @@ def __init__( # noqa: PLR0913
The data source, either a pandas DataFrame or a file path.
data_context_dir (Optional[str]):
Directory context for the image data within the DataFrame.
data_image_paths (Optional[pd.DataFrame]):
Image path data for the image files.
data_bounding_box (Optional[pd.DataFrame]):
Bounding box data for the DataFrame images.
data_mask_context_dir: Optional[str]:
Expand All @@ -108,6 +111,7 @@ def __init__( # noqa: PLR0913
"data_context_dir": (
data_context_dir if data_context_dir is not None else None
),
"data_image_paths": None,
"data_bounding_box": None,
"data_mask_context_dir": (
data_mask_context_dir if data_mask_context_dir is not None else None
Expand Down Expand Up @@ -168,11 +172,17 @@ def __init__( # noqa: PLR0913
else:
super().__init__(data)

if data_bounding_box is None:
self._custom_attrs["data_bounding_box"] = self.get_bounding_box_from_data()
self._custom_attrs["data_bounding_box"] = (
self.get_bounding_box_from_data()
if data_bounding_box is None
else data_bounding_box
)

else:
self._custom_attrs["data_bounding_box"] = data_bounding_box
self._custom_attrs["data_image_paths"] = (
self.get_image_paths_from_data(image_cols=self.find_image_columns())
if data_image_paths is None
else data_image_paths
)

def __getitem__(self: CytoDataFrame_type, key: Union[int, str]) -> Any: # noqa: ANN401
"""
Expand All @@ -196,6 +206,7 @@ def __getitem__(self: CytoDataFrame_type, key: Union[int, str]) -> Any: # noqa:
return CytoDataFrame(
super().__getitem__(key),
data_context_dir=self._custom_attrs["data_context_dir"],
data_image_paths=self._custom_attrs["data_image_paths"],
data_bounding_box=self._custom_attrs["data_bounding_box"],
data_mask_context_dir=self._custom_attrs["data_mask_context_dir"],
data_outline_context_dir=self._custom_attrs["data_outline_context_dir"],
Expand Down Expand Up @@ -233,6 +244,7 @@ def _wrap_method(
result = CytoDataFrame(
result,
data_context_dir=self._custom_attrs["data_context_dir"],
data_image_paths=self._custom_attrs["data_image_paths"],
data_bounding_box=self._custom_attrs["data_bounding_box"],
data_mask_context_dir=self._custom_attrs["data_mask_context_dir"],
data_outline_context_dir=self._custom_attrs["data_outline_context_dir"],
Expand Down Expand Up @@ -381,8 +393,25 @@ def is_notebook_or_lab() -> bool:
except NameError:
return False

def find_image_columns(self: CytoDataFrame_type) -> bool:
def find_image_columns(self: CytoDataFrame_type) -> List[str]:
"""
Find columns containing image file names.
This method searches for columns in the DataFrame
that contain image file names with extensions .tif
or .tiff (case insensitive).
Returns:
List[str]:
A list of column names that contain
image file names.
"""
# build a pattern to match image file names
pattern = r".*\.(tif|tiff)$"

# search for columns containing image file names
# based on pattern above.
return [
column
for column in self.columns
Expand All @@ -394,6 +423,64 @@ def find_image_columns(self: CytoDataFrame_type) -> bool:
.any()
]

def get_image_paths_from_data(
    self: CytoDataFrame_type, image_cols: List[str]
) -> Optional[pd.DataFrame]:
    """
    Gather data containing image path names
    (the directory storing the images but not the file
    names). We do this by seeking the pattern:
    Image_FileName_X --> Image_PathName_X.

    Args:
        image_cols: List[str]:
            A list of column names that contain
            image file names.

    Returns:
        Optional[pd.DataFrame]:
            The subset of this data limited to the
            Image_PathName_X columns which correspond to
            the provided image file name columns, or None
            when no such columns are present.
    """

    # Derive each candidate path column name only once per file name
    # column. A column name without "FileName" is returned unchanged by
    # str.replace; such a column has no path counterpart and must not be
    # mistaken for one (it would otherwise match itself in self.columns).
    image_path_columns = [
        path_col
        for col in image_cols
        if (path_col := col.replace("FileName", "PathName")) != col
        and path_col in self.columns
    ]

    # None (instead of an empty frame) signals that no path data exists.
    return self.filter(items=image_path_columns) if image_path_columns else None

def find_image_path_columns(
    self: CytoDataFrame_type, image_cols: List[str], all_cols: List[str]
) -> Dict[str, str]:
    """
    Find columns containing image path names
    (the directory storing the images but not the file
    names). We do this by seeking the pattern:
    Image_FileName_X --> Image_PathName_X.

    Args:
        image_cols: List[str]:
            A list of column names that contain
            image file names.
        all_cols: List[str]:
            All column names to search within. Any container
            supporting membership tests may be used (for
            example, a list or a pandas Index).

    Returns:
        Dict[str, str]:
            A mapping of each image file name column to its
            image path name column. Only file name columns
            whose path name column appears in all_cols are
            included; the mapping is empty when none match.
    """

    # NOTE(review): a column name which lacks "FileName" is left unchanged
    # by str.replace, so it maps to itself whenever it appears in all_cols.
    return {
        col: path_col
        for col in image_cols
        # derive the candidate path column name once per column
        if (path_col := col.replace("FileName", "PathName")) in all_cols
    }

def search_for_mask_or_outline( # noqa: PLR0913, PLR0911
self: CytoDataFrame_type,
data_value: str,
Expand Down Expand Up @@ -471,6 +558,7 @@ def process_image_data_as_html_display(
self: CytoDataFrame_type,
data_value: Any, # noqa: ANN401
bounding_box: Tuple[int, int, int, int],
image_path: Optional[str] = None,
) -> str:
"""
Process the image data based on the provided data value
Expand All @@ -489,38 +577,55 @@ def process_image_data_as_html_display(
The HTML image display string, or the unmodified data
value if the image cannot be processed.
"""

candidate_path = None
# Get the pattern map for segmentation file regex
pattern_map = self._custom_attrs.get("segmentation_file_regex")

# Step 1: Find the candidate file if the data value is not already a file
if not pathlib.Path(data_value).is_file():
# determine if we have a file from the path (dir) + filename
if (
self._custom_attrs["data_context_dir"] is None
and image_path is not None
and (
existing_image_from_path := pathlib.Path(
f"{image_path}/{data_value}"
)
).is_file()
):
candidate_path = existing_image_from_path

# Search for the data value in the data context directory
if candidate_paths := list(
pathlib.Path(self._custom_attrs["data_context_dir"]).rglob(data_value)
elif self._custom_attrs["data_context_dir"] is not None and (
candidate_paths := list(
pathlib.Path(self._custom_attrs["data_context_dir"]).rglob(
data_value
)
)
):
# If a candidate file is found, use the first one
candidate_path = candidate_paths[0]
orig_image_array = skimage.io.imread(candidate_path)

# Adjust the image with image adjustment callable
# or adaptive histogram equalization
if self._custom_attrs["image_adjustment"] is not None:
orig_image_array = self._custom_attrs["image_adjustment"](
orig_image_array
)
else:
orig_image_array = adjust_with_adaptive_histogram_equalization(
orig_image_array
)

# Normalize to 0-255 for image saving
orig_image_array = img_as_ubyte(orig_image_array)

else:
# If no candidate file is found, return the original data value
return data_value

# read the image as an array
orig_image_array = skimage.io.imread(candidate_path)

# Adjust the image with image adjustment callable
# or adaptive histogram equalization
if self._custom_attrs["image_adjustment"] is not None:
orig_image_array = self._custom_attrs["image_adjustment"](orig_image_array)
else:
orig_image_array = adjust_with_adaptive_histogram_equalization(
orig_image_array
)

# Normalize to 0-255 for image saving
orig_image_array = img_as_ubyte(orig_image_array)

prepared_image = None
# Step 2: Search for a mask
prepared_image = self.search_for_mask_or_outline(
Expand Down Expand Up @@ -632,8 +737,6 @@ def _repr_html_(
max_cols = get_option("display.max_columns")
show_dimensions = get_option("display.show_dimensions")

# determine if we have image_cols to display
if image_cols := self.find_image_columns():
# re-add bounding box cols if they are no longer available as in cases
# of masking or accessing various pandas attr's
bounding_box_externally_joined = False
Expand All @@ -647,6 +750,25 @@ def _repr_html_(
else:
data = self.copy()

# re-add image path (dirs for images) cols if they are no
# longer available as in cases of masking or accessing
# various pandas attr's
image_paths_externally_joined = False

if self._custom_attrs["data_image_paths"] is not None and not all(
col in self.columns.tolist()
for col in self._custom_attrs["data_image_paths"].columns.tolist()
):
data = data.join(other=self._custom_attrs["data_image_paths"])
image_paths_externally_joined = True

# determine if we have image_cols to display
if image_cols := self.find_image_columns():
# attempt to find the image path columns
image_path_cols = self.find_image_path_columns(
image_cols=image_cols, all_cols=data.columns
)

# gather indices which will be displayed based on pandas configuration
display_indices = self.get_displayed_rows()

Expand Down Expand Up @@ -691,6 +813,12 @@ def _repr_html_(
)
],
),
# set the image path based on the image_path cols.
image_path=(
row[image_path_cols[image_col]]
if image_path_cols is not None and image_path_cols != {}
else None
),
),
axis=1,
)
Expand All @@ -700,6 +828,11 @@ def _repr_html_(
self._custom_attrs["data_bounding_box"].columns.tolist(), axis=1
)

if image_paths_externally_joined:
data = data.drop(
self._custom_attrs["data_image_paths"].columns.tolist(), axis=1
)

formatter = fmt.DataFrameFormatter(
data,
columns=None,
Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def fixture_cytotable_nuclear_speckle_data_parquet():
return "tests/data/cytotable/nuclear_speckles/test_slide1_converted.parquet"


@pytest.fixture(name="cytotable_pediatric_cancer_atlas_parquet_parquet")
@pytest.fixture(name="cytotable_pediatric_cancer_atlas_parquet")
def fixture_pediatric_cancer_atlas_data_parquet():
"""
Return df to test CytoTable pediatric cancer atlas data through
Expand Down
Loading

0 comments on commit 1a5593e

Please sign in to comment.