Skip to content

Commit

Permalink
add loading and preprocessing stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
ianhi committed Jul 25, 2023
1 parent 7c4cd2f commit 456fbf0
Show file tree
Hide file tree
Showing 3 changed files with 241 additions and 1 deletion.
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ classifiers = [
"Programming Language :: Python :: 3.10",
]
dynamic = ["version"]
dependencies = []
dependencies = [
"scipy",
]

# extras
# https://peps.python.org/pep-0621/#dependencies-optional-dependencies
Expand Down
115 changes: 115 additions & 0 deletions src/raman_analysis/loading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
from typing import Iterable, Union

import numpy as np
import pandas as pd
import xarray as xr


def ds2df(ds: xr.Dataset, fov: int, cell_index_start: int = 0, filename=None) -> pd.DataFrame:
"""
Convert a single dataset into a dataframe.
Parameters
----------
ds : xr.Dataset
The dataset to convert
fov : int
The fov index.
cell_index_start : int
The offset of the cell index. This is necessary
if you want to easily combine multiple datasets into a
single dataframe down the road
filename : str, optional
The filename to add to an 'fname' column.
Returns
-------
df : dataframe
"""
pts_cell = ds.pts_cell
mult = ds.attrs["multiplier"]
indx = pd.MultiIndex.from_product(
(np.arange(int(pts_cell.shape[0] / mult)) + cell_index_start, np.arange(mult)),
names=("cell", "sub-cell"),
)
df = pd.DataFrame(ds["cell_raman"][0], index=indx)
df["x"] = ds["cell_points"][:, 0]
df["y"] = ds["cell_points"][:, 1]

cell_points = ds["cell_points"] * 2048
bkd_points = ds["bkd_points"] * 2048
cell_raman = ds["cell_raman"] - 608
bkd_raman = ds["bkd_raman"] - 608

thres = 140

cell_com = cell_points.to_numpy().astype(int)
gfp_int = np.asarray([ds["img"][1, 0, x[0], x[1]].values for x in cell_com])

df["gfp_int"] = gfp_int
df["fov"] = fov
if filename is not None:
df["fname"] = filename
return df




def glob2df(
files: Union[Iterable[str], str],
conditions,
threshold: float,
well_number: int = 0,
cell_index_start: int = 0,
verbose: bool = True,
) -> pd.DataFrame:
"""
Parameters
----------
files : iterable, str
An iterable of filenames, or a glob string to use to find the files.
conditions : tuple(str, str)
The two conditions in this dataset, should be listed
in order of (undyed condition, dyed condition)
threshold : float
The threshold gfp value for determining dyed vs undyed cell
well_number : int, default 0
What well number to put in the dataframe
cell_index_start : int
The offset of the cell index. This is necessary
if you want to easily combine multiple datasets into a
single dataframe down the road
verbose : bool, default True
Whether to print out the file names as they are opened.
Returns
-------
df : pd.DataFrame
images : xr.DataArray
"""
if isinstance(files, str):
files = glob(files)
dfs = []
images = []
for fov, f in enumerate(files):
if verbose:
print(f)
ds = xr.open_dataset(f)
dfs.append(ds2df(ds, fov, cell_index_start, filename=f))
cell_index_start = dfs[-1].index.max()[0] + 1
images.append(ds["img"])
images = xr.concat(images, dim="fov")
df = pd.concat(dfs)

# determine condition and update multiindex
cond_idx = [((df["gfp_int"] > threshold).groupby("cell").mean() > 0.5).astype(int)]
df["cond"] = np.broadcast_to(np.atleast_2d(np.array(conditions)).T, (2, 13))[
tuple(cond_idx)
].ravel()
# df["cond"] = np.array([[condition] * 13, ["N"] * 13])[tuple(cond_idx)].ravel()
df.set_index("cond", append=True, inplace=True)
df = df.reorder_levels(
["cond", "cell", "sub-cell"],
)
df["well"] = well_number
return df, images
123 changes: 123 additions & 0 deletions src/raman_analysis/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from __future__ import annotations

import numpy as np
import pandas as pd
from scipy.signal import find_peaks

__all__ = ["find_cosmic_rays", "remove_cosmic_rays", "group_spectra_points"]


def find_cosmic_rays(
spectra: np.ndarray, ignore_region: tuple[int, int] = (200, 400), **kwargs
) -> np.ndarray:
"""
Find the indices of cosmic rays.
Parameters
----------
spectra : (N, 1340)
The arraylike of spectr to search through
ignore_region : (int, int)
A region to not worry about cosmic rays in.
**kwargs :
Passed to scipy.signal.find_peaks
Returns
-------
idx : (M, 2) np.ndarray
The indices of which spectra+pixels have detected cosmic rays.
The first column contains which spectra, the second which pixel.
"""
spectra = np.atleast_2d(spectra)
idx = []
min_ignore = min(ignore_region)
max_ignore = max(ignore_region)
threshold = kwargs.pop("threshold", 75)
prominence = kwargs.pop("prominence", 100)
for s, spec in enumerate(spectra):
peaks, _ = find_peaks(
spec, threshold=threshold, prominence=prominence, **kwargs
)
for p in peaks:
if min_ignore < p < max_ignore:
continue
idx.append((s, p))
return np.asarray(idx)


def remove_cosmic_rays(df: pd.DataFrame, plot: bool = False, **kwargs) -> pd.DataFrame:
"""
Process a dataframe by removing all spectra with detected cosmic rays.
Parameters
----------
df : pd.DataFrame
The dataframe with spectra components as the first 1340 columns
plot : bool
Whether to generate a plot showing which spectra were removed.
**kwargs
Passed to scipy.signal.find_peaks
Returns
-------
pd.DataFrame
The spectra dataframe with spectra with cosmic rays removed.
"""
spectra = df.iloc[:, :1340].values

cosmic_idx = find_cosmic_rays(spectra, **kwargs)
keep_idx = np.setdiff1d(np.arange(spectra.shape[0]), cosmic_idx)
if plot:
import matplotlib.pyplot as plt

unique_cosmic, offset_idx = np.unique(cosmic_idx[:, 0], return_inverse=True)
offsets = np.arange(unique_cosmic.shape[0]) * 100
fig, axs = plt.subplots(1, 2, figsize=(10, 6))
axs[0].set_title("Post Removal")
axs[0].plot(spectra[keep_idx].T, alpha=0.75)

axs[1].plot(spectra[unique_cosmic].T + offsets)
axs[1].plot(
cosmic_idx[:, 1],
spectra[cosmic_idx[:, 0], cosmic_idx[:, 1]] + offsets[offset_idx],
"rx",
markersize=10,
label="detected cosmic rays",
mew=5,
)
axs[1].legend()

axs[1].set_title("Post removal - with offset")
return df.iloc[keep_idx]


def group_spectra_points(df, multiplier: int) -> pd.DataFrame:
"""
Add which point each spectra is from to the multiindex.
Parameters
----------
df : pd.DataFrame
The raman dataframe. Should be organized as T, P, type
multiplier : int
How many subspectra per point
Returns
-------
pd.DataFrame
the original dataframe with pt_label appended to multiindex
"""
offset = 0
for pos, pos_df in df.groupby(level=1):
for t, tp_df in pos_df.groupby(level=0):
n_pts = int(len(tp_df) / multiplier)
pt_labels = (
np.broadcast_to(
np.arange(n_pts, dtype=int)[:, None], (n_pts, multiplier)
).ravel()
+ offset
)
df.loc[(t, pos), "pt"] = pt_labels
offset = df.loc[pos, "pt"].max()
df["pt"] = df["pt"].astype(int)
return df.set_index("pt", append=True)

0 comments on commit 456fbf0

Please sign in to comment.