From 3eedb63f56d4d50b9530a22d14504a81820cb03d Mon Sep 17 00:00:00 2001 From: Otavio Napoli Date: Wed, 29 May 2024 02:55:18 +0000 Subject: [PATCH] Added tabular and csv readers. Also added a readme.md --- .gitignore | 3 +- minerva/data/README.md | 10 ++ minerva/data/readers/csv_reader.py | 17 ++++ minerva/data/readers/tabular_reader.py | 128 +++++++++++++++++++++++++ 4 files changed, 156 insertions(+), 2 deletions(-) create mode 100644 minerva/data/README.md create mode 100644 minerva/data/readers/csv_reader.py create mode 100644 minerva/data/readers/tabular_reader.py diff --git a/.gitignore b/.gitignore index f289fad..d37f9a9 100644 --- a/.gitignore +++ b/.gitignore @@ -169,5 +169,4 @@ cython_debug/ #.idea/ experiments/ logs/ -lightning_logs/ -data/ \ No newline at end of file +lightning_logs/ \ No newline at end of file diff --git a/minerva/data/README.md b/minerva/data/README.md new file mode 100644 index 0000000..5f19123 --- /dev/null +++ b/minerva/data/README.md @@ -0,0 +1,10 @@ +# Readers + +| **Reader** | **Data Unit** | **Order** | **Class** | **Observations** | +|-------------------- |----------------------------------------------------------------------------------- |--------------------- |-------------------------------------------------------------- |------------------------------------------------------------------------------------------------------------------------------------ | +| PNGReader | Each unit of data is an image file (PNG) inside the root folder | Lexicographical order | minerva.data.readers.png_reader.PNGReader | File extensions: .png | +| TIFFReader | Each unit of data is an image file (TIFF) inside the root folder | Lexicographical order | minerva.data.readers.tiff_reader.TiffReader | File extensions: .tif and .tiff | +| TabularReader | Each unit of data is the i-th row in a dataframe, with columns filtered | Dataframe rows | minerva.data.readers.tabular_reader.TabularReader | Supports pandas DataFrames | +| CSVReader | Each unit of data is 
the i-th row in a CSV file, with columns filtered | CSV rows | minerva.data.readers.csv_reader.CSVReader | If dataframe is already open, use TabularReader instead. This class will open and load the CSV file and pass it to a TabularReader | +| PatchedArrayReader | Each unit of data is a submatrix of specified shape inside an n-dimensional array | Dimension order | minerva.data.readers.patched_array_reader.PatchedArrayReader | Supports any data with ndarray protocol (tensor, xarray, zarr) | +| PatchedZarrReader | Each unit of data is a submatrix of specified shape inside a Zarr Array | Dimension order | minerva.data.readers.zarr_reader.ZarrArrayReader | Open zarr file in lazy mode and pass it to PatchedArrayReader | \ No newline at end of file diff --git a/minerva/data/readers/csv_reader.py b/minerva/data/readers/csv_reader.py new file mode 100644 index 0000000..e03170e --- /dev/null +++ b/minerva/data/readers/csv_reader.py @@ -0,0 +1,17 @@ +from typing import Union + +import pandas as pd +from minerva.data.readers.tabular_reader import TabularReader + +class CSVReader(TabularReader): + def __init__( + self, + path: str, + columns_to_select: Union[str, list[str]], + cast_to: str = None, + data_shape: tuple[int, ...] = None, + ): + df = pd.read_csv(path) + super().__init__(df, columns_to_select, cast_to, data_shape) + + diff --git a/minerva/data/readers/tabular_reader.py b/minerva/data/readers/tabular_reader.py new file mode 100644 index 0000000..d9d2706 --- /dev/null +++ b/minerva/data/readers/tabular_reader.py @@ -0,0 +1,128 @@ +from pathlib import Path +from typing import Union + +import numpy as np +import re +import pandas as pd +from minerva.data.readers.reader import _Reader + +class TabularReader(_Reader): + def __init__( + self, + df: pd.DataFrame, + columns_to_select: Union[str, list[str]], + cast_to: str = None, + data_shape: tuple[int, ...] = None, + ): + """Reader to select columns from a DataFrame and return them as a NumPy + array. 
The DataFrame is indexed by the row number. Each row of the + DataFrame is considered as a sample. Thus, the __getitem__ method will + return the columns of the DataFrame at the specified index as a NumPy + array. + + Parameters + ---------- + df : pd.DataFrame + The DataFrame to select the columns from. The DataFrame should have + the columns that are specified in the `columns_to_select` parameter. + columns_to_select : Union[str, list[str]] + A string or a list of strings used to select the columns from the DataFrame. + The string can be a regular expression pattern or a column name. The columns + that match the pattern will be selected. + cast_to : str, optional + Cast the selected columns to the specified data type. If None, the + data type of the columns will not be changed. (default is None) + data_shape : tuple[int, ...], optional + The shape of the data to be returned. If None, the data will be + returned as a 1D array. If provided, the data will be reshaped to + the specified shape. (default is None) + """ + self.df = df + self.columns_to_select = columns_to_select + self.cast_to = cast_to + self.data_shape = data_shape + + if isinstance(self.columns_to_select, str): + self.columns_to_select = [self.columns_to_select] + + def __getitem__(self, index: int) -> np.ndarray: + """Return the columns of the DataFrame at the specified row index as a NumPy + array. The columns are selected based on the `self.columns_to_select`. + + Parameters + ---------- + index : int + The row index to select the columns from the DataFrame. + + Returns + ------- + np.ndarray + The selected columns from the row as a NumPy array. 
+ """ + columns = list(self.df.columns) + + # Filter valid columns based on columns_to_select list + valid_columns = [] + for pattern in self.columns_to_select: + valid_columns.extend( + [col for col in columns if re.match(pattern, col)] + ) + + # Select the elements and return + row = self.df.iloc[index][valid_columns] + row = row.to_numpy() + + if self.cast_to is not None: + row = row.astype(self.cast_to) + + if self.data_shape is not None: + row = row.reshape(self.data_shape) + + return row + + def __len__(self) -> int: + """Return the number of samples in the DataFrame. The number of samples + is equal to the number of rows in the DataFrame. + + Returns + ------- + int + The number of samples in the DataFrame. + """ + return len(self.df) + + +# def main(): +# df = pd.DataFrame({ +# "accel-x-0": np.array(range(10)), +# "accel-x-1": np.array(range(10)) + 10, +# "accel-x-2": np.array(range(10)) + 100, +# "accel-x-3": np.array(range(10)) + 1000, + +# "accel-y-0": np.array(range(10)), +# "accel-y-1": np.array(range(10)) * 2, +# "accel-y-2": np.array(range(10)) * 3, +# "accel-y-3": np.array(range(10)) * 4, + +# "gyro-x-0": np.array(range(10)) - 10, +# "gyro-x-1": np.array(range(10)) - 20, +# "gyro-x-2": np.array(range(10)) - 30, +# "gyro-x-3": np.array(range(10)) - 40, +# }) + +# reader = TabularReader(df, ["accel-x-*", "gyro-x-*"]) +# print(len(reader)) +# print(reader[1]) + +# reader = TabularReader(df, ["accel-*", "gyro-x-*"]) +# print(len(reader)) +# print(reader[2]) + + +# reader = TabularReader(df, ["accel-x-1", "gyro-x-0", "gyro-x-1", "accel-y-*"]) +# print(len(reader)) +# print(reader[3]) + + +# if __name__ == "__main__": +# main()