-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added tabular and csv readers. Alfso added a readme.md
- Loading branch information
Showing
4 changed files
with
156 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -169,5 +169,4 @@ cython_debug/ | |
#.idea/ | ||
experiments/ | ||
logs/ | ||
lightning_logs/ | ||
data/ | ||
lightning_logs/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Readers | ||
|
||
| **Reader** | **Data Unit** | **Order** | **Class** | **Observations** | | ||
|-------------------- |----------------------------------------------------------------------------------- |--------------------- |-------------------------------------------------------------- |------------------------------------------------------------------------------------------------------------------------------------ | | ||
| PNGReader | Each unit of data is a image file (PNG) inside the root folder | Lexigraphical order | minerva.data.readers.png_reader.PNGReader | File extensions: .png | | ||
| TIFFReader | Each unit of data is a image file (TIFF) inside the root folder | Lexigraphical order | minerva.data.readers.tiff_reader.TiffReader | File extensions: .tif and .tiff | | ||
| TabularReader | Each unit of data is the i-th row in a dataframe, with columns filtered | Dataframe rows | minerva.data.readers.tabular_reader.TabularReader | Support pandas dataframe | | ||
| CSVReader | Each unit of data is the i-th row in a CSV file, with columns filtered | CSV Rowd | minerva.data.readers.csv_reader.CSVReader | If dataframe is already open, use TabularReader instead. This class will open and load the CSV file and pass it to a TabularReader | | ||
| PatchedArrayReader | Each unit of data is a submatrix of specified shape inside an n-dimensional array | Dimension order | minerva.data.readers.patched_array_reader.PatchedArrayReader | Supports any data with ndarray protocol (tensor, xarray, zarr) | | ||
| PatchedZarrReader | Each unit of data is a submatrix of specified shape inside an Zarr Array | Dimension order | minerva.data.readers.zarr_reader.ZarrArrayReader | Open zarr file in lazy mode and pass it to PatchedArrayReader | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from typing import Union | ||
|
||
import pandas as pd | ||
from minerva.data.readers.tabular_reader import TabularReader | ||
|
||
class CSVReader(TabularReader): | ||
def __init__( | ||
self, | ||
path: str, | ||
columns_to_select: Union[str, list[str]], | ||
cast_to: str = None, | ||
data_shape: tuple[int, ...] = None, | ||
): | ||
df = pd.read_csv(path) | ||
super().__init__(df, columns_to_select, cast_to, data_shape) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
from pathlib import Path | ||
from typing import Union | ||
|
||
import numpy as np | ||
import re | ||
import pandas as pd | ||
from minerva.data.readers.reader import _Reader | ||
|
||
class TabularReader(_Reader): | ||
def __init__( | ||
self, | ||
df: pd.DataFrame, | ||
columns_to_select: Union[str, list[str]], | ||
cast_to: str = None, | ||
data_shape: tuple[int, ...] = None, | ||
): | ||
"""Reader to select columns from a DataFrame and return them as a NumPy | ||
array. The DataFrame is indexed by the row number. Each row of the | ||
DataFrame is considered as a sample. Thus, the __getitem__ method will | ||
return the columns of the DataFrame at the specified index as a NumPy | ||
array. | ||
Parameters | ||
---------- | ||
df : pd.DataFrame | ||
The DataFrame to select the columns from. The DataFrame should have | ||
the columns that are specified in the `columns_to_select` parameter. | ||
columns_to_select : Union[str, list[str]] | ||
A string or a list of strings used to select the columns from the DataFrame. | ||
The string can be a regular expression pattern or a column name. The columns | ||
that match the pattern will be selected. | ||
cast_to : str, optional | ||
Cast the selected columns to the specified data type. If None, the | ||
data type of the columns will not be changed. (default is None) | ||
data_shape : tuple[int, ...], optional | ||
The shape of the data to be returned. If None, the data will be | ||
returned as a 1D array. If provided, the data will be reshaped to | ||
the specified shape. (default is None) | ||
""" | ||
self.df = df | ||
self.columns_to_select = columns_to_select | ||
self.cast_to = cast_to | ||
self.data_shape = data_shape | ||
|
||
if isinstance(self.columns_to_select, str): | ||
self.columns_to_select = [self.columns_to_select] | ||
|
||
def __getitem__(self, index: int) -> np.ndarray: | ||
"""Return the columns of the DataFrame at the specified row index as a NumPy | ||
array. The columns are selected based on the `self.columns_to_select`. | ||
Parameters | ||
---------- | ||
index : int | ||
The row index to select the columns from the DataFrame. | ||
Returns | ||
------- | ||
np.ndarray | ||
The selected columns from the row as a NumPy array. | ||
""" | ||
columns = list(self.df.columns) | ||
|
||
# Filter valid columns based on columns_to_select list | ||
valid_columns = [] | ||
for pattern in self.columns_to_select: | ||
valid_columns.extend( | ||
[col for col in columns if re.match(pattern, col)] | ||
) | ||
|
||
# Select the elements and return | ||
row = self.df.iloc[index][valid_columns] | ||
row = row.to_numpy() | ||
|
||
if self.cast_to is not None: | ||
row = row.astype(self.cast_to) | ||
|
||
if self.data_shape is not None: | ||
row = row.reshape(self.data_shape) | ||
|
||
return row | ||
|
||
def __len__(self) -> int: | ||
"""Return the number of samples in the DataFrame. The number of samples | ||
is equal to the number of rows in the DataFrame. | ||
Returns | ||
------- | ||
int | ||
The number of samples in the DataFrame. | ||
""" | ||
return len(self.df) | ||
|
||
|
||
# def main(): | ||
# df = pd.DataFrame({ | ||
# "accel-x-0": np.array(range(10)), | ||
# "accel-x-1": np.array(range(10)) + 10, | ||
# "accel-x-2": np.array(range(10)) + 100, | ||
# "accel-x-3": np.array(range(10)) + 1000, | ||
|
||
# "accel-y-0": np.array(range(10)), | ||
# "accel-y-1": np.array(range(10)) * 2, | ||
# "accel-y-2": np.array(range(10)) * 3, | ||
# "accel-y-3": np.array(range(10)) * 4, | ||
|
||
# "gyro-x-0": np.array(range(10)) - 10, | ||
# "gyro-x-1": np.array(range(10)) - 20, | ||
# "gyro-x-2": np.array(range(10)) - 30, | ||
# "gyro-x-3": np.array(range(10)) - 40, | ||
# }) | ||
|
||
# reader = TabularReader(df, ["accel-x-*", "gyro-x-*"]) | ||
# print(len(reader)) | ||
# print(reader[1]) | ||
|
||
# reader = TabularReader(df, ["accel-*", "gyro-x-*"]) | ||
# print(len(reader)) | ||
# print(reader[2]) | ||
|
||
|
||
# reader = TabularReader(df, ["accel-x-1", "gyro-x-0", "gyro-x-1", "accel-y-*"]) | ||
# print(len(reader)) | ||
# print(reader[3]) | ||
|
||
|
||
# if __name__ == "__main__": | ||
# main() |