Skip to content

Commit

Permalink
Added tabular and csv readers. Also added a readme.md
Browse files Browse the repository at this point in the history
  • Loading branch information
otavioon committed May 29, 2024
1 parent 97edeed commit 3eedb63
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 2 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -169,5 +169,4 @@ cython_debug/
#.idea/
experiments/
logs/
lightning_logs/
data/
lightning_logs/
10 changes: 10 additions & 0 deletions minerva/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Readers

| **Reader** | **Data Unit** | **Order** | **Class** | **Observations** |
|-------------------- |----------------------------------------------------------------------------------- |--------------------- |-------------------------------------------------------------- |------------------------------------------------------------------------------------------------------------------------------------ |
| PNGReader | Each unit of data is an image file (PNG) inside the root folder | Lexicographical order | minerva.data.readers.png_reader.PNGReader | File extensions: .png |
| TIFFReader | Each unit of data is an image file (TIFF) inside the root folder | Lexicographical order | minerva.data.readers.tiff_reader.TiffReader | File extensions: .tif and .tiff |
| TabularReader | Each unit of data is the i-th row in a dataframe, with columns filtered | Dataframe rows | minerva.data.readers.tabular_reader.TabularReader | Support pandas dataframe |
| CSVReader | Each unit of data is the i-th row in a CSV file, with columns filtered | CSV rows | minerva.data.readers.csv_reader.CSVReader | If the dataframe is already open, use TabularReader instead. This class will open and load the CSV file and pass it to a TabularReader |
| PatchedArrayReader | Each unit of data is a submatrix of specified shape inside an n-dimensional array | Dimension order | minerva.data.readers.patched_array_reader.PatchedArrayReader | Supports any data with ndarray protocol (tensor, xarray, zarr) |
| PatchedZarrReader | Each unit of data is a submatrix of specified shape inside a Zarr array | Dimension order | minerva.data.readers.zarr_reader.ZarrArrayReader | Opens the Zarr file in lazy mode and passes it to PatchedArrayReader |
17 changes: 17 additions & 0 deletions minerva/data/readers/csv_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from typing import Union

import pandas as pd
from minerva.data.readers.tabular_reader import TabularReader

class CSVReader(TabularReader):
    """Tabular reader whose DataFrame is loaded from a CSV file.

    The file is read once, at construction time, with ``pd.read_csv`` and
    its default options; everything else is delegated to ``TabularReader``.
    """

    def __init__(
        self,
        path: str,
        columns_to_select: Union[str, list[str]],
        cast_to: str = None,
        data_shape: tuple[int, ...] = None,
    ):
        """Load the CSV file at `path` and set up the underlying reader.

        Parameters
        ----------
        path : str
            Location of the CSV file, handed unchanged to ``pd.read_csv``.
        columns_to_select : Union[str, list[str]]
            Column pattern(s), forwarded unchanged to ``TabularReader``.
        cast_to : str, optional
            Forwarded unchanged to ``TabularReader``. (default is None)
        data_shape : tuple[int, ...], optional
            Forwarded unchanged to ``TabularReader``. (default is None)
        """
        super().__init__(
            pd.read_csv(path), columns_to_select, cast_to, data_shape
        )


128 changes: 128 additions & 0 deletions minerva/data/readers/tabular_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
from pathlib import Path
from typing import Union

import numpy as np
import re
import pandas as pd
from minerva.data.readers.reader import _Reader

class TabularReader(_Reader):
    """Reader that serves the rows of a DataFrame as NumPy arrays.

    Each row of the DataFrame is one sample. Only the columns whose labels
    match one of the configured patterns are returned.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        columns_to_select: Union[str, list[str]],
        cast_to: Union[str, None] = None,
        data_shape: Union[tuple[int, ...], None] = None,
    ):
        """Reader to select columns from a DataFrame and return them as a NumPy
        array. The DataFrame is indexed by the row number. Each row of the
        DataFrame is considered as a sample. Thus, the __getitem__ method will
        return the columns of the DataFrame at the specified index as a NumPy
        array.

        Parameters
        ----------
        df : pd.DataFrame
            The DataFrame to select the columns from.
        columns_to_select : Union[str, list[str]]
            A pattern or a list of patterns used to select columns. Each
            pattern is applied with ``re.match``, so it is a regular
            expression anchored at the *start* of the column label only: a
            plain column name also selects every column whose label starts
            with that name. A single string is wrapped into a one-element
            list.
        cast_to : str, optional
            Data type passed to ``ndarray.astype`` on the selected values. If
            None, no cast is performed. (default is None)
        data_shape : tuple[int, ...], optional
            Shape passed to ``ndarray.reshape`` on the selected values. If
            None, the values are returned without reshaping.
            (default is None)
        """
        self.df = df
        # Normalize a lone pattern so the rest of the class can always
        # iterate over a list of patterns.
        if isinstance(columns_to_select, str):
            columns_to_select = [columns_to_select]
        self.columns_to_select = columns_to_select
        self.cast_to = cast_to
        self.data_shape = data_shape

    def _matching_columns(self) -> list:
        """Return the column labels selected by `self.columns_to_select`.

        Labels are emitted pattern by pattern, and in DataFrame column order
        within each pattern. A column matched by several patterns therefore
        appears once per matching pattern.
        """
        labels = list(self.df.columns)
        selected = []
        for pattern in self.columns_to_select:
            # Match against str(label): `re.match` raises TypeError on
            # non-string labels (e.g. the integer labels of a header-less
            # frame). The original label is what gets selected.
            selected.extend(
                label for label in labels if re.match(pattern, str(label))
            )
        return selected

    def __getitem__(self, index: int) -> np.ndarray:
        """Return the selected columns of the row at position `index` as a
        NumPy array.

        Parameters
        ----------
        index : int
            Positional row index (used with ``DataFrame.iloc``).

        Returns
        -------
        np.ndarray
            The selected values, cast to `self.cast_to` and reshaped to
            `self.data_shape` when those are set.
        """
        # NOTE(review): the whole row is materialized as one Series before
        # the column selection, so with mixed column dtypes the values are
        # presumably object-typed unless `cast_to` is given -- confirm.
        values = self.df.iloc[index][self._matching_columns()].to_numpy()

        if self.cast_to is not None:
            values = values.astype(self.cast_to)

        if self.data_shape is not None:
            values = values.reshape(self.data_shape)

        return values

    def __len__(self) -> int:
        """Return the number of samples, i.e. the number of rows in the
        DataFrame.

        Returns
        -------
        int
            The number of rows in the DataFrame.
        """
        return len(self.df)


# def main():
# df = pd.DataFrame({
# "accel-x-0": np.array(range(10)),
# "accel-x-1": np.array(range(10)) + 10,
# "accel-x-2": np.array(range(10)) + 100,
# "accel-x-3": np.array(range(10)) + 1000,

# "accel-y-0": np.array(range(10)),
# "accel-y-1": np.array(range(10)) * 2,
# "accel-y-2": np.array(range(10)) * 3,
# "accel-y-3": np.array(range(10)) * 4,

# "gyro-x-0": np.array(range(10)) - 10,
# "gyro-x-1": np.array(range(10)) - 20,
# "gyro-x-2": np.array(range(10)) - 30,
# "gyro-x-3": np.array(range(10)) - 40,
# })

# reader = TabularReader(df, ["accel-x-*", "gyro-x-*"])
# print(len(reader))
# print(reader[1])

# reader = TabularReader(df, ["accel-*", "gyro-x-*"])
# print(len(reader))
# print(reader[2])


# reader = TabularReader(df, ["accel-x-1", "gyro-x-0", "gyro-x-1", "accel-y-*"])
# print(len(reader))
# print(reader[3])


# if __name__ == "__main__":
# main()

0 comments on commit 3eedb63

Please sign in to comment.