Added tabular and CSV readers. Also added a README.md

discovery-unicamp · May 29, 2024 · 3eedb63 · 3eedb63
1 parent 97edeed
commit 3eedb63
Show file tree

Hide file tree

Showing 4 changed files with 156 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -169,5 +169,4 @@ cython_debug/
 #.idea/
 experiments/
 logs/
-lightning_logs/
-data/
+lightning_logs/
diff --git a/minerva/data/README.md b/minerva/data/README.md
@@ -0,0 +1,10 @@
+# Readers
+
+| **Reader**         	| **Data Unit**                                                                     	| **Order**           	| **Class**                                                    	| **Observations**                                                                                                                   	|
+|--------------------	|-----------------------------------------------------------------------------------	|---------------------	|--------------------------------------------------------------	|------------------------------------------------------------------------------------------------------------------------------------	|
+| PNGReader          	| Each unit of data is an image file (PNG) inside the root folder                   	| Lexicographical order 	| minerva.data.readers.png_reader.PNGReader                    	| File extensions: .png                                                                                                              	|
+| TIFFReader         	| Each unit of data is an image file (TIFF) inside the root folder                  	| Lexicographical order 	| minerva.data.readers.tiff_reader.TiffReader                  	| File extensions: .tif and .tiff                                                                                                    	|
+| TabularReader      	| Each unit of data is the i-th row in a dataframe, with columns filtered           	| Dataframe rows      	| minerva.data.readers.tabular_reader.TabularReader            	| Support pandas dataframe                                                                                                           	|
+| CSVReader          	| Each unit of data is the i-th row in a CSV file, with columns filtered            	| CSV rows            	| minerva.data.readers.csv_reader.CSVReader                    	| If dataframe is already open, use TabularReader instead. This class will open and load the CSV file and pass it to a TabularReader 	|
+| PatchedArrayReader 	| Each unit of data is a submatrix of specified shape inside an n-dimensional array 	| Dimension order     	| minerva.data.readers.patched_array_reader.PatchedArrayReader 	| Supports any data with ndarray protocol (tensor, xarray, zarr)                                                                     	|
+| PatchedZarrReader  	| Each unit of data is a submatrix of specified shape inside a Zarr array           	| Dimension order     	| minerva.data.readers.zarr_reader.ZarrArrayReader             	| Open zarr file in lazy mode and pass it to PatchedArrayReader                                                                      	|
diff --git a/minerva/data/readers/csv_reader.py b/minerva/data/readers/csv_reader.py
@@ -0,0 +1,17 @@
+from typing import Union
+
+import pandas as pd
+from minerva.data.readers.tabular_reader import TabularReader
+
+class CSVReader(TabularReader):
+    def __init__(
+        self,
+        path: str,
+        columns_to_select: Union[str, list[str]],
+        cast_to: str = None,
+        data_shape: tuple[int, ...] = None,
+    ):
+        df = pd.read_csv(path)
+        super().__init__(df, columns_to_select, cast_to, data_shape)
+
+
diff --git a/minerva/data/readers/tabular_reader.py b/minerva/data/readers/tabular_reader.py
@@ -0,0 +1,128 @@
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+import re
+import pandas as pd
+from minerva.data.readers.reader import _Reader
+
+class TabularReader(_Reader):
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        columns_to_select: Union[str, list[str]],
+        cast_to: str = None,
+        data_shape: tuple[int, ...] = None,
+    ):
+        """Reader to select columns from a DataFrame and return them as a NumPy
+        array. The DataFrame is indexed by the row number. Each row of the
+        DataFrame is considered as a sample. Thus, the __getitem__ method will
+        return the columns of the DataFrame at the specified index as a NumPy
+        array.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            The DataFrame to select the columns from. The DataFrame should have
+            the columns that are specified in the `columns_to_select` parameter.
+        columns_to_select : Union[str, list[str]]
+            A string or a list of strings used to select the columns from the DataFrame.
+            The string can be a regular expression pattern or a column name. The columns
+            that match the pattern will be selected.
+        cast_to : str, optional
+            Cast the selected columns to the specified data type. If None, the
+            data type of the columns will not be changed. (default is None)
+        data_shape : tuple[int, ...], optional
+            The shape of the data to be returned. If None, the data will be
+            returned as a 1D array. If provided, the data will be reshaped to
+            the specified shape. (default is None)
+        """
+        self.df = df
+        self.columns_to_select = columns_to_select
+        self.cast_to = cast_to
+        self.data_shape = data_shape
+
+        if isinstance(self.columns_to_select, str):
+            self.columns_to_select = [self.columns_to_select]
+
+    def __getitem__(self, index: int) -> np.ndarray:
+        """Return the columns of the DataFrame at the specified row index as a NumPy
+        array. The columns are selected based on the `self.columns_to_select`.
+
+        Parameters
+        ----------
+        index : int
+            The row index to select the columns from the DataFrame.
+
+        Returns
+        -------
+        np.ndarray
+            The selected columns from the row as a NumPy array.
+        """
+        columns = list(self.df.columns)
+
+        # Filter valid columns based on columns_to_select list
+        valid_columns = []
+        for pattern in self.columns_to_select:
+            valid_columns.extend(
+                [col for col in columns if re.match(pattern, col)]
+            )
+
+        # Select the elements and return
+        row = self.df.iloc[index][valid_columns]
+        row = row.to_numpy()
+
+        if self.cast_to is not None:
+            row = row.astype(self.cast_to)
+
+        if self.data_shape is not None:
+            row = row.reshape(self.data_shape)
+
+        return row
+
+    def __len__(self) -> int:
+        """Return the number of samples in the DataFrame. The number of samples
+        is equal to the number of rows in the DataFrame.
+
+        Returns
+        -------
+        int
+            The number of samples in the DataFrame.
+        """
+        return len(self.df)
+
+
+# def main():
+#     df = pd.DataFrame({
+#         "accel-x-0": np.array(range(10)),
+#         "accel-x-1": np.array(range(10)) + 10,
+#         "accel-x-2": np.array(range(10)) + 100,
+#         "accel-x-3": np.array(range(10)) + 1000,
+
+#         "accel-y-0": np.array(range(10)),
+#         "accel-y-1": np.array(range(10)) * 2,
+#         "accel-y-2": np.array(range(10)) * 3,
+#         "accel-y-3": np.array(range(10)) * 4,
+
+#         "gyro-x-0": np.array(range(10)) - 10,
+#         "gyro-x-1": np.array(range(10)) - 20,
+#         "gyro-x-2": np.array(range(10)) - 30,
+#         "gyro-x-3": np.array(range(10)) - 40,
+#     })
+
+#     reader = TabularReader(df, ["accel-x-*", "gyro-x-*"])
+#     print(len(reader))
+#     print(reader[1])
+
+#     reader = TabularReader(df, ["accel-*", "gyro-x-*"])
+#     print(len(reader))
+#     print(reader[2])
+
+
+#     reader = TabularReader(df, ["accel-x-1", "gyro-x-0", "gyro-x-1", "accel-y-*"])
+#     print(len(reader))
+#     print(reader[3])
+
+
+# if __name__ == "__main__":
+#     main()