From a2d2ce4b4a9c24ac8c9ba5a16e142b756823548c Mon Sep 17 00:00:00 2001 From: trgiangdo Date: Mon, 2 Oct 2023 15:54:45 +0700 Subject: [PATCH 01/11] feat: filter numpy array data --- src/taipy/core/data/data_node.py | 41 +++++++++++++++++++++++++ tests/core/data/test_data_node.py | 51 ++++++++++++++++++++++++++++++- 2 files changed, 91 insertions(+), 1 deletion(-) diff --git a/src/taipy/core/data/data_node.py b/src/taipy/core/data/data_node.py index 390602af..df5b1326 100644 --- a/src/taipy/core/data/data_node.py +++ b/src/taipy/core/data/data_node.py @@ -14,6 +14,7 @@ from abc import abstractmethod from datetime import datetime, timedelta from functools import reduce +from operator import and_, or_ from typing import Any, Dict, List, Optional, Set, Tuple, Union import modin.pandas as modin_pd @@ -437,11 +438,16 @@ def filter(self, operators: Union[List, Tuple], join_operator=JoinOperator.AND): if not ((type(operators[0]) == list) or (type(operators[0]) == tuple)): if isinstance(data, (pd.DataFrame, modin_pd.DataFrame)): return DataNode.__filter_dataframe_per_key_value(data, operators[0], operators[1], operators[2]) + if isinstance(data, np.ndarray): + list_operators = [operators] + return DataNode.__filter_numpy_array(data, list_operators) if isinstance(data, List): return DataNode.__filter_list_per_key_value(data, operators[0], operators[1], operators[2]) else: if isinstance(data, (pd.DataFrame, modin_pd.DataFrame)): return DataNode.__filter_dataframe(data, operators, join_operator=join_operator) + if isinstance(data, np.ndarray): + return DataNode.__filter_numpy_array(data, operators, join_operator=join_operator) if isinstance(data, List): return DataNode.__filter_list(data, operators, join_operator=join_operator) raise NotImplementedError @@ -484,6 +490,41 @@ def __filter_dataframe_per_key_value( def __dataframe_merge(df_list: List, how="inner"): return reduce(lambda df1, df2: pd.merge(df1, df2, how=how), df_list) + @staticmethod + def __filter_numpy_array(data: np.ndarray, operators: Union[List, Tuple], join_operator=JoinOperator.AND): + conditions = [] + for key, value, operator in operators: + conditions.append(DataNode.__get_filter_condition_per_key_value(data, key, value, operator)) + + if join_operator == JoinOperator.AND: + join_conditions = reduce(and_, conditions) + elif join_operator == JoinOperator.OR: + join_conditions = reduce(or_, conditions) + else: + return NotImplementedError + + return data[join_conditions] + + @staticmethod + def __get_filter_condition_per_key_value(array_data: np.ndarray, key, value, operator: Operator): + if not isinstance(key, int): + key = int(key) + + if operator == Operator.EQUAL: + return array_data[:, key] == value + if operator == Operator.NOT_EQUAL: + return array_data[:, key] != value + if operator == Operator.LESS_THAN: + return array_data[:, key] < value + if operator == Operator.LESS_OR_EQUAL: + return array_data[:, key] <= value + if operator == Operator.GREATER_THAN: + return array_data[:, key] > value + if operator == Operator.GREATER_OR_EQUAL: + return array_data[:, key] >= value + + return NotImplementedError + @staticmethod def __filter_list(list_data: List, operators: Union[List, Tuple], join_operator=JoinOperator.AND): filtered_list_data = [] diff --git a/tests/core/data/test_data_node.py b/tests/core/data/test_data_node.py index 4dc79802..9285d51d 100644 --- a/tests/core/data/test_data_node.py +++ b/tests/core/data/test_data_node.py @@ -69,6 +69,19 @@ def storage_type(cls) -> str: return "fake_df_dn" +class 
FakeNumpyarrayDataNode(DataNode): + def __init__(self, config_id, default_array, **kwargs): + super().__init__(config_id, **kwargs) + self.data = default_array + + def _read(self): + return self.data + + @classmethod + def storage_type(cls) -> str: + return "fake_np_dn" + + class FakeListDataNode(DataNode): class Row: def __init__(self, value): @@ -446,7 +459,7 @@ def test_pandas_filter(self, default_data_frame): assert isinstance(df_dn[COLUMN_NAME_1], _FilterDataNode) assert isinstance(df_dn[[COLUMN_NAME_1, COLUMN_NAME_2]], _FilterDataNode) - def test_filter(self, default_data_frame): + def test_filter_pandas_exposed_type(self, default_data_frame): dn = FakeDataNode("fake_dn") dn.write("Any data") @@ -527,6 +540,8 @@ def test_filter(self, default_data_frame): ) == len( default_data_frame[(default_data_frame[COLUMN_NAME_1] > 10) | (default_data_frame[COLUMN_NAME_1] < -10)] ) + + def test_filter_list(self): list_dn = FakeListDataNode("fake_list_dn") KEY_NAME = "value" @@ -594,6 +609,40 @@ def test_filter(self, default_data_frame): == 6 ) + def test_filter_numpy_exposed_type(self, default_data_frame): + default_array = default_data_frame.to_numpy() + + df_dn = FakeNumpyarrayDataNode("fake_dataframe_dn", default_array) + + assert len(df_dn.filter((0, 1, Operator.EQUAL))) == len(default_array[default_array[:, 0] == 1]) + assert len(df_dn.filter((0, 1, Operator.NOT_EQUAL))) == len(default_array[default_array[:, 0] != 1]) + assert len(df_dn.filter([(0, 1, Operator.EQUAL)])) == len(default_array[default_array[:, 0] == 1]) + assert len(df_dn.filter([(0, 1, Operator.NOT_EQUAL)])) == len(default_array[default_array[:, 0] != 1]) + assert len(df_dn.filter([(0, 1, Operator.LESS_THAN)])) == len(default_array[default_array[:, 0] < 1]) + assert len(df_dn.filter([(0, 1, Operator.LESS_OR_EQUAL)])) == len(default_array[default_array[:, 0] <= 1]) + assert len(df_dn.filter([(0, 1, Operator.GREATER_THAN)])) == len(default_array[default_array[:, 0] > 1]) + assert len(df_dn.filter([(0, 1, Operator.GREATER_OR_EQUAL)])) == len(default_array[default_array[:, 0] >= 1]) + assert len(df_dn.filter([(0, -1000, Operator.LESS_OR_EQUAL)])) == 0 + assert len(df_dn.filter([(0, 1000, Operator.GREATER_OR_EQUAL)])) == 0 + assert len(df_dn.filter([(0, 4, Operator.EQUAL), (0, 5, Operator.EQUAL)])) == len( + default_array[(default_array[:, 0] == 4) & (default_array[:, 0] == 5)] + ) + assert len(df_dn.filter([(0, 4, Operator.EQUAL), (1, 5, Operator.EQUAL)], JoinOperator.OR)) == len( + default_array[(default_array[:, 0] == 4) | (default_array[:, 1] == 5)] + ) + assert len( + df_dn.filter([(0, 1, Operator.GREATER_THAN), (1, 3, Operator.GREATER_THAN)], JoinOperator.AND) + ) == len(default_array[(default_array[:, 0] > 1) & (default_array[:, 1] > 3)]) + assert len( + df_dn.filter([(0, 2, Operator.GREATER_THAN), (0, 3, Operator.GREATER_THAN)], JoinOperator.OR) + ) == len(default_array[(default_array[:, 0] > 2) | (default_array[:, 0] > 3)]) + assert len( + df_dn.filter([(0, 10, Operator.GREATER_THAN), (0, -10, Operator.LESS_THAN)], JoinOperator.AND) + ) == len(default_array[(default_array[:, 0] > 10) | (default_array[:, 0] < -10)]) + assert len( + df_dn.filter([(0, 10, Operator.GREATER_THAN), (0, -10, Operator.LESS_THAN)], JoinOperator.OR) + ) == len(default_array[(default_array[:, 0] > 10) | (default_array[:, 0] < -10)]) + def test_data_node_update_after_writing(self): dn = FakeDataNode("foo") From eac90f317799067a9ce1d4a3ba9532c24ce31eb0 Mon Sep 17 00:00:00 2001 From: trgiangdo Date: Mon, 2 Oct 2023 15:55:44 +0700 Subject: [PATCH 02/11] 
fix: filter by a not-existed attribute raise error --- src/taipy/core/data/data_node.py | 5 ++++- tests/core/data/test_mongo_data_node.py | 27 +++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/taipy/core/data/data_node.py b/src/taipy/core/data/data_node.py index df5b1326..02fe3631 100644 --- a/src/taipy/core/data/data_node.py +++ b/src/taipy/core/data/data_node.py @@ -543,7 +543,10 @@ def __filter_list(list_data: List, operators: Union[List, Tuple], join_operator= def __filter_list_per_key_value(list_data: List, key: str, value, operator: Operator): filtered_list = [] for row in list_data: - row_value = getattr(row, key) + row_value = getattr(row, key, None) + if row_value is None: + continue + if operator == Operator.EQUAL and row_value == value: filtered_list.append(row) if operator == Operator.NOT_EQUAL and row_value != value: diff --git a/tests/core/data/test_mongo_data_node.py b/tests/core/data/test_mongo_data_node.py index 255958c3..bc2a907e 100644 --- a/tests/core/data/test_mongo_data_node.py +++ b/tests/core/data/test_mongo_data_node.py @@ -23,6 +23,7 @@ from src.taipy.core.common._mongo_connector import _connect_mongodb from src.taipy.core.data.data_node_id import DataNodeId from src.taipy.core.data.mongo import MongoCollectionDataNode +from src.taipy.core.data.operator import JoinOperator, Operator from src.taipy.core.exceptions.exceptions import InvalidCustomDocument, MissingRequiredProperty from taipy.config.common.scope import Scope @@ -171,6 +172,32 @@ def test_read(self, properties, clear_mongo_connection_cache): assert data[4].KWARGS_KEY == "KWARGS_VALUE" assert isinstance(data[5]._id, ObjectId) + @mongomock.patch(servers=(("localhost", 27017),)) + @pytest.mark.parametrize("properties", __properties) + def test_filter(self, properties): + mock_client = pymongo.MongoClient("localhost") + mock_client[properties["db_name"]][properties["collection_name"]].insert_many( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + {"KWARGS_KEY": "KWARGS_VALUE"}, + ] + ) + + mongo_dn = MongoCollectionDataNode( + "foo", + Scope.SCENARIO, + properties=properties, + ) + + assert len(mongo_dn.filter(("foo", 1, Operator.EQUAL))) == 3 + assert len(mongo_dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 1 + assert len(mongo_dn.filter(("bar", 2, Operator.EQUAL))) == 3 + assert len(mongo_dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + @mongomock.patch(servers=(("localhost", 27017),)) @pytest.mark.parametrize("properties", __properties) def test_read_empty_as(self, properties, clear_mongo_connection_cache): From 69adb1da111544f4131555f1c1c7eb041c02c3ce Mon Sep 17 00:00:00 2001 From: trgiangdo Date: Mon, 2 Oct 2023 15:57:55 +0700 Subject: [PATCH 03/11] feat: add type hint for _read_as_numpy() method --- src/taipy/core/data/_abstract_sql.py | 3 ++- src/taipy/core/data/csv.py | 3 ++- src/taipy/core/data/parquet.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/taipy/core/data/_abstract_sql.py b/src/taipy/core/data/_abstract_sql.py index a59e78d5..99fecfa3 100644 --- a/src/taipy/core/data/_abstract_sql.py +++ b/src/taipy/core/data/_abstract_sql.py @@ -17,6 +17,7 @@ from typing import Dict, List, Optional, Set import modin.pandas as modin_pd +import numpy as np import pandas as pd from sqlalchemy import create_engine, text @@ -214,7 +215,7 @@ def _read_as(self): query_result = connection.execute(text(self._get_read_query())) return 
list(map(lambda row: custom_class(**row), query_result)) - def _read_as_numpy(self): + def _read_as_numpy(self) -> np.ndarray: return self._read_as_pandas_dataframe().to_numpy() def _read_as_pandas_dataframe(self, columns: Optional[List[str]] = None): diff --git a/src/taipy/core/data/csv.py b/src/taipy/core/data/csv.py index 56be6fe5..98d764b1 100644 --- a/src/taipy/core/data/csv.py +++ b/src/taipy/core/data/csv.py @@ -16,6 +16,7 @@ from typing import Any, Dict, List, Optional, Set import modin.pandas as modin_pd +import numpy as np import pandas as pd from taipy.config.common.scope import Scope @@ -198,7 +199,7 @@ def _read_as(self): res.append(custom_class(*line)) return res - def _read_as_numpy(self): + def _read_as_numpy(self) -> np.ndarray: return self._read_as_pandas_dataframe().to_numpy() def _read_as_pandas_dataframe( diff --git a/src/taipy/core/data/parquet.py b/src/taipy/core/data/parquet.py index 08b7ed9e..87f60251 100644 --- a/src/taipy/core/data/parquet.py +++ b/src/taipy/core/data/parquet.py @@ -15,6 +15,7 @@ from typing import Any, Dict, List, Optional, Set import modin.pandas as modin_pd +import numpy as np import pandas as pd from taipy.config.common.scope import Scope @@ -220,7 +221,7 @@ def _read_as(self, read_kwargs: Dict): list_of_dicts = self._read_as_pandas_dataframe(read_kwargs).to_dict(orient="records") return [custom_class(**dct) for dct in list_of_dicts] - def _read_as_numpy(self, read_kwargs: Dict): + def _read_as_numpy(self, read_kwargs: Dict) -> np.ndarray: return self._read_as_pandas_dataframe(read_kwargs).to_numpy() def _read_as_pandas_dataframe(self, read_kwargs: Dict) -> pd.DataFrame: From 7230c7f0620f801552650f72821c41f5649aa1df Mon Sep 17 00:00:00 2001 From: trgiangdo Date: Mon, 30 Oct 2023 16:27:27 +0700 Subject: [PATCH 04/11] refactor: move all filter related method to _FilterDataNode class --- src/taipy/core/data/_filter.py | 188 +++++++++++++++++++------------ src/taipy/core/data/data_node.py | 137 +++------------------- 2 files changed, 128 insertions(+), 197 deletions(-) diff --git a/src/taipy/core/data/_filter.py b/src/taipy/core/data/_filter.py index 408a0a43..7e8e6ca5 100644 --- a/src/taipy/core/data/_filter.py +++ b/src/taipy/core/data/_filter.py @@ -10,12 +10,17 @@ # specific language governing permissions and limitations under the License. 
from collections.abc import Hashable -from typing import Dict, Iterable, List, Union +from functools import reduce +from operator import and_, or_ +from typing import Dict, Iterable, List, Tuple, Union import modin.pandas as modin_pd +import numpy as np import pandas as pd from pandas.core.common import is_bool_indexer +from .operator import JoinOperator, Operator + class _FilterDataNode: __DATAFRAME_DATA_TYPE = "dataframe" @@ -105,78 +110,115 @@ def __getitem_iterable(self, keys): filtered_data.append({k: getattr(e, k) for k in keys}) return filtered_data - def __eq__(self, value): - if self.data_is_dataframe(): - filtered_data = self.data == value - else: - filtered_data = [e == value for e in self.data] - return _FilterDataNode(self.data_node_id, filtered_data) - - def __lt__(self, value): - if self.data_is_dataframe(): - filtered_data = self.data < value - else: - filtered_data = [e < value for e in self.data] - return _FilterDataNode(self.data_node_id, filtered_data) - - def __le__(self, value): - if self.data_is_dataframe(): - filtered_data = self.data <= value - else: - filtered_data = [e <= value for e in self.data] - return _FilterDataNode(self.data_node_id, filtered_data) - - def __gt__(self, value): - if self.data_is_dataframe(): - filtered_data = self.data > value - else: - filtered_data = [e > value for e in self.data] - return _FilterDataNode(self.data_node_id, filtered_data) - - def __ge__(self, value): - if self.data_is_dataframe(): - filtered_data = self.data >= value - else: - filtered_data = [e >= value for e in self.data] - return _FilterDataNode(self.data_node_id, filtered_data) - - def __ne__(self, value): - if self.data_is_dataframe(): - filtered_data = self.data != value + @staticmethod + def _filter_dataframe( + df_data: Union[pd.DataFrame, modin_pd.DataFrame], operators: Union[List, Tuple], join_operator=JoinOperator.AND + ): + filtered_df_data = [] + if join_operator == JoinOperator.AND: + how = "inner" + elif join_operator == JoinOperator.OR: + how = "outer" else: - filtered_data = [e != value for e in self.data] - return _FilterDataNode(self.data_node_id, filtered_data) - - def __and__(self, other): - if self.data_is_dataframe(): - if other.data_is_dataframe(): - filtered_data = self.data & other.data - else: - raise NotImplementedError + return NotImplementedError + for key, value, operator in operators: + filtered_df_data.append(_FilterDataNode._filter_dataframe_per_key_value(df_data, key, value, operator)) + return _FilterDataNode.__dataframe_merge(filtered_df_data, how) if filtered_df_data else pd.DataFrame() + + @staticmethod + def _filter_dataframe_per_key_value( + df_data: Union[pd.DataFrame, modin_pd.DataFrame], key: str, value, operator: Operator + ): + df_by_col = df_data[key] + if operator == Operator.EQUAL: + df_by_col = df_by_col == value + if operator == Operator.NOT_EQUAL: + df_by_col = df_by_col != value + if operator == Operator.LESS_THAN: + df_by_col = df_by_col < value + if operator == Operator.LESS_OR_EQUAL: + df_by_col = df_by_col <= value + if operator == Operator.GREATER_THAN: + df_by_col = df_by_col > value + if operator == Operator.GREATER_OR_EQUAL: + df_by_col = df_by_col >= value + return df_data[df_by_col] + + @staticmethod + def __dataframe_merge(df_list: List, how="inner"): + return reduce(lambda df1, df2: pd.merge(df1, df2, how=how), df_list) + + @staticmethod + def _filter_numpy_array(data: np.ndarray, operators: Union[List, Tuple], join_operator=JoinOperator.AND): + conditions = [] + for key, value, operator in operators: + 
conditions.append(_FilterDataNode.__get_filter_condition_per_key_value(data, key, value, operator)) + + if join_operator == JoinOperator.AND: + join_conditions = reduce(and_, conditions) + elif join_operator == JoinOperator.OR: + join_conditions = reduce(or_, conditions) else: - if other.data_is_dataframe(): - raise NotImplementedError - else: - filtered_data = [s and o for s, o in zip(self.data, other.data)] - return _FilterDataNode(self.data_node_id, filtered_data) - - def __or__(self, other): - if self.data_is_dataframe(): - if other.data_is_dataframe(): - filtered_data = self.data | other.data - else: - raise NotImplementedError + return NotImplementedError + + return data[join_conditions] + + @staticmethod + def __get_filter_condition_per_key_value(array_data: np.ndarray, key, value, operator: Operator): + if not isinstance(key, int): + key = int(key) + + if operator == Operator.EQUAL: + return array_data[:, key] == value + if operator == Operator.NOT_EQUAL: + return array_data[:, key] != value + if operator == Operator.LESS_THAN: + return array_data[:, key] < value + if operator == Operator.LESS_OR_EQUAL: + return array_data[:, key] <= value + if operator == Operator.GREATER_THAN: + return array_data[:, key] > value + if operator == Operator.GREATER_OR_EQUAL: + return array_data[:, key] >= value + + return NotImplementedError + + @staticmethod + def _filter_list(list_data: List, operators: Union[List, Tuple], join_operator=JoinOperator.AND): + filtered_list_data = [] + for key, value, operator in operators: + filtered_list_data.append(_FilterDataNode._filter_list_per_key_value(list_data, key, value, operator)) + if len(filtered_list_data) == 0: + return filtered_list_data + if join_operator == JoinOperator.AND: + return _FilterDataNode.__list_intersect(filtered_list_data) + elif join_operator == JoinOperator.OR: + return list(set(np.concatenate(filtered_list_data))) else: - if other.data_is_dataframe(): - raise NotImplementedError - else: - filtered_data = [s or o for s, o in zip(self.data, other.data)] - return _FilterDataNode(self.data_node_id, filtered_data) - - def __str__(self) -> str: - if self.data_is_dataframe(): - return str(self.data) - list_to_string = "" - for e in self.data: - list_to_string += str(e) + "\n" - return list_to_string + return NotImplementedError + + @staticmethod + def _filter_list_per_key_value(list_data: List, key: str, value, operator: Operator): + filtered_list = [] + for row in list_data: + row_value = getattr(row, key, None) + if row_value is None: + continue + + if operator == Operator.EQUAL and row_value == value: + filtered_list.append(row) + if operator == Operator.NOT_EQUAL and row_value != value: + filtered_list.append(row) + if operator == Operator.LESS_THAN and row_value < value: + filtered_list.append(row) + if operator == Operator.LESS_OR_EQUAL and row_value <= value: + filtered_list.append(row) + if operator == Operator.GREATER_THAN and row_value > value: + filtered_list.append(row) + if operator == Operator.GREATER_OR_EQUAL and row_value >= value: + filtered_list.append(row) + return filtered_list + + @staticmethod + def __list_intersect(list_data): + return list(set(list_data.pop()).intersection(*map(set, list_data))) diff --git a/src/taipy/core/data/data_node.py b/src/taipy/core/data/data_node.py index 02fe3631..66ec5fa7 100644 --- a/src/taipy/core/data/data_node.py +++ b/src/taipy/core/data/data_node.py @@ -13,8 +13,6 @@ import uuid from abc import abstractmethod from datetime import datetime, timedelta -from functools import reduce 
-from operator import and_, or_ from typing import Any, Dict, List, Optional, Set, Tuple, Union import modin.pandas as modin_pd @@ -36,7 +34,7 @@ from ..job.job_id import JobId from ._filter import _FilterDataNode from .data_node_id import DataNodeId, Edit -from .operator import JoinOperator, Operator +from .operator import JoinOperator class DataNode(_Entity, _Labeled): @@ -424,7 +422,7 @@ def filter(self, operators: Union[List, Tuple], join_operator=JoinOperator.AND): The data is filtered by the provided list of 3-tuples (key, value, `Operator^`). If multiple filter operators are provided, filtered data will be joined based on the - join operator (_AND_ or _OR_). + join operator (*AND* or *OR*). Parameters: operators (Union[List[Tuple], Tuple]): A 3-element tuple or a list of 3-element tuples, @@ -432,139 +430,29 @@ def filter(self, operators: Union[List, Tuple], join_operator=JoinOperator.AND): join_operator (JoinOperator^): The operator used to join the multiple filter 3-tuples. """ + if self.storage_type in ["sql", "sql_table", "mongo_collection"]: + raise NotImplementedError + data = self._read() if len(operators) == 0: return data - if not ((type(operators[0]) == list) or (type(operators[0]) == tuple)): + if not ((isinstance(operators[0], list)) or (isinstance(operators[0], tuple))): if isinstance(data, (pd.DataFrame, modin_pd.DataFrame)): - return DataNode.__filter_dataframe_per_key_value(data, operators[0], operators[1], operators[2]) + return _FilterDataNode._filter_dataframe_per_key_value(data, operators[0], operators[1], operators[2]) if isinstance(data, np.ndarray): list_operators = [operators] - return DataNode.__filter_numpy_array(data, list_operators) + return _FilterDataNode._filter_numpy_array(data, list_operators) if isinstance(data, List): - return DataNode.__filter_list_per_key_value(data, operators[0], operators[1], operators[2]) + return _FilterDataNode._filter_list_per_key_value(data, operators[0], operators[1], operators[2]) else: if isinstance(data, (pd.DataFrame, modin_pd.DataFrame)): - return DataNode.__filter_dataframe(data, operators, join_operator=join_operator) + return _FilterDataNode._filter_dataframe(data, operators, join_operator=join_operator) if isinstance(data, np.ndarray): - return DataNode.__filter_numpy_array(data, operators, join_operator=join_operator) + return _FilterDataNode._filter_numpy_array(data, operators, join_operator=join_operator) if isinstance(data, List): - return DataNode.__filter_list(data, operators, join_operator=join_operator) + return _FilterDataNode._filter_list(data, operators, join_operator=join_operator) raise NotImplementedError - @staticmethod - def __filter_dataframe( - df_data: Union[pd.DataFrame, modin_pd.DataFrame], operators: Union[List, Tuple], join_operator=JoinOperator.AND - ): - filtered_df_data = [] - if join_operator == JoinOperator.AND: - how = "inner" - elif join_operator == JoinOperator.OR: - how = "outer" - else: - raise NotImplementedError - for key, value, operator in operators: - filtered_df_data.append(DataNode.__filter_dataframe_per_key_value(df_data, key, value, operator)) - return DataNode.__dataframe_merge(filtered_df_data, how) if filtered_df_data else pd.DataFrame() - - @staticmethod - def __filter_dataframe_per_key_value( - df_data: Union[pd.DataFrame, modin_pd.DataFrame], key: str, value, operator: Operator - ): - df_by_col = df_data[key] - if operator == Operator.EQUAL: - df_by_col = df_by_col == value - if operator == Operator.NOT_EQUAL: - df_by_col = df_by_col != value - if operator == 
Operator.LESS_THAN: - df_by_col = df_by_col < value - if operator == Operator.LESS_OR_EQUAL: - df_by_col = df_by_col <= value - if operator == Operator.GREATER_THAN: - df_by_col = df_by_col > value - if operator == Operator.GREATER_OR_EQUAL: - df_by_col = df_by_col >= value - return df_data[df_by_col] - - @staticmethod - def __dataframe_merge(df_list: List, how="inner"): - return reduce(lambda df1, df2: pd.merge(df1, df2, how=how), df_list) - - @staticmethod - def __filter_numpy_array(data: np.ndarray, operators: Union[List, Tuple], join_operator=JoinOperator.AND): - conditions = [] - for key, value, operator in operators: - conditions.append(DataNode.__get_filter_condition_per_key_value(data, key, value, operator)) - - if join_operator == JoinOperator.AND: - join_conditions = reduce(and_, conditions) - elif join_operator == JoinOperator.OR: - join_conditions = reduce(or_, conditions) - else: - return NotImplementedError - - return data[join_conditions] - - @staticmethod - def __get_filter_condition_per_key_value(array_data: np.ndarray, key, value, operator: Operator): - if not isinstance(key, int): - key = int(key) - - if operator == Operator.EQUAL: - return array_data[:, key] == value - if operator == Operator.NOT_EQUAL: - return array_data[:, key] != value - if operator == Operator.LESS_THAN: - return array_data[:, key] < value - if operator == Operator.LESS_OR_EQUAL: - return array_data[:, key] <= value - if operator == Operator.GREATER_THAN: - return array_data[:, key] > value - if operator == Operator.GREATER_OR_EQUAL: - return array_data[:, key] >= value - - return NotImplementedError - - @staticmethod - def __filter_list(list_data: List, operators: Union[List, Tuple], join_operator=JoinOperator.AND): - filtered_list_data = [] - for key, value, operator in operators: - filtered_list_data.append(DataNode.__filter_list_per_key_value(list_data, key, value, operator)) - if len(filtered_list_data) == 0: - return filtered_list_data - if join_operator == JoinOperator.AND: - return DataNode.__list_intersect(filtered_list_data) - elif join_operator == JoinOperator.OR: - return list(set(np.concatenate(filtered_list_data))) - else: - raise NotImplementedError - - @staticmethod - def __filter_list_per_key_value(list_data: List, key: str, value, operator: Operator): - filtered_list = [] - for row in list_data: - row_value = getattr(row, key, None) - if row_value is None: - continue - - if operator == Operator.EQUAL and row_value == value: - filtered_list.append(row) - if operator == Operator.NOT_EQUAL and row_value != value: - filtered_list.append(row) - if operator == Operator.LESS_THAN and row_value < value: - filtered_list.append(row) - if operator == Operator.LESS_OR_EQUAL and row_value <= value: - filtered_list.append(row) - if operator == Operator.GREATER_THAN and row_value > value: - filtered_list.append(row) - if operator == Operator.GREATER_OR_EQUAL and row_value >= value: - filtered_list.append(row) - return filtered_list - - @staticmethod - def __list_intersect(list_data): - return list(set(list_data.pop()).intersection(*map(set, list_data))) - @abstractmethod def _read(self): raise NotImplementedError @@ -574,6 +462,7 @@ def _write(self, data): raise NotImplementedError def __getitem__(self, items): + # TODO return _FilterDataNode(self.id, self._read())[items] @property # type: ignore From 92713563ba615b287ca115e9d325de9ebbe62add Mon Sep 17 00:00:00 2001 From: trgiangdo Date: Thu, 2 Nov 2023 01:12:04 +0700 Subject: [PATCH 05/11] refactor: move all filter related method to 
_FilterDataNode class --- src/taipy/core/data/_filter.py | 191 ++++++++++++++++++------------- src/taipy/core/data/data_node.py | 32 ++---- 2 files changed, 121 insertions(+), 102 deletions(-) diff --git a/src/taipy/core/data/_filter.py b/src/taipy/core/data/_filter.py index 7e8e6ca5..0b779ac9 100644 --- a/src/taipy/core/data/_filter.py +++ b/src/taipy/core/data/_filter.py @@ -11,6 +11,7 @@ from collections.abc import Hashable from functools import reduce +from itertools import chain from operator import and_, or_ from typing import Dict, Iterable, List, Tuple, Union @@ -23,95 +24,115 @@ class _FilterDataNode: - __DATAFRAME_DATA_TYPE = "dataframe" - __MULTI_SHEET_EXCEL_DATA_TYPE = "multi_sheet_excel" - __CUSTOM_DATA_TYPE = "custom" - - def __init__(self, data_node_id, data: Union[pd.DataFrame, modin_pd.DataFrame, List]) -> None: - self.data_node_id = data_node_id - self.data = data - self.data_type = None - if self._is_pandas_object(): - self.data_type = self.__DATAFRAME_DATA_TYPE - elif self.is_multi_sheet_excel(): - self.data_type = self.__MULTI_SHEET_EXCEL_DATA_TYPE - else: - self.data_type = self.__CUSTOM_DATA_TYPE - - def _is_pandas_object(self) -> bool: - return isinstance(self.data, (pd.DataFrame, modin_pd.DataFrame)) or isinstance( - self.data, (pd.Series, modin_pd.DataFrame) - ) - - def is_multi_sheet_excel(self) -> bool: - if isinstance(self.data, Dict): - has_df_children = all([isinstance(e, (pd.DataFrame, modin_pd.DataFrame)) for e in self.data.values()]) - has_list_children = all([isinstance(e, List) for e in self.data.values()]) - return has_df_children or has_list_children + @staticmethod + def __is_pandas_object(data) -> bool: + return isinstance(data, (pd.DataFrame, modin_pd.DataFrame)) or isinstance(data, (pd.Series, modin_pd.DataFrame)) + + @staticmethod + def __is_multi_sheet_excel(data) -> bool: + if isinstance(data, Dict): + has_df_children = all([isinstance(e, (pd.DataFrame, modin_pd.DataFrame)) for e in data.values()]) + has_list_children = all([isinstance(e, List) for e in data.values()]) + has_np_array_children = all([isinstance(e, np.ndarray) for e in data.values()]) + return has_df_children or has_list_children or has_np_array_children return False - def data_is_dataframe(self) -> bool: - return self.data_type == self.__DATAFRAME_DATA_TYPE + @staticmethod + def __is_list_of_dict(self) -> bool: + return all(isinstance(x, Dict) for x in self.data) + + @staticmethod + def _filter_by_key(data, key): + if isinstance(key, int): + return _FilterDataNode.__getitem_int(data, key) - def data_is_multi_sheet_excel(self) -> bool: - return self.data_type == self.__MULTI_SHEET_EXCEL_DATA_TYPE + if isinstance(key, slice) or (isinstance(key, tuple) and any(isinstance(e, slice) for e in key)): + return _FilterDataNode.__getitem_slice(data, key) - def __getitem__(self, key): - if isinstance(key, _FilterDataNode): - key = key.data if isinstance(key, Hashable): - filtered_data = self.__getitem_hashable(key) - elif isinstance(key, slice): - filtered_data = self.__getitem_slice(key) - elif isinstance(key, (pd.DataFrame, modin_pd.DataFrame)): - filtered_data = self.__getitem_dataframe(key) - elif is_bool_indexer(key): - filtered_data = self.__getitem_bool_indexer(key) - elif isinstance(key, Iterable): - filtered_data = self.__getitem_iterable(key) - else: - filtered_data = None - return _FilterDataNode(self.data_node_id, filtered_data) + return _FilterDataNode.__getitem_hashable(data, key) + + if isinstance(key, (pd.DataFrame, modin_pd.DataFrame)): + return 
_FilterDataNode.__getitem_dataframe(data, key) + + if is_bool_indexer(key): + return _FilterDataNode.__getitem_bool_indexer(data, key) + + if isinstance(key, Iterable): + return _FilterDataNode.__getitem_iterable(data, key) + + return None + + @staticmethod + def __getitem_int(data, key): + return data[key] - def __getitem_hashable(self, key): - if self.data_is_dataframe() or self.data_is_multi_sheet_excel(): - return self.data.get(key) - return [getattr(e, key) for e in self.data] + @staticmethod + def __getitem_hashable(data, key): + if _FilterDataNode.__is_pandas_object(data) or _FilterDataNode.__is_multi_sheet_excel(data): + return data.get(key) + return [getattr(entry, key, None) for entry in data] - def __getitem_slice(self, key): - return self.data[key] + @staticmethod + def __getitem_slice(data, key): + return data[key] - def __getitem_dataframe(self, key: Union[pd.DataFrame, modin_pd.DataFrame]): - if self.data_is_dataframe(): - return self.data[key] - if self.data_is_list_of_dict(): + @staticmethod + def __getitem_dataframe(data, key: Union[pd.DataFrame, modin_pd.DataFrame]): + if _FilterDataNode.__is_pandas_object(data): + return data[key] + if _FilterDataNode.__is_list_of_dict(data): filtered_data = list() for i, row in key.iterrows(): filtered_row = dict() for col in row.index: - filtered_row[col] = self.data[i][col] if row[col] else None + filtered_row[col] = data[i][col] if row[col] else None filtered_data.append(filtered_row) return filtered_data return None - def data_is_list_of_dict(self) -> bool: - return all(isinstance(x, Dict) for x in self.data) - - def __getitem_bool_indexer(self, key): - if self.data_is_dataframe(): - return self.data[key] - return [e for i, e in enumerate(self.data) if key[i]] + @staticmethod + def __getitem_bool_indexer(data, key): + if _FilterDataNode.__is_pandas_object(data): + return data[key] + return [e for i, e in enumerate(data) if key[i]] - def __getitem_iterable(self, keys): - if self.data_is_dataframe(): - return self.data[keys] + @staticmethod + def __getitem_iterable(data, keys): + if _FilterDataNode.__is_pandas_object(data): + return data[keys] filtered_data = [] - for e in self.data: - filtered_data.append({k: getattr(e, k) for k in keys}) + for entry in data: + filtered_data.append({k: getattr(entry, k) for k in keys if hasattr(entry, k)}) return filtered_data @staticmethod - def _filter_dataframe( + def _filter(data, operators: Union[List, Tuple], join_operator=JoinOperator.AND): + if len(operators) == 0: + return data + + if isinstance(data, Dict): + return {k: _FilterDataNode._filter(v, operators, join_operator) for k, v in data.items()} + + if not ((isinstance(operators[0], list)) or (isinstance(operators[0], tuple))): + if isinstance(data, (pd.DataFrame, modin_pd.DataFrame)): + return _FilterDataNode.__filter_dataframe_per_key_value(data, operators[0], operators[1], operators[2]) + if isinstance(data, np.ndarray): + list_operators = [operators] + return _FilterDataNode.__filter_numpy_array(data, list_operators) + if isinstance(data, List): + return _FilterDataNode.__filter_list_per_key_value(data, operators[0], operators[1], operators[2]) + else: + if isinstance(data, (pd.DataFrame, modin_pd.DataFrame)): + return _FilterDataNode.__filter_dataframe(data, operators, join_operator=join_operator) + if isinstance(data, np.ndarray): + return _FilterDataNode.__filter_numpy_array(data, operators, join_operator=join_operator) + if isinstance(data, List): + return _FilterDataNode.__filter_list(data, operators, 
join_operator=join_operator) + raise NotImplementedError + + @staticmethod + def __filter_dataframe( df_data: Union[pd.DataFrame, modin_pd.DataFrame], operators: Union[List, Tuple], join_operator=JoinOperator.AND ): filtered_df_data = [] @@ -122,11 +143,17 @@ def _filter_dataframe( else: return NotImplementedError for key, value, operator in operators: - filtered_df_data.append(_FilterDataNode._filter_dataframe_per_key_value(df_data, key, value, operator)) + filtered_df_data.append(_FilterDataNode.__filter_dataframe_per_key_value(df_data, key, value, operator)) + + if isinstance(df_data, modin_pd.DataFrame): + if filtered_df_data: + return _FilterDataNode.__modin_dataframe_merge(filtered_df_data, how) + return modin_pd.DataFrame() + return _FilterDataNode.__dataframe_merge(filtered_df_data, how) if filtered_df_data else pd.DataFrame() @staticmethod - def _filter_dataframe_per_key_value( + def __filter_dataframe_per_key_value( df_data: Union[pd.DataFrame, modin_pd.DataFrame], key: str, value, operator: Operator ): df_by_col = df_data[key] @@ -149,7 +176,11 @@ def __dataframe_merge(df_list: List, how="inner"): return reduce(lambda df1, df2: pd.merge(df1, df2, how=how), df_list) @staticmethod - def _filter_numpy_array(data: np.ndarray, operators: Union[List, Tuple], join_operator=JoinOperator.AND): + def __modin_dataframe_merge(df_list: List, how="inner"): + return reduce(lambda df1, df2: modin_pd.merge(df1, df2, how=how), df_list) + + @staticmethod + def __filter_numpy_array(data: np.ndarray, operators: Union[List, Tuple], join_operator=JoinOperator.AND): conditions = [] for key, value, operator in operators: conditions.append(_FilterDataNode.__get_filter_condition_per_key_value(data, key, value, operator)) @@ -184,26 +215,30 @@ def __get_filter_condition_per_key_value(array_data: np.ndarray, key, value, ope return NotImplementedError @staticmethod - def _filter_list(list_data: List, operators: Union[List, Tuple], join_operator=JoinOperator.AND): + def __filter_list(list_data: List, operators: Union[List, Tuple], join_operator=JoinOperator.AND): filtered_list_data = [] for key, value, operator in operators: - filtered_list_data.append(_FilterDataNode._filter_list_per_key_value(list_data, key, value, operator)) + filtered_list_data.append(_FilterDataNode.__filter_list_per_key_value(list_data, key, value, operator)) if len(filtered_list_data) == 0: return filtered_list_data if join_operator == JoinOperator.AND: return _FilterDataNode.__list_intersect(filtered_list_data) elif join_operator == JoinOperator.OR: - return list(set(np.concatenate(filtered_list_data))) + merged_list = list(chain.from_iterable(filtered_list_data)) + if all(isinstance(e, Dict) for e in merged_list): + return list({frozenset(item.items()) for item in merged_list}) + return list(set(merged_list)) else: return NotImplementedError @staticmethod - def _filter_list_per_key_value(list_data: List, key: str, value, operator: Operator): + def __filter_list_per_key_value(list_data: List, key: str, value, operator: Operator): filtered_list = [] for row in list_data: - row_value = getattr(row, key, None) - if row_value is None: - continue + if isinstance(row, Dict): + row_value = row.get(key, None) + else: + row_value = getattr(row, key, None) if operator == Operator.EQUAL and row_value == value: filtered_list.append(row) diff --git a/src/taipy/core/data/data_node.py b/src/taipy/core/data/data_node.py index 66ec5fa7..6b45f92c 100644 --- a/src/taipy/core/data/data_node.py +++ b/src/taipy/core/data/data_node.py @@ -429,29 +429,17 @@ 
def filter(self, operators: Union[List, Tuple], join_operator=JoinOperator.AND): each is in the form of (key, value, `Operator^`). join_operator (JoinOperator^): The operator used to join the multiple filter 3-tuples. + Returns: + The filtered data. + Raises: + NotImplementedError: If the data type is not supported. """ - if self.storage_type in ["sql", "sql_table", "mongo_collection"]: - raise NotImplementedError + data = self._read() + return _FilterDataNode._filter(data, operators, join_operator) + def __getitem__(self, item): data = self._read() - if len(operators) == 0: - return data - if not ((isinstance(operators[0], list)) or (isinstance(operators[0], tuple))): - if isinstance(data, (pd.DataFrame, modin_pd.DataFrame)): - return _FilterDataNode._filter_dataframe_per_key_value(data, operators[0], operators[1], operators[2]) - if isinstance(data, np.ndarray): - list_operators = [operators] - return _FilterDataNode._filter_numpy_array(data, list_operators) - if isinstance(data, List): - return _FilterDataNode._filter_list_per_key_value(data, operators[0], operators[1], operators[2]) - else: - if isinstance(data, (pd.DataFrame, modin_pd.DataFrame)): - return _FilterDataNode._filter_dataframe(data, operators, join_operator=join_operator) - if isinstance(data, np.ndarray): - return _FilterDataNode._filter_numpy_array(data, operators, join_operator=join_operator) - if isinstance(data, List): - return _FilterDataNode._filter_list(data, operators, join_operator=join_operator) - raise NotImplementedError + return _FilterDataNode._filter_by_key(data, item) @abstractmethod def _read(self): @@ -461,10 +449,6 @@ def _read(self): def _write(self, data): raise NotImplementedError - def __getitem__(self, items): - # TODO - return _FilterDataNode(self.id, self._read())[items] - @property # type: ignore @_self_reload(_MANAGER_NAME) def is_ready_for_reading(self) -> bool: From f31e48dc3d35a9530cf6c46fdc25d766da1ddfe6 Mon Sep 17 00:00:00 2001 From: trgiangdo Date: Thu, 2 Nov 2023 01:14:19 +0700 Subject: [PATCH 06/11] refactor: split datanode fiter tests to a dedicated file --- tests/core/data/test_data_node.py | 260 +-------- tests/core/data/test_filter_data_node.py | 645 +++++++++-------------- tests/core/data/utils.py | 120 +++++ 3 files changed, 383 insertions(+), 642 deletions(-) create mode 100644 tests/core/data/utils.py diff --git a/tests/core/data/test_data_node.py b/tests/core/data/test_data_node.py index 9285d51d..c4c9e155 100644 --- a/tests/core/data/test_data_node.py +++ b/tests/core/data/test_data_node.py @@ -20,83 +20,16 @@ from src.taipy.core._orchestrator._orchestrator_factory import _OrchestratorFactory from src.taipy.core.config.job_config import JobConfig from src.taipy.core.data._data_manager import _DataManager -from src.taipy.core.data._filter import _FilterDataNode from src.taipy.core.data.data_node import DataNode from src.taipy.core.data.data_node_id import DataNodeId from src.taipy.core.data.in_memory import InMemoryDataNode -from src.taipy.core.data.operator import JoinOperator, Operator from src.taipy.core.exceptions.exceptions import DataNodeIsBeingEdited, NoData from src.taipy.core.job.job_id import JobId from taipy.config import Config from taipy.config.common.scope import Scope from taipy.config.exceptions.exceptions import InvalidConfigurationId - -class FakeDataNode(InMemoryDataNode): - read_has_been_called = 0 - write_has_been_called = 0 - - def __init__(self, config_id, **kwargs): - scope = kwargs.pop("scope", Scope.SCENARIO) - super().__init__(config_id=config_id, 
scope=scope, **kwargs) - - def _read(self, query=None): - self.read_has_been_called += 1 - - def _write(self, data): - self.write_has_been_called += 1 - - @classmethod - def storage_type(cls) -> str: - return "fake_inmemory" - - write = DataNode.write # Make sure that the writing behavior comes from DataNode - - -class FakeDataframeDataNode(DataNode): - COLUMN_NAME_1 = "a" - COLUMN_NAME_2 = "b" - - def __init__(self, config_id, default_data_frame, **kwargs): - super().__init__(config_id, **kwargs) - self.data = default_data_frame - - def _read(self): - return self.data - - @classmethod - def storage_type(cls) -> str: - return "fake_df_dn" - - -class FakeNumpyarrayDataNode(DataNode): - def __init__(self, config_id, default_array, **kwargs): - super().__init__(config_id, **kwargs) - self.data = default_array - - def _read(self): - return self.data - - @classmethod - def storage_type(cls) -> str: - return "fake_np_dn" - - -class FakeListDataNode(DataNode): - class Row: - def __init__(self, value): - self.value = value - - def __init__(self, config_id, **kwargs): - super().__init__(config_id, **kwargs) - self.data = [self.Row(i) for i in range(10)] - - def _read(self): - return self.data - - @classmethod - def storage_type(cls) -> str: - return "fake_list_dn" +from .utils import FakeDataNode def funct_a_b(input: str): @@ -452,197 +385,6 @@ def test_do_not_recompute_data_node_valid_but_continue_sequence_execution(self): assert ("task_b_c", tp.Status.COMPLETED) in jobs_and_status assert ("task_b_d", tp.Status.COMPLETED) in jobs_and_status - def test_pandas_filter(self, default_data_frame): - df_dn = FakeDataframeDataNode("fake_dataframe_dn", default_data_frame) - COLUMN_NAME_1 = "a" - COLUMN_NAME_2 = "b" - assert isinstance(df_dn[COLUMN_NAME_1], _FilterDataNode) - assert isinstance(df_dn[[COLUMN_NAME_1, COLUMN_NAME_2]], _FilterDataNode) - - def test_filter_pandas_exposed_type(self, default_data_frame): - dn = FakeDataNode("fake_dn") - dn.write("Any data") - - with pytest.raises(NotImplementedError): - dn.filter((("any", 0, Operator.EQUAL)), JoinOperator.OR) - with pytest.raises(NotImplementedError): - dn.filter((("any", 0, Operator.EQUAL)), JoinOperator.OR) - with pytest.raises(NotImplementedError): - dn.filter((("any", 0, Operator.NOT_EQUAL)), JoinOperator.OR) - with pytest.raises(NotImplementedError): - dn.filter((("any", 0, Operator.LESS_THAN)), JoinOperator.AND) - with pytest.raises(NotImplementedError): - dn.filter((("any", 0, Operator.LESS_OR_EQUAL)), JoinOperator.AND) - with pytest.raises(NotImplementedError): - dn.filter((("any", 0, Operator.GREATER_THAN))) - with pytest.raises(NotImplementedError): - dn.filter(("any", 0, Operator.GREATER_OR_EQUAL)) - - df_dn = FakeDataframeDataNode("fake_dataframe_dn", default_data_frame) - - COLUMN_NAME_1 = "a" - COLUMN_NAME_2 = "b" - assert len(df_dn.filter((COLUMN_NAME_1, 1, Operator.EQUAL))) == len( - default_data_frame[default_data_frame[COLUMN_NAME_1] == 1] - ) - assert len(df_dn.filter((COLUMN_NAME_1, 1, Operator.NOT_EQUAL))) == len( - default_data_frame[default_data_frame[COLUMN_NAME_1] != 1] - ) - assert len(df_dn.filter([(COLUMN_NAME_1, 1, Operator.EQUAL)])) == len( - default_data_frame[default_data_frame[COLUMN_NAME_1] == 1] - ) - assert len(df_dn.filter([(COLUMN_NAME_1, 1, Operator.NOT_EQUAL)])) == len( - default_data_frame[default_data_frame[COLUMN_NAME_1] != 1] - ) - assert len(df_dn.filter([(COLUMN_NAME_1, 1, Operator.LESS_THAN)])) == len( - default_data_frame[default_data_frame[COLUMN_NAME_1] < 1] - ) - assert 
len(df_dn.filter([(COLUMN_NAME_1, 1, Operator.LESS_OR_EQUAL)])) == len( - default_data_frame[default_data_frame[COLUMN_NAME_1] <= 1] - ) - assert len(df_dn.filter([(COLUMN_NAME_1, 1, Operator.GREATER_THAN)])) == len( - default_data_frame[default_data_frame[COLUMN_NAME_1] > 1] - ) - assert len(df_dn.filter([(COLUMN_NAME_1, 1, Operator.GREATER_OR_EQUAL)])) == len( - default_data_frame[default_data_frame[COLUMN_NAME_1] >= 1] - ) - assert len(df_dn.filter([(COLUMN_NAME_1, -1000, Operator.LESS_OR_EQUAL)])) == 0 - assert len(df_dn.filter([(COLUMN_NAME_1, 1000, Operator.GREATER_OR_EQUAL)])) == 0 - assert len(df_dn.filter([(COLUMN_NAME_1, 4, Operator.EQUAL), (COLUMN_NAME_1, 5, Operator.EQUAL)])) == len( - default_data_frame[(default_data_frame[COLUMN_NAME_1] == 4) & (default_data_frame[COLUMN_NAME_1] == 5)] - ) - assert len( - df_dn.filter([(COLUMN_NAME_1, 4, Operator.EQUAL), (COLUMN_NAME_2, 5, Operator.EQUAL)], JoinOperator.OR) - ) == len( - default_data_frame[(default_data_frame[COLUMN_NAME_1] == 4) | (default_data_frame[COLUMN_NAME_2] == 5)] - ) - assert len( - df_dn.filter( - [(COLUMN_NAME_1, 1, Operator.GREATER_THAN), (COLUMN_NAME_2, 3, Operator.GREATER_THAN)], JoinOperator.AND - ) - ) == len(default_data_frame[(default_data_frame[COLUMN_NAME_1] > 1) & (default_data_frame[COLUMN_NAME_2] > 3)]) - assert len( - df_dn.filter( - [(COLUMN_NAME_1, 2, Operator.GREATER_THAN), (COLUMN_NAME_1, 3, Operator.GREATER_THAN)], JoinOperator.OR - ) - ) == len(default_data_frame[(default_data_frame[COLUMN_NAME_1] > 2) | (default_data_frame[COLUMN_NAME_1] > 3)]) - assert len( - df_dn.filter( - [(COLUMN_NAME_1, 10, Operator.GREATER_THAN), (COLUMN_NAME_1, -10, Operator.LESS_THAN)], JoinOperator.AND - ) - ) == len( - default_data_frame[(default_data_frame[COLUMN_NAME_1] > 10) | (default_data_frame[COLUMN_NAME_1] < -10)] - ) - assert len( - df_dn.filter( - [(COLUMN_NAME_1, 10, Operator.GREATER_THAN), (COLUMN_NAME_1, -10, Operator.LESS_THAN)], JoinOperator.OR - ) - ) == len( - default_data_frame[(default_data_frame[COLUMN_NAME_1] > 10) | (default_data_frame[COLUMN_NAME_1] < -10)] - ) - - def test_filter_list(self): - list_dn = FakeListDataNode("fake_list_dn") - - KEY_NAME = "value" - - assert len(list_dn.filter((KEY_NAME, 4, Operator.EQUAL))) == 1 - assert len(list_dn.filter((KEY_NAME, 4, Operator.NOT_EQUAL))) == 9 - assert len(list_dn.filter([(KEY_NAME, 4, Operator.EQUAL)])) == 1 - assert len(list_dn.filter([(KEY_NAME, 4, Operator.NOT_EQUAL)])) == 9 - assert len(list_dn.filter([(KEY_NAME, 4, Operator.LESS_THAN)])) == 4 - assert len(list_dn.filter([(KEY_NAME, 4, Operator.LESS_OR_EQUAL)])) == 5 - assert len(list_dn.filter([(KEY_NAME, 4, Operator.GREATER_THAN)])) == 5 - assert len(list_dn.filter([(KEY_NAME, 4, Operator.GREATER_OR_EQUAL)])) == 6 - assert len(list_dn.filter([(KEY_NAME, -1000, Operator.LESS_OR_EQUAL)])) == 0 - assert len(list_dn.filter([(KEY_NAME, 1000, Operator.GREATER_OR_EQUAL)])) == 0 - - assert len(list_dn.filter([(KEY_NAME, 4, Operator.EQUAL), (KEY_NAME, 5, Operator.EQUAL)])) == 0 - assert len(list_dn.filter([(KEY_NAME, 4, Operator.EQUAL), (KEY_NAME, 5, Operator.EQUAL)], JoinOperator.OR)) == 2 - assert ( - len(list_dn.filter([(KEY_NAME, 4, Operator.EQUAL), (KEY_NAME, 11, Operator.EQUAL)], JoinOperator.AND)) == 0 - ) - assert ( - len(list_dn.filter([(KEY_NAME, 4, Operator.EQUAL), (KEY_NAME, 11, Operator.EQUAL)], JoinOperator.OR)) == 1 - ) - - assert ( - len(list_dn.filter([(KEY_NAME, -10, Operator.LESS_OR_EQUAL), (KEY_NAME, 11, Operator.GREATER_OR_EQUAL)])) - == 0 - ) - assert ( - len( - 
list_dn.filter( - [ - (KEY_NAME, 4, Operator.GREATER_OR_EQUAL), - (KEY_NAME, 6, Operator.GREATER_OR_EQUAL), - ], - JoinOperator.AND, - ) - ) - == 4 - ) - assert ( - len( - list_dn.filter( - [ - (KEY_NAME, 4, Operator.GREATER_OR_EQUAL), - (KEY_NAME, 6, Operator.GREATER_OR_EQUAL), - (KEY_NAME, 11, Operator.EQUAL), - ], - JoinOperator.AND, - ) - ) - == 0 - ) - assert ( - len( - list_dn.filter( - [ - (KEY_NAME, 4, Operator.GREATER_OR_EQUAL), - (KEY_NAME, 6, Operator.GREATER_OR_EQUAL), - (KEY_NAME, 11, Operator.EQUAL), - ], - JoinOperator.OR, - ) - ) - == 6 - ) - - def test_filter_numpy_exposed_type(self, default_data_frame): - default_array = default_data_frame.to_numpy() - - df_dn = FakeNumpyarrayDataNode("fake_dataframe_dn", default_array) - - assert len(df_dn.filter((0, 1, Operator.EQUAL))) == len(default_array[default_array[:, 0] == 1]) - assert len(df_dn.filter((0, 1, Operator.NOT_EQUAL))) == len(default_array[default_array[:, 0] != 1]) - assert len(df_dn.filter([(0, 1, Operator.EQUAL)])) == len(default_array[default_array[:, 0] == 1]) - assert len(df_dn.filter([(0, 1, Operator.NOT_EQUAL)])) == len(default_array[default_array[:, 0] != 1]) - assert len(df_dn.filter([(0, 1, Operator.LESS_THAN)])) == len(default_array[default_array[:, 0] < 1]) - assert len(df_dn.filter([(0, 1, Operator.LESS_OR_EQUAL)])) == len(default_array[default_array[:, 0] <= 1]) - assert len(df_dn.filter([(0, 1, Operator.GREATER_THAN)])) == len(default_array[default_array[:, 0] > 1]) - assert len(df_dn.filter([(0, 1, Operator.GREATER_OR_EQUAL)])) == len(default_array[default_array[:, 0] >= 1]) - assert len(df_dn.filter([(0, -1000, Operator.LESS_OR_EQUAL)])) == 0 - assert len(df_dn.filter([(0, 1000, Operator.GREATER_OR_EQUAL)])) == 0 - assert len(df_dn.filter([(0, 4, Operator.EQUAL), (0, 5, Operator.EQUAL)])) == len( - default_array[(default_array[:, 0] == 4) & (default_array[:, 0] == 5)] - ) - assert len(df_dn.filter([(0, 4, Operator.EQUAL), (1, 5, Operator.EQUAL)], JoinOperator.OR)) == len( - default_array[(default_array[:, 0] == 4) | (default_array[:, 1] == 5)] - ) - assert len( - df_dn.filter([(0, 1, Operator.GREATER_THAN), (1, 3, Operator.GREATER_THAN)], JoinOperator.AND) - ) == len(default_array[(default_array[:, 0] > 1) & (default_array[:, 1] > 3)]) - assert len( - df_dn.filter([(0, 2, Operator.GREATER_THAN), (0, 3, Operator.GREATER_THAN)], JoinOperator.OR) - ) == len(default_array[(default_array[:, 0] > 2) | (default_array[:, 0] > 3)]) - assert len( - df_dn.filter([(0, 10, Operator.GREATER_THAN), (0, -10, Operator.LESS_THAN)], JoinOperator.AND) - ) == len(default_array[(default_array[:, 0] > 10) | (default_array[:, 0] < -10)]) - assert len( - df_dn.filter([(0, 10, Operator.GREATER_THAN), (0, -10, Operator.LESS_THAN)], JoinOperator.OR) - ) == len(default_array[(default_array[:, 0] > 10) | (default_array[:, 0] < -10)]) - def test_data_node_update_after_writing(self): dn = FakeDataNode("foo") diff --git a/tests/core/data/test_filter_data_node.py b/tests/core/data/test_filter_data_node.py index c770ddcb..1b121c85 100644 --- a/tests/core/data/test_filter_data_node.py +++ b/tests/core/data/test_filter_data_node.py @@ -13,392 +13,271 @@ import numpy as np import pandas as pd +import pytest + +from src.taipy.core.data.operator import JoinOperator, Operator + +from .utils import ( + CustomClass, + FakeCustomDataNode, + FakeDataframeDataNode, + FakeDataNode, + FakeListDataNode, + FakeMultiSheetExcelCustomDataNode, + FakeMultiSheetExcelDataFrameDataNode, + FakeNumpyarrayDataNode, +) + + +def 
test_filter_pandas_exposed_type(default_data_frame): + dn = FakeDataNode("fake_dn") + dn.write("Any data") + + with pytest.raises(NotImplementedError): + dn.filter((("any", 0, Operator.EQUAL)), JoinOperator.OR) + with pytest.raises(NotImplementedError): + dn.filter((("any", 0, Operator.NOT_EQUAL)), JoinOperator.OR) + with pytest.raises(NotImplementedError): + dn.filter((("any", 0, Operator.LESS_THAN)), JoinOperator.AND) + with pytest.raises(NotImplementedError): + dn.filter((("any", 0, Operator.LESS_OR_EQUAL)), JoinOperator.AND) + with pytest.raises(NotImplementedError): + dn.filter((("any", 0, Operator.GREATER_THAN))) + with pytest.raises(NotImplementedError): + dn.filter(("any", 0, Operator.GREATER_OR_EQUAL)) + + df_dn = FakeDataframeDataNode("fake_dataframe_dn", default_data_frame) -from src.taipy.core.data._filter import _FilterDataNode -from src.taipy.core.data.data_node import DataNode - - -class FakeDataframeDataNode(DataNode): COLUMN_NAME_1 = "a" COLUMN_NAME_2 = "b" - - def __init__(self, config_id, default_data_frame, **kwargs): - super().__init__(config_id, **kwargs) - self.data = default_data_frame - - def _read(self): - return self.data - - -class CustomClass: - def __init__(self, a, b): - self.a = a - self.b = b - - -class FakeCustomDataNode(DataNode): - def __init__(self, config_id, **kwargs): - super().__init__(config_id, **kwargs) - self.data = [CustomClass(i, i * 2) for i in range(10)] - - def _read(self): - return self.data - - -class FakeMultiSheetExcelDataFrameDataNode(DataNode): - def __init__(self, config_id, default_data_frame, **kwargs): - super().__init__(config_id, **kwargs) - self.data = { - "Sheet1": default_data_frame, - "Sheet2": default_data_frame, - } - - def _read(self): - return self.data - - -class FakeMultiSheetExcelCustomDataNode(DataNode): - def __init__(self, config_id, **kwargs): - super().__init__(config_id, **kwargs) - self.data = { - "Sheet1": [CustomClass(i, i * 2) for i in range(10)], - "Sheet2": [CustomClass(i, i * 2) for i in range(10)], - } - - def _read(self): - return self.data - - -class TestFilterDataNode: - def test_get_item(self, default_data_frame): - # get item for DataFrame data_type - default_data_frame[1] = [100, 100] - df_dn = FakeDataframeDataNode("fake_dataframe_dn", default_data_frame) - - filtered_df_dn = df_dn["a"] - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.Series) - assert len(filtered_df_dn.data) == len(default_data_frame["a"]) - assert filtered_df_dn.data.to_dict() == default_data_frame["a"].to_dict() - - filtered_df_dn = df_dn[1] - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.Series) - assert len(filtered_df_dn.data) == len(default_data_frame[1]) - assert filtered_df_dn.data.to_dict() == default_data_frame[1].to_dict() - - filtered_df_dn = df_dn[0:2] - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.DataFrame) - assert filtered_df_dn.data.shape == default_data_frame[0:2].shape - assert len(filtered_df_dn.data) == 2 - - bool_df = default_data_frame.copy(deep=True) > 4 - filtered_df_dn = df_dn[bool_df] - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.DataFrame) - - bool_1d_index = [True, False] - filtered_df_dn = df_dn[bool_1d_index] - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.DataFrame) - assert filtered_df_dn.data.to_dict() == default_data_frame[bool_1d_index].to_dict() - 
assert len(filtered_df_dn.data) == 1 - - filtered_df_dn = df_dn[["a", "b"]] - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.DataFrame) - assert filtered_df_dn.data.shape == default_data_frame[["a", "b"]].shape - assert filtered_df_dn.data.to_dict() == default_data_frame[["a", "b"]].to_dict() - - # get item for custom data_type - custom_dn = FakeCustomDataNode("fake_custom_dn") - - filtered_custom_dn = custom_dn["a"] - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert len(filtered_custom_dn.data) == 10 - assert filtered_custom_dn.data == [i for i in range(10)] - - filtered_custom_dn = custom_dn[0:5] - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert all([isinstance(x, CustomClass) for x in filtered_custom_dn.data]) - assert len(filtered_custom_dn.data) == 5 - - bool_df = pd.DataFrame({"a": [i for i in range(10)], "b": [i * 2 for i in range(10)]}) > 4 - filtered_custom_dn = custom_dn[["a", "b"]][bool_df] - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert all([isinstance(x, Dict) for x in filtered_custom_dn.data]) - for i, row in bool_df.iterrows(): - for col in row.index: - if row[col]: - assert filtered_custom_dn.data[i][col] == custom_dn[["a", "b"]].data[i][col] - else: - assert filtered_custom_dn.data[i][col] is None - - filtered_custom_dn = custom_dn["a"][bool_df] - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert filtered_custom_dn.data is None - - filtered_custom_dn = custom_dn[0:10][bool_df] - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert filtered_custom_dn.data is None - - bool_1d_index = [True if i < 5 else False for i in range(10)] - filtered_custom_dn = custom_dn[bool_1d_index] - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert len(filtered_custom_dn.data) == 5 - assert filtered_custom_dn.data == custom_dn._read()[:5] - - filtered_custom_dn = custom_dn[["a", "b"]] - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert all([isinstance(x, Dict) for x in filtered_custom_dn.data]) - assert len(filtered_custom_dn.data) == 10 - assert filtered_custom_dn.data == [{"a": i, "b": i * 2} for i in range(10)] - - # get item for Multi-sheet Excel data_type - multi_sheet_excel_df_dn = FakeMultiSheetExcelDataFrameDataNode( - "fake_multi_sheet_excel_df_dn", default_data_frame + assert len(df_dn.filter((COLUMN_NAME_1, 1, Operator.EQUAL))) == len( + default_data_frame[default_data_frame[COLUMN_NAME_1] == 1] + ) + assert len(df_dn.filter((COLUMN_NAME_1, 1, Operator.NOT_EQUAL))) == len( + default_data_frame[default_data_frame[COLUMN_NAME_1] != 1] + ) + assert len(df_dn.filter([(COLUMN_NAME_1, 1, Operator.EQUAL)])) == len( + default_data_frame[default_data_frame[COLUMN_NAME_1] == 1] + ) + assert len(df_dn.filter([(COLUMN_NAME_1, 1, Operator.NOT_EQUAL)])) == len( + default_data_frame[default_data_frame[COLUMN_NAME_1] != 1] + ) + assert len(df_dn.filter([(COLUMN_NAME_1, 1, Operator.LESS_THAN)])) == len( + default_data_frame[default_data_frame[COLUMN_NAME_1] < 1] + ) + assert len(df_dn.filter([(COLUMN_NAME_1, 1, Operator.LESS_OR_EQUAL)])) == len( + default_data_frame[default_data_frame[COLUMN_NAME_1] <= 1] + ) + assert len(df_dn.filter([(COLUMN_NAME_1, 1, Operator.GREATER_THAN)])) == len( + 
default_data_frame[default_data_frame[COLUMN_NAME_1] > 1] + ) + assert len(df_dn.filter([(COLUMN_NAME_1, 1, Operator.GREATER_OR_EQUAL)])) == len( + default_data_frame[default_data_frame[COLUMN_NAME_1] >= 1] + ) + assert len(df_dn.filter([(COLUMN_NAME_1, -1000, Operator.LESS_OR_EQUAL)])) == 0 + assert len(df_dn.filter([(COLUMN_NAME_1, 1000, Operator.GREATER_OR_EQUAL)])) == 0 + assert len(df_dn.filter([(COLUMN_NAME_1, 4, Operator.EQUAL), (COLUMN_NAME_1, 5, Operator.EQUAL)])) == len( + default_data_frame[(default_data_frame[COLUMN_NAME_1] == 4) & (default_data_frame[COLUMN_NAME_1] == 5)] + ) + assert len( + df_dn.filter([(COLUMN_NAME_1, 4, Operator.EQUAL), (COLUMN_NAME_2, 5, Operator.EQUAL)], JoinOperator.OR) + ) == len(default_data_frame[(default_data_frame[COLUMN_NAME_1] == 4) | (default_data_frame[COLUMN_NAME_2] == 5)]) + assert len( + df_dn.filter( + [(COLUMN_NAME_1, 1, Operator.GREATER_THAN), (COLUMN_NAME_2, 3, Operator.GREATER_THAN)], JoinOperator.AND ) - filtered_multi_sheet_excel_df_dn = multi_sheet_excel_df_dn["Sheet1"] - assert isinstance(filtered_multi_sheet_excel_df_dn, _FilterDataNode) - assert isinstance(filtered_multi_sheet_excel_df_dn.data, pd.DataFrame) - assert len(filtered_multi_sheet_excel_df_dn.data) == len(default_data_frame) - assert np.array_equal(filtered_multi_sheet_excel_df_dn.data.to_numpy(), default_data_frame.to_numpy()) - - multi_sheet_excel_custom_dn = FakeMultiSheetExcelCustomDataNode("fake_multi_sheet_excel_df_dn") - filtered_multi_sheet_excel_custom_dn = multi_sheet_excel_custom_dn["Sheet1"] - assert isinstance(filtered_multi_sheet_excel_custom_dn, _FilterDataNode) - assert isinstance(filtered_multi_sheet_excel_custom_dn.data, List) - assert len(filtered_multi_sheet_excel_custom_dn.data) == 10 - expected_value = [CustomClass(i, i * 2) for i in range(10)] - assert all( - [ - expected.a == filtered.a and expected.b == filtered.b - for expected, filtered in zip(expected_value, filtered_multi_sheet_excel_custom_dn.data) - ] + ) == len(default_data_frame[(default_data_frame[COLUMN_NAME_1] > 1) & (default_data_frame[COLUMN_NAME_2] > 3)]) + assert len( + df_dn.filter( + [(COLUMN_NAME_1, 2, Operator.GREATER_THAN), (COLUMN_NAME_1, 3, Operator.GREATER_THAN)], JoinOperator.OR ) - - def test_equal(self, default_data_frame): - # equal to for pandas dataframe data_type - df_dn = FakeDataframeDataNode("fake_dataframe_dn", default_data_frame) - - filtered_df_dn = df_dn["a"] == 1 - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.Series) - assert filtered_df_dn.data.dtype == bool - assert all(filtered_df_dn.data == (default_data_frame["a"] == 1)) - - filtered_df_dn = df_dn[["a", "b"]] == 1 - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.DataFrame) - assert all(filtered_df_dn.data.dtypes == bool) - assert all(filtered_df_dn.data == (default_data_frame[["a", "b"]] == 1)) - - # equal to for custom list data_type - custom_dn = FakeCustomDataNode("fake_custom_dn") - - filtered_custom_dn = custom_dn["a"] == 0 - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert all([isinstance(x, bool) for x in filtered_custom_dn.data]) - assert filtered_custom_dn.data == [True] + [False for _ in range(9)] - - filtered_custom_dn = custom_dn[["a", "b"]] == 0 - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert all([isinstance(x, bool) for x in filtered_custom_dn.data]) - assert 
filtered_custom_dn.data == [False for _ in range(10)] - - def test_not_equal(self, default_data_frame): - # not equal to for pandas dataframe data_type - df_dn = FakeDataframeDataNode("fake_dataframe_dn", default_data_frame) - - filtered_df_dn = df_dn["a"] != 1 - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.Series) - assert filtered_df_dn.data.dtype == bool - assert all(filtered_df_dn.data == (default_data_frame["a"] != 1)) - - filtered_df_dn = df_dn[["a", "b"]] != 1 - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.DataFrame) - assert all(filtered_df_dn.data.dtypes == bool) - assert all(filtered_df_dn.data == (default_data_frame[["a", "b"]] != 1)) - - # not equal to for custom list data_type - custom_dn = FakeCustomDataNode("fake_custom_dn") - - filtered_custom_dn = custom_dn["a"] != 0 - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert all([isinstance(x, bool) for x in filtered_custom_dn.data]) - assert filtered_custom_dn.data == [False] + [True for _ in range(9)] - - filtered_custom_dn = custom_dn[["a", "b"]] != 0 - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert all([isinstance(x, bool) for x in filtered_custom_dn.data]) - assert filtered_custom_dn.data == [True for _ in range(10)] - - def test_larger_than(self, default_data_frame): - # larger than for pandas dataframe data_type - df_dn = FakeDataframeDataNode("fake_dataframe_dn", default_data_frame) - - filtered_df_dn = df_dn["a"] > 2 - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.Series) - assert filtered_df_dn.data.dtype == bool - assert all(filtered_df_dn.data == (default_data_frame["a"] > 2)) - - filtered_df_dn = df_dn[["a", "b"]] > 2 - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.DataFrame) - assert all(filtered_df_dn.data.dtypes == bool) - assert all(filtered_df_dn.data == (default_data_frame[["a", "b"]] > 2)) - - # larger than for custom data_type - custom_dn = FakeCustomDataNode("fake_custom_dn") - - filtered_custom_dn = custom_dn["a"] > 5 - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert all([isinstance(x, bool) for x in filtered_custom_dn.data]) - assert filtered_custom_dn.data == [False for _ in range(6)] + [True for _ in range(4)] - - def test_larger_equal_to(self, default_data_frame): - # larger than or equal to for pandas dataframe data_type - df_dn = FakeDataframeDataNode("fake_dataframe_dn", default_data_frame) - - filtered_df_dn = df_dn["a"] >= 4 - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.Series) - assert filtered_df_dn.data.dtype == bool - assert all(filtered_df_dn.data == (default_data_frame["a"] >= 4)) - - filtered_df_dn = df_dn[["a", "b"]] >= 4 - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.DataFrame) - assert all(filtered_df_dn.data.dtypes == bool) - assert all(filtered_df_dn.data == (default_data_frame[["a", "b"]] >= 4)) - - # larger than or equal to for custom data_type - custom_dn = FakeCustomDataNode("fake_custom_dn") - - filtered_custom_dn = custom_dn["a"] >= 5 - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert all([isinstance(x, bool) for x in 
filtered_custom_dn.data]) - assert filtered_custom_dn.data == [False for _ in range(5)] + [True for _ in range(5)] - - def test_lesser_than(self, default_data_frame): - # lesser than for pandas dataframe data_type - df_dn = FakeDataframeDataNode("fake_dataframe_dn", default_data_frame) - - filtered_df_dn = df_dn["a"] < 5 - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.Series) - assert filtered_df_dn.data.dtype == bool - assert all(filtered_df_dn.data == (default_data_frame["a"] < 5)) - - filtered_df_dn = df_dn[["a", "b"]] < 5 - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.DataFrame) - assert all(filtered_df_dn.data.dtypes == bool) - assert all(filtered_df_dn.data == (default_data_frame[["a", "b"]] < 5)) - - # lesser than for custom data_type - custom_dn = FakeCustomDataNode("fake_custom_dn") - - filtered_custom_dn = custom_dn["a"] < 5 - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert all([isinstance(x, bool) for x in filtered_custom_dn.data]) - assert filtered_custom_dn.data == [True for _ in range(5)] + [False for _ in range(5)] - - def test_lesser_equal_to(self, default_data_frame): - # lesser than or equal to for pandas dataframe data_type - df_dn = FakeDataframeDataNode("fake_dataframe_dn", default_data_frame) - - filtered_df_dn = df_dn["a"] <= 5 - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.Series) - assert filtered_df_dn.data.dtype == bool - assert all(filtered_df_dn.data == (default_data_frame["a"] <= 5)) - - filtered_df_dn = df_dn[["a", "b"]] <= 5 - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.DataFrame) - assert all(filtered_df_dn.data.dtypes == bool) - assert all(filtered_df_dn.data == (default_data_frame[["a", "b"]] <= 5)) - - # lesser than or equal to for custom data_type - custom_dn = FakeCustomDataNode("fake_custom_dn") - - filtered_custom_dn = custom_dn["a"] <= 5 - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert all([isinstance(x, bool) for x in filtered_custom_dn.data]) - assert filtered_custom_dn.data == [True for _ in range(6)] + [False for _ in range(4)] - - def test_and(self, default_data_frame): - # and comparator for pandas dataframe data_type - df_dn = FakeDataframeDataNode("fake_dataframe_dn", default_data_frame) - - filtered_df_dn = (df_dn["a"] >= 2) & (df_dn["a"] <= 5) - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.Series) - assert filtered_df_dn.data.dtype == bool - assert all(filtered_df_dn.data == (default_data_frame["a"] >= 2) & (default_data_frame["a"] <= 5)) - - filtered_df_dn = (df_dn[["a", "b"]] >= 2) & (df_dn[["a", "b"]] <= 5) - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.DataFrame) - assert all(filtered_df_dn.data.dtypes == bool) - assert all(filtered_df_dn.data == (default_data_frame[["a", "b"]] >= 2) & (default_data_frame[["a", "b"]] <= 5)) - - # and comparator for custom data_type - custom_dn = FakeCustomDataNode("fake_custom_dn") - - filtered_custom_dn = (custom_dn["a"] >= 2) & (custom_dn["a"] <= 5) - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert all([isinstance(x, bool) for x in filtered_custom_dn.data]) - assert filtered_custom_dn.data == [False for _ in 
range(2)] + [True for _ in range(4)] + [ - False for _ in range(4) - ] - - def test_or(self, default_data_frame): - # or comparator for pandas dataframe data_type - df_dn = FakeDataframeDataNode("fake_dataframe_dn", default_data_frame) - - filtered_df_dn = (df_dn["a"] < 2) | (df_dn["a"] > 5) - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.Series) - assert filtered_df_dn.data.dtype == bool - assert all(filtered_df_dn.data == (default_data_frame["a"] < 2) | (default_data_frame["a"] > 5)) - - filtered_df_dn = (df_dn[["a", "b"]] < 2) | (df_dn[["a", "b"]] > 5) - assert isinstance(filtered_df_dn, _FilterDataNode) - assert isinstance(filtered_df_dn.data, pd.DataFrame) - assert all(filtered_df_dn.data.dtypes == bool) - assert all(filtered_df_dn.data == (default_data_frame[["a", "b"]] < 2) | (default_data_frame[["a", "b"]] > 5)) - - # or comparator for custom data_type - custom_dn = FakeCustomDataNode("fake_custom_dn") - - filtered_custom_dn = (custom_dn["a"] < 2) | (custom_dn["a"] > 5) - assert isinstance(filtered_custom_dn, _FilterDataNode) - assert isinstance(filtered_custom_dn.data, List) - assert all([isinstance(x, bool) for x in filtered_custom_dn.data]) - assert filtered_custom_dn.data == [True for _ in range(2)] + [False for _ in range(4)] + [ - True for _ in range(4) + ) == len(default_data_frame[(default_data_frame[COLUMN_NAME_1] > 2) | (default_data_frame[COLUMN_NAME_1] > 3)]) + assert len( + df_dn.filter( + [(COLUMN_NAME_1, 10, Operator.GREATER_THAN), (COLUMN_NAME_1, -10, Operator.LESS_THAN)], JoinOperator.AND + ) + ) == len(default_data_frame[(default_data_frame[COLUMN_NAME_1] > 10) | (default_data_frame[COLUMN_NAME_1] < -10)]) + assert len( + df_dn.filter( + [(COLUMN_NAME_1, 10, Operator.GREATER_THAN), (COLUMN_NAME_1, -10, Operator.LESS_THAN)], JoinOperator.OR + ) + ) == len(default_data_frame[(default_data_frame[COLUMN_NAME_1] > 10) | (default_data_frame[COLUMN_NAME_1] < -10)]) + + +def test_filter_list(): + list_dn = FakeListDataNode("fake_list_dn") + + KEY_NAME = "value" + + assert len(list_dn.filter((KEY_NAME, 4, Operator.EQUAL))) == 1 + assert len(list_dn.filter((KEY_NAME, 4, Operator.NOT_EQUAL))) == 9 + assert len(list_dn.filter([(KEY_NAME, 4, Operator.EQUAL)])) == 1 + assert len(list_dn.filter([(KEY_NAME, 4, Operator.NOT_EQUAL)])) == 9 + assert len(list_dn.filter([(KEY_NAME, 4, Operator.LESS_THAN)])) == 4 + assert len(list_dn.filter([(KEY_NAME, 4, Operator.LESS_OR_EQUAL)])) == 5 + assert len(list_dn.filter([(KEY_NAME, 4, Operator.GREATER_THAN)])) == 5 + assert len(list_dn.filter([(KEY_NAME, 4, Operator.GREATER_OR_EQUAL)])) == 6 + assert len(list_dn.filter([(KEY_NAME, -1000, Operator.LESS_OR_EQUAL)])) == 0 + assert len(list_dn.filter([(KEY_NAME, 1000, Operator.GREATER_OR_EQUAL)])) == 0 + + assert len(list_dn.filter([(KEY_NAME, 4, Operator.EQUAL), (KEY_NAME, 5, Operator.EQUAL)])) == 0 + assert len(list_dn.filter([(KEY_NAME, 4, Operator.EQUAL), (KEY_NAME, 5, Operator.EQUAL)], JoinOperator.OR)) == 2 + assert len(list_dn.filter([(KEY_NAME, 4, Operator.EQUAL), (KEY_NAME, 11, Operator.EQUAL)], JoinOperator.AND)) == 0 + assert len(list_dn.filter([(KEY_NAME, 4, Operator.EQUAL), (KEY_NAME, 11, Operator.EQUAL)], JoinOperator.OR)) == 1 + + assert ( + len(list_dn.filter([(KEY_NAME, -10, Operator.LESS_OR_EQUAL), (KEY_NAME, 11, Operator.GREATER_OR_EQUAL)])) == 0 + ) + assert ( + len( + list_dn.filter( + [ + (KEY_NAME, 4, Operator.GREATER_OR_EQUAL), + (KEY_NAME, 6, Operator.GREATER_OR_EQUAL), + ], + JoinOperator.AND, + ) + ) + == 4 + ) + assert 
( + len( + list_dn.filter( + [ + (KEY_NAME, 4, Operator.GREATER_OR_EQUAL), + (KEY_NAME, 6, Operator.GREATER_OR_EQUAL), + (KEY_NAME, 11, Operator.EQUAL), + ], + JoinOperator.AND, + ) + ) + == 0 + ) + assert ( + len( + list_dn.filter( + [ + (KEY_NAME, 4, Operator.GREATER_OR_EQUAL), + (KEY_NAME, 6, Operator.GREATER_OR_EQUAL), + (KEY_NAME, 11, Operator.EQUAL), + ], + JoinOperator.OR, + ) + ) + == 6 + ) + + +def test_filter_numpy_exposed_type(default_data_frame): + default_array = default_data_frame.to_numpy() + + df_dn = FakeNumpyarrayDataNode("fake_dataframe_dn", default_array) + + assert len(df_dn.filter((0, 1, Operator.EQUAL))) == len(default_array[default_array[:, 0] == 1]) + assert len(df_dn.filter((0, 1, Operator.NOT_EQUAL))) == len(default_array[default_array[:, 0] != 1]) + assert len(df_dn.filter([(0, 1, Operator.EQUAL)])) == len(default_array[default_array[:, 0] == 1]) + assert len(df_dn.filter([(0, 1, Operator.NOT_EQUAL)])) == len(default_array[default_array[:, 0] != 1]) + assert len(df_dn.filter([(0, 1, Operator.LESS_THAN)])) == len(default_array[default_array[:, 0] < 1]) + assert len(df_dn.filter([(0, 1, Operator.LESS_OR_EQUAL)])) == len(default_array[default_array[:, 0] <= 1]) + assert len(df_dn.filter([(0, 1, Operator.GREATER_THAN)])) == len(default_array[default_array[:, 0] > 1]) + assert len(df_dn.filter([(0, 1, Operator.GREATER_OR_EQUAL)])) == len(default_array[default_array[:, 0] >= 1]) + assert len(df_dn.filter([(0, -1000, Operator.LESS_OR_EQUAL)])) == 0 + assert len(df_dn.filter([(0, 1000, Operator.GREATER_OR_EQUAL)])) == 0 + assert len(df_dn.filter([(0, 4, Operator.EQUAL), (0, 5, Operator.EQUAL)])) == len( + default_array[(default_array[:, 0] == 4) & (default_array[:, 0] == 5)] + ) + assert len(df_dn.filter([(0, 4, Operator.EQUAL), (1, 5, Operator.EQUAL)], JoinOperator.OR)) == len( + default_array[(default_array[:, 0] == 4) | (default_array[:, 1] == 5)] + ) + assert len(df_dn.filter([(0, 1, Operator.GREATER_THAN), (1, 3, Operator.GREATER_THAN)], JoinOperator.AND)) == len( + default_array[(default_array[:, 0] > 1) & (default_array[:, 1] > 3)] + ) + assert len(df_dn.filter([(0, 2, Operator.GREATER_THAN), (0, 3, Operator.GREATER_THAN)], JoinOperator.OR)) == len( + default_array[(default_array[:, 0] > 2) | (default_array[:, 0] > 3)] + ) + assert len(df_dn.filter([(0, 10, Operator.GREATER_THAN), (0, -10, Operator.LESS_THAN)], JoinOperator.AND)) == len( + default_array[(default_array[:, 0] > 10) | (default_array[:, 0] < -10)] + ) + assert len(df_dn.filter([(0, 10, Operator.GREATER_THAN), (0, -10, Operator.LESS_THAN)], JoinOperator.OR)) == len( + default_array[(default_array[:, 0] > 10) | (default_array[:, 0] < -10)] + ) + + +def test_filter_by_get_item(default_data_frame): + # get item for DataFrame data_type + default_data_frame[1] = [100, 100] + df_dn = FakeDataframeDataNode("fake_dataframe_dn", default_data_frame) + + filtered_df_dn = df_dn["a"] + assert isinstance(filtered_df_dn, pd.Series) + assert len(filtered_df_dn) == len(default_data_frame["a"]) + assert filtered_df_dn.to_dict() == default_data_frame["a"].to_dict() + + filtered_df_dn = df_dn[1] + assert isinstance(filtered_df_dn, pd.Series) + assert len(filtered_df_dn) == len(default_data_frame[1]) + assert filtered_df_dn.to_dict() == default_data_frame[1].to_dict() + + filtered_df_dn = df_dn[0:2] + assert isinstance(filtered_df_dn, pd.DataFrame) + assert filtered_df_dn.shape == default_data_frame[0:2].shape + assert len(filtered_df_dn) == 2 + + bool_df = default_data_frame.copy(deep=True) > 4 + filtered_df_dn = 
df_dn[bool_df] + assert isinstance(filtered_df_dn, pd.DataFrame) + + bool_1d_index = [True, False] + filtered_df_dn = df_dn[bool_1d_index] + assert isinstance(filtered_df_dn, pd.DataFrame) + assert filtered_df_dn.to_dict() == default_data_frame[bool_1d_index].to_dict() + assert len(filtered_df_dn) == 1 + + filtered_df_dn = df_dn[["a", "b"]] + assert isinstance(filtered_df_dn, pd.DataFrame) + assert filtered_df_dn.shape == default_data_frame[["a", "b"]].shape + assert filtered_df_dn.to_dict() == default_data_frame[["a", "b"]].to_dict() + + # get item for custom data_type + custom_dn = FakeCustomDataNode("fake_custom_dn") + + filtered_custom_dn = custom_dn["a"] + assert isinstance(filtered_custom_dn, List) + assert len(filtered_custom_dn) == 10 + assert filtered_custom_dn == [i for i in range(10)] + + filtered_custom_dn = custom_dn[0:5] + assert isinstance(filtered_custom_dn, List) + assert all([isinstance(x, CustomClass) for x in filtered_custom_dn]) + assert len(filtered_custom_dn) == 5 + + bool_1d_index = [True if i < 5 else False for i in range(10)] + filtered_custom_dn = custom_dn[bool_1d_index] + assert isinstance(filtered_custom_dn, List) + assert len(filtered_custom_dn) == 5 + assert filtered_custom_dn == custom_dn._read()[:5] + + filtered_custom_dn = custom_dn[["a", "b"]] + assert isinstance(filtered_custom_dn, List) + assert all([isinstance(x, Dict) for x in filtered_custom_dn]) + assert len(filtered_custom_dn) == 10 + assert filtered_custom_dn == [{"a": i, "b": i * 2} for i in range(10)] + + # get item for Multi-sheet Excel data_type + multi_sheet_excel_df_dn = FakeMultiSheetExcelDataFrameDataNode("fake_multi_sheet_excel_df_dn", default_data_frame) + filtered_multi_sheet_excel_df_dn = multi_sheet_excel_df_dn["Sheet1"] + assert isinstance(filtered_multi_sheet_excel_df_dn, pd.DataFrame) + assert len(filtered_multi_sheet_excel_df_dn) == len(default_data_frame) + assert np.array_equal(filtered_multi_sheet_excel_df_dn.to_numpy(), default_data_frame.to_numpy()) + + multi_sheet_excel_custom_dn = FakeMultiSheetExcelCustomDataNode("fake_multi_sheet_excel_df_dn") + filtered_multi_sheet_excel_custom_dn = multi_sheet_excel_custom_dn["Sheet1"] + assert isinstance(filtered_multi_sheet_excel_custom_dn, List) + assert len(filtered_multi_sheet_excel_custom_dn) == 10 + expected_value = [CustomClass(i, i * 2) for i in range(10)] + assert all( + [ + expected.a == filtered.a and expected.b == filtered.b + for expected, filtered in zip(expected_value, filtered_multi_sheet_excel_custom_dn) ] - - def test_to_string(self): - filter_dn = _FilterDataNode("dn_id", []) - assert isinstance(str(filter_dn), str) + ) diff --git a/tests/core/data/utils.py b/tests/core/data/utils.py new file mode 100644 index 00000000..87a0facb --- /dev/null +++ b/tests/core/data/utils.py @@ -0,0 +1,120 @@ +# Copyright 2023 Avaiga Private Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +from src.taipy.core.data.data_node import DataNode +from src.taipy.core.data.in_memory import InMemoryDataNode +from taipy.config.common.scope import Scope + + +class FakeDataNode(InMemoryDataNode): + read_has_been_called = 0 + write_has_been_called = 0 + + def __init__(self, config_id, **kwargs): + scope = kwargs.pop("scope", Scope.SCENARIO) + super().__init__(config_id=config_id, scope=scope, **kwargs) + + def _read(self, query=None): + self.read_has_been_called += 1 + + def _write(self, data): + self.write_has_been_called += 1 + + @classmethod + def storage_type(cls) -> str: + return "fake_inmemory" + + write = DataNode.write # Make sure that the writing behavior comes from DataNode + + +class FakeDataframeDataNode(DataNode): + COLUMN_NAME_1 = "a" + COLUMN_NAME_2 = "b" + + def __init__(self, config_id, default_data_frame, **kwargs): + super().__init__(config_id, **kwargs) + self.data = default_data_frame + + def _read(self): + return self.data + + @classmethod + def storage_type(cls) -> str: + return "fake_df_dn" + + +class FakeNumpyarrayDataNode(DataNode): + def __init__(self, config_id, default_array, **kwargs): + super().__init__(config_id, **kwargs) + self.data = default_array + + def _read(self): + return self.data + + @classmethod + def storage_type(cls) -> str: + return "fake_np_dn" + + +class FakeListDataNode(DataNode): + class Row: + def __init__(self, value): + self.value = value + + def __init__(self, config_id, **kwargs): + super().__init__(config_id, **kwargs) + self.data = [self.Row(i) for i in range(10)] + + def _read(self): + return self.data + + @classmethod + def storage_type(cls) -> str: + return "fake_list_dn" + + +class CustomClass: + def __init__(self, a, b): + self.a = a + self.b = b + + +class FakeCustomDataNode(DataNode): + def __init__(self, config_id, **kwargs): + super().__init__(config_id, **kwargs) + self.data = [CustomClass(i, i * 2) for i in range(10)] + + def _read(self): + return self.data + + +class FakeMultiSheetExcelDataFrameDataNode(DataNode): + def __init__(self, config_id, default_data_frame, **kwargs): + super().__init__(config_id, **kwargs) + self.data = { + "Sheet1": default_data_frame, + "Sheet2": default_data_frame, + } + + def _read(self): + return self.data + + +class FakeMultiSheetExcelCustomDataNode(DataNode): + def __init__(self, config_id, **kwargs): + super().__init__(config_id, **kwargs) + self.data = { + "Sheet1": [CustomClass(i, i * 2) for i in range(10)], + "Sheet2": [CustomClass(i, i * 2) for i in range(10)], + } + + def _read(self): + return self.data From 388cafabed7693e3462d49b9d6ee4797f94ac7e8 Mon Sep 17 00:00:00 2001 From: trgiangdo Date: Thu, 2 Nov 2023 01:15:57 +0700 Subject: [PATCH 07/11] tests: add filter unit tests for supported data node types --- tests/conftest.py | 15 +- tests/core/data/test_csv_data_node.py | 67 +++++ tests/core/data/test_excel_data_node.py | 301 ++++++++++++++++++++ tests/core/data/test_json_data_node.py | 25 +- tests/core/data/test_mongo_data_node.py | 82 +++--- tests/core/data/test_parquet_data_node.py | 67 +++++ tests/core/data/test_sql_data_node.py | 152 ++++++++-- tests/core/data/test_sql_table_data_node.py | 102 ++++++- 8 files changed, 748 insertions(+), 63 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 6c533b54..dff4481c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,9 +8,8 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on # an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied. See the License for the # specific language governing permissions and limitations under the License. -import json + import os -import pathlib import pickle import shutil from datetime import datetime @@ -145,9 +144,9 @@ def tmp_sqlite_db_file_path(tmpdir_factory): file_extension = ".db" db = create_engine("sqlite:///" + os.path.join(fn.strpath, f"{db_name}{file_extension}")) conn = db.connect() - conn.execute(text("CREATE TABLE example (a int, b int, c int);")) - conn.execute(text("INSERT INTO example (a, b, c) VALUES (1, 2, 3);")) - conn.execute(text("INSERT INTO example (a, b, c) VALUES (4, 5, 6);")) + conn.execute(text("CREATE TABLE foo (foo int, bar int);")) + conn.execute(text("INSERT INTO foo (foo, bar) VALUES (1, 2);")) + conn.execute(text("INSERT INTO foo (foo, bar) VALUES (3, 4);")) conn.commit() conn.close() db.dispose() @@ -163,9 +162,9 @@ def tmp_sqlite_sqlite3_file_path(tmpdir_factory): db = create_engine("sqlite:///" + os.path.join(fn.strpath, f"{db_name}{file_extension}")) conn = db.connect() - conn.execute(text("CREATE TABLE example (a int, b int, c int);")) - conn.execute(text("INSERT INTO example (a, b, c) VALUES (1, 2, 3);")) - conn.execute(text("INSERT INTO example (a, b, c) VALUES (4, 5, 6);")) + conn.execute(text("CREATE TABLE foo (foo int, bar int);")) + conn.execute(text("INSERT INTO foo (foo, bar) VALUES (1, 2);")) + conn.execute(text("INSERT INTO foo (foo, bar) VALUES (3, 4);")) conn.commit() conn.close() db.dispose() diff --git a/tests/core/data/test_csv_data_node.py b/tests/core/data/test_csv_data_node.py index cac21fa6..29dcf046 100644 --- a/tests/core/data/test_csv_data_node.py +++ b/tests/core/data/test_csv_data_node.py @@ -22,6 +22,7 @@ from src.taipy.core.data._data_manager import _DataManager from src.taipy.core.data.csv import CSVDataNode from src.taipy.core.data.data_node_id import DataNodeId +from src.taipy.core.data.operator import JoinOperator, Operator from src.taipy.core.exceptions.exceptions import InvalidExposedType, NoData from taipy.config.common.scope import Scope from taipy.config.config import Config @@ -302,6 +303,72 @@ def test_pandas_exposed_type(self): dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": path, "exposed_type": "pandas"}) assert isinstance(dn.read(), pd.DataFrame) + def test_filter_pandas_exposed_type(self, csv_file): + dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": csv_file, "exposed_type": "pandas"}) + dn.write( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 3 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 3 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert dn["foo"].equals(pd.Series([1, 1, 1, 2, None])) + assert dn["bar"].equals(pd.Series([1, 2, None, 2, 2])) + assert dn[:2].equals(pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + + def test_filter_modin_exposed_type(self, csv_file): + dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": csv_file, "exposed_type": "modin"}) + dn.write( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 3 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 3 + assert 
len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert dn["foo"].equals(modin_pd.Series([1, 1, 1, 2, None])) + assert dn["bar"].equals(modin_pd.Series([1, 2, None, 2, 2])) + assert dn[:2].equals(modin_pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + + def test_filter_numpy_exposed_type(self, csv_file): + dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": csv_file, "exposed_type": "numpy"}) + dn.write( + [ + [1, 1], + [1, 2], + [1, 3], + [2, 1], + [2, 2], + [2, 3], + ] + ) + + assert len(dn.filter((0, 1, Operator.EQUAL))) == 3 + assert len(dn.filter((0, 1, Operator.NOT_EQUAL))) == 3 + assert len(dn.filter((1, 2, Operator.EQUAL))) == 2 + assert len(dn.filter([(0, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert np.array_equal(dn[0], np.array([1, 1])) + assert np.array_equal(dn[1], np.array([1, 2])) + assert np.array_equal(dn[:3], np.array([[1, 1], [1, 2], [1, 3]])) + assert np.array_equal(dn[:, 0], np.array([1, 1, 1, 2, 2, 2])) + assert np.array_equal(dn[1:4, :1], np.array([[1], [1], [2]])) + def test_raise_error_invalid_exposed_type(self): path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.csv") with pytest.raises(InvalidExposedType): diff --git a/tests/core/data/test_excel_data_node.py b/tests/core/data/test_excel_data_node.py index 3ae3cd48..e8097cf3 100644 --- a/tests/core/data/test_excel_data_node.py +++ b/tests/core/data/test_excel_data_node.py @@ -23,6 +23,7 @@ from src.taipy.core.data._data_manager import _DataManager from src.taipy.core.data.data_node_id import DataNodeId from src.taipy.core.data.excel import ExcelDataNode +from src.taipy.core.data.operator import JoinOperator, Operator from src.taipy.core.exceptions.exceptions import ( ExposedTypeLengthMismatch, InvalidExposedType, @@ -935,6 +936,306 @@ def test_write_multi_sheet_with_modin( for sheet_name in sheet_names: assert np.array_equal(excel_dn.read()[sheet_name].values, multi_sheet_content[sheet_name].values) + def test_filter_pandas_exposed_type_with_sheetname(self, excel_file): + dn = ExcelDataNode( + "foo", Scope.SCENARIO, properties={"path": excel_file, "sheet_name": "Sheet1", "exposed_type": "pandas"} + ) + dn.write( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 3 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 3 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert dn["foo"].equals(pd.Series([1, 1, 1, 2, None])) + assert dn["bar"].equals(pd.Series([1, 2, None, 2, 2])) + assert dn[:2].equals(pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + + def test_filter_pandas_exposed_type_without_sheetname(self, excel_file): + dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": excel_file, "exposed_type": "pandas"}) + dn.write( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))["Sheet1"]) == 3 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))["Sheet1"]) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))["Sheet1"]) == 3 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)["Sheet1"]) == 4 + + assert dn["Sheet1"]["foo"].equals(pd.Series([1, 1, 1, 2, 
None])) + assert dn["Sheet1"]["bar"].equals(pd.Series([1, 2, None, 2, 2])) + assert dn["Sheet1"][:2].equals(pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + + def test_filter_pandas_exposed_type_multisheet(self, excel_file): + dn = ExcelDataNode( + "foo", + Scope.SCENARIO, + properties={"path": excel_file, "sheet_name": ["sheet_1", "sheet_2"], "exposed_type": "pandas"}, + ) + dn.write( + { + "sheet_1": pd.DataFrame( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ), + "sheet_2": pd.DataFrame( + [ + {"foo": 1, "bar": 3}, + {"foo": 1, "bar": 4}, + {"foo": 1}, + {"foo": 2, "bar": 4}, + {"bar": 4}, + ] + ), + } + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 2 + assert len(dn.filter(("foo", 1, Operator.EQUAL))["sheet_1"]) == 3 + assert len(dn.filter(("foo", 1, Operator.EQUAL))["sheet_2"]) == 3 + + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))["sheet_1"]) == 2 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))["sheet_2"]) == 2 + + assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))["sheet_1"]) == 3 + assert len(dn.filter(("bar", 2, Operator.EQUAL))["sheet_2"]) == 0 + + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 2 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)["sheet_1"]) == 4 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)["sheet_2"]) == 0 + + assert dn["sheet_1"]["foo"].equals(pd.Series([1, 1, 1, 2, None])) + assert dn["sheet_2"]["foo"].equals(pd.Series([1, 1, 1, 2, None])) + assert dn["sheet_1"]["bar"].equals(pd.Series([1, 2, None, 2, 2])) + assert dn["sheet_2"]["bar"].equals(pd.Series([3, 4, None, 4, 4])) + assert dn["sheet_1"][:2].equals(pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + assert dn["sheet_2"][:2].equals(pd.DataFrame([{"foo": 1.0, "bar": 3.0}, {"foo": 1.0, "bar": 4.0}])) + + def test_filter_modin_exposed_type_with_sheetname(self, excel_file): + dn = ExcelDataNode( + "foo", Scope.SCENARIO, properties={"path": excel_file, "sheet_name": "Sheet1", "exposed_type": "modin"} + ) + dn.write( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 3 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 3 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert dn["foo"].equals(modin_pd.Series([1, 1, 1, 2, None])) + assert dn["bar"].equals(modin_pd.Series([1, 2, None, 2, 2])) + assert dn[:2].equals(modin_pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + + def test_filter_modin_exposed_type_without_sheetname(self, excel_file): + dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": excel_file, "exposed_type": "modin"}) + dn.write( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))["Sheet1"]) == 3 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))["Sheet1"]) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))["Sheet1"]) == 3 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], 
JoinOperator.OR)["Sheet1"]) == 4 + + assert dn["Sheet1"]["foo"].equals(modin_pd.Series([1, 1, 1, 2, None])) + assert dn["Sheet1"]["bar"].equals(modin_pd.Series([1, 2, None, 2, 2])) + assert dn["Sheet1"][:2].equals(modin_pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + + def test_filter_modin_exposed_type_multisheet(self, excel_file): + dn = ExcelDataNode( + "foo", + Scope.SCENARIO, + properties={"path": excel_file, "sheet_name": ["sheet_1", "sheet_2"], "exposed_type": "modin"}, + ) + dn.write( + { + "sheet_1": pd.DataFrame( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ), + "sheet_2": pd.DataFrame( + [ + {"foo": 1, "bar": 3}, + {"foo": 1, "bar": 4}, + {"foo": 1}, + {"foo": 2, "bar": 4}, + {"bar": 4}, + ] + ), + } + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 2 + assert len(dn.filter(("foo", 1, Operator.EQUAL))["sheet_1"]) == 3 + assert len(dn.filter(("foo", 1, Operator.EQUAL))["sheet_2"]) == 3 + + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))["sheet_1"]) == 2 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))["sheet_2"]) == 2 + + assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))["sheet_1"]) == 3 + assert len(dn.filter(("bar", 2, Operator.EQUAL))["sheet_2"]) == 0 + + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 2 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)["sheet_1"]) == 4 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)["sheet_2"]) == 0 + + assert dn["sheet_1"]["foo"].equals(modin_pd.Series([1, 1, 1, 2, None])) + assert dn["sheet_2"]["foo"].equals(modin_pd.Series([1, 1, 1, 2, None])) + assert dn["sheet_1"]["bar"].equals(modin_pd.Series([1, 2, None, 2, 2])) + assert dn["sheet_2"]["bar"].equals(modin_pd.Series([3, 4, None, 4, 4])) + assert dn["sheet_1"][:2].equals(modin_pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + assert dn["sheet_2"][:2].equals(modin_pd.DataFrame([{"foo": 1.0, "bar": 3.0}, {"foo": 1.0, "bar": 4.0}])) + + def test_filter_numpy_exposed_type_with_sheetname(self, excel_file): + dn = ExcelDataNode( + "foo", Scope.SCENARIO, properties={"path": excel_file, "sheet_name": "Sheet1", "exposed_type": "numpy"} + ) + dn.write( + [ + [1, 1], + [1, 2], + [1, 3], + [2, 1], + [2, 2], + [2, 3], + ] + ) + + assert len(dn.filter((0, 1, Operator.EQUAL))) == 3 + assert len(dn.filter((0, 1, Operator.NOT_EQUAL))) == 3 + assert len(dn.filter((1, 2, Operator.EQUAL))) == 2 + assert len(dn.filter([(0, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert np.array_equal(dn[0], np.array([1, 1])) + assert np.array_equal(dn[1], np.array([1, 2])) + assert np.array_equal(dn[:3], np.array([[1, 1], [1, 2], [1, 3]])) + assert np.array_equal(dn[:, 0], np.array([1, 1, 1, 2, 2, 2])) + assert np.array_equal(dn[1:4, :1], np.array([[1], [1], [2]])) + + def test_filter_numpy_exposed_type_without_sheetname(self, excel_file): + dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": excel_file, "exposed_type": "numpy"}) + dn.write( + [ + [1, 1], + [1, 2], + [1, 3], + [2, 1], + [2, 2], + [2, 3], + ] + ) + + assert len(dn.filter((0, 1, Operator.EQUAL))["Sheet1"]) == 3 + assert len(dn.filter((0, 1, Operator.NOT_EQUAL))["Sheet1"]) == 3 + assert len(dn.filter((1, 2, 
Operator.EQUAL))["Sheet1"]) == 2 + assert len(dn.filter([(0, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR)["Sheet1"]) == 4 + + assert np.array_equal(dn["Sheet1"][0], np.array([1, 1])) + assert np.array_equal(dn["Sheet1"][1], np.array([1, 2])) + assert np.array_equal(dn["Sheet1"][:3], np.array([[1, 1], [1, 2], [1, 3]])) + assert np.array_equal(dn["Sheet1"][:, 0], np.array([1, 1, 1, 2, 2, 2])) + assert np.array_equal(dn["Sheet1"][1:4, :1], np.array([[1], [1], [2]])) + + def test_filter_numpy_exposed_type_multisheet(self, excel_file): + dn = ExcelDataNode( + "foo", + Scope.SCENARIO, + properties={"path": excel_file, "sheet_name": ["sheet_1", "sheet_2"], "exposed_type": "numpy"}, + ) + dn.write( + { + "sheet_1": pd.DataFrame( + [ + [1, 1], + [1, 2], + [1, 3], + [2, 1], + [2, 2], + [2, 3], + ] + ), + "sheet_2": pd.DataFrame( + [ + [1, 4], + [1, 5], + [1, 6], + [2, 4], + [2, 5], + [2, 6], + ] + ), + } + ) + + assert len(dn.filter((0, 1, Operator.EQUAL))) == 2 + assert len(dn.filter((0, 1, Operator.EQUAL))["sheet_1"]) == 3 + assert len(dn.filter((0, 1, Operator.EQUAL))["sheet_2"]) == 3 + + assert len(dn.filter((0, 1, Operator.NOT_EQUAL))) == 2 + assert len(dn.filter((0, 1, Operator.NOT_EQUAL))["sheet_1"]) == 3 + assert len(dn.filter((0, 1, Operator.NOT_EQUAL))["sheet_2"]) == 3 + + assert len(dn.filter((1, 2, Operator.EQUAL))) == 2 + assert len(dn.filter((1, 2, Operator.EQUAL))["sheet_1"]) == 2 + assert len(dn.filter((1, 2, Operator.EQUAL))["sheet_2"]) == 0 + + assert len(dn.filter([(1, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR)) == 2 + assert len(dn.filter([(1, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR)["sheet_1"]) == 4 + assert len(dn.filter([(1, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR)["sheet_2"]) == 0 + + assert np.array_equal(dn["sheet_1"][0], np.array([1, 1])) + assert np.array_equal(dn["sheet_2"][0], np.array([1, 4])) + assert np.array_equal(dn["sheet_1"][1], np.array([1, 2])) + assert np.array_equal(dn["sheet_2"][1], np.array([1, 5])) + assert np.array_equal(dn["sheet_1"][:3], np.array([[1, 1], [1, 2], [1, 3]])) + assert np.array_equal(dn["sheet_2"][:3], np.array([[1, 4], [1, 5], [1, 6]])) + assert np.array_equal(dn["sheet_1"][:, 0], np.array([1, 1, 1, 2, 2, 2])) + assert np.array_equal(dn["sheet_2"][:, 1], np.array([4, 5, 6, 4, 5, 6])) + assert np.array_equal(dn["sheet_1"][1:4, :1], np.array([[1], [1], [2]])) + assert np.array_equal(dn["sheet_2"][1:4, 1:2], np.array([[5], [6], [4]])) + def test_set_path(self): dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"default_path": "foo.xlsx"}) assert dn.path == "foo.xlsx" diff --git a/tests/core/data/test_json_data_node.py b/tests/core/data/test_json_data_node.py index ff9170bf..c023601e 100644 --- a/tests/core/data/test_json_data_node.py +++ b/tests/core/data/test_json_data_node.py @@ -24,6 +24,7 @@ from src.taipy.core.data._data_manager import _DataManager from src.taipy.core.data.data_node_id import DataNodeId from src.taipy.core.data.json import JSONDataNode +from src.taipy.core.data.operator import JoinOperator, Operator from src.taipy.core.exceptions.exceptions import NoData from taipy.config.common.scope import Scope from taipy.config.config import Config @@ -253,6 +254,28 @@ def test_read_write_custom_encoder_decoder(self, json_file): assert read_data[0].text == "abc" assert read_data[1] == 100 + def test_filter(self, json_file): + json_dn = JSONDataNode("foo", Scope.SCENARIO, properties={"default_path": json_file}) + json_dn.write( + [ + {"foo": 1, "bar": 
1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + {"KWARGS_KEY": "KWARGS_VALUE"}, + ] + ) + + assert len(json_dn.filter(("foo", 1, Operator.EQUAL))) == 3 + assert len(json_dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 3 + assert len(json_dn.filter(("bar", 2, Operator.EQUAL))) == 3 + assert len(json_dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert json_dn[0] == {"foo": 1, "bar": 1} + assert json_dn[2] == {"foo": 1} + assert json_dn[:2] == [{"foo": 1, "bar": 1}, {"foo": 1, "bar": 2}] + @pytest.mark.parametrize( ["properties", "exists"], [ @@ -285,7 +308,7 @@ def test_read_write_after_modify_path(self): def test_get_system_modified_date_instead_of_last_edit_date(self, tmpdir_factory): temp_file_path = str(tmpdir_factory.mktemp("data").join("temp.json")) pd.DataFrame([]).to_json(temp_file_path) - dn = JSONDataNode("foo", Scope.SCENARIO, properties={"path": temp_file_path, "exposed_type": "pandas"}) + dn = JSONDataNode("foo", Scope.SCENARIO, properties={"path": temp_file_path}) dn.write([1, 2, 3]) previous_edit_date = dn.last_edit_date diff --git a/tests/core/data/test_mongo_data_node.py b/tests/core/data/test_mongo_data_node.py index bc2a907e..75fa8d1b 100644 --- a/tests/core/data/test_mongo_data_node.py +++ b/tests/core/data/test_mongo_data_node.py @@ -28,7 +28,7 @@ from taipy.config.common.scope import Scope -@pytest.fixture +@pytest.fixture(scope="function", autouse=True) def clear_mongo_connection_cache(): _connect_mongodb.cache_clear() @@ -129,7 +129,7 @@ def test_raise_error_invalid_custom_document(self, properties): @mongomock.patch(servers=(("localhost", 27017),)) @pytest.mark.parametrize("properties", __properties) - def test_read(self, properties, clear_mongo_connection_cache): + def test_read(self, properties): mock_client = pymongo.MongoClient("localhost") mock_client[properties["db_name"]][properties["collection_name"]].insert_many( [ @@ -174,33 +174,7 @@ def test_read(self, properties, clear_mongo_connection_cache): @mongomock.patch(servers=(("localhost", 27017),)) @pytest.mark.parametrize("properties", __properties) - def test_filter(self, properties): - mock_client = pymongo.MongoClient("localhost") - mock_client[properties["db_name"]][properties["collection_name"]].insert_many( - [ - {"foo": 1, "bar": 1}, - {"foo": 1, "bar": 2}, - {"foo": 1}, - {"foo": 2, "bar": 2}, - {"bar": 2}, - {"KWARGS_KEY": "KWARGS_VALUE"}, - ] - ) - - mongo_dn = MongoCollectionDataNode( - "foo", - Scope.SCENARIO, - properties=properties, - ) - - assert len(mongo_dn.filter(("foo", 1, Operator.EQUAL))) == 3 - assert len(mongo_dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 1 - assert len(mongo_dn.filter(("bar", 2, Operator.EQUAL))) == 3 - assert len(mongo_dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 - - @mongomock.patch(servers=(("localhost", 27017),)) - @pytest.mark.parametrize("properties", __properties) - def test_read_empty_as(self, properties, clear_mongo_connection_cache): + def test_read_empty_as(self, properties): mongo_dn = MongoCollectionDataNode( "foo", Scope.SCENARIO, @@ -219,7 +193,7 @@ def test_read_empty_as(self, properties, clear_mongo_connection_cache): ({"a": 1, "bar": 2}), ], ) - def test_read_wrong_object_properties_name(self, properties, data, clear_mongo_connection_cache): + def test_read_wrong_object_properties_name(self, properties, data): custom_properties = properties.copy() custom_properties["custom_document"] = CustomObjectWithoutArgs mongo_dn = 
MongoCollectionDataNode( @@ -241,7 +215,7 @@ def test_read_wrong_object_properties_name(self, properties, data, clear_mongo_c ({"foo": 1, "bar": 2}, [{"foo": 1, "bar": 2}]), ], ) - def test_write(self, properties, data, written_data, clear_mongo_connection_cache): + def test_write(self, properties, data, written_data): mongo_dn = MongoCollectionDataNode("foo", Scope.SCENARIO, properties=properties) mongo_dn.write(data) @@ -260,7 +234,7 @@ def test_write(self, properties, data, written_data, clear_mongo_connection_cach [], ], ) - def test_write_empty_list(self, properties, data, clear_mongo_connection_cache): + def test_write_empty_list(self, properties, data): mongo_dn = MongoCollectionDataNode( "foo", Scope.SCENARIO, @@ -272,7 +246,7 @@ def test_write_empty_list(self, properties, data, clear_mongo_connection_cache): @mongomock.patch(servers=(("localhost", 27017),)) @pytest.mark.parametrize("properties", __properties) - def test_write_non_serializable(self, properties, clear_mongo_connection_cache): + def test_write_non_serializable(self, properties): mongo_dn = MongoCollectionDataNode("foo", Scope.SCENARIO, properties=properties) data = {"a": 1, "b": mongo_dn} with pytest.raises(InvalidDocument): @@ -280,7 +254,7 @@ def test_write_non_serializable(self, properties, clear_mongo_connection_cache): @mongomock.patch(servers=(("localhost", 27017),)) @pytest.mark.parametrize("properties", __properties) - def test_write_custom_encoder(self, properties, clear_mongo_connection_cache): + def test_write_custom_encoder(self, properties): custom_properties = properties.copy() custom_properties["custom_document"] = CustomObjectWithCustomEncoder mongo_dn = MongoCollectionDataNode("foo", Scope.SCENARIO, properties=custom_properties) @@ -305,7 +279,7 @@ def test_write_custom_encoder(self, properties, clear_mongo_connection_cache): @mongomock.patch(servers=(("localhost", 27017),)) @pytest.mark.parametrize("properties", __properties) - def test_write_custom_encoder_decoder(self, properties, clear_mongo_connection_cache): + def test_write_custom_encoder_decoder(self, properties): custom_properties = properties.copy() custom_properties["custom_document"] = CustomObjectWithCustomEncoderDecoder mongo_dn = MongoCollectionDataNode("foo", Scope.SCENARIO, properties=custom_properties) @@ -327,3 +301,41 @@ def test_write_custom_encoder_decoder(self, properties, clear_mongo_connection_c assert read_data[1].integer == 2 assert read_data[1].text == "def" assert isinstance(read_data[1].time, datetime) + + @mongomock.patch(servers=(("localhost", 27017),)) + @pytest.mark.parametrize("properties", __properties) + def test_filter(self, properties): + mock_client = pymongo.MongoClient("localhost") + mock_client[properties["db_name"]][properties["collection_name"]].insert_many( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + {"KWARGS_KEY": "KWARGS_VALUE"}, + ] + ) + + mongo_dn = MongoCollectionDataNode( + "foo", + Scope.SCENARIO, + properties=properties, + ) + + assert len(mongo_dn.filter(("foo", 1, Operator.EQUAL))) == 3 + assert len(mongo_dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 3 + assert len(mongo_dn.filter(("bar", 2, Operator.EQUAL))) == 3 + assert len(mongo_dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert mongo_dn["foo"] == [1, 1, 1, 2, None, None] + assert mongo_dn["bar"] == [1, 2, None, 2, 2, None] + assert [m.__dict__ for m in mongo_dn[:3]] == [m.__dict__ for m in mongo_dn.read()[:3]] + assert 
mongo_dn[["foo", "bar"]] == [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + {}, + ] diff --git a/tests/core/data/test_parquet_data_node.py b/tests/core/data/test_parquet_data_node.py index 22ff986e..3778f179 100644 --- a/tests/core/data/test_parquet_data_node.py +++ b/tests/core/data/test_parquet_data_node.py @@ -22,6 +22,7 @@ from src.taipy.core.data._data_manager import _DataManager from src.taipy.core.data.data_node_id import DataNodeId +from src.taipy.core.data.operator import JoinOperator, Operator from src.taipy.core.data.parquet import ParquetDataNode from src.taipy.core.exceptions.exceptions import ( InvalidExposedType, @@ -313,6 +314,72 @@ def test_write_to_disk(self, tmpdir_factory, data): assert pathlib.Path(temp_file_path).exists() assert isinstance(dn.read(), pd.DataFrame) + def test_filter_pandas_exposed_type(self, parquet_file_path): + dn = ParquetDataNode("foo", Scope.SCENARIO, properties={"path": parquet_file_path, "exposed_type": "pandas"}) + dn.write( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 3 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 3 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert dn["foo"].equals(pd.Series([1, 1, 1, 2, None])) + assert dn["bar"].equals(pd.Series([1, 2, None, 2, 2])) + assert dn[:2].equals(pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + + def test_filter_modin_exposed_type(self, parquet_file_path): + dn = ParquetDataNode("foo", Scope.SCENARIO, properties={"path": parquet_file_path, "exposed_type": "modin"}) + dn.write( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 3 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 3 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert dn["foo"].equals(modin_pd.Series([1, 1, 1, 2, None])) + assert dn["bar"].equals(modin_pd.Series([1, 2, None, 2, 2])) + assert dn[:2].equals(modin_pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + + def test_filter_numpy_exposed_type(self, parquet_file_path): + dn = ParquetDataNode("foo", Scope.SCENARIO, properties={"path": parquet_file_path, "exposed_type": "numpy"}) + dn.write( + [ + [1, 1], + [1, 2], + [1, 3], + [2, 1], + [2, 2], + [2, 3], + ] + ) + + assert len(dn.filter((0, 1, Operator.EQUAL))) == 3 + assert len(dn.filter((0, 1, Operator.NOT_EQUAL))) == 3 + assert len(dn.filter((1, 2, Operator.EQUAL))) == 2 + assert len(dn.filter([(0, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert np.array_equal(dn[0], np.array([1, 1])) + assert np.array_equal(dn[1], np.array([1, 2])) + assert np.array_equal(dn[:3], np.array([[1, 1], [1, 2], [1, 3]])) + assert np.array_equal(dn[:, 0], np.array([1, 1, 1, 2, 2, 2])) + assert np.array_equal(dn[1:4, :1], np.array([[1], [1], [2]])) + @pytest.mark.parametrize("engine", __engine) def test_pandas_parquet_config_kwargs(self, engine, tmpdir_factory): read_kwargs = {"filters": [("integer", "<", 10)], "columns": ["integer"]} diff --git a/tests/core/data/test_sql_data_node.py b/tests/core/data/test_sql_data_node.py index 
7d5b4156..e332959e 100644 --- a/tests/core/data/test_sql_data_node.py +++ b/tests/core/data/test_sql_data_node.py @@ -13,10 +13,12 @@ from unittest import mock import modin.pandas as modin_pd +import numpy as np import pandas as pd import pytest from src.taipy.core.data.data_node_id import DataNodeId +from src.taipy.core.data.operator import JoinOperator, Operator from src.taipy.core.data.sql import SQLDataNode from src.taipy.core.exceptions.exceptions import MissingRequiredProperty from taipy.config.common.scope import Scope @@ -31,13 +33,13 @@ def __init__(self, foo=None, bar=None, *args, **kwargs): def my_write_query_builder_with_pandas(data: pd.DataFrame): - insert_data = list(data.itertuples(index=False, name=None)) - return ["DELETE FROM foo", ("INSERT INTO foo VALUES (?,?)", insert_data)] + insert_data = data.to_dict("records") + return ["DELETE FROM foo", ("INSERT INTO foo VALUES (:foo, :bar)", insert_data)] def my_write_query_builder_with_modin(data: modin_pd.DataFrame): - insert_data = list(data.itertuples(index=False, name=None)) - return ["DELETE FROM foo", ("INSERT INTO foo VALUES (?,?)", insert_data)] + insert_data = data.to_dict("records") + return ["DELETE FROM foo", ("INSERT INTO foo VALUES (:foo, :bar)", insert_data)] def single_write_query_builder(data): @@ -196,7 +198,7 @@ def test_create(self, pandas_properties, modin_properties): assert dn.read_query == "SELECT * FROM foo" assert dn.write_query_builder == my_write_query_builder_with_modin - @pytest.mark.parametrize("properties", __pandas_properties) + @pytest.mark.parametrize("properties", __pandas_properties + __modin_properties) def test_get_user_properties(self, properties): custom_properties = properties.copy() custom_properties["foo"] = "bar" @@ -227,7 +229,7 @@ def test_create_with_missing_parameters(self, properties): SQLDataNode("foo", Scope.SCENARIO, DataNodeId("dn_id"), properties=properties) @pytest.mark.parametrize("pandas_properties", __pandas_properties) - @pytest.mark.parametrize("modin_properties", __pandas_properties) + @pytest.mark.parametrize("modin_properties", __modin_properties) def test_write_query_builder(self, pandas_properties, modin_properties): custom_properties = pandas_properties.copy() custom_properties.pop("db_extra_args") @@ -235,10 +237,15 @@ def test_write_query_builder(self, pandas_properties, modin_properties): with mock.patch("sqlalchemy.engine.Engine.connect") as engine_mock: # mock connection execute dn.write(pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})) - assert engine_mock.mock_calls[4] == mock.call().__enter__().execute("DELETE FROM foo") - assert engine_mock.mock_calls[5] == mock.call().__enter__().execute( - "INSERT INTO foo VALUES (?,?)", [(1, 4), (2, 5), (3, 6)] - ) + assert len(engine_mock.mock_calls[4].args) == 1 + assert engine_mock.mock_calls[4].args[0].text == "DELETE FROM foo" + assert len(engine_mock.mock_calls[5].args) == 2 + assert engine_mock.mock_calls[5].args[0].text == "INSERT INTO foo VALUES (:foo, :bar)" + assert engine_mock.mock_calls[5].args[1] == [ + {"foo": 1, "bar": 4}, + {"foo": 2, "bar": 5}, + {"foo": 3, "bar": 6}, + ] custom_properties["write_query_builder"] = single_write_query_builder dn = SQLDataNode("foo_bar", Scope.SCENARIO, properties=custom_properties) @@ -246,7 +253,8 @@ def test_write_query_builder(self, pandas_properties, modin_properties): with mock.patch("sqlalchemy.engine.Engine.connect") as engine_mock: # mock connection execute dn.write(pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})) - assert engine_mock.mock_calls[4] == 
mock.call().__enter__().execute("DELETE FROM foo") + assert len(engine_mock.mock_calls[4].args) == 1 + assert engine_mock.mock_calls[4].args[0].text == "DELETE FROM foo" custom_properties = modin_properties.copy() custom_properties.pop("db_extra_args") @@ -254,10 +262,15 @@ def test_write_query_builder(self, pandas_properties, modin_properties): with mock.patch("sqlalchemy.engine.Engine.connect") as engine_mock: # mock connection execute dn.write(modin_pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})) - assert engine_mock.mock_calls[4] == mock.call().__enter__().execute("DELETE FROM foo") - assert engine_mock.mock_calls[5] == mock.call().__enter__().execute( - "INSERT INTO foo VALUES (?,?)", [(1, 4), (2, 5), (3, 6)] - ) + assert len(engine_mock.mock_calls[4].args) == 1 + assert engine_mock.mock_calls[4].args[0].text == "DELETE FROM foo" + assert len(engine_mock.mock_calls[5].args) == 2 + assert engine_mock.mock_calls[5].args[0].text == "INSERT INTO foo VALUES (:foo, :bar)" + assert engine_mock.mock_calls[5].args[1] == [ + {"foo": 1, "bar": 4}, + {"foo": 2, "bar": 5}, + {"foo": 3, "bar": 6}, + ] custom_properties["write_query_builder"] = single_write_query_builder dn = SQLDataNode("foo_bar", Scope.SCENARIO, properties=custom_properties) @@ -265,7 +278,8 @@ def test_write_query_builder(self, pandas_properties, modin_properties): with mock.patch("sqlalchemy.engine.Engine.connect") as engine_mock: # mock connection execute dn.write(modin_pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})) - assert engine_mock.mock_calls[4] == mock.call().__enter__().execute("DELETE FROM foo") + assert len(engine_mock.mock_calls[4].args) == 1 + assert engine_mock.mock_calls[4].args[0].text == "DELETE FROM foo" @pytest.mark.parametrize( "tmp_sqlite_path", @@ -279,7 +293,7 @@ def test_sqlite_read_file_with_different_extension(self, tmp_sqlite_path, reques folder_path, db_name, file_extension = tmp_sqlite_path properties = { "db_engine": "sqlite", - "read_query": "SELECT * from example", + "read_query": "SELECT * from foo", "write_query_builder": single_write_query_builder, "db_name": db_name, "sqlite_folder_path": folder_path, @@ -288,4 +302,106 @@ def test_sqlite_read_file_with_different_extension(self, tmp_sqlite_path, reques dn = SQLDataNode("sqlite_dn", Scope.SCENARIO, properties=properties) data = dn.read() - assert data.equals(pd.DataFrame([{"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}])) + assert data.equals(pd.DataFrame([{"foo": 1, "bar": 2}, {"foo": 3, "bar": 4}])) + + def test_filter_pandas_exposed_type(self, tmp_sqlite_sqlite3_file_path): + folder_path, db_name, file_extension = tmp_sqlite_sqlite3_file_path + properties = { + "db_engine": "sqlite", + "read_query": "SELECT * FROM foo", + "write_query_builder": my_write_query_builder_with_pandas, + "db_name": db_name, + "sqlite_folder_path": folder_path, + "sqlite_file_extension": file_extension, + "exposed_type": "pandas", + } + dn = SQLDataNode("foo", Scope.SCENARIO, properties=properties) + dn.write( + pd.DataFrame( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ) + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 3 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 3 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert dn["foo"].equals(pd.Series([1, 1, 1, 2, None])) + assert dn["bar"].equals(pd.Series([1, 2, None, 2, 2])) + assert 
dn[:2].equals(pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + + def test_filter_modin_exposed_type(self, tmp_sqlite_sqlite3_file_path): + folder_path, db_name, file_extension = tmp_sqlite_sqlite3_file_path + properties = { + "db_engine": "sqlite", + "read_query": "SELECT * FROM foo", + "write_query_builder": my_write_query_builder_with_modin, + "db_name": db_name, + "sqlite_folder_path": folder_path, + "sqlite_file_extension": file_extension, + "exposed_type": "modin", + } + dn = SQLDataNode("foo", Scope.SCENARIO, properties=properties) + dn.write( + pd.DataFrame( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ) + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 3 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 3 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert dn["foo"].equals(modin_pd.Series([1, 1, 1, 2, None])) + assert dn["bar"].equals(modin_pd.Series([1, 2, None, 2, 2])) + assert dn[:2].equals(modin_pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + + def test_filter_numpy_exposed_type(self, tmp_sqlite_sqlite3_file_path): + folder_path, db_name, file_extension = tmp_sqlite_sqlite3_file_path + properties = { + "db_engine": "sqlite", + "read_query": "SELECT * FROM foo", + "write_query_builder": my_write_query_builder_with_pandas, + "db_name": db_name, + "sqlite_folder_path": folder_path, + "sqlite_file_extension": file_extension, + "exposed_type": "numpy", + } + dn = SQLDataNode("foo", Scope.SCENARIO, properties=properties) + dn.write( + pd.DataFrame( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1, "bar": 3}, + {"foo": 2, "bar": 1}, + {"foo": 2, "bar": 2}, + {"foo": 2, "bar": 3}, + ] + ) + ) + + assert len(dn.filter((0, 1, Operator.EQUAL))) == 3 + assert len(dn.filter((0, 1, Operator.NOT_EQUAL))) == 3 + assert len(dn.filter((1, 2, Operator.EQUAL))) == 2 + assert len(dn.filter([(0, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert np.array_equal(dn[0], np.array([1, 1])) + assert np.array_equal(dn[1], np.array([1, 2])) + assert np.array_equal(dn[:3], np.array([[1, 1], [1, 2], [1, 3]])) + assert np.array_equal(dn[:, 0], np.array([1, 1, 1, 2, 2, 2])) + assert np.array_equal(dn[1:4, :1], np.array([[1], [1], [2]])) diff --git a/tests/core/data/test_sql_table_data_node.py b/tests/core/data/test_sql_table_data_node.py index 5a4b2507..e306c63b 100644 --- a/tests/core/data/test_sql_table_data_node.py +++ b/tests/core/data/test_sql_table_data_node.py @@ -18,6 +18,7 @@ import pytest from src.taipy.core.data.data_node_id import DataNodeId +from src.taipy.core.data.operator import JoinOperator, Operator from src.taipy.core.data.sql_table import SQLTableDataNode from src.taipy.core.exceptions.exceptions import InvalidExposedType, MissingRequiredProperty from taipy.config.common.scope import Scope @@ -432,7 +433,7 @@ def test_sqlite_read_file_with_different_extension(self, tmp_sqlite_path, reques folder_path, db_name, file_extension = tmp_sqlite_path properties = { "db_engine": "sqlite", - "table_name": "example", + "table_name": "foo", "db_name": db_name, "sqlite_folder_path": folder_path, "sqlite_file_extension": file_extension, @@ -442,3 +443,102 @@ def test_sqlite_read_file_with_different_extension(self, tmp_sqlite_path, reques data = dn.read() assert data.equals(pd.DataFrame([{"a": 1, "b": 2, "c": 3}, {"a": 
4, "b": 5, "c": 6}])) + + def test_filter_pandas_exposed_type(self, tmp_sqlite_sqlite3_file_path): + folder_path, db_name, file_extension = tmp_sqlite_sqlite3_file_path + properties = { + "db_engine": "sqlite", + "table_name": "foo", + "db_name": db_name, + "sqlite_folder_path": folder_path, + "sqlite_file_extension": file_extension, + "exposed_type": "pandas", + } + dn = SQLTableDataNode("foo", Scope.SCENARIO, properties=properties) + dn.write( + pd.DataFrame( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ) + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 3 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 3 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert dn["foo"].equals(pd.Series([1, 1, 1, 2, None])) + assert dn["bar"].equals(pd.Series([1, 2, None, 2, 2])) + assert dn[:2].equals(pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + + def test_filter_modin_exposed_type(self, tmp_sqlite_sqlite3_file_path): + folder_path, db_name, file_extension = tmp_sqlite_sqlite3_file_path + properties = { + "db_engine": "sqlite", + "table_name": "foo", + "db_name": db_name, + "sqlite_folder_path": folder_path, + "sqlite_file_extension": file_extension, + "exposed_type": "modin", + } + dn = SQLTableDataNode("foo", Scope.SCENARIO, properties=properties) + dn.write( + pd.DataFrame( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1}, + {"foo": 2, "bar": 2}, + {"bar": 2}, + ] + ) + ) + + assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 3 + assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2 + assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 3 + assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert dn["foo"].equals(modin_pd.Series([1, 1, 1, 2, None])) + assert dn["bar"].equals(modin_pd.Series([1, 2, None, 2, 2])) + assert dn[:2].equals(modin_pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}])) + + def test_filter_numpy_exposed_type(self, tmp_sqlite_sqlite3_file_path): + folder_path, db_name, file_extension = tmp_sqlite_sqlite3_file_path + properties = { + "db_engine": "sqlite", + "table_name": "foo", + "db_name": db_name, + "sqlite_folder_path": folder_path, + "sqlite_file_extension": file_extension, + "exposed_type": "numpy", + } + dn = SQLTableDataNode("foo", Scope.SCENARIO, properties=properties) + dn.write( + pd.DataFrame( + [ + {"foo": 1, "bar": 1}, + {"foo": 1, "bar": 2}, + {"foo": 1, "bar": 3}, + {"foo": 2, "bar": 1}, + {"foo": 2, "bar": 2}, + {"foo": 2, "bar": 3}, + ] + ) + ) + + assert len(dn.filter((0, 1, Operator.EQUAL))) == 3 + assert len(dn.filter((0, 1, Operator.NOT_EQUAL))) == 3 + assert len(dn.filter((1, 2, Operator.EQUAL))) == 2 + assert len(dn.filter([(0, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR)) == 4 + + assert np.array_equal(dn[0], np.array([1, 1])) + assert np.array_equal(dn[1], np.array([1, 2])) + assert np.array_equal(dn[:3], np.array([[1, 1], [1, 2], [1, 3]])) + assert np.array_equal(dn[:, 0], np.array([1, 1, 1, 2, 2, 2])) + assert np.array_equal(dn[1:4, :1], np.array([[1], [1], [2]])) From d83b52db70005e1241f3a3410de28c2ec8ed9b51 Mon Sep 17 00:00:00 2001 From: trgiangdo Date: Thu, 2 Nov 2023 01:16:29 +0700 Subject: [PATCH 08/11] fix: can not write to sql data node --- src/taipy/core/data/sql.py | 8 ++++++-- src/taipy/core/data/sql_table.py | 
3 +-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/taipy/core/data/sql.py b/src/taipy/core/data/sql.py index 58f416b9..2a6314dc 100644 --- a/src/taipy/core/data/sql.py +++ b/src/taipy/core/data/sql.py @@ -12,6 +12,8 @@ from datetime import datetime, timedelta from typing import Dict, List, Optional, Set +from sqlalchemy import text + from taipy.config.common.scope import Scope from .._version._version_manager_factory import _VersionManagerFactory @@ -133,6 +135,8 @@ def _do_write(self, data, engine, connection) -> None: queries = [queries] for query in queries: if isinstance(query, str): - connection.execute(query) + connection.execute(text(query)) else: - connection.execute(*query) + statement = query[0] + parameters = query[1] + connection.execute(text(statement), parameters) diff --git a/src/taipy/core/data/sql_table.py b/src/taipy/core/data/sql_table.py index ac1679b8..ebe5209f 100644 --- a/src/taipy/core/data/sql_table.py +++ b/src/taipy/core/data/sql_table.py @@ -141,9 +141,8 @@ def _do_write(self, data, engine, connection) -> None: def _create_table(self, engine) -> Table: return Table( - self.table, + self.properties[self.__TABLE_KEY], MetaData(), - autoload=True, autoload_with=engine, ) From ccd313b66ef8e4f0e08e215844d3abfdd16b81ac Mon Sep 17 00:00:00 2001 From: trgiangdo Date: Thu, 2 Nov 2023 01:29:00 +0700 Subject: [PATCH 09/11] fix: wrong test value on reading sqlite file --- tests/core/data/test_sql_table_data_node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/data/test_sql_table_data_node.py b/tests/core/data/test_sql_table_data_node.py index e306c63b..6ff6d21a 100644 --- a/tests/core/data/test_sql_table_data_node.py +++ b/tests/core/data/test_sql_table_data_node.py @@ -442,7 +442,7 @@ def test_sqlite_read_file_with_different_extension(self, tmp_sqlite_path, reques dn = SQLTableDataNode("sqlite_dn", Scope.SCENARIO, properties=properties) data = dn.read() - assert data.equals(pd.DataFrame([{"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}])) + assert data.equals(pd.DataFrame([{"foo": 1, "bar": 2}, {"foo": 3, "bar": 4}])) def test_filter_pandas_exposed_type(self, tmp_sqlite_sqlite3_file_path): folder_path, db_name, file_extension = tmp_sqlite_sqlite3_file_path From f1e3cbc0cfe8b561e2959d74b396cf1167688e2b Mon Sep 17 00:00:00 2001 From: trgiangdo Date: Fri, 3 Nov 2023 16:21:43 +0700 Subject: [PATCH 10/11] fix: remove self from staticmethod --- src/taipy/core/data/_filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/taipy/core/data/_filter.py b/src/taipy/core/data/_filter.py index 0b779ac9..bcadba90 100644 --- a/src/taipy/core/data/_filter.py +++ b/src/taipy/core/data/_filter.py @@ -38,8 +38,8 @@ def __is_multi_sheet_excel(data) -> bool: return False @staticmethod - def __is_list_of_dict(self) -> bool: - return all(isinstance(x, Dict) for x in self.data) + def __is_list_of_dict(data) -> bool: + return all(isinstance(x, Dict) for x in data) @staticmethod def _filter_by_key(data, key): From bb54978cc53a8e4cd149a3f1bbe668d15794bf27 Mon Sep 17 00:00:00 2001 From: trgiangdo Date: Fri, 3 Nov 2023 18:14:19 +0700 Subject: [PATCH 11/11] fix: sort parents list when migrate entities --- src/taipy/core/_entity/_migrate/_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/taipy/core/_entity/_migrate/_utils.py b/src/taipy/core/_entity/_migrate/_utils.py index 9bdde6da..fb7e2aaa 100644 --- a/src/taipy/core/_entity/_migrate/_utils.py +++ b/src/taipy/core/_entity/_migrate/_utils.py @@ -51,6 +51,7 
@@ def __search_parent_ids(entity_id: str, data: Dict) -> List: if entity_type == "TASK" and "SCENARIO" in _id: if entity_id in entity_data["tasks"]: parents.append(_id) + parents.sort() return parents @@ -68,6 +69,8 @@ def __search_parent_config(entity_id: str, config: Dict, entity_type: str) -> Li if entity_type == "TASK" and possible_parents == "SCENARIO": if section_id in entity_data["tasks"]: parents.append(section_id) + + parents.sort() return parents
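
The write path reworked in PATCH 08 funnels every statement through sqlalchemy.text() and hands the insert rows to execute() as a list of dicts, which is why the write_query_builder helpers switch from "?" placeholders to named ":foo"/":bar" parameters. Below is a minimal, self-contained sketch of that calling convention, assuming SQLAlchemy 2.x and an in-memory SQLite database (the engine URL and the CREATE TABLE statement are illustrative additions, not taken from the patches):

    import pandas as pd
    from sqlalchemy import create_engine, text

    engine = create_engine("sqlite://")  # in-memory SQLite, assumed for this sketch
    data = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})

    # Same shape as my_write_query_builder_with_pandas: a bare statement,
    # then a (statement, parameters) pair using named bind parameters.
    queries = [
        "DELETE FROM foo",
        ("INSERT INTO foo VALUES (:foo, :bar)", data.to_dict("records")),
    ]

    with engine.connect() as connection:
        connection.execute(text("CREATE TABLE foo (foo INTEGER, bar INTEGER)"))
        for query in queries:
            if isinstance(query, str):
                connection.execute(text(query))
            else:
                statement, parameters = query
                # A list of dicts triggers an executemany-style insert, one row per dict.
                connection.execute(text(statement), parameters)
        connection.commit()

This is also what the reworked engine_mock assertions check: mock_calls[4] carries a single TextClause argument ("DELETE FROM foo"), and mock_calls[5] carries the TextClause plus the list of row dicts.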
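
For the "numpy" exposed type, the filter keys are positional column indices and each Operator acts as a boolean mask over that column, with JoinOperator.AND/OR combining the masks; the counts asserted in the new test_filter_numpy_exposed_type methods follow from that. A small sketch of the masks those assertions reduce to, using the six rows written in the SQL tests above (plain NumPy only, independent of the data node classes):

    import numpy as np

    # The rows written by test_filter_numpy_exposed_type (columns: foo, bar).
    data = np.array([[1, 1], [1, 2], [1, 3], [2, 1], [2, 2], [2, 3]])

    # (0, 1, Operator.EQUAL): compare column 0 against 1.
    assert len(data[data[:, 0] == 1]) == 3
    # (0, 1, Operator.NOT_EQUAL)
    assert len(data[data[:, 0] != 1]) == 3
    # (1, 2, Operator.EQUAL)
    assert len(data[data[:, 1] == 2]) == 2
    # [(0, 1, EQUAL), (1, 2, EQUAL)] joined with JoinOperator.OR: masks combined with |.
    assert len(data[(data[:, 0] == 1) | (data[:, 1] == 2)]) == 4

    # The slicing assertions check the same expressions on the raw array.
    assert np.array_equal(data[:, 0], np.array([1, 1, 1, 2, 2, 2]))
    assert np.array_equal(data[1:4, :1], np.array([[1], [1], [2]]))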