diff --git a/.checklist.yaml b/.checklist.yaml new file mode 100644 index 00000000..f0c21171 --- /dev/null +++ b/.checklist.yaml @@ -0,0 +1,30 @@ +apiVersion: quintoandar.com.br/checklist/v2 +kind: ServiceChecklist +metadata: + name: butterfree +spec: + description: >- + A solution for Feature Stores. + + costCenter: C055 + department: engineering + lifecycle: production + docs: true + + ownership: + team: data_products_mlops + line: tech_platform + owner: otavio.cals@quintoandar.com.br + + libraries: + - name: butterfree + type: common-usage + path: https://quintoandar.github.io/python-package-server/ + description: A lib to build Feature Stores. + registries: + - github-packages + tier: T0 + + channels: + squad: 'mlops' + alerts: 'data-products-reports' diff --git a/.github/workflows/skip_lint.yml b/.github/workflows/skip_lint.yml new file mode 100644 index 00000000..1c768a23 --- /dev/null +++ b/.github/workflows/skip_lint.yml @@ -0,0 +1,17 @@ +# This step is used only because we want to mark the runner-linter check as required +# for PRs to develop, but not for the merge queue to merge into develop, +# github does not have this functionality yet + +name: 'Skip github-actions/runner-linter check at merge queue' + +on: + merge_group: + +jobs: + empty_job: + name: 'github-actions/runner-linter' + runs-on: github-actions-developers-runner + steps: + - name: Skip github-actions/runner-linter check at merge queue + run: | + echo "Done" diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index 0b300844..94044e54 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -211,4 +211,4 @@ def create_table(self, columns: List[CassandraColumn], table: str) -> None: """ query = self._get_create_table_query(columns, table) - self.sql(query) + self.sql(query) \ No newline at end of file diff --git a/butterfree/pipelines/feature_set_pipeline.py b/butterfree/pipelines/feature_set_pipeline.py index f1c94ec2..c33f3bb9 100644 --- a/butterfree/pipelines/feature_set_pipeline.py +++ b/butterfree/pipelines/feature_set_pipeline.py @@ -221,9 +221,12 @@ def run( # Step 2: Repartition and sort if required, avoid if not necessary. if partition_by: order_by = order_by or partition_by - dataframe = repartition_sort_df( - dataframe, partition_by, order_by, num_processors - ) + current_partitions = dataframe.rdd.getNumPartitions() + optimal_partitions = num_processors or current_partitions + if current_partitions != optimal_partitions: + dataframe = repartition_sort_df( + dataframe, partition_by, order_by, num_processors + ) # Step 3: Construct the feature set dataframe using defined transformations. transformed_dataframe = self.feature_set.construct( diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 516b6fed..0760af14 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -636,6 +636,10 @@ def construct( else: output_df = self._aggregate(output_df, features=self.features) + output_df = self.incremental_strategy.filter_with_incremental_strategy( + dataframe=output_df, start_date=start_date, end_date=end_date + ) + output_df = output_df.select(*self.columns).replace( # type: ignore float("nan"), None ) diff --git a/docs/source/butterfree.dataframe_service.rst b/docs/source/butterfree.dataframe_service.rst index ae9658a5..4fb54fd3 100644 --- a/docs/source/butterfree.dataframe_service.rst +++ b/docs/source/butterfree.dataframe_service.rst @@ -23,6 +23,22 @@ butterfree.dataframe\_service.partitioning module butterfree.dataframe\_service.repartition module ------------------------------------------------ +.. automodule:: butterfree.dataframe_service.incremental_strategy + :members: + :undoc-members: + :show-inheritance: + +butterfree.dataframe\_service.partitioning module +------------------------------------------------- + +.. automodule:: butterfree.dataframe_service.partitioning + :members: + :undoc-members: + :show-inheritance: + +butterfree.dataframe\_service.repartition module +------------------------------------------------ + .. automodule:: butterfree.dataframe_service.repartition :members: :undoc-members: diff --git a/logging.json b/logging.json new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/butterfree/clients/test_cassandra_client.py b/tests/unit/butterfree/clients/test_cassandra_client.py index 9f634cf7..bc2024fc 100644 --- a/tests/unit/butterfree/clients/test_cassandra_client.py +++ b/tests/unit/butterfree/clients/test_cassandra_client.py @@ -1,8 +1,6 @@ from typing import Any, Dict, List from unittest.mock import MagicMock -import pytest - from butterfree.clients import CassandraClient from butterfree.clients.cassandra_client import ( EMPTY_STRING_HOST_ERROR,