From fc41ab91ce7b6aebd3400d0ea25ded8ee3a6a40b Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 8 Feb 2021 11:20:45 -0300 Subject: [PATCH 01/70] [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master (#280) * Change github actions pipelines. * Change pipeline logic. --- .github/workflows/publish.yml | 2 +- .github/workflows/staging.yml | 45 +++++++++++++++++++++++++++++++++++ .github/workflows/test.yml | 1 + 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/staging.yml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 7dff34a78..ba94bc7e2 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -7,7 +7,7 @@ on: jobs: Pipeline: - if: github.ref == 'refs/heads/master' || contains(github.ref, 'hotfix/') + if: github.ref == 'refs/heads/master' runs-on: ubuntu-16.04 container: quintoandar/python-3-7-java diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml new file mode 100644 index 000000000..709291704 --- /dev/null +++ b/.github/workflows/staging.yml @@ -0,0 +1,45 @@ +name: "Publish Dev Package" +on: + push: + branches: + - staging + + +jobs: + Pipeline: + if: github.ref == 'refs/heads/staging' + + runs-on: ubuntu-16.04 + container: quintoandar/python-3-7-java + + steps: + - uses: actions/checkout@v2 + + - name: Install dependencies + run: make ci-install + + - name: Build package + run: make package + + - name: Get version + run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 ).dev$(git rev-parse --short "$GITHUB_SHA")" >> $GITHUB_ENV + + - name: Create release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ env.version }} + release_name: Release ${{ env.version }} + prerelease: true + + - name: Release already exist + if: ${{ failure() }} + run: echo Release already exist + + - name: Publish release to pypi.org + if: ${{ success() }} + env: + PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} + PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose dist/* diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 29394a0e8..b39246fda 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,6 +3,7 @@ on: push: branches: - master + - staging - hotfix/** pull_request: From 4be4ffe59a7ff8d4b472e3010d564c06e5a81ace Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 8 Feb 2021 18:36:49 -0300 Subject: [PATCH 02/70] [BUG] Fix Staging GithubActions Pipeline (#283) * New step on pipelie. * Some adjusts. --- .github/workflows/staging.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 709291704..e02009a4a 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -24,6 +24,12 @@ jobs: - name: Get version run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 ).dev$(git rev-parse --short "$GITHUB_SHA")" >> $GITHUB_ENV + - name: Change package name and path + run: | + mkdir ./dist/p/ \; + find . -wholename "./dist/butterfree-*.whl" -exec mv '{}' ./dist/p/butterfree-${{ env.version }}-py3-none-any.whl \; + find . 
-wholename "./dist/butterfree-*.tar.gz" -exec mv '{}' ./dist/p/butterfree-${{ env.version }}.tar.gz \; + - name: Create release uses: actions/create-release@v1 env: @@ -42,4 +48,4 @@ jobs: env: PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose dist/* + run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose ./dist/p/* From a3a601bd037a25dacd60bc5a643c4e2eec81a39e Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 8 Feb 2021 19:06:29 -0300 Subject: [PATCH 03/70] Apply only wheel. (#285) --- .github/workflows/staging.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index e02009a4a..504bb086b 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -48,4 +48,4 @@ jobs: env: PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose ./dist/p/* + run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose ./dist/p/butterfree-${{ env.version }}-py3-none-any.whl From 433960802a33ed3c545a04c6ab8c7d2594a1d7ee Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 9 Feb 2021 16:49:47 -0300 Subject: [PATCH 04/70] [BUG] Change version on setup.py to PyPI (#286) * Add new make command to change version. * Change command order. * Change desc and variable name. * Change command name. --- .github/workflows/staging.yml | 12 ++++-------- Makefile | 24 ++++++++++++++++++------ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 504bb086b..62e97ea28 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -18,17 +18,13 @@ jobs: - name: Install dependencies run: make ci-install - - name: Build package - run: make package - - name: Get version run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 ).dev$(git rev-parse --short "$GITHUB_SHA")" >> $GITHUB_ENV - - name: Change package name and path + - name: Build package run: | - mkdir ./dist/p/ \; - find . -wholename "./dist/butterfree-*.whl" -exec mv '{}' ./dist/p/butterfree-${{ env.version }}-py3-none-any.whl \; - find . 
-wholename "./dist/butterfree-*.tar.gz" -exec mv '{}' ./dist/p/butterfree-${{ env.version }}.tar.gz \; + make change-version NEW_VERSION="${{ env.version }}" + make package - name: Create release uses: actions/create-release@v1 @@ -48,4 +44,4 @@ jobs: env: PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose ./dist/p/butterfree-${{ env.version }}-py3-none-any.whl + run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose dist/* diff --git a/Makefile b/Makefile index 41ad00ab4..e6de9baa5 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,11 @@ +# globals + +PACKAGE_NAME := $(shell grep __package_name__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2) +VERSION := $(shell grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2) + + +#custom targets + .PHONY: environment ## create virtual environment for butterfree environment: @@ -119,16 +127,20 @@ clean: @find ./ -name '*~' -exec rm -f {} \; .PHONY: version -## dump package name into VERSION env variable and show +## show version version: - @export VERSION=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2) - @$(info VERSION is [${VERSION}]) + @echo "VERSION: $(VERSION)" + +.PHONY: change-version +## change the version to string received in the NEW_VERSION variable and show +change-version: + @sed -i 's/$(VERSION)/$(NEW_VERSION)/g' setup.py + @echo "VERSION: $(NEW_VERSION)" .PHONY: package-name -## dump package name into PACKAGE_NAME env variable and show +## show package name package-name: - @PACKAGE_NAME=$(grep __package_name__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 | sed 's/.*/&${build}/') - @echo $PACKAGE_NAME + @echo "PACKAGE_NAME: $(PACKAGE_NAME)" .PHONY: package ## build butterfree package wheel From a82433c40b011b03869ebe05070d90955b1b566b Mon Sep 17 00:00:00 2001 From: hmeretti Date: Tue, 9 Feb 2021 18:54:23 -0300 Subject: [PATCH 05/70] Keep milliseconds when using 'from_ms' argument in timestamp feature (#284) * changed timestamp resolution * fix import * simple refactor Co-authored-by: Henrique Camargo --- .../transform/features/timestamp_feature.py | 15 ++++++------- .../butterfree/transform/features/conftest.py | 17 ++++++++++---- .../features/test_timestamp_feature.py | 22 +++++++++++++++---- 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/butterfree/transform/features/timestamp_feature.py b/butterfree/transform/features/timestamp_feature.py index 2aac8925a..b131eaeee 100644 --- a/butterfree/transform/features/timestamp_feature.py +++ b/butterfree/transform/features/timestamp_feature.py @@ -1,6 +1,6 @@ """TimestampFeature entity.""" from pyspark.sql import DataFrame -from pyspark.sql.functions import from_unixtime, to_timestamp +from pyspark.sql.functions import to_timestamp from butterfree.constants import DataType from butterfree.constants.columns import TIMESTAMP_COLUMN @@ -65,13 +65,12 @@ def transform(self, dataframe: DataFrame) -> DataFrame: """ column_name = self.from_column if self.from_column else self.name + ts_column = dataframe[column_name] if self.from_ms: - dataframe = dataframe.withColumn( - column_name, from_unixtime(dataframe[column_name] / 1000.0) - ) - if self.mask: - dataframe = dataframe.withColumn( - column_name, to_timestamp(dataframe[column_name], self.mask) - ) + ts_column = ts_column / 1000 + + dataframe = dataframe.withColumn( + column_name, to_timestamp(ts_column, 
self.mask) + ) return super().transform(dataframe) diff --git a/tests/unit/butterfree/transform/features/conftest.py b/tests/unit/butterfree/transform/features/conftest.py index e79c5075f..ae6444703 100644 --- a/tests/unit/butterfree/transform/features/conftest.py +++ b/tests/unit/butterfree/transform/features/conftest.py @@ -18,8 +18,8 @@ def feature_set_dataframe(spark_context, spark_session): @fixture def feature_set_dataframe_ms_from_column(spark_context, spark_session): data = [ - {"id": 1, "ts": 1581542311000, "feature": 100}, - {"id": 2, "ts": 1581542322000, "feature": 200}, + {"id": 1, "ts": 1581542311112, "feature": 100}, + {"id": 2, "ts": 1581542322223, "feature": 200}, ] return spark_session.read.json(spark_context.parallelize(data, 1)) @@ -27,8 +27,17 @@ def feature_set_dataframe_ms_from_column(spark_context, spark_session): @fixture def feature_set_dataframe_ms(spark_context, spark_session): data = [ - {"id": 1, TIMESTAMP_COLUMN: 1581542311000, "feature": 100}, - {"id": 2, TIMESTAMP_COLUMN: 1581542322000, "feature": 200}, + {"id": 1, TIMESTAMP_COLUMN: 1581542311112, "feature": 100}, + {"id": 2, TIMESTAMP_COLUMN: 1581542322223, "feature": 200}, + ] + return spark_session.read.json(spark_context.parallelize(data, 1)) + + +@fixture +def feature_set_dataframe_small_time_diff(spark_context, spark_session): + data = [ + {"id": 1, TIMESTAMP_COLUMN: 1581542311001, "feature": 100}, + {"id": 2, TIMESTAMP_COLUMN: 1581542311002, "feature": 200}, ] return spark_session.read.json(spark_context.parallelize(data, 1)) diff --git a/tests/unit/butterfree/transform/features/test_timestamp_feature.py b/tests/unit/butterfree/transform/features/test_timestamp_feature.py index c77450362..a5a688c2a 100644 --- a/tests/unit/butterfree/transform/features/test_timestamp_feature.py +++ b/tests/unit/butterfree/transform/features/test_timestamp_feature.py @@ -32,8 +32,8 @@ def test_transform_ms_from_column(self, feature_set_dataframe_ms_from_column): df = df.withColumn("timestamp", df["timestamp"].cast(StringType())).collect() - assert df[0]["timestamp"] == "2020-02-12 21:18:31" - assert df[1]["timestamp"] == "2020-02-12 21:18:42" + assert df[0]["timestamp"] == "2020-02-12 21:18:31.112" + assert df[1]["timestamp"] == "2020-02-12 21:18:42.223" def test_transform_ms(self, feature_set_dataframe_ms): @@ -43,8 +43,22 @@ def test_transform_ms(self, feature_set_dataframe_ms): df = df.withColumn("timestamp", df["timestamp"].cast(StringType())).collect() - assert df[0]["timestamp"] == "2020-02-12 21:18:31" - assert df[1]["timestamp"] == "2020-02-12 21:18:42" + assert df[0]["timestamp"] == "2020-02-12 21:18:31.112" + assert df[1]["timestamp"] == "2020-02-12 21:18:42.223" + + def test_transform_ms_from_column_small_time_diff( + self, feature_set_dataframe_small_time_diff + ): + + test_key = TimestampFeature(from_ms=True) + + df = test_key.transform(feature_set_dataframe_small_time_diff).orderBy( + "timestamp" + ) + + df = df.withColumn("timestamp", df["timestamp"].cast(StringType())).collect() + + assert df[0]["timestamp"] != df[1]["timestamp"] def test_transform_mask(self, feature_set_dataframe_date): From dcbf5408d7ffaca80c7679cd3cc64a819c3bf37d Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 10 Feb 2021 14:46:31 -0300 Subject: [PATCH 06/70] Change trigger for pipeline staging (#287) * Change trigger to publish dev pipeline. * Some fix. 
--- .github/workflows/staging.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 62e97ea28..8b39e5ac3 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -1,8 +1,8 @@ name: "Publish Dev Package" on: push: - branches: - - staging + paths: + - 'setup.py' jobs: @@ -19,12 +19,10 @@ jobs: run: make ci-install - name: Get version - run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 ).dev$(git rev-parse --short "$GITHUB_SHA")" >> $GITHUB_ENV + run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 )" >> $GITHUB_ENV - name: Build package - run: | - make change-version NEW_VERSION="${{ env.version }}" - make package + run: make package - name: Create release uses: actions/create-release@v1 From a0a933596ce0a2233b30ccbd4e8e60e62ae8aa5f Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 10 Feb 2021 17:20:46 -0300 Subject: [PATCH 07/70] Create a dev package. (#288) --- CHANGELOG.md | 20 ++++++++++++++++++++ setup.py | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48b5cbf1a..729946214 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,26 @@ All notable changes to this project will be documented in this file. Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each release or unreleased log for a better organization. +## [Unreleased] +### Added +* [MLOP-599] Apply mypy to ButterFree ([#273](https://github.com/quintoandar/butterfree/pull/273)) + +### Changed +* [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master ([#280](https://github.com/quintoandar/butterfree/pull/280)) +* Keep milliseconds when using 'from_ms' argument in timestamp feature ([#284](https://github.com/quintoandar/butterfree/pull/284)) + +### Fixed +* Change trigger for pipeline staging ([#287](https://github.com/quintoandar/butterfree/pull/287)) + +## [1.1.2](https://github.com/quintoandar/butterfree/releases/tag/1.1.2) +### Fixed +* [HOTFIX] Add both cache and count back to Butterfree ([#274](https://github.com/quintoandar/butterfree/pull/274)) +* [MLOP-606] Change docker image in Github Actions Pipeline ([#275](https://github.com/quintoandar/butterfree/pull/275)) +* FIX Read the Docs build ([#272](https://github.com/quintoandar/butterfree/pull/272)) +* [BUG] Fix style ([#271](https://github.com/quintoandar/butterfree/pull/271)) +* [MLOP-594] Remove from_column in some transforms ([#270](https://github.com/quintoandar/butterfree/pull/270)) +* [MLOP-536] Rename S3 config to Metastore config ([#269](https://github.com/quintoandar/butterfree/pull/269)) + ## [1.1.1](https://github.com/quintoandar/butterfree/releases/tag/1.2.0) ### Added * [MLOP-590] Adapt KafkaConfig to receive a custom topic name ([#266](https://github.com/quintoandar/butterfree/pull/266)) diff --git a/setup.py b/setup.py index 47ba0b989..bf471fecd 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.1.1" +__version__ = "1.1.3.dev0" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 74278986a49f1825beee0fd8df65a585764e5524 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 10 Feb 2021 18:09:38 -0300 Subject: 
[PATCH 08/70] [MLOP-633] Butterfree dev workflow, update documentation (#281) * Update workflow doc. * Update README * Add pre-release. * Fix typo. --- README.md | 2 ++ WORKFLOW.md | 63 ++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index d221d8666..728f7b027 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,8 @@ Or after listing `butterfree` in your `requirements.txt` file: pip install -r requirements.txt ``` +Dev Package are available for testing using the .devN versions of the Butterfree on PyPi. + ## License [Apache License 2.0](https://github.com/quintoandar/butterfree/blob/staging/LICENSE) diff --git a/WORKFLOW.md b/WORKFLOW.md index 601e37932..5eaa18cdd 100644 --- a/WORKFLOW.md +++ b/WORKFLOW.md @@ -2,20 +2,18 @@ ## Features -A feature is based on the `master` branch and merged back into the `master` branch. - -![](https://docs.microsoft.com/en-us/azure/devops/repos/git/media/branching-guidance/featurebranching.png?view=azure-devops) +A feature is based on the `staging` branch and merged back into the `staging` branch. ### Working Locally ``` -# checkout master, fetch the latest changes and pull them from remote into local -git checkout master +# checkout staging, fetch the latest changes and pull them from remote into local +git checkout staging git fetch -git pull origin master +git pull origin staging -# create a feature branch that is based off master +# create a feature branch that is based off staging git checkout -b /some-description # do your work @@ -24,10 +22,10 @@ git commit -m "first commit" git add another git commit -m "second commit" -# rebase against master to pull in any changes that have been made +# rebase against staging to pull in any changes that have been made # since you started your feature branch. git fetch -git rebase origin/master +git rebase origin/staging # push your local changes up to the remote git push @@ -35,41 +33,71 @@ git push # if you've already pushed changes and have rebased, your history has changed # so you will need to force the push git fetch -git rebase origin/master +git rebase origin/staging git push --force-with-lease ```` ### GitHub workflow -- Open a Pull Request against `master`. Check our PR guidelines [here](https://github.com/quintoandar/butterfree/blob/master/CONTRIBUTING.md#pull-request-guideline). +- Open a Pull Request against `staging`. Check our PR guidelines [here](https://github.com/quintoandar/butterfree/blob/master/CONTRIBUTING.md#pull-request-guideline). - When the Pull Request has been approved, merge using `squash and merge`, adding a brief description: ie, ` Enable stream pipelines in Butterfree`. - This squashes all your commits into a single clean commit. Remember to clean detailed descriptions, otherwise our git logs will be a mess. -If you are unable to squash merge because of conflicts, you need to rebase against `master` again: +If you are unable to squash merge because of conflicts, you need to rebase against `staging` again: ``` # in your feature branch git fetch -git rebase origin/master +git rebase origin/staging # fix conflicts if they exist git push --force-with-lease ``` +## Pre-Releases + +The pre-release will always occur when we change the version in the setup.py file to staging branch. 
+ + +### Working Locally + +``` +# create a feature branch +git checkout staging +git fetch +git pull origin staging +git checkout -b pre-release/ + +# finalize the changelog in Unreleased and bump the version into setup.py then: +git add CHANGELOG.md +git add setup.py +git commit -m "pre-release " + +# push the new version +git fetch +git push --force-with-lease +``` + +### Github workflow + +- Open a Pull Request against `staging`. +- When the PR's approved and the code is tested, `squash and merge` to squash your commits into a single commit. +- The creation of the pre-release tag and the update of the PyPi version will be done +automatically from the Publish Dev Package workflow, you can follow [here](https://github.com/quintoandar/butterfree/actions?query=workflow%3A%22Publish+Dev+Package%22). ## Releases -The release will always occur when we change the version in the setup.py file. +The release will always occur when we change the version in the setup.py file to master branch. ### Working Locally ``` # create a feature branch -git checkout master +git checkout staging git fetch -git pull origin master +git pull origin staging git checkout -b release/ # finalize the changelog, bump the version into setup.py and update the documentation then: @@ -121,7 +149,6 @@ git checkout master@ git fetch git pull origin master git checkout -b hotfix/ -git checkout -b describe-the-problem git add patch.fix git add setup.py @@ -133,7 +160,7 @@ Don't forget to update the Changelog and the version in `setup.py`. ### Github workflow -- Open a Pull Request against `hotfix/` +- Open a Pull Request against `master`. - When the PR's approved and the code is tested, `squash and merge` to squash your commits into a single commit. - A tag will automatically be triggered in our CI/CD. This tag/release will use the version for its title and push a new version of Butterfree's python package to our private server. 
From 245eaa594846166972241b03fddc61ee5117b1f7 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Thu, 11 Feb 2021 11:51:04 -0300 Subject: [PATCH 09/70] [MLOP-632] Butterfree dev workflow, automate release description (#279) --- .github/workflows/publish.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index ba94bc7e2..3620cdbbd 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -24,6 +24,10 @@ jobs: - name: Get version run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2)" >> $GITHUB_ENV + - name: Get release notes + id: get_release_notes + uses: ffurrer2/extract-release-notes@v1 + - name: Create release uses: actions/create-release@v1 env: @@ -31,6 +35,7 @@ jobs: with: tag_name: ${{ env.version }} release_name: Release ${{ env.version }} + body: ${{ steps.get_release_notes.outputs.release_notes }} - name: Release already exist if: ${{ failure() }} From d6ecfa425136fab07826c01d0a7ac271f7a37a30 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Thu, 18 Feb 2021 15:14:44 -0300 Subject: [PATCH 10/70] [MLOP-636] Create migration classes (#282) --- butterfree/migrations/__init__.py | 7 +++ butterfree/migrations/cassandra_migration.py | 23 ++++++++ butterfree/migrations/metastore_migration.py | 23 ++++++++ butterfree/migrations/migration.py | 62 ++++++++++++++++++++ 4 files changed, 115 insertions(+) create mode 100644 butterfree/migrations/__init__.py create mode 100644 butterfree/migrations/cassandra_migration.py create mode 100644 butterfree/migrations/metastore_migration.py create mode 100644 butterfree/migrations/migration.py diff --git a/butterfree/migrations/__init__.py b/butterfree/migrations/__init__.py new file mode 100644 index 000000000..5f709bfe3 --- /dev/null +++ b/butterfree/migrations/__init__.py @@ -0,0 +1,7 @@ +"""Holds available migrations.""" + +from butterfree.migrations.cassandra_migration import CassandraMigration +from butterfree.migrations.metastore_migration import MetastoreMigration +from butterfree.migrations.migration import DatabaseMigration + +__all__ = ["DatabaseMigration", "CassandraMigration", "MetastoreMigration"] diff --git a/butterfree/migrations/cassandra_migration.py b/butterfree/migrations/cassandra_migration.py new file mode 100644 index 000000000..e9cecdc7b --- /dev/null +++ b/butterfree/migrations/cassandra_migration.py @@ -0,0 +1,23 @@ +"""Cassandra Migration entity.""" + +from typing import Any, Dict, List + +from butterfree.migrations import DatabaseMigration + + +class CassandraMigration(DatabaseMigration): + """Cassandra class for Migrations.""" + + def create_query( + self, + fs_schema: List[Dict[str, Any]], + db_schema: List[Dict[str, Any]], + table_name: str, + ) -> Any: + """Create a query regarding Cassandra. + + Returns: + Schema object. 
+ + """ + pass diff --git a/butterfree/migrations/metastore_migration.py b/butterfree/migrations/metastore_migration.py new file mode 100644 index 000000000..bb208f2a9 --- /dev/null +++ b/butterfree/migrations/metastore_migration.py @@ -0,0 +1,23 @@ +"""Metastore Migration entity.""" + +from typing import Any, Dict, List + +from butterfree.migrations import DatabaseMigration + + +class MetastoreMigration(DatabaseMigration): + """Metastore class for Migrations.""" + + def create_query( + self, + fs_schema: List[Dict[str, Any]], + db_schema: List[Dict[str, Any]], + table_name: str, + ) -> Any: + """Create a query regarding Metastore. + + Returns: + Schema object. + + """ + pass diff --git a/butterfree/migrations/migration.py b/butterfree/migrations/migration.py new file mode 100644 index 000000000..c53945bf9 --- /dev/null +++ b/butterfree/migrations/migration.py @@ -0,0 +1,62 @@ +"""Migration entity.""" + +from abc import ABC, abstractmethod +from typing import Any, Callable, Dict, List + +from butterfree.pipelines import FeatureSetPipeline + + +class DatabaseMigration(ABC): + """Abstract base class for Migrations.""" + + @abstractmethod + def create_query( + self, + fs_schema: List[Dict[str, Any]], + db_schema: List[Dict[str, Any]], + table_name: str, + ) -> Any: + """Create a query regarding a data source. + + Returns: + The desired query for the given database. + + """ + + def _validate_schema( + self, fs_schema: List[Dict[str, Any]], db_schema: List[Dict[str, Any]] + ) -> Any: + """Provides schema validation for feature sets. + + Compares the schema of your local feature set to the + corresponding table in a given database. + + Args: + fs_schema: object that contains feature set's schemas. + db_schema: object that contains the table og a given db schema. + + """ + + def _get_schema(self, db_client: Callable, table_name: str) -> List[Dict[str, Any]]: + """Get a table schema in the respective database. + + Returns: + Schema object. + """ + pass + + def _apply_migration(self, query: str, db_client: Callable) -> None: + """Apply the migration in the respective database.""" + + def _send_logs_to_s3(self) -> None: + """Send all migration logs to S3.""" + pass + + def run(self, pipelines: List[FeatureSetPipeline]) -> None: + """Runs the migrations. + + Args: + pipelines: the feature set pipelines. + + """ + pass From 32e24d6382a973452dd20611c22358cd8d5976bd Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Fri, 19 Feb 2021 10:18:09 -0300 Subject: [PATCH 11/70] [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278) * Add interval branch modifications. * Add interval_runs notebook. * Add tests. * Apply style (black, flack8 and mypy). * Fix tests. * Change version to create package dev. 
--- .gitignore | 1 + CHANGELOG.md | 6 + butterfree/clients/cassandra_client.py | 41 +- butterfree/clients/spark_client.py | 58 +- butterfree/configs/db/metastore_config.py | 28 + butterfree/configs/environment.py | 4 +- butterfree/constants/window_definitions.py | 16 + butterfree/dataframe_service/__init__.py | 9 +- .../dataframe_service/incremental_strategy.py | 116 + butterfree/dataframe_service/partitioning.py | 25 + butterfree/extract/readers/file_reader.py | 12 +- butterfree/extract/readers/reader.py | 88 +- butterfree/extract/source.py | 24 +- butterfree/hooks/__init__.py | 5 + butterfree/hooks/hook.py | 20 + butterfree/hooks/hookable_component.py | 148 ++ .../hooks/schema_compatibility/__init__.py | 9 + ...ssandra_table_schema_compatibility_hook.py | 58 + .../spark_table_schema_compatibility_hook.py | 46 + butterfree/load/sink.py | 13 +- .../historical_feature_store_writer.py | 113 +- .../writers/online_feature_store_writer.py | 50 +- butterfree/load/writers/writer.py | 21 +- butterfree/pipelines/feature_set_pipeline.py | 56 +- .../transform/aggregated_feature_set.py | 49 +- butterfree/transform/feature_set.py | 38 +- butterfree/transform/utils/window_spec.py | 20 +- examples/interval_runs/interval_runs.ipynb | 2152 +++++++++++++++++ setup.py | 2 +- .../integration/butterfree/load/test_sink.py | 35 +- .../butterfree/pipelines/conftest.py | 202 ++ .../pipelines/test_feature_set_pipeline.py | 311 ++- .../butterfree/transform/conftest.py | 55 + .../transform/test_aggregated_feature_set.py | 50 + .../butterfree/transform/test_feature_set.py | 44 + tests/unit/butterfree/clients/conftest.py | 11 +- .../clients/test_cassandra_client.py | 4 +- .../butterfree/clients/test_spark_client.py | 69 +- .../butterfree/dataframe_service/conftest.py | 14 + .../test_incremental_srategy.py | 70 + .../dataframe_service/test_partitioning.py | 20 + tests/unit/butterfree/extract/conftest.py | 55 + .../extract/readers/test_file_reader.py | 10 +- .../butterfree/extract/readers/test_reader.py | 58 + tests/unit/butterfree/hooks/__init__.py | 0 .../hooks/schema_compatibility/__init__.py | 0 ...ssandra_table_schema_compatibility_hook.py | 49 + ...t_spark_table_schema_compatibility_hook.py | 53 + .../hooks/test_hookable_component.py | 107 + tests/unit/butterfree/load/conftest.py | 25 + tests/unit/butterfree/load/test_sink.py | 34 +- .../test_historical_feature_store_writer.py | 144 +- .../test_online_feature_store_writer.py | 41 +- tests/unit/butterfree/pipelines/conftest.py | 63 + .../pipelines/test_feature_set_pipeline.py | 182 +- tests/unit/butterfree/transform/conftest.py | 82 + .../transform/test_aggregated_feature_set.py | 68 +- .../butterfree/transform/test_feature_set.py | 43 +- 58 files changed, 4738 insertions(+), 389 deletions(-) create mode 100644 butterfree/constants/window_definitions.py create mode 100644 butterfree/dataframe_service/incremental_strategy.py create mode 100644 butterfree/dataframe_service/partitioning.py create mode 100644 butterfree/hooks/__init__.py create mode 100644 butterfree/hooks/hook.py create mode 100644 butterfree/hooks/hookable_component.py create mode 100644 butterfree/hooks/schema_compatibility/__init__.py create mode 100644 butterfree/hooks/schema_compatibility/cassandra_table_schema_compatibility_hook.py create mode 100644 butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py create mode 100644 examples/interval_runs/interval_runs.ipynb create mode 100644 tests/unit/butterfree/dataframe_service/test_incremental_srategy.py create mode 100644 
tests/unit/butterfree/dataframe_service/test_partitioning.py create mode 100644 tests/unit/butterfree/hooks/__init__.py create mode 100644 tests/unit/butterfree/hooks/schema_compatibility/__init__.py create mode 100644 tests/unit/butterfree/hooks/schema_compatibility/test_cassandra_table_schema_compatibility_hook.py create mode 100644 tests/unit/butterfree/hooks/schema_compatibility/test_spark_table_schema_compatibility_hook.py create mode 100644 tests/unit/butterfree/hooks/test_hookable_component.py create mode 100644 tests/unit/butterfree/pipelines/conftest.py diff --git a/.gitignore b/.gitignore index 72b591f39..62434612f 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,7 @@ coverage.xml *.cover .hypothesis/ *cov.xml +test_folder/ # Translations *.mo diff --git a/CHANGELOG.md b/CHANGELOG.md index 729946214..679e98343 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,17 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each ## [Unreleased] ### Added +* [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) + +## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3) +### Added * [MLOP-599] Apply mypy to ButterFree ([#273](https://github.com/quintoandar/butterfree/pull/273)) ### Changed * [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master ([#280](https://github.com/quintoandar/butterfree/pull/280)) * Keep milliseconds when using 'from_ms' argument in timestamp feature ([#284](https://github.com/quintoandar/butterfree/pull/284)) +* [MLOP-633] Butterfree dev workflow, update documentation ([#281](https://github.com/quintoandar/butterfree/commit/74278986a49f1825beee0fd8df65a585764e5524)) +* [MLOP-632] Butterfree dev workflow, automate release description ([#279](https://github.com/quintoandar/butterfree/commit/245eaa594846166972241b03fddc61ee5117b1f7)) ### Fixed * Change trigger for pipeline staging ([#287](https://github.com/quintoandar/butterfree/pull/287)) diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index 1e5416886..938d4e4d6 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -33,33 +33,31 @@ class CassandraClient(AbstractClient): """Cassandra Client. Attributes: - cassandra_user: username to use in connection. - cassandra_password: password to use in connection. - cassandra_key_space: key space used in connection. - cassandra_host: cassandra endpoint used in connection. + user: username to use in connection. + password: password to use in connection. + keyspace: key space used in connection. + host: cassandra endpoint used in connection. 
""" def __init__( self, - cassandra_host: List[str], - cassandra_key_space: str, - cassandra_user: Optional[str] = None, - cassandra_password: Optional[str] = None, + host: List[str], + keyspace: str, + user: Optional[str] = None, + password: Optional[str] = None, ) -> None: - self.cassandra_host = cassandra_host - self.cassandra_key_space = cassandra_key_space - self.cassandra_user = cassandra_user - self.cassandra_password = cassandra_password + self.host = host + self.keyspace = keyspace + self.user = user + self.password = password self._session: Optional[Session] = None @property def conn(self, *, ssl_path: str = None) -> Session: # type: ignore """Establishes a Cassandra connection.""" auth_provider = ( - PlainTextAuthProvider( - username=self.cassandra_user, password=self.cassandra_password - ) - if self.cassandra_user is not None + PlainTextAuthProvider(username=self.user, password=self.password) + if self.user is not None else None ) ssl_opts = ( @@ -73,12 +71,12 @@ def conn(self, *, ssl_path: str = None) -> Session: # type: ignore ) cluster = Cluster( - contact_points=self.cassandra_host, + contact_points=self.host, auth_provider=auth_provider, ssl_options=ssl_opts, load_balancing_policy=RoundRobinPolicy(), ) - self._session = cluster.connect(self.cassandra_key_space) + self._session = cluster.connect(self.keyspace) self._session.row_factory = dict_factory return self._session @@ -106,7 +104,7 @@ def get_schema(self, table: str) -> List[Dict[str, str]]: """ query = ( f"SELECT column_name, type FROM system_schema.columns " # noqa - f"WHERE keyspace_name = '{self.cassandra_key_space}' " # noqa + f"WHERE keyspace_name = '{self.keyspace}' " # noqa f" AND table_name = '{table}';" # noqa ) @@ -114,8 +112,7 @@ def get_schema(self, table: str) -> List[Dict[str, str]]: if not response: raise RuntimeError( - f"No columns found for table: {table}" - f"in key space: {self.cassandra_key_space}" + f"No columns found for table: {table}" f"in key space: {self.keyspace}" ) return response @@ -143,7 +140,7 @@ def _get_create_table_query( else: columns_str = joined_parsed_columns - query = f"CREATE TABLE {self.cassandra_key_space}.{table} " f"({columns_str}); " + query = f"CREATE TABLE {self.keyspace}.{table} " f"({columns_str}); " return query diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index 0a8c717c5..0f0113e21 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -34,9 +34,10 @@ def conn(self) -> SparkSession: def read( self, format: str, - options: Dict[str, Any], + path: Optional[Union[str, List[str]]] = None, schema: Optional[StructType] = None, stream: bool = False, + **options: Any, ) -> DataFrame: """Use the SparkSession.read interface to load data into a dataframe. @@ -45,9 +46,10 @@ def read( Args: format: string with the format to be used by the DataframeReader. - options: options to setup the DataframeReader. + path: optional string or a list of string for file-system. stream: flag to indicate if data must be read in stream mode. schema: an optional pyspark.sql.types.StructType for the input schema. + options: options to setup the DataframeReader. 
Returns: Dataframe @@ -55,14 +57,16 @@ def read( """ if not isinstance(format, str): raise ValueError("format needs to be a string with the desired read format") - if not isinstance(options, dict): - raise ValueError("options needs to be a dict with the setup configurations") + if not isinstance(path, (str, list)): + raise ValueError("path needs to be a string or a list of string") df_reader: Union[ DataStreamReader, DataFrameReader ] = self.conn.readStream if stream else self.conn.read + df_reader = df_reader.schema(schema) if schema else df_reader - return df_reader.format(format).options(**options).load() + + return df_reader.format(format).load(path, **options) # type: ignore def read_table(self, table: str, database: str = None) -> DataFrame: """Use the SparkSession.read interface to read a metastore table. @@ -223,3 +227,47 @@ def create_temporary_view(self, dataframe: DataFrame, name: str) -> Any: if not dataframe.isStreaming: return dataframe.createOrReplaceTempView(name) return dataframe.writeStream.format("memory").queryName(name).start() + + def add_table_partitions( + self, partitions: List[Dict[str, Any]], table: str, database: str = None + ) -> None: + """Add partitions to an existing table. + + Args: + partitions: partitions to add to the table. + It's expected a list of partition dicts to add to the table. + Example: `[{"year": 2020, "month": 8, "day": 14}, ...]` + table: table to add the partitions. + database: name of the database where the table is saved. + """ + for partition_dict in partitions: + if not all( + ( + isinstance(key, str) + and (isinstance(value, str) or isinstance(value, int)) + ) + for key, value in partition_dict.items() + ): + raise ValueError( + "Partition keys must be column names " + "and values must be string or int." + ) + + database_expr = f"`{database}`." if database else "" + key_values_expr = [ + ", ".join( + [ + "{} = {}".format(k, v) + if not isinstance(v, str) + else "{} = '{}'".format(k, v) + for k, v in partition.items() + ] + ) + for partition in partitions + ] + partitions_expr = " ".join(f"PARTITION ( {expr} )" for expr in key_values_expr) + command = ( + f"ALTER TABLE {database_expr}`{table}` ADD IF NOT EXISTS {partitions_expr}" + ) + + self.conn.sql(command) diff --git a/butterfree/configs/db/metastore_config.py b/butterfree/configs/db/metastore_config.py index d94b792c8..a3b315d55 100644 --- a/butterfree/configs/db/metastore_config.py +++ b/butterfree/configs/db/metastore_config.py @@ -3,8 +3,11 @@ import os from typing import Any, Dict, List, Optional +from pyspark.sql import DataFrame + from butterfree.configs import environment from butterfree.configs.db import AbstractWriteConfig +from butterfree.dataframe_service import extract_partition_values class MetastoreConfig(AbstractWriteConfig): @@ -87,6 +90,31 @@ def get_options(self, key: str) -> Dict[Optional[str], Optional[str]]: "path": os.path.join(f"{self.file_system}://{self.path}/", key), } + def get_path_with_partitions(self, key: str, dataframe: DataFrame) -> List: + """Get options for AWS S3 from partitioned parquet file. + + Options will be a dictionary with the write and read configuration for + Spark to AWS S3. + + Args: + key: path to save data into AWS S3 bucket. + dataframe: spark dataframe containing data from a feature set. + + Returns: + A list of string for file-system backed data sources. 
+ """ + path_list = [] + dataframe_values = extract_partition_values( + dataframe, partition_columns=["year", "month", "day"] + ) + for row in dataframe_values: + path_list.append( + f"{self.file_system}://{self.path}/{key}/year={row['year']}/" + f"month={row['month']}/day={row['day']}" + ) + + return path_list + def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Translate feature set spark schema to the corresponding database.""" pass diff --git a/butterfree/configs/environment.py b/butterfree/configs/environment.py index 6f5accbc5..f98a7a01b 100644 --- a/butterfree/configs/environment.py +++ b/butterfree/configs/environment.py @@ -35,8 +35,8 @@ def get_variable(variable_name: str, default_value: str = None) -> Optional[str] """Gets an environment variable. The variable comes from it's explicitly declared value in the running - environment or from the default value declared in the environment.yaml - specification or from the default_value. + environment or from the default value declared in specification or from the + default_value. Args: variable_name: environment variable name. diff --git a/butterfree/constants/window_definitions.py b/butterfree/constants/window_definitions.py new file mode 100644 index 000000000..560904f75 --- /dev/null +++ b/butterfree/constants/window_definitions.py @@ -0,0 +1,16 @@ +"""Allowed windows units and lengths in seconds.""" + +ALLOWED_WINDOWS = { + "second": 1, + "seconds": 1, + "minute": 60, + "minutes": 60, + "hour": 3600, + "hours": 3600, + "day": 86400, + "days": 86400, + "week": 604800, + "weeks": 604800, + "year": 29030400, + "years": 29030400, +} diff --git a/butterfree/dataframe_service/__init__.py b/butterfree/dataframe_service/__init__.py index 5116261d6..c227dae24 100644 --- a/butterfree/dataframe_service/__init__.py +++ b/butterfree/dataframe_service/__init__.py @@ -1,4 +1,11 @@ """Dataframe optimization components regarding Butterfree.""" +from butterfree.dataframe_service.incremental_strategy import IncrementalStrategy +from butterfree.dataframe_service.partitioning import extract_partition_values from butterfree.dataframe_service.repartition import repartition_df, repartition_sort_df -__all__ = ["repartition_df", "repartition_sort_df"] +__all__ = [ + "extract_partition_values", + "IncrementalStrategy", + "repartition_df", + "repartition_sort_df", +] diff --git a/butterfree/dataframe_service/incremental_strategy.py b/butterfree/dataframe_service/incremental_strategy.py new file mode 100644 index 000000000..6554d3b77 --- /dev/null +++ b/butterfree/dataframe_service/incremental_strategy.py @@ -0,0 +1,116 @@ +"""IncrementalStrategy entity.""" + +from __future__ import annotations + +from pyspark.sql import DataFrame + + +class IncrementalStrategy: + """Define an incremental strategy to be used on data sources. + + Entity responsible for defining a column expression that will be used to + filter the original data source. The purpose is to get only the data related + to a specific pipeline execution time interval. + + Attributes: + column: column expression on which incremental filter will be applied. + The expression need to result on a date or timestamp format, so the + filter can properly work with the defined upper and lower bounds. + """ + + def __init__(self, column: str = None): + self.column = column + + def from_milliseconds(self, column_name: str) -> IncrementalStrategy: + """Create a column expression from ts column defined as milliseconds. + + Args: + column_name: column name where the filter will be applied. 
+ + Returns: + `IncrementalStrategy` with the defined column expression. + """ + return IncrementalStrategy(column=f"from_unixtime({column_name}/ 1000.0)") + + def from_string(self, column_name: str, mask: str = None) -> IncrementalStrategy: + """Create a column expression from ts column defined as a simple string. + + Args: + column_name: column name where the filter will be applied. + mask: mask defining the date/timestamp format on the string. + + Returns: + `IncrementalStrategy` with the defined column expression. + """ + return IncrementalStrategy(column=f"to_date({column_name}, '{mask}')") + + def from_year_month_day_partitions( + self, + year_column: str = "year", + month_column: str = "month", + day_column: str = "day", + ) -> IncrementalStrategy: + """Create a column expression from year, month and day partitions. + + Args: + year_column: column name from the year partition. + month_column: column name from the month partition. + day_column: column name from the day partition. + + Returns: + `IncrementalStrategy` with the defined column expression. + """ + return IncrementalStrategy( + column=f"concat(string({year_column}), " + f"'-', string({month_column}), " + f"'-', string({day_column}))" + ) + + def get_expression(self, start_date: str = None, end_date: str = None) -> str: + """Get the incremental filter expression using the defined dates. + + Both arguments can be set to defined a specific date interval, but it's + only necessary to set one of the arguments for this method to work. + + Args: + start_date: date lower bound to use in the filter. + end_date: date upper bound to use in the filter. + + Returns: + Filter expression based on defined column and bounds. + + Raises: + ValuerError: If both arguments, start_date and end_date, are None. + ValueError: If the column expression was not defined. + """ + if not self.column: + raise ValueError("column parameter can't be None") + if not (start_date or end_date): + raise ValueError("Both arguments start_date and end_date can't be None.") + if start_date: + expression = f"date({self.column}) >= date('{start_date}')" + if end_date: + expression += f" and date({self.column}) <= date('{end_date}')" + return expression + return f"date({self.column}) <= date('{end_date}')" + + def filter_with_incremental_strategy( + self, dataframe: DataFrame, start_date: str = None, end_date: str = None + ) -> DataFrame: + """Filters the dataframe according to the date boundaries. + + Args: + dataframe: dataframe that will be filtered. + start_date: date lower bound to use in the filter. + end_date: date upper bound to use in the filter. + + Returns: + Filtered dataframe based on defined time boundaries. + """ + return ( + dataframe.where( + self.get_expression(start_date=start_date, end_date=end_date) + ) + if start_date or end_date + else dataframe + ) diff --git a/butterfree/dataframe_service/partitioning.py b/butterfree/dataframe_service/partitioning.py new file mode 100644 index 000000000..21e9b0ab7 --- /dev/null +++ b/butterfree/dataframe_service/partitioning.py @@ -0,0 +1,25 @@ +"""Module defining partitioning methods.""" + +from typing import Any, Dict, List + +from pyspark.sql import DataFrame + + +def extract_partition_values( + dataframe: DataFrame, partition_columns: List[str] +) -> List[Dict[str, Any]]: + """Extract distinct partition values from a given dataframe. + + Args: + dataframe: dataframe from where to extract partition values. + partition_columns: name of partition columns presented on the dataframe. 
+ + Returns: + distinct partition values. + """ + return ( + dataframe.select(*partition_columns) + .distinct() + .rdd.map(lambda row: row.asDict(True)) + .collect() + ) diff --git a/butterfree/extract/readers/file_reader.py b/butterfree/extract/readers/file_reader.py index 17f68f1cb..8cf155998 100644 --- a/butterfree/extract/readers/file_reader.py +++ b/butterfree/extract/readers/file_reader.py @@ -87,9 +87,7 @@ def __init__( self.path = path self.format = format self.schema = schema - self.options = dict( - {"path": self.path}, **format_options if format_options else {} - ) + self.options = dict(format_options if format_options else {}) self.stream = stream def consume(self, client: SparkClient) -> DataFrame: @@ -106,11 +104,15 @@ def consume(self, client: SparkClient) -> DataFrame: """ schema = ( - client.read(format=self.format, options=self.options,).schema + client.read(format=self.format, path=self.path, **self.options).schema if (self.stream and not self.schema) else self.schema ) return client.read( - format=self.format, options=self.options, schema=schema, stream=self.stream, + format=self.format, + schema=schema, + stream=self.stream, + path=self.path, + **self.options, ) diff --git a/butterfree/extract/readers/reader.py b/butterfree/extract/readers/reader.py index 78be28232..597c870ff 100644 --- a/butterfree/extract/readers/reader.py +++ b/butterfree/extract/readers/reader.py @@ -2,14 +2,16 @@ from abc import ABC, abstractmethod from functools import reduce -from typing import Any, Callable, Dict, List +from typing import Any, Callable, Dict, List, Optional from pyspark.sql import DataFrame from butterfree.clients import SparkClient +from butterfree.dataframe_service import IncrementalStrategy +from butterfree.hooks import HookableComponent -class Reader(ABC): +class Reader(ABC, HookableComponent): """Abstract base class for Readers. Attributes: @@ -19,9 +21,11 @@ class Reader(ABC): """ - def __init__(self, id: str): + def __init__(self, id: str, incremental_strategy: IncrementalStrategy = None): + super().__init__() self.id = id self.transformations: List[Dict[str, Any]] = [] + self.incremental_strategy = incremental_strategy def with_( self, transformer: Callable[..., DataFrame], *args: Any, **kwargs: Any @@ -48,14 +52,19 @@ def with_( self.transformations.append(new_transformation) return self - def _apply_transformations(self, df: DataFrame) -> Any: - return reduce( - lambda result_df, transformation: transformation["transformer"]( - result_df, *transformation["args"], **transformation["kwargs"] - ), - self.transformations, - df, - ) + def with_incremental_strategy( + self, incremental_strategy: IncrementalStrategy + ) -> "Reader": + """Define the incremental strategy for the Reader. + + Args: + incremental_strategy: definition of the incremental strategy. + + Returns: + Reader with defined incremental strategy. + """ + self.incremental_strategy = incremental_strategy + return self @abstractmethod def consume(self, client: SparkClient) -> DataFrame: @@ -70,24 +79,61 @@ def consume(self, client: SparkClient) -> DataFrame: :return: Spark dataframe """ - def build(self, client: SparkClient, columns: List[Any] = None) -> None: + def build( + self, + client: SparkClient, + columns: List[Any] = None, + start_date: str = None, + end_date: str = None, + ) -> None: """Register the data got from the reader in the Spark metastore. 
Create a temporary view in Spark metastore referencing the data extracted from the target origin after the application of all the defined pre-processing transformations. + The arguments start_date and end_date are going to be use only when there + is a defined `IncrementalStrategy` on the `Reader`. + Args: client: client responsible for connecting to Spark session. - columns: list of tuples for renaming/filtering the dataset. + columns: list of tuples for selecting/renaming columns on the df. + start_date: lower bound to use in the filter expression. + end_date: upper bound to use in the filter expression. """ - transformed_df = self._apply_transformations(self.consume(client)) - - if columns: - select_expression = [] - for old_expression, new_column_name in columns: - select_expression.append(f"{old_expression} as {new_column_name}") - transformed_df = transformed_df.selectExpr(*select_expression) + column_selection_df = self._select_columns(columns, client) + transformed_df = self._apply_transformations(column_selection_df) + + if self.incremental_strategy: + transformed_df = self.incremental_strategy.filter_with_incremental_strategy( + transformed_df, start_date, end_date + ) + + post_hook_df = self.run_post_hooks(transformed_df) + + post_hook_df.createOrReplaceTempView(self.id) + + def _select_columns( + self, columns: Optional[List[Any]], client: SparkClient + ) -> DataFrame: + df = self.consume(client) + return df.selectExpr( + *( + [ + f"{old_expression} as {new_column_name}" + for old_expression, new_column_name in columns + ] + if columns + else df.columns + ) + ) - transformed_df.createOrReplaceTempView(self.id) + def _apply_transformations(self, df: DataFrame) -> DataFrame: + return reduce( + lambda result_df, transformation: transformation["transformer"]( + result_df, *transformation["args"], **transformation["kwargs"] + ), + self.transformations, + df, + ) diff --git a/butterfree/extract/source.py b/butterfree/extract/source.py index 00ac9e43f..6d905c6b5 100644 --- a/butterfree/extract/source.py +++ b/butterfree/extract/source.py @@ -6,9 +6,10 @@ from butterfree.clients import SparkClient from butterfree.extract.readers.reader import Reader +from butterfree.hooks import HookableComponent -class Source: +class Source(HookableComponent): """The definition of the the entry point data for the ETL pipeline. A FeatureSet (the next step in the pipeline) expects a single dataframe as @@ -51,31 +52,44 @@ class Source: """ def __init__(self, readers: List[Reader], query: str) -> None: + super().__init__() + self.enable_pre_hooks = False self.readers = readers self.query = query - def construct(self, client: SparkClient) -> DataFrame: + def construct( + self, client: SparkClient, start_date: str = None, end_date: str = None + ) -> DataFrame: """Construct an entry point dataframe for a feature set. This method will assemble multiple readers, by building each one and - querying them using a Spark SQL. + querying them using a Spark SQL. It's important to highlight that in + order to filter a dataframe regarding date boundaries, it's important + to define a IncrementalStrategy, otherwise your data will not be filtered. + Besides, both start and end dates parameters are optional. After that, there's the caching of the dataframe, however since cache() in Spark is lazy, an action is triggered in order to force persistence. Args: client: client responsible for connecting to Spark session. + start_date: user defined start date for filtering. + end_date: user defined end date for filtering. 
Returns: DataFrame with the query result against all readers. """ for reader in self.readers: - reader.build(client) # create temporary views for each reader + reader.build( + client=client, start_date=start_date, end_date=end_date + ) # create temporary views for each reader dataframe = client.sql(self.query) if not dataframe.isStreaming: dataframe.cache().count() - return dataframe + post_hook_df = self.run_post_hooks(dataframe) + + return post_hook_df diff --git a/butterfree/hooks/__init__.py b/butterfree/hooks/__init__.py new file mode 100644 index 000000000..90bedeb26 --- /dev/null +++ b/butterfree/hooks/__init__.py @@ -0,0 +1,5 @@ +"""Holds Hooks definitions.""" +from butterfree.hooks.hook import Hook +from butterfree.hooks.hookable_component import HookableComponent + +__all__ = ["Hook", "HookableComponent"] diff --git a/butterfree/hooks/hook.py b/butterfree/hooks/hook.py new file mode 100644 index 000000000..f7d8c562f --- /dev/null +++ b/butterfree/hooks/hook.py @@ -0,0 +1,20 @@ +"""Hook abstract class entity.""" + +from abc import ABC, abstractmethod + +from pyspark.sql import DataFrame + + +class Hook(ABC): + """Definition of a hook function to call on a Dataframe.""" + + @abstractmethod + def run(self, dataframe: DataFrame) -> DataFrame: + """Run interface for Hook. + + Args: + dataframe: dataframe to use in the Hook. + + Returns: + dataframe result from the Hook. + """ diff --git a/butterfree/hooks/hookable_component.py b/butterfree/hooks/hookable_component.py new file mode 100644 index 000000000..d89babcea --- /dev/null +++ b/butterfree/hooks/hookable_component.py @@ -0,0 +1,148 @@ +"""Definition of hookable component.""" + +from __future__ import annotations + +from typing import List + +from pyspark.sql import DataFrame + +from butterfree.hooks.hook import Hook + + +class HookableComponent: + """Defines a component with the ability to hold pre and post hook functions. + + All main module of Butterfree have a common object that enables their integration: + dataframes. Spark's dataframe is the glue that enables the transmission of data + between the main modules. Hooks have a simple interface, they are functions that + accepts a dataframe and outputs a dataframe. These Hooks can be triggered before or + after the main execution of a component. + + Components from Butterfree that inherit HookableComponent entity, are components + that can define a series of steps to occur before or after the execution of their + main functionality. + + Attributes: + pre_hooks: function steps to trigger before component main functionality. + post_hooks: function steps to trigger after component main functionality. + enable_pre_hooks: property to indicate if the component can define pre_hooks. + enable_post_hooks: property to indicate if the component can define post_hooks. + """ + + def __init__(self) -> None: + self.pre_hooks = [] + self.post_hooks = [] + self.enable_pre_hooks = True + self.enable_post_hooks = True + + @property + def pre_hooks(self) -> List[Hook]: + """Function steps to trigger before component main functionality.""" + return self.__pre_hook + + @pre_hooks.setter + def pre_hooks(self, value: List[Hook]) -> None: + if not isinstance(value, list): + raise ValueError("pre_hooks should be a list of Hooks.") + if not all(isinstance(item, Hook) for item in value): + raise ValueError( + "All items on pre_hooks list should be an instance of Hook." 
+ ) + self.__pre_hook = value + + @property + def post_hooks(self) -> List[Hook]: + """Function steps to trigger after component main functionality.""" + return self.__post_hook + + @post_hooks.setter + def post_hooks(self, value: List[Hook]) -> None: + if not isinstance(value, list): + raise ValueError("post_hooks should be a list of Hooks.") + if not all(isinstance(item, Hook) for item in value): + raise ValueError( + "All items on post_hooks list should be an instance of Hook." + ) + self.__post_hook = value + + @property + def enable_pre_hooks(self) -> bool: + """Property to indicate if the component can define pre_hooks.""" + return self.__enable_pre_hooks + + @enable_pre_hooks.setter + def enable_pre_hooks(self, value: bool) -> None: + if not isinstance(value, bool): + raise ValueError("enable_pre_hooks accepts only boolean values.") + self.__enable_pre_hooks = value + + @property + def enable_post_hooks(self) -> bool: + """Property to indicate if the component can define post_hooks.""" + return self.__enable_post_hooks + + @enable_post_hooks.setter + def enable_post_hooks(self, value: bool) -> None: + if not isinstance(value, bool): + raise ValueError("enable_post_hooks accepts only boolean values.") + self.__enable_post_hooks = value + + def add_pre_hook(self, *hooks: Hook) -> HookableComponent: + """Add a pre-hook steps to the component. + + Args: + hooks: Hook steps to add to pre_hook list. + + Returns: + Component with the Hook inserted in pre_hook list. + + Raises: + ValueError: if the component does not accept pre-hooks. + """ + if not self.enable_pre_hooks: + raise ValueError("This component does not enable adding pre-hooks") + self.pre_hooks += list(hooks) + return self + + def add_post_hook(self, *hooks: Hook) -> HookableComponent: + """Add a post-hook steps to the component. + + Args: + hooks: Hook steps to add to post_hook list. + + Returns: + Component with the Hook inserted in post_hook list. + + Raises: + ValueError: if the component does not accept post-hooks. + """ + if not self.enable_post_hooks: + raise ValueError("This component does not enable adding post-hooks") + self.post_hooks += list(hooks) + return self + + def run_pre_hooks(self, dataframe: DataFrame) -> DataFrame: + """Run all defined pre-hook steps from a given dataframe. + + Args: + dataframe: data to input in the defined pre-hook steps. + + Returns: + dataframe after passing for all defined pre-hooks. + """ + for hook in self.pre_hooks: + dataframe = hook.run(dataframe) + return dataframe + + def run_post_hooks(self, dataframe: DataFrame) -> DataFrame: + """Run all defined post-hook steps from a given dataframe. + + Args: + dataframe: data to input in the defined post-hook steps. + + Returns: + dataframe after passing for all defined post-hooks. 
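
A short, self-contained sketch of how registration and execution compose on any component that inherits HookableComponent (the hook below is made up for the example):

from pyspark.sql import DataFrame

from butterfree.hooks import Hook, HookableComponent


class UppercaseColumnsHook(Hook):
    """Illustrative hook: renames every column to upper case."""

    def run(self, dataframe: DataFrame) -> DataFrame:
        return dataframe.toDF(*[column.upper() for column in dataframe.columns])


component = HookableComponent()

# add_pre_hook and add_post_hook accept any number of Hook instances and
# return the component itself, so registration calls can be chained.
component.add_pre_hook(UppercaseColumnsHook()).add_post_hook(UppercaseColumnsHook())

# run_pre_hooks applies the registered hooks in insertion order, e.g.:
# result_df = component.run_pre_hooks(input_df)
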
+ """ + for hook in self.post_hooks: + dataframe = hook.run(dataframe) + return dataframe diff --git a/butterfree/hooks/schema_compatibility/__init__.py b/butterfree/hooks/schema_compatibility/__init__.py new file mode 100644 index 000000000..edf748bf8 --- /dev/null +++ b/butterfree/hooks/schema_compatibility/__init__.py @@ -0,0 +1,9 @@ +"""Holds Schema Compatibility Hooks definitions.""" +from butterfree.hooks.schema_compatibility.cassandra_table_schema_compatibility_hook import ( # noqa + CassandraTableSchemaCompatibilityHook, +) +from butterfree.hooks.schema_compatibility.spark_table_schema_compatibility_hook import ( # noqa + SparkTableSchemaCompatibilityHook, +) + +__all__ = ["SparkTableSchemaCompatibilityHook", "CassandraTableSchemaCompatibilityHook"] diff --git a/butterfree/hooks/schema_compatibility/cassandra_table_schema_compatibility_hook.py b/butterfree/hooks/schema_compatibility/cassandra_table_schema_compatibility_hook.py new file mode 100644 index 000000000..cdb40472b --- /dev/null +++ b/butterfree/hooks/schema_compatibility/cassandra_table_schema_compatibility_hook.py @@ -0,0 +1,58 @@ +"""Cassandra table schema compatibility Hook definition.""" + +from pyspark.sql import DataFrame + +from butterfree.clients import CassandraClient +from butterfree.constants import DataType +from butterfree.hooks.hook import Hook + + +class CassandraTableSchemaCompatibilityHook(Hook): + """Hook to verify the schema compatibility with a Cassandra's table. + + Verifies if all columns presented on the dataframe exists and are the same + type on the target Cassandra's table. + + Attributes: + cassandra_client: client to connect to Cassandra DB. + table: table name. + """ + + def __init__(self, cassandra_client: CassandraClient, table: str): + self.cassandra_client = cassandra_client + self.table = table + + def run(self, dataframe: DataFrame) -> DataFrame: + """Check the schema compatibility from a given Dataframe. + + This method does not change anything on the Dataframe. + + Args: + dataframe: dataframe to verify schema compatibility. + + Returns: + unchanged dataframe. + + Raises: + ValueError if the schemas are incompatible. + """ + table_schema = self.cassandra_client.get_schema(self.table) + type_cassandra = [ + type.cassandra + for field_id in range(len(dataframe.schema.fieldNames())) + for type in DataType + if dataframe.schema.fields.__getitem__(field_id).dataType == type.spark + ] + schema = [ + {"column_name": f"{column}", "type": f"{type}"} + for column, type in zip(dataframe.columns, type_cassandra) + ] + + if not all([column in table_schema for column in schema]): + raise ValueError( + "There's a schema incompatibility " + "between the defined dataframe and the Cassandra table.\n" + f"Dataframe schema = {schema}" + f"Target table schema = {table_schema}" + ) + return dataframe diff --git a/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py b/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py new file mode 100644 index 000000000..b08dd56aa --- /dev/null +++ b/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py @@ -0,0 +1,46 @@ +"""Spark table schema compatibility Hook definition.""" + +from pyspark.sql import DataFrame + +from butterfree.clients import SparkClient +from butterfree.hooks.hook import Hook + + +class SparkTableSchemaCompatibilityHook(Hook): + """Hook to verify the schema compatibility with a Spark's table. 
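
To make the Cassandra hook's comparison concrete: it builds a list of {column_name, type} dicts from the dataframe and checks that every entry is also present in the schema returned by the client. A plain-Python sketch of that membership test, with made-up values:

table_schema = [
    {"column_name": "id", "type": "bigint"},
    {"column_name": "rent", "type": "double"},
]
dataframe_schema = [
    {"column_name": "id", "type": "bigint"},
    {"column_name": "rent", "type": "text"},  # type mismatch
]

compatible = all(column in table_schema for column in dataframe_schema)
print(compatible)  # False, so the hook would raise ValueError
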
+ + Verifies if all columns presented on the dataframe exists and are the same + type on the target Spark's table. + + Attributes: + spark_client: client to connect to Spark's metastore. + table: table name. + database: database name. + """ + + def __init__(self, spark_client: SparkClient, table: str, database: str = None): + self.spark_client = spark_client + self.table_expression = (f"`{database}`." if database else "") + f"`{table}`" + + def run(self, dataframe: DataFrame) -> DataFrame: + """Check the schema compatibility from a given Dataframe. + + This method does not change anything on the Dataframe. + + Args: + dataframe: dataframe to verify schema compatibility. + + Returns: + unchanged dataframe. + + Raises: + ValueError if the schemas are incompatible. + """ + table_schema = self.spark_client.conn.table(self.table_expression).schema + if not all([column in table_schema for column in dataframe.schema]): + raise ValueError( + "The dataframe has a schema incompatible with the defined table.\n" + f"Dataframe schema = {dataframe.schema}" + f"Target table schema = {table_schema}" + ) + return dataframe diff --git a/butterfree/load/sink.py b/butterfree/load/sink.py index b4bf93e8c..0b0c10c9e 100644 --- a/butterfree/load/sink.py +++ b/butterfree/load/sink.py @@ -5,13 +5,14 @@ from pyspark.sql.streaming import StreamingQuery from butterfree.clients import SparkClient +from butterfree.hooks import HookableComponent from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet from butterfree.validations import BasicValidation from butterfree.validations.validation import Validation -class Sink: +class Sink(HookableComponent): """Define the destinations for the feature set pipeline. A Sink is created from a set of writers. The main goal of the Sink is to @@ -26,6 +27,8 @@ class Sink: """ def __init__(self, writers: List[Writer], validation: Optional[Validation] = None): + super().__init__() + self.enable_post_hooks = False self.writers = writers self.validation = validation @@ -94,12 +97,16 @@ def flush( Streaming handlers for each defined writer, if writing streaming dfs. 
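
Because Sink now inherits HookableComponent (with post-hooks disabled), a schema check can be attached as a pre-hook so it runs on the dataframe before validation and writing. A hedged sketch, assuming a metastore table feature_store.house_listings already exists:

from butterfree.clients import SparkClient
from butterfree.hooks.schema_compatibility import SparkTableSchemaCompatibilityHook
from butterfree.load import Sink
from butterfree.load.writers import HistoricalFeatureStoreWriter

spark_client = SparkClient()
sink = Sink(writers=[HistoricalFeatureStoreWriter(debug_mode=True)])

# The hook only validates; when the schemas are compatible it returns the
# dataframe unchanged.
sink.add_pre_hook(
    SparkTableSchemaCompatibilityHook(
        spark_client, table="house_listings", database="feature_store"
    )
)
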
""" + pre_hook_df = self.run_pre_hooks(dataframe) + if self.validation is not None: - self.validation.input(dataframe).check() + self.validation.input(pre_hook_df).check() handlers = [ writer.write( - feature_set=feature_set, dataframe=dataframe, spark_client=spark_client + feature_set=feature_set, + dataframe=pre_hook_df, + spark_client=spark_client, ) for writer in self.writers ] diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index d70f68f0b..456d9e6bd 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -1,7 +1,7 @@ """Holds the Historical Feature Store writer class.""" import os -from typing import Union +from typing import Any, Union from pyspark.sql.dataframe import DataFrame from pyspark.sql.functions import dayofmonth, month, year @@ -12,6 +12,8 @@ from butterfree.constants import columns from butterfree.constants.spark_constants import DEFAULT_NUM_PARTITIONS from butterfree.dataframe_service import repartition_df +from butterfree.hooks import Hook +from butterfree.hooks.schema_compatibility import SparkTableSchemaCompatibilityHook from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet @@ -60,6 +62,20 @@ class HistoricalFeatureStoreWriter(Writer): For what settings you can use on S3Config and default settings, to read S3Config class. + We can write with interval mode, where HistoricalFeatureStoreWrite + will need to use Dynamic Partition Inserts, + the behaviour of OVERWRITE keyword is controlled by + spark.sql.sources.partitionOverwriteMode configuration property. + The dynamic overwrite mode is enabled Spark will only delete the + partitions for which it has data to be written to. + All the other partitions remain intact. + + >>> spark_client = SparkClient() + >>> writer = HistoricalFeatureStoreWriter(interval_mode=True) + >>> writer.write(feature_set=feature_set, + ... dataframe=dataframe, + ... spark_client=spark_client) + We can instantiate HistoricalFeatureStoreWriter class to validate the df to be written. 
@@ -95,15 +111,17 @@ def __init__( num_partitions: int = None, validation_threshold: float = DEFAULT_VALIDATION_THRESHOLD, debug_mode: bool = False, + interval_mode: bool = False, + check_schema_hook: Hook = None, ): - super(HistoricalFeatureStoreWriter, self).__init__() + super(HistoricalFeatureStoreWriter, self).__init__(debug_mode, interval_mode) self.db_config = db_config or MetastoreConfig() self.database = database or environment.get_variable( "FEATURE_STORE_HISTORICAL_DATABASE" ) self.num_partitions = num_partitions or DEFAULT_NUM_PARTITIONS self.validation_threshold = validation_threshold - self.debug_mode = debug_mode + self.check_schema_hook = check_schema_hook def write( self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient, @@ -122,7 +140,25 @@ def write( """ dataframe = self._create_partitions(dataframe) - dataframe = self._apply_transformations(dataframe) + partition_df = self._apply_transformations(dataframe) + + if self.debug_mode: + dataframe = partition_df + else: + dataframe = self.check_schema( + spark_client, partition_df, feature_set.name, self.database + ) + + if self.interval_mode: + if self.debug_mode: + spark_client.create_temporary_view( + dataframe=dataframe, + name=f"historical_feature_store__{feature_set.name}", + ) + return + + self._incremental_mode(feature_set, dataframe, spark_client) + return if self.debug_mode: spark_client.create_temporary_view( @@ -132,6 +168,7 @@ def write( return s3_key = os.path.join("historical", feature_set.entity, feature_set.name) + spark_client.write_table( dataframe=dataframe, database=self.database, @@ -140,6 +177,34 @@ def write( **self.db_config.get_options(s3_key), ) + def _incremental_mode( + self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient + ) -> None: + + partition_overwrite_mode = spark_client.conn.conf.get( + "spark.sql.sources.partitionOverwriteMode" + ).lower() + + if partition_overwrite_mode != "dynamic": + raise RuntimeError( + "m=load_incremental_table, " + "spark.sql.sources.partitionOverwriteMode={}, " + "msg=partitionOverwriteMode have to be configured to 'dynamic'".format( + partition_overwrite_mode + ) + ) + + s3_key = os.path.join("historical", feature_set.entity, feature_set.name) + options = {"path": self.db_config.get_options(s3_key).get("path")} + + spark_client.write_dataframe( + dataframe=dataframe, + format_=self.db_config.format_, + mode=self.db_config.mode, + **options, + partitionBy=self.PARTITION_BY, + ) + def _assert_validation_count( self, table_name: str, written_count: int, dataframe_count: int ) -> None: @@ -169,12 +234,26 @@ def validate( """ table_name = ( - f"{self.database}.{feature_set.name}" - if not self.debug_mode - else f"historical_feature_store__{feature_set.name}" + f"{feature_set.name}" + if self.interval_mode and not self.debug_mode + else ( + f"{self.database}.{feature_set.name}" + if not self.debug_mode + else f"historical_feature_store__{feature_set.name}" + ) + ) + + written_count = ( + spark_client.read( + self.db_config.format_, + path=self.db_config.get_path_with_partitions(table_name, dataframe), + ).count() + if self.interval_mode and not self.debug_mode + else spark_client.read_table(table_name).count() ) - written_count = spark_client.read_table(table_name).count() + dataframe_count = dataframe.count() + self._assert_validation_count(table_name, written_count, dataframe_count) def _create_partitions(self, dataframe: DataFrame) -> DataFrame: @@ -191,3 +270,21 @@ def _create_partitions(self, dataframe: DataFrame) 
-> DataFrame: columns.PARTITION_DAY, dayofmonth(dataframe[columns.TIMESTAMP_COLUMN]) ) return repartition_df(dataframe, self.PARTITION_BY, self.num_partitions) + + def check_schema( + self, client: Any, dataframe: DataFrame, table_name: str, database: str = None + ) -> DataFrame: + """Instantiate the schema check hook to check schema between dataframe and database. + + Args: + client: client for Spark or Cassandra connections with external services. + dataframe: Spark dataframe containing data from a feature set. + table_name: table name where the dataframe will be saved. + database: database name where the dataframe will be saved. + """ + if not self.check_schema_hook: + self.check_schema_hook = SparkTableSchemaCompatibilityHook( + client, table_name, database + ) + + return self.check_schema_hook.run(dataframe) diff --git a/butterfree/load/writers/online_feature_store_writer.py b/butterfree/load/writers/online_feature_store_writer.py index a81a1040e..fade37896 100644 --- a/butterfree/load/writers/online_feature_store_writer.py +++ b/butterfree/load/writers/online_feature_store_writer.py @@ -7,9 +7,11 @@ from pyspark.sql.functions import col, row_number from pyspark.sql.streaming import StreamingQuery -from butterfree.clients import SparkClient +from butterfree.clients import CassandraClient, SparkClient from butterfree.configs.db import AbstractWriteConfig, CassandraConfig from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.hooks import Hook +from butterfree.hooks.schema_compatibility import CassandraTableSchemaCompatibilityHook from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet @@ -66,6 +68,12 @@ class OnlineFeatureStoreWriter(Writer): Both methods (writer and validate) will need the Spark Client, Feature Set and DataFrame, to write or to validate, according to OnlineFeatureStoreWriter class arguments. + + There's an important aspect to be highlighted here: if you're using + the incremental mode, we do not check if your data is the newest before + writing to the online feature store. + + This behavior is known and will be fixed soon. 
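
Since check_schema_hook is an ordinary constructor argument, the default schema check can be replaced or pre-configured when the writer is built. A sketch, assuming the Cassandra connection details come from your own configuration:

from butterfree.clients import CassandraClient
from butterfree.hooks.schema_compatibility import CassandraTableSchemaCompatibilityHook
from butterfree.load.writers import OnlineFeatureStoreWriter

cassandra_client = CassandraClient(
    host=["localhost"],
    keyspace="feature_store",
    user="username",
    password="password",
)

writer = OnlineFeatureStoreWriter(
    check_schema_hook=CassandraTableSchemaCompatibilityHook(
        cassandra_client, "house_listings"
    ),
)
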
""" __name__ = "Online Feature Store Writer" @@ -75,11 +83,13 @@ def __init__( db_config: Union[AbstractWriteConfig, CassandraConfig] = None, debug_mode: bool = False, write_to_entity: bool = False, + interval_mode: bool = False, + check_schema_hook: Hook = None, ): - super(OnlineFeatureStoreWriter, self).__init__() + super(OnlineFeatureStoreWriter, self).__init__(debug_mode, interval_mode) self.db_config = db_config or CassandraConfig() - self.debug_mode = debug_mode self.write_to_entity = write_to_entity + self.check_schema_hook = check_schema_hook @staticmethod def filter_latest(dataframe: DataFrame, id_columns: List[Any]) -> DataFrame: @@ -170,6 +180,22 @@ def write( """ table_name = feature_set.entity if self.write_to_entity else feature_set.name + if not self.debug_mode: + config = ( + self.db_config + if self.db_config == CassandraConfig + else CassandraConfig() + ) + + cassandra_client = CassandraClient( + host=[config.host], + keyspace=config.keyspace, + user=config.username, + password=config.password, + ) + + dataframe = self.check_schema(cassandra_client, dataframe, table_name) + if dataframe.isStreaming: dataframe = self._apply_transformations(dataframe) if self.debug_mode: @@ -236,3 +262,21 @@ def get_db_schema(self, feature_set: FeatureSet) -> List[Dict[Any, Any]]: """ db_schema = self.db_config.translate(feature_set.get_schema()) return db_schema + + def check_schema( + self, client: Any, dataframe: DataFrame, table_name: str, database: str = None + ) -> DataFrame: + """Instantiate the schema check hook to check schema between dataframe and database. + + Args: + client: client for Spark or Cassandra connections with external services. + dataframe: Spark dataframe containing data from a feature set. + table_name: table name where the dataframe will be saved. + database: database name where the dataframe will be saved. + """ + if not self.check_schema_hook: + self.check_schema_hook = CassandraTableSchemaCompatibilityHook( + client, table_name + ) + + return self.check_schema_hook.run(dataframe) diff --git a/butterfree/load/writers/writer.py b/butterfree/load/writers/writer.py index f76b4c253..7e0f9018d 100644 --- a/butterfree/load/writers/writer.py +++ b/butterfree/load/writers/writer.py @@ -7,10 +7,11 @@ from pyspark.sql.dataframe import DataFrame from butterfree.clients import SparkClient +from butterfree.hooks import HookableComponent from butterfree.transform import FeatureSet -class Writer(ABC): +class Writer(ABC, HookableComponent): """Abstract base class for Writers. Args: @@ -18,8 +19,11 @@ class Writer(ABC): """ - def __init__(self) -> None: + def __init__(self, debug_mode: bool = False, interval_mode: bool = False) -> None: + super().__init__() self.transformations: List[Dict[str, Any]] = [] + self.debug_mode = debug_mode + self.interval_mode = interval_mode def with_( self, transformer: Callable[..., DataFrame], *args: Any, **kwargs: Any @@ -70,6 +74,19 @@ def write( """ + @abstractmethod + def check_schema( + self, client: Any, dataframe: DataFrame, table_name: str, database: str = None + ) -> DataFrame: + """Instantiate the schema check hook to check schema between dataframe and database. + + Args: + client: client for Spark or Cassandra connections with external services. + dataframe: Spark dataframe containing data from a feature set. + table_name: table name where the dataframe will be saved. + database: database name where the dataframe will be saved. 
+ """ + @abstractmethod def validate( self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient diff --git a/butterfree/pipelines/feature_set_pipeline.py b/butterfree/pipelines/feature_set_pipeline.py index ce1b7ba4d..8aec54ec2 100644 --- a/butterfree/pipelines/feature_set_pipeline.py +++ b/butterfree/pipelines/feature_set_pipeline.py @@ -40,11 +40,12 @@ class FeatureSetPipeline: ... ) >>> from butterfree.load import Sink >>> from butterfree.load.writers import HistoricalFeatureStoreWriter - >>> import pyspark.sql.functions as F + >>> from pyspark.sql import functions >>> def divide(df, fs, column1, column2): ... name = fs.get_output_columns()[0] - ... df = df.withColumn(name, F.col(column1) / F.col(column2)) + ... df = df.withColumn(name, + ... functions.col(column1) / functions.col(column2)) ... return df >>> pipeline = FeatureSetPipeline( @@ -67,7 +68,8 @@ class FeatureSetPipeline: ... name="feature1", ... description="test", ... transformation=SparkFunctionTransform( - ... functions=[F.avg, F.stddev_pop] + ... functions=[Function(functions.avg, DataType.DOUBLE), + ... Function(functions.stddev_pop, DataType.DOUBLE)], ... ).with_window( ... partition_by="id", ... order_by=TIMESTAMP_COLUMN, @@ -113,6 +115,19 @@ class FeatureSetPipeline: the defined sources, compute all the transformations and save the data to the specified locations. + We can run the pipeline over a range of dates by passing an end-date + and a start-date, where it will only bring data within this date range. + + >>> pipeline.run(end_date="2020-08-04", start_date="2020-07-04") + + Or run up to a date, where it will only bring data up to the specific date. + + >>> pipeline.run(end_date="2020-08-04") + + Or just a specific date, where you will only bring data for that day. + + >>> pipeline.run_for_date(execution_date="2020-08-04") + """ def __init__( @@ -179,6 +194,7 @@ def run( partition_by: List[str] = None, order_by: List[str] = None, num_processors: int = None, + start_date: str = None, ) -> None: """Runs the defined feature set pipeline. @@ -192,7 +208,11 @@ def run( soon. Use only if strictly necessary. """ - dataframe = self.source.construct(client=self.spark_client) + dataframe = self.source.construct( + client=self.spark_client, + start_date=self.feature_set.define_start_date(start_date), + end_date=end_date, + ) if partition_by: order_by = order_by or partition_by @@ -203,6 +223,7 @@ def run( dataframe = self.feature_set.construct( dataframe=dataframe, client=self.spark_client, + start_date=start_date, end_date=end_date, num_processors=num_processors, ) @@ -219,3 +240,30 @@ def run( feature_set=self.feature_set, spark_client=self.spark_client, ) + + def run_for_date( + self, + execution_date: str = None, + partition_by: List[str] = None, + order_by: List[str] = None, + num_processors: int = None, + ) -> None: + """Runs the defined feature set pipeline for a specific date. + + The pipeline consists in the following steps: + + - Constructs the input dataframe from the data source. + - Construct the feature set dataframe using the defined Features. + - Load the data to the configured sink locations. + + It's important to notice, however, that both parameters partition_by + and num_processors are WIP, we intend to enhance their functionality + soon. Use only if strictly necessary. 
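
For example, a simple daily backfill can be expressed as repeated run_for_date calls; the pipeline object and the date range are assumed here, and in practice an orchestrator such as Airflow would usually drive this:

from datetime import date, timedelta

start = date(2020, 8, 1)
for offset in range(4):
    execution_date = (start + timedelta(days=offset)).isoformat()
    # Each call processes a single day: start_date == end_date == execution_date.
    pipeline.run_for_date(execution_date=execution_date)
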
+ """ + self.run( + start_date=execution_date, + end_date=execution_date, + partition_by=partition_by, + order_by=order_by, + num_processors=num_processors, + ) diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index f43c12d5d..a19efb350 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -1,6 +1,6 @@ """AggregatedFeatureSet entity.""" import itertools -from datetime import timedelta +from datetime import datetime, timedelta from functools import reduce from typing import Any, Dict, List, Optional, Union @@ -8,6 +8,7 @@ from pyspark.sql import DataFrame, functions from butterfree.clients import SparkClient +from butterfree.constants.window_definitions import ALLOWED_WINDOWS from butterfree.dataframe_service import repartition_df from butterfree.transform import FeatureSet from butterfree.transform.features import Feature, KeyFeature, TimestampFeature @@ -488,12 +489,45 @@ def get_schema(self) -> List[Dict[str, Any]]: return schema + @staticmethod + def _get_biggest_window_in_days(definitions: List[str]) -> float: + windows_list = [] + for window in definitions: + windows_list.append( + int(window.split()[0]) * ALLOWED_WINDOWS[window.split()[1]] + ) + return max(windows_list) / (60 * 60 * 24) + + def define_start_date(self, start_date: str = None) -> Optional[str]: + """Get aggregated feature set start date. + + Args: + start_date: start date regarding source dataframe. + + Returns: + start date. + """ + if self._windows and start_date: + window_definition = [ + definition.frame_boundaries.window_definition + for definition in self._windows + ] + biggest_window = self._get_biggest_window_in_days(window_definition) + + return ( + datetime.strptime(start_date, "%Y-%m-%d") + - timedelta(days=int(biggest_window) + 1) + ).strftime("%Y-%m-%d") + + return start_date + def construct( self, dataframe: DataFrame, client: SparkClient, end_date: str = None, num_processors: int = None, + start_date: str = None, ) -> DataFrame: """Use all the features to build the feature set dataframe. @@ -506,6 +540,7 @@ def construct( client: client responsible for connecting to Spark session. end_date: user defined max date for having aggregated data (exclusive). num_processors: cluster total number of processors for repartitioning. + start_date: user defined min date for having aggregated data. Returns: Spark dataframe with all the feature columns. 
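
To make the rollback arithmetic concrete: with rolling windows of "1 day" and "2 weeks", the biggest window is 14 days, so a start_date of 2020-08-04 is pushed back by 14 + 1 days. A small sketch reproducing that computation with a subset of the unit table from butterfree.constants.window_definitions:

from datetime import datetime, timedelta

ALLOWED_WINDOWS = {"day": 86400, "days": 86400, "week": 604800, "weeks": 604800}

definitions = ["1 day", "2 weeks"]
biggest_window_days = max(
    int(definition.split()[0]) * ALLOWED_WINDOWS[definition.split()[1]]
    for definition in definitions
) / (60 * 60 * 24)  # 14.0

adjusted_start_date = (
    datetime.strptime("2020-08-04", "%Y-%m-%d")
    - timedelta(days=int(biggest_window_days) + 1)
).strftime("%Y-%m-%d")
print(adjusted_start_date)  # 2020-07-20
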
@@ -519,10 +554,12 @@ def construct( if not isinstance(dataframe, DataFrame): raise ValueError("source_df must be a dataframe") + pre_hook_df = self.run_pre_hooks(dataframe) + output_df = reduce( lambda df, feature: feature.transform(df), self.keys + [self.timestamp], - dataframe, + pre_hook_df, ) if self._windows and end_date is not None: @@ -558,6 +595,10 @@ def construct( else: output_df = self._aggregate(output_df, features=self.features) + output_df = self.incremental_strategy.filter_with_incremental_strategy( + dataframe=output_df, start_date=start_date, end_date=end_date + ) + output_df = output_df.select(*self.columns).replace( # type: ignore float("nan"), None ) @@ -565,4 +606,6 @@ def construct( output_df = self._filter_duplicated_rows(output_df) output_df.cache().count() - return output_df + post_hook_df = self.run_post_hooks(output_df) + + return post_hook_df diff --git a/butterfree/transform/feature_set.py b/butterfree/transform/feature_set.py index c35e90fa1..c2e40a498 100644 --- a/butterfree/transform/feature_set.py +++ b/butterfree/transform/feature_set.py @@ -1,7 +1,7 @@ """FeatureSet entity.""" import itertools from functools import reduce -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import pyspark.sql.functions as F from pyspark.sql import Window @@ -9,6 +9,8 @@ from butterfree.clients import SparkClient from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.dataframe_service import IncrementalStrategy +from butterfree.hooks import HookableComponent from butterfree.transform.features import Feature, KeyFeature, TimestampFeature from butterfree.transform.transformations import ( AggregatedTransform, @@ -16,7 +18,7 @@ ) -class FeatureSet: +class FeatureSet(HookableComponent): """Holds metadata about the feature set and constructs the final dataframe. Attributes: @@ -106,12 +108,14 @@ def __init__( timestamp: TimestampFeature, features: List[Feature], ) -> None: + super().__init__() self.name = name self.entity = entity self.description = description self.keys = keys self.timestamp = timestamp self.features = features + self.incremental_strategy = IncrementalStrategy(column=TIMESTAMP_COLUMN) @property def name(self) -> str: @@ -243,9 +247,6 @@ def columns(self) -> List[str]: def get_schema(self) -> List[Dict[str, Any]]: """Get feature set schema. - Args: - feature_set: object processed with feature set metadata. - Returns: List of dicts regarding cassandra feature set schema. @@ -378,12 +379,24 @@ def _filter_duplicated_rows(self, df: DataFrame) -> DataFrame: return df.select([column for column in self.columns]) + def define_start_date(self, start_date: str = None) -> Optional[str]: + """Get feature set start date. + + Args: + start_date: start date regarding source dataframe. + + Returns: + start date. + """ + return start_date + def construct( self, dataframe: DataFrame, client: SparkClient, end_date: str = None, num_processors: int = None, + start_date: str = None, ) -> DataFrame: """Use all the features to build the feature set dataframe. @@ -393,7 +406,8 @@ def construct( Args: dataframe: input dataframe to be transformed by the features. client: client responsible for connecting to Spark session. - end_date: user defined base date. + start_date: user defined start date. + end_date: user defined end date. num_processors: cluster total number of processors for repartitioning. 
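
The IncrementalStrategy used here is created on TIMESTAMP_COLUMN, so the final filtering step behaves conceptually like the date-bounded filter sketched below; this is a plain PySpark illustration only, the real logic lives in butterfree.dataframe_service and its exact boundary semantics may differ:

from pyspark.sql import DataFrame, functions

from butterfree.constants.columns import TIMESTAMP_COLUMN


def filter_by_date_range(
    dataframe: DataFrame, start_date: str = None, end_date: str = None
) -> DataFrame:
    # Apply only the bounds that were actually provided.
    if start_date:
        dataframe = dataframe.filter(
            functions.col(TIMESTAMP_COLUMN) >= functions.lit(start_date).cast("timestamp")
        )
    if end_date:
        dataframe = dataframe.filter(
            functions.col(TIMESTAMP_COLUMN) <= functions.lit(end_date).cast("timestamp")
        )
    return dataframe
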
Returns: @@ -403,14 +417,22 @@ def construct( if not isinstance(dataframe, DataFrame): raise ValueError("source_df must be a dataframe") + pre_hook_df = self.run_pre_hooks(dataframe) + output_df = reduce( lambda df, feature: feature.transform(df), self.keys + [self.timestamp] + self.features, - dataframe, + pre_hook_df, ).select(*self.columns) if not output_df.isStreaming: output_df = self._filter_duplicated_rows(output_df) output_df.cache().count() - return output_df + output_df = self.incremental_strategy.filter_with_incremental_strategy( + dataframe=output_df, start_date=start_date, end_date=end_date + ) + + post_hook_df = self.run_post_hooks(output_df) + + return post_hook_df diff --git a/butterfree/transform/utils/window_spec.py b/butterfree/transform/utils/window_spec.py index f3a392f6a..a270fec03 100644 --- a/butterfree/transform/utils/window_spec.py +++ b/butterfree/transform/utils/window_spec.py @@ -5,6 +5,7 @@ from pyspark.sql import Column, WindowSpec, functions from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.constants.window_definitions import ALLOWED_WINDOWS class FrameBoundaries: @@ -16,21 +17,6 @@ class FrameBoundaries: it can be second(s), minute(s), hour(s), day(s), week(s) and year(s), """ - __ALLOWED_WINDOWS = { - "second": 1, - "seconds": 1, - "minute": 60, - "minutes": 60, - "hour": 3600, - "hours": 3600, - "day": 86400, - "days": 86400, - "week": 604800, - "weeks": 604800, - "year": 29030400, - "years": 29030400, - } - def __init__(self, mode: Optional[str], window_definition: str): self.mode = mode self.window_definition = window_definition @@ -46,7 +32,7 @@ def window_size(self) -> int: def window_unit(self) -> str: """Returns window unit.""" unit = self.window_definition.split()[1] - if unit not in self.__ALLOWED_WINDOWS and self.mode != "row_windows": + if unit not in ALLOWED_WINDOWS and self.mode != "row_windows": raise ValueError("Not allowed") return unit @@ -59,7 +45,7 @@ def get(self, window: WindowSpec) -> Any: span = self.window_size - 1 return window.rowsBetween(-span, 0) if self.mode == "fixed_windows": - span = self.__ALLOWED_WINDOWS[self.window_unit] * self.window_size + span = ALLOWED_WINDOWS[self.window_unit] * self.window_size return window.rangeBetween(-span, 0) diff --git a/examples/interval_runs/interval_runs.ipynb b/examples/interval_runs/interval_runs.ipynb new file mode 100644 index 000000000..e234da8ac --- /dev/null +++ b/examples/interval_runs/interval_runs.ipynb @@ -0,0 +1,2152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# #5 Discovering Butterfree - Interval Runs\n", + "\n", + "Welcome to Discovering Butterfree tutorial series!\n", + "\n", + "This is the fifth tutorial of this series: its goal is to cover interval runs.\n", + "\n", + "Before diving into the tutorial make sure you have a basic understanding of these main data concepts: features, feature sets and the \"Feature Store Architecture\", you can read more about this [here]." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example:\n", + "\n", + "Simulating the following scenario (the same from previous tutorials):\n", + "\n", + "- We want to create a feature set with features about houses for rent (listings).\n", + "\n", + "\n", + "We have an input dataset:\n", + "\n", + "- Table: `listing_events`. 
Table with data about events of house listings.\n", + "\n", + "\n", + "Our desire is to have three resulting datasets with the following schema:\n", + "\n", + "* id: **int**;\n", + "* timestamp: **timestamp**;\n", + "* rent__avg_over_1_day_rolling_windows: **double**;\n", + "* rent__stddev_pop_over_1_day_rolling_windows: **double**.\n", + " \n", + "The first dataset will be computed with just an end date time limit. The second one, on the other hand, uses both start and end date in order to filter data. Finally, the third one will be the result of a daily run. You can understand more about these definitions in our documentation.\n", + "\n", + "The following code blocks will show how to generate this feature set using Butterfree library:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# setup spark\n", + "from pyspark import SparkContext, SparkConf\n", + "from pyspark.sql import session\n", + "\n", + "conf = SparkConf().setAll([('spark.driver.host','127.0.0.1'), ('spark.sql.session.timeZone', 'UTC')])\n", + "sc = SparkContext(conf=conf)\n", + "spark = session.SparkSession(sc)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# fix working dir\n", + "import pathlib\n", + "import os\n", + "path = os.path.join(pathlib.Path().absolute(), '../..')\n", + "os.chdir(path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Showing test data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "listing_events_df = spark.read.json(f\"{path}/examples/data/listing_events.json\")\n", + "listing_events_df.createOrReplaceTempView(\"listing_events\") # creating listing_events view\n", + "\n", + "region = spark.read.json(f\"{path}/examples/data/region.json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Listing events table:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
areabathroomsbedroomsidregion_idrenttimestamp
050111113001588302000000
150111120001588647600000
2100122215001588734000000
3100122225001589252400000
4150223330001589943600000
5175224432001589943600000
6250335532001590030000000
7225326632001590116400000
\n", + "
" + ], + "text/plain": [ + " area bathrooms bedrooms id region_id rent timestamp\n", + "0 50 1 1 1 1 1300 1588302000000\n", + "1 50 1 1 1 1 2000 1588647600000\n", + "2 100 1 2 2 2 1500 1588734000000\n", + "3 100 1 2 2 2 2500 1589252400000\n", + "4 150 2 2 3 3 3000 1589943600000\n", + "5 175 2 2 4 4 3200 1589943600000\n", + "6 250 3 3 5 5 3200 1590030000000\n", + "7 225 3 2 6 6 3200 1590116400000" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "listing_events_df.toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Region table:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityidlatlngregion
0Cerulean173.4448931.75030Kanto
1Veridian2-9.43510-167.11772Kanto
2Cinnabar329.73043117.66164Kanto
3Pallet4-52.95717-81.15251Kanto
4Violet5-47.35798-178.77255Johto
5Olivine651.7282046.21958Johto
\n", + "
" + ], + "text/plain": [ + " city id lat lng region\n", + "0 Cerulean 1 73.44489 31.75030 Kanto\n", + "1 Veridian 2 -9.43510 -167.11772 Kanto\n", + "2 Cinnabar 3 29.73043 117.66164 Kanto\n", + "3 Pallet 4 -52.95717 -81.15251 Kanto\n", + "4 Violet 5 -47.35798 -178.77255 Johto\n", + "5 Olivine 6 51.72820 46.21958 Johto" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "region.toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extract\n", + "\n", + "- For the extract part, we need the `Source` entity and the `FileReader` for the data we have;\n", + "- We need to declare a query in order to bring the results from our lonely reader (it's as simples as a select all statement)." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from butterfree.clients import SparkClient\n", + "from butterfree.extract import Source\n", + "from butterfree.extract.readers import FileReader, TableReader\n", + "from butterfree.extract.pre_processing import filter\n", + "\n", + "readers = [\n", + " TableReader(id=\"listing_events\", table=\"listing_events\",),\n", + " FileReader(id=\"region\", path=f\"{path}/examples/data/region.json\", format=\"json\",)\n", + "]\n", + "\n", + "query = \"\"\"\n", + "select\n", + " listing_events.*,\n", + " region.city,\n", + " region.region,\n", + " region.lat,\n", + " region.lng,\n", + " region.region as region_name\n", + "from\n", + " listing_events\n", + " join region\n", + " on listing_events.region_id = region.id\n", + "\"\"\"\n", + "\n", + "source = Source(readers=readers, query=query)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "spark_client = SparkClient()\n", + "source_df = source.construct(spark_client)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And, finally, it's possible to see the results from building our souce dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
areabathroomsbedroomsidregion_idrenttimestampcityregionlatlngregion_name
050111113001588302000000CeruleanKanto73.4448931.75030Kanto
150111120001588647600000CeruleanKanto73.4448931.75030Kanto
2100122215001588734000000VeridianKanto-9.43510-167.11772Kanto
3100122225001589252400000VeridianKanto-9.43510-167.11772Kanto
4150223330001589943600000CinnabarKanto29.73043117.66164Kanto
5175224432001589943600000PalletKanto-52.95717-81.15251Kanto
6250335532001590030000000VioletJohto-47.35798-178.77255Johto
7225326632001590116400000OlivineJohto51.7282046.21958Johto
\n", + "
" + ], + "text/plain": [ + " area bathrooms bedrooms id region_id rent timestamp city \\\n", + "0 50 1 1 1 1 1300 1588302000000 Cerulean \n", + "1 50 1 1 1 1 2000 1588647600000 Cerulean \n", + "2 100 1 2 2 2 1500 1588734000000 Veridian \n", + "3 100 1 2 2 2 2500 1589252400000 Veridian \n", + "4 150 2 2 3 3 3000 1589943600000 Cinnabar \n", + "5 175 2 2 4 4 3200 1589943600000 Pallet \n", + "6 250 3 3 5 5 3200 1590030000000 Violet \n", + "7 225 3 2 6 6 3200 1590116400000 Olivine \n", + "\n", + " region lat lng region_name \n", + "0 Kanto 73.44489 31.75030 Kanto \n", + "1 Kanto 73.44489 31.75030 Kanto \n", + "2 Kanto -9.43510 -167.11772 Kanto \n", + "3 Kanto -9.43510 -167.11772 Kanto \n", + "4 Kanto 29.73043 117.66164 Kanto \n", + "5 Kanto -52.95717 -81.15251 Kanto \n", + "6 Johto -47.35798 -178.77255 Johto \n", + "7 Johto 51.72820 46.21958 Johto " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "source_df.toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transform\n", + "- At the transform part, a set of `Feature` objects is declared;\n", + "- An Instance of `AggregatedFeatureSet` is used to hold the features;\n", + "- An `AggregatedFeatureSet` can only be created when it is possible to define a unique tuple formed by key columns and a time reference. This is an **architectural requirement** for the data. So least one `KeyFeature` and one `TimestampFeature` is needed;\n", + "- Every `Feature` needs a unique name, a description, and a data-type definition. Besides, in the case of the `AggregatedFeatureSet`, it's also mandatory to have an `AggregatedTransform` operator;\n", + "- An `AggregatedTransform` operator is used, as the name suggests, to define aggregation functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import functions as F\n", + "\n", + "from butterfree.transform.aggregated_feature_set import AggregatedFeatureSet\n", + "from butterfree.transform.features import Feature, KeyFeature, TimestampFeature\n", + "from butterfree.transform.transformations import AggregatedTransform\n", + "from butterfree.constants import DataType\n", + "from butterfree.transform.utils import Function\n", + "\n", + "keys = [\n", + " KeyFeature(\n", + " name=\"id\",\n", + " description=\"Unique identificator code for houses.\",\n", + " dtype=DataType.BIGINT,\n", + " )\n", + "]\n", + "\n", + "# from_ms = True because the data originally is not in a Timestamp format.\n", + "ts_feature = TimestampFeature(from_ms=True)\n", + "\n", + "features = [\n", + " Feature(\n", + " name=\"rent\",\n", + " description=\"Rent value by month described in the listing.\",\n", + " transformation=AggregatedTransform(\n", + " functions=[\n", + " Function(F.avg, DataType.DOUBLE),\n", + " Function(F.stddev_pop, DataType.DOUBLE),\n", + " ],\n", + " filter_expression=\"region_name = 'Kanto'\",\n", + " ),\n", + " )\n", + "]\n", + "\n", + "aggregated_feature_set = AggregatedFeatureSet(\n", + " name=\"house_listings\",\n", + " entity=\"house\", # entity: to which \"business context\" this feature set belongs\n", + " description=\"Features describring a house listing.\",\n", + " keys=keys,\n", + " timestamp=ts_feature,\n", + " features=features,\n", + ").with_windows(definitions=[\"1 day\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we'll define out first aggregated feature set, with just an `end date` parameter:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "aggregated_feature_set_windows_df = aggregated_feature_set.construct(\n", + " source_df, \n", + " spark_client, \n", + " end_date=\"2020-05-30\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The resulting dataset is:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windows
012020-05-01NaNNaN
112020-05-021300.00.0
212020-05-03NaNNaN
312020-05-062000.00.0
412020-05-07NaNNaN
522020-05-01NaNNaN
622020-05-071500.00.0
722020-05-08NaNNaN
822020-05-132500.00.0
922020-05-14NaNNaN
1032020-05-01NaNNaN
1132020-05-213000.00.0
1232020-05-22NaNNaN
1342020-05-01NaNNaN
1442020-05-213200.00.0
1542020-05-22NaNNaN
1652020-05-01NaNNaN
1762020-05-01NaNNaN
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-01 NaN \n", + "1 1 2020-05-02 1300.0 \n", + "2 1 2020-05-03 NaN \n", + "3 1 2020-05-06 2000.0 \n", + "4 1 2020-05-07 NaN \n", + "5 2 2020-05-01 NaN \n", + "6 2 2020-05-07 1500.0 \n", + "7 2 2020-05-08 NaN \n", + "8 2 2020-05-13 2500.0 \n", + "9 2 2020-05-14 NaN \n", + "10 3 2020-05-01 NaN \n", + "11 3 2020-05-21 3000.0 \n", + "12 3 2020-05-22 NaN \n", + "13 4 2020-05-01 NaN \n", + "14 4 2020-05-21 3200.0 \n", + "15 4 2020-05-22 NaN \n", + "16 5 2020-05-01 NaN \n", + "17 6 2020-05-01 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows \n", + "0 NaN \n", + "1 0.0 \n", + "2 NaN \n", + "3 0.0 \n", + "4 NaN \n", + "5 NaN \n", + "6 0.0 \n", + "7 NaN \n", + "8 0.0 \n", + "9 NaN \n", + "10 NaN \n", + "11 0.0 \n", + "12 NaN \n", + "13 NaN \n", + "14 0.0 \n", + "15 NaN \n", + "16 NaN \n", + "17 NaN " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "aggregated_feature_set_windows_df.orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's possible to see that if we use both a `start date` and `end_date` values. Then we'll achieve a time slice of the last dataframe, as it's possible to see:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windows
012020-05-062000.00.0
112020-05-07NaNNaN
222020-05-06NaNNaN
322020-05-071500.00.0
422020-05-08NaNNaN
522020-05-132500.00.0
622020-05-14NaNNaN
732020-05-06NaNNaN
832020-05-213000.00.0
942020-05-06NaNNaN
1042020-05-213200.00.0
1152020-05-06NaNNaN
1262020-05-06NaNNaN
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-06 2000.0 \n", + "1 1 2020-05-07 NaN \n", + "2 2 2020-05-06 NaN \n", + "3 2 2020-05-07 1500.0 \n", + "4 2 2020-05-08 NaN \n", + "5 2 2020-05-13 2500.0 \n", + "6 2 2020-05-14 NaN \n", + "7 3 2020-05-06 NaN \n", + "8 3 2020-05-21 3000.0 \n", + "9 4 2020-05-06 NaN \n", + "10 4 2020-05-21 3200.0 \n", + "11 5 2020-05-06 NaN \n", + "12 6 2020-05-06 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows \n", + "0 0.0 \n", + "1 NaN \n", + "2 NaN \n", + "3 0.0 \n", + "4 NaN \n", + "5 0.0 \n", + "6 NaN \n", + "7 NaN \n", + "8 0.0 \n", + "9 NaN \n", + "10 0.0 \n", + "11 NaN \n", + "12 NaN " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "aggregated_feature_set.construct(\n", + " source_df, \n", + " spark_client, \n", + " end_date=\"2020-05-21\",\n", + " start_date=\"2020-05-06\",\n", + ").orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load\n", + "\n", + "- For the load part we need `Writer` instances and a `Sink`;\n", + "- `writers` define where to load the data;\n", + "- The `Sink` gets the transformed data (feature set) and trigger the load to all the defined `writers`;\n", + "- `debug_mode` will create a temporary view instead of trying to write in a real data store." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "from butterfree.load.writers import (\n", + " HistoricalFeatureStoreWriter,\n", + " OnlineFeatureStoreWriter,\n", + ")\n", + "from butterfree.load import Sink\n", + "\n", + "writers = [HistoricalFeatureStoreWriter(debug_mode=True, interval_mode=True), \n", + " OnlineFeatureStoreWriter(debug_mode=True, interval_mode=True)]\n", + "sink = Sink(writers=writers)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pipeline\n", + "\n", + "- The `Pipeline` entity wraps all the other defined elements.\n", + "- `run` command will trigger the execution of the pipeline, end-to-end." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from butterfree.pipelines import FeatureSetPipeline\n", + "\n", + "pipeline = FeatureSetPipeline(source=source, feature_set=aggregated_feature_set, sink=sink)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first run will use just an `end_date` as parameter:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "result_df = pipeline.run(end_date=\"2020-05-30\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windowsyearmonthday
012020-05-01NaNNaN202051
112020-05-021300.00.0202052
212020-05-03NaNNaN202053
312020-05-062000.00.0202056
412020-05-07NaNNaN202057
522020-05-01NaNNaN202051
622020-05-071500.00.0202057
722020-05-08NaNNaN202058
822020-05-132500.00.02020513
922020-05-14NaNNaN2020514
1032020-05-01NaNNaN202051
1132020-05-213000.00.02020521
1232020-05-22NaNNaN2020522
1342020-05-01NaNNaN202051
1442020-05-213200.00.02020521
1542020-05-22NaNNaN2020522
1652020-05-01NaNNaN202051
1762020-05-01NaNNaN202051
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-01 NaN \n", + "1 1 2020-05-02 1300.0 \n", + "2 1 2020-05-03 NaN \n", + "3 1 2020-05-06 2000.0 \n", + "4 1 2020-05-07 NaN \n", + "5 2 2020-05-01 NaN \n", + "6 2 2020-05-07 1500.0 \n", + "7 2 2020-05-08 NaN \n", + "8 2 2020-05-13 2500.0 \n", + "9 2 2020-05-14 NaN \n", + "10 3 2020-05-01 NaN \n", + "11 3 2020-05-21 3000.0 \n", + "12 3 2020-05-22 NaN \n", + "13 4 2020-05-01 NaN \n", + "14 4 2020-05-21 3200.0 \n", + "15 4 2020-05-22 NaN \n", + "16 5 2020-05-01 NaN \n", + "17 6 2020-05-01 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows year month day \n", + "0 NaN 2020 5 1 \n", + "1 0.0 2020 5 2 \n", + "2 NaN 2020 5 3 \n", + "3 0.0 2020 5 6 \n", + "4 NaN 2020 5 7 \n", + "5 NaN 2020 5 1 \n", + "6 0.0 2020 5 7 \n", + "7 NaN 2020 5 8 \n", + "8 0.0 2020 5 13 \n", + "9 NaN 2020 5 14 \n", + "10 NaN 2020 5 1 \n", + "11 0.0 2020 5 21 \n", + "12 NaN 2020 5 22 \n", + "13 NaN 2020 5 1 \n", + "14 0.0 2020 5 21 \n", + "15 NaN 2020 5 22 \n", + "16 NaN 2020 5 1 \n", + "17 NaN 2020 5 1 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.table(\"historical_feature_store__house_listings\").orderBy(\n", + " \"id\", \"timestamp\"\n", + ").orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windows
012020-05-07NaNNaN
122020-05-14NaNNaN
232020-05-22NaNNaN
342020-05-22NaNNaN
452020-05-01NaNNaN
562020-05-01NaNNaN
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-07 NaN \n", + "1 2 2020-05-14 NaN \n", + "2 3 2020-05-22 NaN \n", + "3 4 2020-05-22 NaN \n", + "4 5 2020-05-01 NaN \n", + "5 6 2020-05-01 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.table(\"online_feature_store__house_listings\").orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We can see that we were able to create all the desired features in an easy way\n", + "- The **historical feature set** holds all the data, and we can see that it is partitioned by year, month and day (columns added in the `HistoricalFeatureStoreWriter`)\n", + "- In the **online feature set** there is only the latest data for each id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The second run, on the other hand, will use both a `start_date` and `end_date` as parameters." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "result_df = pipeline.run(end_date=\"2020-05-21\", start_date=\"2020-05-06\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windowsyearmonthday
012020-05-062000.00.0202056
112020-05-07NaNNaN202057
222020-05-06NaNNaN202056
322020-05-071500.00.0202057
422020-05-08NaNNaN202058
522020-05-132500.00.02020513
622020-05-14NaNNaN2020514
732020-05-06NaNNaN202056
832020-05-213000.00.02020521
942020-05-06NaNNaN202056
1042020-05-213200.00.02020521
1152020-05-06NaNNaN202056
1262020-05-06NaNNaN202056
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-06 2000.0 \n", + "1 1 2020-05-07 NaN \n", + "2 2 2020-05-06 NaN \n", + "3 2 2020-05-07 1500.0 \n", + "4 2 2020-05-08 NaN \n", + "5 2 2020-05-13 2500.0 \n", + "6 2 2020-05-14 NaN \n", + "7 3 2020-05-06 NaN \n", + "8 3 2020-05-21 3000.0 \n", + "9 4 2020-05-06 NaN \n", + "10 4 2020-05-21 3200.0 \n", + "11 5 2020-05-06 NaN \n", + "12 6 2020-05-06 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows year month day \n", + "0 0.0 2020 5 6 \n", + "1 NaN 2020 5 7 \n", + "2 NaN 2020 5 6 \n", + "3 0.0 2020 5 7 \n", + "4 NaN 2020 5 8 \n", + "5 0.0 2020 5 13 \n", + "6 NaN 2020 5 14 \n", + "7 NaN 2020 5 6 \n", + "8 0.0 2020 5 21 \n", + "9 NaN 2020 5 6 \n", + "10 0.0 2020 5 21 \n", + "11 NaN 2020 5 6 \n", + "12 NaN 2020 5 6 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.table(\"historical_feature_store__house_listings\").orderBy(\n", + " \"id\", \"timestamp\"\n", + ").orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windows
012020-05-07NaNNaN
122020-05-14NaNNaN
232020-05-213000.00.0
342020-05-213200.00.0
452020-05-06NaNNaN
562020-05-06NaNNaN
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-07 NaN \n", + "1 2 2020-05-14 NaN \n", + "2 3 2020-05-21 3000.0 \n", + "3 4 2020-05-21 3200.0 \n", + "4 5 2020-05-06 NaN \n", + "5 6 2020-05-06 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows \n", + "0 NaN \n", + "1 NaN \n", + "2 0.0 \n", + "3 0.0 \n", + "4 NaN \n", + "5 NaN " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.table(\"online_feature_store__house_listings\").orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, the third run, will use only an `execution_date` as a parameter." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "result_df = pipeline.run_for_date(execution_date=\"2020-05-21\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windowsyearmonthday
012020-05-21NaNNaN2020521
122020-05-21NaNNaN2020521
232020-05-213000.00.02020521
342020-05-213200.00.02020521
452020-05-21NaNNaN2020521
562020-05-21NaNNaN2020521
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-21 NaN \n", + "1 2 2020-05-21 NaN \n", + "2 3 2020-05-21 3000.0 \n", + "3 4 2020-05-21 3200.0 \n", + "4 5 2020-05-21 NaN \n", + "5 6 2020-05-21 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows year month day \n", + "0 NaN 2020 5 21 \n", + "1 NaN 2020 5 21 \n", + "2 0.0 2020 5 21 \n", + "3 0.0 2020 5 21 \n", + "4 NaN 2020 5 21 \n", + "5 NaN 2020 5 21 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.table(\"historical_feature_store__house_listings\").orderBy(\n", + " \"id\", \"timestamp\"\n", + ").orderBy(\"id\", \"timestamp\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamprent__avg_over_1_day_rolling_windowsrent__stddev_pop_over_1_day_rolling_windows
012020-05-21NaNNaN
122020-05-21NaNNaN
232020-05-213000.00.0
342020-05-213200.00.0
452020-05-21NaNNaN
562020-05-21NaNNaN
\n", + "
" + ], + "text/plain": [ + " id timestamp rent__avg_over_1_day_rolling_windows \\\n", + "0 1 2020-05-21 NaN \n", + "1 2 2020-05-21 NaN \n", + "2 3 2020-05-21 3000.0 \n", + "3 4 2020-05-21 3200.0 \n", + "4 5 2020-05-21 NaN \n", + "5 6 2020-05-21 NaN \n", + "\n", + " rent__stddev_pop_over_1_day_rolling_windows \n", + "0 NaN \n", + "1 NaN \n", + "2 0.0 \n", + "3 0.0 \n", + "4 NaN \n", + "5 NaN " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.table(\"online_feature_store__house_listings\").orderBy(\"id\", \"timestamp\").toPandas()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/setup.py b/setup.py index bf471fecd..4adcbce93 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.1.3.dev0" +__version__ = "1.2.0.dev0" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/integration/butterfree/load/test_sink.py b/tests/integration/butterfree/load/test_sink.py index d00f48062..f507a3354 100644 --- a/tests/integration/butterfree/load/test_sink.py +++ b/tests/integration/butterfree/load/test_sink.py @@ -9,9 +9,10 @@ ) -def test_sink(input_dataframe, feature_set): +def test_sink(input_dataframe, feature_set, mocker): # arrange client = SparkClient() + client.conn.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic") feature_set_df = feature_set.construct(input_dataframe, client) target_latest_df = OnlineFeatureStoreWriter.filter_latest( feature_set_df, id_columns=[key.name for key in feature_set.keys] @@ -20,14 +21,23 @@ def test_sink(input_dataframe, feature_set): # setup historical writer s3config = Mock() + s3config.mode = "overwrite" + s3config.format_ = "parquet" s3config.get_options = Mock( - return_value={ - "mode": "overwrite", - "format_": "parquet", - "path": "test_folder/historical/entity/feature_set", - } + return_value={"path": "test_folder/historical/entity/feature_set"} + ) + s3config.get_path_with_partitions = Mock( + return_value="test_folder/historical/entity/feature_set" + ) + + historical_writer = HistoricalFeatureStoreWriter( + db_config=s3config, interval_mode=True ) - historical_writer = HistoricalFeatureStoreWriter(db_config=s3config) + + schema_dataframe = historical_writer._create_partitions(feature_set_df) + historical_writer.check_schema_hook = mocker.stub("check_schema_hook") + historical_writer.check_schema_hook.run = mocker.stub("run") + historical_writer.check_schema_hook.run.return_value = schema_dataframe # setup online writer # TODO: Change for CassandraConfig when Cassandra for test is ready @@ -39,6 +49,10 @@ def test_sink(input_dataframe, feature_set): ) online_writer = OnlineFeatureStoreWriter(db_config=online_config) + online_writer.check_schema_hook = mocker.stub("check_schema_hook") + online_writer.check_schema_hook.run = mocker.stub("run") + online_writer.check_schema_hook.run.return_value = feature_set_df + writers = [historical_writer, online_writer] sink = Sink(writers) @@ -47,13 +61,14 @@ def test_sink(input_dataframe, feature_set): 
sink.flush(feature_set, feature_set_df, client) # get historical results - historical_result_df = client.read_table( - feature_set.name, historical_writer.database + historical_result_df = client.read( + s3config.format_, + path=s3config.get_path_with_partitions(feature_set.name, feature_set_df), ) # get online results online_result_df = client.read( - online_config.format_, options=online_config.get_options(feature_set.name) + online_config.format_, **online_config.get_options(feature_set.name) ) # assert diff --git a/tests/integration/butterfree/pipelines/conftest.py b/tests/integration/butterfree/pipelines/conftest.py index 798941761..73da163e6 100644 --- a/tests/integration/butterfree/pipelines/conftest.py +++ b/tests/integration/butterfree/pipelines/conftest.py @@ -1,7 +1,19 @@ import pytest +from pyspark.sql import DataFrame +from pyspark.sql import functions as F from butterfree.constants import DataType from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.dataframe_service.incremental_strategy import IncrementalStrategy +from butterfree.extract import Source +from butterfree.extract.readers import TableReader +from butterfree.load import Sink +from butterfree.load.writers import HistoricalFeatureStoreWriter +from butterfree.pipelines.feature_set_pipeline import FeatureSetPipeline +from butterfree.transform import FeatureSet +from butterfree.transform.features import Feature, KeyFeature, TimestampFeature +from butterfree.transform.transformations import SparkFunctionTransform +from butterfree.transform.utils import Function @pytest.fixture() @@ -74,3 +86,193 @@ def fixed_windows_output_feature_set_dataframe(spark_context, spark_session): df = df.withColumn(TIMESTAMP_COLUMN, df.timestamp.cast(DataType.TIMESTAMP.spark)) return df + + +@pytest.fixture() +def mocked_date_df(spark_context, spark_session): + data = [ + {"id": 1, "ts": "2016-04-11 11:31:11", "feature": 200}, + {"id": 1, "ts": "2016-04-12 11:44:12", "feature": 300}, + {"id": 1, "ts": "2016-04-13 11:46:24", "feature": 400}, + {"id": 1, "ts": "2016-04-14 12:03:21", "feature": 500}, + ] + df = spark_session.read.json(spark_context.parallelize(data, 1)) + df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) + + return df + + +@pytest.fixture() +def fixed_windows_output_feature_set_date_dataframe(spark_context, spark_session): + data = [ + { + "id": 1, + "timestamp": "2016-04-12 11:44:12", + "feature__avg_over_1_day_fixed_windows": 300, + "feature__stddev_pop_over_1_day_fixed_windows": 0, + "year": 2016, + "month": 4, + "day": 12, + }, + { + "id": 1, + "timestamp": "2016-04-13 11:46:24", + "feature__avg_over_1_day_fixed_windows": 400, + "feature__stddev_pop_over_1_day_fixed_windows": 0, + "year": 2016, + "month": 4, + "day": 13, + }, + ] + df = spark_session.read.json(spark_context.parallelize(data, 1)) + df = df.withColumn(TIMESTAMP_COLUMN, df.timestamp.cast(DataType.TIMESTAMP.spark)) + + return df + + +@pytest.fixture() +def feature_set_pipeline( + spark_context, spark_session, +): + + feature_set_pipeline = FeatureSetPipeline( + source=Source( + readers=[ + TableReader(id="b_source", table="b_table",).with_incremental_strategy( + incremental_strategy=IncrementalStrategy(column="timestamp") + ), + ], + query=f"select * from b_source ", # noqa + ), + feature_set=FeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature( + name="feature", + description="test", + transformation=SparkFunctionTransform( + functions=[ + Function(F.avg, 
DataType.FLOAT), + Function(F.stddev_pop, DataType.FLOAT), + ], + ).with_window( + partition_by="id", + order_by=TIMESTAMP_COLUMN, + mode="fixed_windows", + window_definition=["1 day"], + ), + ), + ], + keys=[ + KeyFeature( + name="id", + description="The user's Main ID or device ID", + dtype=DataType.INTEGER, + ) + ], + timestamp=TimestampFeature(), + ), + sink=Sink(writers=[HistoricalFeatureStoreWriter(debug_mode=True)]), + ) + + return feature_set_pipeline + + +@pytest.fixture() +def pipeline_interval_run_target_dfs( + spark_session, spark_context +) -> (DataFrame, DataFrame, DataFrame): + first_data = [ + { + "id": 1, + "timestamp": "2016-04-11 11:31:11", + "feature": 200, + "run_id": 1, + "year": 2016, + "month": 4, + "day": 11, + }, + { + "id": 1, + "timestamp": "2016-04-12 11:44:12", + "feature": 300, + "run_id": 1, + "year": 2016, + "month": 4, + "day": 12, + }, + { + "id": 1, + "timestamp": "2016-04-13 11:46:24", + "feature": 400, + "run_id": 1, + "year": 2016, + "month": 4, + "day": 13, + }, + ] + + second_data = first_data + [ + { + "id": 1, + "timestamp": "2016-04-14 12:03:21", + "feature": 500, + "run_id": 2, + "year": 2016, + "month": 4, + "day": 14, + }, + ] + + third_data = [ + { + "id": 1, + "timestamp": "2016-04-11 11:31:11", + "feature": 200, + "run_id": 3, + "year": 2016, + "month": 4, + "day": 11, + }, + { + "id": 1, + "timestamp": "2016-04-12 11:44:12", + "feature": 300, + "run_id": 1, + "year": 2016, + "month": 4, + "day": 12, + }, + { + "id": 1, + "timestamp": "2016-04-13 11:46:24", + "feature": 400, + "run_id": 1, + "year": 2016, + "month": 4, + "day": 13, + }, + { + "id": 1, + "timestamp": "2016-04-14 12:03:21", + "feature": 500, + "run_id": 2, + "year": 2016, + "month": 4, + "day": 14, + }, + ] + + first_run_df = spark_session.read.json( + spark_context.parallelize(first_data, 1) + ).withColumn("timestamp", F.col("timestamp").cast(DataType.TIMESTAMP.spark)) + second_run_df = spark_session.read.json( + spark_context.parallelize(second_data, 1) + ).withColumn("timestamp", F.col("timestamp").cast(DataType.TIMESTAMP.spark)) + third_run_df = spark_session.read.json( + spark_context.parallelize(third_data, 1) + ).withColumn("timestamp", F.col("timestamp").cast(DataType.TIMESTAMP.spark)) + + return first_run_df, second_run_df, third_run_df diff --git a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py index 23d200c16..a302dc9e0 100644 --- a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py @@ -4,21 +4,48 @@ from pyspark.sql import DataFrame from pyspark.sql import functions as F +from butterfree.clients import SparkClient from butterfree.configs import environment +from butterfree.configs.db import MetastoreConfig from butterfree.constants import DataType from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.dataframe_service.incremental_strategy import IncrementalStrategy from butterfree.extract import Source from butterfree.extract.readers import TableReader +from butterfree.hooks import Hook from butterfree.load import Sink from butterfree.load.writers import HistoricalFeatureStoreWriter from butterfree.pipelines.feature_set_pipeline import FeatureSetPipeline from butterfree.testing.dataframe import assert_dataframe_equality from butterfree.transform import FeatureSet from butterfree.transform.features import Feature, KeyFeature, TimestampFeature -from 
butterfree.transform.transformations import CustomTransform, SparkFunctionTransform +from butterfree.transform.transformations import ( + CustomTransform, + SparkFunctionTransform, + SQLExpressionTransform, +) from butterfree.transform.utils import Function +class AddHook(Hook): + def __init__(self, value): + self.value = value + + def run(self, dataframe): + return dataframe.withColumn("feature", F.expr(f"feature + {self.value}")) + + +class RunHook(Hook): + def __init__(self, id): + self.id = id + + def run(self, dataframe): + return dataframe.withColumn( + "run_id", + F.when(F.lit(self.id).isNotNull(), F.lit(self.id)).otherwise(F.lit(None)), + ) + + def create_temp_view(dataframe: DataFrame, name): dataframe.createOrReplaceTempView(name) @@ -38,9 +65,21 @@ def divide(df, fs, column1, column2): return df +def create_ymd(dataframe): + return ( + dataframe.withColumn("year", F.year(F.col("timestamp"))) + .withColumn("month", F.month(F.col("timestamp"))) + .withColumn("day", F.dayofmonth(F.col("timestamp"))) + ) + + class TestFeatureSetPipeline: def test_feature_set_pipeline( - self, mocked_df, spark_session, fixed_windows_output_feature_set_dataframe + self, + mocked_df, + spark_session, + fixed_windows_output_feature_set_dataframe, + mocker, ): # arrange table_reader_id = "a_source" @@ -53,13 +92,25 @@ def test_feature_set_pipeline( table_reader_db=table_reader_db, table_reader_table=table_reader_table, ) + + spark_client = SparkClient() + spark_client.conn.conf.set( + "spark.sql.sources.partitionOverwriteMode", "dynamic" + ) + dbconfig = Mock() + dbconfig.mode = "overwrite" + dbconfig.format_ = "parquet" dbconfig.get_options = Mock( - return_value={ - "mode": "overwrite", - "format_": "parquet", - "path": "test_folder/historical/entity/feature_set", - } + return_value={"path": "test_folder/historical/entity/feature_set"} + ) + + historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig) + + historical_writer.check_schema_hook = mocker.stub("check_schema_hook") + historical_writer.check_schema_hook.run = mocker.stub("run") + historical_writer.check_schema_hook.run.return_value = ( + fixed_windows_output_feature_set_dataframe ) # act @@ -112,7 +163,7 @@ def test_feature_set_pipeline( ], timestamp=TimestampFeature(), ), - sink=Sink(writers=[HistoricalFeatureStoreWriter(db_config=dbconfig)],), + sink=Sink(writers=[historical_writer]), ) test_pipeline.run() @@ -129,3 +180,247 @@ def test_feature_set_pipeline( # tear down shutil.rmtree("test_folder") + + def test_feature_set_pipeline_with_dates( + self, + mocked_date_df, + spark_session, + fixed_windows_output_feature_set_date_dataframe, + feature_set_pipeline, + mocker, + ): + # arrange + table_reader_table = "b_table" + create_temp_view(dataframe=mocked_date_df, name=table_reader_table) + + historical_writer = HistoricalFeatureStoreWriter(debug_mode=True) + + feature_set_pipeline.sink.writers = [historical_writer] + + # act + feature_set_pipeline.run(start_date="2016-04-12", end_date="2016-04-13") + + df = spark_session.sql("select * from historical_feature_store__feature_set") + + # assert + assert_dataframe_equality(df, fixed_windows_output_feature_set_date_dataframe) + + def test_feature_set_pipeline_with_execution_date( + self, + mocked_date_df, + spark_session, + fixed_windows_output_feature_set_date_dataframe, + feature_set_pipeline, + mocker, + ): + # arrange + table_reader_table = "b_table" + create_temp_view(dataframe=mocked_date_df, name=table_reader_table) + + target_df = 
fixed_windows_output_feature_set_date_dataframe.filter( + "timestamp < '2016-04-13'" + ) + + historical_writer = HistoricalFeatureStoreWriter(debug_mode=True) + + feature_set_pipeline.sink.writers = [historical_writer] + + # act + feature_set_pipeline.run_for_date(execution_date="2016-04-12") + + df = spark_session.sql("select * from historical_feature_store__feature_set") + + # assert + assert_dataframe_equality(df, target_df) + + def test_pipeline_with_hooks(self, spark_session, mocker): + # arrange + hook1 = AddHook(value=1) + + spark_session.sql( + "select 1 as id, timestamp('2020-01-01') as timestamp, 0 as feature" + ).createOrReplaceTempView("test") + + target_df = spark_session.sql( + "select 1 as id, timestamp('2020-01-01') as timestamp, 6 as feature, 2020 " + "as year, 1 as month, 1 as day" + ) + + historical_writer = HistoricalFeatureStoreWriter(debug_mode=True) + + test_pipeline = FeatureSetPipeline( + source=Source( + readers=[TableReader(id="reader", table="test",).add_post_hook(hook1)], + query="select * from reader", + ).add_post_hook(hook1), + feature_set=FeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature( + name="feature", + description="test", + transformation=SQLExpressionTransform(expression="feature + 1"), + dtype=DataType.INTEGER, + ), + ], + keys=[ + KeyFeature( + name="id", + description="The user's Main ID or device ID", + dtype=DataType.INTEGER, + ) + ], + timestamp=TimestampFeature(), + ) + .add_pre_hook(hook1) + .add_post_hook(hook1), + sink=Sink(writers=[historical_writer],).add_pre_hook(hook1), + ) + + # act + test_pipeline.run() + output_df = spark_session.table("historical_feature_store__feature_set") + + # assert + output_df.show() + assert_dataframe_equality(output_df, target_df) + + def test_pipeline_interval_run( + self, mocked_date_df, pipeline_interval_run_target_dfs, spark_session + ): + """Testing pipeline's idempotent interval run feature. + Source data: + +-------+---+-------------------+-------------------+ + |feature| id| ts| timestamp| + +-------+---+-------------------+-------------------+ + | 200| 1|2016-04-11 11:31:11|2016-04-11 11:31:11| + | 300| 1|2016-04-12 11:44:12|2016-04-12 11:44:12| + | 400| 1|2016-04-13 11:46:24|2016-04-13 11:46:24| + | 500| 1|2016-04-14 12:03:21|2016-04-14 12:03:21| + +-------+---+-------------------+-------------------+ + The test executes 3 runs for different time intervals. The input data has 4 data + points: 2016-04-11, 2016-04-12, 2016-04-13 and 2016-04-14. The following run + specifications are: + 1) Interval: from 2016-04-11 to 2016-04-13 + Target table result: + +---+-------+---+-----+------+-------------------+----+ + |day|feature| id|month|run_id| timestamp|year| + +---+-------+---+-----+------+-------------------+----+ + | 11| 200| 1| 4| 1|2016-04-11 11:31:11|2016| + | 12| 300| 1| 4| 1|2016-04-12 11:44:12|2016| + | 13| 400| 1| 4| 1|2016-04-13 11:46:24|2016| + +---+-------+---+-----+------+-------------------+----+ + 2) Interval: only 2016-04-14. + Target table result: + +---+-------+---+-----+------+-------------------+----+ + |day|feature| id|month|run_id| timestamp|year| + +---+-------+---+-----+------+-------------------+----+ + | 11| 200| 1| 4| 1|2016-04-11 11:31:11|2016| + | 12| 300| 1| 4| 1|2016-04-12 11:44:12|2016| + | 13| 400| 1| 4| 1|2016-04-13 11:46:24|2016| + | 14| 500| 1| 4| 2|2016-04-14 12:03:21|2016| + +---+-------+---+-----+------+-------------------+----+ + 3) Interval: only 2016-04-11. 
+ Target table result: + +---+-------+---+-----+------+-------------------+----+ + |day|feature| id|month|run_id| timestamp|year| + +---+-------+---+-----+------+-------------------+----+ + | 11| 200| 1| 4| 3|2016-04-11 11:31:11|2016| + | 12| 300| 1| 4| 1|2016-04-12 11:44:12|2016| + | 13| 400| 1| 4| 1|2016-04-13 11:46:24|2016| + | 14| 500| 1| 4| 2|2016-04-14 12:03:21|2016| + +---+-------+---+-----+------+-------------------+----+ + """ + # arrange + create_temp_view(dataframe=mocked_date_df, name="input_data") + + db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE") + path = "test_folder/historical/entity/feature_set" + + spark_session.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic") + spark_session.sql(f"create database if not exists {db}") + spark_session.sql( + f"create table if not exists {db}.feature_set_interval " + f"(id int, timestamp timestamp, feature int, " + f"run_id int, year int, month int, day int);" + ) + + dbconfig = MetastoreConfig() + dbconfig.get_options = Mock( + return_value={"mode": "overwrite", "format_": "parquet", "path": path} + ) + + historical_writer = HistoricalFeatureStoreWriter( + db_config=dbconfig, interval_mode=True + ) + + first_run_hook = RunHook(id=1) + second_run_hook = RunHook(id=2) + third_run_hook = RunHook(id=3) + + ( + first_run_target_df, + second_run_target_df, + third_run_target_df, + ) = pipeline_interval_run_target_dfs + + test_pipeline = FeatureSetPipeline( + source=Source( + readers=[ + TableReader(id="id", table="input_data",).with_incremental_strategy( + IncrementalStrategy("ts") + ), + ], + query="select * from id ", + ), + feature_set=FeatureSet( + name="feature_set_interval", + entity="entity", + description="", + keys=[KeyFeature(name="id", description="", dtype=DataType.INTEGER,)], + timestamp=TimestampFeature(from_column="ts"), + features=[ + Feature(name="feature", description="", dtype=DataType.INTEGER), + Feature(name="run_id", description="", dtype=DataType.INTEGER), + ], + ), + sink=Sink([historical_writer],), + ) + + # act and assert + dbconfig.get_path_with_partitions = Mock( + return_value=[ + "test_folder/historical/entity/feature_set/year=2016/month=4/day=11", + "test_folder/historical/entity/feature_set/year=2016/month=4/day=12", + "test_folder/historical/entity/feature_set/year=2016/month=4/day=13", + ] + ) + test_pipeline.feature_set.add_pre_hook(first_run_hook) + test_pipeline.run(end_date="2016-04-13", start_date="2016-04-11") + first_run_output_df = spark_session.read.parquet(path) + assert_dataframe_equality(first_run_output_df, first_run_target_df) + + dbconfig.get_path_with_partitions = Mock( + return_value=[ + "test_folder/historical/entity/feature_set/year=2016/month=4/day=14", + ] + ) + test_pipeline.feature_set.add_pre_hook(second_run_hook) + test_pipeline.run_for_date("2016-04-14") + second_run_output_df = spark_session.read.parquet(path) + assert_dataframe_equality(second_run_output_df, second_run_target_df) + + dbconfig.get_path_with_partitions = Mock( + return_value=[ + "test_folder/historical/entity/feature_set/year=2016/month=4/day=11", + ] + ) + test_pipeline.feature_set.add_pre_hook(third_run_hook) + test_pipeline.run_for_date("2016-04-11") + third_run_output_df = spark_session.read.parquet(path) + assert_dataframe_equality(third_run_output_df, third_run_target_df) + + # tear down + shutil.rmtree("test_folder") diff --git a/tests/integration/butterfree/transform/conftest.py b/tests/integration/butterfree/transform/conftest.py index 6621c9a35..fe0cc5727 100644 --- 
a/tests/integration/butterfree/transform/conftest.py +++ b/tests/integration/butterfree/transform/conftest.py @@ -395,3 +395,58 @@ def rolling_windows_output_feature_set_dataframe_base_date( df = df.withColumn(TIMESTAMP_COLUMN, df.origin_ts.cast(DataType.TIMESTAMP.spark)) return df + + +@fixture +def feature_set_dates_dataframe(spark_context, spark_session): + data = [ + {"id": 1, "ts": "2016-04-11 11:31:11", "feature": 200}, + {"id": 1, "ts": "2016-04-12 11:44:12", "feature": 300}, + {"id": 1, "ts": "2016-04-13 11:46:24", "feature": 400}, + {"id": 1, "ts": "2016-04-14 12:03:21", "feature": 500}, + ] + df = spark_session.read.json(spark_context.parallelize(data, 1)) + df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) + df = df.withColumn("ts", df.ts.cast(DataType.TIMESTAMP.spark)) + + return df + + +@fixture +def feature_set_dates_output_dataframe(spark_context, spark_session): + data = [ + {"id": 1, "timestamp": "2016-04-11 11:31:11", "feature": 200}, + {"id": 1, "timestamp": "2016-04-12 11:44:12", "feature": 300}, + ] + df = spark_session.read.json(spark_context.parallelize(data, 1)) + df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) + + return df + + +@fixture +def rolling_windows_output_date_boundaries(spark_context, spark_session): + data = [ + { + "id": 1, + "ts": "2016-04-11 00:00:00", + "feature__avg_over_1_day_rolling_windows": None, + "feature__avg_over_1_week_rolling_windows": None, + "feature__stddev_pop_over_1_day_rolling_windows": None, + "feature__stddev_pop_over_1_week_rolling_windows": None, + }, + { + "id": 1, + "ts": "2016-04-12 00:00:00", + "feature__avg_over_1_day_rolling_windows": 200.0, + "feature__avg_over_1_week_rolling_windows": 200.0, + "feature__stddev_pop_over_1_day_rolling_windows": 0.0, + "feature__stddev_pop_over_1_week_rolling_windows": 0.0, + }, + ] + df = spark_session.read.json( + spark_context.parallelize(data).map(lambda x: json.dumps(x)) + ) + df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) + + return df diff --git a/tests/integration/butterfree/transform/test_aggregated_feature_set.py b/tests/integration/butterfree/transform/test_aggregated_feature_set.py index 559dbcb89..bc3ebb6c7 100644 --- a/tests/integration/butterfree/transform/test_aggregated_feature_set.py +++ b/tests/integration/butterfree/transform/test_aggregated_feature_set.py @@ -241,3 +241,53 @@ def test_construct_with_pivot( # assert assert_dataframe_equality(output_df, target_df_pivot_agg) + + def test_construct_rolling_windows_with_date_boundaries( + self, feature_set_dates_dataframe, rolling_windows_output_date_boundaries, + ): + # given + + spark_client = SparkClient() + + # arrange + + feature_set = AggregatedFeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature( + name="feature", + description="test", + transformation=AggregatedTransform( + functions=[ + Function(F.avg, DataType.DOUBLE), + Function(F.stddev_pop, DataType.DOUBLE), + ], + ), + ), + ], + keys=[ + KeyFeature( + name="id", + description="The user's Main ID or device ID", + dtype=DataType.INTEGER, + ) + ], + timestamp=TimestampFeature(), + ).with_windows(definitions=["1 day", "1 week"]) + + # act + output_df = feature_set.construct( + feature_set_dates_dataframe, + client=spark_client, + start_date="2016-04-11", + end_date="2016-04-12", + ).orderBy("timestamp") + + target_df = rolling_windows_output_date_boundaries.orderBy( + feature_set.timestamp_column + ).select(feature_set.columns) + + # 
assert + assert_dataframe_equality(output_df, target_df) diff --git a/tests/integration/butterfree/transform/test_feature_set.py b/tests/integration/butterfree/transform/test_feature_set.py index 4872ded24..25f70b6e2 100644 --- a/tests/integration/butterfree/transform/test_feature_set.py +++ b/tests/integration/butterfree/transform/test_feature_set.py @@ -77,3 +77,47 @@ def test_construct( # assert assert_dataframe_equality(output_df, target_df) + + def test_construct_with_date_boundaries( + self, feature_set_dates_dataframe, feature_set_dates_output_dataframe + ): + # given + + spark_client = SparkClient() + + # arrange + + feature_set = FeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature(name="feature", description="test", dtype=DataType.FLOAT,), + ], + keys=[ + KeyFeature( + name="id", + description="The user's Main ID or device ID", + dtype=DataType.INTEGER, + ) + ], + timestamp=TimestampFeature(), + ) + + output_df = ( + feature_set.construct( + feature_set_dates_dataframe, + client=spark_client, + start_date="2016-04-11", + end_date="2016-04-12", + ) + .orderBy(feature_set.timestamp_column) + .select(feature_set.columns) + ) + + target_df = feature_set_dates_output_dataframe.orderBy( + feature_set.timestamp_column + ).select(feature_set.columns) + + # assert + assert_dataframe_equality(output_df, target_df) diff --git a/tests/unit/butterfree/clients/conftest.py b/tests/unit/butterfree/clients/conftest.py index fda11f8ef..ffb2db881 100644 --- a/tests/unit/butterfree/clients/conftest.py +++ b/tests/unit/butterfree/clients/conftest.py @@ -46,11 +46,16 @@ def mocked_stream_df() -> Mock: return mock +@pytest.fixture() +def mock_spark_sql() -> Mock: + mock = Mock() + mock.sql = mock + return mock + + @pytest.fixture def cassandra_client() -> CassandraClient: - return CassandraClient( - cassandra_host=["mock"], cassandra_key_space="dummy_keyspace" - ) + return CassandraClient(host=["mock"], keyspace="dummy_keyspace") @pytest.fixture diff --git a/tests/unit/butterfree/clients/test_cassandra_client.py b/tests/unit/butterfree/clients/test_cassandra_client.py index 8785485be..aa52e6f83 100644 --- a/tests/unit/butterfree/clients/test_cassandra_client.py +++ b/tests/unit/butterfree/clients/test_cassandra_client.py @@ -15,9 +15,7 @@ def sanitize_string(query: str) -> str: class TestCassandraClient: def test_conn(self, cassandra_client: CassandraClient) -> None: # arrange - cassandra_client = CassandraClient( - cassandra_host=["mock"], cassandra_key_space="dummy_keyspace" - ) + cassandra_client = CassandraClient(host=["mock"], keyspace="dummy_keyspace") # act start_conn = cassandra_client._session diff --git a/tests/unit/butterfree/clients/test_spark_client.py b/tests/unit/butterfree/clients/test_spark_client.py index 58d53a401..9f6415062 100644 --- a/tests/unit/butterfree/clients/test_spark_client.py +++ b/tests/unit/butterfree/clients/test_spark_client.py @@ -1,4 +1,5 @@ -from typing import Any, Dict, Optional, Union +from datetime import datetime +from typing import Any, Optional, Union from unittest.mock import Mock import pytest @@ -26,19 +27,20 @@ def test_conn(self) -> None: assert start_conn is None @pytest.mark.parametrize( - "format, options, stream, schema", + "format, path, stream, schema, options", [ - ("parquet", {"path": "path/to/file"}, False, None), - ("csv", {"path": "path/to/file", "header": True}, False, None), - ("json", {"path": "path/to/file"}, True, None), + ("parquet", ["path/to/file"], False, None, {}), + ("csv", 
"path/to/file", False, None, {"header": True}), + ("json", "path/to/file", True, None, {}), ], ) def test_read( self, format: str, - options: Dict[str, Any], stream: bool, schema: Optional[StructType], + path: Any, + options: Any, target_df: DataFrame, mocked_spark_read: Mock, ) -> None: @@ -48,26 +50,25 @@ def test_read( spark_client._session = mocked_spark_read # act - result_df = spark_client.read(format, options, schema, stream) + result_df = spark_client.read( + format=format, schema=schema, stream=stream, path=path, **options + ) # assert mocked_spark_read.format.assert_called_once_with(format) - mocked_spark_read.options.assert_called_once_with(**options) + mocked_spark_read.load.assert_called_once_with(path, **options) assert target_df.collect() == result_df.collect() @pytest.mark.parametrize( - "format, options", - [(None, {"path": "path/to/file"}), ("csv", "not a valid options")], + "format, path", [(None, "path/to/file"), ("csv", 123)], ) - def test_read_invalid_params( - self, format: Optional[str], options: Union[Dict[str, Any], str] - ) -> None: + def test_read_invalid_params(self, format: Optional[str], path: Any) -> None: # arrange spark_client = SparkClient() # act and assert with pytest.raises(ValueError): - spark_client.read(format, options) # type: ignore + spark_client.read(format=format, path=path) # type: ignore def test_sql(self, target_df: DataFrame) -> None: # arrange @@ -252,3 +253,43 @@ def test_create_temporary_view( # assert assert_dataframe_equality(target_df, result_df) + + def test_add_table_partitions(self, mock_spark_sql: Mock): + # arrange + target_command = ( + f"ALTER TABLE `db`.`table` ADD IF NOT EXISTS " + f"PARTITION ( year = 2020, month = 8, day = 14 ) " + f"PARTITION ( year = 2020, month = 8, day = 15 ) " + f"PARTITION ( year = 2020, month = 8, day = 16 )" + ) + + spark_client = SparkClient() + spark_client._session = mock_spark_sql + partitions = [ + {"year": 2020, "month": 8, "day": 14}, + {"year": 2020, "month": 8, "day": 15}, + {"year": 2020, "month": 8, "day": 16}, + ] + + # act + spark_client.add_table_partitions(partitions, "table", "db") + + # assert + mock_spark_sql.assert_called_once_with(target_command) + + @pytest.mark.parametrize( + "partition", + [ + [{"float_partition": 2.72}], + [{123: 2020}], + [{"date": datetime(year=2020, month=8, day=18)}], + ], + ) + def test_add_invalid_partitions(self, mock_spark_sql: Mock, partition): + # arrange + spark_client = SparkClient() + spark_client._session = mock_spark_sql + + # act and assert + with pytest.raises(ValueError): + spark_client.add_table_partitions(partition, "table", "db") diff --git a/tests/unit/butterfree/dataframe_service/conftest.py b/tests/unit/butterfree/dataframe_service/conftest.py index 867bc80a3..09470c9a4 100644 --- a/tests/unit/butterfree/dataframe_service/conftest.py +++ b/tests/unit/butterfree/dataframe_service/conftest.py @@ -25,3 +25,17 @@ def input_df(spark_context, spark_session): return spark_session.read.json( spark_context.parallelize(data, 1), schema="timestamp timestamp" ) + + +@pytest.fixture() +def test_partitioning_input_df(spark_context, spark_session): + data = [ + {"feature": 1, "year": 2009, "month": 8, "day": 20}, + {"feature": 2, "year": 2009, "month": 8, "day": 20}, + {"feature": 3, "year": 2020, "month": 8, "day": 20}, + {"feature": 4, "year": 2020, "month": 9, "day": 20}, + {"feature": 5, "year": 2020, "month": 9, "day": 20}, + {"feature": 6, "year": 2020, "month": 8, "day": 20}, + {"feature": 7, "year": 2020, "month": 8, "day": 21}, + ] + return 
spark_session.read.json(spark_context.parallelize(data, 1)) diff --git a/tests/unit/butterfree/dataframe_service/test_incremental_srategy.py b/tests/unit/butterfree/dataframe_service/test_incremental_srategy.py new file mode 100644 index 000000000..a140ceb30 --- /dev/null +++ b/tests/unit/butterfree/dataframe_service/test_incremental_srategy.py @@ -0,0 +1,70 @@ +from butterfree.dataframe_service import IncrementalStrategy + + +class TestIncrementalStrategy: + def test_from_milliseconds(self): + # arrange + incremental_strategy = IncrementalStrategy().from_milliseconds("ts") + target_expression = "date(from_unixtime(ts/ 1000.0)) >= date('2020-01-01')" + + # act + result_expression = incremental_strategy.get_expression(start_date="2020-01-01") + + # assert + assert target_expression.split() == result_expression.split() + + def test_from_string(self): + # arrange + incremental_strategy = IncrementalStrategy().from_string( + "dt", mask="dd/MM/yyyy" + ) + target_expression = "date(to_date(dt, 'dd/MM/yyyy')) >= date('2020-01-01')" + + # act + result_expression = incremental_strategy.get_expression(start_date="2020-01-01") + + # assert + assert target_expression.split() == result_expression.split() + + def test_from_year_month_day_partitions(self): + # arrange + incremental_strategy = IncrementalStrategy().from_year_month_day_partitions( + year_column="y", month_column="m", day_column="d" + ) + target_expression = ( + "date(concat(string(y), " + "'-', string(m), " + "'-', string(d))) >= date('2020-01-01')" + ) + + # act + result_expression = incremental_strategy.get_expression(start_date="2020-01-01") + + # assert + assert target_expression.split() == result_expression.split() + + def test_get_expression_with_just_end_date(self): + # arrange + incremental_strategy = IncrementalStrategy(column="dt") + target_expression = "date(dt) <= date('2020-01-01')" + + # act + result_expression = incremental_strategy.get_expression(end_date="2020-01-01") + + # assert + assert target_expression.split() == result_expression.split() + + def test_get_expression_with_start_and_end_date(self): + # arrange + incremental_strategy = IncrementalStrategy(column="dt") + target_expression = ( + "date(dt) >= date('2019-12-30') and date(dt) <= date('2020-01-01')" + ) + + # act + result_expression = incremental_strategy.get_expression( + start_date="2019-12-30", end_date="2020-01-01" + ) + + # assert + assert target_expression.split() == result_expression.split() diff --git a/tests/unit/butterfree/dataframe_service/test_partitioning.py b/tests/unit/butterfree/dataframe_service/test_partitioning.py new file mode 100644 index 000000000..3a6b5b406 --- /dev/null +++ b/tests/unit/butterfree/dataframe_service/test_partitioning.py @@ -0,0 +1,20 @@ +from butterfree.dataframe_service import extract_partition_values + + +class TestPartitioning: + def test_extract_partition_values(self, test_partitioning_input_df): + # arrange + target_values = [ + {"year": 2009, "month": 8, "day": 20}, + {"year": 2020, "month": 8, "day": 20}, + {"year": 2020, "month": 9, "day": 20}, + {"year": 2020, "month": 8, "day": 21}, + ] + + # act + result_values = extract_partition_values( + test_partitioning_input_df, partition_columns=["year", "month", "day"] + ) + + # assert + assert result_values == target_values diff --git a/tests/unit/butterfree/extract/conftest.py b/tests/unit/butterfree/extract/conftest.py index ab6f525c7..3d0e763d3 100644 --- a/tests/unit/butterfree/extract/conftest.py +++ b/tests/unit/butterfree/extract/conftest.py @@ -1,6 +1,7 @@ 
from unittest.mock import Mock import pytest +from pyspark.sql.functions import col, to_date from butterfree.constants.columns import TIMESTAMP_COLUMN @@ -17,6 +18,60 @@ def target_df(spark_context, spark_session): return spark_session.read.json(spark_context.parallelize(data, 1)) +@pytest.fixture() +def incremental_source_df(spark_context, spark_session): + data = [ + { + "id": 1, + "feature": 100, + "date_str": "28/07/2020", + "milliseconds": 1595894400000, + "year": 2020, + "month": 7, + "day": 28, + }, + { + "id": 1, + "feature": 110, + "date_str": "29/07/2020", + "milliseconds": 1595980800000, + "year": 2020, + "month": 7, + "day": 29, + }, + { + "id": 1, + "feature": 120, + "date_str": "30/07/2020", + "milliseconds": 1596067200000, + "year": 2020, + "month": 7, + "day": 30, + }, + { + "id": 2, + "feature": 150, + "date_str": "31/07/2020", + "milliseconds": 1596153600000, + "year": 2020, + "month": 7, + "day": 31, + }, + { + "id": 2, + "feature": 200, + "date_str": "01/08/2020", + "milliseconds": 1596240000000, + "year": 2020, + "month": 8, + "day": 1, + }, + ] + return spark_session.read.json(spark_context.parallelize(data, 1)).withColumn( + "date", to_date(col("date_str"), "dd/MM/yyyy") + ) + + @pytest.fixture() def spark_client(): return Mock() diff --git a/tests/unit/butterfree/extract/readers/test_file_reader.py b/tests/unit/butterfree/extract/readers/test_file_reader.py index d337d4fef..9e1c42bce 100644 --- a/tests/unit/butterfree/extract/readers/test_file_reader.py +++ b/tests/unit/butterfree/extract/readers/test_file_reader.py @@ -36,11 +36,11 @@ def test_consume( # act output_df = file_reader.consume(spark_client) - options = dict({"path": path}, **format_options if format_options else {}) + options = dict(format_options if format_options else {}) # assert spark_client.read.assert_called_once_with( - format=format, options=options, schema=schema, stream=False + format=format, schema=schema, stream=False, path=path, **options ) assert target_df.collect() == output_df.collect() @@ -51,7 +51,7 @@ def test_consume_with_stream_without_schema(self, spark_client, target_df): schema = None format_options = None stream = True - options = dict({"path": path}) + options = dict({}) spark_client.read.return_value = target_df file_reader = FileReader( @@ -64,11 +64,11 @@ def test_consume_with_stream_without_schema(self, spark_client, target_df): # assert # assert call for schema infer - spark_client.read.assert_any_call(format=format, options=options) + spark_client.read.assert_any_call(format=format, path=path, **options) # assert call for stream read # stream spark_client.read.assert_called_with( - format=format, options=options, schema=output_df.schema, stream=stream + format=format, schema=output_df.schema, stream=stream, path=path, **options ) assert target_df.collect() == output_df.collect() diff --git a/tests/unit/butterfree/extract/readers/test_reader.py b/tests/unit/butterfree/extract/readers/test_reader.py index c210a756d..78160553f 100644 --- a/tests/unit/butterfree/extract/readers/test_reader.py +++ b/tests/unit/butterfree/extract/readers/test_reader.py @@ -1,7 +1,9 @@ import pytest from pyspark.sql.functions import expr +from butterfree.dataframe_service import IncrementalStrategy from butterfree.extract.readers import FileReader +from butterfree.testing.dataframe import assert_dataframe_equality def add_value_transformer(df, column, value): @@ -152,3 +154,59 @@ def test_build_with_columns( # assert assert column_target_df.collect() == result_df.collect() + + def 
test_build_with_incremental_strategy( + self, incremental_source_df, spark_client, spark_session + ): + # arrange + readers = [ + # directly from column + FileReader( + id="test_1", path="path/to/file", format="format" + ).with_incremental_strategy( + incremental_strategy=IncrementalStrategy(column="date") + ), + # from milliseconds + FileReader( + id="test_2", path="path/to/file", format="format" + ).with_incremental_strategy( + incremental_strategy=IncrementalStrategy().from_milliseconds( + column_name="milliseconds" + ) + ), + # from str + FileReader( + id="test_3", path="path/to/file", format="format" + ).with_incremental_strategy( + incremental_strategy=IncrementalStrategy().from_string( + column_name="date_str", mask="dd/MM/yyyy" + ) + ), + # from year, month, day partitions + FileReader( + id="test_4", path="path/to/file", format="format" + ).with_incremental_strategy( + incremental_strategy=( + IncrementalStrategy().from_year_month_day_partitions() + ) + ), + ] + + spark_client.read.return_value = incremental_source_df + target_df = incremental_source_df.where( + "date >= date('2020-07-29') and date <= date('2020-07-31')" + ) + + # act + for reader in readers: + reader.build( + client=spark_client, start_date="2020-07-29", end_date="2020-07-31" + ) + + output_dfs = [ + spark_session.table(f"test_{i + 1}") for i, _ in enumerate(readers) + ] + + # assert + for output_df in output_dfs: + assert_dataframe_equality(output_df=output_df, target_df=target_df) diff --git a/tests/unit/butterfree/hooks/__init__.py b/tests/unit/butterfree/hooks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/butterfree/hooks/schema_compatibility/__init__.py b/tests/unit/butterfree/hooks/schema_compatibility/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/butterfree/hooks/schema_compatibility/test_cassandra_table_schema_compatibility_hook.py b/tests/unit/butterfree/hooks/schema_compatibility/test_cassandra_table_schema_compatibility_hook.py new file mode 100644 index 000000000..eccb8d8cc --- /dev/null +++ b/tests/unit/butterfree/hooks/schema_compatibility/test_cassandra_table_schema_compatibility_hook.py @@ -0,0 +1,49 @@ +from unittest.mock import MagicMock + +import pytest + +from butterfree.clients import CassandraClient +from butterfree.hooks.schema_compatibility import CassandraTableSchemaCompatibilityHook + + +class TestCassandraTableSchemaCompatibilityHook: + def test_run_compatible_schema(self, spark_session): + cassandra_client = CassandraClient(host=["mock"], keyspace="dummy_keyspace") + + cassandra_client.sql = MagicMock( # type: ignore + return_value=[ + {"column_name": "feature1", "type": "text"}, + {"column_name": "feature2", "type": "int"}, + ] + ) + + table = "table" + + input_dataframe = spark_session.sql("select 'abc' as feature1, 1 as feature2") + + hook = CassandraTableSchemaCompatibilityHook(cassandra_client, table) + + # act and assert + assert hook.run(input_dataframe) == input_dataframe + + def test_run_incompatible_schema(self, spark_session): + cassandra_client = CassandraClient(host=["mock"], keyspace="dummy_keyspace") + + cassandra_client.sql = MagicMock( # type: ignore + return_value=[ + {"column_name": "feature1", "type": "text"}, + {"column_name": "feature2", "type": "bigint"}, + ] + ) + + table = "table" + + input_dataframe = spark_session.sql("select 'abc' as feature1, 1 as feature2") + + hook = CassandraTableSchemaCompatibilityHook(cassandra_client, table) + + # act and assert + with pytest.raises( + 
ValueError, match="There's a schema incompatibility between" + ): + hook.run(input_dataframe) diff --git a/tests/unit/butterfree/hooks/schema_compatibility/test_spark_table_schema_compatibility_hook.py b/tests/unit/butterfree/hooks/schema_compatibility/test_spark_table_schema_compatibility_hook.py new file mode 100644 index 000000000..3a31b600c --- /dev/null +++ b/tests/unit/butterfree/hooks/schema_compatibility/test_spark_table_schema_compatibility_hook.py @@ -0,0 +1,53 @@ +import pytest + +from butterfree.clients import SparkClient +from butterfree.hooks.schema_compatibility import SparkTableSchemaCompatibilityHook + + +class TestSparkTableSchemaCompatibilityHook: + @pytest.mark.parametrize( + "table, database, target_table_expression", + [("table", "database", "`database`.`table`"), ("table", None, "`table`")], + ) + def test_build_table_expression(self, table, database, target_table_expression): + # arrange + spark_client = SparkClient() + + # act + result_table_expression = SparkTableSchemaCompatibilityHook( + spark_client, table, database + ).table_expression + + # assert + assert target_table_expression == result_table_expression + + def test_run_compatible_schema(self, spark_session): + # arrange + spark_client = SparkClient() + target_table = spark_session.sql( + "select 1 as feature_a, 'abc' as feature_b, true as other_feature" + ) + input_dataframe = spark_session.sql("select 1 as feature_a, 'abc' as feature_b") + target_table.registerTempTable("test") + + hook = SparkTableSchemaCompatibilityHook(spark_client, "test") + + # act and assert + assert hook.run(input_dataframe) == input_dataframe + + def test_run_incompatible_schema(self, spark_session): + # arrange + spark_client = SparkClient() + target_table = spark_session.sql( + "select 1 as feature_a, 'abc' as feature_b, true as other_feature" + ) + input_dataframe = spark_session.sql( + "select 1 as feature_a, 'abc' as feature_b, true as unregisted_column" + ) + target_table.registerTempTable("test") + + hook = SparkTableSchemaCompatibilityHook(spark_client, "test") + + # act and assert + with pytest.raises(ValueError, match="The dataframe has a schema incompatible"): + hook.run(input_dataframe) diff --git a/tests/unit/butterfree/hooks/test_hookable_component.py b/tests/unit/butterfree/hooks/test_hookable_component.py new file mode 100644 index 000000000..37e34e691 --- /dev/null +++ b/tests/unit/butterfree/hooks/test_hookable_component.py @@ -0,0 +1,107 @@ +import pytest +from pyspark.sql.functions import expr + +from butterfree.hooks import Hook, HookableComponent +from butterfree.testing.dataframe import assert_dataframe_equality + + +class TestComponent(HookableComponent): + def construct(self, dataframe): + pre_hook_df = self.run_pre_hooks(dataframe) + construct_df = pre_hook_df.withColumn("feature", expr("feature * feature")) + return self.run_post_hooks(construct_df) + + +class AddHook(Hook): + def __init__(self, value): + self.value = value + + def run(self, dataframe): + return dataframe.withColumn("feature", expr(f"feature + {self.value}")) + + +class TestHookableComponent: + def test_add_hooks(self): + # arrange + hook1 = AddHook(value=1) + hook2 = AddHook(value=2) + hook3 = AddHook(value=3) + hook4 = AddHook(value=4) + hookable_component = HookableComponent() + + # act + hookable_component.add_pre_hook(hook1, hook2) + hookable_component.add_post_hook(hook3, hook4) + + # assert + assert hookable_component.pre_hooks == [hook1, hook2] + assert hookable_component.post_hooks == [hook3, hook4] + + 
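The hook protocol exercised in this test module is small: a Hook subclass implements run(dataframe), and a HookableComponent applies its pre-hooks before its own construct logic and its post-hooks after it. A minimal usage sketch using only names defined in this test module; spark_session is assumed to be the usual pytest fixture:

# With AddHook(value=1) as both pre- and post-hook around TestComponent.construct
# (which squares the feature column): 2 -> 3 (pre-hook) -> 9 (square) -> 10 (post-hook).
input_dataframe = spark_session.sql("select 2 as feature")
component = TestComponent().add_pre_hook(AddHook(value=1)).add_post_hook(AddHook(value=1))
output_df = component.construct(input_dataframe)  # feature == 10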
@pytest.mark.parametrize( + "enable_pre_hooks, enable_post_hooks", + [("not boolean", False), (False, "not boolean")], + ) + def test_invalid_enable_hook(self, enable_pre_hooks, enable_post_hooks): + # arrange + hookable_component = HookableComponent() + + # act and assert + with pytest.raises(ValueError): + hookable_component.enable_pre_hooks = enable_pre_hooks + hookable_component.enable_post_hooks = enable_post_hooks + + @pytest.mark.parametrize( + "pre_hooks, post_hooks", + [ + ([AddHook(1)], "not a list of hooks"), + ([AddHook(1)], [AddHook(1), 2, 3]), + ("not a list of hooks", [AddHook(1)]), + ([AddHook(1), 2, 3], [AddHook(1)]), + ], + ) + def test_invalid_hooks(self, pre_hooks, post_hooks): + # arrange + hookable_component = HookableComponent() + + # act and assert + with pytest.raises(ValueError): + hookable_component.pre_hooks = pre_hooks + hookable_component.post_hooks = post_hooks + + @pytest.mark.parametrize( + "pre_hook, enable_pre_hooks, post_hook, enable_post_hooks", + [ + (AddHook(value=1), False, AddHook(value=1), True), + (AddHook(value=1), True, AddHook(value=1), False), + ("not a pre-hook", True, AddHook(value=1), True), + (AddHook(value=1), True, "not a pre-hook", True), + ], + ) + def test_add_invalid_hooks( + self, pre_hook, enable_pre_hooks, post_hook, enable_post_hooks + ): + # arrange + hookable_component = HookableComponent() + hookable_component.enable_pre_hooks = enable_pre_hooks + hookable_component.enable_post_hooks = enable_post_hooks + + # act and assert + with pytest.raises(ValueError): + hookable_component.add_pre_hook(pre_hook) + hookable_component.add_post_hook(post_hook) + + def test_run_hooks(self, spark_session): + # arrange + input_dataframe = spark_session.sql("select 2 as feature") + test_component = ( + TestComponent() + .add_pre_hook(AddHook(value=1)) + .add_post_hook(AddHook(value=1)) + ) + target_table = spark_session.sql("select 10 as feature") + + # act + output_df = test_component.construct(input_dataframe) + + # assert + assert_dataframe_equality(output_df, target_table) diff --git a/tests/unit/butterfree/load/conftest.py b/tests/unit/butterfree/load/conftest.py index 7c2549c58..4dcf25c94 100644 --- a/tests/unit/butterfree/load/conftest.py +++ b/tests/unit/butterfree/load/conftest.py @@ -32,6 +32,31 @@ def feature_set(): ) +@fixture +def feature_set_incremental(): + key_features = [ + KeyFeature(name="id", description="Description", dtype=DataType.INTEGER) + ] + ts_feature = TimestampFeature(from_column=TIMESTAMP_COLUMN) + features = [ + Feature( + name="feature", + description="test", + transformation=AggregatedTransform( + functions=[Function(functions.sum, DataType.INTEGER)] + ), + ), + ] + return AggregatedFeatureSet( + "feature_set", + "entity", + "description", + keys=key_features, + timestamp=ts_feature, + features=features, + ) + + @fixture def feature_set_dataframe(spark_context, spark_session): data = [ diff --git a/tests/unit/butterfree/load/test_sink.py b/tests/unit/butterfree/load/test_sink.py index 93b5e2797..ef377f67b 100644 --- a/tests/unit/butterfree/load/test_sink.py +++ b/tests/unit/butterfree/load/test_sink.py @@ -120,7 +120,7 @@ def test_flush_with_writers_list_empty(self): with pytest.raises(ValueError): Sink(writers=writer) - def test_flush_streaming_df(self, feature_set): + def test_flush_streaming_df(self, feature_set, mocker): """Testing the return of the streaming handlers by the sink.""" # arrange spark_client = SparkClient() @@ -136,10 +136,25 @@ def test_flush_streaming_df(self, feature_set): 
mocked_stream_df.start.return_value = Mock(spec=StreamingQuery) online_feature_store_writer = OnlineFeatureStoreWriter() + + online_feature_store_writer.check_schema_hook = mocker.stub("check_schema_hook") + online_feature_store_writer.check_schema_hook.run = mocker.stub("run") + online_feature_store_writer.check_schema_hook.run.return_value = ( + mocked_stream_df + ) + online_feature_store_writer_on_entity = OnlineFeatureStoreWriter( write_to_entity=True ) + online_feature_store_writer_on_entity.check_schema_hook = mocker.stub( + "check_schema_hook" + ) + online_feature_store_writer_on_entity.check_schema_hook.run = mocker.stub("run") + online_feature_store_writer_on_entity.check_schema_hook.run.return_value = ( + mocked_stream_df + ) + sink = Sink( writers=[ online_feature_store_writer, @@ -162,7 +177,7 @@ def test_flush_streaming_df(self, feature_set): assert isinstance(handler, StreamingQuery) def test_flush_with_multiple_online_writers( - self, feature_set, feature_set_dataframe + self, feature_set, feature_set_dataframe, mocker ): """Testing the flow of writing to a feature-set table and to an entity table.""" # arrange @@ -173,10 +188,25 @@ def test_flush_with_multiple_online_writers( feature_set.name = "my_feature_set" online_feature_store_writer = OnlineFeatureStoreWriter() + + online_feature_store_writer.check_schema_hook = mocker.stub("check_schema_hook") + online_feature_store_writer.check_schema_hook.run = mocker.stub("run") + online_feature_store_writer.check_schema_hook.run.return_value = ( + feature_set_dataframe + ) + online_feature_store_writer_on_entity = OnlineFeatureStoreWriter( write_to_entity=True ) + online_feature_store_writer_on_entity.check_schema_hook = mocker.stub( + "check_schema_hook" + ) + online_feature_store_writer_on_entity.check_schema_hook.run = mocker.stub("run") + online_feature_store_writer_on_entity.check_schema_hook.run.return_value = ( + feature_set_dataframe + ) + sink = Sink( writers=[online_feature_store_writer, online_feature_store_writer_on_entity] ) diff --git a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py index 14c067f92..aac806f7a 100644 --- a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py +++ b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py @@ -19,10 +19,15 @@ def test_write( feature_set, ): # given - spark_client = mocker.stub("spark_client") + spark_client = SparkClient() spark_client.write_table = mocker.stub("write_table") writer = HistoricalFeatureStoreWriter() + schema_dataframe = writer._create_partitions(feature_set_dataframe) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = schema_dataframe + # when writer.write( feature_set=feature_set, @@ -41,7 +46,76 @@ def test_write( assert ( writer.PARTITION_BY == spark_client.write_table.call_args[1]["partition_by"] ) - assert feature_set.name == spark_client.write_table.call_args[1]["table_name"] + + def test_write_interval_mode( + self, + feature_set_dataframe, + historical_feature_set_dataframe, + mocker, + feature_set, + ): + # given + spark_client = SparkClient() + spark_client.write_dataframe = mocker.stub("write_dataframe") + spark_client.conn.conf.set( + "spark.sql.sources.partitionOverwriteMode", "dynamic" + ) + writer = HistoricalFeatureStoreWriter(interval_mode=True) + + schema_dataframe = 
writer._create_partitions(feature_set_dataframe) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = schema_dataframe + + # when + writer.write( + feature_set=feature_set, + dataframe=feature_set_dataframe, + spark_client=spark_client, + ) + result_df = spark_client.write_dataframe.call_args[1]["dataframe"] + + # then + assert_dataframe_equality(historical_feature_set_dataframe, result_df) + + assert ( + writer.db_config.format_ + == spark_client.write_dataframe.call_args[1]["format_"] + ) + assert ( + writer.db_config.mode == spark_client.write_dataframe.call_args[1]["mode"] + ) + assert ( + writer.PARTITION_BY + == spark_client.write_dataframe.call_args[1]["partitionBy"] + ) + + def test_write_interval_mode_invalid_partition_mode( + self, + feature_set_dataframe, + historical_feature_set_dataframe, + mocker, + feature_set, + ): + # given + spark_client = SparkClient() + spark_client.write_dataframe = mocker.stub("write_dataframe") + spark_client.conn.conf.set("spark.sql.sources.partitionOverwriteMode", "static") + + writer = HistoricalFeatureStoreWriter(interval_mode=True) + + schema_dataframe = writer._create_partitions(feature_set_dataframe) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = schema_dataframe + + # when + with pytest.raises(RuntimeError): + _ = writer.write( + feature_set=feature_set, + dataframe=feature_set_dataframe, + spark_client=spark_client, + ) def test_write_in_debug_mode( self, @@ -49,6 +123,7 @@ def test_write_in_debug_mode( historical_feature_set_dataframe, feature_set, spark_session, + mocker, ): # given spark_client = SparkClient() @@ -65,33 +140,75 @@ def test_write_in_debug_mode( # then assert_dataframe_equality(historical_feature_set_dataframe, result_df) - def test_validate(self, feature_set_dataframe, mocker, feature_set): + def test_write_in_debug_mode_with_interval_mode( + self, + feature_set_dataframe, + historical_feature_set_dataframe, + feature_set, + spark_session, + ): + # given + spark_client = SparkClient() + writer = HistoricalFeatureStoreWriter(debug_mode=True, interval_mode=True) + + # when + writer.write( + feature_set=feature_set, + dataframe=feature_set_dataframe, + spark_client=spark_client, + ) + result_df = spark_session.table(f"historical_feature_store__{feature_set.name}") + + # then + assert_dataframe_equality(historical_feature_set_dataframe, result_df) + + def test_validate(self, historical_feature_set_dataframe, mocker, feature_set): # given spark_client = mocker.stub("spark_client") spark_client.read_table = mocker.stub("read_table") - spark_client.read_table.return_value = feature_set_dataframe + spark_client.read_table.return_value = historical_feature_set_dataframe writer = HistoricalFeatureStoreWriter() # when - writer.validate(feature_set, feature_set_dataframe, spark_client) + writer.validate(feature_set, historical_feature_set_dataframe, spark_client) # then spark_client.read_table.assert_called_once() - def test_validate_false(self, feature_set_dataframe, mocker, feature_set): + def test_validate_interval_mode( + self, historical_feature_set_dataframe, mocker, feature_set + ): # given spark_client = mocker.stub("spark_client") - spark_client.read_table = mocker.stub("read_table") + spark_client.read = mocker.stub("read") + spark_client.read.return_value = historical_feature_set_dataframe + + writer = 
HistoricalFeatureStoreWriter(interval_mode=True) + + # when + writer.validate(feature_set, historical_feature_set_dataframe, spark_client) + + # then + spark_client.read.assert_called_once() + + def test_validate_false( + self, historical_feature_set_dataframe, mocker, feature_set + ): + # given + spark_client = mocker.stub("spark_client") + spark_client.read = mocker.stub("read") # limiting df to 1 row, now the counts should'n be the same - spark_client.read_table.return_value = feature_set_dataframe.limit(1) + spark_client.read.return_value = historical_feature_set_dataframe.limit(1) - writer = HistoricalFeatureStoreWriter() + writer = HistoricalFeatureStoreWriter(interval_mode=True) # when with pytest.raises(AssertionError): - _ = writer.validate(feature_set, feature_set_dataframe, spark_client) + _ = writer.validate( + feature_set, historical_feature_set_dataframe, spark_client + ) def test__create_partitions(self, spark_session, spark_context): # arrange @@ -201,8 +318,15 @@ def test_write_with_transform( # given spark_client = mocker.stub("spark_client") spark_client.write_table = mocker.stub("write_table") + writer = HistoricalFeatureStoreWriter().with_(json_transform) + schema_dataframe = writer._create_partitions(feature_set_dataframe) + json_dataframe = writer._apply_transformations(schema_dataframe) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = json_dataframe + # when writer.write( feature_set=feature_set, diff --git a/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py b/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py index 87823c552..384ec1527 100644 --- a/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py +++ b/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py @@ -68,6 +68,10 @@ def test_write( spark_client.write_dataframe = mocker.stub("write_dataframe") writer = OnlineFeatureStoreWriter(cassandra_config) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = feature_set_dataframe + # when writer.write(feature_set, feature_set_dataframe, spark_client) @@ -94,11 +98,16 @@ def test_write_in_debug_mode( latest_feature_set_dataframe, feature_set, spark_session, + mocker, ): # given spark_client = SparkClient() writer = OnlineFeatureStoreWriter(debug_mode=True) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = feature_set_dataframe + # when writer.write( feature_set=feature_set, @@ -110,9 +119,7 @@ def test_write_in_debug_mode( # then assert_dataframe_equality(latest_feature_set_dataframe, result_df) - def test_write_in_debug_and_stream_mode( - self, feature_set, spark_session, - ): + def test_write_in_debug_and_stream_mode(self, feature_set, spark_session, mocker): # arrange spark_client = SparkClient() @@ -125,6 +132,10 @@ def test_write_in_debug_and_stream_mode( writer = OnlineFeatureStoreWriter(debug_mode=True) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = mocked_stream_df + # act handler = writer.write( feature_set=feature_set, @@ -140,7 +151,7 @@ def test_write_in_debug_and_stream_mode( assert isinstance(handler, StreamingQuery) 
@pytest.mark.parametrize("has_checkpoint", [True, False]) - def test_write_stream(self, feature_set, has_checkpoint, monkeypatch): + def test_write_stream(self, feature_set, has_checkpoint, monkeypatch, mocker): # arrange spark_client = SparkClient() spark_client.write_stream = Mock() @@ -163,6 +174,10 @@ def test_write_stream(self, feature_set, has_checkpoint, monkeypatch): writer = OnlineFeatureStoreWriter(cassandra_config) writer.filter_latest = Mock() + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = dataframe + # act stream_handler = writer.write(feature_set, dataframe, spark_client) @@ -186,7 +201,7 @@ def test_get_db_schema(self, cassandra_config, test_feature_set, expected_schema assert schema == expected_schema - def test_write_stream_on_entity(self, feature_set, monkeypatch): + def test_write_stream_on_entity(self, feature_set, monkeypatch, mocker): """Test write method with stream dataframe and write_to_entity enabled. The main purpose of this test is assert the correct setup of stream checkpoint @@ -209,6 +224,10 @@ def test_write_stream_on_entity(self, feature_set, monkeypatch): writer = OnlineFeatureStoreWriter(write_to_entity=True) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = dataframe + # act stream_handler = writer.write(feature_set, dataframe, spark_client) @@ -237,6 +256,10 @@ def test_write_with_transform( spark_client.write_dataframe = mocker.stub("write_dataframe") writer = OnlineFeatureStoreWriter(cassandra_config).with_(json_transform) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = feature_set_dataframe + # when writer.write(feature_set, feature_set_dataframe, spark_client) @@ -270,6 +293,10 @@ def test_write_with_kafka_config( kafka_config = KafkaConfig() writer = OnlineFeatureStoreWriter(kafka_config).with_(json_transform) + writer.check_schema_hook = mocker.stub("check_schema_hook") + writer.check_schema_hook.run = mocker.stub("run") + writer.check_schema_hook.run.return_value = feature_set_dataframe + # when writer.write(feature_set, feature_set_dataframe, spark_client) @@ -293,6 +320,10 @@ def test_write_with_custom_kafka_config( json_transform ) + custom_writer.check_schema_hook = mocker.stub("check_schema_hook") + custom_writer.check_schema_hook.run = mocker.stub("run") + custom_writer.check_schema_hook.run.return_value = feature_set_dataframe + # when custom_writer.write(feature_set, feature_set_dataframe, spark_client) diff --git a/tests/unit/butterfree/pipelines/conftest.py b/tests/unit/butterfree/pipelines/conftest.py new file mode 100644 index 000000000..47e65efb7 --- /dev/null +++ b/tests/unit/butterfree/pipelines/conftest.py @@ -0,0 +1,63 @@ +from unittest.mock import Mock + +from pyspark.sql import functions +from pytest import fixture + +from butterfree.clients import SparkClient +from butterfree.constants import DataType +from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.extract import Source +from butterfree.extract.readers import TableReader +from butterfree.load import Sink +from butterfree.load.writers import HistoricalFeatureStoreWriter +from butterfree.pipelines import FeatureSetPipeline +from butterfree.transform import FeatureSet +from butterfree.transform.features import Feature, 
KeyFeature, TimestampFeature +from butterfree.transform.transformations import SparkFunctionTransform +from butterfree.transform.utils import Function + + +@fixture() +def feature_set_pipeline(): + test_pipeline = FeatureSetPipeline( + spark_client=SparkClient(), + source=Mock( + spec=Source, + readers=[TableReader(id="source_a", database="db", table="table",)], + query="select * from source_a", + ), + feature_set=Mock( + spec=FeatureSet, + name="feature_set", + entity="entity", + description="description", + keys=[ + KeyFeature( + name="user_id", + description="The user's Main ID or device ID", + dtype=DataType.INTEGER, + ) + ], + timestamp=TimestampFeature(from_column="ts"), + features=[ + Feature( + name="listing_page_viewed__rent_per_month", + description="Average of something.", + transformation=SparkFunctionTransform( + functions=[ + Function(functions.avg, DataType.FLOAT), + Function(functions.stddev_pop, DataType.FLOAT), + ], + ).with_window( + partition_by="user_id", + order_by=TIMESTAMP_COLUMN, + window_definition=["7 days", "2 weeks"], + mode="fixed_windows", + ), + ), + ], + ), + sink=Mock(spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)],), + ) + + return test_pipeline diff --git a/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py b/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py index 1bc3c7071..7bae6606b 100644 --- a/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py @@ -17,12 +17,8 @@ from butterfree.load.writers.writer import Writer from butterfree.pipelines.feature_set_pipeline import FeatureSetPipeline from butterfree.transform import FeatureSet -from butterfree.transform.aggregated_feature_set import AggregatedFeatureSet from butterfree.transform.features import Feature, KeyFeature, TimestampFeature -from butterfree.transform.transformations import ( - AggregatedTransform, - SparkFunctionTransform, -) +from butterfree.transform.transformations import SparkFunctionTransform from butterfree.transform.utils import Function @@ -104,115 +100,29 @@ def test_feature_set_args(self): assert len(pipeline.sink.writers) == 2 assert all(isinstance(writer, Writer) for writer in pipeline.sink.writers) - def test_run(self, spark_session): - test_pipeline = FeatureSetPipeline( - spark_client=SparkClient(), - source=Mock( - spec=Source, - readers=[TableReader(id="source_a", database="db", table="table",)], - query="select * from source_a", - ), - feature_set=Mock( - spec=FeatureSet, - name="feature_set", - entity="entity", - description="description", - keys=[ - KeyFeature( - name="user_id", - description="The user's Main ID or device ID", - dtype=DataType.INTEGER, - ) - ], - timestamp=TimestampFeature(from_column="ts"), - features=[ - Feature( - name="listing_page_viewed__rent_per_month", - description="Average of something.", - transformation=SparkFunctionTransform( - functions=[ - Function(functions.avg, DataType.FLOAT), - Function(functions.stddev_pop, DataType.FLOAT), - ], - ).with_window( - partition_by="user_id", - order_by=TIMESTAMP_COLUMN, - window_definition=["7 days", "2 weeks"], - mode="fixed_windows", - ), - ), - ], - ), - sink=Mock( - spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)], - ), - ) - + def test_run(self, spark_session, feature_set_pipeline): # feature_set need to return a real df for streaming validation sample_df = spark_session.createDataFrame([{"a": "x", "b": "y", "c": "3"}]) - test_pipeline.feature_set.construct.return_value = 
sample_df + feature_set_pipeline.feature_set.construct.return_value = sample_df - test_pipeline.run() + feature_set_pipeline.run() - test_pipeline.source.construct.assert_called_once() - test_pipeline.feature_set.construct.assert_called_once() - test_pipeline.sink.flush.assert_called_once() - test_pipeline.sink.validate.assert_called_once() - - def test_run_with_repartition(self, spark_session): - test_pipeline = FeatureSetPipeline( - spark_client=SparkClient(), - source=Mock( - spec=Source, - readers=[TableReader(id="source_a", database="db", table="table",)], - query="select * from source_a", - ), - feature_set=Mock( - spec=FeatureSet, - name="feature_set", - entity="entity", - description="description", - keys=[ - KeyFeature( - name="user_id", - description="The user's Main ID or device ID", - dtype=DataType.INTEGER, - ) - ], - timestamp=TimestampFeature(from_column="ts"), - features=[ - Feature( - name="listing_page_viewed__rent_per_month", - description="Average of something.", - transformation=SparkFunctionTransform( - functions=[ - Function(functions.avg, DataType.FLOAT), - Function(functions.stddev_pop, DataType.FLOAT), - ], - ).with_window( - partition_by="user_id", - order_by=TIMESTAMP_COLUMN, - window_definition=["7 days", "2 weeks"], - mode="fixed_windows", - ), - ), - ], - ), - sink=Mock( - spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)], - ), - ) + feature_set_pipeline.source.construct.assert_called_once() + feature_set_pipeline.feature_set.construct.assert_called_once() + feature_set_pipeline.sink.flush.assert_called_once() + feature_set_pipeline.sink.validate.assert_called_once() + def test_run_with_repartition(self, spark_session, feature_set_pipeline): # feature_set need to return a real df for streaming validation sample_df = spark_session.createDataFrame([{"a": "x", "b": "y", "c": "3"}]) - test_pipeline.feature_set.construct.return_value = sample_df + feature_set_pipeline.feature_set.construct.return_value = sample_df - test_pipeline.run(partition_by=["id"]) + feature_set_pipeline.run(partition_by=["id"]) - test_pipeline.source.construct.assert_called_once() - test_pipeline.feature_set.construct.assert_called_once() - test_pipeline.sink.flush.assert_called_once() - test_pipeline.sink.validate.assert_called_once() + feature_set_pipeline.source.construct.assert_called_once() + feature_set_pipeline.feature_set.construct.assert_called_once() + feature_set_pipeline.sink.flush.assert_called_once() + feature_set_pipeline.sink.validate.assert_called_once() def test_source_raise(self): with pytest.raises(ValueError, match="source must be a Source instance"): @@ -343,52 +253,26 @@ def test_sink_raise(self): sink=Mock(writers=[HistoricalFeatureStoreWriter(db_config=None)],), ) - def test_run_agg_with_end_date(self, spark_session): - test_pipeline = FeatureSetPipeline( - spark_client=SparkClient(), - source=Mock( - spec=Source, - readers=[TableReader(id="source_a", database="db", table="table",)], - query="select * from source_a", - ), - feature_set=Mock( - spec=AggregatedFeatureSet, - name="feature_set", - entity="entity", - description="description", - keys=[ - KeyFeature( - name="user_id", - description="The user's Main ID or device ID", - dtype=DataType.INTEGER, - ) - ], - timestamp=TimestampFeature(from_column="ts"), - features=[ - Feature( - name="listing_page_viewed__rent_per_month", - description="Average of something.", - transformation=AggregatedTransform( - functions=[ - Function(functions.avg, DataType.FLOAT), - Function(functions.stddev_pop, 
DataType.FLOAT), - ], - ), - ), - ], - ), - sink=Mock( - spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)], - ), - ) + def test_run_agg_with_end_date(self, spark_session, feature_set_pipeline): + # feature_set need to return a real df for streaming validation + sample_df = spark_session.createDataFrame([{"a": "x", "b": "y", "c": "3"}]) + feature_set_pipeline.feature_set.construct.return_value = sample_df + + feature_set_pipeline.run(end_date="2016-04-18") + + feature_set_pipeline.source.construct.assert_called_once() + feature_set_pipeline.feature_set.construct.assert_called_once() + feature_set_pipeline.sink.flush.assert_called_once() + feature_set_pipeline.sink.validate.assert_called_once() + def test_run_agg_with_start_date(self, spark_session, feature_set_pipeline): # feature_set need to return a real df for streaming validation sample_df = spark_session.createDataFrame([{"a": "x", "b": "y", "c": "3"}]) - test_pipeline.feature_set.construct.return_value = sample_df + feature_set_pipeline.feature_set.construct.return_value = sample_df - test_pipeline.run(end_date="2016-04-18") + feature_set_pipeline.run(start_date="2020-08-04") - test_pipeline.source.construct.assert_called_once() - test_pipeline.feature_set.construct.assert_called_once() - test_pipeline.sink.flush.assert_called_once() - test_pipeline.sink.validate.assert_called_once() + feature_set_pipeline.source.construct.assert_called_once() + feature_set_pipeline.feature_set.construct.assert_called_once() + feature_set_pipeline.sink.flush.assert_called_once() + feature_set_pipeline.sink.validate.assert_called_once() diff --git a/tests/unit/butterfree/transform/conftest.py b/tests/unit/butterfree/transform/conftest.py index 2d7d3e50c..febc8bbc2 100644 --- a/tests/unit/butterfree/transform/conftest.py +++ b/tests/unit/butterfree/transform/conftest.py @@ -1,11 +1,19 @@ import json from unittest.mock import Mock +from pyspark.sql import functions from pytest import fixture from butterfree.constants import DataType from butterfree.constants.columns import TIMESTAMP_COLUMN +from butterfree.transform import FeatureSet +from butterfree.transform.aggregated_feature_set import AggregatedFeatureSet from butterfree.transform.features import Feature, KeyFeature, TimestampFeature +from butterfree.transform.transformations import ( + AggregatedTransform, + SparkFunctionTransform, +) +from butterfree.transform.utils import Function def make_dataframe(spark_context, spark_session): @@ -297,3 +305,77 @@ def key_id(): @fixture def timestamp_c(): return TimestampFeature() + + +@fixture +def feature_set(): + feature_set = FeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature( + name="feature1", + description="test", + transformation=SparkFunctionTransform( + functions=[ + Function(functions.avg, DataType.FLOAT), + Function(functions.stddev_pop, DataType.DOUBLE), + ] + ).with_window( + partition_by="id", + order_by=TIMESTAMP_COLUMN, + mode="fixed_windows", + window_definition=["2 minutes", "15 minutes"], + ), + ), + ], + keys=[ + KeyFeature( + name="id", + description="The user's Main ID or device ID", + dtype=DataType.BIGINT, + ) + ], + timestamp=TimestampFeature(), + ) + + return feature_set + + +@fixture +def agg_feature_set(): + feature_set = AggregatedFeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature( + name="feature1", + description="test", + transformation=AggregatedTransform( + functions=[ + Function(functions.avg, 
DataType.DOUBLE), + Function(functions.stddev_pop, DataType.FLOAT), + ], + ), + ), + Feature( + name="feature2", + description="test", + transformation=AggregatedTransform( + functions=[Function(functions.count, DataType.ARRAY_STRING)] + ), + ), + ], + keys=[ + KeyFeature( + name="id", + description="The user's Main ID or device ID", + dtype=DataType.BIGINT, + ) + ], + timestamp=TimestampFeature(), + ).with_windows(definitions=["1 week", "2 days"]) + + return feature_set diff --git a/tests/unit/butterfree/transform/test_aggregated_feature_set.py b/tests/unit/butterfree/transform/test_aggregated_feature_set.py index 2c404feab..8025d6f82 100644 --- a/tests/unit/butterfree/transform/test_aggregated_feature_set.py +++ b/tests/unit/butterfree/transform/test_aggregated_feature_set.py @@ -89,7 +89,7 @@ def test_agg_feature_set_with_window( output_df = fs.construct(dataframe, spark_client, end_date="2016-05-01") assert_dataframe_equality(output_df, rolling_windows_agg_dataframe) - def test_get_schema(self): + def test_get_schema(self, agg_feature_set): expected_schema = [ {"column_name": "id", "type": LongType(), "primary_key": True}, {"column_name": "timestamp", "type": TimestampType(), "primary_key": False}, @@ -125,40 +125,7 @@ def test_get_schema(self): }, ] - feature_set = AggregatedFeatureSet( - name="feature_set", - entity="entity", - description="description", - features=[ - Feature( - name="feature1", - description="test", - transformation=AggregatedTransform( - functions=[ - Function(functions.avg, DataType.DOUBLE), - Function(functions.stddev_pop, DataType.FLOAT), - ], - ), - ), - Feature( - name="feature2", - description="test", - transformation=AggregatedTransform( - functions=[Function(functions.count, DataType.ARRAY_STRING)] - ), - ), - ], - keys=[ - KeyFeature( - name="id", - description="The user's Main ID or device ID", - dtype=DataType.BIGINT, - ) - ], - timestamp=TimestampFeature(), - ).with_windows(definitions=["1 week", "2 days"]) - - schema = feature_set.get_schema() + schema = agg_feature_set.get_schema() assert schema == expected_schema @@ -389,3 +356,34 @@ def test_feature_transform_with_data_type_array(self, spark_context, spark_sessi # assert assert_dataframe_equality(target_df, output_df) + + def test_define_start_date(self, agg_feature_set): + start_date = agg_feature_set.define_start_date("2020-08-04") + + assert isinstance(start_date, str) + assert start_date == "2020-07-27" + + def test_feature_set_start_date( + self, timestamp_c, feature_set_with_distinct_dataframe, + ): + fs = AggregatedFeatureSet( + name="name", + entity="entity", + description="description", + features=[ + Feature( + name="feature", + description="test", + transformation=AggregatedTransform( + functions=[Function(functions.sum, DataType.INTEGER)] + ), + ), + ], + keys=[KeyFeature(name="h3", description="test", dtype=DataType.STRING)], + timestamp=timestamp_c, + ).with_windows(["10 days", "3 weeks", "90 days"]) + + # assert + start_date = fs.define_start_date("2016-04-14") + + assert start_date == "2016-01-14" diff --git a/tests/unit/butterfree/transform/test_feature_set.py b/tests/unit/butterfree/transform/test_feature_set.py index bdb1ff7d4..43d937bec 100644 --- a/tests/unit/butterfree/transform/test_feature_set.py +++ b/tests/unit/butterfree/transform/test_feature_set.py @@ -12,13 +12,11 @@ from butterfree.clients import SparkClient from butterfree.constants import DataType -from butterfree.constants.columns import TIMESTAMP_COLUMN from butterfree.testing.dataframe import 
assert_dataframe_equality from butterfree.transform import FeatureSet -from butterfree.transform.features import Feature, KeyFeature, TimestampFeature +from butterfree.transform.features import Feature from butterfree.transform.transformations import ( AggregatedTransform, - SparkFunctionTransform, SQLExpressionTransform, ) from butterfree.transform.utils import Function @@ -341,7 +339,7 @@ def test_feature_set_with_invalid_feature(self, key_id, timestamp_c, dataframe): timestamp=timestamp_c, ).construct(dataframe, spark_client) - def test_get_schema(self): + def test_get_schema(self, feature_set): expected_schema = [ {"column_name": "id", "type": LongType(), "primary_key": True}, {"column_name": "timestamp", "type": TimestampType(), "primary_key": False}, @@ -367,37 +365,6 @@ def test_get_schema(self): }, ] - feature_set = FeatureSet( - name="feature_set", - entity="entity", - description="description", - features=[ - Feature( - name="feature1", - description="test", - transformation=SparkFunctionTransform( - functions=[ - Function(F.avg, DataType.FLOAT), - Function(F.stddev_pop, DataType.DOUBLE), - ] - ).with_window( - partition_by="id", - order_by=TIMESTAMP_COLUMN, - mode="fixed_windows", - window_definition=["2 minutes", "15 minutes"], - ), - ), - ], - keys=[ - KeyFeature( - name="id", - description="The user's Main ID or device ID", - dtype=DataType.BIGINT, - ) - ], - timestamp=TimestampFeature(), - ) - schema = feature_set.get_schema() assert schema == expected_schema @@ -421,3 +388,9 @@ def test_feature_without_datatype(self, key_id, timestamp_c, dataframe): keys=[key_id], timestamp=timestamp_c, ).construct(dataframe, spark_client) + + def test_define_start_date(self, feature_set): + start_date = feature_set.define_start_date("2020-08-04") + + assert isinstance(start_date, str) + assert start_date == "2020-08-04" From 8da89edac01510f31e8da33b0c5b474e93f2e5a4 Mon Sep 17 00:00:00 2001 From: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Date: Mon, 22 Feb 2021 09:22:28 -0300 Subject: [PATCH 12/70] Allow slide selection (#293) --- Makefile | 2 +- .../transform/aggregated_feature_set.py | 61 ++++++++++----- butterfree/transform/utils/window_spec.py | 11 +-- setup.py | 2 +- tests/unit/butterfree/transform/conftest.py | 57 +++++++++----- .../transform/test_aggregated_feature_set.py | 76 +++++++------------ 6 files changed, 116 insertions(+), 93 deletions(-) diff --git a/Makefile b/Makefile index e6de9baa5..397d04bf4 100644 --- a/Makefile +++ b/Makefile @@ -105,7 +105,7 @@ checks: style-check quality-check type-check ## fix stylistic errors with black apply-style: @python -m black -t py36 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . 
- @python -m isort -rc butterfree/ tests/ + @python -m isort -rc --atomic butterfree/ tests/ .PHONY: clean ## clean unused artifacts diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index a19efb350..7a8656cdd 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -301,7 +301,9 @@ def with_distinct(self, subset: List, keep: str = "last") -> "AggregatedFeatureS return self - def with_windows(self, definitions: List[str]) -> "AggregatedFeatureSet": + def with_windows( + self, definitions: List[str], slide: str = None + ) -> "AggregatedFeatureSet": """Create a list with windows defined.""" self._windows = [ Window( @@ -309,6 +311,7 @@ def with_windows(self, definitions: List[str]) -> "AggregatedFeatureSet": order_by=None, mode="rolling_windows", window_definition=definition, + slide=slide, ) for definition in definitions ] @@ -563,12 +566,6 @@ def construct( ) if self._windows and end_date is not None: - # prepare our left table, a cartesian product between distinct keys - # and dates in range for this feature set - base_df = self._get_base_dataframe( - client=client, dataframe=output_df, end_date=end_date - ) - # run aggregations for each window agg_list = [ self._aggregate( @@ -580,18 +577,44 @@ def construct( for w in self._windows ] - # left join each aggregation result to our base dataframe - output_df = reduce( - lambda left, right: self._dataframe_join( - left, - right, - on=self.keys_columns + [self.timestamp_column], - how="left", - num_processors=num_processors, - ), - agg_list, - base_df, - ) + # prepare our left table, a cartesian product between distinct keys + # and dates in range for this feature set + + # todo next versions won't use this logic anymore, + # leaving for the client to correct the usage of aggregations + # without events + + # keeping this logic to maintain the same behavior for already implemented + # feature sets + + if self._windows[0].slide == "1 day": + base_df = self._get_base_dataframe( + client=client, dataframe=output_df, end_date=end_date + ) + + # left join each aggregation result to our base dataframe + output_df = reduce( + lambda left, right: self._dataframe_join( + left, + right, + on=self.keys_columns + [self.timestamp_column], + how="left", + num_processors=num_processors, + ), + agg_list, + base_df, + ) + else: + output_df = reduce( + lambda left, right: self._dataframe_join( + left, + right, + on=self.keys_columns + [self.timestamp_column], + how="full outer", + num_processors=num_processors, + ), + agg_list, + ) else: output_df = self._aggregate(output_df, features=self.features) diff --git a/butterfree/transform/utils/window_spec.py b/butterfree/transform/utils/window_spec.py index a270fec03..53ecd2fd3 100644 --- a/butterfree/transform/utils/window_spec.py +++ b/butterfree/transform/utils/window_spec.py @@ -62,7 +62,7 @@ class Window: Use the static methods in :class:`Window` to create a :class:`WindowSpec`. 
""" - SLIDE_DURATION: str = "1 day" + DEFAULT_SLIDE_DURATION: str = "1 day" def __init__( self, @@ -70,10 +70,12 @@ def __init__( partition_by: Optional[Union[Column, str, List[str]]] = None, order_by: Optional[Union[Column, str]] = None, mode: str = None, + slide: str = None, ): self.partition_by = partition_by self.order_by = order_by or TIMESTAMP_COLUMN self.frame_boundaries = FrameBoundaries(mode, window_definition) + self.slide = slide or self.DEFAULT_SLIDE_DURATION def get_name(self) -> str: """Return window suffix name based on passed criteria.""" @@ -89,15 +91,10 @@ def get_name(self) -> str: def get(self) -> Any: """Defines a common window to be used both in time and rows windows.""" if self.frame_boundaries.mode == "rolling_windows": - if int(self.frame_boundaries.window_definition.split()[0]) <= 0: - raise KeyError( - f"{self.frame_boundaries.window_definition} " - f"have negative element." - ) return functions.window( TIMESTAMP_COLUMN, self.frame_boundaries.window_definition, - slideDuration=self.SLIDE_DURATION, + slideDuration=self.slide, ) elif self.order_by == TIMESTAMP_COLUMN: w = sql.Window.partitionBy(self.partition_by).orderBy( # type: ignore diff --git a/setup.py b/setup.py index 4adcbce93..393fb0a0b 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev0" +__version__ = "1.2.0.dev1" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/transform/conftest.py b/tests/unit/butterfree/transform/conftest.py index febc8bbc2..bbef8b13d 100644 --- a/tests/unit/butterfree/transform/conftest.py +++ b/tests/unit/butterfree/transform/conftest.py @@ -135,6 +135,35 @@ def make_rolling_windows_agg_dataframe(spark_context, spark_session): return df +def make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): + data = [ + { + "id": 1, + "timestamp": "2016-04-11 12:00:00", + "feature1__avg_over_1_day_rolling_windows": 266.6666666666667, + "feature2__avg_over_1_day_rolling_windows": 300.0, + }, + { + "id": 1, + "timestamp": "2016-04-12 00:00:00", + "feature1__avg_over_1_day_rolling_windows": 300.0, + "feature2__avg_over_1_day_rolling_windows": 350.0, + }, + { + "id": 1, + "timestamp": "2016-04-12 12:00:00", + "feature1__avg_over_1_day_rolling_windows": 400.0, + "feature2__avg_over_1_day_rolling_windows": 500.0, + }, + ] + df = spark_session.read.json( + spark_context.parallelize(data).map(lambda x: json.dumps(x)) + ) + df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) + + return df + + def make_fs(spark_context, spark_session): df = make_dataframe(spark_context, spark_session) df = ( @@ -241,6 +270,11 @@ def rolling_windows_agg_dataframe(spark_context, spark_session): return make_rolling_windows_agg_dataframe(spark_context, spark_session) +@fixture +def rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): + return make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session) + + @fixture def feature_set_with_distinct_dataframe(spark_context, spark_session): return make_fs_dataframe_with_distinct(spark_context, spark_session) @@ -345,8 +379,8 @@ def feature_set(): @fixture def agg_feature_set(): - feature_set = AggregatedFeatureSet( - name="feature_set", + return AggregatedFeatureSet( + name="name", entity="entity", description="description", features=[ @@ -354,28 +388,17 @@ def agg_feature_set(): name="feature1", 
description="test", transformation=AggregatedTransform( - functions=[ - Function(functions.avg, DataType.DOUBLE), - Function(functions.stddev_pop, DataType.FLOAT), - ], + functions=[Function(functions.avg, DataType.DOUBLE)], ), ), Feature( name="feature2", description="test", transformation=AggregatedTransform( - functions=[Function(functions.count, DataType.ARRAY_STRING)] + functions=[Function(functions.avg, DataType.DOUBLE)] ), ), ], - keys=[ - KeyFeature( - name="id", - description="The user's Main ID or device ID", - dtype=DataType.BIGINT, - ) - ], + keys=[KeyFeature(name="id", description="description", dtype=DataType.BIGINT,)], timestamp=TimestampFeature(), - ).with_windows(definitions=["1 week", "2 days"]) - - return feature_set + ) diff --git a/tests/unit/butterfree/transform/test_aggregated_feature_set.py b/tests/unit/butterfree/transform/test_aggregated_feature_set.py index 8025d6f82..458956f36 100644 --- a/tests/unit/butterfree/transform/test_aggregated_feature_set.py +++ b/tests/unit/butterfree/transform/test_aggregated_feature_set.py @@ -1,13 +1,6 @@ import pytest from pyspark.sql import functions -from pyspark.sql.types import ( - ArrayType, - DoubleType, - FloatType, - LongType, - StringType, - TimestampType, -) +from pyspark.sql.types import DoubleType, LongType, TimestampType from butterfree.clients import SparkClient from butterfree.constants import DataType @@ -51,33 +44,11 @@ def test_feature_set_with_invalid_feature(self, key_id, timestamp_c, dataframe): ).construct(dataframe, spark_client) def test_agg_feature_set_with_window( - self, key_id, timestamp_c, dataframe, rolling_windows_agg_dataframe + self, dataframe, rolling_windows_agg_dataframe, agg_feature_set, ): spark_client = SparkClient() - fs = AggregatedFeatureSet( - name="name", - entity="entity", - description="description", - features=[ - Feature( - name="feature1", - description="unit test", - transformation=AggregatedTransform( - functions=[Function(functions.avg, DataType.FLOAT)] - ), - ), - Feature( - name="feature2", - description="unit test", - transformation=AggregatedTransform( - functions=[Function(functions.avg, DataType.FLOAT)] - ), - ), - ], - keys=[key_id], - timestamp=timestamp_c, - ).with_windows(definitions=["1 week"]) + fs = agg_feature_set.with_windows(definitions=["1 week"]) # raises without end date with pytest.raises(ValueError): @@ -89,6 +60,21 @@ def test_agg_feature_set_with_window( output_df = fs.construct(dataframe, spark_client, end_date="2016-05-01") assert_dataframe_equality(output_df, rolling_windows_agg_dataframe) + def test_agg_feature_set_with_smaller_slide( + self, dataframe, rolling_windows_hour_slide_agg_dataframe, agg_feature_set, + ): + spark_client = SparkClient() + + fs = agg_feature_set.with_windows(definitions=["1 day"], slide="12 hours") + + # raises without end date + with pytest.raises(ValueError): + _ = fs.construct(dataframe, spark_client) + + # filters with date smaller then mocked max + output_df = fs.construct(dataframe, spark_client, end_date="2016-04-17") + assert_dataframe_equality(output_df, rolling_windows_hour_slide_agg_dataframe) + def test_get_schema(self, agg_feature_set): expected_schema = [ {"column_name": "id", "type": LongType(), "primary_key": True}, @@ -104,28 +90,20 @@ def test_get_schema(self, agg_feature_set): "primary_key": False, }, { - "column_name": "feature1__stddev_pop_over_1_week_rolling_windows", - "type": FloatType(), - "primary_key": False, - }, - { - "column_name": "feature1__stddev_pop_over_2_days_rolling_windows", - "type": 
FloatType(), - "primary_key": False, - }, - { - "column_name": "feature2__count_over_1_week_rolling_windows", - "type": ArrayType(StringType(), True), + "column_name": "feature2__avg_over_1_week_rolling_windows", + "type": DoubleType(), "primary_key": False, }, { - "column_name": "feature2__count_over_2_days_rolling_windows", - "type": ArrayType(StringType(), True), + "column_name": "feature2__avg_over_2_days_rolling_windows", + "type": DoubleType(), "primary_key": False, }, ] - schema = agg_feature_set.get_schema() + schema = agg_feature_set.with_windows( + definitions=["1 week", "2 days"] + ).get_schema() assert schema == expected_schema @@ -358,7 +336,9 @@ def test_feature_transform_with_data_type_array(self, spark_context, spark_sessi assert_dataframe_equality(target_df, output_df) def test_define_start_date(self, agg_feature_set): - start_date = agg_feature_set.define_start_date("2020-08-04") + start_date = agg_feature_set.with_windows( + definitions=["1 week", "2 days"] + ).define_start_date("2020-08-04") assert isinstance(start_date, str) assert start_date == "2020-07-27" From 0df07aebded2b2cb2a35370d22d3dda6f2f8713a Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Fri, 26 Feb 2021 10:36:18 -0300 Subject: [PATCH 13/70] Fix Slide Duration Typo (#295) --- .../transform/aggregated_feature_set.py | 2 +- setup.py | 2 +- tests/unit/butterfree/transform/conftest.py | 56 +++++++++++++++++++ .../transform/test_aggregated_feature_set.py | 22 ++++++++ 4 files changed, 80 insertions(+), 2 deletions(-) diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 7a8656cdd..133195d72 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -610,7 +610,7 @@ def construct( left, right, on=self.keys_columns + [self.timestamp_column], - how="full outer", + how="full_outer", num_processors=num_processors, ), agg_list, diff --git a/setup.py b/setup.py index 393fb0a0b..7f65117a2 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev1" +__version__ = "1.2.0.dev2" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/transform/conftest.py b/tests/unit/butterfree/transform/conftest.py index bbef8b13d..ab7606407 100644 --- a/tests/unit/butterfree/transform/conftest.py +++ b/tests/unit/butterfree/transform/conftest.py @@ -164,6 +164,55 @@ def make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): return df +def make_multiple_rolling_windows_hour_slide_agg_dataframe( + spark_context, spark_session +): + data = [ + { + "id": 1, + "timestamp": "2016-04-11 12:00:00", + "feature1__avg_over_2_days_rolling_windows": 266.6666666666667, + "feature1__avg_over_3_days_rolling_windows": 266.6666666666667, + "feature2__avg_over_2_days_rolling_windows": 300.0, + "feature2__avg_over_3_days_rolling_windows": 300.0, + }, + { + "id": 1, + "timestamp": "2016-04-12 00:00:00", + "feature1__avg_over_2_days_rolling_windows": 300.0, + "feature1__avg_over_3_days_rolling_windows": 300.0, + "feature2__avg_over_2_days_rolling_windows": 350.0, + "feature2__avg_over_3_days_rolling_windows": 350.0, + }, + { + "id": 1, + "timestamp": "2016-04-13 12:00:00", + "feature1__avg_over_2_days_rolling_windows": 400.0, + "feature1__avg_over_3_days_rolling_windows": 300.0, + 
"feature2__avg_over_2_days_rolling_windows": 500.0, + "feature2__avg_over_3_days_rolling_windows": 350.0, + }, + { + "id": 1, + "timestamp": "2016-04-14 00:00:00", + "feature1__avg_over_3_days_rolling_windows": 300.0, + "feature2__avg_over_3_days_rolling_windows": 350.0, + }, + { + "id": 1, + "timestamp": "2016-04-14 12:00:00", + "feature1__avg_over_3_days_rolling_windows": 400.0, + "feature2__avg_over_3_days_rolling_windows": 500.0, + }, + ] + df = spark_session.read.json( + spark_context.parallelize(data).map(lambda x: json.dumps(x)) + ) + df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) + + return df + + def make_fs(spark_context, spark_session): df = make_dataframe(spark_context, spark_session) df = ( @@ -275,6 +324,13 @@ def rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): return make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session) +@fixture +def multiple_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): + return make_multiple_rolling_windows_hour_slide_agg_dataframe( + spark_context, spark_session + ) + + @fixture def feature_set_with_distinct_dataframe(spark_context, spark_session): return make_fs_dataframe_with_distinct(spark_context, spark_session) diff --git a/tests/unit/butterfree/transform/test_aggregated_feature_set.py b/tests/unit/butterfree/transform/test_aggregated_feature_set.py index 458956f36..73320cf57 100644 --- a/tests/unit/butterfree/transform/test_aggregated_feature_set.py +++ b/tests/unit/butterfree/transform/test_aggregated_feature_set.py @@ -75,6 +75,28 @@ def test_agg_feature_set_with_smaller_slide( output_df = fs.construct(dataframe, spark_client, end_date="2016-04-17") assert_dataframe_equality(output_df, rolling_windows_hour_slide_agg_dataframe) + def test_agg_feature_set_with_smaller_slide_and_multiple_windows( + self, + dataframe, + multiple_rolling_windows_hour_slide_agg_dataframe, + agg_feature_set, + ): + spark_client = SparkClient() + + fs = agg_feature_set.with_windows( + definitions=["2 days", "3 days"], slide="12 hours" + ) + + # raises without end date + with pytest.raises(ValueError): + _ = fs.construct(dataframe, spark_client) + + # filters with date smaller then mocked max + output_df = fs.construct(dataframe, spark_client, end_date="2016-04-17") + assert_dataframe_equality( + output_df, multiple_rolling_windows_hour_slide_agg_dataframe + ) + def test_get_schema(self, agg_feature_set): expected_schema = [ {"column_name": "id", "type": LongType(), "primary_key": True}, From aeb79998ecc6b72f3214c82ca993a1ca7aad48e7 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 8 Mar 2021 17:53:29 -0300 Subject: [PATCH 14/70] [MLOP-637] Implement diff method (#292) --- butterfree/migrations/__init__.py | 7 +- butterfree/migrations/cassandra_migration.py | 23 ---- .../migrations/database_migration/__init__.py | 11 ++ .../database_migration/cassandra_migration.py | 36 ++++++ .../database_migration/database_migration.py | 119 ++++++++++++++++++ .../database_migration/metastore_migration.py | 31 +++++ butterfree/migrations/metastore_migration.py | 23 ---- butterfree/migrations/migrate.py | 41 ++++++ butterfree/migrations/migration.py | 62 --------- setup.cfg | 2 +- tests/unit/butterfree/migrations/__init__.py | 0 .../migrations/database_migration/__init__.py | 0 .../migrations/database_migration/conftest.py | 20 +++ .../test_database_migration.py | 56 +++++++++ 14 files changed, 317 insertions(+), 114 deletions(-) 
delete mode 100644 butterfree/migrations/cassandra_migration.py create mode 100644 butterfree/migrations/database_migration/__init__.py create mode 100644 butterfree/migrations/database_migration/cassandra_migration.py create mode 100644 butterfree/migrations/database_migration/database_migration.py create mode 100644 butterfree/migrations/database_migration/metastore_migration.py delete mode 100644 butterfree/migrations/metastore_migration.py create mode 100644 butterfree/migrations/migrate.py delete mode 100644 butterfree/migrations/migration.py create mode 100644 tests/unit/butterfree/migrations/__init__.py create mode 100644 tests/unit/butterfree/migrations/database_migration/__init__.py create mode 100644 tests/unit/butterfree/migrations/database_migration/conftest.py create mode 100644 tests/unit/butterfree/migrations/database_migration/test_database_migration.py diff --git a/butterfree/migrations/__init__.py b/butterfree/migrations/__init__.py index 5f709bfe3..39cabfb7c 100644 --- a/butterfree/migrations/__init__.py +++ b/butterfree/migrations/__init__.py @@ -1,7 +1,4 @@ """Holds available migrations.""" +from butterfree.migrations.migrate import Migrate -from butterfree.migrations.cassandra_migration import CassandraMigration -from butterfree.migrations.metastore_migration import MetastoreMigration -from butterfree.migrations.migration import DatabaseMigration - -__all__ = ["DatabaseMigration", "CassandraMigration", "MetastoreMigration"] +__all__ = ["Migrate"] diff --git a/butterfree/migrations/cassandra_migration.py b/butterfree/migrations/cassandra_migration.py deleted file mode 100644 index e9cecdc7b..000000000 --- a/butterfree/migrations/cassandra_migration.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Cassandra Migration entity.""" - -from typing import Any, Dict, List - -from butterfree.migrations import DatabaseMigration - - -class CassandraMigration(DatabaseMigration): - """Cassandra class for Migrations.""" - - def create_query( - self, - fs_schema: List[Dict[str, Any]], - db_schema: List[Dict[str, Any]], - table_name: str, - ) -> Any: - """Create a query regarding Cassandra. - - Returns: - Schema object. 
- - """ - pass diff --git a/butterfree/migrations/database_migration/__init__.py b/butterfree/migrations/database_migration/__init__.py new file mode 100644 index 000000000..7138c4450 --- /dev/null +++ b/butterfree/migrations/database_migration/__init__.py @@ -0,0 +1,11 @@ +"""Holds available database migrations.""" + +from butterfree.migrations.database_migration.cassandra_migration import ( + CassandraMigration, +) +from butterfree.migrations.database_migration.database_migration import Diff +from butterfree.migrations.database_migration.metastore_migration import ( + MetastoreMigration, +) + +__all__ = ["CassandraMigration", "MetastoreMigration", "Diff"] diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py new file mode 100644 index 000000000..c4943c8e2 --- /dev/null +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -0,0 +1,36 @@ +"""Cassandra Migration entity.""" + +from typing import Any, Dict, List + +from butterfree.clients import CassandraClient +from butterfree.configs.db import CassandraConfig +from butterfree.migrations.database_migration.database_migration import ( + DatabaseMigration, +) + + +class CassandraMigration(DatabaseMigration): + """Cassandra class for Migrations.""" + + def __init__(self) -> None: + self._db_config = CassandraConfig() + self._client = CassandraClient( + host=[self._db_config.host], + keyspace=self._db_config.keyspace, # type: ignore + user=self._db_config.username, + password=self._db_config.password, + ) + + def create_query( + self, + table_name: str, + db_schema: List[Dict[str, Any]] = None, + diff_schema: List[Dict[str, Any]] = None, + ) -> Any: + """Create a query regarding Cassandra. + + Returns: + Schema object. + + """ + pass diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py new file mode 100644 index 000000000..032a7acc0 --- /dev/null +++ b/butterfree/migrations/database_migration/database_migration.py @@ -0,0 +1,119 @@ +"""Migration entity.""" +from abc import ABC, abstractmethod +from dataclasses import dataclass +from enum import Enum, auto +from typing import Any, Dict, List, Set + +from butterfree.transform import FeatureSet + + +@dataclass +class Diff: + """DataClass to help identifying different types of diff between schemas.""" + + class Kind(Enum): + """Mapping actions to take given a difference between columns of a schema.""" + + ADD = auto() + DROP = auto() + ALTER_TYPE = auto() + ALTER_KEY = auto() + + column: str + kind: Kind + value: Any + + def __hash__(self) -> int: + return hash((self.column, self.kind, self.value)) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, type(self)): + raise NotImplementedError + return ( + self.column == other.column + and self.kind == other.kind + and self.value == other.value + ) + + +class DatabaseMigration(ABC): + """Abstract base class for Migrations.""" + + @abstractmethod + def create_query( + self, + table_name: str, + db_schema: List[Dict[str, Any]] = None, + diff_schema: List[Dict[str, Any]] = None, + ) -> Any: + """Create a query regarding a data source. + + Returns: + The desired query for the given database. 
+ + """ + + def _apply_migration(self, feature_set: FeatureSet) -> None: + """Apply the migration in the respective database.""" + pass + + @staticmethod + def _get_diff( + fs_schema: List[Dict[str, Any]], db_schema: List[Dict[str, Any]], + ) -> Set[Diff]: + """Gets schema difference between feature set and the table of a given db. + + Args: + fs_schema: object that contains feature set's schemas. + db_schema: object that contains the table of a given db schema. + + """ + db_columns = set(item.get("column_name") for item in db_schema) + fs_columns = set(item.get("column_name") for item in fs_schema) + + add_columns = fs_columns - db_columns + drop_columns = db_columns - fs_columns + + # This could be way easier to write (and to read) if the schemas were a simple + # Dict[str, Any] where each key would be the column name itself... + # but changing that could break things so: + # TODO version 2 change get schema to return a dict(columns, properties) + alter_type_columns = dict() + alter_key_columns = dict() + for fs_item in fs_schema: + for db_item in db_schema: + if fs_item.get("column_name") == db_item.get("column_name"): + if fs_item.get("type") != db_item.get("type"): + alter_type_columns.update( + {fs_item.get("column_name"): fs_item.get("type")} + ) + if fs_item.get("primary_key") != db_item.get("primary_key"): + alter_key_columns.update( + {fs_item.get("column_name"): fs_item.get("primary_key")} + ) + break + + schema_diff = set( + Diff(str(col), kind=Diff.Kind.ADD, value=None) for col in add_columns + ) + schema_diff |= set( + Diff(str(col), kind=Diff.Kind.DROP, value=None) for col in drop_columns + ) + schema_diff |= set( + Diff(str(col), kind=Diff.Kind.ALTER_TYPE, value=value) + for col, value in alter_type_columns.items() + ) + schema_diff |= set( + Diff(str(col), kind=Diff.Kind.ALTER_KEY, value=value) + for col, value in alter_key_columns.items() + ) + return schema_diff + + def run(self, feature_set: FeatureSet) -> None: + """Runs the migrations. + + Args: + feature_set: the feature set. + + """ + pass diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py new file mode 100644 index 000000000..ae0dd1829 --- /dev/null +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -0,0 +1,31 @@ +"""Metastore Migration entity.""" + +from typing import Any, Dict, List + +from butterfree.clients import SparkClient +from butterfree.configs.db import MetastoreConfig +from butterfree.migrations.database_migration.database_migration import ( + DatabaseMigration, +) + + +class MetastoreMigration(DatabaseMigration): + """Metastore class for Migrations.""" + + def __init__(self) -> None: + self._db_config = MetastoreConfig() + self._client = SparkClient() + + def create_query( + self, + table_name: str, + db_schema: List[Dict[str, Any]] = None, + diff_schema: List[Dict[str, Any]] = None, + ) -> Any: + """Create a query regarding Metastore. + + Returns: + Schema object. 
+ + """ + pass diff --git a/butterfree/migrations/metastore_migration.py b/butterfree/migrations/metastore_migration.py deleted file mode 100644 index bb208f2a9..000000000 --- a/butterfree/migrations/metastore_migration.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Metastore Migration entity.""" - -from typing import Any, Dict, List - -from butterfree.migrations import DatabaseMigration - - -class MetastoreMigration(DatabaseMigration): - """Metastore class for Migrations.""" - - def create_query( - self, - fs_schema: List[Dict[str, Any]], - db_schema: List[Dict[str, Any]], - table_name: str, - ) -> Any: - """Create a query regarding Metastore. - - Returns: - Schema object. - - """ - pass diff --git a/butterfree/migrations/migrate.py b/butterfree/migrations/migrate.py new file mode 100644 index 000000000..f128dee1f --- /dev/null +++ b/butterfree/migrations/migrate.py @@ -0,0 +1,41 @@ +"""Holds the Migrator Class.""" + +from typing import Callable, List, Tuple + +from butterfree.pipelines import FeatureSetPipeline +from butterfree.transform import FeatureSet + + +class Migrate: + """Execute migration operations in a Database based on pipeline Writer. + + Attributes: + pipelines: list of Feature Set Pipelines to use to migration. + + """ + + def __init__(self, pipelines: List[FeatureSetPipeline]) -> None: + self.pipelines = pipelines + + def _parse_feature_set_pipeline( + self, pipeline: FeatureSetPipeline + ) -> List[Tuple[Callable, FeatureSet]]: + feature_set = pipeline.feature_set + migrations = [ + writer.db_config._migration_class for writer in pipeline.sink.writers + ] + + return [(migrate, feature_set) for migrate in migrations] + + def _send_logs_to_s3(self) -> None: + """Send all migration logs to S3.""" + pass + + def migration(self) -> None: + """Construct and apply the migrations.""" + migration_list = [ + self._parse_feature_set_pipeline(pipeline) for pipeline in self.pipelines + ] + + for migration, fs in migration_list: + migration.run(fs) diff --git a/butterfree/migrations/migration.py b/butterfree/migrations/migration.py deleted file mode 100644 index c53945bf9..000000000 --- a/butterfree/migrations/migration.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Migration entity.""" - -from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, List - -from butterfree.pipelines import FeatureSetPipeline - - -class DatabaseMigration(ABC): - """Abstract base class for Migrations.""" - - @abstractmethod - def create_query( - self, - fs_schema: List[Dict[str, Any]], - db_schema: List[Dict[str, Any]], - table_name: str, - ) -> Any: - """Create a query regarding a data source. - - Returns: - The desired query for the given database. - - """ - - def _validate_schema( - self, fs_schema: List[Dict[str, Any]], db_schema: List[Dict[str, Any]] - ) -> Any: - """Provides schema validation for feature sets. - - Compares the schema of your local feature set to the - corresponding table in a given database. - - Args: - fs_schema: object that contains feature set's schemas. - db_schema: object that contains the table og a given db schema. - - """ - - def _get_schema(self, db_client: Callable, table_name: str) -> List[Dict[str, Any]]: - """Get a table schema in the respective database. - - Returns: - Schema object. 
- """ - pass - - def _apply_migration(self, query: str, db_client: Callable) -> None: - """Apply the migration in the respective database.""" - - def _send_logs_to_s3(self) -> None: - """Send all migration logs to S3.""" - pass - - def run(self, pipelines: List[FeatureSetPipeline]) -> None: - """Runs the migrations. - - Args: - pipelines: the feature set pipelines. - - """ - pass diff --git a/setup.cfg b/setup.cfg index 7b1c62bd2..255fff848 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,7 @@ docstring-convention = google max-line-length = 88 max-complexity = 12 -ignore = W503, E203, D203, D401, D107, S101 +ignore = W503, E203, D203, D401, D107, S101, D105 exclude = dist/*,build/*,.pytest_cache/*,.git/*,pip/* per-file-ignores = # We will not check for docstrings or the use of asserts in tests diff --git a/tests/unit/butterfree/migrations/__init__.py b/tests/unit/butterfree/migrations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/butterfree/migrations/database_migration/__init__.py b/tests/unit/butterfree/migrations/database_migration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/butterfree/migrations/database_migration/conftest.py b/tests/unit/butterfree/migrations/database_migration/conftest.py new file mode 100644 index 000000000..f737c4dc8 --- /dev/null +++ b/tests/unit/butterfree/migrations/database_migration/conftest.py @@ -0,0 +1,20 @@ +from pyspark.sql.types import DoubleType, LongType, TimestampType +from pytest import fixture + + +@fixture +def db_schema(): + return [ + {"column_name": "id", "type": LongType(), "primary_key": True}, + {"column_name": "timestamp", "type": TimestampType(), "primary_key": False}, + { + "column_name": "feature1__avg_over_1_week_rolling_windows", + "type": DoubleType(), + "primary_key": False, + }, + { + "column_name": "feature1__avg_over_2_days_rolling_windows", + "type": DoubleType(), + "primary_key": False, + }, + ] diff --git a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py new file mode 100644 index 000000000..aa272317e --- /dev/null +++ b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py @@ -0,0 +1,56 @@ +from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType + +from butterfree.migrations.database_migration import CassandraMigration, Diff + + +class TestDatabaseMigration: + def test__get_diff_empty(self, mocker, db_schema): + fs_schema = [ + {"column_name": "id", "type": LongType(), "primary_key": True}, + {"column_name": "timestamp", "type": TimestampType(), "primary_key": False}, + { + "column_name": "feature1__avg_over_1_week_rolling_windows", + "type": DoubleType(), + "primary_key": False, + }, + { + "column_name": "feature1__avg_over_2_days_rolling_windows", + "type": DoubleType(), + "primary_key": False, + }, + ] + m = CassandraMigration() + m._client = mocker.stub("client") + diff = m._get_diff(fs_schema, db_schema) + assert not diff + + def test__get_diff(self, mocker, db_schema): + fs_schema = [ + {"column_name": "id", "type": LongType(), "primary_key": True}, + {"column_name": "timestamp", "type": TimestampType(), "primary_key": True}, + {"column_name": "new_feature", "type": FloatType(), "primary_key": False}, + { + "column_name": "feature1__avg_over_1_week_rolling_windows", + "type": FloatType(), + "primary_key": False, + }, + ] + expected_diff = { + Diff("timestamp", kind=Diff.Kind.ALTER_KEY, 
value=True), + Diff("new_feature", kind=Diff.Kind.ADD, value=None), + Diff( + "feature1__avg_over_2_days_rolling_windows", + kind=Diff.Kind.DROP, + value=None, + ), + Diff( + "feature1__avg_over_1_week_rolling_windows", + kind=Diff.Kind.ALTER_TYPE, + value=FloatType(), + ), + } + + m = CassandraMigration() + m._client = mocker.stub("client") + diff = m._get_diff(fs_schema, db_schema) + assert diff == expected_diff From 9afc39c242403510501a1be5d29b63984b13c950 Mon Sep 17 00:00:00 2001 From: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Date: Mon, 15 Mar 2021 09:43:29 -0300 Subject: [PATCH 15/70] [MLOP-640] Create CLI with migrate command (#298) --- butterfree/_cli/__init__.py | 10 ++ butterfree/_cli/main.py | 9 ++ butterfree/_cli/migrate.py | 104 ++++++++++++++++++ requirements.txt | 2 + setup.py | 2 + tests/mocks/__init__.py | 0 tests/mocks/entities/__init__.py | 0 tests/mocks/entities/first/__init__.py | 3 + tests/mocks/entities/first/first_pipeline.py | 43 ++++++++ tests/mocks/entities/second/__init__.py | 0 .../mocks/entities/second/deeper/__init__.py | 3 + .../entities/second/deeper/second_pipeline.py | 45 ++++++++ tests/unit/butterfree/_cli/__init__.py | 0 tests/unit/butterfree/_cli/test_migrate.py | 8 ++ 14 files changed, 229 insertions(+) create mode 100644 butterfree/_cli/__init__.py create mode 100644 butterfree/_cli/main.py create mode 100644 butterfree/_cli/migrate.py create mode 100644 tests/mocks/__init__.py create mode 100644 tests/mocks/entities/__init__.py create mode 100644 tests/mocks/entities/first/__init__.py create mode 100644 tests/mocks/entities/first/first_pipeline.py create mode 100644 tests/mocks/entities/second/__init__.py create mode 100644 tests/mocks/entities/second/deeper/__init__.py create mode 100644 tests/mocks/entities/second/deeper/second_pipeline.py create mode 100644 tests/unit/butterfree/_cli/__init__.py create mode 100644 tests/unit/butterfree/_cli/test_migrate.py diff --git a/butterfree/_cli/__init__.py b/butterfree/_cli/__init__.py new file mode 100644 index 000000000..ec8a1792c --- /dev/null +++ b/butterfree/_cli/__init__.py @@ -0,0 +1,10 @@ +import logging + + +def __logger(name: str) -> logging.Logger: + format_ = "%(name)s:%(asctime)-15s:%(levelname)s:< %(message)s >" + logging.basicConfig(format=format_, level=logging.INFO) + return logging.getLogger(name) + + +cli_logger = __logger("butterfree") diff --git a/butterfree/_cli/main.py b/butterfree/_cli/main.py new file mode 100644 index 000000000..e340bc1bd --- /dev/null +++ b/butterfree/_cli/main.py @@ -0,0 +1,9 @@ +import typer + +from butterfree._cli import migrate + +app = typer.Typer() +app.add_typer(migrate.app) + +if __name__ == "__main__": + app() diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py new file mode 100644 index 000000000..ee083f731 --- /dev/null +++ b/butterfree/_cli/migrate.py @@ -0,0 +1,104 @@ +import importlib +import inspect +import pkgutil +import sys +from typing import Set + +import setuptools +import typer + +from butterfree._cli import cli_logger +from butterfree.pipelines import FeatureSetPipeline + +app = typer.Typer() + + +def __find_modules(path: str) -> Set[str]: + modules = set() + for pkg in setuptools.find_packages(path): + modules.add(pkg) + pkg_path = path + "/" + pkg.replace(".", "/") + + # different usage for older python3 versions + if sys.version_info.minor < 6: + for _, name, is_pkg in pkgutil.iter_modules([pkg_path]): + if not is_pkg: + modules.add(pkg + "." 
+ name) + else: + for info in pkgutil.iter_modules([pkg_path]): + if not info.ispkg: + modules.add(pkg + "." + info.name) + return modules + + +def __fs_objects(path: str) -> Set[FeatureSetPipeline]: + cli_logger.info(f"Looking for python modules under {path}...") + modules = __find_modules(path) + if not modules: + return set() + + cli_logger.info(f"Importing modules...") + package = ".".join(path.strip("/").split("/")) + imported = set( + importlib.import_module(f".{name}", package=package) for name in modules + ) + + cli_logger.info(f"Scanning modules...") + content = { + module: set( + filter( + lambda x: not x.startswith("__"), # filter "__any__" attributes + set(item for item in dir(module)), + ) + ) + for module in imported + } + + instances = set() + for module, items in content.items(): + for item in items: + value = getattr(module, item) + if not value: + continue + + # filtering non-classes + if not inspect.isclass(value): + continue + + # filtering abstractions + if inspect.isabstract(value): + continue + + # filtering classes that doesn't inherit from FeatureSetPipeline + if not issubclass(value, FeatureSetPipeline): + continue + + # filtering FeatureSetPipeline itself + if value == FeatureSetPipeline: + continue + + instances.add(value) + + cli_logger.info("Creating instances...") + return set(value() for value in instances) + + +PATH = typer.Argument( + ..., help="Full or relative path to where feature set pipelines are being defined.", +) + + +@app.callback() +def migrate(path: str = PATH) -> Set[FeatureSetPipeline]: + """Scan and run database migrations for feature set pipelines defined under PATH. + + Butterfree will scan a given path for classes that inherit from its + FeatureSetPipeline and create dry instances of it to extract schema and writer + information. By doing this, Butterfree can compare all defined feature set schemas + to their current state on each sink being used. + + All pipelines must be under python modules inside path, so we can dynamically + import and instantiate them. 
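+
+    A minimal usage sketch (the path below is illustrative), mirroring how the
+    unit tests added in this patch call the command programmatically:
+
+        from butterfree._cli import migrate
+        from butterfree.pipelines import FeatureSetPipeline
+
+        pipelines = migrate.migrate("path/to/pipelines/")
+        assert all(isinstance(p, FeatureSetPipeline) for p in pipelines)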
+ """ + # TODO call the Migration actor with all feature set pipeline objects + return __fs_objects(path) diff --git a/requirements.txt b/requirements.txt index e55289f4d..bac7f2c78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ mdutils>=1.2.2,<2.0 pandas>=0.24,<1.1 parameters-validation>=1.1.5,<2.0 pyspark==3.* +typer>=0.3,<0.4 +setuptools>=41,<42 \ No newline at end of file diff --git a/setup.py b/setup.py index 7f65117a2..d211098c8 100644 --- a/setup.py +++ b/setup.py @@ -36,4 +36,6 @@ install_requires=requirements, extras_require={"h3": ["cmake==3.16.3", "h3==3.4.2"]}, python_requires=">=3.7, <4", + entry_points={"console_scripts": ["butterfree=butterfree._cli.main:app"]}, + include_package_data=True, ) diff --git a/tests/mocks/__init__.py b/tests/mocks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/mocks/entities/__init__.py b/tests/mocks/entities/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/mocks/entities/first/__init__.py b/tests/mocks/entities/first/__init__.py new file mode 100644 index 000000000..e69592de4 --- /dev/null +++ b/tests/mocks/entities/first/__init__.py @@ -0,0 +1,3 @@ +from .first_pipeline import FirstPipeline + +__all__ = ["FirstPipeline"] diff --git a/tests/mocks/entities/first/first_pipeline.py b/tests/mocks/entities/first/first_pipeline.py new file mode 100644 index 000000000..90cfba96f --- /dev/null +++ b/tests/mocks/entities/first/first_pipeline.py @@ -0,0 +1,43 @@ +from butterfree.constants.data_type import DataType +from butterfree.extract import Source +from butterfree.extract.readers import TableReader +from butterfree.load import Sink +from butterfree.load.writers import ( + HistoricalFeatureStoreWriter, + OnlineFeatureStoreWriter, +) +from butterfree.pipelines import FeatureSetPipeline +from butterfree.transform import FeatureSet +from butterfree.transform.features import Feature, KeyFeature, TimestampFeature + + +class FirstPipeline(FeatureSetPipeline): + def __init__(self): + super(FirstPipeline, self).__init__( + source=Source( + readers=[TableReader(id="t", database="db", table="table",)], + query=f"select * from t", # noqa + ), + feature_set=FeatureSet( + name="first", + entity="entity", + description="description", + features=[ + Feature(name="feature1", description="test", dtype=DataType.FLOAT,), + Feature( + name="feature2", + description="another test", + dtype=DataType.STRING, + ), + ], + keys=[ + KeyFeature( + name="id", description="identifier", dtype=DataType.BIGINT, + ) + ], + timestamp=TimestampFeature(), + ), + sink=Sink( + writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()] + ), + ) diff --git a/tests/mocks/entities/second/__init__.py b/tests/mocks/entities/second/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/mocks/entities/second/deeper/__init__.py b/tests/mocks/entities/second/deeper/__init__.py new file mode 100644 index 000000000..9f70be75d --- /dev/null +++ b/tests/mocks/entities/second/deeper/__init__.py @@ -0,0 +1,3 @@ +from .second_pipeline import SecondPipeline + +__all__ = ["SecondPipeline"] diff --git a/tests/mocks/entities/second/deeper/second_pipeline.py b/tests/mocks/entities/second/deeper/second_pipeline.py new file mode 100644 index 000000000..12c53cf30 --- /dev/null +++ b/tests/mocks/entities/second/deeper/second_pipeline.py @@ -0,0 +1,45 @@ +from butterfree.constants.data_type import DataType +from butterfree.extract import Source +from butterfree.extract.readers import TableReader 
+from butterfree.load import Sink +from butterfree.load.writers import ( + HistoricalFeatureStoreWriter, + OnlineFeatureStoreWriter, +) +from butterfree.pipelines import FeatureSetPipeline +from butterfree.transform import FeatureSet +from butterfree.transform.features import Feature, KeyFeature, TimestampFeature + + +class SecondPipeline(FeatureSetPipeline): + def __init__(self): + super(SecondPipeline, self).__init__( + source=Source( + readers=[TableReader(id="t", database="db", table="table",)], + query=f"select * from t", # noqa + ), + feature_set=FeatureSet( + name="second", + entity="entity", + description="description", + features=[ + Feature( + name="feature1", description="test", dtype=DataType.STRING, + ), + Feature( + name="feature2", + description="another test", + dtype=DataType.FLOAT, + ), + ], + keys=[ + KeyFeature( + name="id", description="identifier", dtype=DataType.BIGINT, + ) + ], + timestamp=TimestampFeature(), + ), + sink=Sink( + writers=[HistoricalFeatureStoreWriter(), OnlineFeatureStoreWriter()] + ), + ) diff --git a/tests/unit/butterfree/_cli/__init__.py b/tests/unit/butterfree/_cli/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/butterfree/_cli/test_migrate.py b/tests/unit/butterfree/_cli/test_migrate.py new file mode 100644 index 000000000..6a63453fb --- /dev/null +++ b/tests/unit/butterfree/_cli/test_migrate.py @@ -0,0 +1,8 @@ +from butterfree._cli import migrate +from butterfree.pipelines import FeatureSetPipeline + + +def test_migrate_success(): + all_fs = migrate.migrate("tests/mocks/entities/") + assert all(isinstance(fs, FeatureSetPipeline) for fs in all_fs) + assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] From bf204f2a38afc098fd4e1cc1a96f5a58b7951164 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Mon, 15 Mar 2021 17:16:15 -0300 Subject: [PATCH 16/70] [MLOP-645] Implement query method, cassandra (#291) --- .../database_migration/cassandra_migration.py | 181 ++++++++++++++++-- .../database_migration/database_migration.py | 59 +++++- .../database_migration/metastore_migration.py | 3 +- .../migrations/database_migration/conftest.py | 16 +- .../test_cassandra_migration.py | 41 ++++ .../test_database_migration.py | 4 +- 6 files changed, 283 insertions(+), 21 deletions(-) create mode 100644 tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py index c4943c8e2..4141a7d51 100644 --- a/butterfree/migrations/database_migration/cassandra_migration.py +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -1,16 +1,35 @@ """Cassandra Migration entity.""" -from typing import Any, Dict, List +import logging +from typing import Any, Dict, List, Set from butterfree.clients import CassandraClient from butterfree.configs.db import CassandraConfig from butterfree.migrations.database_migration.database_migration import ( DatabaseMigration, + Diff, ) class CassandraMigration(DatabaseMigration): - """Cassandra class for Migrations.""" + """Cassandra class for performing migrations. + + This class implements some methods of the parent DatabaseMigration class and + has specific methods for query building. + + The CassandraMigration class will be used, as the name suggests, for applying + changes to a given Cassandra table. 
There are, however, some remarks that need + to be highlighted: + - If an existing feature has its type changed, then it's extremely important to + make sure that this conversion would not result in data loss; + - If new features are added to your feature set, then they're going to be added + to the corresponding Cassandra table; + - Since feature sets can be written both to a feature set and an entity table, + we're not going to automatically drop features when using entity tables, since + it means that some features belong to a different feature set. In summary, if + data is being loaded into an entity table, then users can drop columns manually. + + """ def __init__(self) -> None: self._db_config = CassandraConfig() @@ -21,16 +40,156 @@ def __init__(self) -> None: password=self._db_config.password, ) - def create_query( - self, - table_name: str, - db_schema: List[Dict[str, Any]] = None, - diff_schema: List[Dict[str, Any]] = None, - ) -> Any: - """Create a query regarding Cassandra. + @staticmethod + def _get_parsed_columns(columns: List[Diff]) -> List[str]: + """Parse columns from a list of Diff objects. + + Args: + columns: list of Diff objects. + + Returns: + Parsed columns. + + """ + parsed_columns = [] + for col in columns: + parsed_columns.append(f"{col.column} {col.value}") + + parsed_columns = ", ".join(parsed_columns) # type: ignore + + return parsed_columns + + def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> str: + """Creates CQL statement to add columns to a table. + + Args: + columns: list of Diff objects with ADD kind. + table_name: table name. Returns: - Schema object. + Alter table query. """ - pass + parsed_columns = self._get_parsed_columns(columns) + + return f"ALTER TABLE {table_name} ADD ({parsed_columns});" + + def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> str: + """Creates CQL statement to alter columns' types. + + Args: + columns: list of Diff objects with ALTER_TYPE kind. + table_name: table name. + + Returns: + Alter column type query. + + """ + parsed_columns = self._get_parsed_columns(columns) + + return f"ALTER TABLE {table_name} ALTER ({parsed_columns});" + + @staticmethod + def _get_create_table_query(columns: List[Dict[str, Any]], table_name: str) -> str: + """Creates CQL statement to create a table. + + Args: + columns: object that contains column's schemas. + table_name: table name. + + Returns: + Create table query. + + """ + parsed_columns = [] + primary_keys = [] + + for col in columns: + col_str = f"{col['column_name']} {col['type']}" + if col["primary_key"]: + primary_keys.append(col["column_name"]) + parsed_columns.append(col_str) + + joined_parsed_columns = ", ".join(parsed_columns) + + if len(primary_keys) > 0: + joined_primary_keys = ", ".join(primary_keys) + columns_str = ( + f"{joined_parsed_columns}, PRIMARY KEY ({joined_primary_keys})" + ) + else: + columns_str = joined_parsed_columns + + keyspace = CassandraConfig().keyspace + + return f"CREATE TABLE {keyspace}.{table_name} " f"({columns_str});" + + def _get_alter_table_drop_query(self, columns: List[Diff], table_name: str) -> str: + """Creates CQL statement to drop columns from a table. + + Args: + columns: list of Diff objects with DROP kind. + table_name: table name. + + Returns: + Drop columns from a given table query. 
+ + """ + parsed_columns = self._get_parsed_columns(columns) + + return f"ALTER TABLE {table_name} DROP ({parsed_columns});" + + def _get_queries( + self, schema_diff: Set[Diff], table_name: str, write_on_entity: bool = None + ) -> List[str]: + """Create the desired queries for migration. + + Args: + schema_diff: list of Diff objects. + table_name: table name. + + Returns: + List of queries. + + """ + add_items = [] + drop_items = [] + alter_type_items = [] + alter_key_items = [] + + for diff in schema_diff: + if diff.kind == Diff.Kind.ADD: + add_items.append(diff) + elif diff.kind == Diff.Kind.ALTER_TYPE: + alter_type_items.append(diff) + elif diff.kind == Diff.Kind.DROP: + drop_items.append(diff) + elif diff.kind == Diff.Kind.ALTER_KEY: + alter_key_items.append(diff) + + queries = [] + if add_items: + alter_table_add_query = self._get_alter_table_add_query( + add_items, table_name + ) + queries.append(alter_table_add_query) + if drop_items: + if write_on_entity: + logging.info( + "Features will not be dropped automatically " + "when data is loaded to an entity table" + ) + else: + drop_columns_query = self._get_alter_table_drop_query( + drop_items, table_name + ) + queries.append(drop_columns_query) + if alter_type_items: + alter_column_types_query = self._get_alter_column_type_query( + alter_type_items, table_name + ) + queries.append(alter_column_types_query) + if alter_key_items: + logging.info("This operation is not supported by Cassandra DB.") + + return queries diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 032a7acc0..a2106f3c2 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -15,9 +15,9 @@ class Kind(Enum): """Mapping actions to take given a difference between columns of a schema.""" ADD = auto() - DROP = auto() - ALTER_TYPE = auto() ALTER_KEY = auto() + ALTER_TYPE = auto() + DROP = auto() column: str kind: Kind @@ -40,18 +40,56 @@ class DatabaseMigration(ABC): """Abstract base class for Migrations.""" @abstractmethod + def _get_create_table_query( + self, columns: List[Dict[str, Any]], table_name: str + ) -> Any: + """Creates desired statement to create a table. + + Args: + columns: object that contains column's schemas. + table_name: table name. + + Returns: + Create table query. + + """ + pass + + @abstractmethod + def _get_queries( + self, schema_diff: Set[Diff], table_name: str, write_on_entity: bool = None + ) -> Any: + """Create the desired queries for migration. + + Args: + schema_diff: list of Diff objects. + table_name: table name. + + Returns: + List of queries. + + """ + pass + def create_query( self, + fs_schema: List[Dict[str, Any]], table_name: str, db_schema: List[Dict[str, Any]] = None, - diff_schema: List[Dict[str, Any]] = None, + write_on_entity: bool = None, ) -> Any: """Create a query regarding a data source. Returns: - The desired query for the given database. + The desired queries for the given database. """ + if not db_schema: + return [self._get_create_table_query(fs_schema, table_name)] + + schema_diff = self._get_diff(fs_schema, db_schema) + + return self._get_queries(schema_diff, table_name, write_on_entity) def _apply_migration(self, feature_set: FeatureSet) -> None: """Apply the migration in the respective database.""" @@ -67,6 +105,9 @@ def _get_diff( fs_schema: object that contains feature set's schemas. 
db_schema: object that contains the table of a given db schema. + Returns: + Object with schema differences. + """ db_columns = set(item.get("column_name") for item in db_schema) fs_columns = set(item.get("column_name") for item in fs_schema) @@ -78,9 +119,14 @@ def _get_diff( # Dict[str, Any] where each key would be the column name itself... # but changing that could break things so: # TODO version 2 change get schema to return a dict(columns, properties) + add_type_columns = dict() alter_type_columns = dict() alter_key_columns = dict() for fs_item in fs_schema: + if fs_item.get("column_name") in add_columns: + add_type_columns.update( + {fs_item.get("column_name"): fs_item.get("type")} + ) for db_item in db_schema: if fs_item.get("column_name") == db_item.get("column_name"): if fs_item.get("type") != db_item.get("type"): @@ -94,7 +140,8 @@ def _get_diff( break schema_diff = set( - Diff(str(col), kind=Diff.Kind.ADD, value=None) for col in add_columns + Diff(str(col), kind=Diff.Kind.ADD, value=value) + for col, value in add_type_columns.items() ) schema_diff |= set( Diff(str(col), kind=Diff.Kind.DROP, value=None) for col in drop_columns @@ -104,7 +151,7 @@ def _get_diff( for col, value in alter_type_columns.items() ) schema_diff |= set( - Diff(str(col), kind=Diff.Kind.ALTER_KEY, value=value) + Diff(str(col), kind=Diff.Kind.ALTER_KEY, value=None) for col, value in alter_key_columns.items() ) return schema_diff diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index ae0dd1829..4f51ddf2f 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -18,9 +18,10 @@ def __init__(self) -> None: def create_query( self, + fs_schema: List[Dict[str, Any]], table_name: str, db_schema: List[Dict[str, Any]] = None, - diff_schema: List[Dict[str, Any]] = None, + write_on_entity: bool = None, ) -> Any: """Create a query regarding Metastore. 
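A minimal sketch of the new base-class create_query dispatch shown above, exercised
through the Cassandra implementation whose query builders were just introduced (the
schema literals and table name are illustrative, and a configured Cassandra
environment is assumed):

    from butterfree.migrations.database_migration import CassandraMigration

    fs_schema = [
        {"column_name": "id", "type": "bigint", "primary_key": True},
        {"column_name": "feature1", "type": "float", "primary_key": False},
    ]
    db_schema = [{"column_name": "id", "type": "bigint", "primary_key": True}]

    migration = CassandraMigration()

    # Without a db_schema, create_query returns a single CREATE TABLE statement.
    create_queries = migration.create_query(fs_schema, "table_name")

    # With a db_schema, the schema diff is computed first; here feature1 is
    # missing from the database, so an ALTER TABLE ... ADD statement is built.
    alter_queries = migration.create_query(fs_schema, "table_name", db_schema=db_schema)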
diff --git a/tests/unit/butterfree/migrations/database_migration/conftest.py b/tests/unit/butterfree/migrations/database_migration/conftest.py index f737c4dc8..bcf7f7f3a 100644 --- a/tests/unit/butterfree/migrations/database_migration/conftest.py +++ b/tests/unit/butterfree/migrations/database_migration/conftest.py @@ -1,4 +1,4 @@ -from pyspark.sql.types import DoubleType, LongType, TimestampType +from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType from pytest import fixture @@ -18,3 +18,17 @@ def db_schema(): "primary_key": False, }, ] + + +@fixture +def fs_schema(): + return [ + {"column_name": "id", "type": LongType(), "primary_key": True}, + {"column_name": "timestamp", "type": TimestampType(), "primary_key": True}, + {"column_name": "new_feature", "type": FloatType(), "primary_key": False}, + { + "column_name": "feature1__avg_over_1_week_rolling_windows", + "type": FloatType(), + "primary_key": False, + }, + ] diff --git a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py new file mode 100644 index 000000000..8f16a1d2f --- /dev/null +++ b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py @@ -0,0 +1,41 @@ +from butterfree.migrations.database_migration import CassandraMigration + + +class TestCassandraMigration: + def test_queries(self, fs_schema, db_schema): + cassandra_migration = CassandraMigration() + expected_query = [ + "ALTER TABLE table_name ADD (new_feature FloatType);", + "ALTER TABLE table_name DROP (feature1__avg_over_2_days_rolling_windows);", + "ALTER TABLE table_name ALTER " + "(feature1__avg_over_1_week_rolling_windows FloatType);", + ] + query = cassandra_migration.create_query(fs_schema, "table_name", db_schema) + + assert query, expected_query + + def test_queries_on_entity(self, fs_schema, db_schema): + cassandra_migration = CassandraMigration() + expected_query = [ + "ALTER TABLE table_name ADD (new_feature FloatType);", + "ALTER TABLE table_name ALTER " + "(feature1__avg_over_1_week_rolling_windows FloatType);", + ] + query = cassandra_migration.create_query( + fs_schema, "table_name", db_schema, True + ) + + assert query, expected_query + + def test_create_table_query(self, fs_schema): + + cassandra_migration = CassandraMigration() + expected_query = [ + "CREATE TABLE test.table_name " + "(id LongType, timestamp TimestampType, new_feature FloatType, " + "feature1__avg_over_1_week_rolling_windows FloatType, " + "PRIMARY KEY (id, timestamp));" + ] + query = cassandra_migration.create_query(fs_schema, "table_name") + + assert query, expected_query diff --git a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py index aa272317e..30277992e 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py @@ -36,8 +36,8 @@ def test__get_diff(self, mocker, db_schema): }, ] expected_diff = { - Diff("timestamp", kind=Diff.Kind.ALTER_KEY, value=True), - Diff("new_feature", kind=Diff.Kind.ADD, value=None), + Diff("timestamp", kind=Diff.Kind.ALTER_KEY, value=None), + Diff("new_feature", kind=Diff.Kind.ADD, value=FloatType()), Diff( "feature1__avg_over_2_days_rolling_windows", kind=Diff.Kind.DROP, From b518dbcc132b3833a8eb1bde5bdc202687bf1db7 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade 
<45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Tue, 16 Mar 2021 14:49:58 -0300 Subject: [PATCH 17/70] [MLOP-671] Implement get_schema on Spark client (#301) --- butterfree/clients/spark_client.py | 66 ++++++++++++++++++- .../butterfree/clients/test_spark_client.py | 33 ++++++++++ 2 files changed, 98 insertions(+), 1 deletion(-) diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index 0f0113e21..09a1bcd9b 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -1,5 +1,6 @@ """SparkClient entity.""" +import json from typing import Any, Dict, List, Optional, Union from pyspark.sql import DataFrame, DataFrameReader, SparkSession @@ -216,7 +217,8 @@ def write_table( **options, ) - def create_temporary_view(self, dataframe: DataFrame, name: str) -> Any: + @staticmethod + def create_temporary_view(dataframe: DataFrame, name: str) -> Any: """Create a temporary view from a given dataframe. Args: @@ -271,3 +273,65 @@ def add_table_partitions( ) self.conn.sql(command) + + @staticmethod + def _filter_schema(schema: DataFrame) -> List[str]: + """Returns filtered schema with the desired information. + + Attributes: + schema: desired table. + + Returns: + A list of strings in the format + ['{"column_name": "example1", type: "Spark_type"}', ...] + + """ + return ( + schema.filter( + ~schema.col_name.isin( + ["# Partition Information", "# col_name", "year", "month", "day"] + ) + ) + .toJSON() + .collect() + ) + + def _convert_schema(self, schema: DataFrame) -> List[Dict[str, str]]: + """Returns schema with the desired information. + + Attributes: + schema: desired table. + + Returns: + A list of dictionaries in the format + [{"column_name": "example1", type: "Spark_type"}, ...] + + """ + schema_list = self._filter_schema(schema) + converted_schema = [] + for row in schema_list: + converted_schema.append(json.loads(row)) + + return converted_schema + + def get_schema(self, table: str, database: str) -> List[Dict[str, str]]: + """Returns desired table schema. + + Attributes: + table: desired table. + + Returns: + A list of dictionaries in the format + [{"column_name": "example1", type: "Spark_type"}, ...] 
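+
+        A usage sketch (the table and database names are illustrative and the
+        table is assumed to already exist in the metastore):
+
+            from butterfree.clients import SparkClient
+
+            client = SparkClient()
+            schema = client.get_schema(table="my_table", database="my_db")
+            # each entry describes one column of my_db.my_table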
+ + """ + query = f"DESCRIBE {database}.{table} " # noqa + + response = self.sql(query) + + if not response: + raise RuntimeError( + f"No columns found for table: {table}" f"in database: {database}" + ) + + return self._convert_schema(response) diff --git a/tests/unit/butterfree/clients/test_spark_client.py b/tests/unit/butterfree/clients/test_spark_client.py index 9f6415062..dc40841c9 100644 --- a/tests/unit/butterfree/clients/test_spark_client.py +++ b/tests/unit/butterfree/clients/test_spark_client.py @@ -15,6 +15,15 @@ def create_temp_view(dataframe: DataFrame, name: str) -> None: dataframe.createOrReplaceTempView(name) +def create_db_and_table(spark, database, table, view): + spark.sql(f"create database if not exists {database}") + spark.sql(f"use {database}") + spark.sql( + f"create table if not exists {database}.{table} " # noqa + f"as select * from {view}" # noqa + ) + + class TestSparkClient: def test_conn(self) -> None: # arrange @@ -293,3 +302,27 @@ def test_add_invalid_partitions(self, mock_spark_sql: Mock, partition): # act and assert with pytest.raises(ValueError): spark_client.add_table_partitions(partition, "table", "db") + + def test_get_schema( + self, target_df: DataFrame, spark_session: SparkSession + ) -> None: + # arrange + spark_client = SparkClient() + create_temp_view(dataframe=target_df, name="temp_view") + create_db_and_table( + spark=spark_session, + database="test_db", + table="test_table", + view="temp_view", + ) + + expected_schema = [ + {"col_name": "col1", "data_type": "string"}, + {"col_name": "col2", "data_type": "bigint"}, + ] + + # act + schema = spark_client.get_schema(table="test_table", database="test_db") + + # assert + assert schema, expected_schema From 5fe4c40777c6258cfe7361552179f89f46e510c3 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Tue, 16 Mar 2021 16:49:20 -0300 Subject: [PATCH 18/70] [MLOP-648] Implement query method, metastore (#294) --- butterfree/constants/migrations.py | 8 ++ .../database_migration/cassandra_migration.py | 58 +------- .../database_migration/database_migration.py | 84 ++++++++++- .../database_migration/metastore_migration.py | 131 +++++++++++++++--- .../test_metastore_migration.py | 50 +++++++ 5 files changed, 257 insertions(+), 74 deletions(-) create mode 100644 butterfree/constants/migrations.py create mode 100644 tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py diff --git a/butterfree/constants/migrations.py b/butterfree/constants/migrations.py new file mode 100644 index 000000000..b1c0947db --- /dev/null +++ b/butterfree/constants/migrations.py @@ -0,0 +1,8 @@ +"""Migrations' Constants.""" +from butterfree.constants import columns + +PARTITION_BY = [ + {"column_name": columns.PARTITION_YEAR, "type": "INT"}, + {"column_name": columns.PARTITION_MONTH, "type": "INT"}, + {"column_name": columns.PARTITION_DAY, "type": "INT"}, +] diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py index 4141a7d51..3d26673f4 100644 --- a/butterfree/migrations/database_migration/cassandra_migration.py +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -1,7 +1,6 @@ """Cassandra Migration entity.""" -import logging -from typing import Any, Dict, List, Set +from typing import Any, Dict, List from butterfree.clients import CassandraClient from butterfree.configs.db import CassandraConfig @@ -138,58 +137,3 @@ def 
_get_alter_table_drop_query(self, columns: List[Diff], table_name: str) -> s parsed_columns = self._get_parsed_columns(columns) return f"ALTER TABLE {table_name} DROP ({parsed_columns});" - - def _get_queries( - self, schema_diff: Set[Diff], table_name: str, write_on_entity: bool = None - ) -> List[str]: - """Create the desired queries for migration. - - Args: - schema_diff: list of Diff objects. - table_name: table name. - - Returns: - List of queries. - - """ - add_items = [] - drop_items = [] - alter_type_items = [] - alter_key_items = [] - - for diff in schema_diff: - if diff.kind == Diff.Kind.ADD: - add_items.append(diff) - elif diff.kind == Diff.Kind.ALTER_TYPE: - alter_type_items.append(diff) - elif diff.kind == Diff.Kind.DROP: - drop_items.append(diff) - elif diff.kind == Diff.Kind.ALTER_KEY: - alter_key_items.append(diff) - - queries = [] - if add_items: - alter_table_add_query = self._get_alter_table_add_query( - add_items, table_name - ) - queries.append(alter_table_add_query) - if drop_items: - if write_on_entity: - logging.info( - "Features will not be dropped automatically " - "when data is loaded to an entity table" - ) - else: - drop_columns_query = self._get_alter_table_drop_query( - drop_items, table_name - ) - queries.append(drop_columns_query) - if alter_type_items: - alter_column_types_query = self._get_alter_column_type_query( - alter_type_items, table_name - ) - queries.append(alter_column_types_query) - if alter_key_items: - logging.info("This operation is not supported by Cassandra DB.") - - return queries diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index a2106f3c2..160f67282 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -1,4 +1,5 @@ """Migration entity.""" +import logging from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum, auto @@ -56,6 +57,47 @@ def _get_create_table_query( pass @abstractmethod + def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> str: + """Creates desired statement to add columns to a table. + + Args: + columns: list of Diff objects with ADD kind. + table_name: table name. + + Returns: + Alter table query. + + """ + pass + + @abstractmethod + def _get_alter_table_drop_query(self, columns: List[Diff], table_name: str) -> str: + """Creates desired statement to drop columns from a table. + + Args: + columns: list of Diff objects with DROP kind. + table_name: table name. + + Returns: + Drop columns from a given table query. + + """ + pass + + @abstractmethod + def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> str: + """Creates desired statement to alter columns' types. + + Args: + columns: list of Diff objects with ALTER_TYPE kind. + table_name: table name. + + Returns: + Alter column type query. + + """ + pass + def _get_queries( self, schema_diff: Set[Diff], table_name: str, write_on_entity: bool = None ) -> Any: @@ -69,7 +111,47 @@ def _get_queries( List of queries. 
""" - pass + add_items = [] + drop_items = [] + alter_type_items = [] + alter_key_items = [] + + for diff in schema_diff: + if diff.kind == Diff.Kind.ADD: + add_items.append(diff) + elif diff.kind == Diff.Kind.ALTER_TYPE: + alter_type_items.append(diff) + elif diff.kind == Diff.Kind.DROP: + drop_items.append(diff) + elif diff.kind == Diff.Kind.ALTER_KEY: + alter_key_items.append(diff) + + queries = [] + if add_items: + alter_table_add_query = self._get_alter_table_add_query( + add_items, table_name + ) + queries.append(alter_table_add_query) + if drop_items: + if write_on_entity: + logging.info( + "Features will not be dropped automatically " + "when data is loaded to an entity table" + ) + else: + drop_columns_query = self._get_alter_table_drop_query( + drop_items, table_name + ) + queries.append(drop_columns_query) + if alter_type_items: + alter_column_types_query = self._get_alter_column_type_query( + alter_type_items, table_name + ) + queries.append(alter_column_types_query) + if alter_key_items: + logging.info("This operation is not supported by Spark.") + + return queries def create_query( self, diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index 4f51ddf2f..89017374f 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -2,31 +2,130 @@ from typing import Any, Dict, List -from butterfree.clients import SparkClient -from butterfree.configs.db import MetastoreConfig +from butterfree.configs import environment +from butterfree.constants.migrations import PARTITION_BY from butterfree.migrations.database_migration.database_migration import ( DatabaseMigration, + Diff, ) class MetastoreMigration(DatabaseMigration): - """Metastore class for Migrations.""" + """MetastoreMigration class for performing migrations. - def __init__(self) -> None: - self._db_config = MetastoreConfig() - self._client = SparkClient() + This class implements some methods of the parent DatabaseMigration class and + has specific methods for query building. + The MetastoreMigration class will be used, as the name suggests, for applying + changes to a given Metastore table. There are, however, some remarks that need + to be highlighted: + - If an existing feature has its type changed, then it's extremely important to + make sure that this conversion would not result in data loss; + - If new features are added to your feature set, then they're going to be added + to the corresponding Metastore table; + - Since feature sets can be written both to a feature set and an entity table, + we're not going to automatically drop features when using entity tables, since + it means that some features belong to a different feature set. In summary, if + data is being loaded into an entity table, then users can drop columns manually. + """ - def create_query( - self, - fs_schema: List[Dict[str, Any]], - table_name: str, - db_schema: List[Dict[str, Any]] = None, - write_on_entity: bool = None, - ) -> Any: - """Create a query regarding Metastore. + def __init__( + self, database: str = None, + ): + self.database = database or environment.get_variable( + "FEATURE_STORE_HISTORICAL_DATABASE" + ) + + @staticmethod + def _get_parsed_columns(columns: List[Diff]) -> List[str]: + """Parse columns from a list of Diff objects. + + Args: + columns: list of Diff objects. + + Returns: + Parsed columns. 
+ + """ + parsed_columns = [] + for col in columns: + parsed_columns.append(f"{col.column} {col.value}") + + parsed_columns = ", ".join(parsed_columns) # type: ignore + + return parsed_columns + + def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> str: + """Creates SQL statement to add columns to a table. + + Args: + columns: list of Diff objects with ADD kind. + table_name: table name. + + Returns: + Alter table query. + + """ + parsed_columns = self._get_parsed_columns(columns) + + return ( + f"ALTER TABLE {self.database}.{table_name} " + f"ADD IF NOT EXISTS columns ({parsed_columns});" + ) + + def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> str: + """Creates SQL statement to alter columns' types. + + Args: + columns: list of Diff objects with ALTER_TYPE kind. + table_name: table name. Returns: - Schema object. + Alter column type query. """ - pass + parsed_columns = self._get_parsed_columns(columns) + + return f"ALTER TABLE {table_name} ALTER COLUMN ({parsed_columns});" + + def _get_create_table_query( + self, columns: List[Dict[str, Any]], table_name: str + ) -> str: + """Creates SQL statement to create a table. + + Args: + columns: object that contains column's schemas. + table_name: table name. + + Returns: + Create table query. + + """ + columns.extend(PARTITION_BY) + + parsed_columns = [] + for col in columns: + parsed_columns.append(f"{col['column_name']} {col['type']}") + parsed_columns = ", ".join(parsed_columns) # type: ignore + + return ( + f"CREATE TABLE IF NOT EXISTS " + f"{self.database}.{table_name} ({parsed_columns}) " + f"PARTITIONED BY ({PARTITION_BY[0]['column_name']}, " + f"{PARTITION_BY[1]['column_name']}, " + f"{PARTITION_BY[2]['column_name']});" + ) + + def _get_alter_table_drop_query(self, columns: List[Diff], table_name: str) -> str: + """Creates SQL statement to drop columns from a table. + + Args: + columns: list of Diff objects with DROP kind. + table_name: table name. + + Returns: + Drop columns from a given table query. 
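+
+        Taken together, the builders in this class are driven by the base-class
+        create_query dispatch. A minimal sketch (database, table and schema
+        literals are illustrative):
+
+            fs_schema = [
+                {"column_name": "id", "type": "BIGINT", "primary_key": True},
+                {"column_name": "feature1", "type": "FLOAT", "primary_key": False},
+            ]
+            migration = MetastoreMigration(database="my_db")
+            # With no db_schema, a partitioned CREATE TABLE IF NOT EXISTS
+            # statement is returned, with year/month/day columns appended
+            # from PARTITION_BY.
+            queries = migration.create_query(fs_schema, "my_feature_set")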
+ + """ + parsed_columns = self._get_parsed_columns(columns) + + return f"ALTER TABLE {table_name} DROP IF EXISTS ({parsed_columns});" diff --git a/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py b/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py new file mode 100644 index 000000000..fd1dfad89 --- /dev/null +++ b/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py @@ -0,0 +1,50 @@ +from butterfree.migrations.database_migration import MetastoreMigration + + +class TestMetastoreMigration: + def test_queries(self, fs_schema, db_schema): + metastore_migration = MetastoreMigration() + + expected_query = [ + "ALTER TABLE test.table_name ADD IF NOT EXISTS " + "columns (new_feature FloatType);", + "ALTER TABLE table_name DROP IF EXISTS " + "(feature1__avg_over_2_days_rolling_windows None);", + "ALTER TABLE table_name ALTER COLUMN " + "(feature1__avg_over_1_week_rolling_windows FloatType);", + ] + + query = metastore_migration.create_query(fs_schema, "table_name", db_schema) + + assert query, expected_query + + def test_queries_on_entity(self, fs_schema, db_schema): + metastore_migration = MetastoreMigration() + + expected_query = [ + "ALTER TABLE test.table_name ADD IF NOT EXISTS " + "columns (new_feature FloatType);", + "ALTER TABLE table_name ALTER COLUMN " + "(feature1__avg_over_1_week_rolling_windows FloatType);", + ] + + query = metastore_migration.create_query( + fs_schema, "table_name", db_schema, True + ) + + assert query, expected_query + + def test_create_table_query(self, fs_schema): + + metastore_migration = MetastoreMigration() + + expected_query = [ + "CREATE TABLE IF NOT EXISTS test.table_name " + "(id LongType, timestamp TimestampType, new_feature FloatType, " + "feature1__avg_over_1_week_rolling_windows FloatType, year INT, " + "month INT, day INT) PARTITIONED BY (year, month, day);" + ] + + query = metastore_migration.create_query(fs_schema, "table_name") + + assert query, expected_query From e8fc0dadd6f17c64b2f77c2c3311b7e294012436 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Mon, 22 Mar 2021 14:20:58 -0300 Subject: [PATCH 19/70] Fix Validation Step (#302) --- butterfree/configs/db/metastore_config.py | 2 +- .../historical_feature_store_writer.py | 9 +---- .../writers/online_feature_store_writer.py | 18 +-------- setup.py | 2 +- .../integration/butterfree/load/test_sink.py | 11 +----- .../pipelines/test_feature_set_pipeline.py | 22 +---------- tests/unit/butterfree/load/test_sink.py | 32 +--------------- .../test_historical_feature_store_writer.py | 22 ----------- .../test_online_feature_store_writer.py | 38 ++----------------- 9 files changed, 12 insertions(+), 144 deletions(-) diff --git a/butterfree/configs/db/metastore_config.py b/butterfree/configs/db/metastore_config.py index a3b315d55..a3013de9c 100644 --- a/butterfree/configs/db/metastore_config.py +++ b/butterfree/configs/db/metastore_config.py @@ -117,4 +117,4 @@ def get_path_with_partitions(self, key: str, dataframe: DataFrame) -> List: def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Translate feature set spark schema to the corresponding database.""" - pass + return schema diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index 456d9e6bd..6274840c8 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ 
b/butterfree/load/writers/historical_feature_store_writer.py @@ -140,14 +140,7 @@ def write( """ dataframe = self._create_partitions(dataframe) - partition_df = self._apply_transformations(dataframe) - - if self.debug_mode: - dataframe = partition_df - else: - dataframe = self.check_schema( - spark_client, partition_df, feature_set.name, self.database - ) + dataframe = self._apply_transformations(dataframe) if self.interval_mode: if self.debug_mode: diff --git a/butterfree/load/writers/online_feature_store_writer.py b/butterfree/load/writers/online_feature_store_writer.py index fade37896..310f54cc8 100644 --- a/butterfree/load/writers/online_feature_store_writer.py +++ b/butterfree/load/writers/online_feature_store_writer.py @@ -7,7 +7,7 @@ from pyspark.sql.functions import col, row_number from pyspark.sql.streaming import StreamingQuery -from butterfree.clients import CassandraClient, SparkClient +from butterfree.clients import SparkClient from butterfree.configs.db import AbstractWriteConfig, CassandraConfig from butterfree.constants.columns import TIMESTAMP_COLUMN from butterfree.hooks import Hook @@ -180,22 +180,6 @@ def write( """ table_name = feature_set.entity if self.write_to_entity else feature_set.name - if not self.debug_mode: - config = ( - self.db_config - if self.db_config == CassandraConfig - else CassandraConfig() - ) - - cassandra_client = CassandraClient( - host=[config.host], - keyspace=config.keyspace, - user=config.username, - password=config.password, - ) - - dataframe = self.check_schema(cassandra_client, dataframe, table_name) - if dataframe.isStreaming: dataframe = self._apply_transformations(dataframe) if self.debug_mode: diff --git a/setup.py b/setup.py index d211098c8..2dece452d 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev2" +__version__ = "1.2.0.dev3" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/integration/butterfree/load/test_sink.py b/tests/integration/butterfree/load/test_sink.py index f507a3354..b5f97879b 100644 --- a/tests/integration/butterfree/load/test_sink.py +++ b/tests/integration/butterfree/load/test_sink.py @@ -9,7 +9,7 @@ ) -def test_sink(input_dataframe, feature_set, mocker): +def test_sink(input_dataframe, feature_set): # arrange client = SparkClient() client.conn.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic") @@ -34,11 +34,6 @@ def test_sink(input_dataframe, feature_set, mocker): db_config=s3config, interval_mode=True ) - schema_dataframe = historical_writer._create_partitions(feature_set_df) - historical_writer.check_schema_hook = mocker.stub("check_schema_hook") - historical_writer.check_schema_hook.run = mocker.stub("run") - historical_writer.check_schema_hook.run.return_value = schema_dataframe - # setup online writer # TODO: Change for CassandraConfig when Cassandra for test is ready online_config = Mock() @@ -49,10 +44,6 @@ def test_sink(input_dataframe, feature_set, mocker): ) online_writer = OnlineFeatureStoreWriter(db_config=online_config) - online_writer.check_schema_hook = mocker.stub("check_schema_hook") - online_writer.check_schema_hook.run = mocker.stub("run") - online_writer.check_schema_hook.run.return_value = feature_set_df - writers = [historical_writer, online_writer] sink = Sink(writers) diff --git a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py 
b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py index a302dc9e0..753dfe7c2 100644 --- a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py @@ -4,7 +4,6 @@ from pyspark.sql import DataFrame from pyspark.sql import functions as F -from butterfree.clients import SparkClient from butterfree.configs import environment from butterfree.configs.db import MetastoreConfig from butterfree.constants import DataType @@ -75,11 +74,7 @@ def create_ymd(dataframe): class TestFeatureSetPipeline: def test_feature_set_pipeline( - self, - mocked_df, - spark_session, - fixed_windows_output_feature_set_dataframe, - mocker, + self, mocked_df, spark_session, fixed_windows_output_feature_set_dataframe, ): # arrange table_reader_id = "a_source" @@ -93,11 +88,6 @@ def test_feature_set_pipeline( table_reader_table=table_reader_table, ) - spark_client = SparkClient() - spark_client.conn.conf.set( - "spark.sql.sources.partitionOverwriteMode", "dynamic" - ) - dbconfig = Mock() dbconfig.mode = "overwrite" dbconfig.format_ = "parquet" @@ -107,12 +97,6 @@ def test_feature_set_pipeline( historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig) - historical_writer.check_schema_hook = mocker.stub("check_schema_hook") - historical_writer.check_schema_hook.run = mocker.stub("run") - historical_writer.check_schema_hook.run.return_value = ( - fixed_windows_output_feature_set_dataframe - ) - # act test_pipeline = FeatureSetPipeline( source=Source( @@ -187,7 +171,6 @@ def test_feature_set_pipeline_with_dates( spark_session, fixed_windows_output_feature_set_date_dataframe, feature_set_pipeline, - mocker, ): # arrange table_reader_table = "b_table" @@ -211,7 +194,6 @@ def test_feature_set_pipeline_with_execution_date( spark_session, fixed_windows_output_feature_set_date_dataframe, feature_set_pipeline, - mocker, ): # arrange table_reader_table = "b_table" @@ -233,7 +215,7 @@ def test_feature_set_pipeline_with_execution_date( # assert assert_dataframe_equality(df, target_df) - def test_pipeline_with_hooks(self, spark_session, mocker): + def test_pipeline_with_hooks(self, spark_session): # arrange hook1 = AddHook(value=1) diff --git a/tests/unit/butterfree/load/test_sink.py b/tests/unit/butterfree/load/test_sink.py index ef377f67b..517f651e0 100644 --- a/tests/unit/butterfree/load/test_sink.py +++ b/tests/unit/butterfree/load/test_sink.py @@ -120,7 +120,7 @@ def test_flush_with_writers_list_empty(self): with pytest.raises(ValueError): Sink(writers=writer) - def test_flush_streaming_df(self, feature_set, mocker): + def test_flush_streaming_df(self, feature_set): """Testing the return of the streaming handlers by the sink.""" # arrange spark_client = SparkClient() @@ -137,24 +137,10 @@ def test_flush_streaming_df(self, feature_set, mocker): online_feature_store_writer = OnlineFeatureStoreWriter() - online_feature_store_writer.check_schema_hook = mocker.stub("check_schema_hook") - online_feature_store_writer.check_schema_hook.run = mocker.stub("run") - online_feature_store_writer.check_schema_hook.run.return_value = ( - mocked_stream_df - ) - online_feature_store_writer_on_entity = OnlineFeatureStoreWriter( write_to_entity=True ) - online_feature_store_writer_on_entity.check_schema_hook = mocker.stub( - "check_schema_hook" - ) - online_feature_store_writer_on_entity.check_schema_hook.run = mocker.stub("run") - online_feature_store_writer_on_entity.check_schema_hook.run.return_value = ( - mocked_stream_df - ) - sink = 
Sink( writers=[ online_feature_store_writer, @@ -177,7 +163,7 @@ def test_flush_streaming_df(self, feature_set, mocker): assert isinstance(handler, StreamingQuery) def test_flush_with_multiple_online_writers( - self, feature_set, feature_set_dataframe, mocker + self, feature_set, feature_set_dataframe ): """Testing the flow of writing to a feature-set table and to an entity table.""" # arrange @@ -189,24 +175,10 @@ def test_flush_with_multiple_online_writers( online_feature_store_writer = OnlineFeatureStoreWriter() - online_feature_store_writer.check_schema_hook = mocker.stub("check_schema_hook") - online_feature_store_writer.check_schema_hook.run = mocker.stub("run") - online_feature_store_writer.check_schema_hook.run.return_value = ( - feature_set_dataframe - ) - online_feature_store_writer_on_entity = OnlineFeatureStoreWriter( write_to_entity=True ) - online_feature_store_writer_on_entity.check_schema_hook = mocker.stub( - "check_schema_hook" - ) - online_feature_store_writer_on_entity.check_schema_hook.run = mocker.stub("run") - online_feature_store_writer_on_entity.check_schema_hook.run.return_value = ( - feature_set_dataframe - ) - sink = Sink( writers=[online_feature_store_writer, online_feature_store_writer_on_entity] ) diff --git a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py index aac806f7a..8bab23baf 100644 --- a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py +++ b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py @@ -23,11 +23,6 @@ def test_write( spark_client.write_table = mocker.stub("write_table") writer = HistoricalFeatureStoreWriter() - schema_dataframe = writer._create_partitions(feature_set_dataframe) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = schema_dataframe - # when writer.write( feature_set=feature_set, @@ -62,11 +57,6 @@ def test_write_interval_mode( ) writer = HistoricalFeatureStoreWriter(interval_mode=True) - schema_dataframe = writer._create_partitions(feature_set_dataframe) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = schema_dataframe - # when writer.write( feature_set=feature_set, @@ -104,11 +94,6 @@ def test_write_interval_mode_invalid_partition_mode( writer = HistoricalFeatureStoreWriter(interval_mode=True) - schema_dataframe = writer._create_partitions(feature_set_dataframe) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = schema_dataframe - # when with pytest.raises(RuntimeError): _ = writer.write( @@ -123,7 +108,6 @@ def test_write_in_debug_mode( historical_feature_set_dataframe, feature_set, spark_session, - mocker, ): # given spark_client = SparkClient() @@ -321,12 +305,6 @@ def test_write_with_transform( writer = HistoricalFeatureStoreWriter().with_(json_transform) - schema_dataframe = writer._create_partitions(feature_set_dataframe) - json_dataframe = writer._apply_transformations(schema_dataframe) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = json_dataframe - # when writer.write( feature_set=feature_set, diff --git 
a/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py b/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py index 384ec1527..78f6862ee 100644 --- a/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py +++ b/tests/unit/butterfree/load/writers/test_online_feature_store_writer.py @@ -68,10 +68,6 @@ def test_write( spark_client.write_dataframe = mocker.stub("write_dataframe") writer = OnlineFeatureStoreWriter(cassandra_config) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = feature_set_dataframe - # when writer.write(feature_set, feature_set_dataframe, spark_client) @@ -104,10 +100,6 @@ def test_write_in_debug_mode( spark_client = SparkClient() writer = OnlineFeatureStoreWriter(debug_mode=True) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = feature_set_dataframe - # when writer.write( feature_set=feature_set, @@ -119,7 +111,7 @@ def test_write_in_debug_mode( # then assert_dataframe_equality(latest_feature_set_dataframe, result_df) - def test_write_in_debug_and_stream_mode(self, feature_set, spark_session, mocker): + def test_write_in_debug_and_stream_mode(self, feature_set, spark_session): # arrange spark_client = SparkClient() @@ -132,10 +124,6 @@ def test_write_in_debug_and_stream_mode(self, feature_set, spark_session, mocker writer = OnlineFeatureStoreWriter(debug_mode=True) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = mocked_stream_df - # act handler = writer.write( feature_set=feature_set, @@ -151,7 +139,7 @@ def test_write_in_debug_and_stream_mode(self, feature_set, spark_session, mocker assert isinstance(handler, StreamingQuery) @pytest.mark.parametrize("has_checkpoint", [True, False]) - def test_write_stream(self, feature_set, has_checkpoint, monkeypatch, mocker): + def test_write_stream(self, feature_set, has_checkpoint, monkeypatch): # arrange spark_client = SparkClient() spark_client.write_stream = Mock() @@ -174,10 +162,6 @@ def test_write_stream(self, feature_set, has_checkpoint, monkeypatch, mocker): writer = OnlineFeatureStoreWriter(cassandra_config) writer.filter_latest = Mock() - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = dataframe - # act stream_handler = writer.write(feature_set, dataframe, spark_client) @@ -201,7 +185,7 @@ def test_get_db_schema(self, cassandra_config, test_feature_set, expected_schema assert schema == expected_schema - def test_write_stream_on_entity(self, feature_set, monkeypatch, mocker): + def test_write_stream_on_entity(self, feature_set, monkeypatch): """Test write method with stream dataframe and write_to_entity enabled. 
The main purpose of this test is assert the correct setup of stream checkpoint @@ -224,10 +208,6 @@ def test_write_stream_on_entity(self, feature_set, monkeypatch, mocker): writer = OnlineFeatureStoreWriter(write_to_entity=True) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = dataframe - # act stream_handler = writer.write(feature_set, dataframe, spark_client) @@ -256,10 +236,6 @@ def test_write_with_transform( spark_client.write_dataframe = mocker.stub("write_dataframe") writer = OnlineFeatureStoreWriter(cassandra_config).with_(json_transform) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = feature_set_dataframe - # when writer.write(feature_set, feature_set_dataframe, spark_client) @@ -293,10 +269,6 @@ def test_write_with_kafka_config( kafka_config = KafkaConfig() writer = OnlineFeatureStoreWriter(kafka_config).with_(json_transform) - writer.check_schema_hook = mocker.stub("check_schema_hook") - writer.check_schema_hook.run = mocker.stub("run") - writer.check_schema_hook.run.return_value = feature_set_dataframe - # when writer.write(feature_set, feature_set_dataframe, spark_client) @@ -320,10 +292,6 @@ def test_write_with_custom_kafka_config( json_transform ) - custom_writer.check_schema_hook = mocker.stub("check_schema_hook") - custom_writer.check_schema_hook.run = mocker.stub("run") - custom_writer.check_schema_hook.run.return_value = feature_set_dataframe - # when custom_writer.write(feature_set, feature_set_dataframe, spark_client) From 3d93a098f110108c50a1273abc382c1ad7a8b99f Mon Sep 17 00:00:00 2001 From: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Date: Tue, 23 Mar 2021 14:15:57 -0300 Subject: [PATCH 20/70] [MLOP-647] [MLOP-646] Apply migrations (#300) * add apply migration method * add test apply migration * add migrate actor with tests * mypy compliant * fix test interaction with mocked object * Rebase and some adjusts. 
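The core of this patch is the new Migrate actor: for every feature set pipeline it walks the sink writers and dispatches to the migration object registered for that writer's database. A minimal sketch of that dispatch, mirroring the Migrate.run method added to butterfree/_cli/migrate.py in this patch (all names are taken from the diff itself):

    from butterfree.migrations.database_migration import ALLOWED_DATABASE

    def run(pipelines):
        # pipelines: the set of FeatureSetPipeline objects discovered under PATH
        for pipeline in pipelines:
            for writer in pipeline.sink.writers:
                # each db config now exposes a `database` name ("cassandra",
                # "metastore", ...) used to look up the matching
                # DatabaseMigration instance
                migration = ALLOWED_DATABASE[writer.db_config.database]
                migration.apply_migration(pipeline.feature_set, writer)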
Co-authored-by: Mayara Moromisato --- butterfree/_cli/migrate.py | 28 ++++++++++- butterfree/configs/db/abstract_config.py | 5 ++ butterfree/configs/db/cassandra_config.py | 5 ++ butterfree/configs/db/kafka_config.py | 5 ++ butterfree/configs/db/metastore_config.py | 5 ++ .../historical_feature_store_writer.py | 9 ++-- .../writers/online_feature_store_writer.py | 8 +-- butterfree/load/writers/writer.py | 11 ++++- butterfree/migrations/__init__.py | 3 -- .../migrations/database_migration/__init__.py | 6 +++ .../database_migration/cassandra_migration.py | 12 +++-- .../database_migration/database_migration.py | 49 ++++++++++++++++--- .../database_migration/metastore_migration.py | 8 +-- butterfree/migrations/migrate.py | 41 ---------------- tests/unit/butterfree/_cli/test_migrate.py | 23 ++++++++- .../migrations/database_migration/conftest.py | 22 +++++++++ .../test_database_migration.py | 11 +++++ 17 files changed, 179 insertions(+), 72 deletions(-) delete mode 100644 butterfree/migrations/migrate.py diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index ee083f731..10b6310b1 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -8,6 +8,7 @@ import typer from butterfree._cli import cli_logger +from butterfree.migrations.database_migration import ALLOWED_DATABASE from butterfree.pipelines import FeatureSetPipeline app = typer.Typer() @@ -88,6 +89,28 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: ) +class Migrate: + """Execute migration operations in a Database based on pipeline Writer. + + Attributes: + pipelines: list of Feature Set Pipelines to use to migration. + """ + + def __init__(self, pipelines: Set[FeatureSetPipeline]) -> None: + self.pipelines = pipelines + + def _send_logs_to_s3(self) -> None: + """Send all migration logs to S3.""" + pass + + def run(self) -> None: + """Construct and apply the migrations.""" + for pipeline in self.pipelines: + for writer in pipeline.sink.writers: + migration = ALLOWED_DATABASE[writer.db_config.database] + migration.apply_migration(pipeline.feature_set, writer) + + @app.callback() def migrate(path: str = PATH) -> Set[FeatureSetPipeline]: """Scan and run database migrations for feature set pipelines defined under PATH. @@ -100,5 +123,6 @@ def migrate(path: str = PATH) -> Set[FeatureSetPipeline]: All pipelines must be under python modules inside path, so we can dynamically import and instantiate them. 
""" - # TODO call the Migration actor with all feature set pipeline objects - return __fs_objects(path) + pipe_set = __fs_objects(path) + Migrate(pipe_set).run() + return pipe_set diff --git a/butterfree/configs/db/abstract_config.py b/butterfree/configs/db/abstract_config.py index 8e98aab61..fbd48c534 100644 --- a/butterfree/configs/db/abstract_config.py +++ b/butterfree/configs/db/abstract_config.py @@ -7,6 +7,11 @@ class AbstractWriteConfig(ABC): """Abstract class for database write configurations with spark.""" + @property + @abstractmethod + def database(self) -> str: + """Database name.""" + @property @abstractmethod def mode(self) -> Any: diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index b58a2e0a2..e9329c5df 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -50,6 +50,11 @@ def __init__( self.stream_output_mode = stream_output_mode self.stream_checkpoint_path = stream_checkpoint_path + @property + def database(self) -> str: + """Database name.""" + return "cassandra" + @property def username(self) -> Optional[str]: """Username used in connection to Cassandra DB.""" diff --git a/butterfree/configs/db/kafka_config.py b/butterfree/configs/db/kafka_config.py index 67b2dc57c..79cad15b2 100644 --- a/butterfree/configs/db/kafka_config.py +++ b/butterfree/configs/db/kafka_config.py @@ -41,6 +41,11 @@ def __init__( self.stream_output_mode = stream_output_mode self.stream_checkpoint_path = stream_checkpoint_path + @property + def database(self) -> str: + """Database name.""" + return "kafka" + @property def kafka_topic(self) -> Optional[str]: """Kafka topic name.""" diff --git a/butterfree/configs/db/metastore_config.py b/butterfree/configs/db/metastore_config.py index a3013de9c..97a999c2d 100644 --- a/butterfree/configs/db/metastore_config.py +++ b/butterfree/configs/db/metastore_config.py @@ -35,6 +35,11 @@ def __init__( self.format_ = format_ self.file_system = file_system + @property + def database(self) -> str: + """Database name.""" + return "metastore" + @property def path(self) -> Optional[str]: """Bucket name.""" diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index 6274840c8..e3e9b9b7b 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -1,7 +1,7 @@ """Holds the Historical Feature Store writer class.""" import os -from typing import Any, Union +from typing import Any from pyspark.sql.dataframe import DataFrame from pyspark.sql.functions import dayofmonth, month, year @@ -106,7 +106,7 @@ class HistoricalFeatureStoreWriter(Writer): def __init__( self, - db_config: Union[AbstractWriteConfig, MetastoreConfig] = None, + db_config: AbstractWriteConfig = None, database: str = None, num_partitions: int = None, validation_threshold: float = DEFAULT_VALIDATION_THRESHOLD, @@ -114,8 +114,9 @@ def __init__( interval_mode: bool = False, check_schema_hook: Hook = None, ): - super(HistoricalFeatureStoreWriter, self).__init__(debug_mode, interval_mode) - self.db_config = db_config or MetastoreConfig() + super(HistoricalFeatureStoreWriter, self).__init__( + db_config or MetastoreConfig(), debug_mode, interval_mode + ) self.database = database or environment.get_variable( "FEATURE_STORE_HISTORICAL_DATABASE" ) diff --git a/butterfree/load/writers/online_feature_store_writer.py 
b/butterfree/load/writers/online_feature_store_writer.py index 310f54cc8..b51d99235 100644 --- a/butterfree/load/writers/online_feature_store_writer.py +++ b/butterfree/load/writers/online_feature_store_writer.py @@ -80,15 +80,15 @@ class OnlineFeatureStoreWriter(Writer): def __init__( self, - db_config: Union[AbstractWriteConfig, CassandraConfig] = None, + db_config: AbstractWriteConfig = None, debug_mode: bool = False, write_to_entity: bool = False, interval_mode: bool = False, check_schema_hook: Hook = None, ): - super(OnlineFeatureStoreWriter, self).__init__(debug_mode, interval_mode) - self.db_config = db_config or CassandraConfig() - self.write_to_entity = write_to_entity + super(OnlineFeatureStoreWriter, self).__init__( + db_config or CassandraConfig(), debug_mode, interval_mode, write_to_entity + ) self.check_schema_hook = check_schema_hook @staticmethod diff --git a/butterfree/load/writers/writer.py b/butterfree/load/writers/writer.py index 7e0f9018d..e12a4317e 100644 --- a/butterfree/load/writers/writer.py +++ b/butterfree/load/writers/writer.py @@ -7,6 +7,7 @@ from pyspark.sql.dataframe import DataFrame from butterfree.clients import SparkClient +from butterfree.configs.db import AbstractWriteConfig from butterfree.hooks import HookableComponent from butterfree.transform import FeatureSet @@ -19,11 +20,19 @@ class Writer(ABC, HookableComponent): """ - def __init__(self, debug_mode: bool = False, interval_mode: bool = False) -> None: + def __init__( + self, + db_config: AbstractWriteConfig, + debug_mode: bool = False, + interval_mode: bool = False, + write_to_entity: bool = False, + ) -> None: super().__init__() + self.db_config = db_config self.transformations: List[Dict[str, Any]] = [] self.debug_mode = debug_mode self.interval_mode = interval_mode + self.write_to_entity = write_to_entity def with_( self, transformer: Callable[..., DataFrame], *args: Any, **kwargs: Any diff --git a/butterfree/migrations/__init__.py b/butterfree/migrations/__init__.py index 39cabfb7c..791f5fefe 100644 --- a/butterfree/migrations/__init__.py +++ b/butterfree/migrations/__init__.py @@ -1,4 +1 @@ """Holds available migrations.""" -from butterfree.migrations.migrate import Migrate - -__all__ = ["Migrate"] diff --git a/butterfree/migrations/database_migration/__init__.py b/butterfree/migrations/database_migration/__init__.py index 7138c4450..e31800884 100644 --- a/butterfree/migrations/database_migration/__init__.py +++ b/butterfree/migrations/database_migration/__init__.py @@ -9,3 +9,9 @@ ) __all__ = ["CassandraMigration", "MetastoreMigration", "Diff"] + + +ALLOWED_DATABASE = { + "cassandra": CassandraMigration(), + "metastore": MetastoreMigration(), +} diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py index 3d26673f4..c511479b6 100644 --- a/butterfree/migrations/database_migration/cassandra_migration.py +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -32,11 +32,13 @@ class CassandraMigration(DatabaseMigration): def __init__(self) -> None: self._db_config = CassandraConfig() - self._client = CassandraClient( - host=[self._db_config.host], - keyspace=self._db_config.keyspace, # type: ignore - user=self._db_config.username, - password=self._db_config.password, + super(CassandraMigration, self).__init__( + CassandraClient( + host=[self._db_config.host], + keyspace=self._db_config.keyspace, # type: ignore + user=self._db_config.username, + password=self._db_config.password, + ) ) 
@staticmethod diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 160f67282..fb2b9e7f7 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -5,6 +5,8 @@ from enum import Enum, auto from typing import Any, Dict, List, Set +from butterfree.clients import AbstractClient +from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet @@ -40,6 +42,9 @@ def __eq__(self, other: object) -> bool: class DatabaseMigration(ABC): """Abstract base class for Migrations.""" + def __init__(self, client: AbstractClient) -> None: + self._client = client + @abstractmethod def _get_create_table_query( self, columns: List[Dict[str, Any]], table_name: str @@ -173,10 +178,6 @@ def create_query( return self._get_queries(schema_diff, table_name, write_on_entity) - def _apply_migration(self, feature_set: FeatureSet) -> None: - """Apply the migration in the respective database.""" - pass - @staticmethod def _get_diff( fs_schema: List[Dict[str, Any]], db_schema: List[Dict[str, Any]], @@ -238,11 +239,43 @@ def _get_diff( ) return schema_diff - def run(self, feature_set: FeatureSet) -> None: - """Runs the migrations. + def _get_schema(self, table_name: str) -> List[Dict[str, Any]]: + """Get a table schema in the respective database. Args: - feature_set: the feature set. + table_name: Table name to get schema. + Returns: + Schema object. """ - pass + try: + db_schema = self._client.get_schema(table_name) + except Exception: # noqa + db_schema = [] + return db_schema + + def apply_migration(self, feature_set: FeatureSet, writer: Writer,) -> None: + """Apply the migration in the respective database. + + Args: + feature_set: the feature set. + writer: the writer being used to load the feature set. + """ + logging.info(f"Migrating feature set: {feature_set.name}") + + table_name = ( + feature_set.name if not writer.write_to_entity else feature_set.entity + ) + + fs_schema = writer.db_config.translate(feature_set.get_schema()) + db_schema = self._get_schema(table_name) + + queries = self.create_query( + fs_schema, table_name, db_schema, writer.write_to_entity + ) + + for q in queries: + logging.info(f"Applying {q}...") + self._client.sql(q) + + logging.info(f"Feature Set migration finished successfully.") diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index 89017374f..1c5667db4 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -2,7 +2,9 @@ from typing import Any, Dict, List +from butterfree.clients import SparkClient from butterfree.configs import environment +from butterfree.configs.db import MetastoreConfig from butterfree.constants.migrations import PARTITION_BY from butterfree.migrations.database_migration.database_migration import ( DatabaseMigration, @@ -28,12 +30,12 @@ class MetastoreMigration(DatabaseMigration): data is being loaded into an entity table, then users can drop columns manually. 
""" - def __init__( - self, database: str = None, - ): + def __init__(self, database: str = None,) -> None: + self._db_config = MetastoreConfig() self.database = database or environment.get_variable( "FEATURE_STORE_HISTORICAL_DATABASE" ) + super(MetastoreMigration, self).__init__(SparkClient()) @staticmethod def _get_parsed_columns(columns: List[Diff]) -> List[str]: diff --git a/butterfree/migrations/migrate.py b/butterfree/migrations/migrate.py deleted file mode 100644 index f128dee1f..000000000 --- a/butterfree/migrations/migrate.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Holds the Migrator Class.""" - -from typing import Callable, List, Tuple - -from butterfree.pipelines import FeatureSetPipeline -from butterfree.transform import FeatureSet - - -class Migrate: - """Execute migration operations in a Database based on pipeline Writer. - - Attributes: - pipelines: list of Feature Set Pipelines to use to migration. - - """ - - def __init__(self, pipelines: List[FeatureSetPipeline]) -> None: - self.pipelines = pipelines - - def _parse_feature_set_pipeline( - self, pipeline: FeatureSetPipeline - ) -> List[Tuple[Callable, FeatureSet]]: - feature_set = pipeline.feature_set - migrations = [ - writer.db_config._migration_class for writer in pipeline.sink.writers - ] - - return [(migrate, feature_set) for migrate in migrations] - - def _send_logs_to_s3(self) -> None: - """Send all migration logs to S3.""" - pass - - def migration(self) -> None: - """Construct and apply the migrations.""" - migration_list = [ - self._parse_feature_set_pipeline(pipeline) for pipeline in self.pipelines - ] - - for migration, fs in migration_list: - migration.run(fs) diff --git a/tests/unit/butterfree/_cli/test_migrate.py b/tests/unit/butterfree/_cli/test_migrate.py index 6a63453fb..2e4b2db08 100644 --- a/tests/unit/butterfree/_cli/test_migrate.py +++ b/tests/unit/butterfree/_cli/test_migrate.py @@ -1,8 +1,29 @@ +from unittest.mock import call + from butterfree._cli import migrate +from butterfree.migrations.database_migration import ( + CassandraMigration, + MetastoreMigration, +) from butterfree.pipelines import FeatureSetPipeline -def test_migrate_success(): +def test_migrate_success(mocker): + mocker.patch.object(migrate.Migrate, "run") all_fs = migrate.migrate("tests/mocks/entities/") assert all(isinstance(fs, FeatureSetPipeline) for fs in all_fs) assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] + + +def test_migrate_all_pairs(mocker): + mocker.patch.object(MetastoreMigration, "apply_migration") + mocker.patch.object(CassandraMigration, "apply_migration") + all_fs = migrate.migrate("tests/mocks/entities/") + + assert MetastoreMigration.apply_migration.call_count == 2 + assert CassandraMigration.apply_migration.call_count == 2 + + metastore_pairs = [call(pipe.feature_set, pipe.sink.writers[0]) for pipe in all_fs] + cassandra_pairs = [call(pipe.feature_set, pipe.sink.writers[1]) for pipe in all_fs] + MetastoreMigration.apply_migration.assert_has_calls(metastore_pairs, any_order=True) + CassandraMigration.apply_migration.assert_has_calls(cassandra_pairs, any_order=True) diff --git a/tests/unit/butterfree/migrations/database_migration/conftest.py b/tests/unit/butterfree/migrations/database_migration/conftest.py index bcf7f7f3a..dcd96714f 100644 --- a/tests/unit/butterfree/migrations/database_migration/conftest.py +++ b/tests/unit/butterfree/migrations/database_migration/conftest.py @@ -1,6 +1,10 @@ from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType from pytest import 
fixture +from butterfree.constants import DataType +from butterfree.transform import FeatureSet +from butterfree.transform.features import Feature, KeyFeature, TimestampFeature + @fixture def db_schema(): @@ -32,3 +36,21 @@ def fs_schema(): "primary_key": False, }, ] + + +@fixture +def feature_set(): + feature_set = FeatureSet( + name="feature_set", + entity="entity", + description="description", + features=[ + Feature(name="feature_float", description="test", dtype=DataType.FLOAT,), + ], + keys=[ + KeyFeature(name="id", description="The device ID", dtype=DataType.BIGINT,) + ], + timestamp=TimestampFeature(), + ) + + return feature_set diff --git a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py index 30277992e..befb55a37 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py @@ -54,3 +54,14 @@ def test__get_diff(self, mocker, db_schema): m._client = mocker.stub("client") diff = m._get_diff(fs_schema, db_schema) assert diff == expected_diff + + def test_apply_migration(self, feature_set, mocker): + # given + m = CassandraMigration() + m.apply_migration = mocker.stub("apply_migration") + + # when + m.apply_migration(feature_set) + + # then + m.apply_migration.assert_called_once() From 0d309327258c1a61f0d1aeb484da2f67f97b7a88 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 30 Mar 2021 14:58:08 -0300 Subject: [PATCH 21/70] [BUG] Apply create_partitions to historical validate (#303) * Apply create_partitions to historical validate. * Remove comments and adjusts. --- butterfree/load/writers/historical_feature_store_writer.py | 5 +++-- setup.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index e3e9b9b7b..5defb00b9 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -225,7 +225,6 @@ def validate( Raises: AssertionError: if count of written data doesn't match count in current feature set dataframe. 
- """ table_name = ( f"{feature_set.name}" @@ -240,7 +239,9 @@ def validate( written_count = ( spark_client.read( self.db_config.format_, - path=self.db_config.get_path_with_partitions(table_name, dataframe), + path=self.db_config.get_path_with_partitions( + table_name, self._create_partitions(dataframe) + ), ).count() if self.interval_mode and not self.debug_mode else spark_client.read_table(table_name).count() diff --git a/setup.py b/setup.py index 2dece452d..a86ee0491 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev3" +__version__ = "1.2.0.dev4" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From d607297e004f2e2fa1b625d1218bf79dfe929aa8 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 30 Mar 2021 16:09:07 -0300 Subject: [PATCH 22/70] [BUG] Fix key path for validate read (#304) * Fix key path * bump version Co-authored-by: AlvaroMarquesAndrade <1a789766b1c4c8b679e80f11fa6d63d42fa4bcdf> --- butterfree/load/writers/historical_feature_store_writer.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index 5defb00b9..c43440419 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -227,7 +227,7 @@ def validate( feature set dataframe. """ table_name = ( - f"{feature_set.name}" + os.path.join("historical", feature_set.entity, feature_set.name) if self.interval_mode and not self.debug_mode else ( f"{self.database}.{feature_set.name}" diff --git a/setup.py b/setup.py index a86ee0491..3ff187370 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev4" +__version__ = "1.2.0.dev5" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 3dcd975d9b6d4ed2380bd3a7701160e546014cd2 Mon Sep 17 00:00:00 2001 From: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Date: Thu, 1 Apr 2021 11:10:03 -0300 Subject: [PATCH 23/70] [FIX] Add Partition types for Metastore (#305) --- .../migrations/database_migration/metastore_migration.py | 9 ++++----- .../database_migration/test_metastore_migration.py | 5 ++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index 1c5667db4..8b7c6af0e 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -102,8 +102,6 @@ def _get_create_table_query( Create table query. 
""" - columns.extend(PARTITION_BY) - parsed_columns = [] for col in columns: parsed_columns.append(f"{col['column_name']} {col['type']}") @@ -112,9 +110,10 @@ def _get_create_table_query( return ( f"CREATE TABLE IF NOT EXISTS " f"{self.database}.{table_name} ({parsed_columns}) " - f"PARTITIONED BY ({PARTITION_BY[0]['column_name']}, " - f"{PARTITION_BY[1]['column_name']}, " - f"{PARTITION_BY[2]['column_name']});" + f"PARTITIONED BY (" + f"{PARTITION_BY[0]['column_name']} {PARTITION_BY[0]['type']}, " + f"{PARTITION_BY[1]['column_name']} {PARTITION_BY[1]['type']}, " + f"{PARTITION_BY[2]['column_name']} {PARTITION_BY[2]['type']});" ) def _get_alter_table_drop_query(self, columns: List[Diff], table_name: str) -> str: diff --git a/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py b/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py index fd1dfad89..5bac93521 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py @@ -40,9 +40,8 @@ def test_create_table_query(self, fs_schema): expected_query = [ "CREATE TABLE IF NOT EXISTS test.table_name " - "(id LongType, timestamp TimestampType, new_feature FloatType, " - "feature1__avg_over_1_week_rolling_windows FloatType, year INT, " - "month INT, day INT) PARTITIONED BY (year, month, day);" + "(id LongType, timestamp TimestampType, new_feature FloatType) " + "PARTITIONED BY (year INT, month INT, day INT);" ] query = metastore_migration.create_query(fs_schema, "table_name") From 8077d8656fc4b6f5259f991c9797b90fe8e9b67c Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Thu, 1 Apr 2021 14:46:53 -0300 Subject: [PATCH 24/70] [MLOP-639] Track logs in S3 (#306) * Apply tracking logs and logging config. * Adjusts in CLI and logging.conf. * Some adjusts. * Change version to generate new dev package * Fix version. * Apply style. * Add new assert in the migrate unit test. 
--- butterfree/__init__.py | 7 +++ butterfree/_cli/__init__.py | 10 ---- butterfree/_cli/main.py | 2 +- butterfree/_cli/migrate.py | 56 ++++++++++++++----- butterfree/configs/db/metastore_config.py | 27 ++++++++- butterfree/constants/data_type.py | 29 +++++----- butterfree/logging.conf | 52 +++++++++++++++++ .../database_migration/database_migration.py | 10 ++-- logs/logging.json | 0 requirements.txt | 3 +- setup.py | 2 +- tests/unit/butterfree/_cli/test_migrate.py | 41 +++++++++----- .../test_database_migration.py | 3 +- 13 files changed, 181 insertions(+), 61 deletions(-) create mode 100644 butterfree/logging.conf create mode 100644 logs/logging.json diff --git a/butterfree/__init__.py b/butterfree/__init__.py index 18759b031..25b955c6e 100644 --- a/butterfree/__init__.py +++ b/butterfree/__init__.py @@ -1 +1,8 @@ """Module docstring example, following Google's docstring style.""" +import logging.config +import os +import sys + +sys.path.insert(0, os.path.abspath(".")) + +logging.config.fileConfig(fname="butterfree/logging.conf") diff --git a/butterfree/_cli/__init__.py b/butterfree/_cli/__init__.py index ec8a1792c..e69de29bb 100644 --- a/butterfree/_cli/__init__.py +++ b/butterfree/_cli/__init__.py @@ -1,10 +0,0 @@ -import logging - - -def __logger(name: str) -> logging.Logger: - format_ = "%(name)s:%(asctime)-15s:%(levelname)s:< %(message)s >" - logging.basicConfig(format=format_, level=logging.INFO) - return logging.getLogger(name) - - -cli_logger = __logger("butterfree") diff --git a/butterfree/_cli/main.py b/butterfree/_cli/main.py index e340bc1bd..636fdb25e 100644 --- a/butterfree/_cli/main.py +++ b/butterfree/_cli/main.py @@ -3,7 +3,7 @@ from butterfree._cli import migrate app = typer.Typer() -app.add_typer(migrate.app) +app.add_typer(migrate.app, name="migrate") if __name__ == "__main__": app() diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index 10b6310b1..6e3e9b592 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -1,5 +1,7 @@ import importlib import inspect +import logging +import os import pkgutil import sys from typing import Set @@ -7,11 +9,15 @@ import setuptools import typer -from butterfree._cli import cli_logger +from butterfree.clients import SparkClient +from butterfree.configs import environment +from butterfree.extract.readers import FileReader from butterfree.migrations.database_migration import ALLOWED_DATABASE from butterfree.pipelines import FeatureSetPipeline -app = typer.Typer() +app = typer.Typer(help="Apply the automatic migrations in a database.") + +logger = logging.getLogger("migrate") def __find_modules(path: str) -> Set[str]: @@ -33,18 +39,18 @@ def __find_modules(path: str) -> Set[str]: def __fs_objects(path: str) -> Set[FeatureSetPipeline]: - cli_logger.info(f"Looking for python modules under {path}...") + logger.info(f"Looking for python modules under {path}...") modules = __find_modules(path) if not modules: return set() - cli_logger.info(f"Importing modules...") + logger.info(f"Importing modules...") package = ".".join(path.strip("/").split("/")) imported = set( importlib.import_module(f".{name}", package=package) for name in modules ) - cli_logger.info(f"Scanning modules...") + logger.info(f"Scanning modules...") content = { module: set( filter( @@ -80,7 +86,7 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: instances.add(value) - cli_logger.info("Creating instances...") + logger.info("Creating instances...") return set(value() for value in instances) @@ -88,6 +94,10 @@ def 
__fs_objects(path: str) -> Set[FeatureSetPipeline]: ..., help="Full or relative path to where feature set pipelines are being defined.", ) +GENERATE_LOGS = typer.Option( + False, help="To generate the logs in local file 'logging.json'." +) + class Migrate: """Execute migration operations in a Database based on pipeline Writer. @@ -96,23 +106,43 @@ class Migrate: pipelines: list of Feature Set Pipelines to use to migration. """ - def __init__(self, pipelines: Set[FeatureSetPipeline]) -> None: + def __init__( + self, pipelines: Set[FeatureSetPipeline], spark_client: SparkClient = None + ) -> None: self.pipelines = pipelines + self.spark_client = spark_client or SparkClient() - def _send_logs_to_s3(self) -> None: + def _send_logs_to_s3(self, file_local: bool) -> None: """Send all migration logs to S3.""" - pass + file_reader = FileReader(id="name", path="logs/logging.json", format="json") + df = file_reader.consume(self.spark_client) + + path = environment.get_variable("FEATURE_STORE_S3_BUCKET") - def run(self) -> None: + self.spark_client.write_dataframe( + dataframe=df, + format_="json", + mode="append", + **{"path": f"s3a://{path}/logging"}, + ) + + if not file_local: + os.rmdir("logs/logging.json") + + def run(self, generate_logs: bool) -> None: """Construct and apply the migrations.""" for pipeline in self.pipelines: for writer in pipeline.sink.writers: migration = ALLOWED_DATABASE[writer.db_config.database] migration.apply_migration(pipeline.feature_set, writer) + self._send_logs_to_s3(generate_logs) + -@app.callback() -def migrate(path: str = PATH) -> Set[FeatureSetPipeline]: +@app.command("apply") +def migrate( + path: str = PATH, generate_logs: bool = GENERATE_LOGS, +) -> Set[FeatureSetPipeline]: """Scan and run database migrations for feature set pipelines defined under PATH. Butterfree will scan a given path for classes that inherit from its @@ -124,5 +154,5 @@ def migrate(path: str = PATH) -> Set[FeatureSetPipeline]: import and instantiate them. 
""" pipe_set = __fs_objects(path) - Migrate(pipe_set).run() + Migrate(pipe_set).run(generate_logs) return pipe_set diff --git a/butterfree/configs/db/metastore_config.py b/butterfree/configs/db/metastore_config.py index 97a999c2d..ff7ed01df 100644 --- a/butterfree/configs/db/metastore_config.py +++ b/butterfree/configs/db/metastore_config.py @@ -122,4 +122,29 @@ def get_path_with_partitions(self, key: str, dataframe: DataFrame) -> List: def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Translate feature set spark schema to the corresponding database.""" - return schema + spark_sql_mapping = { + "TimestampType": "TIMESTAMP", + "BinaryType": "BINARY", + "BooleanType": "BOOLEAN", + "DateType": "DATE", + "DecimalType": "DECIMAL", + "DoubleType": "DOUBLE", + "FloatType": "FLOAT", + "IntegerType": "INT", + "LongType": "BIGINT", + "StringType": "STRING", + "ArrayType(LongType,true)": "ARRAY", + "ArrayType(StringType,true)": "ARRAY", + "ArrayType(FloatType,true)": "ARRAY", + } + sql_schema = [] + for features in schema: + sql_schema.append( + { + "column_name": features["column_name"], + "type": spark_sql_mapping[str(features["type"])], + "primary_key": features["primary_key"], + } + ) + + return sql_schema diff --git a/butterfree/constants/data_type.py b/butterfree/constants/data_type.py index 157d4a1fe..e99525f7d 100644 --- a/butterfree/constants/data_type.py +++ b/butterfree/constants/data_type.py @@ -21,20 +21,21 @@ class DataType(Enum): """Holds constants for data types within Butterfree.""" - TIMESTAMP = (TimestampType(), "timestamp") - BINARY = (BinaryType(), "boolean") - BOOLEAN = (BooleanType(), "boolean") - DATE = (DateType(), "timestamp") - DECIMAL = (DecimalType(), "decimal") - DOUBLE = (DoubleType(), "double") - FLOAT = (FloatType(), "float") - INTEGER = (IntegerType(), "int") - BIGINT = (LongType(), "bigint") - STRING = (StringType(), "text") - ARRAY_BIGINT = (ArrayType(LongType()), "frozen>") - ARRAY_STRING = (ArrayType(StringType()), "frozen>") - ARRAY_FLOAT = (ArrayType(FloatType()), "frozen>") + TIMESTAMP = (TimestampType(), "timestamp", "TIMESTAMP") + BINARY = (BinaryType(), "boolean", "BINARY") + BOOLEAN = (BooleanType(), "boolean", "BOOLEAN") + DATE = (DateType(), "timestamp", "DATE") + DECIMAL = (DecimalType(), "decimal", "DECIMAL") + DOUBLE = (DoubleType(), "double", "DOUBLE") + FLOAT = (FloatType(), "float", "FLOAT") + INTEGER = (IntegerType(), "int", "INT") + BIGINT = (LongType(), "bigint", "BIGINT") + STRING = (StringType(), "text", "STRING") + ARRAY_BIGINT = (ArrayType(LongType()), "frozen>", "ARRAY") + ARRAY_STRING = (ArrayType(StringType()), "frozen>", "ARRAY") + ARRAY_FLOAT = (ArrayType(FloatType()), "frozen>", "ARRAY") - def __init__(self, spark: PySparkDataType, cassandra: str) -> None: + def __init__(self, spark: PySparkDataType, cassandra: str, spark_sql: str) -> None: self.spark = spark self.cassandra = cassandra + self.spark_sql = spark_sql diff --git a/butterfree/logging.conf b/butterfree/logging.conf new file mode 100644 index 000000000..1ee6da868 --- /dev/null +++ b/butterfree/logging.conf @@ -0,0 +1,52 @@ +[loggers] +keys=root,cli,migrate,database_migrate + +[handlers] +keys=consoleHandler,file + +[formatters] +keys=simpleFormatter,jsonFormatter + +[logger_root] +level=DEBUG +handlers=consoleHandler + +[logger_cli] +level=DEBUG +handlers=file +qualname=cli +propagate=0 + +[logger_migrate] +level=DEBUG +handlers=file +qualname=migrate +propagate=0 + +[logger_database_migrate] +level=DEBUG +handlers=file +qualname=database_migrate 
+propagate=0 + +[handler_consoleHandler] +class=StreamHandler +level=DEBUG +formatter=simpleFormatter +args=(sys.stdout,) + +[handler_file] +class=FileHandler +level=DEBUG +formatter=jsonFormatter +args=('logs/logging.json', "a") + +[formatter_simpleFormatter] +format=%(name)s:%(asctime)-15s:%(levelname)s:%(message)s +datefmt= +class=logging.Formatter + +[formatter_jsonFormatter] +format={"name": "%(name)s", "timestamp": "%(asctime)-15s", "level": "%(levelname)s", "message": "%(message)s"} +datefmt= +class=logging.Formatter \ No newline at end of file diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index fb2b9e7f7..28f4f06c7 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -9,6 +9,8 @@ from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet +logger = logging.getLogger("database_migrate") + @dataclass class Diff: @@ -154,7 +156,7 @@ def _get_queries( ) queries.append(alter_column_types_query) if alter_key_items: - logging.info("This operation is not supported by Spark.") + logger.info("This operation is not supported by Spark.") return queries @@ -261,7 +263,7 @@ def apply_migration(self, feature_set: FeatureSet, writer: Writer,) -> None: feature_set: the feature set. writer: the writer being used to load the feature set. """ - logging.info(f"Migrating feature set: {feature_set.name}") + logger.info(f"Migrating feature set: {feature_set.name}") table_name = ( feature_set.name if not writer.write_to_entity else feature_set.entity @@ -275,7 +277,7 @@ def apply_migration(self, feature_set: FeatureSet, writer: Writer,) -> None: ) for q in queries: - logging.info(f"Applying {q}...") + logger.info(f"Applying this query: {q} ...") self._client.sql(q) - logging.info(f"Feature Set migration finished successfully.") + logger.info(f"Feature Set migration finished successfully.") diff --git a/logs/logging.json b/logs/logging.json new file mode 100644 index 000000000..e69de29bb diff --git a/requirements.txt b/requirements.txt index bac7f2c78..82a99d7f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ pandas>=0.24,<1.1 parameters-validation>=1.1.5,<2.0 pyspark==3.* typer>=0.3,<0.4 -setuptools>=41,<42 \ No newline at end of file +setuptools>=41,<42 +typing-extensions==3.7.4.3 \ No newline at end of file diff --git a/setup.py b/setup.py index 3ff187370..abd56ccbd 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev5" +__version__ = "1.2.0.dev6" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/_cli/test_migrate.py b/tests/unit/butterfree/_cli/test_migrate.py index 2e4b2db08..aa2c86db0 100644 --- a/tests/unit/butterfree/_cli/test_migrate.py +++ b/tests/unit/butterfree/_cli/test_migrate.py @@ -8,22 +8,33 @@ from butterfree.pipelines import FeatureSetPipeline -def test_migrate_success(mocker): - mocker.patch.object(migrate.Migrate, "run") - all_fs = migrate.migrate("tests/mocks/entities/") - assert all(isinstance(fs, FeatureSetPipeline) for fs in all_fs) - assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] +class TestMigrate: + def test_migrate_success(self, mocker): + mocker.patch.object(migrate.Migrate, "run") + all_fs = 
migrate.migrate("tests/mocks/entities/") + assert all(isinstance(fs, FeatureSetPipeline) for fs in all_fs) + assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] + def test_migrate_all_pairs(self, mocker): + mocker.patch.object(MetastoreMigration, "apply_migration") + mocker.patch.object(CassandraMigration, "apply_migration") + mocker.patch.object(migrate.Migrate, "_send_logs_to_s3") -def test_migrate_all_pairs(mocker): - mocker.patch.object(MetastoreMigration, "apply_migration") - mocker.patch.object(CassandraMigration, "apply_migration") - all_fs = migrate.migrate("tests/mocks/entities/") + all_fs = migrate.migrate("tests/mocks/entities/") - assert MetastoreMigration.apply_migration.call_count == 2 - assert CassandraMigration.apply_migration.call_count == 2 + assert MetastoreMigration.apply_migration.call_count == 2 + assert CassandraMigration.apply_migration.call_count == 2 - metastore_pairs = [call(pipe.feature_set, pipe.sink.writers[0]) for pipe in all_fs] - cassandra_pairs = [call(pipe.feature_set, pipe.sink.writers[1]) for pipe in all_fs] - MetastoreMigration.apply_migration.assert_has_calls(metastore_pairs, any_order=True) - CassandraMigration.apply_migration.assert_has_calls(cassandra_pairs, any_order=True) + metastore_pairs = [ + call(pipe.feature_set, pipe.sink.writers[0]) for pipe in all_fs + ] + cassandra_pairs = [ + call(pipe.feature_set, pipe.sink.writers[1]) for pipe in all_fs + ] + MetastoreMigration.apply_migration.assert_has_calls( + metastore_pairs, any_order=True + ) + CassandraMigration.apply_migration.assert_has_calls( + cassandra_pairs, any_order=True + ) + migrate.Migrate._send_logs_to_s3.assert_called_once() diff --git a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py index befb55a37..ea7ce8158 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_database_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_database_migration.py @@ -1,5 +1,6 @@ from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType +from butterfree.load.writers import HistoricalFeatureStoreWriter from butterfree.migrations.database_migration import CassandraMigration, Diff @@ -61,7 +62,7 @@ def test_apply_migration(self, feature_set, mocker): m.apply_migration = mocker.stub("apply_migration") # when - m.apply_migration(feature_set) + m.apply_migration(feature_set, HistoricalFeatureStoreWriter()) # then m.apply_migration.assert_called_once() From 6d2a8f9897ddd665a68602ca12713dd3d0249f4b Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 6 Apr 2021 09:09:46 -0300 Subject: [PATCH 25/70] [BUG] Change logging config (#307) * Change logging config. * Some adjusts. * Remove a code smell. 
--- Makefile | 1 + butterfree/__init__.py | 7 --- butterfree/_cli/migrate.py | 14 ++--- butterfree/configs/logger.py | 24 +++++++++ butterfree/logging.conf | 52 ------------------- .../database_migration/database_migration.py | 3 +- logs/logging.json | 0 setup.py | 2 +- tests/unit/butterfree/_cli/test_migrate.py | 14 +++++ 9 files changed, 50 insertions(+), 67 deletions(-) create mode 100644 butterfree/configs/logger.py delete mode 100644 butterfree/logging.conf delete mode 100644 logs/logging.json diff --git a/Makefile b/Makefile index 397d04bf4..95cc6e3a6 100644 --- a/Makefile +++ b/Makefile @@ -122,6 +122,7 @@ clean: @find ./ -type f -name 'coverage.xml' -exec rm -f {} \; @find ./ -type f -name '.coverage*' -exec rm -f {} \; @find ./ -type f -name '*derby.log' -exec rm -f {} \; + @find ./ -type f -name 'logging.json' -exec rm -f {} \; @find ./ -name '*.pyc' -exec rm -f {} \; @find ./ -name '*.pyo' -exec rm -f {} \; @find ./ -name '*~' -exec rm -f {} \; diff --git a/butterfree/__init__.py b/butterfree/__init__.py index 25b955c6e..18759b031 100644 --- a/butterfree/__init__.py +++ b/butterfree/__init__.py @@ -1,8 +1 @@ """Module docstring example, following Google's docstring style.""" -import logging.config -import os -import sys - -sys.path.insert(0, os.path.abspath(".")) - -logging.config.fileConfig(fname="butterfree/logging.conf") diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index 6e3e9b592..f3c533d7c 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -1,6 +1,5 @@ import importlib import inspect -import logging import os import pkgutil import sys @@ -11,13 +10,14 @@ from butterfree.clients import SparkClient from butterfree.configs import environment +from butterfree.configs.logger import __logger from butterfree.extract.readers import FileReader from butterfree.migrations.database_migration import ALLOWED_DATABASE from butterfree.pipelines import FeatureSetPipeline app = typer.Typer(help="Apply the automatic migrations in a database.") -logger = logging.getLogger("migrate") +logger = __logger("migrate", True) def __find_modules(path: str) -> Set[str]: @@ -114,7 +114,9 @@ def __init__( def _send_logs_to_s3(self, file_local: bool) -> None: """Send all migration logs to S3.""" - file_reader = FileReader(id="name", path="logs/logging.json", format="json") + log_path = "../logging.json" + + file_reader = FileReader(id="name", path=log_path, format="json") df = file_reader.consume(self.spark_client) path = environment.get_variable("FEATURE_STORE_S3_BUCKET") @@ -126,10 +128,10 @@ def _send_logs_to_s3(self, file_local: bool) -> None: **{"path": f"s3a://{path}/logging"}, ) - if not file_local: - os.rmdir("logs/logging.json") + if not file_local and os.path.exists(log_path): + os.remove(log_path) - def run(self, generate_logs: bool) -> None: + def run(self, generate_logs: bool = False) -> None: """Construct and apply the migrations.""" for pipeline in self.pipelines: for writer in pipeline.sink.writers: diff --git a/butterfree/configs/logger.py b/butterfree/configs/logger.py new file mode 100644 index 000000000..60dab67c7 --- /dev/null +++ b/butterfree/configs/logger.py @@ -0,0 +1,24 @@ +"""Logger funcion.""" + +import logging + + +def __config(json_file_logs: bool = False) -> None: + + if json_file_logs: + return logging.basicConfig( + format='{"name": "%(name)s", "timestamp": "%(asctime)-15s", ' + '"level": "%(levelname)s", "message": "%(message)s"}', + level=logging.INFO, + filename="../logging.json", + ) + return logging.basicConfig( + 
format="%(name)s:%(asctime)-15s:%(levelname)s:< %(message)s >", + level=logging.INFO, + ) + + +def __logger(name: str, file_logs: bool = False) -> logging.Logger: + + __config(file_logs) + return logging.getLogger(name) diff --git a/butterfree/logging.conf b/butterfree/logging.conf deleted file mode 100644 index 1ee6da868..000000000 --- a/butterfree/logging.conf +++ /dev/null @@ -1,52 +0,0 @@ -[loggers] -keys=root,cli,migrate,database_migrate - -[handlers] -keys=consoleHandler,file - -[formatters] -keys=simpleFormatter,jsonFormatter - -[logger_root] -level=DEBUG -handlers=consoleHandler - -[logger_cli] -level=DEBUG -handlers=file -qualname=cli -propagate=0 - -[logger_migrate] -level=DEBUG -handlers=file -qualname=migrate -propagate=0 - -[logger_database_migrate] -level=DEBUG -handlers=file -qualname=database_migrate -propagate=0 - -[handler_consoleHandler] -class=StreamHandler -level=DEBUG -formatter=simpleFormatter -args=(sys.stdout,) - -[handler_file] -class=FileHandler -level=DEBUG -formatter=jsonFormatter -args=('logs/logging.json', "a") - -[formatter_simpleFormatter] -format=%(name)s:%(asctime)-15s:%(levelname)s:%(message)s -datefmt= -class=logging.Formatter - -[formatter_jsonFormatter] -format={"name": "%(name)s", "timestamp": "%(asctime)-15s", "level": "%(levelname)s", "message": "%(message)s"} -datefmt= -class=logging.Formatter \ No newline at end of file diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 28f4f06c7..2ceca0b82 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -6,10 +6,11 @@ from typing import Any, Dict, List, Set from butterfree.clients import AbstractClient +from butterfree.configs.logger import __logger from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet -logger = logging.getLogger("database_migrate") +logger = __logger("database_migrate", True) @dataclass diff --git a/logs/logging.json b/logs/logging.json deleted file mode 100644 index e69de29bb..000000000 diff --git a/setup.py b/setup.py index abd56ccbd..5122a8319 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev6" +__version__ = "1.2.0.dev7" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/_cli/test_migrate.py b/tests/unit/butterfree/_cli/test_migrate.py index aa2c86db0..75487bed4 100644 --- a/tests/unit/butterfree/_cli/test_migrate.py +++ b/tests/unit/butterfree/_cli/test_migrate.py @@ -1,12 +1,17 @@ from unittest.mock import call +from typer.testing import CliRunner + from butterfree._cli import migrate +from butterfree._cli.main import app from butterfree.migrations.database_migration import ( CassandraMigration, MetastoreMigration, ) from butterfree.pipelines import FeatureSetPipeline +runner = CliRunner() + class TestMigrate: def test_migrate_success(self, mocker): @@ -38,3 +43,12 @@ def test_migrate_all_pairs(self, mocker): cassandra_pairs, any_order=True ) migrate.Migrate._send_logs_to_s3.assert_called_once() + + def test_app_cli(self): + result = runner.invoke(app, "migrate") + assert result.exit_code == 0 + + def test_app_migrate(self, mocker): + mocker.patch.object(migrate.Migrate, "run") + result = runner.invoke(app, ["migrate", "apply", "tests/mocks/entities/"]) + assert 
result.exit_code == 0 From d2c5d39b0748e68ca44603df25c309427cd5c7e8 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Thu, 8 Apr 2021 14:07:12 -0300 Subject: [PATCH 26/70] Change solution for tracking logs (#308) * Change tracking logs method. * Change version to generate dev package. * Change path name in S3 --- butterfree/_cli/migrate.py | 45 +++++++++++++++++++++----------------- requirements.txt | 3 ++- setup.py | 2 +- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index f3c533d7c..42b3fb4a4 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -1,3 +1,4 @@ +import datetime import importlib import inspect import os @@ -5,13 +6,13 @@ import sys from typing import Set +import boto3 import setuptools import typer +from botocore.exceptions import ClientError -from butterfree.clients import SparkClient from butterfree.configs import environment from butterfree.configs.logger import __logger -from butterfree.extract.readers import FileReader from butterfree.migrations.database_migration import ALLOWED_DATABASE from butterfree.pipelines import FeatureSetPipeline @@ -106,30 +107,34 @@ class Migrate: pipelines: list of Feature Set Pipelines to use to migration. """ - def __init__( - self, pipelines: Set[FeatureSetPipeline], spark_client: SparkClient = None - ) -> None: + def __init__(self, pipelines: Set[FeatureSetPipeline],) -> None: self.pipelines = pipelines - self.spark_client = spark_client or SparkClient() def _send_logs_to_s3(self, file_local: bool) -> None: """Send all migration logs to S3.""" - log_path = "../logging.json" - - file_reader = FileReader(id="name", path=log_path, format="json") - df = file_reader.consume(self.spark_client) - - path = environment.get_variable("FEATURE_STORE_S3_BUCKET") - - self.spark_client.write_dataframe( - dataframe=df, - format_="json", - mode="append", - **{"path": f"s3a://{path}/logging"}, + s3_client = boto3.client("s3") + + file_name = "../logging.json" + timestamp = datetime.datetime.now() + object_name = ( + f"logs/migrate/" + f"{timestamp.strftime('%Y-%m-%d')}" + f"/logging-{timestamp.strftime('%H:%M:%S')}.json" ) + bucket = environment.get_variable("FEATURE_STORE_S3_BUCKET") + + try: + s3_client.upload_file( + file_name, + bucket, + object_name, + ExtraArgs={"ACL": "bucket-owner-full-control"}, + ) + except ClientError: + raise - if not file_local and os.path.exists(log_path): - os.remove(log_path) + if not file_local and os.path.exists(file_name): + os.remove(file_name) def run(self, generate_logs: bool = False) -> None: """Construct and apply the migrations.""" diff --git a/requirements.txt b/requirements.txt index 82a99d7f5..9548edb31 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ parameters-validation>=1.1.5,<2.0 pyspark==3.* typer>=0.3,<0.4 setuptools>=41,<42 -typing-extensions==3.7.4.3 \ No newline at end of file +typing-extensions==3.7.4.3 +boto3==1.17.* \ No newline at end of file diff --git a/setup.py b/setup.py index 5122a8319..348e5f988 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev7" +__version__ = "1.2.0.dev8" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 43392f444d1ad7bb50136f665fd00a2fe940857f Mon Sep 17 00:00:00 2001 From: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Date: 
Tue, 13 Apr 2021 16:02:37 -0300 Subject: [PATCH 27/70] Read and write consistency level options (#309) * modify cassandra client to be region aware * add option for the user to set read and write consistency levels on cassandra config * add tests * use env vars instead * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira --- butterfree/clients/cassandra_client.py | 22 +++++++--- butterfree/configs/db/cassandra_config.py | 30 +++++++++++++ butterfree/configs/environment.py | 2 + .../configs/db/test_cassandra_config.py | 44 +++++++++++++++++++ 4 files changed, 93 insertions(+), 5 deletions(-) diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index 938d4e4d6..a46053625 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -3,9 +3,15 @@ from typing import Dict, List, Optional from cassandra.auth import PlainTextAuthProvider -from cassandra.cluster import Cluster, ResponseFuture, Session -from cassandra.policies import RoundRobinPolicy -from cassandra.query import dict_factory +from cassandra.cluster import ( + EXEC_PROFILE_DEFAULT, + Cluster, + ExecutionProfile, + ResponseFuture, + Session, +) +from cassandra.policies import DCAwareRoundRobinPolicy +from cassandra.query import ConsistencyLevel, dict_factory from typing_extensions import TypedDict from butterfree.clients import AbstractClient @@ -70,14 +76,20 @@ def conn(self, *, ssl_path: str = None) -> Session: # type: ignore else None ) + execution_profiles = { + EXEC_PROFILE_DEFAULT: ExecutionProfile( + load_balancing_policy=DCAwareRoundRobinPolicy(), + consistency_level=ConsistencyLevel.LOCAL_QUORUM, + row_factory=dict_factory, + ) + } cluster = Cluster( contact_points=self.host, auth_provider=auth_provider, ssl_options=ssl_opts, - load_balancing_policy=RoundRobinPolicy(), + execution_profiles=execution_profiles, ) self._session = cluster.connect(self.keyspace) - self._session.row_factory = dict_factory return self._session def sql(self, query: str) -> ResponseFuture: diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index e9329c5df..3f9e129d0 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -21,6 +21,8 @@ class CassandraConfig(AbstractWriteConfig): stream_processing_time: processing time interval for streaming jobs. stream_output_mode: specify the mode from writing streaming data. stream_checkpoint_path: path on S3 to save checkpoints for the stream job. + read_consistency_level: read consistency level used in connection. + write_consistency_level: write consistency level used in connection. 
More information about processing_time, output_mode and checkpoint_path can be found in Spark documentation: @@ -39,6 +41,8 @@ def __init__( stream_processing_time: str = None, stream_output_mode: str = None, stream_checkpoint_path: str = None, + read_consistency_level: str = None, + write_consistency_level: str = None, ): self.username = username self.password = password @@ -49,6 +53,8 @@ def __init__( self.stream_processing_time = stream_processing_time self.stream_output_mode = stream_output_mode self.stream_checkpoint_path = stream_checkpoint_path + self.read_consistency_level = read_consistency_level + self.write_consistency_level = write_consistency_level @property def database(self) -> str: @@ -150,6 +156,28 @@ def stream_checkpoint_path(self, value: str) -> None: "STREAM_CHECKPOINT_PATH" ) + @property + def read_consistency_level(self) -> Optional[str]: + """Read consistency level for Cassandra.""" + return self.__read_consistency_level + + @read_consistency_level.setter + def read_consistency_level(self, value: str) -> None: + self.__read_consistency_level = value or environment.get_variable( + "CASSANDRA_READ_CONSISTENCY_LEVEL", "LOCAL_ONE" + ) + + @property + def write_consistency_level(self) -> Optional[str]: + """Write consistency level for Cassandra.""" + return self.__write_consistency_level + + @write_consistency_level.setter + def write_consistency_level(self, value: str) -> None: + self.__write_consistency_level = value or environment.get_variable( + "CASSANDRA_WRITE_CONSISTENCY_LEVEL", "LOCAL_QUORUM" + ) + def get_options(self, table: str) -> Dict[Optional[str], Optional[str]]: """Get options for connect to Cassandra DB. @@ -169,6 +197,8 @@ def get_options(self, table: str) -> Dict[Optional[str], Optional[str]]: "spark.cassandra.auth.username": self.username, "spark.cassandra.auth.password": self.password, "spark.cassandra.connection.host": self.host, + "spark.cassandra.input.consistency.level": self.read_consistency_level, + "spark.cassandra.output.consistency.level": self.write_consistency_level, } def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: diff --git a/butterfree/configs/environment.py b/butterfree/configs/environment.py index f98a7a01b..5d8bb4e9d 100644 --- a/butterfree/configs/environment.py +++ b/butterfree/configs/environment.py @@ -12,6 +12,8 @@ "FEATURE_STORE_HISTORICAL_DATABASE": "test", "KAFKA_CONSUMER_CONNECTION_STRING": "test_host:1234,test_host2:1234", "STREAM_CHECKPOINT_PATH": None, + "CASSANDRA_READ_CONSISTENCY_LEVEL": None, + "CASSANDRA_WRITE_CONSISTENCY_LEVEL": None, } diff --git a/tests/unit/butterfree/configs/db/test_cassandra_config.py b/tests/unit/butterfree/configs/db/test_cassandra_config.py index f51ffe8cc..9af4c42b0 100644 --- a/tests/unit/butterfree/configs/db/test_cassandra_config.py +++ b/tests/unit/butterfree/configs/db/test_cassandra_config.py @@ -159,6 +159,50 @@ def test_stream_checkpoint_path_custom(self, cassandra_config): # then assert cassandra_config.stream_checkpoint_path == value + def test_read_consistency_level(self, cassandra_config): + # expecting + default = "LOCAL_ONE" + assert cassandra_config.read_consistency_level == default + + def test_read_consistency_level_custom(self, cassandra_config): + # given + value = "Custom Config" + cassandra_config.read_consistency_level = value + + # then + assert cassandra_config.read_consistency_level == value + + def test_read_consistency_level_custom_env_var(self, mocker, cassandra_config): + # given + value = "Custom Config" + 
mocker.patch("butterfree.configs.environment.get_variable", return_value=value) + cassandra_config.read_consistency_level = value + + # then + assert cassandra_config.read_consistency_level == value + + def test_write_consistency_level(self, cassandra_config): + # expecting + default = "LOCAL_QUORUM" + assert cassandra_config.write_consistency_level == default + + def test_write_consistency_level_custom(self, cassandra_config): + # given + value = "Custom Config" + cassandra_config.write_consistency_level = value + + # then + assert cassandra_config.write_consistency_level == value + + def test_write_consistency_level_custom_env_var(self, mocker, cassandra_config): + # given + value = "Custom Config" + mocker.patch("butterfree.configs.environment.get_variable", return_value=value) + cassandra_config.write_consistency_level = value + + # then + assert cassandra_config.write_consistency_level == value + def test_set_credentials_on_instantiation(self): cassandra_config = CassandraConfig( # noqa: S106 username="username", password="password", host="host", keyspace="keyspace" From 0f31164b8d3a20689a31669bd28a1c54d6085022 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 14 Apr 2021 13:48:03 -0300 Subject: [PATCH 28/70] Fix kafka reader. (#310) --- butterfree/extract/readers/kafka_reader.py | 2 +- setup.py | 2 +- tests/unit/butterfree/extract/readers/test_kafka_reader.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/butterfree/extract/readers/kafka_reader.py b/butterfree/extract/readers/kafka_reader.py index 8cac4c198..1b8042bce 100644 --- a/butterfree/extract/readers/kafka_reader.py +++ b/butterfree/extract/readers/kafka_reader.py @@ -174,7 +174,7 @@ def consume(self, client: SparkClient) -> DataFrame: """ # read using client and cast key and value columns from binary to string raw_df = ( - client.read(format="kafka", options=self.options, stream=self.stream) + client.read(format="kafka", stream=self.stream, **self.options) .withColumn("key", col("key").cast("string")) .withColumn("value", col("value").cast("string")) ) diff --git a/setup.py b/setup.py index 348e5f988..c015e1e1c 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev8" +__version__ = "1.2.0.dev9" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/extract/readers/test_kafka_reader.py b/tests/unit/butterfree/extract/readers/test_kafka_reader.py index 5a07cbdd9..f1ea82ae3 100644 --- a/tests/unit/butterfree/extract/readers/test_kafka_reader.py +++ b/tests/unit/butterfree/extract/readers/test_kafka_reader.py @@ -99,7 +99,7 @@ def test_consume( # assert spark_client.read.assert_called_once_with( - format="kafka", options=options, stream=kafka_reader.stream + format="kafka", stream=kafka_reader.stream, **options ) assert_dataframe_equality(target_df, output_df) From e6f67e9a5ff42b4987c8d476a8ebf6df6cfa1aac Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 14 Apr 2021 18:13:03 -0300 Subject: [PATCH 29/70] Fix path validate. 
(#311) --- butterfree/clients/spark_client.py | 4 ++-- setup.py | 2 +- tests/unit/butterfree/clients/test_spark_client.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index 09a1bcd9b..d5caec9cd 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -58,7 +58,7 @@ def read( """ if not isinstance(format, str): raise ValueError("format needs to be a string with the desired read format") - if not isinstance(path, (str, list)): + if path and not isinstance(path, (str, list)): raise ValueError("path needs to be a string or a list of string") df_reader: Union[ @@ -67,7 +67,7 @@ def read( df_reader = df_reader.schema(schema) if schema else df_reader - return df_reader.format(format).load(path, **options) # type: ignore + return df_reader.format(format).load(path=path, **options) # type: ignore def read_table(self, table: str, database: str = None) -> DataFrame: """Use the SparkSession.read interface to read a metastore table. diff --git a/setup.py b/setup.py index c015e1e1c..2f04f794d 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev9" +__version__ = "1.2.0.dev10" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/clients/test_spark_client.py b/tests/unit/butterfree/clients/test_spark_client.py index dc40841c9..12d8ac9d6 100644 --- a/tests/unit/butterfree/clients/test_spark_client.py +++ b/tests/unit/butterfree/clients/test_spark_client.py @@ -65,7 +65,7 @@ def test_read( # assert mocked_spark_read.format.assert_called_once_with(format) - mocked_spark_read.load.assert_called_once_with(path, **options) + mocked_spark_read.load.assert_called_once_with(path=path, **options) assert target_df.collect() == result_df.collect() @pytest.mark.parametrize( From baa594ba9543daa021451470a6b713e7895b7726 Mon Sep 17 00:00:00 2001 From: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Date: Fri, 16 Apr 2021 12:21:28 -0300 Subject: [PATCH 30/70] Add local dc property (#312) * add local dc property * update version --- butterfree/configs/db/cassandra_config.py | 12 ++++++++++ butterfree/configs/environment.py | 1 + setup.py | 2 +- .../configs/db/test_cassandra_config.py | 22 +++++++++++++++++++ 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index 3f9e129d0..3d94e7567 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -43,6 +43,7 @@ def __init__( stream_checkpoint_path: str = None, read_consistency_level: str = None, write_consistency_level: str = None, + local_dc: str = None, ): self.username = username self.password = password @@ -55,6 +56,7 @@ def __init__( self.stream_checkpoint_path = stream_checkpoint_path self.read_consistency_level = read_consistency_level self.write_consistency_level = write_consistency_level + self.local_dc = local_dc @property def database(self) -> str: @@ -178,6 +180,15 @@ def write_consistency_level(self, value: str) -> None: "CASSANDRA_WRITE_CONSISTENCY_LEVEL", "LOCAL_QUORUM" ) + @property + def local_dc(self) -> Optional[str]: + """Local DC for Cassandra connection.""" + return self.__local_dc + + @local_dc.setter + def local_dc(self, value: str) -> None: + self.__local_dc = value or 
environment.get_variable("CASSANDRA_LOCAL_DC") + def get_options(self, table: str) -> Dict[Optional[str], Optional[str]]: """Get options for connect to Cassandra DB. @@ -197,6 +208,7 @@ def get_options(self, table: str) -> Dict[Optional[str], Optional[str]]: "spark.cassandra.auth.username": self.username, "spark.cassandra.auth.password": self.password, "spark.cassandra.connection.host": self.host, + "spark.cassandra.connection.localDC": self.local_dc, "spark.cassandra.input.consistency.level": self.read_consistency_level, "spark.cassandra.output.consistency.level": self.write_consistency_level, } diff --git a/butterfree/configs/environment.py b/butterfree/configs/environment.py index 5d8bb4e9d..f56efc5d5 100644 --- a/butterfree/configs/environment.py +++ b/butterfree/configs/environment.py @@ -14,6 +14,7 @@ "STREAM_CHECKPOINT_PATH": None, "CASSANDRA_READ_CONSISTENCY_LEVEL": None, "CASSANDRA_WRITE_CONSISTENCY_LEVEL": None, + "CASSANDRA_LOCAL_DC": None, } diff --git a/setup.py b/setup.py index 2f04f794d..264d9e0d7 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev10" +__version__ = "1.2.0.dev11" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/configs/db/test_cassandra_config.py b/tests/unit/butterfree/configs/db/test_cassandra_config.py index 9af4c42b0..d34c8e9f2 100644 --- a/tests/unit/butterfree/configs/db/test_cassandra_config.py +++ b/tests/unit/butterfree/configs/db/test_cassandra_config.py @@ -203,6 +203,28 @@ def test_write_consistency_level_custom_env_var(self, mocker, cassandra_config): # then assert cassandra_config.write_consistency_level == value + def test_local_dc(self, cassandra_config): + # expecting + default = None + assert cassandra_config.local_dc == default + + def test_local_dc_custom(self, cassandra_config): + # given + value = "VPC_1" + cassandra_config.local_dc = value + + # then + assert cassandra_config.local_dc == value + + def test_local_dc_custom_env_var(self, mocker, cassandra_config): + # given + value = "VPC_1" + mocker.patch("butterfree.configs.environment.get_variable", return_value=value) + cassandra_config.local_dc = value + + # then + assert cassandra_config.local_dc == value + def test_set_credentials_on_instantiation(self): cassandra_config = CassandraConfig( # noqa: S106 username="username", password="password", host="host", keyspace="keyspace" From a74f098d972f9dfdf4f07a37863b8ab0baadaba3 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 20 Apr 2021 09:08:20 -0300 Subject: [PATCH 31/70] Remove metastore migrate (#313) * Remove metastore migrate. * Change version to create a dev package. 
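The diff below also adds a `get_schema` method to `AbstractClient`, so every concrete client now has to expose a schema lookup. A minimal sketch of a conforming client, assuming `conn` and `sql` are the only other abstract members (as in the Cassandra and Spark clients); the stub class is illustrative and not part of Butterfree:

```python
from typing import Any, Dict, List

from butterfree.clients import AbstractClient


class StaticSchemaClient(AbstractClient):
    """Illustrative stub: serves table schemas from an in-memory dict."""

    def __init__(self, schemas: Dict[str, List[Dict[str, Any]]]) -> None:
        self._schemas = schemas

    @property
    def conn(self) -> Any:
        return None  # no real connection in this sketch

    def sql(self, query: str) -> Any:
        return []  # queries are not executed in this sketch

    def get_schema(self, table: str, database: str = None) -> List[Dict[str, Any]]:
        # Same format documented on the abstract method:
        # [{"column_name": "example1", "type": "Spark_type"}, ...]
        return self._schemas.get(table, [])


client = StaticSchemaClient({"my_table": [{"column_name": "id", "type": "IntegerType"}]})
print(client.get_schema("my_table"))
```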
--- butterfree/_cli/migrate.py | 55 ++++++++++++------- butterfree/clients/abstract_client.py | 14 +++++ butterfree/clients/cassandra_client.py | 2 +- butterfree/clients/spark_client.py | 2 +- .../writers/online_feature_store_writer.py | 2 + .../database_migration/database_migration.py | 8 ++- setup.py | 2 +- tests/unit/butterfree/_cli/test_migrate.py | 13 +---- 8 files changed, 59 insertions(+), 39 deletions(-) diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index 42b3fb4a4..2eebe733c 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -1,6 +1,7 @@ import datetime import importlib import inspect +import json import os import pkgutil import sys @@ -43,6 +44,7 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: logger.info(f"Looking for python modules under {path}...") modules = __find_modules(path) if not modules: + logger.error(f"Path: {path} not found!") return set() logger.info(f"Importing modules...") @@ -112,36 +114,47 @@ def __init__(self, pipelines: Set[FeatureSetPipeline],) -> None: def _send_logs_to_s3(self, file_local: bool) -> None: """Send all migration logs to S3.""" - s3_client = boto3.client("s3") - file_name = "../logging.json" - timestamp = datetime.datetime.now() - object_name = ( - f"logs/migrate/" - f"{timestamp.strftime('%Y-%m-%d')}" - f"/logging-{timestamp.strftime('%H:%M:%S')}.json" - ) - bucket = environment.get_variable("FEATURE_STORE_S3_BUCKET") - - try: - s3_client.upload_file( - file_name, - bucket, - object_name, - ExtraArgs={"ACL": "bucket-owner-full-control"}, - ) - except ClientError: - raise if not file_local and os.path.exists(file_name): + s3_client = boto3.client("s3") + + timestamp = datetime.datetime.now() + object_name = ( + f"logs/migrate/" + f"{timestamp.strftime('%Y-%m-%d')}" + f"/logging-{timestamp.strftime('%H:%M:%S')}.json" + ) + bucket = environment.get_variable("FEATURE_STORE_S3_BUCKET") + + try: + s3_client.upload_file( + file_name, + bucket, + object_name, + ExtraArgs={"ACL": "bucket-owner-full-control"}, + ) + except ClientError: + raise + os.remove(file_name) + else: + with open(file_name, "r") as json_f: + json_data = json.load(json_f) + print(json_data) def run(self, generate_logs: bool = False) -> None: """Construct and apply the migrations.""" for pipeline in self.pipelines: for writer in pipeline.sink.writers: - migration = ALLOWED_DATABASE[writer.db_config.database] - migration.apply_migration(pipeline.feature_set, writer) + db = writer.db_config.database + if db != "metastore": + migration = ALLOWED_DATABASE[db] + migration.apply_migration(pipeline.feature_set, writer) + else: + logger.warning( + "Butterfree not supporting Metastore Migrations yet." + ) self._send_logs_to_s3(generate_logs) diff --git a/butterfree/clients/abstract_client.py b/butterfree/clients/abstract_client.py index 265706e68..ce5d33b64 100644 --- a/butterfree/clients/abstract_client.py +++ b/butterfree/clients/abstract_client.py @@ -23,3 +23,17 @@ def sql(self, query: str) -> Any: Set of records. """ pass + + @abstractmethod + def get_schema(self, table: str, database: str = None) -> Any: + """Returns desired table schema. + + Attributes: + table: desired table. + + Returns: + A list of dictionaries in the format + [{"column_name": "example1", type: "Spark_type"}, ...] 
+ + """ + pass diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index a46053625..00a3d497c 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -103,7 +103,7 @@ def sql(self, query: str) -> ResponseFuture: raise RuntimeError("There's no session available for this query.") return self._session.execute(query) - def get_schema(self, table: str) -> List[Dict[str, str]]: + def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]: """Returns desired table schema. Attributes: diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index d5caec9cd..bfa31d2a3 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -314,7 +314,7 @@ def _convert_schema(self, schema: DataFrame) -> List[Dict[str, str]]: return converted_schema - def get_schema(self, table: str, database: str) -> List[Dict[str, str]]: + def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]: """Returns desired table schema. Attributes: diff --git a/butterfree/load/writers/online_feature_store_writer.py b/butterfree/load/writers/online_feature_store_writer.py index b51d99235..17dc8af4b 100644 --- a/butterfree/load/writers/online_feature_store_writer.py +++ b/butterfree/load/writers/online_feature_store_writer.py @@ -81,6 +81,7 @@ class OnlineFeatureStoreWriter(Writer): def __init__( self, db_config: AbstractWriteConfig = None, + database: str = None, debug_mode: bool = False, write_to_entity: bool = False, interval_mode: bool = False, @@ -90,6 +91,7 @@ def __init__( db_config or CassandraConfig(), debug_mode, interval_mode, write_to_entity ) self.check_schema_hook = check_schema_hook + self.database = database @staticmethod def filter_latest(dataframe: DataFrame, id_columns: List[Any]) -> DataFrame: diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 2ceca0b82..6df9ce95b 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -242,7 +242,9 @@ def _get_diff( ) return schema_diff - def _get_schema(self, table_name: str) -> List[Dict[str, Any]]: + def _get_schema( + self, table_name: str, database: str = None + ) -> List[Dict[str, Any]]: """Get a table schema in the respective database. Args: @@ -252,7 +254,7 @@ def _get_schema(self, table_name: str) -> List[Dict[str, Any]]: Schema object. 
""" try: - db_schema = self._client.get_schema(table_name) + db_schema = self._client.get_schema(table_name, database) except Exception: # noqa db_schema = [] return db_schema @@ -271,7 +273,7 @@ def apply_migration(self, feature_set: FeatureSet, writer: Writer,) -> None: ) fs_schema = writer.db_config.translate(feature_set.get_schema()) - db_schema = self._get_schema(table_name) + db_schema = self._get_schema(table_name, writer.database) queries = self.create_query( fs_schema, table_name, db_schema, writer.write_to_entity diff --git a/setup.py b/setup.py index 264d9e0d7..c1295ee31 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev11" +__version__ = "1.2.0.dev12" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/_cli/test_migrate.py b/tests/unit/butterfree/_cli/test_migrate.py index 75487bed4..475db15f7 100644 --- a/tests/unit/butterfree/_cli/test_migrate.py +++ b/tests/unit/butterfree/_cli/test_migrate.py @@ -4,10 +4,7 @@ from butterfree._cli import migrate from butterfree._cli.main import app -from butterfree.migrations.database_migration import ( - CassandraMigration, - MetastoreMigration, -) +from butterfree.migrations.database_migration import CassandraMigration from butterfree.pipelines import FeatureSetPipeline runner = CliRunner() @@ -21,24 +18,16 @@ def test_migrate_success(self, mocker): assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] def test_migrate_all_pairs(self, mocker): - mocker.patch.object(MetastoreMigration, "apply_migration") mocker.patch.object(CassandraMigration, "apply_migration") mocker.patch.object(migrate.Migrate, "_send_logs_to_s3") all_fs = migrate.migrate("tests/mocks/entities/") - assert MetastoreMigration.apply_migration.call_count == 2 assert CassandraMigration.apply_migration.call_count == 2 - metastore_pairs = [ - call(pipe.feature_set, pipe.sink.writers[0]) for pipe in all_fs - ] cassandra_pairs = [ call(pipe.feature_set, pipe.sink.writers[1]) for pipe in all_fs ] - MetastoreMigration.apply_migration.assert_has_calls( - metastore_pairs, any_order=True - ) CassandraMigration.apply_migration.assert_has_calls( cassandra_pairs, any_order=True ) From 378f3a55dc14914d6a58a820a803d17a3b61f1fb Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 20 Apr 2021 17:32:22 -0300 Subject: [PATCH 32/70] Fix link in our docs. (#315) --- docs/source/extract.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/extract.md b/docs/source/extract.md index 2d9f9fabe..2b4f2e521 100644 --- a/docs/source/extract.md +++ b/docs/source/extract.md @@ -53,4 +53,4 @@ source = Source( ) ``` -It's important to state that we have some pre-processing methods as well, such as filter and pivot. Feel free to check them [here](https://github.com/quintoandar/butterfree/tree/staging/butterfree/core/extract/pre_processing). \ No newline at end of file +It's important to state that we have some pre-processing methods as well, such as filter and pivot. Feel free to check them [here](https://github.com/quintoandar/butterfree/tree/master/butterfree/extract/pre_processing). 
\ No newline at end of file From 3b18b5a98746e0b45e50ec4f9c7080fe32697500 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Fri, 23 Apr 2021 09:48:33 -0300 Subject: [PATCH 33/70] [BUG] Fix Cassandra Connect Session (#316) * Fix Cassandra Connect Session. * Apply style. --- butterfree/clients/cassandra_client.py | 59 +++++++++---------- setup.py | 2 +- .../clients/test_cassandra_client.py | 30 ---------- 3 files changed, 30 insertions(+), 61 deletions(-) diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index 00a3d497c..4c6f96fe0 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -61,35 +61,36 @@ def __init__( @property def conn(self, *, ssl_path: str = None) -> Session: # type: ignore """Establishes a Cassandra connection.""" - auth_provider = ( - PlainTextAuthProvider(username=self.user, password=self.password) - if self.user is not None - else None - ) - ssl_opts = ( - { - "ca_certs": ssl_path, - "ssl_version": PROTOCOL_TLSv1, - "cert_reqs": CERT_REQUIRED, - } - if ssl_path is not None - else None - ) + if not self._session: + auth_provider = ( + PlainTextAuthProvider(username=self.user, password=self.password) + if self.user is not None + else None + ) + ssl_opts = ( + { + "ca_certs": ssl_path, + "ssl_version": PROTOCOL_TLSv1, + "cert_reqs": CERT_REQUIRED, + } + if ssl_path is not None + else None + ) - execution_profiles = { - EXEC_PROFILE_DEFAULT: ExecutionProfile( - load_balancing_policy=DCAwareRoundRobinPolicy(), - consistency_level=ConsistencyLevel.LOCAL_QUORUM, - row_factory=dict_factory, + execution_profiles = { + EXEC_PROFILE_DEFAULT: ExecutionProfile( + load_balancing_policy=DCAwareRoundRobinPolicy(), + consistency_level=ConsistencyLevel.LOCAL_QUORUM, + row_factory=dict_factory, + ) + } + cluster = Cluster( + contact_points=self.host, + auth_provider=auth_provider, + ssl_options=ssl_opts, + execution_profiles=execution_profiles, ) - } - cluster = Cluster( - contact_points=self.host, - auth_provider=auth_provider, - ssl_options=ssl_opts, - execution_profiles=execution_profiles, - ) - self._session = cluster.connect(self.keyspace) + self._session = cluster.connect(self.keyspace) return self._session def sql(self, query: str) -> ResponseFuture: @@ -99,9 +100,7 @@ def sql(self, query: str) -> ResponseFuture: query: desired query. """ - if not self._session: - raise RuntimeError("There's no session available for this query.") - return self._session.execute(query) + return self.conn.execute(query) def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]: """Returns desired table schema. 
diff --git a/setup.py b/setup.py index c1295ee31..8d56a022d 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev12" +__version__ = "1.2.0.dev13" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/clients/test_cassandra_client.py b/tests/unit/butterfree/clients/test_cassandra_client.py index aa52e6f83..0356e43f9 100644 --- a/tests/unit/butterfree/clients/test_cassandra_client.py +++ b/tests/unit/butterfree/clients/test_cassandra_client.py @@ -1,8 +1,6 @@ from typing import Any, Dict, List from unittest.mock import MagicMock -import pytest - from butterfree.clients import CassandraClient from butterfree.clients.cassandra_client import CassandraColumn @@ -88,31 +86,3 @@ def test_cassandra_create_table( query = cassandra_client.sql.call_args[0][0] assert sanitize_string(query) == sanitize_string(expected_query) - - def test_cassandra_without_session(self, cassandra_client: CassandraClient) -> None: - cassandra_client = cassandra_client - - with pytest.raises( - RuntimeError, match="There's no session available for this query." - ): - cassandra_client.sql( - query="select feature1, feature2 from cassandra_feature_set" - ) - with pytest.raises( - RuntimeError, match="There's no session available for this query." - ): - cassandra_client.create_table( - [ - {"column_name": "id", "type": "int", "primary_key": True}, - { - "column_name": "rent_per_month", - "type": "float", - "primary_key": False, - }, - ], - "test", - ) - with pytest.raises( - RuntimeError, match="There's no session available for this query." - ): - cassandra_client.get_schema("test") From c46f171f05c45c6316aacbf43ea03578364ba781 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 26 Apr 2021 17:42:35 -0300 Subject: [PATCH 34/70] Fix migration query. (#318) --- .../database_migration/cassandra_migration.py | 6 +++--- .../database_migration/database_migration.py | 11 ++++++----- .../database_migration/metastore_migration.py | 6 +++--- setup.py | 2 +- .../database_migration/test_cassandra_migration.py | 4 ++-- .../database_migration/test_metastore_migration.py | 4 ++-- 6 files changed, 17 insertions(+), 16 deletions(-) diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py index c511479b6..ff7042b64 100644 --- a/butterfree/migrations/database_migration/cassandra_migration.py +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -75,7 +75,7 @@ def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> st return f"ALTER TABLE {table_name} ADD ({parsed_columns});" - def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> str: + def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: """Creates CQL statement to alter columns' types. Args: @@ -86,9 +86,9 @@ def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> Alter column type query. 
""" - parsed_columns = self._get_parsed_columns(columns) + parsed_columns = self._get_parsed_columns([column]) - return f"ALTER TABLE {table_name} ALTER ({parsed_columns});" + return f"ALTER TABLE {table_name} ALTER {parsed_columns};" @staticmethod def _get_create_table_query(columns: List[Dict[str, Any]], table_name: str) -> str: diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 6df9ce95b..de6b2f803 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -93,7 +93,7 @@ def _get_alter_table_drop_query(self, columns: List[Diff], table_name: str) -> s pass @abstractmethod - def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> str: + def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: """Creates desired statement to alter columns' types. Args: @@ -152,10 +152,11 @@ def _get_queries( ) queries.append(drop_columns_query) if alter_type_items: - alter_column_types_query = self._get_alter_column_type_query( - alter_type_items, table_name - ) - queries.append(alter_column_types_query) + for item in alter_type_items: + alter_column_types_query = self._get_alter_column_type_query( + item, table_name + ) + queries.append(alter_column_types_query) if alter_key_items: logger.info("This operation is not supported by Spark.") diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index 8b7c6af0e..daa0afd3d 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -74,7 +74,7 @@ def _get_alter_table_add_query(self, columns: List[Diff], table_name: str) -> st f"ADD IF NOT EXISTS columns ({parsed_columns});" ) - def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> str: + def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: """Creates SQL statement to alter columns' types. Args: @@ -85,9 +85,9 @@ def _get_alter_column_type_query(self, columns: List[Diff], table_name: str) -> Alter column type query. 
""" - parsed_columns = self._get_parsed_columns(columns) + parsed_columns = self._get_parsed_columns([column]) - return f"ALTER TABLE {table_name} ALTER COLUMN ({parsed_columns});" + return f"ALTER TABLE {table_name} ALTER COLUMN {parsed_columns};" def _get_create_table_query( self, columns: List[Dict[str, Any]], table_name: str diff --git a/setup.py b/setup.py index 8d56a022d..a69c079c6 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev13" +__version__ = "1.2.0.dev14" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py index 8f16a1d2f..97f49958e 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py @@ -8,7 +8,7 @@ def test_queries(self, fs_schema, db_schema): "ALTER TABLE table_name ADD (new_feature FloatType);", "ALTER TABLE table_name DROP (feature1__avg_over_2_days_rolling_windows);", "ALTER TABLE table_name ALTER " - "(feature1__avg_over_1_week_rolling_windows FloatType);", + "feature1__avg_over_1_week_rolling_windows FloatType;", ] query = cassandra_migration.create_query(fs_schema, "table_name", db_schema) @@ -19,7 +19,7 @@ def test_queries_on_entity(self, fs_schema, db_schema): expected_query = [ "ALTER TABLE table_name ADD (new_feature FloatType);", "ALTER TABLE table_name ALTER " - "(feature1__avg_over_1_week_rolling_windows FloatType);", + "feature1__avg_over_1_week_rolling_windows FloatType;", ] query = cassandra_migration.create_query( fs_schema, "table_name", db_schema, True diff --git a/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py b/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py index 5bac93521..d9c2de3c6 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_metastore_migration.py @@ -11,7 +11,7 @@ def test_queries(self, fs_schema, db_schema): "ALTER TABLE table_name DROP IF EXISTS " "(feature1__avg_over_2_days_rolling_windows None);", "ALTER TABLE table_name ALTER COLUMN " - "(feature1__avg_over_1_week_rolling_windows FloatType);", + "feature1__avg_over_1_week_rolling_windows FloatType;", ] query = metastore_migration.create_query(fs_schema, "table_name", db_schema) @@ -25,7 +25,7 @@ def test_queries_on_entity(self, fs_schema, db_schema): "ALTER TABLE test.table_name ADD IF NOT EXISTS " "columns (new_feature FloatType);", "ALTER TABLE table_name ALTER COLUMN " - "(feature1__avg_over_1_week_rolling_windows FloatType);", + "feature1__avg_over_1_week_rolling_windows FloatType;", ] query = metastore_migration.create_query( From bb124f57f2c3e3fbf6d6cecd80bfdd98f8f6ba4b Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 28 Apr 2021 09:23:19 -0300 Subject: [PATCH 35/70] Fix migration query add type key. 
(#319) --- .../migrations/database_migration/cassandra_migration.py | 4 +++- setup.py | 2 +- .../migrations/database_migration/test_cassandra_migration.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/butterfree/migrations/database_migration/cassandra_migration.py b/butterfree/migrations/database_migration/cassandra_migration.py index ff7042b64..5a4f755f9 100644 --- a/butterfree/migrations/database_migration/cassandra_migration.py +++ b/butterfree/migrations/database_migration/cassandra_migration.py @@ -88,7 +88,9 @@ def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: """ parsed_columns = self._get_parsed_columns([column]) - return f"ALTER TABLE {table_name} ALTER {parsed_columns};" + return ( + f"ALTER TABLE {table_name} ALTER {parsed_columns.replace(' ', ' TYPE ')};" + ) @staticmethod def _get_create_table_query(columns: List[Dict[str, Any]], table_name: str) -> str: diff --git a/setup.py b/setup.py index a69c079c6..4a138c7a3 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev14" +__version__ = "1.2.0.dev15" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py index 97f49958e..5666cc47f 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py @@ -8,7 +8,7 @@ def test_queries(self, fs_schema, db_schema): "ALTER TABLE table_name ADD (new_feature FloatType);", "ALTER TABLE table_name DROP (feature1__avg_over_2_days_rolling_windows);", "ALTER TABLE table_name ALTER " - "feature1__avg_over_1_week_rolling_windows FloatType;", + "feature1__avg_over_1_week_rolling_windows TYPE FloatType;", ] query = cassandra_migration.create_query(fs_schema, "table_name", db_schema) @@ -19,7 +19,7 @@ def test_queries_on_entity(self, fs_schema, db_schema): expected_query = [ "ALTER TABLE table_name ADD (new_feature FloatType);", "ALTER TABLE table_name ALTER " - "feature1__avg_over_1_week_rolling_windows FloatType;", + "feature1__avg_over_1_week_rolling_windows TYPE FloatType;", ] query = cassandra_migration.create_query( fs_schema, "table_name", db_schema, True From 1c973169fdc0b9183e3677bf83a7c8f1c7acfd82 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 5 May 2021 17:47:08 -0300 Subject: [PATCH 36/70] Fix db-config condition (#321) * Fix db-config condition. * Apply style. --- butterfree/_cli/migrate.py | 6 ++---- setup.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index 2eebe733c..ebd211425 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -148,13 +148,11 @@ def run(self, generate_logs: bool = False) -> None: for pipeline in self.pipelines: for writer in pipeline.sink.writers: db = writer.db_config.database - if db != "metastore": + if db == "cassandra": migration = ALLOWED_DATABASE[db] migration.apply_migration(pipeline.feature_set, writer) else: - logger.warning( - "Butterfree not supporting Metastore Migrations yet." 
- ) + logger.warning(f"Butterfree not supporting {db} Migrations yet.") self._send_logs_to_s3(generate_logs) diff --git a/setup.py b/setup.py index 4a138c7a3..6c2c2b46a 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev15" +__version__ = "1.2.0.dev16" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From bb7ed77917d4553593d93e3603d3e273e7ec90a6 Mon Sep 17 00:00:00 2001 From: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Date: Fri, 7 May 2021 14:59:40 -0300 Subject: [PATCH 37/70] MLOP-642 Document migration in Butterfree (#320) * update docs * add more information and reference new cli.md file --- docs/source/butterfree.clients.rst | 1 - docs/source/butterfree.configs.db.rst | 3 +- docs/source/butterfree.configs.rst | 5 +++ docs/source/butterfree.constants.rst | 11 +++++++ docs/source/butterfree.dataframe_service.rst | 13 +++++++- .../butterfree.extract.pre_processing.rst | 1 - docs/source/butterfree.extract.readers.rst | 1 - docs/source/butterfree.extract.rst | 1 - docs/source/butterfree.hooks.rst | 33 +++++++++++++++++++ .../butterfree.hooks.schema_compatibility.rst | 25 ++++++++++++++ docs/source/butterfree.load.processing.rst | 1 - docs/source/butterfree.load.rst | 1 - docs/source/butterfree.load.writers.rst | 1 - ...tterfree.migrations.database_migration.rst | 31 +++++++++++++++++ docs/source/butterfree.migrations.rst | 18 ++++++++++ docs/source/butterfree.pipelines.rst | 1 - docs/source/butterfree.reports.rst | 1 - docs/source/butterfree.rst | 2 ++ docs/source/butterfree.transform.features.rst | 1 - docs/source/butterfree.transform.rst | 1 - .../butterfree.transform.transformations.rst | 1 - ...transformations.user_defined_functions.rst | 1 - docs/source/butterfree.transform.utils.rst | 1 - docs/source/butterfree.validations.rst | 1 - docs/source/cli.md | 32 ++++++++++++++++++ docs/source/home.md | 6 ++++ docs/source/index.rst | 1 + requirements.dev.txt | 4 +++ 28 files changed, 181 insertions(+), 18 deletions(-) create mode 100644 docs/source/butterfree.hooks.rst create mode 100644 docs/source/butterfree.hooks.schema_compatibility.rst create mode 100644 docs/source/butterfree.migrations.database_migration.rst create mode 100644 docs/source/butterfree.migrations.rst create mode 100644 docs/source/cli.md diff --git a/docs/source/butterfree.clients.rst b/docs/source/butterfree.clients.rst index 3409d43a4..1bfaa86df 100644 --- a/docs/source/butterfree.clients.rst +++ b/docs/source/butterfree.clients.rst @@ -22,7 +22,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.configs.db.rst b/docs/source/butterfree.configs.db.rst index a9973c561..3bb9f8b88 100644 --- a/docs/source/butterfree.configs.db.rst +++ b/docs/source/butterfree.configs.db.rst @@ -23,12 +23,11 @@ Submodules :show-inheritance: -.. automodule:: butterfree.configs.db.s3_config +.. automodule:: butterfree.configs.db.metastore_config :members: :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.configs.rst b/docs/source/butterfree.configs.rst index dc8a8c774..f3cf2aa29 100644 --- a/docs/source/butterfree.configs.rst +++ b/docs/source/butterfree.configs.rst @@ -19,6 +19,11 @@ Submodules :show-inheritance: +.. 
automodule:: butterfree.configs.logger + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/docs/source/butterfree.constants.rst b/docs/source/butterfree.constants.rst index 083d20d78..d0e72fedd 100644 --- a/docs/source/butterfree.constants.rst +++ b/docs/source/butterfree.constants.rst @@ -17,12 +17,23 @@ Submodules :show-inheritance: +.. automodule:: butterfree.constants.migrations + :members: + :undoc-members: + :show-inheritance: + + .. automodule:: butterfree.constants.spark_constants :members: :undoc-members: :show-inheritance: +.. automodule:: butterfree.constants.window_definitions + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/docs/source/butterfree.dataframe_service.rst b/docs/source/butterfree.dataframe_service.rst index b3c4cfc86..4343305b6 100644 --- a/docs/source/butterfree.dataframe_service.rst +++ b/docs/source/butterfree.dataframe_service.rst @@ -5,12 +5,23 @@ Submodules ---------- -.. automodule:: butterfree.dataframe_service.repartition +.. automodule:: butterfree.dataframe_service.incremental_strategy + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.dataframe_service.partitioning :members: :undoc-members: :show-inheritance: +.. automodule:: butterfree.dataframe_service.repartition + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/docs/source/butterfree.extract.pre_processing.rst b/docs/source/butterfree.extract.pre_processing.rst index 9420cd7ee..172e6fb3c 100644 --- a/docs/source/butterfree.extract.pre_processing.rst +++ b/docs/source/butterfree.extract.pre_processing.rst @@ -34,7 +34,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.extract.readers.rst b/docs/source/butterfree.extract.readers.rst index 6f7ee7b8b..a67d47e96 100644 --- a/docs/source/butterfree.extract.readers.rst +++ b/docs/source/butterfree.extract.readers.rst @@ -28,7 +28,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.extract.rst b/docs/source/butterfree.extract.rst index 4454d6e90..a59d2e292 100644 --- a/docs/source/butterfree.extract.rst +++ b/docs/source/butterfree.extract.rst @@ -19,7 +19,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.hooks.rst b/docs/source/butterfree.hooks.rst new file mode 100644 index 000000000..72f13223d --- /dev/null +++ b/docs/source/butterfree.hooks.rst @@ -0,0 +1,33 @@ +butterfree.hooks package +======================== + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + butterfree.hooks.schema_compatibility + +Submodules +---------- + + +.. automodule:: butterfree.hooks.hook + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.hooks.hookable_component + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: butterfree.hooks + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/butterfree.hooks.schema_compatibility.rst b/docs/source/butterfree.hooks.schema_compatibility.rst new file mode 100644 index 000000000..a39c5b935 --- /dev/null +++ b/docs/source/butterfree.hooks.schema_compatibility.rst @@ -0,0 +1,25 @@ +butterfree.hooks.schema\_compatibility package +============================================== + +Submodules +---------- + + +.. 
automodule:: butterfree.hooks.schema_compatibility.cassandra_table_schema_compatibility_hook + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.hooks.schema_compatibility.spark_table_schema_compatibility_hook + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: butterfree.hooks.schema_compatibility + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/butterfree.load.processing.rst b/docs/source/butterfree.load.processing.rst index 79ae36b9b..4c5d2a2e8 100644 --- a/docs/source/butterfree.load.processing.rst +++ b/docs/source/butterfree.load.processing.rst @@ -10,7 +10,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.load.rst b/docs/source/butterfree.load.rst index 2498b6f29..e38934a5a 100644 --- a/docs/source/butterfree.load.rst +++ b/docs/source/butterfree.load.rst @@ -19,7 +19,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.load.writers.rst b/docs/source/butterfree.load.writers.rst index 88aa9e64f..6ff438de9 100644 --- a/docs/source/butterfree.load.writers.rst +++ b/docs/source/butterfree.load.writers.rst @@ -22,7 +22,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.migrations.database_migration.rst b/docs/source/butterfree.migrations.database_migration.rst new file mode 100644 index 000000000..892165dfc --- /dev/null +++ b/docs/source/butterfree.migrations.database_migration.rst @@ -0,0 +1,31 @@ +butterfree.migrations.database\_migration package +================================================= + +Submodules +---------- + + +.. automodule:: butterfree.migrations.database_migration.cassandra_migration + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.migrations.database_migration.database_migration + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.migrations.database_migration.metastore_migration + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: butterfree.migrations.database_migration + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/butterfree.migrations.rst b/docs/source/butterfree.migrations.rst new file mode 100644 index 000000000..4770fd8ea --- /dev/null +++ b/docs/source/butterfree.migrations.rst @@ -0,0 +1,18 @@ +butterfree.migrations package +============================= + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + butterfree.migrations.database_migration + +Module contents +--------------- + +.. 
automodule:: butterfree.migrations + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/butterfree.pipelines.rst b/docs/source/butterfree.pipelines.rst index d5c65f4d9..e0c319962 100644 --- a/docs/source/butterfree.pipelines.rst +++ b/docs/source/butterfree.pipelines.rst @@ -10,7 +10,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.reports.rst b/docs/source/butterfree.reports.rst index d49a701d1..850db914a 100644 --- a/docs/source/butterfree.reports.rst +++ b/docs/source/butterfree.reports.rst @@ -10,7 +10,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.rst b/docs/source/butterfree.rst index 76e664b4b..0828f9211 100644 --- a/docs/source/butterfree.rst +++ b/docs/source/butterfree.rst @@ -12,7 +12,9 @@ Subpackages butterfree.constants butterfree.dataframe_service butterfree.extract + butterfree.hooks butterfree.load + butterfree.migrations butterfree.pipelines butterfree.reports butterfree.testing diff --git a/docs/source/butterfree.transform.features.rst b/docs/source/butterfree.transform.features.rst index e4c9a926b..f6c69095d 100644 --- a/docs/source/butterfree.transform.features.rst +++ b/docs/source/butterfree.transform.features.rst @@ -22,7 +22,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.transform.rst b/docs/source/butterfree.transform.rst index 26d180939..02f8d4c61 100644 --- a/docs/source/butterfree.transform.rst +++ b/docs/source/butterfree.transform.rst @@ -26,7 +26,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.transform.transformations.rst b/docs/source/butterfree.transform.transformations.rst index 870c84686..0978edcf1 100644 --- a/docs/source/butterfree.transform.transformations.rst +++ b/docs/source/butterfree.transform.transformations.rst @@ -54,7 +54,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.transform.transformations.user_defined_functions.rst b/docs/source/butterfree.transform.transformations.user_defined_functions.rst index becc5d6eb..f93c7e98c 100644 --- a/docs/source/butterfree.transform.transformations.user_defined_functions.rst +++ b/docs/source/butterfree.transform.transformations.user_defined_functions.rst @@ -16,7 +16,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.transform.utils.rst b/docs/source/butterfree.transform.utils.rst index bd8c15323..82e9038bb 100644 --- a/docs/source/butterfree.transform.utils.rst +++ b/docs/source/butterfree.transform.utils.rst @@ -22,7 +22,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/butterfree.validations.rst b/docs/source/butterfree.validations.rst index 9fd015574..35f5d1992 100644 --- a/docs/source/butterfree.validations.rst +++ b/docs/source/butterfree.validations.rst @@ -16,7 +16,6 @@ Submodules :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/cli.md b/docs/source/cli.md new file mode 100644 index 000000000..ba07428fc --- /dev/null +++ b/docs/source/cli.md @@ -0,0 +1,32 @@ +# Command-line Interface (CLI) + +Butterfree has now a command-line interface, introduced with the new automatic migration ability. 
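The same `--help` pattern works for the subcommands as well, for example the migration command covered below:

```shell
$~ butterfree migrate apply --help
```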
+ +As soon as you install butterfree, you can check what's available through butterfree's cli with: + +```shell +$~ butterfree --help +``` + +### Automated Database Schema Migration + +When developing your feature sets, you need also to prepare your database for the changes +to come into your Feature Store. Normally, when creating a new feature set, you needed +to manually create a new table in cassandra. Or, when creating a new feature in an existing +feature set, you needed to create new column in cassandra too. + +Now, you can just use `butterfree migrate apply ...`, butterfree will scan your python +files, looking for classes that inherit from `butterfree.pipelines.FeatureSetPipeline`, +then compare its schema with the database schema where the feature set would be written. +Then it will prepare migration queries and run against the databases. + +For more information, please, check `butterfree migrate apply --help` :) + +### Supported databases + +This functionality currently supports only the **Cassandra** database, which is the default +storage for an Online Feature Store built with Butterfree. Nonetheless, it was made with +the intent to be easily extended for other databases. + +Also, each database has its own rules for schema migration commands. Some changes may +still require manual interference. \ No newline at end of file diff --git a/docs/source/home.md b/docs/source/home.md index eada17390..fc297d2b6 100644 --- a/docs/source/home.md +++ b/docs/source/home.md @@ -10,6 +10,7 @@ The main idea is for this repository to be a set of tools for easing [ETLs](http - [Load](#load) - [Streaming](#streaming) - [Setup Configuration](#setup-configuration) +- [Command-line Interface](#command-line-interface) ## What is going on here @@ -61,3 +62,8 @@ We also support streaming pipelines in Butterfree. More information is available ## Setup Configuration Some configurations are needed to run your ETL pipelines. Detailed information is provided at the [Configuration Section](configuration.md) + +## Command-line Interface + +Butterfree has its own command-line interface, to manage your feature sets. Detailed information +provided by the [Command-line Interface](cli.md) section. diff --git a/docs/source/index.rst b/docs/source/index.rst index 6548f9adc..12bf1609a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -22,3 +22,4 @@ Navigation stream configuration modules + cli diff --git a/requirements.dev.txt b/requirements.dev.txt index 8ebfa5108..96ddefc18 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -5,3 +5,7 @@ jupyter==1.0.0 twine==3.1.1 mypy==0.790 pyspark-stubs==3.0.0 +sphinx==3.5.4 +sphinxemoji==0.1.8 +sphinx-rtd-theme==0.5.2 +recommonmark==0.7.1 \ No newline at end of file From 5a0a62244b4c1ac3fe6e199575141bddff5d710e Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 10 May 2021 17:43:08 -0300 Subject: [PATCH 38/70] [MLOP-702] Debug mode for Automate Migration (#322) * Create flag debug-mode. * Fix tests. * Fix migrate test. 
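A minimal usage sketch of the debug flag this patch adds; the entities path is illustrative, and only the `migrate.migrate(...)` signature and behavior are taken from the diff below:

```python
from butterfree._cli import migrate

# Scan the folder for FeatureSetPipeline objects and build the migration
# queries. With debug_mode=True the queries are only printed, nothing is
# applied to Cassandra; generate_logs=True keeps the log file local.
migrate.migrate("path/to/entities/", generate_logs=True, debug_mode=True)

# The CLI equivalent is `butterfree migrate apply ...` with the corresponding
# flags -- check `butterfree migrate apply --help` for the exact spelling.
```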
--- butterfree/_cli/migrate.py | 35 +++++++++++++------ .../database_migration/database_migration.py | 21 ++++++++--- setup.py | 2 +- tests/unit/butterfree/_cli/test_migrate.py | 6 ++-- 4 files changed, 44 insertions(+), 20 deletions(-) diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index ebd211425..bfa18b461 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -101,6 +101,11 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: False, help="To generate the logs in local file 'logging.json'." ) +DEBUG_MODE = typer.Option( + False, + help="To view the queries resulting from the migration, DON'T apply the migration.", +) + class Migrate: """Execute migration operations in a Database based on pipeline Writer. @@ -112,7 +117,7 @@ class Migrate: def __init__(self, pipelines: Set[FeatureSetPipeline],) -> None: self.pipelines = pipelines - def _send_logs_to_s3(self, file_local: bool) -> None: + def _send_logs_to_s3(self, file_local: bool, debug_mode: bool) -> None: """Send all migration logs to S3.""" file_name = "../logging.json" @@ -120,11 +125,19 @@ def _send_logs_to_s3(self, file_local: bool) -> None: s3_client = boto3.client("s3") timestamp = datetime.datetime.now() - object_name = ( - f"logs/migrate/" - f"{timestamp.strftime('%Y-%m-%d')}" - f"/logging-{timestamp.strftime('%H:%M:%S')}.json" - ) + + if debug_mode: + object_name = ( + f"logs/migrate-debug-mode/" + f"{timestamp.strftime('%Y-%m-%d')}" + f"/logging-{timestamp.strftime('%H:%M:%S')}.json" + ) + else: + object_name = ( + f"logs/migrate/" + f"{timestamp.strftime('%Y-%m-%d')}" + f"/logging-{timestamp.strftime('%H:%M:%S')}.json" + ) bucket = environment.get_variable("FEATURE_STORE_S3_BUCKET") try: @@ -143,23 +156,23 @@ def _send_logs_to_s3(self, file_local: bool) -> None: json_data = json.load(json_f) print(json_data) - def run(self, generate_logs: bool = False) -> None: + def run(self, generate_logs: bool = False, debug_mode: bool = False) -> None: """Construct and apply the migrations.""" for pipeline in self.pipelines: for writer in pipeline.sink.writers: db = writer.db_config.database if db == "cassandra": migration = ALLOWED_DATABASE[db] - migration.apply_migration(pipeline.feature_set, writer) + migration.apply_migration(pipeline.feature_set, writer, debug_mode) else: logger.warning(f"Butterfree not supporting {db} Migrations yet.") - self._send_logs_to_s3(generate_logs) + self._send_logs_to_s3(generate_logs, debug_mode) @app.command("apply") def migrate( - path: str = PATH, generate_logs: bool = GENERATE_LOGS, + path: str = PATH, generate_logs: bool = GENERATE_LOGS, debug_mode: bool = DEBUG_MODE ) -> Set[FeatureSetPipeline]: """Scan and run database migrations for feature set pipelines defined under PATH. @@ -172,5 +185,5 @@ def migrate( import and instantiate them. 
""" pipe_set = __fs_objects(path) - Migrate(pipe_set).run(generate_logs) + Migrate(pipe_set).run(generate_logs, debug_mode) return pipe_set diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index de6b2f803..40192ff70 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -260,12 +260,15 @@ def _get_schema( db_schema = [] return db_schema - def apply_migration(self, feature_set: FeatureSet, writer: Writer,) -> None: + def apply_migration( + self, feature_set: FeatureSet, writer: Writer, debug_mode: bool + ) -> None: """Apply the migration in the respective database. Args: feature_set: the feature set. writer: the writer being used to load the feature set. + debug_mode: if active, it brings up the queries generated. """ logger.info(f"Migrating feature set: {feature_set.name}") @@ -280,8 +283,16 @@ def apply_migration(self, feature_set: FeatureSet, writer: Writer,) -> None: fs_schema, table_name, db_schema, writer.write_to_entity ) - for q in queries: - logger.info(f"Applying this query: {q} ...") - self._client.sql(q) + if debug_mode: + print( + "#### DEBUG MODE ###\n" + f"Feature set: {feature_set.name}\n" + "Queries:\n" + f"{queries}" + ) + else: + for q in queries: + logger.info(f"Applying this query: {q} ...") + self._client.sql(q) - logger.info(f"Feature Set migration finished successfully.") + logger.info(f"Feature Set migration finished successfully.") diff --git a/setup.py b/setup.py index 6c2c2b46a..56cf8842e 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev16" +__version__ = "1.2.0.dev17" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/_cli/test_migrate.py b/tests/unit/butterfree/_cli/test_migrate.py index 475db15f7..c0751c888 100644 --- a/tests/unit/butterfree/_cli/test_migrate.py +++ b/tests/unit/butterfree/_cli/test_migrate.py @@ -17,16 +17,16 @@ def test_migrate_success(self, mocker): assert all(isinstance(fs, FeatureSetPipeline) for fs in all_fs) assert sorted([fs.feature_set.name for fs in all_fs]) == ["first", "second"] - def test_migrate_all_pairs(self, mocker): + def test_migrate_run_methods(self, mocker): mocker.patch.object(CassandraMigration, "apply_migration") mocker.patch.object(migrate.Migrate, "_send_logs_to_s3") - all_fs = migrate.migrate("tests/mocks/entities/") + all_fs = migrate.migrate("tests/mocks/entities/", False, False) assert CassandraMigration.apply_migration.call_count == 2 cassandra_pairs = [ - call(pipe.feature_set, pipe.sink.writers[1]) for pipe in all_fs + call(pipe.feature_set, pipe.sink.writers[1], False) for pipe in all_fs ] CassandraMigration.apply_migration.assert_has_calls( cassandra_pairs, any_order=True From b1371f1201235973ba8b98048f676c3fe7071499 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20Brand=C3=A3o?= <37742275+GaBrandao@users.noreply.github.com> Date: Wed, 2 Jun 2021 15:02:23 -0300 Subject: [PATCH 39/70] [MLOP-727] Improve logging messages (#325) * Fix logging message for local file * Remove json import --- butterfree/_cli/migrate.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index bfa18b461..277ecf3c6 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py 
@@ -1,7 +1,6 @@ import datetime import importlib import inspect -import json import os import pkgutil import sys @@ -151,10 +150,10 @@ def _send_logs_to_s3(self, file_local: bool, debug_mode: bool) -> None: raise os.remove(file_name) + elif os.path.exists(file_name): + print("Logs written to ../logging.json") else: - with open(file_name, "r") as json_f: - json_data = json.load(json_f) - print(json_data) + print("No logs were generated.") def run(self, generate_logs: bool = False, debug_mode: bool = False) -> None: """Construct and apply the migrations.""" From acf7022bfccc0ddc38cf0c51fff6e8b8086cc47a Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 2 Jun 2021 16:38:21 -0300 Subject: [PATCH 40/70] [MLOP-728] Improve logging messages (#324) * Improve logs. * Revert debug-mode condition. --- .../database_migration/database_migration.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 40192ff70..aeec4a6e7 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -1,5 +1,4 @@ """Migration entity.""" -import logging from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum, auto @@ -141,12 +140,7 @@ def _get_queries( ) queries.append(alter_table_add_query) if drop_items: - if write_on_entity: - logging.info( - "Features will not be dropped automatically " - "when data is loaded to an entity table" - ) - else: + if not write_on_entity: drop_columns_query = self._get_alter_table_drop_query( drop_items, table_name ) @@ -158,7 +152,9 @@ def _get_queries( ) queries.append(alter_column_types_query) if alter_key_items: - logger.info("This operation is not supported by Spark.") + logger.warning( + "The 'change the primary key column' action is not supported by Spark." + ) return queries @@ -217,6 +213,11 @@ def _get_diff( for db_item in db_schema: if fs_item.get("column_name") == db_item.get("column_name"): if fs_item.get("type") != db_item.get("type"): + if fs_item.get("primary_key") is True: + logger.warning( + "Type changes are not applied to " + "columns that are the primary key." + ) alter_type_columns.update( {fs_item.get("column_name"): fs_item.get("type")} ) @@ -296,3 +297,6 @@ def apply_migration( self._client.sql(q) logger.info(f"Feature Set migration finished successfully.") + + # inform in drone console which feature set was migrated + print(f"The {feature_set.name} feature set was migrated.") From d0bf61adb1b48db6eb6cd2e4e71c019b3c43d589 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Fri, 4 Jun 2021 10:15:39 -0300 Subject: [PATCH 41/70] Fix method to generate agg feature name. 
(#326) --- butterfree/transform/transformations/aggregated_transform.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/butterfree/transform/transformations/aggregated_transform.py b/butterfree/transform/transformations/aggregated_transform.py index 2c7a8ced2..7304f34b6 100644 --- a/butterfree/transform/transformations/aggregated_transform.py +++ b/butterfree/transform/transformations/aggregated_transform.py @@ -88,7 +88,7 @@ def _get_output_name(self, function: object) -> str: """ ) - base_name = "__".join([self._parent.name, function.__name__]) + base_name = "__".join([self._parent.name, str(function.__name__).lower()]) return base_name @property diff --git a/setup.py b/setup.py index 56cf8842e..daaa264b1 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev17" +__version__ = "1.2.0.dev18" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 1cf0dbde30c26345926c4ca8b533c87da4579cd7 Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Thu, 10 Jun 2021 10:59:48 -0300 Subject: [PATCH 42/70] [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree (#327) * Change writer type for interval mode. * Some adjusts. --- .../historical_feature_store_writer.py | 46 ++++--------------- setup.py | 2 +- .../test_historical_feature_store_writer.py | 21 ++++----- 3 files changed, 21 insertions(+), 48 deletions(-) diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index c43440419..489f22be1 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -144,15 +144,17 @@ def write( dataframe = self._apply_transformations(dataframe) if self.interval_mode: - if self.debug_mode: - spark_client.create_temporary_view( - dataframe=dataframe, - name=f"historical_feature_store__{feature_set.name}", + partition_overwrite_mode = spark_client.conn.conf.get( + "spark.sql.sources.partitionOverwriteMode" + ).lower() + + if partition_overwrite_mode != "dynamic": + raise RuntimeError( + "m=load_incremental_table, " + "spark.sql.sources.partitionOverwriteMode={}, " + "msg=partitionOverwriteMode have to " + "be configured to 'dynamic'".format(partition_overwrite_mode) ) - return - - self._incremental_mode(feature_set, dataframe, spark_client) - return if self.debug_mode: spark_client.create_temporary_view( @@ -171,34 +173,6 @@ def write( **self.db_config.get_options(s3_key), ) - def _incremental_mode( - self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient - ) -> None: - - partition_overwrite_mode = spark_client.conn.conf.get( - "spark.sql.sources.partitionOverwriteMode" - ).lower() - - if partition_overwrite_mode != "dynamic": - raise RuntimeError( - "m=load_incremental_table, " - "spark.sql.sources.partitionOverwriteMode={}, " - "msg=partitionOverwriteMode have to be configured to 'dynamic'".format( - partition_overwrite_mode - ) - ) - - s3_key = os.path.join("historical", feature_set.entity, feature_set.name) - options = {"path": self.db_config.get_options(s3_key).get("path")} - - spark_client.write_dataframe( - dataframe=dataframe, - format_=self.db_config.format_, - mode=self.db_config.mode, - **options, - partitionBy=self.PARTITION_BY, - ) - def _assert_validation_count( 
self, table_name: str, written_count: int, dataframe_count: int ) -> None: diff --git a/setup.py b/setup.py index daaa264b1..348bdbfea 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev18" +__version__ = "1.2.0.dev19" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py index 8bab23baf..9e84aacda 100644 --- a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py +++ b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py @@ -51,7 +51,7 @@ def test_write_interval_mode( ): # given spark_client = SparkClient() - spark_client.write_dataframe = mocker.stub("write_dataframe") + spark_client.write_table = mocker.stub("write_table") spark_client.conn.conf.set( "spark.sql.sources.partitionOverwriteMode", "dynamic" ) @@ -63,21 +63,15 @@ def test_write_interval_mode( dataframe=feature_set_dataframe, spark_client=spark_client, ) - result_df = spark_client.write_dataframe.call_args[1]["dataframe"] + result_df = spark_client.write_table.call_args[1]["dataframe"] # then assert_dataframe_equality(historical_feature_set_dataframe, result_df) + assert writer.database == spark_client.write_table.call_args[1]["database"] + assert feature_set.name == spark_client.write_table.call_args[1]["table_name"] assert ( - writer.db_config.format_ - == spark_client.write_dataframe.call_args[1]["format_"] - ) - assert ( - writer.db_config.mode == spark_client.write_dataframe.call_args[1]["mode"] - ) - assert ( - writer.PARTITION_BY - == spark_client.write_dataframe.call_args[1]["partitionBy"] + writer.PARTITION_BY == spark_client.write_table.call_args[1]["partition_by"] ) def test_write_interval_mode_invalid_partition_mode( @@ -130,9 +124,14 @@ def test_write_in_debug_mode_with_interval_mode( historical_feature_set_dataframe, feature_set, spark_session, + mocker, ): # given spark_client = SparkClient() + spark_client.write_dataframe = mocker.stub("write_dataframe") + spark_client.conn.conf.set( + "spark.sql.sources.partitionOverwriteMode", "dynamic" + ) writer = HistoricalFeatureStoreWriter(debug_mode=True, interval_mode=True) # when From 9f42f53fc8f53b3d0c50d70b0e588c6b4c4a1611 Mon Sep 17 00:00:00 2001 From: Jay Vala <24193355+jdvala@users.noreply.github.com> Date: Wed, 16 Jun 2021 16:43:52 +0200 Subject: [PATCH 43/70] Add the missing link for H3 geohash (#330) * Add the missing link for H3 geohash * Update the H3 geohash link. 
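As a quick sketch of the configuration that the interval-mode check above enforces (not part of the patch; the SparkClient and writer usage simply mirror the test in this diff, and import paths are assumed from the project layout):

```python
# Illustrative sketch: interval mode now refuses to write unless Spark's
# dynamic partition overwrite is enabled, mirroring the test above.
from butterfree.clients import SparkClient
from butterfree.load.writers import HistoricalFeatureStoreWriter

spark_client = SparkClient()
spark_client.conn.conf.set(
    "spark.sql.sources.partitionOverwriteMode", "dynamic"
)  # without this, write() raises the RuntimeError shown in the diff

writer = HistoricalFeatureStoreWriter(interval_mode=True)
# writer.write(feature_set=..., dataframe=..., spark_client=spark_client)
```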
* Update the same link Update the same link in in spark_function_and_window.ipynb example --- examples/simple_feature_set/simple_feature_set.ipynb | 2 +- .../spark_function_and_window/spark_function_and_window.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/simple_feature_set/simple_feature_set.ipynb b/examples/simple_feature_set/simple_feature_set.ipynb index b217fcdf7..c5ed9ae55 100644 --- a/examples/simple_feature_set/simple_feature_set.ipynb +++ b/examples/simple_feature_set/simple_feature_set.ipynb @@ -89,7 +89,7 @@ "| - | - | - | - | - | - | - | - | - | - | - | - | - | - |\n", "| int | timestamp | float | float | int | int | float | float | float | double | double | string | string | string |\n", "\n", - "For more information about H3 geohash click [here]()\n", + "For more information about H3 geohash click [here](https://h3geo.org/docs/)\n", "\n", "The following code blocks will show how to generate this feature set using Butterfree library:\n", "\n" diff --git a/examples/spark_function_and_window/spark_function_and_window.ipynb b/examples/spark_function_and_window/spark_function_and_window.ipynb index a4472e245..dcf715524 100644 --- a/examples/spark_function_and_window/spark_function_and_window.ipynb +++ b/examples/spark_function_and_window/spark_function_and_window.ipynb @@ -50,7 +50,7 @@ "\n", "Note that we're going to compute two aggregated features, rent average and standard deviation, considering the two last occurrences (or events). It'd also be possible to define time windows, instead of windows based on events.\n", "\n", - "For more information about H3 geohash click [here]().\n", + "For more information about H3 geohash click [here](https://h3geo.org/docs/).\n", "\n", "The following code blocks will show how to generate this feature set using Butterfree library:\n", "\n" From 78927e317ae788334a8293df08a1596ff350a83f Mon Sep 17 00:00:00 2001 From: Rodrigo Martins de Oliveira Date: Fri, 30 Jul 2021 15:56:45 -0300 Subject: [PATCH 44/70] Update README.md (#331) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 728f7b027..7b93f000f 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ To learn how to use Butterfree in practice, see [Butterfree's notebook examples] ## Requirements and Installation Butterfree depends on **Python 3.7+** and it is **Spark 3.0 ready** :heavy_check_mark: -[Python Package Index](https://quintoandar.github.io/python-package-server/) hosts reference to a pip-installable module of this library, using it is as straightforward as including it on your project's requirements. +[PyPI hosts reference to a pip-installable module of this library](https://pypi.org/project/butterfree/), using it is as straightforward as including it on your project's requirements. 
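The notebook excerpts above lean on H3 geohash columns; purely as an illustration, and assuming the h3 3.x Python API that butterfree's `h3` extra targets (the coordinates and printed value below are made up), a cell index for a latitude/longitude pair can be computed like this:

```python
# Illustrative only: resolve a latitude/longitude pair to an H3 cell index,
# the kind of value the notebook's h3 feature columns hold.
import h3  # h3 3.x API (h3>=3.7,<4), where geo_to_h3 is available

lat, lng = -23.5505, -46.6333      # example coordinates (São Paulo)
cell = h3.geo_to_h3(lat, lng, 12)  # resolution 12, a hierarchical geohash
print(cell)                        # a string such as "8ca8..." (illustrative)
```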
```bash pip install butterfree From 43bb3a336a9ec900fcdf86971f48b56f2f2389d0 Mon Sep 17 00:00:00 2001 From: Lucas Fonseca Date: Mon, 22 Aug 2022 13:57:54 -0300 Subject: [PATCH 45/70] Update Github Actions Workflow runner (#332) * Update Workflow runner version * bump flake8-bandit * chore: bypass false positive for S105 Co-authored-by: Lucas Cardozo --- .github/workflows/publish.yml | 2 +- .github/workflows/staging.yml | 2 +- .github/workflows/test.yml | 2 +- requirements.lint.txt | 3 ++- tests/unit/butterfree/configs/db/test_cassandra_config.py | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 3620cdbbd..f981921e6 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -9,7 +9,7 @@ jobs: Pipeline: if: github.ref == 'refs/heads/master' - runs-on: ubuntu-16.04 + runs-on: ubuntu-22.04 container: quintoandar/python-3-7-java steps: diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 8b39e5ac3..1f94fc5dd 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -9,7 +9,7 @@ jobs: Pipeline: if: github.ref == 'refs/heads/staging' - runs-on: ubuntu-16.04 + runs-on: ubuntu-22.04 container: quintoandar/python-3-7-java steps: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b39246fda..d7c1c3acc 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -9,7 +9,7 @@ on: jobs: Pipeline: - runs-on: ubuntu-16.04 + runs-on: ubuntu-22.04 container: quintoandar/python-3-7-java steps: diff --git a/requirements.lint.txt b/requirements.lint.txt index 161f7911f..7c51f4b37 100644 --- a/requirements.lint.txt +++ b/requirements.lint.txt @@ -4,4 +4,5 @@ flake8-isort==2.8.0 isort<5 # temporary fix flake8-docstrings==1.5.0 flake8-bugbear==20.1.0 -flake8-bandit==2.1.2 +flake8-bandit==3.0.0 + diff --git a/tests/unit/butterfree/configs/db/test_cassandra_config.py b/tests/unit/butterfree/configs/db/test_cassandra_config.py index d34c8e9f2..fa907a07a 100644 --- a/tests/unit/butterfree/configs/db/test_cassandra_config.py +++ b/tests/unit/butterfree/configs/db/test_cassandra_config.py @@ -230,6 +230,6 @@ def test_set_credentials_on_instantiation(self): username="username", password="password", host="host", keyspace="keyspace" ) assert cassandra_config.username == "username" - assert cassandra_config.password == "password" + assert cassandra_config.password == "password" # noqa: S105 assert cassandra_config.host == "host" assert cassandra_config.keyspace == "keyspace" From 2593839d7fdd092a865795faa9d9e0a49cfc6f2c Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Tue, 20 Dec 2022 13:45:52 -0300 Subject: [PATCH 46/70] Delete sphinx version. 
(#334) --- docs/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 501e17cdf..a20ab18ff 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,4 @@ recommonmark==0.6.0 -Sphinx==3.1.1 sphinx-rtd-theme==0.4.3 sphinxemoji==0.1.6 typing-extensions==3.7.4.2 From 35bcd30af981a960bc7c79c47e6a25dbed729f6c Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Wed, 21 Dec 2022 11:36:24 -0300 Subject: [PATCH 47/70] Update files to staging (#336) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Release/1.1.3 (#290) * [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master (#280) * Change github actions pipelines. * Change pipeline logic. * [BUG] Fix Staging GithubActions Pipeline (#283) * New step on pipelie. * Some adjusts. * Apply only wheel. (#285) * [BUG] Change version on setup.py to PyPI (#286) * Add new make command to change version. * Change command order. * Change desc and variable name. * Change command name. * Keep milliseconds when using 'from_ms' argument in timestamp feature (#284) * changed timestamp resolution * fix import * simple refactor Co-authored-by: Henrique Camargo * Change trigger for pipeline staging (#287) * Change trigger to publish dev pipeline. * Some fix. * Create a dev package. (#288) * [MLOP-633] Butterfree dev workflow, update documentation (#281) * Update workflow doc. * Update README * Add pre-release. * Fix typo. * [MLOP-632] Butterfree dev workflow, automate release description (#279) * release 1.1.4 * update changelog Co-authored-by: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> * Release/1.2.0 (#328) * [MLOP-636] Create migration classes (#282) * [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278) * Add interval branch modifications. * Add interval_runs notebook. * Add tests. * Apply style (black, flack8 and mypy). * Fix tests. * Change version to create package dev. * Allow slide selection (#293) * Fix Slide Duration Typo (#295) * [MLOP-637] Implement diff method (#292) * [MLOP-640] Create CLI with migrate command (#298) * [MLOP-645] Implement query method, cassandra (#291) * [MLOP-671] Implement get_schema on Spark client (#301) * [MLOP-648] Implement query method, metastore (#294) * Fix Validation Step (#302) * [MLOP-647] [MLOP-646] Apply migrations (#300) * add apply migration method * add test apply migration * add migrate actor with tests * mypy compliant * fix test interaction with mocked object * Rebase and some adjusts. Co-authored-by: Mayara Moromisato * [BUG] Apply create_partitions to historical validate (#303) * Apply create_partitions to historical validate. * Remove comments and adjusts. * [BUG] Fix key path for validate read (#304) * Fix key path * bump version Co-authored-by: AlvaroMarquesAndrade <1a789766b1c4c8b679e80f11fa6d63d42fa4bcdf> * [FIX] Add Partition types for Metastore (#305) * [MLOP-639] Track logs in S3 (#306) * Apply tracking logs and logging config. * Adjusts in CLI and logging.conf. * Some adjusts. * Change version to generate new dev package * Fix version. * Apply style. * Add new assert in the migrate unit test. * [BUG] Change logging config (#307) * Change logging config. * Some adjusts. * Remove a code smell. 
* Change solution for tracking logs (#308) * Change tracking logs method. * Change version to generate dev package. * Change path name in S3 * Read and write consistency level options (#309) * modify cassandra client to be region aware * add option for the user to set read and write consistency levels on cassandra config * add tests * use env vars instead * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira * Fix kafka reader. (#310) * Fix path validate. (#311) * Add local dc property (#312) * add local dc property * update version * Remove metastore migrate (#313) * Remove metastore migrate. * Change version to create a dev package. * Fix link in our docs. (#315) * [BUG] Fix Cassandra Connect Session (#316) * Fix Cassandra Connect Session. * Apply style. * Fix migration query. (#318) * Fix migration query add type key. (#319) * Fix db-config condition (#321) * Fix db-config condition. * Apply style. * MLOP-642 Document migration in Butterfree (#320) * update docs * add more information and reference new cli.md file * [MLOP-702] Debug mode for Automate Migration (#322) * Create flag debug-mode. * Fix tests. * Fix migrate test. * [MLOP-727] Improve logging messages (#325) * Fix logging message for local file * Remove json import * [MLOP-728] Improve logging messages (#324) * Improve logs. * Revert debug-mode condition. * Fix method to generate agg feature name. (#326) * [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree (#327) * Change writer type for interval mode. * Some adjusts. * Release 1.2.0 Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> Co-authored-by: hmeretti Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> --- CHANGELOG.md | 26 ++++++++++++++++++++++++++ setup.py | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 679e98343..e7f7004bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,34 @@ All notable changes to this project will be documented in this file. Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each release or unreleased log for a better organization. 
## [Unreleased] + + +## [1.2.0](https://github.com/quintoandar/butterfree/releases/tag/1.2.0) ### Added * [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) +* [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets ([#278](https://github.com/quintoandar/butterfree/pull/278)) +* Allow slide selection ([#293](https://github.com/quintoandar/butterfree/pull/293)) +* [MLOP-637] Implement diff method ([#292](https://github.com/quintoandar/butterfree/pull/292)) +* [MLOP-640] Create CLI with migrate command ([#298](https://github.com/quintoandar/butterfree/pull/298)) +* [MLOP-645] Implement query method, cassandra ([#291](https://github.com/quintoandar/butterfree/pull/291)) +* [MLOP-671] Implement get_schema on Spark client ([#301](https://github.com/quintoandar/butterfree/pull/301)) +* [MLOP-648] Implement query method, metastore ([#294](https://github.com/quintoandar/butterfree/pull/294)) +* [MLOP-647] / [MLOP-646] Apply migrations ([#300](https://github.com/quintoandar/butterfree/pull/300)) +* [MLOP-639] Track logs in S3 ([#306](https://github.com/quintoandar/butterfree/pull/306)) +* [MLOP-702] Debug mode for Automate Migration ([#322](https://github.com/quintoandar/butterfree/pull/322)) + +### Changed +* Keep milliseconds when using 'from_ms' argument in timestamp feature ([#284](https://github.com/quintoandar/butterfree/pull/284)) +* Read and write consistency level options ([#309](https://github.com/quintoandar/butterfree/pull/309)) +* [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree ([#327](https://github.com/quintoandar/butterfree/pull/327)) + +### Fixed +* [BUG] Apply create_partitions to historical validate ([#303](https://github.com/quintoandar/butterfree/pull/303)) +* [BUG] Fix key path for validate read ([#304](https://github.com/quintoandar/butterfree/pull/304)) +* [FIX] Add Partition types for Metastore ([#305](https://github.com/quintoandar/butterfree/pull/305)) +* Change solution for tracking logs ([#308](https://github.com/quintoandar/butterfree/pull/308)) +* [BUG] Fix Cassandra Connect Session ([#316](https://github.com/quintoandar/butterfree/pull/316)) +* Fix method to generate agg feature name. ([#326](https://github.com/quintoandar/butterfree/pull/326)) ## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3) ### Added diff --git a/setup.py b/setup.py index 348bdbfea..b120a1ca7 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev19" +__version__ = "1.2.0" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 3a73ed83608772083d79e70f74c9636823645f8d Mon Sep 17 00:00:00 2001 From: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Date: Mon, 2 Jan 2023 13:49:06 -0300 Subject: [PATCH 48/70] Revert "Update files to staging (#336)" (#337) This reverts commit 35bcd30af981a960bc7c79c47e6a25dbed729f6c. --- CHANGELOG.md | 26 -------------------------- setup.py | 2 +- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e7f7004bd..679e98343 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,34 +4,8 @@ All notable changes to this project will be documented in this file. Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each release or unreleased log for a better organization. 
## [Unreleased] - - -## [1.2.0](https://github.com/quintoandar/butterfree/releases/tag/1.2.0) ### Added * [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) -* [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets ([#278](https://github.com/quintoandar/butterfree/pull/278)) -* Allow slide selection ([#293](https://github.com/quintoandar/butterfree/pull/293)) -* [MLOP-637] Implement diff method ([#292](https://github.com/quintoandar/butterfree/pull/292)) -* [MLOP-640] Create CLI with migrate command ([#298](https://github.com/quintoandar/butterfree/pull/298)) -* [MLOP-645] Implement query method, cassandra ([#291](https://github.com/quintoandar/butterfree/pull/291)) -* [MLOP-671] Implement get_schema on Spark client ([#301](https://github.com/quintoandar/butterfree/pull/301)) -* [MLOP-648] Implement query method, metastore ([#294](https://github.com/quintoandar/butterfree/pull/294)) -* [MLOP-647] / [MLOP-646] Apply migrations ([#300](https://github.com/quintoandar/butterfree/pull/300)) -* [MLOP-639] Track logs in S3 ([#306](https://github.com/quintoandar/butterfree/pull/306)) -* [MLOP-702] Debug mode for Automate Migration ([#322](https://github.com/quintoandar/butterfree/pull/322)) - -### Changed -* Keep milliseconds when using 'from_ms' argument in timestamp feature ([#284](https://github.com/quintoandar/butterfree/pull/284)) -* Read and write consistency level options ([#309](https://github.com/quintoandar/butterfree/pull/309)) -* [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree ([#327](https://github.com/quintoandar/butterfree/pull/327)) - -### Fixed -* [BUG] Apply create_partitions to historical validate ([#303](https://github.com/quintoandar/butterfree/pull/303)) -* [BUG] Fix key path for validate read ([#304](https://github.com/quintoandar/butterfree/pull/304)) -* [FIX] Add Partition types for Metastore ([#305](https://github.com/quintoandar/butterfree/pull/305)) -* Change solution for tracking logs ([#308](https://github.com/quintoandar/butterfree/pull/308)) -* [BUG] Fix Cassandra Connect Session ([#316](https://github.com/quintoandar/butterfree/pull/316)) -* Fix method to generate agg feature name. 
([#326](https://github.com/quintoandar/butterfree/pull/326)) ## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3) ### Added diff --git a/setup.py b/setup.py index b120a1ca7..348bdbfea 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0" +__version__ = "1.2.0.dev19" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 6b78a505cb29148e1494a1018b8b56af8d6062fe Mon Sep 17 00:00:00 2001 From: Lucas Cardozo Date: Wed, 16 Aug 2023 21:54:42 +0100 Subject: [PATCH 49/70] Less strict requirements (#333) * bump a few requirements; increase lower bound for h3 version range; adding pyarrow dev dependency * fix type repr for spark types; fix: broken tests (pyspark 3.4) --------- Co-authored-by: Ralph Rassweiler --- Makefile | 6 ++--- butterfree/configs/db/cassandra_config.py | 2 +- butterfree/reports/metadata.py | 4 +-- requirements.dev.txt | 8 +++--- requirements.txt | 7 +++--- setup.cfg | 1 + setup.py | 2 +- .../pipelines/test_feature_set_pipeline.py | 25 +++++++++++-------- 8 files changed, 29 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 95cc6e3a6..4109504f6 100644 --- a/Makefile +++ b/Makefile @@ -9,8 +9,8 @@ VERSION := $(shell grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d .PHONY: environment ## create virtual environment for butterfree environment: - @pyenv install -s 3.7.6 - @pyenv virtualenv 3.7.6 butterfree + @pyenv install -s 3.7.13 + @pyenv virtualenv 3.7.13 butterfree @pyenv local butterfree @PYTHONPATH=. python -m pip install --upgrade pip @@ -221,4 +221,4 @@ help: } \ printf "\n"; \ }' \ - | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') \ No newline at end of file + | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index 3d94e7567..a038cb177 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -246,7 +246,7 @@ def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: cassandra_schema.append( { "column_name": features["column_name"], - "type": cassandra_mapping[str(features["type"])], + "type": cassandra_mapping[str(features["type"]).replace("()", "")], "primary_key": features["primary_key"], } ) diff --git a/butterfree/reports/metadata.py b/butterfree/reports/metadata.py index d54bbba9d..dc1f7cbb4 100644 --- a/butterfree/reports/metadata.py +++ b/butterfree/reports/metadata.py @@ -162,7 +162,7 @@ def to_json(self) -> Any: "features": [ { "column_name": c["column_name"], - "data_type": str(c["type"]), + "data_type": str(c["type"]).replace("()", ""), "description": desc, } for c, desc in params._features @@ -208,7 +208,7 @@ def to_markdown(self) -> Any: features = ["Column name", "Data type", "Description"] for c, desc in params._features: - features.extend([c["column_name"], str(c["type"]), desc]) + features.extend([c["column_name"], str(c["type"]).replace("()", ""), desc]) count_rows = len(features) // 3 diff --git a/requirements.dev.txt b/requirements.dev.txt index 96ddefc18..3d70d4c05 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,11 +1,9 @@ -cmake==3.18.4 -h3==3.7.0 -pyarrow==0.15.1 +h3==3.7.4 jupyter==1.0.0 twine==3.1.1 mypy==0.790 -pyspark-stubs==3.0.0 sphinx==3.5.4 sphinxemoji==0.1.8 sphinx-rtd-theme==0.5.2 
-recommonmark==0.7.1 \ No newline at end of file +recommonmark==0.7.1 +pyarrow>=1.0.0 diff --git a/requirements.txt b/requirements.txt index 9548edb31..d61d125bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,8 @@ cassandra-driver>=3.22.0,<4.0 mdutils>=1.2.2,<2.0 -pandas>=0.24,<1.1 +pandas>=0.24,<2.0 parameters-validation>=1.1.5,<2.0 pyspark==3.* typer>=0.3,<0.4 -setuptools>=41,<42 -typing-extensions==3.7.4.3 -boto3==1.17.* \ No newline at end of file +typing-extensions>3.7.4,<5 +boto3==1.17.* diff --git a/setup.cfg b/setup.cfg index 255fff848..cff001224 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,6 +24,7 @@ spark_options = spark.sql.session.timeZone: UTC spark.driver.bindAddress: 127.0.0.1 spark.sql.legacy.timeParserPolicy: LEGACY + spark.sql.legacy.createHiveTableByDefault: false [mypy] # suppress errors about unsatisfied imports diff --git a/setup.py b/setup.py index 348bdbfea..0029a78bf 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ license="Copyright", author="QuintoAndar", install_requires=requirements, - extras_require={"h3": ["cmake==3.16.3", "h3==3.4.2"]}, + extras_require={"h3": ["h3>=3.7.4,<4"]}, python_requires=">=3.7, <4", entry_points={"console_scripts": ["butterfree=butterfree._cli.main:app"]}, include_package_data=True, diff --git a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py index 753dfe7c2..d67e0a387 100644 --- a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py @@ -77,9 +77,11 @@ def test_feature_set_pipeline( self, mocked_df, spark_session, fixed_windows_output_feature_set_dataframe, ): # arrange + table_reader_id = "a_source" table_reader_table = "table" table_reader_db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE") + create_temp_view(dataframe=mocked_df, name=table_reader_id) create_db_and_table( spark=spark_session, @@ -88,14 +90,16 @@ def test_feature_set_pipeline( table_reader_table=table_reader_table, ) - dbconfig = Mock() - dbconfig.mode = "overwrite" - dbconfig.format_ = "parquet" + path = "test_folder/historical/entity/feature_set" + + dbconfig = MetastoreConfig() dbconfig.get_options = Mock( - return_value={"path": "test_folder/historical/entity/feature_set"} + return_value={"mode": "overwrite", "format_": "parquet", "path": path} ) - historical_writer = HistoricalFeatureStoreWriter(db_config=dbconfig) + historical_writer = HistoricalFeatureStoreWriter( + db_config=dbconfig, debug_mode=True + ) # act test_pipeline = FeatureSetPipeline( @@ -151,9 +155,13 @@ def test_feature_set_pipeline( ) test_pipeline.run() + # act and assert + dbconfig.get_path_with_partitions = Mock( + return_value=["historical/entity/feature_set"] + ) + # assert - path = dbconfig.get_options("historical/entity/feature_set").get("path") - df = spark_session.read.parquet(path).orderBy(TIMESTAMP_COLUMN) + df = spark_session.sql("select * from historical_feature_store__feature_set") target_df = fixed_windows_output_feature_set_dataframe.orderBy( test_pipeline.feature_set.timestamp_column @@ -162,9 +170,6 @@ def test_feature_set_pipeline( # assert assert_dataframe_equality(df, target_df) - # tear down - shutil.rmtree("test_folder") - def test_feature_set_pipeline_with_dates( self, mocked_date_df, From 2a1900976634b79209e7b7116007df13c00e38d5 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Fri, 18 Aug 2023 14:11:52 -0300 Subject: [PATCH 50/70] feat: optional row 
count validation (#340) --- butterfree/load/sink.py | 17 +++++++++-------- butterfree/load/writers/writer.py | 2 ++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/butterfree/load/sink.py b/butterfree/load/sink.py index 0b0c10c9e..7c0328d6f 100644 --- a/butterfree/load/sink.py +++ b/butterfree/load/sink.py @@ -69,14 +69,15 @@ def validate( """ failures = [] for writer in self.writers: - try: - writer.validate( - feature_set=feature_set, - dataframe=dataframe, - spark_client=spark_client, - ) - except AssertionError as e: - failures.append(e) + if writer.row_count_validation: + try: + writer.validate( + feature_set=feature_set, + dataframe=dataframe, + spark_client=spark_client, + ) + except AssertionError as e: + failures.append(e) if failures: raise RuntimeError( diff --git a/butterfree/load/writers/writer.py b/butterfree/load/writers/writer.py index e12a4317e..5073f4726 100644 --- a/butterfree/load/writers/writer.py +++ b/butterfree/load/writers/writer.py @@ -26,6 +26,7 @@ def __init__( debug_mode: bool = False, interval_mode: bool = False, write_to_entity: bool = False, + row_count_validation: bool = True, ) -> None: super().__init__() self.db_config = db_config @@ -33,6 +34,7 @@ def __init__( self.debug_mode = debug_mode self.interval_mode = interval_mode self.write_to_entity = write_to_entity + self.row_count_validation = row_count_validation def with_( self, transformer: Callable[..., DataFrame], *args: Any, **kwargs: Any From ca1a16d5cb7f2a1800084160ecd472c5303918e7 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Fri, 18 Aug 2023 17:04:51 -0300 Subject: [PATCH 51/70] fix: parameter, libs (#341) --- butterfree/load/writers/historical_feature_store_writer.py | 7 ++++++- requirements.dev.txt | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index 489f22be1..0ea9b50c8 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -113,9 +113,14 @@ def __init__( debug_mode: bool = False, interval_mode: bool = False, check_schema_hook: Hook = None, + row_count_validation: bool = True, ): super(HistoricalFeatureStoreWriter, self).__init__( - db_config or MetastoreConfig(), debug_mode, interval_mode + db_config or MetastoreConfig(), + debug_mode, + interval_mode, + False, + row_count_validation, ) self.database = database or environment.get_variable( "FEATURE_STORE_HISTORICAL_DATABASE" diff --git a/requirements.dev.txt b/requirements.dev.txt index 3d70d4c05..abc64e3fb 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -7,3 +7,5 @@ sphinxemoji==0.1.8 sphinx-rtd-theme==0.5.2 recommonmark==0.7.1 pyarrow>=1.0.0 +setuptools +wheel \ No newline at end of file From 60c7ee4df17e574af95ea3f737a48979267e3c49 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 21 Aug 2023 11:12:54 -0300 Subject: [PATCH 52/70] pre-release 1.2.2.dev0 (#342) --- CHANGELOG.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 679e98343..1324f1c1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,52 @@ All notable changes to this project will be documented in this file. Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each release or unreleased log for a better organization. 
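For illustration of the two patches above (optional row count validation in the sink and the new writer parameter), a minimal sketch; the writer arguments are the ones visible in these diffs, while import paths are assumed from the project layout:

```python
# Illustrative sketch: a writer created with row_count_validation=False is
# now skipped by Sink.validate's row-count check (see the sink.py diff above).
from butterfree.load import Sink
from butterfree.load.writers import HistoricalFeatureStoreWriter

writer = HistoricalFeatureStoreWriter(
    debug_mode=False,
    interval_mode=False,
    row_count_validation=False,  # opt out of the written-vs-source count assert
)
sink = Sink(writers=[writer])
# sink.validate(feature_set=..., dataframe=..., spark_client=...) no longer
# raises for this writer if the written and source row counts drift.
```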
## [Unreleased] + +## [1.2.2](https://github.com/quintoandar/butterfree/releases/tag/1.2.2) + +### Changed +* Optional row count validation ([#284](https://github.com/quintoandar/butterfree/pull/284)) +* Bump several libs versions ([#333](https://github.com/quintoandar/butterfree/pull/333)) + +## [1.2.1](https://github.com/quintoandar/butterfree/releases/tag/1.2.1) + +### Changed +* Update README.md ([#331](https://github.com/quintoandar/butterfree/pull/331)) +* Update Github Actions Workflow runner ([#332](https://github.com/quintoandar/butterfree/pull/332)) +* Delete sphinx version. ([#334](https://github.com/quintoandar/butterfree/pull/334)) + +### Fixed +* Add the missing link for H3 geohash ([#330](https://github.com/quintoandar/butterfree/pull/330)) + +## [1.2.0](https://github.com/quintoandar/butterfree/releases/tag/1.2.0) + + +### Added +* [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) +* [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets ([#278](https://github.com/quintoandar/butterfree/pull/278)) +* Allow slide selection ([#293](https://github.com/quintoandar/butterfree/pull/293)) +* [MLOP-637] Implement diff method ([#292](https://github.com/quintoandar/butterfree/pull/292)) +* [MLOP-640] Create CLI with migrate command ([#298](https://github.com/quintoandar/butterfree/pull/298)) +* [MLOP-645] Implement query method, cassandra ([#291](https://github.com/quintoandar/butterfree/pull/291)) +* [MLOP-671] Implement get_schema on Spark client ([#301](https://github.com/quintoandar/butterfree/pull/301)) +* [MLOP-648] Implement query method, metastore ([#294](https://github.com/quintoandar/butterfree/pull/294)) +* [MLOP-647] / [MLOP-646] Apply migrations ([#300](https://github.com/quintoandar/butterfree/pull/300)) +* [MLOP-639] Track logs in S3 ([#306](https://github.com/quintoandar/butterfree/pull/306)) +* [MLOP-702] Debug mode for Automate Migration ([#322](https://github.com/quintoandar/butterfree/pull/322)) + +### Changed +* Keep milliseconds when using 'from_ms' argument in timestamp feature ([#284](https://github.com/quintoandar/butterfree/pull/284)) +* Read and write consistency level options ([#309](https://github.com/quintoandar/butterfree/pull/309)) +* [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree ([#327](https://github.com/quintoandar/butterfree/pull/327)) + +### Fixed +* [BUG] Apply create_partitions to historical validate ([#303](https://github.com/quintoandar/butterfree/pull/303)) +* [BUG] Fix key path for validate read ([#304](https://github.com/quintoandar/butterfree/pull/304)) +* [FIX] Add Partition types for Metastore ([#305](https://github.com/quintoandar/butterfree/pull/305)) +* Change solution for tracking logs ([#308](https://github.com/quintoandar/butterfree/pull/308)) +* [BUG] Fix Cassandra Connect Session ([#316](https://github.com/quintoandar/butterfree/pull/316)) +* Fix method to generate agg feature name. 
([#326](https://github.com/quintoandar/butterfree/pull/326)) + ### Added * [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) diff --git a/setup.py b/setup.py index 0029a78bf..b3a2297fb 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.0.dev19" +__version__ = "1.2.2.dev0" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From f35d66565226f6dd75a4f38706a22a4aa496c1d7 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 21 Aug 2023 13:48:52 -0300 Subject: [PATCH 53/70] Rebase staging (#343) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Release/1.1.3 (#290) * [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master (#280) * Change github actions pipelines. * Change pipeline logic. * [BUG] Fix Staging GithubActions Pipeline (#283) * New step on pipelie. * Some adjusts. * Apply only wheel. (#285) * [BUG] Change version on setup.py to PyPI (#286) * Add new make command to change version. * Change command order. * Change desc and variable name. * Change command name. * Keep milliseconds when using 'from_ms' argument in timestamp feature (#284) * changed timestamp resolution * fix import * simple refactor Co-authored-by: Henrique Camargo * Change trigger for pipeline staging (#287) * Change trigger to publish dev pipeline. * Some fix. * Create a dev package. (#288) * [MLOP-633] Butterfree dev workflow, update documentation (#281) * Update workflow doc. * Update README * Add pre-release. * Fix typo. * [MLOP-632] Butterfree dev workflow, automate release description (#279) * release 1.1.4 * update changelog Co-authored-by: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> * Release/1.2.0 (#328) * [MLOP-636] Create migration classes (#282) * [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278) * Add interval branch modifications. * Add interval_runs notebook. * Add tests. * Apply style (black, flack8 and mypy). * Fix tests. * Change version to create package dev. * Allow slide selection (#293) * Fix Slide Duration Typo (#295) * [MLOP-637] Implement diff method (#292) * [MLOP-640] Create CLI with migrate command (#298) * [MLOP-645] Implement query method, cassandra (#291) * [MLOP-671] Implement get_schema on Spark client (#301) * [MLOP-648] Implement query method, metastore (#294) * Fix Validation Step (#302) * [MLOP-647] [MLOP-646] Apply migrations (#300) * add apply migration method * add test apply migration * add migrate actor with tests * mypy compliant * fix test interaction with mocked object * Rebase and some adjusts. Co-authored-by: Mayara Moromisato * [BUG] Apply create_partitions to historical validate (#303) * Apply create_partitions to historical validate. * Remove comments and adjusts. * [BUG] Fix key path for validate read (#304) * Fix key path * bump version Co-authored-by: AlvaroMarquesAndrade <1a789766b1c4c8b679e80f11fa6d63d42fa4bcdf> * [FIX] Add Partition types for Metastore (#305) * [MLOP-639] Track logs in S3 (#306) * Apply tracking logs and logging config. * Adjusts in CLI and logging.conf. * Some adjusts. * Change version to generate new dev package * Fix version. * Apply style. * Add new assert in the migrate unit test. 
* [BUG] Change logging config (#307) * Change logging config. * Some adjusts. * Remove a code smell. * Change solution for tracking logs (#308) * Change tracking logs method. * Change version to generate dev package. * Change path name in S3 * Read and write consistency level options (#309) * modify cassandra client to be region aware * add option for the user to set read and write consistency levels on cassandra config * add tests * use env vars instead * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira * Fix kafka reader. (#310) * Fix path validate. (#311) * Add local dc property (#312) * add local dc property * update version * Remove metastore migrate (#313) * Remove metastore migrate. * Change version to create a dev package. * Fix link in our docs. (#315) * [BUG] Fix Cassandra Connect Session (#316) * Fix Cassandra Connect Session. * Apply style. * Fix migration query. (#318) * Fix migration query add type key. (#319) * Fix db-config condition (#321) * Fix db-config condition. * Apply style. * MLOP-642 Document migration in Butterfree (#320) * update docs * add more information and reference new cli.md file * [MLOP-702] Debug mode for Automate Migration (#322) * Create flag debug-mode. * Fix tests. * Fix migrate test. * [MLOP-727] Improve logging messages (#325) * Fix logging message for local file * Remove json import * [MLOP-728] Improve logging messages (#324) * Improve logs. * Revert debug-mode condition. * Fix method to generate agg feature name. (#326) * [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree (#327) * Change writer type for interval mode. * Some adjusts. * Release 1.2.0 Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> * Release 1.2.1 (#338) * Add the missing link for H3 geohash (#330) * Add the missing link for H3 geohash * Update the H3 geohash link. * Update the same link Update the same link in in spark_function_and_window.ipynb example * Update README.md (#331) * Update Github Actions Workflow runner (#332) * Update Workflow runner version * bump flake8-bandit * chore: bypass false positive for S105 Co-authored-by: Lucas Cardozo * Delete sphinx version. (#334) * Update files to staging (#336) Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira * Fix kafka reader. (#310) * Fix path validate. (#311) * Add local dc property (#312) * release 1.2.1 Co-authored-by: Jay Vala <24193355+jdvala@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Lucas Fonseca Co-authored-by: Lucas Cardozo Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> * [BUG] Fix Staging GithubActions Pipeline (#283) * New step on pipelie. * Some adjusts. * Apply only wheel. (#285) * [BUG] Change version on setup.py to PyPI (#286) * Add new make command to change version. * Change command order. * Change desc and variable name. * Change command name. 
* [MLOP-636] Create migration classes (#282) * [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278) * Add interval branch modifications. * Add interval_runs notebook. * Add tests. * Apply style (black, flack8 and mypy). * Fix tests. * Change version to create package dev. * [MLOP-637] Implement diff method (#292) * [MLOP-640] Create CLI with migrate command (#298) * [MLOP-645] Implement query method, cassandra (#291) * [MLOP-648] Implement query method, metastore (#294) * Fix Validation Step (#302) * [MLOP-647] [MLOP-646] Apply migrations (#300) * add apply migration method * add test apply migration * add migrate actor with tests * mypy compliant * fix test interaction with mocked object * Rebase and some adjusts. Co-authored-by: Mayara Moromisato * [MLOP-639] Track logs in S3 (#306) * Apply tracking logs and logging config. * Adjusts in CLI and logging.conf. * Some adjusts. * Change version to generate new dev package * Fix version. * Apply style. * Add new assert in the migrate unit test. * [BUG] Change logging config (#307) * Change logging config. * Some adjusts. * Remove a code smell. * Change solution for tracking logs (#308) * Change tracking logs method. * Change version to generate dev package. * Change path name in S3 * Remove metastore migrate (#313) * Remove metastore migrate. * Change version to create a dev package. * [MLOP-727] Improve logging messages (#325) * Fix logging message for local file * Remove json import * [MLOP-728] Improve logging messages (#324) * Improve logs. * Revert debug-mode condition. * [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree (#327) * Change writer type for interval mode. * Some adjusts. * Update files to staging (#336) * Release/1.1.3 (#290) * [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master (#280) * Change github actions pipelines. * Change pipeline logic. * [BUG] Fix Staging GithubActions Pipeline (#283) * New step on pipelie. * Some adjusts. * Apply only wheel. (#285) * [BUG] Change version on setup.py to PyPI (#286) * Add new make command to change version. * Change command order. * Change desc and variable name. * Change command name. * Keep milliseconds when using 'from_ms' argument in timestamp feature (#284) * changed timestamp resolution * fix import * simple refactor Co-authored-by: Henrique Camargo * Change trigger for pipeline staging (#287) * Change trigger to publish dev pipeline. * Some fix. * Create a dev package. (#288) * [MLOP-633] Butterfree dev workflow, update documentation (#281) * Update workflow doc. * Update README * Add pre-release. * Fix typo. * [MLOP-632] Butterfree dev workflow, automate release description (#279) * release 1.1.4 * update changelog Co-authored-by: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> * Release/1.2.0 (#328) * [MLOP-636] Create migration classes (#282) * [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278) * Add interval branch modifications. * Add interval_runs notebook. * Add tests. * Apply style (black, flack8 and mypy). * Fix tests. * Change version to create package dev. 
* Allow slide selection (#293) * Fix Slide Duration Typo (#295) * [MLOP-637] Implement diff method (#292) * [MLOP-640] Create CLI with migrate command (#298) * [MLOP-645] Implement query method, cassandra (#291) * [MLOP-671] Implement get_schema on Spark client (#301) * [MLOP-648] Implement query method, metastore (#294) * Fix Validation Step (#302) * [MLOP-647] [MLOP-646] Apply migrations (#300) * add apply migration method * add test apply migration * add migrate actor with tests * mypy compliant * fix test interaction with mocked object * Rebase and some adjusts. Co-authored-by: Mayara Moromisato * [BUG] Apply create_partitions to historical validate (#303) * Apply create_partitions to historical validate. * Remove comments and adjusts. * [BUG] Fix key path for validate read (#304) * Fix key path * bump version Co-authored-by: AlvaroMarquesAndrade <1a789766b1c4c8b679e80f11fa6d63d42fa4bcdf> * [FIX] Add Partition types for Metastore (#305) * [MLOP-639] Track logs in S3 (#306) * Apply tracking logs and logging config. * Adjusts in CLI and logging.conf. * Some adjusts. * Change version to generate new dev package * Fix version. * Apply style. * Add new assert in the migrate unit test. * [BUG] Change logging config (#307) * Change logging config. * Some adjusts. * Remove a code smell. * Change solution for tracking logs (#308) * Change tracking logs method. * Change version to generate dev package. * Change path name in S3 * Read and write consistency level options (#309) * modify cassandra client to be region aware * add option for the user to set read and write consistency levels on cassandra config * add tests * use env vars instead * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira * Fix kafka reader. (#310) * Fix path validate. (#311) * Add local dc property (#312) * add local dc property * update version * Remove metastore migrate (#313) * Remove metastore migrate. * Change version to create a dev package. * Fix link in our docs. (#315) * [BUG] Fix Cassandra Connect Session (#316) * Fix Cassandra Connect Session. * Apply style. * Fix migration query. (#318) * Fix migration query add type key. (#319) * Fix db-config condition (#321) * Fix db-config condition. * Apply style. * MLOP-642 Document migration in Butterfree (#320) * update docs * add more information and reference new cli.md file * [MLOP-702] Debug mode for Automate Migration (#322) * Create flag debug-mode. * Fix tests. * Fix migrate test. * [MLOP-727] Improve logging messages (#325) * Fix logging message for local file * Remove json import * [MLOP-728] Improve logging messages (#324) * Improve logs. * Revert debug-mode condition. * Fix method to generate agg feature name. (#326) * [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree (#327) * Change writer type for interval mode. * Some adjusts. 
* Release 1.2.0 Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> Co-authored-by: hmeretti Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> * Revert "Update files to staging (#336)" (#337) This reverts commit 35bcd30af981a960bc7c79c47e6a25dbed729f6c. * Less strict requirements (#333) * bump a few requirements; increase lower bound for h3 version range; adding pyarrow dev dependency * fix type repr for spark types; fix: broken tests (pyspark 3.4) --------- Co-authored-by: Ralph Rassweiler * feat: optional row count validation (#340) * fix: parameter, libs (#341) --------- Co-authored-by: hmeretti Co-authored-by: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> Co-authored-by: Jay Vala <24193355+jdvala@users.noreply.github.com> Co-authored-by: Lucas Fonseca Co-authored-by: Lucas Cardozo Co-authored-by: Mayara Moromisato From 97e44fa896dd9fce4e46645d2437f9684c4cee75 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 21 Aug 2023 15:07:50 -0300 Subject: [PATCH 54/70] Rebase staging from master (#345) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Release/1.1.3 (#290) * [MLOP-634] Butterfree dev workflow, set triggers for branches staging and master (#280) * Change github actions pipelines. * Change pipeline logic. * [BUG] Fix Staging GithubActions Pipeline (#283) * New step on pipelie. * Some adjusts. * Apply only wheel. (#285) * [BUG] Change version on setup.py to PyPI (#286) * Add new make command to change version. * Change command order. * Change desc and variable name. * Change command name. * Keep milliseconds when using 'from_ms' argument in timestamp feature (#284) * changed timestamp resolution * fix import * simple refactor Co-authored-by: Henrique Camargo * Change trigger for pipeline staging (#287) * Change trigger to publish dev pipeline. * Some fix. * Create a dev package. (#288) * [MLOP-633] Butterfree dev workflow, update documentation (#281) * Update workflow doc. * Update README * Add pre-release. * Fix typo. 
* [MLOP-632] Butterfree dev workflow, automate release description (#279) * release 1.1.4 * update changelog Co-authored-by: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> * Release/1.2.0 (#328) * [MLOP-636] Create migration classes (#282) * [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets (#278) * Add interval branch modifications. * Add interval_runs notebook. * Add tests. * Apply style (black, flack8 and mypy). * Fix tests. * Change version to create package dev. * Allow slide selection (#293) * Fix Slide Duration Typo (#295) * [MLOP-637] Implement diff method (#292) * [MLOP-640] Create CLI with migrate command (#298) * [MLOP-645] Implement query method, cassandra (#291) * [MLOP-671] Implement get_schema on Spark client (#301) * [MLOP-648] Implement query method, metastore (#294) * Fix Validation Step (#302) * [MLOP-647] [MLOP-646] Apply migrations (#300) * add apply migration method * add test apply migration * add migrate actor with tests * mypy compliant * fix test interaction with mocked object * Rebase and some adjusts. Co-authored-by: Mayara Moromisato * [BUG] Apply create_partitions to historical validate (#303) * Apply create_partitions to historical validate. * Remove comments and adjusts. * [BUG] Fix key path for validate read (#304) * Fix key path * bump version Co-authored-by: AlvaroMarquesAndrade <1a789766b1c4c8b679e80f11fa6d63d42fa4bcdf> * [FIX] Add Partition types for Metastore (#305) * [MLOP-639] Track logs in S3 (#306) * Apply tracking logs and logging config. * Adjusts in CLI and logging.conf. * Some adjusts. * Change version to generate new dev package * Fix version. * Apply style. * Add new assert in the migrate unit test. * [BUG] Change logging config (#307) * Change logging config. * Some adjusts. * Remove a code smell. * Change solution for tracking logs (#308) * Change tracking logs method. * Change version to generate dev package. * Change path name in S3 * Read and write consistency level options (#309) * modify cassandra client to be region aware * add option for the user to set read and write consistency levels on cassandra config * add tests * use env vars instead * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira * Fix kafka reader. (#310) * Fix path validate. (#311) * Add local dc property (#312) * add local dc property * update version * Remove metastore migrate (#313) * Remove metastore migrate. * Change version to create a dev package. * Fix link in our docs. (#315) * [BUG] Fix Cassandra Connect Session (#316) * Fix Cassandra Connect Session. * Apply style. * Fix migration query. (#318) * Fix migration query add type key. (#319) * Fix db-config condition (#321) * Fix db-config condition. * Apply style. * MLOP-642 Document migration in Butterfree (#320) * update docs * add more information and reference new cli.md file * [MLOP-702] Debug mode for Automate Migration (#322) * Create flag debug-mode. * Fix tests. * Fix migrate test. * [MLOP-727] Improve logging messages (#325) * Fix logging message for local file * Remove json import * [MLOP-728] Improve logging messages (#324) * Improve logs. * Revert debug-mode condition. * Fix method to generate agg feature name. 
(#326) * [MLOP-691] Include step to add partition to SparkMetastore during writing of Butterfree (#327) * Change writer type for interval mode. * Some adjusts. * Release 1.2.0 Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> * Release 1.2.1 (#338) * Add the missing link for H3 geohash (#330) * Add the missing link for H3 geohash * Update the H3 geohash link. * Update the same link Update the same link in in spark_function_and_window.ipynb example * Update README.md (#331) * Update Github Actions Workflow runner (#332) * Update Workflow runner version * bump flake8-bandit * chore: bypass false positive for S105 Co-authored-by: Lucas Cardozo * Delete sphinx version. (#334) * Update files to staging (#336) Co-authored-by: Rodrigo Martins de Oliveira * Update butterfree/configs/db/cassandra_config.py Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Rodrigo Martins de Oliveira * Fix kafka reader. (#310) * Fix path validate. (#311) * Add local dc property (#312) * release 1.2.1 Co-authored-by: Jay Vala <24193355+jdvala@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Lucas Fonseca Co-authored-by: Lucas Cardozo Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> * fix: methods * fix: duplicate --------- Co-authored-by: hmeretti Co-authored-by: Mayara Moromisato <44944954+moromimay@users.noreply.github.com> Co-authored-by: Henrique Camargo Co-authored-by: AlvaroMarquesAndrade <45604858+AlvaroMarquesAndrade@users.noreply.github.com> Co-authored-by: Igor Gustavo Hoelscher <19557581+roelschr@users.noreply.github.com> Co-authored-by: Felipe Victorino Caputo <13631451+fvcaputo@users.noreply.github.com> Co-authored-by: Rodrigo Martins de Oliveira Co-authored-by: Gabriel Brandão <37742275+GaBrandao@users.noreply.github.com> Co-authored-by: Jay Vala <24193355+jdvala@users.noreply.github.com> Co-authored-by: Lucas Fonseca Co-authored-by: Lucas Cardozo --- CHANGELOG.md | 4 ---- requirements.dev.txt | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1324f1c1f..27b680bf2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,7 +23,6 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each ## [1.2.0](https://github.com/quintoandar/butterfree/releases/tag/1.2.0) - ### Added * [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) * [MLOP-635] Rebase Incremental Job/Interval Run branch for test on selected feature sets ([#278](https://github.com/quintoandar/butterfree/pull/278)) @@ -50,9 +49,6 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each * [BUG] Fix Cassandra Connect Session ([#316](https://github.com/quintoandar/butterfree/pull/316)) * Fix method to generate agg feature name. 
([#326](https://github.com/quintoandar/butterfree/pull/326)) -### Added -* [MLOP-636] Create migration classes ([#282](https://github.com/quintoandar/butterfree/pull/282)) - ## [1.1.3](https://github.com/quintoandar/butterfree/releases/tag/1.1.3) ### Added * [MLOP-599] Apply mypy to ButterFree ([#273](https://github.com/quintoandar/butterfree/pull/273)) diff --git a/requirements.dev.txt b/requirements.dev.txt index abc64e3fb..4e164c83f 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -8,4 +8,4 @@ sphinx-rtd-theme==0.5.2 recommonmark==0.7.1 pyarrow>=1.0.0 setuptools -wheel \ No newline at end of file +wheel From 9bcca0e20ba63f1643775e4553a68879a021c874 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 13 Nov 2023 11:27:38 -0300 Subject: [PATCH 55/70] feat(MLOP-1985): optional params (#347) * feat: optional params --- butterfree/extract/source.py | 13 +++++++++++-- butterfree/transform/aggregated_feature_set.py | 17 ++++++++++++++--- butterfree/transform/feature_set.py | 16 ++++++++++++++-- 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/butterfree/extract/source.py b/butterfree/extract/source.py index 6d905c6b5..1209e9162 100644 --- a/butterfree/extract/source.py +++ b/butterfree/extract/source.py @@ -49,13 +49,22 @@ class Source(HookableComponent): temporary views regarding each reader and, after, will run the desired query and return a dataframe. + The `eager_evaluation` param forces Spark to apply the currently + mapped changes to the DataFrame. When this parameter is set to + False, Spark follows its standard behaviour of lazy evaluation. + Lazy evaluation can improve Spark's performance as it allows + Spark to build the best version of the execution plan. + """ - def __init__(self, readers: List[Reader], query: str) -> None: + def __init__( + self, readers: List[Reader], query: str, eager_evaluation: bool = True, + ) -> None: super().__init__() self.enable_pre_hooks = False self.readers = readers self.query = query + self.eager_evaluation = eager_evaluation def construct( self, client: SparkClient, start_date: str = None, end_date: str = None @@ -87,7 +96,7 @@ def construct( dataframe = client.sql(self.query) - if not dataframe.isStreaming: + if not dataframe.isStreaming and self.eager_evaluation: dataframe.cache().count() post_hook_df = self.run_post_hooks(dataframe) diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 133195d72..0bff33c65 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -197,6 +197,8 @@ def __init__( keys: List[KeyFeature], timestamp: TimestampFeature, features: List[Feature], + deduplicate_rows: bool = True, + eager_evaluation: bool = True, ): self._windows: List[Any] = [] self._pivot_column: Optional[str] = None @@ -204,7 +206,14 @@ def __init__( self._distinct_subset: List[Any] = [] self._distinct_keep: Optional[str] = None super(AggregatedFeatureSet, self).__init__( - name, entity, description, keys, timestamp, features, + name, + entity, + description, + keys, + timestamp, + features, + deduplicate_rows, + eager_evaluation, ) @property @@ -626,8 +635,10 @@ def construct( float("nan"), None ) if not output_df.isStreaming: - output_df = self._filter_duplicated_rows(output_df) - output_df.cache().count() + if self.deduplicate_rows: + output_df = self._filter_duplicated_rows(output_df) + if self.eager_evaluation: + output_df.cache().count() post_hook_df = self.run_post_hooks(output_df) diff --git 
a/butterfree/transform/feature_set.py b/butterfree/transform/feature_set.py index c2e40a498..469a353a8 100644 --- a/butterfree/transform/feature_set.py +++ b/butterfree/transform/feature_set.py @@ -97,6 +97,12 @@ class FeatureSet(HookableComponent): values over key columns and timestamp column, we do this in order to reduce our dataframe (regarding the number of rows). A detailed explation of this method can be found at filter_duplicated_rows docstring. + + The `eager_evaluation` param forces Spark to apply the currently + mapped changes to the DataFrame. When this parameter is set to + False, Spark follows its standard behaviour of lazy evaluation. + Lazy evaluation can improve Spark's performance as it allows + Spark to build the best version of the execution plan. """ def __init__( @@ -107,6 +113,8 @@ def __init__( keys: List[KeyFeature], timestamp: TimestampFeature, features: List[Feature], + deduplicate_rows: bool = True, + eager_evaluation: bool = True, ) -> None: super().__init__() self.name = name @@ -116,6 +124,8 @@ def __init__( self.timestamp = timestamp self.features = features self.incremental_strategy = IncrementalStrategy(column=TIMESTAMP_COLUMN) + self.deduplicate_rows = deduplicate_rows + self.eager_evaluation = eager_evaluation @property def name(self) -> str: @@ -426,8 +436,10 @@ def construct( ).select(*self.columns) if not output_df.isStreaming: - output_df = self._filter_duplicated_rows(output_df) - output_df.cache().count() + if self.deduplicate_rows: + output_df = self._filter_duplicated_rows(output_df) + if self.eager_evaluation: + output_df.cache().count() output_df = self.incremental_strategy.filter_with_incremental_strategy( dataframe=output_df, start_date=start_date, end_date=end_date From 512a0fe9642be776dca1dd97ff683447880079e9 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 13 Nov 2023 12:02:13 -0300 Subject: [PATCH 56/70] pre-release 1.2.3 (#349) --- CHANGELOG.md | 6 ++++++ setup.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27b680bf2..b252803c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each ## [Unreleased] +## [1.2.3](https://github.com/quintoandar/butterfree/releases/tag/1.2.3) +* Optional params ([#347](https://github.com/quintoandar/butterfree/pull/347)) + +### Changed +* Optional row count validation ([#340](https://github.com/quintoandar/butterfree/pull/340)) + ## [1.2.2](https://github.com/quintoandar/butterfree/releases/tag/1.2.2) ### Changed diff --git a/setup.py b/setup.py index b3a2297fb..2fc26b469 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.2.dev0" +__version__ = "1.2.3.dev0" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 688a5b3c0ab514034821de2af79ffa2ccdb9dd49 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Thu, 11 Apr 2024 13:50:18 -0300 Subject: [PATCH 57/70] feat(MLOP-2145): add feature set creation script (#351) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add feature set creation script * feat(mlop-2145): updating auto fs creation (#352) * feat(updating-auto-fs-creation): adding methods to the class as private and add Table dataclass * feat(updating-auto-fs-creation): using dataclass and adding typing * feat(updating-auto-fs-creation): finish using all type 
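For illustration only, a minimal sketch of how the optional parameters introduced above (`eager_evaluation` and `deduplicate_rows`, both defaulting to True) might be used; the table, entity and feature names are placeholders, not taken from the repository:

# Sketch assuming the import paths shown in the surrounding diffs.
from butterfree.constants.data_type import DataType
from butterfree.extract import Source
from butterfree.extract.readers import TableReader
from butterfree.transform.feature_set import FeatureSet
from butterfree.transform.features import Feature, KeyFeature, TimestampFeature

source = Source(
    readers=[TableReader(id="events", database="my_db", table="events")],
    query="select * from events",
    eager_evaluation=False,  # keep Spark's lazy evaluation; skip cache().count()
)

feature_set = FeatureSet(
    name="example_feature_set",
    entity="entity",
    description="illustration of the new optional flags",
    keys=[KeyFeature(name="id", description="key", dtype=DataType.BIGINT)],
    timestamp=TimestampFeature(),
    features=[Feature(name="feature1", description="test", dtype=DataType.FLOAT)],
    deduplicate_rows=False,   # skip the duplicate-row filter on construct()
    eager_evaluation=False,   # let Spark build the full plan before any action
)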
hints and apply format * feat(updating-auto-fs-creation): add docstring and auto-infer by df * fix(updating-auto-fs-creation): remove unused format * feat(updating-auto-fs-creation): creating flake8 ignore list * feat(updating-auto-fs-creation): apply fmt * feat(updating-auto-fs-creation): init file * feat(updating-auto-fs-creation): making more readable * feat(updating-auto-fs-creation): remove wrong file * feat(updating-auto-fs-creation): apply fmt * feat(updating-auto-fs-creation): ignoring mypy * feat(updating-auto-fs-creation): add unit test * feat(updating-auto-fs-creation): using Dataframe from pyspark --------- Co-authored-by: João Albuquerque --- butterfree/automated/__init__.py | 0 butterfree/automated/feature_set_creation.py | 199 ++++++++++++++++++ setup.cfg | 2 +- tests/unit/butterfree/automated/__init__.py | 0 .../automated/test_feature_set_creation.py | 28 +++ 5 files changed, 228 insertions(+), 1 deletion(-) create mode 100644 butterfree/automated/__init__.py create mode 100644 butterfree/automated/feature_set_creation.py create mode 100644 tests/unit/butterfree/automated/__init__.py create mode 100644 tests/unit/butterfree/automated/test_feature_set_creation.py diff --git a/butterfree/automated/__init__.py b/butterfree/automated/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/butterfree/automated/feature_set_creation.py b/butterfree/automated/feature_set_creation.py new file mode 100644 index 000000000..4a078135c --- /dev/null +++ b/butterfree/automated/feature_set_creation.py @@ -0,0 +1,199 @@ +import re +from dataclasses import dataclass +from typing import List, Optional, Tuple + +from pyspark.sql import DataFrame + +from butterfree.constants.data_type import DataType + +BUTTERFREE_DTYPES = { + "string": DataType.STRING.spark_sql, + "long": DataType.BIGINT.spark_sql, + "double": DataType.DOUBLE.spark_sql, + "boolean": DataType.BOOLEAN.spark_sql, + "integer": DataType.INTEGER.spark_sql, + "date": DataType.DATE.spark_sql, + "timestamp": DataType.TIMESTAMP.spark_sql, + "array": { + "long": DataType.ARRAY_BIGINT.spark_sql, + "float": DataType.ARRAY_FLOAT.spark_sql, + "string": DataType.ARRAY_STRING.spark_sql, + }, +} + + +@dataclass(frozen=True) +class Table: # noqa: D101 + id: str + database: str + name: str + + +class FeatureSetCreation: + """Class to auto-generate readers and features.""" + + def _get_features_with_regex(self, sql_query: str) -> List[str]: + features = [] + sql_query = " ".join(sql_query.split()) + first_pattern = re.compile("[(]?([\w.*]+)[)]?,", re.IGNORECASE) + second_pattern = re.compile("(\w+)\s(from)", re.IGNORECASE) + + for pattern in [first_pattern, second_pattern]: + matches = pattern.finditer(sql_query) + for match in matches: + feature = match.group(1) + + if "." in feature: + feature = feature.split(".")[1] + + features.append(feature) + + return features + + def _get_data_type(self, field_name: str, df: DataFrame) -> str: + for field in df.schema.jsonValue()["fields"]: + if field["name"] == field_name: + + field_type = field["type"] + + if isinstance(field_type, dict): + + field_type_keys = field_type.keys() + + if "type" in field_type_keys and "elementType" in field_type_keys: + return ( + "." + + BUTTERFREE_DTYPES[field_type["type"]][ # type: ignore + field_type["elementType"] + ] + ) + + return "." 
+ BUTTERFREE_DTYPES[field["type"]] + + return "" + + def _get_tables_with_regex(self, sql_query: str) -> Tuple[List[Table], str]: + + modified_sql_query = sql_query + tables = [] + stop_words = [ + "left", + "right", + "full outer", + "inner", + "where", + "join", + "on", + "as", + ] + keywords = ["from", "join"] + + for keyword in keywords: + pattern = re.compile( + rf"\b{keyword}\s+(\w+\.\w+|\w+)\s+(\w+)", re.IGNORECASE + ) + matches = pattern.finditer(sql_query) + + for match in matches: + full_table_name = match.group(1) + id = match.group(2).strip() + + if id in stop_words: + id = full_table_name + + if "." in full_table_name: + database, table = full_table_name.split(".") + + modified_sql_query = re.sub( + rf"\b{database}\.{table}\b", table, modified_sql_query + ) + + tables.append(Table(id=id, database=database, name=table)) + else: + modified_sql_query = re.sub( + rf"\b{full_table_name}\b", full_table_name, modified_sql_query + ) + tables.append(Table(id=id, database="TBD", name=full_table_name)) + + return tables, modified_sql_query + + def get_readers(self, sql_query: str) -> str: + """ + Extracts table readers from a SQL query and formats them as a string. + + Args: + sql_query (str): The SQL query from which to extract table readers. + + Returns: + str: A formatted string containing the table readers. + """ + tables, modified_sql_query = self._get_tables_with_regex(sql_query.lower()) + readers = [] + for table in tables: + table_reader_string = f""" + TableReader( + id="{table.id}", + database="{table.database}", + table="{table.name}" + ), + """ + readers.append(table_reader_string) + + final_string = """ + source=Source( + readers=[ + {} + ], + query=( + \"\"\" + {} + \"\"\" + ), + ), + """.format( + "".join(readers), modified_sql_query.replace("\n", "\n\t\t") + ) + + return final_string + + def get_features(self, sql_query: str, df: Optional[DataFrame] = None) -> str: + """ + Extract features from a SQL query and return them formatted as a string. + + Args: + sql_query (str): The SQL query used to extract features. + df (Optional[DataFrame], optional): Optional DataFrame used to infer data types. Defaults to None. + + Returns: + str: A formatted string containing the extracted features. + + This sould be used on Databricks. + + Especially if you want automatic type inference without passing a reference dataframe. + The utility will only work in an environment where a spark session is available in the environment + """ # noqa: E501 + + features = self._get_features_with_regex(sql_query) + features_formatted = [] + for feature in features: + description = feature.replace("__", " ").replace("_", " ").capitalize() + + data_type = "." 
+ + if df is None: + df = spark.sql(sql_query) # type: ignore # noqa: F821 + + data_type = self._get_data_type(feature, df) + + feature_string = f""" + Feature( + name="{feature}", + description="{description}", + dtype=DataType{data_type}, + ), + """ + features_formatted.append(feature_string) + + final_string = ("features=[\t{}],\n),").format("".join(features_formatted)) + + return final_string diff --git a/setup.cfg b/setup.cfg index cff001224..c58c2df3e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,7 @@ docstring-convention = google max-line-length = 88 max-complexity = 12 -ignore = W503, E203, D203, D401, D107, S101, D105 +ignore = W503, E203, D203, D401, D107, S101, D105, D100, W605, D202, D212, D104, E261 exclude = dist/*,build/*,.pytest_cache/*,.git/*,pip/* per-file-ignores = # We will not check for docstrings or the use of asserts in tests diff --git a/tests/unit/butterfree/automated/__init__.py b/tests/unit/butterfree/automated/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/butterfree/automated/test_feature_set_creation.py b/tests/unit/butterfree/automated/test_feature_set_creation.py new file mode 100644 index 000000000..cfb5101e6 --- /dev/null +++ b/tests/unit/butterfree/automated/test_feature_set_creation.py @@ -0,0 +1,28 @@ +import unittest +from unittest.mock import MagicMock + +from butterfree.automated.feature_set_creation import FeatureSetCreation + + +class TestFeatureSetCreation(unittest.TestCase): + def setUp(self): + self.feature_set_creation = FeatureSetCreation() + + def test_get_features_with_regex(self): + sql_query = "SELECT column1, column2 FROM table1" + expected_features = ["column1", "column2"] + + features = self.feature_set_creation._get_features_with_regex(sql_query) + + self.assertEqual(features, expected_features) + + def test_get_data_type(self): + field_name = "column1" + df_mock = MagicMock() + df_mock.schema.jsonValue.return_value = { + "fields": [{"name": "column1", "type": "string"}] + } + + data_type = self.feature_set_creation._get_data_type(field_name, df_mock) + + self.assertEqual(data_type, ".STRING") From da91b49cbd69b6fa0881a4327df86ac004f22c05 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Thu, 25 Apr 2024 10:18:26 -0300 Subject: [PATCH 58/70] Rebase staging from master (#354) * Rebasing and add skip lint on github actions --- .github/workflows/skip_lint.yml | 17 ++++++++ .github/workflows/staging.yml | 63 ++++++++++++++-------------- CHANGELOG.md | 15 ++++--- docs/source/butterfree.automated.rst | 19 +++++++++ docs/source/butterfree.rst | 1 + setup.py | 2 +- 6 files changed, 78 insertions(+), 39 deletions(-) create mode 100644 .github/workflows/skip_lint.yml create mode 100644 docs/source/butterfree.automated.rst diff --git a/.github/workflows/skip_lint.yml b/.github/workflows/skip_lint.yml new file mode 100644 index 000000000..1c768a238 --- /dev/null +++ b/.github/workflows/skip_lint.yml @@ -0,0 +1,17 @@ +# This step is used only because we want to mark the runner-linter check as required +# for PRs to develop, but not for the merge queue to merge into develop, +# github does not have this functionality yet + +name: 'Skip github-actions/runner-linter check at merge queue' + +on: + merge_group: + +jobs: + empty_job: + name: 'github-actions/runner-linter' + runs-on: github-actions-developers-runner + steps: + - name: Skip github-actions/runner-linter check at merge queue + run: | + echo "Done" diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 1f94fc5dd..77127820e 
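# A minimal usage sketch of the new FeatureSetCreation helper (illustration
# only; the query and table names are placeholders, and `spark` is assumed to
# be an active SparkSession, as on Databricks).
from butterfree.automated.feature_set_creation import FeatureSetCreation

creation = FeatureSetCreation()
query = "SELECT o.id, o.order_total, o.created_at FROM my_db.orders o"

# Returns a formatted `source=Source(...)` string with one TableReader per
# table referenced in the query.
print(creation.get_readers(query))

# With a reference DataFrame the helper infers each feature's DataType from
# the schema; if df is omitted it calls spark.sql(query) itself.
reference_df = spark.sql(query)  # noqa: F821 - assumes a global `spark`
print(creation.get_features(query, reference_df))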
100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -2,8 +2,7 @@ name: "Publish Dev Package" on: push: paths: - - 'setup.py' - + - "setup.py" jobs: Pipeline: @@ -13,33 +12,33 @@ jobs: container: quintoandar/python-3-7-java steps: - - uses: actions/checkout@v2 - - - name: Install dependencies - run: make ci-install - - - name: Get version - run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 )" >> $GITHUB_ENV - - - name: Build package - run: make package - - - name: Create release - uses: actions/create-release@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - tag_name: ${{ env.version }} - release_name: Release ${{ env.version }} - prerelease: true - - - name: Release already exist - if: ${{ failure() }} - run: echo Release already exist - - - name: Publish release to pypi.org - if: ${{ success() }} - env: - PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} - PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose dist/* + - uses: actions/checkout@v2 + + - name: Install dependencies + run: make ci-install + + - name: Get version + run: echo "version=$(grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d \' -f2 )" >> $GITHUB_ENV + + - name: Build package + run: make package + + - name: Create release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ env.version }} + release_name: Release ${{ env.version }} + prerelease: true + + - name: Release already exist + if: ${{ failure() }} + run: echo Release already exist + + - name: Publish release to pypi.org + if: ${{ success() }} + env: + PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} + PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: PYTHONPATH=./pip/deps python -m twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD --verbose dist/* diff --git a/CHANGELOG.md b/CHANGELOG.md index b252803c6..ad9f48634 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each ## [Unreleased] +## [1.2.4](https://github.com/quintoandar/butterfree/releases/tag/1.2.4) +* Auto create feature sets ([#351](https://github.com/quintoandar/butterfree/pull/351)) + ## [1.2.3](https://github.com/quintoandar/butterfree/releases/tag/1.2.3) * Optional params ([#347](https://github.com/quintoandar/butterfree/pull/347)) @@ -66,7 +69,7 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each * [MLOP-632] Butterfree dev workflow, automate release description ([#279](https://github.com/quintoandar/butterfree/commit/245eaa594846166972241b03fddc61ee5117b1f7)) ### Fixed -* Change trigger for pipeline staging ([#287](https://github.com/quintoandar/butterfree/pull/287)) +* Change trigger for pipeline staging ([#287](https://github.com/quintoandar/butterfree/pull/287)) ## [1.1.2](https://github.com/quintoandar/butterfree/releases/tag/1.1.2) ### Fixed @@ -89,11 +92,11 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each * Update README ([#257](https://github.com/quintoandar/butterfree/pull/257)) ### Fixed -* Fix Butterfree's workflow ([#262](https://github.com/quintoandar/butterfree/pull/262)) +* Fix Butterfree's workflow ([#262](https://github.com/quintoandar/butterfree/pull/262)) * [FIX] Downgrade Python Version in Pyenv ([#227](https://github.com/quintoandar/butterfree/pull/227)) -* [FIX] Fix docs 
([#229](https://github.com/quintoandar/butterfree/pull/229)) +* [FIX] Fix docs ([#229](https://github.com/quintoandar/butterfree/pull/229)) * [FIX] Fix Docs - Add more dependencies ([#230](https://github.com/quintoandar/butterfree/pull/230)) -* Fix broken notebook URL ([#236](https://github.com/quintoandar/butterfree/pull/236)) +* Fix broken notebook URL ([#236](https://github.com/quintoandar/butterfree/pull/236)) * Issue #77 Fix ([#245](https://github.com/quintoandar/butterfree/pull/245)) ## [1.0.2](https://github.com/quintoandar/butterfree/releases/tag/1.0.2) @@ -104,7 +107,7 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each * [MLOP-426] Change branching strategy on butterfree to use only master branch ([#216](https://github.com/quintoandar/butterfree/pull/216)) ### Fixed -* [MLOP-440] Python 3.7 bump and Fixing Dependencies ([#220](https://github.com/quintoandar/butterfree/pull/220)) +* [MLOP-440] Python 3.7 bump and Fixing Dependencies ([#220](https://github.com/quintoandar/butterfree/pull/220)) ## [1.0.1](https://github.com/quintoandar/butterfree/releases/tag/1.0.1) ### Added @@ -303,4 +306,4 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each * [MLOP-143] Fix Bugs for HouseMain FeatureSet ([#62](https://github.com/quintoandar/butterfree/pull/62)) ## [0.1.0](https://github.com/quintoandar/butterfree/releases/tag/0.1.0) -* First modules and entities of butterfree package. \ No newline at end of file +* First modules and entities of butterfree package. diff --git a/docs/source/butterfree.automated.rst b/docs/source/butterfree.automated.rst new file mode 100644 index 000000000..de290d9c7 --- /dev/null +++ b/docs/source/butterfree.automated.rst @@ -0,0 +1,19 @@ +butterfree.automated package +============================ + +Submodules +---------- + + +.. automodule:: butterfree.automated.feature_set_creation + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: butterfree.automated + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/butterfree.rst b/docs/source/butterfree.rst index 0828f9211..e108be6e7 100644 --- a/docs/source/butterfree.rst +++ b/docs/source/butterfree.rst @@ -7,6 +7,7 @@ Subpackages .. 
toctree:: :maxdepth: 4 + butterfree.automated butterfree.clients butterfree.configs butterfree.constants diff --git a/setup.py b/setup.py index 2fc26b469..6fa35751d 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.3.dev0" +__version__ = "1.2.4" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: From 887fbb2f262951f46d62a632755d3b71be8ee3de Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Wed, 29 May 2024 16:32:14 -0300 Subject: [PATCH 59/70] feat(mlop-2269): bump versions (#355) * fix: bump versions adjust tests * add checklist * chore: bump python * bump pyspark * chore: java version all steps modified --- .checklist.yaml | 30 ++++ .github/workflows/publish.yml | 13 ++ .github/workflows/staging.yml | 16 +- .github/workflows/test.yml | 16 +- .gitignore | 1 + Makefile | 6 +- butterfree/_cli/migrate.py | 12 +- butterfree/clients/cassandra_client.py | 4 +- butterfree/clients/spark_client.py | 6 +- butterfree/extract/source.py | 5 +- .../historical_feature_store_writer.py | 5 +- .../writers/online_feature_store_writer.py | 10 +- butterfree/load/writers/writer.py | 5 +- .../database_migration/database_migration.py | 5 +- .../database_migration/metastore_migration.py | 5 +- .../transform/aggregated_feature_set.py | 4 +- .../transformations/aggregated_transform.py | 6 +- .../transformations/custom_transform.py | 4 +- .../transform/transformations/h3_transform.py | 5 +- .../sql_expression_transform.py | 3 +- docs/requirements.txt | 3 +- examples/test_examples.py | 4 +- mypy.ini | 41 +++++- requirements.dev.txt | 10 +- requirements.lint.txt | 11 +- requirements.test.txt | 2 +- requirements.txt | 6 +- setup.cfg | 2 +- setup.py | 2 +- .../butterfree/extract/test_source.py | 13 +- tests/integration/butterfree/load/conftest.py | 2 +- .../integration/butterfree/load/test_sink.py | 7 +- .../butterfree/pipelines/conftest.py | 3 +- .../pipelines/test_feature_set_pipeline.py | 72 ++++++--- .../transform/test_aggregated_feature_set.py | 16 +- .../butterfree/transform/test_feature_set.py | 10 +- tests/mocks/entities/first/first_pipeline.py | 18 ++- .../entities/second/deeper/second_pipeline.py | 16 +- .../butterfree/clients/test_spark_client.py | 14 +- .../pre_processing/test_filter_transform.py | 3 +- .../pre_processing/test_pivot_transform.py | 36 ++++- .../extract/readers/test_file_reader.py | 10 +- .../butterfree/extract/readers/test_reader.py | 3 +- .../extract/readers/test_table_reader.py | 9 +- tests/unit/butterfree/extract/test_source.py | 6 +- tests/unit/butterfree/load/conftest.py | 6 +- .../load/processing/test_json_transform.py | 4 +- .../migrations/database_migration/conftest.py | 12 +- tests/unit/butterfree/pipelines/conftest.py | 13 +- .../pipelines/test_feature_set_pipeline.py | 38 ++++- .../unit/butterfree/reports/test_metadata.py | 139 +++++++----------- tests/unit/butterfree/transform/conftest.py | 8 +- .../transform/features/test_feature.py | 4 +- .../transform/test_aggregated_feature_set.py | 14 +- .../butterfree/transform/test_feature_set.py | 21 ++- .../transform/transformations/conftest.py | 2 +- .../test_aggregated_transform.py | 5 +- .../transformations/test_custom_transform.py | 12 +- .../transformations/test_h3_transform.py | 6 +- .../test_spark_function_transform.py | 4 +- .../test_sql_expression_transform.py | 10 +- 61 files changed, 547 insertions(+), 231 deletions(-) create mode 100644 .checklist.yaml diff --git 
a/.checklist.yaml b/.checklist.yaml new file mode 100644 index 000000000..f0c211714 --- /dev/null +++ b/.checklist.yaml @@ -0,0 +1,30 @@ +apiVersion: quintoandar.com.br/checklist/v2 +kind: ServiceChecklist +metadata: + name: butterfree +spec: + description: >- + A solution for Feature Stores. + + costCenter: C055 + department: engineering + lifecycle: production + docs: true + + ownership: + team: data_products_mlops + line: tech_platform + owner: otavio.cals@quintoandar.com.br + + libraries: + - name: butterfree + type: common-usage + path: https://quintoandar.github.io/python-package-server/ + description: A lib to build Feature Stores. + registries: + - github-packages + tier: T0 + + channels: + squad: 'mlops' + alerts: 'data-products-reports' diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index f981921e6..0957a958a 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -14,6 +14,19 @@ jobs: steps: - uses: actions/checkout@v2 + - uses: actions/setup-python@v5 + with: + python-version: '3.9' + + - uses: actions/setup-java@v4 + with: + java-version: '11' + distribution: microsoft + + - uses: vemonet/setup-spark@v1 + with: + spark-version: '3.5.1' + hadoop-version: '3' - name: Install dependencies run: make ci-install diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 77127820e..573049cac 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -8,11 +8,23 @@ jobs: Pipeline: if: github.ref == 'refs/heads/staging' - runs-on: ubuntu-22.04 - container: quintoandar/python-3-7-java + runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + - uses: actions/setup-python@v5 + with: + python-version: '3.9' + + - uses: actions/setup-java@v4 + with: + java-version: '11' + distribution: microsoft + + - uses: vemonet/setup-spark@v1 + with: + spark-version: '3.5.1' + hadoop-version: '3' - name: Install dependencies run: make ci-install diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d7c1c3acc..d588c8533 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -9,11 +9,23 @@ on: jobs: Pipeline: - runs-on: ubuntu-22.04 - container: quintoandar/python-3-7-java + runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 + - uses: actions/setup-python@v5 + with: + python-version: '3.9' + + - uses: actions/setup-java@v4 + with: + java-version: '11' + distribution: microsoft + + - uses: vemonet/setup-spark@v1 + with: + spark-version: '3.5.1' + hadoop-version: '3' - name: Install dependencies run: make ci-install diff --git a/.gitignore b/.gitignore index 62434612f..0c59b49ab 100644 --- a/.gitignore +++ b/.gitignore @@ -66,6 +66,7 @@ instance/ # PyBuilder target/ +pip/ # Jupyter Notebook .ipynb_checkpoints diff --git a/Makefile b/Makefile index 4109504f6..ba0d0ead4 100644 --- a/Makefile +++ b/Makefile @@ -76,7 +76,7 @@ style-check: @echo "Code Style" @echo "==========" @echo "" - @python -m black --check -t py36 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . && echo "\n\nSuccess" || (echo "\n\nFailure\n\nYou need to run \"make apply-style\" to apply style formatting to your code"; exit 1) + @python -m black --check -t py39 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . 
&& echo "\n\nSuccess" || (echo "\n\nFailure\n\nYou need to run \"make apply-style\" to apply style formatting to your code"; exit 1) .PHONY: quality-check ## run code quality checks with flake8 @@ -104,8 +104,8 @@ checks: style-check quality-check type-check .PHONY: apply-style ## fix stylistic errors with black apply-style: - @python -m black -t py36 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . - @python -m isort -rc --atomic butterfree/ tests/ + @python -m black -t py39 --exclude="build/|buck-out/|dist/|_build/|pip/|\.pip/|\.git/|\.hg/|\.mypy_cache/|\.tox/|\.venv/" . + @python -m isort --atomic butterfree/ tests/ .PHONY: clean ## clean unused artifacts diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index 277ecf3c6..ed62f1a24 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -46,13 +46,13 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: logger.error(f"Path: {path} not found!") return set() - logger.info(f"Importing modules...") + logger.info("Importing modules...") package = ".".join(path.strip("/").split("/")) imported = set( importlib.import_module(f".{name}", package=package) for name in modules ) - logger.info(f"Scanning modules...") + logger.info("Scanning modules...") content = { module: set( filter( @@ -93,7 +93,8 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: PATH = typer.Argument( - ..., help="Full or relative path to where feature set pipelines are being defined.", + ..., + help="Full or relative path to where feature set pipelines are being defined.", ) GENERATE_LOGS = typer.Option( @@ -113,7 +114,10 @@ class Migrate: pipelines: list of Feature Set Pipelines to use to migration. """ - def __init__(self, pipelines: Set[FeatureSetPipeline],) -> None: + def __init__( + self, + pipelines: Set[FeatureSetPipeline], + ) -> None: self.pipelines = pipelines def _send_logs_to_s3(self, file_local: bool, debug_mode: bool) -> None: diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index 4c6f96fe0..5a7231555 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -129,7 +129,9 @@ def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]: return response def _get_create_table_query( - self, columns: List[CassandraColumn], table: str, + self, + columns: List[CassandraColumn], + table: str, ) -> str: """Creates CQL statement to create a table.""" parsed_columns = [] diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index bfa31d2a3..e2b868caf 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -61,9 +61,9 @@ def read( if path and not isinstance(path, (str, list)): raise ValueError("path needs to be a string or a list of string") - df_reader: Union[ - DataStreamReader, DataFrameReader - ] = self.conn.readStream if stream else self.conn.read + df_reader: Union[DataStreamReader, DataFrameReader] = ( + self.conn.readStream if stream else self.conn.read + ) df_reader = df_reader.schema(schema) if schema else df_reader diff --git a/butterfree/extract/source.py b/butterfree/extract/source.py index 1209e9162..281ed15ad 100644 --- a/butterfree/extract/source.py +++ b/butterfree/extract/source.py @@ -58,7 +58,10 @@ class Source(HookableComponent): """ def __init__( - self, readers: List[Reader], query: str, eager_evaluation: bool = True, + self, + readers: List[Reader], + query: str, + eager_evaluation: 
bool = True, ) -> None: super().__init__() self.enable_pre_hooks = False diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index 0ea9b50c8..1a64afdf3 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -130,7 +130,10 @@ def __init__( self.check_schema_hook = check_schema_hook def write( - self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient, + self, + feature_set: FeatureSet, + dataframe: DataFrame, + spark_client: SparkClient, ) -> None: """Loads the data from a feature set into the Historical Feature Store. diff --git a/butterfree/load/writers/online_feature_store_writer.py b/butterfree/load/writers/online_feature_store_writer.py index 17dc8af4b..d0bcde948 100644 --- a/butterfree/load/writers/online_feature_store_writer.py +++ b/butterfree/load/writers/online_feature_store_writer.py @@ -116,7 +116,10 @@ def filter_latest(dataframe: DataFrame, id_columns: List[Any]) -> DataFrame: window = Window.partitionBy(*id_columns).orderBy(col(TIMESTAMP_COLUMN).desc()) return ( - dataframe.select(col("*"), row_number().over(window).alias("rn"),) + dataframe.select( + col("*"), + row_number().over(window).alias("rn"), + ) .filter(col("rn") == 1) .drop("rn") ) @@ -162,7 +165,10 @@ def _write_in_debug_mode( ) def write( - self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient, + self, + feature_set: FeatureSet, + dataframe: DataFrame, + spark_client: SparkClient, ) -> Union[StreamingQuery, None]: """Loads the latest data from a feature set into the Feature Store. diff --git a/butterfree/load/writers/writer.py b/butterfree/load/writers/writer.py index 5073f4726..1dae795c6 100644 --- a/butterfree/load/writers/writer.py +++ b/butterfree/load/writers/writer.py @@ -72,7 +72,10 @@ def _apply_transformations(self, df: DataFrame) -> DataFrame: @abstractmethod def write( - self, feature_set: FeatureSet, dataframe: DataFrame, spark_client: SparkClient, + self, + feature_set: FeatureSet, + dataframe: DataFrame, + spark_client: SparkClient, ) -> Any: """Loads the data from a feature set into the Feature Store. diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index aeec4a6e7..468c028ec 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -180,7 +180,8 @@ def create_query( @staticmethod def _get_diff( - fs_schema: List[Dict[str, Any]], db_schema: List[Dict[str, Any]], + fs_schema: List[Dict[str, Any]], + db_schema: List[Dict[str, Any]], ) -> Set[Diff]: """Gets schema difference between feature set and the table of a given db. 
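# Illustration only: the general shape of the schema diff computed between a
# feature set and an existing table (columns to add vs. columns whose type
# changed). Butterfree's actual _get_diff returns Diff objects with richer
# semantics; the {"column_name", "type"} layout below is an assumption made
# for this sketch.
from typing import Any, Dict, List, Tuple


def simple_schema_diff(
    fs_schema: List[Dict[str, Any]], db_schema: List[Dict[str, Any]]
) -> Tuple[List[str], List[str]]:
    db_types = {column["column_name"]: column["type"] for column in db_schema}
    add = [c["column_name"] for c in fs_schema if c["column_name"] not in db_types]
    alter_type = [
        c["column_name"]
        for c in fs_schema
        if c["column_name"] in db_types and db_types[c["column_name"]] != c["type"]
    ]
    return add, alter_type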
@@ -296,7 +297,7 @@ def apply_migration( logger.info(f"Applying this query: {q} ...") self._client.sql(q) - logger.info(f"Feature Set migration finished successfully.") + logger.info("Feature Set migration finished successfully.") # inform in drone console which feature set was migrated print(f"The {feature_set.name} feature set was migrated.") diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index daa0afd3d..8c6c211ae 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -30,7 +30,10 @@ class MetastoreMigration(DatabaseMigration): data is being loaded into an entity table, then users can drop columns manually. """ - def __init__(self, database: str = None,) -> None: + def __init__( + self, + database: str = None, + ) -> None: self._db_config = MetastoreConfig() self.database = database or environment.get_variable( "FEATURE_STORE_HISTORICAL_DATABASE" diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index 0bff33c65..c86a95c3d 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -412,7 +412,9 @@ def _aggregate( # repartition to have all rows for each group at the same partition # by doing that, we won't have to shuffle data on grouping by id dataframe = repartition_df( - dataframe, partition_by=groupby, num_processors=num_processors, + dataframe, + partition_by=groupby, + num_processors=num_processors, ) grouped_data = dataframe.groupby(*groupby) diff --git a/butterfree/transform/transformations/aggregated_transform.py b/butterfree/transform/transformations/aggregated_transform.py index 7304f34b6..a9581ef00 100644 --- a/butterfree/transform/transformations/aggregated_transform.py +++ b/butterfree/transform/transformations/aggregated_transform.py @@ -76,7 +76,11 @@ def aggregations(self) -> List[Tuple]: Function = namedtuple("Function", ["function", "data_type"]) return [ - Function(f.func(expression), f.data_type.spark,) for f in self.functions + Function( + f.func(expression), + f.data_type.spark, + ) + for f in self.functions ] def _get_output_name(self, function: object) -> str: diff --git a/butterfree/transform/transformations/custom_transform.py b/butterfree/transform/transformations/custom_transform.py index 9b5ae23b1..7860fdc20 100644 --- a/butterfree/transform/transformations/custom_transform.py +++ b/butterfree/transform/transformations/custom_transform.py @@ -89,6 +89,8 @@ def transform(self, dataframe: DataFrame) -> DataFrame: """ dataframe = self.transformer( - dataframe, self.parent, **self.transformer__kwargs, + dataframe, + self.parent, + **self.transformer__kwargs, ) return dataframe diff --git a/butterfree/transform/transformations/h3_transform.py b/butterfree/transform/transformations/h3_transform.py index 8ccd3bb38..7a98294ec 100644 --- a/butterfree/transform/transformations/h3_transform.py +++ b/butterfree/transform/transformations/h3_transform.py @@ -84,7 +84,10 @@ class H3HashTransform(TransformComponent): """ def __init__( - self, h3_resolutions: List[int], lat_column: str, lng_column: str, + self, + h3_resolutions: List[int], + lat_column: str, + lng_column: str, ): super().__init__() self.h3_resolutions = h3_resolutions diff --git a/butterfree/transform/transformations/sql_expression_transform.py 
b/butterfree/transform/transformations/sql_expression_transform.py index 0199c23ae..80cd41ea9 100644 --- a/butterfree/transform/transformations/sql_expression_transform.py +++ b/butterfree/transform/transformations/sql_expression_transform.py @@ -54,7 +54,8 @@ class SQLExpressionTransform(TransformComponent): """ def __init__( - self, expression: str, + self, + expression: str, ): super().__init__() self.expression = expression diff --git a/docs/requirements.txt b/docs/requirements.txt index a20ab18ff..7eaabf11a 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,5 +4,4 @@ sphinxemoji==0.1.6 typing-extensions==3.7.4.2 cmake==3.18.4 h3==3.7.0 -pyarrow==0.15.1 - +pyarrow==16.1.0 diff --git a/examples/test_examples.py b/examples/test_examples.py index b40b6e1a4..7180e080d 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -36,9 +36,9 @@ _, error = p.communicate() if p.returncode != 0: errors.append({"notebook": path, "error": error}) - print(f" >>> Error in execution!\n") + print(" >>> Error in execution!\n") else: - print(f" >>> Successful execution\n") + print(" >>> Successful execution\n") if errors: print(">>> Errors in the following notebooks:") diff --git a/mypy.ini b/mypy.ini index c67bd3a89..fc2931493 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,5 +1,5 @@ [mypy] -python_version = 3.7 +python_version = 3.9 ignore_missing_imports = True disallow_untyped_calls = False disallow_untyped_defs = True @@ -9,3 +9,42 @@ show_error_codes = True show_error_context = True disable_error_code = attr-defined, list-item, operator pretty = True + +[mypy-butterfree.pipelines.*] +ignore_errors = True + +[mypy-butterfree.load.*] +ignore_errors = True + +[mypy-butterfree.transform.*] +ignore_errors = True + +[mypy-butterfree.extract.*] +ignore_errors = True + +[mypy-butterfree.config.*] +ignore_errors = True + +[mypy-butterfree.clients.*] +ignore_errors = True + +[mypy-butterfree.configs.*] +ignore_errors = True + +[mypy-butterfree.dataframe_service.*] +ignore_errors = True + +[mypy-butterfree.validations.*] +ignore_errors = True + +[mypy-butterfree.migrations.*] +ignore_errors = True + +[mypy-butterfree.testing.*] +ignore_errors = True + +[mypy-butterfree.hooks.*] +ignore_errors = True + +[mypy-butterfree._cli.*] +ignore_errors = True diff --git a/requirements.dev.txt b/requirements.dev.txt index 4e164c83f..89025669c 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,11 +1,11 @@ -h3==3.7.4 +h3==3.7.7 jupyter==1.0.0 twine==3.1.1 -mypy==0.790 +mypy==1.10.0 sphinx==3.5.4 sphinxemoji==0.1.8 sphinx-rtd-theme==0.5.2 recommonmark==0.7.1 -pyarrow>=1.0.0 -setuptools -wheel +pyarrow==16.1.0 +setuptools==70.0.0 +wheel==0.43.0 diff --git a/requirements.lint.txt b/requirements.lint.txt index 7c51f4b37..66641a952 100644 --- a/requirements.lint.txt +++ b/requirements.lint.txt @@ -1,8 +1,7 @@ -black==19.10b0 -flake8==3.7.9 -flake8-isort==2.8.0 -isort<5 # temporary fix +black==21.12b0 +flake8==4.0.1 +flake8-isort==4.1.1 flake8-docstrings==1.5.0 flake8-bugbear==20.1.0 -flake8-bandit==3.0.0 - +flake8-bandit==2.1.2 +bandit==1.7.2 diff --git a/requirements.test.txt b/requirements.test.txt index b0c4032a8..651700b80 100644 --- a/requirements.test.txt +++ b/requirements.test.txt @@ -2,4 +2,4 @@ pytest==5.3.2 pytest-cov==2.8.1 pytest-xdist==1.31.0 pytest-mock==2.0.0 -pytest-spark==0.5.2 +pytest-spark==0.6.0 diff --git a/requirements.txt b/requirements.txt index d61d125bc..f3af42540 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ 
-cassandra-driver>=3.22.0,<4.0 +cassandra-driver==3.24.0 mdutils>=1.2.2,<2.0 pandas>=0.24,<2.0 parameters-validation>=1.1.5,<2.0 -pyspark==3.* -typer>=0.3,<0.4 +pyspark==3.5.1 +typer==0.3.2 typing-extensions>3.7.4,<5 boto3==1.17.* diff --git a/setup.cfg b/setup.cfg index c58c2df3e..849d35cf3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,13 +10,13 @@ per-file-ignores = setup.py:D,S101 [isort] +profile = black line_length = 88 known_first_party = butterfree default_section = THIRDPARTY multi_line_output = 3 indent = ' ' skip_glob = pip -use_parantheses = True include_trailing_comma = True [tool:pytest] diff --git a/setup.py b/setup.py index 6fa35751d..42ef57c85 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ author="QuintoAndar", install_requires=requirements, extras_require={"h3": ["h3>=3.7.4,<4"]}, - python_requires=">=3.7, <4", + python_requires=">=3.9, <4", entry_points={"console_scripts": ["butterfree=butterfree._cli.main:app"]}, include_package_data=True, ) diff --git a/tests/integration/butterfree/extract/test_source.py b/tests/integration/butterfree/extract/test_source.py index c465ebd05..3ab991ab2 100644 --- a/tests/integration/butterfree/extract/test_source.py +++ b/tests/integration/butterfree/extract/test_source.py @@ -1,11 +1,11 @@ from typing import List from pyspark.sql import DataFrame -from tests.integration import INPUT_PATH from butterfree.clients import SparkClient from butterfree.extract import Source from butterfree.extract.readers import FileReader, TableReader +from tests.integration import INPUT_PATH def create_temp_view(dataframe: DataFrame, name): @@ -13,10 +13,11 @@ def create_temp_view(dataframe: DataFrame, name): def create_db_and_table(spark, table_reader_id, table_reader_db, table_reader_table): - spark.sql(f"create database if not exists {table_reader_db}") + spark.sql(f"drop schema if exists {table_reader_db} cascade") + spark.sql(f"create database {table_reader_db}") spark.sql(f"use {table_reader_db}") spark.sql( - f"create table if not exists {table_reader_db}.{table_reader_table} " # noqa + f"create table {table_reader_db}.{table_reader_table} " # noqa f"as select * from {table_reader_id}" # noqa ) @@ -33,7 +34,10 @@ def compare_dataframes( class TestSource: def test_source( - self, target_df_source, target_df_table_reader, spark_session, + self, + target_df_source, + target_df_table_reader, + spark_session, ): # given spark_client = SparkClient() @@ -66,6 +70,7 @@ def test_source( query=f"select a.*, b.feature2 " # noqa f"from {table_reader_id} a " # noqa f"inner join {file_reader_id} b on a.id = b.id ", # noqa + eager_evaluation=False, ) result_df = source.construct(client=spark_client) diff --git a/tests/integration/butterfree/load/conftest.py b/tests/integration/butterfree/load/conftest.py index 418b6d2ac..60101f1ac 100644 --- a/tests/integration/butterfree/load/conftest.py +++ b/tests/integration/butterfree/load/conftest.py @@ -51,7 +51,7 @@ def feature_set(): ] ts_feature = TimestampFeature(from_column="timestamp") features = [ - Feature(name="feature", description="Description", dtype=DataType.FLOAT), + Feature(name="feature", description="Description", dtype=DataType.INTEGER), ] return FeatureSet( "test_sink_feature_set", diff --git a/tests/integration/butterfree/load/test_sink.py b/tests/integration/butterfree/load/test_sink.py index b5f97879b..f73f5f7ce 100644 --- a/tests/integration/butterfree/load/test_sink.py +++ b/tests/integration/butterfree/load/test_sink.py @@ -24,10 +24,13 @@ def test_sink(input_dataframe, feature_set): 
s3config.mode = "overwrite" s3config.format_ = "parquet" s3config.get_options = Mock( - return_value={"path": "test_folder/historical/entity/feature_set"} + return_value={ + "path": "test_folder/historical/entity/feature_set", + "mode": "overwrite", + } ) s3config.get_path_with_partitions = Mock( - return_value="test_folder/historical/entity/feature_set" + return_value="spark-warehouse/test.db/test_folder/historical/entity/feature_set" ) historical_writer = HistoricalFeatureStoreWriter( diff --git a/tests/integration/butterfree/pipelines/conftest.py b/tests/integration/butterfree/pipelines/conftest.py index 73da163e6..5f304972d 100644 --- a/tests/integration/butterfree/pipelines/conftest.py +++ b/tests/integration/butterfree/pipelines/conftest.py @@ -132,7 +132,8 @@ def fixed_windows_output_feature_set_date_dataframe(spark_context, spark_session @pytest.fixture() def feature_set_pipeline( - spark_context, spark_session, + spark_context, + spark_session, ): feature_set_pipeline = FeatureSetPipeline( diff --git a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py index d67e0a387..791253398 100644 --- a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py @@ -50,10 +50,11 @@ def create_temp_view(dataframe: DataFrame, name): def create_db_and_table(spark, table_reader_id, table_reader_db, table_reader_table): - spark.sql(f"create database if not exists {table_reader_db}") + spark.sql(f"drop schema {table_reader_db} cascade") + spark.sql(f"create database {table_reader_db}") spark.sql(f"use {table_reader_db}") spark.sql( - f"create table if not exists {table_reader_db}.{table_reader_table} " # noqa + f"create table {table_reader_db}.{table_reader_table} " # noqa f"as select * from {table_reader_id}" # noqa ) @@ -74,7 +75,10 @@ def create_ymd(dataframe): class TestFeatureSetPipeline: def test_feature_set_pipeline( - self, mocked_df, spark_session, fixed_windows_output_feature_set_dataframe, + self, + mocked_df, + spark_session, + fixed_windows_output_feature_set_dataframe, ): # arrange @@ -90,7 +94,7 @@ def test_feature_set_pipeline( table_reader_table=table_reader_table, ) - path = "test_folder/historical/entity/feature_set" + path = "spark-warehouse/test.db/test_folder/historical/entity/feature_set" dbconfig = MetastoreConfig() dbconfig.get_options = Mock( @@ -138,7 +142,9 @@ def test_feature_set_pipeline( description="unit test", dtype=DataType.FLOAT, transformation=CustomTransform( - transformer=divide, column1="feature1", column2="feature2", + transformer=divide, + column1="feature1", + column2="feature2", ), ), ], @@ -237,7 +243,12 @@ def test_pipeline_with_hooks(self, spark_session): test_pipeline = FeatureSetPipeline( source=Source( - readers=[TableReader(id="reader", table="test",).add_post_hook(hook1)], + readers=[ + TableReader( + id="reader", + table="test", + ).add_post_hook(hook1) + ], query="select * from reader", ).add_post_hook(hook1), feature_set=FeatureSet( @@ -263,7 +274,9 @@ def test_pipeline_with_hooks(self, spark_session): ) .add_pre_hook(hook1) .add_post_hook(hook1), - sink=Sink(writers=[historical_writer],).add_pre_hook(hook1), + sink=Sink( + writers=[historical_writer], + ).add_pre_hook(hook1), ) # act @@ -325,11 +338,13 @@ def test_pipeline_interval_run( db = environment.get_variable("FEATURE_STORE_HISTORICAL_DATABASE") path = "test_folder/historical/entity/feature_set" + read_path = 
"spark-warehouse/test.db/" + path spark_session.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic") - spark_session.sql(f"create database if not exists {db}") + spark_session.sql(f"drop schema {db} cascade") + spark_session.sql(f"create database {db}") spark_session.sql( - f"create table if not exists {db}.feature_set_interval " + f"create table {db}.feature_set_interval " f"(id int, timestamp timestamp, feature int, " f"run_id int, year int, month int, day int);" ) @@ -340,7 +355,7 @@ def test_pipeline_interval_run( ) historical_writer = HistoricalFeatureStoreWriter( - db_config=dbconfig, interval_mode=True + db_config=dbconfig, interval_mode=True, row_count_validation=False ) first_run_hook = RunHook(id=1) @@ -356,9 +371,10 @@ def test_pipeline_interval_run( test_pipeline = FeatureSetPipeline( source=Source( readers=[ - TableReader(id="id", table="input_data",).with_incremental_strategy( - IncrementalStrategy("ts") - ), + TableReader( + id="id", + table="input_data", + ).with_incremental_strategy(IncrementalStrategy("ts")), ], query="select * from id ", ), @@ -366,48 +382,56 @@ def test_pipeline_interval_run( name="feature_set_interval", entity="entity", description="", - keys=[KeyFeature(name="id", description="", dtype=DataType.INTEGER,)], + keys=[ + KeyFeature( + name="id", + description="", + dtype=DataType.INTEGER, + ) + ], timestamp=TimestampFeature(from_column="ts"), features=[ Feature(name="feature", description="", dtype=DataType.INTEGER), Feature(name="run_id", description="", dtype=DataType.INTEGER), ], ), - sink=Sink([historical_writer],), + sink=Sink( + [historical_writer], + ), ) # act and assert dbconfig.get_path_with_partitions = Mock( return_value=[ - "test_folder/historical/entity/feature_set/year=2016/month=4/day=11", - "test_folder/historical/entity/feature_set/year=2016/month=4/day=12", - "test_folder/historical/entity/feature_set/year=2016/month=4/day=13", + "spark-warehouse/test.db/test_folder/historical/entity/feature_set/year=2016/month=4/day=11", # noqa + "spark-warehouse/test.db/test_folder/historical/entity/feature_set/year=2016/month=4/day=12", # noqa + "spark-warehouse/test.db/test_folder/historical/entity/feature_set/year=2016/month=4/day=13", # noqa ] ) test_pipeline.feature_set.add_pre_hook(first_run_hook) test_pipeline.run(end_date="2016-04-13", start_date="2016-04-11") - first_run_output_df = spark_session.read.parquet(path) + first_run_output_df = spark_session.read.parquet(read_path) assert_dataframe_equality(first_run_output_df, first_run_target_df) dbconfig.get_path_with_partitions = Mock( return_value=[ - "test_folder/historical/entity/feature_set/year=2016/month=4/day=14", + "spark-warehouse/test.db/test_folder/historical/entity/feature_set/year=2016/month=4/day=14", # noqa ] ) test_pipeline.feature_set.add_pre_hook(second_run_hook) test_pipeline.run_for_date("2016-04-14") - second_run_output_df = spark_session.read.parquet(path) + second_run_output_df = spark_session.read.parquet(read_path) assert_dataframe_equality(second_run_output_df, second_run_target_df) dbconfig.get_path_with_partitions = Mock( return_value=[ - "test_folder/historical/entity/feature_set/year=2016/month=4/day=11", + "spark-warehouse/test.db/test_folder/historical/entity/feature_set/year=2016/month=4/day=11", # noqa ] ) test_pipeline.feature_set.add_pre_hook(third_run_hook) test_pipeline.run_for_date("2016-04-11") - third_run_output_df = spark_session.read.parquet(path) + third_run_output_df = spark_session.read.parquet(read_path) 
assert_dataframe_equality(third_run_output_df, third_run_target_df) # tear down - shutil.rmtree("test_folder") + shutil.rmtree("spark-warehouse/test.db/test_folder") diff --git a/tests/integration/butterfree/transform/test_aggregated_feature_set.py b/tests/integration/butterfree/transform/test_aggregated_feature_set.py index bc3ebb6c7..413077619 100644 --- a/tests/integration/butterfree/transform/test_aggregated_feature_set.py +++ b/tests/integration/butterfree/transform/test_aggregated_feature_set.py @@ -19,7 +19,9 @@ def divide(df, fs, column1, column2): class TestAggregatedFeatureSet: def test_construct_without_window( - self, feature_set_dataframe, target_df_without_window, + self, + feature_set_dataframe, + target_df_without_window, ): # given @@ -157,7 +159,9 @@ def test_construct_rolling_windows_without_end_date( ) ], timestamp=TimestampFeature(), - ).with_windows(definitions=["1 day", "1 week"],) + ).with_windows( + definitions=["1 day", "1 week"], + ) # act & assert with pytest.raises(ValueError): @@ -201,7 +205,9 @@ def test_h3_feature_set(self, h3_input_df, h3_target_df): assert_dataframe_equality(output_df, h3_target_df) def test_construct_with_pivot( - self, feature_set_df_pivot, target_df_pivot_agg, + self, + feature_set_df_pivot, + target_df_pivot_agg, ): # given @@ -243,7 +249,9 @@ def test_construct_with_pivot( assert_dataframe_equality(output_df, target_df_pivot_agg) def test_construct_rolling_windows_with_date_boundaries( - self, feature_set_dates_dataframe, rolling_windows_output_date_boundaries, + self, + feature_set_dates_dataframe, + rolling_windows_output_date_boundaries, ): # given diff --git a/tests/integration/butterfree/transform/test_feature_set.py b/tests/integration/butterfree/transform/test_feature_set.py index 25f70b6e2..6c5f7f1d8 100644 --- a/tests/integration/butterfree/transform/test_feature_set.py +++ b/tests/integration/butterfree/transform/test_feature_set.py @@ -51,7 +51,9 @@ def test_construct( description="unit test", dtype=DataType.FLOAT, transformation=CustomTransform( - transformer=divide, column1="feature1", column2="feature2", + transformer=divide, + column1="feature1", + column2="feature2", ), ), ], @@ -92,7 +94,11 @@ def test_construct_with_date_boundaries( entity="entity", description="description", features=[ - Feature(name="feature", description="test", dtype=DataType.FLOAT,), + Feature( + name="feature", + description="test", + dtype=DataType.FLOAT, + ), ], keys=[ KeyFeature( diff --git a/tests/mocks/entities/first/first_pipeline.py b/tests/mocks/entities/first/first_pipeline.py index 90cfba96f..938c880c7 100644 --- a/tests/mocks/entities/first/first_pipeline.py +++ b/tests/mocks/entities/first/first_pipeline.py @@ -15,7 +15,13 @@ class FirstPipeline(FeatureSetPipeline): def __init__(self): super(FirstPipeline, self).__init__( source=Source( - readers=[TableReader(id="t", database="db", table="table",)], + readers=[ + TableReader( + id="t", + database="db", + table="table", + ) + ], query=f"select * from t", # noqa ), feature_set=FeatureSet( @@ -23,7 +29,11 @@ def __init__(self): entity="entity", description="description", features=[ - Feature(name="feature1", description="test", dtype=DataType.FLOAT,), + Feature( + name="feature1", + description="test", + dtype=DataType.FLOAT, + ), Feature( name="feature2", description="another test", @@ -32,7 +42,9 @@ def __init__(self): ], keys=[ KeyFeature( - name="id", description="identifier", dtype=DataType.BIGINT, + name="id", + description="identifier", + dtype=DataType.BIGINT, ) ], 
timestamp=TimestampFeature(), diff --git a/tests/mocks/entities/second/deeper/second_pipeline.py b/tests/mocks/entities/second/deeper/second_pipeline.py index 12c53cf30..a59ba2e5d 100644 --- a/tests/mocks/entities/second/deeper/second_pipeline.py +++ b/tests/mocks/entities/second/deeper/second_pipeline.py @@ -15,7 +15,13 @@ class SecondPipeline(FeatureSetPipeline): def __init__(self): super(SecondPipeline, self).__init__( source=Source( - readers=[TableReader(id="t", database="db", table="table",)], + readers=[ + TableReader( + id="t", + database="db", + table="table", + ) + ], query=f"select * from t", # noqa ), feature_set=FeatureSet( @@ -24,7 +30,9 @@ def __init__(self): description="description", features=[ Feature( - name="feature1", description="test", dtype=DataType.STRING, + name="feature1", + description="test", + dtype=DataType.STRING, ), Feature( name="feature2", @@ -34,7 +42,9 @@ def __init__(self): ], keys=[ KeyFeature( - name="id", description="identifier", dtype=DataType.BIGINT, + name="id", + description="identifier", + dtype=DataType.BIGINT, ) ], timestamp=TimestampFeature(), diff --git a/tests/unit/butterfree/clients/test_spark_client.py b/tests/unit/butterfree/clients/test_spark_client.py index 12d8ac9d6..b2418a7c6 100644 --- a/tests/unit/butterfree/clients/test_spark_client.py +++ b/tests/unit/butterfree/clients/test_spark_client.py @@ -69,7 +69,8 @@ def test_read( assert target_df.collect() == result_df.collect() @pytest.mark.parametrize( - "format, path", [(None, "path/to/file"), ("csv", 123)], + "format, path", + [(None, "path/to/file"), ("csv", 123)], ) def test_read_invalid_params(self, format: Optional[str], path: Any) -> None: # arrange @@ -115,7 +116,8 @@ def test_read_table( assert target_df == result_df @pytest.mark.parametrize( - "database, table", [("database", None), ("database", 123)], + "database, table", + [("database", None), ("database", 123)], ) def test_read_table_invalid_params( self, database: str, table: Optional[int] @@ -128,7 +130,8 @@ def test_read_table_invalid_params( spark_client.read_table(table, database) # type: ignore @pytest.mark.parametrize( - "format, mode", [("parquet", "append"), ("csv", "overwrite")], + "format, mode", + [("parquet", "append"), ("csv", "overwrite")], ) def test_write_dataframe( self, format: str, mode: str, mocked_spark_write: Mock @@ -137,7 +140,8 @@ def test_write_dataframe( mocked_spark_write.save.assert_called_with(format=format, mode=mode) @pytest.mark.parametrize( - "format, mode", [(None, "append"), ("parquet", 1)], + "format, mode", + [(None, "append"), ("parquet", 1)], ) def test_write_dataframe_invalid_params( self, target_df: DataFrame, format: Optional[str], mode: Union[str, int] @@ -266,7 +270,7 @@ def test_create_temporary_view( def test_add_table_partitions(self, mock_spark_sql: Mock): # arrange target_command = ( - f"ALTER TABLE `db`.`table` ADD IF NOT EXISTS " + f"ALTER TABLE `db`.`table` ADD IF NOT EXISTS " # noqa f"PARTITION ( year = 2020, month = 8, day = 14 ) " f"PARTITION ( year = 2020, month = 8, day = 15 ) " f"PARTITION ( year = 2020, month = 8, day = 16 )" diff --git a/tests/unit/butterfree/extract/pre_processing/test_filter_transform.py b/tests/unit/butterfree/extract/pre_processing/test_filter_transform.py index 669fd0336..fed20f2d4 100644 --- a/tests/unit/butterfree/extract/pre_processing/test_filter_transform.py +++ b/tests/unit/butterfree/extract/pre_processing/test_filter_transform.py @@ -28,7 +28,8 @@ def test_filter(self, feature_set_dataframe, spark_context, spark_session): assert 
result_df.collect() == target_df.collect() @pytest.mark.parametrize( - "condition", [None, 100], + "condition", + [None, 100], ) def test_filter_with_invalidations( self, feature_set_dataframe, condition, spark_context, spark_session diff --git a/tests/unit/butterfree/extract/pre_processing/test_pivot_transform.py b/tests/unit/butterfree/extract/pre_processing/test_pivot_transform.py index e716f9d65..cfe730d3a 100644 --- a/tests/unit/butterfree/extract/pre_processing/test_pivot_transform.py +++ b/tests/unit/butterfree/extract/pre_processing/test_pivot_transform.py @@ -9,7 +9,9 @@ class TestPivotTransform: def test_pivot_transformation( - self, input_df, pivot_df, + self, + input_df, + pivot_df, ): result_df = pivot( dataframe=input_df, @@ -20,10 +22,15 @@ def test_pivot_transformation( ) # assert - assert compare_dataframes(actual_df=result_df, expected_df=pivot_df,) + assert compare_dataframes( + actual_df=result_df, + expected_df=pivot_df, + ) def test_pivot_transformation_with_forward_fill( - self, input_df, pivot_ffill_df, + self, + input_df, + pivot_ffill_df, ): result_df = pivot( dataframe=input_df, @@ -35,10 +42,15 @@ def test_pivot_transformation_with_forward_fill( ) # assert - assert compare_dataframes(actual_df=result_df, expected_df=pivot_ffill_df,) + assert compare_dataframes( + actual_df=result_df, + expected_df=pivot_ffill_df, + ) def test_pivot_transformation_with_forward_fill_and_mock( - self, input_df, pivot_ffill_mock_df, + self, + input_df, + pivot_ffill_mock_df, ): result_df = pivot( dataframe=input_df, @@ -52,10 +64,15 @@ def test_pivot_transformation_with_forward_fill_and_mock( ) # assert - assert compare_dataframes(actual_df=result_df, expected_df=pivot_ffill_mock_df,) + assert compare_dataframes( + actual_df=result_df, + expected_df=pivot_ffill_mock_df, + ) def test_pivot_transformation_mock_without_type( - self, input_df, pivot_ffill_mock_df, + self, + input_df, + pivot_ffill_mock_df, ): with pytest.raises(AttributeError): _ = pivot( @@ -83,4 +100,7 @@ def test_apply_pivot_transformation(self, input_df, pivot_df): result_df = file_reader._apply_transformations(input_df) # assert - assert compare_dataframes(actual_df=result_df, expected_df=pivot_df,) + assert compare_dataframes( + actual_df=result_df, + expected_df=pivot_df, + ) diff --git a/tests/unit/butterfree/extract/readers/test_file_reader.py b/tests/unit/butterfree/extract/readers/test_file_reader.py index 9e1c42bce..136c8fd62 100644 --- a/tests/unit/butterfree/extract/readers/test_file_reader.py +++ b/tests/unit/butterfree/extract/readers/test_file_reader.py @@ -7,7 +7,15 @@ class TestFileReader: @pytest.mark.parametrize( - "path, format", [(None, "parquet"), ("path/to/file.json", 123), (123, None,)], + "path, format", + [ + (None, "parquet"), + ("path/to/file.json", 123), + ( + 123, + None, + ), + ], ) def test_init_invalid_params(self, path, format): # act and assert diff --git a/tests/unit/butterfree/extract/readers/test_reader.py b/tests/unit/butterfree/extract/readers/test_reader.py index 78160553f..bcceacbd1 100644 --- a/tests/unit/butterfree/extract/readers/test_reader.py +++ b/tests/unit/butterfree/extract/readers/test_reader.py @@ -148,7 +148,8 @@ def test_build_with_columns( # act file_reader.build( - client=spark_client, columns=[("col1", "new_col1"), ("col2", "new_col2")], + client=spark_client, + columns=[("col1", "new_col1"), ("col2", "new_col2")], ) result_df = spark_session.sql("select * from test") diff --git a/tests/unit/butterfree/extract/readers/test_table_reader.py 
b/tests/unit/butterfree/extract/readers/test_table_reader.py index 65f3be236..1a2f56f23 100644 --- a/tests/unit/butterfree/extract/readers/test_table_reader.py +++ b/tests/unit/butterfree/extract/readers/test_table_reader.py @@ -5,7 +5,14 @@ class TestTableReader: @pytest.mark.parametrize( - "database, table", [("database", 123), (123, None,)], + "database, table", + [ + ("database", 123), + ( + 123, + None, + ), + ], ) def test_init_invalid_params(self, database, table): # act and assert diff --git a/tests/unit/butterfree/extract/test_source.py b/tests/unit/butterfree/extract/test_source.py index 53af8b658..842d2210f 100644 --- a/tests/unit/butterfree/extract/test_source.py +++ b/tests/unit/butterfree/extract/test_source.py @@ -14,7 +14,8 @@ def test_construct(self, mocker, target_df): # when source_selector = Source( - readers=[reader], query=f"select * from {reader_id}", # noqa + readers=[reader], + query=f"select * from {reader_id}", # noqa ) result_df = source_selector.construct(spark_client) @@ -32,7 +33,8 @@ def test_is_cached(self, mocker, target_df): # when source_selector = Source( - readers=[reader], query=f"select * from {reader_id}", # noqa + readers=[reader], + query=f"select * from {reader_id}", # noqa ) result_df = source_selector.construct(spark_client) diff --git a/tests/unit/butterfree/load/conftest.py b/tests/unit/butterfree/load/conftest.py index 4dcf25c94..d0bb2c3be 100644 --- a/tests/unit/butterfree/load/conftest.py +++ b/tests/unit/butterfree/load/conftest.py @@ -20,7 +20,11 @@ def feature_set(): ] ts_feature = TimestampFeature(from_column=TIMESTAMP_COLUMN) features = [ - Feature(name="feature", description="Description", dtype=DataType.BIGINT,) + Feature( + name="feature", + description="Description", + dtype=DataType.BIGINT, + ) ] return FeatureSet( "feature_set", diff --git a/tests/unit/butterfree/load/processing/test_json_transform.py b/tests/unit/butterfree/load/processing/test_json_transform.py index 73949eea7..78320d108 100644 --- a/tests/unit/butterfree/load/processing/test_json_transform.py +++ b/tests/unit/butterfree/load/processing/test_json_transform.py @@ -3,7 +3,9 @@ class TestJsonTransform: def test_json_transformation( - self, input_df, json_df, + self, + input_df, + json_df, ): result_df = json_transform(dataframe=input_df) diff --git a/tests/unit/butterfree/migrations/database_migration/conftest.py b/tests/unit/butterfree/migrations/database_migration/conftest.py index dcd96714f..237158b7b 100644 --- a/tests/unit/butterfree/migrations/database_migration/conftest.py +++ b/tests/unit/butterfree/migrations/database_migration/conftest.py @@ -45,10 +45,18 @@ def feature_set(): entity="entity", description="description", features=[ - Feature(name="feature_float", description="test", dtype=DataType.FLOAT,), + Feature( + name="feature_float", + description="test", + dtype=DataType.FLOAT, + ), ], keys=[ - KeyFeature(name="id", description="The device ID", dtype=DataType.BIGINT,) + KeyFeature( + name="id", + description="The device ID", + dtype=DataType.BIGINT, + ) ], timestamp=TimestampFeature(), ) diff --git a/tests/unit/butterfree/pipelines/conftest.py b/tests/unit/butterfree/pipelines/conftest.py index 47e65efb7..f17e5f41e 100644 --- a/tests/unit/butterfree/pipelines/conftest.py +++ b/tests/unit/butterfree/pipelines/conftest.py @@ -23,7 +23,13 @@ def feature_set_pipeline(): spark_client=SparkClient(), source=Mock( spec=Source, - readers=[TableReader(id="source_a", database="db", table="table",)], + readers=[ + TableReader( + id="source_a", + 
database="db", + table="table", + ) + ], query="select * from source_a", ), feature_set=Mock( @@ -57,7 +63,10 @@ def feature_set_pipeline(): ), ], ), - sink=Mock(spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)],), + sink=Mock( + spec=Sink, + writers=[HistoricalFeatureStoreWriter(db_config=None)], + ), ) return test_pipeline diff --git a/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py b/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py index 7bae6606b..5a67e77d4 100644 --- a/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/unit/butterfree/pipelines/test_feature_set_pipeline.py @@ -22,6 +22,20 @@ from butterfree.transform.utils import Function +def get_reader(): + table_reader = TableReader( + id="source_a", + database="db", + table="table", + ) + + return table_reader + + +def get_historical_writer(): + return HistoricalFeatureStoreWriter(db_config=None) + + class TestFeatureSetPipeline: def test_feature_set_args(self): # arrange and act @@ -38,8 +52,12 @@ def test_feature_set_args(self): pipeline = FeatureSetPipeline( source=Source( readers=[ - TableReader(id="source_a", database="db", table="table",), - FileReader(id="source_b", path="path", format="parquet",), + get_reader(), + FileReader( + id="source_b", + path="path", + format="parquet", + ), ], query="select a.*, b.specific_feature " "from source_a left join source_b on a.id=b.id", @@ -131,7 +149,7 @@ def test_source_raise(self): source=Mock( spark_client=SparkClient(), readers=[ - TableReader(id="source_a", database="db", table="table",), + get_reader(), ], query="select * from source_a", ), @@ -167,7 +185,8 @@ def test_source_raise(self): ], ), sink=Mock( - spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)], + spec=Sink, + writers=[get_historical_writer()], ), ) @@ -180,7 +199,7 @@ def test_feature_set_raise(self): source=Mock( spec=Source, readers=[ - TableReader(id="source_a", database="db", table="table",), + get_reader(), ], query="select * from source_a", ), @@ -215,7 +234,8 @@ def test_feature_set_raise(self): ], ), sink=Mock( - spec=Sink, writers=[HistoricalFeatureStoreWriter(db_config=None)], + spec=Sink, + writers=[get_historical_writer()], ), ) @@ -226,7 +246,7 @@ def test_sink_raise(self): source=Mock( spec=Source, readers=[ - TableReader(id="source_a", database="db", table="table",), + get_reader(), ], query="select * from source_a", ), @@ -250,7 +270,9 @@ def test_sink_raise(self): key_columns=["user_id"], timestamp_column="ts", ), - sink=Mock(writers=[HistoricalFeatureStoreWriter(db_config=None)],), + sink=Mock( + writers=[get_historical_writer()], + ), ) def test_run_agg_with_end_date(self, spark_session, feature_set_pipeline): diff --git a/tests/unit/butterfree/reports/test_metadata.py b/tests/unit/butterfree/reports/test_metadata.py index 6f26cc558..093721df1 100644 --- a/tests/unit/butterfree/reports/test_metadata.py +++ b/tests/unit/butterfree/reports/test_metadata.py @@ -16,49 +16,63 @@ from butterfree.transform.utils import Function +def get_pipeline(): + + return FeatureSetPipeline( + source=Source( + readers=[ + TableReader( + id="source_a", + database="db", + table="table", + ), + FileReader( + id="source_b", + path="path", + format="parquet", + ), + ], + query="select a.*, b.specific_feature " + "from source_a left join source_b on a.id=b.id", + ), + feature_set=FeatureSet( + name="feature_set", + entity="entity", + description="description", + keys=[ + KeyFeature( + name="user_id", + description="The user's Main ID or device 
ID", + dtype=DataType.INTEGER, + ) + ], + timestamp=TimestampFeature(from_column="ts"), + features=[ + Feature( + name="page_viewed__rent_per_month", + description="Average of something.", + transformation=SparkFunctionTransform( + functions=[ + Function(functions.avg, DataType.FLOAT), + Function(functions.stddev_pop, DataType.DOUBLE), + ], + ), + ), + ], + ), + sink=Sink( + writers=[ + HistoricalFeatureStoreWriter(db_config=None), + OnlineFeatureStoreWriter(db_config=None), + ], + ), + ) + + class TestMetadata: def test_json(self): - pipeline = FeatureSetPipeline( - source=Source( - readers=[ - TableReader(id="source_a", database="db", table="table",), - FileReader(id="source_b", path="path", format="parquet",), - ], - query="select a.*, b.specific_feature " - "from source_a left join source_b on a.id=b.id", - ), - feature_set=FeatureSet( - name="feature_set", - entity="entity", - description="description", - keys=[ - KeyFeature( - name="user_id", - description="The user's Main ID or device ID", - dtype=DataType.INTEGER, - ) - ], - timestamp=TimestampFeature(from_column="ts"), - features=[ - Feature( - name="page_viewed__rent_per_month", - description="Average of something.", - transformation=SparkFunctionTransform( - functions=[ - Function(functions.avg, DataType.FLOAT), - Function(functions.stddev_pop, DataType.DOUBLE), - ], - ), - ), - ], - ), - sink=Sink( - writers=[ - HistoricalFeatureStoreWriter(db_config=None), - OnlineFeatureStoreWriter(db_config=None), - ], - ), - ) + + pipeline = get_pipeline() target_json = [ { @@ -102,47 +116,8 @@ def test_json(self): assert json == target_json def test_markdown(self): - pipeline = FeatureSetPipeline( - source=Source( - readers=[ - TableReader(id="source_a", database="db", table="table",), - FileReader(id="source_b", path="path", format="parquet",), - ], - query="select a.*, b.specific_feature " - "from source_a left join source_b on a.id=b.id", - ), - feature_set=FeatureSet( - name="feature_set", - entity="entity", - description="description", - keys=[ - KeyFeature( - name="user_id", - description="The user's Main ID or device ID", - dtype=DataType.INTEGER, - ) - ], - timestamp=TimestampFeature(from_column="ts"), - features=[ - Feature( - name="page_viewed__rent_per_month", - description="Average of something.", - transformation=SparkFunctionTransform( - functions=[ - Function(functions.avg, DataType.FLOAT), - Function(functions.stddev_pop, DataType.DOUBLE), - ], - ), - ), - ], - ), - sink=Sink( - writers=[ - HistoricalFeatureStoreWriter(db_config=None), - OnlineFeatureStoreWriter(db_config=None), - ], - ), - ) + + pipeline = get_pipeline() target_md = ( "\n# Feature_set\n\n## Description\n\n\ndescription \n\n" diff --git a/tests/unit/butterfree/transform/conftest.py b/tests/unit/butterfree/transform/conftest.py index ab7606407..fcf601328 100644 --- a/tests/unit/butterfree/transform/conftest.py +++ b/tests/unit/butterfree/transform/conftest.py @@ -455,6 +455,12 @@ def agg_feature_set(): ), ), ], - keys=[KeyFeature(name="id", description="description", dtype=DataType.BIGINT,)], + keys=[ + KeyFeature( + name="id", + description="description", + dtype=DataType.BIGINT, + ) + ], timestamp=TimestampFeature(), ) diff --git a/tests/unit/butterfree/transform/features/test_feature.py b/tests/unit/butterfree/transform/features/test_feature.py index 14a89f2cf..01bb41e5a 100644 --- a/tests/unit/butterfree/transform/features/test_feature.py +++ b/tests/unit/butterfree/transform/features/test_feature.py @@ -98,7 +98,9 @@ def 
test_feature_transform_with_from_column_and_column_name_exists( def test_feature_transform_with_dtype(self, feature_set_dataframe): test_feature = Feature( - name="feature", description="unit test", dtype=DataType.TIMESTAMP, + name="feature", + description="unit test", + dtype=DataType.TIMESTAMP, ) df = test_feature.transform(feature_set_dataframe) diff --git a/tests/unit/butterfree/transform/test_aggregated_feature_set.py b/tests/unit/butterfree/transform/test_aggregated_feature_set.py index 73320cf57..38ec249aa 100644 --- a/tests/unit/butterfree/transform/test_aggregated_feature_set.py +++ b/tests/unit/butterfree/transform/test_aggregated_feature_set.py @@ -44,7 +44,10 @@ def test_feature_set_with_invalid_feature(self, key_id, timestamp_c, dataframe): ).construct(dataframe, spark_client) def test_agg_feature_set_with_window( - self, dataframe, rolling_windows_agg_dataframe, agg_feature_set, + self, + dataframe, + rolling_windows_agg_dataframe, + agg_feature_set, ): spark_client = SparkClient() @@ -61,7 +64,10 @@ def test_agg_feature_set_with_window( assert_dataframe_equality(output_df, rolling_windows_agg_dataframe) def test_agg_feature_set_with_smaller_slide( - self, dataframe, rolling_windows_hour_slide_agg_dataframe, agg_feature_set, + self, + dataframe, + rolling_windows_hour_slide_agg_dataframe, + agg_feature_set, ): spark_client = SparkClient() @@ -366,7 +372,9 @@ def test_define_start_date(self, agg_feature_set): assert start_date == "2020-07-27" def test_feature_set_start_date( - self, timestamp_c, feature_set_with_distinct_dataframe, + self, + timestamp_c, + feature_set_with_distinct_dataframe, ): fs = AggregatedFeatureSet( name="name", diff --git a/tests/unit/butterfree/transform/test_feature_set.py b/tests/unit/butterfree/transform/test_feature_set.py index 43d937bec..e907dc0a8 100644 --- a/tests/unit/butterfree/transform/test_feature_set.py +++ b/tests/unit/butterfree/transform/test_feature_set.py @@ -3,12 +3,6 @@ import pytest from pyspark.sql import functions as F from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType -from tests.unit.butterfree.transform.conftest import ( - feature_add, - feature_divide, - key_id, - timestamp_c, -) from butterfree.clients import SparkClient from butterfree.constants import DataType @@ -20,6 +14,12 @@ SQLExpressionTransform, ) from butterfree.transform.utils import Function +from tests.unit.butterfree.transform.conftest import ( + feature_add, + feature_divide, + key_id, + timestamp_c, +) class TestFeatureSet: @@ -70,7 +70,14 @@ class TestFeatureSet: None, [feature_add, feature_divide], ), - ("name", "entity", "description", [key_id], timestamp_c, [None],), + ( + "name", + "entity", + "description", + [key_id], + timestamp_c, + [None], + ), ], ) def test_cannot_instantiate( diff --git a/tests/unit/butterfree/transform/transformations/conftest.py b/tests/unit/butterfree/transform/transformations/conftest.py index 8f3c13bff..41bc63d5c 100644 --- a/tests/unit/butterfree/transform/transformations/conftest.py +++ b/tests/unit/butterfree/transform/transformations/conftest.py @@ -62,7 +62,7 @@ def target_df_spark(spark_context, spark_session): "timestamp": "2016-04-11 11:31:11", "feature1": 200, "feature2": 200, - "feature__cos": 0.4871876750070059, + "feature__cos": 0.48718767500700594, }, { "id": 1, diff --git a/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py b/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py index 6cdebf74d..f0ae2f854 100644 --- 
a/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py +++ b/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py @@ -44,7 +44,10 @@ def test_output_columns(self): assert all( [ a == b - for a, b in zip(df_columns, ["feature1__avg", "feature1__stddev_pop"],) + for a, b in zip( + df_columns, + ["feature1__avg", "feature1__stddev_pop"], + ) ] ) diff --git a/tests/unit/butterfree/transform/transformations/test_custom_transform.py b/tests/unit/butterfree/transform/transformations/test_custom_transform.py index 4198d9bda..d87cc7cb1 100644 --- a/tests/unit/butterfree/transform/transformations/test_custom_transform.py +++ b/tests/unit/butterfree/transform/transformations/test_custom_transform.py @@ -21,7 +21,9 @@ def test_feature_transform(self, feature_set_dataframe): description="unit test", dtype=DataType.BIGINT, transformation=CustomTransform( - transformer=divide, column1="feature1", column2="feature2", + transformer=divide, + column1="feature1", + column2="feature2", ), ) @@ -44,7 +46,9 @@ def test_output_columns(self, feature_set_dataframe): description="unit test", dtype=DataType.BIGINT, transformation=CustomTransform( - transformer=divide, column1="feature1", column2="feature2", + transformer=divide, + column1="feature1", + column2="feature2", ), ) @@ -59,7 +63,9 @@ def test_custom_transform_output(self, feature_set_dataframe): description="unit test", dtype=DataType.BIGINT, transformation=CustomTransform( - transformer=divide, column1="feature1", column2="feature2", + transformer=divide, + column1="feature1", + column2="feature2", ), ) diff --git a/tests/unit/butterfree/transform/transformations/test_h3_transform.py b/tests/unit/butterfree/transform/transformations/test_h3_transform.py index 4b3308ebe..d4ad6493e 100644 --- a/tests/unit/butterfree/transform/transformations/test_h3_transform.py +++ b/tests/unit/butterfree/transform/transformations/test_h3_transform.py @@ -64,9 +64,9 @@ def test_import_error(self): for m in modules: del sys.modules[m] with pytest.raises(ModuleNotFoundError, match="you must install"): - from butterfree.transform.transformations.h3_transform import ( # noqa - H3HashTransform, # noqa - ) # noqa + from butterfree.transform.transformations.h3_transform import ( # noqa; noqa + H3HashTransform, + ) def test_with_stack(self, h3_input_df, h3_with_stack_target_df): # arrange diff --git a/tests/unit/butterfree/transform/transformations/test_spark_function_transform.py b/tests/unit/butterfree/transform/transformations/test_spark_function_transform.py index fe8bca85c..cf88657a0 100644 --- a/tests/unit/butterfree/transform/transformations/test_spark_function_transform.py +++ b/tests/unit/butterfree/transform/transformations/test_spark_function_transform.py @@ -126,7 +126,9 @@ def test_feature_transform_output_row_windows( transformation=SparkFunctionTransform( functions=[Function(functions.avg, DataType.DOUBLE)], ).with_window( - partition_by="id", mode="row_windows", window_definition=["2 events"], + partition_by="id", + mode="row_windows", + window_definition=["2 events"], ), ) diff --git a/tests/unit/butterfree/transform/transformations/test_sql_expression_transform.py b/tests/unit/butterfree/transform/transformations/test_sql_expression_transform.py index 9cc2e687e..814f83012 100644 --- a/tests/unit/butterfree/transform/transformations/test_sql_expression_transform.py +++ b/tests/unit/butterfree/transform/transformations/test_sql_expression_transform.py @@ -43,7 +43,15 @@ def test_output_columns(self): df_columns = 
test_feature.get_output_columns() - assert all([a == b for a, b in zip(df_columns, ["feature1_over_feature2"],)]) + assert all( + [ + a == b + for a, b in zip( + df_columns, + ["feature1_over_feature2"], + ) + ] + ) def test_feature_transform_output(self, feature_set_dataframe): test_feature = Feature( From 5af8a05a841c02114e08d90455e9cd772a1980c3 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 3 Jun 2024 15:03:56 -0300 Subject: [PATCH 60/70] fix: sphinx version (#356) * fix: sphinx --- Makefile | 2 +- requirements.dev.txt | 6 +++--- setup.cfg | 6 ++++++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index ba0d0ead4..3164f5039 100644 --- a/Makefile +++ b/Makefile @@ -152,7 +152,7 @@ package: ## update Butterfree API docs update-docs: cd ./docs; rm -rf source/butterfree.* - cd ./docs; sphinx-apidoc -T -E -o source/ ../butterfree + cd ./docs; sphinx-apidoc -o source/ ../butterfree cd ./docs; make coverage .PHONY: docs diff --git a/requirements.dev.txt b/requirements.dev.txt index 89025669c..bf4b4b2b8 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -2,9 +2,9 @@ h3==3.7.7 jupyter==1.0.0 twine==3.1.1 mypy==1.10.0 -sphinx==3.5.4 -sphinxemoji==0.1.8 -sphinx-rtd-theme==0.5.2 +sphinx==6.2.1 +sphinxemoji==0.3.1 +sphinx-rtd-theme==1.3.0 recommonmark==0.7.1 pyarrow==16.1.0 setuptools==70.0.0 diff --git a/setup.cfg b/setup.cfg index 849d35cf3..8206c6ae3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,3 +41,9 @@ disallow_any_generics = True disallow_untyped_defs = True check_untyped_defs = True disallow_untyped_calls = True + +[build_sphinx] +all-files = 1 +source-dir = docs/source +build-dir = docs/build +warning-is-error = 0 From cbda73d974d7e2058de8823800f2df0fa8bf6160 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Fri, 7 Jun 2024 13:46:16 -0300 Subject: [PATCH 61/70] fix: publish and dev versions (#359) * fix: publish, versions, tests --- .github/workflows/publish.yml | 3 +- Makefile | 4 +- butterfree/_cli/main.py | 2 +- butterfree/_cli/migrate.py | 6 ++- butterfree/clients/__init__.py | 1 + butterfree/clients/abstract_client.py | 5 ++- butterfree/clients/cassandra_client.py | 5 ++- butterfree/clients/spark_client.py | 25 +++++++----- butterfree/configs/db/cassandra_config.py | 25 ++++++------ butterfree/configs/db/kafka_config.py | 17 ++++---- butterfree/configs/db/metastore_config.py | 8 ++-- butterfree/configs/environment.py | 5 ++- butterfree/constants/__init__.py | 1 + butterfree/constants/migrations.py | 1 + butterfree/dataframe_service/__init__.py | 1 + .../dataframe_service/incremental_strategy.py | 17 ++++++-- butterfree/dataframe_service/repartition.py | 13 ++++--- butterfree/extract/__init__.py | 1 + butterfree/extract/pre_processing/__init__.py | 1 + .../explode_json_column_transform.py | 1 + .../pre_processing/filter_transform.py | 1 + .../pre_processing/forward_fill_transform.py | 5 ++- .../extract/pre_processing/pivot_transform.py | 7 ++-- .../pre_processing/replace_transform.py | 1 + butterfree/extract/readers/__init__.py | 1 + butterfree/extract/readers/file_reader.py | 7 ++-- butterfree/extract/readers/kafka_reader.py | 7 ++-- butterfree/extract/readers/reader.py | 10 +++-- butterfree/extract/readers/table_reader.py | 4 +- butterfree/extract/source.py | 7 +++- butterfree/hooks/__init__.py | 1 + .../hooks/schema_compatibility/__init__.py | 1 + .../spark_table_schema_compatibility_hook.py | 6 ++- butterfree/load/processing/__init__.py | 1 + butterfree/load/processing/json_transform.py | 1 + butterfree/load/sink.py | 
1 + .../historical_feature_store_writer.py | 19 +++++---- .../writers/online_feature_store_writer.py | 20 ++++++---- butterfree/load/writers/writer.py | 16 +++++--- .../database_migration/database_migration.py | 14 ++++--- .../database_migration/metastore_migration.py | 4 +- butterfree/pipelines/__init__.py | 1 + butterfree/pipelines/feature_set_pipeline.py | 23 +++++------ butterfree/reports/__init__.py | 1 + butterfree/testing/dataframe/__init__.py | 5 ++- .../transform/aggregated_feature_set.py | 19 ++++----- butterfree/transform/feature_set.py | 9 +++-- butterfree/transform/features/feature.py | 9 +++-- butterfree/transform/features/key_feature.py | 6 ++- .../transform/features/timestamp_feature.py | 11 ++++-- .../transformations/aggregated_transform.py | 7 +++- .../transformations/custom_transform.py | 2 +- .../spark_function_transform.py | 11 ++++-- .../transformations/transform_component.py | 1 + .../user_defined_functions/mode.py | 1 + .../most_frequent_set.py | 1 + butterfree/transform/utils/__init__.py | 1 + butterfree/transform/utils/date_range.py | 6 +-- butterfree/transform/utils/function.py | 4 +- butterfree/transform/utils/window_spec.py | 5 ++- butterfree/validations/basic_validaton.py | 4 +- butterfree/validations/validation.py | 4 +- docs/source/conf.py | 1 + mypy.ini | 39 ------------------- requirements.lint.txt | 2 +- requirements.txt | 2 +- .../butterfree/pipelines/conftest.py | 5 ++- .../test_aggregated_transform.py | 2 +- 68 files changed, 262 insertions(+), 196 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 0957a958a..d33e4aa03 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -9,8 +9,7 @@ jobs: Pipeline: if: github.ref == 'refs/heads/master' - runs-on: ubuntu-22.04 - container: quintoandar/python-3-7-java + runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 diff --git a/Makefile b/Makefile index 3164f5039..bf9ccd647 100644 --- a/Makefile +++ b/Makefile @@ -9,8 +9,8 @@ VERSION := $(shell grep __version__ setup.py | head -1 | cut -d \" -f2 | cut -d .PHONY: environment ## create virtual environment for butterfree environment: - @pyenv install -s 3.7.13 - @pyenv virtualenv 3.7.13 butterfree + @pyenv install -s 3.9.19 + @pyenv virtualenv 3.9.19 butterfree @pyenv local butterfree @PYTHONPATH=. 
python -m pip install --upgrade pip diff --git a/butterfree/_cli/main.py b/butterfree/_cli/main.py index 636fdb25e..b8b12f14d 100644 --- a/butterfree/_cli/main.py +++ b/butterfree/_cli/main.py @@ -2,7 +2,7 @@ from butterfree._cli import migrate -app = typer.Typer() +app = typer.Typer(no_args_is_help=True) app.add_typer(migrate.app, name="migrate") if __name__ == "__main__": diff --git a/butterfree/_cli/migrate.py b/butterfree/_cli/migrate.py index ed62f1a24..f51615097 100644 --- a/butterfree/_cli/migrate.py +++ b/butterfree/_cli/migrate.py @@ -16,7 +16,9 @@ from butterfree.migrations.database_migration import ALLOWED_DATABASE from butterfree.pipelines import FeatureSetPipeline -app = typer.Typer(help="Apply the automatic migrations in a database.") +app = typer.Typer( + help="Apply the automatic migrations in a database.", no_args_is_help=True +) logger = __logger("migrate", True) @@ -89,7 +91,7 @@ def __fs_objects(path: str) -> Set[FeatureSetPipeline]: instances.add(value) logger.info("Creating instances...") - return set(value() for value in instances) + return set(value() for value in instances) # type: ignore PATH = typer.Argument( diff --git a/butterfree/clients/__init__.py b/butterfree/clients/__init__.py index 5f6f0ffa6..7e8d1a95b 100644 --- a/butterfree/clients/__init__.py +++ b/butterfree/clients/__init__.py @@ -1,4 +1,5 @@ """Holds connection clients.""" + from butterfree.clients.abstract_client import AbstractClient from butterfree.clients.cassandra_client import CassandraClient from butterfree.clients.spark_client import SparkClient diff --git a/butterfree/clients/abstract_client.py b/butterfree/clients/abstract_client.py index ce5d33b64..b9027bd88 100644 --- a/butterfree/clients/abstract_client.py +++ b/butterfree/clients/abstract_client.py @@ -1,6 +1,7 @@ """Abstract class for database clients.""" + from abc import ABC, abstractmethod -from typing import Any +from typing import Any, Optional class AbstractClient(ABC): @@ -25,7 +26,7 @@ def sql(self, query: str) -> Any: pass @abstractmethod - def get_schema(self, table: str, database: str = None) -> Any: + def get_schema(self, table: str, database: Optional[str] = None) -> Any: """Returns desired table schema. Attributes: diff --git a/butterfree/clients/cassandra_client.py b/butterfree/clients/cassandra_client.py index 5a7231555..714e82483 100644 --- a/butterfree/clients/cassandra_client.py +++ b/butterfree/clients/cassandra_client.py @@ -1,4 +1,5 @@ """CassandraClient entity.""" + from ssl import CERT_REQUIRED, PROTOCOL_TLSv1 from typing import Dict, List, Optional @@ -102,7 +103,9 @@ def sql(self, query: str) -> ResponseFuture: """ return self.conn.execute(query) - def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]: + def get_schema( + self, table: str, database: Optional[str] = None + ) -> List[Dict[str, str]]: """Returns desired table schema. Attributes: diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index e2b868caf..933c21651 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -69,7 +69,7 @@ def read( return df_reader.format(format).load(path=path, **options) # type: ignore - def read_table(self, table: str, database: str = None) -> DataFrame: + def read_table(self, table: str, database: Optional[str] = None) -> DataFrame: """Use the SparkSession.read interface to read a metastore table. 
Args: @@ -179,9 +179,9 @@ def write_table( database: Optional[str], table_name: str, path: str, - format_: str = None, - mode: str = None, - partition_by: List[str] = None, + format_: Optional[str] = None, + mode: Optional[str] = None, + partition_by: Optional[List[str]] = None, **options: Any, ) -> None: """Receive a spark DataFrame and write it as a table in metastore. @@ -231,7 +231,10 @@ def create_temporary_view(dataframe: DataFrame, name: str) -> Any: return dataframe.writeStream.format("memory").queryName(name).start() def add_table_partitions( - self, partitions: List[Dict[str, Any]], table: str, database: str = None + self, + partitions: List[Dict[str, Any]], + table: str, + database: Optional[str] = None, ) -> None: """Add partitions to an existing table. @@ -259,9 +262,11 @@ def add_table_partitions( key_values_expr = [ ", ".join( [ - "{} = {}".format(k, v) - if not isinstance(v, str) - else "{} = '{}'".format(k, v) + ( + "{} = {}".format(k, v) + if not isinstance(v, str) + else "{} = '{}'".format(k, v) + ) for k, v in partition.items() ] ) @@ -314,7 +319,9 @@ def _convert_schema(self, schema: DataFrame) -> List[Dict[str, str]]: return converted_schema - def get_schema(self, table: str, database: str = None) -> List[Dict[str, str]]: + def get_schema( + self, table: str, database: Optional[str] = None + ) -> List[Dict[str, str]]: """Returns desired table schema. Attributes: diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index a038cb177..d60bb6977 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -1,4 +1,5 @@ """Holds configurations to read and write with Spark to Cassandra DB.""" + from typing import Any, Dict, List, Optional from butterfree.configs import environment @@ -32,18 +33,18 @@ class CassandraConfig(AbstractWriteConfig): def __init__( self, - username: str = None, - password: str = None, - host: str = None, - keyspace: str = None, - mode: str = None, - format_: str = None, - stream_processing_time: str = None, - stream_output_mode: str = None, - stream_checkpoint_path: str = None, - read_consistency_level: str = None, - write_consistency_level: str = None, - local_dc: str = None, + username: Optional[str] = None, + password: Optional[str] = None, + host: Optional[str] = None, + keyspace: Optional[str] = None, + mode: Optional[str] = None, + format_: Optional[str] = None, + stream_processing_time: Optional[str] = None, + stream_output_mode: Optional[str] = None, + stream_checkpoint_path: Optional[str] = None, + read_consistency_level: Optional[str] = None, + write_consistency_level: Optional[str] = None, + local_dc: Optional[str] = None, ): self.username = username self.password = password diff --git a/butterfree/configs/db/kafka_config.py b/butterfree/configs/db/kafka_config.py index 79cad15b2..e0c14baf3 100644 --- a/butterfree/configs/db/kafka_config.py +++ b/butterfree/configs/db/kafka_config.py @@ -1,4 +1,5 @@ """Holds configurations to read and write with Spark to Kafka.""" + from typing import Any, Dict, List, Optional from butterfree.configs import environment @@ -25,13 +26,13 @@ class KafkaConfig(AbstractWriteConfig): def __init__( self, - kafka_topic: str = None, - kafka_connection_string: str = None, - mode: str = None, - format_: str = None, - stream_processing_time: str = None, - stream_output_mode: str = None, - stream_checkpoint_path: str = None, + kafka_topic: Optional[str] = None, + kafka_connection_string: Optional[str] = None, + mode: 
Optional[str] = None, + format_: Optional[str] = None, + stream_processing_time: Optional[str] = None, + stream_output_mode: Optional[str] = None, + stream_checkpoint_path: Optional[str] = None, ): self.kafka_topic = kafka_topic self.kafka_connection_string = kafka_connection_string @@ -147,4 +148,4 @@ def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: Kafka schema. """ - pass + return [{}] diff --git a/butterfree/configs/db/metastore_config.py b/butterfree/configs/db/metastore_config.py index ff7ed01df..323aded0c 100644 --- a/butterfree/configs/db/metastore_config.py +++ b/butterfree/configs/db/metastore_config.py @@ -25,10 +25,10 @@ class MetastoreConfig(AbstractWriteConfig): def __init__( self, - path: str = None, - mode: str = None, - format_: str = None, - file_system: str = None, + path: Optional[str] = None, + mode: Optional[str] = None, + format_: Optional[str] = None, + file_system: Optional[str] = None, ): self.path = path self.mode = mode diff --git a/butterfree/configs/environment.py b/butterfree/configs/environment.py index f56efc5d5..f6ba18a5b 100644 --- a/butterfree/configs/environment.py +++ b/butterfree/configs/environment.py @@ -1,4 +1,5 @@ """Holds functions for managing the running environment.""" + import os from typing import Optional @@ -34,7 +35,9 @@ def __init__(self, variable_name: str): ) -def get_variable(variable_name: str, default_value: str = None) -> Optional[str]: +def get_variable( + variable_name: str, default_value: Optional[str] = None +) -> Optional[str]: """Gets an environment variable. The variable comes from it's explicitly declared value in the running diff --git a/butterfree/constants/__init__.py b/butterfree/constants/__init__.py index ec70d41b5..aa0c76e65 100644 --- a/butterfree/constants/__init__.py +++ b/butterfree/constants/__init__.py @@ -1,4 +1,5 @@ """Holds constant attributes that are common for Butterfree.""" + from butterfree.constants.data_type import DataType __all__ = ["DataType"] diff --git a/butterfree/constants/migrations.py b/butterfree/constants/migrations.py index b1c0947db..f31d08418 100644 --- a/butterfree/constants/migrations.py +++ b/butterfree/constants/migrations.py @@ -1,4 +1,5 @@ """Migrations' Constants.""" + from butterfree.constants import columns PARTITION_BY = [ diff --git a/butterfree/dataframe_service/__init__.py b/butterfree/dataframe_service/__init__.py index c227dae24..5fd02d453 100644 --- a/butterfree/dataframe_service/__init__.py +++ b/butterfree/dataframe_service/__init__.py @@ -1,4 +1,5 @@ """Dataframe optimization components regarding Butterfree.""" + from butterfree.dataframe_service.incremental_strategy import IncrementalStrategy from butterfree.dataframe_service.partitioning import extract_partition_values from butterfree.dataframe_service.repartition import repartition_df, repartition_sort_df diff --git a/butterfree/dataframe_service/incremental_strategy.py b/butterfree/dataframe_service/incremental_strategy.py index 6554d3b77..957064f15 100644 --- a/butterfree/dataframe_service/incremental_strategy.py +++ b/butterfree/dataframe_service/incremental_strategy.py @@ -2,6 +2,8 @@ from __future__ import annotations +from typing import Optional + from pyspark.sql import DataFrame @@ -18,7 +20,7 @@ class IncrementalStrategy: filter can properly work with the defined upper and lower bounds. 
""" - def __init__(self, column: str = None): + def __init__(self, column: Optional[str] = None): self.column = column def from_milliseconds(self, column_name: str) -> IncrementalStrategy: @@ -32,7 +34,9 @@ def from_milliseconds(self, column_name: str) -> IncrementalStrategy: """ return IncrementalStrategy(column=f"from_unixtime({column_name}/ 1000.0)") - def from_string(self, column_name: str, mask: str = None) -> IncrementalStrategy: + def from_string( + self, column_name: str, mask: Optional[str] = None + ) -> IncrementalStrategy: """Create a column expression from ts column defined as a simple string. Args: @@ -66,7 +70,9 @@ def from_year_month_day_partitions( f"'-', string({day_column}))" ) - def get_expression(self, start_date: str = None, end_date: str = None) -> str: + def get_expression( + self, start_date: Optional[str] = None, end_date: Optional[str] = None + ) -> str: """Get the incremental filter expression using the defined dates. Both arguments can be set to defined a specific date interval, but it's @@ -95,7 +101,10 @@ def get_expression(self, start_date: str = None, end_date: str = None) -> str: return f"date({self.column}) <= date('{end_date}')" def filter_with_incremental_strategy( - self, dataframe: DataFrame, start_date: str = None, end_date: str = None + self, + dataframe: DataFrame, + start_date: Optional[str] = None, + end_date: Optional[str] = None, ) -> DataFrame: """Filters the dataframe according to the date boundaries. diff --git a/butterfree/dataframe_service/repartition.py b/butterfree/dataframe_service/repartition.py index 8635557f9..e84202ba7 100644 --- a/butterfree/dataframe_service/repartition.py +++ b/butterfree/dataframe_service/repartition.py @@ -1,5 +1,6 @@ """Module where there are repartition methods.""" -from typing import List + +from typing import List, Optional from pyspark.sql.dataframe import DataFrame @@ -10,7 +11,7 @@ def _num_partitions_definition( - num_processors: int = None, num_partitions: int = None + num_processors: Optional[int] = None, num_partitions: Optional[int] = None ) -> int: num_partitions = ( num_processors * PARTITION_PROCESSOR_RATIO @@ -24,8 +25,8 @@ def _num_partitions_definition( def repartition_df( dataframe: DataFrame, partition_by: List[str], - num_partitions: int = None, - num_processors: int = None, + num_partitions: Optional[int] = None, + num_processors: Optional[int] = None, ) -> DataFrame: """Partition the DataFrame. @@ -47,8 +48,8 @@ def repartition_sort_df( dataframe: DataFrame, partition_by: List[str], order_by: List[str], - num_processors: int = None, - num_partitions: int = None, + num_processors: Optional[int] = None, + num_partitions: Optional[int] = None, ) -> DataFrame: """Partition and Sort the DataFrame. 
diff --git a/butterfree/extract/__init__.py b/butterfree/extract/__init__.py index bb056255b..64c8ae4a1 100644 --- a/butterfree/extract/__init__.py +++ b/butterfree/extract/__init__.py @@ -1,4 +1,5 @@ """The Source Component of a Feature Set.""" + from butterfree.extract.source import Source __all__ = ["Source"] diff --git a/butterfree/extract/pre_processing/__init__.py b/butterfree/extract/pre_processing/__init__.py index 72b37c4db..e142de6d1 100644 --- a/butterfree/extract/pre_processing/__init__.py +++ b/butterfree/extract/pre_processing/__init__.py @@ -1,4 +1,5 @@ """Pre Processing Components regarding Readers.""" + from butterfree.extract.pre_processing.explode_json_column_transform import ( explode_json_column, ) diff --git a/butterfree/extract/pre_processing/explode_json_column_transform.py b/butterfree/extract/pre_processing/explode_json_column_transform.py index db79b5ce0..76c90f739 100644 --- a/butterfree/extract/pre_processing/explode_json_column_transform.py +++ b/butterfree/extract/pre_processing/explode_json_column_transform.py @@ -1,4 +1,5 @@ """Explode json column for dataframes.""" + from pyspark.sql.dataframe import DataFrame, StructType from pyspark.sql.functions import from_json, get_json_object diff --git a/butterfree/extract/pre_processing/filter_transform.py b/butterfree/extract/pre_processing/filter_transform.py index 78e5df78f..a7e4fff81 100644 --- a/butterfree/extract/pre_processing/filter_transform.py +++ b/butterfree/extract/pre_processing/filter_transform.py @@ -1,4 +1,5 @@ """Module where filter DataFrames coming from readers.""" + from pyspark.sql.dataframe import DataFrame diff --git a/butterfree/extract/pre_processing/forward_fill_transform.py b/butterfree/extract/pre_processing/forward_fill_transform.py index 96d9bcdda..2d3a232d6 100644 --- a/butterfree/extract/pre_processing/forward_fill_transform.py +++ b/butterfree/extract/pre_processing/forward_fill_transform.py @@ -1,6 +1,7 @@ """Forward Fill Transform for dataframes.""" + import sys -from typing import List, Union +from typing import List, Optional, Union from pyspark.sql import DataFrame, Window, functions @@ -10,7 +11,7 @@ def forward_fill( partition_by: Union[str, List[str]], order_by: Union[str, List[str]], fill_column: str, - filled_column: str = None, + filled_column: Optional[str] = None, ) -> DataFrame: """Applies a forward fill to a single column. diff --git a/butterfree/extract/pre_processing/pivot_transform.py b/butterfree/extract/pre_processing/pivot_transform.py index 078b47464..f255f4577 100644 --- a/butterfree/extract/pre_processing/pivot_transform.py +++ b/butterfree/extract/pre_processing/pivot_transform.py @@ -1,5 +1,6 @@ """Pivot Transform for dataframes.""" -from typing import Callable, List, Union + +from typing import Callable, List, Optional, Union from pyspark.sql import DataFrame, functions from pyspark.sql.types import DataType @@ -13,8 +14,8 @@ def pivot( pivot_column: str, agg_column: str, aggregation: Callable, - mock_value: Union[float, str] = None, - mock_type: Union[DataType, str] = None, + mock_value: Optional[Union[float, str]] = None, + mock_type: Optional[Union[DataType, str]] = None, with_forward_fill: bool = False, ) -> DataFrame: """Defines a pivot transformation. 
diff --git a/butterfree/extract/pre_processing/replace_transform.py b/butterfree/extract/pre_processing/replace_transform.py index a7dd1d67a..3127c6d9d 100644 --- a/butterfree/extract/pre_processing/replace_transform.py +++ b/butterfree/extract/pre_processing/replace_transform.py @@ -1,4 +1,5 @@ """Replace transformer for dataframes.""" + from itertools import chain from typing import Dict diff --git a/butterfree/extract/readers/__init__.py b/butterfree/extract/readers/__init__.py index 37da63a6c..8c7bd74e0 100644 --- a/butterfree/extract/readers/__init__.py +++ b/butterfree/extract/readers/__init__.py @@ -1,4 +1,5 @@ """The Reader Component of a Source.""" + from butterfree.extract.readers.file_reader import FileReader from butterfree.extract.readers.kafka_reader import KafkaReader from butterfree.extract.readers.table_reader import TableReader diff --git a/butterfree/extract/readers/file_reader.py b/butterfree/extract/readers/file_reader.py index 8cf155998..da046f083 100644 --- a/butterfree/extract/readers/file_reader.py +++ b/butterfree/extract/readers/file_reader.py @@ -1,5 +1,6 @@ """FileReader entity.""" -from typing import Any, Dict + +from typing import Any, Dict, Optional from pyspark.sql import DataFrame from pyspark.sql.types import StructType @@ -75,8 +76,8 @@ def __init__( id: str, path: str, format: str, - schema: StructType = None, - format_options: Dict[Any, Any] = None, + schema: Optional[StructType] = None, + format_options: Optional[Dict[Any, Any]] = None, stream: bool = False, ): super().__init__(id) diff --git a/butterfree/extract/readers/kafka_reader.py b/butterfree/extract/readers/kafka_reader.py index 1b8042bce..44731d207 100644 --- a/butterfree/extract/readers/kafka_reader.py +++ b/butterfree/extract/readers/kafka_reader.py @@ -1,5 +1,6 @@ """KafkaSource entity.""" -from typing import Any, Dict + +from typing import Any, Dict, Optional from pyspark.sql.dataframe import DataFrame, StructType from pyspark.sql.functions import col, struct @@ -107,8 +108,8 @@ def __init__( id: str, topic: str, value_schema: StructType, - connection_string: str = None, - topic_options: Dict[Any, Any] = None, + connection_string: Optional[str] = None, + topic_options: Optional[Dict[Any, Any]] = None, stream: bool = True, ): super().__init__(id) diff --git a/butterfree/extract/readers/reader.py b/butterfree/extract/readers/reader.py index 597c870ff..5053d82c4 100644 --- a/butterfree/extract/readers/reader.py +++ b/butterfree/extract/readers/reader.py @@ -21,7 +21,9 @@ class Reader(ABC, HookableComponent): """ - def __init__(self, id: str, incremental_strategy: IncrementalStrategy = None): + def __init__( + self, id: str, incremental_strategy: Optional[IncrementalStrategy] = None + ): super().__init__() self.id = id self.transformations: List[Dict[str, Any]] = [] @@ -82,9 +84,9 @@ def consume(self, client: SparkClient) -> DataFrame: def build( self, client: SparkClient, - columns: List[Any] = None, - start_date: str = None, - end_date: str = None, + columns: Optional[List[Any]] = None, + start_date: Optional[str] = None, + end_date: Optional[str] = None, ) -> None: """Register the data got from the reader in the Spark metastore. 
diff --git a/butterfree/extract/readers/table_reader.py b/butterfree/extract/readers/table_reader.py index 343f25f38..b5decfc1f 100644 --- a/butterfree/extract/readers/table_reader.py +++ b/butterfree/extract/readers/table_reader.py @@ -1,5 +1,7 @@ """TableSource entity.""" +from typing import Optional + from pyspark.sql import DataFrame from butterfree.clients import SparkClient @@ -44,7 +46,7 @@ class TableReader(Reader): __name__ = "Table Reader" - def __init__(self, id: str, table: str, database: str = None): + def __init__(self, id: str, table: str, database: Optional[str] = None): super().__init__(id) if not isinstance(table, str): raise ValueError( diff --git a/butterfree/extract/source.py b/butterfree/extract/source.py index 281ed15ad..bfc15271f 100644 --- a/butterfree/extract/source.py +++ b/butterfree/extract/source.py @@ -1,6 +1,6 @@ """Holds the SourceSelector class.""" -from typing import List +from typing import List, Optional from pyspark.sql import DataFrame @@ -70,7 +70,10 @@ def __init__( self.eager_evaluation = eager_evaluation def construct( - self, client: SparkClient, start_date: str = None, end_date: str = None + self, + client: SparkClient, + start_date: Optional[str] = None, + end_date: Optional[str] = None, ) -> DataFrame: """Construct an entry point dataframe for a feature set. diff --git a/butterfree/hooks/__init__.py b/butterfree/hooks/__init__.py index 90bedeb26..e4a32170c 100644 --- a/butterfree/hooks/__init__.py +++ b/butterfree/hooks/__init__.py @@ -1,4 +1,5 @@ """Holds Hooks definitions.""" + from butterfree.hooks.hook import Hook from butterfree.hooks.hookable_component import HookableComponent diff --git a/butterfree/hooks/schema_compatibility/__init__.py b/butterfree/hooks/schema_compatibility/__init__.py index edf748bf8..a00adef8d 100644 --- a/butterfree/hooks/schema_compatibility/__init__.py +++ b/butterfree/hooks/schema_compatibility/__init__.py @@ -1,4 +1,5 @@ """Holds Schema Compatibility Hooks definitions.""" + from butterfree.hooks.schema_compatibility.cassandra_table_schema_compatibility_hook import ( # noqa CassandraTableSchemaCompatibilityHook, ) diff --git a/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py b/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py index b08dd56aa..eea50c06d 100644 --- a/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py +++ b/butterfree/hooks/schema_compatibility/spark_table_schema_compatibility_hook.py @@ -1,5 +1,7 @@ """Spark table schema compatibility Hook definition.""" +from typing import Optional + from pyspark.sql import DataFrame from butterfree.clients import SparkClient @@ -18,7 +20,9 @@ class SparkTableSchemaCompatibilityHook(Hook): database: database name. """ - def __init__(self, spark_client: SparkClient, table: str, database: str = None): + def __init__( + self, spark_client: SparkClient, table: str, database: Optional[str] = None + ): self.spark_client = spark_client self.table_expression = (f"`{database}`." 
if database else "") + f"`{table}`" diff --git a/butterfree/load/processing/__init__.py b/butterfree/load/processing/__init__.py index e2ad51578..06c5cb450 100644 --- a/butterfree/load/processing/__init__.py +++ b/butterfree/load/processing/__init__.py @@ -1,4 +1,5 @@ """Pre Processing Components regarding Readers.""" + from butterfree.load.processing.json_transform import json_transform __all__ = ["json_transform"] diff --git a/butterfree/load/processing/json_transform.py b/butterfree/load/processing/json_transform.py index 19ddecae2..598064dba 100644 --- a/butterfree/load/processing/json_transform.py +++ b/butterfree/load/processing/json_transform.py @@ -1,4 +1,5 @@ """Json conversion for writers.""" + from pyspark.sql.dataframe import DataFrame from pyspark.sql.functions import struct, to_json diff --git a/butterfree/load/sink.py b/butterfree/load/sink.py index 7c0328d6f..59b001a53 100644 --- a/butterfree/load/sink.py +++ b/butterfree/load/sink.py @@ -1,4 +1,5 @@ """Holds the Sink class.""" + from typing import List, Optional from pyspark.sql.dataframe import DataFrame diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index 1a64afdf3..c01fee1d8 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -1,7 +1,7 @@ """Holds the Historical Feature Store writer class.""" import os -from typing import Any +from typing import Any, Optional from pyspark.sql.dataframe import DataFrame from pyspark.sql.functions import dayofmonth, month, year @@ -106,13 +106,13 @@ class HistoricalFeatureStoreWriter(Writer): def __init__( self, - db_config: AbstractWriteConfig = None, - database: str = None, - num_partitions: int = None, + db_config: Optional[AbstractWriteConfig] = None, + database: Optional[str] = None, + num_partitions: Optional[int] = None, validation_threshold: float = DEFAULT_VALIDATION_THRESHOLD, debug_mode: bool = False, interval_mode: bool = False, - check_schema_hook: Hook = None, + check_schema_hook: Optional[Hook] = None, row_count_validation: bool = True, ): super(HistoricalFeatureStoreWriter, self).__init__( @@ -152,7 +152,8 @@ def write( dataframe = self._apply_transformations(dataframe) if self.interval_mode: - partition_overwrite_mode = spark_client.conn.conf.get( + + partition_overwrite_mode = spark_client.conn.conf.get( # type: ignore "spark.sql.sources.partitionOverwriteMode" ).lower() @@ -249,7 +250,11 @@ def _create_partitions(self, dataframe: DataFrame) -> DataFrame: return repartition_df(dataframe, self.PARTITION_BY, self.num_partitions) def check_schema( - self, client: Any, dataframe: DataFrame, table_name: str, database: str = None + self, + client: Any, + dataframe: DataFrame, + table_name: str, + database: Optional[str] = None, ) -> DataFrame: """Instantiate the schema check hook to check schema between dataframe and database. 
diff --git a/butterfree/load/writers/online_feature_store_writer.py b/butterfree/load/writers/online_feature_store_writer.py index d0bcde948..bce5a3751 100644 --- a/butterfree/load/writers/online_feature_store_writer.py +++ b/butterfree/load/writers/online_feature_store_writer.py @@ -1,7 +1,7 @@ """Holds the Online Feature Store writer class.""" import os -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union from pyspark.sql import DataFrame, Window from pyspark.sql.functions import col, row_number @@ -80,12 +80,12 @@ class OnlineFeatureStoreWriter(Writer): def __init__( self, - db_config: AbstractWriteConfig = None, - database: str = None, - debug_mode: bool = False, - write_to_entity: bool = False, - interval_mode: bool = False, - check_schema_hook: Hook = None, + db_config: Optional[AbstractWriteConfig] = None, + database: Optional[str] = None, + debug_mode: Optional[bool] = False, + write_to_entity: Optional[bool] = False, + interval_mode: Optional[bool] = False, + check_schema_hook: Optional[Hook] = None, ): super(OnlineFeatureStoreWriter, self).__init__( db_config or CassandraConfig(), debug_mode, interval_mode, write_to_entity @@ -256,7 +256,11 @@ def get_db_schema(self, feature_set: FeatureSet) -> List[Dict[Any, Any]]: return db_schema def check_schema( - self, client: Any, dataframe: DataFrame, table_name: str, database: str = None + self, + client: Any, + dataframe: DataFrame, + table_name: str, + database: Optional[str] = None, ) -> DataFrame: """Instantiate the schema check hook to check schema between dataframe and database. diff --git a/butterfree/load/writers/writer.py b/butterfree/load/writers/writer.py index 1dae795c6..780b9ec2d 100644 --- a/butterfree/load/writers/writer.py +++ b/butterfree/load/writers/writer.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from functools import reduce -from typing import Any, Callable, Dict, List +from typing import Any, Callable, Dict, List, Optional from pyspark.sql.dataframe import DataFrame @@ -23,10 +23,10 @@ class Writer(ABC, HookableComponent): def __init__( self, db_config: AbstractWriteConfig, - debug_mode: bool = False, - interval_mode: bool = False, - write_to_entity: bool = False, - row_count_validation: bool = True, + debug_mode: Optional[bool] = False, + interval_mode: Optional[bool] = False, + write_to_entity: Optional[bool] = False, + row_count_validation: Optional[bool] = True, ) -> None: super().__init__() self.db_config = db_config @@ -90,7 +90,11 @@ def write( @abstractmethod def check_schema( - self, client: Any, dataframe: DataFrame, table_name: str, database: str = None + self, + client: Any, + dataframe: DataFrame, + table_name: str, + database: Optional[str] = None, ) -> DataFrame: """Instantiate the schema check hook to check schema between dataframe and database. 
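One nuance in the writer hunks above: `Optional[X]` is only strictly required when the default is `None`. Flags such as `debug_mode` and `interval_mode` default to `False`, so a plain `bool = False` annotation already type-checks; `Optional[bool]` additionally lets callers pass an explicit `None`, which then needs to be normalized. A small sketch of the difference (names are illustrative):

```python
from typing import Optional


def build_writer(debug_mode: bool = False) -> bool:
    # A False default needs no Optional.
    return debug_mode


def build_writer_nullable(debug_mode: Optional[bool] = False) -> bool:
    # Optional[bool] also admits an explicit None, normalized here to False.
    return bool(debug_mode)


print(build_writer(), build_writer_nullable(None))  # False False
```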
diff --git a/butterfree/migrations/database_migration/database_migration.py b/butterfree/migrations/database_migration/database_migration.py index 468c028ec..351a47243 100644 --- a/butterfree/migrations/database_migration/database_migration.py +++ b/butterfree/migrations/database_migration/database_migration.py @@ -1,8 +1,9 @@ """Migration entity.""" + from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum, auto -from typing import Any, Dict, List, Set +from typing import Any, Dict, List, Optional, Set from butterfree.clients import AbstractClient from butterfree.configs.logger import __logger @@ -106,7 +107,10 @@ def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str: pass def _get_queries( - self, schema_diff: Set[Diff], table_name: str, write_on_entity: bool = None + self, + schema_diff: Set[Diff], + table_name: str, + write_on_entity: Optional[bool] = None, ) -> Any: """Create the desired queries for migration. @@ -162,8 +166,8 @@ def create_query( self, fs_schema: List[Dict[str, Any]], table_name: str, - db_schema: List[Dict[str, Any]] = None, - write_on_entity: bool = None, + db_schema: Optional[List[Dict[str, Any]]] = None, + write_on_entity: Optional[bool] = None, ) -> Any: """Create a query regarding a data source. @@ -246,7 +250,7 @@ def _get_diff( return schema_diff def _get_schema( - self, table_name: str, database: str = None + self, table_name: str, database: Optional[str] = None ) -> List[Dict[str, Any]]: """Get a table schema in the respective database. diff --git a/butterfree/migrations/database_migration/metastore_migration.py b/butterfree/migrations/database_migration/metastore_migration.py index 8c6c211ae..07e2bd89f 100644 --- a/butterfree/migrations/database_migration/metastore_migration.py +++ b/butterfree/migrations/database_migration/metastore_migration.py @@ -1,6 +1,6 @@ """Metastore Migration entity.""" -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from butterfree.clients import SparkClient from butterfree.configs import environment @@ -32,7 +32,7 @@ class MetastoreMigration(DatabaseMigration): def __init__( self, - database: str = None, + database: Optional[str] = None, ) -> None: self._db_config = MetastoreConfig() self.database = database or environment.get_variable( diff --git a/butterfree/pipelines/__init__.py b/butterfree/pipelines/__init__.py index a868e48f2..8bbc5c39e 100644 --- a/butterfree/pipelines/__init__.py +++ b/butterfree/pipelines/__init__.py @@ -1,4 +1,5 @@ """ETL Pipelines.""" + from butterfree.pipelines.feature_set_pipeline import FeatureSetPipeline __all__ = ["FeatureSetPipeline"] diff --git a/butterfree/pipelines/feature_set_pipeline.py b/butterfree/pipelines/feature_set_pipeline.py index 8aec54ec2..8ba1a636c 100644 --- a/butterfree/pipelines/feature_set_pipeline.py +++ b/butterfree/pipelines/feature_set_pipeline.py @@ -1,5 +1,6 @@ """FeatureSetPipeline entity.""" -from typing import List + +from typing import List, Optional from butterfree.clients import SparkClient from butterfree.dataframe_service import repartition_sort_df @@ -135,7 +136,7 @@ def __init__( source: Source, feature_set: FeatureSet, sink: Sink, - spark_client: SparkClient = None, + spark_client: Optional[SparkClient] = None, ): self.source = source self.feature_set = feature_set @@ -190,11 +191,11 @@ def spark_client(self, spark_client: SparkClient) -> None: def run( self, - end_date: str = None, - partition_by: List[str] = None, - order_by: List[str] = None, - num_processors: int 
= None, - start_date: str = None, + end_date: Optional[str] = None, + partition_by: Optional[List[str]] = None, + order_by: Optional[List[str]] = None, + num_processors: Optional[int] = None, + start_date: Optional[str] = None, ) -> None: """Runs the defined feature set pipeline. @@ -243,10 +244,10 @@ def run( def run_for_date( self, - execution_date: str = None, - partition_by: List[str] = None, - order_by: List[str] = None, - num_processors: int = None, + execution_date: Optional[str] = None, + partition_by: Optional[List[str]] = None, + order_by: Optional[List[str]] = None, + num_processors: Optional[int] = None, ) -> None: """Runs the defined feature set pipeline for a specific date. diff --git a/butterfree/reports/__init__.py b/butterfree/reports/__init__.py index 4b57dafc2..d272943d9 100644 --- a/butterfree/reports/__init__.py +++ b/butterfree/reports/__init__.py @@ -1,4 +1,5 @@ """Reports module.""" + from butterfree.reports.metadata import Metadata __all__ = ["Metadata"] diff --git a/butterfree/testing/dataframe/__init__.py b/butterfree/testing/dataframe/__init__.py index 15481a54a..5b465bc64 100644 --- a/butterfree/testing/dataframe/__init__.py +++ b/butterfree/testing/dataframe/__init__.py @@ -1,6 +1,7 @@ """Methods to assert properties regarding Apache Spark Dataframes.""" + from json import dumps -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from pyspark import SparkContext from pyspark.sql import Column, DataFrame, SparkSession @@ -72,7 +73,7 @@ def create_df_from_collection( data: List[Dict[Any, Any]], spark_context: SparkContext, spark_session: SparkSession, - schema: StructType = None, + schema: Optional[StructType] = None, ) -> DataFrame: """Creates a dataframe from a list of dicts.""" return spark_session.read.json( diff --git a/butterfree/transform/aggregated_feature_set.py b/butterfree/transform/aggregated_feature_set.py index c86a95c3d..6706bf8cf 100644 --- a/butterfree/transform/aggregated_feature_set.py +++ b/butterfree/transform/aggregated_feature_set.py @@ -1,4 +1,5 @@ """AggregatedFeatureSet entity.""" + import itertools from datetime import datetime, timedelta from functools import reduce @@ -261,8 +262,8 @@ def _has_aggregated_transform_only(features: List[Feature]) -> bool: @staticmethod def _build_feature_column_name( feature_column: str, - pivot_value: Union[float, str] = None, - window: Window = None, + pivot_value: Optional[Union[float, str]] = None, + window: Optional[Window] = None, ) -> str: base_name = feature_column if pivot_value is not None: @@ -311,7 +312,7 @@ def with_distinct(self, subset: List, keep: str = "last") -> "AggregatedFeatureS return self def with_windows( - self, definitions: List[str], slide: str = None + self, definitions: List[str], slide: Optional[str] = None ) -> "AggregatedFeatureSet": """Create a list with windows defined.""" self._windows = [ @@ -367,7 +368,7 @@ def _dataframe_join( right: DataFrame, on: List[str], how: str, - num_processors: int = None, + num_processors: Optional[int] = None, ) -> DataFrame: # make both tables co-partitioned to improve join performance left = repartition_df(left, partition_by=on, num_processors=num_processors) @@ -379,7 +380,7 @@ def _aggregate( dataframe: DataFrame, features: List[Feature], window: Optional[Window] = None, - num_processors: int = None, + num_processors: Optional[int] = None, ) -> DataFrame: aggregations = [ c.function for f in features for c in f.transformation.aggregations @@ -512,7 +513,7 @@ def _get_biggest_window_in_days(definitions: 
List[str]) -> float: ) return max(windows_list) / (60 * 60 * 24) - def define_start_date(self, start_date: str = None) -> Optional[str]: + def define_start_date(self, start_date: Optional[str] = None) -> Optional[str]: """Get aggregated feature set start date. Args: @@ -539,9 +540,9 @@ def construct( self, dataframe: DataFrame, client: SparkClient, - end_date: str = None, - num_processors: int = None, - start_date: str = None, + end_date: Optional[str] = None, + num_processors: Optional[int] = None, + start_date: Optional[str] = None, ) -> DataFrame: """Use all the features to build the feature set dataframe. diff --git a/butterfree/transform/feature_set.py b/butterfree/transform/feature_set.py index 469a353a8..369eaf290 100644 --- a/butterfree/transform/feature_set.py +++ b/butterfree/transform/feature_set.py @@ -1,4 +1,5 @@ """FeatureSet entity.""" + import itertools from functools import reduce from typing import Any, Dict, List, Optional @@ -389,7 +390,7 @@ def _filter_duplicated_rows(self, df: DataFrame) -> DataFrame: return df.select([column for column in self.columns]) - def define_start_date(self, start_date: str = None) -> Optional[str]: + def define_start_date(self, start_date: Optional[str] = None) -> Optional[str]: """Get feature set start date. Args: @@ -404,9 +405,9 @@ def construct( self, dataframe: DataFrame, client: SparkClient, - end_date: str = None, - num_processors: int = None, - start_date: str = None, + end_date: Optional[str] = None, + num_processors: Optional[int] = None, + start_date: Optional[str] = None, ) -> DataFrame: """Use all the features to build the feature set dataframe. diff --git a/butterfree/transform/features/feature.py b/butterfree/transform/features/feature.py index 612fc4a2f..cfd8a2f61 100644 --- a/butterfree/transform/features/feature.py +++ b/butterfree/transform/features/feature.py @@ -1,6 +1,7 @@ """Feature entity.""" + import warnings -from typing import Any, List +from typing import Any, List, Optional from pyspark.sql import DataFrame from pyspark.sql.functions import col @@ -41,9 +42,9 @@ def __init__( self, name: str, description: str, - dtype: DataType = None, - from_column: str = None, - transformation: TransformComponent = None, + dtype: Optional[DataType] = None, + from_column: Optional[str] = None, + transformation: Optional[TransformComponent] = None, ) -> None: self.name = name self.description = description diff --git a/butterfree/transform/features/key_feature.py b/butterfree/transform/features/key_feature.py index a7ad350cb..74626d6fa 100644 --- a/butterfree/transform/features/key_feature.py +++ b/butterfree/transform/features/key_feature.py @@ -1,5 +1,7 @@ """KeyFeature entity.""" +from typing import Optional + from butterfree.constants.data_type import DataType from butterfree.transform.features.feature import Feature from butterfree.transform.transformations import TransformComponent @@ -31,8 +33,8 @@ def __init__( name: str, description: str, dtype: DataType, - from_column: str = None, - transformation: TransformComponent = None, + from_column: Optional[str] = None, + transformation: Optional[TransformComponent] = None, ) -> None: super(KeyFeature, self).__init__( name=name, diff --git a/butterfree/transform/features/timestamp_feature.py b/butterfree/transform/features/timestamp_feature.py index b131eaeee..aa30dfc4a 100644 --- a/butterfree/transform/features/timestamp_feature.py +++ b/butterfree/transform/features/timestamp_feature.py @@ -1,4 +1,7 @@ """TimestampFeature entity.""" + +from typing import Optional + from 
pyspark.sql import DataFrame from pyspark.sql.functions import to_timestamp @@ -38,10 +41,10 @@ class TimestampFeature(Feature): def __init__( self, - from_column: str = None, - transformation: TransformComponent = None, + from_column: Optional[str] = None, + transformation: Optional[TransformComponent] = None, from_ms: bool = False, - mask: str = None, + mask: Optional[str] = None, ) -> None: description = "Time tag for the state of all features." super(TimestampFeature, self).__init__( @@ -70,7 +73,7 @@ def transform(self, dataframe: DataFrame) -> DataFrame: ts_column = ts_column / 1000 dataframe = dataframe.withColumn( - column_name, to_timestamp(ts_column, self.mask) + column_name, to_timestamp(ts_column, self.mask) # type: ignore ) return super().transform(dataframe) diff --git a/butterfree/transform/transformations/aggregated_transform.py b/butterfree/transform/transformations/aggregated_transform.py index a9581ef00..406ca72a9 100644 --- a/butterfree/transform/transformations/aggregated_transform.py +++ b/butterfree/transform/transformations/aggregated_transform.py @@ -1,6 +1,7 @@ """Aggregated Transform entity.""" + from collections import namedtuple -from typing import List, Tuple +from typing import List, Optional, Tuple from pyspark.sql import DataFrame from pyspark.sql.functions import col, expr, when @@ -56,7 +57,9 @@ class AggregatedTransform(TransformComponent): NotImplementedError: ... """ - def __init__(self, functions: List[Function], filter_expression: str = None): + def __init__( + self, functions: List[Function], filter_expression: Optional[str] = None + ): super(AggregatedTransform, self).__init__() self.functions = functions self.filter_expression = filter_expression diff --git a/butterfree/transform/transformations/custom_transform.py b/butterfree/transform/transformations/custom_transform.py index 7860fdc20..a12310127 100644 --- a/butterfree/transform/transformations/custom_transform.py +++ b/butterfree/transform/transformations/custom_transform.py @@ -69,7 +69,7 @@ def transformer(self) -> Callable[..., Any]: @transformer.setter def transformer(self, method: Callable[..., Any]) -> None: - if not method: + if method is None: raise ValueError("A method must be provided to CustomTransform") self._transformer = method diff --git a/butterfree/transform/transformations/spark_function_transform.py b/butterfree/transform/transformations/spark_function_transform.py index 8fb24dd79..34384518d 100644 --- a/butterfree/transform/transformations/spark_function_transform.py +++ b/butterfree/transform/transformations/spark_function_transform.py @@ -1,5 +1,6 @@ """Spark Function Transform entity.""" -from typing import Any, List + +from typing import Any, List, Optional from pyspark.sql import DataFrame @@ -87,8 +88,8 @@ def with_window( self, partition_by: str, window_definition: List[str], - order_by: str = None, - mode: str = None, + order_by: Optional[str] = None, + mode: Optional[str] = None, ) -> "SparkFunctionTransform": """Create a list with windows defined.""" if mode is not None: @@ -103,7 +104,9 @@ def with_window( ] return self - def _get_output_name(self, function: object, window: Window = None) -> str: + def _get_output_name( + self, function: object, window: Optional[Window] = None + ) -> str: base_name = ( "__".join([self._parent.name, function.__name__]) if hasattr(function, "__name__") diff --git a/butterfree/transform/transformations/transform_component.py b/butterfree/transform/transformations/transform_component.py index 7ecec332a..94bc19f8c 100644 --- 
a/butterfree/transform/transformations/transform_component.py +++ b/butterfree/transform/transformations/transform_component.py @@ -1,4 +1,5 @@ """Transform Abstract Class.""" + from abc import ABC, abstractmethod from typing import Any, List diff --git a/butterfree/transform/transformations/user_defined_functions/mode.py b/butterfree/transform/transformations/user_defined_functions/mode.py index 65790b939..5b6c7f17d 100644 --- a/butterfree/transform/transformations/user_defined_functions/mode.py +++ b/butterfree/transform/transformations/user_defined_functions/mode.py @@ -1,4 +1,5 @@ """Method to compute mode aggregation.""" + import pandas as pd from pyspark.sql.functions import pandas_udf from pyspark.sql.types import StringType diff --git a/butterfree/transform/transformations/user_defined_functions/most_frequent_set.py b/butterfree/transform/transformations/user_defined_functions/most_frequent_set.py index 20ccd3ba3..6dd6779f1 100644 --- a/butterfree/transform/transformations/user_defined_functions/most_frequent_set.py +++ b/butterfree/transform/transformations/user_defined_functions/most_frequent_set.py @@ -1,4 +1,5 @@ """Method to compute most frequent set aggregation.""" + from typing import Any import pandas as pd diff --git a/butterfree/transform/utils/__init__.py b/butterfree/transform/utils/__init__.py index abf7ed3fb..66004a374 100644 --- a/butterfree/transform/utils/__init__.py +++ b/butterfree/transform/utils/__init__.py @@ -1,4 +1,5 @@ """This module holds utils to be used by transformations.""" + from butterfree.transform.utils.function import Function from butterfree.transform.utils.window_spec import Window diff --git a/butterfree/transform/utils/date_range.py b/butterfree/transform/utils/date_range.py index 78e0e6e3c..4bdd29772 100644 --- a/butterfree/transform/utils/date_range.py +++ b/butterfree/transform/utils/date_range.py @@ -1,7 +1,7 @@ """Utils for date range generation.""" from datetime import datetime -from typing import Union +from typing import Optional, Union from pyspark.sql import DataFrame, functions @@ -14,7 +14,7 @@ def get_date_range( client: SparkClient, start_date: Union[str, datetime], end_date: Union[str, datetime], - step: int = None, + step: Optional[int] = None, ) -> DataFrame: """Create a date range dataframe. 
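For context, `get_date_range` above returns a single-column dataframe of timestamps between the two dates, stepping by `step` seconds (daily when omitted, judging by the surrounding code). A hedged usage sketch, assuming a local Spark session with butterfree installed:

```python
from butterfree.clients import SparkClient
from butterfree.transform.utils.date_range import get_date_range

client = SparkClient()

# One timestamp per day between the two dates, in a "timestamp" column.
date_df = get_date_range(client, "2024-01-01", "2024-01-08")
date_df.show(truncate=False)
```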
@@ -44,7 +44,7 @@ def get_date_range( for c in ("start_date", "end_date") ] ) - start_date, end_date = date_df.first() + start_date, end_date = date_df.first() # type: ignore return client.conn.range( start_date, end_date + day_in_seconds, step # type: ignore ).select(functions.col("id").cast(DataType.TIMESTAMP.spark).alias(TIMESTAMP_COLUMN)) diff --git a/butterfree/transform/utils/function.py b/butterfree/transform/utils/function.py index fcf6679fb..951a232ca 100644 --- a/butterfree/transform/utils/function.py +++ b/butterfree/transform/utils/function.py @@ -32,9 +32,9 @@ def func(self) -> Callable: @func.setter def func(self, value: Callable) -> None: """Definitions to be used in the transformation.""" - if not value: + if value is None: raise ValueError("Function must not be empty.") - if not callable(value): + if callable(value) is False: raise TypeError("Function must be callable.") self._func = value diff --git a/butterfree/transform/utils/window_spec.py b/butterfree/transform/utils/window_spec.py index 53ecd2fd3..b95dd73a6 100644 --- a/butterfree/transform/utils/window_spec.py +++ b/butterfree/transform/utils/window_spec.py @@ -1,4 +1,5 @@ """Holds function for defining window in DataFrames.""" + from typing import Any, List, Optional, Union from pyspark import sql @@ -69,8 +70,8 @@ def __init__( window_definition: str, partition_by: Optional[Union[Column, str, List[str]]] = None, order_by: Optional[Union[Column, str]] = None, - mode: str = None, - slide: str = None, + mode: Optional[str] = None, + slide: Optional[str] = None, ): self.partition_by = partition_by self.order_by = order_by or TIMESTAMP_COLUMN diff --git a/butterfree/validations/basic_validaton.py b/butterfree/validations/basic_validaton.py index d3a5558c7..01bc9ec21 100644 --- a/butterfree/validations/basic_validaton.py +++ b/butterfree/validations/basic_validaton.py @@ -1,5 +1,7 @@ """Validation implementing basic checks over the dataframe.""" +from typing import Optional + from pyspark.sql.dataframe import DataFrame from butterfree.constants.columns import TIMESTAMP_COLUMN @@ -14,7 +16,7 @@ class BasicValidation(Validation): """ - def __init__(self, dataframe: DataFrame = None): + def __init__(self, dataframe: Optional[DataFrame] = None): super().__init__(dataframe) def check(self) -> None: diff --git a/butterfree/validations/validation.py b/butterfree/validations/validation.py index 9915906cd..551859d82 100644 --- a/butterfree/validations/validation.py +++ b/butterfree/validations/validation.py @@ -1,5 +1,7 @@ """Abstract Validation class.""" + from abc import ABC, abstractmethod +from typing import Optional from pyspark.sql.dataframe import DataFrame @@ -12,7 +14,7 @@ class Validation(ABC): """ - def __init__(self, dataframe: DataFrame = None): + def __init__(self, dataframe: Optional[DataFrame] = None): self.dataframe = dataframe def input(self, dataframe: DataFrame) -> "Validation": diff --git a/docs/source/conf.py b/docs/source/conf.py index 77fdc1256..0a5377392 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,5 @@ """Sphinx Configuration.""" + # -*- coding: utf-8 -*- # # Configuration file for the Sphinx documentation builder. 
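The switch from `if not value:` to `if value is None:` in the setters above is more than style: a truthiness test also rejects legitimate falsy inputs, while the identity check rejects only the missing case. A small self-contained illustration (the class is hypothetical):

```python
class ZeroAggregation:
    """A perfectly valid callable whose instances happen to be falsy."""

    def __bool__(self) -> bool:
        return False

    def __call__(self, values: list) -> int:
        return 0


agg = ZeroAggregation()

print(not agg)        # True  -> `if not agg:` would wrongly raise
print(agg is None)    # False -> `if agg is None:` lets it through
print(callable(agg))  # True  -> it still passes the callable check
```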
diff --git a/mypy.ini b/mypy.ini index fc2931493..eb867a477 100644 --- a/mypy.ini +++ b/mypy.ini @@ -9,42 +9,3 @@ show_error_codes = True show_error_context = True disable_error_code = attr-defined, list-item, operator pretty = True - -[mypy-butterfree.pipelines.*] -ignore_errors = True - -[mypy-butterfree.load.*] -ignore_errors = True - -[mypy-butterfree.transform.*] -ignore_errors = True - -[mypy-butterfree.extract.*] -ignore_errors = True - -[mypy-butterfree.config.*] -ignore_errors = True - -[mypy-butterfree.clients.*] -ignore_errors = True - -[mypy-butterfree.configs.*] -ignore_errors = True - -[mypy-butterfree.dataframe_service.*] -ignore_errors = True - -[mypy-butterfree.validations.*] -ignore_errors = True - -[mypy-butterfree.migrations.*] -ignore_errors = True - -[mypy-butterfree.testing.*] -ignore_errors = True - -[mypy-butterfree.hooks.*] -ignore_errors = True - -[mypy-butterfree._cli.*] -ignore_errors = True diff --git a/requirements.lint.txt b/requirements.lint.txt index 66641a952..1ad6499d1 100644 --- a/requirements.lint.txt +++ b/requirements.lint.txt @@ -1,4 +1,4 @@ -black==21.12b0 +black==24.3.0 flake8==4.0.1 flake8-isort==4.1.1 flake8-docstrings==1.5.0 diff --git a/requirements.txt b/requirements.txt index f3af42540..0af8a62ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,6 @@ mdutils>=1.2.2,<2.0 pandas>=0.24,<2.0 parameters-validation>=1.1.5,<2.0 pyspark==3.5.1 -typer==0.3.2 +typer==0.4.2 typing-extensions>3.7.4,<5 boto3==1.17.* diff --git a/tests/integration/butterfree/pipelines/conftest.py b/tests/integration/butterfree/pipelines/conftest.py index 5f304972d..1466a8d98 100644 --- a/tests/integration/butterfree/pipelines/conftest.py +++ b/tests/integration/butterfree/pipelines/conftest.py @@ -139,7 +139,10 @@ def feature_set_pipeline( feature_set_pipeline = FeatureSetPipeline( source=Source( readers=[ - TableReader(id="b_source", table="b_table",).with_incremental_strategy( + TableReader( + id="b_source", + table="b_table", + ).with_incremental_strategy( incremental_strategy=IncrementalStrategy(column="timestamp") ), ], diff --git a/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py b/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py index f0ae2f854..96ff682a8 100644 --- a/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py +++ b/tests/unit/butterfree/transform/transformations/test_aggregated_transform.py @@ -67,7 +67,7 @@ def test_blank_aggregation(self, feature_set_dataframe): name="feature1", description="unit test", transformation=AggregatedTransform( - functions=[Function(func="", data_type="")] + functions=[Function(func=None, data_type="")] ), ) From 2a5a6e8be2fa5493da2666261c4af9a304bbdef7 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Fri, 14 Jun 2024 10:52:08 -0300 Subject: [PATCH 62/70] feat(MLOP-2236): add NTZ (#360) * feat: NTZ and new tests --- butterfree/constants/data_type.py | 2 + .../transform/features/timestamp_feature.py | 3 +- .../pipelines/test_feature_set_pipeline.py | 2 +- .../features/test_timestamp_feature.py | 80 ++++++++++++++++++- 4 files changed, 84 insertions(+), 3 deletions(-) diff --git a/butterfree/constants/data_type.py b/butterfree/constants/data_type.py index e99525f7d..6166f1fc6 100644 --- a/butterfree/constants/data_type.py +++ b/butterfree/constants/data_type.py @@ -12,6 +12,7 @@ IntegerType, LongType, StringType, + TimestampNTZType, TimestampType, ) from typing_extensions import final @@ -21,6 +22,7 @@ class DataType(Enum): 
"""Holds constants for data types within Butterfree.""" + TIMESTAMP_NTZ = (TimestampNTZType(), "timestamp", "TIMESTAMP_NTZ") TIMESTAMP = (TimestampType(), "timestamp", "TIMESTAMP") BINARY = (BinaryType(), "boolean", "BINARY") BOOLEAN = (BooleanType(), "boolean", "BOOLEAN") diff --git a/butterfree/transform/features/timestamp_feature.py b/butterfree/transform/features/timestamp_feature.py index aa30dfc4a..b4aee71e5 100644 --- a/butterfree/transform/features/timestamp_feature.py +++ b/butterfree/transform/features/timestamp_feature.py @@ -41,6 +41,7 @@ class TimestampFeature(Feature): def __init__( self, + dtype: Optional[DataType] = DataType.TIMESTAMP, from_column: Optional[str] = None, transformation: Optional[TransformComponent] = None, from_ms: bool = False, @@ -51,7 +52,7 @@ def __init__( name=TIMESTAMP_COLUMN, description=description, from_column=from_column, - dtype=DataType.TIMESTAMP, + dtype=dtype, transformation=transformation, ) self.from_ms = from_ms diff --git a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py index 791253398..16eb08e2f 100644 --- a/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py +++ b/tests/integration/butterfree/pipelines/test_feature_set_pipeline.py @@ -50,7 +50,7 @@ def create_temp_view(dataframe: DataFrame, name): def create_db_and_table(spark, table_reader_id, table_reader_db, table_reader_table): - spark.sql(f"drop schema {table_reader_db} cascade") + spark.sql(f"drop schema if exists {table_reader_db} cascade") spark.sql(f"create database {table_reader_db}") spark.sql(f"use {table_reader_db}") spark.sql( diff --git a/tests/unit/butterfree/transform/features/test_timestamp_feature.py b/tests/unit/butterfree/transform/features/test_timestamp_feature.py index a5a688c2a..42ab40a2c 100644 --- a/tests/unit/butterfree/transform/features/test_timestamp_feature.py +++ b/tests/unit/butterfree/transform/features/test_timestamp_feature.py @@ -1,18 +1,26 @@ -from pyspark.sql.types import StringType +from datetime import datetime +import pytz +from pyspark.sql.types import StringType, StructField, StructType + +from butterfree.clients import SparkClient from butterfree.constants import DataType from butterfree.constants.columns import TIMESTAMP_COLUMN from butterfree.transform.features import TimestampFeature +# from pyspark.sql.types import * + class TestTimestampFeature: def test_args_without_transformation(self): test_key = TimestampFeature(from_column="ts") + test_key_ntz = TimestampFeature(dtype=DataType.TIMESTAMP_NTZ, from_column="ts") assert test_key.name == TIMESTAMP_COLUMN assert test_key.from_column == "ts" assert test_key.dtype == DataType.TIMESTAMP + assert test_key_ntz.dtype == DataType.TIMESTAMP_NTZ def test_transform(self, feature_set_dataframe): @@ -70,3 +78,73 @@ def test_transform_mask(self, feature_set_dataframe_date): assert df[0]["timestamp"] == "2020-02-07 00:00:00" assert df[1]["timestamp"] == "2020-02-08 00:00:00" + + def test_timezone_configs(self): + + spark = SparkClient() + now = datetime.now() + + # Testing a new timezone + spark.conn.conf.set("spark.sql.session.timeZone", "GMT-5") + + time_list = [(now, now)] + rdd = spark.conn.sparkContext.parallelize(time_list) + + schema = StructType( + [ + StructField("ts", DataType.TIMESTAMP.spark, True), + StructField("ts_ntz", DataType.TIMESTAMP_NTZ.spark, True), + ] + ) + df = spark.conn.createDataFrame(rdd, schema) + df.createOrReplaceTempView("temp_tz_table") + + df1 = 
spark.conn.sql("""SELECT ts, ts_ntz FROM temp_tz_table""") + df2 = df1.withColumns( + {"ts": df1.ts.cast(StringType()), "ts_ntz": df1.ts_ntz.cast(StringType())} + ) + df2_vals = df2.collect()[0] + + assert df2_vals.ts != df2_vals.ts_ntz + + # New TZ. Column with TZ must have a != value; Column NTZ must keep its value + spark.conn.conf.set("spark.sql.session.timeZone", "GMT-7") + + df3 = spark.conn.sql("""SELECT ts, ts_ntz FROM temp_tz_table""") + df4 = df3.withColumns( + {"ts": df1.ts.cast(StringType()), "ts_ntz": df1.ts_ntz.cast(StringType())} + ) + df4_vals = df4.collect()[0] + + assert df4_vals.ts != df2_vals.ts + assert df4_vals.ts_ntz == df2_vals.ts_ntz + + def test_timezone(self): + + spark = SparkClient() + + my_date = datetime.now(pytz.timezone("US/Pacific")) + + datetime_mask = "%Y-%m-%d %H:%M" + + data = [ + {"id": 1, TIMESTAMP_COLUMN: str(my_date), "feature": 100}, + {"id": 2, TIMESTAMP_COLUMN: str(my_date), "feature": 200}, + ] + + df = spark.conn.read.json(spark.conn._sc.parallelize(data, 1)) + df.createOrReplaceTempView("time_table") + + df2 = spark.sql("SELECT TIMESTAMP AS ts FROM time_table") + + time_value = datetime.fromisoformat(df2.collect()[0].ts).strftime(datetime_mask) + + df_different_timezone = df2.withColumn( + "ts", df2.ts.cast(DataType.TIMESTAMP.spark) + ) + df_no_timezone = df2.withColumn("ts", df2.ts.cast(DataType.TIMESTAMP_NTZ.spark)) + + assert ( + df_different_timezone.collect()[0].ts.strftime(datetime_mask) != time_value + ) + assert df_no_timezone.collect()[0].ts.strftime(datetime_mask) == time_value From 6363e03a2d4e8ce4c21102ba1814515fe672029d Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Thu, 20 Jun 2024 10:46:15 -0300 Subject: [PATCH 63/70] fix: cassandra configs (#364) * fix: to lower case * pin numpy --- butterfree/configs/db/cassandra_config.py | 30 ++++++++++++----------- requirements.txt | 1 + 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index d60bb6977..d576359cb 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -228,26 +228,28 @@ def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ cassandra_mapping = { - "TimestampType": "timestamp", - "BinaryType": "boolean", - "BooleanType": "boolean", - "DateType": "timestamp", - "DecimalType": "decimal", - "DoubleType": "double", - "FloatType": "float", - "IntegerType": "int", - "LongType": "bigint", - "StringType": "text", - "ArrayType(LongType,true)": "frozen>", - "ArrayType(StringType,true)": "frozen>", - "ArrayType(FloatType,true)": "frozen>", + "timestamptype": "timestamp", + "binarytype": "boolean", + "booleantype": "boolean", + "datetype": "timestamp", + "decimaltype": "decimal", + "doubletype": "double", + "floattype": "float", + "integertype": "int", + "longtype": "bigint", + "stringtype": "text", + "arraytype(longtype,true)": "frozen>", + "arraytype(stringtype,true)": "frozen>", + "arraytype(floattype,true)": "frozen>", } cassandra_schema = [] for features in schema: cassandra_schema.append( { "column_name": features["column_name"], - "type": cassandra_mapping[str(features["type"]).replace("()", "")], + "type": cassandra_mapping[ + str(features["type"]).replace("()", "").lower() + ], "primary_key": features["primary_key"], } ) diff --git a/requirements.txt b/requirements.txt index 0af8a62ae..f3968c60e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ pyspark==3.5.1 typer==0.4.2 
typing-extensions>3.7.4,<5 boto3==1.17.* +numpy==1.26.4 From 81c2c178444620b1f1576390ffa13a2be242068a Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Fri, 28 Jun 2024 10:33:10 -0300 Subject: [PATCH 64/70] fix: Cassandra config keys (#366) * fix: keys --- butterfree/configs/db/cassandra_config.py | 6 +++--- .../migrations/database_migration/conftest.py | 14 +++++++++++++- .../database_migration/test_cassandra_migration.py | 2 ++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index d576359cb..6d7f9a20a 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -238,9 +238,9 @@ def translate(self, schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: "integertype": "int", "longtype": "bigint", "stringtype": "text", - "arraytype(longtype,true)": "frozen>", - "arraytype(stringtype,true)": "frozen>", - "arraytype(floattype,true)": "frozen>", + "arraytype(longtype, true)": "frozen>", + "arraytype(stringtype, true)": "frozen>", + "arraytype(floattype, true)": "frozen>", } cassandra_schema = [] for features in schema: diff --git a/tests/unit/butterfree/migrations/database_migration/conftest.py b/tests/unit/butterfree/migrations/database_migration/conftest.py index 237158b7b..3d3662d82 100644 --- a/tests/unit/butterfree/migrations/database_migration/conftest.py +++ b/tests/unit/butterfree/migrations/database_migration/conftest.py @@ -1,4 +1,11 @@ -from pyspark.sql.types import DoubleType, FloatType, LongType, TimestampType +from pyspark.sql.types import ( + ArrayType, + DoubleType, + FloatType, + LongType, + StringType, + TimestampType, +) from pytest import fixture from butterfree.constants import DataType @@ -30,6 +37,11 @@ def fs_schema(): {"column_name": "id", "type": LongType(), "primary_key": True}, {"column_name": "timestamp", "type": TimestampType(), "primary_key": True}, {"column_name": "new_feature", "type": FloatType(), "primary_key": False}, + { + "column_name": "array_feature", + "type": ArrayType(StringType(), True), + "primary_key": False, + }, { "column_name": "feature1__avg_over_1_week_rolling_windows", "type": FloatType(), diff --git a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py index 5666cc47f..5e89b65bf 100644 --- a/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py +++ b/tests/unit/butterfree/migrations/database_migration/test_cassandra_migration.py @@ -33,9 +33,11 @@ def test_create_table_query(self, fs_schema): expected_query = [ "CREATE TABLE test.table_name " "(id LongType, timestamp TimestampType, new_feature FloatType, " + "array_feature ArrayType(StringType(), True), " "feature1__avg_over_1_week_rolling_windows FloatType, " "PRIMARY KEY (id, timestamp));" ] + query = cassandra_migration.create_query(fs_schema, "table_name") assert query, expected_query From b1949cd3fa4da68595442d28328819d2196e9ddc Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Fri, 28 Jun 2024 16:44:07 -0300 Subject: [PATCH 65/70] fix: new type (#368) --- butterfree/configs/db/cassandra_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/butterfree/configs/db/cassandra_config.py b/butterfree/configs/db/cassandra_config.py index 6d7f9a20a..919fee8e8 100644 --- a/butterfree/configs/db/cassandra_config.py +++ b/butterfree/configs/db/cassandra_config.py @@ -229,6 +229,7 @@ def translate(self, 
schema: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ cassandra_mapping = { "timestamptype": "timestamp", + "timestampntztype": "timestamp", "binarytype": "boolean", "booleantype": "boolean", "datetype": "timestamp", From 12d5e982474ad36fd2acfa1594a1535b894fc451 Mon Sep 17 00:00:00 2001 From: Fernando Barbosa Date: Fri, 16 Aug 2024 12:00:45 -0300 Subject: [PATCH 66/70] Delete .checklist.yaml (#371) --- .checklist.yaml | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 .checklist.yaml diff --git a/.checklist.yaml b/.checklist.yaml deleted file mode 100644 index f0c211714..000000000 --- a/.checklist.yaml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: quintoandar.com.br/checklist/v2 -kind: ServiceChecklist -metadata: - name: butterfree -spec: - description: >- - A solution for Feature Stores. - - costCenter: C055 - department: engineering - lifecycle: production - docs: true - - ownership: - team: data_products_mlops - line: tech_platform - owner: otavio.cals@quintoandar.com.br - - libraries: - - name: butterfree - type: common-usage - path: https://quintoandar.github.io/python-package-server/ - description: A lib to build Feature Stores. - registries: - - github-packages - tier: T0 - - channels: - squad: 'mlops' - alerts: 'data-products-reports' From 35dd929a9fb64a08a76edfd6f652970d371e0d29 Mon Sep 17 00:00:00 2001 From: Ralph Rassweiler Date: Mon, 19 Aug 2024 10:57:23 -0300 Subject: [PATCH 67/70] Add Delta support (#370) * feat: delta --- .github/workflows/publish.yml | 4 +- .github/workflows/skip_lint.yml | 17 -- .github/workflows/staging.yml | 3 +- .github/workflows/test.yml | 2 +- CHANGELOG.md | 22 +++ Makefile | 3 +- butterfree/clients/spark_client.py | 1 + butterfree/load/writers/__init__.py | 3 +- butterfree/load/writers/delta_writer.py | 162 ++++++++++++++++++ .../historical_feature_store_writer.py | 27 ++- butterfree/load/writers/writer.py | 2 + docs/source/butterfree.clients.rst | 6 + docs/source/butterfree.configs.db.rst | 8 + docs/source/butterfree.configs.rst | 29 ++++ docs/source/butterfree.constants.rst | 65 +++++++ .../butterfree.extract.pre_processing.rst | 10 ++ docs/source/butterfree.extract.readers.rst | 8 + docs/source/butterfree.extract.rst | 2 + docs/source/butterfree.load.processing.rst | 2 + docs/source/butterfree.load.rst | 2 + docs/source/butterfree.load.writers.rst | 6 + docs/source/butterfree.pipelines.rst | 2 + docs/source/butterfree.reports.rst | 2 + docs/source/butterfree.transform.features.rst | 6 + docs/source/butterfree.transform.rst | 4 + .../butterfree.transform.transformations.rst | 14 ++ ...transformations.user_defined_functions.rst | 4 + docs/source/butterfree.transform.utils.rst | 6 + docs/source/butterfree.validations.rst | 4 + mypy.ini | 39 +++++ requirements.txt | 1 + setup.py | 2 +- .../load/writers/test_delta_writer.py | 83 +++++++++ .../test_historical_feature_store_writer.py | 25 +++ tests/unit/butterfree/transform/conftest.py | 38 ++-- 35 files changed, 560 insertions(+), 54 deletions(-) delete mode 100644 .github/workflows/skip_lint.yml create mode 100644 butterfree/load/writers/delta_writer.py create mode 100644 tests/unit/butterfree/load/writers/test_delta_writer.py diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index d33e4aa03..8b4d9c73c 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -4,11 +4,9 @@ on: paths: - 'setup.py' - jobs: Pipeline: if: github.ref == 'refs/heads/master' - runs-on: ubuntu-latest steps: @@ -19,7 +17,7 @@ jobs: - uses: 
actions/setup-java@v4 with: - java-version: '11' + java-version: '17' distribution: microsoft - uses: vemonet/setup-spark@v1 diff --git a/.github/workflows/skip_lint.yml b/.github/workflows/skip_lint.yml deleted file mode 100644 index 1c768a238..000000000 --- a/.github/workflows/skip_lint.yml +++ /dev/null @@ -1,17 +0,0 @@ -# This step is used only because we want to mark the runner-linter check as required -# for PRs to develop, but not for the merge queue to merge into develop, -# github does not have this functionality yet - -name: 'Skip github-actions/runner-linter check at merge queue' - -on: - merge_group: - -jobs: - empty_job: - name: 'github-actions/runner-linter' - runs-on: github-actions-developers-runner - steps: - - name: Skip github-actions/runner-linter check at merge queue - run: | - echo "Done" diff --git a/.github/workflows/staging.yml b/.github/workflows/staging.yml index 573049cac..9885ba688 100644 --- a/.github/workflows/staging.yml +++ b/.github/workflows/staging.yml @@ -7,7 +7,6 @@ on: jobs: Pipeline: if: github.ref == 'refs/heads/staging' - runs-on: ubuntu-latest steps: @@ -18,7 +17,7 @@ jobs: - uses: actions/setup-java@v4 with: - java-version: '11' + java-version: '17' distribution: microsoft - uses: vemonet/setup-spark@v1 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d588c8533..96ad666f9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,7 +19,7 @@ jobs: - uses: actions/setup-java@v4 with: - java-version: '11' + java-version: '17' distribution: microsoft - uses: vemonet/setup-spark@v1 diff --git a/CHANGELOG.md b/CHANGELOG.md index ad9f48634..fe9f9a8a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,28 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each ## [Unreleased] +## [1.3.5](https://github.com/quintoandar/butterfree/releases/tag/1.3.5) +* Auto create feature sets ([#368](https://github.com/quintoandar/butterfree/pull/368)) + +## [1.3.4](https://github.com/quintoandar/butterfree/releases/tag/1.3.4) +* Fix Cassandra Config and tests ([#366](https://github.com/quintoandar/butterfree/pull/366)) + +## [1.3.3](https://github.com/quintoandar/butterfree/releases/tag/1.3.3) +* Fix Cassandra Config and Numpy version ([#364](https://github.com/quintoandar/butterfree/pull/364)) + +## [1.3.2](https://github.com/quintoandar/butterfree/releases/tag/1.3.2) +* Fix publish script ([#362](https://github.com/quintoandar/butterfree/pull/362)) + +## [1.3.2](https://github.com/quintoandar/butterfree/releases/tag/1.3.2) +* Fix publish script ([#360](https://github.com/quintoandar/butterfree/pull/362)) + +## [1.3.1](https://github.com/quintoandar/butterfree/releases/tag/1.3.1) +* Timestamp NTZ available ([#360](https://github.com/quintoandar/butterfree/pull/360)) + +## [1.3.0](https://github.com/quintoandar/butterfree/releases/tag/1.3.0) +* Bump versions ([#355](https://github.com/quintoandar/butterfree/pull/355)) +* Sphinx version ([#356](https://github.com/quintoandar/butterfree/pull/356)) + ## [1.2.4](https://github.com/quintoandar/butterfree/releases/tag/1.2.4) * Auto create feature sets ([#351](https://github.com/quintoandar/butterfree/pull/351)) diff --git a/Makefile b/Makefile index bf9ccd647..a93104ab3 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ minimum-requirements: .PHONY: requirements ## install all requirements -requirements: requirements-test requirements-lint dev-requirements minimum-requirements +requirements: minimum-requirements dev-requirements 
requirements-test requirements-lint .PHONY: ci-install ci-install: @@ -146,6 +146,7 @@ package-name: .PHONY: package ## build butterfree package wheel package: + @PYTHONPATH=. pip3 install wheel @PYTHONPATH=. python -m setup sdist bdist_wheel .PHONY: update-docs diff --git a/butterfree/clients/spark_client.py b/butterfree/clients/spark_client.py index 933c21651..f4b6ea652 100644 --- a/butterfree/clients/spark_client.py +++ b/butterfree/clients/spark_client.py @@ -30,6 +30,7 @@ def conn(self) -> SparkSession: """ if not self._session: self._session = SparkSession.builder.getOrCreate() + return self._session def read( diff --git a/butterfree/load/writers/__init__.py b/butterfree/load/writers/__init__.py index 72945d278..f1f0e449b 100644 --- a/butterfree/load/writers/__init__.py +++ b/butterfree/load/writers/__init__.py @@ -1,8 +1,9 @@ """Holds data loaders for historical and online feature store.""" +from butterfree.load.writers.delta_writer import DeltaWriter from butterfree.load.writers.historical_feature_store_writer import ( HistoricalFeatureStoreWriter, ) from butterfree.load.writers.online_feature_store_writer import OnlineFeatureStoreWriter -__all__ = ["HistoricalFeatureStoreWriter", "OnlineFeatureStoreWriter"] +__all__ = ["HistoricalFeatureStoreWriter", "OnlineFeatureStoreWriter", "DeltaWriter"] diff --git a/butterfree/load/writers/delta_writer.py b/butterfree/load/writers/delta_writer.py new file mode 100644 index 000000000..933f1adb0 --- /dev/null +++ b/butterfree/load/writers/delta_writer.py @@ -0,0 +1,162 @@ +from delta.tables import DeltaTable +from pyspark.sql.dataframe import DataFrame + +from butterfree.clients import SparkClient +from butterfree.configs.logger import __logger + +logger = __logger("delta_writer", True) + + +class DeltaWriter: + """Control operations on Delta Tables. + + Resposible for merging and optimizing. + """ + + @staticmethod + def _get_full_table_name(table, database): + if database: + return "{}.{}".format(database, table) + else: + return table + + @staticmethod + def _convert_to_delta(client: SparkClient, table: str): + logger.info(f"Converting {table} to Delta...") + client.conn.sql(f"CONVERT TO DELTA {table}") + logger.info("Conversion done.") + + @staticmethod + def merge( + client: SparkClient, + database: str, + table: str, + merge_on: list, + source_df: DataFrame, + when_not_matched_insert_condition: str = None, + when_matched_update_condition: str = None, + when_matched_delete_condition: str = None, + ): + """ + Merge a source dataframe to a Delta table. + + By default, it will update when matched, and insert when + not matched (simple upsert). + + You can change this behavior by setting: + - when_not_matched_insert_condition: it will only insert + when this specified condition is true + - when_matched_update_condition: it will only update when this + specified condition is true. You can refer to the columns + in the source dataframe as source., and the columns + in the target table as target.. + - when_matched_delete_condition: it will add an operation to delete, + but only if this condition is true. Again, source and + target dataframe columns can be referred to respectively as + source. and target. 
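As the docstring above explains, `merge` defaults to a plain upsert and the optional conditions narrow it, with columns qualified as `source.<col>` and `target.<col>`. A hedged usage sketch: database, table and column names are made up, and a Delta-enabled Spark session is assumed.

```python
from butterfree.clients import SparkClient
from butterfree.load.writers import DeltaWriter

client = SparkClient()

new_rows = client.conn.createDataFrame(
    [(1, "2024-01-01", 100.0), (2, "2024-01-01", 250.0)],
    ["id", "timestamp", "feature1"],
)

DeltaWriter.merge(
    client=client,
    database="feature_store",   # hypothetical database
    table="user_features",      # hypothetical table
    merge_on=["id", "timestamp"],
    source_df=new_rows,
    # Only rewrite rows whose value actually changed.
    when_matched_update_condition="source.feature1 <> target.feature1",
)
```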
+ """ + try: + full_table_name = DeltaWriter._get_full_table_name(table, database) + + table_exists = client.conn.catalog.tableExists(full_table_name) + + if table_exists: + pd_df = client.conn.sql( + f"DESCRIBE TABLE EXTENDED {full_table_name}" + ).toPandas() + provider = ( + pd_df.reset_index() + .groupby(["col_name"])["data_type"] + .aggregate("first") + .Provider + ) + table_is_delta = provider.lower() == "delta" + + if not table_is_delta: + DeltaWriter()._convert_to_delta(client, full_table_name) + + # For schema evolution + client.conn.conf.set( + "spark.databricks.delta.schema.autoMerge.enabled", "true" + ) + + target_table = DeltaTable.forName(client.conn, full_table_name) + join_condition = " AND ".join( + [f"source.{col} = target.{col}" for col in merge_on] + ) + merge_builder = target_table.alias("target").merge( + source_df.alias("source"), join_condition + ) + if when_matched_delete_condition: + merge_builder = merge_builder.whenMatchedDelete( + condition=when_matched_delete_condition + ) + + merge_builder.whenMatchedUpdateAll( + condition=when_matched_update_condition + ).whenNotMatchedInsertAll( + condition=when_not_matched_insert_condition + ).execute() + except Exception as e: + logger.error(f"Merge operation on {full_table_name} failed: {e}") + + @staticmethod + def vacuum(table: str, retention_hours: int, client: SparkClient): + """Vacuum a Delta table. + + Vacuum remove unused files (files not managed by Delta + files + that are not in the latest state). + After vacuum it's impossible to time travel to versions + older than the `retention` time. + Default retention is 7 days. Lower retentions will be warned, + unless it's set to false. + Set spark.databricks.delta.retentionDurationCheck.enabled + to false for low retentions. + https://docs.databricks.com/en/sql/language-manual/delta-vacuum.html + """ + + command = f"VACUUM {table} RETAIN {retention_hours} HOURS" + logger.info(f"Running vacuum with command {command}") + client.conn.sql(command) + logger.info(f"Vacuum successful for table {table}") + + @staticmethod + def optimize( + client: SparkClient, + table: str = None, + z_order: list = None, + date_column: str = "timestamp", + from_date: str = None, + auto_compact: bool = False, + optimize_write: bool = False, + ): + """Optimize a Delta table. + + For auto-compaction and optimize write DBR >= 14.3 LTS + and Delta >= 3.1.0 are MANDATORY. + For z-ordering DBR >= 13.3 LTS and Delta >= 2.0.0 are MANDATORY. + Auto-compaction (recommended) reduces the small file problem + (overhead due to lots of metadata). + Z-order by columns that is commonly used in queries + predicates and has a high cardinality. 
+ https://docs.delta.io/latest/optimizations-oss.html + """ + + if auto_compact: + client.conf.set("spark.databricks.delta.autoCompact.enabled", "true") + + if optimize_write: + client.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true") + + if table: + command = f"OPTIMIZE {table}" + + if from_date: + command += f"WHERE {date_column} >= {from_date}" + + if z_order: + command += f" ZORDER BY {','.join(z_order)}" + + logger.info(f"Running optimize with command {command}...") + client.conn.sql(command) + logger.info(f"Optimize successful for table {table}.") diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index c01fee1d8..0be7d6af3 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -14,6 +14,7 @@ from butterfree.dataframe_service import repartition_df from butterfree.hooks import Hook from butterfree.hooks.schema_compatibility import SparkTableSchemaCompatibilityHook +from butterfree.load.writers.delta_writer import DeltaWriter from butterfree.load.writers.writer import Writer from butterfree.transform import FeatureSet @@ -114,6 +115,7 @@ def __init__( interval_mode: bool = False, check_schema_hook: Optional[Hook] = None, row_count_validation: bool = True, + merge_on: list = None, ): super(HistoricalFeatureStoreWriter, self).__init__( db_config or MetastoreConfig(), @@ -121,6 +123,7 @@ def __init__( interval_mode, False, row_count_validation, + merge_on, ) self.database = database or environment.get_variable( "FEATURE_STORE_HISTORICAL_DATABASE" @@ -141,6 +144,7 @@ def write( feature_set: object processed with feature_set informations. dataframe: spark dataframe containing data from a feature set. spark_client: client for spark connections with external services. + merge_on: when filled, the writing is an upsert in a Delta table. 
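With the new `merge_on` argument documented above, the historical writer switches from a partitioned append to a Delta upsert keyed on the given columns, delegating to `DeltaWriter.merge`. A hedged configuration sketch (the database name and merge keys are illustrative):

```python
from butterfree.load.sink import Sink
from butterfree.load.writers import HistoricalFeatureStoreWriter

# Default behavior: append partitioned by year/month/day, as before.
append_writer = HistoricalFeatureStoreWriter(database="feature_store")

# Upsert behavior: write() routes through DeltaWriter.merge on these keys.
upsert_writer = HistoricalFeatureStoreWriter(
    database="feature_store",
    merge_on=["id", "timestamp"],
)

sink = Sink(writers=[upsert_writer])
```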
If the debug_mode is set to True, a temporary table with a name in the format: historical_feature_store__{feature_set.name} will be created instead of writing @@ -174,13 +178,22 @@ def write( s3_key = os.path.join("historical", feature_set.entity, feature_set.name) - spark_client.write_table( - dataframe=dataframe, - database=self.database, - table_name=feature_set.name, - partition_by=self.PARTITION_BY, - **self.db_config.get_options(s3_key), - ) + if self.merge_on: + DeltaWriter.merge( + client=spark_client, + database=self.database, + table=feature_set.name, + merge_on=self.merge_on, + source_df=dataframe, + ) + else: + spark_client.write_table( + dataframe=dataframe, + database=self.database, + table_name=feature_set.name, + partition_by=self.PARTITION_BY, + **self.db_config.get_options(s3_key), + ) def _assert_validation_count( self, table_name: str, written_count: int, dataframe_count: int diff --git a/butterfree/load/writers/writer.py b/butterfree/load/writers/writer.py index 780b9ec2d..a99514ae8 100644 --- a/butterfree/load/writers/writer.py +++ b/butterfree/load/writers/writer.py @@ -27,6 +27,7 @@ def __init__( interval_mode: Optional[bool] = False, write_to_entity: Optional[bool] = False, row_count_validation: Optional[bool] = True, + merge_on: Optional[list] = None, ) -> None: super().__init__() self.db_config = db_config @@ -35,6 +36,7 @@ def __init__( self.interval_mode = interval_mode self.write_to_entity = write_to_entity self.row_count_validation = row_count_validation + self.merge_on = merge_on def with_( self, transformer: Callable[..., DataFrame], *args: Any, **kwargs: Any diff --git a/docs/source/butterfree.clients.rst b/docs/source/butterfree.clients.rst index 1bfaa86df..b1e1029a3 100644 --- a/docs/source/butterfree.clients.rst +++ b/docs/source/butterfree.clients.rst @@ -4,18 +4,24 @@ butterfree.clients package Submodules ---------- +butterfree.clients.abstract\_client module +------------------------------------------ .. automodule:: butterfree.clients.abstract_client :members: :undoc-members: :show-inheritance: +butterfree.clients.cassandra\_client module +------------------------------------------- .. automodule:: butterfree.clients.cassandra_client :members: :undoc-members: :show-inheritance: +butterfree.clients.spark\_client module +--------------------------------------- .. automodule:: butterfree.clients.spark_client :members: diff --git a/docs/source/butterfree.configs.db.rst b/docs/source/butterfree.configs.db.rst index 3bb9f8b88..6e23dc1c3 100644 --- a/docs/source/butterfree.configs.db.rst +++ b/docs/source/butterfree.configs.db.rst @@ -4,24 +4,32 @@ butterfree.configs.db package Submodules ---------- +butterfree.configs.db.abstract\_config module +--------------------------------------------- .. automodule:: butterfree.configs.db.abstract_config :members: :undoc-members: :show-inheritance: +butterfree.configs.db.cassandra\_config module +---------------------------------------------- .. automodule:: butterfree.configs.db.cassandra_config :members: :undoc-members: :show-inheritance: +butterfree.configs.db.kafka\_config module +------------------------------------------ .. automodule:: butterfree.configs.db.kafka_config :members: :undoc-members: :show-inheritance: +butterfree.configs.db.metastore\_config module +---------------------------------------------- .. 
automodule:: butterfree.configs.db.metastore_config :members: diff --git a/docs/source/butterfree.configs.rst b/docs/source/butterfree.configs.rst index f3cf2aa29..18a82795d 100644 --- a/docs/source/butterfree.configs.rst +++ b/docs/source/butterfree.configs.rst @@ -12,12 +12,41 @@ Subpackages Submodules ---------- +butterfree.configs.environment module +------------------------------------- .. automodule:: butterfree.configs.environment :members: :undoc-members: :show-inheritance: +butterfree.configs.logger module +-------------------------------- + +.. automodule:: butterfree.configs.logger + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.configs.logger + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.configs.logger + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.configs.logger + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.configs.logger + :members: + :undoc-members: + :show-inheritance: .. automodule:: butterfree.configs.logger :members: diff --git a/docs/source/butterfree.constants.rst b/docs/source/butterfree.constants.rst index d0e72fedd..e90b195e7 100644 --- a/docs/source/butterfree.constants.rst +++ b/docs/source/butterfree.constants.rst @@ -4,18 +4,56 @@ butterfree.constants package Submodules ---------- +butterfree.constants.columns module +----------------------------------- .. automodule:: butterfree.constants.columns :members: :undoc-members: :show-inheritance: +butterfree.constants.data\_type module +-------------------------------------- .. automodule:: butterfree.constants.data_type :members: :undoc-members: :show-inheritance: +butterfree.constants.migrations module +-------------------------------------- + +.. automodule:: butterfree.constants.migrations + :members: + :undoc-members: + :show-inheritance: + +butterfree.constants.spark\_constants module +-------------------------------------------- + +.. automodule:: butterfree.constants.migrations + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.constants.migrations + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.constants.migrations + :members: + :undoc-members: + :show-inheritance: + + +.. automodule:: butterfree.constants.migrations + :members: + :undoc-members: + :show-inheritance: + .. automodule:: butterfree.constants.migrations :members: @@ -28,6 +66,33 @@ Submodules :undoc-members: :show-inheritance: +butterfree.constants.window\_definitions module +----------------------------------------------- + +.. automodule:: butterfree.constants.window_definitions + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.constants.window_definitions + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.constants.window_definitions + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.constants.window_definitions + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: butterfree.constants.window_definitions + :members: + :undoc-members: + :show-inheritance: .. 
automodule:: butterfree.constants.window_definitions :members: diff --git a/docs/source/butterfree.extract.pre_processing.rst b/docs/source/butterfree.extract.pre_processing.rst index 172e6fb3c..e8e66e3d0 100644 --- a/docs/source/butterfree.extract.pre_processing.rst +++ b/docs/source/butterfree.extract.pre_processing.rst @@ -4,30 +4,40 @@ butterfree.extract.pre\_processing package Submodules ---------- +butterfree.extract.pre\_processing.explode\_json\_column\_transform module +-------------------------------------------------------------------------- .. automodule:: butterfree.extract.pre_processing.explode_json_column_transform :members: :undoc-members: :show-inheritance: +butterfree.extract.pre\_processing.filter\_transform module +----------------------------------------------------------- .. automodule:: butterfree.extract.pre_processing.filter_transform :members: :undoc-members: :show-inheritance: +butterfree.extract.pre\_processing.forward\_fill\_transform module +------------------------------------------------------------------ .. automodule:: butterfree.extract.pre_processing.forward_fill_transform :members: :undoc-members: :show-inheritance: +butterfree.extract.pre\_processing.pivot\_transform module +---------------------------------------------------------- .. automodule:: butterfree.extract.pre_processing.pivot_transform :members: :undoc-members: :show-inheritance: +butterfree.extract.pre\_processing.replace\_transform module +------------------------------------------------------------ .. automodule:: butterfree.extract.pre_processing.replace_transform :members: diff --git a/docs/source/butterfree.extract.readers.rst b/docs/source/butterfree.extract.readers.rst index a67d47e96..40df200eb 100644 --- a/docs/source/butterfree.extract.readers.rst +++ b/docs/source/butterfree.extract.readers.rst @@ -4,24 +4,32 @@ butterfree.extract.readers package Submodules ---------- +butterfree.extract.readers.file\_reader module +---------------------------------------------- .. automodule:: butterfree.extract.readers.file_reader :members: :undoc-members: :show-inheritance: +butterfree.extract.readers.kafka\_reader module +----------------------------------------------- .. automodule:: butterfree.extract.readers.kafka_reader :members: :undoc-members: :show-inheritance: +butterfree.extract.readers.reader module +---------------------------------------- .. automodule:: butterfree.extract.readers.reader :members: :undoc-members: :show-inheritance: +butterfree.extract.readers.table\_reader module +----------------------------------------------- .. automodule:: butterfree.extract.readers.table_reader :members: diff --git a/docs/source/butterfree.extract.rst b/docs/source/butterfree.extract.rst index a59d2e292..455f02d5f 100644 --- a/docs/source/butterfree.extract.rst +++ b/docs/source/butterfree.extract.rst @@ -13,6 +13,8 @@ Subpackages Submodules ---------- +butterfree.extract.source module +-------------------------------- .. automodule:: butterfree.extract.source :members: diff --git a/docs/source/butterfree.load.processing.rst b/docs/source/butterfree.load.processing.rst index 4c5d2a2e8..d16182cb1 100644 --- a/docs/source/butterfree.load.processing.rst +++ b/docs/source/butterfree.load.processing.rst @@ -4,6 +4,8 @@ butterfree.load.processing package Submodules ---------- +butterfree.load.processing.json\_transform module +------------------------------------------------- .. 
automodule:: butterfree.load.processing.json_transform :members: diff --git a/docs/source/butterfree.load.rst b/docs/source/butterfree.load.rst index e38934a5a..e4b56fbcb 100644 --- a/docs/source/butterfree.load.rst +++ b/docs/source/butterfree.load.rst @@ -13,6 +13,8 @@ Subpackages Submodules ---------- +butterfree.load.sink module +--------------------------- .. automodule:: butterfree.load.sink :members: diff --git a/docs/source/butterfree.load.writers.rst b/docs/source/butterfree.load.writers.rst index 6ff438de9..2a173c9a4 100644 --- a/docs/source/butterfree.load.writers.rst +++ b/docs/source/butterfree.load.writers.rst @@ -4,18 +4,24 @@ butterfree.load.writers package Submodules ---------- +butterfree.load.writers.historical\_feature\_store\_writer module +----------------------------------------------------------------- .. automodule:: butterfree.load.writers.historical_feature_store_writer :members: :undoc-members: :show-inheritance: +butterfree.load.writers.online\_feature\_store\_writer module +------------------------------------------------------------- .. automodule:: butterfree.load.writers.online_feature_store_writer :members: :undoc-members: :show-inheritance: +butterfree.load.writers.writer module +------------------------------------- .. automodule:: butterfree.load.writers.writer :members: diff --git a/docs/source/butterfree.pipelines.rst b/docs/source/butterfree.pipelines.rst index e0c319962..e70a4d89e 100644 --- a/docs/source/butterfree.pipelines.rst +++ b/docs/source/butterfree.pipelines.rst @@ -4,6 +4,8 @@ butterfree.pipelines package Submodules ---------- +butterfree.pipelines.feature\_set\_pipeline module +-------------------------------------------------- .. automodule:: butterfree.pipelines.feature_set_pipeline :members: diff --git a/docs/source/butterfree.reports.rst b/docs/source/butterfree.reports.rst index 850db914a..a95a7e7fd 100644 --- a/docs/source/butterfree.reports.rst +++ b/docs/source/butterfree.reports.rst @@ -4,6 +4,8 @@ butterfree.reports package Submodules ---------- +butterfree.reports.metadata module +---------------------------------- .. automodule:: butterfree.reports.metadata :members: diff --git a/docs/source/butterfree.transform.features.rst b/docs/source/butterfree.transform.features.rst index f6c69095d..837e0fcf7 100644 --- a/docs/source/butterfree.transform.features.rst +++ b/docs/source/butterfree.transform.features.rst @@ -4,18 +4,24 @@ butterfree.transform.features package Submodules ---------- +butterfree.transform.features.feature module +-------------------------------------------- .. automodule:: butterfree.transform.features.feature :members: :undoc-members: :show-inheritance: +butterfree.transform.features.key\_feature module +------------------------------------------------- .. automodule:: butterfree.transform.features.key_feature :members: :undoc-members: :show-inheritance: +butterfree.transform.features.timestamp\_feature module +------------------------------------------------------- .. automodule:: butterfree.transform.features.timestamp_feature :members: diff --git a/docs/source/butterfree.transform.rst b/docs/source/butterfree.transform.rst index 02f8d4c61..12c346aed 100644 --- a/docs/source/butterfree.transform.rst +++ b/docs/source/butterfree.transform.rst @@ -14,12 +14,16 @@ Subpackages Submodules ---------- +butterfree.transform.aggregated\_feature\_set module +---------------------------------------------------- .. 
automodule:: butterfree.transform.aggregated_feature_set :members: :undoc-members: :show-inheritance: +butterfree.transform.feature\_set module +---------------------------------------- .. automodule:: butterfree.transform.feature_set :members: diff --git a/docs/source/butterfree.transform.transformations.rst b/docs/source/butterfree.transform.transformations.rst index 0978edcf1..f17818d3e 100644 --- a/docs/source/butterfree.transform.transformations.rst +++ b/docs/source/butterfree.transform.transformations.rst @@ -12,42 +12,56 @@ Subpackages Submodules ---------- +butterfree.transform.transformations.aggregated\_transform module +----------------------------------------------------------------- .. automodule:: butterfree.transform.transformations.aggregated_transform :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.custom\_transform module +------------------------------------------------------------- .. automodule:: butterfree.transform.transformations.custom_transform :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.h3\_transform module +--------------------------------------------------------- .. automodule:: butterfree.transform.transformations.h3_transform :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.spark\_function\_transform module +---------------------------------------------------------------------- .. automodule:: butterfree.transform.transformations.spark_function_transform :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.sql\_expression\_transform module +---------------------------------------------------------------------- .. automodule:: butterfree.transform.transformations.sql_expression_transform :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.stack\_transform module +------------------------------------------------------------ .. automodule:: butterfree.transform.transformations.stack_transform :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.transform\_component module +---------------------------------------------------------------- .. automodule:: butterfree.transform.transformations.transform_component :members: diff --git a/docs/source/butterfree.transform.transformations.user_defined_functions.rst b/docs/source/butterfree.transform.transformations.user_defined_functions.rst index f93c7e98c..b79e81386 100644 --- a/docs/source/butterfree.transform.transformations.user_defined_functions.rst +++ b/docs/source/butterfree.transform.transformations.user_defined_functions.rst @@ -4,12 +4,16 @@ butterfree.transform.transformations.user\_defined\_functions package Submodules ---------- +butterfree.transform.transformations.user\_defined\_functions.mode module +------------------------------------------------------------------------- .. automodule:: butterfree.transform.transformations.user_defined_functions.mode :members: :undoc-members: :show-inheritance: +butterfree.transform.transformations.user\_defined\_functions.most\_frequent\_set module +---------------------------------------------------------------------------------------- .. 
automodule:: butterfree.transform.transformations.user_defined_functions.most_frequent_set :members: diff --git a/docs/source/butterfree.transform.utils.rst b/docs/source/butterfree.transform.utils.rst index 82e9038bb..d1d7206ce 100644 --- a/docs/source/butterfree.transform.utils.rst +++ b/docs/source/butterfree.transform.utils.rst @@ -4,18 +4,24 @@ butterfree.transform.utils package Submodules ---------- +butterfree.transform.utils.date\_range module +--------------------------------------------- .. automodule:: butterfree.transform.utils.date_range :members: :undoc-members: :show-inheritance: +butterfree.transform.utils.function module +------------------------------------------ .. automodule:: butterfree.transform.utils.function :members: :undoc-members: :show-inheritance: +butterfree.transform.utils.window\_spec module +---------------------------------------------- .. automodule:: butterfree.transform.utils.window_spec :members: diff --git a/docs/source/butterfree.validations.rst b/docs/source/butterfree.validations.rst index 35f5d1992..2aa0053ec 100644 --- a/docs/source/butterfree.validations.rst +++ b/docs/source/butterfree.validations.rst @@ -4,12 +4,16 @@ butterfree.validations package Submodules ---------- +butterfree.validations.basic\_validaton module +---------------------------------------------- .. automodule:: butterfree.validations.basic_validaton :members: :undoc-members: :show-inheritance: +butterfree.validations.validation module +---------------------------------------- .. automodule:: butterfree.validations.validation :members: diff --git a/mypy.ini b/mypy.ini index eb867a477..fc2931493 100644 --- a/mypy.ini +++ b/mypy.ini @@ -9,3 +9,42 @@ show_error_codes = True show_error_context = True disable_error_code = attr-defined, list-item, operator pretty = True + +[mypy-butterfree.pipelines.*] +ignore_errors = True + +[mypy-butterfree.load.*] +ignore_errors = True + +[mypy-butterfree.transform.*] +ignore_errors = True + +[mypy-butterfree.extract.*] +ignore_errors = True + +[mypy-butterfree.config.*] +ignore_errors = True + +[mypy-butterfree.clients.*] +ignore_errors = True + +[mypy-butterfree.configs.*] +ignore_errors = True + +[mypy-butterfree.dataframe_service.*] +ignore_errors = True + +[mypy-butterfree.validations.*] +ignore_errors = True + +[mypy-butterfree.migrations.*] +ignore_errors = True + +[mypy-butterfree.testing.*] +ignore_errors = True + +[mypy-butterfree.hooks.*] +ignore_errors = True + +[mypy-butterfree._cli.*] +ignore_errors = True diff --git a/requirements.txt b/requirements.txt index f3968c60e..9c9eea640 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ typer==0.4.2 typing-extensions>3.7.4,<5 boto3==1.17.* numpy==1.26.4 +delta-spark==3.2.0 diff --git a/setup.py b/setup.py index 42ef57c85..bc4f0b453 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages, setup __package_name__ = "butterfree" -__version__ = "1.2.4" +__version__ = "1.3.5" __repository_url__ = "https://github.com/quintoandar/butterfree" with open("requirements.txt") as f: diff --git a/tests/unit/butterfree/load/writers/test_delta_writer.py b/tests/unit/butterfree/load/writers/test_delta_writer.py new file mode 100644 index 000000000..550f6d057 --- /dev/null +++ b/tests/unit/butterfree/load/writers/test_delta_writer.py @@ -0,0 +1,83 @@ +import os +from unittest import mock + +import pytest + +from butterfree.clients import SparkClient +from butterfree.load.writers import DeltaWriter + +DELTA_LOCATION = "spark-warehouse" + + +class 
TestDeltaWriter: + + def __checkFileExists(self, file_name: str = "test_delta_table") -> bool: + return os.path.exists(os.path.join(DELTA_LOCATION, file_name)) + + @pytest.fixture + def merge_builder_mock(self): + builder = mock.MagicMock() + builder.whenMatchedDelete.return_value = builder + builder.whenMatchedUpdateAll.return_value = builder + builder.whenNotMatchedInsertAll.return_value = builder + return builder + + def test_merge(self, feature_set_dataframe, merge_builder_mock): + + client = SparkClient() + delta_writer = DeltaWriter() + delta_writer.merge = mock.MagicMock() + + DeltaWriter().merge( + client=client, + database=None, + table="test_delta_table", + merge_on=["id"], + source_df=feature_set_dataframe, + ) + + assert merge_builder_mock.execute.assert_called_once + + # Step 2 + source = client.conn.createDataFrame( + [(1, "test3"), (2, "test4"), (3, "test5")], ["id", "feature"] + ) + + DeltaWriter().merge( + client=client, + database=None, + table="test_delta_table", + merge_on=["id"], + source_df=source, + when_not_matched_insert_condition=None, + when_matched_update_condition="id > 2", + ) + + assert merge_builder_mock.execute.assert_called_once + + def test_optimize(self, mocker): + + client = SparkClient() + conn_mock = mocker.patch( + "butterfree.clients.SparkClient.conn", return_value=mock.Mock() + ) + dw = DeltaWriter() + + dw.optimize = mock.MagicMock(client) + dw.optimize(client, "a_table") + + conn_mock.assert_called_once + + def test_vacuum(self, mocker): + + client = SparkClient() + conn_mock = mocker.patch( + "butterfree.clients.SparkClient.conn", return_value=mock.Mock() + ) + dw = DeltaWriter() + retention_hours = 24 + dw.vacuum = mock.MagicMock(client) + + dw.vacuum("a_table", retention_hours, client) + + conn_mock.assert_called_once diff --git a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py index 9e84aacda..d9d9181a4 100644 --- a/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py +++ b/tests/unit/butterfree/load/writers/test_historical_feature_store_writer.py @@ -1,5 +1,6 @@ import datetime import random +from unittest import mock import pytest from pyspark.sql.functions import spark_partition_id @@ -145,6 +146,30 @@ def test_write_in_debug_mode_with_interval_mode( # then assert_dataframe_equality(historical_feature_set_dataframe, result_df) + def test_merge_from_historical_writer( + self, + feature_set, + feature_set_dataframe, + mocker, + ): + # given + spark_client = SparkClient() + + spark_client.write_table = mocker.stub("write_table") + writer = HistoricalFeatureStoreWriter(merge_on=["id", "timestamp"]) + + static_mock = mocker.patch( + "butterfree.load.writers.DeltaWriter.merge", return_value=mock.Mock() + ) + + writer.write( + feature_set=feature_set, + dataframe=feature_set_dataframe, + spark_client=spark_client, + ) + + assert static_mock.call_count == 1 + def test_validate(self, historical_feature_set_dataframe, mocker, feature_set): # given spark_client = mocker.stub("spark_client") diff --git a/tests/unit/butterfree/transform/conftest.py b/tests/unit/butterfree/transform/conftest.py index fcf601328..104300c99 100644 --- a/tests/unit/butterfree/transform/conftest.py +++ b/tests/unit/butterfree/transform/conftest.py @@ -1,6 +1,6 @@ -import json from unittest.mock import Mock +import pyspark.pandas as ps from pyspark.sql import functions from pytest import fixture @@ -54,7 +54,8 @@ def make_dataframe(spark_context, 
spark_session): "nonfeature": 0, }, ] - df = spark_session.read.json(spark_context.parallelize(data, 1)) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) return df @@ -70,9 +71,8 @@ def make_filtering_dataframe(spark_context, spark_session): {"id": 1, "ts": 6, "feature1": None, "feature2": None, "feature3": None}, {"id": 1, "ts": 7, "feature1": None, "feature2": None, "feature3": None}, ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) return df @@ -86,9 +86,8 @@ def make_output_filtering_dataframe(spark_context, spark_session): {"id": 1, "ts": 4, "feature1": 0, "feature2": 1, "feature3": 1}, {"id": 1, "ts": 6, "feature1": None, "feature2": None, "feature3": None}, ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) return df @@ -127,9 +126,8 @@ def make_rolling_windows_agg_dataframe(spark_context, spark_session): "feature2__avg_over_1_week_rolling_windows": None, }, ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) return df @@ -156,9 +154,8 @@ def make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): "feature2__avg_over_1_day_rolling_windows": 500.0, }, ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) return df @@ -205,9 +202,8 @@ def make_multiple_rolling_windows_hour_slide_agg_dataframe( "feature2__avg_over_3_days_rolling_windows": 500.0, }, ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) return df @@ -257,7 +253,8 @@ def make_fs_dataframe_with_distinct(spark_context, spark_session): "h3": "86a8100efffffff", }, ] - df = spark_session.read.json(spark_context.parallelize(data, 1)) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) return df @@ -286,9 +283,8 @@ def make_target_df_distinct(spark_context, spark_session): "feature__sum_over_3_days_rolling_windows": None, }, ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) return df From 77539ae7b9cfd54fd8874ade158a2054e0383739 Mon Sep 17 00:00:00 2001 From: Ralph Filho Date: Tue, 20 Aug 2024 17:32:45 -0300 Subject: [PATCH 68/70] release 1.4.0 --- CHANGELOG.md | 3 + docs/source/butterfree.automated.rst | 2 + docs/source/butterfree.configs.rst | 25 --------- docs/source/butterfree.constants.rst | 55 ------------------- docs/source/butterfree.dataframe_service.rst | 6 ++ docs/source/butterfree.hooks.rst | 4 ++ .../butterfree.hooks.schema_compatibility.rst | 
4 ++ docs/source/butterfree.load.writers.rst | 8 +++ ...tterfree.migrations.database_migration.rst | 6 ++ logging.json | 0 setup.py | 2 +- 11 files changed, 34 insertions(+), 81 deletions(-) create mode 100644 logging.json diff --git a/CHANGELOG.md b/CHANGELOG.md index fe9f9a8a3..19d9b5f41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ Preferably use **Added**, **Changed**, **Removed** and **Fixed** topics in each ## [Unreleased] +## [1.4.0](https://github.com/quintoandar/butterfree/releases/tag/1.4.0) +* Add Delta support ([#370](https://github.com/quintoandar/butterfree/pull/370)) + ## [1.3.5](https://github.com/quintoandar/butterfree/releases/tag/1.3.5) * Auto create feature sets ([#368](https://github.com/quintoandar/butterfree/pull/368)) diff --git a/docs/source/butterfree.automated.rst b/docs/source/butterfree.automated.rst index de290d9c7..9c01ac54e 100644 --- a/docs/source/butterfree.automated.rst +++ b/docs/source/butterfree.automated.rst @@ -4,6 +4,8 @@ butterfree.automated package Submodules ---------- +butterfree.automated.feature\_set\_creation module +-------------------------------------------------- .. automodule:: butterfree.automated.feature_set_creation :members: diff --git a/docs/source/butterfree.configs.rst b/docs/source/butterfree.configs.rst index 18a82795d..20432e45b 100644 --- a/docs/source/butterfree.configs.rst +++ b/docs/source/butterfree.configs.rst @@ -23,31 +23,6 @@ butterfree.configs.environment module butterfree.configs.logger module -------------------------------- -.. automodule:: butterfree.configs.logger - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.configs.logger - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.configs.logger - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.configs.logger - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.configs.logger - :members: - :undoc-members: - :show-inheritance: - .. automodule:: butterfree.configs.logger :members: :undoc-members: diff --git a/docs/source/butterfree.constants.rst b/docs/source/butterfree.constants.rst index e90b195e7..e5727fd18 100644 --- a/docs/source/butterfree.constants.rst +++ b/docs/source/butterfree.constants.rst @@ -31,36 +31,6 @@ butterfree.constants.migrations module butterfree.constants.spark\_constants module -------------------------------------------- -.. automodule:: butterfree.constants.migrations - :members: - :undoc-members: - :show-inheritance: - - -.. automodule:: butterfree.constants.migrations - :members: - :undoc-members: - :show-inheritance: - - -.. automodule:: butterfree.constants.migrations - :members: - :undoc-members: - :show-inheritance: - - -.. automodule:: butterfree.constants.migrations - :members: - :undoc-members: - :show-inheritance: - - -.. automodule:: butterfree.constants.migrations - :members: - :undoc-members: - :show-inheritance: - - .. automodule:: butterfree.constants.spark_constants :members: :undoc-members: @@ -69,31 +39,6 @@ butterfree.constants.spark\_constants module butterfree.constants.window\_definitions module ----------------------------------------------- -.. automodule:: butterfree.constants.window_definitions - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.constants.window_definitions - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.constants.window_definitions - :members: - :undoc-members: - :show-inheritance: - -.. 
automodule:: butterfree.constants.window_definitions - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: butterfree.constants.window_definitions - :members: - :undoc-members: - :show-inheritance: - .. automodule:: butterfree.constants.window_definitions :members: :undoc-members: diff --git a/docs/source/butterfree.dataframe_service.rst b/docs/source/butterfree.dataframe_service.rst index 4343305b6..ae9658a5b 100644 --- a/docs/source/butterfree.dataframe_service.rst +++ b/docs/source/butterfree.dataframe_service.rst @@ -4,18 +4,24 @@ butterfree.dataframe\_service package Submodules ---------- +butterfree.dataframe\_service.incremental\_strategy module +---------------------------------------------------------- .. automodule:: butterfree.dataframe_service.incremental_strategy :members: :undoc-members: :show-inheritance: +butterfree.dataframe\_service.partitioning module +------------------------------------------------- .. automodule:: butterfree.dataframe_service.partitioning :members: :undoc-members: :show-inheritance: +butterfree.dataframe\_service.repartition module +------------------------------------------------ .. automodule:: butterfree.dataframe_service.repartition :members: diff --git a/docs/source/butterfree.hooks.rst b/docs/source/butterfree.hooks.rst index 72f13223d..c633cade3 100644 --- a/docs/source/butterfree.hooks.rst +++ b/docs/source/butterfree.hooks.rst @@ -12,12 +12,16 @@ Subpackages Submodules ---------- +butterfree.hooks.hook module +---------------------------- .. automodule:: butterfree.hooks.hook :members: :undoc-members: :show-inheritance: +butterfree.hooks.hookable\_component module +------------------------------------------- .. automodule:: butterfree.hooks.hookable_component :members: diff --git a/docs/source/butterfree.hooks.schema_compatibility.rst b/docs/source/butterfree.hooks.schema_compatibility.rst index a39c5b935..2d3de66ce 100644 --- a/docs/source/butterfree.hooks.schema_compatibility.rst +++ b/docs/source/butterfree.hooks.schema_compatibility.rst @@ -4,12 +4,16 @@ butterfree.hooks.schema\_compatibility package Submodules ---------- +butterfree.hooks.schema\_compatibility.cassandra\_table\_schema\_compatibility\_hook module +------------------------------------------------------------------------------------------- .. automodule:: butterfree.hooks.schema_compatibility.cassandra_table_schema_compatibility_hook :members: :undoc-members: :show-inheritance: +butterfree.hooks.schema\_compatibility.spark\_table\_schema\_compatibility\_hook module +--------------------------------------------------------------------------------------- .. automodule:: butterfree.hooks.schema_compatibility.spark_table_schema_compatibility_hook :members: diff --git a/docs/source/butterfree.load.writers.rst b/docs/source/butterfree.load.writers.rst index 2a173c9a4..b20eb85ea 100644 --- a/docs/source/butterfree.load.writers.rst +++ b/docs/source/butterfree.load.writers.rst @@ -4,6 +4,14 @@ butterfree.load.writers package Submodules ---------- +butterfree.load.writers.delta\_writer module +-------------------------------------------- + +.. 
automodule:: butterfree.load.writers.delta_writer
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 butterfree.load.writers.historical\_feature\_store\_writer module
 -----------------------------------------------------------------
 
diff --git a/docs/source/butterfree.migrations.database_migration.rst b/docs/source/butterfree.migrations.database_migration.rst
index 892165dfc..32ba4d4d0 100644
--- a/docs/source/butterfree.migrations.database_migration.rst
+++ b/docs/source/butterfree.migrations.database_migration.rst
@@ -4,18 +4,24 @@ butterfree.migrations.database\_migration package
 Submodules
 ----------
 
+butterfree.migrations.database\_migration.cassandra\_migration module
+---------------------------------------------------------------------
 
 .. automodule:: butterfree.migrations.database_migration.cassandra_migration
    :members:
    :undoc-members:
    :show-inheritance:
 
+butterfree.migrations.database\_migration.database\_migration module
+--------------------------------------------------------------------
 
 .. automodule:: butterfree.migrations.database_migration.database_migration
    :members:
    :undoc-members:
    :show-inheritance:
 
+butterfree.migrations.database\_migration.metastore\_migration module
+---------------------------------------------------------------------
 
 .. automodule:: butterfree.migrations.database_migration.metastore_migration
    :members:
diff --git a/logging.json b/logging.json
new file mode 100644
index 000000000..e69de29bb
diff --git a/setup.py b/setup.py
index bc4f0b453..e6b9f7618 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import find_packages, setup
 
 __package_name__ = "butterfree"
-__version__ = "1.3.5"
+__version__ = "1.4.0"
 __repository_url__ = "https://github.com/quintoandar/butterfree"
 
 with open("requirements.txt") as f:

From f6c5db6f7c444125a241cde5062c7ca6acd06dd2 Mon Sep 17 00:00:00 2001
From: Ralph Rassweiler
Date: Wed, 21 Aug 2024 13:01:32 -0300
Subject: [PATCH 69/70] Fix dup code (#373)

* fix: dedup code
---
 .../historical_feature_store_writer.py       |  9 ++++
 tests/unit/butterfree/transform/conftest.py  | 53 ++++++-------------
 2 files changed, 26 insertions(+), 36 deletions(-)

diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py
index 0be7d6af3..99bfe66a5 100644
--- a/butterfree/load/writers/historical_feature_store_writer.py
+++ b/butterfree/load/writers/historical_feature_store_writer.py
@@ -93,6 +93,15 @@ class HistoricalFeatureStoreWriter(Writer):
     improve queries performance. The data is stored in partition folders in AWS S3
     based on time (per year, month and day).
 
+    >>> spark_client = SparkClient()
+    >>> writer = HistoricalFeatureStoreWriter(merge_on=["id", "timestamp"])
+    >>> writer.write(feature_set=feature_set,
+    ...              dataframe=dataframe,
+    ...              spark_client=spark_client)
+
+    When merge_on is set, this skips the plain dataframe write and runs a
+    Delta merge on the given keys instead.
+    Use it when the table already exists.
""" PARTITION_BY = [ diff --git a/tests/unit/butterfree/transform/conftest.py b/tests/unit/butterfree/transform/conftest.py index 104300c99..d66d1c399 100644 --- a/tests/unit/butterfree/transform/conftest.py +++ b/tests/unit/butterfree/transform/conftest.py @@ -16,6 +16,15 @@ from butterfree.transform.utils import Function +def create_dataframe(data, timestamp_col="ts"): + pdf = ps.DataFrame.from_dict(data) + df = pdf.to_spark() + df = df.withColumn( + TIMESTAMP_COLUMN, df[timestamp_col].cast(DataType.TIMESTAMP.spark) + ) + return df + + def make_dataframe(spark_context, spark_session): data = [ { @@ -54,11 +63,7 @@ def make_dataframe(spark_context, spark_session): "nonfeature": 0, }, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) - - return df + return create_dataframe(data) def make_filtering_dataframe(spark_context, spark_session): @@ -71,11 +76,7 @@ def make_filtering_dataframe(spark_context, spark_session): {"id": 1, "ts": 6, "feature1": None, "feature2": None, "feature3": None}, {"id": 1, "ts": 7, "feature1": None, "feature2": None, "feature3": None}, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) - - return df + return create_dataframe(data) def make_output_filtering_dataframe(spark_context, spark_session): @@ -86,11 +87,7 @@ def make_output_filtering_dataframe(spark_context, spark_session): {"id": 1, "ts": 4, "feature1": 0, "feature2": 1, "feature3": 1}, {"id": 1, "ts": 6, "feature1": None, "feature2": None, "feature3": None}, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn(TIMESTAMP_COLUMN, df.ts.cast(DataType.TIMESTAMP.spark)) - - return df + return create_dataframe(data) def make_rolling_windows_agg_dataframe(spark_context, spark_session): @@ -126,11 +123,7 @@ def make_rolling_windows_agg_dataframe(spark_context, spark_session): "feature2__avg_over_1_week_rolling_windows": None, }, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) - - return df + return create_dataframe(data, timestamp_col="timestamp") def make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): @@ -154,11 +147,7 @@ def make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): "feature2__avg_over_1_day_rolling_windows": 500.0, }, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) - - return df + return create_dataframe(data, timestamp_col="timestamp") def make_multiple_rolling_windows_hour_slide_agg_dataframe( @@ -202,11 +191,7 @@ def make_multiple_rolling_windows_hour_slide_agg_dataframe( "feature2__avg_over_3_days_rolling_windows": 500.0, }, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) - - return df + return create_dataframe(data, timestamp_col="timestamp") def make_fs(spark_context, spark_session): @@ -253,9 +238,7 @@ def make_fs_dataframe_with_distinct(spark_context, spark_session): "h3": "86a8100efffffff", }, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) + df = create_dataframe(data, "timestamp") return df @@ -283,9 +266,7 @@ def make_target_df_distinct(spark_context, spark_session): 
"feature__sum_over_3_days_rolling_windows": None, }, ] - pdf = ps.DataFrame.from_dict(data) - df = pdf.to_spark() - df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) + df = create_dataframe(data, "timestamp") return df From e5305f3d7deae52b78a7cd7c3fc5bb1441d1e020 Mon Sep 17 00:00:00 2001 From: Ralph Filho Date: Thu, 22 Aug 2024 09:36:01 -0300 Subject: [PATCH 70/70] fix: deduplicate --- .../historical_feature_store_writer.py | 2 +- tests/unit/butterfree/transform/conftest.py | 298 ++++++++---------- 2 files changed, 138 insertions(+), 162 deletions(-) diff --git a/butterfree/load/writers/historical_feature_store_writer.py b/butterfree/load/writers/historical_feature_store_writer.py index d286d6e2d..99bfe66a5 100644 --- a/butterfree/load/writers/historical_feature_store_writer.py +++ b/butterfree/load/writers/historical_feature_store_writer.py @@ -251,7 +251,7 @@ def validate( if self.interval_mode and not self.debug_mode else spark_client.read_table(table_name).count() ) - + dataframe_count = dataframe.count() self._assert_validation_count(table_name, written_count, dataframe_count) diff --git a/tests/unit/butterfree/transform/conftest.py b/tests/unit/butterfree/transform/conftest.py index 25d2a47ba..c0ebb47af 100644 --- a/tests/unit/butterfree/transform/conftest.py +++ b/tests/unit/butterfree/transform/conftest.py @@ -1,7 +1,10 @@ +import json from unittest.mock import Mock import pyspark.pandas as ps from pyspark.sql import functions +from pyspark.sql.functions import col +from pyspark.sql.types import TimestampType from pytest import fixture from butterfree.constants import DataType @@ -25,6 +28,74 @@ def create_dataframe(data, timestamp_col="ts"): return df +def create_dataframe_from_data( + spark_context, spark_session, data, timestamp_col="timestamp", use_json=False +): + if use_json: + df = spark_session.read.json( + spark_context.parallelize(data).map(lambda x: json.dumps(x)) + ) + else: + df = create_dataframe(data, timestamp_col=timestamp_col) + + df = df.withColumn(timestamp_col, col(timestamp_col).cast(TimestampType())) + return df + + +def create_rolling_windows_agg_dataframe( + spark_context, spark_session, data, timestamp_col="timestamp", use_json=False +): + if use_json: + df = spark_session.read.json( + spark_context.parallelize(data).map(lambda x: json.dumps(x)) + ) + df = df.withColumn( + timestamp_col, col(timestamp_col).cast(DataType.TIMESTAMP.spark) + ) + else: + df = create_dataframe(data, timestamp_col=timestamp_col) + + return df + + +def build_data(rows, base_features, dynamic_features=None): + """ + Constrói uma lista de dicionários para DataFrame com recursos dinâmicos. + + :param rows: Lista de tuplas com (id, timestamp, base_values, dynamic_values). + :param base_features: Lista de nomes de recursos base (strings). + :param dynamic_features: Lista de nomes de recursos dinâmicos, + mapeando para o índice de dynamic_values (opcional). + :return: Lista de dicionários para criação do DataFrame. 
+ """ + data = [] + for row in rows: + id_value, timestamp_value, base_values, dynamic_values = row + + entry = { + "id": id_value, + "timestamp": timestamp_value, + } + + # Adiciona valores das features base + entry.update( + {feature: value for feature, value in zip(base_features, base_values)} + ) + + # Adiciona valores das features dinâmicas, se houver + if dynamic_features: + entry.update( + { + feature: dynamic_values[idx] + for idx, feature in enumerate(dynamic_features) + } + ) + + data.append(entry) + + return data + + def make_dataframe(spark_context, spark_session): data = [ { @@ -91,186 +162,91 @@ def make_output_filtering_dataframe(spark_context, spark_session): def make_rolling_windows_agg_dataframe(spark_context, spark_session): - data = [ - { - "id": 1, - "timestamp": "2016-04-11 00:00:00", - "feature1__avg_over_1_week_rolling_windows": None, - "feature2__avg_over_1_week_rolling_windows": None, - }, - { - "id": 1, - "timestamp": "2016-04-12 00:00:00", - "feature1__avg_over_1_week_rolling_windows": 300.0, - "feature2__avg_over_1_week_rolling_windows": 350.0, - }, - { - "id": 1, - "timestamp": "2016-04-19 00:00:00", - "feature1__avg_over_1_week_rolling_windows": None, - "feature2__avg_over_1_week_rolling_windows": None, - }, - { - "id": 1, - "timestamp": "2016-04-23 00:00:00", - "feature1__avg_over_1_week_rolling_windows": 1000.0, - "feature2__avg_over_1_week_rolling_windows": 1100.0, - }, - { - "id": 1, - "timestamp": "2016-04-30 00:00:00", - "feature1__avg_over_1_week_rolling_windows": None, - "feature2__avg_over_1_week_rolling_windows": None, - }, + rows = [ + (1, "2016-04-11 00:00:00", [None, None], None), + (1, "2016-04-12 00:00:00", [300.0, 350.0], None), + (1, "2016-04-19 00:00:00", [None, None], None), + (1, "2016-04-23 00:00:00", [1000.0, 1100.0], None), + (1, "2016-04-30 00:00:00", [None, None], None), + ] + + base_features = [ + "feature1__avg_over_1_week_rolling_windows", + "feature2__avg_over_1_week_rolling_windows", ] - return create_dataframe(data, timestamp_col="timestamp") + + data = build_data(rows, base_features) + return create_dataframe_from_data(spark_context, spark_session, data) def make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): - data = [ - { - "id": 1, - "timestamp": "2016-04-11 12:00:00", - "feature1__avg_over_1_day_rolling_windows": 266.6666666666667, - "feature2__avg_over_1_day_rolling_windows": 300.0, - }, - { - "id": 1, - "timestamp": "2016-04-12 00:00:00", - "feature1__avg_over_1_day_rolling_windows": 300.0, - "feature2__avg_over_1_day_rolling_windows": 350.0, - }, - { - "id": 1, - "timestamp": "2016-04-12 12:00:00", - "feature1__avg_over_1_day_rolling_windows": 400.0, - "feature2__avg_over_1_day_rolling_windows": 500.0, - }, + rows = [ + (1, "2016-04-11 12:00:00", [266.6666666666667, 300.0], None), + (1, "2016-04-12 00:00:00", [300.0, 350.0], None), + (1, "2016-04-12 12:00:00", [400.0, 500.0], None), + ] + + base_features = [ + "feature1__avg_over_1_day_rolling_windows", + "feature2__avg_over_1_day_rolling_windows", ] - return create_dataframe(data, timestamp_col="timestamp") + + data = build_data(rows, base_features) + return create_dataframe_from_data(spark_context, spark_session, data) def make_multiple_rolling_windows_hour_slide_agg_dataframe( spark_context, spark_session ): - data = [ - { - "id": 1, - "timestamp": "2016-04-11 12:00:00", - "feature1__avg_over_2_days_rolling_windows": 266.6666666666667, - "feature1__avg_over_3_days_rolling_windows": 266.6666666666667, - 
"feature2__avg_over_2_days_rolling_windows": 300.0, - "feature2__avg_over_3_days_rolling_windows": 300.0, - }, - { - "id": 1, - "timestamp": "2016-04-12 00:00:00", - "feature1__avg_over_2_days_rolling_windows": 300.0, - "feature1__avg_over_3_days_rolling_windows": 300.0, - "feature2__avg_over_2_days_rolling_windows": 350.0, - "feature2__avg_over_3_days_rolling_windows": 350.0, - }, - { - "id": 1, - "timestamp": "2016-04-13 12:00:00", - "feature1__avg_over_2_days_rolling_windows": 400.0, - "feature1__avg_over_3_days_rolling_windows": 300.0, - "feature2__avg_over_2_days_rolling_windows": 500.0, - "feature2__avg_over_3_days_rolling_windows": 350.0, - }, - { - "id": 1, - "timestamp": "2016-04-14 00:00:00", - "feature1__avg_over_3_days_rolling_windows": 300.0, - "feature2__avg_over_3_days_rolling_windows": 350.0, - }, - { - "id": 1, - "timestamp": "2016-04-14 12:00:00", - "feature1__avg_over_3_days_rolling_windows": 400.0, - "feature2__avg_over_3_days_rolling_windows": 500.0, - }, + rows = [ + ( + 1, + "2016-04-11 12:00:00", + [], + [266.6666666666667, 266.6666666666667, 300.0, 300.0], + ), + (1, "2016-04-12 00:00:00", [], [300.0, 300.0, 350.0, 350.0]), + (1, "2016-04-13 12:00:00", [], [400.0, 300.0, 500.0, 350.0]), + (1, "2016-04-14 00:00:00", [], [None, 300.0, None, 350.0]), + (1, "2016-04-14 12:00:00", [], [None, 400.0, None, 500.0]), ] - return create_dataframe(data, timestamp_col="timestamp") - -def make_rolling_windows_hour_slide_agg_dataframe(spark_context, spark_session): - data = [ - { - "id": 1, - "timestamp": "2016-04-11 12:00:00", - "feature1__avg_over_1_day_rolling_windows": 266.6666666666667, - "feature2__avg_over_1_day_rolling_windows": 300.0, - }, - { - "id": 1, - "timestamp": "2016-04-12 00:00:00", - "feature1__avg_over_1_day_rolling_windows": 300.0, - "feature2__avg_over_1_day_rolling_windows": 350.0, - }, - { - "id": 1, - "timestamp": "2016-04-12 12:00:00", - "feature1__avg_over_1_day_rolling_windows": 400.0, - "feature2__avg_over_1_day_rolling_windows": 500.0, - }, + dynamic_features = [ + "feature1__avg_over_2_days_rolling_windows", + "feature1__avg_over_3_days_rolling_windows", + "feature2__avg_over_2_days_rolling_windows", + "feature2__avg_over_3_days_rolling_windows", ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) - df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) - return df + data = build_data(rows, [], dynamic_features=dynamic_features) + return create_dataframe_from_data(spark_context, spark_session, data, use_json=True) -def make_multiple_rolling_windows_hour_slide_agg_dataframe( - spark_context, spark_session +def create_rolling_window_dataframe( + spark_context, spark_session, rows, base_features, dynamic_features=None ): - data = [ - { - "id": 1, - "timestamp": "2016-04-11 12:00:00", - "feature1__avg_over_2_days_rolling_windows": 266.6666666666667, - "feature1__avg_over_3_days_rolling_windows": 266.6666666666667, - "feature2__avg_over_2_days_rolling_windows": 300.0, - "feature2__avg_over_3_days_rolling_windows": 300.0, - }, - { - "id": 1, - "timestamp": "2016-04-12 00:00:00", - "feature1__avg_over_2_days_rolling_windows": 300.0, - "feature1__avg_over_3_days_rolling_windows": 300.0, - "feature2__avg_over_2_days_rolling_windows": 350.0, - "feature2__avg_over_3_days_rolling_windows": 350.0, - }, - { - "id": 1, - "timestamp": "2016-04-13 12:00:00", - "feature1__avg_over_2_days_rolling_windows": 400.0, - "feature1__avg_over_3_days_rolling_windows": 300.0, - 
"feature2__avg_over_2_days_rolling_windows": 500.0, - "feature2__avg_over_3_days_rolling_windows": 350.0, - }, - { - "id": 1, - "timestamp": "2016-04-14 00:00:00", - "feature1__avg_over_3_days_rolling_windows": 300.0, - "feature2__avg_over_3_days_rolling_windows": 350.0, - }, - { - "id": 1, - "timestamp": "2016-04-14 12:00:00", - "feature1__avg_over_3_days_rolling_windows": 400.0, - "feature2__avg_over_3_days_rolling_windows": 500.0, - }, - ] - df = spark_session.read.json( - spark_context.parallelize(data).map(lambda x: json.dumps(x)) - ) + """ + Cria um DataFrame com recursos de rolagem de janelas agregadas. + + :param spark_context: Contexto do Spark. + :param spark_session: Sessão do Spark. + :param rows: Lista de tuplas com (id, timestamp, base_values, dynamic_values). + :param base_features: Lista de nomes de recursos base (strings). + :param dynamic_features: Lista de nomes de recursos dinâmicos, + mapeando para o índice de dynamic_values (opcional). + :return: DataFrame do Spark. + """ + data = build_data(rows, base_features, dynamic_features) + + # Converte a lista de dicionários em um RDD do Spark + rdd = spark_context.parallelize(data).map(lambda x: json.dumps(x)) + + # Cria o DataFrame do Spark a partir do RDD + df = spark_session.read.json(rdd) + + # Converte a coluna "timestamp" para o tipo TIMESTAMP df = df.withColumn("timestamp", df.timestamp.cast(DataType.TIMESTAMP.spark)) - return df - def make_fs(spark_context, spark_session): df = make_dataframe(spark_context, spark_session)