diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml index cedab70234..c83d6615cb 100644 --- a/.circleci/continue_config.yml +++ b/.circleci/continue_config.yml @@ -61,7 +61,7 @@ commands: command: conda install -c conda-forge pytables -y - run: name: Install requirements and test requirements - command: pip install --upgrade -r test_requirements.txt + command: pip install --upgrade .[test] - run: # this is needed to fix java cacerts so # spark can automatically download packages from mvn @@ -146,7 +146,7 @@ commands: steps: - restore_cache: name: Restore package cache - key: kedro-deps-v1-win-{{ checksum "dependency/requirements.txt" }}-{{ checksum "test_requirements.txt" }} + key: kedro-deps-v1-win-{{ checksum "pyproject.toml" }}-{{ checksum "setup.py" }} # We don't restore the conda environment cache for python 3.10 as it conflicts with the # 'Install GDAL, Fiona and pytables' step breaking the conda environment (missing zlib.dll). - unless: @@ -155,7 +155,7 @@ commands: steps: - restore_cache: name: Restore conda environment cache - key: kedro-deps-v1-win-<>-{{ checksum "dependency/requirements.txt" }}-{{ checksum "test_requirements.txt" }} + key: kedro-deps-v1-win-<>-{{ checksum "pyproject.toml" }}-{{ checksum "setup.py" }} # pytables and Fiona have a series of binary dependencies under Windows that # are best handled by conda-installing instead of pip-installing them. # Dependency resolution works best when installing these altogether in one @@ -168,7 +168,7 @@ commands: command: conda activate kedro_builder; pip debug --verbose - run: name: Install all requirements - command: conda activate kedro_builder; pip install -v -r test_requirements.txt -U + command: conda activate kedro_builder; pip install -v -U .[test] - run: name: Print Python environment command: conda activate kedro_builder; make print-python-env @@ -337,7 +337,7 @@ jobs: steps: - save_cache: name: Save Python package cache - key: kedro-deps-v1-win-{{ checksum "dependency/requirements.txt" }}-{{ checksum "test_requirements.txt" }} + key: kedro-deps-v1-win-{{ checksum "pyproject.toml" }}-{{ checksum "setup.py" }} paths: # Cache pip cache and conda packages directories - c:\tools\miniconda3\pkgs @@ -350,7 +350,7 @@ jobs: steps: - save_cache: name: Save conda environment cache - key: kedro-deps-v1-win-<>-{{ checksum "dependency/requirements.txt" }}-{{ checksum "test_requirements.txt" }} + key: kedro-deps-v1-win-<>-{{ checksum "pyproject.toml" }}-{{ checksum "setup.py" }} paths: - c:\tools\miniconda3\envs\kedro_builder - run: diff --git a/.github/ISSUE_TEMPLATE/design-doc.md b/.github/ISSUE_TEMPLATE/design-doc.md index e656bf7b7f..43f70cb12e 100644 --- a/.github/ISSUE_TEMPLATE/design-doc.md +++ b/.github/ISSUE_TEMPLATE/design-doc.md @@ -8,37 +8,29 @@ assignees: '' --- ## Introduction - -A high-level, short overview of the problem(s) you are designing a solution for. + ## Background - -Provide the reader with the context surrounding the problem(s) you are trying to solve. + ## Problem - -Be as concrete as you can about: + ### What's in scope ### What's not in scope ## Design - -Explain your design to the solution here. Diagrams could help. + ### Alternatives considered - -Explain the trade off between different alternatives to your solution. + ## Testing - -Explain the testing strategies to verify your design correctness (if possible). + ## Rollout strategy - -Is the change backward compatible? If not, what is the migration strategy? 
+ ## Future iterations - -Will there be future iterations of this design? + diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md index a7911c2f11..b52de098bd 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.md +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -8,13 +8,13 @@ assignees: '' --- ## Description -Is your feature request related to a problem? A clear and concise description of what the problem is: "I'm always frustrated when ..." + ## Context -Why is this change important to you? How would you use it? How can it benefit other users? + ## Possible Implementation -(Optional) Suggest an idea for implementing the addition or change. + ## Possible Alternatives -(Optional) Describe any alternative solutions or features you've considered. + diff --git a/.github/dependabot.yml b/.github/dependabot.yml index d13ed3ced4..65736fe738 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -6,7 +6,7 @@ version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values - directory: "/dependency" # Location of package manifests + directory: "/" # Location of package manifests schedule: interval: "weekly" labels: diff --git a/.github/workflows/all-checks.yml b/.github/workflows/all-checks.yml new file mode 100644 index 0000000000..51efe0e4db --- /dev/null +++ b/.github/workflows/all-checks.yml @@ -0,0 +1,58 @@ +name: Run all checks on Kedro + +on: + push: + branches: + - main + - develop + paths-ignore: + - "docs/**" + - '**.md' + pull_request: + branches: + - main + - develop + paths-ignore: + - "docs/**" + - '**.md' + +jobs: + unit-tests: + strategy: + matrix: + os: [ ubuntu-latest, windows-latest ] + python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ] + uses: ./.github/workflows/unit-tests.yml + with: + os: ${{ matrix.os }} + python-version: ${{ matrix.python-version }} + + lint: + strategy: + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.11" ] + uses: ./.github/workflows/lint.yml + with: + os: ${{ matrix.os }} + python-version: ${{ matrix.python-version }} + + e2e-tests: + strategy: + matrix: + os: [ ubuntu-latest, windows-latest ] + python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ] + uses: ./.github/workflows/e2e-tests.yml + with: + os: ${{ matrix.os }} + python-version: ${{ matrix.python-version }} + + pip-compile: + strategy: + matrix: + os: [ ubuntu-latest, windows-latest ] + python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ] + uses: ./.github/workflows/pip-compile.yml + with: + os: ${{ matrix.os }} + python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/docs-only-checks.yml b/.github/workflows/docs-only-checks.yml new file mode 100644 index 0000000000..b7940e85be --- /dev/null +++ b/.github/workflows/docs-only-checks.yml @@ -0,0 +1,28 @@ +name: Run linter on Kedro Docs + +on: + push: + branches: + - main + - develop + paths: + - "docs/**" + - '**.md' + pull_request: + branches: + - main + - develop + paths: + - "docs/**" + - '**.md' + +jobs: + lint-tests: + strategy: + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.7", "3.8", "3.9", "3.10", "3.11" ] + uses: ./.github/workflows/lint.yml + with: + os: ${{ matrix.os }} + python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml new file mode 100644 index 0000000000..0369e4b134 --- /dev/null +++ b/.github/workflows/e2e-tests.yml @@ -0,0 +1,48 @@ +name: Run e2e-tests on Kedro + +on: + workflow_call: + inputs: + os: + type: string + 
python-version: + type: string + +env: + COLUMNS: 120 + LINES: 25 + +jobs: + e2e-tests: + runs-on: ${{ inputs.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python ${{inputs.python-version}} + uses: actions/setup-python@v3 + with: + python-version: ${{inputs.python-version}} + - run: make install-pip-setuptools + - name: Cache python packages for Linux + if: inputs.os == 'ubuntu-latest' + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.os}}-python-${{inputs.python-version}} + - name: Cache python packages for Windows + if: inputs.os == 'windows-latest' + uses: actions/cache@v3 + with: + path: ~\AppData\Local\pip\Cache + key: ${{inputs.os}}-python-${{inputs.python-version}} + - name: Add MSBuild to PATH + if: inputs.os == 'windows-latest' + uses: microsoft/setup-msbuild@v1 + - name: Install dependencies + run: | + make install-test-requirements + make install-pre-commit + - name: pip freeze + run: pip freeze + - name: Run e2e tests + run: make e2e-tests diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000..a3222d1d9f --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,33 @@ +name: Run linters on Kedro + +on: + workflow_call: + inputs: + os: + type: string + python-version: + type: string + +jobs: + lint: + runs-on: ${{ inputs.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ inputs.python-version }} + - name: Cache python packages + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.os}}-python-${{inputs.python-version}} + - name: Install dependencies + run: | + make install-test-requirements + make install-pre-commit + - name: pip freeze + run: pip freeze + - name: Run linter + run: make lint diff --git a/.github/workflows/merge-gatekeeper.yml b/.github/workflows/merge-gatekeeper.yml new file mode 100644 index 0000000000..ff716a75fd --- /dev/null +++ b/.github/workflows/merge-gatekeeper.yml @@ -0,0 +1,27 @@ +name: Merge Gatekeeper + +on: + pull_request: + branches: + - main + - develop + +jobs: + merge-gatekeeper: + runs-on: ubuntu-latest + # Restrict permissions of the GITHUB_TOKEN. + # Docs: https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs + permissions: + checks: read + statuses: read + steps: + - name: Run Merge Gatekeeper + # NOTE: v1 is updated to reflect the latest v1.x.y. 
Please use any tag/branch that suits your needs: + # https://github.com/upsidr/merge-gatekeeper/tags + # https://github.com/upsidr/merge-gatekeeper/branches + uses: upsidr/merge-gatekeeper@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + timeout: 1800 + interval: 30 + ignored: 'ci/circleci: win_e2e_tests-3.7,ci/circleci: win_pip_compile-3.9,ci/circleci: win_e2e_tests-3.9,ci/circleci: win_pip_compile-3.8,ci/circleci: lint-3.7,ci/circleci: win_pip_compile-3.7,ci/circleci: pip_compile-3.7,ci/circleci: e2e_tests-3.7,ci/circleci: win_unit_tests-3.7,ci/circleci: win_unit_tests-3.9,ci/circleci: e2e_tests-3.8,ci/circleci: win_unit_tests-3.10,ci/circleci: win_pip_compile-3.10,ci/circleci: win_unit_tests-3.8,ci/circleci: e2e_tests-3.9,ci/circleci: unit_tests-3.10,ci/circleci: unit_tests-3.8,ci/circleci: e2e_tests-3.10,ci/circleci: lint-3.8,ci/circleci: unit_tests-3.9,ci/circleci: unit_tests-3.7,ci/circleci: win_e2e_tests-3.10,ci/circleci: pip_compile-3.8,ci/circleci: pip_compile-3.10,ci/circleci: win_e2e_tests-3.8,ci/circleci: lint-3.9,ci/circleci: pip_compile-3.9,ci/circleci: lint-3.10,build_code,ci/circleci: check-updated-files,regular' diff --git a/.github/workflows/pip-compile.yml b/.github/workflows/pip-compile.yml new file mode 100644 index 0000000000..b5b1453782 --- /dev/null +++ b/.github/workflows/pip-compile.yml @@ -0,0 +1,42 @@ +name: Run pip-compile + +on: + workflow_call: + inputs: + os: + type: string + python-version: + type: string + +jobs: + pip-compile: + runs-on: ${{ inputs.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python ${{inputs.python-version}} + uses: actions/setup-python@v3 + with: + python-version: ${{inputs.python-version}} + - run: make install-pip-setuptools + - name: Cache python packages for Linux + if: inputs.os == 'ubuntu-latest' + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.os}}-python-${{inputs.python-version}} + - name: Cache python packages for Windows + if: inputs.os == 'windows-latest' + uses: actions/cache@v3 + with: + path: ~\AppData\Local\pip\Cache + key: ${{inputs.os}}-python-${{inputs.python-version}} + - name: Add MSBuild to PATH + if: inputs.os == 'windows-latest' + uses: microsoft/setup-msbuild@v1 + - name: Install dependencies + run: | + make install-test-requirements + make install-pre-commit + - name: Run pip-compile + run: make pip-compile diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 0000000000..c56a67c707 --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,50 @@ +name: Run unit-tests on Kedro + +on: + workflow_call: + inputs: + os: + type: string + python-version: + type: string +jobs: + unit-tests: + runs-on: ${{ inputs.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python ${{inputs.python-version}} + uses: actions/setup-python@v3 + with: + python-version: ${{inputs.python-version}} + - run: make install-pip-setuptools + - name: Cache python packages for Linux + if: inputs.os == 'ubuntu-latest' + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.os}}-python-${{inputs.python-version}} + - name: Cache python packages for Windows + if: inputs.os == 'windows-latest' + uses: actions/cache@v3 + with: + path: ~\AppData\Local\pip\Cache + key: ${{inputs.os}}-python-${{inputs.python-version}} + - name: Add MSBuild to PATH + if: inputs.os == 'windows-latest' + uses: microsoft/setup-msbuild@v1 + - name: Install dependencies + run: | + make 
install-test-requirements + make install-pre-commit + - name: Install pytables (only for windows) + if: inputs.os == 'windows-latest' + run: pip install tables + - name: pip freeze + run: pip freeze + - name: Run unit tests + if: inputs.os == 'ubuntu-latest' + run: make test + - name: Run unit tests without spark (Windows) + if: inputs.os == 'windows-latest' + run: make test-no-spark diff --git a/.gitpod.yml b/.gitpod.yml index c2ec591de9..6fe5e8e825 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -5,10 +5,9 @@ tasks: init: | make sign-off - pip install -e /workspace/kedro + pip install -e /workspace/kedro[test] cd /workspace yes project | kedro new -s pandas-iris --checkout main - pip install -r /workspace/kedro/test_requirements.txt cd /workspace/kedro pre-commit install --install-hooks diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 163bff3f2f..57bafd2416 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,6 +19,28 @@ repos: exclude: "^kedro/templates/|^features/steps/test_starter/" - id: requirements-txt-fixer # Sorts entries in requirements.txt exclude: "^kedro/templates/|^features/steps/test_starter/" + + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.961 + hooks: + - id: mypy + args: [--allow-redefinition, --ignore-missing-imports] + exclude: | + (?x)( + ^kedro/templates/| + ^docs/| + ^features/steps/test_starter/ + ) + additional_dependencies: + - types-cachetools + - types-filelock + - types-PyYAML + - types-redis + - types-requests + - types-setuptools + - types-toml + - attrs + - repo: https://github.com/asottile/blacken-docs rev: v1.12.1 hooks: @@ -26,20 +48,28 @@ repos: additional_dependencies: [black~=22.0] entry: blacken-docs --skip-errors - - repo: https://github.com/asottile/pyupgrade - rev: v2.26.0 + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.0.277 + hooks: + - id: ruff + name: "ruff on kedro/" + args: ["--fix", "--show-fixes", "--exit-non-zero-on-fix"] + exclude: "^kedro/templates/|^features/steps/test_starter/|tests|docs" + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.0.277 hooks: - - id: pyupgrade - args: [--py36-plus] + - id: ruff + name: "ruff on tests/ and docs/" + # PLR2004: Magic value used + # PLR0913: Too many arguments + args: ["--fix", "--show-fixes", "--exit-non-zero-on-fix", "--ignore=PLR2004,PLR0913"] + # include: "tests" + exclude: "^kedro/templates/|^features/steps/test_starter/|kedro" - repo: local hooks: - - id: isort - name: "Sort imports" - language: system - types: [file, python] - exclude: ^kedro/templates/|^features/steps/test_starter - entry: isort - id: black name: "Black" language: system @@ -53,65 +83,6 @@ repos: pass_filenames: false entry: lint-imports - - # It's impossible to specify per-directory configuration, so we just run it many times. - # https://github.com/PyCQA/pylint/issues/618 - # The first set of pylint checks if for local pre-commit, it only runs on the files changed. 
- - id: pylint-quick-kedro - name: "Quick Pylint on kedro/*" - language: system - types: [file, python] - files: ^kedro/ - exclude: ^kedro/templates/ - entry: pylint -j 4 --disable=unnecessary-pass - stages: [commit] - - id: pylint-quick-tests - name: "Quick Pylint on tests/*" - language: system - types: [file, python] - files: ^tests/ - entry: pylint -j 4 --disable=missing-docstring,redefined-outer-name,no-self-use,invalid-name,protected-access,too-many-arguments,too-many-public-methods - stages: [commit] - - id: pylint-quick-features - name: "Quick Pylint on features/*" - language: system - types: [file, python] - files: ^features/ - exclude: ^features/steps/test_starter - entry: pylint -j 4 --disable=missing-docstring,no-name-in-module - stages: [commit] - - # The same pylint checks, but running on all files. It's for manual run with `make lint` - - id: pylint-kedro - name: "Pylint on kedro/*" - language: system - pass_filenames: false - stages: [manual] - entry: pylint -j 4 --disable=unnecessary-pass --init-hook="import sys; sys.setrecursionlimit(2000)" kedro - - id: pylint-tests - name: "Pylint on tests/*" - language: system - pass_filenames: false - stages: [manual] - entry: pylint -j 4 --disable=missing-docstring,redefined-outer-name,no-self-use,invalid-name,protected-access,too-many-arguments,too-many-public-methods,use-implicit-booleaness-not-comparison tests - - id: pylint-features - name: "Pylint on features/*" - language: system - pass_filenames: false - stages: [manual] - exclude: ^features/steps/test_starter - entry: pylint -j 4 --disable=missing-docstring,no-name-in-module features - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.2.3 - hooks: - - id: flake8 - args: - - "--max-line-length=88" - - "--max-complexity=18" - - "--max-complexity=18" - - "--select=B,C,E,F,W,T4,B9" - - "--ignore=E203,E266,E501,W503" - exclude: "^kedro/templates/|^features/steps/test_starter/" - repo: https://github.com/pre-commit/mirrors-mypy rev: v0.961 hooks: @@ -147,3 +118,5 @@ repos: types: [file, python] exclude: ^kedro/templates/|^tests/|^features/steps/test_starter entry: bandit -ll + +# Manual only diff --git a/.readthedocs.yml b/.readthedocs.yml index 2df6853225..2435aac483 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -36,4 +36,4 @@ python: path: . 
extra_requirements: - docs - - requirements: test_requirements.txt + - test diff --git a/CITATION.cff b/CITATION.cff index 256c577eb8..6ebc728b72 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -5,6 +5,8 @@ authors: given-names: Sajid - family-names: Chan given-names: Nok Lam +- family-names: Couto + given-names: Laura - family-names: Dada given-names: Yetunde - family-names: Danov @@ -13,6 +15,8 @@ authors: given-names: Deepyaman - family-names: DeBold given-names: Tynan +- family-names: Gundaniya + given-names: Jitendra - family-names: Holzer given-names: Jannic - family-names: Kaiser @@ -33,6 +37,8 @@ authors: given-names: Antony - family-names: Nguyen given-names: Huong +- family-names: Nikolic + given-names: Vladimir - family-names: Okwa given-names: Nero - family-names: Cano Rodríguez @@ -40,11 +46,13 @@ authors: orcid: https://orcid.org/0000-0002-2187-161X - family-names: Schwarzmann given-names: Joel +- family-names: Sorokin + given-names: Dmitry - family-names: Stichbury given-names: Jo - family-names: Theisen given-names: Merel title: Kedro -version: 0.18.11 -date-released: 2023-07-03 +version: 0.18.12 +date-released: 2023-07-31 url: https://github.com/kedro-org/kedro diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 40da48ccdf..bbc850442d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,41 +1,26 @@ # Introduction -We welcome any and all contributions to Kedro, at whatever level you can manage. For example, you could: - -- [Join the community](#join-the-community-and-help-it-grow) -- [Contribute to the project](#contribute-to-the-project) -- [Join our Technical Steering Committee](#join-our-technical-steering-committee) - - -You can also suggest anything else that you think improves the community for us all! - -## Code of conduct - -The Kedro team pledges to foster and maintain a friendly community. We enforce a [Code of Conduct](./CODE_OF_CONDUCT.md) to ensure every Kedroid is welcomed and treated with respect. +We welcome any and all contributions to Kedro, at whatever level you can manage. Here are a few suggestions, but you are welcome to suggest anything else that you think improves the community for us all! ## Join the community -You can find the Kedro community on our [Slack organisation](https://slack.kedro.org/), which is where we share news and announcements, and general chat. You're also welcome to post links here to any articles or videos about Kedro that you create, or find, such as how-tos, showcases, demos, blog posts or tutorials. +You can find the Kedro community on our [Slack organisation](https://slack.kedro.org/), which is where we share news and announcements, and answer technical questions. You're welcome to post links to any articles or videos about Kedro that you create or find, such as how-tos, showcases, demos, blog posts or tutorials. -We also curate a [GitHub repo that lists content created by the Kedro community](https://github.com/kedro-org/awesome-kedro). +We also curate a [GitHub repo that lists content created by the Kedro community](https://github.com/kedro-org/awesome-kedro). If you've made something with Kedro, simply add it to the list with a PR! ## Contribute to the project -There are quite a few ways to contribute to the project, find inspiration from the table below. +There are quite a few ways to contribute to Kedro, such as answering questions about Kedro to help others, fixing a typo in the documentation, reporting a bug, reviewing pull requests or adding a feature. 
-|Activity|Description| -|-|-| -|Community Q&A|We encourage you to ask and answer technical questions on [GitHub discussions](https://github.com/kedro-org/kedro/discussions) or [Slack](https://slack.kedro.org/), but the former is often preferable since it will be picked up by search engines.| -|Report bugs and security vulnerabilities |We use [GitHub issues](https://github.com/kedro-org/kedro/issues) to keep track of known bugs and security vulnerabilities. We keep a close eye on them and update them when we have an internal fix in progress. Before you report a new issue, do your best to ensure your problem hasn't already been reported. If it has, just leave a comment on the existing issue, rather than create a new one.
If you have already checked the existing [GitHub issues](https://github.com/kedro-org/kedro/issues) and are still convinced that you have found odd or erroneous behaviour then please file a new one.| -|Propose a new feature|If you have new ideas for Kedro functionality then please open a [GitHub issue](https://github.com/kedro-org/kedro/issues) and describe the feature you would like to see, why you need it, and how it should work.| -|Review pull requests|Check the [Kedro repo to find open pull requests](https://github.com/kedro-org/kedro/pulls) and contribute a review!| -|Contribute a fix or feature|If you're interested in contributing fixes to code or documentation, first read our [guidelines for contributing developers](https://docs.kedro.org/en/stable/contribution/developer_contributor_guidelines.html) for an explanation of how to get set up and the process you'll follow. Once you are ready to contribute, a good place to start is to take a look at the `good first issues` and `help wanted issues` on [GitHub](https://github.com/kedro-org/kedro/issues).| -|Contribute to the documentation|You can help us improve the [Kedro documentation online](https://docs.kedro.org/en/stable/). Send us feedback as a [GitHub issue](https://github.com/kedro-org/kedro/issues) or start a documentation discussion on [GitHub](https://github.com/kedro-org/kedro/discussions).You are also welcome to make a raise a PR with a bug fix or addition to the documentation. First read the guide [Contribute to the Kedro documentation](https://docs.kedro.org/en/stable/contribution/documentation_contributor_guidelines.html). +Take a look at some of our [contribution suggestions on the Kedro GitHub Wiki](https://github.com/kedro-org/kedro/wiki/Contribute-to-Kedro)! +## Join the Technical Steering Committee +Kedro is an incubating project in [LF AI & Data](https://lfaidata.foundation/), a sub-organisation within the Linux Foundation that focuses on open innovation within the data and AI space. -## Join our Technical Steering Committee +The project is governed by a group of maintainers, known as the Technical Steering Committee (TSC); read more about the structure of our TSC in our [Technical Charter](./kedro_technical_charter.pdf). -Kedro is an incubating project in [LF AI & Data](https://lfaidata.foundation/), a sub-organisation within the Linux -Foundation that focuses on open innovation within the data and AI space. A group of maintainers, known as the Technical Steering Committee (TSC), govern the project. You can read more about the structure of our TSC in our [Technical Charter](./kedro_technical_charter.pdf). +We regularly invite community members to join the TSC and help define the future of the Kedro project. Read the [guidance on becoming a Kedro maintainer](https://docs.kedro.org/en/stable/contribution/technical_steering_committee.html) to understand the process of joining the TSC. -We invite community members to join the TSC and help define the future of the Kedro project. Read the [guidance on becoming a Kedro maintainer](https://docs.kedro.org/en/stable/contribution/technical_steering_committee.html) to understand the process of joining the TSC. +## Code of conduct + +The Kedro team pledges to foster and maintain a friendly community. We enforce a [Code of Conduct](./CODE_OF_CONDUCT.md) to ensure every Kedroid is welcomed and treated with respect. 
diff --git a/MANIFEST.in b/MANIFEST.in index 245671a5e0..ad41ac26a3 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,5 @@ include README.md include LICENSE.md -include dependency/requirements.txt -include test_requirements.txt include kedro/framework/project/default_logging.yml include kedro/ipython/*.png include kedro/ipython/*.svg diff --git a/Makefile b/Makefile index e680e12620..2e8436390b 100644 --- a/Makefile +++ b/Makefile @@ -8,17 +8,22 @@ clean: pre-commit clean || true install-pip-setuptools: - pip install -U "pip>=21.2" "setuptools>=65.5.1" wheel + python -m pip install -U "pip>=21.2, <23.2" "setuptools>=65.5.1" wheel lint: pre-commit run -a --hook-stage manual $(hook) - test: pytest --numprocesses 4 --dist loadfile test-no-spark: pytest --no-cov --ignore tests/extras/datasets/spark --numprocesses 4 --dist loadfile +test-sequential: + pytest tests --cov-config pyproject.toml + +test-no-spark-sequential: + pytest tests --no-cov --ignore tests/extras/datasets/spark + test-no-datasets: pytest --no-cov --ignore tests/extras/datasets/ --numprocesses 4 --dist loadfile @@ -48,7 +53,7 @@ package: clean install python -m pip install build && python -m build install-test-requirements: - pip install -r test_requirements.txt + pip install .[test] install-pre-commit: install-test-requirements pre-commit install --install-hooks diff --git a/README.md b/README.md index 80d92f723e..f329a8331f 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,21 @@ ![Kedro Logo Banner - Light](.github/demo-dark.png#gh-dark-mode-only) ![Kedro Logo Banner - Dark](.github/demo-light.png#gh-light-mode-only) -[![Python version](https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue.svg)](https://pypi.org/project/kedro/) +[![Python version](https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue.svg)](https://pypi.org/project/kedro/) [![PyPI version](https://badge.fury.io/py/kedro.svg)](https://pypi.org/project/kedro/) [![Conda version](https://img.shields.io/conda/vn/conda-forge/kedro.svg)](https://anaconda.org/conda-forge/kedro) [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/kedro-org/kedro/blob/main/LICENSE.md) [![Slack Organisation](https://img.shields.io/badge/slack-chat-blueviolet.svg?label=Kedro%20Slack&logo=slack)](https://slack.kedro.org) +[![Slack Archive](https://img.shields.io/badge/slack-archive-blueviolet.svg?label=Kedro%20Slack%20)](https://linen-slack.kedro.org/) ![CircleCI - Main Branch](https://img.shields.io/circleci/build/github/kedro-org/kedro/main?label=main) ![Develop Branch Build](https://img.shields.io/circleci/build/github/kedro-org/kedro/develop?label=develop) [![Documentation](https://readthedocs.org/projects/kedro/badge/?version=stable)](https://docs.kedro.org/) [![OpenSSF Best Practices](https://bestpractices.coreinfrastructure.org/projects/6711/badge)](https://bestpractices.coreinfrastructure.org/projects/6711) +[![Monthly downloads](https://static.pepy.tech/badge/kedro/month)](https://pepy.tech/project/kedro) +[![Total downloads](https://static.pepy.tech/badge/kedro)](https://pepy.tech/project/kedro) ## What is Kedro? -Kedro is a toolbox for production-ready data science. It uses software engineering best practices to help you create data engineering and data science pipelines that are reproducible, maintainable, and modular. +Kedro is a toolbox for production-ready data science. 
It uses software engineering best practices to help you create data engineering and data science pipelines that are reproducible, maintainable, and modular. You can find out more at [kedro.org](https://kedro.org). Kedro is an open-source Python framework hosted by the [LF AI & Data Foundation](https://lfaidata.foundation/). @@ -49,12 +52,10 @@ _A pipeline visualisation generated using [Kedro-Viz](https://github.com/kedro-o The [Kedro documentation](https://docs.kedro.org/en/stable/) first explains [how to install Kedro](https://docs.kedro.org/en/stable/get_started/install.html) and then introduces [key Kedro concepts](https://docs.kedro.org/en/stable/get_started/kedro_concepts.html). -- The first example illustrates the [basics of a Kedro project](https://docs.kedro.org/en/stable/get_started/new_project.html) using the Iris dataset -- You can then review the [spaceflights tutorial](https://docs.kedro.org/en/stable/tutorial/tutorial_template.html) to build a Kedro project for hands-on experience +You can then review the [spaceflights tutorial](https://docs.kedro.org/en/stable/tutorial/spaceflights_tutorial.html) to build a Kedro project for hands-on experience -For new and intermediate Kedro users, there's a comprehensive section on [how to visualise Kedro projects using Kedro-Viz](https://docs.kedro.org/en/stable/visualisation/kedro-viz_visualisation.html) and [how to work with Kedro and Jupyter notebooks](https://docs.kedro.org/en/stable/notebooks_and_ipython/kedro_and_notebooks). +For new and intermediate Kedro users, there's a comprehensive section on [how to visualise Kedro projects using Kedro-Viz](https://docs.kedro.org/en/stable/visualisation/index.html) and [how to work with Kedro and Jupyter notebooks](https://docs.kedro.org/en/stable/notebooks_and_ipython/index.html). We also recommend the [API reference documentation](/kedro) for additional information. -Further documentation is available for more advanced Kedro usage and deployment. We also recommend the [glossary](https://docs.kedro.org/en/stable/resources/glossary.html) and the [API reference documentation](/kedro) for additional information. ## Why does Kedro exist? @@ -66,64 +67,21 @@ Kedro is built upon our collective best-practice (and mistakes) trying to delive - To increase efficiency, because applied concepts like modularity and separation of concerns inspire the creation of **reusable analytics code** +Find out more about how Kedro can answer your use cases from the [product FAQs on the Kedro website](https://kedro.org/#faq). + ## The humans behind Kedro The [Kedro product team](https://docs.kedro.org/en/stable/contribution/technical_steering_committee.html#kedro-maintainers) and a number of [open source contributors from across the world](https://github.com/kedro-org/kedro/releases) maintain Kedro. ## Can I contribute? -Yes! Want to help build Kedro? Check out our [guide to contributing to Kedro](https://github.com/kedro-org/kedro/blob/main/CONTRIBUTING.md). +Yes! We welcome all kinds of contributions. Check out our [guide to contributing to Kedro](https://github.com/kedro-org/kedro/wiki/Contribute-to-Kedro). ## Where can I learn more? -There is a growing community around Kedro. Have a look at the [Kedro FAQs](https://docs.kedro.org/en/stable/faq/faq.html#how-can-i-find-out-more-about-kedro) to find projects using Kedro and links to articles, podcasts and talks. - -## Who likes Kedro? 
- -There are Kedro users across the world, who work at start-ups, major enterprises and academic institutions like [Absa](https://www.absa.co.za/), -[Acensi](https://acensi.eu/page/home), -[Advanced Programming Solutions SL](https://www.linkedin.com/feed/update/urn:li:activity:6863494681372721152/), -[AI Singapore](https://makerspace.aisingapore.org/2020/08/leveraging-kedro-in-100e/), -[AMAI GmbH](https://www.am.ai/), -[Augment Partners](https://www.linkedin.com/posts/augment-partners_kedro-cheat-sheet-by-augment-activity-6858927624631283712-Ivqk), -[AXA UK](https://www.axa.co.uk/), -[Belfius](https://www.linkedin.com/posts/vangansen_mlops-machinelearning-kedro-activity-6772379995953238016-JUmo), -[Beamery](https://medium.com/hacking-talent/production-code-for-data-science-and-our-experience-with-kedro-60bb69934d1f), -[Caterpillar](https://www.caterpillar.com/), -[CRIM](https://www.crim.ca/en/), -[Dendra Systems](https://www.dendra.io/), -[Element AI](https://www.elementai.com/), -[GetInData](https://getindata.com/blog/running-machine-learning-pipelines-kedro-kubeflow-airflow), -[GMO](https://recruit.gmo.jp/engineer/jisedai/engineer/jisedai/engineer/jisedai/engineer/jisedai/engineer/jisedai/blog/kedro_and_mlflow_tracking/), -[Indicium](https://medium.com/indiciumtech/how-to-build-models-as-products-using-mlops-part-2-machine-learning-pipelines-with-kedro-10337c48de92), -[Imperial College London](https://github.com/dssg/barefoot-winnie-public), -[ING](https://www.ing.com), -[Jungle Scout](https://junglescouteng.medium.com/jungle-scout-case-study-kedro-airflow-and-mlflow-use-on-production-code-150d7231d42e), -[Helvetas](https://www.linkedin.com/posts/lionel-trebuchon_mlflow-kedro-ml-ugcPost-6747074322164154368-umKw), -[Leapfrog](https://www.lftechnology.com/blog/ai-pipeline-kedro/), -[McKinsey & Company](https://www.mckinsey.com/alumni/news-and-insights/global-news/firm-news/kedro-from-proprietary-to-open-source), -[Mercado Libre Argentina](https://www.mercadolibre.com.ar), -[Modec](https://www.modec.com/), -[Mosaic Data Science](https://www.youtube.com/watch?v=fCWGevB366g), -[NaranjaX](https://www.youtube.com/watch?v=_0kMmRfltEQ), -[NASA](https://github.com/nasa/ML-airport-taxi-out), -[NHS AI Lab](https://nhsx.github.io/skunkworks/synthetic-data-pipeline), -[Open Data Science LatAm](https://www.odesla.org/), -[Prediqt](https://prediqt.co/), -[QuantumBlack](https://medium.com/quantumblack/introducing-kedro-the-open-source-library-for-production-ready-machine-learning-code-d1c6d26ce2cf), -[ReSpo.Vision](https://neptune.ai/customers/respo-vision), -[Retrieva](https://tech.retrieva.jp/entry/2020/07/28/181414), -[Roche](https://www.roche.com/), -[Sber](https://www.linkedin.com/posts/seleznev-artem_welcome-to-kedros-documentation-kedro-activity-6767523561109385216-woTt), -[Société Générale](https://www.societegenerale.com/en), -[Telkomsel](https://medium.com/life-at-telkomsel/how-we-build-a-production-grade-data-pipeline-7004e56c8c98), -[Universidad Rey Juan Carlos](https://github.com/vchaparro/MasterThesis-wind-power-forecasting/blob/master/thesis.pdf), -[UrbanLogiq](https://urbanlogiq.com/), -[Wildlife Studios](https://wildlifestudios.com), -[WovenLight](https://www.wovenlight.com/) and -[XP](https://youtu.be/wgnGOVNkXqU?t=2210). 
- -Kedro won [Best Technical Tool or Framework for AI](https://awards.ai/the-awards/previous-awards/the-4th-ai-award-winners/) in the 2019 Awards AI competition and a merit award for the 2020 [UK Technical Communication Awards](https://uktcawards.com/announcing-the-award-winners-for-2020/). It is listed on the 2020 [ThoughtWorks Technology Radar](https://www.thoughtworks.com/radar/languages-and-frameworks/kedro) and the 2020 [Data & AI Landscape](https://mattturck.com/data2020/). Kedro has received an [honorable mention in the User Experience category in Fast Company’s 2022 Innovation by Design Awards](https://www.fastcompany.com/90772252/user-experience-innovation-by-design-2022). +There is a growing community around Kedro. We encourage you to ask and answer technical questions on [Slack](https://slack.kedro.org/) and bookmark the [Linen archive of past discussions](https://linen-slack.kedro.org/). + +We keep a list of [technical FAQs in the Kedro documentation](https://docs.kedro.org/en/stable/faq/faq.html) and you can find a growing list of blog posts, videos and projects that use Kedro over on the [`awesome-kedro` GitHub repository](https://github.com/kedro-org/awesome-kedro). If you have created anything with Kedro we'd love to include it on the list. Just make a PR to add it! ## How can I cite Kedro? diff --git a/RELEASE.md b/RELEASE.md index 8ad51731ab..ea0fce323a 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -8,14 +8,23 @@ ## Migration guide from Kedro 0.18.* to 0.19.* -# Upcoming Release 0.18.12 +# Upcoming Release 0.18.13 ## Major features and improvements -* Added dataset factories feature which uses pattern matching to reduce the number of catalog entries. +* Allowed registering of custom resolvers to `OmegaConfigLoader` through `CONFIG_LOADER_ARGS`. +* Added support for Python 3.11. This includes tackling challenges like dependency pinning and test adjustments to ensure a smooth experience. Detailed migration tips are provided below for further context. ## Bug fixes and other changes +* Updated `kedro pipeline create` and `kedro catalog create` to use new `/conf` file structure. ## Documentation changes +* Update example of using generator functions in nodes. +* Added migration guide from the `ConfigLoader` to the `OmegaConfigLoader`. The `ConfigLoader` is deprecated and will be removed in the `0.19.0` release. + +## Migration Tips for Python 3.11: +* PyTables on Windows: Users on Windows with Python >=3.8 should note we've pinned `pytables` to `3.8.0` due to compatibility issues. +* Spark Dependency: We've set an upper version limit for `pyspark` at <3.4 due to breaking changes in 3.4. +* Testing with Python 3.10: The latest `moto` version now supports parallel test execution for Python 3.10, resolving previous issues. ## Breaking changes to the API @@ -27,6 +36,29 @@ | `AbstractDataset` | `AbstractDataSet` | `kedro.io.core` | | `AbstractVersionedDataset` | `AbstractVersionedDataSet` | `kedro.io.core` | +# Release 0.18.12 + +## Major features and improvements +* Added dataset factories feature which uses pattern matching to reduce the number of catalog entries. +* Activated all built-in resolvers by default for `OmegaConfigLoader` except for `oc.env`. +* Added `kedro catalog rank` CLI command that ranks dataset factories in the catalog by matching priority. + +## Bug fixes and other changes +* Consolidated dependencies and optional dependencies in `pyproject.toml`. +* Made validation of unique node outputs much faster. 
+* Updated `kedro catalog list` to show datasets generated with factories. + +## Documentation changes +* Recommended `ruff` as the linter and removed mentions of `pylint`, `isort`, `flake8`. + +## Community contributions +Thanks to [Laíza Milena Scheid Parizotto](https://github.com/laizaparizotto) and [Chris Schopp](https://github.com/cschopp-simwell). + +## Breaking changes to the API + +## Upcoming deprecations for Kedro 0.19.0 +* `ConfigLoader` and `TemplatedConfigLoader` will be deprecated. Please use `OmegaConfigLoader` instead. + # Release 0.18.11 ## Major features and improvements diff --git a/dependency/requirements.txt b/dependency/requirements.txt deleted file mode 100644 index 14b8e2f244..0000000000 --- a/dependency/requirements.txt +++ /dev/null @@ -1,24 +0,0 @@ -anyconfig~=0.10.0 -attrs>=21.3 -build -cachetools~=5.3 -click<9.0 -cookiecutter>=2.1.1, <3.0 -dynaconf>=3.1.2, <4.0 -fsspec>=2021.4, <2024.1 # Upper bound set arbitrarily, to be reassessed in early 2024 -gitpython~=3.0 -importlib-metadata>=3.6; python_version >= '3.8' -importlib_metadata>=3.6, <5.0; python_version < '3.8' # The "selectable" entry points were introduced in `importlib_metadata` 3.6 and Python 3.10. Bandit on Python 3.7 relies on a library with `importlib_metadata` < 5.0 -importlib_resources>=1.3 # The `files()` API was introduced in `importlib_resources` 1.3 and Python 3.9. -jmespath>=0.9.5, <1.0 -more_itertools~=9.0 -omegaconf~=2.3 -parse~=1.19.0 -pip-tools~=6.5 -pluggy~=1.0 -PyYAML>=4.2, <7.0 -rich>=12.0, <14.0 -rope>=0.21, <2.0 # subject to LGPLv3 license -setuptools>=65.5.1 -toml~=0.10 -toposort~=1.5 # Needs to be at least 1.5 to be able to raise CircularDependencyError diff --git a/docs/source/_static/css/theme-overrides.css b/docs/source/_static/css/theme-overrides.css index 8ae384dd9a..bdf28d927c 100644 --- a/docs/source/_static/css/theme-overrides.css +++ b/docs/source/_static/css/theme-overrides.css @@ -26,3 +26,10 @@ img[alt^="mermaid-"] { .rst-content .important .admonition-title { background-color: #f0b37e; } + +/* Ensure the section title is visible when linked via a hash in the URL */ +:target:before { + content: ""; + display: block; + height: 80px; +} diff --git a/docs/source/conf.py b/docs/source/conf.py index ab1f181a48..804bbbbfa9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -142,6 +142,7 @@ "PluginManager", "_DI", "_DO", + "deltalake.table.Metadata", # The statements below were added after subclassing UserDict in AbstractConfigLoader. "None. 
Remove all items from D.", "a shallow copy of D", @@ -224,7 +225,9 @@ "https://github.com/kedro-org/kedro/blob/main/kedro/framework/project/default_logging.yml", "https://github.com/kedro-org/kedro/blob/main/README.md#the-humans-behind-kedro", # "anchor not found" but is valid "https://opensource.org/license/apache2-0-php/", - "https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password" + "https://docs.github.com/en/rest/overview/other-authentication-methods#via-username-and-password", + "https://docs.snowflake.com/en/developer-guide/snowpark/reference/python/api/snowflake.snowpark.DataFrameWriter.saveAsTable.html", + "https://www.educative.io/blog/advanced-yaml-syntax-cheatsheet#anchors" ] # retry before render a link broken (fix for "too many requests") @@ -498,8 +501,7 @@ def _add_jinja_filters(app): # LaTeXBuilder is used in the PDF docs build, # and it doesn't have attribute 'templates' if not ( - isinstance(app.builder, LaTeXBuilder) - or isinstance(app.builder, CheckExternalLinksBuilder) + isinstance(app.builder, (LaTeXBuilder,CheckExternalLinksBuilder)) ): app.builder.templates.environment.filters["env_override"] = env_override diff --git a/docs/source/configuration/advanced_configuration.md b/docs/source/configuration/advanced_configuration.md index efd71a8564..ecfe45a71f 100644 --- a/docs/source/configuration/advanced_configuration.md +++ b/docs/source/configuration/advanced_configuration.md @@ -124,7 +124,7 @@ This section contains a set of guidance for advanced configuration requirements * [How to bypass the configuration loading rules](#how-to-bypass-the-configuration-loading-rules) * [How to use Jinja2 syntax in configuration](#how-to-use-jinja2-syntax-in-configuration) * [How to do templating with the `OmegaConfigLoader`](#how-to-do-templating-with-the-omegaconfigloader) -* [How to use custom resolvers in the `OmegaConfigLoader`](#how-to-use-custom-resolvers-in-the-omegaconfigloader) +* [How to use resolvers in the `OmegaConfigLoader`](#how-to-use-resolvers-in-the-omegaconfigloader) * [How to load credentials through environment variables](#how-to-load-credentials-through-environment-variables) ### How to change which configuration files are loaded @@ -262,58 +262,75 @@ Since both of the file names (`catalog.yml` and `catalog_globals.yml`) match the #### Other configuration files It's also possible to use variable interpolation in configuration files other than parameters and catalog, such as custom spark or mlflow configuration. This works in the same way as variable interpolation in parameter files. You can still use the underscore for the templated values if you want, but it's not mandatory like it is for catalog files. -### How to use custom resolvers in the `OmegaConfigLoader` -`Omegaconf` provides functionality to [register custom resolvers](https://omegaconf.readthedocs.io/en/2.3_branch/usage.html#resolvers) for templated values. You can use these custom resolves within Kedro by extending the [`OmegaConfigLoader`](/kedro.config.OmegaConfigLoader) class. +### How to use resolvers in the `OmegaConfigLoader` +Instead of hard-coding values in your configuration files, you can also dynamically compute them using [`OmegaConf`'s +resolvers functionality](https://omegaconf.readthedocs.io/en/2.3_branch/custom_resolvers.html#resolvers). You use resolvers to define custom +logic to calculate values of parameters or catalog entries, or inject these values from elsewhere. 
To use this feature with Kedro, pass a +`dict` of custom resolvers to `OmegaConfigLoader` through `CONFIG_LOADER_ARGS` in your project's `src//settings.py`. The example below illustrates this: ```python +import polars as pl +from datetime import date + from kedro.config import OmegaConfigLoader -from omegaconf import OmegaConf -from typing import Any, Dict - - -class CustomOmegaConfigLoader(OmegaConfigLoader): - def __init__( - self, - conf_source: str, - env: str = None, - runtime_params: Dict[str, Any] = None, - ): - super().__init__( - conf_source=conf_source, env=env, runtime_params=runtime_params - ) - - # Register a customer resolver that adds up numbers. - self.register_custom_resolver("add", lambda *numbers: sum(numbers)) - - @staticmethod - def register_custom_resolver(name, function): - """ - Helper method that checks if the resolver has already been registered and registers the - resolver if it's new. The check is needed, because omegaconf will throw an error - if a resolver with the same name is registered twice. - Alternatively, you can call `register_new_resolver()` with `replace=True`. - """ - if not OmegaConf.has_resolver(name): - OmegaConf.register_new_resolver(name, function) -``` -In order to use this custom configuration loader, you will need to set it as the project configuration loader in `src//settings.py`: +CONFIG_LOADER_CLASS = OmegaConfigLoader -```python -from package_name.custom_configloader import CustomOmegaConfigLoader -CONFIG_LOADER_CLASS = CustomOmegaConfigLoader -``` +def date_today(): + return date.today() -You can then use the custom "add" resolver in your `parameters.yml` as follows: +CONFIG_LOADER_ARGS = { + "custom_resolvers": { + "add": lambda *my_list: sum(my_list), + "polars": lambda x: getattr(pl, x), + "today": lambda: date_today(), + } +} +``` +These custom resolvers are then registered using `OmegaConf.register_new_resolver()` under the hood and can be used in any of the +configuration files in your project. For example, you can use the `add` or the `today` resolver defined above in your `parameters.yml` like this: ```yaml model_options: - test_size: ${add:1,2,3} + test_size: "${add:1,2,3}" random_state: 3 + +date: "${today:}" ``` +The values of these parameters will be computed at access time and will be passed on to your nodes. +Resolvers can also be used in your `catalog.yml`. In the example below, we use the `polars` resolver defined above to pass non-primitive +types to the catalog entry. +```yaml +my_polars_dataset: + type: polars.CSVDataSet + filepath: data/01_raw/my_dataset.csv + load_args: + dtypes: + product_age: "${polars:Float64}" + group_identifier: "${polars:Utf8}" + try_parse_dates: true +``` +`OmegaConf` also comes with some [built-in resolvers](https://omegaconf.readthedocs.io/en/latest/custom_resolvers.html#built-in-resolvers) +that you can use with the `OmegaConfigLoader` in Kedro. All built-in resolvers except for [`oc.env`](https://omegaconf.readthedocs.io/en/latest/custom_resolvers.html#oc-env) +are enabled by default. `oc.env` is only turned on for loading credentials. You can, however, turn this on for all configurations through your project's `src//settings.py` in a similar way: +```{note} +This is an advanced feature and should be used with caution. We do not recommend using environment variables for configurations other than credentials. 
+``` +```python +from omegaconf.resolvers import oc +from kedro.config import OmegaConfigLoader + +CONFIG_LOADER_CLASS = OmegaConfigLoader + +CONFIG_LOADER_ARGS = { + "custom_resolvers": { + "oc.env": oc.env, + } +} +``` ### How to load credentials through environment variables The [`OmegaConfigLoader`](/kedro.config.OmegaConfigLoader) enables you to load credentials from environment variables. To achieve this you have to use the `OmegaConfigLoader` and the `omegaconf` [`oc.env` resolver](https://omegaconf.readthedocs.io/en/2.3_branch/custom_resolvers.html#oc-env). To use the `OmegaConfigLoader` in your project, set the `CONFIG_LOADER_CLASS` constant in your [`src//settings.py`](../kedro_project_setup/settings.md): diff --git a/docs/source/configuration/config_loader_migration.md b/docs/source/configuration/config_loader_migration.md new file mode 100644 index 0000000000..f27b244afe --- /dev/null +++ b/docs/source/configuration/config_loader_migration.md @@ -0,0 +1,62 @@ +# Migration guide for config loaders +The `ConfigLoader` and `TemplatedConfigLoader` classes have been deprecated since Kedro `0.18.12` and will be removed in Kedro `0.19.0`. To ensure a smooth transition, we strongly recommend you adopt the [`OmegaConfigLoader`](/kedro.config.OmegaConfigLoader) as soon as possible. +This migration guide outlines the primary distinctions between the old loaders and the `OmegaConfigLoader`, providing step-by-step instructions on updating your code base to utilise the new class effectively. + +## [`ConfigLoader`](/kedro.config.ConfigLoader) to [`OmegaConfigLoader`](/kedro.config.OmegaConfigLoader) + +### 1. Install the Required Library +The [`OmegaConfigLoader`](advanced_configuration.md#omegaconfigloader) was introduced in Kedro `0.18.5` and is based on [OmegaConf](https://omegaconf.readthedocs.io/). In order to use it you need to ensure you have both a version of Kedro of `0.18.5` or above and `omegaconf` installed. +You can install both using `pip`: + +```bash +pip install kedro==0.18.5 +``` +This would be the minimum required Kedro version which includes `omegaconf` as a dependency. +Or you can run: +```bash +pip install -U kedro +``` + +This command installs the most recent version of Kedro which also includes `omegaconf` as a dependency. + +### 2. Use the `OmegaConfigLoader` +To use `OmegaConfigLoader` in your project, set the `CONFIG_LOADER_CLASS` constant in your [`src//settings.py`](../kedro_project_setup/settings.md): + +```diff ++ from kedro.config import OmegaConfigLoader # new import + ++ CONFIG_LOADER_CLASS = OmegaConfigLoader +``` + +### 3. Import Statements +Replace the import statement for `ConfigLoader` with the one for `OmegaConfigLoader`: + +```diff +- from kedro.config import ConfigLoader + ++ from kedro.config import OmegaConfigLoader +``` + +### 4. File Format Support +`OmegaConfigLoader` supports only `yaml` and `json` file formats. Make sure that all your configuration files are in one of these formats. If you previously used other formats with `ConfigLoader`, convert them to `yaml` or `json`. + +### 5. Load Configuration +The method to load the configuration using `OmegaConfigLoader` differs slightly from that used by `ConfigLoader`, which allowed users to access configuration through the `.get()` method and required patterns as argument. 
+When you migrate to use `OmegaConfigLoader` it requires you to fetch configuration through a configuration key that points to [configuration patterns specified in the loader class](configuration_basics.md#configuration-patterns) or [provided in the `CONFIG_LOADER_ARGS`](advanced_configuration.md#how-to-change-which-configuration-files-are-loaded) in `settings.py`. + +```diff +- conf_path = str(project_path / settings.CONF_SOURCE) +- conf_loader = ConfigLoader(conf_source=conf_path, env="local") +- catalog = conf_loader.get("catalog*") + ++ conf_path = str(project_path / settings.CONF_SOURCE) ++ config_loader = OmegaConfigLoader(conf_source=conf_path, env="local") ++ catalog = config_loader["catalog"] +``` + +In this example, `"catalog"` is the key to the default catalog patterns specified in the `OmegaConfigLoader` class. + +### 6. Exception Handling +For error and exception handling, most errors are the same. Those you need to be aware of that are different between the original `ConfigLoader` and `OmegaConfigLoader` are as follows: +* `OmegaConfigLoader` throws a `MissingConfigException` when configuration paths don't exist, rather than the `ValueError` used in `ConfigLoader`. +* In `OmegaConfigLoader`, if there is bad syntax in your configuration files, it will trigger a `ParserError` instead of a `BadConfigException` used in `ConfigLoader`. diff --git a/docs/source/configuration/configuration_basics.md b/docs/source/configuration/configuration_basics.md index 197d8b2478..2e964b512f 100644 --- a/docs/source/configuration/configuration_basics.md +++ b/docs/source/configuration/configuration_basics.md @@ -45,7 +45,7 @@ Kedro merges configuration information and returns a configuration dictionary ac * If any two configuration files located inside the **same** environment path (such as `conf/base/`) contain the same top-level key, the configuration loader raises a `ValueError` indicating that duplicates are not allowed. * If two configuration files contain the same top-level key but are in **different** environment paths (for example, one in `conf/base/`, another in `conf/local/`) then the last loaded path (`conf/local/`) takes precedence as the key value. `ConfigLoader.get` does not raise any errors but a `DEBUG` level log message is emitted with information on the overridden keys. -When using any of the configuration loaders, any top-level keys that start with `_` are considered hidden (or reserved) and are ignored. Those keys will neither trigger a key duplication error nor appear in the resulting configuration dictionary. However, you can still use such keys, for example, as [YAML anchors and aliases](https://www.educative.io/blog/advanced-yaml-syntax-cheatsheet#anchors) +When using any of the configuration loaders, any top-level keys that start with `_` are considered hidden (or reserved) and are ignored. Those keys will neither trigger a key duplication error nor appear in the resulting configuration dictionary. However, you can still use such keys, for example, as [YAML anchors and aliases](https://www.educative.io/blog/advanced-yaml-syntax-cheatsheet) or [to enable templating in the catalog when using the `OmegaConfigLoader`](advanced_configuration.md#how-to-do-templating-with-the-omegaconfigloader). 
### Configuration file names diff --git a/docs/source/configuration/index.md b/docs/source/configuration/index.md index 3f554e1e91..291a4fbf65 100644 --- a/docs/source/configuration/index.md +++ b/docs/source/configuration/index.md @@ -6,5 +6,6 @@ configuration_basics credentials parameters +config_loader_migration advanced_configuration ``` diff --git a/docs/source/contribution/developer_contributor_guidelines.md b/docs/source/contribution/developer_contributor_guidelines.md index 667f1a2dbd..787a838d90 100644 --- a/docs/source/contribution/developer_contributor_guidelines.md +++ b/docs/source/contribution/developer_contributor_guidelines.md @@ -131,7 +131,7 @@ We will work with you to complete your contribution, but we reserve the right to ``` Ensure that your PR builds cleanly before you submit it, by running the CI/CD checks locally, as follows: -* `make lint`: PEP-8 Standards (`pylint`, `flake8`) +* `make lint`: PEP-8 Standards (`ruff`, `black`) * `make test`: unit tests, 100% coverage (`pytest`, `pytest-cov`) * `make e2e-tests`: end-to-end tests (`behave`) diff --git a/docs/source/contribution/technical_steering_committee.md b/docs/source/contribution/technical_steering_committee.md index fcc35e7d45..1760cfd6f6 100644 --- a/docs/source/contribution/technical_steering_committee.md +++ b/docs/source/contribution/technical_steering_committee.md @@ -62,12 +62,14 @@ Currently, the core Kedro team consists of: [Ankita Katiyar](https://github.com/ankatiyar), [Antony Milne](https://github.com/antonymilne), [Deepyaman Datta](https://github.com/deepyaman), +[Dmitry Sorokin](https://github.com/DimedS), [Huong Nguyen](https://github.com/Huongg), [Ivan Danov](https://github.com/idanov), -[Jannic Holzer](https://github.com/jmholzer), +[Jitendra Gundaniya](https://github.com/jitu5), [Jo Stichbury](https://github.com/stichbury), [Joel Schwarzmann](https://github.com/datajoely), [Juan Luis Cano](https://github.com/astrojuanlu), +[Laura Couto](https://github.com/lrcouto), [Marcin Zabłocki](https://github.com/marrrcin), [Merel Theisen](https://github.com/merelcht), [Nero Okwa](https://github.com/NeroOkwa), @@ -76,7 +78,8 @@ Currently, the core Kedro team consists of: [Ravi Kumar Pilla](https://github.com/ravi-kumar-pilla), [Sajid Alam](https://github.com/SajidAlamQB), [Stephanie Kaiser](https://github.com/stephkaiser), -[Tynan DeBold](https://github.com/tynandebold) and +[Tynan DeBold](https://github.com/tynandebold), +[Vladimir Nikolic](https://github.com/vladimir-mck), and [Yetunde Dada](https://github.com/yetudada). Former core team members with significant contributions include: @@ -89,6 +92,7 @@ Former core team members with significant contributions include: [Gordon Wrigley](https://github.com/tolomea), [Hamza Oza](https://github.com/hamzaoza), [Ignacio Paricio](https://github.com/ignacioparicio), +[Jannic Holzer](https://github.com/jmholzer), [Jiri Klein](https://github.com/jiriklein), [Kiyohito Kunii](https://github.com/921kiyo), [Laís Carvalho](https://github.com/laisbsc), diff --git a/docs/source/data/data_catalog.md b/docs/source/data/data_catalog.md index 6929eb9aef..fb1f7ac3dc 100644 --- a/docs/source/data/data_catalog.md +++ b/docs/source/data/data_catalog.md @@ -359,10 +359,10 @@ The list of all available parameters is given in the [Paramiko documentation](ht You can use the [`kedro catalog create` command to create a Data Catalog YAML configuration](../development/commands_reference.md#create-a-data-catalog-yaml-configuration-file). 
-This creates a `//catalog/.yml` configuration file with `MemoryDataSet` datasets for each dataset in a registered pipeline if it is missing from the `DataCatalog`. +This creates a `//catalog_.yml` configuration file with `MemoryDataSet` datasets for each dataset in a registered pipeline if it is missing from the `DataCatalog`. ```yaml -# //catalog/.yml +# //catalog_.yml rockets: type: MemoryDataSet scooters: @@ -462,7 +462,7 @@ airplanes: In this example, the default `csv` configuration is inserted into `airplanes` and then the `load_args` block is overridden. Normally, that would replace the whole dictionary. In order to extend `load_args`, the defaults for that block are then re-inserted. ## Load multiple datasets with similar configuration using dataset factories -For catalog entries that share configuration details, you can also use the dataset factories introduced in Kedro 0.18.11. This syntax allows you to generalise the configuration and +For catalog entries that share configuration details, you can also use the dataset factories introduced in Kedro 0.18.12. This syntax allows you to generalise the configuration and reduce the number of similar catalog entries by matching datasets used in your project's pipelines to dataset factory patterns. ### Example 1: Generalise datasets with similar names and types into one dataset factory @@ -811,7 +811,7 @@ from kedro.io import MemoryDataSet memory = MemoryDataSet(data=None) io.add("cars_cache", memory) io.save("cars_cache", "Memory can store anything.") -io.load("car_cache") +io.load("cars_cache") ``` #### Save data to a SQL database for querying diff --git a/docs/source/deployment/prefect.md b/docs/source/deployment/prefect.md index 64d1018984..b602b499ec 100644 --- a/docs/source/deployment/prefect.md +++ b/docs/source/deployment/prefect.md @@ -1,6 +1,6 @@ # Prefect -This page explains how to run your Kedro pipeline using [Prefect 2.0](https://www.prefect.io/products/core/), an open-source workflow management system. +This page explains how to run your Kedro pipeline using [Prefect 2.0](https://www.prefect.io/opensource), an open-source workflow management system. The scope of this documentation is the deployment to a self hosted [Prefect Server](https://docs.prefect.io/2.10.17/host/), which is an open-source backend that makes it easy to monitor and execute your Prefect flows and automatically extends Prefect 2.0. We will use an [Agent that dequeues submitted flow runs from a Work Queue](https://docs.prefect.io/2.10.17/tutorial/deployments/#why-workpools-and-workers). diff --git a/docs/source/development/commands_reference.md b/docs/source/development/commands_reference.md index ae2933e256..45801ea112 100644 --- a/docs/source/development/commands_reference.md +++ b/docs/source/development/commands_reference.md @@ -62,6 +62,7 @@ Here is a list of Kedro CLI commands, as a shortcut to the descriptions below. 
P
* [`kedro build-docs`](#build-the-project-documentation) (deprecated from version 0.19.0)
* [`kedro build-reqs`](#build-the-projects-dependency-tree) (deprecated from version 0.19.0)
* [`kedro catalog list`](#list-datasets-per-pipeline-per-type)
+ * [`kedro catalog rank`](#rank-dataset-factories-in-the-catalog)
* [`kedro catalog create`](#create-a-data-catalog-yaml-configuration-file)
* [`kedro ipython`](#notebooks)
* [`kedro jupyter convert`](#copy-tagged-cells) (deprecated from version 0.19.0)
@@ -114,7 +115,7 @@ Returns output similar to the following, depending on the version of Kedro used
| |/ / _ \/ _` | '__/ _ \
| <  __/ (_| | | | (_) |
|_|\_\___|\__,_|_|  \___/

-v0.18.11
+v0.18.12

Kedro is a Python framework for creating reproducible, maintainable
@@ -375,7 +376,7 @@ kedro micropkg pull
The above command will take the bundled `.tar.gz` file and do the following:

* Place source code in `src//pipelines/`
-* Place parameters in `conf/base/parameters/.yml`
+* Place parameters in `conf/base/parameters_.yml`
* Pull out tests and place in `src/tests/pipelines/`

`kedro micropkg pull` works with PyPI, local and cloud storage:
@@ -402,7 +403,7 @@ The `build-docs` command builds [project documentation](../tutorial/package_a_pr

#### Lint your project

```{note}
-_This command will be deprecated from Kedro version 0.19.0._
+_This command will be deprecated from Kedro version 0.19.0._ We still recommend that you lint your project; see the [linting documentation](../development/linting.md) for more help.
```

```bash
@@ -491,6 +492,14 @@ The command also accepts an optional `--pipeline` argument that allows you to sp
kedro catalog list --pipeline=ds,de
```

+##### Rank dataset factories in the catalog
+
+```bash
+kedro catalog rank
+```
+
+The output includes a list of any [dataset factories](../data/data_catalog.md#load-multiple-datasets-with-similar-configuration-using-dataset-factories) in the catalog, ranked by the priority with which they are matched against dataset names.
+
#### Data Catalog

##### Create a Data Catalog YAML configuration file
@@ -503,7 +512,7 @@ kedro catalog create --pipeline=

The command also accepts an optional `--env` argument that allows you to specify a configuration environment (defaults to `base`).

-The command creates the following file: `//catalog/.yml`
+The command creates the following file: `//catalog_.yml`

#### Notebooks
diff --git a/docs/source/development/linting.md b/docs/source/development/linting.md
index e2c0f31037..d795086b51 100644
--- a/docs/source/development/linting.md
+++ b/docs/source/development/linting.md
@@ -10,25 +10,23 @@ As a project grows and goes through various stages of development it becomes imp

## Set up Python tools
There are a variety of Python tools available to use with your Kedro projects. This guide shows you how to use
-[`black`](https://github.com/psf/black), [`flake8`](https://github.com/PyCQA/flake8), and
-[`isort`](https://github.com/PyCQA/isort) to format and lint your Kedro projects.
+[`black`](https://github.com/psf/black) and [`ruff`](https://beta.ruff.rs) to format and lint your Kedro projects.

- **`black`** is a [PEP 8](https://peps.python.org/pep-0008/) compliant opinionated Python code formatter. `black` can check for styling inconsistencies and reformat your files in place. [You can read more in the `black` documentation](https://black.readthedocs.io/en/stable/).
-- **`flake8`** is a wrapper around [`pep8`](https://pypi.org/project/pep8/),
-[`pyflakes`](https://pypi.org/project/pyflakes/), and [`mccabe`](https://pypi.org/project/mccabe/) which can lint code and format it with respect to [PEP 8](https://peps.python.org/pep-0008/),
-and check the [cyclomatic complexity](https://www.ibm.com/docs/en/raa/6.1?topic=metrics-cyclomatic-complexity) of your code base.
-[You can read more in the `flake8` documentation](https://flake8.pycqa.org/en/latest/).
-- **`isort`** is a Python library used to reformat code by sorting imports alphabetically and automatically separating them into sections by
+- **`ruff`** is a fast linter that replaces `flake8`, `pylint`, `pyupgrade`, `isort` and [more](https://beta.ruff.rs/docs/rules/).
+  - It helps to make your code compliant with [`pep8`](https://pypi.org/project/pep8/).
+  - It reformats code by sorting imports alphabetically and automatically separating them into sections by type, similar to [`isort`](https://pycqa.github.io/isort/).
+
### Install the tools
-Install `black`, `flake8`, and `isort` by adding the following lines to your project's `src/requirements.txt`
+Install `black` and `ruff` by adding the following lines to your project's `src/requirements.txt`
file:

```text
black # Used for formatting code
-flake8 # Used for linting and formatting
-isort # Used for formatting code (sorting module imports)
+ruff # Used for linting, formatting and sorting module imports
+
```

To install all the project-specific dependencies, including the linting tools, navigate to the root directory of the project and run:
@@ -37,10 +35,29 @@ pip install -r src/requirements.txt
```
Alternatively, you can individually install the linting tools using the following shell commands:
```bash
-pip install black
-pip install flake8
-pip install isort
+pip install black ruff
```
+#### Configure `ruff`
+`ruff` reads its configuration from `pyproject.toml` within your project root. You can enable different rule sets within the `[tool.ruff]` section. For example, the rule set `F` is equivalent to `Pyflakes`.
+
+To start with `ruff`, we recommend adding this section to enable a few basic rule sets:
+```toml
+[tool.ruff]
+select = [
+    "F",   # Pyflakes
+    "E",   # Pycodestyle
+    "W",   # Pycodestyle
+    "UP",  # pyupgrade
+    "I",   # isort
+    "PL",  # Pylint
+]
+ignore = ["E501"]  # black takes care of line-too-long
+```
+
+```{note}
+It is good practice to [split a line when it is too long](https://beta.ruff.rs/docs/rules/line-too-long/) so that it can be read easily, even on a small screen. `ruff` treats this slightly differently from `black`; when you use the two together, we recommend disabling the `E501` rule to avoid conflicts.
+```
+
#### Configure `flake8`

Store your `flake8` configuration in a file named `setup.cfg` within your project root. The Kedro starters use the [following configuration](https://github.com/kedro-org/kedro-starters/blob/main/pandas-iris/%7B%7B%20cookiecutter.repo_name%20%7D%7D/setup.cfg):
@@ -87,17 +104,11 @@ you want to run before each `commit`.
Below is a sample `YAML` file with entries for `black`,`flake8`, and `isort`:
```yaml
repos:
- - repo: https://github.com/pycqa/isort
-   rev: 5.10.1
-   hooks:
-     - id: isort
-       name: isort (python)
-       args: ["--profile", "black"]
-
- - repo: https://github.com/pycqa/flake8
-   rev: '' # pick a git hash / tag to point to
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+   # Ruff version.
+ rev: v0.0.270 hooks: - - id: flake8 + - id: ruff - repo: https://github.com/psf/black rev: 22.8.0 diff --git a/docs/source/experiment_tracking/index.md b/docs/source/experiment_tracking/index.md index 35a5dc053d..a8e94dd05b 100644 --- a/docs/source/experiment_tracking/index.md +++ b/docs/source/experiment_tracking/index.md @@ -106,7 +106,7 @@ For collaborative experiment tracking, Kedro-Viz saves your experiments as SQLit > Note: In Kedro-Viz version 6.2, the only way to set up credentials for accessing your cloud storage is through environment variables. ```bash -export KEDRO_SQLITE_STORE_USERNAME ="your_unique__username" +export KEDRO_SQLITE_STORE_USERNAME="your_unique__username" ``` diff --git a/docs/source/extend_kedro/plugins.md b/docs/source/extend_kedro/plugins.md index 81cd139a7c..61b82fcfbc 100644 --- a/docs/source/extend_kedro/plugins.md +++ b/docs/source/extend_kedro/plugins.md @@ -84,7 +84,7 @@ setup( After that you can use this starter with `kedro new --starter=test_plugin_starter`. ```{note} -If your starter lives on a git repository, by default Kedro attempts to use a tag or branch labelled with your version of Kedro, e.g. `0.18.11`. This means that you can host different versions of your starter template on the same repository, and the correct one will automatically be used. If you do not wish to follow this structure, you should override it with the `checkout` flag, e.g. `kedro new --starter=test_plugin_starter --checkout=main`. +If your starter lives on a git repository, by default Kedro attempts to use a tag or branch labelled with your version of Kedro, e.g. `0.18.12`. This means that you can host different versions of your starter template on the same repository, and the correct one will automatically be used. If you do not wish to follow this structure, you should override it with the `checkout` flag, e.g. `kedro new --starter=test_plugin_starter --checkout=main`. ``` ## Working with `click` diff --git a/docs/source/faq/faq.md b/docs/source/faq/faq.md index 7847e1991a..75790690a9 100644 --- a/docs/source/faq/faq.md +++ b/docs/source/faq/faq.md @@ -1,4 +1,6 @@ -# Frequently asked questions +# FAQs + +This is a growing set of technical FAQs. The [product FAQs on the Kedro website](https://kedro.org/#faq) explain how Kedro can answer the typical use cases and requirements of data scientists, data engineers, machine learning engineers and product owners. ## Visualisation @@ -34,7 +36,7 @@ * [How do I bypass the configuration loading rules](../configuration/advanced_configuration.md#how-to-bypass-the-configuration-loading-rules)? * [How do I use Jinja2 syntax in configuration](../configuration/advanced_configuration.md#how-to-use-jinja2-syntax-in-configuration)? * [How do I do templating with the `OmegaConfigLoader`](../configuration/advanced_configuration.md#how-to-do-templating-with-the-omegaconfigloader)? -* [How do I use custom resolvers in the `OmegaConfigLoader`](../configuration/advanced_configuration.md#how-to-use-custom-resolvers-in-the-omegaconfigloader)? +* [How do I use resolvers in the `OmegaConfigLoader`](../configuration/advanced_configuration.md#how-to-use-resolvers-in-the-omegaconfigloader)? * [How do I load credentials through environment variables](../configuration/advanced_configuration.md#how-to-load-credentials-through-environment-variables)? ## Datasets and the Data Catalog @@ -46,3 +48,25 @@ * [How do I create a modular pipeline](../nodes_and_pipelines/modular_pipelines.md#how-do-i-create-a-modular-pipeline)? 
* [Can I use generator functions in a node](../nodes_and_pipelines/nodes.md#how-to-use-generator-functions-in-a-node)?
+
+## What is the data engineering convention?
+
+[Bruce Philp](https://github.com/bruceaphilp) and [Guilherme Braccialli](https://github.com/gbraccialli-qb) are the
+brains behind a layered data-engineering convention as a model of managing data. You can find an [in-depth walkthrough of their convention](https://towardsdatascience.com/the-importance-of-layered-thinking-in-data-engineering-a09f685edc71) as a blog post on Medium.
+
+Refer to the table below for a high-level guide to each layer's purpose.
+
+> **Note**: The data layers don’t have to exist locally in the `data` folder within your project, but we recommend that you structure your S3 buckets or other data stores in a similar way.
+
+![data_engineering_convention](../meta/images/data_layers.png)
+
+| Folder in data | Description |
+| -------------- | ----------- |
+| Raw | Initial start of the pipeline, containing the sourced data model(s) that should never be changed; it forms your single source of truth to work from. These data models are typically un-typed, e.g. csv, but this will vary from case to case |
+| Intermediate | Optional data model(s), which are introduced to type your `raw` data model(s), e.g. converting string based values into their current typed representation |
+| Primary | Domain specific data model(s) containing cleansed, transformed and wrangled data from either `raw` or `intermediate`, which forms your layer that you input into your feature engineering |
+| Feature | Analytics specific data model(s) containing a set of features defined against the `primary` data, which are grouped by feature area of analysis and stored against a common dimension |
+| Model input | Analytics specific data model(s) containing all `feature` data against a common dimension and, in the case of live projects, against an analytics run date to ensure that you track the historical changes of the features over time |
+| Models | Stored, serialised pre-trained machine learning models |
+| Model output | Analytics specific data model(s) containing the results generated by the model based on the `model input` data |
+| Reporting | Reporting data model(s) that are used to combine a set of `primary`, `feature`, `model input` and `model output` data used to drive the dashboard and the views constructed. It encapsulates and removes the need to define any blending or joining of data, improves performance, and allows the presentation layer to be replaced without having to redefine the data models |
diff --git a/docs/source/get_started/install.md b/docs/source/get_started/install.md
index 0ce17301c5..8afea95a57 100644
--- a/docs/source/get_started/install.md
+++ b/docs/source/get_started/install.md
@@ -134,7 +134,7 @@ You should see an ASCII art graphic and the Kedro version number. For example:

![](../meta/images/kedro_graphic.png)

-If you do not see the graphic displayed, or have any issues with your installation, check out the [searchable archive of Slack discussions](https://www.linen.dev/s/kedro), or post a new query on the [Slack organisation](https://slack.kedro.org).
+If you do not see the graphic displayed, or have any issues with your installation, check out the [searchable archive of Slack discussions](https://linen-slack.kedro.org/), or post a new query on the [Slack organisation](https://slack.kedro.org). ## How to upgrade Kedro @@ -187,4 +187,4 @@ pip install kedro * Installation prerequisites include a virtual environment manager like `conda`, Python 3.7+, and `git`. * You should install Kedro using `pip install kedro`. -If you encounter any problems as you set up Kedro, ask for help on Kedro's [Slack organisation](https://slack.kedro.org) or review the [searchable archive of Slack discussions](https://www.linen.dev/s/kedro). +If you encounter any problems as you set up Kedro, ask for help on Kedro's [Slack organisation](https://slack.kedro.org) or review the [searchable archive of Slack discussions](https://linen-slack.kedro.org/). diff --git a/docs/source/index.rst b/docs/source/index.rst index f9c78a2748..5850f15f76 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -23,9 +23,9 @@ Welcome to Kedro's documentation! :target: https://opensource.org/license/apache2-0-php/ :alt: License is Apache 2.0 -.. image:: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue.svg +.. image:: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue.svg :target: https://pypi.org/project/kedro/ - :alt: Python version 3.7, 3.8, 3.9, 3.10 + :alt: Python version 3.7, 3.8, 3.9, 3.10, 3.11 .. image:: https://badge.fury.io/py/kedro.svg :target: https://pypi.org/project/kedro/ @@ -43,8 +43,8 @@ Welcome to Kedro's documentation! :target: https://slack.kedro.org :alt: Kedro's Slack organisation -.. image:: https://img.shields.io/badge/slack-archive-blue.svg?label=Kedro%20Slack%20 - :target: https://www.linen.dev/s/kedro +.. image:: https://img.shields.io/badge/slack-archive-blueviolet.svg?label=Kedro%20Slack%20 + :target: https://linen-slack.kedro.org/ :alt: Kedro's Slack archive .. image:: https://img.shields.io/badge/code%20style-black-black.svg diff --git a/docs/source/kedro_datasets.rst b/docs/source/kedro_datasets.rst index b3d3ab328b..18bff88086 100644 --- a/docs/source/kedro_datasets.rst +++ b/docs/source/kedro_datasets.rst @@ -24,6 +24,7 @@ kedro_datasets kedro_datasets.networkx.GraphMLDataSet kedro_datasets.networkx.JSONDataSet kedro_datasets.pandas.CSVDataSet + kedro_datasets.pandas.DeltaTableDataSet kedro_datasets.pandas.ExcelDataSet kedro_datasets.pandas.FeatherDataSet kedro_datasets.pandas.GBQQueryDataSet diff --git a/docs/source/kedro_project_setup/starters.md b/docs/source/kedro_project_setup/starters.md index aac0dc331f..4ad60cb0c2 100644 --- a/docs/source/kedro_project_setup/starters.md +++ b/docs/source/kedro_project_setup/starters.md @@ -155,7 +155,7 @@ Here is the layout of the project as a Cookiecutter template: ├── docs # Project documentation ├── notebooks # Project related Jupyter notebooks (can be used for experimental code before moving the code to src) ├── README.md # Project README -├── setup.cfg # Configuration options for tools e.g. `pytest` or `flake8` +├── setup.cfg # Configuration options for tools e.g. 
`pytest` or `black` └── src # Project source code └── {{ cookiecutter.python_package }} ├── __init.py__ diff --git a/docs/source/meta/images/data_layers.png b/docs/source/meta/images/data_layers.png new file mode 100644 index 0000000000..fd3798310a Binary files /dev/null and b/docs/source/meta/images/data_layers.png differ diff --git a/docs/source/meta/images/preview_datasets_expanded.png b/docs/source/meta/images/preview_datasets_expanded.png new file mode 100644 index 0000000000..fdf1f4ed49 Binary files /dev/null and b/docs/source/meta/images/preview_datasets_expanded.png differ diff --git a/docs/source/meta/images/preview_datasets_metadata.png b/docs/source/meta/images/preview_datasets_metadata.png new file mode 100644 index 0000000000..429f0eb6cf Binary files /dev/null and b/docs/source/meta/images/preview_datasets_metadata.png differ diff --git a/docs/source/nodes_and_pipelines/micro_packaging.md b/docs/source/nodes_and_pipelines/micro_packaging.md index d72a0c3b4f..3ad1ddcc69 100644 --- a/docs/source/nodes_and_pipelines/micro_packaging.md +++ b/docs/source/nodes_and_pipelines/micro_packaging.md @@ -4,7 +4,11 @@ Micro-packaging allows users to share Kedro micro-packages across codebases, org ## Package a micro-package -You can package a micro-package by executing: `kedro micropkg package ` +You can package a micro-package by executing: `kedro micropkg package `. + +`` should be a Python module path like what would be used in an `import` statement, for example + +`kedro micropkg package pipelines.data_processing` * This will generate a new [source distribution](https://docs.python.org/3/distutils/sourcedist.html) for this micro-package. * By default, the tar file will be saved into `dist/` directory inside your project. @@ -15,8 +19,7 @@ When you package your micro-package, such as a modular pipeline for example, Ked ```text ├── conf │ └── base -│ └── parameters -│ └── {{pipeline_name*}} <-- All parameter file(s) +│ └── parameters_{{pipeline_name*}} <-- All parameter file(s) └── src ├── my_project │ ├── __init__.py @@ -31,7 +34,7 @@ When you package your micro-package, such as a modular pipeline for example, Ked Kedro will also include any requirements found in `src//pipelines//requirements.txt` in the micro-package tar file. These requirements will later be taken into account when pulling a micro-package via `kedro micropkg pull`. ```{note} -Kedro will not package the catalog config files even if those are present in `conf//catalog/.yml`. +Kedro will not package the catalog config files even if those are present in `conf//catalog_.yml`. ``` If you plan to publish your packaged micro-package to some Python package repository like [PyPI](https://pypi.org/), you need to make sure that your micro-package name doesn't clash with any of the existing packages in that repository. However, there is no need to rename any of your source files if that is the case. Simply alias your package with a new name by running `kedro micropkg package --alias `. @@ -67,7 +70,7 @@ You can pull a micro-package from a tar file by executing `kedro micropkg pull < * The `` must either be a package name on PyPI or a path to the source distribution file. * Kedro will unpack the tar file, and install the files in following locations in your Kedro project: * All the micro-package code in `src///` - * Configuration files in `conf//parameters/.yml`, where `` defaults to `base`. + * Configuration files in `conf//parameters_.yml`, where `` defaults to `base`. 
* To place parameters from a different config environment, run `kedro micropkg pull --env ` * Unit tests in `src/tests/` * Kedro will also parse any requirements packaged with the micro-package and add them to project level `requirements.in`. diff --git a/docs/source/nodes_and_pipelines/modular_pipelines.md b/docs/source/nodes_and_pipelines/modular_pipelines.md index ece1ea799c..5064ae7b87 100644 --- a/docs/source/nodes_and_pipelines/modular_pipelines.md +++ b/docs/source/nodes_and_pipelines/modular_pipelines.md @@ -52,8 +52,7 @@ Running the `kedro pipeline create` command adds boilerplate folders and files f ```text ├── conf │ └── base -│ └── parameters -│ └── {{pipeline_name}}.yml <-- Pipeline-specific parameters +│ └── parameters_{{pipeline_name}}.yml <-- Pipeline-specific parameters └── src ├── my_project │ ├── __init__.py @@ -169,9 +168,13 @@ In this example we have used the `+` operator to join two pipelines. You can als Reusing pipelines for slightly different purposes can be a real accelerator for teams and organisations when they reach a certain scale. In the real world, one could imagine pipelines with responsibilities like profiling or feature engineering being reused within the same project or even across projects via [micro-packaging](micro_packaging.md). * In an ideal world, we would like to use the `cook_pipeline` twice as you would `defrost` and `grill` multiple meals beyond the `veg` currently hard-coded. -* Namespaces allow you to '[instantiate](https://en.wikipedia.org/wiki/Instance_(computer_science))' the same pipeline multiple times and keep operations isolated. +* Namespaces allow you to instantiate the same pipeline multiple times and keep operations isolated. * Like one provides arguments to a class' constructor, you can provide overriding inputs/outputs/parameters to the `pipeline()` wrapper. +```{note} +The set of overriding inputs and outputs must be a subset of the reused pipeline's "free" inputs and outputs, respectively. A free input is an input that isn't generated by a node in the pipeline, while a free output is an output that isn't consumed by a node in the pipeline. {py:meth}`Pipeline.inputs() ` and {py:meth}`Pipeline.outputs() ` can be used to list a pipeline's free inputs and outputs, respectively. +``` +
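As a minimal sketch of the note above (the function and dataset names are illustrative, loosely following the cooking example used on this page), `Pipeline.inputs()` and `Pipeline.outputs()` report only the free inputs and outputs:

```python
# Minimal sketch: listing a pipeline's free inputs and outputs (illustrative names).
from kedro.pipeline import node, pipeline


def defrost(frozen_veg):
    return frozen_veg


def grill(veg):
    return veg


cook_pipeline = pipeline(
    [
        node(defrost, inputs="frozen_veg", outputs="veg", name="defrost_node"),
        node(grill, inputs="veg", outputs="grilled_veg", name="grill_node"),
    ]
)

print(cook_pipeline.inputs())   # {'frozen_veg'} - 'veg' is produced inside the pipeline, so it is not free
print(cook_pipeline.outputs())  # {'grilled_veg'} - 'veg' is consumed inside the pipeline, so it is not free
```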
Click here to see a worked example @@ -237,9 +240,8 @@ final_pipeline = ( * `kedro run --namespace=` could be used to only run nodes with a specific namespace. ```{note} -Parameter references (`params:` and `parameters`) will not be namespaced. +`parameters` references will not be namespaced, but `params:` references will. ``` -
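A minimal sketch of the behaviour described in the note above, using illustrative names: dataset and `params:` references are prefixed with the namespace, while a plain `parameters` reference would be left untouched.

```python
# Minimal sketch: namespacing prefixes dataset and `params:` references (illustrative names).
from kedro.pipeline import node, pipeline


def scale(data, factor):
    return [value * factor for value in data]


base_pipeline = pipeline(
    [node(scale, inputs=["data", "params:factor"], outputs="scaled_data", name="scale_node")]
)

lunch_pipeline = pipeline(base_pipeline, namespace="lunch")

# Free inputs are now prefixed with the namespace, including the `params:factor`
# reference, which becomes `params:lunch.factor`.
print(lunch_pipeline.inputs())  # e.g. {'lunch.data', 'params:lunch.factor'}
```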
## How to use a modular pipeline with different parameters diff --git a/docs/source/nodes_and_pipelines/nodes.md b/docs/source/nodes_and_pipelines/nodes.md index af2fbc9df6..a41f147244 100644 --- a/docs/source/nodes_and_pipelines/nodes.md +++ b/docs/source/nodes_and_pipelines/nodes.md @@ -184,114 +184,53 @@ You can also call a node as a regular Python function: `adder_node(dict(a=2, b=3 ## How to use generator functions in a node -[Generator functions](https://learnpython.org/en/Generators) were introduced with [PEP 255](https://www.python.org/dev/peps/pep-0255). They are a special kind of function that returns lazy iterators but do not store their entire contents in memory all at once. +[Generator functions](https://learnpython.org/en/Generators) were introduced with [PEP 255](https://www.python.org/dev/peps/pep-0255) and are a special kind of function in Python that returns lazy iterators. They are often used for lazy-loading or lazy-saving of data, which can be particularly useful when dealing with large datasets that do not fit entirely into memory. In the context of Kedro, generator functions can be used in nodes to efficiently process and handle such large datasets. -The following code uses a `pandas chunksize` generator to process large datasets within the [`pandas-iris` starter](../kedro_project_setup/starters.md). First set up a project by following the [get started guide](../get_started/new_project.md#create-a-new-project-containing-example-code) to create a Kedro project with the `pandas-iris` starter example code. -Create a [custom dataset](../extend_kedro/custom_datasets.md) called `ChunkWiseCSVDataSet` in `src/YOUR_PROJECT_NAME/extras/datasets/chunkwise_dataset.py` for your `pandas-iris` project. This dataset is a simplified version of the `pandas.CSVDataSet` where the main change is to the `_save` method which should save the data in append-or-create mode, `a+`. +### Set up the project -
-Click to expand
-
-```python
-from copy import deepcopy
-from io import BytesIO
-from pathlib import PurePosixPath
-from typing import Any, Dict
-
-import fsspec
-import pandas as pd
+To demonstrate the use of generator functions in Kedro nodes, first set up a Kedro project using the `pandas-iris` starter. If you haven't already created a Kedro project, you can follow the [get started guide](../get_started/new_project.md#create-a-new-project-containing-example-code) to create it.
-from kedro.io.core import (
-    AbstractVersionedDataset,
-    Version,
-    get_filepath_str,
-    get_protocol_and_path,
-)
+Create the project with this command:
+```bash
+kedro new -s pandas-iris
+```
+
+### Loading data with generators
+To use generator functions in Kedro nodes, you need to update the `catalog.yml` file to include the `chunksize` argument for the relevant dataset that will be processed using the generator.
-class ChunkWiseCSVDataSet(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]):
-    """``ChunkWiseCSVDataSet`` loads/saves data from/to a CSV file using an underlying
-    filesystem. It uses pandas to handle the CSV file.
-    """
-
-    DEFAULT_LOAD_ARGS = {}  # type: Dict[str, Any]
-    DEFAULT_SAVE_ARGS = {"index": False}  # type: Dict[str, Any]
-
-    def __init__(
-        self,
-        filepath: str,
-        load_args: Dict[str, Any] = None,
-        save_args: Dict[str, Any] = None,
-        version: Version = None,
-        credentials: Dict[str, Any] = None,
-        fs_args: Dict[str, Any] = None,
-    ) -> None:
-        """Creates a new instance of ``ChunkWiseCSVDataSet`` pointing to a concrete CSV file
-        on a specific filesystem.
-        """
-        _fs_args = deepcopy(fs_args) or {}
-        _credentials = deepcopy(credentials) or {}
-
-        protocol, path = get_protocol_and_path(filepath, version)
-        if protocol == "file":
-            _fs_args.setdefault("auto_mkdir", True)
-
-        self._protocol = protocol
-        self._storage_options = {**_credentials, **_fs_args}
-        self._fs = fsspec.filesystem(self._protocol, **self._storage_options)
-
-        super().__init__(
-            filepath=PurePosixPath(path),
-            version=version,
-            exists_function=self._fs.exists,
-            glob_function=self._fs.glob,
-        )
-
-        # Handle default load and save arguments
-        self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
-        if load_args is not None:
-            self._load_args.update(load_args)
-        self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS)
-        if save_args is not None:
-            self._save_args.update(save_args)
-
-    def _describe(self) -> Dict[str, Any]:
-        return {
-            "filepath": self._filepath,
-            "protocol": self._load_args,
-            "save_args": self._save_args,
-            "version": self._version,
-        }
-
-    def _load(self) -> pd.DataFrame:
-        load_path = str(self._get_load_path())
-        return pd.read_csv(load_path, **self._load_args)
+You need to add a new dataset in your `catalog.yml` as follows:
+```diff
++ X_test:
++   type: pandas.CSVDataSet
++   filepath: data/05_model_input/X_test.csv
++   load_args:
++     chunksize: 10
+```
-    def _save(self, data: pd.DataFrame) -> None:
-        save_path = get_filepath_str(self._get_save_path(), self._protocol)
+With `pandas` built-in support, you can use the `chunksize` argument to read data using a generator.
-        buf = BytesIO()
-        data.to_csv(path_or_buf=buf, **self._save_args)
+### Saving data with generators
+To use generators to save data lazily, you need to do three things:
+- Update the `make_predictions` function definition to use `yield` instead of `return`.
+- Create a [custom dataset](../extend_kedro/custom_datasets.md) called `ChunkWiseCSVDataset`.
+- Update `catalog.yml` to use the newly created `ChunkWiseCSVDataset`.
- with self._fs.open(save_path, mode="a+") as fs_file: - fs_file.write(buf.getvalue()) -``` -
+Copy the following code to `nodes.py`. The main change is to use a new model `DecisionTreeClassifier` to make prediction by chunks in `make_predictions`. -Modify `example_iris_data` in `catalog.yml` by changing `type` to the custom dataset you created above. Add `chunksize: 100` to `load_args` which will return an iterable object. The `chunksize` parameter refers to the number of rows in each chunk. +
+Click to open -```yaml -example_iris_data: - type: YOUR_PROJECT_NAME.extras.datasets.chunkwise_dataset.ChunkWiseCSVDataSet - filepath: data/01_raw/iris.csv - load_args: - chunksize: 100 -``` +```python +import logging +from typing import Any, Dict, Tuple, Iterator +from sklearn.preprocessing import LabelEncoder +from sklearn.tree import DecisionTreeClassifier +from sklearn.metrics import accuracy_score +import numpy as np +import pandas as pd -Next, in `nodes.py` we repurpose the [`split_data`](https://github.com/kedro-org/kedro-starters/blob/dacdd56f1c1afde00a03a1e342fc0f44e1567a1e/pandas-iris/%7B%7B%20cookiecutter.repo_name%20%7D%7D/src/%7B%7B%20cookiecutter.python_package%20%7D%7D/nodes.py#L13) function to process chunk-wise data: -```python def split_data( data: pd.DataFrame, parameters: Dict[str, Any] ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: @@ -303,39 +242,97 @@ def split_data( Returns: Split data. """ - # Loop through data in chunks building up the training and test sets - for chunk in data: # Iterate over the chunks from data - full_data = pd.concat( - [chunk] - ) # Converts the TextFileReader object into list of DataFrames - data_train = full_data.sample( - frac=parameters["train_fraction"], random_state=parameters["random_state"] - ) - data_test = full_data.drop(data_train.index) - - X_train = data_train.drop(columns=parameters["target_column"]) - X_test = data_test.drop(columns=parameters["target_column"]) - y_train = data_train[parameters["target_column"]] - y_test = data_test[parameters["target_column"]] - yield X_train, X_test, y_train, y_test # Use yield instead of return to get the generator object + + data_train = data.sample( + frac=parameters["train_fraction"], random_state=parameters["random_state"] + ) + data_test = data.drop(data_train.index) + + X_train = data_train.drop(columns=parameters["target_column"]) + X_test = data_test.drop(columns=parameters["target_column"]) + y_train = data_train[parameters["target_column"]] + y_test = data_test[parameters["target_column"]] + + label_encoder = LabelEncoder() + label_encoder.fit(pd.concat([y_train, y_test])) + y_train = label_encoder.transform(y_train) + + return X_train, X_test, y_train, y_test + + +def make_predictions( + X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series +) -> pd.Series: + """Use a DecisionTreeClassifier model to make prediction.""" + model = DecisionTreeClassifier() + model.fit(X_train, y_train) + + for chunk in X_test: + y_pred = model.predict(chunk) + y_pred = pd.DataFrame(y_pred) + yield y_pred + + +def report_accuracy(y_pred: pd.Series, y_test: pd.Series): + """Calculates and logs the accuracy. + + Args: + y_pred: Predicted target. + y_test: True target. + """ + accuracy = accuracy_score(y_test, y_pred) + logger = logging.getLogger(__name__) + logger.info("Model has accuracy of %.3f on test data.", accuracy) +``` +
+ + +The `ChunkWiseCSVDataset` is a variant of the `pandas.CSVDataSet` where the main change is to the `_save` method that appends data instead of overwriting it. You need to create a file `src//chunkwise.py` and put this class inside it. Below is an example of the `ChunkWiseCSVDataset` implementation: + +```python +import pandas as pd + +from kedro.io.core import ( + get_filepath_str, +) +from kedro.extras.datasets.pandas import CSVDataSet + + +class ChunkWiseCSVDataset(CSVDataSet): + """``ChunkWiseCSVDataset`` loads/saves data from/to a CSV file using an underlying + filesystem. It uses pandas to handle the CSV file. + """ + + _overwrite = True + + def _save(self, data: pd.DataFrame) -> None: + save_path = get_filepath_str(self._get_save_path(), self._protocol) + # Save the header for the first batch + if self._overwrite: + data.to_csv(save_path, index=False, mode="w") + self._overwrite = False + else: + data.to_csv(save_path, index=False, header=False, mode="a") ``` -We can now `kedro run` in the terminal. The output shows `X_train`, `X_test`, `y_train`, `y_test` saved in chunks: +After that, you need to update the `catalog.yml` to use this new dataset. + +```diff ++ y_pred: ++ type: .chunkwise.ChunkWiseCSVDataset ++ filepath: data/07_model_output/y_pred.csv +``` + +With these changes, when you run `kedro run` in your terminal, you should see `y_pred` being saved multiple times in the logs as the generator lazily processes and saves the data in smaller chunks. ``` ... -[02/10/23 12:42:55] INFO Loading data from 'example_iris_data' (ChunkWiseCSVDataSet)... data_catalog.py:343 - INFO Loading data from 'parameters' (MemoryDataSet)... data_catalog.py:343 - INFO Running node: split: split_data([example_iris_data,parameters]) -> node.py:329 - [X_train,X_test,y_train,y_test] - INFO Saving data to 'X_train' (MemoryDataSet)... data_catalog.py:382 - INFO Saving data to 'X_test' (MemoryDataSet)... data_catalog.py:382 - INFO Saving data to 'y_train' (MemoryDataSet)... data_catalog.py:382 - INFO Saving data to 'y_test' (MemoryDataSet)... data_catalog.py:382 - INFO Saving data to 'X_train' (MemoryDataSet)... data_catalog.py:382 - INFO Saving data to 'X_test' (MemoryDataSet)... data_catalog.py:382 - INFO Saving data to 'y_train' (MemoryDataSet)... data_catalog.py:382 - INFO Saving data to 'y_test' (MemoryDataSet)... data_catalog.py:382 - INFO Completed 1 out of 3 tasks sequential_runner.py:85 -... + INFO Loading data from 'y_train' (MemoryDataset)... data_catalog.py:475 + INFO Running node: make_predictions: make_predictions([X_train,X_test,y_train]) -> [y_pred] node.py:331 + INFO Saving data to 'y_pred' (ChunkWiseCSVDataset)... data_catalog.py:514 + INFO Saving data to 'y_pred' (ChunkWiseCSVDataset)... data_catalog.py:514 + INFO Saving data to 'y_pred' (ChunkWiseCSVDataset)... data_catalog.py:514 + INFO Completed 2 out of 3 tasks sequential_runner.py:85 + INFO Loading data from 'y_pred' (ChunkWiseCSVDataset)... data_catalog.py:475 +... 
runner.py:105 ``` diff --git a/docs/source/nodes_and_pipelines/run_a_pipeline.md b/docs/source/nodes_and_pipelines/run_a_pipeline.md index 76200748c8..2d7af412ad 100644 --- a/docs/source/nodes_and_pipelines/run_a_pipeline.md +++ b/docs/source/nodes_and_pipelines/run_a_pipeline.md @@ -63,11 +63,6 @@ from kedro.runner.runner import AbstractRunner from pluggy import PluginManager -from kedro.io import AbstractDataset, DataCatalog, MemoryDataSet -from kedro.pipeline import Pipeline -from kedro.runner.runner import AbstractRunner - - class DryRunner(AbstractRunner): """``DryRunner`` is an ``AbstractRunner`` implementation. It can be used to list which nodes would be run without actually executing anything. It also checks if all the @@ -111,7 +106,6 @@ class DryRunner(AbstractRunner): self._logger.info( "Actual run would execute %d nodes:\n%s", len(nodes), - "\n", pipeline.describe(), ) self._logger.info("Checking inputs...") diff --git a/docs/source/resources/glossary.md b/docs/source/resources/glossary.md index 642c220621..55f841c8e7 100644 --- a/docs/source/resources/glossary.md +++ b/docs/source/resources/glossary.md @@ -38,7 +38,7 @@ You can use Kedro-Viz to visualise your Kedro data pipelines: ## Layers (data engineering convention) According to [common data engineering convention](https://towardsdatascience.com/the-importance-of-layered-thinking-in-data-engineering-a09f685edc71), a pipeline can be broken up into different layers according to how data is processed. This convention makes it easier to collaborate with other team members because everyone has an idea of what type of data cleaning or processing has happened. -Kedro-Viz makes it easy to [visualise these data processing stages](../visualisation/kedro-viz_visualisation.md#visualise-layers) by adding a `layer` attribute to the datasets in the Data Catalog. +Kedro-Viz makes it easy to [visualise these data processing stages](../visualisation/kedro-viz_visualisation.md#visualise-layers) by adding a `layer` attribute to the `kedro-viz` section within the `metadata` of the datasets in the Data Catalog. ## Modular pipeline _(See also [Pipeline](#pipeline))_ diff --git a/docs/source/resources/index.md b/docs/source/resources/index.md index 72493f112e..ce24876a0a 100644 --- a/docs/source/resources/index.md +++ b/docs/source/resources/index.md @@ -1,4 +1,4 @@ -# Resources +# FAQs and resources ```{toctree} :maxdepth: 1 diff --git a/docs/source/tutorial/add_another_pipeline.md b/docs/source/tutorial/add_another_pipeline.md index 3e4c0089e2..95093b5d0b 100644 --- a/docs/source/tutorial/add_another_pipeline.md +++ b/docs/source/tutorial/add_another_pipeline.md @@ -17,7 +17,7 @@ The data science pipeline is made up of the following: * Two python files within `src/spaceflights/pipelines/data_science` * `nodes.py` (for the node functions that form the data processing) * `pipeline.py` (to build the pipeline) -* A yaml file: `conf/base/parameters/data_science.yml` to define the parameters used when running the pipeline +* A yaml file: `conf/base/parameters_data_science.yml` to define the parameters used when running the pipeline * `__init__.py` files in the required folders to ensure that Python can import the pipeline @@ -42,7 +42,7 @@ def split_data(data: pd.DataFrame, parameters: Dict) -> Tuple: Args: data: Data containing features and target. - parameters: Parameters defined in parameters/data_science.yml. + parameters: Parameters defined in parameters_data_science.yml. Returns: Split data. 
""" @@ -89,7 +89,7 @@ def evaluate_model( ## Input parameter configuration -Parameters that are used by the `DataCatalog` when the pipeline executes are stored in `conf/base/parameters/data_science.yml`: +Parameters that are used by the `DataCatalog` when the pipeline executes are stored in `conf/base/parameters_data_science.yml`:
Click to expand @@ -276,7 +276,7 @@ candidate_modelling_pipeline.regressor: ```

-2. Update the parameters file for the data science pipeline in `conf/base/parameters/data_science.yml` to replace the existing contents for `model_options` with the following for the two instances of the template pipeline: +2. Update the parameters file for the data science pipeline in `conf/base/parameters_data_science.yml` to replace the existing contents for `model_options` with the following for the two instances of the template pipeline:
Click to expand diff --git a/docs/source/tutorial/create_a_pipeline.md b/docs/source/tutorial/create_a_pipeline.md index d0173a1cc9..1828c41929 100644 --- a/docs/source/tutorial/create_a_pipeline.md +++ b/docs/source/tutorial/create_a_pipeline.md @@ -14,7 +14,7 @@ The data processing pipeline prepares the data for model building by combining t * Two python files within `src/spaceflights/pipelines/data_processing` * `nodes.py` (for the node functions that form the data processing) * `pipeline.py` (to build the pipeline) -* A yaml file: `conf/base/parameters/data_processing.yml` to define the parameters used when running the pipeline +* A yaml file: `conf/base/parameters_data_processing.yml` to define the parameters used when running the pipeline * `__init__.py` files in the required folders to ensure that Python can import the pipeline ```{note} diff --git a/docs/source/tutorial/spaceflights_tutorial.md b/docs/source/tutorial/spaceflights_tutorial.md index 0a65d0369b..da58578174 100644 --- a/docs/source/tutorial/spaceflights_tutorial.md +++ b/docs/source/tutorial/spaceflights_tutorial.md @@ -31,7 +31,7 @@ If you hit an issue with the tutorial: * Check the [spaceflights tutorial FAQ](spaceflights_tutorial_faqs.md) to see if we have answered the question already. * Use [Kedro-Viz](../visualisation/kedro-viz_visualisation) to visualise your project to better understand how the datasets, nodes and pipelines fit together. * Use the [#questions channel](https://slack.kedro.org/) on our Slack channel to ask the community for help. -* Search the [searchable archive of Slack discussions](https://www.linen.dev/s/kedro). +* Search the [searchable archive of Slack discussions](https://linen-slack.kedro.org/). ## Terminology diff --git a/docs/source/visualisation/index.md b/docs/source/visualisation/index.md index 6c972e65e6..fe0e197e22 100644 --- a/docs/source/visualisation/index.md +++ b/docs/source/visualisation/index.md @@ -13,5 +13,6 @@ pip install kedro-viz :maxdepth: 1 kedro-viz_visualisation +preview_datasets visualise_charts_with_plotly ``` diff --git a/docs/source/visualisation/kedro-viz_visualisation.md b/docs/source/visualisation/kedro-viz_visualisation.md index 179710c7d0..d546681106 100644 --- a/docs/source/visualisation/kedro-viz_visualisation.md +++ b/docs/source/visualisation/kedro-viz_visualisation.md @@ -42,7 +42,7 @@ You should see the following: If a visualisation panel opens up and a pipeline is not visible, refresh the view, and check that your tutorial project code is complete if you've not generated it from the starter template. If you still don't see the visualisation, the Kedro community can help: * use the [#questions channel](https://slack.kedro.org/) on our Slack channel to ask the community for help -* search the [searchable archive of Slack discussions](https://www.linen.dev/s/kedro) +* search the [searchable archive of Slack discussions](https://linen-slack.kedro.org/) To exit the visualisation, close the browser tab. To regain control of the terminal, enter `^+c` on Mac or `Ctrl+c` on Windows or Linux machines. @@ -66,46 +66,83 @@ By convention, a [pipeline can be defined as having different layers](../resourc For example, the [data engineering convention](https://towardsdatascience.com/the-importance-of-layered-thinking-in-data-engineering-a09f685edc71) labels datasets according to the stage of the pipeline (e.g. whether the data has been cleaned). -You can add a `layer` attribute to the datasets in the Data Catalog, which is reflected in the Kedro-Viz visualisation. 
+In Kedro version 0.18.9 we changed the way layers are defined in the Data Catalog. The definition is now included under the `metadata` key for `kedro-viz` (previously it was an attribute specified within a dataset's definition). -Open `catalog.yml` for the completed spaceflights tutorial and replace the existing code with the following: +Here's an example of how to use the Kedro-Viz metadata to define layers: ```yaml companies: type: pandas.CSVDataSet filepath: data/01_raw/companies.csv - layer: raw + metadata: + kedro-viz: + layer: raw +``` + +In earlier versions of Kedro, layers were specified within a dataset's definition in the Data Catalog, but this will **no longer be supported** from Kedro version 0.19.0. From that version onwards, your `catalog.yml` must specify layers as metadata. + +```diff +companies: + type: pandas.CSVDataSet + filepath: data/01_raw/companies.csv +- layer: raw ++ metadata: ++ kedro-viz: ++ layer: raw +``` + +Open `catalog.yml` for the completed spaceflights tutorial and define layers in the following way: + +```yaml +companies: + type: pandas.CSVDataSet + filepath: data/01_raw/companies.csv + metadata: + kedro-viz: + layer: raw reviews: type: pandas.CSVDataSet filepath: data/01_raw/reviews.csv - layer: raw + metadata: + kedro-viz: + layer: raw shuttles: type: pandas.ExcelDataSet filepath: data/01_raw/shuttles.xlsx - layer: raw + metadata: + kedro-viz: + layer: raw preprocessed_companies: type: pandas.ParquetDataSet filepath: data/02_intermediate/preprocessed_companies.pq - layer: intermediate + metadata: + kedro-viz: + layer: intermediate preprocessed_shuttles: type: pandas.ParquetDataSet filepath: data/02_intermediate/preprocessed_shuttles.pq - layer: intermediate + metadata: + kedro-viz: + layer: intermediate model_input_table: type: pandas.ParquetDataSet filepath: data/03_primary/model_input_table.pq - layer: primary + metadata: + kedro-viz: + layer: primary regressor: type: pickle.PickleDataSet filepath: data/06_models/regressor.pickle versioned: true - layer: models + metadata: + kedro-viz: + layer: models ``` The visualisation now includes the layers: diff --git a/docs/source/visualisation/preview_datasets.md b/docs/source/visualisation/preview_datasets.md new file mode 100644 index 0000000000..f201bd7e17 --- /dev/null +++ b/docs/source/visualisation/preview_datasets.md @@ -0,0 +1,81 @@ +# Preview data in Kedro-Viz + +This page describes how to preview data from different datasets in a Kedro project with Kedro-Viz. Dataset preview was introduced in Kedro-Viz version 6.3.0, which offers preview for `CSVDatasets` and `ExcelDatasets`. + +We use the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) to demonstrate how to add data preview for the `customer`, `shuttle` and `reviews` datasets. Even if you have not yet worked through the tutorial, you can still follow this example; you'll need to use the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights) to generate a copy of the project with working code in place. + +If you haven't installed Kedro [follow the documentation to get set up](../get_started/install.md). + +Then, in your terminal window, navigate to the folder you want to store the project. 
+
+Generate a copy of the spaceflights tutorial project with all the code in place by using the [Kedro starter for the spaceflights tutorial](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights):
+
+```bash
+kedro new --starter=spaceflights
+```
+
+When prompted for a project name, you can enter anything, but we will assume `Spaceflights` throughout.
+
+When your project is ready, navigate to the root directory of the project.
+
+## Configure the Data Catalog
+
+Kedro-Viz version 6.3.0 currently supports preview of two types of datasets:
+
+* `pandas.CSVDataSet`
+* `pandas.ExcelDataSet`
+
+To enable dataset preview, add the `preview_args` attribute to the `kedro-viz` configuration under the `metadata` section in the Data Catalog. Within `preview_args`, specify `nrows` as the number of rows to preview for the dataset.
+
+```yaml
+companies:
+  type: pandas.CSVDataSet
+  filepath: data/01_raw/companies.csv
+  metadata:
+    kedro-viz:
+      layer: raw
+      preview_args:
+        nrows: 5
+
+reviews:
+  type: pandas.CSVDataSet
+  filepath: data/01_raw/reviews.csv
+  metadata:
+    kedro-viz:
+      layer: raw
+      preview_args:
+        nrows: 10
+
+shuttles:
+  type: pandas.ExcelDataSet
+  filepath: data/01_raw/shuttles.xlsx
+  metadata:
+    kedro-viz:
+      layer: raw
+      preview_args:
+        nrows: 15
+```
+
+## Previewing data on Kedro-Viz
+
+After you've configured the Data Catalog, you can preview the datasets on Kedro-Viz. Start Kedro-Viz by running the following command in your terminal:
+
+```bash
+kedro viz
+```
+
+The previews are shown as follows. Click on each dataset node to see a small preview in the metadata panel:
+
+![](../meta/images/preview_datasets_metadata.png)
+
+View the larger preview of the dataset by clicking the `Expand Preview Table` button at the bottom of the metadata panel.
+ + +![](../meta/images/preview_datasets_expanded.png) diff --git a/features/environment.py b/features/environment.py index c98246dc85..172dfd006a 100644 --- a/features/environment.py +++ b/features/environment.py @@ -1,5 +1,5 @@ """Behave environment setup commands.""" -# pylint: disable=unused-argument +# noqa: unused-argument from __future__ import annotations import os @@ -56,7 +56,6 @@ def _setup_context_with_venv(context, venv_dir): context.pip = str(bin_dir / "pip") context.python = str(bin_dir / "python") context.kedro = str(bin_dir / "kedro") - context.requirements_path = Path("dependency/requirements.txt").resolve() # clone the environment, remove any condas and venvs and insert our venv context.env = os.environ.copy() @@ -104,7 +103,9 @@ def _setup_minimal_env(context): "pip", "install", "-U", - "pip>=21.2", + # pip==23.2 breaks pip-tools<7.0, and pip-tools>=7.0 does not support Python 3.7 + "pip>=21.2,<23.2; python_version < '3.8'", + "pip>=21.2; python_version >= '3.8'", "setuptools>=65.5.1", "wheel", ], diff --git a/features/steps/cli_steps.py b/features/steps/cli_steps.py index 0008841de4..f5931828fa 100644 --- a/features/steps/cli_steps.py +++ b/features/steps/cli_steps.py @@ -11,6 +11,7 @@ import toml import yaml from behave import given, then, when +from packaging.requirements import Requirement import kedro from features.steps import util @@ -228,7 +229,7 @@ def add_test_jupyter_nb(context): """Create a test jupyter notebook using TEST_JUPYTER_ORG.""" with open( str(context.root_project_dir / "notebooks" / "hello_world.ipynb"), - "wt", + "w", encoding="utf-8", ) as test_nb_fh: test_nb_fh.write(TEST_JUPYTER_ORG) @@ -365,7 +366,7 @@ def simulate_nb_execution(context): """ with open( str(context.root_project_dir / "notebooks" / "hello_world.ipynb"), - "wt", + "w", encoding="utf-8", ) as test_nb_fh: test_nb_fh.write(TEST_JUPYTER_AFTER_EXEC) @@ -407,18 +408,16 @@ def update_pyproject_toml(context: behave.runner.Context, new_source_dir): @given("I have updated kedro requirements") def update_kedro_req(context: behave.runner.Context): - """Replace kedro as a standalone requirement with a line - that includes all of kedro's dependencies (-r kedro/requirements.txt) - """ + """Remove kedro as a standalone requirement.""" reqs_path = context.root_project_dir / "src" / "requirements.txt" - kedro_reqs = f"-r {context.requirements_path.as_posix()}" if reqs_path.is_file(): old_reqs = reqs_path.read_text().splitlines() new_reqs = [] for req in old_reqs: - if req.startswith("kedro"): - new_reqs.append(kedro_reqs) + if req.startswith("kedro") and Requirement(req).name.lower() == "kedro": + # Do not include kedro as it's preinstalled in the environment + pass else: new_reqs.append(req) new_reqs = "\n".join(new_reqs) @@ -555,7 +554,7 @@ def check_additional_cell_added(context): encoding="utf-8", ) as test_nb_fh: context.nb_data = json.load(test_nb_fh) - assert len(context.nb_data["cells"]) == 2 + assert len(context.nb_data["cells"]) == 2 # noqa: PLR2004 @then("the output should be empty in all the cells in the jupyter notebook") diff --git a/features/steps/sh_run.py b/features/steps/sh_run.py index 7e49e7a0ca..476cee7106 100644 --- a/features/steps/sh_run.py +++ b/features/steps/sh_run.py @@ -37,9 +37,7 @@ def run( if isinstance(cmd, str) and split: cmd = shlex.split(cmd) # pylint: disable=subprocess-run-check - result = subprocess.run( - cmd, input="", stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs - ) + result = subprocess.run(cmd, input="", capture_output=True, **kwargs) 
result.stdout = result.stdout.decode("utf-8") result.stderr = result.stderr.decode("utf-8") if print_output: diff --git a/features/steps/test_plugin/plugin.py b/features/steps/test_plugin/plugin.py index 02ef228b3e..277fb1f18e 100644 --- a/features/steps/test_plugin/plugin.py +++ b/features/steps/test_plugin/plugin.py @@ -11,9 +11,7 @@ class MyPluginHook: @hook_impl - def after_catalog_created( - self, catalog - ): # pylint: disable=unused-argument, no-self-use + def after_catalog_created(self, catalog): # noqa: unused-argument, no-self-use logger.info("Reached after_catalog_created hook") diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/nodes.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/nodes.py index 41efe547cd..9217c2d3b1 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/nodes.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_science/nodes.py @@ -3,7 +3,7 @@ Delete this when you start working on your own Kedro project. """ -# pylint: disable=invalid-name +# noqa: invalid-name from __future__ import annotations import logging diff --git a/features/windows_reqs.txt b/features/windows_reqs.txt index c41bf77a4d..9d9c461b56 100644 --- a/features/windows_reqs.txt +++ b/features/windows_reqs.txt @@ -1,4 +1,4 @@ -# same versions as `test_requirements` +# same versions as [test] optional requirements # e2e tests on Windows are slow but we don't need to install # everything, so just this subset will be enough for CI behave==1.2.6 @@ -7,3 +7,4 @@ psutil~=5.8 requests~=2.20 toml~=0.10.1 PyYAML>=4.2, <7.0 +packaging>=20.0 diff --git a/kedro/__init__.py b/kedro/__init__.py index be9febd329..7a7db37ae2 100644 --- a/kedro/__init__.py +++ b/kedro/__init__.py @@ -3,4 +3,26 @@ configuration and pipeline assembly. """ -__version__ = "0.18.11" +import sys +import warnings + +__version__ = "0.18.12" + + +class KedroPythonVersionWarning(UserWarning): + """Custom class for warnings about incompatibilities with Python versions.""" + + pass + + +if not sys.warnoptions: + warnings.simplefilter("error", KedroPythonVersionWarning) + +if sys.version_info >= (3, 12): + warnings.warn( + """Kedro is not yet fully compatible with this Python version. +To proceed at your own risk and ignore this warning, +run Kedro with `python -W "default:Kedro is not yet fully compatible" -m kedro ...` +or set the PYTHONWARNINGS environment variable accordingly.""", + KedroPythonVersionWarning, + ) diff --git a/kedro/config/common.py b/kedro/config/common.py index afa823cc9d..35bcdcda89 100644 --- a/kedro/config/common.py +++ b/kedro/config/common.py @@ -124,7 +124,7 @@ def _load_config_file( Parsed configuration. """ # for performance reasons - import anyconfig # pylint: disable=import-outside-toplevel + import anyconfig # noqa: import-outside-toplevel try: # Default to UTF-8, which is Python 3 default encoding, to decode the file @@ -230,7 +230,7 @@ def _check_duplicate_keys( if overlapping_keys: sorted_keys = ", ".join(sorted(overlapping_keys)) - if len(sorted_keys) > 100: + if len(sorted_keys) > 100: # noqa: PLR2004 sorted_keys = sorted_keys[:100] + "..." 
duplicates.append(f"{processed_file}: {sorted_keys}") diff --git a/kedro/config/config.py b/kedro/config/config.py index f50e4b8f52..1af44fc4ab 100644 --- a/kedro/config/config.py +++ b/kedro/config/config.py @@ -66,7 +66,7 @@ class ConfigLoader(AbstractConfigLoader): """ - def __init__( + def __init__( # noqa: too-many-arguments self, conf_source: str, env: str = None, diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index 75303e2902..4d2ace59d4 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -7,7 +7,7 @@ import logging import mimetypes from pathlib import Path -from typing import Any, Iterable +from typing import Any, Callable, Iterable import fsspec from omegaconf import OmegaConf @@ -73,7 +73,7 @@ class OmegaConfigLoader(AbstractConfigLoader): """ - def __init__( + def __init__( # noqa: too-many-arguments self, conf_source: str, env: str = None, @@ -82,6 +82,7 @@ def __init__( config_patterns: dict[str, list[str]] = None, base_env: str = "base", default_run_env: str = "local", + custom_resolvers: dict[str, Callable] = None, ): """Instantiates a ``OmegaConfigLoader``. @@ -97,6 +98,8 @@ def __init__( the configuration paths. default_run_env: Name of the default run environment. Defaults to `"local"`. Can be overridden by supplying the `env` argument. + custom_resolvers: A dictionary of custom resolvers to be registered. For more information, + see here: https://omegaconf.readthedocs.io/en/2.3_branch/custom_resolvers.html#custom-resolvers """ self.base_env = base_env self.default_run_env = default_run_env @@ -109,9 +112,11 @@ def __init__( } self.config_patterns.update(config_patterns or {}) - # In the first iteration of the OmegaConfigLoader we'll keep the resolver turned-off. - # It's easier to introduce them step by step, but removing them would be a breaking change. - self._clear_omegaconf_resolvers() + # Deactivate oc.env built-in resolver for OmegaConf + OmegaConf.clear_resolver("oc.env") + # Register user provided custom resolvers + if custom_resolvers: + self._register_new_resolvers(custom_resolvers) file_mimetype, _ = mimetypes.guess_type(conf_source) if file_mimetype == "application/x-tar": @@ -207,7 +212,7 @@ def __repr__(self): # pragma: no cover f"config_patterns={self.config_patterns})" ) - def load_and_merge_dir_config( # pylint: disable=too-many-arguments + def load_and_merge_dir_config( # noqa: too-many-arguments self, conf_path: str, patterns: Iterable[str], @@ -234,7 +239,7 @@ def load_and_merge_dir_config( # pylint: disable=too-many-arguments Resulting configuration dictionary. """ - # pylint: disable=too-many-locals + # noqa: too-many-locals if not self._fs.isdir(Path(conf_path).as_posix()): raise MissingConfigException( @@ -303,6 +308,15 @@ def _is_valid_config_path(self, path): ".json", ] + @staticmethod + def _register_new_resolvers(resolvers: dict[str, Callable]): + """Register custom resolvers""" + for name, resolver in resolvers.items(): + if not OmegaConf.has_resolver(name): + msg = f"Registering new custom resolver: {name}" + _config_logger.debug(msg) + OmegaConf.register_new_resolver(name=name, resolver=resolver) + @staticmethod def _check_duplicates(seen_files_to_keys: dict[Path, set[Any]]): duplicates = [] @@ -320,7 +334,7 @@ def _check_duplicates(seen_files_to_keys: dict[Path, set[Any]]): if overlapping_keys: sorted_keys = ", ".join(sorted(overlapping_keys)) - if len(sorted_keys) > 100: + if len(sorted_keys) > 100: # noqa: PLR2004 sorted_keys = sorted_keys[:100] + "..." 
duplicates.append( f"Duplicate keys found in {filepath1} and {filepath2}: {sorted_keys}" @@ -345,14 +359,3 @@ def _resolve_environment_variables(config: dict[str, Any]) -> None: OmegaConf.clear_resolver("oc.env") else: OmegaConf.resolve(config) - - @staticmethod - def _clear_omegaconf_resolvers(): - """Clear the built-in OmegaConf resolvers.""" - OmegaConf.clear_resolver("oc.env") - OmegaConf.clear_resolver("oc.create") - OmegaConf.clear_resolver("oc.deprecated") - OmegaConf.clear_resolver("oc.decode") - OmegaConf.clear_resolver("oc.select") - OmegaConf.clear_resolver("oc.dict.keys") - OmegaConf.clear_resolver("oc.dict.values") diff --git a/kedro/config/templated_config.py b/kedro/config/templated_config.py index b0e319ad8f..1c343ec41f 100644 --- a/kedro/config/templated_config.py +++ b/kedro/config/templated_config.py @@ -89,7 +89,7 @@ class TemplatedConfigLoader(AbstractConfigLoader): https://github.com/jmespath/jmespath.py and https://jmespath.org/. """ - def __init__( + def __init__( # noqa: too-many-arguments self, conf_source: str, env: str = None, @@ -264,7 +264,7 @@ def _format_string(match): f"'{formatted_key}' found" ) - key = formatted_key + key = formatted_key # noqa: PLW2901 new_dict[key] = _format_object(value, format_dict) diff --git a/kedro/extras/datasets/api/api_dataset.py b/kedro/extras/datasets/api/api_dataset.py index 31eacf10b9..f288c96814 100644 --- a/kedro/extras/datasets/api/api_dataset.py +++ b/kedro/extras/datasets/api/api_dataset.py @@ -56,8 +56,7 @@ class APIDataSet(AbstractDataset[None, requests.Response]): >>> data = data_set.load() """ - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, url: str, method: str = "GET", diff --git a/kedro/extras/datasets/biosequence/biosequence_dataset.py b/kedro/extras/datasets/biosequence/biosequence_dataset.py index f15b8528a4..ac0770aa68 100644 --- a/kedro/extras/datasets/biosequence/biosequence_dataset.py +++ b/kedro/extras/datasets/biosequence/biosequence_dataset.py @@ -44,8 +44,7 @@ class BioSequenceDataSet(AbstractDataset[List, List]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/dask/parquet_dataset.py b/kedro/extras/datasets/dask/parquet_dataset.py index 1606996790..23dc7a701b 100644 --- a/kedro/extras/datasets/dask/parquet_dataset.py +++ b/kedro/extras/datasets/dask/parquet_dataset.py @@ -91,8 +91,7 @@ class ParquetDataSet(AbstractDataset[dd.DataFrame, dd.DataFrame]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {"write_index": False} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/email/message_dataset.py b/kedro/extras/datasets/email/message_dataset.py index 0c502f0b44..695d93cbbe 100644 --- a/kedro/extras/datasets/email/message_dataset.py +++ b/kedro/extras/datasets/email/message_dataset.py @@ -60,8 +60,7 @@ class EmailMessageDataSet( DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/geopandas/geojson_dataset.py 
b/kedro/extras/datasets/geopandas/geojson_dataset.py index bf489e7cb2..5beba29d57 100644 --- a/kedro/extras/datasets/geopandas/geojson_dataset.py +++ b/kedro/extras/datasets/geopandas/geojson_dataset.py @@ -52,8 +52,7 @@ class GeoJSONDataSet( DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {"driver": "GeoJSON"} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/holoviews/holoviews_writer.py b/kedro/extras/datasets/holoviews/holoviews_writer.py index f1993bd374..34daeb1769 100644 --- a/kedro/extras/datasets/holoviews/holoviews_writer.py +++ b/kedro/extras/datasets/holoviews/holoviews_writer.py @@ -44,8 +44,7 @@ class HoloviewsWriter(AbstractVersionedDataset[HoloViews, NoReturn]): DEFAULT_SAVE_ARGS = {"fmt": "png"} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, fs_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/json/json_dataset.py b/kedro/extras/datasets/json/json_dataset.py index e6ada41411..5e05dd46ed 100644 --- a/kedro/extras/datasets/json/json_dataset.py +++ b/kedro/extras/datasets/json/json_dataset.py @@ -56,8 +56,7 @@ class JSONDataSet(AbstractVersionedDataset[Any, Any]): DEFAULT_SAVE_ARGS = {"indent": 2} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, save_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/matplotlib/matplotlib_writer.py b/kedro/extras/datasets/matplotlib/matplotlib_writer.py index 24b8d08d4b..204e4673c5 100644 --- a/kedro/extras/datasets/matplotlib/matplotlib_writer.py +++ b/kedro/extras/datasets/matplotlib/matplotlib_writer.py @@ -111,8 +111,7 @@ class MatplotlibWriter( DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, fs_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/networkx/gml_dataset.py b/kedro/extras/datasets/networkx/gml_dataset.py index 251bacc371..a56ddbe7ba 100644 --- a/kedro/extras/datasets/networkx/gml_dataset.py +++ b/kedro/extras/datasets/networkx/gml_dataset.py @@ -44,8 +44,7 @@ class GMLDataSet(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/networkx/graphml_dataset.py b/kedro/extras/datasets/networkx/graphml_dataset.py index 438fe3d089..368459958f 100644 --- a/kedro/extras/datasets/networkx/graphml_dataset.py +++ b/kedro/extras/datasets/networkx/graphml_dataset.py @@ -43,8 +43,7 @@ class GraphMLDataSet(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/networkx/json_dataset.py b/kedro/extras/datasets/networkx/json_dataset.py index bb369ebb30..60db837a91 100644 --- a/kedro/extras/datasets/networkx/json_dataset.py +++ b/kedro/extras/datasets/networkx/json_dataset.py @@ -44,8 +44,7 @@ 
class JSONDataSet(AbstractVersionedDataset[networkx.Graph, networkx.Graph]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/pandas/csv_dataset.py b/kedro/extras/datasets/pandas/csv_dataset.py index f7c92f6f9e..01b044969c 100644 --- a/kedro/extras/datasets/pandas/csv_dataset.py +++ b/kedro/extras/datasets/pandas/csv_dataset.py @@ -73,8 +73,7 @@ class CSVDataSet(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {"index": False} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/pandas/excel_dataset.py b/kedro/extras/datasets/pandas/excel_dataset.py index 49fa54d2da..21139c7ca9 100644 --- a/kedro/extras/datasets/pandas/excel_dataset.py +++ b/kedro/extras/datasets/pandas/excel_dataset.py @@ -113,8 +113,7 @@ class ExcelDataSet( DEFAULT_LOAD_ARGS = {"engine": "openpyxl"} DEFAULT_SAVE_ARGS = {"index": False} - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, engine: str = "openpyxl", diff --git a/kedro/extras/datasets/pandas/feather_dataset.py b/kedro/extras/datasets/pandas/feather_dataset.py index ef480f0173..b43ecc1814 100644 --- a/kedro/extras/datasets/pandas/feather_dataset.py +++ b/kedro/extras/datasets/pandas/feather_dataset.py @@ -73,8 +73,7 @@ class FeatherDataSet(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/pandas/gbq_dataset.py b/kedro/extras/datasets/pandas/gbq_dataset.py index 90390a27d9..16cea01213 100644 --- a/kedro/extras/datasets/pandas/gbq_dataset.py +++ b/kedro/extras/datasets/pandas/gbq_dataset.py @@ -70,8 +70,7 @@ class GBQTableDataSet(AbstractDataset[None, pd.DataFrame]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {"progress_bar": False} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, dataset: str, table_name: str, @@ -210,8 +209,7 @@ class GBQQueryDataSet(AbstractDataset[None, pd.DataFrame]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, sql: str = None, project: str = None, diff --git a/kedro/extras/datasets/pandas/generic_dataset.py b/kedro/extras/datasets/pandas/generic_dataset.py index 389713f284..7212310e8f 100644 --- a/kedro/extras/datasets/pandas/generic_dataset.py +++ b/kedro/extras/datasets/pandas/generic_dataset.py @@ -90,8 +90,7 @@ class GenericDataSet(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, file_format: str, diff --git a/kedro/extras/datasets/pandas/hdf_dataset.py b/kedro/extras/datasets/pandas/hdf_dataset.py index 
d48aabeede..0d337af42d 100644 --- a/kedro/extras/datasets/pandas/hdf_dataset.py +++ b/kedro/extras/datasets/pandas/hdf_dataset.py @@ -64,8 +64,7 @@ class HDFDataSet(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, key: str, diff --git a/kedro/extras/datasets/pandas/json_dataset.py b/kedro/extras/datasets/pandas/json_dataset.py index 2cad456c20..8148d325c5 100644 --- a/kedro/extras/datasets/pandas/json_dataset.py +++ b/kedro/extras/datasets/pandas/json_dataset.py @@ -68,8 +68,7 @@ class JSONDataSet(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/pandas/parquet_dataset.py b/kedro/extras/datasets/pandas/parquet_dataset.py index bf24eb1dec..4bdba28772 100644 --- a/kedro/extras/datasets/pandas/parquet_dataset.py +++ b/kedro/extras/datasets/pandas/parquet_dataset.py @@ -80,8 +80,7 @@ class ParquetDataSet(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/pandas/sql_dataset.py b/kedro/extras/datasets/pandas/sql_dataset.py index b443788581..373663ce84 100644 --- a/kedro/extras/datasets/pandas/sql_dataset.py +++ b/kedro/extras/datasets/pandas/sql_dataset.py @@ -334,7 +334,7 @@ class SQLQueryDataSet(AbstractDataset[None, pd.DataFrame]): # sqlalchemy.engine.Engine or sqlalchemy.engine.base.Engine engines: Dict[str, Any] = {} - def __init__( # pylint: disable=too-many-arguments + def __init__( # noqa: too-many-arguments self, sql: str = None, credentials: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/pandas/xml_dataset.py b/kedro/extras/datasets/pandas/xml_dataset.py index 451d64c721..ad91b4ad4b 100644 --- a/kedro/extras/datasets/pandas/xml_dataset.py +++ b/kedro/extras/datasets/pandas/xml_dataset.py @@ -51,8 +51,7 @@ class XMLDataSet(AbstractVersionedDataset[pd.DataFrame, pd.DataFrame]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {"index": False} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/pickle/pickle_dataset.py b/kedro/extras/datasets/pickle/pickle_dataset.py index 6fd1c3c9d3..19bda78f96 100644 --- a/kedro/extras/datasets/pickle/pickle_dataset.py +++ b/kedro/extras/datasets/pickle/pickle_dataset.py @@ -76,8 +76,7 @@ class PickleDataSet(AbstractVersionedDataset[Any, Any]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments,too-many-locals - def __init__( + def __init__( # noqa: too-many-arguments,too-many-locals self, filepath: str, backend: str = "pickle", diff --git a/kedro/extras/datasets/pillow/image_dataset.py b/kedro/extras/datasets/pillow/image_dataset.py index 6d28979b7a..1244035df1 100644 --- a/kedro/extras/datasets/pillow/image_dataset.py 
+++ b/kedro/extras/datasets/pillow/image_dataset.py @@ -40,8 +40,7 @@ class ImageDataSet(AbstractVersionedDataset[Image.Image, Image.Image]): DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, save_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/plotly/json_dataset.py b/kedro/extras/datasets/plotly/json_dataset.py index b8a4f6ca6a..5fa555d665 100644 --- a/kedro/extras/datasets/plotly/json_dataset.py +++ b/kedro/extras/datasets/plotly/json_dataset.py @@ -57,8 +57,7 @@ class JSONDataSet( DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/plotly/plotly_dataset.py b/kedro/extras/datasets/plotly/plotly_dataset.py index 97da08f68f..68dc27b012 100644 --- a/kedro/extras/datasets/plotly/plotly_dataset.py +++ b/kedro/extras/datasets/plotly/plotly_dataset.py @@ -70,8 +70,7 @@ class PlotlyDataSet(JSONDataSet): """ - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, plotly_args: Dict[str, Any], diff --git a/kedro/extras/datasets/redis/redis_dataset.py b/kedro/extras/datasets/redis/redis_dataset.py index 3b7fcfdded..bac3a15b65 100644 --- a/kedro/extras/datasets/redis/redis_dataset.py +++ b/kedro/extras/datasets/redis/redis_dataset.py @@ -64,8 +64,7 @@ class PickleDataSet(AbstractDataset[Any, Any]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, key: str, backend: str = "pickle", diff --git a/kedro/extras/datasets/spark/deltatable_dataset.py b/kedro/extras/datasets/spark/deltatable_dataset.py index ae82bb537e..0f6655ac8c 100644 --- a/kedro/extras/datasets/spark/deltatable_dataset.py +++ b/kedro/extras/datasets/spark/deltatable_dataset.py @@ -100,7 +100,12 @@ def _exists(self) -> bool: try: self._get_spark().read.load(path=load_path, format="delta") except AnalysisException as exception: - if "is not a Delta table" in exception.desc: + # `AnalysisException.desc` is deprecated with pyspark >= 3.4 + message = ( + exception.desc if hasattr(exception, "desc") else exception.message + ) + + if "Path does not exist:" in message or "is not a Delta table" in message: return False raise diff --git a/kedro/extras/datasets/spark/spark_dataset.py b/kedro/extras/datasets/spark/spark_dataset.py index 127968af86..317e173d24 100644 --- a/kedro/extras/datasets/spark/spark_dataset.py +++ b/kedro/extras/datasets/spark/spark_dataset.py @@ -41,7 +41,8 @@ def _parse_glob_pattern(pattern: str) -> str: def _split_filepath(filepath: str) -> Tuple[str, str]: split_ = filepath.split("://", 1) - if len(split_) == 2: + MIN_SPLIT_SIZE = 2 + if len(split_) == MIN_SPLIT_SIZE: return split_[0] + "://", split_[1] return "", split_[0] @@ -232,7 +233,7 @@ class SparkDataSet(AbstractVersionedDataset[DataFrame, DataFrame]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - def __init__( # pylint: disable=too-many-arguments + def __init__( # ruff: noqa: PLR0913 self, filepath: str, file_format: str = "parquet", @@ -401,10 +402,11 @@ def _exists(self) -> bool: try: self._get_spark().read.load(load_path, self._file_format) except 
AnalysisException as exception: - if ( - exception.desc.startswith("Path does not exist:") - or "is not a Delta table" in exception.desc - ): + # `AnalysisException.desc` is deprecated with pyspark >= 3.4 + message = ( + exception.desc if hasattr(exception, "desc") else exception.message + ) + if "Path does not exist:" in message or "is not a Delta table" in message: return False raise return True diff --git a/kedro/extras/datasets/spark/spark_hive_dataset.py b/kedro/extras/datasets/spark/spark_hive_dataset.py index e1e50e3803..2abbd1f166 100644 --- a/kedro/extras/datasets/spark/spark_hive_dataset.py +++ b/kedro/extras/datasets/spark/spark_hive_dataset.py @@ -71,8 +71,7 @@ class SparkHiveDataSet(AbstractDataset[DataFrame, DataFrame]): DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint:disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, database: str, table: str, @@ -210,7 +209,7 @@ def _validate_save(self, data: DataFrame): ) def _exists(self) -> bool: - # noqa # pylint:disable=protected-access + # noqa # noqa: protected-access return ( self._get_spark() ._jsparkSession.catalog() diff --git a/kedro/extras/datasets/spark/spark_jdbc_dataset.py b/kedro/extras/datasets/spark/spark_jdbc_dataset.py index a3b253107a..3abeeb312a 100644 --- a/kedro/extras/datasets/spark/spark_jdbc_dataset.py +++ b/kedro/extras/datasets/spark/spark_jdbc_dataset.py @@ -71,8 +71,7 @@ class SparkJDBCDataSet(AbstractDataset[DataFrame, DataFrame]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, url: str, table: str, @@ -169,7 +168,7 @@ def _describe(self) -> Dict[str, Any]: } @staticmethod - def _get_spark(): + def _get_spark(): # pragma: no cover return SparkSession.builder.getOrCreate() def _load(self) -> DataFrame: diff --git a/kedro/extras/datasets/svmlight/svmlight_dataset.py b/kedro/extras/datasets/svmlight/svmlight_dataset.py index 6038a6525f..697253ef2a 100644 --- a/kedro/extras/datasets/svmlight/svmlight_dataset.py +++ b/kedro/extras/datasets/svmlight/svmlight_dataset.py @@ -90,8 +90,7 @@ class SVMLightDataSet(AbstractVersionedDataset[_DI, _DO]): DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, diff --git a/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py b/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py index 9f971a1a47..c0e916d01f 100644 --- a/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py @@ -68,8 +68,7 @@ class TensorFlowModelDataset(AbstractVersionedDataset[tf.keras.Model, tf.keras.M DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] DEFAULT_SAVE_ARGS = {"save_format": "tf"} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, load_args: Dict[str, Any] = None, @@ -134,7 +133,9 @@ def _load(self) -> tf.keras.Model: with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as path: if self._is_h5: - path = str(PurePath(path) / TEMPORARY_H5_FILE) + path = str( # noqa: PLW2901 + PurePath(path) / TEMPORARY_H5_FILE + ) # noqa: redefined-loop-name self._fs.copy(load_path, path) else: self._fs.get(load_path, path, recursive=True) @@ -153,7 
+154,9 @@ def _save(self, data: tf.keras.Model) -> None: with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as path: if self._is_h5: - path = str(PurePath(path) / TEMPORARY_H5_FILE) + path = str( # noqa: PLW2901 + PurePath(path) / TEMPORARY_H5_FILE + ) # noqa: redefined-loop-name tf.keras.models.save_model(data, path, **self._save_args) diff --git a/kedro/extras/datasets/yaml/yaml_dataset.py b/kedro/extras/datasets/yaml/yaml_dataset.py index 244b0fdffd..7ea2760cdf 100644 --- a/kedro/extras/datasets/yaml/yaml_dataset.py +++ b/kedro/extras/datasets/yaml/yaml_dataset.py @@ -53,8 +53,7 @@ class YAMLDataSet(AbstractVersionedDataset[Dict, Dict]): DEFAULT_SAVE_ARGS = {"default_flow_style": False} # type: Dict[str, Any] - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, filepath: str, save_args: Dict[str, Any] = None, diff --git a/kedro/extras/extensions/ipython.py b/kedro/extras/extensions/ipython.py index 2e38dc3772..ee700b571e 100644 --- a/kedro/extras/extensions/ipython.py +++ b/kedro/extras/extensions/ipython.py @@ -10,7 +10,7 @@ """ import warnings -from ...ipython import ( # noqa # pylint: disable=unused-import +from ...ipython import ( # noqa # noqa: unused-import load_ipython_extension, reload_kedro, ) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 2fa9528eb2..7bfa5f54cb 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -1,5 +1,6 @@ """A collection of CLI commands for working with Kedro catalog.""" from collections import defaultdict +from itertools import chain import click import yaml @@ -21,7 +22,7 @@ def _create_session(package_name: str, **kwargs): ) from exc -# pylint: disable=missing-function-docstring +# noqa: missing-function-docstring @click.group(name="Kedro") def catalog_cli(): # pragma: no cover pass @@ -32,7 +33,7 @@ def catalog(): """Commands for working with catalog.""" -# pylint: disable=too-many-locals +# noqa: too-many-locals,protected-access @catalog.command("list") @env_option @click.option( @@ -50,11 +51,14 @@ def list_datasets(metadata: ProjectMetadata, pipeline, env): title = "Datasets in '{}' pipeline" not_mentioned = "Datasets not mentioned in pipeline" mentioned = "Datasets mentioned in pipeline" + factories = "Datasets generated from factories" session = _create_session(metadata.package_name, env=env) context = session.load_context() - datasets_meta = context.catalog._data_sets # pylint: disable=protected-access - catalog_ds = set(context.catalog.list()) + + data_catalog = context.catalog + datasets_meta = data_catalog._data_sets + catalog_ds = set(data_catalog.list()) target_pipelines = pipeline or pipelines.keys() @@ -73,15 +77,30 @@ def list_datasets(metadata: ProjectMetadata, pipeline, env): default_ds = pipeline_ds - catalog_ds used_ds = catalog_ds - unused_ds + # resolve any factory datasets in the pipeline + factory_ds_by_type = defaultdict(list) + for ds_name in default_ds: + matched_pattern = data_catalog._match_pattern( + data_catalog._dataset_patterns, ds_name + ) + if matched_pattern: + ds_config = data_catalog._resolve_config(ds_name, matched_pattern) + factory_ds_by_type[ds_config["type"]].append(ds_name) + + default_ds = default_ds - set(chain.from_iterable(factory_ds_by_type.values())) + unused_by_type = _map_type_to_datasets(unused_ds, datasets_meta) used_by_type = _map_type_to_datasets(used_ds, datasets_meta) if default_ds: used_by_type["DefaultDataset"].extend(default_ds) - data = ((not_mentioned, 
dict(unused_by_type)), (mentioned, dict(used_by_type))) + data = ( + (mentioned, dict(used_by_type)), + (factories, dict(factory_ds_by_type)), + (not_mentioned, dict(unused_by_type)), + ) result[title.format(pipe)] = {key: value for key, value in data if value} - secho(yaml.dump(result)) @@ -140,7 +159,7 @@ def create_catalog(metadata: ProjectMetadata, pipeline_name, env): catalog_datasets = { ds_name - for ds_name in context.catalog._data_sets.keys() # pylint: disable=protected-access + for ds_name in context.catalog._data_sets.keys() # noqa: protected-access if not ds_name.startswith("params:") and ds_name != "parameters" } @@ -151,8 +170,7 @@ def create_catalog(metadata: ProjectMetadata, pipeline_name, env): context.project_path / settings.CONF_SOURCE / env - / "catalog" - / f"{pipeline_name}.yml" + / f"catalog_{pipeline_name}.yml" ) _add_missing_datasets_to_catalog(missing_ds, catalog_path) click.echo(f"Data Catalog YAML configuration was created: {catalog_path}") @@ -174,3 +192,18 @@ def _add_missing_datasets_to_catalog(missing_ds, catalog_path): catalog_path.parent.mkdir(exist_ok=True) with catalog_path.open(mode="w") as catalog_file: yaml.safe_dump(catalog_config, catalog_file, default_flow_style=False) + + +@catalog.command("rank") +@env_option +@click.pass_obj +def rank_catalog_factories(metadata: ProjectMetadata, env): + """List all dataset factories in the catalog, ranked by priority by which they are matched.""" + session = _create_session(metadata.package_name, env=env) + context = session.load_context() + + catalog_factories = context.catalog._dataset_patterns + if catalog_factories: + click.echo(yaml.dump(list(catalog_factories.keys()))) + else: + click.echo("There are no dataset factories in the catalog.") diff --git a/kedro/framework/cli/cli.py b/kedro/framework/cli/cli.py index c1e8e33c9f..03c9743500 100644 --- a/kedro/framework/cli/cli.py +++ b/kedro/framework/cli/cli.py @@ -28,7 +28,7 @@ _get_entry_points, load_entry_points, ) -from kedro.framework.project import LOGGING # noqa # pylint:disable=unused-import +from kedro.framework.project import LOGGING # noqa # noqa: unused-import from kedro.framework.startup import _is_project, bootstrap_project LOGO = rf""" @@ -161,7 +161,7 @@ def global_groups(self) -> Sequence[click.MultiCommand]: @property def project_groups(self) -> Sequence[click.MultiCommand]: - # pylint: disable=line-too-long + # noqa: line-too-long """Property which loads all project command groups from the project and the plugins, then combines them with the built-in ones. 
Built-in commands can be overridden by plugins, which can be diff --git a/kedro/framework/cli/hooks/manager.py b/kedro/framework/cli/hooks/manager.py index 19457af690..a1be3e5784 100644 --- a/kedro/framework/cli/hooks/manager.py +++ b/kedro/framework/cli/hooks/manager.py @@ -1,5 +1,5 @@ """This module defines a dedicated hook manager for hooks that extends Kedro CLI behaviour.""" -# pylint: disable=global-statement,invalid-name +# noqa: global-statement,invalid-name import logging from pluggy import PluginManager @@ -16,7 +16,7 @@ def get_cli_hook_manager(): """Create or return the global _hook_manager singleton instance.""" - global _cli_hook_manager + global _cli_hook_manager # noqa: PLW0603 if _cli_hook_manager is None: _cli_hook_manager = CLIHooksManager() _cli_hook_manager.trace.root.setwriter(logger.debug) diff --git a/kedro/framework/cli/jupyter.py b/kedro/framework/cli/jupyter.py index 68eb8b2c85..e7cfbc166e 100644 --- a/kedro/framework/cli/jupyter.py +++ b/kedro/framework/cli/jupyter.py @@ -42,7 +42,7 @@ def list_commands(self, ctx): return ["setup", "notebook", "lab", "convert"] -# pylint: disable=missing-function-docstring +# noqa: missing-function-docstring @click.group(name="Kedro") def jupyter_cli(): # pragma: no cover pass @@ -57,7 +57,7 @@ def jupyter(): @forward_command(jupyter, "setup", forward_help=True) @click.pass_obj # this will pass the metadata as first argument -def setup(metadata: ProjectMetadata, args, **kwargs): # pylint: disable=unused-argument +def setup(metadata: ProjectMetadata, args, **kwargs): # noqa: unused-argument """Initialise the Jupyter Kernel for a kedro project.""" _check_module_importable("ipykernel") validate_settings() @@ -75,7 +75,7 @@ def jupyter_notebook( env, args, **kwargs, -): # pylint: disable=unused-argument +): # noqa: unused-argument """Open Jupyter Notebook with project specific variables loaded.""" _check_module_importable("notebook") validate_settings() @@ -101,7 +101,7 @@ def jupyter_lab( env, args, **kwargs, -): # pylint: disable=unused-argument +): # noqa: unused-argument """Open Jupyter Lab with project specific variables loaded.""" _check_module_importable("jupyterlab") validate_settings() @@ -160,7 +160,7 @@ def _create_kernel(kernel_name: str, display_name: str) -> str: """ # These packages are required by jupyter lab and notebook, which we have already # checked are importable, so we don't run _check_module_importable on them. 
- # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel from ipykernel.kernelspec import install try: @@ -205,7 +205,7 @@ def _create_kernel(kernel_name: str, display_name: str) -> str: @click.pass_obj # this will pass the metadata as first argument def convert_notebook( metadata: ProjectMetadata, all_flag, overwrite_flag, filepath, env, **kwargs -): # pylint: disable=unused-argument, too-many-locals +): # noqa: unused-argument, too-many-locals """Convert selected or all notebooks found in a Kedro project to Kedro code, by exporting code from the appropriately-tagged cells: Cells tagged as `node` will be copied over to a Python file matching diff --git a/kedro/framework/cli/micropkg.py b/kedro/framework/cli/micropkg.py index c9bec3f255..36d103a332 100644 --- a/kedro/framework/cli/micropkg.py +++ b/kedro/framework/cli/micropkg.py @@ -1,6 +1,8 @@ """A collection of CLI commands for working with Kedro micro-packages.""" +# ruff: noqa: I001 # https://github.com/kedro-org/kedro/pull/2634 from __future__ import annotations +import logging import re import shutil import sys @@ -36,19 +38,23 @@ ) from kedro.framework.startup import ProjectMetadata -_SETUP_PY_TEMPLATE = """# -*- coding: utf-8 -*- -from setuptools import setup, find_packages +_PYPROJECT_TOML_TEMPLATE = """ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" -setup( - name="{name}", - version="{version}", - description="Micro-package `{name}`", - packages=find_packages(), - include_package_data=True, - install_requires={install_requires}, -) +[project] +name = "{name}" +version = "{version}" +description = "Micro-package `{name}`" +dependencies = {install_requires} + +[tool.setuptools.packages] +find = {{}} """ +logger = logging.getLogger(__name__) + class _EquivalentRequirement(Requirement): """Parse a requirement according to PEP 508. @@ -99,7 +105,7 @@ def __eq__(self, other: Any) -> bool: ) -def _check_module_path(ctx, param, value): # pylint: disable=unused-argument +def _check_module_path(ctx, param, value): # noqa: unused-argument if value and not re.match(r"^[\w.]+$", value): message = ( "The micro-package location you provided is not a valid Python module path" @@ -108,7 +114,7 @@ def _check_module_path(ctx, param, value): # pylint: disable=unused-argument return value -# pylint: disable=missing-function-docstring +# noqa: missing-function-docstring @click.group(name="Kedro") def micropkg_cli(): # pragma: no cover pass @@ -148,7 +154,7 @@ def micropkg(): help="Location of a configuration file for the fsspec filesystem used to pull the package.", ) @click.pass_obj # this will pass the metadata as first argument -def pull_package( # pylint:disable=unused-argument, too-many-arguments +def pull_package( # noqa: unused-argument, too-many-arguments metadata: ProjectMetadata, package_path, env, @@ -183,8 +189,7 @@ def pull_package( # pylint:disable=unused-argument, too-many-arguments click.secho(message, fg="green") -# pylint: disable=too-many-arguments -def _pull_package( +def _pull_package( # noqa: too-many-arguments package_path: str, metadata: ProjectMetadata, env: str = None, @@ -212,7 +217,7 @@ def _pull_package( # without making assumptions on the project metadata. 
library_meta = project_wheel_metadata(project_root_dir) - # Project name will be `my-pipeline` even if `setup.py` says `my_pipeline` + # Project name will be `my-pipeline` even if `pyproject.toml` says `my_pipeline` # because standards mandate normalization of names for comparison, # see https://packaging.python.org/en/latest/specifications/core-metadata/#name # The proper way to get it would be @@ -253,7 +258,7 @@ def _pull_package( def _pull_packages_from_manifest(metadata: ProjectMetadata) -> None: - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel import anyconfig # for performance reasons config_dict = anyconfig.load(metadata.config_file) @@ -277,7 +282,7 @@ def _pull_packages_from_manifest(metadata: ProjectMetadata) -> None: def _package_micropkgs_from_manifest(metadata: ProjectMetadata) -> None: - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel import anyconfig # for performance reasons config_dict = anyconfig.load(metadata.config_file) @@ -326,7 +331,7 @@ def _package_micropkgs_from_manifest(metadata: ProjectMetadata) -> None: ) @click.argument("module_path", nargs=1, required=False, callback=_check_module_path) @click.pass_obj # this will pass the metadata as first argument -def package_micropkg( +def package_micropkg( # noqa: too-many-arguments metadata: ProjectMetadata, module_path, env, @@ -334,7 +339,7 @@ def package_micropkg( destination, all_flag, **kwargs, -): # pylint: disable=unused-argument +): """Package up a modular pipeline or micro-package as a Python source distribution.""" if not module_path and not all_flag: click.secho( @@ -360,7 +365,7 @@ def package_micropkg( def _get_fsspec_filesystem(location: str, fs_args: str | None): - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel import anyconfig import fsspec @@ -371,7 +376,7 @@ def _get_fsspec_filesystem(location: str, fs_args: str | None): try: return fsspec.filesystem(protocol, **fs_args_config) - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: # noqa: broad-except # Specified protocol is not supported by `fsspec` # or requires extra dependencies click.secho(str(exc), fg="red") @@ -389,7 +394,7 @@ def safe_extract(tar, path): for member in tar.getmembers(): member_path = path / member.name if not _is_within_directory(path, member_path): - # pylint: disable=broad-exception-raised + # noqa: broad-exception-raised raise Exception("Failed to safely extract tar file.") tar.extractall(path) # nosec B202 @@ -438,7 +443,7 @@ def _rename_files(conf_source: Path, old_name: str, new_name: str): config_file.rename(config_file.parent / new_config_name) -def _refactor_code_for_unpacking( +def _refactor_code_for_unpacking( # noqa: too-many-arguments project: Project, package_path: Path, tests_path: Path, @@ -519,7 +524,7 @@ def _move_package_with_conflicting_name( return refactored_package_path, refactored_tests_path -def _install_files( # pylint: disable=too-many-arguments, too-many-locals +def _install_files( # noqa: too-many-arguments, too-many-locals project_metadata: ProjectMetadata, package_name: str, source_path: Path, @@ -590,6 +595,12 @@ def _get_default_version(metadata: ProjectMetadata, micropkg_module_path: str) - ) return micropkg_module.__version__ # type: ignore except (AttributeError, ModuleNotFoundError): + logger.warning( + "Micropackage version not found in '%s.%s', will take the top-level one in '%s'", + metadata.package_name, + micropkg_module_path, + metadata.package_name, + ) # if micropkg 
version doesn't exist, take the project one project_module = import_module(f"{metadata.package_name}") return project_module.__version__ # type: ignore @@ -611,9 +622,16 @@ def _package_micropkg( ) # as the source distribution will only contain parameters, we aren't listing other # config files not to confuse users and avoid useless file copies + # collect configs to package not only from parameters folder, but from core conf folder also + # because parameters had been moved from foldername to yml filename configs_to_package = _find_config_files( package_conf, - [f"parameters*/**/{micropkg_name}.yml", f"parameters*/**/{micropkg_name}/**/*"], + [ + f"**/parameters_{micropkg_name}.yml", + f"**/{micropkg_name}/**/*", + f"parameters*/**/{micropkg_name}.yml", + f"parameters*/**/{micropkg_name}/**/*", + ], ) source_paths = (package_source, package_tests, configs_to_package) @@ -809,8 +827,7 @@ def _move_package_with_conflicting_name(target: Path, conflicting_name: str): _SourcePathType = Union[Path, List[Tuple[Path, str]]] -# pylint: disable=too-many-arguments,too-many-locals -def _generate_sdist_file( +def _generate_sdist_file( # noqa: too-many-arguments,too-many-locals micropkg_name: str, destination: Path, source_paths: tuple[_SourcePathType, ...], @@ -836,7 +853,7 @@ def _generate_sdist_file( if conf_target.is_dir() and alias: _rename_files(conf_target, micropkg_name, alias) - # Build a setup.py on the fly + # Build a pyproject.toml on the fly try: install_requires = _make_install_requires( package_source / "requirements.txt" # type: ignore @@ -847,7 +864,7 @@ def _generate_sdist_file( raise KedroCliError(f"{cls.__module__}.{cls.__qualname__}: {exc}") from exc _generate_manifest_file(temp_dir_path) - _generate_setup_file(package_name, version, install_requires, temp_dir_path) + _generate_pyproject_file(package_name, version, install_requires, temp_dir_path) package_file = destination / _get_sdist_name(name=package_name, version=version) @@ -883,19 +900,19 @@ def _generate_manifest_file(output_dir: Path): ) -def _generate_setup_file( +def _generate_pyproject_file( package_name: str, version: str, install_requires: list[str], output_dir: Path ) -> Path: - setup_file = output_dir / "setup.py" + pyproject_file = output_dir / "pyproject.toml" - setup_file_context = { + pyproject_file_context = { "name": package_name, "version": version, "install_requires": install_requires, } - setup_file.write_text(_SETUP_PY_TEMPLATE.format(**setup_file_context)) - return setup_file + pyproject_file.write_text(_PYPROJECT_TOML_TEMPLATE.format(**pyproject_file_context)) + return pyproject_file def _get_package_artifacts( diff --git a/kedro/framework/cli/pipeline.py b/kedro/framework/cli/pipeline.py index 225ad5fd74..d3d9b2d2fd 100644 --- a/kedro/framework/cli/pipeline.py +++ b/kedro/framework/cli/pipeline.py @@ -56,7 +56,7 @@ def _assert_pkg_name_ok(pkg_name: str): if not re.match(r"^[a-zA-Z_]", pkg_name): message = base_message + " It must start with a letter or underscore." raise KedroCliError(message) - if len(pkg_name) < 2: + if len(pkg_name) < 2: # noqa: PLR2004 message = base_message + " It must be at least 2 characters long." 
raise KedroCliError(message) if not re.match(r"^\w+$", pkg_name[1:]): @@ -66,13 +66,13 @@ def _assert_pkg_name_ok(pkg_name: str): raise KedroCliError(message) -def _check_pipeline_name(ctx, param, value): # pylint: disable=unused-argument +def _check_pipeline_name(ctx, param, value): # noqa: unused-argument if value: _assert_pkg_name_ok(value) return value -# pylint: disable=missing-function-docstring +# noqa: missing-function-docstring @click.group(name="Kedro") def pipeline_cli(): # pragma: no cover pass @@ -94,7 +94,7 @@ def pipeline(): @click.pass_obj # this will pass the metadata as first argument def create_pipeline( metadata: ProjectMetadata, name, skip_config, env, **kwargs -): # pylint: disable=unused-argument +): # noqa: unused-argument """Create a new modular pipeline by providing a name.""" package_dir = metadata.source_dir / metadata.package_name conf_source = settings.CONF_SOURCE @@ -124,7 +124,7 @@ def create_pipeline( @click.pass_obj # this will pass the metadata as first argument def delete_pipeline( metadata: ProjectMetadata, name, env, yes, **kwargs -): # pylint: disable=unused-argument +): # noqa: unused-argument """Delete a modular pipeline by providing a name.""" package_dir = metadata.source_dir / metadata.package_name conf_source = settings.CONF_SOURCE @@ -140,10 +140,14 @@ def delete_pipeline( pipeline_artifacts = _get_pipeline_artifacts(metadata, pipeline_name=name, env=env) files_to_delete = [ - pipeline_artifacts.pipeline_conf / confdir / f"{name}.yml" + pipeline_artifacts.pipeline_conf / filepath for confdir in ("parameters", "catalog") - if (pipeline_artifacts.pipeline_conf / confdir / f"{name}.yml").is_file() + # Since we remove nesting in 'parameters' and 'catalog' folders, + # we want to also del the old project's structure for backward compatibility + for filepath in (Path(f"{confdir}_{name}.yml"), Path(confdir) / f"{name}.yml") + if (pipeline_artifacts.pipeline_conf / filepath).is_file() ] + dirs_to_delete = [ path for path in (pipeline_artifacts.pipeline_dir, pipeline_artifacts.pipeline_tests) @@ -189,7 +193,7 @@ def _echo_deletion_warning(message: str, **paths: list[Path]): def _create_pipeline(name: str, output_dir: Path) -> Path: with _filter_deprecation_warnings(): - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel from cookiecutter.main import cookiecutter template_path = Path(kedro.__file__).parent / "templates" / "pipeline" diff --git a/kedro/framework/cli/project.py b/kedro/framework/cli/project.py index 034b460023..f3cf141dfa 100644 --- a/kedro/framework/cli/project.py +++ b/kedro/framework/cli/project.py @@ -68,7 +68,7 @@ CONF_SOURCE_HELP = """Path of a directory where project configuration is stored.""" -# pylint: disable=missing-function-docstring +# noqa: missing-function-docstring @click.group(name="Kedro") def project_group(): # pragma: no cover pass @@ -76,7 +76,7 @@ def project_group(): # pragma: no cover @forward_command(project_group, forward_help=True) @click.pass_obj # this will pass the metadata as first argument -def test(metadata: ProjectMetadata, args, **kwargs): # pylint: disable=unused-argument +def test(metadata: ProjectMetadata, args, **kwargs): # noqa: unused-argument    """Run the test suite.
(DEPRECATED)""" deprecation_message = ( "DeprecationWarning: Command 'kedro test' is deprecated and " @@ -101,7 +101,7 @@ def test(metadata: ProjectMetadata, args, **kwargs): # pylint: disable=unused-a @click.pass_obj # this will pass the metadata as first argument def lint( metadata: ProjectMetadata, files, check_only, **kwargs -): # pylint: disable=unused-argument +): # noqa: unused-argument """Run flake8, isort and black. (DEPRECATED)""" deprecation_message = ( "DeprecationWarning: Command 'kedro lint' is deprecated and " @@ -134,9 +134,7 @@ def lint( @forward_command(project_group, forward_help=True) @env_option @click.pass_obj # this will pass the metadata as first argument -def ipython( - metadata: ProjectMetadata, env, args, **kwargs -): # pylint: disable=unused-argument +def ipython(metadata: ProjectMetadata, env, args, **kwargs): # noqa: unused-argument """Open IPython with project specific variables loaded.""" _check_module_importable("IPython") @@ -238,7 +236,7 @@ def build_docs(metadata: ProjectMetadata, open_docs): @click.pass_obj # this will pass the metadata as first argument def build_reqs( metadata: ProjectMetadata, input_file, output_file, args, **kwargs -): # pylint: disable=unused-argument +): # noqa: unused-argument """Run `pip-compile` on src/requirements.txt or the user defined input file and save the compiled requirements to src/requirements.lock or the user defined output file. (DEPRECATED) @@ -281,9 +279,7 @@ def build_reqs( @command_with_verbosity(project_group, "activate-nbstripout") @click.pass_obj # this will pass the metadata as first argument -def activate_nbstripout( - metadata: ProjectMetadata, **kwargs -): # pylint: disable=unused-argument +def activate_nbstripout(metadata: ProjectMetadata, **kwargs): # noqa: unused-argument """Install the nbstripout git hook to automatically clean notebooks. (DEPRECATED)""" deprecation_message = ( "DeprecationWarning: Command 'kedro activate-nbstripout' is deprecated and " @@ -308,10 +304,9 @@ def activate_nbstripout( ) from exc try: - res = subprocess.run( # pylint: disable=subprocess-run-check + res = subprocess.run( # noqa: subprocess-run-check ["git", "rev-parse", "--git-dir"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + capture_output=True, ) if res.returncode: raise KedroCliError("Not a git repository. Run 'git init' first.") @@ -417,8 +412,7 @@ def activate_nbstripout( help=PARAMS_ARG_HELP, callback=_split_params, ) -# pylint: disable=too-many-arguments,unused-argument,too-many-locals -def run( +def run( # noqa: too-many-arguments,unused-argument,too-many-locals tag, tags, env, diff --git a/kedro/framework/cli/registry.py b/kedro/framework/cli/registry.py index ea456016a7..05b05f9afd 100644 --- a/kedro/framework/cli/registry.py +++ b/kedro/framework/cli/registry.py @@ -7,7 +7,7 @@ from kedro.framework.startup import ProjectMetadata -# pylint: disable=missing-function-docstring +# noqa: missing-function-docstring @click.group(name="Kedro") def registry_cli(): # pragma: no cover pass @@ -29,7 +29,7 @@ def list_registered_pipelines(): @click.pass_obj def describe_registered_pipeline( metadata: ProjectMetadata, name, **kwargs -): # pylint: disable=unused-argument, protected-access +): # noqa: unused-argument, protected-access """Describe a registered pipeline by providing a pipeline name. Defaults to the `__default__` pipeline. 
""" diff --git a/kedro/framework/cli/starters.py b/kedro/framework/cli/starters.py index 77491d391f..25e68f3699 100644 --- a/kedro/framework/cli/starters.py +++ b/kedro/framework/cli/starters.py @@ -37,7 +37,7 @@ @define(order=True) -class KedroStarterSpec: # pylint: disable=too-few-public-methods +class KedroStarterSpec: # noqa: too-few-public-methods """Specification of custom kedro starter template Args: alias: alias of the starter which shows up on `kedro starter list` and is used @@ -92,7 +92,7 @@ class KedroStarterSpec: # pylint: disable=too-few-public-methods ) -# pylint: disable=unused-argument +# noqa: unused-argument def _remove_readonly(func: Callable, path: Path, excinfo: tuple): # pragma: no cover """Remove readonly files on Windows See: https://docs.python.org/3/library/shutil.html?highlight=shutil#rmtree-example @@ -160,7 +160,7 @@ def _starter_spec_to_dict( return format_dict -# pylint: disable=missing-function-docstring +# noqa: missing-function-docstring @click.group(context_settings=CONTEXT_SETTINGS, name="Kedro") def create_cli(): # pragma: no cover pass @@ -347,7 +347,7 @@ def _create_project(template_path: str, cookiecutter_args: dict[str, Any]): KedroCliError: If it fails to generate a project. """ with _filter_deprecation_warnings(): - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel from cookiecutter.main import cookiecutter # for performance reasons try: @@ -389,7 +389,7 @@ def _get_cookiecutter_dir( clones it to ``tmpdir``; if template_path is a file path then directly uses that path without copying anything. """ - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel from cookiecutter.exceptions import RepositoryCloneFailed, RepositoryNotFound from cookiecutter.repository import determine_repo_dir # for performance reasons @@ -447,7 +447,7 @@ def _fetch_config_from_user_prompts( Configuration for starting a new project. This is passed as ``extra_context`` to cookiecutter and will overwrite the cookiecutter.json defaults. """ - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel from cookiecutter.environment import StrictEnvironment from cookiecutter.prompt import read_user_variable, render_variable @@ -472,7 +472,7 @@ def _fetch_config_from_user_prompts( def _make_cookiecutter_context_for_prompts(cookiecutter_dir: Path): - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel from cookiecutter.generate import generate_context cookiecutter_context = generate_context(cookiecutter_dir / "cookiecutter.json") @@ -482,7 +482,7 @@ def _make_cookiecutter_context_for_prompts(cookiecutter_dir: Path): class _Prompt: """Represent a single CLI prompt for `kedro new`""" - def __init__(self, *args, **kwargs) -> None: # pylint: disable=unused-argument + def __init__(self, *args, **kwargs) -> None: # noqa: unused-argument try: self.title = kwargs["title"] except KeyError as exc: @@ -512,7 +512,7 @@ def validate(self, user_input: str) -> None: def _get_available_tags(template_path: str) -> list: # Not at top level so that kedro CLI works without a working git executable. - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel import git try: diff --git a/kedro/framework/cli/utils.py b/kedro/framework/cli/utils.py index bd1c59a2ec..3240c3c4ab 100644 --- a/kedro/framework/cli/utils.py +++ b/kedro/framework/cli/utils.py @@ -52,7 +52,7 @@ def call(cmd: list[str], **kwargs): # pragma: no cover click.exceptions.Exit: If `subprocess.run` returns non-zero code. 
""" click.echo(" ".join(shlex.quote(c) for c in cmd)) - # pylint: disable=subprocess-run-check + # noqa: subprocess-run-check code = subprocess.run(cmd, **kwargs).returncode if code: raise click.exceptions.Exit(code=code) @@ -222,14 +222,14 @@ def get_pkg_version(reqs_path: (str | Path), package_name: str) -> str: pattern = re.compile(package_name + r"([^\w]|$)") with reqs_path.open("r", encoding="utf-8") as reqs_file: for req_line in reqs_file: - req_line = req_line.strip() + req_line = req_line.strip() # noqa: redefined-loop-name if pattern.search(req_line): return req_line raise KedroCliError(f"Cannot find '{package_name}' package in '{reqs_path}'.") -def _update_verbose_flag(ctx, param, value): # pylint: disable=unused-argument +def _update_verbose_flag(ctx, param, value): # noqa: unused-argument KedroCliError.VERBOSE_ERROR = value @@ -265,7 +265,7 @@ class KedroCliError(click.exceptions.ClickException): def show(self, file=None): if file is None: - # pylint: disable=protected-access + # noqa: protected-access file = click._compat.get_text_stderr() if self.VERBOSE_ERROR: click.secho(traceback.format_exc(), nl=False, fg="yellow") @@ -291,12 +291,12 @@ def _clean_pycache(path: Path): shutil.rmtree(each, ignore_errors=True) -def split_string(ctx, param, value): # pylint: disable=unused-argument +def split_string(ctx, param, value): # noqa: unused-argument """Split string by comma.""" return [item.strip() for item in value.split(",") if item.strip()] -# pylint: disable=unused-argument,missing-param-doc,missing-type-doc +# noqa: unused-argument,missing-param-doc,missing-type-doc def split_node_names(ctx, param, to_split: str) -> list[str]: """Split string by comma, ignoring commas enclosed by square parentheses. This avoids splitting the string of nodes names on commas included in @@ -363,13 +363,13 @@ def _get_entry_points(name: str) -> importlib_metadata.EntryPoints: return importlib_metadata.entry_points().select(group=ENTRY_POINT_GROUPS[name]) -def _safe_load_entry_point( # pylint: disable=inconsistent-return-statements +def _safe_load_entry_point( # noqa: inconsistent-return-statements entry_point, ): """Load entrypoint safely, if fails it will just skip the entrypoint.""" try: return entry_point.load() - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: # noqa: broad-except logger.warning( "Failed to load %s commands from %s. Full exception: %s", entry_point.module, @@ -401,13 +401,13 @@ def load_entry_points(name: str) -> Sequence[click.MultiCommand]: return entry_point_commands -def _config_file_callback(ctx, param, value): # pylint: disable=unused-argument +def _config_file_callback(ctx, param, value): # noqa: unused-argument """CLI callback that replaces command line options with values specified in a config file. If command line options are passed, they override config file values. 
""" # for performance reasons - import anyconfig # pylint: disable=import-outside-toplevel + import anyconfig # noqa: import-outside-toplevel ctx.default_map = ctx.default_map or {} section = ctx.info_name @@ -428,9 +428,9 @@ def _reformat_load_versions(ctx, param, value) -> dict[str, str]: load_versions_dict = {} for load_version in value: - load_version = load_version.strip() + load_version = load_version.strip() # noqa: PLW2901 load_version_list = load_version.split(":", 1) - if len(load_version_list) != 2: + if len(load_version_list) != 2: # noqa: PLR2004 raise KedroCliError( f"Expected the form of 'load_version' to be " f"'dataset_name:YYYY-MM-DDThh.mm.ss.sssZ'," @@ -453,9 +453,9 @@ def _split_params(ctx, param, value): # which should not be replaced by = pass else: - item = item.replace(":", "=", 1) + item = item.replace(":", "=", 1) # noqa: redefined-loop-name items = item.split("=", 1) - if len(items) != 2: + if len(items) != 2: # noqa: PLR2004 ctx.fail( f"Invalid format of `{param.name}` option: " f"Item `{items[0]}` must contain " diff --git a/kedro/framework/context/context.py b/kedro/framework/context/context.py index 7c3a044fb4..0ba116255a 100644 --- a/kedro/framework/context/context.py +++ b/kedro/framework/context/context.py @@ -118,28 +118,22 @@ def _convert_paths_to_absolute_posix( return conf_dictionary -def _validate_layers_for_transcoding(catalog: DataCatalog) -> None: - """Check that transcoded names that correspond to - the same dataset also belong to the same layer. - """ +def _validate_transcoded_datasets(catalog: DataCatalog): + """Validates transcoded datasets are correctly named - def _find_conflicts(): - base_names_to_layer = {} - for current_layer, dataset_names in catalog.layers.items(): - for name in dataset_names: - base_name, _ = _transcode_split(name) - known_layer = base_names_to_layer.setdefault(base_name, current_layer) - if current_layer != known_layer: - yield name - else: - base_names_to_layer[base_name] = current_layer - - conflicting_datasets = sorted(_find_conflicts()) - if conflicting_datasets: - error_str = ", ".join(conflicting_datasets) - raise ValueError( - f"Transcoded datasets should have the same layer. Mismatch found for: {error_str}" - ) + Args: + catalog (DataCatalog): The catalog object containing the + datasets to be validated. + + Raises: + ValueError: If a dataset name does not conform to the expected + transcoding naming conventions,a ValueError is raised by the + `_transcode_split` function. + + """ + # noqa: protected-access + for dataset_name in catalog._data_sets.keys(): + _transcode_split(dataset_name) def _update_nested_dict(old_dict: dict[Any, Any], new_dict: dict[Any, Any]) -> None: @@ -153,11 +147,10 @@ def _update_nested_dict(old_dict: dict[Any, Any], new_dict: dict[Any, Any]) -> N for key, value in new_dict.items(): if key not in old_dict: old_dict[key] = value + elif isinstance(old_dict[key], dict) and isinstance(value, dict): + _update_nested_dict(old_dict[key], value) else: - if isinstance(old_dict[key], dict) and isinstance(value, dict): - _update_nested_dict(old_dict[key], value) - else: - old_dict[key] = value + old_dict[key] = value class KedroContext: @@ -165,7 +158,7 @@ class KedroContext: Kedro's main functionality. 
""" - def __init__( + def __init__( # noqa: too-many-arguments self, package_name: str, project_path: Path | str, @@ -173,7 +166,7 @@ def __init__( hook_manager: PluginManager, env: str = None, extra_params: dict[str, Any] = None, - ): # pylint: disable=too-many-arguments + ): """Create a context object by providing the root of a Kedro project and the environment configuration subfolders (see ``kedro.config.ConfigLoader``) @@ -290,8 +283,7 @@ def _get_catalog( feed_dict = self._get_feed_dict() catalog.add_feed_dict(feed_dict) - if catalog.layers: - _validate_layers_for_transcoding(catalog) + _validate_transcoded_datasets(catalog) self._hook_manager.hook.after_catalog_created( catalog=catalog, conf_catalog=conf_catalog, diff --git a/kedro/framework/hooks/specs.py b/kedro/framework/hooks/specs.py index 6bef21f3bb..aa10ab7276 100644 --- a/kedro/framework/hooks/specs.py +++ b/kedro/framework/hooks/specs.py @@ -18,7 +18,7 @@ class DataCatalogSpecs: """Namespace that defines all specifications for a data catalog's lifecycle hooks.""" @hook_spec - def after_catalog_created( # pylint: disable=too-many-arguments + def after_catalog_created( # noqa: too-many-arguments self, catalog: DataCatalog, conf_catalog: dict[str, Any], @@ -48,7 +48,7 @@ class NodeSpecs: """Namespace that defines all specifications for a node's lifecycle hooks.""" @hook_spec - def before_node_run( # pylint: disable=too-many-arguments + def before_node_run( # noqa: too-many-arguments self, node: Node, catalog: DataCatalog, @@ -76,7 +76,7 @@ def before_node_run( # pylint: disable=too-many-arguments pass @hook_spec - def after_node_run( # pylint: disable=too-many-arguments + def after_node_run( # noqa: too-many-arguments self, node: Node, catalog: DataCatalog, @@ -104,7 +104,7 @@ def after_node_run( # pylint: disable=too-many-arguments pass @hook_spec - def on_node_error( # pylint: disable=too-many-arguments + def on_node_error( # noqa: too-many-arguments self, error: Exception, node: Node, diff --git a/kedro/framework/project/__init__.py b/kedro/framework/project/__init__.py index 0d6946555e..f266da430c 100644 --- a/kedro/framework/project/__init__.py +++ b/kedro/framework/project/__init__.py @@ -1,6 +1,6 @@ """``kedro.framework.project`` module provides utitlity to configure a Kedro project and access its settings.""" -# pylint: disable=redefined-outer-name,unused-argument,global-statement +# noqa: redefined-outer-name,unused-argument,global-statement from __future__ import annotations import importlib @@ -131,7 +131,7 @@ def _load_data_wrapper(func): Taking inspiration from dynaconf.utils.functional.new_method_proxy """ - # pylint: disable=protected-access + # noqa: protected-access def inner(self, *args, **kwargs): self._load_data() return func(self._content, *args, **kwargs) @@ -208,7 +208,7 @@ def configure(self, pipelines_module: str | None = None) -> None: class _ProjectLogging(UserDict): - # pylint: disable=super-init-not-called + # noqa: super-init-not-called def __init__(self): """Initialise project logging. The path to logging configuration is given in environment variable KEDRO_LOGGING_CONFIG (defaults to default_logging.yml).""" @@ -249,7 +249,7 @@ def configure_project(package_name: str): # global variable to make it easily accessible. This is used by validate_settings() # below, and also by ParallelRunner on Windows, as package_name is required every # time a new subprocess is spawned. 
- global PACKAGE_NAME + global PACKAGE_NAME # noqa: PLW0603 PACKAGE_NAME = package_name @@ -299,7 +299,7 @@ def _create_pipeline(pipeline_module: types.ModuleType) -> Pipeline | None: return obj -def find_pipelines() -> dict[str, Pipeline]: +def find_pipelines() -> dict[str, Pipeline]: # noqa: PLR0912 """Automatically find modular pipelines having a ``create_pipeline`` function. By default, projects created using Kedro 0.18.3 and higher call this function to autoregister pipelines upon creation/addition. @@ -325,7 +325,7 @@ def find_pipelines() -> dict[str, Pipeline]: pipeline_module_name = f"{PACKAGE_NAME}.pipeline" try: pipeline_module = importlib.import_module(pipeline_module_name) - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: # noqa: broad-except if str(exc) != f"No module named '{pipeline_module_name}'": warnings.warn( IMPORT_ERROR_MESSAGE.format( @@ -355,7 +355,7 @@ def find_pipelines() -> dict[str, Pipeline]: pipeline_module_name = f"{PACKAGE_NAME}.pipelines.{pipeline_name}" try: pipeline_module = importlib.import_module(pipeline_module_name) - except: # pylint: disable=bare-except # noqa: E722 + except: # noqa: bare-except # noqa: E722 warnings.warn( IMPORT_ERROR_MESSAGE.format( module=pipeline_module_name, tb_exc=traceback.format_exc() diff --git a/kedro/framework/session/session.py b/kedro/framework/session/session.py index 90df0ffee9..9ff9f5c24b 100644 --- a/kedro/framework/session/session.py +++ b/kedro/framework/session/session.py @@ -8,6 +8,7 @@ import subprocess import sys import traceback +import warnings from copy import deepcopy from pathlib import Path from typing import Any, Iterable @@ -15,7 +16,7 @@ import click from kedro import __version__ as kedro_version -from kedro.config import ConfigLoader, MissingConfigException +from kedro.config import ConfigLoader, MissingConfigException, TemplatedConfigLoader from kedro.framework.context import KedroContext from kedro.framework.context.context import _convert_paths_to_absolute_posix from kedro.framework.hooks import _create_hook_manager @@ -48,7 +49,7 @@ def _describe_git(project_path: Path) -> dict[str, dict[str, Any]]: git_data["dirty"] = bool(git_status_res.decode().strip()) # `subprocess.check_output()` raises `NotADirectoryError` on Windows - except Exception: # pylint: disable=broad-except + except Exception: # noqa: broad-except logger = logging.getLogger(__name__) logger.debug("Unable to git describe %s", project_path) logger.debug(traceback.format_exc()) @@ -74,7 +75,7 @@ class KedroSessionError(Exception): pass -# pylint: disable=too-many-instance-attributes +# noqa: too-many-instance-attributes class KedroSession: """``KedroSession`` is the object that is responsible for managing the lifecycle of a Kedro run. Use `KedroSession.create()` as @@ -99,8 +100,7 @@ class KedroSession: """ - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, session_id: str, package_name: str = None, @@ -125,7 +125,7 @@ def __init__( ) @classmethod - def create( # pylint: disable=too-many-arguments + def create( # noqa: too-many-arguments cls, package_name: str = None, project_path: Path | str | None = None, @@ -183,7 +183,7 @@ def create( # pylint: disable=too-many-arguments try: session_data["username"] = getpass.getuser() - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: # noqa: broad-except logging.getLogger(__name__).debug( "Unable to get username. 
Full exception: %s", exc ) @@ -262,7 +262,15 @@ def load_context(self) -> KedroContext: env = self.store.get("env") extra_params = self.store.get("extra_params") config_loader = self._get_config_loader() - + if isinstance(config_loader, (ConfigLoader, TemplatedConfigLoader)): + warnings.warn( + f"{type(config_loader).__name__} will be deprecated in Kedro 0.19." + f" Please use the OmegaConfigLoader instead. To consult" + f" the documentation for OmegaConfigLoader, see here:" + f" https://docs.kedro.org/en/stable/configuration/" + f"advanced_configuration.html#omegaconfigloader", + FutureWarning, + ) context_class = settings.CONTEXT_CLASS context = context_class( package_name=self._package_name, @@ -304,7 +312,7 @@ def __exit__(self, exc_type, exc_value, tb_): self._log_exception(exc_type, exc_value, tb_) self.close() - def run( # pylint: disable=too-many-arguments,too-many-locals + def run( # noqa: too-many-arguments,too-many-locals self, pipeline_name: str = None, tags: Iterable[str] = None, @@ -406,7 +414,7 @@ def run( # pylint: disable=too-many-arguments,too-many-locals "runner": getattr(runner, "__name__", str(runner)), } - catalog = context._get_catalog( # pylint: disable=protected-access + catalog = context._get_catalog( # noqa: protected-access save_version=save_version, load_versions=load_versions, ) diff --git a/kedro/framework/startup.py b/kedro/framework/startup.py index 6b0d2cc2c7..287999125a 100644 --- a/kedro/framework/startup.py +++ b/kedro/framework/startup.py @@ -41,7 +41,7 @@ def _is_project(project_path: Union[str, Path]) -> bool: try: return "[tool.kedro]" in metadata_file.read_text(encoding="utf-8") - except Exception: # pylint: disable=broad-except + except Exception: # noqa: broad-except return False diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index cfb89472a4..26d4c3619c 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -3,7 +3,7 @@ """ from __future__ import annotations -from .cached_dataset import CachedDataset, CachedDataSet +from .cached_dataset import CachedDataSet, CachedDataset from .core import ( AbstractDataset, AbstractVersionedDataset, @@ -13,27 +13,27 @@ Version, ) from .data_catalog import DataCatalog -from .lambda_dataset import LambdaDataset, LambdaDataSet -from .memory_dataset import MemoryDataset, MemoryDataSet +from .lambda_dataset import LambdaDataSet, LambdaDataset +from .memory_dataset import MemoryDataSet, MemoryDataset from .partitioned_dataset import ( - IncrementalDataset, IncrementalDataSet, - PartitionedDataset, + IncrementalDataset, PartitionedDataSet, + PartitionedDataset, ) # https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -DataSetError: type[Exception] -DataSetNotFoundError: type[DatasetError] -DataSetAlreadyExistsError: type[DatasetError] -AbstractDataSet: type -AbstractVersionedDataSet: type[AbstractDataset] +DataSetError: type[DatasetError] +DataSetNotFoundError: type[DatasetNotFoundError] +DataSetAlreadyExistsError: type[DatasetAlreadyExistsError] +AbstractDataSet: type[AbstractDataset] +AbstractVersionedDataSet: type[AbstractVersionedDataset] def __getattr__(name): - import kedro.io.core # pylint: disable=import-outside-toplevel + import kedro.io.core # noqa: import-outside-toplevel - if name in (kedro.io.core._DEPRECATED_CLASSES): # pylint: disable=protected-access + if name in (kedro.io.core._DEPRECATED_CLASSES): # noqa: protected-access return getattr(kedro.io.core, name) raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}") diff --git 
a/kedro/io/cached_dataset.py b/kedro/io/cached_dataset.py index 25793d117c..6ec2a59fb7 100644 --- a/kedro/io/cached_dataset.py +++ b/kedro/io/cached_dataset.py @@ -12,7 +12,7 @@ from kedro.io.memory_dataset import MemoryDataset # https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -CachedDataSet: type[AbstractDataset] +CachedDataSet: type[CachedDataset] class CachedDataset(AbstractDataset): @@ -96,8 +96,8 @@ def _from_config(config, version): def _describe(self) -> dict[str, Any]: return { - "dataset": self._dataset._describe(), # pylint: disable=protected-access - "cache": self._cache._describe(), # pylint: disable=protected-access + "dataset": self._dataset._describe(), # noqa: protected-access + "cache": self._cache._describe(), # noqa: protected-access } def _load(self): diff --git a/kedro/io/core.py b/kedro/io/core.py index 4e63b83ff1..6a097d7058 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -30,11 +30,11 @@ CLOUD_PROTOCOLS = ("s3", "s3n", "s3a", "gcs", "gs", "adl", "abfs", "abfss", "gdrive") # https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -DataSetError: type[Exception] -DataSetNotFoundError: type[DatasetError] -DataSetAlreadyExistsError: type[DatasetError] -AbstractDataSet: type -AbstractVersionedDataSet: type[AbstractDataset] +DataSetError: type[DatasetError] +DataSetNotFoundError: type[DatasetNotFoundError] +DataSetAlreadyExistsError: type[DatasetAlreadyExistsError] +AbstractDataSet: type[AbstractDataset] +AbstractVersionedDataSet: type[AbstractVersionedDataset] class DatasetError(Exception): @@ -562,7 +562,7 @@ def _fetch_latest_load_version(self) -> str: # 'key' is set to prevent cache key overlapping for load and save: # https://cachetools.readthedocs.io/en/stable/#cachetools.cachedmethod @cachedmethod(cache=attrgetter("_version_cache"), key=partial(hashkey, "save")) - def _fetch_latest_save_version(self) -> str: # pylint: disable=no-self-use + def _fetch_latest_save_version(self) -> str: # noqa: no-self-use """Generate and cache the current save version""" return generate_timestamp() @@ -609,7 +609,7 @@ def _get_save_path(self) -> PurePosixPath: def _get_versioned_path(self, version: str) -> PurePosixPath: return self._filepath / version / self._filepath.name - def load(self) -> _DO: # pylint: disable=useless-parent-delegation + def load(self) -> _DO: # noqa: useless-parent-delegation return super().load() def save(self, data: _DI) -> None: diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 8e74a76d28..7430f18c86 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -11,7 +11,7 @@ import logging import re from collections import defaultdict -from typing import Any, Iterable +from typing import Any, Dict, Iterable from parse import parse @@ -26,6 +26,8 @@ ) from kedro.io.memory_dataset import MemoryDataset +Patterns = Dict[str, Dict[str, Any]] + CATALOG_KEY = "catalog" CREDENTIALS_KEY = "credentials" WORDS_REGEX_PATTERN = re.compile(r"\W+") @@ -138,12 +140,12 @@ class DataCatalog: to the underlying data sets. 
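As a quick reminder of the public surface being reworked here, a `DataCatalog` can be driven programmatically; the entry name and data below are illustrative.

from kedro.io import DataCatalog, MemoryDataset

catalog = DataCatalog(data_sets={"example": MemoryDataset([1, 2, 3])})
assert catalog.load("example") == [1, 2, 3]
catalog.save("example", [4, 5, 6])
assert "example" in catalog.list()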
""" - def __init__( # pylint: disable=too-many-arguments + def __init__( # noqa: too-many-arguments self, data_sets: dict[str, AbstractDataset] = None, feed_dict: dict[str, Any] = None, layers: dict[str, set[str]] = None, - dataset_patterns: dict[str, dict[str, Any]] = None, + dataset_patterns: Patterns = None, load_versions: dict[str, str] = None, save_version: str = None, ) -> None: @@ -283,7 +285,9 @@ class to be loaded is specified with the key ``type`` and their layers: dict[str, set[str]] = defaultdict(set) for ds_name, ds_config in catalog.items(): - ds_config = _resolve_credentials(ds_config, credentials) + ds_config = _resolve_credentials( # noqa: redefined-loop-name + ds_config, credentials + ) if cls._is_pattern(ds_name): # Add each factory to the dataset_patterns dict. dataset_patterns[ds_name] = ds_config @@ -300,7 +304,7 @@ class to be loaded is specified with the key ``type`` and their missing_keys = [ key for key in load_versions.keys() - if not (cls._match_pattern(sorted_patterns, key) or key in catalog) + if not (key in catalog or cls._match_pattern(sorted_patterns, key)) ] if missing_keys: raise DatasetNotFoundError( @@ -322,21 +326,21 @@ def _is_pattern(pattern: str): return "{" in pattern @staticmethod - def _match_pattern( - data_set_patterns: dict[str, dict[str, Any]], data_set_name: str - ) -> str | None: + def _match_pattern(data_set_patterns: Patterns, data_set_name: str) -> str | None: """Match a dataset name against patterns in a dictionary.""" - for pattern, _ in data_set_patterns.items(): - result = parse(pattern, data_set_name) - if result: - return pattern - return None + matches = ( + pattern + for pattern in data_set_patterns.keys() + if parse(pattern, data_set_name) + ) + return next(matches, None) @classmethod - def _sort_patterns( - cls, data_set_patterns: dict[str, dict[str, Any]] - ) -> dict[str, dict[str, Any]]: - """Sort a dictionary of dataset patterns according to parsing rules - + def _sort_patterns(cls, data_set_patterns: Patterns) -> dict[str, dict[str, Any]]: + """Sort a dictionary of dataset patterns according to parsing rules. + + In order: + 1. Decreasing specificity (number of characters outside the curly brackets) 2. Decreasing number of placeholders (number of curly bracket pairs) 3. Alphabetically @@ -349,18 +353,18 @@ def _sort_patterns( pattern, ), ) - sorted_patterns = {} - for key in sorted_keys: - sorted_patterns[key] = data_set_patterns[key] - return sorted_patterns + return {key: data_set_patterns[key] for key in sorted_keys} @staticmethod def _specificity(pattern: str) -> int: - """Helper function to check the length of exactly matched characters not inside brackets - Example - - specificity("{namespace}.companies") = 10 - specificity("{namespace}.{dataset}") = 1 - specificity("france.companies") = 16 + """Helper function to check the length of exactly matched characters not inside brackets. 
+ + Example: + :: + + >>> specificity("{namespace}.companies") = 10 + >>> specificity("{namespace}.{dataset}") = 1 + >>> specificity("france.companies") = 16 """ # Remove all the placeholders from the pattern and count the number of remaining chars result = re.sub(r"\{.*?\}", "", pattern) @@ -410,9 +414,7 @@ def _get_dataset( if version and isinstance(data_set, AbstractVersionedDataset): # we only want to return a similar-looking dataset, # not modify the one stored in the current catalog - data_set = data_set._copy( # pylint: disable=protected-access - _version=version - ) + data_set = data_set._copy(_version=version) # noqa: protected-access return data_set diff --git a/kedro/io/lambda_dataset.py b/kedro/io/lambda_dataset.py index 220971fa4f..68d7161b11 100644 --- a/kedro/io/lambda_dataset.py +++ b/kedro/io/lambda_dataset.py @@ -10,7 +10,7 @@ from kedro.io.core import AbstractDataset, DatasetError # https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -LambdaDataSet: type[AbstractDataset] +LambdaDataSet: type[LambdaDataset] class LambdaDataset(AbstractDataset): @@ -80,8 +80,7 @@ def _release(self) -> None: else: self.__release() - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, load: Callable[[], Any] | None, save: Callable[[Any], None] | None, diff --git a/kedro/io/memory_dataset.py b/kedro/io/memory_dataset.py index 8a6904c63d..7cab3f4d3d 100644 --- a/kedro/io/memory_dataset.py +++ b/kedro/io/memory_dataset.py @@ -11,7 +11,7 @@ _EMPTY = object() # https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -MemoryDataSet: type[AbstractDataset] +MemoryDataSet: type[MemoryDataset] class MemoryDataset(AbstractDataset): @@ -93,7 +93,7 @@ def _infer_copy_mode(data: Any) -> str: Returns: One of "copy", "assign" or "deepcopy" as the copy mode to use. """ - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel try: import pandas as pd except ImportError: # pragma: no cover diff --git a/kedro/io/partitioned_dataset.py b/kedro/io/partitioned_dataset.py index be62a05e56..66df5294a8 100644 --- a/kedro/io/partitioned_dataset.py +++ b/kedro/io/partitioned_dataset.py @@ -32,12 +32,12 @@ S3_PROTOCOLS = ("s3", "s3a", "s3n") # https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -PartitionedDataSet: type[AbstractDataset] -IncrementalDataSet: type[AbstractDataset] +PartitionedDataSet: type[PartitionedDataset] +IncrementalDataSet: type[IncrementalDataset] class PartitionedDataset(AbstractDataset): - # pylint: disable=too-many-instance-attributes,protected-access + # noqa: too-many-instance-attributes,protected-access """``PartitionedDataset`` loads and saves partitioned file-like data using the underlying dataset definition. For filesystem level operations it uses `fsspec`: https://github.com/intake/filesystem_spec. @@ -135,7 +135,7 @@ class PartitionedDataset(AbstractDataset): """ - def __init__( # pylint: disable=too-many-arguments + def __init__( # noqa: too-many-arguments self, path: str, dataset: str | type[AbstractDataset] | dict[str, Any], @@ -190,7 +190,7 @@ def __init__( # pylint: disable=too-many-arguments Raises: DatasetError: If versioning is enabled for the underlying dataset. 
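A sketch of typical `PartitionedDataset` usage for context; the path, suffix and underlying dataset are illustrative and assume `pandas` is installed and the directory exists.

from kedro.io import PartitionedDataset

parts = PartitionedDataset(
    path="data/01_raw/daily",      # one file per partition
    dataset="pandas.CSVDataSet",   # underlying dataset definition
    filename_suffix=".csv",
)

# load() is lazy: it returns a mapping of partition id -> load callable.
for partition_id, load_partition in parts.load().items():
    df = load_partition()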
""" - # pylint: disable=import-outside-toplevel + # noqa: import-outside-toplevel from fsspec.utils import infer_storage_options # for performance reasons super().__init__() @@ -247,7 +247,7 @@ def __init__( # pylint: disable=too-many-arguments @property def _filesystem(self): # for performance reasons - import fsspec # pylint: disable=import-outside-toplevel + import fsspec # noqa: import-outside-toplevel protocol = "s3" if self._protocol in S3_PROTOCOLS else self._protocol return fsspec.filesystem(protocol, **self._credentials, **self._fs_args) @@ -314,7 +314,7 @@ def _save(self, data: dict[str, Any]) -> None: kwargs[self._filepath_arg] = self._join_protocol(partition) dataset = self._dataset_type(**kwargs) # type: ignore if callable(partition_data): - partition_data = partition_data() + partition_data = partition_data() # noqa: redefined-loop-name dataset.save(partition_data) self._invalidate_caches() @@ -381,8 +381,7 @@ class IncrementalDataset(PartitionedDataset): DEFAULT_CHECKPOINT_TYPE = "kedro.extras.datasets.text.TextDataSet" DEFAULT_CHECKPOINT_FILENAME = "CHECKPOINT" - # pylint: disable=too-many-arguments - def __init__( + def __init__( # noqa: too-many-arguments self, path: str, dataset: str | type[AbstractDataset] | dict[str, Any], @@ -500,10 +499,8 @@ def _parse_checkpoint_config( @cachedmethod(cache=operator.attrgetter("_partition_cache")) def _list_partitions(self) -> list[str]: checkpoint = self._read_checkpoint() - checkpoint_path = ( - self._filesystem._strip_protocol( # pylint: disable=protected-access - self._checkpoint_config[self._filepath_arg] - ) + checkpoint_path = self._filesystem._strip_protocol( # noqa: protected-access + self._checkpoint_config[self._filepath_arg] ) def _is_valid_partition(partition) -> bool: diff --git a/kedro/ipython/__init__.py b/kedro/ipython/__init__.py index 276a15bc1f..8341822255 100644 --- a/kedro/ipython/__init__.py +++ b/kedro/ipython/__init__.py @@ -16,8 +16,11 @@ from kedro.framework.cli import load_entry_points from kedro.framework.cli.project import PARAMS_ARG_HELP from kedro.framework.cli.utils import ENV_HELP, _split_params -from kedro.framework.project import LOGGING # noqa -from kedro.framework.project import configure_project, pipelines +from kedro.framework.project import ( + LOGGING, # noqa + configure_project, + pipelines, +) from kedro.framework.session import KedroSession from kedro.framework.startup import _is_project, bootstrap_project @@ -65,7 +68,7 @@ def load_ipython_extension(ipython): def magic_reload_kedro(line: str, local_ns: dict[str, Any] = None): """ The `%reload_kedro` IPython line magic. - See https://kedro.readthedocs.io/en/stable/notebooks_and_ipython/kedro_and_notebooks.html#reload-kedro-line-magic # pylint: disable=line-too-long + See https://kedro.readthedocs.io/en/stable/notebooks_and_ipython/kedro_and_notebooks.html#reload-kedro-line-magic # noqa: line-too-long for more. 
""" args = parse_argstring(magic_reload_kedro, line) @@ -128,7 +131,7 @@ def _resolve_project_path( project_path = Path(path).expanduser().resolve() else: if local_namespace and "context" in local_namespace: - # pylint: disable=protected-access + # noqa: protected-access project_path = local_namespace["context"]._project_path else: project_path = _find_kedro_project(Path.cwd()) @@ -139,7 +142,7 @@ def _resolve_project_path( project_path, ) - # pylint: disable=protected-access + # noqa: protected-access if ( project_path and local_namespace diff --git a/kedro/pipeline/modular_pipeline.py b/kedro/pipeline/modular_pipeline.py index b9dbe9b403..0f429eed56 100644 --- a/kedro/pipeline/modular_pipeline.py +++ b/kedro/pipeline/modular_pipeline.py @@ -150,7 +150,7 @@ def _get_param_names_mapping( return params -def pipeline( +def pipeline( # noqa: too-many-arguments pipe: Iterable[Node | Pipeline] | Pipeline, *, inputs: str | set[str] | dict[str, str] | None = None, @@ -212,7 +212,7 @@ def pipeline( if not any([inputs, outputs, parameters, namespace]): return pipe - # pylint: disable=protected-access + # noqa: protected-access inputs = _get_dataset_names_mapping(inputs) outputs = _get_dataset_names_mapping(outputs) parameters = _get_param_names_mapping(parameters) diff --git a/kedro/pipeline/node.py b/kedro/pipeline/node.py index e7a3963aba..d9435308c1 100644 --- a/kedro/pipeline/node.py +++ b/kedro/pipeline/node.py @@ -19,7 +19,7 @@ class Node: run user-provided functions as part of Kedro pipelines. """ - def __init__( + def __init__( # noqa: too-many-arguments self, func: Callable, inputs: None | str | list[str] | dict[str, str], @@ -480,11 +480,12 @@ def _validate_inputs(self, func, inputs): ) from exc def _validate_unique_outputs(self): - diff = Counter(self.outputs) - Counter(set(self.outputs)) + cnt = Counter(self.outputs) + diff = {k for k in cnt if cnt[k] > 1} if diff: raise ValueError( - f"Failed to create node {self} due to duplicate" - f" output(s) {set(diff.keys())}.\nNode outputs must be unique." + f"Failed to create node {self} due to duplicate " + f"output(s) {diff}.\nNode outputs must be unique." ) def _validate_inputs_dif_than_outputs(self): @@ -518,7 +519,7 @@ def _node_error_message(msg) -> str: ) -def node( +def node( # noqa: too-many-arguments func: Callable, inputs: None | str | list[str] | dict[str, str], outputs: None | str | list[str] | dict[str, str], diff --git a/kedro/pipeline/pipeline.py b/kedro/pipeline/pipeline.py index 5ce57bac35..5b76416182 100644 --- a/kedro/pipeline/pipeline.py +++ b/kedro/pipeline/pipeline.py @@ -32,7 +32,7 @@ def _transcode_split(element: str) -> tuple[str, str]: """ split_name = element.split(TRANSCODING_SEPARATOR) - if len(split_name) > 2: + if len(split_name) > 2: # noqa: PLR2004 raise ValueError( f"Expected maximum 1 transcoding separator, found {len(split_name) - 1} " f"instead: '{element}'." @@ -71,7 +71,7 @@ class ConfirmNotUniqueError(Exception): pass -class Pipeline: # pylint: disable=too-many-public-methods +class Pipeline: # noqa: too-many-public-methods """A ``Pipeline`` defined as a collection of ``Node`` objects. This class treats nodes as part of a graph representation and provides inputs, outputs and execution order. 
@@ -679,8 +679,7 @@ def only_nodes_with_tags(self, *tags: str) -> Pipeline: nodes = [node for node in self.nodes if tags & node.tags] return Pipeline(nodes) - # pylint: disable=too-many-arguments - def filter( + def filter( # noqa: too-many-arguments self, tags: Iterable[str] = None, from_nodes: Iterable[str] = None, diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 9864bece82..860cefed6a 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -33,7 +33,7 @@ _MAX_WINDOWS_WORKERS = 61 # https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -_SharedMemoryDataSet: type +_SharedMemoryDataSet: type[_SharedMemoryDataset] class _SharedMemoryDataset: @@ -92,20 +92,18 @@ class ParallelRunnerManager(SyncManager): """ -ParallelRunnerManager.register( # pylint: disable=no-member - "MemoryDataset", MemoryDataset -) +ParallelRunnerManager.register("MemoryDataset", MemoryDataset) # noqa: no-member def _bootstrap_subprocess(package_name: str, logging_config: dict[str, Any]): - # pylint: disable=import-outside-toplevel,cyclic-import + # noqa: import-outside-toplevel,cyclic-import from kedro.framework.project import configure_logging, configure_project configure_project(package_name) configure_logging(logging_config) -def _run_node_synchronization( # pylint: disable=too-many-arguments +def _run_node_synchronization( # noqa: too-many-arguments node: Node, catalog: DataCatalog, is_async: bool = False, @@ -166,7 +164,7 @@ def __init__(self, max_workers: int = None, is_async: bool = False): """ super().__init__(is_async=is_async) self._manager = ParallelRunnerManager() - self._manager.start() # pylint: disable=consider-using-with + self._manager.start() # noqa: consider-using-with # This code comes from the concurrent.futures library # https://github.com/python/cpython/blob/master/Lib/concurrent/futures/process.py#L588 @@ -224,7 +222,7 @@ def _validate_catalog(cls, catalog: DataCatalog, pipeline: Pipeline): will not be synchronized across threads. """ - data_sets = catalog._data_sets # pylint: disable=protected-access + data_sets = catalog._data_sets # noqa: protected-access unserialisable = [] for name, data_set in data_sets.items(): @@ -277,7 +275,7 @@ def _get_required_workers_count(self, pipeline: Pipeline): return min(required_processes, self._max_workers) - def _run( # pylint: disable=too-many-locals,useless-suppression + def _run( # noqa: too-many-locals,useless-suppression self, pipeline: Pipeline, catalog: DataCatalog, @@ -300,7 +298,7 @@ def _run( # pylint: disable=too-many-locals,useless-suppression Exception: In case of any downstream node failure. 
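The `_run` method annotated above is normally reached through a session-level run; a sketch under the assumption of an existing project (the project path is a placeholder).

from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project
from kedro.runner import ParallelRunner

project_path = "/path/to/kedro-project"  # illustrative
bootstrap_project(project_path)
with KedroSession.create(project_path=project_path) as session:
    session.run(runner=ParallelRunner(max_workers=2))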
""" - # pylint: disable=import-outside-toplevel,cyclic-import + # noqa: import-outside-toplevel,cyclic-import nodes = pipeline.nodes self._validate_catalog(catalog, pipeline) diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 00657dfe38..084843124e 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -286,7 +286,7 @@ def _has_persistent_inputs(node: Node, catalog: DataCatalog) -> bool: """ for node_input in node.inputs: - # pylint: disable=protected-access + # noqa: protected-access if isinstance(catalog._data_sets[node_input], MemoryDataset): return False return True @@ -335,7 +335,7 @@ def run_node( return node -def _collect_inputs_from_hook( +def _collect_inputs_from_hook( # noqa: too-many-arguments node: Node, catalog: DataCatalog, inputs: dict[str, Any], @@ -343,7 +343,7 @@ def _collect_inputs_from_hook( hook_manager: PluginManager, session_id: str = None, ) -> dict[str, Any]: - # pylint: disable=too-many-arguments + inputs = inputs.copy() # shallow copy to prevent in-place modification by the hook hook_response = hook_manager.hook.before_node_run( node=node, @@ -364,13 +364,12 @@ def _collect_inputs_from_hook( f"'before_node_run' must return either None or a dictionary mapping " f"dataset names to updated values, got '{response_type}' instead." ) - response = response or {} - additional_inputs.update(response) + additional_inputs.update(response or {}) return additional_inputs -def _call_node_run( +def _call_node_run( # noqa: too-many-arguments node: Node, catalog: DataCatalog, inputs: dict[str, Any], @@ -378,7 +377,7 @@ def _call_node_run( hook_manager: PluginManager, session_id: str = None, ) -> dict[str, Any]: - # pylint: disable=too-many-arguments + try: outputs = node.run(inputs) except Exception as exc: diff --git a/kedro/runner/thread_runner.py b/kedro/runner/thread_runner.py index edf3e1f3c0..6f3d6818d1 100644 --- a/kedro/runner/thread_runner.py +++ b/kedro/runner/thread_runner.py @@ -81,7 +81,7 @@ def _get_required_workers_count(self, pipeline: Pipeline): else required_threads ) - def _run( # pylint: disable=too-many-locals,useless-suppression + def _run( # noqa: too-many-locals,useless-suppression self, pipeline: Pipeline, catalog: DataCatalog, diff --git a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/config/parameters/{{ cookiecutter.pipeline_name }}.yml b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/config/parameters_{{ cookiecutter.pipeline_name }}.yml similarity index 100% rename from kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/config/parameters/{{ cookiecutter.pipeline_name }}.yml rename to kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/config/parameters_{{ cookiecutter.pipeline_name }}.yml diff --git a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/pipeline.py b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/pipeline.py index 670fd43fff..587123c64c 100644 --- a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/pipeline.py +++ b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/pipeline.py @@ -3,7 +3,7 @@ generated using Kedro {{ cookiecutter.kedro_version }} """ -from kedro.pipeline import Pipeline, node, pipeline +from kedro.pipeline import Pipeline, pipeline def create_pipeline(**kwargs) -> Pipeline: diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/local/.gitkeep b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/local/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git 
a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/tests/test_run.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/tests/test_run.py index e714f76314..785c5a40b9 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/tests/test_run.py +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/tests/test_run.py @@ -12,10 +12,10 @@ import pytest -from kedro.framework.project import settings from kedro.config import ConfigLoader from kedro.framework.context import KedroContext from kedro.framework.hooks import _create_hook_manager +from kedro.framework.project import settings @pytest.fixture diff --git a/pyproject.toml b/pyproject.toml index ec06b75d5f..df3bf4a2fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,33 @@ authors = [ {name = "Kedro"} ] description = "Kedro helps you build production-ready data and analytics pipelines" -requires-python = ">=3.7, <3.11" +requires-python = ">=3.7" +dependencies = [ + "anyconfig~=0.10.0", + "attrs>=21.3", + "build", + "cachetools~=5.3", + "click<9.0", + "cookiecutter>=2.1.1, <3.0", + "dynaconf>=3.1.2, <4.0", + "fsspec>=2021.4, <2024.1", # Upper bound set arbitrarily, to be reassessed in early 2024 + "gitpython~=3.0", + "importlib-metadata>=3.6; python_version >= '3.8'", + "importlib_metadata>=3.6, <5.0; python_version < '3.8'", # The "selectable" entry points were introduced in `importlib_metadata` 3.6 and Python 3.10. Bandit on Python 3.7 relies on a library with `importlib_metadata` < 5.0 + "importlib_resources>=1.3", # The `files()` API was introduced in `importlib_resources` 1.3 and Python 3.9. + "jmespath>=0.9.5, <2.0", + "more_itertools>=9,<11", + "omegaconf~=2.3", + "parse~=1.19.0", + "pip-tools>=6.5,<8", + "pluggy~=1.0", + "PyYAML>=4.2, <7.0", + "rich>=12.0, <14.0", + "rope>=0.21, <2.0", # subject to LGPLv3 license + "setuptools>=65.5.1", + "toml~=0.10", + "toposort~=1.5", # Needs to be at least 1.5 to be able to raise CircularDependencyError +] keywords = [ "pipelines", "machine learning", @@ -26,7 +52,7 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", ] -dynamic = ["readme", "version", "dependencies", "optional-dependencies"] +dynamic = ["readme", "version", "optional-dependencies"] [project.urls] Homepage = "https://kedro.org" @@ -46,7 +72,6 @@ include = ["kedro*"] [tool.setuptools.dynamic] readme = {file = "README.md", content-type = "text/markdown"} version = {attr = "kedro.__version__"} -dependencies = {file = "dependency/requirements.txt"} [tool.black] exclude = "/templates/|^features/steps/test_starter" @@ -54,6 +79,7 @@ exclude = "/templates/|^features/steps/test_starter" [tool.isort] profile = "black" + [tool.pylint] [tool.pylint.master] ignore = "CVS" @@ -67,7 +93,8 @@ unsafe-load-any-extension = false [tool.pylint.messages_control] disable = [ "ungrouped-imports", - "duplicate-code" + "duplicate-code", + "wrong-import-order", # taken care of by isort ] enable = ["useless-suppression"] [tool.pylint.refactoring] @@ -175,3 +202,18 @@ ignore_imports = [ "kedro.framework.context.context -> kedro.config", "kedro.framework.session.session -> kedro.config" ] + +[tool.ruff] +line-length = 88 +show-fixes = true +# select = ["A", "B", "C", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] +select = [ + "F", # Pyflakes + "E", # Pycodestyle + 
"W", # Pycodestyle + "UP", # pyupgrade + "I", # isort + "PL", # Pylint +] +ignore = ["E501"] # Black take care off line-too-long +unfixable = [] diff --git a/setup.py b/setup.py index 92eca04e5a..e78ea817a7 100644 --- a/setup.py +++ b/setup.py @@ -1,23 +1,14 @@ -from codecs import open from glob import glob from itertools import chain -from os import path from setuptools import setup -name = "kedro" -here = path.abspath(path.dirname(__file__)) - # at least 1.3 to be able to use XMLDataSet and pandas integration with fsspec PANDAS = "pandas~=1.3" -SPARK = "pyspark>=2.2, <4.0" +SPARK = "pyspark>=2.2, <3.4" HDFS = "hdfs>=2.5.8, <3.0" S3FS = "s3fs>=0.3.0, <0.5" -# get the dependencies and installs -with open("dependency/requirements.txt", encoding="utf-8") as f: - requires = [x.strip() for x in f if x.strip()] - template_files = [] for pattern in ["**/*", "**/.*", "**/.*/**", "**/.*/.**"]: template_files.extend( @@ -39,7 +30,7 @@ def _collect_requirements(requires): "geopandas.GeoJSONDataSet": ["geopandas>=0.6.0, <1.0", "pyproj~=3.0"] } matplotlib_require = {"matplotlib.MatplotlibWriter": ["matplotlib>=3.0.3, <4.0"]} -holoviews_require = {"holoviews.HoloviewsWriter": ["holoviews~=1.13.0"]} +holoviews_require = {"holoviews.HoloviewsWriter": ["holoviews>=1.13.0"]} networkx_require = {"networkx.NetworkXDataSet": ["networkx~=2.4"]} pandas_require = { "pandas.CSVDataSet": [PANDAS], @@ -80,7 +71,9 @@ def _collect_requirements(requires): "tensorflow.TensorflowModelDataset": [ # currently only TensorFlow V2 supported for saving and loading. # V1 requires HDF5 and serialises differently - "tensorflow~=2.0" + "tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'", + # https://developer.apple.com/metal/tensorflow-plugin/ + "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", ] } yaml_require = {"yaml.YAMLDataSet": [PANDAS, "PyYAML>=4.2, <7.0"]} @@ -104,7 +97,7 @@ def _collect_requirements(requires): "sphinxcontrib-mermaid~=0.7.1", "myst-parser~=1.0.0", "Jinja2<3.1.0", - "kedro-datasets[all]~=1.4.2", + "kedro-datasets[all,pandas-deltatabledataset]~=1.5.1", ], "geopandas": _collect_requirements(geopandas_require), "matplotlib": _collect_requirements(matplotlib_require), @@ -139,10 +132,82 @@ def _collect_requirements(requires): } extras_require["all"] = _collect_requirements(extras_require) +extras_require["test"] = [ + "adlfs>=2021.7.1, <=2022.2; python_version == '3.7'", + "adlfs~=2023.1; python_version >= '3.8'", + "bandit>=1.6.2, <2.0", + "behave==1.2.6", + "biopython~=1.73", + "blacken-docs==1.9.2", + "black~=22.0", + "compress-pickle[lz4]~=2.1.0", + "coverage[toml]", + "dask[complete]~=2021.10", # pinned by Snyk to avoid a vulnerability + "delta-spark>=1.2.1; python_version >= '3.11'", # 1.2.0 has a bug that breaks some of our tests: https://github.com/delta-io/delta/issues/1070 + "delta-spark~=1.2.1; python_version < '3.11'", + "dill~=0.3.1", + "filelock>=3.4.0, <4.0", + "gcsfs>=2021.4, <=2023.1; python_version == '3.7'", + "gcsfs>=2023.1, <2023.3; python_version >= '3.8'", + "geopandas>=0.6.0, <1.0", + "hdfs>=2.5.8, <3.0", + "holoviews>=1.13.0", + "import-linter[toml]==1.8.0", + "ipython>=7.31.1, <8.0; python_version < '3.8'", + "ipython~=8.10; python_version >= '3.8'", + "isort~=5.0", + "Jinja2<3.1.0", + "joblib>=0.14", + "jupyterlab_server>=2.11.1, <2.16.0", # 2.16.0 requires importlib_metedata >= 4.8.3 which conflicts with flake8 requirement + "jupyterlab~=3.0, <3.6.0", # 3.6.0 requires jupyterlab_server~=2.19 + "jupyter~=1.0", + 
"lxml~=4.6", + "matplotlib>=3.0.3, <3.4; python_version < '3.10'", # 3.4.0 breaks holoviews + "matplotlib>=3.5, <3.6; python_version >= '3.10'", + "memory_profiler>=0.50.0, <1.0", + "moto==1.3.7; python_version < '3.10'", + "moto==4.1.12; python_version >= '3.10'", + "networkx~=2.4", + "opencv-python~=4.5.5.64", + "openpyxl>=3.0.3, <4.0", + "pandas-gbq>=0.12.0, <0.18.0; python_version < '3.11'", + "pandas-gbq>=0.18.0; python_version >= '3.11'", + "pandas~=1.3 # 1.3 for read_xml/to_xml", + "Pillow~=9.0", + "plotly>=4.8.0, <6.0", + "pre-commit>=2.9.2, <3.0", # The hook `mypy` requires pre-commit version 2.9.2. + "pyarrow>=1.0; python_version < '3.11'", + "pyarrow>=7.0; python_version >= '3.11'", # Adding to avoid numpy build errors + "pylint>=2.17.0, <3.0", + "pyproj~=3.0", + "pyspark>=2.2, <3.4; python_version < '3.11'", + "pyspark>=3.4; python_version >= '3.11'", + "pytest-cov~=3.0", + "pytest-mock>=1.7.1, <2.0", + "pytest-xdist[psutil]~=2.2.1", + "pytest~=7.2", + "redis~=4.1", + "requests-mock~=1.6", + "requests~=2.20", + "s3fs>=0.3.0, <0.5", # Needs to be at least 0.3.0 to make use of `cachable` attribute on S3FileSystem. + "scikit-learn>=1.0.2,<2", + "scipy>=1.7.3", + "semver", + "SQLAlchemy~=1.2", + "tables~=3.6.0; platform_system == 'Windows' and python_version<'3.8'", + "tables~=3.8.0; platform_system == 'Windows' and python_version>='3.8'", # Import issues with python 3.8 with pytables pinning to 3.8.0 fixes this https://github.com/PyTables/PyTables/issues/933#issuecomment-1555917593 + "tables~=3.6; platform_system != 'Windows'", + "tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'", + # https://developer.apple.com/metal/tensorflow-plugin/ + "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'", + "triad>=0.6.7, <1.0", + "trufflehog~=2.1", + "xlsxwriter~=1.0", +] setup( package_data={ - name: ["py.typed", "test_requirements.txt"] + template_files + "kedro": ["py.typed"] + template_files }, extras_require=extras_require, ) diff --git a/test_requirements.txt b/test_requirements.txt deleted file mode 100644 index 5c81ebdc89..0000000000 --- a/test_requirements.txt +++ /dev/null @@ -1,64 +0,0 @@ --r dependency/requirements.txt -adlfs>=2021.7.1, <=2022.2; python_version == '3.7' -adlfs~=2023.1; python_version >= '3.8' -bandit>=1.6.2, <2.0 -behave==1.2.6 -biopython~=1.73 -blacken-docs==1.9.2 -black~=22.0 -compress-pickle[lz4]~=1.2.0 -coverage[toml] -dask[complete]~=2021.10 # pinned by Snyk to avoid a vulnerability -delta-spark~=1.2.1 # 1.2.0 has a bug that breaks some of our tests: https://github.com/delta-io/delta/issues/1070 -dill~=0.3.1 -filelock>=3.4.0, <4.0 -gcsfs>=2021.4, <=2023.1; python_version == '3.7' -gcsfs>=2023.1, <2023.3; python_version >= '3.8' -geopandas>=0.6.0, <1.0 -hdfs>=2.5.8, <3.0 -holoviews~=1.13.0 -import-linter[toml]==1.8.0 -ipython>=7.31.1, <8.0; python_version < '3.8' -ipython~=8.10; python_version >= '3.8' -isort~=5.0 -Jinja2<3.1.0 -joblib>=0.14 -jupyterlab_server>=2.11.1, <2.16.0 # 2.16.0 requires importlib_metedata >= 4.8.3 which conflicts with flake8 requirement -jupyterlab~=3.0, <3.6.0 # 3.6.0 requires jupyterlab_server~=2.19 -jupyter~=1.0 -lxml~=4.6 -matplotlib>=3.0.3, <3.4; python_version < '3.10' # 3.4.0 breaks holoviews -matplotlib>=3.5, <3.6; python_version == '3.10' -memory_profiler>=0.50.0, <1.0 -moto==1.3.7; python_version < '3.10' -moto==3.0.4; python_version == '3.10' -networkx~=2.4 -opencv-python~=4.5.5.64 -openpyxl>=3.0.3, <4.0 -pandas-gbq>=0.12.0, <0.18.0 -pandas~=1.3 # 1.3 for 
read_xml/to_xml -Pillow~=9.0 -plotly>=4.8.0, <6.0 -pre-commit>=2.9.2, <3.0 # The hook `mypy` requires pre-commit version 2.9.2. -psutil~=5.8 -pyarrow>=1.0, <7.0 -pylint>=2.17.0, <3.0 -pyproj~=3.0 -pyspark>=2.2, <4.0 -pytest-cov~=3.0 -pytest-mock>=1.7.1, <2.0 -pytest-xdist[psutil]~=2.2.1 -pytest~=7.2 -redis~=4.1 -requests-mock~=1.6 -requests~=2.20 -s3fs>=0.3.0, <0.5 # Needs to be at least 0.3.0 to make use of `cachable` attribute on S3FileSystem. -scikit-learn~=1.0.2 -scipy~=1.7.3 -SQLAlchemy~=1.2 -tables~=3.6.0; platform_system == "Windows" and python_version<'3.9' -tables~=3.6; platform_system != "Windows" -tensorflow~=2.0 -triad>=0.6.7, <1.0 -trufflehog~=2.1 -xlsxwriter~=1.0 diff --git a/tests/config/test_omegaconf_config.py b/tests/config/test_omegaconf_config.py index dd49292019..af57b52224 100644 --- a/tests/config/test_omegaconf_config.py +++ b/tests/config/test_omegaconf_config.py @@ -649,3 +649,25 @@ def test_variable_interpolation_in_catalog_with_separate_templates_file( conf = OmegaConfigLoader(str(tmp_path)) conf.default_run_env = "" assert conf["catalog"]["companies"]["type"] == "pandas.CSVDataSet" + + def test_custom_resolvers(self, tmp_path): + base_params = tmp_path / _BASE_ENV / "parameters.yml" + param_config = { + "model_options": { + "param1": "${add: 3, 4}", + "param2": "${plus_2: 1}", + "param3": "${oc.env: VAR}", + } + } + _write_yaml(base_params, param_config) + custom_resolvers = { + "add": lambda *x: sum(x), + "plus_2": lambda x: x + 2, + "oc.env": oc.env, + } + os.environ["VAR"] = "my_env_variable" + conf = OmegaConfigLoader(tmp_path, custom_resolvers=custom_resolvers) + conf.default_run_env = "" + assert conf["parameters"]["model_options"]["param1"] == 7 + assert conf["parameters"]["model_options"]["param2"] == 3 + assert conf["parameters"]["model_options"]["param3"] == "my_env_variable" diff --git a/tests/extras/datasets/spark/test_deltatable_dataset.py b/tests/extras/datasets/spark/test_deltatable_dataset.py index 00eb313f6a..a0ad5bc9d9 100644 --- a/tests/extras/datasets/spark/test_deltatable_dataset.py +++ b/tests/extras/datasets/spark/test_deltatable_dataset.py @@ -1,8 +1,10 @@ import pytest from delta import DeltaTable +from pyspark import __version__ from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.sql.utils import AnalysisException +from semver import VersionInfo from kedro.extras.datasets.spark import DeltaTableDataSet, SparkDataSet from kedro.io import DataCatalog, DatasetError @@ -10,6 +12,8 @@ from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner +SPARK_VERSION = VersionInfo.parse(__version__) + @pytest.fixture def sample_spark_df(): @@ -65,10 +69,16 @@ def test_exists(self, tmp_path, sample_spark_df): def test_exists_raises_error(self, mocker): delta_ds = DeltaTableDataSet(filepath="") - mocker.patch.object( - delta_ds, "_get_spark", side_effect=AnalysisException("Other Exception", []) - ) - + if SPARK_VERSION.match(">=3.4.0"): + mocker.patch.object( + delta_ds, "_get_spark", side_effect=AnalysisException("Other Exception") + ) + else: + mocker.patch.object( + delta_ds, + "_get_spark", + side_effect=AnalysisException("Other Exception", []), + ) with pytest.raises(DatasetError, match="Other Exception"): delta_ds.exists() diff --git a/tests/extras/datasets/spark/test_spark_dataset.py b/tests/extras/datasets/spark/test_spark_dataset.py index 6b3a43f23e..a491ef6aeb 100644 --- 
a/tests/extras/datasets/spark/test_spark_dataset.py +++ b/tests/extras/datasets/spark/test_spark_dataset.py @@ -7,6 +7,7 @@ import pandas as pd import pytest from moto import mock_s3 +from pyspark import __version__ from pyspark.sql import SparkSession from pyspark.sql.functions import col from pyspark.sql.types import ( @@ -17,6 +18,7 @@ StructType, ) from pyspark.sql.utils import AnalysisException +from semver import VersionInfo from kedro.extras.datasets.pandas import CSVDataSet, ParquetDataSet from kedro.extras.datasets.pickle import PickleDataSet @@ -60,6 +62,8 @@ (HDFS_PREFIX + "/2019-02-01T00.00.00.000Z", [], ["other_file"]), ] +SPARK_VERSION = VersionInfo.parse(__version__) + @pytest.fixture def sample_pandas_df() -> pd.DataFrame: @@ -405,11 +409,18 @@ def test_exists_raises_error(self, mocker): # exists should raise all errors except for # AnalysisExceptions clearly indicating a missing file spark_data_set = SparkDataSet(filepath="") - mocker.patch.object( - spark_data_set, - "_get_spark", - side_effect=AnalysisException("Other Exception", []), - ) + if SPARK_VERSION.match(">=3.4.0"): + mocker.patch.object( + spark_data_set, + "_get_spark", + side_effect=AnalysisException("Other Exception"), + ) + else: + mocker.patch.object( # pylint: disable=expression-not-assigned + spark_data_set, + "_get_spark", + side_effect=AnalysisException("Other Exception", []), + ) with pytest.raises(DatasetError, match="Other Exception"): spark_data_set.exists() @@ -528,7 +539,7 @@ def test_versioning_existing_dataset( sys.platform.startswith("win"), reason="DBFS doesn't work on Windows" ) class TestSparkDataSetVersionedDBFS: - def test_load_latest( # pylint: disable=too-many-arguments + def test_load_latest( # noqa: too-many-arguments self, mocker, versioned_dataset_dbfs, version, tmp_path, sample_spark_df ): mocked_glob = mocker.patch.object(versioned_dataset_dbfs, "_glob_function") @@ -555,7 +566,7 @@ def test_load_exact(self, tmp_path, sample_spark_df): assert reloaded.exceptAll(sample_spark_df).count() == 0 - def test_save( # pylint: disable=too-many-arguments + def test_save( # noqa: too-many-arguments self, mocker, versioned_dataset_dbfs, version, tmp_path, sample_spark_df ): mocked_glob = mocker.patch.object(versioned_dataset_dbfs, "_glob_function") @@ -568,7 +579,7 @@ def test_save( # pylint: disable=too-many-arguments ) assert (tmp_path / FILENAME / version.save / FILENAME).exists() - def test_exists( # pylint: disable=too-many-arguments + def test_exists( # noqa: too-many-arguments self, mocker, versioned_dataset_dbfs, version, tmp_path, sample_spark_df ): mocked_glob = mocker.patch.object(versioned_dataset_dbfs, "_glob_function") diff --git a/tests/extras/datasets/spark/test_spark_hive_dataset.py b/tests/extras/datasets/spark/test_spark_hive_dataset.py index ba7fc734a6..399ebc4169 100644 --- a/tests/extras/datasets/spark/test_spark_hive_dataset.py +++ b/tests/extras/datasets/spark/test_spark_hive_dataset.py @@ -293,12 +293,9 @@ def test_read_from_non_existent_table(self): ) with pytest.raises( DatasetError, - match=r"Failed while loading data from data set " - r"SparkHiveDataSet\(database=default_1, format=hive, " - r"table=table_doesnt_exist, table_pk=\[\], write_mode=append\)\.\n" - r"Table or view not found: default_1.table_doesnt_exist;\n" - r"'UnresolvedRelation \[default_1, " - r"table_doesnt_exist\], \[\], false\n", + match=r"Failed while loading data from data set SparkHiveDataSet" + r"|table_doesnt_exist" + r"|UnresolvedRelation", ): dataset.load() diff --git 
a/tests/extras/datasets/spark/test_spark_jdbc_dataset.py b/tests/extras/datasets/spark/test_spark_jdbc_dataset.py index fa7af0f966..6d89251fc5 100644 --- a/tests/extras/datasets/spark/test_spark_jdbc_dataset.py +++ b/tests/extras/datasets/spark/test_spark_jdbc_dataset.py @@ -1,5 +1,3 @@ -from unittest import mock - import pytest from kedro.extras.datasets.spark import SparkJDBCDataSet @@ -53,59 +51,52 @@ def test_missing_table(): SparkJDBCDataSet(url="dummy_url", table=None) -def mock_save(arg_dict): - mock_data = mock.Mock() - data_set = SparkJDBCDataSet(**arg_dict) +def test_save(mocker, spark_jdbc_args): + mock_data = mocker.Mock() + data_set = SparkJDBCDataSet(**spark_jdbc_args) data_set.save(mock_data) - return mock_data - - -def test_save(spark_jdbc_args): - data = mock_save(spark_jdbc_args) - data.write.jdbc.assert_called_with("dummy_url", "dummy_table") + mock_data.write.jdbc.assert_called_with("dummy_url", "dummy_table") -def test_save_credentials(spark_jdbc_args_credentials): - data = mock_save(spark_jdbc_args_credentials) - data.write.jdbc.assert_called_with( +def test_save_credentials(mocker, spark_jdbc_args_credentials): + mock_data = mocker.Mock() + data_set = SparkJDBCDataSet(**spark_jdbc_args_credentials) + data_set.save(mock_data) + mock_data.write.jdbc.assert_called_with( "dummy_url", "dummy_table", properties={"user": "dummy_user", "password": "dummy_pw"}, ) -def test_save_args(spark_jdbc_args_save_load): - data = mock_save(spark_jdbc_args_save_load) - data.write.jdbc.assert_called_with( +def test_save_args(mocker, spark_jdbc_args_save_load): + mock_data = mocker.Mock() + data_set = SparkJDBCDataSet(**spark_jdbc_args_save_load) + data_set.save(mock_data) + mock_data.write.jdbc.assert_called_with( "dummy_url", "dummy_table", properties={"driver": "dummy_driver"} ) -def test_except_bad_credentials(spark_jdbc_args_credentials_with_none_password): +def test_except_bad_credentials(mocker, spark_jdbc_args_credentials_with_none_password): pattern = r"Credential property 'password' cannot be None(.+)" with pytest.raises(DatasetError, match=pattern): - mock_save(spark_jdbc_args_credentials_with_none_password) + mock_data = mocker.Mock() + data_set = SparkJDBCDataSet(**spark_jdbc_args_credentials_with_none_password) + data_set.save(mock_data) -@mock.patch( - "kedro.extras.datasets.spark.spark_jdbc_dataset.SparkSession.builder.getOrCreate" -) -def mock_load(mock_get_or_create, arg_dict): - spark = mock_get_or_create.return_value - data_set = SparkJDBCDataSet(**arg_dict) +def test_load(mocker, spark_jdbc_args): + spark = mocker.patch.object(SparkJDBCDataSet, "_get_spark").return_value + data_set = SparkJDBCDataSet(**spark_jdbc_args) data_set.load() - return spark - - -def test_load(spark_jdbc_args): - # pylint: disable=no-value-for-parameter - spark = mock_load(arg_dict=spark_jdbc_args) spark.read.jdbc.assert_called_with("dummy_url", "dummy_table") -def test_load_credentials(spark_jdbc_args_credentials): - # pylint: disable=no-value-for-parameter - spark = mock_load(arg_dict=spark_jdbc_args_credentials) +def test_load_credentials(mocker, spark_jdbc_args_credentials): + spark = mocker.patch.object(SparkJDBCDataSet, "_get_spark").return_value + data_set = SparkJDBCDataSet(**spark_jdbc_args_credentials) + data_set.load() spark.read.jdbc.assert_called_with( "dummy_url", "dummy_table", @@ -113,9 +104,10 @@ def test_load_credentials(spark_jdbc_args_credentials): ) -def test_load_args(spark_jdbc_args_save_load): - # pylint: disable=no-value-for-parameter - spark = 
mock_load(arg_dict=spark_jdbc_args_save_load) +def test_load_args(mocker, spark_jdbc_args_save_load): + spark = mocker.patch.object(SparkJDBCDataSet, "_get_spark").return_value + data_set = SparkJDBCDataSet(**spark_jdbc_args_save_load) + data_set.load() spark.read.jdbc.assert_called_with( "dummy_url", "dummy_table", properties={"driver": "dummy_driver"} ) diff --git a/tests/framework/cli/micropkg/test_micropkg_package.py b/tests/framework/cli/micropkg/test_micropkg_package.py index 4c3daf7abe..3207bb15f5 100644 --- a/tests/framework/cli/micropkg/test_micropkg_package.py +++ b/tests/framework/cli/micropkg/test_micropkg_package.py @@ -33,7 +33,7 @@ def assert_sdist_contents_correct( f"{package_name}-{version}/{package_name}/README.md", f"{package_name}-{version}/{package_name}/nodes.py", f"{package_name}-{version}/{package_name}/pipeline.py", - f"{package_name}-{version}/{package_name}/config/parameters/{package_name}.yml", + f"{package_name}-{version}/{package_name}/config/parameters_{package_name}.yml", f"{package_name}-{version}/tests/__init__.py", f"{package_name}-{version}/tests/test_pipeline.py", } @@ -354,9 +354,9 @@ def test_package_modular_pipeline_with_nested_parameters( assert ( "retail-0.1/retail/config/parameters/retail/params1.yml" in sdist_contents ) - assert "retail-0.1/retail/config/parameters/retail.yml" in sdist_contents + assert "retail-0.1/retail/config/parameters_retail.yml" in sdist_contents assert ( - "retail-0.1/retail/config/parameters/retail_banking.yml" + "retail-0.1/retail/config/parameters_retail_banking.yml" not in sdist_contents ) @@ -424,7 +424,7 @@ def test_package_pipeline_with_deep_nested_parameters( "retail-0.1/retail/config/parameters/retail/deep/params1.yml" in sdist_contents ) - assert "retail-0.1/retail/config/parameters/retail.yml" in sdist_contents + assert "retail-0.1/retail/config/parameters_retail.yml" in sdist_contents assert "retail-0.1/retail/config/parameters/deep/retail.yml" in sdist_contents assert ( "retail-0.1/retail/config/parameters/a/b/c/d/retail/params3.yml" diff --git a/tests/framework/cli/micropkg/test_micropkg_pull.py b/tests/framework/cli/micropkg/test_micropkg_pull.py index 9cbad00a90..6a9a4073ae 100644 --- a/tests/framework/cli/micropkg/test_micropkg_pull.py +++ b/tests/framework/cli/micropkg/test_micropkg_pull.py @@ -115,8 +115,7 @@ def test_pull_local_sdist( fake_repo_path / settings.CONF_SOURCE / config_env - / "parameters" - / f"{pipeline_name}.yml" + / f"parameters_{pipeline_name}.yml" ) self.assert_package_files_exist(source_dest) @@ -160,8 +159,7 @@ def test_pull_local_sdist_compare( fake_repo_path / settings.CONF_SOURCE / "base" - / "parameters" - / f"{PIPELINE_NAME}.yml" + / f"parameters_{PIPELINE_NAME}.yml" ) sdist_file = ( @@ -189,8 +187,7 @@ def test_pull_local_sdist_compare( fake_repo_path / settings.CONF_SOURCE / config_env - / "parameters" - / f"{pipeline_name}.yml" + / f"parameters_{pipeline_name}.yml" ) assert not filecmp.dircmp(source_path, source_dest).diff_files @@ -237,8 +234,7 @@ def test_micropkg_pull_same_alias_package_name( fake_repo_path / settings.CONF_SOURCE / config_env - / "parameters" - / f"{pipeline_name}.yml" + / f"parameters_{pipeline_name}.yml" ) self.assert_package_files_exist(source_dest) @@ -287,8 +283,7 @@ def test_micropkg_pull_nested_destination( fake_repo_path / settings.CONF_SOURCE / config_env - / "parameters" - / f"{pipeline_name}.yml" + / f"parameters_{pipeline_name}.yml" ) self.assert_package_files_exist(source_dest) @@ -451,8 +446,7 @@ def test_pull_tests_missing( fake_repo_path / 
settings.CONF_SOURCE / "base" - / "parameters" - / f"{PIPELINE_NAME}.yml" + / f"parameters_{PIPELINE_NAME}.yml" ) # Make sure the files actually deleted before pulling from the sdist file. assert not source_path.exists() @@ -480,8 +474,7 @@ def test_pull_tests_missing( fake_repo_path / settings.CONF_SOURCE / config_env - / "parameters" - / f"{pipeline_name}.yml" + / f"parameters_{pipeline_name}.yml" ) self.assert_package_files_exist(source_dest) @@ -509,8 +502,7 @@ def test_pull_config_missing( fake_repo_path / settings.CONF_SOURCE / "base" - / "parameters" - / f"{PIPELINE_NAME}.yml" + / f"parameters_{PIPELINE_NAME}.yml" ) source_params_config.unlink() call_micropkg_package(fake_project_cli, fake_metadata) @@ -544,8 +536,7 @@ def test_pull_config_missing( fake_repo_path / settings.CONF_SOURCE / config_env - / "parameters" - / f"{pipeline_name}.yml" + / f"parameters_{pipeline_name}.yml" ) self.assert_package_files_exist(source_dest) @@ -586,8 +577,7 @@ def test_pull_from_pypi( fake_repo_path / settings.CONF_SOURCE / "base" - / "parameters" - / f"{PIPELINE_NAME}.yml" + / f"parameters_{PIPELINE_NAME}.yml" ) # Make sure the files actually deleted before pulling from pypi. assert not source_path.exists() @@ -645,8 +635,7 @@ def get_all(self, name, failobj=None): # pylint: disable=unused-argument fake_repo_path / settings.CONF_SOURCE / config_env - / "parameters" - / f"{pipeline_name}.yml" + / f"parameters_{pipeline_name}.yml" ) self.assert_package_files_exist(source_dest) diff --git a/tests/framework/cli/pipeline/test_pipeline.py b/tests/framework/cli/pipeline/test_pipeline.py index 4bdd965526..0414e79656 100644 --- a/tests/framework/cli/pipeline/test_pipeline.py +++ b/tests/framework/cli/pipeline/test_pipeline.py @@ -1,4 +1,3 @@ -import os import shutil from pathlib import Path @@ -20,14 +19,17 @@ def make_pipelines(request, fake_repo_path, fake_package_path, mocker): source_path = fake_package_path / "pipelines" / PIPELINE_NAME tests_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME - conf_path = fake_repo_path / settings.CONF_SOURCE / request.param / "parameters" + conf_path = fake_repo_path / settings.CONF_SOURCE / request.param + # old conf structure for 'pipeline delete' command backward compatibility + old_conf_path = conf_path / "parameters" - for path in (source_path, tests_path, conf_path): + for path in (source_path, tests_path, conf_path, old_conf_path): path.mkdir(parents=True, exist_ok=True) - (conf_path / f"{PIPELINE_NAME}.yml").touch() (tests_path / "test_pipe.py").touch() (source_path / "pipe.py").touch() + (conf_path / f"parameters_{PIPELINE_NAME}.yml").touch() + (old_conf_path / f"{PIPELINE_NAME}.yml").touch() yield mocker.stopall() @@ -67,8 +69,8 @@ def test_create_pipeline( # pylint: disable=too-many-locals # config conf_env = env or "base" conf_dir = (fake_repo_path / settings.CONF_SOURCE / conf_env).resolve() - actual_configs = list(conf_dir.glob(f"**/{PIPELINE_NAME}.yml")) - expected_configs = [conf_dir / "parameters" / f"{PIPELINE_NAME}.yml"] + actual_configs = list(conf_dir.glob(f"**/*{PIPELINE_NAME}.yml")) + expected_configs = [conf_dir / f"parameters_{PIPELINE_NAME}.yml"] assert actual_configs == expected_configs # tests @@ -92,7 +94,7 @@ def test_create_pipeline_skip_config( assert f"Pipeline '{PIPELINE_NAME}' was successfully created." 
in result.output conf_dirs = list((fake_repo_path / settings.CONF_SOURCE).rglob(PIPELINE_NAME)) - assert conf_dirs == [] # no configs created for the pipeline + assert not conf_dirs # no configs created for the pipeline test_dir = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME assert test_dir.is_dir() @@ -117,13 +119,12 @@ def test_catalog_and_params( # pylint: disable=too-many-locals "filepath": "data/01_raw/iris.csv", } } - catalog_file = conf_dir / "catalog" / f"{PIPELINE_NAME}.yml" - catalog_file.parent.mkdir() + catalog_file = conf_dir / f"catalog_{PIPELINE_NAME}.yml" with catalog_file.open("w") as f: yaml.dump(catalog_dict, f) # write pipeline parameters - params_file = conf_dir / "parameters" / f"{PIPELINE_NAME}.yml" + params_file = conf_dir / f"parameters_{PIPELINE_NAME}.yml" assert params_file.is_file() params_dict = {"params_from_pipeline": {"p1": [1, 2, 3], "p2": None}} with params_file.open("w") as f: @@ -143,8 +144,7 @@ def test_skip_copy(self, fake_repo_path, fake_project_cli, fake_metadata): fake_repo_path / settings.CONF_SOURCE / "base" - / dirname - / f"{PIPELINE_NAME}.yml" + / f"{dirname}_{PIPELINE_NAME}.yml" ) path.parent.mkdir(exist_ok=True) path.touch() @@ -166,7 +166,7 @@ def test_skip_copy(self, fake_repo_path, fake_project_cli, fake_metadata): assert result.exit_code == 0 assert "__init__.py': SKIPPED" in result.output - assert f"parameters{os.sep}{PIPELINE_NAME}.yml': SKIPPED" in result.output + assert f"parameters_{PIPELINE_NAME}.yml': SKIPPED" in result.output assert result.output.count("SKIPPED") == 2 # only 2 files skipped def test_failed_copy( @@ -269,17 +269,15 @@ def test_delete_pipeline( source_path = fake_package_path / "pipelines" / PIPELINE_NAME tests_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME - params_path = ( - fake_repo_path - / settings.CONF_SOURCE - / expected_conf - / "parameters" - / f"{PIPELINE_NAME}.yml" - ) + conf_path = fake_repo_path / settings.CONF_SOURCE / expected_conf + params_path = conf_path / f"parameters_{PIPELINE_NAME}.yml" + # old params structure for 'pipeline delete' command backward compatibility + old_params_path = conf_path / "parameters" / f"{PIPELINE_NAME}.yml" assert f"Deleting '{source_path}': OK" in result.output assert f"Deleting '{tests_path}': OK" in result.output assert f"Deleting '{params_path}': OK" in result.output + assert f"Deleting '{old_params_path}': OK" in result.output assert f"Pipeline '{PIPELINE_NAME}' was successfully deleted." 
in result.output assert ( @@ -290,6 +288,7 @@ def test_delete_pipeline( assert not source_path.exists() assert not tests_path.exists() assert not params_path.exists() + assert not old_params_path.exists() def test_delete_pipeline_skip( self, fake_repo_path, fake_project_cli, fake_metadata, fake_package_path @@ -309,8 +308,7 @@ def test_delete_pipeline_skip( fake_repo_path / settings.CONF_SOURCE / "base" - / "parameters" - / f"{PIPELINE_NAME}.yml" + / f"parameters_{PIPELINE_NAME}.yml" ) assert f"Deleting '{source_path}'" not in result.output @@ -401,8 +399,7 @@ def test_pipeline_delete_confirmation( fake_repo_path / settings.CONF_SOURCE / "base" - / "parameters" - / f"{PIPELINE_NAME}.yml" + / f"parameters_{PIPELINE_NAME}.yml" ) assert "The following paths will be removed:" in result.output @@ -442,8 +439,7 @@ def test_pipeline_delete_confirmation_skip( fake_repo_path / settings.CONF_SOURCE / "base" - / "parameters" - / f"{PIPELINE_NAME}.yml" + / f"parameters_{PIPELINE_NAME}.yml" ) assert "The following paths will be removed:" in result.output diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py index 1a61105dc5..a0ee78662b 100644 --- a/tests/framework/cli/test_catalog.py +++ b/tests/framework/cli/test_catalog.py @@ -1,5 +1,3 @@ -import shutil - import pytest import yaml from click.testing import CliRunner @@ -30,6 +28,45 @@ def mock_pipelines(mocker): return mocker.patch("kedro.framework.cli.catalog.pipelines", dummy_pipelines) +@pytest.fixture +def fake_catalog_config(): + config = { + "parquet_{factory_pattern}": { + "type": "pandas.ParquetDataSet", + "filepath": "test.pq", + }, + "csv_{factory_pattern}": {"type": "pandas.CSVDataSet", "filepath": "test.csv"}, + } + return config + + +@pytest.fixture +def fake_catalog_with_overlapping_factories(): + config = { + "an_example_dataset": { + "type": "pandas.CSVDataSet", + "filepath": "dummy_filepath", + }, + "an_example_{placeholder}": { + "type": "dummy_type", + "filepath": "dummy_filepath", + }, + "an_example_{place}_{holder}": { + "type": "dummy_type", + "filepath": "dummy_filepath", + }, + "on_{example_placeholder}": { + "type": "dummy_type", + "filepath": "dummy_filepath", + }, + "an_{example_placeholder}": { + "type": "dummy_type", + "filepath": "dummy_filepath", + }, + } + return config + + @pytest.mark.usefixtures( "chdir_to_dummy_project", "fake_load_context", "mock_pipelines" ) @@ -150,6 +187,47 @@ def test_default_dataset( assert yaml_dump_mock.call_count == 1 assert yaml_dump_mock.call_args[0][0][key] == expected_dict[key] + def test_list_factory_generated_datasets( + self, + fake_project_cli, + fake_metadata, + fake_load_context, + mocker, + mock_pipelines, + fake_catalog_config, + ): + """Test that datasets generated from factory patterns in the catalog + are resolved correctly under the correct dataset classes.
+ """ + yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML") + mocked_context = fake_load_context.return_value + mocked_context.catalog = DataCatalog.from_config(fake_catalog_config) + mocker.patch.object( + mock_pipelines[PIPELINE_NAME], + "data_sets", + return_value=mocked_context.catalog._data_sets.keys() + | {"csv_example", "parquet_example"}, + ) + + result = CliRunner().invoke( + fake_project_cli, + ["catalog", "list"], + obj=fake_metadata, + ) + + assert not result.exit_code + expected_dict = { + f"Datasets in '{PIPELINE_NAME}' pipeline": { + "Datasets generated from factories": { + "pandas.CSVDataSet": ["csv_example"], + "pandas.ParquetDataSet": ["parquet_example"], + } + } + } + key = f"Datasets in '{PIPELINE_NAME}' pipeline" + assert yaml_dump_mock.call_count == 1 + assert yaml_dump_mock.call_args[0][0][key] == expected_dict[key] + def identity(data): return data # pragma: no cover @@ -162,11 +240,12 @@ class TestCatalogCreateCommand: @staticmethod @pytest.fixture(params=["base"]) def catalog_path(request, fake_repo_path): - catalog_path = fake_repo_path / "conf" / request.param / "catalog" + catalog_path = fake_repo_path / "conf" / request.param yield catalog_path - shutil.rmtree(catalog_path, ignore_errors=True) + for file in catalog_path.glob("catalog_*"): + file.unlink() def test_pipeline_argument_is_required(self, fake_project_cli): result = CliRunner().invoke(fake_project_cli, ["catalog", "create"]) @@ -198,7 +277,7 @@ def test_catalog_is_created_in_base_by_default( main_catalog_config = yaml.safe_load(main_catalog_path.read_text()) assert "example_iris_data" in main_catalog_config - data_catalog_file = catalog_path / f"{self.PIPELINE_NAME}.yml" + data_catalog_file = catalog_path / f"catalog_{self.PIPELINE_NAME}.yml" result = CliRunner().invoke( fake_project_cli, @@ -222,9 +301,9 @@ def test_catalog_is_created_in_base_by_default( def test_catalog_is_created_in_correct_env( self, fake_project_cli, fake_metadata, catalog_path ): - data_catalog_file = catalog_path / f"{self.PIPELINE_NAME}.yml" + data_catalog_file = catalog_path / f"catalog_{self.PIPELINE_NAME}.yml" - env = catalog_path.parent.name + env = catalog_path.name result = CliRunner().invoke( fake_project_cli, ["catalog", "create", "--pipeline", self.PIPELINE_NAME, "--env", env], @@ -255,7 +334,7 @@ def test_no_missing_datasets( ) data_catalog_file = ( - fake_repo_path / "conf" / "base" / "catalog" / f"{self.PIPELINE_NAME}.yml" + fake_repo_path / "conf" / "base" / f"catalog_{self.PIPELINE_NAME}.yml" ) result = CliRunner().invoke( @@ -271,9 +350,7 @@ def test_no_missing_datasets( def test_missing_datasets_appended( self, fake_project_cli, fake_metadata, catalog_path ): - data_catalog_file = catalog_path / f"{self.PIPELINE_NAME}.yml" - assert not catalog_path.exists() - catalog_path.mkdir() + data_catalog_file = catalog_path / f"catalog_{self.PIPELINE_NAME}.yml" catalog_config = { "example_test_x": {"type": "pandas.CSVDataSet", "filepath": "test.csv"} @@ -307,3 +384,60 @@ def test_bad_env(self, fake_project_cli, fake_metadata): assert result.exit_code assert "Unable to instantiate Kedro session" in result.output + + +@pytest.mark.usefixtures( + "chdir_to_dummy_project", "fake_load_context", "mock_pipelines" +) +def test_rank_catalog_factories( + fake_project_cli, + fake_metadata, + mocker, + fake_load_context, + fake_catalog_with_overlapping_factories, +): + yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML") + mocked_context = fake_load_context.return_value + mocked_context.catalog 
= DataCatalog.from_config( + fake_catalog_with_overlapping_factories + ) + + result = CliRunner().invoke( + fake_project_cli, ["catalog", "rank"], obj=fake_metadata + ) + assert not result.exit_code + + expected_patterns_sorted = [ + "an_example_{place}_{holder}", + "an_example_{placeholder}", + "an_{example_placeholder}", + "on_{example_placeholder}", + ] + + assert yaml_dump_mock.call_count == 1 + assert yaml_dump_mock.call_args[0][0] == expected_patterns_sorted + + +@pytest.mark.usefixtures( + "chdir_to_dummy_project", + "fake_load_context", +) +def test_rank_catalog_factories_with_no_factories( + fake_project_cli, fake_metadata, fake_load_context +): + mocked_context = fake_load_context.return_value + + catalog_data_sets = { + "iris_data": CSVDataSet("test.csv"), + "intermediate": MemoryDataset(), + "not_used": CSVDataSet("test2.csv"), + } + mocked_context.catalog = DataCatalog(data_sets=catalog_data_sets) + + result = CliRunner().invoke( + fake_project_cli, ["catalog", "rank"], obj=fake_metadata + ) + + assert not result.exit_code + expected_output = "There are no dataset factories in the catalog." + assert expected_output in result.output diff --git a/tests/framework/cli/test_cli.py b/tests/framework/cli/test_cli.py index 6788f349f0..8c33f4e2ae 100644 --- a/tests/framework/cli/test_cli.py +++ b/tests/framework/cli/test_cli.py @@ -329,14 +329,14 @@ def test_init_error_is_caught(self, entry_points, entry_point): class TestKedroCLI: def test_project_commands_no_clipy(self, mocker, fake_metadata): - mocker.patch( - "kedro.framework.cli.cli.importlib.import_module", - side_effect=cycle([ModuleNotFoundError()]), - ) mocker.patch("kedro.framework.cli.cli._is_project", return_value=True) mocker.patch( "kedro.framework.cli.cli.bootstrap_project", return_value=fake_metadata ) + mocker.patch( + "kedro.framework.cli.cli.importlib.import_module", + side_effect=cycle([ModuleNotFoundError()]), + ) kedro_cli = KedroCLI(fake_metadata.project_path) print(kedro_cli.project_groups) assert len(kedro_cli.project_groups) == 6 @@ -356,26 +356,26 @@ def test_project_commands_no_project(self, mocker, tmp_path): assert kedro_cli._metadata is None def test_project_commands_invalid_clipy(self, mocker, fake_metadata): - mocker.patch( - "kedro.framework.cli.cli.importlib.import_module", return_value=None - ) mocker.patch("kedro.framework.cli.cli._is_project", return_value=True) mocker.patch( "kedro.framework.cli.cli.bootstrap_project", return_value=fake_metadata ) + mocker.patch( + "kedro.framework.cli.cli.importlib.import_module", return_value=None + ) with raises(KedroCliError, match="Cannot load commands from"): _ = KedroCLI(fake_metadata.project_path) def test_project_commands_valid_clipy(self, mocker, fake_metadata): Module = namedtuple("Module", ["cli"]) - mocker.patch( - "kedro.framework.cli.cli.importlib.import_module", - return_value=Module(cli=cli), - ) mocker.patch("kedro.framework.cli.cli._is_project", return_value=True) mocker.patch( "kedro.framework.cli.cli.bootstrap_project", return_value=fake_metadata ) + mocker.patch( + "kedro.framework.cli.cli.importlib.import_module", + return_value=Module(cli=cli), + ) kedro_cli = KedroCLI(fake_metadata.project_path) assert len(kedro_cli.project_groups) == 7 assert kedro_cli.project_groups == [ @@ -402,14 +402,14 @@ def test_kedro_cli_no_project(self, mocker, tmp_path): def test_kedro_cli_with_project(self, mocker, fake_metadata): Module = namedtuple("Module", ["cli"]) - mocker.patch( - "kedro.framework.cli.cli.importlib.import_module", - 
return_value=Module(cli=cli), - ) mocker.patch("kedro.framework.cli.cli._is_project", return_value=True) mocker.patch( "kedro.framework.cli.cli.bootstrap_project", return_value=fake_metadata ) + mocker.patch( + "kedro.framework.cli.cli.importlib.import_module", + return_value=Module(cli=cli), + ) kedro_cli = KedroCLI(fake_metadata.project_path) assert len(kedro_cli.global_groups) == 2 diff --git a/tests/framework/cli/test_cli_hooks.py b/tests/framework/cli/test_cli_hooks.py index 0f7866f45f..41fbdaa705 100644 --- a/tests/framework/cli/test_cli_hooks.py +++ b/tests/framework/cli/test_cli_hooks.py @@ -98,10 +98,6 @@ def test_kedro_cli_should_invoke_cli_hooks_from_plugin( caplog.set_level(logging.DEBUG, logger="kedro") Module = namedtuple("Module", ["cli"]) - mocker.patch( - "kedro.framework.cli.cli.importlib.import_module", - return_value=Module(cli=cli), - ) mocker.patch( "kedro.framework.cli.cli._is_project", return_value=True, @@ -110,6 +106,10 @@ def test_kedro_cli_should_invoke_cli_hooks_from_plugin( "kedro.framework.cli.cli.bootstrap_project", return_value=fake_metadata, ) + mocker.patch( + "kedro.framework.cli.cli.importlib.import_module", + return_value=Module(cli=cli), + ) kedro_cli = KedroCLI(fake_metadata.project_path) result = CliRunner().invoke(kedro_cli, [command]) assert ( diff --git a/tests/framework/cli/test_jupyter.py b/tests/framework/cli/test_jupyter.py index d5e0b5dbd3..8f363bac3e 100644 --- a/tests/framework/cli/test_jupyter.py +++ b/tests/framework/cli/test_jupyter.py @@ -207,7 +207,7 @@ def tmp_file_path(self): with NamedTemporaryFile() as f: yield Path(f.name) - # pylint: disable=too-many-arguments + # noqa: too-many-arguments def test_convert_one_file_overwrite( self, mocker, diff --git a/tests/framework/cli/test_project.py b/tests/framework/cli/test_project.py index 92e0d024cd..d965113ea8 100644 --- a/tests/framework/cli/test_project.py +++ b/tests/framework/cli/test_project.py @@ -1,5 +1,4 @@ # pylint: disable=unused-argument -import subprocess import sys from pathlib import Path @@ -59,9 +58,7 @@ def test_install_successfully( call_mock.assert_called_once_with(["nbstripout", "--install"]) fake_git_repo.assert_called_once_with( - ["git", "rev-parse", "--git-dir"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + ["git", "rev-parse", "--git-dir"], capture_output=True ) def test_nbstripout_not_installed( diff --git a/tests/framework/cli/test_starters.py b/tests/framework/cli/test_starters.py index 26fc6ac3e5..644e67d592 100644 --- a/tests/framework/cli/test_starters.py +++ b/tests/framework/cli/test_starters.py @@ -17,7 +17,7 @@ KedroStarterSpec, ) -FILES_IN_TEMPLATE = 31 +FILES_IN_TEMPLATE = 30 @pytest.fixture @@ -48,7 +48,7 @@ def _make_cli_prompt_input(project_name="", repo_name="", python_package=""): return "\n".join([project_name, repo_name, python_package]) -# pylint: disable=too-many-arguments +# noqa: too-many-arguments def _assert_template_ok( result, project_name="New Kedro Project", diff --git a/tests/framework/context/test_context.py b/tests/framework/context/test_context.py index 178cb2e5b0..bc4dac8ad2 100644 --- a/tests/framework/context/test_context.py +++ b/tests/framework/context/test_context.py @@ -21,7 +21,6 @@ _convert_paths_to_absolute_posix, _is_relative_path, _update_nested_dict, - _validate_layers_for_transcoding, ) from kedro.framework.hooks import _create_hook_manager from kedro.framework.project import ( @@ -228,10 +227,18 @@ def test_get_catalog_always_using_absolute_path(self, dummy_context): == (dummy_context._project_path / 
"horses.csv").as_posix() ) - def test_get_catalog_validates_layers(self, dummy_context, mocker): + def test_get_catalog_validates_transcoded_datasets(self, dummy_context, mocker): + mock_transcode_split = mocker.patch( + "kedro.framework.context.context._transcode_split" + ) + catalog = dummy_context.catalog + for dataset_name in catalog._data_sets.keys(): + mock_transcode_split.assert_any_call(dataset_name) + mock_validate = mocker.patch( - "kedro.framework.context.context._validate_layers_for_transcoding" + "kedro.framework.context.context._validate_transcoded_datasets" ) + catalog = dummy_context.catalog mock_validate.assert_called_once_with(catalog) @@ -416,50 +423,6 @@ def test_convert_paths_to_absolute_posix_converts_full_windows_path_to_posix( assert _convert_paths_to_absolute_posix(project_path, input_conf) == expected -@pytest.mark.parametrize( - "layers", - [ - {"raw": {"A"}, "interm": {"B", "C"}}, - {"raw": {"A"}, "interm": {"B@2", "B@1"}}, - {"raw": {"C@1"}, "interm": {"A", "B@1", "B@2", "B@3"}}, - ], -) -def test_validate_layers(layers, mocker): - mock_catalog = mocker.MagicMock() - mock_catalog.layers = layers - - _validate_layers_for_transcoding(mock_catalog) # it shouldn't raise any error - - -@pytest.mark.parametrize( - "layers,conflicting_datasets", - [ - ({"raw": {"A", "B@1"}, "interm": {"B@2"}}, ["B@2"]), - ({"raw": {"A"}, "interm": {"B@1", "B@2"}, "prm": {"B@3"}}, ["B@3"]), - ( - { - "raw": {"A@1"}, - "interm": {"B@1", "B@2"}, - "prm": {"B@3", "B@4"}, - "other": {"A@2"}, - }, - ["A@2", "B@3", "B@4"], - ), - ], -) -def test_validate_layers_error(layers, conflicting_datasets, mocker): - mock_catalog = mocker.MagicMock() - mock_catalog.layers = layers - error_str = ", ".join(conflicting_datasets) - - pattern = ( - f"Transcoded datasets should have the same layer. 
" - f"Mismatch found for: {error_str}" - ) - with pytest.raises(ValueError, match=re.escape(pattern)): - _validate_layers_for_transcoding(mock_catalog) - - @pytest.mark.parametrize( "old_dict, new_dict, expected", [ diff --git a/tests/framework/session/test_session.py b/tests/framework/session/test_session.py index c547cdede3..c3cfb2bf7b 100644 --- a/tests/framework/session/test_session.py +++ b/tests/framework/session/test_session.py @@ -585,7 +585,7 @@ def test_log_error(self, fake_project, mock_package_name): exception = session.store["exception"] assert exception["type"] == "tests.framework.session.test_session.FakeException" - assert exception["value"] == "" + assert not exception["value"] assert any( "raise FakeException" in tb_line for tb_line in exception["traceback"] ) diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index c2dcabc08c..9273fa5200 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -336,7 +336,6 @@ def test_multi_catalog_list_bad_regex(self, multi_catalog): multi_catalog.list("((") def test_eq(self, multi_catalog, data_catalog): - assert multi_catalog == multi_catalog # pylint: disable=comparison-with-itself assert multi_catalog == multi_catalog.shallow_copy() assert multi_catalog != data_catalog @@ -377,13 +376,14 @@ def test_mutating_datasets_not_allowed(self, data_catalog_from_config): def test_confirm(self, mocker, caplog): """Confirm the dataset""" - mock_ds = mocker.Mock() - data_catalog = DataCatalog(data_sets={"mocked": mock_ds}) - data_catalog.confirm("mocked") - mock_ds.confirm.assert_called_once_with() - assert caplog.record_tuples == [ - ("kedro.io.data_catalog", logging.INFO, "Confirming dataset 'mocked'") - ] + with caplog.at_level(logging.INFO): + mock_ds = mocker.Mock() + data_catalog = DataCatalog(data_sets={"mocked": mock_ds}) + data_catalog.confirm("mocked") + mock_ds.confirm.assert_called_once_with() + assert caplog.record_tuples == [ + ("kedro.io.data_catalog", logging.INFO, "Confirming dataset 'mocked'") + ] @pytest.mark.parametrize( "dataset_name,error_pattern", @@ -567,24 +567,25 @@ def test_error_dataset_init(self, bad_config): def test_confirm(self, tmp_path, caplog, mocker): """Confirm the dataset""" - mock_confirm = mocker.patch("kedro.io.IncrementalDataset.confirm") - catalog = { - "ds_to_confirm": { - "type": "IncrementalDataset", - "dataset": "pandas.CSVDataSet", - "path": str(tmp_path), + with caplog.at_level(logging.INFO): + mock_confirm = mocker.patch("kedro.io.IncrementalDataset.confirm") + catalog = { + "ds_to_confirm": { + "type": "IncrementalDataset", + "dataset": "pandas.CSVDataSet", + "path": str(tmp_path), + } } - } - data_catalog = DataCatalog.from_config(catalog=catalog) - data_catalog.confirm("ds_to_confirm") - assert caplog.record_tuples == [ - ( - "kedro.io.data_catalog", - logging.INFO, - "Confirming dataset 'ds_to_confirm'", - ) - ] - mock_confirm.assert_called_once_with() + data_catalog = DataCatalog.from_config(catalog=catalog) + data_catalog.confirm("ds_to_confirm") + assert caplog.record_tuples == [ + ( + "kedro.io.data_catalog", + logging.INFO, + "Confirming dataset 'ds_to_confirm'", + ) + ] + mock_confirm.assert_called_once_with() @pytest.mark.parametrize( "dataset_name,pattern", @@ -735,8 +736,10 @@ def test_replacing_nonword_characters(self): assert "ds3__csv" in catalog.datasets.__dict__ assert "jalapeño" in catalog.datasets.__dict__ - def test_no_versions_with_cloud_protocol(self): + def test_no_versions_with_cloud_protocol(self, monkeypatch): """Check 
the error if no versions are available for load from cloud storage""" + monkeypatch.setenv("AWS_ACCESS_KEY_ID", "dummmy") + monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "dummmy") version = Version(load=None, save=None) versioned_dataset = CSVDataSet("s3://bucket/file.csv", version=version) pattern = re.escape( diff --git a/tests/pipeline/test_modular_pipeline.py b/tests/pipeline/test_modular_pipeline.py index ec37759137..8e4f06330f 100644 --- a/tests/pipeline/test_modular_pipeline.py +++ b/tests/pipeline/test_modular_pipeline.py @@ -219,7 +219,7 @@ def test_empty_output(self): ) def test_missing_dataset_name( self, func, inputs, outputs, inputs_map, outputs_map, expected_missing - ): # pylint: disable=too-many-arguments + ): # noqa: too-many-arguments raw_pipeline = modular_pipeline([node(func, inputs, outputs)]) with pytest.raises(ModularPipelineError, match=r"Failed to map datasets") as e: diff --git a/tests/runner/test_parallel_runner.py b/tests/runner/test_parallel_runner.py index 24ab92bf5d..8c301b4216 100644 --- a/tests/runner/test_parallel_runner.py +++ b/tests/runner/test_parallel_runner.py @@ -103,7 +103,7 @@ def test_specified_max_workers_bellow_cpu_cores_count( cpu_cores, user_specified_number, expected_number, - ): # pylint: disable=too-many-arguments + ): # noqa: too-many-arguments """ The system has 2 cores, but we initialize the runner with max_workers=4. `fan_out_fan_in` pipeline needs 3 processes. @@ -250,9 +250,7 @@ def _describe(self) -> dict[str, Any]: if not sys.platform.startswith("win"): - ParallelRunnerManager.register( # pylint: disable=no-member - "LoggingDataset", LoggingDataset - ) + ParallelRunnerManager.register("LoggingDataset", LoggingDataset) # noqa: no-member @pytest.mark.skipif( @@ -267,7 +265,7 @@ def test_dont_release_inputs_and_outputs(self, is_async): pipeline = modular_pipeline( [node(identity, "in", "middle"), node(identity, "middle", "out")] ) - # pylint: disable=no-member + # noqa: no-member catalog = DataCatalog( { "in": runner._manager.LoggingDataset(log, "in", "stuff"), @@ -291,7 +289,7 @@ def test_release_at_earliest_opportunity(self, is_async): node(sink, "second", None), ] ) - # pylint: disable=no-member + # noqa: no-member catalog = DataCatalog( { "first": runner._manager.LoggingDataset(log, "first"), @@ -319,7 +317,7 @@ def test_count_multiple_loads(self, is_async): node(sink, "dataset", None, name="fred"), ] ) - # pylint: disable=no-member + # noqa: no-member catalog = DataCatalog( {"dataset": runner._manager.LoggingDataset(log, "dataset")} ) diff --git a/tests/runner/test_thread_runner.py b/tests/runner/test_thread_runner.py index 87733e7e01..a95b9294c8 100644 --- a/tests/runner/test_thread_runner.py +++ b/tests/runner/test_thread_runner.py @@ -55,7 +55,7 @@ def test_specified_max_workers( catalog, user_specified_number, expected_number, - ): # pylint: disable=too-many-arguments + ): # noqa: too-many-arguments """ We initialize the runner with max_workers=4. `fan_out_fan_in` pipeline needs 3 threads. 
diff --git a/tests/test_import.py b/tests/test_import.py new file mode 100644 index 0000000000..a9aa72e21a --- /dev/null +++ b/tests/test_import.py @@ -0,0 +1,27 @@ +import pytest + +import kedro + + +def test_import_kedro_with_no_official_support_raise_error(mocker): + """Test importing kedro with python>=3.12 should fail""" + mocker.patch("kedro.sys.version_info", (3, 12)) + + # We use the parent class to avoid issues with `exec_module` + with pytest.raises(UserWarning) as excinfo: + kedro.__loader__.exec_module(kedro) + + assert "Kedro is not yet fully compatible" in str(excinfo.value) + + +def test_import_kedro_with_no_official_support_emits_warning(mocker): + """Test importing kedro python>=3.12 and controlled warnings should work""" + mocker.patch("kedro.sys.version_info", (3, 12)) + mocker.patch("kedro.sys.warnoptions", ["default:Kedro is not yet fully compatible"]) + + # We use the parent class to avoid issues with `exec_module` + with pytest.warns(UserWarning) as record: + kedro.__loader__.exec_module(kedro) + + assert len(record) == 1 + assert "Kedro is not yet fully compatible" in record[0].message.args[0] diff --git a/tests/tools/test_cli.py b/tests/tools/test_cli.py index 1b80ad8064..cf3ce71d1c 100644 --- a/tests/tools/test_cli.py +++ b/tests/tools/test_cli.py @@ -56,10 +56,6 @@ def fake_metadata(fake_root_dir): class TestCLITools: def test_get_cli_structure_raw(self, mocker, fake_metadata): Module = namedtuple("Module", ["cli"]) - mocker.patch( - "kedro.framework.cli.cli.importlib.import_module", - return_value=Module(cli=cli), - ) mocker.patch( "kedro.framework.cli.cli._is_project", return_value=True, @@ -68,6 +64,10 @@ def test_get_cli_structure_raw(self, mocker, fake_metadata): "kedro.framework.cli.cli.bootstrap_project", return_value=fake_metadata, ) + mocker.patch( + "kedro.framework.cli.cli.importlib.import_module", + return_value=Module(cli=cli), + ) kedro_cli = KedroCLI(fake_metadata.project_path) raw_cli_structure = get_cli_structure(kedro_cli, get_help=False) @@ -85,10 +85,6 @@ def test_get_cli_structure_raw(self, mocker, fake_metadata): def test_get_cli_structure_depth(self, mocker, fake_metadata): Module = namedtuple("Module", ["cli"]) - mocker.patch( - "kedro.framework.cli.cli.importlib.import_module", - return_value=Module(cli=cli), - ) mocker.patch( "kedro.framework.cli.cli._is_project", return_value=True, @@ -97,6 +93,10 @@ def test_get_cli_structure_depth(self, mocker, fake_metadata): "kedro.framework.cli.cli.bootstrap_project", return_value=fake_metadata, ) + mocker.patch( + "kedro.framework.cli.cli.importlib.import_module", + return_value=Module(cli=cli), + ) kedro_cli = KedroCLI(fake_metadata.project_path) raw_cli_structure = get_cli_structure(kedro_cli, get_help=False) assert isinstance(raw_cli_structure["kedro"]["new"], dict) @@ -121,10 +121,6 @@ def test_get_cli_structure_depth(self, mocker, fake_metadata): def test_get_cli_structure_help(self, mocker, fake_metadata): Module = namedtuple("Module", ["cli"]) - mocker.patch( - "kedro.framework.cli.cli.importlib.import_module", - return_value=Module(cli=cli), - ) mocker.patch( "kedro.framework.cli.cli._is_project", return_value=True, @@ -133,6 +129,10 @@ def test_get_cli_structure_help(self, mocker, fake_metadata): "kedro.framework.cli.cli.bootstrap_project", return_value=fake_metadata, ) + mocker.patch( + "kedro.framework.cli.cli.importlib.import_module", + return_value=Module(cli=cli), + ) kedro_cli = KedroCLI(fake_metadata.project_path) help_cli_structure = get_cli_structure(kedro_cli, get_help=True) 
diff --git a/tools/print_env.sh b/tools/print_env.sh index a82ac29153..0a559a6d25 100755 --- a/tools/print_env.sh +++ b/tools/print_env.sh @@ -16,11 +16,9 @@ eval_command() { eval_command CONDA "conda info 2>/dev/null || echo \"Conda not found\"" eval_command PYTHON "which python && python -V" eval_command PIP "python -m pip -V" -eval_command PYLINT "python -m pylint --version" eval_command PYTEST "python -m pytest --version" eval_command BLACK "python -m black --version" eval_command BEHAVE "python -m behave --version" -eval_command ISORT "python -m isort --version" eval_command PRE-COMMIT "python -m pre_commit --version" eval_command SPARK "python -c \\ \"import pyspark; print(f'PySpark: {pyspark.__version__}')\" 2>/dev/null && \\ diff --git a/trufflehog-ignore.txt b/trufflehog-ignore.txt index 14719c544e..cb5551a327 100644 --- a/trufflehog-ignore.txt +++ b/trufflehog-ignore.txt @@ -9,3 +9,5 @@ static/img/kedro_gitflow.svg .coverage.* .*\.log .*\.iml +tests/extras/datasets/tensorflow/test_tensorflow_model_dataset.py +docs/source/meta/images/kedro_gitflow.svg